Repository: jctian98/e2e_lfmmi
Branch: master
Commit: 34b805690663
Files: 1103
Total size: 7.8 MB

Directory structure:
gitextract_ni0k430x/

├── .gitignore
├── .run.sh.swp
├── README.md
├── __init__.py
├── asr/
│   ├── __init__.py
│   ├── asr_mix_utils.py
│   ├── asr_utils.py
│   ├── chainer_backend/
│   │   ├── __init__.py
│   │   └── asr.py
│   └── pytorch_backend/
│       ├── __init__.py
│       ├── asr.py
│       ├── asr_init.py
│       ├── asr_mix.py
│       └── recog.py
├── bin/
│   ├── __init__.py
│   ├── asr_align.py
│   ├── asr_enhance.py
│   ├── asr_recog.py
│   ├── asr_train.py
│   ├── lm_train.py
│   ├── mt_train.py
│   ├── mt_trans.py
│   ├── st_train.py
│   ├── st_trans.py
│   ├── tts_decode.py
│   ├── tts_train.py
│   ├── vc_decode.py
│   └── vc_train.py
├── egs/
│   ├── .gitignore
│   ├── aishell1/
│   │   ├── .gitignore
│   │   ├── aed.sh
│   │   ├── cmd.sh
│   │   ├── conf/
│   │   │   ├── fbank.conf
│   │   │   ├── gpu.conf
│   │   │   ├── lm.yaml
│   │   │   ├── lm_rnn.yaml
│   │   │   ├── lm_transformer.yaml
│   │   │   ├── pitch.conf
│   │   │   ├── queue.conf
│   │   │   ├── slurm.conf
│   │   │   ├── specaug.yaml
│   │   │   ├── specaug_test.yaml
│   │   │   └── tuning/
│   │   │       ├── decode_pytorch_transformer.yaml
│   │   │       ├── decode_rnn.yaml
│   │   │       ├── train_pytorch_conformer_kernel15.yaml
│   │   │       ├── train_pytorch_conformer_kernel31.yaml
│   │   │       ├── train_pytorch_conformer_kernel31_large.yaml
│   │   │       ├── train_pytorch_conformer_kernel31_small.yaml
│   │   │       ├── train_pytorch_transformer.yaml
│   │   │       ├── train_rnn.yaml
│   │   │       └── transducer/
│   │   │           ├── decode_default.yaml
│   │   │           ├── train_conformer-rnn_transducer.yaml
│   │   │           ├── train_conformer-rnn_transducer_aux_ngpu4.yaml
│   │   │           ├── train_conformer-rnn_transducer_aux_ngpu4_att.yaml
│   │   │           ├── train_conformer-rnn_transducer_aux_ngpu4_small.yaml
│   │   │           ├── train_conformer-rnn_transducer_ngpu4.yaml
│   │   │           ├── train_conformer-rnn_transducer_ngpu4_large.yaml
│   │   │           ├── train_transducer.yaml
│   │   │           └── train_transducer_aux.yaml
│   │   ├── local/
│   │   │   ├── add_lex_disambig.pl
│   │   │   ├── aishell_data_prep.sh
│   │   │   ├── aishell_train_lms.sh
│   │   │   ├── apply_map.pl
│   │   │   ├── build_sp_text.py
│   │   │   ├── build_word_mapping.py
│   │   │   ├── compile_bigram.sh
│   │   │   ├── download_and_untar.sh
│   │   │   ├── fstaddselfloops.pl
│   │   │   ├── k2_aishell_prepare_dict.sh
│   │   │   ├── k2_aishell_prepare_dict_char.sh
│   │   │   ├── k2_prepare_lang.sh
│   │   │   ├── make_lexicon_fst.py
│   │   │   ├── max_rescore.py
│   │   │   ├── parse_options.sh
│   │   │   ├── parse_text_jieba.py
│   │   │   ├── prepare_word_lex.py
│   │   │   └── sym2int.pl
│   │   ├── nt.sh
│   │   ├── path.sh
│   │   └── prepare.sh
│   ├── aishell2/
│   │   ├── .gitignore
│   │   ├── aed.sh
│   │   ├── conf/
│   │   │   ├── .fbank.conf.swp
│   │   │   ├── fbank.conf
│   │   │   ├── gpu.conf
│   │   │   ├── lm.yaml
│   │   │   ├── lm_rnn.yaml
│   │   │   ├── lm_transformer.yaml
│   │   │   ├── pitch.conf
│   │   │   ├── queue.conf
│   │   │   ├── slurm.conf
│   │   │   ├── specaug.yaml
│   │   │   ├── specaug_test.yaml
│   │   │   └── tuning/
│   │   │       ├── decode_pytorch_transformer.yaml
│   │   │       ├── decode_rnn.yaml
│   │   │       ├── train_pytorch_conformer_kernel15.yaml
│   │   │       ├── train_pytorch_conformer_kernel31.yaml
│   │   │       ├── train_pytorch_transformer.yaml
│   │   │       ├── train_rnn.yaml
│   │   │       └── transducer/
│   │   │           ├── decode_default.yaml
│   │   │           ├── train_conformer-rnn_transducer.yaml
│   │   │           ├── train_conformer-rnn_transducer_aux_ngpu4.yaml
│   │   │           ├── train_conformer-rnn_transducer_ngpu4.yaml
│   │   │           ├── train_transducer.yaml
│   │   │           └── train_transducer_aux.yaml
│   │   ├── local/
│   │   │   ├── add_lex_disambig.pl
│   │   │   ├── apply_map.pl
│   │   │   ├── fstaddselfloops.pl
│   │   │   ├── jieba_split_text.py
│   │   │   ├── k2_prepare_lang.sh
│   │   │   ├── make_lexicon_fst.py
│   │   │   ├── max_rescore.py
│   │   │   ├── mmi_rescore.sh
│   │   │   ├── parse_options.sh
│   │   │   ├── prepare_data.sh
│   │   │   ├── prepare_dict.sh
│   │   │   ├── rerank.py
│   │   │   ├── sym2int.pl
│   │   │   ├── train_lms.sh
│   │   │   └── word_segmentation.py
│   │   ├── nt.sh
│   │   └── prepare.sh
│   ├── asrucs/
│   │   ├── .gitignore
│   │   ├── cmd.sh
│   │   ├── conf/
│   │   │   ├── decode.yaml
│   │   │   ├── fbank.conf
│   │   │   ├── gpu.conf
│   │   │   ├── lm.yaml
│   │   │   ├── lm_rnn.yaml
│   │   │   ├── lm_transformer.yaml
│   │   │   ├── pitch.conf
│   │   │   ├── pure_ctc.yaml
│   │   │   ├── queue.conf
│   │   │   ├── slurm.conf
│   │   │   ├── specaug.yaml
│   │   │   ├── specaug_test.yaml
│   │   │   ├── train.yaml
│   │   │   ├── train_conformer-rnn_transducer_cs.yaml
│   │   │   └── tuning/
│   │   │       ├── decode_pytorch_transformer.yaml
│   │   │       ├── decode_rnn.yaml
│   │   │       ├── train_pytorch_conformer_kernel15.yaml
│   │   │       ├── train_pytorch_conformer_kernel31.yaml
│   │   │       ├── train_pytorch_conformer_kernel31_large.yaml
│   │   │       ├── train_pytorch_conformer_kernel31_small.yaml
│   │   │       ├── train_pytorch_transformer.yaml
│   │   │       ├── train_rnn.yaml
│   │   │       └── transducer/
│   │   │           ├── decode_default.yaml
│   │   │           ├── train_conformer-rnn_transducer.yaml
│   │   │           ├── train_conformer-rnn_transducer_aux_ngpu4.yaml
│   │   │           ├── train_conformer-rnn_transducer_aux_ngpu4_att.yaml
│   │   │           ├── train_conformer-rnn_transducer_aux_ngpu4_small.yaml
│   │   │           ├── train_conformer-rnn_transducer_ngpu4.yaml
│   │   │           ├── train_conformer-rnn_transducer_ngpu4_large.yaml
│   │   │           ├── train_transducer.yaml
│   │   │           └── train_transducer_aux.yaml
│   │   ├── espnet
│   │   ├── espnet_utils
│   │   ├── local/
│   │   │   ├── add_seperator.py
│   │   │   ├── generate_fake_cs.py
│   │   │   └── prepare_fake_cs.sh
│   │   ├── nt.sh
│   │   ├── path.sh
│   │   ├── prepare.sh
│   │   ├── steps
│   │   ├── text
│   │   └── utils
│   ├── espnet_utils/
│   │   ├── add_uttcls_json.py
│   │   ├── addjson.py
│   │   ├── apply-cmvn.py
│   │   ├── asr_align_wav.sh
│   │   ├── average_checkpoints.py
│   │   ├── build_fake_lexicon.py
│   │   ├── build_sp_text.py
│   │   ├── calculate_rtf.py
│   │   ├── change_root.py
│   │   ├── change_yaml.py
│   │   ├── clean_corpus.sh
│   │   ├── compute-cmvn-stats.py
│   │   ├── compute-fbank-feats.py
│   │   ├── compute-stft-feats.py
│   │   ├── concat_json_multiref.py
│   │   ├── concatjson.py
│   │   ├── convert_fbank.sh
│   │   ├── convert_fbank_to_wav.py
│   │   ├── copy-feats.py
│   │   ├── data2json.sh
│   │   ├── divide_lang.sh
│   │   ├── double_precious_cer.py
│   │   ├── download_from_google_drive.sh
│   │   ├── dump-pcm.py
│   │   ├── dump.sh
│   │   ├── dump_pcm.sh
│   │   ├── eval-source-separation.py
│   │   ├── eval_perm_free_error.py
│   │   ├── eval_source_separation.sh
│   │   ├── feat-to-shape.py
│   │   ├── feat_to_shape.sh
│   │   ├── feats2npy.py
│   │   ├── filt.py
│   │   ├── filter_all_eng_utts.py
│   │   ├── filter_scp.py
│   │   ├── filter_trn.py
│   │   ├── free-gpu.sh
│   │   ├── gdown.pl
│   │   ├── generate_wav.sh
│   │   ├── generate_wav_from_fbank.py
│   │   ├── get_yaml.py
│   │   ├── jieba_build_dict.py
│   │   ├── json2sctm.py
│   │   ├── json2text.py
│   │   ├── json2trn.py
│   │   ├── json2trn_mt.py
│   │   ├── json2trn_wo_dict.py
│   │   ├── k2/
│   │   │   ├── add_lex_disambig.pl
│   │   │   ├── apply_map.pl
│   │   │   ├── fstaddselfloops.pl
│   │   │   ├── k2_prepare_lang.sh
│   │   │   ├── parse_options.sh
│   │   │   └── sym2int.pl
│   │   ├── make_fbank.sh
│   │   ├── make_pair_json.py
│   │   ├── make_stft.sh
│   │   ├── mbr_analysis.py
│   │   ├── mcd_calculate.py
│   │   ├── merge_scp2json.py
│   │   ├── mergejson.py
│   │   ├── mix-mono-wav-scp.py
│   │   ├── mmi_rescore.sh
│   │   ├── pack_model.sh
│   │   ├── prepare_block_load.sh
│   │   ├── prepare_mer.py
│   │   ├── queue-freegpu.pl
│   │   ├── recog_wav.sh
│   │   ├── reduce_data_dir.sh
│   │   ├── remove_longshortdata.sh
│   │   ├── remove_punctuation.pl
│   │   ├── rerank_mmi.py
│   │   ├── result2json.py
│   │   ├── score_bleu.sh
│   │   ├── score_lang_id.py
│   │   ├── score_sclite.sh
│   │   ├── score_sclite_case.sh
│   │   ├── score_sclite_wo_dict.sh
│   │   ├── scp2json.py
│   │   ├── show_result.sh
│   │   ├── significant_test.sh
│   │   ├── sort_scp_by_length.py
│   │   ├── speed_perturb.sh
│   │   ├── split_scp.py
│   │   ├── split_scp_fix_length.py
│   │   ├── splitjson.py
│   │   ├── spm_decode
│   │   ├── spm_encode
│   │   ├── spm_train
│   │   ├── stdout.pl
│   │   ├── synth_wav.sh
│   │   ├── text2token.py
│   │   ├── text2vocabulary.py
│   │   ├── text_norm.py
│   │   ├── trace_rnnt.py
│   │   ├── train_lms_srilm.sh
│   │   ├── translate_wav.sh
│   │   ├── trim_silence.py
│   │   ├── trim_silence.sh
│   │   ├── trn2ctm.py
│   │   ├── trn2stm.py
│   │   ├── update_json.sh
│   │   ├── word_ngram_rescore.py
│   │   └── word_ngram_rescore.sh
│   ├── steps/
│   │   ├── align_basis_fmllr.sh
│   │   ├── align_basis_fmllr_lats.sh
│   │   ├── align_fmllr.sh
│   │   ├── align_fmllr_lats.sh
│   │   ├── align_lvtln.sh
│   │   ├── align_raw_fmllr.sh
│   │   ├── align_sgmm2.sh
│   │   ├── align_si.sh
│   │   ├── best_path_weights.sh
│   │   ├── cleanup/
│   │   │   ├── clean_and_segment_data.sh
│   │   │   ├── clean_and_segment_data_nnet3.sh
│   │   │   ├── combine_short_segments.py
│   │   │   ├── create_segments_from_ctm.pl
│   │   │   ├── debug_lexicon.sh
│   │   │   ├── decode_fmllr_segmentation.sh
│   │   │   ├── decode_segmentation.sh
│   │   │   ├── decode_segmentation_nnet3.sh
│   │   │   ├── find_bad_utts.sh
│   │   │   ├── find_bad_utts_nnet.sh
│   │   │   ├── internal/
│   │   │   │   ├── align_ctm_ref.py
│   │   │   │   ├── compute_tf_idf.py
│   │   │   │   ├── ctm_to_text.pl
│   │   │   │   ├── get_ctm_edits.py
│   │   │   │   ├── get_non_scored_words.py
│   │   │   │   ├── get_pron_stats.py
│   │   │   │   ├── make_one_biased_lm.py
│   │   │   │   ├── modify_ctm_edits.py
│   │   │   │   ├── resolve_ctm_edits_overlaps.py
│   │   │   │   ├── retrieve_similar_docs.py
│   │   │   │   ├── segment_ctm_edits.py
│   │   │   │   ├── segment_ctm_edits_mild.py
│   │   │   │   ├── split_text_into_docs.pl
│   │   │   │   ├── stitch_documents.py
│   │   │   │   ├── taint_ctm_edits.py
│   │   │   │   └── tf_idf.py
│   │   │   ├── lattice_oracle_align.sh
│   │   │   ├── make_biased_lm_graphs.sh
│   │   │   ├── make_biased_lms.py
│   │   │   ├── make_segmentation_data_dir.sh
│   │   │   ├── make_segmentation_graph.sh
│   │   │   ├── make_utterance_fsts.pl
│   │   │   ├── make_utterance_graph.sh
│   │   │   ├── segment_long_utterances.sh
│   │   │   ├── segment_long_utterances_nnet3.sh
│   │   │   └── split_long_utterance.sh
│   │   ├── combine_ali_dirs.sh
│   │   ├── combine_trans_dirs.sh
│   │   ├── compare_alignments.sh
│   │   ├── compute_cmvn_stats.sh
│   │   ├── compute_vad_decision.sh
│   │   ├── conf/
│   │   │   ├── append_eval_to_ctm.py
│   │   │   ├── append_prf_to_ctm.py
│   │   │   ├── apply_calibration.sh
│   │   │   ├── convert_ctm_to_tra.py
│   │   │   ├── get_ctm_conf.sh
│   │   │   ├── lattice_depth_per_frame.sh
│   │   │   ├── parse_arpa_unigrams.py
│   │   │   ├── prepare_calibration_data.py
│   │   │   ├── prepare_word_categories.py
│   │   │   └── train_calibration.sh
│   │   ├── copy_ali_dir.sh
│   │   ├── copy_lat_dir.sh
│   │   ├── copy_trans_dir.sh
│   │   ├── data/
│   │   │   ├── augment_data_dir.py
│   │   │   ├── data_dir_manipulation_lib.py
│   │   │   ├── make_musan.py
│   │   │   ├── make_musan.sh
│   │   │   └── reverberate_data_dir.py
│   │   ├── decode.sh
│   │   ├── decode_basis_fmllr.sh
│   │   ├── decode_biglm.sh
│   │   ├── decode_combine.sh
│   │   ├── decode_fmllr.sh
│   │   ├── decode_fmllr_extra.sh
│   │   ├── decode_fmmi.sh
│   │   ├── decode_fromlats.sh
│   │   ├── decode_lvtln.sh
│   │   ├── decode_nolats.sh
│   │   ├── decode_raw_fmllr.sh
│   │   ├── decode_sgmm2.sh
│   │   ├── decode_sgmm2_fromlats.sh
│   │   ├── decode_sgmm2_rescore.sh
│   │   ├── decode_sgmm2_rescore_project.sh
│   │   ├── decode_with_map.sh
│   │   ├── diagnostic/
│   │   │   ├── analyze_alignments.sh
│   │   │   ├── analyze_lats.sh
│   │   │   ├── analyze_lattice_depth_stats.py
│   │   │   └── analyze_phone_length_stats.py
│   │   ├── dict/
│   │   │   ├── apply_g2p.sh
│   │   │   ├── apply_g2p_phonetisaurus.sh
│   │   │   ├── apply_lexicon_edits.py
│   │   │   ├── get_pron_stats.py
│   │   │   ├── internal/
│   │   │   │   ├── get_subsegments.py
│   │   │   │   ├── prune_pron_candidates.py
│   │   │   │   └── sum_arc_info.py
│   │   │   ├── learn_lexicon_bayesian.sh
│   │   │   ├── learn_lexicon_greedy.sh
│   │   │   ├── merge_learned_lexicons.py
│   │   │   ├── prons_to_lexicon.py
│   │   │   ├── prune_pron_candidates.py
│   │   │   ├── select_prons_bayesian.py
│   │   │   ├── select_prons_greedy.py
│   │   │   ├── train_g2p.sh
│   │   │   └── train_g2p_phonetisaurus.sh
│   │   ├── get_ctm.sh
│   │   ├── get_ctm_conf_fast.sh
│   │   ├── get_ctm_fast.sh
│   │   ├── get_fmllr_basis.sh
│   │   ├── get_lexicon_probs.sh
│   │   ├── get_prons.sh
│   │   ├── get_train_ctm.sh
│   │   ├── info/
│   │   │   ├── chain_dir_info.pl
│   │   │   ├── gmm_dir_info.pl
│   │   │   ├── nnet2_dir_info.pl
│   │   │   ├── nnet3_dir_info.pl
│   │   │   └── nnet3_disc_dir_info.pl
│   │   ├── libs/
│   │   │   ├── __init__.py
│   │   │   ├── common.py
│   │   │   └── nnet3/
│   │   │       ├── __init__.py
│   │   │       ├── report/
│   │   │       │   ├── __init__.py
│   │   │       │   └── log_parse.py
│   │   │       ├── train/
│   │   │       │   ├── __init__.py
│   │   │       │   ├── chain_objf/
│   │   │       │   │   ├── __init__.py
│   │   │       │   │   └── acoustic_model.py
│   │   │       │   ├── common.py
│   │   │       │   ├── dropout_schedule.py
│   │   │       │   └── frame_level_objf/
│   │   │       │       ├── __init__.py
│   │   │       │       ├── acoustic_model.py
│   │   │       │       ├── common.py
│   │   │       │       └── raw_model.py
│   │   │       └── xconfig/
│   │   │           ├── __init__.py
│   │   │           ├── attention.py
│   │   │           ├── basic_layers.py
│   │   │           ├── composite_layers.py
│   │   │           ├── convolution.py
│   │   │           ├── gru.py
│   │   │           ├── layers.py
│   │   │           ├── lstm.py
│   │   │           ├── parser.py
│   │   │           ├── stats_layer.py
│   │   │           ├── trivial_layers.py
│   │   │           └── utils.py
│   │   ├── lmrescore.sh
│   │   ├── lmrescore_const_arpa.sh
│   │   ├── lmrescore_const_arpa_undeterminized.sh
│   │   ├── lmrescore_rnnlm_lat.sh
│   │   ├── make_denlats.sh
│   │   ├── make_denlats_sgmm2.sh
│   │   ├── make_fbank.sh
│   │   ├── make_fbank_pitch.sh
│   │   ├── make_index.sh
│   │   ├── make_mfcc.sh
│   │   ├── make_mfcc_pitch.sh
│   │   ├── make_mfcc_pitch_online.sh
│   │   ├── make_phone_graph.sh
│   │   ├── make_plp.sh
│   │   ├── make_plp_pitch.sh
│   │   ├── nnet/
│   │   │   ├── align.sh
│   │   │   ├── decode.sh
│   │   │   ├── ivector/
│   │   │   │   ├── extract_ivectors.sh
│   │   │   │   ├── train_diag_ubm.sh
│   │   │   │   └── train_ivector_extractor.sh
│   │   │   ├── make_bn_feats.sh
│   │   │   ├── make_denlats.sh
│   │   │   ├── make_fmllr_feats.sh
│   │   │   ├── make_fmmi_feats.sh
│   │   │   ├── make_priors.sh
│   │   │   ├── pretrain_dbn.sh
│   │   │   ├── train.sh
│   │   │   ├── train_mmi.sh
│   │   │   ├── train_mpe.sh
│   │   │   └── train_scheduler.sh
│   │   ├── nnet2/
│   │   │   ├── adjust_priors.sh
│   │   │   ├── align.sh
│   │   │   ├── check_ivectors_compatible.sh
│   │   │   ├── convert_lda_to_raw.sh
│   │   │   ├── convert_nnet1_to_nnet2.sh
│   │   │   ├── create_appended_model.sh
│   │   │   ├── decode.sh
│   │   │   ├── dump_bottleneck_features.sh
│   │   │   ├── get_egs.sh
│   │   │   ├── get_egs2.sh
│   │   │   ├── get_egs_discriminative2.sh
│   │   │   ├── get_ivector_id.sh
│   │   │   ├── get_lda.sh
│   │   │   ├── get_lda_block.sh
│   │   │   ├── get_perturbed_feats.sh
│   │   │   ├── make_denlats.sh
│   │   │   ├── make_multisplice_configs.py
│   │   │   ├── relabel_egs.sh
│   │   │   ├── relabel_egs2.sh
│   │   │   ├── remove_egs.sh
│   │   │   ├── retrain_fast.sh
│   │   │   ├── retrain_simple2.sh
│   │   │   ├── retrain_tanh.sh
│   │   │   ├── train_block.sh
│   │   │   ├── train_convnet_accel2.sh
│   │   │   ├── train_discriminative.sh
│   │   │   ├── train_discriminative2.sh
│   │   │   ├── train_discriminative_multilang2.sh
│   │   │   ├── train_more.sh
│   │   │   ├── train_more2.sh
│   │   │   ├── train_multilang2.sh
│   │   │   ├── train_multisplice_accel2.sh
│   │   │   ├── train_multisplice_ensemble.sh
│   │   │   ├── train_pnorm.sh
│   │   │   ├── train_pnorm_accel2.sh
│   │   │   ├── train_pnorm_bottleneck_fast.sh
│   │   │   ├── train_pnorm_ensemble.sh
│   │   │   ├── train_pnorm_fast.sh
│   │   │   ├── train_pnorm_multisplice.sh
│   │   │   ├── train_pnorm_multisplice2.sh
│   │   │   ├── train_pnorm_simple.sh
│   │   │   ├── train_pnorm_simple2.sh
│   │   │   ├── train_tanh.sh
│   │   │   ├── train_tanh_bottleneck.sh
│   │   │   ├── train_tanh_fast.sh
│   │   │   └── update_nnet.sh
│   │   ├── nnet3/
│   │   │   ├── adjust_priors.sh
│   │   │   ├── align.sh
│   │   │   ├── align_lats.sh
│   │   │   ├── chain/
│   │   │   │   ├── align_lats.sh
│   │   │   │   ├── build_tree.sh
│   │   │   │   ├── build_tree_multiple_sources.sh
│   │   │   │   ├── e2e/
│   │   │   │   │   ├── README.txt
│   │   │   │   │   ├── compute_biphone_stats.py
│   │   │   │   │   ├── get_egs_e2e.sh
│   │   │   │   │   ├── prepare_e2e.sh
│   │   │   │   │   ├── text_to_phones.py
│   │   │   │   │   └── train_e2e.py
│   │   │   │   ├── gen_topo.pl
│   │   │   │   ├── gen_topo.py
│   │   │   │   ├── gen_topo2.py
│   │   │   │   ├── gen_topo3.py
│   │   │   │   ├── gen_topo4.py
│   │   │   │   ├── gen_topo5.py
│   │   │   │   ├── gen_topo_orig.py
│   │   │   │   ├── get_egs.sh
│   │   │   │   ├── get_model_context.sh
│   │   │   │   ├── get_phone_post.sh
│   │   │   │   ├── make_weighted_den_fst.sh
│   │   │   │   ├── multilingual/
│   │   │   │   │   └── combine_egs.sh
│   │   │   │   ├── train.py
│   │   │   │   └── train_tdnn.sh
│   │   │   ├── chain2/
│   │   │   │   ├── combine_egs.sh
│   │   │   │   ├── compute_preconditioning_matrix.sh
│   │   │   │   ├── get_raw_egs.sh
│   │   │   │   ├── internal/
│   │   │   │   │   ├── get_best_model.sh
│   │   │   │   │   └── get_train_schedule.py
│   │   │   │   ├── process_egs.sh
│   │   │   │   ├── randomize_egs.sh
│   │   │   │   ├── train.sh
│   │   │   │   ├── validate_processed_egs.sh
│   │   │   │   ├── validate_randomized_egs.sh
│   │   │   │   └── validate_raw_egs.sh
│   │   │   ├── components.py
│   │   │   ├── compute_output.sh
│   │   │   ├── convert_nnet2_to_nnet3.py
│   │   │   ├── decode.sh
│   │   │   ├── decode_grammar.sh
│   │   │   ├── decode_lookahead.sh
│   │   │   ├── decode_looped.sh
│   │   │   ├── decode_score_fusion.sh
│   │   │   ├── decode_semisup.sh
│   │   │   ├── dot/
│   │   │   │   ├── descriptor_parser.py
│   │   │   │   └── nnet3_to_dot.py
│   │   │   ├── get_degs.sh
│   │   │   ├── get_egs.sh
│   │   │   ├── get_egs_discriminative.sh
│   │   │   ├── get_egs_targets.sh
│   │   │   ├── get_saturation.pl
│   │   │   ├── get_successful_models.py
│   │   │   ├── lstm/
│   │   │   │   ├── make_configs.py
│   │   │   │   └── train.sh
│   │   │   ├── make_bottleneck_features.sh
│   │   │   ├── make_denlats.sh
│   │   │   ├── make_tdnn_configs.py
│   │   │   ├── multilingual/
│   │   │   │   ├── allocate_multilingual_examples.py
│   │   │   │   └── combine_egs.sh
│   │   │   ├── nnet3_to_dot.sh
│   │   │   ├── report/
│   │   │   │   ├── convert_model.py
│   │   │   │   ├── generate_plots.py
│   │   │   │   └── summarize_compute_debug_timing.py
│   │   │   ├── tdnn/
│   │   │   │   ├── make_configs.py
│   │   │   │   ├── train.sh
│   │   │   │   └── train_raw_nnet.sh
│   │   │   ├── train_discriminative.sh
│   │   │   ├── train_dnn.py
│   │   │   ├── train_raw_dnn.py
│   │   │   ├── train_raw_rnn.py
│   │   │   ├── train_rnn.py
│   │   │   ├── train_tdnn.sh
│   │   │   ├── xconfig_to_config.py
│   │   │   └── xconfig_to_configs.py
│   │   ├── online/
│   │   │   ├── decode.sh
│   │   │   ├── nnet2/
│   │   │   │   ├── align.sh
│   │   │   │   ├── copy_data_dir.sh
│   │   │   │   ├── copy_ivector_dir.sh
│   │   │   │   ├── decode.sh
│   │   │   │   ├── dump_nnet_activations.sh
│   │   │   │   ├── extract_ivectors.sh
│   │   │   │   ├── extract_ivectors_online.sh
│   │   │   │   ├── get_egs.sh
│   │   │   │   ├── get_egs2.sh
│   │   │   │   ├── get_egs_discriminative2.sh
│   │   │   │   ├── get_pca_transform.sh
│   │   │   │   ├── make_denlats.sh
│   │   │   │   ├── prepare_online_decoding.sh
│   │   │   │   ├── prepare_online_decoding_retrain.sh
│   │   │   │   ├── prepare_online_decoding_transfer.sh
│   │   │   │   ├── train_diag_ubm.sh
│   │   │   │   └── train_ivector_extractor.sh
│   │   │   ├── nnet3/
│   │   │   │   ├── decode.sh
│   │   │   │   ├── decode_wake_word.sh
│   │   │   │   └── prepare_online_decoding.sh
│   │   │   └── prepare_online_decoding.sh
│   │   ├── oracle_wer.sh
│   │   ├── overlap/
│   │   │   ├── get_overlap_segments.py
│   │   │   ├── get_overlap_targets.py
│   │   │   ├── output_to_rttm.py
│   │   │   ├── post_process_output.sh
│   │   │   └── prepare_overlap_graph.py
│   │   ├── paste_feats.sh
│   │   ├── pytorchnn/
│   │   │   ├── check_py.py
│   │   │   ├── compute_sentence_scores.py
│   │   │   ├── data.py
│   │   │   ├── lmrescore_nbest_pytorchnn.sh
│   │   │   ├── model.py
│   │   │   └── train.py
│   │   ├── resegment_data.sh
│   │   ├── resegment_text.sh
│   │   ├── rnnlmrescore.sh
│   │   ├── scoring/
│   │   │   ├── score_kaldi_cer.sh
│   │   │   ├── score_kaldi_compare.sh
│   │   │   └── score_kaldi_wer.sh
│   │   ├── search_index.sh
│   │   ├── segmentation/
│   │   │   ├── ali_to_targets.sh
│   │   │   ├── combine_targets_dirs.sh
│   │   │   ├── convert_targets_dir_to_whole_recording.sh
│   │   │   ├── convert_utt2spk_and_segments_to_rttm.py
│   │   │   ├── copy_targets_dir.sh
│   │   │   ├── decode_sad.sh
│   │   │   ├── detect_speech_activity.sh
│   │   │   ├── evaluate_segmentation.pl
│   │   │   ├── get_targets_for_out_of_segments.sh
│   │   │   ├── internal/
│   │   │   │   ├── arc_info_to_targets.py
│   │   │   │   ├── find_oov_phone.py
│   │   │   │   ├── get_default_targets_for_out_of_segments.py
│   │   │   │   ├── get_transform_probs_mat.py
│   │   │   │   ├── merge_segment_targets_to_recording.py
│   │   │   │   ├── merge_targets.py
│   │   │   │   ├── prepare_sad_graph.py
│   │   │   │   ├── resample_targets.py
│   │   │   │   ├── sad_to_segments.py
│   │   │   │   └── verify_phones_list.py
│   │   │   ├── lats_to_targets.sh
│   │   │   ├── merge_targets_dirs.sh
│   │   │   ├── post_process_sad_to_segments.sh
│   │   │   ├── prepare_targets_gmm.sh
│   │   │   ├── resample_targets_dir.sh
│   │   │   └── validate_targets_dir.sh
│   │   ├── select_feats.sh
│   │   ├── shift_feats.sh
│   │   ├── subset_ali_dir.sh
│   │   ├── tandem/
│   │   │   ├── align_fmllr.sh
│   │   │   ├── align_sgmm2.sh
│   │   │   ├── align_si.sh
│   │   │   ├── decode.sh
│   │   │   ├── decode_fmllr.sh
│   │   │   ├── decode_sgmm2.sh
│   │   │   ├── make_denlats.sh
│   │   │   ├── make_denlats_sgmm2.sh
│   │   │   ├── mk_aslf_lda_mllt.sh
│   │   │   ├── mk_aslf_sgmm2.sh
│   │   │   ├── train_deltas.sh
│   │   │   ├── train_lda_mllt.sh
│   │   │   ├── train_mllt.sh
│   │   │   ├── train_mmi.sh
│   │   │   ├── train_mmi_sgmm2.sh
│   │   │   ├── train_mono.sh
│   │   │   ├── train_sat.sh
│   │   │   ├── train_sgmm2.sh
│   │   │   └── train_ubm.sh
│   │   ├── tfrnnlm/
│   │   │   ├── check_py.py
│   │   │   ├── check_tensorflow_installed.sh
│   │   │   ├── lmrescore_rnnlm_lat.sh
│   │   │   ├── lmrescore_rnnlm_lat_pruned.sh
│   │   │   ├── lstm.py
│   │   │   ├── lstm_fast.py
│   │   │   ├── reader.py
│   │   │   └── vanilla_rnnlm.py
│   │   ├── train_deltas.sh
│   │   ├── train_diag_ubm.sh
│   │   ├── train_lda_mllt.sh
│   │   ├── train_lvtln.sh
│   │   ├── train_map.sh
│   │   ├── train_mmi.sh
│   │   ├── train_mmi_fmmi.sh
│   │   ├── train_mmi_fmmi_indirect.sh
│   │   ├── train_mmi_sgmm2.sh
│   │   ├── train_mono.sh
│   │   ├── train_mpe.sh
│   │   ├── train_quick.sh
│   │   ├── train_raw_sat.sh
│   │   ├── train_sat.sh
│   │   ├── train_sat_basis.sh
│   │   ├── train_segmenter.sh
│   │   ├── train_sgmm2.sh
│   │   ├── train_sgmm2_group.sh
│   │   ├── train_smbr.sh
│   │   ├── train_ubm.sh
│   │   └── word_align_lattices.sh
│   └── utils/
│       ├── add_disambig.pl
│       ├── add_lex_disambig.pl
│       ├── analyze_segments.pl
│       ├── apply_map.pl
│       ├── best_wer.sh
│       ├── build_const_arpa_lm.sh
│       ├── combine_data.sh
│       ├── convert_slf.pl
│       ├── convert_slf_parallel.sh
│       ├── copy_data_dir.sh
│       ├── create_data_link.pl
│       ├── create_split_dir.pl
│       ├── ctm/
│       │   ├── convert_ctm.pl
│       │   ├── fix_ctm.sh
│       │   └── resolve_ctm_overlaps.py
│       ├── data/
│       │   ├── combine_short_segments.sh
│       │   ├── convert_data_dir_to_whole.sh
│       │   ├── extend_segment_times.py
│       │   ├── extract_wav_segments_data_dir.sh
│       │   ├── fix_subsegment_feats.pl
│       │   ├── get_allowed_durations.py
│       │   ├── get_frame_shift.sh
│       │   ├── get_num_frames.sh
│       │   ├── get_reco2dur.sh
│       │   ├── get_reco2utt_for_data.sh
│       │   ├── get_segments_for_data.sh
│       │   ├── get_uniform_subsegments.py
│       │   ├── get_utt2dur.sh
│       │   ├── get_utt2num_frames.sh
│       │   ├── internal/
│       │   │   ├── choose_utts_to_combine.py
│       │   │   ├── combine_segments_to_recording.py
│       │   │   ├── modify_speaker_info.py
│       │   │   └── perturb_volume.py
│       │   ├── limit_feature_dim.sh
│       │   ├── modify_speaker_info.sh
│       │   ├── modify_speaker_info_to_recording.sh
│       │   ├── normalize_data_range.pl
│       │   ├── perturb_data_dir_speed_3way.sh
│       │   ├── perturb_data_dir_volume.sh
│       │   ├── perturb_speed_to_allowed_lengths.py
│       │   ├── remove_dup_utts.sh
│       │   ├── resample_data_dir.sh
│       │   ├── shift_and_combine_feats.sh
│       │   ├── shift_feats.sh
│       │   └── subsegment_data_dir.sh
│       ├── dict_dir_add_pronprobs.sh
│       ├── eps2disambig.pl
│       ├── filt.py
│       ├── filter_scp.pl
│       ├── filter_scps.pl
│       ├── find_arpa_oovs.pl
│       ├── fix_data_dir.sh
│       ├── format_lm.sh
│       ├── format_lm_sri.sh
│       ├── gen_topo.pl
│       ├── int2sym.pl
│       ├── kwslist_post_process.pl
│       ├── lang/
│       │   ├── add_unigrams_arpa.pl
│       │   ├── adjust_unk_arpa.pl
│       │   ├── adjust_unk_graph.sh
│       │   ├── bpe/
│       │   │   ├── add_final_optional_silence.sh
│       │   │   ├── apply_bpe.py
│       │   │   ├── bidi.py
│       │   │   ├── learn_bpe.py
│       │   │   ├── prepend_words.py
│       │   │   └── reverse.py
│       │   ├── check_g_properties.pl
│       │   ├── check_phones_compatible.sh
│       │   ├── compute_sentence_probs_arpa.py
│       │   ├── extend_lang.sh
│       │   ├── get_word_position_phone_map.pl
│       │   ├── grammar/
│       │   │   ├── augment_phones_txt.py
│       │   │   └── augment_words_txt.py
│       │   ├── internal/
│       │   │   ├── apply_unk_lm.sh
│       │   │   ├── arpa2fst_constrained.py
│       │   │   └── modify_unk_pron.py
│       │   ├── limit_arpa_unk_history.py
│       │   ├── make_kn_lm.py
│       │   ├── make_lexicon_fst.py
│       │   ├── make_lexicon_fst_silprob.py
│       │   ├── make_phone_bigram_lang.sh
│       │   ├── make_phone_lm.py
│       │   ├── make_position_dependent_subword_lexicon.py
│       │   ├── make_subword_lexicon_fst.py
│       │   ├── make_unk_lm.sh
│       │   └── validate_disambig_sym_file.pl
│       ├── ln.pl
│       ├── make_absolute.sh
│       ├── make_lexicon_fst.pl
│       ├── make_lexicon_fst_silprob.pl
│       ├── make_unigram_grammar.pl
│       ├── map_arpa_lm.pl
│       ├── mkgraph.sh
│       ├── mkgraph_lookahead.sh
│       ├── nnet/
│       │   ├── gen_dct_mat.py
│       │   ├── gen_hamm_mat.py
│       │   ├── gen_splice.py
│       │   ├── make_blstm_proto.py
│       │   ├── make_cnn_proto.py
│       │   ├── make_lstm_proto.py
│       │   ├── make_nnet_proto.py
│       │   └── subset_data_tr_cv.sh
│       ├── nnet-cpu/
│       │   ├── make_nnet_config.pl
│       │   ├── make_nnet_config_block.pl
│       │   ├── make_nnet_config_preconditioned.pl
│       │   └── update_learning_rates.pl
│       ├── nnet3/
│       │   └── convert_config_tdnn_to_affine.py
│       ├── parallel/
│       │   ├── limit_num_gpus.sh
│       │   ├── pbs.pl
│       │   ├── queue.pl
│       │   ├── retry.pl
│       │   ├── run.pl
│       │   └── slurm.pl
│       ├── parse_options.sh
│       ├── perturb_data_dir_speed.sh
│       ├── pinyin_map.pl
│       ├── prepare_extended_lang.sh
│       ├── prepare_lang.sh
│       ├── prepare_online_nnet_dist_build.sh
│       ├── remove_data_links.sh
│       ├── remove_oovs.pl
│       ├── reverse_arpa.py
│       ├── rnnlm_compute_scores.sh
│       ├── s2eps.pl
│       ├── scoring/
│       │   ├── wer_ops_details.pl
│       │   ├── wer_per_spk_details.pl
│       │   ├── wer_per_utt_details.pl
│       │   └── wer_report.pl
│       ├── segmentation.pl
│       ├── show_lattice.sh
│       ├── shuffle_list.pl
│       ├── spk2utt_to_utt2spk.pl
│       ├── split_data.sh
│       ├── split_scp.pl
│       ├── ssh.pl
│       ├── subset_data_dir.sh
│       ├── subset_scp.pl
│       ├── subword/
│       │   ├── prepare_lang_subword.sh
│       │   └── prepare_subword_text.sh
│       ├── summarize_logs.pl
│       ├── summarize_warnings.pl
│       ├── sym2int.pl
│       ├── utt2spk_to_spk2utt.pl
│       ├── validate_data_dir.sh
│       ├── validate_dict_dir.pl
│       ├── validate_lang.pl
│       ├── validate_text.pl
│       └── write_kwslist.pl
├── env/
│   └── build_env.sh
├── kaldi
├── lm/
│   ├── __init__.py
│   ├── chainer_backend/
│   │   ├── __init__.py
│   │   ├── extlm.py
│   │   └── lm.py
│   ├── lm_utils.py
│   └── pytorch_backend/
│       ├── __init__.py
│       ├── extlm.py
│       └── lm.py
├── mt/
│   ├── __init__.py
│   ├── mt_utils.py
│   └── pytorch_backend/
│       ├── __init__.py
│       └── mt.py
├── nets/
│   ├── __init__.py
│   ├── asr_interface.py
│   ├── batch_beam_search.py
│   ├── batch_beam_search_online_sim.py
│   ├── beam_search.py
│   ├── beam_search_transducer.py
│   ├── chainer_backend/
│   │   ├── __init__.py
│   │   ├── asr_interface.py
│   │   ├── ctc.py
│   │   ├── deterministic_embed_id.py
│   │   ├── e2e_asr.py
│   │   ├── e2e_asr_transformer.py
│   │   ├── nets_utils.py
│   │   ├── rnn/
│   │   │   ├── __init__.py
│   │   │   ├── attentions.py
│   │   │   ├── decoders.py
│   │   │   ├── encoders.py
│   │   │   └── training.py
│   │   └── transformer/
│   │       ├── __init__.py
│   │       ├── attention.py
│   │       ├── ctc.py
│   │       ├── decoder.py
│   │       ├── decoder_layer.py
│   │       ├── embedding.py
│   │       ├── encoder.py
│   │       ├── encoder_layer.py
│   │       ├── label_smoothing_loss.py
│   │       ├── layer_norm.py
│   │       ├── mask.py
│   │       ├── positionwise_feed_forward.py
│   │       ├── subsampling.py
│   │       └── training.py
│   ├── ctc_prefix_score.py
│   ├── e2e_asr_common.py
│   ├── e2e_mt_common.py
│   ├── lm_interface.py
│   ├── mt_interface.py
│   ├── pytorch_backend/
│   │   ├── __init__.py
│   │   ├── conformer/
│   │   │   ├── __init__.py
│   │   │   ├── argument.py
│   │   │   ├── convolution.py
│   │   │   ├── encoder.py
│   │   │   ├── encoder_layer.py
│   │   │   └── swish.py
│   │   ├── ctc.py
│   │   ├── e2e_asr.py
│   │   ├── e2e_asr_conformer.py
│   │   ├── e2e_asr_maskctc.py
│   │   ├── e2e_asr_mix.py
│   │   ├── e2e_asr_mix_transformer.py
│   │   ├── e2e_asr_mulenc.py
│   │   ├── e2e_asr_transducer.py
│   │   ├── e2e_asr_transducer_cs.py
│   │   ├── e2e_asr_transformer.py
│   │   ├── e2e_mt.py
│   │   ├── e2e_mt_transformer.py
│   │   ├── e2e_st.py
│   │   ├── e2e_st_conformer.py
│   │   ├── e2e_st_transformer.py
│   │   ├── e2e_tts_fastspeech.py
│   │   ├── e2e_tts_tacotron2.py
│   │   ├── e2e_tts_transformer.py
│   │   ├── e2e_vc_tacotron2.py
│   │   ├── e2e_vc_transformer.py
│   │   ├── fastspeech/
│   │   │   ├── __init__.py
│   │   │   ├── duration_calculator.py
│   │   │   ├── duration_predictor.py
│   │   │   └── length_regulator.py
│   │   ├── frontends/
│   │   │   ├── __init__.py
│   │   │   ├── beamformer.py
│   │   │   ├── dnn_beamformer.py
│   │   │   ├── dnn_wpe.py
│   │   │   ├── feature_transform.py
│   │   │   ├── frontend.py
│   │   │   └── mask_estimator.py
│   │   ├── gtn_ctc.py
│   │   ├── initialization.py
│   │   ├── lm/
│   │   │   ├── __init__.py
│   │   │   ├── default.py
│   │   │   ├── seq_rnn.py
│   │   │   └── transformer.py
│   │   ├── maskctc/
│   │   │   ├── __init__.py
│   │   │   ├── add_mask_token.py
│   │   │   └── mask.py
│   │   ├── nets_utils.py
│   │   ├── rnn/
│   │   │   ├── __init__.py
│   │   │   ├── argument.py
│   │   │   ├── attentions.py
│   │   │   ├── decoders.py
│   │   │   └── encoders.py
│   │   ├── streaming/
│   │   │   ├── __init__.py
│   │   │   ├── segment.py
│   │   │   └── window.py
│   │   ├── tacotron2/
│   │   │   ├── __init__.py
│   │   │   ├── cbhg.py
│   │   │   ├── decoder.py
│   │   │   └── encoder.py
│   │   ├── transducer/
│   │   │   ├── __init__.py
│   │   │   ├── arguments.py
│   │   │   ├── auxiliary_task.py
│   │   │   ├── blocks.py
│   │   │   ├── causal_conv1d.py
│   │   │   ├── custom_decoder.py
│   │   │   ├── custom_encoder.py
│   │   │   ├── error_calculator.py
│   │   │   ├── initializer.py
│   │   │   ├── joint_network.py
│   │   │   ├── loss.py
│   │   │   ├── rnn_decoder.py
│   │   │   ├── rnn_encoder.py
│   │   │   ├── tdnn.py
│   │   │   ├── transformer_decoder_layer.py
│   │   │   ├── utils.py
│   │   │   └── vgg2l.py
│   │   ├── transformer/
│   │   │   ├── __init__.py
│   │   │   ├── add_sos_eos.py
│   │   │   ├── argument.py
│   │   │   ├── attention.py
│   │   │   ├── contextual_block_encoder_layer.py
│   │   │   ├── decoder.py
│   │   │   ├── decoder_layer.py
│   │   │   ├── dynamic_conv.py
│   │   │   ├── dynamic_conv2d.py
│   │   │   ├── embedding.py
│   │   │   ├── encoder.py
│   │   │   ├── encoder_layer.py
│   │   │   ├── encoder_mix.py
│   │   │   ├── initializer.py
│   │   │   ├── label_smoothing_loss.py
│   │   │   ├── layer_norm.py
│   │   │   ├── lightconv.py
│   │   │   ├── lightconv2d.py
│   │   │   ├── mask.py
│   │   │   ├── multi_layer_conv.py
│   │   │   ├── optimizer.py
│   │   │   ├── plot.py
│   │   │   ├── positionwise_feed_forward.py
│   │   │   ├── repeat.py
│   │   │   ├── sgd_optimizer.py
│   │   │   ├── subsampling.py
│   │   │   └── subsampling_without_posenc.py
│   │   └── wavenet.py
│   ├── scorer_interface.py
│   ├── scorers/
│   │   ├── .mmi_rnnt_scorer.py.swp
│   │   ├── __init__.py
│   │   ├── _mmi_utils.py
│   │   ├── ctc.py
│   │   ├── ctc_rnnt_scorer.py
│   │   ├── length_bonus.py
│   │   ├── lookahead.py
│   │   ├── mmi.py
│   │   ├── mmi_alignment_score.py
│   │   ├── mmi_frame_prefix_scorer.py
│   │   ├── mmi_frame_scorer.py
│   │   ├── mmi_frame_scorer_trace.py
│   │   ├── mmi_lookahead.py
│   │   ├── mmi_lookahead_bak.py
│   │   ├── mmi_lookahead_split.py
│   │   ├── mmi_prefix_score.py
│   │   ├── mmi_rescorer.py
│   │   ├── mmi_rnnt_lookahead_scorer.py
│   │   ├── mmi_rnnt_scorer.py
│   │   ├── mmi_utils.py
│   │   ├── new_mmi_frame_scorer.py
│   │   ├── ngram.py
│   │   ├── sorted_matcher.py
│   │   ├── test.py
│   │   ├── tlg_scorer.py
│   │   ├── trace_frame.py
│   │   └── word_ngram.py
│   ├── st_interface.py
│   ├── transducer_decoder_interface.py
│   └── tts_interface.py
├── optimizer/
│   ├── __init__.py
│   ├── chainer.py
│   ├── factory.py
│   ├── parser.py
│   └── pytorch.py
├── scheduler/
│   ├── __init__.py
│   ├── chainer.py
│   ├── pytorch.py
│   └── scheduler.py
├── snowfall/
│   ├── __init__.py
│   ├── common.py
│   ├── data/
│   │   ├── __init__.py
│   │   ├── aishell.py
│   │   ├── asr_datamodule.py
│   │   ├── datamodule.py
│   │   └── librispeech.py
│   ├── decoding/
│   │   ├── __init__.py
│   │   ├── graph.py
│   │   └── lm_rescore.py
│   ├── dist.py
│   ├── lexicon.py
│   ├── models/
│   │   ├── __init__.py
│   │   ├── conformer.py
│   │   ├── contextnet.py
│   │   ├── interface.py
│   │   ├── tdnn.py
│   │   ├── tdnn_lstm.py
│   │   ├── tdnnf.py
│   │   └── transformer.py
│   ├── objectives/
│   │   ├── __init__.py
│   │   ├── common.py
│   │   ├── ctc.py
│   │   └── mmi.py
│   ├── training/
│   │   ├── __init__.py
│   │   ├── ctc_graph.py
│   │   ├── diagnostics.py
│   │   ├── mmi_graph.py
│   │   └── mmi_mbr_graph.py
│   └── warpper/
│       ├── k2_decode.py
│       ├── mmi_test.py
│       ├── mmi_utils.py
│       ├── prefix_scorer.py
│       ├── warpper_ctc.py
│       └── warpper_mmi.py
├── st/
│   ├── __init__.py
│   └── pytorch_backend/
│       ├── __init__.py
│       └── st.py
├── transform/
│   ├── __init__.py
│   ├── add_deltas.py
│   ├── channel_selector.py
│   ├── cmvn.py
│   ├── functional.py
│   ├── perturb.py
│   ├── spec_augment.py
│   ├── spectrogram.py
│   ├── transform_interface.py
│   ├── transformation.py
│   └── wpe.py
├── tts/
│   ├── __init__.py
│   └── pytorch_backend/
│       ├── __init__.py
│       └── tts.py
├── utils/
│   ├── __init__.py
│   ├── bmuf.py
│   ├── check_kwargs.py
│   ├── cli_readers.py
│   ├── cli_utils.py
│   ├── cli_writers.py
│   ├── dataset.py
│   ├── deterministic_utils.py
│   ├── draw_num_fst.py
│   ├── dynamic_import.py
│   ├── fill_missing_args.py
│   ├── io_utils.py
│   ├── parse_decoding_process.py
│   ├── parse_npy.py
│   ├── print.py
│   ├── rtf_calculator.py
│   ├── sampler.py
│   ├── spec_augment.py
│   └── training/
│       ├── __init__.py
│       ├── batchfy.py
│       ├── evaluator.py
│       ├── iterators.py
│       ├── tensorboard_logger.py
│       └── train_utils.py
├── vc/
│   └── pytorch_backend/
│       └── vc.py
└── version.txt

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
*.pyc
interface


================================================
FILE: README.md
================================================
# End-to-end speech recognition toolkit
This is an E2E ASR toolkit modified from Espnet1 (version 0.9.9).  
If this repository helps you, we would appreciate it if you could star it and cite our papers.

This is the official implementation of the following papers:  
[**Consistent Training and Decoding For End-to-end Speech Recognition Using Lattice-free MMI**](https://ieeexplore.ieee.org/document/9746579/) (Accepted by ICASSP 2022)  
[**Improving Mandarin End-to-End Speech Recognition with Word N-gram Language Model**](https://ieeexplore.ieee.org/document/9721084) (Accepted by SPL)  
[**Integrate Lattice-Free MMI into End-to-End Speech Recognition**](https://arxiv.org/abs/2203.15614) (Submitted to TASLP) 


We achieve state-of-the-art results on two of the most popular Mandarin datasets: Aishell-1 and Aishell-2.  
Please feel free to change / modify the code as you like. :)
### Update
- 2021/12/29: Release the first version, which contains all MMI-related features, including MMI training criteria, MMI Prefix Score (for attention-based encoder-decoder, AED) and MMI Alignment Score (For neural transducer, NT).
- 2022/1/6: Release the word-level N-gram LM scorer.
- 2022/1/12: We update the instructions to build the environment. We also release the trained NT model for Aishell-1 for quick performance check. We update the guideline to run our code.
- 2022/3/29: We release a new CTC / RNN-T recipe for the code-switching problem based on the ASRU 2019 Mandarin-English code-switch dataset (see egs/asrucs); results on Aishell-1 and Aishell-2 are also updated.

### Environment:
The main dependencies of this code can be divided into three parts: `kaldi`, `espnet` and `k2`  
Please follow the instructions in [build_env.sh](https://github.com/jctian98/e2e_lfmmi/blob/master/env/build_env.sh) to build the environment.  
Note that the script cannot be run automatically; you need to run it line by line.
### Results
Currently we have released examples on Aishell-1 and Aishell-2 datasets.  
With MMI training & decoding methods and the word-level N-gram LM, we achieve the results on Aishell-1 and Aishell-2 shown below. All results are in CER%.  
The model file of Aishell-1 NT system is [here](https://drive.google.com/file/d/1VE2YtLb70UpQkeGWE8WhHJl7sSwNa_zG/view?usp=sharing) for quick performance check.

|  Test set                      | Aishell-1-dev | Aishell-1-test | Aishell-2-ios | Aishell-2-android | Aishell-2-mic |  
|  :----                         | :-: | :--: | :-: | :-----: | :-: |
| AED                            | 4.60| 5.07 | 5.72| 6.60    | 6.58| 
| AED + MMI + Word Ngram         | 4.08| 4.45 | 5.15| 5.92    | 5.77|
| NT                             | 4.41| 4.82 | 5.81| 6.52    | 6.52|
| NT + MMI + Word Ngram          | 3.79| 4.10 | 5.02| 5.85    | 5.66|
 
### Get Start
Take Aishell-1 as an example. The working process for the other examples is very similar.  
step 1: clone the code and link kaldi
```
conda activate lfmmi
git clone https://github.com/jctian98/e2e_lfmmi E2E-ASR-Framework # clone and RENAME
cd E2E-ASR-Framework
ln -s <path-to-kaldi> kaldi                                       # link kaldi
```
step 2: prepare data, lexicon and LMs. Before you run, please set the datadir in `prepare.sh`
```
cd egs/aishell1
bash prepare.sh 
```
step 3: model training. You should split the data before starting the training.  
You can skip this step and download our trained model [here](https://drive.google.com/file/d/1VE2YtLb70UpQkeGWE8WhHJl7sSwNa_zG/view?usp=sharing)
```
python3 espnet_utils/splitjson.py -p <ngpu> dump/train_sp/deltafalse/data.json
bash nt.sh --stop_stage 1
```
step 4: decode 
```
bash nt.sh --stage 2 --mmi-weight 0.2 --word-ngram-weight 0.4
```
Several hints:
1. Please change the paths in `path.sh` accordingly before you start
2. Please change the `data` to config your data path in `prepare.sh`
3. Our code runs in DDP style and requires some global variables. Before you start, you need to set them manually. We assume Pytorch distributed API works well on your machine.  
```
export HOST_GPU_NUM=x       # number of GPUs on each host
export HOST_NUM=x           # number of hosts
export NODE_NUM=x           # number of GPUs in total (on all hosts)
export INDEX=x              # index of this host
export CHIEF_IP=xx.xx.xx.xx # IP of the master host
```
4. You may encounter some problems with `k2`. Try deleting `data/lang_phone/Linv.pt` (in training) and `data/word_3gram/G.pt` (in decoding) and regenerating them. 
5. Multiple choices are available during decoding (we take `nt.sh` as an example, but the usage of `aed.sh` is the same).  
   To use the MMI-related scorers, you need to train the model with the MMI auxiliary criterion;  
   
  To use MMI Prefix Score (in AED) or MMI Alignment score (in NT):
  ```
  bash nt.sh --stage 2 --mmi-weight 0.2
  ```
  To use any external LM, you need to train them in advance (as implemented in `prepare.sh`)  
  
  To use word-level N-gram LM:
  ```
  bash nt.sh --stage 2 --word-ngram-weight 0.4
  ```
  To use character-level N-gram LM:
  ```
  bash nt.sh --stage 2 --ngram-weight 1.0
  ```
  To use neural network LM:
  ```
  bash nt.sh --stage 2 --lm-weight 1.0
  ```
### Reference
kaldi: https://github.com/kaldi-asr/kaldi  
ESPnet: https://github.com/espnet/espnet  
k2-fsa: https://github.com/k2-fsa/k2  
### Citations
```
@INPROCEEDINGS{9746579,
  author={Tian, Jinchuan and Yu, Jianwei and Weng, Chao and Zhang, Shi-Xiong and Su, Dan and Yu, Dong and Zou, Yuexian},
  booktitle={ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, 
  title={Consistent Training and Decoding for End-to-End Speech Recognition Using Lattice-Free MMI}, 
  year={2022},
  volume={},
  number={},
  pages={7782-7786},
  doi={10.1109/ICASSP43922.2022.9746579}}

@ARTICLE{9721084,
  author={Tian, Jinchuan and Yu, Jianwei and Weng, Chao and Zou, Yuexian and Yu, Dong},
  journal={IEEE Signal Processing Letters}, 
  title={Improving Mandarin End-to-End Speech Recognition with Word N-gram Language Model}, 
  year={2022},
  volume={},
  number={},
  pages={1-1},
  doi={10.1109/LSP.2022.3154241}}
  
@article{tian2022integrate,
  title={Integrate Lattice-Free MMI into End-to-End Speech Recognition},
  author={Tian, Jinchuan and Yu, Jianwei and Weng, Chao and Zou, Yuexian and Yu, Dong},
  journal={arXiv preprint arXiv:2203.15614},
  year={2022}
}
```
### Authorship
Jinchuan Tian;  tianjinchuan@stu.pku.edu.cn or tyriontian@tencent.com  
Jianwei Yu; tomasyu@tencent.com (supervisor)  
Chao Weng; cweng@tencent.com  
Yuexian Zou; zouyx@pku.edu.cn


================================================
FILE: __init__.py
================================================
"""Initialize espnet package."""

import os
# Resolve version.txt relative to this file so the lookup does not depend on
# the current working directory.
dirname = os.path.dirname(__file__)
version_file = os.path.join(dirname, "version.txt")
# Expose the whitespace-stripped file contents as the package version string.
with open(version_file, "r") as f:
    __version__ = f.read().strip()


================================================
FILE: asr/__init__.py
================================================
"""Initialize sub package."""


================================================
FILE: asr/asr_mix_utils.py
================================================
#!/usr/bin/env python3

"""
This script is used to provide utility functions designed for multi-speaker ASR.

Copyright 2017 Johns Hopkins University (Shinji Watanabe)
 Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

Most functions can be directly used as in asr_utils.py:
    CompareValueTrigger, restore_snapshot, adadelta_eps_decay, chainer_load,
    torch_snapshot, torch_save, torch_resume, AttributeDict, get_model_conf.

"""

import copy
import logging
import os

from chainer.training import extension

import matplotlib

from espnet.asr.asr_utils import parse_hypothesis


matplotlib.use("Agg")


# * -------------------- chainer extension related -------------------- *
class PlotAttentionReport(extension.Extension):
    """Plot attention reporter.

    Args:
        att_vis_fn (espnet.nets.*_backend.e2e_asr.calculate_all_attentions):
            Function of attention visualization.
        data (list[tuple(str, dict[str, dict[str, Any]])]): List json utt key items.
        outdir (str): Directory to save figures.
        converter (espnet.asr.*_backend.asr.CustomConverter):
            CustomConverter object. Function to convert data.
        device (torch.device): The destination device to send tensor.
        reverse (bool): If True, input and output length are reversed.

    """

    def __init__(self, att_vis_fn, data, outdir, converter, device, reverse=False):
        """Initialize PlotAttentionReport.

        The given ``data`` is deep-copied, and ``outdir`` is created if it
        does not exist yet.
        """
        self.att_vis_fn = att_vis_fn
        self.data = copy.deepcopy(data)
        self.outdir = outdir
        self.converter = converter
        self.device = device
        self.reverse = reverse
        # exist_ok avoids the check-then-create race that occurs when several
        # processes construct a reporter for the same output directory.
        os.makedirs(self.outdir, exist_ok=True)

    def __call__(self, trainer):
        """Plot and save imaged matrix of att_ws.

        One PNG is written per (speaker, utterance) pair; the file name embeds
        the utterance ID, ``trainer.updater.epoch`` and the 1-based speaker
        index.

        Args:
            trainer: Object exposing ``updater.epoch``; used only to format
                the output file name.

        """
        att_ws_sd = self.get_attention_weights()
        for ns, att_ws in enumerate(att_ws_sd):
            for idx, att_w in enumerate(att_ws):
                # "%"-formatting fills in the static parts; the remaining
                # "{.updater.epoch}" field is resolved by str.format(trainer).
                filename = "%s/%s.ep.{.updater.epoch}.output%d.png" % (
                    self.outdir,
                    self.data[idx][0],
                    ns + 1,
                )
                att_w = self.get_attention_weight(idx, att_w, ns)
                self._plot_and_save_attention(att_w, filename.format(trainer))

    def log_attentions(self, logger, step):
        """Add image files of attention matrix to tensorboard.

        Args:
            logger: Object providing ``add_figure(tag, figure, step)``.
            step: Step value forwarded to ``logger.add_figure``.

        """
        att_ws_sd = self.get_attention_weights()
        for ns, att_ws in enumerate(att_ws_sd):
            for idx, att_w in enumerate(att_ws):
                att_w = self.get_attention_weight(idx, att_w, ns)
                plot = self.draw_attention_plot(att_w)
                # NOTE(review): the tag contains only the utterance ID, so the
                # figures of different speakers are logged under the same tag.
                logger.add_figure("%s" % (self.data[idx][0]), plot.gcf(), step)
                plot.clf()

    def get_attention_weights(self):
        """Return attention weights.

        Returns:
            arr_ws_sd (numpy.ndarray): attention weights (dtype=float). Its
                shape differs depending on the backend.
                * pytorch-> 1) multi-head case => (B, H, Lmax, Tmax). 2)
                  other case => (B, Lmax, Tmax).
                * chainer-> attention weights (B, Lmax, Tmax).

        """
        batch = self.converter([self.converter.transform(self.data)], self.device)
        att_ws_sd = self.att_vis_fn(*batch)
        return att_ws_sd

    def get_attention_weight(self, idx, att_w, spkr_idx):
        """Transform attention weight in regard to self.reverse.

        Args:
            idx (int): Index of the utterance in ``self.data``.
            att_w: Attention weights of that utterance.
            spkr_idx (int): Index into the utterance's "output" list.

        Returns:
            ``att_w`` cropped to the decoder/encoder lengths read from the
            "shape" entries of ``self.data``. A 3-dim input is cropped on its
            last two axes, any other input on its first two axes.

        """
        if self.reverse:
            dec_len = int(self.data[idx][1]["input"][0]["shape"][0])
            enc_len = int(self.data[idx][1]["output"][spkr_idx]["shape"][0])
        else:
            dec_len = int(self.data[idx][1]["output"][spkr_idx]["shape"][0])
            enc_len = int(self.data[idx][1]["input"][0]["shape"][0])
        if len(att_w.shape) == 3:
            att_w = att_w[:, :dec_len, :enc_len]
        else:
            att_w = att_w[:dec_len, :enc_len]
        return att_w

    def draw_attention_plot(self, att_w):
        """Visualize attention weights matrix.

        Args:
            att_w(Tensor): Attention weight matrix. A 3-dim input is drawn as
                one subplot per entry along its first axis.

        Returns:
            matplotlib.pyplot: pyplot object with attention matrix image.

        """
        import matplotlib.pyplot as plt

        if len(att_w.shape) == 3:
            for h, aw in enumerate(att_w, 1):
                plt.subplot(1, len(att_w), h)
                plt.imshow(aw, aspect="auto")
                plt.xlabel("Encoder Index")
                plt.ylabel("Decoder Index")
        else:
            plt.imshow(att_w, aspect="auto")
            plt.xlabel("Encoder Index")
            plt.ylabel("Decoder Index")
        plt.tight_layout()
        return plt

    def _plot_and_save_attention(self, att_w, filename):
        """Draw ``att_w``, save the figure to ``filename`` and close it."""
        plt = self.draw_attention_plot(att_w)
        plt.savefig(filename)
        plt.close()


def add_results_to_json(js, nbest_hyps_sd, char_list):
    """Build a result dict holding the N-best hypotheses of every speaker.

    Args:
        js (dict[str, Any]): Groundtruth utterance dict.
        nbest_hyps_sd (list[dict[str, Any]]):
            List of hypothesis for multi_speakers (# Utts x # Spkrs).
        char_list (list[str]): List of characters.

    Returns:
        dict[str, Any]: New dict with the "utt2spk" entry of ``js`` and an
            "output" list holding, per speaker, one dict per hypothesis.

    """
    # start from the speaker information of the ground truth
    result = {"utt2spk": js["utt2spk"], "output": []}

    for spkr_idx in range(len(nbest_hyps_sd)):
        spkr_entries = []

        for rank, hyp in enumerate(nbest_hyps_sd[spkr_idx], 1):
            rec_text, rec_token, rec_tokenid, score = parse_hypothesis(hyp, char_list)

            # shallow copy of the ground-truth entry of this speaker
            entry = dict(js["output"][spkr_idx].items())

            # mark the entry with its rank in the N-best list
            entry["name"] += "[%d]" % rank

            # attach the recognition results
            entry.update(
                rec_text=rec_text,
                rec_token=rec_token,
                rec_tokenid=rec_tokenid,
                score=score,
            )
            spkr_entries.append(entry)

            # only the best hypothesis is reported in the log
            if rank == 1:
                logging.info("groundtruth: %s" % entry["text"])
                logging.info("prediction : %s" % entry["rec_text"])

        result["output"].append(spkr_entries)
    return result


================================================
FILE: asr/asr_utils.py
================================================
# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
# Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

import argparse
import copy
import json
import logging
import os
import shutil
import tempfile
import numpy as np
import torch


# * -------------------- training iterator related -------------------- *


class CompareValueTrigger(object):
    """Trigger that fires when an observed value compares against the best one.

    Args:
        key (str) : Key of value.
        compare_fn ((float, float) -> bool) : Function to compare the values;
            called as ``compare_fn(best_value, current_value)``.
        trigger (tuple(int, str)) : Trigger that decide the comparison interval.

    """

    def __init__(self, key, compare_fn, trigger=(1, "epoch")):
        from chainer import training

        self._key = key
        self._best_value = None
        self._interval_trigger = training.util.get_trigger(trigger)
        self._init_summary()
        self._compare_fn = compare_fn

    def __call__(self, trainer):
        """Accumulate the observed value and compare its mean at each interval.

        Returns True only when the interval trigger fires, a best value is
        already stored and ``compare_fn(best, mean)`` holds; in every other
        interval the mean becomes the new stored best value.
        """
        observation = trainer.observation
        summary = self._summary
        key = self._key
        if key in observation:
            summary.add({key: observation[key]})

        # outside of the comparison interval only accumulation happens
        if not self._interval_trigger(trainer):
            return False

        value = float(summary.compute_mean()[key])  # copy to CPU
        self._init_summary()

        best = self._best_value
        if best is not None and self._compare_fn(best, value):
            return True

        # first measurement, or the comparison did not hold
        self._best_value = value
        return False

    def _init_summary(self):
        import chainer

        self._summary = chainer.reporter.DictSummary()


try:
    from chainer.training import extension
except ImportError:
    PlotAttentionReport = None
else:

    class PlotAttentionReport(extension.Extension):
        """Plot attention reporter.

        Args:
            att_vis_fn (espnet.nets.*_backend.e2e_asr.E2E.calculate_all_attentions):
                Function of attention visualization.
            data (list[tuple(str, dict[str, list[Any]])]): List json utt key items.
            outdir (str): Directory to save figures.
            converter (espnet.asr.*_backend.asr.CustomConverter):
                Function to convert data.
            device (int | torch.device): Device.
            reverse (bool): If True, input and output length are reversed.
            ikey (str): Key to access input
                (for ASR/ST ikey="input", for MT ikey="output".)
            iaxis (int): Dimension to access input
                (for ASR/ST iaxis=0, for MT iaxis=1.)
            okey (str): Key to access output
                (for ASR/ST okey="input", MT okay="output".)
            oaxis (int): Dimension to access output
                (for ASR/ST oaxis=0, for MT oaxis=0.)
            subsampling_factor (int): subsampling factor in encoder

        """

        def __init__(
            self,
            att_vis_fn,
            data,
            outdir,
            converter,
            transform,
            device,
            reverse=False,
            ikey="input",
            iaxis=0,
            okey="output",
            oaxis=0,
            subsampling_factor=1,
        ):
            self.att_vis_fn = att_vis_fn
            self.data = copy.deepcopy(data)
            self.data_dict = {k: v for k, v in copy.deepcopy(data)}
            # key is utterance ID
            self.outdir = outdir
            self.converter = converter
            self.transform = transform
            self.device = device
            self.reverse = reverse
            self.ikey = ikey
            self.iaxis = iaxis
            self.okey = okey
            self.oaxis = oaxis
            self.factor = subsampling_factor
            if not os.path.exists(self.outdir):
                os.makedirs(self.outdir)

        def __call__(self, trainer):
            """Plot and save image file of att_ws matrix."""
            att_ws, uttid_list = self.get_attention_weights()
            if isinstance(att_ws, list):  # multi-encoder case
                num_encs = len(att_ws) - 1
                # atts
                for i in range(num_encs):
                    for idx, att_w in enumerate(att_ws[i]):
                        filename = "%s/%s.ep.{.updater.epoch}.att%d.png" % (
                            self.outdir,
                            uttid_list[idx],
                            i + 1,
                        )
                        att_w = self.trim_attention_weight(uttid_list[idx], att_w)
                        np_filename = "%s/%s.ep.{.updater.epoch}.att%d.npy" % (
                            self.outdir,
                            uttid_list[idx],
                            i + 1,
                        )
                        np.save(np_filename.format(trainer), att_w)
                        self._plot_and_save_attention(att_w, filename.format(trainer))
                # han
                for idx, att_w in enumerate(att_ws[num_encs]):
                    filename = "%s/%s.ep.{.updater.epoch}.han.png" % (
                        self.outdir,
                        uttid_list[idx],
                    )
                    att_w = self.trim_attention_weight(uttid_list[idx], att_w)
                    np_filename = "%s/%s.ep.{.updater.epoch}.han.npy" % (
                        self.outdir,
                        uttid_list[idx],
                    )
                    np.save(np_filename.format(trainer), att_w)
                    self._plot_and_save_attention(
                        att_w, filename.format(trainer), han_mode=True
                    )
            else:
                for idx, att_w in enumerate(att_ws):
                    filename = "%s/%s.ep.{.updater.epoch}.png" % (
                        self.outdir,
                        uttid_list[idx],
                    )
                    att_w = self.trim_attention_weight(uttid_list[idx], att_w)
                    np_filename = "%s/%s.ep.{.updater.epoch}.npy" % (
                        self.outdir,
                        uttid_list[idx],
                    )
                    np.save(np_filename.format(trainer), att_w)
                    self._plot_and_save_attention(att_w, filename.format(trainer))

        def log_attentions(self, logger, step):
            """Add image files of att_ws matrix to the tensorboard."""
            att_ws, uttid_list = self.get_attention_weights()
            if isinstance(att_ws, list):  # multi-encoder case
                num_encs = len(att_ws) - 1
                # atts
                for i in range(num_encs):
                    for idx, att_w in enumerate(att_ws[i]):
                        att_w = self.trim_attention_weight(uttid_list[idx], att_w)
                        plot = self.draw_attention_plot(att_w)
                        logger.add_figure(
                            "%s_att%d" % (uttid_list[idx], i + 1),
                            plot.gcf(),
                            step,
                        )
                # han
                for idx, att_w in enumerate(att_ws[num_encs]):
                    att_w = self.trim_attention_weight(uttid_list[idx], att_w)
                    plot = self.draw_han_plot(att_w)
                    logger.add_figure(
                        "%s_han" % (uttid_list[idx]),
                        plot.gcf(),
                        step,
                    )
            else:
                for idx, att_w in enumerate(att_ws):
                    att_w = self.trim_attention_weight(uttid_list[idx], att_w)
                    plot = self.draw_attention_plot(att_w)
                    logger.add_figure("%s" % (uttid_list[idx]), plot.gcf(), step)

        def get_attention_weights(self):
            """Return attention weights.

            Returns:
                numpy.ndarray: attention weights. float. Its shape would be
                    differ from backend.
                    * pytorch-> 1) multi-head case => (B, H, Lmax, Tmax), 2)
                      other case => (B, Lmax, Tmax).
                    * chainer-> (B, Lmax, Tmax)

            """
            return_batch, uttid_list = self.transform(self.data, return_uttid=True)
            batch = self.converter([return_batch], self.device)
            if isinstance(batch, tuple):
                att_ws = self.att_vis_fn(*batch)
            else:
                att_ws = self.att_vis_fn(**batch)
            return att_ws, uttid_list

        def trim_attention_weight(self, uttid, att_w):
            """Transform attention matrix with regard to self.reverse."""
            if self.reverse:
                enc_key, enc_axis = self.okey, self.oaxis
                dec_key, dec_axis = self.ikey, self.iaxis
            else:
                enc_key, enc_axis = self.ikey, self.iaxis
                dec_key, dec_axis = self.okey, self.oaxis
            dec_len = int(self.data_dict[uttid][dec_key][dec_axis]["shape"][0])
            enc_len = int(self.data_dict[uttid][enc_key][enc_axis]["shape"][0])
            if self.factor > 1:
                enc_len //= self.factor
            if len(att_w.shape) == 3:
                att_w = att_w[:, :dec_len, :enc_len]
            else:
                att_w = att_w[:dec_len, :enc_len]
            return att_w

        def draw_attention_plot(self, att_w):
            """Plot the att_w matrix.

            Returns:
                matplotlib.pyplot: pyplot object with attention matrix image.

            """
            import matplotlib

            matplotlib.use("Agg")
            import matplotlib.pyplot as plt

            plt.clf()
            att_w = att_w.astype(np.float32)
            if len(att_w.shape) == 3:
                for h, aw in enumerate(att_w, 1):
                    plt.subplot(1, len(att_w), h)
                    plt.imshow(aw, aspect="auto")
                    plt.xlabel("Encoder Index")
                    plt.ylabel("Decoder Index")
            else:
                plt.imshow(att_w, aspect="auto")
                plt.xlabel("Encoder Index")
                plt.ylabel("Decoder Index")
            plt.tight_layout()
            return plt

        def draw_han_plot(self, att_w):
            """Draw hierarchical-attention weights as line curves.

            Each column of a weight matrix is plotted as one curve over the
            row index and labelled ``Att<column>`` in the legend. A 3-D input
            yields one subplot per entry along its first axis.

            Returns:
                matplotlib.pyplot: pyplot object with attention matrix image.

            """
            import matplotlib

            matplotlib.use("Agg")
            import matplotlib.pyplot as plt

            def _draw_curves(weights):
                # One curve per column, then the shared axis decoration.
                labels = []
                for column in range(weights.shape[1]):
                    plt.plot(weights[:, column])
                    labels.append("Att{}".format(column))
                plt.ylim([0, 1.0])
                plt.xlim([0, weights.shape[0]])
                plt.grid(True)
                plt.ylabel("Attention Weight")
                plt.xlabel("Decoder Index")
                plt.legend(labels)

            plt.clf()
            if len(att_w.shape) != 3:
                _draw_curves(att_w)
            else:
                n_panels = len(att_w)
                for position, panel in enumerate(att_w, 1):
                    plt.subplot(1, n_panels, position)
                    _draw_curves(panel)
            plt.tight_layout()
            return plt

        def _plot_and_save_attention(self, att_w, filename, han_mode=False):
            """Draw ``att_w`` and write the figure to ``filename``.

            ``han_mode`` selects the hierarchical-attention line plot instead
            of the heat map. The figure is closed after saving.
            """
            draw = self.draw_han_plot if han_mode else self.draw_attention_plot
            canvas = draw(att_w)
            canvas.savefig(filename)
            canvas.close()


try:
    from chainer.training import extension
except ImportError:
    PlotCTCReport = None
else:

    class PlotCTCReport(extension.Extension):
        """Plot CTC reporter.

        Args:
            ctc_vis_fn (espnet.nets.*_backend.e2e_asr.E2E.calculate_all_ctc_probs):
                Function of CTC visualization.
            data (list[tuple(str, dict[str, list[Any]])]): List json utt key items.
            outdir (str): Directory to save figures.
            converter (espnet.asr.*_backend.asr.CustomConverter):
                Function to convert data.
            transform (callable): Called as ``transform(data, return_uttid=True)``
                and expected to return ``(batch, uttid_list)``.
            device (int | torch.device): Device.
            reverse (bool): If True, input and output length are reversed.
                Stored only; no method of this class reads it.
            ikey (str): Key to access input
                (for ASR/ST ikey="input", for MT ikey="output".)
            iaxis (int): Dimension to access input
                (for ASR/ST iaxis=0, for MT iaxis=1.)
            okey (str): Key to access output.
                Stored only; no method of this class reads it.
            oaxis (int): Dimension to access output.
                Stored only; no method of this class reads it.
            subsampling_factor (int): subsampling factor in encoder

        """

        def __init__(
            self,
            ctc_vis_fn,
            data,
            outdir,
            converter,
            transform,
            device,
            reverse=False,
            ikey="input",
            iaxis=0,
            okey="output",
            oaxis=0,
            subsampling_factor=1,
        ):
            self.ctc_vis_fn = ctc_vis_fn
            self.data = copy.deepcopy(data)
            self.data_dict = {k: v for k, v in copy.deepcopy(data)}
            # key is utterance ID
            self.outdir = outdir
            self.converter = converter
            self.transform = transform
            self.device = device
            self.reverse = reverse
            self.ikey = ikey
            self.iaxis = iaxis
            self.okey = okey
            self.oaxis = oaxis
            self.factor = subsampling_factor
            # exist_ok avoids the check-then-create race: with several
            # training processes starting at once, another process may create
            # the directory between an exists() test and makedirs().
            os.makedirs(self.outdir, exist_ok=True)

        def __call__(self, trainer):
            """Plot and save image file of ctc prob.

            For every utterance the trimmed posteriors are written both as a
            ``.npy`` array and as a ``.png`` figure under ``outdir``; the
            current epoch is substituted into the file name from ``trainer``.
            """
            ctc_probs, uttid_list = self.get_ctc_probs()
            if isinstance(ctc_probs, list):  # multi-encoder case
                # The last list entry is not plotted.
                num_encs = len(ctc_probs) - 1
                for i in range(num_encs):
                    for idx, ctc_prob in enumerate(ctc_probs[i]):
                        filename = "%s/%s.ep.{.updater.epoch}.ctc%d.png" % (
                            self.outdir,
                            uttid_list[idx],
                            i + 1,
                        )
                        ctc_prob = self.trim_ctc_prob(uttid_list[idx], ctc_prob)
                        np_filename = "%s/%s.ep.{.updater.epoch}.ctc%d.npy" % (
                            self.outdir,
                            uttid_list[idx],
                            i + 1,
                        )
                        np.save(np_filename.format(trainer), ctc_prob)
                        self._plot_and_save_ctc(ctc_prob, filename.format(trainer))
            else:
                for idx, ctc_prob in enumerate(ctc_probs):
                    filename = "%s/%s.ep.{.updater.epoch}.png" % (
                        self.outdir,
                        uttid_list[idx],
                    )
                    ctc_prob = self.trim_ctc_prob(uttid_list[idx], ctc_prob)
                    np_filename = "%s/%s.ep.{.updater.epoch}.npy" % (
                        self.outdir,
                        uttid_list[idx],
                    )
                    np.save(np_filename.format(trainer), ctc_prob)
                    self._plot_and_save_ctc(ctc_prob, filename.format(trainer))

        def log_ctc_probs(self, logger, step):
            """Add image files of ctc probs to the tensorboard.

            Args:
                logger: Object providing ``add_figure(tag, figure, step)``.
                step (int): Step value passed through to ``add_figure``.

            """
            ctc_probs, uttid_list = self.get_ctc_probs()
            if isinstance(ctc_probs, list):  # multi-encoder case
                # The last list entry is not plotted.
                num_encs = len(ctc_probs) - 1
                for i in range(num_encs):
                    for idx, ctc_prob in enumerate(ctc_probs[i]):
                        ctc_prob = self.trim_ctc_prob(uttid_list[idx], ctc_prob)
                        plot = self.draw_ctc_plot(ctc_prob)
                        logger.add_figure(
                            "%s_ctc%d" % (uttid_list[idx], i + 1),
                            plot.gcf(),
                            step,
                        )
            else:
                for idx, ctc_prob in enumerate(ctc_probs):
                    ctc_prob = self.trim_ctc_prob(uttid_list[idx], ctc_prob)
                    plot = self.draw_ctc_plot(ctc_prob)
                    logger.add_figure("%s" % (uttid_list[idx]), plot.gcf(), step)

        def get_ctc_probs(self):
            """Return CTC probs together with the utterance IDs.

            Returns:
                tuple: ``(probs, uttid_list)`` where ``probs`` is whatever
                    ``ctc_vis_fn`` returns for the converted batch (callers
                    here treat a ``list`` as the multi-encoder case) and
                    ``uttid_list`` is the ID list produced by ``transform``.

            """
            return_batch, uttid_list = self.transform(self.data, return_uttid=True)
            batch = self.converter([return_batch], self.device)
            # The converter may yield positional (tuple) or keyword arguments.
            if isinstance(batch, tuple):
                probs = self.ctc_vis_fn(*batch)
            else:
                probs = self.ctc_vis_fn(**batch)
            return probs, uttid_list

        def trim_ctc_prob(self, uttid, prob):
            """Trim CTC posteriors according to input lengths.

            The input length is read from the utterance's json entry and
            integer-divided by the subsampling factor when that factor is
            greater than one.
            """
            enc_len = int(self.data_dict[uttid][self.ikey][self.iaxis]["shape"][0])
            if self.factor > 1:
                enc_len //= self.factor
            prob = prob[:enc_len]
            return prob

        def draw_ctc_plot(self, ctc_prob):
            """Plot the ctc_prob matrix.

            Args:
                ctc_prob: 2-D array unpacked as ``(n_frames, vocab)``; one
                    curve is drawn per vocabulary index.

            Returns:
                matplotlib.pyplot: pyplot object with CTC prob matrix image.

            """
            import matplotlib

            matplotlib.use("Agg")
            import matplotlib.pyplot as plt

            ctc_prob = ctc_prob.astype(np.float32)

            plt.clf()
            # NOTE(review): argsort returns every index of axis 1 for each
            # frame, so the set built below is not restricted to a top-k.
            topk_ids = np.argsort(ctc_prob, axis=1)
            n_frames, vocab = ctc_prob.shape
            times_probs = np.arange(n_frames)

            plt.figure(figsize=(20, 8))

            # NOTE: index 0 is reserved for blank
            for idx in set(topk_ids.reshape(-1).tolist()):
                if idx == 0:
                    plt.plot(
                        times_probs, ctc_prob[:, 0], ":", label="<blank>", color="grey"
                    )
                else:
                    plt.plot(times_probs, ctc_prob[:, idx])
            plt.xlabel(u"Input [frame]", fontsize=12)
            plt.ylabel("Posteriors", fontsize=12)
            plt.xticks(list(range(0, int(n_frames) + 1, 10)))
            plt.yticks(list(range(0, 2, 1)))
            plt.tight_layout()
            return plt

        def _plot_and_save_ctc(self, ctc_prob, filename):
            """Draw ``ctc_prob``, save the figure to ``filename`` and close it."""
            plt = self.draw_ctc_plot(ctc_prob)
            plt.savefig(filename)
            plt.close()


def restore_snapshot(model, snapshot, load_fn=None):
    """Build a per-epoch extension that reloads ``snapshot`` into ``model``.

    Args:
        model: Object handed to the loader as the load target.
        snapshot: Snapshot location handed to the loader.
        load_fn (callable): Loader called as ``load_fn(snapshot, model)``;
            ``chainer.serializers.load_npz`` is used when omitted.

    Returns:
        An extension function.

    """
    import chainer
    from chainer import training

    loader = chainer.serializers.load_npz if load_fn is None else load_fn

    @training.make_extension(trigger=(1, "epoch"))
    def restore_snapshot(trainer):
        _restore_snapshot(model, snapshot, loader)

    return restore_snapshot


def _restore_snapshot(model, snapshot, load_fn=None):
    """Load ``snapshot`` into ``model`` and log where it came from.

    ``load_fn`` is called as ``load_fn(snapshot, model)``; when it is omitted
    ``chainer.serializers.load_npz`` is used.
    """
    loader = load_fn
    if loader is None:
        import chainer

        loader = chainer.serializers.load_npz

    loader(snapshot, model)
    logging.info("restored from " + str(snapshot))


def adadelta_eps_decay(eps_decay):
    """Build a per-epoch extension that decays the AdaDelta ``eps``.

    Args:
        eps_decay (float): Decay rate of eps.

    Returns:
        An extension function.

    """
    from chainer import training

    once_per_epoch = training.make_extension(trigger=(1, "epoch"))

    @once_per_epoch
    def adadelta_eps_decay(trainer):
        _adadelta_eps_decay(trainer, eps_decay)

    return adadelta_eps_decay


def _adadelta_eps_decay(trainer, eps_decay):
    """Multiply the "main" optimizer's ``eps`` by ``eps_decay`` and log it.

    An optimizer exposing an ``eps`` attribute is updated directly (chainer);
    otherwise every entry of ``param_groups`` is updated (pytorch).
    """
    opt = trainer.updater.get_optimizer("main")

    if not hasattr(opt, "eps"):
        # pytorch
        for group in opt.param_groups:
            group["eps"] *= eps_decay
            logging.info("adadelta eps decayed to " + str(group["eps"]))
        return

    # for chainer
    opt.eps = opt.eps * eps_decay
    logging.info("adadelta eps decayed to " + str(opt.eps))


def adam_lr_decay(eps_decay):
    """Build a per-epoch extension that decays the Adam learning rate.

    Args:
        eps_decay (float): Decay rate of lr.

    Returns:
        An extension function.

    """
    from chainer import training

    once_per_epoch = training.make_extension(trigger=(1, "epoch"))

    @once_per_epoch
    def adam_lr_decay(trainer):
        _adam_lr_decay(trainer, eps_decay)

    return adam_lr_decay


def _adam_lr_decay(trainer, eps_decay):
    """Multiply the "main" optimizer's ``lr`` by ``eps_decay`` and log it.

    An optimizer exposing an ``lr`` attribute is updated directly (chainer);
    otherwise every entry of ``param_groups`` is updated (pytorch).
    """
    opt = trainer.updater.get_optimizer("main")

    if not hasattr(opt, "lr"):
        # pytorch
        for group in opt.param_groups:
            group["lr"] *= eps_decay
            logging.info("adam lr decayed to " + str(group["lr"]))
        return

    # for chainer
    opt.lr = opt.lr * eps_decay
    logging.info("adam lr decayed to " + str(opt.lr))


def torch_snapshot(savefun=torch.save, filename="snapshot.ep.{.updater.epoch}"):
    """Build a per-epoch extension that snapshots the trainer (pytorch).

    Args:
        savefun (callable): Called as ``savefun(snapshot_dict, path)``.
        filename (str): Format string; the trainer is passed to
            ``str.format`` to obtain the snapshot file name.

    Returns:
        An extension function.

    """
    from chainer.training import extension

    @extension.make_extension(trigger=(1, "epoch"), priority=-100)
    def torch_snapshot(trainer):
        resolved_name = filename.format(trainer)
        _torch_snapshot_object(trainer, trainer, resolved_name, savefun)

    return torch_snapshot


def _torch_snapshot_object(trainer, target, filename, savefun):
    """Write trainer, model and optimizer states to ``trainer.out``.

    The snapshot is first saved into a temporary directory created inside
    ``trainer.out`` and then moved to its final name; the temporary directory
    is removed afterwards whether or not saving succeeded. ``target`` is not
    used.
    """
    from chainer.serializers import DictionarySerializer

    # make snapshot_dict dictionary
    serializer = DictionarySerializer()
    serializer.save(trainer)

    updater = trainer.updater

    # Unwrap to the object that owns the weights: an inner ``model``
    # attribute (for TTS) first, then a ``module`` wrapper attribute.
    net = updater.model
    if hasattr(net, "model"):
        net = net.model
    if hasattr(net, "module"):
        net = net.module

    snapshot_dict = {
        "trainer": serializer.target,
        "model": net.state_dict(),
    }

    if hasattr(updater, "ddp_trainer"):
        # For ASR
        optimizer = updater.ddp_trainer.optimizer
    else:
        # Others like LM
        optimizer = updater.get_optimizer("main")
    snapshot_dict["optimizer"] = optimizer.state_dict()

    # save snapshot dictionary
    fn = filename.format(trainer)
    tmpdir = tempfile.mkdtemp(prefix="tmp" + fn, dir=trainer.out)
    tmppath = os.path.join(tmpdir, fn)
    try:
        savefun(snapshot_dict, tmppath)
        shutil.move(tmppath, os.path.join(trainer.out, fn))
    finally:
        shutil.rmtree(tmpdir)


def add_gradient_noise(model, iteration, duration=100, eta=1.0, scale_factor=0.55):
    """Add scaled standard-normal noise to every existing gradient in place.

    The noise scale is ``sigma = eta / interval ** scale_factor`` with
    ``interval = iteration // duration + 1``, so it shrinks as training
    proceeds. Parameters without a gradient are left untouched.

    Args:
        model (torch.nn.model): Model.
        iteration (int): Number of iterations.
        duration (int) {100, 1000}:
            Number of durations to control the interval of the `sigma` change.
        eta (float) {0.01, 0.3, 1.0}: The magnitude of `sigma`.
        scale_factor (float) {0.55}: The scale of `sigma`.
    """
    interval = 1 + iteration // duration
    sigma = eta / interval ** scale_factor
    for param in model.parameters():
        grad = param.grad
        if grad is None:
            continue
        # Sampled with the default generator, then moved to the parameter's
        # device.
        perturbation = torch.randn(grad.size()).to(param.device)
        grad.add_(sigma * perturbation)


# * -------------------- general -------------------- *
def get_model_conf(model_path, conf_path=None):
    """Get model config information by reading a model config file (model.json).

    Args:
        model_path (str): Model path. When ``conf_path`` is not given, the
            config is read from ``model.json`` in the same directory.
        conf_path (str): Optional model config path.

    Returns:
        argparse.Namespace: When the json file holds a dict (for lm).
        tuple(int, int, argparse.Namespace): ``(idim, odim, args)`` when the
            json file holds a three-element sequence (for asr, tts, mt).

    """
    if conf_path is None:
        # os.path.join keeps a bare file name (empty dirname) relative to the
        # current directory; plain concatenation with "/model.json" would
        # point at the filesystem root instead.
        model_conf = os.path.join(os.path.dirname(model_path), "model.json")
    else:
        model_conf = conf_path
    with open(model_conf, "rb") as f:
        logging.info("reading a config file from " + model_conf)
        confs = json.load(f)
    if isinstance(confs, dict):
        # for lm
        args = confs
        return argparse.Namespace(**args)
    else:
        # for asr, tts, mt
        idim, odim, args = confs
        return idim, odim, argparse.Namespace(**args)


def chainer_load(path, model):
    """Load chainer model parameters from an npz file.

    A file whose base name contains "snapshot" is read from the
    ``updater/model:main/`` sub-path of the archive; any other file is read
    from the archive root.

    Args:
        path (str): Model path or snapshot file path to be loaded.
        model (chainer.Chain): Chainer model.

    """
    import chainer

    is_snapshot = "snapshot" in os.path.basename(path)
    if not is_snapshot:
        chainer.serializers.load_npz(path, model)
        return
    chainer.serializers.load_npz(path, model, path="updater/model:main/")


def torch_save(path, model):
    """Save torch model states.

    When ``model`` has a ``module`` attribute, the state dict of that inner
    module is saved instead of the wrapper's.

    Args:
        path (str): Model path to be saved.
        model (torch.nn.Module): Torch model.

    """
    source = model.module if hasattr(model, "module") else model
    torch.save(source.state_dict(), path)


def snapshot_object(target, filename):
    """Returns a trainer extension to take snapshots of a given object.

    Args:
        target (model): Object to serialize.
        filename (str): Name of the file into which the object is serialized.
            It can be a format string, where the trainer object is passed to
            the :meth: `str.format` method. For example,
            ``'snapshot_{.updater.iteration}'`` is converted to
            ``'snapshot_10000'`` at the 10,000th iteration.

    Returns:
        An extension function.

    """
    from chainer.training import extension

    @extension.make_extension(trigger=(1, "epoch"), priority=-100)
    def snapshot_object(trainer):
        destination = os.path.join(trainer.out, filename.format(trainer))
        torch_save(destination, target)

    return snapshot_object


def torch_load(path, model):
    """Load torch model states.

    A file whose base name contains "snapshot" is treated as a snapshot
    dictionary and its "model" entry is used; any other file is used as the
    state dict itself. The states go into ``model.module`` when ``model`` has
    that attribute, otherwise into ``model``.

    Args:
        path (str): Model path or snapshot file path to be loaded.
        model (torch.nn.Module): Torch model.

    """
    # Keep every storage where it was deserialized.
    loaded = torch.load(path, map_location=lambda storage, loc: storage)
    if "snapshot" in os.path.basename(path):
        loaded = loaded["model"]

    receiver = model.module if hasattr(model, "module") else model
    receiver.load_state_dict(loaded)


def torch_resume(snapshot_path, trainer, load_trainer_and_opt=True):
    """Resume from snapshot for pytorch.

    Model weights are always restored. Trainer and optimizer states are
    restored only when ``load_trainer_and_opt`` is True. The optimizer is
    looked up the same way the snapshot writer finds it:
    ``updater.ddp_trainer.optimizer`` when the updater has a ``ddp_trainer``,
    otherwise ``updater.get_optimizer("main")``.

    Args:
        snapshot_path (str): Snapshot file path.
        trainer (chainer.training.Trainer): Chainer's trainer instance.
        load_trainer_and_opt (bool): If False, only the model weights are
            loaded and the trainer/optimizer entries of the snapshot are
            ignored.

    """
    if not load_trainer_and_opt:
        print("Only model weights are resumed")
        print("trainer and optimizer is ignored")
        print("make sure this is the second-stage training")

    # load snapshot
    snapshot_dict = torch.load(snapshot_path, map_location=lambda storage, loc: storage)

    # restore trainer states
    if load_trainer_and_opt:
        # chainer is needed only to deserialize the trainer state, so it is
        # imported here rather than on the model-only path.
        from chainer.serializers import NpzDeserializer

        d = NpzDeserializer(snapshot_dict["trainer"])
        d.load(trainer)

    updater = trainer.updater

    # restore model states: unwrap an inner ``model`` attribute (for TTS
    # model) first, then a ``module`` wrapper attribute.
    net = updater.model
    if hasattr(net, "model"):
        net = net.model
    if hasattr(net, "module"):
        net = net.module
    net.load_state_dict(snapshot_dict["model"])

    # restore optimizer states
    if load_trainer_and_opt:
        if hasattr(updater, "ddp_trainer"):
            # For ASR
            if hasattr(updater.ddp_trainer, "optimizer"):
                updater.ddp_trainer.optimizer.load_state_dict(
                    snapshot_dict["optimizer"]
                )
        else:
            # Others like LM: such updaters have no ddp_trainer, and touching
            # it unconditionally raised AttributeError.
            updater.get_optimizer("main").load_state_dict(snapshot_dict["optimizer"])

    # delete opened snapshot
    del snapshot_dict


# * ------------------ recognition related ------------------ *
def parse_hypothesis(hyp, char_list):
    """Turn one hypothesis into text, token and token-id strings plus score.

    Args:
        hyp (dict[str, Any]): Recognition hypothesis with "yseq" (token ids,
            the first of which is dropped as sos) and "score".
        char_list (list[str]): List of characters.

    Returns:
        tuple(str, str, str, float): ``(text, token, tokenid, score)`` where
            ``token`` and ``tokenid`` are space-joined and ``text`` is the
            tokens concatenated with "<space>" replaced by a blank.

    """
    # remove sos and get results
    ids = [int(raw) for raw in hyp["yseq"][1:]]
    symbols = [char_list[i] for i in ids]
    score = float(hyp["score"])

    # convert to string
    tokenid = " ".join(map(str, ids))
    token = " ".join(symbols)
    text = "".join(symbols).replace("<space>", " ")

    return text, token, tokenid, score


def add_results_to_json(js, nbest_hyps, char_list):
    """Add N-best results to json.

    Args:
        js (dict[str, Any]): Groundtruth utterance dict.
        nbest_hyps (list[dict[str, Any]]): N-best hypotheses, best first.
        char_list (list[str]): List of characters.

    Returns:
        dict[str, Any]: N-best results added utterance dict.

    """
    # copy old json info
    new_js = {"utt2spk": js["utt2spk"], "output": []}

    for rank, hyp in enumerate(nbest_hyps, 1):
        # parse hypothesis
        rec_text, rec_token, rec_tokenid, score = parse_hypothesis(hyp, char_list)

        if len(js["output"]) == 0:
            # for no reference case (e.g., speech translation)
            entry = {"name": ""}
        else:
            # copy ground-truth
            entry = dict(js["output"][0])

        # update name
        entry["name"] += "[%d]" % rank

        # add recognition results
        entry["rec_text"] = rec_text
        entry["rec_token"] = rec_token
        entry["rec_tokenid"] = rec_tokenid
        entry["score"] = score

        # RNNT MMI
        if "mmi_tot_score" in hyp:
            entry["mmi_tot_score"] = hyp["mmi_tot_score"]

        # LASCTC MMI
        if "scores" in hyp:
            detail = hyp["scores"]
            for key in ("mmi_tot_score", "mmi"):
                if key in detail:
                    entry[key] = detail[key]

        # add to list of N-best result dicts
        new_js["output"].append(entry)

        # show 1-best result
        if rank == 1:
            if "text" in entry:
                logging.info("groundtruth: %s" % entry["text"])
            logging.info("prediction : %s" % entry["rec_text"])

    return new_js


def plot_spectrogram(
    plt,
    spec,
    mode="db",
    fs=None,
    frame_shift=None,
    bottom=True,
    left=True,
    right=True,
    top=False,
    labelbottom=True,
    labelleft=True,
    labelright=True,
    labeltop=False,
    cmap="inferno",
):
    """Plot spectrogram using matplotlib.

    The magnitude of ``spec`` is drawn with its first axis flipped, either as
    ``20 * log10`` of the magnitude ("db") or unchanged ("linear").

    Args:
        plt (matplotlib.pyplot): pyplot object.
        spec (numpy.ndarray): Input stft (Freq, Time)
        mode (str): db or linear.
        fs (int): Sample frequency. To convert y-axis to kHz unit.
        frame_shift (int): The frame shift of stft. To convert x-axis to second
            unit (only applied when ``fs`` is given as well).
        bottom (bool): Whether to draw the respective ticks.
        left (bool):
        right (bool):
        top (bool):
        labelbottom (bool): Whether to draw the respective tick labels.
        labelleft (bool):
        labelright (bool):
        labeltop (bool):
        cmap (str): Colormap defined in matplotlib.

    Raises:
        ValueError: If ``mode`` is neither "db" nor "linear".

    """
    magnitude = np.abs(spec)
    if mode == "linear":
        x = magnitude
    elif mode == "db":
        # eps keeps log10 finite for zero-magnitude entries.
        x = 20 * np.log10(magnitude + np.finfo(magnitude.dtype).eps)
    else:
        raise ValueError(mode)

    # Vertical axis: kHz when the sample rate is known, else bin index.
    if fs is None:
        ytop, ylabel = x.shape[0], "bin"
    else:
        ytop, ylabel = fs / 2000, "kHz"

    # Horizontal axis: seconds need both frame shift and sample rate.
    if frame_shift is None or fs is None:
        xtop, xlabel = x.shape[1], "frame"
    else:
        xtop, xlabel = x.shape[1] * frame_shift / fs, "s"

    plt.imshow(x[::-1], cmap=cmap, extent=(0, xtop, 0, ytop))

    if labelbottom:
        plt.xlabel("time [{}]".format(xlabel))
    if labelleft:
        plt.ylabel("freq [{}]".format(ylabel))
    colorbar = plt.colorbar()
    colorbar.set_label("{}".format(mode))

    tick_options = dict(
        bottom=bottom,
        left=left,
        right=right,
        top=top,
        labelbottom=labelbottom,
        labelleft=labelleft,
        labelright=labelright,
        labeltop=labeltop,
    )
    plt.tick_params(**tick_options)
    plt.axis("auto")

# * ------------------ multi-encoder related ------------------ *
def format_mulenc_args(args):
    """Format args for multi-encoder setup.

    Every encoder/attention option listed below is turned into a list with one
    entry per encoder, modifying ``args`` in place and returning it.
    It deals with following situations:  (when args.num_encs=2):
    1. args.elayers = None -> args.elayers = [4, 4];
    2. args.elayers = 4 -> args.elayers = [4, 4];
    3. args.elayers = [4, 4, 4] -> args.elayers = [4, 4].

    """
    # default values when None is assigned.
    default_dict = {
        "etype": "blstmp",
        "elayers": 4,
        "eunits": 300,
        "subsample": "1",
        "dropout_rate": 0.0,
        "atype": "dot",
        "adim": 320,
        "awin": 5,
        "aheads": 4,
        "aconv_chans": -1,
        "aconv_filts": 100,
    }
    namespace = vars(args)  # live view: item assignment updates ``args``
    num_encs = args.num_encs
    for k, fallback in default_dict.items():
        current = namespace[k]

        if isinstance(current, list):
            truncated = current[:num_encs]
            if len(current) != num_encs:
                logging.warning(
                    "Length mismatch {}: Convert {} to {}.".format(
                        k, current, truncated
                    )
                )
            namespace[k] = truncated
            continue

        if not current:
            # assign default value if it is None
            current = fallback
            logging.warning(
                "{} is not specified, use default value {}.".format(k, fallback)
            )

        # duplicate
        replicated = [current for _ in range(num_encs)]
        logging.warning(
            "Type mismatch {}: Convert {} to {}.".format(k, current, replicated)
        )
        namespace[k] = replicated
    return args


================================================
FILE: asr/chainer_backend/__init__.py
================================================
"""Initialize sub package."""


================================================
FILE: asr/chainer_backend/asr.py
================================================
# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Training/decoding definition for the speech recognition task."""

import json
import logging
import os
import six

# chainer related
import chainer

from chainer import training

from chainer.datasets import TransformDataset
from chainer.training import extensions

# espnet related
from espnet.asr.asr_utils import adadelta_eps_decay
from espnet.asr.asr_utils import add_results_to_json
from espnet.asr.asr_utils import chainer_load
from espnet.asr.asr_utils import CompareValueTrigger
from espnet.asr.asr_utils import get_model_conf
from espnet.asr.asr_utils import restore_snapshot
from espnet.nets.asr_interface import ASRInterface
from espnet.utils.deterministic_utils import set_deterministic_chainer
from espnet.utils.dynamic_import import dynamic_import
from espnet.utils.io_utils import LoadInputsAndTargets
from espnet.utils.training.batchfy import make_batchset
from espnet.utils.training.evaluator import BaseEvaluator
from espnet.utils.training.iterators import ShufflingEnabler
from espnet.utils.training.iterators import ToggleableShufflingMultiprocessIterator
from espnet.utils.training.iterators import ToggleableShufflingSerialIterator
from espnet.utils.training.train_utils import check_early_stop
from espnet.utils.training.train_utils import set_early_stop

# rnnlm
import espnet.lm.chainer_backend.extlm as extlm_chainer
import espnet.lm.chainer_backend.lm as lm_chainer

# numpy related
import matplotlib

from espnet.utils.training.tensorboard_logger import TensorboardLogger
from tensorboardX import SummaryWriter

matplotlib.use("Agg")


def train(args):
    """Train with the given args.

    Args:
        args (namespace): The program arguments.

    """
    # display chainer version
    logging.info("chainer version = " + chainer.__version__)

    set_deterministic_chainer(args)

    # check cuda and cudnn availability
    if not chainer.cuda.available:
        logging.warning("cuda is not available")
    if not chainer.cuda.cudnn_enabled:
        logging.warning("cudnn is not available")

    # get input and output dimension info
    with open(args.valid_json, "rb") as f:
        valid_json = json.load(f)["utts"]
    utts = list(valid_json.keys())
    idim = int(valid_json[utts[0]]["input"][0]["shape"][1])
    odim = int(valid_json[utts[0]]["output"][0]["shape"][1])
    logging.info("#input dims : " + str(idim))
    logging.info("#output dims: " + str(odim))

    # specify attention, CTC, hybrid mode
    if args.mtlalpha == 1.0:
        mtl_mode = "ctc"
        logging.info("Pure CTC mode")
    elif args.mtlalpha == 0.0:
        mtl_mode = "att"
        logging.info("Pure attention mode")
    else:
        mtl_mode = "mtl"
        logging.info("Multitask learning mode")

    # specify model architecture
    logging.info("import model module: " + args.model_module)
    model_class = dynamic_import(args.model_module)
    model = model_class(idim, odim, args, flag_return=False)
    assert isinstance(model, ASRInterface)
    total_subsampling_factor = model.get_total_subsampling_factor()

    # write model config
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)
    model_conf = args.outdir + "/model.json"
    with open(model_conf, "wb") as f:
        logging.info("writing a model config file to " + model_conf)
        f.write(
            json.dumps(
                (idim, odim, vars(args)), indent=4, ensure_ascii=False, sort_keys=True
            ).encode("utf_8")
        )
    for key in sorted(vars(args).keys()):
        logging.info("ARGS: " + key + ": " + str(vars(args)[key]))

    # Set gpu
    ngpu = args.ngpu
    if ngpu == 1:
        gpu_id = 0
        # Make a specified GPU current
        chainer.cuda.get_device_from_id(gpu_id).use()
        model.to_gpu()  # Copy the model to the GPU
        logging.info("single gpu calculation.")
    elif ngpu > 1:
        gpu_id = 0
        devices = {"main": gpu_id}
        for gid in six.moves.xrange(1, ngpu):
            devices["sub_%d" % gid] = gid
        logging.info("multi gpu calculation (#gpus = %d)." % ngpu)
        logging.warning(
            "batch size is automatically increased (%d -> %d)"
            % (args.batch_size, args.batch_size * args.ngpu)
        )
    else:
        gpu_id = -1
        logging.info("cpu calculation")

    # Setup an optimizer
    if args.opt == "adadelta":
        optimizer = chainer.optimizers.AdaDelta(eps=args.eps)
    elif args.opt == "adam":
        optimizer = chainer.optimizers.Adam()
    elif args.opt == "noam":
        optimizer = chainer.optimizers.Adam(alpha=0, beta1=0.9, beta2=0.98, eps=1e-9)
    else:
        raise NotImplementedError("args.opt={}".format(args.opt))

    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(args.grad_clip))

    # Setup a converter
    converter = model.custom_converter(subsampling_factor=model.subsample[0])

    # read json data
    with open(args.train_json, "rb") as f:
        train_json = json.load(f)["utts"]
    with open(args.valid_json, "rb") as f:
        valid_json = json.load(f)["utts"]

    # set up training iterator and updater
    load_tr = LoadInputsAndTargets(
        mode="asr",
        load_output=True,
        preprocess_conf=args.preprocess_conf,
        preprocess_args={"train": True},  # Switch the mode of preprocessing
    )
    load_cv = LoadInputsAndTargets(
        mode="asr",
        load_output=True,
        preprocess_conf=args.preprocess_conf,
        preprocess_args={"train": False},  # Switch the mode of preprocessing
    )

    use_sortagrad = args.sortagrad == -1 or args.sortagrad > 0
    accum_grad = args.accum_grad
    if ngpu <= 1:
        # make minibatch list (variable length)
        train = make_batchset(
            train_json,
            args.batch_size,
            args.maxlen_in,
            args.maxlen_out,
            args.minibatches,
            min_batch_size=args.ngpu if args.ngpu > 1 else 1,
            shortest_first=use_sortagrad,
            count=args.batch_count,
            batch_bins=args.batch_bins,
            batch_frames_in=args.batch_frames_in,
            batch_frames_out=args.batch_frames_out,
            batch_frames_inout=args.batch_frames_inout,
            iaxis=0,
            oaxis=0,
        )
        # hack to make batchsize argument as 1
        # actual batchsize is included in a list
        if args.n_iter_processes > 0:
            train_iters = [
                ToggleableShufflingMultiprocessIterator(
                    TransformDataset(train, load_tr),
                    batch_size=1,
                    n_processes=args.n_iter_processes,
                    n_prefetch=8,
                    maxtasksperchild=20,
                    shuffle=not use_sortagrad,
                )
            ]
        else:
            train_iters = [
                ToggleableShufflingSerialIterator(
                    TransformDataset(train, load_tr),
                    batch_size=1,
                    shuffle=not use_sortagrad,
                )
            ]

        # set up updater
        updater = model.custom_updater(
            train_iters[0],
            optimizer,
            converter=converter,
            device=gpu_id,
            accum_grad=accum_grad,
        )
    else:
        if args.batch_count not in ("auto", "seq") and args.batch_size == 0:
            raise NotImplementedError(
                "--batch-count 'bin' and 'frame' are not implemented "
                "in chainer multi gpu"
            )
        # set up minibatches
        train_subsets = []
        for gid in six.moves.xrange(ngpu):
            # make subset
            train_json_subset = {
                k: v for i, (k, v) in enumerate(train_json.items()) if i % ngpu == gid
            }
            # make minibatch list (variable length)
            train_subsets += [
                make_batchset(
                    train_json_subset,
                    args.batch_size,
                    args.maxlen_in,
                    args.maxlen_out,
                    args.minibatches,
                )
            ]

        # each subset must have same length for MultiprocessParallelUpdater
        maxlen = max([len(train_subset) for train_subset in train_subsets])
        for train_subset in train_subsets:
            if maxlen != len(train_subset):
                for i in six.moves.xrange(maxlen - len(train_subset)):
                    train_subset += [train_subset[i]]

        # hack to make batchsize argument as 1
        # actual batchsize is included in a list
        if args.n_iter_processes > 0:
            train_iters = [
                ToggleableShufflingMultiprocessIterator(
                    TransformDataset(train_subsets[gid], load_tr),
                    batch_size=1,
                    n_processes=args.n_iter_processes,
                    n_prefetch=8,
                    maxtasksperchild=20,
                    shuffle=not use_sortagrad,
                )
                for gid in six.moves.xrange(ngpu)
            ]
        else:
            train_iters = [
                ToggleableShufflingSerialIterator(
                    TransformDataset(train_subsets[gid], load_tr),
                    batch_size=1,
                    shuffle=not use_sortagrad,
                )
                for gid in six.moves.xrange(ngpu)
            ]

        # set up updater
        updater = model.custom_parallel_updater(
            train_iters, optimizer, converter=converter, devices=devices
        )

    # Set up a trainer
    trainer = training.Trainer(updater, (args.epochs, "epoch"), out=args.outdir)

    if use_sortagrad:
        trainer.extend(
            ShufflingEnabler(train_iters),
            trigger=(args.sortagrad if args.sortagrad != -1 else args.epochs, "epoch"),
        )
    if args.opt == "noam":
        from espnet.nets.chainer_backend.transformer.training import VaswaniRule

        trainer.extend(
            VaswaniRule(
                "alpha",
                d=args.adim,
                warmup_steps=args.transformer_warmup_steps,
                scale=args.transformer_lr,
            ),
            trigger=(1, "iteration"),
        )
    # Resume from a snapshot
    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    # set up validation iterator
    valid = make_batchset(
        valid_json,
        args.batch_size,
        args.maxlen_in,
        args.maxlen_out,
        args.minibatches,
        min_batch_size=args.ngpu if args.ngpu > 1 else 1,
        count=args.batch_count,
        batch_bins=args.batch_bins,
        batch_frames_in=args.batch_frames_in,
        batch_frames_out=args.batch_frames_out,
        batch_frames_inout=args.batch_frames_inout,
        iaxis=0,
        oaxis=0,
    )

    if args.n_iter_processes > 0:
        valid_iter = chainer.iterators.MultiprocessIterator(
            TransformDataset(valid, load_cv),
            batch_size=1,
            repeat=False,
            shuffle=False,
            n_processes=args.n_iter_processes,
            n_prefetch=8,
            maxtasksperchild=20,
        )
    else:
        valid_iter = chainer.iterators.SerialIterator(
            TransformDataset(valid, load_cv), batch_size=1, repeat=False, shuffle=False
        )

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(BaseEvaluator(valid_iter, model, converter=converter, device=gpu_id))

    # Save attention weight each epoch
    if args.num_save_attention > 0 and args.mtlalpha != 1.0:
        data = sorted(
            list(valid_json.items())[: args.num_save_attention],
            key=lambda x: int(x[1]["input"][0]["shape"][1]),
            reverse=True,
        )
        if hasattr(model, "module"):
            att_vis_fn = model.module.calculate_all_attentions
            plot_class = model.module.attention_plot_class
        else:
            att_vis_fn = model.calculate_all_attentions
            plot_class = model.attention_plot_class
        logging.info("Using custom PlotAttentionReport")
        att_reporter = plot_class(
            att_vis_fn,
            data,
            args.outdir + "/att_ws",
            converter=converter,
            transform=load_cv,
            device=gpu_id,
            subsampling_factor=total_subsampling_factor,
        )
        trainer.extend(att_reporter, trigger=(1, "epoch"))
    else:
        att_reporter = None

    # Take a snapshot for each specified epoch
    trainer.extend(
        extensions.snapshot(filename="snapshot.ep.{.updater.epoch}"),
        trigger=(1, "epoch"),
    )

    # Make a plot for training and validation values
    trainer.extend(
        extensions.PlotReport(
            [
                "main/loss",
                "validation/main/loss",
                "main/loss_ctc",
                "validation/main/loss_ctc",
                "main/loss_att",
                "validation/main/loss_att",
            ],
            "epoch",
            file_name="loss.png",
        )
    )
    trainer.extend(
        extensions.PlotReport(
            ["main/acc", "validation/main/acc"], "epoch", file_name="acc.png"
        )
    )

    # Save best models
    trainer.extend(
        extensions.snapshot_object(model, "model.loss.best"),
        trigger=training.triggers.MinValueTrigger("validation/main/loss"),
    )
    if mtl_mode != "ctc":
        trainer.extend(
            extensions.snapshot_object(model, "model.acc.best"),
            trigger=training.triggers.MaxValueTrigger("validation/main/acc"),
        )

    # epsilon decay in the optimizer
    if args.opt == "adadelta":
        if args.criterion == "acc" and mtl_mode != "ctc":
            trainer.extend(
                restore_snapshot(model, args.outdir + "/model.acc.best"),
                trigger=CompareValueTrigger(
                    "validation/main/acc",
                    lambda best_value, current_value: best_value > current_value,
                ),
            )
            trainer.extend(
                adadelta_eps_decay(args.eps_decay),
                trigger=CompareValueTrigger(
                    "validation/main/acc",
                    lambda best_value, current_value: best_value > current_value,
                ),
            )
        elif args.criterion == "loss":
            trainer.extend(
                restore_snapshot(model, args.outdir + "/model.loss.best"),
                trigger=CompareValueTrigger(
                    "validation/main/loss",
                    lambda best_value, current_value: best_value < current_value,
                ),
            )
            trainer.extend(
                adadelta_eps_decay(args.eps_decay),
                trigger=CompareValueTrigger(
                    "validation/main/loss",
                    lambda best_value, current_value: best_value < current_value,
                ),
            )

    # Write a log of evaluation statistics for each epoch
    trainer.extend(
        extensions.LogReport(trigger=(args.report_interval_iters, "iteration"))
    )
    report_keys = [
        "epoch",
        "iteration",
        "main/loss",
        "main/loss_ctc",
        "main/loss_att",
        "validation/main/loss",
        "validation/main/loss_ctc",
        "validation/main/loss_att",
        "main/acc",
        "validation/main/acc",
        "elapsed_time",
    ]
    if args.opt == "adadelta":
        trainer.extend(
            extensions.observe_value(
                "eps", lambda trainer: trainer.updater.get_optimizer("main").eps
            ),
            trigger=(args.report_interval_iters, "iteration"),
        )
        report_keys.append("eps")
    trainer.extend(
        extensions.PrintReport(report_keys),
        trigger=(args.report_interval_iters, "iteration"),
    )

    trainer.extend(extensions.ProgressBar(update_interval=args.report_interval_iters))

    set_early_stop(trainer, args)
    if args.tensorboard_dir is not None and args.tensorboard_dir != "":
        writer = SummaryWriter(args.tensorboard_dir)
        trainer.extend(
            TensorboardLogger(writer, att_reporter),
            trigger=(args.report_interval_iters, "iteration"),
        )

    # Run the training
    trainer.run()
    check_early_stop(trainer, args.epochs)


def recog(args):
    """Decode with the given args.

    Rebuilds the acoustic model described by ``args.model`` /
    ``args.model_conf``, optionally attaches a character-level and/or a
    word-level RNNLM, decodes every utterance listed in ``args.recog_json``
    and writes the n-best results as JSON to ``args.result_label``.

    Args:
        args (namespace): The program arguments.

    """
    # display chainer version
    logging.info("chainer version = " + chainer.__version__)

    set_deterministic_chainer(args)

    # read training config
    idim, odim, train_args = get_model_conf(args.model, args.model_conf)

    for key in sorted(vars(args).keys()):
        logging.info("ARGS: " + key + ": " + str(vars(args)[key]))

    # specify model architecture
    logging.info("reading model parameters from " + args.model)
    # To be compatible with v.0.3.0 models
    if hasattr(train_args, "model_module"):
        model_module = train_args.model_module
    else:
        model_module = "espnet.nets.chainer_backend.e2e_asr:E2E"
    model_class = dynamic_import(model_module)
    model = model_class(idim, odim, train_args)
    assert isinstance(model, ASRInterface)
    chainer_load(args.model, model)

    # read rnnlm (character level)
    if args.rnnlm:
        rnnlm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
        rnnlm = lm_chainer.ClassifierWithState(
            lm_chainer.RNNLM(
                len(train_args.char_list), rnnlm_args.layer, rnnlm_args.unit
            )
        )
        chainer_load(args.rnnlm, rnnlm)
    else:
        rnnlm = None

    # read word-level rnnlm; it is combined with the character LM when both
    # are given, otherwise it is wrapped as a look-ahead word LM
    if args.word_rnnlm:
        rnnlm_args = get_model_conf(args.word_rnnlm, args.word_rnnlm_conf)
        word_dict = rnnlm_args.char_list_dict
        char_dict = {x: i for i, x in enumerate(train_args.char_list)}
        word_rnnlm = lm_chainer.ClassifierWithState(
            lm_chainer.RNNLM(len(word_dict), rnnlm_args.layer, rnnlm_args.unit)
        )
        chainer_load(args.word_rnnlm, word_rnnlm)

        if rnnlm is not None:
            rnnlm = lm_chainer.ClassifierWithState(
                extlm_chainer.MultiLevelLM(
                    word_rnnlm.predictor, rnnlm.predictor, word_dict, char_dict
                )
            )
        else:
            rnnlm = lm_chainer.ClassifierWithState(
                extlm_chainer.LookAheadWordLM(
                    word_rnnlm.predictor, word_dict, char_dict
                )
            )

    # read json data
    with open(args.recog_json, "rb") as f:
        js = json.load(f)["utts"]

    # the decoding-time preprocess config overrides the training one if given
    load_inputs_and_targets = LoadInputsAndTargets(
        mode="asr",
        load_output=False,
        sort_in_input_length=False,
        preprocess_conf=train_args.preprocess_conf
        if args.preprocess_conf is None
        else args.preprocess_conf,
        preprocess_args={"train": False},  # Switch the mode of preprocessing
    )

    # decode each utterance
    new_js = {}
    num_utts = len(js)
    with chainer.no_backprop_mode():
        for idx, name in enumerate(js.keys(), 1):
            # NOTE: the utterance id is passed as an argument rather than
            # concatenated into the format string, so ids containing "%"
            # cannot corrupt the log record.
            logging.info("(%d/%d) decoding %s", idx, num_utts, name)
            batch = [(name, js[name])]
            feat = load_inputs_and_targets(batch)[0][0]
            nbest_hyps = model.recognize(feat, args, train_args.char_list, rnnlm)
            new_js[name] = add_results_to_json(
                js[name], nbest_hyps, train_args.char_list
            )

    with open(args.result_label, "wb") as f:
        f.write(
            json.dumps(
                {"utts": new_js}, indent=4, ensure_ascii=False, sort_keys=True
            ).encode("utf_8")
        )


================================================
FILE: asr/pytorch_backend/__init__.py
================================================
"""Initialize sub package."""


================================================
FILE: asr/pytorch_backend/asr.py
================================================
# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Training/decoding definition for the speech recognition task."""

import copy
import json
import logging
import math
import os
import sys

from chainer import reporter as reporter_module
from chainer import training
from chainer.training import extensions
from chainer.training.updater import StandardUpdater
import numpy as np
import torch
import torch.distributed as dist
import time

from espnet.asr.asr_utils import adadelta_eps_decay
from espnet.asr.asr_utils import add_results_to_json
from espnet.asr.asr_utils import CompareValueTrigger
from espnet.asr.asr_utils import format_mulenc_args
from espnet.asr.asr_utils import get_model_conf
from espnet.asr.asr_utils import plot_spectrogram
from espnet.asr.asr_utils import restore_snapshot
from espnet.asr.asr_utils import snapshot_object
from espnet.asr.asr_utils import torch_load
from espnet.asr.asr_utils import torch_resume
from espnet.asr.asr_utils import torch_snapshot
from espnet.asr.pytorch_backend.asr_init import freeze_modules
from espnet.asr.pytorch_backend.asr_init import load_trained_model
from espnet.asr.pytorch_backend.asr_init import load_trained_modules
import espnet.lm.pytorch_backend.extlm as extlm_pytorch
from espnet.nets.asr_interface import ASRInterface
from espnet.nets.beam_search_transducer import BeamSearchTransducer
from espnet.nets.pytorch_backend.e2e_asr import pad_list
import espnet.nets.pytorch_backend.lm.default as lm_pytorch
from espnet.nets.pytorch_backend.streaming.segment import SegmentStreamingE2E
from espnet.nets.pytorch_backend.streaming.window import WindowStreamingE2E
from espnet.transform.spectrogram import IStft
from espnet.transform.transformation import Transformation
from espnet.utils.cli_writers import file_writer_helper
from espnet.utils.dataset import ChainerDataLoader
from espnet.utils.dataset import TransformDataset
from espnet.utils.deterministic_utils import set_deterministic_pytorch
from espnet.utils.dynamic_import import dynamic_import
from espnet.utils.io_utils import LoadInputsAndTargets
from espnet.utils.training.batchfy import make_batchset
from espnet.utils.training.evaluator import BaseEvaluator
from espnet.utils.training.iterators import ShufflingEnabler
from espnet.utils.training.tensorboard_logger import TensorboardLogger
from espnet.utils.training.train_utils import check_early_stop
from espnet.utils.training.train_utils import set_early_stop
from espnet.snowfall.warpper.k2_decode import k2_decode
import matplotlib

from espnet.utils.parse_decoding_process import plot_decoding_logs
from espnet.utils.bmuf import BlockAdamTrainer
matplotlib.use("Agg")

if sys.version_info[0] == 2:
    from itertools import izip_longest as zip_longest
else:
    from itertools import zip_longest as zip_longest

from espnet.nets.scorers.mmi_rnnt_scorer import MMIRNNTScorer
# from espnet.nets.scorers.mmi_alignment_score import MMIRNNTScorer
from espnet.utils.print import step_print
from espnet.utils.sampler import BufferSampler
from espnet.utils.rtf_calculator import RTF_calculator
from espnet.nets.lm_interface import dynamic_import_lm


def _recursive_to(xs, device):
    if torch.is_tensor(xs):
        return xs.to(device)
    if isinstance(xs, tuple):
        return tuple(_recursive_to(x, device) for x in xs)
    return xs

def is_alphabet(char):
    """Return True if ``char`` lies in the ASCII range A-Z or a-z."""
    in_upper = "\u0041" <= char <= "\u005a"
    in_lower = "\u0061" <= char <= "\u007a"
    return in_upper or in_lower

class CustomEvaluator(BaseEvaluator):
    """Custom Evaluator for Pytorch.

    Args:
        model (torch.nn.Module): The model to evaluate.
        iterator (chainer.dataset.Iterator) : The train iterator.

        target (link | dict[str, link]) :Link object or a dictionary of
            links to evaluate. If this is just a link object, the link is
            registered by the name ``'main'``.

        device (torch.device): The device used.
        ngpu (int): The number of GPUs. When omitted it is derived from
            ``device``: 0 for a CPU device, 1 otherwise.

    """

    def __init__(self, model, iterator, target, device, ngpu=None):
        super(CustomEvaluator, self).__init__(iterator, target)
        self.model = model
        self.device = device
        if ngpu is not None:
            self.ngpu = ngpu
        elif device.type == "cpu":
            self.ngpu = 0
        else:
            self.ngpu = 1

    # The core part of the update routine can be customized by overriding
    def evaluate(self):
        """Main evaluate routine for CustomEvaluator.

        Runs the model over the ``"main"`` iterator in eval mode without
        gradient tracking, collects whatever the model reports inside the
        chainer report scope, and returns the mean of those observations.
        The model is switched back to train mode afterwards.
        """
        iterator = self._iterators["main"]

        if self.eval_hook:
            self.eval_hook(self)

        # Reuse the iterator when it can be rewound, otherwise evaluate on a
        # shallow copy so the original iterator state is left untouched.
        if hasattr(iterator, "reset"):
            iterator.reset()
            it = iterator
        else:
            it = copy.copy(iterator)

        summary = reporter_module.DictSummary()

        self.model.eval()
        with torch.no_grad():
            for batch in it:
                # Routed through logging (not print) so that validation does
                # not flood stdout with one line per batch.
                logging.debug("evaluation batch")
                x = _recursive_to(batch, self.device)
                observation = {}
                with reporter_module.report_scope(observation):
                    # read scp files
                    # x: original json with loaded features
                    #    will be converted to chainer variable later
                    # NOTE: the model is called directly on CPU and GPU alike;
                    # torch.nn.DataParallel is not used (apex does not
                    # support it), so ``self.ngpu`` does not affect the call.
                    self.model(*x)
                summary.add(observation)
        self.model.train()

        return summary.compute_mean()


class CustomUpdater(StandardUpdater):
    """Custom Updater for Pytorch.

    Args:
        model (torch.nn.Module): The model to update.
        grad_clip_threshold (float): The gradient clipping value to use.
        train_iter (chainer.dataset.Iterator): The training iterator.
        optimizer (torch.optim.optimizer): The training optimizer.

        device (torch.device): The device to use.
        ngpu (int): The number of gpus to use.
        grad_noise (bool): Whether to inject gradient noise after backward.
        accum_grad (int): Number of forward/backward passes accumulated
            before one parameter update.
        use_apex (bool): The flag to use Apex in backprop.
        ddp_trainer: Optional object exposing ``update_and_sync()`` and an
            ``optimizer`` attribute. When given, parameter updates are
            delegated to it; when ``None``, ``optimizer`` is stepped directly.

    """

    def __init__(
        self,
        model,
        grad_clip_threshold,
        train_iter,
        optimizer,
        device,
        ngpu,
        grad_noise=False,
        accum_grad=1,
        use_apex=False,
        ddp_trainer=None
    ):
        super(CustomUpdater, self).__init__(train_iter, optimizer)
        self.model = model
        self.grad_clip_threshold = grad_clip_threshold
        self.device = device
        self.ngpu = ngpu
        self.accum_grad = accum_grad
        self.forward_count = 0
        self.grad_noise = grad_noise
        self.iteration = 0
        self.use_apex = use_apex
        self.ddp_trainer = ddp_trainer
        self.optimizer = optimizer

    # The core part of the update routine can be customized by overriding.
    def update_core(self):
        """Main update routine of the CustomUpdater.

        Performs one forward/backward pass. Parameters are only updated once
        ``accum_grad`` passes have been accumulated or a new epoch started.
        """
        # When we pass one iterator and optimizer to StandardUpdater.__init__,
        # they are automatically named 'main'.
        train_iter = self.get_iterator("main")
        optimizer = self.get_optimizer("main")
        epoch = train_iter.epoch

        batch = train_iter.next()

        x = _recursive_to(batch, self.device)
        is_new_epoch = train_iter.epoch != epoch

        if self.ngpu == 0:
            loss = self.model(*x).mean() / self.accum_grad
        else:
            # apex does not support torch.nn.DataParallel, so the model is
            # called directly instead of going through data_parallel.
            loss = self.model(*x) / self.accum_grad
        if self.use_apex:
            from apex import amp

            # NOTE: for a compatibility with noam optimizer
            opt = optimizer.optimizer if hasattr(optimizer, "optimizer") else optimizer
            with amp.scale_loss(loss, opt) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        # gradient noise injection
        if self.grad_noise:
            from espnet.asr.asr_utils import add_gradient_noise

            add_gradient_noise(
                self.model, self.iteration, duration=100, eta=1.0, scale_factor=0.55
            )

        # update parameters
        self.forward_count += 1
        if not is_new_epoch and self.forward_count != self.accum_grad:
            return
        self.forward_count = 0
        # compute the gradient norm to check if it is normal or not
        grad_norm = torch.nn.utils.clip_grad_norm_(
            self.model.parameters(), self.grad_clip_threshold
        )
        logging.info("on device {} grad norm={}".format(self.device, grad_norm))
        if math.isnan(grad_norm):
            logging.warning("grad norm is nan. Do not update model.")
            # Drop the corrupted gradients so they are not accumulated further.
            if self.ddp_trainer is not None:
                self.ddp_trainer.optimizer.zero_grad()
            else:
                optimizer.zero_grad()
        elif self.ddp_trainer is not None:
            # With a ddp_trainer the optimizer is never stepped here: the
            # parameter update and the DDP communication both happen inside
            # `update_and_sync()`.
            self.ddp_trainer.update_and_sync()
            step_print(f"| iteration: {self.iteration} | gradient applied")
        else:
            # No ddp_trainer was supplied: fall back to a plain local update
            # instead of failing on a None dereference.
            optimizer.step()
            optimizer.zero_grad()
            step_print(f"| iteration: {self.iteration} | gradient applied")

    def update(self):
        """Run one update step and advance the iteration counter.

        The counter only advances when ``forward_count`` has been reset to 0,
        i.e. after an actual parameter-update attempt.
        """
        self.update_core()
        # #iterations with accum_grad > 1
        # Ref.: https://github.com/espnet/espnet/issues/777
        if self.forward_count == 0:
            self.iteration += 1


class CustomConverter(object):
    """Custom batch converter for Pytorch.

    Args:
        subsampling_factor (int): The subsampling factor.
        dtype (torch.dtype): Data type to convert.

    """

    def __init__(self, subsampling_factor=1, dtype=torch.float32):
        """Construct a CustomConverter object."""
        self.dtype = dtype
        self.ignore_id = -1
        self.subsampling_factor = subsampling_factor

    def __call__(self, batch, device=torch.device("cpu")):
        """Transform a batch and send it to a device.

        Args:
            batch (list): The batch to transform; must hold exactly one
                ``(xs, ys, texts, xs_orig)`` entry.
            device (torch.device): The device to send to.

        Returns:
            tuple: ``(xs_pad, ilens, ys_pad, texts, xs_pad_orig)`` where
                ``xs_pad`` is a padded tensor (or a ``{"real", "imag"}`` dict
                of padded tensors for complex input), ``ilens`` holds the
                per-utterance input lengths, ``ys_pad`` the targets padded
                with ``ignore_id``, ``texts`` is passed through unchanged and
                ``xs_pad_orig`` is the padded ``xs_orig``.

        """
        # batch should be located in list
        assert len(batch) == 1
        xs, ys, texts, xs_orig = batch[0]

        def _pad_features(arrays):
            # zero-pad a list of numpy arrays and move the result to `device`
            tensors = [torch.from_numpy(arr).float() for arr in arrays]
            return pad_list(tensors, 0).to(device, dtype=self.dtype)

        # perform subsampling
        factor = self.subsampling_factor
        if factor > 1:
            xs = [feat[::factor, :] for feat in xs]

        # lengths of the (possibly subsampled) input sequences
        lengths = np.array([feat.shape[0] for feat in xs])

        # perform padding and convert to tensor
        # currently only support real number
        if xs[0].dtype.kind == "c":
            # Note(kamo):
            # {'real': ..., 'imag': ...} will be changed to ComplexTensor in E2E.
            # Don't create ComplexTensor and give it E2E here
            # because torch.nn.DataParellel can't handle it.
            real_part = _pad_features([feat.real for feat in xs])
            imag_part = _pad_features([feat.imag for feat in xs])
            xs_pad = {"real": real_part, "imag": imag_part}
        else:
            xs_pad = _pad_features(xs)

        xs_pad_orig = _pad_features(xs_orig)

        ilens = torch.from_numpy(lengths).to(device)

        # NOTE: this is for multi-output (e.g., speech translation)
        targets = []
        for y in ys:
            if isinstance(y, tuple):
                y = np.array(y[0][:])
            targets.append(torch.from_numpy(y).long())
        ys_pad = pad_list(targets, self.ignore_id).to(device)

        return xs_pad, ilens, ys_pad, texts, xs_pad_orig


class CustomConverterMulEnc(object):
    """Custom batch converter for Pytorch in multi-encoder case.

    Args:
        subsamping_factors (list): List of subsampling factors for each
            encoder. (The misspelled parameter name is kept for backward
            compatibility with existing callers.)
        dtype (torch.dtype): Data type to convert.

    """

    def __init__(self, subsamping_factors=(1, 1), dtype=torch.float32):
        """Initialize the converter."""
        factors = list(subsamping_factors)
        # `__call__` reads the correctly spelled attribute; the misspelled one
        # is kept as an alias for any external code that relied on it.
        self.subsampling_factors = factors
        self.subsamping_factors = factors
        self.ignore_id = -1
        self.dtype = dtype
        self.num_encs = len(factors)

    def __call__(self, batch, device=torch.device("cpu")):
        """Transform a batch and send it to a device.

        Args:
            batch (list): The batch to transform; must hold exactly one entry
                whose first ``num_encs`` items are the per-encoder inputs and
                whose last item is the targets.
            device (torch.device): The device to send to.

        Returns:
            tuple( list(torch.Tensor), list(torch.Tensor), torch.Tensor)

        """
        # batch should be located in list
        assert len(batch) == 1
        xs_list = batch[0][: self.num_encs]
        ys = batch[0][-1]

        # perform subsampling (skipped when the factors sum to num_encs,
        # i.e. when every factor is 1)
        if np.sum(self.subsampling_factors) > self.num_encs:
            xs_list = [
                [x[:: self.subsampling_factors[i], :] for x in xs_list[i]]
                for i in range(self.num_encs)
            ]

        # get batch of lengths of input sequences
        ilens_list = [
            np.array([x.shape[0] for x in xs_list[i]]) for i in range(self.num_encs)
        ]

        # perform padding and convert to tensor
        # currently only support real number
        xs_list_pad = [
            pad_list([torch.from_numpy(x).float() for x in xs_list[i]], 0).to(
                device, dtype=self.dtype
            )
            for i in range(self.num_encs)
        ]

        ilens_list = [
            torch.from_numpy(ilens_list[i]).to(device) for i in range(self.num_encs)
        ]
        # NOTE: this is for multi-task learning (e.g., speech translation)
        ys_pad = pad_list(
            [
                torch.from_numpy(np.array(y[0]) if isinstance(y, tuple) else y).long()
                for y in ys
            ],
            self.ignore_id,
        ).to(device)

        return xs_list_pad, ilens_list, ys_pad


def train(args):
    """Train with the given args.

    Builds the model, the training/validation iterators, the
    ``BlockAdamTrainer`` and the chainer ``Trainer`` together with its
    evaluation, plotting, snapshot and reporting extensions, then runs the
    training loop.

    Args:
        args (namespace): The program arguments.

    """
    set_deterministic_pytorch(args)
    if args.num_encs > 1:
        args = format_mulenc_args(args)

    # check cuda availability
    if not torch.cuda.is_available():
        logging.warning("cuda is not available")

    # get input and output dimension info
    with open(args.valid_json, "rb") as f:
        valid_json = json.load(f)["utts"]
    utts = list(valid_json.keys())
    idim_list = [
        int(valid_json[utts[0]]["input"][i]["shape"][-1]) for i in range(args.num_encs)
    ]
    odim = int(valid_json[utts[0]]["output"][0]["shape"][-1])
    for i in range(args.num_encs):
        logging.info("stream{}: input dims : {}".format(i + 1, idim_list[i]))
    logging.info("#output dims: " + str(odim))

    # specify attention, CTC, hybrid mode
    if "transducer" in args.model_module:
        if (
            getattr(args, "etype", False) == "custom"
            or getattr(args, "dtype", False) == "custom"
        ):
            mtl_mode = "custom_transducer"
        else:
            mtl_mode = "transducer"
        logging.info("Pure transducer mode")
    elif args.mtlalpha == 1.0:
        mtl_mode = "ctc"
        logging.info("Pure CTC mode")
    elif args.mtlalpha == 0.0:
        mtl_mode = "att"
        logging.info("Pure attention mode")
    else:
        mtl_mode = "mtl"
        logging.info("Multitask learning mode")

    if (args.enc_init is not None or args.dec_init is not None) and args.num_encs == 1:
        model = load_trained_modules(idim_list[0], odim, args)
    else:
        model_class = dynamic_import(args.model_module)
        model = model_class(
            idim_list[0] if args.num_encs == 1 else idim_list, odim, args
        )
    assert isinstance(model, ASRInterface)
    total_subsampling_factor = model.get_total_subsampling_factor()

    print(model)
    logging.info(
        " Total parameter of the model = "
        + str(sum(p.numel() for p in model.parameters()))
    )

    if args.rnnlm is not None:
        rnnlm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
        rnnlm = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(len(args.char_list), rnnlm_args.layer, rnnlm_args.unit)
        )
        torch_load(args.rnnlm, rnnlm)
        model.rnnlm = rnnlm

    # write model config
    global_rank = args.node_rank * args.node_size + args.local_rank
    args.outdir = args.outdir.replace("RANK", str(global_rank))
    # exist_ok avoids the check-then-create race when several ranks share
    # the same output directory.
    os.makedirs(args.outdir, exist_ok=True)
    model_conf = args.outdir + "/model.json"
    with open(model_conf, "wb") as f:
        logging.info("writing a model config file to " + model_conf)
        f.write(
            json.dumps(
                (idim_list[0] if args.num_encs == 1 else idim_list, odim, vars(args)),
                indent=4,
                ensure_ascii=False,
                sort_keys=True,
            ).encode("utf_8")
        )
    for key in sorted(vars(args).keys()):
        logging.info("ARGS: " + key + ": " + str(vars(args)[key]))

    reporter = model.reporter

    # check the use of multi-gpu
    # NOTE: the assertion further down only admits ngpu of 0 or 1, so with
    # assertions enabled a run entering this branch stops there.
    if args.ngpu > 1:
        if args.batch_size != 0:
            logging.warning(
                "batch size is automatically increased (%d -> %d)"
                % (args.batch_size, args.batch_size * args.ngpu)
            )
            args.batch_size *= args.ngpu
        if args.num_encs > 1:
            # TODO(ruizhili): implement data parallel for multi-encoder setup.
            raise NotImplementedError(
                "Data parallel is not supported for multi-encoder setup."
            )

    # set torch device
    assert args.ngpu in [1, 0]  # this is ddp version
    device = torch.device(f"cuda:{args.local_rank}" if args.ngpu > 0 else "cpu")

    if args.train_dtype in ("float16", "float32", "float64"):
        dtype = getattr(torch, args.train_dtype)
    else:
        dtype = torch.float32
    model = model.to(device=device, dtype=dtype)
    if args.freeze_mods:
        model, model_params = freeze_modules(model, args.freeze_mods)
    else:
        model_params = model.parameters()
    logging.warning(
        "num. model params: {:,} (num. trained: {:,} ({:.1f}%))".format(
            sum(p.numel() for p in model.parameters()),
            sum(p.numel() for p in model.parameters() if p.requires_grad),
            sum(p.numel() for p in model.parameters() if p.requires_grad)
            * 100.0
            / sum(p.numel() for p in model.parameters()),
        )
    )

    # We build the SGD optimizer but never use it.
    # Other code needs this
    # The real optimizer is in ddp_trainer
    optimizer = torch.optim.SGD(model_params, lr=1.0)

    # setup apex.amp
    if args.train_dtype in ("O0", "O1", "O2", "O3"):
        try:
            from apex import amp
        except ImportError as e:
            logging.error(
                f"You need to install apex for --train-dtype {args.train_dtype}. "
                "See https://github.com/NVIDIA/apex#linux"
            )
            raise e
        # ``optimizer`` is always the plain SGD placeholder built above; it has
        # no inner ``.optimizer`` attribute, so it is handed to amp directly
        # whatever ``args.opt`` is (the former "noam" special case raised
        # AttributeError).
        model, optimizer = amp.initialize(
            model, optimizer, opt_level=args.train_dtype
        )
        use_apex = True

        from espnet.nets.pytorch_backend.ctc import CTC

        amp.register_float_function(CTC, "loss_fn")
        amp.init()
        logging.warning("register ctc as float function")
    else:
        use_apex = False

    # FIXME: TOO DIRTY HACK
    setattr(optimizer, "target", reporter)
    setattr(optimizer, "serialize", lambda s: reporter.serialize(s))

    # Setup a converter
    if args.num_encs == 1:
        converter = CustomConverter(subsampling_factor=model.subsample[0], dtype=dtype)
    else:
        converter = CustomConverterMulEnc(
            [i[0] for i in model.subsample_list], dtype=dtype
        )

    # read json data
    # The "RANK" placeholder in the training json path is 1-based.
    args.train_json = args.train_json.replace("RANK", str(global_rank + 1))
    with open(args.train_json, "rb") as f:
        train_json = json.load(f)["utts"]
    with open(args.valid_json, "rb") as f:
        valid_json = json.load(f)["utts"]

    # if use block_load, the utterance must sorted from shortest to longest
    use_sortagrad = args.sortagrad == -1 or args.sortagrad > 0 or args.block_load
    # make minibatch list (variable length)
    # disable the adaptive batch_size to sync DDP training
    # if use frame as the count, we do not set min_batch_size
    train = make_batchset(
        train_json,
        args.batch_size,
        args.maxlen_in,
        args.maxlen_out,
        args.minibatches,
        min_batch_size=args.batch_size if args.batch_size > 0 else 1,
        shortest_first=use_sortagrad,
        count=args.batch_count,
        batch_bins=args.batch_bins,
        batch_frames_in=args.batch_frames_in,
        batch_frames_out=args.batch_frames_out,
        batch_frames_inout=args.batch_frames_inout,
        iaxis=0,
        oaxis=0,
        no_sort=args.block_load,
    )
    valid = make_batchset(
        valid_json,
        args.batch_size * 2,
        args.maxlen_in,
        args.maxlen_out,
        args.minibatches,
        min_batch_size=args.batch_size,
        count=args.batch_count,
        batch_bins=args.batch_bins,
        batch_frames_in=args.batch_frames_in,
        batch_frames_out=args.batch_frames_out,
        batch_frames_inout=args.batch_frames_inout,
        iaxis=0,
        oaxis=0,
    )

    if args.block_load:
        assert args.n_iter_processes <= 1, "never use more than one worker"
        sampler = BufferSampler(
            length=len(train),
            utts_per_ark=args.utts_per_ark,
            batch_size=args.batch_size,
            buf_size=args.block_buffer_size,
            seed=args.seed,
        )
        prefetch_factor = sampler.get_prefetch_factor()
        shuffle = None
    else:
        sampler = None
        prefetch_factor = 20
        shuffle = not use_sortagrad

    load_tr = LoadInputsAndTargets(
        mode="asr",
        load_output=True,
        preprocess_conf=args.preprocess_conf,
        preprocess_args={"train": True},  # Switch the mode of preprocessing
        block_load=args.block_load,
    )
    load_cv = LoadInputsAndTargets(
        mode="asr",
        load_output=True,
        preprocess_conf=args.preprocess_conf,
        preprocess_args={"train": False},  # Switch the mode of preprocessing
    )
    # hack to make batchsize argument as 1
    # actual batchsize is included in a list
    # default collate function converts numpy array to pytorch tensor
    # we used an empty collate function instead which returns list
    train_dataset = TransformDataset(train, lambda data: converter([load_tr(data)]))
    valid_dataset = TransformDataset(valid, lambda data: converter([load_cv(data)]))

    train_iter = ChainerDataLoader(
        dataset=train_dataset,
        batch_size=1,
        num_workers=args.n_iter_processes,
        shuffle=shuffle,
        collate_fn=lambda x: x[0],
        prefetch_factor=prefetch_factor,
        sampler=sampler,
    )
    # prefetch_factor=5,
    valid_iter = ChainerDataLoader(
        dataset=valid_dataset,
        batch_size=1,
        shuffle=False,
        collate_fn=lambda x: x[0],
        num_workers=args.n_iter_processes,
    )

    # Set up a trainer
    ddp_trainer = BlockAdamTrainer(
        args,
        master_node=args.master_node,
        rank=global_rank,
        world_size=args.world_size,
        model=model,
    )

    updater = CustomUpdater(
        model,
        args.grad_clip,
        {"main": train_iter},
        optimizer,
        device,
        args.ngpu,
        args.grad_noise,
        args.accum_grad,
        use_apex=use_apex,
        ddp_trainer=ddp_trainer,
    )
    trainer = training.Trainer(updater, (args.epochs, "epoch"), out=args.outdir)

    if use_sortagrad and args.sortagrad != 0:
        trainer.extend(
            ShufflingEnabler([train_iter]),
            trigger=(args.sortagrad if args.sortagrad != -1 else args.epochs, "epoch"),
        )

    # Resume from a snapshot
    if args.resume:
        logging.info("resumed from %s" % args.resume)
        torch_resume(args.resume, trainer, args.load_trainer_and_opt)

    # Evaluate the model with the test dataset for each epoch
    # (or every ``save_interval_iters`` iterations when that is positive)
    if args.save_interval_iters > 0:
        trainer.extend(
            CustomEvaluator(model, {"main": valid_iter}, reporter, device, args.ngpu),
            trigger=(args.save_interval_iters, "iteration"),
        )
    else:
        trainer.extend(
            CustomEvaluator(model, {"main": valid_iter}, reporter, device, args.ngpu)
        )

    # Save attention weight each epoch
    is_attn_plot = (
        "transformer" in args.model_module
        or "conformer" in args.model_module
        or mtl_mode in ["att", "mtl", "custom_transducer"]
    )

    if args.num_save_attention > 0 and is_attn_plot:
        data = sorted(
            list(valid_json.items())[: args.num_save_attention],
            key=lambda x: int(x[1]["input"][0]["shape"][1]),
            reverse=True,
        )
        if hasattr(model, "module"):
            att_vis_fn = model.module.calculate_all_attentions
            plot_class = model.module.attention_plot_class
        else:
            att_vis_fn = model.calculate_all_attentions
            plot_class = model.attention_plot_class
        att_reporter = plot_class(
            att_vis_fn,
            data,
            args.outdir + "/att_ws",
            converter=converter,
            transform=load_cv,
            device=device,
            subsampling_factor=total_subsampling_factor,
        )
        trainer.extend(att_reporter, trigger=(1, "epoch"))
    else:
        att_reporter = None

    # Save CTC prob at each epoch
    if mtl_mode in ["ctc", "mtl"] and args.num_save_ctc > 0:
        # NOTE: sort it by output lengths
        data = sorted(
            list(valid_json.items())[: args.num_save_ctc],
            key=lambda x: int(x[1]["output"][0]["shape"][0]),
            reverse=True,
        )
        if hasattr(model, "module"):
            ctc_vis_fn = model.module.calculate_all_ctc_probs
            plot_class = model.module.ctc_plot_class
        else:
            ctc_vis_fn = model.calculate_all_ctc_probs
            plot_class = model.ctc_plot_class
        ctc_reporter = plot_class(
            ctc_vis_fn,
            data,
            args.outdir + "/ctc_prob",
            converter=converter,
            transform=load_cv,
            device=device,
            subsampling_factor=total_subsampling_factor,
        )
        trainer.extend(ctc_reporter, trigger=(1, "epoch"))
    else:
        ctc_reporter = None

    # Make a plot for training and validation values
    # The per-encoder key lists exist only in the multi-encoder setup; every
    # use below is guarded by the same ``args.num_encs`` condition.
    if args.num_encs > 1:
        report_keys_loss_ctc = [
            "main/loss_ctc{}".format(i + 1) for i in range(model.num_encs)
        ] + ["validation/main/loss_ctc{}".format(i + 1) for i in range(model.num_encs)]
        report_keys_cer_ctc = [
            "main/cer_ctc{}".format(i + 1) for i in range(model.num_encs)
        ] + ["validation/main/cer_ctc{}".format(i + 1) for i in range(model.num_encs)]

    if hasattr(model, "is_rnnt"):
        trainer.extend(
            extensions.PlotReport(
                [
                    "main/loss",
                    "validation/main/loss",
                    "main/loss_trans",
                    "validation/main/loss_trans",
                    "main/loss_ctc",
                    "validation/main/loss_ctc",
                    "main/loss_lm",
                    "validation/main/loss_lm",
                    "main/loss_aux_trans",
                    "validation/main/loss_aux_trans",
                    "main/loss_aux_symm_kl",
                    "validation/main/loss_aux_symm_kl",
                    "main/loss_mbr",
                    "validation/main/loss_mbr",
                    "main/loss_mmi",
                    "validation/main/loss_mmi",
                    "main/loss_lang",
                    "validation/main/loss_lang",
                    "main/loss_att",
                    "validation/main/loss_att",
                ],
                "epoch",
                file_name="loss.png",
            )
        )
    else:
        trainer.extend(
            extensions.PlotReport(
                [
                    "main/loss",
                    "validation/main/loss",
                    "main/loss_ctc",
                    "validation/main/loss_ctc",
                    "main/loss_att",
                    "validation/main/loss_att",
                    "main/loss_third",
                    "validation/main/loss_third",
                    "main/loss_mbr",
                    "validation/main/loss_mbr",
                ]
                + ([] if args.num_encs == 1 else report_keys_loss_ctc),
                "epoch",
                file_name="loss.png",
            )
        )

    trainer.extend(
        extensions.PlotReport(
            ["main/acc", "validation/main/acc"], "epoch", file_name="acc.png"
        )
    )
    # The CER plot takes the per-encoder CER keys; the per-encoder loss keys
    # are already drawn in loss.png.
    trainer.extend(
        extensions.PlotReport(
            ["main/cer_ctc", "validation/main/cer_ctc"]
            + ([] if args.num_encs == 1 else report_keys_cer_ctc),
            "epoch",
            file_name="cer.png",
        )
    )

    # save the checkpoint only if this is the master GPU
    if global_rank == 0:
        # Save best models
        trainer.extend(
            snapshot_object(model, "model.loss.best"),
            trigger=training.triggers.MinValueTrigger("validation/main/loss"),
        )
        if mtl_mode not in ["ctc", "transducer", "custom_transducer"]:
            trainer.extend(
                snapshot_object(model, "model.acc.best"),
                trigger=training.triggers.MaxValueTrigger("validation/main/acc"),
            )

        # save snapshot which contains model and optimizer states
        if args.save_interval_iters > 0:
            trainer.extend(
                torch_snapshot(filename="snapshot.iter.{.updater.iteration}"),
                trigger=(args.save_interval_iters, "iteration"),
            )

        # save snapshot at every epoch - for model averaging
        trainer.extend(torch_snapshot(), trigger=(1, "epoch"))

    # epsilon decay in the optimizer
    if args.opt == "adadelta":
        if args.criterion == "acc" and mtl_mode != "ctc":
            trainer.extend(
                restore_snapshot(
                    model, args.outdir + "/model.acc.best", load_fn=torch_load
                ),
                trigger=CompareValueTrigger(
                    "validation/main/acc",
                    lambda best_value, current_value: best_value > current_value,
                ),
            )
            trainer.extend(
                adadelta_eps_decay(args.eps_decay),
                trigger=CompareValueTrigger(
                    "validation/main/acc",
                    lambda best_value, current_value: best_value > current_value,
                ),
            )
        elif args.criterion == "loss":
            trainer.extend(
                restore_snapshot(
                    model, args.outdir + "/model.loss.best", load_fn=torch_load
                ),
                trigger=CompareValueTrigger(
                    "validation/main/loss",
                    lambda best_value, current_value: best_value < current_value,
                ),
            )
            trainer.extend(
                adadelta_eps_decay(args.eps_decay),
                trigger=CompareValueTrigger(
                    "validation/main/loss",
                    lambda best_value, current_value: best_value < current_value,
                ),
            )
        # NOTE: In some cases, it may take more than one epoch for the model's loss
        # to escape from a local minimum.
        # Thus, restore_snapshot extension is not used here.
        # see details in https://github.com/espnet/espnet/pull/2171
        elif args.criterion == "loss_eps_decay_only":
            trainer.extend(
                adadelta_eps_decay(args.eps_decay),
                trigger=CompareValueTrigger(
                    "validation/main/loss",
                    lambda best_value, current_value: best_value < current_value,
                ),
            )

    # Write a log of evaluation statistics for each epoch
    trainer.extend(
        extensions.LogReport(trigger=(args.report_interval_iters, "iteration"))
    )

    if hasattr(model, "is_rnnt"):
        report_keys = [
            "epoch",
            "iteration",
            "main/loss",
            "main/loss_trans",
            "main/loss_ctc",
            "main/loss_lm",
            "main/loss_aux_trans",
            "main/loss_aux_symm_kl",
            "main/loss_mbr",
            "main/loss_mmi",
            "main/loss_att",
            "main/loss_lang",
            "validation/main/loss",
            "validation/main/loss_trans",
            "validation/main/loss_ctc",
            "validation/main/loss_lm",
            "validation/main/loss_aux_trans",
            "validation/main/loss_aux_symm_kl",
            "validation/main/loss_mbr",
            "validation/main/loss_mmi",
            "validation/main/loss_att",
            "validation/main/loss_lang",
            "elapsed_time",
        ]
    else:
        report_keys = [
            "epoch",
            "iteration",
            "main/loss",
            "main/loss_ctc",
            "main/loss_att",
            "main/loss_third",
            "main/loss_mbr",
            "validation/main/loss",
            "validation/main/loss_ctc",
            "validation/main/loss_att",
            "validation/main/loss_third",
            "validation/main/loss_mbr",
            "main/acc",
            "validation/main/acc",
            "main/cer_ctc",
            "validation/main/cer_ctc",
            "elapsed_time",
        ] + ([] if args.num_encs == 1 else report_keys_cer_ctc + report_keys_loss_ctc)

    if args.opt == "adadelta":
        trainer.extend(
            extensions.observe_value(
                "eps",
                lambda trainer: trainer.updater.get_optimizer("main").param_groups[0][
                    "eps"
                ],
            ),
            trigger=(args.report_interval_iters, "iteration"),
        )
        report_keys.append("eps")
    if args.report_cer:
        report_keys.append("validation/main/cer")
    if args.report_wer:
        report_keys.append("validation/main/wer")

    # Each rank prints its report to its own file; the ``with`` block closes
    # the file once the training loop returns or raises.
    with open(args.outdir + f"/train.{global_rank}.log", "w") as logwriter:
        trainer.extend(
            extensions.PrintReport(report_keys, out=logwriter),
            trigger=(args.report_interval_iters, "iteration"),
        )

        # trainer.extend(extensions.ProgressBar(update_interval=args.report_interval_iters))
        set_early_stop(trainer, args)

        # Run the training
        trainer.run()
    check_early_stop(trainer, args.epochs)


def recog(args):
    """Decode with the given args.

    Args:
        args (namespace): The program arguments.

    """
    set_deterministic_pytorch(args)

    if args.ngpu == 1:
        gpu_id = args.local_rank - 1
        logging.warning("gpu id: " + str(gpu_id))
        device=torch.device("cuda:{}".format(gpu_id))
    else:
        device=torch.device("cpu")
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1" # disable GPU

    model, train_args = load_trained_model(args.model, training=False)
    assert isinstance(model, ASRInterface)
    model.recog_args = args

    if args.streaming_mode and "transformer" in train_args.model_module:
        raise NotImplementedError("streaming mode for transformer is not implemented")
    logging.info(
        " Total parameter of the model = "
        + str(sum(p.numel() for p in model.parameters()))
    )

    # read rnnlm
    if args.rnnlm and args.lm_weight > 0.0:
        rnnlm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
        if getattr(rnnlm_args, "model_module", "default") == "default":
            rnnlm = lm_pytorch.ClassifierWithState(
                lm_pytorch.RNNLM(
                    len(train_args.char_list),
                    rnnlm_args.layer,
                    rnnlm_args.unit,
                    getattr(rnnlm_args, "embed_unit", None),  # for backward compatibility
                )
            )
        elif getattr(rnnlm_args, "model_module", "default") == "transformer":
            lm_class = dynamic_import_lm("transformer", rnnlm_args.backend)
            rnnlm = lm_class(len(train_args.char_list), rnnlm_args)
        else:
            raise ValueError("Unsupported LM type")

        torch_load(args.rnnlm, rnnlm)
        rnnlm.eval()
    else:
        rnnlm = None

    if args.word_rnnlm:
        rnnlm_args = get_model_conf(args.word_rnnlm, args.word_rnnlm_conf)
        word_dict = rnnlm_args.char_list_dict
        char_dict = {x: i for i, x in enumerate(train_args.char_list)}
        word_rnnlm = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(
                len(word_dict),
                rnnlm_args.layer,
                rnnlm_args.unit,
                getattr(rnnlm_args, "embed_unit", None),  # for backward compatibility
            )
        )
        torch_load(args.word_rnnlm, word_rnnlm)
        word_rnnlm.eval()

        if rnnlm is not None:
            rnnlm = lm_pytorch.ClassifierWithState(
                extlm_pytorch.MultiLevelLM(
                    word_rnnlm.predictor, rnnlm.predictor, word_dict, char_dict
                )
            )
        else:
            rnnlm = lm_pytorch.ClassifierWithState(
                extlm_pytorch.LookAheadWordLM(
                    word_rnnlm.predictor, word_dict, char_dict
                )
            )

    model = model.to(device)
    if rnnlm:
        rnnlm = rnnlm.to(device)

    # read json data
    with open(args.recog_json, "rb") as f:
        js = json.load(f)["utts"]
    new_js = {}

    load_inputs_and_targets = LoadInputsAndTargets(
        mode="asr",
        load_output=True,
        sort_in_input_length=False,
        preprocess_conf=train_args.preprocess_conf
        if args.preprocess_conf is None
        else args.preprocess_conf,
        preprocess_args={"train": False},
    )

    # load transducer beam search
    if hasattr(model, "is_rnnt"):
        if hasattr(model, "dec"):
            trans_decoder = model.dec
        else:
            trans_decoder = model.decoder
        joint_network = model.joint_network
     
        # We only use the MMIRNNTScorer now 
        if train_args.aux_mmi and train_args.aux_mmi_type == "mmi":
            adim = train_args.enc_block_arch[0]['d_hidden']
            weight_path = os.path.dirname(args.result_label) + "/dump" 
            os.makedirs(weight_path, exist_ok=True)
            model.aux_mmi.dump_weight(args.local_rank, weight_path)

            mmi_scorer_module = MMIRNNTScorer
            mmi_scorer = mmi_scorer_module(lang=model.aux_mmi.lang,
                                       device=device,
                                       idim=adim,
                                       sos_id=model.sos,
                                       rank=args.local_rank,
                                       use_segment=args.use_segment,
                                       char_list=train_args.char_list,
                                       weight_path=weight_path,
                                       lookahead=args.mas_lookahead,
                                       ) 
        else:
            mmi_scorer = None

        if args.ngram_model and args.ngram_weight > 0.0:
            print(f"Using ngram model: {args.ngram_model}", flush=True)
            from espnet.nets.scorers.ngram import NgramPartScorer
            ngram_scorer = NgramPartScorer(args.ngram_model, train_args.char_list)
        else:
            ngram_scorer = None

        if args.word_ngram is not None and args.word_ngram_weight > 0.0:
            from espnet.nets.scorers.word_ngram import WordNgramPartialScorer
            word_ngram_scorer = WordNgramPartialScorer
            word_ngram_scorer = word_ngram_scorer(
                                  args.word_ngram, device, train_args.char_list,
                                  log_semiring=args.word_ngram_log_semiring,
                                  lower_char=args.word_ngram_lower_char)
        else:
            word_ngram_scorer = None

        if args.tlg_scorer is not None and args.tlg_weight > 0.0:
            print(f"Using tlg scorer: {args.tlg_scorer}", flush=True)
            from espnet.nets.scorers.tlg_scorer import TlgPartialScorer
            tlg_scorer = TlgPartialScorer(lang=args.tlg_scorer, 
                                          nonblk_reward=args.tlg_nonblk_reward)
        else:
            tlg_scorer = None

        # for code-switch data
        if args.cs_nt_decode_feature in ["chn", "eng"]:
            ctc_module = getattr(model, "aux_ctc", None)
        else:
            ctc_module = getattr(model, "decoder_ctc", None)

        if args.eng_vocab is not None and os.path.isfile(args.eng_vocab):
            eng_vocab = [s.strip() for s in open(args.eng_vocab, encoding="utf-8").readlines()]
        else:
            eng_vocab = None

        beam_search_transducer = BeamSearchTransducer(
            decoder=trans_decoder,
            joint_network=joint_network,
            beam_size=args.beam_size,
            nbest=args.nbest,
            lm=rnnlm,
            lm_weight=args.lm_weight,
            search_type=args.search_type,
            char_list=train_args.char_list,
            max_sym_exp=args.max_sym_exp,
            u_max=args.u_max,
            nstep=args.nstep,
            prefix_alpha=args.prefix_alpha,
            score_norm=args.score_norm,
            mmi_scorer=mmi_scorer,
            mmi_weight=args.mmi_weight,
            ngram_scorer=ngram_scorer,
            ngram_weight=args.ngram_weight,
            word_ngram_scorer=word_ngram_scorer,
            word_ngram_weight=args.word_ngram_weight,
            tlg_scorer=tlg_scorer,
            tlg_weight=args.tlg_weight,
            forbid_eng=args.forbid_eng,
            ctc_module=ctc_module,
            ctc_weight=args.ctc_weight,
            eng_vocab=eng_vocab
        )

    if args.k2_decode:
        k2_decode(model, device, js, load_inputs_and_targets, args.batchsize, args.use_segment)
        print("Finish FST decoding. Abort!")
        return
    
    nbest_dict = {}
    rtf_calculator = RTF_calculator(js)
    rtf_calculator.tik()
    if args.batchsize == 0:
        with torch.no_grad():
            for idx, name in enumerate(js.keys(), 1):
                logging.info("(%d/%d) decoding " + name, idx, len(js.keys()))
                batch = [(name, js[name])]
                feats = load_inputs_and_targets(batch)
                feat = (
                    feats[0][0]
                    if args.num_encs == 1
                    else [feats[idx][0] for idx in range(model.num_encs)]
                )

                # For Oteam ASR Only: skip all transcriptions that have english chars
                text_trans = js[name]["output"][0]["text"]
                if any([is_alphabet(x) for x in text_trans]) and args.skip_eng:
                    continue

                if args.streaming_mode == "window" and args.num_encs == 1:
                    logging.info(
                        "Using streaming recognizer with window size %d frames",
                        args.streaming_window,
                    )
                    se2e = WindowStreamingE2E(e2e=model, recog_args=args, rnnlm=rnnlm)
                    for i in range(0, feat.shape[0], args.streaming_window):
                        logging.info(
                            "Feeding frames %d - %d", i, i + args.streaming_window
                        )
                        se2e.accept_input(feat[i : i + args.streaming_window])
                    logging.info("Running offline attention decoder")
                    se2e.decode_with_attention_offline()
                    logging.info("Offline attention decoder finished")
                    nbest_hyps = se2e.retrieve_recognition()
                elif args.streaming_mode == "segment" and args.num_encs == 1:
                    logging.info(
                        "Using streaming recognizer with threshold value %d",
                        args.streaming_min_blank_dur,
                    )
                    nbest_hyps = []
                    for n in range(args.nbest):
                        nbest_hyps.append({"yseq": [], "score": 0.0})
                    se2e = SegmentStreamingE2E(e2e=model, recog_args=args, rnnlm=rnnlm)
                    r = np.prod(model.subsample)
                    for i in range(0, feat.shape[0], r):
                        hyps = se2e.accept_input(feat[i : i + r])
                        if hyps is not None:
                            text = "".join(
                                [
                                    train_args.char_list[int(x)]
                                    for x in hyps[0]["yseq"][1:-1]
                                    if int(x) != -1
                                ]
                            )
                            text = text.replace(
                                "\u2581", " "
                            ).strip()  # for SentencePiece
                            text = text.replace(model.space, " ")
                            text = text.replace(model.blank, "")
                            logging.info(text)
                            for n in range(args.nbest):
                                nbest_hyps[n]["yseq"].extend(hyps[n]["yseq"])
                                nbest_hyps[n]["score"] += hyps[n]["score"]
                elif hasattr(model, "is_rnnt"):
                    nbest_hyps = model.recognize(feat, beam_search_transducer,
                                                 decode_feature=args.cs_nt_decode_feature)
                else:
                    nbest_hyps = model.recognize(
                        feat, args, train_args.char_list, rnnlm
                    )
                # visualization
                # decode_dir = os.path.dirname(args.result_label)
                # graph_dir = os.path.join(decode_dir, "graph")
                # os.makedirs(graph_dir, exist_ok=True)
                # plot_decoding_logs(graph_dir, train_args.char_list,
                #                    args, name, nbest_hyps)
                nbest_dict[name] = nbest_hyps
                new_js[name] = add_results_to_json(
                    js[name], nbest_hyps, train_args.char_list
                )

    else:

        def grouper(n, iterable, fillvalue=None):
            kargs = [iter(iterable)] * n
            return zip_longest(*kargs, fillvalue=fillvalue)

        # sort data if batchsize > 1
        keys = list(js.keys())
        if args.batchsize > 1:
            feat_lens = [js[key]["input"][0]["shape"][0] for key in keys]
            sorted_index = sorted(range(len(feat_lens)), key=lambda i: -feat_lens[i])
            keys = [keys[i] for i in sorted_index]

        with torch.no_grad():
            for names in grouper(args.batchsize, keys, None):
                names = [name for name in names if name]
                batch = [(name, js[name]) for name in names]
                feats = (
                    load_inputs_and_targets(batch)[0]
                    if args.num_encs == 1
                    else load_inputs_and_targets(batch)
                )
                if args.streaming_mode == "window" and args.num_encs == 1:
                    raise NotImplementedError
                elif args.streaming_mode == "segment" and args.num_encs == 1:
                    if args.batchsize > 1:
                        raise NotImplementedError
                    feat = feats[0]
                    nbest_hyps = []
                    for n in range(args.nbest):
                        nbest_hyps.append({"yseq": [], "score": 0.0})
                    se2e = SegmentStreamingE2E(e2e=model, recog_args=args, rnnlm=rnnlm)
                    r = np.prod(model.subsample)
                    for i in range(0, feat.shape[0], r):
                        hyps = se2e.accept_input(feat[i : i + r])
                        if hyps is not None:
                            text = "".join(
                                [
                                    train_args.char_list[int(x)]
                                    for x in hyps[0]["yseq"][1:-1]
                                    if int(x) != -1
                                ]
                            )
                            text = text.replace(
                                "\u2581", " "
                            ).strip()  # for SentencePiece
                            text = text.replace(model.space, " ")
                            text = text.replace(model.blank, "")
                            logging.info(text)
                            for n in range(args.nbest):
                                nbest_hyps[n]["yseq"].extend(hyps[n]["yseq"])
                                nbest_hyps[n]["score"] += hyps[n]["score"]
                    nbest_hyps = [nbest_hyps]
                else:
                    nbest_hyps = model.recognize_batch(
                        feats, args, train_args.char_list, rnnlm=rnnlm
                    )

                for i, nbest_hyp in enumerate(nbest_hyps):
                    name = names[i]
                    new_js[name] = add_results_to_json(
                        js[name], nbest_hyp, train_args.char_list
                    )

    rtf_calculator.tok()
    with open(args.result_label, "wb") as f:
        f.write(
            json.dumps(
                {"utts": new_js}, indent=4, ensure_ascii=False, sort_keys=True
            ).encode("utf_8")
        )
    

def enhance(args):
    """Dump enhanced speech and, optionally, spectrogram images.

    The trained model named by ``args.model`` is loaded, every utterance
    listed in ``args.recog_json`` is passed through ``model.enhance`` in
    batches (longest utterances first), and the result is

    * plotted (mask / noisy / masked / enhanced) into ``args.image_dir`` for
      at most ``args.num_images`` utterances, and
    * written through ``args.enh_wspecifier`` when that is given, after an
      optional inverse STFT (``args.apply_istft``).

    Args:
        args (namespace): The program arguments.
    """
    set_deterministic_pytorch(args)
    # read training config
    idim, odim, train_args = get_model_conf(args.model, args.model_conf)

    # TODO(ruizhili): implement enhance for multi-encoder model
    assert args.num_encs == 1, "number of encoder should be 1 ({} is given)".format(
        args.num_encs
    )

    # load trained model parameters
    logging.info("reading model parameters from " + args.model)
    model_class = dynamic_import(train_args.model_module)
    model = model_class(idim, odim, train_args)
    assert isinstance(model, ASRInterface)
    torch_load(args.model, model)
    model.recog_args = args

    # gpu
    if args.ngpu == 1:
        gpu_id = list(range(args.ngpu))
        logging.info("gpu id: " + str(gpu_id))
        model.cuda()

    # read json data
    with open(args.recog_json, "rb") as f:
        js = json.load(f)["utts"]

    load_inputs_and_targets = LoadInputsAndTargets(
        mode="asr",
        load_output=False,
        sort_in_input_length=False,
        preprocess_conf=None,  # Apply pre_process in outer func
    )
    if args.batchsize == 0:
        args.batchsize = 1

    # Creates writers for outputs from the network
    if args.enh_wspecifier is not None:
        enh_writer = file_writer_helper(args.enh_wspecifier, filetype=args.enh_filetype)
    else:
        enh_writer = None

    # Creates a Transformation instance
    # (the command line setting takes precedence over the training config)
    preprocess_conf = (
        train_args.preprocess_conf
        if args.preprocess_conf is None
        else args.preprocess_conf
    )
    if preprocess_conf is not None:
        logging.info(f"Use preprocessing: {preprocess_conf}")
        transform = Transformation(preprocess_conf)
    else:
        transform = None

    # Creates a IStft instance
    istft = None
    frame_shift = args.istft_n_shift  # Used for plot the spectrogram
    if args.apply_istft:
        if preprocess_conf is not None:
            # Read the conffile and find stft setting
            with open(preprocess_conf) as f:
                # Json format: e.g.
                #    {"process": [{"type": "stft",
                #                  "win_length": 400,
                #                  "n_fft": 512, "n_shift": 160,
                #                  "window": "han"},
                #                 {"type": "foo", ...}, ...]}
                conf = json.load(f)
                assert "process" in conf, conf
                # Find stft setting
                for p in conf["process"]:
                    if p["type"] == "stft":
                        istft = IStft(
                            win_length=p["win_length"],
                            n_shift=p["n_shift"],
                            window=p.get("window", "hann"),
                        )
                        logging.info(
                            "stft is found in {}. "
                            "Setting istft config from it\n{}".format(
                                preprocess_conf, istft
                            )
                        )
                        frame_shift = p["n_shift"]
                        break
        if istft is None:
            # Set from command line arguments
            istft = IStft(
                win_length=args.istft_win_length,
                n_shift=args.istft_n_shift,
                window=args.istft_window,
            )
            logging.info(
                "Setting istft config from the command line args\n{}".format(istft)
            )

    # sort data (longest input first)
    keys = list(js.keys())
    feat_lens = [js[key]["input"][0]["shape"][0] for key in keys]
    sorted_index = sorted(range(len(feat_lens)), key=lambda i: -feat_lens[i])
    keys = [keys[i] for i in sorted_index]

    def grouper(n, iterable, fillvalue=None):
        # Chunk ``iterable`` into tuples of ``n``; the last tuple is padded
        # with ``fillvalue`` when the length is not a multiple of ``n``.
        kargs = [iter(iterable)] * n
        return zip_longest(*kargs, fillvalue=fillvalue)

    num_images = 0
    # image_dir may be None (plotting disabled); only create it when given
    if args.image_dir is not None:
        os.makedirs(args.image_dir, exist_ok=True)

    for names in grouper(args.batchsize, keys, None):
        # Drop the None padding that grouper appends to the final group,
        # otherwise js[None] raises KeyError.
        names = [name for name in names if name is not None]
        batch = [(name, js[name]) for name in names]

        # May be in time region: (Batch, [Time, Channel])
        org_feats = load_inputs_and_targets(batch)[0]
        if transform is not None:
            # May be in time-freq region: : (Batch, [Time, Channel, Freq])
            feats = transform(org_feats, train=False)
        else:
            feats = org_feats

        with torch.no_grad():
            enhanced, mask, ilens = model.enhance(feats)

        for idx, name in enumerate(names):
            # Assuming mask, feats : [Batch, Time, Channel. Freq]
            #          enhanced    : [Batch, Time, Freq]
            enh = enhanced[idx][: ilens[idx]]
            mas = mask[idx][: ilens[idx]]
            feat = feats[idx]

            # Plot spectrogram
            if args.image_dir is not None and num_images < args.num_images:
                import matplotlib.pyplot as plt

                num_images += 1
                ref_ch = 0

                plt.figure(figsize=(20, 10))
                plt.subplot(4, 1, 1)
                plt.title("Mask [ref={}ch]".format(ref_ch))
                plot_spectrogram(
                    plt,
                    mas[:, ref_ch].T,
                    fs=args.fs,
                    mode="linear",
                    frame_shift=frame_shift,
                    bottom=False,
                    labelbottom=False,
                )

                plt.subplot(4, 1, 2)
                plt.title("Noisy speech [ref={}ch]".format(ref_ch))
                plot_spectrogram(
                    plt,
                    feat[:, ref_ch].T,
                    fs=args.fs,
                    mode="db",
                    frame_shift=frame_shift,
                    bottom=False,
                    labelbottom=False,
                )

                plt.subplot(4, 1, 3)
                plt.title("Masked speech [ref={}ch]".format(ref_ch))
                plot_spectrogram(
                    plt,
                    (feat[:, ref_ch] * mas[:, ref_ch]).T,
                    frame_shift=frame_shift,
                    fs=args.fs,
                    mode="db",
                    bottom=False,
                    labelbottom=False,
                )

                plt.subplot(4, 1, 4)
                plt.title("Enhanced speech")
                plot_spectrogram(
                    plt, enh.T, fs=args.fs, mode="db", frame_shift=frame_shift
                )

                plt.savefig(os.path.join(args.image_dir, name + ".png"))
                plt.clf()

            # Write enhanced wave files
            if enh_writer is not None:
                if istft is not None:
                    enh = istft(enh)

                if args.keep_length:
                    # Compare against THIS utterance's original length
                    # (org_feats[idx]), not the number of utterances in the
                    # batch.
                    org_len = len(org_feats[idx])
                    if org_len < len(enh):
                        # Truncate the frames added by stft padding
                        enh = enh[:org_len]
                    elif org_len > len(enh):
                        # Zero-pad the first axis up to the original length
                        padwidth = [(0, (org_len - len(enh)))] + [(0, 0)] * (
                            enh.ndim - 1
                        )
                        enh = np.pad(enh, padwidth, mode="constant")

                if args.enh_filetype in ("sound", "sound.hdf5"):
                    enh_writer[name] = (args.fs, enh)
                else:
                    # Hint: To dump stft_signal, mask or etc,
                    # enh_filetype='hdf5' might be convenient.
                    enh_writer[name] = enh

            # NOTE: this only leaves the per-utterance loop; the outer batch
            # loop keeps iterating (and hits this break again on each batch).
            if num_images >= args.num_images and enh_writer is None:
                logging.info("Breaking the process.")
                break


def ctc_align(args):
    """CTC forced alignments with the given args.

    Loads the model at ``args.model``, aligns every utterance of
    ``args.align_json`` one at a time with ``model.ctc.forced_align`` and
    writes ``{"utts": {name: {"ctc_alignment": "<tokens>"}}}`` as UTF-8 JSON
    to ``args.result_label``.

    Args:
        args (namespace): The program arguments.

    Raises:
        NotImplementedError: If ``args.ngpu > 1`` or ``args.batchsize != 0``
            (batched alignment is not implemented).
    """

    def add_alignment_to_json(js, alignment, char_list):
        """Build the result entry holding the alignment of one utterance.

        Args:
            js (dict[str, Any]): Groundtruth utterance dict (currently unused).
            alignment (list[int]): List of alignment.
            char_list (list[str]): List of characters.

        Returns:
            dict[str, str]: ``{"ctc_alignment": tokens}`` where ``tokens`` is
                the space-joined sequence of ``char_list`` entries.

        """
        alignment_tokens = " ".join(char_list[a] for a in alignment)
        return {"ctc_alignment": alignment_tokens}

    set_deterministic_pytorch(args)
    model, train_args = load_trained_model(args.model)
    assert isinstance(model, ASRInterface)

    load_inputs_and_targets = LoadInputsAndTargets(
        mode="asr",
        load_output=True,
        sort_in_input_length=False,
        preprocess_conf=train_args.preprocess_conf
        if args.preprocess_conf is None
        else args.preprocess_conf,
        preprocess_args={"train": False},
    )

    if args.ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")
    if args.ngpu == 1:
        device = "cuda"
    else:
        device = "cpu"
    dtype = getattr(torch, args.dtype)
    logging.info(f"Decoding device={device}, dtype={dtype}")
    model.to(device=device, dtype=dtype).eval()

    # read json data
    with open(args.align_json, "rb") as f:
        js = json.load(f)["utts"]
    new_js = {}
    if args.batchsize == 0:
        with torch.no_grad():
            for idx, name in enumerate(js.keys(), 1):
                # Pass the utterance id as a logging argument: concatenating
                # it into the format string breaks on ids that contain "%".
                logging.info("(%d/%d) aligning %s", idx, len(js), name)
                batch = [(name, js[name])]
                feat, label = load_inputs_and_targets(batch)
                feat = feat[0]
                label = label[0]
                enc = model.encode(torch.as_tensor(feat).to(device)).unsqueeze(0)
                alignment = model.ctc.forced_align(enc, label)
                new_js[name] = add_alignment_to_json(
                    js[name], alignment, train_args.char_list
                )
    else:
        raise NotImplementedError("Align_batch is not implemented.")

    with open(args.result_label, "wb") as f:
        f.write(
            json.dumps(
                {"utts": new_js}, indent=4, ensure_ascii=False, sort_keys=True
            ).encode("utf_8")
        )


================================================
FILE: asr/pytorch_backend/asr_init.py
================================================
"""Finetuning methods."""

import logging
import os
import torch

from collections import OrderedDict

from espnet.asr.asr_utils import get_model_conf
from espnet.asr.asr_utils import torch_load
from espnet.nets.asr_interface import ASRInterface
from espnet.nets.mt_interface import MTInterface
from espnet.nets.pytorch_backend.transducer.utils import custom_torch_load
from espnet.nets.tts_interface import TTSInterface
from espnet.utils.dynamic_import import dynamic_import


def freeze_modules(model, modules):
    """Disable gradient updates for parameters under the given prefixes.

    Every parameter whose name starts with one of ``modules`` gets
    ``requires_grad = False`` (and is reported through ``logging.info``).

    Args:
        model (torch.nn.Module): main model to update
        modules (list): name prefixes of the modules to freeze

    Return:
        model (torch.nn.Module): the same model object, updated in place
        model_params (filter): lazy iterator over the parameters that still
            require gradients

    """
    for name, parameter in model.named_parameters():
        is_frozen = any(name.startswith(prefix) for prefix in modules)
        if not is_frozen:
            continue
        logging.info(f"freezing {name}, it will not be updated.")
        parameter.requires_grad = False

    trainable = filter(lambda p: p.requires_grad, model.parameters())
    return model, trainable


def transfer_verification(model_state_dict, partial_state_dict, modules):
    """Check that both state dicts agree on the selected modules.

    For each state dict the ``(key, shape)`` pairs of all entries whose key
    starts with one of ``modules`` are collected; the transfer is allowed
    only if the two collections are identical.

    Args:
        model_state_dict (OrderedDict): the initial model state_dict
        partial_state_dict (OrderedDict): the trained model state_dict
        modules (list): specified module list for transfer

    Return:
        (boolean): allow transfer

    """

    def _selected_signature(state_dict):
        # Sorted (key, shape) pairs restricted to the requested prefixes.
        return sorted(
            (name, tensor.shape)
            for name, tensor in state_dict.items()
            if any(name.startswith(prefix) for prefix in modules)
        )

    provided = _selected_signature(partial_state_dict)
    expected = _selected_signature(model_state_dict)

    return len(expected) == len(provided) and expected == provided


def get_partial_state_dict(model_state_dict, modules):
    """Extract the entries of a state_dict that belong to the given modules.

    An entry is kept when its key starts with at least one of ``modules``;
    the original ordering of the kept entries is preserved.

    Args:
        model_state_dict (OrderedDict): trained model state_dict
        modules (list): specified module list for transfer

    Return:
        new_state_dict (OrderedDict): the updated state_dict

    """
    return OrderedDict(
        (name, tensor)
        for name, tensor in model_state_dict.items()
        if any(name.startswith(prefix) for prefix in modules)
    )


def get_lm_state_dict(lm_state_dict):
    """Rename LM state dict keys to the matching ASR decoder keys.

    ``predictor.embed.weight`` becomes ``dec.embed.weight`` and
    ``predictor.rnn.<i>.<name>`` becomes ``dec.decoder.<i>.<name>_l0``.
    All other entries are dropped.

    Args:
        lm_state_dict (OrderedDict): pre-trained LM state_dict

    Return:
        new_state_dict (OrderedDict): LM state_dict with updated keys

    """
    converted = OrderedDict()

    for name, tensor in list(lm_state_dict.items()):
        if name == "predictor.embed.weight":
            converted["dec.embed.weight"] = tensor
            continue

        if not name.startswith("predictor.rnn."):
            continue

        parts = name.split(".")
        converted["dec.decoder.{}.{}_l0".format(parts[2], parts[3])] = tensor

    return converted


def filter_modules(model_state_dict, modules):
    """Keep only the requested modules that exist in the state_dict.

    A requested module is kept when at least one state_dict key starts with
    it. Requested modules without any match are reported with warnings,
    together with the full list of available keys.

    Args:
        model_state_dict (OrderedDict): trained model state_dict
        modules (list): specified module list for transfer

    Return:
        new_mods (list): the update module list

    """
    available = list(model_state_dict.keys())
    matched, unmatched = [], []

    for prefix in modules:
        found = any(key.startswith(prefix) for key in available)
        (matched if found else unmatched).append(prefix)

    if unmatched:
        logging.warning(
            "module(s) %s don't match or (partially match) "
            "available modules in model.",
            unmatched,
        )
        logging.warning("for information, the existing modules in model are:")
        logging.warning("%s", available)

    return matched


def load_trained_model(model_path, training=True):
    """Build a model from its saved config and load its trained weights.

    The config is read from ``model.json`` located next to ``model_path``.
    Model modules whose import path contains ``"transducer"`` are built and
    loaded with the ``training`` flag; all others ignore it.

    Args:
        model_path (str): Path to model.***.best
        training (bool): Forwarded to the constructor and to
            ``custom_torch_load`` for transducer models only.

    Return:
        model: the model instance with loaded parameters
        train_args: the training arguments read from the config

    """
    conf_path = os.path.join(os.path.dirname(model_path), "model.json")
    idim, odim, train_args = get_model_conf(model_path, conf_path)

    logging.warning("reading model parameters from " + model_path)

    # Configs without an explicit model module fall back to this default.
    model_module = getattr(
        train_args, "model_module", "espnet.nets.pytorch_backend.e2e_asr:E2E"
    )
    # NOTE: train_args.ctc_type is left as stored in the config; an override
    # to "builtin" (to prevent import errors) was previously disabled here.

    model_class = dynamic_import(model_module)

    if "transducer" not in model_module:
        model = model_class(idim, odim, train_args)
        torch_load(model_path, model)
        return model, train_args

    model = model_class(idim, odim, train_args, training=training)
    custom_torch_load(model_path, model, training=training)
    return model, train_args

def _load_trained_model(model_path, training=True, patience=10):
    """Load a trained model, retrying when loading fails.

    When decoding jobs are started with a very large number of parallel jobs,
    reading the model can fail, so ``load_trained_model`` is attempted up to
    ``patience`` times.

    Args:
        model_path (str): Path to model.***.best
        training (bool): Forwarded to ``load_trained_model``.
        patience (int): Maximum number of loading attempts.

    Return:
        model, train_args: as returned by ``load_trained_model``.

    Raises:
        RuntimeError: If every attempt failed; the last error is chained.

    """
    last_error = None
    for i in range(patience):
        try:
            # Delegate to the real loader; calling this wrapper itself here
            # would recurse without bound.
            model, train_args = load_trained_model(model_path, training=training)
        except Exception as err:  # keep KeyboardInterrupt/SystemExit fatal
            last_error = err
            print(f"Model Init: Fail in {i}-th trail. Try again!", flush=True)
        else:
            print(f"Model Init: Successful initialize model in {i}-th trail", flush=True)
            return model, train_args

    raise RuntimeError(
        f"failed to load model {model_path} after {patience} attempts"
    ) from last_error

def get_trained_model_state_dict(model_path):
    """Return the state dict of a trained model for pre-initialization.

    Paths containing ``"rnnlm"`` are treated as language models: the file is
    loaded with ``torch.load`` and its keys are converted by
    ``get_lm_state_dict``. Any other path is loaded as a full model, using
    the ``model.json`` that sits next to it.

    Args:
        model_path (str): Path to model.***.best

    Return:
        (OrderedDict): the loaded (or converted) state_dict

    """
    if "rnnlm" in model_path:
        logging.warning("reading model parameters from %s", model_path)
        return get_lm_state_dict(torch.load(model_path))

    conf_path = os.path.join(os.path.dirname(model_path), "model.json")
    idim, odim, args = get_model_conf(model_path, conf_path)

    logging.warning("reading model parameters from " + model_path)

    # Configs without an explicit model module fall back to this default.
    model_module = getattr(
        args, "model_module", "espnet.nets.pytorch_backend.e2e_asr:E2E"
    )

    model = dynamic_import(model_module)(idim, odim, args)
    torch_load(model_path, model)
    assert isinstance(model, (MTInterface, ASRInterface, TTSInterface))

    return model.state_dict()


def load_trained_modules(idim, odim, args, interface=ASRInterface):
    """Create a model and initialize parts of it from pre-trained model(s).

    For both ``(args.enc_init, args.enc_init_mods)`` and
    ``(args.dec_init, args.dec_init_mods)``: when the path is set and is an
    existing file, the requested modules are extracted from that model's
    state dict and copied into the new model, provided their keys and shapes
    match. Every mismatch or missing file is only reported as a warning.

    Args:
        idim (int): initial input dimension.
        odim (int): initial output dimension.
        args (Namespace): The initial model arguments.
        interface (Interface): ASRInterface or STInterface or TTSInterface.

    Return:
        model (torch.nn.Module): The model with pretrained modules.

    """

    def print_new_keys(state_dict, modules, model_path):
        # Report every key that is about to be overwritten.
        logging.warning("loading %s from model: %s", modules, model_path)

        for name in state_dict.keys():
            logging.warning("override %s" % name)

    enc_model_path, dec_model_path = args.enc_init, args.dec_init
    enc_modules, dec_modules = args.enc_init_mods, args.dec_init_mods

    main_model = dynamic_import(args.model_module)(idim, odim, args)
    assert isinstance(main_model, interface)

    main_state_dict = main_model.state_dict()

    logging.warning("model(s) found for pre-initialization")
    sources = (
        (enc_model_path, enc_modules),
        (dec_model_path, dec_modules),
    )
    for model_path, modules in sources:
        if model_path is None:
            continue

        if not os.path.isfile(model_path):
            logging.warning("model was not found : %s", model_path)
            continue

        model_state_dict = get_trained_model_state_dict(model_path)
        modules = filter_modules(model_state_dict, modules)
        partial_state_dict = get_partial_state_dict(model_state_dict, modules)

        if not partial_state_dict:
            # nothing to transfer from this model
            continue

        if not transfer_verification(main_state_dict, partial_state_dict, modules):
            logging.warning(
                f"modules {modules} in model {model_path} "
                f"don't match your training config",
            )
            continue

        print_new_keys(partial_state_dict, modules, model_path)
        main_state_dict.update(partial_state_dict)

    main_model.load_state_dict(main_state_dict)

    return main_model


================================================
FILE: asr/pytorch_backend/asr_mix.py
================================================
#!/usr/bin/env python3

"""
This script is used for multi-speaker speech recognition.

Copyright 2017 Johns Hopkins University (Shinji Watanabe)
 Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
"""
import json
import logging
import os

# chainer related
from chainer import training
from chainer.training import extensions
from itertools import zip_longest as zip_longest
import numpy as np
from tensorboardX import SummaryWriter
import torch

from espnet.asr.asr_mix_utils import add_results_to_json
from espnet.asr.asr_utils import adadelta_eps_decay

from espnet.asr.asr_utils import CompareValueTrigger
from espnet.asr.asr_utils import get_model_conf
from espnet.asr.asr_utils import restore_snapshot
from espnet.asr.asr_utils import snapshot_object
from espnet.asr.asr_utils import torch_load
from espnet.asr.asr_utils import torch_resume
from espnet.asr.asr_utils import torch_snapshot
from espnet.asr.pytorch_backend.asr import CustomEvaluator
from espnet.asr.pytorch_backend.asr import CustomUpdater
from espnet.asr.pytorch_backend.asr import load_trained_model
import espnet.lm.pytorch_backend.extlm as extlm_pytorch
from espnet.nets.asr_interface import ASRInterface
from espnet.nets.pytorch_backend.e2e_asr_mix import pad_list
import espnet.nets.pytorch_backend.lm.default as lm_pytorch
from espnet.utils.dataset import ChainerDataLoader
from espnet.utils.dataset import TransformDataset
from espnet.utils.deterministic_utils import set_deterministic_pytorch
from espnet.utils.dynamic_import import dynamic_import
from espnet.utils.io_utils import LoadInputsAndTargets
from espnet.utils.training.batchfy import make_batchset
from espnet.utils.training.iterators import ShufflingEnabler
from espnet.utils.training.tensorboard_logger import TensorboardLogger
from espnet.utils.training.train_utils import check_early_stop
from espnet.utils.training.train_utils import set_early_stop

import matplotlib

matplotlib.use("Agg")


class CustomConverter(object):
    """Batch converter for multi-speaker training with Pytorch.

    Args:
        subsampling_factor (int): The subsampling factor.
        dtype (torch.dtype): Data type to convert.
        num_spkrs (int): Number of trailing batch entries taken as targets.

    """

    def __init__(self, subsampling_factor=1, dtype=torch.float32, num_spkrs=2):
        """Store the conversion settings."""
        self.subsampling_factor = subsampling_factor
        self.ignore_id = -1  # padding value used for the targets
        self.dtype = dtype
        self.num_spkrs = num_spkrs

    def __call__(self, batch, device=torch.device("cpu")):
        """Pad a batch, convert it to tensors and move it to a device.

        Args:
            batch (list(tuple(str, dict[str, dict[str, Any]]))): The batch to transform.
            device (torch.device): The device to send to.

        Returns:
            tuple(torch.Tensor, torch.Tensor, torch.Tensor): Padded inputs
                (a ``{"real", "imag"}`` dict for complex input), input
                lengths and padded targets.

        """
        # the whole minibatch is wrapped in a one-element list
        assert len(batch) == 1
        sample = batch[0]
        xs = sample[0]
        ys = sample[-self.num_spkrs :]

        # frame-level subsampling of the inputs
        factor = self.subsampling_factor
        if factor > 1:
            xs = [x[::factor, :] for x in xs]

        # input lengths (after subsampling)
        ilens = np.array([x.shape[0] for x in xs])

        def _pad_inputs(arrays):
            # zero-pad float arrays and move them to the target device/dtype
            tensors = [torch.from_numpy(a).float() for a in arrays]
            return pad_list(tensors, 0).to(device, dtype=self.dtype)

        # currently only real-valued tensors are supported downstream
        if xs[0].dtype.kind == "c":
            # Note(kamo):
            # {'real': ..., 'imag': ...} will be changed to ComplexTensor in E2E.
            # Don't create ComplexTensor and give it to E2E here
            # because torch.nn.DataParallel can't handle it.
            real_part = _pad_inputs([x.real for x in xs])
            imag_part = _pad_inputs([x.imag for x in xs])
            xs_pad = {"real": real_part, "imag": imag_part}
        else:
            xs_pad = _pad_inputs(xs)

        ilens = torch.from_numpy(ilens).to(device)

        if isinstance(ys[0], np.ndarray):
            labels = [torch.from_numpy(y).long() for y in ys]
            ys_pad = pad_list(labels, self.ignore_id).to(device)
        else:
            # one label list per speaker: flatten, pad jointly, then regroup
            labels = [torch.from_numpy(y).long() for spk_ys in ys for y in spk_ys]
            flat_pad = pad_list(labels, self.ignore_id)
            grouped = flat_pad.view(self.num_spkrs, -1, flat_pad.size(1))
            ys_pad = grouped.transpose(0, 1).to(device)  # (B, num_spkrs, Tmax)

        return xs_pad, ilens, ys_pad


def train(args):
    """Train with the given args.

    Builds the model selected by ``args.model_module``, writes its configuration
    to ``<args.outdir>/model.json``, prepares the optimizer, the data iterators
    and the trainer extensions, and then runs the training loop.

    Args:
        args (namespace): The program arguments.

    Raises:
        NotImplementedError: If ``args.opt`` names an unknown optimizer.
        ImportError: If ``args.train_dtype`` requests an apex opt level
            ("O0"-"O3") but apex cannot be imported.

    """
    set_deterministic_pytorch(args)

    # check cuda availability
    if not torch.cuda.is_available():
        logging.warning("cuda is not available")

    # get input and output dimension info
    with open(args.valid_json, "rb") as f:
        valid_json = json.load(f)["utts"]
    utts = list(valid_json.keys())
    idim = int(valid_json[utts[0]]["input"][0]["shape"][-1])
    odim = int(valid_json[utts[0]]["output"][0]["shape"][-1])
    logging.info("#input dims : " + str(idim))
    logging.info("#output dims: " + str(odim))

    # specify attention, CTC, hybrid mode
    if args.mtlalpha == 1.0:
        mtl_mode = "ctc"
        logging.info("Pure CTC mode")
    elif args.mtlalpha == 0.0:
        mtl_mode = "att"
        logging.info("Pure attention mode")
    else:
        mtl_mode = "mtl"
        logging.info("Multitask learning mode")

    # specify model architecture
    model_class = dynamic_import(args.model_module)
    model = model_class(idim, odim, args)
    assert isinstance(model, ASRInterface)
    subsampling_factor = model.subsample[0]

    if args.rnnlm is not None:
        rnnlm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
        rnnlm = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(
                len(args.char_list),
                rnnlm_args.layer,
                rnnlm_args.unit,
                getattr(rnnlm_args, "embed_unit", None),  # for backward compatibility
            )
        )
        # torch_load copies the stored parameters into `rnnlm`; torch.load would
        # treat the second argument as `map_location` and leave the LM untouched.
        torch_load(args.rnnlm, rnnlm)
        model.rnnlm = rnnlm

    # write model config
    os.makedirs(args.outdir, exist_ok=True)
    model_conf = args.outdir + "/model.json"
    with open(model_conf, "wb") as f:
        logging.info("writing a model config file to " + model_conf)
        f.write(
            json.dumps(
                (idim, odim, vars(args)), indent=4, ensure_ascii=False, sort_keys=True
            ).encode("utf_8")
        )
    for key in sorted(vars(args).keys()):
        logging.info("ARGS: " + key + ": " + str(vars(args)[key]))

    reporter = model.reporter

    # check the use of multi-gpu
    if args.ngpu > 1:
        if args.batch_size != 0:
            logging.warning(
                "batch size is automatically increased (%d -> %d)"
                % (args.batch_size, args.batch_size * args.ngpu)
            )
            args.batch_size *= args.ngpu

    # set torch device
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    if args.train_dtype in ("float16", "float32", "float64"):
        dtype = getattr(torch, args.train_dtype)
    else:
        dtype = torch.float32
    model = model.to(device=device, dtype=dtype)

    # report the total and the trainable parameter counts
    num_params = sum(p.numel() for p in model.parameters())
    num_trained = sum(p.numel() for p in model.parameters() if p.requires_grad)
    logging.warning(
        "num. model params: {:,} (num. trained: {:,} ({:.1f}%))".format(
            num_params,
            num_trained,
            num_trained * 100.0 / num_params,
        )
    )

    # Setup an optimizer
    if args.opt == "adadelta":
        optimizer = torch.optim.Adadelta(
            model.parameters(), rho=0.95, eps=args.eps, weight_decay=args.weight_decay
        )
    elif args.opt == "adam":
        optimizer = torch.optim.Adam(model.parameters(), weight_decay=args.weight_decay)
    elif args.opt == "noam":
        from espnet.nets.pytorch_backend.transformer.optimizer import get_std_opt

        optimizer = get_std_opt(
            model.parameters(),
            args.adim,
            args.transformer_warmup_steps,
            args.transformer_lr,
        )
    else:
        raise NotImplementedError("unknown optimizer: " + args.opt)

    # setup apex.amp
    if args.train_dtype in ("O0", "O1", "O2", "O3"):
        try:
            from apex import amp
        except ImportError:
            logging.error(
                f"You need to install apex for --train-dtype {args.train_dtype}. "
                "See https://github.com/NVIDIA/apex#linux"
            )
            raise
        if args.opt == "noam":
            # the noam optimizer is a wrapper; amp is applied to the wrapped one
            model, optimizer.optimizer = amp.initialize(
                model, optimizer.optimizer, opt_level=args.train_dtype
            )
        else:
            model, optimizer = amp.initialize(
                model, optimizer, opt_level=args.train_dtype
            )
        use_apex = True
    else:
        use_apex = False

    # FIXME: TOO DIRTY HACK
    setattr(optimizer, "target", reporter)
    setattr(optimizer, "serialize", lambda s: reporter.serialize(s))

    # Setup a converter
    converter = CustomConverter(
        subsampling_factor=subsampling_factor, dtype=dtype, num_spkrs=args.num_spkrs
    )

    # read json data (the validation json has already been parsed above)
    with open(args.train_json, "rb") as f:
        train_json = json.load(f)["utts"]

    use_sortagrad = args.sortagrad == -1 or args.sortagrad > 0
    min_batch_size = args.ngpu if args.ngpu > 1 else 1
    # make minibatch list (variable length)
    train_batches = make_batchset(
        train_json,
        args.batch_size,
        args.maxlen_in,
        args.maxlen_out,
        args.minibatches,
        min_batch_size=min_batch_size,
        shortest_first=use_sortagrad,
        count=args.batch_count,
        batch_bins=args.batch_bins,
        batch_frames_in=args.batch_frames_in,
        batch_frames_out=args.batch_frames_out,
        batch_frames_inout=args.batch_frames_inout,
        iaxis=0,
        oaxis=-1,
    )
    valid_batches = make_batchset(
        valid_json,
        args.batch_size,
        args.maxlen_in,
        args.maxlen_out,
        args.minibatches,
        min_batch_size=min_batch_size,
        count=args.batch_count,
        batch_bins=args.batch_bins,
        batch_frames_in=args.batch_frames_in,
        batch_frames_out=args.batch_frames_out,
        batch_frames_inout=args.batch_frames_inout,
        iaxis=0,
        oaxis=-1,
    )

    load_tr = LoadInputsAndTargets(
        mode="asr",
        load_output=True,
        preprocess_conf=args.preprocess_conf,
        preprocess_args={"train": True},  # Switch the mode of preprocessing
    )
    load_cv = LoadInputsAndTargets(
        mode="asr",
        load_output=True,
        preprocess_conf=args.preprocess_conf,
        preprocess_args={"train": False},  # Switch the mode of preprocessing
    )
    # hack to make batchsize argument as 1
    # actual batchsize is included in a list
    # default collate function converts numpy array to pytorch tensor
    # we used an empty collate function instead which returns list
    train_iter = {
        "main": ChainerDataLoader(
            dataset=TransformDataset(
                train_batches, lambda data: converter([load_tr(data)])
            ),
            batch_size=1,
            num_workers=args.n_iter_processes,
            shuffle=True,
            collate_fn=lambda x: x[0],
        )
    }
    valid_iter = {
        "main": ChainerDataLoader(
            dataset=TransformDataset(
                valid_batches, lambda data: converter([load_cv(data)])
            ),
            batch_size=1,
            shuffle=False,
            collate_fn=lambda x: x[0],
            num_workers=args.n_iter_processes,
        )
    }

    # Set up a trainer
    updater = CustomUpdater(
        model,
        args.grad_clip,
        train_iter,
        optimizer,
        device,
        args.ngpu,
        args.grad_noise,
        args.accum_grad,
        use_apex=use_apex,
    )
    trainer = training.Trainer(updater, (args.epochs, "epoch"), out=args.outdir)

    if use_sortagrad:
        trainer.extend(
            ShufflingEnabler([train_iter]),
            trigger=(args.sortagrad if args.sortagrad != -1 else args.epochs, "epoch"),
        )

    # Resume from a snapshot
    if args.resume:
        logging.info("resumed from %s" % args.resume)
        torch_resume(args.resume, trainer)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(CustomEvaluator(model, valid_iter, reporter, device, args.ngpu))

    # Save attention weight each epoch
    if args.num_save_attention > 0 and args.mtlalpha != 1.0:
        data = sorted(
            list(valid_json.items())[: args.num_save_attention],
            key=lambda x: int(x[1]["input"][0]["shape"][1]),
            reverse=True,
        )
        if hasattr(model, "module"):
            att_vis_fn = model.module.calculate_all_attentions
            plot_class = model.module.attention_plot_class
        else:
            att_vis_fn = model.calculate_all_attentions
            plot_class = model.attention_plot_class
        att_reporter = plot_class(
            att_vis_fn,
            data,
            args.outdir + "/att_ws",
            converter=converter,
            transform=load_cv,
            device=device,
        )
        trainer.extend(att_reporter, trigger=(1, "epoch"))
    else:
        att_reporter = None

    # Make a plot for training and validation values
    trainer.extend(
        extensions.PlotReport(
            [
                "main/loss",
                "validation/main/loss",
                "main/loss_ctc",
                "validation/main/loss_ctc",
                "main/loss_att",
                "validation/main/loss_att",
            ],
            "epoch",
            file_name="loss.png",
        )
    )
    trainer.extend(
        extensions.PlotReport(
            ["main/acc", "validation/main/acc"], "epoch", file_name="acc.png"
        )
    )
    trainer.extend(
        extensions.PlotReport(
            ["main/cer_ctc", "validation/main/cer_ctc"], "epoch", file_name="cer.png"
        )
    )

    # Save best models
    trainer.extend(
        snapshot_object(model, "model.loss.best"),
        trigger=training.triggers.MinValueTrigger("validation/main/loss"),
    )
    if mtl_mode != "ctc":
        trainer.extend(
            snapshot_object(model, "model.acc.best"),
            trigger=training.triggers.MaxValueTrigger("validation/main/acc"),
        )

    # save snapshot which contains model and optimizer states
    trainer.extend(torch_snapshot(), trigger=(1, "epoch"))

    # epsilon decay in the optimizer
    if args.opt == "adadelta":
        if args.criterion == "acc" and mtl_mode != "ctc":
            trainer.extend(
                restore_snapshot(
                    model, args.outdir + "/model.acc.best", load_fn=torch_load
                ),
                trigger=CompareValueTrigger(
                    "validation/main/acc",
                    lambda best_value, current_value: best_value > current_value,
                ),
            )
            trainer.extend(
                adadelta_eps_decay(args.eps_decay),
                trigger=CompareValueTrigger(
                    "validation/main/acc",
                    lambda best_value, current_value: best_value > current_value,
                ),
            )
        elif args.criterion == "loss":
            trainer.extend(
                restore_snapshot(
                    model, args.outdir + "/model.loss.best", load_fn=torch_load
                ),
                trigger=CompareValueTrigger(
                    "validation/main/loss",
                    lambda best_value, current_value: best_value < current_value,
                ),
            )
            trainer.extend(
                adadelta_eps_decay(args.eps_decay),
                trigger=CompareValueTrigger(
                    "validation/main/loss",
                    lambda best_value, current_value: best_value < current_value,
                ),
            )

    # Write a log of evaluation statistics for each epoch
    trainer.extend(
        extensions.LogReport(trigger=(args.report_interval_iters, "iteration"))
    )
    report_keys = [
        "epoch",
        "iteration",
        "main/loss",
        "main/loss_ctc",
        "main/loss_att",
        "validation/main/loss",
        "validation/main/loss_ctc",
        "validation/main/loss_att",
        "main/acc",
        "validation/main/acc",
        "main/cer_ctc",
        "validation/main/cer_ctc",
        "elapsed_time",
    ]
    if args.opt == "adadelta":
        trainer.extend(
            extensions.observe_value(
                "eps",
                lambda trainer: trainer.updater.get_optimizer("main").param_groups[0][
                    "eps"
                ],
            ),
            trigger=(args.report_interval_iters, "iteration"),
        )
        report_keys.append("eps")
    if args.report_cer:
        report_keys.append("validation/main/cer")
    if args.report_wer:
        report_keys.append("validation/main/wer")
    trainer.extend(
        extensions.PrintReport(report_keys),
        trigger=(args.report_interval_iters, "iteration"),
    )

    trainer.extend(extensions.ProgressBar(update_interval=args.report_interval_iters))
    set_early_stop(trainer, args)

    if args.tensorboard_dir is not None and args.tensorboard_dir != "":
        trainer.extend(
            TensorboardLogger(SummaryWriter(args.tensorboard_dir), att_reporter),
            trigger=(args.report_interval_iters, "iteration"),
        )
    # Run the training
    trainer.run()
    check_early_stop(trainer, args.epochs)


def recog(args):
    """Decode with the given args.

    Loads the trained model from ``args.model``, optionally attaches a
    character-level and/or word-level RNNLM, decodes every utterance listed in
    ``args.recog_json`` and writes the n-best results as JSON to
    ``args.result_label``.

    Args:
        args (namespace): The program arguments.

    Raises:
        ValueError: If ``args.rnnlm`` points to a language model whose
            ``model_module`` is not ``"default"``.

    """
    set_deterministic_pytorch(args)
    model, train_args = load_trained_model(args.model)
    assert isinstance(model, ASRInterface)
    model.recog_args = args

    # read rnnlm
    if args.rnnlm:
        rnnlm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
        if getattr(rnnlm_args, "model_module", "default") != "default":
            raise ValueError(
                "use '--api v2' option to decode with non-default language model"
            )
        rnnlm = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(
                len(train_args.char_list),
                rnnlm_args.layer,
                rnnlm_args.unit,
                getattr(rnnlm_args, "embed_unit", None),  # for backward compatibility
            )
        )
        torch_load(args.rnnlm, rnnlm)
        rnnlm.eval()
    else:
        rnnlm = None

    if args.word_rnnlm:
        rnnlm_args = get_model_conf(args.word_rnnlm, args.word_rnnlm_conf)
        word_dict = rnnlm_args.char_list_dict
        char_dict = {x: i for i, x in enumerate(train_args.char_list)}
        word_rnnlm = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(len(word_dict), rnnlm_args.layer, rnnlm_args.unit)
        )
        torch_load(args.word_rnnlm, word_rnnlm)
        word_rnnlm.eval()

        # combine with the character LM when both are given,
        # otherwise use the word LM alone through a look-ahead wrapper
        if rnnlm is not None:
            rnnlm = lm_pytorch.ClassifierWithState(
                extlm_pytorch.MultiLevelLM(
                    word_rnnlm.predictor, rnnlm.predictor, word_dict, char_dict
                )
            )
        else:
            rnnlm = lm_pytorch.ClassifierWithState(
                extlm_pytorch.LookAheadWordLM(
                    word_rnnlm.predictor, word_dict, char_dict
                )
            )

    # gpu
    if args.ngpu == 1:
        gpu_id = list(range(args.ngpu))
        logging.info("gpu id: " + str(gpu_id))
        model.cuda()
        if rnnlm:
            rnnlm.cuda()

    # read json data
    with open(args.recog_json, "rb") as f:
        js = json.load(f)["utts"]
    new_js = {}

    load_inputs_and_targets = LoadInputsAndTargets(
        mode="asr",
        load_output=False,
        sort_in_input_length=False,
        preprocess_conf=train_args.preprocess_conf
        if args.preprocess_conf is None
        else args.preprocess_conf,
        preprocess_args={"train": False},
    )

    if args.batchsize == 0:
        # utterance-by-utterance decoding
        num_utts = len(js)
        with torch.no_grad():
            for idx, name in enumerate(js.keys(), 1):
                # pass the name as an argument: an utterance id containing "%"
                # must not be interpreted as part of the format string
                logging.info("(%d/%d) decoding %s", idx, num_utts, name)
                batch = [(name, js[name])]
                feat = load_inputs_and_targets(batch)[0][0]
                nbest_hyps = model.recognize(feat, args, train_args.char_list, rnnlm)
                new_js[name] = add_results_to_json(
                    js[name], nbest_hyps, train_args.char_list
                )

    else:

        def grouper(n, iterable, fillvalue=None):
            # yield tuples of n items; the last tuple is padded with fillvalue
            kargs = [iter(iterable)] * n
            return zip_longest(*kargs, fillvalue=fillvalue)

        # sort data if batchsize > 1
        keys = list(js.keys())
        if args.batchsize > 1:
            feat_lens = [js[key]["input"][0]["shape"][0] for key in keys]
            sorted_index = sorted(range(len(feat_lens)), key=lambda i: -feat_lens[i])
            keys = [keys[i] for i in sorted_index]

        with torch.no_grad():
            for names in grouper(args.batchsize, keys, None):
                # drop the padding added by grouper to the last group
                names = [name for name in names if name]
                batch = [(name, js[name]) for name in names]
                feats = load_inputs_and_targets(batch)[0]
                nbest_hyps = model.recognize_batch(
                    feats, args, train_args.char_list, rnnlm=rnnlm
                )

                for i, name in enumerate(names):
                    nbest_hyp = [hyp[i] for hyp in nbest_hyps]
                    new_js[name] = add_results_to_json(
                        js[name], nbest_hyp, train_args.char_list
                    )

    with open(args.result_label, "wb") as f:
        f.write(
            json.dumps(
                {"utts": new_js}, indent=4, ensure_ascii=False, sort_keys=True
            ).encode("utf_8")
        )


================================================
FILE: asr/pytorch_backend/recog.py
================================================
"""V2 backend for `asr_recog.py` using py:class:`espnet.nets.beam_search.BeamSearch`."""

import json
import logging
import os
import torch

from espnet.asr.asr_utils import add_results_to_json
from espnet.asr.asr_utils import get_model_conf
from espnet.asr.asr_utils import torch_load
from espnet.asr.pytorch_backend.asr import load_trained_model
from espnet.nets.asr_interface import ASRInterface
from espnet.nets.batch_beam_search import BatchBeamSearch
from espnet.nets.beam_search import BeamSearch
from espnet.nets.lm_interface import dynamic_import_lm
from espnet.nets.scorer_interface import BatchScorerInterface
from espnet.nets.scorers.length_bonus import LengthBonus
from espnet.utils.deterministic_utils import set_deterministic_pytorch
from espnet.utils.io_utils import LoadInputsAndTargets
from espnet.nets.scorers.mmi_frame_scorer import MMIFrameScorer
# from espnet.nets.scorers.mmi_prefix_score import MMIFrameScorer
from espnet.nets.scorers.ctc import CTCPrefixScorer
from espnet.nets.scorers.word_ngram import WordNgramPartialScorer
from espnet.nets.scorers.mmi_rescorer import MMIRescorer
from espnet.utils.rtf_calculator import RTF_calculator

def recog_v2(args):
    """Decode with custom models that implements ScorerInterface.

    Loads the trained model from ``args.model``, assembles the scorers enabled
    by the arguments (decoder, CTC, MMI, LM, n-gram, word n-gram, length bonus),
    runs beam search over every utterance in ``args.recog_json`` and writes the
    n-best results as JSON to ``args.result_label``.

    Notes:
        The previous backend espnet.asr.pytorch_backend.asr.recog
        only supports E2E and RNNLM

    Args:
        args (namespace): The program arguments.
        See py:func:`espnet.bin.asr_recog.get_parser` for details

    Raises:
        NotImplementedError: If multi-utterance batches, streaming mode,
            a word LM, or more than one GPU is requested.

    """
    logging.warning("experimental API for custom LMs is selected by --api v2")
    if args.batchsize > 1:
        raise NotImplementedError("multi-utt batch decoding is not implemented")
    if args.streaming_mode is not None:
        raise NotImplementedError("streaming mode is not implemented")
    if args.word_rnnlm:
        raise NotImplementedError("word LM is not implemented")

    if args.ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")
    if args.ngpu == 1:
        device = torch.device("cuda")
    else:
        # hide every GPU so that CUDA is not available from here on
        device = torch.device("cpu")
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
        assert not torch.cuda.is_available()
    print(f"Rank: {args.local_rank} Using device: {device}, ngpu: {args.ngpu}")

    set_deterministic_pytorch(args)
    model, train_args = load_trained_model(args.model)
    assert isinstance(model, ASRInterface)
    model.eval()

    load_inputs_and_targets = LoadInputsAndTargets(
        mode="asr",
        load_output=False,
        sort_in_input_length=False,
        preprocess_conf=train_args.preprocess_conf
        if args.preprocess_conf is None
        else args.preprocess_conf,
        preprocess_args={"train": False},
    )

    if args.rnnlm:
        lm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
        # NOTE: for a compatibility with less than 0.5.0 version models
        lm_model_module = getattr(lm_args, "model_module", "default")
        lm_class = dynamic_import_lm(lm_model_module, lm_args.backend)
        lm = lm_class(len(train_args.char_list), lm_args)
        torch_load(args.rnnlm, lm)
        lm.eval()
    else:
        lm = None

    if args.ngram_model and args.ngram_weight > 0.0:
        from espnet.nets.scorers.ngram import NgramFullScorer
        from espnet.nets.scorers.ngram import NgramPartScorer

        if args.ngram_scorer == "full":
            ngram = NgramFullScorer(args.ngram_model, train_args.char_list)
        else:
            ngram = NgramPartScorer(args.ngram_model, train_args.char_list)
    else:
        ngram = None

    # Directory that receives the weights dumped for the MMI scorers.
    # os.path.join keeps the path relative when result_label has no directory
    # part; plain concatenation would yield "/dump" at the filesystem root.
    weight_path = os.path.join(os.path.dirname(args.result_label), "dump")

    # load mmi_scorer
    if args.mmi_weight > 0.0:
        # Also make sure it is K2MMI
        assert hasattr(model.ctc, "dump_weight")
        # Dump a pth for each rank to avoid conflicts when reading / writing
        os.makedirs(weight_path, exist_ok=True)
        model.ctc.dump_weight(args.local_rank, weight_path)
        mmi = MMIFrameScorer(
            lang=model.ctc.lang,
            device=device,
            idim=train_args.adim,
            sos_id=model.sos,
            rank=args.local_rank,
            use_segment=args.use_segment,
            char_list=train_args.char_list,
            weight_path=weight_path,
        )
    else:
        mmi = None

    if args.mmi_rescore:
        os.makedirs(weight_path, exist_ok=True)
        model.ctc.dump_weight(args.local_rank, weight_path)
        # MMI rescoring and the MMI frame scorer are mutually exclusive
        assert args.mmi_weight <= 0.0
        mmi_rescorer = MMIRescorer(
            lang=model.ctc.lang,
            device=device,
            idim=train_args.adim,
            sos_id=model.sos,
            rank=args.local_rank,
            use_segment=args.use_segment,
            char_list=train_args.char_list,
            weight_path=weight_path,
        )
    else:
        mmi_rescorer = None

    if args.ctc_weight > 0.0:
        # prefer the model's `third_loss` module when it has one
        ctc_module = model.third_loss if hasattr(model, "third_loss") else model.ctc
        ctc = CTCPrefixScorer(ctc_module, model.eos)
    else:
        ctc = None

    if args.word_ngram_weight > 0.0:
        print(f"Using word ngram model: {args.word_ngram}", flush=True)
        word_ngram_scorer = WordNgramPartialScorer(
            args.word_ngram,
            device,
            train_args.char_list,
            log_semiring=args.word_ngram_log_semiring,
        )
    else:
        word_ngram_scorer = None

    scorers = model.scorers()
    scorers["ctc"] = ctc
    scorers["mmi"] = mmi
    scorers["lm"] = lm
    scorers["ngram"] = ngram
    scorers["length_bonus"] = LengthBonus(len(train_args.char_list))
    scorers["word_ngram"] = word_ngram_scorer
    weights = dict(
        decoder=1.0 - args.ctc_weight,
        ctc=args.ctc_weight,
        lm=args.lm_weight,
        ngram=args.ngram_weight,
        length_bonus=args.penalty,
        mmi=args.mmi_weight,
        word_ngram=args.word_ngram_weight,
    )
    beam_search = BeamSearch(
        beam_size=args.beam_size,
        vocab_size=len(train_args.char_list),
        weights=weights,
        scorers=scorers,
        sos=model.sos,
        eos=model.eos,
        token_list=train_args.char_list,
        pre_beam_score_key=None if args.ctc_weight == 1.0 else "full",
        mmi_rescorer=mmi_rescorer,
    )
    # TODO(karita): make all scorers batchfied
    if args.batchsize == 1:
        non_batch = [
            k
            for k, v in beam_search.full_scorers.items()
            if not isinstance(v, BatchScorerInterface)
        ]
        if len(non_batch) == 0:
            beam_search.__class__ = BatchBeamSearch
            logging.info("BatchBeamSearch implementation is selected.")
        else:
            logging.warning(
                f"As non-batch scorers {non_batch} are found, "
                f"fall back to non-batch implementation."
            )

    dtype = getattr(torch, args.dtype)
    logging.info(f"Decoding device={device}, dtype={dtype}")
    model.to(device=device, dtype=dtype).eval()
    # beam_search.to(device=device, dtype=dtype).eval()

    # read json data
    with open(args.recog_json, "rb") as f:
        js = json.load(f)["utts"]
    new_js = {}
    num_utts = len(js)
    rtf_calculator = RTF_calculator(js)
    rtf_calculator.tik()
    with torch.no_grad():
        for idx, name in enumerate(js.keys(), 1):
            # pass the name as an argument: an utterance id containing "%"
            # must not be interpreted as part of the format string
            logging.info("(%d/%d) decoding %s", idx, num_utts, name)
            batch = [(name, js[name])]
            feat = load_inputs_and_targets(batch)[0][0]
            enc = model.encode(torch.as_tensor(feat).to(device=device, dtype=dtype))
            nbest_hyps = beam_search(
                x=enc, maxlenratio=args.maxlenratio, minlenratio=args.minlenratio
            )
            # slicing already clamps to the number of available hypotheses
            nbest_hyps = [h.asdict() for h in nbest_hyps[: args.nbest]]
            new_js[name] = add_results_to_json(
                js[name], nbest_hyps, train_args.char_list
            )

    rtf_calculator.tok()

    with open(args.result_label, "wb") as f:
        f.write(
            json.dumps(
                {"utts": new_js}, indent=4, ensure_ascii=False, sort_keys=True
            ).encode("utf_8")
        )


================================================
FILE: bin/__init__.py
================================================
"""Initialize sub package."""


================================================
FILE: bin/asr_align.py
================================================
#!/usr/bin/env python3
# encoding: utf-8

# Copyright 2020 Johns Hopkins University (Xuankai Chang)
#           2020, Technische Universität München;  Dominik Winkelbauer, Ludwig Kürzinger
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""
This program performs CTC segmentation to align utterances within audio files.

Inputs:
    `--data-json`:
        A json containing list of utterances and audio files
    `--model`:
        An already trained ASR model

Output:
    `--output`:
        A plain `segments` file with utterance positions in the audio files.

Selected parameters:
    `--min-window-size`:
        Minimum window size considered for a single utterance. The current default value
        should be OK in most cases. Larger values might give better results; too large
        values cause IndexErrors.
    `--subsampling-factor`:
        If the encoder sub-samples its input, the number of frames at the CTC layer is
        reduced by this factor.
    `--frame-duration`:
        This is the non-overlapping duration of a single frame in milliseconds (the
        inverse of frames per millisecond).
    `--set-blank`:
        In the rare case that the blank token does not have index 0 in the character
        dictionary, this parameter sets the index of the blank token.
    `--gratis-blank`:
        Sets the transition cost for blank tokens to zero. Useful if there are longer
        unrelated segments between segments.
    `--replace-spaces-with-blanks`:
        Spaces are replaced with blanks. Helps to model pauses between words. May
        increase length of ground truth. May lead to misaligned segments when combined
        with the option `--gratis-blank`.
"""

import configargparse
import logging
import os
import sys

# imports for inference
from espnet.asr.pytorch_backend.asr_init import load_trained_model
from espnet.nets.asr_interface import ASRInterface
from espnet.utils.io_utils import LoadInputsAndTargets
import json
import torch

# imports for CTC segmentation
from ctc_segmentation import ctc_segmentation
from ctc_segmentation import CtcSegmentationParameters
from ctc_segmentation import determine_utterance_segments
from ctc_segmentation import prepare_text


# NOTE: you need this func to generate our sphinx doc
def get_parser():
    """Get default arguments.

    Returns:
        configargparse.ArgumentParser: Parser for the CTC-segmentation alignment
            script, accepting a YAML config file through ``--config``.

    """
    parser = configargparse.ArgumentParser(
        description="Align text to audio using CTC segmentation,"
        " using a pre-trained speech recognition model.",
        config_file_parser_class=configargparse.YAMLConfigFileParser,
        formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
    )
    # general configuration
    parser.add("--config", is_config_file=True, help="Decoding config file path.")
    parser.add_argument(
        "--ngpu", type=int, default=0, help="Number of GPUs (max. 1 is supported)"
    )
    parser.add_argument(
        "--dtype",
        choices=("float16", "float32", "float64"),
        default="float32",
        help="Float precision (only available in --api v2)",
    )
    parser.add_argument(
        "--backend",
        type=str,
        default="pytorch",
        choices=["pytorch"],
        help="Backend library",
    )
    parser.add_argument("--debugmode", type=int, default=1, help="Debugmode")
    parser.add_argument("--verbose", "-V", type=int, default=1, help="Verbose option")
    parser.add_argument(
        "--preprocess-conf",
        type=str,
        default=None,
        help="The configuration file for the pre-processing",
    )
    # task related
    parser.add_argument(
        "--data-json", type=str, help="Json of recognition data for audio and text"
    )
    parser.add_argument("--utt-text", type=str, help="Text separated into utterances")
    # model (parameter) related
    parser.add_argument(
        "--model", type=str, required=True, help="Model file parameters to read"
    )
    parser.add_argument(
        "--model-conf", type=str, default=None, help="Model config file"
    )
    parser.add_argument(
        "--num-encs", default=1, type=int, help="Number of encoders in the model."
    )
    # ctc-segmentation related
    # every option below is an integer that defaults to None
    ctc_segmentation_options = (
        (
            "--subsampling-factor",
            "Subsampling factor."
            " If the encoder sub-samples its input, the number of frames at the CTC layer"
            " is reduced by this factor. For example, a BLSTMP with subsampling 1_2_2_1_1"
            " has a subsampling factor of 4.",
        ),
        (
            "--frame-duration",
            "Non-overlapping duration of a single frame in milliseconds.",
        ),
        (
            "--min-window-size",
            "Minimum window size considered for utterance.",
        ),
        (
            "--max-window-size",
            "Maximum window size considered for utterance.",
        ),
        (
            "--use-dict-blank",
            "DEPRECATED.",
        ),
        (
            "--set-blank",
            "Index of model dictionary for blank token (default: 0).",
        ),
        (
            "--gratis-blank",
            "Set the transition cost of the blank token to zero. Audio sections"
            " labeled with blank tokens can then be skipped without penalty. Useful"
            " if there are unrelated audio segments between utterances.",
        ),
        (
            "--replace-spaces-with-blanks",
            "Fill blanks in between words to better model pauses between words."
            " Segments can be misaligned if this option is combined with --gratis-blank."
            " May increase length of ground truth.",
        ),
        (
            "--scoring-length",
            "Changes partitioning length L for calculation of the confidence score.",
        ),
    )
    for flag, help_text in ctc_segmentation_options:
        parser.add_argument(flag, type=int, default=None, help=help_text)
    parser.add_argument(
        "--output",
        type=configargparse.FileType("w"),
        required=True,
        help="Output segments file",
    )
    return parser


def main(args):
    """Parse the command line, configure logging and start the CTC alignment.

    :param args: list of command-line argument strings (program name excluded)
    """
    # Unknown options are tolerated; only the recognised namespace is used.
    args, _ = get_parser().parse_known_args(args)

    # Verbosity 1 -> INFO, 2 -> DEBUG, any other value -> WARN
    log_format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    verbosity_levels = {1: logging.INFO, 2: logging.DEBUG}
    logging.basicConfig(
        level=verbosity_levels.get(args.verbose, logging.WARN),
        format=log_format,
    )
    if args.verbose not in verbosity_levels:
        logging.warning("Skip DEBUG/INFO messages")

    if args.ngpu == 0 and args.dtype == "float16":
        raise ValueError(f"--dtype {args.dtype} does not support the CPU backend.")

    # Select the inference device; more than one GPU is rejected.
    if args.ngpu > 1:
        logging.error("Decoding only supports ngpu=1.")
        sys.exit(1)
    device = "cpu"
    if args.ngpu == 1:
        device = "cuda"
        if "CUDA_VISIBLE_DEVICES" not in os.environ:
            logging.warning("CUDA_VISIBLE_DEVICES is not set.")

    # display PYTHONPATH
    python_path = os.environ.get("PYTHONPATH", "(None)")
    logging.info("python path = " + python_path)

    # Run the alignment (pytorch backend only)
    logging.info("backend = " + args.backend)
    if args.backend != "pytorch":
        raise ValueError("Only pytorch is supported.")
    ctc_align(args, device)
    sys.exit(0)


def ctc_align(args, device):
    """ESPnet-specific interface for CTC segmentation.

    Parses configuration, infers the CTC posterior probabilities,
    and then aligns start and end of utterances using CTC segmentation.
    Results are written to the output file given in the args.

    :param args: given configuration; the attributes read here are ``model``,
        ``preprocess_conf``, ``data_json``, ``utt_text``, ``output`` and the
        segmentation options ``subsampling_factor``, ``frame_duration``,
        ``min_window_size``, ``max_window_size``, ``use_dict_blank``,
        ``set_blank``, ``replace_spaces_with_blanks``, ``gratis_blank`` and
        ``scoring_length``
    :param device: for inference; one of ['cuda', 'cpu']
    :return:  0 on success
    """
    model, train_args = load_trained_model(args.model)
    assert isinstance(model, ASRInterface)
    # The preprocessing config given on the command line takes precedence over
    # the one stored with the trained model.
    load_inputs_and_targets = LoadInputsAndTargets(
        mode="asr",
        load_output=True,
        sort_in_input_length=False,
        preprocess_conf=train_args.preprocess_conf
        if args.preprocess_conf is None
        else args.preprocess_conf,
        preprocess_args={"train": False},
    )
    logging.info(f"Decoding device={device}")
    # Warn for nets with high memory consumption on long audio files
    if hasattr(model, "enc"):
        encoder_module = model.enc.__class__.__module__
    elif hasattr(model, "encoder"):
        encoder_module = model.encoder.__class__.__module__
    else:
        encoder_module = "Unknown"
    logging.info(f"Encoder module: {encoder_module}")
    logging.info(f"CTC module:     {model.ctc.__class__.__module__}")
    if "rnn" not in encoder_module:
        logging.warning("No BLSTM model detected; memory consumption may be high.")
    model.to(device=device).eval()
    # read audio and text json data
    with open(args.data_json, "rb") as f:
        js = json.load(f)["utts"]
    with open(args.utt_text, "r", encoding="utf-8") as f:
        lines = f.readlines()
    # Group the text lines per audio file. Lines are consumed strictly in order:
    # each audio name takes the consecutive lines that start with that name.
    # NOTE(review): matching is by string prefix, so this relies on the text
    # file following the key order of the json and on names not being prefixes
    # of one another -- confirm against the data preparation.
    i = 0
    text = {}
    segment_names = {}
    for name in js:
        text_per_audio = []
        segment_names_per_audio = []
        while i < len(lines) and lines[i].startswith(name):
            # "<segment name> <text>": split at the first space
            sep = lines[i].find(" ")
            text_per_audio.append(lines[i][sep + 1 :])
            segment_names_per_audio.append(lines[i][:sep])
            i += 1
        text[name] = text_per_audio
        segment_names[name] = segment_names_per_audio
    # apply configuration: only options given explicitly override the defaults
    config = CtcSegmentationParameters()
    if args.subsampling_factor is not None:
        config.subsampling_factor = args.subsampling_factor
    if args.frame_duration is not None:
        config.frame_duration_ms = args.frame_duration
    if args.min_window_size is not None:
        config.min_window_size = args.min_window_size
    if args.max_window_size is not None:
        config.max_window_size = args.max_window_size
    config.char_list = train_args.char_list
    if args.use_dict_blank is not None:
        logging.warning(
            "The option --use-dict-blank is deprecated. If needed,"
            " use --set-blank instead."
        )
    if args.set_blank is not None:
        config.blank = args.set_blank
    if args.replace_spaces_with_blanks is not None:
        config.replace_spaces_with_blanks = bool(args.replace_spaces_with_blanks)
    if args.gratis_blank:
        config.blank_transition_cost_zero = True
    if config.blank_transition_cost_zero and args.replace_spaces_with_blanks:
        logging.error(
            "Blanks are inserted between words, and also the transition cost of blank"
            " is zero. This configuration may lead to misalignments!"
        )
    if args.scoring_length is not None:
        config.score_min_mean_over_L = args.scoring_length
    logging.info(
        f"Frame timings: {config.frame_duration_ms}ms * {config.subsampling_factor}"
    )
    # Iterate over audio files to decode and align
    num_audio = len(js)
    for idx, name in enumerate(js, 1):
        # The name is passed as a logging argument (not concatenated into the
        # format string) so that a "%" inside a name cannot break formatting.
        logging.info("(%d/%d) Aligning %s", idx, num_audio, name)
        batch = [(name, js[name])]
        feat, _ = load_inputs_and_targets(batch)
        feat = feat[0]
        with torch.no_grad():
            # Encode input frames
            enc_output = model.encode(torch.as_tensor(feat).to(device)).unsqueeze(0)
            # Apply ctc layer to obtain log character probabilities
            lpz = model.ctc.log_softmax(enc_output)[0].cpu().numpy()
        # Prepare the text for aligning
        ground_truth_mat, utt_begin_indices = prepare_text(config, text[name])
        # Align using CTC segmentation
        timings, char_probs, state_list = ctc_segmentation(
            config, lpz, ground_truth_mat
        )
        logging.debug(f"state_list = {state_list}")
        # Obtain list of utterances with time intervals and confidence score
        segments = determine_utterance_segments(
            config, utt_begin_indices, char_probs, timings, text[name]
        )
        # Write to "segments" file: <segment name> <audio name> <start> <end> <score>
        for seg_idx, boundary in enumerate(segments):
            utt_segment = (
                f"{segment_names[name][seg_idx]} {name} {boundary[0]:.2f}"
                f" {boundary[1]:.2f} {boundary[2]:.9f}\n"
            )
            args.output.write(utt_segment)
    return 0


# Script entry point: pass the command-line arguments (without the program
# name) to main().
if __name__ == "__main__":
    main(sys.argv[1:])


================================================
FILE: bin/asr_enhance.py
================================================
#!/usr/bin/env python3
import configargparse
from distutils.util import strtobool
import logging
import os
import random
import sys

import numpy as np

from espnet.asr.pytorch_backend.asr import enhance


# NOTE: you need this func to generate our sphinx doc
def get_parser():
    """Build the command-line parser for the speech enhancement script.

    Options may also be supplied through up to three YAML config files
    (``--config``, ``--config2``, ``--config3``).

    :return: the configured ``configargparse.ArgumentParser``
    """
    parser = configargparse.ArgumentParser(
        description="Enhance noisy speech for speech recognition",
        config_file_parser_class=configargparse.YAMLConfigFileParser,
        formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
    )
    # general configuration
    parser.add("--config", is_config_file=True, help="config file path")
    parser.add(
        "--config2",
        is_config_file=True,
        help="second config file path that overwrites the settings in `--config`.",
    )
    parser.add(
        "--config3",
        is_config_file=True,
        help="third config file path that overwrites the settings "
        "in `--config` and `--config2`.",
    )

    parser.add_argument("--ngpu", default=0, type=int, help="Number of GPUs")
    parser.add_argument(
        "--backend",
        default="chainer",
        type=str,
        choices=["chainer", "pytorch"],
        help="Backend library",
    )
    parser.add_argument("--debugmode", default=1, type=int, help="Debugmode")
    parser.add_argument("--seed", default=1, type=int, help="Random seed")
    parser.add_argument("--verbose", "-V", default=1, type=int, help="Verbose option")
    parser.add_argument(
        "--batchsize",
        default=1,
        type=int,
        help="Batch size for beam search (0: means no batch processing)",
    )
    parser.add_argument(
        "--preprocess-conf",
        type=str,
        default=None,
        help="The configuration file for the pre-processing",
    )
    # task related
    parser.add_argument(
        "--recog-json", type=str, help="Filename of recognition data (json)"
    )
    # model (parameter) related
    parser.add_argument(
        "--model", type=str, required=True, help="Model file parameters to read"
    )
    parser.add_argument(
        "--model-conf", type=str, default=None, help="Model config file"
    )

    # Outputs configuration
    parser.add_argument(
        "--enh-wspecifier",
        type=str,
        default=None,
        help="Specify the output way for enhanced speech."
        "e.g. ark,scp:outdir,wav.scp",
    )
    parser.add_argument(
        "--enh-filetype",
        type=str,
        default="sound",
        choices=["mat", "hdf5", "sound.hdf5", "sound"],
        help="Specify the file format for enhanced speech. "
        '"mat" is the matrix format in kaldi',
    )
    parser.add_argument("--fs", type=int, default=16000, help="The sample frequency")
    parser.add_argument(
        "--keep-length",
        type=strtobool,
        default=True,
        help="Adjust the output length to match " "with the input for enhanced speech",
    )
    parser.add_argument(
        "--image-dir", type=str, default=None, help="The directory saving the images."
    )
    parser.add_argument(
        "--num-images",
        type=int,
        default=20,
        help="The number of images files to be saved. "
        "If negative, all samples are to be saved.",
    )

    # IStft
    parser.add_argument(
        "--apply-istft",
        type=strtobool,
        default=True,
        help="Apply istft to the output from the network",
    )
    parser.add_argument(
        "--istft-win-length",
        type=int,
        default=512,
        help="The window length for istft. "
        "This option is ignored "
        "if stft is found in the preprocess-conf",
    )
    # The shift is a number like the window length above, so it is parsed as
    # an int; a value given on the command line then has the same type as the
    # integer default.
    parser.add_argument(
        "--istft-n-shift",
        type=int,
        default=256,
        help="The shift length for istft. "
        "This option is ignored "
        "if stft is found in the preprocess-conf",
    )
    parser.add_argument(
        "--istft-window",
        type=str,
        default="hann",
        help="The window type for istft. "
        "This option is ignored "
        "if stft is found in the preprocess-conf",
    )
    return parser


def main(args):
    """Parse the command line, configure logging/seeds and run the enhancement.

    :param args: list of command-line argument strings (program name excluded)
    """
    args = get_parser().parse_args(args)

    # logging info: verbosity 1 -> INFO, 2 -> DEBUG, any other value -> WARN
    log_format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    verbosity_levels = {1: logging.INFO, 2: logging.DEBUG}
    logging.basicConfig(
        level=verbosity_levels.get(args.verbose, logging.WARN),
        format=log_format,
    )
    if args.verbose not in verbosity_levels:
        logging.warning("Skip DEBUG/INFO messages")

    # check CUDA_VISIBLE_DEVICES
    if args.ngpu > 0:
        visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
        if visible_devices is None:
            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
        elif len(visible_devices.split(",")) != args.ngpu:
            logging.error("#gpus is not matched with CUDA_VISIBLE_DEVICES.")
            sys.exit(1)

        # TODO(kamo): support of multiple GPUs
        if args.ngpu > 1:
            logging.error("The program only supports ngpu=1.")
            sys.exit(1)

    # display PYTHONPATH
    python_path = os.environ.get("PYTHONPATH", "(None)")
    logging.info("python path = " + python_path)

    # seed setting for both the stdlib and the numpy generators
    seed = args.seed
    random.seed(seed)
    np.random.seed(seed)
    seed_message = "set random seed = %d" % seed
    logging.info(seed_message)

    # enhancement is only implemented for the pytorch backend
    logging.info("backend = " + args.backend)
    if args.backend != "pytorch":
        raise ValueError("Only pytorch is supported.")
    enhance(args)

# Script entry point: pass the command-line arguments (without the program
# name) to main().
if __name__ == "__main__":
    main(sys.argv[1:])


================================================
FILE: bin/asr_recog.py
================================================
#!/usr/bin/env python3
# encoding: utf-8

# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""End-to-end speech recognition model decoding script."""

import configargparse
import logging
import os
import random
import sys
import tracemalloc
import numpy as np

from espnet.utils.cli_utils import strtobool

# NOTE: you need this func to generate our sphinx doc


def get_parser():
    """Get default arguments.

    Builds the command-line parser of the decoding script. Options may also be
    supplied through up to three YAML config files (``--config``,
    ``--config2``, ``--config3``).

    :return: the configured ``configargparse.ArgumentParser``
    """
    parser = configargparse.ArgumentParser(
        description="Transcribe text from speech using "
        "a speech recognition model on one CPU or GPU",
        config_file_parser_class=configargparse.YAMLConfigFileParser,
        formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
    )
    # general configuration
    parser.add("--config", is_config_file=True, help="Config file path")
    parser.add(
        "--config2",
        is_config_file=True,
        help="Second config file path that overwrites the settings in `--config`",
    )
    parser.add(
        "--config3",
        is_config_file=True,
        help="Third config file path that overwrites the settings "
        "in `--config` and `--config2`",
    )

    parser.add_argument("--ngpu", type=int, default=0, help="Number of GPUs")
    parser.add_argument(
        "--dtype",
        choices=("float16", "float32", "float64"),
        default="float32",
        help="Float precision (only available in --api v2)",
    )
    parser.add_argument(
        "--backend",
        type=str,
        default="chainer",
        choices=["chainer", "pytorch"],
        help="Backend library",
    )
    parser.add_argument("--debugmode", type=int, default=1, help="Debugmode")
    parser.add_argument("--seed", type=int, default=1, help="Random seed")
    parser.add_argument("--verbose", "-V", type=int, default=1, help="Verbose option")
    parser.add_argument(
        "--batchsize",
        type=int,
        default=1,
        help="Batch size for beam search (0: means no batch processing)",
    )
    parser.add_argument(
        "--preprocess-conf",
        type=str,
        default=None,
        help="The configuration file for the pre-processing",
    )
    parser.add_argument(
        "--api",
        default="v1",
        choices=["v1", "v2"],
        help="Beam search APIs "
        "v1: Default API. It only supports the ASRInterface.recognize method "
        "and DefaultRNNLM. "
        "v2: Experimental API. It supports any models that implements ScorerInterface.",
    )
    # task related
    parser.add_argument(
        "--recog-json", type=str, help="Filename of recognition data (json)"
    )
    parser.add_argument(
        "--result-label",
        type=str,
        required=True,
        help="Filename of result label data (json)",
    )
    # model (parameter) related
    parser.add_argument(
        "--model", type=str, required=True, help="Model file parameters to read"
    )
    parser.add_argument(
        "--model-conf", type=str, default=None, help="Model config file"
    )
    parser.add_argument(
        "--num-spkrs",
        type=int,
        default=1,
        choices=[1, 2],
        help="Number of speakers in the speech",
    )
    parser.add_argument(
        "--num-encs", default=1, type=int, help="Number of encoders in the model."
    )
    # search related
    parser.add_argument("--nbest", type=int, default=10, help="Output N-best hypotheses")
    parser.add_argument("--beam-size", type=int, default=1, help="Beam size")
    parser.add_argument("--penalty", type=float, default=0.0, help="Incertion penalty")
    parser.add_argument(
        "--maxlenratio",
        type=float,
        default=0.0,
        help="""Input length ratio to obtain max output length.
                        If maxlenratio=0.0 (default), it uses a end-detect function
                        to automatically find maximum hypothesis lengths""",
    )
    parser.add_argument(
        "--minlenratio",
        type=float,
        default=0.0,
        help="Input length ratio to obtain min output length",
    )
    parser.add_argument(
        "--ctc-weight", type=float, default=0.0, help="CTC weight in joint decoding"
    )
    parser.add_argument(
        "--weights-ctc-dec",
        type=float,
        action="append",
        help="ctc weight assigned to each encoder during decoding."
        "[in multi-encoder mode only]",
    )
    parser.add_argument(
        "--ctc-window-margin",
        type=int,
        default=0,
        help="""Use CTC window with margin parameter to accelerate
                        CTC/attention decoding especially on GPU. Smaller magin
                        makes decoding faster, but may increase search errors.
                        If margin=0 (default), this function is disabled""",
    )
    # transducer related
    parser.add_argument(
        "--search-type",
        type=str,
        default="alsd",
        choices=["default", "nsc", "tsd", "alsd", "ctc_greedy", "ctc_beam"],
        help="""Type of beam search implementation to use during inference.
        Can be either: default beam search, n-step constrained beam search ("nsc"),
        time-synchronous decoding ("tsd") or alignment-length synchronous decoding
        ("alsd").
        Additional associated parameters: "nstep" + "prefix-alpha" (for nsc),
        "max-sym-exp" (for tsd) and "u-max" (for alsd)""",
    )
    parser.add_argument(
        "--nstep",
        type=int,
        default=1,
        help="Number of expansion steps allowed in NSC beam search.",
    )
    parser.add_argument(
        "--prefix-alpha",
        type=int,
        default=2,
        help="Length prefix difference allowed in NSC beam search.",
    )
    parser.add_argument(
        "--max-sym-exp",
        type=int,
        default=2,
        help="Number of symbol expansions allowed in TSD decoding.",
    )
    parser.add_argument(
        "--u-max",
        type=int,
        default=400,
        help="Length prefix difference allowed in ALSD beam search.",
    )
    parser.add_argument(
        "--score-norm",
        type=strtobool,
        nargs="?",
        default=True,
        help="Normalize transducer scores by length",
    )
    # rnnlm related
    parser.add_argument(
        "--rnnlm", type=str, default=None, help="RNNLM model file to read"
    )
    parser.add_argument(
        "--rnnlm-conf", type=str, default=None, help="RNNLM model config file to read"
    )
    parser.add_argument(
        "--word-rnnlm", type=str, default=None, help="Word RNNLM model file to read"
    )
    parser.add_argument(
        "--word-rnnlm-conf",
        type=str,
        default=None,
        help="Word RNNLM model config file to read",
    )
    parser.add_argument("--word-dict", type=str, default=None, help="Word list to read")
    parser.add_argument("--lm-weight", type=float, default=0.1, help="RNNLM weight")
    # ngram related
    parser.add_argument(
        "--ngram-model", type=str, default=None, help="ngram model file to read"
    )
    parser.add_argument("--ngram-weight", type=float, default=0.1, help="ngram weight")
    parser.add_argument(
        "--ngram-scorer",
        type=str,
        default="part",
        choices=("full", "part"),
        help="""if the ngram is set as a part scorer, similar with CTC scorer,
                ngram scorer only scores topK hypethesis.
                if the ngram is set as full scorer, ngram scorer scores all hypthesis
                the decoding speed of part scorer is musch faster than full one""",
    )
    # streaming related
    parser.add_argument(
        "--streaming-mode",
        type=str,
        default=None,
        choices=["window", "segment"],
        help="""Use streaming recognizer for inference.
                        `--batchsize` must be set to 0 to enable this mode""",
    )
    parser.add_argument("--streaming-window", type=int, default=10, help="Window size")
    parser.add_argument(
        "--streaming-min-blank-dur",
        type=int,
        default=10,
        help="Minimum blank duration threshold",
    )
    parser.add_argument(
        "--streaming-onset-margin", type=int, default=1, help="Onset margin"
    )
    parser.add_argument(
        "--streaming-offset-margin", type=int, default=1, help="Offset margin"
    )
    # non-autoregressive related
    # Mask CTC related. See https://arxiv.org/abs/2005.08700 for the detail.
    parser.add_argument(
        "--maskctc-n-iterations",
        type=int,
        default=10,
        help="Number of decoding iterations."
        "For Mask CTC, set 0 to predict 1 mask/iter.",
    )
    parser.add_argument(
        "--maskctc-probability-threshold",
        type=float,
        default=0.999,
        help="Threshold probability for CTC output",
    )

    # MMI / k2 / word N-gram / TLG related
    # NOTE: parsed with strtobool like the other boolean options. With
    # `type=bool` every non-empty string (including "False") became True.
    parser.add_argument(
        "--k2-decode",
        type=strtobool,
        default=False,
        help="Using K2 decoding",
    )
    parser.add_argument(
        "--local-rank",
        type=int,
        default=-1,
        help="To choose GPU",
    )
    parser.add_argument(
        "--mmi-weight",
        type=float,
        default=0.0,
        help="MMI scorer weight",
    )
    parser.add_argument(
        "--mas-lookahead",
        type=int,
        default=0,
        help="Number of frames to look-ahead in MMI alignment scores",
    )
    parser.add_argument(
        "--use-segment",
        type=strtobool,
        default=False,
        help="If true, the MMI score is parsed by jieba. (Chinese only)",
    )
    parser.add_argument(
        "--mmi-rescore",
        type=strtobool,
        default=False,
        help="Do mmi rescoring after decoding, only for lasctc framework",
    )
    parser.add_argument(
        "--word-ngram",
        type=str,
        default="",
        help="Path to word-level N-gram model lang directory",
    )
    parser.add_argument(
        "--word-ngram-weight",
        type=float,
        default=0.0,
        help="weight of the N-gram model",
    )
    parser.add_argument(
        "--word-ngram-log-semiring",
        type=strtobool,
        default=True,
        help="If true, score the lattice with log-semiring, else tropical semiring",
    )
    parser.add_argument(
        "--word-ngram-lower-char",
        type=strtobool,
        default=True,
        help="If true, all english characters will be converted into lower case. otherwise upper case",
    )
    parser.add_argument(
        "--tlg-scorer",
        type=str,
        default="",
        help="lang directory of lang that save the LG.fst. Only useful for RNNT ALSD decoding",
    )
    parser.add_argument(
        "--tlg-nonblk-reward",
        type=float,
        default=1.5,
        help="Reward whenaver a non-blank token is generated. Used in TLG scorer",
    )
    parser.add_argument(
        "--tlg-weight",
        type=float,
        default=0.0,
        help="weight for TLG scorer in decoding",
    )
    # code-switching / English handling related
    parser.add_argument(
        "--skip-eng",
        type=strtobool,
        default=False,
        help="If true, skip the utterance whose transcription has english alphabet (rnnt only)",
    )
    parser.add_argument(
        "--forbid-eng",
        type=strtobool,
        default=False,
        help="If true, forbid the rnnt model to predict English characters (rnnt only)",
    )
    parser.add_argument(
        "--cs-nt-decode-feature",
        type=str,
        default="combine",
        choices=["combine", "chn", "eng"],
        help="feature used for decoding",
    )
    # The default is a real float rather than the string "0.0"; the parsed
    # value is the same, but it no longer relies on argparse converting it.
    parser.add_argument(
        "--cs-lang-weight",
        type=float,
        default=0.0,
        help="weight of language classification loss",
    )
    parser.add_argument(
        "--eng-vocab",
        type=str,
        default=None,
        help="if apply, the hypothesis is valid only if all english words are in this vocab",
    )
    return parser


def main(args):
    """Parse the command line and dispatch to the matching decoding routine.

    :param args: list of command-line argument strings (program name excluded)
    """
    args = get_parser().parse_args(args)

    if args.ngpu == 0 and args.dtype == "float16":
        raise ValueError(f"--dtype {args.dtype} does not support the CPU backend.")

    # logging info: verbosity 1 -> INFO, 2 -> DEBUG, any other value -> WARN
    log_format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    verbosity_levels = {1: logging.INFO, 2: logging.DEBUG}
    logging.basicConfig(
        level=verbosity_levels.get(args.verbose, logging.WARN),
        format=log_format,
    )
    if args.verbose not in verbosity_levels:
        logging.warning("Skip DEBUG/INFO messages")

    # check CUDA_VISIBLE_DEVICES
    if args.ngpu > 0:
        visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
        if visible_devices is None:
            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
        elif len(visible_devices.split(",")) != args.ngpu:
            logging.error("#gpus is not matched with CUDA_VISIBLE_DEVICES.")
            sys.exit(1)

        # TODO(mn5k): support of multiple GPUs
        if args.ngpu > 1:
            logging.error("The program only supports ngpu=1.")
            sys.exit(1)

    # display PYTHONPATH
    python_path = os.environ.get("PYTHONPATH", "(None)")
    logging.info("python path = " + python_path)

    # seed setting for both the stdlib and the numpy generators
    seed = args.seed
    random.seed(seed)
    np.random.seed(seed)
    seed_message = "set random seed = %d" % seed
    logging.info(seed_message)

    # validate rnn options: the two RNNLM flavours are mutually exclusive
    both_lms_given = args.rnnlm is not None and args.word_rnnlm is not None
    if both_lms_given:
        logging.error(
            "It seems that both --rnnlm and --word-rnnlm are specified. "
            "Please use either option."
        )
        sys.exit(1)

    # recog
    logging.info("backend = " + args.backend)

    if args.num_spkrs == 2:
        # Two-speaker decoding exists for the pytorch backend only.
        if args.backend != "pytorch":
            raise ValueError("Only pytorch is supported.")
        from espnet.asr.pytorch_backend.asr_mix import recog

        recog(args)
    elif args.num_spkrs == 1:
        if args.backend == "chainer":
            from espnet.asr.chainer_backend.asr import recog

            recog(args)
        elif args.backend == "pytorch":
            if args.api == "v2":
                # Experimental API that supports custom LMs (single encoder only)
                if args.num_encs != 1:
                    raise NotImplementedError(
                        f"--num-encs {args.num_encs} > 1 is not supported in --api v2"
                    )
                from espnet.asr.pytorch_backend.recog import recog_v2

                recog_v2(args)
            else:
                from espnet.asr.pytorch_backend.asr import recog

                # The dtype restriction is only checked for single-encoder models.
                if args.num_encs == 1 and args.dtype != "float32":
                    raise NotImplementedError(
                        f"`--dtype {args.dtype}` is only available with `--api v2`"
                    )
                recog(args)
        else:
            raise ValueError("Only chainer and pytorch are supported.")


# Script entry point: pass the command-line arguments (without the program
# name) to main().
if __name__ == "__main__":
    # NOTE: the commented-out tracemalloc lines below are a disabled debugging
    # aid that would print the peak traced memory (in MB) after decoding.
    # tracemalloc.start(10000)
    main(sys.argv[1:])
    # size, peak = tracemalloc.get_traced_memory()
    # peak /= (1024 ** 2)
    # print(f"Maximum Memory consumed: {peak}MB")


================================================
FILE: bin/asr_train.py
================================================
#!/usr/bin/env python3
# encoding: utf-8

# Copyright 2017 Tomoki Hayashi (Nagoya University)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Automatic speech recognition model training script."""

import logging
import os
import random
import subprocess
import sys

from distutils.version import LooseVersion

import configargparse
import numpy as np
import torch

from espnet import __version__
from espnet.utils.cli_utils import strtobool
from espnet.utils.training.batchfy import BATCH_COUNT_CHOICES

# True when the installed torch version compares as at least "1.2".
is_torch_1_2_plus = LooseVersion(torch.__version__) >= LooseVersion("1.2")


# NOTE: you need this func to generate our sphinx doc
def get_parser(parser=None, required=True):
    """Build the argument parser for ASR training.

    Args:
        parser: Existing parser to register the options on. When ``None`` a new
            ``configargparse.ArgumentParser`` (YAML config files, defaults shown
            in ``--help``) is created.
        required: Value passed as ``required=`` for ``--outdir`` and ``--dict``.

    Returns:
        The parser with all ASR training options registered.

    """

    def _split_module_list(value):
        """Split a comma-separated module list, dropping empty entries."""
        # Filter on each entry (not on the whole string) so that "a,,b" or a
        # trailing comma never produces an empty module prefix.
        return [mod for mod in value.split(",") if mod != ""]

    # Encoder architectures accepted by both mask estimators (WPE, beamformer).
    mask_estimator_types = [
        "lstm",
        "blstm",
        "lstmp",
        "blstmp",
        "vgglstmp",
        "vggblstmp",
        "vgglstm",
        "vggblstm",
        "gru",
        "bgru",
        "grup",
        "bgrup",
        "vgggrup",
        "vggbgrup",
        "vgggru",
        "vggbgru",
    ]

    if parser is None:
        parser = configargparse.ArgumentParser(
            description="Train an automatic speech recognition (ASR) model on one CPU, "
            "one or multiple GPUs",
            config_file_parser_class=configargparse.YAMLConfigFileParser,
            formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
        )
    # general configuration
    parser.add("--config", is_config_file=True, help="config file path")
    parser.add(
        "--config2",
        is_config_file=True,
        help="second config file path that overwrites the settings in `--config`.",
    )
    parser.add(
        "--config3",
        is_config_file=True,
        help="third config file path that overwrites the settings in "
        "`--config` and `--config2`.",
    )

    parser.add_argument(
        "--ngpu",
        default=None,
        type=int,
        help="Number of GPUs. If not given, use all visible devices",
    )
    parser.add_argument(
        "--train-dtype",
        default="float32",
        choices=["float16", "float32", "float64", "O0", "O1", "O2", "O3"],
        help="Data type for training (only pytorch backend). "
        "O0,O1,.. flags require apex. "
        "See https://nvidia.github.io/apex/amp.html#opt-levels",
    )
    parser.add_argument(
        "--backend",
        default="chainer",
        type=str,
        choices=["chainer", "pytorch"],
        help="Backend library",
    )
    parser.add_argument(
        "--outdir", type=str, required=required, help="Output directory"
    )
    parser.add_argument("--debugmode", default=1, type=int, help="Debugmode")
    parser.add_argument("--dict", required=required, help="Dictionary")
    parser.add_argument("--seed", default=1, type=int, help="Random seed")
    parser.add_argument("--debugdir", type=str, help="Output directory for debugging")
    parser.add_argument(
        "--resume",
        "-r",
        default="",
        nargs="?",
        help="Resume the training from snapshot",
    )
    parser.add_argument(
        "--minibatches",
        "-N",
        type=int,
        default=-1,
        help="Process only N minibatches (for debug)",
    )
    parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
    parser.add_argument(
        "--tensorboard-dir",
        default=None,
        type=str,
        nargs="?",
        help="Tensorboard log dir path",
    )
    parser.add_argument(
        "--report-interval-iters",
        default=300,
        type=int,
        help="Report interval iterations",
    )
    parser.add_argument(
        "--save-interval-iters",
        default=0,
        type=int,
        help="Save snapshot interval iterations",
    )
    # task related
    parser.add_argument(
        "--train-json",
        type=str,
        default=None,
        help="Filename of train label data (json)",
    )
    parser.add_argument(
        "--valid-json",
        type=str,
        default=None,
        help="Filename of validation label data (json)",
    )
    # network architecture
    parser.add_argument(
        "--model-module",
        type=str,
        default=None,
        help="model defined module (default: espnet.nets.xxx_backend.e2e_asr:E2E)",
    )
    # encoder
    parser.add_argument(
        "--num-encs", default=1, type=int, help="Number of encoders in the model."
    )
    # loss related
    parser.add_argument(
        "--ctc_type",
        default="warpctc",
        type=str,
        choices=["builtin", "warpctc", "gtnctc", "cudnnctc", "k2mmi", "k2ctc"],
        help="Type of CTC implementation to calculate loss.",
    )
    parser.add_argument(
        "--mtlalpha",
        default=0.5,
        type=float,
        help="Multitask learning coefficient, "
        "alpha: alpha*ctc_loss + (1-alpha)*att_loss ",
    )
    parser.add_argument(
        "--lsm-weight", default=0.0, type=float, help="Label smoothing weight"
    )
    # recognition options to compute CER/WER
    parser.add_argument(
        "--report-cer",
        default=False,
        action="store_true",
        help="Compute CER on development set",
    )
    parser.add_argument(
        "--report-wer",
        default=False,
        action="store_true",
        help="Compute WER on development set",
    )
    parser.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
    parser.add_argument("--beam-size", type=int, default=4, help="Beam size")
    parser.add_argument("--penalty", default=0.0, type=float, help="Incertion penalty")
    parser.add_argument(
        "--maxlenratio",
        default=0.0,
        type=float,
        help="""Input length ratio to obtain max output length.
                        If maxlenratio=0.0 (default), it uses a end-detect function
                        to automatically find maximum hypothesis lengths""",
    )
    parser.add_argument(
        "--minlenratio",
        default=0.0,
        type=float,
        help="Input length ratio to obtain min output length",
    )
    parser.add_argument(
        "--ctc-weight", default=0.3, type=float, help="CTC weight in joint decoding"
    )
    parser.add_argument(
        "--rnnlm", type=str, default=None, help="RNNLM model file to read"
    )
    parser.add_argument(
        "--rnnlm-conf", type=str, default=None, help="RNNLM model config file to read"
    )
    parser.add_argument("--lm-weight", default=0.1, type=float, help="RNNLM weight.")
    parser.add_argument("--sym-space", default="<space>", type=str, help="Space symbol")
    parser.add_argument("--sym-blank", default="<blank>", type=str, help="Blank symbol")
    # minibatch related
    parser.add_argument(
        "--sortagrad",
        default=0,
        type=int,
        nargs="?",
        help="How many epochs to use sortagrad for. 0 = deactivated, -1 = all epochs",
    )
    parser.add_argument(
        "--batch-count",
        default="auto",
        choices=BATCH_COUNT_CHOICES,
        help="How to count batch_size. "
        "The default (auto) will find how to count by args.",
    )
    parser.add_argument(
        "--batch-size",
        "--batch-seqs",
        "-b",
        default=0,
        type=int,
        help="Maximum seqs in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--batch-bins",
        default=0,
        type=int,
        help="Maximum bins in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--batch-frames-in",
        default=0,
        type=int,
        help="Maximum input frames in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--batch-frames-out",
        default=0,
        type=int,
        help="Maximum output frames in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--batch-frames-inout",
        default=0,
        type=int,
        help="Maximum input+output frames in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--maxlen-in",
        "--batch-seq-maxlen-in",
        default=800,
        type=int,
        metavar="ML",
        help="When --batch-count=seq, "
        "batch size is reduced if the input sequence length > ML.",
    )
    parser.add_argument(
        "--maxlen-out",
        "--batch-seq-maxlen-out",
        default=150,
        type=int,
        metavar="ML",
        help="When --batch-count=seq, "
        "batch size is reduced if the output sequence length > ML",
    )
    parser.add_argument(
        "--n-iter-processes",
        default=0,
        type=int,
        help="Number of processes of iterator",
    )
    parser.add_argument(
        "--preprocess-conf",
        type=str,
        default=None,
        nargs="?",
        help="The configuration file for the pre-processing",
    )
    # optimization related
    parser.add_argument(
        "--opt",
        default="noam_sgd",
        type=str,
        choices=["adadelta", "adam", "noam", "noam_sgd"],
        help="Optimizer",
    )
    parser.add_argument(
        "--accum-grad", default=1, type=int, help="Number of gradient accumuration"
    )
    parser.add_argument(
        "--eps", default=1e-8, type=float, help="Epsilon constant for optimizer"
    )
    parser.add_argument(
        "--eps-decay", default=0.01, type=float, help="Decaying ratio of epsilon"
    )
    parser.add_argument(
        "--weight-decay", default=0.0, type=float, help="Weight decay ratio"
    )
    parser.add_argument(
        "--criterion",
        default="acc",
        type=str,
        choices=["loss", "loss_eps_decay_only", "acc"],
        help="Criterion to perform epsilon decay",
    )
    parser.add_argument(
        "--threshold", default=1e-4, type=float, help="Threshold to stop iteration"
    )
    parser.add_argument(
        "--epochs", "-e", default=30, type=int, help="Maximum number of epochs"
    )
    parser.add_argument(
        "--early-stop-criterion",
        default="validation/main/acc",
        type=str,
        nargs="?",
        help="Value to monitor to trigger an early stopping of the training",
    )
    parser.add_argument(
        "--patience",
        default=3,
        type=int,
        nargs="?",
        help="Number of epochs to wait without improvement "
        "before stopping the training",
    )
    parser.add_argument(
        "--grad-clip", default=5, type=float, help="Gradient norm threshold to clip"
    )
    parser.add_argument(
        "--num-save-attention",
        default=0,
        type=int,
        help="Number of samples of attention to be saved",
    )
    parser.add_argument(
        "--num-save-ctc",
        default=0,
        type=int,
        help="Number of samples of CTC probability to be saved",
    )
    parser.add_argument(
        "--grad-noise",
        type=strtobool,
        default=False,
        help="The flag to switch to use noise injection to gradients during training",
    )
    # asr_mix related
    parser.add_argument(
        "--num-spkrs",
        default=1,
        type=int,
        choices=[1, 2],
        help="Number of speakers in the speech.",
    )
    # decoder related
    parser.add_argument(
        "--context-residual",
        default=False,
        type=strtobool,
        nargs="?",
        help="The flag to switch to use context vector residual in the decoder network",
    )
    # finetuning related
    parser.add_argument(
        "--enc-init",
        default=None,
        type=str,
        help="Pre-trained ASR model to initialize encoder.",
    )
    # NOTE: argparse runs `type` on string defaults, so the defaults below end
    # up as lists as well.
    parser.add_argument(
        "--enc-init-mods",
        default="enc.enc.",
        type=_split_module_list,
        help="List of encoder modules to initialize, separated by a comma.",
    )
    parser.add_argument(
        "--dec-init",
        default=None,
        type=str,
        help="Pre-trained ASR, MT or LM model to initialize decoder.",
    )
    parser.add_argument(
        "--dec-init-mods",
        default="att.,dec.",
        type=_split_module_list,
        help="List of decoder modules to initialize, separated by a comma.",
    )
    parser.add_argument(
        "--freeze-mods",
        default=None,
        type=_split_module_list,
        help="List of modules to freeze, separated by a comma.",
    )
    # front end related
    parser.add_argument(
        "--use-frontend",
        type=strtobool,
        default=False,
        help="The flag to switch to use frontend system.",
    )

    # WPE related
    parser.add_argument(
        "--use-wpe",
        type=strtobool,
        default=False,
        help="Apply Weighted Prediction Error",
    )
    parser.add_argument(
        "--wtype",
        default="blstmp",
        type=str,
        choices=mask_estimator_types,
        help="Type of encoder network architecture "
        "of the mask estimator for WPE. "
        "",
    )
    parser.add_argument("--wlayers", type=int, default=2, help="")
    parser.add_argument("--wunits", type=int, default=300, help="")
    parser.add_argument("--wprojs", type=int, default=300, help="")
    parser.add_argument("--wdropout-rate", type=float, default=0.0, help="")
    parser.add_argument("--wpe-taps", type=int, default=5, help="")
    parser.add_argument("--wpe-delay", type=int, default=3, help="")
    parser.add_argument(
        "--use-dnn-mask-for-wpe",
        type=strtobool,
        default=False,
        help="Use DNN to estimate the power spectrogram. "
        "This option is experimental.",
    )
    # Beamformer related
    parser.add_argument("--use-beamformer", type=strtobool, default=True, help="")
    parser.add_argument(
        "--btype",
        default="blstmp",
        type=str,
        choices=mask_estimator_types,
        help="Type of encoder network architecture "
        "of the mask estimator for Beamformer.",
    )
    parser.add_argument("--blayers", type=int, default=2, help="")
    parser.add_argument("--bunits", type=int, default=300, help="")
    parser.add_argument("--bprojs", type=int, default=300, help="")
    parser.add_argument("--badim", type=int, default=320, help="")
    parser.add_argument(
        "--bnmask",
        type=int,
        default=2,
        help="Number of beamforming masks, " "default is 2 for [speech, noise].",
    )
    parser.add_argument(
        "--ref-channel",
        type=int,
        default=-1,
        help="The reference channel used for beamformer. "
        "By default, the channel is estimated by DNN.",
    )
    parser.add_argument("--bdropout-rate", type=float, default=0.0, help="")
    # Feature transform: Normalization
    parser.add_argument(
        "--stats-file",
        type=str,
        default=None,
        help="The stats file for the feature normalization",
    )
    parser.add_argument(
        "--apply-uttmvn",
        type=strtobool,
        default=True,
        help="Apply utterance level mean " "variance normalization.",
    )
    parser.add_argument("--uttmvn-norm-means", type=strtobool, default=True, help="")
    parser.add_argument("--uttmvn-norm-vars", type=strtobool, default=False, help="")
    # Feature transform: Fbank
    parser.add_argument(
        "--fbank-fs",
        type=int,
        default=16000,
        help="The sample frequency used for " "the mel-fbank creation.",
    )
    parser.add_argument(
        "--n-mels", type=int, default=80, help="The number of mel-frequency bins."
    )
    parser.add_argument("--fbank-fmin", type=float, default=0.0, help="")
    parser.add_argument("--fbank-fmax", type=float, default=None, help="")

    # K2
    parser.add_argument("--lang", type=str,
                        help="k2 lang dir")
    parser.add_argument("--den-scale", type=float, default=1.0,
                        help="denumerator scale: loss = num + den_scale * den")
    parser.add_argument("--third-weight", type=float, default=0.0,
                        help="we still need ctc loss if encoder is supervised by MMI. This is ctc_weight")
    parser.add_argument("--use-segment", type=strtobool, default=False,
                        help="If true, MMI supervision is from text_org. If false, it is from ys_pad")

    # DDP
    parser.add_argument("--master-node", type=int, default=0,
                        help="master node rank")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local GPU rank")
    parser.add_argument("--world-size", type=int, default=-1,
                        help="BMUF world size")
    parser.add_argument("--node-rank", type=int, default=-1,
                        help="DDP node rank")
    parser.add_argument("--node-size", type=int, default=8,
                        help="number of GPU on each node")

    # MBR
    parser.add_argument("--load-trainer-and-opt", type=strtobool, default=True,
                        help="If false, only the model weight would be loaded in snapshot")

    # Block loading
    parser.add_argument("--block-load", type=strtobool, default=False,
                        help="block loading for training. make sure all batches are in the same ark")
    parser.add_argument("--utts-per-ark", type=int, default=256,
                        help="number of utterance in each ark")
    # Due to the slow ceph storage, data cannot be loaded in a completely
    # random fashion, so the randomness is implemented hierarchically.
    # (1) Each minibatch must come from a single ark file. Also make sure the
    #     utterances in the json file are sorted from shortest to longest.
    #     Do this before training starts.
    # (2) The whole dataset is divided into many groups; each group contains
    #     "block-buffer-size" arks. Randomness is applied both within and
    #     across groups. The larger the "block-buffer-size", the better the
    #     randomness, but the more memory is consumed.
    # (3) At the beginning of each epoch training may stall, since nearly
    #     every update needs to load a new ark. This does not last long: it
    #     becomes smooth once a group of data is completely loaded.
    # (4) Once a minibatch is consumed, it is deleted from memory to avoid OOM.
    # (5) With this loading strategy only one worker process can load the
    #     data, to avoid conflicts in the memory buffer. This is fine since
    #     each ark contains many utterances and one worker is more than enough.
    # (6) A buffer that is too large (e.g. size > 100) will make the GPU slow,
    #     since virtual memory (actually the disk) is used as the buffer.
    parser.add_argument("--block-buffer-size", type=int, default=80,
                        help="number of arks in buffer. At most 3*block_buffer_size arks would be stored in memory")
    return parser


def _count_available_gpus():
    """Guess how many GPUs to use when ``--ngpu`` is not given.

    Returns:
        The number of non-empty entries in ``CUDA_VISIBLE_DEVICES`` when that
        variable is set; otherwise the number of devices listed by
        ``nvidia-smi -L``, or 0 when ``nvidia-smi`` is missing or fails.

    """
    cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
    if cvd is not None:
        # An empty value hides every device, so empty entries are not counted.
        return len([dev for dev in cvd.split(",") if dev.strip()])

    logging.warning("CUDA_VISIBLE_DEVICES is not set.")
    try:
        p = subprocess.run(
            ["nvidia-smi", "-L"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
    except OSError:
        # nvidia-smi is not installed or cannot be executed.
        return 0
    if p.returncode != 0:
        # subprocess.run() does not raise on failure without check=True, so
        # the exit status has to be inspected explicitly.
        return 0
    # `nvidia-smi -L` prints one line per device on stdout (not stderr).
    return len([line for line in p.stdout.decode().splitlines() if line.strip()])


def main(cmd_args):
    """Run the main training function.

    Args:
        cmd_args: Command-line arguments without the program name.

    Raises:
        NotImplementedError: The chainer backend is combined with a
            ``--train-dtype`` other than ``float32``.
        ValueError: A half-precision/apex ``--train-dtype`` is requested with
            ``--ngpu 0``, or the backend is not supported for the requested
            number of speakers.

    """
    parser = get_parser()
    # First pass: only the generic options are known; model options come later.
    args, _ = parser.parse_known_args(cmd_args)
    if args.backend == "chainer" and args.train_dtype != "float32":
        raise NotImplementedError(
            f"chainer backend does not support --train-dtype {args.train_dtype}. "
            "Use --train-dtype float32."
        )
    if args.ngpu == 0 and args.train_dtype in ("O0", "O1", "O2", "O3", "float16"):
        raise ValueError(
            f"--train-dtype {args.train_dtype} does not support the CPU backend."
        )

    from espnet.utils.dynamic_import import dynamic_import

    if args.model_module is None:
        if args.num_spkrs == 1:
            model_module = "espnet.nets." + args.backend + "_backend.e2e_asr:E2E"
        else:
            model_module = "espnet.nets." + args.backend + "_backend.e2e_asr_mix:E2E"
    else:
        model_module = args.model_module
    model_class = dynamic_import(model_module)
    # Let the model register its own options, then parse strictly.
    model_class.add_arguments(parser)

    args = parser.parse_args(cmd_args)
    args.model_module = model_module
    # The module path, when it names a backend, overrides --backend.
    if "chainer_backend" in args.model_module:
        args.backend = "chainer"
    if "pytorch_backend" in args.model_module:
        args.backend = "pytorch"

    # add version info in args
    args.version = __version__

    # logging info
    if args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skip DEBUG/INFO messages")

    # If --ngpu is not given,
    #   1. if CUDA_VISIBLE_DEVICES is set, all visible devices
    #   2. if nvidia-smi exists, use all devices
    #   3. else ngpu=0
    if args.ngpu is None:
        ngpu = _count_available_gpus()
        # Store the detected value so the training function sees an integer
        # (same as the language-model training script does).
        args.ngpu = ngpu
    else:
        if is_torch_1_2_plus and args.ngpu != 1:
            logging.debug(
                "There are some bugs with multi-GPU processing in PyTorch 1.2+"
                + " (see https://github.com/pytorch/pytorch/issues/21108)"
            )
        ngpu = args.ngpu
    logging.info(f"ngpu: {ngpu}")

    # display PYTHONPATH
    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))

    # set random seed
    logging.info("random seed = %d" % args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)

    # load dictionary for debug log
    if args.dict is not None:
        with open(args.dict, "rb") as f:
            dictionary = f.readlines()
        # The first space-separated field of each line is the symbol.
        char_list = [entry.decode("utf-8").split(" ")[0] for entry in dictionary]
        char_list.insert(0, "<blank>")
        char_list.append("<eos>")
        # for non-autoregressive maskctc model
        if "maskctc" in args.model_module:
            char_list.append("<mask>")
        args.char_list = char_list
    else:
        args.char_list = None

    # train
    logging.info("backend = " + args.backend)

    if args.num_spkrs == 1:
        if args.backend == "chainer":
            from espnet.asr.chainer_backend.asr import train

            train(args)
        elif args.backend == "pytorch":
            from espnet.asr.pytorch_backend.asr import train

            train(args)
        else:
            raise ValueError("Only chainer and pytorch are supported.")
    else:
        # FIXME(kamo): Support --model-module
        if args.backend == "pytorch":
            from espnet.asr.pytorch_backend.asr_mix import train

            train(args)
        else:
            raise ValueError("Only pytorch is supported.")


if __name__ == "__main__":
    # Script entry point: pass the command-line arguments (program name
    # stripped) to main().
    main(sys.argv[1:])


================================================
FILE: bin/lm_train.py
================================================
#!/usr/bin/env python3

# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

# This code is ported from the following implementation written in Torch.
# https://github.com/chainer/chainer/blob/master/examples/ptb/train_ptb_custom_loop.py

"""Language model training script."""

import logging
import os
import random
import subprocess
import sys

import configargparse
import numpy as np

from espnet import __version__
from espnet.nets.lm_interface import dynamic_import_lm
from espnet.optimizer.factory import dynamic_import_optimizer
from espnet.scheduler.scheduler import dynamic_import_scheduler


# NOTE: you need this func to generate our sphinx doc
def get_parser(parser=None, required=True):
    """Create (or extend) the command-line parser for LM training.

    Args:
        parser: Parser to register the options on. A new
            ``configargparse.ArgumentParser`` is built when ``None``.
        required: Forwarded as ``required=`` to ``--outdir``, ``--dict``,
            ``--train-label`` and ``--valid-label``.

    Returns:
        The parser with every LM training option registered.

    """
    if parser is None:
        parser = configargparse.ArgumentParser(
            formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
            config_file_parser_class=configargparse.YAMLConfigFileParser,
            description="Train a new language model on one CPU or one GPU",
        )

    # ---- general configuration -------------------------------------------
    parser.add("--config", help="config file path", is_config_file=True)
    parser.add(
        "--config2",
        help="second config file path that overwrites the settings in `--config`.",
        is_config_file=True,
    )
    parser.add(
        "--config3",
        help="third config file path that overwrites the settings in `--config` "
        "and `--config2`.",
        is_config_file=True,
    )

    parser.add_argument(
        "--ngpu",
        type=int,
        default=None,
        help="Number of GPUs. If not given, use all visible devices",
    )
    parser.add_argument(
        "--train-dtype",
        choices=["float16", "float32", "float64", "O0", "O1", "O2", "O3"],
        default="float32",
        help=(
            "Data type for training (only pytorch backend). "
            "O0,O1,.. flags require apex. "
            "See https://nvidia.github.io/apex/amp.html#opt-levels"
        ),
    )
    parser.add_argument(
        "--backend",
        type=str,
        choices=["chainer", "pytorch"],
        default="chainer",
        help="Backend library",
    )
    parser.add_argument(
        "--outdir", help="Output directory", required=required, type=str
    )
    parser.add_argument("--debugmode", type=int, default=1, help="Debugmode")
    parser.add_argument("--dict", help="Dictionary", required=required, type=str)
    parser.add_argument("--seed", type=int, default=1, help="Random seed")
    parser.add_argument(
        "--resume",
        "-r",
        nargs="?",
        default="",
        help="Resume the training from snapshot",
    )
    parser.add_argument("--verbose", "-V", type=int, default=0, help="Verbose option")
    parser.add_argument(
        "--tensorboard-dir",
        type=str,
        nargs="?",
        default=None,
        help="Tensorboard log dir path",
    )
    parser.add_argument(
        "--report-interval-iters",
        type=int,
        default=100,
        help="Report interval iterations",
    )

    # ---- task related ----------------------------------------------------
    parser.add_argument(
        "--train-label",
        help="Filename of train label data",
        required=required,
        type=str,
    )
    parser.add_argument(
        "--valid-label",
        help="Filename of validation label data",
        required=required,
        type=str,
    )
    parser.add_argument("--test-label", help="Filename of test label data", type=str)
    parser.add_argument(
        "--dump-hdf5-path",
        default=None,
        help="Path to dump a preprocessed dataset as hdf5",
        type=str,
    )

    # ---- training configuration ------------------------------------------
    parser.add_argument("--opt", type=str, default="sgd", help="Optimizer")
    parser.add_argument(
        "--sortagrad",
        type=int,
        nargs="?",
        default=0,
        help="How many epochs to use sortagrad for. 0 = deactivated, -1 = all epochs",
    )
    parser.add_argument(
        "--batchsize",
        "-b",
        default=300,
        help="Number of examples in each mini-batch",
        type=int,
    )
    parser.add_argument(
        "--accum-grad", default=1, help="Number of gradient accumueration", type=int
    )
    parser.add_argument(
        "--epoch",
        "-e",
        default=20,
        help="Number of sweeps over the dataset to train",
        type=int,
    )
    parser.add_argument(
        "--early-stop-criterion",
        type=str,
        nargs="?",
        default="validation/main/loss",
        help="Value to monitor to trigger an early stopping of the training",
    )
    parser.add_argument(
        "--patience",
        type=int,
        nargs="?",
        default=3,
        help="Number of epochs to wait without improvement "
        "before stopping the training",
    )
    # Each occurrence is split on "=" and appended to the list.
    parser.add_argument(
        "--schedulers",
        action="append",
        type=lambda kv: kv.split("="),
        default=None,
        help=(
            "optimizer schedulers, you can configure params like: "
            "<optimizer-param>-<scheduler-name>-<schduler-param> "
            'e.g., "--schedulers lr=noam --lr-noam-warmup 1000".'
        ),
    )
    parser.add_argument(
        "--gradclip",
        "-c",
        default=5,
        help="Gradient norm threshold to clip",
        type=float,
    )
    parser.add_argument(
        "--maxlen",
        default=40,
        help="Batch size is reduced if the input sequence > ML",
        type=int,
    )
    parser.add_argument(
        "--model-module",
        default="default",
        help=(
            "model defined module "
            "(default: espnet.nets.xxx_backend.lm.default:DefaultRNNLM)"
        ),
        type=str,
    )
    return parser


def _count_available_gpus():
    """Guess how many GPUs to use when ``--ngpu`` is not given.

    Returns:
        The number of non-empty entries in ``CUDA_VISIBLE_DEVICES`` when that
        variable is set; otherwise the number of devices listed by
        ``nvidia-smi -L``, or 0 when ``nvidia-smi`` is missing or fails.

    """
    cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
    if cvd is not None:
        # An empty value hides every device, so empty entries are not counted.
        return len([dev for dev in cvd.split(",") if dev.strip()])

    logging.warning("CUDA_VISIBLE_DEVICES is not set.")
    try:
        p = subprocess.run(
            ["nvidia-smi", "-L"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
    except OSError:
        # nvidia-smi is not installed or cannot be executed.
        return 0
    if p.returncode != 0:
        # subprocess.run() does not raise on failure without check=True, so
        # the exit status has to be inspected explicitly.
        return 0
    # `nvidia-smi -L` prints one line per device on stdout (not stderr).
    return len([line for line in p.stdout.decode().splitlines() if line.strip()])


def main(cmd_args):
    """Train LM.

    Args:
        cmd_args: Command-line arguments without the program name.

    Raises:
        NotImplementedError: The chainer backend is combined with a
            ``--train-dtype`` other than ``float32``.
        ValueError: A half-precision/apex ``--train-dtype`` is requested with
            ``--ngpu 0``, or the backend is neither chainer nor pytorch.

    """
    parser = get_parser()
    # First pass: only the generic options are known at this point.
    args, _ = parser.parse_known_args(cmd_args)
    if args.backend == "chainer" and args.train_dtype != "float32":
        raise NotImplementedError(
            f"chainer backend does not support --train-dtype {args.train_dtype}. "
            "Use --train-dtype float32."
        )
    if args.ngpu == 0 and args.train_dtype in ("O0", "O1", "O2", "O3", "float16"):
        raise ValueError(
            f"--train-dtype {args.train_dtype} does not support the CPU backend."
        )

    # parse arguments dynamically
    model_class = dynamic_import_lm(args.model_module, args.backend)
    model_class.add_arguments(parser)
    if args.schedulers is not None:
        # Each entry is the "<param>=<scheduler>" option split on "=".
        for k, v in args.schedulers:
            scheduler_class = dynamic_import_scheduler(v)
            scheduler_class.add_arguments(k, parser)

    opt_class = dynamic_import_optimizer(args.opt, args.backend)
    opt_class.add_arguments(parser)

    # Second pass: strict parsing now that model/scheduler/optimizer options
    # have been registered.
    args = parser.parse_args(cmd_args)

    # add version info in args
    args.version = __version__

    # logging info
    if args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skip DEBUG/INFO messages")

    # If --ngpu is not given,
    #   1. if CUDA_VISIBLE_DEVICES is set, all visible devices
    #   2. if nvidia-smi exists, use all devices
    #   3. else ngpu=0
    if args.ngpu is None:
        ngpu = _count_available_gpus()
        args.ngpu = ngpu
    else:
        ngpu = args.ngpu
    logging.info(f"ngpu: {ngpu}")

    # display PYTHONPATH
    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))

    # seed setting
    nseed = args.seed
    random.seed(nseed)
    np.random.seed(nseed)

    # load dictionary
    with open(args.dict, "rb") as f:
        dictionary = f.readlines()
    # The first space-separated field of each line is the symbol.
    char_list = [entry.decode("utf-8").split(" ")[0] for entry in dictionary]
    char_list.insert(0, "<blank>")
    char_list.append("<eos>")
    args.char_list_dict = {x: i for i, x in enumerate(char_list)}
    args.n_vocab = len(char_list)

    # train
    logging.info("backend = " + args.backend)
    if args.backend == "chainer":
        from espnet.lm.chainer_backend.lm import train

        train(args)
    elif args.backend == "pytorch":
        from espnet.lm.pytorch_backend.lm import train

        train(args)
    else:
        raise ValueError("Only chainer and pytorch are supported.")


if __name__ == "__main__":
    # Script entry point: pass the command-line arguments (program name
    # stripped) to main().
    main(sys.argv[1:])


================================================
FILE: bin/mt_train.py
================================================
#!/usr/bin/env python3
# encoding: utf-8

# Copyright 2019 Kyoto University (Hirofumi Inaguma)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Neural machine translation model training script."""

import logging
import os
import random
import subprocess
import sys

from distutils.version import LooseVersion

import configargparse
import numpy as np
import torch

from espnet import __version__
from espnet.utils.cli_utils import strtobool
from espnet.utils.training.batchfy import BATCH_COUNT_CHOICES

# True when the installed PyTorch is version 1.2 or newer; main() consults this
# flag before logging its note about multi-GPU issues in PyTorch 1.2+.
is_torch_1_2_plus = LooseVersion(torch.__version__) >= LooseVersion("1.2")


def _split_module_list(s):
    """Split a comma-separated list of module names into a clean list.

    Whitespace around each name is stripped and empty entries are dropped, so
    ``"att., dec."`` yields ``["att.", "dec."]`` and ``""`` yields ``[]``.

    Args:
        s (str): Comma-separated module names.

    Returns:
        list[str]: The module names, in the order given.

    """
    return [mod.strip() for mod in s.split(",") if mod.strip()]


# NOTE: you need this func to generate our sphinx doc
def get_parser(parser=None, required=True):
    """Get default arguments.

    Args:
        parser: Existing parser to add the options to. When None, a new
            ``configargparse.ArgumentParser`` reading YAML config files is
            created.
        required (bool): Whether ``--outdir`` and ``--dict`` are mandatory.

    Returns:
        The parser with all NMT training options registered.

    """
    if parser is None:
        parser = configargparse.ArgumentParser(
            description="Train a neural machine translation (NMT) model on one CPU, "
            "one or multiple GPUs",
            config_file_parser_class=configargparse.YAMLConfigFileParser,
            formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
        )
    # general configuration
    parser.add("--config", is_config_file=True, help="config file path")
    parser.add(
        "--config2",
        is_config_file=True,
        help="second config file path that overwrites the settings in `--config`.",
    )
    parser.add(
        "--config3",
        is_config_file=True,
        help="third config file path that overwrites the settings "
        "in `--config` and `--config2`.",
    )

    parser.add_argument(
        "--ngpu",
        default=None,
        type=int,
        help="Number of GPUs. If not given, use all visible devices",
    )
    parser.add_argument(
        "--train-dtype",
        default="float32",
        choices=["float16", "float32", "float64", "O0", "O1", "O2", "O3"],
        help="Data type for training (only pytorch backend). "
        "O0,O1,.. flags require apex. "
        "See https://nvidia.github.io/apex/amp.html#opt-levels",
    )
    parser.add_argument(
        "--backend",
        default="chainer",
        type=str,
        choices=["chainer", "pytorch"],
        help="Backend library",
    )
    parser.add_argument(
        "--outdir", type=str, required=required, help="Output directory"
    )
    parser.add_argument("--debugmode", default=1, type=int, help="Debugmode")
    parser.add_argument(
        "--dict", required=required, help="Dictionary for source/target languages"
    )
    parser.add_argument("--seed", default=1, type=int, help="Random seed")
    parser.add_argument("--debugdir", type=str, help="Output directory for debugging")
    parser.add_argument(
        "--resume",
        "-r",
        default="",
        nargs="?",
        help="Resume the training from snapshot",
    )
    parser.add_argument(
        "--minibatches",
        "-N",
        type=int,
        default="-1",
        help="Process only N minibatches (for debug)",
    )
    parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
    parser.add_argument(
        "--tensorboard-dir",
        default=None,
        type=str,
        nargs="?",
        help="Tensorboard log dir path",
    )
    parser.add_argument(
        "--report-interval-iters",
        default=100,
        type=int,
        help="Report interval iterations",
    )
    parser.add_argument(
        "--save-interval-iters",
        default=0,
        type=int,
        help="Save snapshot interval iterations",
    )
    # task related
    parser.add_argument(
        "--train-json",
        type=str,
        default=None,
        help="Filename of train label data (json)",
    )
    parser.add_argument(
        "--valid-json",
        type=str,
        default=None,
        help="Filename of validation label data (json)",
    )
    # network architecture
    parser.add_argument(
        "--model-module",
        type=str,
        default=None,
        help="model defined module (default: espnet.nets.xxx_backend.e2e_mt:E2E)",
    )
    # loss related
    parser.add_argument(
        "--lsm-weight", default=0.0, type=float, help="Label smoothing weight"
    )
    # translations options to compute BLEU
    # NOTE(review): default=True combined with store_true means this flag can
    # never be switched off from the command line.
    parser.add_argument(
        "--report-bleu",
        default=True,
        action="store_true",
        help="Compute BLEU on development set",
    )
    parser.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
    parser.add_argument("--beam-size", type=int, default=4, help="Beam size")
    parser.add_argument("--penalty", default=0.0, type=float, help="Incertion penalty")
    parser.add_argument(
        "--maxlenratio",
        default=0.0,
        type=float,
        help="""Input length ratio to obtain max output length.
                        If maxlenratio=0.0 (default), it uses a end-detect function
                        to automatically find maximum hypothesis lengths""",
    )
    parser.add_argument(
        "--minlenratio",
        default=0.0,
        type=float,
        help="Input length ratio to obtain min output length",
    )
    parser.add_argument(
        "--rnnlm", type=str, default=None, help="RNNLM model file to read"
    )
    parser.add_argument(
        "--rnnlm-conf", type=str, default=None, help="RNNLM model config file to read"
    )
    parser.add_argument("--lm-weight", default=0.0, type=float, help="RNNLM weight.")
    parser.add_argument("--sym-space", default="<space>", type=str, help="Space symbol")
    parser.add_argument("--sym-blank", default="<blank>", type=str, help="Blank symbol")
    # minibatch related
    parser.add_argument(
        "--sortagrad",
        default=0,
        type=int,
        nargs="?",
        help="How many epochs to use sortagrad for. 0 = deactivated, -1 = all epochs",
    )
    parser.add_argument(
        "--batch-count",
        default="auto",
        choices=BATCH_COUNT_CHOICES,
        help="How to count batch_size. "
        "The default (auto) will find how to count by args.",
    )
    parser.add_argument(
        "--batch-size",
        "--batch-seqs",
        "-b",
        default=0,
        type=int,
        help="Maximum seqs in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--batch-bins",
        default=0,
        type=int,
        help="Maximum bins in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--batch-frames-in",
        default=0,
        type=int,
        help="Maximum input frames in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--batch-frames-out",
        default=0,
        type=int,
        help="Maximum output frames in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--batch-frames-inout",
        default=0,
        type=int,
        help="Maximum input+output frames in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--maxlen-in",
        "--batch-seq-maxlen-in",
        default=100,
        type=int,
        metavar="ML",
        help="When --batch-count=seq, "
        "batch size is reduced if the input sequence length > ML.",
    )
    parser.add_argument(
        "--maxlen-out",
        "--batch-seq-maxlen-out",
        default=100,
        type=int,
        metavar="ML",
        help="When --batch-count=seq, "
        "batch size is reduced if the output sequence length > ML",
    )
    parser.add_argument(
        "--n-iter-processes",
        default=0,
        type=int,
        help="Number of processes of iterator",
    )
    # optimization related
    parser.add_argument(
        "--opt",
        default="adadelta",
        type=str,
        choices=["adadelta", "adam", "noam"],
        help="Optimizer",
    )
    parser.add_argument(
        "--accum-grad", default=1, type=int, help="Number of gradient accumuration"
    )
    parser.add_argument(
        "--eps", default=1e-8, type=float, help="Epsilon constant for optimizer"
    )
    parser.add_argument(
        "--eps-decay", default=0.01, type=float, help="Decaying ratio of epsilon"
    )
    parser.add_argument(
        "--lr", default=1e-3, type=float, help="Learning rate for optimizer"
    )
    parser.add_argument(
        "--lr-decay", default=1.0, type=float, help="Decaying ratio of learning rate"
    )
    parser.add_argument(
        "--weight-decay", default=0.0, type=float, help="Weight decay ratio"
    )
    parser.add_argument(
        "--criterion",
        default="acc",
        type=str,
        choices=["loss", "acc"],
        help="Criterion to perform epsilon decay",
    )
    parser.add_argument(
        "--threshold", default=1e-4, type=float, help="Threshold to stop iteration"
    )
    parser.add_argument(
        "--epochs", "-e", default=30, type=int, help="Maximum number of epochs"
    )
    parser.add_argument(
        "--early-stop-criterion",
        default="validation/main/acc",
        type=str,
        nargs="?",
        help="Value to monitor to trigger an early stopping of the training",
    )
    parser.add_argument(
        "--patience",
        default=3,
        type=int,
        nargs="?",
        help="Number of epochs to wait "
        "without improvement before stopping the training",
    )
    parser.add_argument(
        "--grad-clip", default=5, type=float, help="Gradient norm threshold to clip"
    )
    parser.add_argument(
        "--num-save-attention",
        default=3,
        type=int,
        help="Number of samples of attention to be saved",
    )
    # decoder related
    parser.add_argument(
        "--context-residual",
        default=False,
        type=strtobool,
        nargs="?",
        help="The flag to switch to use context vector residual in the decoder network",
    )
    parser.add_argument(
        "--tie-src-tgt-embedding",
        default=False,
        type=strtobool,
        nargs="?",
        help="Tie parameters of source embedding and target embedding.",
    )
    parser.add_argument(
        "--tie-classifier",
        default=False,
        type=strtobool,
        nargs="?",
        help="Tie parameters of target embedding and output projection layer.",
    )
    # finetuning related
    parser.add_argument(
        "--enc-init",
        default=None,
        type=str,
        nargs="?",
        help="Pre-trained ASR model to initialize encoder.",
    )
    # String defaults are passed through `type` by argparse, so the defaults of
    # the two *-init-mods options below are also converted to lists.
    parser.add_argument(
        "--enc-init-mods",
        default="enc.enc.",
        type=_split_module_list,
        help="List of encoder modules to initialize, separated by a comma.",
    )
    parser.add_argument(
        "--dec-init",
        default=None,
        type=str,
        nargs="?",
        help="Pre-trained ASR, MT or LM model to initialize decoder.",
    )
    parser.add_argument(
        "--dec-init-mods",
        default="att., dec.",
        type=_split_module_list,
        help="List of decoder modules to initialize, separated by a comma.",
    )
    # multilingual related
    parser.add_argument(
        "--multilingual",
        default=False,
        type=strtobool,
        help="Prepend target language ID to the source sentence. "
        "Both source/target language IDs must be prepend in the pre-processing stage.",
    )
    parser.add_argument(
        "--replace-sos",
        default=False,
        type=strtobool,
        help="Replace <sos> in the decoder with a target language ID "
        "(the first token in the target sequence)",
    )

    return parser


def _count_visible_gpus():
    """Return the number of GPUs to use when ``--ngpu`` is not given.

    The count is resolved in this order:
      1. if CUDA_VISIBLE_DEVICES is set, the number of devices listed in it
      2. if nvidia-smi can be run, the number of devices it lists
      3. otherwise 0

    Returns:
        int: Number of GPUs detected.

    """
    cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
    if cvd is not None:
        # An empty value (or a stray comma) must not be counted as a device.
        return len([dev for dev in cvd.split(",") if dev.strip()])

    logging.warning("CUDA_VISIBLE_DEVICES is not set.")
    try:
        p = subprocess.run(
            ["nvidia-smi", "-L"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
    except (subprocess.CalledProcessError, FileNotFoundError):
        return 0
    # subprocess.run() without check=True does not raise on failure, so the
    # exit status has to be inspected explicitly.
    if p.returncode != 0:
        return 0
    # `nvidia-smi -L` writes one "GPU <index>: ..." line per device to stdout.
    return sum(
        1 for line in p.stdout.decode().splitlines() if line.startswith("GPU ")
    )


def main(cmd_args):
    """Run the main training function.

    Parses the options (including those added by the selected model class),
    configures logging, resolves the number of GPUs, seeds the Python and
    NumPy random generators, loads the dictionary and starts training.

    Args:
        cmd_args (list[str]): Command-line arguments without the program name.

    Raises:
        NotImplementedError: If the chainer backend is combined with a
            ``--train-dtype`` other than float32.
        ValueError: If a half-precision/apex dtype is requested with
            ``--ngpu 0``, or if the resolved backend is not pytorch.

    """
    parser = get_parser()
    args, _ = parser.parse_known_args(cmd_args)
    if args.backend == "chainer" and args.train_dtype != "float32":
        raise NotImplementedError(
            f"chainer backend does not support --train-dtype {args.train_dtype}. "
            "Use --train-dtype float32."
        )
    if args.ngpu == 0 and args.train_dtype in ("O0", "O1", "O2", "O3", "float16"):
        raise ValueError(
            f"--train-dtype {args.train_dtype} does not support the CPU backend."
        )

    from espnet.utils.dynamic_import import dynamic_import

    if args.model_module is None:
        model_module = "espnet.nets." + args.backend + "_backend.e2e_mt:E2E"
    else:
        model_module = args.model_module
    model_class = dynamic_import(model_module)
    # The model class registers its own options, so the command line is parsed
    # a second time with the extended parser.
    model_class.add_arguments(parser)

    args = parser.parse_args(cmd_args)
    args.model_module = model_module
    # The backend is derived from the module path when it names one explicitly.
    if "chainer_backend" in args.model_module:
        args.backend = "chainer"
    if "pytorch_backend" in args.model_module:
        args.backend = "pytorch"

    # add version info in args
    args.version = __version__

    # logging info
    if args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skip DEBUG/INFO messages")

    # If --ngpu is not given, detect the number of usable devices.
    if args.ngpu is None:
        ngpu = _count_visible_gpus()
        args.ngpu = ngpu
    else:
        if is_torch_1_2_plus and args.ngpu != 1:
            logging.debug(
                "There are some bugs with multi-GPU processing in PyTorch 1.2+"
                + " (see https://github.com/pytorch/pytorch/issues/21108)"
            )
        ngpu = args.ngpu
    logging.info(f"ngpu: {ngpu}")

    # display PYTHONPATH
    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))

    # set random seed
    logging.info("random seed = %d" % args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)

    # load dictionary for debug log
    if args.dict is not None:
        with open(args.dict, "rb") as f:
            dictionary = f.readlines()
        # The first space-separated field of every line is the token; the list
        # is then framed with "<blank>" at index 0 and "<eos>" at the end.
        char_list = [entry.decode("utf-8").split(" ")[0] for entry in dictionary]
        char_list.insert(0, "<blank>")
        char_list.append("<eos>")
        args.char_list = char_list
    else:
        args.char_list = None

    # train
    logging.info("backend = " + args.backend)

    if args.backend == "pytorch":
        from espnet.mt.pytorch_backend.mt import train

        train(args)
    else:
        raise ValueError("Only pytorch are supported.")


# Script entry point: forward the command-line arguments (without the program
# name) to main().
if __name__ == "__main__":
    main(sys.argv[1:])


================================================
FILE: bin/mt_trans.py
================================================
#!/usr/bin/env python3
# encoding: utf-8

# Copyright 2019 Kyoto University (Hirofumi Inaguma)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Neural machine translation model decoding script."""

import configargparse
import logging
import os
import random
import sys

import numpy as np


# NOTE: you need this func to generate our sphinx doc
def get_parser():
    """Build the command-line parser of the translation (decoding) script.

    Returns:
        configargparse.ArgumentParser: Parser that reads YAML config files and
        carries the general, task, model, search and multilingual options.

    """
    # NOTE(review): the description (and the "(default)" remark in the
    # --maxlenratio help, whose default here is 3.0) look copied from another
    # script; they are kept verbatim because they are user-facing text.
    parser = configargparse.ArgumentParser(
        formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
        config_file_parser_class=configargparse.YAMLConfigFileParser,
        description="Translate text from speech "
        "using a speech translation model on one CPU or GPU",
    )
    add_config = parser.add
    add_option = parser.add_argument

    # general configuration
    add_config("--config", help="Config file path", is_config_file=True)
    add_config(
        "--config2",
        help="Second config file path that overwrites the settings in `--config`",
        is_config_file=True,
    )
    add_config(
        "--config3",
        help="Third config file path "
        "that overwrites the settings in `--config` and `--config2`",
        is_config_file=True,
    )

    add_option("--ngpu", default=0, type=int, help="Number of GPUs")
    add_option(
        "--dtype",
        default="float32",
        choices=("float16", "float32", "float64"),
        help="Float precision (only available in --api v2)",
    )
    add_option(
        "--backend",
        default="chainer",
        choices=["chainer", "pytorch"],
        type=str,
        help="Backend library",
    )
    add_option("--debugmode", default=1, type=int, help="Debugmode")
    add_option("--seed", default=1, type=int, help="Random seed")
    add_option("--verbose", "-V", default=1, type=int, help="Verbose option")
    add_option(
        "--batchsize",
        default=1,
        type=int,
        help="Batch size for beam search (0: means no batch processing)",
    )
    add_option(
        "--preprocess-conf",
        default=None,
        type=str,
        help="The configuration file for the pre-processing",
    )
    add_option(
        "--api",
        choices=["v1", "v2"],
        default="v1",
        help="Beam search APIs "
        "v1: Default API. It only supports "
        "the ASRInterface.recognize method and DefaultRNNLM. "
        "v2: Experimental API. "
        "It supports any models that implements ScorerInterface.",
    )

    # task related
    add_option("--trans-json", type=str, help="Filename of translation data (json)")
    add_option(
        "--result-label",
        required=True,
        type=str,
        help="Filename of result label data (json)",
    )

    # model (parameter) related
    add_option(
        "--model", required=True, type=str, help="Model file parameters to read"
    )
    add_option("--model-conf", default=None, type=str, help="Model config file")

    # search related
    add_option("--nbest", default=1, type=int, help="Output N-best hypotheses")
    add_option("--beam-size", default=1, type=int, help="Beam size")
    add_option("--penalty", default=0.1, type=float, help="Incertion penalty")
    add_option(
        "--maxlenratio",
        default=3.0,
        type=float,
        help="""Input length ratio to obtain max output length.
                        If maxlenratio=0.0 (default), it uses a end-detect function
                        to automatically find maximum hypothesis lengths""",
    )
    add_option(
        "--minlenratio",
        default=0.0,
        type=float,
        help="Input length ratio to obtain min output length",
    )

    # multilingual related
    add_option(
        "--tgt-lang",
        type=str,
        default=False,
        help="target language ID (e.g., <en>, <de>, and <fr> etc.)",
    )
    return parser


def main(args):
    """Run the main decoding function.

    Parses the options, configures logging from ``--verbose``, validates the
    GPU request against CUDA_VISIBLE_DEVICES, seeds the Python and NumPy
    random generators and runs translation with the pytorch backend.

    Args:
        args (list[str]): Command-line arguments without the program name.

    Raises:
        NotImplementedError: If ``--dtype`` is not float32.
        ValueError: If the backend is not pytorch.

    """
    args = get_parser().parse_args(args)

    # logging info: 1 -> INFO, 2 -> DEBUG, anything else -> WARN
    log_format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    levels = {1: logging.INFO, 2: logging.DEBUG}
    logging.basicConfig(
        level=levels.get(args.verbose, logging.WARN), format=log_format
    )
    if args.verbose not in levels:
        logging.warning("Skip DEBUG/INFO messages")

    # check CUDA_VISIBLE_DEVICES
    if args.ngpu > 0:
        visible = os.environ.get("CUDA_VISIBLE_DEVICES")
        if visible is None:
            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
        elif len(visible.split(",")) != args.ngpu:
            logging.error("#gpus is not matched with CUDA_VISIBLE_DEVICES.")
            sys.exit(1)

        # TODO(mn5k): support of multiple GPUs
        if args.ngpu > 1:
            logging.error("The program only supports ngpu=1.")
            sys.exit(1)

    # display PYTHONPATH
    python_path = os.environ.get("PYTHONPATH", "(None)")
    logging.info("python path = " + python_path)

    # seed setting
    seed = args.seed
    random.seed(seed)
    np.random.seed(seed)
    logging.info("set random seed = %d" % seed)

    # trans
    logging.info("backend = " + args.backend)
    if args.backend != "pytorch":
        raise ValueError("Only pytorch are supported.")

    # Experimental API that supports custom LMs
    from espnet.mt.pytorch_backend.mt import trans

    if args.dtype != "float32":
        raise NotImplementedError(
            f"`--dtype {args.dtype}` is only available with `--api v2`"
        )
    trans(args)


# Script entry point: forward the command-line arguments (without the program
# name) to main().
if __name__ == "__main__":
    main(sys.argv[1:])


================================================
FILE: bin/st_train.py
================================================
#!/usr/bin/env python3
# encoding: utf-8

# Copyright 2019 Kyoto University (Hirofumi Inaguma)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""End-to-end speech translation model training script."""

from distutils.version import LooseVersion
import logging
import os
import random
import subprocess
import sys

import configargparse
import numpy as np
import torch

from espnet import __version__
from espnet.utils.cli_utils import strtobool
from espnet.utils.training.batchfy import BATCH_COUNT_CHOICES

# True when the installed PyTorch is version 1.2 or newer.
is_torch_1_2_plus = LooseVersion(torch.__version__) >= LooseVersion("1.2")


def _split_module_list(s):
    """Split a comma-separated list of module names into a clean list.

    Whitespace around each name is stripped and empty entries are dropped, so
    ``"att., dec."`` yields ``["att.", "dec."]`` and ``""`` yields ``[]``.

    Args:
        s (str): Comma-separated module names.

    Returns:
        list[str]: The module names, in the order given.

    """
    return [mod.strip() for mod in s.split(",") if mod.strip()]


# NOTE: you need this func to generate our sphinx doc
def get_parser(parser=None, required=True):
    """Get default arguments.

    Args:
        parser: Existing parser to add the options to. When None, a new
            ``configargparse.ArgumentParser`` reading YAML config files is
            created.
        required (bool): Whether ``--outdir`` and ``--dict`` are mandatory.

    Returns:
        The parser with all speech translation training options registered.

    """
    if parser is None:
        parser = configargparse.ArgumentParser(
            description="Train a speech translation (ST) model on one CPU, "
            "one or multiple GPUs",
            config_file_parser_class=configargparse.YAMLConfigFileParser,
            formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
        )
    # general configuration
    parser.add("--config", is_config_file=True, help="config file path")
    parser.add(
        "--config2",
        is_config_file=True,
        help="second config file path that overwrites the settings in `--config`.",
    )
    parser.add(
        "--config3",
        is_config_file=True,
        help="third config file path that overwrites the settings "
        "in `--config` and `--config2`.",
    )

    parser.add_argument(
        "--ngpu",
        default=None,
        type=int,
        help="Number of GPUs. If not given, use all visible devices",
    )
    parser.add_argument(
        "--train-dtype",
        default="float32",
        choices=["float16", "float32", "float64", "O0", "O1", "O2", "O3"],
        help="Data type for training (only pytorch backend). "
        "O0,O1,.. flags require apex. "
        "See https://nvidia.github.io/apex/amp.html#opt-levels",
    )
    parser.add_argument(
        "--backend",
        default="chainer",
        type=str,
        choices=["chainer", "pytorch"],
        help="Backend library",
    )
    parser.add_argument(
        "--outdir", type=str, required=required, help="Output directory"
    )
    parser.add_argument("--debugmode", default=1, type=int, help="Debugmode")
    parser.add_argument("--dict", required=required, help="Dictionary")
    parser.add_argument("--seed", default=1, type=int, help="Random seed")
    parser.add_argument("--debugdir", type=str, help="Output directory for debugging")
    parser.add_argument(
        "--resume",
        "-r",
        default="",
        nargs="?",
        help="Resume the training from snapshot",
    )
    parser.add_argument(
        "--minibatches",
        "-N",
        type=int,
        default="-1",
        help="Process only N minibatches (for debug)",
    )
    parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
    parser.add_argument(
        "--tensorboard-dir",
        default=None,
        type=str,
        nargs="?",
        help="Tensorboard log dir path",
    )
    parser.add_argument(
        "--report-interval-iters",
        default=100,
        type=int,
        help="Report interval iterations",
    )
    parser.add_argument(
        "--save-interval-iters",
        default=0,
        type=int,
        help="Save snapshot interval iterations",
    )
    # task related
    parser.add_argument(
        "--train-json",
        type=str,
        default=None,
        help="Filename of train label data (json)",
    )
    parser.add_argument(
        "--valid-json",
        type=str,
        default=None,
        help="Filename of validation label data (json)",
    )
    # network architecture
    parser.add_argument(
        "--model-module",
        type=str,
        default=None,
        help="model defined module (default: espnet.nets.xxx_backend.e2e_st:E2E)",
    )
    # loss related
    parser.add_argument(
        "--ctc_type",
        default="warpctc",
        type=str,
        choices=["builtin", "warpctc", "gtnctc", "cudnnctc"],
        help="Type of CTC implementation to calculate loss.",
    )
    parser.add_argument(
        "--mtlalpha",
        default=0.0,
        type=float,
        help="Multitask learning coefficient, alpha: \
                                alpha*ctc_loss + (1-alpha)*att_loss",
    )
    parser.add_argument(
        "--asr-weight",
        default=0.0,
        type=float,
        help="Multitask learning coefficient for ASR task, weight: "
        " asr_weight*(alpha*ctc_loss + (1-alpha)*att_loss)"
        " + (1-asr_weight-mt_weight)*st_loss",
    )
    parser.add_argument(
        "--mt-weight",
        default=0.0,
        type=float,
        help="Multitask learning coefficient for MT task, weight: \
                                mt_weight*mt_loss + (1-mt_weight-asr_weight)*st_loss",
    )
    parser.add_argument(
        "--lsm-weight", default=0.0, type=float, help="Label smoothing weight"
    )
    # recognition options to compute CER/WER
    parser.add_argument(
        "--report-cer",
        default=False,
        action="store_true",
        help="Compute CER on development set",
    )
    parser.add_argument(
        "--report-wer",
        default=False,
        action="store_true",
        help="Compute WER on development set",
    )
    # translations options to compute BLEU
    # NOTE(review): default=True combined with store_true means this flag can
    # never be switched off from the command line.
    parser.add_argument(
        "--report-bleu",
        default=True,
        action="store_true",
        help="Compute BLEU on development set",
    )
    parser.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
    parser.add_argument("--beam-size", type=int, default=4, help="Beam size")
    parser.add_argument("--penalty", default=0.0, type=float, help="Incertion penalty")
    parser.add_argument(
        "--maxlenratio",
        default=0.0,
        type=float,
        help="""Input length ratio to obtain max output length.
                        If maxlenratio=0.0 (default), it uses a end-detect function
                        to automatically find maximum hypothesis lengths""",
    )
    parser.add_argument(
        "--minlenratio",
        default=0.0,
        type=float,
        help="Input length ratio to obtain min output length",
    )
    parser.add_argument(
        "--rnnlm", type=str, default=None, help="RNNLM model file to read"
    )
    parser.add_argument(
        "--rnnlm-conf", type=str, default=None, help="RNNLM model config file to read"
    )
    parser.add_argument("--lm-weight", default=0.0, type=float, help="RNNLM weight.")
    parser.add_argument("--sym-space", default="<space>", type=str, help="Space symbol")
    parser.add_argument("--sym-blank", default="<blank>", type=str, help="Blank symbol")
    # minibatch related
    parser.add_argument(
        "--sortagrad",
        default=0,
        type=int,
        nargs="?",
        help="How many epochs to use sortagrad for. 0 = deactivated, -1 = all epochs",
    )
    parser.add_argument(
        "--batch-count",
        default="auto",
        choices=BATCH_COUNT_CHOICES,
        help="How to count batch_size. "
        "The default (auto) will find how to count by args.",
    )
    parser.add_argument(
        "--batch-size",
        "--batch-seqs",
        "-b",
        default=0,
        type=int,
        help="Maximum seqs in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--batch-bins",
        default=0,
        type=int,
        help="Maximum bins in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--batch-frames-in",
        default=0,
        type=int,
        help="Maximum input frames in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--batch-frames-out",
        default=0,
        type=int,
        help="Maximum output frames in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--batch-frames-inout",
        default=0,
        type=int,
        help="Maximum input+output frames in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--maxlen-in",
        "--batch-seq-maxlen-in",
        default=800,
        type=int,
        metavar="ML",
        help="When --batch-count=seq, batch size is reduced "
        "if the input sequence length > ML.",
    )
    parser.add_argument(
        "--maxlen-out",
        "--batch-seq-maxlen-out",
        default=150,
        type=int,
        metavar="ML",
        help="When --batch-count=seq, "
        "batch size is reduced if the output sequence length > ML",
    )
    parser.add_argument(
        "--n-iter-processes",
        default=0,
        type=int,
        help="Number of processes of iterator",
    )
    parser.add_argument(
        "--preprocess-conf",
        type=str,
        default=None,
        nargs="?",
        help="The configuration file for the pre-processing",
    )
    # optimization related
    parser.add_argument(
        "--opt",
        default="adadelta",
        type=str,
        choices=["adadelta", "adam", "noam"],
        help="Optimizer",
    )
    parser.add_argument(
        "--accum-grad", default=1, type=int, help="Number of gradient accumuration"
    )
    parser.add_argument(
        "--eps", default=1e-8, type=float, help="Epsilon constant for optimizer"
    )
    parser.add_argument(
        "--eps-decay", default=0.01, type=float, help="Decaying ratio of epsilon"
    )
    parser.add_argument(
        "--lr", default=1e-3, type=float, help="Learning rate for optimizer"
    )
    parser.add_argument(
        "--lr-decay", default=1.0, type=float, help="Decaying ratio of learning rate"
    )
    parser.add_argument(
        "--weight-decay", default=0.0, type=float, help="Weight decay ratio"
    )
    parser.add_argument(
        "--criterion",
        default="acc",
        type=str,
        choices=["loss", "acc"],
        help="Criterion to perform epsilon decay",
    )
    parser.add_argument(
        "--threshold", default=1e-4, type=float, help="Threshold to stop iteration"
    )
    parser.add_argument(
        "--epochs", "-e", default=30, type=int, help="Maximum number of epochs"
    )
    parser.add_argument(
        "--early-stop-criterion",
        default="validation/main/acc",
        type=str,
        nargs="?",
        help="Value to monitor to trigger an early stopping of the training",
    )
    parser.add_argument(
        "--patience",
        default=3,
        type=int,
        nargs="?",
        help="Number of epochs to wait "
        "without improvement before stopping the training",
    )
    parser.add_argument(
        "--grad-clip", default=5, type=float, help="Gradient norm threshold to clip"
    )
    parser.add_argument(
        "--num-save-attention",
        default=3,
        type=int,
        help="Number of samples of attention to be saved",
    )
    parser.add_argument(
        "--num-save-ctc",
        default=3,
        type=int,
        help="Number of samples of CTC probability to be saved",
    )
    parser.add_argument(
        "--grad-noise",
        type=strtobool,
        default=False,
        help="The flag to switch to use noise injection to gradients during training",
    )
    # speech translation related
    parser.add_argument(
        "--context-residual",
        default=False,
        type=strtobool,
        nargs="?",
        help="The flag to switch to use context vector residual in the decoder network",
    )
    # finetuning related
    parser.add_argument(
        "--enc-init",
        default=None,
        type=str,
        nargs="?",
        help="Pre-trained ASR model to initialize encoder.",
    )
    # String defaults are passed through `type` by argparse, so the defaults of
    # the two *-init-mods options below are also converted to lists.
    parser.add_argument(
        "--enc-init-mods",
        default="enc.enc.",
        type=_split_module_list,
        help="List of encoder modules to initialize, separated by a comma.",
    )
    parser.add_argument(
        "--dec-init",
        default=None,
        type=str,
        nargs="?",
        help="Pre-trained ASR, MT or LM model to initialize decoder.",
    )
    parser.add_argument(
        "--dec-init-mods",
        default="att., dec.",
        type=_split_module_list,
        help="List of decoder modules to initialize, separated by a comma.",
    )
    # multilingual related
    parser.add_argument(
        "--multilingual",
        default=False,
        type=strtobool,
        help="Prepend target language ID to the source sentence. "
        " Both source/target language IDs must be prepend in the pre-processing stage.",
    )
    parser.add_argument(
        "--replace-sos",
        default=False,
        type=strtobool,
        help="Replace <sos> in the decoder with a target language ID \
                              (the first token in the target sequence)",
    )
    # Feature transform: Normalization
    parser.add_argument(
        "--stats-file",
        type=str,
        default=None,
        help="The stats file for the feature normalization",
    )
    parser.add_argument(
        "--apply-uttmvn",
        type=strtobool,
        default=True,
        help="Apply utterance level mean " "variance normalization.",
    )
    parser.add_argument("--uttmvn-norm-means", type=strtobool, default=True, help="")
    parser.add_argument("--uttmvn-norm-vars", type=strtobool, default=False, help="")
    # Feature transform: Fbank
    parser.add_argument(
        "--fbank-fs",
        type=int,
        default=16000,
        help="The sample frequency used for " "the mel-fbank creation.",
    )
    parser.add_argument(
        "--n-mels", type=int, default=80, help="The number of mel-frequency bins."
    )
    parser.add_argument("--fbank-fmin", type=float, default=0.0, help="")
    parser.add_argument("--fbank-fmax", type=float, default=None, help="")
    return parser


def main(cmd_args):
    """Run the main training function.

    The arguments are parsed twice: a first tolerant pass resolves the model
    class, which then registers its own options before the strict second pass.

    Args:
        cmd_args (list[str]): Command line arguments without the program name.

    Raises:
        NotImplementedError: If the chainer backend is requested together with
            a ``--train-dtype`` other than ``float32``.
        ValueError: If a reduced-precision ``--train-dtype`` is combined with
            ``--ngpu 0``, or if the resolved backend is not pytorch.

    """
    parser = get_parser()
    # First pass: model-specific options are not registered yet, so unknown
    # arguments are tolerated here.
    args, _ = parser.parse_known_args(cmd_args)
    if args.backend == "chainer" and args.train_dtype != "float32":
        raise NotImplementedError(
            f"chainer backend does not support --train-dtype {args.train_dtype}."
            " Use --train-dtype float32."
        )
    if args.ngpu == 0 and args.train_dtype in ("O0", "O1", "O2", "O3", "float16"):
        raise ValueError(
            f"--train-dtype {args.train_dtype} does not support the CPU backend."
        )

    from espnet.utils.dynamic_import import dynamic_import

    if args.model_module is None:
        model_module = "espnet.nets." + args.backend + "_backend.e2e_st:E2E"
    else:
        model_module = args.model_module
    model_class = dynamic_import(model_module)
    model_class.add_arguments(parser)

    # Second pass: every option (generic and model-specific) is now known.
    args = parser.parse_args(cmd_args)
    args.model_module = model_module
    # The backend is derived from the module path when it names one.
    if "chainer_backend" in args.model_module:
        args.backend = "chainer"
    if "pytorch_backend" in args.model_module:
        args.backend = "pytorch"

    # add version info in args
    args.version = __version__

    # logging info
    if args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skip DEBUG/INFO messages")

    # If --ngpu is not given,
    #   1. if CUDA_VISIBLE_DEVICES is set, all visible devices
    #   2. if nvidia-smi exists, use all devices
    #   3. else ngpu=0
    if args.ngpu is None:
        cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
        if cvd is not None:
            # An empty value hides every device, so blank entries do not count.
            ngpu = len([dev for dev in cvd.split(",") if dev.strip()])
        else:
            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
            try:
                p = subprocess.run(
                    ["nvidia-smi", "-L"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
                )
            except (subprocess.SubprocessError, OSError):
                # nvidia-smi is missing or cannot be executed
                ngpu = 0
            else:
                if p.returncode != 0:
                    # e.g. the driver is not loaded
                    ngpu = 0
                else:
                    # The device list is written to stdout, one "GPU <n>: ..."
                    # line per device (nested MIG lines are indented).
                    ngpu = sum(
                        1
                        for line in p.stdout.decode().splitlines()
                        if line.startswith("GPU ")
                    )
        args.ngpu = ngpu
    else:
        if is_torch_1_2_plus and args.ngpu != 1:
            logging.debug(
                "There are some bugs with multi-GPU processing in PyTorch 1.2+"
                + " (see https://github.com/pytorch/pytorch/issues/21108)"
            )
        ngpu = args.ngpu
    logging.info(f"ngpu: {ngpu}")

    # display PYTHONPATH
    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))

    # set random seed
    logging.info("random seed = %d" % args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)

    # load dictionary for debug log
    if args.dict is not None:
        with open(args.dict, "rb") as f:
            dictionary = f.readlines()
        # The first space-separated field of each line is the token itself.
        char_list = [entry.decode("utf-8").split(" ")[0] for entry in dictionary]
        char_list.insert(0, "<blank>")
        char_list.append("<eos>")
        args.char_list = char_list
    else:
        args.char_list = None

    # train
    logging.info("backend = " + args.backend)

    if args.backend == "pytorch":
        from espnet.st.pytorch_backend.st import train

        train(args)
    else:
        raise ValueError("Only pytorch are supported.")


# Script entry point: hand the command line arguments (without the program
# name) to main() when this file is executed directly.
if __name__ == "__main__":
    main(sys.argv[1:])


================================================
FILE: bin/st_trans.py
================================================
#!/usr/bin/env python3
# encoding: utf-8

# Copyright 2019 Kyoto University (Hirofumi Inaguma)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""End-to-end speech translation model decoding script."""

import logging
import os
import random
import sys

import configargparse
import numpy as np


# NOTE: you need this func to generate our sphinx doc
def get_parser():
    """Build the command line parser of the speech translation decoding script.

    Returns:
        configargparse.ArgumentParser: Parser with the general, task, model,
            search and multilingual options registered.

    """
    parser = configargparse.ArgumentParser(
        description=(
            "Translate text from speech using a speech translation "
            "model on one CPU or GPU"
        ),
        config_file_parser_class=configargparse.YAMLConfigFileParser,
        formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
    )
    add = parser.add_argument  # shorthand used for the regular options

    # general configuration
    parser.add("--config", help="Config file path", is_config_file=True)
    parser.add(
        "--config2",
        help="Second config file path that overwrites the settings in `--config`",
        is_config_file=True,
    )
    parser.add(
        "--config3",
        help=(
            "Third config file path that overwrites "
            "the settings in `--config` and `--config2`"
        ),
        is_config_file=True,
    )

    add("--ngpu", default=0, type=int, help="Number of GPUs")
    add(
        "--dtype",
        default="float32",
        choices=("float16", "float32", "float64"),
        help="Float precision (only available in --api v2)",
    )
    add(
        "--backend",
        default="chainer",
        type=str,
        choices=["chainer", "pytorch"],
        help="Backend library",
    )
    add("--debugmode", default=1, type=int, help="Debugmode")
    add("--seed", default=1, type=int, help="Random seed")
    add("--verbose", "-V", default=1, type=int, help="Verbose option")
    add(
        "--batchsize",
        default=1,
        type=int,
        help="Batch size for beam search (0: means no batch processing)",
    )
    add(
        "--preprocess-conf",
        default=None,
        type=str,
        help="The configuration file for the pre-processing",
    )
    add(
        "--api",
        default="v1",
        choices=["v1", "v2"],
        help=(
            "Beam search APIs "
            "v1: Default API. "
            "It only supports the ASRInterface.recognize method and DefaultRNNLM. "
            "v2: Experimental API. "
            "It supports any models that implements ScorerInterface."
        ),
    )

    # task related
    add("--trans-json", type=str, help="Filename of translation data (json)")
    add(
        "--result-label",
        required=True,
        type=str,
        help="Filename of result label data (json)",
    )

    # model (parameter) related
    add("--model", required=True, type=str, help="Model file parameters to read")

    # search related
    add("--nbest", default=1, type=int, help="Output N-best hypotheses")
    add("--beam-size", default=1, type=int, help="Beam size")
    add("--penalty", default=0.0, type=float, help="Incertion penalty")
    add(
        "--maxlenratio",
        default=0.0,
        type=float,
        help="""Input length ratio to obtain max output length.
                        If maxlenratio=0.0 (default), it uses a end-detect function
                        to automatically find maximum hypothesis lengths""",
    )
    add(
        "--minlenratio",
        default=0.0,
        type=float,
        help="Input length ratio to obtain min output length",
    )

    # multilingual related
    add(
        "--tgt-lang",
        default=False,
        type=str,
        help="target language ID (e.g., <en>, <de>, and <fr> etc.)",
    )
    return parser


def main(args):
    """Run the main decoding function.

    Args:
        args (list[str]): Command line arguments without the program name.

    Raises:
        NotImplementedError: If ``--dtype`` is not ``float32``.
        ValueError: If the selected backend is not pytorch.

    """
    args = get_parser().parse_args(args)

    # logging info: 1 -> INFO, 2 -> DEBUG, anything else -> WARN
    log_format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    verbose_levels = {1: logging.INFO, 2: logging.DEBUG}
    if args.verbose in verbose_levels:
        logging.basicConfig(level=verbose_levels[args.verbose], format=log_format)
    else:
        logging.basicConfig(level=logging.WARN, format=log_format)
        logging.warning("Skip DEBUG/INFO messages")

    # check CUDA_VISIBLE_DEVICES
    if args.ngpu > 0:
        visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
        if visible_devices is None:
            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
        elif len(visible_devices.split(",")) != args.ngpu:
            logging.error("#gpus is not matched with CUDA_VISIBLE_DEVICES.")
            sys.exit(1)

        # TODO(mn5k): support of multiple GPUs
        if args.ngpu > 1:
            logging.error("The program only supports ngpu=1.")
            sys.exit(1)

    # display PYTHONPATH
    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))

    # seed setting
    random.seed(args.seed)
    np.random.seed(args.seed)
    logging.info("set random seed = %d" % args.seed)

    # trans
    logging.info("backend = " + args.backend)
    if args.backend != "pytorch":
        raise ValueError("Only pytorch are supported.")

    # The decoder is imported lazily, only once the backend is confirmed.
    from espnet.st.pytorch_backend.st import trans

    if args.dtype != "float32":
        raise NotImplementedError(
            f"`--dtype {args.dtype}` is only available with `--api v2`"
        )
    trans(args)


# Script entry point: hand the command line arguments (without the program
# name) to main() when this file is executed directly.
if __name__ == "__main__":
    main(sys.argv[1:])


================================================
FILE: bin/tts_decode.py
================================================
#!/usr/bin/env python3

# Copyright 2018 Nagoya University (Tomoki Hayashi)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""TTS decoding script."""

import configargparse
import logging
import os
import platform
import subprocess
import sys

from espnet.utils.cli_utils import strtobool


# NOTE: you need this func to generate our sphinx doc
def get_parser():
    """Build the command line parser of the TTS decoding script.

    Returns:
        configargparse.ArgumentParser: Parser with the general, task,
            decoding and save related options registered.

    """
    parser = configargparse.ArgumentParser(
        description="Synthesize speech from text using a TTS model on one CPU",
        config_file_parser_class=configargparse.YAMLConfigFileParser,
        formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
    )
    add = parser.add_argument  # shorthand used for the regular options

    # general configuration
    parser.add("--config", help="config file path", is_config_file=True)
    parser.add(
        "--config2",
        help="second config file path that overwrites the settings in `--config`.",
        is_config_file=True,
    )
    parser.add(
        "--config3",
        help=(
            "third config file path that overwrites "
            "the settings in `--config` and `--config2`."
        ),
        is_config_file=True,
    )

    add("--ngpu", type=int, default=0, help="Number of GPUs")
    add(
        "--backend",
        type=str,
        default="pytorch",
        choices=["chainer", "pytorch"],
        help="Backend library",
    )
    add("--debugmode", type=int, default=1, help="Debugmode")
    add("--seed", type=int, default=1, help="Random seed")
    add("--out", type=str, required=True, help="Output filename")
    add("--verbose", "-V", type=int, default=0, help="Verbose option")
    add(
        "--preprocess-conf",
        type=str,
        default=None,
        help="The configuration file for the pre-processing",
    )

    # task related
    add("--json", type=str, required=True, help="Filename of train label data (json)")
    add("--model", type=str, required=True, help="Model file parameters to read")
    add("--model-conf", type=str, default=None, help="Model config file")

    # decoding related
    add("--maxlenratio", type=float, default=5, help="Maximum length ratio in decoding")
    add("--minlenratio", type=float, default=0, help="Minimum length ratio in decoding")
    add("--threshold", type=float, default=0.5, help="Threshold value in decoding")
    add(
        "--use-att-constraint",
        type=strtobool,
        default=False,
        help="Whether to use the attention constraint",
    )
    add(
        "--backward-window",
        type=int,
        default=1,
        help="Backward window size in the attention constraint",
    )
    add(
        "--forward-window",
        type=int,
        default=3,
        help="Forward window size in the attention constraint",
    )
    add(
        "--fastspeech-alpha",
        type=float,
        default=1.0,
        help="Alpha to change the speed for FastSpeech",
    )

    # save related
    add(
        "--save-durations",
        type=strtobool,
        default=False,
        help="Whether to save durations converted from attentions",
    )
    add(
        "--save-focus-rates",
        type=strtobool,
        default=False,
        help="Whether to save focus rates of attentions",
    )
    return parser


def main(args):
    """Run TTS decoding.

    Args:
        args (list[str]): Command line arguments without the program name.

    Raises:
        NotImplementedError: If the selected backend is not pytorch.

    """
    args = get_parser().parse_args(args)

    # logging info
    log_format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=log_format)
    else:
        logging.basicConfig(level=logging.WARN, format=log_format)
        logging.warning("Skip DEBUG/INFO messages")

    # check CUDA_VISIBLE_DEVICES
    if args.ngpu > 0:
        # Under python 2 the subprocess output is used as returned; under
        # python 3 it is decoded first.
        is_python2 = platform.python_version_tuple()[0] == "2"

        hostname = subprocess.check_output(["hostname", "-f"])
        if not is_python2:
            hostname = hostname.decode()

        if "clsp.jhu.edu" in hostname:
            free_gpu = subprocess.check_output(
                ["/usr/local/bin/free-gpu", "-n", str(args.ngpu)]
            )
            if not is_python2:
                free_gpu = free_gpu.decode()
            cvd = free_gpu.strip()
            logging.info("CLSP: use gpu" + cvd)
            os.environ["CUDA_VISIBLE_DEVICES"] = cvd

        cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
        if cvd is None:
            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
        elif len(cvd.split(",")) != args.ngpu:
            logging.error("#gpus is not matched with CUDA_VISIBLE_DEVICES.")
            sys.exit(1)

    # display PYTHONPATH
    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))

    # extract
    logging.info("backend = " + args.backend)
    if args.backend != "pytorch":
        raise NotImplementedError("Only pytorch is supported.")

    # The decoder is imported lazily, only once the backend is confirmed.
    from espnet.tts.pytorch_backend.tts import decode

    decode(args)


# Script entry point: hand the command line arguments (without the program
# name) to main() when this file is executed directly.
if __name__ == "__main__":
    main(sys.argv[1:])


================================================
FILE: bin/tts_train.py
================================================
#!/usr/bin/env python3

# Copyright 2018 Nagoya University (Tomoki Hayashi)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Text-to-speech model training script."""

import logging
import os
import random
import subprocess
import sys

import configargparse
import numpy as np

from espnet import __version__
from espnet.nets.tts_interface import TTSInterface
from espnet.utils.cli_utils import strtobool
from espnet.utils.training.batchfy import BATCH_COUNT_CHOICES


# NOTE: you need this func to generate our sphinx doc
def get_parser():
    """Build the command line parser of the TTS training script.

    Returns:
        configargparse.ArgumentParser: Parser with the general, task, network,
            minibatch, optimization and finetuning options registered.

    """

    def module_list(s):
        """Split a comma separated string into a list (empty string -> [])."""
        if s == "":
            return []
        return [str(mod) for mod in s.split(",")]

    parser = configargparse.ArgumentParser(
        description=(
            "Train a new text-to-speech (TTS) model on one CPU, "
            "one or multiple GPUs"
        ),
        config_file_parser_class=configargparse.YAMLConfigFileParser,
        formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
    )
    add = parser.add_argument  # shorthand used for the regular options

    # general configuration
    parser.add("--config", help="config file path", is_config_file=True)
    parser.add(
        "--config2",
        help="second config file path that overwrites the settings in `--config`.",
        is_config_file=True,
    )
    parser.add(
        "--config3",
        help=(
            "third config file path that overwrites "
            "the settings in `--config` and `--config2`."
        ),
        is_config_file=True,
    )

    add(
        "--ngpu",
        type=int,
        default=None,
        help="Number of GPUs. If not given, use all visible devices",
    )
    add(
        "--backend",
        type=str,
        default="pytorch",
        choices=["chainer", "pytorch"],
        help="Backend library",
    )
    add("--outdir", type=str, required=True, help="Output directory")
    add("--debugmode", type=int, default=1, help="Debugmode")
    add("--seed", type=int, default=1, help="Random seed")
    add(
        "--resume",
        "-r",
        type=str,
        default="",
        nargs="?",
        help="Resume the training from snapshot",
    )
    add(
        "--minibatches",
        "-N",
        type=int,
        default="-1",
        help="Process only N minibatches (for debug)",
    )
    add("--verbose", "-V", type=int, default=0, help="Verbose option")
    add(
        "--tensorboard-dir",
        type=str,
        default=None,
        nargs="?",
        help="Tensorboard log directory path",
    )
    add(
        "--eval-interval-epochs",
        type=int,
        default=1,
        help="Evaluation interval epochs",
    )
    add("--save-interval-epochs", type=int, default=1, help="Save interval epochs")
    add(
        "--report-interval-iters",
        type=int,
        default=100,
        help="Report interval iterations",
    )

    # task related
    add("--train-json", type=str, required=True, help="Filename of training json")
    add("--valid-json", type=str, required=True, help="Filename of validation json")

    # network architecture
    add(
        "--model-module",
        type=str,
        default="espnet.nets.pytorch_backend.e2e_tts_tacotron2:Tacotron2",
        help="model defined module",
    )

    # minibatch related
    add(
        "--sortagrad",
        type=int,
        default=0,
        nargs="?",
        help="How many epochs to use sortagrad for. 0 = deactivated, -1 = all epochs",
    )
    add(
        "--batch-sort-key",
        type=str,
        default="shuffle",
        choices=["shuffle", "output", "input"],
        nargs="?",
        help='Batch sorting key. "shuffle" only work with --batch-count "seq".',
    )
    add(
        "--batch-count",
        default="auto",
        choices=BATCH_COUNT_CHOICES,
        help=(
            "How to count batch_size. "
            "The default (auto) will find how to count by args."
        ),
    )
    add(
        "--batch-size",
        "--batch-seqs",
        "-b",
        type=int,
        default=0,
        help="Maximum seqs in a minibatch (0 to disable)",
    )
    add(
        "--batch-bins",
        type=int,
        default=0,
        help="Maximum bins in a minibatch (0 to disable)",
    )
    add(
        "--batch-frames-in",
        type=int,
        default=0,
        help="Maximum input frames in a minibatch (0 to disable)",
    )
    add(
        "--batch-frames-out",
        type=int,
        default=0,
        help="Maximum output frames in a minibatch (0 to disable)",
    )
    add(
        "--batch-frames-inout",
        type=int,
        default=0,
        help="Maximum input+output frames in a minibatch (0 to disable)",
    )
    add(
        "--maxlen-in",
        "--batch-seq-maxlen-in",
        type=int,
        default=100,
        metavar="ML",
        help=(
            "When --batch-count=seq, "
            "batch size is reduced if the input sequence length > ML."
        ),
    )
    add(
        "--maxlen-out",
        "--batch-seq-maxlen-out",
        type=int,
        default=200,
        metavar="ML",
        help=(
            "When --batch-count=seq, "
            "batch size is reduced if the output sequence length > ML"
        ),
    )
    add(
        "--num-iter-processes",
        type=int,
        default=0,
        help="Number of processes of iterator",
    )
    add(
        "--preprocess-conf",
        type=str,
        default=None,
        help="The configuration file for the pre-processing",
    )
    add(
        "--use-speaker-embedding",
        type=strtobool,
        default=False,
        help="Whether to use speaker embedding",
    )
    add(
        "--use-second-target",
        type=strtobool,
        default=False,
        help="Whether to use second target",
    )

    # optimization related
    add("--opt", type=str, default="adam", choices=["adam", "noam"], help="Optimizer")
    add("--accum-grad", type=int, default=1, help="Number of gradient accumuration")
    add("--lr", type=float, default=1e-3, help="Learning rate for optimizer")
    add("--eps", type=float, default=1e-6, help="Epsilon for optimizer")
    add(
        "--weight-decay",
        type=float,
        default=1e-6,
        help="Weight decay coefficient for optimizer",
    )
    add("--epochs", "-e", type=int, default=30, help="Number of maximum epochs")
    add(
        "--early-stop-criterion",
        type=str,
        default="validation/main/loss",
        nargs="?",
        help="Value to monitor to trigger an early stopping of the training",
    )
    add(
        "--patience",
        type=int,
        default=3,
        nargs="?",
        help=(
            "Number of epochs to wait "
            "without improvement before stopping the training"
        ),
    )
    add("--grad-clip", type=float, default=1, help="Gradient norm threshold to clip")
    add(
        "--num-save-attention",
        type=int,
        default=5,
        help="Number of samples of attention to be saved",
    )
    add(
        "--keep-all-data-on-mem",
        type=strtobool,
        default=False,
        help="Whether to keep all data on memory",
    )

    # finetuning related
    add(
        "--enc-init",
        type=str,
        default=None,
        help="Pre-trained TTS model path to initialize encoder.",
    )
    add(
        "--enc-init-mods",
        type=module_list,
        default="enc.",
        help="List of encoder modules to initialize, separated by a comma.",
    )
    add(
        "--dec-init",
        type=str,
        default=None,
        help="Pre-trained TTS model path to initialize decoder.",
    )
    add(
        "--dec-init-mods",
        type=module_list,
        default="dec.",
        help="List of decoder modules to initialize, separated by a comma.",
    )
    add(
        "--freeze-mods",
        type=module_list,
        default=None,
        help="List of modules to freeze (not to train), separated by a comma.",
    )

    return parser


def main(cmd_args):
    """Run TTS training.

    The arguments are parsed twice: a first tolerant pass resolves the model
    class, which then registers its own options before the strict second pass.

    Args:
        cmd_args (list[str]): Command line arguments without the program name.

    Raises:
        AssertionError: If the class named by ``--model-module`` is not a
            subclass of ``TTSInterface``.
        NotImplementedError: If the selected backend is not pytorch.

    """
    parser = get_parser()
    # First pass: model-specific options are not registered yet, so unknown
    # arguments are tolerated here.
    args, _ = parser.parse_known_args(cmd_args)

    from espnet.utils.dynamic_import import dynamic_import

    model_class = dynamic_import(args.model_module)
    assert issubclass(model_class, TTSInterface)
    model_class.add_arguments(parser)
    # Second pass: every option (generic and model-specific) is now known.
    args = parser.parse_args(cmd_args)

    # add version info in args
    args.version = __version__

    # logging info
    if args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skip DEBUG/INFO messages")

    # If --ngpu is not given,
    #   1. if CUDA_VISIBLE_DEVICES is set, all visible devices
    #   2. if nvidia-smi exists, use all devices
    #   3. else ngpu=0
    if args.ngpu is None:
        cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
        if cvd is not None:
            # An empty value hides every device, so blank entries do not count.
            ngpu = len([dev for dev in cvd.split(",") if dev.strip()])
        else:
            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
            try:
                p = subprocess.run(
                    ["nvidia-smi", "-L"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
                )
            except (subprocess.SubprocessError, OSError):
                # nvidia-smi is missing or cannot be executed
                ngpu = 0
            else:
                if p.returncode != 0:
                    # e.g. the driver is not loaded
                    ngpu = 0
                else:
                    # The device list is written to stdout, one "GPU <n>: ..."
                    # line per device (nested MIG lines are indented).
                    ngpu = sum(
                        1
                        for line in p.stdout.decode().splitlines()
                        if line.startswith("GPU ")
                    )
        args.ngpu = ngpu
    else:
        ngpu = args.ngpu
    logging.info(f"ngpu: {ngpu}")

    # set random seed
    logging.info("random seed = %d" % args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)

    if args.backend == "pytorch":
        from espnet.tts.pytorch_backend.tts import train

        train(args)
    else:
        raise NotImplementedError("Only pytorch is supported.")


# Script entry point: hand the command line arguments (without the program
# name) to main() when this file is executed directly.
if __name__ == "__main__":
    main(sys.argv[1:])


================================================
FILE: bin/vc_decode.py
================================================
#!/usr/bin/env python3

# Copyright 2020 Nagoya University (Wen-Chin Huang)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""VC decoding script."""

import configargparse
import logging
import os
import platform
import subprocess
import sys

from espnet.utils.cli_utils import strtobool


# NOTE: you need this func to generate our sphinx doc
def get_parser():
    """Build the command line parser of the VC decoding script.

    Returns:
        configargparse.ArgumentParser: Parser with the general, task,
            decoding and save related options registered.

    """
    parser = configargparse.ArgumentParser(
        description="Converting speech using a VC model on one CPU",
        config_file_parser_class=configargparse.YAMLConfigFileParser,
        formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
    )
    add = parser.add_argument  # shorthand used for the regular options

    # general configuration
    parser.add("--config", help="config file path", is_config_file=True)
    parser.add(
        "--config2",
        help="second config file path that overwrites the settings in `--config`.",
        is_config_file=True,
    )
    parser.add(
        "--config3",
        help=(
            "third config file path that overwrites the settings "
            "in `--config` and `--config2`."
        ),
        is_config_file=True,
    )

    add("--ngpu", type=int, default=0, help="Number of GPUs")
    add(
        "--backend",
        type=str,
        default="pytorch",
        choices=["chainer", "pytorch"],
        help="Backend library",
    )
    add("--debugmode", type=int, default=1, help="Debugmode")
    add("--seed", type=int, default=1, help="Random seed")
    add("--out", type=str, required=True, help="Output filename")
    add("--verbose", "-V", type=int, default=0, help="Verbose option")
    add(
        "--preprocess-conf",
        type=str,
        default=None,
        help="The configuration file for the pre-processing",
    )

    # task related
    add("--json", type=str, required=True, help="Filename of train label data (json)")
    add("--model", type=str, required=True, help="Model file parameters to read")
    add("--model-conf", type=str, default=None, help="Model config file")

    # decoding related
    add("--maxlenratio", type=float, default=5, help="Maximum length ratio in decoding")
    add("--minlenratio", type=float, default=0, help="Minimum length ratio in decoding")
    add("--threshold", type=float, default=0.5, help="Threshold value in decoding")
    add(
        "--use-att-constraint",
        type=strtobool,
        default=False,
        help="Whether to use the attention constraint",
    )
    add(
        "--backward-window",
        type=int,
        default=1,
        help="Backward window size in the attention constraint",
    )
    add(
        "--forward-window",
        type=int,
        default=3,
        help="Forward window size in the attention constraint",
    )

    # save related
    add(
        "--save-durations",
        type=strtobool,
        default=False,
        help="Whether to save durations converted from attentions",
    )
    add(
        "--save-focus-rates",
        type=strtobool,
        default=False,
        help="Whether to save focus rates of attentions",
    )
    return parser


def main(args):
    """Run VC decoding.

    Args:
        args (list[str]): Command line arguments without the program name.

    Raises:
        NotImplementedError: If the selected backend is not pytorch.

    """
    args = get_parser().parse_args(args)

    # logging info
    log_format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=log_format)
    else:
        logging.basicConfig(level=logging.WARN, format=log_format)
        logging.warning("Skip DEBUG/INFO messages")

    # check CUDA_VISIBLE_DEVICES
    if args.ngpu > 0:
        # Under python 2 the subprocess output is used as returned; under
        # python 3 it is decoded first.
        is_python2 = platform.python_version_tuple()[0] == "2"

        hostname = subprocess.check_output(["hostname", "-f"])
        if not is_python2:
            hostname = hostname.decode()

        if "clsp.jhu.edu" in hostname:
            free_gpu = subprocess.check_output(
                ["/usr/local/bin/free-gpu", "-n", str(args.ngpu)]
            )
            if not is_python2:
                free_gpu = free_gpu.decode()
            cvd = free_gpu.strip()
            logging.info("CLSP: use gpu" + cvd)
            os.environ["CUDA_VISIBLE_DEVICES"] = cvd

        cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
        if cvd is None:
            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
        elif len(cvd.split(",")) != args.ngpu:
            logging.error("#gpus is not matched with CUDA_VISIBLE_DEVICES.")
            sys.exit(1)

    # display PYTHONPATH
    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))

    # extract
    logging.info("backend = " + args.backend)
    if args.backend != "pytorch":
        raise NotImplementedError("Only pytorch is supported.")

    # The decoder is imported lazily, only once the backend is confirmed.
    from espnet.vc.pytorch_backend.vc import decode

    decode(args)


# Script entry point: hand the command line arguments (without the program
# name) to main() when this file is executed directly.
if __name__ == "__main__":
    main(sys.argv[1:])


================================================
FILE: bin/vc_train.py
================================================
#!/usr/bin/env python3

# Copyright 2020 Nagoya University (Wen-Chin Huang)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Voice conversion model training script."""

import logging
import os
import random
import subprocess
import sys

import configargparse
import numpy as np

from espnet import __version__
from espnet.nets.tts_interface import TTSInterface
from espnet.utils.cli_utils import strtobool
from espnet.utils.training.batchfy import BATCH_COUNT_CHOICES


# NOTE: you need this func to generate our sphinx doc
def get_parser():
    """Build the command-line / config-file parser for VC training.

    Only the model-independent options are registered here; ``main`` adds
    the options of the selected model class afterwards.

    Returns:
        configargparse.ArgumentParser: Parser that accepts up to three YAML
            config files (``--config``, ``--config2``, ``--config3``) in
            addition to command-line options.

    """

    def comma_separated_modules(s):
        """Split a comma-separated option value, dropping empty entries."""
        # Filter on each entry (not on the whole string) so that "a,,b" or a
        # trailing comma does not yield an empty module name.
        return [str(mod) for mod in s.split(",") if mod != ""]

    parser = configargparse.ArgumentParser(
        description="Train a new voice conversion (VC) model on one CPU, "
        "one or multiple GPUs",
        config_file_parser_class=configargparse.YAMLConfigFileParser,
        formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
    )

    # general configuration
    parser.add("--config", is_config_file=True, help="config file path")
    parser.add(
        "--config2",
        is_config_file=True,
        help="second config file path that overwrites the settings in `--config`.",
    )
    parser.add(
        "--config3",
        is_config_file=True,
        help="third config file path that overwrites the settings "
        "in `--config` and `--config2`.",
    )

    parser.add_argument(
        "--ngpu",
        default=None,
        type=int,
        help="Number of GPUs. If not given, use all visible devices",
    )
    parser.add_argument(
        "--backend",
        default="pytorch",
        type=str,
        choices=["chainer", "pytorch"],
        help="Backend library",
    )
    parser.add_argument("--outdir", type=str, required=True, help="Output directory")
    parser.add_argument("--debugmode", default=1, type=int, help="Debugmode")
    parser.add_argument("--seed", default=1, type=int, help="Random seed")
    parser.add_argument(
        "--resume",
        "-r",
        default="",
        type=str,
        nargs="?",
        help="Resume the training from snapshot",
    )
    parser.add_argument(
        "--minibatches",
        "-N",
        type=int,
        default=-1,
        help="Process only N minibatches (for debug)",
    )
    parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
    parser.add_argument(
        "--tensorboard-dir",
        default=None,
        type=str,
        nargs="?",
        help="Tensorboard log directory path",
    )
    parser.add_argument(
        "--eval-interval-epochs",
        default=100,
        type=int,
        help="Evaluation interval epochs",
    )
    parser.add_argument(
        "--save-interval-epochs", default=1, type=int, help="Save interval epochs"
    )
    parser.add_argument(
        "--report-interval-iters",
        default=10,
        type=int,
        help="Report interval iterations",
    )
    # task related
    parser.add_argument("--srcspk", type=str, help="Source speaker")
    parser.add_argument("--trgspk", type=str, help="Target speaker")
    parser.add_argument(
        "--train-json", type=str, required=True, help="Filename of training json"
    )
    parser.add_argument(
        "--valid-json", type=str, required=True, help="Filename of validation json"
    )

    # network architecture
    parser.add_argument(
        "--model-module",
        type=str,
        default="espnet.nets.pytorch_backend.e2e_tts_tacotron2:Tacotron2",
        help="model defined module",
    )
    # minibatch related
    parser.add_argument(
        "--sortagrad",
        default=0,
        type=int,
        nargs="?",
        help="How many epochs to use sortagrad for. 0 = deactivated, -1 = all epochs",
    )
    parser.add_argument(
        "--batch-sort-key",
        default="shuffle",
        type=str,
        choices=["shuffle", "output", "input"],
        nargs="?",
        help='Batch sorting key. "shuffle" only work with --batch-count "seq".',
    )
    parser.add_argument(
        "--batch-count",
        default="auto",
        choices=BATCH_COUNT_CHOICES,
        help="How to count batch_size. "
        "The default (auto) will find how to count by args.",
    )
    parser.add_argument(
        "--batch-size",
        "--batch-seqs",
        "-b",
        default=0,
        type=int,
        help="Maximum seqs in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--batch-bins",
        default=0,
        type=int,
        help="Maximum bins in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--batch-frames-in",
        default=0,
        type=int,
        help="Maximum input frames in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--batch-frames-out",
        default=0,
        type=int,
        help="Maximum output frames in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--batch-frames-inout",
        default=0,
        type=int,
        help="Maximum input+output frames in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--maxlen-in",
        "--batch-seq-maxlen-in",
        default=100,
        type=int,
        metavar="ML",
        help="When --batch-count=seq, "
        "batch size is reduced if the input sequence length > ML.",
    )
    parser.add_argument(
        "--maxlen-out",
        "--batch-seq-maxlen-out",
        default=200,
        type=int,
        metavar="ML",
        help="When --batch-count=seq, "
        "batch size is reduced if the output sequence length > ML",
    )
    parser.add_argument(
        "--num-iter-processes",
        default=0,
        type=int,
        help="Number of processes of iterator",
    )
    parser.add_argument(
        "--preprocess-conf",
        type=str,
        default=None,
        help="The configuration file for the pre-processing",
    )
    parser.add_argument(
        "--use-speaker-embedding",
        default=False,
        type=strtobool,
        help="Whether to use speaker embedding",
    )
    parser.add_argument(
        "--use-second-target",
        default=False,
        type=strtobool,
        help="Whether to use second target",
    )
    # optimization related
    parser.add_argument(
        "--opt",
        default="adam",
        type=str,
        choices=["adam", "noam", "lamb"],
        help="Optimizer",
    )
    parser.add_argument(
        "--accum-grad", default=1, type=int, help="Number of gradient accumulation"
    )
    parser.add_argument(
        "--lr", default=1e-3, type=float, help="Learning rate for optimizer"
    )
    parser.add_argument("--eps", default=1e-6, type=float, help="Epsilon for optimizer")
    parser.add_argument(
        "--weight-decay",
        default=1e-6,
        type=float,
        help="Weight decay coefficient for optimizer",
    )
    parser.add_argument(
        "--epochs", "-e", default=30, type=int, help="Number of maximum epochs"
    )
    parser.add_argument(
        "--early-stop-criterion",
        default="validation/main/loss",
        type=str,
        nargs="?",
        help="Value to monitor to trigger an early stopping of the training",
    )
    parser.add_argument(
        "--patience",
        default=3,
        type=int,
        nargs="?",
        help="Number of epochs to wait without improvement "
        "before stopping the training",
    )
    parser.add_argument(
        "--grad-clip", default=1, type=float, help="Gradient norm threshold to clip"
    )
    parser.add_argument(
        "--num-save-attention",
        default=5,
        type=int,
        help="Number of samples of attention to be saved",
    )
    parser.add_argument(
        "--keep-all-data-on-mem",
        default=False,
        type=strtobool,
        help="Whether to keep all data on memory",
    )

    # pre-trained initialization / freezing
    # NOTE: string defaults below are converted by ``type`` as well, so
    # e.g. the default "enc." reaches the program as ["enc."].
    parser.add_argument(
        "--enc-init",
        default=None,
        type=str,
        help="Pre-trained model path to initialize encoder.",
    )
    parser.add_argument(
        "--enc-init-mods",
        default="enc.",
        type=comma_separated_modules,
        help="List of encoder modules to initialize, separated by a comma.",
    )
    parser.add_argument(
        "--dec-init",
        default=None,
        type=str,
        help="Pre-trained model path to initialize decoder.",
    )
    parser.add_argument(
        "--dec-init-mods",
        default="dec.",
        type=comma_separated_modules,
        help="List of decoder modules to initialize, separated by a comma.",
    )
    parser.add_argument(
        "--freeze-mods",
        default=None,
        type=comma_separated_modules,
        help="List of modules to freeze (not to train), separated by a comma.",
    )

    return parser


def _count_gpus_from_nvidia_smi():
    """Return the number of GPUs reported by ``nvidia-smi -L`` (0 if unavailable)."""
    try:
        proc = subprocess.run(
            ["nvidia-smi", "-L"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
    except OSError:
        # nvidia-smi is not installed or cannot be executed.
        return 0
    # subprocess.run() without check=True never raises on a non-zero exit
    # status, so a failing nvidia-smi has to be detected explicitly.
    if proc.returncode != 0:
        return 0
    # The device list is written to stdout, one "GPU <idx>: ..." line each.
    listing = proc.stdout.decode(errors="replace").splitlines()
    return sum(1 for line in listing if line.startswith("GPU "))


def main(cmd_args):
    """Run training.

    Args:
        cmd_args (List[str]): Command-line arguments without the program
            name, e.g. ``sys.argv[1:]``.

    Raises:
        NotImplementedError: If a backend other than ``pytorch`` is requested.

    """
    parser = get_parser()
    # First pass: only the generic options are known, which is enough to
    # find out which model class to load.
    args, _ = parser.parse_known_args(cmd_args)

    from espnet.utils.dynamic_import import dynamic_import

    model_class = dynamic_import(args.model_module)
    assert issubclass(model_class, TTSInterface)
    # Second pass: parse again with the model-specific options registered.
    model_class.add_arguments(parser)
    args = parser.parse_args(cmd_args)

    # add version info in args
    args.version = __version__

    # logging info
    if args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skip DEBUG/INFO messages")

    # If --ngpu is not given,
    #   1. if CUDA_VISIBLE_DEVICES is set, all visible devices
    #   2. if nvidia-smi exists, use all devices
    #   3. else ngpu=0
    if args.ngpu is None:
        cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
        if cvd is not None:
            # An empty CUDA_VISIBLE_DEVICES hides every device, so only
            # non-empty entries are counted.
            ngpu = sum(1 for dev in cvd.split(",") if dev.strip())
        else:
            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
            ngpu = _count_gpus_from_nvidia_smi()
    else:
        ngpu = args.ngpu
    logging.info(f"ngpu: {ngpu}")
    # train() only receives ``args``; without writing the resolved value back
    # the detection above would have no effect beyond the log line.
    args.ngpu = ngpu

    # set random seed
    logging.info("random seed = %d" % args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)

    if args.backend == "pytorch":
        from espnet.vc.pytorch_backend.vc import train

        train(args)
    else:
        raise NotImplementedError("Only pytorch is supported.")


if __name__ == "__main__":
    main(sys.argv[1:])


================================================
FILE: egs/.gitignore
================================================
launch
espnet-2021724
segment_aishell1
word_ngram


================================================
FILE: egs/aishell1/.gitignore
================================================
dump
dump32
dump64
data
exp
fbank


================================================
FILE: egs/aishell1/aed.sh
================================================
#!/usr/bin/env bash

# author: tyriontian
# tyriontian@tencent.com

# AISHELL-1 recipe: stage 1 trains an ASR model with asr_train.py through
# torch.distributed.launch; stage 2 averages checkpoints, decodes each set
# in ${recog_set} with asr_recog.py and scores the output.
# Only stages 1 and 2 exist in this script; there is no data-preparation stage.

# cmd.sh provides ${decode_cmd} (used in stage 2).
# NOTE(review): MAIN_ROOT (used in stage 1) is presumably exported by path.sh - confirm.
. ./path.sh || exit 1;
. ./cmd.sh || exit 1;

# general configuration
backend=pytorch
stage=0        # start from 0 if you need to start from data preparation
stop_stage=100
debugmode=1
dumpdir=dump   # directory to dump full features
N=0            # number of minibatches to be used (mainly for debugging). "0" uses all minibatches.
verbose=0      # verbose option
resume=        # Resume the training from snapshot
debug=false    # if true, export single-node/single-GPU launcher variables (see below parse_options)

# feature configuration
do_delta=false

preprocess_config=conf/specaug.yaml
train_config=conf/tuning/train_pytorch_conformer_kernel31.yaml
lm_config=conf/lm.yaml
# NOTE(review): conf/decode.yaml has to exist (e.g. as a link to a file
# under conf/tuning/) before stage 2 can run - confirm.
decode_config=conf/decode.yaml

# rnnlm related
lm_resume=         # specify a snapshot file to resume LM training
lmtag=             # tag for managing LMs

# ngram
ngramtag=
n_gram=4

# decoding parameter
recog_model=model.acc.best # set a model to be used for decoding: 'model.acc.best' or 'model.loss.best'

# data
data=/data/asr_data/aishell/
data_url=www.openslr.org/resources/33
dict=data/lang_1char/train_sp_units.txt
lang=data/lang_phone

### Configurable parameters ###
tag="8v100_lasmmictc_alpha03_ctc03"   # becomes part of the experiment directory name
# ngpu selects the split${ngpu}utt training json and is passed as --world-size
# in stage 1; stage 2 overwrites it with 0 for CPU decoding.
ngpu=8

# Train config
seed=888
batch_size=8
accum_grad=1
epochs=100
use_segment=true # if true, use word-level transcription in MMI criterion
ctc_type="k2mmi" # k2mmi | k2ctc | default
mtlalpha=0.3
third_weight=0.3

# MBR training config (only used when aux_mbr=true)
aux_mbr=false
aux_mbr_weight=1.0
aux_mbr_beam=4
mbr_epochs=100
mbr_lr=0.1
mbr_warmup=2500
mbr_resume=

# Decode config
idx_average=41_50   # used in the averaged model name and passed to average_checkpoints.py --num
mmi_weight=0.0 # MMI / phonectc joint decoding
ctc_weight=0.5 # char ctc joint decoding
ngram_weight=0.0
ngram_order=4
word_ngram_tag=word_3gram
word_ngram_weight=0.0
word_ngram_log_semiring=true
lm_weight=0.0
beam_size=10
mmi_rescore=false
recog_set="test dev"

# Presumably Kaldi-style option parsing: each variable defined above can be
# overridden on the command line as "--name value" - confirm against
# utils/parse_options.sh.
. utils/parse_options.sh || exit 1;

# Debug mode: describe a single node with a single GPU. These variables are
# read by the torch.distributed.launch command in stage 1; when debug is not
# true they have to be provided from elsewhere (e.g. the job environment).
if [ $debug == true ]; then
    export HOST_GPU_NUM=1
    export HOST_NUM=1
    export NODE_NUM=1
    export INDEX=0
    export CHIEF_IP="9.135.217.29"
fi

# Extra options appended to the asr_train.py command line in stage 1.
train_opts=\
"\
--seed $seed \
--batch-size $batch_size \
--accum-grad $accum_grad \
--epochs $epochs \
--use-segment $use_segment \
--ctc_type $ctc_type \
--mtlalpha $mtlalpha \
--third-weight $third_weight \
"

# MBR training adds its own options. --epochs and --resume are then given
# more than once on the final command line.
# NOTE(review): this relies on the parser keeping the last occurrence - confirm.
if [ $aux_mbr == true ]; then
    train_opts="$train_opts \
                --aux-mbr $aux_mbr \
                --aux-mbr-weight $aux_mbr_weight \
                --aux-mbr-beam $aux_mbr_beam \
                --transformer-lr $mbr_lr \
                --epochs $mbr_epochs \
                --transformer-warmup-steps $mbr_warmup \
                --resume $mbr_resume \
                --load-trainer-and-opt false \
                --save-interval-iters 1000 \
                "
    export OMP_NUM_THREADS=6 # for on-the-fly decoding
fi

# Extra options appended to the asr_recog.py command line in stage 2.
decode_opts=\
"\
--ctc-weight $ctc_weight \
--mmi-weight $mmi_weight \
--ngram-weight $ngram_weight \
--mmi-rescore $mmi_rescore \
--beam-size $beam_size \
--word-ngram data/${word_ngram_tag} \
--word-ngram-weight $word_ngram_weight \
--word-ngram-log-semiring $word_ngram_log_semiring \
--lm-weight $lm_weight \
"

# Strict mode: exit on
# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline'.
# Command tracing (-x) is NOT enabled here.
# Because of -u, every variable referenced below (HOST_GPU_NUM, HOST_NUM,
# INDEX, CHIEF_IP, MAIN_ROOT, ...) must be defined or the script aborts.
set -e
set -u
set -o pipefail

train_set=train_sp
train_dev=dev

# Experiment and feature directories are created up front, whatever stage runs.
expname=${train_set}_${backend}_${tag}
expdir=exp/${expname}
mkdir -p ${expdir}
feat_tr_dir=${dumpdir}/${train_set}/delta${do_delta}; mkdir -p ${feat_tr_dir}
feat_dt_dir=${dumpdir}/${train_dev}/delta${do_delta}; mkdir -p ${feat_dt_dir}
# Stage 1: distributed training.
# torch.distributed.launch starts ${HOST_GPU_NUM} processes on this node
# (node rank ${INDEX} of ${HOST_NUM}, master at ${CHIEF_IP}:${MASTER_PORT});
# each process runs asr_train.py with --ngpu 1.
# The literal "RANK" in --outdir and --train-json is passed through unchanged.
# NOTE(review): presumably asr_train.py substitutes the process rank for it - confirm.
# ${resume} is unquoted on purpose: when empty, a bare "--resume" is passed.
# stdout and stderr of the whole launch go to ${expdir}/global_record.${INDEX}.txt.
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    echo "stage 1: Network Training"

    # Cluster note: the jizhi config file needs "exec_start_in_all_mpi_pods": true
    MASTER_PORT=22277
    NCCL_DEBUG=TRACE python3 -m torch.distributed.launch \
        --nproc_per_node ${HOST_GPU_NUM} --master_port $MASTER_PORT \
        --nnodes=${HOST_NUM} --node_rank=${INDEX} --master_addr=${CHIEF_IP} \
        ${MAIN_ROOT}/bin/asr_train.py \
        --config ${train_config} \
        --preprocess-conf ${preprocess_config} \
        --ngpu 1 \
        --backend ${backend} \
        --outdir ${expdir}/results_RANK \
        --debugmode ${debugmode} \
        --dict ${dict} \
        --debugdir ${expdir} \
        --minibatches ${N} \
        --verbose ${verbose} \
        --resume ${resume} \
        --train-json ${feat_tr_dir}/split${ngpu}utt/data_tiny.RANK.json \
        --valid-json ${feat_dt_dir}/data.json \
        --lang $lang \
        --opt "noam_sgd" \
        --n-iter-processes 8 \
        --world-size $ngpu \
        --node-rank ${INDEX} \
        --node-size ${HOST_GPU_NUM} \
        $train_opts > ${expdir}/global_record.${INDEX}.txt 2>&1
fi

# Stage 2: checkpoint averaging, CPU decoding and scoring.
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    echo "stage 2: Decoding"
    nj=500   # number of parallel decoding jobs / json splits
    # For transformer, conformer or custom encoder/decoder configurations the
    # snapshots in results_0 are averaged and the averaged model replaces
    # ${recog_model}; otherwise ${recog_model} keeps its configured value.
    if [[ $(get_yaml.py ${train_config} model-module) = *transformer* ]] || \
           [[ $(get_yaml.py ${train_config} model-module) = *conformer* ]] || \
           [[ $(get_yaml.py ${train_config} etype) = custom ]] || \
           [[ $(get_yaml.py ${train_config} dtype) = custom ]]; then
        recog_model=model.last${idx_average}.avg.best
        echo ${expdir}/results_0/${recog_model}
        average_checkpoints.py --backend ${backend} \
         		       --snapshots ${expdir}/results_0/snapshot.ep.* \
        		       --out ${expdir}/results_0/${recog_model} \
        		       --num ${idx_average}
    fi

    # The decode directory name records the main decoding hyper-parameters so
    # that different settings do not overwrite each other.
    decode_parent_dir=decode_mmi${mmi_weight}_${word_ngram_tag}${word_ngram_weight}_ctc${ctc_weight}_beam${beam_size}_${idx_average}
    for rtask in ${recog_set}; do
        decode_dir=$decode_parent_dir/$rtask
        feat_recog_dir=${dumpdir}/${rtask}/delta${do_delta}

        # split data
        splitjson.py --parts ${nj} ${feat_recog_dir}/data.json

        #### use CPU for decoding
        ngpu=0

        # "JOB" is the job-array placeholder of ${decode_cmd}; it is replaced
        # by the job index in the log name and in the command arguments.
        ${decode_cmd} JOB=1:$nj ${expdir}/${decode_dir}/log/decode.JOB.log \
            asr_recog.py \
            --config ${decode_config} \
            --ngpu ${ngpu} \
            --backend ${backend} \
            --batchsize 0 \
            --recog-json ${feat_recog_dir}/split${nj}utt/data.JOB.json \
            --result-label ${expdir}/${decode_dir}/data.JOB.json \
            --model ${expdir}/results_0/${recog_model}  \
            --ngram-model exp/train_ngram/${ngram_order}gram.bin \
            --rnnlm exp/train_rnnlm_pytorch_lm_transformer/rnnlm.model.best \
            --rnnlm-conf exp/train_rnnlm_pytorch_lm_transformer/model.json \
            --local-rank JOB --api v2 \
            $decode_opts

        # Score the hypotheses; the report is written next to the decoding output.
        score_sclite.sh ${expdir}/${decode_dir} ${dict} \
          > ${expdir}/${decode_dir}/decode_result.txt

    done
    echo "Finished"
fi


================================================
FILE: egs/aishell1/cmd.sh
================================================
# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
# e.g.
#   run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
#
# Options:
#   --time <time>: Limit the maximum time to execute.
#   --mem <mem>: Limit the maximum memory usage.
#   --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
#   --num-threads <nthreads>: Specify the number of CPU cores.
#   --gpu <ngpu>: Specify the number of GPU devices.
#   --config: Change the configuration file from default.
#
# "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
# The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
# e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
# Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
#
# run.pl, queue.pl, slurm.pl, and ssh.pl have unified interface, not depending on its backend.
# These options are mapping to specific options for each backend and
# it is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
# If jobs failed, your configuration might be wrong for your environment.
#
#
# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
#   "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
# =========================================================~


# Choose how jobs are dispatched: "local", "sge", "slurm", "ssh", or "jhu".
cmd_backend='local'

# Each branch exports the same three launchers:
#   train_cmd  - general-purpose jobs
#   cuda_cmd   - "*_train.py" jobs ("--gpu" is appended optionally by run.sh)
#   decode_cmd - "*_recog.py" jobs
case "${cmd_backend}" in
    local)
        # Local machine, no job scheduling system.
        export train_cmd="run.pl"
        export cuda_cmd="run.pl"
        export decode_cmd="run.pl"
        ;;

    sge)
        # "qsub" (SGE, Torque, PBS, etc.)
        # Defaults live in conf/queue.conf. Change "-q g.q" there to the
        # queue of your environment ("qhost -q" lists the queue names).
        # Using "--gpu *" requires "complex_value" to be set up in the
        # system scheduler.
        export train_cmd="queue.pl"
        export cuda_cmd="queue.pl"
        export decode_cmd="queue.pl"
        ;;

    slurm)
        # "sbatch" (Slurm)
        # Defaults live in conf/slurm.conf. Change "-p cpu" and "-p gpu"
        # there to the partitions of your environment ("sinfo" lists them).
        # "--gpu *" works by default and is interpreted as "--gres gpu:*";
        # devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
        export train_cmd="slurm.pl"
        export cuda_cmd="slurm.pl"
        export decode_cmd="slurm.pl"
        ;;

    ssh)
        # Create ".queue/machines" listing the hosts that execute jobs,
        # one per line, e.g.
        #   host1
        #   host2
        #   host3
        # Password-less login (ssh keys) to every host is assumed.
        export train_cmd="ssh.pl"
        export cuda_cmd="ssh.pl"
        export decode_cmd="ssh.pl"
        ;;

    jhu)
        # Example of cluster-specific options (JHU CLSP cluster setup).
        # Adapt the command options to your own cluster environment.
        export train_cmd="queue.pl --mem 2G"
        export cuda_cmd="queue-freegpu.pl --mem 2G --gpu 1 --config conf/gpu.conf"
        export decode_cmd="queue.pl --mem 4G"
        ;;

    *)
        echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
        return 1
        ;;
esac


================================================
FILE: egs/aishell1/conf/fbank.conf
================================================
--sample-frequency=16000 
--num-mel-bins=80


================================================
FILE: egs/aishell1/conf/gpu.conf
================================================
# Default configuration
command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
option mem=* -l mem_free=$0,ram_free=$0
option mem=0          # Do not add anything to qsub_opts
option num_threads=* -pe smp $0
option num_threads=1  # Do not add anything to qsub_opts
option max_jobs_run=* -tc $0
default gpu=0
option gpu=0
option gpu=* -l 'hostname=b1[12345678]*|c*,gpu=$0' -q g.q

================================================
FILE: egs/aishell1/conf/lm.yaml
================================================
# rnnlm related
layer: 2
unit: 650
opt: sgd        # or adam
batchsize: 64   # batch size in LM training
epoch: 20      # if the data size is large, we can reduce this
patience: 3
maxlen: 100     # if sentence length > lm_maxlen, lm_batchsize is automatically reduced


================================================
FILE: egs/aishell1/conf/lm_rnn.yaml
================================================
lm.yaml

================================================
FILE: egs/aishell1/conf/lm_transformer.yaml
================================================
# This Transformer LM setting w/ 4 GPUs took around 60 days for 50 epochs.
# However, you can get better results in 6 days for 5 epochs (WER: 2.2/5.4/2.6/5.7)
# than LSTM LM (WER: 2.6/5.6/2.6/5.7) in 60 days for 20 epochs
# And if you do not have 4 GPUs, try accum-grad=4.

# network architecture
model-module: transformer
att-unit: 512
embed-unit: 128
head: 8
layer: 16
pos-enc: none
unit: 2048

# minibatch related
batchsize: 32
maxlen: 40

# optimization related
opt: adam
schedulers: lr=cosine
dropout-rate: 0.0
epoch: 50
gradclip: 1.0
lr: 1e-4
lr-cosine-total: 100000
lr-cosine-warmup: 1000
patience: 0
sortagrad: 0


================================================
FILE: egs/aishell1/conf/pitch.conf
================================================
--sample-frequency=16000


================================================
FILE: egs/aishell1/conf/queue.conf
================================================
# Default configuration
command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
option mem=* -l mem_free=$0,ram_free=$0
option mem=0          # Do not add anything to qsub_opts
option num_threads=* -pe smp $0
option num_threads=1  # Do not add anything to qsub_opts
option max_jobs_run=* -tc $0
default gpu=0
option gpu=0
option gpu=* -l gpu=$0 -q g.q


================================================
FILE: egs/aishell1/conf/slurm.conf
================================================
# Default configuration
command sbatch --export=PATH
option name=* --job-name $0
option time=* --time $0
option mem=* --mem-per-cpu $0
option mem=0
option num_threads=* --cpus-per-task $0
option num_threads=1 --cpus-per-task 1
option num_nodes=* --nodes $0
default gpu=0
option gpu=0 -p cpu
option gpu=* -p gpu --gres=gpu:$0 -c $0  # Recommend allocating more CPU than, or equal to the number of GPU
# note: the --max-jobs-run option is supported as a special case
# by slurm.pl and you don't have to handle it in the config file.


================================================
FILE: egs/aishell1/conf/specaug.yaml
================================================
process:
  # these three processes are a.k.a. SpecAugment
  - type: "time_warp"
    max_time_warp: 5
    inplace: true
    mode: "PIL"
  - type: "freq_mask"
    F: 30
    n_mask: 2
    inplace: true
    replace_with_zero: false
  - type: "time_mask"
    T: 40
    n_mask: 2
    inplace: true
    replace_with_zero: false


================================================
FILE: egs/aishell1/conf/specaug_test.yaml
================================================
process:
  # these three processes are a.k.a. SpecAugment
  - type: "time_warp"
    max_time_warp: 0
    inplace: true
    mode: "PIL"
  - type: "freq_mask"
    F: 30
    n_mask: 2
    inplace: true
    replace_with_zero: true
  - type: "time_mask"
    T: 40
    n_mask: 2
    inplace: true
    replace_with_zero: true


================================================
FILE: egs/aishell1/conf/tuning/decode_pytorch_transformer.yaml
================================================
batchsize: 0
beam-size: 10
penalty: 0.0
maxlenratio: 0.0
minlenratio: 0.0
ctc-weight: 0.5
lm-weight: 0.0
ngram-weight: 0.3


================================================
FILE: egs/aishell1/conf/tuning/decode_rnn.yaml
================================================
beam-size: 20
penalty: 0.0
maxlenratio: 0.0
minlenratio: 0.0
ctc-weight: 0.6
lm-weight: 0.3


================================================
FILE: egs/aishell1/conf/tuning/train_pytorch_conformer_kernel15.yaml
================================================
# network architecture
# encoder related
elayers: 12
eunits: 2048
# decoder related
dlayers: 6
dunits: 2048
# attention related
adim: 256
aheads: 4

# hybrid CTC/attention
mtlalpha: 0.3

# label smoothing
lsm-weight: 0.1

# minibatch related
batch-size: 32
maxlen-in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
maxlen-out: 150 # if output length > maxlen-out, batchsize is automatically reduced

# optimization related
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
opt: noam
accum-grad: 2
grad-clip: 5
patience: 0
epochs: 50
dropout-rate: 0.1

# transformer specific setting
backend: pytorch
model-module: "espnet.nets.pytorch_backend.e2e_asr_conformer:E2E"
transformer-input-layer: conv2d     # encoder architecture type
transformer-lr: 1.0
transformer-warmup-steps: 25000
transformer-attn-dropout-rate: 0.0
transformer-length-normalized-loss: false
transformer-init: pytorch

# conformer specific setting
transformer-encoder-pos-enc-layer-type: rel_pos
transformer-encoder-selfattn-layer-type: rel_selfattn
transformer-encoder-activation-type: swish
rel-pos-type: latest
macaron-style: true
use-cnn-module: true
cnn-module-kernel: 15


================================================
FILE: egs/aishell1/conf/tuning/train_pytorch_conformer_kernel31.yaml
================================================
# network architecture
# encoder related
elayers: 12
eunits: 2048
# decoder related
dlayers: 6
dunits: 2048
# attention related
adim: 256
aheads: 4

# hybrid CTC/attention
mtlalpha: 0.3

# label smoothing
lsm-weight: 0.1

# minibatch related
batch-size: 32
maxlen-in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
maxlen-out: 150 # if output length > maxlen-out, batchsize is automatically reduced

# optimization related
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
opt: noam
accum-grad: 2
grad-clip: 5
patience: 0
epochs: 50
dropout-rate: 0.1

# transformer specific setting
backend: pytorch
model-module: "espnet.nets.pytorch_backend.e2e_asr_conformer:E2E"
transformer-input-layer: conv2d     # encoder architecture type
transformer-lr: 1.0
transformer-warmup-steps: 25000
transformer-attn-dropout-rate: 0.0
transformer-length-normalized-loss: false
transformer-init: pytorch

# conformer specific setting
transformer-encoder-pos-enc-layer-type: rel_pos
transformer-encoder-selfattn-layer-type: rel_selfattn
transformer-encoder-activation-type: swish
macaron-style: true
use-cnn-module: true
cnn-module-kernel: 31


================================================
FILE: egs/aishell1/conf/tuning/train_pytorch_conformer_kernel31_large.yaml
================================================
# network architecture
# encoder related
elayers: 16
eunits: 2048
# decoder related
dlayers: 6
dunits: 2048
# attention related
adim: 512
aheads: 8

# hybrid CTC/attention
mtlalpha: 0.3

# label smoothing
lsm-weight: 0.1

# minibatch related
batch-size: 32
maxlen-in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
maxlen-out: 150 # if output length > maxlen-out, batchsize is automatically reduced

# optimization related
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
opt: noam
accum-grad: 2
grad-clip: 5
patience: 0
epochs: 50
dropout-rate: 0.1

# transformer specific setting
backend: pytorch
model-module: "espnet.nets.pytorch_backend.e2e_asr_conformer:E2E"
transformer-input-layer: conv2d     # encoder architecture type
transformer-lr: 1.0
transformer-warmup-steps: 25000
transformer-attn-dropout-rate: 0.0
transformer-length-normalized-loss: false
transformer-init: pytorch

# conformer specific setting
transformer-encoder-pos-enc-layer-type: rel_pos
transformer-encoder-selfattn-layer-type: rel_selfattn
transformer-encoder-activation-type: swish
macaron-style: true
use-cnn-module: true
cnn-module-kernel: 31


================================================
FILE: egs/aishell1/conf/tuning/train_pytorch_conformer_kernel31_small.yaml
================================================
# network architecture
# encoder related
elayers: 8
eunits: 1024
# decoder related
dlayers: 4
dunits: 1024
# attention related
adim: 256
aheads: 4

# hybrid CTC/attention
mtlalpha: 0.3

# label smoothing
lsm-weight: 0.1

# minibatch related
batch-size: 32
maxlen-in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
maxlen-out: 150 # if output length > maxlen-out, batchsize is automatically reduced

# optimization related
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
opt: noam
accum-grad: 2
grad-clip: 5
patience: 0
epochs: 50
dropout-rate: 0.1

# transformer specific setting
backend: pytorch
model-module: "espnet.nets.pytorch_backend.e2e_asr_conformer:E2E"
transformer-input-layer: conv2d     # encoder architecture type
transformer-lr: 1.0
transformer-warmup-steps: 25000
transformer-attn-dropout-rate: 0.0
transformer-length-normalized-loss: false
transformer-init: pytorch

# conformer specific setting
transformer-encoder-pos-enc-layer-type: rel_pos
transformer-encoder-selfattn-layer-type: rel_selfattn
transformer-encoder-activation-type: swish
macaron-style: true
use-cnn-module: true
cnn-module-kernel: 31


================================================
FILE: egs/aishell1/conf/tuning/train_pytorch_transformer.yaml
================================================
# network architecture
# encoder related
elayers: 12
eunits: 2048
# decoder related
dlayers: 6
dunits: 2048
# attention related
adim: 256
aheads: 4

# hybrid CTC/attention
mtlalpha: 0.3

# label smoothing
lsm-weight: 0.1

# minibatch related
batch-size: 32
maxlen-in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
maxlen-out: 150 # if output length > maxlen-out, batchsize is automatically reduced

# optimization related
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
opt: noam
accum-grad: 2
grad-clip: 5
patience: 0
epochs: 50
dropout-rate: 0.1

# transformer specific setting
backend: pytorch
model-module: "espnet.nets.pytorch_backend.e2e_asr_transformer:E2E"
transformer-input-layer: conv2d     # encoder architecture type
transformer-lr: 1.0
transformer-warmup-steps: 25000
transformer-attn-dropout-rate: 0.0
transformer-length-normalized-loss: false
transformer-init: pytorch


================================================
FILE: egs/aishell1/conf/tuning/train_rnn.yaml
================================================
# network architecture
# encoder related
etype: vggblstm     # encoder architecture type
elayers: 3
eunits: 1024
eprojs: 1024
subsample: "1_2_2_1_1" # skip every n frame from input to nth layers
# decoder related
dlayers: 2
dunits: 1024
# attention related
atype: location
adim: 1024
aconv-chans: 10
aconv-filts: 100

# hybrid CTC/attention
mtlalpha: 0.5

# minibatch related
batch-size: 30
maxlen-in: 800  # if input length  > maxlen_in, batchsize is automatically reduced
maxlen-out: 150 # if output length > maxlen_out, batchsize is automatically reduced

# optimization related
opt: adadelta
epochs: 10
patience: 0

# scheduled sampling option
sampling-probability: 0.0


================================================
FILE: egs/aishell1/conf/tuning/transducer/decode_default.yaml
================================================
# decoding parameters
batch: 0
beam-size: 10
search-type: default
score-norm: True


================================================
FILE: egs/aishell1/conf/tuning/transducer/train_conformer-rnn_transducer.yaml
================================================
# minibatch related
batch-size: 64
maxlen-in: 512
maxlen-out: 150

# optimization related
criterion: loss
early-stop-criterion: "validation/main/loss"
sortagrad: 0
opt: noam
transformer-lr: 1.0
transformer-warmup-steps: 25000
epochs: 100
patience: 0
accum-grad: 2
grad-clip: 5.0

# network architecture
## general
custom-enc-positional-encoding-type: rel_pos
custom-enc-self-attn-type: rel_self_attn
custom-enc-pw-activation-type: swish
## encoder related
etype: custom
custom-enc-input-layer: vgg2l
enc-block-arch:
        - type: conformer
          d_hidden: 512
          d_ff: 2048
          heads: 4
          macaron_style: True
          use_conv_mod: True
          conv_mod_kernel: 15
          dropout-rate: 0.3
          att-dropout-rate: 0.3
enc-block-repeat: 12
## decoder related
dtype: lstm
dlayers: 1
dec-embed-dim: 1024
dunits: 512
dropout-rate-embed-decoder: 0.2
dropout-rate-decoder: 0.1
## joint network related
joint-dim: 512

# transducer related
model-module: "espnet.nets.pytorch_backend.e2e_asr_transducer:E2E"

# reporter related
report-wer: True
report-cer: True


================================================
FILE: egs/aishell1/conf/tuning/transducer/train_conformer-rnn_transducer_aux_ngpu4.yaml
================================================
# minibatch related
batch-size: 32
maxlen-in: 512
maxlen-out: 150

# optimization related
criterion: loss
early-stop-criterion: "validation/main/loss"
sortagrad: 0
opt: noam
transformer-lr: 1.0
transformer-warmup-steps: 25000
epochs: 100
patience: 0
#accum-grad: 2
grad-clip: 5.0

# network architecture
## general
custom-enc-positional-encoding-type: rel_pos
custom-enc-self-attn-type: rel_self_attn
custom-enc-pw-activation-type: swish
## encoder related
etype: custom
custom-enc-input-layer: vgg2l
enc-block-arch:
        - type: conformer
          d_hidden: 512
          d_ff: 2048
          heads: 4
          macaron_style: True
          use_conv_mod: True
          conv_mod_kernel: 15
          dropout-rate: 0.3
          att-dropout-rate: 0.3
enc-block-repeat: 12
## decoder related
dtype: lstm
dlayers: 1
dec-embed-dim: 1024
dunits: 512
dropout-rate-embed-decoder: 0.2
dropout-rate-decoder: 0.1
## joint network related
joint-dim: 512

# transducer related
model-module: "espnet.nets.pytorch_backend.e2e_asr_transducer:E2E"

# reporter related
report-wer: True
report-cer: True


================================================
FILE: egs/aishell1/conf/tuning/transducer/train_conformer-rnn_transducer_aux_ngpu4_att.yaml
================================================
# minibatch related
batch-size: 32
maxlen-in: 512
maxlen-out: 150

# optimization related
criterion: loss
early-stop-criterion: "validation/main/loss"
sortagrad: 0
opt: noam
transformer-lr: 1.0
transformer-warmup-steps: 25000
epochs: 100
patience: 0
#accum-grad: 2
grad-clip: 5.0

# network architecture
## general
custom-enc-positional-encoding-type: rel_pos
custom-enc-self-attn-type: rel_self_attn
custom-enc-pw-activation-type: swish
## encoder related
etype: custom
custom-enc-input-layer: vgg2l
enc-block-arch:
        - type: conformer
          d_hidden: 512
          d_ff: 2048
          heads: 4
          macaron_style: True
          use_conv_mod: True
          conv_mod_kernel: 15
          dropout-rate: 0.3
          att-dropout-rate: 0.3
enc-block-repeat: 12
## decoder related
dtype: lstm
dlayers: 1
dec-embed-dim: 1024
dunits: 512
dropout-rate-embed-decoder: 0.2
dropout-rate-decoder: 0.1
## joint network related
joint-dim: 512

# transducer related
model-module: "espnet.nets.pytorch_backend.e2e_asr_transducer:E2E"

# reporter related
report-wer: True
report-cer: True

# Attention scorer auxiliary task: mainly follows the settings of the LASCTC decoder
att-adim: 512
att-aheads: 8
att-dlayers: 6
att-dunits: 2048
att-dropout-rate: 0.1
att-attn-dropout-rate: 0.0
att-length-normalized-loss: false
lsm-weight: 0.1


================================================
FILE: egs/aishell1/conf/tuning/transducer/train_conformer-rnn_transducer_aux_ngpu4_small.yaml
================================================
# minibatch related
batch-size: 32
maxlen-in: 512
maxlen-out: 150

# optimization related
criterion: loss
early-stop-criterion: "validation/main/loss"
sortagrad: 0
opt: noam
transformer-lr: 1.0
transformer-warmup-steps: 25000
epochs: 100
patience: 0
#accum-grad: 2
grad-clip: 5.0

# network architecture
## general
custom-enc-positional-encoding-type: rel_pos
custom-enc-self-attn-type: rel_self_attn
custom-enc-pw-activation-type: swish
## encoder related
etype: custom
custom-enc-input-layer: vgg2l
enc-block-arch:
        - type: conformer
          d_hidden: 256
          d_ff: 1024
          heads: 4
          macaron_style: True
          use_conv_mod: True
          conv_mod_kernel: 15
          dropout-rate: 0.3
          att-dropout-rate: 0.3
enc-block-repeat: 12
## decoder related
dtype: lstm
dlayers: 1
dec-embed-dim: 512
dunits: 256
dropout-rate-embed-decoder: 0.2
dropout-rate-decoder: 0.1
## joint network related
joint-dim: 256

# transducer related
model-module: "espnet.nets.pytorch_backend.e2e_asr_transducer:E2E"

# reporter related
report-wer: True
report-cer: True

# auxiliary task
#aux-ctc: True
#aux-ctc-weight: 0.5
#aux-ctc-dropout-rate: 0.1


================================================
FILE: egs/aishell1/conf/tuning/transducer/train_conformer-rnn_transducer_ngpu4.yaml
================================================
# minibatch related
batch-size: 32
maxlen-in: 512
maxlen-out: 150

# optimization related
criterion: loss
early-stop-criterion: "validation/main/loss"
sortagrad: 0
opt: noam
transformer-lr: 1.0
transformer-warmup-steps: 25000
epochs: 100
patience: 0
#accum-grad: 2
grad-clip: 5.0

# network architecture
## general
custom-enc-positional-encoding-type: rel_pos
custom-enc-self-attn-type: rel_self_attn
custom-enc-pw-activation-type: swish
## encoder related
etype: custom
custom-enc-input-layer: vgg2l
enc-block-arch:
        - type: conformer
          d_hidden: 512
          d_ff: 2048
          heads: 4
          macaron_style: True
          use_conv_mod: True
          conv_mod_kernel: 15
          dropout-rate: 0.3
          att-dropout-rate: 0.3
enc-block-repeat: 12
## decoder related
dtype: lstm
dlayers: 1
dec-embed-dim: 1024
dunits: 512
dropout-rate-embed-decoder: 0.2
dropout-rate-decoder: 0.1
## joint network related
joint-dim: 512

# transducer related
model-module: "espnet.nets.pytorch_backend.e2e_asr_transducer:E2E"

# reporter related
report-wer: True
report-cer: True

# auxiliary task
aux-ctc: False
aux-ctc-weight: 0.0
aux-ctc-dropout-rate: 0.0


================================================
FILE: egs/aishell1/conf/tuning/transducer/train_conformer-rnn_transducer_ngpu4_large.yaml
================================================
# minibatch related
batch-size: 32
maxlen-in: 512
maxlen-out: 150

# optimization related
criterion: loss
early-stop-criterion: "validation/main/loss"
sortagrad: 0
opt: noam
transformer-lr: 1.0
transformer-warmup-steps: 25000
epochs: 100
patience: 0
#accum-grad: 2
grad-clip: 5.0

# network architecture
## general
custom-enc-positional-encoding-type: rel_pos
custom-enc-self-attn-type: rel_self_attn
custom-enc-pw-activation-type: swish
## encoder related
etype: custom
custom-enc-input-layer: vgg2l
enc-block-arch:
        - type: conformer
          d_hidden: 512
          d_ff: 2048
          heads: 4
          macaron_style: True
          use_conv_mod: True
          conv_mod_kernel: 31
          dropout-rate: 0.3
          att-dropout-rate: 0.3
enc-block-repeat: 16
## decoder related
dtype: lstm
dlayers: 2
dec-embed-dim: 1024
dunits: 1024
dropout-rate-embed-decoder: 0.2
dropout-rate-decoder: 0.1
## joint network related
joint-dim: 512

# transducer related
model-module: "espnet.nets.pytorch_backend.e2e_asr_transducer:E2E"

# reporter related
report-wer: True
report-cer: True

# auxiliary task
aux-ctc: False
aux-ctc-weight: 0.0
aux-ctc-dropout-rate: 0.0


================================================
FILE: egs/aishell1/conf/tuning/transducer/train_transducer.yaml
================================================
# minibatch related
batch-size: 64
maxlen-in: 512
maxlen-out: 150

# optimization related
criterion: loss
early-stop-criterion: "validation/main/loss"
sortagrad: 0
opt: adadelta
epochs: 30
patience: 3
accum-grad: 2

# network architecture
## encoder related
etype: vggblstm
elayers: 6
eunits: 512
eprojs: 512
dropout-rate: 0.4
## decoder related
dtype: lstm
dlayers: 1
dec-embed-dim: 1024
dunits: 512
dropout-rate-embed-decoder: 0.2
dropout-rate-decoder: 0.1
## joint network related
joint-dim: 512

# transducer related
model-module: "espnet.nets.pytorch_backend.e2e_asr_transducer:E2E"

# reporter related
report-wer: True
report-cer: True


================================================
FILE: egs/aishell1/conf/tuning/transducer/train_transducer_aux.yaml
================================================
# minibatch related
batch-size: 64
maxlen-in: 512
maxlen-out: 150

# optimization related
criterion: loss
early-stop-criterion: "validation/main/loss"
sortagrad: 0
opt: adadelta
epochs: 30
patience: 3
accum-grad: 2

# network architecture
## encoder related
etype: vggblstm
elayers: 6
eunits: 512
eprojs: 512
dropout-rate: 0.4
## decoder related
dtype: lstm
dlayers: 1
dec-embed-dim: 1024
dunits: 512
dropout-rate-embed-decoder: 0.2
dropout-rate-decoder: 0.1
## joint network related
joint-dim: 512

# transducer related
model-module: "espnet.nets.pytorch_backend.e2e_asr_transducer:E2E"

# reporter related
report-wer: True
report-cer: True

# auxiliary task
aux-ctc: True
aux-ctc-weight: 0.1
aux-ctc-dropout-rate: 0.1


================================================
FILE: egs/aishell1/local/add_lex_disambig.pl
================================================
#!/usr/bin/env perl
#  Copyright 2010-2011  Microsoft Corporation
#            2013-2016  Johns Hopkins University (author: Daniel Povey)
#                 2015  Hainan Xu
#                 2015  Guoguo Chen

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# Adds disambiguation symbols to a lexicon.
# Outputs still in the normal lexicon format.
# Disambig syms are numbered #1, #2, #3, etc. (#0
# reserved for symbol in grammar).
# Outputs the number of disambig syms to the standard output.
# With the --pron-probs option, expects the second field
# of each lexicon line to be a pron-prob.
# With the --sil-probs option, expects three additional
# fields after the pron-prob, representing various components
# of the silence probability model.

# Option defaults: the lexicon has no pron-prob / sil-prob columns, and the
# first disambiguation symbol this script may emit is #1.
$pron_probs = 0;
$sil_probs = 0;
$first_allowed_disambig = 1;

# Consume leading options from @ARGV.  Every pass tests the three options in a
# fixed order and the loop makes at most three passes, so the three options
# are recognised whatever order they are given in.
for ($n = 1; $n <= 3 && @ARGV > 0; $n++) {
  if ($ARGV[0] eq "--pron-probs") {
    $pron_probs = 1;
    shift @ARGV;
  }
  if ($ARGV[0] eq "--sil-probs") {
    $sil_probs = 1;
    shift @ARGV;
  }
  if ($ARGV[0] eq "--first-allowed-disambig") {
    # "0 +" forces numeric context on the option value.
    $first_allowed_disambig = 0 + $ARGV[1];
    if ($first_allowed_disambig < 1) {
      die "add_lex_disambig.pl: invalid --first-allowed-disambig option: $first_allowed_disambig\n";
    }
    # Drop both the option name and its value.
    shift @ARGV;
    shift @ARGV;
  }
}

# After option parsing exactly two positional arguments must remain
# (<lexicon-in> and <lexicon-out>); otherwise print the usage text and abort.
if (@ARGV != 2) {
  die "Usage: add_lex_disambig.pl [opts] <lexicon-in> <lexicon-out>\n" .
    "This script adds disambiguation symbols to a lexicon in order to\n" .
    "make decoding graphs determinizable; it adds pseudo-phone\n" .
    "disambiguation symbols #1, #2 and so on at the ends of phones\n" .
    "to ensure that all pronunciations are different, and that none\n" .
    "is a prefix of another.\n" .
    # The line break below was missing, which glued two words together
    # ("largest-numbereddisambiguation") in the printed message.
    "It prints to the standard output the number of the largest-numbered\n" .
    "disambiguation symbol that was used.\n" .
    "\n" .
    "Options:   --pron-probs       Expect pronunciation probabilities in the 2nd field\n" .
    "           --sil-probs        [should be with --pron-probs option]\n" .
    "                              Expect 3 extra fields after the pron-probs, for aspects of\n" .
    "                              the silence probability model\n" .
    "           --first-allowed-disambig <n>  The number of the first disambiguation symbol\n" .
    "                              that this script is allowed to add.  By default this is\n" .
    "                              #1, but you can set this to a larger value using this option.\n" .
    "e.g.:\n" .
    " add_lex_disambig.pl lexicon.txt lexicon_disambig.txt\n" .
    " add_lex_disambig.pl --pron-probs lexiconp.txt lexiconp_disambig.txt\n" .
    " add_lex_disambig.pl --pron-probs --sil-probs lexiconp_silprob.txt lexiconp_silprob_disambig.txt\n";
}


# The two remaining command-line arguments are the input and output lexicons.
($lexfn, $lexoutfn) = splice(@ARGV, 0, 2);

open(L, "<$lexfn") || die "Error opening lexicon $lexfn";

# (1)  Read in the lexicon.  Each line is split on whitespace and re-joined
# with single spaces, so @L holds whitespace-normalised entries.
@L = map { my @fields = split(" ", $_); join(" ", @fields) } <L>;

# (2) Work out the count of each phone-sequence in the
# lexicon.  While doing so, validate the optional numeric columns that
# precede the phones; %count is keyed by the space-joined phone sequence.

foreach $l (@L) {
    @A = split(" ", $l);
    shift @A; # Remove word.
    if ($pron_probs) {
      # Pronunciation probability must lie in (0, 1].
      $p = shift @A;
      if (!($p > 0.0 && $p <= 1.0)) { die "Bad lexicon line $l (expecting pron-prob as second field)"; }
    }
    if ($sil_probs) {
      # Silence probability in (0, 1], followed by two strictly positive
      # correction terms.
      $silp = shift @A;
      if (!($silp > 0.0 && $silp <= 1.0)) { die "Bad lexicon line $l for silprobs"; }
      $correction = shift @A;
      if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; }
      $correction = shift @A;
      if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; }
    }
    if (!(@A)) {
      # Report the offending entry: this used to interpolate $1 (a regex
      # capture variable that is never set here) instead of the line $l.
      die "Bad lexicon line $l, no phone in phone list";
    }
    $count{join(" ",@A)}++;
}

# (3) Record every proper left sub-sequence (prefix) of every phone sequence
# in %issubseq, including the empty one, so that step (4) can tell whether a
# pronunciation is a prefix of a longer one.

foreach $l (@L) {
    @A = split(" ", $l);
    # Leading non-phone fields: the word, optionally the pron-prob, and
    # optionally the three silence-probability numbers.
    my $nskip = 1;
    $nskip += 1 if ($pron_probs);
    $nskip += 3 if ($sil_probs);
    splice(@A, 0, $nskip);
    # Walk from the longest proper prefix down to the empty sequence.
    for (my $len = @A - 1; $len >= 0; $len--) {
        $issubseq{join(" ", @A[0 .. $len - 1])} = 1;
    }
}

# (4) For each entry in the lexicon:
#  if the phone sequence is unique and is not a
#  prefix of another phone sequence, no disambig symbol.
#  Else output #1, or #2, #3, ... if the same phone-seq
#  has already been assigned a disambig symbol.
# Uses %count from step (2) and %issubseq from step (3).


open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n";

# max_disambig will always be the highest-numbered disambiguation symbol that
# has been used so far.
$max_disambig = $first_allowed_disambig - 1;

foreach $l (@L) {
  @A = split(" ", $l);
  $word = shift @A;
  # Peel off the optional numeric columns so that @A holds only phones; the
  # values are kept so they can be written back unchanged.
  if ($pron_probs) {
    $pron_prob = shift @A;
  }
  if ($sil_probs) {
    $sil_word_prob = shift @A;
    $word_sil_correction = shift @A;
    $prev_nonsil_correction = shift @A
  }
  $phnseq = join(" ", @A);
  if (!defined $issubseq{$phnseq}
      && $count{$phnseq} == 1) {
    ;                           # Do nothing.
  } else {
    if ($phnseq eq "") {        # need disambig symbols for the empty string
      # that are not used anywhere else.
      # Each empty pronunciation takes a brand-new symbol number, which is
      # then marked as reserved so non-empty sequences skip it below.
      $max_disambig++;
      $reserved_for_the_empty_string{$max_disambig} = 1;
      $phnseq = "#$max_disambig";
    } else {
      # Symbols are numbered per phone sequence: start at the first allowed
      # number, or continue after the last one used for this sequence.
      $cur_disambig = $last_used_disambig_symbol_of{$phnseq};
      if (!defined $cur_disambig) {
        $cur_disambig = $first_allowed_disambig;
      } else {
        $cur_disambig++;           # Get a number that has not been used yet for
                                   # this phone sequence.
      }
      # Skip numbers that were handed out to empty pronunciations.
      while (defined $reserved_for_the_empty_string{$cur_disambig}) {
        $cur_disambig++;
      }
      if ($cur_disambig > $max_disambig) {
        $max_disambig = $cur_disambig;
      }
      $last_used_disambig_symbol_of{$phnseq} = $cur_disambig;
      $phnseq = $phnseq . " #" . $cur_disambig;
    }
  }
  # Write the entry back, tab-separated, with the same columns it came with.
  if ($pron_probs) {
    if ($sil_probs) {
      print O "$word\t$pron_prob\t$sil_word_prob\t$word_sil_correction\t$prev_nonsil_correction\t$phnseq\n";
    } else {
      print O "$word\t$pron_prob\t$phnseq\n";
    }
  } else {
    print O "$word\t$phnseq\n";
  }
}

# Report the highest disambiguation symbol number used on standard output
# (this is $first_allowed_disambig - 1 when none was needed).
print $max_disambig . "\n";


================================================
FILE: egs/aishell1/local/aishell_data_prep.sh
================================================
#!/usr/bin/env bash

# Copyright 2017 Xingyu Na
# Apache 2.0

. ./path.sh || exit 1;

# Exactly two arguments are expected: the wav directory and the directory
# holding the transcript file.
if [ $# -ne 2 ]; then
  echo "Usage: $0 <audio-path> <text-path>"
  echo " $0 /export/a05/xna/data/data_aishell/wav /export/a05/xna/data/data_aishell/transcript"
  exit 1;
fi

aishell_audio_dir=$1
aishell_text=$2/aishell_transcript_v0.8.txt

# Working directories, one per data split plus a scratch directory.
train_dir=data/local/train
dev_dir=data/local/dev
test_dir=data/local/test
tmp_dir=data/local/tmp

mkdir -p $train_dir $dev_dir $test_dir $tmp_dir

# Both the audio directory and the transcript file must exist.
if [ ! -d $aishell_audio_dir ] || [ ! -f $aishell_text ]; then
  echo "Error: $0 requires two directory arguments"
  exit 1;
fi

# Find the wav audio files and split the list into train, dev and test
# according to the "wav/<split>" component of each path.
find "$aishell_audio_dir" -iname "*.wav" > "$tmp_dir/wav.flist"
n=$(wc -l < "$tmp_dir/wav.flist")
# Only a warning: a different file count does not stop the preparation.
[ $n -ne 141925 ] && \
  echo Warning: expected 141925 data files, found $n

grep -i "wav/train" "$tmp_dir/wav.flist" > "$train_dir/wav.flist" || exit 1;
grep -i "wav/dev" "$tmp_dir/wav.flist" > "$dev_dir/wav.flist" || exit 1;
grep -i "wav/test" "$tmp_dir/wav.flist" > "$test_dir/wav.flist" || exit 1;

# The combined list is no longer needed once the per-split lists exist.
rm -r "$tmp_dir"

# Transcriptions preparation
# For each split, build the Kaldi-style files utt2spk, wav.scp, text and
# spk2utt from the wav file list and the global transcript file.
for dir in $train_dir $dev_dir $test_dir; do
  echo Preparing $dir transcriptions
  # Utterance id = file name without ".wav" (last path component).
  sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list
  # Speaker id = the directory that directly contains the wav file.
  sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{i=NF-1;printf("%s %s\n",$NF,$i)}' > $dir/utt2spk_all
  # "<utt-id> <wav-path>" pairs, built line by line from the two lists.
  paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all
  # NOTE(review): utils/filter_scp.pl presumably keeps the lines of its second
  # argument whose first field appears in the id list -- confirm in utils/.
  utils/filter_scp.pl -f 1 $dir/utt.list $aishell_text > $dir/transcripts.txt
  # utt.list is overwritten here with the ids taken from transcripts.txt,
  # and that narrowed list drives the two filters below.
  awk '{print $1}' $dir/transcripts.txt > $dir/utt.list
  utils/filter_scp.pl -f 1 $dir/utt.list $dir/utt2spk_all | sort -u > $dir/utt2spk
  utils/filter_scp.pl -f 1 $dir/utt.list $dir/wav.scp_all | sort -u > $dir/wav.scp
  sort -u $dir/transcripts.txt > $dir/text
  utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt
done

mkdir -p data/train data/dev data/test

# Publish the prepared files: for each file copy the train, dev and test
# versions (in that order) from data/local/* into data/*, aborting on the
# first failed copy.
for f in spk2utt utt2spk wav.scp text; do
  for split in train dev test; do
    src_var=${split}_dir          # name of the variable holding the source dir
    cp ${!src_var}/$f data/$split/$f || exit 1;
  done
done

echo "$0: AISHELL data preparation succeeded"
exit 0;


================================================
FILE: egs/aishell1/local/aishell_train_lms.sh
================================================
#!/usr/bin/env bash

# To be run from one directory above this script.
. ./path.sh

# Input files: training transcripts and the lexicon.
text=data/local/train/text
lexicon=data/local/dict_nosp/lexicon.txt

# Abort early if an input is missing.  The test previously looked at the
# unset variable $x, so "[ ! -f ]" was always false and the check never fired.
for f in "$text" "$lexicon"; do
  [ ! -f "$f" ] && echo "$0: No such file $f" && exit 1
done

# This script takes no arguments.  It assumes you have already run
# aishell_data_prep.sh.
# It takes as input the files
# data/local/train/text
# data/local/dict_nosp/lexicon.txt
# Output directory for all language-model artefacts.
dir=data/local/lm
mkdir -p "$dir"

# train_lm.sh comes from the external kaldi_lm package and must be on PATH.
# The variable is quoted so the emptiness test is a real two-argument test
# rather than the one-argument "[ -z ]" that is true only by accident.
kaldi_lm=$(which train_lm.sh)
if [ -z "$kaldi_lm" ]; then
  echo "$0: train_lm.sh is not found. That might mean it's not installed"
  echo "$0: or it is not added to PATH"
  echo "$0: Please use the following commands to install it"
  echo "  git clone https://github.com/danpovey/kaldi_lm.git"
  echo "  cd kaldi_lm"
  echo "  make -j"
  echo "Then add the path of kaldi_lm to PATH and rerun $0"
  exit 1
fi

# Transcripts with every token that is not in the lexicon replaced by <UNK>.
cleantext=$dir/text.no_oov

# The awk BEGIN block loads the first column of the lexicon into seen[].
# The loop starts at field 1, so the utterance id is run through the same
# lookup; the later steps start at field 2 and never read that first field.
cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
  {for(n=1; n<=NF;n++) {  if (seen[$n]) { printf("%s ", $n); } else {printf("<UNK> ");} } printf("\n");}' \
  >$cleantext || exit 1

# Word frequencies of the cleaned transcripts, most frequent first.
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c |
  sort -nr >$dir/word.counts || exit 1

# Get counts from acoustic training transcripts, and add  one-count
# for each word in the lexicon (but not silence, we don't want it
# in the LM-- we'll add it optionally later).
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' |
  cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') |
  sort | uniq -c | sort -nr >$dir/unigram.counts || exit 1

# note: we probably won't really make use of <UNK> as there aren't any OOVs
# NOTE(review): get_word_map.pl is an external kaldi_lm tool; it presumably
# writes "<word> <short-code>" pairs -- confirm against kaldi_lm.
cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "<s>" "</s>" "<UNK>" >$dir/word_map ||
  exit 1

# note: ignore 1st field of train.txt, it's the utterance-id.
# Each word is replaced by its word_map entry.  The mapped value is passed to
# printf as the format string itself, so it must not contain '%'.
cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline<wmap)>0)map[$1]=$2;}
  { for(n=2;n<=NF;n++) { printf map[$n]; if(n<NF){ printf " "; } else { print ""; }}}' | gzip -c >$dir/train.gz ||
  exit 1

# Train 3-gram and 4-gram models with train_lm.sh and write them in ARPA format.
train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1
train_lm.sh --arpa --lmtype 4gram-mincount $dir || exit 1

# LM is small enough that we don't need to prune it (only about 0.7M N-grams).
# Perplexity over 128254.000000 words is 90.446690

# note: output is
# data/local/lm/3gram-mincount/lm_unpruned.gz

# The script ends here; everything below this exit is never executed and is
# kept only as a reference recipe.
exit 0

# From here is some commands to do a baseline with SRILM (assuming
# you have it installed).
heldout_sent=10000 # Don't change this if you want result to be comparable with
# kaldi_lm results
sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities.
mkdir -p $sdir
# First $heldout_sent sentences (utterance id stripped) form the held-out set.
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' |
  head -$heldout_sent >$sdir/heldout
# NOTE(review): "tail -n +N" starts AT line N, so line $heldout_sent lands in
# both heldout and train; "+$((heldout_sent + 1))" would avoid the overlap.
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' |
  tail -n +$heldout_sent >$sdir/train

# Vocabulary for SRILM: all mapped words plus the sentence boundary symbols.
cat $dir/word_map | awk '{print $1}' | cat - <(
  echo "<s>"
  echo "</s>"
) >$sdir/wordlist

ngram-count -text $sdir/train -order 3 -limit-vocab -vocab $sdir/wordlist -unk \
  -map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz
ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/heldout
# 0 zeroprobs, logprob= -250954 ppl= 90.5091 ppl1= 132.482

# Note: perplexity SRILM gives to Kaldi-LM model is same as kaldi-lm reports above.
# Difference in WSJ must have been due to different treatment of <UNK>.
ngram -lm $dir/3gram-mincount/lm_unpruned.gz -ppl $sdir/heldout
# 0 zeroprobs, logprob= -250913 ppl= 90.4439 ppl1= 132.379


================================================
FILE: egs/aishell1/local/apply_map.pl
================================================
#!/usr/bin/env perl
use warnings; #sed replacement for -w perl parameter
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.

# This program is a bit like ./sym2int.pl in that it applies a map
# to things in a file, but it's a bit more general in that it doesn't
# assume the things being mapped to are single tokens, they could
# be sequences of tokens.  See the usage message.


# By default a token that is missing from the map is a fatal error.
$permissive = 0;

# Parse leading options.  Both options are tested on every pass and the loop
# runs three times, so "-f" and "--permissive" may appear in either order.
for ($x = 0; $x <= 2; $x++) {

  if (@ARGV > 0 && $ARGV[0] eq "-f") {
    shift @ARGV;
    $field_spec = shift @ARGV;
    # A single number N selects exactly field N.
    if ($field_spec =~ m/^\d+$/) {
      $field_begin = $field_spec - 1; $field_end = $field_spec - 1;
    }
    # A range "A-B"; either side may be empty, which leaves that bound
    # undefined (i.e. open-ended).
    if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesy (properly, 1-10)
      if ($1 ne "") {
        $field_begin = $1 - 1;  # Change to zero-based indexing.
      }
      if ($2 ne "") {
        $field_end = $2 - 1;    # Change to zero-based indexing.
      }
    }
    # Neither form produced a bound: the argument was not a valid field spec.
    if (!defined $field_begin && !defined $field_end) {
      die "Bad argument to -f option: $field_spec";
    }
  }

  if (@ARGV > 0 && $ARGV[0] eq '--permissive') {
    shift @ARGV;
    # Mapping is optional (missing key is printed to output)
    $permissive = 1;
  }
}

# Exactly one positional argument (the map file) must remain after option
# parsing; otherwise echo the offending arguments, print the usage text to
# STDERR and exit with status 1.
if(@ARGV != 1) {
  print STDERR "Invalid usage: " . join(" ", @ARGV) . "\n";
  # Single-quoted heredoc: the text is printed verbatim, no interpolation.
  # (A duplicated word, "the map / map should apply to", was removed.)
  print STDERR <<'EOF';
Usage: apply_map.pl [options] map <input >output
 options: [-f <field-range> ] [--permissive]
   This applies a map to some specified fields of some input text:
   For each line in the map file: the first field is the thing we
   map from, and the remaining fields are the sequence we map it to.
   The -f (field-range) option says which fields of the input file the
   map should apply to.
   If the --permissive option is supplied, fields which are not present
   in the map will be left as they were.
 Applies the map 'map' to all input text, where each line of the map
 is interpreted as a map from the first field to the list of the other fields
 Note: <field-range> can look like 4-5, or 4-, or 5-, or 1, it means the field
 range in the input to apply the map to.
 e.g.: echo A B | apply_map.pl a.txt
 where a.txt is:
 A a1 a2
 B b
 will produce:
 a1 a2 b
EOF
  exit(1);
}

# The single remaining argument is the map file.
$map_file = $ARGV[0];
open(M, "<$map_file") || die "Error opening map file $map_file: $!";

# Load the map: the first field of each line is the key, the remaining fields
# (re-joined with single spaces) are its replacement.  A line with no fields
# at all is an error.
while (defined(my $line = <M>)) {
  my ($key, @replacement) = split(" ", $line);
  defined($key) || die "apply_map.pl: empty line.";
  $map{$key} = join(" ", @replacement);
}

# Apply the map to standard input, line by line, and print the result.
while(<STDIN>) {
  @A = split(" ", $_);
  for ($x = 0; $x < @A; $x++) {
    # Only touch fields inside the selected range; an undefined bound means
    # the range is open on that side (no -f option: every field is mapped).
    if ( (!defined $field_begin || $x >= $field_begin)
         && (!defined $field_end || $x <= $field_end)) {
      $a = $A[$x];
      if (!defined $map{$a}) {
        # Unknown token: fatal by default; with --permissive only a warning
        # is printed and the token is left unchanged.
        if (!$permissive) {
          die "apply_map.pl: undefined key $a in $map_file\n";
        } else {
          print STDERR "apply_map.pl: warning! missing key $a in $map_file\n";
        }
      } else {
        # The replacement may itself consist of several space-separated tokens.
        $A[$x] = $map{$a};
      }
    }
  }
  print join(" ", @A) . "\n";
}


================================================
FILE: egs/aishell1/local/build_sp_text.py
================================================
import sys

# Speed-perturbation factors; each utterance gets one copy per factor.
SPEED_FACTORS = ("0.9", "1.0", "1.1")


def expand_speed_perturbed(line, speeds=SPEED_FACTORS):
    """Return the speed-perturbed copies of one Kaldi ``text`` line.

    The first whitespace-separated field of *line* is the utterance id; the
    remaining fields are the transcript.  For every factor in *speeds* one
    output line ``sp<factor>-<uttid> <transcript>`` is produced, with the
    transcript re-joined by single spaces.  A blank line yields no output.
    """
    elems = line.split()
    if not elems:
        # Blank lines used to crash with IndexError; skip them instead.
        return []
    uttid = elems[0]
    transcript = " ".join(elems[1:])
    return [f"sp{sp}-{uttid} {transcript}" for sp in speeds]


def main(argv=None):
    """Print the speed-perturbed version of the text file named in *argv*."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit(f"Usage: {argv[0]} <text-file>")
    # The context manager guarantees the input file is closed.
    with open(argv[1], "r", encoding="utf8") as reader:
        for line in reader:
            for out_line in expand_speed_perturbed(line):
                print(out_line)


if __name__ == "__main__":
    main()


================================================
FILE: egs/aishell1/local/build_word_mapping.py
================================================
# convert the attention output vocabulary into lexicon vocabulary
import sys

def load_vocab(path):
    """Read a ``<token> <id>`` table from *path* into a ``{token: id}`` dict.

    Ids are kept as strings.  Blank lines are ignored; any other line must
    contain exactly two whitespace-separated fields (ValueError otherwise).
    """
    vocab = {}
    with open(path, encoding='utf8') as reader:
        for line in reader:
            if not line.strip():
                continue
            tok, tid = line.split()
            vocab[tok] = tid
    return vocab


def build_word_mapping(att_vocab, lex_vocab, out_map):
    """Write the id mapping from the attention vocabulary to the lexicon one.

    For every token of *att_vocab* that also occurs in *lex_vocab*, one line
    ``<att-id> <lex-id>`` is written to *out_map*, in *att_vocab* order.
    Tokens missing from the lexicon vocabulary are reported on stdout.
    """
    lex = load_vocab(lex_vocab)
    # Context managers make sure the output is flushed and closed.
    with open(att_vocab, encoding='utf8') as reader, \
            open(out_map, 'w', encoding='utf8') as writer:
        for line in reader:
            if not line.strip():
                continue
            tok, tid = line.split()
            if tok in lex:
                writer.write("{} {}\n".format(tid, lex[tok]))
            else:
                print("CANNOT find ", tok)


if __name__ == "__main__":
    if len(sys.argv) < 4:
        sys.exit("Usage: {} <att-vocab> <lex-vocab> <out-map>".format(sys.argv[0]))
    build_word_mapping(sys.argv[1], sys.argv[2], sys.argv[3])


================================================
FILE: egs/aishell1/local/compile_bigram.sh
================================================
#!/usr/bin/env bash

# Compile a char-level bigram LM for MMI training.
# The bigram should be sparse or 4300+ words would lead to 17M arcs and overflow of GPU memory
#
# Usage: compile_bigram.sh <lang-dir> <train-text>
#   <lang-dir>    must contain words.txt; P.arpa and P.fst.txt are written there
#   <train-text>  training text fed to lmplz on standard input

if [ $# -lt 2 ]; then
  echo "Usage: $0 <lang-dir> <train-text>"
  exit 1
fi

lang=$1
train_text=$2
threshold=2   # value passed to lmplz --prune

# Estimate the pruned bigram in ARPA format; stop here on failure so that
# kaldilm is never run on a missing or truncated ARPA file.
lmplz -o 2 --prune $threshold < "$train_text" > "$lang/P.arpa" || exit 1

# Convert the ARPA model to a text FST using the symbol table of the lang dir.
python3 -m kaldilm \
        --read-symbol-table="${lang}/words.txt" \
        --disambig-symbol='#0' \
        --max-order=2 \
        "$lang/P.arpa" > "${lang}/P.fst.txt" || exit 1


================================================
FILE: egs/aishell1/local/download_and_untar.sh
================================================
#!/usr/bin/env bash

# Copyright   2014  Johns Hopkins University (author: Daniel Povey)
#             2017  Xingyu Na
# Apache 2.0

# Optional leading flag: delete the downloaded archive after extraction.
remove_archive=false

if [ "$1" == --remove-archive ]; then
  remove_archive=true
  shift
fi

# Exactly three positional arguments are required.  The usage text used to be
# printed without stopping, leaving the script to continue with empty values.
if [ $# -ne 3 ]; then
  echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
  echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/33 data_aishell"
  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
  echo "<corpus-part> can be one of: data_aishell, resource_aishell."
  exit 1;
fi

# Positional arguments: destination directory, URL base, corpus part name.
data=$1
url=$2
part=$3

# The destination directory must already exist.
[ -d "$data" ] || { echo "$0: no such directory $data"; exit 1; }

# Only the corpus parts named in $list are accepted.
part_ok=false
list="data_aishell resource_aishell"
for x in $list; do
  [ "$part" == $x ] && part_ok=true
done
$part_ok || {
  echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
  exit 1;
}

[ -n "$url" ] || { echo "$0: empty URL base."; exit 1; }

# A ".complete" marker from an earlier run means there is nothing left to do.
[ -f $data/$part/.complete ] && {
  echo "$0: data part $part was already successfully extracted, nothing to do."
  exit 0;
}

# sizes of the archive files in bytes.
sizes="15582913665 1246920"

# If an archive is already present, keep it only when its size equals one of
# the known sizes; otherwise delete it so that it is downloaded again below.
if [ -f $data/$part.tgz ]; then
  # Fifth column of "ls -l" is the file size in bytes.
  size=$(/bin/ls -l $data/$part.tgz | awk '{print $5}')
  size_ok=false
  # The size is compared against every known size, not only the one that
  # belongs to $part.
  for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done
  if ! $size_ok; then
    echo "$0: removing existing file $data/$part.tgz because its size in bytes $size"
    echo "does not equal the size of one of the archives."
    rm $data/$part.tgz
  else
    echo "$data/$part.tgz exists and appears to be complete."
  fi
fi

# Download the archive unless a copy is already present.
if [ ! -f "$data/$part.tgz" ]; then
  if ! command -v wget >/dev/null; then
    echo "$0: wget is not installed."
    exit 1;
  fi
  full_url=$url/$part.tgz
  echo "$0: downloading data from $full_url.  This may take some time, please be patient."

  # Run wget in a subshell so this script's working directory is unchanged.
  # Changing directory here and again below broke relative <data-base> paths.
  if ! (cd "$data" && wget --no-check-certificate "$full_url"); then
    echo "$0: error executing wget $full_url"
    exit 1;
  fi
fi

cd "$data" || exit 1
# Absolute location of <data-base>, so later paths stay valid after any cd.
data_abs=$(pwd)

if ! tar -xvzf "$part.tgz"; then
  echo "$0: error un-tarring archive $data/$part.tgz"
  exit 1;
fi

# data_aishell ships one inner tar.gz per speaker; unpack each and delete it.
if [ "$part" == "data_aishell" ]; then
  cd "$data_abs/$part/wav" || exit 1
  for wav in ./*.tar.gz; do
    [ -e "$wav" ] || continue   # the glob matched nothing
    echo "Extracting wav from $wav"
    if ! tar -zxf "$wav"; then
      echo "$0: error extracting $wav"
      exit 1;
    fi
    rm "$wav"
  done
fi

# Write the marker only after every extraction step succeeded, so that an
# interrupted run is not mistaken for a complete one on the next invocation.
touch "$data_abs/$part/.complete"

echo "$0: Successfully downloaded and un-tarred $data/$part.tgz"

if $remove_archive; then
  echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied."
  rm "$data_abs/$part.tgz"
fi

exit 0;


================================================
FILE: egs/aishell1/local/fstaddselfloops.pl
================================================
#!/usr/bin/env perl

# Copyright 2020 Xiaomi Corporation (Author: Junbo Zhang)
# Apache 2.0

use strict;
use warnings;

my $Usage = <<EOU;
fstaddselfloops.pl:
Adds self-loops to states of an FST to propagate disambiguation symbols through it.
They are added on each final state and each state with non-epsilon output symbols
on at least one arc out of the state. 

Usage: local/fstaddselfloops.pl <wdisambig_phone> <wdisambig_word> < <openfst_text>
 e.g.: cat L_disambig.txt | local/fstaddselfloops.pl 347 200004 > L_disambig_with_loop.txt
EOU

# Exactly two arguments are required: the integer ids of the word
# disambiguation symbol in the phone (input) and word (output) symbol tables.
if (@ARGV != 2) {
  die $Usage;
}

my $wdisambig_phone = shift @ARGV;
my $wdisambig_word = shift @ARGV;

# Set of states that must receive a self-loop: every final state and every
# state with at least one outgoing arc whose output label is not epsilon (0).
my %states_needs_self_loops;
while (my $line = <>) {
    # The input FST is passed through unchanged; self-loops are appended after.
    print $line;

    # split(' ') ignores leading whitespace, so an indented line does not
    # produce a spurious empty first field.
    my @items = split(' ', $line);
    if (@items == 1 || @items == 2) {
        # Final state: "<state> [<weight>]" (the weight is optional in the
        # OpenFst text format).
        $states_needs_self_loops{$items[0]} = 1;
    } elsif (@items == 4 || @items == 5) {
        # Transducer arc: "<src> <dst> <ilabel> <olabel> [<weight>]".
        my ($src, undef, undef, $outlabel) = @items;
        $states_needs_self_loops{$src} = 1 if ($outlabel != 0);
    } else {
        chomp $line;
        die "Invalid openfst line $.: '$line'\n";
    }
}

# Emit the self-loops in increasing state order so that the output is
# identical from run to run (hash key order is not stable in Perl).
foreach my $state (sort { $a <=> $b } keys %states_needs_self_loops) {
    print "$state $state $wdisambig_phone $wdisambig_word 0.0\n";
}


================================================
FILE: egs/aishell1/local/k2_aishell_prepare_dict.sh
================================================
#!/usr/bin/env bash

# Copyright 2017 Xingyu Na
# Apache 2.0

# prepare dict resources

# . ./path.sh

# Make a failure in any stage of a pipeline (e.g. the perl "die" on a
# malformed phone) reach the "|| exit 1" checks below; without this only the
# exit status of the last stage (sort / the redirection) would be seen.
set -o pipefail

[ $# != 2 ] && echo "Usage: $0 <resource-path> <dest-path>" && exit 1

res_dir=$1
dict_dir=$2
mkdir -p "$dict_dir" || exit 1
# Stop here if the source lexicon is missing; otherwise the append below
# would create a lexicon that contains nothing but the <UNK> entry.
cp "$res_dir/lexicon.txt" "$dict_dir" || exit 1
echo '<UNK> spn' >>"$dict_dir/lexicon.txt"

# nonsilence_phones.txt: collect every phone used in the lexicon (fields 2..NF),
# drop "sil", and put all phones sharing the same non-digit prefix on one line.
awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' "$dict_dir/lexicon.txt" |
  perl -e 'while(<>){ chomp($_); $phone = $_; next if ($phone eq "sil");
    m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$1} .= "$phone "; }
    foreach $l (values %q) {print "$l\n";}
  ' | sort -k1 >"$dict_dir/nonsilence_phones.txt" || exit 1

echo sil >"$dict_dir/silence_phones.txt"

echo sil >"$dict_dir/optional_silence.txt"

# extra_questions.txt: first one line listing the silence phones, then one
# line per trailing digit string, listing the nonsilence phones that end in
# that digit string (phones without trailing digits share one line).

awk '{printf("%s ", $1);} END{printf "\n";}' "$dict_dir/silence_phones.txt" >"$dict_dir/extra_questions.txt" || exit 1
perl -e 'while(<>){ foreach $p (split(" ", $_)) {
  $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \
  <"$dict_dir/nonsilence_phones.txt" >>"$dict_dir/extra_questions.txt" || exit 1

echo "$0: AISHELL dict preparation succeeded"
exit 0


================================================
FILE: egs/aishell1/local/k2_aishell_prepare_dict_char.sh
================================================
#!/usr/bin/env bash
# Build character-level dict for K2 CTC / MMI.
# The token list would be very large (10k+) if we used the aishell lexicon,
# so we use the token list of espnet instead.
#
# Usage: <this-script> <espnet-char-list> <dest-path>
# <dest-path> is deleted and recreated on every run.
[ $# != 2 ] && echo "Usage: $0 <espnet-char-list> <dest-path>" && exit 1

lex=$1
dict=$2

# Start from an empty destination directory.  -f keeps the first run (when
# the directory does not exist yet) from printing an error; the quotes keep
# a path containing spaces or glob characters from being split or expanded.
rm -rf "$dict"
mkdir -p "$dict" || exit 1

# prepare lexicon: skip the first line of the char list and map every
# remaining token (first field) to itself as its one-"phone" pronunciation.
tail -n +2 "$lex" | awk '{print $1, $1}' > "$dict/lexicon.txt" || exit 1
echo "<UNK> spn" >> "$dict/lexicon.txt"
echo "SIL sil" >> "$dict/lexicon.txt"
echo "<SPOKEN_NOISE> sil" >> "$dict/lexicon.txt"

# phones and extra questions
echo sil >"$dict/silence_phones.txt"
echo sil >"$dict/optional_silence.txt"
echo sil >"$dict/extra_questions.txt"
# Every pronunciation except the silence phone itself.  -x matches whole
# lines only, so a token that merely contains "sil" is kept.
cut -d " " -f 2 "$dict/lexicon.txt" | grep -vx "sil" > "$dict/nonsilence_phones.txt"


================================================
FILE: egs/aishell1/local/k2_prepare_lang.sh
================================================
#!/usr/bin/env bash
# Copyright 2012-2013  Johns Hopkins University (Author: Daniel Povey);
#                      Arnab Ghoshal
#                2014  Guoguo Chen
#                2015  Hainan Xu
#                2016  FAU Erlangen (Author: Axel Horndasch)
#                2020  Xiaomi Corporation (Author: Junbo Zhang)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This script prepares a directory such as data/lang/, in the standard format,
# given a source directory containing a dictionary lexicon.txt in a form like:
# word phone1 phone2 ... phoneN
# per line (alternate prons would be separate lines), or a dictionary with probabilities
# called lexiconp.txt in a form:
# word pron-prob phone1 phone2 ... phoneN
# (with 0.0 < pron-prob <= 1.0); note: if lexiconp.txt exists, we use it even if
# lexicon.txt exists.
# and also files silence_phones.txt, nonsilence_phones.txt, optional_silence.txt
# and extra_questions.txt
# Here, silence_phones.txt and nonsilence_phones.txt are lists of silence and
# non-silence phones respectively (where silence includes various kinds of
# noise, laugh, cough, filled pauses etc., and nonsilence phones includes the
# "real" phones.)
# In each line of those files is a list of phones, and the phones on each line
# are assumed to correspond to the same "base phone", i.e. they will be
# different stress or tone variations of the same basic phone.
# The file "optional_silence.txt" contains just a single phone (typically SIL)
# which is used for optional silence in the lexicon.
# extra_questions.txt might be empty; typically will consist of lists of phones,
# all members of each list with the same stress or tone; and also possibly a
# list for the silence phones.  This will augment the automatically generated
# questions (note: the automatically generated ones will treat all the
# stress/tone versions of a phone the same, so will not "get to ask" about
# stress or tone).
#

# This script adds word-position-dependent phones and constructs a host of other
# derived files, that go in data/lang/.

# Begin configuration section.
# These are the defaults for the options listed in the usage message below;
# local/parse_options.sh is sourced after them (presumably to override them
# from "--name value" command-line pairs -- confirm against that script).
num_sil_states=5
num_nonsil_states=3
position_dependent_phones=true
# position_dependent_phones is false also when position dependent phones and word_boundary.txt
# have been generated by another source
share_silence_phones=false  # if true, then share pdfs of different silence
                            # phones together.
sil_prob=0.5
num_extra_phone_disambig_syms=1 # Standard one phone disambiguation symbol is used for optional silence.
                                # Increasing this number does not harm, but is only useful if you later
                                # want to introduce this labels to L_disambig.fst


# end configuration sections

echo "$0 $@"  # Print the command line for logging
. local/parse_options.sh
# Log the silence probability that is in effect after option parsing.
echo "$0: sil_prob=$sil_prob"
if [ $# -ne 4 ]; then
  echo "Usage: $0 [options] <dict-src-dir> <oov-dict-entry> <tmp-dir> <lang-dir>"
  echo "e.g.: $0 data/local/dict <SPOKEN_NOISE> data/local/lang data/lang"
  echo "<dict-src-dir> should contain the following files:"
  echo " extra_questions.txt  lexicon.txt nonsilence_phones.txt  optional_silence.txt  silence_phones.txt"
  echo "See http://kaldi-asr.org/doc/data_prep.html#data_prep_lang_creating for more info."
  echo "<dict-src-dir> may also, for the grammar-decoding case (see http://kaldi-asr.org/doc/grammar.html)"
  echo "contain a file nonterminals.txt containing symbols like #nonterm:contact_list, one per line."
  echo "options: "
  echo "     --num-sil-states <number of states>             # default: 5, #states in silence models."
  echo "     --num-nonsil-states <number of states>          # default: 3, #states in non-silence models."
  echo "     --position-dependent-phones (true|false)        # default: true; if true, use _B, _E, _S & _I"
  echo "                                                     # markers on phones to indicate word-internal positions. "
  echo "     --share-silence-phones (true|false)             # default: false; if true, share pdfs of "
  echo "                                                     # all silence phones. "
  echo "     --sil-prob <probability of silence>             # default: 0.5 [must have 0 <= silprob < 1]"
  exit 1;
fi

# Positional arguments, in the order given in the usage message.
srcdir=$1
oov_word=$2
tmpdir=$3
dir=$4


# Remove a phones/ directory left in the output dir by an earlier run before
# recreating it, so the files written below do not mix with older ones.
if [ -d $dir/phones ]; then
  rm -r $dir/phones
fi
mkdir -p $dir $tmpdir $dir/phones

# silprob selects the lexiconp_silprob.txt code paths further down; it is
# switched on purely by the presence of that file in the source dict dir.
silprob=false
[ -f $srcdir/lexiconp_silprob.txt ] && silprob=true

# Source path.sh from the current directory when there is one.
[ -f path.sh ] && . ./path.sh

# Derive whichever of lexicon.txt / lexiconp.txt is missing from the other:
# lexicon.txt is lexiconp.txt with the second field (the pron-prob) removed,
# lexiconp.txt is lexicon.txt with "1.0<TAB>" inserted after the word.
# Note that the derived file is written into $srcdir, not $tmpdir.
if [[ ! -f $srcdir/lexicon.txt ]]; then
  echo "**Creating $srcdir/lexicon.txt from $srcdir/lexiconp.txt"
  perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' < $srcdir/lexiconp.txt > $srcdir/lexicon.txt || exit 1;
fi
if [[ ! -f $srcdir/lexiconp.txt ]]; then
  echo "**Creating $srcdir/lexiconp.txt from $srcdir/lexicon.txt"
  perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $srcdir/lexiconp.txt || exit 1;
fi

# NOTE(review): unk_fst is not one of the variables initialised in the
# configuration section of this script, so this check only fires if the
# variable is set by the environment or by a sourced script -- confirm
# whether --unk-fst is meant to be supported here.
if [ ! -z "$unk_fst" ] && [ ! -f "$unk_fst" ]; then
  echo "$0: expected --unk-fst $unk_fst to exist as a file"
  exit 1
fi

# Produce the working lexicon in $tmpdir and $tmpdir/phone_map.txt, either
# with word-position markers on every phone or as plain copies.
if $position_dependent_phones; then
  # Create $tmpdir/lexiconp.txt from $srcdir/lexiconp.txt (or
  # $tmpdir/lexiconp_silprob.txt from $srcdir/lexiconp_silprob.txt) by
  # adding the markers _B, _E, _S, _I depending on word position.
  # In this recipe, these markers apply to silence also.
  # Do this starting from lexiconp.txt only.
  if "$silprob"; then
    # Fields per line: word, pron-prob, three silence-related values, then
    # the phones.  A one-phone pronunciation gets _S; otherwise the first
    # phone gets _B, the last _E and all phones in between _I.  The perl
    # "die" fires on an entry without phones.
    # NOTE(review): unlike the branch below, this command has no "|| exit 1".
    perl -ane '@A=split(" ",$_); $w = shift @A; $p = shift @A; $silword_p = shift @A;
              $wordsil_f = shift @A; $wordnonsil_f = shift @A; @A>0||die;
         if(@A==1) { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_S\n"; }
         else { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_B ";
         for($n=1;$n<@A-1;$n++) { print "$A[$n]_I "; } print "$A[$n]_E\n"; } ' \
                < $srcdir/lexiconp_silprob.txt > $tmpdir/lexiconp_silprob.txt
  else
    # Same marking scheme for the plain "word pron-prob phones..." format.
    perl -ane '@A=split(" ",$_); $w = shift @A; $p = shift @A; @A>0||die;
         if(@A==1) { print "$w $p $A[0]_S\n"; } else { print "$w $p $A[0]_B ";
         for($n=1;$n<@A-1;$n++) { print "$A[$n]_I "; } print "$A[$n]_E\n"; } ' \
         < $srcdir/lexiconp.txt > $tmpdir/lexiconp.txt || exit 1;
  fi

  # create $tmpdir/phone_map.txt
  # this has the format (on each line)
  # <original phone> <version 1 of original phone> <version 2> ...
  # where the versions depend on the position of the phone within a word.
  # For instance, we'd have:
  # AA AA_B AA_E AA_I AA_S
  # for (B)egin, (E)nd, (I)nternal and (S)ingleton
  # and in the case of silence
  # SIL SIL SIL_B SIL_E SIL_I SIL_S
  # [because SIL on its own is one of the variants; this is for when it doesn't
  #  occur inside a word but as an option in the lexicon.]

  # This phone map expands the phone lists into all the word-position-dependent
  # versions of the phone lists.
  # "set -f" turns off pathname expansion inside each subshell so that the
  # words read from the phone files are never expanded as glob patterns.  The
  # empty suffix is listed twice for silence phones so that the bare phone
  # appears both as the map key and as one of its variants.
  cat <(set -f; for x in `cat $srcdir/silence_phones.txt`; do for y in "" "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \
    <(set -f; for x in `cat $srcdir/nonsilence_phones.txt`; do for y in "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \
    > $tmpdir/phone_map.txt
else
  # No position markers: the lexicon is copied unchanged ...
  if "$silprob"; then
    cp $srcdir/lexiconp_silprob.txt $tmpdir/lexiconp_silprob.txt
  else
    cp $srcdir/lexiconp.txt $tmpdir/lexiconp.txt
  fi

  # ... and phone_map.txt maps every phone to itself ("<phone> <phone>").
  cat $srcdir/silence_phones.txt $srcdir/nonsilence_phones.txt | \
    awk '{for(n=1;n<=NF;n++) print $n; }' > $tmpdir/phones
  paste -d' ' $tmpdir/phones $tmpdir/phones > $tmpdir/phone_map.txt
fi


# Making monophone systems.
# Pass the silence / nonsilence phone lists through local/apply_map.pl with
# phone_map.txt (presumably replacing each phone by its mapped variants --
# confirm against apply_map.pl) and write the result one phone per line.
cat $srcdir/silence_phones.txt | local/apply_map.pl $tmpdir/phone_map.txt | \
  awk '{for(n=1;n<=NF;n++) print $n;}' > $dir/phones/silence.txt
cat $srcdir/nonsilence_phones.txt | local/apply_map.pl $tmpdir/phone_map.txt | \
  awk '{for(n=1;n<=NF;n++) print $n;}' > $dir/phones/nonsilence.txt
# The optional-silence phone is copied as is, without mapping.
cp $srcdir/optional_silence.txt $dir/phones/optional_silence.txt

# if extra_questions.txt is empty, it's OK.
# (The error from cat is discarded, so a missing file is treated like an
# empty one.)
cat $srcdir/extra_questions.txt 2>/dev/null | local/apply_map.pl $tmpdir/phone_map.txt \
  >$dir/phones/extra_questions.txt

# Want extra questions about the word-start/word-end stuff. Make it separate for
# silence and non-silence. Probably doesn't matter, as silence will rarely
# be inside a word.
# Each loop iteration appends one line: all (non)silence phones carrying the
# same position suffix.  For silence the bare, suffix-less form gets a line too.
if $position_dependent_phones; then
  for suffix in _B _E _I _S; do
    (set -f; for x in `cat $srcdir/nonsilence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt
  done
  for suffix in "" _B _E _I _S; do
    (set -f; for x in `cat $srcdir/silence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt
  done
fi

# add_lex_disambig.pl is responsible for adding disambiguation symbols to
# the lexicon, for telling us how many disambiguation symbols it used,
# and also for modifying the unknown-word's pronunciation (if the
# --unk-fst was provided) to the sequence "#1 #2 #3", and reserving those
# disambig symbols for that purpose.
# The #2 will later be replaced with the actual unk model.  The reason
# for the #1 and the #3 is for disambiguation and also to keep the
# FST compact.  If we didn't have the #1, we might have a different copy of
# the unk-model FST, or at least some of its arcs, for each start-state from
# which an <unk> transition comes (instead of per end-state, which is more compact);
# and adding the #3 prevents us from potentially having 2 copies of the unk-model
# FST due to the optional-silence [the last phone of any word gets 2 arcs].
if [ ! -z "$unk_fst" ]; then  # if the --unk-fst option was provided...
  if "$silprob"; then
    local/lang/internal/modify_unk_pron.py $tmpdir/lexiconp_silprob.txt "$oov_word" || exit 1
  else
    local/lang/internal/modify_unk_pron.py $tmpdir/lexiconp.txt "$oov_word" || exit 1
  fi
  unk_opt="--first-allowed-disambig 4"
else
  unk_opt=
fi

# The number of disambiguation symbols used is read from the stdout of
# add_lex_disambig.pl; the lexicon with the symbols added goes to the
# *_disambig.txt file named as its last argument.
if "$silprob"; then
  ndisambig=$(local/add_lex_disambig.pl $unk_opt --pron-probs --sil-probs $tmpdir/lexiconp_silprob.txt $tmpdir/lexiconp_silprob_disambig.txt)
else
  ndisambig=$(local/add_lex_disambig.pl $unk_opt --pron-probs $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt)
fi
# Stop if no count came back (e.g. add_lex_disambig.pl is missing or failed).
# Without this check an empty value would be treated as 0 by the arithmetic
# below and the script would carry on with an unusable lexicon.
int_re='^[[:space:]]*[0-9]+[[:space:]]*$'
if ! [[ $ndisambig =~ $int_re ]]; then
  echo "$0: local/add_lex_disambig.pl did not print a disambiguation-symbol count (got '$ndisambig')" >&2
  exit 1
fi
ndisambig=$((ndisambig + num_extra_phone_disambig_syms)) # add (at least) one disambig symbol for silence in lexicon FST.
echo $ndisambig > $tmpdir/lex_ndisambig

# Format of lexiconp_disambig.txt:
# !SIL	1.0   SIL_S
# <SPOKEN_NOISE>	1.0   SPN_S #1
# <UNK>	1.0  SPN_S #2
# <NOISE>	1.0  NSN_S
# !EXCLAMATION-POINT	1.0  EH2_B K_I S_I K_I L_I AH0_I M_I EY1_I SH_I AH0_I N_I P_I OY2_I N_I T_E

# disambig.txt lists the symbols #0 .. #$ndisambig, one per line.
( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) >$dir/phones/disambig.txt

# Create phone symbol table.
# <eps> gets id 0; silence phones, nonsilence phones and disambiguation
# symbols follow in that order, numbered consecutively from 1.
echo "<eps>" | cat - $dir/phones/{silence,nonsilence,disambig}.txt | \
  awk '{n=NR-1; print $1, n;}' > $dir/phones.txt

# Create a file that describes the word-boundary information for
# each phone.  5 categories.
# The category is derived from the phone's suffix: _I internal, _B begin,
# _S singleton, _E end; a phone with none of these suffixes is "nonword".
if $position_dependent_phones; then
  cat $dir/phones/{silence,nonsilence}.txt | \
    awk '/_I$/{print $1, "internal"; next;} /_B$/{print $1, "begin"; next; }
         /_S$/{print $1, "singleton"; next;} /_E$/{print $1, "end"; next; }
         {print $1, "nonword";} ' > $dir/phones/word_boundary.txt
else
  # word_boundary.txt might have been generated by another source
  # (copied only if present; otherwise no word_boundary.txt is produced).
  [ -f $srcdir/word_boundary.txt ] && cp $srcdir/word_boundary.txt $dir/phones/word_boundary.txt
fi

# Create word symbol table.
# <s> and </s> are only needed due to the need to rescore lattices with
# ConstArpaLm format language model. They do not normally appear in G.fst or
# L.fst.

# In the silprob case $tmpdir/lexiconp.txt has not been written in $tmpdir yet;
# build it from lexiconp_silprob.txt by dropping fields 3-5 (the silence
# values).  The remaining fields are written tab-separated.
if "$silprob"; then
  # remove the silprob
  cat $tmpdir/lexiconp_silprob.txt |\
    awk '{
      for(i=1; i<=NF; i++) {
        if(i!=3 && i!=4 && i!=5) printf("%s\t", $i); if(i==NF) print "";
      }
    }' > $tmpdir/lexiconp.txt
fi

# words.txt: <eps> is 0, the sorted unique words of the lexicon are numbered
# from 1, and #0, <s>, </s> take the three ids after the last word.  The awk
# program exits with status 1 (and the script aborts) if the lexicon itself
# contains <s> or </s>; the message is routed to stderr through "cat 1>&2".
cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq  | awk '
  BEGIN {
    print "<eps> 0";
  }
  {
    if ($1 == "<s>") {
      print "<s> is in the vocabulary!" | "cat 1>&2"
      exit 1;
    }
    if ($1 == "</s>") {
      print "</s> is in the vocabulary!" | "cat 1>&2"
      exit 1;
    }
    printf("%s %d\n", $1, NR);
  }
  END {
    printf("#0 %d\n", NR+1);
    printf("<s> %d\n", NR+2);
    printf("</s> %d\n", NR+3);
  }' > $dir/words.txt || exit 1;

# format of $dir/words.txt:
#<eps> 0
#a 1
#aa 2
#aardvark 3
#...

# The optional-silence phone is mandatory: abort with an explanation when
# optional_silence.txt is unreadable or empty.
silphone=`cat $srcdir/optional_silence.txt` || exit 1;
[ -z "$silphone" ] && \
  ( echo "You have no optional-silence phone; it is required in the current scripts"
    echo "but you may use the option --sil-prob 0.0 to stop it being used." ) && \
   exit 1;

# Extra options for the make_lexicon_fst*.py calls; always empty in this script.
grammar_opts=

# Create the basic L.fst without disambiguation symbols, for use
# in training.
# Only the text form ($dir/L.fst.txt) is written: the third and fourth
# columns (phone and word) are converted to integer ids with sym2int.pl,
# and the fstcompile/fstarcsort step is left commented out.

if $silprob; then
  # Add silence probabilities (models the prob. of silence before and after each
  # word).  On some setups this helps a bit.  See local/dict_dir_add_pronprobs.sh
  # and where it's called in the example scripts (run.sh).
  local/make_lexicon_fst_silprob.py $grammar_opts --sil-phone=$silphone \
    $tmpdir/lexiconp_silprob.txt $srcdir/silprob.txt | \
    local/sym2int.pl -f 3 $dir/phones.txt | \
    local/sym2int.pl -f 4 $dir/words.txt  > $dir/L.fst.txt || exit 1;

    # fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
    #   --keep_isymbols=false --keep_osymbols=false |   \
    # fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;
else
  local/make_lexicon_fst.py $grammar_opts --sil-prob=$sil_prob --sil-phone=$silphone \
    $tmpdir/lexiconp.txt | \
    local/sym2int.pl -f 3 $dir/phones.txt | \
    local/sym2int.pl -f 4 $dir/words.txt > $dir/L.fst.txt || exit 1;

    # fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
    #   --keep_isymbols=false --keep_osymbols=false | \
    # fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;
fi

# The file oov.txt contains a word that we will map any OOVs to during
# training.
echo "$oov_word" > $dir/oov.txt || exit 1;
cat $dir/oov.txt | local/sym2int.pl $dir/words.txt >$dir/oov.int || exit 1;
# integer version of oov symbol, used in some scripts.


# the file wdisambig.txt contains a (line-by-line) list of the text-form of the
# disambiguation symbols that are used in the grammar and passed through by the
# lexicon.  At this stage it's hardcoded as '#0', but we're laying the groundwork
# for more generality (which probably would be added by another script).
# wdisambig_words.int contains the corresponding list interpreted by the
# symbol table words.txt, and wdisambig_phones.int contains the corresponding
# list interpreted by the symbol table phones.txt.
echo '#0' >$dir/phones/wdisambig.txt

# Integer id of '#0' in the phone and in the word symbol table.  In this
# script the two ids are kept in shell variables (no wdisambig_*.int files
# are written) and passed to local/fstaddselfloops.pl further down.
wdisambig_phone=`local/sym2int.pl $dir/phones.txt <$dir/phones/wdisambig.txt`
wdisambig_word=`local/sym2int.pl $dir/words.txt <$dir/phones/wdisambig.txt`

# Create these lists of phones in colon-separated integer list form too,
# for purposes of being given to programs as command-line options.
# For each list, <name>.int has one id per line and <name>.csl has the same
# ids joined by ':' on a single line (the sed strips the leading ':').
for f in silence nonsilence optional_silence disambig; do
  local/sym2int.pl $dir/phones.txt <$dir/phones/$f.txt >$dir/phones/$f.int
  local/sym2int.pl $dir/phones.txt <$dir/phones/$f.txt | \
   awk '{printf(":%d", $1);} END{printf "\n"}' | sed s/:// > $dir/phones/$f.csl || exit 1;
done

# Integer version of word_boundary.txt (first column mapped to phone ids),
# only when that file was produced or copied earlier in this script.
if [ -f $dir/phones/word_boundary.txt ]; then
  local/sym2int.pl -f 1 $dir/phones.txt <$dir/phones/word_boundary.txt \
    > $dir/phones/word_boundary.int || exit 1;
fi

# NOTE(review): these two variables are not referenced again in the rest of
# this script.
silphonelist=`cat $dir/phones/silence.csl`
nonsilphonelist=`cat $dir/phones/nonsilence.csl`

# Create the lexicon FST with disambiguation symbols, and put it in lang_test.
# There is an extra step where we create a loop to "pass through" the
# disambiguation symbols from G.fst.
# As for L.fst.txt, only the text form is written ($dir/L_disambig.fst.txt).
# The last disambiguation symbol, '#'$ndisambig, is handed to the lexicon
# script as the silence disambiguation symbol, and local/fstaddselfloops.pl
# appends the self-loops labelled with the phone/word ids of '#0'.

if $silprob; then
  local/make_lexicon_fst_silprob.py $grammar_opts \
    --sil-phone=$silphone --sil-disambig='#'$ndisambig \
    $tmpdir/lexiconp_silprob_disambig.txt $srcdir/silprob.txt | \
    local/sym2int.pl -f 3 $dir/phones.txt | \
    local/sym2int.pl -f 4 $dir/words.txt | \
    local/fstaddselfloops.pl $wdisambig_phone $wdisambig_word > $dir/L_disambig.fst.txt || exit 1;
else
  local/make_lexicon_fst.py $grammar_opts \
    --sil-prob=$sil_prob --sil-phone=$silphone --sil-disambig='#'$ndisambig \
    $tmpdir/lexiconp_disambig.txt | \
    local/sym2int.pl -f 3 $dir/phones.txt | \
    local/sym2int.pl -f 4 $dir/words.txt | \
    local/fstaddselfloops.pl $wdisambig_phone $wdisambig_word > $dir/L_disambig.fst.txt || exit 1;
fi

exit 0;


================================================
FILE: egs/aishell1/local/make_lexicon_fst.py
================================================
#!/usr/bin/env python3

# Copyright   2018  Johns Hopkins University (author: Daniel Povey)
# Apache 2.0.

# see get_args() below for usage message.
import argparse
import os
import sys
import math
import re

# The use of latin-1 encoding does not preclude reading utf-8.  latin-1
# encoding means "treat words as sequences of bytes", and it is compatible
# with utf-8 encoding as well as other encodings such as gbk, as long as the
# spaces are also spaces in ascii (which we check).  It is basically how we
# emulate the behavior of python before python3.
sys.stdout = open(1, 'w', encoding='latin-1', closefd=False)
sys.stderr = open(2, 'w', encoding='latin-1', closefd=False)

def get_args():
    """Parse the command line of this script.

    Returns:
        The argparse namespace, with attributes 'sil_phone', 'sil_prob'
        (float, default 0.0), 'sil_disambig', 'left_context_phones',
        'nonterminals' and the positional 'lexiconp'.  Optional string
        arguments that were not given are None.
    """
    parser = argparse.ArgumentParser(description="""This script creates the
       text form of a lexicon FST, to be compiled by fstcompile using the
       appropriate symbol tables (phones.txt and words.txt) .  It will mostly
       be invoked indirectly via utils/prepare_lang.sh.  The output goes to
       the stdout.""")

    # The help texts cross-reference the options by the names actually
    # registered below (--sil-prob / --sil-phone).
    parser.add_argument('--sil-phone', dest='sil_phone', type=str,
                        help="""Text form of optional-silence phone, e.g. 'SIL'.  See also
                        the --sil-prob option.""")
    parser.add_argument('--sil-prob', dest='sil_prob', type=float, default=0.0,
                        help="""Probability of silence between words (including at the
                        beginning and end of word sequences).  Must be in the range [0.0, 1.0].
                        This refers to the optional silence inserted by the lexicon; see
                        the --sil-phone option.""")
    parser.add_argument('--sil-disambig', dest='sil_disambig', type=str,
                        help="""Disambiguation symbol to disambiguate silence, e.g. #5.
                        Will only be supplied if you are creating the version of L.fst
                        with disambiguation symbols, intended for use with cyclic G.fst.
                        This symbol was introduced to fix a rather obscure source of
                        nondeterminism of CLG.fst, that has to do with reordering of
                        disambiguation symbols and phone symbols.""")
    parser.add_argument('--left-context-phones', dest='left_context_phones', type=str,
                        help="""Only relevant if --nonterminals is also supplied; this relates
                        to grammar decoding (see http://kaldi-asr.org/doc/grammar.html or
                        src/doc/grammar.dox).  Format is a list of left-context phones,
                        in text form, one per line.  E.g. data/lang/phones/left_context_phones.txt""")
    parser.add_argument('--nonterminals', type=str,
                        help="""If supplied, --left-context-phones must also be supplied.
                        List of user-defined nonterminal symbols such as #nonterm:contact_list,
                        one per line.  E.g. data/local/dict/nonterminals.txt.""")
    parser.add_argument('lexiconp', type=str,
                        help="""Filename of lexicon with pronunciation probabilities
                        (normally lexiconp.txt), with lines of the form 'word prob p1 p2...',
                        e.g. 'a   1.0    ay'""")
    args = parser.parse_args()
    return args


def read_lexiconp(filename):
    """Read a lexicon with pronunciation probabilities.

    Each line of 'filename' must look like 'word pron-prob p1 p2 ...', with
    the fields separated by spaces and/or tabs.

    Returns a list of tuples (word, pron_prob, pron), where 'word' is a string,
    'pron_prob', a float, is the pronunciation probability (which must be >0.0
    and would normally be <=1.0), and 'pron' is a list of strings representing
    phones.  An element in the returned list might be
    ('hello', 1.0, ['h', 'eh', 'l', 'ow']).

    Prints an error to stderr and exits with status 1 if a line has fewer
    than two fields, if the word is '<eps>', if the second field is not a
    number, if the pron-prob is not positive, or if the file contains no
    entries at all.  An empty pronunciation or a pron-prob above 1.0 only
    produces a warning on stderr.
    """

    ans = []
    found_empty_prons = False
    found_large_pronprobs = False
    whitespace = re.compile("[ \t]+")
    # See the comment near the top of this file, RE why we use latin-1.
    with open(filename, 'r', encoding='latin-1') as f:
        for line in f:
            stripped = line.strip(" \t\r\n")
            a = whitespace.split(stripped)
            if len(a) < 2:
                print("{0}: error: found bad line '{1}' in lexicon file {2} ".format(
                    sys.argv[0], stripped, filename), file=sys.stderr)
                sys.exit(1)
            word = a[0]
            if word == "<eps>":
                # This would clash with the epsilon symbol normally used in OpenFst.
                print("{0}: error: found <eps> as a word in lexicon file "
                      "{1}".format(sys.argv[0], filename), file=sys.stderr)
                sys.exit(1)
            try:
                pron_prob = float(a[1])
            except ValueError:
                print("{0}: error: found bad line '{1}' in lexicon file {2}, 2nd field "
                      "should be pron-prob".format(sys.argv[0], stripped, filename),
                      file=sys.stderr)
                sys.exit(1)
            prons = a[2:]
            if pron_prob <= 0.0:
                print("{0}: error: invalid pron-prob in line '{1}' of lexicon file {2} ".format(
                    sys.argv[0], stripped, filename), file=sys.stderr)
                sys.exit(1)
            if len(prons) == 0:
                found_empty_prons = True
            ans.append((word, pron_prob, prons))
            if pron_prob > 1.0:
                found_large_pronprobs = True
    if found_empty_prons:
        print("{0}: warning: found at least one word with an empty pronunciation "
              "in lexicon file {1}.".format(sys.argv[0], filename),
              file=sys.stderr)
    if found_large_pronprobs:
        print("{0}: warning: found at least one word with pron-prob >1.0 "
              "in {1}".format(sys.argv[0], filename), file=sys.stderr)

    if len(ans) == 0:
        print("{0}: error: found no pronunciations in lexicon file {1}".format(
            sys.argv[0], filename), file=sys.stderr)
        sys.exit(1)
    return ans


def write_nonterminal_arcs(start_state, loop_state, next_state,
                           nonterminals, left_context_phones):
    """Print the extra lexicon-FST arcs needed for grammar decoding.

    This relates to the grammar-decoding setup, see
    kaldi-asr.org/doc/grammar.html (section "Special symbols in L.fst",
    kaldi-asr.org/doc/grammar.html#grammar_special_l).  The arcs are written
    to the stdout in OpenFst text form.

       start_state: the start-state of L.fst.
       loop_state:  the state of high out-degree in L.fst where words leave
                  and enter.
       next_state: the first state number this function may allocate; two
                  states are allocated from it.
       nonterminals: the user-defined nonterminal symbols as a list of
          strings, e.g. ['#nonterm:contact_list', ... ].
       left_context_phones: a list of phones that may appear as left-context,
          e.g. ['a', 'ah', ... '#nonterm_bos'].

    Returns the updated value of next_state.
    """
    def emit_arc(src, dest, phone, word, cost):
        print("{src}\t{dest}\t{phone}\t{word}\t{cost}".format(
            src=src, dest=dest, phone=phone, word=word, cost=cost))

    # Two fresh states: one shared by all nonterminal entries, one final.
    shared_state, final_state = next_state, next_state + 1
    next_state += 2

    emit_arc(start_state, shared_state, '#nonterm_begin', '#nonterm_begin', 0.0)

    for symbol in nonterminals:
        emit_arc(loop_state, shared_state, symbol, symbol, 0.0)

    # Equal to log(len(left_context_phones)); written this way to show that
    # each left-context arc gets probability 1/N, which keeps the FST
    # stochastic (sum-to-one) so that weight pushing behaves.  The
    # grammar-FST code cancels this cost out when splicing (CombineArcs()).
    per_phone_cost = -math.log(1.0 / len(left_context_phones))

    for phone in left_context_phones:
        emit_arc(shared_state, loop_state, phone, '<eps>', per_phone_cost)

    # Leave the loop state through #nonterm_end into the new final state.
    emit_arc(loop_state, final_state, '#nonterm_end', '#nonterm_end', 0.0)
    print("{state}\t{final_cost}".format(
        state=final_state, final_cost=0.0))
    return next_state


def write_fst_no_silence(lexicon, nonterminals=None, left_context_phones=None):
    """Print L.fst in text form to the stdout, without optional silence.

    This is the variant used when --sil-prob=0.0.

      'lexicon' is a list of 3-tuples (word, pron-prob, prons) as returned by
        read_lexiconp().
      'nonterminals', which relates to grammar decoding (see
        kaldi-asr.org/doc/grammar.html), is either None or the user-defined
        nonterminal symbols as a list of strings,
        e.g. ['#nonterm:contact_list', ... ].
      'left_context_phones', which also relates to grammar decoding and must
        be supplied if 'nonterminals' is supplied, is either None or a list of
        phones that may appear as left-context,
        e.g. ['a', 'ah', ... '#nonterm_bos'].
    """
    arc_format = "{src}\t{dest}\t{phone}\t{word}\t{cost}"

    loop_state = 0   # start state, final state, and where every word begins/ends
    next_state = 1   # lowest state number not yet in use
    for word, pronprob, pron in lexicon:
        pron_cost = -math.log(pronprob)
        src = loop_state

        # All phones but the last open a new intermediate state; the word
        # label and the pronunciation cost sit on the first arc only.
        for position, phone in enumerate(pron[:-1]):
            is_first = position == 0
            print(arc_format.format(
                src=src,
                dest=next_state,
                phone=phone,
                word=word if is_first else '<eps>',
                cost=pron_cost if is_first else 0.0))
            src = next_state
            next_state += 1

        # The closing arc returns to the loop state.  For a pronunciation of
        # zero or one phone it is also the first arc, so it carries the word
        # and the cost; an empty pronunciation uses <eps> as its phone.
        if len(pron) > 1:
            closing_phone, closing_word, closing_cost = pron[-1], '<eps>', 0.0
        else:
            closing_phone = pron[0] if pron else '<eps>'
            closing_word, closing_cost = word, pron_cost
        print(arc_format.format(
            src=src,
            dest=loop_state,
            phone=closing_phone,
            word=closing_word,
            cost=closing_cost))

    if nonterminals is not None:
        next_state = write_nonterminal_arcs(
            loop_state, loop_state, next_state,
            nonterminals, left_context_phones)

    print("{state}\t{final_cost}".format(
        state=loop_state,
        final_cost=0.0))


def write_fst_with_silence(lexicon, sil_prob, sil_phone, sil_disambig,
                           nonterminals=None, left_context_phones=None):
    """Print the text form of L.fst to stdout when optional silence is allowed.

    This is the --sil-prob != 0.0 variant.  Three states are fixed up front:
    state 0 is the start state, state 1 is the loop state that words enter and
    leave from (and the only final state), and state 2 is the silence state,
    which words end in when they are followed by silence.

    'lexicon' is a list of 3-tuples (word, pron-prob, prons) as returned by
      read_lexiconp().
    'sil_prob', which must be strictly between 0.0 and 1.0, is the
      probability of silence.
    'sil_phone' is the silence phone, e.g. "SIL".
    'sil_disambig' is either None, or the silence disambiguation symbol,
      e.g. "#5".
    'nonterminals', which relates to grammar decoding
      (see kaldi-asr.org/doc/grammar.html), is either None or the user-defined
      nonterminal symbols as a list of strings,
      e.g. ['#nonterm:contact_list', ... ].
    'left_context_phones', which also relates to grammar decoding and must be
      supplied if 'nonterminals' is supplied, is either None or a list of
      phones that may appear as left-context,
      e.g. ['a', 'ah', ... '#nonterm_bos'].
    """

    assert 0.0 < sil_prob < 1.0
    sil_cost = -math.log(sil_prob)
    no_sil_cost = -math.log(1.0 - sil_prob)

    start_state, loop_state, sil_state = 0, 1, 2
    next_state = 3  # first state id that has not been handed out yet

    def emit(src, dest, phone, word, cost):
        # One arc line of the FST text format.
        print('{src}\t{dest}\t{phone}\t{word}\t{cost}'.format(
            src=src, dest=dest, phone=phone, word=word, cost=cost))

    # From the start state either go straight to the loop state or pass
    # through the silence state first.
    emit(start_state, loop_state, '<eps>', '<eps>', no_sil_cost)
    emit(start_state, sil_state, '<eps>', '<eps>', sil_cost)

    # The silence state reaches the loop state on the silence phone; with a
    # disambiguation symbol an extra state is inserted to emit it afterwards.
    if sil_disambig is not None:
        sil_disambig_state = next_state
        next_state += 1
        emit(sil_state, sil_disambig_state, sil_phone, '<eps>', 0.0)
        emit(sil_disambig_state, loop_state, sil_disambig, '<eps>', 0.0)
    else:
        emit(sil_state, loop_state, sil_phone, '<eps>', 0.0)

    for word, pronprob, pron in lexicon:
        pron_cost = -math.log(pronprob)
        src = loop_state

        # Every phone except the last one opens a freshly allocated state.
        for pos, phone in enumerate(pron[:-1]):
            at_start = pos == 0
            emit(src, next_state, phone,
                 word if at_start else '<eps>',
                 pron_cost if at_start else 0.0)
            src = next_state
            next_state += 1

        # The last phone is written twice: once returning to the loop state
        # (no silence follows) and once entering the silence state.  With
        # zero or one phone this arc is also the first one, so it carries the
        # word and pronunciation cost; an empty pronunciation uses <eps>.
        single_arc = len(pron) <= 1
        last_phone = pron[-1] if pron else '<eps>'
        out_word = word if single_arc else '<eps>'
        carried_cost = pron_cost if single_arc else 0.0
        emit(src, loop_state, last_phone, out_word, no_sil_cost + carried_cost)
        emit(src, sil_state, last_phone, out_word, sil_cost + carried_cost)

    if nonterminals is not None:
        next_state = write_nonterminal_arcs(
            start_state, loop_state, next_state,
            nonterminals, left_context_phones)

    # Final-state line: the loop state, with zero final cost.
    print("{state}\t{final_cost}".format(
        state=loop_state,
        final_cost=0.0))


def write_words_txt(orig_lines, highest_numbered_symbol, nonterminals, filename):
    """Write an extended words.txt to 'filename'.

    'orig_lines' holds the lines of the original words.txt (strings without
    newlines); they are copied through unchanged.  After them, the symbols
    '#nonterm_begin', '#nonterm_end' and every entry of 'nonterminals' (a list
    of strings like '#nonterm:foo') are appended, numbered consecutively
    starting just above 'highest_numbered_symbol', the highest symbol id of
    the original file."""
    with open(filename, 'w', encoding='latin-1') as out:
        for original in orig_lines:
            print(original, file=out)

        symbol_id = highest_numbered_symbol
        for symbol in ['#nonterm_begin', '#nonterm_end'] + nonterminals:
            symbol_id = symbol_id + 1
            print("{0} {1}".format(symbol, symbol_id), file=out)


def read_nonterminals(filename):
    """Read and validate the user-defined nonterminal symbols in 'filename'.

    The file holds one symbol per line; surrounding spaces, tabs and line
    endings are stripped.  Returns the symbols as a list of strings, e.g.
    ['#nonterm:contact_list', '#nonterm:phone_number', ... ].

    Raises RuntimeError if the file is empty, if any line does not start with
    '#nonterm:', or if a symbol occurs more than once."""
    # Use a context manager so the handle is closed deterministically instead
    # of being left to the garbage collector.
    with open(filename, 'r', encoding='latin-1') as f:
        ans = [line.strip(" \t\r\n") for line in f]
    if not ans:
        raise RuntimeError("The file {0} contains no nonterminals symbols.".format(filename))
    for nonterm in ans:
        if not nonterm.startswith('#nonterm:'):
            raise RuntimeError("In file '{0}', expected nonterminal symbols to start with '#nonterm:', found '{1}'"
                               .format(filename, nonterm))
    if len(set(ans)) != len(ans):
        raise RuntimeError("Duplicate nonterminal symbols are present in file {0}".format(filename))
    return ans

def read_left_context_phones(filename):
    """Read, check and return the left-context phones listed in 'filename'.

    The file holds one phone per line in text form; surrounding spaces, tabs
    and line endings are stripped.  Returns a list of strings, e.g.
    ['a', 'ah', ..., '#nonterm_bos' ].

    Raises RuntimeError if the file is empty, if a line still contains
    internal spaces or tabs (more than one token), or if a phone is listed
    more than once."""
    # Use a context manager so the handle is closed deterministically instead
    # of being left to the garbage collector.
    with open(filename, 'r', encoding='latin-1') as f:
        ans = [line.strip(" \t\r\n") for line in f]
    if not ans:
        raise RuntimeError("The file {0} contains no left-context phones.".format(filename))
    whitespace = re.compile("[ \t]+")
    for s in ans:
        if len(whitespace.split(s)) != 1:
            raise RuntimeError("The file {0} contains an invalid line '{1}'".format(filename, s))

    if len(set(ans)) != len(ans):
        # This file lists phones, not nonterminals, so say so in the message.
        raise RuntimeError("Duplicate left-context phones are present in file {0}".format(filename))
    return ans


def is_token(s):
    """Return True if s is a str containing no space, tab, CR or newline.

    Anything that is not a str yields False; the empty string counts as a
    token because it contains no whitespace."""
    return isinstance(s, str) and re.search("[ \t\r\n]+", s) is None


def main():
    """Command-line entry point: read the lexicon and print L.fst to stdout.

    Parses the arguments with get_args(), reads the lexicon named by
    args.lexiconp, optionally loads the grammar-decoding symbol lists, and
    then calls write_fst_no_silence() when --sil-prob is exactly 0.0 or
    write_fst_with_silence() otherwise.  Invalid option combinations are
    reported on stderr and terminate the program with exit status 1.
    """
    args = get_args()

    lexicon = read_lexiconp(args.lexiconp)

    if args.nonterminals is None:
        nonterminals, left_context_phones = None, None
    else:
        if args.left_context_phones is None:
            # Diagnostics go to stderr, like every other error below, so that
            # they never get mixed into the FST text written to stdout.
            print("{0}: if --nonterminals is specified, --left-context-phones must also "
                  "be specified".format(sys.argv[0]), file=sys.stderr)
            sys.exit(1)
        nonterminals = read_nonterminals(args.nonterminals)
        left_context_phones = read_left_context_phones(args.left_context_phones)

    if args.sil_prob == 0.0:
        write_fst_no_silence(lexicon,
                             nonterminals=nonterminals,
                             left_context_phones=left_context_phones)
    else:
        # Do some checking that the options make sense.
        if args.sil_prob < 0.0 or args.sil_prob >= 1.0:
            print("{0}: invalid value specified --sil-prob={1}".format(
                sys.argv[0], args.sil_prob), file=sys.stderr)
            sys.exit(1)

        if not is_token(args.sil_phone):
            print("{0}: you specified --sil-prob={1} but --sil-phone is set "
                  "to '{2}'".format(sys.argv[0], args.sil_prob, args.sil_phone),
                  file=sys.stderr)
            sys.exit(1)
        if args.sil_disambig is not None and not is_token(args.sil_disambig):
            print("{0}: invalid value --sil-disambig='{1}' was specified."
                  "".format(sys.argv[0], args.sil_disambig), file=sys.stderr)
            sys.exit(1)
        write_fst_with_silence(lexicon, args.sil_prob, args.sil_phone,
                               args.sil_disambig,
                               nonterminals=nonterminals,
                               left_context_phones=left_context_phones)


#    (lines, highest_symbol) = read_words_txt(args.input_words_txt)
#    nonterminals = read_nonterminals(args.nonterminal_symbols_list)
#    write_words_txt(lines, highest_symbol, nonterminals, args.output_words_txt)


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported.
if __name__ == '__main__':
      main()


================================================
FILE: egs/aishell1/local/max_rescore.py
================================================
import sys
import json
import codecs
import copy

def keep_oracle_hypotheses(j):
    """Reduce every utterance's n-best list in 'j' to a single hypothesis.

    'j' is the decoded-result dict: j["utts"][name]["output"] is a ranked list
    of hypotheses, each a dict with at least "text", "rec_text" and
    "mmi_tot_score".  For each utterance the first hypothesis below rank 0
    whose "rec_text" (with "<eos>" removed) equals its "text" is kept;
    otherwise the top-ranked hypothesis is kept.  'j' is modified in place.

    Returns a dict mapping each utterance whose kept hypothesis was not the
    top-ranked one to a deep copy of [top-ranked hypothesis, kept hypothesis].
    """
    best_dict = {}
    for name, utt in j["utts"].items():
        hyp_lst = utt["output"]
        kept = hyp_lst[:1]
        for idx in range(1, len(hyp_lst)):
            hyp = hyp_lst[idx]
            if hyp["text"] != hyp["rec_text"].replace("<eos>", ""):
                continue

            best_dict[name] = copy.deepcopy([hyp_lst[0], hyp])
            print(f"{name}: {idx}-th is the best")
            if hyp_lst[0]["mmi_tot_score"] - hyp["mmi_tot_score"] < -1e-5:
                print("May be corrected by MMI")

            kept = [hyp]
            # Stop at the first match.  The previous code rebound the list
            # being enumerated and kept looping, so a second matching
            # hypothesis indexed a one-element list and raised IndexError.
            break
        utt["output"] = kept
    return best_dict


def _dump_json(obj, path):
    """Write 'obj' to 'path' as indented, key-sorted, UTF-8 encoded JSON."""
    with open(path, "wb") as f:
        f.write(
            json.dumps(
                obj, indent=4, ensure_ascii=False, sort_keys=True
            ).encode("utf_8")
        )


def main(json_f, json_f_out, best_dict_f):
    """Read results from 'json_f', keep one hypothesis per utterance and save.

    The reduced results are written to 'json_f_out'; the utterances whose kept
    hypothesis was not top-ranked are written to 'best_dict_f'.
    """
    with codecs.open(json_f, "r", encoding="utf-8") as f:
        j = json.load(f)

    best_dict = keep_oracle_hypotheses(j)

    _dump_json(j, json_f_out)
    _dump_json(best_dict, best_dict_f)


if __name__ == "__main__":
    main(sys.argv[1], sys.argv[2], sys.argv[3])


================================================
FILE: egs/aishell1/local/parse_options.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey);
#                 Arnab Ghoshal, Karel Vesely

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# Parse command-line options.
# To be sourced by another script (as in ". parse_options.sh").
# Option format is: --option-name arg
# and shell variable "option_name" gets set to value "arg."
# The exception is --help, which takes no arguments, but prints the
# $help_message variable (if defined).


###
### The --config file options have lower priority to command line
### options, so we need to import them first...
###

# Now import all the configs specified by command-line, in left-to-right order.
# Every "--config <file>" pair found among the positional parameters is
# sourced here, before the options themselves are processed, so that values
# given directly on the command line override the ones from config files.
# The last argument is never tested because "--config" needs a value after it.
for ((argpos=1; argpos<$#; argpos++)); do
  if [ "${!argpos}" == "--config" ]; then
    argpos_plus1=$((argpos+1))
    config=${!argpos_plus1}
    # Quote the path so that config names containing spaces or glob characters
    # are tested and sourced as a single word; report the error on stderr like
    # the other diagnostics in this script.
    [ ! -r "$config" ] && echo "$0: missing config '$config'" 1>&2 && exit 1
    . "$config"  # source the config file.
  fi
done


###
### Now we process the command line options
###
# Consume leading "--name value" pairs from the positional parameters and
# assign each value to the already-defined shell variable "name" (dashes
# mapped to underscores).  Processing stops at the first argument that is
# empty or does not start with "--"; the remaining arguments are left in place
# for the enclosing script.
while true; do
  [ -z "${1:-}" ] && break;  # break if there are no arguments
  case "$1" in
    # If the enclosing script is called with --help option, print the help
    # message and exit.  Scripts should put help messages in $help_message
    --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
      else printf "$help_message\n" 1>&2 ; fi;
      exit 0 ;;
    # The "--name=value" spelling is rejected explicitly.
    --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
      exit 1 ;;
    # If the first command-line argument begins with "--" (e.g. --foo-bar),
    # then work out the variable name as $name, which will equal "foo_bar".
    --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
      # Next we test whether the variable in question is undefined-- if so it's
      # an invalid option and we die.  Note: $0 evaluates to the name of the
      # enclosing script.
      # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
      # is undefined.  We then have to wrap this test inside "eval" because
      # foo_bar is itself inside a variable ($name).
      eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;

      # Remember the variable's current (default) value before overwriting it.
      oldval="`eval echo \\$$name`";
      # Work out whether we seem to be expecting a Boolean argument.
      if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
        was_bool=true;
      else
        was_bool=false;
      fi

      # Set the variable to the right value-- the escaped quotes make it work if
      # the option had spaces, like --cmd "queue.pl -sync y"
      eval $name=\"$2\";

      # Check that Boolean-valued arguments are really Boolean.
      if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
        exit 1;
      fi
      # Drop the option name and its value, then look at the next argument.
      shift 2;
      ;;
  # Anything else is the first non-option argument: stop parsing.
  *) break;
  esac
done


# Check for an empty argument to the --cmd option, which can easily occur as a
# result of scripting errors.
[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;


true; # so this script returns exit code 0.


================================================
FILE: egs/aishell1/local/parse_text_jieba.py
================================================
import jieba
import sys

# Usage: parse_text_jieba.py <in_text> <out_report> <word_dict>
#   in_text:    transcript file, one "<uttid> <token> <token> ..." per line
#   out_report: receives one line per utterance whose jieba segmentation
#               differs from the segmentation already present in in_text
#   word_dict:  user dictionary handed to jieba.load_userdict()
in_f = sys.argv[1]
out_f = sys.argv[2]
word_dict = sys.argv[3]

jieba.load_userdict(word_dict)

# Context managers close both files even if segmentation raises part-way.
with open(out_f, 'w', encoding="utf8") as writer, \
        open(in_f, encoding="utf8") as reader:
    for line in reader:
        elems = line.split()
        if not elems:
            # Blank line: no utterance id and no transcript to compare.
            continue
        # elems[0] is the utterance id; the remaining fields are the
        # transcript as segmented in the input file.
        trans = "".join(elems[1:])
        trans_seg = " ".join(elems[1:])
        # Re-segment the unspaced transcript with jieba (accurate mode).
        trans_seg_jieba = " ".join(jieba.cut(trans, cut_all=False))
        if trans_seg_jieba != trans_seg:
            writer.write(f"Initail: {trans_seg} | jieba: {trans_seg_jieba}\n")


================================================
FILE: egs/aishell1/local/prepare_word_lex.py
================================================
"""
Make a word-level lexicon for MMI training.
Previous lexicon accepts phones, here this lexicon accepts words.

Usage: prepare_word_lex.py <in_lexicon> <out_lexicon> <char_out_lexicon>
"""
import sys


def write_word_lexicons(in_f, out_f, char_out_f):
    """Derive a word lexicon and a single-character lexicon from 'in_f'.

    The first two lines of 'in_f' (special tokens) are copied verbatim to both
    outputs.  For every later line only the first field, the word, is used:
    'out_f' gets "<word> <char> <char> ..." with the word spelled out character
    by character, and 'char_out_f' gets "<word> <word>" for words that are a
    single character long.  Blank lines after the header are skipped.
    """
    with open(out_f, 'w', encoding="utf8") as writer, \
            open(char_out_f, 'w', encoding="utf8") as char_writer, \
            open(in_f, encoding="utf8") as reader:
        for cnt, line in enumerate(reader, 1):
            # The first two lines should be kept: special tokens
            if cnt <= 2:
                writer.write(line)
                char_writer.write(line)
                continue

            fields = line.split()
            if not fields:
                # A blank line has no word; indexing it would raise IndexError.
                continue

            word = fields[0]
            writer.write(word + " " + " ".join(word) + "\n")

            if len(word) == 1:
                char_writer.write(f"{word} {word}\n")


if __name__ == "__main__":
    write_word_lexicons(sys.argv[1], sys.argv[2], sys.argv[3])
 

================================================
FILE: egs/aishell1/local/sym2int.pl
================================================
#!/usr/bin/env perl
# Copyright 2010-2012 Microsoft Corporation  Johns Hopkins University (Author: Daniel Povey)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# NOTE(review): $ignore_oov is assigned here but never read in this script.
$ignore_oov = 0;

# Parse the leading options.  The loop body runs twice so that --map-oov and
# -f are both recognised whichever of the two comes first.
for($x = 0; $x < 2; $x++) {
  if ($ARGV[0] eq "--map-oov") {
    shift @ARGV;
    $map_oov = shift @ARGV;
    if ($map_oov eq "-f" || $map_oov =~ m/words\.txt$/ || $map_oov eq "") {
      # disallow '-f', the empty string and anything ending in words.txt as the
      # OOV symbol because these are likely command-line errors.
      die "the --map-oov option requires an argument";
    }
  }
  if ($ARGV[0] eq "-f") {
    shift @ARGV;
    $field_spec = shift @ARGV;
    # A single number N selects exactly field N (stored zero-based).
    if ($field_spec =~ m/^\d+$/) {
      $field_begin = $field_spec - 1; $field_end = $field_spec - 1;
    }
    # A range "A-B"; either end may be omitted, which leaves that bound
    # undefined (i.e. open-ended).
    if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesy (properly, 1-10)
      if ($1 ne "") {
        $field_begin = $1 - 1;  # Change to zero-based indexing.
      }
      if ($2 ne "") {
        $field_end = $2 - 1;    # Change to zero-based indexing.
      }
    }
    # Neither form produced a bound, so the specification is unusable.
    if (!defined $field_begin && !defined $field_end) {
      die "Bad argument to -f option: $field_spec";
    }
  }
}

# The first remaining argument is the symbol table: a text file with one
# "<symbol> <integer>" pair per line.  It is loaded into %sym2int.
$symtab = shift @ARGV;
if (!defined $symtab) {
  print STDERR "Usage: sym2int.pl [options] symtab [input transcriptions] > output transcriptions\n" .
    "options: [--map-oov <oov-symbol> ]  [-f <field-range> ]\n" .
      "note: <field-range> can look like 4-5, or 4-, or 5-, or 1.\n";
  # Stop here: without a symbol table there is nothing to open, and carrying
  # on would only fail later with a confusing "Error opening" message.
  exit(1);
}
open(F, "<$symtab") || die "Error opening symbol table file $symtab";
while(<F>) {
    @A = split(" ", $_);
    @A == 2 || die "bad line in symbol table file: $_";
    $sym2int{$A[0]} = $A[1] + 0;  # "+ 0" forces the id to be numeric.
}
close(F);

# If the OOV replacement was given as a symbol rather than as an integer,
# translate it through the symbol table once, up front.
if (defined $map_oov && $map_oov !~ m/^\d+$/) { # not numeric-> look it up
  if (!defined $sym2int{$map_oov}) { die "OOV symbol $map_oov not defined."; }
  $map_oov = $sym2int{$map_oov};
}

# Only the first $max_warning OOV replacements are reported individually.
$num_warning = 0;
$max_warning = 20;

# Read the transcriptions (files named on the command line, else stdin) and
# replace every whitespace-separated field inside the selected range by its
# integer id; fields outside the range are copied through unchanged.
while (<>) {
  @A = split(" ", $_);
  @B = ();
  for ($n = 0; $n < @A; $n++) {
    $a = $A[$n];
    if ( (!defined $field_begin || $n >= $field_begin)
         && (!defined $field_end || $n <= $field_end)) {
      $i = $sym2int{$a};
      if (!defined ($i)) {
        if (defined $map_oov) {
          if ($num_warning++ < $max_warning) {
            print STDERR "sym2int.pl: replacing $a with $map_oov\n";
            if ($num_warning == $max_warning) {
              print STDERR "sym2int.pl: not warning for OOVs any more times\n";
            }
          }
          $i = $map_oov;
        }
        # NOTE(review): when --map-oov is not given, $i stays undefined here,
        # so an unknown symbol is printed as an empty field instead of raising
        # an error -- confirm that this is intended.
      }
      $a = $i;
    }
    push @B, $a;
  }
  print join(" ", @B);
  print "\n";
}
if ($num_warning > 0) {
  print STDERR "** Replaced $num_warning instances of OOVs with $map_oov\n";
}

exit(0);


================================================
FILE: egs/aishell1/nt.sh
================================================
#!/usr/bin/env bash

# author: tyriontian
# tyriontian@tencent.com

# Source the environment setup and the job-command definitions; abort if
# either file is missing or fails.
. ./path.sh || exit 1;
. ./cmd.sh || exit 1;

# Every variable assigned between here and the parse_options.sh call below is
# a default value.

# general configuration
backend=pytorch
stage=0        # start from 0 if you need to start from data preparation
stop_stage=100
debugmode=1
dumpdir=dump   # directory to dump full features
N=0            # number of minibatches to be used (mainly for debugging). "0" uses all minibatches.
verbose=0      # verbose option
debug=false

# feature configuration
do_delta=false

preprocess_config=conf/specaug.yaml
train_config=conf/tuning/transducer/train_conformer-rnn_transducer_aux_ngpu4.yaml
lm_config=conf/lm.yaml
decode_config=conf/tuning/transducer/decode_default.yaml

# NOTE(review): lm_config, lm_resume, lmtag, ngramtag, n_gram, data, data_url,
# ngram_order and max_job are defined in this section but are not referenced
# by any command later in this script.

# rnnlm related
lm_resume=         # specify a snapshot file to resume LM training
lmtag=             # tag for managing LMs

# ngram
ngramtag=
n_gram=4

# decoding parameter
recog_model=model.acc.best # set a model to be used for decoding: 'model.acc.best' or 'model.loss.best'

# data
data=/data/asr_data/aishell/
data_url=www.openslr.org/resources/33
dict=data/lang_1char/train_sp_units.txt
lang=data/lang_phone

### Configurable parameters ###
tag="8v100_ddp_rnnt_mmi"
ngpu=8

# Train config
seed=888
batch_size=8
accum_grad=1
epochs=100
use_segment=true # if true, use word-level transcription in MMI criterion
aux_ctc=true
aux_ctc_weight=0.5
aux_ctc_dropout_rate=0.1
aux_mmi=true
aux_mmi_weight=0.5
aux_mmi_dropout_rate=0.1
aux_mmi_type='mmi' # mmi or phonectc
att_scorer_weight=0.0 # train an attention scorer for rescoring
resume=

# MBR training config
aux_mbr=false
aux_mbr_weight=1.0
aux_mbr_beam=4
mbr_epochs=100
mbr_lr=0.1
mbr_warmup=2500
mbr_resume=

master_port=22275

# Decode config
idx_average=91_100
search_type="alsd" # "default", "nsc", "tsd", "alsd"
mmi_weight=0.0 # MMI / phonectc joint decoding
mas_lookahead=0 # MMI Alignment look-ahead frames
ctc_weight=0.0 # char ctc joint decoding
ngram_order=4
ngram_weight=0.0
lm_weight=0.0
word_ngram_weight=0.0
word_ngram_tag=word_3gram_wbdiscount
word_ngram_log_semiring=true
beam_size=10
recog_set="test dev"
max_job=144

# Parse the command-line options (presumably "--name value" pairs that
# override the defaults above -- confirm against utils/parse_options.sh).
. utils/parse_options.sh || exit 1;

# In debug mode, export a single-node, single-GPU launch environment.  The
# same variables are read by the torch.distributed.launch call in stage 1.
# NOTE(review): CHIEF_IP is a hard-coded address -- confirm it matches the
# machine used for debugging.
if [ $debug == true ]; then
    export HOST_GPU_NUM=1
    export HOST_NUM=1
    export NODE_NUM=1
    export INDEX=0
    export CHIEF_IP="9.135.217.29"
fi

# Training options appended to the asr_train.py command line in stage 1.
train_opts=\
"\
--seed $seed \
--batch-size $batch_size \
--accum-grad $accum_grad \
--epochs $epochs \
--use-segment $use_segment \
--aux-ctc $aux_ctc \
--aux-ctc-weight $aux_ctc_weight \
--aux-ctc-dropout-rate $aux_ctc_dropout_rate \
--aux-mmi $aux_mmi \
--aux-mmi-weight $aux_mmi_weight \
--aux-mmi-dropout-rate $aux_mmi_dropout_rate \
--aux-mmi-type $aux_mmi_type \
--att-scorer-weight $att_scorer_weight \
"

# MBR training appends its own options.  --epochs is passed a second time here
# with $mbr_epochs, and --resume is given $mbr_resume.
if [ $aux_mbr == true ]; then
    train_opts="$train_opts \
                --aux-mbr $aux_mbr \
                --aux-mbr-weight $aux_mbr_weight \
                --aux-mbr-beam $aux_mbr_beam \
                --transformer-lr $mbr_lr \
                --epochs $mbr_epochs \
                --transformer-warmup-steps $mbr_warmup \
                --resume $mbr_resume \
                --load-trainer-and-opt false \
                --save-interval-iters 1000 \
                "
    export OMP_NUM_THREADS=6 # for on-the-fly decoding
fi

# Decoding options appended to the asr_recog.py command line in stage 2.
decode_opts=\
"\
--search-type $search_type \
--mmi-weight $mmi_weight \
--mas-lookahead $mas_lookahead \
--beam-size $beam_size \
--ctc-weight $ctc_weight \
--ngram-weight $ngram_weight \
--word-ngram-weight $word_ngram_weight \
--word-ngram data/$word_ngram_tag \
--word-ngram-log-semiring ${word_ngram_log_semiring} \
--lm-weight $lm_weight \
"

# Set bash to strict mode from here on; it will exit on:
# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline'.
# ('-x', which prints commands, is not enabled here.)
set -e
set -u
set -o pipefail

train_set=train_sp
train_dev=dev

# Experiment directory: exp/<train_set>_<backend>_<tag>
expname=${train_set}_${backend}_${tag}
expdir=exp/${expname}
mkdir -p ${expdir}

# Directories holding the dumped features of the training and dev sets.
feat_tr_dir=${dumpdir}/${train_set}/delta${do_delta}; mkdir -p ${feat_tr_dir}
feat_dt_dir=${dumpdir}/${train_dev}/delta${do_delta}; mkdir -p ${feat_dt_dir}
# Stage 1: launch asr_train.py through torch.distributed.launch.
# HOST_GPU_NUM, HOST_NUM, INDEX and CHIEF_IP must already be set in the
# environment (debug=true exports them above); because of 'set -u' the script
# stops if one of them is missing.  All output of the launch is redirected to
# ${expdir}/global_record.${INDEX}.txt.
# NOTE(review): the literal "RANK" in --outdir and --train-json looks like a
# placeholder that asr_train.py replaces per process -- confirm there.
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    echo "stage 1: Network Training"
    
    # make sure in jizhi config file: "exec_start_in_all_mpi_pods": true, 
    MASTER_PORT=$master_port
    NCCL_DEBUG=TRACE python3 -m torch.distributed.launch \
        --nproc_per_node ${HOST_GPU_NUM} --master_port $MASTER_PORT \
        --nnodes=${HOST_NUM} --node_rank=${INDEX} --master_addr=${CHIEF_IP} \
        ${MAIN_ROOT}/bin/asr_train.py \
        --config ${train_config} \
        --preprocess-conf ${preprocess_config} \
        --ngpu 1 \
        --backend ${backend} \
        --outdir ${expdir}/results_RANK \
        --debugmode ${debugmode} \
        --dict ${dict} \
        --debugdir ${expdir} \
        --minibatches ${N} \
        --verbose ${verbose} \
        --resume ${resume} \
        --train-json ${feat_tr_dir}/split${ngpu}utt/data.RANK.json \
        --valid-json ${feat_dt_dir}/data.json \
        --lang $lang \
        --opt "noam_sgd" \
        --n-iter-processes 8 \
        --world-size $ngpu \
        --node-rank ${INDEX} \
        --node-size ${HOST_GPU_NUM} \
        $train_opts > ${expdir}/global_record.${INDEX}.txt 2>&1
fi

# Stage 2: decode every set in $recog_set and score the result.
# For transformer / conformer / custom models the snapshots in results_0 are
# first averaged into a single model, which replaces $recog_model.  Each set
# is split into $nj parts, decoded with asr_recog.py (ngpu is forced to 0) and
# scored with score_sclite.sh.
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    echo "stage 2: Decoding"
    # NOTE(review): the number of decoding jobs is fixed here; the max_job
    # option defined at the top of the script is not consulted.
    nj=500
    if [[ $(get_yaml.py ${train_config} model-module) = *transformer* ]] || \
           [[ $(get_yaml.py ${train_config} model-module) = *conformer* ]] || \
           [[ $(get_yaml.py ${train_config} etype) = custom ]] || \
           [[ $(get_yaml.py ${train_config} dtype) = custom ]]; then
        recog_model=model.last${idx_average}.avg.best
        average_checkpoints.py --backend ${backend} \
         		       --snapshots ${expdir}/results_0/snapshot.ep.* \
          		       --out ${expdir}/results_0/${recog_model} \
         		       --num ${idx_average}
    fi

    # The decode directory name records the main decoding hyper-parameters.
    decode_parent_dir=decode_mmi${mmi_weight}_${word_ngram_tag}${word_ngram_weight}_lookahead${mas_lookahead}_beam${beam_size}_${idx_average}
    for rtask in ${recog_set}; do
        decode_dir=$decode_parent_dir/$rtask 
        feat_recog_dir=${dumpdir}/${rtask}/delta${do_delta}

        # split data
        splitjson.py --parts ${nj} ${feat_recog_dir}/data.json

        #### use CPU for decoding
        ngpu=0

        # If use rnnlm, download the official ckpts and add:
        # --rnnlm exp/train_rnnlm_pytorch_lm/official_ckpts/rnnlm.model.best \
        # --rnnlm-conf exp/train_rnnlm_pytorch_lm/official_ckpts/model.json \

        # If use character-level N-gram lm, train with kenlm and add:
        # --ngram-model exp/train_ngram/${ngram_order}gram.bin \

        # One job per split; "JOB" in the arguments stands for the job index.
        ${decode_cmd} JOB=1:${nj} ${expdir}/${decode_dir}/log/decode.JOB.log \
            asr_recog.py \
            --config ${decode_config} \
            --ngpu ${ngpu} \
            --backend ${backend} \
            --batchsize 0 \
            --recog-json ${feat_recog_dir}/split${nj}utt/data.JOB.json \
            --result-label ${expdir}/${decode_dir}/data.JOB.json \
            --model ${expdir}/results_0/${recog_model} \
            --local-rank JOB \
            $decode_opts

        # Score the decoded output and keep the report next to it.
        score_sclite.sh ${expdir}/${decode_dir} ${dict} \
          > ${expdir}/${decode_dir}/decode_result.txt
     
    done
    echo "Finished"
fi


================================================
FILE: egs/aishell1/path.sh
================================================
# Environment setup, meant to be sourced by the recipe scripts (". ./path.sh").
# This is necessary because docker images do not run .bashrc when the command
# line is not "bash".
source ~/.bashrc # to include libfst.so

# Repository root and Kaldi root, both relative to the recipe directory.
MAIN_ROOT=$PWD/../../
KALDI_ROOT=../../kaldi/ # Kaldi is local and is not available on jizhi task

# Recipe utilities and OpenFst binaries first, then Kaldi's common paths.
export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
. $KALDI_ROOT/tools/config/common_path.sh
export LC_ALL=C

# Put the espnet utilities and the repository's own entry points on PATH.
export PATH=$PWD/espnet_utils:$MAIN_ROOT/bin:$MAIN_ROOT:$PATH

export OMP_NUM_THREADS=1
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PATH=${KALDI_ROOT}/tools/sph2pipe:${KALDI_ROOT}/tools/kaldi_lm:${KALDI_ROOT}/tools/sctk/bin:${KALDI_ROOT}/tools/kenlm/build/bin:${KALDI_ROOT}/tools/srilm/bin/i686-m64/:${KALDI_ROOT}/tools/srilm/lm/bin/i686-m64/:$PATH
export PYTHONPATH=$MAIN_ROOT:$MAIN_ROOT/bin/:$MAIN_ROOT/../:$PWD:$PYTHONPATH # so that espnet can be imported like a library
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/lib64:${KALDI_ROOT}/tools/openfst/lib/
# nvidia-smi -c 3


================================================
FILE: egs/aishell1/prepare.sh
================================================
#!/usr/bin/env bash

# author: tyriontian
# tyriontian@tencent.com

# AISHELL-1 data preparation recipe.
# Stages: -1 download, 0 data directories, 1 fbank+pitch features and dumps,
# 2 character dictionary and json files, 3 character-level LM training,
# 4 phone lang directory and word N-gram LMs, 5 extra word 3-gram LM variants.
# Every variable assigned before "parse_options.sh" below can be overridden
# from the command line as --name value.

. ./path.sh || exit 1;
. ./cmd.sh || exit 1;

# general configuration
# NOTE: several options in this section (e.g. debugmode, N, verbose, resume,
# preprocess_config, train_config, decode_config, recog_model, n_average) are
# not referenced by any stage of this script.
backend=pytorch
stage=0        # start from 0 if you need to start from data preparation
stop_stage=100
ngpu=1         # number of gpus ("0" uses cpu, otherwise use gpu)
debugmode=1
dumpdir=dump   # directory to dump full features
N=0            # number of minibatches to be used (mainly for debugging). "0" uses all minibatches.
verbose=0      # verbose option
resume=        # Resume the training from snapshot

# feature configuration
do_delta=false

preprocess_config=conf/specaug.yaml
train_config=conf/train.yaml
lm_config=conf/lm_rnn.yaml
decode_config=conf/decode.yaml

# rnnlm related
lm_resume=         # specify a snapshot file to resume LM training
lmtag=             # tag for managing LMs

# ngram
ngramtag=
n_gram=4

# decoding parameter
recog_model=model.acc.best # set a model to be used for decoding: 'model.acc.best' or 'model.loss.best'
n_average=10

# data
data=/data/asr_data/aishell/           # local root that holds (or will receive) the corpus
data_url=www.openslr.org/resources/33  # download location used by stage -1

# exp tag
tag="" # tag for managing experiments.

. utils/parse_options.sh || exit 1;

# Strict mode: exit on any error (-e), on use of an undefined variable (-u),
# and when any command of a pipeline fails (pipefail).
# Command tracing (-x) is NOT enabled here.
set -e
set -u
set -o pipefail

train_set=train_sp    # speed-perturbed training set built in stage 1
train_dev=dev
recog_set="dev test"

# Stage -1: fetch the corpus archives. With the default stage=0 this block is
# skipped; pass --stage -1 to run it.
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
    echo "stage -1: Data Download"
    local/download_and_untar.sh ${data} ${data_url} data_aishell
    local/download_and_untar.sh ${data} ${data_url} resource_aishell
fi

# Stage 0: build the data/{train,dev,test} directories.
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    ### Task dependent. You have to write the following data preparation part yourself,
    ### but you can reuse Kaldi recipes in most cases.
    echo "stage 0: Data preparation"
    local/aishell_data_prep.sh ${data}/data_aishell/wav ${data}/data_aishell/transcript
    # Keep the original transcript as text_org, then rewrite text with every
    # space removed from the transcript part (field 1, the utterance id, is kept).
    for x in train dev test; do
        cp data/${x}/text data/${x}/text_org
        paste -d " " <(cut -f 1 -d" " data/${x}/text_org) <(cut -f 2- -d" " data/${x}/text_org | tr -d " ") \
            > data/${x}/text
    done
fi

# Dump directories for the training and dev sets. They are created
# unconditionally so that later stages can run without stage 1.
feat_tr_dir=${dumpdir}/${train_set}/delta${do_delta}; mkdir -p ${feat_tr_dir}
feat_dt_dir=${dumpdir}/${train_dev}/delta${do_delta}; mkdir -p ${feat_dt_dir}
# Stage 1: feature extraction, 3-way speed perturbation, CMVN and feature dump.
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    ### Task dependent. You have to design the training and dev sets yourself,
    ### but you can reuse Kaldi recipes in most cases.
    echo "stage 1: Feature Generation"
    fbankdir=fbank
    # Generate the fbank features; by default 80-dimensional fbanks with pitch on each frame
    steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 30 --write_utt2num_frames true \
        data/train exp/make_fbank/train ${fbankdir}
    utils/fix_data_dir.sh data/train
    steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 10 --write_utt2num_frames true \
        data/dev exp/make_fbank/dev ${fbankdir}
    utils/fix_data_dir.sh data/dev
    steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 10 --write_utt2num_frames true \
        data/test exp/make_fbank/test ${fbankdir}
    utils/fix_data_dir.sh data/test

    # Speed perturbation: combine the 0.9x, 1.0x and 1.1x copies of data/train
    # into data/${train_set}, then extract features for the combined set.
    utils/perturb_data_dir_speed.sh 0.9 data/train data/temp1
    utils/perturb_data_dir_speed.sh 1.0 data/train data/temp2
    utils/perturb_data_dir_speed.sh 1.1 data/train data/temp3
    utils/combine_data.sh --extra-files utt2uniq data/${train_set} data/temp1 data/temp2 data/temp3
    rm -r data/temp1 data/temp2 data/temp3
    steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 30 --write_utt2num_frames true \
        data/${train_set} exp/make_fbank/${train_set} ${fbankdir}
    utils/fix_data_dir.sh data/${train_set}

    # By tyriontian: text_org must additionally be carried over from data/train
    # to data/${train_set}. text_org holds the transcriptions segmented at word
    # level, which the MMI criterion needs because it works at word level.
    python3 espnet_utils/build_sp_text.py data/train/text_org | sort -k 1 > data/${train_set}/text_org

    # compute global CMVN
    compute-cmvn-stats scp:data/${train_set}/feats.scp data/${train_set}/cmvn.ark

    # dump features for training
    # split_dir is "<last path component>/<second-to-last component>" of $PWD;
    # it is only used by the two JHU CLSP-specific storage blocks below.
    split_dir=$(echo $PWD | awk -F "/" '{print $NF "/" $(NF-1)}')
    if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d ${feat_tr_dir}/storage ]; then
    utils/create_split_dir.pl \
        /export/a{11,12,13,14}/${USER}/espnet-data/egs/${split_dir}/dump/${train_set}/delta${do_delta}/storage \
        ${feat_tr_dir}/storage
    fi
    if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d ${feat_dt_dir}/storage ]; then
    utils/create_split_dir.pl \
        /export/a{11,12,13,14}/${USER}/espnet-data/egs/${split_dir}/dump/${train_dev}/delta${do_delta}/storage \
        ${feat_dt_dir}/storage
    fi
    dump.sh --cmd "$train_cmd" --nj 32 --do_delta ${do_delta} \
        data/${train_set}/feats.scp data/${train_set}/cmvn.ark exp/dump_feats/train ${feat_tr_dir}
    # Every evaluation set is normalised with the training-set CMVN statistics.
    # With the default recog_set, "dev" is dumped here into the same path as
    # ${feat_dt_dir}.
    for rtask in ${recog_set}; do
        feat_recog_dir=${dumpdir}/${rtask}/delta${do_delta}; mkdir -p ${feat_recog_dir}
        dump.sh --cmd "$train_cmd" --nj 10 --do_delta ${do_delta} \
            data/${rtask}/feats.scp data/${train_set}/cmvn.ark exp/dump_feats/recog/${rtask} \
            ${feat_recog_dir}
    done
fi

# Character dictionary; the path is defined outside the stage guard because
# stage 3 also reads it.
dict=data/lang_1char/${train_set}_units.txt
echo "dictionary: ${dict}"
# Stage 2: build the character dictionary and the json files for each set.
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    ### Task dependent. You have to check non-linguistic symbols used in the corpus.
    echo "stage 2: Dictionary and Json Data Preparation"
    mkdir -p data/lang_1char/

    echo "make a dictionary"
    echo "<unk> 1" > ${dict} # <unk> must be 1, 0 will be used for "blank" in CTC
    # Tokenise the training text, drop the first field (utterance id), emit one
    # token per line, de-duplicate, drop blank lines and number the tokens from
    # 2 upwards (id 1 is <unk>).
    text2token.py -s 1 -n 1 data/${train_set}/text | cut -f 2- -d" " | tr " " "\n" \
    | sort | uniq | grep -v -e '^\s*$' | awk '{print $0 " " NR+1}' >> ${dict}
    wc -l ${dict}

    echo "make json files"
    # --text-org passes the word-segmented transcript created in stages 0/1.
    data2json.sh --feat ${feat_tr_dir}/feats.scp \
                 --text-org data/${train_set}/text_org \
		 data/${train_set} ${dict} > ${feat_tr_dir}/data.json
    for rtask in ${recog_set}; do
        feat_recog_dir=${dumpdir}/${rtask}/delta${do_delta}
        data2json.sh --feat ${feat_recog_dir}/feats.scp \
                     --text-org data/${rtask}/text_org \
		     data/${rtask} ${dict} > ${feat_recog_dir}/data.json
    done
fi

# Experiment directories for the neural LM and the character N-gram LM.
# LM training (stage 3) is optional; if you skip it, also drop the --rnnlm
# option at recognition time.
# The LM tag defaults to the basename of ${lm_config} without its extension.
# The operands are quoted so the emptiness tests stay well-formed for any value.
if [ -z "${lmtag}" ]; then
    lmtag=$(basename "${lm_config%.*}")
fi
lmexpname=train_rnnlm_${backend}_${lmtag}
lmexpdir=exp/${lmexpname}
mkdir -p "${lmexpdir}"

ngramexpname=train_ngram
ngramexpdir=exp/${ngramexpname}
# The N-gram tag defaults to the N-gram order.
if [ -z "${ngramtag}" ]; then
    ngramtag=${n_gram}
fi
mkdir -p "${ngramexpdir}"

# Stage 3: prepare the character-level LM text and train the neural LM.
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    echo "stage 3: LM Preparation"
    lmdatadir=data/local/lm_train
    mkdir -p ${lmdatadir}
    # LM training text comes from data/train (not the speed-perturbed set);
    # the first field (utterance id) is dropped.
    text2token.py -s 1 -n 1 data/train/text | cut -f 2- -d" " \
        > ${lmdatadir}/train.txt
    text2token.py -s 1 -n 1 data/${train_dev}/text | cut -f 2- -d" " \
        > ${lmdatadir}/valid.txt

    # NNLM. by default you do not need this
    # NOTE: the job is submitted with --gpu ${ngpu}, while lm_train.py itself
    # is always given --ngpu 1.
    ${cuda_cmd} --gpu ${ngpu} ${lmexpdir}/train.log \
        lm_train.py \
        --config ${lm_config} \
        --ngpu 1 \
        --backend ${backend} \
        --verbose 1 \
        --outdir ${lmexpdir} \
        --tensorboard-dir tensorboard/${lmexpname} \
        --train-label ${lmdatadir}/train.txt \
        --valid-label ${lmdatadir}/valid.txt \
        --resume ${lm_resume} \
        --dict ${dict}

    # Optional: character-level N-gram LM (requires kenlm). Uncomment to build it.
    # lmplz --discount_fallback -o ${n_gram} <${lmdatadir}/train.txt > ${ngramexpdir}/${n_gram}gram.arpa
    # build_binary -s ${ngramexpdir}/${n_gram}gram.arpa ${ngramexpdir}/${n_gram}gram.bin
fi

# Phone-level lang directory; defined outside the stage guard because stage 5
# reads it as well.
lang=data/lang_phone
# Stage 4: build the lexicon / lang directory and the word-level 3- and 4-gram
# LMs, and export each LM as a text FST (G.fst.txt).
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    local/k2_aishell_prepare_dict.sh $data/resource_aishell data/local/dict_nosp
    local/k2_prepare_lang.sh --position-dependent-phones false data/local/dict_nosp \
        "<UNK>" data/local/lang_tmp_nosp $lang || exit 1

    # Train the word-level N-gram LMs (orders 3 and 4 are consumed below)
    local/aishell_train_lms.sh

    for order in 3 4; do
        # One self-contained directory per order: ARPA LM, symbol table,
        # OOV id and the resulting G.fst.txt
        word_lm_dir=data/word_${order}gram
        mkdir -p ${word_lm_dir}
        gunzip -c data/local/lm/${order}gram-mincount/lm_unpruned.gz \
            > ${word_lm_dir}/lm.arpa

        cp $lang/words.txt ${word_lm_dir}/
        cp $lang/oov.int ${word_lm_dir}/

        python3 -m kaldilm \
            --read-symbol-table="${word_lm_dir}/words.txt" \
            --disambig-symbol='#0' \
            --max-order=$order \
            ${word_lm_dir}/lm.arpa > ${word_lm_dir}/G.fst.txt
    done
fi

# Stage 5: extra word 3-gram LMs prepared for the SPL response
# (1) use different smoothing methods
# (2) use jieba segmentation rather than the ground-truth segmentation
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
    # 3-gram LMs with different smoothing; each option doubles as the suffix
    # of the output directory (data/local/lm-wbdiscount, ...)
    for sm in -wbdiscount -kndiscount -ukndiscount -ndiscount; do
        bash espnet_utils/train_lms_srilm.sh \
          --unk "<UNK>" --lm-opts "$sm" data/local/dict_nosp/lexicon.txt \
          data/local/train/text "data/local/lm$sm"
    done

    # gtdiscount: no extra --lm-opts are passed for this variant
    bash espnet_utils/train_lms_srilm.sh \
          --unk "<UNK>" data/local/dict_nosp/lexicon.txt \
          data/local/train/text data/local/lm-gtdiscount

    # word segmentation by jieba
    python3 espnet_utils/jieba_build_dict.py "$lang/words.txt" "$lang/jieba_dict.txt"
    python3 espnet_utils/text_norm.py --in-f data/train/text \
      --out-f data/local/train/text.jieba --segment
    bash espnet_utils/train_lms_srilm.sh \
      --unk "<UNK>" data/local/dict_nosp/lexicon.txt \
      data/local/train/text.jieba data/local/lm-jieba

    # build k2 directory
    # The loop variable is deliberately not called "tag": that name is the
    # experiment-tag option declared at the top of this script and must not be
    # overwritten here.
    for lm_variant in wbdiscount kndiscount ukndiscount ndiscount gtdiscount jieba; do
        lmdir=data/word_3gram_${lm_variant}
        mkdir -p "$lmdir"
        # NOTE(review): every variant is read from the same file name
        # (srilm.o3g.kn.gz); presumably train_lms_srilm.sh always writes that
        # name regardless of the smoothing option - confirm.
        gunzip -c "data/local/lm-${lm_variant}/srilm/srilm.o3g.kn.gz" \
          > "$lmdir/lm.arpa"

        cp "$lang/words.txt" "$lmdir"
        cp "$lang/oov.int" "$lmdir"

        python3 -m kaldilm \
            --read-symbol-table="$lmdir/words.txt" \
            --disambig-symbol='#0' \
            --max-order=3 \
            "$lmdir/lm.arpa" > "$lmdir/G.fst.txt"

        python3 espnet/nets/scorers/word_ngram.py "$lmdir"
    done

fi


================================================
FILE: egs/aishell2/.gitignore
================================================
dump
dump32
dump64
data
exp
fbank
exp_without_segmentation
_exp


================================================
FILE: egs/aishell2/aed.sh
================================================
#!/usr/bin/env bash

# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

# AISHELL-2 attention-encoder-decoder recipe: stage 1 trains the network with
# torch.distributed.launch, stage 2 averages checkpoints, decodes and scores.
# Every variable assigned before "parse_options.sh" below can be overridden
# from the command line as --name value.

. ./path.sh || exit 1;
. ./cmd.sh || exit 1;

# general configuration
backend=pytorch
stage=0        # start from 0 if you need to start from data preparation
stop_stage=100
debugmode=1
dumpdir=dump   # directory to dump full features
N=0            # number of minibatches to be used (mainly for debugging). "0" uses all minibatches.
verbose=0      # verbose option
resume=        # Resume the training from snapshot

# feature configuration
do_delta=false

preprocess_config=conf/specaug.yaml
train_config=conf/tuning/train_pytorch_conformer_kernel31.yaml
lm_config=conf/lm.yaml
decode_config=conf/decode.yaml

# rnnlm related
lm_resume=         # specify a snapshot file to resume LM training
lmtag=             # tag for managing LMs

# decoding parameter
recog_model=model.acc.best # set a model to be used for decoding: 'model.acc.best' or 'model.loss.best'

# data dir, modify this to your AISHELL-2 data path
tr_dir=/data/asr_data/aishell2/iOS/data
dev_tst_dir=/data/asr_data/aishell2/AISHELL-DEV-TEST-SET

# exp tag
### Configurable parameters ###
tag="8v100_lasmmictc_alpha03_ctc03_seg"
# Total number of GPUs; stage 1 uses it as the world size and to select the
# split${ngpu}utt training json. This is the single definition of ngpu: an
# earlier "ngpu=2" default was always overridden by this value and was removed.
ngpu=8
debug=false    # if true, export single-node launcher variables (see below)
# Train config
seed=888
batch_size=4
accum_grad=16
epochs=100
use_segment=true # if true, use word-level transcription in MMI criterion
ctc_type="k2mmi" # k2mmi k2ctc builtin
mtlalpha=0.3
third_weight=0.3

# MBR training config
aux_mbr=false
aux_mbr_weight=1.0
aux_mbr_beam=4
mbr_epochs=100
mbr_lr=0.1
mbr_warmup=2500
mbr_resume=

# Decode config
idx_average=41_50
mmi_weight=0.0 # MMI / phonectc joint decoding
ctc_weight=0.5 # char ctc joint decoding
ngram_weight=0.0
ngram_order=4
word_ngram_weight=0.0
word_ngram_tag=word_3gram_wbdiscount # 3 or 4 gram
word_ngram_log_semiring=true
lm_weight=0.0
mmi_rescore=false # or rescore
beam_size=10
recog_set="test_android test_ios test_mic"

. utils/parse_options.sh || exit 1;

# Debug mode: export the launcher variables for a single-node, single-GPU run.
# The operand is quoted so that an empty --debug value is a clean "false"
# instead of a "[" syntax error.
# NOTE(review): CHIEF_IP is a hard-coded address; adjust it for your host.
if [ "$debug" = true ]; then
    export HOST_GPU_NUM=1
    export HOST_NUM=1
    export NODE_NUM=1
    export INDEX=0
    export CHIEF_IP="9.135.217.29"
fi

# Options appended to the asr_train.py command line in stage 1.
train_opts=\
"\
--seed $seed \
--batch-size $batch_size \
--accum-grad $accum_grad \
--epochs $epochs \
--use-segment $use_segment \
--ctc_type $ctc_type \
--mtlalpha $mtlalpha \
--third-weight $third_weight \
"

# Auxiliary MBR training: append the MBR options. --epochs and --resume are
# given again here with the MBR-specific values.
if [ "$aux_mbr" = true ]; then
    train_opts="$train_opts \
                --aux-mbr $aux_mbr \
                --aux-mbr-weight $aux_mbr_weight \
                --aux-mbr-beam $aux_mbr_beam \
                --transformer-lr $mbr_lr \
                --epochs $mbr_epochs \
                --transformer-warmup-steps $mbr_warmup \
                --resume $mbr_resume \
                --load-trainer-and-opt false \
                --save-interval-iters 1000 \
                "
    export OMP_NUM_THREADS=6 # for on-the-fly decoding
fi

# Options appended to the asr_recog.py command line in stage 2.
decode_opts=\
"\
--mmi-weight $mmi_weight \
--mmi-rescore $mmi_rescore \
--beam-size $beam_size \
--ctc-weight $ctc_weight \
--ngram-weight $ngram_weight \
--word-ngram-weight $word_ngram_weight \
--word-ngram data/${word_ngram_tag} \
--word-ngram-log-semiring $word_ngram_log_semiring \
--lm-weight $lm_weight \
"
# Character dictionary used by both training and scoring.
dict=data/lang_1char/train_sp_units.txt
# Strict mode: exit on any error (-e), on use of an undefined variable (-u),
# and when any command of a pipeline fails (pipefail).
# Command tracing (-x) is NOT enabled here.
set -e
set -u
set -o pipefail

train_set=train_sp
train_dev=dev_ios

# The experiment directory name combines training set, backend and tag.
expname=${train_set}_${backend}_${tag}
expdir=exp/${expname}
mkdir -p ${expdir}

lang=data/lang_phone
feat_tr_dir=${dumpdir}/${train_set}/delta${do_delta}; mkdir -p ${feat_tr_dir}
feat_dt_dir=${dumpdir}/${train_dev}/delta${do_delta}; mkdir -p ${feat_dt_dir}
# Stage 1: distributed training through torch.distributed.launch.
# HOST_GPU_NUM, HOST_NUM, INDEX and CHIEF_IP must already be set in the
# environment (the debug branch above exports them); because of "set -u" the
# script aborts here if any of them is unset.
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    echo "stage 1: Network Training"
    MASTER_PORT=22277
    # NOTE(review): the literal "RANK" in --outdir and --train-json is
    # presumably replaced with the process rank inside asr_train.py - confirm.
    # $train_opts comes last on the command line; all output of this node is
    # redirected to ${expdir}/global_record.${INDEX}.txt.
    NCCL_DEBUG=TRACE python3 -m torch.distributed.launch \
        --nproc_per_node ${HOST_GPU_NUM} --master_port $MASTER_PORT \
        --nnodes=${HOST_NUM} --node_rank=${INDEX} --master_addr=${CHIEF_IP} \
        ${MAIN_ROOT}/bin/asr_train.py \
        --config ${train_config} \
        --preprocess-conf ${preprocess_config} \
        --ngpu 1 \
        --backend ${backend} \
        --outdir ${expdir}/results_RANK \
        --debugmode ${debugmode} \
        --dict ${dict} \
        --debugdir ${expdir} \
        --minibatches ${N} \
        --verbose ${verbose} \
        --resume ${resume} \
        --train-json ${feat_tr_dir}/split${ngpu}utt/data_noeng.RANK.json \
        --valid-json ${feat_dt_dir}/data.json \
        --lang $lang \
        --opt "noam_sgd" \
        --n-iter-processes 8 \
        --world-size $ngpu \
        --node-rank ${INDEX} \
        --node-size ${HOST_GPU_NUM} \
        $train_opts > ${expdir}/global_record.${INDEX}.txt 2>&1
fi

# Stage 2: checkpoint averaging, CPU decoding of every set in ${recog_set}
# and scoring.
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    echo "stage 2: Decoding"
    nj=1000
    #### use CPU for decoding
    ngpu=0

    # Average checkpoints only for transformer / conformer / custom models.
    # For any other model the configured ${recog_model} (model.acc.best by
    # default) is decoded as-is; it used to be overwritten unconditionally
    # with the averaged-model name, which points to a file that is never
    # written when this branch is skipped.
    if [[ $(get_yaml.py ${train_config} model-module) = *transformer* ]] || \
           [[ $(get_yaml.py ${train_config} model-module) = *conformer* ]] || \
           [[ $(get_yaml.py ${train_config} etype) = custom ]] || \
           [[ $(get_yaml.py ${train_config} dtype) = custom ]]; then
        recog_model=model.last${idx_average}.avg.best
        # The snapshot glob is intentionally unquoted so the shell expands it.
        average_checkpoints.py --backend ${backend} \
            --snapshots ${expdir}/results_0/snapshot.ep.* \
            --out ${expdir}/results_0/${recog_model} \
            --num ${idx_average}
    fi

    # The decode directory name records the main decoding hyper-parameters.
    decode_parent_dir=decode_mmi${mmi_weight}_${word_ngram_tag}${word_ngram_weight}_ctc${ctc_weight}_ep${idx_average}_beam${beam_size}
    for rtask in ${recog_set}; do
        decode_dir=$decode_parent_dir/$rtask
        feat_recog_dir=${dumpdir}/${rtask}/delta${do_delta}

        # split data
        splitjson.py --parts ${nj} ${feat_recog_dir}/data.json

        # One job per json split; "JOB" is the placeholder handed to
        # ${decode_cmd} via JOB=1:$nj.
        ${decode_cmd} JOB=1:$nj ${expdir}/${decode_dir}/log/decode.JOB.log \
            python3 ${MAIN_ROOT}/bin/asr_recog.py \
            --config ${decode_config} \
            --ngpu ${ngpu} \
            --backend ${backend} \
            --batchsize 0 \
            --recog-json ${feat_recog_dir}/split${nj}utt/data.JOB.json \
            --result-label ${expdir}/${decode_dir}/data.JOB.json \
            --model ${expdir}/results_0/${recog_model} \
            --ngram-model exp/train_ngram/${ngram_order}gram.bin \
            --rnnlm exp/train_rnnlm_pytorch_lm_transformer/results/rnnlm.model.best \
            --rnnlm-conf exp/train_rnnlm_pytorch_lm_transformer/results/model.json \
            --api v2 \
            --local-rank JOB $decode_opts

        score_sclite.sh ${expdir}/${decode_dir} ${dict} \
          > ${expdir}/${decode_dir}/decode_result.txt

    done
    echo "Finished"
fi


================================================
FILE: egs/aishell2/conf/fbank.conf
================================================
--sample-frequency=16000 
--num-mel-bins=80


================================================
FILE: egs/aishell2/conf/gpu.conf
================================================
# Default configuration
command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
option mem=* -l mem_free=$0,ram_free=$0
option mem=0          # Do not add anything to qsub_opts
option num_threads=* -pe smp $0
option num_threads=1  # Do not add anything to qsub_opts
option max_jobs_run=* -tc $0
default gpu=0
option gpu=0
option gpu=* -l 'hostname=b1[12345678]*|c*,gpu=$0' -q g.q

================================================
FILE: egs/aishell2/conf/lm.yaml
================================================
# rnnlm related
layer: 2
unit: 650
opt: sgd        # or adam
batchsize: 64   # batch size in LM training
epoch: 20      # if the data size is large, we can reduce this
patience: 3
maxlen: 100     # if sentence length > lm_maxlen, lm_batchsize is automatically reduced


================================================
FILE: egs/aishell2/conf/lm_rnn.yaml
================================================
lm.yaml

================================================
FILE: egs/aishell2/conf/lm_transformer.yaml
================================================
# This Transformer LM setting w/ 4 GPUs took around 60 days for 50 epochs.
# However, you can get better results in 6 days for 5 epochs (WER: 2.2/5.4/2.6/5.7)
# than LSTM LM (WER: 2.6/5.6/2.6/5.7) in 60 days for 20 epochs
# If you do not have 4 GPUs, try accum-grad=4.

# network architecture
model-module: transformer
att-unit: 512
embed-unit: 128
head: 8
layer: 16
pos-enc: none
unit: 2048

# minibatch related
batchsize: 32
maxlen: 40

# optimization related
opt: adam
schedulers: lr=cosine
dropout-rate: 0.0
epoch: 50
gradclip: 1.0
lr: 1e-4
lr-cosine-total: 100000
lr-cosine-warmup: 1000
patience: 0
sortagrad: 0


================================================
FILE: egs/aishell2/conf/pitch.conf
================================================
--sample-frequency=16000


================================================
FILE: egs/aishell2/conf/queue.conf
================================================
# Default configuration
command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
option mem=* -l mem_free=$0,ram_free=$0
option mem=0          # Do not add anything to qsub_opts
option num_threads=* -pe smp $0
option num_threads=1  # Do not add anything to qsub_opts
option max_jobs_run=* -tc $0
default gpu=0
option gpu=0
option gpu=* -l gpu=$0 -q g.q


================================================
FILE: egs/aishell2/conf/slurm.conf
================================================
# Default configuration
command sbatch --export=PATH
option name=* --job-name $0
option time=* --time $0
option mem=* --mem-per-cpu $0
option mem=0
option num_threads=* --cpus-per-task $0
option num_threads=1 --cpus-per-task 1
option num_nodes=* --nodes $0
default gpu=0
option gpu=0 -p cpu
option gpu=* -p gpu --gres=gpu:$0 -c $0  # Recommended: allocate at least as many CPUs as GPUs
# note: the --max-jobs-run option is supported as a special case
# by slurm.pl and you don't have to handle it in the config file.


================================================
FILE: egs/aishell2/conf/specaug.yaml
================================================
process:
  # these three processes are a.k.a. SpecAugment
  - type: "time_warp"
    max_time_warp: 5
    inplace: true
    mode: "PIL"
  - type: "freq_mask"
    F: 30
    n_mask: 2
    inplace: true
    replace_with_zero: false
  - type: "time_mask"
    T: 40
    n_mask: 2
    inplace: true
    replace_with_zero: false


================================================
FILE: egs/aishell2/conf/specaug_test.yaml
================================================
process:
  # these three processes are a.k.a. SpecAugment
  - type: "time_warp"
    max_time_warp: 0
    inplace: true
    mode: "PIL"
  - type: "freq_mask"
    F: 30
    n_mask: 2
    inplace: true
    replace_with_zero: true
  - type: "time_mask"
    T: 40
    n_mask: 2
    inplace: true
    replace_with_zero: true


================================================
FILE: egs/aishell2/conf/tuning/decode_pytorch_transformer.yaml
================================================
batchsize: 0
beam-size: 10
penalty: 0.0
maxlenratio: 0.0
minlenratio: 0.0
ctc-weight: 0.5
lm-weight: 0.0
ngram-weight: 0.3


================================================
FILE: egs/aishell2/conf/tuning/decode_rnn.yaml
================================================
beam-size: 20
penalty: 0.0
maxlenratio: 0.0
minlenratio: 0.0
ctc-weight: 0.6
lm-weight: 0.3


================================================
FILE: egs/aishell2/conf/tuning/train_pytorch_conformer_kernel15.yaml
================================================
# network architecture
# encoder related
elayers: 12
eunits: 2048
# decoder related
dlayers: 6
dunits: 2048
# attention related
adim: 256
aheads: 4

# hybrid CTC/attention
mtlalpha: 0.3

# label smoothing
lsm-weight: 0.1

# minibatch related
batch-size: 32
maxlen-in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
maxlen-out: 150 # if output length > maxlen-out, batchsize is automatically reduced

# optimization related
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
opt: noam
accum-grad: 2
grad-clip: 5
patience: 0
epochs: 50
dropout-rate: 0.1

# transformer specific setting
backend: pytorch
model-module: "espnet.nets.pytorch_backend.e2e_asr_conformer:E2E"
transformer-input-layer: conv2d     # encoder architecture type
transformer-lr: 1.0
transformer-warmup-steps: 25000
transformer-attn-dropout-rate: 0.0
transformer-length-normalized-loss: false
transformer-init: pytorch

# conformer specific setting
transformer-encoder-pos-enc-layer-type: rel_pos
transformer-encoder-selfattn-layer-type: rel_selfattn
transformer-encoder-activation-type: swish
rel-pos-type: latest
macaron-style: true
use-cnn-module: true
cnn-module-kernel: 15


================================================
FILE: egs/aishell2/conf/tuning/train_pytorch_conformer_kernel31.yaml
================================================
# network architecture
# encoder related
elayers: 12
eunits: 2048
# decoder related
dlayers: 6
dunits: 2048
# attention related
adim: 256
aheads: 4

# hybrid CTC/attention
mtlalpha: 0.3

# label smoothing
lsm-weight: 0.1

# minibatch related
batch-size: 32
maxlen-in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
maxlen-out: 150 # if output length > maxlen-out, batchsize is automatically reduced

# optimization related
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
opt: noam
accum-grad: 2
grad-clip: 5
patience: 0
epochs: 50
dropout-rate: 0.1

# transformer specific setting
backend: pytorch
model-module: "espnet.nets.pytorch_backend.e2e_asr_conformer:E2E"
transformer-input-layer: conv2d     # encoder architecture type
transformer-lr: 1.0
transformer-warmup-steps: 25000
transformer-attn-dropout-rate: 0.0
transformer-length-normalized-loss: false
transformer-init: pytorch

# conformer specific setting
transformer-encoder-pos-enc-layer-type: rel_pos
transformer-encoder-selfattn-layer-type: rel_selfattn
transformer-encoder-activation-type: swish
macaron-style: true
use-cnn-module: true
cnn-module-kernel: 31


================================================
FILE: egs/aishell2/conf/tuning/train_pytorch_transformer.yaml
================================================
# network architecture
# encoder related
elayers: 12
eunits: 2048
# decoder related
dlayers: 6
dunits: 2048
# attention related
adim: 256
aheads: 4

# hybrid CTC/attention
mtlalpha: 0.3

# label smoothing
lsm-weight: 0.1

# minibatch related
batch-size: 32
maxlen-in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
maxlen-out: 150 # if output length > maxlen-out, batchsize is automatically reduced

# optimization related
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
opt: noam
accum-grad: 2
grad-clip: 5
patience: 0
epochs: 50
dropout-rate: 0.1

# transformer specific setting
backend: pytorch
model-module: "espnet.nets.pytorch_backend.e2e_asr_transformer:E2E"
transformer-input-layer: conv2d     # encoder architecture type
transformer-lr: 1.0
transformer-warmup-steps: 25000
transformer-attn-dropout-rate: 0.0
transformer-length-normalized-loss: false
transformer-init: pytorch


================================================
FILE: egs/aishell2/conf/tuning/train_rnn.yaml
================================================
# network architecture
# encoder related
etype: vggblstm     # encoder architecture type
elayers: 3
eunits: 1024
eprojs: 1024
subsample: "1_2_2_1_1" # skip every n frame from input to nth layers
# decoder related
dlayers: 2
dunits: 1024
# attention related
atype: location
adim: 1024
aconv-chans: 10
aconv-filts: 100

# hybrid CTC/attention
mtlalpha: 0.5

# minibatch related
batch-size: 30
maxlen-in: 800  # if input length  > maxlen_in, batchsize is automatically reduced
maxlen-out: 150 # if output length > maxlen_out, batchsize is automatically reduced

# optimization related
opt: adadelta
epochs: 10
patience: 0

# scheduled sampling option
sampling-probability: 0.0


================================================
FILE: egs/aishell2/conf/tuning/transducer/decode_default.yaml
================================================
# decoding parameters
batch: 0
beam-size: 10
search-type: default
score-norm: True


================================================
FILE: egs/aishell2/conf/tuning/transducer/train_conformer-rnn_transducer.yaml
================================================
# minibatch related
batch-size: 64
maxlen-in: 512
maxlen-out: 150

# optimization related
criterion: loss
early-stop-criterion: "validation/main/loss"
sortagrad: 0
opt: noam
transformer-lr: 1.0
transformer-warmup-steps: 25000
epochs: 100
patience: 0
accum-grad: 2
grad-clip: 5.0

# network architecture
## general
custom-enc-positional-encoding-type: rel_pos
custom-enc-self-attn-type: rel_self_attn
custom-enc-pw-activation-type: swish
## encoder related
etype: custom
custom-enc-input-layer: vgg2l
enc-block-arch:
        - type: conformer
          d_hidden: 512
          d_ff: 2048
          heads: 4
          macaron_style: True
          use_conv_mod: True
          conv_mod_kernel: 15
          dropout-rate: 0.3
          att-dropout-rate: 0.3
enc-block-repeat: 12
## decoder related
dtype: lstm
dlayers: 1
dec-embed-dim: 1024
dunits: 512
dropout-rate-embed-decoder: 0.2
dropout-rate-decoder: 0.1
## joint network related
joint-dim: 512

# transducer related
model-module: "espnet.nets.pytorch_backend.e2e_asr_transducer:E2E"

# reporter related
report-wer: True
report-cer: True


================================================
FILE: egs/aishell2/conf/tuning/transducer/train_conformer-rnn_transducer_aux_ngpu4.yaml
================================================
# minibatch related
batch-size: 32
maxlen-in: 512
maxlen-out: 150

# optimization related
criterion: loss
early-stop-criterion: "validation/main/loss"
sortagrad: 0
opt: noam
transformer-lr: 1.0
transformer-warmup-steps: 25000
epochs: 100
patience: 0
#accum-grad: 2
grad-clip: 5.0

# network architecture
## general
custom-enc-positional-encoding-type: rel_pos
custom-enc-self-attn-type: rel_self_attn
custom-enc-pw-activation-type: swish
## encoder related
etype: custom
custom-enc-input-layer: vgg2l
enc-block-arch:
        - type: conformer
          d_hidden: 512
          d_ff: 2048
          heads: 4
          macaron_style: True
          use_conv_mod: True
          conv_mod_kernel: 15
          dropout-rate: 0.3
          att-dropout-rate: 0.3
enc-block-repeat: 12
## decoder related
dtype: lstm
dlayers: 1
dec-embed-dim: 1024
dunits: 512
dropout-rate-embed-decoder: 0.2
dropout-rate-decoder: 0.1
## joint network related
joint-dim: 512

# transducer related
model-module: "espnet.nets.pytorch_backend.e2e_asr_transducer:E2E"

# reporter related
report-wer: True
report-cer: True

# auxiliary task
#aux-ctc: True
#aux-ctc-weight: 0.5
#aux-ctc-dropout-rate: 0.1


================================================
FILE: egs/aishell2/conf/tuning/transducer/train_conformer-rnn_transducer_ngpu4.yaml
================================================
# minibatch related
batch-size: 32
maxlen-in: 512
maxlen-out: 150

# optimization related
criterion: loss
early-stop-criterion: "validation/main/loss"
sortagrad: 0
opt: noam
transformer-lr: 1.0
transformer-warmup-steps: 25000
epochs: 100
patience: 0
#accum-grad: 2
grad-clip: 5.0

# network architecture
## general
custom-enc-positional-encoding-type: rel_pos
custom-enc-self-attn-type: rel_self_attn
custom-enc-pw-activation-type: swish
## encoder related
etype: custom
custom-enc-input-layer: vgg2l
enc-block-arch:
        - type: conformer
          d_hidden: 512
          d_ff: 2048
          heads: 4
          macaron_style: True
          use_conv_mod: True
          conv_mod_kernel: 15
          dropout-rate: 0.3
          att-dropout-rate: 0.3
enc-block-repeat: 12
## decoder related
dtype: lstm
dlayers: 1
dec-embed-dim: 1024
dunits: 512
dropout-rate-embed-decoder: 0.2
dropout-rate-decoder: 0.1
## joint network related
joint-dim: 512

# transducer related
model-module: "espnet.nets.pytorch_backend.e2e_asr_transducer:E2E"

# reporter related
report-wer: True
report-cer: True

# auxiliary task
aux-ctc: False
aux-ctc-weight: 0.0
aux-ctc-dropout-rate: 0.0


================================================
FILE: egs/aishell2/conf/tuning/transducer/train_transducer.yaml
================================================
# minibatch related
batch-size: 64
maxlen-in: 512
maxlen-out: 150

# optimization related
criterion: loss
early-stop-criterion: "validation/main/loss"
sortagrad: 0
opt: adadelta
epochs: 30
patience: 3
accum-grad: 2

# network architecture
## encoder related
etype: vggblstm
elayers: 6
eunits: 512
eprojs: 512
dropout-rate: 0.4
## decoder related
dtype: lstm
dlayers: 1
dec-embed-dim: 1024
dunits: 512
dropout-rate-embed-decoder: 0.2
dropout-rate-decoder: 0.1
## joint network related
joint-dim: 512

# transducer related
model-module: "espnet.nets.pytorch_backend.e2e_asr_transducer:E2E"

# reporter related
report-wer: True
report-cer: True


================================================
FILE: egs/aishell2/conf/tuning/transducer/train_transducer_aux.yaml
================================================
# minibatch related
batch-size: 64
maxlen-in: 512
maxlen-out: 150

# optimization related
criterion: loss
early-stop-criterion: "validation/main/loss"
sortagrad: 0
opt: adadelta
epochs: 30
patience: 3
accum-grad: 2

# network architecture
## encoder related
etype: vggblstm
elayers: 6
eunits: 512
eprojs: 512
dropout-rate: 0.4
## decoder related
dtype: lstm
dlayers: 1
dec-embed-dim: 1024
dunits: 512
dropout-rate-embed-decoder: 0.2
dropout-rate-decoder: 0.1
## joint network related
joint-dim: 512

# transducer related
model-module: "espnet.nets.pytorch_backend.e2e_asr_transducer:E2E"

# reporter related
report-wer: True
report-cer: True

# auxiliary task
aux-ctc: True
aux-ctc-weight: 0.1
aux-ctc-dropout-rate: 0.1


================================================
FILE: egs/aishell2/local/add_lex_disambig.pl
================================================
#!/usr/bin/env perl
#  Copyright 2010-2011  Microsoft Corporation
#            2013-2016  Johns Hopkins University (author: Daniel Povey)
#                 2015  Hainan Xu
#                 2015  Guoguo Chen

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# Adds disambiguation symbols to a lexicon.
# Outputs still in the normal lexicon format.
# Disambig syms are numbered #1, #2, #3, etc. (#0
# reserved for symbol in grammar).
# Outputs the number of disambig syms to the standard output.
# With the --pron-probs option, expects the second field
# of each lexicon line to be a pron-prob.
# With the --sil-probs option, expects three additional
# fields after the pron-prob, representing various components
# of the silence probability model.

# Option defaults.
$pron_probs = 0;              # set to 1 by --pron-probs
$sil_probs = 0;               # set to 1 by --sil-probs
$first_allowed_disambig = 1;  # overridden by --first-allowed-disambig <n>

# Make up to three passes over the front of @ARGV so that the three options
# can be given in any order; each pass consumes whichever options it finds.
# Every test re-checks that @ARGV is non-empty, because an earlier test in the
# same pass may just have shifted the last argument away.
for ($n = 1; $n <= 3 && @ARGV > 0; $n++) {
  if (@ARGV > 0 && $ARGV[0] eq "--pron-probs") {
    $pron_probs = 1;
    shift @ARGV;
  }
  if (@ARGV > 0 && $ARGV[0] eq "--sil-probs") {
    $sil_probs = 1;
    shift @ARGV;
  }
  if (@ARGV > 0 && $ARGV[0] eq "--first-allowed-disambig") {
    # "0 +" forces numeric context; a missing or non-numeric value becomes 0
    # and is rejected by the range check below.
    $first_allowed_disambig = 0 + $ARGV[1];
    if ($first_allowed_disambig < 1) {
      die "add_lex_disambig.pl: invalid --first-allowed-disambig option: $first_allowed_disambig\n";
    }
    shift @ARGV;
    shift @ARGV;
  }
}

# Exactly two positional arguments must remain: input and output lexicon.
if (@ARGV != 2) {
  die "Usage: add_lex_disambig.pl [opts] <lexicon-in> <lexicon-out>\n" .
    "This script adds disambiguation symbols to a lexicon in order to\n" .
    "make decoding graphs determinizable; it adds pseudo-phone\n" .
    "disambiguation symbols #1, #2 and so on at the ends of phones\n" .
    "to ensure that all pronunciations are different, and that none\n" .
    "is a prefix of another.\n" .
    "It prints to the standard output the number of the largest-numbered\n" .
    "disambiguation symbol that was used.\n" .
    "\n" .
    "Options:   --pron-probs       Expect pronunciation probabilities in the 2nd field\n" .
    "           --sil-probs        [should be with --pron-probs option]\n" .
    "                              Expect 3 extra fields after the pron-probs, for aspects of\n" .
    "                              the silence probability model\n" .
    "           --first-allowed-disambig <n>  The number of the first disambiguation symbol\n" .
    "                              that this script is allowed to add.  By default this is\n" .
    "                              #1, but you can set this to a larger value using this option.\n" .
    "e.g.:\n" .
    " add_lex_disambig.pl lexicon.txt lexicon_disambig.txt\n" .
    " add_lex_disambig.pl --pron-probs lexiconp.txt lexiconp_disambig.txt\n" .
    " add_lex_disambig.pl --pron-probs --sil-probs lexiconp_silprob.txt lexiconp_silprob_disambig.txt\n";
}


$lexfn = shift @ARGV;
$lexoutfn = shift @ARGV;

# Report the OS error ($!) so a missing file can be told apart from a
# permission problem.
open(L, "<$lexfn") || die "Error opening lexicon $lexfn: $!";

# (1)  Read in the lexicon.  Each line is re-joined with single spaces, so
# that the later passes can rely on a normalised field separator.
@L = ( );
while(<L>) {
    @A = split(" ", $_);
    push @L, join(" ", @A);
}
close(L);

# (2) Work out the count of each phone-sequence in the
# lexicon, validating the optional probability fields on the way.
# Fills %count, keyed by the space-joined phone sequence.

foreach $l (@L) {
    @A = split(" ", $l);
    shift @A; # Remove word.
    if ($pron_probs) {
      $p = shift @A;
      if (!($p > 0.0 && $p <= 1.0)) { die "Bad lexicon line $l (expecting pron-prob as second field)"; }
    }
    if ($sil_probs) {
      # Three silprob fields: a probability in (0, 1] followed by two
      # strictly positive correction terms.
      $silp = shift @A;
      if (!($silp > 0.0 && $silp <= 1.0)) { die "Bad lexicon line $l for silprobs"; }
      $correction = shift @A;
      if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; }
      $correction = shift @A;
      if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; }
    }
    if (!(@A)) {
      # Interpolate the lexicon line ($l); the previous message used the
      # regex capture variable $1, which is never set here.
      die "Bad lexicon line $l, no phone in phone list";
    }
    $count{join(" ",@A)}++;
}

# (3) Record every proper prefix (including the empty one) of every phone
# sequence in %issubseq, so that step (4) can tell whether a pronunciation is
# a prefix of a longer one.

foreach my $entry (@L) {
    my @phones = split(" ", $entry);
    # Leading non-phone fields: the word, the optional pron-prob, and the
    # three optional silprob numbers.
    my $num_header_fields = 1 + ($pron_probs ? 1 : 0) + ($sil_probs ? 3 : 0);
    splice(@phones, 0, $num_header_fields);
    # Prefix lengths run from (number of phones - 1) down to 0.
    for (my $len = $#phones; $len >= 0; $len--) {
        $issubseq{join(" ", @phones[0 .. $len - 1])} = 1;
    }
}

# (4) For each entry in the lexicon:
#  if the phone sequence is unique and is not a
#  prefix of another word, no disambig symbol.
#  Else output #1, or #2, #3, ... if the same phone-seq
#  has already been assigned a disambig symbol.
# Uses %count (step 2) and %issubseq (step 3).  The entries are written to
# <lexicon-out> in input order, tab-separated, and the largest disambig number
# used is printed to stdout at the end.


open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n";

# max_disambig will always be the highest-numbered disambiguation symbol that
# has been used so far.
$max_disambig = $first_allowed_disambig - 1;

foreach $l (@L) {
  @A = split(" ", $l);
  $word = shift @A;
  if ($pron_probs) {
    $pron_prob = shift @A;
  }
  if ($sil_probs) {
    # The three silprob fields are passed through to the output unchanged.
    $sil_word_prob = shift @A;
    $word_sil_correction = shift @A;
    $prev_nonsil_correction = shift @A
  }
  $phnseq = join(" ", @A);
  if (!defined $issubseq{$phnseq}
      && $count{$phnseq} == 1) {
    ;                           # Do nothing.
  } else {
    if ($phnseq eq "") {        # need disambig symbols for the empty string
      # that are not used anywhere else.
      # Each empty pronunciation gets a brand-new symbol, which is recorded so
      # that the non-empty branch below skips over it.
      $max_disambig++;
      $reserved_for_the_empty_string{$max_disambig} = 1;
      $phnseq = "#$max_disambig";
    } else {
      # Per-phone-sequence counter: start at the first allowed symbol, or
      # continue after the last symbol handed out for this same sequence.
      $cur_disambig = $last_used_disambig_symbol_of{$phnseq};
      if (!defined $cur_disambig) {
        $cur_disambig = $first_allowed_disambig;
      } else {
        $cur_disambig++;           # Get a number that has not been used yet for
                                   # this phone sequence.
      }
      while (defined $reserved_for_the_empty_string{$cur_disambig}) {
        $cur_disambig++;
      }
      if ($cur_disambig > $max_disambig) {
        $max_disambig = $cur_disambig;
      }
      $last_used_disambig_symbol_of{$phnseq} = $cur_disambig;
      $phnseq = $phnseq . " #" . $cur_disambig;
    }
  }
  # Note: with --sil-probs but without --pron-probs the silprob fields are
  # not written (the final branch below is taken).
  if ($pron_probs) {
    if ($sil_probs) {
      print O "$word\t$pron_prob\t$sil_word_prob\t$word_sil_correction\t$prev_nonsil_correction\t$phnseq\n";
    } else {
      print O "$word\t$pron_prob\t$phnseq\n";
    }
  } else {
    print O "$word\t$phnseq\n";
  }
}

# The caller reads this number from stdout.
print $max_disambig . "\n";


================================================
FILE: egs/aishell2/local/apply_map.pl
================================================
#!/usr/bin/env perl
use warnings; #sed replacement for -w perl parameter
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.

# This program is a bit like ./sym2int.pl in that it applies a map
# to things in a file, but it's a bit more general in that it doesn't
# assume the things being mapped to are single tokens, they could
# be sequences of tokens.  See the usage message.


$permissive = 0;

# Three passes over the front of @ARGV, so that -f and --permissive are
# accepted in either order.
foreach my $pass (1 .. 3) {

  if (@ARGV > 0 && $ARGV[0] eq "-f") {
    shift @ARGV;
    $field_spec = shift @ARGV;
    # A bare number selects exactly one field.
    if ($field_spec =~ m/^\d+$/) {
      $field_begin = $field_end = $field_spec - 1;
    }
    # A range such as 4-5, 4- or -5 (":" is tolerated in place of "-").
    if ($field_spec =~ m/^(\d*)[-:](\d*)/) {
      my ($first, $last) = ($1, $2);
      $field_begin = $first - 1 if $first ne "";  # zero-based
      $field_end = $last - 1 if $last ne "";      # zero-based
    }
    die "Bad argument to -f option: $field_spec"
      unless defined $field_begin || defined $field_end;
  }

  if (@ARGV > 0 && $ARGV[0] eq '--permissive') {
    shift @ARGV;
    # Keys missing from the map are passed through with a warning.
    $permissive = 1;
  }
}

# Exactly one positional argument (the map file) must remain after option
# parsing; otherwise print the usage text to stderr and exit with status 1.
if(@ARGV != 1) {
  print STDERR "Invalid usage: " . join(" ", @ARGV) . "\n";
  print STDERR <<'EOF';
Usage: apply_map.pl [options] map <input >output
 options: [-f <field-range> ] [--permissive]
   This applies a map to some specified fields of some input text:
   For each line in the map file: the first field is the thing we
   map from, and the remaining fields are the sequence we map it to.
   The -f (field-range) option says which fields of the input file the
   map should apply to.
   If the --permissive option is supplied, fields which are not present
   in the map will be left as they were.
 Applies the map 'map' to all input text, where each line of the map
 is interpreted as a map from the first field to the list of the other fields
 Note: <field-range> can look like 4-5, or 4-, or 5-, or 1, it means the field
 range in the input to apply the map to.
 e.g.: echo A B | apply_map.pl a.txt
 where a.txt is:
 A a1 a2
 B b
 will produce:
 a1 a2 b
EOF
  exit(1);
}

($map_file) = @ARGV;
open(M, "<$map_file") || die "Error opening map file $map_file: $!";

# Fill %map: first field of each line -> remaining fields joined by spaces.
# A later line with the same key overwrites the earlier one.
while (my $entry = <M>) {
  my ($key, @targets) = split(" ", $entry);
  defined $key || die "apply_map.pl: empty line.";
  $map{$key} = join(" ", @targets);
}

# Map the selected fields of every stdin line through %map and print the
# result with single-space separators.
while (my $line = <STDIN>) {
  my @fields = split(" ", $line);
  foreach my $idx (0 .. $#fields) {
    # Leave fields outside the -f range untouched.
    next if defined $field_begin && $idx < $field_begin;
    next if defined $field_end && $idx > $field_end;

    my $token = $fields[$idx];
    if (defined $map{$token}) {
      $fields[$idx] = $map{$token};
    } elsif ($permissive) {
      print STDERR "apply_map.pl: warning! missing key $token in $map_file\n";
    } else {
      die "apply_map.pl: undefined key $token in $map_file\n";
    }
  }
  print join(" ", @fields) . "\n";
}


================================================
FILE: egs/aishell2/local/fstaddselfloops.pl
================================================
#!/usr/bin/env perl

# Copyright 2020 Xiaomi Corporation (Author: Junbo Zhang)
# Apache 2.0

use strict;
use warnings;

my $Usage = <<EOU;
fstaddselfloops.pl:
Adds self-loops to states of an FST to propagate disambiguation symbols through it.
They are added on each final state and each state with non-epsilon output symbols
on at least one arc out of the state. 

Usage: local/fstaddselfloops.pl <wdisambig_phone> <wdisambig_word> < <openfst_text>
 e.g.: cat L_disambig.txt | local/fstaddselfloops.pl 347 200004 > L_disambig_with_loop.txt
EOU

if (@ARGV != 2) {
  die $Usage;
}

# Integer ids of the '#0' symbol on the input (phone) and output (word) side.
my $wdisambig_phone = shift @ARGV;
my $wdisambig_word = shift @ARGV;

# Pass the FST through unchanged while collecting the states that need a
# self-loop.  @ARGV is empty by now, so <> reads standard input.
my %states_needs_self_loops;
while (my $line = <>) {
    print $line;

    # split(' ') skips leading whitespace, so an indented line does not yield
    # a spurious empty first field.
    my @items = split(' ', $line);
    if (@items == 1 || @items == 2) {
        # it is a final state: "<state> [<weight>]"
        $states_needs_self_loops{$items[0]} = 1;
    } elsif (@items == 4 || @items == 5) {
        # it is an arc: "<src> <dst> <inlabel> <outlabel> [<weight>]"
        # (assumes transducer text format, i.e. both labels are present)
        my ($src, $dst, $inlabel, $outlabel) = @items;
        $states_needs_self_loops{$src} = 1 if ($outlabel != 0);
    } else {
        chomp $line;
        die "Invalid openfst line $.: '$line'\n";
    }
}

# Emit the self-loops in numeric state order, so that the output is identical
# from run to run (plain hash order is not).
foreach my $state (sort { $a <=> $b } keys %states_needs_self_loops) {
    print "$state $state $wdisambig_phone $wdisambig_word 0.0\n";
}


================================================
FILE: egs/aishell2/local/jieba_split_text.py
================================================
import jieba
import sys

def split_text(src_file, dst_file, dict_file):
    """Segment the transcript of every utterance in *src_file* with jieba.

    Each non-blank input line is ``<uttid> <text...>``.  The text fields are
    joined with single spaces, cut by jieba (HMM disabled, so segmentation
    relies on *dict_file* only) and written to *dst_file* as
    ``<uttid> <token> <token> ...``.

    Args:
        src_file: path of the input text file.
        dst_file: path of the output file (overwritten).
        dict_file: path of the dictionary handed to ``jieba.set_dictionary``.
    """
    jieba.set_dictionary(dict_file)

    # NOTE(review): the files are assumed to be UTF-8; passing the encoding
    # explicitly keeps the result independent of the locale - confirm for
    # corpora stored in another encoding.
    with open(src_file, "r", encoding="utf-8") as reader, \
            open(dst_file, "w", encoding="utf-8") as writer:
        for line in reader:
            elems = line.split()
            if not elems:
                # A blank line carries no utterance id; skip it instead of
                # failing with IndexError.
                continue
            uttid = elems[0]
            text = " ".join(elems[1:])
            segmented = " ".join(jieba.lcut(text, HMM=False))
            writer.write(f"{uttid} {segmented}\n")


if __name__ == "__main__":
    if len(sys.argv) < 4:
        sys.exit(f"Usage: {sys.argv[0]} <src-text> <dst-text> <jieba-dict>")
    split_text(sys.argv[1], sys.argv[2], sys.argv[3])


================================================
FILE: egs/aishell2/local/k2_prepare_lang.sh
================================================
#!/usr/bin/env bash
# Copyright 2012-2013  Johns Hopkins University (Author: Daniel Povey);
#                      Arnab Ghoshal
#                2014  Guoguo Chen
#                2015  Hainan Xu
#                2016  FAU Erlangen (Author: Axel Horndasch)
#                2020  Xiaomi Corporation (Author: Junbo Zhang)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This script prepares a directory such as data/lang/, in the standard format,
# given a source directory containing a dictionary lexicon.txt in a form like:
# word phone1 phone2 ... phoneN
# per line (alternate prons would be separate lines), or a dictionary with probabilities
# called lexiconp.txt in a form:
# word pron-prob phone1 phone2 ... phoneN
# (with 0.0 < pron-prob <= 1.0); note: if lexiconp.txt exists, we use it even if
# lexicon.txt exists.
# and also files silence_phones.txt, nonsilence_phones.txt, optional_silence.txt
# and extra_questions.txt
# Here, silence_phones.txt and nonsilence_phones.txt are lists of silence and
# non-silence phones respectively (where silence includes various kinds of
# noise, laugh, cough, filled pauses etc., and nonsilence phones includes the
# "real" phones.)
# In each line of those files is a list of phones, and the phones on each line
# are assumed to correspond to the same "base phone", i.e. they will be
# different stress or tone variations of the same basic phone.
# The file "optional_silence.txt" contains just a single phone (typically SIL)
# which is used for optional silence in the lexicon.
# extra_questions.txt might be empty; typically will consist of lists of phones,
# all members of each list with the same stress or tone; and also possibly a
# list for the silence phones.  This will augment the automatically generated
# questions (note: the automatically generated ones will treat all the
# stress/tone versions of a phone the same, so will not "get to ask" about
# stress or tone).
#

# This script adds word-position-dependent phones and constructs a host of other
# derived files, that go in data/lang/.

# Begin configuration section.
# NOTE: num_sil_states, num_nonsil_states and share_silence_phones are not
# referenced anywhere else in this script; they are kept so that the
# corresponding command-line options remain defined.
num_sil_states=5
num_nonsil_states=3
position_dependent_phones=true
# position_dependent_phones is false also when position dependent phones and word_boundary.txt
# have been generated by another source
share_silence_phones=false  # if true, then share pdfs of different silence
                            # phones together.
sil_prob=0.5
unk_fst=                    # Checked further down as the value of --unk-fst.
                            # Initialised here so that a stray 'unk_fst' in the
                            # caller's environment is never picked up.
                            # NOTE(review): presumably parse_options.sh only
                            # accepts options whose variable is defined - confirm.
num_extra_phone_disambig_syms=1 # Standard one phone disambiguation symbol is used for optional silence.
                                # Increasing this number does not harm, but is only useful if you later
                                # want to introduce these labels to L_disambig.fst


# end configuration sections

echo "$0 $@"  # Print the command line for logging
. local/parse_options.sh || exit 1

if [ $# -ne 4 ]; then
  echo "Usage: $0 [options] <dict-src-dir> <oov-dict-entry> <tmp-dir> <lang-dir>"
  echo "e.g.: $0 data/local/dict <SPOKEN_NOISE> data/local/lang data/lang"
  echo "<dict-src-dir> should contain the following files:"
  echo " extra_questions.txt  lexicon.txt nonsilence_phones.txt  optional_silence.txt  silence_phones.txt"
  echo "See http://kaldi-asr.org/doc/data_prep.html#data_prep_lang_creating for more info."
  echo "<dict-src-dir> may also, for the grammar-decoding case (see http://kaldi-asr.org/doc/grammar.html)"
  echo "contain a file nonterminals.txt containing symbols like #nonterm:contact_list, one per line."
  echo "options: "
  echo "     --num-sil-states <number of states>             # default: 5, #states in silence models."
  echo "     --num-nonsil-states <number of states>          # default: 3, #states in non-silence models."
  echo "     --position-dependent-phones (true|false)        # default: true; if true, use _B, _E, _S & _I"
  echo "                                                     # markers on phones to indicate word-internal positions. "
  echo "     --share-silence-phones (true|false)             # default: false; if true, share pdfs of "
  echo "                                                     # all silence phones. "
  echo "     --sil-prob <probability of silence>             # default: 0.5 [must have 0 <= silprob < 1]"
  exit 1;
fi

srcdir=$1     # dictionary source directory (input)
oov_word=$2   # word that OOVs are mapped to
tmpdir=$3     # scratch directory for intermediate lexicons
dir=$4        # output lang directory

# Always start from an empty $dir/phones.  Paths are quoted so that a
# directory name containing whitespace cannot split into several arguments
# (important for the 'rm -r').
if [ -d "$dir/phones" ]; then
  rm -r "$dir/phones"
fi
mkdir -p "$dir" "$tmpdir" "$dir/phones" || exit 1

# The presence of lexiconp_silprob.txt switches on the silence-probability
# code paths in the rest of the script.
silprob=false
[ -f "$srcdir/lexiconp_silprob.txt" ] && silprob=true

[ -f path.sh ] && . ./path.sh

# lexicon.txt and lexiconp.txt are derived from each other, so at least one of
# them has to exist.
if [ ! -f "$srcdir/lexicon.txt" ] && [ ! -f "$srcdir/lexiconp.txt" ]; then
  echo "$0: neither $srcdir/lexicon.txt nor $srcdir/lexiconp.txt exists"
  exit 1
fi

if [[ ! -f "$srcdir/lexicon.txt" ]]; then
  echo "**Creating $srcdir/lexicon.txt from $srcdir/lexiconp.txt"
  # Drop the second field (the pron-prob).
  perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' < "$srcdir/lexiconp.txt" > "$srcdir/lexicon.txt" || exit 1;
fi
if [[ ! -f "$srcdir/lexiconp.txt" ]]; then
  echo "**Creating $srcdir/lexiconp.txt from $srcdir/lexicon.txt"
  # Insert a pron-prob of 1.0 after the word.
  perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < "$srcdir/lexicon.txt" > "$srcdir/lexiconp.txt" || exit 1;
fi

if [ ! -z "$unk_fst" ] && [ ! -f "$unk_fst" ]; then
  echo "$0: expected --unk-fst $unk_fst to exist as a file"
  exit 1
fi

if $position_dependent_phones; then
  # Create $tmpdir/lexiconp.txt from $srcdir/lexiconp.txt (or
  # $tmpdir/lexiconp_silprob.txt from $srcdir/lexiconp_silprob.txt) by
  # adding the markers _B, _E, _S, _I depending on word position.
  # In this recipe, these markers apply to silence also.
  # Do this starting from lexiconp.txt only.
  # Both branches abort the script if the perl filter fails (for instance on
  # an entry without phones, which triggers the embedded 'die').
  if "$silprob"; then
    perl -ane '@A=split(" ",$_); $w = shift @A; $p = shift @A; $silword_p = shift @A;
              $wordsil_f = shift @A; $wordnonsil_f = shift @A; @A>0||die;
         if(@A==1) { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_S\n"; }
         else { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_B ";
         for($n=1;$n<@A-1;$n++) { print "$A[$n]_I "; } print "$A[$n]_E\n"; } ' \
                < "$srcdir/lexiconp_silprob.txt" > "$tmpdir/lexiconp_silprob.txt" || exit 1;
  else
    perl -ane '@A=split(" ",$_); $w = shift @A; $p = shift @A; @A>0||die;
         if(@A==1) { print "$w $p $A[0]_S\n"; } else { print "$w $p $A[0]_B ";
         for($n=1;$n<@A-1;$n++) { print "$A[$n]_I "; } print "$A[$n]_E\n"; } ' \
         < "$srcdir/lexiconp.txt" > "$tmpdir/lexiconp.txt" || exit 1;
  fi

  # create $tmpdir/phone_map.txt
  # this has the format (on each line)
  # <original phone> <version 1 of original phone> <version 2> ...
  # where the versions depend on the position of the phone within a word.
  # For instance, we'd have:
  # AA AA_B AA_E AA_I AA_S
  # for (B)egin, (E)nd, (I)nternal and (S)ingleton
  # and in the case of silence
  # SIL SIL SIL_B SIL_E SIL_I SIL_S
  # [because SIL on its own is one of the variants; this is for when it doesn't
  #  occur inside a word but as an option in the lexicon.]

  # This phone map expands the phone lists into all the word-position-dependent
  # versions of the phone lists.
  # (The doubled "" in the silence loop is what produces the "SIL SIL ..." line
  # shown above: the map key followed by the bare phone as its first variant.)
  cat <(set -f; for x in `cat $srcdir/silence_phones.txt`; do for y in "" "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \
    <(set -f; for x in `cat $srcdir/nonsilence_phones.txt`; do for y in "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \
    > "$tmpdir/phone_map.txt"
else
  # No position markers: the lexicon is used as is and every phone maps to
  # itself.
  if "$silprob"; then
    cp "$srcdir/lexiconp_silprob.txt" "$tmpdir/lexiconp_silprob.txt" || exit 1
  else
    cp "$srcdir/lexiconp.txt" "$tmpdir/lexiconp.txt" || exit 1
  fi

  cat $srcdir/silence_phones.txt $srcdir/nonsilence_phones.txt | \
    awk '{for(n=1;n<=NF;n++) print $n; }' > "$tmpdir/phones"
  paste -d' ' "$tmpdir/phones" "$tmpdir/phones" > "$tmpdir/phone_map.txt"
fi


# Making monophone systems.
# silence.txt / nonsilence.txt: every phone of the source lists is replaced by
# its entry in phone_map.txt, and the awk filter prints one phone per line.
cat $srcdir/silence_phones.txt | local/apply_map.pl $tmpdir/phone_map.txt | \
  awk '{for(n=1;n<=NF;n++) print $n;}' > $dir/phones/silence.txt
cat $srcdir/nonsilence_phones.txt | local/apply_map.pl $tmpdir/phone_map.txt | \
  awk '{for(n=1;n<=NF;n++) print $n;}' > $dir/phones/nonsilence.txt
cp $srcdir/optional_silence.txt $dir/phones/optional_silence.txt

# if extra_questions.txt is empty, it's OK.
# (A missing file is tolerated too: the error from cat is discarded and
# apply_map.pl just sees empty input.)
cat $srcdir/extra_questions.txt 2>/dev/null | local/apply_map.pl $tmpdir/phone_map.txt \
  >$dir/phones/extra_questions.txt

# Want extra questions about the word-start/word-end stuff. Make it separate for
# silence and non-silence. Probably doesn't matter, as silence will rarely
# be inside a word.
# Each suffix appends one line to extra_questions.txt that lists all phones
# carrying that suffix; 'set -f' turns off globbing while the phone names are
# expanded unquoted.
if $position_dependent_phones; then
  for suffix in _B _E _I _S; do
    (set -f; for x in `cat $srcdir/nonsilence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt
  done
  for suffix in "" _B _E _I _S; do
    (set -f; for x in `cat $srcdir/silence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt
  done
fi

# add_lex_disambig.pl is responsible for adding disambiguation symbols to
# the lexicon and for telling us how many disambiguation symbols it used.
# If --unk-fst was provided, modify_unk_pron.py first changes the
# unknown-word's pronunciation to the sequence "#1 #2 #3", and
# add_lex_disambig.pl is told to start at #4 so that those symbols stay
# reserved for that purpose.
# The #2 will later be replaced with the actual unk model.  The reason
# for the #1 and the #3 is for disambiguation and also to keep the
# FST compact.  If we didn't have the #1, we might have a different copy of
# the unk-model FST, or at least some of its arcs, for each start-state from
# which an <unk> transition comes (instead of per end-state, which is more compact);
# and adding the #3 prevents us from potentially having 2 copies of the unk-model
# FST due to the optional-silence [the last phone of any word gets 2 arcs].
if [ ! -z "$unk_fst" ]; then  # if the --unk-fst option was provided...
  if "$silprob"; then
    local/lang/internal/modify_unk_pron.py $tmpdir/lexiconp_silprob.txt "$oov_word" || exit 1
  else
    local/lang/internal/modify_unk_pron.py $tmpdir/lexiconp.txt "$oov_word" || exit 1
  fi
  unk_opt="--first-allowed-disambig 4"
else
  unk_opt=
fi

# add_lex_disambig.pl prints the largest disambig number it used on stdout.
# Abort if it fails: otherwise $ndisambig would be empty and the arithmetic
# below would silently continue with a wrong symbol count.
# ($unk_opt is intentionally unquoted: it holds zero or two words.)
if "$silprob"; then
  ndisambig=$(local/add_lex_disambig.pl $unk_opt --pron-probs --sil-probs $tmpdir/lexiconp_silprob.txt $tmpdir/lexiconp_silprob_disambig.txt) || exit 1
else
  ndisambig=$(local/add_lex_disambig.pl $unk_opt --pron-probs $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt) || exit 1
fi
# add (at least) one disambig symbol for silence in lexicon FST.
# ($(( )) replaces the deprecated $[ ] arithmetic syntax.)
ndisambig=$((ndisambig + num_extra_phone_disambig_syms))
echo $ndisambig > $tmpdir/lex_ndisambig

# Format of lexiconp_disambig.txt:
# !SIL	1.0   SIL_S
# <SPOKEN_NOISE>	1.0   SPN_S #1
# <UNK>	1.0  SPN_S #2
# <NOISE>	1.0  NSN_S
# !EXCLAMATION-POINT	1.0  EH2_B K_I S_I K_I L_I AH0_I M_I EY1_I SH_I AH0_I N_I P_I OY2_I N_I T_E

# disambig.txt lists #0 .. #$ndisambig, one symbol per line.
( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) >$dir/phones/disambig.txt

# Create phone symbol table.
# Order: <eps>, silence phones, non-silence phones, disambig symbols; ids are
# the zero-based line numbers, so <eps> gets 0.
echo "<eps>" | cat - $dir/phones/{silence,nonsilence,disambig}.txt | \
  awk '{n=NR-1; print $1, n;}' > $dir/phones.txt

# Create a file that describes the word-boundary information for
# each phone.  5 categories.
# The category is read off the phone's suffix (_I, _B, _S, _E); a phone
# without one of these suffixes is labelled "nonword".
if $position_dependent_phones; then
  cat $dir/phones/{silence,nonsilence}.txt | \
    awk '/_I$/{print $1, "internal"; next;} /_B$/{print $1, "begin"; next; }
         /_S$/{print $1, "singleton"; next;} /_E$/{print $1, "end"; next; }
         {print $1, "nonword";} ' > $dir/phones/word_boundary.txt
else
  # word_boundary.txt might have been generated by another source
  # (it is copied only if present; its absence is not an error)
  [ -f $srcdir/word_boundary.txt ] && cp $srcdir/word_boundary.txt $dir/phones/word_boundary.txt
fi

# Create word symbol table.
# <s> and </s> are only needed due to the need to rescore lattices with
# ConstArpaLm format language model. They do not normally appear in G.fst or
# L.fst.

if "$silprob"; then
  # remove the silprob
  # (fields 3-5 are dropped; the remaining fields are written tab-separated,
  # which overwrites any $tmpdir/lexiconp.txt that already exists)
  cat $tmpdir/lexiconp_silprob.txt |\
    awk '{
      for(i=1; i<=NF; i++) {
        if(i!=3 && i!=4 && i!=5) printf("%s\t", $i); if(i==NF) print "";
      }
    }' > $tmpdir/lexiconp.txt
fi

# words.txt: <eps> is 0, the sorted unique words are numbered from 1, and
# #0, <s>, </s> take the three ids after the last word.  The script aborts if
# <s> or </s> already occurs as a word in the lexicon.
cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq  | awk '
  BEGIN {
    print "<eps> 0";
  }
  {
    if ($1 == "<s>") {
      print "<s> is in the vocabulary!" | "cat 1>&2"
      exit 1;
    }
    if ($1 == "</s>") {
      print "</s> is in the vocabulary!" | "cat 1>&2"
      exit 1;
    }
    printf("%s %d\n", $1, NR);
  }
  END {
    printf("#0 %d\n", NR+1);
    printf("<s> %d\n", NR+2);
    printf("</s> %d\n", NR+3);
  }' > $dir/words.txt || exit 1;

# format of $dir/words.txt:
#<eps> 0
#a 1
#aa 2
#aarvark 3
#...

# The optional-silence phone is mandatory: abort when optional_silence.txt
# cannot be read or is empty.
silphone=`cat $srcdir/optional_silence.txt` || exit 1;
[ -z "$silphone" ] && \
  ( echo "You have no optional-silence phone; it is required in the current scripts"
    echo "but you may use the option --sil-prob 0.0 to stop it being used." ) && \
   exit 1;

# Always empty in this script, i.e. no extra options are passed to the
# make_lexicon_fst*.py scripts.
grammar_opts=

# Create the basic L.fst without disambiguation symbols, for use
# in training.
# Only the text form ($dir/L.fst.txt) is written here; the fstcompile steps
# are commented out.  The two sym2int.pl calls turn field 3 (phone) and
# field 4 (word) of each line into integer ids.

if $silprob; then
  # Add silence probabilities (models the prob. of silence before and after each
  # word).  On some setups this helps a bit.  See local/dict_dir_add_pronprobs.sh
  # and where it's called in the example scripts (run.sh).
  local/make_lexicon_fst_silprob.py $grammar_opts --sil-phone=$silphone \
    $tmpdir/lexiconp_silprob.txt $srcdir/silprob.txt | \
    local/sym2int.pl -f 3 $dir/phones.txt | \
    local/sym2int.pl -f 4 $dir/words.txt  > $dir/L.fst.txt || exit 1;

    # fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
    #   --keep_isymbols=false --keep_osymbols=false |   \
    # fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;
else
  local/make_lexicon_fst.py $grammar_opts --sil-prob=$sil_prob --sil-phone=$silphone \
    $tmpdir/lexiconp.txt | \
    local/sym2int.pl -f 3 $dir/phones.txt | \
    local/sym2int.pl -f 4 $dir/words.txt > $dir/L.fst.txt || exit 1;

    # fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
    #   --keep_isymbols=false --keep_osymbols=false | \
    # fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;
fi

# The file oov.txt contains a word that we will map any OOVs to during
# training.
echo "$oov_word" > $dir/oov.txt || exit 1;
cat $dir/oov.txt | local/sym2int.pl $dir/words.txt >$dir/oov.int || exit 1;
# integer version of oov symbol, used in some scripts.


# the file wdisambig.txt contains a (line-by-line) list of the text-form of the
# disambiguation symbols that are used in the grammar and passed through by the
# lexicon.  At this stage it's hardcoded as '#0', but we're laying the groundwork
# for more generality (which probably would be added by another script).
echo '#0' >$dir/phones/wdisambig.txt

# Integer ids of '#0' in the phone and in the word symbol table; they are
# handed to fstaddselfloops.pl when L_disambig.fst.txt is built.  Abort if
# either lookup fails instead of continuing with an empty id.
wdisambig_phone=`local/sym2int.pl $dir/phones.txt <$dir/phones/wdisambig.txt` || exit 1
wdisambig_word=`local/sym2int.pl $dir/words.txt <$dir/phones/wdisambig.txt` || exit 1

# Create these lists of phones in colon-separated integer list form too,
# for purposes of being given to programs as command-line options.
# <f>.int holds one id per line, <f>.csl the same ids joined by ':'.
for f in silence nonsilence optional_silence disambig; do
  local/sym2int.pl $dir/phones.txt <$dir/phones/$f.txt >$dir/phones/$f.int || exit 1;
  local/sym2int.pl $dir/phones.txt <$dir/phones/$f.txt | \
   awk '{printf(":%d", $1);} END{printf "\n"}' | sed s/:// > $dir/phones/$f.csl || exit 1;
done

if [ -f $dir/phones/word_boundary.txt ]; then
  local/sym2int.pl -f 1 $dir/phones.txt <$dir/phones/word_boundary.txt \
    > $dir/phones/word_boundary.int || exit 1;
fi

# Create the lexicon FST with disambiguation symbols, and put it in lang_test.
# There is an extra step where we create a loop to "pass through" the
# disambiguation symbols from G.fst.
# As for L.fst.txt, only the text form is produced: phones (field 3) and
# words (field 4) are mapped to integer ids, then fstaddselfloops.pl appends
# the '#0' self-loops.  The highest-numbered phone disambig symbol
# (#$ndisambig) is the one passed as --sil-disambig.
# (The former silphonelist/nonsilphonelist variables were never used and have
# been dropped.)

if "$silprob"; then
  local/make_lexicon_fst_silprob.py $grammar_opts \
    --sil-phone=$silphone --sil-disambig='#'$ndisambig \
    $tmpdir/lexiconp_silprob_disambig.txt $srcdir/silprob.txt | \
    local/sym2int.pl -f 3 $dir/phones.txt | \
    local/sym2int.pl -f 4 $dir/words.txt | \
    local/fstaddselfloops.pl $wdisambig_phone $wdisambig_word > $dir/L_disambig.fst.txt || exit 1;
else
  local/make_lexicon_fst.py $grammar_opts \
    --sil-prob=$sil_prob --sil-phone=$silphone --sil-disambig='#'$ndisambig \
    $tmpdir/lexiconp_disambig.txt | \
    local/sym2int.pl -f 3 $dir/phones.txt | \
    local/sym2int.pl -f 4 $dir/words.txt | \
    local/fstaddselfloops.pl $wdisambig_phone $wdisambig_word > $dir/L_disambig.fst.txt || exit 1;
fi

exit 0;


================================================
FILE: egs/aishell2/local/make_lexicon_fst.py
================================================
#!/usr/bin/env python3

# Copyright   2018  Johns Hopkins University (author: Daniel Povey)
# Apache 2.0.

# see get_args() below for usage message.
import argparse
import os
import sys
import math
import re

# The use of latin-1 encoding does not preclude reading utf-8.  latin-1
# encoding means "treat words as sequences of bytes", and it is compatible
# with utf-8 encoding as well as other encodings such as gbk, as long as the
# spaces are also spaces in ascii (which we check).  It is basically how we
# emulate the behavior of python before python3.
#
# File descriptors 1 and 2 are the process's standard output and standard
# error.  They are re-wrapped here so that everything this script prints is
# encoded as latin-1, matching how the input files are read below.
# closefd=False leaves the underlying descriptors open if these wrapper
# objects are ever closed.
sys.stdout = open(1, 'w', encoding='latin-1', closefd=False)
sys.stderr = open(2, 'w', encoding='latin-1', closefd=False)

def get_args():
    """Parse the command line (sys.argv) of this script.

    Returns the argparse namespace with the attributes:
      sil_phone            text form of the optional-silence phone, or None.
      sil_prob             float, probability of optional silence (default 0.0).
      sil_disambig         silence disambiguation symbol (e.g. '#5'), or None.
      left_context_phones  filename of the left-context phone list, or None.
      nonterminals         filename of the nonterminal symbol list, or None.
      lexiconp             filename of the lexicon with pronunciation probs.

    argparse itself exits the process on a malformed command line.
    """
    parser = argparse.ArgumentParser(description="""This script creates the
       text form of a lexicon FST, to be compiled by fstcompile using the
       appropriate symbol tables (phones.txt and words.txt).  It will mostly
       be invoked indirectly via utils/prepare_lang.sh.  The output goes to
       the stdout.""")

    parser.add_argument('--sil-phone', dest='sil_phone', type=str,
                        help="""Text form of optional-silence phone, e.g. 'SIL'.  See also
                        the --sil-prob option.""")
    parser.add_argument('--sil-prob', dest='sil_prob', type=float, default=0.0,
                        help="""Probability of silence between words (including at the
                        beginning and end of word sequences).  Must be in the range [0.0, 1.0].
                        This refers to the optional silence inserted by the lexicon; see
                        the --sil-phone option.""")
    parser.add_argument('--sil-disambig', dest='sil_disambig', type=str,
                        help="""Disambiguation symbol to disambiguate silence, e.g. #5.
                        Will only be supplied if you are creating the version of L.fst
                        with disambiguation symbols, intended for use with cyclic G.fst.
                        This symbol was introduced to fix a rather obscure source of
                        nondeterminism of CLG.fst, that has to do with reordering of
                        disambiguation symbols and phone symbols.""")
    parser.add_argument('--left-context-phones', dest='left_context_phones', type=str,
                        help="""Only relevant if --nonterminals is also supplied; this relates
                        to grammar decoding (see http://kaldi-asr.org/doc/grammar.html or
                        src/doc/grammar.dox).  Format is a list of left-context phones,
                        in text form, one per line.  E.g. data/lang/phones/left_context_phones.txt""")
    parser.add_argument('--nonterminals', type=str,
                        help="""If supplied, --left-context-phones must also be supplied.
                        List of user-defined nonterminal symbols such as #nonterm:contact_list,
                        one per line.  E.g. data/local/dict/nonterminals.txt.""")
    parser.add_argument('lexiconp', type=str,
                        help="""Filename of lexicon with pronunciation probabilities
                        (normally lexiconp.txt), with lines of the form 'word prob p1 p2...',
                        e.g. 'a   1.0    ay'""")
    return parser.parse_args()


def read_lexiconp(filename):
    """Reads the lexiconp.txt file in 'filename', with lines like 'word prob p1 p2 ...'.
    Returns a list of tuples (word, pron_prob, pron), where 'word' is a string,
    'pron_prob', a float, is the pronunciation probability (which must be >0.0
    and would normally be <=1.0),  and 'pron' is a list of strings representing phones.
    An element in the returned list might be ('hello', 1.0, ['h', 'eh', 'l', 'ow']).

    Fields are separated by runs of spaces and/or tabs.  On any malformed line
    (fewer than two fields, '<eps>' used as a word, a non-numeric or
    non-positive pron-prob) or if the file yields no entries at all, an error
    is printed to stderr and the process exits with status 1.  Empty
    pronunciations and pron-probs above 1.0 are accepted, but each causes a
    single warning on stderr after the whole file has been read.
    """

    ans = []
    found_empty_prons = False
    found_large_pronprobs = False
    whitespace = re.compile("[ \t]+")
    # See the comment near the top of this file, RE why we use latin-1.
    with open(filename, 'r', encoding='latin-1') as f:
        for line in f:
            stripped = line.strip(" \t\r\n")
            a = whitespace.split(stripped)
            if len(a) < 2:
                print("{0}: error: found bad line '{1}' in lexicon file {2} ".format(
                    sys.argv[0], stripped, filename), file=sys.stderr)
                sys.exit(1)
            word = a[0]
            if word == "<eps>":
                # This would clash with the epsilon symbol normally used in OpenFst.
                print("{0}: error: found <eps> as a word in lexicon file "
                      "{1}".format(sys.argv[0], filename), file=sys.stderr)
                sys.exit(1)
            try:
                pron_prob = float(a[1])
            except ValueError:
                print("{0}: error: found bad line '{1}' in lexicon file {2}, 2nd field "
                      "should be pron-prob".format(sys.argv[0], stripped, filename),
                      file=sys.stderr)
                sys.exit(1)
            prons = a[2:]
            if pron_prob <= 0.0:
                print("{0}: error: invalid pron-prob in line '{1}' of lexicon file {2} ".format(
                    sys.argv[0], stripped, filename), file=sys.stderr)
                sys.exit(1)
            if not prons:
                found_empty_prons = True
            ans.append((word, pron_prob, prons))
            if pron_prob > 1.0:
                found_large_pronprobs = True
    if found_empty_prons:
        print("{0}: warning: found at least one word with an empty pronunciation "
              "in lexicon file {1}.".format(sys.argv[0], filename),
              file=sys.stderr)
    if found_large_pronprobs:
        print("{0}: warning: found at least one word with pron-prob >1.0 "
              "in {1}".format(sys.argv[0], filename), file=sys.stderr)

    if not ans:
        print("{0}: error: found no pronunciations in lexicon file {1}".format(
            sys.argv[0], filename), file=sys.stderr)
        sys.exit(1)
    return ans


def write_nonterminal_arcs(start_state, loop_state, next_state,
                           nonterminals, left_context_phones):
    """Print to stdout the extra L.fst arcs needed for grammar decoding
    (see kaldi-asr.org/doc/grammar.html, section "Special symbols in L.fst",
    kaldi-asr.org/doc/grammar.html#grammar_special_l).  Called from the
    write_fst_* functions of this file.

       start_state: the start-state of L.fst.
       loop_state:  the state of high out-degree in L.fst where words leave
                  and enter.
       next_state: the first state number this function may allocate; two
                  states are allocated (a shared state and a final state).
       nonterminals: the user-defined nonterminal symbols as a list of
          strings, e.g. ['#nonterm:contact_list', ... ].
       left_context_phones: a list of phones that may appear as left-context,
          e.g. ['a', 'ah', ... '#nonterm_bos'].

    Returns the updated value of next_state.
    """
    def emit(src, dest, phone, word, cost):
        # One arc in text-FST form: tab-separated src, dest, ilabel, olabel, cost.
        print(src, dest, phone, word, cost, sep="\t")

    shared_state, final_state = next_state, next_state + 1

    # Entry into the shared state, either from the start state on
    # #nonterm_begin or from the loop state on any user-defined nonterminal.
    emit(start_state, shared_state, '#nonterm_begin', '#nonterm_begin', 0.0)
    for symbol in nonterminals:
        emit(loop_state, shared_state, symbol, symbol, 0.0)

    # this_cost equals log(len(left_context_phones)) but the expression below
    # better captures the meaning.  Applying this cost to arcs keeps the FST
    # stochastic (sum-to-one, like an HMM), so that weight pushing behaves.
    # The grammar-FST code cancels this cost out again when splicing things
    # together, see the function CombineArcs().
    this_cost = -math.log(1.0 / len(left_context_phones))
    for context_phone in left_context_phones:
        emit(shared_state, loop_state, context_phone, '<eps>', this_cost)

    # arc from loop-state to a final-state with #nonterm_end as ilabel and olabel
    emit(loop_state, final_state, '#nonterm_end', '#nonterm_end', 0.0)
    print(final_state, 0.0, sep="\t")
    return final_state + 1


def write_fst_no_silence(lexicon, nonterminals=None, left_context_phones=None):
    """Print the text form of L.fst to stdout for the case --sil-prob=0.0,
    i.e. when no optional silence is allowed.

      'lexicon' is a list of 3-tuples (word, pron-prob, prons) as returned by
        read_lexiconp().
      'nonterminals', which relates to grammar decoding
        (see kaldi-asr.org/doc/grammar.html), is either None or the
        user-defined nonterminal symbols as a list of strings,
        e.g. ['#nonterm:contact_list', ... ].
      'left_context_phones', which also relates to grammar decoding and must
        be supplied if 'nonterminals' is supplied, is either None or a list of
        phones that may appear as left-context,
        e.g. ['a', 'ah', ... '#nonterm_bos'].
    """
    loop_state = 0   # state 0 is the start state, the word loop and the final state.
    next_state = 1   # first state number not yet handed out.

    for (word, pronprob, pron) in lexicon:
        cost = -math.log(pronprob)
        # A word with an empty pronunciation becomes a single <eps>-input arc.
        phones = pron if pron else ['<eps>']
        last_pos = len(phones) - 1
        src = loop_state
        for pos, phone in enumerate(phones):
            # Only the first arc of a pronunciation carries the word label
            # and the pronunciation cost.
            olabel = word if pos == 0 else '<eps>'
            arc_cost = cost if pos == 0 else 0.0
            if pos == last_pos:
                dest = loop_state
            else:
                dest = next_state
                next_state += 1
            print(src, dest, phone, olabel, arc_cost, sep="\t")
            src = dest

    if nonterminals is not None:
        # Here the start state and the loop state are the same state.
        next_state = write_nonterminal_arcs(
            loop_state, loop_state, next_state,
            nonterminals, left_context_phones)

    print(loop_state, 0.0, sep="\t")


def write_fst_with_silence(lexicon, sil_prob, sil_phone, sil_disambig,
                           nonterminals=None, left_context_phones=None):
    """Print the text form of L.fst to stdout for the case --sil-prob != 0.0,
    i.e. when optional silence is allowed between words.

      'lexicon' is a list of 3-tuples (word, pron-prob, prons)
         as returned by read_lexiconp().
      'sil_prob', which must be strictly between 0.0 and 1.0 (asserted), is
         the probability of silence.
      'sil_phone' is the silence phone, e.g. "SIL".
      'sil_disambig' is either None, or the silence disambiguation symbol,
         e.g. "#5".
      'nonterminals', which relates to grammar decoding
         (see kaldi-asr.org/doc/grammar.html), is either None or the
         user-defined nonterminal symbols as a list of strings,
         e.g. ['#nonterm:contact_list', ... ].
      'left_context_phones', which also relates to grammar decoding and must
         be supplied if 'nonterminals' is supplied, is either None or a list
         of phones that may appear as left-context,
         e.g. ['a', 'ah', ... '#nonterm_bos'].
    """
    def emit(src, dest, phone, word, cost):
        # One arc in text-FST form: tab-separated src, dest, ilabel, olabel, cost.
        print(src, dest, phone, word, cost, sep="\t")

    assert sil_prob > 0.0 and sil_prob < 1.0
    sil_cost = -math.log(sil_prob)
    no_sil_cost = -math.log(1.0 - sil_prob)

    start_state = 0
    loop_state = 1  # words enter and leave from here
    sil_state = 2   # words terminate here when followed by silence; this state
                    # has a silence transition to loop_state.
    next_state = 3  # first state number not yet handed out.

    # From the start state, either skip the initial silence or take it.
    emit(start_state, loop_state, '<eps>', '<eps>', no_sil_cost)
    emit(start_state, sil_state, '<eps>', '<eps>', sil_cost)

    if sil_disambig is not None:
        # Silence is followed by its disambiguation symbol, which needs one
        # intermediate state.
        sil_disambig_state = next_state
        next_state += 1
        emit(sil_state, sil_disambig_state, sil_phone, '<eps>', 0.0)
        emit(sil_disambig_state, loop_state, sil_disambig, '<eps>', 0.0)
    else:
        emit(sil_state, loop_state, sil_phone, '<eps>', 0.0)

    for (word, pronprob, pron) in lexicon:
        pron_cost = -math.log(pronprob)
        # A word with an empty pronunciation becomes <eps>-input arcs.
        phones = pron if pron else ['<eps>']
        last_pos = len(phones) - 1
        src = loop_state
        for pos, phone in enumerate(phones):
            # Only the first arc of a pronunciation carries the word label
            # and the pronunciation cost.
            olabel = word if pos == 0 else '<eps>'
            base_cost = pron_cost if pos == 0 else 0.0
            if pos < last_pos:
                emit(src, next_state, phone, olabel, base_cost)
                src = next_state
                next_state += 1
            else:
                # The last phone either returns straight to the loop state
                # (no silence follows) or goes to the silence state.
                emit(src, loop_state, phone, olabel, no_sil_cost + base_cost)
                emit(src, sil_state, phone, olabel, sil_cost + base_cost)

    if nonterminals is not None:
        next_state = write_nonterminal_arcs(
            start_state, loop_state, next_state,
            nonterminals, left_context_phones)

    print(loop_state, 0.0, sep="\t")


def write_words_txt(orig_lines, highest_numbered_symbol, nonterminals, filename):
    """Write an updated words.txt to 'filename' (latin-1 encoded).

       'orig_lines' are the lines of the original words.txt as a list of
       strings (without the newlines); they are copied through unchanged.
       'highest_numbered_symbol' is the highest symbol number used in the
       original words.txt.  'nonterminals' is a list of strings like
       '#nonterm:foo'.  After the original lines, '#nonterm_begin',
       '#nonterm_end' and then each nonterminal are appended, numbered
       consecutively from highest_numbered_symbol + 1."""
    appended_symbols = ['#nonterm_begin', '#nonterm_end'] + nonterminals
    first_new_id = highest_numbered_symbol + 1
    with open(filename, 'w', encoding='latin-1') as out:
        for original in orig_lines:
            print(original, file=out)
        for symbol_id, symbol in enumerate(appended_symbols, first_new_id):
            print("{0} {1}".format(symbol, symbol_id), file=out)


def read_nonterminals(filename):
    """Reads the user-defined nonterminal symbols in 'filename' (one per line),
       checks that it has the expected format and has no duplicates, and
       returns the nonterminal symbols as a list of strings, e.g.
       ['#nonterm:contact_list', '#nonterm:phone_number', ... ].

       Raises RuntimeError if the file has no lines, if any line (after
       stripping surrounding spaces, tabs and line terminators) does not start
       with '#nonterm:', or if a symbol appears more than once."""
    # Read inside a 'with' block so the file handle is closed deterministically.
    with open(filename, 'r', encoding='latin-1') as f:
        ans = [line.strip(" \t\r\n") for line in f]
    if not ans:
        raise RuntimeError("The file {0} contains no nonterminals symbols.".format(filename))
    for nonterm in ans:
        if not nonterm.startswith('#nonterm:'):
            raise RuntimeError("In file '{0}', expected nonterminal symbols to start with '#nonterm:', found '{1}'"
                               .format(filename, nonterm))
    if len(set(ans)) != len(ans):
        raise RuntimeError("Duplicate nonterminal symbols are present in file {0}".format(filename))
    return ans

def read_left_context_phones(filename):
    """Reads, checks, and returns a list of left-context phones, in text form, one
       per line.  Returns a list of strings, e.g. ['a', 'ah', ..., '#nonterm_bos' ]

       Raises RuntimeError if the file has no lines, if any line (after
       stripping surrounding spaces, tabs and line terminators) contains
       internal spaces or tabs, or if a phone appears more than once."""
    # Read inside a 'with' block so the file handle is closed deterministically.
    with open(filename, 'r', encoding='latin-1') as f:
        ans = [line.strip(" \t\r\n") for line in f]
    if not ans:
        raise RuntimeError("The file {0} contains no left-context phones.".format(filename))
    whitespace = re.compile("[ \t]+")
    for s in ans:
        if len(whitespace.split(s)) != 1:
            raise RuntimeError("The file {0} contains an invalid line '{1}'".format(filename, s))

    if len(set(ans)) != len(ans):
        raise RuntimeError("Duplicate left-context phones are present in file {0}".format(filename))
    return ans


def is_token(s):
    """Returns True if s is a string that contains no space, tab, carriage
    return or newline character (the empty string therefore counts as a
    token); returns False for anything that is not a str."""
    if isinstance(s, str):
        # Splitting on whitespace yields a single piece exactly when the
        # string contains no whitespace character at all.
        return re.search("[ \t\r\n]+", s) is None
    return False


def main():
    """Entry point: parse the command line, read the lexicon (and, for grammar
    decoding, the nonterminal and left-context-phone lists), then print the
    text form of the lexicon FST to stdout.

    All diagnostics go to stderr, because stdout carries the FST itself.
    Exits with status 1 on inconsistent or invalid options.
    """
    args = get_args()

    lexicon = read_lexiconp(args.lexiconp)

    if args.nonterminals is None:
        nonterminals, left_context_phones = None, None
    else:
        if args.left_context_phones is None:
            print("{0}: if --nonterminals is specified, --left-context-phones must also "
                  "be specified".format(sys.argv[0]), file=sys.stderr)
            sys.exit(1)
        nonterminals = read_nonterminals(args.nonterminals)
        left_context_phones = read_left_context_phones(args.left_context_phones)

    if args.sil_prob == 0.0:
        # No optional silence: --sil-phone and --sil-disambig are not used.
        write_fst_no_silence(lexicon,
                             nonterminals=nonterminals,
                             left_context_phones=left_context_phones)
    else:
        # Do some checking that the options make sense.
        if args.sil_prob < 0.0 or args.sil_prob >= 1.0:
            print("{0}: invalid value specified --sil-prob={1}".format(
                sys.argv[0], args.sil_prob), file=sys.stderr)
            sys.exit(1)

        if not is_token(args.sil_phone):
            print("{0}: you specified --sil-prob={1} but --sil-phone is set "
                  "to '{2}'".format(sys.argv[0], args.sil_prob, args.sil_phone),
                  file=sys.stderr)
            sys.exit(1)
        if args.sil_disambig is not None and not is_token(args.sil_disambig):
            print("{0}: invalid value --sil-disambig='{1}' was specified."
                  "".format(sys.argv[0], args.sil_disambig), file=sys.stderr)
            sys.exit(1)
        write_fst_with_silence(lexicon, args.sil_prob, args.sil_phone,
                               args.sil_disambig,
                               nonterminals=nonterminals,
                               left_context_phones=left_context_phones)


#    (lines, highest_symbol) = read_words_txt(args.input_words_txt)
#    nonterminals = read_nonterminals(args.nonterminal_symbols_list)
#    write_words_txt(lines, highest_symbol, nonterminals, args.output_words_txt)


# Run main() only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
      main()


================================================
FILE: egs/aishell2/local/max_rescore.py
================================================
import sys
import json
import codecs
import copy

def select_oracle_hypotheses(utts):
    """Reduce every utterance's n-best list to a single hypothesis, in place.

    'utts' maps an utterance name to a dict whose "output" entry is a list of
    hypothesis dicts carrying at least "text", "rec_text" and "mmi_tot_score".
    For each utterance, the first hypothesis after the top one whose "text"
    equals its "rec_text" with every "<eos>" removed replaces the n-best list;
    if there is none, only the top hypothesis is kept.

    Returns a dict mapping the name of each utterance where such a replacement
    happened to a deep copy of [top hypothesis, chosen hypothesis].
    """
    best_dict = {}
    for name, utt in utts.items():
        hyp_lst = utt["output"]
        chosen = hyp_lst[:1]
        for idx, hyp in enumerate(hyp_lst):
            if hyp["text"] == hyp["rec_text"].replace("<eos>", "") and idx > 0:
                best_dict[name] = copy.deepcopy([hyp_lst[0], hyp])
                print(f"{name}: {idx}-th is the best")
                if hyp_lst[0]["mmi_tot_score"] - hyp["mmi_tot_score"] < -1e-5:
                    print("May be corrected by MMI")
                chosen = [hyp]
                # Stop at the first match.  Carrying on would index the n-best
                # list with positions that no longer exist once it has been
                # replaced, raising IndexError when a later hypothesis also
                # matches.
                break
        utt["output"] = chosen
    return best_dict


def _dump_json(obj, filename):
    """Write 'obj' to 'filename' as pretty-printed, key-sorted UTF-8 JSON."""
    with open(filename, "wb") as f:
        f.write(
            json.dumps(
                obj, indent=4, ensure_ascii=False, sort_keys=True
            ).encode("utf_8")
        )


def main():
    """Usage: max_rescore.py <data.json> <out.json> <best-dict.json>"""
    if len(sys.argv) != 4:
        sys.exit("Usage: {0} <data.json> <out.json> <best-dict.json>".format(sys.argv[0]))
    json_f, json_f_out, best_dict_f = sys.argv[1:4]

    with codecs.open(json_f, "r", encoding="utf-8") as f:
        j = json.load(f)

    best_dict = select_oracle_hypotheses(j["utts"])

    _dump_json(j, json_f_out)
    _dump_json(best_dict, best_dict_f)


if __name__ == "__main__":
    main()


================================================
FILE: egs/aishell2/local/mmi_rescore.sh
================================================
#!/usr/bin/env bash

# Post-process the n-best decoding result <decode-dir>/data.json and score
# each variant under <decode-dir>/rescore/:
#   best/  output of local/max_rescore.py, which keeps for every utterance a
#          hypothesis whose text matches its reference when the n-best list
#          contains one (otherwise the top hypothesis);
#   <w>/   output of local/rerank.py for w = 0.1 ... 0.9, which re-sorts each
#          n-best list by  w * score + (1 - w) * mmi_tot_score.
# Every directory is then passed to score_sclite.sh together with <dict>.

if [ $# -lt 2 ]; then
  # Without this check an empty <decode-dir> would make the script work in /rescore.
  echo "Usage: $0 <decode-dir> <dict>" 1>&2
  exit 1
fi

decode_dir=$1
dict=$2

dir="$decode_dir/rescore"

mkdir -p "$dir/best"
python3 local/max_rescore.py "$decode_dir/data.json" "$dir/best/data.1.json" "$dir/best/best.json"
score_sclite.sh "$dir/best" "$dict" > "$dir/best/decode_result.txt"

for w in 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9; do
    mkdir -p "$dir/$w"
    python3 local/rerank.py "$decode_dir/data.json" "$w" "$dir/$w/data.1.json"
    score_sclite.sh "$dir/$w" "$dict" > "$dir/$w/decode_result.txt"
done


================================================
FILE: egs/aishell2/local/parse_options.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey);
#                 Arnab Ghoshal, Karel Vesely

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# Parse command-line options.
# To be sourced by another script (as in ". parse_options.sh").
# Option format is: --option-name arg
# and shell variable "option_name" gets set to value "arg."
# The exception is --help, which takes no arguments, but prints the
# $help_message variable (if defined).


###
### The --config file options have lower priority to command line
### options, so we need to import them first...
###

# Now import all the configs specified by command-line, in left-to-right order.
# The loop stops one position before the last argument, so a trailing
# "--config" with no file name after it is not treated as a config here.
for ((argpos=1; argpos<$#; argpos++)); do
  # ${!argpos} is indirect expansion: the positional parameter number $argpos.
  if [ "${!argpos}" == "--config" ]; then
    argpos_plus1=$((argpos+1))
    config=${!argpos_plus1}
    [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
    . $config  # source the config file.
  fi
done


###
### Now we process the command line options
###
# Options are consumed from the front of the argument list two at a time
# (name, value); whatever remains afterwards is left in "$@" for the
# enclosing script.
while true; do
  [ -z "${1:-}" ] && break;  # break if there are no arguments
                             # (an empty first argument also stops parsing)
  case "$1" in
    # If the enclosing script is called with --help option, print the help
    # message and exit.  Scripts should put help messages in $help_message
    --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
      else printf "$help_message\n" 1>&2 ; fi;
      exit 0 ;;
    # The "--name=value" form is rejected explicitly; only "--name value" is
    # supported.
    --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
      exit 1 ;;
    # If the first command-line argument begins with "--" (e.g. --foo-bar),
    # then work out the variable name as $name, which will equal "foo_bar".
    --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
      # Next we test whether the variable in question is undefined-- if so it's
      # an invalid option and we die.  Note: $0 evaluates to the name of the
      # enclosing script.
      # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
      # is undefined.  We then have to wrap this test inside "eval" because
      # foo_bar is itself inside a variable ($name).
      eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;

      # Remember the variable's current (default) value before overwriting it,
      # so that we can tell below whether it was a Boolean.
      oldval="`eval echo \\$$name`";
      # Work out whether we seem to be expecting a Boolean argument.
      if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
        was_bool=true;
      else
        was_bool=false;
      fi

      # Set the variable to the right value-- the escaped quotes make it work if
      # the option had spaces, like --cmd "queue.pl -sync y"
      eval $name=\"$2\";

      # Check that Boolean-valued arguments are really Boolean.
      if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
        exit 1;
      fi
      shift 2;
      ;;
  # The first argument that does not start with "--" ends option parsing.
  *) break;
  esac
done


# Check for an empty argument to the --cmd option, which can easily occur as a
# result of scripting errors.
[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;


true; # so this script returns exit code 0.


================================================
FILE: egs/aishell2/local/prepare_data.sh
================================================
#!/usr/bin/env bash
# Copyright 2018 AIShell-Foundation(Authors:Jiayu DU, Xingyu NA, Bengu WU, Hao ZHENG)
#           2018 Beijing Shell Shell Tech. Co. Ltd. (Author: Hui BU)
# Apache 2.0

# transform raw AISHELL-2 data to kaldi format

. ./path.sh || exit 1;

if [ $# != 3 ]; then
  echo "Usage: $0 <corpus-data-dir> <tmp-dir> <output-dir>"
  echo " $0 /export/AISHELL-2/iOS/train data/local/train data/train"
  exit 1;
fi

corpus=$1   # directory holding wav.scp and trans.txt
tmp=$2      # scratch directory for intermediate lists
dir=$3      # output directory (wav.scp, text, utt2spk, spk2utt)

echo "prepare_data.sh: Preparing data in $corpus"

# All path expansions below are quoted so that directories containing spaces
# or glob characters are handled correctly.
mkdir -p "$tmp" "$dir" || exit 1;

# corpus check
if [ ! -d "$corpus" ] || [ ! -f "$corpus/wav.scp" ] || [ ! -f "$corpus/trans.txt" ]; then
  echo "Error: $0 requires wav.scp and trans.txt under $corpus directory."
  exit 1;
fi

# validate utt-key list, IC0803W0380 is a bad utterance
# utt.list keeps the transcript keys that also have a wav entry.
awk '{print $1}' "$corpus/wav.scp" | grep -v 'IC0803W0380' > "$tmp/wav_utt.list"
awk '{print $1}' "$corpus/trans.txt" > "$tmp/trans_utt.list"
utils/filter_scp.pl -f 1 "$tmp/wav_utt.list" "$tmp/trans_utt.list" > "$tmp/utt.list"

# wav.scp
# Prefix every wav path (2nd tab-separated field) with the corpus directory.
awk -F'\t' -v path_prefix="$corpus" '{printf("%s\t%s/%s\n",$1,path_prefix,$2)}' "$corpus/wav.scp" > "$tmp/tmp_wav.scp"
utils/filter_scp.pl -f 1 "$tmp/utt.list" "$tmp/tmp_wav.scp" | sort -k 1 | uniq > "$tmp/wav.scp"

# text
utils/filter_scp.pl -f 1 "$tmp/utt.list" "$corpus/trans.txt" | sort -k 1 | uniq > "$tmp/text"

# utt2spk & spk2utt
# The utterance id is the wav file name without ".wav"; the speaker id is the
# name of the directory that directly contains the wav file.
awk -F'\t' '{print $2}' "$tmp/wav.scp" > "$tmp/wav.list"
sed -e 's:\.wav::g' "$tmp/wav.list" | \
  awk -F'/' '{i=NF-1;printf("%s\t%s\n",$NF,$i)}' > "$tmp/tmp_utt2spk"
utils/filter_scp.pl -f 1 "$tmp/utt.list" "$tmp/tmp_utt2spk" | sort -k 1 | uniq > "$tmp/utt2spk"
utils/utt2spk_to_spk2utt.pl "$tmp/utt2spk" | sort -k 1 | uniq > "$tmp/spk2utt"

# copy prepared resources from tmp_dir to target dir
for f in wav.scp text spk2utt utt2spk; do
  cp "$tmp/$f" "$dir/$f" || exit 1;
done

echo "local/prepare_data.sh succeeded"
exit 0;


================================================
FILE: egs/aishell2/local/prepare_dict.sh
================================================
#!/usr/bin/env bash
# Copyright 2018 AIShell-Foundation(Authors:Jiayu DU, Xingyu NA, Bengu WU, Hao ZHENG)
#           2018 Beijing Shell Shell Tech. Co. Ltd. (Author: Hui BU)
# Apache 2.0

# This is a shell script, and it download and process DaCiDian for Mandarin ASR.

. ./path.sh

download_dir=data/local/DaCiDian

if [ $# -ne 1 ]; then
  echo "Usage: $0 <dict-dir>";
  exit 1;
fi

dir=$1

# download the DaCiDian from github
# Stop here if the download fails; otherwise the steps below would go on to
# build a dictionary containing only <UNK> and still report success.
if [ ! -d "$download_dir" ]; then
  git clone https://github.com/aishell-foundation/DaCiDian.git "$download_dir" || exit 1;
fi

# here we map <UNK> to the phone spn(spoken noise)
mkdir -p "$dir" || exit 1;
python "$download_dir/DaCiDian.py" "$download_dir/word_to_pinyin.txt" "$download_dir/pinyin_to_phone.txt" > "$dir/lexicon.txt" || exit 1;
echo -e "<UNK>\tspn" >> "$dir/lexicon.txt"

# prepare silence_phones.txt, nonsilence_phones.txt, optional_silence.txt, extra_questions.txt
# awk collects every distinct symbol found from the 2nd field onward of
# lexicon.txt; perl then drops "sil" and puts all phones that share the same
# non-digit prefix on one line.
cat "$dir/lexicon.txt" | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}'| \
  perl -e 'while(<>){ chomp($_); $phone = $_; next if ($phone eq "sil");
    m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$1} .= "$phone "; }
    foreach $l (values %q) {print "$l\n";}
  ' | sort -k1 > "$dir/nonsilence_phones.txt"  || exit 1;

echo sil > "$dir/silence_phones.txt"
echo sil > "$dir/optional_silence.txt"

# extra_questions.txt: one line with all silence phones, followed by one line
# per distinct trailing-digit suffix of the non-silence phones.
cat "$dir/silence_phones.txt" | awk '{printf("%s ", $1);} END{printf "\n";}' > "$dir/extra_questions.txt" || exit 1;
cat "$dir/nonsilence_phones.txt" | perl -e 'while(<>){ foreach $p (split(" ", $_)) {
  $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; if($p eq "\$0"){$q{""} .= "$p ";}else{$q{$2} .= "$p ";} } } foreach $l (values %q) {print "$l\n";}' \
 >> "$dir/extra_questions.txt" || exit 1;

echo "local/prepare_dict.sh succeeded"
exit 0;


================================================
FILE: egs/aishell2/local/rerank.py
================================================
import sys
import json
import codecs


def rerank_hypotheses(data, weight):
    """Interpolate hypothesis scores and re-sort every n-best list in place.

    For each utterance in ``data["utts"]`` every hypothesis in its
    ``"output"`` list gets the new score
    ``score * weight + mmi_tot_score * (1 - weight)``; the list is then
    sorted by that score in descending order.

    Args:
        data: Decoded JSON object with an ``"utts"`` mapping whose values
            hold an ``"output"`` list of hypothesis dicts. Each hypothesis
            must provide ``"score"`` and ``"mmi_tot_score"`` (anything
            accepted by ``float``).
        weight: Interpolation weight given to the original ``"score"``.

    Returns:
        The same ``data`` object, modified in place.
    """
    for utt in data["utts"].values():
        hyps = utt["output"]
        for hyp in hyps:
            hyp["score"] = (
                float(hyp["score"]) * weight
                + float(hyp["mmi_tot_score"]) * (1 - weight)
            )
        # list.sort is stable, so ties keep their original relative order.
        hyps.sort(key=lambda hyp: hyp["score"], reverse=True)
    return data


def main(argv):
    """Command-line entry point: ``rerank.py <in-json> <weight> <out-json>``.

    Returns the process exit status (0 on success, 1 on a usage error).
    """
    # Fail with a usage message instead of a bare IndexError.
    if len(argv) < 4:
        sys.stderr.write("Usage: rerank.py <in-json> <weight> <out-json>\n")
        return 1

    json_f = argv[1]
    weight = float(argv[2])
    json_f_out = argv[3]

    with codecs.open(json_f, "r", encoding="utf-8") as f:
        j = json.load(f)

    rerank_hypotheses(j, weight)

    # Written as UTF-8 bytes so non-ASCII text is stored unescaped.
    with open(json_f_out, "wb") as f:
        f.write(
            json.dumps(
                j, indent=4, ensure_ascii=False, sort_keys=True
            ).encode("utf_8")
        )
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))


================================================
FILE: egs/aishell2/local/sym2int.pl
================================================
#!/usr/bin/env perl
# Copyright 2010-2012 Microsoft Corporation  Johns Hopkins University (Author: Daniel Povey)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# Maps symbols in whitespace-separated input lines to their integer ids
# using a two-column symbol table; see the usage text below for options.

# NOTE(review): $ignore_oov is assigned here but never read in this script.
$ignore_oov = 0;

# Parse the leading options. The loop runs twice so that --map-oov and -f
# are both recognised regardless of the order they are given in.
for($x = 0; $x < 2; $x++) {
  if ($ARGV[0] eq "--map-oov") {
    shift @ARGV;
    $map_oov = shift @ARGV;
    if ($map_oov eq "-f" || $map_oov =~ m/words\.txt$/ || $map_oov eq "") {
      # disallow '-f', the empty string and anything ending in words.txt as the
      # OOV symbol because these are likely command-line errors.
      die "the --map-oov option requires an argument";
    }
  }
  if ($ARGV[0] eq "-f") {
    shift @ARGV;
    $field_spec = shift @ARGV;
    # A single number selects exactly that (1-based) field.
    if ($field_spec =~ m/^\d+$/) {
      $field_begin = $field_spec - 1; $field_end = $field_spec - 1;
    }
    # A range may leave either end open (e.g. "2-" or "-3"); an open end
    # leaves the corresponding bound undefined, i.e. unbounded.
    if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesy (properly, 1-10)
      if ($1 ne "") {
        $field_begin = $1 - 1;  # Change to zero-based indexing.
      }
      if ($2 ne "") {
        $field_end = $2 - 1;    # Change to zero-based indexing.
      }
    }
    if (!defined $field_begin && !defined $field_end) {
      die "Bad argument to -f option: $field_spec";
    }
  }
}

# First positional argument: the symbol table. When it is missing the usage
# text is printed; the script then stops at the failing open() below.
$symtab = shift @ARGV;
if (!defined $symtab) {
  print STDERR "Usage: sym2int.pl [options] symtab [input transcriptions] > output transcriptions\n" .
    "options: [--map-oov <oov-symbol> ]  [-f <field-range> ]\n" .
      "note: <field-range> can look like 4-5, or 4-, or 5-, or 1.\n";
}
# Load the table: each line must hold exactly "<symbol> <integer>".
open(F, "<$symtab") || die "Error opening symbol table file $symtab";
while(<F>) {
    @A = split(" ", $_);
    @A == 2 || die "bad line in symbol table file: $_";
    $sym2int{$A[0]} = $A[1] + 0;  # "+ 0" forces a numeric value
}

# The OOV replacement may be given as a symbol; convert it to its integer id.
if (defined $map_oov && $map_oov !~ m/^\d+$/) { # not numeric-> look it up
  if (!defined $sym2int{$map_oov}) { die "OOV symbol $map_oov not defined."; }
  $map_oov = $sym2int{$map_oov};
}

# Only the first $max_warning OOV replacements are reported individually;
# $num_warning keeps counting all of them for the final summary.
$num_warning = 0;
$max_warning = 20;

# Convert the remaining input (files named in @ARGV, or stdin) line by line.
while (<>) {
  @A = split(" ", $_);
  @B = ();
  for ($n = 0; $n < @A; $n++) {
    $a = $A[$n];
    # Fields outside the selected range are copied through unchanged.
    if ( (!defined $field_begin || $n >= $field_begin)
         && (!defined $field_end || $n <= $field_end)) {
      $i = $sym2int{$a};
      if (!defined ($i)) {
        if (defined $map_oov) {
          if ($num_warning++ < $max_warning) {
            print STDERR "sym2int.pl: replacing $a with $map_oov\n";
            if ($num_warning == $max_warning) {
              print STDERR "sym2int.pl: not warning for OOVs any more times\n";
            }
          }
          $i = $map_oov;
        }
        # NOTE(review): without --map-oov an unknown symbol is not an error
        # here; $i stays undefined and the field is printed as an empty
        # string. Confirm that this is intended.
      }
      $a = $i;
    }
    push @B, $a;
  }
  print join(" ", @B);
  print "\n";
}
if ($num_warning > 0) {
  print STDERR "** Replaced $num_warning instances of OOVs with $map_oov\n";
}

exit(0);


================================================
FILE: egs/aishell2/local/train_lms.sh
================================================
#!/usr/bin/env bash
# Copyright 2018 AIShell-Foundation(Authors:Jiayu DU, Xingyu NA, Bengu WU, Hao ZHENG)
#           2018 Beijing Shell Shell Tech. Co. Ltd. (Author: Hui BU)
# Apache 2.0

. ./path.sh
. ./utils/parse_options.sh

if [ $# -ne 3 ]; then
  echo "train_lms.sh <lexicon> <word-segmented-text> <dir>"
  echo " e.g train_lms.sh data/local/dict/lexicon.txt data/local/train/text data/local/lm"
  exit 1;
fi

lexicon=$1   # pronunciation lexicon; its first column defines the vocabulary
text=$2      # word-segmented transcripts, first field is the utterance id
dir=$3       # output directory for counts, word map and the trained LMs

# Both inputs must exist. The test has to use the loop variable $f (quoted);
# testing an unset variable would make the check silently pass.
for f in "$text" "$lexicon"; do
  [ ! -f "$f" ] && echo "$0: No such file $f" && exit 1;
done

# train_lm.sh (kaldi_lm) has to be reachable through PATH.
kaldi_lm=`which train_lm.sh`
if [ -z "$kaldi_lm" ]; then
  echo "$0: train_lm.sh is not found. That might mean it's not installed"
  echo "$0: or it is not added to PATH"
  echo "$0: Use the script tools/extras/install_kaldi_lm.sh to install it"
  exit 1
fi

mkdir -p $dir
# Transcripts with every token that is not in the lexicon replaced by <UNK>.
cleantext=$dir/text.no_oov

# The lexicon's first column is loaded into seen[]; all fields of each line
# (starting from field 1) are then checked against it.
cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
  {for(n=1; n<=NF;n++) {  if (seen[$n]) { printf("%s ", $n); } else {printf("<UNK> ");} } printf("\n");}' \
  > $cleantext || exit 1;

# Word frequencies of the transcripts (field 1 is skipped), most frequent first.
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
   sort -nr > $dir/word.counts || exit 1;

# Get counts from acoustic training transcripts, and add  one-count
# for each word in the lexicon (but not silence, we don't want it
# in the LM-- we'll add it optionally later).
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
  cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
   sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;

# note: we probably won't really make use of <UNK> as there aren't any OOVs
cat $dir/unigram.counts  | awk '{print $2}' | get_word_map.pl "<s>" "</s>" "<UNK>" > $dir/word_map \
   || exit 1;

# note: ignore 1st field of train.txt, it's the utterance-id.
# Each remaining word is replaced by its word_map entry and the result is
# written gzip-compressed to train.gz.
cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline<wmap)>0)map[$1]=$2;}
  { for(n=2;n<=NF;n++) { printf map[$n]; if(n<NF){ printf " "; } else { print ""; }}}' | gzip -c >$dir/train.gz \
   || exit 1;

# Train a 3-gram and a 4-gram model. Only the 3-gram call is followed by
# "|| exit 1"; the exit status of the 4-gram call is not checked.
train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1;
train_lm.sh --arpa --lmtype 4gram-mincount $dir
# note: output is
# data/local/lm/3gram-mincount/lm_unpruned.gz

echo "local/train_lms.sh succeeded"
exit 0


# From here is some commands to do a baseline with SRILM (assuming
# you have it installed).
# NOTE: this section is never executed because the script has already
# run "exit 0" above; it is kept for reference only.
heldout_sent=10000 # Don't change this if you want result to be comparable with
    # kaldi_lm results
sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities.
mkdir -p $sdir
# Strip the utterance id; the first $heldout_sent lines become the held-out
# set and the lines from line $heldout_sent onwards the training set.
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
  head -$heldout_sent > $sdir/heldout
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
  tail -n +$heldout_sent > $sdir/train

# Vocabulary: first column of word_map plus the sentence boundary symbols.
cat $dir/word_map | awk '{print $1}' | cat - <(echo "<s>"; echo "</s>" ) > $sdir/wordlist


ngram-count -text $sdir/train -order 3 -limit-vocab -vocab $sdir/wordlist -unk \
  -map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz
ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/heldout
# 0 zeroprobs, logprob= -250954 ppl= 90.5091 ppl1= 132.482

# Note: perplexity SRILM gives to Kaldi-LM model is same as kaldi-lm reports above.
# Difference in WSJ must have been due to different treatment of <UNK>.
ngram -lm $dir/3gram-mincount/lm_unpruned.gz  -ppl $sdir/heldout
# 0 zeroprobs, logprob= -250913 ppl= 90.4439 ppl1= 132.379

echo "local/train_lms.sh succeeded"
exit 0


================================================
FILE: egs/aishell2/local/word_segmentation.py
================================================
# encoding=utf-8
# Copyright 2018 AIShell-Foundation(Authors:Jiayu DU, Xingyu NA, Bengu WU, Hao ZHENG)
#           2018 Beijing Shell Shell Tech. Co. Ltd. (Author: Hui BU)
# Apache 2.0

from __future__ import print_function
import sys
import jieba
# Python 2 needs its default codec switched to UTF-8 so that byte strings and
# the unicode tokens returned by jieba can be concatenated. ``reload`` and
# ``sys.setdefaultencoding`` do not exist on Python 3, where text is unicode
# already, so the hack is skipped there and the file is opened as UTF-8.
if sys.version_info[0] < 3:
  reload(sys)  # noqa: F821 (builtin on Python 2 only)
  sys.setdefaultencoding('utf-8')
  open_kwargs = {}
else:
  open_kwargs = {'encoding': 'utf-8'}

if len(sys.argv) < 3:
  sys.stderr.write("word_segmentation.py <vocab> <trans> > <word-segmented-trans>\n")
  sys.exit(1)

vocab_file=sys.argv[1]
trans_file=sys.argv[2]

# Segment each "<key>\t<transcript>" line with jieba, restricted to the given
# vocabulary, and print "<key>\t<space-separated words>" to stdout.
jieba.set_dictionary(vocab_file)
with open(trans_file, **open_kwargs) as trans_f:
  for line in trans_f:
    key,trans = line.strip().split('\t',1)
    words = jieba.cut(trans, HMM=False) # turn off new word discovery (HMM-based)
    new_line = key + '\t' + " ".join(words)
    print(new_line)


================================================
FILE: egs/aishell2/nt.sh
================================================
#!/usr/bin/env bash

# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

. ./path.sh || exit 1;
. ./cmd.sh || exit 1;

# Default settings. utils/parse_options.sh is sourced at the end of this
# section; presumably each variable can then be overridden from the command
# line (Kaldi convention) -- TODO confirm against utils/parse_options.sh.

# general configuration
backend=pytorch
stage=0        # start from 0 if you need to start from data preparation
stop_stage=100
ngpu=2         # number of gpus ("0" uses cpu, otherwise use gpu)
               # NOTE: reassigned to 8 further down in this section
debugmode=1
dumpdir=dump   # directory to dump full features
N=0            # number of minibatches to be used (mainly for debugging). "0" uses all minibatches.
verbose=0      # verbose option
resume=        # Resume the training from snapshot

# feature configuration
do_delta=false
preprocess_config=conf/specaug.yaml
train_config=conf/tuning/transducer/train_conformer-rnn_transducer_aux_ngpu4.yaml
lm_config=conf/lm.yaml
decode_config=conf/tuning/transducer/decode_default.yaml

# rnnlm related
lm_resume=         # specify a snapshot file to resume LM training
lmtag=             # tag for managing LMs

# decoding parameter
# NOTE: the decoding stage of this script assigns recog_model again before use.
recog_model=model.acc.best # set a model to be used for decoding: 'model.acc.best' or 'model.loss.best'

# data dir, modify this to your AISHELL-2 data path
tr_dir=/data/asr_data/aishell2/iOS/data
dev_tst_dir=/data/asr_data/aishell2/AISHELL-DEV-TEST-SET

# exp tag
### Configurable parameters ###
tag="8v100_rnnt_mmi_ctc"   # becomes part of the experiment directory name
ngpu=8                     # overrides the ngpu=2 default above
debug=false                # true: export single-node launcher variables (see below)

# Train config
seed=888
batch_size=8
accum_grad=8
epochs=100
use_segment=true # if true, use word-level transcription in MMI criterion
aux_ctc=false
aux_ctc_weight=0.5
aux_ctc_dropout_rate=0.1
aux_mmi=false
aux_mmi_weight=0.5
aux_mmi_dropout_rate=0.1
aux_mmi_type='mmi' # mmi or phonectc

# MBR training config
# These values are only added to the training options when aux_mbr is true.
aux_mbr=false
aux_mbr_weight=1.0
aux_mbr_beam=4
mbr_epochs=100
mbr_lr=0.1
mbr_warmup=2500
mbr_resume=

# Decode config
idx_average=41_50  # used in the averaged model's file name and passed to average_checkpoints.py --num
search_type="alsd" # "default", "nsc", "tsd", "alsd"
mmi_weight=0.0 # MMI / phonectc joint decoding
mas_lookahead=0
ctc_weight=0.0 # char ctc joint decoding
ngram_weight=0.0
ngram_order=4
word_ngram_weight=0.0
word_ngram_tag=word_3gram # 3 or 4 gram
word_ngram_log_semiring=true
lm_weight=0.0
beam_size=10
recog_set="test_android test_ios test_mic"

. utils/parse_options.sh || exit 1;

# Debug mode: export the launcher variables for a single node with one GPU.
# In normal runs HOST_GPU_NUM, HOST_NUM, INDEX and CHIEF_IP are not set by
# this script and therefore have to come from the environment.
if [ $debug == true ]; then
    export HOST_GPU_NUM=1
    export HOST_NUM=1
    export NODE_NUM=1
    export INDEX=0
    export CHIEF_IP="9.135.217.29"
fi

# Training options assembled from the "Train config" variables; appended
# unquoted to the asr_train.py command line in stage 1.
train_opts=\
"\
--seed $seed \
--batch-size $batch_size \
--accum-grad $accum_grad \
--epochs $epochs \
--use-segment $use_segment \
--aux-ctc $aux_ctc \
--aux-ctc-weight $aux_ctc_weight \
--aux-ctc-dropout-rate $aux_ctc_dropout_rate \
--aux-mmi $aux_mmi \
--aux-mmi-weight $aux_mmi_weight \
--aux-mmi-dropout-rate $aux_mmi_dropout_rate \
--aux-mmi-type $aux_mmi_type \
"


# MBR training appends its own options. --epochs is then present twice in
# train_opts; presumably the later value ($mbr_epochs) takes effect --
# TODO confirm against the argument parser of asr_train.py.
if [ $aux_mbr == true ]; then
    train_opts="$train_opts \
                --aux-mbr $aux_mbr \
                --aux-mbr-weight $aux_mbr_weight \
                --aux-mbr-beam $aux_mbr_beam \
                --transformer-lr $mbr_lr \
                --epochs $mbr_epochs \
                --transformer-warmup-steps $mbr_warmup \
                --resume $mbr_resume \
                --load-trainer-and-opt false \
                --save-interval-iters 1000 \
                "
    export OMP_NUM_THREADS=6 # for on-the-fly decoding
fi

# Decoding options assembled from the "Decode config" variables; appended
# unquoted to the asr_recog.py command line in stage 2.
decode_opts=\
"\
--search-type $search_type \
--mmi-weight $mmi_weight \
--beam-size $beam_size \
--ctc-weight $ctc_weight \
--ngram-weight $ngram_weight \
--word-ngram-weight $word_ngram_weight \
--word-ngram data/${word_ngram_tag} \
--word-ngram-log-semiring $word_ngram_log_semiring \
--lm-weight $lm_weight \
--mas-lookahead $mas_lookahead \
"
# Character-level unit dictionary passed to training and scoring.
dict=data/lang_1char/train_sp_units.txt
# Make bash strict from here on:
# -e exit on error, -u exit on use of an undefined variable,
# -o pipefail a pipeline fails if any command in it fails.
# (-x, printing commands, is not enabled.)
set -e
set -u
set -o pipefail

train_set=train_sp
train_dev=dev_ios

# Experiment directory: exp/<train_set>_<backend>_<tag>
expname=${train_set}_${backend}_${tag}
expdir=exp/${expname}
mkdir -p ${expdir}

# Directory passed to training as --lang, and the dumped-feature directories
# of the training and validation sets.
lang=data/lang_phone
feat_tr_dir=${dumpdir}/${train_set}/delta${do_delta}; mkdir -p ${feat_tr_dir}
feat_dt_dir=${dumpdir}/${train_dev}/delta${do_delta}; mkdir -p ${feat_dt_dir}
# Stage 1: distributed training through torch.distributed.launch.
# - HOST_GPU_NUM, HOST_NUM, INDEX and CHIEF_IP must be defined (exported in
#   debug mode, otherwise taken from the environment); with "set -u" an
#   unset one aborts the script.
# - Every launched process gets "--ngpu 1"; the total is given by --world-size.
# - "RANK" inside --outdir and --train-json is a literal string here;
#   presumably asr_train.py replaces it with the process rank -- TODO confirm.
# - stdout and stderr of the launcher go to global_record.<INDEX>.txt.
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    echo "stage 1: Network Training"
    MASTER_PORT=22277
    NCCL_DEBUG=TRACE python3 -m torch.distributed.launch \
        --nproc_per_node ${HOST_GPU_NUM} --master_port $MASTER_PORT \
        --nnodes=${HOST_NUM} --node_rank=${INDEX} --master_addr=${CHIEF_IP} \
        ${MAIN_ROOT}/bin/asr_train.py \
        --config ${train_config} \
        --preprocess-conf ${preprocess_config} \
        --ngpu 1 \
        --backend ${backend} \
        --outdir ${expdir}/results_RANK \
        --debugmode ${debugmode} \
        --dict ${dict} \
        --debugdir ${expdir} \
        --minibatches ${N} \
        --verbose ${verbose} \
        --resume ${resume} \
        --train-json ${feat_tr_dir}/split${ngpu}utt/data_noeng.RANK.json \
        --valid-json ${feat_dt_dir}/data.json \
        --lang $lang \
        --opt "noam_sgd" \
        --n-iter-processes 8 \
        --world-size $ngpu \
        --node-rank ${INDEX} \
        --node-size ${HOST_GPU_NUM} \
        $train_opts > ${expdir}/global_record.${INDEX}.txt 2>&1
fi

# Stage 2: average checkpoints, decode every set in recog_set on CPU and score.
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    echo "stage 2: Decoding"
    nj=2500
    # NOTE(review): recog_model is set to the averaged model unconditionally,
    # so the recog_model default from the configuration section is never
    # used. If the condition below is false no averaged model is produced
    # by this stage.
    recog_model=model.last${idx_average}.avg.best
    if [[ $(get_yaml.py ${train_config} model-module) = *transformer* ]] || \
           [[ $(get_yaml.py ${train_config} model-module) = *conformer* ]] || \
           [[ $(get_yaml.py ${train_config} etype) = custom ]] || \
           [[ $(get_yaml.py ${train_config} dtype) = custom ]]; then
	recog_model=model.last${idx_average}.avg.best
	average_checkpoints.py --backend ${backend} \
        	               --snapshots ${expdir}/results_0/snapshot.ep.* \
	     	 	       --out ${expdir}/results_0/${recog_model} \
	 		       --num ${idx_average}
    fi
    
    # One result directory per decoding configuration, with one
    # sub-directory per test set.
    decode_parent_dir=decode_mmi${mmi_weight}_${word_ngram_tag}${word_ngram_weight}_lookahead${mas_lookahead}_ep${idx_average}_beam${beam_size}
    for rtask in ${recog_set}; do
        decode_dir=$decode_parent_dir/$rtask
        feat_recog_dir=${dumpdir}/${rtask}/delta${do_delta}

        # split data
        splitjson.py --parts ${nj} ${feat_recog_dir}/data.json

        #### use CPU for decoding
        ngpu=0
        # JOB is the per-job placeholder handled by ${decode_cmd}; it also
        # appears in the log, input and output file names.
        ${decode_cmd} JOB=1:$nj ${expdir}/${decode_dir}/log/decode.JOB.log \
            python3 ${MAIN_ROOT}/bin/asr_recog.py \
            --config ${decode_config} \
            --ngpu ${ngpu} \
            --backend ${backend} \
            --batchsize 0 \
            --recog-json ${feat_recog_dir}/split${nj}utt/data.JOB.json \
            --result-label ${expdir}/${decode_dir}/data.JOB.json \
            --model ${expdir}/results_0/${recog_model}  \
            --local-rank JOB $decode_opts  

        # Scoring output is stored next to the decoding results.
        score_sclite.sh ${expdir}/${decode_dir} ${dict} \
          > ${expdir}/${decode_dir}/decode_result.txt

    done
    echo "Finished"
fi


================================================
FILE: egs/aishell2/prepare.sh
================================================
#!/usr/bin/env bash

# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

. ./path.sh || exit 1;
. ./cmd.sh || exit 1;

# Default settings. utils/parse_options.sh is sourced at the end of this
# section; presumably each variable can then be overridden from the command
# line (Kaldi convention) -- TODO confirm against utils/parse_options.sh.

# general configuration
backend=pytorch
stage=0        # start from 0 if you need to start from data preparation
stop_stage=100
ngpu=2         # number of gpus ("0" uses cpu, otherwise use gpu)
               # NOTE: reassigned to 8 further down in this section
debugmode=1
dumpdir=dump   # directory to dump full features
N=0            # number of minibatches to be used (mainly for debugging). "0" uses all minibatches.
verbose=0      # verbose option
resume=        # Resume the training from snapshot

# feature configuration
do_delta=false

train_config=conf/train.yaml
lm_config=conf/lm_rnn.yaml
decode_config=conf/decode.yaml

# rnnlm related
lm_resume=         # specify a snapshot file to resume LM training
lmtag=             # tag for managing LMs
n_gram=4           # order of the character n-gram LM built with lmplz in stage 4
# decoding parameter
recog_model=model.acc.best # set a model to be used for decoding: 'model.acc.best' or 'model.loss.best'

# data dir, modify this to your AISHELL-2 data path
tr_dir=/data/asr_data/aishell2/iOS/data
dev_tst_dir=/data/asr_data/aishell2/AISHELL-DEV-TEST-SET
# External word-level ARPA LM converted into data/word_5gram in stage 2.
word_arpa=/apdcephfs/share_1149801/speech_user/tomasyu/jinchuan/data/ngram/cweng_3g_5gram.arpa
# exp tag
### Configurable parameters ###
tag="8v100_rnnt_mmi_ctc"
ngpu=8             # overrides the ngpu=2 default above

# Train config
seed=888
batch_size=8
accum_grad=1
epochs=100
use_segment=true # if true, use word-level transcription in MMI criterion
aux_ctc_weight=0.5
aux_ctc_dropout_rate=0.1
aux_mmi=true
aux_mmi_weight=0.5
aux_mmi_dropout_rate=0.1
aux_mmi_type='mmi' # mmi or phonectc

# Decode config
idx_average=91_100
search_type="alsd" # "default", "nsc", "tsd", "alsd"
mmi_weight=0.2 # MMI / phonectc joint decoding
ctc_weight=0.0 # char ctc joint decoding
ngram_weight=0.0
word_ngram_weight=0.0
word_ngram_order=4 # 3 or 4 gram
mmi_type="frame" # or rescore
beam_size=10
recog_set="test_android test_ios test_mic"

. utils/parse_options.sh || exit 1;

#if [ $debug -eq true ]; then
#    export HOST_GPU_NUM=1
#    export HOST_NUM=1
#    export NODE_NUM=1
#    export INDEX=0
#    export CHIEF_IP="9.135.217.29"
#fi

# Training options assembled from the "Train config" variables.
# aux_ctc is not among this script's defaults, so fall back to "false" when
# it is unset; otherwise --aux-ctc would be emitted without a value.
# NOTE: neither train_opts nor decode_opts is referenced by any stage of
# this script.
train_opts=\
"\
--seed $seed \
--batch-size $batch_size \
--accum-grad $accum_grad \
--epochs $epochs \
--use-segment $use_segment \
--aux-ctc ${aux_ctc:-false} \
--aux-ctc-weight $aux_ctc_weight \
--aux-ctc-dropout-rate $aux_ctc_dropout_rate \
--aux-mmi $aux_mmi \
--aux-mmi-weight $aux_mmi_weight \
--aux-mmi-dropout-rate $aux_mmi_dropout_rate \
--aux-mmi-type $aux_mmi_type \
"

# Decoding options assembled from the "Decode config" variables.
decode_opts=\
"\
--search-type $search_type \
--mmi-weight $mmi_weight \
--beam-size $beam_size \
--ctc-weight $ctc_weight \
--mmi-type $mmi_type \
--ngram-weight $ngram_weight \
--word-ngram-weight $word_ngram_weight \
--word-ngram data/word_${word_ngram_order}gram \
"

# Make bash strict from here on:
# -e exit on error, -u exit on use of an undefined variable,
# -o pipefail a pipeline fails if any command in it fails.
# (-x, printing commands, is not enabled.)
set -e
set -u
set -o pipefail

train_set=train_sp
train_dev=dev_ios

# Stage 0: build the data directories for the training set and for the
# dev/test sets of the three devices, then upper-case all transcripts.
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    ### Task dependent. You have to make data the following preparation part by yourself.
    ### But you can utilize Kaldi recipes in most cases
    echo "stage 0: Data preparation"
    # For training set
    local/prepare_data.sh ${tr_dir} data/local/train data/train || exit 1;
    # # For dev and test set
    # ${x,,} lower-cases the device name for the output directory names.
    for x in Android iOS Mic; do
        local/prepare_data.sh ${dev_tst_dir}/${x}/dev data/local/dev_${x,,} data/dev_${x,,} || exit 1;
        local/prepare_data.sh ${dev_tst_dir}/${x}/test data/local/test_${x,,} data/test_${x,,} || exit 1;
    done 
    # Normalize text to capital letters
    # Only the second tab-separated column is upper-cased; the first column
    # (the utterance id) is copied unchanged.
    for x in train dev_android dev_ios dev_mic test_android test_ios test_mic; do
        mv data/${x}/text data/${x}/text_org
        paste <(cut -f 1 data/${x}/text_org) <(cut -f 2 data/${x}/text_org | tr '[:lower:]' '[:upper:]') \
            > data/${x}/text
        rm data/${x}/text_org
    done
fi

# Dumped-feature directories of the training and validation sets; created
# even when stage 1 itself is skipped.
feat_tr_dir=${dumpdir}/${train_set}/delta${do_delta}; mkdir -p ${feat_tr_dir}
feat_dt_dir=${dumpdir}/${train_dev}/delta${do_delta}; mkdir -p ${feat_dt_dir}
# Stage 1: fbank+pitch extraction, 3-way speed perturbation of the training
# set, global CMVN statistics and feature dumping.
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    ### Task dependent. You have to design training and dev sets by yourself.
    ### But you can utilize Kaldi recipes in most cases
    echo "stage 1: Feature Generation"
    fbankdir=fbank
    # Generate the fbank features; by default 80-dimensional fbanks with pitch on each frame
    steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 188 --write_utt2num_frames true \
        data/train exp/make_fbank/train ${fbankdir}
    utils/fix_data_dir.sh data/train
    for x in android ios mic; do
        steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 20 --write_utt2num_frames true \
            data/dev_${x} exp/make_fbank/dev_${x} ${fbankdir}
        utils/fix_data_dir.sh data/dev_${x}     
        steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 20 --write_utt2num_frames true \
            data/test_${x} exp/make_fbank/test_${x} ${fbankdir}
        utils/fix_data_dir.sh data/test_${x}
    done
    
    # speed-perturbed
    # The 0.9x, 1.0x and 1.1x copies of data/train are combined into
    # data/${train_set}, for which features are then extracted again.
    utils/perturb_data_dir_speed.sh 0.9 data/train data/temp1
    utils/perturb_data_dir_speed.sh 1.0 data/train data/temp2
    utils/perturb_data_dir_speed.sh 1.1 data/train data/temp3
    utils/combine_data.sh --extra-files utt2uniq data/${train_set} data/temp1 data/temp2 data/temp3
    rm -r data/temp1 data/temp2 data/temp3
    steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 30 --write_utt2num_frames true \
    data/${train_set} exp/make_fbank/${train_set} ${fbankdir}
    utils/fix_data_dir.sh data/${train_set}

    # compute global CMVN
    compute-cmvn-stats scp:data/${train_set}/feats.scp data/${train_set}/cmvn.ark

    # dump features for training
    # The two create_split_dir.pl calls only run on hosts whose name matches
    # *.clsp.jhu.edu and only while the storage directory does not exist yet.
    split_dir=$(echo $PWD | awk -F "/" '{print $NF "/" $(NF-1)}')
    if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d ${feat_tr_dir}/storage ]; then
    utils/create_split_dir.pl \
        /export/a{11,12,13,14}/${USER}/espnet-data/egs/${split_dir}/dump/${train_set}/delta${do_delta}/storage \
        ${feat_tr_dir}/storage
    fi
    if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d ${feat_dt_dir}/storage ]; then
    utils/create_split_dir.pl \
        /export/a{11,12,13,14}/${USER}/espnet-data/egs/${split_dir}/dump/${train_dev}/delta${do_delta}/storage \
        ${feat_dt_dir}/storage
    fi
    dump.sh --cmd "$train_cmd" --nj 100 --do_delta ${do_delta} \
        data/${train_set}/feats.scp data/${train_set}/cmvn.ark exp/dump_feats/train ${feat_tr_dir}
        
    # The test sets are dumped with the CMVN statistics of the training set.
    # NOTE(review): no dump.sh call for ${train_dev} is made in this stage
    # although ${feat_dt_dir} is created above -- confirm where the
    # validation features are dumped.
    for rtask in ${recog_set}; do
        feat_recog_dir=${dumpdir}/${rtask}/delta${do_delta}; mkdir -p ${feat_recog_dir}
        dump.sh --cmd "$train_cmd" --nj 20 --do_delta ${do_delta} \
            data/${rtask}/feats.scp data/${train_set}/cmvn.ark exp/dump_feats/recog/${rtask} \
            ${feat_recog_dir}
    done
fi

# Stage 2: phone lexicon and lang directory, jieba word segmentation of the
# transcripts, and word-level n-gram LMs converted to G.fst.txt with kaldilm.
lang=data/lang_phone
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    echo "stage 2: prepare lang and do text segmentation"
    bash local/prepare_dict.sh data/local/dict
    local/k2_prepare_lang.sh --position-dependent-phones false data/local/dict \
      "<UNK>" data/local/lang_tmp_nosp $lang || exit 1

    # use jieba for segmentation used in MMI. This would take a few minutes
    python3 local/jieba_build_dict.py $lang/words.txt $lang/jieba_dict.txt
    for part in train_sp dev_android dev_ios dev_mic test_android test_ios test_mic; do
        python3 local/jieba_split_text.py data/${part}/text data/${part}/text_orig.scp $lang/jieba_dict.txt
    done

    # word-level N-gram model
    # The segmentation vocabulary is the unique first column of the lexicon,
    # each word followed by the constant 99.
    # NOTE: local/word_segmentation.py is run with the python2 interpreter.
    mkdir -p data/local/lm
    awk '{print $1}' data/local/dict/lexicon.txt | sort | uniq | awk '{print $1,99}' \
      > data/local/lm/word_seg_vocab.txt
    python2 local/word_segmentation.py data/local/lm/word_seg_vocab.txt \
      data/local/train/text > data/local/lm/trans.txt

    local/train_lms.sh \
     data/local/dict/lexicon.txt \
     data/local/lm/trans.txt \
     data/local/lm || exit 1;

    # Convert the unpruned 3-gram and 4-gram ARPA models into G.fst.txt, each
    # in its own directory together with copies of words.txt and oov.int.
    for order in 3 4; do
      wngram_dir=data/word_${order}gram; mkdir -p $wngram_dir
      cp $lang/words.txt $wngram_dir
      cp $lang/oov.int $wngram_dir
      gunzip -c data/local/lm/${order}gram-mincount/lm_unpruned.gz \
        > $wngram_dir/lm.arpa
      python3 -m kaldilm \
      --read-symbol-table="$wngram_dir/words.txt" \
      --disambig-symbol='#0' \
      --max-order=$order \
      $wngram_dir/lm.arpa > $wngram_dir/G.fst.txt
    done

    # prepare very large LM with external resources
    # ($word_arpa is the external ARPA file set in the configuration section)
    mkdir -p data/word_5gram; wdir=data/word_5gram
    cp $lang/words.txt $wdir/
    cp $lang/oov.int $wdir/

    python3 -m kaldilm \
        --read-symbol-table="$wdir/words.txt" \
        --disambig-symbol='#0' \
        --max-order=5 \
        $word_arpa > $wdir/G.fst.txt
    # build the .pt file
    # NOTE(review): word_ngram.py is called without an argument here, while
    # stage 5 passes the LM directory -- confirm which directory it processes
    # by default.
    python3 espnet/nets/scorers/word_ngram.py
fi

# Stage 3: character dictionary and data.json files for the training and
# test sets.
dict=data/lang_1char/${train_set}_units.txt
echo "dictionary: ${dict}"
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    ### Task dependent. You have to check non-linguistic symbols used in the corpus.
    echo "stage 3: Dictionary and Json Data Preparation"
    mkdir -p data/lang_1char/

    echo "make a dictionary"
    echo "<unk> 1" > ${dict} # <unk> must be 1, 0 will be used for "blank" in CTC
    # Unique, non-blank tokens of the training text are numbered from 2
    # (NR+1) in sorted order.
    text2token.py -s 1 -n 1 data/${train_set}/text | cut -f 2- -d" " | tr " " "\n" \
    | sort | uniq | grep -v -e '^\s*$' | awk '{print $0 " " NR+1}' >> ${dict}
    wc -l ${dict}

    echo "make json files"
    # text_orig.scp is the file written by local/jieba_split_text.py in stage 2.
    # NOTE(review): no data.json is generated for ${train_dev} here -- confirm
    # where the validation json used for training comes from.
    data2json.sh --feat ${feat_tr_dir}/feats.scp \
                 --text_org data/${train_set}/text_orig.scp \
		 data/${train_set} ${dict} > ${feat_tr_dir}/data.json
    for rtask in ${recog_set}; do
        feat_recog_dir=${dumpdir}/${rtask}/delta${do_delta}
        data2json.sh --feat ${feat_recog_dir}/feats.scp \
                     --text_org data/${rtask}/text_orig.scp \
		     data/${rtask} ${dict} > ${feat_recog_dir}/data.json
    done   
fi

# you can skip this and remove --rnnlm option in the recognition (stage 5)
# Without an explicit lmtag the LM experiment is named after the LM config
# file (base name without extension).
if [ -z "${lmtag}" ]; then
    lmtag=$(basename ${lm_config%.*})
fi
lmexpname=train_rnnlm_${backend}_${lmtag}
lmexpdir=exp/${lmexpname}
mkdir -p ${lmexpdir}

# Stage 4: train the character-level neural LM and a character n-gram LM
# (lmplz / build_binary) on the same tokenized training text.
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    echo "stage 4: LM Preparation"
    lmdatadir=data/local/lm_train
    mkdir -p ${lmdatadir}
    # Character-tokenized text with the utterance id (first field) removed.
    text2token.py -s 1 -n 1 data/train/text | cut -f 2- -d" " \
        > ${lmdatadir}/train.txt
    text2token.py -s 1 -n 1 data/${train_dev}/text | cut -f 2- -d" " \
        > ${lmdatadir}/valid.txt
    mkdir -p ${lmexpdir}/results

    # NOTE(review): the job requests ${ngpu} GPUs from the scheduler while
    # lm_train.py is told to use 4 -- confirm this mismatch is intended.
    ${cuda_cmd} --gpu ${ngpu} ${lmexpdir}/train.log \
        lm_train.py \
        --config ${lm_config} \
        --ngpu 4 \
        --backend ${backend} \
        --verbose ${verbose} \
        --outdir ${lmexpdir}/results \
        --tensorboard-dir ${lmexpdir}/tensorboard \
        --train-label ${lmdatadir}/train.txt \
        --valid-label ${lmdatadir}/valid.txt \
        --resume ${lm_resume} \
        --dict ${dict}

    # The output directory must exist before lmplz's stdout is redirected
    # into it; otherwise the redirection fails and "set -e" aborts the script.
    ngramexpdir=exp/train_ngram
    mkdir -p ${ngramexpdir}
    lmplz --discount_fallback -o ${n_gram} <${lmdatadir}/train.txt > ${ngramexpdir}/${n_gram}gram.arpa
    build_binary -s ${ngramexpdir}/${n_gram}gram.arpa ${ngramexpdir}/${n_gram}gram.bin

fi

# Prepare these word N-gram LMs for SPL response
# (1) use different smooth method
# Stage 5: train word-level LMs with several SRILM discounting options and
# convert each of them into a data/word_3gram_<tag> directory.
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
    # 3-gram LM with different smooth
    # $sm is passed as the value of --lm-opts and is also the suffix of the
    # output directory (data/local/lm-wbdiscount, ...).
    for sm in -wbdiscount -kndiscount -ukndiscount -ndiscount; do
        bash espnet_utils/train_lms_srilm.sh \
          --unk "<UNK>" --lm-opts $sm data/local/dict/lexicon.txt \
          data/local/lm/trans.txt data/local/lm$sm
    done

    # Good-Turing: no --lm-opts is passed, so the script's default is used
    # (presumably Good-Turing discounting -- TODO confirm in train_lms_srilm.sh)
    bash espnet_utils/train_lms_srilm.sh \
      --unk "<UNK>" data/local/dict/lexicon.txt \
      data/local/lm/trans.txt data/local/lm-gtdiscount

    # build k2 directory
    # NOTE: the loop variable "tag" overwrites the experiment tag set in the
    # configuration section. The same file name (srilm.o3g.kn.gz) is read for
    # every discounting method.
    for tag in wbdiscount kndiscount ukndiscount ndiscount gtdiscount; do
        mkdir -p data/word_3gram_$tag; lmdir=data/word_3gram_$tag
        gunzip -c data/local/lm-$tag/srilm/srilm.o3g.kn.gz \
          > $lmdir/lm.arpa

        cp $lang/words.txt $lmdir
        cp $lang/oov.int $lmdir

        python3 -m kaldilm \
            --read-symbol-table="$lmdir/words.txt" \
            --disambig-symbol='#0' \
            --max-order=3 \
            $lmdir/lm.arpa > $lmdir/G.fst.txt

        python3 espnet/nets/scorers/word_ngram.py $lmdir
    done
fi


================================================
FILE: egs/asrucs/.gitignore
================================================
dump
dump32
dump64
data
exp*
fbank


================================================
FILE: egs/asrucs/cmd.sh
================================================
../aishell1/cmd.sh

================================================
FILE: egs/asrucs/conf/decode.yaml
================================================
tuning/decode_pytorch_transformer.yaml

================================================
FILE: egs/asrucs/conf/fbank.conf
================================================
--sample-frequency=16000 
--num-mel-bins=80


================================================
FILE: egs/asrucs/conf/gpu.conf
================================================
# Default configuration
command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
option mem=* -l mem_free=$0,ram_free=$0
option mem=0          # Do not add anything to qsub_opts
option num_threads=* -pe smp $0
option num_threads=1  # Do not add anything to qsub_opts
option max_jobs_run=* -tc $0
default gpu=0
option gpu=0
option gpu=* -l 'hostname=b1[12345678]*|c*,gpu=$0' -q g.q

================================================
FILE: egs/asrucs/conf/lm.yaml
================================================
# rnnlm related
layer: 2
unit: 650
opt: sgd        # or adam
batchsize: 64   # batch size in LM training
epoch: 20      # if the data size is large, we can reduce this
patience: 3
maxlen: 100     # if sentence length > lm_maxlen, lm_batchsize is automatically reduced


================================================
FILE: egs/asrucs/conf/lm_rnn.yaml
================================================
lm.yaml

================================================
FILE: egs/asrucs/conf/lm_transformer.yaml
================================================
# This Transformer LM setting w/ 4 GPUs took around 60 days for 50 epochs.
# However, you can get better results in 6 days for 5 epochs (WER: 2.2/5.4/2.6/5.7)
# than LSTM LM (WER: 2.6/5.6/2.6/5.7) in 60 days for 20 epochs
# If you do not have 4 GPUs, try accum-grad=4.

# network architecture
model-module: transformer
att-unit: 512
embed-unit: 128
head: 8
layer: 16
pos-enc: none
unit: 2048

# minibatch related
batchsize: 32
maxlen: 40

# optimization related
opt: adam
schedulers: lr=cosine
dropout-rate: 0.0
epoch: 10
gradclip: 1.0
lr: 1e-4
lr-cosine-total: 100000
lr-cosine-warmup: 1000
patience: 0
sortagrad: 0


================================================
FILE: egs/asrucs/conf/pitch.conf
================================================
--sample-frequency=16000


================================================
FILE: egs/asrucs/conf/pure_ctc.yaml
================================================
# network architecture
# encoder related
elayers: 15
eunits: 2048
# decoder related
dlayers: 6
dunits: 2048
# attention related
adim: 256
aheads: 4

# hybrid CTC/attention
mtlalpha: 1.0

# label smoothing
lsm-weight: 0.1

# minibatch related
batch-size: 16
maxlen-in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
maxlen-out: 150 # if output length > maxlen-out, batchsize is automatically reduced

# optimization related
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
opt: noam
accum-grad: 4
grad-clip: 5
patience: 0
epochs: 50
dropout-rate: 0.1

# transformer specific setting
backend: pytorch
model-module: "espnet.nets.pytorch_backend.e2e_asr_conformer:E2E"
transformer-input-layer: conv2d     # encoder architecture type
transformer-lr: 1.0
transformer-warmup-steps: 25000
transformer-attn-dropout-rate: 0.0
transformer-length-normalized-loss: false
transformer-init: pytorch

# conformer specific setting
transformer-encoder-pos-enc-layer-type: rel_pos
transformer-encoder-selfattn-layer-type: rel_selfattn
transformer-encoder-activation-type: swish
macaron-style: true
use-cnn-module: true
cnn-module-kernel: 31


================================================
FILE: egs/asrucs/conf/queue.conf
================================================
# Default configuration
command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
option mem=* -l mem_free=$0,ram_free=$0
option mem=0          # Do not add anything to qsub_opts
option num_threads=* -pe smp $0
option num_threads=1  # Do not add anything to qsub_opts
option max_jobs_run=* -tc $0
default gpu=0
option gpu=0
option gpu=* -l gpu=$0 -q g.q


================================================
FILE: egs/asrucs/conf/slurm.conf
================================================
# Default configuration
command sbatch --export=PATH
option name=* --job-name $0
option time=* --time $0
option mem=* --mem-per-cpu $0
option mem=0
option num_threads=* --cpus-per-task $0
option num_threads=1 --cpus-per-task 1
option num_nodes=* --nodes $0
default gpu=0
option gpu=0 -p cpu
option gpu=* -p gpu --gres=gpu:$0 -c $0  # Recommend allocating more CPU than, or equal to the number of GPU
# note: the --max-jobs-run option is supported as a special case
# by slurm.pl and you don't have to handle it in the config file.


================================================
FILE: egs/asrucs/conf/specaug.yaml
================================================
process:
  # these three processes are a.k.a. SpecAugment
  - type: "time_warp"
    max_time_warp: 5
    inplace: true
    mode: "PIL"
  - type: "freq_mask"
    F: 30
    n_mask: 2
    inplace: true
    replace_with_zero: false
  - type: "time_mask"
    T: 40
    n_mask: 2
    inplace: true
    replace_with_zero: false


================================================
FILE: egs/asrucs/conf/specaug_test.yaml
================================================
process:
  # these three processes are a.k.a. SpecAugment
  - type: "time_warp"
    max_time_warp: 0
    inplace: true
    mode: "PIL"
  - type: "freq_mask"
    F: 30
    n_mask: 2
    inplace: true
    replace_with_zero: true
  - type: "time_mask"
    T: 40
    n_mask: 2
    inplace: true
    replace_with_zero: true


================================================
FILE: egs/asrucs/conf/train.yaml
================================================
tuning/train_pytorch_conformer_kernel15.yaml

================================================
FILE: egs/asrucs/conf/train_conformer-rnn_transducer_cs.yaml
================================================
# minibatch related
batch-size: 32
maxlen-in: 512
maxlen-out: 150

# optimization related
criterion: loss
early-stop-criterion: "validation/main/loss"
sortagrad: 0
opt: noam
transformer-lr: 1.0
transformer-warmup-steps: 25000
epochs: 50
patience: 0
#accum-grad: 2
grad-clip: 5.0

# network architecture
## general
custom-enc-positional-encoding-type: rel_pos
custom-enc-self-attn-type: rel_self_attn
custom-enc-pw-activation-type: swish
## encoder related
etype: custom
custom-enc-input-layer: vgg2l
enc-block-arch:
        - type: conformer
          d_hidden: 512
          d_ff: 2048
          heads: 4
          macaron_style: True
          use_conv_mod: True
          conv_mod_kernel: 15
          dropout-rate: 0.3
          att-dropout-rate: 0.3
enc-block-repeat: 12
## decoder related
dtype: lstm
dlayers: 1
dec-embed-dim: 1024
dunits: 512
dropout-rate-embed-decoder: 0.2
dropout-rate-decoder: 0.1
## joint network related
joint-dim: 256

# transducer related
model-module: "espnet.nets.pytorch_backend.e2e_asr_transducer_cs:E2E"


================================================
FILE: egs/asrucs/conf/tuning/decode_pytorch_transformer.yaml
================================================
batchsize: 0
beam-size: 10
penalty: 0.0
maxlenratio: 0.0
minlenratio: 0.0
ctc-weight: 0.5
lm-weight: 0.0
ngram-weight: 0.3


================================================
FILE: egs/asrucs/conf/tuning/decode_rnn.yaml
================================================
beam-size: 20
penalty: 0.0
maxlenratio: 0.0
minlenratio: 0.0
ctc-weight: 0.6
lm-weight: 0.3


================================================
FILE: egs/asrucs/conf/tuning/train_pytorch_conformer_kernel15.yaml
================================================
# network architecture
# encoder related
elayers: 12
eunits: 2048
# decoder related
dlayers: 6
dunits: 2048
# attention related
adim: 256
aheads: 4

# hybrid CTC/attention
mtlalpha: 0.3

# label smoothing
lsm-weight: 0.1

# minibatch related
batch-size: 32
maxlen-in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
maxlen-out: 150 # if output length > maxlen-out, batchsize is automatically reduced

# optimization related
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
opt: noam
accum-grad: 2
grad-clip: 5
patience: 0
epochs: 50
dropout-rate: 0.1

# transformer specific setting
backend: pytorch
model-module: "espnet.nets.pytorch_backend.e2e_asr_conformer:E2E"
transformer-input-layer: conv2d     # encoder architecture type
transformer-lr: 1.0
transformer-warmup-steps: 25000
transformer-attn-dropout-rate: 0.0
transformer-length-normalized-loss: false
transformer-init: pytorch

# conformer specific setting
transformer-encoder-pos-enc-layer-type: rel_pos
transformer-encoder-selfattn-layer-type: rel_selfattn
transformer-encoder-activation-type: swish
rel-pos-type: latest
macaron-style: true
use-cnn-module: true
cnn-module-kernel: 15


================================================
FILE: egs/asrucs/conf/tuning/train_pytorch_conformer_kernel31.yaml
================================================
# network architecture
# encoder related
elayers: 12
eunits: 2048
# decoder related
dlayers: 6
dunits: 2048
# attention related
adim: 256
aheads: 4

# hybrid CTC/attention
mtlalpha: 0.3

# label smoothing
lsm-weight: 0.1

# minibatch related
batch-size: 32
maxlen-in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
maxlen-out: 150 # if output length > maxlen-out, batchsize is automatically reduced

# optimization related
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
opt: noam
accum-grad: 2
grad-clip: 5
patience: 0
epochs: 50
dropout-rate: 0.1

# transformer specific setting
backend: pytorch
model-module: "espnet.nets.pytorch_backend.e2e_asr_conformer:E2E"
transformer-input-layer: conv2d     # encoder architecture type
transformer-lr: 1.0
transformer-warmup-steps: 25000
transformer-attn-dropout-rate: 0.0
transformer-length-normalized-loss: false
transformer-init: pytorch

# conformer specific setting
transformer-encoder-pos-enc-layer-type: rel_pos
transformer-encoder-selfattn-layer-type: rel_selfattn
transformer-encoder-activation-type: swish
macaron-style: true
use-cnn-module: true
cnn-module-kernel: 31


================================================
FILE: egs/asrucs/conf/tuning/train_pytorch_conformer_kernel31_large.yaml
================================================
# network architecture
# encoder related
elayers: 16
eunits: 2048
# decoder related
dlayers: 6
dunits: 2048
# attention related
adim: 512
aheads: 8

# hybrid CTC/attention
mtlalpha: 0.3

# label smoothing
lsm-weight: 0.1

# minibatch related
batch-size: 32
maxlen-in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
maxlen-out: 150 # if output length > maxlen-out, batchsize is automatically reduced

# optimization related
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
opt: noam
accum-grad: 2
grad-clip: 5
patience: 0
epochs: 50
dropout-rate: 0.1

# transformer specific setting
backend: pytorch
model-module: "espnet.nets.pytorch_backend.e2e_asr_conformer:E2E"
transformer-input-layer: conv2d     # encoder architecture type
transformer-lr: 1.0
transformer-warmup-steps: 25000
transformer-attn-dropout-rate: 0.0
transformer-length-normalized-loss: false
transformer-init: pytorch

# conformer specific setting
transformer-encoder-pos-enc-layer-type: rel_pos
transformer-encoder-selfattn-layer-type: rel_selfattn
transformer-encoder-activation-type: swish
macaron-style: true
use-cnn-module: true
cnn-module-kernel: 31


================================================
FILE: egs/asrucs/conf/tuning/train_pytorch_conformer_kernel31_small.yaml
================================================
# network architecture
# encoder related
elayers: 8
eunits: 1024
# decoder related
dlayers: 4
dunits: 1024
# attention related
adim: 256
aheads: 4

# hybrid CTC/attention
mtlalpha: 0.3

# label smoothing
lsm-weight: 0.1

# minibatch related
batch-size: 32
maxlen-in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
maxlen-out: 150 # if output length > maxlen-out, batchsize is automatically reduced

# optimization related
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
opt: noam
accum-grad: 2
grad-clip: 5
patience: 0
epochs: 50
dropout-rate: 0.1

# transformer specific setting
backend: pytorch
model-module: "espnet.nets.pytorch_backend.e2e_asr_conformer:E2E"
transformer-input-layer: conv2d     # encoder architecture type
transformer-lr: 1.0
transformer-warmup-steps: 25000
transformer-attn-dropout-rate: 0.0
transformer-length-normalized-loss: false
transformer-init: pytorch

# conformer specific setting
transformer-encoder-pos-enc-layer-type: rel_pos
transformer-encoder-selfattn-layer-type: rel_selfattn
transformer-encoder-activation-type: swish
macaron-style: true
use-cnn-module: true
cnn-module-kernel: 31


================================================
FILE: egs/asrucs/conf/tuning/train_pytorch_transformer.yaml
================================================
# network architecture
# encoder related
elayers: 12
eunits: 2048
# decoder related
dlayers: 6
dunits: 2048
# attention related
adim: 256
aheads: 4

# hybrid CTC/attention
mtlalpha: 0.3

# label smoothing
lsm-weight: 0.1

# minibatch related
batch-size: 32
maxlen-in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
maxlen-out: 150 # if output length > maxlen-out, batchsize is automatically reduced

# optimization related
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
opt: noam
accum-grad: 2
grad-clip: 5
patience: 0
epochs: 50
dropout-rate: 0.1

# transformer specific setting
backend: pytorch
model-module: "espnet.nets.pytorch_backend.e2e_asr_transformer:E2E"
transformer-input-layer: conv2d     # encoder architecture type
transformer-lr: 1.0
transformer-warmup-steps: 25000
transformer-attn-dropout-rate: 0.0
transformer-length-normalized-loss: false
transformer-init: pytorch


================================================
FILE: egs/asrucs/conf/tuning/train_rnn.yaml
================================================
# network architecture
# encoder related
etype: vggblstm     # encoder architecture type
elayers: 3
eunits: 1024
eprojs: 1024
subsample: "1_2_2_1_1" # skip every n frame from input to nth layers
# decoder related
dlayers: 2
dunits: 1024
# attention related
atype: location
adim: 1024
aconv-chans: 10
aconv-filts: 100

# hybrid CTC/attention
mtlalpha: 0.5

# minibatch related
batch-size: 30
maxlen-in: 800  # if input length  > maxlen-in, batchsize is automatically reduced
maxlen-out: 150 # if output length > maxlen-out, batchsize is automatically reduced

# optimization related
opt: adadelta
epochs: 10
patience: 0

# scheduled sampling option
sampling-probability: 0.0


================================================
FILE: egs/asrucs/conf/tuning/transducer/decode_default.yaml
================================================
# decoding parameters
batch: 0
beam-size: 10
search-type: default
score-norm: True


================================================
FILE: egs/asrucs/conf/tuning/transducer/train_conformer-rnn_transducer.yaml
================================================
# minibatch related
batch-size: 64
maxlen-in: 512
maxlen-out: 150

# optimization related
criterion: loss
early-stop-criterion: "validation/main/loss"
sortagrad: 0
opt: noam
transformer-lr: 1.0
transformer-warmup-steps: 25000
epochs: 100
patience: 0
accum-grad: 2
grad-clip: 5.0

# network architecture
## general
custom-enc-positional-encoding-type: rel_pos
custom-enc-self-attn-type: rel_self_attn
custom-enc-pw-activation-type: swish
## encoder related
etype: custom
custom-enc-input-layer: vgg2l
enc-block-arch:
        - type: conformer
          d_hidden: 512
          d_ff: 2048
          heads: 4
          macaron_style: True
          use_conv_mod: True
          conv_mod_kernel: 15
          dropout-rate: 0.3
          att-dropout-rate: 0.3
enc-block-repeat: 12
## decoder related
dtype: lstm
dlayers: 1
dec-embed-dim: 1024
dunits: 512
dropout-rate-embed-decoder: 0.2
dropout-rate-decoder: 0.1
## joint network related
joint-dim: 512

# transducer related
model-module: "espnet.nets.pytorch_backend.e2e_asr_transducer:E2E"

# reporter related
report-wer: True
report-cer: True


================================================
FILE: egs/asrucs/conf/tuning/transducer/train_conformer-rnn_transducer_aux_ngpu4.yaml
================================================
# minibatch related
batch-size: 32
maxlen-in: 512
maxlen-out: 150

# optimization related
criterion: loss
early-stop-criterion: "validation/main/loss"
sortagrad: 0
opt: noam
transformer-lr: 1.0
transformer-warmup-steps: 25000
epochs: 100
patience: 0
#accum-grad: 2
grad-clip: 5.0

# network architecture
## general
custom-enc-positional-encoding-type: rel_pos
custom-enc-self-attn-type: rel_self_attn
custom-enc-pw-activation-type: swish
## encoder related
etype: custom
custom-enc-input-layer: vgg2l
enc-block-arch:
        - type: conformer
          d_hidden: 512
          d_ff: 2048
          heads: 4
          macaron_style: True
          use_conv_mod: True
          conv_mod_kernel: 15
          dropout-rate: 0.3
          att-dropout-rate: 0.3
enc-block-repeat: 15
## decoder related
dtype: lstm
dlayers: 1
dec-embed-dim: 1024
dunits: 512
dropout-rate-embed-decoder: 0.2
dropout-rate-decoder: 0.1
## joint network related
joint-dim: 256

# transducer related
model-module: "espnet.nets.pytorch_backend.e2e_asr_transducer:E2E"

# reporter related
report-wer: True
report-cer: True


================================================
FILE: egs/asrucs/conf/tuning/transducer/train_conformer-rnn_transducer_aux_ngpu4_att.yaml
================================================
# minibatch related
batch-size: 32
maxlen-in: 512
maxlen-out: 150

# optimization related
criterion: loss
early-stop-criterion: "validation/main/loss"
sortagrad: 0
opt: noam
transformer-lr: 1.0
transformer-warmup-steps: 25000
epochs: 100
patience: 0
#accum-grad: 2
grad-clip: 5.0

# network architecture
## general
custom-enc-positional-encoding-type: rel_pos
custom-enc-self-attn-type: rel_self_attn
custom-enc-pw-activation-type: swish
## encoder related
etype: custom
custom-enc-input-layer: vgg2l
enc-block-arch:
        - type: conformer
          d_hidden: 512
          d_ff: 2048
          heads: 4
          macaron_style: True
          use_conv_mod: True
          conv_mod_kernel: 15
          dropout-rate: 0.3
          att-dropout-rate: 0.3
enc-block-repeat: 12
## decoder related
dtype: lstm
dlayers: 1
dec-embed-dim: 1024
dunits: 512
dropout-rate-embed-decoder: 0.2
dropout-rate-decoder: 0.1
## joint network related
joint-dim: 512

# transducer related
model-module: "espnet.nets.pytorch_backend.e2e_asr_transducer:E2E"

# reporter related
report-wer: True
report-cer: True

# Attention scorer auxiliary task: mainly follow the settings in LASCTC decoder
att-adim: 512
att-aheads: 8
att-dlayers: 6
att-dunits: 2048
att-dropout-rate: 0.1
att-attn-dropout-rate: 0.0
att-length-normalized-loss: false
lsm-weight: 0.1


================================================
FILE: egs/asrucs/conf/tuning/transducer/train_conformer-rnn_transducer_aux_ngpu4_small.yaml
================================================
# minibatch related
batch-size: 32
maxlen-in: 512
maxlen-out: 150

# optimization related
criterion: loss
early-stop-criterion: "validation/main/loss"
sortagrad: 0
opt: noam
transformer-lr: 1.0
transformer-warmup-steps: 25000
epochs: 100
patience: 0
#accum-grad: 2
grad-clip: 5.0

# network architecture
## general
custom-enc-positional-encoding-type: rel_pos
custom-enc-self-attn-type: rel_self_attn
custom-enc-pw-activation-type: swish
## encoder related
etype: custom
custom-enc-input-layer: vgg2l
enc-block-arch:
        - type: conformer
          d_hidden: 256
          d_ff: 1024
          heads: 4
          macaron_style: True
          use_conv_mod: True
          conv_mod_kernel: 15
          dropout-rate: 0.3
          att-dropout-rate: 0.3
enc-block-repeat: 12
## decoder related
dtype: lstm
dlayers: 1
dec-embed-dim: 512
dunits: 256
dropout-rate-embed-decoder: 0.2
dropout-rate-decoder: 0.1
## joint network related
joint-dim: 256

# transducer related
model-module: "espnet.nets.pytorch_backend.e2e_asr_transducer:E2E"

# reporter related
report-wer: True
report-cer: True

# auxiliary task
#aux-ctc: True
#aux-ctc-weight: 0.5
#aux-ctc-dropout-rate: 0.1


================================================
FILE: egs/asrucs/conf/tuning/transducer/train_conformer-rnn_transducer_ngpu4.yaml
================================================
# minibatch related
batch-size: 32
maxlen-in: 512
maxlen-out: 150

# optimization related
criterion: loss
early-stop-criterion: "validation/main/loss"
sortagrad: 0
opt: noam
transformer-lr: 1.0
transformer-warmup-steps: 25000
epochs: 100
patience: 0
#accum-grad: 2
grad-clip: 5.0

# network architecture
## general
custom-enc-positional-encoding-type: rel_pos
custom-enc-self-attn-type: rel_self_attn
custom-enc-pw-activation-type: swish
## encoder related
etype: custom
custom-enc-input-layer: vgg2l
enc-block-arch:
        - type: conformer
          d_hidden: 512
          d_ff: 2048
          heads: 4
          macaron_style: True
          use_conv_mod: True
          conv_mod_kernel: 15
          dropout-rate: 0.3
          att-dropout-rate: 0.3
enc-block-repeat: 12
## decoder related
dtype: lstm
dlayers: 1
dec-embed-dim: 1024
dunits: 512
dropout-rate-embed-decoder: 0.2
dropout-rate-decoder: 0.1
## joint network related
joint-dim: 512

# transducer related
model-module: "espnet.nets.pytorch_backend.e2e_asr_transducer:E2E"

# reporter related
report-wer: True
report-cer: True

# auxiliary task
aux-ctc: False
aux-ctc-weight: 0.0
aux-ctc-dropout-rate: 0.0


================================================
FILE: egs/asrucs/conf/tuning/transducer/train_conformer-rnn_transducer_ngpu4_large.yaml
================================================
# minibatch related
batch-size: 32
maxlen-in: 512
maxlen-out: 150

# optimization related
criterion: loss
early-stop-criterion: "validation/main/loss"
sortagrad: 0
opt: noam
transformer-lr: 1.0
transformer-warmup-steps: 25000
epochs: 100
patience: 0
#accum-grad: 2
grad-clip: 5.0

# network architecture
## general
custom-enc-positional-encoding-type: rel_pos
custom-enc-self-attn-type: rel_self_attn
custom-enc-pw-activation-type: swish
## encoder related
etype: custom
custom-enc-input-layer: vgg2l
enc-block-arch:
        - type: conformer
          d_hidden: 512
          d_ff: 2048
          heads: 4
          macaron_style: True
          use_conv_mod: True
          conv_mod_kernel: 31
          dropout-rate: 0.3
          att-dropout-rate: 0.3
enc-block-repeat: 16
## decoder related
dtype: lstm
dlayers: 2
dec-embed-dim: 1024
dunits: 1024
dropout-rate-embed-decoder: 0.2
dropout-rate-decoder: 0.1
## joint network related
joint-dim: 512

# transducer related
model-module: "espnet.nets.pytorch_backend.e2e_asr_transducer:E2E"

# reporter related
report-wer: True
report-cer: True

# auxiliary task
aux-ctc: False
aux-ctc-weight: 0.0
aux-ctc-dropout-rate: 0.0


================================================
FILE: egs/asrucs/conf/tuning/transducer/train_transducer.yaml
================================================
# minibatch related
batch-size: 64
maxlen-in: 512
maxlen-out: 150

# optimization related
criterion: loss
early-stop-criterion: "validation/main/loss"
sortagrad: 0
opt: adadelta
epochs: 30
patience: 3
accum-grad: 2

# network architecture
## encoder related
etype: vggblstm
elayers: 6
eunits: 512
eprojs: 512
dropout-rate: 0.4
## decoder related
dtype: lstm
dlayers: 1
dec-embed-dim: 1024
dunits: 512
dropout-rate-embed-decoder: 0.2
dropout-rate-decoder: 0.1
## joint network related
joint-dim: 512

# transducer related
model-module: "espnet.nets.pytorch_backend.e2e_asr_transducer:E2E"

# reporter related
report-wer: True
report-cer: True


================================================
FILE: egs/asrucs/conf/tuning/transducer/train_transducer_aux.yaml
================================================
# minibatch related
batch-size: 64
maxlen-in: 512
maxlen-out: 150

# optimization related
criterion: loss
early-stop-criterion: "validation/main/loss"
sortagrad: 0
opt: adadelta
epochs: 30
patience: 3
accum-grad: 2

# network architecture
## encoder related
etype: vggblstm
elayers: 6
eunits: 512
eprojs: 512
dropout-rate: 0.4
## decoder related
dtype: lstm
dlayers: 1
dec-embed-dim: 1024
dunits: 512
dropout-rate-embed-decoder: 0.2
dropout-rate-decoder: 0.1
## joint network related
joint-dim: 512

# transducer related
model-module: "espnet.nets.pytorch_backend.e2e_asr_transducer:E2E"

# reporter related
report-wer: True
report-cer: True

# auxiliary task
aux-ctc: True
aux-ctc-weight: 0.1
aux-ctc-dropout-rate: 0.1


================================================
FILE: egs/asrucs/espnet
================================================
../../../E2E-ASR-Framework/

================================================
FILE: egs/asrucs/espnet_utils
================================================
../espnet_utils/

================================================
FILE: egs/asrucs/local/add_seperator.py
================================================
import sys

def is_all_chinese(strs):
    """Tell whether ``strs`` consists solely of Chinese characters.

    A character counts as Chinese when its code point lies in the inclusive
    range U+4E00..U+9FA5. An empty input yields True because no character
    fails the check.
    """
    return all('\u4e00' <= ch <= '\u9fa5' for ch in strs)

# Usage: add_seperator.py <in_text> <out_text>
# Each input line is "<uttid> <token> <token> ..."; the uttid is dropped and
# every token that is not purely Chinese gets the separator prefixed.
in_f, out_f = sys.argv[1:]

# U+2581; presumably the SentencePiece word-boundary marker -- confirm.
seperator = u'\u2581'

# Read everything before opening the output so that in_f and out_f may be the
# same path; the context manager closes the input handle (it used to leak).
with open(in_f, encoding="utf-8") as reader:
    lines = reader.readlines()

# The context manager guarantees the output is flushed and closed even when a
# line raises part-way through.
with open(out_f, 'w', encoding="utf-8") as writer:
    for line in lines:
        tokens = line.strip().split()
        ans = [
            p if is_all_chinese(p) else seperator + p
            for p in tokens[1:]  # remove uttid
        ]
        writer.write(" ".join(ans) + '\n')


================================================
FILE: egs/asrucs/local/generate_fake_cs.py
================================================
import sys
import torch
import random
import torchaudio

MAX_LENGTH = 12 * 100 # 12 seconds


def read_datadir(d):
    """Load the ``wav.scp``, ``text`` and ``utt2num_frames`` tables of ``d``.

    Args:
        d: Path of a data directory holding the three files above.

    Returns:
        A tuple ``(wav_dict, text_dict, dur_dict)``, each keyed by utterance
        id: ``wav_dict`` maps to the second field of ``wav.scp``,
        ``text_dict`` to the transcript re-joined with single spaces, and
        ``dur_dict`` to the integer value from ``utt2num_frames``.

    Raises:
        ValueError: If a ``wav.scp`` or ``utt2num_frames`` line does not have
            exactly two whitespace-separated fields, or the frame count is
            not an integer.
        IndexError: If ``text`` contains a blank line.
    """
    # Every file is opened in a ``with`` block so the handle is released as
    # soon as the table has been read (the handles used to be leaked).
    wav_dict = {}
    with open(d + "/wav.scp", encoding="utf-8") as f:
        for line in f:
            uttid, item = line.split()
            wav_dict[uttid] = item

    text_dict = {}
    with open(d + "/text", encoding="utf-8") as f:
        for line in f:
            fields = line.split()
            text_dict[fields[0]] = " ".join(fields[1:])

    dur_dict = {}
    with open(d + "/utt2num_frames", encoding="utf-8") as f:
        for line in f:
            uttid, item = line.strip().split()
            dur_dict[uttid] = int(item)

    return wav_dict, text_dict, dur_dict

def generate_pairs(chn_dur_dict, eng_dur_dict, dur_maximum):
    """Randomly pair Chinese and English utterances up to a duration budget.

    Both key lists are shuffled and walked in lockstep (the longer list is
    truncated by ``zip``). A pair is kept only when its summed duration does
    not exceed ``MAX_LENGTH``; the two ids of a kept pair are themselves
    shuffled so either language may come first.

    Args:
        chn_dur_dict: Mapping of Chinese utterance id to duration.
        eng_dur_dict: Mapping of English utterance id to duration.
        dur_maximum: Pairing stops once the accumulated duration of the kept
            pairs is strictly greater than this value.

    Returns:
        List of two-element lists of utterance ids.
    """
    shuffled_chn = list(chn_dur_dict.keys())
    shuffled_eng = list(eng_dur_dict.keys())
    random.shuffle(shuffled_chn)
    random.shuffle(shuffled_eng)

    pairs = []
    accumulated = 0
    for chn_key, eng_key in zip(shuffled_chn, shuffled_eng):
        pair_dur = chn_dur_dict[chn_key] + eng_dur_dict[eng_key]
        if pair_dur <= MAX_LENGTH:
            ordered = [chn_key, eng_key]
            random.shuffle(ordered)
            pairs.append(ordered)

            accumulated += pair_dur
            # The pair that crosses the budget is still included.
            if accumulated > dur_maximum:
                break

    return pairs

def write_utts(tgt_dir, pair_list, wav_dict, text_dict):
    """Concatenate each utterance pair and write the resulting data files.

    For every ``(k1, k2)`` in ``pair_list`` the two waveforms are joined along
    the last axis and saved to ``<tgt_dir>/wavs/<k1>_and_<k2>.wav``. Matching
    entries are appended to ``<tgt_dir>/text`` and ``<tgt_dir>/wav.scp``.

    Args:
        tgt_dir: Output directory; it and its ``wavs`` sub-directory are
            created when missing.
        pair_list: Iterable of two-element sequences of utterance ids.
        wav_dict: Mapping of utterance id to audio path.
        text_dict: Mapping of utterance id to transcript.

    Raises:
        AssertionError: If the two files of a pair have different sample rates.
    """
    # Local import: the module header does not import os.
    import os

    # Saving fails on a fresh target directory unless wavs/ exists beforehand.
    os.makedirs(tgt_dir + '/wavs', exist_ok=True)

    def write_utt(path1, path2, path):
        """Load two audio files, join them in time and save the result."""
        wave1, sr1 = torchaudio.load(path1)
        wave2, sr2 = torchaudio.load(path2)
        assert sr1 == sr2, f"sample rate mismatch: {path1} vs {path2}"
        wave = torch.cat([wave1, wave2], dim=-1)
        torchaudio.save(path, wave, sample_rate=sr1)

    # Context managers close both writers even when audio I/O raises mid-run.
    with open(tgt_dir + "/text", 'w', encoding="utf-8") as text_writer, \
            open(tgt_dir + "/wav.scp", 'w', encoding="utf-8") as scp_writer:
        for i, (k1, k2) in enumerate(pair_list):
            uttid = k1 + "_and_" + k2
            wave_path = tgt_dir + '/wavs/' + uttid + '.wav'
            write_utt(wav_dict[k1], wav_dict[k2], wave_path)

            # Flush after each utterance so an interrupted run leaves both
            # tables consistent with the audio already written.
            text = uttid + " " + text_dict[k1] + ' ' + text_dict[k2] + "\n"
            text_writer.write(text)
            text_writer.flush()

            scp_info = uttid + " " + wave_path + "\n"
            scp_writer.write(scp_info)
            scp_writer.flush()

            if i % 10000 == 0:
                print(f"have generate {i} utts")

def main():
    """Generate a synthetic code-switching set from two monolingual sets.

    Command line: ``generate_fake_cs.py <chn_dir> <eng_dir> <tgt_dir>``.
    Both source directories are read with ``read_datadir``, utterances are
    paired by ``generate_pairs`` and the joined audio plus tables are written
    by ``write_utts``.
    """
    chn_dir, eng_dir, tgt_dir = sys.argv[1:4]

    chn_wavs, chn_texts, chn_durs = read_datadir(chn_dir)
    eng_wavs, eng_texts, eng_durs = read_datadir(eng_dir)

    # Single lookup tables for both languages; on an id collision the English
    # entry wins, exactly as with dict.update().
    all_wavs = {**chn_wavs, **eng_wavs}
    all_texts = {**chn_texts, **eng_texts}

    # 200 hours, written as 100 * 3600 * 200 (presumably 100 frames per
    # second, matching the MAX_LENGTH convention of this module).
    duration_budget = 100 * 3600 * 200
    pairs = generate_pairs(chn_durs, eng_durs, dur_maximum=duration_budget)
    write_utts(tgt_dir, pairs, all_wavs, all_texts)


if __name__ == "__main__":
    main()


================================================
FILE: egs/asrucs/local/prepare_fake_cs.sh
================================================
# Prepare the synthetic ("fake") code-switching pre-training data.
#
# Only the last two commands are active: they combine the three
# data_withcls.json files (zh, en, fake cs) into one JSON and then split it.
# Everything that is commented out records the earlier, already-executed
# preparation steps in the order they were run; re-enable them to rebuild
# data/train_cs_fake from scratch.
# NOTE(review): cs_dir is only referenced by the commented-out steps.
cs_dir=data/train_cs_fake/
# generate wav files
# python3 local/generate_fake_cs.py data/train_zh_trim/ data/train_en_trim/ data/train_cs_fake/

# cat $cs_dir/text | cut -d ' ' -f 1 | awk '{print $1, $1}' > $cs_dir/spk2utt 
# cp $cs_dir/spk2utt $cs_dir/utt2spk
# utils/fix_data_dir.sh $cs_dir

# steps/make_fbank_pitch.sh --nj 500 --write_utt2num_frames true \
#     $cs_dir exp/make_fbank/cs_fake fbank

# dump.sh  --nj 48 --do_delta false \
#     $cs_dir/feats.scp data/cmvn/cmvn.ark exp/dump_feats/fake_cs \
#     dump/train_cs_fake/deltafalse/

# python3 espnet_utils/text_norm.py --in-f data/train_cs_fake/text --out-f data/train_cs_fake/text_org --eng-upper
# mv data/train_cs_fake/text_org data/train_cs_fake/text

# feat_part_dir=dump/train_cs_fake/deltafalse/
# data2json.sh --nj 20 --feat $feat_part_dir/feats.scp --bpecode data/dict_cs/bpe.model \
#     data/train_cs_fake data/dict_cs/dict.txt > $feat_part_dir/data.json

# dict=data/dict_cs/dict.txt
# n_symbols=`wc -l $dict | cut -d ' ' -f 1`

# python3 espnet_utils/add_uttcls_json.py dump/train_cs_fake/deltafalse/data.json \
#             dump/train_cs_fake/deltafalse/data_withcls.json $[$n_symbols + 4]

# Merge the zh, en and fake-cs JSONs (with --shuffle) into a single file.
# The output directory dump/jsons must already exist for the redirection.
python3 espnet_utils/concatjson.py \
        dump/train_zh_trim/deltafalse/data_withcls.json \
        dump/train_en_trim/deltafalse/data_withcls.json \
        dump/train_cs_fake/deltafalse/data_withcls.json \
        --shuffle > dump/jsons/pretrain_data_fakecs.json
# Split the merged JSON with -p 8 (presumably one part per training rank,
# keeping the shuffled order -- confirm against splitjson.py).
python3 espnet_utils/splitjson.py -p 8 --original-order dump/jsons/pretrain_data_fakecs.json


================================================
FILE: egs/asrucs/nt.sh
================================================
#!/usr/bin/env bash

# author: tyriontian
# tyriontian@tencent.com

# Load the environment (PATH etc.) and the job-launcher definitions.
. ./path.sh || exit 1;
. ./cmd.sh || exit 1;

# Every variable assigned between here and the parse_options.sh call below is
# a default that can be overridden on the command line.

# general configuration
backend=pytorch
stage=0        # start from 0 if you need to start from data preparation
stop_stage=100
debugmode=1
dumpdir=dump   # directory to dump full features
N=0            # number of minibatches to be used (mainly for debugging). "0" uses all minibatches.
verbose=0      # verbose option
debug=false

# feature configuration
do_delta=false

preprocess_config=conf/specaug.yaml
train_config=conf/train_conformer-rnn_transducer_cs.yaml
decode_config=conf/tuning/transducer/decode_default.yaml

# decoding parameter
# NOTE: stage 3 overwrites recog_model with model.last${idx_average}.avg.best.
recog_model=model.acc.best # set a model to be used for decoding: 'model.acc.best' or 'model.loss.best'
resume=

# data
train_json=dump/jsons/split8utt/zh_en_cs_withcls.RANK.json
valid_json=dump/dev_cs/deltafalse/data_withcls.json
dict=data/dict_cs/dict.txt
nlsyms=data/dict_cs/nlsyms.txt
nbpe=5000
bpemodel=data/dict_cs/bpe.model

### Configurable parameters ###
tag=debug
# Passed to training as --world-size; stage 3 resets it to 0 for CPU decoding.
ngpu=8

# Train config
seed=888
batch_size=16
accum_grad=4
epochs=50
aux_ctc_weight=1.0
cs_lang_weight=0.0
cs_share_encoder=true
cs_share_encoder_layers=9
cs_is_pretrain=false
cs_use_adversial_examples=true
cs_is_ctc_decoder=true
cs_use_mask_predictor=false
enc_block_repeat=3
pretrain_model=

# Decode config 
decode_tag="default_decode"
idx_average=91_100
decode_feature=combine 
search_type="ctc_greedy" # ctc_greedy | ctc_beam | alsd 
beam_size=10
ngram_model=data/ngram/train_cs_5gram.bin
ngram_weight=0.0
rnnlm=exp/train_nnlm_combine/rnnlm.model.best
rnnlm_conf=exp/train_nnlm_combine/model.json
lm_weight=0.0
word_ngram=data/word_ngram/train_combine/
word_ngram_weight=0.0
eng_vocab=None
recog_set="test_zh test_en test_cs"

. utils/parse_options.sh || exit 1;

# In debug mode, export the single-node launch variables that the training
# stages read (HOST_GPU_NUM, HOST_NUM, INDEX, CHIEF_IP). Outside debug mode
# they must already be present in the environment, otherwise `set -u` below
# aborts the script when a training stage expands them.
if [ $debug == true ]; then
    export HOST_GPU_NUM=1
    export HOST_NUM=1
    export NODE_NUM=1
    export INDEX=0
    export CHIEF_IP="9.135.217.29"
fi

# Options shared by both training stages. The string is expanded unquoted so
# that it word-splits into separate arguments.
train_opts=\
"\
--seed $seed \
--batch-size $batch_size \
--accum-grad $accum_grad \
--epochs $epochs \
--aux-ctc-weight $aux_ctc_weight \
--cs-lang-weight $cs_lang_weight \
--cs-share-encoder $cs_share_encoder \
--cs-share-encoder-layers $cs_share_encoder_layers \
--cs-use-adversial-examples $cs_use_adversial_examples \
--cs-is-pretrain $cs_is_pretrain \
--cs-is-ctc-decoder $cs_is_ctc_decoder \
--cs-use-mask-predictor $cs_use_mask_predictor \
--enc-block-repeat $enc_block_repeat \
"

# Options appended to the asr_recog.py call in stage 3.
decode_opts=\
"\
--beam-size $beam_size \
--cs-nt-decode-feature $decode_feature \
--search-type $search_type \
--ngram-model $ngram_model \
--ngram-weight $ngram_weight \
--rnnlm $rnnlm \
--rnnlm-conf $rnnlm_conf \
--lm-weight $lm_weight \
--word-ngram-lower-char false \
--word-ngram $word_ngram \
--word-ngram-weight $word_ngram_weight \
--eng-vocab $eng_vocab \
"

# Strict mode from here on:
# -e exit on error, -u error on undefined variable,
# -o pipefail fail a pipeline when any command in it fails.
# (Command tracing, -x, is not enabled.)
set -e
set -u
set -o pipefail

# All outputs of this run go under exp/<tag>.
expname=${tag}
expdir=exp/${expname}
mkdir -p ${expdir}


# Stage 1: distributed pre-training. One process is launched per local GPU
# via torch.distributed.launch; stdout/stderr go to
# ${expdir}/pretrain.${INDEX}.txt.
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    echo "stage 1: Network Pre-Training: CTC bilingual system"
    # keep the model file 
    cp espnet/nets/pytorch_backend/e2e_asr_transducer_cs.py $expdir 
    MASTER_PORT=22275
    # $train_opts already contains "--cs-is-pretrain $cs_is_pretrain"
    # (default: false). The stage-specific "--cs-is-pretrain true" must come
    # AFTER it: with argparse-style parsing the last occurrence of a repeated
    # option wins, so the previous order silently disabled pre-training mode.
    NCCL_DEBUG=TRACE python3 -m torch.distributed.launch \
        --nproc_per_node ${HOST_GPU_NUM} --master_port $MASTER_PORT \
        --nnodes=${HOST_NUM} --node_rank=${INDEX} --master_addr=${CHIEF_IP} \
        ${MAIN_ROOT}/bin/asr_train.py \
        --config ${train_config} \
        --preprocess-conf ${preprocess_config} \
        --ngpu 1 \
        --backend ${backend} \
        --outdir ${expdir}/pretrain_RANK \
        --debugmode ${debugmode} \
        --dict ${dict} \
        --debugdir ${expdir} \
        --minibatches ${N} \
        --verbose ${verbose} \
        --resume ${resume} \
        --train-json $train_json \
        --valid-json $valid_json \
        --n-iter-processes 8 \
        --world-size $ngpu \
        --node-rank ${INDEX} \
        --node-size ${HOST_GPU_NUM} \
        --num-save-attention 0 \
        $train_opts --cs-is-pretrain true \
        > ${expdir}/pretrain.${INDEX}.txt 2>&1
fi


# Stage 2: distributed fine-tuning, started from $pretrain_model with
# --load-trainer-and-opt false. Output goes to ${expdir}/finetuning_RANK and
# the log to ${expdir}/finetuning.${INDEX}.txt.
# NOTE(review): pretrain_model defaults to empty, in which case --resume is
# passed without a value -- confirm asr_train.py accepts that.
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # The banner previously said "stage 1", which made the logs ambiguous.
    echo "stage 2: Network Fine-tuning: RNNT bilingual system"
    MASTER_PORT=22275
    NCCL_DEBUG=TRACE python3 -m torch.distributed.launch \
        --nproc_per_node ${HOST_GPU_NUM} --master_port $MASTER_PORT \
        --nnodes=${HOST_NUM} --node_rank=${INDEX} --master_addr=${CHIEF_IP} \
        ${MAIN_ROOT}/bin/asr_train.py \
        --config ${train_config} \
        --preprocess-conf ${preprocess_config} \
        --ngpu 1 \
        --backend ${backend} \
        --outdir ${expdir}/finetuning_RANK \
        --debugmode ${debugmode} \
        --dict ${dict} \
        --debugdir ${expdir} \
        --minibatches ${N} \
        --verbose ${verbose} \
        --train-json $train_json \
        --valid-json $valid_json \
        --n-iter-processes 8 \
        --world-size $ngpu \
        --node-rank ${INDEX} \
        --node-size ${HOST_GPU_NUM} \
        --num-save-attention 0 \
        --resume $pretrain_model \
        --load-trainer-and-opt false \
        $train_opts > ${expdir}/finetuning.${INDEX}.txt 2>&1

fi

# Stage 3: average checkpoints if needed, then decode and score every set in
# ${recog_set}.
# NOTE(review): the model is always taken from ${expdir}/pretrain_0, never
# from the finetuning_* output of stage 2 -- confirm this is intended.
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    echo "stage 3: Decoding"
    nj=5
    # Overrides the recog_model default set in the configuration section.
    recog_model=model.last${idx_average}.avg.best
    # Averaging is only considered for transformer/conformer model modules or
    # when the encoder/decoder type in the training config is "custom".
    if [[ $(get_yaml.py ${train_config} model-module) = *transformer* ]] || \
           [[ $(get_yaml.py ${train_config} model-module) = *conformer* ]] || \
           [[ $(get_yaml.py ${train_config} etype) = custom ]] || \
           [[ $(get_yaml.py ${train_config} dtype) = custom ]]; then \
        # Skip the averaging when the averaged model file already exists.
        if [ ! -f ${expdir}/pretrain_0/${recog_model} ]; then
            echo "conduct model average"
            average_checkpoints.py --backend ${backend} \
                                   --snapshots ${expdir}/pretrain_0/snapshot.ep.* \
                                   --out ${expdir}/pretrain_0/${recog_model} \
                                   --num ${idx_average} 
        fi
    fi
    decode_parent_dir=${decode_tag}
    for rtask in ${recog_set}; do
        # Results for each set land in
        # ${expdir}/<decode_tag>/<set>_<decode_feature>_<search_type>.
        decode_dir=$decode_parent_dir/${rtask}_${decode_feature}_${search_type}
        feat_recog_dir=${dumpdir}/${rtask}/delta${do_delta}

        # split data
        splitjson.py --parts ${nj} ${feat_recog_dir}/data.json

        #### use CPU for decoding
        ngpu=0

        # One decoding job per JSON split; JOB is substituted by ${decode_cmd}.
        ${decode_cmd} JOB=1:${nj} --max-job 100 ${expdir}/${decode_dir}/log/decode.JOB.log \
            asr_recog.py \
            --config ${decode_config} \
            --ngpu ${ngpu} \
            --backend ${backend} \
            --batchsize 0 \
            --recog-json ${feat_recog_dir}/split${nj}utt/data.JOB.json \
            --result-label ${expdir}/${decode_dir}/data.JOB.json \
            --model ${expdir}/pretrain_0/${recog_model} \
            $decode_opts

        # Only the code-switching test set is additionally scored with
        # "--mer true".
        if [ $rtask == "test_cs" ]; then 
            mer_opt="--mer true"
        else
            mer_opt=""
        fi
        
        # NOTE(review): the nlsyms path is hard-coded here instead of using
        # ${nlsyms}; the two are identical unless nlsyms is overridden.
        score_sclite.sh --bpe $nbpe --bpemodel ${bpemodel} --wer true $mer_opt --nlsyms data/dict_cs/nlsyms.txt \
          ${expdir}/${decode_dir} ${dict} > ${expdir}/${decode_dir}/decode_result.txt
    done
    echo "Finished"
fi


================================================
FILE: egs/asrucs/path.sh
================================================
../aishell1/path.sh

================================================
FILE: egs/asrucs/prepare.sh
================================================
#!/usr/bin/env bash

# author: tyriontian
# tianjinchuan@stu.pku.edu.cn ; tyriontian@tencent.com

# A Code-Switch ASR recipe

# Source the recipe environment and the job-launcher definitions; abort if
# either file cannot be sourced.
. ./path.sh || exit 1;
. ./cmd.sh || exit 1;

# Default settings. They are assigned before utils/parse_options.sh is sourced,
# which presumably lets command-line options override them - confirm.
# stage / stop_stage: first and last stage to run (inclusive).
stage=2
stop_stage=100
# Root directory of the dumped (normalised) features.
dumpdir=dump
# Output directory handed to make_fbank_pitch.sh.
# NOTE(review): "fank" looks like a typo of "fbank" - confirm before renaming,
# since existing feature archives may already live under this name.
fbankdir=fank
# Passed to dump.sh as --do_delta; also part of the dump directory name.
do_delta=false
# Vocabulary size used when training the sentencepiece model.
nbpe=5000
# Extra text corpus; only referenced by the commented-out LM code below.
oteam_cs_text=../oteam_asr3/data/eng/text

. utils/parse_options.sh || exit 1;

# Stage 0: extract fbank+pitch features for the three training sets, estimate
# CMVN statistics on a random subset, and dump features for all sets.
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    echo "stage 0: prepare features"
    # make raw features and remove long-short utts
    # (the filtered copy of data/<part> is written to data/<part>_trim)
    for part in train_zh train_en train_cs; do 
        steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 500 --write_utt2num_frames true \
            data/$part exp/make_fbank/$part ${fbankdir}
        espnet_utils/remove_longshortdata.sh --maxframes 1200 --maxchars 400 data/${part} data/${part}_trim
    done

    # compute cmvn on 10000 randomly chosen utterances pooled from the three
    # trimmed training sets
    mkdir -p data/cmvn
    cat data/train_zh_trim/feats.scp data/train_en_trim/feats.scp data/train_cs_trim/feats.scp \
        | shuf | head -n 10000 > data/cmvn/feats_cmvn.scp
    compute-cmvn-stats scp:data/cmvn/feats_cmvn.scp data/cmvn/cmvn.ark

    # dump features without speed perturb
    # NOTE(review): feats.scp for the dev_* and test_* sets is not produced in
    # this stage - presumably prepared beforehand; confirm.
    for part in train_zh_trim train_en_trim train_cs_trim \
                dev_zh dev_en dev_cs test_zh test_en test_cs; do
        feat_part_dir=${dumpdir}/${part}/delta${do_delta}; mkdir -p ${feat_part_dir}
        dump.sh --cmd "$train_cmd" --nj 48 --do_delta ${do_delta} \
            data/${part}/feats.scp data/cmvn/cmvn.ark exp/dump_feats/${part} \
            ${feat_part_dir}
    done
fi

# Dictionary / BPE artifacts. They are assigned outside the stage guard so the
# variables stay defined even when stage 1 itself is skipped.
mkdir -p data/dict_cs
dict=data/dict_cs/dict.txt
nlsyms=data/dict_cs/nlsyms.txt
bpemodel=data/dict_cs/bpe

# Stage 1: build the joint Chinese-character / English-BPE dictionary, train
# the sentencepiece model, create data.json for every set, and create
# data_withcls.json variants that carry an utterance-level class id.
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    echo "stage 1: prepare dictionary and json file"

    # prepare special symbols (ids 1-3)
    echo "<unk> 1" > $dict; echo "<chn> 2" >> $dict; echo "<eng> 3" >> $dict
    echo "<chn>" > $nlsyms; echo "<eng>" >> $nlsyms

    # chn symbols: unique tokens of the Chinese training text, numbered after
    # the three special symbols
    text2token.py -s 1 -n 1 data/train_zh_trim/text | cut -f 2- -d" " | tr " " "\n" \
      | sort | uniq | grep -v -e '^\s*$' | awk '{print $0 " " NR+3}' >> ${dict}
    nchn_symbols=$(wc -l $dict | cut -d ' ' -f 1)

    # eng symbols and bpe models: BPE pieces are numbered after all entries
    # that are already in the dictionary
    cat data/train_en_trim/text | cut -d ' ' -f 2- > data/dict_cs/bpe_input.txt
    spm_train --input=data/dict_cs/bpe_input.txt --vocab_size=${nbpe} --model_type=unigram \
        --model_prefix=${bpemodel} --input_sentence_size=100000000
    spm_encode --model=${bpemodel}.model --output_format=piece --split-chn < data/dict_cs/bpe_input.txt \
        | tr ' ' '\n' | sort | uniq \
        | awk -v offset=${nchn_symbols} '{print $0 " " NR+offset}' >> ${dict}

    # TODO: an "eng words" step was sketched here but never implemented. The
    # bare `cat` of the English/CS training text that stood in its place only
    # flooded stdout, so it is not executed any more.

    # make json files
    for part in train_zh_trim train_en_trim train_cs_trim \
                dev_zh dev_en dev_cs test_zh test_en test_cs; do
        feat_part_dir=${dumpdir}/${part}/delta${do_delta}
        data2json.sh --nj 20 --feat $feat_part_dir/feats.scp --bpecode ${bpemodel}.model \
            data/$part $dict > $feat_part_dir/data.json
    done

    # Add language-id in label sequence; consider <blk> <eos>
    # We add these label only for model inference -> no test sets
    # Class ids: zh = n_symbols + 2, en = n_symbols + 3, cs = n_symbols + 4.
    # The json files are read from the same ${dumpdir}/.../delta${do_delta}
    # location the loop above wrote them to.
    n_symbols=$(wc -l $dict | cut -d ' ' -f 1)
    cls_offset=2
    for lang in zh en cs; do
        for part in train_${lang}_trim dev_${lang}; do
            feat_part_dir=${dumpdir}/${part}/delta${do_delta}
            python3 espnet_utils/add_uttcls_json.py ${feat_part_dir}/data.json \
                ${feat_part_dir}/data_withcls.json $((n_symbols + cls_offset))
        done
        cls_offset=$((cls_offset + 1))
    done
fi

# Stage 2: merge the per-set json files into the combined dev / training json
# files under ${dumpdir}/jsons, once without and once with class labels, and
# split every training json into 8 parts.
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    echo "stage 2: Process json files in multiple styles"

    # The shell redirections below fail if the output directory is missing.
    jsondir=${dumpdir}/jsons
    mkdir -p ${jsondir}
    deltadir=delta${do_delta}

    # First pass ("" suffix): plain data.json.
    # Second pass ("_withcls"): json files with class label.
    for suffix in "" _withcls; do
        json_name=data${suffix}.json
        train_zh_json=${dumpdir}/train_zh_trim/${deltadir}/${json_name}
        train_en_json=${dumpdir}/train_en_trim/${deltadir}/${json_name}

        # dev (kept in order, not shuffled)
        python3 espnet_utils/concatjson.py \
            ${dumpdir}/dev_zh/${deltadir}/${json_name} \
            ${dumpdir}/dev_en/${deltadir}/${json_name} \
            ${dumpdir}/dev_cs/${deltadir}/${json_name} \
            > ${jsondir}/dev${suffix}.json

        # zh + en
        python3 espnet_utils/concatjson.py \
            ${train_zh_json} \
            ${train_en_json} \
            --shuffle > ${jsondir}/zh_en${suffix}.json

        # zh + en + cs
        python3 espnet_utils/concatjson.py \
            ${train_zh_json} \
            ${train_en_json} \
            ${dumpdir}/train_cs_trim/${deltadir}/${json_name} \
            --shuffle > ${jsondir}/zh_en_cs${suffix}.json

        # zh + en + fakecs
        # NOTE(review): train_cs_fake is not produced by this script -
        # presumably generated elsewhere; confirm it exists before running.
        python3 espnet_utils/concatjson.py \
            ${train_zh_json} \
            ${train_en_json} \
            ${dumpdir}/train_cs_fake/${deltadir}/${json_name} \
            --shuffle > ${jsondir}/zh_en_fakecs${suffix}.json
    done

    # Split every training json into 8 parts; the splits run in parallel and
    # are all awaited before the stage ends.
    for json in zh_en zh_en_withcls zh_en_cs zh_en_cs_withcls zh_en_fakecs zh_en_fakecs_withcls; do
        python3 espnet_utils/splitjson.py -p 8 --original-order ${jsondir}/${json}.json &
    done; wait
fi

# Stage 3 (LMs): every command of this stage is currently commented out, so
# the stage only prints its banner. The disabled code is kept for reference:
# BPE-level n-gram LMs, a token-level neural LM, and word-level LM inputs.
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    echo "stage 3: build LMs"
    
#    mkdir -p data/ngram
#    # process corpus
#    for part in train_zh train_en train_cs dev_zh dev_en dev_cs; do
#        cut -d " " -f 2- data/$part/text | spm_encode --model=${bpemodel}.model \
#          --output_format=piece > data/ngram/${part}_input.txt
#    done
#    cut -d " " -f 2- $oteam_cs_text | spm_encode --model=${bpemodel}.model \
#          --output_format=piece > data/ngram/oteam_cs_input.txt
#
#    rm -f data/ngram/train_combine_input.txt data/ngram/dev_combine_input.txt 
#    cat data/ngram/train*_input.txt > data/ngram/train_combine_input.txt
#    cat data/ngram/dev*_input.txt   > data/ngram/dev_combine_input.txt
#  
#    # train N-gram LM 
#    for part in zh en cs combine; do 
#        lmplz --discount_fallback -o 5 < data/ngram/train_${part}_input.txt \
#          > data/ngram/${part}_5gram.arpa
#        build_binary data/ngram/${part}_5gram.arpa data/ngram/${part}_5gram.bin
#
#        cat data/ngram/train_${part}_input.txt data/ngram/oteam_cs_input.txt \
#          > data/ngram/train_${part}_input_extended.txt
#        lmplz --discount_fallback -o 5 <  data/ngram/train_${part}_input_extended.txt\
#          > data/ngram/${part}_5gram_oteam.arpa
#        build_binary data/ngram/${part}_5gram_oteam.arpa data/ngram/${part}_5gram_oteam.bin
#    done
#
#    # train token-level LM
#    for part in combine; do
#        ${cuda_cmd} --gpu 4 exp/train_nnlm_${part}/train.log \
#            lm_train.py \
#            --config conf/lm_transformer.yaml \
#            --ngpu 4 \
#            --backend pytorch \
#            --verbose 1 \
#            --outdir exp/train_nnlm_${part} \
#            --train-label data/ngram/train_${part}_input.txt \
#            --valid-label data/ngram/dev_${part}_input.txt \
#            --dict ${dict}
#    done

#    mkdir -p data/word_lm; rm data/word_lm/*
#    for part in train_zh train_en train_cs dev_zh dev_en dev_cs; do
#        python3 espnet_utils/text_norm.py --in-f data/${part}/text \
#          --out-f data/word_lm/${part}_input.txt --eng-upper --segment-chn 
#    done
#    rm -f data/word_lm/train_combine_input.txt 
#    rm -f data/word_lm/dev_combine_input.txt
#    cat data/word_lm/train_* > data/word_lm/train_combine_input.txt
#    cat data/word_lm/dev_*   > data/word_lm/dev_combine_input.txt
#
#    word_dict=data/word_lm/dict.txt
#    text2vocabulary.py -s 65000 -o ${word_dict} data/word_lm/train_combine_input.txt

    
fi
# The script always stops here: everything after this line is unreachable
# until this exit is removed.
exit 0;

# Stage 3 (word N-gram LMs): builds mixed BPE/word n-gram LMs per language and
# for the combined corpus, then converts them to FST text form.
# NOTE: this stage is currently unreachable because of the unconditional
# `exit 0` that precedes it, and it reuses stage number 3.
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    echo "stage 3: build word N-gram LMs"
    mkdir -p data/word_ngram;
   
    # prepare corpus: for every training set, write a BPE-piece version and a
    # word-level version of the normalised text and concatenate both into the
    # LM training input
    for part in train_zh train_en train_cs; do
        python3 espnet_utils/text_norm.py --in-f data/${part}/text \
          --out-f data/word_ngram/${part}_seg.txt --segment --eng-upper
        cut -d ' ' -f 2- data/word_ngram/${part}_seg.txt | spm_encode \
          --model=${bpemodel}.model --output_format=piece \
          > data/word_ngram/${part}_bpe.txt
        python3 local/add_seperator.py data/word_ngram/${part}_seg.txt \
          data/word_ngram/${part}_word.txt
        cat data/word_ngram/${part}_bpe.txt data/word_ngram/${part}_word.txt \
          > data/word_ngram/${part}_input.txt
    done 
    cat data/word_ngram/*_input.txt > data/word_ngram/train_combine_input.txt

    # prepare words.txt: unique tokens of the combined input, without <unk>
    words=data/word_ngram/words.txt
    cat data/word_ngram/train_combine_input.txt | tr " " "\n" | sort | uniq | \
      grep -v '<unk>' > ${words} 

    # train N-gram and convert to torch version
    # Symbol table: <eps>=0, <unk>=1, then the words, and finally the
    # disambiguation symbol #0 with the next free id.
    words_disambig=data/word_ngram/words_disambig.txt
    (echo "<eps>"; echo "<unk>") | cat - $words |\
      awk '{print $0 " " NR-1}' > $words_disambig
    echo "#0 `wc -l $words_disambig | cut -d ' ' -f 1`" \
      >> $words_disambig
    for part in train_zh train_en train_cs train_combine; do
        # NOTE(review): training requests --order 5 but the file unpacked below
        # is named srilm.o3g.kn.gz - verify against train_lms_srilm.sh that
        # this is the intended model.
        bash espnet_utils/train_lms_srilm.sh \
          --unk "<unk>" --lm-opts -wbdiscount --order 5 \
          $words data/word_ngram/${part}_input.txt \
          data/word_ngram/$part
        gunzip -c data/word_ngram/$part/srilm/srilm.o3g.kn.gz \
          > data/word_ngram/$part/lm.arpa
        cp $words_disambig data/word_ngram/$part/words.txt
        # id 1 is <unk> in the symbol table built above
        echo 1 > data/word_ngram/$part/oov.int

        # ARPA -> text-format FST using the symbol table above
        python3 -m kaldilm \
          --read-symbol-table=$words_disambig \
          --disambig-symbol='#0' \
          --max-order=5 \
          data/word_ngram/$part/lm.arpa \
          > data/word_ngram/$part/G.fst.txt

        python3 espnet/nets/scorers/word_ngram.py data/word_ngram/$part 
    done 
fi


================================================
FILE: egs/asrucs/steps
================================================
../steps/

================================================
FILE: egs/asrucs/text
================================================


================================================
FILE: egs/asrucs/utils
================================================
../utils/

================================================
FILE: egs/espnet_utils/add_uttcls_json.py
================================================
import json
import sys

def main():
    """Prepend a class id to the target token-id string of every utterance.

    Positional command-line arguments (read from ``sys.argv``):
        1. path of the input json (must contain a top-level ``"utts"`` dict)
        2. path of the output json
        3. class id; used verbatim as a string and joined to the existing
           ``tokenid`` string of the first output with a single space

    Only ``output[0]["tokenid"]`` is changed; every other field, including
    ``shape``, is written back untouched.
    """
    in_json = sys.argv[1]
    out_json = sys.argv[2]
    clsid = sys.argv[3]

    # Use a context manager so the input handle is closed deterministically.
    with open(in_json, encoding="utf-8") as reader:
        j = json.load(reader)

    for utt in j["utts"].values():
        target = utt["output"][0]
        target["tokenid"] = clsid + " " + target["tokenid"]

    # Written as UTF-8 bytes so non-ASCII tokens are stored unescaped.
    with open(out_json, "wb") as f:
        f.write(
            json.dumps(
                j, indent=4, ensure_ascii=False, sort_keys=True
            ).encode("utf_8")
        )

if __name__ == "__main__":
    main()


================================================
FILE: egs/espnet_utils/addjson.py
================================================
#!/usr/bin/env python3
# encoding: utf-8

# Copyright 2018 Nagoya University (Tomoki Hayashi)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)


import argparse
import codecs
import json
import logging
import sys

from distutils.util import strtobool

from espnet.utils.cli_utils import get_commandline_args

is_python2 = sys.version_info[0] == 2


def get_parser():
    """Create the argument parser of the json-merging command-line tool.

    Returns:
        argparse.ArgumentParser: parser with the positional ``jsons`` list,
        the ``-i/--is-input`` switch and the ``--verbose/-V`` level.
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="add multiple json values to an input or output value",
    )
    # One or more json files; the first one is the base the others are added to.
    cli.add_argument("jsons", nargs="+", type=str, help="json files")
    cli.add_argument(
        "-i",
        "--is-input",
        type=strtobool,
        default=True,
        help="If true, add to input. If false, add to output",
    )
    cli.add_argument("--verbose", "-V", type=int, default=0, help="Verbose option")
    return cli


# Script entry point: load all json files given on the command line, keep only
# the utterances present in every file, append the values of the additional
# files as one new "input" (or "output") entry of the first file's utterances,
# and print the merged json to stdout as UTF-8.
if __name__ == "__main__":
    parser = get_parser()
    args = parser.parse_args()

    # logging info
    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    # make intersection set for utterance keys
    # NOTE(review): "no intersection yet" and "empty intersection" are both
    # represented by an empty collection, so if the first file has no
    # utterances the next file's keys become the intersection - confirm that
    # inputs are never empty.
    js = []
    intersec_ks = []
    for x in args.jsons:
        with codecs.open(x, "r", encoding="utf-8") as f:
            j = json.load(f)
        ks = j["utts"].keys()
        logging.info(x + ": has " + str(len(ks)) + " utterances")
        if len(intersec_ks) > 0:
            intersec_ks = intersec_ks.intersection(set(ks))
            if len(intersec_ks) == 0:
                # nothing in common: stop loading further files
                logging.warning("Empty intersection")
                break
        else:
            intersec_ks = set(ks)
        js.append(j)
    logging.info("new json has " + str(len(intersec_ks)) + " utterances")

    # updated original dict to keep intersection
    # (the first json file is the base that receives the additions)
    intersec_org_dic = dict()
    for k in intersec_ks:
        v = js[0]["utts"][k]
        intersec_org_dic[k] = v

    # Values to add: the second file's entry, updated in place with the
    # entries of all remaining files (later files win on duplicate keys).
    # js[1] is accessed whenever the intersection is non-empty, so at least
    # two json files are needed in that case.
    intersec_add_dic = dict()
    for k in intersec_ks:
        v = js[1]["utts"][k]
        for j in js[2:]:
            v.update(j["utts"][k])
        intersec_add_dic[k] = v

    new_dic = dict()
    for key_id in intersec_org_dic:
        orgdic = intersec_org_dic[key_id]
        adddic = intersec_add_dic[key_id]

        if "utt2spk" not in orgdic:
            orgdic["utt2spk"] = ""
        # NOTE: for machine translation

        # add as input
        if args.is_input:
            # original input
            input_list = orgdic["input"]
            # additional input
            in_add_dic = {}
            # "ilen"/"idim" are folded into a single "shape" entry
            if "idim" in adddic and "ilen" in adddic:
                in_add_dic["shape"] = [int(adddic["ilen"]), int(adddic["idim"])]
            elif "idim" in adddic:
                in_add_dic["shape"] = [int(adddic["idim"])]
            # add all other key value
            for key, value in adddic.items():
                if key in ["idim", "ilen"]:
                    continue
                in_add_dic[key] = value
            # add name (1-based position of the new entry in the input list)
            in_add_dic["name"] = "input%d" % (len(input_list) + 1)

            # input_list is the original list object, so it is extended in place
            input_list.append(in_add_dic)
            new_dic[key_id] = {
                "input": input_list,
                "output": orgdic["output"],
                "utt2spk": orgdic["utt2spk"],
            }
        # add as output
        else:
            # original output
            output_list = orgdic["output"]
            # additional output
            out_add_dic = {}
            # add shape ("olen"/"odim" are folded into a single "shape" entry)
            if "odim" in adddic and "olen" in adddic:
                out_add_dic["shape"] = [int(adddic["olen"]), int(adddic["odim"])]
            elif "odim" in adddic:
                out_add_dic["shape"] = [int(adddic["odim"])]
            # add all other key value
            for key, value in adddic.items():
                if key in ["odim", "olen"]:
                    continue
                out_add_dic[key] = value
            # add name (1-based position of the new entry in the output list)
            out_add_dic["name"] = "target%d" % (len(output_list) + 1)

            output_list.append(out_add_dic)
            new_dic[key_id] = {
                "input": orgdic["input"],
                "output": output_list,
                "utt2spk": orgdic["utt2spk"],
            }
            # "lang" is carried over only in this (output) branch
            if "lang" in orgdic.keys():
                new_dic[key_id]["lang"] = orgdic["lang"]

    # ensure "ensure_ascii=False", which is a bug
    jsonstring = json.dumps(
        {"utts": new_dic},
        indent=4,
        ensure_ascii=False,
        sort_keys=True,
        separators=(",", ": "),
    )
    # Wrap stdout in a UTF-8 writer (the byte buffer on Python 3) so that the
    # non-ASCII json text is emitted as UTF-8.
    sys.stdout = codecs.getwriter("utf-8")(
        sys.stdout if is_python2 else sys.stdout.buffer
    )
    print(jsonstring)


================================================
FILE: egs/espnet_utils/apply-cmvn.py
================================================
#!/usr/bin/env python3
import argparse
from distutils.util import strtobool
import logging

import kaldiio
import numpy

from espnet.transform.cmvn import CMVN
from espnet.utils.cli_readers import file_reader_helper
from espnet.utils.cli_utils import get_commandline_args
from espnet.utils.cli_utils import is_scipy_wav_style
from espnet.utils.cli_writers import file_writer_helper


def get_parser():
    """Build the command-line parser of the CMVN application tool.

    Returns:
        argparse.ArgumentParser: parser with the file-type options, the
        normalisation switches, the speaker-map options, the writer options
        and the three positionals (stats, input rspecifier, output wspecifier).
    """
    parser = argparse.ArgumentParser(
        description="apply mean-variance normalization to files",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
    parser.add_argument(
        "--in-filetype",
        type=str,
        default="mat",
        choices=["mat", "hdf5", "sound.hdf5", "sound"],
        help="Specify the file format for the rspecifier. "
        '"mat" is the matrix format in kaldi',
    )
    parser.add_argument(
        "--stats-filetype",
        type=str,
        default="mat",
        choices=["mat", "hdf5", "npy"],
        help="Specify the file format for the rspecifier. "
        '"mat" is the matrix format in kaldi',
    )
    parser.add_argument(
        "--out-filetype",
        type=str,
        default="mat",
        choices=["mat", "hdf5"],
        help="Specify the file format for the wspecifier. "
        '"mat" is the matrix format in kaldi',
    )

    # The help text of --norm-means used to repeat the one of --norm-vars;
    # it now describes the option it belongs to.
    parser.add_argument(
        "--norm-means",
        type=strtobool,
        default=True,
        help="Do mean normalization or not.",
    )
    parser.add_argument(
        "--norm-vars",
        type=strtobool,
        default=False,
        help="Do variance normalization or not.",
    )
    parser.add_argument(
        "--reverse", type=strtobool, default=False, help="Do reverse mode or not"
    )
    parser.add_argument(
        "--spk2utt",
        type=str,
        help="A text file of speaker to utterance-list map. "
        "(Don't give rspecifier format, such as "
        '"ark:spk2utt")',
    )
    parser.add_argument(
        "--utt2spk",
        type=str,
        help="A text file of utterance to speaker map. "
        "(Don't give rspecifier format, such as "
        '"ark:utt2spk")',
    )
    parser.add_argument(
        "--write-num-frames", type=str, help="Specify wspecifier for utt2num_frames"
    )
    parser.add_argument(
        "--compress", type=strtobool, default=False, help="Save in compressed format"
    )
    parser.add_argument(
        "--compression-method",
        type=int,
        default=2,
        help="Specify the method(if mat) or " "gzip-level(if hdf5)",
    )
    parser.add_argument(
        "stats_rspecifier_or_rxfilename",
        help="Input stats. e.g. ark:stats.ark or stats.mat",
    )
    parser.add_argument(
        "rspecifier", type=str, help="Read specifier id. e.g. ark:some.ark"
    )
    parser.add_argument(
        "wspecifier", type=str, help="Write specifier id. e.g. ark:some.ark"
    )
    return parser


def main():
    """Apply CMVN to every item of the input rspecifier and write the result.

    The statistics come either from an rspecifier (the argument contains a
    ``:``), in which case they are looked up per key, or from a single stats
    file that is applied to every item.
    """
    args = get_parser().parse_args()

    # logging info
    log_level = logging.INFO if args.verbose > 0 else logging.WARN
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )
    logging.info(get_commandline_args())

    stats_source = args.stats_rspecifier_or_rxfilename
    keyed_stats = ":" in stats_source
    if keyed_stats:
        # "npy" stats are read through the "hdf5" reader in rspecifier mode.
        reader_type = "hdf5" if args.stats_filetype == "npy" else args.stats_filetype
        stats_dict = dict(file_reader_helper(stats_source, reader_type))
    elif args.stats_filetype == "mat":
        stats_dict = {None: kaldiio.load_mat(stats_source)}
    else:
        stats_dict = {None: numpy.load(stats_source)}

    cmvn = CMVN(
        stats=stats_dict,
        norm_means=args.norm_means,
        norm_vars=args.norm_vars,
        utt2spk=args.utt2spk,
        spk2utt=args.spk2utt,
        reverse=args.reverse,
    )

    writer_options = dict(
        filetype=args.out_filetype,
        write_num_frames=args.write_num_frames,
        compress=args.compress,
        compression_method=args.compression_method,
    )
    with file_writer_helper(args.wspecifier, **writer_options) as writer:
        for utt, mat in file_reader_helper(args.rspecifier, args.in_filetype):
            if is_scipy_wav_style(mat):
                # If data is sound file, then got as Tuple[int, ndarray]
                _, mat = mat
            # Global stats are stored under the key None.
            writer[utt] = cmvn(mat, utt if keyed_stats else None)


if __name__ == "__main__":
    main()


================================================
FILE: egs/espnet_utils/asr_align_wav.sh
================================================
#!/usr/bin/env bash

# Copyright 2020 Johns Hopkins University (Xuankai Chang)
# 2020 Technische Universität München, Authors: Ludwig Kürzinger, Dominik Winkelbauer
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

# The script relies on path.sh / cmd.sh of a recipe, so it must be started
# from a recipe directory.
if [ ! -f path.sh ] || [ ! -f cmd.sh ]; then
    echo "Please change current directory to recipe directory e.g., egs/tedlium2/asr1"
    exit 1
fi

. ./path.sh

# general configuration
# (all variables below are assigned before utils/parse_options.sh is sourced)
python=python3
backend=pytorch
stage=-1       # start from -1 if you need to start from model download
stop_stage=100
ngpu=0         # number of gpus ("0" uses cpu, otherwise use gpu)
verbose=1      # verbose option

# feature configuration
do_delta=false
cmvn=

# decoding parameter
align_model=
align_config=
align_dir=align
api=v1

# Parameters for CTC alignment
# The subsampling factor depends on whether the encoder uses subsampling
subsampling_factor=4
# minimum confidence score in log space - may need adjustment depending on data and model, e.g. -1.5 or -5.0
min_confidence_score=-5.0
# minimum length of one utterance (counted in frames)
min_window_size=8000
# partitioning length L for calculation of the confidence score
scoring_length=30


# download related
models=tedlium2.rnn.v2
dict=
nlsyms=

. utils/parse_options.sh || exit 1;

help_message=$(cat <<EOF
Usage:
    $0 [options] <wav_file> "<text>"

Options:
    --backend <chainer|pytorch>     # chainer or pytorch (Default: pytorch)
    --ngpu <ngpu>                   # Number of GPUs (Default: 0)
    --align-dir <directory_name>    # Name of directory to store decoding temporary data
    --models <model_name>           # Model name (e.g. tedlium2.transformer.v1)
    --cmvn <path>                   # Location of cmvn.ark
    --align-model <path>            # Location of E2E model
    --align-config <path>           # Location of configuration file
    --api <api_version>             # API version (v1 or v2, available in only pytorch backend)
    --nlsyms <path>                 # Non-linguistic symbol list

Example:
    # Record audio from microphone input as example.wav
    rec -c 1 -r 16000 example.wav trim 0 5

    # Align using model name
    $0 --models tedlium2.transformer.v1 example.wav "example text"

    # Align using model file
    $0 --cmvn cmvn.ark --align_model model.acc.best --align_config conf/align.yaml example.wav

    # Align with GPU (require batchsize > 0 in configuration file)
    $0 --ngpu 1 example.wav

Available models:
    - tedlium2.rnn.v1
    - tedlium2.rnn.v2
    - tedlium2.transformer.v1
    - tedlium3.transformer.v1
    - librispeech.transformer.v1
    - librispeech.transformer.v1.transformerlm.v1
    - commonvoice.transformer.v1
    - csj.transformer.v1
    - csj.rnn.v1
    - wsj.transformer.v1
    - wsj.transformer_small.v1
EOF
)


# make shellcheck happy
train_cmd=

. ./cmd.sh

# Positional arguments: the wav file and the transcription to align.
wav=$1
text=$2
download_dir=${align_dir}/download

# Exactly two positional arguments are required.
if [ ! $# -eq 2 ]; then
    echo "${help_message}"
    exit 1;
fi

# Set bash to 'debug' mode, it will exit on :
# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
set -e
set -u
set -o pipefail

# check backend: the chainer backend is rejected
if [ "${backend}" = "chainer" ]; then
    echo "chainer backend is not supported." >&2
    exit 1;
fi

# Check model name or model file is set
# (without a model name, cmvn, align_model and align_config must all be given)
if [ -z $models ]; then
    if [[ -z $cmvn || -z $align_model || -z $align_config ]]; then
        echo 'Error: models or set of cmvn, align_model and align_config are required.' >&2
        exit 1
    fi
fi

# Check for transformer models because of their memory consumption
# (any model name that does not contain "rnn" is reported as a Transformer)
if [[ $models == *"rnn"* ]]; then
    echo "Using RNN model: "${models}
else
    echo "Using Transformer model: "${models}
    echo "WARNING. For large audio files, use an RNN model."
fi

# Per-model download directory.
dir=${download_dir}/${models}
mkdir -p ${dir}

# Download and unpack the pretrained model named by ${models} into ${dir}.
# Does nothing when no model name is set, exits with an error for an unknown
# name, and skips the download when ${dir}/.complete already exists.
# The two RNN models additionally force api=v1.
function download_models () {
    # nothing to fetch without a model name
    [ -z $models ] && return

    file_ext="tar.gz"
    case "${models}" in
        "tedlium2.rnn.v1") share_url="https://drive.google.com/open?id=1UqIY6WJMZ4sxNxSugUqp3mrGb3j6h7xe"; api=v1 ;;
        "tedlium2.rnn.v2") share_url="https://drive.google.com/open?id=1cac5Uc09lJrCYfWkLQsF8eapQcxZnYdf"; api=v1 ;;
        "tedlium2.transformer.v1") share_url="https://drive.google.com/open?id=1cVeSOYY1twOfL9Gns7Z3ZDnkrJqNwPow" ;;
        "tedlium3.transformer.v1") share_url="https://drive.google.com/open?id=1zcPglHAKILwVgfACoMWWERiyIquzSYuU" ;;
        "librispeech.transformer.v1") share_url="https://drive.google.com/open?id=1BtQvAnsFvVi-dp_qsaFP7n4A_5cwnlR6" ;;
        "librispeech.transformer.v1.transformerlm.v1") share_url="https://drive.google.com/open?id=17cOOSHHMKI82e1MXj4r2ig8gpGCRmG2p" ;;
        "commonvoice.transformer.v1") share_url="https://drive.google.com/open?id=1tWccl6aYU67kbtkm8jv5H6xayqg1rzjh" ;;
        "csj.transformer.v1") share_url="https://drive.google.com/open?id=120nUQcSsKeY5dpyMWw_kI33ooMRGT2uF" ;;
        "csj.rnn.v1") share_url="https://drive.google.com/open?id=1ALvD4nHan9VDJlYJwNurVr7H7OV0j2X9" ;;
        "wsj.transformer.v1") share_url="https://drive.google.com/open?id=1Az-4H25uwnEFa4lENc-EKiPaWXaijcJp" ;;
        "wsj.transformer_small.v1") share_url="https://drive.google.com/open?id=1jdEKbgWhLTxN_qP4xwE7mTOPmp7Ga--T" ;;
        *) echo "No such models: ${models}"; exit 1 ;;
    esac

    # the marker file makes repeated calls cheap: download only once
    [ -e ${dir}/.complete ] || {
        download_from_google_drive.sh ${share_url} ${dir} ${file_ext}
        touch ${dir}/.complete
    }
}

# Download trained models
# Every setting that was not given explicitly is taken from the downloaded
# model directory (first match found by `find`).
if [ -z "${cmvn}" ]; then
    download_models
    cmvn=$(find ${download_dir}/${models} -name "cmvn.ark" | head -n 1)
fi
if [ -z "${align_model}" ]; then
    download_models
    align_model=$(find ${download_dir}/${models} -name "model*.best*" | head -n 1)
fi
if [ -z "${align_config}" ]; then
    download_models
    align_config=$(find ${download_dir}/${models} -name "decode*.yaml" | head -n 1)
fi
if [ -z "${wav}" ]; then
    download_models
    wav=$(find ${download_dir}/${models} -name "*.wav" | head -n 1)
fi
if [ -z "${dict}" ]; then
    download_models

    # No dictionary given: generate one ("<char> <1-based index>" per line)
    # from the char_list stored in the model's model.json.
    mkdir -p ${download_dir}/${models}/data/lang_autochar/
    model_config=$(find -L ${download_dir}/${models}/exp/*/results/model.json | head -n 1)
    dict=${download_dir}/${models}/data/lang_autochar/dict.txt
    # Run the one-liner with the configured interpreter: it calls print() as a
    # function inside a list comprehension, which is a syntax error when a
    # bare `python` resolves to Python 2.
    ${python} -c 'import json,sys;obj=json.load(sys.stdin);[print(char + " " + str(i + 1)) for i, char in enumerate(obj[2]["char_list"])]' > ${dict} < ${model_config}
fi

# Check file existence
# Each guard prints its message and stops the script when the required file
# is missing (or, for the transcription, when the text is empty).
[ -f "${cmvn}" ] || { echo "No such CMVN file: ${cmvn}"; exit 1; }
[ -f "${align_model}" ] || { echo "No such E2E model: ${align_model}"; exit 1; }
[ -f "${align_config}" ] || { echo "No such config file: ${align_config}"; exit 1; }
[ -f "${dict}" ] || { echo "No such Dictionary file: ${dict}"; exit 1; }
[ -f "${wav}" ] || { echo "No such WAV file: ${wav}"; exit 1; }
[ -n "${text}" ] || { echo "Text is empty: ${text}"; exit 1; }

# utterance id = wav file name without directory and ".wav" suffix
base=$(basename $wav .wav)

# Stage 0: create a one-utterance Kaldi-style data directory for the wav file.
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    echo "stage 0: Data preparation"

    mkdir -p ${align_dir}/data
    # a single utterance ${base} spoken by the dummy speaker "X"
    echo "$base $wav" > ${align_dir}/data/wav.scp
    echo "X $base" > ${align_dir}/data/spk2utt
    echo "$base X" > ${align_dir}/data/utt2spk
    echo "$base $text" > ${align_dir}/data/text
fi

# Stage 1: extract fbank+pitch features and dump them using the given CMVN stats.
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    echo "stage 1: Feature Generation"

    steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 1 --write_utt2num_frames true \
        ${align_dir}/data ${align_dir}/log ${align_dir}/fbank || exit 1;

    feat_align_dir=${align_dir}/dump; mkdir -p ${feat_align_dir}
    dump.sh --cmd "$train_cmd" --nj 1 --do_delta ${do_delta} \
        ${align_dir}/data/feats.scp ${cmvn} ${align_dir}/log \
        ${feat_align_dir}
    utils/fix_data_dir.sh ${align_dir}/data
fi

# Stage 2: build data.json from the dumped features, the text and the dictionary.
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    echo "stage 2: Json Data Preparation"

    # pass --nlsyms only when a non-linguistic symbol list was given
    nlsyms_opts=""
    if [[ -n ${nlsyms} ]]; then
        nlsyms_opts="--nlsyms ${nlsyms}"
    fi

    feat_align_dir=${align_dir}/dump
    data2json.sh --feat ${feat_align_dir}/feats.scp ${nlsyms_opts} \
        ${align_dir}/data ${dict} > ${feat_align_dir}/data.json || exit 1;

fi

# Stage 3: run the alignment and report how many segments exceed the
# confidence threshold.
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    echo "stage 3: Aligning"
    feat_align_dir=${align_dir}/dump

    # NOTE(review): ${align_dir}/utt_text is not created anywhere in this
    # script - confirm how asr_align expects to use --utt-text.
    ${python} -m espnet.bin.asr_align \
        --config ${align_config} \
        --ngpu ${ngpu} \
        --verbose ${verbose} \
        --data-json ${feat_align_dir}/data.json \
        --model ${align_model} \
        --subsampling-factor ${subsampling_factor} \
        --min-window-size ${min_window_size} \
        --scoring-length ${scoring_length} \
        --api ${api} \
        --utt-text ${align_dir}/utt_text \
        --output ${align_dir}/aligned_segments || exit 1;

    echo ""
    echo "Segments file: $(wc -l ${align_dir}/aligned_segments)"
    # count the lines whose 5th field is above min_confidence_score
    count_reliable=$(awk -v ms=${min_confidence_score} '{ if ($5 > ms) {print} }' ${align_dir}/aligned_segments | wc -l)
    echo "Utterances with min confidence score: ${count_reliable}"
    echo "Finished."
fi


================================================
FILE: egs/espnet_utils/average_checkpoints.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import argparse
import json
import os

import numpy as np


def _val_score(log, metric):
    """Return a higher-is-better validation score for one log entry.

    Loss-like metrics are negated and perplexity is inverted so that a larger
    score is always better. Returns None when the entry lacks the metric.
    """
    scorers = {
        "acc": ("validation/main/acc", lambda v: v),
        "perplexity": ("val_perplexity", lambda v: 1 / v),
        "loss": ("validation/main/loss", lambda v: -v),
        "bleu": ("validation/main/bleu", lambda v: v),
        "cer": ("validation/main/cer", lambda v: -v),
        "cer_ctc": ("validation/main/cer_ctc", lambda v: -v),
    }
    if metric in scorers:
        candidates = [metric]
    else:
        # Keep original order for compatibility
        candidates = ["acc", "perplexity", "loss"]
    for name in candidates:
        key, to_score = scorers[name]
        if key in log:
            return to_score(log[key])
    return None


def main():
    """Average model snapshots and save the result to ``args.out``.

    Reads the module-level ``args`` namespace (see ``get_parser``).

    * With ``--log``: the ``args.num`` epochs with the best validation score
      (epochs above ``args.max_epoch`` are ignored) are selected, and the
      snapshots ``snapshot.ep.<epoch>`` next to ``args.snapshots[0]`` are
      averaged.
    * Without ``--log``: snapshots are sorted by their numeric suffix;
      ``args.num`` is either a count ``N`` (last N snapshots) or a 1-based
      inclusive range ``START_END``.

    ``args.num`` is overwritten with the number of snapshots actually averaged.

    Raises:
        ValueError: if the metric is absent from the log, no snapshot is
            selected, or the backend is unknown.
    """
    if args.log is not None:
        with open(args.log) as f:
            logs = json.load(f)
        val_scores = []
        for log in logs:
            if log["epoch"] > args.max_epoch:
                continue
            score = _val_score(log, args.metric)
            if score is not None:
                val_scores.append([log["epoch"], score])

        if len(val_scores) == 0:
            raise ValueError("%s is not found in log." % args.metric)
        val_scores = np.array(val_scores)
        sort_idx = np.argsort(val_scores[:, -1])
        sorted_val_scores = val_scores[sort_idx][::-1]
        best = sorted_val_scores[: int(args.num)]
        print("metric: %s" % args.metric)
        print("best val scores = " + str(best[:, 1]))
        print("selected epochs = " + str(best[:, 0].astype(np.int64)))
        snapshot_dir = os.path.dirname(args.snapshots[0])
        last = [
            snapshot_dir + "/snapshot.ep.%d" % (int(epoch)) for epoch in best[:, 0]
        ]
    else:
        # argparse leaves a non-string default untouched, so ``args.num`` may
        # be an int here; normalise before using string methods.
        num_spec = str(args.num)
        print(num_spec)
        last = sorted(args.snapshots, key=lambda x: int(x.split(".")[-1]))
        if num_spec.isdigit():
            last = last[-int(num_spec):]
        else:
            start, end = num_spec.split("_")
            start, end = int(start) - 1, int(end)
            last = last[start:end]

    if not last:
        raise ValueError("no snapshots selected for averaging")
    # Divide by the number of snapshots actually summed: the log may contain
    # fewer usable epochs than requested.
    args.num = len(last)
    print("average over", last)
    avg = None

    if args.backend == "pytorch":
        import torch

        # sum
        for path in last:
            states = torch.load(path, map_location=torch.device("cpu"))["model"]
            if avg is None:
                avg = states
            else:
                for k in avg.keys():
                    avg[k] += states[k]

        # average
        for k in avg.keys():
            if avg[k] is not None:
                if avg[k].is_floating_point():
                    avg[k] /= args.num
                else:
                    # integer tensors cannot be true-divided in place
                    avg[k] //= args.num

        torch.save(avg, args.out)

    elif args.backend == "chainer":
        # sum
        for path in last:
            states = np.load(path)
            if avg is None:
                keys = [x.split("main/")[1] for x in states if "model" in x]
                avg = dict()
                for k in keys:
                    avg[k] = states["updater/model:main/{}".format(k)]
            else:
                for k in keys:
                    avg[k] += states["updater/model:main/{}".format(k)]
        # average
        for k in keys:
            if avg[k] is not None:
                avg[k] /= args.num
        np.savez_compressed(args.out, **avg)
        os.rename("{}.npz".format(args.out), args.out)  # numpy save with .npz extension
    else:
        raise ValueError("Incorrect type of backend")


def get_parser():
    """Build the command-line parser for snapshot averaging.

    ``--num`` is kept as a string because it accepts either a count (``"10"``)
    or a 1-based inclusive range (``"3_7"``).
    """
    parser = argparse.ArgumentParser(description="average models from snapshot")
    parser.add_argument("--snapshots", required=True, type=str, nargs="+")
    parser.add_argument("--out", required=True, type=str)
    # The default must already be a string: argparse only applies ``type`` to
    # string defaults, and callers use string methods on this value.
    parser.add_argument("--num", default="10", type=str)
    parser.add_argument("--backend", default="chainer", type=str)
    parser.add_argument("--log", default=None, type=str, nargs="?")
    parser.add_argument(
        "--metric",
        default="",
        type=str,
        nargs="?",
        choices=["acc", "bleu", "cer", "cer_ctc", "loss", "perplexity"],
    )
    parser.add_argument(
        "--max-epoch",
        default=10000000,
        type=int,
        nargs="?",
    )
    return parser


if __name__ == "__main__":
    # ``args`` is intentionally a module-level global: main() reads it directly.
    args = get_parser().parse_args()
    main()


================================================
FILE: egs/espnet_utils/build_fake_lexicon.py
================================================
import sys
import os

# Path of the original lexicon; only the first field (the word) of each line is used.
orig_lexicon = sys.argv[1]

# Emit a character-level "fake" lexicon on stdout: every ordinary word is
# spelled out as its characters, while "<...>" symbols and silence words map
# to themselves.
with open(orig_lexicon, encoding="utf-8") as lexicon:
    for line in lexicon:
        fields = line.split()
        if not fields:
            # Blank line: nothing to emit (indexing the first field would fail).
            continue
        word = fields[0]
        if word.startswith("<") or word in ("SIL", "sil"):
            print(f"{word} {word}")
        else:
            print(" ".join([word] + list(word)))
        

================================================
FILE: egs/espnet_utils/build_sp_text.py
================================================
import sys

# Path of the input text file: "<uttid> <token> <token> ..." per line.
in_f = sys.argv[1]

# For every utterance print three copies on stdout, one per prefix
# "sp0.9-", "sp1.0-" and "sp1.1-" prepended to the utterance id.
with open(in_f, 'r', encoding="utf8") as text:
    for line in text:
        elems = line.split()
        if not elems:
            # Blank line: no utterance id to rewrite.
            continue
        uttid = elems[0]
        transcript = " ".join(elems[1:])
        for sp in ["0.9", "1.0", "1.1"]:
            print(f"sp{sp}-{uttid} " + transcript)


================================================
FILE: egs/espnet_utils/calculate_rtf.py
================================================
#!/usr/bin/env python3
# encoding: utf-8

# Copyright 2021 Kyoto University (Hirofumi Inaguma)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

import argparse
import codecs
from dateutil import parser
import glob
import os


def get_parser():
    """Return the argument parser; its only option is ``--log-dir``."""
    cli = argparse.ArgumentParser(description="calculate real time factor (RTF)")
    log_dir_options = dict(
        type=str,
        default=None,
        help="path to logging directory",
    )
    cli.add_argument("--log-dir", **log_dir_options)
    return cli


def main():
    """Print total audio duration, decoding time, RTF and per-utterance latency.

    Scans every ``decode.*.log`` under ``--log-dir``. A line containing
    ``INFO: input lengths`` supplies the input length and the start timestamp
    of an utterance; a line containing ``INFO: prediction`` supplies its end
    timestamp. Timestamps are the text before the first ``(`` on the line.
    """
    args = get_parser().parse_args()

    audio_sec = 0
    decode_sec = 0
    n_utt = 0

    for log_path in glob.glob(os.path.join(args.log_dir, "decode.*.log")):
        # Reset per file: the totals below are accumulated once per log, so
        # carrying these lists over would re-count earlier logs every time.
        audio_durations = []
        start_times = []
        end_times = []
        with codecs.open(log_path, "r", "utf-8") as f:
            for line in f:
                x = line.strip()
                if "INFO: input lengths" in x:
                    audio_durations += [int(x.split("input lengths: ")[1])]
                    start_times += [parser.parse(x.split("(")[0])]
                elif "INFO: prediction" in x:
                    end_times += [parser.parse(x.split("(")[0])]
        assert len(audio_durations) == len(end_times), (
            len(audio_durations),
            len(end_times),
        )
        assert len(start_times) == len(end_times), (len(start_times), len(end_times))
        # Lengths are divided by 100 to obtain seconds
        # (presumably 10 ms frames -- confirm against the decoder's logging).
        audio_sec += sum(audio_durations) / 100  # [sec]
        decode_sec += sum(
            (end - start).total_seconds()
            for start, end in zip(start_times, end_times)
        )
        n_utt += len(audio_durations)

    print("Total audio duration: %.3f [sec]" % audio_sec)
    print("Total decoding time: %.3f [sec]" % decode_sec)
    rtf = decode_sec / audio_sec if audio_sec > 0 else 0
    print("RTF: %.3f" % rtf)
    latency = decode_sec * 1000 / n_utt if n_utt > 0 else 0
    print("Latency: %.3f [ms/sentence]" % latency)


if __name__ == "__main__":
    main()


================================================
FILE: egs/espnet_utils/change_root.py
================================================
# author: tyriontian
# tyriontian@tencent.com

# This script changes the root directory recorded inside data files, such as
# an espnet-format dump directory. By changing the root, the data can be
# transplanted into other experiments.

# e.g., python3 espnet_utils/change_root.py /mnt/ceph_asr_ts/tomasyu/jinchuan/las_mmi/ /apdcephfs/share_1149801/speech_user/tomasyu/jinchuan/lasmmi/ dump/dev_test/ ".json,.scp"
import sys
import os

if len(sys.argv) < 5:
    sys.exit(
        f"Usage: {sys.argv[0]} <org_prefix/> <dst_prefix/> <root_dir> <suffix1,suffix2,...>"
    )

org_pref = sys.argv[1]
dst_pref = sys.argv[2]
root = sys.argv[3]
suffix = sys.argv[4]

# Both prefixes must end with "/" so that only whole path components are replaced.
if not org_pref.endswith('/') or not dst_pref.endswith('/'):
    raise ValueError("path should end with /")

suffix = suffix.strip().split(",")
print(f"Working under the directory: {root}")
print(f"Change the prefix in all files that end with {suffix}")
print(f"The initial prefix is {org_pref}; The destination prefix is {dst_pref}")

# BFS search: find all files to change root
suffix_tuple = tuple(suffix)
queue = [root]
flist = []
while queue:
    cur_dir = queue.pop(0)
    for d in os.listdir(cur_dir):
        d = os.path.join(cur_dir, d)

        if os.path.isfile(d):
            # A tuple makes endswith() test all suffixes at once, so a file
            # matching several suffixes is listed (and rewritten) only once.
            if d.endswith(suffix_tuple):
                flist.append(d)
                print(f"File to change: {d}")
        elif os.path.isdir(d):
            queue.append(d)

# process these files one by one: rewrite each file in place with the prefix replaced
for f in flist:
    with open(f, 'r+', encoding="utf-8") as handle:
        context = handle.readlines()
        handle.seek(0)
        handle.truncate(0)
        handle.writelines(line.replace(org_pref, dst_pref) for line in context)
    

================================================
FILE: egs/espnet_utils/change_yaml.py
================================================
#!/usr/bin/env python3
import argparse
from pathlib import Path

import yaml


def get_parser():
    """Create the CLI parser for editing attributes of a YAML file.

    ``-o/--outyaml`` and ``--outdir`` are mutually exclusive; ``-a/--arg`` and
    ``-d/--delete`` may each be given several times and are collected in lists.
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="change specified attributes of a YAML file",
    )

    output_choice = cli.add_mutually_exclusive_group()
    cli.add_argument("inyaml", nargs="?")
    output_choice.add_argument("-o", "--outyaml")
    output_choice.add_argument("--outdir")

    repeatable_options = (
        ("-a", "--arg", "e.g -a a.b.c=4 -> {'a': {'b': {'c': 4}}}"),
        ("-d", "--delete", 'e.g -d a -> "a" is removed from the input yaml'),
    )
    for short_flag, long_flag, help_text in repeatable_options:
        cli.add_argument(
            short_flag,
            long_flag,
            action="append",
            default=[],
            help=help_text,
        )
    return cli


def main():
    """Apply ``-a key=value`` / ``-d key`` edits to a YAML file and write it out.

    Keys are dotted paths (``a.b.c``); a path component that indexes into a
    list/tuple is converted to an integer. When ``-o/--outyaml`` is not given,
    the output name is derived from the input name and the edits. The output
    path is printed on stdout.
    """
    args = get_parser().parse_args()

    if args.inyaml is None:
        indict = {}
    else:
        with open(args.inyaml, "r") as f:
            # NOTE: yaml.Loader can construct arbitrary Python objects;
            # only use this tool on trusted YAML files.
            indict = yaml.load(f, Loader=yaml.Loader)
        if indict is None:
            indict = {}

    if args.outyaml is None:
        # Auto naming from arguments
        eles = []
        if args.inyaml is not None:
            p = Path(args.inyaml)
            if args.outdir is None:
                outdir = p.parent
            else:
                outdir = Path(args.outdir)
            eles.append(str(outdir / p.stem))

        # Map brackets to filename-friendly characters and drop shell-special ones.
        table = str.maketrans("{}[]()", "%%__--", " |&;#*?~\"'\\")
        for arg in args.delete:
            value = arg.translate(table)
            eles.append("del-" + value)
        for arg in args.arg:
            if "=" not in arg:
                raise RuntimeError(f'"{arg}" does\'t include "="')
            # Split on the first "=" only so that values may contain "=".
            key, value = arg.split("=", 1)
            key = key.translate(table)
            value = value.translate(table)
            eles.append(key + value)

        outyaml = "_".join(eles)
        if outyaml == "":
            outyaml = "config"
        outyaml += ".yaml"
        if args.inyaml == outyaml:
            # Avoid overwriting the input: write "<stem>.2.yaml" next to it.
            # (args.outyaml is None in this branch, so it cannot be used here.)
            p = Path(outyaml)
            outyaml = p.parent / (p.stem + ".2" + p.suffix)

        outyaml = Path(outyaml)
    else:
        outyaml = Path(args.outyaml)

    for arg in args.delete + args.arg:
        if "=" in arg:
            key, value = arg.split("=", 1)
            if not value.strip() == "":
                # The value is parsed as YAML so numbers/lists/dicts keep their type.
                value = yaml.load(value, Loader=yaml.Loader)
        else:
            # No "=": treat the argument as a key to delete.
            key = arg
            value = None

        keys = key.split(".")
        d = indict
        for idx, k in enumerate(keys):
            if idx == len(keys) - 1:
                if isinstance(d, (tuple, list)):
                    k = int(k)
                    if k >= len(d):
                        # Pad the sequence with None up to index k.
                        d += type(d)(None for _ in range(k - len(d) + 1))
                if value is not None:
                    d[k] = value
                else:
                    del d[k]
            else:
                if isinstance(d, (tuple, list)):
                    k = int(k)
                    if k >= len(d):
                        d += type(d)(None for _ in range(k - len(d) + 1))
                elif isinstance(d, dict):
                    if k not in d:
                        d[k] = {}
                # Replace scalars on the path by a dict so we can descend.
                if not isinstance(d[k], (dict, tuple, list)):
                    d[k] = {}
                d = d[k]

    outyaml.parent.mkdir(parents=True, exist_ok=True)
    with outyaml.open("w") as f:
        yaml.dump(indict, f, Dumper=yaml.Dumper, indent=4, sort_keys=False)
    print(outyaml)


if __name__ == "__main__":
    main()


================================================
FILE: egs/espnet_utils/clean_corpus.sh
================================================
#!/usr/bin/env bash

# Copyright 2021 Kyoto University (Hirofumi Inaguma)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

# Defaults; each can be overridden on the command line via parse_options.sh.
maxframes=3000
maxchars=400
utt_extra_files="text.tc text.lc text.lc.rm"
no_feat=false

help_message=$(cat <<EOF
Usage: $0 [options] <data-dir> <langs>
e.g.: $0 data/train "en de"
Options:
  --maxframes        # number of maximum input frame length
  --maxchars         # number of maximum character length
  --utt_extra_files  # extra text files for target sequence
  --no_feat          # set to True for MT recipe
EOF
)
echo "$0 $*"  # Print the command line for logging

. parse_options.sh || exit 1;

# Both positional arguments are required: with only <data-dir>, "langs=$2"
# below would abort under "set -u" with an unhelpful unbound-variable error.
if [ $# -ne 2 ]; then
    echo "${help_message}"
    exit 1;
fi

set -euo pipefail

data_dir=$1
langs=$2

mkdir -p ${data_dir}
tmpdir=$(mktemp -d ${data_dir}/tmp-XXXXX)
trap 'rm -rf ${tmpdir}' EXIT

# remove utt having more than ${maxframes} frames
# remove utt having more than ${maxchars} characters
for lang in ${langs}; do
    remove_longshortdata.sh --no_feat ${no_feat} --maxframes ${maxframes} --maxchars ${maxchars} ${data_dir}.${lang} ${tmpdir}.${lang}
done

# Match the number of utterances between source and target languages
for lang in ${langs}; do
    cut -f 1 -d " " ${tmpdir}.${lang}/text > ${tmpdir}.${lang}/reclist
    if [ ! -f ${tmpdir}/reclist ]; then
        cp ${tmpdir}.${lang}/reclist  ${tmpdir}/reclist
    else
        # extract common lines (comm expects both lists to be sorted)
        comm -12 ${tmpdir}/reclist ${tmpdir}.${lang}/reclist > ${tmpdir}/reclist.tmp
        mv ${tmpdir}/reclist.tmp ${tmpdir}/reclist
    fi
done

# Keep only the utterances common to all languages, then fix each data dir.
for lang in ${langs}; do
    reduce_data_dir.sh ${tmpdir}.${lang} ${tmpdir}/reclist ${data_dir}.${lang}
    utils/fix_data_dir.sh --utt_extra_files "${utt_extra_files}" ${data_dir}.${lang}
done

# The EXIT trap only removes ${tmpdir}; this also removes the per-language ${tmpdir}.<lang> dirs.
rm -rf ${tmpdir}*


================================================
FILE: egs/espnet_utils/compute-cmvn-stats.py
================================================
#!/usr/bin/env python3
import argparse
import logging

import kaldiio
import numpy as np

from espnet.transform.transformation import Transformation
from espnet.utils.cli_readers import file_reader_helper
from espnet.utils.cli_utils import get_commandline_args
from espnet.utils.cli_utils import is_scipy_wav_style
from espnet.utils.cli_writers import file_writer_helper


def get_parser():
    """Build the CLI parser for computing CMVN statistics.

    Positional arguments are the feature read specifier and either a write
    specifier (per-utterance / per-speaker output) or a plain filename
    (global output).
    """
    parser = argparse.ArgumentParser(
        # Adjacent literals are concatenated verbatim, so every fragment
        # carries its own separating space/punctuation.
        description="Compute cepstral mean and "
        "variance normalization statistics. "
        "If wspecifier provided: per-utterance by default, "
        "or per-speaker if "
        "spk2utt option provided; if wxfilename: global",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--spk2utt",
        type=str,
        help="A text file of speaker to utterance-list map. "
        "(Don't give rspecifier format, such as "
        '"ark:utt2spk")',
    )
    parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
    parser.add_argument(
        "--in-filetype",
        type=str,
        default="mat",
        choices=["mat", "hdf5", "sound.hdf5", "sound"],
        help="Specify the file format for the rspecifier. "
        '"mat" is the matrix format in kaldi',
    )
    parser.add_argument(
        "--out-filetype",
        type=str,
        default="mat",
        choices=["mat", "hdf5", "npy"],
        help="Specify the file format for the wspecifier. "
        '"mat" is the matrix format in kaldi',
    )
    parser.add_argument(
        "--preprocess-conf",
        type=str,
        default=None,
        help="The configuration file for the pre-processing",
    )
    parser.add_argument(
        "rspecifier", type=str, help="Read specifier for feats. e.g. ark:some.ark"
    )
    parser.add_argument(
        "wspecifier_or_wxfilename", type=str, help="Write specifier. e.g. ark:some.ark"
    )
    return parser


def main():
    """Accumulate CMVN statistics from the input features and write them out.

    The mode is chosen from the second positional argument: if it contains
    ":" it is treated as a write specifier and statistics are grouped per
    utterance (or per speaker when ``--spk2utt`` is given); otherwise it is
    treated as a filename and a single global statistic is written.

    For each group the output is a float64 array whose first axis has two
    rows: row 0 holds the feature sums followed by the frame count, row 1
    holds the squared-feature sums followed by 0.
    """
    args = get_parser().parse_args()

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    # A ":" in the output argument marks it as a write specifier (e.g. "ark:...").
    is_wspecifier = ":" in args.wspecifier_or_wxfilename

    if is_wspecifier:
        if args.spk2utt is not None:
            logging.info("Performing as speaker CMVN mode")
            # Invert the speaker -> utterances file into utterance -> speaker.
            utt2spk_dict = {}
            with open(args.spk2utt) as f:
                for line in f:
                    spk, utts = line.rstrip().split(None, 1)
                    for utt in utts.split():
                        utt2spk_dict[utt] = spk

            def utt2spk(x):
                return utt2spk_dict[x]

        else:
            logging.info("Performing as utterance CMVN mode")

            # Each utterance is its own group.
            def utt2spk(x):
                return x

        if args.out_filetype == "npy":
            logging.warning(
                "--out-filetype npy is allowed only for "
                "Global CMVN mode, changing to hdf5"
            )
            args.out_filetype = "hdf5"

    else:
        logging.info("Performing as global CMVN mode")
        if args.spk2utt is not None:
            logging.warning("spk2utt is not used for global CMVN mode")

        # Everything is accumulated under the single key None.
        def utt2spk(x):
            return None

        if args.out_filetype == "hdf5":
            logging.warning(
                "--out-filetype hdf5 is not allowed for "
                "Global CMVN mode, changing to npy"
            )
            args.out_filetype = "npy"

    if args.preprocess_conf is not None:
        preprocessing = Transformation(args.preprocess_conf)
        logging.info("Apply preprocessing: {}".format(preprocessing))
    else:
        preprocessing = None

    # Calculate stats for each speaker
    counts = {}
    sum_feats = {}
    square_sum_feats = {}

    # idx stays 0 when the reader yields nothing, which the assert below catches.
    idx = 0
    for idx, (utt, matrix) in enumerate(
        file_reader_helper(args.rspecifier, args.in_filetype), 1
    ):
        if is_scipy_wav_style(matrix):
            # If data is sound file, then got as Tuple[int, ndarray]
            rate, matrix = matrix
        if preprocessing is not None:
            matrix = preprocessing(matrix, uttid_list=utt)

        spk = utt2spk(utt)

        # Init at the first seen of the spk
        if spk not in counts:
            counts[spk] = 0
            feat_shape = matrix.shape[1:]
            # Accumulate in double precision
            sum_feats[spk] = np.zeros(feat_shape, dtype=np.float64)
            square_sum_feats[spk] = np.zeros(feat_shape, dtype=np.float64)

        # The first axis is summed over and counted (frames).
        counts[spk] += matrix.shape[0]
        sum_feats[spk] += matrix.sum(axis=0)
        square_sum_feats[spk] += (matrix ** 2).sum(axis=0)
    logging.info("Processed {} utterances".format(idx))
    assert idx > 0, idx

    cmvn_stats = {}
    for spk in counts:
        feat_shape = sum_feats[spk].shape
        # One extra slot along the feature axis stores the count.
        cmvn_shape = (2, feat_shape[0] + 1) + feat_shape[1:]
        _cmvn_stats = np.empty(cmvn_shape, dtype=np.float64)
        _cmvn_stats[0, :-1] = sum_feats[spk]
        _cmvn_stats[1, :-1] = square_sum_feats[spk]

        _cmvn_stats[0, -1] = counts[spk]
        _cmvn_stats[1, -1] = 0.0

        # You can get the mean and std as following,
        # >>> N = _cmvn_stats[0, -1]
        # >>> mean = _cmvn_stats[0, :-1] / N
        # >>> std = np.sqrt(_cmvn_stats[1, :-1] / N - mean ** 2)

        cmvn_stats[spk] = _cmvn_stats

    # Per utterance or speaker CMVN
    if is_wspecifier:
        with file_writer_helper(
            args.wspecifier_or_wxfilename, filetype=args.out_filetype
        ) as writer:
            for spk, mat in cmvn_stats.items():
                writer[spk] = mat

    # Global CMVN
    else:
        # Global mode stored everything under the key None (see utt2spk above).
        matrix = cmvn_stats[None]
        if args.out_filetype == "npy":
            np.save(args.wspecifier_or_wxfilename, matrix)
        elif args.out_filetype == "mat":
            # Kaldi supports only matrix or vector
            kaldiio.save_mat(args.wspecifier_or_wxfilename, matrix)
        else:
            raise RuntimeError(
                "Not supporting: --out-filetype {}".format(args.out_filetype)
            )


if __name__ == "__main__":
    main()


================================================
FILE: egs/espnet_utils/compute-fbank-feats.py
================================================
#!/usr/bin/env python3

# Copyright 2018 Nagoya University (Tomoki Hayashi)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

import argparse
from distutils.util import strtobool
import logging

import kaldiio
import numpy
import resampy

from espnet.transform.spectrogram import logmelspectrogram
from espnet.utils.cli_utils import get_commandline_args
from espnet.utils.cli_writers import file_writer_helper
from espnet2.utils.types import int_or_none


def get_parser():
    """Build the CLI parser for FBANK feature extraction from WAV input.

    Positional arguments are the WAV scp read specifier and the feature
    write specifier; all signal-processing parameters are optional.
    """
    parser = argparse.ArgumentParser(
        description="compute FBANK feature from WAV",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("--fs", type=int_or_none, help="Sampling frequency")
    parser.add_argument(
        "--fmax", type=int_or_none, default=None, nargs="?", help="Maximum frequency"
    )
    parser.add_argument(
        "--fmin", type=int_or_none, default=None, nargs="?", help="Minimum frequency"
    )
    parser.add_argument("--n_mels", type=int, default=80, help="Number of mel basis")
    parser.add_argument("--n_fft", type=int, default=1024, help="FFT length in point")
    parser.add_argument(
        "--n_shift", type=int, default=512, help="Shift length in point"
    )
    parser.add_argument(
        "--win_length",
        type=int_or_none,
        default=None,
        nargs="?",
        help="Analysis window length in point",
    )
    parser.add_argument(
        "--window",
        type=str,
        default="hann",
        choices=["hann", "hamming"],
        help="Type of window",
    )
    parser.add_argument(
        "--write-num-frames", type=str, help="Specify wspecifier for utt2num_frames"
    )
    parser.add_argument(
        "--filetype",
        type=str,
        default="mat",
        choices=["mat", "hdf5"],
        help="Specify the file format for output. "
        '"mat" is the matrix format in kaldi',
    )
    parser.add_argument(
        "--compress", type=strtobool, default=False, help="Save in compressed format"
    )
    parser.add_argument(
        "--compression-method",
        type=int,
        default=2,
        help="Specify the method(if mat) or " "gzip-level(if hdf5)",
    )
    parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
    parser.add_argument(
        "--normalize",
        choices=[1, 16, 24, 32],
        type=int,
        default=None,
        help="Give the bit depth of the PCM, "
        "then normalizes data to scale in [-1,1]",
    )
    parser.add_argument("rspecifier", type=str, help="WAV scp file")
    parser.add_argument(
        "--segments",
        type=str,
        # Adjacent literals are concatenated verbatim, so each fragment ends
        # with its own separator.
        help="segments-file format: each line is "
        '"<segment-id> <recording-id> <start-time> <end-time>", '
        'e.g. "call-861225-A-0050-0065 call-861225-A 5.0 6.5"',
    )
    parser.add_argument("wspecifier", type=str, help="Write specifier")
    return parser


def main():
    """Compute a log-mel spectrogram for every waveform and write it out.

    Each waveform is cast to float32, resampled to ``--fs`` when its rate
    differs, optionally scaled by the PCM bit depth given with ``--normalize``,
    and then passed to ``logmelspectrogram``.
    """
    args = get_parser().parse_args()

    log_level = logging.INFO if args.verbose > 0 else logging.WARN
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )
    logging.info(get_commandline_args())

    writer_options = dict(
        filetype=args.filetype,
        write_num_frames=args.write_num_frames,
        compress=args.compress,
        compression_method=args.compression_method,
    )
    with kaldiio.ReadHelper(
        args.rspecifier, segments=args.segments
    ) as wav_reader, file_writer_helper(
        args.wspecifier, **writer_options
    ) as feat_writer:
        for utt_id, (rate, samples) in wav_reader:
            samples = samples.astype(numpy.float32)

            must_resample = args.fs is not None and rate != args.fs
            if must_resample:
                samples = resampy.resample(samples, rate, args.fs, axis=0)

            # A bit depth of 1 (or no --normalize at all) leaves the data unscaled.
            if args.normalize not in (None, 1):
                samples = samples / (1 << (args.normalize - 1))

            # Fall back to the file's own rate when --fs was not given.
            target_fs = rate if args.fs is None else args.fs
            feat_writer[utt_id] = logmelspectrogram(
                x=samples,
                fs=target_fs,
                n_mels=args.n_mels,
                n_fft=args.n_fft,
                n_shift=args.n_shift,
                win_length=args.win_length,
                window=args.window,
                fmin=args.fmin,
                fmax=args.fmax,
            )


if __name__ == "__main__":
    main()


================================================
FILE: egs/espnet_utils/compute-stft-feats.py
================================================
#!/usr/bin/env python3

# Copyright 2018 Nagoya University (Tomoki Hayashi)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

import argparse
from distutils.util import strtobool
import logging

import kaldiio
import numpy
import resampy

from espnet.transform.spectrogram import spectrogram
from espnet.utils.cli_utils import get_commandline_args
from espnet.utils.cli_writers import file_writer_helper
from espnet2.utils.types import int_or_none


def get_parser():
    """Build the CLI parser for STFT feature extraction from WAV input.

    Positional arguments are the WAV scp read specifier and the feature
    write specifier; all signal-processing parameters are optional.
    """
    parser = argparse.ArgumentParser(
        description="compute STFT feature from WAV",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("--fs", type=int_or_none, help="Sampling frequency")
    parser.add_argument("--n_fft", type=int, default=1024, help="FFT length in point")
    parser.add_argument(
        "--n_shift", type=int, default=512, help="Shift length in point"
    )
    parser.add_argument(
        "--win_length",
        type=int_or_none,
        default=None,
        nargs="?",
        help="Analysis window length in point",
    )
    parser.add_argument(
        "--window",
        type=str,
        default="hann",
        choices=["hann", "hamming"],
        help="Type of window",
    )
    parser.add_argument(
        "--write-num-frames", type=str, help="Specify wspecifier for utt2num_frames"
    )
    parser.add_argument(
        "--filetype",
        type=str,
        default="mat",
        choices=["mat", "hdf5"],
        help="Specify the file format. " '"mat" is the matrix format in kaldi',
    )
    parser.add_argument(
        "--compress", type=strtobool, default=False, help="Save in compressed format"
    )
    parser.add_argument(
        "--compression-method",
        type=int,
        default=2,
        help="Specify the method(if mat) or " "gzip-level(if hdf5)",
    )
    parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
    parser.add_argument(
        "--normalize",
        choices=[1, 16, 24, 32],
        type=int,
        default=None,
        help="Give the bit depth of the PCM, "
        "then normalizes data to scale in [-1,1]",
    )
    parser.add_argument("rspecifier", type=str, help="WAV scp file")
    parser.add_argument(
        "--segments",
        type=str,
        # Adjacent literals are concatenated verbatim, so each fragment ends
        # with its own separator.
        help="segments-file format: each line is "
        '"<segment-id> <recording-id> <start-time> <end-time>", '
        'e.g. "call-861225-A-0050-0065 call-861225-A 5.0 6.5"',
    )
    parser.add_argument("wspecifier", type=str, help="Write specifier")
    return parser


def main():
    """Compute an STFT spectrogram for every waveform and write it out.

    Each waveform is cast to float32, resampled to ``--fs`` when its rate
    differs, optionally scaled by the PCM bit depth given with ``--normalize``,
    and then passed to ``spectrogram``.
    """
    args = get_parser().parse_args()

    log_level = logging.INFO if args.verbose > 0 else logging.WARN
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )
    logging.info(get_commandline_args())

    writer_options = dict(
        filetype=args.filetype,
        write_num_frames=args.write_num_frames,
        compress=args.compress,
        compression_method=args.compression_method,
    )
    with kaldiio.ReadHelper(
        args.rspecifier, segments=args.segments
    ) as wav_reader, file_writer_helper(
        args.wspecifier, **writer_options
    ) as feat_writer:
        for utt_id, (rate, samples) in wav_reader:
            samples = samples.astype(numpy.float32)

            must_resample = args.fs is not None and rate != args.fs
            if must_resample:
                samples = resampy.resample(samples, rate, args.fs, axis=0)

            # A bit depth of 1 (or no --normalize at all) leaves the data unscaled.
            if args.normalize not in (None, 1):
                samples = samples / (1 << (args.normalize - 1))

            feat_writer[utt_id] = spectrogram(
                x=samples,
                n_fft=args.n_fft,
                n_shift=args.n_shift,
                win_length=args.win_length,
                window=args.window,
            )


if __name__ == "__main__":
    main()


================================================
FILE: egs/espnet_utils/concat_json_multiref.py
================================================
#!/usr/bin/env python3
# encoding: utf-8

# Copyright 2018 Kyoto University (Hirofumi Inaguma)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

import argparse
import codecs
import json
import logging
import sys

from espnet.utils.cli_utils import get_commandline_args


def get_parser():
    """Return the CLI parser: one or more positional json file paths."""
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="concatenate multiple json files for data augmentation",
    )
    positional = dict(type=str, nargs="+", help="json files")
    cli.add_argument("jsons", **positional)
    return cli


def main():
    """Merge the "utts" of several json files and print the result as JSON.

    Utterances of the first file keep their keys; utterances of the i-th
    following file (i >= 1) are stored under ``"<key>.<i>"``. The merged
    dictionary is written to stdout as UTF-8 with sorted keys.
    """
    args = get_parser().parse_args()

    # logging info
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )
    logging.info(get_commandline_args())

    total_utts = 0
    merged = {}
    for file_index, json_path in enumerate(args.jsons):
        with codecs.open(json_path, encoding="utf-8") as f:
            utts = json.load(f)["utts"]
        logging.debug(json_path + ": has " + str(len(utts.keys())) + " utterances")

        total_utts += len(utts.keys())
        if file_index == 0:
            # The first file's dictionary becomes the base of the result.
            merged = utts
            continue
        index_suffix = "." + str(file_index)
        for utt_key in utts.keys():
            merged[utt_key + index_suffix] = utts[utt_key]

    logging.info("new json has " + str(total_utts) + " utterances")

    # ensure "ensure_ascii=False", which is a bug
    dump_options = dict(
        indent=4,
        sort_keys=True,
        ensure_ascii=False,
        separators=(",", ": "),
    )
    jsonstring = json.dumps({"utts": merged}, **dump_options)
    sys.stdout = codecs.getwriter("utf-8")(sys.stdout.buffer)
    print(jsonstring)


if __name__ == "__main__":
    main()


================================================
FILE: egs/espnet_utils/concatjson.py
================================================
#!/usr/bin/env python3
# encoding: utf-8

# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)


import argparse
import codecs
import json
import logging
import sys
import random
from espnet.utils.cli_utils import get_commandline_args

# True when running under a Python 2 interpreter; used at the end of the
# script to pick sys.stdout (py2) or sys.stdout.buffer (py3) for the UTF-8
# writer that wraps stdout.
is_python2 = sys.version_info[0] == 2


def get_parser():
    """Build the command-line parser of concatjson.py.

    Returns:
        argparse.ArgumentParser: Parser with the options ``--ark-size`` and
            ``--shuffle`` and one or more positional json paths (``jsons``).

    """
    parser = argparse.ArgumentParser(
        description="concatenate json files",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    # The help text used to be a copy of the positional argument's
    # ("json files"); describe what the option really does.
    parser.add_argument(
        "--ark-size",
        type=int,
        default=0,
        help="if positive, drop the trailing utterances of each input json "
        "so that its number of utterances is a multiple of this size",
    )
    parser.add_argument("--shuffle", action='store_true', help="shuffle the output")
    parser.add_argument("jsons", type=str, nargs="+", help="json files")
    return parser

def truncate_tail(d, size):
    """Drop trailing entries of ``d`` so its length is a multiple of ``size``.

    Args:
        d (dict): Mapping whose iteration order defines "leading"/"trailing".
        size (int): Group size.  A value <= 0 disables truncation.

    Returns:
        dict: New dict holding the first ``len(d) - len(d) % size`` items of
            ``d`` (all items when ``size`` <= 0).  ``d`` is not modified.

    """
    # size == 0 used to raise ZeroDivisionError and negative sizes already
    # returned every entry; treat both as "no truncation".
    if size <= 0:
        return dict(d)
    keys = list(d)
    keep = len(keys) - len(keys) % size
    return {k: d[k] for k in keys[:keep]}

if __name__ == "__main__":
    cli_args = get_parser().parse_args()

    # logging info
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )
    logging.info(get_commandline_args())

    # merge the utterance tables of all inputs into one dict; when a key
    # occurs in several files the entry of the later file replaces the earlier
    js = {}
    for json_path in cli_args.jsons:
        with codecs.open(json_path, encoding="utf-8") as f:
            loaded = json.load(f)
        utts = loaded["utts"]
        logging.debug(json_path + ": has " + str(len(utts.keys())) + " utterances")
        if cli_args.ark_size > 0:
            # keep a multiple of ark_size utterances from this file
            utts = truncate_tail(utts, cli_args.ark_size)
        js.update(utts)
    logging.info("new json has " + str(len(js.keys())) + " utterances")

    if cli_args.shuffle:
        # rebuild the dict in a random key order
        order = list(js.keys())
        random.shuffle(order)
        js = {k: js[k] for k in order}

    # ensure_ascii=False keeps non-ASCII characters unescaped; sort_keys is
    # off so the (possibly shuffled) insertion order is what gets printed
    jsonstring = json.dumps(
        {"utts": js},
        separators=(",", ": "),
        ensure_ascii=False,
        sort_keys=False,
        indent=4,
    )
    if is_python2:
        raw_stdout = sys.stdout
    else:
        raw_stdout = sys.stdout.buffer
    sys.stdout = codecs.getwriter("utf-8")(raw_stdout)
    print(jsonstring)


================================================
FILE: egs/espnet_utils/convert_fbank.sh
================================================
#!/usr/bin/env bash
# Set bash to strict mode; the script exits on:
# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline'.
# (-x 'print commands' is not enabled here.)
set -e
set -u
set -o pipefail


# Copyright 2018 Nagoya University (Tomoki Hayashi)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

# Begin configuration section.
# Defaults of the command-line options; empty values are forwarded as
# options without a value to convert_fbank_to_wav.py.
nj=4
fs=22050
fmax=
fmin=
n_fft=1024
n_shift=512
win_length=
n_mels=
iters=64
cmd=run.pl
# The defaults shown in the help text are interpolated from the variables
# above so that they cannot drift apart (the text used to claim
# n_shift=256 and n_mels=80).
help_message=$(cat <<EOF
Usage: $0 [options] <data-dir> [<log-dir> [<fbank-dir>] ]
e.g.: $0 data/train exp/griffin_lim/train wav
Note: <log-dir> defaults to <data-dir>/log, and <fbank-dir> defaults to <data-dir>/data
Options:
  --nj <nj>                  # number of parallel jobs
  --fs <fs>                  # sampling rate
  --fmax <fmax>              # maximum frequency
  --fmin <fmin>              # minimum frequency
  --n_fft <n_fft>            # number of FFT points (default=${n_fft})
  --n_shift <n_shift>        # shift size in point (default=${n_shift})
  --win_length <win_length>  # window length in point (default=${win_length})
  --n_mels <n_mels>          # number of mel basis (default=${n_mels})
  --iters <iters>            # number of Griffin-lim iterations (default=${iters})
  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs.
EOF
)
# End configuration section.

echo "$0 $*"  # Print the command line for logging

# Source the option parser (presumably the Kaldi-style parse_options.sh that
# overrides the configuration variables from "--name value" arguments).
. parse_options.sh || exit 1;

# One to three positional arguments are accepted.
if [ $# -lt 1 ] || [ $# -gt 3 ]; then
    echo "${help_message}"
    exit 1;
fi

set -euo pipefail

# Positional arguments: <data-dir> [<log-dir> [<fbank-dir>]].
data=$1
if [ $# -ge 2 ]; then
  logdir=$2
else
  logdir=${data}/log
fi
if [ $# -ge 3 ]; then
  wavdir=$3
else
  wavdir=${data}/data
fi

# use "name" as part of the log file names and in the final message.
name=$(basename ${data})

mkdir -p ${wavdir} || exit 1;
mkdir -p ${logdir} || exit 1;

scp=${data}/feats.scp

# Split feats.scp into ${nj} pieces, one per parallel job.
split_scps=""
for n in $(seq ${nj}); do
    split_scps="$split_scps $logdir/feats.$n.scp"
done

utils/split_scp.pl ${scp} ${split_scps} || exit 1;

# Run one conversion job per split.  Options whose variable is empty
# (fmax, fmin, win_length, n_mels by default) are passed without a value.
${cmd} JOB=1:${nj} ${logdir}/griffin_lim_${name}.JOB.log \
    convert_fbank_to_wav.py \
        --fs ${fs} \
        --fmax ${fmax} \
        --fmin ${fmin} \
        --win_length ${win_length} \
        --n_fft ${n_fft} \
        --n_shift ${n_shift} \
        --n_mels ${n_mels} \
        --iters ${iters} \
        scp:${logdir}/feats.JOB.scp \
        ${wavdir}

# Remove the temporary per-job scp files.
rm ${logdir}/feats.*.scp 2>/dev/null

echo "Succeeded creating wav for $name"


================================================
FILE: egs/espnet_utils/convert_fbank_to_wav.py
================================================
#!/usr/bin/env python3

# Copyright 2018 Nagoya University (Tomoki Hayashi)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

import argparse
import logging
import os

from distutils.version import LooseVersion

import librosa
import numpy as np
from scipy.io.wavfile import write

from espnet.utils.cli_readers import file_reader_helper
from espnet.utils.cli_utils import get_commandline_args


EPS = 1e-10


def logmelspc_to_linearspc(lmspc, fs, n_mels, n_fft, fmin=None, fmax=None):
    """Convert log Mel filterbank to linear spectrogram.

    The input is taken as log10 of the Mel spectrogram; it is mapped back to
    the linear frequency axis with the pseudo-inverse of the Mel basis and
    floored at ``EPS``.

    Args:
        lmspc (ndarray): Log Mel filterbank (T, n_mels).
        fs (int): Sampling frequency.
        n_mels (int): Number of mel basis.
        n_fft (int): Number of FFT points.
        fmin (int, optional): Minimum frequency to analyze (0 if None).
        fmax (int, optional): Maximum frequency to analyze (fs / 2 if None).

    Returns:
        ndarray: Linear spectrogram (T, n_fft // 2 + 1).

    """
    assert lmspc.shape[1] == n_mels
    fmin = 0 if fmin is None else fmin
    fmax = fs / 2 if fmax is None else fmax
    mspc = np.power(10.0, lmspc)
    # Keyword arguments are accepted by every librosa release, whereas the
    # positional form is rejected by versions with keyword-only parameters.
    mel_basis = librosa.filters.mel(
        sr=fs, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax
    )
    inv_mel_basis = np.linalg.pinv(mel_basis)
    spc = np.maximum(EPS, np.dot(inv_mel_basis, mspc.T).T)

    return spc


def griffin_lim(spc, n_fft, n_shift, win_length, window="hann", n_iters=100):
    """Convert linear spectrogram into waveform using Griffin-Lim.

    Uses ``librosa.griffinlim`` when librosa >= 0.7.0 is installed and falls
    back to an explicit istft/stft phase-estimation loop otherwise.

    Args:
        spc (ndarray): Linear spectrogram (T, n_fft // 2 + 1).
        n_fft (int): Number of FFT points.
        n_shift (int): Shift size in points.
        win_length (int): Window length in points.
        window (str, optional): Window function type.
        n_iters (int, optional): Number of iterations of Griffin-Lim Algorithm.

    Returns:
        ndarray: Reconstructed waveform (N,).

    """
    # assert the size of input linear spectrogram
    assert spc.shape[1] == n_fft // 2 + 1

    if LooseVersion(librosa.__version__) >= LooseVersion("0.7.0"):
        # use librosa's fast Griffin-Lim algorithm
        spc = np.abs(spc.T)
        y = librosa.griffinlim(
            S=spc,
            n_iter=n_iters,
            hop_length=n_shift,
            win_length=win_length,
            window=window,
            # centering is switched off for a single-frame input
            center=spc.shape[1] > 1,
        )
    else:
        # use slower version of Griffin-Lim algorithm
        logging.warning(
            "librosa version is old. use slow version of Grriffin-Lim algorithm."
            "if you want to use fast Griffin-Lim, please update librosa via "
            "`source ./path.sh && pip install librosa==0.7.0`."
        )
        # The alias np.complex no longer exists in current NumPy; the builtin
        # complex selects the same dtype (complex128).
        cspc = np.abs(spc).astype(complex).T
        # start from random phases and refine them iteratively
        angles = np.exp(2j * np.pi * np.random.rand(*cspc.shape))
        y = librosa.istft(cspc * angles, n_shift, win_length, window=window)
        for i in range(n_iters):
            angles = np.exp(
                1j
                * np.angle(librosa.stft(y, n_fft, n_shift, win_length, window=window))
            )
            y = librosa.istft(cspc * angles, n_shift, win_length, window=window)

    return y


def get_parser():
    """Build the command-line parser of convert_fbank_to_wav.py.

    Options declared with ``nargs="?"`` (fmax, fmin, win_length, n_mels) may
    be given without a value, in which case they are parsed as ``None``.

    Returns:
        argparse.ArgumentParser: The configured parser.

    """
    parser = argparse.ArgumentParser(
        description="convert FBANK to WAV using Griffin-Lim algorithm",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    optional_int = dict(type=int, default=None, nargs="?")
    # (name, add_argument keywords), registered in this order
    specs = [
        ("--fs", dict(type=int, default=22050, help="Sampling frequency")),
        ("--fmax", dict(optional_int, help="Maximum frequency")),
        ("--fmin", dict(optional_int, help="Minimum frequency")),
        ("--n_fft", dict(type=int, default=1024, help="FFT length in point")),
        ("--n_shift", dict(type=int, default=512, help="Shift length in point")),
        ("--win_length", dict(optional_int, help="Analisys window length in point")),
        ("--n_mels", dict(optional_int, help="Number of mel basis")),
        (
            "--window",
            dict(
                type=str,
                default="hann",
                choices=["hann", "hamming"],
                help="Type of window",
            ),
        ),
        (
            "--iters",
            dict(type=int, default=100, help="Number of iterations in Grriffin Lim"),
        ),
        (
            "--filetype",
            dict(
                type=str,
                default="mat",
                choices=["mat", "hdf5"],
                help="Specify the file format for the rspecifier. "
                '"mat" is the matrix format in kaldi',
            ),
        ),
        ("rspecifier", dict(type=str, help="Input feature")),
        ("outdir", dict(type=str, help="Output directory")),
    ]
    for name, keywords in specs:
        parser.add_argument(name, **keywords)
    return parser


def main():
    """Convert every feature matrix of the rspecifier into a 16-bit wav file.

    Each utterance is optionally mapped from log Mel to a linear spectrogram
    (only when ``--n_mels`` is given), passed through Griffin-Lim and written
    to ``<outdir>/<utt_id>.wav``.

    """
    args = get_parser().parse_args()

    # logging info
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )
    logging.info(get_commandline_args())

    # check directory
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # scale factor from the float waveform to the int16 sample range
    int16_max = np.iinfo(np.int16).max

    features = file_reader_helper(args.rspecifier, args.filetype)
    for idx, (utt_id, lmspc) in enumerate(features, 1):
        if args.n_mels is None:
            # without --n_mels the input is used as a linear spectrogram
            spc = lmspc
        else:
            spc = logmelspc_to_linearspc(
                lmspc,
                fs=args.fs,
                n_mels=args.n_mels,
                n_fft=args.n_fft,
                fmin=args.fmin,
                fmax=args.fmax,
            )
        y = griffin_lim(
            spc,
            n_fft=args.n_fft,
            n_shift=args.n_shift,
            win_length=args.win_length,
            window=args.window,
            n_iters=args.iters,
        )
        logging.info("(%d) %s" % (idx, utt_id))
        wav_path = args.outdir + "/%s.wav" % utt_id
        write(wav_path, args.fs, (y * int16_max).astype(np.int16))


if __name__ == "__main__":
    main()


================================================
FILE: egs/espnet_utils/copy-feats.py
================================================
#!/usr/bin/env python3
import argparse
from distutils.util import strtobool
import logging

from espnet.transform.transformation import Transformation
from espnet.utils.cli_readers import file_reader_helper
from espnet.utils.cli_utils import get_commandline_args
from espnet.utils.cli_utils import is_scipy_wav_style
from espnet.utils.cli_writers import file_writer_helper


def get_parser():
    """Build the command-line parser of copy-feats.py.

    Returns:
        argparse.ArgumentParser: Parser with the file-type, compression and
            preprocessing options followed by the positional ``rspecifier``
            and ``wspecifier``.

    """
    parser = argparse.ArgumentParser(
        description="copy feature with preprocessing",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    filetypes = ["mat", "hdf5", "sound.hdf5", "sound"]
    # (names, add_argument keywords), registered in this order
    specs = [
        (
            ("--verbose", "-V"),
            dict(default=0, type=int, help="Verbose option"),
        ),
        (
            ("--in-filetype",),
            dict(
                type=str,
                default="mat",
                choices=list(filetypes),
                help="Specify the file format for the rspecifier. "
                '"mat" is the matrix format in kaldi',
            ),
        ),
        (
            ("--out-filetype",),
            dict(
                type=str,
                default="mat",
                choices=list(filetypes),
                help="Specify the file format for the wspecifier. "
                '"mat" is the matrix format in kaldi',
            ),
        ),
        (
            ("--write-num-frames",),
            dict(type=str, help="Specify wspecifer for utt2num_frames"),
        ),
        (
            ("--compress",),
            dict(type=strtobool, default=False, help="Save in compressed format"),
        ),
        (
            ("--compression-method",),
            dict(
                type=int,
                default=2,
                help="Specify the method(if mat) or " "gzip-level(if hdf5)",
            ),
        ),
        (
            ("--preprocess-conf",),
            dict(
                type=str,
                default=None,
                help="The configuration file for the pre-processing",
            ),
        ),
        (
            ("rspecifier",),
            dict(type=str, help="Read specifier for feats. e.g. ark:some.ark"),
        ),
        (
            ("wspecifier",),
            dict(type=str, help="Write specifier. e.g. ark:some.ark"),
        ),
    ]
    for names, keywords in specs:
        parser.add_argument(*names, **keywords)
    return parser


def main():
    """Copy features from the rspecifier to the wspecifier.

    Each entry is optionally run through the ``Transformation`` built from
    ``--preprocess-conf`` before it is handed to the writer.  For the sound
    output types the entry is written as a ``(rate, array)`` tuple.

    """
    args = get_parser().parse_args()

    # logging info
    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    level = logging.INFO if args.verbose > 0 else logging.WARN
    logging.basicConfig(level=level, format=logfmt)
    logging.info(get_commandline_args())

    preprocessing = None
    if args.preprocess_conf is not None:
        preprocessing = Transformation(args.preprocess_conf)
        logging.info("Apply preprocessing: {}".format(preprocessing))

    writes_sound = args.out_filetype in ["sound.hdf5", "sound"]
    writer_options = dict(
        filetype=args.out_filetype,
        write_num_frames=args.write_num_frames,
        compress=args.compress,
        compression_method=args.compression_method,
    )
    with file_writer_helper(args.wspecifier, **writer_options) as writer:
        for utt, mat in file_reader_helper(args.rspecifier, args.in_filetype):
            if is_scipy_wav_style(mat):
                # If data is sound file, then got as Tuple[int, ndarray]
                rate, mat = mat

            if preprocessing is not None:
                mat = preprocessing(mat, uttid_list=utt)

            # shape = (Time, Channel)
            # NOTE(review): "rate" is only bound when the input entry was
            # scipy-wav style; sound output from other input is not handled.
            # Write Tuple[int, numpy.ndarray] (scipy style) for sound output
            writer[utt] = (rate, mat) if writes_sound else mat


if __name__ == "__main__":
    main()


================================================
FILE: egs/espnet_utils/data2json.sh
================================================
#!/usr/bin/env bash

# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

echo "$0 $*" >&2 # Print the command line for logging
. ./path.sh

# Option defaults; each can be overridden on the command line through
# utils/parse_options.sh, which is sourced below.
nj=10
cmd=run.pl
# passed to text2token.py with -l when non-empty
nlsyms=""
# when non-empty, written for every utterance into other/lang.scp
lang=""
feat="" # feat.scp
# symbol handed to sym2int.pl --map-oov
oov="<unk>"
# when non-empty, used as the spm_encode model to tokenize the text
bpecode=""
allow_one_column=false
verbose=0
trans_type=char
filetype=""
preprocess_conf=""
# when non-empty, written for every utterance into other/category.scp
category=""
# when non-empty, this file is copied to other/text_org.scp
text_org=""
out="" # If omitted, write in stdout

# transcription file; defaults to <data-dir>/text when left empty
text=""
multilingual=false

help_message=$(cat << EOF
Usage: $0 <data-dir> <dict>
e.g. $0 data/train data/lang_1char/train_units.txt
Options:
  --nj <nj>                                        # number of parallel jobs
  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs.
  --feat <feat-scp>                                # feat.scp or feat1.scp,feat2.scp,...
  --oov <oov-word>                                 # Default: <unk>
  --out <outputfile>                               # If omitted, write in stdout
  --filetype <mat|hdf5|sound.hdf5>                 # Specify the format of feats file
  --preprocess-conf <json>                         # Apply preprocess to feats when creating shape.scp
  --verbose <num>                                  # Default: 0
EOF
)
. utils/parse_options.sh

# Exactly two positional arguments are required: <data-dir> and <dict>.
if [ $# != 2 ]; then
    echo $@
    echo "${help_message}" 1>&2
    exit 1;
fi

set -euo pipefail

dir=$1
dic=$2
# All intermediate scp files live in a temporary directory inside <data-dir>
# that is removed when the script exits.
tmpdir=$(mktemp -d ${dir}/tmp-XXXXX)
trap 'rm -rf ${tmpdir}' EXIT

if [ -z ${text} ]; then
    text=${dir}/text
fi

# 1. Create scp files for inputs
#   These are not necessary for decoding mode, and make it as an option
# "input" collects the names of the created input_<i> directories.
input=
if [ -n "${feat}" ]; then
    # --feat may hold several comma-separated scp files; split them into an array
    _feat_scps=$(echo "${feat}" | tr ',' ' ' )
    read -r -a feat_scps <<< $_feat_scps
    num_feats=${#feat_scps[@]}

    for (( i=1; i<=num_feats; i++ )); do
        # NOTE: "feat" is reused as the loop variable and keeps the last scp
        # path afterwards; step 4 only tests it for being non-empty.
        feat=${feat_scps[$((i-1))]}
        mkdir -p ${tmpdir}/input_${i}
        input+="input_${i} "
        cat ${feat} > ${tmpdir}/input_${i}/feat.scp

        # Dump in the "legacy" style JSON format
        if [ -n "${filetype}" ]; then
            awk -v filetype=${filetype} '{print $1 " " filetype}' ${feat} \
                > ${tmpdir}/input_${i}/filetype.scp
        fi

        # write the shape of every feature entry to shape.scp
        feat_to_shape.sh --cmd "${cmd}" --nj ${nj} \
            --filetype "${filetype}" \
            --preprocess-conf "${preprocess_conf}" \
            --verbose ${verbose} ${feat} ${tmpdir}/input_${i}/shape.scp
    done
fi

# 2. Create scp files for outputs
mkdir -p ${tmpdir}/output
# token.scp: "<utt-id> <token> <token> ...", produced either by sentencepiece
# (when --bpecode is given) or by text2token.py.
if [ -n "${bpecode}" ]; then
    if [ ${multilingual} = true ]; then
        # remove a space before the language ID
        paste -d " " <(awk '{print $1}' ${text}) <(cut -f 2- -d" " ${text} \
            | spm_encode --model=${bpecode} --output_format=piece | cut -f 2- -d" ") \
            > ${tmpdir}/output/token.scp
    else
        paste -d " " <(awk '{print $1}' ${text}) <(cut -f 2- -d" " ${text} \
            | spm_encode --model=${bpecode} --output_format=piece --split-chn) \
            > ${tmpdir}/output/token.scp
    fi
elif [ -n "${nlsyms}" ]; then
    text2token.py -s 1 -n 1 -l ${nlsyms} ${text} --trans_type ${trans_type} > ${tmpdir}/output/token.scp
else
    text2token.py -s 1 -n 1 ${text} --trans_type ${trans_type} > ${tmpdir}/output/token.scp
fi
# tokenid.scp: the same lines with tokens mapped through the dictionary;
# unknown tokens are mapped to ${oov}.
< ${tmpdir}/output/token.scp utils/sym2int.pl --map-oov ${oov} -f 2- ${dic} > ${tmpdir}/output/tokenid.scp
# +2 comes from CTC blank and EOS
# (vocsize is the second column of the last dictionary line)
vocsize=$(tail -n 1 ${dic} | awk '{print $2}')
odim=$(echo "$vocsize + 2" | bc)
# shape.scp: "<utt-id> <number of token ids>,<odim>"
< ${tmpdir}/output/tokenid.scp awk -v odim=${odim} '{print $1 " " NF-1 "," odim}' > ${tmpdir}/output/shape.scp

cat ${text} > ${tmpdir}/output/text.scp


# 3. Create scp files for the others
mkdir -p ${tmpdir}/other
if [ ${multilingual} == true ]; then
    # the language is the last "-"-separated field of the utterance id
    awk '{
        n = split($1,S,"[-]");
        lang=S[n];
        print $1 " " lang
    }' ${text} > ${tmpdir}/other/lang.scp
elif [ -n "${lang}" ]; then
    awk -v lang=${lang} '{print $1 " " lang}' ${text} > ${tmpdir}/other/lang.scp
fi

# NOTE: the utterance ids for category.scp are read from ${dir}/text,
# not from ${text}.
if [ -n "${category}" ]; then
    awk -v category=${category} '{print $1 " " category}' ${dir}/text \
        > ${tmpdir}/other/category.scp
fi
cat ${dir}/utt2spk > ${tmpdir}/other/utt2spk.scp

if [ -n "${text_org}" ]; then
    cp $text_org ${tmpdir}/other/text_org.scp
fi

# 4. Merge scp files into a JSON file
# "opts" accumulates the merge_scp2json.py arguments; it is expanded unquoted
# below on purpose so that it is split into separate words.
opts=""
if [ -n "${feat}" ]; then
    intypes="${input} output other"
else
    intypes="output other"
fi
for intype in ${intypes}; do
    # skip directories that contain no scp file
    if [ -z "$(find "${tmpdir}/${intype}" -name "*.scp")" ]; then
        continue
    fi

    # input_<i> -> --input-scps, output -> --output-scps, other -> --scps
    if [ ${intype} != other ]; then
        opts+="--${intype%_*}-scps "
    else
        opts+="--scps "
    fi

    # each scp is passed as <key>:<path>; shape files get the extra ":shape" field
    for x in "${tmpdir}/${intype}"/*.scp; do
        k=$(basename ${x} .scp)
        if [ ${k} = shape ]; then
            opts+="shape:${x}:shape "
        else
            opts+="${k}:${x} "
        fi
    done
done

if ${allow_one_column}; then
    opts+="--allow-one-column true "
else
    opts+="--allow-one-column false "
fi

if [ -n "${out}" ]; then
    opts+="-O ${out}"
fi
merge_scp2json.py --verbose ${verbose} ${opts}

# explicit cleanup; the EXIT trap installed earlier removes the same directory
rm -fr ${tmpdir}


================================================
FILE: egs/espnet_utils/divide_lang.sh
================================================
#!/bin/bash

# Copyright 2021 Kyoto University (Hirofumi Inaguma)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

. ./path.sh

# Exactly two arguments are required: the data set name and ONE (quoted)
# argument holding the space-separated language codes.  The usage example
# used to show a single argument, which this very check rejects.
if [ "$#" -ne 2 ]; then
    echo "Usage: $0 <set> <langs divided by space>"
    echo "e.g.: $0 dev \"de es fr\""
    exit 1
fi

# <set>: name of the data directory below data/
set=$1
# <langs>: space-separated language codes, iterated over below
langs=$2

# Copy stuff into its final locations [this has been moved from the format_data script]
for lang in ${langs}; do
    mkdir -p data/${set}.${lang}
    # language-independent files are copied (sorted) when they exist
    for f in spk2utt utt2spk segments wav.scp feats.scp utt2num_frames; do
        if [ -f data/${set}/${f} ]; then
            sort data/${set}/${f} > data/${set}.${lang}/${f}
        fi
    done
    sort data/${set}/text.lc.rm.${lang} > data/${set}.${lang}/text  # dummy
    # per-language transcriptions text.<case>.<lang> become text.<case>
    for case in lc.rm lc tc; do
        sort data/${set}/text.${case}.${lang} > data/${set}.${lang}/text.${case}
    done
    utils/fix_data_dir.sh --utt_extra_files "text.tc text.lc text.lc.rm" data/${set}.${lang}
    # validate with or without features depending on whether feats.scp exists
    if [ -f data/${set}.${lang}/feats.scp ]; then
        utils/validate_data_dir.sh data/${set}.${lang} || exit 1;
    else
        utils/validate_data_dir.sh --no-feats --no-wav data/${set}.${lang} || exit 1;
    fi
done


================================================
FILE: egs/espnet_utils/double_precious_cer.py
================================================
import sys

# Path of the report to scan, given as the first command-line argument.
in_f = sys.argv[1]

# Use a context manager so the file is closed deterministically instead of
# being left to the garbage collector.
with open(in_f, encoding="utf-8") as f:
    for line in f:
        # Only table rows that contain "Sum" and "|" but not "Avg" are used
        # (presumably the "Sum" row of a scoring report -- TODO confirm).
        if "Sum" in line and "|" in line and "Avg" not in line:
            fields = line.strip().split()
            # whitespace-separated columns 4 and 10 are read as the total
            # count and the error count
            tot = fields[4]
            err = fields[10]
            cer = float(err) / float(tot) * 100
            print("CER: {:.3f}".format(cer))


================================================
FILE: egs/espnet_utils/download_from_google_drive.sh
================================================
#!/usr/bin/env bash

# Download zip, tar, or tar.gz file from google drive

# Copyright 2019 Tomoki Hayashi
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

# Positional arguments: <share-url> [<download_dir> [<file_ext>]]
share_url=$1
download_dir=${2:-"downloads"}
file_ext=${3:-"zip"}

# Print the usage for --help (exit status 0) or for a wrong number of
# arguments (exit status 1).
if [ "$1" = "--help" ] || [ $# -lt 1 ] || [ $# -gt 3 ]; then
   echo "Usage: $0 <share-url> [<download_dir> <file_ext>]";
   echo "e.g.: $0 https://drive.google.com/open?id=1zF88bRNbJhw9hNBq3NrDg8vnGGibREmg downloads zip"
   echo "Options:"
   echo "    <download_dir>: directory to save downloaded file. (Default=downloads)"
   echo "    <file_ext>: file extension of the file to be downloaded. (Default=zip)"
   if [ "$1" = "--help" ]; then
       exit 0;
   fi
   exit 1;
fi

# The archive is downloaded to a temporary file inside the download directory
# whose name ends with the requested extension.
[ ! -e "${download_dir}" ] && mkdir -p "${download_dir}"
tmp=$(mktemp "${download_dir}/XXXXXX.${file_ext}")

# file id in google drive can be obtain from sharing link
# ref: https://qiita.com/namakemono/items/c963e75e0af3f7eed732
# (the id is taken as the text after the first "=" of the url)
file_id=$(echo "${share_url}" | cut -d"=" -f 2)

# define decompressor
# Extract an archive into a directory, choosing the tool by file extension.
#   $1: path of the archive (*.zip, *.tar, *.tar.<compression> or *.tgz)
#   $2: directory to extract into
# Exits the script with status 1 for any other extension.
decompress () {
    filename=$1
    decompress_dir=$2
    # Match the extension at the END of the name.  The former
    # `grep -q ".zip"` was an unanchored regex in which "." matches any
    # character, so e.g. "unzipped/abc.tar.gz" was handed to unzip.
    case "${filename}" in
        *.zip)
            unzip "${filename}" -d "${decompress_dir}"
            ;;
        *.tar|*.tar.*|*.tgz)
            # no "z": tar detects the compression itself when reading a
            # file, which also makes plain .tar archives work
            tar xvf "${filename}" -C "${decompress_dir}"
            ;;
        *)
            echo "Unsupported file extension." >&2 && exit 1
            ;;
    esac
}

# From here on any failing command aborts the script (the temporary file is
# then left in place).
set -e
# Solution from https://github.com/wkentaro/gdown
gdown --id "${file_id}" -O "${tmp}"
decompress "${tmp}" "${download_dir}"

# remove tmpfiles
rm "${tmp}"
echo "Sucessfully downloaded ${file_ext} file from ${share_url}"


================================================
FILE: egs/espnet_utils/dump-pcm.py
================================================
#!/usr/bin/env python3
import argparse
from distutils.util import strtobool
import logging

import kaldiio
import numpy

from espnet.transform.transformation import Transformation
from espnet.utils.cli_utils import get_commandline_args
from espnet.utils.cli_writers import file_writer_helper


def get_parser():
    """Build the command-line parser of dump-pcm.py.

    Returns:
        argparse.ArgumentParser: Parser with the output-format, normalization
            and preprocessing options and the positional ``rspecifier`` and
            ``wspecifier``.

    """
    parser = argparse.ArgumentParser(
        description="dump PCM files from a WAV scp file",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    # (names, add_argument keywords), registered in this order
    specs = [
        (
            ("--write-num-frames",),
            dict(type=str, help="Specify wspecifer for utt2num_frames"),
        ),
        (
            ("--filetype",),
            dict(
                type=str,
                default="mat",
                choices=["mat", "hdf5", "sound.hdf5", "sound"],
                help="Specify the file format for output. "
                '"mat" is the matrix format in kaldi',
            ),
        ),
        (
            ("--format",),
            dict(
                type=str,
                default=None,
                help="The file format for output pcm. "
                "This option is only valid "
                'when "--filetype" is "sound.hdf5" or "sound"',
            ),
        ),
        (
            ("--compress",),
            dict(type=strtobool, default=False, help="Save in compressed format"),
        ),
        (
            ("--compression-method",),
            dict(
                type=int,
                default=2,
                help="Specify the method(if mat) or " "gzip-level(if hdf5)",
            ),
        ),
        (
            ("--verbose", "-V"),
            dict(default=0, type=int, help="Verbose option"),
        ),
        (
            ("--normalize",),
            dict(
                choices=[1, 16, 24, 32],
                type=int,
                default=None,
                help="Give the bit depth of the PCM, "
                "then normalizes data to scale in [-1,1]",
            ),
        ),
        (
            ("--preprocess-conf",),
            dict(
                type=str,
                default=None,
                help="The configuration file for the pre-processing",
            ),
        ),
        (
            ("--keep-length",),
            dict(
                type=strtobool,
                default=True,
                help="Truncating or zero padding if the output length "
                "is changed from the input by preprocessing",
            ),
        ),
        (("rspecifier",), dict(type=str, help="WAV scp file")),
        (
            ("--segments",),
            dict(
                type=str,
                help="segments-file format: each line is either"
                "<segment-id> <recording-id> <start-time> <end-time>"
                "e.g. call-861225-A-0050-0065 call-861225-A 5.0 6.5",
            ),
        ),
        (("wspecifier",), dict(type=str, help="Write specifier")),
    ]
    for names, keywords in specs:
        parser.add_argument(*names, **keywords)
    return parser


def main():
    """Dump the PCM data referenced by a WAV scp file through a writer.

    Every ``(rate, array)`` entry read by ``kaldiio.ReadHelper`` is
    optionally converted to float32 (for the "mat" output type), reshaped to
    (Time, Channel), scaled by the ``--normalize`` bit depth and passed
    through the ``--preprocess-conf`` transformation.  With ``--keep-length``
    the preprocessed signal is truncated or zero padded back to the length
    of its input.

    """
    parser = get_parser()
    args = parser.parse_args()

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    if args.preprocess_conf is not None:
        preprocessing = Transformation(args.preprocess_conf)
        logging.info("Apply preprocessing: {}".format(preprocessing))
    else:
        preprocessing = None

    with file_writer_helper(
        args.wspecifier,
        filetype=args.filetype,
        write_num_frames=args.write_num_frames,
        compress=args.compress,
        compression_method=args.compression_method,
        pcm_format=args.format,
    ) as writer:
        for utt_id, (rate, array) in kaldiio.ReadHelper(args.rspecifier, args.segments):
            if args.filetype == "mat":
                # Kaldi-matrix doesn't support integer
                array = array.astype(numpy.float32)

            if array.ndim == 1:
                # (Time) -> (Time, Channel)
                array = array[:, None]

            if args.normalize is not None and args.normalize != 1:
                # divide by 2 ** (bit depth - 1) to scale into [-1, 1]
                array = array.astype(numpy.float32)
                array = array / (1 << (args.normalize - 1))

            if preprocessing is not None:
                orgtype = array.dtype
                out = preprocessing(array, uttid_list=utt_id)
                out = out.astype(orgtype)

                if args.keep_length:
                    # The two branches used to be swapped: a longer output
                    # was padded even further and a shorter one was sliced
                    # with the no-op out[:len(out)].
                    if len(out) > len(array):
                        # truncate the extra frames
                        out = out[: len(array)]
                    elif len(out) < len(array):
                        # The length can be changed by stft, for example.
                        # zero pad along the time axis only
                        out = numpy.pad(
                            out,
                            [(0, len(array) - len(out))]
                            + [(0, 0) for _ in range(out.ndim - 1)],
                            mode="constant",
                        )

                array = out

            # shape = (Time, Channel)
            if args.filetype in ["sound.hdf5", "sound"]:
                # Write Tuple[int, numpy.ndarray] (scipy style)
                writer[utt_id] = (rate, array)
            else:
                writer[utt_id] = array


if __name__ == "__main__":
    main()


================================================
FILE: egs/espnet_utils/dump.sh
================================================
#!/usr/bin/env bash

# Copyright 2017 Nagoya University (Tomoki Hayashi)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

echo "$0 $*"  # Print the command line for logging
. ./path.sh

# Option defaults, overridable through utils/parse_options.sh (sourced below).
cmd=run.pl
# if true, add-deltas is inserted into the feature pipeline
do_delta=false
nj=1
verbose=0
compress=true
write_utt2num_frames=true
filetype='mat'  # mat or hdf5
help_message="Usage: $0 <scp> <cmvnark> <logdir> <dumpdir>"

. utils/parse_options.sh

# Positional arguments (their number is checked right after).
scp=$1
# NOTE: "cvmnark" holds the <cmvnark> argument; the spelling is kept because
# the variable is referenced below.
cvmnark=$2
logdir=$3
dumpdir=$4

if [ $# != 4 ]; then
    echo "${help_message}"
    exit 1;
fi

set -euo pipefail

mkdir -p ${logdir}
mkdir -p ${dumpdir}

# make ${dumpdir} an absolute pathname.
dumpdir=$(perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' ${dumpdir} ${PWD})

for n in $(seq ${nj}); do
    # the next command does nothing unless $dumpdir/storage/ exists, see
    # utils/create_data_link.pl for more info.
    utils/create_data_link.pl ${dumpdir}/feats.${n}.ark
done

# optional argument that makes copy-feats.py write a per-job utt2num_frames
if ${write_utt2num_frames}; then
    write_num_frames_opt="--write-num-frames=ark,t:$dumpdir/utt2num_frames.JOB"
else
    write_num_frames_opt=
fi

# split scp file
split_scps=""
for n in $(seq ${nj}); do
    split_scps="$split_scps $logdir/feats.$n.scp"
done

utils/split_scp.pl ${scp} ${split_scps} || exit 1;

# dump features
# Per job: apply-cmvn (with variance normalization) -> optional add-deltas ->
# copy-feats.py, which writes feats.JOB.ark / feats.JOB.scp into ${dumpdir}.
# The pipes are escaped so that they are passed on to ${cmd}.
if ${do_delta}; then
    ${cmd} JOB=1:${nj} ${logdir}/dump_feature.JOB.log \
        apply-cmvn --norm-vars=true ${cvmnark} scp:${logdir}/feats.JOB.scp ark:- \| \
        add-deltas ark:- ark:- \| \
        copy-feats.py --verbose ${verbose} --out-filetype ${filetype} \
            --compress=${compress} --compression-method=2 ${write_num_frames_opt} \
            ark:- ark,scp:${dumpdir}/feats.JOB.ark,${dumpdir}/feats.JOB.scp \
        || exit 1
else
    ${cmd} JOB=1:${nj} ${logdir}/dump_feature.JOB.log \
        apply-cmvn --norm-vars=true ${cvmnark} scp:${logdir}/feats.JOB.scp ark:- \| \
        copy-feats.py --verbose ${verbose} --out-filetype ${filetype} \
            --compress=${compress} --compression-method=2 ${write_num_frames_opt} \
            ark:- ark,scp:${dumpdir}/feats.JOB.ark,${dumpdir}/feats.JOB.scp \
        || exit 1
fi

# concatenate scp files
for n in $(seq ${nj}); do
    cat ${dumpdir}/feats.${n}.scp || exit 1;
done > ${dumpdir}/feats.scp || exit 1

# merge the per-job utt2num_frames files and remove the pieces
if ${write_utt2num_frames}; then
    for n in $(seq ${nj}); do
        cat ${dumpdir}/utt2num_frames.${n} || exit 1;
    done > ${dumpdir}/utt2num_frames || exit 1
    rm ${dumpdir}/utt2num_frames.* 2>/dev/null
fi

# Write the filetype, this will be used for data2json.sh
echo ${filetype} > ${dumpdir}/filetype


# remove temp scps
rm ${logdir}/feats.*.scp 2>/dev/null
if [ ${verbose} -eq 1 ]; then
    echo "Succeeded dumping features for training"
fi


================================================
FILE: egs/espnet_utils/dump_pcm.sh
================================================
#!/usr/bin/env bash

# Begin configuration section.
# Defaults of the command-line options (overridable via parse_options.sh).
nj=4
cmd=run.pl
compress=false
write_utt2num_frames=false # if true writes utt2num_frames
verbose=2
filetype=mat # mat or hdf5
# forwarded to dump-pcm.py as --keep-length
keep_length=true
# forwarded to dump-pcm.py as --format when filetype is sound.hdf5
format=wav
# End configuration section.

help_message=$(cat <<EOF
Usage: $0 [options] <data-dir> [<log-dir> [<pcm-dir>] ]
e.g.: $0 data/train exp/dump_pcm/train pcm
Note: <log-dir> defaults to <data-dir>/log, and <pcm-dir> defaults to <data-dir>/data
Options:
  --nj <nj>                                        # number of parallel jobs
  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs.
  --write-utt2num-frames <true|false>     # If true, write utt2num_frames file.
  --filetype <mat|hdf5|sound.hdf5>                 # Specify the format of feats file
EOF
)
echo "$0 $*"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# One to three positional arguments are accepted.
if [ $# -lt 1 ] || [ $# -gt 3 ]; then
    echo "${help_message}"
    exit 1;
fi

set -euo pipefail

# Positional arguments: <data-dir> [<log-dir> [<pcm-dir>]].
data=$1
if [ $# -ge 2 ]; then
  logdir=$2
else
  logdir=${data}/log
fi
if [ $# -ge 3 ]; then
  pcmdir=$3
else
  pcmdir=${data}/data
fi


# make $pcmdir an absolute pathname.
pcmdir=$(perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' ${pcmdir} ${PWD})

# use "name" as part of name of the archive.
name=$(basename ${data})

mkdir -p ${pcmdir}
mkdir -p ${logdir}

# An existing feats.scp is moved away because a new one is written at the end.
if [ -f ${data}/feats.scp ]; then
  mkdir -p ${data}/.backup
  echo "$0: moving ${data}/feats.scp to ${data}/.backup"
  mv ${data}/feats.scp ${data}/.backup
fi

scp=${data}/wav.scp

# Files that must exist before anything is dumped.
required="${scp}"

for f in ${required}; do
  if [ ! -f ${f} ]; then
    echo "$0: no such file ${f}"
    # Stop here: the check used to only print the message and carry on
    # without the required input.
    exit 1
  fi
done

utils/validate_data_dir.sh --no-text --no-feats ${data}

# "opts" collects extra arguments for dump-pcm.py; it is expanded unquoted
# below so that it is split into separate words.
if ${write_utt2num_frames}; then
    opts="--write-num-frames=ark,t:${logdir}/utt2num_frames.JOB "
else
    opts=
fi

# Choose the extension of the dumped archive from the file type.
# NOTE: for "sound" the format is fixed to wav and ${format} is not used.
if [ "${filetype}" == hdf5 ]; then
    ext=.h5
elif [ "${filetype}" == sound.hdf5 ]; then
    ext=.flac.h5
    opts+="--format ${format} "

elif [ "${filetype}" == sound ]; then
    ext=
    opts+="--format wav "
else
    ext=.ark
fi

if [ -f ${data}/segments ]; then
  echo "$0 [info]: segments file exists: using that."
  # split the segments file into ${nj} pieces, one per job
  split_segments=""
  for n in $(seq ${nj}); do
    split_segments="${split_segments} ${logdir}/segments.${n}"
  done

  utils/split_scp.pl ${data}/segments ${split_segments}

  # NOTE(review): dump-pcm.py declares the option as "--segments";
  # "--segment" presumably works through argparse prefix matching -- confirm.
  ${cmd} JOB=1:${nj} ${logdir}/dump_pcm_${name}.JOB.log \
      dump-pcm.py ${opts} --filetype ${filetype} --verbose=${verbose} --compress=${compress} \
      --keep-length ${keep_length} --segment=${logdir}/segments.JOB scp:${scp} \
      ark,scp:${pcmdir}/raw_pcm_${name}.JOB${ext},${pcmdir}/raw_pcm_${name}.JOB.scp

else

  echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance."
  # split wav.scp into ${nj} pieces, one per job
  split_scps=""
  for n in $(seq ${nj}); do
    split_scps="${split_scps} ${logdir}/wav.${n}.scp"
  done

  utils/split_scp.pl ${scp} ${split_scps}

  ${cmd} JOB=1:${nj} ${logdir}/dump_pcm_${name}.JOB.log \
      dump-pcm.py ${opts} --filetype ${filetype} --verbose=${verbose} --compress=${compress} \
      --keep-length ${keep_length} scp:${logdir}/wav.JOB.scp \
      ark,scp:${pcmdir}/raw_pcm_${name}.JOB${ext},${pcmdir}/raw_pcm_${name}.JOB.scp

fi


# concatenate the .scp files together.
for n in $(seq ${nj}); do
  cat ${pcmdir}/raw_pcm_${name}.${n}.scp
done > ${data}/feats.scp

# merge the per-job utt2num_frames files and remove the pieces
if ${write_utt2num_frames}; then
  for n in $(seq ${nj}); do
    cat ${logdir}/utt2num_frames.${n}
  done > ${data}/utt2num_frames
  rm ${logdir}/utt2num_frames.*
fi

# remove the temporary split files
rm -f ${logdir}/wav.*.scp ${logdir}/segments.* 2>/dev/null

# Write the filetype, this will be used for data2json.sh
echo ${filetype} > ${data}/filetype

# Compare the number of dumped entries with the number of utterances and
# print a hint (without failing) when they differ.
nf=$(< $data/feats.scp wc -l)
nu=$(< $data/utt2spk wc -l)
if [ ${nf} -ne ${nu} ]; then
  echo "It seems not all of the feature files were successfully (${nf} != ${nu});"
  echo "consider using utils/fix_data_dir.sh ${data}"
fi

echo "Succeeded dumping pcm for ${name}"


================================================
FILE: egs/espnet_utils/eval-source-separation.py
================================================
#!/usr/bin/env python3
import argparse
from collections import OrderedDict
from distutils.util import strtobool
import itertools
import logging
import os
from pathlib import Path
import shutil
import subprocess
import sys
from tempfile import TemporaryDirectory
import warnings

import museval
import numpy as np
from pystoi.stoi import stoi
import soundfile

from espnet.utils.cli_utils import get_commandline_args


def eval_STOI(ref, y, fs, extended=False, compute_permutation=True):
    """Calculate STOI

    Reference:
        A short-time objective intelligibility measure
            for time-frequency weighted noisy speech
        https://ieeexplore.ieee.org/document/5495701

    Note(kamo):
        STOI is defined on the signal at 10kHz
        and the input at the other sampling rate will be resampled.
        Thus, the result differs depending on the implementation of resampling.
        Especially, pystoi cannot reproduce matlab's resampling now.

    For every candidate assignment of enhanced sources to reference sources
    the per-source score is the mean over channels; the assignment with the
    largest summed score is returned.

    :param ref (np.ndarray): Reference (Nsrc, Nframe, Nmic)
    :param y (np.ndarray): Enhanced (Nsrc, Nframe, Nmic)
    :param fs (int): Sample frequency
    :param extended (bool): stoi or estoi
    :param compute_permutation (bool): Try every source permutation
        instead of only the input order
    :return: value, perm
    :rtype: Tuple[Tuple[float, ...], Tuple[int, ...]]
    :raises ValueError: If the shapes differ or the inputs are not 3-dim
    """
    if ref.shape != y.shape:
        raise ValueError(
            "ref and y should have the same shape: {} != {}".format(ref.shape, y.shape)
        )
    if ref.ndim != 3:
        # str.format (not format_map) is required for a positional field.
        raise ValueError("Input must have 3 dims: {}".format(ref.ndim))
    n_src = ref.shape[0]
    n_mic = ref.shape[2]

    if compute_permutation:
        index_list = list(itertools.permutations(range(n_src)))
    else:
        index_list = [list(range(n_src))]

    # values[p][i]: channel-averaged score of reference i paired with
    # enhanced source index_list[p][i]
    values = [
        [
            sum(stoi(ref[i, :, ch], y[j, :, ch], fs, extended) for ch in range(n_mic))
            / n_mic
            for i, j in enumerate(indices)
        ]
        for indices in index_list
    ]

    # Stable ascending sort: the last entry is the best-scoring assignment.
    value, perm = sorted(zip(values, index_list), key=lambda x: sum(x[0]))[-1]
    return tuple(value), tuple(perm)


def eval_PESQ(ref, enh, fs, compute_permutation: bool = True, wideband: bool = True):
    """Evaluate PESQ

    PESQ program can be downloaded from here:
        http://www.itu.int/rec/dologin_pub.asp?lang=e&id=T-REC-P.862-200511-I!Amd2!SOFT-ZST-E&type=items

    Reference:
        Perceptual evaluation of speech quality (PESQ)-a new method
            for speech quality assessment of telephone networks and codecs
        https://ieeexplore.ieee.org/document/941023

    The external ``PESQ`` command is executed once per
    (reference source, enhanced source, channel) combination inside a
    temporary directory and the score is read from the
    ``pesq_results.txt`` file found in that directory afterwards.
    If no result file is found, a warning is issued and 1.0 is used.

    :param ref (np.ndarray): Reference (Nsrc, Nframe, Nmic)
    :param enh (np.ndarray): Enhanced (Nsrc, Nframe, Nmic)
    :param fs (int): Sample frequency (8000 or 16000)
    :param compute_permutation (bool): Try every source permutation
        instead of only the input order
    :param wideband (bool): Pass the "+wb" option to PESQ
    :return: value, perm
    :rtype: Tuple[Tuple[float, ...], Tuple[int, ...]]
    :raises RuntimeError: If the PESQ command is not found
    :raises ValueError: If fs is unsupported, the shapes differ
        or the inputs are not 3-dim
    """
    if shutil.which("PESQ") is None:
        raise RuntimeError("PESQ: command not found: Please install")
    if fs not in (8000, 16000):
        raise ValueError("Sample frequency must be 8000 or 16000: {}".format(fs))
    if ref.shape != enh.shape:
        raise ValueError(
            "ref and enh should have the same shape: {} != {}".format(
                ref.shape, enh.shape
            )
        )
    if ref.ndim != 3:
        # str.format (not format_map) is required for a positional field.
        raise ValueError("Input must have 3 dims: {}".format(ref.ndim))

    n_src = ref.shape[0]
    n_mic = ref.shape[2]
    with TemporaryDirectory() as d:
        # Dumping wav files temporary
        ref_files = []  # [Nsrc, Nmic]
        enh_files = []  # [Nsrc, Nmic]
        for isrc in range(n_src):
            refs = []
            enhs = []
            for imic in range(n_mic):
                wv = str(os.path.join(d, "ref.{}.{}.wav".format(isrc, imic)))
                soundfile.write(wv, ref[isrc, :, imic].astype(np.int16), fs)
                refs.append(wv)

                wv = str(os.path.join(d, "enh.{}.{}.wav".format(isrc, imic)))
                soundfile.write(wv, enh[isrc, :, imic].astype(np.int16), fs)
                enhs.append(wv)
            ref_files.append(refs)
            enh_files.append(enhs)

        if compute_permutation:
            index_list = list(itertools.permutations(range(n_src)))
        else:
            index_list = [list(range(n_src))]

        result_txt = Path(d) / "pesq_results.txt"
        values = []
        for indices in index_list:
            values2 = []
            for i, j in enumerate(indices):
                lis = []
                for imic in range(n_mic):
                    # PESQ +<8000|16000> <ref.wav> <enh.wav> [smos] [cond]
                    commands = ["PESQ", "+{}".format(fs)]
                    if wideband:
                        commands.append("+wb")
                    commands += [ref_files[i][imic], enh_files[j][imic]]

                    # Every invocation shares the directory d, so remove the
                    # result file left by a previous run; otherwise an older
                    # score could be read back for this pair.
                    if result_txt.exists():
                        result_txt.unlink()

                    with subprocess.Popen(
                        commands, stdout=subprocess.DEVNULL, cwd=d
                    ) as p:
                        _, _ = p.communicate()

                    # e.g.
                    # REFERENCE	 DEGRADED	 PESQMOS	 MOSLQO	 SAMPLE_FREQ	 MODE
                    # /tmp/t/ref.0.wav	 /tmp/t/enh.0.wav	 -1.000	 4.644	 16000	wb
                    if result_txt.exists():
                        with result_txt.open("r") as f:
                            # Second line, fourth column of the result table
                            lis.append(float(f.readlines()[1].split()[3]))
                    else:
                        # Sometimes PESQ is failed. I don't know why.
                        warnings.warn("Processing error is found.")
                        lis.append(1.0)
                # Averaging over n_mic
                values2.append(sum(lis) / len(lis))
            values.append(values2)

    # Stable ascending sort: the last entry is the best-scoring assignment.
    value, perm = sorted(zip(values, index_list), key=lambda x: sum(x[0]))[-1]
    return tuple(value), tuple(perm)


def get_parser():
    """Build the command-line parser of the source-separation evaluator.

    :return: Parser accepting the reference/enhanced list files, the output
        directory, an optional key list, the evaluation types and the
        permutation / BSS-eval switches
    :rtype: argparse.ArgumentParser
    """
    parser = argparse.ArgumentParser(
        # The adjacent literals are concatenated before .format() is applied,
        # so each piece must carry its own separating space.
        description="Evaluate enhanced speech. "
        "e.g. {c} --ref ref.scp --enh enh.scp --outdir outputdir "
        "or {c} --ref ref.scp ref2.scp --enh enh.scp enh2.scp "
        "--outdir outputdir".format(c=sys.argv[0]),
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
    parser.add_argument(
        "--ref",
        dest="reffiles",
        nargs="+",
        type=str,
        required=True,
        help="WAV file lists for reference",
    )
    parser.add_argument(
        "--enh",
        dest="enhfiles",
        nargs="+",
        type=str,
        required=True,
        help="WAV files lists for enhanced",
    )
    parser.add_argument("--outdir", type=str, required=True)
    parser.add_argument(
        "--keylist",
        type=str,
        help="Specify the target samples. By default, "
        "using all keys in the first reference file",
    )
    parser.add_argument(
        "--evaltypes",
        type=str,
        nargs="+",
        choices=["SDR", "STOI", "ESTOI", "PESQ"],
        default=["SDR", "STOI", "ESTOI", "PESQ"],
    )
    parser.add_argument(
        "--permutation",
        type=strtobool,
        default=True,
        help="Compute all permutations or " "use the pair of input order",
    )

    # About BSS Eval v4:
    # The 2018 Signal Separation Evaluation Campaign
    # https://arxiv.org/abs/1804.06267
    parser.add_argument(
        "--bss-eval-images",
        type=strtobool,
        default=True,
        help="Use bss_eval_images or bss_eval_sources. "
        "For more detail, see museval source codes.",
    )
    parser.add_argument(
        "--bss-eval-version",
        type=str,
        default="v3",
        choices=["v3", "v4"],
        help="Specify bss-eval-version: v3 or v4",
    )
    return parser


def _read_scp(path):
    """Read a "<key> <path>" list file into an OrderedDict, keeping file order.

    Blank lines are skipped. The value is everything after the first
    whitespace-separated field with trailing whitespace removed.
    """
    table = OrderedDict()
    with open(path, "r") as f:
        for line in f:
            if not line.strip():
                continue
            key, value = line.split(None, 1)
            table[key] = value.rstrip()
    return table


def _load_signals(key, files_dict, rate_prev=None):
    """Load the signal stored under ``key`` from every list in ``files_dict``.

    :param key (str): Utterance key looked up in each list
    :param files_dict: Mapping of list name -> (key -> wav path)
    :param rate_prev: Sampling rate every file must match, or None
    :return: (signals, rate); each signal has the shape (Nframe, Nmic)
    :raises RuntimeError: If the key is missing or sampling rates differ
    """
    signals = []
    for listname, table in files_dict.items():
        if key not in table:
            raise RuntimeError("{} doesn't exist in {}".format(key, listname))
        signal, rate = soundfile.read(table[key], dtype=np.int16)
        if signal.ndim == 1:
            # (Nframe) -> (Nframe, 1)
            signal = signal[:, None]
        signals.append(signal)
        if rate_prev is not None and rate != rate_prev:
            raise RuntimeError("Sampling rates mismatch")
        rate_prev = rate
    return signals, rate_prev


def main():
    """Evaluate enhanced signals against references and write one file per metric.

    For each key, the reference and enhanced signals are loaded, zero-padded
    to a common length, stacked to (Nsrc, Nframe, Nmic) and scored with every
    requested evaluation type. One line "<key> <values...>" per key is
    written to ``<outdir>/<metric>``.
    """
    parser = get_parser()
    args = parser.parse_args()

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())
    if len(args.reffiles) != len(args.enhfiles):
        raise RuntimeError(
            "The number of ref files are different "
            "from the enh files: {} != {}".format(
                len(args.reffiles), len(args.enhfiles)
            )
        )
    if len(args.enhfiles) == 1:
        # A single source has only one possible assignment
        args.permutation = False

    # Read text files and created a mapping of key2filepath
    # Dict[str, Dict[str, str]]
    reffiles_dict = OrderedDict((ref, _read_scp(ref)) for ref in args.reffiles)
    enhfiles_dict = OrderedDict((enh, _read_scp(enh)) for enh in args.enhfiles)

    if args.keylist is not None:
        with open(args.keylist, "r") as f:
            keylist = [line.split()[0] for line in f if line.strip()]
    else:
        # All keys of the first reference list, in file order
        keylist = list(next(iter(reffiles_dict.values())))

    if len(keylist) == 0:
        raise RuntimeError("No keys are found")

    os.makedirs(args.outdir, exist_ok=True)

    # "SDR" yields four output files; the other types map one-to-one
    evaltypes = []
    for evaltype in args.evaltypes:
        if evaltype == "SDR":
            evaltypes += ["SDR", "ISR", "SIR", "SAR"]
        else:
            evaltypes.append(evaltype)

    writers = {}
    try:
        # Open files in write mode
        for k in evaltypes:
            writers[k] = open(os.path.join(args.outdir, k), "w")

        for key in keylist:
            # 1. Load ref files
            ref_signals, rate = _load_signals(key, reffiles_dict)

            # 2. Load enh files (must share the sampling rate of the refs)
            enh_signals, rate = _load_signals(key, enhfiles_dict, rate)

            for signal in ref_signals + enh_signals:
                if signal.shape[1] != ref_signals[0].shape[1]:
                    raise RuntimeError("The number of channels mismatch")

            # 3. Zero padding to adjust the length to the maximum length in inputs
            ml = max(len(s) for s in ref_signals + enh_signals)
            ref_signals = [
                np.pad(s, [(0, ml - len(s)), (0, 0)], mode="constant")
                if len(s) < ml
                else s
                for s in ref_signals
            ]
            enh_signals = [
                np.pad(s, [(0, ml - len(s)), (0, 0)], mode="constant")
                if len(s) < ml
                else s
                for s in enh_signals
            ]

            # ref_signals, enh_signals: (Nsrc, Nframe, Nmic)
            ref_signals = np.stack(ref_signals, axis=0)
            enh_signals = np.stack(enh_signals, axis=0)

            # 4. Evaluates
            for evaltype in args.evaltypes:
                if evaltype == "SDR":
                    (sdr, isr, sir, sar, perm) = museval.metrics.bss_eval(
                        ref_signals,
                        enh_signals,
                        window=np.inf,
                        hop=np.inf,
                        compute_permutation=args.permutation,
                        filters_len=512,
                        framewise_filters=args.bss_eval_version == "v3",
                        bsseval_sources_version=not args.bss_eval_images,
                    )

                    # sdr: (Nsrc, Nframe)
                    for name, metric in (
                        ("SDR", sdr),
                        ("ISR", isr),
                        ("SIR", sir),
                        ("SAR", sar),
                    ):
                        writers[name].write(
                            "{} {}\n".format(key, " ".join(map(str, metric[:, 0])))
                        )

                elif evaltype == "STOI":
                    # Named *_values so the imported stoi() is not shadowed
                    stoi_values, perm = eval_STOI(
                        ref_signals,
                        enh_signals,
                        rate,
                        extended=False,
                        compute_permutation=args.permutation,
                    )
                    writers["STOI"].write(
                        "{} {}\n".format(key, " ".join(map(str, stoi_values)))
                    )

                elif evaltype == "ESTOI":
                    estoi_values, perm = eval_STOI(
                        ref_signals,
                        enh_signals,
                        rate,
                        extended=True,
                        compute_permutation=args.permutation,
                    )
                    writers["ESTOI"].write(
                        "{} {}\n".format(key, " ".join(map(str, estoi_values)))
                    )

                elif evaltype == "PESQ":
                    pesq_values, perm = eval_PESQ(
                        ref_signals,
                        enh_signals,
                        rate,
                        compute_permutation=args.permutation,
                    )
                    writers["PESQ"].write(
                        "{} {}\n".format(key, " ".join(map(str, pesq_values)))
                    )
                else:
                    # Cannot reach
                    raise RuntimeError
    finally:
        # Close (and thereby flush) every output file, even on error
        for writer in writers.values():
            writer.close()


if __name__ == "__main__":
    main()


================================================
FILE: egs/espnet_utils/eval_perm_free_error.py
================================================
#!/usr/bin/env python3
# encoding: utf-8

# Copyright 2019 Johns Hopkins University (Xuankai Chang)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
import argparse
import codecs
import json
import logging
import re
import six
import sys

import numpy as np


def permutationDFS(source, start, res):
    """Append every ordering of ``source`` with ``source[:start]`` fixed to ``res``.

    The orderings are produced by depth-first in-place swapping, e.g.
    [[1, 2], [2, 1]] or
    [[1, 2, 3], [1, 3, 2], [2, 1, 3], [2, 3, 1], [3, 2, 1], [3, 1, 2]].
    ``source`` must provide ``tolist()`` and is restored to its initial
    order before returning.
    """
    size = len(source)
    if start == size - 1:
        # Only one position left: the current arrangement is complete
        res.append(source.tolist())
    pos = start
    while pos < size:
        # Put the element at ``pos`` into slot ``start`` ...
        source[start], source[pos] = source[pos], source[start]
        permutationDFS(source, start + 1, res)
        # ... and undo the swap before trying the next candidate
        source[pos], source[start] = source[start], source[pos]
        pos += 1


def permutation_schemes(num_spkrs):
    """Pre-set the permutation schemes as (ref_idx, hyp_idx) key names.

    Each permutation of [1, ..., num_spkrs] becomes one group of keys, e.g.
    [[r1h1, r2h2], [r1h2, r2h1]] for two speakers and
    [[r1h1, r2h2, r3h3], [r1h1, r2h3, r3h2], [r1h2, r2h1, r3h3],
     [r1h2, r2h3, r3h1], [r1h3, r2h2, r3h1], [r1h3, r2h1, r3h2]]
    for three speakers.

    :return: (all keys flattened in group order, the keys grouped per
        permutation)
    """
    orderings = []
    permutationDFS(np.array(list(range(1, num_spkrs + 1))), 0, orderings)

    grouped = [
        ["r%dh%d" % (ref_idx, hyp_idx) for ref_idx, hyp_idx in enumerate(order, 1)]
        for order in orderings
    ]
    flat = [name for group in grouped for name in group]
    return flat, grouped


def convert_score(keys, dic):
    """Turn the "Scores" text of each requested entry into four integers.

    :param keys: Entry names to convert
    :param dic: Mapping of entry name -> dict holding a "Scores" string
    :return: Mapping of entry name -> [c, s, d, i]
    """
    digits = re.compile(r"\d+")
    converted = {}
    for name in keys:
        counts = [int(tok) for tok in digits.findall(dic[name]["Scores"])]  # [c,s,d,i]
        assert len(counts) == 4
        converted[name] = counts
    return converted


def _permutation_error_rate(counts):
    """Return (s+d+i) / (c+s+d) for ``counts`` = [c, s, d, i].

    With zero reference tokens the rate is 0.0 when there are no errors
    and infinity otherwise, instead of raising ZeroDivisionError.
    """
    n_ref = sum(counts[0:3])
    n_err = sum(counts[1:4])
    if n_ref == 0:
        return 0.0 if n_err == 0 else float("inf")
    return n_err / float(n_ref)


def get_utt_permutation(old_dic, num_spkrs=2):
    """Pick, for every utterance, the ref/hyp permutation with the lowest error.

    :param old_dic: Mapping of utterance id -> dict that holds, for every
        "r<i>h<j>" key, an entry with a "Scores" string
    :param num_spkrs (int): Number of mixed speakers
    :return: Mapping of utterance id -> dict holding the entries of the best
        permutation plus a "Scores" string with its summed counts
    """
    perm, keys = permutation_schemes(num_spkrs)
    new_dic = {}

    for utt_id, in_dic in old_dic.items():
        # compute error rate for each utt
        score = convert_score(perm, in_dic)
        # Element-wise sum of [c, s, d, i] over the pairs of each permutation
        perm_score = [
            [sum(score[k][i] for k in ks) for i in range(4)] for ks in keys
        ]

        error_rate = [_permutation_error_rate(s) for s in perm_score]

        # First permutation reaching the minimum error rate
        min_idx = min(range(len(error_rate)), key=error_rate.__getitem__)
        dic = {k: in_dic[k] for k in keys[min_idx]}
        dic["Scores"] = "(#C #S #D #I) " + " ".join(map(str, perm_score[min_idx]))
        new_dic[utt_id] = dic

    return new_dic


def get_results(result_file, result_key):
    """Parse a scoring result file into per-utterance fields.

    A line starting with "id: " opens a new utterance whose id is the second
    whitespace-separated token. Lines starting with "Speaker sentences",
    "Scores: ", "REF: " or "HYP: " are stored (without their first token)
    under "Speaker", "Scores", "REF" and "HYP".

    :param result_file: Path of the UTF-8 result file
    :param result_key: Key under which the fields of each utterance are nested
    :return: {"utts": {utt_id: {result_key: fields}}}
    """
    id_pattern = re.compile(r"^id: ")
    field_patterns = {
        "Speaker": re.compile(r"^Speaker sentences"),
        "Scores": re.compile(r"^Scores: "),
        "REF": re.compile(r"^REF: "),
        "HYP": re.compile(r"^HYP: "),
    }

    results = {}
    current_id = None
    fields = {}

    with codecs.open(result_file, "r", encoding="utf-8") as f:
        for raw in f:
            text = raw.rstrip()
            tokens = text.split()

            if id_pattern.match(text):
                if current_id:
                    # Close the previous utterance before starting a new one
                    results[current_id] = {result_key: fields}
                    fields = {}
                current_id = tokens[1]
            for name, pattern in field_patterns.items():
                if pattern.match(text):
                    fields[name] = " ".join(tokens[1:])

    # Store the last utterance, which no following "id: " line closes
    if fields:
        results[current_id] = {result_key: fields}

    return {"utts": results}


def merge_results(results):
    """Merge several parsed results over the utterances they all share.

    :param results: Mapping of name -> {"utts": {utt_id: fields}}
    :return: Mapping of utt_id -> union of the fields of every result, for
        the utterance ids present in all results. The input dicts are not
        modified.
    """
    rslt_lst = []

    # make intersection set for utterance keys
    # None means "no result seen yet"; an empty set is a valid intersection
    # and must not be replaced by the keys of the next result.
    intersec_keys = None
    for name, result in results.items():
        ks = set(result["utts"])
        logging.info(name + ": has " + str(len(ks)) + " utterances")

        intersec_keys = ks if intersec_keys is None else intersec_keys & ks
        rslt_lst.append(result)

    if intersec_keys is None:
        intersec_keys = set()

    logging.info(
        "After merge, the result has " + str(len(intersec_keys)) + " utterances"
    )

    # merging results
    dic = dict()
    for k in intersec_keys:
        # Copy so that the first result is not updated in place
        v = dict(rslt_lst[0]["utts"][k])
        for j in rslt_lst[1:]:
            v.update(j["utts"][k])
        dic[k] = v

    return dic


def get_parser():
    """Build the argument parser of the permutation-free error evaluator.

    :return: Parser with the ``--num-spkrs`` option and the positional list
        of result files
    :rtype: argparse.ArgumentParser
    """
    results_help = (
        "the scores between references and hypotheses, "
        "in ascending order of references (1st) and hypotheses (2nd), "
        "e.g. [r1h1, r1h2, r2h1, r2h2] in 2-speaker-mix case."
    )

    cli = argparse.ArgumentParser(description="evaluate permutation-free error")
    cli.add_argument(
        "--num-spkrs",
        default=2,
        type=int,
        help="number of mixed speakers.",
    )
    cli.add_argument("results", nargs="+", type=str, help=results_help)
    return cli


def main():
    """Score every utterance with its best reference/hypothesis permutation.

    Expects ``num_spkrs ** 2`` result files ordered r1h1, r1h2, ..., and
    prints the help and exits with status 1 otherwise.

    :return: (score, new_results) where ``score`` holds one [c, s, d, i] row
        per utterance and ``new_results`` is the per-utterance result of the
        chosen permutation
    """
    parser = get_parser()
    args = parser.parse_args()
    n_spk = args.num_spkrs

    if len(args.results) != n_spk ** 2:
        parser.print_help()
        sys.exit(1)

    # Read results from files; the file of pair (r, h) sits at the row-major
    # position of the num_spkrs x num_spkrs grid.
    pairs = [(r, h) for r in range(1, n_spk + 1) for h in range(1, n_spk + 1)]
    results = {}
    for r, h in pairs:
        key = "r{}h{}".format(r, h)
        results[key] = get_results(args.results[(r - 1) * n_spk + h - 1], key)

    # Merge the results of every permutation
    results = merge_results(results)

    # Get the final results with best permutation
    new_results = get_utt_permutation(results, n_spk)

    # Get WER/CER
    digits = re.compile(r"\d+")
    score = np.zeros((len(new_results), 4))
    for row, utt in enumerate(new_results):
        # [c, s, d, i]
        score[row] = [int(v) for v in digits.findall(new_results[utt]["Scores"])]
    return score, new_results


if __name__ == "__main__":
    # Emit UTF-8 regardless of the locale of the terminal
    sys.stdout = codecs.getwriter("utf-8")(sys.stdout.buffer)

    scores, new_results = main()
    # Column-wise totals over all utterances: [c, s, d, i]
    score_sum = np.sum(scores, axis=0, dtype=int)
    total_errors = sum(score_sum[1:4])  # s + d + i
    total_refs = float(sum(score_sum[0:3]))  # c + s + d

    # Print results
    print(sys.argv)
    print("Total Scores: (#C #S #D #I) " + " ".join(str(v) for v in list(score_sum)))
    print("Error Rate:   {:0.2f}".format(100 * total_errors / total_refs))
    print("Total Utts: ", str(scores.shape[0]))

    report = json.dumps(
        {"utts": new_results},
        indent=4,
        ensure_ascii=False,
        sort_keys=True,
        separators=(",", ": "),
    )
    print(report)


================================================
FILE: egs/espnet_utils/eval_source_separation.sh
================================================
#!/usr/bin/env bash

# Evaluate enhanced/separated speech in parallel: the keys of the first
# reference list are split into ${nj} parts, eval-source-separation.py is run
# on each part, and the per-job outputs are concatenated and averaged.

echo "$0 $*" >&2 # Print the command line for logging

nj=10
cmd=run.pl
evaltypes="SDR STOI ESTOI PESQ"
permutation=true
# Use museval.metrics.bss_eval_images or museval.metrics.bss_eval_source
bss_eval_images=true
bss_eval_version=v3

help_message=$(cat << EOF
Usage: $0 reffiles enhfiles <dir>
    e.g. $0 reference.scp enhanced.scp outdir

And also supporting multiple sources:
    e.g. $0 "ref1.scp,ref2.scp" "enh1.scp,enh2.scp" outdir

Options:
  --nj <nj>                                        # number of parallel jobs
  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs.
EOF
)

. ./path.sh
. utils/parse_options.sh

if [ $# != 3 ]; then
    echo "${help_message}" 1>&2
    exit 1;
fi

set -euo pipefail

# Comma-separated list arguments -> bash arrays
IFS=, read -r -a reffiles <<<"$1"
IFS=, read -r -a enhfiles <<<"$2"
dir=$3
logdir=${dir}/log
mkdir -p ${logdir}

split_scps=""
for n in $(seq ${nj}); do
    split_scps="${split_scps} ${logdir}/key.${n}.scp"
done

# Split the first reference
utils/split_scp.pl ${reffiles[0]} ${split_scps} || exit 1;

# Each job writes its metric files into ${logdir}/JOB.
# The option is spelled out as --outdir (the name the python tool defines)
# rather than relying on argparse's prefix matching of "--out".
${cmd} JOB=1:${nj} ${logdir}/eval-enhanced-speech.JOB.log \
    eval-source-separation.py \
    --ref "${reffiles[@]}" --enh "${enhfiles[@]}" \
    --keylist ${logdir}/key.JOB.scp \
    --outdir ${logdir}/JOB \
    --evaltypes ${evaltypes} \
    --permutation ${permutation} \
    --bss-eval-images ${bss_eval_images} \
    --bss-eval-version ${bss_eval_version}


# "SDR" produces the four files SDR, ISR, SIR and SAR
for t in ${evaltypes/SDR/SDR ISR SIR SAR}; do
    for i in $(seq 1 ${nj}); do
        cat ${logdir}/${i}/${t}
    done > ${dir}/${t}

    # Calculate the mean over files
    python3 << EOF > ${dir}/mean_${t}
with open('${dir}/${t}', 'r') as f:
    values = []
    for l in f:
        vs = l.rstrip().split(None)[1:]
        values.append(sum(map(float, vs)) / len(vs))
    mean = sum(values) / len(values)
print(mean)
EOF

done


================================================
FILE: egs/espnet_utils/feat-to-shape.py
================================================
#!/usr/bin/env python3
import argparse
import logging
import sys

from espnet.transform.transformation import Transformation
from espnet.utils.cli_readers import file_reader_helper
from espnet.utils.cli_utils import get_commandline_args
from espnet.utils.cli_utils import is_scipy_wav_style


def get_parser():
    """Build the command-line parser of the feature-to-shape converter.

    :return: Parser with the verbosity, file type and preprocessing options,
        the input read specifier and the optional output file
    :rtype: argparse.ArgumentParser
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="convert feature to its shape",
    )
    add = cli.add_argument

    add("--verbose", "-V", type=int, default=0, help="Verbose option")
    add(
        "--filetype",
        choices=["mat", "hdf5", "sound.hdf5", "sound"],
        default="mat",
        type=str,
        help='Specify the file format for the rspecifier. "mat" is the matrix format in kaldi',
    )
    add(
        "--preprocess-conf",
        default=None,
        type=str,
        help="The configuration file for the pre-processing",
    )
    add("rspecifier", type=str, help="Read specifier for feats. e.g. ark:some.ark")
    # Optional positional: falls back to standard output when omitted
    add(
        "out",
        nargs="?",
        default=sys.stdout,
        type=argparse.FileType("w"),
        help="The output filename. If omitted, then output to sys.stdout",
    )
    return cli


def main():
    """Write one "<utt> <dim0,dim1,...>" line per entry of the read specifier.

    Without a preprocessing configuration the reader is asked for shapes
    only; otherwise each entry is loaded, preprocessed, and the shape of the
    result is written.
    """
    args = get_parser().parse_args()

    # logging info
    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    level = logging.INFO if args.verbose > 0 else logging.WARN
    logging.basicConfig(level=level, format=logfmt)
    logging.info(get_commandline_args())

    preprocessing = None
    if args.preprocess_conf is not None:
        preprocessing = Transformation(args.preprocess_conf)
        logging.info("Apply preprocessing: {}".format(preprocessing))

    # There are no necessary for matrix without preprocessing,
    # so change to file_reader_helper to return shape.
    # This make sense only with filetype="hdf5".
    shape_only = preprocessing is None
    reader = file_reader_helper(
        args.rspecifier, args.filetype, return_shape=shape_only
    )
    for utt, mat in reader:
        if shape_only:
            if len(mat) == 2 and isinstance(mat[1], tuple):
                # If data is sound file, Tuple[int, Tuple[int, ...]]
                _, mat = mat
            dims = mat
        else:
            if is_scipy_wav_style(mat):
                # If data is sound file, then got as Tuple[int, ndarray]
                _, mat = mat
            dims = preprocessing(mat, uttid_list=utt).shape
        args.out.write("{} {}\n".format(utt, ",".join(map(str, dims))))


if __name__ == "__main__":
    main()


================================================
FILE: egs/espnet_utils/feat_to_shape.sh
================================================
#!/usr/bin/env bash

# Compute the shape of every feature listed in an scp file in parallel:
# the scp is split into ${nj} parts, feat-to-shape.py is run on each part,
# and the per-job outputs are concatenated into the output scp.

# Begin configuration section.
nj=188
cmd=run.pl
verbose=0
filetype=""
preprocess_conf=""
# End configuration section.

help_message=$(cat << EOF
Usage: $0 [options] <input-scp> <output-scp> [<log-dir>]
e.g.: $0 data/train/feats.scp data/train/shape.scp data/train/log
Options:
  --nj <nj>                                        # number of parallel jobs
  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs.
  --filetype <mat|hdf5|sound.hdf5|sound>           # Specify the format of feats file
  --preprocess-conf <json>                         # Apply preprocess to feats when creating shape.scp
  --verbose <num>                                  # Default: 0
EOF
)

echo "$0 $*" 1>&2 # Print the command line for logging

. parse_options.sh || exit 1;

if [ $# -lt 2 ] || [ $# -gt 3 ]; then
    echo "${help_message}" 1>&2
    exit 1;
fi

set -euo pipefail

scp=$1
outscp=$2
data=$(dirname ${scp})
if [ $# -eq 3 ]; then
  logdir=$3
else
  logdir=${data}/log
fi
mkdir -p ${logdir}

# Count the entries once; the arithmetic expansion also normalizes any
# padding that wc may add around the number.
nlines=$(( $(<"${scp}" wc -l) ))
if [ "${nlines}" -eq 0 ]; then
    echo "$0: no entries found in ${scp}" 1>&2
    exit 1
fi

# Never use more jobs than there are entries
nj=$((nj<nlines?nj:nlines))
split_scps=""
for n in $(seq ${nj}); do
    split_scps="${split_scps} ${logdir}/feats.${n}.scp"
done

utils/split_scp.pl ${scp} ${split_scps}

# Pass the optional settings through only when they were given
if [ -n "${preprocess_conf}" ]; then
    preprocess_opt="--preprocess-conf ${preprocess_conf}"
else
    preprocess_opt=""
fi
if [ -n "${filetype}" ]; then
    filetype_opt="--filetype ${filetype}"
else
    filetype_opt=""
fi

${cmd} JOB=1:${nj} ${logdir}/feat_to_shape.JOB.log \
    feat-to-shape.py --verbose ${verbose} ${preprocess_opt} ${filetype_opt} \
    scp:${logdir}/feats.JOB.scp ${logdir}/shape.JOB.scp

# concatenate the .scp files together.
for n in $(seq ${nj}); do
    cat ${logdir}/shape.${n}.scp
done > ${outscp}

rm -f ${logdir}/feats.*.scp 2>/dev/null


================================================
FILE: egs/espnet_utils/feats2npy.py
================================================
#!/usr/bin/env python
#  coding: utf-8

import argparse
from kaldiio import ReadHelper
import numpy as np
import os
from os.path import join
import sys


def get_parser():
    """Build the command-line parser of the kaldi-to-numpy feature converter.

    :return: Parser taking the input scp file and the output directory
    :rtype: argparse.ArgumentParser
    """
    parser = argparse.ArgumentParser(
        description="Convert kaldi-style features to numpy arrays",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("scp_file", type=str, help="scp file")
    parser.add_argument("out_dir", type=str, help="output directory")
    return parser


if __name__ == "__main__":
    # Save every entry of the scp file as "<out_dir>/<utt_id>-feats.npy"
    cli_args = get_parser().parse_args(sys.argv[1:])
    os.makedirs(cli_args.out_dir, exist_ok=True)
    rspecifier = "scp:{}".format(cli_args.scp_file)
    with ReadHelper(rspecifier) as reader:
        for key, feats in reader:
            target = os.path.join(cli_args.out_dir, "{}-feats.npy".format(key))
            np.save(target, feats, allow_pickle=False)
    sys.exit(0)


================================================
FILE: egs/espnet_utils/filt.py
================================================
#!/usr/bin/env python3

# Apache 2.0

import argparse
import codecs
import sys

is_python2 = sys.version_info[0] == 2


def get_parser():
    """Build the command-line parser of the word filter.

    :return: Parser with the ``--exclude``/``-v`` switch and the positional
        filter list and input file
    :rtype: argparse.ArgumentParser
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="filter words in a text file",
    )
    cli.add_argument(
        "--exclude", "-v", action="store_true", dest="exclude", help="exclude filter words"
    )
    for name, text in (("filt", "filter list"), ("infile", "input file")):
        cli.add_argument(name, type=str, help=text)
    return cli


def main(args):
    """Parse the command-line argument list ``args`` and run the filter."""
    options = get_parser().parse_args(args)
    filter_file(options.infile, options.filt, options.exclude)


def filter_file(infile, filt, exclude):
    """Print ``infile`` line by line with its words filtered by a word list.

    :param infile: Path of the UTF-8 text file to filter
    :param filt: Path of the UTF-8 word list, one entry per line
    :param exclude: When true, listed words are replaced by an empty string;
        otherwise words that are not listed are replaced by "<UNK>"
    """
    with codecs.open(filt, "r", encoding="utf-8") as vocabfile:
        vocab = {entry.strip() for entry in vocabfile}

    # Replace stdout by a UTF-8 writer (on the byte stream under Python 3)
    raw_stdout = sys.stdout if is_python2 else sys.stdout.buffer
    sys.stdout = codecs.getwriter("utf-8")(raw_stdout)

    with codecs.open(infile, "r", encoding="utf-8") as textfile:
        for line in textfile:
            words = line.strip().split()
            if exclude:
                kept = ["" if word in vocab else word for word in words]
            else:
                kept = [word if word in vocab else "<UNK>" for word in words]
            print(" ".join(kept))


if __name__ == "__main__":
    main(sys.argv[1:])


================================================
FILE: egs/espnet_utils/filter_all_eng_utts.py
================================================
import sys

def is_all_chinese(strs):
    """Return True when every character lies in U+4E00..U+9FA5 (inclusive).

    An empty string yields True.
    """
    return all('\u4e00' <= ch <= '\u9fa5' for ch in strs)

# Copy the "<uttid> <text>" lines of f_in whose text passes is_all_chinese
# to f_out; lines without any text are dropped.
f_in = sys.argv[1]
f_out = sys.argv[2]

with open(f_out, 'w', encoding='utf-8') as writer:
    with open(f_in, encoding='utf-8') as reader:
        for line in reader:
            elems = line.strip().split()

            # Check the length before indexing so that blank lines are
            # skipped instead of raising IndexError.
            if len(elems) <= 1:
                continue
            uttid = elems[0]

            # NOTE(review): the joined text contains spaces, which
            # is_all_chinese rejects, so only single-token texts can pass --
            # confirm this is intended.
            text = " ".join(elems[1:])
            if not is_all_chinese(text):
                continue

            out_line = " ".join([uttid, text, '\n'])
            writer.write(out_line)


================================================
FILE: egs/espnet_utils/filter_scp.py
================================================
import sys

# Print the entries of in_f whose ids appear in ref_f, in the order of ref_f.
ref_f = sys.argv[1]
in_f = sys.argv[2]

# output is in the order of ref_f
ref = []
with open(ref_f, encoding='utf-8') as f:
    for line in f:
        elems = line.split()
        if elems:  # ignore blank lines
            ref.append(elems[0])

# uttid -> remaining fields joined by single spaces
in_dic = {}
with open(in_f, encoding='utf-8') as f:
    for line in f:
        elems = line.split()
        if not elems:  # ignore blank lines
            continue
        in_dic[elems[0]] = " ".join(elems[1:])

for e in ref:
    if e in in_dic:
        print(f"{e} {in_dic[e]}")


================================================
FILE: egs/espnet_utils/filter_trn.py
================================================
# this is to process the hyp.trn of sppd3
import sys

in_f = sys.argv[1]
# Token sequence removed from every line (the trailing space is part of it).
ignore = "叮 当 叮 当 "

# The context manager closes the input file instead of leaking the handle.
with open(in_f, encoding="utf-8") as reader:
    for line in reader:
        line = line.strip().replace(ignore, "")
        print(line)


================================================
FILE: egs/espnet_utils/free-gpu.sh
================================================
#!/usr/bin/env bash
# Author: Gaurav Kumar

# Print a comma-separated list of GPU indices that have no process listed in
# the nvidia-smi process table, or "-1" (exit status 1) when the request
# cannot be satisfied.

# Usage: e.g.
# % free-gpu.sh -n 2
# 1,2

# Allow requests for multiple GPUs
# (Optional) defaults to 1
req_gpus=1
# The leading ':' in the optstring selects silent error reporting, so the
# ':' (missing argument) and '*' (unknown option) branches below print their
# own messages.
while getopts ':n:' opt; do
  case ${opt} in
    n)
      req_gpus=${OPTARG}
      ;;
    :)
      echo "Option -${OPTARG} requires an argument." >&2
      exit 1
      ;;
    *)
      echo "Option -${OPTARG} is not supported" >&2
      exit 1
      ;;
  esac
done

# Number of NVIDIA devices reported by lspci, not counting their audio
# functions (i.e. installed GPUs, whether busy or not)
n_gpus=$(lspci | grep -i "nvidia" | grep -c -v "Audio")

# Return -1 if there are no GPUs on the machine
# or if the requested number of GPUs exceed
# the number of GPUs installed.
if [ ${n_gpus} -eq 0 ] || [ ${req_gpus} -gt ${n_gpus} ]; then
  echo "-1"
  exit 1
fi

# Work out which GPU indices are unused:
#   1. drop everything up to and including the first line matching
#      "Processes", then two more lines and the last line of the output
#   2. keep the second column of the remaining rows (presumably the GPU
#      index of each running process -- depends on the nvidia-smi layout)
#   3. start from the set {0..n_gpus-1}, delete every index seen in step 2
#      and print what is left
#   4. keep the last ${req_gpus} of the printed indices
# shellcheck disable=SC2026
f_gpu=$(nvidia-smi | sed -e '1,/Processes/d' \
  | tail -n+3 | head -n-1 | awk '{print $2}' \
  | awk -v ng=${n_gpus} 'BEGIN{for (n=0;n<ng;++n){g[n] = 1}} {delete g[$1];} END{for (i in g) print i}' \
  | tail -n ${req_gpus})

# return -1 if not enough free GPUs were found
if [[ $(echo ${f_gpu} | grep -v '^$' | wc -w) -ne ${req_gpus} ]]; then
  echo "-1"
  exit 1
else
  # the unquoted echo joins the indices with spaces; turn them into commas
  echo ${f_gpu} | sed 's: :,:g'
fi


================================================
FILE: egs/espnet_utils/gdown.pl
================================================
#!/usr/bin/env perl
#
# Google Drive direct download of big files
# ./gdown.pl 'gdrive file url' ['desired file name']
#
# v1.0 by circulosmeos 04-2014.
# v1.1 by circulosmeos 01-2017.
# v1.2, v1.3, v1.4 by circulosmeos 01-2019, 02-2019.
# //circulosmeos.wordpress.com/2014/04/12/google-drive-direct-download-of-big-files
# Distributed under GPL 3 (//www.gnu.org/licenses/gpl-3.0.html)
#
use strict;
use POSIX;

# Cookie jar shared by all wget invocations; removed at the end of the script.
my $TEMP='gdown.cookie.temp';
my $COMMAND;
# Value captured from a "confirm=..." token in a downloaded page.
my $confirm;
# Set to 1 when the downloaded page yielded a follow-up URL or confirm token.
my $check;
sub execute_command();

# First argument: the URL to download (mandatory).
my $URL=shift;
die "\n./gdown.pl 'gdrive file url' [desired file name]\n\n" if $URL eq '';

# Second argument: output file name; defaults to "gdown.<timestamp>.<random digits>".
my $FILENAME=shift;
$FILENAME='gdown.'.strftime("%Y%m%d%H%M%S", localtime).'.'.substr(rand,2) if $FILENAME eq '';

# Rewrite ".../file/d/<id>" and ".../open?id=<id>" links into the
# "uc?id=<id>&export=download" form.
if ($URL=~m#^https?://drive.google.com/file/d/([^/]+)#) {
    $URL="https://docs.google.com/uc?id=$1&export=download";
}
elsif ($URL=~m#^https?://drive.google.com/open\?id=([^/]+)#) {
    $URL="https://docs.google.com/uc?id=$1&export=download";
}

execute_command();

# A result smaller than 100000 bytes is not accepted as the final download:
# it is scanned line by line for the next URL / confirm token and the
# download is retried until the size test fails or nothing usable is found.
while (-s $FILENAME < 100000) { # only if the file isn't the download yet
    open fFILENAME, '<', $FILENAME;
    $check=0;
    foreach (<fFILENAME>) {
        # Case 1: a relative "/uc?export=download..." link in an href attribute.
        if (/href="(\/uc\?export=download[^"]+)/) {
            $URL='https://docs.google.com'.$1;
            $URL=~s/&amp;/&/g;
            $confirm='';
            $check=1;
            last;
        }
        # Case 2: a bare confirm token; it is substituted into $URL below.
        if (/confirm=([^;&]+)/) {
            $confirm=$1;
            $check=1;
            last;
        }
        # Case 3: a "downloadUrl" field whose \u003d / \u0026 escapes stand
        # for '=' and '&'.
        if (/"downloadUrl":"([^"]+)/) {
            $URL=$1;
            $URL=~s/\\u003d/=/g;
            $URL=~s/\\u0026/&/g;
            $confirm='';
            $check=1;
            last;
        }
    }
    close fFILENAME;
    # None of the three patterns matched: give up.
    die "Couldn't download the file :-(\n" if ($check==0);
    # Replace an existing confirm=... parameter of $URL with the new token.
    $URL=~s/confirm=([^;&]+)/confirm=$confirm/ if $confirm ne '';

    execute_command();
}

unlink $TEMP;

# Build the wget command line for the current $URL (re-using the cookie jar
# $TEMP for both loading and saving cookies), append the output file option
# when $FILENAME is set, and run it through the shell.  Always returns 1;
# the exit status of wget is not inspected.
sub execute_command() {
    $COMMAND = "wget --progress=dot:giga --no-check-certificate --load-cookie $TEMP --save-cookie $TEMP \"$URL\"";
    if ($FILENAME ne '') {
        $COMMAND .= " -O \"$FILENAME\"";
    }
    system($COMMAND);
    return 1;
}


================================================
FILE: egs/espnet_utils/generate_wav.sh
================================================
#!/usr/bin/env bash

# Copyright 2018 Nagoya University (Tomoki Hayashi)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

# Begin configuration section.
# Defaults for the options listed in the help message below.
nj=2
fs=22050
n_fft=1024
n_shift=256
cmd=run.pl
# NOTE(review): the usage text names the 4th argument <fbank-dir>, but the
# script stores it in ${wavdir} and passes it to generate_wav_from_fbank.py
# as the output directory.
help_message=$(cat <<EOF
Usage:
  $0 [options] <model-path> <data-dir> [<log-dir> [<fbank-dir>] ]
Example:
  $0 ljspeech.wavenet.ns.v1/checkpoint-1000000.pkl data/train exp/wavenet_vocoder/train wav
Note:
  <log-dir> defaults to <data-dir>/log, and <fbank-dir> defaults to <data-dir>/data
Options:
  --nj <nj>             # number of parallel jobs
  --fs <fs>             # sampling rate (default=22050)
  --n_fft <n_fft>       # number of FFT points (default=1024)
  --n_shift <n_shift>   # shift size in point (default=256)
  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs.
EOF
)
# End configuration section.

echo "$0 $*"  # Print the command line for logging

# Sourced helper; presumably it maps --nj/--fs/... onto the variables above
# and leaves only the positional arguments -- TODO confirm against
# parse_options.sh.
. parse_options.sh || exit 1;

# Between two and four positional arguments are accepted.
if [ $# -lt 2 ] || [ $# -gt 4 ]; then
    echo "${help_message}"
    exit 1;
fi

model=$1
data=$2
# Optional 3rd argument: log directory (default: <data-dir>/log).
if [ $# -ge 3 ]; then
  logdir=$3
else
  logdir=${data}/log
fi
# Optional 4th argument: wav output directory (default: <data-dir>/data).
if [ $# -ge 4 ]; then
  wavdir=$4
else
  wavdir=${data}/data
fi

# use "name" as part of name of the archive.
name=$(basename ${data})

mkdir -p ${wavdir} || exit 1;
mkdir -p ${logdir} || exit 1;

scp=${data}/feats.scp

# One split scp file name per job: <log-dir>/feats.<n>.scp
split_scps=""
for n in $(seq ${nj}); do
    split_scps="$split_scps $logdir/feats.${n}.scp"
done

utils/split_scp.pl ${scp} ${split_scps} || exit 1;

# Launch the generation through ${cmd}; the JOB placeholder in the log name
# and in the scp name is presumably replaced with 1..${nj} by the job runner.
${cmd} JOB=1:${nj} ${logdir}/generate_with_wavenet_${name}.JOB.log \
    generate_wav_from_fbank.py \
        --model ${model} \
        --fs ${fs} \
        --n_fft ${n_fft} \
        --n_shift ${n_shift} \
        scp:${logdir}/feats.JOB.scp \
        ${wavdir}

# Remove the temporary split scp files; errors are silenced.
rm ${logdir}/feats.*.scp 2>/dev/null

echo "Succeeded creating wav for ${name}"


================================================
FILE: egs/espnet_utils/generate_wav_from_fbank.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""This code is based on https://github.com/kan-bayashi/PytorchWaveNetVocoder."""

# Copyright 2019 Nagoya University (Tomoki Hayashi)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

import argparse
import logging
import os
import time

import h5py
import numpy as np
import pysptk
import torch

from scipy.io.wavfile import write
from sklearn.preprocessing import StandardScaler

from espnet.nets.pytorch_backend.wavenet import decode_mu_law
from espnet.nets.pytorch_backend.wavenet import encode_mu_law
from espnet.nets.pytorch_backend.wavenet import WaveNet
from espnet.utils.cli_readers import file_reader_helper
from espnet.utils.cli_utils import get_commandline_args


class TimeInvariantMLSAFilter(object):
    """MLSA filter whose coefficients stay fixed over time.

    The filter is used for the noise shaping described in
    `An investigation of noise shaping with perceptual
     weighting for WaveNet-based speech generation`_.

    Args:
        coef (ndarray): MLSA filter coefficient (D,).
        alpha (float): All pass constant value.
        n_shift (int): Shift length in points.

    .. _`An investigation of noise shaping with perceptual
        weighting for WaveNet-based speech generation`:
        https://ieeexplore.ieee.org/abstract/document/8461332

    """

    def __init__(self, coef, alpha, n_shift):
        self.coef = coef
        self.n_shift = n_shift
        # the filter order is one less than the number of coefficients
        mlsadf = pysptk.synthesis.MLSADF(order=coef.shape[0] - 1, alpha=alpha)
        self.mlsa_filter = pysptk.synthesis.Synthesizer(mlsadf, hopsize=n_shift)

    def __call__(self, y):
        """Filter a waveform with the fixed MLSA coefficients.

        Args:
            y (ndarray): Waveform signal normalized from -1 to 1 (N,).

        Returns:
            ndarray: Filtered waveform signal normalized from -1 to 1 (N,).

        """
        # only one-dimensional signals are accepted; work in double precision
        assert len(y.shape) == 1
        signal = np.float64(y)

        # the same coefficient vector is repeated once per frame
        n_frames = int(len(signal) / self.n_shift) + 1
        frame_coefs = np.tile(self.coef, [n_frames, 1])

        return self.mlsa_filter.synthesis(signal, frame_coefs)


def get_parser():
    """Create the command-line parser of the WaveNet wav generator.

    Returns:
        argparse.ArgumentParser: Parser with the options ``--fs``,
            ``--n_fft``, ``--n_shift``, ``--model`` and ``--filetype`` and
            the positional arguments ``rspecifier`` and ``outdir``.

    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="generate wav from FBANK using wavenet vocoder",
    )

    # integer options: (flag, default, help text)
    int_options = (
        ("--fs", 22050, "Sampling frequency"),
        ("--n_fft", 1024, "FFT length in point"),
        ("--n_shift", 256, "Shift length in point"),
    )
    for flag, default, help_text in int_options:
        parser.add_argument(flag, type=int, default=default, help=help_text)

    parser.add_argument("--model", type=str, default=None, help="WaveNet model")

    filetype_help = (
        "Specify the file format for the rspecifier. "
        '"mat" is the matrix format in kaldi'
    )
    parser.add_argument(
        "--filetype",
        type=str,
        default="mat",
        choices=["mat", "hdf5"],
        help=filetype_help,
    )

    # positional arguments
    parser.add_argument("rspecifier", type=str, help="Input feature e.g. scp:feat.scp")
    parser.add_argument("outdir", type=str, help="Output directory")
    return parser


def main():
    """Generate one wav file per utterance from FBANK features with WaveNet.

    The directory that contains ``--model`` must also hold ``model.conf``
    (training arguments) and ``stats.h5`` (feature statistics and MLSA
    coefficients).  For every ``(utt_id, features)`` pair read from the
    rspecifier a waveform is generated, passed through the MLSA noise-shaping
    filter and written to ``<outdir>/<utt_id>.wav`` as 16-bit integers.
    """
    parser = get_parser()
    args = parser.parse_args()

    # logging info
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )
    logging.info(get_commandline_args())

    # create the output directory; exist_ok avoids the race between a
    # separate existence check and the creation when several jobs start at once
    os.makedirs(args.outdir, exist_ok=True)

    # load model config
    # NOTE(review): torch.load unpickles the file, so only model directories
    # from a trusted source should be used here.
    model_dir = os.path.dirname(args.model)
    train_args = torch.load(os.path.join(model_dir, "model.conf"))

    # load statistics (opened read-only: the file is never modified, and an
    # explicit mode does not depend on the h5py version's default)
    scaler = StandardScaler()
    with h5py.File(os.path.join(model_dir, "stats.h5"), "r") as f:
        scaler.mean_ = f["/melspc/mean"][()]
        scaler.scale_ = f["/melspc/scale"][()]
        # TODO(kan-bayashi): include following info as default
        coef = f["/mlsa/coef"][()]
        alpha = f["/mlsa/alpha"][()]

    # define MLSA filter for noise shaping
    mlsa_filter = TimeInvariantMLSAFilter(
        coef=coef,
        alpha=alpha,
        n_shift=args.n_shift,
    )

    # define model and load parameters
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model = WaveNet(
        n_quantize=train_args.n_quantize,
        n_aux=train_args.n_aux,
        n_resch=train_args.n_resch,
        n_skipch=train_args.n_skipch,
        dilation_depth=train_args.dilation_depth,
        dilation_repeat=train_args.dilation_repeat,
        kernel_size=train_args.kernel_size,
        upsampling_factor=train_args.upsampling_factor,
    )
    model.load_state_dict(torch.load(args.model, map_location="cpu")["model"])
    model.eval()
    model.to(device)

    for idx, (utt_id, lmspc) in enumerate(
        file_reader_helper(args.rspecifier, args.filetype), 1
    ):
        logging.info("(%d) %s", idx, utt_id)

        # perform preprocessing
        x = encode_mu_law(
            np.zeros((1)), mu=train_args.n_quantize
        )  # quantize initial seed waveform
        h = scaler.transform(lmspc)  # normalize features

        # convert to tensor
        x = torch.tensor(x, dtype=torch.long, device=device)  # (1,)
        h = torch.tensor(h, dtype=torch.float, device=device)  # (T, n_aux)

        # get length of waveform
        n_samples = (h.shape[0] - 1) * args.n_shift + args.n_fft

        # generate
        start_time = time.time()
        with torch.no_grad():
            y = model.generate(x, h, n_samples, interval=100)
        logging.info(
            "generation speed = %s (sec / sample)",
            (time.time() - start_time) / (len(y) - 1),
        )
        y = decode_mu_law(y, mu=train_args.n_quantize)

        # apply mlsa filter for noise shaping
        y = mlsa_filter(y)

        # save as .wav file (scaled to the int16 range)
        write(
            os.path.join(args.outdir, "%s.wav" % utt_id),
            args.fs,
            (y * np.iinfo(np.int16).max).astype(np.int16),
        )


if __name__ == "__main__":
    main()


================================================
FILE: egs/espnet_utils/get_yaml.py
================================================
#!/usr/bin/env python3
import argparse

import yaml


def get_parser():
    """Build the parser: a YAML file path and a dotted attribute path."""
    attr_help = 'foo.bar will access yaml.load(inyaml)["foo"]["bar"]'

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="get a specified attribute from a YAML file",
    )
    parser.add_argument("inyaml")
    parser.add_argument("attr", help=attr_help)
    return parser


def main():
    """Print the value found at a dotted attribute path of a YAML file.

    The path given as ``attr`` is split on ``.``; components made only of
    digits are converted to ``int`` so that they can index lists.  Nothing is
    printed when the path cannot be followed.
    """
    args = get_parser().parse_args()
    with open(args.inyaml, "r") as f:
        # NOTE(review): yaml.Loader can construct arbitrary Python objects;
        # only point this tool at trusted YAML files.
        indict = yaml.load(f, Loader=yaml.Loader)

    try:
        for attr in args.attr.split("."):
            if attr.isdigit():
                attr = int(attr)
            indict = indict[attr]
    except (KeyError, IndexError, TypeError):
        # Missing attribute: print nothing.  Besides an absent dict key this
        # also covers a list index out of range (IndexError) and an attempt
        # to index a value that cannot be indexed that way, e.g. None from an
        # empty file (TypeError).
        return
    print(indict)


if __name__ == "__main__":
    main()


================================================
FILE: egs/espnet_utils/jieba_build_dict.py
================================================
import jieba
import sys
# Run one dummy segmentation first; presumably this makes jieba load its
# dictionary so that jieba.dt.FREQ is populated before it is queried below.
_ = jieba.lcut("aaa")


words_file = sys.argv[1]
dict_file = sys.argv[2]

# Both files are handled as UTF-8 (like the sibling filter_*.py scripts)
# rather than in the locale's default encoding, and are closed by the
# context managers.
with open(words_file, 'r', encoding='utf-8') as reader, \
        open(dict_file, 'w', encoding='utf-8') as writer:
    for line in reader:
        fields = line.strip().split()
        if not fields:
            # a blank line has no term (indexing [0] would raise)
            continue
        term = fields[0]
        freq = jieba.dt.FREQ.get(term)
        # terms missing from jieba's frequency table get a frequency of 1
        if freq is None:
            freq = 1
        writer.write(f"{term} {freq}\n")


================================================
FILE: egs/espnet_utils/json2sctm.py
================================================
#!/usr/bin/python
# -*- coding: utf-8 -*-

import argparse
import os
import subprocess
import sys


# True when the running interpreter's major version is 2.
is_python2 = sys.version_info[0] == 2


def get_parser():
    """Create the command-line parser of the json-to-stm/ctm converter."""
    parser = argparse.ArgumentParser(description="convert json to sctm")

    # (name or flag, keyword arguments) in declaration order
    arguments = (
        ("json", {"type": str, "default": None, "nargs": "?", "help": "input trn"}),
        ("dict", {"type": str, "help": "dict"}),
        (
            "--num-spkrs",
            {"type": int, "default": 1, "nargs": "?", "help": "number of speakers"},
        ),
        ("--refs", {"type": str, "nargs": "*", "help": "ref for all speakers"}),
        ("--hyps", {"type": str, "nargs": "*", "help": "hyp for all outputs"}),
        ("--orig-stm", {"type": str, "nargs": "?", "help": "orig stm"}),
        ("--stm", {"type": str, "default": None, "nargs": "+", "help": "output stm"}),
        ("--ctm", {"type": str, "default": None, "nargs": "+", "help": "output ctm"}),
        (
            "--bpe",
            {
                "type": str,
                "default": None,
                "nargs": "?",
                "help": "BPE model if applicable",
            },
        ),
    )
    for name, options in arguments:
        parser.add_argument(name, **options)
    return parser


def main(args):
    """Convert a recognition json into stm (reference) and ctm (hypothesis).

    Steps:
      1. ``json2trn.convert`` writes the reference/hypothesis trn files
         (temporary ``ref_tmp.trn`` / ``hyp_tmp.trn`` when ``--refs`` /
         ``--hyps`` are not given).
      2. ``<blank> `` is removed from every trn file in place (``sed -i``
         keeps a ``.bak2`` backup).
      3. A word-level copy (see ``wrd_name``) is produced, either through
         ``spm_decode`` when ``--bpe`` is given or by re-joining characters.
      4. The word-level files are converted with ``trn2stm`` / ``trn2ctm``.
      5. Temporary trn files created in step 1 are removed.

    Args:
        args (list): Command-line arguments without the program name.

    """
    from utils import json2trn
    from utils import trn2ctm
    from utils import trn2stm

    parser = get_parser()
    args = parser.parse_args(args)
    if args.refs is None:
        refs = ["ref_tmp.trn"]
        del_ref = True
    else:
        refs = args.refs
        del_ref = False
    if args.hyps is None:
        hyps = ["hyp_tmp.trn"]
        del_hyp = True
    else:
        hyps = args.hyps
        del_hyp = False
    json2trn.convert(args.json, args.dict, refs, hyps, args.num_spkrs)
    for trn in refs + hyps:
        # We don't remove non-lang-syms because kaldi already removes them when scoring
        call_args = ["sed", "-i.bak2", "-r", "s/<blank> //g", trn]
        subprocess.check_call(call_args)
        if args.bpe is not None:
            with open(wrd_name(trn), "w") as out:
                with open(trn, "r") as spm_in:
                    spm_args = [
                        "spm_decode",
                        "--model=" + args.bpe,
                        "--input_format=piece",
                    ]
                    sed_args = ["sed", "-e", "s/▁/ /g"]
                    # Equivalent of "spm_decode < trn | sed > out": the
                    # decoder's stdout must feed sed's stdin, otherwise sed
                    # reads an empty pipe and the word-level file stays empty.
                    spm = subprocess.Popen(
                        spm_args, stdin=spm_in, stdout=subprocess.PIPE
                    )
                    sed = subprocess.Popen(sed_args, stdin=spm.stdout, stdout=out)
                    # drop the parent's copy of the pipe so that spm_decode
                    # gets SIGPIPE if sed exits early
                    spm.stdout.close()
                    sed.communicate()
                    # reap the decoder as well instead of leaving a zombie
                    spm.wait()
        else:
            # character-level units: remove the separating spaces, put a
            # space back in front of the first "(" and turn <space> into " "
            call_args = [
                "sed",
                "-e",
                "s/ //g",
                "-e",
                "s/(/ (/",
                "-e",
                "s/<space>/ /g",
                trn,
            ]
            with open(wrd_name(trn), "w") as out:
                sed = subprocess.Popen(call_args, stdout=out)
                sed.communicate()
    for trn, stm in zip(refs, args.stm):
        trn2stm.convert(wrd_name(trn), stm, args.orig_stm)
    if del_ref:
        # temporary reference: remove the trn, its sed backup and the word copy
        os.remove(refs[0])
        os.remove(refs[0] + ".bak2")
        os.remove(wrd_name(refs[0]))

    for trn, ctm in zip(hyps, args.ctm):
        trn2ctm.convert(wrd_name(trn), ctm)
    if del_hyp:
        # temporary hypothesis: same clean-up as for the reference
        os.remove(hyps[0])
        os.remove(hyps[0] + ".bak2")
        os.remove(wrd_name(hyps[0]))


def wrd_name(trn):
    """Return ``trn`` with ``.wrd`` inserted in front of its last extension.

    For example ``hyp.trn`` becomes ``hyp.wrd.trn``.  A name without any dot
    is treated as consisting of an extension only (``x`` -> ``.wrd.x``).
    """
    stem, _, extension = trn.rpartition(".")
    return stem + ".wrd." + extension


if __name__ == "__main__":
    main(sys.argv[1:])


================================================
FILE: egs/espnet_utils/json2text.py
================================================
#!/usr/bin/env python3
# encoding: utf-8

# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

import argparse
import codecs
import json
import logging

from espnet.utils.cli_utils import get_commandline_args


def get_parser():
    """Create the parser: four positional paths (json, dict, ref, hyp)."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="convert ASR recognized json to text",
    )
    # (argument name, help text) in positional order
    positionals = (
        ("json", "json files"),
        ("dict", "dict"),
        ("ref", "ref"),
        ("hyp", "hyp"),
    )
    for name, help_text in positionals:
        parser.add_argument(name, type=str, help=help_text)
    return parser


if __name__ == "__main__":
    # Write "<utt-id> <tokens>" lines for the recognized token ids (hyp) and,
    # when the json carries them, for the reference token ids (ref).
    args = get_parser().parse_args()

    # logging info
    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    logging.basicConfig(level=logging.INFO, format=logfmt)
    logging.info(get_commandline_args())

    logging.info("reading %s", args.json)
    with codecs.open(args.json, "r", encoding="utf-8") as f:
        j = json.load(f)

    # The token list is the first field of every dictionary line, with
    # <blank> placed at index 0 and <eos> appended at the end.
    logging.info("reading %s", args.dict)
    with codecs.open(args.dict, "r", encoding="utf-8") as f:
        dictionary = f.readlines()
    char_list = [entry.split(" ")[0] for entry in dictionary]
    char_list.insert(0, "<blank>")
    char_list.append("<eos>")

    logging.info("writing hyp trn to %s", args.hyp)
    logging.info("writing ref trn to %s", args.ref)
    # context managers close (and thereby flush) both output files
    with codecs.open(args.hyp, "w", encoding="utf-8") as h, codecs.open(
        args.ref, "w", encoding="utf-8"
    ) as r:
        for x in j["utts"]:
            output = j["utts"][x]["output"][0]

            seq = [char_list[int(i)] for i in output["rec_tokenid"].split()]
            h.write(x + " " + " ".join(seq).replace("<eos>", "") + "\n")

            # the reference is written only when token ids are available
            if "tokenid" in output:
                seq = [char_list[int(i)] for i in output["tokenid"].split()]
                r.write(x + " " + " ".join(seq).replace("<eos>", "") + "\n")


================================================
FILE: egs/espnet_utils/json2trn.py
================================================
#!/usr/bin/env python3
# encoding: utf-8

# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
#           2018 Xuankai Chang (Shanghai Jiao Tong University)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

import argparse
import codecs
import json
import logging
import sys

from espnet.utils.cli_utils import get_commandline_args


def get_parser():
    """Create the command-line parser of the json-to-trn converter."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="convert a json to a transcription file with a token dictionary",
    )
    # positional inputs
    for name, help_text in (("json", "json files"), ("dict", "dict")):
        parser.add_argument(name, type=str, help=help_text)

    parser.add_argument("--num-spkrs", type=int, default=1, help="number of speakers")

    # output file lists, one entry per speaker
    for flag, help_text in (
        ("--refs", "ref for all speakers"),
        ("--hyps", "hyp for all outputs"),
    ):
        parser.add_argument(flag, type=str, nargs="+", help=help_text)
    return parser


def main(args):
    """Parse the command-line ``args`` and hand the result to ``convert``."""
    opts = get_parser().parse_args(args)
    convert(opts.json, opts.dict, opts.refs, opts.hyps, opts.num_spkrs)


def convert(jsonf, dic, refs, hyps, num_spkrs=1):
    """Write reference and hypothesis trn files from a recognition json.

    Every output line has the form ``<tokens> (<speaker>-<utt-id>)`` where
    ``-`` inside the speaker id is replaced by ``_``.

    Args:
        jsonf (str): Path of the json file; its ``utts`` entry is read.
        dic (str): Path of the token dictionary (first field of each line).
        refs (list): Output reference trn paths, one per speaker.
        hyps (list): Output hypothesis trn paths, one per speaker.
        num_spkrs (int): Number of speakers; must equal ``len(refs)`` and
            ``len(hyps)``.

    """
    n_ref = len(refs)
    n_hyp = len(hyps)
    assert n_ref == n_hyp
    assert n_ref == num_spkrs

    # logging info
    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    logging.basicConfig(level=logging.INFO, format=logfmt)
    logging.info(get_commandline_args())

    logging.info("reading %s", jsonf)
    with codecs.open(jsonf, "r", encoding="utf-8") as f:
        j = json.load(f)

    # token id -> token: <blank> takes index 0 and <eos> is appended last
    logging.info("reading %s", dic)
    with codecs.open(dic, "r", encoding="utf-8") as f:
        dictionary = f.readlines()
    char_list = [entry.split(" ")[0] for entry in dictionary]
    char_list.insert(0, "<blank>")
    char_list.append("<eos>")

    for ns in range(num_spkrs):
        # the context managers close both files even if an utterance is malformed
        with codecs.open(hyps[ns], "w", encoding="utf-8") as hyp_file, codecs.open(
            refs[ns], "w", encoding="utf-8"
        ) as ref_file:
            for x in j["utts"]:
                utt = j["utts"][x]
                # with several speakers "output" holds one list per speaker
                if num_spkrs == 1:
                    output = utt["output"][0]
                else:
                    output = utt["output"][ns][0]

                # recognition hypothesis
                seq = [char_list[int(i)] for i in output["rec_tokenid"].split()]
                # In the recognition hypothesis,
                # the <eos> symbol is usually attached in the last part of the
                # sentence and it is removed here.
                hyp_file.write(" ".join(seq).replace("<eos>", ""))
                utt_tag = " (" + utt["utt2spk"].replace("-", "_") + "-" + x + ")\n"
                hyp_file.write(utt_tag)

                # reference
                # Unlike the recognition hypothesis,
                # the reference is directly generated from a token without
                # dictionary to avoid to include <unk> symbols in the reference
                # to make scoring normal.
                # The detailed discussion can be found at
                # https://github.com/espnet/espnet/issues/993
                ref_file.write(output["token"] + utt_tag)


if __name__ == "__main__":
    main(sys.argv[1:])


================================================
FILE: egs/espnet_utils/json2trn_mt.py
================================================
#!/usr/bin/env python3
# encoding: utf-8

# Copyright 2018 Kyoto University (Hirofumi Inaguma)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

# NOTE: this is made for machine translation

import argparse
import codecs
import json
import logging
import sys

from espnet.utils.cli_utils import get_commandline_args


def get_parser():
    """Create the command-line parser of the MT json-to-trn converter."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="convert json to machine translation transcription",
    )
    parser.add_argument("json", type=str, help="json files")
    parser.add_argument("dict", type=str, help="dict for target language")

    # output file lists
    for flag, help_text in (
        ("--refs", "ref for all speakers"),
        ("--hyps", "hyp for all outputs"),
        ("--srcs", "src for all outputs"),
    ):
        parser.add_argument(flag, type=str, nargs="+", help=help_text)

    # optional source dictionary; False when the flag is absent
    parser.add_argument(
        "--dict-src",
        nargs="?",
        default=False,
        type=str,
        help="dict for source language",
    )
    return parser


def main(args):
    """Parse the command-line ``args`` and hand the result to ``convert``."""
    opts = get_parser().parse_args(args)
    convert(opts.json, opts.dict, opts.refs, opts.hyps, opts.srcs, opts.dict_src)


def convert(jsonf, dic, refs, hyps, srcs, dic_src):
    """Write reference, hypothesis and source trn files for MT scoring.

    Every output line has the form ``<tokens> (<speaker>-<utt-id>)`` where
    ``-`` inside the speaker id is replaced by ``_``.  Only the first entry
    of ``refs``, ``hyps`` and ``srcs`` is used.

    Args:
        jsonf (str): Path of the json file; its ``utts`` entry is read.
        dic (str): Path of the target-language token dictionary.
        refs (list): Reference trn paths (required).
        hyps (list): Hypothesis trn paths; nothing is written when empty/None.
        srcs (list): Source trn paths; nothing is written when empty/None.
        dic_src (str): Path of the source-language dictionary.  When falsy,
            source token ids are looked up in the target dictionary.

    """
    # logging info
    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    logging.basicConfig(level=logging.INFO, format=logfmt)
    logging.info(get_commandline_args())

    logging.info("reading %s", jsonf)
    with codecs.open(jsonf, "r", encoding="utf-8") as f:
        j = json.load(f)

    # target dictionary: <blank> takes index 0 and <eos> is appended last
    logging.info("reading %s", dic)
    with codecs.open(dic, "r", encoding="utf-8") as f:
        dictionary = f.readlines()
    char_list_tgt = [entry.split(" ")[0] for entry in dictionary]
    char_list_tgt.insert(0, "<blank>")
    char_list_tgt.append("<eos>")

    # source dictionary (optional); falls back to the target dictionary
    char_list_src = char_list_tgt
    if dic_src:
        logging.info("reading %s", dic_src)
        with codecs.open(dic_src, "r", encoding="utf-8") as f:
            dictionary = f.readlines()
        char_list_src = [entry.split(" ")[0] for entry in dictionary]
        char_list_src.insert(0, "<blank>")
        char_list_src.append("<eos>")

    hyp_file = ref_file = src_file = None
    try:
        if hyps:
            hyp_file = codecs.open(hyps[0], "w", encoding="utf-8")
        ref_file = codecs.open(refs[0], "w", encoding="utf-8")
        if srcs:
            src_file = codecs.open(srcs[0], "w", encoding="utf-8")

        for x in j["utts"]:
            utt = j["utts"][x]
            output = utt["output"][0]
            utt_tag = " (" + utt["utt2spk"].replace("-", "_") + "-" + x + ")\n"

            # hyps
            if hyp_file is not None:
                seq = [char_list_tgt[int(i)] for i in output["rec_tokenid"].split()]
                hyp_file.write(" ".join(seq).replace("<eos>", ""))
                hyp_file.write(utt_tag)

            # ref
            seq = [char_list_tgt[int(i)] for i in output["tokenid"].split()]
            ref_file.write(" ".join(seq).replace("<eos>", ""))
            ref_file.write(utt_tag)

            # src: written only when an output file was requested; without
            # this check a json carrying "tokenid_src" would hit an undefined
            # file object when --srcs is omitted
            if src_file is not None and "tokenid_src" in output:
                seq = [char_list_src[int(i)] for i in output["tokenid_src"].split()]
                src_file.write(" ".join(seq).replace("<eos>", ""))
                src_file.write(utt_tag)
    finally:
        # close whatever was opened, also when an utterance is malformed
        for opened in (hyp_file, ref_file, src_file):
            if opened is not None:
                opened.close()


if __name__ == "__main__":
    main(sys.argv[1:])


================================================
FILE: egs/espnet_utils/json2trn_wo_dict.py
================================================
#!/usr/bin/env python3
# encoding: utf-8

# Copyright 2019 Okayama University (Katsuki Inoue)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

import argparse
import codecs
import json
import logging
import sys

from espnet.utils.cli_utils import get_commandline_args


def get_parser():
    """Create the command-line parser of the dictionary-free trn converter."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="convert a json to a transcription file with a token dictionary",
    )
    parser.add_argument("json", type=str, help="json files")
    parser.add_argument("--num-spkrs", type=int, default=1, help="number of speakers")

    # output file lists, one entry per speaker
    for flag, help_text in (
        ("--refs", "ref for all speakers"),
        ("--hyps", "hyp for all outputs"),
    ):
        parser.add_argument(flag, type=str, nargs="+", help=help_text)
    return parser


def main(args):
    """Parse the command-line ``args`` and hand the result to ``convert``."""
    opts = get_parser().parse_args(args)
    convert(opts.json, opts.refs, opts.hyps, opts.num_spkrs)


def convert(jsonf, refs, hyps, num_spkrs=1):
    """Write reference and hypothesis trn files from the json text fields.

    No token dictionary is involved: the hypothesis comes from ``rec_text``
    and the reference from ``text``.  Every output line has the form
    ``<text> (<utt-id>)`` where ``-`` inside the utterance id is replaced by
    ``_``.

    Args:
        jsonf (str): Path of the json file; its ``utts`` entry is read.
        refs (list): Output reference trn paths, one per speaker.
        hyps (list): Output hypothesis trn paths, one per speaker.
        num_spkrs (int): Number of speakers; must equal ``len(refs)`` and
            ``len(hyps)``.

    """
    n_ref = len(refs)
    n_hyp = len(hyps)
    assert n_ref == n_hyp
    assert n_ref == num_spkrs

    # logging info
    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    logging.basicConfig(level=logging.INFO, format=logfmt)
    logging.info(get_commandline_args())

    logging.info("reading %s", jsonf)
    with codecs.open(jsonf, "r", encoding="utf-8") as f:
        j = json.load(f)

    for ns in range(num_spkrs):
        # the context managers close both files even if an utterance is malformed
        with codecs.open(hyps[ns], "w", encoding="utf-8") as hyp_file, codecs.open(
            refs[ns], "w", encoding="utf-8"
        ) as ref_file:
            for x in j["utts"]:
                # with several speakers "output" holds one list per speaker
                if num_spkrs == 1:
                    output = j["utts"][x]["output"][0]
                else:
                    output = j["utts"][x]["output"][ns][0]
                utt_tag = " (" + x.replace("-", "_") + ")\n"

                # recognition hypothesis: the <eos> symbol that is usually
                # attached to the end of the sentence is removed
                hyp_file.write(output["rec_text"].replace("<eos>", ""))
                hyp_file.write(utt_tag)

                # reference: the text field is written as it is
                ref_file.write(output["text"] + utt_tag)


if __name__ == "__main__":
    main(sys.argv[1:])


================================================
FILE: egs/espnet_utils/k2/add_lex_disambig.pl
================================================
#!/usr/bin/env perl
#  Copyright 2010-2011  Microsoft Corporation
#            2013-2016  Johns Hopkins University (author: Daniel Povey)
#                 2015  Hainan Xu
#                 2015  Guoguo Chen

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# Adds disambiguation symbols to a lexicon.
# Outputs still in the normal lexicon format.
# Disambig syms are numbered #1, #2, #3, etc. (#0
# reserved for symbol in grammar).
# Outputs the number of disambig syms to the standard output.
# With the --pron-probs option, expects the second field
# of each lexicon line to be a pron-prob.
# With the --sil-probs option, expects three additional
# fields after the pron-prob, representing various components
# of the silence probability model.

# Option defaults: no pron-probs, no sil-probs, disambiguation symbols
# start at #1.
$pron_probs = 0;
$sil_probs = 0;
$first_allowed_disambig = 1;

# At most three passes over the front of @ARGV; every recognised option is
# shifted off so that only the two file names remain afterwards.
for ($n = 1; $n <= 3 && @ARGV > 0; $n++) {
  if ($ARGV[0] eq "--pron-probs") {
    $pron_probs = 1;
    shift @ARGV;
  }
  if ($ARGV[0] eq "--sil-probs") {
    $sil_probs = 1;
    shift @ARGV;
  }
  if ($ARGV[0] eq "--first-allowed-disambig") {
    # "0 +" forces a numeric interpretation of the option value
    $first_allowed_disambig = 0 + $ARGV[1];
    if ($first_allowed_disambig < 1) {
      die "add_lex_disambig.pl: invalid --first-allowed-disambig option: $first_allowed_disambig\n";
    }
    # drop both the option name and its value
    shift @ARGV;
    shift @ARGV;
  }
}

# After option parsing exactly two arguments must remain: the input lexicon
# and the output lexicon.  Otherwise print the usage message and stop.
if (@ARGV != 2) {
  die "Usage: add_lex_disambig.pl [opts] <lexicon-in> <lexicon-out>\n" .
    "This script adds disambiguation symbols to a lexicon in order to\n" .
    "make decoding graphs determinizable; it adds pseudo-phone\n" .
    "disambiguation symbols #1, #2 and so on at the ends of phones\n" .
    "to ensure that all pronunciations are different, and that none\n" .
    "is a prefix of another.\n" .
    "It prints to the standard output the number of the largest-numbered\n" .
    "disambiguation symbol that was used.\n" .
    "\n" .
    "Options:   --pron-probs       Expect pronunciation probabilities in the 2nd field\n" .
    "           --sil-probs        [should be with --pron-probs option]\n" .
    "                              Expect 3 extra fields after the pron-probs, for aspects of\n" .
    "                              the silence probability model\n" .
    "           --first-allowed-disambig <n>  The number of the first disambiguation symbol\n" .
    "                              that this script is allowed to add.  By default this is\n" .
    "                              #1, but you can set this to a larger value using this option.\n" .
    "e.g.:\n" .
    " add_lex_disambig.pl lexicon.txt lexicon_disambig.txt\n" .
    " add_lex_disambig.pl --pron-probs lexiconp.txt lexiconp_disambig.txt\n" .
    " add_lex_disambig.pl --pron-probs --sil-probs lexiconp_silprob.txt lexiconp_silprob_disambig.txt\n";
}


# The two remaining arguments: input lexicon and output lexicon.
$lexfn = shift @ARGV;
$lexoutfn = shift @ARGV;

open(L, "<$lexfn") || die "Error opening lexicon $lexfn";

# (1)  Read in the lexicon.
# Every line is split on whitespace and re-joined with single spaces, so
# @L holds whitespace-normalised lexicon lines.
@L = ( );
while(<L>) {
    @A = split(" ", $_);
    push @L, join(" ", @A);
}

# (2) Work out the count of each phone-sequence in the
# lexicon.

foreach $l (@L) {
    @A = split(" ", $l);
    shift @A; # Remove word.
    if ($pron_probs) {
      $p = shift @A;
      if (!($p > 0.0 && $p <= 1.0)) { die "Bad lexicon line $l (expecting pron-prob as second field)"; }
    }
    if ($sil_probs) {
      $silp = shift @A;
      if (!($silp > 0.0 && $silp <= 1.0)) { die "Bad lexicon line $l for silprobs"; }
      $correction = shift @A;
      if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; }
      $correction = shift @A;
      if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; }
    }
    if (!(@A)) {
      die "Bad lexicon line $1, no phone in phone list";
    }
    $count{join(" ",@A)}++;
}

# (3) For each left sub-sequence of each phone-sequence, note down
# that it exists (for identifying prefixes of longer strings).

foreach $l (@L) {
    @A = split(" ", $l);
    # Strip everything that precedes the phones: the word, the pron-prob (if
    # present) and the three silence-model numbers (if present).
    my $num_leading_fields = 1 + ($pron_probs ? 1 : 0) + ($sil_probs ? 3 : 0);
    splice(@A, 0, $num_leading_fields);
    # Register every proper prefix of the phone sequence, from the longest
    # one down to the empty sequence.
    for (my $prefix_len = $#A; $prefix_len >= 0; $prefix_len--) {
        $issubseq{join(" ", @A[0 .. $prefix_len - 1])} = 1;
    }
}

# (4) For each entry in the lexicon:
#  if the phone sequence is unique and is not a
#  prefix of another word, no disambig symbol.
#  Else output #1, or #2, #3, ... if the same phone-seq
#  has already been assigned a disambig symbol.


open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n";

# max_disambig will always be the highest-numbered disambiguation symbol that
# has been used so far.
$max_disambig = $first_allowed_disambig - 1;

foreach $l (@L) {
  # Split the normalized line back into word, optional probabilities and phones.
  @A = split(" ", $l);
  $word = shift @A;
  if ($pron_probs) {
    $pron_prob = shift @A;
  }
  if ($sil_probs) {
    $sil_word_prob = shift @A;
    $word_sil_correction = shift @A;
    $prev_nonsil_correction = shift @A;
  }
  $phnseq = join(" ", @A);
  if (!defined $issubseq{$phnseq}
      && $count{$phnseq} == 1) {
    ;                           # Unique and not a prefix of anything: do nothing.
  } else {
    if ($phnseq eq "") {        # need disambig symbols for the empty string
      # that are not use anywhere else.
      $max_disambig++;
      $reserved_for_the_empty_string{$max_disambig} = 1;
      $phnseq = "#$max_disambig";
    } else {
      $cur_disambig = $last_used_disambig_symbol_of{$phnseq};
      if (!defined $cur_disambig) {
        $cur_disambig = $first_allowed_disambig;
      } else {
        $cur_disambig++;           # Get a number that has not been used yet for
                                   # this phone sequence.
      }
      # Skip over symbols already handed out to empty pronunciations.
      while (defined $reserved_for_the_empty_string{$cur_disambig}) {
        $cur_disambig++;
      }
      if ($cur_disambig > $max_disambig) {
        $max_disambig = $cur_disambig;
      }
      $last_used_disambig_symbol_of{$phnseq} = $cur_disambig;
      $phnseq = $phnseq . " #" . $cur_disambig;
    }
  }
  # Emit the line with the same leading fields it came in with.
  if ($pron_probs) {
    if ($sil_probs) {
      print O "$word\t$pron_prob\t$sil_word_prob\t$word_sil_correction\t$prev_nonsil_correction\t$phnseq\n";
    } else {
      print O "$word\t$pron_prob\t$phnseq\n";
    }
  } else {
    print O "$word\t$phnseq\n";
  }
}

# Closing explicitly flushes the buffer; checking the result catches write
# errors (e.g. a full disk) that would otherwise leave a truncated lexicon
# behind while the script still exits successfully.
close(O) || die "Error closing lexicon file $lexoutfn: $!\n";

# The caller reads the highest-numbered disambiguation symbol from stdout.
print $max_disambig . "\n";


================================================
FILE: egs/espnet_utils/k2/apply_map.pl
================================================
#!/usr/bin/env perl
use warnings; #sed replacement for -w perl parameter
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.

# This program is a bit like ./sym2int.pl in that it applies a map
# to things in a file, but it's a bit more general in that it doesn't
# assume the things being mapped to are single tokens, they could
# be sequences of tokens.  See the usage message.


$permissive = 0;

# Parse the leading options.  The loop body is run a few times so that -f and
# --permissive are recognized in whichever order they are given.
for ($x = 0; $x <= 2; $x++) {

  if (@ARGV > 0 && $ARGV[0] eq "-f") {
    shift @ARGV;
    $field_spec = shift @ARGV;
    if (!defined $field_spec) {
      die "apply_map.pl: the -f option requires an argument\n";
    }
    if ($field_spec =~ m/^\d+$/) {
      $field_begin = $field_spec - 1; $field_end = $field_spec - 1;
    }
    if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesy (properly, 1-10)
      if ($1 ne "") {
        $field_begin = $1 - 1;  # Change to zero-based indexing.
      }
      if ($2 ne "") {
        $field_end = $2 - 1;    # Change to zero-based indexing.
      }
    }
    if (!defined $field_begin && !defined $field_end) {
      die "Bad argument to -f option: $field_spec";
    }
  }

  if (@ARGV > 0 && $ARGV[0] eq '--permissive') {
    shift @ARGV;
    # Mapping is optional (missing key is printed to output)
    $permissive = 1;
  }
}

if(@ARGV != 1) {
  print STDERR "Invalid usage: " . join(" ", @ARGV) . "\n";
  print STDERR <<'EOF';
Usage: apply_map.pl [options] map <input >output
 options: [-f <field-range> ] [--permissive]
   This applies a map to some specified fields of some input text:
   For each line in the map file: the first field is the thing we
   map from, and the remaining fields are the sequence we map it to.
   The -f (field-range) option says which fields of the input file the
   map should apply to.
   If the --permissive option is supplied, fields which are not present
   in the map will be left as they were.
 Applies the map 'map' to all input text, where each line of the map
 is interpreted as a map from the first field to the list of the other fields
 Note: <field-range> can look like 4-5, or 4-, or 5-, or 1, it means the field
 range in the input to apply the map to.
 e.g.: echo A B | apply_map.pl a.txt
 where a.txt is:
 A a1 a2
 B b
 will produce:
 a1 a2 b
EOF
  exit(1);
}

($map_file) = @ARGV;
# Three-argument open: the file name is taken literally, so characters such as
# a leading '>' or trailing '|' in the path cannot change the open mode.
open(my $map_fh, "<", $map_file) || die "Error opening map file $map_file: $!";

# Load the map: first field is the key, the rest (possibly empty) is the value.
while (<$map_fh>) {
  @A = split(" ", $_);
  @A >= 1 || die "apply_map.pl: empty line.";
  $i = shift @A;
  $o = join(" ", @A);
  $map{$i} = $o;
}
close($map_fh);

# Rewrite every selected field of every input line through the map.
while(<STDIN>) {
  @A = split(" ", $_);
  for ($x = 0; $x < @A; $x++) {
    if ( (!defined $field_begin || $x >= $field_begin)
         && (!defined $field_end || $x <= $field_end)) {
      $a = $A[$x];
      if (!defined $map{$a}) {
        if (!$permissive) {
          die "apply_map.pl: undefined key $a in $map_file\n";
        } else {
          # Permissive mode: keep the field unchanged, but say so.
          print STDERR "apply_map.pl: warning! missing key $a in $map_file\n";
        }
      } else {
        $A[$x] = $map{$a};
      }
    }
  }
  print join(" ", @A) . "\n";
}


================================================
FILE: egs/espnet_utils/k2/fstaddselfloops.pl
================================================
#!/usr/bin/env perl

# Copyright 2020 Xiaomi Corporation (Author: Junbo Zhang)
# Apache 2.0

use strict;
use warnings;

my $Usage = <<EOU;
fstaddselfloops.pl:
Adds self-loops to states of an FST to propagate disambiguation symbols through it.
They are added on each final state and each state with non-epsilon output symbols
on at least one arc out of the state.

Usage: local/fstaddselfloops.pl <wdisambig_phone> <wdisambig_word> < <openfst_text>
 e.g.: cat L_disambig.txt | local/fstaddselfloops.pl 347 200004 > L_disambig_with_loop.txt
EOU

if (@ARGV != 2) {
  die $Usage;
}

# Integer ids of the word-disambiguation symbol on the input (phone) side and
# on the output (word) side of the self-loop arcs.
my $wdisambig_phone = shift @ARGV;
my $wdisambig_word = shift @ARGV;

# Copy the FST text through unchanged while collecting the states that need
# a self-loop.
my %states_needs_self_loops;
while (my $line = <>) {
    print $line;

    # split(" ") ignores leading whitespace, so an indented line is not
    # miscounted as having an extra (empty) first field.
    my @items = split(" ", $line);
    if (@items == 2) {
        # it is a final state
        $states_needs_self_loops{$items[0]} = 1;
    } elsif (@items == 5) {
        # arc line: src dst inlabel outlabel score
        my ($src, $outlabel) = @items[0, 3];
        $states_needs_self_loops{$src} = 1 if ($outlabel != 0);
    } else {
        chomp $line;
        die "fstaddselfloops.pl: invalid openfst line $.: '$line'\n";
    }
}

# Emit the self-loops in ascending state order.  Perl randomizes hash order
# between runs, so sorting makes the generated file reproducible.
foreach my $state (sort { $a <=> $b } keys %states_needs_self_loops) {
    print "$state $state $wdisambig_phone $wdisambig_word 0.0\n";
}


================================================
FILE: egs/espnet_utils/k2/k2_prepare_lang.sh
================================================
#!/usr/bin/env bash
# Copyright 2012-2013  Johns Hopkins University (Author: Daniel Povey);
#                      Arnab Ghoshal
#                2014  Guoguo Chen
#                2015  Hainan Xu
#                2016  FAU Erlangen (Author: Axel Horndasch)
#                2020  Xiaomi Corporation (Author: Junbo Zhang)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This script prepares a directory such as data/lang/, in the standard format,
# given a source directory containing a dictionary lexicon.txt in a form like:
# word phone1 phone2 ... phoneN
# per line (alternate prons would be separate lines), or a dictionary with probabilities
# called lexiconp.txt in a form:
# word pron-prob phone1 phone2 ... phoneN
# (with 0.0 < pron-prob <= 1.0); note: if lexiconp.txt exists, we use it even if
# lexicon.txt exists.
# and also files silence_phones.txt, nonsilence_phones.txt, optional_silence.txt
# and extra_questions.txt
# Here, silence_phones.txt and nonsilence_phones.txt are lists of silence and
# non-silence phones respectively (where silence includes various kinds of
# noise, laugh, cough, filled pauses etc., and nonsilence phones includes the
# "real" phones.)
# In each line of those files is a list of phones, and the phones on each line
# are assumed to correspond to the same "base phone", i.e. they will be
# different stress or tone variations of the same basic phone.
# The file "optional_silence.txt" contains just a single phone (typically SIL)
# which is used for optional silence in the lexicon.
# extra_questions.txt might be empty; typically will consist of lists of phones,
# all members of each list with the same stress or tone; and also possibly a
# list for the silence phones.  This will augment the automatically generated
# questions (note: the automatically generated ones will treat all the
# stress/tone versions of a phone the same, so will not "get to ask" about
# stress or tone).
#

# This script adds word-position-dependent phones and constructs a host of other
# derived files, that go in data/lang/.

# Begin configuration section.
# Default values of the options.  They are assigned before
# local/parse_options.sh is sourced below, which is presumably what lets the
# command line override them as --<name-with-dashes> <value> (confirm against
# the copy of parse_options.sh installed under local/).
num_sil_states=5
num_nonsil_states=3
position_dependent_phones=true
# position_dependent_phones is false also when position dependent phones and word_boundary.txt
# have been generated by another source
share_silence_phones=false  # if true, then share pdfs of different silence
                            # phones together.
sil_prob=0.5
num_extra_phone_disambig_syms=1 # Standard one phone disambiguation symbol is used for optional silence.
                                # Increasing this number does not harm, but is only useful if you later
                                # want to introduce this labels to L_disambig.fst


# end configuration sections

echo "$0 $@"  # Print the command line for logging
# NOTE(review): sil_prob is printed both before and after option parsing;
# this looks like leftover debug output.
echo $sil_prob
. local/parse_options.sh
echo $sil_prob
# Exactly four positional arguments are required after the options.
if [ $# -ne 4 ]; then
  echo "Usage: local/prepare_lang.sh <dict-src-dir> <oov-dict-entry> <tmp-dir> <lang-dir>"
  echo "e.g.: local/prepare_lang.sh data/local/dict <SPOKEN_NOISE> data/local/lang data/lang"
  echo "<dict-src-dir> should contain the following files:"
  echo " extra_questions.txt  lexicon.txt nonsilence_phones.txt  optional_silence.txt  silence_phones.txt"
  echo "See http://kaldi-asr.org/doc/data_prep.html#data_prep_lang_creating for more info."
  echo "options: "
  echo "<dict-src-dir> may also, for the grammar-decoding case (see http://kaldi-asr.org/doc/grammar.html)"
  echo "contain a file nonterminals.txt containing symbols like #nonterm:contact_list, one per line."
  echo "     --num-sil-states <number of states>             # default: 5, #states in silence models."
  echo "     --num-nonsil-states <number of states>          # default: 3, #states in non-silence models."
  echo "     --position-dependent-phones (true|false)        # default: true; if true, use _B, _E, _S & _I"
  echo "                                                     # markers on phones to indicate word-internal positions. "
  echo "     --share-silence-phones (true|false)             # default: false; if true, share pdfs of "
  echo "                                                     # all silence phones. "
  echo "     --sil-prob <probability of silence>             # default: 0.5 [must have 0 <= silprob < 1]"
  exit 1;
fi

# Positional arguments: dictionary source dir, OOV word, scratch dir, output dir.
srcdir=$1
oov_word=$2
tmpdir=$3
dir=$4

# Path expansions below are quoted so that a directory name containing spaces
# or glob characters cannot be split into several arguments (most importantly
# for the recursive rm).

# Start from an empty phones/ directory so no stale files survive a re-run.
if [ -d "$dir/phones" ]; then
  rm -r "$dir/phones"
fi
mkdir -p "$dir" "$tmpdir" "$dir/phones"

# The presence of lexiconp_silprob.txt switches on the silence-probability path.
silprob=false
[ -f "$srcdir/lexiconp_silprob.txt" ] && silprob=true

[ -f path.sh ] && . ./path.sh

# lexicon.txt and lexiconp.txt are derived from each other when one is missing:
# dropping the 2nd field (pron-prob), or inserting a pron-prob of 1.0.
if [[ ! -f $srcdir/lexicon.txt ]]; then
  echo "**Creating $srcdir/lexicon.txt from $srcdir/lexiconp.txt"
  perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' < "$srcdir/lexiconp.txt" > "$srcdir/lexicon.txt" || exit 1;
fi
if [[ ! -f $srcdir/lexiconp.txt ]]; then
  echo "**Creating $srcdir/lexiconp.txt from $srcdir/lexicon.txt"
  perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < "$srcdir/lexicon.txt" > "$srcdir/lexiconp.txt" || exit 1;
fi

# unk_fst is not one of the options declared in this script, so it is normally
# unset; ${unk_fst:-} keeps the test well-defined in that case.
if [ ! -z "${unk_fst:-}" ] && [ ! -f "${unk_fst:-}" ]; then
  echo "$0: expected --unk-fst $unk_fst to exist as a file"
  exit 1
fi

if $position_dependent_phones; then
  # Create $tmpdir/lexiconp.txt from $srcdir/lexiconp.txt (or
  # $tmpdir/lexiconp_silprob.txt from $srcdir/lexiconp_silprob.txt) by
  # adding the markers _B, _E, _S, _I depending on word position.
  # In this recipe, these markers apply to silence also.
  # Do this starting from lexiconp.txt only.
  if "$silprob"; then
    # Same as the branch below, but five leading fields (word, pron-prob and
    # three silprob numbers) are carried through instead of two.  The
    # "|| exit 1" was missing here, so a malformed line (the embedded "die")
    # did not stop the script as it does in the other branch.
    perl -ane '@A=split(" ",$_); $w = shift @A; $p = shift @A; $silword_p = shift @A;
              $wordsil_f = shift @A; $wordnonsil_f = shift @A; @A>0||die;
         if(@A==1) { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_S\n"; }
         else { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_B ";
         for($n=1;$n<@A-1;$n++) { print "$A[$n]_I "; } print "$A[$n]_E\n"; } ' \
                < $srcdir/lexiconp_silprob.txt > $tmpdir/lexiconp_silprob.txt || exit 1;
  else
    perl -ane '@A=split(" ",$_); $w = shift @A; $p = shift @A; @A>0||die;
         if(@A==1) { print "$w $p $A[0]_S\n"; } else { print "$w $p $A[0]_B ";
         for($n=1;$n<@A-1;$n++) { print "$A[$n]_I "; } print "$A[$n]_E\n"; } ' \
         < $srcdir/lexiconp.txt > $tmpdir/lexiconp.txt || exit 1;
  fi

  # create $tmpdir/phone_map.txt
  # this has the format (on each line)
  # <original phone> <version 1 of original phone> <version 2> ...
  # where the versions depend on the position of the phone within a word.
  # For instance, we'd have:
  # AA AA_B AA_E AA_I AA_S
  # for (B)egin, (E)nd, (I)nternal and (S)ingleton
  # and in the case of silence
  # SIL SIL SIL_B SIL_E SIL_I SIL_S
  # [because SIL on its own is one of the variants; this is for when it doesn't
  #  occur inside a word but as an option in the lexicon.]

  # This phone map expands the phone lists into all the word-position-dependent
  # versions of the phone lists.
  # ("set -f" disables globbing so phone names are never expanded as patterns.)
  cat <(set -f; for x in `cat $srcdir/silence_phones.txt`; do for y in "" "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \
    <(set -f; for x in `cat $srcdir/nonsilence_phones.txt`; do for y in "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \
    > $tmpdir/phone_map.txt
else
  # No position markers: copy the lexicon as is and build an identity phone map.
  if "$silprob"; then
    cp $srcdir/lexiconp_silprob.txt $tmpdir/lexiconp_silprob.txt
  else
    cp $srcdir/lexiconp.txt $tmpdir/lexiconp.txt
  fi

  cat $srcdir/silence_phones.txt $srcdir/nonsilence_phones.txt | \
    awk '{for(n=1;n<=NF;n++) print $n; }' > $tmpdir/phones
  paste -d' ' $tmpdir/phones $tmpdir/phones > $tmpdir/phone_map.txt
fi


# Making monophone systems.
# Expand both phone lists through the phone map and write the result one
# phone per line: silence.txt first, then nonsilence.txt.
for phone_class in silence nonsilence; do
  cat $srcdir/${phone_class}_phones.txt | local/apply_map.pl $tmpdir/phone_map.txt | \
    awk '{for(n=1;n<=NF;n++) print $n;}' > $dir/phones/${phone_class}.txt
done
cp $srcdir/optional_silence.txt $dir/phones/optional_silence.txt

# A missing or empty extra_questions.txt is fine: cat's error is discarded
# and the output file simply ends up empty.
cat $srcdir/extra_questions.txt 2>/dev/null | local/apply_map.pl $tmpdir/phone_map.txt \
  >$dir/phones/extra_questions.txt

# Append one question per word-position marker, kept separate for
# non-silence and silence phones (silence also gets the unmarked variant).
if $position_dependent_phones; then
  for marker in _B _E _I _S; do
    (set -f; for phone in $(cat $srcdir/nonsilence_phones.txt); do printf '%s ' "$phone$marker"; done; echo) >>$dir/phones/extra_questions.txt
  done
  for marker in "" _B _E _I _S; do
    (set -f; for phone in $(cat $srcdir/silence_phones.txt); do printf '%s ' "$phone$marker"; done; echo) >>$dir/phones/extra_questions.txt
  done
fi

# add_lex_disambig.pl is responsible for adding disambiguation symbols to
# the lexicon, for telling us how many disambiguation symbols it used,
# and also for modifying the unknown-word's pronunciation (if the
# --unk-fst was provided) to the sequence "#1 #2 #3", and reserving those
# disambig symbols for that purpose.
# The #2 will later be replaced with the actual unk model.  The reason
# for the #1 and the #3 is for disambiguation and also to keep the
# FST compact.  If we didn't have the #1, we might have a different copy of
# the unk-model FST, or at least some of its arcs, for each start-state from
# which an <unk> transition comes (instead of per end-state, which is more compact);
# and adding the #3 prevents us from potentially having 2 copies of the unk-model
# FST due to the optional-silence [the last phone of any word gets 2 arcs].
if [ ! -z "$unk_fst" ]; then  # if the --unk-fst option was provided...
  if "$silprob"; then
    local/lang/internal/modify_unk_pron.py $tmpdir/lexiconp_silprob.txt "$oov_word" || exit 1
  else
    local/lang/internal/modify_unk_pron.py $tmpdir/lexiconp.txt "$oov_word" || exit 1
  fi
  # #1..#3 are taken by the unk pronunciation, so the lexicon starts at #4.
  unk_opt="--first-allowed-disambig 4"
else
  unk_opt=
fi

# add_lex_disambig.pl writes the disambiguated lexicon and prints the largest
# symbol number it used.  Its exit status is now checked: previously a failure
# left ndisambig empty and the arithmetic below silently turned that into 1.
if "$silprob"; then
  ndisambig=$(local/add_lex_disambig.pl $unk_opt --pron-probs --sil-probs $tmpdir/lexiconp_silprob.txt $tmpdir/lexiconp_silprob_disambig.txt) || exit 1
else
  ndisambig=$(local/add_lex_disambig.pl $unk_opt --pron-probs $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt) || exit 1
fi
# add (at least) one disambig symbol for silence in lexicon FST.
# ($(( )) replaces the deprecated $[ ] arithmetic syntax.)
ndisambig=$(( $ndisambig + $num_extra_phone_disambig_syms ))
echo $ndisambig > $tmpdir/lex_ndisambig

# Format of lexiconp_disambig.txt:
# !SIL	1.0   SIL_S
# <SPOKEN_NOISE>	1.0   SPN_S #1
# <UNK>	1.0  SPN_S #2
# <NOISE>	1.0  NSN_S
# !EXCLAMATION-POINT	1.0  EH2_B K_I S_I K_I L_I AH0_I M_I EY1_I SH_I AH0_I N_I P_I OY2_I N_I T_E

# Disambiguation symbols #0 .. #$ndisambig, one per line.
seq 0 $ndisambig | sed 's/^/#/' >$dir/phones/disambig.txt

# Create phone symbol table: <eps> gets id 0, then the silence, non-silence
# and disambiguation symbols are numbered consecutively in that order.
{ echo "<eps>"; cat $dir/phones/silence.txt $dir/phones/nonsilence.txt $dir/phones/disambig.txt; } | \
  awk '{print $1, NR-1;}' > $dir/phones.txt

# Create a file that describes the word-boundary information for
# each phone.  5 categories, chosen from the position suffix.
if $position_dependent_phones; then
  cat $dir/phones/silence.txt $dir/phones/nonsilence.txt | \
    awk '{
           if      (/_I$/) tag = "internal";
           else if (/_B$/) tag = "begin";
           else if (/_S$/) tag = "singleton";
           else if (/_E$/) tag = "end";
           else            tag = "nonword";
           print $1, tag;
         }' > $dir/phones/word_boundary.txt
else
  # word_boundary.txt might have been generated by another source
  if [ -f $srcdir/word_boundary.txt ]; then
    cp $srcdir/word_boundary.txt $dir/phones/word_boundary.txt
  fi
fi

# Create word symbol table.
# <s> and </s> are only needed due to the need to rescore lattices with
# ConstArpaLm format language model. They do not normally appear in G.fst or
# L.fst.

# In the silprob case $tmpdir/lexiconp.txt does not exist yet; derive it from
# lexiconp_silprob.txt by dropping fields 3-5 (the three silprob numbers).
if "$silprob"; then
  # remove the silprob
  # (every kept field is followed by a tab; the line is ended at the last field)
  cat $tmpdir/lexiconp_silprob.txt |\
    awk '{
      for(i=1; i<=NF; i++) {
        if(i!=3 && i!=4 && i!=5) printf("%s\t", $i); if(i==NF) print "";
      }
    }' > $tmpdir/lexiconp.txt
fi

# Build words.txt: <eps> is 0, the sorted unique words are numbered from 1,
# and #0, <s>, </s> take the next three ids.  The awk program exits with
# status 1 if <s> or </s> already occurs as a word, which triggers the
# trailing "|| exit 1".
cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq  | awk '
  BEGIN {
    print "<eps> 0";
  }
  {
    if ($1 == "<s>") {
      print "<s> is in the vocabulary!" | "cat 1>&2"
      exit 1;
    }
    if ($1 == "</s>") {
      print "</s> is in the vocabulary!" | "cat 1>&2"
      exit 1;
    }
    printf("%s %d\n", $1, NR);
  }
  END {
    printf("#0 %d\n", NR+1);
    printf("<s> %d\n", NR+2);
    printf("</s> %d\n", NR+3);
  }' > $dir/words.txt || exit 1;

# format of $dir/words.txt:
#<eps> 0
#a 1
#aa 2
#aarvark 3
#...

# The optional-silence phone is mandatory for the lexicon FST scripts.
silphone=$(cat $srcdir/optional_silence.txt) || exit 1;
if [ -z "$silphone" ]; then
  echo "You have no optional-silence phone; it is required in the current scripts"
  echo "but you may use the option --sil-prob 0.0 to stop it being used."
  exit 1;
fi

grammar_opts=

# Create the basic L.fst without disambiguation symbols, for use
# in training.
#
# Only the generator differs between the two cases; its text output is then
# mapped to integer ids (phones in field 3, words in field 4) by the shared
# tail of the pipeline.
if $silprob; then
  # Add silence probabilities (models the prob. of silence before and after each
  # word).  On some setups this helps a bit.  See local/dict_dir_add_pronprobs.sh
  # and where it's called in the example scripts (run.sh).
  local/make_lexicon_fst_silprob.py $grammar_opts --sil-phone=$silphone \
    $tmpdir/lexiconp_silprob.txt $srcdir/silprob.txt
else
  local/make_lexicon_fst.py $grammar_opts --sil-prob=$sil_prob --sil-phone=$silphone \
    $tmpdir/lexiconp.txt
fi | \
  local/sym2int.pl -f 3 $dir/phones.txt | \
  local/sym2int.pl -f 4 $dir/words.txt > $dir/L.fst.txt || exit 1;

# The binary compilation step is disabled in this recipe:
# fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
#   --keep_isymbols=false --keep_osymbols=false | \
# fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;

# The file oov.txt contains a word that we will map any OOVs to during
# training.
echo "$oov_word" > $dir/oov.txt || exit 1;
cat $dir/oov.txt | local/sym2int.pl $dir/words.txt >$dir/oov.int || exit 1;
# integer version of oov symbol, used in some scripts.


# the file wdisambig.txt contains a (line-by-line) list of the text-form of the
# disambiguation symbols that are used in the grammar and passed through by the
# lexicon.  At this stage it's hardcoded as '#0', but we're laying the groundwork
# for more generality (which probably would be added by another script).
# wdisambig_words.int contains the corresponding list interpreted by the
# symbol table words.txt, and wdisambig_phones.int contains the corresponding
# list interpreted by the symbol table phones.txt.
echo '#0' >$dir/phones/wdisambig.txt

# Integer ids of '#0' in the phone and word symbol tables.  A failed lookup
# now aborts here instead of handing empty arguments to later commands.
wdisambig_phone=$(local/sym2int.pl $dir/phones.txt <$dir/phones/wdisambig.txt) || exit 1;
wdisambig_word=$(local/sym2int.pl $dir/words.txt <$dir/phones/wdisambig.txt) || exit 1;

# Create these lists of phones in colon-separated integer list form too,
# for purposes of being given to programs as command-line options.
for f in silence nonsilence optional_silence disambig; do
  # The .int conversion is now checked like the .csl one below it.
  local/sym2int.pl $dir/phones.txt <$dir/phones/$f.txt >$dir/phones/$f.int || exit 1;
  local/sym2int.pl $dir/phones.txt <$dir/phones/$f.txt | \
   awk '{printf(":%d", $1);} END{printf "\n"}' | sed s/:// > $dir/phones/$f.csl || exit 1;
done

if [ -f $dir/phones/word_boundary.txt ]; then
  local/sym2int.pl -f 1 $dir/phones.txt <$dir/phones/word_boundary.txt \
    > $dir/phones/word_boundary.int || exit 1;
fi

silphonelist=`cat $dir/phones/silence.csl`
nonsilphonelist=`cat $dir/phones/nonsilence.csl`

# Create the lexicon FST with disambiguation symbols, and put it in lang_test.
# There is an extra step where we create a loop to "pass through" the
# disambiguation symbols from G.fst.

# Only the generator command differs between the two cases.  Its output is
# mapped to integer ids (phones in field 3, words in field 4) and then given
# the self-loops that pass the word-disambiguation symbol through.
if $silprob; then
  local/make_lexicon_fst_silprob.py $grammar_opts \
    --sil-phone=$silphone --sil-disambig="#$ndisambig" \
    $tmpdir/lexiconp_silprob_disambig.txt $srcdir/silprob.txt
else
  local/make_lexicon_fst.py $grammar_opts \
    --sil-prob=$sil_prob --sil-phone=$silphone --sil-disambig="#$ndisambig" \
    $tmpdir/lexiconp_disambig.txt
fi | \
  local/sym2int.pl -f 3 $dir/phones.txt | \
  local/sym2int.pl -f 4 $dir/words.txt | \
  local/fstaddselfloops.pl $wdisambig_phone $wdisambig_word > $dir/L_disambig.fst.txt || exit 1;

exit 0;


================================================
FILE: egs/espnet_utils/k2/parse_options.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey);
#                 Arnab Ghoshal, Karel Vesely

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# Parse command-line options.
# To be sourced by another script (as in ". parse_options.sh").
# Option format is: --option-name arg
# and shell variable "option_name" gets set to value "arg."
# The exception is --help, which takes no arguments, but prints the
# $help_message variable (if defined).


###
### The --config file options have lower priority to command line
### options, so we need to import them first...
###

# Now import all the configs specified by command-line, in left-to-right order.
# The loop stops one short of the last argument, so a "--config" found here
# always has a following argument to use as the file name.
for ((argpos=1; argpos<$#; argpos++)); do
  if [ "${!argpos}" == "--config" ]; then
    argpos_plus1=$((argpos+1))
    config=${!argpos_plus1}
    # Quoted: with an empty or space-containing value the unquoted test
    # degenerated (e.g. "[ ! -r ]" is simply false) and let a bad name through.
    [ ! -r "$config" ] && echo "$0: missing config '$config'" && exit 1
    . "$config"  # source the config file.
  fi
done


###
### Now we process the command line options
###
# Consume leading "--name value" pairs, assigning each value to the shell
# variable derived from the option name; stop at the first argument that is
# not an option.
while true; do
  [ -z "${1:-}" ] && break;  # break if there are no arguments
  case "$1" in
    # If the enclosing script is called with --help option, print the help
    # message and exit.  Scripts should put help messages in $help_message
    --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
      else printf "$help_message\n" 1>&2 ; fi;
      exit 0 ;;
    --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
      exit 1 ;;
    # If the first command-line argument begins with "--" (e.g. --foo-bar),
    # then work out the variable name as $name, which will equal "foo_bar".
    --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
      # Next we test whether the variable in question is undefined-- if so it's
      # an invalid option and we die.  Note: $0 evaluates to the name of the
      # enclosing script.
      # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
      # is undefined.  We then have to wrap this test inside "eval" because
      # foo_bar is itself inside a variable ($name).
      eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;

      # Every option takes a value.  Without this check a trailing option with
      # no value made "shift 2" fail without shifting anything, so the loop
      # saw the same argument again and never terminated.
      if [ $# -lt 2 ]; then
        echo "$0: option $1 requires an argument" 1>&2
        exit 1;
      fi

      oldval="`eval echo \\$$name`";
      # Work out whether we seem to be expecting a Boolean argument.
      if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
        was_bool=true;
      else
        was_bool=false;
      fi

      # Set the variable to the right value-- the escaped quotes make it work if
      # the option had spaces, like --cmd "queue.pl -sync y"
      eval $name=\"$2\";

      # Check that Boolean-valued arguments are really Boolean.
      if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
        exit 1;
      fi
      shift 2;
      ;;
  *) break;
  esac
done


# Guard against --cmd having been given an empty value, a common symptom of a
# scripting error: the variable is set, yet holds nothing.
if [ -n "${cmd+xxx}" ] && [ -z "$cmd" ]; then
  echo "$0: empty argument to --cmd option" 1>&2
  exit 1;
fi


true; # so this script returns exit code 0.


================================================
FILE: egs/espnet_utils/k2/sym2int.pl
================================================
#!/usr/bin/env perl
# Copyright 2010-2012 Microsoft Corporation  Johns Hopkins University (Author: Daniel Povey)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


$ignore_oov = 0;

# Parse the leading options; two passes so that --map-oov and -f are accepted
# in either order.  @ARGV is checked for emptiness before $ARGV[0] is read.
for($x = 0; $x < 2; $x++) {
  if (@ARGV > 0 && $ARGV[0] eq "--map-oov") {
    shift @ARGV;
    $map_oov = shift @ARGV;
    if (!defined $map_oov || $map_oov eq "-f" || $map_oov =~ m/words\.txt$/ || $map_oov eq "") {
      # disallow '-f', the empty string and anything ending in words.txt as the
      # OOV symbol because these are likely command-line errors.
      die "the --map-oov option requires an argument";
    }
  }
  if (@ARGV > 0 && $ARGV[0] eq "-f") {
    shift @ARGV;
    $field_spec = shift @ARGV;
    if (!defined $field_spec) {
      die "sym2int.pl: the -f option requires an argument\n";
    }
    if ($field_spec =~ m/^\d+$/) {
      $field_begin = $field_spec - 1; $field_end = $field_spec - 1;
    }
    if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesy (properly, 1-10)
      if ($1 ne "") {
        $field_begin = $1 - 1;  # Change to zero-based indexing.
      }
      if ($2 ne "") {
        $field_end = $2 - 1;    # Change to zero-based indexing.
      }
    }
    if (!defined $field_begin && !defined $field_end) {
      die "Bad argument to -f option: $field_spec";
    }
  }
}

$symtab = shift @ARGV;
if (!defined $symtab) {
  print STDERR "Usage: sym2int.pl [options] symtab [input transcriptions] > output transcriptions\n" .
    "options: [--map-oov <oov-symbol> ]  [-f <field-range> ]\n" .
      "note: <field-range> can look like 4-5, or 4-, or 5-, or 1.\n";
  # Stop here; previously execution fell through to open() with an undefined
  # file name and died with a misleading "Error opening symbol table" message.
  exit(1);
}
# Load the symbol table: one "<symbol> <integer>" pair per line.
open(F, "<$symtab") || die "Error opening symbol table file $symtab";
while(<F>) {
    @A = split(" ", $_);
    @A == 2 || die "bad line in symbol table file: $_";
    $sym2int{$A[0]} = $A[1] + 0;
}
close(F);

if (defined $map_oov && $map_oov !~ m/^\d+$/) { # not numeric-> look it up
  if (!defined $sym2int{$map_oov}) { die "OOV symbol $map_oov not defined."; }
  $map_oov = $sym2int{$map_oov};
}

$num_warning = 0;
$max_warning = 20;
# Counts symbols that had no table entry and no --map-oov fallback.
$num_undefined = 0;

while (<>) {
  @A = split(" ", $_);
  @B = ();
  for ($n = 0; $n < @A; $n++) {
    $a = $A[$n];
    if ( (!defined $field_begin || $n >= $field_begin)
         && (!defined $field_end || $n <= $field_end)) {
      $i = $sym2int{$a};
      if (!defined ($i)) {
        if (defined $map_oov) {
          if ($num_warning++ < $max_warning) {
            print STDERR "sym2int.pl: replacing $a with $map_oov\n";
            if ($num_warning == $max_warning) {
              print STDERR "sym2int.pl: not warning for OOVs any more times\n";
            }
          }
          $i = $map_oov;
        } else {
          # The output is unchanged (the field is printed empty), but the
          # data loss is no longer silent.  Warnings are capped like above.
          if ($num_undefined++ < $max_warning) {
            $pos = $n + 1;
            print STDERR "sym2int.pl: undefined symbol $a (in position $pos), printing an empty field\n";
          }
        }
      }
      $a = $i;
    }
    push @B, $a;
  }
  print join(" ", @B);
  print "\n";
}
if ($num_warning > 0) {
  print STDERR "** Replaced $num_warning instances of OOVs with $map_oov\n";
}
if ($num_undefined > 0) {
  print STDERR "** Printed $num_undefined undefined symbols as empty fields\n";
}

exit(0);


================================================
FILE: egs/espnet_utils/make_fbank.sh
================================================
#!/usr/bin/env bash

# Copyright 2018 Nagoya University (Tomoki Hayashi)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

# Begin configuration section.
# Every variable below is a default; parse_options.sh (sourced further down)
# is what lets callers override them from the command line.
nj=4                       # number of parallel jobs (see --nj in the help text)
fs=none                    # forwarded as --fs to compute-fbank-feats.py
fmax=                      # forwarded as --fmax (empty by default)
fmin=                      # forwarded as --fmin (empty by default)
n_mels=80                  # forwarded as --n_mels
n_fft=1024                 # forwarded as --n_fft
n_shift=512                # forwarded as --n_shift
win_length=                # forwarded as --win_length (empty by default)
window=hann                # forwarded as --window
write_utt2num_frames=true  # when true, <data-dir>/utt2num_frames is also written
cmd=run.pl                 # job launcher (see --cmd in the help text)
compress=true              # forwarded as --compress
normalize=16  # The bit-depth of the input wav files
filetype=mat # mat or hdf5
# End configuration section.

# Usage text, printed when the number of positional arguments is wrong.
help_message=$(cat <<EOF
Usage: $0 [options] <data-dir> [<log-dir> [<fbank-dir>] ]
e.g.: $0 data/train exp/make_fbank/train mfcc
Note: <log-dir> defaults to <data-dir>/log, and <fbank-dir> defaults to <data-dir>/data
Options:
  --nj <nj>                                        # number of parallel jobs
  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs.
  --filetype <mat|hdf5|sound.hdf5>                 # Specify the format of feats file
EOF
)
echo "$0 $*"  # Print the command line for logging

# parse_options.sh is sourced so it can modify the variables above and the
# positional parameters of this shell.
. parse_options.sh || exit 1;

# Between one and three positional arguments are accepted.
if [ $# -lt 1 ] || [ $# -gt 3 ]; then
    echo "${help_message}"
    exit 1;
fi

# From here on: abort on errors, on unset variables and on pipeline failures.
set -euo pipefail

# Positional arguments: <data-dir> [<log-dir> [<fbank-dir>]].
data=$1
if [ $# -ge 2 ]; then
  logdir=$2
else
  logdir=${data}/log
fi
if [ $# -ge 3 ]; then
  fbankdir=$3
else
  fbankdir=${data}/data
fi

# make $fbankdir an absolute pathname.
fbankdir=$(perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' ${fbankdir} ${PWD})

# use "name" as part of name of the archive.
name=$(basename ${data})

mkdir -p ${fbankdir} || exit 1;
mkdir -p ${logdir} || exit 1;

# Keep a copy of any previous feats.scp before it gets overwritten.
if [ -f ${data}/feats.scp ]; then
  mkdir -p ${data}/.backup
  echo "$0: moving $data/feats.scp to $data/.backup"
  mv ${data}/feats.scp ${data}/.backup
fi

scp=${data}/wav.scp

utils/validate_data_dir.sh --no-text --no-feats ${data} || exit 1;

# Split wav.scp into ${nj} per-job lists ${logdir}/wav.<n>.scp.
split_scps=""
for n in $(seq ${nj}); do
    split_scps="${split_scps} ${logdir}/wav.${n}.scp"
done

utils/split_scp.pl ${scp} ${split_scps} || exit 1;

# "JOB" in the option below is a placeholder for the per-job index
# (the job launcher is invoked with JOB=1:${nj}).
if ${write_utt2num_frames}; then
  write_num_frames_opt="--write-num-frames=ark,t:${logdir}/utt2num_frames.JOB"
else
  write_num_frames_opt=
fi

# File extension of the feature archives: .h5 for hdf5, .ark otherwise.
if [ "${filetype}" == hdf5 ]; then
    ext=h5
else
    ext=ark
fi

# Run compute-fbank-feats.py as ${nj} parallel jobs.  Two modes:
#   * a segments file exists: it is split per job and each job receives its
#     own segments.JOB together with the complete wav.scp;
#   * no segments file: wav.scp itself is split and each job reads wav.JOB.scp.
# Both modes write raw_fbank_${name}.JOB.{${ext},scp} into ${fbankdir}.
if [ -f ${data}/segments ]; then
    echo "$0 [info]: segments file exists: using that."
    split_segments=""
    for n in $(seq ${nj}); do
        split_segments="${split_segments} ${logdir}/segments.${n}"
    done

    utils/split_scp.pl ${data}/segments ${split_segments}

    ${cmd} JOB=1:${nj} ${logdir}/make_fbank_${name}.JOB.log \
        compute-fbank-feats.py \
            --fs ${fs} \
            --fmax ${fmax} \
            --fmin ${fmin} \
            --n_fft ${n_fft} \
            --n_shift ${n_shift} \
            --win_length ${win_length} \
            --window ${window} \
            --n_mels ${n_mels} \
            ${write_num_frames_opt} \
            --compress=${compress} \
            --filetype ${filetype} \
            --normalize ${normalize} \
            --segment=${logdir}/segments.JOB scp:${scp} \
            ark,scp:${fbankdir}/raw_fbank_${name}.JOB.${ext},${fbankdir}/raw_fbank_${name}.JOB.scp

else
  echo "$0: [info]: no segments file exists: assuming pcm.scp indexed by utterance."
  # NOTE(review): this repeats the wav.scp split that was already performed
  # earlier in the script; the result is the same set of wav.<n>.scp files.
  split_scps=""
  for n in $(seq ${nj}); do
    split_scps="${split_scps} ${logdir}/wav.${n}.scp"
  done

  utils/split_scp.pl ${scp} ${split_scps}

  ${cmd} JOB=1:${nj} ${logdir}/make_fbank_${name}.JOB.log \
      compute-fbank-feats.py \
          --fs ${fs} \
          --fmax ${fmax} \
          --fmin ${fmin} \
          --n_fft ${n_fft} \
          --n_shift ${n_shift} \
          --win_length ${win_length} \
          --window ${window} \
          --n_mels ${n_mels} \
          ${write_num_frames_opt} \
          --compress=${compress} \
          --filetype ${filetype} \
          --normalize ${normalize} \
          scp:${logdir}/wav.JOB.scp \
          ark,scp:${fbankdir}/raw_fbank_${name}.JOB.${ext},${fbankdir}/raw_fbank_${name}.JOB.scp
fi


# concatenate the .scp files together.
for n in $(seq ${nj}); do
    cat ${fbankdir}/raw_fbank_${name}.${n}.scp || exit 1;
done > ${data}/feats.scp || exit 1

# Merge the per-job frame counts into a single utt2num_frames file.
if ${write_utt2num_frames}; then
    for n in $(seq ${nj}); do
        cat ${logdir}/utt2num_frames.${n} || exit 1;
    done > ${data}/utt2num_frames || exit 1
    rm ${logdir}/utt2num_frames.* 2>/dev/null
fi

# Remove the per-job input lists.
rm -f ${logdir}/wav.*.scp ${logdir}/segments.* 2>/dev/null

# Write the filetype, this will be used for data2json.sh
echo ${filetype} > ${data}/filetype

# Sanity check: one feature entry is expected per utterance.  When a segments
# file exists the utterances are its lines, not the recordings in wav.scp,
# so the expected count has to come from there.
nf=$(wc -l < ${data}/feats.scp)
if [ -f ${data}/segments ]; then
    nu=$(wc -l < ${data}/segments)
else
    nu=$(wc -l < ${data}/wav.scp)
fi
if [ ${nf} -ne ${nu} ]; then
    echo "It seems not all of the feature files were successfully processed ($nf != $nu);"
    echo "consider using utils/fix_data_dir.sh $data"
fi

echo "Succeeded creating filterbank features for $name"


================================================
FILE: egs/espnet_utils/make_pair_json.py
================================================
#!/usr/bin/env python3
# encoding: utf-8

# Copyright 2020 Nagoya University (Wen-Chin Huang)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

import argparse
from io import open
import json
import logging
import sys

from espnet.utils.cli_utils import get_commandline_args


def get_parser():
    """Create the command line parser of the source/target json merger.

    Returns:
        argparse.ArgumentParser: Parser exposing ``--src-json``,
            ``--trg-json``, ``--num_utts``, ``--verbose``/``-V`` and
            ``--out``/``-O``.

    """
    # Each entry is (option flags, keyword arguments for add_argument); the
    # order is kept because it determines the order in the help output.
    option_table = [
        (
            ("--src-json",),
            {"type": str, "help": "Json file for the source speaker"},
        ),
        (
            ("--trg-json",),
            {
                "type": str,
                "default": None,
                "help": "Json file for the target speaker. "
                "If not specified, use source only.",
            },
        ),
        (
            ("--num_utts",),
            {
                "default": -1,
                "type": int,
                "help": "Number of utterances (take from head)",
            },
        ),
        (
            ("--verbose", "-V"),
            {"default": 1, "type": int, "help": "Verbose option"},
        ),
        (
            ("--out", "-O"),
            {
                "type": str,
                "help": "The output filename. If omitted, then output to sys.stdout",
            },
        ),
    ]

    cli = argparse.ArgumentParser(
        description="Merge source and target data.json files into one json file.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    for flags, settings in option_table:
        cli.add_argument(*flags, **settings)
    return cli


if __name__ == "__main__":
    # Script entry point: pair every source utterance with the target
    # utterance that carries the same number and write the result as one json.
    parser = get_parser()
    args = parser.parse_args()

    # logging info
    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    with open(args.src_json, "rb") as f:
        src_json = json.load(f)["utts"]
    if args.trg_json:
        with open(args.trg_json, "rb") as f:
            trg_json = json.load(f)["utts"]

    # get source and target speaker
    # Utterance ids look like "<speaker>_<number>"; the speaker is read from
    # the first id of each file.  With no source utterances there is nothing
    # to pair, so the lookup is skipped instead of failing on an empty list.
    if src_json:
        srcspk = list(src_json.keys())[0].split("_")[0]
        if args.trg_json:
            trgspk = list(trg_json.keys())[0].split("_")[0]

    count = 0
    data = {"utts": {}}
    # (dirty) loop through input only because in/out should have same files
    for k in src_json:
        # Everything after the first "_" identifies the utterance.
        number = "_".join(k.split("_")[1:])

        entry = {"input": src_json[srcspk + "_" + number]["input"]}

        if args.trg_json:
            # The target's input features become the output of the pair.
            entry["output"] = trg_json[trgspk + "_" + number]["input"]
            entry["output"][0]["name"] = "target1"

        data["utts"][number] = entry
        count += 1
        if args.num_utts > 0 and count >= args.num_utts:
            break

    dump_options = {
        "indent": 4,
        "ensure_ascii": False,
        "separators": (",", ": "),
    }
    if args.out is None:
        json.dump(data, sys.stdout, **dump_options)
    else:
        # The context manager guarantees the file is flushed and closed.
        with open(args.out, "w", encoding="utf-8") as out:
            json.dump(data, out, **dump_options)


================================================
FILE: egs/espnet_utils/make_stft.sh
================================================
#!/usr/bin/env bash

# Copyright 2018 Nagoya University (Tomoki Hayashi)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

# Begin configuration section.
# Every variable below is a default; parse_options.sh (sourced further down)
# is what lets callers override them from the command line.
nj=4                       # number of parallel jobs (see --nj in the help text)
fs=none                    # forwarded as --fs to compute-stft-feats.py
n_fft=1024                 # forwarded as --n_fft
n_shift=512                # forwarded as --n_shift
win_length=                # forwarded as --win_length (empty by default)
window=hann                # forwarded as --window
write_utt2num_frames=true  # when true, <data-dir>/utt2num_frames is also written
cmd=run.pl                 # job launcher (see --cmd in the help text)
compress=true              # forwarded as --compress
normalize=16  # The bit-depth of the input wav files
filetype=mat # mat or hdf5
# End configuration section.

# Usage text, printed when the number of positional arguments is wrong.
help_message=$(cat <<EOF
Usage: $0 [options] <data-dir> [<log-dir> [<stft-dir>] ]
e.g.: $0 data/train exp/make_stft/train stft
Note: <log-dir> defaults to <data-dir>/log, and <stft-dir> defaults to <data-dir>/data
Options:
  --nj <nj>                                        # number of parallel jobs
  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs.
  --filetype <mat|hdf5|sound.hdf5>                 # Specify the format of feats file
EOF
)
echo "$0 $*"  # Print the command line for logging

# parse_options.sh is sourced so it can modify the variables above and the
# positional parameters of this shell.
. parse_options.sh || exit 1;

# Between one and three positional arguments are accepted.
if [ $# -lt 1 ] || [ $# -gt 3 ]; then
    echo "${help_message}"
    exit 1;
fi

# From here on: abort on errors, on unset variables and on pipeline failures.
set -euo pipefail

# Positional arguments: <data-dir> [<log-dir> [<stft-dir>]].
data=$1
if [ $# -ge 2 ]; then
  logdir=$2
else
  logdir=${data}/log
fi
if [ $# -ge 3 ]; then
  stftdir=$3
else
  stftdir=${data}/data
fi

# make $stftdir an absolute pathname.
stftdir=$(perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' ${stftdir} ${PWD})

# use "name" as part of name of the archive.
name=$(basename ${data})

mkdir -p ${stftdir} || exit 1;
mkdir -p ${logdir} || exit 1;

# Keep a copy of any previous feats.scp before it gets overwritten.
if [ -f ${data}/feats.scp ]; then
  mkdir -p ${data}/.backup
  echo "$0: moving ${data}/feats.scp to ${data}/.backup"
  mv ${data}/feats.scp ${data}/.backup
fi

scp=${data}/wav.scp

utils/validate_data_dir.sh --no-text --no-feats ${data} || exit 1;

# "JOB" in the option below is a placeholder for the per-job index
# (the job launcher is invoked with JOB=1:${nj}).
if ${write_utt2num_frames}; then
  write_num_frames_opt="--write-num-frames=ark,t:${logdir}/utt2num_frames.JOB"
else
  write_num_frames_opt=
fi

# File extension of the feature archives: .h5 for hdf5, .ark otherwise.
if [ "${filetype}" == hdf5 ]; then
    ext=h5
else
    ext=ark
fi

# Run compute-stft-feats.py as ${nj} parallel jobs.  Two modes:
#   * a segments file exists: it is split per job and each job receives its
#     own segments.JOB together with the complete wav.scp;
#   * no segments file: wav.scp itself is split and each job reads wav.JOB.scp.
# Both modes write raw_stft_${name}.JOB.{${ext},scp} into ${stftdir} and
# receive the same set of analysis options, including --fs.
if [ -f ${data}/segments ]; then
    echo "$0 [info]: segments file exists: using that."
    split_segments=""
    for n in $(seq ${nj}); do
        split_segments="${split_segments} ${logdir}/segments.${n}"
    done

    utils/split_scp.pl ${data}/segments ${split_segments}

    # --fs is passed here as well so that the sampling-rate option is not
    # silently ignored when a segments file is present.
    ${cmd} JOB=1:${nj} ${logdir}/make_stft_${name}.JOB.log \
        compute-stft-feats.py \
            --fs ${fs} \
            --win_length ${win_length} \
            --n_fft ${n_fft} \
            --n_shift ${n_shift} \
            --window ${window} \
            ${write_num_frames_opt} \
            --compress=${compress} \
            --filetype ${filetype} \
            --normalize ${normalize} \
            --segment=${logdir}/segments.JOB scp:${scp} \
            ark,scp:${stftdir}/raw_stft_${name}.JOB.${ext},${stftdir}/raw_stft_${name}.JOB.scp

else
    echo "$0: [info]: no segments file exists: assuming pcm.scp indexed by utterance."
    split_scps=""
    for n in $(seq ${nj}); do
        split_scps="${split_scps} ${logdir}/wav.${n}.scp"
    done

    utils/split_scp.pl ${scp} ${split_scps} || exit 1;

    ${cmd} JOB=1:${nj} ${logdir}/make_stft_${name}.JOB.log \
        compute-stft-feats.py \
            --fs ${fs} \
            --win_length ${win_length} \
            --n_fft ${n_fft} \
            --n_shift ${n_shift} \
            --window ${window} \
            ${write_num_frames_opt} \
            --compress=${compress} \
            --filetype ${filetype} \
            --normalize ${normalize} \
            scp:${logdir}/wav.JOB.scp \
            ark,scp:${stftdir}/raw_stft_${name}.JOB.${ext},${stftdir}/raw_stft_${name}.JOB.scp
fi

# concatenate the .scp files together.
for n in $(seq ${nj}); do
    cat ${stftdir}/raw_stft_${name}.${n}.scp || exit 1;
done > ${data}/feats.scp || exit 1

# Merge the per-job frame counts into a single utt2num_frames file.
if ${write_utt2num_frames}; then
    for n in $(seq ${nj}); do
        cat ${logdir}/utt2num_frames.${n} || exit 1;
    done > ${data}/utt2num_frames || exit 1
    rm ${logdir}/utt2num_frames.* 2>/dev/null
fi

# Remove the per-job input lists.
rm -f ${logdir}/wav.*.scp ${logdir}/segments.* 2>/dev/null

# Write the filetype, this will be used for data2json.sh
echo ${filetype} > ${data}/filetype

# Sanity check: one feature entry is expected per utterance.  When a segments
# file exists the utterances are its lines, not the recordings in wav.scp,
# so the expected count has to come from there.
nf=$(wc -l < ${data}/feats.scp)
if [ -f ${data}/segments ]; then
    nu=$(wc -l < ${data}/segments)
else
    nu=$(wc -l < ${data}/wav.scp)
fi
if [ ${nf} -ne ${nu} ]; then
    echo "It seems not all of the feature files were successfully processed ($nf != $nu);"
    echo "consider using utils/fix_data_dir.sh $data"
fi

echo "Succeeded creating STFT features for $name"


================================================
FILE: egs/espnet_utils/mbr_analysis.py
================================================
# Author: Jinchuan Tian ; tianjinchuan@stu.pku.edu.cn ; tyriontian@tencent.com
# This script provides:
# (1) CER (2) Bayesian Risk and its variance

import sys
import json
import math
import editdistance
import numpy as np

def main():
    """Print CER and Bayesian-risk statistics for an n-best result json.

    The json file given as the first command line argument must contain an
    ``"utts"`` mapping.  For every utterance, ``info["output"]`` is the list
    of hypotheses; each hypothesis provides ``"rec_token"`` and a log-domain
    ``"score"``, and the first one also provides the reference ``"token"``.

    Utterances whose entry is malformed are skipped as a whole (so all
    statistics stay aligned) and reported on stderr.
    """
    if len(sys.argv) < 2:
        sys.exit("Usage: {} <result.json>".format(sys.argv[0]))

    # load json file
    with open(sys.argv[1], "rb") as f:
        results_json = json.load(f)["utts"]

    num_err, num_tot = 0, 0
    risk_stat, sum_prob_stat, ref_prob_stat = [], [], []
    skipped = []
    for uttid, info in results_json.items():
        try:
            hypotheses = info["output"]
            ref_token = hypotheses[0]["token"]

            # hypothesis and their probability
            texts, probs = [], []
            for h in hypotheses:
                texts.append(h["rec_token"].replace("<eos>", "").strip())
                probs.append(math.exp(h["score"]))

            # probability mass the n-best list assigns to the reference
            # (0.0 when the reference is not among the hypotheses)
            ref_probs = [p for t, p in zip(texts, probs) if t == ref_token]
            if not ref_probs:
                ref_probs = [0.0]

            # find edit-distance
            # NOTE(review): both arguments are strings, so the distance is
            # counted over characters (separators included) while num_tot
            # below counts whitespace-separated tokens -- confirm intended.
            edit_dists = [
                editdistance.eval(ref_token, rec_token) for rec_token in texts
            ]

            # bayesian risk
            weighted_probs = [a * b for a, b in zip(edit_dists, probs)]
            risk = sum(weighted_probs) / (sum(probs) + 1e-10)

            ref_len = len(ref_token.strip().split())
        except (
            KeyError,
            IndexError,
            TypeError,
            AttributeError,
            ValueError,
            OverflowError,
        ) as e:
            # Malformed entry: skip it, but say so instead of hiding it.
            skipped.append(uttid)
            sys.stderr.write(
                "Warning: skipping utterance {}: {!r}\n".format(uttid, e)
            )
            continue

        # Everything was computed without error: commit the statistics.
        ref_prob_stat.extend(ref_probs)
        risk_stat.append(risk)
        sum_prob_stat.append(sum(probs))  # sum prob
        num_err += edit_dists[0]  # cer statistics: 1-best only
        num_tot += ref_len

    if skipped:
        sys.stderr.write(
            "Warning: {} of {} utterances were skipped\n".format(
                len(skipped), len(results_json)
            )
        )
    if not risk_stat:
        sys.exit("No utterance in {} could be scored".format(sys.argv[1]))

    # conclusion
    print("### MBR statistics on {} ###".format(sys.argv[1]))
    # An empty reference set would otherwise divide by zero.
    cer = num_err / num_tot * 100 if num_tot > 0 else float("nan")
    print("CER: {:.4f}% {}/{}".format(cer, num_err, num_tot))

    br_mean, br_deviation = np.mean(risk_stat), np.sqrt(np.var(risk_stat))
    print("Mean and Deviation of Bayesian Risk: {:.4f} | {:.4f}".format(br_mean, br_deviation))

    sum_prob_mean, sum_prob_deviation = np.mean(sum_prob_stat), np.sqrt(np.var(sum_prob_stat))
    ref_prob_mean, ref_prob_deviation = np.mean(ref_prob_stat), np.sqrt(np.var(ref_prob_stat))
    print("Mean and Deviation of Accumulated probability: {:.4f} | {:.4f}".format(sum_prob_mean, sum_prob_deviation))
    print("Mean and Deviation of Reference probability: {:.4f} | {:.4f}".format(ref_prob_mean, ref_prob_deviation))


if __name__ == "__main__":
    main()


================================================
FILE: egs/espnet_utils/mcd_calculate.py
================================================
#!/usr/bin/env python3
# encoding: utf-8

# Copyright 2020 Nagoya University (Wen-Chin Huang)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

# Calculate MCD using converted waveform.

import argparse
import fnmatch
import multiprocessing as mp
import os

from fastdtw import fastdtw
import numpy as np
import pysptk
import pyworld as pw
import scipy
from scipy.io import wavfile
from scipy.signal import firwin
from scipy.signal import lfilter


def find_files(root_dir, query="*.wav", include_root_dir=True):
    """Collect the files below a directory whose name matches a pattern.

    Symbolic links to directories are followed.

    Args:
        root_dir (str): Directory where the search starts.
        query (str): ``fnmatch`` style pattern applied to the file names.
        include_root_dir (bool): If False, every occurrence of
            ``root_dir + "/"`` is removed from the returned paths.

    Returns:
        list: Paths of the matching files.

    """
    matches = [
        os.path.join(current_dir, name)
        for current_dir, _subdirs, names in os.walk(root_dir, followlinks=True)
        for name in fnmatch.filter(names, query)
    ]
    if include_root_dir:
        return matches

    prefix = root_dir + "/"
    return [path.replace(prefix, "") for path in matches]


def low_cut_filter(x, fs, cutoff=70):
    """Apply a 255-tap FIR low cut (high-pass) filter to a waveform.

    Args:
        x (ndarray): Waveform sequence
        fs (int): Sampling frequency
        cutoff (float): Cutoff frequency of low cut filter

    Return:
        (ndarray): Low cut filtered waveform sequence
    """
    # firwin expects the cutoff relative to the Nyquist frequency, which is
    # taken here as the integer half of the sampling frequency.
    relative_cutoff = cutoff / (fs // 2)
    taps = firwin(255, relative_cutoff, pass_zero=False)
    return lfilter(taps, 1, x)


def spc2npow(spectrogram):
    """Compute the normalized power of every frame of a spectrogram

    Parameters
    ----------
    spectrogram : array, shape (T, `fftlen / 2 + 1`)
        Array of spectrum envelope

    Return
    ------
    npow : array
        Per-frame power in dB relative to the mean frame power
        (one value for each row of `spectrogram`)

    """

    # one power value per frame (row)
    frame_power = np.apply_along_axis(_spvec2pow, 1, spectrogram)

    # express each frame relative to the average over all frames, in dB
    return 10.0 * np.log10(frame_power / np.mean(frame_power))


def _spvec2pow(specvec):
    """Convert a spectrum envelope into a power

    Parameters
    ----------
    specvec : vector, shape (`fftlen / 2 + 1`)
        Vector of specturm envelope |H(w)|^2

    Return
    ------
    power : scala,
        Power of a frame

    """

    # set FFT length
    fftl2 = len(specvec) - 1
    fftl = fftl2 * 2

    # specvec is not amplitude spectral |H(w)| but power spectral |H(w)|^2
    power = specvec[0] + specvec[fftl2]
    for k in range(1, fftl2):
        power += 2.0 * specvec[k]
    power /= fftl

    return power


def extfrm(data, npow, power_threshold=-20):
    """Extract frame over the power threshold

    Parameters
    ----------
    data: array, shape (`T`, `dim`)
        Array of input data
    npow : array, shape (`T`)
        Vector of normalized power sequence.
    power_threshold : float, optional
        Value of power threshold [dB]
        Default set to -20

    Returns
    -------
    data: array, shape (`T_ext`, `dim`)
        Remaining data after extracting frame
        `T_ext` <= `T`

    Raises
    ------
    ValueError
        If `data` and `npow` do not have the same number of frames.

    """

    T = data.shape[0]
    if T != len(npow):
        # Raising a bare string is itself a TypeError and would hide the
        # actual problem, so a real exception type is used.
        raise ValueError("Length of two vectors is different.")

    # keep only the frames whose normalized power exceeds the threshold
    valid_index = np.where(npow > power_threshold)
    extdata = data[valid_index]
    assert extdata.shape[0] <= T

    return extdata


def world_extract(wav_path, args):
    """Extract WORLD features from one wav file.

    Args:
        wav_path (str): Path of the wav file to analyze.
        args: Options object; the attributes ``f0min``, ``f0max``,
            ``shiftms``, ``fftl``, ``mcep_dim`` and ``mcep_alpha`` are read.

    Returns:
        dict: Features under the keys ``"sp"``, ``"mcep"``, ``"ap"``,
            ``"f0"`` and ``"npow"``.

    """
    fs, samples = wavfile.read(wav_path)
    # analysis is done on the low-cut filtered signal in float64
    signal = low_cut_filter(np.array(samples, dtype=np.float64), fs)

    # extract features
    f0, time_axis = pw.harvest(
        signal,
        fs,
        f0_floor=args.f0min,
        f0_ceil=args.f0max,
        frame_period=args.shiftms,
    )
    spectral_env = pw.cheaptrick(signal, f0, time_axis, fs, fft_size=args.fftl)
    aperiodicity = pw.d4c(signal, f0, time_axis, fs, fft_size=args.fftl)

    return {
        "sp": spectral_env,
        "mcep": pysptk.sp2mc(spectral_env, args.mcep_dim, args.mcep_alpha),
        "ap": aperiodicity,
        "f0": f0,
        "npow": spc2npow(spectral_env),
    }


def get_basename(path):
    """Return the last component of ``path`` without its final extension."""
    filename = os.path.basename(path)
    stem, _extension = os.path.splitext(filename)
    return stem


def calculate(file_list, gt_file_list, args, MCD):
    """Compute the MCD of every converted file against its ground truth.

    For each converted file the ground truth file whose basename occurs in
    the converted path is looked up (exactly one must match), both files are
    analyzed, low-power frames are dropped, the remaining mel-cepstra are
    aligned with DTW and the mean distortion is printed and stored.

    Args:
        file_list (list): Paths of the converted wav files.
        gt_file_list (list): Paths of the ground truth wav files.
        args: Options object forwarded to ``world_extract``.
        MCD (list): Output list; one MCD value is appended per file.

    """
    db_factor = 10.0 / np.log(10.0)

    for cvt_path in file_list:
        candidates = [
            candidate
            for candidate in gt_file_list
            if get_basename(candidate) in cvt_path
        ]
        assert len(candidates) == 1
        gt_path = candidates[0]

        # extract ground truth and converted features
        gt_feats = world_extract(gt_path, args)
        cvt_feats = world_extract(cvt_path, args)

        # VAD & DTW based on power
        gt_voiced = extfrm(gt_feats["mcep"], gt_feats["npow"])
        cvt_voiced = extfrm(cvt_feats["mcep"], cvt_feats["npow"])
        _, warp_path = fastdtw(
            cvt_voiced,
            gt_voiced,
            dist=scipy.spatial.distance.euclidean,
        )
        # row 0: frame indices of the converted file, row 1: of the ground truth
        cvt_index, gt_index = np.array(warp_path).T[0], np.array(warp_path).T[1]

        # MCD using power-based DTW
        squared_diff = np.sum((cvt_voiced[cvt_index] - gt_voiced[gt_index]) ** 2, 1)
        mcd = np.mean(db_factor * np.sqrt(2 * squared_diff), 0)

        print("{} {}".format(get_basename(gt_path), mcd))
        MCD.append(mcd)


def get_parser():
    """Build the command line parser of the MCD calculation script.

    Returns:
        argparse.ArgumentParser: Parser with the directory options
            (``--wavdir``, ``--gtwavdir``), the analysis options
            (``--mcep_dim``, ``--mcep_alpha``, ``--fftl``, ``--shiftms``,
            ``--f0min``, ``--f0max``) and ``--n_jobs``.

    """

    parser = argparse.ArgumentParser(description="calculate MCD.")
    parser.add_argument(
        "--wavdir",
        required=True,
        type=str,
        help="path of directory for converted waveforms",
    )
    parser.add_argument(
        "--gtwavdir",
        required=True,
        type=str,
        help="path of directory for ground truth waveforms",
    )

    # analysis related
    parser.add_argument(
        "--mcep_dim", default=41, type=int, help="dimension of mel cepstrum coefficient"
    )
    # The all pass constant is fractional (default 0.41), so it must be parsed
    # as a float; with type=int a value such as "0.42" was rejected.
    parser.add_argument(
        "--mcep_alpha", default=0.41, type=float, help="all pass constant"
    )
    parser.add_argument("--fftl", default=1024, type=int, help="fft length")
    parser.add_argument("--shiftms", default=5, type=int, help="frame shift (ms)")
    parser.add_argument(
        "--f0min", required=True, type=int, help="fo search range (min)"
    )
    parser.add_argument(
        "--f0max", required=True, type=int, help="fo search range (max)"
    )

    parser.add_argument(
        "--n_jobs", default=40, type=int, help="number of parallel jobs"
    )
    return parser


def main():
    """Compute the MCD of all converted files in parallel and print the mean.

    The converted files are divided into ``args.n_jobs`` parts, one worker
    process runs ``calculate`` on each part, and the per-file results are
    gathered through a list shared by a multiprocessing manager.
    """
    args = get_parser().parse_args()

    # find files
    converted_files = sorted(find_files(args.wavdir))
    gt_files = sorted(find_files(args.gtwavdir))

    # Get and divide list
    print("number of utterances = %d" % len(converted_files))
    chunks = [
        chunk.tolist() for chunk in np.array_split(converted_files, args.n_jobs)
    ]

    # multi processing
    with mp.Manager() as manager:
        MCD = manager.list()
        workers = []
        for chunk in chunks:
            worker = mp.Process(target=calculate, args=(chunk, gt_files, args, MCD))
            worker.start()
            workers.append(worker)

        # wait for all process
        for worker in workers:
            worker.join()

        print("Mean MCD: {:.2f}".format(np.mean(np.array(MCD))))


if __name__ == "__main__":
    main()


================================================
FILE: egs/espnet_utils/merge_scp2json.py
================================================
#!/usr/bin/env python3
# encoding: utf-8


import argparse
import codecs
from distutils.util import strtobool
from io import open
import json
import logging
import sys

from espnet.utils.cli_utils import get_commandline_args

# Re-wrap the standard streams so that they always read and write UTF-8:
# on Python 2 the stream objects themselves are wrapped, on Python 3 their
# underlying byte buffers are.
PY2 = sys.version_info[0] == 2
sys.stdin = codecs.getreader("utf-8")(sys.stdin if PY2 else sys.stdin.buffer)
sys.stdout = codecs.getwriter("utf-8")(sys.stdout if PY2 else sys.stdout.buffer)


# Special types:
def shape(x):
    """Parse a comma separated list of integers, optionally in brackets.

    >>> shape('3,5')
    [3, 5]
    >>> shape(' [3, 5] ')
    [3, 5]

    """

    # ' [3, 5] ' -> '[3, 5]' -> '3, 5]' -> '3, 5'
    text = x.strip()
    if text[0] == "[":
        text = text[1:]
    if text[-1] == "]":
        text = text[:-1]

    return [int(field) for field in text.split(",")]


def get_parser():
    """Create the command line parser of the scp-to-json merger.

    Returns:
        argparse.ArgumentParser: Parser exposing the repeatable
            ``--input-scps`` / ``--output-scps`` groups, ``--scps``,
            ``--verbose``/``-V``, ``--allow-one-column`` and ``--out``/``-O``.

    """
    usage_text = (
        "Given each file paths with such format as "
        "<key>:<file>:<type>. type> can be omitted and the default "
        'is "str". e.g. {} '
        "--input-scps feat:data/feats.scp shape:data/utt2feat_shape:shape "
        "--input-scps feat:data/feats2.scp shape:data/utt2feat2_shape:shape "
        "--output-scps text:data/text shape:data/utt2text_shape:shape "
        "--scps utt2spk:data/utt2spk"
    ).format(sys.argv[0])
    cli = argparse.ArgumentParser(
        description=usage_text,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # --input-scps / --output-scps may be repeated; every occurrence adds one
    # list of "<key>:<file>[:<type>]" specifications.
    for flag, help_text in (
        ("--input-scps", "Json files for the inputs"),
        ("--output-scps", "Json files for the outputs"),
    ):
        cli.add_argument(
            flag,
            type=str,
            nargs="*",
            action="append",
            default=[],
            help=help_text,
        )

    cli.add_argument(
        "--scps",
        type=str,
        nargs="+",
        default=[],
        help="The json files except for the input and outputs",
    )
    cli.add_argument("--verbose", "-V", default=1, type=int, help="Verbose option")
    cli.add_argument(
        "--allow-one-column",
        type=strtobool,
        default=False,
        help="Allow one column in input scp files. "
        "In this case, the value will be empty string.",
    )
    cli.add_argument(
        "--out",
        "-O",
        type=str,
        help="The output filename. If omitted, then output to sys.stdout",
    )
    return cli


if __name__ == "__main__":
    # Script entry point: read several key-sorted "<utt-id> <value>" files in
    # lockstep and write one json with an "input" list, an "output" list and
    # extra top-level fields per utterance.
    parser = get_parser()
    args = parser.parse_args()
    # --scps is a flat list; wrap it so it has the same list-of-lists layout
    # as --input-scps and --output-scps.
    args.scps = [args.scps]

    # logging info
    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    # List[List[Tuple[str, str, Callable[[str], Any], str, str]]]
    # Each tuple is (key, scp path, type function, original spec, type name).
    input_infos = []
    output_infos = []
    infos = []
    for lis_list, key_scps_list in [
        (input_infos, args.input_scps),
        (output_infos, args.output_scps),
        (infos, args.scps),
    ]:
        for key_scps in key_scps_list:
            lis = []
            for key_scp in key_scps:
                sps = key_scp.split(":")
                if len(sps) == 2:
                    key, scp = sps
                    type_func = None
                    type_func_str = "none"
                elif len(sps) == 3:
                    key, scp, type_func_str = sps

                    try:
                        # type_func: Callable[[str], Any]
                        # e.g. type_func_str = "int" -> type_func = int
                        # SECURITY NOTE: eval() executes whatever expression
                        # is given as <type> on the command line; only run
                        # this script with trusted arguments.
                        type_func = eval(type_func_str)
                    except Exception:
                        raise RuntimeError("Unknown type: {}".format(type_func_str))

                    if not callable(type_func):
                        raise RuntimeError("Unknown type: {}".format(type_func_str))

                else:
                    raise RuntimeError(
                        "Format <key>:<filepath> "
                        "or <key>:<filepath>:<type>  "
                        "e.g. feat:data/feat.scp "
                        "or shape:data/feat.scp:shape: {}".format(key_scp)
                    )

                # A key may appear only once inside one group.
                for item in lis:
                    if key == item[0]:
                        raise RuntimeError(
                            'The key "{}" is duplicated: {} {}'.format(
                                key, item[3], key_scp
                            )
                        )

                lis.append((key, scp, type_func, key_scp, type_func_str))
            lis_list.append(lis)

    # Open  scp files
    input_fscps = [
        [open(i[1], "r", encoding="utf-8") for i in il] for il in input_infos
    ]
    output_fscps = [
        [open(i[1], "r", encoding="utf-8") for i in il] for il in output_infos
    ]
    fscps = [[open(i[1], "r", encoding="utf-8") for i in il] for il in infos]

    # Note(kamo): What is done here?
    # The final goal is creating a JSON file such as.
    # {
    #     "utts": {
    #         "sample_id1": {(omitted)},
    #         "sample_id2": {(omitted)},
    #          ....
    #     }
    # }
    #
    # To reduce memory usage, reading the input text files for each lines
    # and writing JSON elements per samples.
    if args.out is None:
        out = sys.stdout
    else:
        out = open(args.out, "w", encoding="utf-8")
    out.write('{\n    "utts": {\n')
    # nutt is the 1-based number of the line being read (used in messages);
    # num_entries counts the utterances actually written.
    nutt = 0
    num_entries = 0
    while True:
        nutt += 1
        # List[List[str]]
        input_lines = [[f.readline() for f in fl] for fl in input_fscps]
        output_lines = [[f.readline() for f in fl] for fl in output_fscps]
        lines = [[f.readline() for f in fl] for fl in fscps]

        # Get the first line
        concat = sum(input_lines + output_lines + lines, [])
        if len(concat) == 0:
            break
        first = concat[0]

        # Sanity check: Must be sorted by the first column and have same keys
        count = 0
        for ls_list in (input_lines, output_lines, lines):
            for ls in ls_list:
                for line in ls:
                    # readline() returns "" at end of file: either every file
                    # ends on this line or the line counts differ.
                    if line == "" or first == "":
                        if line != first:
                            concat = sum(input_infos + output_infos + infos, [])
                            raise RuntimeError(
                                "The number of lines mismatch "
                                'between: "{}" and "{}"'.format(
                                    concat[0][1], concat[count][1]
                                )
                            )

                    elif line.split()[0] != first.split()[0]:
                        concat = sum(input_infos + output_infos + infos, [])
                        raise RuntimeError(
                            "The keys are mismatch at {}th line "
                            'between "{}" and "{}":\n>>> {}\n>>> {}'.format(
                                nutt,
                                concat[0][1],
                                concat[count][1],
                                first.rstrip(),
                                line.rstrip(),
                            )
                        )
                    count += 1

        # The end of file
        if first == "":
            if nutt != 1:
                out.write("\n")
            break
        if nutt != 1:
            out.write(",\n")

        entry = {}
        for inout, _lines, _infos in [
            ("input", input_lines, input_infos),
            ("output", output_lines, output_infos),
            ("other", lines, infos),
        ]:

            lis = []
            for idx, (line_list, info_list) in enumerate(zip(_lines, _infos), 1):
                if inout == "input":
                    d = {"name": "input{}".format(idx)}
                elif inout == "output":
                    d = {"name": "target{}".format(idx)}
                else:
                    d = {}

                # info_list: List[Tuple[str, str, Callable]]
                # line_list: List[str]
                for line, info in zip(line_list, info_list):
                    sps = line.split(None, 1)
                    if len(sps) < 2:
                        if not args.allow_one_column:
                            raise RuntimeError(
                                "Format error {}th line in {}: "
                                ' Expecting "<key> <value>":\n>>> {}'.format(
                                    nutt, info[1], line
                                )
                            )
                        uttid = sps[0]
                        value = ""
                    else:
                        uttid, value = sps

                    key = info[0]
                    type_func = info[2]
                    value = value.rstrip()

                    if type_func is not None:
                        try:
                            # type_func: Callable[[str], Any]
                            value = type_func(value)
                        except Exception:
                            logging.error(
                                '"{}" is an invalid function '
                                "for the {} th line in {}: \n>>> {}".format(
                                    info[4], nutt, info[1], line
                                )
                            )
                            raise

                    d[key] = value
                lis.append(d)

            if inout != "other":
                entry[inout] = lis
            else:
                # If key == 'other'. only has the first item
                entry.update(lis[0])

        entry = json.dumps(
            entry, indent=4, ensure_ascii=False, sort_keys=True, separators=(",", ": ")
        )
        # Add indent
        indent = "    " * 2
        entry = ("\n" + indent).join(entry.split("\n"))

        uttid = first.split()[0]
        out.write('        "{}": {}'.format(uttid, entry))
        num_entries += 1

    out.write("    }\n}\n")

    # Report the number of utterances written (nutt also counts the final
    # end-of-file read and would be one too high).
    logging.info("{} entries in {}".format(num_entries, out.name))

    # Release the input files; close the output only when this script opened it.
    for scp_file in sum(input_fscps + output_fscps + fscps, []):
        scp_file.close()
    if args.out is not None:
        out.close()


================================================
FILE: egs/espnet_utils/mergejson.py
================================================
#!/usr/bin/env python3
# encoding: utf-8

# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)


import argparse
import codecs
import json
import logging
import os
import sys

from espnet.utils.cli_utils import get_commandline_args

# True when running under Python 2; consulted when wrapping sys.stdout in a
# UTF-8 writer (Python 3 needs the underlying byte stream, sys.stdout.buffer).
is_python2 = sys.version_info[0] == 2


def get_parser():
    """Build the argument parser of the json merging tool.

    ``--input-jsons``, ``--output-jsons`` and ``--jsons`` may each be given
    several times; every occurrence takes one or more paths and is stored as
    its own list, so the parsed value is a list of lists of paths.

    Returns:
        argparse.ArgumentParser: The configured parser.
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="merge json files",
    )

    # The three json groups share exactly the same option shape.
    grouped_options = (
        ("--input-jsons", "Json files for the inputs"),
        ("--output-jsons", "Json files for the outputs"),
        ("--jsons", "The json files except for the input and outputs"),
    )
    for flag, help_text in grouped_options:
        cli.add_argument(
            flag,
            type=str,
            nargs="+",
            action="append",
            default=[],
            help=help_text,
        )

    cli.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
    cli.add_argument("-O", dest="output", type=str, help="Output json file")
    return cli


if __name__ == "__main__":
    # Merge the "utts" tables of several json files into one json document,
    # keeping only the utterance ids that are present in every loaded file.
    parser = get_parser()
    args = parser.parse_args()

    # logging info
    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    js_dict = {}  # Dict[str, List[List[Dict[str, Dict[str, dict]]]]]
    # make intersection set for utterance keys
    intersec_ks = None  # Set[str]
    for jtype, jsons_list in [
        ("input", args.input_jsons),
        ("output", args.output_jsons),
        ("other", args.jsons),
    ]:
        js_dict[jtype] = []
        for jsons in jsons_list:
            js = []
            for x in jsons:
                # Paths that are not regular files are silently skipped.
                if os.path.isfile(x):
                    with codecs.open(x, encoding="utf-8") as f:
                        j = json.load(f)
                    ks = list(j["utts"].keys())
                    logging.info(x + ": has " + str(len(ks)) + " utterances")
                    if intersec_ks is not None:
                        intersec_ks = intersec_ks.intersection(set(ks))
                        if len(intersec_ks) == 0:
                            logging.warning("No intersection")
                            break
                    else:
                        intersec_ks = set(ks)
                    js.append(j)
            js_dict[jtype].append(js)

    if intersec_ks is None:
        # No json file could be loaded at all; emit an empty "utts" table
        # instead of failing on len(None) below.
        logging.warning("No json file was loaded; the merged json is empty")
        intersec_ks = set()
    logging.info("new json has " + str(len(intersec_ks)) + " utterances")

    new_dic = {}
    for k in intersec_ks:
        new_dic[k] = {"input": [], "output": []}
        for jtype in ["input", "output", "other"]:
            for idx, js in enumerate(js_dict[jtype], 1):
                # Merge dicts from jsons into a dict
                dic = {k2: v for j in js for k2, v in j["utts"][k].items()}

                if jtype == "other":
                    new_dic[k].update(dic)
                else:
                    _dic = {}

                    # FIXME(kamo): ad-hoc way to change str to List[int]
                    if jtype == "input":
                        _dic["name"] = "input{}".format(idx)
                        if "ilen" in dic and "idim" in dic:
                            _dic["shape"] = (int(dic["ilen"]), int(dic["idim"]))
                        elif "ilen" in dic:
                            _dic["shape"] = (int(dic["ilen"]),)
                        elif "idim" in dic:
                            _dic["shape"] = (int(dic["idim"]),)

                    elif jtype == "output":
                        _dic["name"] = "target{}".format(idx)
                        if "olen" in dic and "odim" in dic:
                            _dic["shape"] = (int(dic["olen"]), int(dic["odim"]))
                        # The fallbacks must test the output keys they read
                        # ("olen"/"odim"), not the input ones.
                        elif "olen" in dic:
                            _dic["shape"] = (int(dic["olen"]),)
                        elif "odim" in dic:
                            _dic["shape"] = (int(dic["odim"]),)
                    if "shape" in dic:
                        # shape: "80,1000" -> [80, 1000]
                        _dic["shape"] = list(map(int, dic["shape"].split(",")))

                    # The length/dim keys are folded into "shape"; everything
                    # else is copied through unchanged.
                    for k2, v in dic.items():
                        if k2 not in ["ilen", "idim", "olen", "odim", "shape"]:
                            _dic[k2] = v
                    new_dic[k][jtype].append(_dic)

    # The json text is produced with ensure_ascii=False, so it has to be
    # written through an explicit UTF-8 writer.
    merged_text = json.dumps(
        {"utts": new_dic},
        indent=4,
        ensure_ascii=False,
        sort_keys=True,
        separators=(",", ": "),
    )
    if args.output is not None:
        # Close the output file deterministically once the json is written.
        with codecs.open(args.output, "w", encoding="utf-8") as fout:
            fout.write(merged_text + "\n")
    else:
        writer = codecs.getwriter("utf-8")(
            sys.stdout if is_python2 else sys.stdout.buffer
        )
        writer.write(merged_text + "\n")
        writer.flush()


================================================
FILE: egs/espnet_utils/mix-mono-wav-scp.py
================================================
#!/usr/bin/env python3
import argparse
import io
import sys

# True when running under Python 2.
PY2 = sys.version_info[0] == 2

# itertools.zip_longest is called izip_longest on Python 2; expose it under
# the Python 3 name either way.
if PY2:
    from itertools import izip_longest as zip_longest
else:
    from itertools import zip_longest


def get_parser():
    """Build the argument parser of the wav.scp mixing tool.

    Positional arguments are one or more wav.scp paths (``scp``) followed by
    an optional output file (``out``), which defaults to ``sys.stdout``.

    Returns:
        argparse.ArgumentParser: The configured parser.
    """
    cli = argparse.ArgumentParser(
        description="Mixing wav.scp files into a multi-channel wav.scp using sox.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    scp_spec = dict(type=str, nargs="+", help="Give wav.scp")
    out_spec = dict(
        nargs="?",
        type=argparse.FileType("w"),
        default=sys.stdout,
        help="The output filename. If omitted, then output to sys.stdout",
    )

    cli.add_argument("scp", **scp_spec)
    cli.add_argument("out", **out_spec)
    return cli


def main():
    """Combine several wav.scp files into one multi-channel wav.scp.

    The input files are read in lockstep, one line from each per step. Every
    line must have the form ``<key> <wav>`` and the keys of one step must be
    identical across all inputs. For each step one line of the form
    ``<key> sox -M <wav1> <wav2> ... -c <n_inputs> -t wav - |`` is written to
    the output.

    Raises:
        RuntimeError: If the inputs have different numbers of lines, a line
            does not split into key and value, or the keys of a step differ.
    """
    parser = get_parser()
    args = parser.parse_args()

    fscps = []
    try:
        for scp in args.scp:
            fscps.append(io.open(scp, "r", encoding="utf-8"))

        # Count lines from 1 so that error messages match what an editor shows.
        for linenum, lines in enumerate(zip_longest(*fscps), 1):
            keys = []
            wavs = []

            for line, scp in zip(lines, args.scp):
                # zip_longest pads exhausted files with None.
                if line is None:
                    raise RuntimeError("Numbers of line mismatch")

                sps = line.split(" ", 1)
                if len(sps) != 2:
                    raise RuntimeError(
                        'Invalid line is found: {}, line {}: "{}" '.format(
                            scp, linenum, line
                        )
                    )
                key, wav = sps
                keys.append(key)
                wavs.append(wav.strip())

            if not all(k == keys[0] for k in keys):
                raise RuntimeError(
                    "The ids mismatch. Hint; the input files must be "
                    "sorted and must have same ids: {}".format(keys)
                )

            args.out.write(
                "{} sox -M {} -c {} -t wav - |\n".format(
                    keys[0], " ".join("{}".format(w) for w in wavs), len(fscps)
                )
            )
    finally:
        # Release every input handle, also when an error aborts the loop.
        for fscp in fscps:
            fscp.close()


# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()


================================================
FILE: egs/espnet_utils/mmi_rescore.sh
================================================
#!/usr/bin/env bash
# Rerank the hypotheses of a decoding directory for several weights and score
# every reranked result.
#
# Usage: mmi_rescore.sh <decode_dir> <dict>
#   <decode_dir>  directory that contains data.json
#   <dict>        dictionary passed on to score_sclite.sh
#
# For every weight w, espnet_utils/rerank_mmi.py writes
# <decode_dir>/rescore/<w>/data.1.json and the output of score_sclite.sh goes
# to <decode_dir>/rescore/<w>/decode_result.txt.

if [ $# -lt 2 ]; then
    echo "Usage: $0 <decode_dir> <dict>" >&2
    exit 1
fi

decode_dir=$1
dict=$2

dir=$decode_dir/rescore
# "best" is only created here; nothing in this script writes into it.
mkdir -p "$dir/best"

# One background job per weight; all of them are awaited at the end.
for w in 0.05 0.1 0.2 0.3; do
    mkdir -p "$dir/$w"
    (
        # Score only when reranking succeeded, otherwise data.1.json is missing.
        python3 espnet_utils/rerank_mmi.py "$decode_dir/data.json" "$w" "$dir/$w/data.1.json" &&
        score_sclite.sh --sppd3 true "$dir/$w" "$dict" > "$dir/$w/decode_result.txt"
    ) &
done
wait


================================================
FILE: egs/espnet_utils/pack_model.sh
================================================
#!/usr/bin/env bash

# Copyright 2019 Johns Hopkins University (Shinji Watanabe)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

# Pack the files that make up a trained model into ${outfile}.tar.gz and print
# a list of everything that was packed.

[ -f ./path.sh ] && . ./path.sh

# Optional settings; presumably overridden by utils/parse_options.sh from
# --<name> <value> options (see the usage message below).
results=""
# e.g., "exp/tr_it_pytorch_train/decode_dt_it_decode/result.wrd.txt
#        exp/tr_it_pytorch_train/decode_et_it_decode/result.wrd.txt"
lm=""
dict=""
etc=""
outfile="model"
preprocess_conf=""

help_message=$(cat <<EOF
Usage: $0 --lm <lm> --dict <dict> <tr_conf> <dec_conf> <cmvn> <e2e>, for example:
<lm>:       exp/train_rnnlm/rnnlm.model.best
<dict>:     data/lang_char
<tr_conf>:  conf/train.yaml
<dec_conf>: conf/decode.yaml
<cmvn>:     data/tr_it/cmvn.ark
<e2e>:      exp/tr_it_pytorch_train/results/model.last10.avg.best
EOF
)
. utils/parse_options.sh

if [ $# != 4 ]; then
    echo "${help_message}"
    exit 1
fi

tr_conf=$1
dec_conf=$2
cmvn=$3
e2e=$4

# tar operation for the next member: the first packed file creates the
# archive ("c"), every later one is appended to it ("r").
tar_op=c

# add_file <label> <path>
# Add <path> to ${outfile}.tar (tar option "h": symlinks are dereferenced) and
# report it as "    - <label>: `<path>`". Prints "missing <path>" and exits
# with status 1 when <path> does not exist.
add_file() {
    local label=$1
    local path=$2
    if [ ! -e "${path}" ]; then
        echo "missing ${path}"
        exit 1
    fi
    tar ${tar_op}fh "${outfile}.tar" "${path}"
    tar_op=r
    echo "    - ${label}: \`${path}\`"
}

echo "  - Model files (archived to ${outfile}.tar.gz by \`\$ pack_model.sh\`)"
echo "    - model link: (put the model link manually. please contact Shinji Watanabe <shinjiw@ieee.org> if you want a web storage to put your files)"

# configs
add_file "training config file" "${tr_conf}"
add_file "decoding config file" "${dec_conf}"
# NOTE(kan-bayashi): preprocess conf is optional, but when it is given it must
# exist like every other packed file.
if [ -n "${preprocess_conf}" ]; then
    add_file "preprocess config file" "${preprocess_conf}"
fi

# cmvn
add_file "cmvn file" "${cmvn}"

# e2e model and the model.json located in the same directory
add_file "e2e file" "${e2e}"
add_file "e2e JSON file" "$(dirname "${e2e}")/model.json"

# lm (optional) and the model.json located in the same directory
if [ -n "${lm}" ]; then
    add_file "lm file" "${lm}"
    add_file "lm JSON file" "$(dirname "${lm}")/model.json"
fi

# dict (optional)
if [ -n "${dict}" ]; then
    add_file "dict file" "${dict}"
fi

# etc: whitespace-separated list of extra files, hence the unquoted expansion
for x in ${etc}; do
    add_file "etc file" "${x}"
done

# finally compress the tar file
gzip -f "${outfile}.tar"

# results: whitespace-separated list of result files to quote in the report
if [ -n "${results}" ]; then
    echo "  - Results (paste them by yourself or obtained by \`\$ pack_model.sh --results <results>\`)"
    echo "\`\`\`"
fi
for x in ${results}; do
    if [ -e "${x}" ]; then
        echo "${x}"
        # Only the first two lines that mention Avg or SPKR are shown.
        grep -e Avg -e SPKR -m 2 "${x}"
    else
        echo "missing ${x}"
        exit 1
    fi
done
if [ -n "${results}" ]; then
    echo "\`\`\`"
fi

exit 0


================================================
FILE: egs/espnet_utils/prepare_block_load.sh
================================================
#!/usr/bin/env bash
# Split a dumped feature directory into `num_split` blocks, sorted by length,
# and build the Kaldi-style files and data.json for each block under
# <dst>/<idx>.
#
# Usage: prepare_block_load.sh [--num_split <n>] [--bpe_model <model>] \
#            <kaldi-data-dir> <dump-dir> <dst-dir> <dict>
#
# ${decode_cmd} (the job launcher used for copy-feats) is not defined in this
# script and must be provided by the calling environment.

num_split=32
bpe_model=

. utils/parse_options.sh

if [ $# -ne 4 ]; then
    echo "Usage: $0 [--num_split <n>] [--bpe_model <model>] <kaldi-data-dir> <dump-dir> <dst-dir> <dict>" >&2
    exit 1
fi

if [ -z "${decode_cmd}" ]; then
    echo "$0: decode_cmd is not set; export it before calling this script" >&2
    exit 1
fi

kdir=$1 # kaldi dataset directory
ddir=$2 # dump directory
dst=$3 # destination directory
dict=$4

# step 1: sort the scp in dumpdir according to the utt2num_frames
mkdir -p "$dst"
tmpdir=$dst/tmp; mkdir -p "$tmpdir"
python3 espnet_utils/sort_scp_by_length.py "$ddir/feats.scp" "$ddir/utt2num_frames" \
                                   "$tmpdir/feats.scp" "$tmpdir/utt2num_frames"

# step 2: split the feats.scp and utt2num_frames according to  `num_split`
# Start from an empty list so a same-named variable inherited from the
# environment cannot leak into the argument list.
feats_scps=""
for idx in `seq 1 $num_split`; do
   dir=$dst/$idx; mkdir -p "$dir"
   feats_scps="$feats_scps $dir/feats.scp"
done
# $feats_scps is intentionally unquoted: it expands to one argument per split.
python3 espnet_utils/split_scp.py "$tmpdir/feats.scp" $feats_scps

for idx in `seq 1 $num_split`; do
   dir=$dst/$idx;
   python3 espnet_utils/filter_scp.py "$dir/feats.scp" \
       "$tmpdir/utt2num_frames" > "$dir/utt2num_frames" &
done
wait

# step 3: copy-feats
for idx in `seq 1 $num_split`; do
   dir=$dst/$idx;
   python3 espnet_utils/split_scp_fix_length.py "$dir/feats.scp"
   nj=`ls "$dir"/feats.*.scp | wc -l`
   ${decode_cmd} JOB=1:$nj "$dir/copy_logs/copy_feat.JOB.log" \
       copy-feats --compress=true --compression-method=2 \
           "scp:$dir/feats.JOB.scp" \
           "ark,scp:$dir/feats.JOB.ark,$dir/feats_copy.JOB.scp"
   # Replace the block's feats.scp by the concatenation of the copied scps.
   for j in `seq 1 $nj`; do
       cat "$dir/feats_copy.${j}.scp"
   done > "$dir/feats.scp"
done

# step 4: filter the kaldi format data
for idx in `seq 1 $num_split`; do
    dir=$dst/$idx;
    mkdir -p "$dir/kaldi_files"
    for f in text utt2spk spk2utt text_org; do
        if [ -f "$kdir/$f" ]; then
            python3 espnet_utils/filter_scp.py "$dir/feats.scp" \
                "$kdir/$f" > "$dir/kaldi_files/$f" &
        fi
    done
    wait
done

# step 5: make json
for idx in `seq 1 $num_split`; do
    dir=$dst/$idx;

    if [ -f "$dir/kaldi_files/text_org" ]; then
        json_opts="--text_org $dir/kaldi_files/text_org"
    else
        json_opts=""
    fi

    if [ -n "$bpe_model" ]; then
        json_opts="$json_opts --bpecode $bpe_model"
    fi

    # $json_opts is intentionally unquoted: it holds several options.
    bash espnet_utils/data2json.sh $json_opts --feat "$dir/feats.scp" \
        "$dir/kaldi_files" "$dict" > "$dir/data.json" &
done
wait


================================================
FILE: egs/espnet_utils/prepare_mer.py
================================================
import sys

def is_all_chinese(strs):
    """Return True when every character of ``strs`` lies in U+4E00..U+9FA5.

    An empty string yields True, since it contains no character outside the
    range.
    """
    return all('\u4e00' <= ch <= '\u9fa5' for ch in strs)

# Usage: prepare_mer.py <in_file> <chinese_out_file> <other_out_file>
#
# Every input line is split on whitespace; its last token is treated as the
# utterance id. Tokens made only of Chinese characters go to the first output,
# all remaining tokens to the second one; the utterance id is appended to both.
in_f, chn_f, eng_f = sys.argv[1:4]

# Context managers close all three files even when a line fails to process.
with open(in_f, encoding="utf-8") as reader, \
        open(chn_f, 'w', encoding="utf-8") as chn_writer, \
        open(eng_f, 'w', encoding="utf-8") as eng_writer:
    for line in reader:
        elems = line.strip().split()
        if not elems:
            # Blank line: there is no utterance id to carry over.
            continue
        uttid = elems[-1]
        chn_buf, eng_buf = [], []
        for c in elems[:-1]:
            if is_all_chinese(c):
                chn_buf.append(c)
            else:
                eng_buf.append(c)

        chn_writer.write(" ".join(chn_buf + [uttid]) + "\n")
        eng_writer.write(" ".join(eng_buf + [uttid]) + "\n")


================================================
FILE: egs/espnet_utils/queue-freegpu.pl
================================================
#!/usr/bin/env perl
use strict;
use warnings;

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).
#           2014  Vimal Manohar (Johns Hopkins University)
# Apache 2.0.

use File::Basename;
use Cwd;
use Getopt::Long;

# queue.pl has the same functionality as run.pl, except that
# it runs the job in question on the queue (Sun GridEngine).
# This version of queue.pl uses the task array functionality
# of the grid engine.  Note: it's different from the queue.pl
# in the s4 and earlier scripts.

# The script now supports configuring the queue system using a config file
# (default: conf/queue.conf; a different file can be specified with the
# --config option) and a set of command line options.
# The current script handles:
# 1) Normal configuration arguments
# For e.g. a command line option of "--gpu 1" could be converted into the option
# "-q g.q -l gpu=1" to qsub. How the CLI option is handled is determined by a
# line in the config file like
# gpu=* -q g.q -l gpu=$0
# $0 here in the line is replaced with the argument read from the CLI and the
# resulting string is passed to qsub.
# 2) Special arguments to options such as
# gpu=0
# If --gpu 0 is given in the command line, then no special "-q" is given.
# 3) Default argument
# default gpu=0
# If --gpu option is not passed in the command line, then the script behaves as
# if --gpu 0 was passed since 0 is specified as the default argument for that
# option
# 4) Arbitrary options and arguments.
# Any command line option starting with '--' and its argument would be handled
# as long as its defined in the config file.
# 5) Default behavior
# If the default config file (conf/queue.conf) is not readable and no --config
# option was given, then the script behaves as if it had the following config file:
# $ cat conf/queue.conf
# # Default configuration
# command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
# option mem=* -l mem_free=$0,ram_free=$0
# option mem=0          # Do not add anything to qsub_opts
# option num_threads=* -pe smp $0
# option num_threads=1  # Do not add anything to qsub_opts
# option max_jobs_run=* -tc $0
# default gpu=0
# option gpu=0
# option gpu=* -l gpu=$0 -q '*.q'

# Options that are passed on to qsub; extended while parsing the command line
# and the config file.
my $qsub_opts = "";
# Set to 1 when "-sync y" is given; the script then does not wait for
# sync-files itself.
my $sync = 0;
# Overwritten by the slot count of a "-pe" option; reported in the
# "Accounting" line of the job log.
my $num_threads = 1;
# Default shown for --gpu in the usage message.
my $gpu = 0;

# Config file that is read unless --config names another one.
my $config = "conf/queue.conf";

# "--name value" options from the command line, keyed by name with the
# leading "--" removed and "-" replaced by "_".
my %cli_options = ();

# Array-job description parsed from an argument such as JOB=1:20.
my $jobname;
my $jobstart;
my $jobend;
my $array_job = 0;
# Numeric job id read from qsub's output; used by the signal handler to
# delete the job.
my $sge_job_id;

# Print the command-line help to STDERR and terminate with exit status 1.
# The defaults shown are the current values of $config, $num_threads and $gpu.
sub print_usage() {
  my @usage_lines = (
    "Usage: queue.pl [options] [JOB=1:n] log-file command-line arguments...\n",
    "e.g.: queue.pl foo.log echo baz\n",
    " (which will echo \"baz\", with stdout and stderr directed to foo.log)\n",
    "or: queue.pl -q all.q\@xyz foo.log echo bar \| sed s/bar/baz/ \n",
    " (which is an example of using a pipe; you can provide other escaped bash constructs)\n",
    "or: queue.pl -q all.q\@qyz JOB=1:10 foo.JOB.log echo JOB \n",
    " (which illustrates the mechanism to submit parallel jobs; note, you can use \n",
    "  another string other than JOB)\n",
    "Note: if you pass the \"-sync y\" option to qsub, this script will take note\n",
    "and change its behavior.  Otherwise it uses qstat to work out when the job finished\n",
    "Options:\n",
    "  --config <config-file> (default: $config)\n",
    "  --mem <mem-requirement> (e.g. --mem 2G, --mem 500M, \n",
    "                           also support K and numbers mean bytes)\n",
    "  --num-threads <num-threads> (default: $num_threads)\n",
    "  --max-jobs-run <num-jobs>\n",
    "  --gpu <0|1> (default: $gpu)\n",
  );
  print STDERR join("", @usage_lines);
  exit 1;
}

# Signal handler: once a job has been submitted ($sge_job_id is defined),
# delete it with qdel, report the signal on STDERR and exit with status 2.
# Before a job id is known the handler does nothing and simply returns.
sub caught_signal {
  if ( defined $sge_job_id ) { # Signal trapped after submitting jobs
    # Perl passes the signal name (e.g. "INT") as the handler's first
    # argument; $! holds the last OS error text, not the signal.
    my $signal = defined $_[0] ? $_[0] : "unknown";
    system ("qdel $sge_job_id");
    print STDERR "Caught a signal: $signal , deleting SGE task: $sge_job_id and exiting\n";
    exit(2);
  }
}

if (@ARGV < 2) {
  print_usage();
}

for (my $x = 1; $x <= 2; $x++) { # This for-loop is to
  # allow the JOB=1:n option to be interleaved with the
  # options to qsub.
  while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) {
    my $switch = shift @ARGV;

    if ($switch eq "-V") {
      $qsub_opts .= "-V ";
    } else {
      my $argument = shift @ARGV;
      if ($argument =~ m/^--/) {
        print STDERR "WARNING: suspicious argument '$argument' to $switch; starts with '-'\n";
      }
      if ($switch eq "-sync" && $argument =~ m/^[yY]/) {
        $sync = 1;
        $qsub_opts .= "$switch $argument ";
      } elsif ($switch eq "-pe") { # e.g. -pe smp 5
        my $argument2 = shift @ARGV;
        $qsub_opts .= "$switch $argument $argument2 ";
        $num_threads = $argument2;
      } elsif ($switch =~ m/^--/) { # Config options
        # Convert CLI option to variable name
        # by removing '--' from the switch and replacing any
        # '-' with a '_'
        $switch =~ s/^--//;
        $switch =~ s/-/_/g;
        $cli_options{$switch} = $argument;
      } else {  # Other qsub options - passed as is
        $qsub_opts .= "$switch $argument ";
      }
    }
  }
  if ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+):(\d+)$/) { # e.g. JOB=1:20
    $array_job = 1;
    $jobname = $1;
    $jobstart = $2;
    $jobend = $3;
    shift;
    if ($jobstart > $jobend) {
      die "queue.pl: invalid job range $ARGV[0]";
    }
    if ($jobstart <= 0) {
      die "run.pl: invalid job range $ARGV[0], start must be strictly positive (this is a GridEngine limitation).";
    }
  } elsif ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+)$/) { # e.g. JOB=1.
    $array_job = 1;
    $jobname = $1;
    $jobstart = $2;
    $jobend = $2;
    shift;
  } elsif ($ARGV[0] =~ m/.+\=.*\:.*$/) {
    print STDERR "queue.pl: Warning: suspicious first argument to queue.pl: $ARGV[0]\n";
  }
}

if (@ARGV < 2) {
  print_usage();
}

if (exists $cli_options{"config"}) {
  $config = $cli_options{"config"};
}

my $default_config_file = <<'EOF';
# Default configuration
command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
option mem=* -l mem_free=$0,ram_free=$0
option mem=0          # Do not add anything to qsub_opts
option num_threads=* -pe smp $0
option num_threads=1  # Do not add anything to qsub_opts
option max_jobs_run=* -tc $0
default gpu=0
option gpu=0
option gpu=* -l gpu=$0 -q '*.q'
EOF

# Here the configuration options specified by the user on the command line
# (e.g. --mem 2G) are converted to options to the qsub system as defined in
# the config file. (e.g. if the config file has the line
# "option mem=* -l ram_free=$0,mem_free=$0"
# and the user has specified '--mem 2G' on the command line, the options
# passed to queue system would be "-l ram_free=2G,mem_free=2G
# A more detailed description of the ways the options would be handled is at
# the top of this file.

$SIG{INT} = \&caught_signal;
$SIG{TERM} = \&caught_signal;

my $opened_config_file = 1;

open CONFIG, "<$config" or $opened_config_file = 0;

my %cli_config_options = ();
my %cli_default_options = ();

if ($opened_config_file == 0 && exists($cli_options{"config"})) {
  print STDERR "Could not open config file $config\n";
  exit(1);
} elsif ($opened_config_file == 0 && !exists($cli_options{"config"})) {
  # Open the default config file instead
  open (CONFIG, "echo '$default_config_file' |") or die "Unable to open pipe\n";
  $config = "Default config";
}

my $qsub_cmd = "";
my $read_command = 0;

while(<CONFIG>) {
  chomp;
  my $line = $_;
  $_ =~ s/\s*#.*//g;
  if ($_ eq "") { next; }
  if ($_ =~ /^command (.+)/) {
    $read_command = 1;
    $qsub_cmd = $1 . " ";
  } elsif ($_ =~ m/^option ([^=]+)=\* (.+)$/) {
    # Config option that needs replacement with parameter value read from CLI
    # e.g.: option mem=* -l mem_free=$0,ram_free=$0
    my $option = $1;     # mem
    my $arg= $2;         # -l mem_free=$0,ram_free=$0
    if ($arg !~ m:\$0:) {
      die "Unable to parse line '$line' in config file ($config)\n";
    }
    if (exists $cli_options{$option}) {
      # Replace $0 with the argument read from command line.
      # e.g. "-l mem_free=$0,ram_free=$0" -> "-l mem_free=2G,ram_free=2G"
      $arg =~ s/\$0/$cli_options{$option}/g;
      $cli_config_options{$option} = $arg;
    }
  } elsif ($_ =~ m/^option ([^=]+)=(\S+)\s?(.*)$/) {
    # Config option that does not need replacement
    # e.g. option gpu=0 -q all.q
    my $option = $1;      # gpu
    my $value = $2;       # 0
    my $arg = $3;         # -q all.q
    if (exists $cli_options{$option}) {
      $cli_default_options{($option,$value)} = $arg;
    }
  } elsif ($_ =~ m/^default (\S+)=(\S+)/) {
    # Default options. Used for setting default values to options i.e. when
    # the user does not specify the option on the command line
    # e.g. default gpu=0
    my $option = $1;  # gpu
    my $value = $2;   # 0
    if (!exists $cli_options{$option}) {
      # If the user has specified this option on the command line, then we
      # don't have to do anything
      $cli_options{$option} = $value;
    }
  } else {
    print STDERR "queue.pl: unable to parse line '$line' in config file ($config)\n";
    exit(1);
  }
}

close(CONFIG);

if ($read_command != 1) {
  print STDERR "queue.pl: config file ($config) does not contain the line \"command .*\"\n";
  exit(1);
}

for my $option (keys %cli_options) {
  if ($option eq "config") { next; }
  if ($option eq "max_jobs_run" && $array_job != 1) { next; }
  my $value = $cli_options{$option};

  if (exists $cli_default_options{($option,$value)}) {
    $qsub_opts .= "$cli_default_options{($option,$value)} ";
  } elsif (exists $cli_config_options{$option}) {
    $qsub_opts .= "$cli_config_options{$option} ";
  } else {
    if ($opened_config_file == 0) { $config = "default config file"; }
    die "queue.pl: Command line option $option not described in $config (or value '$value' not allowed)\n";
  }
}

my $cwd = getcwd();
my $logfile = shift @ARGV;

if ($array_job == 1 && $logfile !~ m/$jobname/
    && $jobend > $jobstart) {
  print STDERR "queue.pl: you are trying to run a parallel job but "
    . "you are putting the output into just one log file ($logfile)\n";
  exit(1);
}

#
# Work out the command; quote escaping is done here.
# Note: the rules for escaping stuff are worked out pretty
# arbitrarily, based on what we want it to do.  Some things that
# we pass as arguments to queue.pl, such as "|", we want to be
# interpreted by bash, so we don't escape them.  Other things,
# such as archive specifiers like 'ark:gunzip -c foo.gz|', we want
# to be passed, in quotes, to the Kaldi program.  Our heuristic
# is that stuff with spaces in should be quoted.  This doesn't
# always work.
#
my $cmd = "";

foreach my $x (@ARGV) {
  if ($x =~ m/^\S+$/) { $cmd .= $x . " "; } # If string contains no spaces, take
                                            # as-is.
  elsif ($x =~ m:\":) { $cmd .= "'$x' "; } # else if no dbl-quotes, use single
  else { $cmd .= "\"$x\" "; }  # else use double.
}

#
# Work out the location of the script file, and open it for writing.
#
my $dir = dirname($logfile);
my $base = basename($logfile);
my $qdir = "$dir/q";
$qdir =~ s:/(log|LOG)/*q:/q:; # If qdir ends in .../log/q, make it just .../q.
my $queue_logfile = "$qdir/$base";

if (!-d $dir) { system "mkdir -p $dir 2>/dev/null"; } # another job may be doing this...
if (!-d $dir) { die "Cannot make the directory $dir\n"; }
# make a directory called "q",
# where we will put the log created by qsub... normally this doesn't contain
# anything interesting, evertyhing goes to $logfile.
# in $qdir/sync we'll put the done.* files... we try to keep this
# directory small because it's transmitted over NFS many times.
if (! -d "$qdir/sync") {
  system "mkdir -p $qdir/sync 2>/dev/null";
  sleep(5); ## This is to fix an issue we encountered in denominator lattice creation,
  ## where if e.g. the exp/tri2b_denlats/log/15/q directory had just been
  ## created and the job immediately ran, it would die with an error because nfs
  ## had not yet synced.  I'm also decreasing the acdirmin and acdirmax in our
  ## NFS settings to something like 5 seconds.
}

my $queue_array_opt = "";
if ($array_job == 1) { # It's an array job.
  $queue_array_opt = "-t $jobstart:$jobend";
  $logfile =~ s/$jobname/\$SGE_TASK_ID/g; # This variable will get
  # replaced by qsub, in each job, with the job-id.
  $cmd =~ s/$jobname/\$\{SGE_TASK_ID\}/g; # same for the command...
  $queue_logfile =~ s/\.?$jobname//; # the log file in the q/ subdirectory
  # is for the queue to put its log, and this doesn't need the task array subscript
  # so we remove it.
}

# queue_scriptfile is as $queue_logfile [e.g. dir/q/foo.log] but
# with the suffix .sh.
my $queue_scriptfile = $queue_logfile;
($queue_scriptfile =~ s/\.[a-zA-Z]{1,5}$/.sh/) || ($queue_scriptfile .= ".sh");
if ($queue_scriptfile !~ m:^/:) {
  $queue_scriptfile = $cwd . "/" . $queue_scriptfile; # just in case.
}

# We'll write to the standard input of "qsub" (the file-handle Q),
# the job that we want it to execute.
# Also keep our current PATH around, just in case there was something
# in it that we need (although we also source ./path.sh)

my $syncfile = "$qdir/sync/done.$$";

unlink($queue_logfile, $syncfile);
#
# Write to the script file, and then close it.
#
open(Q, ">$queue_scriptfile") || die "Failed to write to $queue_scriptfile";

print Q "#!/usr/bin/env bash\n";
print Q "cd $cwd\n";
print Q ". ./path.sh\n";
print Q "( echo '#' Running on \`hostname\`\n";
print Q "  echo '#' Started at \`date\`\n";
print Q "  echo -n '# '; cat <<EOF\n";
print Q "$cmd\n"; # this is a way of echoing the command into a comment in the log file,
print Q "EOF\n"; # without having to escape things like "|" and quote characters.
print Q ") >$logfile\n";
print Q "if ! which free-gpu.sh &> /dev/null; then\n";
print Q "   echo 'command not found: free-gpu.sh not found.'\n";
print Q "   exit 1\n";
print Q "fi\n";
print Q "gpuid=\$(free-gpu.sh -n $cli_options{'gpu'})\n";
print Q "if [[ \${gpuid} == -1 ]]; then\n";
print Q "   echo 'Failed to find enough free GPUs: $cli_options{'gpu'}'\n";
print Q "   exit 1\n";
print Q "fi\n";
print Q "echo \"free gpu: \${gpuid}\" >>$logfile\n";
print Q "export CUDA_VISIBLE_DEVICES=\${gpuid}\n";
print Q "time1=\`date +\"%s\"\`\n";
print Q " ( $cmd ) 2>>$logfile >>$logfile\n";
print Q "ret=\$?\n";
print Q "time2=\`date +\"%s\"\`\n";
print Q "echo '#' Accounting: time=\$((\$time2-\$time1)) threads=$num_threads >>$logfile\n";
print Q "echo '#' Finished at \`date\` with status \$ret >>$logfile\n";
print Q "[ \$ret -eq 137 ] && exit 100;\n"; # If process was killed (e.g. oom) it will exit with status 137;
  # let the script return with status 100 which will put it to E state; more easily rerunnable.
if ($array_job == 0) { # not an array job
  print Q "touch $syncfile\n"; # so we know it's done.
} else {
  print Q "touch $syncfile.\$SGE_TASK_ID\n"; # touch a bunch of sync-files.
}
print Q "exit \$[\$ret ? 1 : 0]\n"; # avoid status 100 which grid-engine
print Q "## submitted with:\n";       # treats specially.
$qsub_cmd .= "-o $queue_logfile $qsub_opts $queue_array_opt $queue_scriptfile >>$queue_logfile 2>&1";
print Q "# $qsub_cmd\n";
if (!close(Q)) { # close was not successful... || die "Could not close script file $shfile";
  die "Failed to close the script file (full disk?)";
}
chmod 0755, $queue_scriptfile;

# This block submits the job to the queue.
for (my $try = 1; $try < 5; $try++) {
  my $ret = system ($qsub_cmd);
  if ($ret != 0) {
    if ($sync && $ret == 256) { # this is the exit status when a job failed (bad exit status)
      if (defined $jobname) {
        $logfile =~ s/\$SGE_TASK_ID/*/g;
      }
      print STDERR "queue.pl: job writing to $logfile failed\n";
      exit(1);
    } else {
      print STDERR "queue.pl: Error submitting jobs to queue (return status was $ret)\n";
      print STDERR "queue log file is $queue_logfile, command was $qsub_cmd\n";
      my $err = `tail $queue_logfile`;
      print STDERR "Output of qsub was: $err\n";
      if ($err =~ m/gdi request/ || $err =~ m/qmaster/) {
        # When we get queue connectivity problems we usually see a message like:
        # Unable to run job: failed receiving gdi request response for mid=1 (got
        # syncron message receive timeout error)..
        my $waitfor = 20;
        print STDERR "queue.pl: It looks like the queue master may be inaccessible. " .
          " Trying again after $waitfor seconts\n";
        sleep($waitfor);
        # ... and continue throught the loop.
      } else {
        exit(1);
      }
    }
  } else {
    last;  # break from the loop.
  }
}

if (! $sync) { # We're not submitting with -sync y, so we
  # need to wait for the jobs to finish.  We wait for the
  # sync-files we "touched" in the script to exist.
  my @syncfiles = ();
  if (!defined $jobname) { # not an array job.
    push @syncfiles, $syncfile;
  } else {
    # Array job: one sync-file per task index.
    for (my $jobid = $jobstart; $jobid <= $jobend; $jobid++) {
      push @syncfiles, "$syncfile.$jobid";
    }
  }
  # We will need the sge_job_id, to check that job still exists
  { # This block extracts the numeric SGE job-id from the log file in q/.
    # It may be used later to query 'qstat' about the job.
    open(L, "<$queue_logfile") || die "Error opening log file $queue_logfile";
    undef $sge_job_id;
    while (<L>) {
      if (m/Your job\S* (\d+)[. ].+ has been submitted/) {
        if (defined $sge_job_id) {
          die "Error: your job was submitted more than once (see $queue_logfile)";
        } else {
          $sge_job_id = $1;
        }
      }
    }
    close(L);
    if (!defined $sge_job_id) {
      die "Error: log file $queue_logfile does not specify the SGE job-id.";
    }
  }
  # Counts polling iterations; used below to throttle how often qstat is run.
  my $check_sge_job_ctr=1;

  my $wait = 0.1;
  my $counter = 0;
  foreach my $f (@syncfiles) {
    # wait for the jobs to finish one by one.
    while (! -f $f) {
      sleep($wait);
      $wait *= 1.2;
      if ($wait > 3.0) {
        $wait = 3.0; # never wait more than 3 seconds.
        # the following (.kick) commands are basically workarounds for NFS bugs.
        if (rand() < 0.25) { # don't do this every time...
          if (rand() > 0.5) {
            system("touch $qdir/sync/.kick");
          } else {
            unlink("$qdir/sync/.kick");
          }
        }
        if ($counter++ % 10 == 0) {
          # This seems to kick NFS in the teeth to cause it to refresh the
          # directory.  I've seen cases where it would indefinitely fail to get
          # updated, even though the file exists on the server.
          # Only do this every 10 waits (every 30 seconds) though, or if there
          # are many jobs waiting they can overwhelm the file server.
          system("ls $qdir/sync >/dev/null");
        }
      }

      # The purpose of the next block is so that queue.pl can exit if the job
      # was killed without terminating.  It's a bit complicated because (a) we
      # don't want to overload the qmaster by querying it too frequently), and
      # (b) sometimes the qmaster is unreachable or temporarily down, and we
      # don't want this to necessarily kill the job.
      if (($check_sge_job_ctr < 100 && ($check_sge_job_ctr++ % 10) == 0) ||
          ($check_sge_job_ctr >= 100 && ($check_sge_job_ctr++ % 50) == 0)) {
        # Don't run qstat too often, avoid stress on SGE; the if-condition above
        # is designed to check every 10 waits at first, and eventually every 50
        # waits.
        if ( -f $f ) { next; }  #syncfile appeared: OK.
        my $output = `qstat -j $sge_job_id 2>&1`;
        my $ret = $?;
        if ($ret >> 8 == 1 && $output !~ m/qmaster/ &&
            $output !~ m/gdi request/) {
          # Don't consider immediately missing job as error, first wait some
          # time to make sure it is not just delayed creation of the syncfile.

          sleep(3);
          # Sometimes NFS gets confused and thinks it's transmitted the directory
          # but it hasn't, due to timestamp issues.  Changing something in the
          # directory will usually fix that.
          system("touch $qdir/sync/.kick");
          unlink("$qdir/sync/.kick");
          if ( -f $f ) { next; }   #syncfile appeared, ok
          sleep(7);
          system("touch $qdir/sync/.kick");
          sleep(1);
          # Fixed: the path previously lacked the '$' sigil ("qdir/sync/.kick"),
          # so the kick file that was just touched was never removed here.
          unlink("$qdir/sync/.kick");
          if ( -f $f ) {  next; }   #syncfile appeared, ok
          sleep(60);
          system("touch $qdir/sync/.kick");
          sleep(1);
          unlink("$qdir/sync/.kick");
          if ( -f $f ) { next; }  #syncfile appeared, ok
          # The numeric suffix of the sync-file name is substituted for
          # $SGE_TASK_ID in the log-file name when this is an array job.
          $f =~ m/\.(\d+)$/ || die "Bad sync-file name $f";
          my $job_id = $1;
          if (defined $jobname) {
            $logfile =~ s/\$SGE_TASK_ID/$job_id/g;
          }
          my $last_line = `tail -n 1 $logfile`;
          if ($last_line =~ m/status 0$/ && (-M $logfile) < 0) {
            # if the last line of $logfile ended with "status 0" and
            # $logfile is newer than this program [(-M $logfile) gives the
            # time elapsed between file modification and the start of this
            # program], then we assume the program really finished OK,
            # and maybe something is up with the file system.
            print STDERR "**queue.pl: syncfile $f was not created but job seems\n" .
              "**to have finished OK.  Probably your file-system has problems.\n" .
              "**This is just a warning.\n";
            last;
          } else {
            chop $last_line;
            print STDERR "queue.pl: Error, unfinished job no " .
              "longer exists, log is in $logfile, last line is '$last_line', " .
              "syncfile is $f, return status of qstat was $ret\n" .
              "Possible reasons: a) Exceeded time limit? -> Use more jobs!" .
              " b) Shutdown/Frozen machine? -> Run again!  Qmaster output " .
              "was: $output\n";
            exit(1);
          }
        } elsif ($ret != 0) {
          print STDERR "queue.pl: Warning: qstat command returned status $ret (qstat -j $sge_job_id,$!)\n";
          print STDERR "queue.pl: output was: $output";
        }
      }
    }
  }
  # All jobs are accounted for; remove the sync-files.
  unlink(@syncfiles);
}

# OK, at this point we are synced; we know the job is done.
# But we don't know about its exit status.  We'll look at $logfile for this.
# First work out an array @logfiles of file-locations we need to
# read (just one, unless it's an array job).
my @logfiles = ();
if (defined $jobname) {
  # Array job: one log file per task, obtained by replacing the literal
  # $SGE_TASK_ID placeholder in $logfile with the task index.
  my $task = $jobstart;
  while ($task <= $jobend) {
    (my $task_log = $logfile) =~ s/\$SGE_TASK_ID/$task/g;
    push @logfiles, $task_log;
    $task++;
  }
} else {
  # Single job: there is exactly one log file to inspect.
  push @logfiles, $logfile;
}

# Scan every log file for a "with status N" line and count the jobs whose
# status is non-zero.  $status keeps the value parsed from the most recently
# examined log and starts out as 1 (failure).
my $num_failed = 0;
my $status = 1;
foreach my $l (@logfiles) {
  # Delays (in seconds) between successive attempts to find the status line.
  my @wait_times = (0.1, 0.2, 0.2, 0.3, 0.5, 0.5, 1.0, 2.0, 5.0, 5.0, 5.0, 10.0, 25.0);
  # The loop runs one more time than there are delays: the extra, final
  # iteration is the one that gives up and reports the problem.
  for (my $iter = 0; $iter <= @wait_times; $iter++) {
    my $line = `tail -10 $l 2>/dev/null`; # Note: although this line should be the last
    # line of the file, I've seen cases where it was not quite the last line because
    # of delayed output by the process that was running, or processes it had called.
    # so tail -10 gives it a little leeway.
    if ($line =~ m/with status (\d+)/) {
      $status = $1;
      last;
    } else {
      if ($iter < @wait_times) {
        sleep($wait_times[$iter]);
      } else {
        # Out of retries: say whether the log is missing or merely lacks the
        # status line, then abort.
        if (! -f $l) {
          print STDERR "Log-file $l does not exist.\n";
        } else {
          print STDERR "The last line of log-file $l does not seem to indicate the "
            . "return status as expected\n";
        }
        exit(1);                # Something went wrong with the queue, or the
        # machine it was running on, probably.
      }
    }
  }
  # OK, now we have $status, which is the return-status of
  # the command in the job.
  if ($status != 0) { $num_failed++; }
}
# Exit 0 only if every job reported status 0; otherwise print where the
# log(s) are and exit 1.
if ($num_failed == 0) { exit(0); }
else { # we failed.
  if (@logfiles == 1) {
    if (defined $jobname) { $logfile =~ s/\$SGE_TASK_ID/$jobstart/g; }
    print STDERR "queue.pl: job failed with status $status, log is in $logfile\n";
    # A literal "JOB" left in the log name hints that no JOB=a:b range was given.
    if ($logfile =~ m/JOB/) {
      print STDERR "queue.pl: probably you forgot to put JOB=1:\$nj in your script.\n";
    }
  } else {
    # Several logs: show the log name as a glob pattern.
    if (defined $jobname) { $logfile =~ s/\$SGE_TASK_ID/*/g; }
    my $numjobs = 1 + $jobend - $jobstart;
    print STDERR "queue.pl: $num_failed / $numjobs failed, log is in $logfile\n";
  }
  exit(1);
}


================================================
FILE: egs/espnet_utils/recog_wav.sh
================================================
#!/usr/bin/env bash

# Copyright 2019 Nagoya University (Takenori Yoshimura)
#           2019 RevComm Inc. (Takekatsu Hiramura)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

# The script sources ./path.sh and ./cmd.sh, so it must be started from a
# directory that contains both.
if [ ! -f path.sh ] || [ ! -f cmd.sh ]; then
    echo "Please change current directory to recipe directory e.g., egs/tedlium2/asr1"
    exit 1
fi

. ./path.sh

# general configuration
backend=pytorch
stage=0        # start from 0 if you need to start from data preparation
stop_stage=100
ngpu=0         # number of gpus ("0" uses cpu, otherwise use gpu)
debugmode=1
verbose=1      # verbose option

# feature configuration
do_delta=false
cmvn=

# rnnlm related
use_lang_model=true
lang_model=

# decoding parameter
recog_model=
decode_config=
decode_dir=decode
api=v2

# download related
models=tedlium2.transformer.v1

# Usage text; printed below when no positional argument is given.
help_message=$(cat <<EOF
Usage:
    $0 [options] <wav_file>

Options:
    --backend <chainer|pytorch>     # chainer or pytorch (Default: pytorch)
    --ngpu <ngpu>                   # Number of GPUs (Default: 0)
    --decode_dir <directory_name>   # Name of directory to store decoding temporary data
    --models <model_name>           # Model name (e.g. tedlium2.transformer.v1)
    --cmvn <path>                   # Location of cmvn.ark
    --lang_model <path>             # Location of language model
    --recog_model <path>            # Location of E2E model
    --decode_config <path>          # Location of configuration file
    --api <api_version>             # API version (v1 or v2, available in only pytorch backend)

Example:
    # Record audio from microphone input as example.wav
    rec -c 1 -r 16000 example.wav trim 0 5

    # Decode using model name
    $0 --models tedlium2.transformer.v1 example.wav

    # Decode with streaming mode (only RNN with API v1 is supported)
    $0 --models tedlium2.rnn.v2 --api v1 example.wav

    # Decode using model file
    $0 --cmvn cmvn.ark --lang_model rnnlm.model.best --recog_model model.acc.best --decode_config conf/decode.yaml example.wav

    # Decode with GPU (require batchsize > 0 in configuration file)
    $0 --ngpu 1 example.wav

Available models:
    - tedlium2.rnn.v1
    - tedlium2.rnn.v2
    - tedlium2.transformer.v1
    - tedlium3.transformer.v1
    - librispeech.transformer.v1
    - librispeech.transformer.v1.transformerlm.v1
    - commonvoice.transformer.v1
    - csj.transformer.v1
EOF
)
# NOTE(review): presumably parse_options.sh maps "--name value" flags onto the
# variables defined above and leaves the positional arguments in $@ -- confirm.
. utils/parse_options.sh || exit 1;

# make shellcheck happy
train_cmd=
decode_cmd=

. ./cmd.sh

# $1 is read before the argument-count check and before `set -u`, so a missing
# argument leaves wav empty here and is rejected just below.
wav=$1
download_dir=${decode_dir}/download

if [ $# -lt 1 ]; then
    echo "${help_message}"
    exit 1;
fi

# From here on: abort on command failure, unset variables and pipeline errors.
set -e
set -u
set -o pipefail

# check api version
if [ "${api}" = "v2" ] && [ "${backend}" = "chainer" ]; then
    echo "chainer backend does not support api v2." >&2
    exit 1;
fi

# Check model name or model file is set
if [ -z $models ]; then
    # Without a model name every file has to be given explicitly; the language
    # model is only required when use_lang_model is "true".
    if [ $use_lang_model = "true" ]; then
        if [[ -z $cmvn || -z $lang_model || -z $recog_model || -z $decode_config ]]; then
            echo 'Error: models or set of cmvn, lang_model, recog_model and decode_config are required.' >&2
            exit 1
        fi
    else
        if [[ -z $cmvn || -z $recog_model || -z $decode_config ]]; then
            echo 'Error: models or set of cmvn, recog_model and decode_config are required.' >&2
            exit 1
        fi
    fi
fi

# Per-model download directory; it is created unconditionally at this point.
dir=${download_dir}/${models}
mkdir -p ${dir}

# Download the pre-trained model archive named by ${models} into ${dir}.
#
# Reads the globals ${models} and ${dir}.  Does nothing when ${models} is
# empty.  For the two tedlium2 RNN models the global ${api} is forced to v1.
# An unknown model name prints a message and terminates the script with
# status 1.  After the download command succeeds, ${dir}/.complete is created
# so that later calls skip the download.
function download_models () {
    # Quoted so that an empty or whitespace-containing value is tested as a
    # single word.
    if [ -z "${models}" ]; then
        return
    fi

    # Helper values are kept local so they do not leak into the caller's scope;
    # ${api} is deliberately left global (see above).
    local file_ext="tar.gz"
    local share_url
    case "${models}" in
        "tedlium2.rnn.v1") share_url="https://drive.google.com/open?id=1UqIY6WJMZ4sxNxSugUqp3mrGb3j6h7xe"; api=v1 ;;
        "tedlium2.rnn.v2") share_url="https://drive.google.com/open?id=1cac5Uc09lJrCYfWkLQsF8eapQcxZnYdf"; api=v1 ;;
        "tedlium2.transformer.v1") share_url="https://drive.google.com/open?id=1cVeSOYY1twOfL9Gns7Z3ZDnkrJqNwPow" ;;
        "tedlium3.transformer.v1") share_url="https://drive.google.com/open?id=1zcPglHAKILwVgfACoMWWERiyIquzSYuU" ;;
        "librispeech.transformer.v1") share_url="https://drive.google.com/open?id=1BtQvAnsFvVi-dp_qsaFP7n4A_5cwnlR6" ;;
        "librispeech.transformer.v1.transformerlm.v1") share_url="https://drive.google.com/open?id=17cOOSHHMKI82e1MXj4r2ig8gpGCRmG2p" ;;
        "commonvoice.transformer.v1") share_url="https://drive.google.com/open?id=1tWccl6aYU67kbtkm8jv5H6xayqg1rzjh" ;;
        "csj.transformer.v1") share_url="https://drive.google.com/open?id=120nUQcSsKeY5dpyMWw_kI33ooMRGT2uF" ;;
        *) echo "No such models: ${models}"; exit 1 ;;
    esac

    # The marker file makes repeated calls cheap: only the first one downloads.
    if [ ! -e "${dir}/.complete" ]; then
        download_from_google_drive.sh "${share_url}" "${dir}" "${file_ext}"
        touch "${dir}/.complete"
    fi
}

# Download trained models
# Every input that was not given on the command line is looked up by file name
# under the model's download directory.  download_models is called before each
# lookup; `head -n 1` keeps the first match when several files match.
if [ -z "${cmvn}" ]; then
    download_models
    cmvn=$(find ${download_dir}/${models} -name "cmvn.ark" | head -n 1)
fi
# The language model is only resolved when use_lang_model evaluates to true.
if [ -z "${lang_model}" ] && ${use_lang_model}; then
    download_models
    lang_model=$(find ${download_dir}/${models} -name "rnnlm*.best*" | head -n 1)
fi
if [ -z "${recog_model}" ]; then
    download_models
    recog_model=$(find ${download_dir}/${models} -name "model*.best*" | head -n 1)
fi
if [ -z "${decode_config}" ]; then
    download_models
    decode_config=$(find ${download_dir}/${models} -name "decode*.yaml" | head -n 1)
fi
# wav can only be empty here when an empty string was passed as the first
# argument; in that case a *.wav file from the download directory is used.
if [ -z "${wav}" ]; then
    download_models
    wav=$(find ${download_dir}/${models} -name "*.wav" | head -n 1)
fi

# Check file existence
if [ ! -f "${cmvn}" ]; then
    echo "No such CMVN file: ${cmvn}"
    exit 1
fi
if [ ! -f "${lang_model}" ] && ${use_lang_model}; then
    echo "No such language model: ${lang_model}"
    exit 1
fi
if [ ! -f "${recog_model}" ]; then
    echo "No such E2E model: ${recog_model}"
    exit 1
fi
if [ ! -f "${decode_config}" ]; then
    echo "No such config file: ${decode_config}"
    exit 1
fi
if [ ! -f "${wav}" ]; then
    echo "No such WAV file: ${wav}"
    exit 1
fi

# All intermediate files go into a sub-directory named after the wav file
# (without its .wav suffix).
base=$(basename $wav .wav)
decode_dir=${decode_dir}/${base}

# A stage runs when stage <= N <= stop_stage.
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    echo "stage 0: Data preparation"

    # Build a one-utterance data directory.  wav.scp pipes the input through
    # sox with -r 16000 -c 1 -b 16; speaker and transcript are the placeholder "X".
    mkdir -p ${decode_dir}/data
    echo "$base sox $wav -R -r 16000 -c 1 -b 16 -t wav - dither |" > ${decode_dir}/data/wav.scp
    echo "X $base" > ${decode_dir}/data/spk2utt
    echo "$base X" > ${decode_dir}/data/utt2spk
    echo "$base X" > ${decode_dir}/data/text
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    echo "stage 1: Feature Generation"

    steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 1 --write_utt2num_frames true \
        ${decode_dir}/data ${decode_dir}/log ${decode_dir}/fbank

    # dump.sh receives the features, the CMVN file and the output directory.
    feat_recog_dir=${decode_dir}/dump; mkdir -p ${feat_recog_dir}
    dump.sh --cmd "$train_cmd" --nj 1 --do_delta ${do_delta} \
        ${decode_dir}/data/feats.scp ${cmvn} ${decode_dir}/log \
        ${feat_recog_dir}
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    echo "stage 2: Json Data Preparation"

    # A throw-away dictionary holding only <unk> is written for data2json.sh
    # and removed again afterwards.
    dict=${decode_dir}/dict
    echo "<unk> 1" > ${dict}
    feat_recog_dir=${decode_dir}/dump
    data2json.sh --feat ${feat_recog_dir}/feats.scp \
        ${decode_dir}/data ${dict} > ${feat_recog_dir}/data.json
    rm -f ${dict}
fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    echo "stage 3: Decoding"
    # --rnnlm is only passed when a language model is in use.
    if ${use_lang_model}; then
        recog_opts="--rnnlm ${lang_model}"
    else
        recog_opts=""
    fi
    feat_recog_dir=${decode_dir}/dump

    ${decode_cmd} ${decode_dir}/log/decode.log \
        asr_recog.py \
        --config ${decode_config} \
        --ngpu ${ngpu} \
        --backend ${backend} \
        --debugmode ${debugmode} \
        --verbose ${verbose} \
        --recog-json ${feat_recog_dir}/data.json \
        --result-label ${decode_dir}/result.json \
        --model ${recog_model} \
        --api ${api} \
        ${recog_opts}

    echo ""
    # Pull the quoted value of the rec_text field out of result.json and drop
    # the first <eos> marker on each line.
    recog_text=$(grep rec_text ${decode_dir}/result.json | sed -e 's/.*: "\(.*\)".*/\1/' | sed -e 's/<eos>//')
    echo "Recognized text: ${recog_text}"
    echo ""
    echo "Finished"
fi


================================================
FILE: egs/espnet_utils/reduce_data_dir.sh
================================================
#!/usr/bin/env bash

# koried, 10/29/2012

# Reduce a data set based on a list of turn-ids

help_message="usage: $0 srcdir turnlist destdir"

# "$1" is quoted and defaulted so that running the script without arguments
# does not make `[` fail with "unary operator expected" before the usage
# check below gets a chance to run.
if [ "${1:-}" = "--help" ]; then
    echo "${help_message}"
    exit 0;
fi

# Exactly three positional arguments are required.
if [ $# != 3 ]; then
    echo "${help_message}"
    exit 1;
fi

srcdir=$1
reclist=$2
destdir=$3

# utt2spk is the table everything else is filtered from, so it must exist.
if [ ! -f "${srcdir}/utt2spk" ]; then
    echo "$0: no such file $srcdir/utt2spk"
    exit 1;
fi

function do_filtering {
    # Precondition: ${destdir}/utt2spk and ${destdir}/spk2utt were already written.
    # Reads the globals ${srcdir} and ${destdir}; every table is optional and is
    # only filtered when it exists in ${srcdir}.
    local name

    # Tables filtered against the reduced utt2spk.
    for name in feats.scp wav.scp text utt2num_frames; do
        if [ -f ${srcdir}/${name} ]; then
            utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/${name} >${destdir}/${name}
        fi
    done

    # Tables filtered against the reduced spk2utt.
    for name in spk2gender cmvn.scp; do
        if [ -f ${srcdir}/${name} ]; then
            utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/${name} >${destdir}/${name}
        fi
    done

    if [ -f ${srcdir}/segments ]; then
        utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/segments >${destdir}/segments
        # Second column of segments = recording ids that are still referenced.
        awk '{print $2;}' ${destdir}/segments | sort | uniq > ${destdir}/reco # recordings.

        # With a segments file, wav.scp is filtered again by recording id,
        # replacing the utterance-based wav.scp written above.  The STM file is
        # filtered for sclite scoring (this also removes its comment lines).
        for name in wav.scp reco2file_and_channel stm; do
            if [ -f ${srcdir}/${name} ]; then
                utils/filter_scp.pl ${destdir}/reco <${srcdir}/${name} >${destdir}/${name}
            fi
        done
        rm ${destdir}/reco
    fi

    # Report how many utterances survived.
    srcutts=$(wc -l < ${srcdir}/utt2spk)
    destutts=$(wc -l < ${destdir}/utt2spk)
    echo "Reduced #utt from $srcutts to $destutts"
}

mkdir -p ${destdir}

# filter the utt2spk based on the set of recordings
# (${reclist} is the list file given as the second argument)
utils/filter_scp.pl ${reclist} < ${srcdir}/utt2spk > ${destdir}/utt2spk

# Derive spk2utt from the reduced utt2spk, then filter the remaining tables;
# do_filtering needs both files to be present in ${destdir}.
utils/utt2spk_to_spk2utt.pl < ${destdir}/utt2spk > ${destdir}/spk2utt
do_filtering;


================================================
FILE: egs/espnet_utils/remove_longshortdata.sh
================================================
#!/usr/bin/env bash

# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

. ./path.sh

# Length limits.  All four bounds are exclusive: an utterance is kept only if
# minframes < #frames < maxframes and minchars < #tokens < maxchars.
maxframes=2000
minframes=10
maxchars=200
minchars=0
nlsyms=""        # optional file of non-linguistic symbols, passed to text2token.py -l
no_feat=false    # true: no acoustic features (machine translation), skip the frame filter
trans_type=char

help_message="usage: $0 olddatadir newdatadir"

. utils/parse_options.sh || exit 1;

if [ $# != 2 ]; then
    echo "${help_message}"
    exit 1;
fi

sdir=$1
odir=$2
mkdir -p ${odir}/tmp

# reclist1: utterance ids that pass the frame-count filter.
if [ ${no_feat} = true ]; then
    # for machine translation
    cut -d' ' -f 1 ${sdir}/text > ${odir}/tmp/reclist1
else
    echo "extract utterances having less than $maxframes or more than $minframes frames"
    utils/data/get_utt2num_frames.sh ${sdir}
    < ${sdir}/utt2num_frames  awk -v maxframes="$maxframes" '{ if ($2 < maxframes) print }' \
        | awk -v minframes="$minframes" '{ if ($2 > minframes) print }' \
        | awk '{print $1}' > ${odir}/tmp/reclist1
fi

# reclist2: utterance ids that pass the token-count filter.
echo "extract utterances having less than $maxchars or more than $minchars characters"
# counting number of chars. Use (NF - 1) instead of NF to exclude the utterance ID column
# -l is only passed when a symbol list was given; the test is quoted so that an
# empty or whitespace-containing value is handled as a single word.
nlsyms_opts=""
if [ -n "${nlsyms}" ]; then
    nlsyms_opts="-l ${nlsyms}"
fi
text2token.py ${nlsyms_opts} -s 1 -n 1 ${sdir}/text --trans_type ${trans_type} \
    | awk -v maxchars="$maxchars" '{ if (NF - 1 < maxchars) print }' \
    | awk -v minchars="$minchars" '{ if (NF - 1 > minchars) print }' \
    | awk '{print $1}' > ${odir}/tmp/reclist2

# extract common lines
comm -12 <(sort ${odir}/tmp/reclist1) <(sort ${odir}/tmp/reclist2) > ${odir}/tmp/reclist

reduce_data_dir.sh ${sdir} ${odir}/tmp/reclist ${odir}
utils/fix_data_dir.sh ${odir}

# Report the size change.  With no_feat=true there is no feats.scp, so count
# the lines of text instead of running wc on a file that does not exist.
if [ ${no_feat} = true ]; then
    count_file=text
else
    count_file=feats.scp
fi
oldnum=$(wc -l ${sdir}/${count_file} | awk '{print $1}')
newnum=$(wc -l ${odir}/${count_file} | awk '{print $1}')
echo "change from $oldnum to $newnum"


================================================
FILE: egs/espnet_utils/remove_punctuation.pl
================================================
#!/usr/bin/perl

use warnings;
use strict;

# Both streams carry UTF-8 text.
binmode(STDIN,":utf8");
binmode(STDOUT,":utf8");

# Filter STDIN to STDOUT line by line: strip punctuation (keeping apostrophes
# and the <space> token) and normalise whitespace.
while (my $line = <STDIN>) {
  # Pad with blanks; the trailing newline is absorbed by the whitespace
  # clean-up further down.
  $line = " $line ";

  # Protect <space> (needed for scoring) and apostrophes with placeholder
  # words, delete every punctuation character, then restore the two.
  $line =~ s/<space>/spacemark/g;
  $line =~ s/'/apostrophe/g;
  $line =~ s/[[:punct:]]//g;
  $line =~ s/apostrophe/'/g;
  $line =~ s/spacemark/<space>/g;

  # Collapse whitespace runs to one blank and trim both ends.
  $line =~ s/\s+/ /g;
  $line =~ s/^\s+//;
  $line =~ s/\s+$//;

  print "$line\n";
}


================================================
FILE: egs/espnet_utils/rerank_mmi.py
================================================
import sys
import json
import codecs


def rerank(data, weight):
    """Re-score and re-sort the hypotheses of every utterance in place.

    For each hypothesis in ``data["utts"][name]["output"]`` the new score is
    ``float(score) + float(mmi_tot_score) * weight``; the hypothesis list is
    then sorted by that score in descending order.

    Args:
        data: Parsed decoding-result JSON with a top-level ``"utts"`` dict.
        weight: Multiplier applied to each hypothesis' ``"mmi_tot_score"``.

    Returns:
        The same ``data`` object, modified in place.
    """
    for utt in data["utts"].values():
        hyp_lst = utt["output"]
        for hyp in hyp_lst:
            hyp["score"] = float(hyp["score"]) + float(hyp["mmi_tot_score"]) * weight
        hyp_lst.sort(key=lambda hyp: hyp["score"], reverse=True)
    return data


def main(argv):
    """Command-line entry point: ``rerank_mmi.py <json_in> <weight> <json_out>``."""
    if len(argv) < 4:
        # Explicit usage message instead of a bare IndexError traceback.
        sys.exit("Usage: {} <json_in> <weight> <json_out>".format(argv[0]))

    json_f = argv[1]
    json_f_out = argv[3]
    weight = float(argv[2])

    with codecs.open(json_f, "r", encoding="utf-8") as f:
        j = json.load(f)

    rerank(j, weight)

    # Written as UTF-8 bytes so that non-ASCII text is preserved verbatim.
    with open(json_f_out, "wb") as f:
        f.write(
            json.dumps(
                j, indent=4, ensure_ascii=False, sort_keys=True
            ).encode("utf_8")
        )


if __name__ == "__main__":
    main(sys.argv)


================================================
FILE: egs/espnet_utils/result2json.py
================================================
#!/usr/bin/env python3
# encoding: utf-8

# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
#           2018 Xuankai Chang (Shanghai Jiao Tong University)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

import argparse
import codecs
import json
import re
import sys

is_python2 = sys.version_info[0] == 2


def get_parser():
    """Build the argument parser for the sclite-result-to-JSON converter.

    Returns:
        An ``argparse.ArgumentParser`` with a single string option
        ``--key`` / ``-k``.
    """
    cli = argparse.ArgumentParser(description="convert sclite's result.txt file to json")
    cli.add_argument("--key", "-k", help="key", type=str)
    return cli


if __name__ == "__main__":
    args = get_parser().parse_args()

    # The JSON key is the first "r<digits>h<digits>" tag found in --key.
    key = re.findall(r"r\d+h\d+", args.key)[0]

    # A line starting with "id: " opens a new utterance; the other prefixes
    # mark the fields collected for the current utterance.
    id_pattern = re.compile(r"^id: ")
    field_patterns = {
        "Speaker": re.compile(r"^Speaker sentences"),
        "Scores": re.compile(r"^Scores: "),
        "REF": re.compile(r"^REF: "),
        "HYP": re.compile(r"^HYP: "),
    }

    utts = {}
    current_id = None
    current_fields = {}

    # Re-wrap the standard streams as UTF-8 text; on Python 3 the codec wrapper
    # goes around the underlying byte buffers.
    if is_python2:
        raw_in, raw_out = sys.stdin, sys.stdout
    else:
        raw_in, raw_out = sys.stdin.buffer, sys.stdout.buffer
    sys.stdin = codecs.getreader("utf-8")(raw_in)
    sys.stdout = codecs.getwriter("utf-8")(raw_out)

    while True:
        line = sys.stdin.readline()
        if not line:
            break
        stripped = line.rstrip()
        tokens = stripped.split()

        if id_pattern.match(stripped):
            # Flush the previous utterance before starting the next one.
            if current_id:
                utts[current_id] = {key: current_fields}
                current_fields = {}
            current_id = tokens[1]
        for field, pattern in field_patterns.items():
            if pattern.match(stripped):
                # Everything after the first whitespace-separated token.
                current_fields[field] = " ".join(tokens[1:])

    # Flush the last utterance.
    if current_fields:
        utts[current_id] = {key: current_fields}

    # ensure_ascii=False keeps non-ASCII characters unescaped in the output.
    jsonstring = json.dumps(
        {"utts": utts},
        indent=4,
        ensure_ascii=False,
        sort_keys=True,
        separators=(",", ": "),
    )
    print(jsonstring)


================================================
FILE: egs/espnet_utils/score_bleu.sh
================================================
#!/usr/bin/env bash

# Copyright 2019 Kyoto University (Hirofumi Inaguma)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

export LC_ALL=C

. ./path.sh

nlsyms=""                # optional symbol list handed to filt.py -v
bpe=""                   # non-empty: hypotheses are sentencepiece pieces
bpemodel=""              # sentencepiece model used when bpe is set
filter=""                # optional sed script applied to the detokenized text
case=lc                  # tc: case-sensitive BLEU, anything else: lowercased BLEU
set=""                   # label written as the first line of the result file
remove_nonverbal=true    # true: score the text with "(...)" labels removed

. utils/parse_options.sh

if [ $# -lt 3 ]; then
    echo "Usage: $0 <decode-dir> <tgt_lang> <dict-tgt> <dict-src>";
    exit 1;
fi

dir=$1
tgt_lang=$2
dic_tgt=$3
dic_src=$4

concatjson.py ${dir}/data.*.json > ${dir}/data.json
json2trn_mt.py ${dir}/data.json ${dic_tgt} --refs ${dir}/ref.trn.org \
    --hyps ${dir}/hyp.trn.org --srcs ${dir}/src.trn.org --dict-src ${dic_src}

# remove utterance id
for x in ref hyp src; do
    perl -pe 's/\([^\)]+\)\n/\n/g;' ${dir}/${x}.trn.org > ${dir}/${x}.trn
done

# remove non-verbal labels (optional)
for x in ref hyp src; do
    perl -pe 's/\([^\)]+\)//g;' ${dir}/${x}.trn > ${dir}/${x}.rm.trn
done

# Choose which transcripts are scored.  The flag is compared against the
# string "true": a bare `[ ${remove_nonverbal} ]` is also true for "false",
# which made --remove_nonverbal false a no-op.
if [ "${remove_nonverbal}" = true ]; then
    trn_ext=rm.trn
else
    trn_ext=trn
fi

# Convert tokens back to words.
for x in ref hyp src; do
    if [ -n "$bpe" ]; then
        spm_decode --model=${bpemodel} --input_format=piece < ${dir}/${x}.${trn_ext} | sed -e "s/▁/ /g" > ${dir}/${x}.wrd.trn
    else
        sed -e "s/ //g" -e "s/(/ (/" -e "s/<space>/ /g" -e "s/>/> /g" ${dir}/${x}.${trn_ext} > ${dir}/${x}.wrd.trn
    fi
done

# detokenize
for x in ref hyp src; do
    detokenizer.perl -l ${tgt_lang} -q < ${dir}/${x}.wrd.trn > ${dir}/${x}.wrd.trn.detok
done

# remove language IDs
if [ -n "${nlsyms}" ]; then
    for x in ref hyp src; do
        cp ${dir}/${x}.wrd.trn.detok ${dir}/${x}.wrd.trn.detok.tmp
        filt.py -v $nlsyms ${dir}/${x}.wrd.trn.detok.tmp > ${dir}/${x}.wrd.trn.detok
    done
fi
if [ -n "${filter}" ]; then
    for x in hyp ref src; do
        sed -i.bak3 -f ${filter} ${dir}/${x}.wrd.trn.detok
    done
fi
# NOTE: this must be performed after detokenization so that punctuation marks are not removed

if [ ${case} = tc ]; then
    echo ${set} > ${dir}/result.tc.txt
    multi-bleu-detok.perl ${dir}/ref.wrd.trn.detok < ${dir}/hyp.wrd.trn.detok >> ${dir}/result.tc.txt
    echo "write a case-sensitive BLEU result in ${dir}/result.tc.txt"
    cat ${dir}/result.tc.txt
else
    echo ${set} > ${dir}/result.lc.txt
    # Append (>>) like the tc branch; a plain > discarded the ${set} header
    # written on the previous line.
    multi-bleu-detok.perl -lc ${dir}/ref.wrd.trn.detok < ${dir}/hyp.wrd.trn.detok >> ${dir}/result.lc.txt
    echo "write a case-insensitive BLEU result in ${dir}/result.lc.txt"
    cat ${dir}/result.lc.txt
fi

# TODO(hirofumi): add TER & METEOR metrics here


================================================
FILE: egs/espnet_utils/score_lang_id.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright 2021 Johns Hopkins University (Jiatong Shi)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

import argparse
import codecs
import sys


def get_parser():
    """Build the argument parser for language-identification scoring.

    Returns:
        An ``argparse.ArgumentParser`` with the required options ``--ref`` and
        ``--hyp`` and an optional ``--out`` file that defaults to
        ``sys.stdout``.
    """
    cli = argparse.ArgumentParser(description="language identification scoring")
    for flag, text in (("--ref", "input reference"), ("--hyp", "input hypotheses")):
        cli.add_argument(flag, type=str, help=text, required=True)
    out_help = "The output filename. If omitted, then output to sys.stdout"
    cli.add_argument(
        "--out", type=argparse.FileType("w"), default=sys.stdout, help=out_help
    )
    return cli


def main(args):
    """Parse the command-line argument list *args* and run the scoring."""
    parsed = get_parser().parse_args(args)
    scoring(parsed.ref, parsed.hyp, parsed.out)


def scoring(ref, hyp, out):
    """Compute language-identification accuracy and write a one-line report.

    The two files are read in lock-step. Every line must consist of exactly
    three whitespace-separated fields: ``<rec_id> <lang_id> <utt_id>``.
    Reading stops as soon as either file is exhausted.

    Args:
        ref: Path of the UTF-8 reference file.
        hyp: Path of the UTF-8 hypothesis file.
        out: Writable text stream that receives the report (not closed here).

    Raises:
        AssertionError: If the record or utterance ids of a line pair differ.
        ValueError: If a line does not have exactly three fields.
    """
    utt_num = 0
    correct = 0

    # Context managers make sure both files are closed, also on error.
    with codecs.open(ref, "r", encoding="utf-8") as ref_file, codecs.open(
        hyp, "r", encoding="utf-8"
    ) as hyp_file:
        # zip() stops at the shorter file.
        for ref_utt, hyp_utt in zip(ref_file, hyp_file):
            rec_id, lid, utt_id = ref_utt.strip().split()
            hrec_id, hlid, hutt_id = hyp_utt.strip().split()

            # The message is the second operand of ``assert``; and-ing it into
            # the condition silently dropped it.
            assert rec_id == hrec_id and utt_id == hutt_id, "Mismatch in trn id"

            if lid == hlid:
                correct += 1
            utt_num += 1

    # Empty input: report 0.0 rather than raising ZeroDivisionError.
    accuracy = correct / float(utt_num) if utt_num else 0.0
    out.write(
        "Language Identification Scoring: Accuracy {:.4f} ({}/{})".format(
            accuracy, correct, utt_num
        )
    )


if __name__ == "__main__":
    main(sys.argv[1:])


================================================
FILE: egs/espnet_utils/score_sclite.sh
================================================
#!/usr/bin/env bash

# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

# path.sh is optional here: it is only sourced when present.
[ -f ./path.sh ] && . ./path.sh

# Defaults for the options handled by parse_options.sh below.
nlsyms=""           # non-empty: ref/hyp are passed through filt.py -v with this value
wer=false           # true: additionally produce word-level results
mer=false           # true (together with wer): also score the .chn/.eng splits
bpe=""              # non-empty: word-level text is rebuilt with spm_decode
bpemodel=""         # sentencepiece model given to spm_decode
remove_blank=true   # true: delete "<blank> " from the hypotheses
filter=""           # non-empty: sed script applied to ref and hyp
num_spkrs=1         # 1: single-speaker scoring; 2 or 3: all ref/hyp pairings are scored
help_message="Usage: $0 <data-dir> <dict>"
sppd3=false         # true: hyp.trn is passed through espnet_utils/filter_trn.py

. utils/parse_options.sh

if [ $# != 2 ]; then
    echo "${help_message}"
    exit 1;
fi

dir=$1
dic=$2

# Merge the per-split result files into a single data.json.
concatjson.py ${dir}/data.*.json > ${dir}/data.json

# Single-speaker case: one ref.trn / hyp.trn pair.
if [ $num_spkrs -eq 1 ]; then
  json2trn.py ${dir}/data.json ${dic} --num-spkrs ${num_spkrs} --refs ${dir}/ref.trn --hyps ${dir}/hyp.trn

  # Optional clean-up passes, applied in this order: blank removal,
  # filt.py with nlsyms, sed filter script, filter_trn.py.
  if ${remove_blank}; then
      sed -i.bak2 -r 's/<blank> //g' ${dir}/hyp.trn
  fi
  if [ -n "${nlsyms}" ]; then
      cp ${dir}/ref.trn ${dir}/ref.trn.org
      cp ${dir}/hyp.trn ${dir}/hyp.trn.org
      filt.py -v ${nlsyms} ${dir}/ref.trn.org > ${dir}/ref.trn
      filt.py -v ${nlsyms} ${dir}/hyp.trn.org > ${dir}/hyp.trn
  fi
  if [ -n "${filter}" ]; then
      sed -i.bak3 -f ${filter} ${dir}/hyp.trn
      sed -i.bak3 -f ${filter} ${dir}/ref.trn
  fi

  # NOTE(review): this reuses hyp.trn.org, so a copy made in the nlsyms step
  # above is overwritten when both options are active.
  if [ $sppd3 = true ]; then
      cp ${dir}/hyp.trn ${dir}/hyp.trn.org 
      python3 espnet_utils/filter_trn.py $dir/hyp.trn.org > ${dir}/hyp.trn
  fi

  sclite -r ${dir}/ref.trn trn -h ${dir}/hyp.trn trn -i rm -o all stdout > ${dir}/result.txt
  
  echo "write a CER (or TER) result in ${dir}/result.txt"
  grep -e Avg -e SPKR -m 2 ${dir}/result.txt
  python3 espnet_utils/double_precious_cer.py ${dir}/result.txt

  # Word-level scoring; the mer block is nested, so it only runs when wer is true.
  if ${wer}; then
      if [ -n "$bpe" ]; then
  	    spm_decode --model=${bpemodel} --input_format=piece < ${dir}/ref.trn | sed -e "s/▁/ /g" > ${dir}/ref.wrd.trn
  	    spm_decode --model=${bpemodel} --input_format=piece < ${dir}/hyp.trn | sed -e "s/▁/ /g" > ${dir}/hyp.wrd.trn
      else
  	    sed -e "s/ //g" -e "s/(/ (/" -e "s/<space>/ /g" ${dir}/ref.trn > ${dir}/ref.wrd.trn
  	    sed -e "s/ //g" -e "s/(/ (/" -e "s/<space>/ /g" ${dir}/hyp.trn > ${dir}/hyp.wrd.trn
      fi
      sclite -r ${dir}/ref.wrd.trn trn -h ${dir}/hyp.wrd.trn trn -i rm -o all stdout > ${dir}/result.wrd.txt

      echo "write a WER result in ${dir}/result.wrd.txt"
      grep -e Avg -e SPKR -m 2 ${dir}/result.wrd.txt
      python3 espnet_utils/double_precious_cer.py ${dir}/result.wrd.txt

      # prepare_mer.py writes a .chn and a .eng file per transcript; each pair
      # is then scored separately.
      if ${mer}; then
         python3 espnet_utils/prepare_mer.py ${dir}/ref.wrd.trn ${dir}/ref.wrd.trn.chn ${dir}/ref.wrd.trn.eng
         python3 espnet_utils/prepare_mer.py ${dir}/hyp.wrd.trn ${dir}/hyp.wrd.trn.chn ${dir}/hyp.wrd.trn.eng 
         sclite -r ${dir}/ref.wrd.trn.chn trn -h ${dir}/hyp.wrd.trn.chn trn -i rm -o all stdout > ${dir}/result.wrd.chn.txt
         sclite -r ${dir}/ref.wrd.trn.eng trn -h ${dir}/hyp.wrd.trn.eng trn -i rm -o all stdout > ${dir}/result.wrd.eng.txt
         
         echo "write a Mandarin CER result of code-switch data in ${dir}/result.wrd.chn.txt"
         grep -e Avg -e SPKR -m 2 ${dir}/result.wrd.chn.txt
         echo "write a English MER result of code-switch data in ${dir}/result.wrd.eng.txt"
         grep -e Avg -e SPKR -m 2 ${dir}/result.wrd.eng.txt
      fi
  fi
# Multi-speaker case (2 or 3 speakers).  Nothing is scored for num_spkrs >= 4.
elif [ ${num_spkrs} -lt 4 ]; then
  # One ref<i>.trn / hyp<i>.trn pair per speaker.
  ref_trns=""
  hyp_trns=""
  for i in $(seq ${num_spkrs}); do
      ref_trns=${ref_trns}"${dir}/ref${i}.trn "
      hyp_trns=${hyp_trns}"${dir}/hyp${i}.trn "
  done
  json2trn.py ${dir}/data.json ${dic} --num-spkrs ${num_spkrs} --refs ${ref_trns} --hyps ${hyp_trns}

  # Same optional clean-up passes as in the single-speaker branch (without
  # the sppd3 step), applied to every speaker.
  for n in $(seq ${num_spkrs}); do
      if ${remove_blank}; then
          sed -i.bak2 -r 's/<blank> //g' ${dir}/hyp${n}.trn
      fi
      if [ -n "${nlsyms}" ]; then
          cp ${dir}/ref${n}.trn ${dir}/ref${n}.trn.org
          cp ${dir}/hyp${n}.trn ${dir}/hyp${n}.trn.org
          filt.py -v ${nlsyms} ${dir}/ref${n}.trn.org > ${dir}/ref${n}.trn
          filt.py -v ${nlsyms} ${dir}/hyp${n}.trn.org > ${dir}/hyp${n}.trn
      fi
      if [ -n "${filter}" ]; then
          sed -i.bak3 -f ${filter} ${dir}/hyp${n}.trn
          sed -i.bak3 -f ${filter} ${dir}/ref${n}.trn
      fi
  done

  # Score every (reference, hypothesis) pairing: i enumerates the
  # num_spkrs x num_spkrs grid, ind_r is the row and ind_h the column (1-based).
  results_str=""
  for (( i=0; i<$((num_spkrs * num_spkrs)); i++ )); do
      ind_r=$((i / num_spkrs + 1))
      ind_h=$((i % num_spkrs + 1))
      results_str=${results_str}"${dir}/result_r${ind_r}h${ind_h}.txt "
      sclite -r ${dir}/ref${ind_r}.trn trn -h ${dir}/hyp${ind_h}.trn trn -i rm -o all stdout > ${dir}/result_r${ind_r}h${ind_h}.txt
  done

  echo "write CER (or TER) results in ${dir}/result_r*h*.txt"
  # All pairwise results go to eval_perm_free_error.py; lines 2-4 of its JSON
  # output are shown.
  eval_perm_free_error.py --num-spkrs ${num_spkrs} \
      ${results_str} > ${dir}/min_perm_result.json
  sed -n '2,4p' ${dir}/min_perm_result.json

  # Word-level variant of the same pairwise scoring.
  if ${wer}; then
      for n in $(seq ${num_spkrs}); do
          if [ -n "$bpe" ]; then
              spm_decode --model=${bpemodel} --input_format=piece < ${dir}/ref${n}.trn | sed -e "s/▁/ /g" > ${dir}/ref${n}.wrd.trn
              spm_decode --model=${bpemodel} --input_format=piece < ${dir}/hyp${n}.trn | sed -e "s/▁/ /g" > ${dir}/hyp${n}.wrd.trn
          else
              sed -e "s/ //g" -e "s/(/ (/" -e "s/<space>/ /g" ${dir}/ref${n}.trn > ${dir}/ref${n}.wrd.trn
              sed -e "s/ //g" -e "s/(/ (/" -e "s/<space>/ /g" ${dir}/hyp${n}.trn > ${dir}/hyp${n}.wrd.trn
          fi
      done
      results_str=""
      for (( i=0; i<$((num_spkrs * num_spkrs)); i++ )); do
          ind_r=$((i / num_spkrs + 1))
          ind_h=$((i % num_spkrs + 1))
          results_str=${results_str}"${dir}/result_r${ind_r}h${ind_h}.wrd.txt "
          sclite -r ${dir}/ref${ind_r}.wrd.trn trn -h ${dir}/hyp${ind_h}.wrd.trn trn -i rm -o all stdout > ${dir}/result_r${ind_r}h${ind_h}.wrd.txt
      done

      echo "write WER results in ${dir}/result_r*h*.wrd.txt"
      eval_perm_free_error.py --num-spkrs ${num_spkrs} \
          ${results_str} > ${dir}/min_perm_result.wrd.json
      sed -n '2,4p' ${dir}/min_perm_result.wrd.json
  fi
fi


================================================
FILE: egs/espnet_utils/score_sclite_case.sh
================================================
#!/usr/bin/env bash

# Copyright 2018 Kyoto University (Hirofumi Inaguma)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

# Score decoding results with sclite.
#
# Usage: score_sclite_case.sh [options] <data-dir> <dict>
#   <data-dir> must contain data.*.json; every intermediate and result file
#   is written back into that directory.
#
# The lowercased, punctuation-removed scoring is always run; with
# `--case tc` an additional case-sensitive scoring pass is run first.
export LC_ALL=C

. ./path.sh

# Options (overridable on the command line through utils/parse_options.sh):
#   nlsyms        file handed to `filt.py -v` to filter ref/hyp (skipped when empty)
#   wer           when true, also produce word-level results
#   bpe           when non-empty, word-level text is rebuilt with spm_decode
#   bpemodel      sentencepiece model passed to spm_decode
#   remove_blank  when true, strip "<blank> " from the hypotheses
#   filter        sed script file applied to both hyp and ref (skipped when empty)
#   case          "tc" enables the extra case-sensitive scoring pass
nlsyms=""
wer=false
bpe=""
bpemodel=""
remove_blank=true
filter=""
case=lc.rm

. utils/parse_options.sh

if [ $# != 2 ]; then
    echo "Usage: $0 <data-dir> <dict>";
    exit 1;
fi

dir=$1
dic=$2

# Merge the per-job json files and convert them into ref/hyp transcript files.
concatjson.py ${dir}/data.*.json > ${dir}/data.json
json2trn.py ${dir}/data.json ${dic} --refs ${dir}/ref.trn --hyps ${dir}/hyp.trn

# The in-place edits below keep backups of the previous file (.bak2 / .bak3).
if ${remove_blank}; then
    sed -i.bak2 -r 's/<blank> //g' ${dir}/hyp.trn
fi
if [ -n "${nlsyms}" ]; then
    # Keep the unfiltered transcripts as *.org and overwrite the originals.
    cp ${dir}/ref.trn ${dir}/ref.trn.org
    cp ${dir}/hyp.trn ${dir}/hyp.trn.org
    filt.py -v ${nlsyms} ${dir}/ref.trn.org > ${dir}/ref.trn
    filt.py -v ${nlsyms} ${dir}/hyp.trn.org > ${dir}/hyp.trn
fi
if [ -n "${filter}" ]; then
    sed -i.bak3 -f ${filter} ${dir}/hyp.trn
    sed -i.bak3 -f ${filter} ${dir}/ref.trn
fi

# case-sensitive WER
if [ ${case} = tc ]; then

  # detokenize
  detokenizer.perl -l en -q < ${dir}/ref.trn > ${dir}/ref.trn.detok
  detokenizer.perl -l en -q < ${dir}/hyp.trn > ${dir}/hyp.trn.detok

  # sclite is called with -s only in this case-sensitive pass.
  sclite -s -r ${dir}/ref.trn.detok trn -h ${dir}/hyp.trn.detok trn -i rm -o all stdout > ${dir}/result.tc.txt

  echo "write a case-sensitive CER (or TER) result in ${dir}/result.tc.txt"
  grep -e Avg -e SPKR -m 2 ${dir}/result.tc.txt

  if ${wer}; then
      # Rebuild word-level transcripts: via sentencepiece when bpe is set,
      # otherwise by dropping spaces and turning "<space>" into a space.
      if [ -n "$bpe" ]; then
          spm_decode --model=${bpemodel} --input_format=piece < ${dir}/ref.trn | sed -e "s/▁/ /g" > ${dir}/ref.wrd.trn
          spm_decode --model=${bpemodel} --input_format=piece < ${dir}/hyp.trn | sed -e "s/▁/ /g" > ${dir}/hyp.wrd.trn
      else
          sed -e "s/ //g" -e "s/(/ (/" -e "s/<space>/ /g" ${dir}/ref.trn > ${dir}/ref.wrd.trn
          sed -e "s/ //g" -e "s/(/ (/" -e "s/<space>/ /g" ${dir}/hyp.trn > ${dir}/hyp.wrd.trn
      fi

      # detokenize
      detokenizer.perl -l en -q < ${dir}/ref.wrd.trn > ${dir}/ref.wrd.trn.detok
      detokenizer.perl -l en -q < ${dir}/hyp.wrd.trn > ${dir}/hyp.wrd.trn.detok

      sclite -s -r ${dir}/ref.wrd.trn.detok trn -h ${dir}/hyp.wrd.trn.detok trn -i rm -o all stdout > ${dir}/result.wrd.tc.txt

      echo "write a case-sensitive WER result in ${dir}/result.wrd.tc.txt"
      grep -e Avg -e SPKR -m 2 ${dir}/result.wrd.tc.txt
  fi
fi

# From here on the scoring runs regardless of the value of ${case}.
# lowercasing
lowercase.perl < ${dir}/hyp.trn > ${dir}/hyp.trn.lc
lowercase.perl < ${dir}/ref.trn > ${dir}/ref.trn.lc

# remove punctuation
# Only the part before the first "(" is passed through remove_punctuation.pl;
# the remainder of each line is pasted back unchanged after a "(".
paste -d "(" <(cut -d '(' -f 1 ${dir}/hyp.trn.lc | remove_punctuation.pl | sed -e "s/  / /g") <(cut -d '(' -f 2- ${dir}/hyp.trn.lc) > ${dir}/hyp.trn.lc.rm
paste -d "(" <(cut -d '(' -f 1 ${dir}/ref.trn.lc | remove_punctuation.pl | sed -e "s/  / /g") <(cut -d '(' -f 2- ${dir}/ref.trn.lc) > ${dir}/ref.trn.lc.rm

# detokenize
detokenizer.perl -l en -q < ${dir}/ref.trn.lc.rm > ${dir}/ref.trn.lc.rm.detok
detokenizer.perl -l en -q < ${dir}/hyp.trn.lc.rm > ${dir}/hyp.trn.lc.rm.detok

sclite -r ${dir}/ref.trn.lc.rm.detok trn -h ${dir}/hyp.trn.lc.rm.detok trn -i rm -o all stdout > ${dir}/result.txt

echo "write a CER (or TER) result in ${dir}/result.txt"
grep -e Avg -e SPKR -m 2 ${dir}/result.txt

if ${wer}; then
    # Same word reconstruction as in the case-sensitive pass, applied to the
    # lowercased, punctuation-removed transcripts.
    if [ -n "$bpe" ]; then
        spm_decode --model=${bpemodel} --input_format=piece < ${dir}/ref.trn.lc.rm | sed -e "s/▁/ /g" > ${dir}/ref.wrd.trn.lc.rm
        spm_decode --model=${bpemodel} --input_format=piece < ${dir}/hyp.trn.lc.rm | sed -e "s/▁/ /g" > ${dir}/hyp.wrd.trn.lc.rm
    else
        sed -e "s/ //g" -e "s/(/ (/" -e "s/<space>/ /g" ${dir}/ref.trn.lc.rm > ${dir}/ref.wrd.trn.lc.rm
        sed -e "s/ //g" -e "s/(/ (/" -e "s/<space>/ /g" ${dir}/hyp.trn.lc.rm > ${dir}/hyp.wrd.trn.lc.rm
    fi

    # detokenize
    detokenizer.perl -l en -q < ${dir}/ref.wrd.trn.lc.rm > ${dir}/ref.wrd.trn.lc.rm.detok
    detokenizer.perl -l en -q < ${dir}/hyp.wrd.trn.lc.rm > ${dir}/hyp.wrd.trn.lc.rm.detok

    sclite -r ${dir}/ref.wrd.trn.lc.rm.detok trn -h ${dir}/hyp.wrd.trn.lc.rm.detok trn -i rm -o all stdout > ${dir}/result.wrd.txt

    echo "write a WER result in ${dir}/result.wrd.txt"
    grep -e Avg -e SPKR -m 2 ${dir}/result.wrd.txt
fi


================================================
FILE: egs/espnet_utils/score_sclite_wo_dict.sh
================================================
#!/usr/bin/env bash

# Copyright 2019 Okayama University (Katsuki Inoue)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

# Score decoding results with sclite without using a token dictionary.
# Everything is read from and written to <data-dir>.

# Load the recipe environment when one is present.
if [ -f ./path.sh ]; then
    . ./path.sh
fi

# Options (overridable through utils/parse_options.sh).
wer=false
num_spkrs=1
help_message="Usage: $0 <data-dir>"

. utils/parse_options.sh

if [ $# -ne 1 ]; then
    echo "${help_message}"
    exit 1
fi

dir=$1

# stdin -> stdout: put a space after every character that comes before the
# first "(" of a line; from that "(" onwards characters are copied as they are.
split_into_chars() {
    awk -v FS='' '{a=0;for(i=1;i<=NF;i++){if($i=="("){a=1};if(a==0){printf("%s ",$i)}else{printf("%s",$i)}}printf("\n")}'
}

concatjson.py ${dir}/data.*.json > ${dir}/data.json

# Only the single-speaker case is handled.
if [ $num_spkrs -eq 1 ]; then
    json2trn_wo_dict.py ${dir}/data.json --num-spkrs ${num_spkrs} --refs ${dir}/ref_org.wrd.trn --hyps ${dir}/hyp_org.wrd.trn

    # Hypothesis: drop the first "▁" of each line, turn the others into spaces.
    sed -e 's/▁//' -e 's/▁/ /g' < ${dir}/hyp_org.wrd.trn > ${dir}/hyp.wrd.trn
    # Reference: remove every "." and ",".
    sed -e 's/\.//g' -e 's/\,//g' < ${dir}/ref_org.wrd.trn > ${dir}/ref.wrd.trn

    # Character-level transcripts for the CER.
    split_into_chars < ${dir}/hyp.wrd.trn > ${dir}/hyp.trn
    split_into_chars < ${dir}/ref.wrd.trn > ${dir}/ref.trn

    sclite -r ${dir}/ref.trn trn -h ${dir}/hyp.trn -i rm -o all stdout > ${dir}/result.txt
    echo "write a CER result in ${dir}/result.txt"
    grep -e Avg -e SPKR -m 2 ${dir}/result.txt

    if ${wer}; then
        # WER against the reference with "." and "," removed.
        sclite -r ${dir}/ref.wrd.trn trn -h ${dir}/hyp.wrd.trn -i rm -o all stdout > ${dir}/result.wrd.txt
        echo "write a WER result in ${dir}/result.wrd.txt"
        grep -e Avg -e SPKR -m 2 ${dir}/result.wrd.txt

        # WER against the original reference (punctuation kept).
        sclite -r ${dir}/ref_org.wrd.trn trn -h ${dir}/hyp.wrd.trn trn -i rm -o all stdout > ${dir}/result_w_punc.wrd.txt
        echo "write a WER result in ${dir}/result_w_punc.wrd.txt"
        grep -e Avg -e SPKR -m 2 ${dir}/result_w_punc.wrd.txt

    fi
fi


================================================
FILE: egs/espnet_utils/scp2json.py
================================================
#!/usr/bin/env python3
# encoding: utf-8

# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

import argparse
import codecs
import json
import sys

is_python2 = sys.version_info[0] == 2


def get_parser():
    """Create the argument parser of the scp-to-json converter.

    Returns:
        argparse.ArgumentParser: parser exposing the single option
        ``--key`` / ``-k`` (a string).
    """
    arg_parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="convert scp to json",
    )
    arg_parser.add_argument("--key", "-k", help="key", type=str)
    return arg_parser


if __name__ == "__main__":
    # Read "<utt-id> <value ...>" lines from stdin and print them on stdout as
    # {"utts": {<utt-id>: {<key>: "<value ...>"}}}.
    parser = get_parser()
    args = parser.parse_args()

    # Force UTF-8 on both streams, independently of the locale.
    sys.stdin = codecs.getreader("utf-8")(sys.stdin if is_python2 else sys.stdin.buffer)
    sys.stdout = codecs.getwriter("utf-8")(
        sys.stdout if is_python2 else sys.stdout.buffer
    )

    new_line = {}
    line = sys.stdin.readline()
    while line:
        x = line.rstrip().split()
        # A blank line carries no utterance id; skip it instead of failing
        # with an IndexError on x[0].
        if x:
            new_line[x[0]] = {args.key: " ".join(x[1:])}
        line = sys.stdin.readline()

    all_l = {"utts": new_line}

    # ensure_ascii=False keeps non-ASCII text readable instead of \u-escaping it
    jsonstring = json.dumps(
        all_l, indent=4, ensure_ascii=False, sort_keys=True, separators=(",", ": ")
    )
    print(jsonstring)


================================================
FILE: egs/espnet_utils/show_result.sh
================================================
#!/usr/bin/env bash
# Print the environment and the sclite results found under an experiment
# directory as Markdown on stdout.
#
# Usage: show_result.sh [--mindepth N] [--maxdepth N] [exp]
#
# mindepth/maxdepth are passed to `find` to select the directories below
# <exp> that are inspected for decode_*/result.txt files.
mindepth=0
maxdepth=1

. utils/parse_options.sh

if [ $# -gt 1 ]; then
    # Only this first line goes to stderr; the rest is printed on stdout.
    echo "Usage: $0 --mindepth 0 --maxdepth 1 [exp]" 1>&2
    echo ""
    echo "Show the system environments and the evaluation results in Markdown format."
    echo 'The default of <exp> is "exp/".'
    exit 1
fi

[ -f ./path.sh ] && . ./path.sh
# Strict mode is switched on only after the option parsing and path.sh.
set -euo pipefail
if [ $# -eq 1 ]; then
    exp=$1
else
    exp=exp
fi


# Header: generation note and current date.
cat << EOF
<!-- Generated by $0 -->
# RESULTS
## Environments
- date: \`$(LC_ALL=C date)\`
EOF

# Versions of python, espnet, chainer and pytorch; all four modules must be
# importable or this step fails (and, under `set -e`, so does the script).
python3 << EOF
import sys, espnet, chainer, torch
pyversion = sys.version.replace('\n', ' ')

print(f"""- python version: \`{pyversion}\`
- espnet version: \`espnet {espnet.__version__}\`
- chainer version: \`chainer {chainer.__version__}\`
- pytorch version: \`pytorch {torch.__version__}\`""")
EOF

# Git revision of the current working directory.
cat << EOF
- Git hash: \`$(git rev-parse HEAD)\`
  - Commit date: \`$(git log -1 --format='%cd')\`

EOF

# One section per directory that holds at least one decode_*/result.txt.
while IFS= read -r expdir; do
    if ls ${expdir}/decode_*/result.txt &> /dev/null; then
    # 1. Show the result table
    cat << EOF
## $(basename ${expdir})
### CER

|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
EOF
        # Turn each "Sum/Avg" line into a Markdown row: the file path prefix is
        # replaced by the decode directory name, then runs of blanks and the
        # original "|" separators are collapsed into single "|" characters.
        grep -e Avg ${expdir}/decode_*/result.txt \
            | sed -e "s#${expdir}/\([^/]*\)/result.txt:#|\1#g" \
            | sed -e 's#Sum/Avg##g' | tr '|' ' ' | tr -s ' ' '|'
        echo

        # 2. Show the result table for WER
        if ls ${expdir}/decode_*/result.wrd.txt &> /dev/null; then
            cat << EOF
### WER

|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
EOF
            grep -e Avg ${expdir}/decode_*/result.wrd.txt \
                | sed -e "s#${expdir}/\([^/]*\)/result.wrd.txt:#|\1#g" \
                | sed -e 's#Sum/Avg##g' | tr '|' ' ' | tr -s ' ' '|'
            echo
        fi
    fi
done < <(find ${exp} -mindepth ${mindepth} -maxdepth ${maxdepth} -type d)


================================================
FILE: egs/espnet_utils/significant_test.sh
================================================
#!/usr/bin/env bash
# Compare two decoding directories with the sc_stats "mapsswe" test.
#
# Usage: significant_test.sh <reference-system-dir> <tested-system-dir>
#
# For every transcript flavour that exists in BOTH directories, sclite writes
# the sgml alignments of each system next to its hyp file and sc_stats stores
# its report under <tested-system-dir>/result.<part>.mapsswe.
adir=$1 # reference
bdir=$2 # tested

for part in trn wrd.trn wrd.trn.chn wrd.trn.eng; do
    # Both systems must provide this part; the second test used to look at
    # $adir again, so a part missing from $bdir was not skipped.
    if [ -f "$adir/ref.$part" ] && [ -f "$bdir/ref.$part" ]; then
        (sclite -F -i wsj -r "$adir/ref.$part" -h "$adir/hyp.$part" -o sgml
        sclite -F -i wsj -r "$bdir/ref.$part" -h "$bdir/hyp.$part" -o sgml

        cat "$adir/hyp.${part}.sgml" "$bdir/hyp.${part}.sgml" | sc_stats -p -t mapsswe -v -u -n "$bdir/result.${part}.mapsswe"
        ) &
    fi
done
# The parts are processed in parallel; wait for all of them.
wait


================================================
FILE: egs/espnet_utils/sort_scp_by_length.py
================================================
import sys
import os

# Usage: sort_scp_by_length.py <in_scp> <in_utt2frames> <out_scp> <out_utt2frames>
# Rewrites the scp and the utt2frames file with the utterances ordered by
# ascending frame count (ties keep their input order).
in_scp = sys.argv[1]
in_frame = sys.argv[2]
out_scp = sys.argv[3]
out_frame = sys.argv[4]

# read scp as dict
scp_dict = {}
with open(in_scp, encoding="utf-8") as scp_reader:
    for line in scp_reader:
        line = line.strip()
        if not line:
            continue
        # Split on the first whitespace run only, so values that contain
        # spaces (e.g. piped commands) are kept whole.
        uttid, add = line.split(maxsplit=1)
        scp_dict[uttid] = add

# read utt2frames
frame_lst = []
with open(in_frame, encoding="utf-8") as frame_reader:
    for line in frame_reader:
        line = line.strip()
        if not line:
            continue
        uttid, length = line.split()
        frame_lst.append([uttid, int(length)])

frame_lst.sort(key=lambda x: x[1])

# Every utterance listed in utt2frames must exist in the scp (KeyError otherwise).
with open(out_scp, 'w', encoding="utf-8") as scp_writer, \
        open(out_frame, 'w', encoding='utf-8') as frame_writer:
    for uttid, length in frame_lst:
        add = scp_dict[uttid]
        scp_writer.write(f"{uttid} {add}\n")
        frame_writer.write(f"{uttid} {length}\n")

"""
max_utt = 256
count = 1
out_scp_base = os.path.basename(out_scp)
out_scp_base = out_scp_base.replace(".", f".{count}.")
out_scp_dir = os.path.dirname(out_scp)
out_scp = os.path.join(out_scp_dir, out_scp_base)
for i, e in enumerate(frame_lst):
    if i % max_utt == 0:
       scp_writer = open(out_scp, 'w', encoding='utf-8')
       out_scp = out_scp.replace(f".{count}.", f".{count+1}.") 
       count += 1
    uttid, _ = e
    add = scp_dict[uttid]
    scp_writer.write(f"{uttid} {add}\n")
"""


================================================
FILE: egs/espnet_utils/speed_perturb.sh
================================================
#!/usr/bin/env bash

# Copyright 2021 Kyoto University (Hirofumi Inaguma)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

# Build a speed-perturbed copy of a data directory, extract fbank+pitch
# features for it and regenerate the text file(s) for the new utterance ids.
cases=""
speeds="0.9 1.0 1.1"
langs=""
write_utt2num_frames=true
nj=32
cmd=""

help_message=$(cat <<EOF
Usage: $0 [options] <data-dir> <destination-dir> <fbankdir>
e.g.: $0 data/train en de
Options:
  --cases                              # target case information (e.g., lc.rm, lc, tc)
  --speeds                             # speed used in speed perturbation (e.g., 0.9. 1.0, 1.1)
  --langs                              # all languages (source + target)
  --write_utt2num_frames               # write utt2num_frames in steps/make_fbank_pitch.sh
  --cmd <run.pl|queue.pl <queue opts>> # how to run jobs
  --nj <nj>                            # number of parallel jobs
EOF
)
echo "$0 $*"  # Print the command line for logging

. parse_options.sh || exit 1;

# All three positional arguments are read below under `set -u`, so anything
# other than exactly three is a usage error.
if [ $# -ne 3 ]; then
    echo "${help_message}"
    exit 1;
fi

set -euo pipefail

data_dir=$1
dst=$2
fbankdir=$3

# Scratch space for the per-speed copies; removed on exit.
tmpdir=$(mktemp -d ${data_dir}/tmp-XXXXX)
trap 'rm -rf ${tmpdir}' EXIT

for sp in ${speeds}; do
    utils/perturb_data_dir_speed.sh ${sp} ${data_dir} ${tmpdir}/temp.${sp}
done
utils/combine_data.sh --extra-files utt2uniq ${dst} ${tmpdir}/temp.*

# --cmd is forwarded as ONE word (it may contain queue options) and only when
# it was given; an empty value would otherwise make "--nj" the value of --cmd.
steps/make_fbank_pitch.sh ${cmd:+--cmd "${cmd}"} --nj ${nj} --write_utt2num_frames ${write_utt2num_frames} \
    ${dst} exp/make_fbank/"$(basename ${dst})" ${fbankdir}
utils/fix_data_dir.sh ${dst}
utils/validate_data_dir.sh --no-text ${dst}

if [ -n "${langs}" ]; then
    # for ST/MT recipe + ASR recipe in ST recipe
    for lang in ${langs}; do
        # Start every per-case text file empty. The touch used to sit after
        # this loop, where it only saw the last case and hit an unbound
        # variable when no case was given.
        for case in ${cases}; do
            rm -f ${dst}/text.${case}.${lang}
            touch ${dst}/text.${case}.${lang}
        done

        for sp in ${speeds}; do
            # Map each original utterance id to its "sp<speed>-" prefixed id.
            awk -v p="sp${sp}-" '{printf("%s %s%s\n", $1, p, $1);}' ${data_dir}/utt2spk > ${dst}/utt_map

            for case in ${cases}; do
                utils/apply_map.pl -f 1 ${dst}/utt_map <${data_dir}/text.${case}.${lang} >> ${dst}/text.${case}.${lang}
            done
        done
    done
else
    # for ASR only recipe
    touch ${dst}/text
    for sp in ${speeds}; do
        awk -v p="sp${sp}-" '{printf("%s %s%s\n", $1, p, $1);}' ${data_dir}/utt2spk > ${dst}/utt_map
        utils/apply_map.pl -f 1 ${dst}/utt_map <${data_dir}/text >>${dst}/text
    done
fi

rm -rf ${tmpdir}*


================================================
FILE: egs/espnet_utils/split_scp.py
================================================
import sys

# Usage: split_scp.py <in_scp> <out_scp_1> [<out_scp_2> ...]
# Distributes the lines of <in_scp> over the output files in round-robin order.
if len(sys.argv) < 3:
    # Without an output file the round-robin index would divide by zero.
    sys.exit("Usage: split_scp.py <in_scp> <out_scp_1> [<out_scp_2> ...]")

in_f = sys.argv[1]
writers = []
try:
    for f in sys.argv[2:]:
        writers.append(open(f, 'w', encoding='utf-8'))
    num_writers = len(writers)

    with open(in_f, encoding='utf-8') as reader:
        for i, line in enumerate(reader):
            writers[i % num_writers].write(line)
finally:
    # Close every output file; the loop used to close only the last one.
    for w in writers:
        w.close()
    

================================================
FILE: egs/espnet_utils/split_scp_fix_length.py
================================================
import sys
import os

# Usage: split_scp_fix_length.py <in_scp>
# Cuts <in_scp> into consecutive chunks of at most max_utt lines. Chunk k
# (starting at 1) is written next to the input, named after it with every "."
# of the file name replaced by ".k." (e.g. feats.scp -> feats.1.scp).
in_f = sys.argv[1]

max_utt = 360  # So batch size could be 1, 2, 3, 4, 5, 6, 8, 10, 12 etc.

out_scp_dir = os.path.dirname(in_f)
in_scp_base = os.path.basename(in_f)
if "." not in in_scp_base:
    # The chunk name is derived from the dots of the file name; without one
    # the first chunk would overwrite the input file itself.
    sys.exit(f"{in_f}: file name must contain a '.' to derive the chunk names")

scp_writer = None
try:
    with open(in_f, encoding='utf-8') as reader:
        for i, line in enumerate(reader):
            if i % max_utt == 0:
                # Finish the previous chunk before opening the next one.
                if scp_writer is not None:
                    scp_writer.close()
                count = i // max_utt + 1
                # Number only the file name, never a directory component.
                out_scp_base = in_scp_base.replace(".", f".{count}.")
                out_scp = os.path.join(out_scp_dir, out_scp_base)
                scp_writer = open(out_scp, 'w', encoding="utf-8")
            scp_writer.write(line)
finally:
    if scp_writer is not None:
        scp_writer.close()
    

================================================
FILE: egs/espnet_utils/splitjson.py
================================================
#!/usr/bin/env python3
# encoding: utf-8

# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)


import argparse
import codecs
import json
import logging
import os
import sys

import numpy as np

from espnet.utils.cli_utils import get_commandline_args


def get_parser():
    """Create the argument parser of the json splitter.

    Returns:
        argparse.ArgumentParser: parser with the positional ``json`` path,
        the integer option ``--parts`` / ``-p`` (default 0) and the flag
        ``--original-order``.
    """
    arg_parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="split a json file for parallel processing",
    )
    arg_parser.add_argument("json", help="json file", type=str)
    arg_parser.add_argument(
        "--parts", "-p", default=0, type=int, help="Number of subparts to be prepared"
    )
    arg_parser.add_argument(
        "--original-order", help="If set, not sort utts by keys", action="store_true"
    )
    return arg_parser


if __name__ == "__main__":
    # Split the "utts" of a json file into --parts files named
    # <json-dir>/split<parts>utt/<name>.<k>.json (k starts at 1).
    args = get_parser().parse_args()

    # logging info
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )
    logging.info(get_commandline_args())

    # np.array_split cannot split into fewer than one section.
    if args.parts <= 0:
        logging.error("--parts must be a positive integer.")
        sys.exit(1)

    # check directory
    filename = os.path.basename(args.json).split(".")[0]
    # os.path.join keeps the output relative when the json path has no
    # directory part (string formatting produced "/split<N>utt" at the root).
    dirname = os.path.join(
        os.path.dirname(args.json), "split{}utt".format(args.parts)
    )
    os.makedirs(dirname, exist_ok=True)

    # load json and split keys
    with codecs.open(args.json, "r", encoding="utf-8") as f:
        j = json.load(f)
    if args.original_order:
        utt_ids = list(j["utts"].keys())
    else:
        utt_ids = sorted(j["utts"].keys())
    logging.info("number of utterances = %d" % len(utt_ids))
    if len(utt_ids) < args.parts:
        logging.error("#utterances < #splits. Use smaller split number.")
        sys.exit(1)
    utt_id_lists = np.array_split(utt_ids, args.parts)
    utt_id_lists = [utt_id_list.tolist() for utt_id_list in utt_id_lists]

    for i, utt_id_list in enumerate(utt_id_lists):
        new_dic = {utt_id: j["utts"][utt_id] for utt_id in utt_id_list}
        jsonstring = json.dumps(
            {"utts": new_dic},
            indent=4,
            ensure_ascii=False,
            sort_keys=not args.original_order,
            separators=(",", ": "),
        )
        fl = os.path.join(dirname, "{}.{}.json".format(filename, i + 1))
        # Write to the file directly rather than rebinding sys.stdout to a
        # handle that is closed afterwards.
        with codecs.open(fl, "w+", encoding="utf-8") as f:
            f.write(jsonstring + "\n")


================================================
FILE: egs/espnet_utils/spm_decode
================================================
#!/usr/bin/env python
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# https://github.com/pytorch/fairseq/blob/master/LICENSE


import argparse
import sys

import sentencepiece as spm

def is_all_chinese(strs):
    """Tell whether every character of ``strs`` lies in U+4E00..U+9FA5.

    An empty ``strs`` gives True.
    """
    return all('\u4e00' <= ch <= '\u9fa5' for ch in strs)

def main():
    """Decode sentencepiece output that is mixed with Chinese characters.

    Lines are read from ``--input`` (stdin when omitted). Each character in
    the range accepted by ``is_all_chinese`` becomes an output token of its
    own; every run of other characters is whitespace-split, decoded with the
    sentencepiece model and split into words. The resulting tokens are
    printed joined by single spaces, one line per input line.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", required=True,
                        help="sentencepiece model to use for decoding")
    parser.add_argument("--input", default=None, help="input file to decode")
    parser.add_argument("--input_format", choices=["piece", "id"], default="piece")
    args = parser.parse_args()

    sp = spm.SentencePieceProcessor()
    sp.Load(args.model)

    def tok2int(tok):
        # remap reference-side <unk> (represented as <<unk>>) to 0
        return int(tok) if tok != "<<unk>>" else 0

    if args.input_format == "piece":
        def decode(l):
            return "".join(sp.DecodePieces(l))
    elif args.input_format == "id":
        def decode(l):
            # The tokens arrive as strings; DecodeIds needs integer ids.
            return "".join(sp.DecodeIds([tok2int(tok) for tok in l]))
    else:
        raise NotImplementedError

    def multilingual_decode(line):
        def process_segment(buf):
            segment = "".join(buf).split()  # list of bpe tokens
            segment = decode(segment).split()  # list of words
            return segment

        ans, buf = [], []
        for c in line:
            if is_all_chinese(c):
                # A Chinese character closes the pending non-Chinese run.
                if buf:
                    ans.extend(process_segment(buf))
                    buf = []
                ans.append(c)
            else:
                buf.append(c)
        if buf:
            ans.extend(process_segment(buf))

        return " ".join(ans)

    if args.input is None:
        for line in sys.stdin:
            print(multilingual_decode(line))
    else:
        # Close the input file once it has been consumed.
        with open(args.input, "r", encoding="utf-8") as h:
            for line in h:
                print(multilingual_decode(line))


if __name__ == "__main__":
    main()


================================================
FILE: egs/espnet_utils/spm_encode
================================================
#!/usr/bin/env python
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in
# https://github.com/pytorch/fairseq/blob/master/LICENSE


import argparse
import contextlib
import sys

import sentencepiece as spm

def is_all_chinese(strs):
    """Report whether ``strs`` consists only of characters in U+4E00..U+9FA5.

    The empty string counts as all-Chinese (True).
    """
    lowest, highest = '\u4e00', '\u9fa5'
    return not any(ch < lowest or ch > highest for ch in strs)

def main():
    """Encode text with sentencepiece while leaving Chinese tokens alone.

    Input files are read in lockstep, line by line. Each line is split on
    whitespace; a token accepted by ``is_all_chinese`` is kept (or split into
    space-separated characters with ``--split-chn``), every other token is
    encoded with the sentencepiece model. A group of parallel lines is written
    only if none of them is empty or rejected by ``--min-len`` / ``--max-len``.
    Progress and statistics go to stderr.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", required=True,
                        help="sentencepiece model to use for encoding")
    parser.add_argument("--inputs", nargs="+", default=['-'],
                        help="input files to filter/encode")
    parser.add_argument("--outputs", nargs="+", default=['-'],
                        help="path to save encoded outputs")
    parser.add_argument("--output_format", choices=["piece", "id"], default="piece")
    parser.add_argument("--min-len", type=int, metavar="N",
                        help="filter sentence pairs with fewer than N tokens")
    parser.add_argument("--max-len", type=int, metavar="N",
                        help="filter sentence pairs with more than N tokens")
    # The help text now states what the flag does (see process_chn below).
    parser.add_argument("--split-chn", action="store_true",
                        help="if set, split chn tokens into "
                             "space-separated characters")
    args = parser.parse_args()

    assert len(args.inputs) == len(args.outputs), \
        "number of input and output paths should match"

    sp = spm.SentencePieceProcessor()
    sp.Load(args.model)

    if args.output_format == "piece":
        def encode(l):
            return sp.EncodeAsPieces(l)
    elif args.output_format == "id":
        def encode(l):
            return list(map(str, sp.EncodeAsIds(l)))
    else:
        raise NotImplementedError

    if args.min_len is not None or args.max_len is not None:
        def valid(line):
            return (
                (args.min_len is None or len(line) >= args.min_len) and
                (args.max_len is None or len(line) <= args.max_len)
            )
    else:
        def valid(lines):
            return True

    with contextlib.ExitStack() as stack:
        # "-" stands for stdin / stdout; real files are closed by the stack.
        inputs = [
            stack.enter_context(open(input, "r", encoding="utf-8"))
            if input != "-" else sys.stdin
            for input in args.inputs
        ]
        outputs = [
            stack.enter_context(open(output, "w", encoding="utf-8"))
            if output != "-" else sys.stdout
            for output in args.outputs
        ]

        stats = {
            "num_empty": 0,
            "num_filtered": 0,
        }

        if args.split_chn:
            def process_chn(x):
                return " ".join(x)
        else:
            def process_chn(x):
                return x

        def multilingual_encode(string):
            ans = []
            for p in string.strip().split():
                if is_all_chinese(p):
                    ans.append(process_chn(p))
                else:
                    ans.extend(encode(p))
            return ans

        def encode_line(line):
            line = line.strip()
            if len(line) > 0:
                line = multilingual_encode(line)
                if valid(line):
                    return line
                else:
                    stats["num_filtered"] += 1
            else:
                stats["num_empty"] += 1
            return None

        for i, lines in enumerate(zip(*inputs), start=1):
            enc_lines = list(map(encode_line, lines))
            # Drop the whole group when any of its lines was rejected.
            if not any(enc_line is None for enc_line in enc_lines):
                for enc_line, output_h in zip(enc_lines, outputs):
                    print(" ".join(enc_line), file=output_h)
            if i % 10000 == 0:
                print("processed {} lines".format(i), file=sys.stderr)

        print("skipped {} empty lines".format(stats["num_empty"]), file=sys.stderr)
        print("filtered {} lines".format(stats["num_filtered"]), file=sys.stderr)


if __name__ == "__main__":
    main()


================================================
FILE: egs/espnet_utils/spm_train
================================================
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# https://github.com/pytorch/fairseq/blob/master/LICENSE
import sys

import sentencepiece as spm


if __name__ == "__main__":
    # Hand all command-line arguments to the sentencepiece trainer as one
    # space-joined flag string.
    trainer_flags = " ".join(sys.argv[1:])
    spm.SentencePieceTrainer.Train(trainer_flags)


================================================
FILE: egs/espnet_utils/stdout.pl
================================================
#!/usr/bin/env perl
use warnings; #sed replacement for -w perl parameter

# In general, doing
#  run.pl some.log a b c is like running the command a b c in
# the bash shell, and putting the standard error and output into some.log.
# To run parallel jobs (backgrounded on the host machine), you can do (e.g.)
#  run.pl JOB=1:4 some.JOB.log a b c JOB is like running the command a b c JOB
# and putting it in some.JOB.log, for each one. [Note: JOB can be any identifier].
# If any of the jobs fails, this script will fail.

# A typical example is:
#  run.pl some.log my-prog "--opt=foo bar" foo \|  other-prog baz
# and run.pl will run something like:
# ( my-prog '--opt=foo bar' foo |  other-prog baz ) >& some.log
#
# Basically it takes the command-line arguments, quotes them
# as necessary to preserve spaces, and evaluates them with bash.
# In addition it puts the command line at the top of the log, and
# the start and end times of the command at the beginning and end.
# The reason why this is useful is so that we can create a different
# version of this program that uses a queueing system instead.

# use Data::Dumper;

@ARGV < 2 && die "usage: run.pl log-file command-line arguments...";


$max_jobs_run = -1;
$jobstart = 1;
$jobend = 1;
$ignored_opts = ""; # These will be ignored.

# First parse an option like JOB=1:4, and any
# options that would normally be given to
# queue.pl, which we will just discard.

for (my $x = 1; $x <= 2; $x++) { # This for-loop is to
  # allow the JOB=1:n option to be interleaved with the
  # options to qsub.
  while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) {
    # parse any options that would normally go to qsub, but which will be ignored here.
    my $switch = shift @ARGV;
    if ($switch eq "-V") {
      $ignored_opts .= "-V ";
    } elsif ($switch eq "--max-jobs-run" || $switch eq "-tc") {
      # we do support the option --max-jobs-run n, and its GridEngine form -tc n.
      $max_jobs_run = shift @ARGV;
      if (! ($max_jobs_run > 0)) {
        die "run.pl: invalid option --max-jobs-run $max_jobs_run";
      }
    } else {
      my $argument = shift @ARGV;
      if ($argument =~ m/^--/) {
        print STDERR "run.pl: WARNING: suspicious argument '$argument' to $switch; starts with '-'\n";
      }
      if ($switch eq "-sync" && $argument =~ m/^[yY]/) {
        $ignored_opts .= "-sync "; # Note: in the
        # corresponding code in queue.pl it says instead, just "$sync = 1;".
      } elsif ($switch eq "-pe") { # e.g. -pe smp 5
        my $argument2 = shift @ARGV;
        $ignored_opts .= "$switch $argument $argument2 ";
      } elsif ($switch eq "--gpu") {
        $using_gpu = $argument;
      } else {
        # Ignore option.
        $ignored_opts .= "$switch $argument ";
      }
    }
  }
  if ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+):(\d+)$/) { # e.g. JOB=1:20
    $jobname = $1;
    $jobstart = $2;
    $jobend = $3;
    shift;
    if ($jobstart > $jobend) {
      die "run.pl: invalid job range $ARGV[0]";
    }
    if ($jobstart <= 0) {
      die "run.pl: invalid job range $ARGV[0], start must be strictly positive (this is required for GridEngine compatibility).";
    }
  } elsif ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+)$/) { # e.g. JOB=1.
    $jobname = $1;
    $jobstart = $2;
    $jobend = $2;
    shift;
  } elsif ($ARGV[0] =~ m/.+\=.*\:.*$/) {
    print STDERR "run.pl: Warning: suspicious first argument to run.pl: $ARGV[0]\n";
  }
}

# Users found this message confusing so we are removing it.
# if ($ignored_opts ne "") {
#   print STDERR "run.pl: Warning: ignoring options \"$ignored_opts\"\n";
# }

if ($max_jobs_run == -1) { # If --max-jobs-run option not set,
                           # then work out the number of processors if possible,
                           # and set it based on that.
  $max_jobs_run = 0;
  if ($using_gpu) {
    if (open(P, "nvidia-smi -L |")) {
      $max_jobs_run++ while (<P>);
      close(P);
    }
    if ($max_jobs_run == 0) {
      $max_jobs_run = 1;
      print STDERR "run.pl: Warning: failed to detect number of GPUs from nvidia-smi, using ${max_jobs_run}\n";
    }
  } elsif (open(P, "</proc/cpuinfo")) {  # Linux
    while (<P>) { if (m/^processor/) { $max_jobs_run++; } }
    if ($max_jobs_run == 0) {
      print STDERR "run.pl: Warning: failed to detect any processors from /proc/cpuinfo\n";
      $max_jobs_run = 10;  # reasonable default.
    }
    close(P);
  } elsif (open(P, "sysctl -a |")) {  # BSD/Darwin
    while (<P>) {
      if (m/hw\.ncpu\s*[:=]\s*(\d+)/) { # hw.ncpu = 4, or hw.ncpu: 4
        $max_jobs_run = $1;
        last;
      }
    }
    close(P);
    if ($max_jobs_run == 0) {
      print STDERR "run.pl: Warning: failed to detect any processors from sysctl -a\n";
      $max_jobs_run = 10;  # reasonable default.
    }
  } else {
    # allow at most 32 jobs at once, on non-UNIX systems; change this code
    # if you need to change this default.
    $max_jobs_run = 32;
  }
  # The just-computed value of $max_jobs_run is just the number of processors
  # (or our best guess); and if it happens that the number of jobs we need to
  # run is just slightly above $max_jobs_run, it will make sense to increase
  # $max_jobs_run to equal the number of jobs, so we don't have a small number
  # of leftover jobs.
  $num_jobs = $jobend - $jobstart + 1;
  if (!$using_gpu &&
      $num_jobs > $max_jobs_run && $num_jobs < 1.4 * $max_jobs_run) {
    $max_jobs_run = $num_jobs;
  }
}

$logfile = shift @ARGV;

if (defined $jobname && $logfile !~ m/$jobname/ &&
    $jobend > $jobstart) {
  print STDERR "run.pl: you are trying to run a parallel job but "
    . "you are putting the output into just one log file ($logfile)\n";
  exit(1);
}

$cmd = "";

foreach $x (@ARGV) {
    if ($x =~ m/^\S+$/) { $cmd .=  $x . " "; }
    elsif ($x =~ m:\":) { $cmd .= "'$x' "; }
    else { $cmd .= "\"$x\" "; }
}

#$Data::Dumper::Indent=0;
$ret = 0;       # overall exit status of run.pl
$numfail = 0;   # number of jobs that exited with a non-zero status
%active_pids=();  # maps the PID of each running child to its job id

use POSIX ":sys_wait_h";
# Launch one child process per job id, keeping at most $max_jobs_run children
# alive at any time.
for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) {
  if (scalar(keys %active_pids) >= $max_jobs_run) {

    # Lets wait for a change in any child's status
    # Then we have to work out which child finished
    $r = waitpid(-1, 0);
    $code = $?;
    if ($r < 0 ) { die "run.pl: Error waiting for child process"; } # should never happen.
    if ( defined $active_pids{$r} ) {
        $jid=$active_pids{$r};
        $fail[$jid]=$code;
        if ($code !=0) { $numfail++;}
        delete $active_pids{$r};
        # print STDERR "Finished: $r/$jid " .  Dumper(\%active_pids) . "\n";
    } else {
        die "run.pl: Cannot find the PID of the child process that just finished.";
    }

    # In theory we could do a non-blocking waitpid over all jobs running just
    # to find out if only one or more jobs finished during the previous waitpid()
    # However, we just omit this and will reap the next one in the next pass
    # through the for(;;) cycle
  }
  $childpid = fork();
  if (!defined $childpid) { die "run.pl: Error forking in run.pl (writing to $logfile)"; }
  if ($childpid == 0) { # We're in the child... this branch
    # executes the job and returns (possibly with an error status).
    if (defined $jobname) {
      # Substitute the job variable name by this job's number, both in the
      # command and in the log-file name.
      $cmd =~ s/$jobname/$jobid/g;
      $logfile =~ s/$jobname/$jobid/g;
    }
    system("mkdir -p `dirname $logfile` 2>/dev/null");
    # Start the log with a header: the command and the start time.
    open(F, ">$logfile") || die "run.pl: Error opening log file $logfile";
    print F "# " . $cmd . "\n";
    print F "# Started at " . `date`;
    $starttime = `date +'%s'`;
    print F "#\n";
    close(F);

    # Pipe into bash.. make sure we're not using any other shell.
    open(B, "|bash") || die "run.pl: Error opening shell command";
    # The command's stdout and stderr are both shown on the terminal and
    # appended to the log via tee.
    print B "( " . $cmd . ") |& tee -a $logfile\n";
    # A pipeline's exit status is that of its last command (tee), which would
    # hide a failure of the job itself.  Exit with the status of the first
    # pipeline element instead, so that $? below reflects the job.
    print B "exit \${PIPESTATUS[0]}\n";
    close(B);                   # If there was an error, exit status is in $?
    $ret = $?;

    # Split the wait status into the signal part (low 7 bits) and the exit
    # code (high byte) for the log footer.
    $lowbits = $ret & 127;
    $highbits = $ret >> 8;
    if ($lowbits != 0) { $return_str = "code $highbits; signal $lowbits" }
    else { $return_str = "code $highbits"; }

    # Append the footer: elapsed time and how the command ended.
    $endtime = `date +'%s'`;
    open(F, ">>$logfile") || die "run.pl: Error opening log file $logfile (again)";
    $enddate = `date`;
    chop $enddate;
    print F "# Accounting: time=" . ($endtime - $starttime) . " threads=1\n";
    print F "# Ended ($return_str) at " . $enddate . ", elapsed time " . ($endtime-$starttime) . " seconds\n";
    close(F);
    # The child reports only success (0) or failure (1) to the parent.
    exit($ret == 0 ? 0 : 1);
  } else {
    # Parent: remember the child so it can be reaped later.
    $pid[$jobid] = $childpid;
    $active_pids{$childpid} = $jobid;
    # print STDERR "Queued: " .  Dumper(\%active_pids) . "\n";
  }
}

# Now we have submitted all the jobs, lets wait until all the jobs finish
# Every child still listed in %active_pids is waited for explicitly and its
# wait status is stored in @fail, indexed by job id.
foreach $child (keys %active_pids) {
    $jobid=$active_pids{$child};
    $r = waitpid($pid[$jobid], 0);
    $code = $?;
    if ($r == -1) { die "run.pl: Error waiting for child process"; } # should never happen.
    if ($r != 0) { $fail[$jobid]=$code; $numfail++ if $code!=0; } # Child was reaped; count it as failed if its status is non-zero
}

# Some sanity checks:
# The $fail array should not contain undefined codes
# The number of non-zeros in that array  should be equal to $numfail
# We cannot do foreach() here, as the JOB ids do not necessarily start by zero
$failed_jids=0;
for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) {
  $job_return = $fail[$jobid];
  if (not defined $job_return ) {
    # print Dumper(\@fail);

    die "run.pl: Sanity check failed: we have indication that some jobs are running " .
      "even after we waited for all jobs to finish" ;
  }
  if ($job_return != 0 ){ $failed_jids++;}
}
if ($failed_jids != $numfail) {
  die "run.pl: Sanity check failed: cannot find out how many jobs failed ($failed_jids x $numfail)."
}
# Any failed job makes the whole run fail.
if ($numfail > 0) { $ret = 1; }

# On failure, tell the user where to find the log(s).
if ($ret != 0) {
  $njobs = $jobend - $jobstart + 1;
  if ($njobs == 1) {
    if (defined $jobname) {
      $logfile =~ s/$jobname/$jobstart/; # only one numbered job, so replace name with
                                         # that job.
    }
    print STDERR "run.pl: job failed, log is in $logfile\n";
    if ($logfile =~ m/JOB/) {
      # Terminate the hint with a newline so it does not run into whatever is
      # printed to the terminal next.
      print STDERR "run.pl: probably you forgot to put JOB=1:\$nj in your script.\n";
    }
  }
  else {
    # Several jobs: show the log name as a glob pattern covering all of them.
    $logfile =~ s/$jobname/*/g;
    print STDERR "run.pl: $numfail / $njobs failed, log is in $logfile\n";
  }
}


exit ($ret);


================================================
FILE: egs/espnet_utils/synth_wav.sh
================================================
#!/usr/bin/env bash

# Copyright 2019 Nagoya University (Takenori Yoshimura)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

# The script must be started from a recipe directory that provides path.sh
# and cmd.sh; both are sourced below.
if [ ! -f path.sh ] || [ ! -f cmd.sh ]; then
    echo "Please change directory to e.g., egs/ljspeech/tts1"
    exit 1
fi

# shellcheck disable=SC1091
. ./path.sh || exit 1;
# shellcheck disable=SC1091
. ./cmd.sh || exit 1;

# Default values for all options.  They are defined before
# utils/parse_options.sh is sourced further down.

# general configuration
backend=pytorch
stage=0        # start from 0 if you need to start from data preparation
stop_stage=100
ngpu=0         # number of gpus ("0" uses cpu, otherwise use gpu)
debugmode=1
verbose=1      # verbose option

# feature configuration
fs=22050      # sampling frequency
fmax=""       # maximum frequency
fmin=""       # minimum frequency
n_mels=80     # number of mel basis
n_fft=1024    # number of fft points
n_shift=256   # number of shift points
win_length="" # window length
cmvn=         # CMVN statistics file; looked up in the downloaded model when empty

# dictionary related
dict=         # token dictionary; looked up in the downloaded model when empty
trans_type="char"

# embedding related
input_wav=    # wav file used for x-vector extraction; looked up in the downloaded model when empty and needed

# decoding related
synth_model=    # model file; looked up in the downloaded model when empty
decode_config=  # decoding config; looked up in the downloaded model when empty
decode_dir=decode
griffin_lim_iters=64

# download related
models=ljspeech.transformer.v1
vocoder_models=ljspeech.parallel_wavegan.v1

# Usage text printed when the script is not called with exactly one argument.
# The model names in the two "details" tables match the names accepted by
# --models / --vocoder_models.
help_message=$(cat <<EOF
Usage:
    $ $0 <text>

Note:
    This code does not include text frontend part. Please clean the input
    text manually. Also, you need to modify feature configuration according
    to the model. Default setting is for ljspeech models, so if you want to
    use other pretrained models, please modify the parameters by yourself.
    For our provided models, you can find them in the tables at
    https://github.com/espnet/espnet#tts-demo.
    If you are a beginner, instead of this script, I strongly recommend trying
    the following colab notebook at first, which includes all of the procedure
    from text frontend, feature generation, and waveform generation.
    https://colab.research.google.com/github/espnet/notebook/blob/master/tts_realtime_demo.ipynb

Example:
    # make text file and then generate it
    # (for the default model, ljspeech, we use upper-case char sequence as the input)
    echo "THIS IS A DEMONSTRATION OF TEXT TO SPEECH." > example.txt
    $0 example.txt

    # also you can use multiple text
    echo "THIS IS A DEMONSTRATION OF TEXT TO SPEECH." > example.txt
    echo "TEXT TO SPEECH IS A TECHNIQUE TO CONVERT TEXT INTO SPEECH." >> example.txt
    $0 example.txt

    # you can specify the pretrained models
    $0 --models ljspeech.transformer.v3 example.txt

    # also you can specify vocoder model
    $0 --vocoder_models ljspeech.wavenet.mol.v2 example.txt

Available models:
    - ljspeech.tacotron2.v1
    - ljspeech.tacotron2.v2
    - ljspeech.tacotron2.v3
    - ljspeech.transformer.v1
    - ljspeech.transformer.v2
    - ljspeech.transformer.v3
    - ljspeech.fastspeech.v1
    - ljspeech.fastspeech.v2
    - ljspeech.fastspeech.v3
    - libritts.tacotron2.v1
    - libritts.transformer.v1
    - jsut.transformer.v1
    - jsut.tacotron2.v1
    - csmsc.transformer.v1
    - csmsc.fastspeech.v3

Available vocoder models:
    - ljspeech.wavenet.softmax.ns.v1
    - ljspeech.wavenet.mol.v1
    - ljspeech.parallel_wavegan.v1
    - libritts.wavenet.mol.v1
    - jsut.wavenet.mol.v1
    - jsut.parallel_wavegan.v1
    - csmsc.wavenet.mol.v1
    - csmsc.parallel_wavegan.v1

Model details:
    | Model name              | Lang | Fs [Hz] | Mel range [Hz] | FFT / Shift / Win [pt] | Input type |
    | ----------------------- | ---- | ------- | -------------- | ---------------------- | ---------- |
    | ljspeech.tacotron2.v1   | EN   | 22.05k  | None           | 1024 / 256 / None      | char       |
    | ljspeech.tacotron2.v2   | EN   | 22.05k  | None           | 1024 / 256 / None      | char       |
    | ljspeech.tacotron2.v3   | EN   | 22.05k  | None           | 1024 / 256 / None      | char       |
    | ljspeech.transformer.v1 | EN   | 22.05k  | None           | 1024 / 256 / None      | char       |
    | ljspeech.transformer.v2 | EN   | 22.05k  | None           | 1024 / 256 / None      | char       |
    | ljspeech.transformer.v3 | EN   | 22.05k  | None           | 1024 / 256 / None      | phn        |
    | ljspeech.fastspeech.v1  | EN   | 22.05k  | None           | 1024 / 256 / None      | char       |
    | ljspeech.fastspeech.v2  | EN   | 22.05k  | None           | 1024 / 256 / None      | char       |
    | ljspeech.fastspeech.v3  | EN   | 22.05k  | None           | 1024 / 256 / None      | phn        |
    | libritts.tacotron2.v1   | EN   | 24k     | 80-7600        | 1024 / 256 / None      | char       |
    | libritts.transformer.v1 | EN   | 24k     | 80-7600        | 1024 / 256 / None      | char       |
    | jsut.tacotron2.v1       | JP   | 24k     | 80-7600        | 2048 / 300 / 1200      | phn        |
    | jsut.transformer.v1     | JP   | 24k     | 80-7600        | 2048 / 300 / 1200      | phn        |
    | csmsc.transformer.v1    | ZH   | 24k     | 80-7600        | 2048 / 300 / 1200      | pinyin     |
    | csmsc.fastspeech.v3     | ZH   | 24k     | 80-7600        | 2048 / 300 / 1200      | pinyin     |

Vocoder model details:
    | Model name                     | Lang | Fs [Hz] | Mel range [Hz] | FFT / Shift / Win [pt] | Model type       |
    | ------------------------------ | ---- | ------- | -------------- | ---------------------- | ---------------- |
    | ljspeech.wavenet.softmax.ns.v1 | EN   | 22.05k  | None           | 1024 / 256 / None      | Softmax WaveNet  |
    | ljspeech.wavenet.mol.v1        | EN   | 22.05k  | None           | 1024 / 256 / None      | MoL WaveNet      |
    | ljspeech.parallel_wavegan.v1   | EN   | 22.05k  | None           | 1024 / 256 / None      | Parallel WaveGAN |
    | libritts.wavenet.mol.v1        | EN   | 24k     | None           | 1024 / 256 / None      | MoL WaveNet      |
    | jsut.wavenet.mol.v1            | JP   | 24k     | 80-7600        | 2048 / 300 / 1200      | MoL WaveNet      |
    | jsut.parallel_wavegan.v1       | JP   | 24k     | 80-7600        | 2048 / 300 / 1200      | Parallel WaveGAN |
    | csmsc.wavenet.mol.v1           | ZH   | 24k     | 80-7600        | 2048 / 300 / 1200      | MoL WaveNet      |
    | csmsc.parallel_wavegan.v1      | ZH   | 24k     | 80-7600        | 2048 / 300 / 1200      | Parallel WaveGAN |

EOF
)

# Parse command-line options via utils/parse_options.sh; the defaults are the
# variables assigned above.
# shellcheck disable=SC1091
. utils/parse_options.sh || exit 1;

# The single positional argument is the text file to synthesize.  It is read
# before "set -u" is enabled, so a missing argument only yields an empty value
# here and is reported by the argument-count check below.
txt=$1
download_dir=${decode_dir}/download

if [ $# -ne 1 ]; then
    echo "${help_message}"
    exit 1;
fi

# From here on: abort on errors, on unset variables and on failing pipelines.
set -e
set -u
set -o pipefail

# Download the pretrained model selected by ${models} into
# ${download_dir}/${models}.
# Globals read:    models, download_dir
# Globals written: share_url, dir
# Exits with status 1 when ${models} is not one of the known names.
# A ".complete" marker file is created after the download command returns, so
# later calls for the same model skip the download.
function download_models () {
    case "${models}" in
        "ljspeech.tacotron2.v1") share_url="https://drive.google.com/open?id=1dKzdaDpOkpx7kWZnvrvx2De7eZEdPHZs" ;;
        "ljspeech.tacotron2.v2") share_url="https://drive.google.com/open?id=11T9qw8rJlYzUdXvFjkjQjYrp3iGfQ15h" ;;
        "ljspeech.tacotron2.v3") share_url="https://drive.google.com/open?id=1hiZn14ITUDM1nkn-GkaN_M3oaTOUcn1n" ;;
        "ljspeech.transformer.v1") share_url="https://drive.google.com/open?id=13DR-RB5wrbMqBGx_MC655VZlsEq52DyS" ;;
        "ljspeech.transformer.v2") share_url="https://drive.google.com/open?id=1xxAwPuUph23RnlC5gym7qDM02ZCW9Unp" ;;
        "ljspeech.transformer.v3") share_url="https://drive.google.com/open?id=1M_w7nxI6AfbtSHpMO-exILnAc_aUYvXP" ;;
        "ljspeech.fastspeech.v1") share_url="https://drive.google.com/open?id=17RUNFLP4SSTbGA01xWRJo7RkR876xM0i" ;;
        "ljspeech.fastspeech.v2") share_url="https://drive.google.com/open?id=1zD-2GMrWM3thaDpS3h3rkTU4jIC0wc5B";;
        "ljspeech.fastspeech.v3") share_url="https://drive.google.com/open?id=1W86YEQ6KbuUTIvVURLqKtSNqe_eI2GDN";;
        "libritts.tacotron2.v1") share_url="https://drive.google.com/open?id=1iAXwC0AuWusa9AcFeUVkcNLG0I-hnSr3" ;;
        "libritts.transformer.v1") share_url="https://drive.google.com/open?id=1Xj73mDPuuPH8GsyNO8GnOC3mn0_OK4g3";;
        "jsut.transformer.v1") share_url="https://drive.google.com/open?id=1mEnZfBKqA4eT6Bn0eRZuP6lNzL-IL3VD" ;;
        "jsut.tacotron2.v1") share_url="https://drive.google.com/open?id=1kp5M4VvmagDmYckFJa78WGqh1drb_P9t" ;;
        "csmsc.transformer.v1") share_url="https://drive.google.com/open?id=1bTSygvonv5TS6-iuYsOIUWpN2atGnyhZ";;
        "csmsc.fastspeech.v3") share_url="https://drive.google.com/open?id=1T8thxkAxjGFPXPWPTcKLvHnd6lG0-82R";;
        *) echo "No such models: ${models}"; exit 1 ;;
    esac

    dir=${download_dir}/${models}
    mkdir -p "${dir}"
    # Download only once per model directory.
    if [ ! -e "${dir}/.complete" ]; then
        download_from_google_drive.sh "${share_url}" "${dir}" "tar.gz"
	touch "${dir}/.complete"
    fi
}

# Download the pretrained vocoder selected by ${vocoder_models} into
# ${download_dir}/${vocoder_models}.
# Globals read:    vocoder_models, download_dir
# Globals written: share_url, dir
# Exits with status 1 when ${vocoder_models} is not one of the known names.
# A ".complete" marker file is created after the download command returns, so
# later calls for the same vocoder skip the download.
function download_vocoder_models () {
    case "${vocoder_models}" in
        "ljspeech.wavenet.softmax.ns.v1") share_url="https://drive.google.com/open?id=1eA1VcRS9jzFa-DovyTgJLQ_jmwOLIi8L";;
        "ljspeech.wavenet.mol.v1") share_url="https://drive.google.com/open?id=1sY7gEUg39QaO1szuN62-Llst9TrFno2t";;
        "ljspeech.parallel_wavegan.v1") share_url="https://drive.google.com/open?id=1tv9GKyRT4CDsvUWKwH3s_OfXkiTi0gw7";;
        "libritts.wavenet.mol.v1") share_url="https://drive.google.com/open?id=1jHUUmQFjWiQGyDd7ZeiCThSjjpbF_B4h";;
        "jsut.wavenet.mol.v1") share_url="https://drive.google.com/open?id=187xvyNbmJVZ0EZ1XHCdyjZHTXK9EcfkK";;
        "jsut.parallel_wavegan.v1") share_url="https://drive.google.com/open?id=1OwrUQzAmvjj1x9cDhnZPp6dqtsEqGEJM";;
        "csmsc.wavenet.mol.v1") share_url="https://drive.google.com/open?id=1PsjFRV5eUP0HHwBaRYya9smKy5ghXKzj";;
        "csmsc.parallel_wavegan.v1") share_url="https://drive.google.com/open?id=10M6H88jEUGbRWBmU1Ff2VaTmOAeL8CEy";;
        *) echo "No such models: ${vocoder_models}"; exit 1 ;;
    esac

    dir=${download_dir}/${vocoder_models}
    mkdir -p "${dir}"
    # Download only once per vocoder directory.
    if [ ! -e "${dir}/.complete" ]; then
        # The archive extension is passed without a leading dot, in the same
        # form download_models uses.
        download_from_google_drive.sh "${share_url}" "${dir}" "tar.gz"
        touch "${dir}/.complete"
    fi
}

# Download trained models
# Each resource that was not given on the command line is taken from the
# downloaded model directory (first match found by "find").
if [ -z "${cmvn}" ]; then
    download_models
    cmvn=$(find "${download_dir}/${models}" -name "cmvn.ark" | head -n 1)
fi
if [ -z "${dict}" ]; then
    download_models
    dict=$(find "${download_dir}/${models}" -name "*_units.txt" | head -n 1)
fi
if [ -z "${synth_model}" ]; then
    download_models
    synth_model=$(find "${download_dir}/${models}" -name "model*.best" | head -n 1)
fi
if [ -z "${decode_config}" ]; then
    download_models
    decode_config=$(find "${download_dir}/${models}" -name "decode*.yaml" | head -n 1)
fi

# The model's json file is expected next to the model file, named after the
# part of the model file name before its first dot.
synth_json=$(basename "${synth_model}")
model_json="$(dirname "${synth_model}")/${synth_json%%.*}.json"
# Read the use_speaker_embedding value from that json; an input wav is needed
# unless the value is "false" or "0".
use_speaker_embedding=$(grep use_speaker_embedding "${model_json}" | sed -e "s/.*: \(.*\),/\1/")
if [ "${use_speaker_embedding}" = "false" ] || [ "${use_speaker_embedding}" = "0" ]; then
    use_input_wav=false
else
    use_input_wav=true
fi
# use_input_wav holds "true" or "false" and is executed as a command here.
if [ -z "${input_wav}" ] && "${use_input_wav}"; then
    download_models
    input_wav=$(find "${download_dir}/${models}" -name "*.wav" | head -n 1)
fi

# Check file existence
# Stop with an error message as soon as one required input is missing.
if [ ! -f "${cmvn}" ]; then
    echo "No such CMVN file: ${cmvn}"
    exit 1
fi
if [ ! -f "${dict}" ]; then
    echo "No such dictionary: ${dict}"
    exit 1
fi
if [ ! -f "${synth_model}" ]; then
    echo "No such E2E model: ${synth_model}"
    exit 1
fi
if [ ! -f "${decode_config}" ]; then
    echo "No such config file: ${decode_config}"
    exit 1
fi
# The input wav is only required when the model uses speaker embeddings.
if [ ! -f "${input_wav}" ] && ${use_input_wav}; then
    echo "No such WAV file for extracting meta information: ${input_wav}"
    exit 1
fi
if [ ! -f "${txt}" ]; then
    echo "No such txt file: ${txt}"
    exit 1
fi

# All outputs of this run go below ${decode_dir}/<text file name without .txt>.
base=$(basename "${txt}" .txt)
decode_dir=${decode_dir}/${base}

# Stage 0: build a data directory with one utterance per line of the input
# text (ids "<base>_<line number>", speaker and wav both set to the
# placeholder "X") and convert it to json.
if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
    echo "stage 0: Data preparation"

    # Start from an empty data directory.
    [ -e "${decode_dir}/data" ] && rm -rf "${decode_dir}/data"
    mkdir -p "${decode_dir}/data"
    # NOTE(review): wc -l counts newline characters, so a last line without a
    # trailing newline would presumably be left out — confirm.
    num_lines=$(wc -l < "${txt}")
    for idx in $(seq "${num_lines}"); do
        echo "${base}_${idx} X" >> "${decode_dir}/data/wav.scp"
        echo "X ${base}_${idx}" >> "${decode_dir}/data/spk2utt"
        echo "${base}_${idx} X" >> "${decode_dir}/data/utt2spk"
        # "text" entry: utterance id followed by the idx-th line of the input.
        echo -n "${base}_${idx} " >> "${decode_dir}/data/text"
        sed -n "${idx}"p "${txt}" >> "${decode_dir}/data/text"
    done

    mkdir -p "${decode_dir}/dump"
    data2json.sh --trans_type "${trans_type}" "${decode_dir}/data" "${dict}" > "${decode_dir}/dump/data.json"
fi

# Stage 1 (only when the model uses speaker embeddings): extract an x-vector
# from ${input_wav} and add it to the json made in stage 0.
if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ] && "${use_input_wav}"; then
    echo "stage 1: x-vector extraction"

    # Work on a copy of the data directory in which the placeholder "X" at the
    # end of each wav.scp line is replaced by the input wav path.
    utils/copy_data_dir.sh "${decode_dir}/data" "${decode_dir}/data2"
    sed -i -e "s;X$;${input_wav};g" "${decode_dir}/data2/wav.scp"
    utils/data/resample_data_dir.sh 16000 "${decode_dir}/data2"
    # shellcheck disable=SC2154
    steps/make_mfcc.sh \
        --write-utt2num-frames true \
        --mfcc-config conf/mfcc.conf \
        --nj 1 --cmd "${train_cmd}" \
        "${decode_dir}/data2" "${decode_dir}/log" "${decode_dir}/mfcc"
    utils/fix_data_dir.sh "${decode_dir}/data2"
    sid/compute_vad_decision.sh --nj 1 --cmd "$train_cmd" \
        "${decode_dir}/data2" "${decode_dir}/log" "${decode_dir}/mfcc"
    utils/fix_data_dir.sh "${decode_dir}/data2"

    # Fetch the pre-trained x-vector network once; the archive is unpacked in
    # the current directory and removed after the model has been moved.
    nnet_dir=${download_dir}/xvector_nnet_1a
    if [ ! -e "${nnet_dir}" ]; then
        echo "X-vector model does not exist. Download pre-trained model."
        wget http://kaldi-asr.org/models/8/0008_sitw_v2_1a.tar.gz
        tar xvf 0008_sitw_v2_1a.tar.gz
        mv 0008_sitw_v2_1a/exp/xvector_nnet_1a "${download_dir}"
        rm -rf 0008_sitw_v2_1a.tar.gz 0008_sitw_v2_1a
    fi
    sid/nnet3/xvector/extract_xvectors.sh --cmd "${train_cmd} --mem 4G" --nj 1 \
        "${nnet_dir}" "${decode_dir}/data2" \
        "${decode_dir}/xvectors"

    local/update_json.sh "${decode_dir}/dump/data.json" "${decode_dir}/xvectors/xvector.scp"
fi

# Stage 2: run tts_decode.py on the prepared json; the output is written
# under ${decode_dir}/outputs and the log to ${decode_dir}/log/decode.log.
if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
    echo "stage 2: Decoding"

    # decode_cmd is not assigned in this script (hence the shellcheck
    # exception); presumably it comes from the sourced cmd.sh — confirm.
    # shellcheck disable=SC2154
    ${decode_cmd} "${decode_dir}/log/decode.log" \
        tts_decode.py \
        --config "${decode_config}" \
        --ngpu "${ngpu}" \
        --backend "${backend}" \
        --debugmode "${debugmode}" \
        --verbose "${verbose}" \
        --out "${decode_dir}/outputs/feats" \
        --json "${decode_dir}/dump/data.json" \
        --model "${synth_model}"
fi

# outdir and the "_denorm" directory are set up unconditionally because
# stage 4 uses them as well.
outdir=${decode_dir}/outputs; mkdir -p "${outdir}_denorm"
# Stage 3: undo the CMVN normalization of the decoded features and convert
# them to waveforms with convert_fbank.sh.
if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
    echo "stage 3: Synthesis with Griffin-Lim"

    apply-cmvn --norm-vars=true --reverse=true "${cmvn}" \
        scp:"${outdir}/feats.scp" \
        ark,scp:"${outdir}_denorm/feats.ark,${outdir}_denorm/feats.scp"

    convert_fbank.sh --nj 1 --cmd "${decode_cmd}" \
        --fs "${fs}" \
        --fmax "${fmax}" \
        --fmin "${fmin}" \
        --n_fft "${n_fft}" \
        --n_shift "${n_shift}" \
        --win_length "${win_length}" \
        --n_mels "${n_mels}" \
        --iters "${griffin_lim_iters}" \
        "${outdir}_denorm" \
        "${decode_dir}/log" \
        "${decode_dir}/wav"

    echo ""
    echo "Synthesized wav: ${decode_dir}/wav/${base}.wav"
    echo ""
    echo "Finished"
fi

# Stage 4: synthesize waveforms with the neural vocoder selected by
# ${vocoder_models}; the output goes to ${decode_dir}/wav_wnv.
if [ "${stage}" -le 4 ] && [ "${stop_stage}" -ge 4 ]; then
    echo "stage 4: Synthesis with Neural Vocoder"
    # The corpus is the part of the model name before the first dot; the
    # acoustic model and the vocoder must come from the same corpus.
    model_corpus=$(echo ${models} | cut -d. -f 1)
    vocoder_model_corpus=$(echo ${vocoder_models} | cut -d. -f 1)
    if [ "${model_corpus}" != "${vocoder_model_corpus}" ]; then
        echo "${vocoder_models} does not support ${models} (Due to the sampling rate mismatch)."
        exit 1
    fi
    download_vocoder_models
    dst_dir=${decode_dir}/wav_wnv

    # The vocoder backend is chosen from the vocoder model name.
    # This is hardcoded for now.
    if [[ "${vocoder_models}" == *".mol."* ]]; then
        # Needs to use https://github.com/r9y9/wavenet_vocoder
        # that supports mixture of logistics/gaussians
        MDN_WAVENET_VOC_DIR=./local/r9y9_wavenet_vocoder
        # Clone and install the vocoder package on first use.
        if [ ! -d "${MDN_WAVENET_VOC_DIR}" ]; then
            git clone https://github.com/r9y9/wavenet_vocoder "${MDN_WAVENET_VOC_DIR}"
            cd "${MDN_WAVENET_VOC_DIR}" && pip install . && cd -
        fi
        checkpoint=$(find "${download_dir}/${vocoder_models}" -name "*.pth" | head -n 1)
        # The features are converted into a temporary "_npy" directory, which
        # is removed again after synthesis.
        feats2npy.py "${outdir}/feats.scp" "${outdir}_npy"
        python3 ${MDN_WAVENET_VOC_DIR}/evaluate.py "${outdir}_npy" "${checkpoint}" "${dst_dir}" \
            --hparams "batch_size=1" \
            --verbose "${verbose}"
        rm -rf "${outdir}_npy"
    elif [[ "${vocoder_models}" == *".parallel_wavegan."* ]]; then
        checkpoint=$(find "${download_dir}/${vocoder_models}" -name "*.pkl" | head -n 1)
        # Install the decoder command when it is not available yet.
        if ! command -v parallel-wavegan-decode > /dev/null; then
            pip install parallel-wavegan
        fi
        parallel-wavegan-decode \
            --scp "${outdir}/feats.scp" \
            --checkpoint "${checkpoint}" \
            --outdir "${dst_dir}" \
            --verbose "${verbose}"
    else
        # Any other vocoder name is handled by generate_wav.sh, which is given
        # the de-normalized features.
        checkpoint=$(find "${download_dir}/${vocoder_models}" -name "checkpoint*" | head -n 1)
        generate_wav.sh --nj 1 --cmd "${decode_cmd}" \
            --fs "${fs}" \
            --n_fft "${n_fft}" \
            --n_shift "${n_shift}" \
            "${checkpoint}" \
            "${outdir}_denorm" \
            "${decode_dir}/log" \
            "${dst_dir}"
    fi
    echo ""
    echo "Synthesized wav: ${decode_dir}/wav_wnv/${base}.wav"
    echo ""
    echo "Finished"
fi


================================================
FILE: egs/espnet_utils/text2token.py
================================================
#!/usr/bin/env python3

# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)


import argparse
import codecs
import re
import sys

# True when running under Python 2; used to choose between the text stream
# and its ``.buffer`` when wrapping stdin/stdout with UTF-8 codecs.
is_python2 = sys.version_info[0] == 2


def exist_or_not(i, match_pos):
    """Find the first matched span that contains index ``i``.

    Args:
        i: Character index to look up.
        match_pos: Sequence of ``[start, end)`` index pairs.

    Returns:
        ``(start, end)`` of the first pair with ``start <= i < end``, or
        ``(None, None)`` when no pair contains ``i``.
    """
    covering = next((span for span in match_pos if span[0] <= i < span[1]), None)
    if covering is None:
        return None, None
    return covering[0], covering[1]


def get_parser():
    """Create the argument parser of the text tokenizer script.

    Returns:
        An ``argparse.ArgumentParser`` with the options ``--nchar``,
        ``--skip-ncols``, ``--space``, ``--non-lang-syms``, ``--trans_type``
        and the optional positional ``text``.
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="convert raw text to tokenized text",
    )
    # NOTE: the continuation line below is part of the help string, so its
    # leading whitespace must stay as it is.
    cli.add_argument(
        "--nchar", "-n", type=int, default=1,
        help="number of characters to split, i.e., \
                        aabb -> a a b b with -n 1 and aa bb with -n 2",
    )
    cli.add_argument(
        "--skip-ncols", "-s", type=int, default=0, help="skip first n columns"
    )
    cli.add_argument("--space", type=str, default="<space>", help="space symbol")
    cli.add_argument(
        "--non-lang-syms", "-l", type=str, default=None,
        help="list of non-linguistic symobles, e.g., <NOISE> etc.",
    )
    # The input file is optional; the default False lets callers test it for
    # truthiness.
    cli.add_argument("text", nargs="?", type=str, default=False, help="input text")
    cli.add_argument(
        "--trans_type", "-t", type=str, default="char", choices=["char", "phn"],
        help="""Transcript type. char/phn. e.g., for TIMIT FADG0_SI1279 -
                        If trans_type is char,
                        read from SI1279.WRD file -> "bricks are an alternative"
                        Else if trans_type is phn,
                        read from SI1279.PHN file -> "sil b r ih sil k s aa r er n aa l
                        sil t er n ih sil t ih v sil" """,
    )
    return cli


def main():
    """Tokenize text from a file or stdin and print the result to stdout.

    Each input line is split on whitespace.  The first ``--skip-ncols``
    columns are printed unchanged, followed by a space; the rest of the line
    is tokenized:

    * ``--trans_type phn``: the text is split on single spaces and every
      occurrence of ``sil`` is replaced by the space symbol.
    * ``--trans_type char``: the text is split into characters, where each
      occurrence of a non-linguistic symbol counts as one unit, and the units
      are grouped ``--nchar`` at a time; spaces are replaced by the space
      symbol.

    Input and output are UTF-8.  ``sys.stdout`` is replaced by a UTF-8 writer
    as a side effect.
    """
    parser = get_parser()
    args = parser.parse_args()

    # One literal pattern per non-linguistic symbol.  Blank lines are skipped:
    # an empty pattern matches at every position without consuming input, so
    # searching for it repeatedly would never advance.
    rs = []
    if args.non_lang_syms is not None:
        with codecs.open(args.non_lang_syms, "r", encoding="utf-8") as f:
            nls = [x.rstrip() for x in f.readlines()]
        rs = [re.compile(re.escape(x)) for x in nls if x]

    if args.text:
        f = codecs.open(args.text, encoding="utf-8")
    else:
        f = codecs.getreader("utf-8")(sys.stdin if is_python2 else sys.stdin.buffer)

    sys.stdout = codecs.getwriter("utf-8")(
        sys.stdout if is_python2 else sys.stdout.buffer
    )
    n = args.nchar
    try:
        # readline() returns "" only at end of input.
        for line in iter(f.readline, ""):
            x = line.split()
            print(" ".join(x[: args.skip_ncols]), end=" ")
            a = " ".join(x[args.skip_ncols :])

            # get all matched positions: for every symbol, its successive
            # non-overlapping occurrences in the text
            match_pos = [[m.start(), m.end()] for r in rs for m in r.finditer(a)]

            if args.trans_type == "phn":
                a = a.split(" ")
            else:
                if len(match_pos) > 0:
                    # Walk through the text, emitting a matched symbol as one
                    # unit and everything else character by character.
                    chars = []
                    i = 0
                    while i < len(a):
                        start_pos, end_pos = exist_or_not(i, match_pos)
                        if start_pos is not None:
                            chars.append(a[start_pos:end_pos])
                            i = end_pos
                        else:
                            chars.append(a[i])
                            i += 1
                    a = chars

                # Group the units (characters or symbols) n at a time.
                a = [a[j : j + n] for j in range(0, len(a), n)]

            a_flat = ["".join(z) for z in a]

            a_chars = [z.replace(" ", args.space) for z in a_flat]
            if args.trans_type == "phn":
                a_chars = [z.replace("sil", args.space) for z in a_chars]
            print(" ".join(a_chars))
    finally:
        # Only a file opened here is closed; the stdin reader is left alone.
        if args.text:
            f.close()

# Run the tokenizer when this file is executed as a script.
if __name__ == "__main__":
    main()


================================================
FILE: egs/espnet_utils/text2vocabulary.py
================================================
#!/usr/bin/env python3

# Copyright 2018 Mitsubishi Electric Research Laboratories (Takaaki Hori)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

import argparse
import codecs
import logging
import six
import sys

# True when running under Python 2; used to choose between the text stream
# and its ``.buffer`` when wrapping stdin/stdout with UTF-8 codecs.
is_python2 = sys.version_info[0] == 2


def get_parser():
    """Create the argument parser of the vocabulary script.

    Returns:
        An ``argparse.ArgumentParser`` with the options ``--output``,
        ``--cutoff``, ``--vocabsize`` and zero or more positional
        ``text_files``.
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="create a vocabulary file from text files",
    )
    cli.add_argument(
        "--output", "-o", type=str, default="", help="output a vocabulary file"
    )
    cli.add_argument("--cutoff", "-c", type=int, default=0, help="cut-off frequency")
    cli.add_argument(
        "--vocabsize", "-s", type=int, default=20000, help="vocabulary size"
    )
    cli.add_argument("text_files", nargs="*", help="input text files")
    return cli


# Script body: count the tokens of all input files, keep the most frequent
# ones and write them as a numbered vocabulary.
if __name__ == "__main__":
    parser = get_parser()
    args = parser.parse_args()

    # count the word occurrences
    counts = {}
    exclude = ["<sos>", "<eos>", "<unk>"]
    # Without input files, read from stdin ("-").
    if len(args.text_files) == 0:
        args.text_files.append("-")
    for fn in args.text_files:
        fd = (
            codecs.open(fn, "r", encoding="utf-8")
            if fn != "-"
            else codecs.getreader("utf-8")(
                sys.stdin if is_python2 else sys.stdin.buffer
            )
        )
        try:
            for ln in fd.readlines():
                for tok in ln.split():
                    if tok not in exclude:
                        counts[tok] = counts.get(tok, 0) + 1
        finally:
            # Close files opened here even when reading fails; the stdin
            # reader is left alone.
            if fn != "-":
                fd.close()

    # limit the vocabulary size
    # Tokens are taken in order of decreasing count until the cut-off
    # frequency or the maximum vocabulary size is reached.
    total_count = sum(counts.values())
    invocab_count = 0
    vocabulary = []
    for w, c in sorted(counts.items(), key=lambda x: -x[1]):
        if c <= args.cutoff:
            break
        if len(vocabulary) >= args.vocabsize:
            break
        vocabulary.append(w)
        invocab_count += c

    # Input without any counted token has no out-of-vocabulary tokens; report
    # 0 instead of dividing by zero.
    if total_count > 0:
        oov_rate = float(total_count - invocab_count) / total_count * 100
    else:
        oov_rate = 0.0
    logging.warning("OOV rate = %.2f %%" % oov_rate)
    # write the vocabulary
    # "<unk>" always gets id 1; the kept tokens follow in sorted order,
    # numbered from 2.
    fd = (
        codecs.open(args.output, "w", encoding="utf-8")
        if args.output
        else codecs.getwriter("utf-8")(sys.stdout if is_python2 else sys.stdout.buffer)
    )
    try:
        six.print_("<unk> 1", file=fd)
        for n, w in enumerate(sorted(vocabulary)):
            six.print_("%s %d" % (w, n + 2), file=fd)
    finally:
        if args.output:
            fd.close()


================================================
FILE: egs/espnet_utils/text_norm.py
================================================
# author: tyriontian
# tyriontian@tencent.com

import sys
import os
import jieba
import argparse
import cn2an
from string import punctuation as en_pun
from zhon.hanzi import punctuation as zh_pun
# All punctuation characters handled by this module: ``string.punctuation``
# followed by the punctuation string imported from ``zhon.hanzi``.
pun = en_pun + zh_pun

def remove_punc(s):
    """Return ``s`` without any character contained in ``pun``."""
    return "".join(ch for ch in s if ch not in pun)

def is_all_chinese(strs):
    """Tell whether every character of ``strs`` lies in U+4E00..U+9FA5.

    An empty string yields ``True``.  Note that the upper bound differs from
    the one used by ``is_contain_chinese`` (U+9FFF).
    """
    return all('\u4e00' <= ch <= '\u9fa5' for ch in strs)

def is_contain_chinese(check_str):
    """Tell whether any character of ``check_str`` lies in U+4E00..U+9FFF.

    An empty string yields ``False``.
    """
    return any(u'\u4e00' <= ch <= u'\u9fff' for ch in check_str)

def splice(s):
    """Separate Chinese characters with spaces, keeping other runs intact.

    Every Chinese character (per ``is_all_chinese``) becomes a token of its
    own; consecutive non-Chinese characters are kept together as one token.
    The tokens are joined with single spaces.
    """
    pieces = []
    pending = ""  # run of non-Chinese characters collected so far
    for ch in s:
        if not is_all_chinese(ch):
            pending += ch
            continue
        # A Chinese character ends the current non-Chinese run, if any.
        if pending:
            pieces.append(pending)
            pending = ""
        pieces.append(ch)

    # Flush a run that reaches the end of the string.
    if pending:
        pieces.append(pending)

    return " ".join(pieces)

# Digit-by-digit reading, used when a digit run cannot be converted as a
# number.
digit_dict = {"0": "零",
              "1": "一",
              "2": "二",
              "3": "三",
              "4": "四",
              "5": "五",
              "6": "六",
              "7": "七",
              "8": "八",
              "9": "九",}


def _digits_to_chinese(digits):
    """Convert one run of digit characters to Chinese numerals.

    ``cn2an.an2cn`` is tried first.  When it raises, a message is printed and
    the run is spelled out digit by digit with ``digit_dict``; characters
    without an entry there are dropped.
    """
    try:
        return cn2an.an2cn(digits)
    except Exception:
        print(f"cannot convert digit {digits}")
        return "".join([digit_dict.get(x, "") for x in digits])


def digit_norm(s):
    """Replace every run of digit characters in ``s`` by Chinese numerals.

    Runs are delimited by ``str.isdigit``.  All runs, including one at the
    very end of the string, are converted the same way, so a run the number
    converter rejects falls back to digit-by-digit reading instead of raising.

    Args:
        s: Input text.

    Returns:
        The text with digit runs replaced; all other characters are unchanged.
    """
    out = ""
    buf = ""  # digit run collected so far
    for c in s:
        if c.isdigit():
            buf += c
            continue
        # A non-digit ends the current run, if any.
        if buf:
            out += _digits_to_chinese(buf)
            buf = ""
        out += c
    # Flush a run that reaches the end of the string.
    if buf:
        out += _digits_to_chinese(buf)
    return out
       

def remove_blank_chn(s):
    """Drop spaces that sit between two Chinese characters.

    The string is stripped first.  A space is removed only when both its left
    and its right neighbour are Chinese (per ``is_all_chinese``); every other
    space, e.g. between English words or at a Chinese-English boundary, is
    kept.
    """
    s = s.strip()
    kept = []
    for idx, ch in enumerate(s):
        if ch != " ":
            kept.append(ch)
            continue
        # After strip() a space always has a neighbour on both sides.
        left_is_chn = is_all_chinese(s[idx - 1])
        right_is_chn = is_all_chinese(s[idx + 1])
        if not (left_is_chn and right_is_chn):  # keep chn-eng <space>
            kept.append(ch)
    return "".join(kept)

def add_blank_boundary(s):
    """Insert a space wherever Chinese and non-Chinese characters meet.

    The string is stripped first.  A space is added between two neighbouring
    characters when exactly one of them is Chinese (per ``is_all_chinese``).

    Args:
        s: Input text.

    Returns:
        The stripped text with boundary spaces added; ``""`` when the input is
        empty or contains only whitespace.
    """
    s = s.strip()
    # Nothing to do for empty input (and s[-1] below would fail on it).
    if not s:
        return ""

    out = []
    for cur, nxt in zip(s, s[1:]):
        out.append(cur)
        if is_all_chinese(cur) != is_all_chinese(nxt):
            out.append(" ")

    out.append(s[-1])
    return "".join(out).strip()

def split_eng_words(s):
    """Split every non-Chinese word of ``s`` into single characters.

    Words are separated by whitespace.  Words that are entirely Chinese (per
    ``is_all_chinese``) are kept whole; all tokens are joined with single
    spaces.
    """
    tokens = []
    for word in s.strip().split():
        if is_all_chinese(word):
            tokens.append(word)
        else:
            # Extending with a string adds its characters one by one.
            tokens.extend(word)
    return " ".join(tokens)

def split_chn_words(s):
    """Split every all-Chinese word of ``s`` into single characters.

    Words are separated by whitespace.  Words that are not entirely Chinese
    (per ``is_all_chinese``) are kept whole; all tokens are joined with single
    spaces.
    """
    tokens = []
    for word in s.strip().split():
        tokens.extend(list(word) if is_all_chinese(word) else word.split())
    return " ".join(tokens)

def upper_or_lower(s, upper=True):
    """Return ``s`` upper-cased when ``upper`` is truthy, else lower-cased."""
    return s.upper() if upper else s.lower()
 
def process_one_line(content, args):
    """Normalize the transcript of one utterance.

    Args:
        content: transcript text (without the utterance id).
        args: parsed command-line options; the attributes read here are
            ``ignore``, ``eng_upper``, ``segment``, ``segment_chn`` and
            ``segment_eng``.

    Returns:
        The normalized text. With ``args.segment`` the Chinese parts are
        additionally segmented by jieba and tokens are space-separated.
    """
    # (1) remove punctuation and space
    #     (remove_punc is defined elsewhere in this file)
    content = remove_punc(content)

    # (2) remove ignore symbols
    if args.ignore is not None:
        for symbol in args.ignore.split(","):
            # str.replace returns a new string, so the result has to be
            # assigned back; the previous code discarded it and --ignore
            # had no effect.
            content = content.replace(symbol, "")

    # (3) digit norm and upper/lower
    content = digit_norm(content)
    content = upper_or_lower(content, args.eng_upper)

    # (4) remove all blank except those between eng words
    #     This is for kaldi/text
    content = remove_blank_chn(content)

    if args.segment_chn:
        content = split_chn_words(content)

    # Nothing left to segment (e.g. the line was punctuation only).
    if not args.segment or not content:
        return content

    # (5) split by jieba. There should be a blank
    #     at any chn-eng boundary
    content = add_blank_boundary(content)
    if args.segment_eng:
        content = split_eng_words(content)

    out = []
    for p in content.strip().split():
        if is_all_chinese(p):
            out.extend(jieba.lcut(p, HMM=False))
        else:
            out.append(p)
    return " ".join(out)

def get_parser():
    """Build the command-line parser of the text normalization script."""
    parser = argparse.ArgumentParser(
        description="Normalize the text",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    add = parser.add_argument

    # file arguments
    add("--in-f", type=str, help="input file")
    add("--out-f", type=str, help="output file")
    add("--freq-dict", type=str, default=None, help="frequency dict")

    # segmentation switches
    segment_flags = (
        ("--segment", "do segmentation in output file"),
        ("--segment-eng", "segment english into chars"),
        ("--segment-chn", "segment mandarin into chars"),
    )
    for flag, text in segment_flags:
        add(flag, action="store_true", help=text)

    add("--ignore", type=str, default=None, help="symbol to remove in output file")
    add("--eng-upper", action="store_true", help="all english in upper class")
    return parser

def main():
    """Normalize every ``<uttid> <text>`` line of ``--in-f`` into ``--out-f``.

    Lines that carry only an utterance id are reported and skipped; blank
    lines are skipped silently. Both files are closed on exit, including
    when an error is raised part-way through.
    """
    parser = get_parser()
    args = parser.parse_args()

    if args.segment and args.freq_dict:
        jieba.set_dictionary(args.freq_dict)

    with open(args.out_f, "w", encoding="utf-8") as writer, open(
        args.in_f, encoding="utf-8"
    ) as reader:
        for line in reader:
            elems = line.strip().split()

            # a blank line has no utterance id at all
            if not elems:
                continue

            # we skip the empty string
            if len(elems) == 1:
                print(f"Empty text found for {elems[0]}")
                continue

            uttid = elems[0]
            content = " ".join(elems[1:])

            content = process_one_line(content, args)
            writer.write(f"{uttid} {content}\n")


if __name__ == "__main__":
    main()


================================================
FILE: egs/espnet_utils/trace_rnnt.py
================================================
# Author: Jinchuan Tian
# tyriontian@tencent.com

# A simple example to show the inference process of RNN-T modules
# The only dependency for this script is pytorch (torch1.7.1+cuda101)
#
# run: python3 trace_rnnt.py <resources-dir>
# egs: python3 trace_rnnt.py ./resources


import sys
import os
import torch
import json
from argparse import Namespace

# If you do not need the Espnet dependency, you can just copy the transducer directory
from espnet.nets.pytorch_backend.transducer.custom_encoder import CustomEncoder
from espnet.nets.pytorch_backend.transducer.rnn_decoder import DecoderRNNT
from espnet.nets.pytorch_backend.transducer.joint_network import JointNetwork


def main():
    """Load exported RNN-T modules and push dummy data through each of them.

    The first command-line argument is a directory holding ``model.json``
    (a ``[idim, odim, args]`` triple), ``encoder.pt``, ``decoder.pt`` and
    ``joint_net.pt``. The encoder, decoder and joint network are built from
    the json config, their weights are loaded, and random inputs are fed
    through them; only the resulting tensor sizes are printed.
    """
    # ---- parse configs ----
    if len(sys.argv) < 2:
        sys.exit("usage: python3 trace_rnnt.py <resources-dir>")
    export_dir = sys.argv[1]
    json_file = os.path.join(export_dir, "model.json")
    with open(json_file) as f:
        idim, odim, args = json.load(f)
    args = Namespace(**args)
    # Use the GPU when there is one, otherwise run on CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # ---- load modules ----
    # Checkpoints are mapped to CPU first so that they also load on a host
    # without CUDA; each module is moved to `device` afterwards.
    encoder = CustomEncoder(
        idim,
        args.enc_block_arch,
        input_layer=args.custom_enc_input_layer,
        repeat_block=args.enc_block_repeat,
        self_attn_type=args.custom_enc_self_attn_type,
        positional_encoding_type=args.custom_enc_positional_encoding_type,
        positionwise_activation_type=args.custom_enc_pw_activation_type,
        conv_mod_activation_type=args.custom_enc_conv_mod_activation_type,
        aux_task_layer_list=args.aux_task_layer_list,
    )
    enc_pt = os.path.join(export_dir, "encoder.pt")
    encoder.load_state_dict(torch.load(enc_pt, map_location="cpu"))
    encoder.eval().to(device)

    decoder = DecoderRNNT(
        odim,
        args.dtype,
        args.dlayers,
        args.dunits,
        args.char_list.index("<blank>"),
        args.dec_embed_dim,
        args.dropout_rate_decoder,
        args.dropout_rate_embed_decoder,
    )
    dec_pt = os.path.join(export_dir, "decoder.pt")
    decoder.load_state_dict(torch.load(dec_pt, map_location="cpu"))
    decoder.eval().to(device)

    joint_network = JointNetwork(
        odim,
        encoder.enc_out,
        args.dunits,
        args.joint_dim,
        args.joint_activation_type,
    )
    joint_pt = os.path.join(export_dir, "joint_net.pt")
    joint_network.load_state_dict(torch.load(joint_pt, map_location="cpu"))
    joint_network.eval().to(device)
    print("INFO: Successfully load encoder, decoder and joint-network")

    # ---- module inference ----
    B = 2                        # Batch_size
    T = 400                      # Maximum time index
    U = 4                        # Maximum word index
    enc_idim = idim
    enc_odim = encoder.enc_out
    n_vocab = odim
    dec_odim = args.dunits

    # For batch-inference, you may want to pass masks to the encoder and call it like
    # 'encoder(enc_in, masks)'. In this case, the paddings will not be considered.
    # See espnet/nets/pytorch_backend/nets_utils.py:make_non_pad_mask for details.
    # but it's ok if B = 1.
    enc_in = torch.rand([B, T, enc_idim]).to(device)
    enc_out, _ = encoder(enc_in, None)
    # enc_out: [B, sub(T), enc_odim]; per the original author's note T is
    # sub-sampled by a factor of 6.
    print("encoder_out size: ", enc_out.size())

    # decoder inference
    decoder.set_device(enc_out.device)  # needed before inference
    decoder.set_data_type(enc_out.dtype)  # needed before inference
    # The LSTM should work as long as the 'ey' is consistent with 'states'.
    # So you may use a cache and a state-select method to save computation.
    # One random token per step is fed for U steps; only the output of the
    # last step is kept in dec_out.
    states = decoder.init_state(B)
    for _ in range(U):
        tokens = torch.randint(low=0, high=n_vocab, size=[B, 1]).to(device)
        ey = decoder.embed(tokens)
        dec_out, states = decoder.rnn_forward(ey, states)
    print("decoder_out size: ", dec_out.size())  # dec_out: [B, 1, dec_odim]

    # joint-network inference
    # It is safe to feed two 4-dim tensors.
    # However, the joint network should work as long as two conditions are met.
    # (1) element-wise addition of enc_out and dec_out will not raise shape error (allow broadcastable)
    # (2) enc_out.size()[-1] == dec_out.size()[-1] == size_of_joint_net
    # The size of output should be the same with enc_out except the last dimension:
    # the last dimension is n_vocab
    enc_out = enc_out.unsqueeze(2)  # [B, sub(T), 1, enc_odim]
    # Only the last decoder step is available here, so the label axis has
    # length 1 (not U).
    dec_out = dec_out.unsqueeze(1)  # [B, 1, 1, dec_odim]
    joint_out = joint_network(enc_out, dec_out)
    # expected: [B, sub(T), 1, n_vocab] after broadcasting
    print("joint_out size: ", joint_out.size())

    # the output distribution is over this char_list: args.char_list


if __name__ == "__main__":
    main()


================================================
FILE: egs/espnet_utils/train_lms_srilm.sh
================================================
#!/usr/bin/env bash
# Copyright 2018 AIShell-Foundation(Authors:Jiayu DU, Xingyu NA, Bengu WU, Hao ZHENG)
#           2018 Beijing Shell Shell Tech. Co. Ltd. (Author: Hui BU)
# Apache 2.0

# Defaults. parse_options.sh is sourced after these assignments; by Kaldi
# convention that lets "--order 4" etc. override them (parse_options.sh is
# not part of this file -- confirm there).
unk="<unk>"
lm_opts="-wbdiscount"
order=3

. ./path.sh
. ./utils/parse_options.sh

if [ $# -ne 3 ]; then
  echo "train_lms.sh <lexicon> <word-segmented-text> <dir>"
  echo " e.g train_lms.sh data/local/dict/lexicon.txt data/local/train/text data/local/lm"
  echo $@
  exit 1;
fi

lexicon=$1
text=$2
dir=$3

# Both input files must exist. The loop variable is $f; the previous code
# tested the unset variable $x, so the check could never fail.
for f in "$text" "$lexicon"; do
  [ ! -f "$f" ] && echo "$0: No such file $f" && exit 1;
done

# train_lm.sh must be found on PATH.
kaldi_lm=`which train_lm.sh`
if [ -z "$kaldi_lm" ]; then
  echo "$0: train_lm.sh is not found. That might mean it's not installed"
  echo "$0: or it is not added to PATH"
  echo "$0: Use the script tools/extras/install_kaldi_lm.sh to install it"
  exit 1
fi

# Everything below writes into $dir.
mkdir -p $dir
cleantext=$dir/text.no_oov

# Replace every field of $text that is not listed in column 1 of the lexicon
# with $unk. The loop starts at field 1, so the utterance id is checked as
# well; the later steps skip field 1 again.
cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
  {for(n=1; n<=NF;n++) {  if (seen[$n]) { printf("%s ", $n); } else {printf("'$unk' ");} } printf("\n");}' \
  > $cleantext || exit 1;

# Word frequencies of the transcripts (fields 2..NF), most frequent first.
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
   sort -nr > $dir/word.counts || exit 1;

# Get counts from acoustic training transcripts, and add  one-count
# for each word in the lexicon (but not silence, we don't want it
# in the LM-- we'll add it optionally later).
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
  cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
   sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;

# note: we probably won't really make use of <unk> as there aren't any OOVs
cat $dir/unigram.counts  | awk '{print $2}' | get_word_map.pl "<s>" "</s>" $unk > $dir/word_map \
   || exit 1;

# note: ignore 1st field of train.txt, it's the utterance-id.
# Each word is replaced by its entry in word_map and the result is gzipped.
cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline<wmap)>0)map[$1]=$2;}
  { for(n=2;n<=NF;n++) { printf map[$n]; if(n<NF){ printf " "; } else { print ""; }}}' | gzip -c >$dir/train.gz \
   || exit 1;

# From here is some commands to do a baseline with SRILM (assuming
# you have it installed).
heldout_sent=200 # Don't change this if you want result to be comparable with
    # kaldi_lm results
sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities.
mkdir -p $sdir
# The first $heldout_sent sentences (utterance id removed) are held out ...
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
  head -$heldout_sent > $sdir/heldout
# ... and the training set starts AT line $heldout_sent.
# NOTE(review): "tail -n +N" begins with line N, so line 200 ends up in both
# heldout and train. Left untouched because of the comparability remark above.
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
  tail -n +$heldout_sent > $sdir/train

# Vocabulary for SRILM: column 1 of word_map plus the sentence boundary symbols.
cat $dir/word_map | awk '{print $1}' | cat - <(echo "<s>"; echo "</s>" ) > $sdir/wordlist

# NOTE(review): the output is always named srilm.o3g.kn.gz, even when $order
# is not 3 or $lm_opts selects another discounting method.
ngram-count -text $sdir/train -order $order -limit-vocab -vocab $sdir/wordlist -unk \
  -map-unk $unk $lm_opts -interpolate -lm $sdir/srilm.o3g.kn.gz
ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/heldout
# 0 zeroprobs, logprob= -250954 ppl= 90.5091 ppl1= 132.482

# Note: perplexity SRILM gives to Kaldi-LM model is same as kaldi-lm reports above.
# Difference in WSJ must have been due to different treatment of <UNK>.
# ngram -lm $dir/3gram-mincount/lm_unpruned.gz  -ppl $sdir/heldout
# 0 zeroprobs, logprob= -250913 ppl= 90.4439 ppl1= 132.379

echo "local/train_lms.sh succeeded"
exit 0


================================================
FILE: egs/espnet_utils/translate_wav.sh
================================================
#!/usr/bin/env bash

# Copyright 2020 The ESPnet Authors.
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

# The script relies on path.sh and cmd.sh of a recipe directory.
if [ ! -f path.sh ] || [ ! -f cmd.sh ]; then
    echo "Please change current directory to recipe directory e.g., egs/tedlium2/asr1"
    exit 1
fi

. ./path.sh

# general configuration
stage=0        # start from 0 if you need to start from data preparation
stop_stage=100
ngpu=0         # number of gpus ("0" uses cpu, otherwise use gpu)
debugmode=1
detokenize=true
verbose=1      # verbose option

# feature configuration
do_delta=false
cmvn=

# decoding parameter
trans_model=
decode_config=
decode_dir=decode
api=v2

# download related
models=must_c.transformer.v1.en-fr

help_message=$(cat <<EOF
Usage:
    $0 [options] <wav_file>

Options:
    --ngpu <ngpu>                   # Number of GPUs (Default: 0)
    --decode_dir <directory_name>   # Name of directory to store decoding temporary data
    --models <model_name>           # Model name (e.g. must_c.transformer.v1.en-fr)
    --cmvn <path>                   # Location of cmvn.ark
    --trans_model <path>            # Location of E2E model
    --decode_config <path>          # Location of configuration file
    --api <api_version>             # API version (v1 or v2)

Example:
    # Record audio from microphone input as example.wav
    rec -c 1 -r 16000 example.wav trim 0 5

    # Decode using model name
    $0 --models must_c.transformer.v1.en-fr example.wav

    # Decode using model file
    $0 --cmvn cmvn.ark --trans_model model.acc.best --decode_config conf/decode.yaml example.wav

    # Decode with GPU (require batchsize > 0 in configuration file)
    $0 --ngpu 1 example.wav

Available models:
    - must_c.transformer.v1.en-fr
    - fisher_callhome_spanish.transformer.v1.es-en
EOF
)
. utils/parse_options.sh || exit 1;

# make shellcheck happy
train_cmd=
decode_cmd=

. ./cmd.sh

wav=$1
download_dir=${decode_dir}/download

if [ $# -lt 1 ]; then
    echo "${help_message}"
    exit 1;
fi

# Strict mode is enabled only here, after the sourced helper scripts and the
# (possibly unset) $1 above have been handled.
set -e
set -u
set -o pipefail

# Check model name or model file is set
# (quoted so the test stays well-formed for an empty or space-containing value)
if [ -z "${models}" ]; then
    if [[ -z $cmvn || -z $trans_model || -z $decode_config ]]; then
        echo "Error: models or set of cmvn, trans_model and decode_config are required." >&2
        exit 1
    fi
fi

dir=${download_dir}/${models}
mkdir -p "${dir}"

# Download and unpack the pre-trained model named by ${models} into ${dir}.
# Does nothing when ${models} is empty. A ${dir}/.complete marker file makes
# repeated calls skip the download.
function download_models () {
    if [ -z "${models}" ]; then
        return
    fi
    case "${models}" in
        # TODO(karita): register more models
        "must_c.transformer.v1.en-fr") share_url="https://drive.google.com/open?id=1wFIAqxoBUioTKTLRLv29KzvphkUm3qdo" ;;
        "fisher_callhome_spanish.transformer.v1.es-en") share_url="https://drive.google.com/open?id=1hawp5ZLw4_SIHIT3edglxbKIIkPVe8n3" ;;
        *) echo "No such models: ${models}"; exit 1 ;;
    esac

    if [ ! -e "${dir}/.complete" ]; then
        download_from_google_drive.sh "${share_url}" "${dir}" ".tar.gz"
        touch "${dir}/.complete"
    fi
}

# Download trained models
# Every input that was not given explicitly is looked up inside the
# downloaded model directory.
if [ -z "${cmvn}" ]; then
    download_models
    cmvn=$(find ${download_dir}/${models} -name "cmvn.ark" | head -n 1)
fi
if [ -z "${trans_model}" ]; then
    download_models
    trans_model=$(find ${download_dir}/${models} -name "model*.best*" | head -n 1)
fi
if [ -z "${decode_config}" ]; then
    download_models
    decode_config=$(find ${download_dir}/${models} -name "decode*.yaml" | head -n 1)
fi
if [ -z "${wav}" ]; then
    download_models
    wav=$(find ${download_dir}/${models} -name "*.wav" | head -n 1)
fi

# Check file existence
if [ ! -f "${cmvn}" ]; then
    echo "No such CMVN file: ${cmvn}"
    exit 1
fi
if [ ! -f "${trans_model}" ]; then
    echo "No such E2E model: ${trans_model}"
    exit 1
fi
if [ ! -f "${decode_config}" ]; then
    echo "No such config file: ${decode_config}"
    exit 1
fi
if [ ! -f "${wav}" ]; then
    echo "No such WAV file: ${wav}"
    exit 1
fi

# All intermediate files of this run go to <decode_dir>/<wav basename>.
base=$(basename "$wav" .wav)
decode_dir=${decode_dir}/${base}

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    echo "stage 0: Data preparation"

    # Minimal Kaldi-style data directory with one utterance and a dummy
    # speaker / transcript "X".
    mkdir -p ${decode_dir}/data
    echo "$base $wav" > ${decode_dir}/data/wav.scp
    echo "X $base" > ${decode_dir}/data/spk2utt
    echo "$base X" > ${decode_dir}/data/utt2spk
    echo "$base X" > ${decode_dir}/data/text
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    echo "stage 1: Feature Generation"

    steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 1 --write_utt2num_frames true \
        ${decode_dir}/data ${decode_dir}/log ${decode_dir}/fbank

    feat_trans_dir=${decode_dir}/dump; mkdir -p ${feat_trans_dir}
    dump.sh --cmd "$train_cmd" --nj 1 --do_delta ${do_delta} \
        ${decode_dir}/data/feats.scp ${cmvn} ${decode_dir}/log \
        ${feat_trans_dir}
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    echo "stage 2: Json Data Preparation"

    # A throw-away one-entry dictionary is created for data2json.sh and
    # removed again right after.
    dict=${decode_dir}/dict
    echo "<unk> 1" > ${dict}
    feat_trans_dir=${decode_dir}/dump
    data2json.sh --feat ${feat_trans_dir}/feats.scp \
        ${decode_dir}/data ${dict} > ${feat_trans_dir}/data.json
    rm -f ${dict}
fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    echo "stage 3: Decoding"
    feat_trans_dir=${decode_dir}/dump


    ${decode_cmd} ${decode_dir}/log/decode.log \
        st_trans.py \
        --config ${decode_config} \
        --ngpu ${ngpu} \
        --backend pytorch \
        --debugmode ${debugmode} \
        --verbose ${verbose} \
        --trans-json ${feat_trans_dir}/data.json \
        --result-label ${decode_dir}/result.json \
        --model ${trans_model} \
        --api ${api}

    echo ""
    # Pull the value of "rec_text" out of the result json and drop <eos>.
    trans_text=$(grep rec_text ${decode_dir}/result.json | sed -e 's/.*: "\(.*\)".*/\1/' | sed -e 's/<eos>//')
    if $detokenize; then
        # "-e" introduces the sed script; the former "-s" is an unrelated
        # GNU-only option that other sed implementations reject.
        trans_text=$(echo ${trans_text} | sed -e 's/▁/ /g' | detokenizer.perl)
    fi
    echo "Translated text: ${trans_text}"
    echo ""
    echo "Finished"
fi


================================================
FILE: egs/espnet_utils/trim_silence.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright 2018 Nagoya University (Tomoki Hayashi)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)


import argparse
import codecs
import logging
import os

import kaldiio
import librosa
import matplotlib.pyplot as plt
import numpy

from espnet.utils.cli_utils import get_commandline_args


def _time_to_str(time_idx):
    time_idx = time_idx * 10 ** 4
    return "%06d" % time_idx


def get_parser():
    """Build the command-line parser of the silence trimming script.

    Options are declared in a table and registered in order, followed by
    the two positional arguments (input WAV scp, output segments file).
    """
    parser = argparse.ArgumentParser(
        description="Trim slience with simple power thresholding "
        "and make segments file.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    option_table = [
        (("--fs",), dict(type=int, help="Sampling frequency")),
        (
            ("--threshold",),
            dict(type=float, default=60, help="Threshold in decibels"),
        ),
        (
            ("--win_length",),
            dict(type=int, default=1024, help="Analisys window length in point"),
        ),
        (
            ("--shift_length",),
            dict(type=int, default=256, help="Shift length in point"),
        ),
        (
            ("--min_silence",),
            dict(type=float, default=0.01, help="minimum silence length"),
        ),
        (
            ("--figdir",),
            dict(type=str, default="figs", help="Directory to save figures"),
        ),
        (("--verbose", "-V"), dict(default=0, type=int, help="Verbose option")),
        (
            ("--normalize",),
            dict(
                choices=[1, 16, 24, 32],
                type=int,
                default=None,
                help="Give the bit depth of the PCM, "
                "then normalizes data to scale in [-1,1]",
            ),
        ),
        (("rspecifier",), dict(type=str, help="WAV scp file")),
        (("wspecifier",), dict(type=str, help="Segments file")),
    ]
    for names, options in option_table:
        parser.add_argument(*names, **options)

    return parser


def main():
    """Trim silence of every utterance and write a segments file.

    For each ``(utt_id, (rate, array))`` read from ``args.rspecifier`` the
    signal is trimmed with ``librosa.effects.trim``, a before/after plot is
    saved to ``args.figdir`` and one ``<utt_id> <utt_id> <start> <end>``
    line (seconds) is written to ``args.wspecifier``.

    Raises:
        ValueError: if the sampling rate of an utterance differs from ``--fs``.
    """
    parser = get_parser()
    args = parser.parse_args()

    # set logger
    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    # exist_ok avoids the check-then-create race when several jobs are
    # started in parallel with the same figure directory.
    os.makedirs(args.figdir, exist_ok=True)

    with kaldiio.ReadHelper(args.rspecifier) as reader, codecs.open(
        args.wspecifier, "w", encoding="utf-8"
    ) as f:
        for utt_id, (rate, array) in reader:
            # Explicit check instead of assert, which is removed under -O.
            if rate != args.fs:
                raise ValueError(
                    "sampling rate of %s is %s, expected %s"
                    % (utt_id, rate, args.fs)
                )
            array = array.astype(numpy.float32)
            if args.normalize is not None and args.normalize != 1:
                # scale integer PCM of the given bit depth to [-1, 1]
                array = array / (1 << (args.normalize - 1))
            array_trim, idx = librosa.effects.trim(
                y=array,
                top_db=args.threshold,
                frame_length=args.win_length,
                hop_length=args.shift_length,
            )
            # idx holds the (start, end) of the kept part; divide by fs to
            # get seconds
            start, end = idx / args.fs

            # save figure
            plt.subplot(2, 1, 1)
            plt.plot(array)
            plt.title("Original")
            plt.subplot(2, 1, 2)
            plt.plot(array_trim)
            plt.title("Trim")
            plt.tight_layout()
            plt.savefig(os.path.join(args.figdir, utt_id + ".png"))
            plt.close()

            # added minimum silence part
            start = max(0.0, start - args.min_silence)
            end = min(len(array) / args.fs, end + args.min_silence)

            # write to segments file
            segment = "%s %s %f %f\n" % (utt_id, utt_id, start, end)
            f.write(segment)


if __name__ == "__main__":
    main()


================================================
FILE: egs/espnet_utils/trim_silence.sh
================================================
#!/usr/bin/env bash

# Copyright 2018 Nagoya University (Tomoki Hayashi)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

# Defaults; they are listed as options in the help message below.
fs=16000
win_length=1024
shift_length=256
threshold=60
min_silence=0.01
normalize=16
cmd=run.pl
nj=32

help_message=$(cat <<EOF
Usage: $0 [options] <data-dir> <log-dir>
e.g.: $0 data/train exp/trim_silence/train
Options:
  --fs <fs>                      # sampling frequency (default=16000)
  --win_length <win_length>      # window length in point (default=1024)
  --shift_length <shift_length>  # shift length in point (default=256)
  --threshold <threshold>        # power threshold in db (default=60)
  --min_silence <sec>            # minimum silence lenght in sec (default=0.01)
  --normalize <bit>              # audio bit (default=16)
  --cmd <cmd>                    # how to run jobs (default=run.pl)
  --nj <nj>                      # number of parallel jobs (default=32)
EOF
)

. utils/parse_options.sh || exit 1;

if [ ! $# -eq 2 ]; then
    echo "${help_message}"
    exit 1;
fi

set -euo pipefail
data=$1
logdir=$2

# Scratch directory inside the data dir. The trap removes it on every exit
# path, so a failing step (set -e) no longer leaves tmp-XXXX behind.
tmpdir=$(mktemp -d "${data}/tmp-XXXX")
trap 'rm -rf "${tmpdir}"' EXIT

# Split wav.scp into one list per job.
split_scps=""
for n in $(seq ${nj}); do
    split_scps="${split_scps} ${tmpdir}/wav.${n}.scp"
done
utils/split_scp.pl ${data}/wav.scp ${split_scps} || exit 1;

# make segments file describing start and end time
${cmd} JOB=1:${nj} ${logdir}/trim_silence.JOB.log \
    MPLBACKEND=Agg trim_silence.py \
        --fs ${fs} \
        --win_length ${win_length} \
        --shift_length ${shift_length} \
        --threshold ${threshold} \
        --min_silence ${min_silence} \
        --normalize ${normalize} \
        --figdir ${logdir}/figs \
        scp:${tmpdir}/wav.JOB.scp \
        ${tmpdir}/segments.JOB

# concatenate segments
for n in $(seq ${nj}); do
    cat ${tmpdir}/segments.${n} || exit 1;
done > ${data}/segments || exit 1
rm -rf ${tmpdir}

# check
utils/validate_data_dir.sh --no-feats ${data}
echo "Successfully trimed silence part."


================================================
FILE: egs/espnet_utils/trn2ctm.py
================================================
#!/usr/bin/python

import argparse
import codecs
import math
import re
import sys

# True when running under a Python 2 interpreter.
is_python2 = sys.version_info.major == 2


def get_parser():
    """Build the trn-to-ctm argument parser (two optional positionals)."""
    parser = argparse.ArgumentParser(description="convert trn to ctm")
    positionals = (("trn", "input trn"), ("ctm", "output ctm"))
    for dest, text in positionals:
        parser.add_argument(dest, type=str, default=None, nargs="?", help=text)
    return parser


def main(args):
    """Parse the argument list ``args`` and run the trn-to-ctm conversion."""
    parsed = get_parser().parse_args(args)
    convert(trn=parsed.trn, ctm=parsed.ctm)


def convert(trn=None, ctm=None):
    if trn is not None:
        with codecs.open(trn, "r", encoding="utf-8") as trn:
            content = trn.readlines()
    else:
        trn = codecs.getreader("utf-8")(sys.stdin if is_python2 else sys.stdin.buffer)
        content = trn.readlines()
    split_content = []
    for i, line in enumerate(content):
        idx = line.rindex("(")
        split = [line[:idx].strip().upper(), line[idx + 1 :].strip()[:-1]]
        while "((" in split[0]:
            split[0] = split[0].replace("((", "(")
        while "  " in split[0]:
            split[0] = split[0].replace("  ", " ")
        segm_info = re.split("[-_]", split[1])
        segm_info = [s.strip() for s in segm_info]
        col1 = segm_info[0] + "_" + segm_info[1]
        col2 = segm_info[2]
        start_time_int = int(segm_info[6])
        end_time_int = int(segm_info[7])
        diff_int = end_time_int - start_time_int
        word_split = split[0].split(" ")
        word_split = list(
            filter(lambda x: len(x) > 0 and any([c != " " for c in x]), word_split)
        )
        if len(word_split) > 0:
            step_int = int(math.floor(float(diff_int) / len(word_split)))
            step = str(step_int)
            for j, word in enumerate(word_split):
                start_time = str(int(start_time_int + step_int * j))
                col3 = (
                    (start_time[:-2] if len(start_time) > 2 else "0")
                    + "."
                    + (start_time[-2:] if len(start_time) > 1 else "00")
                )
                if j == len(word_split) - 1:
                    diff = str(int(end_time_int - int(start_time)))
                else:
                    diff = step
                col4 = (diff[:-2] if len(diff) > 2 else "0") + "." + diff[-2:]
                segm_info = [col1, col2, col3, col4]
                split_content.append(" ".join(segm_info) + "  " + word)
    if ctm is not None:
        sys.stdout = codecs.open(ctm, "w", encoding="utf-8")
    else:
        sys.stdout = codecs.getwriter("utf-8")(
            sys.stdout if is_python2 else sys.stdout.buffer
        )
    for c_line in split_content:
        print(c_line)


if __name__ == "__main__":
    main(sys.argv[1:])


================================================
FILE: egs/espnet_utils/trn2stm.py
================================================
#!/usr/bin/python

import argparse
import codecs
import re
import sys

# True when running under a Python 2 interpreter.
is_python2 = sys.version_info.major == 2


def get_parser():
    """Build the trn-to-stm argument parser.

    ``--orig-stm`` and the two positionals (``trn``, ``stm``) are all
    optional and default to ``None``.
    """
    parser = argparse.ArgumentParser(description="convert trn to stm")
    optional = dict(type=str, default=None, nargs="?")
    parser.add_argument(
        "--orig-stm",
        help="Original stm file to add additional information to the generated one",
        **optional
    )
    parser.add_argument("trn", help="input trn", **optional)
    parser.add_argument("stm", help="output stm", **optional)
    return parser


def main(args):
    """Parse the argument list ``args`` and run the trn-to-stm conversion."""
    parsed = get_parser().parse_args(args)
    convert(trn=parsed.trn, stm=parsed.stm, orig_stm=parsed.orig_stm)


def _centisec_to_str(value):
    """Format an integer number of 1/100 units as ``<whole>.<two digits>``.

    5 -> "0.05", 50 -> "0.50", 12345 -> "123.45".
    """
    sign = "-" if value < 0 else ""
    whole, frac = divmod(abs(value), 100)
    return "%s%d.%02d" % (sign, whole, frac)


def convert(trn=None, stm=None, orig_stm=None):
    """Convert a trn file to stm.

    Every trn line is ``<words> (<id>)``. The id is split on ``-`` and ``_``:
    fields 0-1 form column 1, field 2 column 2, fields 3-5 column 3, and
    fields 6 and 7 are the integer start and end time. When ``orig_stm`` is
    given, its ``;;`` header lines are copied to the output and its sixth
    field is looked up by column 3 to fill column 6.

    Args:
        trn: path of the input trn file; ``None`` reads from stdin.
        stm: path of the output stm file; ``None`` writes to stdout.
        orig_stm: optional path of an existing stm file.

    Raises:
        KeyError: if ``orig_stm`` is given but has no entry for a segment.
    """
    if orig_stm is not None:
        has_orig = True
        header = []
        mapping = {}
        with codecs.open(orig_stm, "r", encoding="utf-8") as orig_reader:
            for raw in orig_reader.readlines():
                if raw.startswith(";;"):
                    header.append(raw.strip())
                    continue
                entry = raw.strip()
                if not entry:
                    # a blank line has no fields to index
                    continue
                fields = entry.split(" ")
                mapping[fields[2]] = fields[5]
    else:
        has_orig = False
        header = None
        mapping = None

    if trn is not None:
        with codecs.open(trn, "r", encoding="utf-8") as reader:
            content = reader.readlines()
    else:
        reader = codecs.getreader("utf-8")(
            sys.stdin if is_python2 else sys.stdin.buffer
        )
        content = reader.readlines()

    stm_lines = []
    for line in content:
        # blank lines carry no "(id)" part and would break rindex below
        if not line.strip():
            continue
        idx = line.rindex("(")
        words_part = line[:idx].strip().upper() + " "
        utt_id = line[idx + 1 :].strip()[:-1]  # drop the closing ")"
        while "((" in words_part:
            words_part = words_part.replace("((", "(")
        while "  " in words_part:
            words_part = words_part.replace("  ", " ")

        segm_info = [s.strip() for s in re.split("[-_]", utt_id)]
        col1 = segm_info[0] + "_" + segm_info[1]
        col2 = segm_info[2]
        col3 = segm_info[3] + "_" + segm_info[4] + "_" + segm_info[5]
        col4 = _centisec_to_str(int(segm_info[6]))
        col5 = _centisec_to_str(int(segm_info[7]))
        col6 = mapping[col3] if has_orig else ""
        fields = [col1, col2, col3, col4, col5, col6]
        stm_lines.append(" ".join(fields) + "  " + words_part)

    out_lines = (header if has_orig else []) + stm_lines
    if stm is not None:
        # the file is closed (and flushed) as soon as writing is done
        with codecs.open(stm, "w", encoding="utf-8") as writer:
            for out_line in out_lines:
                writer.write(out_line + "\n")
    else:
        writer = codecs.getwriter("utf-8")(
            sys.stdout if is_python2 else sys.stdout.buffer
        )
        for out_line in out_lines:
            writer.write(out_line + "\n")
        writer.flush()


if __name__ == "__main__":
    main(sys.argv[1:])


================================================
FILE: egs/espnet_utils/update_json.sh
================================================
#!/usr/bin/env bash

# Copyright 2020 Kyoto University (Hirofumi Inaguma)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

echo "$0 $*" >&2 # Print the command line for logging
. ./path.sh

# Defaults for the tokenization options used below.
nlsyms=""
oov="<unk>"
bpecode=""
verbose=0
trans_type=char

text=""
multilingual=false

help_message=$(cat << EOF
Usage: $0 <json> <data-dir> <dict>
e.g. $0 dump/train/data.json data/train data/lang_1char/train_units.txt
Options:
  --oov <oov-word>                                 # Default: <unk>
  --verbose <num>                                  # Default: 0
EOF
)
. utils/parse_options.sh

if [ $# != 3 ]; then
    echo "${help_message}" 1>&2
    exit 1;
fi

set -euo pipefail

json=$1
dir=$2
dic=$3
json_dir=$(dirname ${json})
# Scratch directory inside the data dir; removed on every exit path.
tmpdir=$(mktemp -d ${dir}/tmp-XXXXX)
trap 'rm -rf ${tmpdir}' EXIT

# Fall back to <data-dir>/text when no transcript file was given.
# (quoted so the test stays well-formed for an empty or space-containing value)
if [ -z "${text}" ]; then
    text=${dir}/text
fi

# 1. Create scp files for outputs
#    token.scp is produced by sentencepiece when a BPE model is given,
#    otherwise by text2token.py (with or without a non-linguistic symbol list).
mkdir -p ${tmpdir}/output
if [ -n "${bpecode}" ]; then
    if [ ${multilingual} = true ]; then
        # remove a space before the language ID
        paste -d " " <(awk '{print $1}' ${text}) <(cut -f 2- -d" " ${text} \
            | spm_encode --model=${bpecode} --output_format=piece | cut -f 2- -d" ") \
            > ${tmpdir}/output/token.scp
    else
        paste -d " " <(awk '{print $1}' ${text}) <(cut -f 2- -d" " ${text} \
            | spm_encode --model=${bpecode} --output_format=piece) \
            > ${tmpdir}/output/token.scp
    fi
elif [ -n "${nlsyms}" ]; then
    text2token.py -s 1 -n 1 -l ${nlsyms} ${text} --trans_type ${trans_type} > ${tmpdir}/output/token.scp
else
    text2token.py -s 1 -n 1 ${text} --trans_type ${trans_type} > ${tmpdir}/output/token.scp
fi
# Map tokens to integer ids; tokens missing from the dictionary become ${oov}.
< ${tmpdir}/output/token.scp utils/sym2int.pl --map-oov ${oov} -f 2- ${dic} > ${tmpdir}/output/tokenid.scp
# Output length = number of token ids per utterance.
awk '{print $1 " " NF-1}' ${tmpdir}/output/tokenid.scp > ${tmpdir}/output/olen.scp
# +2 comes from CTC blank and EOS
vocsize=$(tail -n 1 ${dic} | awk '{print $2}')
odim=$(echo "$vocsize + 2" | bc)
awk -v odim=${odim} '{print $1 " " odim}' ${text} > ${tmpdir}/output/odim.scp

cat ${text} > ${tmpdir}/output/text.scp


# 2. Create JSON files from each scp files
rm -f ${tmpdir}/*/*.json
for x in "${tmpdir}"/output/*.scp; do
    k=$(basename ${x} .scp)
    < ${x} scp2json.py --key ${k} > ${tmpdir}/output/${k}.json
done

# 3. Add the new fields to the json; the original is backed up first.
addjson.py --verbose ${verbose} -i false \
  ${json} ${tmpdir}/output/text.json ${tmpdir}/output/token.json ${tmpdir}/output/tokenid.json ${tmpdir}/output/olen.json ${tmpdir}/output/odim.json > ${tmpdir}/data.json
mkdir -p ${json_dir}/.backup
echo "json updated. original json is kept in ${json_dir}/.backup."
cp ${json} ${json_dir}/.backup/"$(basename ${json})"
cp ${tmpdir}/data.json ${json}

rm -fr ${tmpdir}


================================================
FILE: egs/espnet_utils/word_ngram_rescore.py
================================================
# author: tyriontian
# tyriontian@tencent.com

import json
import sys
import codecs
import torch

from espnet.nets.scorers.word_ngram import WordNgram

def score_texts(ngram, texts, ignore_strs=("<eos>", " ")):
    """Score hypothesis texts with a word-level N-gram LM.

    Every substring in ``ignore_strs`` is deleted from each text, in the
    order given, before the texts are handed to the scorer.

    Args:
        ngram: scorer object exposing ``score_texts(list_of_str)``; the
            value it returns must support ``.cpu().tolist()``.
        texts: iterable of hypothesis strings.
        ignore_strs: substrings to remove before scoring. The default is an
            immutable tuple so it cannot be shared and mutated across calls.

    Returns:
        The result of ``ngram.score_texts(cleaned).cpu().tolist()``.
    """
    cleaned = []
    for text in texts:
        for pattern in ignore_strs:
            text = text.replace(pattern, "")
        cleaned.append(text)
    return ngram.score_texts(cleaned).cpu().tolist()

def main():
    """Rescore the hypotheses of a decoding JSON with a word-level N-gram LM.

    Command line::

        word_ngram_rescore.py <in_json> <ngram_dir> <weight> <out_json>

    For every utterance under the top-level ``"utts"`` key, each hypothesis
    in its ``"output"`` list gets ``weight * lm_score`` added to ``"score"``
    and the raw LM score stored under ``"word_ngram_score"``; the list is
    then re-sorted by descending ``"score"``. The result is written to
    ``<out_json>`` as UTF-8 encoded JSON.
    """
    if len(sys.argv) < 5:
        sys.exit(
            "Usage: {} <in_json> <ngram_dir> <weight> <out_json>".format(
                sys.argv[0]
            )
        )
    js_file, ngram_dir, weight_arg, js_file_tgt = sys.argv[1:5]
    weight = float(weight_arg)

    # read json
    with open(js_file, "r", encoding="utf-8") as f:
        utts = json.load(f)["utts"]

    # load word-level N-gram LM
    device = torch.device("cpu")
    ngram = WordNgram(ngram_dir, device)

    # rescore each hypothesis and sort
    for utt in utts.values():
        hyp_lst = utt["output"]
        text_scores = score_texts(ngram, [hyp["rec_text"] for hyp in hyp_lst])

        for hyp, lm_score in zip(hyp_lst, text_scores):
            hyp["score"] += lm_score * weight
            hyp["word_ngram_score"] = lm_score

        # In-place sort: utt["output"] keeps referring to the same list.
        hyp_lst.sort(key=lambda hyp: hyp["score"], reverse=True)

    # write new json (binary mode so the bytes written are exactly UTF-8)
    with open(js_file_tgt, "wb") as f:
        f.write(
            json.dumps(
                {"utts": utts}, indent=4, ensure_ascii=False, sort_keys=True
            ).encode("utf_8")
        )
    
    
if __name__ == "__main__":
    main()


================================================
FILE: egs/espnet_utils/word_ngram_rescore.sh
================================================
#!/usr/bin/env bash
# Rescore the hypotheses in <decode-dir>/data.json with a word-level N-gram LM
# for LM weights 0.1 ... 0.9 (one background job per weight) and score every
# rescored result with score_sclite.sh.
#
# Usage:  word_ngram_rescore.sh <decode-dir> <word-ngram-dir> <dict>
# Output: <decode-dir>/word_ngram_rescore/<weight>/{data.1.json,decode_result.txt}

# Parse leading --options first so they are never mistaken for positional
# arguments.
. utils/parse_options.sh || exit 1;

if [ $# -lt 3 ]; then
    echo "Usage: $0 <decode-dir> <word-ngram-dir> <dict>" >&2
    exit 1
fi

decode_dir=$1
word_ngram=$2
dict=$3

# NOTE(review): nothing below writes into rescore/; the directory is still
# created so the resulting layout is unchanged.
mkdir -p "$decode_dir/rescore"

for w in 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9; do
    (
        subdir="$decode_dir/word_ngram_rescore/$w"
        mkdir -p "$subdir"
        # Do not score if rescoring failed: data.1.json would be missing or stale.
        python3 espnet_utils/word_ngram_rescore.py "$decode_dir/data.json" \
            "$word_ngram" "$w" "$subdir/data.1.json" || exit 1
        bash espnet_utils/score_sclite.sh "$subdir" "$dict" \
            > "$subdir/decode_result.txt"
    ) &
done
# Block until every per-weight background job has finished.
wait


================================================
FILE: egs/steps/align_basis_fmllr.sh
================================================
#!/usr/bin/env bash
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
# Copyright 2013  GoVivace Inc (Author: Nagendra Goel)
# Apache 2.0

# Computes training alignments; assumes features are (LDA+MLLT or delta+delta-delta)
# + fMLLR (probably with SAT models).
# It first computes an alignment with the final.alimdl (or the final.mdl if final.alimdl
# is not present), then estimates basis-fMLLR transforms from that alignment and
# re-aligns using the transformed features.

# If you supply the --use-graphs option, it will use the training
# graphs from the source directory (where the model is).  In this
# case the number of jobs must match the source directory.


# Begin configuration section.
stage=0
nj=4
cmd=run.pl
use_graphs=false
# Begin configuration.
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
basis_fmllr_opts="--fmllr-min-count=22  --num-iters=10 --size-scale=0.2 --step-size-iters=3"
beam=10
retry_beam=40
boost_silence=1.5 # factor by which to boost silence during alignment.
fmllr_update_type=full
# End configuration options.

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh # source the path.
. parse_options.sh || exit 1;

if [ $# != 4 ]; then
   echo "usage: steps/align_basis_fmllr.sh <data-dir> <lang-dir> <src-dir> <align-dir>"
   echo "e.g.:  steps/align_basis_fmllr.sh data/train data/lang exp/tri4 exp/tri4_ali"
   echo "Note: <src-dir> should ideally have been trained by steps/train_sat_basis.sh, or"
   echo "if a non-SAT system (not recommended), the basis should have been computed"
   echo "by steps/get_fmllr_basis.sh."
   echo "main options (for others, see top of script file)"
   echo "  --config <config-file>                           # config containing options"
   echo "  --nj <nj>                                        # number of parallel jobs"
   echo "  --use-graphs true                                # use graphs in src-dir"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   echo "  --fmllr-update-type (full|diag|offset|none)      # default full."
   exit 1;
fi

# Positional arguments (what is left after parse_options.sh).
data=$1
lang=$2
srcdir=$3
dir=$4
# Default graph location; set again below depending on --use-graphs.
graphdir=$dir

# Values used later: $oov for --map-oov in sym2int.pl, $silphonelist for
# weight-silence-post.
oov=`cat $lang/oov.int` || exit 1;
silphonelist=`cat $lang/phones/silence.csl` || exit 1;
sdata=$data/split$nj

mkdir -p $dir/log
echo $nj > $dir/num_jobs
# Split the data unless $sdata already exists and feats.scp is older than it.
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;


# Required inputs; note that fmllr.basis must already exist in $srcdir.
for f in $srcdir/tree  $srcdir/final.mdl $srcdir/fmllr.basis \
                       $data/feats.scp $lang/phones.txt; do
  if [ ! -f $f ]; then
    echo "$0: expected file $f to exist"
    exit 1
  fi
done

utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;


# Copy the model files into the alignment directory.  tree and final.mdl are
# mandatory; the remaining copies are not checked, and the *_opts files are
# optional (errors silenced, variables left empty when a file is absent).
cp $srcdir/{tree,final.mdl} $dir || exit 1;
cp $srcdir/final.occs $dir;
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options.
cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
cp $srcdir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option.
delta_opts=`cat $srcdir/delta_opts 2>/dev/null`
cp $srcdir/delta_opts $dir 2>/dev/null

# The presence of final.mat in the source dir selects the "lda" pipeline
# (splice + transform); otherwise delta features are used.
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"

# $sifeats is the speaker-independent feature pipeline (no fMLLR applied).
# The literal "JOB" inside it is a per-job placeholder, see "$cmd JOB=1:$nj" below.
case $feat_type in
  delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";;
  lda) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
    cp $srcdir/final.mat $dir
   ;;
  *) echo "Invalid feature type $feat_type" && exit 1;
esac

## Set up model and alignment model.
# $alimdl is used for the first alignment pass; it falls back to final.mdl
# when the source dir has no final.alimdl.
mdl=$srcdir/final.mdl
if [ -f $srcdir/final.alimdl ]; then
  alimdl=$srcdir/final.alimdl
else
  alimdl=$srcdir/final.mdl
fi
[ ! -f $mdl ] && echo "$0: no such model $mdl" && exit 1;
# Both models are read through gmm-boost-silence with --boost=$boost_silence
# for the phones listed in optional_silence.csl.
alimdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $alimdl - |"
mdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $mdl - |"

## Work out where we're getting the graphs from.
if $use_graphs; then
  # Reuse fsts.JOB.gz from the source dir; this requires that it was built
  # with the same number of jobs.
  [ "$nj" != "`cat $srcdir/num_jobs`" ] && \
    echo "$0: you specified --use-graphs true, but #jobs mismatch." && exit 1;
  [ ! -f $srcdir/fsts.1.gz ] && echo "No graphs in $srcdir" && exit 1;
  graphdir=$srcdir
else
  graphdir=$dir
  if [ $stage -le 0 ]; then
    echo "$0: compiling training graphs"
    # Transcripts converted to integer ids via words.txt (fields 2 onwards),
    # with --map-oov $oov handling words missing from the table.
    tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|";
    $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log  \
      compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/final.mdl  $lang/L.fst "$tra" \
        "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
  fi
fi


# Stage 1: first-pass alignment with the alignment model on the
# speaker-independent features; output goes to pre_ali.JOB.gz.
if [ $stage -le 1 ]; then
  echo "$0: aligning data in $data using $alimdl and speaker-independent features."
  $cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \
    gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$alimdl_cmd" \
    "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$sifeats" "ark:|gzip -c >$dir/pre_ali.JOB.gz" || exit 1;
fi

# Stage 2: estimate per-speaker basis-fMLLR transforms (trans.JOB) from the
# first-pass alignment, with silence phones weighted 0.0.  The pipes are
# escaped (\|) so they are passed on to $cmd instead of being interpreted by
# this shell.
if [ $stage -le 2 ]; then
  echo "$0: computing fMLLR transforms"
  if [ "$alimdl" != "$mdl" ]; then
    # Separate alignment model: convert the posteriors to Gaussian-level
    # posteriors with $alimdl, then estimate against $mdl.
    $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \
      ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \
      weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \
      gmm-post-to-gpost $alimdl "$sifeats" ark:- ark:- \| \
      gmm-est-basis-fmllr-gpost $basis_fmllr_opts --spk2utt=ark:$sdata/JOB/spk2utt \
        $mdl $srcdir/fmllr.basis "$sifeats"  ark,s,cs:- \
        ark:$dir/trans.JOB || exit 1;
  else
    # Same model for both passes: estimate directly from the posteriors.
    $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \
      ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \
      weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \
      gmm-est-basis-fmllr $basis_fmllr_opts --spk2utt=ark:$sdata/JOB/spk2utt \
         $mdl $srcdir/fmllr.basis "$sifeats" \
        ark,s,cs:- ark:$dir/trans.JOB || exit 1;
  fi
fi

# Adapted features: the speaker-independent pipeline followed by the
# per-speaker transform from trans.JOB.
feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |"

# Stage 3: final alignment with $mdl on the adapted features -> ali.JOB.gz.
if [ $stage -le 3 ]; then
  echo "$0: doing final alignment."
  $cmd JOB=1:$nj $dir/log/align_pass2.JOB.log \
    gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$mdl_cmd" \
    "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi

#rm $dir/pre_ali.*.gz

echo "$0: done aligning data."

utils/summarize_warnings.pl $dir/log

exit 0;


================================================
FILE: egs/steps/align_basis_fmllr_lats.sh
================================================
#!/usr/bin/env bash
#
# Copyright 2012-2015  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0

# Version of align_fmllr_lats.sh that uses "basis fMLLR", so it is suitable for
# situations where there is very little data per speaker (e.g. when there is a
# one-to-one mapping between utterances and speakers).  Intended for use where
# the model was trained with basis-fMLLR (i.e.  when you trained the model with
# train_sat_basis.sh where you normally would have trained with train_sat.sh),
# or when it was trained with SAT but you ran get_fmllr_basis.sh on the
# source-model directory.

# Begin configuration section.
stage=0
nj=4
cmd=run.pl
# Begin configuration.
scale_opts="--transition-scale=1.0 --self-loop-scale=0.1"
acoustic_scale=0.1
beam=10
retry_beam=40
final_beam=20  # For the lattice-generation phase there is no retry-beam.  This
               # is a limitation of gmm-latgen-faster.  We just use an
               # intermediate beam.  We'll lose a little data and it will be
               # slightly slower.  (however, the min-active of 200 that
               # gmm-latgen-faster defaults to may help.)
boost_silence=1.0 # factor by which to boost silence during alignment.
basis_fmllr_opts="--fmllr-min-count=22  --num-iters=10 --size-scale=0.2 --step-size-iters=3"

generate_ali_from_lats=false # If true, alignments (ali.*.gz) are also generated from the lattices.
# End configuration options.

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh # source the path.
. parse_options.sh || exit 1;

# Exactly four positional arguments are required; otherwise print usage.
# (The usage text previously named steps/align_fmllr_lats.sh, a different script.)
if [ $# != 4 ]; then
   echo "usage: steps/align_basis_fmllr_lats.sh <data-dir> <lang-dir> <src-dir> <align-dir>"
   echo "e.g.:  steps/align_basis_fmllr_lats.sh data/train data/lang exp/tri1 exp/tri1_lats"
   echo "main options (for others, see top of script file)"
   echo "  --config <config-file>                           # config containing options"
   echo "  --nj <nj>                                        # number of parallel jobs"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   exit 1;
fi

# Positional arguments (what is left after parse_options.sh).
data=$1
lang=$2
srcdir=$3
dir=$4

# fmllr.basis is read by the basis-fMLLR estimation in stage 2, so stop here
# instead of only printing a message and failing later inside the jobs.
if [ ! -f $srcdir/fmllr.basis ]; then
  echo "$0: expected $srcdir/fmllr.basis to exist.   Run get_fmllr_basis.sh on $srcdir."
  exit 1
fi

# Other required inputs.
for f in $data/feats.scp $lang/phones.txt $srcdir/final.mdl; do
  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
done


# Values used later: $oov for --map-oov in sym2int.pl, $silphonelist for
# weight-silence-post.
oov=`cat $lang/oov.int` || exit 1;
silphonelist=`cat $lang/phones/silence.csl` || exit 1;
sdata=$data/split$nj

mkdir -p $dir/log
echo $nj > $dir/num_jobs
# Split the data unless $sdata already exists and feats.scp is older than it.
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;

utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

# Copy the model files into the output directory.  tree and final.mdl are
# mandatory; final.alimdl and the *_opts files are optional (errors silenced,
# variables left empty when a file is absent).
cp $srcdir/{tree,final.mdl} $dir || exit 1;
cp $srcdir/final.alimdl $dir 2>/dev/null
cp $srcdir/final.occs $dir;
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options.
cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
cp $srcdir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option.
delta_opts=`cat $srcdir/delta_opts 2>/dev/null`
cp $srcdir/delta_opts $dir 2>/dev/null

# The presence of final.mat in the source dir selects the "lda" pipeline
# (splice + transform); otherwise delta features are used.
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"

# $sifeats is the speaker-independent feature pipeline (no fMLLR applied).
# The literal "JOB" inside it is a per-job placeholder, see "$cmd JOB=1:$nj" below.
case $feat_type in
  delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";;
  lda) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
    cp $srcdir/final.mat $dir
    cp $srcdir/full.mat $dir 2>/dev/null
   ;;
  *) echo "Invalid feature type $feat_type" && exit 1;
esac

## Set up model and alignment model.
# $alimdl is used for the first alignment pass; it falls back to final.mdl
# when the source dir has no final.alimdl.
mdl=$srcdir/final.mdl
if [ -f $srcdir/final.alimdl ]; then
  alimdl=$srcdir/final.alimdl
else
  alimdl=$srcdir/final.mdl
fi
[ ! -f $mdl ] && echo "$0: no such model $mdl" && exit 1;
# Both models are read through gmm-boost-silence with --boost=$boost_silence
# for the phones listed in optional_silence.csl.
alimdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $alimdl - |"
mdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $mdl - |"


## because gmm-latgen-faster doesn't support adding the transition-probs to the
## graph itself, we need to bake them into the compiled graphs.  This means we can't reuse previously compiled graphs,
## because the other scripts write them without transition probs.
if [ $stage -le 0 ]; then
  echo "$0: compiling training graphs"
  # Transcripts converted to integer ids via words.txt (fields 2 onwards),
  # with --map-oov $oov handling words missing from the table.
  tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|";
  # $scale_opts (transition / self-loop scales) is applied here, at graph
  # compilation time, rather than in the aligner.
  $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log  \
    compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $scale_opts $dir/tree $dir/final.mdl  $lang/L.fst "$tra" \
    "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
fi


# Stage 1: first-pass alignment with the alignment model on the
# speaker-independent features; output goes to pre_ali.JOB.gz.
if [ $stage -le 1 ]; then
  # Note: we need to set --transition-scale=0.0 --self-loop-scale=0.0 because,
  # as explained above, we compiled the transition probs into the training
  # graphs.
  echo "$0: aligning data in $data using $alimdl and speaker-independent features."
  $cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \
    gmm-align-compiled --transition-scale=0.0 --self-loop-scale=0.0 --acoustic-scale=$acoustic_scale \
        --beam=$beam --retry-beam=$retry_beam "$alimdl_cmd" \
    "ark:gunzip -c $dir/fsts.JOB.gz|" "$sifeats" "ark:|gzip -c >$dir/pre_ali.JOB.gz" || exit 1;
fi

# Stage 2: estimate per-speaker basis-fMLLR transforms (trans.JOB) from the
# first-pass alignment, with silence phones weighted 0.0.  The pipes are
# escaped (\|) so they are passed on to $cmd instead of being interpreted by
# this shell.
if [ $stage -le 2 ]; then
  echo "$0: computing fMLLR transforms"
  if [ "$alimdl" != "$mdl" ]; then
    # Separate alignment model: convert the posteriors to Gaussian-level
    # posteriors with $alimdl, then estimate against $mdl.
    $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \
      ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \
      weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \
      gmm-post-to-gpost $alimdl "$sifeats" ark:- ark:- \| \
      gmm-est-basis-fmllr-gpost $basis_fmllr_opts \
      --spk2utt=ark:$sdata/JOB/spk2utt $mdl $srcdir/fmllr.basis "$sifeats" \
      ark,s,cs:- ark:$dir/trans.JOB || exit 1;
  else
    # Same model for both passes: estimate directly from the posteriors.
    $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \
      ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \
      weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \
      gmm-est-basis-fmllr $basis_fmllr_opts \
      --spk2utt=ark:$sdata/JOB/spk2utt $mdl $srcdir/fmllr.basis "$sifeats" \
      ark,s,cs:- ark:$dir/trans.JOB || exit 1;
  fi
fi

# Adapted features: the speaker-independent pipeline followed by the
# per-speaker transform from trans.JOB.
feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |"

if [ $stage -le 3 ]; then
  # Warning: gmm-latgen-faster doesn't support a retry-beam so you may get more
  # alignment errors (however, it does have a default min-active=200 so this
  # will tend to reduce alignment errors).
  # --allow-partial=false makes sure we reach the end of the decoding graph.
  # --word-determinize=false makes sure we retain the alternative pronunciations of
  #   words (including alternatives regarding optional silences).
  # --lattice-beam=$final_beam keeps all the alternatives that were within the beam,
  #    it means we do no pruning of the lattice (lattices from a training transcription
  #    will be small anyway).
  echo "$0: generating lattices containing alternate pronunciations."
  $cmd JOB=1:$nj $dir/log/generate_lattices.JOB.log \
    gmm-latgen-faster --acoustic-scale=$acoustic_scale --beam=$final_beam \
        --lattice-beam=$final_beam --allow-partial=false --word-determinize=false \
      "$mdl_cmd" "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \
      "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1;
fi

if [ $stage -le 4 ] && $generate_ali_from_lats; then
  # If generate_ali_from_lats is true, ali.*.gz is generated in the lats dir
  # from the best path through each lattice.
  $cmd JOB=1:$nj $dir/log/generate_alignments.JOB.log \
    lattice-best-path --acoustic-scale=$acoustic_scale "ark:gunzip -c $dir/lat.JOB.gz |" \
    ark:/dev/null "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi

rm $dir/pre_ali.*.gz 2>/dev/null || true

echo "$0: done generating lattices from training transcripts."

utils/summarize_warnings.pl $dir/log

exit 0;


================================================
FILE: egs/steps/align_fmllr.sh
================================================
#!/usr/bin/env bash
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0

# Computes training alignments; assumes features are (LDA+MLLT or delta+delta-delta)
# + fMLLR (probably with SAT models).
# It first computes an alignment with the final.alimdl (or the final.mdl if final.alimdl
# is not present), then estimates fMLLR transforms from that alignment and
# re-aligns using the transformed features.

# If you supply the --use-graphs option, it will use the training
# graphs from the source directory (where the model is).  In this
# case the number of jobs must match the source directory.


# Begin configuration section.
stage=0
nj=4
cmd=run.pl
use_graphs=false
# Begin configuration.
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
beam=10
retry_beam=40
careful=false
boost_silence=1.0 # factor by which to boost silence during alignment.
fmllr_update_type=full
# End configuration options.

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh # source the path.
. parse_options.sh || exit 1;

if [ $# != 4 ]; then
   echo "usage: steps/align_fmllr.sh <data-dir> <lang-dir> <src-dir> <align-dir>"
   echo "e.g.:  steps/align_fmllr.sh data/train data/lang exp/tri1 exp/tri1_ali"
   echo "main options (for others, see top of script file)"
   echo "  --config <config-file>                           # config containing options"
   echo "  --nj <nj>                                        # number of parallel jobs"
   echo "  --use-graphs true                                # use graphs in src-dir"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   echo "  --fmllr-update-type (full|diag|offset|none)      # default full."
   exit 1;
fi

# Positional arguments (what is left after parse_options.sh).
data=$1
lang=$2
srcdir=$3
dir=$4

# Values used later: $oov for --map-oov in sym2int.pl, $silphonelist for
# weight-silence-post.
oov=`cat $lang/oov.int` || exit 1;
silphonelist=`cat $lang/phones/silence.csl` || exit 1;
sdata=$data/split$nj

mkdir -p $dir/log
echo $nj > $dir/num_jobs
# Split the data unless $sdata already exists and feats.scp is older than it.
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;

utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

# Copy the model files into the alignment directory.  tree and final.mdl are
# mandatory; final.alimdl and the *_opts files are optional (errors silenced,
# variables left empty when a file is absent).
cp $srcdir/{tree,final.mdl} $dir || exit 1;
cp $srcdir/final.alimdl $dir 2>/dev/null
cp $srcdir/final.occs $dir;
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options.
cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
cp $srcdir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option.
delta_opts=`cat $srcdir/delta_opts 2>/dev/null`
cp $srcdir/delta_opts $dir 2>/dev/null

# The presence of final.mat in the source dir selects the "lda" pipeline
# (splice + transform); otherwise delta features are used.
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"

# $sifeats is the speaker-independent feature pipeline (no fMLLR applied).
# The literal "JOB" inside it is a per-job placeholder, see "$cmd JOB=1:$nj" below.
case $feat_type in
  delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";;
  lda) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
    cp $srcdir/final.mat $dir
    cp $srcdir/full.mat $dir 2>/dev/null
   ;;
  *) echo "Invalid feature type $feat_type" && exit 1;
esac

## Set up model and alignment model.
# $alimdl is used for the first alignment pass; it falls back to final.mdl
# when the source dir has no final.alimdl.
mdl=$srcdir/final.mdl
if [ -f $srcdir/final.alimdl ]; then
  alimdl=$srcdir/final.alimdl
else
  alimdl=$srcdir/final.mdl
fi
[ ! -f $mdl ] && echo "$0: no such model $mdl" && exit 1;
# Both models are read through gmm-boost-silence with --boost=$boost_silence
# for the phones listed in optional_silence.csl.
alimdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $alimdl - |"
mdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $mdl - |"


## Work out where we're getting the graphs from.
if $use_graphs; then
  # Reuse fsts.JOB.gz from the source dir; this requires that it was built
  # with the same number of jobs.
  [ "$nj" != "`cat $srcdir/num_jobs`" ] && \
    echo "$0: you specified --use-graphs true, but #jobs mismatch." && exit 1;
  [ ! -f $srcdir/fsts.1.gz ] && echo "No graphs in $srcdir" && exit 1;
  graphdir=$srcdir
else
  graphdir=$dir
  if [ $stage -le 0 ]; then
    echo "$0: compiling training graphs"
    # Transcripts converted to integer ids via words.txt (fields 2 onwards),
    # with --map-oov $oov handling words missing from the table.
    tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|";
    $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log  \
      compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/final.mdl  $lang/L.fst "$tra" \
        "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
  fi
fi


# Stage 1: first-pass alignment with the alignment model on the
# speaker-independent features; output goes to pre_ali.JOB.gz.
if [ $stage -le 1 ]; then
  echo "$0: aligning data in $data using $alimdl and speaker-independent features."
  $cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \
    gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam --careful=$careful "$alimdl_cmd" \
    "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$sifeats" "ark:|gzip -c >$dir/pre_ali.JOB.gz" || exit 1;
fi

# Stage 2: estimate per-speaker fMLLR transforms (trans.JOB) from the
# first-pass alignment, with silence phones weighted 0.0.  The pipes are
# escaped (\|) so they are passed on to $cmd instead of being interpreted by
# this shell.
if [ $stage -le 2 ]; then
  echo "$0: computing fMLLR transforms"
  if [ "$alimdl" != "$mdl" ]; then
    # Separate alignment model: convert the posteriors to Gaussian-level
    # posteriors with $alimdl, then estimate against $mdl.
    $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \
      ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \
      weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \
      gmm-post-to-gpost $alimdl "$sifeats" ark:- ark:- \| \
      gmm-est-fmllr-gpost --fmllr-update-type=$fmllr_update_type \
      --spk2utt=ark:$sdata/JOB/spk2utt $mdl "$sifeats" \
      ark,s,cs:- ark:$dir/trans.JOB || exit 1;
  else
    # Same model for both passes: estimate directly from the posteriors.
    $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \
      ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \
      weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \
      gmm-est-fmllr --fmllr-update-type=$fmllr_update_type \
      --spk2utt=ark:$sdata/JOB/spk2utt $mdl "$sifeats" \
      ark,s,cs:- ark:$dir/trans.JOB || exit 1;
  fi
fi

# Adapted features: the speaker-independent pipeline followed by the
# per-speaker transform from trans.JOB.
feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |"

# Stage 3: final alignment with $mdl on the adapted features -> ali.JOB.gz.
if [ $stage -le 3 ]; then
  echo "$0: doing final alignment."
  $cmd JOB=1:$nj $dir/log/align_pass2.JOB.log \
    gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam --careful=$careful "$mdl_cmd" \
    "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi

# The first-pass alignments are only an intermediate product.  They may not
# exist (e.g. when the script was started with --stage above 1), so a failed
# removal is ignored, as in the *_lats.sh variants of this script.
rm $dir/pre_ali.*.gz 2>/dev/null || true

echo "$0: done aligning data."

steps/diagnostic/analyze_alignments.sh --cmd "$cmd" $lang $dir

utils/summarize_warnings.pl $dir/log

exit 0;


================================================
FILE: egs/steps/align_fmllr_lats.sh
================================================
#!/usr/bin/env bash
#
# Copyright 2012-2015  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0

# Version of align_fmllr.sh that generates lattices (lat.*.gz) with
# alignments of alternative pronunciations in them.  Mainly intended
# as a precursor to LF-MMI/chain training for now.

# Begin configuration section.
stage=0
nj=4
cmd=run.pl
# Begin configuration.
scale_opts="--transition-scale=1.0 --self-loop-scale=0.1"
acoustic_scale=0.1
beam=10
retry_beam=40
final_beam=20  # For the lattice-generation phase there is no retry-beam.  This
               # is a limitation of gmm-latgen-faster.  We just use an
               # intermediate beam.  We'll lose a little data and it will be
               # slightly slower.  (however, the min-active of 200 that
               # gmm-latgen-faster defaults to may help.)
boost_silence=1.0 # factor by which to boost silence during alignment.
fmllr_update_type=full
generate_ali_from_lats=false # If true, alignments (ali.*.gz) are also generated from the lattices.
max_active=7000
# End configuration options.

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh # source the path.
. parse_options.sh || exit 1;

if [ $# != 4 ]; then
   echo "usage: steps/align_fmllr_lats.sh <data-dir> <lang-dir> <src-dir> <align-dir>"
   echo "e.g.:  steps/align_fmllr_lats.sh data/train data/lang exp/tri1 exp/tri1_lats"
   echo "main options (for others, see top of script file)"
   echo "  --config <config-file>                           # config containing options"
   echo "  --nj <nj>                                        # number of parallel jobs"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   echo "  --fmllr-update-type (full|diag|offset|none)      # default full."
   exit 1;
fi

# Positional arguments (what is left after parse_options.sh).
data=$1
lang=$2
srcdir=$3
dir=$4

# Values used later: $oov for --map-oov in sym2int.pl, $silphonelist for
# weight-silence-post.
oov=`cat $lang/oov.int` || exit 1;
silphonelist=`cat $lang/phones/silence.csl` || exit 1;
sdata=$data/split$nj

mkdir -p $dir/log
echo $nj > $dir/num_jobs
# Split the data unless $sdata already exists and feats.scp is older than it.
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;

utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

# Copy the model files into the output directory.  tree and final.mdl are
# mandatory; final.alimdl and the *_opts files are optional (errors silenced,
# variables left empty when a file is absent).
cp $srcdir/{tree,final.mdl} $dir || exit 1;
cp $srcdir/final.alimdl $dir 2>/dev/null
cp $srcdir/final.occs $dir;
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options.
cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
cp $srcdir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option.
delta_opts=`cat $srcdir/delta_opts 2>/dev/null`
cp $srcdir/delta_opts $dir 2>/dev/null

# The presence of final.mat in the source dir selects the "lda" pipeline
# (splice + transform); otherwise delta features are used.
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"

# $sifeats is the speaker-independent feature pipeline (no fMLLR applied).
# The literal "JOB" inside it is a per-job placeholder, see "$cmd JOB=1:$nj" below.
case $feat_type in
  delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";;
  lda) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
    cp $srcdir/final.mat $dir
    cp $srcdir/full.mat $dir 2>/dev/null
   ;;
  *) echo "Invalid feature type $feat_type" && exit 1;
esac

## Set up model and alignment model.
# $alimdl is used for the first alignment pass; it falls back to final.mdl
# when the source dir has no final.alimdl.
mdl=$srcdir/final.mdl
if [ -f $srcdir/final.alimdl ]; then
  alimdl=$srcdir/final.alimdl
else
  alimdl=$srcdir/final.mdl
fi
[ ! -f $mdl ] && echo "$0: no such model $mdl" && exit 1;
# Both models are read through gmm-boost-silence with --boost=$boost_silence
# for the phones listed in optional_silence.csl.
alimdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $alimdl - |"
mdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $mdl - |"


## because gmm-latgen-faster doesn't support adding the transition-probs to the
## graph itself, we need to bake them into the compiled graphs.  This means we can't reuse previously compiled graphs,
## because the other scripts write them without transition probs.
if [ $stage -le 0 ]; then
  echo "$0: compiling training graphs"
  # Transcripts converted to integer ids via words.txt (fields 2 onwards),
  # with --map-oov $oov handling words missing from the table.
  tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|";
  # $scale_opts (transition / self-loop scales) is applied here, at graph
  # compilation time, rather than in the aligner.
  $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log  \
    compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $scale_opts $dir/tree $dir/final.mdl  $lang/L.fst "$tra" \
    "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
fi


# Stage 1: first-pass alignment with the alignment model on the
# speaker-independent features; output goes to pre_ali.JOB.gz.
if [ $stage -le 1 ]; then
  # Note: we need to set --transition-scale=0.0 --self-loop-scale=0.0 because,
  # as explained above, we compiled the transition probs into the training
  # graphs.
  echo "$0: aligning data in $data using $alimdl and speaker-independent features."
  $cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \
    gmm-align-compiled --transition-scale=0.0 --self-loop-scale=0.0 --acoustic-scale=$acoustic_scale \
        --beam=$beam --retry-beam=$retry_beam "$alimdl_cmd" \
    "ark:gunzip -c $dir/fsts.JOB.gz|" "$sifeats" "ark:|gzip -c >$dir/pre_ali.JOB.gz" || exit 1;
fi

# Stage 2: estimate per-speaker fMLLR transforms (trans.JOB) from the
# first-pass alignment, with silence phones weighted 0.0.  The pipes are
# escaped (\|) so they are passed on to $cmd instead of being interpreted by
# this shell.
if [ $stage -le 2 ]; then
  echo "$0: computing fMLLR transforms"
  if [ "$alimdl" != "$mdl" ]; then
    # Separate alignment model: convert the posteriors to Gaussian-level
    # posteriors with $alimdl, then estimate against $mdl.
    $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \
      ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \
      weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \
      gmm-post-to-gpost $alimdl "$sifeats" ark:- ark:- \| \
      gmm-est-fmllr-gpost --fmllr-update-type=$fmllr_update_type \
      --spk2utt=ark:$sdata/JOB/spk2utt $mdl "$sifeats" \
      ark,s,cs:- ark:$dir/trans.JOB || exit 1;
  else
    # Same model for both passes: estimate directly from the posteriors.
    $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \
      ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \
      weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \
      gmm-est-fmllr --fmllr-update-type=$fmllr_update_type \
      --spk2utt=ark:$sdata/JOB/spk2utt $mdl "$sifeats" \
      ark,s,cs:- ark:$dir/trans.JOB || exit 1;
  fi
fi

# Adapted features: the speaker-independent pipeline followed by the
# per-speaker transform from trans.JOB.
feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |"

if [ $stage -le 3 ]; then
  # Warning: gmm-latgen-faster doesn't support a retry-beam so you may get more
  # alignment errors (however, it does have a default min-active=200 so this
  # will tend to reduce alignment errors).
  # --allow-partial=false makes sure we reach the end of the decoding graph.
  # --word-determinize=false makes sure we retain the alternative pronunciations of
  #   words (including alternatives regarding optional silences).
  # --lattice-beam=$final_beam keeps all the alternatives that were within the beam,
  #    it means we do no pruning of the lattice (lattices from a training transcription
  #    will be small anyway).
  echo "$0: generating lattices containing alternate pronunciations."
  $cmd JOB=1:$nj $dir/log/generate_lattices.JOB.log \
    gmm-latgen-faster --max-active=$max_active --acoustic-scale=$acoustic_scale --beam=$final_beam \
        --lattice-beam=$final_beam --allow-partial=false --word-determinize=false \
      "$mdl_cmd" "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \
      "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1;
fi

if [ $stage -le 4 ] && $generate_ali_from_lats; then
  # If generate_ali_from_lats is true, ali.*.gz is generated in the lats dir
  # from the best path through each lattice.
  $cmd JOB=1:$nj $dir/log/generate_alignments.JOB.log \
    lattice-best-path --acoustic-scale=$acoustic_scale "ark:gunzip -c $dir/lat.JOB.gz |" \
    ark:/dev/null "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi

rm $dir/pre_ali.*.gz 2>/dev/null || true

echo "$0: done generating lattices from training transcripts."

utils/summarize_warnings.pl $dir/log

exit 0;


================================================
FILE: egs/steps/align_lvtln.sh
================================================
#!/usr/bin/env bash

# Copyright 2014  Vimal Manohar

# Computes training alignments; assumes features are (LDA+MLLT or delta+delta-delta)
# Will ignore fMLLR.
# Will estimate VTLN warping factors
# as a by product, which can be used to extract VTLN-warped features.

# Begin configuration section
stage=0
nj=4
cmd=run.pl
use_graphs=false
# Begin configuration.
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
beam=10.0
retry_beam=40
boost_silence=1.0 # factor by which to boost silence during alignment.
logdet_scale=1.0
cleanup=false

# End configuration section
echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

if [ $# -ne 4 ]; then
  # Exactly four positional arguments are required; otherwise print the
  # usage text (same lines as before, emitted from one here-document) and abort.
  cat <<EOF
Wrong #arguments ($#, expected 4)
Usage: steps/align_lvtln.sh [options] <data-dir> <lang-dir> <src-dir>  <align-dir>
 e.g.: steps/align_lvtln.sh data/train data/lang exp/tri2c exp/tri2c_ali
main options (for others, see top of script file)
  --config <config-file>                   # config containing options
  --nj <nj>                                # number of parallel jobs
  --use-graphs true                                # use graphs in src-dir
  --cmd <cmd>                              # Command to run in parallel with
EOF
  exit 1;
fi

# Positional arguments: <data-dir> <lang-dir> <src-dir> <align-dir>.
data=$1
lang=$2
srcdir=$3
dir=$4

# OOV symbol id (used with sym2int.pl --map-oov) and the silence phone list
# (given to weight-silence-post); both must be readable.
oov=`cat $lang/oov.int` || exit 1;
silphonelist=`cat $lang/phones/silence.csl` || exit 1;
sdata=$data/split$nj

# Refuse to run on a data dir that already carries warp factors.
if [ -f $data/spk2warp ]; then
  echo "$0: file $data/spk2warp exists.  This script expects non-VTLN features"
  exit 1;
fi

mkdir -p $dir/log
echo $nj > $dir/num_jobs
# Re-split the data unless the split dir exists and feats.scp is older than it.
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;

utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

# tree, final.mdl and final.lvtln are mandatory; final.occs is copied without
# checking the result.
cp $srcdir/{tree,final.mdl,final.lvtln} $dir || exit 1;
cp $srcdir/final.occs $dir;

# splice_opts / cmvn_opts may be absent in the source dir: errors are
# silenced and the variables are then empty.
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options.
cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
cp $srcdir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option.

## Build "$sifeats", the unadapted feature pipeline.
## The presence of final.mat in the source dir selects the splice+transform
## pipeline ("lda"); otherwise deltas are added ("delta").
feat_type=delta
if [ -f $srcdir/final.mat ]; then
  feat_type=lda
fi
echo "$0: feature type is $feat_type";

# Both pipelines begin with the same apply-cmvn stage.
cmvn_stage="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"

if [ "$feat_type" = delta ]; then
  sifeats="$cmvn_stage add-deltas ark:- ark:- |"
elif [ "$feat_type" = lda ]; then
  sifeats="$cmvn_stage splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
  # Keep the matrices next to the alignments.
  cp $srcdir/final.mat $srcdir/full.mat $dir
else
  echo "Invalid feature type $feat_type" && exit 1;
fi

## Set up model and alignment model.
## $alimdl is used for the first alignment pass (speaker-independent
## features); it falls back to final.mdl when no final.alimdl exists.
mdl=$srcdir/final.mdl
if [ -f $srcdir/final.alimdl ]; then
  alimdl=$srcdir/final.alimdl
else
  alimdl=$srcdir/final.mdl
fi
[ ! -f $mdl ] && echo "$0: no such model $mdl" && exit 1;
# The aligner reads each model through gmm-boost-silence, which is given
# --boost=$boost_silence and the optional-silence phone list.
alimdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $alimdl - |"
mdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $mdl - |"

## Decide which directory provides the training graphs (fsts.JOB.gz).
if $use_graphs; then
  # Re-use the graphs of the source dir: its job count must equal ours and
  # the graphs must actually be there.
  if [ "$nj" != "`cat $srcdir/num_jobs`" ]; then
    echo "$0: you specified --use-graphs true, but #jobs mismatch." && exit 1;
  fi
  if [ ! -f $srcdir/fsts.1.gz ]; then
    echo "No graphs in $srcdir" && exit 1;
  fi
  graphdir=$srcdir
else
  # Compile fresh graphs into the output dir (skipped when --stage > 0).
  graphdir=$dir
  if [ $stage -le 0 ]; then
    echo "$0: compiling training graphs"
    tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|";
    $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log  \
      compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/final.mdl  $lang/L.fst "$tra" \
        "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
  fi
fi

# Stage 1: first alignment pass with the alignment model on the unadapted
# features; the result goes to pre_ali.JOB.gz.
if [ $stage -le 1 ]; then
  echo "$0: aligning data in $data using $alimdl and speaker-independent features."
  $cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \
    gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$alimdl_cmd" \
    "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$sifeats" "ark:|gzip -c >$dir/pre_ali.JOB.gz" || exit 1;
fi

# NOTE(review): $subset_utts is assigned here but not referenced anywhere
# else in this script; the info message is printed regardless.
if [ -f $data/segments ]; then
  subset_utts="ark:extract-segments scp:$sdata/JOB/wav.scp $sdata/JOB/segments ark:- |"
else
  echo "$0 [info]: no segments file exists: using wav.scp directly."
  subset_utts="ark:wav-copy scp:$sdata/JOB/wav.scp ark:- |"
fi

## Get the first-pass LVTLN transforms
## Pipeline: pre_ali -> posteriors -> silence posteriors weighted by 0.0 ->
## gposts from $alimdl -> gmm-est-lvtln-trans, which writes
## trans_pass1.JOB and the text file warp_pass1.JOB.
if [ $stage -le 2 ]; then
  echo "$0: computing first-pass LVTLN transforms."
  $cmd JOB=1:$nj $dir/log/lvtln_pass1.JOB.log \
    ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \
    weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \
    gmm-post-to-gpost $alimdl "$sifeats" ark:- ark:- \| \
    gmm-est-lvtln-trans --verbose=1 --spk2utt=ark:$sdata/JOB/spk2utt --logdet-scale=$logdet_scale \
    $mdl $dir/final.lvtln "$sifeats" ark,s,cs:- ark:$dir/trans_pass1.JOB \
    ark,t:$dir/warp_pass1.JOB || exit 1;
fi
##

## Unadapted features with the first-pass transforms (trans_pass1.JOB) applied.
feats1="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans_pass1.JOB ark:- ark:- |"

## Do a second pass of estimating the LVTLN transform.

# Stage 3: realign with the final model on the first-pass features.
if [ $stage -le 3 ]; then
  echo "$0: realigning with transformed features"
  $cmd JOB=1:$nj $dir/log/align_pass2.JOB.log \
    gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$mdl_cmd" \
    "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats1" "ark:|gzip -c >$dir/ali_pass2.JOB.gz" || exit 1;
fi

# Stage 4: re-estimate the transforms from the second-pass alignment.
# Gposts are computed on $feats1, while the transform itself is estimated on
# $sifeats.  The logs go to lvtln_pass2.* so that the first-pass logs
# (lvtln_pass1.*) written by stage 2 are not overwritten.
if [ $stage -le 4 ]; then
  echo "$0: re-estimating LVTLN transforms"
  $cmd JOB=1:$nj $dir/log/lvtln_pass2.JOB.log \
    ali-to-post "ark:gunzip -c $dir/ali_pass2.JOB.gz|" ark:- \| \
    weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \
    gmm-post-to-gpost $alimdl "$feats1" ark:- ark:- \| \
    gmm-est-lvtln-trans --verbose=1 --spk2utt=ark:$sdata/JOB/spk2utt --logdet-scale=$logdet_scale \
    $mdl $dir/final.lvtln "$sifeats" ark,s,cs:- ark:$dir/trans.JOB \
    ark,t:$dir/warp.JOB || exit 1;
fi

## Unadapted features with the final transforms (trans.JOB) applied.
feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |"

if [ $stage -le 5 ]; then
  # This second alignment does not affect the transforms.
  echo "$0: realigning with the second-pass LVTLN transforms"
  $cmd JOB=1:$nj $dir/log/align.JOB.log \
    gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$mdl_cmd" \
    "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi

# Merge the per-job warp files of both passes and report how many speakers
# changed their warp factor between the passes.
if [ -f $dir/warp.1 ]; then
  for job in $(seq $nj); do
    cat $dir/warp_pass1.$job
  done > $dir/0.warp || exit 1;
  for job in $(seq $nj); do
    cat $dir/warp.$job
  done > $dir/final.warp || exit 1;

  # Both passes must have produced one line per speaker.
  ns1=$(wc -l < $dir/0.warp)
  ns2=$(wc -l < $dir/final.warp)
  if [ "$ns1" != "$ns2" ]; then
    echo "$0: Number of speakers differ pass1 vs pass2, $ns1 != $ns2" && exit 1;
  fi

  # Columns after paste: id, pass-1 warp, id, final warp.
  paste $dir/0.warp $dir/final.warp | awk '{x=$2 - $4; if ((x>0?x:-x) > 0.010001) { print $1, $2, $4; }}' > $dir/warp_changed
  nc=$(wc -l < $dir/warp_changed)
  echo "$0: For $nc speakers out of $ns1, warp changed pass1 vs pass2 by >0.01, see $dir/warp_changed for details"
fi

# Diagnostics: only possible when speaker genders are known and the merged
# warp file exists (final.warp is only written when warp.1 was produced).
if [ -f $data/spk2gender ] && [ -f $dir/final.warp ]; then
  # To make it easier to eyeball the male and female speakers' warps
  # separately, separate them out.
  for g in m f; do # means: for gender in male female
    cat $dir/final.warp | \
      utils/filter_scp.pl <(grep -w $g $data/spk2gender | awk '{print $1}') > $dir/final.warp.$g
    echo -n "The last few warp factors for gender $g are: "
    tail -n 10 $dir/final.warp.$g | awk '{printf("%s ", $2);}';
    echo
  done
fi

# Optional removal of intermediate files.  -f keeps this quiet when some of
# them are already gone (e.g. a rerun with a later --stage).
if $cleanup; then
  rm -f $dir/pre_ali.*.gz $dir/ali_pass?.*.gz $dir/trans_pass1.* $dir/warp_pass1.* $dir/warp.*
fi

exit 0;


================================================
FILE: egs/steps/align_raw_fmllr.sh
================================================
#!/usr/bin/env bash
# Copyright 2012-2013  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0

# Computes training alignments; assumes features are (LDA+MLLT or delta+delta-delta)
# + fMLLR (probably with SAT models).
# It first computes an alignment with the final.alimdl (or the final.mdl if final.alimdl
# is not present), then does 2 iterations of fMLLR estimation.

# If you supply the --use-graphs option, it will use the training
# graphs from the source directory (where the model is).  In this
# case the number of jobs must match the source directory.


# Begin configuration section.
# Defaults for the script options; they can be overridden on the command line
# (e.g. --nj, --use-graphs; see the usage message) once parse_options.sh runs.
stage=0            # a step guarded by "[ $stage -le N ]" is skipped when stage > N
nj=4               # number of parallel jobs
cmd=run.pl         # how to run jobs
use_graphs=false   # if true, use the training graphs found in <src-dir>
# Alignment options.
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"  # passed to gmm-align-compiled
beam=10            # passed to gmm-align-compiled as --beam
retry_beam=40      # passed to gmm-align-compiled as --retry-beam
boost_silence=1.0 # factor by which to boost silence during alignment.
# End configuration options.

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh # source the path.
. parse_options.sh || exit 1;

# Exactly four positional arguments are required; otherwise print the usage
# text (naming this script, align_raw_fmllr.sh) and abort.
if [ $# != 4 ]; then
   echo "usage: steps/align_raw_fmllr.sh <data-dir> <lang-dir> <src-dir> <align-dir>"
   echo "e.g.:  steps/align_raw_fmllr.sh data/train data/lang exp/tri1 exp/tri1_ali"
   echo "main options (for others, see top of script file)"
   echo "  --config <config-file>                           # config containing options"
   echo "  --nj <nj>                                        # number of parallel jobs"
   echo "  --use-graphs true                                # use graphs in src-dir"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   exit 1;
fi

# Positional arguments: <data-dir> <lang-dir> <src-dir> <align-dir>.
data=$1
lang=$2
srcdir=$3
dir=$4

# OOV symbol id (used with sym2int.pl --map-oov) and the silence phone list
# (given to weight-silence-post); both must be readable.
oov=`cat $lang/oov.int` || exit 1;
silphonelist=`cat $lang/phones/silence.csl` || exit 1;
sdata=$data/split$nj

mkdir -p $dir/log
echo $nj > $dir/num_jobs
# Re-split the data unless the split dir exists and feats.scp is older than it.
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;

utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

# tree and final.mdl are mandatory; final.occs is copied without checking
# the result.
cp $srcdir/{tree,final.mdl} $dir || exit 1;
cp $srcdir/final.occs $dir;
# splice_opts / cmvn_opts may be absent in the source dir: errors are
# silenced and the variables are then empty.
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options.
cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
cp $srcdir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option.

# Both matrices are needed below (get-full-lda-mat, transform-feats and the
# cp), so stop here instead of failing later with a less clear error.
if [[ ! -f $srcdir/final.mat || ! -f $srcdir/full.mat ]]; then
  echo "$0: we require final.mat and full.mat in the source directory $srcdir"
  exit 1;
fi

# Read-pipe producing the matrix that is handed to gmm-est-fmllr-raw(-gpost).
full_lda_mat="get-full-lda-mat --print-args=false $srcdir/final.mat $srcdir/full.mat -|"
cp $srcdir/full.mat $srcdir/final.mat $dir

# Dimension of the features in feats.scp; passed as --raw-feat-dim.
raw_dim=$(feat-to-dim scp:$data/feats.scp -) || exit 1;
! [ "$raw_dim" -gt 0 ] && echo "raw feature dim not set" && exit 1;

# $splicedfeats: apply-cmvn + splice-feats; $sifeats additionally applies final.mat.
splicedfeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- |"
sifeats="$splicedfeats transform-feats $srcdir/final.mat ark:- ark:- |"

## Set up model and alignment model.
## $alimdl is used for the first alignment pass (speaker-independent
## features); it falls back to final.mdl when no final.alimdl exists.
mdl=$srcdir/final.mdl
if [ -f $srcdir/final.alimdl ]; then
  alimdl=$srcdir/final.alimdl
else
  alimdl=$srcdir/final.mdl
fi
[ ! -f $mdl ] && echo "$0: no such model $mdl" && exit 1;
# The aligner reads each model through gmm-boost-silence, which is given
# --boost=$boost_silence and the optional-silence phone list.
alimdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $alimdl - |"
mdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $mdl - |"


## Work out where we're getting the graphs from.
if $use_graphs; then
  # Re-using the source dir's graphs requires the same number of jobs.
  [ "$nj" != "`cat $srcdir/num_jobs`" ] && \
    echo "$0: you specified --use-graphs true, but #jobs mismatch." && exit 1;
  [ ! -f $srcdir/fsts.1.gz ] && echo "No graphs in $srcdir" && exit 1;
  graphdir=$srcdir
else
  # Compile fresh graphs into the output dir (skipped when --stage > 0).
  graphdir=$dir
  if [ $stage -le 0 ]; then
    echo "$0: compiling training graphs"
    tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|";
    $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log  \
      compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/final.mdl  $lang/L.fst "$tra" \
        "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
  fi
fi


# Stage 1: first alignment pass with the alignment model on the
# speaker-independent features; the result goes to pre_ali.JOB.gz.
if [ $stage -le 1 ]; then
  echo "$0: aligning data in $data using $alimdl and speaker-independent features."
  $cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \
    gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$alimdl_cmd" \
    "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$sifeats" "ark:|gzip -c >$dir/pre_ali.JOB.gz" || exit 1;
fi

# Stage 2: estimate the transforms (raw_trans.JOB) from the first-pass
# alignment, with silence posteriors weighted by 0.0.
if [ $stage -le 2 ]; then
  echo "$0: computing fMLLR transforms"
  if [ "$alimdl" != "$mdl" ]; then
    # Separate alignment model: convert the posteriors to gposts with
    # $alimdl first, then estimate with $mdl.
    $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \
      ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \
      weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \
      gmm-post-to-gpost $alimdl "$sifeats" ark:- ark:- \| \
      gmm-est-fmllr-raw-gpost --raw-feat-dim=$raw_dim --spk2utt=ark:$sdata/JOB/spk2utt \
       $mdl "$full_lda_mat" "$splicedfeats" ark,s,cs:- ark:$dir/raw_trans.JOB || exit 1;
  else
    # Same model for both roles: estimate directly from the posteriors.
    $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \
      ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \
      weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \
      gmm-est-fmllr-raw --raw-feat-dim=$raw_dim --spk2utt=ark:$sdata/JOB/spk2utt $mdl "$full_lda_mat" \
       "$splicedfeats" ark,s,cs:- ark:$dir/raw_trans.JOB || exit 1;
  fi
fi

# Final feature pipeline: the estimated raw_trans.JOB transforms are applied
# right after apply-cmvn, i.e. before splicing and final.mat.
feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/raw_trans.JOB ark:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"

# Stage 3: final alignment with the final model on the adapted features.
if [ $stage -le 3 ]; then
  echo "$0: doing final alignment."
  $cmd JOB=1:$nj $dir/log/align_pass2.JOB.log \
    gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$mdl_cmd" \
    "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi

# The first-pass alignments are not needed any more.
rm $dir/pre_ali.*.gz

echo "$0: done aligning data."

steps/diagnostic/analyze_alignments.sh --cmd "$cmd" $lang $dir

utils/summarize_warnings.pl $dir/log

exit 0;


================================================
FILE: egs/steps/align_sgmm2.sh
================================================
#!/usr/bin/env bash
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0

# Computes training alignments and (if needed) speaker-vectors, given an
# SGMM system.  If the system is built on top of SAT, you should supply
# transforms with the --transform-dir option.

# If you supply the --use-graphs option, it will use the training
# graphs from the source directory.

# Begin configuration section.
# Defaults for the script options; they can be overridden on the command line
# (e.g. --nj, --transform-dir; see the usage message) once parse_options.sh runs.
stage=0   # a step guarded by "[ $stage -le N ]" is skipped when stage > N
nj=4      # number of parallel jobs
cmd=run.pl   # how to run jobs
use_graphs=false # use graphs from srcdir
use_gselect=false # use gselect info from srcdir [regardless, we use
   # Gaussian-selection info, we might have to compute it though.]
gselect=15  # Number of Gaussian-selection indices for SGMMs.
# Alignment options.
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
beam=10         # passed to sgmm2-align-compiled as --beam
retry_beam=40   # passed to sgmm2-align-compiled as --retry-beam
transform_dir=  # directory to find fMLLR transforms in.
# End configuration options.

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh # source the path.
. parse_options.sh || exit 1;

# Exactly four positional arguments are required; otherwise print the usage
# text and abort.
if [ $# != 4 ]; then
   echo "usage: steps/align_sgmm2.sh <data-dir> <lang-dir> <src-dir> <align-dir>"
   echo "e.g.:  steps/align_sgmm2.sh --transform-dir exp/tri3b data/train data/lang \\"
   echo "           exp/sgmm4a exp/sgmm5a_ali"
   echo "main options (for others, see top of script file)"
   echo "  --config <config-file>                           # config containing options"
   echo "  --nj <nj>                                        # number of parallel jobs"
   echo "  --use-graphs true                                # use graphs in src-dir"
   echo "  --transform-dir <transform-dir>                  # directory to find fMLLR transforms"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   exit 1;
fi

# Positional arguments: <data-dir> <lang-dir> <src-dir> <align-dir>.
data=$1
lang=$2
srcdir=$3
dir=$4

# OOV symbol id and silence phone list must be readable; splice_opts and
# cmvn_opts may be absent in the source dir (the variables are then empty).
oov=`cat $lang/oov.int` || exit 1;
silphonelist=`cat $lang/phones/silence.csl` || exit 1;
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
sdata=$data/split$nj

mkdir -p $dir/log
cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options.
cp $srcdir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option.
echo $nj > $dir/num_jobs
# Re-split the data unless the split dir exists and feats.scp is older than it.
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;

utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

# tree and final.mdl are mandatory; final.alimdl is copied only if present;
# final.occs is copied without checking the result.
cp $srcdir/{tree,final.mdl} $dir || exit 1;
[ -f $srcdir/final.alimdl ] && cp $srcdir/final.alimdl $dir
cp $srcdir/final.occs $dir;

## Set up features.
## The presence of final.mat in the source dir selects the splice+transform
## pipeline ("lda"); otherwise deltas are added ("delta").
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"

case $feat_type in
  delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
  lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
    cp $srcdir/final.mat $dir
   ;;
  *) echo "Invalid feature type $feat_type" && exit 1;
esac
# With --transform-dir, append the per-job transforms (trans.JOB) to the
# pipeline; that dir must have been produced with the same number of jobs.
if [ ! -z "$transform_dir" ]; then
  echo "$0: using transforms from $transform_dir"
  [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1;
  [ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \
    && echo "$0: #jobs mismatch with transform-dir." && exit 1;
  feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |"
# Otherwise warn if the source dir's training log shows transforms were used.
elif grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then
  echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms,"
  echo "  but you are not providing the --transform-dir option during alignment."
fi
##

## Set up model and alignment model.
## $alimdl falls back to final.mdl when no final.alimdl exists; the two being
## equal later selects the single-pass (no speaker-vectors) branch.
mdl=$srcdir/final.mdl
if [ -f $srcdir/final.alimdl ]; then
  alimdl=$srcdir/final.alimdl
else
  alimdl=$srcdir/final.mdl
fi
[ ! -f $mdl ] && echo "$0: no such model $mdl" && exit 1;

## Work out where we're getting the graphs from.
if $use_graphs; then
  # Re-using the source dir's graphs requires the same number of jobs.
  [ "$nj" != "`cat $srcdir/num_jobs`" ] && \
    echo "$0: you specified --use-graphs true, but #jobs mismatch." && exit 1;
  [ ! -f $srcdir/fsts.1.gz ] && echo "No graphs in $srcdir" && exit 1;
  graphdir=$srcdir
  # Also make the graphs available in the output dir via ln.pl.
  ln.pl $srcdir/fsts.*.gz $dir
else
  # Compile fresh graphs into the output dir (skipped when --stage > 0).
  graphdir=$dir
  if [ $stage -le 0 ]; then
    echo "$0: compiling training graphs"
    tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|";
    $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log  \
      compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/final.mdl  $lang/L.fst "$tra" \
        "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
  fi
fi

## Work out where we're getting the Gaussian-selection info from.
## This section only sets $gselect_opt.  It deliberately leaves $graphdir
## alone: that variable belongs to the graph section, and overriding it here
## would make "--use-gselect true --use-graphs false" read fsts.JOB.gz from
## the source dir instead of the graphs just compiled into $dir.
if $use_gselect; then
  # Re-using the source dir's gselect info requires the same number of jobs.
  [ "$nj" != "`cat $srcdir/num_jobs`" ] && \
    echo "$0: you specified --use-gselect true, but #jobs mismatch." && exit 1;
  [ ! -f $srcdir/gselect.1.gz ] && echo "No gselect info in $srcdir" && exit 1;
  gselect_opt="--gselect=ark,s,cs:gunzip -c $srcdir/gselect.JOB.gz|"
  # Also make the gselect files available in the output dir via ln.pl.
  ln.pl $srcdir/gselect.*.gz $dir
else
  if [ $stage -le 1 ]; then
    echo "$0: computing Gaussian-selection info"
    # Note: doesn't matter whether we use $alimdl or $mdl, they will
    # have the same gselect info.
    $cmd JOB=1:$nj $dir/log/gselect.JOB.log \
      sgmm2-gselect --full-gmm-nbest=$gselect $alimdl \
      "$feats" "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1;
  fi
  gselect_opt="--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|"
fi


# No separate alignment model: align once with $mdl and exit.  The script
# stops here in this case; none of the steps below are run.
if [ $alimdl == $mdl ]; then
  # Speaker-independent decoding-- just one pass.  Not normal.
  # T is the last field of the 'speaker vector space' line of sgmm2-info;
  # anything other than 0 is treated as an error here.
  T=`sgmm2-info $mdl | grep 'speaker vector space' | awk '{print $NF}'` || exit 1;
  [ "$T" -ne 0 ] && echo "No alignment model, yet speaker vector space nonempty" && exit 1;

  if [ $stage -le 2 ]; then
    echo "$0: aligning data in $data using model $mdl (no speaker-vectors)"
    # NOTE(review): unlike the passes further down, "$gselect_opt" is not
    # passed to sgmm2-align-compiled here -- confirm this is intended.
    $cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \
      sgmm2-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam $alimdl \
      "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
  fi
  echo "$0: done aligning data."
  exit 0;
fi

# Continue with system with speaker vectors.
# Stage 2: first alignment pass with the alignment model -> pre_ali.JOB.gz.
if [ $stage -le 2 ]; then
  echo "$0: aligning data in $data using model $alimdl"
  $cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \
    sgmm2-align-compiled $scale_opts "$gselect_opt" --beam=$beam --retry-beam=$retry_beam $alimdl \
    "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/pre_ali.JOB.gz" || exit 1;
fi

# Stage 3: first estimate of the speaker vectors (pre_vecs.JOB), from gposts
# computed with $alimdl; silence posteriors are weighted by 0.0.
if [ $stage -le 3 ]; then
  echo "$0: computing speaker vectors (1st pass)"
  $cmd JOB=1:$nj $dir/log/spk_vecs1.JOB.log \
    ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \
    weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \
    sgmm2-post-to-gpost "$gselect_opt" $alimdl "$feats" ark:- ark:- \| \
    sgmm2-est-spkvecs-gpost --spk2utt=ark:$sdata/JOB/spk2utt \
     $mdl "$feats" ark,s,cs:- ark:$dir/pre_vecs.JOB || exit 1;
fi

# Stage 4: second estimate (vecs.JOB), given pre_vecs.JOB via --spk-vecs;
# the first-pass vectors are deleted afterwards.
if [ $stage -le 4 ]; then
  echo "$0: computing speaker vectors (2nd pass)"
  $cmd JOB=1:$nj $dir/log/spk_vecs2.JOB.log \
    ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \
    weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \
    sgmm2-est-spkvecs --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" \
     --spk-vecs=ark:$dir/pre_vecs.JOB $mdl "$feats" ark,s,cs:- ark:$dir/vecs.JOB || exit 1;
  rm $dir/pre_vecs.*
fi

# Stage 5: final alignment with the final model and the speaker vectors.
if [ $stage -le 5 ]; then
  echo "$0: doing final alignment."
  $cmd JOB=1:$nj $dir/log/align_pass2.JOB.log \
    sgmm2-align-compiled $scale_opts "$gselect_opt" --beam=$beam --retry-beam=$retry_beam \
     --utt2spk=ark:$sdata/JOB/utt2spk --spk-vecs=ark:$dir/vecs.JOB \
     $mdl "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi

# The first-pass alignments are not needed any more.
rm $dir/pre_ali.*.gz

echo "$0: done aligning data."

steps/diagnostic/analyze_alignments.sh --cmd "$cmd" $lang $dir

utils/summarize_warnings.pl $dir/log

exit 0;


================================================
FILE: egs/steps/align_si.sh
================================================
#!/usr/bin/env bash
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0

# Computes training alignments using a model with delta or
# LDA+MLLT features.

# If you supply the "--use-graphs true" option, it will use the training
# graphs from the source directory (where the model is).  In this
# case the number of jobs must match with the source directory.


# Begin configuration section.
# Defaults for the script options; they can be overridden on the command line
# (e.g. --nj, --use-graphs; see the usage message) once parse_options.sh runs.
nj=4               # number of parallel jobs
cmd=run.pl         # how to run jobs
use_graphs=false   # if true, use the training graphs found in <src-dir>
# Alignment options.
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
beam=10            # passed to gmm-align-compiled as --beam
retry_beam=40      # passed to gmm-align-compiled as --retry-beam
careful=false      # passed to gmm-align-compiled as --careful
boost_silence=1.0 # Factor by which to boost silence during alignment.
# End configuration options.

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh # source the path.
. parse_options.sh || exit 1;

# Exactly four positional arguments are required; otherwise print the usage
# text and abort.
if [ $# != 4 ]; then
   echo "usage: steps/align_si.sh <data-dir> <lang-dir> <src-dir> <align-dir>"
   echo "e.g.:  steps/align_si.sh data/train data/lang exp/tri1 exp/tri1_ali"
   echo "main options (for others, see top of script file)"
   echo "  --config <config-file>                           # config containing options"
   echo "  --nj <nj>                                        # number of parallel jobs"
   echo "  --use-graphs true                                # use graphs in src-dir"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   exit 1;
fi

# Positional arguments: <data-dir> <lang-dir> <src-dir> <align-dir>.
data=$1
lang=$2
srcdir=$3
dir=$4


# Fail early if any of the mandatory inputs is missing.
for f in $data/text $lang/oov.int $srcdir/tree $srcdir/final.mdl; do
  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1;
done

oov=`cat $lang/oov.int` || exit 1;
mkdir -p $dir/log
echo $nj > $dir/num_jobs
sdata=$data/split$nj
# splice_opts / cmvn_opts / delta_opts may be absent in the source dir:
# errors are silenced and the variables are then empty.
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options.
cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
cp $srcdir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option.
delta_opts=`cat $srcdir/delta_opts 2>/dev/null`
cp $srcdir/delta_opts $dir 2>/dev/null

# Re-split the data unless the split dir exists and feats.scp is older than it.
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;

utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

# tree and final.mdl are mandatory; final.occs is copied without checking
# the result.
cp $srcdir/{tree,final.mdl} $dir || exit 1;
cp $srcdir/final.occs $dir;


# Build "$feats".  The presence of final.mat in the source dir selects the
# splice+transform pipeline ("lda"); otherwise deltas are added ("delta").
feat_type=delta
if [ -f $srcdir/final.mat ]; then
  feat_type=lda
fi
echo "$0: feature type is $feat_type"

# Both pipelines begin with the same apply-cmvn stage.
cmvn_stage="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"

if [ "$feat_type" = delta ]; then
  feats="$cmvn_stage add-deltas $delta_opts ark:- ark:- |"
elif [ "$feat_type" = lda ]; then
  feats="$cmvn_stage splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
  # Keep the matrices next to the alignments.
  cp $srcdir/final.mat $srcdir/full.mat $dir
else
  echo "$0: invalid feature type $feat_type" && exit 1;
fi

echo "$0: aligning data in $data using model from $srcdir, putting alignments in $dir"

# The aligner reads the model through gmm-boost-silence, which is given
# --boost=$boost_silence and the optional-silence phone list.
mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/final.mdl - |"

if $use_graphs; then
  # Re-use the source dir's graphs; its job count must equal ours.
  [ $nj != "`cat $srcdir/num_jobs`" ] && echo "$0: mismatch in num-jobs" && exit 1;
  [ ! -f $srcdir/fsts.1.gz ] && echo "$0: no such file $srcdir/fsts.1.gz" && exit 1;

  $cmd JOB=1:$nj $dir/log/align.JOB.log \
    gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam --careful=$careful "$mdl" \
      "ark:gunzip -c $srcdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
else
  # Compile the graphs on the fly and pipe them straight into the aligner;
  # no fsts.JOB.gz files are written in this branch.
  # NOTE(review): this branch writes ali.JOB.gz with "ark,t:" whereas the
  # --use-graphs branch uses plain "ark:" -- confirm the difference is intended.
  tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|";
  # We could just use gmm-align in the next line, but it's less efficient as it compiles the
  # training graphs one by one.
  $cmd JOB=1:$nj $dir/log/align.JOB.log \
    compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/final.mdl  $lang/L.fst "$tra" ark:- \| \
    gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam --careful=$careful "$mdl" ark:- \
      "$feats" "ark,t:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi

steps/diagnostic/analyze_alignments.sh --cmd "$cmd" $lang $dir

echo "$0: done aligning data."


================================================
FILE: egs/steps/best_path_weights.sh
================================================
#!/usr/bin/env bash

# Copyright 2014-17 Vimal Manohar

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# This script gets from the lattice the best path alignments and frame-level
# posteriors of the pdfs in the best path alignment.
# The output directory has the format of an alignment directory.
# It can optionally read alignments from a directory, in which case,
# the script gets frame-level posteriors of the pdf corresponding to those
# alignments.
# The frame-level posteriors are stored as Kaldi vectors and are
# output in weights.scp.

# Abort as soon as any command fails.
set -e

# begin configuration section.
# cmd:   job launcher used as the prefix of the parallel commands in this script.
# stage: compared against 1 and 2 further down to decide which steps to run.
# acwt:  passed as --acoustic-scale to lattice-best-path and lattice-to-post.
cmd=run.pl
stage=-10
acwt=0.1
#end configuration section.

# Source the environment if present, then let parse_options.sh override the
# variables above from the command line.
if [ -f ./path.sh ]; then . ./path.sh; fi
. utils/parse_options.sh || exit 1;

# Exactly 3 or 4 positional arguments are accepted; anything else prints usage.
if [ $# -ne 3 ] && [ $# -ne 4 ]; then
  cat <<EOF
    Usage: $0 [options] <data-dir> <decode-dir> [<ali-dir>] <out-dir>
      E.g. $0 data/train_unt.seg exp/tri1/decode exp/tri1/best_path
    Options:
      --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes.
EOF
  
  exit 1;
fi

# NOTE: $data is assigned here but is not referenced anywhere else in this script.
data=$1
decode_dir=$2
dir=${@: -1}  # last argument to the script

# With 3 arguments the alignments are read from the output directory itself
# (i.e. the best-path alignments written below); with 4 arguments they come
# from the user-supplied <ali-dir>.
ali_dir=$dir
if [ $# -eq 4 ]; then
  ali_dir=$3
fi

mkdir -p $dir

# The output directory inherits the job count of the decode directory.
nj=$(cat $decode_dir/num_jobs)
echo $nj > $dir/num_jobs

# Step 1: run lattice-best-path on every lat.JOB.gz of the decode directory.
# The first output is discarded (ark:/dev/null); the second output is
# gzipped into $dir/ali.JOB.gz.
if [ $stage -le 1 ]; then
  mkdir -p $dir/log
  $cmd JOB=1:$nj $dir/log/best_path.JOB.log \
    lattice-best-path --acoustic-scale=$acwt \
      "ark,s,cs:gunzip -c $decode_dir/lat.JOB.gz |" \
      ark:/dev/null "ark:| gzip -c > $dir/ali.JOB.gz" || exit 1
fi

# Find where the final.mdl is.
# Prefer the parent of the decode directory; fall back to the decode
# directory itself when the parent has no final.mdl.
if [ -f $(dirname $decode_dir)/final.mdl ]; then
  src_dir=$(dirname $decode_dir)
else
  src_dir=$decode_dir
fi

# cmvn_opts is mandatory (the script exits if the copy fails); the remaining
# files are copied only when they exist in $src_dir.
cp $src_dir/cmvn_opts $dir/ || exit 1
for f in final.mat splice_opts frame_subsampling_factor; do
  if [ -f $src_dir/$f ]; then cp $src_dir/$f $dir; fi
done

# make $dir an absolute pathname.
# (the result is stored in $fdir and used for the weights.JOB.ark/scp paths)
fdir=$(perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $dir ${PWD})

model=$src_dir/final.mdl
tree=$src_dir/tree

# Sanity check: the model, the tree and at least the first lattice archive
# must exist before going any further.
for f in $model $decode_dir/lat.1.gz $tree; do
  if [ ! -f $f ]; then echo "$0: expecting file $f to exist" && exit 1; fi
done

cp $model $tree $dir || exit 1

# The lattices and the alignments are paired up per JOB index below, so both
# directories must have been produced with the same number of jobs.
ali_nj=$(cat $ali_dir/num_jobs) || exit 1
if [ $nj -ne $ali_nj ]; then
  echo "$0: $decode_dir and $ali_dir have different number of jobs. Redo alignment with $nj jobs."
  exit 1
fi

# Step 2 (runs for the same stage values as the "-le 1" step above): for each
# job, pipe lattice-to-post through post-to-pdf-post into get-post-on-ali.
# The alignment input of get-post-on-ali is $ali_dir/ali.JOB.gz passed through
# convert-ali and ali-to-pdf.  The result is written as an ark/scp pair using
# the absolute path $fdir.
if [ $stage -lt 2 ]; then
  $cmd JOB=1:$nj $dir/log/get_post.JOB.log \
    lattice-to-post --acoustic-scale=$acwt \
      "ark,s,cs:gunzip -c $decode_dir/lat.JOB.gz|" ark:- \| \
    post-to-pdf-post $model ark,s,cs:- ark:- \| \
    get-post-on-ali ark,s,cs:- \
    "ark,s,cs:gunzip -c $ali_dir/ali.JOB.gz | convert-ali $dir/final.mdl $model $tree ark,s,cs:- ark:- | ali-to-pdf $model ark,s,cs:- ark:- |" \
    "ark,scp:$fdir/weights.JOB.ark,$fdir/weights.JOB.scp" || exit 1
fi

# Concatenate the per-job scp files, in job order, into a single weights.scp.
for n in `seq $nj`; do
  cat $dir/weights.$n.scp 
done > $dir/weights.scp

# Remove the per-job scp files; the glob needs two dots, so the merged
# weights.scp written just above is not matched.
rm $dir/weights.*.scp

exit 0


================================================
FILE: egs/steps/cleanup/clean_and_segment_data.sh
================================================
#!/usr/bin/env bash

# Copyright 2016  Vimal Manohar
#           2016  Johns Hopkins University (author: Daniel Povey)
# Apache 2.0

# This script demonstrates how to re-segment training data selecting only the
# "good" audio that matches the transcripts.
# The basic idea is to decode with an existing in-domain GMM acoustic model, and
# a biased language model built from the reference transcript, and then work out
# the segmentation from a ctm like file.

# Abort on the first failing command, including failures inside pipelines.
set -e -o pipefail

# First stage to run; every step below is guarded by "[ $stage -le N ]".
stage=0

# cmd:               job launcher for the parallel steps.
# cleanup:           when true, intermediate graphs/lattices are removed at the end.
# nj:                number of parallel jobs for graph creation and decoding.
# graph_opts:        extra options forwarded to make_biased_lm_graphs.sh.
# segmentation_opts: extra options forwarded to segment_ctm_edits.py.
cmd=run.pl
cleanup=true
nj=4
graph_opts=
segmentation_opts=

. ./path.sh
. utils/parse_options.sh


# Exactly five positional arguments are required; otherwise print usage and exit.
if [ $# -ne 5 ]; then
  echo "Usage: $0 [options] <data> <lang> <srcdir> <dir> <cleaned-data>"
  echo " This script does data cleanup to remove bad portions of transcripts and"
  echo " may do other minor modifications of transcripts such as allowing repetitions"
  echo " for disfluencies, and adding or removing non-scored words (by default:"
  echo " words that map to 'silence phones')"
  echo " Note: <srcdir> is expected to contain a GMM-based model, preferably a"
  echo " SAT-trained one (see train_sat.sh)."
  echo " If <srcdir> contains fMLLR transforms (trans.*) they are assumed to"
  echo " be transforms corresponding to the data in <data>.  If <srcdir> is for different"
  echo " dataset, and you're using SAT models, you should align <data> with <srcdir>"
  echo " using align_fmllr.sh, and supply that directory as <srcdir>"
  echo ""
  echo "e.g. $0 data/train data/lang exp/tri3 exp/tri3_cleanup data/train_cleaned"
  echo "Options:"
  echo "  --stage <n>             # stage to run from, to enable resuming from partially"
  echo "                          # completed run (default: 0)"
  echo "  --cmd '$cmd'            # command to submit jobs with (e.g. run.pl, queue.pl)"
  echo "  --nj <n>                # number of parallel jobs to use in graph creation and"
  echo "                          # decoding"
  echo "  --segmentation-opts 'opts'  # Additional options to segment_ctm_edits.py."
  echo "                              # Please run steps/cleanup/internal/segment_ctm_edits.py"
  echo "                              # without arguments to see allowed options."
  echo "  --graph-opts 'opts'         # Additional options to make_biased_lm_graphs.sh."
  echo "                              # Please run steps/cleanup/make_biased_lm_graphs.sh"
  echo "                              # without arguments to see allowed options."
  echo "  --cleanup        <true|false>  # Clean up intermediate files afterward.  Default true."
  exit 1

fi

data=$1
lang=$2
srcdir=$3
dir=$4
data_out=$5


# All of these inputs are mandatory; fail early with the name of the first
# one that is missing.
for f in $srcdir/{final.mdl,tree,cmvn_opts} $data/utt2spk $data/feats.scp $lang/words.txt $lang/oov.txt; do
  if [ ! -f $f ]; then
    echo "$0: expected file $f to exist."
    exit 1
  fi
done

# Copy the model files into the working directory.  The last cp is
# best-effort: those files are optional, so errors are silenced and ignored.
mkdir -p $dir
cp $srcdir/final.mdl $dir
cp $srcdir/tree $dir
cp $srcdir/cmvn_opts $dir
cp $srcdir/{splice_opts,delta_opts,final.mat,final.alimdl} $dir 2>/dev/null || true

# Under "set -e" the script stops here if the compatibility check fails.
utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt
cp $lang/phones.txt $dir

# Stage 1: build the biased-LM decoding graphs into $dir/graphs.
# $graph_opts is deliberately unquoted so that it expands to several options.
if [ $stage -le 1 ]; then
  echo "$0: Building biased-language-model decoding graphs..."
  steps/cleanup/make_biased_lm_graphs.sh $graph_opts \
    --nj $nj --cmd "$cmd" \
     $data $lang $dir $dir/graphs
fi

# Stage 2: decode $data with those graphs, writing lattices to $dir/lats.
if [ $stage -le 2 ]; then
  echo "$0: Decoding with biased language models..."
  transform_opt=
  if [ -f $srcdir/trans.1 ]; then
    # If srcdir contained trans.* then we assume they are fMLLR transforms for
    # this data, and we use them.
    transform_opt="--transform-dir $srcdir"
  fi
  # Note: the --beam 15.0 (vs. the default 13.0) does actually slow it
  # down substantially, around 0.35xRT to 0.7xRT on tedlium.
  # I want to test at some point whether it's actually necessary to have
  # this largish beam.
  steps/cleanup/decode_segmentation.sh \
      --beam 15.0 --nj $nj --cmd "$cmd --mem 4G" $transform_opt \
      --skip-scoring true --allow-partial false \
       $dir/graphs $data $dir/lats

  # the following is for diagnostics, e.g. it will give us the lattice depth.
  steps/diagnostic/analyze_lats.sh --cmd "$cmd" $lang $dir/lats
fi

# Stage 3: oracle-align the lattices; the output directory
# $dir/lattice_oracle provides the ctm_edits file read by later stages.
if [ $stage -le 3 ]; then
  echo "$0: Doing oracle alignment of lattices..."
  steps/cleanup/lattice_oracle_align.sh \
    --cmd "$cmd" $data $lang $dir/lats $dir/lattice_oracle
fi


# Stage 4: write the list of non-scored words to $dir/non_scored_words.txt.
if [ $stage -le 4 ]; then
  echo "$0: using default values of non-scored words..."

  # At the level of this script we just hard-code it that non-scored words are
  # those that map to silence phones (which is what get_non_scored_words.py
  # gives us), although this could easily be made user-configurable.  This list
  # of non-scored words affects the behavior of several of the data-cleanup
  # scripts; essentially, we view the non-scored words as negotiable when it
  # comes to the reference transcript, so we'll consider changing the reference
  # to match the hyp when it comes to these words.
  steps/cleanup/internal/get_non_scored_words.py $lang > $dir/non_scored_words.txt
fi

# Stage 5: lattice_oracle/ctm_edits -> ctm_edits.modified.
if [ $stage -le 5 ]; then
  echo "$0: modifying ctm-edits file to allow repetitions [for dysfluencies] and "
  echo "   ... to fix reference mismatches involving non-scored words. "

  $cmd $dir/log/modify_ctm_edits.log \
    steps/cleanup/internal/modify_ctm_edits.py --verbose=3 $dir/non_scored_words.txt \
    $dir/lattice_oracle/ctm_edits $dir/ctm_edits.modified

  echo "   ... See $dir/log/modify_ctm_edits.log for details and stats, including"
  echo " a list of commonly-repeated words."
fi

# Stage 6: ctm_edits.modified -> ctm_edits.tainted.
if [ $stage -le 6 ]; then
  echo "$0: applying 'taint' markers to ctm-edits file to mark silences and"
  echo "  ... non-scored words that are next to errors."
  $cmd $dir/log/taint_ctm_edits.log \
       steps/cleanup/internal/taint_ctm_edits.py $dir/ctm_edits.modified $dir/ctm_edits.tainted
  echo "... Stats, including global cor/ins/del/sub stats, are in $dir/log/taint_ctm_edits.log."
fi


# Stage 7: ctm_edits.tainted -> $dir/text and $dir/segments, plus the
# diagnostic files ctm_edits.segmented and word_stats.txt.  The log of the
# segmentation script is echoed to the terminal afterwards.
if [ $stage -le 7 ]; then
  echo "$0: creating segmentation from ctm-edits file."

  $cmd $dir/log/segment_ctm_edits.log \
    steps/cleanup/internal/segment_ctm_edits.py \
       $segmentation_opts \
       --oov-symbol-file=$lang/oov.txt \
      --ctm-edits-out=$dir/ctm_edits.segmented \
      --word-stats-out=$dir/word_stats.txt \
      $dir/non_scored_words.txt \
      $dir/ctm_edits.tainted $dir/text $dir/segments

  echo "$0: contents of $dir/log/segment_ctm_edits.log are:"
  cat $dir/log/segment_ctm_edits.log
  echo "For word-level statistics on p(not-being-in-a-segment), with 'worst' words at the top,"
  echo "see $dir/word_stats.txt"
  echo "For detailed utterance-level debugging information, see $dir/ctm_edits.segmented"
fi

# Stage 8 (first part): estimate the end-of-segment padding as the most
# frequent difference between the duration in $data/utt2dur and the
# duration implied by the ctm, and store it in $dir/segment_end_padding.
if [ $stage -le 8 ]; then
  echo "$0: working out required segment padding to account for feature-generation edge effects."
  # make sure $data/utt2dur exists.
  utils/data/get_utt2dur.sh $data
  # utt2dur.from_ctm contains lines of the form 'utt dur',  e.g.
  # AMI_EN2001a_H00_MEE068_0000557_0000594 0.35
  # where the times are ultimately derived from the num-frames in the features.
  # (the awk program keeps, per utterance, the largest value of field 3 + field 4)
  cat $dir/lattice_oracle/ctm_edits | \
     awk '{utt=$1; t=$3+$4; if (t > dur[$1]) dur[$1] = t; } END{for (k in dur) print k, dur[k];}' | \
     sort > $dir/utt2dur.from_ctm
  # the apply_map command below gives us lines of the form 'utt dur-from-$data/utt2dur dur-from-utt2dur.from_ctm',
  # e.g. AMI_EN2001a_H00_MEE068_0000557_0000594 0.37 0.35
  utils/apply_map.pl -f 1 <(awk '{print $1,$1,$2}' <$data/utt2dur) <$dir/utt2dur.from_ctm  | \
    awk '{printf("%.3f\n", $2 - $3); }' | sort | uniq -c | sort -nr > $dir/padding_frequencies
  # there are values other than the most-frequent one (0.02) in there because
  # of wav files that were shorter than the segment info.
  # The first line of padding_frequencies is the most frequent difference;
  # its second field is the padding value itself.
  padding=$(head -n 1 $dir/padding_frequencies | awk '{print $2}')
  echo "$0: we'll pad segments with $padding seconds at segment ends to correct for feature-generation end effects"
  echo $padding >$dir/segment_end_padding
fi


# NOTE(review): this step is guarded by the same stage number (8) as the
# padding computation above, so the two always run or are skipped together.
if [ $stage -le 8 ]; then
  echo "$0: based on the segments and text file in $dir/segments and $dir/text, creating new data-dir in $data_out"
  padding=$(cat $dir/segment_end_padding)  # e.g. 0.02
  utils/data/subsegment_data_dir.sh --segment-end-padding $padding ${data} $dir/segments $dir/text $data_out
  # utils/data/subsegment_data_dir.sh can output directories that have e.g. too many entries left in wav.scp
  # Clean this up with the fix_data_dir.sh script
  utils/fix_data_dir.sh $data_out
fi

if [ $stage -le 9 ]; then
  echo "$0: recomputing CMVN stats for the new data"
  # Caution: this script puts the CMVN stats in $data_out/data,
  # e.g. data/train_cleaned/data.  This is not the general pattern we use.
  steps/compute_cmvn_stats.sh $data_out $data_out/log $data_out/data
fi

# $cleanup holds the word "true" or "false" and is executed as a command.
# Each rm is followed by "|| true" so that a missing file does not abort the
# script under "set -e".
if $cleanup; then
  echo "$0: cleaning up intermediate files"
  rm -r $dir/graphs/fsts $dir/graphs/HCLG.fsts.scp || true
  rm -r $dir/lats/lat.*.gz $dir/lats/split_fsts || true
  rm $dir/lattice_oracle/lat.*.gz || true
fi

echo "$0: done."


================================================
FILE: egs/steps/cleanup/clean_and_segment_data_nnet3.sh
================================================
#!/usr/bin/env bash

# Copyright 2016  Vimal Manohar
#           2016  Johns Hopkins University (author: Daniel Povey)
# Apache 2.0

# This script demonstrates how to re-segment training data selecting only the
# "good" audio that matches the transcripts.
# This script is like clean_and_segment_data.sh, but uses nnet3 model instead of
# a GMM for decoding.
# The basic idea is to decode with an existing in-domain nnet3 acoustic model,
# and a biased language model built from the reference transcript, and then work
# out the segmentation from a ctm like file.

# Abort on failing commands (also inside pipelines) and on use of unset variables.
set -e
set -o pipefail
set -u

# First stage to run; every step below is guarded by "[ $stage -le N ]".
stage=0

cmd=run.pl
cleanup=true  # remove temporary directories and files
nj=4
# Decode options
# graph_opts and scale_opts are forwarded to make_biased_lm_graphs.sh;
# beam and lattice_beam are forwarded to decode_segmentation_nnet3.sh.
graph_opts=
scale_opts=
beam=15.0
lattice_beam=1.0

# Both values are replaced further down when <srcdir> looks like a chain
# system and they still hold these defaults.
# NOTE(review): lmwt is not passed to any command in this script.
acwt=0.1  # Just a default value, used for adaptation and beam-pruning..
lmwt=10

# Contexts must ideally match training
extra_left_context=0  # Set to some large value, typically 40 for LSTM (must match training)
extra_right_context=0
extra_left_context_initial=-1
extra_right_context_final=-1
frames_per_chunk=150

# i-vector options
extractor=    # i-Vector extractor. If provided, will extract i-vectors.
              # Required if the network was trained with i-vector extractor.
use_vad=false # Use energy-based VAD for i-vector extraction

# Extra options forwarded to segment_ctm_edits.py.
segmentation_opts=

. ./path.sh
. utils/parse_options.sh


# Exactly five positional arguments are required; otherwise print usage and exit.
if [ $# -ne 5 ]; then
  cat <<EOF
  Usage: $0 [--extractor <ivector-extractor>] [options] <data> <lang> <srcdir> <dir> <cleaned-data>
   This script does data cleanup to remove bad portions of transcripts and
   may do other minor modifications of transcripts such as allowing repetitions
   for disfluencies, and adding or removing non-scored words (by default:
   words that map to 'silence phones')
   Note: <srcdir> is expected to contain a nnet3-based model.
   <ivector-extractor> and decoding options like --extra-left-context must match
   the appropriate options used for training.

  e.g. $0 data/train data/lang exp/tri3 exp/tri3_cleanup data/train_cleaned
  main options (for others, see top of script file):
    --stage <n>             # stage to run from, to enable resuming from partially
                            # completed run (default: 0)
    --cmd '$cmd'            # command to submit jobs with (e.g. run.pl, queue.pl)
    --nj <n>                # number of parallel jobs to use in graph creation and
                            # decoding
    --graph-opts 'opts'         # Additional options to make_biased_lm_graphs.sh.
                                # Please run steps/cleanup/make_biased_lm_graphs.sh
                                # without arguments to see allowed options.
    --segmentation-opts 'opts'  # Additional options to segment_ctm_edits.py.
                                # Please run steps/cleanup/internal/segment_ctm_edits.py
                                # without arguments to see allowed options.
    --cleanup        <true|false>  # Clean up intermediate files afterward.  Default true.
    --extractor <extractor>     # i-vector extractor directory if i-vector is
                                # to be used during decoding. Must match
                                # the extractor used for training neural-network.
    --use-vad <true|false>      # If true, uses energy-based VAD to apply frame weights
                                # for i-vector stats extraction
EOF
  exit 1
fi

data=$1
lang=$2
srcdir=$3
dir=$4
data_out=$5


# When an i-vector extractor was given, its final.ie must exist as well.
extra_files=
if [ ! -z "$extractor" ]; then
  extra_files="$extractor/final.ie"
fi

# All of these inputs are mandatory; fail early with the name of the first
# one that is missing.
for f in $srcdir/{final.mdl,tree,cmvn_opts} $data/utt2spk $data/feats.scp \
  $lang/words.txt $lang/oov.txt $extra_files; do
  if [ ! -f $f ]; then
    echo "$0: expected file $f to exist."
    exit 1
  fi
done

# Copy the model files into the working directory.  The last two cp commands
# are best-effort: those files are optional, so errors are silenced and ignored.
mkdir -p $dir
cp $srcdir/final.mdl $dir
cp $srcdir/tree $dir
cp $srcdir/cmvn_opts $dir
cp $srcdir/{splice_opts,delta_opts,final.mat,final.alimdl} $dir 2>/dev/null || true
cp $srcdir/frame_subsampling_factor $dir 2>/dev/null || true

# The presence of frame_subsampling_factor is taken as the sign of a chain
# system.  In that case any of scale_opts/acwt/lmwt that still holds its
# default is switched to the chain value; values set by the user are kept.
if [ -f $srcdir/frame_subsampling_factor ]; then
  echo "$0: guessing that this is a chain system, checking parameters."
  # The expansions are quoted: a user-supplied --scale-opts normally contains
  # several words, which would make an unquoted "[ -z ... ]" fail with
  # "too many arguments" instead of evaluating cleanly to false.
  if [ -z "$scale_opts" ]; then
    echo "$0: setting scale_opts"
    scale_opts="--self-loop-scale=1.0 --transition-scale=1.0"
  fi
  if [ "$acwt" == 0.1 ]; then
    echo "$0: setting acwt=1.0"
    acwt=1.0
  fi
  if [ "$lmwt" == 10 ]; then
    echo "$0: setting lmwt=1.0"
    lmwt=1
  fi
fi

# Under "set -e" the script stops here if the compatibility check fails.
utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt
cp $lang/phones.txt $dir

# Stage 1: build the biased-LM decoding graphs into $dir/graphs.
# $graph_opts is deliberately unquoted so that it expands to several options.
if [ $stage -le 1 ]; then
  echo "$0: Building biased-language-model decoding graphs..."


  steps/cleanup/make_biased_lm_graphs.sh $graph_opts --scale-opts "$scale_opts" \
    --nj $nj --cmd "$cmd" \
     $data $lang $dir $dir/graphs
fi

# Stage 2 (only when --extractor was given): extract online i-vectors for
# $data into $dir/ivectors_<basename of data>.  online_ivector_dir stays
# empty otherwise.
online_ivector_dir=
if [ ! -z "$extractor" ]; then
  online_ivector_dir=$dir/ivectors_$(basename $data)

  if [ $stage -le 2 ]; then
    # Compute energy-based VAD
    if $use_vad; then
      steps/compute_vad_decision.sh $data \
        $data/log $data/data
    fi

    steps/online/nnet2/extract_ivectors_online.sh \
      --nj $nj --cmd "$cmd --mem 4G" --use-vad $use_vad \
      $data $extractor $online_ivector_dir
  fi
fi

# Stage 3: decode $data with the biased graphs, writing lattices to $dir/lats.
# The ${online_ivector_dir:+...} expansion adds --online-ivector-dir only
# when online_ivector_dir is non-empty.
if [ $stage -le 3 ]; then
  echo "$0: Decoding with biased language models..."

  steps/cleanup/decode_segmentation_nnet3.sh \
    --acwt $acwt  \
    --beam $beam --lattice-beam $lattice_beam --nj $nj --cmd "$cmd --mem 4G" \
    --skip-scoring true --allow-partial false \
    --extra-left-context $extra_left_context \
    --extra-right-context $extra_right_context \
    --extra-left-context-initial $extra_left_context_initial \
    --extra-right-context-final $extra_right_context_final \
    --frames-per-chunk $frames_per_chunk \
    ${online_ivector_dir:+--online-ivector-dir $online_ivector_dir} \
    $dir/graphs $data $dir/lats

  # the following is for diagnostics, e.g. it will give us the lattice depth.
  steps/diagnostic/analyze_lats.sh --cmd "$cmd" $lang $dir/lats
fi

# The frame shift is built by string concatenation: "0.0" followed by the
# contents of frame_subsampling_factor (e.g. a factor of 3 gives 0.03).
# NOTE(review): this presumably assumes a 10 ms base frame shift and a
# single-digit factor -- confirm before using other configurations.
frame_shift_opt=
if [ -f $srcdir/frame_subsampling_factor ]; then
  frame_shift_opt="--frame-shift 0.0$(cat $srcdir/frame_subsampling_factor)"
fi

# Stage 4: oracle-align the lattices; the output directory
# $dir/lattice_oracle provides the ctm_edits file read by later stages.
if [ $stage -le 4 ]; then
  echo "$0: Doing oracle alignment of lattices..."
  steps/cleanup/lattice_oracle_align.sh --cmd "$cmd --mem 4G" $frame_shift_opt \
    $data $lang $dir/lats $dir/lattice_oracle
fi


# Write the list of non-scored words to $dir/non_scored_words.txt.
# NOTE(review): this step uses stage number 4, the same number as the
# oracle-alignment step earlier in this script, so the two always run or
# are skipped together.
if [ $stage -le 4 ]; then
  echo "$0: using default values of non-scored words..."

  # At the level of this script we just hard-code it that non-scored words are
  # those that map to silence phones (which is what get_non_scored_words.py
  # gives us), although this could easily be made user-configurable.  This list
  # of non-scored words affects the behavior of several of the data-cleanup
  # scripts; essentially, we view the non-scored words as negotiable when it
  # comes to the reference transcript, so we'll consider changing the reference
  # to match the hyp when it comes to these words.
  steps/cleanup/internal/get_non_scored_words.py $lang > $dir/non_scored_words.txt
fi

# Stage 5: lattice_oracle/ctm_edits -> ctm_edits.modified.
if [ $stage -le 5 ]; then
  echo "$0: modifying ctm-edits file to allow repetitions [for dysfluencies] and "
  echo "   ... to fix reference mismatches involving non-scored words. "

  $cmd $dir/log/modify_ctm_edits.log \
    steps/cleanup/internal/modify_ctm_edits.py --verbose=3 $dir/non_scored_words.txt \
    $dir/lattice_oracle/ctm_edits $dir/ctm_edits.modified

  echo "   ... See $dir/log/modify_ctm_edits.log for details and stats, including"
  echo " a list of commonly-repeated words."
fi

# Stage 6: ctm_edits.modified -> ctm_edits.tainted.
if [ $stage -le 6 ]; then
  echo "$0: applying 'taint' markers to ctm-edits file to mark silences and"
  echo "  ... non-scored words that are next to errors."
  $cmd $dir/log/taint_ctm_edits.log \
       steps/cleanup/internal/taint_ctm_edits.py $dir/ctm_edits.modified $dir/ctm_edits.tainted
  echo "... Stats, including global cor/ins/del/sub stats, are in $dir/log/taint_ctm_edits.log."
fi


# Stage 7: ctm_edits.tainted -> $dir/text and $dir/segments, plus the
# diagnostic files ctm_edits.segmented and word_stats.txt.  The log of the
# segmentation script is echoed to the terminal afterwards.
if [ $stage -le 7 ]; then
  echo "$0: creating segmentation from ctm-edits file."

  $cmd $dir/log/segment_ctm_edits.log \
    steps/cleanup/internal/segment_ctm_edits.py \
      $segmentation_opts \
      --oov-symbol-file=$lang/oov.txt \
      --ctm-edits-out=$dir/ctm_edits.segmented \
      --word-stats-out=$dir/word_stats.txt \
      $dir/non_scored_words.txt \
      $dir/ctm_edits.tainted $dir/text $dir/segments

  echo "$0: contents of $dir/log/segment_ctm_edits.log are:"
  cat $dir/log/segment_ctm_edits.log
  echo "For word-level statistics on p(not-being-in-a-segment), with 'worst' words at the top,"
  echo "see $dir/word_stats.txt"
  echo "For detailed utterance-level debugging information, see $dir/ctm_edits.segmented"
fi

# Stage 8 (first part): estimate the end-of-segment padding as the most
# frequent difference between the duration in $data/utt2dur and the
# duration implied by the ctm, and store it in $dir/segment_end_padding.
if [ $stage -le 8 ]; then
  echo "$0: working out required segment padding to account for feature-generation edge effects."
  # make sure $data/utt2dur exists.
  utils/data/get_utt2dur.sh $data
  # utt2dur.from_ctm contains lines of the form 'utt dur',  e.g.
  # AMI_EN2001a_H00_MEE068_0000557_0000594 0.35
  # where the times are ultimately derived from the num-frames in the features.
  # (the awk program keeps, per utterance, the largest value of field 3 + field 4)
  cat $dir/lattice_oracle/ctm_edits | \
     awk '{utt=$1; t=$3+$4; if (t > dur[$1]) dur[$1] = t; } END{for (k in dur) print k, dur[k];}' | \
     sort > $dir/utt2dur.from_ctm
  # the apply_map command below gives us lines of the form 'utt dur-from-$data/utt2dur dur-from-utt2dur.from_ctm',
  # e.g. AMI_EN2001a_H00_MEE068_0000557_0000594 0.37 0.35
  utils/apply_map.pl -f 1 <(awk '{print $1,$1,$2}' <$data/utt2dur) <$dir/utt2dur.from_ctm  | \
    awk '{printf("%.3f\n", $2 - $3); }' | sort | uniq -c | sort -nr > $dir/padding_frequencies
  # there are values other than the most-frequent one (0.02) in there because
  # of wav files that were shorter than the segment info.
  # The first line of padding_frequencies is the most frequent difference;
  # its second field is the padding value itself.
  padding=$(head -n 1 $dir/padding_frequencies | awk '{print $2}')
  echo "$0: we'll pad segments with $padding seconds at segment ends to correct for feature-generation end effects"
  echo $padding >$dir/segment_end_padding
fi


# NOTE(review): this step is guarded by the same stage number (8) as the
# padding computation above, so the two always run or are skipped together.
if [ $stage -le 8 ]; then
  echo "$0: based on the segments and text file in $dir/segments and $dir/text, creating new data-dir in $data_out"
  padding=$(cat $dir/segment_end_padding)  # e.g. 0.02
  utils/data/subsegment_data_dir.sh --segment-end-padding $padding ${data} $dir/segments $dir/text $data_out
  # utils/data/subsegment_data_dir.sh can output directories that have e.g. too many entries left in wav.scp
  # Clean this up with the fix_data_dir.sh script
  utils/fix_data_dir.sh $data_out
fi

if [ $stage -le 9 ]; then
  echo "$0: recomputing CMVN stats for the new data"
  # Caution: this script puts the CMVN stats in $data_out/data,
  # e.g. data/train_cleaned/data.  This is not the general pattern we use.
  steps/compute_cmvn_stats.sh $data_out $data_out/log $data_out/data
fi

# $cleanup holds the word "true" or "false" and is executed as a command.
# Each rm is followed by "|| true" so that a missing file does not abort the
# script under "set -e".
if $cleanup; then
  echo "$0: cleaning up intermediate files"
  rm -r $dir/graphs/fsts $dir/graphs/HCLG.fsts.scp || true
  rm -r $dir/lats/lat.*.gz $dir/lats/split_fsts || true
  rm $dir/lattice_oracle/lat.*.gz || true
fi

echo "$0: done."


================================================
FILE: egs/steps/cleanup/combine_short_segments.py
================================================
#!/usr/bin/env python

# Copyright 2016 Vijayaditya Peddinti
# Apache 2.0

from __future__ import print_function
import argparse
import sys
import os
import subprocess
import errno
import copy
import shutil
import warnings

def GetArgs():
    """Parse the command line of this script and return the namespace.

    All three options (--minimum-duration, --input-data-dir and
    --output-data-dir) are required.  The raw command line is echoed to
    stdout before it is parsed.
    """
    # compulsory arguments are exposed as named options for readability
    description = ("""
    **Warning, this script is deprecated.  Please use utils/data/combine_short_segments.sh**
    This script concatenates segments in the input_data_dir to ensure that"""
                   " the segments in the output_data_dir have a specified minimum length.")
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument("--minimum-duration", type=float, required=True,
                        help="Minimum duration of the segments in the output directory")
    for dir_option in ("--input-data-dir", "--output-data-dir"):
        parser.add_argument(dir_option, type=str, required=True)

    print(' '.join(sys.argv))
    return parser.parse_args()

def RunKaldiCommand(command, wait = True):
    """ Runs commands frequently seen in Kaldi scripts. These are usually a
        sequence of commands connected by pipes, so we use shell=True

        Args:
            command: the shell command line to run.
            wait: when True, block until the command finishes; when False,
                return immediately.

        Returns:
            The pair (stdout, stderr) exactly as returned by
            Popen.communicate() when wait is True, otherwise the
            subprocess.Popen object itself.

        Raises:
            Exception: if wait is True and the command exits with a non-zero
                status; the message contains the command and its stderr.
    """
    p = subprocess.Popen(command, shell = True,
                         stdout = subprocess.PIPE,
                         stderr = subprocess.PIPE)

    if not wait:
        return p

    stdout, stderr = p.communicate()
    # Compare by value: "is not 0" tests object identity, not equality.
    if p.returncode != 0:
        # On Python 3 the captured stderr is bytes; decode it so that building
        # the message cannot itself fail with a TypeError and hide the
        # command's real error output.
        err_text = stderr
        if not isinstance(err_text, str):
            err_text = err_text.decode('utf-8', 'replace')
        raise Exception("There was an error while running the command {0}\n".format(command)+"-"*10+"\n"+err_text)
    return stdout, stderr

def MakeDir(dir):
    """Create the directory `dir`, refusing to reuse an existing one.

    Raises:
        Exception: if `dir` already exists.
        OSError: for any other failure of os.mkdir, re-raised unchanged.
    """
    try:
        os.mkdir(dir)
    except OSError as err:
        already_there = (err.errno == errno.EEXIST)
        if not already_there:
            raise err
        raise Exception("Directory {0} already exists".format(dir))

def CheckFiles(input_data_dir):
    """Verify that the data directory contains the files this script reads.

    The files spk2utt, text, utt2spk and feats.scp are checked in that order.

    Raises:
        Exception: naming the first of those files that does not exist.
    """
    required = ('spk2utt', 'text', 'utt2spk', 'feats.scp')
    for base_name in required:
        path = '{0}/{1}'.format(input_data_dir, base_name)
        if os.path.exists(path):
            continue
        raise Exception("There is no such file {0}".format(path))

def ParseFileToDict(file, assert2fields = False, value_processor = None):
    """Read a whitespace-separated table into a dict keyed by its first field.

    Args:
        file: path of the text file to read.
        assert2fields: when True, every non-blank line must have exactly two
            fields.
        value_processor: callable that receives the list of fields after the
            key and returns the value to store.  Defaults to taking the first
            of those fields.

    Returns:
        A dict mapping the first field of each line to the processed value.
        If a key occurs several times the last occurrence wins.

    Raises:
        AssertionError: if assert2fields is True and a line does not have
            exactly two fields.
    """
    if value_processor is None:
        value_processor = lambda x: x[0]

    parsed = {}
    # The context manager guarantees the handle is closed, also on errors.
    with open(file, 'r') as handle:
        for line in handle:
            parts = line.split()
            if not parts:
                # blank (or whitespace-only) line: nothing to parse
                continue
            if assert2fields and len(parts) != 2:
                # Raised explicitly (rather than via an assert statement) so
                # that the check is not stripped when running with -O.
                raise AssertionError(
                    "Expected 2 fields but got {0} in line '{1}' of {2}".format(
                        len(parts), line.rstrip('\n'), file))
            parsed[parts[0]] = value_processor(parts[1:])
    return parsed

def WriteDictToFile(dict, file_name):
    """Write a dict to `file_name`, one "key<TAB>value" line per entry.

    Lines are written in sorted key order.  A list or tuple value is written
    as its elements in sorted order joined by single spaces; any other value
    is formatted with str.format.  The values of the dict passed in are left
    untouched.

    Args:
        dict: the mapping to write.
        file_name: path of the output file (overwritten if it exists).
    """
    # sorted() works on both Python 2 lists and Python 3 key views, whereas
    # calling .sort() on the result of keys() only works on Python 2.
    with open(file_name, 'w') as out:
        for key in sorted(dict.keys()):
            value = dict[key]
            if type(value) in [list, tuple]:
                # sorted() builds a new list, so the caller's list is not
                # reordered as a side effect of writing it out.
                value = ' '.join(sorted(value))
            out.write('{0}\t{1}\n'.format(key, value))


def ParseDataDirInfo(data_dir):
    """Load the tables of a data directory into dictionaries.

    Reads utt2spk, spk2utt, text, feats.scp and utt2dur from `data_dir`, plus
    utt2uniq when that file exists.

    Returns:
        The tuple (utt2spk, spk2utt, text, feat, utt2dur, utt2uniq), where
        spk2utt maps to lists of fields, text maps to the space-joined
        fields, utt2dur maps to floats and utt2uniq is None when the file is
        absent.
    """
    def path_of(base_name):
        return '{0}/{1}'.format(data_dir, base_name)

    def keep_all(fields):
        return fields

    def join_fields(fields):
        return " ".join(fields)

    def first_as_float(fields):
        return float(fields[0])

    utt2spk = ParseFileToDict(path_of('utt2spk'))
    spk2utt = ParseFileToDict(path_of('spk2utt'), value_processor=keep_all)
    text = ParseFileToDict(path_of('text'), value_processor=join_fields)
    # feats.scp must have exactly two fields per line; entries of any other
    # shape cannot be processed by this script
    feat = ParseFileToDict(path_of('feats.scp'), assert2fields=True)
    utt2dur = ParseFileToDict(path_of('utt2dur'), value_processor=first_as_float)

    uniq_path = path_of('utt2uniq')
    if os.path.exists(uniq_path):
        utt2uniq = ParseFileToDict(uniq_path)
    else:
        utt2uniq = None
    return utt2spk, spk2utt, text, feat, utt2dur, utt2uniq


def GetCombinedUttIndexRange(utt_index, utts, utt_durs, minimum_duration):
    """Find the neighbours to merge with utts[utt_index] to reach a duration.

    The window around utt_index is grown one utterance to the left and one to
    the right per iteration.  As soon as adding only the left neighbour, only
    the right neighbour, or both, reaches `minimum_duration`, the search
    stops; when either single side suffices, the shorter combination is
    chosen (the left one on a tie).

    Args:
        utt_index: position in `utts` of the utterance to extend.
        utts: list of utterance keys.
        utt_durs: mapping from utterance key to duration.
        minimum_duration: the duration to reach.

    Returns:
        (left_index, right_index, duration): the inclusive index range in
        `utts` to combine, clamped to the valid range, and the total duration
        of the window.  The duration can still be below `minimum_duration`
        when the list is exhausted first.
    """
    last = len(utts) - 1
    lo = utt_index - 1
    hi = utt_index + 1
    remaining = last
    total = utt_durs[utts[utt_index]]

    while remaining > 0:
        has_left = lo >= 0
        has_right = hi <= last
        left_dur = utt_durs[utts[lo]] if has_left else 0
        right_dur = utt_durs[utts[hi]] if has_right else 0

        with_right = total + right_dur
        with_left = total + left_dur
        with_both = total + left_dur + right_dur

        left_ok = with_left >= minimum_duration
        right_ok = with_right >= minimum_duration

        if left_ok and (not right_ok or with_left <= with_right):
            # the left neighbour alone suffices: undo the right extension
            total = with_left
            hi -= 1
            break
        if right_ok:
            # the right neighbour alone suffices: undo the left extension
            total = with_right
            lo += 1
            break
        if with_both >= minimum_duration:
            total = with_both
            break

        # still too short: absorb both neighbours and widen the window
        if has_left:
            remaining -= 1
        if has_right:
            remaining -= 1
        lo -= 1
        hi += 1
        total = with_both

    return max(0, lo), min(last, hi), total


def WriteCombinedDirFiles(output_dir, utt2spk, spk2utt, text, feat, utt2dur, utt2uniq):
    """Replace combined utterances by single entries and write the data dir.

    Every tuple found in the utterance lists of `spk2utt` denotes a group of
    utterances to merge.  For each group a new utterance named
    "<sorted members joined by '-'>-appended" replaces the members in all the
    dictionaries (which are modified in place), and the dictionaries are then
    written to `output_dir` with WriteDictToFile.

    Args:
        output_dir: directory that receives utt2spk, spk2utt, feats.scp,
            text, utt2dur and (if utt2uniq is not None) utt2uniq.
        utt2spk, spk2utt, text, feat, utt2dur: the data-directory tables.
        utt2uniq: the utt2uniq table, or None when there is none.
    """
    def out_path(base_name):
        return '{0}/{1}'.format(output_dir, base_name)

    # collect the groups first, since spk2utt is edited in the loop below
    pending = [(speaker, utt)
               for speaker in spk2utt.keys()
               for utt in spk2utt[speaker]
               if type(utt) is tuple]

    for speaker, combined_tuple in pending:
        members = sorted(combined_tuple)
        merged_name = "-".join(members) + '-appended'

        # utt2spk: drop the members, keep the speaker of the last one popped
        for member in members:
            spk_name = utt2spk.pop(member)
        utt2spk[merged_name] = spk_name

        # spk2utt: swap the tuple for the merged utterance name
        speaker_utts = spk2utt[speaker]
        speaker_utts.remove(combined_tuple)
        speaker_utts.append(merged_name)

        # text: transcripts of the members joined by single spaces
        text[merged_name] = ' '.join([text.pop(member) for member in members])

        # feats: a piped concat-feats command over the members' entries
        member_feats = [feat.pop(member) for member in members]
        feat[merged_name] = "concat-feats --print-args=false {feats} - |".format(
            feats=" ".join(member_feats))

        # utt2dur: the merged duration is the sum of the member durations
        utt2dur[merged_name] = sum(utt2dur.pop(member) for member in members)

        if utt2uniq is not None:
            member_uniqs = [utt2uniq.pop(member) for member in members]
            # utt2uniq maps perturbed data back to the original unperturbed
            # recording so that cross-validation sets can avoid overlap.
            # When a perturbation changes utterance lengths (e.g. speed
            # perturbation) the groups formed in each perturbed copy can
            # differ a lot, so no mapping avoids overlap reliably; the entry
            # of the first member is used.
            utt2uniq[merged_name] = member_uniqs[0]

    WriteDictToFile(utt2spk, out_path('utt2spk'))
    WriteDictToFile(spk2utt, out_path('spk2utt'))
    WriteDictToFile(feat, out_path('feats.scp'))
    WriteDictToFile(text, out_path('text'))
    if utt2uniq is not None:
        WriteDictToFile(utt2uniq, out_path('utt2uniq'))
    WriteDictToFile(utt2dur, out_path('utt2dur'))


def CombineSegments(input_dir, output_dir, minimum_duration):
    """Combine short utterances of each speaker and write the new data dir.

    The data directory is read with ParseDataDirInfo.  For every speaker the
    utterances are visited in sorted order; whenever one is shorter than
    ``minimum_duration`` it is merged with the neighbours chosen by
    GetCombinedUttIndexRange.  A merged utterance is represented by a tuple of
    the original utterance ids and replaces those ids in the speaker's list
    inside ``spk2utt``.  The result is written with WriteCombinedDirFiles.

    Args:
        input_dir: data directory handed to ParseDataDirInfo.
        output_dir: directory handed to WriteCombinedDirFiles.
        minimum_duration: utterances whose duration is below this value are
            candidates for combination.
    """
    utt2spk, spk2utt, text, feat, utt2dur, utt2uniq = ParseDataDirInfo(input_dir)

    # Work on a copy of the durations so that entries for combined utterances
    # can be added/removed while utt2dur is passed on unmodified.
    utt_durs = copy.deepcopy(utt2dur)

    # sorted() instead of keys() + .sort(): on Python 3 dict.keys() returns a
    # view object that has no sort() method.
    for speaker in sorted(spk2utt.keys()):

        # This is the list object stored in spk2utt (not a copy): the in-place
        # edits below are what WriteCombinedDirFiles later sees.
        utts = spk2utt[speaker]

        # We assume that the sorted utterance list corresponds to contiguous
        # segments.  This only holds if utterance naming follows the accepted
        # conventions; it is an easily violated assumption.
        utts.sort()
        utt_index = 0
        while utt_index < len(utts):
            if utt_durs[utts[utt_index]] < minimum_duration:
                left_index, right_index, cur_utt_dur = GetCombinedUttIndexRange(
                    utt_index, utts, utt_durs, minimum_duration)
                if not cur_utt_dur >= minimum_duration:
                    # this is a rare occurrence, better make the user aware of this
                    # situation and let them deal with it
                    warnings.warn('Speaker {0} does not have enough utterances to satisfy the minimum duration '
                                  'constraint. Not modifying these utterances'.format(speaker))
                    utt_index = utt_index + 1
                    continue

                # Collect the ids being merged (flattening already-combined
                # tuples) and drop their individual durations.
                combined_duration = 0
                combined_utts = []
                for utt in utts[left_index:right_index + 1]:
                    combined_duration += utt_durs.pop(utt)
                    if isinstance(utt, tuple):
                        combined_utts.extend(utt)
                    else:
                        combined_utts.append(utt)
                # converting to immutable type to use as dictionary key
                combined_utts = tuple(combined_utts)
                assert(cur_utt_dur == combined_duration)

                # Replace the merged range by the single combined entry,
                # modifying the list in place.
                utts[left_index:right_index + 1] = [combined_utts]
                utt_durs[combined_utts] = combined_duration
                utt_index = left_index
            utt_index = utt_index + 1
    WriteCombinedDirFiles(output_dir, utt2spk, spk2utt, text, feat, utt2dur, utt2uniq)

def Main():
    """Script entry point.

    Prints a deprecation warning, validates the input data directory, creates
    the output directory, computes utt2dur for the input, combines the short
    segments, regenerates spk2utt, copies cmvn.scp when the input has one and
    finally runs utils/fix_data_dir.sh on the output directory.
    """
    print("""steps/cleanup/combine_short_segments.py: warning: this script is deprecated and will be removed.
          Please use utils/data/combine_short_segments.sh""", file = sys.stderr)
    args = GetArgs()

    CheckFiles(args.input_data_dir)
    MakeDir(args.output_data_dir)

    RunKaldiCommand("utils/data/get_utt2dur.sh {0}".format(args.input_data_dir))

    CombineSegments(args.input_data_dir, args.output_data_dir, args.minimum_duration)

    RunKaldiCommand("utils/utt2spk_to_spk2utt.pl {od}/utt2spk > {od}/spk2utt".format(od = args.output_data_dir))
    # cmvn.scp is optional in the input directory; copy it over only if present.
    cmvn_scp = '{0}/cmvn.scp'.format(args.input_data_dir)
    if os.path.exists(cmvn_scp):
        shutil.copy(cmvn_scp, args.output_data_dir)

    RunKaldiCommand("utils/fix_data_dir.sh {0}".format(args.output_data_dir))
# Run Main() only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    Main()


================================================
FILE: egs/steps/cleanup/create_segments_from_ctm.pl
================================================
#!/usr/bin/env perl

# Copyright 2014  Guoguo Chen; 2015 Nagendra Kumar Goel
# Apache 2.0

use strict;
use warnings;
use Getopt::Long;

# $SIG{__WARN__} = sub { $DB::single = 1 };

my $Usage = <<'EOU';
This script creates the segments file and text file for a data directory with
new segmentation. It takes a ctm file and an "alignment" file. The ctm file
corresponds to the audio that we want to make segmentations for, and is created
by decoding the audio using existing in-domain models. The "alignment" file is
generated by the binary align-text, and is Levenshtein alignment between the
original transcript and the decoded output.

Internally, the script first tries to find silence regions (gaps in the CTM).
If a silence region is found, and the neighboring words are free of errors
according to the alignment file, then this silence region will be taken as
a split point, and new segment will be created. If the new segment we are going
to output is too long (longer than --max-seg-length), the script will split
the long segments into smaller pieces with length roughly --max-seg-length.
If you are going to use --wer-cutoff to filter out segments with high WER, make
sure you set it to a reasonable value. If the value you set is higher than the
WER from your alignment file, then most of the segments will be filtered out.

Usage: steps/cleanup/create_segments_from_ctm.pl [options] \
                              <ctm> <aligned.txt> <segments> <text>
 e.g.: steps/cleanup/create_segments_from_ctm.pl \
          train_si284_split.ctm train_si284_split.aligned.txt \
          data/train_si284_reseg/segments data/train_si284_reseg/text

Allowed options:
  --max-seg-length  : Maximum length of new segments (default = 10.0)
  --min-seg-length  : Minimum length of new segments (default = 2.0)
  --min-sil-length  : Minimum length of silence as split point (default = 0.5)
  --separator       : Separator for aligned pairs (default = ";")
  --special-symbol  : Special symbol to aligned with inserted or deleted words
                      (default = "<***>")
  --wer-cutoff      : Ignore segments with WER higher than the specified value.
                      -1 means no segment will be ignored. (default = -1)
  --use-silence-midpoints : Set to 1 if you want to use silence midpoints
                      instead of min_sil_length for silence overhang.(default 0)
  --force-correct-boundary-words : Set to zero if the segments will not be
                      required to have boundary words to be correct. Default 1
  --aligned-ctm-filename : If set, the intermediate aligned ctm
                      is saved to this file
EOU

# Default values of the command-line options; their meaning is described in
# the usage message ($Usage).
my $max_seg_length = 10.0;
my $min_seg_length = 2.0;
my $min_sil_length = 0.5;
my $separator = ";";
my $special_symbol = "<***>";
my $wer_cutoff = -1;
my $use_silence_midpoints = 0;
my $force_correct_boundary_words = 1;
my $aligned_ctm_filename = "";
# Parse the options: "=f" options take a floating-point value, "=s" options a
# string.
GetOptions(
  'wer-cutoff=f' => \$wer_cutoff,
  'max-seg-length=f' => \$max_seg_length,
  'min-seg-length=f' => \$min_seg_length,
  'min-sil-length=f' => \$min_sil_length,
  'use-silence-midpoints=f' => \$use_silence_midpoints,
  'force-correct-boundary-words=f' => \$force_correct_boundary_words,
  'aligned-ctm-filename=s' => \$aligned_ctm_filename,
  'separator=s'      => \$separator,
  'special-symbol=s' => \$special_symbol);

# Exactly four positional arguments are expected; otherwise print the usage.
if (@ARGV != 4) {
  die $Usage;
}

my ($ctm_in, $align_in, $segments_out, $text_out) = @ARGV;

# Open the two inputs and the two outputs, aborting on any failure.
open(CI, "<$ctm_in") || die "Error: fail to open $ctm_in\n";
open(AI, "<$align_in") || die "Error: fail to open $align_in\n";
open(my $SO, ">$segments_out") || die "Error: fail to open $segments_out\n";
open(my $TO, ">$text_out") || die "Error: fail to open $text_out\n";
# Optional output for the intermediate aligned ctm; $ACT stays undefined when
# --aligned-ctm-filename is not given.
my $ACT= undef;
if ($aligned_ctm_filename ne "") {
    # Fail early like the other outputs instead of printing to an unopened
    # handle later on.
    open($ACT, ">$aligned_ctm_filename") ||
      die "Error: fail to open $aligned_ctm_filename\n";
}
# Prints the current segment to file.
#
# Arguments:
#   $aligned_ctm     - reference to a list of
#                      [word, start_time, duration, num_errors] entries
#   $wav_id          - id of the recording; prefix of the generated segment id
#   $min_sil_length  - half of this value is the cap applied to the silence
#                      padding on each side (see below)
#   $min_seg_length  - segments shorter than this (padding included) are skipped
#   $seg_start_index - first index of the segment in $aligned_ctm (inclusive)
#   $seg_end_index   - last index of the segment in $aligned_ctm (inclusive)
#   $seg_count       - number used to build the segment id
#   $SO, $TO         - output handles for the segments file and the text file
# Also reads the file-level options $wer_cutoff and $use_silence_midpoints.
# Returns 0 if the segment was written and -1 if it was skipped.
sub PrintSegment {
  my ($aligned_ctm, $wav_id, $min_sil_length, $min_seg_length,
      $seg_start_index, $seg_end_index, $seg_count, $SO, $TO) = @_;

  if ($seg_start_index > $seg_end_index) {
    return -1;
  }

  # Removes the surrounding silence.
  while ($seg_start_index < scalar(@{$aligned_ctm}) &&
         $aligned_ctm->[$seg_start_index]->[0] eq "<eps>") {
    $seg_start_index += 1;
  }
  while ($seg_end_index >= 0 &&
         $aligned_ctm->[$seg_end_index]->[0] eq "<eps>") {
    $seg_end_index -= 1;
  }
  if ($seg_start_index > $seg_end_index) {
    return -1;
  }

  # Filters out segments with high WER.
  if ($wer_cutoff != -1) {
    my $num_errors = 0; my $num_words = 0;
    for (my $i = $seg_start_index; $i <= $seg_end_index; $i += 1) {
      if ($aligned_ctm->[$i]->[0] ne "<eps>") {
        $num_words += 1;
      }
      $num_errors += $aligned_ctm->[$i]->[3];
    }
    # Check for "no words" first so the division can never be by zero.
    if ($num_words < 1 || $num_errors / $num_words > $wer_cutoff) {
      return -1;
    }
  }

  # Works out the surrounding silence.
  # Walk left over error-free silence entries preceding the segment.
  my $index = $seg_start_index - 1;
  while ($index >= 0 && $aligned_ctm->[$index]->[0] eq
         "<eps>" && $aligned_ctm->[$index]->[3] == 0) {
    $index -= 1;
  }
  # NOTE(review): the left-hand test requires an error count == 0 at
  # [$index-1] while the right-hand test below requires > 0 at [$index-1] and
  # looks at the word at [$index+1]; the two are not symmetric -- confirm
  # that this is intended.
  my $left_of_segment_has_deletion = "false";
  $left_of_segment_has_deletion = "true"
      if ($index > 0 && $aligned_ctm->[$index-1]->[0] ne "<eps>"
          && $aligned_ctm->[$index-1]->[3] == 0);

  # Half of the silence found to the left of the first word ...
  my $pad_start_sil = ($aligned_ctm->[$seg_start_index]->[1] -
                       $aligned_ctm->[$index + 1]->[1]) / 2.0;
  # ... capped at $min_sil_length / 2 unless silence midpoints are requested
  # and no deletion was detected.
  if (($left_of_segment_has_deletion eq "true") || !$use_silence_midpoints) {
      if ($pad_start_sil > $min_sil_length / 2.0) {
          $pad_start_sil = $min_sil_length / 2.0;
      }
  }
  # Same procedure for the silence following the segment.
  my $right_of_segment_has_deletion = "false";
  $index = $seg_end_index + 1;
  while ($index < scalar(@{$aligned_ctm}) &&
         $aligned_ctm->[$index]->[0] eq "<eps>" &&
         $aligned_ctm->[$index]->[3] == 0) {
    $index += 1;
  }
  $right_of_segment_has_deletion = "true"
      if ($index < scalar(@{$aligned_ctm})-1 && $aligned_ctm->[$index+1]->[0] ne
          "<eps>" && $aligned_ctm->[$index - 1]->[3] > 0);
  my $pad_end_sil = ($aligned_ctm->[$index - 1]->[1] +
                     $aligned_ctm->[$index - 1]->[2] -
                     $aligned_ctm->[$seg_end_index]->[1] -
                     $aligned_ctm->[$seg_end_index]->[2]) / 2.0;
  if (($right_of_segment_has_deletion eq "true") || !$use_silence_midpoints) {
      if ($pad_end_sil > $min_sil_length / 2.0) {
          $pad_end_sil = $min_sil_length / 2.0;
      }
  }

  # Segment boundaries: first word start / last word end plus the padding.
  my $seg_start = $aligned_ctm->[$seg_start_index]->[1] - $pad_start_sil;
  my $seg_end = $aligned_ctm->[$seg_end_index]->[1] +
                $aligned_ctm->[$seg_end_index]->[2] + $pad_end_sil;
  if ($seg_end - $seg_start < $min_seg_length) {
      return -1;
  }

  # Write "<seg-id> <wav-id> <start> <end>" to the segments file.
  $seg_start = sprintf("%.2f", $seg_start);
  $seg_end = sprintf("%.2f", $seg_end);
  my $seg_id = $wav_id . "_" . sprintf("%05d", $seg_count);
  print $SO "$seg_id $wav_id $seg_start $seg_end\n";

  # Write "<seg-id> <word> <word> ..." to the text file, skipping silence.
  print $TO "$seg_id ";
  for (my $x = $seg_start_index; $x <= $seg_end_index; $x += 1) {
    if ($aligned_ctm->[$x]->[0] ne "<eps>") {
      print $TO "$aligned_ctm->[$x]->[0] ";
    }
  }
  print $TO "\n";
  return 0;
}

# Computes split point.
#
# Returns the largest index in ($seg_start_index, $seg_end_index] such that
# the span from the start of entry $seg_start_index to the end of that entry
# is at most $max_seg_length; returns $seg_start_index if there is none.
# Entries of $aligned_ctm are [word, start_time, duration, num_errors].
sub GetSplitPoint {
  my ($aligned_ctm, $seg_start_index, $seg_end_index, $max_seg_length) = @_;

  # Try the longest candidate first and shrink until one fits.
  my $candidate = $seg_end_index;
  while ($candidate > $seg_start_index) {
    my $span = $aligned_ctm->[$candidate]->[1] +
               $aligned_ctm->[$candidate]->[2] -
               $aligned_ctm->[$seg_start_index]->[1];
    return $candidate if ($span <= $max_seg_length);
    $candidate -= 1;
  }
  return $seg_start_index;
}

# Computes segment length without surrounding silence.
#
# Leading and trailing "<eps>" entries of the index range are ignored; the
# result is the time from the start of the first remaining entry to the end
# of the last one, or 0 if nothing but silence is left.
sub GetSegmentLengthNoSil {
  my ($aligned_ctm, $seg_start_index, $seg_end_index) = @_;
  my ($first, $last) = ($seg_start_index, $seg_end_index);

  # Skip silence at the beginning ...
  $first += 1 while ($first < scalar(@{$aligned_ctm}) &&
                     $aligned_ctm->[$first]->[0] eq "<eps>");
  # ... and at the end.
  $last -= 1 while ($last >= 0 &&
                    $aligned_ctm->[$last]->[0] eq "<eps>");

  return 0 if ($first > $last);

  return $aligned_ctm->[$last]->[1] +
         $aligned_ctm->[$last]->[2] -
         $aligned_ctm->[$first]->[1];
}

# Force splits long segments.
#
# Arguments:
#   $aligned_ctm       - reference to a list of
#                        [word, start_time, duration, num_errors] entries
#   $wav_id            - id of the recording
#   $max_seg_length    - target maximum length of the pieces
#   $min_sil_length    - forwarded to PrintSegment
#   $seg_start_index   - first index of the range to split (inclusive)
#   $seg_end_index     - last index of the range to split (inclusive)
#   $current_seg_count - running segment counter
#   $SO, $TO           - output handles forwarded to PrintSegment
# Note that $min_seg_length is not a parameter here; the file-level option is
# used.
# Returns the list (updated segment count, index following the last piece).
sub SplitLongSegment {
  my ($aligned_ctm, $wav_id, $max_seg_length, $min_sil_length,
      $seg_start_index, $seg_end_index, $current_seg_count, $SO, $TO) = @_;
  # If the segment is too long, we manually split it. We make sure that the
  # resulting segments are at least ($max_seg_length / 2) seconds long.
  my $current_seg_length = $aligned_ctm->[$seg_end_index]->[1] +
                           $aligned_ctm->[$seg_end_index]->[2] -
                           $aligned_ctm->[$seg_start_index]->[1];
  my $current_seg_index = $seg_start_index;
  my $aligned_ctm_size = scalar(@{$aligned_ctm});
  # While more than 1.5 * $max_seg_length remains, cut off pieces of at most
  # $max_seg_length.
  while ($current_seg_length > 1.5 * $max_seg_length && $current_seg_index < $aligned_ctm_size-1) {
    my $split_point = GetSplitPoint($aligned_ctm, $current_seg_index,
                                    $seg_end_index, $max_seg_length);
    my $ans = PrintSegment($aligned_ctm, $wav_id, $min_sil_length,
                           $min_seg_length, $current_seg_index, $split_point,
                           $current_seg_count, $SO, $TO);
    $current_seg_count += 1 if ($ans != -1);
    $current_seg_index = $split_point + 1;
    $current_seg_length = $aligned_ctm->[$seg_end_index]->[1] +
                          $aligned_ctm->[$seg_end_index]->[2] -
                          $aligned_ctm->[$current_seg_index]->[1];
  }

  # Only the very last entry is left: emit it on its own.  (Numeric
  # comparison; both operands are integer indices.)
  if ($current_seg_index == $aligned_ctm_size-1) {
      my $ans = PrintSegment($aligned_ctm, $wav_id, $min_sil_length,
                             $min_seg_length, $current_seg_index, $current_seg_index,
                             $current_seg_count, $SO, $TO);
      $current_seg_count += 1 if ($ans != -1);
      return ($current_seg_count, $current_seg_index);
  }

  # The remainder is between 1 and 1.5 times $max_seg_length: split it roughly
  # in the middle.
  if ($current_seg_length > $max_seg_length) {
    my $split_point = GetSplitPoint($aligned_ctm, $current_seg_index,
                                    $seg_end_index,
                                    $current_seg_length / 2.0 + 0.01);
    my $ans = PrintSegment($aligned_ctm, $wav_id, $min_sil_length,
                           $min_seg_length, $current_seg_index, $split_point,
                           $current_seg_count, $SO, $TO);
    $current_seg_count += 1 if ($ans != -1);
    $current_seg_index = $split_point + 1;
  }

  # Emit the final piece.
  my $split_point = GetSplitPoint($aligned_ctm, $current_seg_index,
                                  $seg_end_index, $max_seg_length + 0.01);
  my $ans = PrintSegment($aligned_ctm, $wav_id, $min_sil_length,
                         $min_seg_length, $current_seg_index, $split_point,
                         $current_seg_count, $SO, $TO);
  $current_seg_count += 1 if ($ans != -1);
  $current_seg_index = $split_point + 1;

  return ($current_seg_count, $current_seg_index);
}

# Processes each wav file.
#
# Arguments:
#   $max_seg_length, $min_seg_length, $min_sil_length - segmentation limits
#   $special_symbol - symbol marking an insertion/deletion in the alignment
#   $current_ctm    - reference to the ctm entries of one wav/channel, each
#                     being [wav_id, channel_id, start, duration, word], with
#                     "<eps>" entries for silence
#   $current_align  - reference to the list of [ref_word, hyp_word] pairs
#   $SO, $TO        - output handles for the segments file and the text file
#   $ACT            - optional handle for the aligned ctm (may be undef)
# Also reads the file-level option $force_correct_boundary_words.
# Dies if the ctm section is empty or if a hypothesis word of the alignment
# differs from the corresponding ctm word.
sub ProcessWav {
  my ($max_seg_length, $min_seg_length, $min_sil_length, $special_symbol,
      $current_ctm, $current_align, $SO, $TO, $ACT) = @_;

  my $wav_id = $current_ctm->[0]->[0];
  my $channel_id = $current_ctm->[0]->[1];
  defined($wav_id) || die "Error: empty wav section\n";

  # First, we have to align the ctm file to the Levenshtein alignment.
  # @aligned_ctm is a list of the following:
  # [word, start_time, duration, num_errors]
  my $ctm_index = 0;
  my @aligned_ctm = ();
  foreach my $entry (@{$current_align}) {
    my $ref_word = $entry->[0];
    my $hyp_word = $entry->[1];
    if ($hyp_word eq $special_symbol) {
      # Case 1: deletion, $hyp does not correspond to a word in the ctm file.
      # The entry gets zero duration and starts where the previous one ends.
      my $start = 0.0; my $dur = 0.0;
      if (defined($aligned_ctm[-1])) {
        $start = $aligned_ctm[-1]->[1] + $aligned_ctm[-1]->[2];
      }
      push(@aligned_ctm, [$ref_word, $start, $dur, 1]);
    } else {
      # Case 2: non-deletion, now $hyp corresponds to a word in ctm file.
      while ($current_ctm->[$ctm_index]->[4] eq "<eps>") {
        # Case 2.1: ctm contains silence at the corresponding place.
        push(@aligned_ctm, ["<eps>", $current_ctm->[$ctm_index]->[2],
                             $current_ctm->[$ctm_index]->[3], 0]);
        $ctm_index += 1;
      }
      my $ctm_word = $current_ctm->[$ctm_index]->[4];
      $hyp_word eq $ctm_word ||
        die "Error: got word $hyp_word in alignment but $ctm_word in ctm\n";
      my $start = $current_ctm->[$ctm_index]->[2];
      my $dur = $current_ctm->[$ctm_index]->[3];
      if ($ref_word ne $ctm_word) {
        if ($ref_word eq $special_symbol) {
          # Case 2.2: insertion, we propagate the duration and error to the
          #           previous one.
          if (defined($aligned_ctm[-1])) {
            $aligned_ctm[-1]->[2] += $dur;
            $aligned_ctm[-1]->[3] += 1;
          } else {
            # No previous entry: record the insertion as an erroneous silence.
            push(@aligned_ctm, ["<eps>", $start, $dur, 1]);
          }
        } else {
          # Case 2.3: substitution.
          push(@aligned_ctm, [$ref_word, $start, $dur, 1]);
        }
      } else {
        # Case 2.4: correct.
        push(@aligned_ctm, [$ref_word, $start, $dur, 0]);
      }
      $ctm_index += 1;
    }
  }

  # Save the aligned CTM if needed
  # (one line per entry: wav_id channel_id start duration word num_errors).
  if(defined($ACT)){
    for (my $i = 0; $i <= $#aligned_ctm; $i++) {
      print $ACT "$wav_id $channel_id $aligned_ctm[$i][1] $aligned_ctm[$i][2] ";
      print $ACT "$aligned_ctm[$i][0] $aligned_ctm[$i][3]\n";
    }
  }

  # Second, we create segments from @aligned_ctm, using simple greedy method.
  my $current_seg_index = 0;
  my $current_seg_count = 0;
  for (my $x = 0; $x < @aligned_ctm; $x += 1) {
    # $lcorrect / $rcorrect: whether the left / right neighbour of entry $x is
    # free of errors (a missing neighbour counts as correct).
    my $lcorrect = "true"; my $rcorrect = "true";
    $lcorrect = "false" if ($x > 0 && $aligned_ctm[$x - 1]->[3] > 0);
    $rcorrect = "false" if ($x < @aligned_ctm - 1 &&
                            $aligned_ctm[$x + 1]->[3] > 0);

    my $current_seg_length = GetSegmentLengthNoSil(\@aligned_ctm,
                                                   $current_seg_index, $x);

    # We split the audio, if the silence is longer than the requested silence
    # length, and if there are no alignment error around it. We also make sure
    # that segment contains actual words, instead of pure silence.
    if ($aligned_ctm[$x]->[0] eq "<eps>" &&
        $aligned_ctm[$x]->[2] >= $min_sil_length
       && (($force_correct_boundary_words && $lcorrect eq "true" &&
            $rcorrect eq "true") || !$force_correct_boundary_words)) {
      if ($current_seg_length <= $max_seg_length &&
          $current_seg_length >= $min_seg_length) {
        my $ans = PrintSegment(\@aligned_ctm, $wav_id, $min_sil_length,
                               $min_seg_length, $current_seg_index, $x,
                               $current_seg_count, $SO, $TO);
        $current_seg_count += 1 if ($ans != -1);
        $current_seg_index = $x + 1;
      } elsif ($current_seg_length > $max_seg_length) {
        ($current_seg_count, $current_seg_index)
          = SplitLongSegment(\@aligned_ctm, $wav_id, $max_seg_length,
                             $min_sil_length, $current_seg_index, $x,
                             $current_seg_count, $SO, $TO);
      }
      # Otherwise the segment is still shorter than $min_seg_length: keep
      # accumulating.
    }
  }

  # Last segment.
  if ($current_seg_index <= @aligned_ctm - 1) {
    SplitLongSegment(\@aligned_ctm, $wav_id, $max_seg_length, $min_sil_length,
                     $current_seg_index, @aligned_ctm - 1,
                     $current_seg_count, $SO, $TO);
  }
}

# Copies the ctm entries of $ctm_in to $ctm_out and adds an explicit <eps>
# entry wherever there is a gap between the end of one entry and the start of
# the next, which makes the downstream processing easier. Example:
#
# Input ctm:
# 011 A 3.39 0.23 SELL
# 011 A 3.62 0.18 OFF
# 011 A 3.83 0.45 ASSETS
#
# Output ctm:
# 011 A 3.39 0.23 SELL
# 011 A 3.62 0.18 OFF
# 011 A 3.80 0.03 <eps>
# 011 A 3.83 0.45 ASSETS
sub InsertSilence {
  my ($ctm_in, $ctm_out) = @_;
  my $last = $#{$ctm_in};

  # Handle every entry that has a successor.
  foreach my $i (0 .. $last - 1) {
    push @{$ctm_out}, $ctm_in->[$i];

    # End time of this entry, rounded to two decimals.
    my $gap_start = sprintf("%.2f",
                            $ctm_in->[$i]->[2] + $ctm_in->[$i]->[3]);
    next unless ($gap_start < $ctm_in->[$i + 1]->[2]);

    # There is a gap before the next entry: fill it with silence.
    my $gap_dur = sprintf("%.2f", $ctm_in->[$i + 1]->[2] - $gap_start);
    push @{$ctm_out}, [$ctm_in->[$i]->[0], $ctm_in->[$i]->[1],
                       $gap_start, $gap_dur, "<eps>"];
  }

  # The final entry has no successor and is copied as is.
  push @{$ctm_out}, $ctm_in->[$last];
}

# Reads the alignment.
# Each line has the form
#   <wav-id> <ref> <hyp> <separator> <ref> <hyp> <separator> ... <ref> <hyp>
# and is stored in %aligned as a list of [ref, hyp] pairs keyed by the wav id.
my %aligned = ();
while (<AI>) {
  chomp;
  my @col = split;
  @col >= 2 || die "Error: bad line $_\n";
  my $wav = shift @col;
  # n pairs joined by n-1 separators give 3n-1 fields.
  if ( (@col + 0) % 3 != 2) {
    die "Bad line in align-text output (unexpected number of fields): $_";
  }
  my @pairs = ();

  # "<=" so that the last pair, which has no trailing separator, is read too;
  # with "<" the final (ref, hyp) pair of every line was silently dropped.
  for (my $x = 0; $x * 3 + 2 <= @col; $x++) {
    my $first_word = $col[$x * 3];
    my $second_word = $col[$x * 3 + 1];
    # Every pair except the last one must be followed by the separator.
    if ($x * 3 + 2 < @col) {
      if ($col[$x * 3 + 2] ne $separator) {
        die "Bad line in align-text output (expected separator '$separator'): $_";
      }
    }
    # the [ ] expression returns a reference to a new anonymous array.
    push(@pairs, [ $first_word, $second_word ]);
  }
  ! defined($aligned{$wav}) || die "Error: $wav has already been processed\n";
  $aligned{$wav} = \@pairs;
}

# Reads the ctm file and creates the segmentation.
# Consecutive lines with the same wav id and channel id are collected in
# @current_wav; whenever the id changes, the collected section is processed.
my $previous_wav_id = "";
my $previous_channel_id = "";
my @current_wav = ();
while (<CI>) {
  chomp;
  my @col = split;
  @col >= 5 || die "Error: bad line $_\n";
  if ($previous_wav_id eq $col[0] && $previous_channel_id eq $col[1]) {
    # Same recording/channel as the previous line: keep collecting.
    push(@current_wav, \@col);
  } else {
    # A new recording/channel starts: process the finished section first.
    if (@current_wav > 0) {
      defined($aligned{$previous_wav_id}) ||
        die "Error: no alignment info for $previous_wav_id\n";
      my @current_wav_silence = ();
      InsertSilence(\@current_wav, \@current_wav_silence);
      ProcessWav($max_seg_length, $min_seg_length, $min_sil_length,
                 $special_symbol, \@current_wav_silence,
                 $aligned{$previous_wav_id}, $SO, $TO, $ACT);
    }
    # Start the new section with the current line.
    @current_wav = ();
    push(@current_wav, \@col);
    $previous_wav_id = $col[0];
    $previous_channel_id = $col[1];
  }
}

# The last wav file.
if (@current_wav > 0) {
  defined($aligned{$previous_wav_id}) ||
    die "Error: no alignment info for $previous_wav_id\n";
  my @current_wav_silence = ();
  InsertSilence(\@current_wav, \@current_wav_silence);
  ProcessWav($max_seg_length, $min_seg_length, $min_sil_length, $special_symbol,
             \@current_wav_silence, $aligned{$previous_wav_id}, $SO, $TO, $ACT);
}

# Close all handles; $ACT only exists when --aligned-ctm-filename was given.
close(CI);
close(AI);
close($SO);
close($TO);
close($ACT) if defined($ACT);


================================================
FILE: egs/steps/cleanup/debug_lexicon.sh
================================================
#!/usr/bin/env bash
# Copyright 2014  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0

# this script gets some stats that will help you debug the lexicon.

# Begin configuration section.
stage=1  # first stage to run; use it to control partial reruns.
remove_stress=false  # if true, remove stress before printing analysis.
nj=10  # number of jobs for various decoding-type things that we run.
cmd=run.pl  # command to run jobs, e.g. run.pl, queue.pl.
alidir=  # if set, alignments and transforms are taken from this directory.
# End configuration section

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh # source the path.
# Parse the --option arguments; they overwrite the variables set above.
. parse_options.sh || exit 1;

# Exactly five positional arguments must remain after option parsing.
if [ $# != 5 ]; then
   echo "usage: $0 <data-dir> <lang-dir> <src-dir> <src-dict> <dir>"
   echo "e.g.: $0 data/train data/lang exp/tri4b data/local/dict/lexicon.txt exp/debug_lexicon"
   echo "main options (for others, see top of script file)"
   echo "  --nj <nj>                                        # number of parallel jobs"
   echo "  --cmd <cmd>                                      # command to run jobs, e.g. run.pl,queue.pl"
   echo "  --stage <stage>                                  # use to control partial reruns."
   echo "  --remove-stress <true|false>                     # if true, remove stress before printing analysis"
   echo "                                                   # note: if you change this, you only have to rerun"
   echo "                                                   # from stage 10."
   echo "  --alidir <alignment-dir>                         # if supplied, training-data alignments and transforms"
   echo "                                                   # are obtained from here instead of being generated."
   exit 1;
fi

# Positional arguments.
data=$1      # data directory (must contain feats.scp)
lang=$2      # lang directory (must contain phones.txt)
src=$3       # source model directory (must contain final.mdl)
srcdict=$4   # reference lexicon
dir=$5       # output directory

set -e

# All required input files must exist.
for f in $data/feats.scp $lang/phones.txt $src/final.mdl $srcdict; do
  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1;
done

mkdir -p $dir
# Compare the phone set of the lang directory with the one of the source
# model directory ($src; the previously used $srcdir is not defined anywhere
# in this script, so the comparison was done against /phones.txt).
utils/lang/check_phones_compatible.sh $lang/phones.txt $src/phones.txt
cp $lang/phones.txt $dir

# Unless an alignment directory was supplied with --alidir, align the data
# with the source model (stage 1).
if [ -z "$alidir" ]; then
  alidir=${src}_ali_$(basename $data)
  if [ $stage -le 1 ]; then
    steps/align_fmllr.sh --cmd "$cmd" --nj $nj $data $lang $src $alidir
  fi
fi

# Directory used for the phone-bigram lang ("_phone_bg").
phone_lang=data/$(basename $lang)_phone_bg

# Stage 2: create $phone_lang from the lang directory and the alignments.
if [ $stage -le 2 ]; then
  utils/lang/make_phone_bigram_lang.sh $lang $alidir $phone_lang
fi

# Stage 3: build the decoding graph for $phone_lang in $src/graph_phone_bg.
if [ $stage -le 3 ]; then
  utils/mkgraph.sh $phone_lang $src $src/graph_phone_bg
fi

# Stage 4: decode the data with that graph, taking the transforms from
# $alidir; scoring is skipped.
if [ $stage -le 4 ]; then
  steps/decode_si.sh --skip-scoring true \
    --cmd "$cmd" --nj $nj --transform-dir $alidir \
    --acwt 0.25 --beam 10.0 --lattice-beam 5.0 --max-active 2500 \
    $src/graph_phone_bg $data $src/decode_$(basename $data)_phone_bg
fi

# Stage 5: get the ctm of the training alignments (including silence).
if [ $stage -le 5 ]; then
  steps/get_train_ctm.sh --print-silence true --use-segments false \
     --cmd "$cmd" $data $lang $alidir
fi

# Stage 6: get the ctm of the phone-bigram decoding.
if [ $stage -le 6 ]; then
  steps/get_ctm.sh --use-segments false --cmd "$cmd" --min-lmwt 3 --max-lmwt 8 \
     $data $phone_lang $src/decode_$(basename $data)_phone_bg
fi

# Stage 7: copy both ctm files into $dir as phone.ctm and word.ctm.
if [ $stage -le 7 ]; then
  mkdir -p $dir
  # lmwt=4 corresponds to the scale we decoded at.
  cp $src/decode_$(basename $data)_phone_bg/score_4/$(basename $data).ctm $dir/phone.ctm

  cp $alidir/ctm $dir/word.ctm
fi

if [ $stage -le 8 ]; then
# we'll use 'sort' to do most of the heavy lifting when processing the data.
# suppose word.ctm has an entry like
# sw02054 A 213.32 0.24 and
# we'll convert it into two entries like this, with the start and end separately
# (times are in milliseconds, zero-padded to 10 digits):
# sw02054-A 0000213320 START and
# sw02054-A 0000213560 END and
#
# and suppose phone.ctm has lines like
# sw02054 A 213.09 0.24 sil
# sw02054 A 213.33 0.13 ae_B
# we'll convert them into lines where the time is derived from the midpoint of the phone, like
# sw02054-A 0000213210 PHONE sil
# sw02054-A 0000213395 PHONE ae_B
# and then we'll remove the optional-silence phones and, if needed, the word-boundary markers from
# the phones, to get just
# sw02054-A 0000213395 PHONE ae
# then after sorting and merge-sorting the two ctm files we can easily
# work out for each word, what the phones were during that time.

  # Map every phone to itself with the word-position suffix (_B, _I, _E, _S)
  # removed.
  grep -v '<eps>' $phone_lang/phones.txt | awk '{print $1, $1}' | \
    sed 's/_B$//' | sed 's/_I$//' | sed 's/_E$//' | sed 's/_S$//' >$dir/phone_map.txt


  # Use the C locale for the sort commands below.
  export LC_ALL=C

  cat $dir/phone.ctm | utils/apply_map.pl -f 5 $dir/phone_map.txt > $dir/phone_mapped.ctm

  # One START and one END line per word.
  cat $dir/word.ctm  | awk '{printf("%s-%s %010.0f START %s\n", $1, $2, 1000*$3, $5); printf("%s-%s %010.0f END %s\n", $1, $2, 1000*($3+$4), $5);}' | \
    sort > $dir/word_processed.ctm

  # filter out those utterances which only appear in phone_processed.ctm but not in word_processed.ctm
  cat $dir/phone_mapped.ctm | awk '{printf("%s-%s %010.0f PHONE %s\n", $1, $2, 1000*($3+(0.5*$4)), $5);}' | \
    awk 'NR==FNR{a[$1] = 1; next} {if($1 in a) print $0}' $dir/word_processed.ctm - | \
    sort > $dir/phone_processed.ctm

  # merge-sort both ctm's
  sort -m $dir/word_processed.ctm $dir/phone_processed.ctm > $dir/combined.ctm
fi

# after merge-sort of the two ctm's, we add <eps> to cover "deserted" phones due to precision limits, and then merge all consecutive <eps>'s.
# The perl step prints, for every word, the phones seen between its START and
# END lines; a phone that directly follows an END line is printed as the
# pronunciation of "<eps>".  The awk step joins consecutive "<eps>" lines of
# the same utterance into one line.
if [ $stage -le 9 ]; then
  awk '{print $1, $3, $4}' $dir/combined.ctm | \
     perl -e ' while (<>) { chop; @A = split(" ", $_); ($utt, $a,$b) = @A;
     if ($a eq "START") { $cur_word = $b; @phones = (); }
     if ($a eq "END") { print $utt, " ", $cur_word, " ", join(" ", @phones), "\n"; }
     if ($a eq "PHONE") { if ($prev eq "END") {print $utt, " ", "<eps>", " ", $b, "\n";} else {push @phones, $b;}} $prev = $a;} ' |\
     awk 'BEGIN{merge_prev=0;} {utt=$1;word=$2;pron=$3;for (i=4;i<=NF;i++) pron=pron" "$i;
     if (word_prev == "<eps>" && word == "<eps>" && utt_prev == utt) {merge=0;pron_prev=pron_prev" "pron;} else {merge=1;}
     if(merge_prev==1) {print utt_prev, word_prev, pron_prev;};
     merge_prev=merge; utt_prev=utt; word_prev=word; pron_prev=pron;}
     END{if(merge_prev==1) {print utt_prev, word_prev, pron_prev;}}' > $dir/ctm_prons.txt

  # Collect pronunciation statistics from ctm_prons.txt, sorted in
  # decreasing numeric order, into prons.txt.
  steps/cleanup/internal/get_non_scored_words.py $lang > $dir/non_scored_words
  steps/cleanup/internal/get_pron_stats.py $dir/ctm_prons.txt $phone_lang/phones/silence.txt $phone_lang/phones/optional_silence.txt $dir/non_scored_words - | \
    sort -nr > $dir/prons.txt
fi

# Stage 10: compare the observed pronunciations (prons.txt) with the reference
# lexicon and write pron_info.txt (one line per observed pronunciation) and
# word_info.txt (one line per word).
if [ $stage -le 10 ]; then
  # Reference lexicon, optionally with the trailing stress digit removed from
  # every phone.
  if $remove_stress; then
    perl -e 'while(<>) { @A=split(" ", $_); for ($n=1;$n<@A;$n++) { $A[$n] =~ s/[0-9]$//; } print join(" ", @A) . "\n"; } ' \
      <$srcdict >$dir/lexicon.txt
  else
    cp $srcdict $dir/lexicon.txt
  fi
  # Add an entry that maps <eps> to the optional-silence phone.
  silphone=$(cat $phone_lang/phones/optional_silence.txt)
  echo "<eps> $silphone" >> $dir/lexicon.txt

  # Total count per word (field 2 of prons.txt), summed over its prons.
  awk '{count[$2] += $1;} END {for (w in count){print w, count[w];}}' \
      <$dir/prons.txt >$dir/counts.txt


  # For every observed pronunciation print its count, rank within the word,
  # relative frequency, whether it is in the lexicon, and the reference prons.
  cat $dir/prons.txt | \
    if $remove_stress; then
      perl -e 'while(<>) { @A=split(" ", $_); for ($n=1;$n<@A;$n++) { $A[$n] =~ s/[0-9]$//; } print join(" ", @A) . "\n"; } '
    else
      cat
    fi | perl -e '
     print ";; <count-of-this-pron> <rank-of-this-pron> <frequency-of-this-pron> CORRECT|INCORRECT <word> <pron>\n";
     open(D, "<$ARGV[0]") || die "opening dict file $ARGV[0]";
     # create a hash of all reference pronuncations, and for each word, record
     # a list of the prons, separated by " | ".
     while (<D>) {
        @A = split(" ", $_); $is_pron{join(" ",@A)} = 1;
        $w = shift @A;
        if (!defined $prons{$w}) { $prons{$w} = join(" ", @A); }
        else { $prons{$w} = $prons{$w} . " | " . join(" ", @A); }
     }
     open(C, "<$ARGV[1]") || die "opening counts file $ARGV[1];";
     while (<C>) { @A = split(" ", $_); $word_count{$A[0]} = $A[1]; }
     while (<STDIN>) { @A = split(" ", $_);
       $count = shift @A; $word = $A[0]; $freq = sprintf("%0.2f", $count / $word_count{$word});
       $rank = ++$wcount{$word}; # 1 if top observed pron of word, 2 if second...
       $str = (defined $is_pron{join(" ", @A)} ? "CORRECT" : "INCORRECT");
       shift @A;
       print "$count $rank $freq $str $word \"" . join(" ", @A) . "\", ref = \"$prons{$word}\"\n";
     } ' $dir/lexicon.txt $dir/counts.txt  >$dir/pron_info.txt

  # Per word: total count, proportion of the count with a CORRECT pron, and
  # the first pron_info.txt line of that word; sorted by decreasing count.
  grep -v '^;;' $dir/pron_info.txt | \
     awk '{ word=$5; count=$1; if (tot[word] == 0) { first_line[word] = $0; }
            corr[word] += ($4 == "CORRECT" ? count : 0); tot[word] += count; }
          END {for (w in tot) { printf("%s\t%s\t%s\t\t%s\n", tot[w], w, (corr[w]/tot[w]), first_line[w]); }} ' \
     | sort -k1 -nr | cat <( echo ';; <total-count-of-word> <word> <correct-proportion>      <first-corresponding-line-in-pron_info.txt>') - \
      > $dir/word_info.txt
fi

# Stage 11: print a few excerpts of the result files; each command is echoed
# before its output.
if [ $stage -le 11 ]; then
  echo "$0: some of the more interesting stuff in $dir/pron_info.txt follows."
  echo "# grep -w INCORRECT $dir/pron_info.txt  | grep -w 1 | head -n 20"

  grep -w INCORRECT $dir/pron_info.txt  | grep -w 1 | head -n 20

  echo "$0: here are some other interesting things.."
  echo "# grep -w INCORRECT $dir/pron_info.txt  | grep -w 1 | awk '\$3 > 0.4 && \$1 > 10' | head -n 20"
  grep -w INCORRECT $dir/pron_info.txt  | grep -w 1 | awk '$3 > 0.4 && $1 > 10' | head -n 20

  echo "$0: here are some high-frequency words whose reference pronunciations rarely show up."
  echo "# awk '\$3 < 0.1' $dir/word_info.txt  | head -n 20"
  # The extra '$1 == ";;"' condition keeps the header line of word_info.txt.
  awk '$3 < 0.1 || $1 == ";;"' $dir/word_info.txt  | head -n 20


fi


================================================
FILE: egs/steps/cleanup/decode_fmllr_segmentation.sh
================================================
#!/usr/bin/env bash

# Copyright 2014  Guoguo Chen, 2015 GoVivace Inc. (Nagendra Goel)
#           2017  Vimal Manohar
# Apache 2.0

# Similar to steps/cleanup/decode_segmentation.sh, but does fMLLR adaptation.
# Decoding script with per-utterance graph that does fMLLR adaptation.
# This can be on top of delta+delta-delta, or LDA+MLLT features.

# There are 3 models involved potentially in this script,
# and for a standard, speaker-independent system they will all be the same.
# The "alignment model" is for the 1st-pass decoding and to get the
# Gaussian-level alignments for the "adaptation model" the first time we
# do fMLLR.  The "adaptation model" is used to estimate fMLLR transforms
# and to generate state-level lattices.  The lattices are then rescored
# with the "final model".

# The following table explains where we get these 3 models from.
# Note: $srcdir is one level up from the decoding directory.
#
#   Model              Default source:
#
#  "alignment model"   $srcdir/final.alimdl              --alignment-model <model>
#                     (or $srcdir/final.mdl if alimdl absent)
#  "adaptation model"  $srcdir/final.mdl                 --adapt-model <model>
#  "final model"       $srcdir/final.mdl                 --final-model <model>

set -e
set -o pipefail

# Begin configuration section
first_beam=10.0 # Beam used in initial, speaker-indep. pass
first_max_active=2000 # max-active used in initial pass.
alignment_model=
adapt_model=
final_model=
stage=0
acwt=0.083333 # Acoustic weight used in getting fMLLR transforms, and also in
              # lattice generation.
max_active=7000
beam=13.0
lattice_beam=6.0
nj=4
silence_weight=0.01
cmd=run.pl
si_dir=
fmllr_update_type=full
num_threads=1 # if >1, will use gmm-latgen-faster-parallel
parallel_opts=  # ignored now.
skip_scoring=false
scoring_opts=
max_fmllr_jobs=25  # I've seen the fMLLR jobs overload NFS badly if the decoding
                   # was started with a large number of jobs, so we limit the number of
                   # parallel fMLLR jobs to 25 by default.
allow_partial=true
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

if [ $# != 3 ]; then
   echo "$0: This is a special decoding script for segmentation where we"
   echo "use one decoding graph per segment. We assume a file HCLG.fsts.scp exists"
   echo "which is the scp file of the graphs for each segment."
   echo "This will normally be obtained by steps/cleanup/make_biased_lm_graphs.sh."
   echo ""
   echo "Usage: $0 [options] <graph-dir> <data-dir> <decode-dir>"
   echo " e.g.: $0 exp/tri2b/graph_train_si284_split \\"
   echo "             data/train_si284_split exp/tri2b/decode_train_si284_split"
   echo ""
   echo "where <decode-dir> is assumed to be a sub-directory of the directory"
   echo "where the model is."
   echo ""
   echo "main options (for others, see top of script file)"
   echo "  --config <config-file>                   # config containing options"
   echo "  --nj <nj>                                # number of parallel jobs"
   echo "  --cmd <cmd>                              # Command to run in parallel with"
   echo "  --adapt-model <adapt-mdl>                # Model to compute transforms with"
   echo "  --alignment-model <ali-mdl>              # Model to get Gaussian-level alignments for"
   echo "                                           # 1st pass of transform computation."
   echo "  --final-model <finald-mdl>               # Model to finally decode with"
   echo "  --si-dir <speaker-indep-decoding-dir>    # use this to skip 1st pass of decoding"
   echo "                                           # Caution-- must be with same tree"
   echo "  --acwt <acoustic-weight>                 # default 0.08333 ... used to get posteriors"
   echo "  --num-threads <n>                        # number of threads to use, default 1."
   echo "  --scoring-opts <opts>                    # options to local/score.sh"
   exit 1;
fi


graphdir=$1
data=$2
dir=`echo $3 | sed 's:/$::g'` # remove any trailing slash.

srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory.
sdata=$data/split$nj;

thread_string=
[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads"


mkdir -p $dir/log
split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs
# Optional feature-pipeline options stored next to the model.  Any of these
# files may be absent; '|| true' stops 'set -e' from aborting the script when
# 'cat' fails, in which case the corresponding variable is simply left empty.
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` || true  # frame-splicing options.
cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` || true      # cmn/cmvn options.
delta_opts=`cat $srcdir/delta_opts 2>/dev/null` || true    # delta-feature options.

silphonelist=`cat $graphdir/phones/silence.csl` || exit 1;

utils/lang/check_phones_compatible.sh $graphdir/phones.txt $srcdir/phones.txt

# Some checks.  Note: we don't need $srcdir/tree but we expect
# it should exist, given the current structure of the scripts.
for f in $graphdir/HCLG.fsts.scp $data/feats.scp $srcdir/tree; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

# Split HCLG.fsts.scp by input utterance
n1=$(cat $graphdir/HCLG.fsts.scp | wc -l)
n2=$(cat $data/feats.scp | wc -l)
if [ $n1 != $n2 ]; then
  echo "$0: expected $n2 graphs in $graphdir/HCLG.fsts.scp, got $n1"
fi

mkdir -p $dir/split_fsts
sort -k1,1 $graphdir/HCLG.fsts.scp > $dir/HCLG.fsts.sorted.scp
utils/filter_scps.pl --no-warn -f 1 JOB=1:$nj \
  $sdata/JOB/feats.scp $dir/HCLG.fsts.sorted.scp $dir/split_fsts/HCLG.fsts.JOB.scp
HCLG=scp:$dir/split_fsts/HCLG.fsts.JOB.scp


## Work out name of alignment model. ##
# If --alignment-model was not given, default to $srcdir/final.mdl, but prefer
# $srcdir/final.alimdl when that file exists.
if [ -z "$alignment_model" ]; then
  alignment_model=$srcdir/final.mdl
  if [ -f "$srcdir/final.alimdl" ]; then
    alignment_model=$srcdir/final.alimdl
  fi
fi
# Whichever model was chosen (or supplied), it has to exist.
if [ ! -f "$alignment_model" ]; then
  echo "$0: no alignment model $alignment_model "
  exit 1
fi
##

## Do the speaker-independent decoding, if --si-dir option not present. ##
if [ -z "$si_dir" ]; then # we need to do the speaker-independent decoding pass.
  si_dir=${dir}.si # Name it as our decoding dir, but with suffix ".si".
  if [ $stage -le 0 ]; then
    if [ -f "$graphdir/num_pdfs" ]; then
      [ "`cat $graphdir/num_pdfs`" -eq `am-info --print-args=false $alignment_model | grep pdfs | awk '{print $NF}'` ] || \
        { echo "Mismatch in number of pdfs with $alignment_model"; exit 1; }
    fi
    steps/cleanup/decode_segmentation.sh --scoring-opts "$scoring_opts" \
           --num-threads $num_threads --skip-scoring $skip_scoring \
           --acwt $acwt --nj $nj --cmd "$cmd" --beam $first_beam \
           --model $alignment_model --max-active \
           $first_max_active $graphdir $data $si_dir || exit 1;
  fi
fi
##

## Validate the speaker-independent decoding directory and fill in the
## default adaptation / final models.
if [ "$nj" -ne "$(cat $si_dir/num_jobs)" ]; then
  echo "Mismatch in #jobs with si-dir"
  exit 1
fi
if [ ! -f "$si_dir/lat.1.gz" ]; then
  echo "No such file $si_dir/lat.1.gz"
  exit 1
fi
# Both models default to $srcdir/final.mdl unless given on the command line.
adapt_model=${adapt_model:-$srcdir/final.mdl}
final_model=${final_model:-$srcdir/final.mdl}
for mdl in $adapt_model $final_model; do
  if [ ! -f $mdl ]; then
    echo "$0: no such file $mdl"
    exit 1
  fi
done
##

## Set up the unadapted features "$sifeats"
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type";
case $feat_type in
  delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";;
  lda) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";;
  *) echo "Invalid feature type $feat_type" && exit 1;
esac
##

## Now get the first-pass fMLLR transforms.
# Stage 1: estimate the first-pass fMLLR transforms ($dir/pre_trans.JOB).
# Per job, the pipeline below:
#   - reads the speaker-independent lattices $si_dir/lat.JOB.gz,
#   - converts them to posteriors (acoustic scale $acwt) and down-weights
#     the silence phones in $silphonelist by $silence_weight,
#   - converts these to Gaussian-level posteriors using the alignment model
#     on the unadapted features,
#   - estimates per-speaker transforms (via spk2utt) with the adaptation model.
# The '\|' are escaped so the pipe is passed through to $cmd and run inside
# each job; --max-jobs-run caps how many of the $nj jobs run at once.
if [ $stage -le 1 ]; then
  echo "$0: getting first-pass fMLLR transforms."
  $cmd --max-jobs-run $max_fmllr_jobs JOB=1:$nj $dir/log/fmllr_pass1.JOB.log \
    gunzip -c $si_dir/lat.JOB.gz \| \
    lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
    weight-silence-post $silence_weight $silphonelist $alignment_model ark:- ark:- \| \
    gmm-post-to-gpost $alignment_model "$sifeats" ark:- ark:- \| \
    gmm-est-fmllr-gpost --fmllr-update-type=$fmllr_update_type \
    --spk2utt=ark:$sdata/JOB/spk2utt $adapt_model "$sifeats" ark,s,cs:- \
    ark:$dir/pre_trans.JOB || exit 1;
fi
##

pass1feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/pre_trans.JOB ark:- ark:- |"

## Do the main lattice generation pass.  Note: we don't determinize the lattices at
## this stage, as we're going to use them in acoustic rescoring with the larger
## model, and it's more correct to store the full state-level lattice for this purpose.
if [ $stage -le 2 ]; then
  echo "$0: doing main lattice generation phase"
  if [ -f "$graphdir/num_pdfs" ]; then
    [ "`cat $graphdir/num_pdfs`" -eq `am-info --print-args=false $adapt_model | grep pdfs | awk '{print $NF}'` ] || \
      { echo "Mismatch in number of pdfs with $adapt_model"; exit 1; }
  fi
  $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode.JOB.log \
    gmm-latgen-faster$thread_string --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \
    --acoustic-scale=$acwt --determinize-lattice=false \
    --allow-partial=$allow_partial --word-symbol-table=$graphdir/words.txt \
    $adapt_model "$HCLG" "$pass1feats" "ark:|gzip -c > $dir/lat.tmp.JOB.gz"
fi
##

## Do a second pass of estimating the transform-- this time with the lattices
## generated in the main pass above (adaptation model, first-pass transformed
## features).  Compose the transforms to get $dir/trans.1, etc.
if [ $stage -le 3 ]; then
  echo "$0: estimating fMLLR transforms a second time."
  $cmd --max-jobs-run $max_fmllr_jobs JOB=1:$nj $dir/log/fmllr_pass2.JOB.log \
    lattice-determinize-pruned$thread_string --acoustic-scale=$acwt --beam=4.0 \
    "ark:gunzip -c $dir/lat.tmp.JOB.gz|" ark:- \| \
    lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
    weight-silence-post $silence_weight $silphonelist $adapt_model ark:- ark:- \| \
    gmm-est-fmllr --fmllr-update-type=$fmllr_update_type \
    --spk2utt=ark:$sdata/JOB/spk2utt $adapt_model "$pass1feats" \
    ark,s,cs:- ark:$dir/trans_tmp.JOB '&&' \
    compose-transforms --b-is-affine=true ark:$dir/trans_tmp.JOB ark:$dir/pre_trans.JOB \
    ark:$dir/trans.JOB  || exit 1;
fi
##

feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |"

# Rescore the state-level lattices with the final adapted features, and the final model
# (which by default is $srcdir/final.mdl, but which may be specified on the command line,
# useful in case of discriminatively trained systems).
# At this point we prune and determinize the lattices and write them out, ready for
# language model rescoring.

if [ $stage -le 4 ]; then
  echo "$0: doing a final pass of acoustic rescoring."
  $cmd --num-threads $num_threads JOB=1:$nj $dir/log/acoustic_rescore.JOB.log \
    gmm-rescore-lattice $final_model "ark:gunzip -c $dir/lat.tmp.JOB.gz|" "$feats" ark:- \| \
    lattice-determinize-pruned$thread_string --acoustic-scale=$acwt --beam=$lattice_beam ark:- \
    "ark:|gzip -c > $dir/lat.JOB.gz" '&&' rm $dir/lat.tmp.JOB.gz || exit 1;
fi

if ! $skip_scoring ; then
  [ ! -x local/score.sh ] && \
    echo "$0: Not scoring because local/score.sh does not exist or not executable." && exit 1;
  local/score.sh --cmd "$cmd" $scoring_opts $data $graphdir $dir ||
    { echo "$0: Scoring failed. (ignore by '--skip-scoring true')"; exit 1; }
fi

# Remove the intermediate transforms; the composed $dir/trans.* files are kept.
# Use '-f' so that re-running a later stage, when these files have already
# been deleted by an earlier run, does not abort the script under 'set -e'.
rm -f $dir/{trans_tmp,pre_trans}.*

exit 0;


================================================
FILE: egs/steps/cleanup/decode_segmentation.sh
================================================
#!/usr/bin/env bash

# Copyright 2014  Guoguo Chen, 2015 GoVivace Inc. (Nagendra Goel)
#           2017  Vimal Manohar
# Apache 2.0

# Some basic error checking, similar to steps/decode.sh is added.

set -e
set -o pipefail

# Begin configuration section.
transform_dir=   # this option won't normally be used, but it can be used if you
                 # want to supply existing fMLLR transforms when decoding.
iter=
model= # You can specify the model to use (e.g. if you want to use the .alimdl)
stage=0
nj=4
cmd=run.pl
max_active=7000
beam=13.0
lattice_beam=6.0
acwt=0.083333 # note: only really affects pruning (scoring is on lattices).
num_threads=1 # if >1, will use gmm-latgen-faster-parallel
parallel_opts=  # ignored now.
scoring_opts=
allow_partial=true
# note: there are no more min-lmwt and max-lmwt options, instead use
# e.g. --scoring-opts "--min-lmwt 1 --max-lmwt 20"
skip_scoring=false
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

if [ $# != 3 ]; then
   echo "$0: This is a special decoding script for segmentation where we"
   echo "use one decoding graph per segment. We assume a file HCLG.fsts.scp exists"
   echo "which is the scp file of the graphs for each segment."
   echo "This will normally be obtained by steps/cleanup/make_biased_lm_graphs.sh."
   echo "This script does not estimate fMLLR transforms; you have to use"
   echo "the --transform-dir option if you want to use fMLLR."
   echo ""
   echo "Usage: $0 [options] <graph-dir> <data-dir> <decode-dir>"
   echo " e.g.: $0 exp/tri2b/graph_train_si284_split \\"
   echo "             data/train_si284_split exp/tri2b/decode_train_si284_split"
   echo ""
   echo "where <decode-dir> is assumed to be a sub-directory of the directory"
   echo "where the model is."
   echo ""
   echo "main options (for others, see top of script file)"
   echo "  --config <config-file>                           # config containing options"
   echo "  --nj <nj>                                        # number of parallel jobs"
   echo "  --iter <iter>                                    # Iteration of model to test."
   echo "  --model <model>                                  # which model to use (e.g. to"
   echo "                                                   # specify the final.alimdl)"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   echo "  --transform-dir <trans-dir>                      # dir to find fMLLR transforms "
   echo "  --acwt <float>                                   # acoustic scale used for lattice generation "
   echo "  --scoring-opts <string>                          # options to local/score.sh"
   echo "  --num-threads <n>                                # number of threads to use, default 1."
   exit 1;
fi


graphdir=$1
data=$2
dir=$3

mkdir -p $dir/log

if [ -e $dir/final.mdl ]; then
  srcdir=$dir
elif [ -e $dir/../final.mdl ]; then
  srcdir=$(dirname $dir)
else
  echo "$0: expected either $dir/final.mdl or $dir/../final.mdl to exist"
  exit 1
fi
sdata=$data/split$nj;

[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs

# Unless --model <mdl> was given on the command line, use $srcdir/<iter>.mdl,
# where <iter> falls back to "final" when --iter was not specified.
if [ -z "$model" ]; then
  model=$srcdir/${iter:-final}.mdl
fi

if [ $(basename $model) != final.alimdl ] ; then
  # Do not use the $srcpath -- look at the path where the model is
  if [ -f $(dirname $model)/final.alimdl ] && [ -z "$transform_dir" ]; then
    echo -e '\n\n'
    echo $0 'WARNING: Running speaker independent system decoding using a SAT model!'
    echo $0 'WARNING: This is OK if you know what you are doing...'
    echo -e '\n\n'
  fi
fi

for f in $sdata/1/feats.scp $sdata/1/cmvn.scp $model $graphdir/HCLG.fsts.scp; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

utils/lang/check_phones_compatible.sh $graphdir/phones.txt $srcdir/phones.txt

# Split HCLG.fsts.scp by input utterance
n1=$(cat $graphdir/HCLG.fsts.scp | wc -l)
n2=$(cat $data/feats.scp | wc -l)
if [ $n1 != $n2 ]; then
  echo "$0: expected $n2 graphs in $graphdir/HCLG.fsts.scp, got $n1"
fi


mkdir -p $dir/split_fsts
sort -k1,1 $graphdir/HCLG.fsts.scp > $dir/HCLG.fsts.sorted.scp
utils/filter_scps.pl --no-warn -f 1 JOB=1:$nj \
  $sdata/JOB/feats.scp $dir/HCLG.fsts.sorted.scp $dir/split_fsts/HCLG.fsts.JOB.scp
HCLG=scp:$dir/split_fsts/HCLG.fsts.JOB.scp

if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type";

splice_opts=`cat $srcdir/splice_opts 2>/dev/null` || true # frame-splicing options.
cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` || true
delta_opts=`cat $srcdir/delta_opts 2>/dev/null` || true

thread_string=
[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads"

case $feat_type in
  delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";;
  lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";;
  *) echo "Invalid feature type $feat_type" && exit 1;
esac
# Optionally append a transform-feats step to the feature pipeline so that
# existing per-speaker transforms from $transform_dir are applied.
# Requires $transform_dir/trans.1 and a non-empty $transform_dir/num_jobs.
if [ ! -z "$transform_dir" ]; then # add transforms to features...
  echo "Using fMLLR transforms from $transform_dir"
  [ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist" && exit 1
  [ ! -s $transform_dir/num_jobs ] && \
    echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1;
  nj_orig=$(cat $transform_dir/num_jobs)
  if [ $nj -ne $nj_orig ]; then
    # The transforms were written with a different number of jobs, so
    # trans.JOB does not line up with our data split.  Concatenate all of
    # them into a single archive with an scp index and read from that.
    echo "$0: num-jobs for transforms mismatches, so copying them."
    for n in $(seq $nj_orig); do cat $transform_dir/trans.$n; done | \
       copy-feats ark:- ark,scp:$dir/trans.ark,$dir/trans.scp || exit 1;
    feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/trans.scp ark:- ark:- |"
  else
    # Number of jobs matches that of the transform dir: read trans.JOB directly.
    feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
  fi
fi

if [ $stage -le 0 ]; then
  if [ -f "$graphdir/num_pdfs" ]; then
    [ "`cat $graphdir/num_pdfs`" -eq `am-info --print-args=false $model | grep pdfs | awk '{print $NF}'` ] || \
      { echo "Mismatch in number of pdfs with $model"; exit 1; }
  fi
  $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode.JOB.log \
    gmm-latgen-faster$thread_string --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \
    --acoustic-scale=$acwt --allow-partial=$allow_partial --word-symbol-table=$graphdir/words.txt \
    $model "$HCLG" "$feats" "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
fi

if ! $skip_scoring ; then
  [ ! -x local/score.sh ] && \
    echo "$0: Not scoring because local/score.sh does not exist or not executable." && exit 1;
  local/score.sh --cmd "$cmd" $scoring_opts $data $graphdir $dir ||
    { echo "$0: Scoring failed. (ignore by '--skip-scoring true')"; exit 1; }
fi

exit 0;


================================================
FILE: egs/steps/cleanup/decode_segmentation_nnet3.sh
================================================
#!/usr/bin/env bash

# Copyright 2014  Guoguo Chen, 2015 GoVivace Inc. (Nagendra Goel)
#           2017  Vimal Manohar
# Apache 2.0

# This script is similar to steps/cleanup/decode_segmentation.sh, but 
# does decoding using nnet3 model.

set -e
set -o pipefail

# Begin configuration section.
stage=-1
nj=4 # number of decoding jobs.
acwt=0.1  # Just a default value, used for adaptation and beam-pruning..
post_decode_acwt=1.0  # can be used in 'chain' systems to scale acoustics by 10 so the
                      # regular scoring script works.
cmd=run.pl
beam=15.0
frames_per_chunk=50
max_active=7000
min_active=200
ivector_scale=1.0
lattice_beam=8.0  # Beam we use in lattice generation. We can reduce this if 
                  # we only need the best path
iter=final
num_threads=1 # if >1, will use nnet3-latgen-faster-parallel
scoring_opts=
skip_scoring=false
allow_partial=true
extra_left_context=0
extra_right_context=0
extra_left_context_initial=-1
extra_right_context_final=-1
online_ivector_dir=
minimize=false
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. utils/parse_options.sh || exit 1;

if [ $# -ne 3 ]; then
   echo "$0: This is a special decoding script for segmentation where we"
   echo "use one decoding graph per segment. We assume a file HCLG.fsts.scp exists"
   echo "which is the scp file of the graphs for each segment."
   echo "This will normally be obtained by steps/cleanup/make_biased_lm_graphs.sh."
   echo ""
   echo "Usage: $0 [options] <graph-dir> <data-dir> <decode-dir>"
   echo " e.g.: $0 --online-ivector-dir exp/nnet3/ivectors_train_si284_split "
   echo "             exp/nnet3/tdnn/graph_train_si284_split \\"
   echo "             data/train_si284_split exp/nnet3/tdnn/decode_train_si284_split"
   echo ""
   echo "where <decode-dir> is assumed to be a sub-directory of the directory"
   echo "where the model is."
   echo ""
   echo "main options (for others, see top of script file)"
   echo "  --config <config-file>                           # config containing options"
   echo "  --nj <nj>                                        # number of parallel jobs"
   echo "  --iter <iter>                                    # Iteration of model to test."
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   echo "  --acwt <float>                                   # acoustic scale used for lattice generation "
   echo "  --scoring-opts <string>                          # options to local/score.sh"
   echo "  --num-threads <n>                                # number of threads to use, default 1."
   exit 1;
fi


graphdir=$1
data=$2
dir=$3

mkdir -p $dir/log

if [ -e $dir/$iter.mdl ]; then
  srcdir=$dir
elif [ -e $dir/../$iter.mdl ]; then
  srcdir=$(dirname $dir)
else
  echo "$0: expected either $dir/$iter.mdl or $dir/../$iter.mdl to exist"
  exit 1
fi
model=$srcdir/$iter.mdl


extra_files=
if [ ! -z "$online_ivector_dir" ]; then
  steps/nnet2/check_ivectors_compatible.sh $srcdir $online_ivector_dir || exit 1
  extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"
fi

# Compare the phone symbol tables of the graph directory and the model
# directory.  The graph directory argument is stored in $graphdir; $graph_dir
# is never set in this script, so using it would check '/phones.txt' instead.
utils/lang/check_phones_compatible.sh $graphdir/phones.txt $srcdir/phones.txt || exit 1

for f in $graphdir/HCLG.fsts.scp $data/feats.scp $model $extra_files; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

sdata=$data/split$nj;
cmvn_opts=`cat $srcdir/cmvn_opts` || exit 1;
thread_string=
[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads"

mkdir -p $dir/log
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs

# Split HCLG.fsts.scp by input utterance
n1=$(cat $graphdir/HCLG.fsts.scp | wc -l)
n2=$(cat $data/feats.scp | wc -l)
if [ $n1 != $n2 ]; then
  echo "$0: expected $n2 graphs in $graphdir/HCLG.fsts.scp, got $n1"
fi

mkdir -p $dir/split_fsts
sort -k1,1 $graphdir/HCLG.fsts.scp > $dir/HCLG.fsts.sorted.scp
utils/filter_scps.pl --no-warn -f 1 JOB=1:$nj \
  $sdata/JOB/feats.scp $dir/HCLG.fsts.sorted.scp $dir/split_fsts/HCLG.fsts.JOB.scp
HCLG=scp:$dir/split_fsts/HCLG.fsts.JOB.scp

## Set up features.
echo "$0: feature type is raw"

feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"

if [ ! -z "$online_ivector_dir" ]; then
  ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1;
  ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period"
fi

# Lattices are written gzipped to $dir/lat.JOB.gz.  Unless --post-decode-acwt
# is exactly the string "1.0", the acoustic scores are rescaled with
# lattice-scale before compression.
lat_wspecifier="ark:|gzip -c >$dir/lat.JOB.gz"
if [ "$post_decode_acwt" != 1.0 ]; then
  lat_wspecifier="ark:|lattice-scale --acoustic-scale=$post_decode_acwt ark:- ark:- | gzip -c >$dir/lat.JOB.gz"
fi

frame_subsampling_opt=
if [ -f $srcdir/frame_subsampling_factor ]; then
  # e.g. for 'chain' systems
  frame_subsampling_opt="--frame-subsampling-factor=$(cat $srcdir/frame_subsampling_factor)"
fi

if [ $stage -le 1 ]; then
  if [ -f "$graphdir/num_pdfs" ]; then
    [ "`cat $graphdir/num_pdfs`" -eq `am-info --print-args=false $model | grep pdfs | awk '{print $NF}'` ] || \
      { echo "Mismatch in number of pdfs with $model"; exit 1; }
  fi
  $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode.JOB.log \
    nnet3-latgen-faster$thread_string $ivector_opts $frame_subsampling_opt \
     --frames-per-chunk=$frames_per_chunk \
     --extra-left-context=$extra_left_context \
     --extra-right-context=$extra_right_context \
     --extra-left-context-initial=$extra_left_context_initial \
     --extra-right-context-final=$extra_right_context_final \
     --minimize=$minimize --max-active=$max_active --min-active=$min_active --beam=$beam \
     --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=$allow_partial \
     --word-symbol-table=$graphdir/words.txt "$model" \
     "$HCLG" "$feats" "$lat_wspecifier" || exit 1;
fi


# Stage 2: score the lattices with local/score.sh unless --skip-scoring true.
if [ $stage -le 2 ] && ! $skip_scoring; then
  if [ ! -x local/score.sh ]; then
    echo "$0: Not scoring because local/score.sh does not exist or not executable."
    exit 1
  fi
  # Only pass --iter to the scoring script for a non-default model iteration.
  iter_opt=
  if [ "$iter" != "final" ]; then
    iter_opt="--iter $iter"
  fi
  if ! local/score.sh $iter_opt $scoring_opts --cmd "$cmd" $data $graphdir $dir; then
    echo "$0: Scoring failed. (ignore by '--skip-scoring true')"
    exit 1
  fi
fi
echo "Decoding done."
exit 0;


================================================
FILE: egs/steps/cleanup/find_bad_utts.sh
================================================
#!/usr/bin/env bash
# Copyright 2012-2014  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0

# Computes training alignments using a model with delta or
# LDA+MLLT features.  This version, rather than just using the
# text to align, computes mini-language models (unigram) from the text
# and a few common words in the LM.

# Begin configuration section.
nj=4
cmd=run.pl
use_graphs=false
# Begin configuration.
scale_opts="--transition-scale=1.0 --self-loop-scale=0.1"
acoustic_scale=0.1
beam=15.0
lattice_beam=8.0
max_active=750
transform_dir=  # directory to find fMLLR transforms in.
top_n_words=100 # Number of common words that we compile into each graph (the most
                # frequent words in $data/text).
stage=-1
cleanup=true
# End configuration options.

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh # source the path.
. parse_options.sh || exit 1;

if [ $# != 4 ]; then
  echo "$0: Warning: this script is deprecated and will be removed."
  echo "  ... please use steps/cleanup/clean_and_segment_data.sh,"
  echo " which produces the same output formats as this script"
  echo " (e.g. all_info.sorted.txt)"
  echo "Usage: $0 <data-dir> <lang-dir> <src-dir> <dir>"
  echo "e.g.:  $0 data/train data/lang exp/tri1 exp/tri1_debug"
  echo "main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config containing options"
  echo "  --nj <nj>                                        # number of parallel jobs"
  echo "  --use-graphs true                                # use graphs in src-dir"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  exit 1;
fi

data=$1
lang=$2
srcdir=$3
dir=$4

for f in $data/text $lang/oov.int $srcdir/tree $srcdir/final.mdl \
    $lang/L_disambig.fst $lang/phones/disambig.int; do
  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1;
done

oov=`cat $lang/oov.int` || exit 1;
mkdir -p $dir/log
echo $nj > $dir/num_jobs
sdata=$data/split$nj
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options.
cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
cp $srcdir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option.

utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;

cp $srcdir/{tree,final.mdl} $dir || exit 1;
cp $srcdir/final.occs $dir;


# Stage 0: compute unigram statistics of the training transcripts.
#   $dir/word_counts.int : "<count> <word-id>" lines, most frequent first
#   $dir/top_words.int   : "<relative-frequency> <word-id>" for the top
#                          $top_n_words words
#   $dir/top_words.txt   : the same, with word-ids mapped back to symbols
if [ $stage -le 0 ]; then
  # Map the transcripts to integer word-ids (OOVs mapped to $oov), emit one
  # word-id per line (skipping the utterance-id in field 1), and count them.
  utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt <$data/text | \
    awk '{for(x=2;x<=NF;x++) print $x;}' | sort | uniq -c | \
    sort -rn > $dir/word_counts.int || exit 1;
  # Total number of word tokens, used to normalize the counts below.
  num_words=$(awk '{x+=$1} END{print x}' < $dir/word_counts.int) || exit 1;
  # print top-n words with their unigram probabilities.

  head -n $top_n_words $dir/word_counts.int | awk -v tot=$num_words '{print $1/tot, $2;}' >$dir/top_words.int
  utils/int2sym.pl -f 2 $lang/words.txt <$dir/top_words.int >$dir/top_words.txt
fi

if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"

case $feat_type in
  delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
  lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
    cp $srcdir/final.mat $srcdir/full.mat $dir
   ;;
  *) echo "$0: invalid feature type $feat_type" && exit 1;
esac
if [ -z "$transform_dir" ] && [ -f $srcdir/trans.1 ]; then
  transform_dir=$srcdir
fi
if [ ! -z "$transform_dir" ]; then
  echo "$0: using transforms from $transform_dir"
  [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1;
  nj_orig=$(cat $transform_dir/num_jobs)
  if [ $nj -ne $nj_orig ]; then
    # Copy the transforms into an archive with an index.
    for n in $(seq $nj_orig); do cat $transform_dir/trans.$n; done | \
      copy-feats ark:- ark,scp:$dir/trans.ark,$dir/trans.scp || exit 1;
    feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/trans.scp ark:- ark:- |"
  else
    # number of jobs matches with alignment dir.
    feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
  fi
elif [ -f $srcdir/final.alimdl ]; then
  echo "$0: **WARNING**: you seem to be using an fMLLR system as input,"
  echo "  but you are not providing the --transform-dir option during alignment."
fi


if [ $stage -le 1 ]; then
  echo "$0: decoding $data using utterance-specific decoding graphs using model from $srcdir, output in $dir"

  rm $dir/edits.*.txt $dir/aligned_ref.*.txt 2>/dev/null

  $cmd JOB=1:$nj $dir/log/decode.JOB.log \
    utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text \| \
    steps/cleanup/make_utterance_fsts.pl $dir/top_words.int \| \
    compile-train-graphs-fsts $scale_opts --read-disambig-syms=$lang/phones/disambig.int \
     $dir/tree $dir/final.mdl $lang/L_disambig.fst ark:- ark:- \| \
    gmm-latgen-faster --acoustic-scale=$acoustic_scale --beam=$beam \
      --max-active=$max_active --lattice-beam=$lattice_beam \
      --word-symbol-table=$lang/words.txt \
     $dir/final.mdl ark:- "$feats" ark:- \| \
    lattice-oracle ark:- "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|" \
      ark,t:- ark,t:$dir/edits.JOB.txt \| \
    utils/int2sym.pl -f 2- $lang/words.txt '>' $dir/aligned_ref.JOB.txt || exit 1;
fi


if [ $stage -le 2 ]; then
  if [ -f $dir/edits.1.txt ]; then
    # the awk commands below are to ensure that partially-written files don't confuse us.
    for x in $(seq $nj); do cat $dir/edits.$x.txt; done | awk '{if(NF==2){print;}}' > $dir/edits.txt
    for x in $(seq $nj); do cat $dir/aligned_ref.$x.txt; done | awk '{if(NF>=1){print;}}' > $dir/aligned_ref.txt
  else
    echo "$0: warning: no file $dir/edits.1.txt, using previously concatenated file if present."
  fi

  # in case any utterances failed to align, get filtered copy of $data/text
  utils/filter_scp.pl $dir/edits.txt < $data/text  > $dir/text
  cat $dir/text | awk '{print $1, (NF-1);}' > $dir/length.txt

  n1=$(wc -l < $dir/edits.txt)
  n2=$(wc -l < $dir/aligned_ref.txt)
  n3=$(wc -l < $dir/text)
  n4=$(wc -l < $dir/length.txt)
  if [ $n1 -ne $n2 ] || [ $n2 -ne $n3 ] || [ $n3 -ne $n4 ]; then
    echo "$0: mismatch in lengths of files:"
    wc $dir/edits.txt $dir/aligned_ref.txt $dir/text $dir/length.txt
    exit 1;
  fi

  # note: the format of all_info.txt is:
  # <utterance-id>   <number of errors>  <reference-length>  <decoded-output>   <reference>
  # with the fields separated by tabs, e.g.
  # adg04_sr009_trn 1 	12	 SHOW THE GRIDLEY+S TRACK IN BRIGHT ORANGE WITH HORNE+S IN DIM RED AT	 SHOW THE GRIDLEY+S TRACK IN BRIGHT ORANGE WITH HORNE+S IN DIM RED

  paste $dir/edits.txt \
      <(awk '{print $2}' $dir/length.txt) \
      <(awk '{$1="";print;}' <$dir/aligned_ref.txt) \
      <(awk '{$1="";print;}' <$dir/text) > $dir/all_info.txt

  sort -nr -k2 $dir/all_info.txt > $dir/all_info.sorted.txt

  if $cleanup; then
    rm $dir/edits.*.txt $dir/aligned_ref.*.txt
  fi

fi

if [ $stage -le 3 ]; then
  ###
  # These stats might help people figure out what is wrong with the data
  # a)human-friendly and machine-parsable alignment in the file per_utt_details.txt
  # b)evaluation of per-speaker performance to possibly find speakers with
  #   distinctive accents/speech disorders and similar
  # c)Global analysis on (Ins/Del/Sub) operation, which might be used to figure
  #   out if there is systematic issue with lexicon, pronunciation or phonetic confusability

  mkdir -p $dir/analysis
  align-text --special-symbol="***"  ark:$dir/text ark:$dir/aligned_ref.txt  ark,t:- | \
    utils/scoring/wer_per_utt_details.pl --special-symbol "***" > $dir/analysis/per_utt_details.txt

  cat $dir/analysis/per_utt_details.txt | \
    utils/scoring/wer_per_spk_details.pl $data/utt2spk > $dir/analysis/per_spk_details.txt

  cat $dir/analysis/per_utt_details.txt | \
    utils/scoring/wer_ops_details.pl --special-symbol "***" | \
    sort -i -b -k1,1 -k4,4nr -k2,2 -k3,3 > $dir/analysis/ops_details.txt

fi


================================================
FILE: egs/steps/cleanup/find_bad_utts_nnet.sh
================================================
#!/usr/bin/env bash
# Copyright 2012-2014  Johns Hopkins University (Author: Daniel Povey)
#           2016       Api.ai (Author: Ilya Platonov)      
# Apache 2.0
#
# Tweaked version of find_bad_utts.sh to work with nnet2 and nnet3(supports chain models) non-ivector models.
# This script uses nnet-info and nnet3-am-info to determine type of nnet (nnet2 or nnet3).
# Use --acoustic-scale=1.0 for chain models.
#
# Begin configuration section.
# Every variable below is a default that can be overridden from the command
# line as --<name> <value>; the overrides are applied by parse_options.sh,
# which is sourced further down.
nj=8
cmd=run.pl
# NOTE(review): use_graphs is advertised in the usage message but is not read
# anywhere else in this script.
use_graphs=false
# Begin configuration.
scale_opts="--transition-scale=1.0 --self-loop-scale=0.1"
acoustic_scale=0.1
beam=15.0
lattice_beam=8.0
max_active=750
transform_dir=  # directory to find fMLLR transforms in.
                # NOTE(review): not read anywhere else in this script.
top_n_words=100 # Number of common words that we compile into each graph (the
                # most frequent words in $data/text, counted in stage 0).
stage=-1
cleanup=true
# End configuration options.

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh # source the path.
. parse_options.sh || exit 1;

# Exactly four positional arguments must remain after option parsing.
if [ $# != 4 ]; then
   echo "usage: $0 <data-dir> <lang-dir> <src-dir> <dir>"
   echo "e.g.:  $0 data/train data/lang exp/tri1 exp/tri1_debug"
   echo "main options (for others, see top of script file)"
   echo "  --config <config-file>                           # config containing options"
   echo "  --nj <nj>                                        # number of parallel jobs"
   echo "  --use-graphs true                                # use graphs in src-dir"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   exit 1;
fi

# Positional arguments: data directory, lang directory, source model
# directory and output directory.
data=$1
lang=$2
srcdir=$3
dir=$4

# Fail early if any required input file is missing.
for f in $data/text $lang/oov.int $srcdir/tree $srcdir/final.mdl \
    $lang/L_disambig.fst $lang/phones/disambig.int; do
  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1;
done

# Integer id that out-of-vocabulary words are mapped to (see --map-oov below).
oov=`cat $lang/oov.int` || exit 1;
mkdir -p $dir/log
echo $nj > $dir/num_jobs
sdata=$data/split$nj
# splice_opts and cmvn_opts are optional in $srcdir: errors from a missing
# file are discarded, leaving the variable empty.
# NOTE(review): $splice_opts is not used after this point in this script.
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options.
cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
cp $srcdir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option.

# Re-split the data unless the split directory exists and feats.scp is older
# than it.
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;

# The decoding stage reads the tree and model from $dir, not from $srcdir.
cp $srcdir/{tree,final.mdl} $dir || exit 1;

utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

# Determine the type of the neural net in $srcdir/final.mdl by probing it
# with the nnet2 info tool first and the nnet3 one second; the first tool
# that accepts the model decides which lattice generator is used in stage 1.
if nnet-info 1>/dev/null 2>/dev/null $srcdir/final.mdl; then
  nnet_type="nnet";
  latgen_cmd="nnet-latgen-faster";
elif nnet3-am-info 1>/dev/null 2>/dev/null $srcdir/final.mdl; then
  nnet_type="nnet3"
  frame_subsampling_factor=1;
  nnet3_opt=
  # An optional file in $srcdir overrides the default factor of 1.
  if [ -f $srcdir/frame_subsampling_factor ]; then
    frame_subsampling_factor="$(cat $srcdir/frame_subsampling_factor)"
  fi
  # Only pass the option when the factor differs from the default.
  if [ "$frame_subsampling_factor" != "1" ]; then
    nnet3_opt="--frame-subsampling-factor=$frame_subsampling_factor";
  fi
  latgen_cmd="nnet3-latgen-faster $nnet3_opt";
else
  echo "Unsupported type of nnet for $srcdir/final.mdl";
  # Without a decoder command the later stages cannot work, so stop here
  # instead of failing obscurely inside the decoding pipeline.
  exit 1;
fi

echo "nnet type is $nnet_type";


# Stage 0: find the $top_n_words most frequent words in $data/text and write
# them, with their relative frequencies, to $dir/top_words.{int,txt}.
if [ $stage -le 0 ]; then
  # Map the transcripts to integer word ids (OOVs mapped to $oov), emit one
  # word id per line, and count occurrences, most frequent first.
  utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt <$data/text | \
    awk '{for(x=2;x<=NF;x++) print $x;}' | sort | uniq -c | \
    sort -rn > $dir/word_counts.int || exit 1;
  # Total number of word tokens, used to normalise the counts below.
  num_words=$(awk '{x+=$1} END{print x}' < $dir/word_counts.int) || exit 1;
  # print top-n words with their unigram probabilities.

  head -n $top_n_words $dir/word_counts.int | awk -v tot=$num_words '{print $1/tot, $2;}' >$dir/top_words.int
  utils/int2sym.pl -f 2 $lang/words.txt <$dir/top_words.int >$dir/top_words.txt
fi

echo "$0: feature type is raw"

# Per-job feature pipeline: the features are only mean/variance normalised
# with apply-cmvn. "JOB" is a placeholder; presumably $cmd substitutes the
# job index for it -- verify against the run.pl/queue.pl in use.
feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |";

# Stage 1: decode every utterance with its own utterance-specific graph and
# compare the resulting lattice with the reference transcript.
if [ $stage -le 1 ]; then
  echo "$0: decoding $data using utterance-specific decoding graphs using model from $srcdir, output in $dir"

  # Remove per-job outputs of any previous run; errors for missing files are
  # discarded.
  rm $dir/edits.*.txt $dir/aligned_ref.*.txt 2>/dev/null

  # The pipes (\|) and the redirection ('>') are escaped/quoted so that they
  # reach $cmd as literal arguments instead of being interpreted by this
  # shell. The per-job pipeline is:
  #   1. map the job's transcripts to integer word ids;
  #   2. build one FST per utterance from the transcript and top_words.int;
  #   3. compile those FSTs into decoding graphs;
  #   4. generate lattices with the decoder selected above ($latgen_cmd);
  #   5. lattice-oracle compares each lattice with the integer reference,
  #      writing one text archive to $dir/edits.JOB.txt and another to stdout
  #      (presumably edit counts and the closest lattice path, respectively);
  #   6. the stdout archive is mapped back to words in aligned_ref.JOB.txt.
  $cmd JOB=1:$nj $dir/log/decode.JOB.log \
    utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text \| \
    steps/cleanup/make_utterance_fsts.pl $dir/top_words.int \| \
    compile-train-graphs-fsts $scale_opts --read-disambig-syms=$lang/phones/disambig.int \
     $dir/tree $dir/final.mdl $lang/L_disambig.fst ark:- ark:- \| \
    $latgen_cmd --acoustic-scale=$acoustic_scale --beam=$beam \
      --max-active=$max_active --lattice-beam=$lattice_beam \
      --word-symbol-table=$lang/words.txt \
     $dir/final.mdl ark:- "$feats" ark:- \| \
    lattice-oracle ark:- "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|" \
      ark,t:- ark,t:$dir/edits.JOB.txt \| \
    utils/int2sym.pl -f 2- $lang/words.txt '>' $dir/aligned_ref.JOB.txt || exit 1;
fi


# Stage 2: merge the per-job outputs of stage 1 and build the per-utterance
# summary files all_info.txt and all_info.sorted.txt.
if [ $stage -le 2 ]; then
  if [ -f $dir/edits.1.txt ]; then
    # the awk commands below are to ensure that partially-written files don't confuse us.
    for x in $(seq $nj); do cat $dir/edits.$x.txt; done | awk '{if(NF==2){print;}}' > $dir/edits.txt
    for x in $(seq $nj); do cat $dir/aligned_ref.$x.txt; done | awk '{if(NF>=1){print;}}' > $dir/aligned_ref.txt
  else
    # The per-job files are deleted at the end of this stage when
    # cleanup=true, so a re-run falls back to the merged files.
    echo "$0: warning: no file $dir/edits.1.txt, using previously concatenated file if present."
  fi

  # in case any utterances failed to align, get filtered copy of $data/text
  utils/filter_scp.pl $dir/edits.txt < $data/text  > $dir/text
  # length.txt: <utterance-id> <number of words in the reference transcript>
  cat $dir/text | awk '{print $1, (NF-1);}' > $dir/length.txt

  # The four files are pasted together line by line below, so they must all
  # have the same number of lines.
  n1=$(wc -l < $dir/edits.txt)
  n2=$(wc -l < $dir/aligned_ref.txt)
  n3=$(wc -l < $dir/text)
  n4=$(wc -l < $dir/length.txt)
  if [ $n1 -ne $n2 ] || [ $n2 -ne $n3 ] || [ $n3 -ne $n4 ]; then
    echo "$0: mismatch in lengths of files:"
    wc $dir/edits.txt $dir/aligned_ref.txt $dir/text $dir/length.txt
    exit 1;
  fi

  # note: the format of all_info.txt is:
  # <utterance-id>   <number of errors>  <reference-length>  <decoded-output>   <reference>
  # with the fields separated by tabs, e.g.
  # adg04_sr009_trn 1 	12	 SHOW THE GRIDLEY+S TRACK IN BRIGHT ORANGE WITH HORNE+S IN DIM RED AT	 SHOW THE GRIDLEY+S TRACK IN BRIGHT ORANGE WITH HORNE+S IN DIM RED
  
  paste $dir/edits.txt \
      <(awk '{print $2}' $dir/length.txt) \
      <(awk '{$1="";print;}' <$dir/aligned_ref.txt) \
      <(awk '{$1="";print;}' <$dir/text) > $dir/all_info.txt

  # Utterances with the largest number of errors come first.
  sort -nr -k2 $dir/all_info.txt > $dir/all_info.sorted.txt

  if $cleanup; then
    rm $dir/edits.*.txt $dir/aligned_ref.*.txt
  fi

fi

# Stage 3: write per-utterance, per-speaker and per-operation error
# statistics to $dir/analysis.
if [ $stage -le 3 ]; then
  ###
  # These stats might help people figure out what is wrong with the data
  # a)human-friendly and machine-parsable alignment in the file per_utt_details.txt
  # b)evaluation of per-speaker performance to possibly find speakers with
  #   distinctive accents/speech disorders and similar
  # c)Global analysis on (Ins/Del/Sub) operation, which might be used to figure
  #   out if there is systematic issue with lexicon, pronunciation or phonetic confusability

  mkdir -p $dir/analysis
  # Align the reference ($dir/text) with the decoded output
  # ($dir/aligned_ref.txt); "***" is the symbol passed to both tools as the
  # special (padding) symbol.
  align-text --special-symbol="***"  ark:$dir/text ark:$dir/aligned_ref.txt  ark,t:- | \
    utils/scoring/wer_per_utt_details.pl --special-symbol "***" > $dir/analysis/per_utt_details.txt

  cat $dir/analysis/per_utt_details.txt | \
    utils/scoring/wer_per_spk_details.pl $data/utt2spk > $dir/analysis/per_spk_details.txt

  cat $dir/analysis/per_utt_details.txt | \
    utils/scoring/wer_ops_details.pl --special-symbol "***" | \
    sort -i -b -k1,1 -k4,4nr -k2,2 -k3,3 > $dir/analysis/ops_details.txt

fi


================================================
FILE: egs/steps/cleanup/internal/align_ctm_ref.py
================================================
#! /usr/bin/env python

# Copyright 2016    Vimal Manohar
#           2020    Dongji Gao
# Apache 2.0.

"""This module aligns a hypothesis (CTM or text) with a reference to
find the best matching sub-sequence in the reference for the hypothesis
using Smith-Waterman like alignment.

e.g.: align_ctm_ref.py --hyp-format=CTM --ref=data/train/text --hyp=foo/ctm
        --output=foo/ctm_edits
"""

from __future__ import print_function
import argparse
import logging
import sys

sys.path.insert(0, 'steps')
import libs.common as common_lib

# Module-level logging setup. The handler is created and given its formatter
# here, but it is only attached to the logger (and given its level) inside
# get_args(), once the requested verbosity is known.
logger = logging.getLogger(__name__)
handler = logging.StreamHandler()
formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - "
                              "%(funcName)s - %(levelname)s ] %(message)s")
handler.setFormatter(formatter)
logger.setLevel(logging.DEBUG)

# Overwritten by get_args() with the value of --verbose; read by
# smith_waterman_alignment() to decide whether to dump the score matrix.
verbose_level = 0


def get_args():
    """Parse the command line and configure logging.

    Besides building and running the argument parser, this function
    converts --debug-only from the strings "true"/"false" to a bool,
    stores --verbose in the module-level verbose_level, sets the level of
    the module-level log handler (DEBUG when --verbose is greater than 2,
    INFO otherwise) and attaches that handler to the module logger.

    Returns the argparse namespace. --ref, --hyp and --output (and the
    optional --reco2file-and-channel and --symbol-table) are already opened
    file objects because they are declared with argparse.FileType.
    """
    parser = argparse.ArgumentParser(description="""
    This module aligns a hypothesis (CTM or text) with a reference to find the
    best matching sub-sequence in the reference for the hypothesis using
    Smith-Waterman like alignment.

    e.g.: align_ctm_ref.py --align-full-hyp=false --hyp-format=CTM
    --reco2file-and-channel=data/foo/reco2file_and_channel --ref=data/train/text
    --hyp=foo/ctm --output=foo/ctm_edits
    """)

    # Input format and symbol handling.
    parser.add_argument("--hyp-format", type=str, choices=["Text", "CTM"],
                        default="CTM",
                        help="Format used for the hypothesis")
    parser.add_argument("--reco2file-and-channel", type=argparse.FileType('r'),
                        help="""reco2file_and_channel file.
                        This will be used to match references that are usually
                        indexed by the recording-id with the CTM lines that have
                        file and channel. This option is typically not
                        required.""")
    parser.add_argument("--eps-symbol", type=str, default="-",
                        help="Symbol used to contain alignment "
                        "to empty symbol")
    parser.add_argument("--oov-word", type=str, default=None,
                        action=common_lib.NullstrToNoneAction,
                        help="Symbol of OOV word in hypothesis")
    parser.add_argument("--symbol-table", type=argparse.FileType('r'),
                        help="""Symbol table for words in vocabulary. Used
                        to determine if a word is a OOV or not""")

    # Alignment scores. The penalties are given as positive numbers; run()
    # negates them before passing them to the aligner.
    parser.add_argument("--correct-score", type=int, default=1,
                        help="Score for correct matches")
    parser.add_argument("--substitution-penalty", type=int, default=1,
                        help="Penalty for substitution errors")
    parser.add_argument("--deletion-penalty", type=int, default=1,
                        help="Penalty for deletion errors")
    parser.add_argument("--insertion-penalty", type=int, default=1,
                        help="Penalty for insertion errors")

    parser.add_argument("--align-full-hyp", type=str,
                        action=common_lib.StrToBoolAction,
                        choices=["true", "false"], default=True,
                        help="""Align full hypothesis i.e. trackback from
                        the end to get the alignment. This is different
                        from the normal Smith-Waterman alignment, where the
                        traceback will be from the maximum score.""")

    # Debugging and verbosity.
    parser.add_argument("--debug-only", type=str, default="false",
                        choices=["true", "false"],
                        help="Run test functions only")
    parser.add_argument("--verbose", type=int, default=0,
                        choices=[0, 1, 2, 3],
                        help="Use larger value for more verbose logging.")

    # Input and output files.
    parser.add_argument("--ref", dest='ref_in_file',
                        type=argparse.FileType('r'), required=True,
                        help="Reference text file")
    parser.add_argument("--hyp", dest='hyp_in_file', required=True,
                        type=argparse.FileType('r'),
                        help="Hypothesis text or CTM file")
    parser.add_argument("--output", dest='alignment_out_file', required=True,
                        type=argparse.FileType('w'),
                        help="""File to write output alignment.
                        If hyp-format=CTM, then the output is in the form of
                        CTM, but with two additional columns of Edit-type and
                        Reference-word matched to the hypothesis.""")

    args = parser.parse_args()

    # --debug-only arrives as the string "true" or "false".
    args.debug_only = bool(args.debug_only == "true")

    global verbose_level
    verbose_level = args.verbose
    if args.verbose > 2:
        handler.setLevel(logging.DEBUG)
    else:
        handler.setLevel(logging.INFO)
    logger.addHandler(handler)

    return args


def read_text(text_file):
    """Read a kaldi-format text file and yield (utterance_id, words) pairs.

    The first column of each line is the utterance-id; the remaining
    columns are the transcript, yielded as a list of words. A line that
    has only an utterance-id yields an empty list and logs a warning.

    Args:
        text_file: an open, iterable text file object. It is closed when
            the generator finishes, is closed early, or raises.

    Yields:
        Tuples (utterance_id, list_of_words), in file order.

    Raises:
        RuntimeError: if a line contains no columns at all (e.g. a blank
            line).
    """
    # try/finally guarantees the handle is released even when the consumer
    # stops iterating early or an error is raised part-way through.
    try:
        for line in text_file:
            parts = line.strip().split()
            if len(parts) < 1:
                raise RuntimeError(
                    "Did not get enough columns; line {0} in {1}"
                    "".format(line, text_file.name))
            elif len(parts) == 1:
                # logger.warning replaces the deprecated logger.warn alias.
                logger.warning("Empty transcript for utterance %s in %s",
                               parts[0], text_file.name)
                yield parts[0], []
            else:
                yield parts[0], parts[1:]
    finally:
        text_file.close()


def read_ctm(ctm_file, file_and_channel2reco=None):
    """Read a CTM file and yield (utterance_id, ctm_lines) pairs.

    Consecutive CTM lines that belong to the same utterance are grouped;
    a group is yielded when the utterance changes and once more at the end
    of the file. Each element of ctm_lines is
        [start (float), duration (float), word, confidence],
    where the confidence is 1.0 (float) when the input line has only five
    fields and is otherwise left as read from the file (a string).

    Note: *_reco in the variables usually correspond to utterances rather
    than recordings.

    Args:
        ctm_file: an open, iterable text file object. It is closed when
            the generator finishes, is closed early, or raises.
        file_and_channel2reco: optional dict mapping (file, channel) to the
            utterance-id. When None, the first CTM column is used as the
            utterance-id and the channel column must be '1'.

    Raises:
        Whatever error occurs while parsing a line (e.g. ValueError for a
        wrong number of fields or a channel other than '1'); the offending
        line is logged before the error is re-raised.
    """
    prev_reco = ""
    ctm_lines = []
    # try/finally guarantees the handle is released even when the consumer
    # stops iterating early or a malformed line raises.
    try:
        for line in ctm_file:
            try:
                parts = line.strip().split()
                parts[2] = float(parts[2])
                parts[3] = float(parts[3])

                if len(parts) == 5:
                    parts.append(1.0)   # confidence defaults to 1.0.

                if len(parts) != 6:
                    raise ValueError("CTM must have 6 fields.")

                if file_and_channel2reco is None:
                    reco = parts[0]
                    if parts[1] != '1':
                        raise ValueError("Channel should be 1, "
                                         "got {0}".format(parts[1]))
                else:
                    reco = file_and_channel2reco[(parts[0], parts[1])]
                if prev_reco != "" and reco != prev_reco:
                    # New recording
                    yield prev_reco, ctm_lines
                    ctm_lines = []
                # Drop the file/utterance and channel columns.
                ctm_lines.append(parts[2:])
                prev_reco = reco
            except Exception:
                logger.error("Error in processing CTM line {0}".format(line))
                raise
        if prev_reco != "" and len(ctm_lines) > 0:
            yield prev_reco, ctm_lines
    finally:
        ctm_file.close()


def smith_waterman_alignment(ref, hyp, similarity_score_function,
                             del_score, ins_score,
                             eps_symbol="<eps>", align_full_hyp=True):
    """Does Smith-Waterman alignment of reference sequence and hypothesis
    sequence.
    This is a special case of the Smith-Waterman alignment that assumes that
    the deletion and insertion costs are linear with number of incorrect words.

    If align_full_hyp is True, then the traceback of the alignment
    is started at the end of the hypothesis. This is when we want the
    reference that aligns with the full hypothesis.
    This differs from the normal Smith-Waterman alignment, where the traceback
    is from the highest score in the alignment score matrix. This
    can be obtained by setting align_full_hyp as False. This gets only the
    sub-sequence of the hypothesis that best matches with a
    sub-sequence of the reference.

    Args:
        ref: reference sequence (indexable, e.g. a list of words).
        hyp: hypothesis sequence (indexable, e.g. a list of words).
        similarity_score_function: callable taking (ref_word, hyp_word) and
            returning the score added for pairing the two.
        del_score: score added for a deletion, i.e. a reference word that
            is paired with no hypothesis word.
        ins_score: score added for an insertion, i.e. a hypothesis word
            that is paired with no reference word.
        eps_symbol: symbol placed on the empty side of a deletion or an
            insertion in the output.
        align_full_hyp: see above.

    Returns a tuple (output, max_score), where max_score is the score of
    the cell the traceback started from and output is a list of tuples,
    in sequence order, each of the format:
        (ref_word, hyp_word, ref_word_from_index, hyp_word_from_index,
         ref_word_to_index, hyp_word_to_index)

    Raises:
        RuntimeError: if the traceback meets an inconsistent back-pointer.
    """
    output = []

    ref_len = len(ref)
    hyp_len = len(hyp)

    # Back-pointer matrix of size (ref_len + 1) x (hyp_len + 1):
    # bp[m][n] is the (ref_index, hyp_index) cell that H[m][n] was
    # derived from.
    bp = [[] for x in range(ref_len+1)]

    # Score matrix of size (ref_len + 1) x (hyp_len + 1)
    # The index m, n in this matrix corresponds to the score
    # of the best matching sub-sequence pair between reference and hypothesis
    # ending with the reference word ref[m-1] and hypothesis word hyp[n-1].
    # If align_full_hyp is True, then the hypothesis sub-sequence is from
    # the 0th word i.e. hyp[0].
    H = [[] for x in range(ref_len+1)]

    for ref_index in range(ref_len+1):
        if align_full_hyp:
            # Cells start at a low score of -(hyp_len+2), except column 0,
            # which is 0 in every row: skipping a reference prefix costs
            # nothing.
            H[ref_index] = [-(hyp_len+2) for x in range(hyp_len+1)]
            H[ref_index][0] = 0
        else:
            # Local alignment: every cell starts at 0.
            H[ref_index] = [0 for x in range(hyp_len+1)]
        bp[ref_index] = [(0, 0) for x in range(hyp_len+1)]

        if align_full_hyp and ref_index == 0:
            # Row 0: hypothesis words that precede any reference word can
            # only be insertions, so the insertion score accumulates.
            for hyp_index in range(1, hyp_len+1):
                H[0][hyp_index] = H[0][hyp_index-1] + ins_score
                bp[ref_index][hyp_index] = (ref_index, hyp_index-1)
                logger.debug(
                    "({0},{1}) -> ({2},{3}): {4}"
                    "".format(ref_index, hyp_index-1, ref_index, hyp_index,
                              H[ref_index][hyp_index]))

    # Best-scoring cell seen so far; the traceback starts from it.
    max_score = -float("inf")
    max_score_element = (0, 0)

    for ref_index in range(1, ref_len+1):     # Reference
        for hyp_index in range(1, hyp_len+1):     # Hypothesis
            # Diagonal move: pair ref[ref_index-1] with hyp[hyp_index-1].
            sub_or_ok = (H[ref_index-1][hyp_index-1]
                         + similarity_score_function(ref[ref_index-1],
                                                     hyp[hyp_index-1]))

            # In local mode the diagonal move is taken only when its score
            # is positive; in full-hypothesis mode it is taken when it is
            # at least the cell's initial value.
            if ((not align_full_hyp and sub_or_ok > 0)
                    or (align_full_hyp
                        and sub_or_ok >= H[ref_index][hyp_index])):
                H[ref_index][hyp_index] = sub_or_ok
                bp[ref_index][hyp_index] = (ref_index-1, hyp_index-1)
                logger.debug(
                    "({0},{1}) -> ({2},{3}): {4} ({5},{6})"
                    "".format(ref_index-1, hyp_index-1, ref_index, hyp_index,
                              H[ref_index][hyp_index],
                              ref[ref_index-1], hyp[hyp_index-1]))

            # Vertical move (deletion); replaces the current choice only
            # when strictly better.
            if H[ref_index-1][hyp_index] + del_score > H[ref_index][hyp_index]:
                H[ref_index][hyp_index] = H[ref_index-1][hyp_index] + del_score
                bp[ref_index][hyp_index] = (ref_index-1, hyp_index)
                logger.debug(
                    "({0},{1}) -> ({2},{3}): {4}"
                    "".format(ref_index-1, hyp_index, ref_index, hyp_index,
                              H[ref_index][hyp_index]))

            # Horizontal move (insertion); replaces the current choice only
            # when strictly better.
            if H[ref_index][hyp_index-1] + ins_score > H[ref_index][hyp_index]:
                H[ref_index][hyp_index] = H[ref_index][hyp_index-1] + ins_score
                bp[ref_index][hyp_index] = (ref_index, hyp_index-1)
                logger.debug(
                    "({0},{1}) -> ({2},{3}): {4}"
                    "".format(ref_index, hyp_index-1, ref_index, hyp_index,
                              H[ref_index][hyp_index]))

            # In full-hypothesis mode only cells in the last hypothesis
            # column may be the traceback start; otherwise any cell may.
            # ">=" means that ties go to the cell visited last.
            if ((not align_full_hyp or hyp_index == hyp_len)
                    and H[ref_index][hyp_index] >= max_score):
                max_score = H[ref_index][hyp_index]
                max_score_element = (ref_index, hyp_index)

    ref_index, hyp_index = max_score_element
    score = max_score
    logger.debug("Alignment score: %s for (%d, %d)",
                 score, ref_index, hyp_index)

    # Traceback: follow the back-pointers from the start cell, appending one
    # output tuple per step (so output is built back to front). In
    # full-hypothesis mode the walk continues until hypothesis index 0 is
    # reached; in local mode it continues while the score is non-negative.
    while ((not align_full_hyp and score >= 0)
           or (align_full_hyp and hyp_index > 0)):
        try:
            prev_ref_index, prev_hyp_index = bp[ref_index][hyp_index]
            # A back-pointer to the cell itself or to the origin (0, 0)
            # ends the walk; the current cell is emitted first if its score
            # is non-zero.
            if ((prev_ref_index, prev_hyp_index) == (ref_index, hyp_index)
                    or (prev_ref_index, prev_hyp_index) == (0, 0)):
                score = H[ref_index][hyp_index]
                if score != 0:
                    ref_word = ref[ref_index-1] if ref_index > 0 else eps_symbol
                    hyp_word = hyp[hyp_index-1] if hyp_index > 0 else eps_symbol
                    output.append((ref_word, hyp_word, prev_ref_index,
                        prev_hyp_index, ref_index, hyp_index))

                    ref_index, hyp_index = (prev_ref_index, prev_hyp_index)
                    score = H[ref_index][hyp_index]
                break

            if (ref_index == prev_ref_index + 1
                    and hyp_index == prev_hyp_index + 1):
                # Substitution or correct
                output.append(
                    (ref[ref_index-1] if ref_index > 0 else eps_symbol,
                     hyp[hyp_index-1] if hyp_index > 0 else eps_symbol,
                     prev_ref_index, prev_hyp_index, ref_index, hyp_index))
            elif (prev_hyp_index == hyp_index):
                # Deletion
                assert prev_ref_index == ref_index - 1
                output.append(
                    (ref[ref_index-1] if ref_index > 0 else eps_symbol,
                     eps_symbol,
                     prev_ref_index, prev_hyp_index, ref_index, hyp_index))
            elif (prev_ref_index == ref_index):
                # Insertion
                assert prev_hyp_index == hyp_index - 1
                output.append(
                    (eps_symbol,
                     hyp[hyp_index-1] if hyp_index > 0 else eps_symbol,
                     prev_ref_index, prev_hyp_index, ref_index, hyp_index))
            else:
                # The back-pointer is not one of the three legal moves.
                raise RuntimeError


            ref_index, hyp_index = (prev_ref_index, prev_hyp_index)
            score = H[ref_index][hyp_index]
        except Exception:
            # Any failure above (including the assertions) is logged and
            # re-raised as a RuntimeError.
            logger.error("Unexpected entry (%d,%d) -> (%d,%d), %s, %s",
                         prev_ref_index, prev_hyp_index, ref_index, hyp_index,
                         ref[prev_ref_index], hyp[prev_hyp_index])
            raise RuntimeError("Unexpected result: Bug in code!!")

    assert (align_full_hyp or score == 0)

    # The traceback collected the tuples from the end to the start.
    output.reverse()

    # At the highest verbosity, dump the whole score matrix to stderr.
    if verbose_level > 2:
        for ref_index in range(ref_len+1):
            for hyp_index in range(hyp_len+1):
                print ("{0} ".format(H[ref_index][hyp_index]), end='',
                       file=sys.stderr)
            print ("", file=sys.stderr)

    logger.debug("Aligned output:")
    logger.debug("  -  ".join(["({0},{1})".format(x[4], x[5])
                               for x in output]))
    logger.debug("REF: ")
    logger.debug("    ".join(str(x[0]) for x in output))
    logger.debug("HYP:")
    logger.debug("    ".join(str(x[1]) for x in output))

    return (output, max_score)


def print_alignment(recording, alignment, out_file_handle):
    """Write one line to out_file_handle: the recording id followed by the
    hypothesis-side symbol (element 1) of every alignment entry, all
    separated by single spaces.

    An entry whose element 1 cannot be read is logged and the error is
    re-raised.
    """
    fields = [recording]
    for entry in alignment:
        try:
            hyp_symbol = entry[1]
        except Exception:
            logger.error("Something wrong with alignment. "
                         "Invalid line {0}".format(entry))
            raise
        fields.append(hyp_symbol)
    print(" ".join(fields), file=out_file_handle)


def get_edit_type(hyp_word, ref_word, duration=-1, eps_symbol='<eps>',
                  oov_word=None, symbol_table=None):
    """Classify one aligned (hyp_word, ref_word) pair.

    Returns one of:
        'cor' - the words are equal and not eps_symbol, or the hypothesis
                is oov_word and the reference word is missing from a
                non-empty symbol_table;
        'ins' - a hypothesis word paired with eps_symbol;
        'del' - eps_symbol paired with a reference word, duration 0.0;
        'sil' - eps_symbol on both sides with a positive duration;
        'sub' - two different non-eps words.

    The checks are made in the order listed above ('cor' by equality
    first, the OOV 'cor' case after 'del'). Any combination not covered
    trips the assertion at the end.
    """
    hyp_is_eps = hyp_word == eps_symbol
    ref_is_eps = ref_word == eps_symbol

    if not hyp_is_eps and hyp_word == ref_word:
        return 'cor'

    if ref_is_eps and not hyp_is_eps:
        return 'ins'

    if hyp_is_eps and not ref_is_eps and duration == 0.0:
        return 'del'

    # Special case treated as correct: the hypothesis is the OOV symbol and
    # the reference word is not in the (non-empty) vocabulary.
    if hyp_word == oov_word and symbol_table is not None:
        if len(symbol_table) > 0 and ref_word not in symbol_table:
            return 'cor'

    if hyp_is_eps and ref_is_eps and duration > 0.0:
        # Silence in the hypothesis that is matched with no reference word.
        return 'sil'

    # Given how get_ctm_edits() calls this function, neither side should be
    # eps_symbol by the time this point is reached.
    assert not hyp_is_eps and not ref_is_eps
    return 'sub'


def get_ctm_edits(alignment_output, ctm_array, eps_symbol="<eps>",
                  oov_word=None, symbol_table=None):
    """
    This function takes two lists
        alignment_output = The output of smith_waterman_alignment() which is a
            list of tuples (ref_word, hyp_word, ref_word_from_index,
            hyp_word_from_index, ref_word_to_index, hyp_word_to_index)
        ctm_array = [ [ start1, duration1, hyp_word1, confidence1 ], ... ]
    and walks the alignment, looking up the CTM entry of each aligned
    hypothesis word through the alignment's hypothesis index.

    Returns CTM edits lines, which are CTM lines appended with reference word
    and edit type, i.e. lists of the form
        [ start, duration, hyp_word, confidence, ref_word, edit_type ].

    The returned list can be longer than ctm_array because:

     (1) A deletion in the alignment has no CTM entry; a zero-duration
         line with eps_symbol as the hypothesis word is created for it at
         the end time of the last CTM entry processed.
     (2) A silence (eps_symbol) in the hypothesis that is aligned with a
         reference word is split into a zero-duration deletion line
         followed by a silence line.

    Any error while processing an alignment entry is logged, together with
    the entry and the CTM line involved, and re-raised.
    """
    ctm_edits = []
    ali_len = len(alignment_output)
    ctm_len = len(ctm_array)
    ctm_pos = 0

    # current_time is the end of the last ctm segment we processed; before
    # the first one it is the start time of the first CTM entry.
    current_time = ctm_array[0][0] if ctm_len > 0 else 0.0

    # ali_pos tracks the position in alignment_output so that the error
    # message below reports the entry that actually failed.
    for ali_pos, (ref_word, hyp_word, ref_prev_i, hyp_prev_i,
                  ref_i, hyp_i) in enumerate(alignment_output):
        try:
            ctm_pos = hyp_prev_i
            # This is true because we cannot have errors at the end because
            # that will decrease the smith-waterman alignment score.
            assert ctm_pos < ctm_len
            assert len(ctm_array[ctm_pos]) == 4

            if hyp_prev_i == hyp_i:
                assert hyp_word == eps_symbol
                # These are deletions as there are no CTM entries
                # corresponding to these alignments.
                edit_type = get_edit_type(
                    hyp_word=eps_symbol, ref_word=ref_word,
                    duration=0.0, eps_symbol=eps_symbol,
                    oov_word=oov_word, symbol_table=symbol_table)
                ctm_line = [current_time, 0.0, eps_symbol, 1.0,
                            ref_word, edit_type]
                ctm_edits.append(ctm_line)
            else:
                assert hyp_i == hyp_prev_i + 1
                assert hyp_word == ctm_array[ctm_pos][2]
                # This is the normal case, where there are 2 entries where
                # they hyp-words match up.
                ctm_line = list(ctm_array[ctm_pos])
                if hyp_word == eps_symbol and ref_word != eps_symbol:
                    # This is a silence in hypothesis aligned with a reference
                    # word. We split this into two ctm edit lines where the
                    # first one is a deletion of duration 0 and the second
                    # one is a silence of duration given by the ctm line.
                    edit_type = get_edit_type(
                        hyp_word=eps_symbol, ref_word=ref_word,
                        duration=0.0, eps_symbol=eps_symbol,
                        oov_word=oov_word, symbol_table=symbol_table)
                    assert edit_type == 'del'
                    ctm_edits.append([current_time, 0.0, eps_symbol, 1.0,
                                      ref_word, edit_type])

                    edit_type = get_edit_type(
                        hyp_word=eps_symbol, ref_word=eps_symbol,
                        duration=ctm_line[1], eps_symbol=eps_symbol,
                        oov_word=oov_word, symbol_table=symbol_table)
                    assert edit_type == 'sil'
                    ctm_line.extend([eps_symbol, edit_type])
                    ctm_edits.append(ctm_line)
                else:
                    edit_type = get_edit_type(
                        hyp_word=hyp_word, ref_word=ref_word,
                        duration=ctm_line[1], eps_symbol=eps_symbol,
                        oov_word=oov_word, symbol_table=symbol_table)
                    ctm_line.extend([ref_word, edit_type])
                    ctm_edits.append(ctm_line)
                # Advance to the end (start + duration) of this CTM entry.
                current_time = (ctm_array[ctm_pos][0]
                                + ctm_array[ctm_pos][1])
        except Exception:
            logger.error("Could not get ctm edits for "
                         "edits@{edits_pos} = {0}, ctm@{ctm_pos} = {1}".format(
                            ("NONE" if ali_pos >= ali_len
                             else alignment_output[ali_pos]),
                            ("NONE" if ctm_pos >= ctm_len
                             else ctm_array[ctm_pos]),
                            edits_pos=ali_pos, ctm_pos=ctm_pos))
            logger.error("alignment = {0}".format(alignment_output))
            raise
    return ctm_edits


def ctm_line_to_string(ctm_line):
    """Return the fields of an 8-element ctm_line joined by single spaces.

    Raises RuntimeError if ctm_line does not have exactly 8 elements.
    """
    expected_fields = 8
    if len(ctm_line) == expected_fields:
        return " ".join(map(str, ctm_line))
    raise RuntimeError("len(ctm_line) expected to be {0}. "
                       "Invalid line {1}".format(expected_fields, ctm_line))


def test_alignment(align_full_hyp):
    """Run the aligner on a fixed pair of character sequences and print the
    hypothesis side of the resulting alignment to stderr.

    Used by run() when --debug-only is true. The scores are fixed here:
    +2 for a match and -1 for a substitution, deletion or insertion, with
    "-" as the empty symbol.

    Args:
        align_full_hyp: passed through to smith_waterman_alignment().
    """
    hyp = "GCCAT"
    ref = "AGCACACA"

    logger.info("REF: %s", ref)
    logger.info("HYP: %s", hyp)

    # Only the alignment is needed; the score is discarded.
    output, _ = smith_waterman_alignment(
        ref, hyp, similarity_score_function=lambda x, y: 2 if (x == y) else -1,
        del_score=-1, ins_score=-1, eps_symbol="-", align_full_hyp=align_full_hyp)

    print_alignment("Alignment", output, out_file_handle=sys.stderr)


def run(args):
    """Align every reference transcript with its hypothesis and write output.

    When args.debug_only is set, only the built-in toy alignment is run and
    the function exits via SystemExit.

    Otherwise the function:
      * reads the optional reco2file_and_channel and symbol_table files
        (both are closed here once read),
      * loads the hypotheses from args.hyp_in_file, using read_text() when
        args.hyp_format is "Text" and read_ctm() otherwise,
      * for each (recording, reference) pair from read_text(args.ref_in_file)
        runs smith_waterman_alignment() and writes either ctm-edits lines
        (hyp_format "CTM") or a printed alignment to args.alignment_out_file.

    Raises:
        SystemExit: after the toy alignment when args.debug_only is set.
        Warning: if a reference recording has no hypothesis.
        RuntimeError: if no recording was processed.
        Any exception raised while aligning a recording is logged together
        with the reference/hypothesis text and then re-raised.
    """
    if args.debug_only:
        test_alignment(args.align_full_hyp)
        raise SystemExit("Exiting since --debug-only was true")

    def similarity_score_function(x, y):
        # Reward for a match, (negative) penalty for a substitution.
        if x == y:
            return args.correct_score
        return -args.substitution_penalty

    del_score = -args.deletion_penalty
    ins_score = -args.insertion_penalty

    reco2file_and_channel = {}
    file_and_channel2reco = {}

    if args.reco2file_and_channel is not None:
        # Each line: <recording-id> <file> <channel>
        for line in args.reco2file_and_channel:
            parts = line.strip().split()

            reco2file_and_channel[parts[0]] = (parts[1], parts[2])
            file_and_channel2reco[(parts[1], parts[2])] = parts[0]
        args.reco2file_and_channel.close()
    else:
        file_and_channel2reco = None

    symbol_table = {}
    if args.symbol_table is not None:
        # Each line: <word> <integer-id>
        for line in args.symbol_table:
            parts = line.strip().split()
            symbol_table[parts[0]] = int(parts[1])
        args.symbol_table.close()

    if args.hyp_format == "Text":
        hyp_lines = {key: value
                     for (key, value) in read_text(args.hyp_in_file)}
    else:
        hyp_lines = {key: value
                     for (key, value) in read_ctm(args.hyp_in_file,
                                                  file_and_channel2reco)}

    num_err = 0
    num_done = 0
    for reco, ref_text in read_text(args.ref_in_file):
        # Reset per recording so the error handler below never touches an
        # unbound name or reports the previous recording's hypothesis.
        hyp_array = None
        try:
            if reco not in hyp_lines:
                num_err += 1
                # A missing hypothesis aborts the run: the exception is
                # logged by the handler below and re-raised.
                raise Warning("Could not find recording {0} "
                              "in hypothesis {1}".format(
                                  reco, args.hyp_in_file.name))

            if args.hyp_format == "CTM":
                # Keep only the word field of each CTM entry.
                hyp_array = [x[2] for x in hyp_lines[reco]]
            else:
                hyp_array = hyp_lines[reco]

            if args.reco2file_and_channel is None:
                # No mapping supplied: use the recording-id as the file
                # name and "1" as the channel.
                reco2file_and_channel[reco] = (reco, "1")

            logger.debug("Running Smith-Waterman alignment for %s", reco)

            output, score = smith_waterman_alignment(
                ref_text, hyp_array, eps_symbol=args.eps_symbol,
                similarity_score_function=similarity_score_function,
                del_score=del_score, ins_score=ins_score,
                align_full_hyp=args.align_full_hyp)

            if args.hyp_format == "CTM":
                ctm_edits = get_ctm_edits(output, hyp_lines[reco],
                                          eps_symbol=args.eps_symbol,
                                          oov_word=args.oov_word,
                                          symbol_table=symbol_table)
                for line in ctm_edits:
                    # Prefix each record with <file> <channel>.
                    ctm_line = list(reco2file_and_channel[reco])
                    ctm_line.extend(line)
                    print(ctm_line_to_string(ctm_line),
                          file=args.alignment_out_file)
            else:
                print_alignment(
                    reco, output, out_file_handle=args.alignment_out_file)
            num_done += 1
        except Exception:
            logger.error("Alignment failed for recording {0} "
                         "with ref = {1} and hyp = {2}".format(
                             reco, " ".join(ref_text),
                             ("NONE" if hyp_array is None
                              else " ".join(hyp_array))))
            raise

    logger.info("Processed %d recordings; failed with %d", num_done, num_err)

    if num_done == 0:
        raise RuntimeError("Processed 0 recordings.")


def main():
    """Script entry point: parse arguments, run the alignment, close files.

    Any Exception raised by run() is logged with its traceback and turned
    into exit status 1.  The input/output file handles are closed whether or
    not run() succeeded.
    """
    args = get_args()

    try:
        run(args)
    except Exception:
        logger.error("Failed to align ref and hypotheses; "
                     "got exception ", exc_info=True)
        raise SystemExit(1)
    finally:
        # reco2file_and_channel is optional; the other handles are always
        # expected to be present.
        if args.reco2file_and_channel is not None:
            args.reco2file_and_channel.close()
        for handle in (args.ref_in_file, args.hyp_in_file,
                       args.alignment_out_file):
            handle.close()


if __name__ == '__main__':
    main()


================================================
FILE: egs/steps/cleanup/internal/compute_tf_idf.py
================================================
#! /usr/bin/env python

from __future__ import print_function
import argparse
import logging
import sys

import tf_idf
sys.path.insert(0, 'steps')

# Logger named 'tf_idf'.  Messages at INFO level and above are emitted
# through a StreamHandler, prefixed with the timestamp, source file, line
# number, function name and level.
logger = logging.getLogger('tf_idf')
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s [%(filename)s:%(lineno)s - "
                              "%(funcName)s - %(levelname)s ] %(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)


def _get_args():
    """Parse and validate the command-line arguments.

    Returns:
        The argparse namespace.  File arguments are already-open file
        objects (argparse.FileType), and accumulate_over_docs has been
        converted from the strings "true"/"false" to a bool.

    Raises:
        ValueError: if --tf-normalization-factor is >= 1.0 or < 0.
        TypeError: if --accumulate-over-docs=false is given without
            --input-idf-stats.
    """
    parser = argparse.ArgumentParser(
        description="""This script takes in a set of documents and computes the
        TF-IDF for each n-gram up to the specified order.  The script can also
        load IDF stats from a different file instead of computing them from the
        input set of documents.""")

    parser.add_argument("--tf-weighting-scheme", type=str, default="raw",
                        choices=["binary", "raw", "log", "normalized"],
                        help="""The function applied on the raw
                        term-frequencies f(t,d) when computing tf(t,d).
                        TF weighting schemes:-
                        binary : tf(t,d) = 1 if t in d else 0
                        raw    : tf(t,d) = f(t,d)
                        log    : tf(t,d) = 1 + log(f(t,d))
                        normalized : tf(t,d) = K + (1-K) * """
                        """f(t,d) / max{f(t',d): t' in d}""")
    parser.add_argument("--tf-normalization-factor", type=float, default=0.5,
                        help="K value for normalized TF weighting scheme")
    # NOTE(review): the 'log' formula in the help text below reads
    # "N / 1 + n(t)"; presumably log(N / (1 + n(t))) is meant -- confirm
    # against the tf_idf module.
    parser.add_argument("--idf-weighting-scheme", type=str, default="log",
                        choices=["unary", "log", "log-smoothed",
                                 "probabilistic"],
                        help="""The function applied on the raw
                        inverse-document frequencies n(t) = |d in D: t in d|
                        when computing idf(t,d).
                        IDF weighting schemes:-
                        unary  : idf(t,D) = 1
                        log    : idf(t,D) = log (N / 1 + n(t))
                        log-smoothed : idf(t,D) = log(1 + N / n(t))
                        probabilistic: idf(t,D) = log((N - n(t)) / n(t))""")
    parser.add_argument("--ngram-order", type=int, default=2,
                        help="Accumulate for terms upto this n-grams order")

    parser.add_argument("--input-idf-stats", type=argparse.FileType('r'),
                        help="If provided, IDF stats are loaded from this "
                        "file")
    parser.add_argument("--output-idf-stats", type=argparse.FileType('w'),
                        help="If providied, IDF stats are written to this "
                        "file")
    # Parsed as a string here and converted to a bool after parsing.
    parser.add_argument("--accumulate-over-docs", type=str, default="true",
                        choices=["true", "false"],
                        help="If true, the stats are accumulated over all the "
                        "documents and a single tf-idf-file is written out.")
    parser.add_argument("docs", type=argparse.FileType('r'),
                        help="Input documents in kaldi text format i.e. "
                        "<document-id> <text>")
    parser.add_argument("tf_idf_file", type=argparse.FileType('w'),
                        help="Output tf-idf for each (t,d) pair in the "
                        "input documents written in the format "
                        "<terms> <document-id> <tf-idf>")

    args = parser.parse_args()

    # K must lie in the half-open interval [0, 1).
    if args.tf_normalization_factor >= 1.0 or args.tf_normalization_factor < 0:
        raise ValueError("--tf-normalization-factor must be in [0,1)")

    args.accumulate_over_docs = bool(args.accumulate_over_docs == "true")

    # Per-document output needs IDF stats supplied from outside.
    if not args.accumulate_over_docs and args.input_idf_stats is None:
        raise TypeError(
            "If --accumulate-over-docs=false is provided, "
            "then --input-idf-stats must be provided.")

    return args


def _run(args):
    """Accumulate TF stats from args.docs and write TF-IDF values.

    Each input line is "<document-id> <text>".  With
    args.accumulate_over_docs unset, the TF-IDF values are written after
    every document (prefixed by the document-id) and the TF stats are reset.
    Otherwise the stats of all documents are accumulated and written once at
    the end; in that mode the IDF stats are computed from the input unless
    they were loaded from args.input_idf_stats, and are optionally written
    to args.output_idf_stats (which is then closed).

    Raises:
        RuntimeError: if args.docs contained no lines.
    """
    # Weighting options shared by both write_tfidf_from_stats() calls.
    weighting = dict(
        tf_weighting_scheme=args.tf_weighting_scheme,
        idf_weighting_scheme=args.idf_weighting_scheme,
        tf_normalization_factor=args.tf_normalization_factor)

    idf_stats = tf_idf.IDFStats()
    if args.input_idf_stats is not None:
        idf_stats.read(args.input_idf_stats)

    tf_stats = tf_idf.TFStats()
    num_done = 0
    for line in args.docs:
        fields = line.strip().split()
        doc_id, terms = fields[0], fields[1:]
        tf_stats.accumulate(doc_id, terms, args.ngram_order)

        if not args.accumulate_over_docs:
            # Write the document-id and the corresponding tf-idf values,
            # then start over with empty TF stats for the next document.
            print (doc_id, file=args.tf_idf_file, end=' ')
            tf_idf.write_tfidf_from_stats(
                tf_stats, idf_stats, args.tf_idf_file,
                expected_document_id=doc_id, **weighting)
            tf_stats = tf_idf.TFStats()
        num_done += 1

    if args.accumulate_over_docs:
        # Only update the IDF stats when they were not loaded from a file.
        idf_stats_to_update = (idf_stats if args.input_idf_stats is None
                               else None)
        tf_stats.compute_term_stats(idf_stats=idf_stats_to_update)

        if args.output_idf_stats is not None:
            idf_stats.write(args.output_idf_stats)
            args.output_idf_stats.close()

        tf_idf.write_tfidf_from_stats(
            tf_stats, idf_stats, args.tf_idf_file, **weighting)

    if num_done == 0:
        raise RuntimeError("Could not compute TF-IDF for any query documents")

def main():
    """Script entry point: parse arguments, run, and always close the files."""
    args = _get_args()

    try:
        _run(args)
    finally:
        # The two IDF stats files are optional; docs and tf_idf_file are
        # positional arguments and therefore always present.
        for optional_handle in (args.input_idf_stats, args.output_idf_stats):
            if optional_handle is not None:
                optional_handle.close()
        args.docs.close()
        args.tf_idf_file.close()


if __name__ == '__main__':
    main()


================================================
FILE: egs/steps/cleanup/internal/ctm_to_text.pl
================================================
#! /usr/bin/perl

# Copyright 2016  Vimal Manohar
# Apache 2.0.

use strict;
use warnings;

# The only accepted invocations have exactly 1 argument (<ctm-file>) or
# exactly 3 arguments (--non-scored-words <file> <ctm-file>).  Anything else
# prints the usage text through die and exits with a failure status.
if (scalar @ARGV != 1 && scalar @ARGV != 3) {
  my $usage = <<END;
This script converts a CTM into kaldi text format by concatenating the words
belonging to the same utterance (or recording) and outputs the same to the
standard output.
If --non-scored-words list file is provided with one word per line, then 
those words are not added to the text.

The CTM format is <file> <channel> <start-time> <duration> <word> [<conf>].
This script assumes the CTM to be in NIST sorted order given by UNIX
sort command "sort +0 -1 +1 -2 +2nb -3"

Usage: ctm_to_text.pl [--non-scored-words <file>] <ctm-file> > <text>
END
  die $usage;
}

# Parse the optional "--non-scored-words <file>" pair.  It is only looked for
# when more than one argument was given, and must then be the first option.
my $non_scored_words_list = "";
if (scalar @ARGV > 1) {
  if ($ARGV[0] eq "--non-scored-words") {
    shift @ARGV;
    $non_scored_words_list = shift @ARGV;
  } else {
    die "Unknown option $ARGV[0]\n";
  }
}

# Words that are never copied to the output text.  "<eps>" is always
# excluded; the first field of every line of the list file is added to it.
my %non_scored_words;
$non_scored_words{"<eps>"} = 1;

if ($non_scored_words_list ne "") {
  # Two-argument open is retained so that existing invocations passing "-"
  # or a "cmd |" pipe keep working; a lexical handle replaces the bareword.
  open(my $nonscored_fh, $non_scored_words_list)
    or die "Failed to open $non_scored_words_list: $!";

  while (<$nonscored_fh>) {
    chomp;
    my @F = split;
    # A blank line has no first field; skip it instead of using an
    # undefined value as a hash key.
    next if scalar @F == 0;
    $non_scored_words{$F[0]} = 1;
  }

  close $nonscored_fh;
}

# Read the CTM line by line and print "<id> <word> <word> ..." once for each
# run of consecutive lines that share the same first field.
my $ctm_file = shift @ARGV;
# Two-argument open is retained so that existing invocations passing "-"
# or a "cmd |" pipe keep working; a lexical handle replaces the bareword.
open(my $ctm_fh, $ctm_file) or die "Failed to open $ctm_file: $!";

my $prev_utt = "";
my @text;

while (<$ctm_fh>) {
  chomp;
  my @F = split;

  # Validate the field count before any field is used:
  # <file> <channel> <start-time> <duration> <word> [<conf>]
  if (scalar @F < 5 || scalar @F > 6) {
    die "Invalid line $_ in CTM $ctm_file\n";
  }

  my $utt = $F[0];
  if ($utt ne $prev_utt && $prev_utt ne "") {
    # A new id starts: flush the words collected for the previous one.
    # Nothing is printed when all of its words were non-scored.
    if (scalar @text > 0) {
      print $prev_utt . " " . join(" ", @text) . "\n";
    }
    @text = ();
  }

  if (!defined $non_scored_words{$F[4]}) {
    push @text, $F[4];
  }

  $prev_utt = $utt;
}

close $ctm_fh;

# Flush the words of the last id.
if (scalar @text > 0) {
  print $prev_utt . " " . join(" ", @text) . "\n";
}


================================================
FILE: egs/steps/cleanup/internal/get_ctm_edits.py
================================================
#!/usr/bin/env python3

# Copyright 2016   Vimal Manohar
#           2016   Johns Hopkins University (author: Daniel Povey)
# Apache 2.0

from __future__ import print_function
import sys, operator, argparse

# Modify the CTM to include, for each token, the information from the
# Levenshtein alignment of 'hypothesis' and 'reference'
# (i.e. the output of 'align-text').

# The information added to each token in the CTM is the reference word and one
# of the following edit-types:
#  'cor' = correct  [note: as a special case we count as correct cases where
#                    the hypothesis word is the OOV symbol and the reference
#                    word is OOV w.r.t. the supplied vocabulary.]
#  'sub' = substitution
#  'del' = deletion
#  'ins' = insertion
#  'sil' = (silence in ctm; does not consume a reference word)
# note: the following extra edit-type may be added by modify_ctm_edits.py:
#  'fix'  ... this is like 'cor', but it means the reference has been modified
#             to fix non-scoreable errors [typically errors that don't change the
#             meaning], so we don't trust the word or value it as much as a 'cor'.
#

# Note: Additional lines are added to the CTM to account for deletions.

# Input CTM:
# (note: the <eps> is for silence in the input CTM that comes from
# optional-silence in the graph.  However, the input edits don't have anything
# for these silences.)
# We assume (and check) that the channel will always be '1', because the
# input CTMs are expected to be 'per utterance', not including real
# recording-ids.

# Input ctm format:
# <file-id> <channel> <start-time> <duration> <hyp-word> [<confidence>]
# note, the confidence defaults to 1 if not provided (these
# scripts don't actually use the confidence field).

## TimBrown_2008P-0007226-0007620 1 0.000 0.100 when
## TimBrown_2008P-0007226-0007620 1 0.100 0.090 i
## TimBrown_2008P-0007226-0007620 1 0.190 0.300 some
## TimBrown_2008P-0007226-0007620 1 0.490 0.110 when
## TimBrown_2008P-0007226-0007620 1 0.600 0.060 i
## TimBrown_2008P-0007226-0007620 1 0.660 0.190 say
## TimBrown_2008P-0007226-0007620 1 0.850 0.450 go
## TimBrown_2008P-0007226-0007620 1 1.300 0.310 [COUGH]
## TimBrown_2008P-0007226-0007620 1 1.610 0.130 you
## TimBrown_2008P-0007226-0007620 1 1.740 0.180 got
## TimBrown_2008P-0007226-0007620 1 1.920 0.370 thirty
## TimBrown_2008P-0007226-0007620 1 2.290 0.830 seconds
## TimBrown_2008P-0007226-0007620 1 3.120 0.330 <eps>
## TimBrown_2008P-0007226-0007620 1 3.450 0.040 [BREATH]
## TimBrown_2008P-0007226-0007620 1 3.490 0.110 to
## TimBrown_2008P-0007226-0007620 1 3.600 0.320 [NOISE]

# Input Levenshtein edits : (the output of 'align-text' post-processed by 'wer_per_utt_details.pl')

# AJJacobs_2007P-0001605-0003029 i i ; thought thought ; i'd i'd ; tell tell ; you you ; a a ; little little ; about about ; [UH] [UH] ; what what ; i i ; like like ; to to ; write write ; and and ; [UH] [UH] ; i i ; like like ; to to ; [UH] [UH] ; immerse immerse ; myself myself ; [SMACK] [SMACK] ; in in ; my my ; topics topics ; [UM] [UM] ; i i ; just just ; like like ; to to ; [UH] [UH] ; dive dive ; [SMACK] [SMACK] ; right right ; in in ; and and ; become become ; [UH] [UH] ; sort sort ; of of ; a a ; human human ; guinea guinea ; pig pig ; [BREATH] [BREATH] ; and and ; [UH] [UH]
# AJJacobs_2007P-0003133-0004110 i i ; see see ; my my ; life life ; as as ; a a ; series series ; of of ; experiments experiments ; [BREATH] [BREATH] ; so so ; [UH] [UH] ; i i ; [NOISE] [NOISE] ; work work ; for for ; esquire esquire ; magazine magazine ; <eps> and ; a a ; couple couple ; of of ; years years ; ago ago ; [BREATH] [BREATH] ; i i ; wrote wrote ; an an ; article article ; called called ; [NOISE] [NOISE] ; my my ; outsourced outsourced ; life life


# Output format:
# <file-id> <channel> <start-time> <duration> <hyp-word> <confidence> <ref-word> <edit-type>

# AJJacobs_2007P-0001605-0003029 1 0 0.09 <eps> 1.0 <eps> sil
# AJJacobs_2007P-0001605-0003029 1 0.09 0.15 i 1.0 i cor
# AJJacobs_2007P-0001605-0003029 1 0.24 0.25 thought 1.0 thought cor
# AJJacobs_2007P-0001605-0003029 1 0.49 0.14 i'd 1.0 i'd cor
# AJJacobs_2007P-0001605-0003029 1 0.63 0.22 tell 1.0 tell cor
# AJJacobs_2007P-0001605-0003029 1 0.85 0.11 you 1.0 you cor
# AJJacobs_2007P-0001605-0003029 1 0.96 0.05 a 1.0 a cor
# AJJacobs_2007P-0001605-0003029 1 1.01 0.24 little 1.0 little cor
# AJJacobs_2007P-0001605-0003029 1 1.25 0.5 about 1.0 about cor
# AJJacobs_2007P-0001605-0003029 1 1.75 0.48 [UH] 1.0 [UH] cor
# AJJacobs_2007P-0001605-0003029 1 2.23 0.34 <eps> 1.0 <eps> sil
# AJJacobs_2007P-0001605-0003029 1 2.57 0.21 what 1.0 what cor
# AJJacobs_2007P-0001605-0003029 1 2.78 0.1 i 1.0 i cor
# AJJacobs_2007P-0001605-0003029 1 2.88 0.22 like 1.0 like cor
# AJJacobs_2007P-0001605-0003029 1 3.1 0.13 to 1.0 to cor
# AJJacobs_2007P-0001605-0003029 1 3.23 0.37 write 1.0 write cor
# AJJacobs_2007P-0001605-0003029 1 3.6 0.03 <eps> 1.0 <eps> sil
# AJJacobs_2007P-0001605-0003029 1 3.63 0.36 and 1.0 and cor


# Command-line interface.  Note that the arguments are parsed at module level,
# so 'args' is a module global read by the functions below.
parser = argparse.ArgumentParser(
    description = "Append to the CTM the Levenshtein alignment of 'hypothesis' and 'reference'; "
    "creates augmented CTM with extra fields (see script for details)")

# -1 (the default) means "no OOV symbol given".
parser.add_argument("--oov", type = int, default = -1,
                    help = "The integer representation of the OOV symbol; substitutions "
                    "by the OOV symbol for out-of-vocabulary reference words are treated "
                    "as correct, if you also supply the --symbol-table option.")
parser.add_argument("--symbol-table", type = str,
                    help = "The words.txt your system used; if supplied, it is used to "
                    "determine OOV words (and such words will count as correct if "
                    "substituted by the OOV symbol).  See also the --oov option")
# Required arguments
parser.add_argument("edits_in", metavar = "<edits-in>",
                    help = "Filename of output of 'align-text', which this program reads. "
                    "Use /dev/stdin for standard input.")
parser.add_argument("ctm_in", metavar = "<ctm-in>",
                    help = "Filename of input hypothesis in ctm format")
parser.add_argument("ctm_edits_out", metavar = "<ctm-edits-out>",
                    help = "Filename of output (CTM appended with word-edit information)")
args = parser.parse_args()


def OpenFiles():
    """Open the input/output files and load the optional symbol table.

    Sets the module globals:
      ctm_edits_out -- output file, opened for writing (utf-8)
      edits_in      -- edits input file (utf-8)
      ctm_in        -- CTM input file (utf-8)
      symbol_table  -- set of the words in --symbol-table (empty if not given)
      oov_word      -- the word whose integer equals --oov, or None when no
                       symbol table was given

    Exits through sys.exit() with a message if a file cannot be opened, if
    the symbol table cannot be read or parsed, or if the symbol table was
    given but contains no word with the integer given by --oov.
    """
    global ctm_edits_out, edits_in, ctm_in, symbol_table, oov_word
    try:
        ctm_edits_out = open(args.ctm_edits_out, 'w', encoding='utf-8')
    except Exception:
        sys.exit("get_ctm_edits.py: error opening ctm-edits file {0} for output".format(
                args.ctm_edits_out))
    try:
        edits_in = open(args.edits_in, encoding='utf-8')
    except Exception:
        sys.exit("get_ctm_edits.py: error opening edits file {0} for input".format(
                args.edits_in))
    try:
        ctm_in = open(args.ctm_in, encoding='utf-8')
    except Exception:
        sys.exit("get_ctm_edits.py: error opening ctm file {0} for input".format(
                args.ctm_in))

    symbol_table = set()
    oov_word = None
    if args.symbol_table is not None:
        if args.oov == -1:
            # Only reported here; the "OOV word not found" check below is
            # what stops the program when no word maps to --oov.
            print("get_ctm_edits.py: error: if you set the the --symbol-table option "
                  "you must also set the --oov option", file = sys.stderr)
        try:
            # Each line is expected to be "<word> <integer>".
            with open(args.symbol_table, 'r', encoding='utf-8') as f:
                for line in f:
                    word, integer = line.split()
                    if int(integer) == args.oov:
                        oov_word = word
                    symbol_table.add(word)
        except Exception as e:
            # The message has two placeholders, so the exception must be
            # passed as well; otherwise format() itself raises IndexError.
            sys.exit("get_ctm_edits.py: error opening symbol-table file {0} for "
                     "input (or bad file), exception is: {1}".format(
                         args.symbol_table, e))
        if oov_word is None:
            sys.exit("get_ctm_edits.py: OOV word not found: check the values of "
                     "--symbol-table={0} and --oov={1}".format(args.symbol_table,
                                                               args.oov))

# PadArrays takes two lists
#   edits_array = [ [ hyp_word1, ref_word1 ], [ hyp_word2, ref_word2 ], ... ]
#   ctm_array = [ [ start1, duration1, hyp_word1, confidence1 ], ... ]
# and returns the pair (new_edits_array, new_ctm_array), two equally long
# lists in which, for each i, new_ctm_array[i][2] == new_edits_array[i][0].
#
# The inputs do not line up by themselves for two reasons:
#  (1) A deletion shows up in the edits as [ '<eps>', ref_word ] but has no
#      ctm entry.  A zero-duration ctm entry
#      [ current_time, 0.0, '<eps>', 1.0 ] is inserted for it.
#  (2) The ctm may contain silence entries whose word is '<eps>' (the
#      optional-silence from the lexicon) that have no edit.  An edit
#      [ '<eps>', '<eps>' ] is inserted for each of them.
#
# An Exception is raised if the two sequences cannot be matched up.
def PadArrays(edits_array, ctm_array):
    num_edits = len(edits_array)
    num_ctm = len(ctm_array)
    padded_edits = []
    padded_ctm = []
    e = 0  # position in edits_array
    c = 0  # position in ctm_array
    # current_time is the end of the last ctm segment we processed (or the
    # start of the first one before anything has been processed).
    current_time = ctm_array[0][0] if num_ctm > 0 else 0.0
    while e < num_edits or c < num_ctm:
        edit = edits_array[e] if e < num_edits else None
        entry = ctm_array[c] if c < num_ctm else None

        if (edit is not None and entry is not None
                and edit[0] == entry[2] and edit[0] != '<eps>'):
            # Normal case: the hyp-words of the two pending entries agree.
            padded_edits.append(edit)
            padded_ctm.append(entry)
            current_time = entry[0] + entry[1]
            e += 1
            c += 1
        elif edit is not None and edit[0] == '<eps>':
            # Deletion: pad the ctm with an empty '<eps>' segment placed at
            # the end of the last segment.
            padded_edits.append(edit)
            padded_ctm.append([ current_time, 0.0, '<eps>', 1.0 ])
            e += 1
        elif entry is not None and entry[2] == '<eps>':
            # Silence in the ctm, and either the edits are exhausted or the
            # pending hyp word is not '<eps>': pad the edits.
            padded_edits.append(['<eps>', '<eps>'])
            padded_ctm.append(entry)
            current_time = entry[0] + entry[1]
            c += 1
        else:
            raise Exception("Could not align edits_array = {0} and ctm_array = {1}; "
                            "edits-position = {2}, ctm-position = {3}, "
                            "pending-edit={4}, pending-ctm-entry={5}".format(
                    edits_array, ctm_array, e, c, edit, entry))
    assert len(padded_edits) == len(padded_ctm)
    return (padded_edits, padded_ctm)


# GetEditType returns the edit-type string to print in the ctm-edits file
# for one padded (hyp_word, ref_word, duration) entry:
#   'cor' -- hyp and ref are the same non-<eps> word, or hyp is the OOV word
#            and ref is not in the (non-empty) symbol table
#   'ins' -- hyp is a word and ref is <eps>
#   'del' -- hyp is <eps>, ref is a word and the duration is zero
#   'sil' -- hyp and ref are both <eps> and the duration is positive
#   'sub' -- anything else (both words are asserted to be non-<eps>)
# It reads the module globals oov_word and symbol_table.
def GetEditType(hyp_word, ref_word, duration):
    eps = '<eps>'
    if hyp_word != eps and hyp_word == ref_word:
        return 'cor'
    if ref_word == eps and hyp_word != eps:
        return 'ins'
    if hyp_word == eps and ref_word != eps and duration == 0.0:
        return 'del'
    if hyp_word == oov_word and len(symbol_table) != 0 \
            and ref_word not in symbol_table:
        # An OOV reference word recognized as the OOV symbol is treated as
        # correct.
        return 'cor'
    if hyp_word == eps and ref_word == eps and duration > 0.0:
        # Silence in the hypothesis; not matched with any reference word.
        return 'sil'
    # Given how PadArrays builds its output, neither word should be <eps>
    # by the time we get here.
    assert hyp_word != '<eps>' and ref_word != '<eps>'
    return 'sub'

# FloatToString formats a number with '%g', which drops trailing zeros.  The
# precision starts at 6 significant digits and is increased by one for every
# multiplication by 0.1 needed to bring the magnitude down to at most 1.0,
# so that digits before the decimal point do not use up the budget.
def FloatToString(f):
    precision = 6
    magnitude = f
    while abs(magnitude) > 1.0:
        magnitude *= 0.1
        precision += 1
    return '%.*g' % (precision, f)


def OutputCtm(utterance_id, edits_array, ctm_array):
    # Writes one ctm-edits line per entry to the global ctm_edits_out:
    #   <utterance-id> 1 <start> <duration> <hyp-word> <confidence> <ref-word> <edit-type>
    # edits_array and ctm_array must be the padded, equally long arrays
    # returned by PadArrays.  Exits with status 1 if the hyp-words of an
    # edit and its ctm entry disagree.
    global ctm_edits_out
    assert len(edits_array) == len(ctm_array)
    # The channel is hardcoded at both input and output, since this CTM
    # doesn't really represent recordings, only utterances.
    channel = '1'
    for idx, edit in enumerate(edits_array):
        ctm_entry = ctm_array[idx]
        ( hyp_word, ref_word ) = edit
        ( start_time, duration, ctm_hyp_word, confidence ) = ctm_entry
        if not hyp_word == ctm_hyp_word:
            print("Error producing output CTM for edit = {0} and ctm = {1}".format(
                    edit, ctm_entry), file = sys.stderr)
            sys.exit(1)
        print(utterance_id, channel, FloatToString(start_time),
              FloatToString(duration), hyp_word, confidence, ref_word,
              GetEditType(hyp_word, ref_word, duration),
              file = ctm_edits_out)


def ProcessOneUtterance(utterance_id, edits_line, ctm_lines):
    # Parses the edits line and the ctm lines of one utterance, pads them
    # with PadArrays so that they match up, and writes the result with
    # OutputCtm.
    #   utterance_id: the utterance-id that prefixes edits_line and every
    #                 line of ctm_lines
    #   edits_line:   '<utterance-id> hyp1 ref1 ; hyp2 ref2 ; ...'
    #   ctm_lines:    list of '<utterance-id> 1 <start> <dur> <word> [<conf>]'
    # Any problem terminates the program through sys.exit() with a message.
    try:
        # Remove the utterance-id from the beginning of the edits line
        edits_fields = edits_line[len(utterance_id) + 1:]

        # e.g. if edits_fields is now 'i i ; see be ; my my ', edits_array will become
        #  [ ('i', 'i'), ('see', 'be'), ('my', 'my') ]
        fields_split = edits_fields.split()
        first_fields, second_fields = fields_split[0::3], fields_split[1::3]
        # Every third field must be the ';' separator, and each hyp word
        # needs a ref word.
        if (
            len(first_fields) != len(second_fields) or
            (len(fields_split) >= 3 and set(fields_split[2::3]) != {';'})
        ):
            sys.exit("get_ctm_edits.py: could not make sense of edits line: " + edits_line)

        edits_array = list(zip(first_fields, second_fields))

        ctm_array = []
        for line in ctm_lines:
            try:
                # Strip off the utterance-id and split the remaining fields
                # which should be: channel==1, start, dur, word, [confidence]
                a = line[len(utterance_id) + 1:].split()
                if len(a) == 4:
                    a.append(1.0)  # confidence defaults to 1.0.
                [ channel, start, dur, word, confidence ] = a
                if channel != '1':
                    raise Exception("Channel should be 1, got: " + channel)
                ctm_array.append([ float(start), float(dur), word, float(confidence) ])
            except Exception as e:
                sys.exit("get_ctm_edits.py: error procesing ctm line {0} "
                         "... exception is: {1} {2}".format(line, type(e), str(e)))
        # ctm_array will now be something like [ [ 1.010, 0.240, 'little', 1.0 ], ... ]

        # The following call pads the edits and ctm arrays with appropriate
        # entries so that they have the same length and the elements 'match up'.
        (edits_array, ctm_array) = PadArrays(edits_array, ctm_array)
    except Exception as e:
        sys.exit("get_ctm_edits.py: error processing utterance {0}, error was: {1}".format(
                utterance_id, str(e)))
    OutputCtm(utterance_id, edits_array, ctm_array)

def ProcessData():
    # Main loop: reads the global edits_in one line (= one utterance) at a
    # time, collects the consecutive lines of the global ctm_in that start
    # with the same utterance-id, and hands both to ProcessOneUtterance.
    # Exits with a message on an empty edits line, or if ctm lines are left
    # over once the edits input is exhausted.
    utterances_done = 0

    # The ctm is read one line ahead, so that we can tell where the lines of
    # the current utterance stop.
    pending_ctm_line = ctm_in.readline()

    for edits_line in iter(edits_in.readline, ''):
        fields = edits_line.split()
        if not fields:
            sys.exit("get_ctm_edits.py: edits_input {0} had an empty line".format(
                    args.edits_in))
        utterance_id = fields[0]
        utterance_ctm_lines = []
        while pending_ctm_line.strip() and \
                pending_ctm_line.split()[0] == utterance_id:
            utterance_ctm_lines.append(pending_ctm_line)
            pending_ctm_line = ctm_in.readline()
        ProcessOneUtterance(utterance_id, edits_line, utterance_ctm_lines)
        utterances_done += 1

    if pending_ctm_line != '':
        sys.exit("get_ctm_edits.py: edits_in input {0} ended before "
                 "ctm input was ended.  We processed {1} "
                 "utterances.".format(args.edits_in, utterances_done))

    print("get_ctm_edits.py: processed {0} utterances".format(
            utterances_done), file=sys.stderr)


# Script body: open the files named on the command line (this also loads the
# optional symbol table), then process all utterances.
OpenFiles()
ProcessData()


================================================
FILE: egs/steps/cleanup/internal/get_non_scored_words.py
================================================
#!/usr/bin/env python3

# Copyright 2016   Vimal Manohar
#           2016   Johns Hopkins University (author: Daniel Povey)
# Apache 2.0

from __future__ import print_function
import argparse
import logging
import operator
import os
import sys
from collections import defaultdict

# Re-wrap stderr and stdout so that everything written to them is encoded as
# UTF-8.
import io
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding="utf8")
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf8")


# Module-level logger.  Messages at INFO level and above are emitted through
# a StreamHandler, prefixed with the timestamp, source path, line number,
# function name and level.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - "
                              "%(funcName)s - %(levelname)s ] %(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)

# If you supply the <lang> directory (the one that corresponds to
# how you decoded the data) to this script, it assumes that the <lang>
# directory contains phones/align_lexicon.int and phones/silence.int, and it
# uses these to work out a reasonable guess of the non-scored words: those
# whose pronunciation is a single phone that is a silence phone.
# It then uses the words.txt to work out the written form of those words.

# Command-line interface: a single positional argument, the lang/ directory.
# The arguments are parsed at module level, so 'args' is a module global.
parser = argparse.ArgumentParser(
    description = "This program works out a reasonable guess at a list of "
    "non-scored words (words that won't affect the WER evaluation): "
    "things like [COUGH], [NOISE] and so on.  This is useful because a list of "
    "such words is required by some other scripts (e.g. modify_ctm_edits.py), "
    "and it's inconvenient to have to specify the list manually for each language. "
    "This program writes out the words in text form, one per line.")

parser.add_argument("lang", type = str,
                    help = "The lang/ directory.  This program expects "
                    "lang/words.txt and lang/phones/silence.int and "
                    "lang/phones/align_lexicon.int to exist, and will use them to work "
                    "out a reasonable guess of the non-scored words  (as those whose "
                    "pronunciations are a single phone in the 'silphones' list)")

args = parser.parse_args()

non_scored_words = set()


def read_lang(lang_dir):
    """Work out the non-scored words of a lang/ directory and print them.

    A word counts as non-scored when phones/align_lexicon.int contains a line
    of the form ``w w p`` with ``w > 0`` and ``p`` listed in
    phones/silence.int, i.e. the word has a single-phone pronunciation that
    is a silence phone.  The written forms are looked up in words.txt, added
    to the module-level set ``non_scored_words`` and printed one per line.

    Args:
        lang_dir: path of the lang/ directory.

    Raises:
        RuntimeError: if the directory or one of the three expected files is
            missing, or if the number of words found in words.txt differs
            from the number of silence word-ids.
        Exception: any error raised while reading or parsing one of the
            files is logged and re-raised unchanged.
    """
    global non_scored_words

    if not os.path.isdir(lang_dir):
        logger.error("expected lang/ directory %s to "
                     "exist.", lang_dir)
        raise RuntimeError

    words_path = lang_dir + '/words.txt'
    silence_path = lang_dir + '/phones/silence.int'
    lexicon_path = lang_dir + '/phones/align_lexicon.int'

    for path in (words_path, silence_path, lexicon_path):
        if not os.path.exists(path):
            logger.error("expected file %s to exist.", path)
            raise RuntimeError

    # read silence-phones (one integer phone-id per line).
    try:
        with open(silence_path) as silence_file:
            silence_phones = {int(line) for line in silence_file}
    except Exception:
        logger.error("problem reading file %s", silence_path)
        raise

    # read align_lexicon.int.
    # format is: <word-index> <word-index> <phone-index1> <phone-index2> ..
    # We're looking for lines of the form:
    # w w p
    # where w > 0 and p is in the set 'silence_phones'
    try:
        silence_word_ints = set()
        with open(lexicon_path) as lexicon_file:
            for line in lexicon_file:
                fields = line.split()
                if (len(fields) == 3 and fields[0] == fields[1]
                        and int(fields[0]) > 0
                        and int(fields[2]) in silence_phones):
                    silence_word_ints.add(int(fields[0]))
    except Exception:
        logger.error("problem reading file %s", lexicon_path)
        raise

    # map the silence word-ids back to their written form.
    try:
        with open(words_path, encoding='utf-8') as words_file:
            for line in words_file:
                word, integer = line.split()
                if int(integer) in silence_word_ints:
                    non_scored_words.add(word)
    except Exception:
        logger.error("problem reading file %s", words_path)
        raise

    # Every silence word-id must have been found exactly once in words.txt.
    if len(non_scored_words) != len(silence_word_ints):
        raise RuntimeError("error getting silence words, len({0}) != len({1})"
                           "".format(non_scored_words, silence_word_ints))
    for word in non_scored_words:
        print(word)


# Script entry point: process the lang/ directory given on the command line.
read_lang(args.lang)


================================================
FILE: egs/steps/cleanup/internal/get_pron_stats.py
================================================
#!/usr/bin/env python

# Copyright 2016  Xiaohui Zhang
# Apache 2.0.

from __future__ import print_function
from __future__ import division
import argparse
import sys
import warnings

# Collect pronunciation stats from a ctm_prons.txt file of the form output
# by steps/cleanup/debug_lexicon.sh.  This input file has lines of the form:
#  utt_id word phone1 phone2 .. phoneN
#  e.g.
#  foo-bar123-342  hello h eh l l ow
# (and this script does require that lines from the same utterance be ordered in
# order of time).
# The output of this program is word pronunciation stats of the form:
#  count word phone1 .. phoneN
#  e.g.:
#  24.0  hello h ax l l ow
# This program uses various heuristics to account for the fact that in the input ctm_prons.txt
# file may not always be well aligned.  As a result of some of these heuristics the counts will
# not always be integers.

def GetArgs():
    """Parse the command line and return the checked arguments.

    The full command line is echoed to stderr before parsing.  The parsed
    namespace is passed through CheckArgs() and its return value is returned.

    Note: adjacent help-string literals are concatenated by Python with no
    separator, so each fragment must carry its own trailing space.
    """
    parser = argparse.ArgumentParser(description = "Accumulate pronunciation statistics from "
                                     "a ctm_prons.txt file.",
                                     epilog = "See steps/cleanup/debug_lexicon.sh for example")
    parser.add_argument("ctm_prons_file", metavar = "<ctm-prons-file>", type = str,
                        help = "File containing word-pronunciation alignments obtained from a ctm file; "
                        "It represents phonetic decoding results, aligned with word boundaries obtained "
                        "from forced alignments. "
                        "each line must be <utt_id> <word> <phones>")
    parser.add_argument("silence_file", metavar = "<silphone-file>", type = str,
                        help = "File containing a list of silence phones.")
    parser.add_argument("optional_silence_file", metavar = "<optional_silence>", type = str,
                        help = "File containing the optional silence phone. We'll be replacing empty prons by this, "
                        "because empty prons would cause a problem for lattice word alignment.")
    parser.add_argument("non_scored_words_file", metavar = "<non-scored-words-file>", type = str,
                        help = "File containing a list of non-scored words.")
    parser.add_argument("stats_file", metavar = "<stats-file>", type = str,
                        help = "Write accumulated statistics to this file; each line represents how many times "
                        "a specific word-pronunciation pair appears in the phonetic decoding results (ctm_pron_file). "
                        "each line is <count> <word> <phones>")
    print (' '.join(sys.argv), file=sys.stderr)

    args = parser.parse_args()
    args = CheckArgs(args)

    return args

def CheckArgs(args):
    """Open every file named in ``args`` and attach the handles to it.

    For each ``<name>_file`` attribute a matching ``<name>_file_handle``
    attribute is set.  A ctm-prons file of "-" means standard input and a
    stats file of "-" means standard output; the stats file is otherwise
    opened for writing and all other files for reading.

    Returns:
        The same ``args`` object, with the handle attributes added.
    """
    read_from_stdin = (args.ctm_prons_file == "-")
    args.ctm_prons_file_handle = (sys.stdin if read_from_stdin
                                  else open(args.ctm_prons_file))

    # The three list files are always read from disk.
    args.non_scored_words_file_handle = open(args.non_scored_words_file)
    args.silence_file_handle = open(args.silence_file)
    args.optional_silence_file_handle = open(args.optional_silence_file)

    write_to_stdout = (args.stats_file == "-")
    args.stats_file_handle = (sys.stdout if write_to_stdout
                              else open(args.stats_file, "w"))
    return args

def ReadEntries(file_handle):
    """Return the set of lines in ``file_handle``, each stripped of
    surrounding whitespace (a blank line therefore contributes '')."""
    return {raw_line.strip() for raw_line in file_handle}

# Basically, this function generates an "info" list from a ctm_prons file.
# Each entry in the list represents the pronounciation candidate(s) of a word.
# For each non-<eps> word, the entry is a list: [utt_id, word, set(pronunciation_candidates)]. e.g:
# [911Mothers_2010W-0010916-0012901-1, other, set('AH DH ER', 'AH DH ER K AH N')]
# For each <eps>, we split the phones it aligns to into two parts: "nonsil_left",
# which includes phones before the first silphone, and "nonsil_right", which includes
# phones after the last silphone. For example, for <eps> : 'V SIL B AH SIL',
# nonsil_left is 'V' and nonsil_right is empty ''. After processing an <eps> entry
# in ctm_prons, we put it in "info" as an entry:  [utt_id, word, nonsil_right]
# only if it's nonsil_right segment is not empty, which may be used when processing
# the next word.
#
# Normally, one non-<eps> word is only aligned to one pronounciation candidate. However
# when there is a preceding/following <eps>, like in the following example, we
# assume the phones aligned to <eps> should be statistically distributed
# to its neighboring words (BTW we assume there are no consecutive <eps> within an utterance.)
# Thus we append the "nonsil_left" segment of these phones to the pronounciation
# of the preceding word, if the last phone of this pronounciation is not a silence phone,
# Similarly we can add a pron candidate to the following word.
#
# For example, for the following part of a ctm_prons file:
# 911Mothers_2010W-0010916-0012901-1 other AH DH ER
# 911Mothers_2010W-0010916-0012901-1 <eps> K AH N SIL B
# 911Mothers_2010W-0010916-0012901-1 because IH K HH W AA Z AH
# 911Mothers_2010W-0010916-0012901-1 <eps> V SIL
# 911Mothers_2010W-0010916-0012901-1 when W EH N
# 911Mothers_2010W-0010916-0012901-1 people P IY P AH L
# 911Mothers_2010W-0010916-0012901-1 <eps> SIL
# 911Mothers_2010W-0010916-0012901-1 heard HH ER
# 911Mothers_2010W-0010916-0012901-1 <eps> D
# 911Mothers_2010W-0010916-0012901-1 that SIL DH AH T
# 911Mothers_2010W-0010916-0012901-1 my M AY
#
# The corresponding segment in the "info" list is:
# [911Mothers_2010W-0010916-0012901-1, other, set('AH DH ER', 'AH DH ER K AH N')]
# [911Mothers_2010W-0010916-0012901-1, <eps>, 'B']
# [911Mothers_2010W-0010916-0012901-1, because, set('IH K HH W AA Z AH', 'B IH K HH W AA Z AH', 'IH K HH W AA Z AH V', 'B IH K HH W AA Z AH V')]
# [911Mothers_2010W-0010916-0012901-1, when, set('W EH N')]
# [911Mothers_2010W-0010916-0012901-1, people, set('P IY P AH L')]
# [911Mothers_2010W-0010916-0012901-1, heard, set('HH ER', 'HH ER D')]
# [911Mothers_2010W-0010916-0012901-1, <eps>, 'D']
# [911Mothers_2010W-0010916-0012901-1, that, set('SIL DH AH T')]
# [911Mothers_2010W-0010916-0012901-1, my, set('M AY')]
#
# Then we accumulate pronunciation stats from "info". Basically, for each occurrence
# of a word, each pronunciation candidate gets an equal soft count. E.g. in the above
# example, each pron candidate of "because" gets a count of 1/4. The stats are stored
# in a dictionary (word, pron) : count.

def _TrimBoundarySilence(phones, silphones):
    """Strip leading and trailing silence phones from a list of phones.

    At least one phone is always kept: a pron made only of silence phones is
    reduced to its last (silence) phone, and prons of length <= 1 are
    returned unchanged.
    """
    if len(phones) <= 1:
        return phones
    begin = 0
    while begin < len(phones) and phones[begin] in silphones:
        begin += 1
    if begin == len(phones):
        # All phones are silence: keep a single silence phone.
        return phones[-1:]
    # phones[begin] is not silence, so this scan stops at 'begin' at the latest.
    end = len(phones)
    while phones[end - 1] in silphones:
        end -= 1
    return phones[begin:end]


def GetStatsFromCtmProns(silphones, optional_silence, non_scored_words, ctm_prons_file_handle):
    """Accumulate soft pronunciation counts from a ctm_prons file.

    Args:
        silphones: set of silence phones.
        optional_silence: phone substituted for an empty pronunciation.
        non_scored_words: set of non-scored words; except for '<unk>' and
            '<UNK>' they are treated like '<eps>'.
        ctm_prons_file_handle: iterable of lines '<utt_id> <word> <phones>';
            lines of one utterance must be in time order.  Blank lines are
            skipped.

    Returns:
        A dict mapping (word, pron) to a float count, where pron is a
        space-separated phone string.  For each occurrence of a word its
        pron candidates share a total count of 1.0 equally.
    """
    # Each entry of 'info' is [utt_id, word, X] where X is a set of pron
    # candidates for a real word, or a string (the nonsil_right segment)
    # for <eps> / non-scored words.
    info = []
    for line in ctm_prons_file_handle:
        splits = line.split()
        if not splits:
            continue
        utt = splits[0]
        word = splits[1]
        phones = splits[2:]
        if not phones:
            phones = [optional_silence]
        # extract the nonsil_left and nonsil_right segments, and then try to
        # append nonsil_left to the pron candidates of preceding word, getting
        # extended pron candidates.
        # Note: the ctm_pron file may have cases like:
        # KevinStone_2010U-0024782-0025580-1 [UH] EH
        # KevinStone_2010U-0024782-0025580-1 fda F T
        # KevinStone_2010U-0024782-0025580-1 [NOISE] IY EY
        # which means non-scored-words (except oov symbol <unk>/<UNK>) behaves like <eps>.
        # So we apply the same merging method in these cases.
        if word == '<eps>' or (word in non_scored_words and word not in ('<unk>', '<UNK>')):
            # phones before the first silence phone
            nonsil_left = []
            for phone in phones:
                if phone in silphones:
                    break
                nonsil_left.append(phone)

            # phones after the last silence phone
            nonsil_right = []
            for phone in reversed(phones):
                if phone in silphones:
                    break
                nonsil_right.append(phone)
            nonsil_right.reverse()

            # Extend the candidates of the preceding entry only if it belongs
            # to the same utterance and is a real word (a set of prons, not
            # the string left behind by a previous <eps>/non-scored word).
            if (nonsil_left and info and utt == info[-1][0]
                    and isinstance(info[-1][2], set)):
                suffix = " ".join(nonsil_left)
                pron_ext = set()
                for pron in info[-1][2]:
                    # skip generating the extended pron candidate if the
                    # last phone of the pron is a silence phone.  Compare
                    # whole phones, not string suffixes.
                    pron_phones = pron.split()
                    if pron_phones and pron_phones[-1] in silphones:
                        continue
                    pron_ext.add(pron + " " + suffix)
                info[-1][2] = info[-1][2].union(pron_ext)
            if nonsil_right:
                info.append([utt, word, " ".join(nonsil_right)])
        else:
            pron = " ".join(phones)
            prons = {pron}
            # If there's a preceding <eps>/non_scored_words (which means the third field is a string rather than a set of strings),
            # we prepend its nonsil_right segment to the pron of the current word,
            # unless the current pron starts with a silence phone.
            if (info and utt == info[-1][0] and isinstance(info[-1][2], str)
                    and phones[0] not in silphones):
                prons.add(info[-1][2] + ' ' + pron)
            info.append([utt, word, prons])

    stats = {}
    for utt, word, prons in info:
        # If the prons is not a set, the current word must be <eps> or a non_scored_word,
        # where we just left the nonsil_right part as prons.
        if isinstance(prons, set) and prons:
            count = 1.0 / float(len(prons))
            for pron in prons:
                # post-processing: remove all beginning/trailing silence
                # phones, so a candidate is either a single phone or has
                # silence only between non-silence phones.  Every candidate
                # is counted, including one reduced to a single phone.
                phones = " ".join(_TrimBoundarySilence(pron.strip().split(), silphones))
                stats[(word, phones)] = stats.get((word, phones), 0) + count
    return stats

def WriteStats(stats, file_handle):
    """Write one '<count> <word> <pron>' line per (word, pron) key of
    ``stats`` to ``file_handle``, then close the handle."""
    for key in stats:
        word, pron = key[0], key[1]
        line = '{0} {1} {2}'.format(stats[key], word, pron)
        print(line, file=file_handle)
    file_handle.close()

def Main():
    """Read the phone/word lists, accumulate the pron stats and write them."""
    opts = GetArgs()
    sil_set = ReadEntries(opts.silence_file_handle)
    ignored_words = ReadEntries(opts.non_scored_words_file_handle)
    # ReadEntries returns a set; pop() takes an arbitrary element of it as
    # the optional-silence phone.
    opt_sil = ReadEntries(opts.optional_silence_file_handle).pop()
    pron_stats = GetStatsFromCtmProns(sil_set, opt_sil, ignored_words,
                                      opts.ctm_prons_file_handle)
    WriteStats(pron_stats, opts.stats_file_handle)

# Run Main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    Main()


================================================
FILE: egs/steps/cleanup/internal/make_one_biased_lm.py
================================================
#!/usr/bin/env python3

# Copyright 2016  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.

from __future__ import print_function
from __future__ import division
import sys
import argparse
import math
from collections import defaultdict

import io
# Re-wrap all three standard streams around their byte buffers with an
# explicit UTF-8 encoding.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding="utf8")
sys.stderr = io.TextIOWrapper(sys.stderr.buffer,encoding="utf8")
sys.stdin = io.TextIOWrapper(sys.stdin.buffer,encoding="utf8")

# Command-line interface.  Input text is read from stdin and the FST is
# written to stdout; only --word-disambig-symbol is mandatory.
parser = argparse.ArgumentParser(description="""
This script creates a biased language model suitable for alignment and
data-cleanup purposes.   It reads (possibly multiple) lines of integerized text
from the input and writes a text-form FST of a backoff language model to
the standard output, to be piped into fstcompile.""")

parser.add_argument("--word-disambig-symbol", type = int, required = True,
                    help = "Integer corresponding to the disambiguation "
                    "symbol (normally #0) for backoff arcs")
parser.add_argument("--ngram-order", type = int, default = 4,
                    choices = [2,3,4,5,6,7],
                    help = "Maximum order of n-gram to use (but see also "
                    "--min-lm-state-count; the effective order may be less.")
parser.add_argument("--min-lm-state-count", type = int, default = 10,
                    help = "Minimum count below which we will completely "
                    "discount an LM-state (if it is of order > 2, i.e. "
                    "history-length > 1).")
parser.add_argument("--top-words", type = str,
                    help = "File containing frequent words and probabilities to be added into "
                    "the language model, with lines in the format '<integer-id-of-word> <prob>'. "
                    "These probabilities will be added to the probabilities in the unigram "
                    "backoff state and then renormalized; this option allows you to introduce "
                    "common words to the LM with specified probabilities.")
parser.add_argument("--discounting-constant", type = float, default = 0.3,
                    help = "Discounting constant D for standard (unmodified) Kneser-Ney; "
                    "must be strictly between 0 and 1.  A value closer to 0 will give "
                    "you a more-strongly-biased LM.")
parser.add_argument("--verbose", type = int, default = 0,
                    choices=[0,1,2,3,4,5], help = "Verbose level")

# Note: the command line is parsed at import time, as a module-level statement.
args = parser.parse_args()

# Echo the full command line to stderr when any verbosity is requested.
if args.verbose >= 1:
    print(' '.join(sys.argv), file = sys.stderr)


class NgramCounts(object):
    """N-gram counts for a biased backoff LM, printable as a text-form FST.

    A note on data-structure.  Firstly, all words are represented as
    integers.  We store n-gram counts as a list, indexed by history-length
    (== n-gram order minus one), of dicts from histories to counts, where
    histories are tuples of integers and "counts" are dicts from integer to
    float.  For instance, when accumulating the 4-gram count for the '8' in
    the sequence '5 6 7 8', we'd do as follows:
        self.counts[3][(5,6,7)][8] += 1.0
    where the [3] indexes a list, the [(5,6,7)] indexes a dict, and
    the [8] indexes a dict.
    """

    def __init__(self, ngram_order):
        """Create empty counts for n-grams up to order ``ngram_order``."""
        self.ngram_order = ngram_order
        # Integerized counts will never contain negative numbers, so
        # inside this program, we use -3 and -2 for the BOS and EOS symbols
        # respectively.
        # Note: it's actually important that the bos-symbol is the most negative;
        # it helps ensure that we print the state with left-context <s> first
        # when we print the FST, and this means that the start-state will have
        # the correct value.
        self.bos_symbol = -3
        self.eos_symbol = -2
        # backoff_symbol is kind of a pseudo-word, it's used in keeping track of
        # the backoff counts in each state.
        self.backoff_symbol = -1
        # One nested defaultdict per history-length.  Indexing
        # self.counts[n][history][word] for unseen keys creates them with a
        # count of 0.0, so we can index without worrying about undefined
        # values.
        self.counts = [defaultdict(lambda: defaultdict(float))
                       for _ in range(ngram_order)]

    # adds a raw count (called while processing input data).
    # Suppose we see the sequence '6 7 8 9' and ngram_order=4, 'history'
    # would be (6,7,8) and 'predicted_word' would be 9; 'count' would be
    # 1.0.
    def AddCount(self, history, predicted_word, count):
        """Add ``count`` to the count of ``predicted_word`` after the tuple
        ``history``."""
        self.counts[len(history)][history][predicted_word] += count

    # 'line' is a string containing a sequence of integer word-ids.
    # This function adds the un-smoothed counts from this line of text.
    def AddRawCountsFromLine(self, line):
        """Add the raw counts of one line of integerized text.

        BOS and EOS symbols are added around the line; each word is counted
        once, with the longest available history (at most ngram_order - 1
        words).  Exits the program if a token is not an integer.
        """
        try:
            words = [self.bos_symbol] + [ int(x) for x in line.split() ] + [self.eos_symbol]
        except ValueError:
            sys.exit("make_one_biased_lm.py: bad input line {0} (expected a sequence "
                     "of integers)".format(line))

        for n in range(1, len(words)):
            predicted_word = words[n]
            history_start = max(0, n + 1 - self.ngram_order)
            history = tuple(words[history_start:n])
            self.AddCount(history, predicted_word, 1.0)

    def AddRawCountsFromStandardInput(self):
        """Add the raw counts of every line of standard input.

        Reports the number of lines on stderr if it is zero or if the
        module-level ``args.verbose`` is positive.
        """
        lines_processed = 0
        for line in sys.stdin:
            self.AddRawCountsFromLine(line)
            lines_processed += 1
        if lines_processed == 0 or args.verbose > 0:
            print("make_one_biased_lm.py: processed {0} lines of input".format(
                    lines_processed), file = sys.stderr)


    # This function returns a dict from history (as a tuple of integers of
    # length > 1, ignoring lower-order histories), to the total count of this
    # history state plus all history-states which back off to this history state.
    # It's used inside CompletelyDiscountLowCountStates().
    def GetHistToTotalCount(self):
        """Return a defaultdict(float) from history (length >= 2) to the
        total count of that state plus all states backing off to it."""
        ans = defaultdict(float)
        for n in range(2, self.ngram_order):
            for hist, word_to_count in self.counts[n].items():
                total_count = sum(word_to_count.values())
                # credit this total to the state itself and to every
                # shorter history (of length >= 2) it backs off to.
                while len(hist) >= 2:
                    ans[hist] += total_count
                    hist = hist[1:]
        return ans


    # This function will completely discount the counts in any LM-states of
    # order > 2 (i.e. history-length > 1) that have total count below
    # 'min_count'; when computing the total counts, we include higher-order
    # LM-states that would back off to 'this' lm-state, in the total.
    def CompletelyDiscountLowCountStates(self, min_count):
        """Remove LM-states of history-length > 1 whose total count is below
        ``min_count``, moving their counts to the backoff state."""
        hist_to_total_count = self.GetHistToTotalCount()
        # Highest order first, so counts pushed down from order n are present
        # when order n-1 is processed.
        for n in reversed(list(range(2, self.ngram_order))):
            this_order_counts = self.counts[n]
            to_delete = []
            for hist in this_order_counts.keys():
                if hist_to_total_count[hist] < min_count:
                    # we need to completely back off this count.
                    word_to_count = this_order_counts[hist]
                    # mark this key for deleting (we can't delete while
                    # iterating over the dict).
                    to_delete.append(hist)
                    backoff_hist = hist[1:]  # this will be a tuple not a list.
                    for word, count in word_to_count.items():
                        self.AddCount(backoff_hist, word, count)
            for hist in to_delete:
                del this_order_counts[hist]

    # This backs off the counts according to Kneser-Ney (unmodified,
    # with interpolation).
    def ApplyBackoff(self, D):
        """Discount every count by ``D`` (0 < D < 1), store the discounted
        mass under the backoff symbol and increment the lower-order
        count-of-counts."""
        assert D > 0.0 and D < 1.0
        for n in reversed(list(range(1, self.ngram_order))):
            this_order_counts = self.counts[n]
            for hist, word_to_count in this_order_counts.items():
                backoff_hist = hist[1:]
                backoff_word_to_count = self.counts[n-1][backoff_hist]
                this_discount_total = 0.0
                for word in word_to_count:
                    assert word_to_count[word] >= 1.0
                    word_to_count[word] -= D
                    this_discount_total += D
                    # Interpret the following line as incrementing the
                    # count-of-counts for the next-lower order.
                    backoff_word_to_count[word] += 1.0
                # Added after the loop so that the backoff symbol itself is
                # not discounted.
                word_to_count[self.backoff_symbol] += this_discount_total


    # This function prints out to stderr the n-gram counts stored in this
    # object; it's used for debugging.
    def Print(self, info_string):
        """Print ``info_string`` and all stored counts to stderr."""
        print(info_string, file=sys.stderr)
        # these are useful for debug.
        total = 0.0
        total_excluding_backoff = 0.0
        for this_order_counts in self.counts:
            for hist, word_to_count in this_order_counts.items():
                this_total_count = sum(word_to_count.values())
                print('{0}: total={1} '.format(hist, this_total_count),
                      end='', file=sys.stderr)
                print(' '.join(['{0} -> {1} '.format(word, count)
                                for word, count in word_to_count.items() ]),
                      file = sys.stderr)
                total += this_total_count
                total_excluding_backoff += this_total_count
                if self.backoff_symbol in word_to_count:
                    total_excluding_backoff -= word_to_count[self.backoff_symbol]
        print('total count = {0}, excluding discount = {1}'.format(
                total, total_excluding_backoff), file = sys.stderr)

    def AddTopWords(self, top_words_file):
        """Add the words listed in ``top_words_file`` to the unigram state.

        Each line must be '<integer-id-of-word> <prob>' with both values
        positive; ``prob`` times the unigram total (taken before any word is
        added) is added to that word's unigram count.  Exits the program if
        the file cannot be opened or a line is malformed.
        """
        empty_history = ()
        word_to_count = self.counts[0][empty_history]
        total = sum(word_to_count.values())
        try:
            f = open(top_words_file, mode='r', encoding='utf-8')
        except OSError:
            sys.exit("make_one_biased_lm.py: error opening top-words file: "
                     "--top-words=" + top_words_file)
        # 'with' closes the file even when we exit on a bad line.
        with f:
            for line in f:
                try:
                    [ word_index, prob ] = line.split()
                    word_index = int(word_index)
                    prob = float(prob)
                    # explicit check rather than 'assert', which would be
                    # stripped when running with -O.
                    if not (word_index > 0 and prob > 0.0):
                        raise ValueError("word-id and probability must be positive")
                    word_to_count[word_index] += prob * total
                except Exception as e:
                    sys.exit("make_one_biased_lm.py: could not make sense of the "
                             "line '{0}' in top-words file: {1} ".format(line, str(e)))


    def GetTotalCountMap(self):
        """Return a dict from history to the total count of that state.

        Called from PrintAsFst.
        """
        total_count_map = dict()
        for n in range(0, self.ngram_order):
            for hist, word_to_count in self.counts[n].items():
                total_count_map[hist] = sum(word_to_count.values())
        return total_count_map

    def GetHistToStateMap(self):
        """Return a dict from history to integer FST-state.

        Called from PrintAsFst.  States are numbered consecutively from 0,
        by increasing history-length.
        """
        hist_to_state = dict()
        fst_state_counter = 0
        for n in range(0, self.ngram_order):
            for hist in self.counts[n].keys():
                hist_to_state[hist] = fst_state_counter
                fst_state_counter += 1
        return hist_to_state

    def GetProb(self, hist, word, total_count_map):
        """Return the interpolated probability of ``word`` after ``hist``.

        For a real word with a non-empty history, the backoff probability of
        the state times the (recursively computed) probability in the
        backoff state is added.  ``total_count_map`` is as returned by
        GetTotalCountMap().
        """
        total_count = total_count_map[hist]
        word_to_count = self.counts[len(hist)][hist]
        prob = float(word_to_count[word]) / total_count
        if len(hist) > 0 and word != self.backoff_symbol:
            prob_in_backoff = self.GetProb(hist[1:], word, total_count_map)
            backoff_prob = float(word_to_count[self.backoff_symbol]) / total_count
            prob += backoff_prob * prob_in_backoff
        return prob

    # This function prints the estimated language model as an FST.
    def PrintAsFst(self, word_disambig_symbol):
        """Print the language model to stdout as a text-form FST.

        Real words produce arcs '<src> <dest> <word> <word> <cost>', the EOS
        symbol produces a final-state line '<state> <cost>', and the backoff
        symbol produces an arc with input ``word_disambig_symbol`` and
        output 0.
        """
        # n is the history-length (== order - 1).  We iterate over the
        # history-length in the order 1, 0, 2, 3, and then iterate over the
        # histories of each order in sorted order.  Putting order 1 first
        # and sorting on the histories
        # ensures that the bigram state with <s> as the left context comes first.
        # (note: self.bos_symbol is the most negative symbol)

        # History will map from history (as a tuple) to integer FST-state.
        hist_to_state = self.GetHistToStateMap()
        total_count_map = self.GetTotalCountMap()

        for n in [ 1, 0 ] + list(range(2, self.ngram_order)):
            this_order_counts = self.counts[n]
            # For order 1, make sure the keys are sorted.
            keys = this_order_counts.keys() if n != 1 else sorted(this_order_counts.keys())
            for hist in keys:
                word_to_count = this_order_counts[hist]
                this_fst_state = hist_to_state[hist]

                for word in word_to_count.keys():
                    # work out this_cost.  Costs in OpenFst are negative logs.
                    this_cost = -math.log(self.GetProb(hist, word, total_count_map))

                    if word > 0: # a real word.
                        # destination is the longest suffix of (hist + word)
                        # that exists as a state.
                        next_hist = hist + (word,)  # appending tuples
                        while next_hist not in hist_to_state:
                            next_hist = next_hist[1:]
                        next_fst_state = hist_to_state[next_hist]
                        print(this_fst_state, next_fst_state, word, word,
                              this_cost)
                    elif word == self.eos_symbol:
                        # print final-prob for this state.
                        print(this_fst_state, this_cost)
                    else:
                        assert word == self.backoff_symbol
                        backoff_fst_state = hist_to_state[hist[1:]]
                        print(this_fst_state, backoff_fst_state,
                              word_disambig_symbol, 0, this_cost)


# Driver: accumulate raw counts from stdin, back off low-count states, apply
# Kneser-Ney discounting, optionally add the top words, and print the FST.
# At --verbose >= 3 the counts are dumped to stderr after each stage.
ngram_counts = NgramCounts(args.ngram_order)
ngram_counts.AddRawCountsFromStandardInput()

if args.verbose >= 3:
    ngram_counts.Print("Raw counts:")
ngram_counts.CompletelyDiscountLowCountStates(args.min_lm_state_count)
if args.verbose >= 3:
    ngram_counts.Print("Counts after discounting low-count states:")
ngram_counts.ApplyBackoff(args.discounting_constant)
if args.verbose >= 3:
    ngram_counts.Print("Counts after applying Kneser-Ney discounting:")
if args.top_words is not None:
    ngram_counts.AddTopWords(args.top_words)
    if args.verbose >= 3:
        ngram_counts.Print("Counts after applying top-n-words")
ngram_counts.PrintAsFst(args.word_disambig_symbol)


# test command:
# (echo 6 7 8 4; echo 7 8 9; echo 7 8) | ./make_one_biased_lm.py --word-disambig-symbol=1000 --min-lm-state-count=2 --verbose=3 --top-words=<(echo 1 0.5; echo 2 0.25)


================================================
FILE: egs/steps/cleanup/internal/modify_ctm_edits.py
================================================
#!/usr/bin/env python3

# Copyright 2016   Vimal Manohar
#           2016   Johns Hopkins University (author: Daniel Povey)
# Apache 2.0

from __future__ import print_function
import argparse
import logging
import sys
from collections import defaultdict

"""
This script reads and writes the 'ctm-edits' file that is
produced by get_ctm_edits.py.

It modifies the ctm-edits so that non-scored words
are not counted as errors: for instance, if there are things like
[COUGH] and [NOISE] in the transcript, deletions, insertions and
substitutions involving them are allowed, and we modify the reference
to correspond to the hypothesis.

If you supply the <lang> directory (the one that corresponds to
how you decoded the data) to this script, it assumes that the <lang>
directory contains phones/align_lexicon.int, and it uses this to work
out a reasonable guess of the non-scored phones, based on which have
a single-word pronunciation that maps to a silence phone.
It then uses the words.txt to work out the written form of those words.

Alternatively, you may specify a file containing the non-scored words one
per line, with the --non-scored-words option.

Non-scored words that were deleted (i.e. they were in the ref but not the
hyp) are simply removed from the ctm.  For non-scored words that
were inserted or substituted, we change the reference word to match the
hyp word, but instead of marking the operation as 'cor' (correct), we
mark it as 'fix' (fixed), so that it will not be positively counted as a correct
word for purposes of finding the optimal segment boundaries.

e.g.
<file-id> <channel> <start-time> <duration> <conf> <hyp-word> <ref-word> <edit-type>
[note: the <channel> will always be 1].

AJJacobs_2007P-0001605-0003029 1 0 0.09 <eps> 1.0 <eps> sil
AJJacobs_2007P-0001605-0003029 1 0.09 0.15 i 1.0 i cor
AJJacobs_2007P-0001605-0003029 1 0.24 0.25 thought 1.0 thought cor
AJJacobs_2007P-0001605-0003029 1 0.49 0.14 i'd 1.0 i'd cor
AJJacobs_2007P-0001605-0003029 1 0.63 0.22 tell 1.0 tell cor
AJJacobs_2007P-0001605-0003029 1 0.85 0.11 you 1.0 you cor
AJJacobs_2007P-0001605-0003029 1 0.96 0.05 a 1.0 a cor
AJJacobs_2007P-0001605-0003029 1 1.01 0.24 little 1.0 little cor
AJJacobs_2007P-0001605-0003029 1 1.25 0.5 about 1.0 about cor
AJJacobs_2007P-0001605-0003029 1 1.75 0.48 [UH] 1.0 [UH] cor
"""

# Module-level logger.  Both the logger and its single stream handler are set
# to INFO; each record is formatted with a timestamp, the source file name
# and line number, the function name and the level.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - '
                              '%(funcName)s - %(levelname)s ] %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)


# Command-line interface: two options plus three positional file names
# (non-scored words list, ctm-edits input, ctm-edits output).
# NOTE(review): the description refers to a --tag-non-scored option, but no
# such option is added to the parser in the statements below -- confirm
# whether the description is stale.
parser = argparse.ArgumentParser(
    description = "This program modifies the reference in the ctm-edits which "
    "is output by steps/cleanup/internal/get_ctm_edits.py, to allow insertions, deletions and "
    "substitutions of non-scored words, and [if --allow-repetitions=true], "
    "duplications of single words or pairs of scored words (to account for dysfluencies "
    "that were not transcribed).  Note: deletions and substitutions of non-scored words "
    "after the reference is corrected, will be marked as operation 'fix' rather than "
    "'cor' (correct) so that the downstream processing knows that this was not in "
    "the original reference.  Also by defaults tags non-scored words as such when "
    "they are correct; see the --tag-non-scored option.")

parser.add_argument("--verbose", type = int, default = 1,
                    choices=[0,1,2,3],
                    help = "Verbose level, higher = more verbose output")
# Boolean passed as the string 'true' or 'false' rather than as a flag.
parser.add_argument("--allow-repetitions", type = str, default = 'true',
                    choices=['true','false'],
                    help = "If true, allow repetitions in the transcript of one or "
                    "two-word sequences: for instance if the ref says 'i' but "
                    "the hyp says 'i i', or the ref says 'but then' and the hyp says "
                    "'but then but then', fix the reference accordingly.  Intervening "
                    "non-scored words are allowed between the repetitions.  These "
                    "fixes will be marked as 'cor', not as 'fix', since there is "
                    "generally no way to tell which repetition was the 'real' one "
                    "(and since we're generally confident that such things were "
                    "actually uttered).")
parser.add_argument("non_scored_words_in", metavar = "<non-scored-words-file>",
                    help="Filename of file containing a list of non-scored words, "
                    "one per line. See steps/cleanup/get_nonscored_words.py.")
parser.add_argument("ctm_edits_in", metavar = "<ctm-edits-in>",
                    help = "Filename of input ctm-edits file. "
                    "Use /dev/stdin for standard input.")
parser.add_argument("ctm_edits_out", metavar = "<ctm-edits-out>",
                    help = "Filename of output ctm-edits file. "
                    "Use /dev/stdout for standard output.")

# Note: the command line is parsed at import time, as a module-level statement.
args = parser.parse_args()


def ReadNonScoredWords(non_scored_words_file):
    """Load the non-scored words into the global set 'non_scored_words'.

    The file must contain exactly one word per line.  The script exits via
    sys.exit() if the file cannot be opened, or if any line does not consist
    of exactly one whitespace-separated field (this includes blank lines).

    Arguments:
        non_scored_words_file - filename of the list of non-scored words.
    """
    global non_scored_words
    try:
        f = open(non_scored_words_file, encoding='utf-8')
    except OSError:
        sys.exit("modify_ctm_edits.py: error opening file: "
                 "--non-scored-words=" + non_scored_words_file)
    # The 'with' block makes sure the file is closed even when we exit
    # because of a malformed line.
    with f:
        for line in f:
            a = line.split()
            if len(a) != 1:
                sys.exit("modify_ctm_edits.py: bad line in non-scored-words "
                         "file {0}: {1}".format(non_scored_words_file, line))
            non_scored_words.add(a[0])


# The ctm-edits file format is as follows [note: file-id is really utterance-id
# in this context].
# <file-id> <channel> <start-time> <duration> <hyp-word> <conf> <ref-word> <edit>
# e.g.:
# AJJacobs_2007P-0001605-0003029 1 0 0.09 <eps> 1.0 <eps> sil
# AJJacobs_2007P-0001605-0003029 1 0.09 0.15 i 1.0 i cor
# ...
# This function processes a single line of ctm-edits input for fixing
# "non-scored" words.  The input 'a' is the split line as an array of fields.
# It modifies the object 'a'.   This function returns the modified array,
# and please note that it is destructive of its input 'a'.
# If it returns the empty array then the line is to be deleted.
def ProcessLineForNonScoredWords(a):
    """Fix the reference of one split ctm-edits line for non-scored words.

    'a' is the list of the 8 fields of the line and is modified in place.
    Returns the modified list, or the empty list if the line is to be
    deleted (a deletion of a non-scored word).  Updates the global counters
    num_lines and num_correct_lines and the global map ref_change_stats.
    Any problem with the line is logged and re-raised as RuntimeError.
    """
    global num_lines, num_correct_lines, ref_change_stats
    try:
        assert len(a) == 8
        num_lines += 1
        duration, hyp, ref, edit = a[3], a[4], a[6], a[7]

        if edit == 'del':
            assert hyp == '<eps>' and float(duration) == 0.0
            if ref in non_scored_words:
                # a deleted non-scored word is simply dropped from the
                # reference, i.e. the whole line goes away.
                ref_change_stats[ref + ' -> ' + hyp] += 1
                return []
        elif edit in ('ins', 'sub'):
            if edit == 'ins':
                assert ref == '<eps>'
                # an inserted non-scored word gets inserted into the reference.
                fixable = hyp in non_scored_words
            else:
                assert hyp != '<eps>'
                # one non-scored word may be replaced by another one.
                fixable = hyp in non_scored_words and ref in non_scored_words
            if fixable:
                ref_change_stats[ref + ' -> ' + hyp] += 1
                ref, edit = hyp, 'fix'
        else:
            assert edit == 'cor' or edit == 'sil'
            num_correct_lines += 1

        a[4] = hyp
        a[6] = ref
        a[7] = edit
        return a

    except Exception:
        logger.error("bad line in ctm-edits input: "
                     "{0}".format(a))
        raise RuntimeError

# This function processes the split lines of one utterance (as a
# list of lists of fields), to allow repetitions of words, so if the
# reference says 'i' but the hyp says 'i i', or the ref says
# 'you know' and the hyp says 'you know you know', we change the
# ref to match.
# It returns the modified list-of-lists [but note that the input
# is actually modified].
def ProcessUtteranceForRepetitions(split_lines_of_utt):
    """Fix the reference of one utterance to allow repeated words.

    If the hyp repeats a one-word or two-word sequence that the ref contains
    only once (e.g. ref 'i', hyp 'i i'), the inserted copy is put into the
    reference and its edit-type is set to 'cor'.  Lines whose hyp and ref
    are both '<eps>' or non-scored words are invisible to this matching.

    Arguments:
        split_lines_of_utt - list of the split ctm-edits lines (lists of
            fields) of one utterance.  The lines are modified in place.

    Returns the same list object.  Updates the global map repetition_stats.
    """
    global non_scored_words, repetition_stats
    # selected_line_indexes contains the indexes into 'split_lines_of_utt' of
    # the lines that take part in the matching; selected_edits and
    # selected_hyp_words contain the corresponding edit-type and hyp-word.
    #
    # As a special case, a substitution ('sub') where the reference word was
    # a non-scored word and the hyp word was a real word is recorded in
    # selected_edits as 'ins', because for purposes of this algorithm it
    # behaves the same as an insertion.
    #
    # Whenever we decide on an operation that will change the reference, we
    # set the affected entries of selected_edits to None so that they won't
    # match any further operations.
    selected_line_indexes = []
    selected_edits = []
    selected_hyp_words = []

    for i, split_line in enumerate(split_lines_of_utt):
        hyp_word = split_line[4]
        ref_word = split_line[6]
        # We only eliminate lines involving non-scored words or epsilon in
        # both hyp and reference position.
        # [note: epsilon in hyp position for non-empty segments indicates
        #  optional-silence, and it does make sense to make this 'invisible',
        #  just like non-scored words, for the purposes of this code.]
        if (hyp_word == '<eps>' or hyp_word in non_scored_words) and \
           (ref_word == '<eps>' or ref_word in non_scored_words):
            continue
        edit_type = split_line[7]
        if edit_type == 'sub' and ref_word in non_scored_words:
            assert hyp_word not in non_scored_words
            # For purposes of this algorithm, substitution of, say,
            # '[COUGH]' by 'hello' behaves like an insertion of 'hello',
            # since we're willing to remove the '[COUGH]' from the
            # transcript.
            edit_type = 'ins'
        selected_line_indexes.append(i)
        selected_edits.append(edit_type)
        selected_hyp_words.append(hyp_word)

    # indexes_to_fix will be a list of indexes into 'selected_line_indexes'
    # where we plan to fix the ref to match the hyp.
    indexes_to_fix = []

    # This loop scans for, and fixes, two-word insertions that follow,
    # or precede, the corresponding correct words.
    for i in range(len(selected_line_indexes) - 3):
        this_hyp_words = selected_hyp_words[i:i+4]
        if this_hyp_words[0] != this_hyp_words[2] or \
           this_hyp_words[1] != this_hyp_words[3] or \
           this_hyp_words[0] == this_hyp_words[1]:
            # the hyp words are not of the form [ 'a', 'b', 'a', 'b' ].
            continue
        this_edits = selected_edits[i:i+4]
        if this_edits == ['cor', 'cor', 'ins', 'ins']:
            indexes_to_fix += [i+2, i+3]
        elif this_edits == ['ins', 'ins', 'cor', 'cor']:
            indexes_to_fix += [i, i+1]
        else:
            continue
        # the next line prevents this region of the text being used
        # in any further edits.
        selected_edits[i:i+4] = [None, None, None, None]
        # e.g. word_pair = 'hi there'
        word_pair = this_hyp_words[0] + ' ' + this_hyp_words[1]
        # add 2 because these stats are of words.
        repetition_stats[word_pair] += 2

    # This loop scans for, and fixes, one-word insertions that follow,
    # or precede, the corresponding correct words.
    for i in range(len(selected_line_indexes) - 1):
        this_hyp_words = selected_hyp_words[i:i+2]
        if this_hyp_words[0] != this_hyp_words[1]:
            # the hyp words are not of the form [ 'a', 'a' ].
            continue
        this_edits = selected_edits[i:i+2]
        if this_edits == ['cor', 'ins']:
            indexes_to_fix.append(i+1)
        elif this_edits == ['ins', 'cor']:
            indexes_to_fix.append(i)
        else:
            continue
        repetition_stats[this_hyp_words[0]] += 1
        # the next line prevents this region of the text being used
        # in any further edits.
        selected_edits[i:i+2] = [None, None]

    for i in indexes_to_fix:
        split_line = split_lines_of_utt[selected_line_indexes[i]]
        ref_word = split_line[6]
        assert ref_word == '<eps>' or ref_word in non_scored_words
        # we replace reference with the decoded word, which will be a
        # repetition.
        split_line[6] = split_line[4]
        split_line[7] = 'cor'

    return split_lines_of_utt


# note: split_lines_of_utt is a list of lists, one per line, each containing the
# sequence of fields.
# Returns the same format of data after processing.
def ProcessUtterance(split_lines_of_utt):
    """Process all the split ctm-edits lines of one utterance.

    Each line goes through ProcessLineForNonScoredWords(); lines for which
    that returns the empty list are dropped.  If --allow-repetitions is
    'true', the remaining lines then go through
    ProcessUtteranceForRepetitions().  Returns the processed list of lines.
    """
    kept_lines = []
    for fields in split_lines_of_utt:
        processed = ProcessLineForNonScoredWords(fields)
        if processed == []:
            # this line is to be deleted.
            continue
        kept_lines.append(processed)
    if args.allow_repetitions != 'true':
        return kept_lines
    return ProcessUtteranceForRepetitions(kept_lines)


def ProcessData():
    """Read the ctm-edits input, process it per utterance and write it out.

    The input and output filenames are taken from the module-level 'args'.
    Consecutive input lines that share the same first field (the
    utterance-id) are grouped, given to ProcessUtterance(), and the returned
    lines are printed to the output file.

    Exits via sys.exit() if a file cannot be opened, if the input is empty
    or contains an empty or whitespace-only line, or if closing the output
    fails.
    """
    try:
        f_in = open(args.ctm_edits_in, encoding='utf-8')
    except OSError:
        sys.exit("modify_ctm_edits.py: error opening ctm-edits input "
                 "file {0}".format(args.ctm_edits_in))
    try:
        f_out = open(args.ctm_edits_out, 'w', encoding='utf-8')
    except OSError:
        f_in.close()
        sys.exit("modify_ctm_edits.py: error opening ctm-edits output "
                 "file {0}".format(args.ctm_edits_out))

    # Most of what we're doing in the lines below is splitting the input lines
    # and grouping them per utterance, before giving them to ProcessUtterance()
    # and then printing the modified lines.
    try:
        first_line = f_in.readline()
        if first_line == '':
            sys.exit("modify_ctm_edits.py: empty input")
        split_pending_line = first_line.split()
        if len(split_pending_line) == 0:
            sys.exit("modify_ctm_edits.py: bad input line " + first_line)
        cur_utterance = split_pending_line[0]
        split_lines_of_cur_utterance = []

        while True:
            # An empty split_pending_line means we reached the end of the
            # input; a different first field means a new utterance started.
            # In both cases the lines collected so far are flushed.
            if len(split_pending_line) == 0 or split_pending_line[0] != cur_utterance:
                split_lines_of_cur_utterance = ProcessUtterance(split_lines_of_cur_utterance)
                for split_line in split_lines_of_cur_utterance:
                    print(' '.join(split_line), file = f_out)
                split_lines_of_cur_utterance = []
                if len(split_pending_line) == 0:
                    break
                else:
                    cur_utterance = split_pending_line[0]

            split_lines_of_cur_utterance.append(split_pending_line)
            next_line = f_in.readline()
            split_pending_line = next_line.split()
            if len(split_pending_line) == 0:
                # readline() returns '' only at end-of-file; anything else
                # that splits to nothing is a blank line in the input.
                if next_line != '':
                    sys.exit("modify_ctm_edits.py: got an empty or whitespace input line")
    finally:
        # Close the input on every path, including the sys.exit() calls above.
        f_in.close()

    try:
        f_out.close()
    except OSError:
        sys.exit("modify_ctm_edits.py: error closing ctm-edits output "
                 "(broken pipe or full disk?)")

def PrintNonScoredStats():
    """Print to stderr statistics of the reference fixes for non-scored words.

    Reads the module-level num_lines, num_correct_lines, ref_change_stats
    and args.verbose.  Prints nothing if the verbose level is below 1.  The
    most common edits are listed with their share of all such edits: up to
    40 of them if the verbose level is at least 2, otherwise up to 10.
    """
    if args.verbose < 1:
        return
    if num_lines == 0:
        print("modify_ctm_edits.py: processed no input.", file = sys.stderr)
        # Nothing more to report; the percentages below would divide by zero.
        return
    num_lines_modified = sum(ref_change_stats.values())
    num_incorrect_lines = num_lines - num_correct_lines
    percent_lines_incorrect = '%.2f' % (num_incorrect_lines * 100.0 / num_lines)
    percent_modified = '%.2f' % (num_lines_modified * 100.0 / num_lines)
    if num_incorrect_lines > 0:
        percent_of_incorrect_modified = '%.2f' % (num_lines_modified * 100.0 /
                                                  num_incorrect_lines)
    else:
        percent_of_incorrect_modified = float('nan')
    print("modify_ctm_edits.py: processed {0} lines of ctm ({1}% of which incorrect), "
          "of which {2} were changed fixing the reference for non-scored words "
          "({3}% of lines, or {4}% of incorrect lines)".format(
            num_lines, percent_lines_incorrect, num_lines_modified,
            percent_modified, percent_of_incorrect_modified),
          file = sys.stderr)

    keys = sorted(ref_change_stats.keys(), reverse=True,
                  key = lambda x: ref_change_stats[x])
    if not keys:
        # no edits were made, so there is nothing to list.
        return
    num_keys_to_print = 40 if args.verbose >= 2 else 10

    edit_lines = ['%s [%.2f%%]' % (k, ref_change_stats[k]*100.0/num_lines_modified)
                  for k in keys[0:num_keys_to_print]]
    if num_keys_to_print < len(keys):
        # mark that the list was truncated.
        edit_lines.append('...')
    print("modify_ctm_edits.py: most common edits (as percentages "
          "of all such edits) are:\n" + '\n'.join(edit_lines),
          file = sys.stderr)


def PrintRepetitionStats():
    """Print to stderr statistics of the reference fixes for repetitions.

    Reads the module-level num_lines, num_correct_lines, repetition_stats
    and args.verbose.  Prints nothing if the verbose level is below 1 or if
    no repetitions were fixed.  The most common repetitions are listed with
    their share of all words fixed this way: up to 40 of them if the
    verbose level is at least 2, otherwise up to 10.
    """
    if args.verbose < 1 or sum(repetition_stats.values()) == 0:
        return
    num_lines_modified = sum(repetition_stats.values())
    num_incorrect_lines = num_lines - num_correct_lines
    percent_lines_incorrect = '%.2f' % (num_incorrect_lines * 100.0 / num_lines)
    percent_modified = '%.2f' % (num_lines_modified * 100.0 / num_lines)
    if num_incorrect_lines > 0:
        percent_of_incorrect_modified = '%.2f' % (num_lines_modified * 100.0 /
                                                  num_incorrect_lines)
    else:
        percent_of_incorrect_modified = float('nan')
    print("modify_ctm_edits.py: processed {0} lines of ctm ({1}% of which incorrect), "
          "of which {2} were changed fixing the reference for repetitions ({3}% of "
          "lines, or {4}% of incorrect lines)".format(
            num_lines, percent_lines_incorrect, num_lines_modified,
            percent_modified, percent_of_incorrect_modified),
          file = sys.stderr)

    keys = sorted(repetition_stats.keys(), reverse=True,
                  key = lambda x: repetition_stats[x])
    num_keys_to_print = 40 if args.verbose >= 2 else 10

    repetition_lines = ['%s [%.2f%%]' % (k, repetition_stats[k]*100.0/num_lines_modified)
                        for k in keys[0:num_keys_to_print]]
    if num_keys_to_print < len(keys):
        # mark that the list was truncated.
        repetition_lines.append('...')
    print("modify_ctm_edits.py: most common repetitions inserted into reference (as percentages "
          "of all words fixed in this way) are:\n" + '\n'.join(repetition_lines),
          file = sys.stderr)


# Module-level state shared by the functions above through 'global'
# declarations.

# non_scored_words is the set of words read from the non-scored-words file.
non_scored_words = set()
ReadNonScoredWords(args.non_scored_words_in)

# num_lines counts the ctm-edits lines processed by
# ProcessLineForNonScoredWords(); num_correct_lines counts those of them
# whose edit-type was 'cor' or 'sil'.
num_lines = 0
num_correct_lines = 0
# ref_change_stats will be a map from a string like
# 'foo -> bar' to an integer count; it keeps track of how much we changed
# the reference.
ref_change_stats = defaultdict(int)
# repetition_stats will be a map from strings like
# 'a', or 'a b' (the repeated strings), to an integer count; like
# ref_change_stats, it keeps track of how many changes we made
# in allowing repetitions.
repetition_stats = defaultdict(int)

# Do the actual work, then report the statistics on stderr.
ProcessData()
PrintNonScoredStats()
PrintRepetitionStats()


================================================
FILE: egs/steps/cleanup/internal/resolve_ctm_edits_overlaps.py
================================================
#! /usr/bin/env python

# Copyright 2014  Johns Hopkins University (Authors: Daniel Povey)
#           2014  Vijayaditya Peddinti
#           2016  Vimal Manohar
# Apache 2.0.

"""
Script to combine ctms edits with overlapping segments obtained from
smith-waterman alignment. This script is similar to utils/ctm/resolve_ctm_edits.py,
where the overlapping region is just split in two. The approach here is a
little more advanced since we have access to the WER
(w.r.t. the reference text). It finds the WER of the overlapped region
in the two overlapping segments, and chooses the better one.
"""

from __future__ import print_function
from __future__ import division
import argparse
import collections
import logging

# Module-level logger that writes messages of level INFO and above through
# a StreamHandler; get_args() lowers both levels to DEBUG when --verbose is
# greater than 2.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setLevel(logging.INFO)
formatter = logging.Formatter(
    '%(asctime)s [%(pathname)s:%(lineno)s - '
    '%(funcName)s - %(levelname)s ] %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)


def get_args():
    """Parses the command line arguments and returns them.

    The positional arguments 'segments', 'ctm_edits_in' and 'ctm_edits_out'
    are opened by argparse and are available as file objects.  If --verbose
    is greater than 2, the module logger and its handler are switched to
    the DEBUG level.
    """

    description = """ Python script to resolve overlaps in ctms """
    # The text has to be passed by keyword: the first positional parameter
    # of ArgumentParser is 'prog', so passing it positionally would replace
    # the program name in the usage message.
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('segments', type=argparse.FileType('r'),
                        help='use segments to resolve overlaps')
    parser.add_argument('ctm_edits_in', type=argparse.FileType('r'),
                        help='input_ctm_file')
    parser.add_argument('ctm_edits_out', type=argparse.FileType('w'),
                        help='output_ctm_file')
    parser.add_argument('--verbose', type=int, default=0,
                        help="Higher value for more verbose logging.")
    args = parser.parse_args()

    if args.verbose > 2:
        logger.setLevel(logging.DEBUG)
        handler.setLevel(logging.DEBUG)

    return args


def read_segments(segments_file):
    """Reads the segments file and closes it.

    Every line must have 4 or 5 whitespace-separated fields, of which the
    first four are used: utterance-id, recording-id, start-time, end-time.

    Returns a tuple of two dictionaries:
        {utterance_id: (recording_id, start_time, end_time)}
        {recording_id: list of utterance_ids, in the order read}
    """
    segments = {}
    reco2utt = collections.defaultdict(list)

    num_lines = 0
    for num_lines, line in enumerate(segments_file, 1):
        fields = line.strip().split()
        assert len(fields) in [4, 5]
        utt_id, reco_id = fields[0], fields[1]
        segments[utt_id] = (reco_id, float(fields[2]), float(fields[3]))
        reco2utt[reco_id].append(utt_id)

    logger.info("Read %d lines from segments file %s",
                num_lines, segments_file.name)
    segments_file.close()

    return segments, reco2utt


def read_ctm_edits(ctm_edits_file, segments):
    """Reads the CTM edits from ctm_edits_file and closes the file.

    Returns a dictionary {(recording_id, utterance_id): ctm_edit_lines}
        where ctm_edit_lines is the list of the lines read for that
        utterance, in the order in which they were read.  Each line is a
        list of the form
        [utterance, channel, start_time, duration, hyp_word, conf,
         ref_word, edit_type]
        in which start_time, duration and conf are floats and the other
        fields are strings.  Any fields after the sixth one are kept as
        they are.

    Arguments:
        ctm_edits_file - The opened CTM edits file.
        segments - Dictionary containing the output of read_segments()
            { utterance_id: (recording_id, start_time, end_time) }
            It is used to look up the recording of each utterance.
    """
    ctm_edits = {}

    num_lines = 0
    for num_lines, line in enumerate(ctm_edits_file, 1):
        fields = line.split()

        utt = fields[0]
        key = (segments[utt][0], utt)

        entry = [utt, fields[1], float(fields[2]), float(fields[3]),
                 fields[4], float(fields[5])]
        entry.extend(fields[6:])
        ctm_edits.setdefault(key, []).append(entry)

    logger.info("Read %d lines from CTM %s", num_lines, ctm_edits_file.name)

    ctm_edits_file.close()
    return ctm_edits


def wer(ctm_edit_lines):
    """Returns the fraction of incorrect words in the CTM edit lines.

    Lines with the edit-type 'sil' are ignored.  All the other lines are
    counted as words, and those with the edit-type 'ins', 'del' or 'sub'
    are counted as incorrect.  Returns 0 if there are no words.

    Arguments:
        ctm_edit_lines - list of CTM edit lines; the edit-type is read
            from index 7 of each line.
    """
    num_words = 0
    num_incorrect_words = 0
    for line in ctm_edit_lines:
        edit_type = line[7]
        if edit_type == 'sil':
            continue
        num_words += 1
        if edit_type in ('ins', 'del', 'sub'):
            num_incorrect_words += 1
    # Every incorrect word is also counted in num_words, so there cannot be
    # incorrect words when num_words is 0.
    if num_words == 0:
        return 0
    return float(num_incorrect_words) / num_words


def choose_best_ctm_lines(first_lines, second_lines,
                          window_length, overlap_length):
    """Returns the index of the set of ctm lines that has the lower WER:
    0 for first_lines and 1 for second_lines. If both have the same WER,
    0 is returned.

    Arguments:
        first_lines - CTM edit lines of the first candidate.
        second_lines - CTM edit lines of the second candidate.
        window_length - currently not used.
        overlap_length - currently not used.
    """
    first_wer = wer(first_lines)
    second_wer = wer(second_lines)
    return 1 if second_wer < first_wer else 0


def resolve_overlaps(ctm_edits, segments):
    """Resolve overlaps within segments of the same recording.

    For each pair of consecutive utterances, the lines of the current
    utterance that fall in the overlap are compared, using
    choose_best_ctm_lines(), with the lines of the next utterance that fall
    in the overlap, and only the chosen set of lines is kept.

    Returns new lines of CTM edits for the recording, as a single list.

    Arguments:
        ctm_edits - The CTM edit lines for a single recording, as a list
            with one element per utterance. Assumes that the utterances
            are in the order of their start times.
            The format is the following:
            [[(utteranceA, channelA, start_time1, duration1, hyp_word1, conf1, ...),
              (utteranceA, channelA, start_time2, duration2, hyp_word2, conf2, ...),
              ...
              (utteranceA, channelA, start_timeN, durationN, hyp_wordN, confN, ...)
             ],
             [(utteranceB, channelB, start_time1, duration1, hyp_word1, conf1, ...),
              (utteranceB, channelB, start_time2, duration2, hyp_word2, conf2, ...),
              ...],
             ...
             [...
              (utteranceZ, channelZ, start_timeN, durationN, hyp_wordN, confN, ...)]
            ]
            Expects this to be non-empty. Note that the elements of this
            list may get replaced while the overlaps are resolved.
        segments - Dictionary containing the output of read_segments()
            { utterance_id: (recording_id, start_time, end_time) }

    Raises:
        ValueError - if the utterances are not in the expected order.
        """
    total_ctm_edits = []
    assert len(ctm_edits) > 0

    # First column of first line in CTM for first utterance
    next_utt = ctm_edits[0][0][0]
    for utt_index, ctm_edits_for_cur_utt in enumerate(ctm_edits):
        if utt_index == len(ctm_edits) - 1:
            # The last utterance is added after the loop.
            break

        if len(ctm_edits_for_cur_utt) == 0:
            # All the lines of this utterance were dropped while resolving
            # the overlap with the previous utterance.
            next_utt = ctm_edits[utt_index + 1][0][0]
            continue

        cur_utt = ctm_edits_for_cur_utt[0][0]
        if cur_utt != next_utt:
            logger.error(
                "Current utterance %s is not the same as the next "
                "utterance %s in previous iteration.\n"
                "CTM is not sorted by utterance-id?",
                cur_utt, next_utt)
            raise ValueError

        # Assumption here is that the segments are written in
        # consecutive order in time.
        ctm_edits_for_next_utt = ctm_edits[utt_index + 1]
        next_utt = ctm_edits_for_next_utt[0][0]
        if segments[next_utt][1] < segments[cur_utt][1]:
            logger.error(
                "Next utterance %s <= Current utterance %s. "
                "CTM edits is not sorted by utterance-id.",
                next_utt, cur_utt)
            raise ValueError

        try:
            # length of this utterance
            window_length = segments[cur_utt][2] - segments[cur_utt][1]

            # overlap of this segment with the next segment
            # i.e. current_utterance_end_time - next_utterance_start_time
            # Note: It is possible for this to be negative when there is
            # actually no overlap between consecutive segments.
            try:
                overlap = segments[cur_utt][2] - segments[next_utt][1]
            except KeyError:
                logger.error("Could not find utterance %s in segments",
                             next_utt)
                raise

            # find the first word that is in the overlap
            # at the end of the cur utt, i.e. the first word whose
            # mid-point is beyond the start of the overlap.
            try:
                cur_utt_end_index = next(
                    (i for i, line in enumerate(ctm_edits_for_cur_utt)
                     if line[2] + line[3] / 2.0 > window_length - overlap))
            except StopIteration:
                cur_utt_end_index = len(ctm_edits_for_cur_utt)

            cur_utt_end_lines = ctm_edits_for_cur_utt[cur_utt_end_index:]

            # find the first word of the next utt whose mid-point is
            # beyond the overlap; the words before it are in the overlap.
            try:
                next_utt_start_index = next(
                    (i for i, line in enumerate(ctm_edits_for_next_utt)
                     if line[2] + line[3] / 2.0 > overlap))
            except StopIteration:
                next_utt_start_index = 0

            next_utt_start_lines = ctm_edits_for_next_utt[:
                                                          next_utt_start_index]

            choose_index = choose_best_ctm_lines(
                cur_utt_end_lines, next_utt_start_lines,
                window_length, overlap)

            # Ignore the hypotheses beyond this midpoint. They will be
            # considered as part of the next segment.
            if choose_index == 1:
                total_ctm_edits.extend(
                    ctm_edits_for_cur_utt[:cur_utt_end_index])
            else:
                total_ctm_edits.extend(ctm_edits_for_cur_utt)

            if choose_index == 0 and next_utt_start_index > 0:
                # Update the ctm_edits_for_next_utt to include only the lines
                # starting from index.
                ctm_edits[utt_index + 1] = (
                    ctm_edits_for_next_utt[next_utt_start_index:])
            # else leave the ctm_edits as is.
        except Exception:
            logger.error("Could not resolve overlaps between CTM edits for "
                         "%s and %s", cur_utt, next_utt)
            logger.error("Current CTM:")
            for line in ctm_edits_for_cur_utt:
                logger.error(ctm_edit_line_to_string(line))
            logger.error("Next CTM:")
            for line in ctm_edits_for_next_utt:
                logger.error(ctm_edit_line_to_string(line))
            raise

    # merge the last ctm entirely
    total_ctm_edits.extend(ctm_edits[-1])

    return total_ctm_edits


def ctm_edit_line_to_string(line):
    """Converts a line of CTM edit to a space-separated string.

    The first six fields are formatted individually; the remaining fields
    must already be strings.
    """
    head = [format(line[i]) for i in range(6)]
    tail = " ".join(line[6:])
    return " ".join(head + [tail])


def write_ctm_edits(ctm_edit_lines, out_file):
    """Writes the CTM edit lines in a list to out_file, one per line."""
    for ctm_edit_line in ctm_edit_lines:
        text = ctm_edit_line_to_string(ctm_edit_line)
        print(text, file=out_file)


def run(args):
    """this method does everything in this script

    It reads the segments and the CTM edits and then, for each recording,
    resolves the overlaps between its utterances (taken in the order of
    their start times) and writes the resulting CTM edits to
    args.ctm_edits_out, which is closed at the end.

    Arguments:
        args - the arguments returned by get_args()
    """
    segments, reco2utt = read_segments(args.segments)
    ctm_edits = read_ctm_edits(args.ctm_edits_in, segments)

    num_recordings_written = 0
    for reco, utts in reco2utt.items():
        ctm_edits_for_reco = []
        for utt in sorted(utts, key=lambda x: segments[x][1]):
            if (reco, utt) in ctm_edits:
                ctm_edits_for_reco.append(ctm_edits[(reco, utt)])
        try:
            if len(ctm_edits_for_reco) == 0:
                logger.warning('CTMs for recording %s is empty.',
                               reco)
                continue   # Go to the next recording

            # Process CTMs in the recordings
            ctm_edits_for_reco = resolve_overlaps(ctm_edits_for_reco, segments)
            write_ctm_edits(ctm_edits_for_reco, args.ctm_edits_out)
            num_recordings_written += 1
        except Exception:
            logger.error("Failed to process CTM edits for recording %s",
                         reco)
            raise
    args.ctm_edits_out.close()
    # ctm_edits has one entry per utterance, so the recordings are counted
    # separately while they are written.
    logger.info("Wrote CTM for %d recordings.", num_recordings_written)


def main():
    """Parses the arguments, calls run() and closes the files.

    Any failure of run() is logged with its traceback and is then raised
    as RuntimeError.
    """
    args = get_args()
    try:
        run(args)
    except:
        logger.error("Failed to resolve overlaps", exc_info=True)
        raise RuntimeError
    finally:
        # The files are closed whether or not run() succeeded.
        try:
            for handle in (args.segments, args.ctm_edits_in,
                           args.ctm_edits_out):
                if handle is None:
                    continue
                handle.close()
        except IOError:
            logger.error("Could not close some files. "
                         "Disk error or broken pipes?")
            raise
        except UnboundLocalError:
            raise SystemExit(1)


# Call main() only when this file is executed as a script.
if __name__ == "__main__":
    main()


================================================
FILE: egs/steps/cleanup/internal/retrieve_similar_docs.py
================================================
#! /usr/bin/env python

# Copyright 2017  Vimal Manohar
# Apache 2.0.

"""This script retrieves documents similar to the query documents
using a similarity score based on the total TFIDF for all the terms in the
query document.

Some terminology:
    original utterance-id = The utterance-id of the original long audio segments
        and the corresponding reference transcript
    source-text = reference transcript
    source-text-id = original utterance-id
    sub-segment = Approximately 30s long chunk of the original utterance
    query-id = utterance-id of the sub-segment
    document = Approximately 1000 words of a source-text
    doc-id = Id of the document

e.g.
foo1 A B C D E F is in the original text file
and foo1 foo 100 200 is in the original segments file.

Here foo1 is the source-text-id and "A B C D E F" is the reference transcript. It
is a 100s long segment from the recording foo.

foo1 is split into 30s long sub-segments as follows:
foo1-1 foo1 100 130
foo1-2 foo1 125 155
foo1-3 foo1 150 180
foo1-4 foo1 175 200

foo1-{1,2,3,4} are query-ids.

The source-text for foo1 is split into two-word documents.
doc1 A B
doc2 C D
doc3 E F

doc{1,2,3} are doc-ids.

--source-text-id2doc-ids option is given a mapping that contains
foo1 doc1 doc2 doc3

--query-id2source-text-id option is given a mapping that contains
foo1-1 foo1
foo1-2 foo1
foo1-3 foo1
foo1-4 foo1

The query TF-IDFs are all indexed by the utterance-id of the sub-segments
of the original utterances.
The source TF-IDFs use the document-ids created by splitting the source-text
(corresponding to original utterances) into documents.

For each query (sub-segment), we need to retrieve the documents that were
created from the same original utterance that the sub-segment was from. For
this, we have to load the source TF-IDF that has those documents. This
information is provided using the option --source-text2tf-idf-file, which
is like an SCP file with the first column being the source-text-id and the
second column being the location of TF-IDF for the documents corresponding
to that source-text-id.

The output of this script is a file where the first column is the
query-id (i.e. sub-segment-id) and the remaining columns, which is at least
one in number and a maximum of (1 + 2 * num-neighbors-to-search) columns
are tuples separated by commas
(<doc-id>, <start-fraction>, <end-fraction>), where <doc-id> is the document-id
<start-fraction> is the proportion of the document from the beginning
that needs to be in the retrieved set.
<end-fraction> is the proportion of the document from the end
that needs to be in the retrieved set.
If both <start-fraction> and <end-fraction> are 1, then the full document is
added to the retrieved set.
Some examples of the lines in the output file are:
foo1-1 doc1,1,1
foo1-2 doc1,0,0.2 doc2,1,1 doc3,0.2,0
"""

from __future__ import print_function
import argparse
import logging

import tf_idf


# NOTE(review): the argument below is the quoted string '__name__', so this
# logger is literally named "__name__" and not after this module -- confirm
# whether logging.getLogger(__name__) was intended.
logger = logging.getLogger('__name__')
handler = logging.StreamHandler()
handler.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s [%(filename)s:%(lineno)s - "
                              "%(funcName)s - %(levelname)s ] %(message)s")
handler.setFormatter(formatter)

# Attach the same handler to this script's logger and to the 'tf_idf' and
# 'libs' loggers.  The loggers are set to DEBUG, but the handler itself only
# lets through messages of level INFO and above.
for l in [logger, logging.getLogger('tf_idf'), logging.getLogger('libs')]:
    l.setLevel(logging.DEBUG)
    l.addHandler(handler)


def get_args():
    """Parse and validate the command-line arguments of this script.

    Returns:
        The argparse.Namespace with the parsed options.  The file-valued
        options (--source-text-id2doc-ids, --query-id2source-text-id,
        --source-text-id2tfidf, --query-tfidf and --relevant-docs) are
        returned as already-opened file handles.

    Raises:
        ValueError: if --partial-doc-fraction is not in the range [0, 1].
    """
    parser = argparse.ArgumentParser(
        description="""This script retrieves documents similar to the
        query documents using a similarity score based on the total TFIDF for
        all the terms in the query document.
        See the beginning of the script for more details about the
        arguments to the script.""")

    parser.add_argument("--verbose", type=int, default=0, choices=[0, 1, 2, 3],
                        help="Higher for more logging statements")

    parser.add_argument("--num-neighbors-to-search", type=int, default=0,
                        help="""Number of neighboring documents to search
                        around the one retrieved based on maximum tf-idf
                        similarity. A value of 0 means only the document
                        with the maximum tf-idf similarity is retrieved,
                        and none of the documents adjacent to it.""")
    parser.add_argument("--neighbor-tfidf-threshold", type=float, default=0.9,
                        help="""Ignore neighbors that have tf-idf similarity
                        with the query document less than this threshold
                        factor lower than the best score.""")
    # type=float is required here: without it a value given on the command
    # line stays a string, which breaks both the range check below and the
    # "%.2f" formatting of the output.
    parser.add_argument("--partial-doc-fraction", type=float, default=0.2,
                        help="""The fraction of neighboring document that will
                        be part of the retrieved document set.
                        If this is greater than 0, then a fraction of words
                        from the neighboring documents is added to the
                        retrieved document.""")

    parser.add_argument("--source-text-id2doc-ids",
                        type=argparse.FileType('r'), required=True,
                        help="""A mapping from the source text to a list of
                        documents that it is broken into
                        <text-utterance-id> <document-id-1> ...
                        <document-id-N>""")
    parser.add_argument("--query-id2source-text-id",
                        type=argparse.FileType('r'), required=True,
                        help="""A mapping from the query document-id to a
                        source text from which a document needs to be
                        retrieved.""")
    parser.add_argument("--source-text-id2tfidf", type=argparse.FileType('r'),
                        required=True,
                        help="""An SCP file for the TF-IDF for source
                        documents indexed by the source-text-id.""")
    parser.add_argument("--query-tfidf", type=argparse.FileType('r'),
                        required=True,
                        help="""Archive of TF-IDF objects for query documents
                        indexed by the query-id.
                        The format is
                        query-id <TFIDF> ... </TFIDF>
                        """)
    parser.add_argument("--relevant-docs", type=argparse.FileType('w'),
                        required=True,
                        help="""Output archive of a list of source documents
                        similar to a query document, indexed by the
                        query document id.""")

    args = parser.parse_args()

    # Written as a chained comparison so that NaN is rejected as well.
    if not 0 <= args.partial_doc_fraction <= 1:
        logger.error("--partial-doc-fraction must be in [0,1]")
        raise ValueError("--partial-doc-fraction must be in [0,1]")

    return args


def read_map(file_handle, num_values_per_key=None,
             min_num_values_per_key=None, must_contain_unique_key=True):
    """Load a whitespace-separated key/value table into a dictionary.

    Every line of the input is expected to look like
    <key> <value-1> <value-2> ... <value-N>

    Arguments:
        file_handle - An opened, readable file containing the map.  It is
                      closed by this function once all lines have been read
                      successfully.
        num_values_per_key - If given, every line must have exactly this many
                             values after the key.
        min_num_values_per_key - If given, every line must have at least this
                                 many values after the key.
        must_contain_unique_key - If True, a key that appears on more than
                                  one line is an error.  If False, the last
                                  occurrence of the key wins.

    Returns:
        { key: values }, where "values" is the list of value strings on the
        line, except when num_values_per_key is 1, in which case it is the
        single value string itself.

    Raises:
        TypeError on a column-count mismatch or a duplicate key.  Any error
        raised while handling a line is logged together with the offending
        line and then re-raised.
    """
    mapping = {}
    # With exactly one value per key we store the bare string, not a list.
    store_single_value = (num_values_per_key is not None
                          and num_values_per_key == 1)

    for line in file_handle:
        try:
            fields = line.strip().split()
            key = fields[0]
            num_fields = len(fields)

            if (num_values_per_key is not None
                    and num_fields - 1 != num_values_per_key):
                logger.error(
                    "Expecting {0} columns; Got {1}.".format(
                        num_values_per_key + 1, num_fields))
                raise TypeError

            if (min_num_values_per_key is not None
                    and num_fields - 1 < min_num_values_per_key):
                logger.error(
                    "Expecting at least {0} columns; Got {1}.".format(
                        min_num_values_per_key + 1, num_fields))
                raise TypeError

            if must_contain_unique_key and key in mapping:
                logger.error("Found duplicate key %s", key)
                raise TypeError

            mapping[key] = fields[1] if store_single_value else fields[1:]
        except Exception:
            logger.error("Failed reading line %s in file %s",
                         line, file_handle.name)
            raise

    file_handle.close()
    return mapping


def get_document_ids(source_docs, indexes):
    """Translate selected document positions into document-ids.

    Arguments:
        source_docs - Sequence of document-ids, indexed by position.
        indexes - Dictionary { position: (start-fraction, end-fraction) }.

    Returns:
        A list of (doc-id, start-fraction, end-fraction) tuples ordered by
        position.  Positions for which source_docs raises IndexError are
        silently left out.
    """
    doc_ids = []
    for position in sorted(indexes):
        fractions = indexes[position]
        start_fraction, end_fraction = fractions[0], fractions[1]
        try:
            doc_id = source_docs[position]
        except IndexError:
            # Position falls outside the list of source documents; skip it.
            continue
        doc_ids.append((doc_id, start_fraction, end_fraction))
    return doc_ids


def run(args):
    """The main function that does all the processing.
    Takes as argument the Namespace object obtained from get_args().

    For every query in the --query-tfidf archive, the source document with
    the highest similarity score is selected (along with neighboring
    documents, fully or partially), and one line
    "<query-id> <doc-id>,<start-fraction>,<end-fraction> ..." is written to
    args.relevant_docs.

    Raises:
        RuntimeError: if the query archive did not contain any query.
    """
    query_id2source_text_id = read_map(args.query_id2source_text_id,
                                       num_values_per_key=1)
    source_text_id2doc_ids = read_map(args.source_text_id2doc_ids,
                                      min_num_values_per_key=1)

    source_text_id2tfidf = read_map(args.source_text_id2tfidf,
                                    num_values_per_key=1)

    num_queries = 0
    prev_source_text_id = ""
    for query_id, query_tfidf in tf_idf.read_tfidf_ark(args.query_tfidf):
        num_queries += 1

        # The source text from which a document is to be retrieved for the
        # input query
        source_text_id = query_id2source_text_id[query_id]

        # Only reload the source TF-IDF when the source text changes between
        # consecutive queries.
        if prev_source_text_id != source_text_id:
            source_tfidf = tf_idf.TFIDF()
            # Close the file as soon as it has been read, instead of leaking
            # one open handle per source text.
            # NOTE(review): assumes TFIDF.read() consumes the handle fully
            # before returning -- confirm against tf_idf.py.
            with open(source_text_id2tfidf[source_text_id]) as tfidf_file:
                source_tfidf.read(tfidf_file)
            prev_source_text_id = source_text_id

        # The source documents corresponding to the source text.
        # This is set of documents which will be searched over for the query.
        source_doc_ids = source_text_id2doc_ids[source_text_id]

        scores = query_tfidf.compute_similarity_scores(
            source_tfidf, source_docs=source_doc_ids, query_id=query_id)

        assert len(scores) > 0, (
            "Did not get scores for query {0}".format(query_id))

        if args.verbose > 2:
            for tup, score in scores.items():
                logger.debug("Score, {num}: {0} {1} {2}".format(
                    tup[0], tup[1], score, num=num_queries))

        # scores is indexed by (query-id, doc-id) tuples.
        best_index, best_doc_id = max(
            enumerate(source_doc_ids), key=lambda x: scores[(query_id, x[1])])
        best_score = scores[(query_id, best_doc_id)]

        assert source_doc_ids[best_index] == best_doc_id
        assert best_score == max([scores[(query_id, x)]
                                  for x in source_doc_ids])

        # Maps a position in source_doc_ids to a tuple
        # (start-fraction, end-fraction) of the document to retrieve.
        best_indexes = {}

        if args.num_neighbors_to_search == 0:
            # Take the best document in full, plus the end of the document
            # before it and the beginning of the document after it.
            best_indexes[best_index] = (1, 1)
            if best_index > 0:
                best_indexes[best_index - 1] = (0, args.partial_doc_fraction)
            if best_index < len(source_doc_ids) - 1:
                best_indexes[best_index + 1] = (args.partial_doc_fraction, 0)
        else:
            # Scan the window around the best document.
            #   Type 2: a document scoring above the threshold is kept in full.
            #   Type 1: a rejected document just before a kept one
            #           contributes its end.
            #   Type 3: a rejected document just after a non-rejected one
            #           contributes its beginning.
            excluded_indexes = set()
            for index in range(
                    max(best_index - args.num_neighbors_to_search, 0),
                    min(best_index + args.num_neighbors_to_search + 1,
                        len(source_doc_ids))):
                if (scores[(query_id, source_doc_ids[index])]
                        >= args.neighbor_tfidf_threshold * best_score):
                    best_indexes[index] = (1, 1)    # Type 2
                    if index > 0 and index - 1 in excluded_indexes:
                        try:
                            # Type 1 and 3
                            start_frac, end_frac = best_indexes[index - 1]
                            assert end_frac == 0
                            best_indexes[index - 1] = (
                                start_frac, args.partial_doc_fraction)
                        except KeyError:
                            # Type 1
                            best_indexes[index - 1] = (
                                0, args.partial_doc_fraction)
                else:
                    excluded_indexes.add(index)
                    if index > 0 and index - 1 not in excluded_indexes:
                        # Type 3
                        best_indexes[index] = (args.partial_doc_fraction, 0)

        best_docs = get_document_ids(source_doc_ids, best_indexes)

        assert len(best_docs) > 0, (
            "Did not get best docs for query {0}\n"
            "Scores: {1}\n"
            "Source docs: {2}\n"
            "Best index: {best_index}, score: {best_score}\n".format(
                query_id, scores, source_doc_ids,
                best_index=best_index, best_score=best_score))
        assert (best_doc_id, 1.0, 1.0) in best_docs

        print("{0} {1}".format(query_id, " ".join(
            ["%s,%.2f,%.2f" % x for x in best_docs])),
              file=args.relevant_docs)

    if num_queries == 0:
        raise RuntimeError("Failed to retrieve any document.")

    logger.info("Retrieved similar documents for "
                "%d queries", num_queries)


def main():
    """Script entry point: parse the arguments, run the retrieval, and make
    sure every file opened by the argument parser gets closed afterwards."""
    args = get_args()

    if args.verbose > 1:
        handler.setLevel(logging.DEBUG)

    # Names of the options that argparse opened as file handles.
    file_options = ("query_id2source_text_id", "source_text_id2doc_ids",
                    "relevant_docs", "query_tfidf", "source_text_id2tfidf")
    try:
        run(args)
    finally:
        for option in file_options:
            getattr(args, option).close()


if __name__ == '__main__':
    main()


================================================
FILE: egs/steps/cleanup/internal/segment_ctm_edits.py
================================================
#!/usr/bin/env python3


# Copyright 2016   Vimal Manohar
#           2016   Johns Hopkins University (author: Daniel Povey)
# Apache 2.0

from __future__ import print_function
from __future__ import division
import sys, operator, argparse, os
from collections import defaultdict

# This script reads the 'ctm-edits' file format that is produced by
# get_ctm_edits.py and modified by modify_ctm_edits.py and taint_ctm_edits.py.
# Its function is to produce a segmentation and text from the ctm-edits input.

# The ctm-edits file format that this script expects is as follows
# <file-id> <channel> <start-time> <duration> <hyp-word> <conf> <ref-word> <edit> ['tainted']
# [note: file-id is really utterance-id at this point; the code below reads
# the hyp-word from field 4 and the ref-word from field 6, counting from 0].

# Command-line interface.  The options are parsed at import time into the
# module-level 'args', which the functions and classes below read as a global.
parser = argparse.ArgumentParser(
    description="This program produces segmentation and text information "
    "based on reading ctm-edits input format which is produced by "
    "steps/cleanup/internal/get_ctm_edits.py, steps/cleanup/internal/modify_ctm_edits.py and "
    "steps/cleanup/internal/taint_ctm_edits.py.",
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)

# Limits on the length of the produced segments.
parser.add_argument("--min-segment-length", type=float, default=0.5,
                    help="Minimum allowed segment length (in seconds) for any "
                    "segment; shorter segments than this will be discarded.")
parser.add_argument("--min-new-segment-length", type=float, default=1.0,
                    help="Minimum allowed segment length (in seconds) for newly "
                    "created segments (i.e. not identical to the input utterances). "
                    "Expected to be >= --min-segment-length.")
parser.add_argument("--frame-length", type=float, default=0.01,
                    help="This only affects rounding of the output times; they will "
                    "be constrained to multiples of this value.")
parser.add_argument("--max-tainted-length", type=float, default=0.05,
                    help="Maximum allowed length of any 'tainted' line.  Note: "
                    "'tainted' lines may only appear at the boundary of a "
                    "segment")

# Limits on silence and non-scored words at segment edges and inside segments.
parser.add_argument("--max-edge-silence-length", type=float, default=0.5,
                    help="Maximum allowed length of silence if it appears at the "
                    "edge of a segment (will be truncated).  This rule is "
                    "relaxed if such truncation would take a segment below "
                    "the --min-segment-length or --min-new-segment-length.")
parser.add_argument("--max-edge-non-scored-length", type=float, default=0.5,
                    help="Maximum allowed length of a non-scored word (noise, cough, etc.) "
                    "if it appears at the edge of a segment (will be truncated). "
                    "This rule is relaxed if such truncation would take a "
                    "segment below the --min-segment-length.")
parser.add_argument("--max-internal-silence-length", type=float, default=2.0,
                    help="Maximum allowed length of silence if it appears inside a segment "
                    "(will cause the segment to be split).")
parser.add_argument("--max-internal-non-scored-length", type=float, default=2.0,
                    help="Maximum allowed length of a non-scored word (noise, etc.) if "
                    "it appears inside a segment (will cause the segment to be "
                    "split).  Note: reference words which are real words but OOV "
                    "are not included in this category.")

# Handling of potentially bad data around segment boundaries.
parser.add_argument("--unk-padding", type=float, default=0.05,
                    help="Amount of padding with <unk> that we do if a segment boundary is "
                    "next to errors (ins, del, sub).  That is, we add this amount of "
                    "time to the segment and add the <unk> word to cover the acoustics. "
                    "If nonzero, the --oov-symbol-file option must be supplied.")
parser.add_argument("--max-junk-proportion", type=float, default=0.1,
                    help="Maximum proportion of the time of the segment that may "
                    "consist of potentially bad data, in which we include 'tainted' lines of "
                    "the ctm-edits input and unk-padding.")
parser.add_argument("--min-split-point-duration", type=float, default=0.1,
                    help="""Minimum duration of silence or non-scored word
                    to be considered a viable split point when
                    truncating based on junk proportion.""")
parser.add_argument("--max-deleted-words-kept-when-merging", type=int, default=1,
                    help="When merging segments that are found to be overlapping or "
                    "adjacent after all other processing, keep in the transcript the "
                    "reference words that were deleted between the segments [if any] "
                    "as long as there were no more than this many reference words. "
                    "Setting this to zero will mean that any reference words that "
                    "were deleted between the segments we're about to reattach will "
                    "not appear in the generated transcript (so we'll match the hyp).")

# Auxiliary input and output files.
parser.add_argument("--oov-symbol-file", type=str, default=None,
                    help="Filename of file such as data/lang/oov.txt which contains "
                    "the text form of the OOV word, normally '<unk>'.  Supplied as "
                    "a file to avoid complications with escaping.  Necessary if "
                    "the --unk-padding option has a nonzero value (which it does "
                    "by default).")
parser.add_argument("--ctm-edits-out", type=str,
                    help="Filename to output an extended version of the ctm-edits format "
                    "with segment start and end points noted.  This file is intended to be "
                    "read by humans; there are currently no scripts that will read it.")
parser.add_argument("--word-stats-out", type=str,
                    help="Filename for output of word-level stats, of the form "
                    "'<word> <bad-proportion> <total-count-in-ref>', e.g. 'hello 0.12 12408', "
                    "where the <bad-proportion> is the proportion of the time that this "
                    "reference word does not make it into a segment.  It can help reveal words "
                    "that have problematic pronunciations or are associated with "
                    "transcription errors.")


# Positional arguments.
parser.add_argument("non_scored_words_in", metavar="<non-scored-words-file>",
                    help="Filename of file containing a list of non-scored words, "
                    "one per line. See steps/cleanup/internal/get_nonscored_words.py.")
parser.add_argument("ctm_edits_in", metavar="<ctm-edits-in>",
                    help="Filename of input ctm-edits file. "
                    "Use /dev/stdin for standard input.")
parser.add_argument("text_out", metavar="<text-out>",
                    help="Filename of output text file (same format as data/train/text, i.e. "
                    "<new-utterance-id> <word1> <word2> ... <wordN>)")
parser.add_argument("segments_out", metavar="<segments-out>",
                    help="Filename of output segments.  This has the same format as data/train/segments, "
                    "but instead of <recording-id>, the second field is the old utterance-id, i.e "
                    "<new-utterance-id> <old-utterance-id> <start-time> <end-time>")

args = parser.parse_args()


def IsTainted(split_line_of_utt):
    # Returns True if this (already split) ctm-edits line carries the
    # optional 'tainted' marker as its 9th field, and False otherwise.
    if len(split_line_of_utt) <= 8:
        return False
    return split_line_of_utt[8] == 'tainted'

# This function returns a list of pairs (start-index, end-index) representing
# the cores of segments (so if a pair is (s, e), then the core of a segment
# would span (s, s+1, ... e-1).
#
# By the 'core of a segment', we mean a sequence of ctm-edits lines including at
# least one 'cor' line and a contiguous sequence of other lines of the type
# 'cor', 'fix' and 'sil' that must be not tainted.  The segment core excludes
# any tainted lines at the edge of a segment, which will be added later.
#
# We only initiate segments when it contains something correct and not realized
# as unk (i.e. ref==hyp); and we extend it with anything that is 'sil' or 'fix'
# or 'cor' that is not tainted.  Contiguous regions of 'true' in the resulting
# boolean array will then become the cores of prototype segments, and we'll add
# any adjacent tainted words (or parts of them).
def ComputeSegmentCores(split_lines_of_utt):
    # Takes the split ctm-edits lines of one utterance and returns a list of
    # (start-index, end-index) pairs, end-index being one past the last line,
    # one pair per maximal run of lines that belong to a segment core.
    num_lines = len(split_lines_of_utt)

    # Seed the cores: a line starts out in a core if its edit type (field 7)
    # is 'cor' and fields 4 and 6 are identical.
    in_core = [fields[7] == 'cor' and fields[4] == fields[6]
               for fields in split_lines_of_utt]

    def can_join_core(fields):
        # A line can be absorbed into a neighboring core if it is not tainted
        # and its edit type is one of 'cor', 'sil' or 'fix'.
        edit_type = fields[7]
        return (not IsTainted(fields)) and edit_type in ('cor', 'sil', 'fix')

    # Grow every core forwards as far as possible...
    for i in range(1, num_lines):
        if in_core[i - 1] and not in_core[i] and \
           can_join_core(split_lines_of_utt[i]):
            in_core[i] = True

    # ... and then backwards as far as possible.
    for i in range(num_lines - 2, -1, -1):
        if in_core[i + 1] and not in_core[i] and \
           can_join_core(split_lines_of_utt[i]):
            in_core[i] = True

    # Turn the runs of True values into half-open index ranges.
    segment_ranges = []
    run_start = None
    for i, flag in enumerate(in_core):
        if flag:
            if run_start is None:
                run_start = i
        elif run_start is not None:
            segment_ranges.append((run_start, i))
            run_start = None
    if run_start is not None:
        # The last run extends to the end of the utterance.
        segment_ranges.append((run_start, num_lines))

    return segment_ranges

class Segment(object):
    def __init__(self, split_lines_of_utt, start_index, end_index, debug_str = None):
        self.split_lines_of_utt = split_lines_of_utt
        # start_index is the index of the first line that appears in this
        # segment, and end_index is one past the last line.  This does not
        # include unk-padding.
        self.start_index = start_index
        self.end_index = end_index
        # If the following values are nonzero, then when we create the segment
        # we will add <unk> at the start and end of the segment [representing
        # partial words], with this amount of additional audio.
        self.start_unk_padding = 0.0
        self.end_unk_padding = 0.0

        # debug_str keeps track of the 'core' of the segment.
        if debug_str == None:
            debug_str = 'core-start={0},core-end={1}'.format(start_index,end_index)
        self.debug_str = debug_str

        # This gives the proportion of the time of the first line in the segment
        # that we keep.  Usually 1.0 but may be less if we've trimmed away some
        # proportion of the time.
        self.start_keep_proportion = 1.0
        # This gives the proportion of the time of the last line in the segment
        # that we keep.  Usually 1.0 but may be less if we've trimmed away some
        # proportion of the time.
        self.end_keep_proportion = 1.0

    # This is stage 1 of segment processing (after creating the boundaries of the
    # core of the segment, which is done outside of this class).
    #
    # This function may reduce start_index and/or increase end_index by
    # including a single adjacent 'tainted' line from the ctm-edits file.  This
    # is only done if the lines at the boundaries of the segment are currently
    # real non-silence words and not non-scored words.  The idea is that we
    # probably don't want to start or end the segment right at the boundary of a
    # real word, we want to add some kind of padding.
    def PossiblyAddTaintedLines(self):
        global non_scored_words
        split_lines_of_utt = self.split_lines_of_utt
        # we're iterating over the segment (start, end)
        for b in [False, True]:
            if b:
                boundary_index = self.end_index - 1
                adjacent_index = self.end_index
            else:
                boundary_index = self.start_index
                adjacent_index = self.start_index - 1
            if adjacent_index >= 0 and adjacent_index < len(split_lines_of_utt):
                # only consider merging the adjacent word into the segment if we're not
                # at a segment boundary.
                adjacent_line_is_tainted = IsTainted(split_lines_of_utt[adjacent_index])
                # if the adjacent line wasn't tainted, then there must have been
                # another stronger reason why we didn't include it in the core
                # of the segment (probably that it was an ins, del or sub), so
                # there is no point considering it.
                if adjacent_line_is_tainted:
                    boundary_edit_type = split_lines_of_utt[boundary_index][7]
                    boundary_hyp_word = split_lines_of_utt[boundary_index][7]
                    # we only add the tainted line to the segment if the word at
                    # the boundary was a non-silence word that was correctly
                    # decoded and not fixed [see modify_ctm_edits.py.]
                    if boundary_edit_type == 'cor' and \
                       not boundary_hyp_word in non_scored_words:
                        # Add the adjacent tainted line to the segment.
                        if b:
                            self.end_index += 1
                        else:
                            self.start_index -= 1

    # This is stage 2 of segment processing.
    # This function will split a segment into multiple pieces if any of the
    # internal [non-boundary] silences or non-scored words are longer
    # than the allowed values --max-internal-silence-length and
    # --max-internal-non-scored-length.  This function returns a
    # list of segments.  In the normal case (where there is no splitting)
    # it just returns an array with a single element 'self'.
    def PossiblySplitSegment(self):
        global non_scored_words, args
        # make sure the segment hasn't been processed more than we expect.
        assert self.start_unk_padding == 0.0 and self.end_unk_padding == 0.0 and \
              self.start_keep_proportion == 1.0 and self.end_keep_proportion == 1.0
        segments = []  # the answer
        cur_start_index = self.start_index
        cur_start_is_split = False
        # only consider splitting at non-boundary lines.  [we'd just truncate
        # the boundary lines.]
        for index_to_split_at in range(cur_start_index + 1, self.end_index - 1):
            this_split_line = self.split_lines_of_utt[index_to_split_at]
            this_duration = float(this_split_line[3])
            this_edit_type = this_split_line[7]
            this_ref_word = this_split_line[6]
            if (this_edit_type == 'sil' and this_duration > args.max_internal_silence_length) or \
               (this_ref_word in non_scored_words and this_duration > args.max_internal_non_scored_length):
                # We split this segment at this index, dividing the word in two
                # [later on, in PossiblyTruncateBoundaries, it may be further
                # truncated.]
                # Note: we use 'index_to_split_at + 1' because the Segment constructor
                # takes an 'end-index' which is interpreted as one past the end.
                new_segment = Segment(self.split_lines_of_utt, cur_start_index,
                                      index_to_split_at + 1, self.debug_str)
                if cur_start_is_split:
                    new_segment.start_keep_proportion = 0.5
                new_segment.end_keep_proportion = 0.5
                cur_start_is_split = True
                cur_start_index = index_to_split_at
                segments.append(new_segment)
        if len(segments) == 0:  # We did not split.
            segments.append(self)
        else:
            # We did split.  Add the very last segment.
            new_segment = Segment(self.split_lines_of_utt, cur_start_index,
                                  self.end_index, self.debug_str)
            assert cur_start_is_split
            new_segment.start_keep_proportion = 0.5
            segments.append(new_segment)
        return segments


    # This is stage 3 of segment processing.  It will truncate the silences and
    # non-scored words at the segment boundaries if they are longer than the
    # --max-edge-silence-length and --max-edge-non-scored-length respectively
    # (and to the extent that this wouldn't take us below the
    # --min-segment-length or --min-new-segment-length).
    def PossiblyTruncateBoundaries(self):
        # Looks at the first and then the last line of the segment.  If that
        # line is a silence longer than args.max_edge_silence_length, or its
        # ref word is a non-scored word and it is longer than
        # args.max_edge_non_scored_length, the corresponding keep-proportion
        # is set so that only the maximum allowed duration is kept.
        for at_start in (True, False):
            edge_index = self.start_index if at_start else self.end_index - 1
            edge_line = self.split_lines_of_utt[edge_index]
            edge_duration = float(edge_line[3])
            edge_edit = edge_line[7]
            edge_ref_word = edge_line[6]

            if edge_edit == 'sil' and \
               edge_duration > args.max_edge_silence_length:
                allowed_duration = args.max_edge_silence_length
            elif edge_ref_word in non_scored_words and \
                 edge_duration > args.max_edge_non_scored_length:
                allowed_duration = args.max_edge_non_scored_length
            else:
                # This edge is short enough; leave it untouched.
                continue

            keep_proportion = allowed_duration / edge_duration
            if at_start:
                self.start_keep_proportion = keep_proportion
            else:
                self.end_keep_proportion = keep_proportion

    # This relaxes the segment-boundary truncation of
    # PossiblyTruncateBoundaries(), if it would take us below
    # min-new-segment-length or min-segment-length.  Note: this does not relax
    # the boundary truncation for a particular boundary (start or end) if that
    # boundary corresponds to a 'tainted' line of the ctm (because it's
    # dangerous to include too much 'tainted' audio).
    def RelaxBoundaryTruncation(self):
        # Modifies self.start_keep_proportion / self.end_keep_proportion in
        # place and returns nothing.  self.Length() is re-evaluated after each
        # change of the keep-proportions, so the order of the statements
        # below matters.
        # this should be called before adding unk padding.
        assert self.start_unk_padding == self.end_unk_padding == 0.0
        if self.start_keep_proportion == self.end_keep_proportion == 1.0:
            return  # nothing to do there was no truncation.
        # The stricter (larger) of the two minimum lengths is the target.
        length_cutoff = max(args.min_new_segment_length, args.min_segment_length)
        length_with_truncation = self.Length()
        if length_with_truncation >= length_cutoff:
            return  # Nothing to do.
        # Remember the truncated values; they are needed for the
        # interpolation further down.
        orig_start_keep_proportion = self.start_keep_proportion
        orig_end_keep_proportion = self.end_keep_proportion
        # Tentatively undo the truncation completely, except at a boundary
        # whose line is tainted.
        if not IsTainted(self.split_lines_of_utt[self.start_index]):
            self.start_keep_proportion = 1.0
        if not IsTainted(self.split_lines_of_utt[self.end_index - 1]):
            self.end_keep_proportion = 1.0
        length_with_relaxed_boundaries = self.Length()
        if length_with_relaxed_boundaries <= length_cutoff:
            # Completely undo the truncation [to the extent allowed by the
            # presence of tainted lines at the start/end] if, even without
            # truncation, we'd be below the length cutoff.  This segment may be
            # removed later on (but it may not, if removing truncation makes us
            # identical to the input utterance, and the length is between
            # min_segment_length min_new_segment_length).
            return
        # Next, compute an interpolation constant a such that the
        # {start,end}_keep_proportion values will equal a *
        # [values-computed-by-PossiblyTruncateBoundaries()] + (1-a) * [completely-relaxed-values].
        # we're solving the equation:
        # length_cutoff = a * length_with_truncation + (1-a) * length_with_relaxed_boundaries
        # -> length_cutoff - length_with_relaxed_boundaries =
        #        a * (length_with_truncation - length_with_relaxed_boundaries)
        # -> a = (length_cutoff - length_with_relaxed_boundaries) / (length_with_truncation - length_with_relaxed_boundaries)
        # At this point length_with_truncation < length_cutoff <
        # length_with_relaxed_boundaries, so the denominator is nonzero.
        a = (length_cutoff - length_with_relaxed_boundaries) / \
            (length_with_truncation - length_with_relaxed_boundaries)
        if a < 0.0 or a > 1.0:
            # Unexpected; warn and keep the fully relaxed values set above.
            print("segment_ctm_edits.py: bad 'a' value = {0}".format(a), file = sys.stderr)
            return
        self.start_keep_proportion = \
           a * orig_start_keep_proportion + (1-a) * self.start_keep_proportion
        self.end_keep_proportion = \
           a * orig_end_keep_proportion + (1-a) * self.end_keep_proportion
        # Sanity check: the interpolated length should be within 0.01 of the
        # cutoff; otherwise only a warning is printed.
        if not abs(self.Length() - length_cutoff) < 0.01:
            print("segment_ctm_edits.py: possible problem relaxing boundary "
                  "truncation, length is {0} vs {1}".format(self.Length(), length_cutoff),
                  file = sys.stderr)


    # This is stage 4 of segment processing.
    # This function may set start_unk_padding and end_unk_padding to nonzero
    # values.  This is done if the current boundary words are real, scored
    # words and we're not next to the beginning or end of the utterance.
    def PossiblyAddUnkPadding(self):
        for b in [True, False]:
            if b:
                this_index = self.start_index
            else:
                this_index = self.end_index - 1
            this_split_line = self.split_lines_of_utt[this_index]
            this_start_time = float(this_split_line[2])
            this_ref_word = this_split_line[6]
            this_edit = this_split_line[7]
            if this_edit == 'cor' and not this_ref_word in non_scored_words:
                # we can consider adding unk-padding.
                if b: # start of utterance.
                    unk_padding = args.unk_padding
                    if unk_padding > this_start_time:  # close to beginning of file
                        unk_padding = this_start_time
                    # If we could add less than half of the specified
                    # unk-padding, don't add any (because when we add
                    # unk-padding we add the unknown-word symbol '<unk>', and if
                    # there isn't enough space to traverse the HMM we don't want
                    # to do it at all.
                    if unk_padding < 0.5 * args.unk_padding:
                        unk_padding = 0.0
                    self.start_unk_padding = unk_padding
                else: # end of utterance.
                    this_end_time = this_start_time + float(this_split_line[3])
                    last_line = self.split_lines_of_utt[-1]
                    utterance_end_time = float(last_line[2]) + float(last_line[3])
                    max_allowable_padding = utterance_end_time - this_end_time
                    assert max_allowable_padding > -0.01
                    unk_padding = args.unk_padding
                    if unk_padding > max_allowable_padding:
                        unk_padding = max_allowable_padding
                    # If we could add less than half of the specified
                    # unk-padding, don't add any (because when we add
                    # unk-padding we add the unknown-word symbol '<unk>', and if
                    # there isn't enough space to traverse the HMM we don't want
                    # to do it at all.
                    if unk_padding < 0.5 * args.unk_padding:
                        unk_padding = 0.0
                    self.end_unk_padding = unk_padding

    # This function will merge the segment in 'other' with the segment
    # in 'self'.  It is only to be called when 'self' and 'other' are from
    # the same utterance, 'other' is after 'self' in time order (based on
    # the original segment cores), and self.EndTime() >= other.StartTime().
    # Note: in this situation there will normally be deleted words
    # between the two segments.  What this program does with the deleted
    # words depends on '--max-deleted-words-kept-when-merging'.  If there
    # were any inserted words in the transcript (less likely), this
    # program will keep the reference.
    def MergeWithSegment(self, other):
        """Absorb the later segment 'other' into this segment, in place.

        Both segments must share the same split_lines_of_utt list and must
        overlap or touch in time.  Everything describing the end of this
        segment is taken over from 'other'.  If the lines between (or shared
        by) the two segments contain more deletions than
        args.max_deleted_words_kept_when_merging, each of those deleted lines
        gets a trailing 'do-not-include-in-text' field, which Text() honours.
        """
        assert self.EndTime() >= other.StartTime()
        assert self.StartTime() < other.EndTime()
        assert self.split_lines_of_utt is other.split_lines_of_utt

        previous_end_index = self.end_index
        self.debug_str = "({0}/merged-with/{1})".format(self.debug_str, other.debug_str)

        # The right-hand side of the merged segment is that of 'other'.
        self.end_index = other.end_index
        self.end_unk_padding = other.end_unk_padding
        self.end_keep_proportion = other.end_keep_proportion

        # Lines lying between the two segments, or shared by both (the latter
        # would only happen for tainted silence or non-scored-word lines).
        # Substitutions and insertions keep the reference (see Text()); here
        # we only decide whether deletions get marked for exclusion.
        boundary_a = previous_end_index - 1
        boundary_b = other.start_index
        lines = self.split_lines_of_utt
        deleted_indexes = [
            i for i in range(min(boundary_a, boundary_b),
                             max(boundary_a, boundary_b) + 1)
            if lines[i][7] == 'del'
        ]
        if len(deleted_indexes) > args.max_deleted_words_kept_when_merging:
            for i in deleted_indexes:
                lines[i].append('do-not-include-in-text')

    # Returns the start time of the utterance (within the enclosing utterance)
    # This is before any rounding.
    def StartTime(self):
        first_line = self.split_lines_of_utt[self.start_index]
        first_line_start = float(first_line[2])
        first_line_duration = float(first_line[3])
        first_line_end = first_line_start + first_line_duration
        return first_line_end - self.start_unk_padding \
              - (first_line_duration * self.start_keep_proportion)

    # Returns some string-valued information about 'this' that is useful for debugging.
    def DebugInfo(self):
        return 'start=%d,end=%d,unk-padding=%.2f,%.2f,keep-proportion=%.2f,%.2f,' % \
            (self.start_index, self.end_index, self.start_unk_padding,
             self.end_unk_padding, self.start_keep_proportion, self.end_keep_proportion) + \
         self.debug_str

    # Returns the end time of the segment (within the enclosing utterance).
    # This is before any rounding.
    def EndTime(self):
        last_line = self.split_lines_of_utt[self.end_index - 1]
        last_line_start = float(last_line[2])
        last_line_duration = float(last_line[3])
        return last_line_start + (last_line_duration * self.end_keep_proportion) \
             + self.end_unk_padding

    # Returns the segment length in seconds.
    def Length(self):
        return self.EndTime() - self.StartTime()

    def IsWholeUtterance(self):
        # returns true if this segment corresponds to the whole utterance that
        # it's a part of (i.e. its start/end time are zero and the end-time of
        # the last segment.
        last_line_of_utt = self.split_lines_of_utt[-1]
        last_line_end_time = float(last_line_of_utt[2]) + float(last_line_of_utt[3])
        return abs(self.StartTime() - 0.0) < 0.001 and \
               abs(self.EndTime() - last_line_end_time) < 0.001

    # Returns the proportion of the duration of this segment that consists of
    # unk-padding and tainted lines of input (will be between 0.0 and 1.0).
    def JunkProportion(self):
        """Return the fraction of this segment's duration that is junk.

        Junk is the unk-padding at both ends plus the kept part of the first
        and/or last line when that line is tainted.  Only the boundary lines
        are examined here; the original author notes that a tainted boundary
        implies the segment has other lines, so nothing is counted twice.
        """
        junk = self.start_unk_padding + self.end_unk_padding

        head_fields = self.split_lines_of_utt[self.start_index]
        if IsTainted(head_fields):
            junk += float(head_fields[3]) * self.start_keep_proportion

        tail_fields = self.split_lines_of_utt[self.end_index - 1]
        if IsTainted(tail_fields):
            junk += float(tail_fields[3]) * self.end_keep_proportion

        return junk / self.Length()

    # This function will remove something from the beginning of the
    # segment if it's possible to cleanly lop off a bit whose junk, as a
    # proportion of its length, is at least 'args.max_junk_proportion'.
    # Junk is defined as unk-padding and/or tainted segments.
    # It considers as a potential split point, the first silence
    # segment or non-tainted non-scored-word segment in the
    # utterance.  See also PossiblyTruncateEndForJunkProportion().
    def PossiblyTruncateStartForJunkProportion(self):
        """Drop a junk-heavy piece from the start of this segment, if possible.

        The only split point considered is the first interior line that is a
        silence or a correctly recognized non-scored word and is longer than
        args.min_split_point_duration.  The piece before it is removed when
        the junk at the start (unk-padding plus the kept part of a tainted
        first line), divided by that piece's duration, is not below
        args.max_junk_proportion.
        """
        leading_junk = self.start_unk_padding
        head_fields = self.split_lines_of_utt[self.start_index]
        if IsTainted(head_fields):
            leading_junk += float(head_fields[3]) * self.start_keep_proportion
        if leading_junk == 0.0:
            return  # no junk at the start, so nothing to trim.

        # Scan the interior lines (both boundary lines excluded), left to
        # right, and stop at the first acceptable split point.
        split_index = None
        for idx in range(self.start_index + 1, self.end_index - 1):
            fields = self.split_lines_of_utt[idx]
            edit_type = fields[7]
            ref_word = fields[6]
            splittable = (edit_type == 'sil'
                          or (edit_type == 'cor' and ref_word in non_scored_words))
            if splittable and float(fields[3]) > args.min_split_point_duration:
                split_index = idx
                split_time = float(fields[2])
                break
        if split_index is None:
            return  # there is no place to split.

        removed_duration = split_time - self.StartTime()
        if float(leading_junk) / removed_duration < args.max_junk_proportion:
            return  # the piece we'd remove does not contain enough junk.

        # The split-point line becomes the new, untruncated left boundary.
        self.start_index = split_index
        self.start_unk_padding = 0.0
        self.start_keep_proportion = 1.0
        self.debug_str += ',truncated-start-for-junk'

    # This is like PossiblyTruncateStartForJunkProportion(), but
    # acts on the end of the segment; see comments there.
    def PossiblyTruncateEndForJunkProportion(self):
        """Drop a junk-heavy piece from the end of this segment, if possible.

        The only split point considered is the last interior line that is a
        silence or a correctly recognized non-scored word and is longer than
        args.min_split_point_duration.  The piece after it is removed when the
        junk at the end (unk-padding plus the kept part of a tainted last
        line), divided by that piece's duration, is not below
        args.max_junk_proportion.
        """
        trailing_junk = self.end_unk_padding
        tail_fields = self.split_lines_of_utt[self.end_index - 1]
        if IsTainted(tail_fields):
            trailing_junk += float(tail_fields[3]) * self.end_keep_proportion
        if trailing_junk == 0.0:
            return  # no junk at the end, so nothing to trim.

        # Scan the interior lines (both boundary lines excluded), right to
        # left, and stop at the first acceptable split point.
        new_end_index = None
        for idx in range(self.end_index - 2, self.start_index, -1):
            fields = self.split_lines_of_utt[idx]
            edit_type = fields[7]
            ref_word = fields[6]
            splittable = (edit_type == 'sil'
                          or (edit_type == 'cor' and ref_word in non_scored_words))
            if splittable and float(fields[3]) > args.min_split_point_duration:
                new_end_index = idx + 1  # end indexes are one past the last line.
                split_time = float(fields[2]) + float(fields[3])
                break
        if new_end_index is None:
            return  # there is no place to split.

        removed_duration = self.EndTime() - split_time
        if float(trailing_junk) / removed_duration < args.max_junk_proportion:
            return  # the piece we'd remove does not contain enough junk.

        # The split-point line becomes the new, untruncated right boundary.
        self.end_index = new_end_index
        self.end_unk_padding = 0.0
        self.end_keep_proportion = 1.0
        self.debug_str += ',truncated-end-for-junk'


    # this will return true if there is at least one word in the utterance
    # that's a scored word (not a non-scored word) and not an OOV word that's
    # realized as unk.  This becomes a filter on keeping segments.
    def ContainsAtLeastOneScoredNonOovWord(self):
        global non_scored_words
        for i in range(self.start_index, self.end_index):
            this_split_line = self.split_lines_of_utt[i]
            this_hyp_word = this_split_line[4]
            this_ref_word = this_split_line[6]
            this_edit = this_split_line[7]
            if this_edit == 'cor' and not this_ref_word in non_scored_words \
               and this_ref_word == this_hyp_word:
                return True
        return False

    # Returns the text corresponding to this utterance, as a string.
    def Text(self):
        global oov_symbol
        text_array = []
        if self.start_unk_padding != 0.0:
            text_array.append(oov_symbol)
        for i in range(self.start_index, self.end_index):
            this_split_line = self.split_lines_of_utt[i]
            this_edit = this_split_line[7]
            this_ref_word = this_split_line[6]
            if this_ref_word != '<eps>' and this_split_line[-1] != 'do-not-include-in-text':
                text_array.append(this_ref_word)
        if self.end_unk_padding != 0.0:
            text_array.append(oov_symbol)
        return ' '.join(text_array)


# Here, 'text' will be something that indicates the stage of processing,
# e.g. 'stage  0 [segment cores]', 'stage  1 [add tainted lines]', etc.
def AccumulateSegmentStats(segment_list, text):
    """Add the segments in segment_list to the running stats for stage 'text'.

    For every segment this bumps num_segments[text] by one and adds the
    segment's length to segment_total_length[text].  The dictionaries are
    only touched per segment, so an empty list leaves them unchanged.
    """
    for seg in segment_list:
        num_segments[text] += 1
        segment_total_length[text] += seg.Length()

def PrintSegmentStats():
    """Print the utterance summary and the per-stage segment stats to stderr.

    Stages are listed in sorted order of their labels.  Every stage after the
    first also shows, in brackets, the change in total segment length relative
    to the previous stage, as a percentage of the original data length.
    """
    print('Number of utterances is %d, of which %.2f%% had no segments after '
          'all processing; total length of data in original utterances (in seconds) '
          'was %d' % (num_utterances,
                      num_utterances_without_segments * 100.0 / num_utterances,
                      total_length_of_utterances),
          file = sys.stderr)

    previous_stage = None
    for stage in sorted(segment_total_length.keys()):
        if previous_stage is None:
            delta_percentage = ''
        else:
            growth = segment_total_length[stage] - segment_total_length[previous_stage]
            delta_percentage = '[%+.2f%%]' % (growth * 100.0 / total_length_of_utterances)
        print('At %s, num-segments is %d, total length %.2f%% of original total %s' % (
                stage, num_segments[stage],
                segment_total_length[stage] * 100.0 / total_length_of_utterances,
                delta_percentage),
              file = sys.stderr)
        previous_stage = stage

# This function creates the segments for an utterance as a list
# of class Segment.
# It returns a 2-tuple (list-of-segments, list-of-deleted-segments)
# where the deleted segments are only useful for diagnostic printing.
# Note: split_lines_of_utt is a list of lists, one per line, each containing the
# sequence of fields.
def GetSegmentsForUtterance(split_lines_of_utt):
    """Create the segments for one utterance.

    split_lines_of_utt is a list of lists, one per ctm-edits line, each
    holding that line's fields.  Returns the 2-tuple
    (list-of-segments, list-of-deleted-segments); the deleted segments are
    only useful for diagnostic printing.  Along the way the module-level
    utterance counters and the per-stage segment stats are updated.
    """
    global num_utterances, num_utterances_without_segments, total_length_of_utterances

    num_utterances += 1

    core_ranges = ComputeSegmentCores(split_lines_of_utt)

    final_fields = split_lines_of_utt[-1]
    total_length_of_utterances += float(final_fields[2]) + float(final_fields[3])

    segments = [ Segment(split_lines_of_utt, core[0], core[1])
                 for core in core_ranges ]
    AccumulateSegmentStats(segments, 'stage  0 [segment cores]')

    for seg in segments:
        seg.PossiblyAddTaintedLines()
    AccumulateSegmentStats(segments, 'stage  1 [add tainted lines]')

    segments = [ piece for seg in segments for piece in seg.PossiblySplitSegment() ]
    AccumulateSegmentStats(segments, 'stage  2 [split segments]')

    for seg in segments:
        seg.PossiblyTruncateBoundaries()
    AccumulateSegmentStats(segments, 'stage  3 [truncate boundaries]')

    for seg in segments:
        seg.RelaxBoundaryTruncation()
    AccumulateSegmentStats(segments, 'stage  4 [relax boundary truncation]')

    for seg in segments:
        seg.PossiblyAddUnkPadding()
    AccumulateSegmentStats(segments, 'stage  5 [unk-padding]')

    # Everything rejected by the filters below is collected here, in the
    # order in which it was rejected.
    deleted_segments = []

    # Filter: newly created segments (i.e. not the whole utterance) that are
    # too short.  The 0.999 allows for roundoff error.
    survivors = []
    for seg in segments:
        too_short = (not seg.IsWholeUtterance()
                     and seg.Length() < 0.999 * args.min_new_segment_length)
        if too_short:
            seg.debug_str += '[deleted-because-of--min-new-segment-length]'
            deleted_segments.append(seg)
        else:
            survivors.append(seg)
    segments = survivors
    AccumulateSegmentStats(segments, 'stage  6 [remove new segments under --min-new-segment-length')

    # Filter: any segment that is too short.  Again 0.999 is for roundoff.
    survivors = []
    for seg in segments:
        if seg.Length() < 0.999 * args.min_segment_length:
            seg.debug_str += '[deleted-because-of--min-segment-length]'
            deleted_segments.append(seg)
        else:
            survivors.append(seg)
    segments = survivors
    AccumulateSegmentStats(segments, 'stage  7 [remove segments under --min-segment-length')

    for seg in segments:
        seg.PossiblyTruncateStartForJunkProportion()
    AccumulateSegmentStats(segments, 'stage  8 [truncate segment-starts for --max-junk-proportion')

    for seg in segments:
        seg.PossiblyTruncateEndForJunkProportion()
    AccumulateSegmentStats(segments, 'stage  9 [truncate segment-ends for --max-junk-proportion')

    # Filter: segments with no scored, non-OOV word at all.
    survivors = []
    for seg in segments:
        if seg.ContainsAtLeastOneScoredNonOovWord():
            survivors.append(seg)
        else:
            seg.debug_str += '[deleted-because-no-scored-non-oov-words]'
            deleted_segments.append(seg)
    segments = survivors
    AccumulateSegmentStats(segments, 'stage 10 [remove segments without scored,non-OOV words]')

    # Filter: segments with too large a proportion of junk.
    survivors = []
    for seg in segments:
        junk = seg.JunkProportion()
        if junk <= args.max_junk_proportion:
            survivors.append(seg)
        else:
            seg.debug_str += '[deleted-because-junk-proportion={0}]'.format(junk)
            deleted_segments.append(seg)
    segments = survivors
    AccumulateSegmentStats(segments, 'stage 11 [remove segments with junk exceeding --max-junk-proportion]')

    # Fold each segment into its predecessor when the two overlap or touch.
    merged = []
    for seg in segments:
        if merged and merged[-1].EndTime() >= seg.StartTime():
            merged[-1].MergeWithSegment(seg)
        else:
            merged.append(seg)
    segments = merged
    AccumulateSegmentStats(segments, 'stage 12 [merge overlapping or touching segments]')

    for earlier, later in zip(segments, segments[1:]):
        if earlier.EndTime() > later.StartTime():
            # this just adds something to --ctm-edits-out output
            later.debug_str += ",overlaps-previous-segment"

    if not segments:
        num_utterances_without_segments += 1

    return (segments, deleted_segments)

# this prints a number with a certain number of digits after
# the point, while removing trailing zeros.
def FloatToString(f):
    """Format f with '%g', keeping 6 digits after the decimal point.

    '%g' counts significant digits, so the precision starts at 6 and grows
    by one for every factor of ten by which |f| exceeds 1.0.  '%g' also
    removes trailing zeros.
    """
    precision = 6  # digits wanted after the decimal point
    scaled = f
    while abs(scaled) > 1.0:
        scaled *= 0.1
        precision += 1
    return ('%.' + str(precision) + 'g') % f

# Gives time in string form as an exact multiple of the frame-length, e.g. 0.01
# (after rounding).
def TimeToString(time, frame_length):
    """Return 'time' rounded to a whole number of frames, as a string.

    The rounded frame count must be non-negative.  The product is printed via
    FloatToString(), which drops trailing zeros, so that e.g. 0.01 appears as
    0.01 rather than 0.0099999999999999, without assuming that frame_length
    is of the form 10^-n.
    """
    frame_count = round(time / frame_length)
    assert frame_count >= 0
    rounded_time = frame_count * frame_length
    return FloatToString(rounded_time)

def WriteSegmentsForUtterance(text_output_handle, segments_output_handle,
                              old_utterance_name, segments):
    """Write one 'text' line and one 'segments' line per segment.

    New utterance ids are the old id plus a 1-based index, zero-padded to the
    width of len(segments), e.g. foo-bar-1, foo-bar-2.  Output lines look like
        text:      foo-bar-1 hello this is dan
        segments:  foo-bar-1 foo-bar 5.1 7.2
    with times rounded to multiples of args.frame_length.
    """
    index_width = len(str(len(segments)))
    for position, seg in enumerate(segments, start=1):
        new_utterance_name = "{old}-{index:0{width}}".format(
                                 old=old_utterance_name, index=position,
                                 width=index_width)
        # <new-utterance-id> <text>
        print(new_utterance_name, seg.Text(), file = text_output_handle)
        # <new-utterance-id> <old-utterance-id> <start-time> <end-time>
        start_string = TimeToString(seg.StartTime(), args.frame_length)
        end_string = TimeToString(seg.EndTime(), args.frame_length)
        print(new_utterance_name, old_utterance_name, start_string, end_string,
              file = segments_output_handle)


# Note, this is destructive of 'segments_for_utterance' (it modifies the
# utterance-id field of the split lines in place), but it won't matter.
def PrintDebugInfoForUtterance(ctm_edits_out_handle,
                               split_lines_of_cur_utterance,
                               segments_for_utterance,
                               deleted_segments_for_utterance):
    """Print the utterance's ctm-edits lines annotated with segment boundaries.

    Every line gets an index like [0], [1] appended to its utterance-id (this
    modifies the input lines in place).  Start/end markers of kept and deleted
    segments are appended, as extra fields, to the first line whose end time
    is not before the marker's time.
    """
    # 'events' holds 2-tuples (time, label) for every segment start and end.
    events = []
    for number, segment in enumerate(segments_for_utterance, start=1):
        start_label = 'start-segment-{0}[{1}]'.format(number, segment.DebugInfo())
        events.append( (segment.StartTime(), start_label) )
        events.append( (segment.EndTime(), 'end-segment-{}'.format(number)) )
    # Deleted segments get the same treatment under a different label.
    for number, segment in enumerate(deleted_segments_for_utterance, start=1):
        start_label = 'start-deleted-segment-{0}[{1}]'.format(number, segment.DebugInfo())
        events.append( (segment.StartTime(), start_label) )
        events.append( (segment.EndTime(), 'end-deleted-segment-{}'.format(number)) )

    events.sort()
    next_event = 0  # index of the first event that was not printed yet.

    for line_index, split_line in enumerate(split_lines_of_cur_utterance):
        # add an index like [0], [1], to the utterance-id so we can easily
        # look up segment indexes.
        split_line[0] += '[{}]'.format(line_index)
        start_time = float(split_line[2])
        end_time = start_time + float(split_line[3])
        output_fields = list(split_line)
        while next_event < len(events) and events[next_event][0] <= end_time:
            (event_time, label) = events[next_event]
            next_event += 1
            # add a field like 'start-segment1[...]=3.21' to this line.
            output_fields.append(label + "=" + TimeToString(event_time, args.frame_length))
        print(' '.join(output_fields), file = ctm_edits_out_handle)

# This accumulates word-level stats about, for each reference word, with what
# probability it will end up in the core of a segment.  Words with low
# probabilities of being in segments will generally be associated with some kind
# of error (there is a higher probability of having a wrong lexicon entry).
def AccWordStatsForUtterance(split_lines_of_utt,
                             segments_for_utterance):
    """Update word_count_pair with the reference words of one utterance.

    word_count_pair maps a word to [total-count, count-not-within-segments].
    Every reference word other than '<eps>' adds one to the total count, and
    one more to the second count when its line lies in none of the segments'
    [start_index, end_index) ranges.
    """
    global word_count_pair
    covered = [ False ] * len(split_lines_of_utt)
    for segment in segments_for_utterance:
        for line_index in range(segment.start_index, segment.end_index):
            covered[line_index] = True
    for fields, inside_segment in zip(split_lines_of_utt, covered):
        ref_word = fields[6]
        if ref_word == '<eps>':
            continue
        word_count_pair[ref_word][0] += 1
        if not inside_segment:
            word_count_pair[ref_word][1] += 1

def PrintWordStats(word_stats_out):
    """Write the word-level exclusion statistics to the file word_stats_out.

    Each output line is '<word> <proportion-of-time-excluded> <total-count>',
    taken from the module-level word_count_pair.  Exits via sys.exit() with a
    message if the file cannot be opened, written, or closed.
    """
    global word_count_pair
    try:
        f = open(word_stats_out, 'w', encoding='utf-8')
    except Exception:
        sys.exit("segment_ctm_edits.py: error opening word-stats file --word-stats-out={0} "
                 "for writing".format(word_stats_out))
    # Sort from most to least problematic.  We want to give more prominence to
    # words that are most frequently not in segments, but also to high-count
    # words.  Define badness = pair[1] / pair[0], and total_count = pair[0],
    # where 'pair' is a value of word_count_pair.  We'll reverse sort on
    # badness^3 * total_count = pair[1]^3 / pair[0]^2.
    ranked = sorted(word_count_pair.items(),
                    key = lambda item: (item[1][1] ** 3) * 1.0 / (item[1][0] ** 2),
                    reverse = True)
    try:
        # 'with' closes the file even if a write fails, so a full disk is
        # reported through the message below instead of a raw traceback.
        with f:
            for key, pair in ranked:
                badness = pair[1] * 1.0 / pair[0]
                total_count = pair[0]
                print(key, badness, total_count, file = f)
    except OSError:
        sys.exit("segment_ctm_edits.py: error closing file --word-stats-out={0} "
                 "(full disk?)".format(word_stats_out))
    print("segment_ctm_edits.py: please see the file {0} for word-level statistics "
          "saying how frequently each word was excluded for a segment; format is "
          "<word> <proportion-of-time-excluded> <total-count>.  Particularly "
          "problematic words appear near the top of the file.".format(word_stats_out),
          file = sys.stderr)


def ProcessData():
    """Read the ctm-edits input, segment every utterance and write the outputs.

    Input lines are grouped by utterance-id (their first field).  Each
    completed group is passed to GetSegmentsForUtterance(); the results are
    fed to AccWordStatsForUtterance() and WriteSegmentsForUtterance(), and,
    when --ctm-edits-out was given, to PrintDebugInfoForUtterance().
    Exits via sys.exit() if a file cannot be opened or closed, or if the
    input is empty or contains an empty or whitespace-only line.
    """
    try:
        f_in = open(args.ctm_edits_in, encoding='utf-8')
    except Exception:
        sys.exit("segment_ctm_edits.py: error opening ctm-edits input "
                 "file {0}".format(args.ctm_edits_in))
    try:
        text_output_handle = open(args.text_out, 'w', encoding='utf-8')
    except Exception:
        sys.exit("segment_ctm_edits.py: error opening text output "
                 "file {0}".format(args.text_out))
    try:
        segments_output_handle = open(args.segments_out, 'w', encoding='utf-8')
    except Exception:
        # Report the file that actually failed (--segments-out).
        sys.exit("segment_ctm_edits.py: error opening segments output "
                 "file {0}".format(args.segments_out))
    ctm_edits_output_handle = None
    if args.ctm_edits_out is not None:
        try:
            ctm_edits_output_handle = open(args.ctm_edits_out, 'w', encoding='utf-8')
        except Exception:
            sys.exit("segment_ctm_edits.py: error opening ctm-edits output "
                     "file {0}".format(args.ctm_edits_out))

    # Most of what we're doing in the lines below is splitting the input lines
    # and grouping them per utterance, before giving them to
    # GetSegmentsForUtterance() and then printing the results.
    with f_in:  # makes sure the input handle gets closed on every exit path.
        first_line = f_in.readline()
        if first_line == '':
            sys.exit("segment_ctm_edits.py: empty input")
        split_pending_line = first_line.split()
        if len(split_pending_line) == 0:
            sys.exit("segment_ctm_edits.py: bad input line " + first_line)
        cur_utterance = split_pending_line[0]
        split_lines_of_cur_utterance = []

        while True:
            # An empty pending line means end of input; a different
            # utterance-id means the current utterance is complete.
            if len(split_pending_line) == 0 or split_pending_line[0] != cur_utterance:
                (segments_for_utterance,
                 deleted_segments_for_utterance) = GetSegmentsForUtterance(split_lines_of_cur_utterance)
                AccWordStatsForUtterance(split_lines_of_cur_utterance, segments_for_utterance)
                WriteSegmentsForUtterance(text_output_handle, segments_output_handle,
                                          cur_utterance, segments_for_utterance)
                if args.ctm_edits_out is not None:
                    PrintDebugInfoForUtterance(ctm_edits_output_handle,
                                               split_lines_of_cur_utterance,
                                               segments_for_utterance,
                                               deleted_segments_for_utterance)
                split_lines_of_cur_utterance = []
                if len(split_pending_line) == 0:
                    break
                else:
                    cur_utterance = split_pending_line[0]

            split_lines_of_cur_utterance.append(split_pending_line)
            next_line = f_in.readline()
            split_pending_line = next_line.split()
            if len(split_pending_line) == 0:
                # readline() returns '' only at end of file; anything else
                # that splits to nothing is a blank line inside the input.
                if next_line != '':
                    sys.exit("segment_ctm_edits.py: got an empty or whitespace input line")
    try:
        text_output_handle.close()
        segments_output_handle.close()
        if args.ctm_edits_out is not None:
            ctm_edits_output_handle.close()
    except Exception:
        sys.exit("segment_ctm_edits.py: error closing one or more outputs "
                 "(broken pipe or full disk?)")


def ReadNonScoredWords(non_scored_words_file):
    """Add every word listed in 'non_scored_words_file' to the global set
    'non_scored_words'.

    The file is read as UTF-8 and must contain exactly one word per line
    (a blank line or a line with several whitespace-separated fields is
    rejected).

    Exits the program via sys.exit() with an error message if the file cannot
    be opened or if a malformed line is found.
    """
    global non_scored_words
    try:
        f = open(non_scored_words_file, encoding='utf-8')
    except Exception:
        # 'except Exception' rather than a bare 'except' so that
        # KeyboardInterrupt / SystemExit are not turned into a misleading
        # "error opening file" message.
        sys.exit("segment_ctm_edits.py: error opening file: "
                 "--non-scored-words=" + non_scored_words_file)
    # The 'with' block guarantees the handle is closed even when we bail out
    # on a malformed line.
    with f:
        for line in f:
            a = line.split()
            if len(a) != 1:
                sys.exit("segment_ctm_edits.py: bad line in non-scored-words "
                         "file {0}: {1}".format(non_scored_words_file, line))
            non_scored_words.add(a[0])


# Global set of non-scored words; filled in by ReadNonScoredWords() from the
# file given as the non_scored_words_in argument.
non_scored_words = set()
ReadNonScoredWords(args.non_scored_words_in)

# Text form of the OOV word, read from --oov-symbol-file (stays None if the
# option was not given).  The file must consist of exactly one line holding
# exactly one word.
oov_symbol = None
if args.oov_symbol_file is not None:
    try:
        with open(args.oov_symbol_file, encoding='utf-8') as f:
            line = f.readline()
            # Explicit checks instead of 'assert': asserts are stripped when
            # python runs with -O, and an AssertionError carries no text for
            # the error message below.
            if len(line.split()) != 1:
                raise ValueError("expected exactly one word on the first "
                                 "line, got: " + repr(line))
            oov_symbol = line.split()[0]
            if f.readline() != '':
                raise ValueError("expected the file to contain exactly "
                                 "one line")
    except Exception as e:
        sys.exit("segment_ctm_edits.py: error reading file --oov-symbol-file=" +
                 args.oov_symbol_file + ", error is: " + str(e))
elif args.unk_padding != 0.0:
    sys.exit("segment_ctm_edits.py: if the --unk-padding option is nonzero (which "
             "it is by default, the --oov-symbol-file option must be supplied.")

# Global accumulators used while processing the data.
#
# num_segments and segment_total_length are both keyed by a 'stage' string
# (see AccumulateSegmentStats for details); missing keys start at 0.
num_segments = defaultdict(int)
segment_total_length = defaultdict(int)
# word_count_pair maps a key to a two-element list; each missing key starts
# out as a brand-new [0, 0] list (the lambda is the zero-argument factory).
word_count_pair = defaultdict(lambda: [0, 0])
# Utterance-level counters, all starting from zero.
total_length_of_utterances = 0
num_utterances_without_segments = 0
num_utterances = 0


# Main driver: process the input, then report statistics.
ProcessData()
PrintSegmentStats()
if args.word_stats_out is not None:
    PrintWordStats(args.word_stats_out)
if args.ctm_edits_out is not None:
    print("segment_ctm_edits.py: detailed utterance-level debug information "
          "is in " + args.ctm_edits_out, file=sys.stderr)


================================================
FILE: egs/steps/cleanup/internal/segment_ctm_edits_mild.py
================================================
#! /usr/bin/env python

# Copyright 2016   Vimal Manohar
#           2016   Johns Hopkins University (author: Daniel Povey)
# Apache 2.0

from __future__ import print_function
from __future__ import division
import argparse
import copy
import logging
import heapq
import sys
from collections import defaultdict

"""
This script reads 'ctm-edits' file format that is produced by align_ctm_ref.py
and modified by modify_ctm_edits.py and taint_ctm_edits.py. Its function is to
produce a segmentation and text from the ctm-edits input.

It is a milder version of the script segment_ctm_edits.py i.e. it allows
to keep more of the reference. This is useful for segmenting long-audio
based on imperfect transcripts.

The ctm-edits file format that this script expects is as follows
<file-id> <channel> <start-time> <duration> <hyp-word> <conf> <ref-word> <edit>
['tainted']
[note: file-id is really utterance-id at this point].
"""

# Module-level logging setup: one stream handler, attached to this module's
# logger, with both the handler and the logger initially at INFO level.
_global_formatter = logging.Formatter(
    '%(asctime)s [%(pathname)s:%(lineno)s - '
    '%(funcName)s - %(levelname)s ] %(message)s')

_global_handler = logging.StreamHandler()
_global_handler.setFormatter(_global_formatter)
_global_handler.setLevel(logging.INFO)

_global_logger = logging.getLogger(__name__)
_global_logger.addHandler(_global_handler)
_global_logger.setLevel(logging.INFO)

# Module-wide container of non-scored words, exposed through
# non_scored_words().
# NOTE(review): this starts out as an empty dict although the visible callers
# only perform membership tests on it -- confirm how it gets populated.
_global_non_scored_words = dict()


def non_scored_words():
    """Return the module-level container of non-scored words."""
    words = _global_non_scored_words
    return words


def get_args():
    """Define the command-line interface, parse the arguments and return the
    resulting argparse namespace.

    All file-valued options and positional arguments use argparse.FileType,
    so they arrive in the namespace as already-opened file objects rather
    than as filenames.

    As a side effect, if --verbose is greater than 2 the module-level log
    handler and logger are switched to DEBUG level.
    """
    parser = argparse.ArgumentParser(
        description="""This program produces segmentation and text information
        based on reading ctm-edits input format which is produced by
        steps/cleanup/internal/get_ctm_edits.py,
        steps/cleanup/internal/modify_ctm_edits.py and
        steps/cleanup/internal/taint_ctm_edits.py.""",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # --- General segment-shaping options. ---
    parser.add_argument("--min-segment-length", type=float, default=0.5,
                        help="""Minimum allowed segment length (in seconds) for
                        any segment; shorter segments than this will be
                        discarded.""")
    parser.add_argument("--min-new-segment-length", type=float, default=1.0,
                        help="""Minimum allowed segment length (in seconds) for
                        newly created segments (i.e. not identical to the input
                        utterances).
                        Expected to be >= --min-segment-length.""")
    parser.add_argument("--frame-length", type=float, default=0.01,
                        help="""This only affects rounding of the output times;
                        they will be constrained to multiples of this
                        value.""")
    parser.add_argument("--max-tainted-length", type=float, default=0.05,
                        help="""Maximum allowed length of any 'tainted' line.
                        Note: 'tainted' lines may only appear at the boundary
                        of a segment""")
    parser.add_argument("--max-edge-silence-length", type=float, default=0.5,
                        help="""Maximum allowed length of silence if it appears
                        at the edge of a segment (will be truncated).  This
                        rule is relaxed if such truncation would take a segment
                        below the --min-segment-length or
                        --min-new-segment-length.""")
    parser.add_argument("--max-edge-non-scored-length", type=float,
                        default=0.5,
                        help="""Maximum allowed length of a non-scored word
                        (noise, cough, etc.) if it appears at the edge of a
                        segment (will be truncated).  This rule is relaxed if
                        such truncation would take a segment below the
                        --min-segment-length.""")
    parser.add_argument("--max-internal-silence-length", type=float,
                        default=2.0,
                        help="""Maximum allowed length of silence if it appears
                        inside a segment (will cause the segment to be
                        split).""")
    parser.add_argument("--max-internal-non-scored-length", type=float,
                        default=2.0,
                        help="""Maximum allowed length of a non-scored word
                        (noise, etc.) if it appears inside a segment (will
                        cause the segment to be split).
                        Note: reference words which are real words but OOV are
                        not included in this category.""")
    parser.add_argument("--unk-padding", type=float, default=0.05,
                        help="""Amount of padding with <unk> that we do if a
                        segment boundary is next to errors (ins, del, sub).
                        That is, we add this amount of time to the segment and
                        add the <unk> word to cover the acoustics.  If nonzero,
                        the --oov-symbol-file option must be supplied.""")
    parser.add_argument("--max-junk-proportion", type=float, default=0.1,
                        help="""Maximum proportion of the time of the segment
                        that may consist of potentially bad data, in which we
                        include 'tainted' lines of the ctm-edits input and
                        unk-padding.""")
    parser.add_argument("--min-split-point-duration", type=float, default=0.0,
                        help="""Minimum duration of silence or non-scored word
                        to be considered a viable split point when
                        truncating based on junk proportion.""")
    # Stored in the namespace as 'max_deleted_words' (see dest=).
    parser.add_argument("--max-deleted-words-kept-when-merging",
                        dest='max_deleted_words', type=int, default=1,
                        help="""When merging segments that are found to be
                        overlapping or adjacent after all other processing,
                        keep in the transcript the reference words that were
                        deleted between the segments [if any] as long as there
                        were no more than this many reference words.  Setting
                        this to zero will mean that any reference words that
                        were deleted between the segments we're about to
                        reattach will not appear in the generated transcript
                        (so we'll match the hyp).""")

    # --- Options controlling the splitting of long segments.  The dotted
    # option names are mapped to plain attribute names through dest=. ---
    parser.add_argument("--splitting.min-silence-length",
                        dest="min_silence_length_to_split",
                        type=float, default=0.3,
                        help="""Only considers silences that are at least this
                        long as potential split points""")
    parser.add_argument("--splitting.min-non-scored-length",
                        dest="min_non_scored_length_to_split",
                        type=float, default=0.1,
                        help="""Only considers non-scored words that are at
                        least this long as potential split points""")
    parser.add_argument("--splitting.max-segment-length",
                        dest="max_segment_length_for_splitting",
                        type=float, default=10,
                        help="""Try to split long segments into segments that
                        are smaller that this size. See
                        possibly_split_long_segments() in Segment class.""")
    parser.add_argument("--splitting.hard-max-segment-length",
                        dest="hard_max_segment_length",
                        type=float, default=15,
                        help="""Split all segments that are longer than this
                        uniformly into segments of size
                        --splitting.max-segment-length""")

    # --- Weights used in the score for merging segments. ---
    parser.add_argument("--merging-score.silence-factor",
                        dest="silence_factor",
                        type=float, default=1,
                        help="""Weightage on the silence length when merging
                        segments""")
    parser.add_argument("--merging-score.incorrect-words-factor",
                        dest="incorrect_words_factor",
                        type=float, default=1,
                        help="""Weightage on the incorrect_words_length when
                        merging segments""")
    parser.add_argument("--merging-score.tainted-words-factor",
                        dest="tainted_words_factor",
                        type=float, default=1,
                        help="""Weightage on the WER including the
                        tainted words as incorrect words.""")

    # --- Limits applied when merging segments. ---
    # '%%' below is an escaped literal percent sign: argparse applies
    # %-formatting to help strings.
    parser.add_argument("--merging.max-wer",
                        dest="max_wer",
                        type=float, default=10.0,
                        help="Max WER%% of merged segments when merging")
    parser.add_argument("--merging.max-bad-proportion",
                        dest="max_bad_proportion",
                        type=float, default=0.2,
                        help="""Maximum length of silence, junk and incorrect
                        words in a merged segment allowed as a fraction of the
                        total length of merged segment.""")
    parser.add_argument("--merging.max-segment-length",
                        dest='max_segment_length_for_merging',
                        type=float, default=10,
                        help="""Maximum segment length allowed for merged
                        segment""")
    parser.add_argument("--merging.max-intersegment-incorrect-words-length",
                        dest='max_intersegment_incorrect_words_length',
                        type=float, default=0.2,
                        help="""Maximum length of intersegment region that
                        can be of incorrect word. This is to
                        allow cases where there may be a lot of silence in the
                        segment but the incorrect words are few, while
                        preventing regions that have a lot of incorrect
                        words.""")

    # --- Optional input/output files (opened by argparse; None if the
    # option is not given). ---
    parser.add_argument("--oov-symbol-file", type=argparse.FileType('r'),
                        help="""Filename of file such as data/lang/oov.txt
                        which contains the text form of the OOV word, normally
                        '<unk>'.  Supplied as a file to avoid complications
                        with escaping.  Necessary if the --unk-padding option
                        has a nonzero value (which it does by default.""")
    parser.add_argument("--ctm-edits-out", type=argparse.FileType('w'),
                        help="""Filename to output an extended version of the
                        ctm-edits format with segment start and end points
                        noted.  This file is intended to be read by humans;
                        there are currently no scripts that will read it.""")
    parser.add_argument("--word-stats-out", type=argparse.FileType('w'),
                        help="""Filename for output of word-level stats, of the
                        form '<word> <bad-proportion> <total-count-in-ref>',
                        e.g. 'hello 0.12 12408', where the <bad-proportion> is
                        the proportion of the time that this reference word
                        does not make it into a segment.  It can help reveal
                        words that have problematic pronunciations or are
                        associated with transcription errors.""")

    # --- Required positional arguments (all opened by argparse). ---
    parser.add_argument("non_scored_words_in",
                        metavar="<non-scored-words-file>",
                        type=argparse.FileType('r'),
                        help="""Filename of file containing a list of
                        non-scored words, one per line. See
                        steps/cleanup/internal/get_nonscored_words.py.""")
    parser.add_argument("ctm_edits_in", metavar="<ctm-edits-in>",
                        type=argparse.FileType('r'),
                        help="""Filename of input ctm-edits file.  Use
                        /dev/stdin for standard input.""")
    parser.add_argument("text_out", metavar="<text-out>",
                        type=argparse.FileType('w'),
                        help="""Filename of output text file (same format as
                        data/train/text, i.e.  <new-utterance-id> <word1>
                        <word2> ... <wordN>""")
    parser.add_argument("segments_out", metavar="<segments-out>",
                        type=argparse.FileType('w'),
                        help="""Filename of output segments.  This has the same
                        format as data/train/segments, but instead of
                        <recording-id>, the second field is the old
                        utterance-id, i.e <new-utterance-id> <old-utterance-id>
                        <start-time> <end-time>""")

    parser.add_argument("--verbose", type=int, default=0,
                        help="Use higher verbosity for more debugging output")

    args = parser.parse_args()

    # Debug-level logging is only enabled for verbosity levels above 2.
    if args.verbose > 2:
        _global_handler.setLevel(logging.DEBUG)
        _global_logger.setLevel(logging.DEBUG)

    return args


def is_tainted(split_line_of_utt):
    """Tell whether a split ctm-edits line carries the 'tainted' marker.

    The marker is the optional 9th field (index 8); lines with fewer than
    nine fields are never tainted.
    """
    if len(split_line_of_utt) <= 8:
        return False
    return split_line_of_utt[8] == 'tainted'


def compute_segment_cores(split_lines_of_utt):
    """
    Find the 'cores' of the prototype segments of one utterance.

    'split_lines_of_utt' is the list of (already split) ctm-edits lines of a
    single utterance.  The return value is a list of (start-index, end-index)
    pairs; a pair (s, e) denotes a core covering lines s, s+1, ... e-1.

    A core is built in two steps:
      1. It is seeded by every 'cor' line whose hyp-word (field 4) equals its
         ref-word (field 6), i.e. something correct that was not realized as
         unk.
      2. It is then grown, first forwards and then backwards, over
         neighbouring lines that are not tainted and whose edit type is
         'cor', 'sil' or 'fix'.

    Maximal runs of lines selected this way become the cores.  Tainted lines
    at the edges are deliberately left out here; they are added to the
    segments later.
    """
    core_compatible_edits = ('cor', 'sil', 'fix')

    def may_join_core(fields):
        # A line can be absorbed by a neighbouring core only if it is an
        # untainted 'cor', 'sil' or 'fix' line.
        edit_type = fields[7]
        return not is_tainted(fields) and edit_type in core_compatible_edits

    num_lines = len(split_lines_of_utt)

    # Step 1: seed with the truly correct lines.
    in_core = [fields[7] == 'cor' and fields[4] == fields[6]
               for fields in split_lines_of_utt]

    # Step 2a: grow forwards.  Each newly accepted line lets the following
    # one be considered on the next iteration.
    for idx in range(1, num_lines):
        if in_core[idx - 1] and not in_core[idx]:
            in_core[idx] = may_join_core(split_lines_of_utt[idx])

    # Step 2b: grow backwards in the same cascading fashion.
    for idx in range(num_lines - 2, -1, -1):
        if in_core[idx + 1] and not in_core[idx]:
            in_core[idx] = may_join_core(split_lines_of_utt[idx])

    # Collect the maximal runs of selected lines as half-open index ranges.
    segment_ranges = []
    run_start = None
    for idx, selected in enumerate(in_core):
        if selected and run_start is None:
            run_start = idx
        elif not selected and run_start is not None:
            segment_ranges.append((run_start, idx))
            run_start = None
    if run_start is not None:
        # The last run extends to the end of the utterance.
        segment_ranges.append((run_start, num_lines))

    return segment_ranges


class SegmentStats(object):
    """Container for the word-count and duration statistics of a segment."""

    # Integer-like counters; assert_equal() requires these to match exactly.
    _COUNT_FIELDS = ('num_incorrect_words', 'num_tainted_words', 'num_words')
    # Durations; assert_equal() allows these to differ by less than 0.01.
    _LENGTH_FIELDS = ('incorrect_words_length', 'tainted_nonsilence_length',
                      'silence_length', 'total_length')

    def __init__(self):
        # Every statistic starts from zero.
        for name in self._COUNT_FIELDS + self._LENGTH_FIELDS:
            setattr(self, name, 0)

    def wer(self):
        """Returns WER% (infinity when there are no words)."""
        if self.num_words == 0:
            return float("inf")
        return float(self.num_incorrect_words) * 100.0 / self.num_words

    def bad_proportion(self):
        """Returns the fraction of the total length made up of silence,
        tainted non-silence and incorrect words.

        Raises RuntimeError if that fraction is noticeably above 1.
        """
        assert self.total_length > 0
        bad_length = (self.silence_length + self.tainted_nonsilence_length
                      + self.incorrect_words_length)
        proportion = float(bad_length) / self.total_length
        if proportion > 1.00005:
            raise RuntimeError("Error in segment stats {0}".format(self))
        return proportion

    def incorrect_proportion(self):
        """Returns the fraction of the total length made up of incorrect
        words.

        Raises RuntimeError if that fraction is noticeably above 1.
        """
        assert self.total_length > 0
        proportion = float(self.incorrect_words_length) / self.total_length
        if proportion > 1.00005:
            raise RuntimeError("Error in segment stats {0}".format(self))
        return proportion

    def combine(self, other, scale=1):
        """Adds 'scale' times every statistic of 'other' to this object."""
        for name in self._COUNT_FIELDS + self._LENGTH_FIELDS:
            setattr(self, name,
                    getattr(self, name) + scale * getattr(other, name))

    def assert_equal(self, other):
        """Asserts that 'other' holds the same statistics: counts must be
        identical, lengths must agree to within 0.01.  On failure the two
        objects are logged and the AssertionError is re-raised."""
        try:
            for name in self._COUNT_FIELDS:
                assert getattr(self, name) == getattr(other, name)
            for name in self._LENGTH_FIELDS:
                assert abs(getattr(self, name) - getattr(other, name)) < 0.01
        except AssertionError:
            _global_logger.error("self %s != other %s", self, other)
            raise

    def compare(self, other):
        """Returns true if every statistic is exactly equal to that of
        another stats object."""
        return all(getattr(self, name) == getattr(other, name)
                   for name in self._COUNT_FIELDS + self._LENGTH_FIELDS)

    def __str__(self):
        values = {
            'num_incorrect': self.num_incorrect_words,
            'num_tainted': self.num_tainted_words,
            'num_words': self.num_words,
            'incorrect_length': self.incorrect_words_length,
            'sil_length': self.silence_length,
            'tainted_nonsilence_length': self.tainted_nonsilence_length,
            'total_length': self.total_length,
        }
        template = ("num-incorrect-words={num_incorrect:d},"
                    "num-tainted-words={num_tainted:d},"
                    "num-words={num_words:d},"
                    "incorrect-length={incorrect_length:.2f},"
                    "silence-length={sil_length:.2f},"
                    "tainted-nonsilence-length={tainted_nonsilence_length:.2f},"
                    "total-length={total_length:.2f}")
        return template.format(**values)


class Segment(object):
    """Class to store segments."""

    def __init__(self, split_lines_of_utt, start_index, end_index,
                 debug_str=None, compute_segment_stats=False,
                 segment_stats=None):
        self.split_lines_of_utt = split_lines_of_utt

        # start_index is the index of the first line that appears in this
        # segment, and end_index is one past the last line.  This does not
        # include unk-padding.
        self.start_index = start_index
        self.end_index = end_index
        assert end_index > start_index

        # If the following values are nonzero, then when we create the segment
        # we will add <unk> at the start and end of the segment [representing
        # partial words], with this amount of additional audio.
        self.start_unk_padding = 0.0
        self.end_unk_padding = 0.0

        # debug_str keeps track of the 'core' of the segment.
        if debug_str is None:
            debug_str = 'core-start={0},core-end={1}'.format(start_index,
                                                             end_index)
        else:
            assert type(debug_str) == str
        self.debug_str = debug_str

        # This gives the proportion of the time of the first line in the
        # segment that we keep.  Usually 1.0 but may be less if we've trimmed
        # away some proportion of the time.
        self.start_keep_proportion = 1.0
        # This gives the proportion of the time of the last line in the segment
        # that we keep.  Usually 1.0 but may be less if we've trimmed away some
        # proportion of the time.
        self.end_keep_proportion = 1.0

        self.stats = None

        if compute_segment_stats:
            self.compute_stats()

        if segment_stats is not None:
            self.compute_stats()
            self.stats.assert_equal(segment_stats)
            self.stats = segment_stats

    def copy(self, copy_stats=True):
        segment = Segment(self.split_lines_of_utt, self.start_index,
                          self.end_index, debug_str=self.debug_str,
                          segment_stats=(None if not copy_stats
                                         else copy.deepcopy(self.stats)))
        segment.start_keep_proportion = self.start_keep_proportion
        segment.end_keep_proportion = self.end_keep_proportion
        segment.start_unk_padding = self.start_unk_padding
        segment.end_unk_padding = self.end_unk_padding
        return segment

    def __str__(self):
        return self.debug_info()

    def compute_stats(self):
        """Compute stats for this segment and store them in self.stats.

        A fresh SegmentStats object is filled from the ctm-edits lines in
        [start_index, end_index):
          - a line whose edit type is not 'cor', 'fix' or 'sil' counts as an
            incorrect word (such a line must not be tainted);
          - a 'sil' line adds to the silence length; any other line counts as
            a word unless its ref-word is a non-scored word;
          - a tainted non-silence line whose ref-word is not a non-scored
            word adds to the tainted-word count and length.
        The total length is taken from self.length().

        This is typically called just before merging segments.

        Raises AssertionError if the segment has already been trimmed (keep
        proportions other than 1.0), if an incorrect line is tainted, or if
        the accumulated lengths exceed the total length.
        """
        self.stats = SegmentStats()

        # Stats are only defined for untrimmed segments.  These conditions do
        # not depend on the line, so they are checked once up front (the
        # constructor guarantees the line range is non-empty).  With both
        # proportions equal to 1.0 they are also equal to each other, which
        # is what a single-line segment would additionally require.
        # TODO(vimal): Decide if keep proportion must be applied to the
        # duration of the first and last lines.
        assert self.start_keep_proportion == 1.0
        assert self.end_keep_proportion == 1.0

        for i in range(self.start_index, self.end_index):
            split_line = self.split_lines_of_utt[i]
            this_duration = float(split_line[3])

            try:
                edit_type = split_line[7]
                ref_word = split_line[6]
                tainted = is_tainted(split_line)

                if edit_type not in ['cor', 'fix', 'sil']:
                    # TODO(vimal): The commented part below is is apparently
                    # not true in modify_ctm_edits.py.
                    # Need to check this or change comments there.
                    # assert ref_word not in non_scored_words
                    assert not tainted
                    self.stats.num_incorrect_words += 1
                    self.stats.incorrect_words_length += this_duration

                if edit_type == 'sil':
                    self.stats.silence_length += this_duration
                elif ref_word not in non_scored_words():
                    self.stats.num_words += 1

                # Compare against 'sil' with '!=': a 'not in' test on a
                # string is a substring test, not an equality test.
                if (tainted and edit_type != 'sil'
                        and ref_word not in non_scored_words()):
                    # If ref_word is not a non-scored word, this would be
                    # counted as an incorrect word.
                    self.stats.num_tainted_words += 1
                    self.stats.tainted_nonsilence_length += this_duration
            except Exception:
                _global_logger.error(
                    "Something went wrong when computing stats at "
                    "ctm line %s", split_line)
                raise
        self.stats.total_length = self.length()

        # Sanity check (with a small tolerance): the accumulated 'bad'
        # lengths cannot exceed the length of the segment.
        try:
            assert (self.stats.tainted_nonsilence_length
                    + self.stats.silence_length
                    + self.stats.incorrect_words_length - 0.001
                    <= self.stats.total_length)
        except AssertionError:
            _global_logger.error(
                "Something wrong with the stats for segment %s", self)
            raise

    def possibly_add_tainted_lines(self):
        """
        This is stage 1 of segment processing (after creating the boundaries of
        the core of the segment, which is done outside of this class).

        This function may reduce start_index and/or increase end_index by
        including a single adjacent 'tainted' line from the ctm-edits file.
        This is only done if the lines at the boundaries of the segment are
        currently real non-silence words and not non-scored words.  The idea is
        that we probably don't want to start or end the segment right at the
        boundary of a real word, we want to add some kind of padding.
        """
        split_lines_of_utt = self.split_lines_of_utt
        # we're iterating over the segment (start, end)
        for b in [False, True]:
            if b:
                boundary_index = self.end_index - 1
                adjacent_index = self.end_index
            else:
                boundary_index = self.start_index
                adjacent_index = self.start_index - 1
            if (adjacent_index >= 0
                    and adjacent_index < len(split_lines_of_utt)):
                # only consider merging the adjacent word into the segment if
                # we're not at the boundary of the utterance.
                adjacent_line_is_tainted = is_tainted(
                    split_lines_of_utt[adjacent_index])
                # if the adjacent line wasn't tainted, then there must have
                # been another stronger reason why we didn't include it in the
                # core of the segment (probably that it was an ins, del or
                # sub), so there is no point considering it.
                if adjacent_line_is_tainted:
                    boundary_edit_type = split_lines_of_utt[boundary_index][7]
                    boundary_ref_word = split_lines_of_utt[boundary_index][6]
                    # Even if the edit_type is 'cor', it is possible that
                    # column 4 (hyp_word) is not the same as column 6
                    # (ref_word) because the ref_word is an OOV and the
                    # hyp_word is OOV symbol.

                    # we only add the tainted line to the segment if the word
                    # at the boundary was a non-silence word that was correctly
                    # decoded and not fixed [see modify_ctm_edits.py.]
                    if (boundary_edit_type == 'cor'
                            and (boundary_ref_word
                                 not in non_scored_words())):
                        # Add the adjacent tainted line to the segment.
                        if b:
                            self.end_index += 1
                        else:
                            self.start_index -= 1

    def possibly_split_segment(self, max_internal_silence_length,
                               max_internal_non_scored_length):
        """
        This is stage 3 of segment processing.
        This function will split a segment into multiple pieces if any of the
        internal [non-boundary] silences or non-scored words are longer
        than the allowed values --max-internal-silence-length and
        --max-internal-non-scored-length.
        This function returns a list of segments.
        In the normal case (where there is no splitting) it just returns an
        array with a single element 'self'.

        Note: --max-internal-silence-length and
        --max-internal-non-scored-length can be set to very large values
        to avoid any splitting.
        """
        # make sure the segment hasn't been processed more than we expect.
        assert (self.start_unk_padding == 0.0 and self.end_unk_padding == 0.0
                and self.start_keep_proportion == 1.0
                and self.end_keep_proportion == 1.0)
        segments = []  # the answer
        cur_start_index = self.start_index
        cur_start_is_split = False
        # only consider splitting at non-boundary lines.  [we'd just truncate
        # the boundary lines.]
        for index_to_split_at in range(cur_start_index + 1,
                                       self.end_index - 1):
            this_split_line = self.split_lines_of_utt[index_to_split_at]
            this_duration = float(this_split_line[3])
            this_edit_type = this_split_line[7]
            this_ref_word = this_split_line[6]
            if ((this_edit_type == 'sil' and
                 this_duration > max_internal_silence_length)
                    or (this_ref_word in non_scored_words()
                        and (this_duration
                             > max_internal_non_scored_length))):
                # We split this segment at this index, dividing the word in two
                # [later on, in possibly_truncate_boundaries, it may be further
                # truncated.]
                # Note: we use 'index_to_split_at + 1' because the Segment
                # constructor takes an 'end-index' which is interpreted as one
                # past the end.
                new_segment = Segment(self.split_lines_of_utt, cur_start_index,
                                      index_to_split_at + 1,
                                      debug_str=self.debug_str)
                if cur_start_is_split:
                    new_segment.start_keep_proportion = 0.5
                new_segment.end_keep_proportion = 0.5
                cur_start_is_split = True
                cur_start_index = index_to_split_at
                segments.append(new_segment)
        if len(segments) == 0:  # We did not split.
            segments.append(self)
        else:
            # We did split.  Add the very last segment.
            new_segment = Segment(self.split_lines_of_utt, cur_start_index,
                                  self.end_index,
                                  debug_str=self.debug_str)
            assert cur_start_is_split
            new_segment.start_keep_proportion = 0.5
            segments.append(new_segment)
        return segments

    def possibly_split_long_segment(self, max_segment_length,
                                    hard_max_segment_length,
                                    min_silence_length_to_split,
                                    min_non_scored_length_to_split):
        """
        This is stage 4 of segment processing.
        This function will split a segment into multiple pieces if it is
        longer than the value --max-segment-length.
        It tries to split at silences and non-scored words that are
        at least --min-silence-length-to-split or
        --min-non-scored-length-to-split long.
        If this is not possible and the segments are still longer than
        --hard-max-segment-length, then this is split into equal length
        pieces of approximately --max-segment-length long.
        This function returns a list of segments.
        In the normal case (where there is no splitting) it just returns an
        array with a single element 'self'.
        """
        # make sure the segment hasn't been processed more than we expect.
        assert self.start_unk_padding == 0.0 and self.end_unk_padding == 0.0
        if self.length() < max_segment_length:
            return [self]

        segments = [self]  # the answer
        cur_start_index = self.start_index

        split_indexes = []
        # only consider splitting at non-boundary lines.  [we'd just truncate
        # the boundary lines.]
        for index_to_split_at in range(cur_start_index + 1,
                                       self.end_index - 1):
            this_split_line = self.split_lines_of_utt[index_to_split_at]
            this_duration = float(this_split_line[3])
            this_edit_type = this_split_line[7]
            this_ref_word = this_split_line[6]
            this_is_tainted = is_tainted(this_split_line)
            if (this_edit_type == 'sil'
                    and this_duration > min_silence_length_to_split):
                split_indexes.append((index_to_split_at, this_duration,
                                      this_is_tainted))

            if (this_ref_word in non_scored_words()
                    and (this_duration > min_non_scored_length_to_split)):
                split_indexes.append((index_to_split_at, this_duration,
                                      this_is_tainted))
        split_indexes.sort(key=lambda x: x[1], reverse=True)
        split_indexes.sort(key=lambda x: x[2])

        while True:
            if len(split_indexes) == 0:
                break

            new_segments = []

            for segment in segments:
                if segment.length() < max_segment_length:
                    new_segments.append(segment)
                    continue

                try:
                    index_to_split_at = next(
                        (x[0] for x in split_indexes
                         if (x[0] > segment.start_index
                             and x[0] < segment.end_index - 1)))
                except StopIteration:
                    _global_logger.debug(
                        "Could not find an index in the range (%d, %d) in "
                        "split-indexes %s", segment.start_index,
                        segment.end_index - 1, split_indexes)
                    new_segments.append(segment)
                    continue

                # We split this segment at this index, dividing the word in two
                # [later on, in possibly_truncate_boundaries, it may be further
                # truncated.]
                # Note: we use 'index_to_split_at + 1' because the Segment
                # constructor takes an 'end-index' which is interpreted as one
                # past the end.
                new_segment = Segment(
                    self.split_lines_of_utt, segment.start_index,
                    index_to_split_at + 1, debug_str=self.debug_str)
                new_segment.end_keep_proportion = 0.5
                new_segments.append(new_segment)

                new_segment = Segment(
                    self.split_lines_of_utt, index_to_split_at,
                    segment.end_index, debug_str=self.debug_str)
                new_segment.start_keep_proportion = 0.5
                new_segments.append(new_segment)

            if len(segments) == len(new_segments):
                # No splitting done
                break
            segments = new_segments

            for i, x in enumerate(segments):
                _global_logger.debug("Segment %d = %s", i, x)

        new_segments = []
        # Split segments that are still very long
        for segment in segments:
            if segment.length() < hard_max_segment_length:
                new_segments.append(segment)
                continue

            cur_start_index = segment.start_index
            cur_start = segment.start_time()

            index_to_split_at = None
            try:
                while True:
                    index_to_split_at = next(
                        (i for i in range(cur_start_index, segment.end_index)
                         if (float(self.split_lines_of_utt[i][2])
                             >= cur_start + max_segment_length)))

                    new_segment = Segment(
                        self.split_lines_of_utt, cur_start_index,
                        index_to_split_at)
                    new_segments.append(new_segment)

                    cur_start_index = index_to_split_at
                    cur_start = float(
                        self.split_lines_of_utt[cur_start_index][2])
                    index_to_split_at = None

                    if (segment.end_time() - cur_start
                            < hard_max_segment_length):
                        raise StopIteration
            except StopIteration:
                if index_to_split_at is None:
                    _global_logger.debug(
                        "Could not find an index in the range (%d, %d) with "
                        "start time > %.2f", cur_start_index,
                        segment.end_index, cur_start + max_segment_length)
                new_segment = Segment(
                    self.split_lines_of_utt, cur_start_index,
                    segment.end_index)
                new_segments.append(new_segment)
                break
        segments = new_segments
        return segments

    def possibly_truncate_boundaries(self, max_edge_silence_length,
                                     max_edge_non_scored_length):
        """
        This is stage 5 of segment processing.
        It will truncate the silences and non-scored words at the segment
        boundaries if they are longer than the --max-edge-silence-length and
        --max-edge-non-scored-length respectively
        (and to the extent that this wouldn't take us below the
        --min-segment-length or --min-new-segment-length. See
        relax_boundary_truncation()).

        Note: --max-edge-silence-length and --max-edge-non-scored-length
        can be set to very large values to avoid any truncation.
        """
        for b in [True, False]:
            if b:
                this_index = self.start_index
            else:
                this_index = self.end_index - 1
            this_split_line = self.split_lines_of_utt[this_index]
            truncated_duration = None
            this_duration = float(this_split_line[3])
            this_edit = this_split_line[7]
            this_ref_word = this_split_line[6]
            if (this_edit == 'sil'
                    and this_duration > max_edge_silence_length):
                truncated_duration = max_edge_silence_length
            elif (this_ref_word in non_scored_words()
                  and this_duration > max_edge_non_scored_length):
                truncated_duration = max_edge_non_scored_length
            if truncated_duration is not None:
                keep_proportion = truncated_duration / this_duration
                if b:
                    self.start_keep_proportion = keep_proportion
                else:
                    self.end_keep_proportion = keep_proportion

    def relax_boundary_truncation(self, min_segment_length,
                                  min_new_segment_length):
        """
        This relaxes the segment-boundary truncation of
        possibly_truncate_boundaries(), if it would take us below
        min-new-segment-length or min-segment-length.

        Note: this does not relax the boundary truncation for a particular
        boundary (start or end) if that boundary corresponds to a 'tainted'
        line of the ctm (because it's dangerous to include too much 'tainted'
        audio).
        """
        # this should be called before adding unk padding.
        assert self.start_unk_padding == self.end_unk_padding == 0.0
        if self.start_keep_proportion == self.end_keep_proportion == 1.0:
            return  # nothing to do there was no truncation.
        length_cutoff = max(min_new_segment_length, min_segment_length)
        length_with_truncation = self.length()
        if length_with_truncation >= length_cutoff:
            return  # Nothing to do.
        orig_start_keep_proportion = self.start_keep_proportion
        orig_end_keep_proportion = self.end_keep_proportion
        if not is_tainted(self.split_lines_of_utt[self.start_index]):
            self.start_keep_proportion = 1.0
        if not is_tainted(self.split_lines_of_utt[self.end_index - 1]):
            self.end_keep_proportion = 1.0
        length_with_relaxed_boundaries = self.length()
        if length_with_relaxed_boundaries <= length_cutoff:
            # Completely undo the truncation [to the extent allowed by the
            # presence of tainted lines at the start/end] if, even without
            # truncation, we'd be below the length cutoff.  This segment may be
            # removed later on (but it may not, if removing truncation makes us
            # identical to the input utterance, and the length is between
            # min_segment_length min_new_segment_length).
            return
        # Next, compute an interpolation constant a such that the
        # {start,end}_keep_proportion values will equal
        # a
        # * [values-computed-by-possibly_truncate_boundaries()]
        # + (1-a) * [completely-relaxed-values].
        # we're solving the equation:
        # length_cutoff = a * length_with_truncation
        #                 + (1-a) * length_with_relaxed_boundaries
        # -> length_cutoff - length_with_relaxed_boundaries =
        #        a * (length_with_truncation - length_with_relaxed_boundaries)
        # -> a = (length_cutoff - length_with_relaxed_boundaries)
        #        / (length_with_truncation - length_with_relaxed_boundaries)
        a = (length_cutoff - length_with_relaxed_boundaries) / (length_with_truncation - length_with_relaxed_boundaries)
        if a < 0.0 or a > 1.0:
            # TODO(vimal): Should this be an error?
            _global_logger.warn("bad 'a' value = %.4f", a)
            return
        self.start_keep_proportion = (
            a * orig_start_keep_proportion
            + (1 - a) * self.start_keep_proportion)
        self.end_keep_proportion = (
            a * orig_end_keep_proportion + (1 - a) * self.end_keep_proportion)
        if abs(self.length() - length_cutoff) >= 0.01:
            # TODO(vimal): Should this be an error?
            _global_logger.warn(
                "possible problem relaxing boundary "
                "truncation, length is %.2f vs %.2f", self.length(),
                length_cutoff)

    def possibly_add_unk_padding(self, max_unk_padding):
        """
        This is stage 7 of segment processing.
        This function may set start_unk_padding and end_unk_padding to nonzero
        values.  This is done if the current boundary words are real, scored
        words and we're not next to the beginning or end of the utterance.
        """
        for b in [True, False]:
            if b:
                this_index = self.start_index
            else:
                this_index = self.end_index - 1
            this_split_line = self.split_lines_of_utt[this_index]
            this_start_time = float(this_split_line[2])
            this_ref_word = this_split_line[6]
            this_edit = this_split_line[7]
            if this_edit == 'cor' and this_ref_word not in non_scored_words():
                # we can consider adding unk-padding.
                if b:   # start of utterance.
                    unk_padding = max_unk_padding
                    # close to beginning of file
                    if unk_padding > this_start_time:
                        unk_padding = this_start_time
                    # If we could add less than half of the specified
                    # unk-padding, don't add any (because when we add
                    # unk-padding we add the unknown-word symbol '<unk>', and
                    # if there isn't enough space to traverse the HMM we don't
                    # want to do it at all.
                    if unk_padding < 0.5 * max_unk_padding:
                        unk_padding = 0.0
                    self.start_unk_padding = unk_padding
                else:   # end of utterance.
                    this_end_time = this_start_time + float(this_split_line[3])
                    last_line = self.split_lines_of_utt[-1]
                    utterance_end_time = (float(last_line[2])
                                          + float(last_line[3]))
                    max_allowable_padding = utterance_end_time - this_end_time
                    assert max_allowable_padding > -0.01
                    unk_padding = max_unk_padding
                    if unk_padding > max_allowable_padding:
                        unk_padding = max_allowable_padding
                    # If we could add less than half of the specified
                    # unk-padding, don't add any (because when we add
                    # unk-padding we add the unknown-word symbol '<unk>',
                    # and if there isn't enough space to traverse the HMM we
                    # don't want to do it at all.
                    if unk_padding < 0.5 * max_unk_padding:
                        unk_padding = 0.0
                    self.end_unk_padding = unk_padding

    def start_time(self):
        """Returns the start time of the utterance (within the enclosing
        utterance).
        This is before any rounding.
        """
        if self.start_index == len(self.split_lines_of_utt):
            assert self.end_index == len(self.split_lines_of_utt)
            return self.end_time()
        first_line = self.split_lines_of_utt[self.start_index]
        first_line_start = float(first_line[2])
        first_line_duration = float(first_line[3])
        first_line_end = first_line_start + first_line_duration
        return (first_line_end - self.start_unk_padding
                - (first_line_duration * self.start_keep_proportion))

    def debug_info(self, include_stats=True):
        """Returns some string-valued information about 'this' that is useful
        for debugging."""
        if include_stats and self.stats is not None:
            stats = 'wer={wer:.2f},{stats},'.format(
                wer=self.stats.wer(), stats=self.stats)
        else:
            stats = ''

        return ('start={start:d},end={end:d},'
                'unk-padding={start_unk_padding:.2f},{end_unk_padding:.2f},'
                'keep-proportion={start_prop:.2f},{end_prop:.2f},'
                'start-time={start_time:.2f},end-time={end_time:.2f},'
                '{stats}'
                'debug-str={debug_str}'.format(
                    start=self.start_index, end=self.end_index,
                    start_unk_padding=self.start_unk_padding,
                    end_unk_padding=self.end_unk_padding,
                    start_prop=self.start_keep_proportion,
                    end_prop=self.end_keep_proportion,
                    start_time=self.start_time(), end_time=self.end_time(),
                    stats=stats, debug_str=self.debug_str))

    def end_time(self):
        """Returns the start time of the utterance (within the enclosing
        utterance)."""
        if self.end_index == 0:
            assert self.start_index == 0
            return self.start_time()
        last_line = self.split_lines_of_utt[self.end_index - 1]
        last_line_start = float(last_line[2])
        last_line_duration = float(last_line[3])
        return (last_line_start
                + (last_line_duration * self.end_keep_proportion)
                + self.end_unk_padding)

    def length(self):
        """Returns the segment length in seconds."""
        return self.end_time() - self.start_time()

    def is_whole_utterance(self):
        """returns true if this segment corresponds to the whole utterance that
        it's a part of (i.e. its start/end time are zero and the end-time of
        the last segment."""
        last_line_of_utt = self.split_lines_of_utt[-1]
        last_line_end_time = (float(last_line_of_utt[2])
                              + float(last_line_of_utt[3]))
        return (abs(self.start_time() - 0.0) < 0.001
                and abs(self.end_time() - last_line_end_time) < 0.001)

    def get_junk_proportion(self):
        """Return the fraction of this segment's duration that is 'junk':
        unk-padding plus the kept part of a tainted first and/or last ctm
        line (intended to be between 0.0 and 1.0)."""
        # Only the first and last lines can be tainted, because of the way
        # segments are created; and when either is tainted the segment must
        # contain other lines too, so the same line is never counted twice.
        junk = self.start_unk_padding + self.end_unk_padding
        boundaries = (
            (self.start_index, self.start_keep_proportion),
            (self.end_index - 1, self.end_keep_proportion),
        )
        for line_index, keep_proportion in boundaries:
            line = self.split_lines_of_utt[line_index]
            if is_tainted(line):
                junk += float(line[3]) * keep_proportion
        return junk / self.length()

    def get_junk_duration(self):
        """Return the junk duration of this segment in seconds, computed as
        get_junk_proportion() times length()."""
        junk_proportion = self.get_junk_proportion()
        return junk_proportion * self.length()

    def merge_adjacent_segment(self, other):
        """
        Merge the adjacent segment 'other' into 'self' (in place).

        Only to be called when 'self' and 'other' belong to the same
        utterance (they share the same split_lines_of_utt object), 'other'
        comes after 'self' in time order, and
        self.end_index <= other.start_index + 1, i.e. the two segments
        have at most one ctm line in common (usually a tainted word or a
        silence).

        The stats of 'other' are added to those of 'self'; when one line is
        shared, its stats are subtracted once so that it is not counted
        twice.  Everything describing the end of 'self' is then taken from
        'other'.
        """
        try:
            assert self.end_index <= other.start_index + 1
            assert self.start_time() < other.end_time()
            assert self.split_lines_of_utt is other.split_lines_of_utt
        except AssertionError:
            # Log both segments to make the failed precondition debuggable.
            _global_logger.error("self: %s", self)
            _global_logger.error("other: %s", other)
            raise

        assert self.start_index == 0 or self.start_index != other.start_index

        _global_logger.debug("Before merging: %s", self)

        assert not self.stats.compare(other.stats), "%s %s" % (self, other)
        self.stats.combine(other.stats)

        shares_one_line = (self.end_index == other.start_index + 1)
        if shares_one_line:
            # The shared line is now counted in both sets of stats; remove
            # one copy of it.
            shared_part = Segment(
                self.split_lines_of_utt, other.start_index,
                self.end_index, compute_segment_stats=True)
            self.stats.combine(shared_part.stats, scale=-1)

        _global_logger.debug("Other segment: %s", other)

        self.debug_str = "({0}/merged-with-adjacent/{1})".format(
            self.debug_str, other.debug_str)

        # The end of the merged segment is the end of 'other'.
        self.end_index = other.end_index
        self.end_unk_padding = other.end_unk_padding
        self.end_keep_proportion = other.end_keep_proportion

        _global_logger.debug("After merging %s", self)

    def merge_with_segment(self, other, max_deleted_words):
        """
        This function will merge the segment in 'other' with the segment
        in 'self'.  It is only to be called when 'self' and 'other' are from
        the same utterance, 'other' is after 'self' in time order (based on
        the original segment cores), and self.end_time() >= other.start_time().
        Note: in this situation there will normally be deleted words
        between the two segments.  What this program does with the deleted
        words depends on '--max-deleted-words-kept-when-merging'.  If there
        were any inserted words in the transcript (less likely), this
        program will keep the reference.

        If the number of deleted ('del') ctm lines between (or shared by) the
        two segments exceeds 'max_deleted_words', each of those lines gets
        the marker 'do-not-include-in-text' appended, which makes text()
        leave it out.

        Note: --max-deleted-words-kept-when-merging can be set to a very
        large value to keep all the words.
        """
        try:
            assert self.end_time() >= other.start_time()
            assert self.start_time() < other.end_time()
            assert self.split_lines_of_utt is other.split_lines_of_utt
        except AssertionError:
            _global_logger.error("self: %s", self)
            _global_logger.error("other: %s", other)
            raise

        assert self.start_index == 0 or self.start_index != other.start_index

        _global_logger.debug("Before merging: %s", self)

        assert (not self.stats.compare(other.stats)
                or self.start_time() != other.start_time()
                or self.end_time() != other.end_time()
                ), "%s %s" % (self, other)
        self.stats.combine(other.stats)

        _global_logger.debug("Other segment: %s", other)

        orig_self_end_index = self.end_index
        self.debug_str = "({0}/merged-with/{1})".format(
            self.debug_str, other.debug_str)

        # everything that relates to the end of this segment gets copied
        # from 'other'.
        self.end_index = other.end_index
        self.end_unk_padding = other.end_unk_padding
        self.end_keep_proportion = other.end_keep_proportion

        _global_logger.debug("After merging %s", self)

        # The next thing we have to do is to go over any lines of the ctm that
        # appear between 'self' and 'other', or are shared between both (this
        # would only happen for tainted silence or non-scored-word segments),
        # and decide what to do with them.  We'll keep the reference for any
        # substitutions or insertions (which anyway are unlikely to appear
        # in these merged segments).  Note: most of this happens in
        # self.text(), but at this point we need to decide whether to mark any
        # deletions as 'discard-this-word'.

        # These are initialised up front so that the error handler below can
        # always report them, even when the failure happens before they have
        # been given their real values.
        first_index_of_overlap = None
        last_index_of_overlap = None
        i = None
        try:
            if orig_self_end_index <= other.start_index:
                # No overlap in indexes
                first_index_of_overlap = orig_self_end_index
                last_index_of_overlap = other.start_index - 1
                segment = Segment(
                    self.split_lines_of_utt, orig_self_end_index,
                    other.start_index, compute_segment_stats=True)
                self.stats.combine(segment.stats)
            else:
                first_index_of_overlap = other.start_index
                last_index_of_overlap = orig_self_end_index - 1

            num_deleted_words = 0
            for i in range(first_index_of_overlap, last_index_of_overlap + 1):
                edit_type = self.split_lines_of_utt[i][7]
                if edit_type == 'del':
                    num_deleted_words += 1
            if num_deleted_words > max_deleted_words:
                for i in range(first_index_of_overlap,
                               last_index_of_overlap + 1):
                    if self.split_lines_of_utt[i][7] == 'del':
                        self.split_lines_of_utt[i].append(
                            'do-not-include-in-text')
        except Exception:
            # Log whatever context is available, then re-raise the original
            # error.  The handler must not fail itself, otherwise it would
            # hide the real problem.
            _global_logger.error(
                "first-index-of-overlap = %s", first_index_of_overlap)
            _global_logger.error(
                "last-index-of-overlap = %s", last_index_of_overlap)
            if i is not None and 0 <= i < len(self.split_lines_of_utt):
                _global_logger.error("line = %d = %s", i,
                                     self.split_lines_of_utt[i])
            raise
        _global_logger.debug("After merging %s", self)

    def contains_atleast_one_scored_non_oov_word(self):
        """
        this will return true if there is at least one word in the utterance
        that's a scored word (not a non-scored word) and not an OOV word that's
        realized as unk.  This becomes a filter on keeping segments.
        """
        for i in range(self.start_index, self.end_index):
            this_split_line = self.split_lines_of_utt[i]
            this_hyp_word = this_split_line[4]
            this_ref_word = this_split_line[6]
            this_edit = this_split_line[7]
            if (this_edit == 'cor' and this_ref_word not in non_scored_words()
                    and this_ref_word == this_hyp_word):
                return True
        return False

    def text(self, oov_symbol, eps_symbol="<eps_symbol>"):
        """Returns the text corresponding to this utterance, as a string."""
        text_array = []
        if self.start_unk_padding != 0.0:
            text_array.append(oov_symbol)
        for i in range(self.start_index, self.end_index):
            this_split_line = self.split_lines_of_utt[i]
            this_ref_word = this_split_line[6]
            if (this_ref_word != eps_symbol
                    and this_split_line[-1] != 'do-not-include-in-text'):
                text_array.append(this_ref_word)
        if self.end_unk_padding != 0.0:
            text_array.append(oov_symbol)
        return ' '.join(text_array)


class SegmentsMerger(object):
    """This class contains methods for merging segments. It stores the
    appropriate statistics required for this process in objects of
    SegmentStats class.

    Parameters:
        segments - a reference to the list of initial segments
        merged_segments - stores all the initial segments as well
                          as the newly created segments; a dict keyed by
                          the tuple of segment indexes making up the cluster
        between_segments - stores the inter-segment "segments"
                           for the initial segments; entry i is the region
                           just before segment i (None when there is no gap),
                           and the last entry is the region after the final
                           segment
        split_lines_of_utt - a reference to the CTM lines
    """

    def __init__(self, segments):
        """Record the initial segments and build the between-segment regions.

        Raises IndexError (after logging) when 'segments' is empty.
        """
        self.segments = segments

        try:
            self.split_lines_of_utt = segments[0].split_lines_of_utt
        except IndexError as e:
            _global_logger.error("No input segments found!")
            raise e

        self.merged_segments = {}
        self.between_segments = [None for i in range(len(segments) + 1)]

        # Region between the start of the utterance and the first segment.
        if segments[0].start_index > 0:
            self.between_segments[0] = Segment(
                self.split_lines_of_utt, 0, segments[0].start_index,
                compute_segment_stats=True)

        for i, x in enumerate(segments):
            x.compute_stats()
            # Every initial segment is registered as a single-element cluster.
            self.merged_segments[(i, )] = x

            # Gap between the previous segment and this one, if any.
            if i > 0 and segments[i].start_index > segments[i - 1].end_index:
                self.between_segments[i] = Segment(
                    self.split_lines_of_utt, segments[i - 1].end_index,
                    segments[i].start_index, compute_segment_stats=True)

        # Region between the last segment and the end of the utterance.
        if segments[-1].end_index < len(self.split_lines_of_utt):
            self.between_segments[-1] = Segment(
                self.split_lines_of_utt, segments[-1].end_index,
                len(self.split_lines_of_utt), compute_segment_stats=True)

    def _get_merged_cluster(self, cluster1, cluster2, rejected_clusters=None,
                            max_intersegment_incorrect_words_length=1):
        """Build (or look up) the segment obtained by joining two adjacent
        clusters.

        cluster1 and cluster2 are lists of segment indexes; the index -1
        stands for the region before the first segment and the index
        len(self.segments) for the region after the last one.

        Returns a 3-tuple (merged_segment, new_cluster, reject) where
        new_cluster is cluster1 + cluster2.  When reject is True the
        merged_segment is None.  Newly built segments are cached in
        self.merged_segments.  Any exception is logged together with the
        segments involved and then re-raised.
        """
        try:
            assert cluster2[0] > cluster1[-1]
            new_cluster = cluster1 + cluster2
            new_cluster_tup = tuple(new_cluster)

            # Previously rejected: report the rejection again.
            if (rejected_clusters is not None
                    and new_cluster_tup in rejected_clusters):
                return (None, new_cluster, True)

            # Previously built: reuse the cached segment.
            if new_cluster_tup in self.merged_segments:
                return (self.merged_segments[new_cluster_tup],
                        new_cluster, False)

            if cluster1[-1] == -1:
                assert len(cluster1) == 1
                # Consider merging cluster2 with the region before the 0^th
                # segment
                if (self.between_segments[0] is None
                        or self.between_segments[0].stats.total_length == 0
                        or (self.between_segments[0]
                            .stats.incorrect_words_length
                            > max_intersegment_incorrect_words_length)):
                    # Reject zero length or bad start region
                    return (None, new_cluster, True)
                merged_segment = self.between_segments[0].copy()
            else:
                merged_segment = self.merged_segments[tuple(cluster1)].copy()

                if cluster2[0] == len(self.segments):
                    assert len(cluster2) == 1
                    if (self.between_segments[-1] is None
                            or (self.between_segments[-1]
                                .stats.total_length == 0)
                            or (self.between_segments[-1]
                                .stats.incorrect_words_length
                                > max_intersegment_incorrect_words_length)):
                        # Reject zero length or bad end region
                        return (None, new_cluster, True)
                # Absorb the gap that sits between the two clusters, unless
                # it contains too much incorrect material.
                if self.between_segments[cluster2[0]] is not None:
                    if (self.between_segments[cluster2[0]]
                            .stats.incorrect_words_length
                            > max_intersegment_incorrect_words_length):
                        return (None, new_cluster, True)
                    merged_segment.merge_adjacent_segment(
                        self.between_segments[cluster2[0]])

            if cluster2[0] < len(self.segments):
                merged_segment.merge_adjacent_segment(
                    self.merged_segments[tuple(cluster2)])
            # else:
            # Already done
            # merged_segment.merge_adjacent_segment(self.between_segments[-1])

            self.merged_segments[new_cluster_tup] = merged_segment
            return (merged_segment, new_cluster, False)
        except:
            _global_logger.error("Failed merging cluster1 %s and cluster2 %s",
                                 cluster1, cluster2)
            for i in (cluster1 + cluster2):
                if i >= 0 and i < len(self.segments):
                    _global_logger.error("Segment %d = %s", i,
                                         self.segments[i])
            raise

    def merge_clusters(self, scoring_function,
                       max_wer=10, max_bad_proportion=0.3,
                       max_segment_length=10,
                       max_intersegment_incorrect_words_length=1):
        """Agglomeratively merge neighbouring clusters of segments.

        On every pass each pair of adjacent clusters is tentatively merged
        and scored with scoring_function (called on the merged segment);
        candidates are then examined from the highest score downwards and the
        first one whose stats satisfy the max_wer, max_bad_proportion and
        max_segment_length limits is accepted.  The loop stops when no
        candidate is acceptable or a single cluster remains.

        Returns the final list of clusters, each a list of segment indexes
        (including the pseudo-indexes -1 and len(self.segments) for the
        regions before the first and after the last segment).
        """
        for i, x in enumerate(self.segments):
            _global_logger.debug("before agglomerative clustering, segment %d"
                                 " = %s", i, x)

        # Initial clusters are the individual segments themselves.
        clusters = [[x] for x in range(-1, len(self.segments) + 1)]

        rejected_clusters = set()

        while len(clusters) > 1:
            try:
                _global_logger.debug("Current clusters: %s", clusters)

                heap = []

                for i in range(len(clusters) - 1):
                    merged_segment, new_cluster, reject = (
                        self._get_merged_cluster(
                            clusters[i], clusters[i + 1], rejected_clusters,
                            max_intersegment_incorrect_words_length=(
                                max_intersegment_incorrect_words_length)))
                    if reject:
                        rejected_clusters.add(tuple(new_cluster))
                        continue
                    # heapq is a min-heap, so the score is negated to pop the
                    # best-scoring candidate first; the index i breaks ties
                    # so that the payload tuples are never compared.
                    heapq.heappush(heap, ((-scoring_function(merged_segment), i),
                                          (merged_segment, i, new_cluster)))

                # -1 means "no acceptable candidate found on this pass".
                candidate_index = -1
                candidate_cluster = None

                while True:
                    try:
                        score, tup = heapq.heappop(heap)
                    except IndexError:
                        # Heap exhausted.
                        break

                    segment, index, cluster = tup

                    _global_logger.debug(
                        "Considering new cluster: (%d, %s)", index, cluster)

                    if segment.stats.wer() > max_wer:
                        _global_logger.debug(
                            "Rejecting cluster with "
                            "WER%% %.2f > %.2f", segment.stats.wer(), max_wer)
                        rejected_clusters.add(tuple(cluster))
                        continue

                    if segment.stats.bad_proportion() > max_bad_proportion:
                        _global_logger.debug(
                            "Rejecting cluster with bad-proportion "
                            "%.2f > %.2f", segment.stats.bad_proportion(),
                            max_bad_proportion)
                        rejected_clusters.add(tuple(cluster))
                        continue

                    if segment.stats.total_length > max_segment_length:
                        _global_logger.debug(
                            "Rejecting cluster with length "
                            "%.2f > %.2f", segment.stats.total_length,
                            max_segment_length)
                        rejected_clusters.add(tuple(cluster))
                        continue

                    candidate_index, candidate_cluster = tup[1:]
                    _global_logger.debug("Accepted cluster (%d, %s)",
                                         candidate_index, candidate_cluster)
                    break

                if candidate_index == -1:
                    return clusters

                # Replace clusters[candidate_index] and the one after it by
                # the accepted merged cluster.
                new_clusters = []

                for i in range(candidate_index):
                    new_clusters.append(clusters[i])
                new_clusters.append(candidate_cluster)
                for i in range(candidate_index + 2, len(clusters)):
                    new_clusters.append(clusters[i])

                # Sanity check: every pass must shrink the cluster list,
                # otherwise the outer loop could not terminate.
                if len(new_clusters) >= len(clusters):
                    raise RuntimeError("Old: {0}; New: {1}".format(
                        clusters, new_clusters))
                clusters = new_clusters
            except Exception:
                _global_logger.error(
                    "Failed merging clusters %s", clusters)
                raise

        return clusters


def merge_segments(segments, args):
    """Merge the initial segments of one utterance.

    First runs agglomerative clustering (SegmentsMerger.merge_clusters) over
    the segments, then drops merged segments whose WER exceeds args.max_wer
    and joins those that overlap in time.

    Returns the resulting list of segments; an empty list when there was
    nothing to merge or nothing was below the WER limit.
    """
    if len(segments) == 0:
        _global_logger.debug("Got no segments at merging segments stage")
        return []

    def scoring_function(segment):
        # Higher is better: the score is minus a weighted sum of WER, silence
        # length, incorrect-words length and percentage of tainted words.
        stats = segment.stats
        try:
            negated_wer = -stats.wer()
            silence_penalty = args.silence_factor * stats.silence_length
            incorrect_penalty = (args.incorrect_words_factor
                                 * stats.incorrect_words_length)
            tainted_penalty = (args.tainted_words_factor
                               * stats.num_tainted_words * 100.0
                               / stats.num_words)
            return (negated_wer - silence_penalty - incorrect_penalty
                    - tainted_penalty)
        except ZeroDivisionError:
            return float("-inf")

    # Agglomerative clustering of the initial segments; the score used for
    # combining two neighbours is scoring_function applied to the stats of
    # the combined segment.
    merger = SegmentsMerger(segments)
    clusters = merger.merge_clusters(
        scoring_function, max_wer=args.max_wer,
        max_bad_proportion=args.max_bad_proportion,
        max_segment_length=args.max_segment_length_for_merging,
        max_intersegment_incorrect_words_length=(
            args.max_intersegment_incorrect_words_length))

    _global_logger.debug("Clusters to be merged: %s", clusters)

    # Turn the clusters into actual segments.
    last_cluster_index = len(clusters) - 1
    clustered_segments = []
    for cluster_index, cluster in enumerate(clusters):
        _global_logger.debug(
            "Merging cluster (%d, %s)", cluster_index, cluster)

        try:
            is_singleton = len(cluster) == 1
            if cluster_index == 0 and is_singleton:
                # The region before the first segment was not merged with
                # it, so its lines are left out.
                assert cluster[0] == -1
                _global_logger.debug(
                    "Not adding region before the first segment")
                continue
            if cluster_index == last_cluster_index and is_singleton:
                # Likewise for the region after the last segment.
                _global_logger.debug(
                    "Not adding remaining end region %s",
                    cluster[0])
                assert cluster[0] == len(segments)
                break

            clustered_segments.append(merger.merged_segments[tuple(cluster)])
        except Exception:
            _global_logger.error("Error with cluster (%d, %s)",
                                 cluster_index, cluster)
            raise

    for i, x in enumerate(clustered_segments):
        _global_logger.debug(
            "after agglomerative clustering: segment %d = %s", i, x)

    assert len(clustered_segments) > 0

    # Skip the leading segments that are not below max_wer.
    first_kept = 0
    while (first_kept < len(clustered_segments)
           and not (clustered_segments[first_kept].stats.wer()
                    < args.max_wer)):
        first_kept += 1

    if first_kept == len(clustered_segments):
        _global_logger.debug("No merged segments were below "
                             "WER%% %.2f", args.max_wer)
        return []

    _global_logger.debug("Merging overlapping segments starting from the "
                         "first segment with WER%% < max_wer i.e. %d = %s",
                         first_kept, clustered_segments[first_kept])

    result = [clustered_segments[first_kept]]
    for candidate in clustered_segments[first_kept + 1:]:
        if candidate.stats.wer() > args.max_wer:
            # ignore this segment
            continue
        if result[-1].end_time() >= candidate.start_time():
            # Overlaps the previously kept segment in time: fold it in.
            result[-1].merge_with_segment(candidate, args.max_deleted_words)
        else:
            result.append(candidate)

    return result


def get_segments_for_utterance(split_lines_of_utt, args, utterance_stats):
    """
    This function creates the segments for an utterance as a list
    of class Segment.
    It returns a 2-tuple (list-of-segments, list-of-deleted-segments)
    where the deleted segments are only useful for diagnostic printing.
    Note: split_lines_of_utt is a list of lists, one per line, each containing
    the sequence of fields.

    Processing runs through numbered stages (0 to 10).  After each stage the
    surviving segments are added to utterance_stats, logged at debug level
    and, when args.verbose > 4, dumped to stderr.
    """

    def report_stage(stage_segments, stats_label, log_format, verbose_header,
                     log_debug_info=True):
        # Shared per-stage bookkeeping: accumulate stats, log every segment,
        # and optionally print the annotated ctm-edits lines to stderr.
        # NOTE: stats_label strings are dictionary keys in utterance_stats
        # and are therefore kept exactly as they were.
        utterance_stats.accumulate_segment_stats(stage_segments, stats_label)

        for i, x in enumerate(stage_segments):
            _global_logger.debug(
                log_format, i, x.debug_info(False) if log_debug_info else x)

        if args.verbose > 4:
            print(verbose_header, file=sys.stderr)
            # print_debug_info_for_utterance modifies the lines it is given,
            # hence the copies.
            segments_copy = [x.copy() for x in stage_segments]
            print_debug_info_for_utterance(sys.stderr,
                                           copy.deepcopy(split_lines_of_utt),
                                           segments_copy, [])

    utterance_stats.num_utterances += 1

    segment_ranges = compute_segment_cores(split_lines_of_utt)

    # End time of the utterance = start + duration of its last line.
    utterance_end_time = (float(split_lines_of_utt[-1][2])
                          + float(split_lines_of_utt[-1][3]))
    utterance_stats.total_length_of_utterances += utterance_end_time

    segments = [Segment(split_lines_of_utt, x[0], x[1])
                for x in segment_ranges]
    report_stage(segments, 'stage  0 [segment cores]',
                 "stage 0: segment %d = %s",
                 "Stage 0 [segment cores]:", log_debug_info=False)

    for segment in segments:
        segment.possibly_add_tainted_lines()
    report_stage(segments, 'stage  1 [add tainted lines]',
                 "stage 1: segment %d = %s",
                 "Stage 1 [add tainted lines]:", log_debug_info=False)

    segments = merge_segments(segments, args)
    report_stage(segments, 'stage  2 [merge segments]',
                 "stage 2: segment %d = %s",
                 "Stage 2 [merge segments]:", log_debug_info=False)

    new_segments = []
    for s in segments:
        new_segments += s.possibly_split_segment(
            args.max_internal_silence_length,
            args.max_internal_non_scored_length)
    segments = new_segments
    report_stage(segments, 'stage  3 [split segments]',
                 "stage 3: segment %d, %s",
                 "Stage 3 [split segments]:")

    new_segments = []
    for s in segments:
        new_segments += s.possibly_split_long_segment(
            args.max_segment_length_for_splitting,
            args.hard_max_segment_length,
            args.min_silence_length_to_split,
            args.min_non_scored_length_to_split)
    segments = new_segments
    report_stage(segments, 'stage  4 [split long segments]',
                 "stage 4: segment %d, %s",
                 "Stage 4 [split long segments]:")

    for s in segments:
        s.possibly_truncate_boundaries(args.max_edge_silence_length,
                                       args.max_edge_non_scored_length)
    report_stage(segments, 'stage  5 [truncate boundaries]',
                 "stage 5: segment %d = %s",
                 "Stage 5 [truncate boundaries]:")

    for s in segments:
        s.relax_boundary_truncation(args.min_segment_length,
                                    args.min_new_segment_length)
    report_stage(segments, 'stage  6 [relax boundary truncation]',
                 "stage 6: segment %d = %s",
                 "Stage 6 [relax boundary truncation]:")

    for s in segments:
        s.possibly_add_unk_padding(args.unk_padding)
    report_stage(segments, 'stage  7 [unk-padding]',
                 "stage 7: segment %d = %s",
                 "Stage 7 [unk-padding]:")

    deleted_segments = []
    new_segments = []
    for s in segments:
        # the 0.999 allows for roundoff error.
        if (not s.is_whole_utterance()
                and s.length() < 0.999 * args.min_new_segment_length):
            s.debug_str += '[deleted-because-of--min-new-segment-length]'
            deleted_segments.append(s)
        else:
            new_segments.append(s)
    segments = new_segments
    report_stage(
        segments,
        'stage  8 [remove new segments under --min-new-segment-length',
        "stage 8: segment %d = %s",
        "Stage 8 [remove new segments under "
        "--min-new-segment-length]:")

    new_segments = []
    for s in segments:
        # the 0.999 allows for roundoff error.
        if s.length() < 0.999 * args.min_segment_length:
            s.debug_str += '[deleted-because-of--min-segment-length]'
            deleted_segments.append(s)
        else:
            new_segments.append(s)
    segments = new_segments
    report_stage(segments,
                 'stage  9 [remove segments under --min-segment-length]',
                 "stage 9: segment %d = %s",
                 "Stage 9 [remove segments under "
                 "--min-segment-length]:")

    new_segments = []
    for s in segments:
        if s.contains_atleast_one_scored_non_oov_word():
            new_segments.append(s)
        else:
            s.debug_str += '[deleted-because-no-scored-non-oov-words]'
            deleted_segments.append(s)
    segments = new_segments
    # The verbose header for this stage used to be cut off before its
    # closing "]:"; it now matches the format of the other stages.
    report_stage(segments,
                 'stage 10 [remove segments without scored,non-OOV words]',
                 "stage 10: segment %d = %s",
                 "Stage 10 [remove segments without scored, "
                 "non-OOV words]:")

    for i in range(len(segments) - 1):
        if segments[i].end_time() > segments[i + 1].start_time():
            # this just adds something to --ctm-edits-out output
            segments[i + 1].debug_str += ",overlaps-previous-segment"

    if len(segments) == 0:
        utterance_stats.num_utterances_without_segments += 1

    return (segments, deleted_segments)


def float_to_string(f):
    """ this prints a number with a certain number of digits after the point,
    while removing trailing zeros.

    The precision handed to the '%g' conversion is 6 plus one for every
    factor of ten that has to be divided out of f to bring its magnitude
    down to 1.0 or below.  Infinite values are formatted directly (giving
    'inf' / '-inf'), because scaling them down would never terminate.
    """
    num_digits = 6  # we want to print 6 digits after the point
    g = f
    # An infinity stays infinite however often it is multiplied by 0.1, so
    # the scaling loop must be skipped for it.
    if abs(g) != float('inf'):
        while abs(g) > 1.0:
            g *= 0.1
            num_digits += 1
    format_str = '%.{0}g'.format(num_digits)
    return format_str % f


def time_to_string(time, frame_length):
    """ Format 'time' as a string after snapping it to the nearest whole
    multiple of frame_length (e.g. 0.01).
    """
    num_frames = round(time / frame_length)
    assert num_frames >= 0
    snapped_time = num_frames * frame_length
    # float_to_string drops trailing zeros, so that e.g. 0.01 comes out as
    # 0.01 rather than 0.0099999999999999.  A simpler approach does not seem
    # possible without assuming frame_length is of the form 10^-n, which we
    # would rather not do.
    return float_to_string(snapped_time)


def write_segments_for_utterance(text_output_handle, segments_output_handle,
                                 old_utterance_name, segments, oov_symbol,
                                 eps_symbol="<eps>", frame_length=0.01):
    """Write one 'text' line and one 'segments' line per segment.

    New utterance ids are the old id followed by '-' and the 1-based segment
    number, zero-padded to the width of the segment count (foo-bar-1,
    foo-bar-2, ...).
    """
    pad_width = len(str(len(segments)))
    for position, segment in enumerate(segments, 1):
        new_utterance_name = "{old}-{index:0{width}}".format(
            old=old_utterance_name, index=position, width=pad_width)

        # text output:  <new-utterance-id> <text>
        # e.g.          foo-bar-1 hello this is dan
        transcript = segment.text(oov_symbol, eps_symbol)
        print(new_utterance_name, transcript, file=text_output_handle)

        # segments output:
        #   <new-utterance-id> <old-utterance-id> <start-time> <end-time>
        # e.g.  foo-bar-1 foo-bar 5.1 7.2
        start_string = time_to_string(segment.start_time(), frame_length)
        end_string = time_to_string(segment.end_time(), frame_length)
        print(new_utterance_name, old_utterance_name,
              start_string, end_string,
              file=segments_output_handle)


# Note: this is destructive of 'segments_for_utterance', but it won't matter.
def print_debug_info_for_utterance(ctm_edits_out_handle,
                                   split_lines_of_cur_utterance,
                                   segments_for_utterance,
                                   deleted_segments_for_utterance,
                                   frame_length=0.01):
    """Print the ctm-edits lines of one utterance, annotated with the start
    and end markers of its kept and deleted segments.

    The first field of every line in split_lines_of_cur_utterance is modified
    in place: an index such as [0], [1] is appended to the utterance-id.
    """
    # 'markers' collects 2-tuples (time, label) for every segment boundary.
    markers = []
    for number, segment in enumerate(segments_for_utterance, 1):
        label = 'start-segment-{0}[{1}]'.format(number, segment.debug_info())
        markers.append((segment.start_time(), label))
        label = 'end-segment-{0}'.format(number)
        markers.append((segment.end_time(), label))
    # Deleted segments get the same treatment, with labels of the form
    # start-deleted-segment-1 / end-deleted-segment-1.
    for number, segment in enumerate(deleted_segments_for_utterance, 1):
        label = 'start-deleted-segment-{0}[{1}]'.format(
            number, segment.debug_info(False))
        markers.append((segment.start_time(), label))
        label = 'end-deleted-segment-{0}'.format(number)
        markers.append((segment.end_time(), label))

    markers.sort()
    next_marker = 0  # index of the first marker not yet printed

    for line_index, split_line in enumerate(split_lines_of_cur_utterance):
        # tag the utterance-id with the line index, so that segment indexes
        # are easy to look up.
        split_line[0] += '[{0}]'.format(line_index)
        line_end_time = float(split_line[2]) + float(split_line[3])
        fields = list(split_line)
        # attach every pending marker whose time is not after the end of
        # this line, as a field like 'start-segment1[...]=3.21'.
        while (next_marker < len(markers)
               and markers[next_marker][0] <= line_end_time):
            marker_time, label = markers[next_marker]
            next_marker += 1
            fields.append(
                '{0}={1}'.format(label,
                                 time_to_string(marker_time, frame_length)))
        print(' '.join(fields), file=ctm_edits_out_handle)


class WordStats(object):
    """
    Accumulates, for every reference word, how often it occurs and how often
    it falls outside all segments.  Words that are frequently left out of
    segments will generally be associated with some kind of error (there is
    a higher probability of having a wrong lexicon entry).
    """
    def __init__(self):
        # word_count_pair maps a word (string) to the list
        # [total-count, count-not-within-segments].
        self.word_count_pair = defaultdict(lambda: [0, 0])

    def accumulate_for_utterance(self, split_lines_of_utt,
                                 segments_for_utterance,
                                 eps_symbol="<eps>"):
        """Add the counts of one utterance; lines whose reference word is
        eps_symbol are not counted."""
        # covered[i] is True iff line i lies inside some segment.
        covered = [False] * len(split_lines_of_utt)
        for segment in segments_for_utterance:
            for index in range(segment.start_index, segment.end_index):
                covered[index] = True

        for is_covered, fields in zip(covered, split_lines_of_utt):
            ref_word = fields[6]
            if ref_word == eps_symbol:
                continue
            counts = self.word_count_pair[ref_word]
            counts[0] += 1
            if not is_covered:
                counts[1] += 1

    def print(self, word_stats_out):
        """Write '<word> <badness> <total-count>' lines to word_stats_out,
        most problematic words first, and then close word_stats_out."""

        def problem_score(item):
            # With badness = excluded / total, rank on
            # badness^3 * total = excluded^3 / total^2, which favours words
            # that are often excluded but also high-count words.
            counts = item[1]
            return (counts[1] ** 3) * 1.0 / (counts[0] ** 2)

        ranked = sorted(self.word_count_pair.items(),
                        key=problem_score, reverse=True)
        for word, counts in ranked:
            total_count = counts[0]
            badness = counts[1] * 1.0 / total_count
            print(word, badness, total_count, file=word_stats_out)

        try:
            word_stats_out.close()
        except:
            _global_logger.error("error closing file --word-stats-out=%s "
                                 "(full disk?)", word_stats_out.name)
            raise

        _global_logger.info(
            """please see the file %s for word-level
            statistics saying how frequently each word was excluded for a
            segment; format is <word> <proportion-of-time-excluded>
            <total-count>.  Particularly problematic words appear near the top
            of the file.""", word_stats_out.name)


def process_data(args, oov_symbol, utterance_stats, word_stats):
    """
    Read args.ctm_edits_in line by line, group consecutive lines that share
    the same first field (the utterance id), and for each complete group call
    get_segments_for_utterance() and write the results.

    Exits via sys.exit() on empty input or on an empty / whitespace-only
    input line.
    """
    line = args.ctm_edits_in.readline()
    if line == '':
        sys.exit("segment_ctm_edits.py: empty input")
    pending_fields = line.split()
    if not pending_fields:
        sys.exit("segment_ctm_edits.py: bad input line " + line)
    cur_utterance = pending_fields[0]
    lines_of_cur_utterance = []

    while True:
        try:
            # An empty pending line means end of input was reached.
            reached_end = not pending_fields
            if reached_end or pending_fields[0] != cur_utterance:
                # A whole utterance has been collected; process it.
                (segments_for_utterance,
                 deleted_segments_for_utterance) = get_segments_for_utterance(
                     lines_of_cur_utterance, args=args,
                     utterance_stats=utterance_stats)
                word_stats.accumulate_for_utterance(
                    lines_of_cur_utterance, segments_for_utterance)
                write_segments_for_utterance(
                    args.text_out, args.segments_out, cur_utterance,
                    segments_for_utterance, oov_symbol=oov_symbol,
                    frame_length=args.frame_length)
                if args.ctm_edits_out is not None:
                    print_debug_info_for_utterance(
                        args.ctm_edits_out, lines_of_cur_utterance,
                        segments_for_utterance, deleted_segments_for_utterance,
                        frame_length=args.frame_length)

                lines_of_cur_utterance = []
                if reached_end:
                    break
                cur_utterance = pending_fields[0]

            lines_of_cur_utterance.append(pending_fields)
            line = args.ctm_edits_in.readline()
            pending_fields = line.split()
            # readline() returns '' only at end of file; any other line that
            # splits to nothing is blank or whitespace-only.
            if not pending_fields and line != '':
                sys.exit("segment_ctm_edits.py: got an "
                         "empty or whitespace input line")
        except Exception:
            _global_logger.error(
                "Error with utterance %s", cur_utterance)
            raise


def read_non_scored_words(non_scored_words_file):
    """Load the non-scored words into the module-level set.

    Every line of the open file object must hold exactly one
    whitespace-delimited token, which is added to
    _global_non_scored_words.  The file is closed once all lines have been
    consumed.

    Raises:
        RuntimeError: if a line does not contain exactly one token (the
            file is left open in that case).
    """
    all_lines = non_scored_words_file.readlines()
    for raw_line in all_lines:
        fields = raw_line.split()
        if len(fields) != 1:
            message = ("segment_ctm_edits.py: bad line in non-scored-words "
                       "file {0}: {1}").format(non_scored_words_file,
                                               raw_line)
            raise RuntimeError(message)
        (word,) = fields
        _global_non_scored_words.add(word)
    non_scored_words_file.close()


class UtteranceStats(object):
    """Accumulates per-stage segment counts/lengths and utterance totals.

    The utterance-level counters (num_utterances,
    num_utterances_without_segments, total_length_of_utterances) are plain
    attributes; nothing in this class updates them, so callers are expected
    to do so before print_segment_stats() is called.
    """

    def __init__(self):
        # segment_total_length and num_segments are maps from
        # 'stage' strings; see accumulate_segment_stats for details.
        self.segment_total_length = defaultdict(int)
        self.num_segments = defaultdict(int)
        self.num_utterances = 0
        self.num_utterances_without_segments = 0
        self.total_length_of_utterances = 0

    @staticmethod
    def _percent(numerator, denominator):
        """Return numerator as a percentage of denominator (0.0 if the
        denominator is zero, so that empty statistics can still be
        reported instead of raising ZeroDivisionError)."""
        if denominator == 0:
            return 0.0
        return numerator * 100.0 / denominator

    def accumulate_segment_stats(self, segment_list, text):
        """
        Here, 'text' will be something that indicates the stage of processing,
        e.g. 'Stage 0: segment cores', 'Stage 1: add tainted lines', etc.

        Each element of segment_list must provide a length() method; its
        value is added to the running total for the stage 'text'.
        """
        for segment in segment_list:
            self.num_segments[text] += 1
            self.segment_total_length[text] += segment.length()

    def print_segment_stats(self):
        """Log the utterance totals, then one line per stage (in sorted key
        order) with the segment count, the stage's total length as a
        percentage of the original total and, from the second stage on, the
        change relative to the previous stage."""
        _global_logger.info(
            """Number of utterances is %d, of which %.2f%% had no segments
            after all processing; total length of data in original utterances
            (in seconds) was %d""",
            self.num_utterances,
            self._percent(self.num_utterances_without_segments,
                          self.num_utterances),
            self.total_length_of_utterances)

        keys = sorted(self.segment_total_length.keys())
        for i, key in enumerate(keys):
            if i > 0:
                delta_percentage = '[%+.2f%%]' % self._percent(
                    self.segment_total_length[key]
                    - self.segment_total_length[keys[i - 1]],
                    self.total_length_of_utterances)
            _global_logger.info(
                'At %s, num-segments is %d, total length %.2f%% of '
                'original total %s',
                key, self.num_segments[key],
                self._percent(self.segment_total_length[key],
                              self.total_length_of_utterances),
                delta_percentage if i > 0 else '')


def main():
    """Script entry point.

    Reads the non-scored words and (optionally) the OOV symbol, runs
    process_data() over the ctm-edits input, closes the outputs and then
    reports utterance and word statistics.  Any failure is logged and
    re-raised; the outputs are closed in all cases.
    """
    global _global_non_scored_words

    args = get_args()

    def close_outputs():
        """Close every output stream; log and re-raise if that fails.

        Closing an already-closed file is a no-op, so this is safe to call
        both on the success path and again from the 'finally' clause.
        """
        try:
            args.text_out.close()
            args.segments_out.close()
            if args.ctm_edits_out is not None:
                args.ctm_edits_out.close()
        except BaseException:
            _global_logger.error("error closing one or more outputs "
                                 "(broken pipe or full disk?)")
            raise

    try:
        _global_non_scored_words = set()
        read_non_scored_words(args.non_scored_words_in)

        oov_symbol = None
        if args.oov_symbol_file is not None:
            try:
                # The file must consist of exactly one line holding exactly
                # one token.  Explicit checks are used rather than 'assert'
                # so that the validation survives 'python -O'.
                fields = args.oov_symbol_file.readline().split()
                if len(fields) != 1:
                    raise ValueError(
                        "expected exactly one symbol on the first line")
                oov_symbol = fields[0]
                if args.oov_symbol_file.readline() != '':
                    raise ValueError("expected exactly one line")
                args.oov_symbol_file.close()
            except Exception:
                _global_logger.error("error reading file "
                                     "--oov-symbol-file=%s",
                                     args.oov_symbol_file.name)
                raise
        elif args.unk_padding != 0.0:
            raise ValueError(
                "if the --unk-padding option is nonzero (which "
                "it is by default), "
                "the --oov-symbol-file option must be supplied.")

        utterance_stats = UtteranceStats()
        word_stats = WordStats()
        process_data(args,
                     oov_symbol=oov_symbol, utterance_stats=utterance_stats,
                     word_stats=word_stats)

        # Close before printing statistics so that a write failure is
        # reported before any summary is produced.
        close_outputs()

        utterance_stats.print_segment_stats()
        if args.word_stats_out is not None:
            word_stats.print(args.word_stats_out)
        if args.ctm_edits_out is not None:
            _global_logger.info("detailed utterance-level debug information "
                                "is in %s", args.ctm_edits_out.name)
    except BaseException:
        # Deliberately broad (equivalent to a bare 'except'): sys.exit()
        # calls made while processing are logged here too, then re-raised.
        _global_logger.error("Failed segmenting CTM edits")
        raise
    finally:
        close_outputs()


# Run main() only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()


================================================
FILE: egs/steps/cleanup/internal/split_text_into_docs.pl
================================================
#! /usr/bin/perl

# Copyright 2017  Vimal Manohar
# Apache 2.0.

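# Splits every utterance whose transcript has more than --max-words words
# into several shorter "documents"; utterances that already fit are copied
# through unchanged under their own id.
#
# If 'text' contains:
#  utterance1 A B C D E
#  utterance2 C B
#  and you ran:
#  split_text_into_docs.pl --max-words 2 text doc2text docs
#  then 'doc2text' would contain (document suffixes count from 0):
#  utterance1-0 utterance1
#  utterance1-1 utterance1
#  utterance1-2 utterance1
#  utterance2 utterance2
#  and 'docs' would contain:
#  utterance1-0 A B
#  utterance1-1 C D
#  utterance1-2 E
#  utterance2 C B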

use warnings;
use strict;

# Maximum number of words allowed in one output document; can be overridden
# with --max-words.
my $max_words = 1000;

my $usage = "Usage: steps/cleanup/internal/split_text_into_docs.pl [--max-words <int>] text doc2text docs\n";

# Consume leading options until only the three positional arguments remain.
while (@ARGV > 3) {
    if ($ARGV[0] eq "--max-words") {
        shift @ARGV;
        $max_words = shift @ARGV;
    } else {
        print STDERR "$usage";
        exit (1);
    }
}

if (scalar @ARGV != 3) {
  print STDERR "$usage";
  exit (1);
}

# The splitting code divides by $max_words: a value of 0 would die with
# "Illegal division by zero" and a negative one would silently drop
# utterances, so reject anything that is not a positive integer up front.
if ($max_words !~ /^\d+$/ || $max_words < 1) {
  print STDERR "$0: --max-words must be a positive integer, got '$max_words'\n";
  print STDERR "$usage";
  exit (1);
}

# Returns the numerically smaller of its two arguments (the first one when
# they compare equal).
sub min ($$) {
  my ($first, $second) = @_;
  return $first > $second ? $second : $first;
}

# TEXT:     input transcripts, one "<utt-id> <word1> <word2> ..." per line.
# DOC2TEXT: output map from each document id to the utterance it came from.
# DOCS:     output documents, one "<doc-id> <word1> <word2> ..." per line.
open TEXT, $ARGV[0] or die "$0: Could not open file $ARGV[0] for reading\n";
open DOC2TEXT, ">", $ARGV[1] or die "$0: Could not open file $ARGV[1] for writing\n";
open DOCS, ">", $ARGV[2] or die "$0: Could not open file $ARGV[2] for writing\n";

while (<TEXT>) {
  chomp;
  my @F = split;
  my $utt = shift @F;
  # A blank line has no utterance id; skip it instead of writing junk.
  next if !defined $utt;
  my $num_words = scalar @F;

  # Short enough already: copy the line through under its own id.
  if ($num_words  <= $max_words) {
    print DOCS "$_\n";
    print DOC2TEXT "$utt $utt\n";
    next;
  }

  # $num_docs is strictly greater than $num_words / $max_words, so the
  # per-document length computed below never exceeds $max_words.
  my $num_docs = int($num_words / $max_words) + 1;
  my $num_words_shift = int($num_words / $num_docs) + 1;
  my $words_per_doc = $num_words_shift;

  #print STDERR ("$utt num-words=$num_words num-docs=$num_docs words-per-doc=$words_per_doc\n");

  for (my $i = 0; $i < $num_docs; $i++) {
    my $st = $i*$num_words_shift;
    # Because the per-document length is rounded up, the words can run out
    # before $num_docs documents have been written (e.g. 4 words with
    # --max-words 2); stop rather than emit an empty document.
    last if $st >= $num_words;
    my $end = min($st + $words_per_doc, $num_words) - 1;
    print DOCS ("$utt-$i " . join(" ", @F[$st..$end]) . "\n");
    print DOC2TEXT "$utt-$i $utt\n";
  }
}

close(TEXT);
# Buffered write errors (e.g. a full disk) only show up on close.
close(DOC2TEXT) or die "$0: Error closing file $ARGV[1]\n";
close(DOCS) or die "$0: Error closing file $ARGV[2]\n";


================================================
FILE: egs/steps/cleanup/internal/stitch_documents.py
================================================
#! /usr/bin/env python

# Copyright 2016    Vimal Manohar
# Apache 2.0.

"""This script reads an archive of mapping from query to
documents and stitches the documents for each query into a
new document.
Here "document" is just a list of words.

query2docs is a mapping from query-id to a list of tuples
(document-id, start-fraction, end-fraction)
The tuple can be just the document-id, which is equivalent to
specifying a start-fraction and end-fraction of 1.0.
The start and end fractions are used to stitch only a part of the
document to the retrieved set for the query.

e.g.
query1 doc1 doc2
query2 doc1,0,0.3 doc2,1,1

input-documents
doc1 A B C
doc2 D E
output-documents
query1 A B C D E
query2 C D E
"""

from __future__ import print_function
import argparse
import logging

# Module logger plus a single stream handler (a StreamHandler created with
# no argument) that only emits records of level INFO and above.
logger = logging.getLogger(__name__)
handler = logging.StreamHandler()
handler.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - "
                              "%(funcName)s - %(levelname)s ] %(message)s")
handler.setFormatter(formatter)

# Attach the same handler to this module's logger and to the 'libs' logger;
# the loggers themselves accept DEBUG, so filtering is done by the handler.
for l in [logger, logging.getLogger('libs')]:
    l.setLevel(logging.DEBUG)
    l.addHandler(handler)


def get_args():
    """Parse the command line and return the resulting namespace.

    The three file options are opened by argparse itself.  The
    --check-sorted-docs-per-query option arrives as the string "true" or
    "false" and is replaced by the corresponding bool before returning.
    """
    description = """This script reads an archive of mapping from query to
        documents and stitches the documents for each query into a new
        document."""
    query2docs_help = """Input file containing an archive
                        of list of documents indexed by a query document
                        id."""
    input_documents_help = """Input file containing the documents
                        indexed by the document id."""
    output_documents_help = """Output documents indexed by the query
                        document-id, obtained by stitching input documents
                        corresponding to the query."""
    check_sorted_help = ("If specified, the script will expect "
                         "the document ids in --query2docs to be "
                         "sorted.")

    readable = argparse.FileType('r')
    writable = argparse.FileType('w')

    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("--query2docs", type=readable, required=True,
                        help=query2docs_help)
    parser.add_argument("--input-documents", type=readable, required=True,
                        help=input_documents_help)
    parser.add_argument("--output-documents", type=writable, required=True,
                        help=output_documents_help)
    parser.add_argument("--check-sorted-docs-per-query", type=str,
                        choices=["true", "false"], default="false",
                        help=check_sorted_help)

    parsed = parser.parse_args()
    parsed.check_sorted_docs_per_query = (
        parsed.check_sorted_docs_per_query == "true")
    return parsed


def run(args):
    """Stitch documents together for every query and write the results.

    Reads args.input_documents ("<doc-id> <word> ..." per line) into memory
    and closes it, then for each line of args.query2docs
    ("<query-id> <doc-info> ...") writes "<query-id> <word> ..." to
    args.output_documents.

    A <doc-info> is either a bare document id (the whole document is used)
    or "<doc-id>,<start-fraction>,<end-fraction>".  If either fraction is
    1.0 the whole document is used; otherwise the leading start-fraction
    and the trailing end-fraction of the document's words are used.

    Blank lines in either input are ignored.

    Raises:
        RuntimeError: if args.check_sorted_docs_per_query is set and the
            document ids of a query are not strictly increasing, or if the
            two fractions of a partial document sum to 1.0 or more.
        KeyError: if a query refers to an unknown document id.
    Any exception raised while handling a query line is logged with the
    offending line and re-raised.
    """
    documents = {}
    for line in args.input_documents:
        parts = line.split()
        if not parts:
            continue
        documents[parts[0]] = parts[1:]
    args.input_documents.close()

    for line in args.query2docs:
        try:
            parts = line.split()
            if not parts:
                continue
            query = parts[0]
            document_infos = parts[1:]

            output_document = []
            prev_doc_id = ''
            for doc_info in document_infos:
                try:
                    doc_id, start_fraction, end_fraction = doc_info.split(',')
                    start_fraction = float(start_fraction)
                    end_fraction = float(end_fraction)
                except ValueError:
                    # Not of the form id,start,end: treat the whole token
                    # as a document id and take the entire document.
                    doc_id = doc_info
                    start_fraction = 1.0
                    end_fraction = 1.0

                if args.check_sorted_docs_per_query:
                    if prev_doc_id != '':
                        if doc_id <= prev_doc_id:
                            raise RuntimeError(
                                "Documents not sorted and "
                                "--check-sorted-docs-per-query was True; "
                                "{0} <= {1}".format(doc_id, prev_doc_id))
                    prev_doc_id = doc_id

                doc = documents[doc_id]
                num_words = len(doc)

                if start_fraction == 1.0 or end_fraction == 1.0:
                    output_document.extend(doc)
                else:
                    if start_fraction + end_fraction >= 1.0:
                        raise RuntimeError(
                            "start-fraction + end-fraction must be less "
                            "than 1.0 for partial document "
                            "{0}".format(doc_info))
                    if start_fraction > 0:
                        output_document.extend(
                            doc[0:int(start_fraction * num_words)])
                    if end_fraction > 0:
                        # end_fraction is the share of the document to keep
                        # from its END, so the slice must start at
                        # (1 - end_fraction) of the way through.
                        output_document.extend(
                            doc[int((1.0 - end_fraction) * num_words):])

            print("{0} {1}".format(query, " ".join(output_document)),
                  file=args.output_documents)
        except Exception:
            logger.error("Error processing line %s in file %s", line,
                         args.query2docs.name)
            raise


def main():
    """Script entry point: parse arguments, run the stitching and make sure
    all three files are closed.

    Any exception raised by run() is logged with its traceback and turned
    into exit status 1.
    """
    args = get_args()

    try:
        run(args)
    except Exception:
        # 'Exception' rather than a bare except, so that Ctrl-C is not
        # reported as a stitching failure.
        logger.error("Failed to stitch document; got error ",
                     exc_info=True)
        raise SystemExit(1)
    finally:
        for f in [args.query2docs, args.input_documents,
                  args.output_documents]:
            if f is not None:
                f.close()


# Run main() only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()


================================================
FILE: egs/steps/cleanup/internal/taint_ctm_edits.py
================================================
#!/usr/bin/env python3

# Copyright 2016   Vimal Manohar
#           2016   Johns Hopkins University (author: Daniel Povey)
# Apache 2.0

from __future__ import print_function
import sys, operator, argparse, os
from collections import defaultdict

import io
# Replace sys.stderr with a text wrapper around the same underlying byte
# buffer that always encodes what is written to it as UTF-8.
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding="utf8")


# This script reads and writes the 'ctm-edits' file that is
# produced by get_ctm_edits.py.
#
# It is to be applied after modify_ctm_edits.py.  Its function is to add, in
# certain circumstances, an optional extra field with the word 'tainted' to the
# ctm-edits format, e.g an input line like:
#
# AJJacobs_2007P-0001605-0003029 1 0 0.09 <eps> 1.0 <eps> sil
# might become:
# AJJacobs_2007P-0001605-0003029 1 0 0.09 <eps> 1.0 <eps> sil tainted
#
# It also deletes certain lines, representing deletions, from the ctm (if they
# were next to taintable lines... their presence could then be inferred from the
# 'tainted' flag).
#
# You should interpret the 'tainted' flag as "we're not sure what's going on here;
# don't trust this."
#
# One of the problems this script is trying to solve is that if we have errors
# that are adjacent to silence or non-scored words,
# it's not at all clear whether the silence or non-scored words were really such,
# or might have contained actual words.
# Also, if we have words in the reference that were realized as '<unk>' in the
# hypothesis, and they are adjacent to errors, it's almost always the case
# that the '<unk>' doesn't really correspond to the word in the reference, so
# we mark these as 'tainted'.
#
# The rule for tainting is quite simple; see the code.


# Command-line interface.  Parsing happens at module level, so 'args' is
# available as a global to every function in this script.
parser = argparse.ArgumentParser(
    description=(
        "This program modifies the ctm-edits format to identify "
        "silence and 'fixed' non-scored-word lines, and lines where the hyp is "
        "<unk> and the reference is a real but OOV word, where there is a relatively "
        "high probability that something is going wrong so we shouldn't trust "
        "this line.  It adds the field 'tainted' to such "
        "lines.  Lines in the ctm representing deletions from the reference will "
        "be removed if they have 'tainted' adjacent lines (since it won't be clear "
        "where such reference words were really realized, if at all). "
        "See comments at the top of the script for more information."))

parser.add_argument(
    "--verbose", type=int, default=1, choices=[0, 1, 2, 3],
    help="Verbose level, higher = more verbose output")
parser.add_argument(
    "--remove-deletions", type=str, default="true",
    choices=["true", "false"],
    help="Remove deletions next to taintable lines")
parser.add_argument(
    "ctm_edits_in", metavar="<ctm-edits-in>",
    help=("Filename of input ctm-edits file. "
          "Use /dev/stdin for standard input."))
parser.add_argument(
    "ctm_edits_out", metavar="<ctm-edits-out>",
    help=("Filename of output ctm-edits file. "
          "Use /dev/stdout for standard output."))

args = parser.parse_args()
# Turn the "true"/"false" option string into a real boolean.
args.remove_deletions = (args.remove_deletions == "true")


# Core of the program: taints lines and drops some deletion lines.
# split_lines_of_utt holds one list of fields per ctm-edits line of a single
# utterance.  The return value has the same layout, with 'tainted' appended
# to the affected lines and (when remove_deletions is true) without the
# deletion lines that caused tainting.  NOTE: the input lists are modified
# in place, so the argument must not be relied upon afterwards.
def ProcessUtterance(split_lines_of_utt, remove_deletions=True):
    global num_lines_of_type, num_tainted_lines, \
           num_del_lines_giving_taint, num_sub_lines_giving_taint, \
           num_ins_lines_giving_taint

    total = len(split_lines_of_utt)

    # A line is taintable if it is silence, a 'fix', or a 'cor' line whose
    # hypothesis and reference words differ (i.e. <unk> standing in for a
    # real out-of-vocabulary word; the alignment of such words cannot be
    # trusted next to an error).
    taintable = []
    for fields in split_lines_of_utt:
        kind = fields[7]
        is_sil_or_fix = kind in ('sil', 'fix')
        is_unk_for_real_word = kind == 'cor' and fields[4] != fields[6]
        taintable.append(is_sil_or_fix or is_unk_for_real_word)

    for i, fields in enumerate(split_lines_of_utt):
        edit_type = fields[7]
        num_lines_of_type[edit_type] += 1
        if edit_type not in ('del', 'sub', 'ins'):
            continue

        # Collect the unbroken run of taintable lines just before this
        # error (walking backwards), then the run just after it.
        neighbours = []
        for j in range(i - 1, -1, -1):
            if not taintable[j]:
                break
            neighbours.append(j)
        for j in range(i + 1, total):
            if not taintable[j]:
                break
            neighbours.append(j)

        for j in neighbours:
            neighbour = split_lines_of_utt[j]
            # Exactly 8 fields means the line has not been tainted yet.
            if len(neighbour) == 8:
                num_tainted_lines += 1
                neighbour.append('tainted')

        if not neighbours:
            continue
        if edit_type == 'del':
            if remove_deletions:
                fields[7] = 'remove-this-line'
            num_del_lines_giving_taint += 1
        elif edit_type == 'sub':
            num_sub_lines_giving_taint += 1
        else:
            num_ins_lines_giving_taint += 1

    return [fields for fields in split_lines_of_utt
            if not remove_deletions or fields[7] != 'remove-this-line']


def ProcessData():
    """Read the input ctm-edits file, process it utterance by utterance
    and write the result.

    Input lines are grouped by their first field (the utterance id); each
    group is passed to ProcessUtterance() and the returned lines are
    written to the output file.  The file names come from the module-level
    'args'.  Exits the program (sys.exit) with a message if a file cannot
    be opened or closed, if the input is empty, or if it contains an empty
    or whitespace-only line.
    """
    try:
        f_in = open(args.ctm_edits_in, encoding="utf8")
    except (OSError, ValueError):
        sys.exit("taint_ctm_edits.py: error opening ctm-edits input "
                 "file {0}".format(args.ctm_edits_in))
    try:
        f_out = open(args.ctm_edits_out, 'w', encoding="utf8")
    except (OSError, ValueError):
        f_in.close()
        sys.exit("taint_ctm_edits.py: error opening ctm-edits output "
                 "file {0}".format(args.ctm_edits_out))

    # Most of what we're doing in the lines below is splitting the input lines
    # and grouping them per utterance, before giving them to ProcessUtterance()
    # and then printing the modified lines.
    with f_in:  # make sure the input is closed however we leave this block
        first_line = f_in.readline()
        if first_line == '':
            sys.exit("taint_ctm_edits.py: empty input")
        split_pending_line = first_line.split()
        if len(split_pending_line) == 0:
            sys.exit("taint_ctm_edits.py: bad input line " + first_line)
        cur_utterance = split_pending_line[0]
        split_lines_of_cur_utterance = []

        while True:
            # An empty pending line means end of input; a different first
            # field means a new utterance has started.  Either way the
            # lines collected so far form one complete utterance.
            if (len(split_pending_line) == 0
                    or split_pending_line[0] != cur_utterance):
                split_lines_of_cur_utterance = ProcessUtterance(
                    split_lines_of_cur_utterance, args.remove_deletions)
                for split_line in split_lines_of_cur_utterance:
                    print(' '.join(split_line), file=f_out)
                split_lines_of_cur_utterance = []
                if len(split_pending_line) == 0:
                    break
                else:
                    cur_utterance = split_pending_line[0]

            split_lines_of_cur_utterance.append(split_pending_line)
            next_line = f_in.readline()
            split_pending_line = next_line.split()
            if len(split_pending_line) == 0:
                # readline() returns '' only at end of file; anything else
                # that splits to nothing is a blank line in the input.
                if next_line != '':
                    sys.exit("taint_ctm_edits.py: got an empty or whitespace input line")
    try:
        f_out.close()
    except OSError:
        sys.exit("taint_ctm_edits.py: error closing ctm-edits output "
                 "(broken pipe or full disk?)")

def PrintNonScoredStats():
    """Print, to stderr, how many ctm lines had their reference changed for
    non-scored words, followed by the most common such edits.

    Prints nothing if args.verbose < 1.

    NOTE(review): this function reads the module-level names num_lines,
    num_correct_lines and ref_change_stats, none of which is defined in
    this script, and nothing in this script calls it.  It looks like it was
    carried over from a sibling script -- confirm before relying on it.
    """
    if args.verbose < 1:
        return
    if num_lines == 0:
        print("taint_ctm_edits.py: processed no input.", file=sys.stderr)
        # Nothing else to report; the percentages below would divide by 0.
        return
    num_lines_modified = sum(ref_change_stats.values())
    num_incorrect_lines = num_lines - num_correct_lines
    percent_lines_incorrect = '%.2f' % (num_incorrect_lines * 100.0 / num_lines)
    percent_modified = '%.2f' % (num_lines_modified * 100.0 / num_lines)
    percent_of_incorrect_modified = '%.2f' % (
        num_lines_modified * 100.0 / num_incorrect_lines
        if num_incorrect_lines != 0 else 0.0)
    print("taint_ctm_edits.py: processed {0} lines of ctm ({1}% of which incorrect), "
          "of which {2} were changed fixing the reference for non-scored words "
          "({3}% of lines, or {4}% of incorrect lines)".format(
              num_lines, percent_lines_incorrect, num_lines_modified,
              percent_modified, percent_of_incorrect_modified),
          file=sys.stderr)

    keys = sorted(ref_change_stats, reverse=True,
                  key=lambda x: ref_change_stats[x])
    num_keys_to_print = 40 if args.verbose >= 2 else 10

    # The conditional must be parenthesised: it only decides whether the
    # trailing '...' is added, not whether the message is printed at all.
    print("taint_ctm_edits.py: most common edits (as percentages "
          "of all such edits) are:\n" +
          '\n'.join(['%s [%.2f%%]' % (k, ref_change_stats[k] * 100.0 / num_lines_modified)
                     for k in keys[0:num_keys_to_print]])
          + ('\n...' if num_keys_to_print < len(keys) else ''),
          file=sys.stderr)


def PrintStats():
    """Write a two-line summary of the run to stderr.

    The first line lists every edit type seen with its share of all input
    lines, most frequent first.  The second gives the share of lines that
    were deletions, substitutions or insertions which tainted a neighbour,
    and the share of lines that were tainted.  Nothing is printed if
    args.verbose < 1 or no lines were counted.
    """
    counts = num_lines_of_type
    tot_lines = sum(counts.values())
    if tot_lines == 0 or args.verbose < 1:
        return

    def as_percent(n):
        return n * 100.0 / tot_lines

    ranked_types = sorted(counts, key=counts.get, reverse=True)
    breakdown = ', '.join(
        '%s = %.2f%%' % (edit_type, as_percent(counts[edit_type]))
        for edit_type in ranked_types)
    header = ("taint_ctm_edits.py: processed {0} input lines, "
              "whose edit-types were: ").format(tot_lines)
    print(header + breakdown, file=sys.stderr)

    percentages = (as_percent(num_del_lines_giving_taint),
                   as_percent(num_sub_lines_giving_taint),
                   as_percent(num_ins_lines_giving_taint),
                   as_percent(num_tainted_lines))
    summary = ("taint_ctm_edits.py: as a percentage of all lines, (%.2f%%, %.2f%%, %.2f%%) were "
               "(deletions, substitutions, insertions) that tainted adjacent lines.  %.2f%% of all "
               "lines were tainted.") % percentages
    print(summary, file=sys.stderr)


# num_lines_of_type will map from line-type ('cor', 'sub', etc.) to count.
num_lines_of_type = defaultdict(int)
# Module-level counters, updated by ProcessUtterance() and reported by
# PrintStats(): lines that received the 'tainted' field, and the del / sub /
# ins lines that caused an adjacent line to be tainted.
num_tainted_lines = 0
num_del_lines_giving_taint = 0
num_sub_lines_giving_taint = 0
num_ins_lines_giving_taint = 0

# Script body: process the whole input, then print the summary statistics.
ProcessData()
PrintStats()


================================================
FILE: egs/steps/cleanup/internal/tf_idf.py
================================================
# Copyright 2016    Vimal Manohar
# Apache 2.0.

"""This module contains structures to accumulate, store and use stats
for Term-frequency and Inverse-document-frequency values.
"""

from __future__ import print_function
from __future__ import division
import logging
import math
import re
import sys

# Put the relative directory 'steps' at the front of the module search path.
sys.path.insert(0, 'steps')

# Name the logger after this module.  (The name must be the __name__
# variable, not the literal string '__name__', or every module doing this
# would share one oddly named logger.)  The NullHandler keeps the logger
# quiet unless the application configures logging itself.
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())


class IDFStats(object):
    """Stores stats for computing inverse-document-frequencies.

    NOTE(review): accumulate() increments num_docs on every call made with
    a one-word term, whereas read() increments it once per one-word line
    regardless of the count stored on that line, so a write()/read() round
    trip does not necessarily reproduce num_docs.  Confirm which of the two
    is intended before relying on absolute IDF values.
    """
    def __init__(self):
        # Maps a term (a tuple of words) to the count accumulated for it.
        self.num_docs_for_term = {}
        # The 'N' used in the IDF formulas; see the class note on how it is
        # maintained.
        self.num_docs = 0

    def get_inverse_document_frequency(self, term, weighting_scheme="log"):
        """Get IDF for a term.

        Weighting scheme is the function applied on the raw
        inverse-document frequencies n(t) = |d in D: t in d|
        when computing idf(t,d).
        Let N = self.num_docs.

        IDF weighting schemes (as implemented):-
        unary  : idf(t,D) = 1
        log    : idf(t,D) = log (N / (1 + n(t)))
        log-smoothed : idf(t,D) = log(1 + N / (1 + n(t)))
        probabilistic: idf(t,D) = log((N - n(t) - 1) / (1 + n(t)))

        The historical misspelling "probabilitic" is still accepted as an
        alias of "probabilistic".

        Raises:
            RuntimeError: if no stats have been accumulated or read.
            KeyError: if weighting_scheme is not one of the names above.
            ValueError: from math.log, if the argument of the logarithm is
                not positive (e.g. "probabilistic" with N - n(t) - 1 <= 0).
        """
        n_t = float(self.num_docs_for_term.get(term, 0))
        num_terms = len(self.num_docs_for_term)

        if num_terms == 0:
            raise RuntimeError("No IDF stats have been accumulated.")

        if weighting_scheme == "unary":
            return 1
        if weighting_scheme == "log":
            return math.log(float(self.num_docs) / (1.0 + n_t))
        if weighting_scheme == "log-smoothed":
            return math.log(1.0 + float(self.num_docs) / (1.0 + n_t))
        if weighting_scheme in ("probabilistic", "probabilitic"):
            return math.log((self.num_docs - n_t - 1) / (1.0 + n_t))
        # Fail loudly instead of silently returning None.
        raise KeyError("Unknown idf-weighting-scheme {0}".format(
            weighting_scheme))

    def accumulate(self, term):
        """Adds one count to the number of docs containing the term "term".
        """
        self.num_docs_for_term[term] = self.num_docs_for_term.get(term, 0) + 1
        if len(term) == 1:
            self.num_docs += 1

    def write(self, file_handle):
        """Writes the IDF stats to file using the format:
        <term-1> <term-2> ... <term-N> <num-docs>
        for n-gram (<term-1>, ... <term-N>)
        """
        for term, num in self.num_docs_for_term.items():
            if num == 0:
                continue
            assert isinstance(term, tuple)
            print ("{term} {n}".format(term=" ".join(term), n=num),
                   file=file_handle)

    def read(self, file_handle):
        """Loads IDF stats from file (the format produced by write()).

        Blank lines are ignored.  Raises RuntimeError if no stats are
        present after reading.
        """
        for line in file_handle:
            parts = line.strip().split()
            if not parts:
                continue
            term = tuple(parts[0:-1])
            self.num_docs_for_term[term] = float(parts[-1])
            if len(term) == 1:
                self.num_docs += 1

        if len(self.num_docs_for_term) == 0:
            raise RuntimeError("Read no IDF stats.")


class TFStats(object):
    """Store term-frequency stats for TF-IDF computation.

    IDF stats are kept in a separate object; the methods that can update
    them take it as the optional 'idf_stats' argument.
    """
    def __init__(self):
        # Maps (term, doc) -> raw count, where term is a tuple of words.
        self.raw_counts = {}
        # Maps term -> largest raw count of that term in any one document;
        # filled in by compute_term_stats() or read().
        self.max_counts_for_term = {}

    def get_term_frequency(self, term, doc, weighting_scheme="raw",
                           normalization_factor=0.5):
        """Returns the term-frequency for (term, document) pair.

        The function applied on the raw term-frequencies f(t,d) when computing
        tf(t,d) is specified by the weighting_scheme.
        binary : tf(t,d) = 1 if t in d else 0
        raw    : tf(t,d) = f(t,d)
        log    : tf(t,d) = 1 + log(f(t,d)) if t in d else 0
        normalized : tf(t,d) = K + (1-K) * f(t,d) / (1 + m(t))
        where K is normalization_factor and m(t) is the value stored for t
        in max_counts_for_term (0 if none has been stored).

        Raises KeyError for any other weighting_scheme.
        """
        if weighting_scheme == "binary":
            return 1 if (term, doc) in self.raw_counts else 0
        if weighting_scheme == "raw":
            return self.raw_counts.get((term, doc), 0)
        if weighting_scheme == "log":
            if (term, doc) in self.raw_counts:
                return 1 + math.log(self.raw_counts[(term, doc)])
            return 0
        if weighting_scheme == "normalized":
            return (normalization_factor
                    + (1 - normalization_factor)
                    * self.raw_counts.get((term, doc), 0)
                    / (1.0 + self.max_counts_for_term.get(term, 0)))
        raise KeyError("Unknown tf-weighting-scheme {0}".format(
            weighting_scheme))

    def accumulate(self, doc, text, ngram_order):
        """Accumulate raw stats from a document for upto the specified
        ngram-order.

        'text' is the document's sequence of words.  Only complete n-grams
        are counted: an n-gram never runs past the end of the text.
        """
        for n in range(1, ngram_order + 1):
            # Stop n - 1 positions before the end; otherwise the slices
            # near the end would be shorter than n and would be counted
            # again as lower-order terms.
            for i in range(len(text) - n + 1):
                term = tuple(text[i:(i+n)])
                self.raw_counts.setdefault((term, doc), 0)
                self.raw_counts[(term, doc)] += 1

    def compute_term_stats(self, idf_stats=None):
        """Compute the maximum counts for each term over all the documents
        based on the stored raw counts.

        If idf_stats is provided, idf_stats.accumulate(term) is called once
        for every stored (term, doc) pair.  Raises RuntimeError if no raw
        counts are stored.
        """
        if len(self.raw_counts) == 0:
            raise RuntimeError("No (term, doc) found in tf-stats.")
        for tup, counts in self.raw_counts.items():
            term = tup[0]

            if counts > self.max_counts_for_term.get(term, 0):
                self.max_counts_for_term[term] = counts

            if idf_stats is not None:
                idf_stats.accumulate(term)

    def __str__(self):
        """Returns a string with all the stats in the following format:
        <n-gram order> <term-1> <term-2> ... <term-n> <document-id> <counts>
        """
        lines = []
        for tup, counts in self.raw_counts.items():
            term, doc = tup
            lines.append("{order} {term} {doc} {counts}".format(
                order=len(term), term=" ".join(term),
                doc=doc, counts=counts))
        return "\n".join(lines)

    def read(self, file_handle, ngram_order=None, idf_stats=None):
        """Reads the TF stats stored in a file in the following format:
        <ngram-order> <term-1> <term-2> ... <term-n> <document-id> <counts>

        Lines whose n-gram order exceeds ngram_order (if given) are
        skipped, as are blank lines.
        If idf_stats is provided then idf_stats is accumulated simultaneously.

        Raises RuntimeError if a line has the wrong number of fields for
        its declared order, or if no stats are present after reading.
        """
        for line in file_handle:
            parts = line.strip().split()
            if not parts:
                continue
            # The order is read as text; it must be converted before being
            # compared with integers or used in a slice.
            order = int(parts[0])
            if len(parts) - 3 != order:
                raise RuntimeError(
                    "Bad line in TF stats: {0}".format(line.strip()))
            if ngram_order is not None and order > ngram_order:
                continue
            term = tuple(parts[1:(order+1)])
            doc = parts[-2]
            counts = float(parts[-1])

            self.raw_counts[(term, doc)] = counts

            if counts > self.max_counts_for_term.get(term, 0):
                self.max_counts_for_term[term] = counts

            if idf_stats is not None:
                idf_stats.accumulate(term)

        if len(self.raw_counts) == 0:
            raise RuntimeError("Read no TF stats.")


class TFIDF(object):
    """Class to store TF-IDF values for term-document pairs.

    Attributes:
        tf_idf - A dictionary of TF-IDF values indexed by (term, document)
                 tuple as key
    """

    def __init__(self):
        self.tf_idf = {}

    def get_value(self, term, doc):
        """Returns the TF-IDF value stored for the (term, doc) tuple.

        Raises KeyError if no value is stored for that tuple; callers that
        want a default value (as compute_similarity_scores does) must catch
        it themselves.
        """
        return self.tf_idf[(term, doc)]

    def compute_similarity_scores(self, source_tfidf, source_docs=None,
                                  do_length_normalization=False,
                                  query_id=None):
        """Computes TF-IDF similarity score between each pair of query
        document contained in this object and the source documents
        in the source_tfidf object.

        The score of a (query, source) pair is the sum, over the terms of
        the query document, of the product of the query's TF-IDF value and
        the source document's TF-IDF value for the same term (0 if the
        source document has no value for that term).

        Arguments:
            source_tfidf - TFIDF object holding the source documents.
            source_docs - If provided, the similarity scores are computed
                          for only the source documents contained in
                          source_docs. Otherwise every document that has at
                          least one entry in source_tfidf is scored.
            do_length_normalization - If True, each score is divided by the
                          number of terms of the query document. This is
                          usually not required when the scores are only
                          utilized for ranking the source documents.
            query_id - If provided, check that this tf_idf object
                       contains values only for document with id 'query_id'

        Returns a dictionary
            { (query_document_id, source_document_id): similarity_score }

        Raises RuntimeError if query_id is given and this object contains a
        different document.
        """
        num_terms_per_doc = {}
        similarity_scores = {}

        if source_docs is None:
            # Collect each source document id once, in first-seen order.
            source_docs = list(dict.fromkeys(
                src_doc for _, src_doc in source_tfidf.tf_idf))

        for (term, doc), value in self.tf_idf.items():
            num_terms_per_doc[doc] = num_terms_per_doc.get(doc, 0) + 1

            if query_id is not None and doc != query_id:
                raise RuntimeError("TF-IDF contains document {0}, which is "
                                   "not the required query {1}. \n"
                                   "Something wrong in how this TF-IDF object "
                                   "was created or a bug in the "
                                   "calling script.".format(
                                       doc, query_id))

            for src_doc in source_docs:
                try:
                    src_value = source_tfidf.get_value(term, src_doc)
                except KeyError:
                    # Lazy %-formatting: this runs once per missing
                    # (term, source document) pair.
                    logger.debug(
                        "Could not find (%s, %s) in "
                        "source_tfidf. "
                        "Choosing a tf-idf value of 0.", term, src_doc)
                    src_value = 0

                pair = (doc, src_doc)
                similarity_scores[pair] = (
                    similarity_scores.get(pair, 0) + src_value * value)

        if do_length_normalization:
            # Only values of existing keys are replaced, so updating the
            # dictionary while iterating over it is safe.
            for pair in similarity_scores:
                similarity_scores[pair] = (
                    similarity_scores[pair] / num_terms_per_doc[pair[0]])

        if logger.isEnabledFor(logging.DEBUG):
            for doc, count in num_terms_per_doc.items():
                logger.debug(
                    'Seen {0} terms in query document {1}'.format(count, doc))

        return similarity_scores

    def read(self, tf_idf_file):
        """Loads TFIDF object from file.

        The expected format is a header "<TFIDF>" (optionally followed on
        the same line by the first entry), one entry per line as
            <ngram-order> <term-1> ... <term-n> <document-id> <tfidf>
        and a footer "</TFIDF>" on a line of its own. Reading stops right
        after the footer line. Blank lines are ignored.

        Raises:
            RuntimeError - if this object already holds values, if an entry
                           is duplicated, or if no entry was read.
            TypeError - if the header or footer is missing or malformed, or
                        an entry has the wrong number of fields.
        """

        if len(self.tf_idf) != 0:
            raise RuntimeError("TD-IDF object is not empty.")
        seen_footer = False
        line = tf_idf_file.readline()
        parts = line.strip().split()
        if re.search('^<TFIDF>', line) is None or parts[0] != "<TFIDF>":
            raise TypeError(
                "Invalid format of TD-IDF object. "
                "Missing header <TFIDF>; got {0}".format(line))
        if len(parts) > 1:
            # Read header; go to the rest of line
            line = " ".join(parts[1:])
        else:
            # Nothing in this line. Read the next lines.
            line = tf_idf_file.readline()
        while line:
            parts = line.strip().split()
            if not parts:
                line = tf_idf_file.readline()
                continue
            if re.search('</TFIDF>', line):
                if len(parts) > 1 or parts[0] != "</TFIDF>":
                    raise TypeError(
                        "Expecting footer </TFIDF> "
                        "to be on a separate line; got {0}".format(line))
                seen_footer = True
                break
            if re.search('<TFIDF>', line):
                raise TypeError("Got unexpected header <TFIDF> in line "
                                "{0}".format(line))

            order = int(parts[0])
            if len(parts) != order + 3:
                raise TypeError(
                    "Invalid TF-IDF entry; expected {0} terms followed by "
                    "<document-id> <tfidf>; got {1}".format(order, line))
            term = tuple(parts[1:(order + 1)])
            doc = parts[-2]
            tfidf = float(parts[-1])

            entry = (term, doc)
            if entry in self.tf_idf:
                raise RuntimeError("Duplicate entry {0} found while reading "
                                   "TFIDF object.".format(entry))
            self.tf_idf[entry] = tfidf

            line = tf_idf_file.readline()
        if not seen_footer:
            raise TypeError(
                "Did not see footer </TFIDF> "
                "in TFIDF object; got {0}".format(line))

        if len(self.tf_idf) == 0:
            # Not every file-like object (e.g. io.StringIO) has a name.
            raise RuntimeError(
                "Read no TF-IDF values from file {0}".format(
                    getattr(tf_idf_file, "name", "<unknown>")))

    def write(self, tf_idf_file):
        """Writes TFIDF object to file, in the format accepted by read()."""

        print("<TFIDF>", file=tf_idf_file)
        for (term, doc), value in self.tf_idf.items():
            print("{order} {term} {doc} {tfidf}".format(
                order=len(term), term=" ".join(term),
                doc=doc, tfidf=value),
                  file=tf_idf_file)
        print("</TFIDF>", file=tf_idf_file)


def write_tfidf_from_stats(
        tf_stats, idf_stats, tf_idf_file, tf_weighting_scheme="raw",
        idf_weighting_scheme="log", tf_normalization_factor=0.5,
        expected_document_id=None):
    """Computes TF-IDF values from the stats and writes them to tf_idf_file.

    One line is written per (term, document) key of tf_stats.raw_counts:
    <ngram-order> <term> <document> <tfidf>
    where <tfidf> is the product of the term frequency and the inverse
    document frequency. The lines are enclosed between the markers
    "<TFIDF>" and "</TFIDF>" so the file is easy to parse.

    Arguments:
        tf_stats - A TFStats object
        idf_stats - An IDFStats object
        tf_idf_file - Output file to which the TF-IDF values will be written
        tf_weighting_scheme - Passed to tf_stats.get_term_frequency
        idf_weighting_scheme - Passed to
                               idf_stats.get_inverse_document_frequency
        tf_normalization_factor - Passed to tf_stats.get_term_frequency
        expected_document_id - If provided, checks that the TFStats object
                               contains stats only for this document id.

    Raises RuntimeError if either stats object is empty, or if a document
    other than expected_document_id is found (the header and any earlier
    entries have already been written at that point).
    """
    if not len(tf_stats.raw_counts):
        raise RuntimeError("Supplied tf-stats object is empty.")

    if idf_stats.num_docs == 0:
        raise RuntimeError("Supplied idf-stats object is empty.")

    entry_format = "{order} {term} {doc} {tfidf}"

    print("<TFIDF>", file=tf_idf_file)
    for term, doc in tf_stats.raw_counts:
        if expected_document_id is not None and doc != expected_document_id:
            raise RuntimeError("TFStats object contains stats with "
                               "document {0}, "
                               "which is not the specified "
                               "document {1}.".format(doc,
                                                      expected_document_id))

        term_frequency = tf_stats.get_term_frequency(
            term, doc,
            weighting_scheme=tf_weighting_scheme,
            normalization_factor=tf_normalization_factor)
        inverse_doc_frequency = idf_stats.get_inverse_document_frequency(
            term, weighting_scheme=idf_weighting_scheme)

        entry = entry_format.format(
            order=len(term), term=" ".join(term),
            doc=doc, tfidf=term_frequency * inverse_doc_frequency)
        print(entry, file=tf_idf_file)
    print("</TFIDF>", file=tf_idf_file)


def read_key(fd):
    """ [key] = read_key(fd)
     Read the utterance-key from the opened ark/stream descriptor 'fd'.

     Characters are consumed one at a time up to and including the first
     space (or until end of stream). Returns the key with surrounding
     whitespace stripped, or None if nothing but whitespace was read.
    """
    chars = []
    while True:
        char = fd.read(1)
        if char in ('', ' '):
            break
        chars.append(char)
    key = ''.join(chars).strip()
    if key == '':
        return None  # end of file,
    return key


def read_tfidf_ark(file_handle):
    """Iterate over a kaldi archive of TFIDF objects indexed by a key
    (document-id).
    <document-id1> <tf-idf-object1>
    <document-id2> <tf-idf-object2>
    ...

    Yields (key, TFIDF object) pairs until read_key returns no key.
    The file handle is closed when the generator finishes, is closed, or
    an error propagates out of it.
    """
    try:
        while True:
            key = read_key(file_handle)
            if not key:
                break
            tf_idf = TFIDF()
            tf_idf.read(file_handle)
            yield key, tf_idf
    finally:
        file_handle.close()


================================================
FILE: egs/steps/cleanup/lattice_oracle_align.sh
================================================
#! /bin/bash

# Copyright 2016  Vimal Manohar
#           2016  Johns Hopkins University (author: Daniel Povey)
# Apache 2.0

# Abort on the first failing command, including a failure in any stage of a
# pipeline.
set -e
set -o pipefail

# Default option values; the usage message below lists the matching
# command-line flags (--cleanup, --cmd, --special-symbol, ...).
# Each "if [ $stage -le N ]" block further down runs only when stage <= N.
cleanup=true
stage=0
cmd=run.pl
special_symbol="***"    # Special symbol to be aligned with the inserted or
                        # deleted words. Your sentences should not contain this
                        # symbol.
print_silence=true      # True if we want the silences in the ctm.  We do.
frame_shift=0.01

# path.sh is expected in the current directory.  parse_options.sh presumably
# overrides the defaults above from the command line (its code is not shown
# here).
. ./path.sh
. utils/parse_options.sh

# Exactly four positional arguments are required; otherwise print the usage
# message and exit with a non-zero status.
if [ $# -ne 4 ]; then
  echo "This script computes oracle paths for lattices (against a reference "
  echo "transcript) and does various kinds of processing of that, for use by "
  echo "steps/cleanup/cleanup_with_segmentation.sh."
  echo "Its main input is <latdir>/lat.*.gz."
  echo "This script outputs a human-readable word alignment of the oracle path"
  echo "through the lattice in <dir>/oracle_hyp.txt, and a time-aligned ctm version of"
  echo "the same in <dir>/ctm."
  echo "It also creates <dir>/edits.txt (the number of edits per utterance),"
  echo "<dir>/text (which is <data>/text but filtering out any utterances that"
  echo "were not decoded for some reason), and <dir>/length.txt, which is the length"
  echo "of the reference transcript, and <dir>/all_info.txt and <dir>/all_info.sorted.txt"
  echo "which contain all the info in a way that's easier to scan for humans."
  echo "Note: most of this is the same as is done in steps/cleanup/find_bad_utts.sh,"
  echo "except it runs from pre-existing lattices."
  echo ""
  echo "Usage: $0 <data> <lang> <latdir> <dir>"
  echo " e.g.: $0 data/train_si284 data/lang exp/tri4_bad_utts/lats exp/tri4_bad_utts/lattice_oracle"
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>            # config containing options"
  echo "  --cleanup <true|false>            # set this to false to disable cleanup of "
  echo "                                    # temporary files (default: true)"
  echo "  --cmd <command-string>            # how to run jobs (default: run.pl)."
  echo "  --special-symbol <special-symbol> #  Symbol to pad with in insertions and deletions in the"
  echo "                                    # output produced in <dir>/analysis/ (default: '***')"
  echo "  --print-silence <true|false>      # Affects ctm generation; default is true (recommended)"
  echo "  --frame-shift <frame-shift>       # Frame shift in seconds; default: 0.01.  Affects ctm generation."
  exit 1
fi

# Positional arguments (see the usage message above).
data=$1
lang=$2
latdir=$3
dir=$4

# All of these inputs must exist before any work is done.
for f in $lang/oov.int $lang/words.txt $data/text $latdir/lat.1.gz $latdir/num_jobs; do
  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1;
done

mkdir -p $dir/log

# The model (used by the word-alignment commands in stage 2) is looked up in
# the output directory first and then in its parent.
if [ -e $dir/final.mdl ]; then
  model=$dir/final.mdl
elif [ -e $dir/../final.mdl ]; then
  model=$dir/../final.mdl
else
  echo "$0: expected $dir/final.mdl or $dir/../final.mdl to exist"
  exit 1
fi

# Reuse the job count recorded with the lattices, so that lat.JOB.gz and the
# JOB-th split of the data are processed together in stage 1.
nj=$(cat $latdir/num_jobs)
oov=$(cat $lang/oov.int)

utils/split_data.sh $data $nj

sdata=$data/split${nj}

if [ $stage -le 1 ]; then
  # Per job: lattice-oracle reads the lattices from $latdir and the reference
  # text converted to integers (OOV words mapped to $oov), writes lattices to
  # $dir/lat.JOB.gz, and its text output is converted back to words and stored
  # in $dir/oracle_hyp.JOB.txt.
  $cmd JOB=1:$nj $dir/log/get_oracle.JOB.log \
    lattice-oracle --write-lattices="ark:|gzip -c > $dir/lat.JOB.gz" \
    "ark:gunzip -c $latdir/lat.JOB.gz |" \
    "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|" \
    ark,t:- \| utils/int2sym.pl -f 2- $lang/words.txt '>' $dir/oracle_hyp.JOB.txt || exit 1;

  # Sum the "<errors> / <words>" pairs of the 'Overall %WER' lines over all job
  # logs and print the combined percentage.
  # NOTE(review): if no log line matches, $y stays unset and the perl division
  # fails -- confirm whether that case can occur.
  echo -n "lattice_oracle_align.sh: overall oracle %WER is: "
  grep 'Overall %WER'  $dir/log/get_oracle.*.log  | \
    perl -e 'while (<>){ if (m: (\d+) / (\d+):) { $x += $1; $y += $2}}  printf("%.2f%%\n", $x*100.0/$y); ' | \
    tee $dir/log/oracle_overall_wer.log

  # the awk commands below are to ensure that partially-written files don't confuse us.
  for x in $(seq $nj); do cat $dir/oracle_hyp.$x.txt; done | awk '{if(NF>=1){print;}}' > $dir/oracle_hyp.txt
  if $cleanup; then
    rm $dir/oracle_hyp.*.txt
  fi
fi

# Record the number of jobs next to the lattices written to $dir.
echo $nj > $dir/num_jobs


if [ $stage -le 2 ]; then
  # The following command gets the time-aligned ctm as $dir/ctm.JOB.txt.
  # (The per-job files are actually named $dir/ctm.JOB; they are concatenated
  # into $dir/ctm below.)

  # Word alignment uses word_boundary.int if the lang directory has it, and
  # falls back to align_lexicon.int otherwise.  In both cases the word ids in
  # field 5 of the ctm are converted back to words.
  if [ -f $lang/phones/word_boundary.int ]; then
    $cmd JOB=1:$nj $dir/log/get_ctm.JOB.log \
      set -o pipefail '&&' \
      lattice-align-words $lang/phones/word_boundary.int $model "ark:gunzip -c $dir/lat.JOB.gz|" ark:- \| \
      nbest-to-ctm --frame-shift=$frame_shift --print-silence=$print_silence ark:- - \| \
      utils/int2sym.pl -f 5 $lang/words.txt '>' $dir/ctm.JOB || exit 1;
  elif [ -f $lang/phones/align_lexicon.int ]; then
    $cmd JOB=1:$nj $dir/log/get_ctm.JOB.log \
      set -o pipefail '&&' \
      lattice-align-words-lexicon $lang/phones/align_lexicon.int $model  "ark:gunzip -c $dir/lat.JOB.gz|" ark:- \| \
      lattice-1best ark:- ark:- \| \
      nbest-to-ctm --frame-shift=$frame_shift --print-silence=$print_silence ark:- - \| \
      utils/int2sym.pl -f 5 $lang/words.txt '>' $dir/ctm.JOB || exit 1;
  else
    echo "$0: neither $lang/phones/word_boundary.int nor $lang/phones/align_lexicon.int exists: cannot align."
    exit 1;
  fi
  # Merge the per-job ctm files in job order.
  for j in $(seq $nj); do cat $dir/ctm.$j; done > $dir/ctm
  if $cleanup; then rm $dir/ctm.*; fi
  echo "$0: oracle ctm is in $dir/ctm"
fi


# Stages below are really just to satisfy your curiosity; the output is the same
# as that of find_bad_utts.sh.

if [ $stage -le 3 ]; then
  # in case any utterances failed to align, get filtered copy of $data/text
  utils/filter_scp.pl $dir/oracle_hyp.txt < $data/text  > $dir/text
  # length.txt: <utterance-id> <number of words in the reference>
  cat $dir/text | awk '{print $1, (NF-1);}' > $dir/length.txt

  mkdir -p $dir/analysis

  # Both commands must be given the same padding symbol: align-text inserts
  # it and wer_per_utt_details.pl has to recognize it, so pass the
  # user-configurable $special_symbol to both.
  align-text --special-symbol="$special_symbol"  ark:$dir/text ark:$dir/oracle_hyp.txt  ark,t:- | \
    utils/scoring/wer_per_utt_details.pl --special-symbol "$special_symbol" > $dir/analysis/per_utt_details.txt

  echo "$0: human-readable alignments are in $dir/analysis/per_utt_details.txt"

  # edits.txt: <utterance-id> <sum of fields 4-6 of the "#csid" line>
  awk '{if ($2 == "#csid") print $1" "($4+$5+$6)}' $dir/analysis/per_utt_details.txt > $dir/edits.txt

  # The four files are pasted together line by line below, so they must all
  # have the same number of lines.
  n1=$(wc -l < $dir/edits.txt)
  n2=$(wc -l < $dir/oracle_hyp.txt)
  n3=$(wc -l < $dir/text)
  n4=$(wc -l < $dir/length.txt)
  if [ $n1 -ne $n2 ] || [ $n2 -ne $n3 ] || [ $n3 -ne $n4 ]; then
    echo "$0: mismatch in lengths of files:"
    wc $dir/edits.txt $dir/oracle_hyp.txt $dir/text $dir/length.txt
    exit 1;
  fi

  # note: the format of all_info.txt is:
  # <utterance-id>   <number of errors>  <reference-length>  <decoded-output>   <reference>
  # with the fields separated by tabs, e.g.
  # adg04_sr009_trn 1 	12	 SHOW THE GRIDLEY+S TRACK IN BRIGHT ORANGE WITH HORNE+S IN DIM RED AT	 SHOW THE GRIDLEY+S TRACK IN BRIGHT ORANGE WITH HORNE+S IN DIM RED

  paste $dir/edits.txt \
      <(awk '{print $2}' $dir/length.txt) \
      <(awk '{$1="";print;}' <$dir/oracle_hyp.txt) \
      <(awk '{$1="";print;}' <$dir/text) > $dir/all_info.txt

  # Sort numerically, largest first, on the number of errors.
  sort -nr -k2 $dir/all_info.txt > $dir/all_info.sorted.txt

  echo "$0: per-utterance details sorted from worst to best utts are in $dir/all_info.sorted.txt"
  echo "$0: format is: utt-id num-errs ref-length decoded-output (tab) reference"
fi

if [ $stage -le 4 ]; then
  ###
  # These stats might help people figure out what is wrong with the data
  # a)human-friendly and machine-parsable alignment in the file per_utt_details.txt
  # b)evaluation of per-speaker performance to possibly find speakers with
  #   distinctive accents/speech disorders and similar
  # c)Global analysis on (Ins/Del/Sub) operation, which might be used to figure
  #   out if there is systematic issue with lexicon, pronunciation or phonetic confusability

  # Both reports are derived from per_utt_details.txt written in stage 3.
  cat $dir/analysis/per_utt_details.txt | \
    utils/scoring/wer_per_spk_details.pl $data/utt2spk > $dir/analysis/per_spk_details.txt

  echo "$0: per-speaker details are in $dir/analysis/per_spk_details.txt"

  # Sorted by field 1, then field 4 numerically in descending order, then
  # fields 2 and 3.
  cat $dir/analysis/per_utt_details.txt | \
    utils/scoring/wer_ops_details.pl --special-symbol "$special_symbol" | \
    sort -i -b -k1,1 -k4,4nr -k2,2 -k3,3 > $dir/analysis/ops_details.txt

  echo "$0: per-word statistics [corr,sub,ins,del] are in $dir/analysis/ops_details.txt"
fi

if [ $stage -le 5 ]; then
  echo "$0: obtaining ctm edits"

  # Align the oracle hypothesis with the reference text and pipe the result,
  # together with the ctm from stage 2, into get_ctm_edits.py, which writes
  # $dir/ctm_edits.  Note that here the hypothesis is the first align-text
  # input and no --special-symbol is passed, unlike in stage 3.
  $cmd $dir/log/get_ctm_edits.log \
    align-text ark:$dir/oracle_hyp.txt ark:$dir/text ark,t:-  \| \
      steps/cleanup/internal/get_ctm_edits.py --oov=$oov --symbol-table=$lang/words.txt \
       /dev/stdin $dir/ctm $dir/ctm_edits || exit 1

  echo "$0: ctm with edits information appended is in $dir/ctm_edits"
fi


================================================
FILE: egs/steps/cleanup/make_biased_lm_graphs.sh
================================================
#!/usr/bin/env bash
# Copyright 2012-2016     Johns Hopkins University (Author: Daniel Povey)
#                2016     Vimal Manohar
# Apache 2.0


# This script creates biased decoding graphs based on the data transcripts as
# HCLG.fsts.scp, in the specified directory; this can be consumed by
# decode_segmentation.sh.
# This is for use in data-cleanup and data-filtering.


# Treat unset variables as errors, and abort on the first failing command,
# including a failure in any stage of a pipeline.
set -u
set -o pipefail
set -e

# Begin configuration section.
nj=10
cmd=run.pl
scale_opts="--transition-scale=1.0 --self-loop-scale=0.1"
top_n_words=100 # Number of common words that we compile into each graph (most frequent
                # in the input text).
top_n_words_weight=1.0  # this weight is before renormalization; it can be more
                        # or less than 1.
min_words_per_graph=100  # Utterances will be grouped so that they have at least
                         # this many words, before making the graph.
stage=0

### options for make_one_biased_lm.py.
ngram_order=4  # maximum n-gram order to use (but see also --min-lm-state-count).
min_lm_state_count=10  # make this smaller (e.g. 2) for more strongly biased LM.
discounting_constant=0.3  # strictly between 0 and 1.  Make this closer to 0 for
                          # more strongly biased LM.

# End configuration options.

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh # source the path.
# parse_options.sh is sourced by bare name, so it has to be found via PATH.
. parse_options.sh || exit 1;

# Exactly four positional arguments are required; otherwise print the usage
# message and exit with a non-zero status.
if [ $# != 4 ]; then
   echo "usage: $0 <data-dir|text> <lang-dir> <dir> <graph-dir>"
   echo "e.g.:  $0 data/train data/lang exp/tri3_cleanup exp/tri3_cleanup/graphs"
   echo "  This script creates biased decoding graphs per utterance (or possibly"
   echo "  groups of utterances, depending on --min-words-per-graph).  Its output"
   echo "  goes to <graph-dir>/HCLG.fsts.scp, indexed by utterance.  Directory <dir> is"
   echo "  required to be a model or alignment directory, containing 'tree' and 'final.mdl'."
   echo "Main options (for others, see top of script file)"
   echo "  --scale-opts <scale-opts>                 # Options relating to language"
   echo "                                            # model scale; default is "
   echo "                                            # '--transition-scale=1.0 --self-loop-scale=0.1'"
   echo "  --top-n-words <N>                         # Number of most-common-words to add with"
   echo "                                            # unigram probabilities into graph (default: 100)"
   echo "  --top-n-words-weight <float>              # Weight given to top-n-words portion of graph"
   echo "                                            # (before renormalizing); may be any positive"
   echo "                                            # number (default: 1.0)"
   echo "  --min-words-per-graph <N>                 # A constant that controls grouping of utterances"
   echo "                                            # (we make the LMs for groups of utterances)."
   echo "                                            # Default: 100."
   echo "  --ngram-order <N>                         # N-gram order in range [2,7].  Maximum n-gram order "
   echo "                                            # that may be used (but also see --min-lm-state-count)."
   echo "                                            # Default 4"
   echo "  --min-lm-state-count <N>                  # Minimum state count for an LM-state of order >2 to "
   echo "                                            # be completely pruned away [bigrams will always be kept]"
   echo "                                            # Default 10.  Smaller -> more strongly biased LM"
   echo "  --discounting-constant <float>            # Discounting constant for Kneser-Ney, strictly between 0"
   echo "                                            # and 1.  Default 0.3.  Smaller -> more strongly biased LM."
   echo "  --config <config-file>                    # config containing options"
   echo "  --nj <nj>                                 # number of parallel jobs"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   exit 1;
fi

# Positional arguments (see the usage message above).
data_or_text=$1
lang=$2
dir=$3
graph_dir=$4

# The first argument may be either a data directory (its 'text' file is used)
# or the path of a text file itself.
if [ -d $data_or_text ]; then
  text=$data_or_text/text
else
  text=$data_or_text
fi

mkdir -p $graph_dir

# All of these inputs must exist before any work is done.
for f in $text $lang/oov.int $dir/tree $dir/final.mdl \
    $lang/L_disambig.fst $lang/phones/disambig.int; do
  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1;
done

# Check the phone sets of the lang and model directories against each other,
# then keep a copy of phones.txt with the graphs.
utils/lang/check_phones_compatible.sh $lang/phones.txt $dir/phones.txt
cp $lang/phones.txt $graph_dir

oov=`cat $lang/oov.int` || exit 1;
mkdir -p $graph_dir/log

# create top_words.{int,txt}
if [ $stage -le 0 ]; then
  # NOTE: LC_ALL=C is exported only when this stage runs, and then stays in
  # effect for the rest of the script.
  export LC_ALL=C
  # the following pipe will be broken due to the 'head'; don't fail.
  set +o pipefail
  # word_counts.int: "<count> <word-id>" for the $top_n_words most frequent
  # words of the integerized text (OOV words mapped to $oov).
  utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $text | \
    awk '{for(x=2;x<=NF;x++) print $x;}' | sort | uniq -c | \
     sort -nr | head -n $top_n_words > $graph_dir/word_counts.int
  set -o pipefail
  # The total is taken over the retained top-n words only.
  # NOTE(review): an empty word_counts.int leaves total_count empty, which
  # breaks the division below -- confirm the input text is never empty.
  total_count=$(awk '{x+=$1} END{print x}' < $graph_dir/word_counts.int)
  # print top-n words with their unigram probabilities.
  awk -v tot=$total_count -v weight=$top_n_words_weight '{print $2, ($1*weight)/tot;}' \
     <$graph_dir/word_counts.int >$graph_dir/top_words.int
  utils/int2sym.pl -f 1 $lang/words.txt <$graph_dir/top_words.int >$graph_dir/top_words.txt
fi

# Look up the integer id of the word-disambiguation symbol "#0".  awk exits
# successfully even when no line matches, so under 'set -e' and 'pipefail'
# the explicit error message below is still reached (a non-matching grep in
# the pipeline would abort the script silently instead).
word_disambig_symbol=$(awk '$1 == "#0" {print $2}' $lang/words.txt)
if [ -z "$word_disambig_symbol" ]; then
  echo "$0: error getting word disambiguation symbol"
  exit 1
fi

# Split the text into $nj pieces, $graph_dir/texts/text.1 ... text.$nj.
mkdir -p $graph_dir/texts
split_text=
for n in `seq $nj`; do
  split_text="$split_text $graph_dir/texts/text.$n"
done

utils/split_scp.pl $text $split_text

mkdir -p $graph_dir/log $graph_dir/fsts

# Make $dir an absolute pathname
dir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $dir ${PWD}`

if [ $stage -le 1 ]; then
  echo "$0: creating utterance-group-specific decoding graphs with biased LMs"

  # These options are passed through directly to make_one_biased_lm.py.
  lm_opts="--word-disambig-symbol=$word_disambig_symbol --ngram-order=$ngram_order --min-lm-state-count=$min_lm_state_count --discounting-constant=$discounting_constant --top-words=$graph_dir/top_words.int"

  # Per job: integerize the text piece, let make_biased_lms.py group the
  # utterances and emit one LM FST per group (it also writes the
  # utterance-to-group map utt2group.JOB), then compile the FSTs into
  # decoding graphs stored as an ark/scp pair.
  $cmd JOB=1:$nj $graph_dir/log/compile_decoding_graphs.JOB.log \
    utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $graph_dir/texts/text.JOB \| \
    steps/cleanup/make_biased_lms.py --min-words-per-graph=$min_words_per_graph \
      --lm-opts="$lm_opts" $graph_dir/fsts/utt2group.JOB \| \
    compile-train-graphs-fsts $scale_opts --read-disambig-syms=$lang/phones/disambig.int \
      $dir/tree $dir/final.mdl $lang/L_disambig.fst ark:- \
    ark,scp:$graph_dir/fsts/HCLG.fsts.JOB.ark,$graph_dir/fsts/HCLG.fsts.JOB.scp || exit 1
fi

# Concatenate the per-job outputs of stage 1, in job order.
for j in $(seq $nj); do cat $graph_dir/fsts/HCLG.fsts.$j.scp; done > $graph_dir/fsts/HCLG.fsts.per_utt.scp
for j in $(seq $nj); do cat $graph_dir/fsts/utt2group.$j; done > $graph_dir/fsts/utt2group


cp $lang/words.txt $graph_dir/
cp -r $lang/phones $graph_dir/

# The following command gives us an scp file relative to utterance-id.
utils/apply_map.pl -f 2 $graph_dir/fsts/HCLG.fsts.per_utt.scp <$graph_dir/fsts/utt2group > $graph_dir/HCLG.fsts.scp

n1=$(cat $text | wc -l)
n2=$(cat $graph_dir/HCLG.fsts.scp | wc -l)

# Fail if fewer than 90% of the input utterances ended up with a graph.
# $(( )) is the standard arithmetic expansion; the older $[ ] form is
# deprecated in bash.
if [ $((n1 * 9)) -gt $((n2 * 10)) ]; then
  echo "$0: too many utterances have no scp, something seems to have gone wrong."
  exit 1
fi

exit 0;


================================================
FILE: egs/steps/cleanup/make_biased_lms.py
================================================
#!/usr/bin/env python3

from __future__ import print_function
import sys
import argparse
import math
import subprocess
from collections import defaultdict

import io
# Re-wrap the standard streams so that text read from and written to them is
# always UTF-8.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding="utf8")
sys.stderr = io.TextIOWrapper(sys.stderr.buffer,encoding="utf8")
sys.stdin = io.TextIOWrapper(sys.stdin.buffer,encoding="utf8")

parser = argparse.ArgumentParser(description="""
This script is a wrapper for make_one_biased_lm.py that reads a Kaldi archive
of (integerized) text data from the standard input and writes a Kaldi archive of
backoff-language-model FSTs to the standard-output.  It takes care of
grouping utterances to respect the --min-words-per-graph option.  It writes
the graphs to the standard output and also outputs a map from input utterance-ids
to the per-group utterance-ids that index the output graphs.""")

parser.add_argument("--lm-opts", type = str, default = "",
                    help = "Options to pass in to make_one_biased_lm.py (which "
                    "creates the individual LM graphs), e.g. '--word-disambig-symbol=8721'.")
parser.add_argument("--min-words-per-graph", type = int, default = 100,
                    help = "Minimum number of words per utterance group; this program "
                    "will try to arrange the input utterances into groups such that each "
                    "one has at least this many words in total.")
parser.add_argument("utterance_map", type = str,
                    help = "Filename to which a map from input utterances to grouped "
                    "utterances, is written")

args = parser.parse_args()


# Only failures to open the file are turned into the error message below;
# anything else (e.g. KeyboardInterrupt) propagates unchanged.
try:
    utterance_map_file = open(args.utterance_map, "w", encoding="utf-8")
except OSError:
    sys.exit("make_biased_lms.py: error opening {0} to write utterance map".format(
            args.utterance_map))

# This processes one group of input lines; 'group_of_lines' is
# an array of lines of input integerized text, e.g.
# [ 'utt1 67 89 432', 'utt2 89 48 62' ]
def ProcessGroupOfLines(group_of_lines):
    """Write one archive entry (group id, LM FST, blank line) to stdout.

    The group id is '<first-utterance-id>-group-of-<number of lines>'.
    The word sequences of the group (utterance ids removed) are piped into
    make_one_biased_lm.py, whose output goes directly to our stdout, and a
    '<utterance-id> <group-id>' line is written to the utterance-map file
    for every input line.

    Exits the program if the group is empty or contains an empty line, and
    raises subprocess.CalledProcessError if the child process exits with a
    non-zero status.
    """
    num_lines = len(group_of_lines)
    try:
        first_utterance_id = group_of_lines[0].split()[0]
    except IndexError:
        sys.exit("make_biased_lms.py: empty input line")

    group_utterance_id = '{0}-group-of-{1}'.format(first_utterance_id, num_lines)
    # print the group utterance-id to the stdout; it forms the name in
    # the text-form archive.
    print(group_utterance_id)
    # Flush before the child process starts writing to the same stream.
    sys.stdout.flush()

    # NOTE(review): the command is run through the shell with --lm-opts
    # appended verbatim, so that option must come from a trusted caller.
    command = "steps/cleanup/internal/make_one_biased_lm.py " + args.lm_opts
    try:
        p = subprocess.Popen(command, shell = True, stdin = subprocess.PIPE,
                             stdout = sys.stdout, stderr = sys.stderr)
        for line in group_of_lines:
            a = line.split()
            if len(a) == 0:
                sys.exit("make_biased_lms.py: empty input line")
            utterance_id = a[0]
            # print <utt> <utt-group> to utterance-map file
            print(utterance_id, group_utterance_id, file = utterance_map_file)
            rest_of_line = ' '.join(a[1:]) + '\n' # get rid of utterance id.
            p.stdin.write(rest_of_line.encode('utf-8'))
        p.stdin.close()
        # Always wait for the child: an 'assert p.wait() == 0' would be
        # removed entirely when running with 'python -O'.
        return_code = p.wait()
        if return_code != 0:
            raise subprocess.CalledProcessError(return_code, command)
    except Exception:
        sys.stderr.write(
            "make_biased_lms.py: error calling subprocess, command was: " +
            command + "\n")
        raise
    # Print a blank line; this terminates the FST in the Kaldi fst-archive
    # format.
    print("")
    sys.stdout.flush()


# Read the input one line at a time, accumulating lines into a group until the
# group reaches --min-words-per-graph tokens, then emit a graph for it.
# NOTE: the count is taken over the whole line, so the leading utterance-id
# token of each line is included in it.
num_words_this_group = 0
this_group_of_lines = []  # An array of strings, one per line

for line in sys.stdin:
    this_group_of_lines.append(line)
    num_words_this_group += len(line.split())
    if num_words_this_group >= args.min_words_per_graph:
        ProcessGroupOfLines(this_group_of_lines)
        num_words_this_group = 0
        this_group_of_lines = []

# Flush whatever is left at end of input.  Only a non-empty remainder is
# processed, so a threshold <= 0 no longer triggers a call with an empty
# group (which ended the script with a spurious "empty input line" error).
if this_group_of_lines:
    ProcessGroupOfLines(this_group_of_lines)


# test command [to be run from ../..]
#

# (echo 1 0.5; echo 2 0.25) > top_words.txt
# (echo utt1 6 7 8 4; echo utt2 7 8 9; echo utt3 7 8) | steps/cleanup/make_biased_lms.py --lm-opts='--word-disambig-symbol=1000 --top-words=top_words.txt' foo; cat foo

# (echo utt1 6 7 8 4; echo utt2 7 8 9; echo utt3 7 8) | steps/cleanup/make_biased_lms.py --min-words-per-graph=4 --lm-opts='--word-disambig-symbol=1000 --top-words=top_words.txt' foo; cat foo


================================================
FILE: egs/steps/cleanup/make_segmentation_data_dir.sh
================================================
#!/usr/bin/env bash

# Copyright 2014  Guoguo Chen
# Apache 2.0

# Begin configuration section.
# Defaults for the options listed in the usage message below; the values are
# forwarded to align-text and steps/cleanup/create_segments_from_ctm.pl
# further down (time_precision is used by the overlap-removal step).
max_seg_length=10
min_seg_length=2
min_sil_length=0.5
time_precision=0.05
special_symbol="<***>"
separator=";"
wer_cutoff=-1
# End configuration section.

# Abort on the first failing command.
set -e

# Log the full command line.
echo "$0 $@"

[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1;

# Exactly three positional arguments are required; otherwise print usage.
if [ $# -ne 3 ]; then
  echo "This script takes the ctm file that corresponds to the data directory"
  echo "created by steps/cleanup/split_long_utterance.sh, works out a new"
  echo "segmentation and creates a new data directory for the new segmentation."
  echo ""
  echo "Usage: $0 [options] <ctm-file> <old-data-dir> <new-data-dir>"
  echo " e.g.: $0 train_si284_split.ctm \\"
  echo "                          data/train_si284_split data/train_si284_reseg"
  echo "Options:"
  echo "    --wer-cutoff            # ignore segments with WER higher than the"
  echo "                            # specified value. -1 means no segment will"
  echo "                            # be ignored."
  echo "    --max-seg-length        # maximum length of new segments"
  echo "    --min-seg-length        # minimum length of new segments"
  echo "    --min-sil-length        # minimum length of silence as split point"
  echo "    --time-precision        # precision for determining \"same time\""
  echo "    --special-symbol        # special symbol to be aligned with"
  echo "                            # inserted or deleted words"
  echo "    --separator             # separator for aligned pairs"
  exit 1;
fi

ctm=$1
old_data_dir=$2
new_data_dir=$3

# All of these inputs are read later in the script; fail early if any is
# missing.
for f in $ctm $old_data_dir/text.orig $old_data_dir/utt2spk \
  $old_data_dir/wav.scp $old_data_dir/segments; do
  if [ ! -f $f ]; then
    echo "$0: expected $f to exist"
    exit 1;
  fi
done

# Set up the output directory and carry over the recording-level files.
mkdir -p $new_data_dir/tmp/
cp -f $old_data_dir/wav.scp $new_data_dir
# spk2gender is optional.  (The test previously lacked the "$" and so checked
# the literal path "old_data_dir/spk2gender", meaning the file was never
# copied.)
if [ -f $old_data_dir/spk2gender ]; then
  cp -f $old_data_dir/spk2gender $new_data_dir
fi

# Removes the overlapping region (in steps/cleanup/split_long_utterance.sh we
# create the segmentation with overlapping region).
#
# Note that for each audio file, we expect its segments have been sorted in time
# ascending order (if we ignore the overlap).
#
# How the perl code below works:
#  * Each ctm line must have at least 5 fields; the first two fields joined
#    by "_" identify the audio file the entry belongs to.
#  * Entries for the current audio file are buffered in @ctm.  When the id
#    changes, the buffer is printed (first five fields of each entry) and
#    reset; an id that shows up again after it was already flushed is an
#    error, i.e. the entries of one audio file must be contiguous.
#  * For an entry of the current audio file, the end time of the last
#    buffered entry (start + duration, rounded to two decimals) is compared
#    with the start time of the new entry.
#      - No overlap: the entry is appended to the buffer.
#      - Overlap: the buffer is scanned backwards over the entries that end
#        after the new entry starts, looking for one with the same word whose
#        start time and duration both agree to within the precision given as
#        the first argument ($time_precision).  If found, the buffered entries
#        after that match are dropped.  The overlapping entry itself is not
#        appended in either case.
#  * The remaining buffer is printed at end of input.
cat $ctm | perl -e '
  $precision = $ARGV[0];
  @ctm = ();
  %processed_ids = ();
  $previous_id = "";
  while (<STDIN>) {
    chomp;
    my @current = split;
    @current >= 5 || die "Error: bad line $_\n";
    $id = join("_", ($current[0], $current[1]));
    @previous = @{$ctm[-1]};

    # Start of a new audio file.
    if ($previous_id ne $id) {
      # Prints existing information.
      if (@ctm > 0) {
        foreach $line (@ctm) {
          print "$line->[0] $line->[1] $line->[2] $line->[3] $line->[4]\n";
        }
      }

      # Checks if the ctm file is sorted.
      if (defined($processed_ids{$id})) {
        die "Error: \"$current[0] $current[1]\" has already been processed\n";
      } else {
        $processed_ids{$id} = 1;
      }

      @ctm = ();
      push(@ctm, \@current);
      $previous_id = $id;
      next;
    }

    $new_start = sprintf("%.2f", $previous[2] + $previous[3]);

    if ($new_start > $current[2]) {
      # Case 2: scans for a splice point.
      $index = -1;
      while (defined($ctm[$index])
             && $ctm[$index]->[2] + $ctm[$index]->[3] > $current[2]) {
        if ($ctm[$index]->[4] eq $current[4]
            && abs($ctm[$index]->[2] - $current[2]) < $precision
            && abs($ctm[$index]->[3] - $current[3]) < $precision) {
          pop @ctm for 2..abs($index);
          last;
        } else {
          $index -= 1;
        }
      }
    } else {
      push(@ctm, \@current);
    }
  }

  if (@ctm > 0) {
    foreach $line (@ctm) {
      print "$line->[0] $line->[1] $line->[2] $line->[3] $line->[4]\n";
    }
  }' $time_precision > $new_data_dir/tmp/ctm

# Creates a text file from the ctm, which will be used in Levenshtein alignment.
# Note that we remove <eps> in the text file.
#
# One line "<wav-id> <word1> <word2> ..." is written per recording (first ctm
# field), in the order the recordings appear in the ctm.  Every word is
# filtered for <eps>, including the first word of each recording (which used
# to be copied unconditionally).  A recording that is left without any word is
# not written.  All entries of a recording must share the same channel.
cat $new_data_dir/tmp/ctm | perl -e '
  $previous_wav = "";
  $previous_channel = "";
  @words = ();
  while (<STDIN>) {
    chomp;
    @col = split;
    @col >= 5 || die "Error: bad line $_\n";
    if ($previous_wav ne $col[0]) {
      # Start of a new recording: flush the words of the previous one.
      if (@words > 0) {
        print "$previous_wav ", join(" ", @words), "\n";
      }
      @words = ();
      $previous_wav = $col[0];
      $previous_channel = $col[1];
    } else {
      $previous_channel eq $col[1] ||
        die "Error: more than one channels detected\n";
    }
    if ($col[4] ne "<eps>") {
      push(@words, $col[4]);
    }
  }
  if (@words > 0) {
    print "$previous_wav ", join(" ", @words), "\n";
  }' > $new_data_dir/tmp/text

# Computes the Levenshtein alignment.
# The special symbol and the separator are quoted: the defaults ("<***>" and
# ";") contain shell metacharacters, and an unquoted "<***>" is a glob
# pattern that would be replaced by any matching file name.
align-text --special-symbol="$special_symbol" --separator="$separator" \
  ark:$old_data_dir/text.orig ark:$new_data_dir/tmp/text \
  ark,t:$new_data_dir/tmp/aligned.txt

# Creates new segmentation.
# Inputs: the overlap-free ctm and the alignment; outputs: the new segments
# and text files.
steps/cleanup/create_segments_from_ctm.pl \
  --max-seg-length $max_seg_length --min-seg-length $min_seg_length \
  --min-sil-length $min_sil_length \
  --separator "$separator" --special-symbol "$special_symbol" \
  --wer-cutoff $wer_cutoff \
  $new_data_dir/tmp/ctm $new_data_dir/tmp/aligned.txt \
  $new_data_dir/segments $new_data_dir/text

# Now creates the new utt2spk and spk2utt file.
#
# The old utt2spk (stdin) and the old segments file give a wav-file -> speaker
# map; every utterance in the new segments file is then assigned the speaker
# of its wav file.  A wav file whose old utterances belong to different
# speakers is an error.  Speaker ids are compared as strings ("eq"); the
# numeric "==" used previously treats all non-numeric ids as equal, so the
# check could not fire for them.
cat $old_data_dir/utt2spk | perl -e '
  ($old_seg_file, $new_seg_file, $utt2spk_file_out) = @ARGV;
  open(OS, "<$old_seg_file") || die "Error: fail to open $old_seg_file\n";
  open(NS, "<$new_seg_file") || die "Error: fail to open $new_seg_file\n";
  open(UO, ">$utt2spk_file_out") ||
    die "Error: fail to open $utt2spk_file_out\n";
  while (<STDIN>) {
    chomp;
    @col = split;
    @col == 2 || die "Error: bad line $_\n";
    $utt2spk{$col[0]} = $col[1];
  }
  while (<OS>) {
    chomp;
    @col = split;
    @col == 4 || die "Error: bad line $_\n";
    # Utterances without an utt2spk entry contribute nothing to the map.
    defined($utt2spk{$col[0]}) || next;
    if (defined($wav2spk{$col[1]})) {
      $wav2spk{$col[1]} eq $utt2spk{$col[0]} ||
        die "Error: multiple speakers detected for wav file $col[1]\n";
    } else {
      $wav2spk{$col[1]} = $utt2spk{$col[0]};
    }
  }
  while (<NS>) {
    chomp;
    @col = split;
    @col == 4 || die "Error: bad line $_\n";
    defined($wav2spk{$col[1]}) ||
      die "Error: could not find speaker for wav file $col[1]\n";
    print UO "$col[0] $wav2spk{$col[1]}\n";
  } ' $old_data_dir/segments $new_data_dir/segments $new_data_dir/utt2spk
utils/utt2spk_to_spk2utt.pl $new_data_dir/utt2spk > $new_data_dir/spk2utt

# Final consistency pass over the new data directory.
utils/fix_data_dir.sh $new_data_dir

exit 0;


================================================
FILE: egs/steps/cleanup/make_segmentation_graph.sh
================================================
#!/usr/bin/env bash

# Copyright 2014  Guoguo Chen
# Apache 2.0

# Begin configuration section.
# Defaults for the options listed in the usage message below; nj and cmd
# control the parallel jobs, the remaining values are forwarded to
# steps/cleanup/make_utterance_graph.sh.
nj=4
cmd=run.pl
tscale=1.0      # transition scale.
loopscale=0.1   # scale for self-loops.
cleanup=true
ngram_order=1
srilm_options="-wbdiscount"   # By default, use Witten-Bell discounting in SRILM
# End configuration section.

# Abort on the first failing command.
set -e

# Log the full command line.
echo "$0 $@"

[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1;

# Exactly four positional arguments are required; otherwise print usage.
if [ $# -ne 4 ]; then
  echo "This script builds one decoding graph for each truncated utterance in"
  echo "segmentation. It first calls steps/cleanup/make_utterance_graph.sh to"
  echo "build one decoding graph for each original utterance, which will be"
  echo "shared by the truncated utterances from the same original utterance."
  echo "We assign the decoding graph to each truncated utterance using the scp"
  echo "file so that we can avoid duplicating the graphs on the disk."
  echo ""
  echo "Usage: $0 [options] <data-dir> <lang-dir> <model-dir> <graph-dir>"
  echo " e.g.: $0 data/train_si284_split/ \\"
  echo "                data/lang exp/tri2b/ exp/tri2b/graph_train_si284_split"
  echo ""
  echo "Options:"
  echo "    --ngram-order           # order of n-gram language model"
  echo "    --srilm-options         # options for ngram-count in SRILM tool"
  echo "    --tscale                # transition scale"
  echo "    --loopscale             # scale for self-loops"
  echo "    --cleanup               # if true, removes the intermediate files"
  exit 1;
fi

data=$1
lang=$2
model_dir=$3
graph_dir=$4

# Fail early if any required input file is missing.
for f in $data/text.orig $data/orig2utt $lang/L_disambig.fst \
  $lang/words.txt $lang/oov.int $model_dir/final.mdl $model_dir/tree; do
  if [ ! -f $f ]; then
    echo "$0: expected $f to exist"
    exit 1;
  fi
done

utils/lang/check_phones_compatible.sh $lang/phones.txt $model_dir/phones.txt

# If --ngram-order is larger than 1, we will have to use SRILM
# Look for ngram-count on the PATH first and fall back to the copy under
# $KALDI_ROOT/tools/srilm.  Expansions are quoted so that the tests behave
# correctly when a path contains whitespace.
if [ $ngram_order -gt 1 ]; then
  ngram_count=$(which ngram-count) || true
  if [ -z "$ngram_count" ]; then
    if uname -a | grep 64 >/dev/null; then # some kind of 64 bit...
      sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64
    else
      sdir=$KALDI_ROOT/tools/srilm/bin/i686
    fi
    if [ -f "$sdir/ngram-count" ]; then
      echo Using SRILM tools from $sdir
      export PATH="$PATH:$sdir"
    else
      echo You appear to not have SRILM tools installed, either on your path,
      echo or installed in $sdir.  See tools/install_srilm.sh for installation
      echo instructions.
      exit 1
    fi
  fi
fi

# Creates one graph for each transcript. We parallelize the process a little
# bit.
# Never run more jobs than there are transcripts.
num_lines=$(wc -l < $data/text.orig)
if [ $nj -gt $num_lines ]; then
  nj=$num_lines
  echo "$0: Too many number of jobs, using $nj instead"
fi

mkdir -p $graph_dir/split$nj $graph_dir/log

# One sub-directory per job, each receiving its own slice of the transcripts.
split_texts=""
for ((n = 1; n <= nj; n++)); do
  mkdir -p $graph_dir/split$nj/$n
  split_texts+=" $graph_dir/split$nj/$n/text"
done
utils/split_scp.pl $data/text.orig $split_texts

# Build the per-utterance graphs of every slice in parallel.
$cmd JOB=1:$nj $graph_dir/log/make_utterance_graph.JOB.log \
  steps/cleanup/make_utterance_graph.sh --cleanup $cleanup \
  --tscale $tscale --loopscale $loopscale \
  --ngram-order $ngram_order --srilm-options "$srilm_options" \
  $graph_dir/split$nj/JOB/text $lang \
  $model_dir $graph_dir/split$nj/JOB || exit 1;

# Copies files from lang directory.
mkdir -p $graph_dir
cp -r $lang/* $graph_dir

# Records the last field of the "pdfs" line printed by am-info.
am-info --print-args=false $model_dir/final.mdl |\
 grep pdfs | awk '{print $NF}' > $graph_dir/num_pdfs

# Creates the graph table.
# The per-job scp files are concatenated and the graphs they point to are
# copied into a single archive, with a temporary scp indexing that archive.
cat $graph_dir/split$nj/*/HCLG.fsts.scp > $graph_dir/split$nj/HCLG.fsts.scp
fstcopy scp:$graph_dir/split$nj/HCLG.fsts.scp \
  "ark,scp:$graph_dir/HCLG.fsts,$graph_dir/tmp.HCLG.fsts.scp"

# The graphs we created above were indexed by the old utterance id. We have to
# duplicate them for the new utterance id. We do this in the scp file so we do
# not have to store the duplicated graphs on the disk.
# Each line of orig2utt is "<old-utt> <new-utt1> <new-utt2> ..."; every new
# utterance id is written with the scp entry of its old utterance.
cat $graph_dir/tmp.HCLG.fsts.scp | perl -e '
  open(O2U, "<$ARGV[0]") || die "Error: fail to open $ARGV[0]\n";
  while (<STDIN>) {
    chomp;
    @col = split;
    @col == 2 || die "Error: bad line $_\n";
    $scp{$col[0]} = $col[1];
  }
  while (<O2U>) {
    chomp;
    @col = split;
    @col >= 2 || die "Error: bad line $_\n";
    defined($scp{$col[0]}) ||
      die "Error: $col[0] not defined in original scp file\n";
    for ($i = 1; $i < @col; $i += 1) {
      print "$col[$i] $scp{$col[0]}\n"
    }
  }' $data/orig2utt > $graph_dir/HCLG.fsts.scp
rm $graph_dir/tmp.HCLG.fsts.scp

# Optionally removes the per-job working directories.
if $cleanup; then
  rm -r $graph_dir/split$nj
fi

exit 0;


================================================
FILE: egs/steps/cleanup/make_utterance_fsts.pl
================================================
#!/usr/bin/env perl
use warnings; #sed replacement for -w perl parameter

# makes unigram decoding-graph FSTs specific to each utterances, where the
# supplied top-n-words list together with the supervision text of the utterance are
# combined.

if (@ARGV != 1) {
  print STDERR "** Warning: this script is deprecated and will be removed.  See\n" .
               "** steps/cleanup/make_biased_lm_graphs.sh.\n" .
               "Usage: make_utterance_fsts.pl top-words-file.txt < text-archive > fsts-archive\n" .
               "e.g.: utils/sym2int.pl -f 2- data/lang/words.txt data/train/text | \\\n" .
               "  make_utterance_fsts.pl exp/foo/top_words.int | compile-train-graphs-fsts ... \n";
  exit(1);
}

($top_words_file) = @ARGV;

# Three-argument open, so that the file name is never interpreted as a mode.
open(F, "<", $top_words_file) || die "opening $top_words_file";

# Maps word-id -> probability mass taken from the top-words file.  Each line
# of that file is "<probability> <word-id>"; repeated word-ids accumulate.
%top_word_probs = ( );

while(<F>) {
  @A = split;
  (@A == 2 && $A[0] > 0.0) || die "Bad line $_ in $top_words_file";
  $A[1] =~ m/^[0-9]+$/ || die "Expecting numeric word-ids in $top_words_file: $_\n";
  $top_word_probs{$A[1]} += $A[0];
}
close(F);

# Each stdin line is "<utterance-id> <word-id1> <word-id2> ...".  For every
# utterance a single-state FST is written in text-archive form: one self-loop
# per word, with cost -log(top-word probability + relative frequency of the
# word in this utterance), and a final cost of -log(1 / number-of-words).
while (<STDIN>) {
  @A = split;
  @A > 0 || die "Empty line in text archive read from stdin\n";
  $utterance_id = shift @A;
  $num_words = @A + 0;  # length of array @A
  # The costs below divide by the number of words, so reject an empty
  # transcript with a clear message before any partial FST is written.
  $num_words > 0 ||
    die "Utterance $utterance_id has no words; cannot build its unigram FST\n";
  print "$utterance_id\n";
  %word_probs = %top_word_probs;
  foreach $w (@A) {
    $w =~ m/^[0-9]+$/ || die "Expecting numeric word-ids as stdin: $_";
    $word_probs{$w} += 1.0 / $num_words;
  }
  foreach $w (keys %word_probs) {
    $prob = $word_probs{$w};
    $prob > 0.0 || die "Word $w with bad probability $prob, utterance-id = $utterance_id\n";
    $cost = -log($prob);
    print "0 0 $w $w $cost\n";
  }
  $final_cost = -log(1.0 / $num_words);
  print "0 $final_cost\n";
  print "\n"; # Empty line terminates the FST in the text-archive format.
}


================================================
FILE: egs/steps/cleanup/make_utterance_graph.sh
================================================
#!/usr/bin/env bash

# Copyright 2014  Guoguo Chen
# Apache 2.0

# Begin configuration section.
# Defaults for the options listed in the usage message below.
tscale=1.0      # transition scale.
loopscale=0.1   # scale for self-loops.
cleanup=true
ngram_order=1
srilm_options="-wbdiscount"   # By default, use Witten-Bell discounting in SRILM
# End configuration section.

# Abort on the first failing command.
set -e

# Log the full command line.
echo "$0 $@"

[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1;

# Exactly four positional arguments are required; otherwise print usage.
if [ $# -ne 4 ]; then
  echo "This script builds one decoding graph for each utterance using the"
  echo "corresponding text in the given <text> file. If --ngram-order is 1,"
  echo "then utils/make_unigram_grammar.pl will be used to build the unigram"
  echo "language model. Otherwise SRILM will be used instead. You are supposed"
  echo "to have SRILM installed if --ngram-order is larger than 1. The format"
  echo "of the given <text> file is same as the transcript text files in data"
  echo "directory."
  echo ""
  echo "Usage: $0 [options] <text> <lang-dir> <model-dir> <graph-dir>"
  echo " e.g.: $0 data/train_si284_split/text \\"
  echo "                data/lang exp/tri2b/ exp/tri2b/graph_train_si284_split"
  echo ""
  echo "Options:"
  echo "    --ngram-order           # order of n-gram language model"
  echo "    --srilm-options         # options for ngram-count in SRILM tool"
  echo "    --tscale                # transition scale"
  echo "    --loopscale             # scale for self-loops"
  echo "    --cleanup               # if true, removes the intermediate files"
  exit 1;
fi

text=$1
lang=$2
model_dir=$3
graph_dir=$4

# Fail early if any required lang/model file is missing.
for f in $lang/L_disambig.fst $lang/words.txt $lang/oov.int \
  $model_dir/final.mdl $model_dir/tree; do
  if [ ! -f $f ]; then
    echo "$0: expected $f to exist"
    exit 1;
  fi
done

# Working area for the per-utterance graphs.
mkdir -p $graph_dir/sub_graphs

utils/lang/check_phones_compatible.sh $lang/phones.txt $model_dir/phones.txt

# If --ngram-order is larger than 1, we will have to use SRILM
# Look for ngram-count on the PATH first and fall back to the copy under
# $KALDI_ROOT/tools/srilm.  Expansions are quoted so that the tests behave
# correctly when a path contains whitespace.
if [ $ngram_order -gt 1 ]; then
  ngram_count=$(which ngram-count) || true
  if [ -z "$ngram_count" ]; then
    if uname -a | grep 64 >/dev/null; then # some kind of 64 bit...
      sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64
    else
      sdir=$KALDI_ROOT/tools/srilm/bin/i686
    fi
    if [ -f "$sdir/ngram-count" ]; then
      echo Using SRILM tools from $sdir
      export PATH="$PATH:$sdir"
    else
      echo You appear to not have SRILM tools installed, either on your path,
      echo or installed in $sdir.  See tools/install_srilm.sh for installation
      echo instructions.
      exit 1
    fi
  fi
fi

# Maps OOV words to the oov symbol.
# Only the integer id is needed below.  The textual form ($lang/oov.txt) was
# read into a variable that nothing used, and since that file is not among
# the inputs checked above, its absence aborted the script under "set -e".
oov=$(cat $lang/oov.int)

# Context width (N) and central position (P) of the tree, as printed by
# tree-info; used when building the context-dependency FST.
N=$(tree-info --print-args=false $model_dir/tree |\
  grep "context-width" | awk '{print $NF}')
P=$(tree-info --print-args=false $model_dir/tree |\
  grep "central-position" | awk '{print $NF}')

# Loops over all utterances.
# Start from an empty scp file; the loop below appends one line per utterance.
rm -f $graph_dir/sub_graphs/HCLG.fsts.scp

# Builds G, LG, CLG, Ha, HCLGa and finally HCLG for every utterance in $text.
# The transcripts are first passed through sym2int/int2sym so that
# out-of-vocabulary words are replaced by the OOV word.
#
# Lines are read with "read -r" and all transcript text is expanded inside
# double quotes, so that backslashes are preserved and tokens containing glob
# characters (e.g. "[noise]" or "*") are never replaced by matching file
# names from the current directory.
cat $text | utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt | \
 utils/int2sym.pl -f 2- $lang/words.txt | \
 while read -r line; do
  # First space-separated field is the utterance id, the rest is the text.
  # NOTE(review): as with the "cut -d ' ' -f 2-" used previously, a line
  # without any space (empty transcript) yields words == uttid; kept as is,
  # confirm whether empty transcripts can occur here.
  uttid=${line%% *}
  words=${line#* }

  echo "$0: processing utterance $uttid."

  wdir=$graph_dir/sub_graphs/$uttid
  mkdir -p $wdir

  # Compiles G.fst
  if [ $ngram_order -eq 1 ]; then
    printf '%s\n' "$words" > $wdir/text
    cat $wdir/text | utils/sym2int.pl --map-oov $oov -f 1- $lang/words.txt | \
      utils/make_unigram_grammar.pl | fstcompile |\
      fstarcsort --sort_type=ilabel > $wdir/G.fst || exit 1;
  else
     # Breaks the transcript into lines of at most 30000 words.
     printf '%s\n' "$words" | \
     perl -ane '@A = split; for ($n=0;$n<@A;$n++) { print "$A[$n] "; if(($n+1)%30000 == 0 || $n+1==@A) {print "\n";} }' \
     > $wdir/text
     # $srilm_options is intentionally unquoted: it may hold several
     # options, each of which must reach ngram-count as a separate argument.
     ngram-count -text $wdir/text -order $ngram_order $srilm_options -lm - | \
      arpa2fst --disambig-symbol=#0 \
             --read-symbol-table=$lang/words.txt - $wdir/G.fst || exit 1;
  fi
  fstisstochastic $wdir/G.fst || echo "$0: $uttid/G.fst not stochastic."

  # Builds LG.fst
  fsttablecompose $lang/L_disambig.fst $wdir/G.fst |\
    fstdeterminizestar --use-log=true | fstminimizeencoded |\
    fstarcsort --sort_type=ilabel > $wdir/LG.fst || exit 1;
  fstisstochastic $wdir/LG.fst || echo "$0: $uttid/LG.fst not stochastic."

  # Builds CLG.fst
  clg=$wdir/CLG_${N}_${P}.fst
  fstcomposecontext --context-size=$N --central-position=$P \
    --read-disambig-syms=$lang/phones/disambig.int \
    --write-disambig-syms=$wdir/disambig_ilabels_${N}_${P}.int \
    $wdir/ilabels_${N}_${P} < $wdir/LG.fst | fstdeterminize > $wdir/CLG.fst
  fstisstochastic $wdir/CLG.fst  || echo "$0: $uttid/CLG.fst not stochastic."

  make-h-transducer --disambig-syms-out=$wdir/disambig_tid.int \
    --transition-scale=$tscale $wdir/ilabels_${N}_${P} \
    $model_dir/tree $model_dir/final.mdl > $wdir/Ha.fst

  # Builds HCLGa.fst
  fsttablecompose $wdir/Ha.fst $wdir/CLG.fst | \
    fstdeterminizestar --use-log=true | \
    fstrmsymbols $wdir/disambig_tid.int | fstrmepslocal | \
    fstminimizeencoded > $wdir/HCLGa.fst
  fstisstochastic $wdir/HCLGa.fst ||\
    echo "$0: $uttid/HCLGa.fst is not stochastic"

  add-self-loops --self-loop-scale=$loopscale --reorder=true \
    $model_dir/final.mdl < $wdir/HCLGa.fst > $wdir/HCLG.fst

  # The stochasticity check on the final graph is only made when both
  # scales are 1.0.
  if [ $tscale == 1.0 -a $loopscale == 1.0 ]; then
    fstisstochastic $wdir/HCLG.fst ||\
      echo "$0: $uttid/HCLG.fst is not stochastic."
  fi

  echo "$uttid $wdir/HCLG.fst" >> $graph_dir/sub_graphs/HCLG.fsts.scp
  echo
 done

# Copies files from lang directory.
mkdir -p $graph_dir
cp -r $lang/* $graph_dir

# Records the last field of the "pdfs" line printed by am-info.
am-info --print-args=false $model_dir/final.mdl |\
 grep pdfs | awk '{print $NF}' > $graph_dir/num_pdfs

# Creates the graph table.
# The per-utterance HCLG.fst files listed in the scp written by the loop
# above are copied into one archive with an accompanying scp index.
fstcopy scp:$graph_dir/sub_graphs/HCLG.fsts.scp \
  "ark,scp:$graph_dir/HCLG.fsts,$graph_dir/HCLG.fsts.scp"

# Optionally removes the per-utterance working directories.
if $cleanup; then
  rm -r $graph_dir/sub_graphs
fi

exit 0;


================================================
FILE: egs/steps/cleanup/segment_long_utterances.sh
================================================
#!/usr/bin/env bash

# Copyright 2014  Guoguo Chen
#           2016  Vimal Manohar
# Apache 2.0

# This script performs segmentation of the input data based on the transcription
# and outputs segmented data along with the corresponding aligned transcription.
# The purpose of this script is to divide up the input data (which may consist
# of long recordings such as television shows or audiobooks) into segments which
# are of manageable length for further processing, along with the portion of the
# transcript that seems to match (aligns with) each segment.
# This is the lightly-supervised training scenario, where the input
# transcription is not expected to be completely clean and may have
# significant errors.
# See "JHU Kaldi System for Arabic MGB-3 ASR Challenge using Diarization,
# Audio-transcript Alignment and Transfer Learning": Vimal Manohar, Daniel
# Povey, Sanjeev Khudanpur, ASRU 2017
# (http://www.danielpovey.com/files/2017_asru_mgb3.pdf) for details.
# The output data is not necessarily particularly clean; you can run
# steps/cleanup/clean_and_segment_data.sh on the output in order to
# further clean it and eliminate data where the transcript doesn't seem to
# match.

. ./path.sh

# Abort on the first failing command, on a failure anywhere in a pipeline,
# and on use of an unset variable.
set -e
set -o pipefail
set -u

# Uniform segmentation options
# (passed to utils/data/get_uniform_subsegments.py and
# utils/data/modify_speaker_info.sh in stages 1 and 2)
max_segment_duration=30
overlap_duration=5
seconds_per_spk_max=30

# Decode options
# (graph_opts is passed to steps/cleanup/make_biased_lm_graphs.sh; the rest is
# used by the decoding and ctm stages)
graph_opts=
beam=15.0
lattice_beam=1.0
nj=4
lmwt=10

# TF-IDF similarity search options
max_words=1000
num_neighbors_to_search=1   # Number of neighboring documents to search around the one retrieved based on maximum tf-idf similarity.
neighbor_tfidf_threshold=0.5

align_full_hyp=false  # Align full hypothesis i.e. traceback from the end to get the alignment.

# First-pass segmentation opts
# These options are passed to the script
# steps/cleanup/internal/segment_ctm_edits_mild.py
segmentation_extra_opts=
min_split_point_duration=0.1
max_deleted_words_kept_when_merging=1
max_wer=50
max_segment_length_for_merging=60
max_bad_proportion=0.75
max_intersegment_incorrect_words_length=1
max_segment_length_for_splitting=10
hard_max_segment_length=15
min_silence_length_to_split_at=0.3
min_non_scored_length_to_split_at=0.3

# Stages with a number below this value are skipped.
stage=-1

cmd=run.pl

. utils/parse_options.sh

# The script takes either 5 positional arguments, or 7 when the optional
# <text-in> <utt2text> pair is given; anything else prints the usage message.
# (The example below uses the 5-argument form; it previously listed 6
# arguments, which this very check rejects.)
if [ $# -ne 5 ] && [ $# -ne 7 ]; then
    cat <<EOF
Usage: $0 [options] <model-dir> <lang> <data-in> [<text-in> <utt2text>] <segmented-data-out> <work-dir>
 e.g.: $0 exp/wsj_tri2b data/lang_nosp data/train_long data/train_reseg exp/segment_wsj_long_utts_train
This script performs segmentation of the data in <data-in> and writes out the
segmented data (with a segments file) to
<segmented-data-out> along with the corresponding aligned transcription.
Note: If <utt2text> is not provided, the "text" file in <data-in> is used as the
raw transcripts to train biased LM for the utterances.
If <utt2text> is provided, then it should be a mapping from the utterance-ids in
<data-in> to the transcript-keys in the file <text-in>, which will be
used to train biased LMs for the utterances.
The purpose of this script is to divide up the input data (which may consist of
long recordings such as television shows or audiobooks) into segments which are
of manageable length for further processing, along with the portion of the
transcript that seems to match each segment.
The output data is not necessarily particularly clean; you are advised to run
steps/cleanup/clean_and_segment_data.sh on the output in order to further clean
it and eliminate data where the transcript doesn't seem to match.
EOF
    exit 1
fi

# Positional arguments.  The first three are always the same; the remaining
# ones depend on whether the optional <text-in> <utt2text> pair was given.
srcdir=$1
lang=$2
data=$3

case $# in
  7)
    text=$4
    utt2text=$5
    out_data=$6
    dir=$7
    ;;
  *)
    # No separate transcript: use the text file of the input data directory.
    text=$data/text
    utt2text=
    out_data=$4
    dir=$5
    ;;
esac
# utt2text (when given) has to exist as well.
extra_files=$utt2text

# Fail early if any required input file is missing.
for f in $data/feats.scp $text $extra_files $srcdir/tree \
  $srcdir/final.mdl $srcdir/cmvn_opts; do
  [ -f $f ] || { echo "$0: Could not find file $f"; exit 1; }
done

data_id=$(basename $data)
mkdir -p $dir

# Data directory that will hold the uniformly segmented version of the input.
data_uniform_seg=$dir/${data_id}_uniform_seg

frame_shift=$(utils/data/get_frame_shift.sh $data)

# Stage 1: cut the input into uniform pieces of about $max_segment_duration
# seconds, overlapping by $overlap_duration seconds, so that each piece is
# short enough to be decoded.
# A diarization step will be added in the future.
if [ $stage -le 1 ]; then
  echo "$0: Stage 1 (Splitting data directory $data into uniform segments)"

  utils/data/get_utt2dur.sh $data
  # Create a segments file if the input data directory does not have one.
  [ -f $data/segments ] || \
    utils/data/get_segments_for_data.sh $data > $data/segments

  # A remainder of up to half the maximum segment duration is allowed.
  utils/data/get_uniform_subsegments.py \
    --max-segment-duration=$max_segment_duration \
    --overlap-duration=$overlap_duration \
    --max-remaining-duration=$(perl -e "print $max_segment_duration / 2.0") \
    $data/segments > $dir/uniform_sub_segments
fi

# Stage 2: build the data directory for the uniform sub-segments.
if [ $stage -le 2 ]; then
  echo "$0: Stage 2 (Prepare uniform sub-segmented data directory)"
  rm -r $data_uniform_seg || true

  # When a per-speaker duration limit is set, the sub-segmented data first
  # goes to a temporary directory and the speaker information is modified on
  # the way to the final one; otherwise it is written to the final directory
  # directly.
  if [ -n "$seconds_per_spk_max" ]; then
    subsegment_out=$dir/${data_id}_uniform_seg.temp
  else
    subsegment_out=$data_uniform_seg
  fi

  utils/data/subsegment_data_dir.sh \
    $data $dir/uniform_sub_segments $subsegment_out

  if [ -n "$seconds_per_spk_max" ]; then
    utils/data/modify_speaker_info.sh --seconds-per-spk-max $seconds_per_spk_max \
      $subsegment_out $data_uniform_seg
  fi

  utils/fix_data_dir.sh $data_uniform_seg

  # Compute new cmvn stats for the segmented data directory
  steps/compute_cmvn_stats.sh $data_uniform_seg/
fi

# Directory holding the graph table for the uniform sub-segments.
graph_dir=$dir/graphs_uniform_seg

if [ $stage -le 3 ]; then
  echo "$0: Stage 3 (Building biased-language-model decoding graphs)"

  # Copy the model files into the work directory; the files on the last two
  # cp lines are optional, so failures to copy them are ignored.
  cp $srcdir/final.mdl $dir
  cp $srcdir/tree $dir
  cp $srcdir/cmvn_opts $dir
  cp $srcdir/{splice_opts,delta_opts,final.mat,final.alimdl} $dir 2>/dev/null || true
  cp $srcdir/phones.txt $dir 2>/dev/null || true

  mkdir -p $graph_dir
  
  # Do not use more jobs than there are lines in the transcript file.
  n_reco=$(cat $text | wc -l) || exit 1
  nj_reco=$nj

  if [ $nj -gt $n_reco ]; then
    nj_reco=$n_reco
  fi

  # Make graphs w.r.t. the original text (usually recording-level)
  steps/cleanup/make_biased_lm_graphs.sh $graph_opts \
    --nj $nj_reco --cmd "$cmd" $text \
    $lang $dir $dir/graphs
  # The first two fields of each uniform_sub_segments line are mapped, via
  # field 2, to an entry of the graph scp.  With <utt2text>, field 2 is first
  # mapped to its transcript key.
  if [ -z "$utt2text" ]; then
    # and then copy it to the sub-segments.
    cat $dir/uniform_sub_segments | awk '{print $1" "$2}' | \
      utils/apply_map.pl -f 2 $dir/graphs/HCLG.fsts.scp | \
      sort -k1,1 > \
      $graph_dir/HCLG.fsts.scp
  else
    # and then copy it to the sub-segments.
    cat $dir/uniform_sub_segments | awk '{print $1" "$2}' | \
      utils/apply_map.pl -f 2 $utt2text | \
      utils/apply_map.pl -f 2 $dir/graphs/HCLG.fsts.scp | \
      sort -k1,1 > \
      $graph_dir/HCLG.fsts.scp
  fi

  cp $lang/words.txt $graph_dir
  cp -r $lang/phones $graph_dir
  # num_pdfs is copied only if the graph-building step produced it.
  [ -f $dir/graphs/num_pdfs ] && cp $dir/graphs/num_pdfs $graph_dir/
fi

# Lattices from the biased-LM decoding go here.
decode_dir=$dir/lats
mkdir -p $decode_dir

# Stage 4: decode the uniform sub-segments with their biased graphs.
if [ $stage -le 4 ]; then
  echo "$0: Decoding with biased language models..."

  # Use the fMLLR variant of the decoding script when the source model
  # directory contains fMLLR transforms (trans.1).
  decode_script=steps/cleanup/decode_segmentation.sh
  if [ -f $srcdir/trans.1 ]; then
    decode_script=steps/cleanup/decode_fmllr_segmentation.sh
  fi

  $decode_script \
    --beam $beam --lattice-beam $lattice_beam --nj $nj --cmd "$cmd --mem 4G" \
    --skip-scoring true --allow-partial false \
    $graph_dir $data_uniform_seg $decode_dir
fi

# Stage 5: get a ctm, silence entries included, from the lattices.
if [ $stage -le 5 ]; then
  steps/get_ctm_fast.sh --frame_shift $frame_shift --lmwt $lmwt --cmd "$cmd --mem 4G" \
    --print-silence true \
    $data_uniform_seg $lang $decode_dir $decode_dir/ctm_$lmwt
fi

# Split the original text into documents, over which we can do
# searching reasonably efficiently. Also get a mapping from the original
# text to the created documents (i.e. text2doc)
# Since the Smith-Waterman alignment is linear in the length of the
# text, we want to keep it reasonably small (a few thousand words).

if [ $stage -le 6 ]; then
  # Split the reference text into documents.
  mkdir -p $dir/docs

  # text2doc is a mapping from the original transcript to the documents
  # it is split into.
  # The format is
  # <original-transcript> <doc1> <doc2> ...
  # The splitting script writes doc2text (one document per line, followed by
  # its transcript) and the documents themselves; text2doc is obtained by
  # inverting doc2text.
  steps/cleanup/internal/split_text_into_docs.pl --max-words $max_words \
    $text $dir/docs/doc2text $dir/docs/docs.txt
  utils/utt2spk_to_spk2utt.pl $dir/docs/doc2text > $dir/docs/text2doc
fi

# Stage 7: compute TF-IDF for the reference documents, split by job.
if [ $stage -le 7 ]; then
  # Get TF-IDF for the reference documents.
  echo $nj > $dir/docs/num_jobs

  utils/split_data.sh $data_uniform_seg $nj

  mkdir -p $dir/docs/split$nj/

  # First compute IDF stats
  # (over all documents; the stats are reused by the per-job runs below)
  $cmd $dir/log/compute_source_idf_stats.log \
    steps/cleanup/internal/compute_tf_idf.py \
    --tf-weighting-scheme="raw" \
    --idf-weighting-scheme="log" \
    --output-idf-stats=$dir/docs/idf_stats.txt \
    $dir/docs/docs.txt $dir/docs/src_tf_idf.txt

  # Split documents so that they can be accessed easily by parallel jobs.
  mkdir -p $dir/docs/split$nj/
  sdir=$dir/docs/split$nj
  for n in `seq $nj`; do

    # old2new_utts is a mapping from the original segments to the
    # new segments created by uniformly segmenting.
    # The format is <old-utterance> <new-utt1> <new-utt2> ...
    utils/filter_scp.pl $data_uniform_seg/split$nj/$n/utt2spk $dir/uniform_sub_segments | \
      cut -d ' ' -f 1,2 | utils/utt2spk_to_spk2utt.pl > $sdir/old2new_utts.$n.txt

    if [ ! -z "$utt2text" ]; then
      # utt2text, if provided, is a mapping from the <old-utterance> to
      # <original-transcript>.
      # Since text2doc is mapping from <original-transcript> to documents, we
      # first have to find the original-transcripts that are in the current
      # split.
      utils/filter_scp.pl $sdir/old2new_utts.$n.txt $utt2text | \
        cut -d ' ' -f 2 | sort -u | \
        utils/filter_scp.pl /dev/stdin $dir/docs/text2doc > $sdir/text2doc.$n
    else
      utils/filter_scp.pl $sdir/old2new_utts.$n.txt \
        $dir/docs/text2doc > $sdir/text2doc.$n
    fi

    # Keep only the documents that belong to the transcripts of this split.
    utils/spk2utt_to_utt2spk.pl $sdir/text2doc.$n | \
      utils/filter_scp.pl /dev/stdin $dir/docs/docs.txt > \
      $sdir/docs.$n.txt
  done

  # Compute TF-IDF for the source documents.
  $cmd JOB=1:$nj $dir/docs/log/get_tfidf_for_source_texts.JOB.log \
    steps/cleanup/internal/compute_tf_idf.py \
      --tf-weighting-scheme="raw" \
      --idf-weighting-scheme="log" \
      --input-idf-stats=$dir/docs/idf_stats.txt \
      $sdir/docs.JOB.txt $sdir/src_tf_idf.JOB.txt

  sdir=$dir/docs/split$nj
  # Make $sdir an absolute pathname.
  sdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $sdir ${PWD}`

  # Write an scp mapping each original transcript to the (absolute) path of a
  # per-job TF-IDF file.  A transcript listed by several splits keeps the
  # entry of the first split that lists it.
  for n in `seq $nj`; do
    awk -v f="$sdir/src_tf_idf.$n.txt" '{print $1" "f}' \
      $sdir/text2doc.$n
  done | perl -ane 'BEGIN { %tfidfs = (); }
  {
    if (!defined $tfidfs{$F[0]}) {
      $tfidfs{$F[0]} = $F[1];
    }
  }
  END {
  while(my ($k, $v) = each %tfidfs) {
    print "$k $v\n";
  } }' > $dir/docs/source2tf_idf.scp
fi

if [ $stage -le 8 ]; then
  echo "$0: using default values of non-scored words..."

  # At the level of this script we just hard-code it that non-scored words are
  # those that map to silence phones (which is what get_non_scored_words.py
  # gives us), although this could easily be made user-configurable.  This list
  # of non-scored words affects the behavior of several of the data-cleanup
  # scripts; essentially, we view the non-scored words as negotiable when it
  # comes to the reference transcript, so we'll consider changing the reference
  # to match the hyp when it comes to these words.
  steps/cleanup/internal/get_non_scored_words.py $lang > $dir/non_scored_words.txt
fi

if [ $stage -le 9 ]; then
  sdir=$dir/query_docs/split$nj
  mkdir -p $sdir

  # Compute TF-IDF for the query documents (decode hypotheses).
  # The output is an archive of TF-IDF indexed by the query.
  $cmd JOB=1:$nj $decode_dir/ctm_$lmwt/log/compute_query_tf_idf.JOB.log \
    steps/cleanup/internal/ctm_to_text.pl --non-scored-words $dir/non_scored_words.txt \
      $decode_dir/ctm_$lmwt/ctm.JOB \| \
    steps/cleanup/internal/compute_tf_idf.py \
      --tf-weighting-scheme="normalized" \
      --idf-weighting-scheme="log" \
      --input-idf-stats=$dir/docs/idf_stats.txt \
      --accumulate-over-docs=false \
      - $sdir/query_tf_idf.JOB.ark.txt

  # The relevant documents can be found using TF-IDF similarity and nearby
  # documents can also be picked for the Smith-Waterman alignment stage.

  # Get a mapping from the new utterance-ids to original transcripts
  if [ -z "$utt2text" ]; then
    awk '{print $1" "$2}' $dir/uniform_sub_segments > \
      $dir/new2orig_utt
  else
    awk '{print $1" "$2}' $dir/uniform_sub_segments | \
      utils/apply_map.pl -f 2 $utt2text > \
      $dir/new2orig_utt
  fi

  # The query TF-IDFs are all indexed by the utterance-id of the sub-segments.
  # The source TF-IDFs use the document-ids created by splitting the reference
  # text into documents.
  # For each query, we need to retrieve the documents that were created from
  # the same original utterance that the sub-segment was from. For this,
  # we have to load the source TF-IDF that has those documents. This
  # information is provided using the option --source-text-id2tfidf.
  # The output of this script is a file where the first column is the
  # query-id (i.e. sub-segment-id) and the remaining columns, which number at
  # least one and at most (1 + 2 * num-neighbors-to-search), are the
  # document-ids of the retrieved documents.
  $cmd JOB=1:$nj $dir/log/retrieve_similar_docs.JOB.log \
    steps/cleanup/internal/retrieve_similar_docs.py \
      --query-tfidf=$dir/query_docs/split$nj/query_tf_idf.JOB.ark.txt \
      --source-text-id2tfidf=$dir/docs/source2tf_idf.scp \
      --source-text-id2doc-ids=$dir/docs/text2doc \
      --query-id2source-text-id=$dir/new2orig_utt \
      --num-neighbors-to-search=$num_neighbors_to_search \
      --neighbor-tfidf-threshold=$neighbor_tfidf_threshold \
      --relevant-docs=$dir/query_docs/split$nj/relevant_docs.JOB.txt

  $cmd JOB=1:$nj $decode_dir/ctm_$lmwt/log/get_ctm_edits.JOB.log \
    steps/cleanup/internal/stitch_documents.py \
      --query2docs=$dir/query_docs/split$nj/relevant_docs.JOB.txt \
      --input-documents=$dir/docs/split$nj/docs.JOB.txt \
      --output-documents=- \| \
    steps/cleanup/internal/align_ctm_ref.py --eps-symbol='"<eps>"' \
      --oov-word="'`cat $lang/oov.txt`'" --symbol-table=$lang/words.txt \
      --hyp-format=CTM --align-full-hyp=$align_full_hyp \
      --hyp=$decode_dir/ctm_$lmwt/ctm.JOB --ref=- \
      --output=$decode_dir/ctm_$lmwt/ctm_edits.JOB

  for n in `seq $nj`; do
    cat $decode_dir/ctm_$lmwt/ctm_edits.$n
  done > $decode_dir/ctm_$lmwt/ctm_edits

fi

if [ $stage -le 10 ]; then
  $cmd $dir/log/resolve_ctm_edits.log \
    steps/cleanup/internal/resolve_ctm_edits_overlaps.py \
    ${data_uniform_seg}/segments $decode_dir/ctm_$lmwt/ctm_edits $dir/ctm_edits
fi

if [ $stage -le 11 ]; then
  echo "$0: modifying ctm-edits file to allow repetitions [for dysfluencies] and "
  echo "   ... to fix reference mismatches involving non-scored words. "

  $cmd $dir/log/modify_ctm_edits.log \
    steps/cleanup/internal/modify_ctm_edits.py --verbose=3 $dir/non_scored_words.txt \
    $dir/ctm_edits $dir/ctm_edits.modified

  echo "   ... See $dir/log/modify_ctm_edits.log for details and stats, including"
  echo " a list of commonly-repeated words."
fi

if [ $stage -le 12 ]; then
  echo "$0: applying 'taint' markers to ctm-edits file to mark silences and"
  echo "  ... non-scored words that are next to errors."
  $cmd $dir/log/taint_ctm_edits.log \
       steps/cleanup/internal/taint_ctm_edits.py --remove-deletions=false \
       $dir/ctm_edits.modified $dir/ctm_edits.tainted
  echo "... Stats, including global cor/ins/del/sub stats, are in $dir/log/taint_ctm_edits.log."
fi

if [ $stage -le 13 ]; then
  echo "$0: creating segmentation from ctm-edits file."

  segmentation_opts=(
  --min-split-point-duration=$min_split_point_duration
  --max-deleted-words-kept-when-merging=$max_deleted_words_kept_when_merging
  --merging.max-wer=$max_wer
  --merging.max-segment-length=$max_segment_length_for_merging
  --merging.max-bad-proportion=$max_bad_proportion
  --merging.max-intersegment-incorrect-words-length=$max_intersegment_incorrect_words_length
  --splitting.max-segment-length=$max_segment_length_for_splitting
  --splitting.hard-max-segment-length=$hard_max_segment_length
  --splitting.min-silence-length=$min_silence_length_to_split_at
  --splitting.min-non-scored-length=$min_non_scored_length_to_split_at
  )

  $cmd $dir/log/segment_ctm_edits.log \
    steps/cleanup/internal/segment_ctm_edits_mild.py \
      ${segmentation_opts[@]} $segmentation_extra_opts \
      --oov-symbol-file=$lang/oov.txt \
      --ctm-edits-out=$dir/ctm_edits.segmented \
      --word-stats-out=$dir/word_stats.txt \
      $dir/non_scored_words.txt \
      $dir/ctm_edits.tainted $dir/text $dir/segments

  echo "$0: contents of $dir/log/segment_ctm_edits.log are:"
  cat $dir/log/segment_ctm_edits.log
  echo "For word-level statistics on p(not-being-in-a-segment), with 'worst' words at the top,"
  echo "see $dir/word_stats.txt"
  echo "For detailed utterance-level debugging information, see $dir/ctm_edits.segmented"
fi

mkdir -p $out_data
if [ $stage -le 14 ]; then
  utils/data/subsegment_data_dir.sh $data_uniform_seg \
    $dir/segments $dir/text $out_data
fi


================================================
FILE: egs/steps/cleanup/segment_long_utterances_nnet3.sh
================================================
#!/usr/bin/env bash

# Copyright 2014  Guoguo Chen
#           2016  Vimal Manohar
# Apache 2.0


# This script is similar to steps/cleanup/segment_long_utterances.sh, but
# uses nnet3 acoustic model instead of GMM acoustic model for decoding.
# This script performs segmentation of the input data based on the transcription
# and outputs segmented data along with the corresponding aligned transcription.
# The purpose of this script is to divide up the input data (which may consist
# of long recordings such as television shows or audiobooks) into segments which
# are of manageable length for further processing, along with the portion of the
# transcript that seems to match (aligns with) each segment.
# This is the light-supervised training scenario, where the input
# transcription is not expected to be completely clean and may have
# significant errors.
# See "JHU Kaldi System for Arabic MGB-3 ASR Challenge using Diarization,
# Audio-transcript Alignment and Transfer Learning": Vimal Manohar, Daniel
# Povey, Sanjeev Khudanpur, ASRU 2017
# (http://www.danielpovey.com/files/2017_asru_mgb3.pdf) for details.
# The output data is not necessarily particularly clean; you can run
# steps/cleanup/clean_and_segment_data_nnet3.sh on the output in order to
# further clean it and eliminate data where the transcript doesn't seem to
# match.


# Abort on the first failing command, on a failure anywhere in a pipeline, and
# on any reference to an unset variable.
set -e
set -o pipefail
set -u

# Every variable in this section is a default that is in effect unless
# overridden; utils/parse_options.sh is sourced at the end of the section.

# A stage numbered N is run only if stage <= N.
stage=-1
# Job-dispatch wrapper used for the parallel steps (e.g. run.pl, queue.pl).
cmd=run.pl
nj=4

# Uniform segmentation options
# (passed to utils/data/get_uniform_subsegments.py in stage 1; if
# seconds_per_spk_max is non-empty it is passed to
# utils/data/modify_speaker_info.sh in stage 2).
max_segment_duration=30
overlap_duration=5
seconds_per_spk_max=30

# Decode options
# graph_opts is passed through to steps/cleanup/make_biased_lm_graphs.sh.
# If <model-dir>/frame_subsampling_factor exists, scale_opts, acwt and lmwt are
# replaced by chain-model values when they are still at the defaults below.
graph_opts=
scale_opts=  # for making the graphs
beam=15.0
lattice_beam=1.0
lmwt=10
acwt=0.1  # Just a default value, used for adaptation and beam-pruning..

# Contexts must ideally match training
extra_left_context=0  # Set to some large value, typically 40 for LSTM (must match training)
extra_right_context=0
extra_left_context_initial=-1
extra_right_context_final=-1
frames_per_chunk=150

# i-vector options
extractor=    # i-Vector extractor. If provided, will extract i-vectors.
              # Required if the network was trained with i-vector extractor.
use_vad=false # Use energy-based VAD for i-vector extraction

# TF-IDF similarity search options
# max_words is passed to split_text_into_docs.pl as --max-words.
max_words=1000
num_neighbors_to_search=1   # Number of neighboring documents to search around the one retrieved based on maximum tf-idf similarity.
neighbor_tfidf_threshold=0.5

align_full_hyp=false  # Align full hypothesis i.e. traceback from the end to get the alignment.

# First-pass segmentation opts
# These options are passed to the script
# steps/cleanup/internal/segment_ctm_edits_mild.py
segmentation_extra_opts=
min_split_point_duration=0.1
max_deleted_words_kept_when_merging=1
max_wer=50
max_segment_length_for_merging=60
max_bad_proportion=0.75
max_intersegment_incorrect_words_length=1
max_segment_length_for_splitting=10
hard_max_segment_length=15
min_silence_length_to_split_at=0.3
min_non_scored_length_to_split_at=0.3


# path.sh sets up the environment; parse_options.sh consumes the leading
# --option arguments and leaves only the positional arguments in "$@".
. ./path.sh
. utils/parse_options.sh

# Exactly 5 or 7 positional arguments are accepted: <text-in> and <utt2text>
# must be given together or not at all.  Anything else prints the usage
# message and exits with status 1.
if [ $# -ne 5 ] && [ $# -ne 7 ]; then
  cat <<EOF
Usage: $0 [--extractor <ivector-extractor>] [options] <model-dir> <lang> <data-in> [<text-in> <utt2text>] <segmented-data-out> <work-dir>
 e.g.: $0 exp/wsj_tri2b data/lang_nosp data/train_long data/train_reseg exp/segment_wsj_long_utts_train
   or: $0 exp/wsj_tri2b data/lang_nosp data/train_long data/train_long/text data/train_long/utt2text data/train_reseg exp/segment_wsj_long_utts_train
This script performs segmentation of the data in <data-in> and writes out the
segmented data (with a segments file) to
<segmented-data-out> along with the corresponding aligned transcription.
Note: If <utt2text> is not provided, the "text" file in <data-in> is used as the
raw transcripts to train biased LM for the utterances.
If <utt2text> is provided, then it should be a mapping from the utterance-ids in
<data-in> to the transcript-keys in the file <text-in>, which will be
used to train biased LMs for the utterances.
The purpose of this script is to divide up the input data (which may consist of
long recordings such as television shows or audiobooks) into segments which are
of manageable length for further processing, along with the portion of the
transcript that seems to match each segment.
The output data is not necessarily particularly clean; you are advised to run
steps/cleanup/clean_and_segment_data_nnet3.sh on the output in order to further
clean it and eliminate data where the transcript doesn't seem to match.
  main options (for others, see top of script file):
    --stage <n>             # stage to run from, to enable resuming from partially
                            # completed run (default: -1)
    --cmd '$cmd'            # command to submit jobs with (e.g. run.pl, queue.pl)
    --nj <n>                # number of parallel jobs to use in graph creation and
                            # decoding
    --graph-opts 'opts'         # Additional options to make_biased_lm_graphs.sh.
                                # Please run steps/cleanup/make_biased_lm_graphs.sh
                                # without arguments to see allowed options.
    --segmentation-extra-opts 'opts'  # Additional options to segment_ctm_edits_mild.py.
                                # Please run steps/cleanup/internal/segment_ctm_edits_mild.py
                                # without arguments to see allowed options.
    --align-full-hyp <true|false>  # If true, align full hypothesis
                                   # i.e. traceback from the end to get the alignment.
                                   # This is different from the normal
                                   # Smith-Waterman alignment, where the
                                   # traceback will be from the maximum score.
    --extractor <extractor>     # i-vector extractor directory if i-vector is
                                # to be used during decoding. Must match
                                # the extractor used for training neural-network.
    --use-vad <true|false>      # If true, uses energy-based VAD to apply frame weights
                                # for i-vector stats extraction
EOF
  exit 1
fi

# Positional arguments common to both invocation forms.
srcdir=$1
lang=$2
data=$3

# Defaults for the 5-argument form: transcripts come from <data-in>/text and
# there is no utterance-to-transcript mapping.
text=$data/text
utt2text=
extra_files=
case $# in
  7)
    # 7-argument form: explicit transcript file plus utt2text mapping.
    text=$4
    utt2text=$5
    out_data=$6
    dir=$7
    extra_files="$utt2text"
    ;;
  *)
    out_data=$4
    dir=$5
    ;;
esac

# The i-vector extractor model is required only when --extractor was given.
if [ -n "$extractor" ]; then
  extra_files="$extra_files $extractor/final.ie"
fi

# Fail early if any required input file is missing.
for f in $data/feats.scp $text $extra_files $srcdir/tree \
  $srcdir/final.mdl $srcdir/cmvn_opts; do
  [ -f $f ] && continue
  echo "$0: Could not find file $f"
  exit 1
done

# Copy the model files needed for decoding into the work directory.  The
# optional files (feature transforms, alignment model, subsampling factor) are
# copied only if present; their absence is not an error.
data_id=`basename $data`
mkdir -p $dir
cp $srcdir/final.mdl $dir
cp $srcdir/tree $dir
cp $srcdir/cmvn_opts $dir
cp $srcdir/{splice_opts,delta_opts,final.mat,final.alimdl} $dir 2>/dev/null || true
cp $srcdir/frame_subsampling_factor $dir 2>/dev/null || true

# The presence of frame_subsampling_factor is taken as the sign of a chain
# model.  Options still at their defaults are switched to chain values;
# anything the user set explicitly is left alone.
if [ -f $srcdir/frame_subsampling_factor ]; then
  echo "$0: guessing that this is a chain system, checking parameters."
  # Quoted: a user-supplied --scale-opts normally contains several words, which
  # would make an unquoted "[ -z ... ]" a syntax error.
  if [ -z "$scale_opts" ]; then
    echo "$0: setting scale_opts"
    scale_opts="--self-loop-scale=1.0 --transition-scale=1.0"
  fi
  if [ "$acwt" == 0.1 ]; then
    echo "$0: setting acwt=1.0"
    acwt=1.0
  fi
  if [ "$lmwt" == 10 ]; then
    # lmwt is used verbatim in the name of the ctm_<lmwt> directory, so the
    # message reports the exact value that is set.
    echo "$0: setting lmwt=1"
    lmwt=1
  fi
fi


utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt
cp $lang/phones.txt $dir

# Data directory holding the uniformly sub-segmented version of <data-in>.
data_uniform_seg=$dir/${data_id}_uniform_seg

# First we split the data into segments of around 30s long, on which
# it would be possible to do a decoding.
# A diarization step will be added in the future.
if [ $stage -le 1 ]; then
  echo "$0: Stage 1 (Splitting data directory $data into uniform segments)"

  utils/data/get_utt2dur.sh $data
  if [ ! -f $data/segments ]; then
    # Write to a temporary file and rename it only on success.  Redirecting
    # directly into $data/segments would leave an empty or truncated file
    # behind if the command failed, and a rerun would then silently use it
    # because the file exists.
    utils/data/get_segments_for_data.sh $data > $data/segments.tmp
    mv $data/segments.tmp $data/segments
  fi

  # Sub-segments are written as lines whose first two fields are
  # <new-utt-id> <old-utt-id>; later stages rely on that.
  utils/data/get_uniform_subsegments.py \
    --max-segment-duration=$max_segment_duration \
    --overlap-duration=$overlap_duration \
    --max-remaining-duration=$(perl -e "print $max_segment_duration / 2.0") \
    $data/segments > $dir/uniform_sub_segments
fi

if [ $stage -le 2 ]; then
  echo "$0: Stage 2 (Prepare uniform sub-segmented data directory)"
  # -f: the directory does not exist on a first run, which is not an error and
  # should not print one.
  rm -rf $data_uniform_seg

  if [ ! -z "$seconds_per_spk_max" ]; then
    # Build the sub-segmented directory in a temporary location, then limit
    # the amount of data per speaker while copying it to its final place.
    utils/data/subsegment_data_dir.sh \
      $data $dir/uniform_sub_segments $dir/${data_id}_uniform_seg.temp

    utils/data/modify_speaker_info.sh --seconds-per-spk-max $seconds_per_spk_max \
      $dir/${data_id}_uniform_seg.temp $data_uniform_seg
  else
    utils/data/subsegment_data_dir.sh \
      $data $dir/uniform_sub_segments $data_uniform_seg
  fi

  utils/fix_data_dir.sh $data_uniform_seg

  # Compute new cmvn stats for the segmented data directory
  steps/compute_cmvn_stats.sh $data_uniform_seg/
fi

# Directory holding the per-sub-segment decoding graph table.
graph_dir=$dir/graphs_uniform_seg

if [ $stage -le 3 ]; then
  echo "$0: Stage 3 (Building biased-language-model decoding graphs)"

  mkdir -p $graph_dir

  # Do not use more graph-building jobs than there are transcripts.
  n_reco=$(wc -l < $text) || exit 1
  if [ $n_reco -lt $nj ]; then
    nj_reco=$n_reco
  else
    nj_reco=$nj
  fi

  # Build one biased-LM graph per entry of the original text (usually
  # recording-level) ...
  steps/cleanup/make_biased_lm_graphs.sh $graph_opts \
    --scale-opts "$scale_opts" \
    --nj $nj_reco --cmd "$cmd" $text \
    $lang $dir $dir/graphs

  # ... then point every sub-segment at the graph of the transcript it came
  # from.  With utt2text the original utterance-id is first mapped to its
  # transcript key.
  if [ -n "$utt2text" ]; then
    awk '{print $1" "$2}' $dir/uniform_sub_segments | \
      utils/apply_map.pl -f 2 $utt2text | \
      utils/apply_map.pl -f 2 $dir/graphs/HCLG.fsts.scp | \
      sort -k1,1 > $graph_dir/HCLG.fsts.scp
  else
    awk '{print $1" "$2}' $dir/uniform_sub_segments | \
      utils/apply_map.pl -f 2 $dir/graphs/HCLG.fsts.scp | \
      sort -k1,1 > $graph_dir/HCLG.fsts.scp
  fi

  cp $lang/words.txt $graph_dir
  cp -r $lang/phones $graph_dir
  # num_pdfs is optional; copy it along only when the graph step produced it.
  if [ -f $dir/graphs/num_pdfs ]; then
    cp $dir/graphs/num_pdfs $graph_dir/
  fi
fi

# Lattices (and later the CTMs) are written under this directory.
decode_dir=$dir/lats
mkdir -p $decode_dir

# online_ivector_dir stays empty unless an extractor was supplied; the decode
# stage adds --online-ivector-dir only when it is non-empty.
online_ivector_dir=
if [ -n "$extractor" ]; then
  online_ivector_dir=$dir/ivectors_$(basename $data_uniform_seg)
fi

if [ -n "$online_ivector_dir" ] && [ $stage -le 4 ]; then
  # Compute energy-based VAD
  if $use_vad; then
    steps/compute_vad_decision.sh $data_uniform_seg \
      $data_uniform_seg/log $data_uniform_seg/data
  fi

  steps/online/nnet2/extract_ivectors_online.sh \
    --nj $nj --cmd "$cmd --mem 4G" --use-vad $use_vad \
    $data_uniform_seg $extractor $online_ivector_dir
fi

if [ $stage -le 5 ]; then
  echo "$0: Decoding with biased language models..."

  # Context and chunking options forwarded unchanged to the decoder.
  context_opts=(
    --extra-left-context $extra_left_context
    --extra-right-context $extra_right_context
    --extra-left-context-initial $extra_left_context_initial
    --extra-right-context-final $extra_right_context_final
    --frames-per-chunk $frames_per_chunk
  )

  steps/cleanup/decode_segmentation_nnet3.sh \
    --acwt $acwt \
    --beam $beam --lattice-beam $lattice_beam --nj $nj --cmd "$cmd --mem 4G" \
    --skip-scoring true --allow-partial false \
    ${context_opts[@]} \
    ${online_ivector_dir:+--online-ivector-dir $online_ivector_dir} \
    $graph_dir $data_uniform_seg $decode_dir
fi

# When the model has a frame subsampling factor, the frame shift handed to the
# CTM script is formed textually as "0.0<factor>" (e.g. factor 3 -> 0.03).
frame_shift_opt=
if [ -f $srcdir/frame_subsampling_factor ]; then
  subsampling_factor=$(cat $srcdir/frame_subsampling_factor)
  frame_shift_opt="--frame-shift 0.0$subsampling_factor"
fi

if [ $stage -le 6 ]; then
  # Word-level CTMs (silence included) are written to $decode_dir/ctm_$lmwt.
  steps/get_ctm_fast.sh --lmwt $lmwt --cmd "$cmd --mem 4G" \
    --print-silence true $frame_shift_opt \
    $data_uniform_seg $lang $decode_dir $decode_dir/ctm_$lmwt
fi

# The original text is cut into documents of at most --max-words words so
# that the later search and Smith-Waterman alignment work on reasonably small
# pieces (a few thousand words).  A mapping from each original transcript to
# the documents made from it (text2doc) is produced alongside.

if [ $stage -le 7 ]; then
  # Split the reference text into documents.
  docs_dir=$dir/docs
  mkdir -p $docs_dir

  steps/cleanup/internal/split_text_into_docs.pl --max-words $max_words \
    $text $docs_dir/doc2text $docs_dir/docs.txt

  # Invert doc2text to get text2doc, whose format is
  # <original-transcript> <doc1> <doc2> ...
  utils/utt2spk_to_spk2utt.pl $docs_dir/doc2text > $docs_dir/text2doc
fi

if [ $stage -le 8 ]; then
  # Get TF-IDF for the reference documents.
  echo $nj > $dir/docs/num_jobs

  utils/split_data.sh $data_uniform_seg $nj

  # Per-job pieces of the document collection live here.
  sdir=$dir/docs/split$nj
  mkdir -p $sdir/

  # First compute IDF stats
  $cmd $dir/log/compute_source_idf_stats.log \
    steps/cleanup/internal/compute_tf_idf.py \
    --tf-weighting-scheme="raw" \
    --idf-weighting-scheme="log" \
    --output-idf-stats=$dir/docs/idf_stats.txt \
    $dir/docs/docs.txt $dir/docs/src_tf_idf.txt

  # Split documents so that they can be accessed easily by parallel jobs.
  for n in $(seq $nj); do
    split_utts=$sdir/old2new_utts.$n.txt
    split_text2doc=$sdir/text2doc.$n

    # old2new_utts maps each original segment to the uniform sub-segments
    # created from it that fall in split $n.
    # The format is <old-utterance> <new-utt1> <new-utt2> ...
    utils/filter_scp.pl $data_uniform_seg/split$nj/$n/utt2spk $dir/uniform_sub_segments | \
      cut -d ' ' -f 1,2 | utils/utt2spk_to_spk2utt.pl > $split_utts

    if [ -z "$utt2text" ]; then
      # Without utt2text the original utterance-ids are the transcript keys.
      utils/filter_scp.pl $split_utts $dir/docs/text2doc > $split_text2doc
    else
      # utt2text maps <old-utterance> to <original-transcript>; text2doc is
      # keyed by <original-transcript>, so first collect the transcripts that
      # occur in this split.
      utils/filter_scp.pl $split_utts $utt2text | \
        cut -d ' ' -f 2 | sort -u | \
        utils/filter_scp.pl /dev/stdin $dir/docs/text2doc > $split_text2doc
    fi

    # Keep only the documents referenced by this split.
    utils/spk2utt_to_utt2spk.pl $split_text2doc | \
      utils/filter_scp.pl /dev/stdin $dir/docs/docs.txt > $sdir/docs.$n.txt
  done

  # Compute TF-IDF for the source documents.
  $cmd JOB=1:$nj $dir/docs/log/get_tfidf_for_source_texts.JOB.log \
    steps/cleanup/internal/compute_tf_idf.py \
      --tf-weighting-scheme="raw" \
      --idf-weighting-scheme="log" \
      --input-idf-stats=$dir/docs/idf_stats.txt \
      $sdir/docs.JOB.txt $sdir/src_tf_idf.JOB.txt

  # Make $sdir an absolute pathname, since it is written into the scp file.
  case $sdir in
    /*) ;;
    *) sdir=${PWD}/$sdir ;;
  esac

  # source2tf_idf.scp maps each transcript key to the TF-IDF file of a split
  # that contains it; the first split seen wins.
  for n in $(seq $nj); do
    awk -v f="$sdir/src_tf_idf.$n.txt" '{print $1" "f}' \
      $sdir/text2doc.$n
  done | perl -ane 'BEGIN { %tfidfs = (); }
  {
    if (!defined $tfidfs{$F[0]}) {
      $tfidfs{$F[0]} = $F[1];
    }
  }
  END {
  while(my ($k, $v) = each %tfidfs) {
    print "$k $v\n";
  } }' > $dir/docs/source2tf_idf.scp
fi

if [ $stage -le 9 ]; then
  echo "$0: using default values of non-scored words..."

  # The list of non-scored words is not configurable at this level: it is
  # whatever get_non_scored_words.py derives from the lang directory (words
  # that map to silence phones).  Several of the data-cleanup scripts use this
  # list; non-scored words are treated as negotiable in the reference
  # transcript, i.e. the reference may be changed to match the hypothesis
  # where these words are concerned.
  non_scored_words=$dir/non_scored_words.txt
  steps/cleanup/internal/get_non_scored_words.py $lang > $non_scored_words
fi

# Stage 10: retrieve, for every decoded sub-segment, the reference documents
# that best match its hypothesis, and align the hypothesis against them to
# produce a ctm_edits file.
if [ $stage -le 10 ]; then
  sdir=$dir/query_docs/split$nj
  mkdir -p $sdir

  # Compute TF-IDF for the query documents (decode hypotheses).
  # The output is an archive of TF-IDF indexed by the query.
  # The escaped pipe (\|) is passed through to $cmd, so the two programs are
  # piped together inside each parallel job.
  $cmd JOB=1:$nj $decode_dir/ctm_$lmwt/log/compute_query_tf_idf.JOB.log \
    steps/cleanup/internal/ctm_to_text.pl --non-scored-words $dir/non_scored_words.txt \
      $decode_dir/ctm_$lmwt/ctm.JOB \| \
    steps/cleanup/internal/compute_tf_idf.py \
      --tf-weighting-scheme="normalized" \
      --idf-weighting-scheme="log" \
      --input-idf-stats=$dir/docs/idf_stats.txt \
      --accumulate-over-docs=false \
      - $sdir/query_tf_idf.JOB.ark.txt

  # The relevant documents can be found using TF-IDF similarity and nearby
  # documents can also be picked for the Smith-Waterman alignment stage.

  # Get a mapping from the new utterance-ids to original transcripts
  # (with utt2text, the original utterance-id is mapped to its transcript key).
  if [ -z "$utt2text" ]; then
    awk '{print $1" "$2}' $dir/uniform_sub_segments > \
      $dir/new2orig_utt
  else
    awk '{print $1" "$2}' $dir/uniform_sub_segments | \
      utils/apply_map.pl -f 2 $utt2text > \
      $dir/new2orig_utt
  fi

  # The query TF-IDFs are all indexed by the utterance-id of the sub-segments.
  # The source TF-IDFs use the document-ids created by splitting the reference
  # text into documents.
  # For each query, we need to retrieve the documents that were created from
  # the same original utterance that the sub-segment was from. For this,
  # we have to load the source TF-IDF that has those documents. This
  # information is provided using the option --source-text-id2tfidf.
  # The output of this script is a file where the first column is the
  # query-id (i.e. sub-segment-id) and the remaining columns, which number at
  # least one and at most (1 + 2 * num-neighbors-to-search), are the
  # document-ids of the retrieved documents.
  $cmd JOB=1:$nj $dir/log/retrieve_similar_docs.JOB.log \
    steps/cleanup/internal/retrieve_similar_docs.py \
      --query-tfidf=$dir/query_docs/split$nj/query_tf_idf.JOB.ark.txt \
      --source-text-id2tfidf=$dir/docs/source2tf_idf.scp \
      --source-text-id2doc-ids=$dir/docs/text2doc \
      --query-id2source-text-id=$dir/new2orig_utt \
      --num-neighbors-to-search=$num_neighbors_to_search \
      --neighbor-tfidf-threshold=$neighbor_tfidf_threshold \
      --relevant-docs=$dir/query_docs/split$nj/relevant_docs.JOB.txt

  # Stitch the retrieved documents of each query into one reference text and
  # align the CTM hypothesis against it.  The nested quotes around <eps> and
  # the OOV word are deliberate, so that the quoting is still present in the
  # command line that $cmd runs.
  $cmd JOB=1:$nj $decode_dir/ctm_$lmwt/log/get_ctm_edits.JOB.log \
    steps/cleanup/internal/stitch_documents.py \
      --query2docs=$dir/query_docs/split$nj/relevant_docs.JOB.txt \
      --input-documents=$dir/docs/split$nj/docs.JOB.txt \
      --output-documents=- \| \
    steps/cleanup/internal/align_ctm_ref.py --eps-symbol='"<eps>"' \
      --oov-word="'`cat $lang/oov.txt`'" --symbol-table=$lang/words.txt \
      --hyp-format=CTM --align-full-hyp=$align_full_hyp \
      --hyp=$decode_dir/ctm_$lmwt/ctm.JOB --ref=- \
      --output=$decode_dir/ctm_$lmwt/ctm_edits.JOB

  # Concatenate the per-job outputs, in job order, into a single file.
  for n in `seq $nj`; do
    cat $decode_dir/ctm_$lmwt/ctm_edits.$n
  done > $decode_dir/ctm_$lmwt/ctm_edits

fi

# Stage 11: turn the concatenated per-job ctm_edits into $dir/ctm_edits, using
# the segments file of the uniformly segmented data.
# NOTE(review): judging by the script name this resolves the regions where
# neighbouring sub-segments overlap -- confirm against the script itself.
if [ $stage -le 11 ]; then
  $cmd $dir/log/resolve_ctm_edits.log \
    steps/cleanup/internal/resolve_ctm_edits_overlaps.py \
    ${data_uniform_seg}/segments $decode_dir/ctm_$lmwt/ctm_edits $dir/ctm_edits
fi

# Stage 12: $dir/ctm_edits -> $dir/ctm_edits.modified, given the list of
# non-scored words.
if [ $stage -le 12 ]; then
  echo "$0: modifying ctm-edits file to allow repetitions [for dysfluencies] and "
  echo "   ... to fix reference mismatches involving non-scored words. "

  $cmd $dir/log/modify_ctm_edits.log \
    steps/cleanup/internal/modify_ctm_edits.py --verbose=3 $dir/non_scored_words.txt \
    $dir/ctm_edits $dir/ctm_edits.modified

  echo "   ... See $dir/log/modify_ctm_edits.log for details and stats, including"
  echo " a list of commonly-repeated words."
fi

# Stage 13: $dir/ctm_edits.modified -> $dir/ctm_edits.tainted.  Deletions are
# kept (--remove-deletions=false).
if [ $stage -le 13 ]; then
  echo "$0: applying 'taint' markers to ctm-edits file to mark silences and"
  echo "  ... non-scored words that are next to errors."
  $cmd $dir/log/taint_ctm_edits.log \
       steps/cleanup/internal/taint_ctm_edits.py --remove-deletions=false \
       $dir/ctm_edits.modified $dir/ctm_edits.tainted
  echo "... Stats, including global cor/ins/del/sub stats, are in $dir/log/taint_ctm_edits.log."
fi

# Stage 14: derive the final segmentation ($dir/segments) and its transcripts
# ($dir/text) from the tainted ctm-edits file.
if [ $stage -le 14 ]; then
  echo "$0: creating segmentation from ctm-edits file."

  # Options built from the "First-pass segmentation opts" variables at the top
  # of the script; anything in $segmentation_extra_opts is appended after them
  # on the command line below.
  segmentation_opts=(
  --min-split-point-duration=$min_split_point_duration
  --max-deleted-words-kept-when-merging=$max_deleted_words_kept_when_merging
  --merging.max-wer=$max_wer
  --merging.max-segment-length=$max_segment_length_for_merging
  --merging.max-bad-proportion=$max_bad_proportion
  --merging.max-intersegment-incorrect-words-length=$max_intersegment_incorrect_words_length
  --splitting.max-segment-length=$max_segment_length_for_splitting
  --splitting.hard-max-segment-length=$hard_max_segment_length
  --splitting.min-silence-length=$min_silence_length_to_split_at
  --splitting.min-non-scored-length=$min_non_scored_length_to_split_at
  )

  $cmd $dir/log/segment_ctm_edits.log \
    steps/cleanup/internal/segment_ctm_edits_mild.py \
      ${segmentation_opts[@]} $segmentation_extra_opts \
      --oov-symbol-file=$lang/oov.txt \
      --ctm-edits-out=$dir/ctm_edits.segmented \
      --word-stats-out=$dir/word_stats.txt \
      $dir/non_scored_words.txt \
      $dir/ctm_edits.tainted $dir/text $dir/segments

  # Show the log on the console as well.
  echo "$0: contents of $dir/log/segment_ctm_edits.log are:"
  cat $dir/log/segment_ctm_edits.log
  echo "For word-level statistics on p(not-being-in-a-segment), with 'worst' words at the top,"
  echo "see $dir/word_stats.txt"
  echo "For detailed utterance-level debugging information, see $dir/ctm_edits.segmented"
fi

# Note: the output directory is created regardless of the value of --stage.
mkdir -p $out_data
# Stage 15: create the output data directory by sub-segmenting the uniformly
# segmented data with the new segments and text.
if [ $stage -le 15 ]; then
  utils/data/subsegment_data_dir.sh $data_uniform_seg \
    $dir/segments $dir/text $out_data
fi


================================================
FILE: egs/steps/cleanup/split_long_utterance.sh
================================================
#!/usr/bin/env bash

# Copyright 2014  Guoguo Chen
# Apache 2.0

# Begin configuration section.
# All three values are in seconds and can be overridden from the command line.
seg_length=30
min_seg_length=10
overlap_length=5
# End configuration section.

echo "$0 $@"

if [ -f ./path.sh ]; then
  . ./path.sh
fi
. parse_options.sh || exit 1;

if [ $# -ne 2 ]; then
  cat <<EOF
This script truncates the long audio into smaller overlapping segments

Usage: $0 [options] <input-dir> <output-dir>
 e.g.: $0 data/train_si284_long data/train_si284_split

Options:
    --min-seg-length        # minimal segment length
    --seg-length            # length of segments in seconds.
    --overlap-length        # length of overlap in seconds.
EOF
  exit 1;
fi

input_dir=$1
output_dir=$2

# The input must be a data directory with at least these files.
for required in spk2utt text utt2spk wav.scp; do
  if [ ! -f $input_dir/$required ]; then
    echo "$0: no such file $input_dir/$required"
    exit 1;
  fi
done

# The segment start advances by (seg_length - overlap_length) each time, so
# that step has to be positive.
if [ ! $seg_length -gt $overlap_length ]; then
  echo "$0: --seg-length should be longer than --overlap-length."
  exit 1;
fi

# Checks if sox is on the path.
if ! sox=`which sox`; then
  echo "$0: sox command not found."
  exit 1;
fi
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
if [ ! -x $sph2pipe ]; then
  echo "$0: sph2pipe command not found."
  exit 1;
fi

# Seed the output directory.  spk2gender is optional, so a failure to copy it
# is silenced; the original transcript is kept as text.orig.
mkdir -p $output_dir
cp -f $input_dir/spk2gender $output_dir/spk2gender 2>/dev/null
cp -f $input_dir/text $output_dir/text.orig
cp -f $input_dir/wav.scp $output_dir/wav.scp

# We assume the audio length in header is correct and get it from there. It is
# a little bit annoying that old version of sox does not support the following:
#   $audio_cmd | sox --i -D
# we have to put it in the following format for the old versions:
#   $sox --i -D "|$audio_cmd"
# Another way is to count all the samples to get the duration, but it takes
# longer time, so we do not use it here.. The command is:
#   $audio_cmd | sox -t wav - -n stat | grep -P "^Length" | awk '{print $1;}'
#
# Note: in the wsj example the process takes couple of minutes because of the
#       audio file concatenation; in a real case it should be much faster since
#       it just reads the header.
#
# The perl program below reads wav.scp on stdin and takes as arguments:
#   <utt2spk-in> <utt2spk-out> <segments-in> <segments-out> <orig2utt-out>
#   <sox> <seg-length> <min-seg-length> <overlap-length>
# It writes the new utt2spk and segments files, plus orig2utt, which lists for
# each original segment the new segments cut from it.
# (Comments inside the program must not contain single quotes, because the
# whole program is a single-quoted shell string.)
cat $output_dir/wav.scp | perl -e '
  $no_orig_seg = "false";       # Original segment file may or may not exist.
  ($u2s_in, $u2s_out, $seg_in,
   $seg_out, $orig2utt, $sox, $slen, $mslen, $olen) = @ARGV;
  open(UI, "<$u2s_in") || die "Error: fail to open $u2s_in\n";
  open(UO, ">$u2s_out") || die "Error: fail to open $u2s_out\n";
  open(SI, "<$seg_in") || ($no_orig_seg = "true");
  open(SO, ">$seg_out") || die "Error: fail to open $seg_out\n";
  open(UMAP, ">$orig2utt") || die "Error: fail to open $orig2utt\n";
  # If the original segment file exists, we have to work out the segment
  # duration from the segment file. Otherwise we work that out from the wav.scp
  # file.
  if ($no_orig_seg eq "false") {
    while (<SI>) {
      chomp;
      @col = split;
      @col == 4 || die "Error: bad line $_\n";
      ($seg_id, $wav_id, $seg_start, $seg_end) = @col;
      $seg2wav{$seg_id} = $wav_id;
      $seg_start{$seg_id} = $seg_start;
      $seg_end{$seg_id} = $seg_end;
    }
  } else {
    while (<STDIN>) {
      chomp;
      @col = split;
      # A wav.scp line needs at least an id and a file or command.
      @col >= 2 || die "Error: bad line $_\n";
      if ((@col > 2) &&  ($col[-1] eq "|")) {
        # Piped entry: "<wav-id> <command ...> |".  Drop the id and the
        # trailing pipe and let sox read the output of the command.
        $wav_id = shift @col; pop @col;
        $audio_cmd = join(" ", @col);
        $duration = `$sox --i -D '\''|$audio_cmd'\''`;
      } else {
        # Plain entry: "<wav-id> <audio-file>".
        @col == 2 || die "Error: bad line $_\n in wav.scp";
        $wav_id = $col[0];
        $audio_file = $col[1];
        $duration = `$sox --i -D $audio_file`;
      }
      chomp($duration);
      # Without a segments file each recording is one segment covering the
      # whole file.
      $seg2wav{$wav_id} = $wav_id;
      $seg_start{$wav_id} = 0;
      $seg_end{$wav_id} = $duration;
    }
  }
  while (<UI>) {
    chomp;
    @col = split;
    @col == 2 || die "Error: bad line $_\n";
    $utt2spk{$col[0]} = $col[1];
  }
  foreach $seg (sort keys %seg2wav) {
    $index = 0;
    # Consecutive new segments start $step seconds apart, so they overlap by
    # $olen seconds.
    $step = $slen - $olen;
    print UMAP "$seg";
    while ($seg_start{$seg} + $index * $step < $seg_end{$seg}) {
      # New segment ids are the original id plus a zero-padded index.
      $new_seg = $seg . "_" . sprintf("%05d", $index);
      $start = $seg_start{$seg} + $index * $step;
      $end = $start + $slen;
      defined($utt2spk{$seg}) || die "Error: speaker not found for $seg\n";
      print UO "$new_seg $utt2spk{$seg}\n";
      print UMAP " $new_seg";
      $index += 1;
      if ($end - $olen + $mslen >= $seg_end{$seg}) {
        # last segment will have at least $mslen seconds.
        # It is extended or truncated to end exactly where the original ends.
        $end = $seg_end{$seg};
        print SO "$new_seg $seg2wav{$seg} $start $end\n";
        last;
      } else {
        print SO "$new_seg $seg2wav{$seg} $start $end\n";
      }
    }
    print UMAP "\n";
  }' $input_dir/utt2spk $output_dir/utt2spk \
    $input_dir/segments $output_dir/segments $output_dir/orig2utt \
    $sox $seg_length $min_seg_length $overlap_length

# CAVEAT: We are not dealing with channels here. Each channel should have a
# unique file name in wav.scp.
# Each reco2file_and_channel line is "<recording-id> <recording-id> A": the
# first space-delimited column of wav.scp is used both as the recording-id and
# as the file name, and the channel is always written as "A".
paste -d ' ' <(cut -d ' ' -f 1 $output_dir/wav.scp) \
  <(cut -d ' ' -f 1 $output_dir/wav.scp) | awk '{print $1" "$2" A";}' \
  > $output_dir/reco2file_and_channel

# Run the standard data-directory fixer on the result.
utils/fix_data_dir.sh $output_dir

exit 0;


================================================
FILE: egs/steps/combine_ali_dirs.sh
================================================
#!/usr/bin/env bash
# Copyright 2016  Xiaohui Zhang  Apache 2.0.
# Copyright 2019  SmartAction (kkm)

# This script combines alignment directories, such as exp/tri4a_ali, and
# validates matching of the utterances and alignments after combining.

# Begin configuration section.
# Each default below can be overridden on the command line (--name value);
# the values are picked up by parse_options.sh, sourced further down.
cmd=run.pl
nj=4
combine_lat=true
combine_ali=true
tolerance=10
# End configuration section.
echo "$0 $@"  # Print the command line for logging.

[[ -f path.sh ]] && . ./path.sh
. parse_options.sh || exit 1

export LC_ALL=C

# At least <data>, <dest-dir> and one <src-dir> are required.
if [[ $# -lt 3 ]]; then
  cat >&2 <<EOF
Usage: $0 [options] <data> <dest-dir> <src-dir1> <src-dir2> ...
 e.g.: $0 --nj 32 data/train exp/tri3_ali_combined exp/tri3_ali_1 exp_tri3_ali_2
Options:
 --nj <nj>              # number of jobs to split combined archives [4]
 --combine_ali false    # merge ali.*.gz if present [true]
 --combine_lat false    # merge lat.*.gz if present [true]
 --tolerance <int,%>    # maximum percentage of missing alignments or lattices
                        # w.r.t. total utterances in <data> before error is
                        # reported [10]

The script checks that certain important files are present and compatible in all
source directories (phones.txt, tree); other are copied from the first source
(cmvn_opts, final.mdl) without much checking.

Both --combine_ali and --combine_lat are true by default, but the script
proceeds with a warning if directories do not contain either alignments or
alignment lattices. Check for files ali.1.gz and/or lat.1.gz in the <dest-dir>
after the script completes if additional programmatic check is required.
EOF
  exit 1;
fi

# Both options hold the strings "true" or "false". '[[ ! $var ]]' only tests
# whether the string is empty, so it can never detect "false"; compare the
# values explicitly instead.
if [[ $combine_lat != true && $combine_ali != true ]]; then
  echo "$0: at least one of --combine_lat and --combine_ali must be true"
  exit 1
fi

# Positional arguments: <data> <dest-dir>, followed by the source directories,
# which are what remains in $@ after the shift.
data=$1
dest=$2
shift 2
first_src=$1

# Working copies of the two options; each one is switched off below as soon as
# a source directory turns out to lack the corresponding archives.
do_ali=$combine_ali
do_lat=$combine_lat

# Walk over all sources. A source that resolves to the same physical directory
# as the target is a fatal error (combining in-place is not possible and could
# do a lot of damage). Missing alignments or lattices only produce a warning
# and disable combining of that kind of archive.
dest_real=$(cd 2>/dev/null -P -- "$dest" && pwd)
for src in $@; do
  src_real=$(cd 2>/dev/null -P -- "$src" && pwd)
  if [[ "$src_real" = "$dest_real" ]]; then
    echo "$0: error: Source $src is same as target $dest."
    exit 1
  fi
  if $do_ali; then
    if [[ ! -f $src/ali.1.gz ]]; then
      echo "$0: warning: Alignments (ali.*.gz) are not present in $src, not" \
           "combining. Consider '--combine_ali false' to suppress this warning."
      do_ali=false
    fi
  fi
  if $do_lat; then
    if [[ ! -f $src/lat.1.gz ]]; then
      echo "$0: warning: Alignment lattices (lat.*.gz) are not present in $src,"\
        "not combining. Consider '--combine_lat false' to suppress this warning."
      do_lat=false
    fi
  fi
done

# Give up if neither kind of archive is left to combine.
if ! { $do_ali || $do_lat; }; then
  echo "$0: error: Cannot combine directories."
  exit 1
fi

# The first source directory provides the model and auxiliary files, so all of
# them have to exist there.
for f in cmvn_opts final.mdl num_jobs phones.txt tree; do
  [ -f $first_src/$f ] && continue
  echo "$0: error: Required source file $first_src/$f is missing."
  exit 1
done

# Every other source must carry a num_jobs file, a tree identical to the one
# in the first source, and a phone table compatible with the first one.
for src in $@; do
  # The first source is the reference; there is nothing to compare it with.
  [[ $src != $first_src ]] || continue

  if [[ ! -f $src/num_jobs ]]; then
    echo "$0: error: Required source file $src/num_jobs is missing."
    exit 1
  fi

  if ! cmp -s $first_src/tree $src/tree; then
    echo "$0: error: tree $src/tree is either missing or not the" \
         "same as $first_src/tree."
    exit 1
  fi

  if [[ ! -f $src/phones.txt ]]; then
    echo "$0: error: Required source file $src/phones.txt is missing."
    exit 1
  fi

  utils/lang/check_phones_compatible.sh $first_src/phones.txt \
                                        $src/phones.txt || exit 1
done

# Checks are done: set up the destination. Remove leftovers of a previous run,
# then take the model and auxiliary files from the first source (they were
# either verified to be compatible above, or we do not care if they are).
mkdir -p $dest || exit 1
rm -f $dest/cmvn_opts $dest/final.mdl $dest/num_jobs $dest/phones.txt $dest/tree
if $do_ali; then
  rm -f $dest/ali.*.gz $dest/ali.*.scp
fi
if $do_lat; then
  rm -f $dest/lat.*.gz $dest/lat.*.scp
fi
cp $first_src/cmvn_opts $first_src/final.mdl \
   $first_src/phones.txt $first_src/tree $dest/ || exit 1
# Optional file: copied when present, failure is deliberately ignored.
cp $first_src/frame_subsampling_factor $dest/ 2>/dev/null
echo $nj > $dest/num_jobs || exit 1

# Scratch space lives inside the destination. It is removed when the script is
# interrupted by a signal, but is left in place after an 'exit 1'.
temp_dir=$(mktemp -d $dest/temp.XXXXXX) || exit 1
cleanup() {
  rm -rf "$temp_dir"
}
trap cleanup HUP INT TERM
echo "$0: note: Temporary directory $temp_dir will not be deleted in case of" \
     "script failure, so you could examine it for troubleshooting."


# This function may be called twice, once to combine alignments and the second
# time to combine lattices. The two invocations are as follows:
#   do_combine ali alignments copy-int-vector $@
#   do_combine lat lattices   lattice-copy $@
# where 'ali'/'lat' is a prefix to archive name, 'alignments'/'lattices' go into
# log messages and logfile names, and 'copy-int-vector'/'lattice-copy' is the
# program used to copy corresponding objects.
#
# Reads the globals $cmd, $nj, $data, $dest, $temp_dir and $tolerance.
# Any failure terminates the whole script with 'exit 1'; returns 0 otherwise.
do_combine() {
  local ark=$1 entities=$2 copy_program=$3
  shift 3
  # Declared apart from the assignments so that 'local' does not hide the exit
  # status of the command substitutions below.
  local src src_id nj_src inputs
  local n_utt n_ali n_ali_no_utt n_utt_no_ali n_utt_no_ali_pct

  echo "$0: Gathering $entities from each source directory."
  # For every source directory write the list of its gzipped archive names into
  # the file $temp_dir/src_arks.<src_id>, e. g.:
  #   exp/tri3_ali/ali.1.gz exp/tri3_ali/ali.2.gz ...
  # so that the archives can be copied in one job per source.
  src_id=0
  for src in $@; do
    src_id=$((src_id + 1))
    nj_src=$(cat $src/num_jobs) || exit 1
    # ('printf' repeats its format as long as there are more arguments).
    printf "$src/$ark.%d.gz " $(seq $nj_src) > $temp_dir/src_arks.${src_id}
  done

  # Gather archives in parallel jobs, one job per source directory.
  $cmd JOB=1:$src_id $dest/log/gather_$entities.JOB.log \
    $copy_program \
      "ark:gunzip -c \$(cat $temp_dir/src_arks.JOB) |" \
      "ark,scp:$temp_dir/$ark.JOB.ark,$temp_dir/$ark.JOB.scp" || exit 1

  # Merge (presumed already sorted) scp's into a single script.
  sort -m $temp_dir/$ark.*.scp > $temp_dir/$ark.scp || exit 1

  # Split the merged script into $nj per-job scripts. The per-job names
  # overwrite the per-source scp files of the gather step, which are no longer
  # needed once they have been merged. Stop here on failure instead of letting
  # the copy jobs below run on stale or missing scripts.
  inputs=$(for n in $(seq $nj); do echo $temp_dir/$ark.$n.scp; done)
  utils/split_scp.pl --utt2spk=$data/utt2spk $temp_dir/$ark.scp $inputs || exit 1

  echo "$0: Splitting combined $entities into $nj archives on speaker boundary."
  $cmd JOB=1:$nj $dest/log/chop_combined_$entities.JOB.log \
    $copy_program \
      "scp:$temp_dir/$ark.JOB.scp" \
      "ark:| gzip -c > $dest/$ark.JOB.gz" || exit 1

  # Get some interesting stats, and signal an error if error threshold exceeded.
  n_utt=$(wc -l <$data/utt2spk)
  # Without this guard the percentage below would divide by zero, leave the
  # variable empty, and the tolerance check would silently pass.
  if (( n_utt == 0 )); then
    echo "$0: error: $data/utt2spk is missing or empty, cannot verify $entities."
    exit 1
  fi
  n_ali=$(wc -l <$temp_dir/$ark.scp)
  n_ali_no_utt=$(join -j1 -v2 $data/utt2spk $temp_dir/$ark.scp | wc -l)
  n_utt_no_ali=$(join -j1 -v1 $data/utt2spk $temp_dir/$ark.scp | wc -l)
  n_utt_no_ali_pct=$(perl -e "print int($n_utt_no_ali/$n_utt * 100 + .5);")
  echo "$0: Combined $n_ali $entities for $n_utt utterances." \
       "There were $n_utt_no_ali utterances (${n_utt_no_ali_pct}%) without" \
       "$entities, and $n_ali_no_utt $entities not matching any utterance."

  if (( $n_utt_no_ali_pct >= $tolerance )); then
    echo "$0: error: Percentage of utterances missing $entities," \
         "${n_utt_no_ali_pct}%, is at or above error tolerance ${tolerance}%."
    exit 1
  fi

  return 0
}

# Run the combining for each enabled kind of archive. The return status is not
# inspected because do_combine itself calls 'exit 1' on any failure.
if $do_ali; then
  do_combine ali 'alignments' copy-int-vector "$@"
fi
if $do_lat; then
  do_combine lat 'lattices' lattice-copy "$@"
fi

# Everything succeeded, so the temporary directory can go.
cleanup

# Describe what was combined for the final message.
if $do_ali && $do_lat; then
  what='alignments and lattices '
elif $do_ali; then
  what='alignments '
elif $do_lat; then
  what='lattices '
else
  what=
fi
# No period at the end of the message: it interferes with copy/paste from a
# tty emulator.
echo "$0: Stored combined ${what}in $dest"
exit 0


================================================
FILE: egs/steps/combine_trans_dirs.sh
================================================
#!/usr/bin/env bash
# Copyright 2016  Xiaohui Zhang  Apache 2.0.
# Copyright 2019  SmartAction (kkm)
# Copyright 2019  manhong wang (marvin)

# This script only combines the transform files (e.g. trans.1) found in the
# alignment dirs, and checks the number of combined transforms against the
# number of lines in <data>/spk2utt. You need these fMLLR transform files
# after you have combined ali or lat dirs (with combine_ali_dirs.sh or
# combine_lat_dirs.sh).

# Begin configuration section.
# Each default below can be overridden on the command line (--name value);
# the values are picked up by parse_options.sh, sourced further down.
cmd=run.pl
tolerance=10
# End configuration section.
echo "$0 $@"  # Print the command line for logging.

[[ -f path.sh ]] && . ./path.sh
. parse_options.sh || exit 1

export LC_ALL=C

# At least <data>, <dest-dir> and one <src-dir> are required. The note at the
# end of the usage text names the scripts that must have been run beforehand
# (it used to point at this very script and at a misspelled script name).
if [[ $# -lt 3 ]]; then
  cat >&2 <<EOF
Usage: $0 [options] <data> <dest-dir> <src-dir1> <src-dir2> ...
 e.g.: $0 data/train exp/tri3_trans_combined exp/tri3_trans_1 exp_tri3_trans_2
Options:
 --tolerance <int,%>    # maximum percentage of missing trans
                        # w.r.t. total utterances in <data> before error is
                        # reported [10]

Note:we do not checks that certain important files are present and compatible in all
source directories (phones.txt, tree) here.Because you would run combine_ali_dirs.sh
or combine_lat_dirs.sh first.

EOF
  exit 1;
fi


# Positional arguments: <data> <dest-dir>, followed by the source directories,
# which are what remains in $@ after the shift.
data=$1
dest=$2
shift 2
first_src=$1

do_trans=true

# Sanity checks on the sources. Unlike combine_ali_dirs.sh, no model or other
# files are copied from the first source here.
for src in $@; do
  # Refuse to combine in-place: a source must not resolve to the target.
  if [[ "$(cd 2>/dev/null -P -- "$src" && pwd)" = \
        "$(cd 2>/dev/null -P -- "$dest" && pwd)" ]]; then
    echo "$0: error: Source $src is same as target $dest."
    exit 1
  fi
  if $do_trans && [[ ! -f $src/trans.1 ]]; then
    echo "$0: warning: transform (trans.*) are not present in $src, not" \
         "combining. please check you files"
    exit 1
  fi
done

# The target must already hold combined alignments or lattices.
if [ ! -f $dest/ali.1.gz  ] && [ ! -f $dest/lat.1.gz ] ; then
    echo "$0: warning: we assume you have combined the ali or lat dirs " \
         "please run combine_ali_dirs.sh or combine_lat_dirs.sh firstly"
    exit 1
fi

# The number of output archives is taken from the already combined target.
# Stop right away when it cannot be read, before any old transform is deleted.
nj=$(cat $dest/num_jobs) || exit 1

if [ -f $dest/trans.1 ] ; then rm $dest/trans.* ;fi    #remove old trans.*

# Make temporary directory, delete on signal, but not on 'exit 1'.
temp_dir=$(mktemp -d $dest/temp.XXXXXX) || exit 1
cleanup() { rm -rf "$temp_dir"; }
trap cleanup HUP INT TERM
echo "$0: note: Temporary directory $temp_dir will not be deleted in case of" \
     "script failure, so you could examine it for troubleshooting."

# Combine the transform archives of all source directories into $nj archives
# in $dest. Invoked as:
#   do_combine_trans trans transforms copy-matrix $@
# where 'trans' is the prefix of the archive names, 'transforms' goes into log
# messages and logfile names, and 'copy-matrix' is the program used to copy
# the objects. The remaining arguments are the source directories.
#
# Reads the globals $cmd, $nj, $data, $dest, $temp_dir and $tolerance.
# Any failure terminates the whole script with 'exit 1'; returns 0 otherwise.
do_combine_trans() {
  local ark=$1 entities=$2 copy_program=$3
  shift 3
  # Declared apart from the assignments so that 'local' does not hide the exit
  # status of the command substitutions below.
  local src src_id nj_src n_spk n_trans n_spk_no_trans_pct

  echo "$0: Gathering $entities from each source directory."
  # For every source directory write the list of its archive names into the
  # file $temp_dir/src_arks.<src_id>, e. g.:
  #   exp/tri3_ali/trans.1 exp/tri3_ali/trans.2 ...
  # so that the archives can be copied in one job per source.
  src_id=0
  for src in $@; do
    src_id=$((src_id + 1))
    nj_src=$(cat $src/num_jobs) || exit 1
    # ('printf' repeats its format as long as there are more arguments).
    printf "$src/$ark.%d " $(seq $nj_src) > $temp_dir/src_arks.${src_id}
  done

  # Gather archives in parallel jobs, one job per source directory.
  $cmd JOB=1:$src_id $dest/log/gather_$entities.JOB.log \
    $copy_program \
      "ark:cat \$(cat $temp_dir/src_arks.JOB) |" \
      "ark,scp:$temp_dir/$ark.JOB,$temp_dir/$ark.JOB.scp" || exit 1

  # Merge (presumed already sorted) scp's into a single script.
  sort -m $temp_dir/$ark.*.scp > $temp_dir/$ark.scp || exit 1

  echo "$0: Splitting combined $entities into $nj archives on speaker boundary."
  $cmd JOB=1:$nj $dest/log/chop_combined_$entities.JOB.log \
    $copy_program \
      "scp:utils/split_scp.pl  -j $nj JOB --one-based $temp_dir/$ark.scp |" \
      "ark:$dest/$ark.JOB" || exit 1

  # Get some interesting stats. The reference count is the number of lines in
  # spk2utt, i.e. speakers, so the messages speak of speakers.
  n_spk=$(wc -l <$data/spk2utt)
  # Without this guard the percentage below would divide by zero, leave the
  # variable empty, and the tolerance check would silently pass.
  if (( n_spk == 0 )); then
    echo "$0: error: $data/spk2utt is missing or empty, cannot verify $entities."
    exit 1
  fi
  n_trans=$(wc -l <$temp_dir/$ark.scp)
  n_spk_no_trans_pct=$(perl -e "print int(($n_spk - $n_trans)/$n_spk * 100 + .5);")
  echo "$0: Combined $n_trans $entities for $n_spk speakers."

  if (( $n_spk_no_trans_pct >= $tolerance )); then
    echo "$0: error: Percentage of speakers missing $entities," \
         "${n_spk_no_trans_pct}%, is at or above error tolerance ${tolerance}%."
    exit 1
  fi

  return 0
}

# Combine the transforms; the function calls 'exit 1' itself on failure.
if $do_trans; then
  do_combine_trans trans 'transforms' copy-matrix "$@"
fi

# Everything succeeded, so the temporary directory can go.
cleanup

echo "$0: Stored combined fmllr trans in $dest"
exit 0


================================================
FILE: egs/steps/compare_alignments.sh
================================================
#!/usr/bin/env bash

# Copyright 2018  Johns Hopkins University (author: Daniel Povey)
# Apache 2.0.

# Abort on the first failing command.
set -e

# Defaults; each can be overridden on the command line via parse_options.sh.
stage=0
cmd=run.pl   # We use this only for get_ctm.sh, which can be a little slow.
# Number of utterances sampled for human-readable display, once starting from
# the worst and once starting from the middle.
num_to_sample=1000
cleanup=true

if [ -f ./path.sh ]; then
  . ./path.sh
fi

. ./utils/parse_options.sh

# Only the 5-argument and the 7-argument forms are accepted; anything else
# prints the help text and exits.
case $# in
  5|7)
    ;;
  *)
    cat <<EOF
  This script compares two directories containing data alignments, and
  creates statistics showing how much the phone and word alignments differ,
  including breakdown by phones and words; and which utterances differ the
  most.  This is intended for diagnostic purposes.  Both alignment directories
  should be for the same data (or at least the data sets should overlap).
  The word alignment stats may not be correctly obtained if the data-dirs are
  not the same.

  Usage: $0 [options] <lang-directory> <data-directory> <ali-dir1> <ali-dir2> <work-dir>
    or:  $0 [options] <lang1> <lang2> <data1> <data2> <ali-dir1> <ali-dir2> <work-dir>
   e.g.: $0 data/lang data/train exp/tri2_ali exp/tri3_ali exp/compare_ali_2_3

  Options:
              --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes.
                                              # (passed through to get_train_ctm.sh)
              --cleanup <true|false>          # Specify --cleanup false to prevent
                                              # cleanup of temporary files.
              --stage  <n>                    # Enables you to run part of the script.

EOF
    exit 1
    ;;
esac

# Assign the positional arguments. In the 5-argument form the single lang
# directory and the single data directory are used for both alignment sets.
if [ $# -eq 5 ]; then
  lang1=$1
  lang2=$1
  data1=$2
  data2=$2
  ali_dir1=$3
  ali_dir2=$4
  dir=$5
else
  lang1=$1
  lang2=$2
  data1=$3
  data2=$4
  ali_dir1=$5
  ali_dir2=$6
  dir=$7
fi

# Check the inputs. For each alignment directory only the first archive is
# required: ali.1.gz is the one archive that exists whatever the number of
# jobs is (requiring ali.2.gz would reject a directory made with one job).
for f in $lang1/phones.txt $lang2/phones.txt $data1/utt2spk $data2/utt2spk \
         $ali_dir1/ali.1.gz $ali_dir2/ali.1.gz; do
  if [ ! -f $f ]; then
    echo "$0: expected file $f to exist"
    exit 1
  fi
done

# This will exit if the phone symbol id's are different, due to
# `set -e` above.
utils/lang/check_phones_compatible.sh $lang1/phones.txt $lang2/phones.txt

# Number of archives in each alignment directory.
nj1=$(cat $ali_dir1/num_jobs)
nj2=$(cat $ali_dir2/num_jobs)

mkdir -p $dir/log


# Stage 0: turn both sets of alignments into per-frame phone sequences, each
# set stored as one gzipped archive in the work directory.
if [ $stage -le 0 ]; then
  echo "$0: converting alignments to phones."

  for ((j = 1; j <= nj1; j++)); do
    gunzip -c $ali_dir1/ali.$j.gz
  done | ali-to-phones --per-frame=true $ali_dir1/final.mdl ark:- ark:- \
       | gzip -c > $dir/phones1.gz

  for ((j = 1; j <= nj2; j++)); do
    gunzip -c $ali_dir2/ali.$j.gz
  done | ali-to-phones --per-frame=true $ali_dir2/final.mdl ark:- ark:- \
       | gzip -c > $dir/phones2.gz
fi

# Stage 1: compare the two phone archives. The confusion matrix goes to
# conf.mat, standard output to utt_stats.phones, and the log (whose last 8
# lines are shown) to log/compare_phones.log.
if [ $stage -le 1 ]; then
  echo "$0: getting comparison stats and utterance stats."
  compare-int-vector --binary=false --write-confusion-matrix=$dir/conf.mat \
    "ark:gunzip -c $dir/phones1.gz|" "ark:gunzip -c $dir/phones2.gz|" \
    > $dir/utt_stats.phones 2> $dir/log/compare_phones.log
  tail -n 8 $dir/log/compare_phones.log
fi

# Stage 3: turn the confusion matrix into per-phone statistics.
# The awk program reads the matrix rows (with the '[' line and the closing ']'
# removed) and accumulates row and column totals. For every non-zero cell it
# prints either a "COR" line (diagonal cell: phone, count, percent of the row
# total) or a "SUB" line (off-diagonal cell: sorting heuristic, both phones,
# count, and the percentages of this and of the reverse substitution relative
# to the smaller of the row and column totals).
# Note: a literal percent sign in an awk printf format must be written as
# '%%'; the COR format used to end in a bare '%' before the newline, which is
# an undefined conversion.
if [ $stage -le 3 ]; then
  cat $dir/conf.mat | grep -v -F '[' | sed 's/]//' | awk '{n=NF; for (k=1;k<=n;k++) { conf[NR,k] = $k; row_tot[NR] += $k; col_tot[k] += $k; } } END{
   for (row=1;row<=n;row++) for (col=1;col<=n;col++) {
     val = conf[row,col]; this_row_tot = row_tot[row]; this_col_tot = col_tot[col];
     rval=conf[col,row]
     min_tot = (this_row_tot < this_col_tot ? this_row_tot : this_col_tot);
     if (val != 0) {
       phone1 = row-1; phone2 = col-1;
       if (row == col) printf("COR %d %d %.2f%%\n", phone1, val, (val * 100 / this_row_tot));
       else {
         norm_prob = val * val / min_tot;  # heuristic for sorting.
         printf("SUB %d %d %d %d %.2f%% %.2f%%\n",
                 norm_prob, phone1, phone2, val, (val * 100 / min_tot), (rval * 100 / min_tot)); }}}}' > $dir/phone_stats.all

   # Per-phone correctness, sorted by percent correct, with phone ids mapped
   # to symbols.
   (
     echo "# Format: <phone> <frame-count> <percent-correct>"
     grep '^COR' $dir/phone_stats.all | sort -n -k4,4 | awk '{print $2, $3, $4}' | utils/int2sym.pl -f 1 $lang1/phones.txt
   ) > $dir/phones_correct.txt

   # Substitutions, sorted by the heuristic in the second column (largest
   # first), with both phone ids mapped to symbols.
   (
     echo "#Format: <phone1> <phone2> <num-frames> <prob-wrong%> <reverse-prob-wrong%>"
     echo "# <num-frames> is the number of frames that were labeled <phone1> in the first"
     echo "# set of alignments and <phone2> in the second."
     echo "# <prob-wrong> is <num-frames> divided by the smaller of the total num-frames of"
     echo "#  phone1 or phone2, whichever is smaller; expressed as a percentage."
     echo "#<reverse-prob-wrong> is the same but for the reverse substitution, from"
     echo "#<phone2> to <phone1>; the comparison with <prob-wrong> the substitutions are)."
     grep '^SUB' $dir/phone_stats.all | sort -nr -k2,2 | awk '{print $3,$4,$5,$6,$7}' | utils/int2sym.pl -f 1-2 $lang1/phones.txt
   ) > $dir/phone_subs.txt
fi

# Stage 4: produce a CTM for each alignment set (frame shift 1.0, silence
# printed, segments not used).
if [ $stage -le 4 ]; then
  echo "$0: getting CTMs"
  for n in 1 2; do
    if [ $n -eq 1 ]; then
      ctm_data=$data1; ctm_lang=$lang1; ctm_ali=$ali_dir1
    else
      ctm_data=$data2; ctm_lang=$lang2; ctm_ali=$ali_dir2
    fi
    steps/get_train_ctm.sh --use-segments false --print-silence true --cmd "$cmd" --frame-shift 1.0 $ctm_data $ctm_lang $ctm_ali $dir/ctm${n}
  done
fi

# Stage 5: expand each CTM into one integer word id per frame and compare the
# two resulting archives.
if [ $stage -le 5 ]; then
  oov=$(cat $lang1/oov.int)
  # Note: below, we use $lang1 for both setups; this is by design as compare-int-vector
  # assumes they use the same symbol table.
  for n in 1 2; do
    # The awk program starts a new output line whenever the utterance id
    # (field 1) changes, and repeats the word (field 5) once for every frame
    # from int(field 3) up to, but excluding, int(field 3) + int(field 4).
    cat $dir/ctm${n}/ctm | utils/sym2int.pl --map-oov $oov -f 5 $lang1/words.txt | \
      awk 'BEGIN { utt_id = ""; }
           {
             if (utt_id != $1) {
               if (utt_id != "") printf("\n");
               utt_id = $1;
               printf("%s ", utt_id);
             }
             t_start = int($3);
             t_end = t_start + int($4);
             word = $5;
             for (t = t_start; t < t_end; t++) printf("%s ", word);
           }
           END { printf("\n") }' | \
      copy-int-vector ark:- ark:- | gzip -c >$dir/words${n}.gz
  done

  # Total and differing counts go to the two .vec files, standard output to
  # utt_stats.words, and the log (whose last 8 lines are shown) to
  # log/compare_words.log.
  compare-int-vector --binary=false --write-tot-counts=$dir/words_tot.vec --write-diff-counts=$dir/words_diff.vec \
    "ark:gunzip -c $dir/words1.gz|" "ark:gunzip -c $dir/words2.gz|" \
    >$dir/utt_stats.words 2>$dir/log/compare_words.log
  tail -n 8 $dir/log/compare_words.log
fi

# Stage 6: write per-word statistics to word_stats.txt.
# The two awk commands inside 'paste' print fields 2..NF-1 of words_diff.vec
# and words_tot.vec, one value per line (presumably skipping the enclosing
# '[' and ']' of the vector -- TODO confirm), so that paste yields one
# "<diff> <tot>" pair per line. For pairs with <tot> greater than zero the next
# awk prints: diff*diff/tot (used only as sort key), diff/tot, diff, tot and
# the zero-based line number, which int2sym.pl maps to a word via words.txt.
# After sorting numerically in descending order the sort key is dropped.
# NOTE(review): the header calls the third printed column
# <num-correct-frames>, but the value printed there is the count taken from
# words_tot.vec -- confirm which one is intended.
if [ $stage -le 6 ]; then

  ( echo "# Word stats.  Format:";
    echo "<proportion-of-wrong-frames> <num-wrong-frames> <num-correct-frames> <word>"

    paste <(awk '{for (n=2;n<NF;n++) print $n;}' <$dir/words_diff.vec) \
      <(awk '{for (n=2;n<NF;n++) print $n;}' <$dir/words_tot.vec) | \
       awk '{ if($2 > 0) print $1*$1/$2, $1/$2, $1, $2, (NR-1)}' | utils/int2sym.pl -f 5 $lang1/words.txt | \
      sort -nr | awk '{print $2, $3, $4, $5;}'
  ) > $dir/word_stats.txt

fi

# Stage 7: for phones and for words, sort the utterances by the proportion of
# changed frames (largest first) and print roughly every tenth value of that
# distribution.
if [ $stage -le 7 ]; then
  for type in phones words; do
    num_utts=$(wc -l <$dir/utt_stats.$type)
    # Columns written: utterance id, proportion changed, number of frames,
    # number of wrong frames; a header line is emitted first.
    cat $dir/utt_stats.$type | awk -v type=$type 'BEGIN{print "Utterance-id proportion-"type"-changed num-frames num-wrong-frames"; }
          {print $1, $3 * 1.0 / $2, $2, $3; }' | sort -nr -k2,2 > $dir/utt_stats.$type.sorted
    (
      echo "$0: Percentiles 100, 90, .. 0 of proportion-$type-changed distribution (over utterances) are:"
      # Print the first line and then every k-th line after it. The step k is
      # clamped to at least 1: with 10 or fewer utterances it would be 0 and
      # the modulo would divide by zero, and with a step of 1 the former test
      # 'NR % k == 1' could never be true. For k >= 2 the selected lines are
      # the same as before.
      cat $dir/utt_stats.$type.sorted | awk -v n=$num_utts 'BEGIN{k=int((n-1)/10); if (k < 1) k = 1;} {if ((NR - 1) % k == 0) printf("%s ", $2); } END{print "";}'
    ) | tee $dir/utt_stats.$type.percentiles
  done
fi


# Stage 8: write human-readable phone comparisons for two samples of
# utterances, taken from the list sorted by proportion of changed word frames.
if [ $stage -le 8 ]; then
  # Display the 1000 worst utterances, and 1000 utterances from the middle of the pack, in a readable format.
  # The sample size is reduced to half the number of lines of the sorted list
  # when that is smaller than $num_to_sample.
  num_utts=$(wc -l <$dir/utt_stats.words.sorted)
  half_num_utts=$[$num_utts/2];
  if [ $num_to_sample -gt $half_num_utts ]; then
    num_to_sample=$half_num_utts
  fi
  # "worst" is the head of the sorted list; "mid" starts at its middle line.
  head -n $num_to_sample $dir/utt_stats.words.sorted | awk '{print $1}' > $dir/utt_ids.worst
  tail -n +$half_num_utts $dir/utt_stats.words.sorted | head -n $num_to_sample | awk '{print $1}' > $dir/utt_ids.mid

  for suf in worst mid; do
    for n in 1 2; do
      # Per-frame phones of set $n in text form, restricted to the sampled ids.
      gunzip -c $dir/phones${n}.gz | copy-int-vector ark:- ark,t:- | utils/filter_scp.pl $dir/utt_ids.$suf  >$dir/temp
      # The next command reorders them (following the order of the id list) and
      # duplicates the utterance-id; the duplicate is used later to display the
      # word sequence. The third copy of the id is replaced by the phones.
      awk '{print $1,$1,$1}' <$dir/utt_ids.$suf | utils/apply_map.pl -f 3 $dir/temp > $dir/phones${n}.$suf
      rm $dir/temp
    done
    # the stuff with 0 and <eps> below is a kind of hack so that if the phones are the same, we end up
    # with just the phone, but if different, we end up with p1/p2.
    # The apply_map.pl stuff is to put the transcript there.
    # In detail: the perl code walks over both phone lists in parallel and
    # prints "p" for equal phones and "p1 0 p2" for different ones; the sed
    # command then turns every " <eps> " in the int2sym.pl output into "/".

    (
      echo "# Format: <utterance-id> <word1> <word2> ... <wordN>  <frame1-phone> ... <frameN-phone>"
      echo "# If the two alignments have the same phone, just that phone will be printed;"
      echo "# otherwise the two phones will be printed, as in 'phone1/phone2'.  So '/' is present"
      echo "# whenever there is a mismatch."

      paste $dir/phones1.$suf $dir/phones2.$suf | perl -ane ' @A = split("\t", $_); @A1 = split(" ", $A[0]); @A2 = split(" ", $A[1]);
            $utt = shift @A1; shift @A2; print $utt, " ";
            for ($n = 0; $n < @A1 && $n < @A2; $n++) { $a1=$A1[$n]; $a2=$A2[$n];  if ($a1 eq $a2) { print "$a1 "; } else { print "$a1 0 $a2 "; }}
            print "\n" ' | utils/int2sym.pl -f 3- $lang1/phones.txt | sed 's: <eps> :/:g' | \
        utils/apply_map.pl -f 2 $data1/text
    )  > $dir/compare_phones_${suf}.txt
  done
fi


# Stage 9: remove the intermediate files unless --cleanup false was given.
# A single 'rm' call is used for all of them.
if [ $stage -le 9 ] && $cleanup; then
  rm $dir/phones1.gz $dir/phones2.gz \
     $dir/words1.gz $dir/words2.gz \
     $dir/ctm*/ctm \
     $dir/*.vec \
     $dir/conf.mat \
     $dir/utt_ids.* \
     $dir/phones1.mid $dir/phones1.worst $dir/phones2.mid $dir/phones2.worst \
     $dir/utt_stats.phones $dir/utt_stats.words \
     $dir/phone_stats.all
fi

# Done.
exit 0


================================================
FILE: egs/steps/compute_cmvn_stats.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2016  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# To be run from .. (one directory up from here)
# see ../run.sh for example

# Compute cepstral mean and variance statistics per speaker.
# We do this in just one job; it's fast.
# This script takes only the options --fake, --fake-dims and --two-channel
# (see the usage message below).
#
# Note: there is no option to do CMVN per utterance.  The idea is
# that if you did it per utterance it would not make sense to do
# per-speaker fMLLR on top of that (since you'd be doing fMLLR on
# top of different offsets).  Therefore what would be the use
# of the speaker information?  In this case you should probably
# make the speaker-ids identical to the utterance-ids.  The
# speaker information does not have to correspond to actual
# speakers, it's just the level you want to adapt at.

echo "$0 $@"  # Print the command line for logging

fake=false   # If specified, can generate fake/dummy CMVN stats (that won't normalize)
fake_dims=   # as the "fake" option, but you can generate "fake" stats only for certain
             # dimensions.
two_channel=false

# Consume the leading options. They are accepted in any order (the fixed order
# --fake, --fake-dims, --two-channel that used to be required still works);
# parsing stops at the first argument that is not one of the known options.
while [ $# -gt 0 ]; do
  case "$1" in
    --fake)
      fake=true
      shift
      ;;
    --fake-dims)
      # This option takes a value, so two arguments are consumed.
      fake_dims=$2
      shift
      shift
      ;;
    --two-channel)
      two_channel=true
      shift
      ;;
    *)
      break
      ;;
  esac
done

# One to three positional arguments must remain.
if [ $# -lt 1 ] || [ $# -gt 3 ]; then
   echo "Usage: $0 [options] <data-dir> [<log-dir> [<cmvn-dir>] ]";
   echo "e.g.: $0 data/train exp/make_mfcc/train mfcc"
   echo "Note: <log-dir> defaults to <data-dir>/log, and <cmvn-dir> defaults to <data-dir>/data"
   echo "Options:"
   echo " --fake          gives you fake cmvn stats that do no normalization."
   echo " --two-channel   is for two-channel telephone data, there must be no segments "
   echo "                 file and reco2file_and_channel must be present.  It will take"
   echo "                 only frames that are louder than the other channel."
   echo " --fake-dims <n1:n2>  Generate stats that won't cause normalization for these"
   echo "                  dimensions (e.g. 13:14:15)"
   exit 1;
fi

[ -f path.sh ] && . ./path.sh

# Positional arguments; the log and output directories fall back to
# sub-directories of the data directory when they are not given.
data=$1
logdir=${2-$data/log}
cmvndir=${3-$data/data}

# Turn $cmvndir into an absolute pathname (prefix it with the current
# directory unless it already starts with a slash).
cmvndir=$(perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $cmvndir ${PWD})

# The last component of the data directory becomes part of the archive name.
name=$(basename $data)

mkdir -p $cmvndir || exit 1;
mkdir -p $logdir || exit 1;


# Input files that have to exist.
required="$data/feats.scp $data/spk2utt"

for f in $required; do
  [ -f $f ] || { echo "$0: no such file $f"; exit 1; }
done

# Compute the statistics with one of four methods, writing the archive
# cmvn_$name.ark and its index cmvn_$name.scp into $cmvndir. In every branch
# the diagnostics of the Kaldi programs go to $logdir/cmvn_$name.log, which is
# the file the error messages point to (two of the branches used to name that
# log without ever writing it).
if $fake; then
  # Fake stats: for every speaker in spk2utt a 2 x (dim+1) matrix with a first
  # row of dim zeros followed by 1 and a second row of dim ones followed by 0.
  dim=$(feat-to-dim scp:$data/feats.scp -) || {
    echo "$0: could not get the feature dimension from $data/feats.scp"
    exit 1
  }
  if ! cat $data/spk2utt | awk -v dim=$dim '{print $1, "["; for (n=0; n < dim; n++) { printf("0 "); } print "1";
                                                        for (n=0; n < dim; n++) { printf("1 "); } print "0 ]";}' | \
    copy-matrix ark:- ark,scp:$cmvndir/cmvn_$name.ark,$cmvndir/cmvn_$name.scp \
      2> $logdir/cmvn_$name.log; then
    echo "Error creating fake CMVN stats.  See $logdir/cmvn_$name.log."
    exit 1
  fi
elif $two_channel; then
  if ! compute-cmvn-stats-two-channel $data/reco2file_and_channel scp:$data/feats.scp \
       ark,scp:$cmvndir/cmvn_$name.ark,$cmvndir/cmvn_$name.scp \
       2> $logdir/cmvn_$name.log; then
    echo "Error computing CMVN stats (using two-channel method). See $logdir/cmvn_$name.log."
    exit 1
  fi
elif [ ! -z "$fake_dims" ]; then
  # Real stats, post-processed by modify-cmvn-stats for the given dimensions.
  # The brace group sends the diagnostics of both programs to the one log.
  if ! { compute-cmvn-stats --spk2utt=ark:$data/spk2utt scp:$data/feats.scp ark:- | \
         modify-cmvn-stats "$fake_dims" ark:- ark,scp:$cmvndir/cmvn_$name.ark,$cmvndir/cmvn_$name.scp; } \
       2> $logdir/cmvn_$name.log; then
    echo "Error computing (partially fake) CMVN stats.  See $logdir/cmvn_$name.log"
    exit 1
  fi
else
  if ! compute-cmvn-stats --spk2utt=ark:$data/spk2utt scp:$data/feats.scp ark,scp:$cmvndir/cmvn_$name.ark,$cmvndir/cmvn_$name.scp \
       2> $logdir/cmvn_$name.log; then
    echo "Error computing CMVN stats. See $logdir/cmvn_$name.log"
    exit 1
  fi
fi

# Publish the index of the statistics in the data directory.
cp $cmvndir/cmvn_$name.scp $data/cmvn.scp || exit 1;

# Compare the number of stats entries with the number of lines in spk2utt.
# A mismatch is only a warning, unless there are no entries at all.
nc=$(cat $data/cmvn.scp | wc -l)
nu=$(cat $data/spk2utt | wc -l)
if [ $nc -ne $nu ]; then
  echo "$0: warning: it seems not all of the speakers got cmvn stats ($nc != $nu);"
  if [ $nc -eq 0 ]; then
    exit 1;
  fi
fi

echo "Succeeded creating CMVN stats for $name"


================================================
FILE: egs/steps/compute_vad_decision.sh
================================================
#!/bin/bash 

# Copyright    2017  Vimal Manohar
# Apache 2.0

# To be run from .. (one directory up from here)
# see ../run.sh for example

# Compute energy based VAD output

# Defaults; each can be overridden on the command line via parse_options.sh.
nj=4
cmd=run.pl
vad_config=conf/vad.conf

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh
. parse_options.sh || exit 1;

# One to three positional arguments are accepted; anything else prints the
# usage text and exits.
case $# in
  1|2|3)
    ;;
  *)
    echo "Usage: $0 [options] <data-dir> [<log-dir> [<vad-dir>]]";
    echo "e.g.: $0 data/train exp/make_vad mfcc"
    echo "Note: <log-dir> defaults to <data-dir>/log, and <vad-dir> defaults to <data-dir>/data"
    echo " Options:"
    echo "  --vad-config <config-file>                       # config passed to compute-vad-energy"
    echo "  --nj <nj>                                        # number of parallel jobs"
    echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
    exit 1;
    ;;
esac

# Positional arguments; the log and output directories fall back to
# sub-directories of the data directory when they are not given.
data=$1
logdir=${2-$data/log}
vaddir=${3-$data/data}


# Turn $vaddir into an absolute pathname (prefix it with the current directory
# unless it already starts with a slash).
vaddir=$(perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $vaddir ${PWD})

# The last component of the data directory becomes part of the archive name.
name=$(basename $data)

mkdir -p $vaddir || exit 1;
mkdir -p $logdir || exit 1;

# Keep a copy of an existing vad.scp before it gets overwritten.
if [ -f $data/vad.scp ]; then
  mkdir -p $data/.backup
  echo "$0: moving $data/vad.scp to $data/.backup"
  mv $data/vad.scp $data/.backup
fi

# Both the features and the VAD configuration have to exist.
for f in $data/feats.scp "$vad_config"; do
  if [ ! -f $f ]; then
    echo "compute_vad_decision.sh: no such file $f"
    exit 1;
  fi
done

# Split the data directory into $nj parts and run one compute-vad job on the
# features of each part.
utils/split_data.sh $data $nj || exit 1;
sdata=$data/split$nj;

$cmd JOB=1:$nj $logdir/vad_${name}.JOB.log \
  compute-vad --config=$vad_config scp:$sdata/JOB/feats.scp \
  ark,scp:$vaddir/vad_${name}.JOB.ark,$vaddir/vad_${name}.JOB.scp || exit 1

# Concatenate the per-job indexes into a single vad.scp.
for ((n=1; n<=nj; n++)); do
  cat $vaddir/vad_${name}.$n.scp || exit 1;
done > $data/vad.scp

# vad.scp is compared with feats.scp, which has one line per utterance, so the
# warning speaks of utterances (it used to say "speakers"). A mismatch is only
# a warning, unless there is no VAD output at all.
nc=$(cat $data/vad.scp | wc -l)
nu=$(cat $data/feats.scp | wc -l)
if [ $nc -ne $nu ]; then
  echo "**Warning it seems not all of the utterances got VAD output ($nc != $nu);"
  echo "**validate_data_dir.sh will fail; you might want to use fix_data_dir.sh"
  [ $nc -eq 0 ] && exit 1;
fi


echo "Created VAD output for $name"


================================================
FILE: egs/steps/conf/append_eval_to_ctm.py
================================================
#!/usr/bin/env python

# Copyright 2015  Brno University of Technology (author: Karel Vesely)
# Apache 2.0

from __future__ import print_function
import sys,operator

# Append Levenshtein alignment of 'hypothesis' and 'reference' into 'CTM':
# (i.e. the output of 'align-text' post-processed by 'wer_per_utt_details.pl')

# The tags in the appended column are:
#  'C' = correct
#  'S' = substitution
#  'I' = insertion
#  'U' = unknown (not part of scored segment)

# Exactly three positional arguments are expected.
if len(sys.argv) != 4:
  print('Usage: %s eval-in ctm-in ctm-eval-out' % __file__)
  sys.exit(1)
eval_in, ctm_in, ctm_eval_out = sys.argv[1:]

# A dash as output name selects the standard output.
if ctm_eval_out == '-':
  ctm_eval_out = '/dev/stdout'

# Read the evaluation: for every utterance keep the list of
# (operation, hypothesis-word) pairs, leaving out the operation 'D'.
eval_vec = {}
with open(eval_in, 'r') as f:
  while True:
    # One utterance is encoded by 4 consecutive lines: ref, hyp, op, csid.
    ref_line, hyp_line, op_line, csid_line = [f.readline() for _ in range(4)]
    if not ref_line:
      break
    # Only the 'hyp' and 'op' lines are used; both are
    # "<utt> <tag> <space-separated items>".
    utt, tag, hyp_txt = hyp_line.split(' ', 2)
    assert tag == 'hyp'
    utt, tag, op_txt = op_line.split(' ', 2)
    assert tag == 'op'
    # Every utterance may appear only once.
    assert utt not in eval_vec
    eval_vec[utt] = [(o, w)
                     for o, w in zip(op_txt.split(), hyp_txt.split())
                     if o != 'D']

# Load the 'ctm', grouping its records by utterance.
ctm = {}
with open(ctm_in) as f:
  for line in f:
    utt, ch, beg, dur, wrd, conf = line.split()
    ctm.setdefault(utt, []).append(
        (utt, ch, float(beg), float(dur), wrd, float(conf)))

# Build the 'ctm' records with the evaluation tag appended; the result has the
# format (utt, ch, beg, dur, ctm_wrd, conf, op).
ctm_eval = []
for utt, ctm_part in ctm.items():
  ctm_part.sort(key=operator.itemgetter(2))  # order by 'beg' time,
  if utt not in eval_vec:
    print('Missing key', utt, 'in the word-evaluation stats from scoring')
    continue
  tagged = []
  for ctm_tup, (op_tag, hyp_wrd) in zip(ctm_part, eval_vec[utt]):
    ctm_wrd = ctm_tup[4]
    # The words in 'ctm' and in the evaluation stats have to match!
    assert hyp_wrd == ctm_wrd, "We failed with words: hyp_wrd %s, ctm_wrd %s" % (hyp_wrd,ctm_wrd)
    tagged.append(ctm_tup + (op_tag,))
  ctm_eval.extend(tagged)

# Final order: by utterance, channel and begin time.
ctm_eval.sort(key=operator.itemgetter(0, 1, 2))

# Store,
with open(ctm_eval_out, 'w') as f:
  f.writelines('%s %s %f %f %s %f %s\n' % rec for rec in ctm_eval)


================================================
FILE: egs/steps/conf/append_prf_to_ctm.py
================================================
#!/usr/bin/env python

# Copyright 2015  Brno University of Technology (author: Karel Vesely)
# Apache 2.0

from __future__ import print_function
import sys

# Append Levenshtein alignment of 'hypothesis' and 'reference' into 'CTM':
# (parsed from the 'prf' output of 'sclite')

# The tags in appended column are:
#  'C' = correct
#  'S' = substitution
#  'I' = insertion
#  'U' = unknown (not part of scored segment)

# Command line: <prf> <ctm_in> <ctm_out>, where '-' as output means stdout,
if len(sys.argv) != 4:
  print("Usage: %s prf ctm_in ctm_out" % __file__)
  sys.exit(1)
prf_file, ctm_file, ctm_out_file = sys.argv[1:]

if ctm_out_file == '-':
  ctm_out_file = '/dev/stdout'

# Collect one (file, channel, 'H_T1:' line, 'Eval:' line) tuple per 'Eval:' line,
# the most recently seen 'File:', 'Channel:' and 'H_T1:' values are used,
prf = []
with open(prf_file) as prf_fd:
  for line in prf_fd:
    if line.startswith('File:'):
      file_id = line.split()[1]
    elif line.startswith('Channel:'):
      chan = line.split()[1]
    elif line.startswith('H_T1:'):
      h_t1 = line
    elif line.startswith('Eval:'):
      prf.append((file_id,chan,h_t1,line))

# Build the lookup: 'file,channel' -> { begin-time -> evaluation letter },
prf_dict = dict()
for (prf_file_id, prf_chan, times, tags) in prf:
  pos = 0 # character offset into the 'H_T1:' line,
  while pos < len(times):
    token = times[pos:].partition(' ')[0] # text up to the next space,
    try:
      # the letter sits in the 'Eval:' line at the same column as the time,
      # a blank at that column is stored as 'C',
      tag = tags[pos]
      if tag == ' ':
        tag = 'C'
      # the per-file dictionary is created even if the token is not a number,
      per_file = prf_dict.setdefault('%s,%s' % (prf_file_id, prf_chan), dict())
      per_file[float(token)] = tag
    except ValueError:
      # the token was not a time (label, empty string between spaces, ...),
      pass
    pos += len(token)+1 # skip the token and the space after it,

# Read the ctm file (with confidences),
with open(ctm_file) as ctm_fd:
  ctm = [ line.split() for line in ctm_fd ]

# Look up the tag of every ctm word, defaulting to 'U',
# (U = unknown, C = correct, S = substitution, I = insertion),
ctm_out = []
for rec_file, rec_chan, beg, dur, wrd, conf in ctm:
  # the ctm file/channel is lower-cased for the lookup,
  per_file = prf_dict.get(('%s,%s'%(rec_file,rec_chan)).lower())
  if per_file is None:
    sclite_tag = 'U'
  else:
    sclite_tag = per_file.get(float(beg), 'U')
  ctm_out.append([rec_file,rec_chan,beg,dur,wrd,conf,sclite_tag])

# Write the augmented ctm file,
with open(ctm_out_file, 'w') as out_fd:
  out_fd.writelines([' '.join(ctm_record)+'\n' for ctm_record in ctm_out])


================================================
FILE: egs/steps/conf/apply_calibration.sh
================================================
#!/usr/bin/env bash
# Copyright 2015, Brno University of Technology (Author: Karel Vesely). Apache 2.0.

# Applies an already trained logistic regression (read from
# <calibration-dir>/calibration.mdl) to the per-word confidences of a 'ctm',
# which is extracted from the lattices in <decode-dir>.
# The result is written to <output-dir>/ctm_calibrated.

# begin configuration section.
cmd=
stage=0
# end configuration section.

[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1;

if [ $# -ne 5 ]; then
  echo "Usage: $0 [opts] <data-dir> <lang-dir|graph-dir> <decode-dir> <calibration-dir> <output-dir>"
  echo " Options:"
  echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
  exit 1;
fi

set -euo pipefail

# Note: 'data' is assigned but not used anywhere below,
data=$1
lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied.
latdir=$3
caldir=$4
dir=$5

model=$latdir/../final.mdl # assume model one level up from decoding dir.
calibration=$caldir/calibration.mdl
word_feats=$caldir/word_feats
word_categories=$caldir/word_categories

# Check the inputs ('--cmd' has no default, so it must be given),
for f in $lang/words.txt $word_feats $word_categories $latdir/lat.1.gz $calibration $model; do
  [ ! -f $f ] && echo "$0: Missing file $f" && exit 1
done
[ -z "$cmd" ] && echo "$0: Missing --cmd '...'" && exit 1

[ -d $dir/log ] || mkdir -p $dir/log
# The number of jobs comes from the decode dir,
# 'lmwt' and 'decode_mbr' are read from the calibration dir,
nj=$(cat $latdir/num_jobs)
lmwt=$(cat $caldir/lmwt)
decode_mbr=$(cat $caldir/decode_mbr)

# Store the setup,
echo $lmwt >$dir/lmwt
echo $decode_mbr >$dir/decode_mbr 
cp $calibration $dir/calibration.mdl
cp $word_feats $dir/word_feats
cp $word_categories $dir/word_categories

# Create the ctm with raw confidences,
# - we keep the timing relative to the utterance,
# - the 5th ctm column is mapped to symbols per job, the merged 'ctm'
#   is then mapped back through 'words.txt' into 'ctm_int',
if [ $stage -le 0 ]; then
  $cmd JOB=1:$nj $dir/log/get_ctm.JOB.log \
    lattice-scale --inv-acoustic-scale=$lmwt "ark:gunzip -c $latdir/lat.JOB.gz|" ark:- \| \
    lattice-limit-depth ark:- ark:- \| \
    lattice-push --push-strings=false ark:- ark:- \| \
    lattice-align-words-lexicon --max-expand=10.0 \
     $lang/phones/align_lexicon.int $model ark:- ark:- \| \
    lattice-to-ctm-conf --decode-mbr=$decode_mbr ark:- - \| \
    utils/int2sym.pl -f 5 $lang/words.txt \
    '>' $dir/JOB.ctm
  # Merge and clean,
  for ((n=1; n<=nj; n++)); do cat $dir/${n}.ctm; done > $dir/ctm
  rm $dir/*.ctm
  cat $dir/ctm | utils/sym2int.pl -f 5 $lang/words.txt >$dir/ctm_int
fi

# Compute lattice-depth (skipped if the depth file already exists),
latdepth=$dir/lattice_frame_depth.ark
if [ $stage -le 1 ]; then
  [ -e $latdepth ] || steps/conf/lattice_depth_per_frame.sh --cmd "$cmd" $latdir $dir
fi

# Create the forwarding data for logistic regression,
# (no '--conf-targets' is passed, only the input features are written),
if [ $stage -le 2 ]; then
  steps/conf/prepare_calibration_data.py --conf-feats $dir/forward_feats.ark \
    --lattice-depth $latdepth $dir/ctm_int $word_feats $word_categories
fi

# Apply the calibration model to the forwarding features,
# - the awk command keeps the part of the key before the 1st ',',
#   turns the '^' separators into spaces, and appends the 4th output column,
if [ $stage -le 3 ]; then
  logistic-regression-eval --apply-log=false $calibration \
    ark:$dir/forward_feats.ark ark,t:- | \
    awk '{ key=$1; p_corr=$4; sub(/,.*/,"",key); gsub(/\^/," ",key); print key,p_corr }' | \
    utils/int2sym.pl -f 5 $lang/words.txt \
    >$dir/ctm_calibrated
fi

exit 0


================================================
FILE: egs/steps/conf/convert_ctm_to_tra.py
================================================
#!/usr/bin/env python

# Copyright 2015  Brno University of Technology (author: Karel Vesely)
# Apache 2.0

from __future__ import print_function
import sys, operator

# This scripts loads a 'ctm' file and converts it into the 'tra' format:
# "utt-key word1 word2 word3 ... wordN"
# The 'utt-key' is the 1st column in the CTM.

# Typically the CTM contains:
# - utterance-relative timimng (i.e. prepared without 'utils/convert_ctm.pl')
# - confidences

# Command line: <ctm-in> <tra-out>, '-' selects stdin / stdout,
if len(sys.argv) != 3:
  print('Usage: %s ctm-in tra-out' % __file__)
  sys.exit(1)
ctm_in, tra_out = sys.argv[1:]

if ctm_in == '-':
  ctm_in = '/dev/stdin'
if tra_out == '-':
  tra_out = '/dev/stdout'

# Collect the (begin-time, word) pairs of each utterance,
tra = dict()
with open(ctm_in) as ctm_fd:
  for line in ctm_fd:
    utt, ch, beg, dur, wrd, conf = line.split()
    tra.setdefault(utt, []).append((float(beg),wrd))

# Write one line per utterance: "utt-key word1 word2 ... wordN",
with open(tra_out,'w') as tra_fd:
  for utt in tra:
    # the words are ordered by their begin time,
    timed_words = sorted(tra[utt], key = operator.itemgetter(0))
    words = [ wrd for (beg, wrd) in timed_words ]
    tra_fd.write('%s %s\n' % (utt,' '.join(words)))


================================================
FILE: egs/steps/conf/get_ctm_conf.sh
================================================
#!/usr/bin/env bash
# Copyright Johns Hopkins University (Author: Daniel Povey) 2012.  Apache 2.0.

# This script produces CTM files from a decoding directory that has lattices
# present.  This version gives you confidence scores using MBR decoding.
# See also steps/get_ctm.sh
# One CTM is written per LM weight, into <decode-dir>/score_<LMWT>/<name>.ctm,
# where <name> is the basename of <data-dir>.


# begin configuration section.
cmd=run.pl
stage=0
min_lmwt=5
max_lmwt=20
use_segments=true # if we have a segments file, use it to convert
                  # the segments to be relative to the original files.
iter=final
beam=5  # pruning beam before MBR decoding
#end configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1;

# Note: the usage mentions stages 0|1|2, but only 'stage -le 0' is tested below.
if [ $# -ne 3 ]; then
  echo "This script produces CTM files from a decoding directory that has lattices "
  echo "present.  This version gives you confidence scores using MBR decoding."
  echo "Usage: $0 [options] <data-dir> <lang-dir|graph-dir> <decode-dir>"
  echo " Options:"
  echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
  echo "    --stage (0|1|2)                 # start scoring script from part-way through."
  echo "    --use-segments (true|false)     # use segments and reco2file_and_channel files "
  echo "                                    # to produce a ctm relative to the original audio"
  echo "                                    # files, with channel information (typically needed"
  echo "                                    # for NIST scoring)."
  echo "e.g.:"
  echo "$0 data/train data/lang exp/tri4a/decode/"
  echo "See also: steps/get_ctm.sh, steps/get_ctm_conf_fast.sh"
  exit 1;
fi

data=$1
lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied.
dir=$3

model=$dir/../$iter.mdl # assume model one level up from decoding dir.


for f in $lang/words.txt $model $dir/lat.1.gz; do
  [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1;
done

name=`basename $data`; # e.g. eval2000

mkdir -p $dir/scoring/log

# Select the '--frame-shift' option for lattice-to-ctm-conf:
# - an explicit 'frame_shift' file next to the model takes precedence,
# - otherwise the subsampling factor is appended to the string "0.0",
#   e.g. factor 3 gives 0.03,
# NOTE(review): the string concatenation yields e.g. 0.010 for a factor of 10;
# presumably only single-digit factors are expected -- confirm.
frame_shift_opt=
if [ -f $dir/../frame_shift ]; then
  frame_shift_opt="--frame-shift=$(cat $dir/../frame_shift)"
  echo "$0: $dir/../frame_shift exists, using $frame_shift_opt"
elif [ -f $dir/../frame_subsampling_factor ]; then
  factor=$(cat $dir/../frame_subsampling_factor) || exit 1
  frame_shift_opt="--frame-shift=0.0$factor"
  echo "$0: $dir/../frame_subsampling_factor exists, using $frame_shift_opt"
fi

if [ $stage -le 0 ]; then
  # With a 'segments' file (and --use-segments true) the ctm is passed through
  # utils/convert_ctm.pl, otherwise it is written unchanged ('cat'),
  if [ -f $data/segments ] && $use_segments; then
    f=$data/reco2file_and_channel
    [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1;
    filter_cmd="utils/convert_ctm.pl $data/segments $data/reco2file_and_channel"
  else
    filter_cmd=cat
  fi

  # All the lattice archives of the decode dir are read by each LMWT job,
  nj=$(cat $dir/num_jobs)
  lats=$(for n in $(seq $nj); do echo -n "$dir/lat.$n.gz "; done)
  # The word alignment uses 'word_boundary.int' if present,
  # otherwise it falls back to 'align_lexicon.int',
  if [ -f $lang/phones/word_boundary.int ]; then
    $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/get_ctm.LMWT.log \
      set -o pipefail '&&' mkdir -p $dir/score_LMWT/ '&&' \
      lattice-prune --inv-acoustic-scale=LMWT --beam=$beam "ark:gunzip -c $lats|" ark:- \| \
      lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \
      lattice-to-ctm-conf $frame_shift_opt --decode-mbr=true --inv-acoustic-scale=LMWT ark:- - \| \
      utils/int2sym.pl -f 5 $lang/words.txt \| \
      $filter_cmd '>' $dir/score_LMWT/$name.ctm || exit 1;
  else
    if [ ! -f $lang/phones/align_lexicon.int ]; then
      echo "$0: neither $lang/phones/word_boundary.int nor $lang/phones/align_lexicon.int exists: cannot align."
      exit 1;
    fi
    $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/get_ctm.LMWT.log \
      set -o pipefail '&&' mkdir -p $dir/score_LMWT/ '&&' \
      lattice-prune --inv-acoustic-scale=LMWT --beam=$beam "ark:gunzip -c $lats|" ark:- \| \
      lattice-align-words-lexicon $lang/phones/align_lexicon.int $model ark:- ark:- \| \
      lattice-to-ctm-conf $frame_shift_opt --decode-mbr=true --inv-acoustic-scale=LMWT ark:- - \| \
      utils/int2sym.pl -f 5 $lang/words.txt \| \
      $filter_cmd '>' $dir/score_LMWT/$name.ctm || exit 1;
  fi
fi


================================================
FILE: egs/steps/conf/lattice_depth_per_frame.sh
================================================
#!/usr/bin/env bash
# Copyright 2015  Brno University of Technology (Author: Karel Vesely)
# Licensed under the Apache License, Version 2.0 (the "License")

# Extract lattice-depth for each frame.
# Reads <dir-with-lats>/lat.*.gz and writes a single text archive
# <out-dir>/lattice_frame_depth.ark.

# Begin configuration
cmd=run.pl
# End configuration

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh # source the path.
. parse_options.sh || exit 1;

if [ $# != 2 ]; then
   echo "usage: $0 [opts] <dir-with-lats> <out-dir>"
   echo "main options (for others, see top of script file)"
   echo "  --config <config-file>          # config containing options"
   echo "  --cmd"
   exit 1;
fi

set -euo pipefail

latdir=$1
dir=$2

[ ! -f $latdir/lat.1.gz ] && echo "Missing $latdir/lat.1.gz" && exit 1
# One job per lattice archive,
nj=$(cat $latdir/num_jobs)

# Get the per-frame lattice depth, one text archive per job,
$cmd JOB=1:$nj $dir/log/lattice_depth_per_frame.JOB.log \
  lattice-depth-per-frame "ark:gunzip -c $latdir/lat.JOB.gz |" ark,t:$dir/lattice_frame_depth.JOB.ark
# Merge the per-job archives in job order and remove them,
for ((n=1; n<=nj; n++)); do cat $dir/lattice_frame_depth.${n}.ark; done >$dir/lattice_frame_depth.ark
rm $dir/lattice_frame_depth.*.ark

# Done!


================================================
FILE: egs/steps/conf/parse_arpa_unigrams.py
================================================
#!/usr/bin/env python

# Copyright 2015  Brno University of Technology (author: Karel Vesely)
# Apache 2.0

from __future__ import print_function
import sys, gzip, re

# Parse options,
# (a wrong number of arguments is an error, so the exit status is non-zero),
if len(sys.argv) != 4:
  print("Usage: %s <words.txt> <arpa-gz> <unigrams>" % __file__)
  sys.exit(1)
words_txt, arpa_gz, unigrams_out = sys.argv[1:]

if arpa_gz == '-': arpa_gz = '/dev/stdin'
if unigrams_out == '-': unigrams_out = '/dev/stdout'

# Load the words.txt, each line is expected to be 'word id',
with open(words_txt) as f:
  words = [ l.split() for l in f ]

# Load the unigram probabilities in 10log from ARPA,
# only the lines between '\1-grams:' and '\2-grams:' are parsed,
wrd_log10 = dict()
with gzip.open(arpa_gz,'r') as f:
  read = False
  for l in f:
    # gzip yields 'bytes' under Python 3, which never compare equal to the
    # 'str' section markers below; decode them (Python 2 lines are already 'str'),
    if not isinstance(l, str): l = l.decode('utf-8')
    l = l.strip()
    if l == '\\1-grams:': read = True
    if l == '\\2-grams:': break
    if read and len(l.split())>=2:
      # an ARPA line is 'log10_prob word [backoff]', keep the first two fields,
      log10_p_unigram, wrd = re.split('[\t ]+',l,2)[:2]
      wrd_log10[wrd] = float(log10_p_unigram)

# Create list, 'wrd id log_p_unigram',
# (words missing in the ARPA file get the value -99),
words_unigram = [[wrd, wrd_id, wrd_log10.get(wrd, -99)] for wrd,wrd_id in words ]

# Show the 1st record on stderr,
print(words_unigram[0], file=sys.stderr)
# Store,
with open(unigrams_out,'w') as f:
  f.writelines(['%s %s %g\n' % (w,i,p) for (w,i,p) in words_unigram])


================================================
FILE: egs/steps/conf/prepare_calibration_data.py
================================================
#!/usr/bin/env python

# Copyright 2015  Brno University of Technology (author: Karel Vesely)
# Apache 2.0

from __future__ import division
import sys, math

from optparse import OptionParser
desc = """
Prepare input features and training targets for logistic regression,
which calibrates the Minimum Bayes Risk posterior confidences.

The logisitc-regression input features are: 
- posteriors from 'ctm' transformed by logit,
- logarithm of word-length in letters,
- 10base logarithm of unigram probability of a word from language model,
- logarithm of average lattice-depth at position of the word (optional),

The logistic-regresion targets are:
- 1 for correct word,
- 0 for incorrect word (substitution, insertion),

The iput 'ctm' is augmented by per-word tags (or 'U' is added if no tags),
'C' = correct
'S' = substitution
'I' = insertion
'U' = unknown (not part of scored segment)

The script can be used both to prepare the training data,
or to prepare input features for forwarding through trained model.
"""
# The usage lists the three positional arguments that are unpacked below,
# (the per-frame lattice depths are passed by the '--lattice-depth' option),
usage = "%prog [opts] ctm word-feats word-categories"
parser = OptionParser(usage=usage, description=desc)
parser.add_option("--conf-targets", help="Targets file for logistic regression (no targets generated if '') [default %default]", default='')
parser.add_option("--conf-feats", help="Feature file for logistic regression. [default %default]", default='')
parser.add_option("--lattice-depth", help="Per-frame lattice depths, ascii-ark (optional). [default %default]", default='')
(o, args) = parser.parse_args()

if len(args) != 3:
  parser.print_help()
  sys.exit(1)
ctm_file, word_feats_file, word_categories_file = args

# The feature file is always written, so '--conf-feats' is mandatory,
# (checked explicitly, an 'assert' would be skipped with 'python -O'),
if o.conf_feats == '':
  parser.error("the option --conf-feats is required")

# Load the ctm (optionally add eval column with 'U'):
# - a 6-column 'ctm' has no evaluation tags, 'U' is appended to each record,
with open(ctm_file) as f:
  ctm = [ l.split() for l in f ]
if len(ctm[0]) == 6:
  for record in ctm: record.append('U')
assert(len(ctm[0]) == 7)

# Load the word-features, the format: "wrd wrd_id filter length other_feats"
# (typically 'other_feats' are unigram log-probabilities),
with open(word_feats_file) as f:
  word_feats = [ l.split(None,4) for l in f ]

# Prepare filtering dict,
word_filter = { wrd_id:bool(int(keep)) for (wrd,wrd_id,keep,length,feats_str) in word_feats }
# Prepare the length dict,
word_length = { wrd_id:float(length) for (wrd,wrd_id,keep,length,feats_str) in word_feats }
# Prepare other_feats dict (kept as a string, it is pasted into the feature vector),
other_feats = { wrd_id:feats_str.strip() for (wrd,wrd_id,keep,length,feats_str) in word_feats }

# Build the targets,
if o.conf_targets != '':
  with open(o.conf_targets,'w') as f:
    for (utt, chan, beg, dur, wrd_id, conf, score_tag) in ctm:
      # Skip the words we don't know if being correct,
      if score_tag == 'U': continue
      # Some words are excluded from training (partial words, hesitations, etc.),
      # (Value: 1 == keep word, 0 == exclude word from the targets),
      if not word_filter[wrd_id]: continue
      # Build the key,
      key = "%s^%s^%s^%s^%s,%s,%s" % (utt, chan, beg, dur, wrd_id, conf, score_tag)
      # Build the target,
      tgt = 1 if score_tag == 'C' else 0 # Correct = 1, else 0,
      # Write,
      f.write('%s %d\n' % (key,tgt))

# Load the per-frame lattice-depth,
# - we assume, the 1st column in 'ctm' is the 'utterance-key' in depth file,
# - each line is 'utt depth_1 depth_2 ... depth_N' with integer depths,
if o.lattice_depth:
  depths = dict()
  with open(o.lattice_depth) as f:
    for l in f:
      utt,d = l.split(' ',1)
      depths[utt] = [int(i) for i in d.split()]

# Load the 'word_categories' mapping for categorical input features derived from 'lang/words.txt',
# - each line is 'wrd wrd_id category', the number of categories is the largest id plus one,
with open(word_categories_file) as f:
  wrd_to_cat = [ l.split() for l in f ]
wrd_to_cat = { wrd_id:int(category) for wrd,wrd_id,category in wrd_to_cat }
wrd_cat_num = max(wrd_to_cat.values()) + 1

# Write one feature vector per 'ctm' record,
with open(o.conf_feats,'w') as feats_fd:
  damper = 0.001 # keeps both logarithms of the logit finite,
  for (utt, chan, beg, dur, wrd_id, conf, score_tag) in ctm:
    # The key has the same layout as the key of the targets,
    key = "%s^%s^%s^%s^%s,%s,%s" % (utt, chan, beg, dur, wrd_id, conf, score_tag)

    # 1) logit of the confidence from the 'ctm',
    posterior = float(conf)
    logit = math.log(posterior+damper) - math.log(1.0 - posterior+damper)
    # 2) log of the word-length taken from the word-features,
    log_word_length = math.log(word_length[wrd_id])
    # 3) 1-of-k encoding of the word category,
    category = wrd_to_cat[wrd_id]
    wrd_1_of_k = [0]*wrd_cat_num
    wrd_1_of_k[category] = 1

    # The vector is: logit, log-length, other features (a string), 1-of-k,
    feats = [ logit, log_word_length, other_feats[wrd_id] ]
    feats.extend(wrd_1_of_k)

    # 4) optionally, log of the average lattice-depth over the word,
    if o.lattice_depth != '':
      # the times are converted to frame indices by the factor 100,
      # (assumes 100 frames per second in the depth file -- TODO confirm),
      beg_sec = float(beg)
      frame_beg = int(round(100.0*beg_sec))
      frame_end = int(round(100.0*(beg_sec+float(dur))))
      depth_slice = depths[utt][frame_beg:frame_end]
      # NOTE(review): an empty slice raises ZeroDivisionError here,
      log_avg_depth = math.log(float(sum(depth_slice))/len(depth_slice))
      feats.append(log_avg_depth)

    # Store as 'key [ f1 f2 ... fN ]',
    feats_fd.write(key + ' [ ' + ' '.join(str(value) for value in feats) + ' ]\n')


================================================
FILE: egs/steps/conf/prepare_word_categories.py
================================================
#!/usr/bin/env python

# Copyright 2015  Brno University of Technology (author: Karel Vesely)
# Apache 2.0

import sys

from optparse import OptionParser
desc = """
Prepare mapping of words into categories. Each word with minimal frequency 
has its own category, the rest is merged into single class.
"""
# The 2nd argument is a text in the 'tra' format ("utt-key word1 ... wordN"),
# it is not a 'ctm': the 1st column of each line is dropped, the rest are words,
usage = "%prog [opts] words.txt text category_mapping"
parser = OptionParser(usage=usage, description=desc)
parser.add_option("--min-count", help="Minimum word-count to have a single word category. [default %default]", type='int', default=20)
(o, args) = parser.parse_args()

if len(args) != 3:
  parser.print_help()
  sys.exit(1)
words_file, text_file, category_mapping_file = args

if text_file == '-': text_file = '/dev/stdin'
if category_mapping_file == '-': category_mapping_file = '/dev/stdout'

# Read the words from the 'tra' file (the utterance key is skipped),
with open(text_file) as f:
  text_words = [ l.split()[1:] for l in f ]

# Flatten the array of arrays of words,
import itertools
text_words = list(itertools.chain.from_iterable(text_words))

# Count the words (regardless if correct or incorrect),
word_counts = dict()
for w in text_words:
  word_counts[w] = word_counts.get(w, 0) + 1

# Read the words.txt, each line is expected to be 'word id',
with open(words_file) as f:
  word_id = [ l.split() for l in f ]

# Append the categories,
# - category 0 is shared by all the words without their own category,
# - a word gets its own category if its count is strictly above '--min-count',
#   NOTE(review): the option help says "minimum", which suggests '>=' -- confirm,
n=1
word_id_cat=[]
for word, idx in word_id:
  cat = 0
  if word in word_counts and word_counts[word] > o.min_count:
    cat = n; n += 1
  word_id_cat.append([word, idx, str(cat)])

# Store the mapping, one 'word id category' record per line,
with open(category_mapping_file,'w') as f:
  f.writelines([' '.join(record)+'\n' for record in word_id_cat])


================================================
FILE: egs/steps/conf/train_calibration.sh
================================================
#!/usr/bin/env bash
# Copyright 2015, Brno University of Technology (Author: Karel Vesely). Apache 2.0.

# Trains logistic regression, which calibrates the per-word confidences in 'CTM'.
# The 'raw' confidences are obtained by Minimum Bayes Risk decoding.

# The input features of logistic regression are:
# - logit of Minimum Bayes Risk posterior
# - log of word-length in characters
# - log of average depth of a lattice at words' position
# - log of frames per character ratio
# (- categorical distribution of 'lang/words.txt', DISABLED)

# begin configuration section.
cmd=
lmwt=12
decode_mbr=true
word_min_count=10 # Minimum word-count for single-word category,
normalizer=0.0025 # L2 regularization constant,
category_text= # Alternative corpus for counting words to get word-categories (by default using 'ctm'),
stage=0
# end configuration section.

[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1;

# The usage lists the variables of the configuration section above,
if [ $# -ne 5 ]; then
  echo "Usage: $0 [opts] <data-dir> <lang-dir|graph-dir> <word-feats> <decode-dir> <calibration-dir>"
  echo " Options:"
  echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
  echo "    --lmwt <int>                    # scaling for confidence extraction"
  echo "    --decode-mbr <bool>             # use Minimum Bayes Risk decoding"
  echo "    --word-min-count <int>          # minimum word-count for single-word category"
  echo "    --normalizer <float>            # L2 regularization constant"
  echo "    --category-text <file>          # alternative corpus for the word-categories (default: the 'ctm')"
  echo "    --stage <int>                   # start from part-way through"
  exit 1;
fi

set -euo pipefail

data=$1
lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied.
word_feats=$3
latdir=$4
dir=$5

model=$latdir/../final.mdl # assume model one level up from decoding dir.

# Check the inputs ('--cmd' has no default, so it must be given),
for f in $data/text $lang/words.txt $word_feats $latdir/lat.1.gz; do
  [ ! -f $f ] && echo "$0: Missing file $f" && exit 1
done
[ -z "$cmd" ] && echo "$0: Missing --cmd '...'" && exit 1

[ -d $dir/log ] || mkdir -p $dir/log
nj=$(cat $latdir/num_jobs)

# Store the setup,
echo $lmwt >$dir/lmwt
echo $decode_mbr >$dir/decode_mbr
cp $word_feats $dir/word_feats

# Create the ctm with raw confidences,
# - we keep the timing relative to the utterance,
if [ $stage -le 0 ]; then
  $cmd JOB=1:$nj $dir/log/get_ctm.JOB.log \
    lattice-scale --inv-acoustic-scale=$lmwt "ark:gunzip -c $latdir/lat.JOB.gz|" ark:- \| \
    lattice-limit-depth ark:- ark:- \| \
    lattice-push --push-strings=false ark:- ark:- \| \
    lattice-align-words-lexicon --max-expand=10.0 \
     $lang/phones/align_lexicon.int $model ark:- ark:- \| \
    lattice-to-ctm-conf --decode-mbr=$decode_mbr ark:- - \| \
    utils/int2sym.pl -f 5 $lang/words.txt \
    '>' $dir/JOB.ctm
  # Merge and clean,
  for ((n=1; n<=nj; n++)); do cat $dir/${n}.ctm; done > $dir/ctm
  rm $dir/*.ctm
fi

# Get evaluation of the 'ctm' using the 'text' reference,
if [ $stage -le 1 ]; then
  steps/conf/convert_ctm_to_tra.py $dir/ctm - | \
  align-text --special-symbol="<eps>" ark:$data/text ark:- ark,t:- | \
  utils/scoring/wer_per_utt_details.pl --special-symbol "<eps>" \
  >$dir/align_text
  # Append alignment to ctm,
  steps/conf/append_eval_to_ctm.py $dir/align_text $dir/ctm $dir/ctm_aligned
  # Convert words to 'ids',
  cat $dir/ctm_aligned | utils/sym2int.pl -f 5 $lang/words.txt >$dir/ctm_aligned_int
fi

# Prepare word-categories (based on word frequencies in 'ctm'),
# Note: this step is not guarded by '--stage', it runs on every invocation,
if [ -z "$category_text" ]; then
  steps/conf/convert_ctm_to_tra.py $dir/ctm - | \
  steps/conf/prepare_word_categories.py --min-count $word_min_count $lang/words.txt - $dir/word_categories
else
  steps/conf/prepare_word_categories.py --min-count $word_min_count $lang/words.txt "$category_text" $dir/word_categories
fi

# Compute lattice-depth (skipped if the depth file already exists),
latdepth=$dir/lattice_frame_depth.ark
if [ $stage -le 2 ]; then
  [ -e $latdepth ] || steps/conf/lattice_depth_per_frame.sh --cmd "$cmd" $latdir $dir
fi

# Create the training data for logistic regression,
if [ $stage -le 3 ]; then
  steps/conf/prepare_calibration_data.py \
    --conf-targets $dir/train_targets.ark --conf-feats $dir/train_feats.ark \
    --lattice-depth $latdepth $dir/ctm_aligned_int $word_feats $dir/word_categories
fi

# Train the logistic regression,
if [ $stage -le 4 ]; then
  logistic-regression-train --binary=false --normalizer=$normalizer ark:$dir/train_feats.ark \
    ark:$dir/train_targets.ark $dir/calibration.mdl 2>$dir/log/logistic-regression-train.log
fi

# Apply the calibration model to the training features,
# - the awk command keeps the part of the key before the 1st ',',
#   turns the '^' separators into spaces, and appends the 4th output column,
# NOTE(review): the output passes through 'int2sym.pl', yet the file name ends
# with '_int'; the name is kept, as other scripts may read it -- confirm.
if [ $stage -le 5 ]; then
  logistic-regression-eval --apply-log=false $dir/calibration.mdl \
    ark:$dir/train_feats.ark ark,t:- | \
    awk '{ key=$1; p_corr=$4; sub(/,.*/,"",key); gsub(/\^/," ",key); print key,p_corr }' | \
    utils/int2sym.pl -f 5 $lang/words.txt \
    >$dir/ctm_calibrated_int
fi

exit 0


================================================
FILE: egs/steps/copy_ali_dir.sh
================================================
#!/usr/bin/env bash
# Copyright 2019   Phani Sankar Nidadavolu
# Apache 2.0.

# Creates an alignment dir for augmented data: the alignments of the source dir
# are copied under new utterance-ids '<prefix>-<utt>' for each of the prefixes,
# and re-split into 'nj' archives according to the output data dir.

prefixes="reverb1 babble music noise"
include_original=true
max_jobs_run=50
nj=100
cmd=queue.pl
write_binary=true

. ./path.sh
. utils/parse_options.sh

if [ $# -ne 3 ]; then
  echo "Usage: $0 <out-data> <src-ali-dir> <out-ali-dir>"
  echo "This script creates alignments for the aug dirs by copying "
  echo " the alignments of original train dir"
  echo "While copying it adds prefix to the utterances specified by prefixes option"
  echo "Note that the original train dir does not have any prefix"
  echo "To include the original training directory in the copied "
  echo "version set the --include-original option to true"
  echo "main options (for others, see top of script file)"
  echo "  --prefixes <string of prefixes to add>    # All the prefixes of aug data to be included"
  echo "  --include-original <true/false>           # If true, will copy the alignements of original dir"
  echo "  --write-binary <true/false>               # Write alignments in binary mode"
  exit 1
fi

data=$1
src_dir=$2
dir=$3

mkdir -p $dir

# The source archives are read with the job count of the source dir,
num_jobs=$(cat $src_dir/num_jobs)

rm -f $dir/ali_tmp.*.{ark,scp} 2>/dev/null

# Copy the alignments temporarily
echo "creating temporary alignments in $dir"
$cmd --max-jobs-run $max_jobs_run JOB=1:$num_jobs $dir/log/copy_ali_temp.JOB.log \
  copy-int-vector --binary=$write_binary \
  "ark:gunzip -c $src_dir/ali.JOB.gz |" \
  ark,scp:$dir/ali_tmp.JOB.ark,$dir/ali_tmp.JOB.scp || exit 1

# Make copies of utterances for perturbed data
# (each scp line gets the utterance-id prefixed by '<prefix>-')
for p in $prefixes; do
  cat $dir/ali_tmp.*.scp | awk -v p=$p '{print p"-"$0}'
done | sort -k1,1 > $dir/ali_out.scp.aug

if [ "$include_original" == "true" ]; then
  cat $dir/ali_tmp.*.scp | awk '{print $0}' | sort -k1,1 > $dir/ali_out.scp.clean
  cat $dir/ali_out.scp.clean $dir/ali_out.scp.aug | sort -k1,1 > $dir/ali_out.scp
else
  cat $dir/ali_out.scp.aug | sort -k1,1 > $dir/ali_out.scp
fi

utils/split_data.sh ${data} $nj

# Copy and dump the alignments for perturbed data
echo Creating alignments for augmented data by copying alignments from clean data
$cmd --max-jobs-run $max_jobs_run JOB=1:$nj $dir/log/copy_out_ali.JOB.log \
  copy-int-vector --binary=$write_binary \
  "scp:utils/filter_scp.pl ${data}/split$nj/JOB/utt2spk $dir/ali_out.scp |" \
  "ark:| gzip -c > $dir/ali.JOB.gz" || exit 1

# Remove the temporary files; '-f' because 'ali_out.scp.clean' is only
# created when --include-original is true
rm -f $dir/ali_out.scp.{aug,clean} $dir/ali_out.scp
rm $dir/ali_tmp.*

echo $nj > $dir/num_jobs

# Copy the auxiliary files that exist in the source dir
for f in cmvn_opts tree splice_opts phones.txt final.mdl frame_subsampling_factor; do
  if [ -f $src_dir/$f ]; then cp $src_dir/$f $dir/$f; fi
done


================================================
FILE: egs/steps/copy_lat_dir.sh
================================================
#!/usr/bin/env bash
# Copyright 2019   Phani Sankar Nidadavolu
# Apache 2.0.

# Creates a lattice dir for augmented data: the lattices of the source dir
# are copied under new utterance-ids '<prefix>-<utt>' for each of the prefixes,
# and re-split into 'nj' archives according to the output data dir.

prefixes="reverb1 babble music noise"
include_original=true
max_jobs_run=50
nj=100
cmd=queue.pl
write_compact=true

. ./path.sh
. utils/parse_options.sh

if [ $# -ne 3 ]; then
  echo "Usage: $0 [options] <out-data> <src-lat-dir> <out-lat-dir>"
  echo "This script creates lattices for the aug dirs by copying the lattices of original train dir"
  echo "While copying it adds prefix to the utterances specified by prefixes option"
  echo "Note that the original train dir does not have any prefix"
  echo "To include the original training directory in the copied "
  echo "version set the --include-original option to true"
  echo "main options (for others, see top of script file)"
  echo "  --prefixes <string of prefixes to add>             # All the prefixes of aug data to be included"
  echo "  --include-original <true/false>                    # If true, will copy the lattices of original dir"
  echo "  --write-compact <true/false>                       # Write lattices in compact mode"
  exit 1
fi

data=$1
src_dir=$2
dir=$3

mkdir -p $dir

# The source archives are read with the job count of the source dir,
num_jobs=$(cat $src_dir/num_jobs)

rm -f $dir/lat_tmp.*.{ark,scp} 2>/dev/null

# Copy the lattices temporarily
echo "creating temporary lattices in $dir"
$cmd --max-jobs-run $max_jobs_run JOB=1:$num_jobs $dir/log/copy_lat_temp.JOB.log \
  lattice-copy --write-compact=$write_compact \
  "ark:gunzip -c $src_dir/lat.JOB.gz |" \
  ark,scp:$dir/lat_tmp.JOB.ark,$dir/lat_tmp.JOB.scp || exit 1

# Make copies of utterances for perturbed data
# (each scp line gets the utterance-id prefixed by '<prefix>-')
for p in $prefixes; do
  cat $dir/lat_tmp.*.scp | awk -v p=$p '{print p"-"$0}'
done | sort -k1,1 > $dir/lat_out.scp.aug

if [ "$include_original" == "true" ]; then
  cat $dir/lat_tmp.*.scp | awk '{print $0}' | sort -k1,1 > $dir/lat_out.scp.clean
  cat $dir/lat_out.scp.clean $dir/lat_out.scp.aug | sort -k1,1 > $dir/lat_out.scp
else
  cat $dir/lat_out.scp.aug | sort -k1,1 > $dir/lat_out.scp
fi

utils/split_data.sh ${data} $nj

# Copy and dump the lattices for perturbed data
echo Creating lattices for augmented data by copying lattices from clean data
$cmd --max-jobs-run $max_jobs_run JOB=1:$nj $dir/log/copy_out_lat.JOB.log \
  lattice-copy --write-compact=$write_compact \
  "scp:utils/filter_scp.pl ${data}/split$nj/JOB/utt2spk $dir/lat_out.scp |" \
  "ark:| gzip -c > $dir/lat.JOB.gz" || exit 1

# Remove the temporary files; '-f' because 'lat_out.scp.clean' is only
# created when --include-original is true
rm -f $dir/lat_out.scp.{aug,clean} $dir/lat_out.scp
rm $dir/lat_tmp.*

echo $nj > $dir/num_jobs

# Copy the auxiliary files that exist in the source dir
for f in phones.txt cmvn_opts splice_opts final.mdl tree frame_subsampling_factor; do
  if [ -f $src_dir/$f ]; then cp $src_dir/$f $dir/$f; fi
done


================================================
FILE: egs/steps/copy_trans_dir.sh
================================================
#!/usr/bin/env bash
# Copyright 2019   Phani Sankar Nidadavolu
# Copyright 2019   manhong wang(marvin)
# Apache 2.0.

# This script creates fMLLR transforms for the augmented dirs by copying
# the transforms of the original train dir; run it after copy_ali_dir.sh or copy_lat_dir.sh.
# Note: we do not accept --nj here; the number of jobs is read from <out-ali-dir>/num_jobs.
prefixes="reverb1 babble music noise"
include_original=true
cmd=run.pl
write_binary=true

. ./path.sh
. utils/parse_options.sh

if [ $# -ne 3 ]; then
  echo "Usage: $0 <out-data> <src-ali-dir> <out-ali-dir>"
  echo "This script creates fmllr transform for the aug dirs by copying "
  echo " the trans of original train dir"
  echo "While copying it adds prefix to the utterances specified by prefixes option"
  echo "Note that the original train dir does not have any prefix"
  echo "To include the original training directory in the copied "
  echo "version set the --include-original option to true"
  echo "main options (for others, see top of script file)"
  echo "  --prefixes <string of prefixes to add>    # All the prefixes of aug data to be included"
  echo "  --include-original <true/false>           # If true, will copy the alignements of original dir"
  exit 1
fi

data=$1
src_dir=$2
dir=$3

if [ ! -d $dir ]; then
    echo "$0: warning : you may need combine ali or lat first !" && exit 1
fi

if [ ! -f $src_dir/trans.1 ] ; then
    echo "$0: no trans exist in $src_dir dir"  && exit 1
fi


nj=$(cat $dir/num_jobs)
rm -f $dir/trans* 2>/dev/null

# Copy the fmllr trans temporarily
echo "creating temporary trans in $dir"
$cmd  JOB=1:$nj $dir/log/copy_trans_temp.JOB.log \
  copy-matrix --binary=$write_binary \
  "ark:cat $src_dir/trans.JOB |" \
  ark,scp:$dir/trans_tmp.JOB.ark,$dir/trans_tmp.JOB.scp || exit 1

# Make copies of utterances for perturbed data
for p in $prefixes; do
  cat $dir/trans_tmp.*.scp | awk -v p=$p '{print p"-"$0}'
done | sort -k1,1 > $dir/trans_out.scp.aug

if [ "$include_original" == "true" ]; then
  cat $dir/trans_tmp.*.scp | awk '{print $0}' | sort -k1,1 > $dir/trans_out.scp.clean
  cat $dir/trans_out.scp.clean $dir/trans_out.scp.aug | sort -k1,1 > $dir/trans_out.scp.old
else
  cat $dir/trans_out.scp.aug | sort -k1,1 > $dir/trans_out.scp.old
fi

utils/filter_scp.pl  ${data}/spk2utt  $dir/trans_out.scp.old  >  $dir/trans_out.scp
utils/split_data.sh ${data} $nj

# Copy and dump the trans for perturbed data
echo Creating fmllr trans for augmented data by copying fmllr trans from clean data
$cmd  JOB=1:$nj $dir/log/copy_out_trans.JOB.log \
  copy-matrix --binary=$write_binary \
  "scp:utils/split_scp.pl  --one-based -j $nj JOB $dir/trans_out.scp |" \
  ark:$dir/trans.JOB || exit 1

n_aug_trans=`wc -l $data/spk2utt`
n_copy_trans=`wc -l $dir/trans_out.scp`
echo "copy $n_copy_trans speaker's  fmllr trans of total $n_aug_trans"
rm $dir/trans_out.scp.aug  $dir/trans_out.scp.old $dir/trans_out.scp   $dir/trans_tmp.*
exit 0


================================================
FILE: egs/steps/data/augment_data_dir.py
================================================
#!/usr/bin/env python3
# Copyright 2017  David Snyder
#           2017  Ye Bai
#           2019  Phani Sankar Nidadavolu
# Apache 2.0
#
# This script generates augmented data.  It is based on
# steps/data/reverberate_data_dir.py but doesn't handle reverberation.
# It is designed to be somewhat simpler and more flexible for augmenting with
# additive noise.
from __future__ import print_function
import sys, random, argparse, os, imp
sys.path.append("steps/data/")
sys.path.insert(0, 'steps/')

from reverberate_data_dir import parse_file_to_dict
from reverberate_data_dir import write_dict_to_file
import libs.common as common_lib
data_lib = imp.load_source('dml', 'steps/data/data_dir_manipulation_lib.py')

def get_args():
    """Parse the command line of augment_data_dir.py.

    Builds the argparse parser, echoes the invoking command line to stdout
    (for logging), parses sys.argv and passes the result through
    check_args(), which validates it and adds the derived attributes
    ``utt_modifier_type`` and ``utt_modifier``.

    Returns:
        The argparse namespace returned by check_args().
    """
    parser = argparse.ArgumentParser(description="Augment the data directory with additive noises. "
        "Noises are separated into background and foreground noises which are added together or "
        "separately.  Background noises are added to the entire recording, and repeated as necessary "
        "to cover the full length.  Multiple overlapping background noises can be added, to simulate "
        "babble, for example.  Foreground noises are added sequentially, according to a specified "
        "interval.  See also steps/data/reverberate_data_dir.py "
        "Usage: augment_data_dir.py [options...] <in-data-dir> <out-data-dir> "
        "E.g., steps/data/augment_data_dir.py --utt-suffix aug --fg-snrs 20:10:5:0 --bg-snrs 20:15:10 "
        "--num-bg-noise 1:2:3 --fg-interval 3 --fg-noise-dir data/musan_noise --bg-noise-dir "
        "data/musan_music data/train data/train_aug", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # SNR lists and the number of background noises are colon-separated
    # strings; main() splits them and converts the items to int.
    parser.add_argument('--fg-snrs', type=str, dest = "fg_snr_str", default = '20:10:0',
                        help='When foreground noises are being added, the script will iterate through these SNRs.')
    parser.add_argument('--bg-snrs', type=str, dest = "bg_snr_str", default = '20:10:0',
                        help='When background noises are being added, the script will iterate through these SNRs.')
    parser.add_argument('--num-bg-noises', type=str,
                        dest = "num_bg_noises", default = '1',
                        help='Number of overlapping background noises that we iterate over.'
                            ' For example, if the input is "1:2:3" then the output wavs will have either '
                            '1, 2, or 3 randomly chosen background noises overlapping the entire recording')
    parser.add_argument('--fg-interval', type=int,
                        dest = "fg_interval", default = 0,
                        help='Number of seconds between the end of one '
                            'foreground noise and the beginning of the next.')
    # --utt-suffix and --utt-prefix are mutually exclusive; check_args()
    # raises if both are given.
    parser.add_argument('--utt-suffix', type=str,
                        dest = "utt_suffix", default = None,
                        help='Suffix added to utterance IDs.')
    parser.add_argument('--utt-prefix', type=str,
                        dest = "utt_prefix", default = None,
                        help='Prefix added to utterance IDs.')
    parser.add_argument('--random-seed', type=int, dest = "random_seed",
                        default = 123, help='Random seed.')
    parser.add_argument("--modify-spk-id", type=str,
                        dest='modify_spk_id', default=False,
                        action=common_lib.StrToBoolAction,
                        choices=["true", "false"],
                        help='Utt prefix or suffix would be added to the spk id '
                            'also (used in ASR), in speaker id it is left unmodifed')
    # At least one of the two noise directories must be given (see check_args).
    parser.add_argument("--bg-noise-dir", type=str, dest="bg_noise_dir",
                        help="Background noise data directory")
    parser.add_argument("--fg-noise-dir", type=str, dest="fg_noise_dir",
                        help="Foreground noise data directory")
    parser.add_argument("input_dir", help="Input data directory")
    parser.add_argument("output_dir", help="Output data directory")

    # Echo the full command line so that it ends up in the log.
    print(' '.join(sys.argv))
    args = parser.parse_args()
    args = check_args(args)
    return args

def check_args(args):
    """Validate the parsed options and derive the utterance-id modifier.

    Sets ``args.utt_modifier_type`` to "prefix", "suffix" or None and
    ``args.utt_modifier`` to the matching string ("" when neither option was
    given). Creates ``args.output_dir`` if it does not exist yet.

    Raises:
        Exception: if both a prefix and a suffix were requested, if
            --fg-interval is negative, or if no noise directory was given.
    """
    prefix_given = args.utt_prefix is not None
    suffix_given = args.utt_suffix is not None

    if prefix_given and suffix_given:
        raise Exception("Trying to add both prefix and suffix. Choose either of them")

    if prefix_given:
        args.utt_modifier_type = "prefix"
        args.utt_modifier = args.utt_prefix
    elif suffix_given:
        args.utt_modifier_type = "suffix"
        args.utt_modifier = args.utt_suffix
    else:
        args.utt_modifier_type = None
        args.utt_modifier = ""

    # The output directory is created before the remaining checks run.
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    if not args.fg_interval >= 0:
        raise Exception("--fg-interval must be 0 or greater")

    no_noise_source = args.bg_noise_dir is None and args.fg_noise_dir is None
    if no_noise_source:
        raise Exception("Either --fg-noise-dir or --bg-noise-dir must be specified")

    return args

def get_noise_list(noise_wav_scp_filename):
    """Read a noise wav.scp file.

    Each non-empty line is "<noise-id> <wav path or command>", split on
    single spaces; everything after the first space is the wav entry.

    Args:
        noise_wav_scp_filename: path of the wav.scp file (read as UTF-8).

    Returns:
        A tuple ``(noise_utts, noise_wavs)``: the list of noise ids in file
        order, and a dict mapping each id to its wav entry with trailing
        whitespace removed.
    """
    noise_wavs = {}
    noise_utts = []
    # The context manager closes the file; it used to be left open.
    with open(noise_wav_scp_filename, 'r', encoding='utf-8') as noise_wav_scp_file:
        for line in noise_wav_scp_file:
            # Strip the line ending first so that an id-only line does not
            # keep the newline inside the id, and skip blank lines instead of
            # registering a bogus entry for them.
            line = line.rstrip()
            if not line:
                continue
            toks = line.split(" ")
            noise_utts.append(toks[0])
            noise_wavs[toks[0]] = " ".join(toks[1:]).rstrip()
    return noise_utts, noise_wavs

def augment_wav(utt, wav, dur, fg_snr_opts, bg_snr_opts, fg_noise_utts, \
    bg_noise_utts, noise_wavs, noise2dur, interval, num_opts):
    """Build the wav.scp entry that mixes additive noises into one recording.

    Background noises: when bg_noise_utts is non-empty, a count is drawn from
    num_opts and that many noises are drawn from bg_noise_utts; each one is
    wrapped in a "wav-reverberate --duration=<dur>" command and starts at 0.
    Foreground noises: when fg_noise_utts is non-empty, noises are drawn one
    after another, each starting where the previous one ended plus
    ``interval``, until the accumulated start time reaches ``dur``.
    Every noise gets an SNR drawn from the matching *_snr_opts list.

    Args:
        utt: utterance id (not used here).
        wav: original wav.scp entry, either a file name or a command ending
            in "|".
        dur: duration of the recording, as given by the caller.
        noise_wavs: dict from noise id to its wav entry.
        noise2dur: dict from noise id to its duration.

    Returns:
        The new wav.scp entry: a "wav-reverberate" command ending in "|".
    """
    additive_signals = []
    signal_snrs = []
    offsets = []

    # Background noises: all of them start at time 0.
    if len(bg_noise_utts) > 0:
        for _ in range(0, random.choice(num_opts)):
            picked = random.choice(bg_noise_utts)
            bg_command = "wav-reverberate --duration=" \
                + str(dur) + " \"" + noise_wavs[picked] + "\" - |"
            signal_snrs.append(random.choice(bg_snr_opts))
            offsets.append(0)
            additive_signals.append(bg_command)

    # Foreground noises: laid out back to back, separated by `interval`.
    if len(fg_noise_utts) > 0:
        elapsed = 0
        while elapsed < dur:
            picked = random.choice(fg_noise_utts)
            fg_wav = noise_wavs[picked]
            signal_snrs.append(random.choice(fg_snr_opts))
            offsets.append(elapsed)
            elapsed += noise2dur[picked] + interval
            additive_signals.append(fg_wav)

    options = " ".join([
        "--additive-signals='" + ",".join(additive_signals).strip() + "'",
        "--start-times='" + ",".join(str(t) for t in offsets) + "'",
        "--snrs='" + ",".join(str(s) for s in signal_snrs) + "'",
    ])
    command = "wav-reverberate --shift-output=true " + options

    # A plain file is passed as the input argument; a piped command is
    # extended so that wav-reverberate reads the pipe's output from stdin.
    if wav.strip()[-1] != "|":
        return command + " " + wav + " - |"
    return wav + " " + command + " - - |"

def get_new_id(utt, utt_modifier_type, utt_modifier):
    """Return the id of the augmented copy of ``utt``.

    With ``utt_modifier_type`` "suffix" the result is "<utt>-<modifier>",
    with "prefix" it is "<modifier>-<utt>". For any other type, or when the
    modifier is empty, ``utt`` is returned unchanged.
    E.g. get_new_id("swb0035", "prefix", "noise") returns "noise-swb0035".
    """
    if utt_modifier_type not in ("suffix", "prefix") or len(utt_modifier) == 0:
        return utt
    if utt_modifier_type == "suffix":
        return utt + "-" + utt_modifier
    return utt_modifier + "-" + utt

def copy_file_if_exists(input_file, output_file, utt_modifier_type,
                        utt_modifier, fields=[0]):
    """Copy a data-dir file, renaming its ids for the augmented copy.

    Does nothing when ``input_file`` is not an existing file. Otherwise the
    file is loaded with parse_file_to_dict (values joined with single
    spaces), every key is rewritten with get_new_id(), and the result is
    written with write_dict_to_file.

    Only the length of ``fields`` is used: when it has more than one entry,
    the first ``len(fields) - 1`` space-separated tokens of each value are
    rewritten with get_new_id() as well (e.g. the speaker id in utt2spk).
    """
    if not os.path.isfile(input_file):
        return

    source_entries = parse_file_to_dict(input_file,
        value_processor = lambda x: " ".join(x))
    num_value_ids = len(fields) - 1

    renamed_entries = {}
    for old_key in source_entries.keys():
        new_key = get_new_id(old_key, utt_modifier_type, utt_modifier)
        entry = source_entries[old_key]
        if len(fields) > 1:
            tokens = entry.split(" ")
            for pos in range(num_value_ids):
                tokens[pos] = get_new_id(tokens[pos],
                                         utt_modifier_type, utt_modifier)
            entry = " ".join(tokens)
        renamed_entries[new_key] = entry

    write_dict_to_file(renamed_entries, output_file)

def create_augmented_utt2uniq(input_dir, output_dir,
                            utt_modifier_type, utt_modifier):
    """Write <output_dir>/utt2uniq mapping each augmented id to its source id.

    The utterance ids are the keys of <input_dir>/utt2spk; each one is
    renamed with get_new_id() and mapped back to the original id.
    """
    source_utt2spk = parse_file_to_dict(input_dir + "/utt2spk",
                            value_processor = lambda x: " ".join(x))
    utt2uniq = {}
    for original_utt in source_utt2spk.keys():
        augmented_utt = get_new_id(original_utt, utt_modifier_type, utt_modifier)
        utt2uniq[augmented_utt] = original_utt
    write_dict_to_file(utt2uniq, output_dir + "/utt2uniq")

def main():
    """Create the augmented data directory.

    Reads wav.scp and reco2dur from the input directory, loads the
    background and/or foreground noise lists, writes a new wav.scp whose
    entries mix the noises in via wav-reverberate, copies the remaining
    data-dir files with renamed ids, and finally regenerates spk2utt and
    runs utils/fix_data_dir.sh on the output directory.
    """
    args = get_args()
    input_dir = args.input_dir
    output_dir = args.output_dir

    fg_snrs = [int(i) for i in args.fg_snr_str.split(":")]
    bg_snrs = [int(i) for i in args.bg_snr_str.split(":")]
    num_bg_noises = [int(i) for i in args.num_bg_noises.split(":")]
    reco2dur = parse_file_to_dict(input_dir + "/reco2dur",
        value_processor = lambda x: float(x[0]))
    # Read wav.scp inside a context manager so the handle is closed; it used
    # to stay open for the lifetime of the process.
    with open(input_dir + "/wav.scp", 'r', encoding='utf-8') as wav_scp:
        wav_scp_file = wav_scp.readlines()

    noise_wavs = {}
    noise_reco2dur = {}
    bg_noise_utts = []
    fg_noise_utts = []

    # Load background noises
    if args.bg_noise_dir:
        bg_noise_wav_filename = args.bg_noise_dir + "/wav.scp"
        bg_noise_utts, bg_noise_wavs = get_noise_list(bg_noise_wav_filename)
        bg_noise_reco2dur = parse_file_to_dict(args.bg_noise_dir + "/reco2dur",
            value_processor = lambda x: float(x[0]))
        noise_wavs.update(bg_noise_wavs)
        noise_reco2dur.update(bg_noise_reco2dur)

    # Load foreground noises
    if args.fg_noise_dir:
        fg_noise_wav_filename = args.fg_noise_dir + "/wav.scp"
        fg_noise_utts, fg_noise_wavs = get_noise_list(fg_noise_wav_filename)
        fg_noise_reco2dur = parse_file_to_dict(args.fg_noise_dir + "/reco2dur",
            value_processor = lambda x: float(x[0]))
        noise_wavs.update(fg_noise_wavs)
        noise_reco2dur.update(fg_noise_reco2dur)

    # Seed once, right before the only code that draws random numbers, so
    # that the output is reproducible for a given --random-seed.
    random.seed(args.random_seed)
    new_utt2wav = {}

    # Augment each line in the wav file
    for line in wav_scp_file:
        toks = line.rstrip().split(" ")
        utt = toks[0]
        wav = " ".join(toks[1:])
        dur = reco2dur[utt]
        new_wav = augment_wav(utt, wav, dur, fg_snrs, bg_snrs, fg_noise_utts,
            bg_noise_utts, noise_wavs, noise_reco2dur, args.fg_interval,
            num_bg_noises)

        new_utt = get_new_id(utt, args.utt_modifier_type, args.utt_modifier)

        new_utt2wav[new_utt] = new_wav

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    write_dict_to_file(new_utt2wav, output_dir + "/wav.scp")
    copy_file_if_exists(input_dir + "/reco2dur", output_dir + "/reco2dur",
                                args.utt_modifier_type, args.utt_modifier)
    copy_file_if_exists(input_dir + "/utt2dur", output_dir + "/utt2dur",
                                args.utt_modifier_type, args.utt_modifier)

    # Check whether to modify the speaker id or not while creating utt2spk file
    fields = ([0, 1] if args.modify_spk_id else [0])
    copy_file_if_exists(input_dir + "/utt2spk", output_dir + "/utt2spk",
                        args.utt_modifier_type, args.utt_modifier, fields=fields)
    copy_file_if_exists(input_dir + "/utt2lang", output_dir + "/utt2lang",
                        args.utt_modifier_type, args.utt_modifier)
    copy_file_if_exists(input_dir + "/utt2num_frames", output_dir + "/utt2num_frames",
                        args.utt_modifier_type, args.utt_modifier)
    copy_file_if_exists(input_dir + "/text", output_dir + "/text", args.utt_modifier_type,
                        args.utt_modifier)
    # For these two files the first value token is an id too, so it is renamed.
    copy_file_if_exists(input_dir + "/segments", output_dir + "/segments",
                        args.utt_modifier_type, args.utt_modifier, fields=[0, 1])
    copy_file_if_exists(input_dir + "/vad.scp", output_dir + "/vad.scp",
                        args.utt_modifier_type, args.utt_modifier)
    copy_file_if_exists(input_dir + "/reco2file_and_channel",
                        output_dir + "/reco2file_and_channel",
                        args.utt_modifier_type, args.utt_modifier, fields=[0, 1])

    # spk2gender is keyed by speaker id, so its keys are only renamed when
    # the speaker ids themselves were modified.
    if args.modify_spk_id:
        copy_file_if_exists(input_dir + "/spk2gender", output_dir + "/spk2gender",
                        args.utt_modifier_type, args.utt_modifier)
    else:
        copy_file_if_exists(input_dir + "/spk2gender", output_dir + "/spk2gender", None, "")

    # Create utt2uniq file
    if os.path.isfile(input_dir + "/utt2uniq"):
        copy_file_if_exists(input_dir + "/utt2uniq", output_dir + "/utt2uniq",
                        args.utt_modifier_type, args.utt_modifier, fields=[0])
    else:
        create_augmented_utt2uniq(input_dir, output_dir,
                        args.utt_modifier_type, args.utt_modifier)

    data_lib.RunKaldiCommand("utils/utt2spk_to_spk2utt.pl <{output_dir}/utt2spk >{output_dir}/spk2utt"
                    .format(output_dir = output_dir))

    data_lib.RunKaldiCommand("utils/fix_data_dir.sh {output_dir}".format(output_dir = output_dir))

if __name__ == "__main__":
    main()


================================================
FILE: egs/steps/data/data_dir_manipulation_lib.py
================================================
import subprocess

def RunKaldiCommand(command, wait = True):
    """ Runs commands frequently seen in Kaldi scripts. These are usually a
        sequence of commands connected by pipes, so we use shell=True.

        Args:
            command: the shell command line to run.
            wait: if True, block until the command finishes; otherwise
                return immediately.

        Returns:
            (stdout, stderr) as captured bytes when wait is True, else the
            running subprocess.Popen object.

        Raises:
            Exception: when wait is True and the command exits with a
                non-zero status; the message contains the command and its
                captured stderr.
    """
    #logger.info("Running the command\n{0}".format(command))
    p = subprocess.Popen(command, shell = True,
                         stdout = subprocess.PIPE,
                         stderr = subprocess.PIPE)

    if not wait:
        return p

    stdout, stderr = p.communicate()
    # Compare by value: "is not 0" tests object identity, which only works
    # by accident of CPython's small-int cache and triggers a SyntaxWarning.
    if p.returncode != 0:
        raise Exception("There was an error while running the command {0}\n------------\n{1}".format(command, stderr))
    return stdout, stderr


================================================
FILE: egs/steps/data/make_musan.py
================================================
#!/usr/bin/env python3
# Copyright 2015   David Snyder
#           2019   Phani Sankar Nidadavolu
# Apache 2.0.
#
# This file is meant to be invoked by make_musan.sh.

import os, sys, argparse
sys.path.append("steps/data/")
sys.path.insert(0, 'steps/')
import libs.common as common_lib

def get_args():
    """Parse the command line of make_musan.py.

    Echoes the invoking command line to stdout, parses sys.argv and returns
    the namespace after it has been validated by check_args().
    """
    arg_parser = argparse.ArgumentParser(
        description="Create MUSAN corpus",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    arg_parser.add_argument(
        "--use-vocals", type=str, dest='use_vocals', default=True,
        action=common_lib.StrToBoolAction, choices=["true", "false"],
        help='use vocals from the music corpus')
    arg_parser.add_argument(
        '--sampling-rate', type=int, default=16000,
        help="Sampling rate of the source data. If a positive integer is specified with this option, "
             "the MUSAN corpus will be resampled to the rate of the source data."
             "Original MUSAN corpus is sampled at 16KHz. Defaults to 16000 Hz")
    arg_parser.add_argument("in_dir", help="Input data directory")
    arg_parser.add_argument("out_dir", help="Output data directory")

    # Echo the full command line so that it ends up in the log.
    print(' '.join(sys.argv))
    return check_args(arg_parser.parse_args())

def check_args(args):
    """Check the input directory and make sure the output directory exists.

    Raises:
        Exception: if ``args.in_dir`` does not exist.

    Returns:
        ``args`` unchanged; ``args.out_dir`` is created (with a progress
        message on stdout) when it was missing.
    """
    if not os.path.exists(args.in_dir):
        raise Exception('input dir {0} does not exist'.format(args.in_dir))

    if os.path.exists(args.out_dir):
        return args

    print("Preparing {0}/musan...".format(args.out_dir))
    os.makedirs(args.out_dir)
    return args

def process_music_annotations(path):
    """Parse one MUSAN music ANNOTATIONS file.

    Each non-blank line must have at least four whitespace-separated fields:
    utterance id, genres, vocals flag and musician. Only the id and the
    vocals flag are used.

    Args:
        path: path of the ANNOTATIONS file.

    Returns:
        A tuple ``(utt2spk, utt2vocals)``: utt2spk maps every utterance id to
        itself, utt2vocals maps it to True when the vocals flag is "Y".

    Raises:
        ValueError: if a non-blank line has fewer than four fields.
    """
    utt2spk = {}
    utt2vocals = {}
    # The context manager closes the file; it used to be left open.
    with open(path, 'r') as annotations:
        for line in annotations:
            fields = line.split()
            # A blank (e.g. trailing) line carries no annotation; skip it
            # instead of failing in the unpacking below.
            if not fields:
                continue
            utt, _genres, vocals, _musician = fields[:4]
            # For this application, the musican ID isn't important
            utt2spk[utt] = utt
            utt2vocals[utt] = vocals == "Y"
    return utt2spk, utt2vocals

def prepare_music(root_dir, use_vocals, sampling_rate):
    """Collect the utt2spk and wav.scp contents for the music part of MUSAN.

    Walks <root_dir>/music: every "*.wav" file becomes a candidate
    utterance (named after the file without ".wav") and every file called
    "ANNOTATIONS" is parsed with process_music_annotations(). Only annotated
    utterances are emitted; those flagged as containing vocals are left out
    unless ``use_vocals`` is true. When ``sampling_rate`` is not 16000 the
    wav.scp entry is a sox command that resamples the file.

    Returns:
        A tuple ``(utt2spk_str, utt2wav_str)`` with the file contents, one
        "\\n"-terminated line per utterance.
    """
    wav_paths = {}
    speakers = {}
    has_vocals = {}
    for dirpath, _subdirs, filenames in os.walk(os.path.join(root_dir, "music")):
        for filename in filenames:
            full_path = os.path.join(dirpath, filename)
            if filename.endswith(".wav"):
                wav_paths[str(filename).replace(".wav", "")] = full_path
            elif str(filename) == "ANNOTATIONS":
                spk_part, vocals_part = process_music_annotations(full_path)
                speakers.update(spk_part)
                has_vocals.update(vocals_part)

    spk_lines = []
    wav_lines = []
    found_count = 0
    missing_count = 0
    for utt in has_vocals:
        if utt not in wav_paths:
            print("Missing file {}".format(utt))
            missing_count += 1
            continue
        # Counted as processed even when it is filtered out for its vocals.
        found_count += 1
        if not use_vocals and has_vocals[utt]:
            continue
        spk_lines.append(utt + " " + speakers[utt] + "\n")
        if sampling_rate == 16000:
            wav_lines.append(utt + " " + wav_paths[utt] + "\n")
        else:
            wav_lines.append(utt + " sox -t wav " + wav_paths[utt]
                             + " -r {fs} -t wav - |\n".format(fs=sampling_rate))
    print("In music directory, processed {} files; {} had missing wav data".format(
                                                    found_count, missing_count))
    return "".join(spk_lines), "".join(wav_lines)


def prepare_speech(root_dir, sampling_rate):
    """Collect the utt2spk and wav.scp contents for the speech part of MUSAN.

    Walks <root_dir>/speech; every "*.wav" file becomes one utterance named
    after the file without ".wav", with itself as speaker. When
    ``sampling_rate`` is not 16000 the wav.scp entry is a sox command that
    resamples the file.

    Returns:
        A tuple ``(utt2spk_str, utt2wav_str)`` with the file contents, one
        "\\n"-terminated line per utterance.
    """
    wav_paths = {}
    speakers = {}
    for dirpath, _subdirs, filenames in os.walk(os.path.join(root_dir, "speech")):
        for filename in filenames:
            if not filename.endswith(".wav"):
                continue
            utt = str(filename).replace(".wav", "")
            wav_paths[utt] = os.path.join(dirpath, filename)
            speakers[utt] = utt

    spk_lines = []
    wav_lines = []
    found_count = 0
    missing_count = 0
    for utt in speakers:
        if utt not in wav_paths:
            print("Missing file {}".format(utt))
            missing_count += 1
            continue
        spk_lines.append(utt + " " + speakers[utt] + "\n")
        if sampling_rate == 16000:
            wav_lines.append(utt + " " + wav_paths[utt] + "\n")
        else:
            wav_lines.append(utt + " sox -t wav " + wav_paths[utt]
                             + " -r {fs} -t wav - |\n".format(fs=sampling_rate))
        found_count += 1
    print("In speech directory, processed {} files; {} had missing wav data".format(
                                                    found_count, missing_count))
    return "".join(spk_lines), "".join(wav_lines)


def prepare_noise(root_dir, sampling_rate):
    """Collect the utt2spk and wav.scp contents for the noise part of MUSAN.

    Walks <root_dir>/noise; every "*.wav" file becomes one utterance named
    after the file without ".wav", with itself as speaker. When
    ``sampling_rate`` is not 16000 the wav.scp entry is a sox command that
    resamples the file.

    Returns:
        A tuple ``(utt2spk_str, utt2wav_str)`` with the file contents, one
        "\\n"-terminated line per utterance.
    """
    wav_paths = {}
    speakers = {}
    for dirpath, _subdirs, filenames in os.walk(os.path.join(root_dir, "noise")):
        for filename in filenames:
            if not filename.endswith(".wav"):
                continue
            utt = str(filename).replace(".wav", "")
            wav_paths[utt] = os.path.join(dirpath, filename)
            speakers[utt] = utt

    spk_lines = []
    wav_lines = []
    found_count = 0
    missing_count = 0
    for utt in speakers:
        if utt not in wav_paths:
            print("Missing file {}".format(utt))
            missing_count += 1
            continue
        spk_lines.append(utt + " " + speakers[utt] + "\n")
        if sampling_rate == 16000:
            wav_lines.append(utt + " " + wav_paths[utt] + "\n")
        else:
            wav_lines.append(utt + " sox -t wav " + wav_paths[utt]
                             + " -r {fs} -t wav - |\n".format(fs=sampling_rate))
        found_count += 1
    print("In noise directory, processed {} files; {} had missing wav data".format(
                                    found_count, missing_count))
    return "".join(spk_lines), "".join(wav_lines)


def main():
    """Write wav.scp and utt2spk for the whole MUSAN corpus.

    Gathers the speech, music and noise parts found under the input
    directory and writes their concatenated entries (in that order) to
    <out_dir>/wav.scp and <out_dir>/utt2spk.
    """
    args = get_args()
    in_dir = args.in_dir
    out_dir = args.out_dir
    use_vocals = args.use_vocals
    sampling_rate = args.sampling_rate

    utt2spk_music, utt2wav_music = prepare_music(in_dir, use_vocals, sampling_rate)
    utt2spk_speech, utt2wav_speech = prepare_speech(in_dir, sampling_rate)
    utt2spk_noise, utt2wav_noise = prepare_noise(in_dir, sampling_rate)

    utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise
    utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise
    # Context managers flush and close the files deterministically; the
    # handles used to be left open until interpreter exit.
    with open(os.path.join(out_dir, "wav.scp"), 'w') as wav_fi:
        wav_fi.write(utt2wav)
    with open(os.path.join(out_dir, "utt2spk"), 'w') as utt2spk_fi:
        utt2spk_fi.write(utt2spk)


if __name__=="__main__":
    main()


================================================
FILE: egs/steps/data/make_musan.sh
================================================
#!/usr/bin/env bash
# Copyright 2015   David Snyder
#           2019   Phani Sankar Nidadavolu
# Apache 2.0.
#
# This script creates the MUSAN data directory.
# Consists of babble, music and noise files.
# Used to create augmented data
# The required dataset is freely available at http://www.openslr.org/17/
#
# Outputs: <output_dir>/musan plus the subsets <output_dir>/musan_music,
# <output_dir>/musan_speech and <output_dir>/musan_noise.

# The corpus can be cited as follows:
# @misc{musan2015,
#  author = {David Snyder and Guoguo Chen and Daniel Povey},
#  title = {{MUSAN}: {A} {M}usic, {S}peech, and {N}oise {C}orpus},
#  year = {2015},
#  eprint = {1510.08484},
#  note = {arXiv:1510.08484v1}
# }

# Abort as soon as any command fails.
set -e
# Defaults for the options below; parse_options.sh lets --use-vocals,
# --sampling-rate and --stage on the command line override them.
use_vocals=true
sampling_rate=16000
stage=0

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

if [ $# -ne 2 ]; then
    echo USAGE: $0 input_dir output_dir
    echo input_dir is the path where the MUSAN corpus is located
    echo e.g: $0 /export/corpora/JHU/musan data
    echo "main options (for others, see top of script file)"
    echo "  --sampling-rate <sampling frequency>        # Sampling frequency of source dir"
    echo "  --use-vocals <true/false>        # Use vocals from music portion of MUSAN corpus"
    exit 1;
fi

in_dir=$1
data_dir=$2

# Scratch directory for the per-type utterance lists; removed further down.
mkdir -p local/musan.tmp

# The below script will create the musan corpus
steps/data/make_musan.py --use-vocals ${use_vocals} \
                        --sampling-rate ${sampling_rate} \
                        ${in_dir} ${data_dir}/musan || exit 1;

utils/fix_data_dir.sh ${data_dir}/musan

# Split the utterances by type: a utt2spk line belongs to a type when it
# contains the substring "music", "speech" or "noise".
grep "music" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_music
grep "speech" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_speech
grep "noise" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_noise

# Create one data directory per type from the lists above.
utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_music \
        ${data_dir}/musan ${data_dir}/musan_music
utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_speech \
        ${data_dir}/musan ${data_dir}/musan_speech
utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_noise \
        ${data_dir}/musan ${data_dir}/musan_noise

utils/fix_data_dir.sh ${data_dir}/musan_music
utils/fix_data_dir.sh ${data_dir}/musan_speech
utils/fix_data_dir.sh ${data_dir}/musan_noise

rm -rf local/musan.tmp

# Run get_reco2dur.sh on each subset; steps/data/augment_data_dir.py reads a
# reco2dur file from every noise directory it is given.
for name in speech noise music; do
    utils/data/get_reco2dur.sh ${data_dir}/musan_${name}
done


================================================
FILE: egs/steps/data/reverberate_data_dir.py
================================================
#!/usr/bin/env python3
# Copyright 2016  Tom Ko
#           2018  David Snyder
#           2019  Phani Sankar Nidadavolu
# Apache 2.0
# script to generate reverberated data

import argparse, shlex, glob, math, os, random, sys, warnings, copy, imp, ast

data_lib = imp.load_source('dml', 'steps/data/data_dir_manipulation_lib.py')

def get_args():
    """Parse the command line of reverberate_data_dir.py.

    Builds the argparse parser, echoes the invoking command line to stdout
    (for logging), parses sys.argv and returns the namespace after it has
    been validated by check_args().
    """
    # we add required arguments as named arguments for readability
    parser = argparse.ArgumentParser(description="Reverberate the data directory with an option "
                                                 "to add isotropic and point source noises. "
                                                 "Usage: reverberate_data_dir.py [options...] <in-data-dir> <out-data-dir> "
                                                 "E.g. reverberate_data_dir.py --rir-set-parameters rir_list "
                                                 "--foreground-snrs 20:10:15:5:0 --background-snrs 20:10:15:5:0 "
                                                 "--noise-list-file noise_list --speech-rvb-probability 1 --num-replications 2 "
                                                 "--random-seed 1 data/train data/train_rvb",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # The two *-set-parameters options use action='append', so they may be
    # given several times and are collected into lists.
    parser.add_argument("--rir-set-parameters", type=str, action='append', required = True, dest = "rir_set_para_array",
                        help="Specifies the parameters of an RIR set. "
                        "Supports the specification of  mixture_weight and rir_list_file_name. The mixture weight is optional. "
                        "The default mixture weight is the probability mass remaining after adding the mixture weights "
                        "of all the RIR lists, uniformly divided among the RIR lists without mixture weights. "
                        "E.g. --rir-set-parameters '0.3, rir_list' or 'rir_list' "
                        "the format of the RIR list file is "
                        "--rir-id <string,required> --room-id <string,required> "
                        "--receiver-position-id <string,optional> --source-position-id <string,optional> "
                        "--rt-60 <float,optional> --drr <float, optional> location <rspecifier> "
                        "E.g. --rir-id 00001 --room-id 001 --receiver-position-id 001 --source-position-id 00001 "
                        "--rt60 0.58 --drr -4.885 data/impulses/Room001-00001.wav")
    parser.add_argument("--noise-set-parameters", type=str, action='append', default = None, dest = "noise_set_para_array",
                        help="Specifies the parameters of an noise set. "
                        "Supports the specification of mixture_weight and noise_list_file_name. The mixture weight is optional. "
                        "The default mixture weight is the probability mass remaining after adding the mixture weights "
                        "of all the noise lists, uniformly divided among the noise lists without mixture weights. "
                        "E.g. --noise-set-parameters '0.3, noise_list' or 'noise_list' "
                        "the format of the noise list file is "
                        "--noise-id <string,required> --noise-type <choices = {isotropic, point source},required> "
                        "--bg-fg-type <choices = {background, foreground}, default=background> "
                        "--room-linkage <str, specifies the room associated with the noise file. Required if isotropic> "
                        "location <rspecifier> "
                        "E.g. --noise-id 001 --noise-type isotropic --rir-id 00019 iso_noise.wav")
    parser.add_argument("--num-replications", type=int, dest = "num_replicas", default = 1,
                        help="Number of replicate to generated for the data")
    parser.add_argument('--foreground-snrs', type=str, dest = "foreground_snr_string", default = '20:10:0', help='When foreground noises are being added the script will iterate through these SNRs.')
    parser.add_argument('--background-snrs', type=str, dest = "background_snr_string", default = '20:10:0', help='When background noises are being added the script will iterate through these SNRs.')
    # When --prefix is left unset, check_args() falls back to "rvb" if more
    # than one copy of the data will be generated.
    parser.add_argument('--prefix', type=str, default = None, help='This prefix will modified for each reverberated copy, by adding additional affixes.')
    # The probabilities and smoothing weights below are range-checked
    # (0 <= p <= 1) in check_args().
    parser.add_argument("--speech-rvb-probability", type=float, default = 1.0,
                        help="Probability of reverberating a speech signal, e.g. 0 <= p <= 1")
    parser.add_argument("--pointsource-noise-addition-probability", type=float, default = 1.0,
                        help="Probability of adding point-source noises, e.g. 0 <= p <= 1")
    parser.add_argument("--isotropic-noise-addition-probability", type=float, default = 1.0,
                        help="Probability of adding isotropic noises, e.g. 0 <= p <= 1")
    parser.add_argument("--rir-smoothing-weight", type=float, default = 0.3,
                        help="Smoothing weight for the RIR probabilties, e.g. 0 <= p <= 1. If p = 0, no smoothing will be done. "
                        "The RIR distribution will be mixed with a uniform distribution according to the smoothing weight")
    parser.add_argument("--noise-smoothing-weight", type=float, default = 0.3,
                        help="Smoothing weight for the noise probabilties, e.g. 0 <= p <= 1. If p = 0, no smoothing will be done. "
                        "The noise distribution will be mixed with a uniform distribution according to the smoothing weight")
    parser.add_argument("--max-noises-per-minute", type=int, default = 2,
                        help="This controls the maximum number of point-source noises that could be added to a recording according to its duration")
    parser.add_argument('--random-seed', type=int, default=0, help='seed to be used in the randomization of impulses and noises')
    # --shift-output and --include-original-data stay the strings
    # 'true'/'false'; they are not converted to booleans here.
    parser.add_argument("--shift-output", type=str, help="If true, the reverberated waveform will be shifted by the amount of the peak position of the RIR",
                         choices=['true', 'false'], default = "true")
    parser.add_argument('--source-sampling-rate', type=int, default=None,
                        help="Sampling rate of the source data. If a positive integer is specified with this option, "
                        "the RIRs/noises will be resampled to the rate of the source data.")
    parser.add_argument("--include-original-data", type=str, help="If true, the output data includes one copy of the original data",
                         choices=['true', 'false'], default = "false")
    parser.add_argument("input_dir",
                        help="Input data directory")
    parser.add_argument("output_dir",
                        help="Output data directory")

    # Echo the full command line so that it ends up in the log.
    print(' '.join(sys.argv))

    args = parser.parse_args()
    args = check_args(args)

    return args

def check_args(args):
    """Validate the parsed options and return the (possibly updated) namespace.

    When no prefix was given but more than one copy of the data will be
    written (several replicas, or the original data is included), the prefix
    is set to "rvb" and a warning is issued.

    Raises:
        Exception: if the number of replicas is not positive, if any
            probability / smoothing weight lies outside [0, 1], if the
            maximum number of noises per minute is negative, or if a
            source sampling rate is given and is not positive.
    """
    multiple_copies = lambda: args.num_replicas > 1 or args.include_original_data == "true"
    if args.prefix is None and multiple_copies():
        args.prefix = "rvb"
        warnings.warn("--prefix is set to 'rvb' as more than one copy of data is generated")

    if not args.num_replicas > 0:
        raise Exception("--num-replications cannot be non-positive")

    # (attribute on args, error message) for every option restricted to [0, 1];
    # the order matches the order in which the checks are reported.
    unit_interval_options = (
        ("speech_rvb_probability",
         "--speech-rvb-probability must be between 0 and 1"),
        ("pointsource_noise_addition_probability",
         "--pointsource-noise-addition-probability must be between 0 and 1"),
        ("isotropic_noise_addition_probability",
         "--isotropic-noise-addition-probability must be between 0 and 1"),
        ("rir_smoothing_weight",
         "--rir-smoothing-weight must be between 0 and 1"),
        ("noise_smoothing_weight",
         "--noise-smoothing-weight must be between 0 and 1"),
    )
    for attribute, message in unit_interval_options:
        value = getattr(args, attribute)
        if value < 0 or value > 1:
            raise Exception(message)

    if args.max_noises_per_minute < 0:
        raise Exception("--max-noises-per-minute cannot be negative")

    rate = args.source_sampling_rate
    if rate is not None and rate <= 0:
        raise Exception("--source-sampling-rate cannot be non-positive")

    return args


class list_cyclic_iterator(object):
    """Endless iterator that replays a list in a randomly shuffled order.

    The list passed in is shuffled once, in place, at construction time
    (using the module-level ``random`` state) and is then cycled through
    forever: after the last element, iteration wraps around to the first.

    Calling ``next()`` on an iterator built from an empty list raises
    ``IndexError``.
    """

    def __init__(self, list):
        # The parameter name shadows the builtin, but it is kept so that
        # existing keyword callers keep working.
        self.list_index = 0
        self.list = list
        random.shuffle(self.list)

    def __iter__(self):
        # Completes the iterator protocol so the object can be used with
        # iter(), for-loops and itertools helpers, not only with next().
        return self

    def __next__(self):
        item = self.list[self.list_index]
        # Wrap around to the start once the end of the list is reached.
        self.list_index = (self.list_index + 1) % len(self.list)
        return item

    next = __next__  # for Python 2

def pick_item_with_probability(x):
    """Pick one item from a collection according to the items' probabilities.

    Args:
        x: a list, or a dictionary, whose items/values each carry a numeric
           ``probability`` attribute. The probabilities do not have to sum
           to one; they are used as relative weights.

    Returns:
        The selected item (for a dictionary, the selected value).

    Raises:
        ValueError: if the collection is empty.
    """
    if isinstance(x, dict):
        # Sort first so that the shuffled order depends only on the random
        # state and the key set, not on the dictionary's insertion order.
        keylist = sorted(x.keys())
        random.shuffle(keylist)
        plist = [x[k] for k in keylist]
    else:
        plist = x

    if len(plist) == 0:
        raise ValueError("Cannot pick an item from an empty collection")

    total_p = sum(item.probability for item in plist)
    p = random.uniform(0, total_p)
    accumulate_p = 0
    for item in plist:
        if accumulate_p + item.probability >= p:
            return item
        accumulate_p += item.probability

    # Only reachable if floating-point rounding left the running total just
    # below p; the draw then belongs to the last item.
    return item


def parse_file_to_dict(file, assert2fields = False, value_processor = None):
    """Parse a whitespace-separated text file into a dictionary.

    Useful for files such as wav.scp, utt2spk, text, etc. The first field of
    every line is the key; the remaining fields are passed, as a list of
    strings, to ``value_processor`` to build the value.

    Args:
        file: path of the file to read (decoded as UTF-8).
        assert2fields: if true, assert that every non-blank line has exactly
            two fields.
        value_processor: callable mapping the list of fields after the key to
            the stored value. Defaults to taking the first of those fields.

    Returns:
        A dict mapping each key to its processed value. If a key occurs more
        than once, the last occurrence wins.
    """
    if value_processor is None:
        value_processor = lambda x: x[0]
    parsed = {}
    # The context manager closes the file even if a line fails to parse.
    with open(file, 'r', encoding='utf-8') as handle:
        for line in handle:
            parts = line.split()
            if not parts:
                # Blank line: there is no key to index by, so skip it.
                continue
            if assert2fields:
                assert(len(parts) == 2)

            parsed[parts[0]] = value_processor(parts[1:])
    return parsed

def write_dict_to_file(dict, file_name):
    """Write a dictionary to a text file, one "key value" line per entry.

    Lines are written in sorted key order, encoded as UTF-8. A list or tuple
    value is sorted and its elements are written separated by single spaces;
    any other value is written using its string form.

    Args:
        dict: the mapping to write (the name shadows the builtin but is kept
            for backward compatibility with keyword callers).
        file_name: path of the file to create or overwrite.
    """
    # The context manager closes the file even if formatting a value fails.
    with open(file_name, 'w', encoding='utf-8') as out:
        for key in sorted(dict.keys()):
            value = dict[key]
            if isinstance(value, (list, tuple)):
                # Join the string form of each element; joining str(value)
                # would instead space out the characters of the list's repr.
                value = ' '.join(str(element) for element in sorted(value))
            out.write('{0} {1}\n'.format(key, value))


def create_corrupted_utt2uniq(input_dir, output_dir, num_replicas, include_original, prefix):
    """Write <output_dir>/utt2uniq mapping every generated utterance id to its source id.

    The utterance ids are read from <input_dir>/utt2spk. For each copy index
    (0..num_replicas when the original data is included, 1..num_replicas
    otherwise) and each utterance id, the id produced by get_new_id() is
    mapped back to the original utterance id.
    """
    # Only the keys (utterance ids) of utt2spk are used here.
    utt2spk = parse_file_to_dict(input_dir + "/utt2spk", value_processor = lambda x: " ".join(x))
    utt_ids = sorted(utt2spk.keys())
    first_copy = 0 if include_original else 1

    corrupted_utt2uniq = {
        get_new_id(utt_id, prefix, copy_index): utt_id
        for copy_index in range(first_copy, num_replicas + 1)
        for utt_id in utt_ids
    }

    write_dict_to_file(corrupted_utt2uniq, output_dir + "/utt2uniq")


def add_point_source_noise(noise_addition_descriptor,  # descriptor to store the information of the noise added
                        room,  # the room selected
                        pointsource_noise_list, # the point source noise list
                        pointsource_noise_addition_probability, # Probability of adding point-source noises
                        foreground_snrs, # the SNR for adding the foreground noises
                        background_snrs, # the SNR for adding the background noises
                        speech_dur,  # duration of the recording
                        max_noises_recording  # Maximum number of point-source noises that can be added
                        ):
    """Randomly append point-source noises to the noise descriptor.

    Nothing is added when the noise list is empty, when the random draw is
    not below the addition probability, or when fewer than one noise is
    allowed for the recording. Otherwise between 1 and max_noises_recording
    noises are drawn; each one is reverberated with a RIR drawn from the
    selected room and recorded in the descriptor's 'noise_io', 'start_times'
    and 'snrs' lists.

    Returns the same descriptor object that was passed in.
    """
    if len(pointsource_noise_list) == 0:
        return noise_addition_descriptor
    if not random.random() < pointsource_noise_addition_probability:
        return noise_addition_descriptor
    if not max_noises_recording >= 1:
        return noise_addition_descriptor

    num_noises = random.randint(1, max_noises_recording)
    start_times = noise_addition_descriptor['start_times']
    snrs = noise_addition_descriptor['snrs']
    noise_io = noise_addition_descriptor['noise_io']

    for _ in range(num_noises):
        noise = pick_item_with_probability(pointsource_noise_list)
        # RIR used to reverberate this point-source noise
        noise_rir = pick_item_with_probability(room.rir_list)

        if noise.bg_fg_type == "background":
            # Background noise: extended to the speech duration, starts at 0.
            noise_rvb_command = """wav-reverberate --impulse-response="{0}" --duration={1}""".format(noise_rir.rir_rspecifier, speech_dur)
            start_times.append(0)
            snrs.append(next(background_snrs))
        else:
            # Foreground noise: kept at its own length, placed at a random
            # time within the speech.
            noise_rvb_command = """wav-reverberate --impulse-response="{0}" """.format(noise_rir.rir_rspecifier)
            start_times.append(round(random.random() * speech_dur, 2))
            snrs.append(next(foreground_snrs))

        # A single-token rspecifier is treated as a file name; anything else
        # is treated as a pipe whose output feeds wav-reverberate.
        is_plain_file = len(noise.noise_rspecifier.split()) == 1
        if is_plain_file:
            io_template = "{1} {0} - |"
        else:
            io_template = "{0} {1} - - |"
        noise_io.append(io_template.format(noise.noise_rspecifier, noise_rvb_command))

    return noise_addition_descriptor


def generate_reverberation_opts(room_dict,  # the room dictionary, please refer to make_room_dict() for the format
                              pointsource_noise_list, # the point source noise list
                              iso_noise_dict, # the isotropic noise dictionary
                              foreground_snrs, # the SNR for adding the foreground noises
                              background_snrs, # the SNR for adding the background noises
                              speech_rvb_probability, # Probability of reverberating a speech signal
                              isotropic_noise_addition_probability, # Probability of adding isotropic noises
                              pointsource_noise_addition_probability, # Probability of adding point-source noises
                              speech_dur,  # duration of the recording
                              max_noises_recording  # Maximum number of point-source noises that can be added
                              ):
    """Build the option string passed to the wav-reverberate binary.

    A room and one of its RIRs are always drawn. The RIR is applied to the
    speech only if a random draw falls below speech_rvb_probability. An
    isotropic noise linked to the RIR's room and point-source noises may
    then be added, each subject to its own probability.

    Returns the option string; it is empty when neither reverberation nor
    any noise was selected.
    """
    option_parts = []
    descriptor = {'noise_io': [],
                  'start_times': [],
                  'snrs': []}

    # The room probability is the sum of the probabilities of its RIRs.
    room = pick_item_with_probability(room_dict)
    speech_rir = pick_item_with_probability(room.rir_list)
    if random.random() < speech_rvb_probability:
        option_parts.append("""--impulse-response="{0}" """.format(speech_rir.rir_rspecifier))

    # Isotropic noises are looked up by the room id of the selected RIR.
    room_id = speech_rir.room_id
    rir_iso_noise_list = iso_noise_dict[room_id] if room_id in iso_noise_dict else []
    if len(rir_iso_noise_list) > 0 and random.random() < isotropic_noise_addition_probability:
        isotropic_noise = pick_item_with_probability(rir_iso_noise_list)
        rspecifier = isotropic_noise.noise_rspecifier
        # The noise is extended to the length of the speech waveform; the
        # command shape depends on whether the rspecifier is a file or a pipe.
        if len(rspecifier.split()) == 1:
            io_template = "wav-reverberate --duration={1} {0} - |"
        else:
            io_template = "{0} wav-reverberate --duration={1} - - |"
        descriptor['noise_io'].append(io_template.format(rspecifier, speech_dur))
        descriptor['start_times'].append(0)
        descriptor['snrs'].append(next(background_snrs))

    descriptor = add_point_source_noise(descriptor,
                                        room,
                                        pointsource_noise_list,
                                        pointsource_noise_addition_probability,
                                        foreground_snrs,
                                        background_snrs,
                                        speech_dur,
                                        max_noises_recording)

    num_noises = len(descriptor['noise_io'])
    assert num_noises == len(descriptor['start_times'])
    assert num_noises == len(descriptor['snrs'])
    if num_noises > 0:
        option_parts.append("--additive-signals='{0}' ".format(','.join(descriptor['noise_io'])))
        option_parts.append("--start-times='{0}' ".format(','.join([str(t) for t in descriptor['start_times']])))
        option_parts.append("--snrs='{0}' ".format(','.join([str(s) for s in descriptor['snrs']])))

    return "".join(option_parts)

def get_new_id(id, prefix=None, copy=0):
    """Derive the id of a copy of the data from the original id.

    With a prefix, the result is "<prefix><copy>-<id>", e.g.
    get_new_id("swb0035", prefix="rvb", copy=1) returns "rvb1-swb0035".
    Without a prefix (None) the id is returned unchanged.
    """
    if prefix is None:
        return id
    return prefix + str(copy) + "-" + id


def generate_reverberated_wav_scp(wav_scp,  # a dictionary whose values are the Kaldi-IO strings of the speech recordings
                               durations, # a dictionary whose values are the duration (in sec) of the speech recordings
                               output_dir, # output directory to write the corrupted wav.scp
                               room_dict,  # the room dictionary, please refer to make_room_dict() for the format
                               pointsource_noise_list, # the point source noise list
                               iso_noise_dict, # the isotropic noise dictionary
                               foreground_snr_array, # the SNR for adding the foreground noises
                               background_snr_array, # the SNR for adding the background noises
                               num_replicas, # Number of replicate to generated for the data
                               include_original, # include a copy of the original data
                               prefix, # prefix for the id of the corrupted utterances
                               speech_rvb_probability, # Probability of reverberating a speech signal
                               shift_output, # option whether to shift the output waveform
                               isotropic_noise_addition_probability, # Probability of adding isotropic noises
                               pointsource_noise_addition_probability, # Probability of adding point-source noises
                               max_noises_per_minute # maximum number of point-source noises that can be added to a recording according to its duration
                               ):
    """Write <output_dir>/wav.scp with a corruption pipeline for every recording copy.

    The generic wav-reverberate command looks like:
        wav-reverberate --duration=t --impulse-response=rir.wav
        --additive-signals='noise1.wav,noise2.wav' --snrs='snr1,snr2' --start-times='s1,s2' input.wav output.wav

    Copy index 0 is reserved for the original data (e.g. rvb0-swb0035 is the
    swb0035 recording itself) and is only produced when include_original is
    true. Both SNR arrays are shuffled in place by the cyclic iterators.
    """
    foreground_snrs = list_cyclic_iterator(foreground_snr_array)
    background_snrs = list_cyclic_iterator(background_snr_array)

    corrupted_wav_scp = {}
    recording_ids = sorted(wav_scp.keys())
    first_copy = 0 if include_original else 1

    for copy_index in range(first_copy, num_replicas + 1):
        for recording_id in recording_ids:
            source_pipe = wav_scp[recording_id]
            # A single-token entry is a plain file name; turn it into a pipe.
            if len(source_pipe.split()) == 1:
                source_pipe = "cat {0} |".format(source_pipe)

            speech_dur = durations[recording_id]
            max_noises_recording = math.floor(max_noises_per_minute * speech_dur / 60)

            # Options are drawn for every copy, including copy 0 whose
            # options are then discarded below.
            reverberate_opts = generate_reverberation_opts(
                room_dict, pointsource_noise_list, iso_noise_dict,
                foreground_snrs, background_snrs,
                speech_rvb_probability,
                isotropic_noise_addition_probability,
                pointsource_noise_addition_probability,
                speech_dur, max_noises_recording)

            leave_untouched = reverberate_opts == "" or copy_index == 0
            if leave_untouched:
                wav_corrupted_pipe = "{0}".format(source_pipe)
            else:
                wav_corrupted_pipe = "{0} wav-reverberate --shift-output={1} {2} - - |".format(source_pipe, shift_output, reverberate_opts)

            corrupted_wav_scp[get_new_id(recording_id, prefix, copy_index)] = wav_corrupted_pipe

    write_dict_to_file(corrupted_wav_scp, output_dir + "/wav.scp")


def add_prefix_to_fields(input_file, output_file, num_replicas, include_original, prefix, field = [0]):
    """Replicate the entries of files like segments, utt2spk, text.

    Every line of input_file is written once per copy index (0..num_replicas
    when include_original is true, 1..num_replicas otherwise). In each
    written line the whitespace-separated fields whose indices are listed in
    ``field`` are replaced by get_new_id(<field>, prefix, <copy index>), and
    the fields are re-joined with single spaces. Empty lines and lines
    starting with ';' are copied through unchanged (after stripping).

    Args:
        input_file: path of the file to replicate (read as UTF-8).
        output_file: path of the file to write (UTF-8).
        num_replicas: highest copy index to generate.
        include_original: whether copy index 0 is generated as well.
        prefix: prefix handed to get_new_id().
        field: indices of the fields to rewrite. The default list is only
            read, never modified.
    """
    # Read everything up front and release the input handle right away.
    with open(input_file, encoding='utf-8') as source:
        lines = [x.strip() for x in source]

    start_index = 0 if include_original else 1

    # The context manager closes the output even if a line has fewer fields
    # than ``field`` requires.
    with open(output_file, "w", encoding='utf-8') as f:
        for i in range(start_index, num_replicas+1):
            for line in lines:
                if len(line) > 0 and line[0] != ';':
                    split1 = line.split()
                    for j in field:
                        split1[j] = get_new_id(split1[j], prefix, i)
                    print(" ".join(split1), file=f)
                else:
                    print(line, file=f)


def create_reverberated_copy(input_dir,
                           output_dir,
                           room_dict,  # the room dictionary, please refer to make_room_dict() for the format
                           pointsource_noise_list, # the point source noise list
                           iso_noise_dict, # the isotropic noise dictionary
                           foreground_snr_string, # the SNR for adding the foreground noises
                           background_snr_string, # the SNR for adding the background noises
                           num_replicas, # Number of replicate to generated for the data
                           include_original, # include a copy of the original data
                           prefix, # prefix for the id of the corrupted utterances
                           speech_rvb_probability, # Probability of reverberating a speech signal
                           shift_output, # option whether to shift the output waveform
                           isotropic_noise_addition_probability, # Probability of adding isotropic noises
                           pointsource_noise_addition_probability, # Probability of adding point-source noises
                           max_noises_per_minute  # maximum number of point-source noises that can be added to a recording according to its duration
                           ):
    """Create the corrupted data directory from the input data directory.

    Writes wav.scp with the corruption pipelines, replicates utt2spk (and
    derives spk2utt from it), replicates or creates utt2uniq, replicates
    text, segments, reco2file_and_channel and vad.scp when they exist in
    the input directory, and finally runs the data-directory validation
    script on the output directory. The two SNR strings are colon-separated
    lists of numbers.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    wav_scp = parse_file_to_dict(input_dir + "/wav.scp", value_processor = lambda x: " ".join(x))

    reco2dur_path = input_dir + "/reco2dur"
    if not os.path.isfile(reco2dur_path):
        # Durations are needed below; generate the file if it is missing.
        print("Getting the duration of the recordings...")
        data_lib.RunKaldiCommand("utils/data/get_reco2dur.sh {}".format(input_dir))
    durations = parse_file_to_dict(reco2dur_path, value_processor = lambda x: float(x[0]))

    foreground_snr_array = [float(snr) for snr in foreground_snr_string.split(':')]
    background_snr_array = [float(snr) for snr in background_snr_string.split(':')]

    generate_reverberated_wav_scp(wav_scp, durations, output_dir, room_dict, pointsource_noise_list, iso_noise_dict,
                                  foreground_snr_array, background_snr_array, num_replicas, include_original, prefix,
                                  speech_rvb_probability, shift_output, isotropic_noise_addition_probability,
                                  pointsource_noise_addition_probability, max_noises_per_minute)

    def replicate(name, fields):
        # Copy <input_dir>/<name> to <output_dir>/<name>, rewriting the ids
        # found in the given field positions.
        add_prefix_to_fields(input_dir + "/" + name, output_dir + "/" + name,
                             num_replicas, include_original, prefix, field = fields)

    replicate("utt2spk", [0,1])
    data_lib.RunKaldiCommand("utils/utt2spk_to_spk2utt.pl <{output_dir}/utt2spk >{output_dir}/spk2utt"
                    .format(output_dir = output_dir))

    if os.path.isfile(input_dir + "/utt2uniq"):
        replicate("utt2uniq", [0])
    else:
        # No utt2uniq in the input: build one from the utterance ids.
        create_corrupted_utt2uniq(input_dir, output_dir, num_replicas, include_original, prefix)

    # Optional files, in the order they are processed, with the positions of
    # the id fields that need the prefix.
    optional_files = (
        ("text", [0]),
        ("segments", [0,1]),
        ("reco2file_and_channel", [0,1]),
        ("vad.scp", [0]),
    )
    for name, fields in optional_files:
        if os.path.isfile(input_dir + "/" + name):
            replicate(name, fields)

    data_lib.RunKaldiCommand("utils/validate_data_dir.sh --no-feats --no-text {output_dir}"
                    .format(output_dir = output_dir))


def smooth_probability_distribution(set_list, smoothing_weight=0.0, target_sum=1.0):
    """Fill in, smooth and renormalise the ``probability`` of every item.

    Items whose probability is None share the probability mass left over by
    the specified ones (or get zero, with a warning, if the specified ones
    already sum to one or more). Specified probabilities are interpolated
    with that shared value using smoothing_weight. Finally all probabilities
    are rescaled so that they sum to target_sum.

    The items are modified in place and set_list itself is returned. An
    empty collection is returned untouched.
    """
    if len(list(set_list)) == 0:
        return set_list

    # Count the unspecified items and add up the specified probabilities.
    num_unspecified = 0
    accumulated_prob = 0
    for entry in set_list:
        current = entry.probability
        if current is None:
            num_unspecified += 1
        else:
            accumulated_prob += current

    # Probability assigned to each item that did not specify one.
    uniform_probability = 0
    if num_unspecified > 0:
        if accumulated_prob < 1:
            uniform_probability = (1 - accumulated_prob) / float(num_unspecified)
        elif accumulated_prob >= 1:
            warnings.warn("The sum of probabilities specified by user is larger than or equal to 1. "
                          "The items without probabilities specified will be given zero to their probabilities.")

    keep_weight = 1 - smoothing_weight
    for entry in set_list:
        if entry.probability is None:
            entry.probability = uniform_probability
        else:
            entry.probability = keep_weight * entry.probability + smoothing_weight * uniform_probability

    # Rescale so that the probabilities add up to target_sum.
    sum_p = sum(entry.probability for entry in set_list)
    for entry in set_list:
        entry.probability = entry.probability / sum_p * target_sum

    return set_list


def parse_set_parameter_strings(set_para_array):
    """Parse set parameter strings of the form "<probability>,<file>" or "<file>".

    Each string yields an object with ``filename`` and ``probability``
    attributes; the probability stays None when the string does not split
    into exactly two comma-separated parts. The resulting list is passed
    through smooth_probability_distribution() with its default arguments.

    Raises:
        Exception: if a referenced file does not exist.
    """
    set_list = []
    for set_para in set_para_array:
        # A bare function object serves as a simple attribute container.
        entry = lambda: None
        entry.filename = None
        entry.probability = None

        fields = set_para.split(',')
        if len(fields) == 2:
            entry.probability = float(fields[0])
            entry.filename = fields[1].strip()
        else:
            entry.filename = fields[0].strip()

        if not os.path.isfile(entry.filename):
            raise Exception(entry.filename + " not found")
        set_list.append(entry)

    return smooth_probability_distribution(set_list)


def parse_rir_list(rir_set_para_array, smoothing_weight, sampling_rate = None):
    """ This function creates the RIR list
        Each rir object in the list contains the following attributes:
        rir_id, room_id, receiver_position_id, source_position_id, rt60, drr, cte,
        probability, rir_rspecifier
        Please refer to the help messages in the parser for the meaning of these attributes

        Every line of each RIR list file is split shell-style and parsed with
        the argument parser below. When sampling_rate is given, each
        rspecifier is wrapped in a sox command that resamples it to that rate.
        The probabilities within a set are smoothed with smoothing_weight and
        scaled to sum to the probability of the set.
    """
    rir_parser = argparse.ArgumentParser()
    rir_parser.add_argument('--rir-id', type=str, required=True, help='This id is unique for each RIR and the noise may associate with a particular RIR by refering to this id')
    rir_parser.add_argument('--room-id', type=str, required=True, help='This is the room that where the RIR is generated')
    rir_parser.add_argument('--receiver-position-id', type=str, default=None, help='receiver position id')
    rir_parser.add_argument('--source-position-id', type=str, default=None, help='source position id')
    rir_parser.add_argument('--rt60', type=float, default=None, help='RT60 is the time required for reflections of a direct sound to decay 60 dB.')
    rir_parser.add_argument('--drr', type=float, default=None, help='Direct-to-reverberant-ratio of the impulse response.')
    rir_parser.add_argument('--cte', type=float, default=None, help='Early-to-late index of the impulse response.')
    rir_parser.add_argument('--probability', type=float, default=None, help='probability of the impulse response.')
    rir_parser.add_argument('rir_rspecifier', type=str, help="""rir rspecifier, it can be either a filename or a piped command.
                            E.g. data/impulses/Room001-00001.wav or "sox data/impulses/Room001-00001.wav -t wav - |" """)

    set_list = parse_set_parameter_strings(rir_set_para_array)

    rir_list = []
    for rir_set in set_list:
        # Use a context manager so the list file is closed as soon as it has
        # been read, even if one of its lines fails to parse.
        with open(rir_set.filename) as rir_file:
            current_rir_list = [rir_parser.parse_args(shlex.split(x.strip())) for x in rir_file]
        for rir in current_rir_list:
            if sampling_rate is not None:
                # check if the rspecifier is a pipe or not
                if len(rir.rir_rspecifier.split()) == 1:
                    rir.rir_rspecifier = "sox {0} -r {1} -t wav - |".format(rir.rir_rspecifier, sampling_rate)
                else:
                    rir.rir_rspecifier = "{0} sox -t wav - -r {1} -t wav - |".format(rir.rir_rspecifier, sampling_rate)

        rir_list += smooth_probability_distribution(current_rir_list, smoothing_weight, rir_set.probability)

    return rir_list


def almost_equal(value_1, value_2, accuracy = 10**-8):
    """Return True when the two values differ by strictly less than ``accuracy``.

    Intended for comparing floats.
    """
    difference = value_1 - value_2
    return abs(difference) < accuracy


def make_room_dict(rir_list):
    """Group a list of RIRs into a dictionary indexed by room id.

    Each value is an object with two attributes: ``rir_list``, the RIRs that
    belong to the room, and ``probability``, the sum of the probabilities of
    those RIRs. The room probabilities are asserted to sum to 1.
    See parse_rir_list() for the attributes of a RIR object.
    """
    room_dict = {}
    for rir in rir_list:
        room = room_dict.get(rir.room_id)
        if room is None:
            # First RIR seen for this room: create its container object.
            room = lambda: None
            room.rir_list = []
            room.probability = 0
            room_dict[rir.room_id] = room
        room.rir_list.append(rir)

    # A room is as likely as all of its RIRs together.
    for room in room_dict.values():
        room.probability = sum(rir.probability for rir in room.rir_list)

    total_probability = sum(room_dict[key].probability for key in room_dict.keys())
    assert almost_equal(total_probability, 1.0)

    return room_dict

def parse_noise_list(noise_set_para_array, smoothing_weight, sampling_rate = None):
    """ This function creates the point-source noise list
         and the isotropic noise dictionary from the noise information file
         The isotropic noise dictionary is indexed by the room
         and its value is the corresponding isotropic noise list
         Each noise object in the list contains the following attributes:
         noise_id, noise_type, bg_fg_type, room_linkage, probability, noise_rspecifier
         Please refer to the help messages in the parser for the meaning of these attributes

         Every line of each noise list file is split shell-style and parsed
         with the argument parser below. When sampling_rate is given, each
         rspecifier is wrapped in a sox command that resamples it to that rate.

         Returns a tuple (pointsource_noise_list, iso_noise_dict).
         Raises an Exception if an isotropic noise has no --room-linkage.
    """
    noise_parser = argparse.ArgumentParser()
    noise_parser.add_argument('--noise-id', type=str, required=True, help='noise id')
    noise_parser.add_argument('--noise-type', type=str, required=True, help='the type of noise; i.e. isotropic or point-source', choices = ["isotropic", "point-source"])
    noise_parser.add_argument('--bg-fg-type', type=str, default="background", help='background or foreground noise, for background noises, '
                              'they will be extended before addition to cover the whole speech; for foreground noise, they will be kept '
                              'to their original duration and added at a random point of the speech.', choices = ["background", "foreground"])
    noise_parser.add_argument('--room-linkage', type=str, default=None, help='required if isotropic, should not be specified if point-source.')
    noise_parser.add_argument('--probability', type=float, default=None, help='probability of the noise.')
    noise_parser.add_argument('noise_rspecifier', type=str, help="""noise rspecifier, it can be either a filename or a piped command.
                              E.g. type5_noise_cirline_ofc_ambient1.wav or "sox type5_noise_cirline_ofc_ambient1.wav -t wav - |" """)

    set_list = parse_set_parameter_strings(noise_set_para_array)

    pointsource_noise_list = []
    iso_noise_dict = {}
    for noise_set in set_list:
        # Use a context manager so the list file is closed as soon as it has
        # been read, even if one of its lines fails to parse.
        with open(noise_set.filename) as noise_file:
            current_noise_list = [noise_parser.parse_args(shlex.split(x.strip())) for x in noise_file]
        current_pointsource_noise_list = []
        for noise in current_noise_list:
            if sampling_rate is not None:
                # check if the rspecifier is a pipe or not
                if len(noise.noise_rspecifier.split()) == 1:
                    noise.noise_rspecifier = "sox {0} -r {1} -t wav - |".format(noise.noise_rspecifier, sampling_rate)
                else:
                    noise.noise_rspecifier = "{0} sox -t wav - -r {1} -t wav - |".format(noise.noise_rspecifier, sampling_rate)

            if noise.noise_type == "isotropic":
                if noise.room_linkage is None:
                    raise Exception("--room-linkage must be specified if --noise-type is isotropic")
                # Isotropic noises are grouped by the room they are linked to.
                iso_noise_dict.setdefault(noise.room_linkage, []).append(noise)
            else:
                current_pointsource_noise_list.append(noise)

        # Within a set, the point-source probabilities are smoothed and
        # scaled to sum to the probability of the set.
        pointsource_noise_list += smooth_probability_distribution(current_pointsource_noise_list, smoothing_weight, noise_set.probability)

    # ensure the point-source noise probabilities sum to 1
    pointsource_noise_list = smooth_probability_distribution(pointsource_noise_list, smoothing_weight, 1.0)
    if len(pointsource_noise_list) > 0:
        assert almost_equal(sum(noise.probability for noise in pointsource_noise_list), 1.0)

    # ensure the isotropic noise source probabilities for a given room sum to 1
    for key in iso_noise_dict.keys():
        iso_noise_dict[key] = smooth_probability_distribution(iso_noise_dict[key])
        assert almost_equal(sum(noise.probability for noise in iso_noise_dict[key]), 1.0)

    return (pointsource_noise_list, iso_noise_dict)


def main():
    """Build the RIR/noise inventories and create the reverberated data dir.

    Reads the command line through get_args(), seeds the random module,
    parses the RIR list and (when given) the noise lists, hands everything to
    create_reverberated_copy(), and finally runs
    utils/validate_data_dir.sh --no-feats --no-text on the output directory.
    """
    args = get_args()

    # Seed the random module before any of the lists are built.
    random.seed(args.random_seed)

    rir_list = parse_rir_list(
        args.rir_set_para_array,
        args.rir_smoothing_weight,
        args.source_sampling_rate,
    )
    print("Number of RIRs is {0}".format(len(rir_list)))

    if args.noise_set_para_array is None:
        # No noise sets were supplied: pass empty containers downstream.
        pointsource_noise_list, iso_noise_dict = [], {}
    else:
        pointsource_noise_list, iso_noise_dict = parse_noise_list(
            args.noise_set_para_array,
            args.noise_smoothing_weight,
            args.source_sampling_rate,
        )
        print("Number of point-source noises is {0}".format(len(pointsource_noise_list)))
        num_iso_noises = sum(len(noises) for noises in iso_noise_dict.values())
        print("Number of isotropic noises is {0}".format(num_iso_noises))

    room_dict = make_room_dict(rir_list)

    # The option arrives as the string "true"/"false"; turn it into a bool.
    include_original = args.include_original_data == "true"

    create_reverberated_copy(
        input_dir=args.input_dir,
        output_dir=args.output_dir,
        room_dict=room_dict,
        pointsource_noise_list=pointsource_noise_list,
        iso_noise_dict=iso_noise_dict,
        foreground_snr_string=args.foreground_snr_string,
        background_snr_string=args.background_snr_string,
        num_replicas=args.num_replicas,
        include_original=include_original,
        prefix=args.prefix,
        speech_rvb_probability=args.speech_rvb_probability,
        shift_output=args.shift_output,
        isotropic_noise_addition_probability=args.isotropic_noise_addition_probability,
        pointsource_noise_addition_probability=args.pointsource_noise_addition_probability,
        max_noises_per_minute=args.max_noises_per_minute,
    )

    validate_cmd = "utils/validate_data_dir.sh --no-feats --no-text {output_dir}".format(
        output_dir=args.output_dir)
    data_lib.RunKaldiCommand(validate_cmd)

if __name__ == "__main__":
    main()


================================================
FILE: egs/steps/decode.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0

# Begin configuration section.
# Defaults for the command-line options.  The usage text below lists the
# corresponding --option names (presumably mapped onto these variables by
# parse_options.sh, which is sourced right after this section -- confirm there).
transform_dir=   # this option won't normally be used, but it can be used if you want to
                 # supply existing fMLLR transforms when decoding.
# Model iteration to test; when set (and --model is not), $srcdir/$iter.mdl is decoded.
iter=
model= # You can specify the model to use (e.g. if you want to use the .alimdl)
# Steps numbered below $stage are skipped (0 = lattice generation, 1 = lattice analysis).
stage=0
nj=4
cmd=run.pl
max_active=7000
beam=13.0
lattice_beam=6.0
acwt=0.083333 # note: only really affects pruning (scoring is on lattices).
num_threads=1 # if >1, will use gmm-latgen-faster-parallel
parallel_opts=  # ignored now.
# Extra options handed to local/score.sh.
scoring_opts=
# note: there are no more min-lmwt and max-lmwt options, instead use
# e.g. --scoring-opts "--min-lmwt 1 --max-lmwt 20"
skip_scoring=false
# Extra options placed verbatim on the gmm-latgen-faster command line.
decode_extra_opts=
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

# Exactly three positional arguments are required; anything else prints the
# usage text and exits with status 1.
if [ $# != 3 ]; then
   echo "Usage: steps/decode.sh [options] <graph-dir> <data-dir> <decode-dir>"
   echo "... where <decode-dir> is assumed to be a sub-directory of the directory"
   echo " where the model is."
   echo "e.g.: steps/decode.sh exp/mono/graph_tgpr data/test_dev93 exp/mono/decode_dev93_tgpr"
   echo ""
   echo "This script works on CMN + (delta+delta-delta | LDA+MLLT) features; it works out"
   echo "what type of features you used (assuming it's one of these two)"
   echo ""
   echo "main options (for others, see top of script file)"
   echo "  --config <config-file>                           # config containing options"
   echo "  --nj <nj>                                        # number of parallel jobs"
   echo "  --iter <iter>                                    # Iteration of model to test."
   echo "  --model <model>                                  # which model to use (e.g. to"
   echo "                                                   # specify the final.alimdl)"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   echo "  --transform-dir <trans-dir>                      # dir to find fMLLR transforms "
   echo "  --acwt <float>                                   # acoustic scale used for lattice generation "
   echo "  --scoring-opts <string>                          # options to local/score.sh"
   echo "  --num-threads <n>                                # number of threads to use, default 1."
   echo "  --parallel-opts <opts>                           # ignored now, present for historical reasons."
   exit 1;
fi


# Positional arguments.
graphdir=$1
data=$2
dir=$3
srcdir=`dirname $dir`; # The model directory is one level up from decoding directory.
sdata=$data/split$nj;

mkdir -p $dir/log
# Re-split the data unless the split directory already exists and
# $data/feats.scp is older than it.
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
# Record the number of jobs used for this decode directory.
echo $nj > $dir/num_jobs

if [ -z "$model" ]; then # if --model <mdl> was not specified on the command line...
  if [ -z $iter ]; then model=$srcdir/final.mdl;
  else model=$srcdir/$iter.mdl; fi
fi

# Warn (but continue) when a final.alimdl sits next to the model and no
# transform directory was supplied.
if [ $(basename $model) != final.alimdl ] ; then
  # Do not use the $srcpath -- look at the path where the model is
  if [ -f $(dirname $model)/final.alimdl ] && [ -z "$transform_dir" ]; then
    echo -e '\n\n'
    echo $0 'WARNING: Running speaker independent system decoding using a SAT model!'
    echo $0 'WARNING: This is OK if you know what you are doing...'
    echo -e '\n\n'
  fi
fi

# Fail early if any required input file is missing.
for f in $sdata/1/feats.scp $sdata/1/cmvn.scp $model $graphdir/HCLG.fst; do
  [ ! -f $f ] && echo "$0: Error: no such file $f" && exit 1;
done

# The presence of final.mat selects LDA features; otherwise delta features.
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "decode.sh: feature type is $feat_type";

# Optional per-model option files; a missing file yields an empty string
# because cat's error output is discarded.
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
delta_opts=`cat $srcdir/delta_opts 2>/dev/null`

# thread_string is appended directly to the decoder binary name below, so with
# more than one thread the command becomes
# "gmm-latgen-faster-parallel --num-threads=N".
thread_string=
[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads"

# Build the feature pipeline as a piped rspecifier.  JOB is a placeholder
# (presumably replaced with the job index by $cmd -- confirm in run.pl/queue.pl).
case $feat_type in
  delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";;
  lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";;
  *) echo "$0: Error: Invalid feature type $feat_type" && exit 1;
esac
if [ ! -z "$transform_dir" ]; then # add transforms to features...
  echo "Using fMLLR transforms from $transform_dir"
  # NOTE(review): a missing trans.1 only prints a message and does not exit,
  # unlike the num_jobs check right below -- confirm this is intended.
  [ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist."
  [ ! -s $transform_dir/num_jobs ] && \
    echo "$0: Error: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1;
  nj_orig=$(cat $transform_dir/num_jobs)
  if [ $nj -ne $nj_orig ]; then
    # Copy the transforms into an archive with an index.
    echo "$0: num-jobs for transforms mismatches, so copying them."
    for n in $(seq $nj_orig); do cat $transform_dir/trans.$n; done | \
       copy-feats ark:- ark,scp:$dir/trans.ark,$dir/trans.scp || exit 1;
    feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/trans.scp ark:- ark:- |"
  else
    # number of jobs matches with alignment dir.
    feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
  fi
fi

# Stage 0: lattice generation, one lat.JOB.gz per job.
if [ $stage -le 0 ]; then
  # If the graph directory records its number of pdfs, require it to equal
  # the number reported by am-info for the model.
  if [ -f "$graphdir/num_pdfs" ]; then
    [ "`cat $graphdir/num_pdfs`" -eq `am-info --print-args=false $model | grep pdfs | awk '{print $NF}'` ] || \
      { echo "$0: Error: Mismatch in number of pdfs with $model"; exit 1; }
  fi
  $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode.JOB.log \
    gmm-latgen-faster$thread_string --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \
    --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt $decode_extra_opts \
    $model $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
fi

# Stage 1: run the lattice diagnostics script; its exit status is not checked.
if [ $stage -le 1 ]; then
  [ ! -z $iter ] && iter_opt="--iter $iter"
  steps/diagnostic/analyze_lats.sh --cmd "$cmd" $iter_opt $graphdir $dir
fi

# Scoring: a missing or non-executable local/score.sh is treated as an error
# unless --skip-scoring true was given.
if ! $skip_scoring ; then
  [ ! -x local/score.sh ] && \
    echo "$0: Not scoring because local/score.sh does not exist or not executable." && exit 1;
  local/score.sh --cmd "$cmd" $scoring_opts $data $graphdir $dir ||
    { echo "$0: Error: scoring failed. (ignore by '--skip-scoring true')"; exit 1; }
fi

exit 0;


================================================
FILE: egs/steps/decode_basis_fmllr.sh
================================================
#!/usr/bin/env bash

# Copyright 2012   Carnegie Mellon University (Author: Yajie Miao)
#                  Johns Hopkins University (Author: Daniel Povey)
#           2014   David Snyder

# Decoding script that does basis fMLLR.  This can be on top of delta+delta-delta,
# or LDA+MLLT features.

# There are 3 models involved potentially in this script,
# and for a standard, speaker-independent system they will all be the same.
# The "alignment model" is for the 1st-pass decoding and to get the 
# Gaussian-level alignments for the "adaptation model" the first time we
# do fMLLR.  The "adaptation model" is used to estimate fMLLR transforms
# and to generate state-level lattices.  The lattices are then rescored
# with the "final model".
#
# The following table explains where we get these 3 models from.
# Note: $srcdir is one level up from the decoding directory.
#
#   Model              Default source:                 
#
#  "alignment model"   $srcdir/final.alimdl              --alignment-model <model>
#                     (or $srcdir/final.mdl if alimdl absent)
#  "adaptation model"  $srcdir/final.mdl                 --adapt-model <model>
#  "final model"       $srcdir/final.mdl                 --final-model <model>


# Begin configuration section
# Defaults for the command-line options listed in the usage text below
# (presumably mapped onto these variables by parse_options.sh -- confirm there).
first_beam=10.0 # Beam used in initial, speaker-indep. pass
first_max_active=2000 # max-active used in initial pass.
# Model overrides; when left empty they default to files under $srcdir
# (see the table in the header comment).
alignment_model=
adapt_model=
final_model=
# Steps numbered below $stage are skipped (0 = SI decoding ... 4 = rescoring).
stage=0
acwt=0.083333 # Acoustic weight used in getting fMLLR transforms, and also in
              # lattice generation.

# Parameters in alignment of training data
# NOTE(review): scale_opts, align_beam and retry_beam are not referenced
# anywhere else in this script.
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
align_beam=10
retry_beam=40

max_active=7000
beam=13.0
lattice_beam=6.0
nj=4
# Weight given to weight-silence-post in both transform-estimation passes.
silence_weight=0.01
cmd=run.pl
# Existing speaker-independent decode directory; when empty, the SI pass is
# run by this script into ${dir}.si.
si_dir=
num_threads=1 # if >1, will use gmm-latgen-faster-parallel
parallel_opts=  # ignored, present for historical reasons.
skip_scoring=false
scoring_opts=
# End configuration section

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

# Exactly three positional arguments are required; anything else prints the
# usage text and exits with status 1.  The example line must therefore show
# three positional arguments (<graph-dir> <data-dir> <decode-dir>).
if [ $# != 3 ]; then
   echo "Usage: steps/decode_basis_fmllr.sh [options] <graph-dir> <data-dir> <decode-dir>"
   echo " e.g.: steps/decode_basis_fmllr.sh exp/tri2b/graph_tgpr data/test_dev93 exp/tri2b/decode_dev93_tgpr"
   echo "main options (for others, see top of script file)"
   echo "  --config <config-file>                   # config containing options"
   echo "  --nj <nj>                                # number of parallel jobs"
   echo "  --cmd <cmd>                              # Command to run in parallel with"
   echo "  --adapt-model <adapt-mdl>                # Model to compute transforms with"
   echo "  --alignment-model <ali-mdl>              # Model to get Gaussian-level alignments for"
   echo "                                           # 1st pass of transform computation."
   echo "  --final-model <finald-mdl>               # Model to finally decode with"
   echo "  --si-dir <speaker-indep-decoding-dir>    # use this to skip 1st pass of decoding"
   echo "                                           # Caution-- must be with same tree"
   echo "  --acwt <acoustic-weight>                 # default 0.08333 ... used to get posteriors"
   echo "  --scoring-opts <string>                  # options to local/score.sh"
   echo "  --num-threads <n>                        # number of threads to use, default 1."
   echo "  --parallel-opts <opts>                   # ignored, present for historical reasons."
   exit 1;
fi


# Positional arguments.
graphdir=$1
data=$2
dir=`echo $3 | sed 's:/$::g'` # remove any trailing slash.

srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory.
sdata=$data/split$nj;

# thread_string is appended directly to binary names below, e.g.
# "gmm-latgen-faster-parallel --num-threads=N" when more than one thread is used.
thread_string=
[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads"

mkdir -p $dir/log
# Re-split the data unless the split directory already exists and
# $data/feats.scp is older than it.
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs
# Optional per-model option files; a missing file yields an empty string
# because cat's error output is discarded.
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
delta_opts=`cat $srcdir/delta_opts 2>/dev/null`

# Silence phone list passed to weight-silence-post; unlike the files above it
# is mandatory.
silphonelist=`cat $graphdir/phones/silence.csl` || exit 1;

# Some checks.  Note: we don't need $srcdir/tree but we expect
# it should exist, given the current structure of the scripts.
for f in $graphdir/HCLG.fst $data/feats.scp $srcdir/tree $srcdir/fmllr.basis; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

## Work out name of alignment model. ##
if [ -z "$alignment_model" ]; then
  if [ -f "$srcdir/final.alimdl" ]; then alignment_model=$srcdir/final.alimdl;
  else alignment_model=$srcdir/final.mdl; fi
fi
[ ! -f "$alignment_model" ] && echo "$0: no alignment model $alignment_model " && exit 1;
##

## Do the speaker-independent decoding, if --si-dir option not present. ##
if [ -z "$si_dir" ]; then # we need to do the speaker-independent decoding pass.
  si_dir=${dir}.si # Name it as our decoding dir, but with suffix ".si".
  if [ $stage -le 0 ]; then
    # If the graph directory records its number of pdfs, require it to equal
    # the number reported by am-info for the alignment model.
    if [ -f "$graphdir/num_pdfs" ]; then
      [ "`cat $graphdir/num_pdfs`" -eq `am-info --print-args=false $alignment_model | grep pdfs | awk '{print $NF}'` ] || \
        { echo "Mismatch in number of pdfs with $alignment_model"; exit 1; }
    fi

    # First pass: decode with the alignment model using the narrower
    # first-pass beam and max-active settings.
    steps/decode.sh --scoring-opts "$scoring_opts" \
              --num-threads $num_threads --skip-scoring $skip_scoring \
              --acwt $acwt --nj $nj --cmd "$cmd" --beam $first_beam \
              --model $alignment_model --max-active \
              $first_max_active $graphdir $data $si_dir || exit 1;
  fi
fi
##

## Some checks, and setting of defaults for variables.
# The SI lattices are read per job below, so the job counts must agree.
[ "$nj" -ne "`cat $si_dir/num_jobs`" ] && echo "Mismatch in #jobs with si-dir" && exit 1;
[ ! -f "$si_dir/lat.1.gz" ] && echo "No such file $si_dir/lat.1.gz" && exit 1;
[ -z "$adapt_model" ] && adapt_model=$srcdir/final.mdl
[ -z "$final_model" ] && final_model=$srcdir/final.mdl
for f in $adapt_model $final_model; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
##

## Set up the unadapted features "$sifeats" for testing set
# The presence of final.mat selects LDA features; otherwise delta features.
# JOB is a placeholder (presumably replaced with the job index by $cmd).
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type";
case $feat_type in
  delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";;
  lda) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";;
  *) echo "Invalid feature type $feat_type" && exit 1;
esac
##
##

## Now get the first-pass fMLLR transforms.
## We give all the default parameters in gmm-est-basis-fmllr
# Pipeline per job: SI lattices -> lattice-to-post -> weight-silence-post ->
# gmm-post-to-gpost (alignment model) -> gmm-est-basis-fmllr-gpost (adaptation
# model + $srcdir/fmllr.basis).  Outputs: $dir/pre_trans.JOB and $dir/pre_wgt.JOB.
if [ $stage -le 1 ]; then
  echo "$0: getting first-pass fMLLR transforms."
  $cmd JOB=1:$nj $dir/log/fmllr_pass1.JOB.log \
    gunzip -c $si_dir/lat.JOB.gz \| \
    lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
    weight-silence-post $silence_weight $silphonelist $alignment_model ark:- ark:- \| \
    gmm-post-to-gpost $alignment_model "$sifeats" ark:- ark:- \| \
    gmm-est-basis-fmllr-gpost --spk2utt=ark:$sdata/JOB/spk2utt \
    --fmllr-min-count=200  --num-iters=10 --size-scale=0.2 \
    --step-size-iters=3 --write-weights=ark:$dir/pre_wgt.JOB \
     $adapt_model $srcdir/fmllr.basis "$sifeats" ark,s,cs:- \
    ark:$dir/pre_trans.JOB || exit 1;
fi
##

# Features with the first-pass transforms applied on top of the unadapted ones.
pass1feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/pre_trans.JOB ark:- ark:- |"

## Do the main lattice generation pass.  Note: we don't determinize the lattices at
## this stage, as we're going to use them in acoustic rescoring with the larger
## model, and it's more correct to store the full state-level lattice for this purpose.
if [ $stage -le 2 ]; then
  echo "$0: doing main lattice generation phase"
  # If the graph directory records its number of pdfs, require it to equal
  # the number reported by am-info for the adaptation model.
  if [ -f "$graphdir/num_pdfs" ]; then
    [ "`cat $graphdir/num_pdfs`" -eq `am-info --print-args=false $adapt_model | grep pdfs | awk '{print $NF}'` ] || \
      { echo "Mismatch in number of pdfs with $adapt_model"; exit 1; }
  fi
  $cmd JOB=1:$nj --num-threads $num_threads $dir/log/decode.JOB.log \
    gmm-latgen-faster$thread_string --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \
    --acoustic-scale=$acwt  \
    --determinize-lattice=false --allow-partial=true --word-symbol-table=$graphdir/words.txt \
    $adapt_model $graphdir/HCLG.fst "$pass1feats" "ark:|gzip -c > $dir/lat.tmp.JOB.gz" \
    || exit 1;
fi
##

## Do a second pass of estimating the transform-- this time with the lattices
## generated in the main pass above (lat.tmp.JOB.gz, produced with the
## adaptation model).  Compose the transforms to get $dir/trans.1, etc.
# The '&&' is quoted so that it is passed through to the job command line
# rather than interpreted by this shell: compose-transforms only runs if the
# estimation pipeline succeeded.
if [ $stage -le 3 ]; then
  echo "$0: estimating fMLLR transforms a second time."
  $cmd JOB=1:$nj $dir/log/fmllr_pass2.JOB.log \
    lattice-determinize-pruned$thread_string --acoustic-scale=$acwt --beam=4.0 \
    "ark:gunzip -c $dir/lat.tmp.JOB.gz|" ark:- \| \
    lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
    weight-silence-post $silence_weight $silphonelist $adapt_model ark:- ark:- \| \
    gmm-est-basis-fmllr --fmllr-min-count=200 \
    --spk2utt=ark:$sdata/JOB/spk2utt --write-weights=ark:$dir/trans_tmp_wgt.JOB \
    $adapt_model $srcdir/fmllr.basis "$pass1feats" ark,s,cs:- ark:$dir/trans_tmp.JOB '&&' \
    compose-transforms --b-is-affine=true ark:$dir/trans_tmp.JOB ark:$dir/pre_trans.JOB \
    ark:$dir/trans.JOB  || exit 1;
fi
##

# Final features: unadapted features with the composed transforms applied.
feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |"

# Rescore the state-level lattices with the final adapted features, and the final model
# (which by default is $srcdir/final.mdl, but which may be specified on the command line,
# useful in case of discriminatively trained systems).
# At this point we prune and determinize the lattices and write them out, ready for
# language model rescoring.
# The temporary lattice lat.tmp.JOB.gz is removed only when the rescoring
# pipeline succeeds (quoted '&&').

if [ $stage -le 4 ]; then
  echo "$0: doing a final pass of acoustic rescoring."
  $cmd JOB=1:$nj $dir/log/acoustic_rescore.JOB.log \
    gmm-rescore-lattice $final_model "ark:gunzip -c $dir/lat.tmp.JOB.gz|" "$feats" ark:- \| \
    lattice-determinize-pruned$thread_string --acoustic-scale=$acwt --beam=$lattice_beam ark:- \
    "ark:|gzip -c > $dir/lat.JOB.gz" '&&' rm $dir/lat.tmp.JOB.gz || exit 1;
fi

# Scoring: a missing or non-executable local/score.sh is treated as an error
# unless --skip-scoring true was given.  $scoring_opts is forwarded so that
# --scoring-opts (documented as "options to local/score.sh") also applies to
# the final adapted lattices, not only to the speaker-independent pass.
if ! $skip_scoring ; then
  [ ! -x local/score.sh ] && \
    echo "$0: not scoring because local/score.sh does not exist or not executable." && exit 1;
  local/score.sh --cmd "$cmd" $scoring_opts $data $graphdir $dir ||
    { echo "$0: Scoring failed. (ignore by '--skip-scoring true')"; exit 1; }
fi

# Remove the intermediate transforms.  -f keeps this quiet when they are
# already gone, e.g. when the script is re-run from a late --stage.
rm -f $dir/{trans_tmp,pre_trans}.*

exit 0;


================================================
FILE: egs/steps/decode_biglm.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0

# Begin configuration.
# Defaults for the command-line options (presumably mapped onto these
# variables by parse_options.sh, which is sourced below -- confirm there).
nj=4
cmd=run.pl
# Decoder search parameters, passed to gmm-latgen-biglm-faster as
# --max-active, --beam, --lattice-beam and --acoustic-scale respectively.
maxactive=7000
beam=13.0
lattice_beam=6.0
acwt=0.083333
# When "true", local/score.sh is not run after decoding.
skip_scoring=false
# End configuration.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

# Exactly five positional arguments are required; anything else prints the
# usage text and exits with status 1.  The usage names this script and shows
# an example with all five arguments.
if [ $# != 5 ]; then
   echo "Usage: steps/decode_biglm.sh [options] <graph-dir> <old-LM-fst> <new-LM-fst> <data-dir> <decode-dir>"
   echo "... where <decode-dir> is assumed to be a sub-directory of the directory"
   echo " where the model is."
   echo "e.g.: steps/decode_biglm.sh exp/tri2b/graph_tgpr data/lang_test_tgpr/G.fst data/lang_test_tg/G.fst data/test_dev93 exp/tri2b/decode_tgpr_dev93_tg_biglm"
   echo ""
   echo "This script works on CMN + (delta+delta-delta | LDA+MLLT) features; it works out"
   echo "what type of features you used (assuming it's one of these two)"
   echo ""
   echo "main options (for others, see top of script file)"
   echo "  --config <config-file>                           # config containing options"
   echo "  --nj <nj>                                        # number of parallel jobs"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   exit 1;
fi


# Positional arguments.
graphdir=$1
oldlm_fst=$2
newlm_fst=$3
data=$4
dir=$5

srcdir=`dirname $dir`; # The model directory is one level up from decoding directory.
sdata=$data/split$nj;
# Optional per-model option files; a missing file yields an empty string
# because cat's error output is discarded.
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
delta_opts=`cat $srcdir/delta_opts 2>/dev/null`

mkdir -p $dir/log
# Re-split the data unless the split directory already exists and
# $data/feats.scp is older than it.
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs


# Fail early if any required input file is missing.  Messages are prefixed
# with $0 so that they name this script rather than a hard-coded one.
for f in $sdata/1/feats.scp $sdata/1/cmvn.scp $srcdir/final.mdl $graphdir/HCLG.fst $oldlm_fst $newlm_fst; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done


# The presence of final.mat selects LDA features; otherwise delta features.
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"

# Build the feature pipeline as a piped rspecifier.  JOB is a placeholder
# (presumably replaced with the job index by $cmd).
case $feat_type in
  delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";;
  lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";;
  *) echo "Invalid feature type $feat_type" && exit 1;
esac

# Warn (without aborting) when a words.txt found next to either LM FST differs
# from the graph's words.txt.  Each check compares the words.txt belonging to
# the LM it reports on.
[ -f `dirname $oldlm_fst`/words.txt ] && ! cmp `dirname $oldlm_fst`/words.txt $graphdir/words.txt && \
  echo "Warning: old LM words.txt does not match with that in $graphdir .. probably will not work.";
[ -f `dirname $newlm_fst`/words.txt ] && ! cmp `dirname $newlm_fst`/words.txt $graphdir/words.txt && \
  echo "Warning: new LM words.txt does not match with that in $graphdir .. probably will not work.";

# fstproject replaces the disambiguation symbol #0, which only appears on the
# input side, with the <eps> that appears in the corresponding arcs on the output side.
# Both LMs are handed to the decoder as piped commands ending in "|".
oldlm_cmd="fstproject --project_output=true $oldlm_fst | fstarcsort --sort_type=ilabel |"
newlm_cmd="fstproject --project_output=true $newlm_fst | fstarcsort --sort_type=ilabel |"

# Lattice generation with gmm-latgen-biglm-faster, given the graph plus the
# old and new LM; one gzipped lattice archive lat.JOB.gz is written per job.
$cmd JOB=1:$nj $dir/log/decode.JOB.log \
 gmm-latgen-biglm-faster --max-active=$maxactive --beam=$beam --lattice-beam=$lattice_beam \
   --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \
  $srcdir/final.mdl $graphdir/HCLG.fst "$oldlm_cmd" "$newlm_cmd" "$feats" \
  "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;

# Scoring: a missing or non-executable local/score.sh is treated as an error
# unless --skip-scoring true was given.
if ! $skip_scoring ; then
  [ ! -x local/score.sh ] && \
    echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
  local/score.sh --cmd "$cmd" $data $graphdir $dir ||
    { echo "$0: Scoring failed. (ignore by '--skip-scoring true')"; exit 1; }
fi

exit 0;


================================================
FILE: egs/steps/decode_combine.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.

# Combine two decoding directories by composing the lattices (we
# apply a weight to each of the original weights, by default 0.5 each).
# Note, this is not the only combination method, or the most normal combination
# method.  See also egs/wsj/s5/local/score_combine.sh.

# Begin configuration section.
# Defaults for the command-line options (presumably mapped onto these
# variables by parse_options.sh, which is sourced below -- confirm there).
weight1=0.5 # Weight on 1st set of lattices.
# Job launcher used for the interpolation jobs and passed on to local/score.sh.
cmd=run.pl
# When "true", local/score.sh is not run on the combined lattices.
skip_scoring=false
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

# Exactly five positional arguments are required; anything else prints the
# usage text and exits with status 1.  The example lists the arguments in the
# same order in which they are read below (<data> first, then the lang/graph dir).
if [ $# -ne 5 ]; then
  echo "Usage: steps/decode_combine.sh [options] <data> <lang-dir|graph-dir> <decode-dir1> <decode-dir2> <decode-dir-out>"
  echo " e.g.: steps/decode_combine.sh data/test data/lang exp/dir1/decode exp/dir2/decode exp/combine_1_2/decode"
  echo "main options (for others, see top of script file)"
  echo "  --config <config-file>                   # config containing options"
  echo "  --cmd <cmd>                              # Command to run in parallel with"
  echo "  --weight1 <weight>                       # Weight on 1st set of lattices (default 0.5)"
  exit 1;
fi

# Positional arguments.
data=$1
lang_or_graphdir=$2
srcdir1=$3
srcdir2=$4
dir=$5

# Fail early if any required input file is missing.
for f in $data/utt2spk $lang_or_graphdir/phones.txt $srcdir1/lat.1.gz $srcdir2/lat.1.gz; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

# Lattices are combined job by job (lat.JOB.gz from each directory), so both
# source directories must have been decoded with the same number of jobs.
nj1=`cat $srcdir1/num_jobs` || exit 1;
nj2=`cat $srcdir2/num_jobs` || exit 1;
[ $nj1 -ne $nj2 ] && echo "$0: mismatch in number of jobs $nj1 versus $nj2" && exit 1;
nj=$nj1

mkdir -p $dir/log
echo $nj > $dir/num_jobs

# The lattice-interp command does the score interpolation (with composition),
# and the lattice-copy-backoff replaces the result with the 1st lattice, in
# cases where the composed result was empty.
$cmd JOB=1:$nj $dir/log/interp.JOB.log \
  lattice-interp --alpha=$weight1 "ark:gunzip -c $srcdir1/lat.JOB.gz|" \
   "ark,s,cs:gunzip -c $srcdir2/lat.JOB.gz|" ark:- \| \
  lattice-copy-backoff "ark,s,cs:gunzip -c $srcdir1/lat.JOB.gz|" ark,s,cs:- \
   "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1;

# Scoring: a missing or non-executable local/score.sh is treated as an error
# unless --skip-scoring true was given.
if ! $skip_scoring ; then
  [ ! -x local/score.sh ] && \
    echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
  local/score.sh --cmd "$cmd" $data $lang_or_graphdir $dir ||
    { echo "$0: Scoring failed. (ignore by '--skip-scoring true')"; exit 1; }
fi

exit 0;


================================================
FILE: egs/steps/decode_fmllr.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2015  Johns Hopkins University (Author: Daniel Povey)

# Decoding script that does fMLLR.  This can be on top of delta+delta-delta, or
# LDA+MLLT features.

# There are 3 models involved potentially in this script,
# and for a standard, speaker-independent system they will all be the same.
# The "alignment model" is for the 1st-pass decoding and to get the
# Gaussian-level alignments for the "adaptation model" the first time we
# do fMLLR.  The "adaptation model" is used to estimate fMLLR transforms
# and to generate state-level lattices.  The lattices are then rescored
# with the "final model".
#
# The following table explains where we get these 3 models from.
# Note: $srcdir is one level up from the decoding directory.
#
#   Model              Default source:
#
#  "alignment model"   $srcdir/final.alimdl              --alignment-model <model>
#                     (or $srcdir/final.mdl if alimdl absent)
#  "adaptation model"  $srcdir/final.mdl                 --adapt-model <model>
#  "final model"       $srcdir/final.mdl                 --final-model <model>


# Begin configuration section
first_beam=10.0 # Beam used in initial, speaker-indep. pass
first_max_active=2000 # max-active used in initial pass.
alignment_model=
adapt_model=
final_model=
stage=0
acwt=0.083333 # Acoustic weight used in getting fMLLR transforms, and also in
              # lattice generation.
max_active=7000
beam=13.0
lattice_beam=6.0
nj=4
silence_weight=0.01
cmd=run.pl
si_dir=
fmllr_update_type=full
num_threads=1 # if >1, will use gmm-latgen-faster-parallel
parallel_opts=  # ignored now.
skip_scoring=false
scoring_opts=
max_fmllr_jobs=25  # I've seen the fMLLR jobs overload NFS badly if the decoding
                   # was started with a large number of jobs, so we limit the
                   # number of parallel jobs to 25 by default.
# End configuration section
echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

if [ $# != 3 ]; then
   echo "Wrong #arguments ($#, expected 3)"
   echo "Usage: steps/decode_fmllr.sh [options] <graph-dir> <data-dir> <decode-dir>"
   echo " e.g.: steps/decode_fmllr.sh exp/tri2b/graph_tgpr data/test_dev93 exp/tri2b/decode_dev93_tgpr"
   echo "main options (for others, see top of script file)"
   echo "  --config <config-file>                   # config containing options"
   echo "  --nj <nj>                                # number of parallel jobs"
   echo "  --cmd <cmd>                              # Command to run in parallel with"
   echo "  --adapt-model <adapt-mdl>                # Model to compute transforms with"
   echo "  --alignment-model <ali-mdl>              # Model to get Gaussian-level alignments for"
   echo "                                           # 1st pass of transform computation."
   echo "  --final-model <finald-mdl>               # Model to finally decode with"
   echo "  --si-dir <speaker-indep-decoding-dir>    # use this to skip 1st pass of decoding"
   echo "                                           # Caution-- must be with same tree"
   echo "  --acwt <acoustic-weight>                 # default 0.08333 ... used to get posteriors"
   echo "  --num-threads <n>                        # number of threads to use, default 1."
   echo "  --scoring-opts <opts>                    # options to local/score.sh"
   exit 1;
fi


graphdir=$1
data=$2
dir=`echo $3 | sed 's:/$::g'` # remove any trailing slash.

srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory.
sdata=$data/split$nj;

thread_string=
[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads"


mkdir -p $dir/log
split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
delta_opts=`cat $srcdir/delta_opts 2>/dev/null`

silphonelist=`cat $graphdir/phones/silence.csl` || exit 1;

# Some checks.  Note: we don't need $srcdir/tree but we expect
# it should exist, given the current structure of the scripts.
for f in $graphdir/HCLG.fst $data/feats.scp $srcdir/tree; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

## Work out name of alignment model. ##
if [ -z "$alignment_model" ]; then
  if [ -f "$srcdir/final.alimdl" ]; then alignment_model=$srcdir/final.alimdl;
  else alignment_model=$srcdir/final.mdl; fi
fi
[ ! -f "$alignment_model" ] && echo "$0: no alignment model $alignment_model " && exit 1;
##

## Do the speaker-independent decoding, if --si-dir option not present. ##
if [ -z "$si_dir" ]; then # we need to do the speaker-independent decoding pass.
  si_dir=${dir}.si # Name it as our decoding dir, but with suffix ".si".
  if [ $stage -le 0 ]; then
    if [ -f "$graphdir/num_pdfs" ]; then
      [ "`cat $graphdir/num_pdfs`" -eq `am-info --print-args=false $alignment_model | grep pdfs | awk '{print $NF}'` ] || \
        { echo "Mismatch in number of pdfs with $alignment_model"; exit 1; }
    fi
    steps/decode.sh --scoring-opts "$scoring_opts" \
           --num-threads $num_threads --skip-scoring $skip_scoring \
           --acwt $acwt --nj $nj --cmd "$cmd" --beam $first_beam \
           --model $alignment_model --max-active \
           $first_max_active $graphdir $data $si_dir || exit 1;
  fi
fi
##

## Some checks, and setting of defaults for variables.
[ "$nj" -ne "`cat $si_dir/num_jobs`" ] && echo "Mismatch in #jobs with si-dir" && exit 1;
[ ! -f "$si_dir/lat.1.gz" ] && echo "No such file $si_dir/lat.1.gz" && exit 1;
[ -z "$adapt_model" ] && adapt_model=$srcdir/final.mdl
[ -z "$final_model" ] && final_model=$srcdir/final.mdl
for f in $adapt_model $final_model; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
##

## Set up the unadapted features "$sifeats"
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type";
case $feat_type in
  delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";;
  lda) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";;
  *) echo "Invalid feature type $feat_type" && exit 1;
esac
##

## Now get the first-pass fMLLR transforms.
if [ $stage -le 1 ]; then
  echo "$0: getting first-pass fMLLR transforms."
  $cmd --max-jobs-run $max_fmllr_jobs JOB=1:$nj $dir/log/fmllr_pass1.JOB.log \
    gunzip -c $si_dir/lat.JOB.gz \| \
    lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
    weight-silence-post $silence_weight $silphonelist $alignment_model ark:- ark:- \| \
    gmm-post-to-gpost $alignment_model "$sifeats" ark:- ark:- \| \
    gmm-est-fmllr-gpost --fmllr-update-type=$fmllr_update_type \
    --spk2utt=ark:$sdata/JOB/spk2utt $adapt_model "$sifeats" ark,s,cs:- \
    ark:$dir/pre_trans.JOB || exit 1;
fi
##

pass1feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/pre_trans.JOB ark:- ark:- |"

## Do the main lattice generation pass.  Note: we don't determinize the lattices at
## this stage, as we're going to use them in acoustic rescoring with the larger
## model, and it's more correct to store the full state-level lattice for this purpose.
if [ $stage -le 2 ]; then
  echo "$0: doing main lattice generation phase"
  if [ -f "$graphdir/num_pdfs" ]; then
    [ "`cat $graphdir/num_pdfs`" -eq `am-info --print-args=false $adapt_model | grep pdfs | awk '{print $NF}'` ] || \
      { echo "Mismatch in number of pdfs with $adapt_model"; exit 1; }
  fi
  $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode.JOB.log \
    gmm-latgen-faster$thread_string --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \
    --acoustic-scale=$acwt --determinize-lattice=false \
    --allow-partial=true --word-symbol-table=$graphdir/words.txt \
    $adapt_model $graphdir/HCLG.fst "$pass1feats" "ark:|gzip -c > $dir/lat.tmp.JOB.gz" \
    || exit 1;
fi
##

## Do a second pass of estimating the transform-- this time with the lattices
## generated from the alignment model.  Compose the transforms to get
## $dir/trans.1, etc.
if [ $stage -le 3 ]; then
  echo "$0: estimating fMLLR transforms a second time."
  $cmd --max-jobs-run $max_fmllr_jobs JOB=1:$nj $dir/log/fmllr_pass2.JOB.log \
    lattice-determinize-pruned$thread_string --acoustic-scale=$acwt --beam=4.0 \
    "ark:gunzip -c $dir/lat.tmp.JOB.gz|" ark:- \| \
    lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
    weight-silence-post $silence_weight $silphonelist $adapt_model ark:- ark:- \| \
    gmm-est-fmllr --fmllr-update-type=$fmllr_update_type \
    --spk2utt=ark:$sdata/JOB/spk2utt $adapt_model "$pass1feats" \
    ark,s,cs:- ark:$dir/trans_tmp.JOB '&&' \
    compose-transforms --b-is-affine=true ark:$dir/trans_tmp.JOB ark:$dir/pre_trans.JOB \
    ark:$dir/trans.JOB  || exit 1;
fi
##

feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |"

# Rescore the state-level lattices with the final adapted features, and the final model
# (which by default is $srcdir/final.mdl, but which may be specified on the command line,
# useful in case of discriminatively trained systems).
# At this point we prune and determinize the lattices and write them out, ready for
# language model rescoring.

if [ $stage -le 4 ]; then
  echo "$0: doing a final pass of acoustic rescoring."
  $cmd --num-threads $num_threads JOB=1:$nj $dir/log/acoustic_rescore.JOB.log \
    gmm-rescore-lattice $final_model "ark:gunzip -c $dir/lat.tmp.JOB.gz|" "$feats" ark:- \| \
    lattice-determinize-pruned$thread_string --acoustic-scale=$acwt --beam=$lattice_beam ark:- \
    "ark:|gzip -c > $dir/lat.JOB.gz" '&&' rm $dir/lat.tmp.JOB.gz || exit 1;
fi

if [ $stage -le 5 ]; then
  steps/diagnostic/analyze_lats.sh --cmd "$cmd" $graphdir $dir
fi

if ! $skip_scoring ; then
  [ ! -x local/score.sh ] && \
    echo "$0: not scoring because local/score.sh does not exist or not executable." && exit 1;
  local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir
fi

rm $dir/{trans_tmp,pre_trans}.*

exit 0;


================================================
FILE: egs/steps/decode_fmllr_extra.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)

# Decoding script that does fMLLR.  This can be on top of delta+delta-delta, or
# LDA+MLLT features.
# This script does an extra pass of lattice generation over and above what the original
# script did-- it's for robustness in the case where your original cepstral mean
# normalization was way off.
# We also added a new option --distribute=true (by default) to 
# weight-silence-post.  This weights the silence frames in a different way,
# weighting all posteriors on the frame rather than just the silence ones, which
# removes a particular kind of bias that the old approach suffered from.

# There are 3 models involved potentially in this script,
# and for a standard, speaker-independent system they will all be the same.
# The "alignment model" is for the 1st-pass decoding and to get the 
# Gaussian-level alignments for the "adaptation model" the first time we
# do fMLLR.  The "adaptation model" is used to estimate fMLLR transforms
# and to generate state-level lattices.  The lattices are then rescored
# with the "final model".
#
# The following table explains where we get these 3 models from.
# Note: $srcdir is one level up from the decoding directory.
#
#   Model              Default source:                 
#
#  "alignment model"   $srcdir/final.alimdl              --alignment-model <model>
#                     (or $srcdir/final.mdl if alimdl absent)
#  "adaptation model"  $srcdir/final.mdl                 --adapt-model <model>
#  "final model"       $srcdir/final.mdl                 --final-model <model>


# Begin configuration section
# NOTE: these are defaults.  parse_options.sh (sourced below) presumably lets
# each one be overridden as --<name> on the command line, as the usage text
# suggests.
first_beam=10.0 # Beam used in initial, speaker-indep. pass (and in the first adapted pass).
first_max_active=2000 # max-active used in first two passes.
first_lattice_beam=4.0 # lattice pruning beam for the first-pass fMLLR decode (stage 2).
                # the different spelling from lattice_beam is unfortunate; these scripts
                # have a history.
alignment_model= # if empty: $srcdir/final.alimdl when present, else $srcdir/final.mdl
adapt_model=     # if empty: $srcdir/final.mdl
final_model=     # if empty: $srcdir/final.mdl
cleanup=true # if true, intermediate transforms and lattices are removed once used.
stage=0 # numbered stages below this value are skipped.
acwt=0.083333 # Acoustic weight used in getting fMLLR transforms, and also in 
              # lattice generation.
max_active=7000 # --max-active for the final lattice generation pass.
max_mem=50000000 # passed as --max-mem to gmm-latgen-faster.
beam=13.0 # --beam for the final lattice generation pass.
lattice_beam=6.0 # --lattice-beam for the final lattice generation pass.
nj=4 # number of parallel jobs; also selects the data split $data/split$nj.
silence_weight=0.01 # weight argument given to weight-silence-post.
distribute=true # option to weight-silence-post.
cmd=run.pl # command used to launch the JOB=1:$nj parallel jobs.
si_dir= # existing speaker-independent decode dir; if empty, <decode-dir>.si is used.
fmllr_update_type=full # passed as --fmllr-update-type to gmm-est-fmllr(-gpost).
skip_scoring=false # if true, local/score.sh is not run.
num_threads=1 # if >1, will use gmm-latgen-faster-parallel
parallel_opts=  # ignored now.
scoring_opts= # extra options passed through to local/score.sh.

# End configuration section

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

# Exactly three positional arguments are required; otherwise print the usage
# text and fail.  The usage text names this script (decode_fmllr_extra.sh)
# rather than the decode_fmllr.sh script it was derived from.
if [ $# != 3 ]; then
   echo "Usage: steps/decode_fmllr_extra.sh [options] <graph-dir> <data-dir> <decode-dir>"
   echo " e.g.: steps/decode_fmllr_extra.sh exp/tri2b/graph_tgpr data/test_dev93 exp/tri2b/decode_dev93_tgpr"
   echo "main options (for others, see top of script file)"
   echo "  --config <config-file>                   # config containing options"
   echo "  --nj <nj>                                # number of parallel jobs"
   echo "  --cmd <cmd>                              # Command to run in parallel with"
   echo "  --adapt-model <adapt-mdl>                # Model to compute transforms with"
   echo "  --alignment-model <ali-mdl>              # Model to get Gaussian-level alignments for"
   echo "                                           # 1st pass of transform computation."
   echo "  --final-model <final-mdl>                # Model to finally decode with"
   echo "  --si-dir <speaker-indep-decoding-dir>    # use this to skip 1st pass of decoding"
   echo "                                           # Caution-- must be with same tree"
   echo "  --acwt <acoustic-weight>                 # default 0.08333 ... used to get posteriors"
   echo "  --num-threads <n>                        # number of threads to use, default 1."
   echo "  --scoring-opts <opts>                    # options to local/score.sh"
   exit 1;
fi


# Positional arguments: decoding graph dir, data dir, output decode dir.
graphdir=$1
data=$2
dir=$(echo $3 | sed 's:/$::g')  # drop a trailing slash, if there is one.

# The model directory is taken to be the parent of the decoding directory.
srcdir=$(dirname $dir)
sdata=$data/split$nj

# Suffix appended to binary names (plus a thread-count option) when
# more than one thread is requested.
if [ $num_threads -gt 1 ]; then
  thread_string="-parallel --num-threads=$num_threads"
else
  thread_string=
fi

mkdir -p $dir/log
# Split the data unless a split directory newer than feats.scp already exists.
if ! [[ -d $sdata && $data/feats.scp -ot $sdata ]]; then
  split_data.sh $data $nj || exit 1
fi
echo $nj > $dir/num_jobs

# Feature-pipeline options stored next to the model; a missing file simply
# yields an empty option string.
splice_opts=$(cat $srcdir/splice_opts 2>/dev/null)  # frame-splicing options.
cmvn_opts=$(cat $srcdir/cmvn_opts 2>/dev/null)
delta_opts=$(cat $srcdir/delta_opts 2>/dev/null)

# Silence phone list handed to weight-silence-post; it must be readable.
silphonelist=$(cat $graphdir/phones/silence.csl) || exit 1

# Some checks.  Note: we don't need $srcdir/tree but we expect
# it should exist, given the current structure of the scripts.
for f in $graphdir/HCLG.fst $data/feats.scp $srcdir/tree; do
  if [ ! -f $f ]; then
    echo "$0: no such file $f" && exit 1
  fi
done

## Work out name of alignment model. ##
# Unless given on the command line, prefer final.alimdl and fall back to final.mdl.
if [ -z "$alignment_model" ]; then
  alignment_model=$srcdir/final.mdl
  [ -f "$srcdir/final.alimdl" ] && alignment_model=$srcdir/final.alimdl
fi
if [ ! -f "$alignment_model" ]; then
  echo "$0: no alignment model $alignment_model " && exit 1
fi
##

## Do the speaker-independent decoding, if --si-dir option not present. ##
if [ -z "$si_dir" ]; then # we need to do the speaker-independent decoding pass.
  si_dir=${dir}.si # Name it as our decoding dir, but with suffix ".si".
  if [ $stage -le 0 ]; then
    # If the graph records its number of pdfs, it must match the alignment model.
    # (The ';' before 'exit 1' matters: without it 'exit 1' is just echoed
    # and the script carries on despite the mismatch.)
    if [ -f "$graphdir/num_pdfs" ]; then
      [ "`cat $graphdir/num_pdfs`" -eq `am-info --print-args=false $alignment_model | grep pdfs | awk '{print $NF}'` ] || \
        { echo "Mismatch in number of pdfs with $alignment_model"; exit 1; }
    fi
    # First-pass decode with the alignment model; scoring is always skipped here.
    steps/decode.sh --acwt $acwt --nj $nj --cmd "$cmd" --beam $first_beam --model $alignment_model \
      --max-active $first_max_active --num-threads $num_threads \
      --skip-scoring true $graphdir $data $si_dir || exit 1;
  fi
fi
##

## Some checks, and setting of defaults for variables.
# The speaker-independent lattices must come from the same number of jobs.
if [ "$nj" -ne "$(cat $si_dir/num_jobs)" ]; then
  echo "Mismatch in #jobs with si-dir" && exit 1
fi
if [ ! -f "$si_dir/lat.1.gz" ]; then
  echo "No such file $si_dir/lat.1.gz" && exit 1
fi
# Both the adaptation model and the final model default to final.mdl.
if [ -z "$adapt_model" ]; then adapt_model=$srcdir/final.mdl; fi
if [ -z "$final_model" ]; then final_model=$srcdir/final.mdl; fi
for f in $adapt_model $final_model; do
  if [ ! -f $f ]; then
    echo "$0: no such file $f" && exit 1
  fi
done
##

## Set up the unadapted features "$sifeats"
# The presence of final.mat selects the LDA pipeline; otherwise deltas are used.
if [ -f $srcdir/final.mat ]; then
  feat_type=lda
else
  feat_type=delta
fi
echo "$0: feature type is $feat_type"
if [ "$feat_type" = delta ]; then
  sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |"
elif [ "$feat_type" = lda ]; then
  sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
else
  echo "Invalid feature type $feat_type" && exit 1
fi
##

## Now get the first-pass fMLLR transforms.
# One job per data split is launched through $cmd (JOB ranges over 1..$nj and
# also appears inside paths).  The '\|' tokens are escaped so that the pipes
# are handed to $cmd as part of the job's command line instead of being
# interpreted by this shell.  Input: the speaker-independent lattices
# $si_dir/lat.JOB.gz; output: per-job transforms $dir/trans1.JOB.
# The alignment model is used for the posterior/gpost steps and the
# adaptation model for the transform estimation itself.
if [ $stage -le 1 ]; then
  echo "$0: getting first-pass fMLLR transforms."
  $cmd JOB=1:$nj $dir/log/fmllr_pass1.JOB.log \
    gunzip -c $si_dir/lat.JOB.gz \| \
    lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
    weight-silence-post --distribute=$distribute $silence_weight $silphonelist $alignment_model ark:- ark:- \| \
    gmm-post-to-gpost $alignment_model "$sifeats" ark:- ark:- \| \
    gmm-est-fmllr-gpost --fmllr-update-type=$fmllr_update_type \
    --spk2utt=ark:$sdata/JOB/spk2utt $adapt_model "$sifeats" ark,s,cs:- \
    ark:$dir/trans1.JOB || exit 1;
fi
##

# Unadapted features with the first-pass transforms applied on top.
pass1feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans1.JOB ark:- ark:- |"

## Do the first adapted lattice generation pass. 
# Decodes the first-pass-adapted features ($pass1feats) with the adaptation
# model, using the tighter "first_*" search settings, and writes
# $dir/lat1.JOB.gz.
if [ $stage -le 2 ]; then
  echo "$0: doing first adapted lattice generation phase"
  # If the graph records its number of pdfs, it must match the adaptation model.
  # (The ';' before 'exit 1' matters: without it 'exit 1' is just echoed
  # and decoding proceeds despite the mismatch.)
  if [ -f "$graphdir/num_pdfs" ]; then
    [ "`cat $graphdir/num_pdfs`" -eq `am-info --print-args=false $adapt_model | grep pdfs | awk '{print $NF}'` ] || \
      { echo "Mismatch in number of pdfs with $adapt_model"; exit 1; }
  fi
  $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode1.JOB.log \
    gmm-latgen-faster$thread_string --max-active=$first_max_active --max-mem=$max_mem --beam=$first_beam --lattice-beam=$first_lattice_beam \
    --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \
    $adapt_model $graphdir/HCLG.fst "$pass1feats" "ark:|gzip -c > $dir/lat1.JOB.gz" \
    || exit 1;
fi


## Do a second pass of estimating the transform.  Compose the transforms to get
## $dir/trans2.*.
# Each job estimates an incremental transform ($dir/trans1b.JOB) on top of the
# first-pass features using the stage-2 lattices, then composes it with
# $dir/trans1.JOB to give $dir/trans2.JOB.  The quoted '&&' and the escaped
# '\|' are passed through to $cmd, so both steps run inside the same job.
# With cleanup enabled, the inputs of this stage (trans1b, trans1, lat1) are
# deleted afterwards.
if [ $stage -le 3 ]; then
  echo "$0: estimating fMLLR transforms a second time."
  $cmd JOB=1:$nj $dir/log/fmllr_pass2.JOB.log \
    lattice-to-post --acoustic-scale=$acwt "ark:gunzip -c $dir/lat1.JOB.gz|" ark:- \| \
    weight-silence-post --distribute=$distribute $silence_weight $silphonelist $adapt_model ark:- ark:- \| \
    gmm-est-fmllr --fmllr-update-type=$fmllr_update_type \
    --spk2utt=ark:$sdata/JOB/spk2utt $adapt_model "$pass1feats" \
    ark,s,cs:- ark:$dir/trans1b.JOB '&&' \
    compose-transforms --b-is-affine=true ark:$dir/trans1b.JOB ark:$dir/trans1.JOB \
    ark:$dir/trans2.JOB  || exit 1;
  if $cleanup; then
    rm $dir/trans1b.* $dir/trans1.* $dir/lat1.*.gz
  fi
fi
##

# Unadapted features with the composed second-pass transforms applied.
pass2feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans2.JOB ark:- ark:- |"

# Generate a 3rd set of lattices, with the "adaptation model"; we'll use these
# to adapt a 3rd time, and we'll rescore them.  Since we should be close to the final
# fMLLR, we don't bother dumping un-determinized lattices to disk.

## Do the final lattice generation pass (but we'll rescore these lattices
## after another stage of adaptation.)
# Uses the full search settings ($max_active, $beam, $lattice_beam) on the
# second-pass-adapted features and writes $dir/lat2.JOB.gz.
if [ $stage -le 4 ]; then
  echo "$0: doing final lattice generation phase"
  $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode2.JOB.log\
    gmm-latgen-faster$thread_string --max-active=$max_active --max-mem=$max_mem --beam=$beam --lattice-beam=$lattice_beam \
    --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \
    $adapt_model $graphdir/HCLG.fst "$pass2feats" "ark:|gzip -c > $dir/lat2.JOB.gz" \
    || exit 1;
fi


## Do a third pass of estimating the transform.  Compose the transforms to get
## $dir/trans.*.
# Same recipe as the second pass, one level further: an incremental transform
# ($dir/trans2b.JOB) is estimated from the stage-4 lattices on the
# second-pass features and composed with $dir/trans2.JOB, giving the final
# $dir/trans.JOB.  With cleanup enabled, trans2b and trans2 are then removed.
if [ $stage -le 5 ]; then
  echo "$0: estimating fMLLR transforms a third time."
  $cmd JOB=1:$nj $dir/log/fmllr_pass3.JOB.log \
    lattice-to-post --acoustic-scale=$acwt "ark:gunzip -c $dir/lat2.JOB.gz|" ark:- \| \
    weight-silence-post --distribute=$distribute $silence_weight $silphonelist $adapt_model ark:- ark:- \| \
    gmm-est-fmllr --fmllr-update-type=$fmllr_update_type \
    --spk2utt=ark:$sdata/JOB/spk2utt $adapt_model "$pass2feats" \
    ark,s,cs:- ark:$dir/trans2b.JOB '&&' \
    compose-transforms --b-is-affine=true ark:$dir/trans2b.JOB ark:$dir/trans2.JOB \
    ark:$dir/trans.JOB  || exit 1;
  if $cleanup; then
    rm $dir/trans2b.* $dir/trans2.*
  fi
fi
##

# Unadapted features with the final transforms applied; used for rescoring.
feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |"

# Rescore the stage-4 lattices with the final model on the fully adapted
# features, writing the final lattices $dir/lat.JOB.gz.  With cleanup enabled
# the stage-4 lattices are removed afterwards.
if [ $stage -le 6 ]; then
  echo "$0: doing a final pass of acoustic rescoring."
  $cmd JOB=1:$nj $dir/log/acoustic_rescore.JOB.log \
    gmm-rescore-lattice $final_model "ark:gunzip -c $dir/lat2.JOB.gz|" "$feats" \
      "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
  if $cleanup; then
    rm $dir/lat2.*.gz
  fi
fi

# Scoring.  A failing local/score.sh is reported and makes this script fail,
# instead of being hidden behind the final 'exit 0'.
if ! $skip_scoring ; then
  [ ! -x local/score.sh ] && \
    echo "$0: not scoring because local/score.sh does not exist or not executable." && exit 1;
  local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir ||
    { echo "$0: Scoring failed. (ignore by '--skip-scoring true')"; exit 1; }
fi

exit 0;


================================================
FILE: egs/steps/decode_fmmi.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# Decoding of fMMI or fMPE models (feature-space discriminative training).
# If transform-dir supplied, expects e.g. fMLLR transforms in that dir.

# Begin configuration section.  
# NOTE: these are defaults.  parse_options.sh (sourced below) presumably lets
# each one be overridden as --<name> on the command line, as the usage text
# suggests.
stage=1 # numbered stages below this value are skipped.
iter=final # model iteration: $srcdir/$iter.mdl and $srcdir/$iter.fmpe are used.
nj=4 # number of parallel jobs; also selects the data split $data/split$nj.
cmd=run.pl # command used to launch the JOB=1:$nj parallel jobs.
maxactive=7000 # passed as --max-active to gmm-latgen-faster.
beam=13.0 # passed as --beam to gmm-latgen-faster.
lattice_beam=6.0 # passed as --lattice-beam to gmm-latgen-faster.
acwt=0.083333 # note: only really affects pruning (scoring is on lattices).
ngselect=2; # Just use the 2 top Gaussians for fMMI/fMPE.  Should match train.
transform_dir= # if non-empty, $transform_dir/trans.JOB is applied to the features.
num_threads=1 # if >1, will use gmm-latgen-faster-parallel
parallel_opts=  # ignored now.
scoring_opts= # extra options passed through to local/score.sh.
skip_scoring=false # if true, local/score.sh is not run.
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

# Exactly three positional arguments are required; otherwise print the usage
# text and fail.  The "speaker-adapted decoding" continuation line is printed
# directly under --transform-dir, the option it describes.
if [ $# != 3 ]; then
   echo "Usage: steps/decode_fmmi.sh [options] <graph-dir> <data-dir> <decode-dir>"
   echo "... where <decode-dir> is assumed to be a sub-directory of the directory"
   echo " where the model is."
   echo "e.g.: steps/decode_fmmi.sh exp/mono/graph_tgpr data/test_dev93 exp/mono/decode_dev93_tgpr"
   echo ""
   echo "This script works on CMN + (delta+delta-delta | LDA+MLLT) features; it works out"
   echo "what type of features you used (assuming it's one of these two)"
   echo "You can also use fMLLR features-- you have to supply --transform-dir option."
   echo ""
   echo "main options (for others, see top of script file)"
   echo "  --config <config-file>                           # config containing options"
   echo "  --nj <nj>                                        # number of parallel jobs"
   echo "  --iter <iter>                                    # Iteration of model to test."
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   echo "  --acwt <float>                                   # acoustic scale used for lattice generation "
   echo "  --transform-dir <transform-dir>                  # where to find fMLLR transforms, for"
   echo "                                                   # speaker-adapted decoding"
   echo "  --scoring-opts <string>                          # options to local/score.sh"
   echo "  --num-threads <n>                                # number of threads to use, default 1."
   exit 1;
fi


# Positional arguments: decoding graph dir, data dir, output decode dir.
graphdir=$1
data=$2
dir=$3
srcdir=$(dirname $dir)  # The model directory is one level up from decoding directory.
sdata=$data/split$nj

# Feature-pipeline options stored next to the model; a missing file simply
# yields an empty option string.
splice_opts=$(cat $srcdir/splice_opts 2>/dev/null)
cmvn_opts=$(cat $srcdir/cmvn_opts 2>/dev/null)
delta_opts=$(cat $srcdir/delta_opts 2>/dev/null)

# Suffix appended to the decoder binary name (plus a thread-count option)
# when more than one thread is requested.
if [ $num_threads -gt 1 ]; then
  thread_string="-parallel --num-threads=$num_threads"
else
  thread_string=
fi

mkdir -p $dir/log
# Split the data unless a split directory newer than feats.scp already exists.
if ! [[ -d $sdata && $data/feats.scp -ot $sdata ]]; then
  split_data.sh $data $nj || exit 1
fi
echo $nj > $dir/num_jobs

model=$srcdir/$iter.mdl

# Required inputs.
for f in $sdata/1/feats.scp $sdata/1/cmvn.scp $model $graphdir/HCLG.fst; do
  if [ ! -f $f ]; then
    echo "decode_fmmi.sh: no such file $f" && exit 1
  fi
done

# The presence of final.mat selects the LDA pipeline; otherwise deltas are used.
if [ -f $srcdir/final.mat ]; then
  feat_type=lda
else
  feat_type=delta
fi
echo "decode_fmmi.sh: feature type is $feat_type"

if [ "$feat_type" = delta ]; then
  feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |"
elif [ "$feat_type" = lda ]; then
  feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
else
  echo "Invalid feature type $feat_type" && exit 1
fi

# Optionally apply per-speaker transforms from --transform-dir on top of the
# base features.  Missing transforms, or transforms produced with a different
# number of jobs, are fatal: trans.JOB would not line up with the data split
# used here, so carrying on would only fail later inside the decoding jobs.
if [ ! -z "$transform_dir" ]; then # add transforms to features...
  echo "Using fMLLR transforms from $transform_dir"
  if [ ! -f $transform_dir/trans.1 ]; then
    echo "Expected $transform_dir/trans.1 to exist."
    exit 1
  fi
  # Written as a positive test so that an unreadable num_jobs file (which
  # makes the comparison itself fail) is also treated as a mismatch.
  [ "`cat $transform_dir/num_jobs`" -eq $nj ] || \
    { echo "Mismatch in number of jobs with $transform_dir"; exit 1; }
  feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
fi

# Features with the fMPE transform applied; reads the per-job Gaussian
# selection info written to $dir/gselect.JOB.gz by stage 1.
fmpefeats="$feats fmpe-apply-transform $srcdir/$iter.fmpe ark:- 'ark,s,cs:gunzip -c $dir/gselect.JOB.gz|' ark:- |"

# Stage 1: per-job Gaussian selection on the (possibly transformed) base
# features, keeping $ngselect Gaussians; written to $dir/gselect.JOB.gz, which
# "$fmpefeats" reads back.
if [ $stage -le 1 ]; then
  # Get Gaussian selection info.
  $cmd JOB=1:$nj $dir/log/gselect.JOB.log \
    gmm-gselect --n=$ngselect $srcdir/$iter.fmpe "$feats" \
    "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1;
fi

# Stage 2: lattice generation on the fMPE-transformed features; writes
# $dir/lat.JOB.gz.
if [ $stage -le 2 ]; then
  $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode.JOB.log \
    gmm-latgen-faster$thread_string --max-active=$maxactive --beam=$beam --lattice-beam=$lattice_beam \
    --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \
    $model $graphdir/HCLG.fst "$fmpefeats" "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
fi

# Stage 3: scoring, unless --skip-scoring true.  A missing or non-executable
# local/score.sh, or a failing scoring run, makes the script exit with status 1.
if [ $stage -le 3 ]; then
  if ! $skip_scoring ; then
    [ ! -x local/score.sh ] && \
      echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;

    local/score.sh --cmd "$cmd" $scoring_opts $data $graphdir $dir ||
      { echo "$0: Scoring failed. (ignore by '--skip-scoring true')"; exit 1; }
  fi
fi

exit 0;


================================================
FILE: egs/steps/decode_fromlats.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0

# Decode, limited to the word-sequences that were present in a set
# of lattices on disk.  The other lattices do not have to be built
# with the same tree or the same context size-- however, you do
# have to be using the same vocabulary (words.txt)-- if not you'd
# have to map the vocabulary somehow.

# Note: if the trees are identical, you can use gmm-rescore-lattice.

# Mechanism: create an unweighted acceptor (on words) for each utterance,
# compose that with G, determinize, and then use compile-train-graphs-fsts
# to compile a graph for each utterance, to decode with.  

# Begin configuration.
# NOTE: these are defaults.  parse_options.sh (sourced below) presumably lets
# each one be overridden as --<name> on the command line, as the usage text
# suggests.
cmd=run.pl # command used to launch the JOB=1:$nj parallel jobs.
maxactive=7000 # passed as --max-active to gmm-latgen-faster.
beam=20.0 # passed as --beam to gmm-latgen-faster.
lattice_beam=7.0 # passed as --lattice-beam to gmm-latgen-faster.
acwt=0.083333 # passed as --acoustic-scale to gmm-latgen-faster.
batch_size=75 # Limits memory blowup in compile-train-graphs-fsts
scale_opts="--transition-scale=1.0 --self-loop-scale=0.1" # passed to compile-train-graphs-fsts.
skip_scoring=false # if true, local/score.sh is not run.
# End configuration.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;


# Exactly four positional arguments are required; otherwise print the usage
# text and fail.  The usage text names this script (decode_fromlats.sh).
if [ $# != 4 ]; then
   echo "Usage: steps/decode_fromlats.sh [options] <data-dir> <lang> <old-decode-dir> <decode-dir>"
   echo "e.g.: steps/decode_fromlats.sh data/test_dev93 data/lang_test_tg exp/tri2b/decode_tgpr_dev93 exp/tri2a/decode_tgpr_dev93_fromlats"
   echo ""
   echo "main options (for others, see top of script file)"
   echo "  --config <config-file>                           # config containing options"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   exit 1;
fi


# Positional arguments: data dir, lang dir, directory holding the existing
# lattices, and the output decode dir.
data=$1
lang=$2
olddir=$3
dir=$4
srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory.

mkdir -p $dir/log

# The number of jobs is inherited from the old decode directory, so that
# lat.JOB.gz there lines up with the data split used here.
nj=`cat $olddir/num_jobs` || exit 1;
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
sdata=$data/split$nj
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj >$dir/num_jobs

# Check every file the decoding pipeline below reads, including G.fst,
# words.txt and phones/disambig.int, so that a missing input is reported here
# rather than inside the parallel jobs.
for f in $sdata/1/feats.scp $sdata/1/cmvn.scp $srcdir/final.mdl $olddir/lat.1.gz \
    $srcdir/tree $lang/L_disambig.fst $lang/phones.txt \
    $lang/G.fst $lang/words.txt $lang/phones/disambig.int; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done


# The presence of final.mat selects the LDA pipeline; otherwise deltas are used.
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"

case $feat_type in
  delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
  lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";;
  *) echo "Invalid feature type $feat_type" && exit 1;
esac


# Per job, in one pipeline handed to $cmd (the '\|' pipes are escaped so they
# are not interpreted by this shell):
#   lattice-to-fst on the old lattices -> fsttablecompose with the
#   output-projected, arc-sorted G.fst -> fstdeterminizestar ->
#   compile-train-graphs-fsts (per-utterance graphs, in batches of
#   $batch_size) -> gmm-latgen-faster, which reads the graphs from stdin
#   and writes $dir/lat.JOB.gz.
$cmd JOB=1:$nj $dir/log/decode_lats.JOB.log \
 lattice-to-fst "ark:gunzip -c $olddir/lat.JOB.gz|" ark:- \| \
  fsttablecompose "fstproject --project_output=true $lang/G.fst | fstarcsort |" ark:- ark:- \| \
  fstdeterminizestar ark:- ark:- \| \
  compile-train-graphs-fsts --read-disambig-syms=$lang/phones/disambig.int \
    --batch-size=$batch_size $scale_opts $srcdir/tree $srcdir/final.mdl $lang/L_disambig.fst ark:- ark:- \|  \
  gmm-latgen-faster --max-active=$maxactive --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \
    --allow-partial=true --word-symbol-table=$lang/words.txt \
    $srcdir/final.mdl ark:- "$feats" "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;

# Scoring, unless --skip-scoring true.  Here local/score.sh is given the lang
# directory as its second argument.  A missing or non-executable
# local/score.sh, or a failing scoring run, makes the script exit with status 1.
if ! $skip_scoring ; then
  [ ! -x local/score.sh ] && \
    echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
  local/score.sh --cmd "$cmd" $data $lang $dir ||
    { echo "$0: Scoring failed. (ignore by '--skip-scoring true')"; exit 1; }
fi

exit 0;


================================================
FILE: egs/steps/decode_lvtln.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
# Copyright 2014  Vimal Manohar

# Decoding script for LVTLN models.  Will estimate VTLN warping factors
# as a by-product, which can be used to extract VTLN-warped features.

# Begin configuration section
# NOTE: these are defaults.  parse_options.sh (sourced below) presumably lets
# each one be overridden as --<name> on the command line, as the usage text
# suggests.
stage=0 # numbered stages below this value are skipped.
acwt=0.083333 # acoustic scale for lattice generation and for lattice-to-post.
max_active=3000 # Have a smaller than normal max-active, to limit decoding time.
beam=13.0 # passed as --beam to gmm-latgen-faster.
lattice_beam=6.0 # passed as --lattice-beam to gmm-latgen-faster.
nj=4 # number of parallel jobs; also selects the data split $data/split$nj.
silence_weight=0.0 # weight argument given to weight-silence-post.
logdet_scale=0.0 # passed as --logdet-scale to gmm-est-lvtln-trans.
cmd=run.pl # command used to launch the JOB=1:$nj parallel jobs.
skip_scoring=false # if true, local/score.sh is not run.
num_threads=1 # if >1, will use gmm-latgen-faster-parallel
parallel_opts=  # ignored now.
scoring_opts= # extra options passed through to local/score.sh.
cleanup=true # if true, intermediate lattices/transforms/warps are removed at the end.
# End configuration section
echo "$0 $@"  # Print the command line for logging

# Pick up the environment from path.sh when it exists, then parse --options.
if [ -f ./path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1

# Three positional arguments are mandatory; anything else prints the usage
# text (one line per string) and fails.
if [ $# != 3 ]; then
  printf '%s\n' \
    "Wrong #arguments ($#, expected 3)" \
    "Usage: steps/decode_lvtln.sh [options] <graph-dir> <data-dir> <decode-dir>" \
    " e.g.: steps/decode_lvtln.sh exp/tri2d/graph_tgpr data/test_dev93 exp/tri2d/decode_dev93_tgpr" \
    "main options (for others, see top of script file)" \
    "  --config <config-file>                   # config containing options" \
    "  --nj <nj>                                # number of parallel jobs" \
    "  --cmd <cmd>                              # Command to run in parallel with" \
    "  --acwt <acoustic-weight>                 # default 0.08333 ... used to get posteriors" \
    "  --scoring-opts <opts>                    # options to local/score.sh"
  exit 1
fi


# Positional arguments: decoding graph dir, data dir, output decode dir.
graphdir=$1
data=$2
dir=$(echo $3 | sed 's:/$::g')  # drop a trailing slash, if there is one.

# The model directory is taken to be the parent of the decoding directory.
srcdir=$(dirname $dir)
sdata=$data/split$nj

# Suffix appended to the decoder binary name (plus a thread-count option)
# when more than one thread is requested.
if [ $num_threads -gt 1 ]; then
  thread_string="-parallel --num-threads=$num_threads"
else
  thread_string=
fi

# Refuse to run on data that already carries per-speaker warp factors.
if [ -f $data/spk2warp ]; then
  echo "$0: file $data/spk2warp exists.  This script expects non-VTLN features"
  exit 1
fi


mkdir -p $dir/log
split_data.sh $data $nj || exit 1
echo $nj > $dir/num_jobs

# Feature-pipeline options stored next to the model; a missing file simply
# yields an empty option string.
splice_opts=$(cat $srcdir/splice_opts 2>/dev/null)  # frame-splicing options.
cmvn_opts=$(cat $srcdir/cmvn_opts 2>/dev/null)

# Silence phone list handed to weight-silence-post; it must be readable.
silphonelist=$(cat $graphdir/phones/silence.csl) || exit 1

# Some checks.  Note: we don't need $srcdir/tree but we expect
# it should exist, given the current structure of the scripts.
for f in $graphdir/HCLG.fst $data/feats.scp $srcdir/tree $srcdir/final.mdl \
  $srcdir/final.alimdl $srcdir/final.lvtln; do
  if [ ! -f $f ]; then
    echo "$0: no such file $f" && exit 1
  fi
done

## Set up the unadapted features "$sifeats"
# The presence of final.mat selects the LDA pipeline; otherwise deltas are used.
if [ -f $srcdir/final.mat ]; then
  feat_type=lda
else
  feat_type=delta
fi
echo "$0: feature type is $feat_type"
if [ "$feat_type" = delta ]; then
  sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |"
elif [ "$feat_type" = lda ]; then
  sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
else
  echo "Invalid feature type $feat_type" && exit 1
fi


## Generate lattices.
# First pass: decode the unadapted features with the alignment model
# ($srcdir/final.alimdl) and write $dir/lat_pass1.JOB.gz.  If the graph
# records its number of pdfs, it must match that model.
if [ $stage -le 0 ]; then
  echo "$0: doing main lattice generation phase"
  if [ -f "$graphdir/num_pdfs" ]; then
    [ "`cat $graphdir/num_pdfs`" -eq `am-info --print-args=false $srcdir/final.alimdl | grep pdfs | awk '{print $NF}'` ] || \
      { echo "Mismatch in number of pdfs with $srcdir/final.alimdl"; exit 1; }
  fi
  $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode.JOB.log \
    gmm-latgen-faster$thread_string --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \
     --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \
    $srcdir/final.alimdl $graphdir/HCLG.fst "$sifeats" "ark:|gzip -c > $dir/lat_pass1.JOB.gz" \
    || exit 1;
fi


## Get the first-pass LVTLN transforms
# From the first-pass lattices, each job derives silence-weighted posteriors
# with the alignment model and estimates, with final.mdl and final.lvtln, a
# transform per speaker ($dir/trans_pass1.JOB) plus the warp factors in text
# form ($dir/warp_pass1.JOB).  The '\|' pipes are escaped so they are passed
# to $cmd rather than interpreted by this shell.
if [ $stage -le 1 ]; then
  echo "$0: getting first-pass LVTLN transforms."
  if [ -f "$graphdir/num_pdfs" ]; then
    [ "`cat $graphdir/num_pdfs`" -eq `am-info --print-args=false $srcdir/final.mdl | grep pdfs | awk '{print $NF}'` ] || \
      { echo "Mismatch in number of pdfs with $srcdir/final.mdl"; exit 1; }
  fi
  $cmd JOB=1:$nj $dir/log/lvtln_pass1.JOB.log \
    gunzip -c $dir/lat_pass1.JOB.gz \| \
    lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
    weight-silence-post $silence_weight $silphonelist $srcdir/final.alimdl ark:- ark:- \| \
    gmm-post-to-gpost $srcdir/final.alimdl "$sifeats" ark:- ark:- \| \
    gmm-est-lvtln-trans --logdet-scale=$logdet_scale --verbose=1 --spk2utt=ark:$sdata/JOB/spk2utt \
       $srcdir/final.mdl $srcdir/final.lvtln "$sifeats" ark,s,cs:- ark:$dir/trans_pass1.JOB \
       ark,t:$dir/warp_pass1.JOB || exit 1;
fi
##

# Unadapted features with the first-pass LVTLN transforms applied.
feats1="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans_pass1.JOB ark:- ark:- |"

## Do a second pass of estimating the LVTLN transform.

# Rescore the first-pass lattices with final.mdl on the first-pass-transformed
# features, writing $dir/lat_pass2.JOB.gz.  This stage logs to
# rescore_pass1.JOB.log so that its logs are not overwritten by the later
# rescoring pass, which writes rescore.JOB.log.
if [ $stage -le 3 ]; then
  echo "$0: rescoring the lattices with first-pass LVTLN transforms"
  $cmd --num-threads $num_threads JOB=1:$nj $dir/log/rescore_pass1.JOB.log \
    gmm-rescore-lattice $srcdir/final.mdl "ark:gunzip -c $dir/lat_pass1.JOB.gz|" "$feats1" \
     "ark:|gzip -c > $dir/lat_pass2.JOB.gz" || exit 1;
fi

# Re-estimate the LVTLN transforms from the rescored lattices.  The
# posteriors are computed with final.mdl on the first-pass-transformed
# features ($feats1), while gmm-est-lvtln-trans itself is given the unadapted
# features ($sifeats).  Outputs: $dir/trans.JOB and the text-form warp factors
# $dir/warp.JOB.
if [ $stage -le 4 ]; then
  echo "$0: re-estimating LVTLN transforms"
  $cmd JOB=1:$nj $dir/log/lvtln_pass2.JOB.log \
    gunzip -c $dir/lat_pass2.JOB.gz \| \
    lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
    weight-silence-post $silence_weight $silphonelist $srcdir/final.mdl ark:- ark:- \| \
    gmm-post-to-gpost $srcdir/final.mdl "$feats1" ark:- ark:- \| \
    gmm-est-lvtln-trans --logdet-scale=$logdet_scale --verbose=1 --spk2utt=ark:$sdata/JOB/spk2utt \
      $srcdir/final.mdl $srcdir/final.lvtln "$sifeats" ark,s,cs:- ark:$dir/trans.JOB \
      ark,t:$dir/warp.JOB || exit 1;
fi

# Unadapted features with the second-pass LVTLN transforms applied.
feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |"

# Final rescoring: rescore $dir/lat_pass2.JOB.gz with final.mdl on the
# second-pass-transformed features ($feats), producing the output lattices
# $dir/lat.JOB.gz.
if [ $stage -le 5 ]; then
  # This second rescoring is only really necessary for scoring purposes,
  # it does not affect the transforms.
  echo "$0: rescoring the lattices with second-pass LVTLN transforms"
  $cmd --num-threads $num_threads JOB=1:$nj $dir/log/rescore.JOB.log \
    gmm-rescore-lattice $srcdir/final.mdl "ark:gunzip -c $dir/lat_pass2.JOB.gz|" "$feats" \
     "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
fi

# Gather the per-job warp files of both passes into single files and report
# how many speakers' warp factors differ by more than 0.01 between passes.
if [ -f $dir/warp.1 ]; then
  for j in $(seq $nj); do
    cat $dir/warp_pass1.$j
  done > $dir/0.warp || exit 1;
  for j in $(seq $nj); do
    cat $dir/warp.$j
  done > $dir/final.warp || exit 1;

  # Both files must have the same number of lines (one per speaker).
  ns1=$(wc -l < $dir/0.warp)
  ns2=$(wc -l < $dir/final.warp)
  if [ "$ns1" != "$ns2" ]; then
    echo "$0: Number of speakers differ pass1 vs pass2, $ns1 != $ns2"
    exit 1;
  fi

  # After paste, fields 2 and 4 are the pass-1 and pass-2 values; keep the
  # lines whose absolute difference exceeds the threshold.
  paste $dir/0.warp $dir/final.warp | \
    awk '{x=$2 - $4; if ((x>0?x:-x) > 0.010001) { print $1, $2, $4; }}' > $dir/warp_changed
  nc=$(wc -l < $dir/warp_changed)
  echo "$0: For $nc speakers out of $ns1, warp changed pass1 vs pass2 by >0.01, see $dir/warp_changed for details"
fi

# Diagnostics: when gender labels are available, print the last few final
# warp factors separately for each gender.  This is skipped if
# $dir/final.warp does not exist (it is only written when $dir/warp.1 is
# present), because there would be nothing meaningful to report.
if [ -f $data/spk2gender ] && [ -f $dir/final.warp ]; then
  # To make it easier to eyeball the male and female speakers' warps
  # separately, separate them out.
  for g in m f; do # means: for gender in male female
    cat $dir/final.warp | \
      utils/filter_scp.pl <(grep -w $g $data/spk2gender | awk '{print $1}') > $dir/final.warp.$g
    echo -n "The last few warp factors for gender $g are: "
    tail -n 10 $dir/final.warp.$g | awk '{printf("%s ", $2);}';
    echo
  done
fi

# Score the final lattices unless --skip-scoring true was given; a missing
# or non-executable local/score.sh is treated as an error.
if ! $skip_scoring ; then
  if [ ! -x local/score.sh ]; then
    echo "$0: not scoring because local/score.sh does not exist or not executable."
    exit 1;
  fi
  local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir
fi

# Optional cleanup: remove the intermediate per-pass lattices, the first-pass
# transforms and the per-job warp files.  The final lattices ($dir/lat.JOB.gz)
# and final transforms ($dir/trans.JOB) do not match these patterns and are
# therefore kept.
if $cleanup; then
  rm $dir/lat_pass?.*.gz $dir/trans_pass1.* $dir/warp_pass1.* $dir/warp.*
fi


exit 0;


================================================
FILE: egs/steps/decode_nolats.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2014  Johns Hopkins University (Author: Daniel Povey)
#                      Vimal Manohar
# Apache 2.0

##Changes
# Vimal Manohar (Jan 2014):
# Added options to boost silence probabilities in the model before
# decoding. This can help in favoring the silence phones when 
# some silence regions are wrongly decoded as speech phones like glottal stops

# Begin configuration section.  
transform_dir=            # If set, trans.JOB from this directory is applied to the features
iter=                     # If set (and --model is not), decode with $srcdir/$iter.mdl
model= # You can specify the model to use (e.g. if you want to use the .alimdl)
boost_silence=1.0         # Boost silence pdfs in the model by this factor before decoding
silence_phones_list=      # List of silence phones that would be boosted before decoding
stage=0                   # The decoding step only runs if stage <= 0
nj=4                      # Number of parallel jobs
cmd=run.pl                # How to run jobs
max_active=7000           # Passed to gmm-decode-faster as --max-active
beam=13.0                 # Passed to gmm-decode-faster as --beam
lattice_beam=6.0          # NOTE(review): not referenced anywhere else in this script
acwt=0.083333 # note: only really affects pruning (scoring is on lattices).
write_alignments=false  # The output directory is treated like an alignment directory
write_words=true          # If true, words.JOB.gz is written to the output directory
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

# Without a silence-phone list there is nothing to boost, so reset the factor.
# The variable is quoted so that the test is well-formed whatever it contains.
[ -z "$silence_phones_list" ] && boost_silence=1.0

# Print usage and exit unless exactly three positional arguments were given.
if [ $# != 3 ]; then
   echo "Usage: $0 [options] <graph-dir> <data-dir> <decode-dir>"
   echo "... where <decode-dir> is assumed to be a sub-directory of the directory"
   echo " where the model is.  This version produces just linear output, no lattices"
   echo ""
   echo "e.g.: steps/decode_nolats.sh exp/mono/graph_tgpr data/test_dev93 exp/mono/decode_dev93_tgpr"
   echo ""
   echo "This script works on CMN + (delta+delta-delta | LDA+MLLT) features; it works out"
   echo "what type of features you used (assuming it's one of these two)"
   echo ""
   echo "main options (for others, see top of script file)"
   echo "  --config <config-file>                           # config containing options"
   echo "  --nj <nj>                                        # number of parallel jobs"
   echo "  --iter <iter>                                    # Iteration of model to test."
   echo "  --model <model>                                  # which model to use (e.g. to"
   echo "                                                   # specify the final.alimdl)"
   echo "  --write-alignments <true|false>                  # if true, output ali.*.gz"
   echo "  --write-words <true|false>                       # if true, output words.*.gz"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   echo "  --transform-dir <trans-dir>                      # dir to find fMLLR transforms "
   echo "  --acwt <float>                                   # acoustic scale used for decoding "
   echo "  --boost-silence <float>                          # factor by which to boost silence pdfs"
   echo "  --silence-phones-list <list>                     # silence phones to boost (required"
   echo "                                                   # for --boost-silence to take effect)"
   exit 1;
fi


# Positional arguments.
graphdir=$1
data=$2
dir=$3
srcdir=`dirname $dir`; # The model directory is one level up from decoding directory.
sdata=$data/split$nj;

mkdir -p $dir/log
# Split the data unless the split directory already exists and feats.scp is
# older than it.
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
# Record the number of jobs in the output directory.
echo $nj > $dir/num_jobs

# Choose the model: an explicit --model wins, then --iter, then final.mdl.
if [ -z "$model" ]; then # if --model <mdl> was not specified on the command line...
  if [ -z "$iter" ]; then
    model=$srcdir/final.mdl
  else
    model=$srcdir/$iter.mdl
  fi
fi

# All of these inputs must exist before anything is decoded.  Errors are
# reported under this script's own name ($0).
for f in $sdata/1/feats.scp $sdata/1/cmvn.scp $model $graphdir/HCLG.fst; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

# The presence of final.mat in the model directory selects LDA features;
# otherwise delta features are used.
if [ -f $srcdir/final.mat ]; then
  feat_type=lda
else
  feat_type=delta
fi
echo "decode.sh: feature type is $feat_type";

# Optional per-model feature options; each is left empty if its file is absent.
splice_opts=$(cat $srcdir/splice_opts 2>/dev/null)
cmvn_opts=$(cat $srcdir/cmvn_opts 2>/dev/null)
delta_opts=$(cat $srcdir/delta_opts 2>/dev/null)

utils/lang/check_phones_compatible.sh $graphdir/phones.txt $srcdir/phones.txt || exit 1;

# With --write-alignments true the output directory is populated like an
# alignment directory.  phones.txt, tree and final.mdl are mandatory; the
# remaining copies are not checked, and most have their errors silenced.
if $write_alignments; then
  # Copy model and options that are generally expected in an alignment
  # directory.
  cp $graphdir/phones.txt $dir || exit 1;

  cp $srcdir/{tree,final.mdl} $dir || exit 1;
  cp $srcdir/final.alimdl $dir 2>/dev/null
  cp $srcdir/final.occs $dir;
  cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options.
  cp $srcdir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option.
  cp $srcdir/delta_opts $dir 2>/dev/null
fi

# Build the feature pipeline string.  "JOB" is left as a literal placeholder
# inside the string; the decoding command below is launched through $cmd with
# JOB=1:$nj.
#   delta: apply-cmvn | add-deltas
#   lda:   apply-cmvn | splice-feats | transform-feats with $srcdir/final.mat
case $feat_type in
  delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";;
  lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
    # The LDA matrices also belong in an alignment directory.
    if $write_alignments; then
      cp $srcdir/final.mat $dir
      cp $srcdir/full.mat $dir 2>/dev/null
    fi
    ;;
  *) echo "Invalid feature type $feat_type" && exit 1;
esac
# Optionally append per-speaker transforms from --transform-dir to the
# feature pipeline.  The per-job archives trans.JOB only line up with our
# data split if the job counts agree, so both problems below are fatal
# instead of merely being reported.
if [ ! -z "$transform_dir" ]; then # add transforms to features...
  echo "Using fMLLR transforms from $transform_dir"
  if [ ! -f $transform_dir/trans.1 ]; then
    echo "Expected $transform_dir/trans.1 to exist."
    exit 1;
  fi
  # A missing num_jobs file yields an empty string, which also counts as a
  # mismatch.
  nj_transform=`cat $transform_dir/num_jobs 2>/dev/null`
  if [ "$nj_transform" != "$nj" ]; then
    echo "Mismatch in number of jobs with $transform_dir";
    exit 1;
  fi
  feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
fi

# Stage 0: run gmm-decode-faster for every job, writing the word sequences
# and/or alignments that were requested.
if [ $stage -le 0 ]; then
  # Outputs that were not requested are sent to /dev/null.
  if $write_alignments; then
    ali="ark:|gzip -c > $dir/ali.JOB.gz"
  else
    ali="ark:/dev/null"
  fi
  if $write_words; then
    words="ark:|gzip -c > $dir/words.JOB.gz"
  else
    words="ark:/dev/null"
  fi

  # If a silence-phone list was given, replace the model by a piped command
  # that boosts the silence pdfs on the fly.
  [ ! -z "$silence_phones_list" ]  && \
    model="gmm-boost-silence --boost=$boost_silence $silence_phones_list $model - |"

  if [ -f "$graphdir/num_pdfs" ]; then
    # "$model" has to be quoted here: after the boost above it is a piped
    # command containing spaces, and it must reach am-info as one argument.
    # Unquoted, it was split into several words and this check always failed.
    [ "`cat $graphdir/num_pdfs`" -eq `am-info --print-args=false "$model" | grep pdfs | awk '{print $NF}'` ] || \
      { echo "Mismatch in number of pdfs with $model"; exit 1; }
  fi
  $cmd JOB=1:$nj $dir/log/decode.JOB.log \
    gmm-decode-faster --max-active=$max_active --beam=$beam  \
    --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \
    "$model" $graphdir/HCLG.fst "$feats" "$words" "$ali" || exit 1;
fi

exit 0;


================================================
FILE: egs/steps/decode_raw_fmllr.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2013  Johns Hopkins University (Author: Daniel Povey)

# This decoding script is like decode_fmllr.sh, but it does the fMLLR on
# the raw cepstra, using the model in the LDA+MLLT space
# 
# Decoding script that does fMLLR.  This can be on top of delta+delta-delta, or
# LDA+MLLT features.

# There are 3 models involved potentially in this script,
# and for a standard, speaker-independent system they will all be the same.
# The "alignment model" is for the 1st-pass decoding and to get the 
# Gaussian-level alignments for the "adaptation model" the first time we
# do fMLLR.  The "adaptation model" is used to estimate fMLLR transforms
# and to generate state-level lattices.  The lattices are then rescored
# with the "final model".
#
# The following table explains where we get these 3 models from.
# Note: $srcdir is one level up from the decoding directory.
#
#   Model              Default source:                 
#
#  "alignment model"   $srcdir/final.alimdl              --alignment-model <model>
#                     (or $srcdir/final.mdl if alimdl absent)
#  "adaptation model"  $srcdir/final.mdl                 --adapt-model <model>
#  "final model"       $srcdir/final.mdl                 --final-model <model>


# Begin configuration section
first_beam=10.0 # Beam used in initial, speaker-indep. pass
first_max_active=2000 # max-active used in initial pass.
alignment_model= # "alignment model"; default is worked out further down.
adapt_model=     # "adaptation model"; defaults to $srcdir/final.mdl
final_model=     # "final model"; defaults to $srcdir/final.mdl
stage=0
acwt=0.083333 # Acoustic weight used in getting fMLLR transforms, and also in
              # lattice generation.
max_active=7000
use_normal_fmllr=false # If true, gmm-est-fmllr transforms ($dir/trans.JOB) are
                       # also estimated and applied on top of the raw-fMLLR features.
beam=13.0
lattice_beam=6.0
nj=4
silence_weight=0.01 # Passed to weight-silence-post.
cmd=run.pl
si_dir= # If set, the speaker-independent decoding pass is skipped and the
        # lattices in this directory are used instead.
num_threads=1 # if >1, will use gmm-latgen-faster-parallel
parallel_opts=  # ignored now.
skip_scoring=false
scoring_opts=
# End configuration section
echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

# Print usage and exit unless exactly three positional arguments were given.
# The usage text names this script rather than decode_fmllr.sh.
if [ $# != 3 ]; then
   echo "Wrong #arguments ($#, expected 3)"
   echo "Usage: steps/decode_raw_fmllr.sh [options] <graph-dir> <data-dir> <decode-dir>"
   echo " e.g.: steps/decode_raw_fmllr.sh exp/tri2b/graph_tgpr data/test_dev93 exp/tri2b/decode_dev93_tgpr"
   echo "main options (for others, see top of script file)"
   echo "  --config <config-file>                   # config containing options"
   echo "  --nj <nj>                                # number of parallel jobs"
   echo "  --cmd <cmd>                              # Command to run in parallel with"
   echo "  --adapt-model <adapt-mdl>                # Model to compute transforms with"
   echo "  --alignment-model <ali-mdl>              # Model to get Gaussian-level alignments for"
   echo "                                           # 1st pass of transform computation."
   echo "  --final-model <final-mdl>                # Model to finally decode with"
   echo "  --si-dir <speaker-indep-decoding-dir>    # use this to skip 1st pass of decoding"
   echo "                                           # Caution-- must be with same tree"
   echo "  --acwt <acoustic-weight>                 # default 0.08333 ... used to get posteriors"
   echo "  --use-normal-fmllr <true|false>          # also estimate normal fMLLR on top of"
   echo "                                           # the raw-fMLLR features; default false"
   echo "  --num-threads <n>                        # number of threads to use, default 1."
   echo "  --scoring-opts <opts>                    # options to local/score.sh"
   exit 1;
fi


# Positional arguments.
graphdir=$1
data=$2
dir=`echo $3 | sed 's:/$::g'` # remove any trailing slash.

srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory.
sdata=$data/split$nj;

# $thread_string is appended directly to binary names further down, turning
# e.g. gmm-latgen-faster into gmm-latgen-faster-parallel --num-threads=N.
thread_string=
[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads"


mkdir -p $dir/log
split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
# Dimension of the features in feats.scp; passed as --raw-feat-dim below.
raw_dim=$(feat-to-dim scp:$data/feats.scp -) || exit 1;
! [ "$raw_dim" -gt 0 ] && echo "raw feature dim not set" && exit 1;

silphonelist=`cat $graphdir/phones/silence.csl` || exit 1;

# Some checks.  Note: we don't need $srcdir/tree but we expect
# it should exist, given the current structure of the scripts.
for f in $graphdir/HCLG.fst $data/feats.scp $srcdir/tree; do
  if [ ! -f $f ]; then
    echo "$0: no such file $f"
    exit 1;
  fi
done

## Work out name of alignment model. ##
# Unless given on the command line, prefer final.alimdl and fall back to
# final.mdl when it is absent.
if [ -z "$alignment_model" ]; then
  if [ -f "$srcdir/final.alimdl" ]; then
    alignment_model=$srcdir/final.alimdl
  else
    alignment_model=$srcdir/final.mdl
  fi
fi
if [ ! -f "$alignment_model" ]; then
  echo "$0: no alignment model $alignment_model "
  exit 1;
fi
##

## Do the speaker-independent decoding, if --si-dir option not present. ##
# Stage 0 (only when --si-dir was not given): run steps/decode.sh with the
# alignment model and the first-pass beam / max-active settings, writing to
# ${dir}.si.  When the stage is skipped, $si_dir still points at that
# directory, so it must already exist from an earlier run.
if [ -z "$si_dir" ]; then # we need to do the speaker-independent decoding pass.
  si_dir=${dir}.si # Name it as our decoding dir, but with suffix ".si".
  if [ $stage -le 0 ]; then
    steps/decode.sh --scoring-opts "$scoring_opts" \
              --num-threads $num_threads --skip-scoring $skip_scoring \
              --acwt $acwt --nj $nj --cmd "$cmd" --beam $first_beam \
              --model $alignment_model --max-active \
              $first_max_active $graphdir $data $si_dir || exit 1;
  fi
fi
##

## Some checks, and setting of defaults for variables.
# The speaker-independent directory must use the same number of jobs and
# must contain lattices.
if [ "$nj" -ne "`cat $si_dir/num_jobs`" ]; then
  echo "Mismatch in #jobs with si-dir"
  exit 1;
fi
if [ ! -f "$si_dir/lat.1.gz" ]; then
  echo "No such file $si_dir/lat.1.gz"
  exit 1;
fi
# Both the adaptation model and the final model default to final.mdl.
if [ -z "$adapt_model" ]; then adapt_model=$srcdir/final.mdl; fi
if [ -z "$final_model" ]; then final_model=$srcdir/final.mdl; fi
for f in $adapt_model $final_model; do
  if [ ! -f $f ]; then
    echo "$0: no such file $f"
    exit 1;
  fi
done
##

# Both matrices are needed below (final.mat for the feature pipeline, and
# both for get-full-lda-mat), so stop here instead of failing later inside
# the parallel jobs.
if [[ ! -f $srcdir/final.mat || ! -f $srcdir/full.mat ]]; then
  echo "$0: we require final.mat and full.mat in the source directory $srcdir"
  exit 1;
fi

# Speaker-independent feature pipelines ("JOB" is a placeholder that is
# substituted per job when the strings are used in $cmd commands):
#   splicedfeats: apply-cmvn | splice-feats
#   sifeats:      splicedfeats | transform-feats with $srcdir/final.mat
splicedfeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- |"
sifeats="$splicedfeats transform-feats $srcdir/final.mat ark:- ark:- |"

# Piped input that builds a matrix from final.mat and full.mat; it is passed
# to the raw-fMLLR estimation programs below.
full_lda_mat="get-full-lda-mat --print-args=false $srcdir/final.mat $srcdir/full.mat -|"

##

## Now get the first-pass fMLLR transforms.
# Stage 1: posteriors from the speaker-independent lattices in $si_dir are
# silence-weighted and converted to Gaussian-level posteriors with
# $alignment_model on "$sifeats".  gmm-est-fmllr-raw-gpost then uses
# $adapt_model, the full LDA matrix and the spliced features, and writes the
# first-pass transforms to $dir/pre_trans.JOB.
if [ $stage -le 1 ]; then
  echo "$0: getting first-pass raw-fMLLR transforms."
  $cmd JOB=1:$nj $dir/log/fmllr_pass1.JOB.log \
    gunzip -c $si_dir/lat.JOB.gz \| \
    lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
    weight-silence-post $silence_weight $silphonelist $alignment_model ark:- ark:- \| \
    gmm-post-to-gpost $alignment_model "$sifeats" ark:- ark:- \| \
    gmm-est-fmllr-raw-gpost --raw-feat-dim=$raw_dim --spk2utt=ark:$sdata/JOB/spk2utt $adapt_model "$full_lda_mat" \
      "$splicedfeats" ark,s,cs:- ark:$dir/pre_trans.JOB || exit 1;
fi
##

# Features with the first-pass transform applied.  Note the order: the
# transform ($dir/pre_trans.JOB) is applied right after CMVN, before
# splicing; pass1feats additionally applies $srcdir/final.mat.
pass1splicedfeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/pre_trans.JOB ark:- ark:- | splice-feats $splice_opts ark:- ark:- |"
pass1feats="$pass1splicedfeats transform-feats $srcdir/final.mat ark:- ark:- |"

## Do the main lattice generation pass.  Note: we don't determinize the lattices at
## this stage, as we're going to use them in acoustic rescoring with the larger 
## model, and it's more correct to store the full state-level lattice for this purpose.
# Stage 2: decode "$pass1feats" with $adapt_model and write undeterminized
# lattices to the temporary files $dir/lat.tmp.JOB.gz.  $thread_string is
# appended to the binary name, so the -parallel variant is used when
# --num-threads is greater than 1.
if [ $stage -le 2 ]; then
  echo "$0: doing main lattice generation phase"
  $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode.JOB.log \
    gmm-latgen-faster$thread_string --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \
    --acoustic-scale=$acwt --determinize-lattice=false \
    --allow-partial=true --word-symbol-table=$graphdir/words.txt \
    $adapt_model $graphdir/HCLG.fst "$pass1feats" "ark:|gzip -c > $dir/lat.tmp.JOB.gz" \
    || exit 1;
fi
##

## Do a second pass of estimating the transform-- this time with the lattices
## generated in the main decoding pass above (with the adaptation model).
## Compose the transforms to get $dir/raw_trans.1, etc.
# Stage 3: posteriors from the determinized, pruned temporary lattices are
# silence-weighted and used by gmm-est-fmllr-raw on "$pass1splicedfeats",
# giving $dir/trans_tmp.JOB.  The quoted '&&' is passed through to $cmd, so
# within each job compose-transforms only runs if the estimation succeeded;
# it combines trans_tmp.JOB with pre_trans.JOB into $dir/raw_trans.JOB.
if [ $stage -le 3 ]; then
  echo "$0: estimating raw-fMLLR transforms a second time."
  $cmd JOB=1:$nj $dir/log/fmllr_pass2.JOB.log \
    lattice-determinize-pruned --acoustic-scale=$acwt --beam=4.0 \
    "ark:gunzip -c $dir/lat.tmp.JOB.gz|" ark:- \| \
    lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
    weight-silence-post $silence_weight $silphonelist $adapt_model ark:- ark:- \| \
    gmm-est-fmllr-raw --raw-feat-dim=$raw_dim --spk2utt=ark:$sdata/JOB/spk2utt \
     $adapt_model "$full_lda_mat" "$pass1splicedfeats" ark,s,cs:- ark:$dir/trans_tmp.JOB '&&' \
    compose-transforms --b-is-affine=true ark:$dir/trans_tmp.JOB ark:$dir/pre_trans.JOB \
    ark:$dir/raw_trans.JOB  || exit 1;
fi
##

# Final feature pipeline: apply-cmvn | composed raw transform
# ($dir/raw_trans.JOB) | splice-feats | transform-feats with final.mat.
feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/raw_trans.JOB ark:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"

# Stage 4 (only with --use-normal-fmllr true): rescore the temporary lattices
# with $final_model on "$feats", derive silence-weighted posteriors and
# estimate transforms with gmm-est-fmllr, written to $dir/trans.JOB.
if [ $stage -le 4 ] && $use_normal_fmllr; then
  echo "$0: estimating normal fMLLR transforms"
  $cmd JOB=1:$nj $dir/log/fmllr_pass3.JOB.log \
    gmm-rescore-lattice $final_model "ark:gunzip -c $dir/lat.tmp.JOB.gz|" "$feats" ark:- \| \
    lattice-determinize-pruned --acoustic-scale=$acwt --beam=4.0 ark:- ark:- \| \
    lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
    weight-silence-post $silence_weight $silphonelist $adapt_model ark:- ark:- \| \
    gmm-est-fmllr --spk2utt=ark:$sdata/JOB/spk2utt \
     $adapt_model "$feats" ark,s,cs:- ark:$dir/trans.JOB || exit 1;
fi

# The extra transforms are appended to the pipeline whether or not stage 4
# ran in this invocation, so $dir/trans.JOB must exist when the option is on.
if $use_normal_fmllr; then
  feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |"
fi

# Rescore the state-level lattices with the final adapted features, and the final model
# (which by default is $srcdir/final.mdl, but which may be specified on the command line,
# useful in case of discriminatively trained systems).
# At this point we prune and determinize the lattices and write them out, ready for 
# language model rescoring.

# Stage 5: rescore the temporary lattices with $final_model on the final
# features, determinize and prune them with $lattice_beam, and write
# $dir/lat.JOB.gz.  The quoted '&&' is passed through to $cmd, so each job
# deletes its temporary lattice only after the pipeline succeeded.
if [ $stage -le 5 ]; then
  echo "$0: doing a final pass of acoustic rescoring."
  $cmd --num-threads $num_threads JOB=1:$nj $dir/log/acoustic_rescore.JOB.log \
    gmm-rescore-lattice $final_model "ark:gunzip -c $dir/lat.tmp.JOB.gz|" "$feats" ark:- \| \
    lattice-determinize-pruned$thread_string --acoustic-scale=$acwt --beam=$lattice_beam ark:- \
    "ark:|gzip -c > $dir/lat.JOB.gz" '&&' rm $dir/lat.tmp.JOB.gz || exit 1;
fi

# Score the final lattices unless scoring was disabled; a missing or
# non-executable local/score.sh is treated as an error.
if ! $skip_scoring ; then
  if [ ! -x local/score.sh ]; then
    echo "$0: not scoring because local/score.sh does not exist or not executable."
    exit 1;
  fi
  local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir
fi

#rm $dir/{trans_tmp,pre_trans}.*

exit 0;


================================================
FILE: egs/steps/decode_sgmm2.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.

# This script does decoding with an SGMM system, with speaker vectors.
# If the SGMM system was
# built on top of fMLLR transforms from a conventional system, you should
# provide the --transform-dir option.

# Begin configuration section.
stage=1
alignment_model=  # Model for the first decoding pass.  If empty, the script
                  # uses $srcdir/final.alimdl when present, else final.mdl.
                  # Declared here so that --alignment-model is accepted and a
                  # stray environment variable cannot leak in.
transform_dir=    # dir to find fMLLR transforms.
nj=4 # number of decoding jobs.
acwt=0.1  # Just a default value, used for adaptation and beam-pruning..
cmd=run.pl
beam=13.0
gselect=15  # Number of Gaussian-selection indices for SGMMs.  [Note:
            # the first_pass_gselect variable is used for the 1st pass of
            # decoding and can be tighter.]
first_pass_gselect=3 # Use a smaller number of Gaussian-selection indices in
            # the 1st pass of decoding (lattice generation).
max_active=7000
max_mem=50000000
#WARNING: This option is renamed lattice_beam (it was renamed to follow the naming
#         in the other scripts).
lattice_beam=6.0 # Beam we use in lattice generation.
vecs_beam=4.0 # Beam we use to prune lattices while getting posteriors for
    # speaker-vector computation.  Can be quite tight (actually we could
    # probably just do best-path).
use_fmllr=false
fmllr_iters=10
fmllr_min_count=1000
num_threads=1 # if >1, will use gmm-latgen-faster-parallel
parallel_opts=  # ignored now.
skip_scoring=false
scoring_opts=
# note: there are no more min-lmwt and max-lmwt options, instead use
# e.g. --scoring-opts "--min-lmwt 1 --max-lmwt 20"
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

# Print usage and exit unless exactly three positional arguments were given.
if [ $# -ne 3 ]; then
  echo "Usage: steps/decode_sgmm2.sh [options] <graph-dir> <data-dir> <decode-dir>"
  echo " e.g.: steps/decode_sgmm2.sh --transform-dir exp/tri3b/decode_dev93_tgpr \\"
  echo "      exp/sgmm3a/graph_tgpr data/test_dev93 exp/sgmm3a/decode_dev93_tgpr"
  echo "main options (for others, see top of script file)"
  echo "  --transform-dir <decoding-dir>           # directory of previous decoding"
  echo "                                           # where we can find transforms for SAT systems."
  echo "  --config <config-file>                   # config containing options"
  echo "  --nj <nj>                                # number of parallel jobs"
  echo "  --cmd <cmd>                              # Command to run in parallel with"
  echo "  --beam <beam>                            # Decoding beam; default 13.0"
  exit 1;
fi

# Positional arguments.
graphdir=$1
data=$2
dir=$3
srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory.

# Required inputs.
for f in $graphdir/HCLG.fst $data/feats.scp $srcdir/final.mdl; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

sdata=$data/split$nj;
silphonelist=`cat $graphdir/phones/silence.csl` || exit 1
# Option strings that read the Gaussian-selection info written in stage 1;
# the first-pass variant pipes it through copy-gselect with
# --n=$first_pass_gselect.
gselect_opt="--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|"
gselect_opt_1stpass="$gselect_opt copy-gselect --n=$first_pass_gselect ark:- ark:- |"

mkdir -p $dir/log
# Split the data unless the split directory already exists and feats.scp is
# older than it.
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
# $thread_string is appended directly to binary names further down to select
# their -parallel variants when more than one thread is requested.
thread_string=
[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads"

## Set up features.
# The presence of final.mat in the model directory selects LDA features;
# otherwise delta features are used.
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"

# Base feature pipeline; "JOB" is a placeholder substituted per job.
case $feat_type in
  delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
  lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
    ;;
  *) echo "$0: invalid feature type $feat_type" && exit 1;
esac
# With --transform-dir, either append trans.JOB to the pipeline, or (when only
# raw_trans.JOB exists) rebuild the pipeline with the raw transform applied
# right after CMVN, before splicing and final.mat.
if [ ! -z "$transform_dir" ]; then
  [ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \
    && echo "$0: #jobs mismatch with transform-dir." && exit 1;
  if [ -f $transform_dir/trans.1 ]; then
    echo "$0: using transforms from $transform_dir"
    feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |"
  elif [ -f $transform_dir/raw_trans.1 ]; then
    feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/raw_trans.JOB ark:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
  else
    echo "$0: no such file $transform_dir/trans.1 or $transform_dir/raw_trans.1, invalid --transform-dir option?"
    exit 1;
  fi
# No --transform-dir: warn if a training log mentions per-speaker transforms.
# NOTE(review): grep is run without -q, so the matching log line is also
# printed to stdout.
elif grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then
  echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms,"
  echo "  but you are not providing the --transform-dir option in test time."
fi
##

## Save Gaussian-selection info to disk.
# Note: we can use final.mdl regardless of whether there is an alignment model--
# they use the same UBM.

# Stage 1: run sgmm2-gselect with $srcdir/final.mdl on the features and write
# the result to $dir/gselect.JOB.gz, which "$gselect_opt" reads back later.
if [ $stage -le 1 ]; then
  $cmd JOB=1:$nj $dir/log/gselect.JOB.log \
    sgmm2-gselect --full-gmm-nbest=$gselect $srcdir/final.mdl \
    "$feats" "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1;
fi

## Work out name of alignment model. ##
# If no alignment model has been set, prefer final.alimdl and fall back to
# final.mdl when it is absent.
if [ -z "$alignment_model" ]; then
  if [ -f "$srcdir/final.alimdl" ]; then
    alignment_model=$srcdir/final.alimdl
  else
    alignment_model=$srcdir/final.mdl
  fi
fi
if [ ! -f "$alignment_model" ]; then
  echo "$0: no alignment model $alignment_model "
  exit 1;
fi


# Generate state-level lattice which we can rescore.  This is done with the alignment
# model and no speaker-vectors.
# Stage 2: first-pass lattice generation with $alignment_model and the
# first-pass Gaussian selection; undeterminized lattices are written to
# $dir/pre_lat.JOB.gz.
if [ $stage -le 2 ]; then
  # If the graph directory records its number of pdfs, it must match the
  # number reported by am-info for the model.
  if [ -f "$graphdir/num_pdfs" ]; then
    [ "`cat $graphdir/num_pdfs`" -eq `am-info --print-args=false $alignment_model | grep pdfs | awk '{print $NF}'` ] || \
      { echo "Mismatch in number of pdfs with $alignment_model"; exit 1; }
  fi
  $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode_pass1.JOB.log \
    sgmm2-latgen-faster$thread_string --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \
    --acoustic-scale=$acwt --determinize-lattice=false --allow-partial=true \
    --word-symbol-table=$graphdir/words.txt --max-mem=$max_mem "$gselect_opt_1stpass" $alignment_model \
    $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/pre_lat.JOB.gz" || exit 1;
fi

# Estimate speaker vectors (1st pass).  Prune before determinizing
# because determinization can take a while on un-pruned lattices.
# Note: the sgmm2-post-to-gpost stage is necessary because we have
# a separate alignment-model and final model, otherwise we'd skip it
# and use sgmm2-est-spkvecs.
# Stage 3: first-pass speaker vectors.  The first-pass lattices are pruned
# and determinized with $vecs_beam, turned into posteriors with silence
# weighted by 0.0, converted to Gaussian-level posteriors with
# $alignment_model, and passed to sgmm2-est-spkvecs-gpost with
# $srcdir/final.mdl.  Output: $dir/pre_vecs.JOB.
if [ $stage -le 3 ]; then
  $cmd JOB=1:$nj $dir/log/vecs_pass1.JOB.log \
    gunzip -c $dir/pre_lat.JOB.gz \| \
    lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
    lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
    lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
    weight-silence-post 0.0 $silphonelist $alignment_model ark:- ark:- \| \
    sgmm2-post-to-gpost "$gselect_opt" $alignment_model "$feats" ark:- ark:- \| \
    sgmm2-est-spkvecs-gpost --spk2utt=ark:$sdata/JOB/spk2utt \
     $srcdir/final.mdl "$feats" ark,s,cs:- "ark:$dir/pre_vecs.JOB" || exit 1;
fi

# Estimate speaker vectors (2nd pass).  Since we already have spk vectors,
# at this point we need to rescore the lattice to get the correct posteriors.
# Stage 4: second-pass speaker vectors.  The first-pass lattices are rescored
# with final.mdl and the first-pass vectors ($dir/pre_vecs.JOB), then pruned,
# determinized and converted to silence-weighted posteriors, from which
# sgmm2-est-spkvecs writes the final vectors to $dir/vecs.JOB.
if [ $stage -le 4 ]; then
  $cmd JOB=1:$nj $dir/log/vecs_pass2.JOB.log \
    gunzip -c $dir/pre_lat.JOB.gz \| \
    sgmm2-rescore-lattice --speedup=true --spk-vecs=ark:$dir/pre_vecs.JOB \
           --utt2spk=ark:$sdata/JOB/utt2spk \
      "$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \
    lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
    lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
    lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
    weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \
    sgmm2-est-spkvecs --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" --spk-vecs=ark:$dir/pre_vecs.JOB \
     $srcdir/final.mdl "$feats" ark,s,cs:- "ark:$dir/vecs.JOB" || exit 1;
fi
# The first-pass vectors are no longer needed.  -f keeps this quiet when the
# script is restarted at a later stage and they have already been removed.
rm -f $dir/pre_vecs.*

# Optional fMLLR (--use-fmllr true).  Stage 5 estimates the transforms into
# $dir/trans.JOB; they are appended to the feature pipeline whether or not
# stage 5 ran in this invocation.
if $use_fmllr; then
  # Estimate fMLLR transforms (note: these may be on top of any
  # fMLLR transforms estimated with the baseline GMM system.
  if [ $stage -le 5 ]; then # compute fMLLR transforms.
    echo "$0: computing fMLLR transforms."
    # (Re)create final.fmllr_mdl if it is missing or older than final.mdl.
    if [ ! -f $srcdir/final.fmllr_mdl ] || [ $srcdir/final.fmllr_mdl -ot $srcdir/final.mdl ]; then
      echo "$0: computing pre-transform for fMLLR computation."
      sgmm2-comp-prexform $srcdir/final.mdl $srcdir/final.occs $srcdir/final.fmllr_mdl || exit 1;
    fi
    $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \
      gunzip -c $dir/pre_lat.JOB.gz \| \
      sgmm2-rescore-lattice --speedup=true --spk-vecs=ark:$dir/vecs.JOB \
        --utt2spk=ark:$sdata/JOB/utt2spk \
      "$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \
      lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
      lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
      lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
      weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \
      sgmm2-est-fmllr --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" --spk-vecs=ark:$dir/vecs.JOB \
       --fmllr-iters=$fmllr_iters --fmllr-min-count=$fmllr_min_count \
      $srcdir/final.fmllr_mdl "$feats" ark,s,cs:- "ark:$dir/trans.JOB" || exit 1;
  fi
  feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |"
fi

# Now rescore the state-level lattices with the adapted features and the
# corresponding model.  Prune and determinize the lattices to limit
# their size.
# Stage 6: rescore the first-pass lattices with final.mdl, the final speaker
# vectors and the (possibly fMLLR-adapted) features, then determinize and
# prune with $lattice_beam, writing $dir/lat.JOB.gz.
if [ $stage -le 6 ]; then
  $cmd --num-threads $num_threads JOB=1:$nj $dir/log/rescore.JOB.log \
    sgmm2-rescore-lattice "$gselect_opt" --utt2spk=ark:$sdata/JOB/utt2spk --spk-vecs=ark:$dir/vecs.JOB \
    $srcdir/final.mdl "ark:gunzip -c $dir/pre_lat.JOB.gz|" "$feats" ark:- \| \
    lattice-determinize-pruned$thread_string --acoustic-scale=$acwt --beam=$lattice_beam ark:- \
    "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
fi
# The first-pass lattices are no longer needed.  -f keeps this quiet when the
# script is restarted at a later stage and they have already been removed.
rm -f $dir/pre_lat.*.gz


# Stage 7: lattice diagnostics.
if [ $stage -le 7 ]; then
  steps/diagnostic/analyze_lats.sh --cmd "$cmd" $graphdir $dir
fi

# Stage 8: scoring, unless disabled; a missing or non-executable
# local/score.sh is treated as an error.
if [ $stage -le 8 ] && ! $skip_scoring ; then
  if [ ! -x local/score.sh ]; then
    echo "Not scoring because local/score.sh does not exist or not executable."
    exit 1;
  fi
  local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir
fi

exit 0;


================================================
FILE: egs/steps/decode_sgmm2_fromlats.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2013  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.

# This script does decoding with an SGMM2 system, with speaker vectors.  If the
# SGMM2 system was built on top of fMLLR transforms from a conventional system,
# you should provide the --transform-dir option.

# This script does not use a decoding graph, but instead you provide
# a previous decoding directory with lattices in it.  This script will only
# make use of the word sequences in the lattices; it limits the decoding
# to those sequences.  You should also provide a "lang" directory from
# which this script will use the G.fst and L.fst.

# Begin configuration section.
stage=1
alignment_model=  # Model for the first-pass decoding.
transform_dir=    # dir to find fMLLR transforms.
acwt=0.08333  # Just a default value, used for adaptation and beam-pruning..
batch_size=75 # Limits memory blowup in compile-train-graphs-fsts
cmd=run.pl
beam=20.0     # Decoding beam.
gselect=15  # Number of Gaussian-selection indices for SGMMs.  [Note:
            # the first_pass_gselect variable is used for the 1st pass of
            # decoding and can be tighter.]
first_pass_gselect=3 # Use a smaller number of Gaussian-selection indices in
            # the 1st pass of decoding (lattice generation).
max_active=7000
lattice_beam=8.0 # Beam we use in lattice generation.
vecs_beam=4.0 # Beam we use to prune lattices while getting posteriors for
    # speaker-vector computation.  Can be quite tight (actually we could
    # probably just do best-path).
use_fmllr=false
fmllr_iters=10
fmllr_min_count=1000
scale_opts="--transition-scale=1.0 --self-loop-scale=0.1"
skip_scoring=false
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

# Print usage and exit unless exactly four positional arguments were given.
# The documented beam default matches the value set in the configuration
# section of this script (20.0).
if [ $# -ne 4 ]; then
  echo "Usage: steps/decode_sgmm2_fromlats.sh [options] <data-dir> <lang-dir> <old-decode-dir> <decode-dir>"
  echo ""
  echo "main options (for others, see top of script file)"
  echo "  --transform-dir <decoding-dir>           # directory of previous decoding"
  echo "                                           # where we can find transforms for SAT systems."
  echo "  --alignment-model <ali-mdl>              # Model for the first-pass decoding."
  echo "  --config <config-file>                   # config containing options"
  echo "  --cmd <cmd>                              # Command to run in parallel with"
  echo "  --beam <beam>                            # Decoding beam; default 20.0"
  exit 1;
fi

# Positional arguments, in the order given in the usage message.
data=$1
lang=$2
olddir=$3
dir=$4
# The model files (final.mdl, tree, ...) are read from the parent of the
# decode directory.
srcdir=`dirname $dir`

# Fail early if any required input is missing.
for f in $data/feats.scp $lang/G.fst $lang/L_disambig.fst $lang/phones/disambig.int \
    $srcdir/final.mdl $srcdir/tree $olddir/lat.1.gz; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

# Reuse the job count of the old decode directory, since its lat.JOB.gz files
# are read job-by-job below.
nj=`cat $olddir/num_jobs` || exit 1;
sdata=$data/split$nj;
silphonelist=`cat $lang/phones/silence.csl` || exit 1
# splice_opts and cmvn_opts are optional; a missing file just yields an empty
# string (the error message is discarded).
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
# Gaussian-selection option strings; "JOB" is a placeholder that is expected to
# be substituted per job by $cmd.  The first-pass variant pipes the same
# archive through copy-gselect --n=$first_pass_gselect.
gselect_opt="--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|"
gselect_opt_1stpass="$gselect_opt copy-gselect --n=$first_pass_gselect ark:- ark:- |"

mkdir -p $dir/log
# Split the data unless the split directory exists and is newer than feats.scp.
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs


## Set up features

if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"
if [ -z "$transform_dir" ] && [ -f $olddir/trans.1 ]; then
  transform_dir=$olddir
fi

case $feat_type in
  delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
  lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
    ;;
  *) echo "$0: invalid feature type $feat_type" && exit 1;
esac
if [ ! -z "$transform_dir" ]; then
  echo "$0: using transforms from $transform_dir"
  [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1;
  [ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \
    && echo "$0: #jobs mismatch with transform-dir." && exit 1;
  feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |"
elif grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then
  echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms,"
  echo "  but you are not providing the --transform-dir option in test time."
fi

## Calculate FMLLR pre-transforms if needed. We are doing this here since this
## step is required by models both with and without speaker vectors.
if $use_fmllr; then
  if [ ! -f $srcdir/final.fmllr_mdl ] || [ $srcdir/final.fmllr_mdl -ot $srcdir/final.mdl ]; then
    echo "$0: computing pre-transform for fMLLR computation."
    sgmm2-comp-prexform $srcdir/final.mdl $srcdir/final.occs $srcdir/final.fmllr_mdl || exit 1;
  fi
fi

## Save Gaussian-selection info to disk.
# Note: we can use final.mdl regardless of whether there is an alignment model--
# they use the same UBM.
if [ $stage -le 1 ]; then
  $cmd JOB=1:$nj $dir/log/gselect.JOB.log \
    sgmm2-gselect --full-gmm-nbest=$gselect $srcdir/final.mdl \
    "$feats" "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1;
fi

## Work out name of alignment model. ##
if [ -z "$alignment_model" ]; then
  if [ -f "$srcdir/final.alimdl" ]; then alignment_model=$srcdir/final.alimdl;
  else alignment_model=$srcdir/final.mdl; fi
fi
[ ! -f "$alignment_model" ] && echo "$0: no alignment model $alignment_model " && exit 1;

# Generate state-level lattice which we can rescore.  This is done with the
# alignment model and no speaker-vectors.
if [ $stage -le 2 ]; then
  $cmd JOB=1:$nj $dir/log/decode_pass1.JOB.log \
    lattice-to-fst "ark:gunzip -c $olddir/lat.JOB.gz|" ark:- \| \
    fsttablecompose "fstproject --project_output=true $lang/G.fst | fstarcsort |" ark:- ark:- \| \
    fstdeterminizestar ark:- ark:- \| \
    compile-train-graphs-fsts --read-disambig-syms=$lang/phones/disambig.int \
      --batch-size=$batch_size $scale_opts \
      $srcdir/tree $srcdir/final.mdl $lang/L_disambig.fst ark:- ark:- \| \
    sgmm2-latgen-faster --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \
      --acoustic-scale=$acwt --determinize-lattice=false --allow-partial=true \
      --word-symbol-table=$lang/words.txt "$gselect_opt_1stpass" $alignment_model \
      "ark:-" "$feats" "ark:|gzip -c > $dir/pre_lat.JOB.gz" || exit 1;
fi

## Check if the model has speaker vectors
spkdim=`sgmm2-info $srcdir/final.mdl | grep 'speaker vector' | awk '{print $NF}'`

if [ $spkdim -gt 0 ]; then  ### For models with speaker vectors:

# Estimate speaker vectors (1st pass).  Prune before determinizing
# because determinization can take a while on un-pruned lattices.
# Note: the sgmm2-post-to-gpost stage is necessary because we have
# a separate alignment-model and final model, otherwise we'd skip it
# and use sgmm2-est-spkvecs.
  if [ $stage -le 3 ]; then
    $cmd JOB=1:$nj $dir/log/vecs_pass1.JOB.log \
      gunzip -c $dir/pre_lat.JOB.gz \| \
      lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
      lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
      lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
      weight-silence-post 0.0 $silphonelist $alignment_model ark:- ark:- \| \
      sgmm2-post-to-gpost "$gselect_opt" $alignment_model "$feats" ark:- ark:- \| \
      sgmm2-est-spkvecs-gpost --spk2utt=ark:$sdata/JOB/spk2utt \
        $srcdir/final.mdl "$feats" ark,s,cs:- "ark:$dir/pre_vecs.JOB" || exit 1;
  fi

# Estimate speaker vectors (2nd pass).  Since we already have spk vectors,
# at this point we need to rescore the lattice to get the correct posteriors.
  if [ $stage -le 4 ]; then
    $cmd JOB=1:$nj $dir/log/vecs_pass2.JOB.log \
      gunzip -c $dir/pre_lat.JOB.gz \| \
      sgmm2-rescore-lattice --spk-vecs=ark:$dir/pre_vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk \
        "$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \
      lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
      lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
      lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
      weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \
      sgmm2-est-spkvecs --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" --spk-vecs=ark:$dir/pre_vecs.JOB \
        $srcdir/final.mdl "$feats" ark,s,cs:- "ark:$dir/vecs.JOB" || exit 1;
  fi
  rm $dir/pre_vecs.*

  if $use_fmllr; then
    # Estimate fMLLR transforms (note: these may be on top of any
    # fMLLR transforms estimated with the baseline GMM system.
    if [ $stage -le 5 ]; then # compute fMLLR transforms.
      echo "$0: computing fMLLR transforms."
      $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \
        gunzip -c $dir/pre_lat.JOB.gz \| \
        sgmm2-rescore-lattice --spk-vecs=ark:$dir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk \
          "$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \
        lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
        lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
        lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
        weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \
        sgmm2-est-fmllr --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" --spk-vecs=ark:$dir/vecs.JOB \
          --fmllr-iters=$fmllr_iters --fmllr-min-count=$fmllr_min_count \
          $srcdir/final.fmllr_mdl "$feats" ark,s,cs:- "ark:$dir/trans.JOB" || exit 1;
    fi
    feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |"
  fi

# Now rescore the state-level lattices with the adapted features and the
# corresponding model.  Prune and determinize the lattices to limit
# their size.
  if [ $stage -le 6 ]; then
    $cmd JOB=1:$nj $dir/log/rescore.JOB.log \
      sgmm2-rescore-lattice "$gselect_opt" --utt2spk=ark:$sdata/JOB/utt2spk --spk-vecs=ark:$dir/vecs.JOB \
        $srcdir/final.mdl "ark:gunzip -c $dir/pre_lat.JOB.gz|" "$feats" ark:- \| \
      lattice-determinize-pruned --acoustic-scale=$acwt --beam=$lattice_beam ark:- \
        "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
  fi
  rm $dir/pre_lat.*.gz

else  ### For models without speaker vectors:

  if $use_fmllr; then
    # Estimate fMLLR transforms (note: these may be on top of any
    # fMLLR transforms estimated with the baseline GMM system.
    if [ $stage -le 5 ]; then # compute fMLLR transforms.
      echo "$0: computing fMLLR transforms."
      $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \
        gunzip -c $dir/pre_lat.JOB.gz \| \
        sgmm2-rescore-lattice --utt2spk=ark:$sdata/JOB/utt2spk \
        "$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \
        lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
        lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
        lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
        weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \
        sgmm2-est-fmllr --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" \
        --fmllr-iters=$fmllr_iters --fmllr-min-count=$fmllr_min_count \
        $srcdir/final.fmllr_mdl "$feats" ark,s,cs:- "ark:$dir/trans.JOB" || exit 1;
    fi
    feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |"
  fi

# Now rescore the state-level lattices with the adapted features and the
# corresponding model.  Prune and determinize the lattices to limit
# their size.
  if [ $stage -le 6 ] && $use_fmllr; then
    $cmd JOB=1:$nj $dir/log/rescore.JOB.log \
      sgmm2-rescore-lattice "$gselect_opt" --utt2spk=ark:$sdata/JOB/utt2spk \
        $srcdir/final.mdl "ark:gunzip -c $dir/pre_lat.JOB.gz|" "$feats" ark:- \| \
      lattice-determinize-pruned --acoustic-scale=$acwt --beam=$lattice_beam ark:- \
        "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
    rm $dir/pre_lat.*.gz
  else  # Already done with decoding if no adaptation needed.
    for n in `seq 1 $nj`; do
      mv $dir/pre_lat.${n}.gz $dir/lat.${n}.gz
    done
  fi

fi

# The output of this script is the files "lat.*.gz"-- we'll rescore this at
# different acoustic scales to get the final output.


if [ $stage -le 7 ]; then
  if ! $skip_scoring ; then
    [ ! -x local/score.sh ] && \
      echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
    echo "score best paths"
    local/score.sh --cmd "$cmd" $data $lang $dir ||
      { echo "$0: Scoring failed. (ignore by '--skip-scoring true')"; exit 1; }
  fi
fi
echo "Decoding done."
exit 0;


================================================
FILE: egs/steps/decode_sgmm2_rescore.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.

# This script does decoding with an SGMM system, by rescoring lattices
# generated from a previous SGMM system.  The directory with the lattices
# is assumed to contain speaker vectors, if used.  Basically it rescores
# the lattices one final time, using the same setup as the final decoding
# pass of the source dir.  The assumption is that the model may have
# been discriminatively trained.

# If the system was built on top of fMLLR transforms from a conventional system,
# you should provide the --transform-dir option.

# Begin configuration section.
transform_dir=    # dir to find fMLLR transforms.
cmd=run.pl
iter=final
skip_scoring=false
scoring_opts=
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

if [ $# -ne 4 ]; then
  echo "Usage: steps/decode_sgmm2_rescore.sh [options] <graph-dir|lang-dir> <data-dir> <old-decode-dir> <decode-dir>"
  echo " e.g.: steps/decode_sgmm2_rescore.sh --transform-dir exp/tri3b/decode_dev93_tgpr \\"
  echo "      exp/sgmm3a/graph_tgpr data/test_dev93 exp/sgmm3a/decode_dev93_tgpr exp/sgmm3a_mmi/decode_dev93_tgpr"
  echo "main options (for others, see top of script file)"
  echo "  --transform-dir <decoding-dir>           # directory of previous decoding"
  echo "                                           # where we can find transforms for SAT systems."
  echo "  --config <config-file>                   # config containing options"
  echo "  --cmd <cmd>                              # Command to run in parallel with"
  echo "  --iter <iter>                            # iteration of model to use (default: final)"
  exit 1;
fi

graphdir=$1
data=$2
olddir=$3
dir=$4
srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory.

for f in $graphdir/words.txt $data/feats.scp $olddir/lat.1.gz $olddir/gselect.1.gz \
   $srcdir/$iter.mdl; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

nj=`cat $olddir/num_jobs` || exit 1;
sdata=$data/split$nj;
gselect_opt="--gselect=ark,s,cs:gunzip -c $olddir/gselect.JOB.gz|"
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`

mkdir -p $dir/log
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs

# Speaker vectors are optional: pass them on to the rescoring binary only when
# the old decode directory actually contains them.
if [ ! -f $olddir/vecs.1 ]; then
  echo "$0: no speaker vectors found."
  spkvecs_opt=
else
  echo "$0: using speaker vectors from $olddir"
  spkvecs_opt="--spk-vecs=ark:$olddir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk"
fi


## Set up features.
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"

case $feat_type in
  delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
  lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
    ;;
  *) echo "$0: invalid feature type $feat_type" && exit 1;
esac
if [ ! -z "$transform_dir" ]; then
  echo "$0: using transforms from $transform_dir"
  [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1;
  [ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \
    && echo "$0: #jobs mismatch with transform-dir." && exit 1;
  feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |"
elif grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then
  echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms,"
  echo "  but you are not providing the --transform-dir option in test time."
fi

if [ -f $olddir/trans.1 ]; then
  echo "$0: using (in addition to any previous transforms) transforms from $olddir"
  feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$olddir/trans.JOB ark:- ark:- |"
fi
##

# Rescore the state-level lattices with the model provided.  Just
# one command in this script.
echo "$0: rescoring lattices with SGMM model in $srcdir/$iter.mdl"
$cmd JOB=1:$nj $dir/log/rescore.JOB.log \
  sgmm2-rescore-lattice "$gselect_opt" $spkvecs_opt \
  $srcdir/$iter.mdl "ark:gunzip -c $olddir/lat.JOB.gz|" "$feats" \
  "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;

if ! $skip_scoring ; then
  [ ! -x local/score.sh ] && \
    echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
  local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir ||
    { echo "$0: Scoring failed. (ignore by '--skip-scoring true')"; exit 1; }
fi

exit 0;


================================================
FILE: egs/steps/decode_sgmm2_rescore_project.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.

# This script does decoding with an SGMM system, by rescoring lattices
# generated from a previous SGMM system.  This version does the "predictive"
# SGMM, where we subtract some constant times the log-prob of the left
# few spliced frames, and the same for the right few.
# The directory with the lattices
# is assumed to contain any speaker vectors, if used.  This script just
# adds into the acoustic scores, (some constant, default -0.25) times
# the acoustic score of the left model, and the same for the right model.

# As in decode_sgmm2_rescore.sh, this rescores the lattices one final time,
# using the same setup as the final decoding pass of the source dir.  The
# assumption is that the model may have been discriminatively trained.

# If the system was built on top of fMLLR transforms from a conventional system,
# you should provide the --transform-dir option.

# Begin configuration section.
stage=0
transform_dir=    # dir to find fMLLR transforms.
cmd=run.pl
iter=final
prob_scale=-0.25
dimensions=0:13:104:117
skip_scoring=false
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

# Exactly five positional arguments are required; otherwise print the usage
# summary (under this script's actual name) and stop.
if [ $# -ne 5 ]; then
  echo "Usage: steps/decode_sgmm2_rescore_project.sh [options] <full-lda-mat> <graph-dir|lang-dir> <data-dir> <old-decode-dir> <decode-dir>"
  echo " e.g.: steps/decode_sgmm2_rescore_project.sh --transform-dir exp/tri3b/decode_dev93_tgpr \\"
  echo "     exp/tri2b/full.mat exp/sgmm3a/graph_tgpr data/test_dev93 exp/sgmm3a/decode_dev93_tgpr exp/sgmm3a/decode_dev93_tgpr_predict"
  echo "main options (for others, see top of script file)"
  echo "  --transform-dir <decoding-dir>           # directory of previous decoding"
  echo "                                           # where we can find transforms for SAT systems."
  echo "  --config <config-file>                   # config containing options"
  echo "  --cmd <cmd>                              # Command to run in parallel with"
  echo "  --prob-scale <scale>                     # Default -0.25, scale on left and right models."
  exit 1;
fi

full_lda_mat=$1
graphdir=$2
data=$3
olddir=$4
dir=$5
srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory.

for f in $full_lda_mat $graphdir/words.txt $data/feats.scp $olddir/lat.1.gz \
   $olddir/gselect.1.gz $srcdir/$iter.mdl; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

nj=`cat $olddir/num_jobs` || exit 1;
sdata=$data/split$nj;
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`

mkdir -p $dir/log
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs

# Speaker vectors are optional: pass them on to the rescoring binary only when
# the old decode directory actually contains them.
if [ ! -f $olddir/vecs.1 ]; then
  echo "$0: no speaker vectors found."
  spkvecs_opt=
else
  echo "$0: using speaker vectors from $olddir"
  spkvecs_opt="--spk-vecs=ark:$olddir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk"
fi

if [ $stage -le 0 ]; then
  # Get full LDA+MLLT mat and its inverse.  Note: the full LDA+MLLT mat is
  # the LDA+MLLT mat, plus the "rejected" rows of the LDA matrix.
  $cmd $dir/log/get_full_lda.log \
    get-full-lda-mat $srcdir/final.mat $full_lda_mat $dir/full.mat $dir/full_inv.mat || exit 1;
fi

if [ $stage -le 1 ]; then
  left_start=`echo $dimensions | cut '-d:' -f 1`;
  left_end=`echo $dimensions | cut '-d:' -f 2`;
  right_start=`echo $dimensions | cut '-d:' -f 3`;
  right_end=`echo $dimensions | cut '-d:' -f 4`;

  # Prepare left and right models.  For now, the dimensions are hardwired (e.g., 13 MFCCs and splice 9 frames).
  # Note: the choice of dividing by the prob of the left 4 and the right 4 frames is a bit arbitrary and
  # we could investigate different configurations.
  $cmd $dir/log/left.log \
    sgmm2-project --start-dim=$left_start --end-dim=$left_end $srcdir/final.mdl $dir/full.mat $dir/left.mdl $dir/left.mat || exit 1;
  $cmd $dir/log/right.log \
    sgmm2-project --start-dim=$right_start --end-dim=$right_end $srcdir/final.mdl $dir/full.mat $dir/right.mdl $dir/right.mat || exit 1;
fi


# we apply the scaling on the new acoustic probs by adding the inverse
# of that to the old acoustic probs, and then later inverting again.
# this has to do with limitations in sgmm2-rescore-lattice: we can only
# scale the *old* acoustic probs, not the new ones.
inverse_prob_scale=`perl -e "print (1.0 / $prob_scale);"`
cur_lats="ark:gunzip -c $olddir/lat.JOB.gz | lattice-scale --acoustic-scale=$inverse_prob_scale ark:- ark:- |"

## Set up features.  Note: we only support LDA+MLLT features, this
## is inherent in the method, we could not support deltas.

# Rescore once with the "left" model and once with the "right" model.  Each
# pass builds its own feature pipeline and Gaussian-selection archive, reads
# the lattices named by $cur_lats, and leaves $cur_lats pointing at its own
# output so that the second pass adds its scores on top of the first.
for model_type in left right; do

  feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- |" # spliced features.
  if [ ! -z "$transform_dir" ]; then  # using speaker-specific transforms.
     # we want to transform in the sequence: $dir/full.mat, then the result of
     # (extend-transform-dim $transform_dir/trans.JOB), then $dir/full_inv.mat to
     # get back to the spliced space, then the left.mat or right.mat.  But
     # note that compose-transforms operates in matrix-multiplication order,
     # which is opposite from the "order of applying the transforms" order.
     new_dim=$[`copy-matrix --binary=false $dir/full.mat - | wc -l` - 1]; # 117 in normal case.
     feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk 'ark:extend-transform-dim --new-dimension=$new_dim ark:$transform_dir/trans.JOB ark:- | compose-transforms ark:- $dir/full.mat ark:- | compose-transforms $dir/full_inv.mat ark:- ark:- | compose-transforms $dir/${model_type}.mat ark:- ark:- |' ark:- ark:- |"
  else  # else, we transform with the "left" or "right" matrix; these transform from the
        # spliced space.
     # transform-feats needs the input and output feature specifiers as well as
     # the matrix; without "ark:- ark:-" this stage of the pipe has nothing to
     # read from or write to.
     feats="$feats transform-feats $dir/${model_type}.mat ark:- ark:- |"
     # If we don't have the --transform-dir option, make sure the model was
     # trained in the same way.
     if grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then
       echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms,"
       echo "  but you are not providing the --transform-dir option in test time."
     fi
  fi
  if [ -f $olddir/trans.1 ]; then
     echo "$0: warning: not using transforms in $olddir (this is just a "
     echo " limitation of the script right now, and could be fixed)."
  fi
  
  if [ $stage -le 2 ]; then
    echo "Getting gselect info for $model_type model."
    $cmd JOB=1:$nj $dir/log/gselect.$model_type.JOB.log \
       sgmm2-gselect $dir/$model_type.mdl "$feats" \
       "ark,t:|gzip -c >$dir/gselect.$model_type.JOB.gz" || exit 1;
  fi
  gselect_opt="--gselect=ark,s,cs:gunzip -c $dir/gselect.$model_type.JOB.gz|"


  # Rescore the state-level lattices with the model provided.  Just
  # one command in this script.
  # The --old-acoustic-scale=1.0 option means we just add the scores
  # to the old scores.
  if [ $stage -le 3 ]; then
    echo "$0: rescoring lattices with $model_type model"
    $cmd JOB=1:$nj $dir/log/rescore.${model_type}.JOB.log \
      sgmm2-rescore-lattice --old-acoustic-scale=1.0 "$gselect_opt" $spkvecs_opt \
      $dir/$model_type.mdl "$cur_lats" "$feats" \
      "ark:|gzip -c > $dir/lat.${model_type}.JOB.gz" || exit 1;
  fi
  # The next pass (or the final scaling step) reads this pass's output.
  cur_lats="ark:gunzip -c $dir/lat.${model_type}.JOB.gz |"
done

if [ $stage -le 4 ]; then
  echo "$0: getting final lattices."
  $cmd JOB=1:$nj $dir/log/scale_lats.JOB.log \
    lattice-scale --acoustic-scale=$prob_scale "$cur_lats" "ark:|gzip -c >$dir/lat.JOB.gz" \
   || exit 1;
fi

rm $dir/lat.{left,right}.*.gz 2>/dev/null  # note: if these still exist, it will
 # confuse the scoring script.

if ! $skip_scoring ; then
  [ ! -x local/score.sh ] && \
    echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
  local/score.sh --cmd "$cmd" $data $graphdir $dir ||
    { echo "$0: Scoring failed. (ignore by '--skip-scoring true')"; exit 1; }
fi

exit 0;


================================================
FILE: egs/steps/decode_with_map.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Neha Agrawal, Cisco Systems;
#                 Johns Hopkins University (Author: Daniel Povey);
#                 
# Apache 2.0

# Begin configuration section.  
transform_dir=
iter=
model= # You can specify the model to use (e.g. if you want to use the .alimdl)
nj=4
cmd=run.pl
max_active=7000
beam=13.0
lattice_beam=6.0
acwt=0.083333 # note: only really affects pruning (scoring is on lattices).
mean_tau=20
weight_tau=10
flags=mw  # could also contain "v" for variance; the default
          # tau for that is 50.
stage=1
skip_scoring=false
# End configuration section.

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
# Exactly three positional arguments are required; otherwise print the usage
# summary (under this script's actual name) and stop.
if [ $# != 3 ]; then
   echo "Usage: steps/decode_with_map.sh [options] <graph-dir> <data-dir> <decode-dir>"
   echo "... where <decode-dir> is assumed to be a sub-directory of the directory"
   echo " where the model is."
   echo "e.g.: steps/decode_with_map.sh exp/mono/graph_tgpr data/test_dev93 exp/mono/decode_dev93_tgpr"
   echo ""
   echo "This script works on CMN + (delta+delta-delta | LDA+MLLT) features; it works out"
   echo "what type of features you used (assuming it's one of these two)"
   echo ""
   echo "main options (for others, see top of script file)"
   echo "  --config <config-file>                           # config containing options"
   echo "  --nj <nj>                                        # number of parallel jobs"
   echo "  --iter <iter>                                    # Iteration of model to test."
   echo "  --model <model>                                  # which model to use (e.g. to"
   echo "                                                   # specify the final.alimdl)"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   echo "  --transform-dir <trans-dir>                      # dir to find fMLLR transforms "
   echo "                                                   # speaker-adapted decoding"
   exit 1;
fi


graphdir=$1
data=$2
dir=$3
srcdir=`dirname $dir`; # The model directory is one level up from decoding directory.
sdata=$data/split$nj;

mkdir -p $dir/log
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs

if [ -z "$model" ]; then # if --model <mdl> was not specified on the command line...
  if [ -z $iter ]; then model=$srcdir/final.mdl; 
  else model=$srcdir/$iter.mdl; fi
fi

for f in $sdata/1/feats.scp $sdata/1/cmvn.scp $model $graphdir/HCLG.fst; do
  [ ! -f $f ] && echo "decode.sh: no such file $f" && exit 1;
done

if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "decode.sh: feature type is $feat_type";

splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
delta_opts=`cat $srcdir/delta_opts 2>/dev/null`

case $feat_type in
  delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";;
  lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";;
  *) echo "Invalid feature type $feat_type" && exit 1;
esac
# Optionally append per-speaker fMLLR transforms from a previous decoding to
# the feature pipeline.  The transforms are read per job (trans.JOB), so a
# missing archive or a different number of jobs cannot work; stop instead of
# only printing a message and decoding with a broken pipeline.
if [ ! -z "$transform_dir" ]; then # add transforms to features...
  echo "Using fMLLR transforms from $transform_dir"
  [ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist." && exit 1;
  [ "`cat $transform_dir/num_jobs`" -ne $nj ] && \
     echo "Mismatch in number of jobs with $transform_dir" && exit 1;
  feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
fi

if [ $stage -le 1 ]; then
  echo "Doing first-pass decoding before MAP decoding."
  $cmd JOB=1:$nj $dir/log/decode_pass1.JOB.log \
    gmm-decode-faster --max-active=$max_active --beam=$beam \
    --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \
    $model $graphdir/HCLG.fst "$feats" ark:$dir/tmp.JOB.tra ark:$dir/pass1_decode.JOB.ali || exit 1;
fi

# Second pass: turn the first-pass alignments into posteriors, accumulate
# per-speaker MAP adaptation with gmm-adapt-map, and generate lattices with
# the adapted models via gmm-latgen-map.  As with the first pass, abort if the
# job fails so that scoring is not run on missing or partial lattices.
if [ $stage -le 2 ]; then
  echo "Computing MAP stats and doing MAP-adapted decoding"
  $cmd JOB=1:$nj $dir/log/decode_pass2.JOB.log \
    ali-to-post ark:$dir/pass1_decode.JOB.ali ark:- \| \
  gmm-adapt-map --mean-tau=$mean_tau --weight-tau=$weight_tau \
       --update-flags=$flags --spk2utt=ark:$sdata/JOB/spk2utt \
     $model "$feats" ark:- ark:- \| \
  gmm-latgen-map --lattice-beam=$lattice_beam --acoustic-scale=$acwt \
   --utt2spk=ark:$sdata/JOB/utt2spk --max-active=$max_active --beam=$beam \
   --allow-partial=true --word-symbol-table=$graphdir/words.txt \
   $model ark,s,cs:- $graphdir/HCLG.fst "$feats" "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1;
fi
#rm -f $dir/pass1_decode.*.ali
#rm -f $dir/tmp.*.tra

if ! $skip_scoring ; then
  [ ! -x local/score.sh ] && \
    echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
  local/score.sh --cmd "$cmd" $data $graphdir $dir ||
    { echo "$0: Scoring failed. (ignore by '--skip-scoring true')"; exit 1; }
fi

exit 0;


================================================
FILE: egs/steps/diagnostic/analyze_alignments.sh
================================================
#!/usr/bin/env bash
#
# Copyright Johns Hopkins University (Author: Daniel Povey) 2016.  Apache 2.0.

# This script performs some analysis of alignments on disk, currently in terms
# of phone lengths, including lengths of leading and trailing silences


# begin configuration section.
cmd=run.pl
#end configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1;

# Exactly two positional arguments are required; otherwise print the usage
# summary and stop.  The log file named here is the one the analysis step
# below actually writes ($dir/log/analyze_alignments.log).
if [ $# -ne 2 ]; then
  echo "Usage: $0 [options] <lang-dir> <ali-dir>"
  echo " Options:"
  echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
  echo "e.g.:"
  echo "$0 data/lang exp/tri4b"
  echo "This script writes some diagnostics to <ali-dir>/log/analyze_alignments.log"
  exit 1;
fi

lang=$1
dir=$2

model=$dir/final.mdl

for f in $lang/words.txt $model $dir/ali.1.gz $dir/num_jobs; do
  [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1;
done

num_jobs=$(cat $dir/num_jobs) || exit 1

mkdir -p $dir/log

rm $dir/phone_stats.*.gz 2>/dev/null || true

# For each job: run ali-to-phones with --write-lengths=true on the alignments,
# strip the first space-delimited field of every line (presumably the
# utterance id), and split the rest on " ; ".  For each line this emits one
# "begin <x>" line for the first entry, one "end <x>" line for the last entry
# (only if there is more than one), and one "all <x>" line per entry; identical
# lines are then counted with sort | uniq -c and stored gzipped in
# phone_stats.JOB.gz.
# The '&&', \| and '>' are quoted/escaped so that this shell passes them on to
# $cmd as literal arguments instead of interpreting them itself.
$cmd JOB=1:$num_jobs $dir/log/get_phone_alignments.JOB.log \
  set -o pipefail '&&' ali-to-phones --write-lengths=true "$model"  \
      "ark:gunzip -c $dir/ali.JOB.gz|" ark,t:- \| \
   sed -E 's/^[^ ]+ //' \| \
   awk 'BEGIN{FS=" ; "; OFS="\n";} {print "begin " $1; if (NF>1) print "end " $NF; for (n=1;n<=NF;n++) print "all " $n; }' \| \
   sort \| uniq -c \| gzip -c '>' $dir/phone_stats.JOB.gz || exit 1

if ! $cmd $dir/log/analyze_alignments.log \
  gunzip -c "$dir/phone_stats.*.gz" \| \
  steps/diagnostic/analyze_phone_length_stats.py $lang; then
  echo "$0: analyze_phone_length_stats.py failed, but ignoring the error (it's just for diagnostics)"
fi

grep WARNING $dir/log/analyze_alignments.log
echo "$0: see stats in $dir/log/analyze_alignments.log"

rm $dir/phone_stats.*.gz

exit 0


================================================
FILE: egs/steps/diagnostic/analyze_lats.sh
================================================
#!/usr/bin/env bash
#
# Copyright Johns Hopkins University (Author: Daniel Povey) 2016.  Apache 2.0.

# This script does the same type of diagnostics as analyze_alignments.sh, except
# it starts from lattices (so it has to convert the lattices to alignments
# first).
#
# It writes phone-length diagnostics to <decode-dir>/log/analyze_alignments.log
# and lattice-depth diagnostics to <decode-dir>/log/analyze_lattice_depth_stats.log.

# begin configuration section.
iter=final
cmd=run.pl
acwt=0.1
#end configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1;

if [ $# -ne 2 ]; then
  echo "Usage: $0 [options] (<lang-dir>|<graph-dir>) <decode-dir>"
  echo " Options:"
  echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
  echo "    --acwt <acoustic-scale>         # Acoustic scale for getting best-path (default: 0.1)"
  echo "e.g.:"
  echo "$0 data/lang exp/tri4b/decode_dev"
  echo "This script writes some diagnostics to <decode-dir>/log/alignments.log"
  exit 1;
fi

lang=$1
dir=$2

# The model is looked up one level above the decode directory.
model=$dir/../${iter}.mdl

for f in $lang/words.txt $model $dir/lat.1.gz $dir/num_jobs; do
  [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1;
done

num_jobs=$(cat $dir/num_jobs) || exit 1

mkdir -p $dir/log

# Remove stale per-job stats from a previous run; it is fine if there are none.
rm $dir/phone_stats.*.gz 2>/dev/null || true

# this writes two archives of depth_tmp and ali_tmp of (depth per frame, alignment per frame).
$cmd JOB=1:$num_jobs $dir/log/lattice_best_path.JOB.log \
  lattice-depth-per-frame "ark:gunzip -c $dir/lat.JOB.gz|" "ark,t:|gzip -c > $dir/depth_tmp.JOB.gz" ark:- \| \
  lattice-best-path --acoustic-scale=$acwt ark:- ark:/dev/null "ark,t:|gzip -c >$dir/ali_tmp.JOB.gz" || exit 1

# Turn the best-path alignments into per-job counts of lines of the form
#   <count> (begin|end|all) <phone-and-length-entry>
# The perl one-liner strips the utterance-id, splits the rest on " ; ", and
# counts the first entry as "begin", the last entry (when there is more than
# one) as "end", and every entry as "all".
# Note: the first entry is indexed as $a[0]; an earlier form used $a[$0],
# which only worked because perl's $0 (the program name) evaluates to 0 in
# numeric context.
$cmd JOB=1:$num_jobs $dir/log/get_lattice_stats.JOB.log \
  ali-to-phones --write-lengths=true "$model" "ark:gunzip -c $dir/ali_tmp.JOB.gz|" ark,t:- \| \
  perl -ne 'chomp;s/^\S+\s*//;@a=split /\s;\s/, $_;$count{"begin ".$a[0]."\n"}++;
  if(@a>1){$count{"end ".$a[-1]."\n"}++;}for($i=0;$i<@a;$i++){$count{"all ".$a[$i]."\n"}++;}
  END{for $k (sort keys %count){print "$count{$k} $k"}}' \| \
  gzip -c '>' $dir/phone_stats.JOB.gz || exit 1

# Phone-length analysis over the stats of all jobs.
$cmd $dir/log/analyze_alignments.log \
  gunzip -c "$dir/phone_stats.*.gz" \| \
  steps/diagnostic/analyze_phone_length_stats.py $lang || exit 1

grep WARNING $dir/log/analyze_alignments.log
echo "$0: see stats in $dir/log/analyze_alignments.log"

# Dump the per-frame phone alignment of all jobs into a single gzipped file;
# it is the second argument of analyze_lattice_depth_stats.py below.
$cmd $dir/log/dump_ali_frame.log \
  ali-to-phones --per-frame=true "$model" "ark:gunzip -c $dir/ali_tmp.*.gz|" "ark,t:|gzip -c >$dir/ali_frame_tmp.gz"

# Lattice-depth analysis: per-frame depths on stdin, per-frame phones from the file.
$cmd $dir/log/analyze_lattice_depth_stats.log \
  gunzip -c "$dir/depth_tmp.*.gz" \| \
  steps/diagnostic/analyze_lattice_depth_stats.py $lang "$dir/ali_frame_tmp.gz" || exit 1

grep Overall $dir/log/analyze_lattice_depth_stats.log
echo "$0: see stats in $dir/log/analyze_lattice_depth_stats.log"


# Clean up the temporary archives.
rm $dir/phone_stats.*.gz
rm $dir/depth_tmp.*.gz
rm $dir/ali_frame_tmp.gz
rm $dir/ali_tmp.*.gz

exit 0


================================================
FILE: egs/steps/diagnostic/analyze_lattice_depth_stats.py
================================================
#!/usr/bin/env python3


# Copyright 2016 Johns Hopkins University (author: Daniel Povey)
# Apache 2.0.

from __future__ import print_function
from __future__ import division
import argparse
import sys, os
from collections import defaultdict
from io import open
import codecs
import gzip

# Replace sys.stdout with a codec writer so that everything printed below is
# encoded as UTF-8.
# reference: http://www.macfreek.nl/memory/Encoding_of_Python_stdout
if sys.version_info.major != 2:
    assert sys.version_info.major == 3
    # On Python 3 the writer has to wrap the underlying byte stream.
    sys.stdout = codecs.getwriter('utf-8')(sys.stdout.buffer, 'strict')
else:
    sys.stdout = codecs.getwriter('utf-8')(sys.stdout, 'strict')


# Command line: one optional cutoff followed by two positional arguments.
parser = argparse.ArgumentParser(
    description="This script reads stats created in analyze_lats.sh "
                "to print information about lattice depths broken down per phone. "
                "The normal output of this script is written to the standard output "
                "and is human readable (on crashes, we'll print an error to stderr.")

parser.add_argument(
    "--frequency-cutoff-percentage",
    type=float,
    default=0.5,
    help="Cutoff, expressed as a percentage "
         "(between 0 and 100), of frequency at which we print stats "
         "for a phone.")

parser.add_argument(
    "lang",
    help="Language directory, e.g. data/lang.")

parser.add_argument(
    "ali_per_frame",
    help="Gzipped alignment per frame, e.g. ali_frame_tmp.gz")

args = parser.parse_args()

# set up phone_int2text to map from phone to printed form.
# Each line of lang/phones.txt is expected to be "<phone-text> <integer-id>".
phone_int2text = {}
try:
    # 'with' closes the file even when a malformed line raises.
    with open(args.lang + "/phones.txt", "r", encoding='utf-8') as f:
        for line in f:
            [ word, number] = line.split()
            phone_int2text[int(number)] = word
except Exception:
    # Catch Exception rather than using a bare 'except', so that
    # KeyboardInterrupt / SystemExit are not reported as a read error.
    sys.exit(u"analyze_lattice_depth_stats.py: error opening or reading {0}/phones.txt".format(
            args.lang))
# this is a special case... for begin- and end-of-sentence stats,
# we group all nonsilence phones together.
phone_int2text[0] = 'nonsilence'

# populate the set 'nonsilence', which will contain the integer phone-ids of
# nonsilence phones (and disambig phones, which won't matter).
nonsilence = set(phone_int2text.keys())
nonsilence.remove(0)
try:
    # open lang/phones/silence.csl-- while there are many ways of obtaining the
    # silence/nonsilence phones, we read this because it's present in graph
    # directories as well as lang directories.
    filename = u"{0}/phones/silence.csl".format(args.lang)
    with open(filename, "r") as f:
        line = f.readline()
    # The first line is a colon-separated list of silence phone-ids; every one
    # of them is removed from the nonsilence set.
    for silence_phone in line.split(":"):
        nonsilence.remove(int(silence_phone))
except Exception as e:
    sys.exit(u"analyze_lattice_depth_stats.py: error processing {0}/phones/silence.csl: {1}".format(
            args.lang, str(e)))

# phone_depth_counts is a dict of dicts.
# for each integer phone-id 'phone',
# phone_depth_counts[phone] is a map from depth to count (of frames on which
# that was the 1-best phone in the alignment, and the lattice depth
# had that value).  So we'd access it as
# count = phone_depth_counts[phone][depth].

phone_depth_counts = dict()

# note: -1 is for all phones put in one bucket.
for p in [ -1 ] + list(phone_int2text.keys()):
    phone_depth_counts[p] = defaultdict(int)

total_frames = 0

# Map from utterance-id to the (still unsplit) per-frame phone alignment.
# The 'with' block makes sure the gzip handle is closed once it has been read.
ali_per_frame = {}
with gzip.open(args.ali_per_frame, mode='rt', encoding='utf-8') as ali_file:
    for line in ali_file:
        uttid, ali = line.split(" ", 1)
        ali_per_frame[uttid] = ali

# Each stdin line is "<utterance-id> <depth> <depth> ...".  Utterances that
# have no alignment are skipped; otherwise phones and depths are paired up
# frame by frame.
for line in sys.stdin:
    uttid, depth = line.split(" ", 1)
    if uttid in ali_per_frame:
        apf = ali_per_frame[uttid].split()
        dpf = depth.split()
        for p, d in zip(apf, dpf):
            p, d = int(p), int(d)
            if p not in phone_depth_counts:
                # Without this check the lookup below would raise a bare
                # KeyError; report the mismatch explicitly instead.
                sys.exit(u"analyze_lattice_depth_stats.py: phone {0} is not covered on phones.txt "
                         u"(lang/alignment mismatch?)".format(p))
            phone_depth_counts[p][d] += 1
            total_frames += 1
            if p in nonsilence:
                # phone-id 0 is the bucket for all nonsilence phones as a group.
                nonsilence_phone = 0
                phone_depth_counts[nonsilence_phone][d] += 1
            universal_phone = -1
            phone_depth_counts[universal_phone][d] += 1

if total_frames == 0:
    sys.exit(u"analyze_lattice_depth_stats.py: read no input")

# If depth_to_count is a map from depth-in-frames to count,
# return the depth-in-frames that equals the (fraction * 100)'th
# percentile of the distribution.
def GetPercentile(depth_to_count, fraction):
    """Return the depth at the (fraction * 100)'th percentile.

    depth_to_count maps a depth to the number of frames with that depth.
    Depths are visited in increasing order and the first one whose cumulative
    count reaches int(fraction * total) is returned.  An empty distribution
    (total count of zero) gives 0.

    Raises AssertionError if a count is negative, or if the cutoff is never
    reached (which can only happen for a fraction greater than 1).
    """
    this_total_frames = sum(depth_to_count.values())
    if this_total_frames == 0:
        return 0
    count_cutoff = int(fraction * this_total_frames)
    cur_count_total = 0
    for depth, count in sorted(depth_to_count.items()):
        assert count >= 0
        cur_count_total += count
        if cur_count_total >= count_cutoff:
            return depth
    # We shouldn't reach here.  Raise explicitly: the lowercase name 'false'
    # is undefined in Python, so asserting on it would raise NameError.
    raise AssertionError("GetPercentile: cumulative count never reached the cutoff")

def GetMean(depth_to_count):
    """Return the count-weighted mean depth of a depth -> count map.

    Returns 0.0 when the counts sum to zero.
    """
    frame_count = sum(depth_to_count.values())
    if frame_count != 0:
        weighted_depth = sum(float(depth * count)
                             for depth, count in depth_to_count.items())
        return weighted_depth / frame_count
    return 0.0


print(u"The total amount of data analyzed assuming 100 frames per second "
      u"is {0} hours".format("%.1f" % (total_frames / 360000.0)))

# the next block prints lines like (to give some examples):
# Overall, lattice depth (10,50,90-percentile)=(1,2,7) and mean=3.0
# Nonsilence phones as a group account for 74.4% of frames, with lattice depth (10,50,90-percentile)=(1,2,7) and mean=3.1
# Phone SIL accounts for 25.5% of frames, with lattice depth (10,50,90-percentile)=(1,1,4) and mean=2.5
# Phone Z_E accounts for 2.5% of frames, with lattice depth (10,50,90-percentile)=(1,2,6) and mean=2.9
# ...


# sort the phones in decreasing order of count.
for phone,depths in sorted(phone_depth_counts.items(), key = lambda x : -sum(x[1].values())):

    # Phones seen on too small a share of the frames are not reported.
    frequency_percentage = sum(depths.values()) * 100.0 / total_frames
    if frequency_percentage < args.frequency_cutoff_percentage:
        continue


    depth_percentile_10 = GetPercentile(depths, 0.1)
    depth_percentile_50 = GetPercentile(depths, 0.5)
    depth_percentile_90 = GetPercentile(depths, 0.9)
    depth_mean = GetMean(depths)

    # phone > 0 is a real phone, 0 is the nonsilence group and -1 is the
    # bucket that collects every frame.
    if phone > 0:
        try:
            phone_text = phone_int2text[phone]
        except KeyError:
            sys.exit(u"analyze_lattice_depth_stats.py: phone {0} is not covered on phones.txt "
                     u"(lang/alignment mismatch?)".format(phone))
        preamble = u"Phone {phone_text} accounts for {percent}% of frames, with".format(
            phone_text = phone_text, percent = "%.1f" % frequency_percentage)
    elif phone == 0:
        preamble = u"Nonsilence phones as a group account for {percent}% of frames, with".format(
            percent = "%.1f" % frequency_percentage)
    else:
        assert phone == -1
        preamble = "Overall,"

    print(u"{preamble} lattice depth (10,50,90-percentile)=({p10},{p50},{p90}) and mean={mean}".format(
            preamble = preamble,
            p10 = depth_percentile_10,
            p50 = depth_percentile_50,
            p90 = depth_percentile_90,
            mean = "%.1f" % depth_mean))


================================================
FILE: egs/steps/diagnostic/analyze_phone_length_stats.py
================================================
#!/usr/bin/env python


# Copyright 2016 Johns Hopkins University (author: Daniel Povey)
# Apache 2.0.

from __future__ import print_function
import argparse
import sys, os
from collections import defaultdict
from io import open
import codecs

# Replace sys.stdout with a codec writer so that everything printed below is
# encoded as UTF-8.
# reference: http://www.macfreek.nl/memory/Encoding_of_Python_stdout
if sys.version_info.major != 2:
    assert sys.version_info.major == 3
    # On Python 3 the writer has to wrap the underlying byte stream.
    sys.stdout = codecs.getwriter('utf-8')(sys.stdout.buffer, 'strict')
else:
    sys.stdout = codecs.getwriter('utf-8')(sys.stdout, 'strict')


# Command line: one optional cutoff followed by the lang directory.
parser = argparse.ArgumentParser(
    description="This script reads stats created in analyze_alignments.sh "
                "to print information about phone lengths in alignments.  It's principally "
                "useful in order to see whether there is a reasonable amount of silence "
                "at the beginning and ends of segments.  The normal output of this script "
                "is written to the standard output and is human readable (on crashes, "
                "we'll print an error to stderr.")

parser.add_argument(
    "--frequency-cutoff-percentage",
    type=float,
    default=0.5,
    help="Cutoff, expressed as a percentage "
         "(between 0 and 100), of frequency at which we print stats "
         "for a phone.")

parser.add_argument(
    "lang",
    help="Language directory, e.g. data/lang.")

args = parser.parse_args()


# set up phone_int2text to map from phone to printed form.
# Each line of lang/phones.txt is expected to be "<phone-text> <integer-id>".
phone_int2text = {}
try:
    # 'with' closes the file even when a malformed line raises.
    with open(args.lang + "/phones.txt", "r", encoding='utf-8') as f:
        for line in f:
            [ word, number] = line.split()
            phone_int2text[int(number)] = word
except Exception:
    # Catch Exception rather than using a bare 'except', so that
    # KeyboardInterrupt / SystemExit are not reported as a read error.
    sys.exit("analyze_phone_length_stats.py: error opening or reading {0}/phones.txt".format(
            args.lang))
# this is a special case... for begin- and end-of-sentence stats,
# we group all nonsilence phones together.
phone_int2text[0] = 'nonsilence'


# populate the set 'nonsilence', which will contain the integer phone-ids of
# nonsilence phones (and disambig phones, which won't matter).
nonsilence = set(phone_int2text.keys())
nonsilence.remove(0)
try:
    # open lang/phones/silence.csl-- while there are many ways of obtaining the
    # silence/nonsilence phones, we read this because it's present in graph
    # directories as well as lang directories.
    filename = "{0}/phones/silence.csl".format(args.lang)
    with open(filename, "r") as f:
        line = f.readline()
    # The first line is a colon-separated list of silence phone-ids; every one
    # of them is removed from the nonsilence set.
    for silence_phone in line.split(":"):
        nonsilence.remove(int(silence_phone))
except Exception as e:
    sys.exit("analyze_phone_length_stats.py: error processing {0}/phones/silence.csl: {1}".format(
            args.lang, str(e)))


# phone_lengths is a dict of dicts of dicts;
# phone_lengths[boundary_type] for boundary_type in [ 'begin', 'end', 'all' ] is
# a dict indexed by phone, containing dicts from length to a count of occurrences.
# Phones are ints and lengths are integers representing numbers of frames.
# So: count == phone_lengths[boundary_type][phone][length].
# note: the counts of all nonsilence phones are additionally accumulated
# under phone-id zero.
phone_lengths = dict()
for boundary_type in [ 'begin', 'end', 'all' ]:
    phone_lengths[boundary_type] = dict()
    for p in phone_int2text.keys():
        phone_lengths[boundary_type][p] = defaultdict(int)

# total_phones is a dict from boundary_type to total count [of phone occurrences]
total_phones = defaultdict(int)
# total_frames is a dict from boundary_type to total number of frames.
total_frames = defaultdict(int)

# Each stdin line has four fields: <count> <boundary-type> <phone> <length>.
for line in sys.stdin:
    a = line.split()
    if len(a) != 4:
        sys.exit("analyze_phone_length_stats.py: reading stdin, could not interpret line: " + line)
    try:
        count, boundary_type, phone, length = a
        total_phones[boundary_type] += int(count)
        total_frames[boundary_type] += int(count) * int(length)
        phone_lengths[boundary_type][int(phone)][int(length)] += int(count)
        if int(phone) in nonsilence:
            nonsilence_phone = 0
            phone_lengths[boundary_type][nonsilence_phone][int(length)] += int(count)
    except Exception as e:
        sys.exit("analyze_phone_length_stats.py: unexpected phone {0} "
                 "seen (lang directory mismatch?): {1}".format(phone, str(e)))

# total_phones only gets entries while lines are being read, so it is empty
# exactly when stdin was empty.  (phone_lengths cannot be used for this check:
# it always holds the three boundary types set up above.)
if not total_phones:
    sys.exit("analyze_phone_length_stats.py: read no input")

# work out the optional-silence phone
try:
    # 'with' closes the file even if its contents cannot be parsed.
    with open(args.lang + "/phones/optional_silence.int", "r") as f:
        optional_silence_phone = int(f.readline())
    optional_silence_phone_text = phone_int2text[optional_silence_phone]
except (IOError, OSError, ValueError, KeyError):
    # The file is missing or unreadable, does not start with an integer, or
    # names a phone that is not in phones.txt: fall back to the silence phone
    # that accounts for the most frames in the 'all' stats (phone 1 if no
    # silence phone has any frames).
    largest_count = 0
    optional_silence_phone = 1
    for p in phone_int2text.keys():
        if p > 0 and p not in nonsilence:
            this_count = sum([ l * c for l,c in phone_lengths['all'][p].items() ])
            if this_count > largest_count:
                largest_count = this_count
                optional_silence_phone = p
    optional_silence_phone_text = phone_int2text[optional_silence_phone]
    print(u"analyze_phone_length_stats.py: could not get optional-silence phone from "
          u"{0}/phones/optional_silence.int, guessing that it's {1} from the stats. ".format(
            args.lang, optional_silence_phone_text))
else:
    if optional_silence_phone in nonsilence:
        print(u"analyze_phone_length_stats.py: was expecting the optional-silence phone to "
              u"be a member of the silence phones, it is not.  This script won't work correctly.")


# If length_to_count is a map from length-in-frames to count,
# return the length-in-frames that equals the (fraction * 100)'th
# percentile of the distribution.
def GetPercentile(length_to_count, fraction):
    """Return the length at the (fraction * 100)'th percentile.

    length_to_count maps a length in frames to the number of occurrences with
    that length.  Lengths are visited in increasing order and the first one
    whose cumulative count reaches int(fraction * total) is returned.  An
    empty distribution (total count of zero) gives 0.

    Raises AssertionError if a count is negative, or if the cutoff is never
    reached (which can only happen for a fraction greater than 1).
    """
    total_phones = sum(length_to_count.values())
    if total_phones == 0:
        return 0
    count_cutoff = int(fraction * total_phones)
    cur_count_total = 0
    for length, count in sorted(length_to_count.items()):
        assert count >= 0
        cur_count_total += count
        if cur_count_total >= count_cutoff:
            return length
    # We shouldn't reach here.  Raise explicitly: the lowercase name 'false'
    # is undefined in Python, so asserting on it would raise NameError.
    raise AssertionError("GetPercentile: cumulative count never reached the cutoff")

def GetMean(length_to_count):
    """Return the count-weighted mean length of a length -> count map.

    Returns 0.0 when the counts sum to zero.
    """
    occurrence_count = sum(length_to_count.values())
    if occurrence_count != 0:
        weighted_length = sum(float(length * count)
                              for length, count in length_to_count.items())
        return weighted_length / occurrence_count
    return 0.0


# Check how often the optional-silence phone is the phone at utterance begin
# and at utterance end, and print a WARNING line for each of the two
# boundaries where that is the case less than 80% of the time.
for boundary_type in ('begin', 'end'):
    num_utterances = total_phones[boundary_type]
    assert num_utterances > 0
    opt_sil_lengths = phone_lengths[boundary_type][optional_silence_phone]
    frequency_percentage = sum(opt_sil_lengths.values()) * 100.0 / num_utterances
    if frequency_percentage >= 80.0:
        continue
    # The reason for this warning is that the tradition in speech recognition is
    # to supply a little silence at the beginning and end of utterances... up to
    # maybe half a second.  If your database is not like this, you should know;
    # you may want to mess with the segmentation to add more silence.
    print(u"analyze_phone_length_stats.py: WARNING: optional-silence {0} is seen only {1}% "
          u"of the time at utterance {2}.  This may not be optimal.".format(
            optional_silence_phone_text, frequency_percentage, boundary_type))


# this will control a sentence that we print..
boundary_to_text = {
    'begin': 'At utterance begin',
    'end': 'At utterance end',
    'all': 'Overall',
}

# the next block prints lines like (to give some examples):
# At utterance begin, SIL accounts for 98.4% of phone occurrences, with duration (median, mean, 95-percentile) is (57,59.9,113) frames.
# ...
# At utterance end, nonsilence accounts for 4.2% of phone occurrences, with duration (median, mean, 95-percentile) is (13,13.3,22) frames.
# ...
# Overall, R_I accounts for 3.2% of phone occurrences, with duration (median, mean, 95-percentile) is (6,6.9,12) frames.

for boundary_type in 'begin', 'end', 'all':
    phone_to_lengths = phone_lengths[boundary_type]
    tot_num_phones = total_phones[boundary_type]
    # sort the phones in decreasing order of count.
    for phone,lengths in sorted(phone_to_lengths.items(), key = lambda x : -sum(x[1].values())):
        # Phones below the frequency cutoff are not reported.
        frequency_percentage = sum(lengths.values()) * 100.0 / tot_num_phones
        if frequency_percentage < args.frequency_cutoff_percentage:
            continue

        duration_median = GetPercentile(lengths, 0.5)
        duration_percentile_95 = GetPercentile(lengths, 0.95)
        duration_mean = GetMean(lengths)

        text = boundary_to_text[boundary_type]  # e.g. 'At utterance begin'.
        try:
            phone_text = phone_int2text[phone]
        except KeyError:
            # Only a missing key means the phone is absent from phones.txt; a
            # bare 'except' would also turn e.g. Ctrl-C into this message.
            sys.exit("analyze_phone_length_stats.py: phone {0} is not covered on phones.txt "
                     "(lang/alignment mismatch?)".format(phone))
        print(u"{text}, {phone_text} accounts for {percent}% of phone occurrences, with "
              u"duration (median, mean, 95-percentile) is ({median},{mean},{percentile95}) frames.".format(
                text = text, phone_text = phone_text,
                percent = "%.1f" % frequency_percentage,
                median = duration_median, mean = "%.1f" % duration_mean,
                percentile95 = duration_percentile_95))


## Print stats on frequency and average length of word-internal optional-silences.
## For optional-silence only, subtract the begin and end-utterance stats from the 'all'
## stats, to get the stats excluding initial and final phones.
for totals in (total_frames, total_phones):
    totals['internal'] = totals['all'] - totals['begin'] - totals['end']

# internal_opt_sil_phone_lengths is a dict from length to count, holding the
# overall optional-silence counts minus the ones at utterance begin and end;
# lengths whose remaining count is zero are left out.
opt_sil_begin_lengths = phone_lengths['begin'][optional_silence_phone]
opt_sil_end_lengths = phone_lengths['end'][optional_silence_phone]
internal_opt_sil_phone_lengths = dict()
for length, overall_count in phone_lengths['all'][optional_silence_phone].items():
    internal_count = overall_count - (opt_sil_begin_lengths[length] +
                                      opt_sil_end_lengths[length])
    if internal_count != 0:
        internal_opt_sil_phone_lengths[length] = internal_count

if total_phones['internal'] != 0.0:
    # Frame totals: optional-silence inside utterances, and optional-silence overall.
    total_internal_optsil_frames = sum(
        float(l * c) for l, c in internal_opt_sil_phone_lengths.items())
    total_optsil_frames = sum(
        float(l * c) for l, c in phone_lengths['all'][optional_silence_phone].items())

    all_frames = total_frames['all']
    internal_frames = total_frames['internal']
    opt_sil_internal_frame_percent = total_internal_optsil_frames * 100.0 / internal_frames
    opt_sil_total_frame_percent = total_optsil_frames * 100.0 / all_frames
    internal_frame_percent = internal_frames * 100.0 / all_frames

    print(u"The optional-silence phone {0} occupies {1}% of frames overall ".format(
            optional_silence_phone_text, "%.1f" % opt_sil_total_frame_percent))

    # 360000 frames per hour, i.e. 100 frames per second.
    hours_total = all_frames / 360000.0
    hours_nonsil = (all_frames - total_optsil_frames) / 360000.0
    print(u"Limiting the stats to the {0}% of frames not covered by an utterance-[begin/end] phone, "
          u"optional-silence {1} occupies {2}% of frames.".format(
            "%.1f" % internal_frame_percent,
            optional_silence_phone_text,
            "%.1f" % opt_sil_internal_frame_percent))
    print(u"Assuming 100 frames per second, the alignments represent {0} hours of data, "
          u"or {1} hours if {2} frames are excluded.".format(
            "%.1f" % hours_total, "%.1f" % hours_nonsil, optional_silence_phone_text))

    internal_optsil_count = sum(internal_opt_sil_phone_lengths.values())
    opt_sil_internal_phone_percent = internal_optsil_count * 100.0 / total_phones['internal']
    duration_median = GetPercentile(internal_opt_sil_phone_lengths, 0.5)
    duration_mean = GetMean(internal_opt_sil_phone_lengths)
    duration_percentile_95 = GetPercentile(internal_opt_sil_phone_lengths, 0.95)
    print(u"Utterance-internal optional-silences {0} comprise {1}% of utterance-internal phones, with duration "
          u"(median, mean, 95-percentile) = ({2},{3},{4})".format(
                optional_silence_phone_text, "%.1f" % opt_sil_internal_phone_percent,
                duration_median, "%0.1f" % duration_mean, duration_percentile_95))


================================================
FILE: egs/steps/dict/apply_g2p.sh
================================================
#!/usr/bin/env bash
# Copyright 2014  Johns Hopkins University (Author: Yenda Trmal)
# Copyright 2016  Xiaohui Zhang
# Apache 2.0

# This script applies the Sequitur G2P model <g2p-model-dir>/g2p.model.final
# to a word list (split into $nj parallel jobs) and writes the resulting
# lexicon to <output-dir>/lexicon.lex.

# Begin configuration section.  
stage=0
encoding='utf-8'
var_counts=3  #Generate upto N variants
var_mass=0.9  #Generate so many variants to produce 90 % of the prob mass
cmd=run.pl
nj=10          #Split the task into several parallel, to speedup things
model=
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

set -u
set -e

if [ $# != 3 ]; then
   echo "Usage: $0 [options] <word-list> <g2p-model-dir> <output-dir>"
   echo "... where <word-list> is a list of words whose pronunciation is to be generated"
   echo "          <g2p-model-dir> is a directory used as a target during training of G2P"
   echo "          <output-dir> is the directory where the output lexicon should be stored"
   echo "e.g.: $0 oov_words exp/g2p exp/g2p/oov_lex"
   echo ""
   echo "main options (for others, see top of script file)"
   echo "  --nj <int>                                    # How many tasks should be spawn (to speedup things)"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   exit 1;
fi

wordlist=$1
modeldir=$2
output=$3


mkdir -p $output/log

model=$modeldir/g2p.model.final
[ ! -f ${model:-} ] && echo "File $model not found in the directory $modeldir." && exit 1
#[ ! -x $wordlist ] && echo "File $wordlist not found!" && exit 1

cp $wordlist $output/wordlist.txt

# Only the exit status of 'which' matters here: it tells us whether g2p.py
# is on the PATH.
if ! g2p=`which g2p.py` ; then
  echo "The Sequitur was not found !"
  echo "Go to $KALDI_ROOT/tools and execute extras/install_sequitur.sh"
  exit 1
fi

echo "Applying the G2P model to wordlist $wordlist"

# Job number JOBS handles chunk JOBS (of $nj) of the word list and writes
# its result to $output/output.JOBS.
if [ $stage -le 0 ]; then
  $cmd JOBS=1:$nj $output/log/apply.JOBS.log \
    split -n l/JOBS/$nj $output/wordlist.txt \| \
    g2p.py -V $var_mass --variants-number $var_counts --encoding $encoding \
      --model $modeldir/g2p.model.final --apply - \
    \> $output/output.JOBS
fi
cat $output/output.* > $output/output

# The sequitur output is not readily usable as lexicon (it adds
# one more column with ordering of the pron. variants), so convert it
# into the proper lexicon form.
output_lex=$output/lexicon.lex

# Just convert it to a proper lexicon format
cut -f 1,3,4 $output/output > $output_lex

# Some words might have been removed or skipped during the process,
# let's check it and warn the user if so...
nlex=`cut -f 1 $output_lex | sort -u | wc -l`
nwlist=`cut -f 1 $output/wordlist.txt | sort -u | wc -l`
if [ $nlex -ne $nwlist ] ; then
  echo "WARNING: Unable to generate pronunciation for all words. ";
  echo "WARNING:   Wordlist: $nwlist words"
  echo "WARNING:   Lexicon : $nlex words"
  echo "WARNING:Diff example: "
  # diff exits non-zero when the lists differ; '|| true' keeps 'set -e'
  # from aborting the script here.
  diff <(cut -f 1 $output_lex | sort -u ) \
       <(cut -f 1 $output/wordlist.txt | sort -u ) || true
fi
exit 0


================================================
FILE: egs/steps/dict/apply_g2p_phonetisaurus.sh
================================================
#!/usr/bin/env bash
# Copyright 2014  Johns Hopkins University (Author: Yenda Trmal)
# Copyright 2016  Xiaohui Zhang
#           2018  Ruizhe Huang
# Apache 2.0

# This script applies a trained Phonetisaurus G2P model to
# synthesize pronunciations for missing words (i.e., words in
# transcripts but not the lexicon), and output the expanded lexicon.
# The user could specify either nbest or pmass option 
# to determine the number of output pronunciation variants, 
# or use them together to get the intersection of two options.

# Begin configuration section.  
stage=0
nbest=      # Generate up to N, like N=3, pronunciation variants for each word
            # (The maximum size of the nbest list, not considering pruning and taking the prob-mass yet). 
thresh=5    # Pruning threshold for the n-best list, in (0, 99], which is a -log-probability value.
            # A large threshold makes the nbest list shorter, and less likely to hit the max size.
            # This value corresponds to the weight_threshold in shortest-path.h of openfst.
pmass=      # Select the top variants from the pruned nbest list,
            # summing up to this total prob-mass for a word.
            # On the "boundary", it's greedy by design, e.g. if pmass = 0.8,
            # and we have prob(pron_1) = 0.5, and prob(pron_2) = 0.4, then we get both.
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. utils/parse_options.sh || exit 1;

set -u
set -e

if [ $# != 3 ]; then
  echo "Usage: $0 [options] <word-list> <g2p-model-dir> <output-dir>"
  echo "... where <word-list> is a list of words whose pronunciation is to be generated."
  echo "          <g2p-model-dir> is a directory used as a target during training of G2P"
  echo "          <output-dir> is the directory where the output lexicon should be stored."
  echo "                       The format of the output lexicon output-dir/lexicon.lex is" 
  echo "                       <word>\t<prob>\t<pronunciation> per line."
  echo "e.g.: $0 --nbest 1 exp/g2p/oov_words.txt exp/g2p exp/g2p/oov_lex"
  echo ""
  echo "main options (for others, see top of script file)"
  echo "  --nbest <int>    # Generate upto N pronunciation variants for each word." 
  echo "  --pmass <float>  # Select the top variants from the pruned nbest list," 
  echo "                   # summing up to this total prob-mass, within [0, 1], for a word." 
  echo "  --thresh <int>   # Pruning threshold for n-best."
  exit 1;
fi

wordlist=$1
modeldir=$2
outdir=$3

model=$modeldir/model.fst
output_lex=$outdir/lexicon.lex
mkdir -p $outdir

[ ! -f ${model:-} ] && echo "$0: File $model not found in the directory $modeldir." && exit 1
[ ! -f $wordlist ] && echo "$0: File $wordlist not found!" && exit 1
# Quote the variables: unquoted, an empty value leaves the one-argument test
# '[ -z ]', which is true only because "-z" itself is a non-empty string.
[ -z "$pmass" ] && [ -z "$nbest" ] && echo "$0: nbest or/and pmass should be specified." && exit 1;
# Only the exit status of 'which' matters here: it tells us whether
# phonetisaurus-apply is on the PATH.
if ! phonetisaurus=`which phonetisaurus-apply` ; then
  echo "Phonetisarus was not found !"
  echo "Go to $KALDI_ROOT/tools and execute extras/install_phonetisaurus.sh"
  exit 1
fi

cp $wordlist $outdir/wordlist.txt

# three options: 1) nbest, 2) pmass, 3) nbest+pmass,
nbest=${nbest:-20}   # if nbest is not specified, set it to 20, due to Phonetisaurus mechanism
pmass=${pmass:-1.0}  # if pmass is not specified, set it to 1.0, due to Phonetisaurus mechanism

[[ ! $nbest =~ ^[1-9][0-9]*$ ]] && echo "$0: nbest should be a positive integer." && exit 1;

echo "Applying the G2P model to wordlist $wordlist"
phonetisaurus-apply --pmass $pmass --nbest $nbest --thresh $thresh \
  --word_list $wordlist --model $model \
  --accumulate --verbose --prob \
  1>$output_lex

echo "Completed. Synthesized lexicon for new words is in $output_lex"

# Some words might have been removed or skipped during the process,
# let's check it and warn the user if so...
nlex=`cut -f 1 $output_lex | sort -u | wc -l`
nwlist=`cut -f 1 $wordlist | sort -u | wc -l`
if [ $nlex -ne $nwlist ] ; then
  failed_wordlist=$outdir/lexicon.failed
  echo "WARNING: Unable to generate pronunciation for all words. ";
  echo "WARNING:   Wordlist: $nwlist words"
  echo "WARNING:   Lexicon : $nlex words"
  # comm -13 keeps the lines that are only in the second input, i.e. the
  # words of the word list that did not make it into the lexicon.
  comm -13 <(cut -f 1 $output_lex | sort -u ) \
           <(cut -f 1 $wordlist | sort -u ) \
           >$failed_wordlist && echo "WARNING: The list of failed words is in $failed_wordlist"
fi
exit 0


================================================
FILE: egs/steps/dict/apply_lexicon_edits.py
================================================
#!/usr/bin/env python

# Copyright 2016  Xiaohui Zhang
# Apache 2.0.

from __future__ import print_function
import argparse
import sys

def GetArgs():
    """Parse the command line and return the arguments with open file handles.

    The raw command line is echoed to stderr for logging before it is parsed.

    Returns:
        The value returned by CheckArgs() for the parsed argparse namespace.
    """
    # Adjacent string literals are concatenated verbatim, so each piece ends
    # with an explicit space to keep words from running together in --help.
    parser = argparse.ArgumentParser(
        description = "Apply a lexicon edits file (output from "
        "steps/dict/select_prons_bayesian.py) to an input lexicon "
        "to produce a learned lexicon.",
        epilog = "See steps/dict/learn_lexicon_greedy.sh for example")

    parser.add_argument("in_lexicon", metavar='<in-lexicon>', type = str,
                        help = "Input lexicon. Each line must be <word> <phones>.")
    parser.add_argument("lexicon_edits_file", metavar='<lexicon-edits-file>', type = str,
                        help = "Input lexicon edits file containing human-readable & editable "
                        "pronunciation info.  The info for each word is like: "
                        "------------ an 4086.0 -------------- "
                        "R  | Y |  2401.6 |  AH N "
                        "R  | Y |  640.8 |  AE N "
                        "P  | Y |  1035.5 |  IH N "
                        "R(ef), P(hone-decoding) represents the pronunciation source; "
                        "Y/N means the recommended decision of including this pron or not; "
                        "and the numbers are soft counts accumulated from lattice-align-word outputs. "
                        "See steps/dict/select_prons_bayesian.py for more details.")
    parser.add_argument("out_lexicon", metavar='<out-lexicon>', type = str,
                        help = "Output lexicon to this file.")

    print (' '.join(sys.argv), file=sys.stderr)

    args = parser.parse_args()
    args = CheckArgs(args)

    return args

def CheckArgs(args):
    """Open the files named on the command line and attach the handles to args.

    Sets args.in_lexicon_handle, args.lexicon_edits_file_handle and
    args.out_lexicon_handle.  A path of "-" selects stdin for the input
    lexicon and stdout for the output lexicon.

    Args:
        args: namespace with in_lexicon, lexicon_edits_file and out_lexicon.

    Returns:
        The same namespace, with the three handle attributes added.
    """
    if args.in_lexicon == "-":
        # The handle attribute (not the path attribute) must receive stdin;
        # Main() reads from args.in_lexicon_handle.
        args.in_lexicon_handle = sys.stdin
    else:
        args.in_lexicon_handle = open(args.in_lexicon)
    args.lexicon_edits_file_handle = open(args.lexicon_edits_file)

    if args.out_lexicon == "-":
        args.out_lexicon_handle = sys.stdout
    else:
        args.out_lexicon_handle = open(args.out_lexicon, "w")

    return args

def ReadLexicon(lexicon_file_handle):
    """Read a lexicon into a set of (word, phones) pairs.

    Each non-empty line is split on whitespace; the first field is the word
    and the remaining fields, re-joined with single spaces, are the phones.

    Args:
        lexicon_file_handle: readable file-like object; a falsy value yields
            an empty lexicon.

    Returns:
        A set of (word, phones) tuples.

    Raises:
        Exception: if a non-empty line has fewer than two fields.
    """
    entries = set()
    if not lexicon_file_handle:
        return entries
    for raw_line in lexicon_file_handle.readlines():
        fields = raw_line.strip().split()
        if not fields:
            # blank line
            continue
        if len(fields) < 2:
            raise Exception('Invalid format of line ' + raw_line
                                + ' in lexicon file.')
        entries.add((fields[0], ' '.join(fields[1:])))
    return entries

def ApplyLexiconEdits(lexicon, lexicon_edits_file_handle):
    """Apply the accept/reject decisions of a lexicon edits file to a lexicon.

    The edits file consists of word header lines such as
    "---- MICROPHONES 200.0 ----" followed by pronunciation lines such as
    "P  | Y |  42.0 |  M AY K R AH F OW N Z".  For each pronunciation line
    the pair (current word, pron) is added to the lexicon when the decision
    field is 'Y' and removed (if present) when it is 'N'.  Lines starting
    with '#' are ignored.

    Args:
        lexicon: set of (word, phones) tuples; modified in place.
        lexicon_edits_file_handle: readable file-like object; a falsy value
            leaves the lexicon untouched.

    Returns:
        The same lexicon set that was passed in.

    Raises:
        Exception: on a malformed header or pronunciation line, on a decision
            other than 'Y'/'N', or on a pronunciation line that appears
            before any word header.
    """
    if not lexicon_edits_file_handle:
        return lexicon

    # Word named by the most recent header line; None until one is seen.
    word = None
    for line in lexicon_edits_file_handle.readlines():
        # skip all commented lines
        if line.startswith('#'):
            continue
        # read a word from a line like "---- MICROPHONES 200.0 ----".
        if line.startswith('---'):
            splits = line.strip().strip('-').strip().split()
            if len(splits) != 2:
                print(splits, file=sys.stderr)
                raise Exception('Invalid format of line ' + line
                                    + ' in lexicon edits file.')
            word = splits[0].strip()
            continue

        # parse the pron and decision 'Y/N' of accepting the pron or not,
        # from a line like: 'P  | Y |  42.0 |  M AY K R AH F OW N Z'
        splits = line.split('|')
        if len(splits) != 4:
            raise Exception('Invalid format of line ' + line
                                + ' in lexicon edits file.')
        if word is None:
            # Without this check the code below would fail with an
            # UnboundLocalError instead of a readable format error.
            raise Exception('Invalid format of line ' + line
                                + ' in lexicon edits file.'
                                + ' (pronunciation given before any word header)')
        decision = splits[1].strip()
        pron = splits[3].strip()
        if decision == 'Y':
            lexicon.add((word, pron))
        elif decision == 'N':
            lexicon.discard((word, pron))
        else:
            raise Exception('Invalid format of line ' + line
                                + ' in lexicon edits file.')
    return lexicon


def WriteLexicon(lexicon, out_lexicon_handle):
    """Write every (word, pron) pair as a "<word> <pron>" line, then close.

    Args:
        lexicon: iterable of (word, pron) pairs.
        out_lexicon_handle: writable file-like object; closed on return.
    """
    for entry in lexicon:
        word, pron = entry
        line = '{0} {1}'.format(word, pron)
        print(line, file=out_lexicon_handle)
    out_lexicon_handle.close()

def Main():
    """Read the input lexicon, apply the edits file and write the result."""
    args = GetArgs()
    learned = ReadLexicon(args.in_lexicon_handle)
    # The edits are applied in place on the set read above.
    ApplyLexiconEdits(learned, args.lexicon_edits_file_handle)
    WriteLexicon(learned, args.out_lexicon_handle)


if __name__ == "__main__":
    Main()


================================================
FILE: egs/steps/dict/get_pron_stats.py
================================================
#!/usr/bin/env python

# Copyright 2016  Xiaohui Zhang
#           2016  Vimal Manohar
# Apache 2.0.

from __future__ import print_function
from collections import defaultdict
import argparse
import sys

def GetArgs():
    """Parse the command line and return the arguments with open file handles.

    The raw command line is echoed to stderr for logging before it is parsed.

    Returns:
        The value returned by CheckArgs() for the parsed argparse namespace.
    """
    # Adjacent string literals are concatenated verbatim, so each piece ends
    # with an explicit space to keep words from running together in --help.
    parser = argparse.ArgumentParser(
        description = "Accumulate statistics from lattice-alignment outputs for lexicon "
        "learning. The inputs are a file containing arc level information from lattice-align-words, "
        "and a map which maps word-position-dependent phones to word-position-independent phones "
        "(output from steps/cleanup/debug_lexicon.sh). The output contains accumulated soft-counts "
        "of pronunciations",
        epilog = "cat exp/tri3_lex_0.4_work/lats/arc_info_sym.*.txt \\| "
        "  steps/dict/get_pron_stats.py - exp/tri3_lex_0.4_work/phone_decode/phone_map.txt \\ "
        "  exp/tri3_lex_0.4_work/lats/pron_stats.txt "
        "See steps/dict/learn_lexicon_greedy.sh for examples in detail.")

    # The field positions below are the ones GetStatsFromArcInfo() reads.
    parser.add_argument("arc_info_file", metavar = "<arc-info-file>", type = str,
                        help = "Input file containing per arc statistics; "
                        "each line must have at least six whitespace-separated fields, "
                        "where the 4th field is the count, the 5th is the word and "
                        "the 6th onward are the phones")
    parser.add_argument("phone_map", metavar = "<phone-map>", type = str,
                        help = "An input phone map used to remove word boundary markers from phones; "
                        "generated in steps/cleanup/debug_lexicon.sh")
    parser.add_argument("stats_file", metavar = "<stats_file>", type = str,
                        help = "Write accumulated statistics to this file; "
                        "each line is <count> <word> <phones>")

    print (' '.join(sys.argv), file=sys.stderr)

    args = parser.parse_args()
    args = CheckArgs(args)

    return args

def CheckArgs(args):
    """Open the input/output files and attach the handles to args.

    A path of "-" selects stdin for the arc-info input and stdout for the
    stats output; the phone map is always opened from its path.

    Returns:
        The same namespace, with arc_info_file_handle, phone_map_handle and
        stats_file_handle set.
    """
    reads_stdin = (args.arc_info_file == "-")
    args.arc_info_file_handle = sys.stdin if reads_stdin else open(args.arc_info_file)

    args.phone_map_handle = open(args.phone_map)

    writes_stdout = (args.stats_file == "-")
    args.stats_file_handle = sys.stdout if writes_stdout else open(args.stats_file, "w")

    return args


def GetStatsFromArcInfo(arc_info_file_handle, phone_map_handle):
    """Accumulate per-(word, pronunciation) counts from an arc-info file.

    Each non-empty arc-info line must have at least six whitespace-separated
    fields: field 4 is the count, field 5 the word and fields 6 onward the
    phones.  Counts are first summed per (word, unmapped phones); the phones
    are then translated through the phone map, and counts of pronunciations
    that become identical after mapping are summed together.

    Args:
        arc_info_file_handle: readable file-like object with arc-info lines.
        phone_map_handle: readable file-like object whose lines hold
            "<phone> <mapped-phone>" in their first two fields.

    Returns:
        A dict mapping (word, mapped phones string) to the summed count.

    Raises:
        Exception: if a non-empty arc-info line has fewer than six fields.
        KeyError: if a phone is missing from the phone map.
    """
    # need to map the phones to remove word boundary markers.
    phone_map = {}
    for line in phone_map_handle.readlines():
        splits = line.strip().split()
        if len(splits) == 0:
            # tolerate blank lines in the phone map
            continue
        phone_map[splits[0]] = splits[1]

    stats_unmapped = {}
    for line in arc_info_file_handle.readlines():
        splits = line.strip().split()
        if len(splits) == 0:
            continue
        if len(splits) < 6:
            raise Exception('Invalid format of line ' + line
                                + ' in arc_info_file')
        count = float(splits[3])
        word = splits[4]
        phones = " ".join(splits[5:])
        key = (word, phones)
        stats_unmapped[key] = stats_unmapped.get(key, 0) + count

    stats = {}
    for (word, phones_str), count in stats_unmapped.items():
        mapped = " ".join([phone_map[phone] for phone in phones_str.split()])
        # Several unmapped pronunciations can collapse onto one mapped
        # pronunciation; add their counts instead of keeping only the last.
        key = (word, mapped)
        stats[key] = stats.get(key, 0) + count
    return stats

def WriteStats(stats, file_handle):
    """Write each entry of stats as "<count> <word> <phones>", then close.

    Args:
        stats: dict mapping (word, phones) to a count.
        file_handle: writable file-like object; closed on return.
    """
    for key, soft_count in stats.items():
        word = key[0]
        phones = key[1]
        print('{2} {0} {1}'.format(word, phones, soft_count),
              file=file_handle)
    file_handle.close()

def Main():
    """Accumulate pronunciation soft-counts and write them to the stats file."""
    args = GetArgs()
    pron_stats = GetStatsFromArcInfo(args.arc_info_file_handle,
                                     args.phone_map_handle)
    WriteStats(pron_stats, args.stats_file_handle)


if __name__ == "__main__":
    Main()


================================================
FILE: egs/steps/dict/internal/get_subsegments.py
================================================
#!/usr/bin/env python

# Copyright 2018 Xiaohui Zhang
# Apache 2.0.

# we're using python 3.x style print but want it to work in python 2.x,
from __future__ import print_function
import argparse
import sys
import string

def GetArgs():
    """Parse the command line and return the arguments with open file handles.

    The raw command line is echoed to stderr for logging before it is parsed.

    Returns:
        The value returned by CheckArgs() for the parsed argparse namespace.
    """
    # Adjacent string literals are concatenated verbatim, so each piece ends
    # with an explicit space to keep words from running together in --help.
    parser = argparse.ArgumentParser(
        description = "The purpose of this script is to use a ctm and a vocab file "
        "to extract sub-utterances and a sub-segmentation. Extracted sub-utterances "
        "are all the strings of consecutive in-vocab words from the ctm "
        "surrounded by an out-of-vocab word at each end if present.",
        epilog = "e.g. steps/dict/internal/get_subsegments.py exp/tri3_lex_0.4_work/phonetic_decoding/word.ctm \\ "
        "exp/tri3_lex_0.4_work/learn_vocab.txt exp/tri3_lex_0.4_work/resegmentation/subsegments \\ "
        "exp/tri3_lex_0.4_work/resegmentation/text "
        "See steps/dict/learn_lexicon_greedy.sh for an example.")

    parser.add_argument("ctm", metavar='<ctm>', type = str,
                        help = "Input ctm file. "
                        "each line must be <utt-id> <channel> <start-time> <duration> <word>")
    parser.add_argument("vocab", metavar='<vocab>', type = str,
                        help = "Vocab file. "
                        "each line must be <word>")
    parser.add_argument("subsegment", metavar='<subsegment>', type = str,
                        help = "Subsegment file. Each line is in format: "
                        "<new-utt> <old-utt> <start-time-within-old-utt> <end-time-within-old-utt>")
    parser.add_argument("text", metavar='<text>', type = str,
                        help = "Text file. Each line is in format: "
                        "<new-utt> <word1> <word2> ... <wordN>.")

    print (' '.join(sys.argv), file = sys.stderr)

    args = parser.parse_args()
    args = CheckArgs(args)

    return args

def CheckArgs(args):
    """Open the files named on the command line and attach the handles to args.

    Sets args.ctm_handle, args.vocab_handle, args.subsegment_handle and
    args.text_handle.  A path of "-" selects stdin for the ctm or the vocab
    file.  When the vocab path is the empty string, args.vocab_handle is set
    to None so that later attribute access does not fail.

    Args:
        args: namespace with ctm, vocab, subsegment and text path strings.

    Returns:
        The same namespace, with the four handle attributes added.
    """
    if args.ctm == "-":
        args.ctm_handle = sys.stdin
    else:
        args.ctm_handle = open(args.ctm)

    # Always define the attribute; the empty path means "no vocab file".
    args.vocab_handle = None
    # Compare by value: "is not ''" tests object identity, not equality.
    if args.vocab != '':
        if args.vocab == "-":
            # The vocab is an input, so "-" must mean stdin (not stdout).
            args.vocab_handle = sys.stdin
        else:
            args.vocab_handle = open(args.vocab)

    args.subsegment_handle = open(args.subsegment, 'w')
    args.text_handle = open(args.text, 'w')

    return args

def GetSubsegments(args, vocab):
    """Split the utterances of a ctm into sub-utterances and write them out.

    Reads args.ctm_handle line by line.  Each line must have at least five
    whitespace-separated fields; field 1 is the utterance id, field 3 the
    start time, field 4 the duration and field 5 the word.  Sub-utterances
    are keyed "<utt-id>-<n>" with n counting from 1 within each utterance.

    For every recorded sub-utterance, one line "<key> <words...>" is printed
    to args.text_handle and one line "<key> <utt-id> <start> <end>" to
    args.subsegment_handle, in sorted key order.

    Args:
        args: namespace providing ctm_handle, text_handle, subsegment_handle.
        vocab: container of words, tested with `word in vocab`.

    Raises:
        Exception: if a ctm line (including a blank one) has fewer than five
            fields.
    """
    sub_utt = list()
    # NOTE(review): `last_is_oov` is never read; the loop uses `is_oov_last`.
    last_is_oov = False
    # NOTE(review): despite its name, `is_oov` is set to True for '<eps>' and
    # for words found in `vocab`, and to False for all other words.
    is_oov = False
    utt_id_last = None
    start_times = {}
    end_times = {}
    sub_utts = {}
    sub_utt_id = 1
    sub_utt_id_last = 1
    # NOTE(review): `end_time_last` is never read; the loop assigns and reads
    # `ent_time_last` instead -- looks like a typo, confirm before renaming.
    end_time_last = 0.0
    for line in args.ctm_handle:
        splits = line.strip().split()
        if len(splits) < 5:
            raise Exception("problematic line",line)

        utt_id = splits[0]
        start = float(splits[2])
        dur = float(splits[3])
        word = splits[4]
        if utt_id != utt_id_last:
            # New utterance: restart numbering and record the pending
            # sub-utterance of the previous one only if it has > 1 word.
            sub_utt_id = 1
            if len(sub_utt)>1:
                sub_utts[utt_id_last+'-'+str(sub_utt_id_last)] = (utt_id_last, sub_utt)
                end_times[utt_id_last+'-'+str(sub_utt_id_last)] = ent_time_last
            sub_utt = []
            start_times[utt_id+'-'+str(sub_utt_id)] = start
            is_oov_last = False
        if word == '<eps>':
            # '<eps>' extends the current sub-utterance in time but adds no word.
            is_oov = True
            end_times[utt_id+'-'+str(sub_utt_id)] = start + dur
        elif word in vocab:
            is_oov = True
            sub_utt.append(word)
            end_times[utt_id+'-'+str(sub_utt_id)] = start + dur
        else:
            is_oov = False
            if is_oov_last == True:
                # This word closes the current sub-utterance (it is included
                # as its last word) ...
                sub_utt.append(word)
                sub_utts[utt_id+'-'+str(sub_utt_id_last)] = (utt_id, sub_utt)
                end_times[utt_id+'-'+str(sub_utt_id_last)] = start + dur
                sub_utt_id += 1
            # ... and always becomes the first word of the next candidate.
            sub_utt = [word]
            start_times[utt_id+'-'+str(sub_utt_id)] = start
        utt_id_last = utt_id
        sub_utt_id_last = sub_utt_id
        is_oov_last = is_oov
        ent_time_last = start + dur
        
    # Flush the sub-utterance still open after the last ctm line.
    if is_oov:
        # NOTE(review): when the last word is in `vocab` it was already
        # appended inside the loop, so this appends it a second time --
        # confirm whether the duplicate is intended.
        if word != '<eps>':
            sub_utt.append(word)
        sub_utts[utt_id+'-'+str(sub_utt_id_last)] = (utt_id, sub_utt)
        end_times[utt_id+'-'+str(sub_utt_id_last)] = start + dur

    for utt,v in sorted(sub_utts.items()):
        print(utt, ' '.join(sub_utts[utt][1]), file=args.text_handle)
        print(utt, sub_utts[utt][0], start_times[utt], end_times[utt], file=args.subsegment_handle)

def ReadVocab(vocab_file_handle):
    """Read a one-word-per-line vocabulary file into a set.

    Args:
        vocab_file_handle: readable file-like object; a falsy value yields an
            empty vocabulary.

    Returns:
        A set with the single word of every non-empty line.

    Raises:
        Exception: if a line holds more than one whitespace-separated field.
    """
    words = set()
    if not vocab_file_handle:
        return words
    for raw_line in vocab_file_handle.readlines():
        tokens = raw_line.strip().split()
        if not tokens:
            # blank line
            continue
        if len(tokens) > 1:
            raise Exception('Invalid format of line ' + raw_line
                                + ' in vocab file.')
        words.add(tokens[0])
    return words

def Main():
    """Read the vocab, then extract and write the sub-utterances of the ctm."""
    args = GetArgs()
    known_words = ReadVocab(args.vocab_handle)
    GetSubsegments(args, known_words)


if __name__ == "__main__":
    Main()


================================================
FILE: egs/steps/dict/internal/prune_pron_candidates.py
================================================
#!/usr/bin/env python

# Copyright 2018  Xiaohui Zhang
# Apache 2.0.

from __future__ import print_function
from collections import defaultdict
import argparse
import sys
import math

def GetArgs():
    """Parse the command line and return the arguments with open file handles.

    The raw command line is echoed to stderr for logging before it is parsed.

    Returns:
        The value returned by CheckArgs() for the parsed argparse namespace.
    """
    # Adjacent string literals are concatenated verbatim, so each piece ends
    # with an explicit space to keep words from running together in --help.
    parser = argparse.ArgumentParser(
        description = "Prune pronunciation candidates based on soft-counts from lattice-alignment "
        "outputs, and a reference lexicon. Basically, for each word we sort all pronunciation "
        "candidates according to their soft-counts, and then select the top variant-counts-ratio * N candidates "
        "(For words in the reference lexicon, N = # pron variants given by the reference "
        "lexicon; For oov words, N = avg. # pron variants per word in the reference lexicon).",
        epilog = "See steps/dict/learn_lexicon_greedy.sh for example")

    parser.add_argument("--variant-counts-ratio", type = float, default = 3.0,
                        help = "A user-specified ratio parameter which determines how many "
                        "pronunciation candidates we want to keep for each word at most.")
    parser.add_argument("pron_stats", metavar = "<pron-stats>", type = str,
                        help = "File containing soft-counts of pronunciation candidates; "
                        "each line must be <soft-counts> <word> <phones>")
    parser.add_argument("lexicon_phonetic_decoding", metavar = "<lexicon-phonetic-decoding>", type = str,
                        help = "Lexicon containing pronunciation candidates from phonetic decoding. "
                        "each line must be <word> <phones>")
    parser.add_argument("lexiconp_g2p", metavar = "<lexiconp-g2p>", type = str,
                        help = "Lexicon with probabilities for pronunciation candidates from G2P. "
                        "each line must be <prob> <word> <phones>")
    parser.add_argument("ref_lexicon", metavar = "<ref-lexicon>", type = str,
                        help = "Reference lexicon file, where we obtain # pron variants for "
                        "each word, based on which we prune the pron candidates. "
                        "Each line must be <word> <phones>")
    parser.add_argument("lexicon_phonetic_decoding_pruned", metavar = "<lexicon-phonetic-decoding-pruned>", type = str,
                        help = "Output lexicon containing pronunciation candidates from phonetic decoding after pruning. "
                        "each line must be <word> <phones>")
    parser.add_argument("lexicon_g2p_pruned", metavar = "<lexicon-g2p-pruned>", type = str,
                        help = "Output lexicon containing pronunciation candidates from G2P after pruning. "
                        "each line must be <word> <phones>")

    print (' '.join(sys.argv), file=sys.stderr)

    args = parser.parse_args()
    args = CheckArgs(args)

    return args

def CheckArgs(args):
    """Print the parsed arguments and open every input and output file.

    For each path attribute <name> a handle is stored as <name>_handle: the
    four inputs are opened for reading, the two pruned lexicons for writing.

    Returns:
        The same namespace, with the six handle attributes added.
    """
    print(args)
    input_names = ("pron_stats", "lexicon_phonetic_decoding",
                   "lexiconp_g2p", "ref_lexicon")
    output_names = ("lexicon_phonetic_decoding_pruned", "lexicon_g2p_pruned")
    for name in input_names:
        setattr(args, name + "_handle", open(getattr(args, name)))
    for name in output_names:
        setattr(args, name + "_handle", open(getattr(args, name), "w"))
    return args

def ReadStats(pron_stats_handle):
    """Read pronunciation soft-counts, grouped by word.

    Each non-empty line is "<count> <word> <phones...>".

    Args:
        pron_stats_handle: readable file-like object.

    Returns:
        A defaultdict(list) mapping each word to a list of (phones, count)
        tuples in file order.

    Raises:
        Exception: if a non-empty line has fewer than two fields.
    """
    per_word = defaultdict(list)
    for raw_line in pron_stats_handle.readlines():
        fields = raw_line.strip().split()
        if not fields:
            continue
        if len(fields) < 2:
            raise Exception('Invalid format of line ' + raw_line
                                + ' in stats file.')
        soft_count = float(fields[0])
        pron = ' '.join(fields[2:])
        per_word[fields[1]].append((pron, soft_count))

    return per_word

def ReadLexicon(lexicon_handle):
    """Read a "<word> <phones...>" lexicon, grouped by word.

    Args:
        lexicon_handle: readable file-like object.

    Returns:
        A defaultdict(set) mapping each word to the set of its phone strings.

    Raises:
        Exception: if a non-empty line has fewer than two fields.
    """
    prons_by_word = defaultdict(set)
    for raw_line in lexicon_handle.readlines():
        fields = raw_line.strip().split()
        if not fields:
            continue
        if len(fields) < 2:
            raise Exception('Invalid format of line ' + raw_line
                                + ' in lexicon file.')
        prons_by_word[fields[0]].add(' '.join(fields[1:]))
    return prons_by_word

def ReadLexiconp(lexiconp_handle):
    """Read a "<prob> <word> <phones...>" lexicon with probabilities.

    Args:
        lexiconp_handle: readable file-like object.

    Returns:
        A pair (lexicon, pron_probs): lexicon is a defaultdict(set) mapping
        each word to its phone strings, pron_probs a defaultdict(float)
        mapping (word, phones) to the probability read from the file.

    Raises:
        Exception: if a non-empty line has fewer than three fields.
    """
    prons_by_word = defaultdict(set)
    probs = defaultdict(float)
    for raw_line in lexiconp_handle.readlines():
        fields = raw_line.strip().split()
        if not fields:
            continue
        if len(fields) < 3:
            raise Exception('Invalid format of line ' + raw_line
                                + ' in lexicon file.')
        word = fields[1]
        prob = float(fields[0])
        pron = ' '.join(fields[2:])
        probs[(word, pron)] = prob
        prons_by_word[word].add(pron)
    return prons_by_word, probs

def PruneProns(args, stats, ref_lexicon, lexicon_phonetic_decoding, lexicon_g2p, lexicon_g2p_probs):
    """Select the best-scoring pronunciation candidates for each word.

    For every word in stats the candidates are ranked by count, and the top
    ones are written as "<word> <pron>" lines: to the phonetic-decoding
    output if the pron is in lexicon_phonetic_decoding[word], and to the G2P
    output if it is in lexicon_g2p[word] (a pron found in both is written to
    both and counted twice).  Prons already in the reference lexicon are
    skipped without being counted.  Selection for a word stops once
    variant_counts_ratio * N candidates were written, where N is the number
    of reference prons of the word, or the rounded average number of prons
    per word in the reference lexicon for words not in it.

    Args:
        args: namespace providing variant_counts_ratio,
            lexicon_phonetic_decoding_pruned_handle and
            lexicon_g2p_pruned_handle.
        stats: dict mapping word to a list of (pron, count); modified in
            place (G2P candidates are appended, lists are sorted and popped).
        ref_lexicon: dict mapping word to a set of prons.
        lexicon_phonetic_decoding: mapping word -> set of prons; indexed for
            every word in stats.
        lexicon_g2p: mapping word -> set of prons; indexed for every word in
            stats.
        lexicon_g2p_probs: mapping (word, pron) -> G2P probability.

    Raises:
        ZeroDivisionError: if ref_lexicon is empty.
    """
    # G2P candidates without stats are appended with a score of (prob - 1.0),
    # so they rank among themselves by probability.
    # dict.items() is used instead of iteritems(), which exists only on
    # Python 2.
    for word, entry in stats.items():
        prons_with_stats = set(pron for pron, _ in entry)
        for pron in lexicon_g2p[word]:
            if pron not in prons_with_stats:
                entry.append((pron, lexicon_g2p_probs[(word, pron)]-1.0))
        # Ascending order, so that pop() below yields the best candidate.
        entry.sort(key=lambda x: x[1])

    # Compute the average # pron variants counts per word in the reference lexicon.
    num_words_ref = len(ref_lexicon)
    num_prons_ref = sum(len(prons) for prons in ref_lexicon.values())
    avg_variant_counts_ref = round(float(num_prons_ref) / float(num_words_ref))

    for word, entry in stats.items():
        if word in ref_lexicon:
            variant_counts = args.variant_counts_ratio * len(ref_lexicon[word])
        else:
            variant_counts = args.variant_counts_ratio * avg_variant_counts_ref
        num_variants = 0
        while entry and num_variants < variant_counts:
            pron, _ = entry.pop()
            if word in ref_lexicon and pron in ref_lexicon[word]:
                continue
            if pron in lexicon_phonetic_decoding[word]:
                num_variants += 1
                print('{0} {1}'.format(word, pron), file=args.lexicon_phonetic_decoding_pruned_handle)
            if pron in lexicon_g2p[word]:
                num_variants += 1
                print('{0} {1}'.format(word, pron), file=args.lexicon_g2p_pruned_handle)

def Main():
    """Load the lexicons and soft-counts, then write the pruned lexicons."""
    args = GetArgs()
    reference = ReadLexicon(args.ref_lexicon_handle)
    phonetic_decoding = ReadLexicon(args.lexicon_phonetic_decoding_handle)
    g2p, g2p_probs = ReadLexiconp(args.lexiconp_g2p_handle)
    pron_stats = ReadStats(args.pron_stats_handle)

    PruneProns(args, pron_stats, reference, phonetic_decoding, g2p, g2p_probs)


if __name__ == "__main__":
    Main()


================================================
FILE: egs/steps/dict/internal/sum_arc_info.py
================================================
#!/usr/bin/env python

# Copyright 2018   Xiaohui Zhang
# Apache 2.0

from __future__ import print_function
from collections import defaultdict
import argparse
import sys

class StrToBoolAction(argparse.Action):
    """ An argparse action that stores True for the shell-style string
        "true" and False for "false"; any other value raises an Exception. """
    def __call__(self, parser, namespace, values, option_string=None):
        if values not in ("true", "false"):
            raise Exception("Unknown value {0} for --{1}".format(values, self.dest))
        setattr(namespace, self.dest, values == "true")


def GetArgs():
    """Parse the command line and return the arguments with open file handles.

    The raw command line is echoed to stderr for logging before it is parsed.

    Returns:
        The value returned by CheckArgs() for the parsed argparse namespace.
    """
    # Adjacent string literals are concatenated verbatim, so each piece ends
    # with an explicit space to keep words from running together in --help.
    parser = argparse.ArgumentParser(
        description = "Accumulate statistics from per arc lattice statistics "
        "for lexicon learning",
        epilog = "See steps/dict/learn_lexicon_greedy.sh for example")

    parser.add_argument("--set-sum-to-one", type = str, default = True,
                        action = StrToBoolAction, choices = ["true", "false"],
                        help = "If normalize posteriors such that the sum of "
                        "pronunciation posteriors of a word in an utterance is 1.")
    # The field orders below follow what Main() reads and prints.
    parser.add_argument("arc_info_file", metavar = "<arc-info-file>", type = str,
                        help = "File containing per arc statistics; "
                        "each line must be <utt-id> <start-frame> <duration> <posterior> <word> "
                        "<phones-with-word-boundary-markers>")
    parser.add_argument("phone_map", metavar = "<phone-map>", type = str,
                        help = "An input phone map used to remove word boundary markers from phones; "
                        "generated in steps/cleanup/debug_lexicon.sh")
    parser.add_argument("stats_file", metavar = "<out-stats-file>", type = str,
                        help = "Write accumulated statistics to this file; "
                        "each line is <word> <utt-id> <start-frame> <posterior> "
                        "<phones-without-word-boundary-markers>")

    print (' '.join(sys.argv), file=sys.stderr)

    args = parser.parse_args()
    args = CheckArgs(args)

    return args

def CheckArgs(args):
    """Open the input/output files and attach the handles to args.

    A path of "-" selects stdin for the arc-info input and stdout for the
    stats output; the phone map is always opened from its path.

    Returns:
        The same namespace, with arc_info_file_handle, phone_map_handle and
        stats_file_handle set.
    """
    use_stdin = (args.arc_info_file == "-")
    args.arc_info_file_handle = sys.stdin if use_stdin else open(args.arc_info_file)

    args.phone_map_handle = open(args.phone_map)

    use_stdout = (args.stats_file == "-")
    args.stats_file_handle = sys.stdout if use_stdout else open(args.stats_file, "w")

    return args

def Main():
    """Sum arc posteriors per (word, utterance, pronunciation) and write them.

    Reads the phone map and the arc-info file, maps every arc's phones
    through the phone map, and sums the posteriors of arcs sharing the same
    word, utterance and mapped pronunciation ('<eps>' arcs are ignored).
    One line "<word> <utt-id> <start-frame> <posterior> <phones>" is written
    per pronunciation, where <start-frame> is that of the first arc seen for
    the (word, utterance) pair.

    When --set-sum-to-one is true (the default) the posteriors of each
    (word, utterance) pair are normalized to sum to one; when it is false
    the summed posteriors are written unchanged.

    Raises:
        Exception: if a non-empty arc-info line has fewer than six fields.
        KeyError: if a phone is missing from the phone map.
    """
    args = GetArgs()

    phone_map = {}
    for line in args.phone_map_handle.readlines():
        splits = line.strip().split()
        phone_map[splits[0]] = splits[1]

    # (word, utt) -> start frame of the first arc seen for that pair.
    start_frames = {}
    # (word, utt) -> {mapped phones -> summed posterior}.
    stats = defaultdict(lambda : defaultdict(float))

    for line in args.arc_info_file_handle.readlines():
        splits = line.strip().split()

        if (len(splits) == 0):
            continue

        if (len(splits) < 6):
            raise Exception('Invalid format of line ' + line
                                + ' in ' + args.arc_info_file)

        utt = splits[0]
        start_frame = int(splits[1])
        count = float(splits[3])
        word = splits[4]
        phones = ' '.join([phone_map[phone] for phone in splits[5:]])
        if word == '<eps>':
            continue
        if (word, utt) not in start_frames:
            start_frames[(word, utt)] = start_frame

        stats[(word, utt)][phones] += count

    for (word, utt), pron_counts in stats.items():
        count_sum = sum(pron_counts.values())
        # By default we normalize the pron posteriors of each word in each utterance,
        # so that they sum up exactly to one. If a word occurs two times in a utterance,
        # the effect of this operation is to average the posteriors of these two occurences
        # so that there's only one "equivalent occurence" of this word in the utterance.
        # However, this case should be extremely rare if the utterances are already
        # short sub-utterances produced by steps/dict/internal/get_subsegments.py
        for phones, count in pron_counts.items():
            if args.set_sum_to_one:
                count = count / count_sum
            print(word, utt, start_frames[(word, utt)], count, phones, file=args.stats_file_handle)

    args.stats_file_handle.close()


if __name__ == "__main__":
    Main()


================================================
FILE: egs/steps/dict/learn_lexicon_bayesian.sh
================================================
#! /bin/bash

# Copyright 2016  Xiaohui Zhang
#           2016  Vimal Manohar
# Apache 2.0

# This script demonstrates how to expand an existing lexicon using a combination
# of acoustic evidence and G2P to learn a lexicon that covers words in a target
# vocab, and agrees sufficiently with the acoustics. The basic idea is to
# run phonetic decoding on acoustic training data using an existing
# acoustic model (possibly re-trained using a G2P-expanded lexicon) to get
# alternative pronunciations for words in training data. Then we combine three
# exclusive sources of pronunciations: the reference lexicon (supposedly
# hand-derived), phonetic decoding, and G2P (optional) into one lexicon and then run
# lattice alignment on the same data, to collect acoustic evidence (soft
# counts) of all pronunciations. Based on these statistics, and
# user-specified prior-counts (parameterized by prior mean and prior-counts-tot,
# assuming the prior follows a Dirichlet distribution), we then use a Bayesian
# framework to compute posteriors of all pronunciations for each word,
# and then select best pronunciations for each word. The output is a final learned lexicon
# whose vocab matches the user-specified target-vocab, and two intermediate results:
# an edits file which records the recommended changes to all in-ref-vocab words'
# prons, and a half-learned lexicon where all in-ref-vocab words' prons were untouched
# (on top of which we apply the edits file to produce the final learned lexicon).
# The user can always modify the edits file manually and then re-apply it on the
# half-learned lexicon using steps/dict/apply_lexicon_edits.py to produce the final
# learned lexicon. See the last stage in this script for details.


# Begin configuration section.  
cmd=run.pl
nj=4
stage=0

oov_symbol=
lexicon_g2p=

min_prob=0.3
variant_counts_ratio=8 
variants_prob_mass=0.7
variants_prob_mass_ref=0.9

prior_counts_tot=15
prior_mean="0.7,0.2,0.1"
num_gauss=
num_leaves=
retrain_src_mdl=false

cleanup=true
# End configuration section.  

. ./path.sh
. utils/parse_options.sh

if [ $# -lt 6 ] || [ $# -gt 7 ]; then
  echo "Usage: $0 [options] <ref-dict> <target-vocab> <data> \\"
  echo "                    <src-mdl-dir> <ref-lang> <dest-dict> [ <tmp-dir> ]"
  echo "e.g.: $0 --oov-symbol \"<UNK>\" data/local/dict data/local/lm/librispeech-vocab.txt data/train \\"
  echo "                               exp/tri3 data/lang data/local/dict_learned"
  echo "" 
  echo "  This script does lexicon expansion using a combination of acoustic"
  echo "  evidence and G2P to produce a lexicon that covers words of a target vocab:"
  echo ""               
  echo "Arguments:"
  echo " <ref-dict>     the dir which contains the reference lexicon (most probably hand-derived)"
  echo "                we want to expand/improve, and nonsilence_phones.txt,.etc which we need " 
  echo "                for building new dict dirs."
  echo " <target-vocab> the vocabulary we want the final learned lexicon to cover (one word per line)."
  echo " <data>         acoustic training data we use to get alternative"
  echo "                pronunciations and collet acoustic evidence."
  echo " <src-mdl-dir>  The dir containing an SAT-GMM acoustic model (we optionaly we re-train it" 
  echo "                using G2P expanded lexicon) to do phonetic decoding (to get alternative"
  echo "                pronunciations) and lattice-alignment (to collect acoustic evidence for"
  echo "                evaluating all prounciations)"
  echo " <ref-lang>     the reference lang dir which we use to get non-scored-words"
  echo "                like <UNK> for building new dict dirs"
  echo " <dest-dict>    the dict dir where we put the final learned lexicon, whose vocab"
  echo "                matches <target-vocab>"
  echo " [ <tmp-dir> ]  the temporary dir where most of the intermediate outputs are stored"
  echo "                (default: \${src-mdl-dir}_lex_learn_work)"
  echo ""
  echo "Note: <target-vocab> and the vocab of <data> don't have to match. For words"
  echo "     who are in <target-vocab> but not seen in <data>, their pronunciations" 
  echo "     will be given by G2P at the end."
  echo ""
  echo "Options:"
  echo "  --stage <n>                  # stage to run from, to enable resuming from partially"
  echo "                               # completed run (default: 0)"
  echo "  --cmd '$cmd'                 # command to submit jobs with (e.g. run.pl, queue.pl)"
  echo "  --nj <nj>                    # number of parallel jobs"
  echo "  --oov-symbol <unk_symbol>    # (required option) oov symbol, like <UNK>."
  echo "  --lexicon-g2p                # A lexicon file containing g2p generated pronunciations, for words in acoustic training "
  echo "                               # data / target vocabulary. It's optional."
  echo "  --min-prob <float>           # The cut-off parameter used to select pronunciation candidates from phonetic"
  echo "                               # decoding. We remove pronunciations with probabilities less than this value"
  echo "                               # after normalizing the probs s.t. the max-prob is 1.0 for each word."
  echo "  --variant-counts-ratio <int> # This ratio parameter determines the maximum number of pronunciation"
  echo "                               # candidates we will keep for each word, after pruning according to lattice statistics from"
  echo "                               # the first iteration of lattice generation. See steps/dict/internal/prune_pron_candidates.py"
  echo "                               # for details."
  echo "  --prior-mean                 # Mean of priors (summing up to 1) assigned to three exclusive pronunciation"
  echo "         <float,float,float>   # source: reference lexicon, g2p, and phonetic decoding (used in the Bayesian"
  echo "                               # pronunciation selection procedure). We recommend setting a larger prior"
  echo "                               # mean for the reference lexicon, e.g. '0.6,0.2,0.2'."
  echo "  --prior-counts-tot <float>   # Total amount of prior counts we add to all pronunciation candidates of"
  echo "                               # each word. By timing it with the prior mean of a source, and then dividing"
  echo "                               # by the number of candidates (for a word) from this source, we get the"
  echo "                               # prior counts we actually add to each candidate."
  echo "  --variants-prob-mass <float> # In the Bayesian pronunciation selection procedure, for each word, we"
  echo "                               # choose candidates (from all three sources) with highest posteriors"
  echo "                               # until the total prob mass hit this amount."
  echo "                               # It's used in a similar fashion when we apply G2P."
  echo "  --variants-prob-mass-ref     # In the Bayesian pronunciation selection procedure, for each word,"
  echo "                               # after the total prob mass of selected candidates hit variants-prob-mass,"
  echo "                               # we continue to pick up reference candidates with highest posteriors"
  echo "                               # until the total prob mass hit this amount (must >= variants-prob-mass)."
  echo "  --num-gauss                  # number of gaussians for the re-trained SAT model (on top of <src-mdl-dir>)."            
  echo "  --num-leaves                 # number of leaves for the re-trained SAT model (on top of <src-mdl-dir>)." 
  echo "  --retrain-src-mdl            # true if you want to re-train the src_mdl before phone decoding (default false)."
  exit 1
fi

# Log the exact invocation so a run can be reproduced from its log.
echo "$0 $@"

# Positional arguments, in the order given in the usage message.
ref_dict=$1
target_vocab=$2
data=$3
src_mdl_dir=$4
ref_lang=$5
dest_dict=$6

# --oov-symbol has no default, so refuse to continue without it.
[ -n "$oov_symbol" ] || { echo "$0: the --oov-symbol option is required."; exit 1; }

# Work dir: the optional 7th argument when one was passed (even if empty),
# otherwise a dir named after <src-mdl-dir>.
dir=${7-${src_mdl_dir}_lex_learn_work}

mkdir -p $dir

# Stage 0: derive, under $dir, the files every later stage reads: training word
# counts, non-scored words/entries, the cleaned reference lexicon and vocab, the
# target vocab, the target number of prons per word, and the G2P lexicon.
if [ $stage -le 0 ]; then
  echo "$0: Some preparatory work."
  # Get the word counts of training data.
  awk '{for (n=2;n<=NF;n++) counts[$n]++;} END{for (w in counts) printf "%s %d\n",w, counts[w];}' \
    $data/text | sort > $dir/train_counts.txt

  # Get the non-scored entries and exclude them from the reference lexicon/vocab, and target_vocab.
  steps/cleanup/internal/get_non_scored_words.py $ref_lang > $dir/non_scored_words
  awk 'NR==FNR{a[$1] = 1; next} {if($1 in a) print $0}' $dir/non_scored_words \
    $ref_dict/lexicon.txt > $dir/non_scored_entries

  # Remove non-scored-words from the reference lexicon. The trailing awk '$1=$1'
  # rebuilds each line so that fields are separated by single spaces.
  awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $0}' $dir/non_scored_words \
    $ref_dict/lexicon.txt | tr -s '\t' ' ' | awk '$1=$1' > $dir/ref_lexicon.txt

  cat $dir/ref_lexicon.txt | awk '{print $1}' | sort | uniq > $dir/ref_vocab.txt
  awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $0}' $dir/non_scored_words \
    $target_vocab | sort | uniq > $dir/target_vocab.txt

  # From the reference lexicon, we estimate the target_num_prons_per_word as,
  # round(avg. # prons per word in the reference lexicon). This'll be used as
  # the upper bound of # pron variants per word when we apply G2P or select prons to
  # construct the learned lexicon in later stages.
  # print(...) with parentheses is valid under both Python 2 and Python 3; the
  # former "print int(...)" statement is a syntax error under Python 3.
  python -c 'import sys; print(int(round(float(sys.argv[1])/float(sys.argv[2]))))' \
    `wc -l $dir/ref_lexicon.txt | awk '{print $1}'` `wc -l $dir/ref_vocab.txt | awk '{print $1}'` \
    > $dir/target_num_prons_per_word || exit 1;

  if [ -z "$lexicon_g2p" ]; then
    # create an empty list of g2p generated prons, if it's not given.
    touch $dir/lexicon_g2p.txt
  else
    # Copy the G2P lexicon, aborting on any entry that has a word but no phones.
    cat $lexicon_g2p | awk '{if (NF<2) {print "There is an empty pronunciation in lexicon_g2p.txt. Exit." \
      > "/dev/stderr"; exit 1} print $0}' - > $dir/lexicon_g2p.txt || exit 1;
  fi
fi

# Stage 1 (only when $retrain_src_mdl is true): build a dict/lang covering the
# target vocab (reference lexicon plus G2P prons for OOVs), align the training
# data with the source model, and train a new SAT model on top of it.
if [ $stage -le 1 ] && $retrain_src_mdl; then
  echo "$0: Expand the reference lexicon to cover all words in the target vocab. and then"
  echo "   ... re-train the source acoustic model for phonetic decoding. "
  mkdir -p $dir/dict_expanded_target_vocab
  cp $ref_dict/{extra_questions.txt,optional_silence.txt,nonsilence_phones.txt,silence_phones.txt} \
    $dir/dict_expanded_target_vocab  2>/dev/null
  rm $dir/dict_expanded_target_vocab/lexiconp.txt $dir/dict_expanded_target_vocab/lexicon.txt 2>/dev/null

  # Get the oov words list (w.r.t ref vocab) which are in the target vocab.
  awk 'NR==FNR{a[$1] = 1; next} !($1 in a)' $dir/ref_lexicon.txt \
    $dir/target_vocab.txt | sort | uniq > $dir/oov_target_vocab.txt

  # Assign pronunciations from lexicon_g2p.txt to oov_target_vocab. For words which
  # cannot be found in lexicon_g2p.txt, we simply ignore them.
  awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/oov_target_vocab.txt \
    $dir/lexicon_g2p.txt > $dir/lexicon_g2p_oov_target_vocab.txt

  cat $dir/lexicon_g2p_oov_target_vocab.txt $dir/ref_lexicon.txt | \
    awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/target_vocab.txt - | \
    cat $dir/non_scored_entries - |
    sort | uniq > $dir/dict_expanded_target_vocab/lexicon.txt

  utils/prepare_lang.sh --phone-symbol-table $ref_lang/phones.txt $dir/dict_expanded_target_vocab \
    "$oov_symbol" $dir/lang_expanded_target_vocab_tmp $dir/lang_expanded_target_vocab || exit 1;

  # Use $train_cmd when the caller's environment provides it; otherwise fall
  # back to the job command given with --cmd, so that an empty --cmd is never
  # handed to the alignment/training scripts.
  sat_cmd="${train_cmd:-$cmd}"

  # Align the acoustic training data using the given src_mdl_dir.
  alidir=${src_mdl_dir}_ali_$(basename $data)
  steps/align_fmllr.sh --nj $nj --cmd "$sat_cmd" \
    $data $dir/lang_expanded_target_vocab $src_mdl_dir $alidir || exit 1;

  # Train another SAT system on the given data and put it in $dir/${src_mdl_dir}_retrained
  # this model will be used for phonetic decoding and lattice alignment later on.
  if [ -z "$num_leaves" ] || [ -z "$num_gauss" ] ; then
    # infer the model parameters using the initial GMM
    # NOTE(review): awk evaluates $NF-1 as (last field) - 1, i.e. the count
    # reported by gmm-info minus one -- confirm that this is intended.
    num_leaves=`gmm-info ${src_mdl_dir}/final.mdl  | grep 'pdfs' | awk '{print $NF-1}'`
    num_gauss=`gmm-info ${src_mdl_dir}/final.mdl  | grep 'gaussians' | awk '{print $NF-1}'`
  fi
  steps/train_sat.sh --cmd "$sat_cmd" $num_leaves $num_gauss \
    $data $dir/lang_expanded_target_vocab $alidir $dir/${src_mdl_dir}_retrained || exit 1;
fi

# Stage 2: build dict_expanded_train / lang_expanded_train, a lexicon covering
# every word seen in the training text: reference prons, G2P prons for OOVs, and
# the oov symbol's pron for OOVs that have no G2P pron.
if [ $stage -le 2 ]; then
  echo "$0: Expand the reference lexicon to cover all words seen in,"
  echo "  ... acoustic training data, and prepare corresponding dict and lang directories."
  echo "  ... This is needed when generate pron candidates from phonetic decoding."
  mkdir -p $dir/dict_expanded_train
  cp $ref_dict/{extra_questions.txt,optional_silence.txt,nonsilence_phones.txt,silence_phones.txt} \
    $dir/dict_expanded_train 2>/dev/null
  rm $dir/dict_expanded_train/lexiconp.txt $dir/dict_expanded_train/lexicon.txt 2>/dev/null

  # Get the oov words list (w.r.t ref vocab) which are in training data.
  awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $1}' $dir/ref_lexicon.txt \
    $dir/train_counts.txt | awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $0}' \
    $dir/non_scored_words - | sort > $dir/oov_train.txt || exit 1;

  # Token-level OOV rate: counts of words outside ref_vocab over all counts.
  awk 'NR==FNR{a[$1] = 1; next} {if(($1 in a)) b+=$2; else c+=$2} END{print c/(b+c)}' \
    $dir/ref_vocab.txt $dir/train_counts.txt > $dir/train_oov_rate || exit 1;

  echo "OOV rate (w.r.t. the reference lexicon) of the acoustic training data is:"
  cat $dir/train_oov_rate

  # Assign pronunciations from lexicon_g2p to oov_train. For words which
  # cannot be found in lexicon_g2p, we simply assign oov_symbol's pronunciation
  # (like NSN) to them, in order to get phonetic decoding pron candidates for them later on.
  awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/oov_train.txt \
    $dir/lexicon_g2p.txt > $dir/g2p_prons_for_oov_train.txt || exit 1;

  # Get the pronunciation of oov_symbol: the second field of the first
  # non-scored entry whose word is exactly $oov_symbol. An exact match on the
  # word column avoids picking up other entries that merely contain the symbol
  # as a substring, which would yield a multi-line value.
  oov_pron=$(awk -v oov="$oov_symbol" '$1 == oov {print $2; exit}' $dir/non_scored_entries)
  # For oov words in training data for which we don't even have G2P pron candidates,
  # we simply assign them the pronunciation of the oov symbol (like <unk>).
  awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $1}' $dir/g2p_prons_for_oov_train.txt \
    $dir/oov_train.txt | awk -v op="$oov_pron" '{print $0" "op}' > $dir/oov_train_no_pron.txt || exit 1;

  # Keep only words seen in training, then add back the non-scored entries.
  cat $dir/oov_train_no_pron.txt $dir/g2p_prons_for_oov_train.txt $dir/ref_lexicon.txt | \
    awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/train_counts.txt - | \
    cat - $dir/non_scored_entries | \
    sort | uniq > $dir/dict_expanded_train/lexicon.txt || exit 1;

  utils/prepare_lang.sh --phone-symbol-table $ref_lang/phones.txt $dir/dict_expanded_train "$oov_symbol" \
    $dir/lang_expanded_train_tmp $dir/lang_expanded_train || exit 1;
fi

# Stage 3: run phonetic decoding on the training data and turn the resulting
# pron counts into a lexicon of candidates not already present in the
# reference / G2P lexicons.
if [ $stage -le 3 ]; then
  echo "$0: Generate pronunciation candidates from phonetic decoding on acoustic training data.."
  if $retrain_src_mdl; then mdl_dir=$dir/${src_mdl_dir}_retrained; else mdl_dir=$src_mdl_dir; fi
  # Use $decode_cmd when the caller's environment provides it; otherwise fall
  # back to the job command given with --cmd.
  steps/cleanup/debug_lexicon.sh --nj $nj --cmd "${decode_cmd:-$cmd}" $data $dir/lang_expanded_train \
    $mdl_dir $dir/dict_expanded_train/lexicon.txt $dir/phonetic_decoding || exit 1;

  # We prune the phonetic decoding generated prons relative to the largest count, by setting "min_prob",
  # and only leave prons who are not present in the reference lexicon / g2p-generated lexicon.
  cat $dir/ref_lexicon.txt $dir/lexicon_g2p.txt | sort -u > $dir/phonetic_decoding/filter_lexicon.txt

  # Stop here if the conversion job fails; otherwise the pipeline below would
  # silently write an empty candidate lexicon.
  $cmd $dir/phonetic_decoding/log/prons_to_lexicon.log steps/dict/prons_to_lexicon.py \
    --min-prob=$min_prob --filter-lexicon=$dir/phonetic_decoding/filter_lexicon.txt \
    $dir/phonetic_decoding/prons.txt $dir/lexicon_phonetic_decoding_with_eps.txt || exit 1;
  # Drop entries containing <eps>, <UNK>/<unk> or a bracketed token.
  cat $dir/lexicon_phonetic_decoding_with_eps.txt | grep -vP "<eps>|<UNK>|<unk>|\[.*\]" | \
    sort | uniq > $dir/lexicon_phonetic_decoding.txt || exit 1;
fi

# Stage 4: combine all three pron sources into dict_combined_iter1, generate
# lattices for the training data with it, and collect per-pronunciation soft
# counts into lats_iter1/pron_stats.txt.
if [ $stage -le 4 ]; then
  echo "$0: Combine the reference lexicon and pronunciations from phone-decoding/G2P into one"
  echo "  ... lexicon, and run lattice alignment using this lexicon on acoustic training data"
  echo "  ... to collect acoustic evidence."
  # Combine the reference lexicon, pronunciations from G2P and phonetic decoding into one lexicon.
  mkdir -p $dir/dict_combined_iter1
  cp $ref_dict/{extra_questions.txt,optional_silence.txt,nonsilence_phones.txt,silence_phones.txt} \
    $dir/dict_combined_iter1/ 2>/dev/null
  rm $dir/dict_combined_iter1/lexiconp.txt $dir/dict_combined_iter1/lexicon.txt 2>/dev/null

  # Filter out words which don't appear in the acoustic training data
  cat $dir/lexicon_phonetic_decoding.txt $dir/lexicon_g2p.txt \
    $dir/ref_lexicon.txt | tr -s '\t' ' ' | \
    awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/train_counts.txt - | \
    cat $dir/non_scored_entries - | \
    sort | uniq > $dir/dict_combined_iter1/lexicon.txt

  utils/prepare_lang.sh --phone-symbol-table $ref_lang/phones.txt \
    $dir/dict_combined_iter1 "$oov_symbol" \
    $dir/lang_combined_iter1_tmp $dir/lang_combined_iter1 || exit 1;

  # Generate lattices for the acoustic training data with the combined lexicon.
  # Use $decode_cmd when the caller's environment provides it; otherwise fall
  # back to the job command given with --cmd.
  if $retrain_src_mdl; then mdl_dir=$dir/${src_mdl_dir}_retrained; else mdl_dir=$src_mdl_dir; fi
  steps/align_fmllr_lats.sh --acoustic-scale 0.05 --cmd "${decode_cmd:-$cmd}" --nj $nj \
    $data $dir/lang_combined_iter1 $mdl_dir $dir/lats_iter1 || exit 1;

  # Get arc level information from the lattice. The escaped pipes and the
  # quoted '>' are passed through to $cmd so that they run inside each job.
  $cmd JOB=1:$nj $dir/lats_iter1/log/get_arc_info.JOB.log \
    lattice-align-words $dir/lang_combined_iter1/phones/word_boundary.int \
    $dir/lats_iter1/final.mdl \
    "ark:gunzip -c $dir/lats_iter1/lat.JOB.gz |" ark:- \| \
    lattice-arc-post --acoustic-scale=0.1 $dir/lats_iter1/final.mdl ark:- - \| \
    utils/int2sym.pl -f 5 $dir/lang_combined_iter1/words.txt \| \
    utils/int2sym.pl -f 6- $dir/lang_combined_iter1/phones.txt '>' \
    $dir/lats_iter1/arc_info_sym.JOB.txt || exit 1;

  # Get soft counts of all pronunciations from arc level information.
  cat $dir/lats_iter1/arc_info_sym.*.txt | steps/dict/get_pron_stats.py - \
    $dir/phonetic_decoding/phone_map.txt $dir/lats_iter1/pron_stats.txt || exit 1;
fi

# Stage 5: prune G2P / phonetic-decoding candidates using the iter1 statistics,
# rebuild the combined dict/lang (iter2), regenerate lattices and collect the
# soft counts into lats_iter2/pron_stats.txt.
if [ $stage -le 5 ]; then
  echo "$0: Prune the pronunciation candidates generated from G2P/phonetic decoding, and re-do lattice-alignment."
  mkdir -p $dir/dict_combined_iter2
  cp $ref_dict/{extra_questions.txt,optional_silence.txt,nonsilence_phones.txt,silence_phones.txt} \
    $dir/dict_combined_iter2/ 2>/dev/null
  rm $dir/dict_combined_iter2/lexiconp.txt $dir/dict_combined_iter2/lexicon.txt 2>/dev/null

  # Prune away pronunciations which have low acoustic evidence from the first pass of lattice alignment.
  # NOTE(review): this reads $dir/lexiconp_g2p.txt, whereas the other stages of
  # this script only write and read $dir/lexicon_g2p.txt -- confirm which file
  # (and which column layout) prune_pron_candidates.py expects here.
  # Stop on failure so that later steps do not run on missing pruned lexicons.
  $cmd $dir/lats_iter1/log/prune_pron_candidates.log steps/dict/internal/prune_pron_candidates.py \
    --variant-counts-ratio $variant_counts_ratio \
    $dir/lats_iter1/pron_stats.txt $dir/lexicon_phonetic_decoding.txt $dir/lexiconp_g2p.txt $dir/ref_lexicon.txt \
    $dir/lexicon_phonetic_decoding_pruned.txt $dir/lexicon_g2p_pruned.txt || exit 1;

  # Filter out words which don't appear in the acoustic training data
  cat $dir/lexicon_phonetic_decoding_pruned.txt $dir/lexicon_g2p_pruned.txt \
    $dir/ref_lexicon.txt | tr -s '\t' ' ' | \
    awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/train_counts.txt - | \
    cat $dir/non_scored_entries - | \
    sort | uniq > $dir/dict_combined_iter2/lexicon.txt

  utils/prepare_lang.sh --phone-symbol-table $ref_lang/phones.txt \
    $dir/dict_combined_iter2 "$oov_symbol" \
    $dir/lang_combined_iter2_tmp $dir/lang_combined_iter2 || exit 1;

  # Use $decode_cmd when the caller's environment provides it; otherwise fall
  # back to the job command given with --cmd.
  if $retrain_src_mdl; then mdl_dir=$dir/${src_mdl_dir}_retrained; else mdl_dir=$src_mdl_dir; fi
  steps/align_fmllr_lats.sh --cmd "${decode_cmd:-$cmd}" --nj $nj \
    $data $dir/lang_combined_iter2 $mdl_dir $dir/lats_iter2 || exit 1;

  # Get arc level information from the lattice. The escaped pipes and the
  # quoted '>' are passed through to $cmd so that they run inside each job.
  $cmd JOB=1:$nj $dir/lats_iter2/log/get_arc_info.JOB.log \
    lattice-align-words $dir/lang_combined_iter2/phones/word_boundary.int \
    $dir/lats_iter2/final.mdl \
    "ark:gunzip -c $dir/lats_iter2/lat.JOB.gz |" ark:- \| \
    lattice-arc-post --acoustic-scale=0.1 $dir/lats_iter2/final.mdl ark:- - \| \
    utils/int2sym.pl -f 5 $dir/lang_combined_iter2/words.txt \| \
    utils/int2sym.pl -f 6- $dir/lang_combined_iter2/phones.txt '>' \
    $dir/lats_iter2/arc_info_sym.JOB.txt || exit 1;

  # Get soft counts of all pronunciations from arc level information.
  cat $dir/lats_iter2/arc_info_sym.*.txt | steps/dict/get_pron_stats.py - \
    $dir/phonetic_decoding/phone_map.txt $dir/lats_iter2/pron_stats.txt || exit 1;
fi

# Stage 6: run the Bayesian pronunciation selection on the iter2 statistics and
# write, under $dir/lats_iter2, the pron posteriors, the learned prons for
# out-of-ref-vocab words, and the edits file for in-ref-vocab words.
if [ $stage -le 6 ]; then
  echo "$0: Select pronunciations according to the acoustic evidence from lattice alignment."
  # Given the acoustic evidence (soft-counts), we use a Bayesian framework to select pronunciations
  # from three exclusive candidate sources: reference (hand-derived) lexicon, G2P and phonetic decoding.
  # The posteriors for all candidate prons for all words are printed into pron_posteriors.txt
  # For words which are out of the ref. vocab, the learned prons are written into out_of_ref_vocab_prons_learned.txt.
  # Among them, for words without acoustic evidence, we just ignore them, even if pron candidates from G2P were provided).
  # For words in the ref. vocab, we instead output a human readable & editable "edits" file called
  # ref_lexicon_edits.txt, which records all proposed changes to the prons (if any). Also, a
  # summary is printed into the log file.

  variants_counts=`cat $dir/target_num_prons_per_word` || exit 1;
  # Stop on failure: everything below depends on the files this job writes.
  $cmd $dir/lats_iter2/log/select_prons_bayesian.log \
    steps/dict/select_prons_bayesian.py --prior-mean=$prior_mean --prior-counts-tot=$prior_counts_tot \
    --variants-counts=$variants_counts --variants-prob-mass=$variants_prob_mass --variants-prob-mass-ref=$variants_prob_mass_ref \
    $ref_dict/silence_phones.txt $dir/lats_iter2/pron_stats.txt $dir/train_counts.txt $dir/ref_lexicon.txt \
    $dir/lexicon_g2p_pruned.txt $dir/lexicon_phonetic_decoding_pruned.txt \
    $dir/lats_iter2/pron_posteriors.temp $dir/lats_iter2/out_of_ref_vocab_prons_learned.txt \
    $dir/lats_iter2/ref_lexicon_edits.txt || exit 1;

  # We reformat the pron_posterior file and add some comments.
  paste <(cat $dir/lats_iter2/pron_posteriors.temp | cut -d' ' -f1-3 | column -t) \
    <(cat $dir/lats_iter2/pron_posteriors.temp | cut -d' ' -f4-) | sort -nr -k1,3 | \
    cat <( echo ';; <word> <source: R(eference)/G(2P)/P(hone-decoding)> <posterior> <pronunciation>') -  \
    > $dir/lats_iter2/pron_posteriors.txt
  # The temp file is written under lats_iter2/, so remove it from there
  # (removing $dir/pron_posteriors.temp left the real temp file behind).
  rm $dir/lats_iter2/pron_posteriors.temp 2>/dev/null

  # Remove some stuff that takes up space and is unlikely to be useful later on.
  if $cleanup; then
    rm -r $dir/lats_iter*/{fsts*,lat*} 2>/dev/null
  fi
fi

# Stage 7: assemble $dest_dict/lexicon0.txt -- G2P prons for target-vocab words
# without acoustic evidence, learned prons for out-of-ref-vocab words, and the
# untouched reference lexicon, restricted to the target vocab, plus the
# non-scored entries.
if [ $stage -le 7 ]; then
  echo "$0: Expand the learned lexicon further to cover words in target vocab that are."
  echo "  ... not seen in acoustic training data."

  # Seed the destination dict dir with the reference phone-set files and drop
  # any lexicon files already there.
  mkdir -p $dest_dict
  for f in extra_questions.txt optional_silence.txt nonsilence_phones.txt silence_phones.txt; do
    cp $ref_dict/$f $dest_dict 2>/dev/null
  done
  rm $dest_dict/lexiconp.txt $dest_dict/lexicon.txt 2>/dev/null

  learned=$dir/lats_iter2/out_of_ref_vocab_prons_learned.txt

  # Target-vocab words covered neither by the learned prons nor by the reference
  # lexicon, i.e. oov (w.r.t. ref vocab) words without acoustic evidence.
  # They get pronunciations from lexicon_g2p, if any.
  awk 'NR==FNR{covered[$1]; next} !($1 in covered)' \
    <(cat $learned $dir/ref_lexicon.txt) \
    $dir/target_vocab.txt | sort | uniq > $dir/oov_no_acoustics.txt || exit 1;

  awk 'NR==FNR{wanted[$1]; next} ($1 in wanted)' $dir/oov_no_acoustics.txt \
    $dir/lexicon_g2p.txt > $dir/g2p_prons_for_oov_no_acoustics.txt

  # Concatenate the three lexicons: G2P prons for oov words without acoustics,
  # learned prons for oov words with acoustics, and the original reference
  # lexicon (the recommended changes to the latter are applied afterwards with
  # steps/dict/apply_lexicon_edits.py).
  cat $dir/g2p_prons_for_oov_no_acoustics.txt $learned $dir/ref_lexicon.txt | \
    tr -s '\t' ' ' | sort | uniq > $dest_dict/lexicon.temp

  # Restrict to the target vocab.
  awk 'NR==FNR{vocab[$1]; next} ($1 in vocab)' $dir/target_vocab.txt \
    $dest_dict/lexicon.temp | sort | uniq > $dest_dict/lexicon.nosil

  # Add the non-scored entries back in.
  sort $dir/non_scored_entries $dest_dict/lexicon.nosil | uniq >$dest_dict/lexicon0.txt
fi

# Stage 8: keep a copy of the edits file next to the learned lexicon, then apply
# the edits to lexicon0.txt to produce the final $dest_dict/lexicon.txt.
if [ $stage -le 8 ]; then
  edits_file=$dir/lats_iter2/ref_lexicon_edits.txt

  echo "$0: Apply the ref_lexicon_edits file to the reference lexicon."
  echo "  ... The user can inspect/modify the edits file and then re-run:"
  echo "  ... steps/dict/apply_lexicon_edits.py $dest_dict/lexicon0.txt $dir/lats_iter2/ref_lexicon_edits.txt  - | \\"
  echo "  ...   sort -u \> $dest_dict/lexicon.txt to re-produce the final learned lexicon."

  cp $edits_file $dest_dict/lexicon_edits.txt 2>/dev/null

  # "-" makes the tool write the edited lexicon to stdout for sorting.
  steps/dict/apply_lexicon_edits.py $dest_dict/lexicon0.txt $edits_file - \
    | sort | uniq > $dest_dict/lexicon.txt || exit 1;
fi


================================================
FILE: egs/steps/dict/learn_lexicon_greedy.sh
================================================
#! /bin/bash

# Copyright 2018  Xiaohui Zhang
# Apache 2.0

# This recipe has similar inputs and outputs to steps/dict/learn_lexicon.sh.
# The major difference is that, instead of using a Bayesian framework for
# pronunciation selection, we use the likelihood-reduction based greedy
# pronunciation selection framework presented in the paper:
# "Acoustic data-driven lexicon learning based on a greedy pronunciation
# selection framework", by X. Zhang, V. Manohar, D. Povey and S. Khudanpur,
# Interspeech 2017.

# This script demonstrates how to expand an existing lexicon using a combination
# of acoustic evidence and G2P to learn a lexicon that covers words in a target
# vocab, and agrees sufficiently with the acoustics. The basic idea is to
# run phonetic decoding on acoustic training data using an existing
# acoustic model (possibly re-trained using a G2P-expanded lexicon) to get
# alternative pronunciations for words in training data. Then we combine three
# exclusive sources of pronunciations: the reference lexicon (supposedly
# hand-derived), phonetic decoding, and G2P (optional) into one lexicon and then run
# lattice alignment on the same data, to collect acoustic evidence (soft
# counts) of all pronunciations. Based on these statistics, we use a greedy
# framework (see steps/dict/select_prons_greedy.py for details) to select an
# informative subset of pronunciations for each word with acoustic evidence.
# Two important parameters are alpha and beta. Basically, the three dimensions of alpha
# and beta correspond to three pronunciation sources: phonetic-decoding, G2P and
# the reference lexicon, and the larger a value is, the more aggressively we'll
# prune pronunciations from that source. The valid range of each dimension is
# [0, 1] for alpha (0 means we never prune prons from that source) and
# [0, 100] for beta.
# The output of this script is a learned lexicon whose vocab
# matches the user-specified target-vocab, and two intermediate outputs which were
# used to generate the learned lexicon: an edits file which records the recommended
# changes to all in-ref-vocab words' prons, and a half-learned lexicon
# ($dest_dict/lexicon0.txt) where all in-ref-vocab words' prons were untouched
# (on top of which we apply the edits file to produce the final learned lexicon).
# The user can always modify the edits file manually and then re-apply it on the
# half-learned lexicon using steps/dict/apply_lexicon_edits.py to produce the
# final learned lexicon. See the last stage in this script for details.

# Begin configuration section.
# Every variable below can be overridden from the command line as --<name>
# (with '_' written as '-'); see the usage message for their meaning.
cmd=run.pl
nj=
stage=0
oov_symbol=
lexiconp_g2p=
min_prob=0.3
variant_counts_ratio=8
variant_counts_no_acoustics=1
alpha="0,0,0"
beta="0,0,0"
delta=0.0000001
num_gauss=
num_leaves=
retrain_src_mdl=true
cleanup=true
nj_select_prons=200
learn_iv_prons=false # whether we want to learn the prons of IV words (w.r.t. ref_vocab).

# End configuration section.

. ./path.sh
. utils/parse_options.sh

# Print the usage message and exit unless 6 or 7 positional arguments are given
# (the 7th, the work dir, is optional).
if [ $# -lt 6 ] || [ $# -gt 7 ]; then
  echo "Usage: $0 [options] <ref-dict> <target-vocab> <data> <src-mdl-dir> \\"
  echo "          <ref-lang> <dest-dict> [ <dir> ]"
  echo "  This script does lexicon expansion using a combination of acoustic"
  echo "  evidence and G2P to produce a lexicon that covers words of a target vocab:"
  echo ""
  echo "Arguments:"
  echo " <ref-dict>     The dir which contains the reference lexicon (most probably hand-derived)"
  echo "                we want to expand/improve, and nonsilence_phones.txt,.etc which we need "
  echo "                for building new dict dirs."
  echo " <target-vocab> The vocabulary we want the final learned lexicon to cover (one word per line)."
  echo " <data>         acoustic training data we use to get alternative"
  echo "                pronunciations and collet acoustic evidence."
  echo " <src-mdl-dir>  The dir containing an SAT-GMM acoustic model (we optionaly we re-train it"
  echo "                using G2P expanded lexicon) to do phonetic decoding (to get alternative"
  echo "                pronunciations) and lattice-alignment (to collect acoustic evidence for"
  echo "                evaluating all prounciations)"
  echo " <ref-lang>     The reference lang dir which we use to get non-scored-words"
  echo "                like <UNK> for building new dict dirs"
  echo " <dest-dict>    The dict dir where we put the final learned lexicon, whose vocab"
  echo "                matches <target-vocab>."
  echo " [ <dir> ]      The dir which contains all the intermediate outputs of this script"
  echo "                (default: <src-mdl-dir>_lex_learn_work)."
  echo ""
  echo "Note: <target-vocab> and the vocab of <data> don't have to match. For words"
  echo "     who are in <target-vocab> but not seen in <data>, their pronunciations"
  echo "     will be given by G2P at the end."
  echo ""
  echo "e.g. $0 data/local/dict data/local/lm/librispeech-vocab.txt data/train \\"
  echo "          exp/tri3 data/lang data/local/dict_learned"
  echo "Options:"
  echo "  --stage <n>                         # stage to run from, to enable resuming from partially"
  echo "                                      # completed run (default: 0)"
  echo "  --cmd '$cmd'                        # command to submit jobs with (e.g. run.pl, queue.pl)"
  echo "  --nj <nj>                           # number of parallel jobs"
  echo "  --oov-symbol '$oov_symbol'          # oov symbol, like <UNK>."
  echo "  --lexiconp-g2p                      # a lexicon (with prob in the second column) file containing g2p generated"
  echo "                                      # pronunciations, for words in acoustic training data / target vocabulary. It's optional."
  echo "  --min-prob <float>                  # The cut-off parameter used to select pronunciation candidates from phonetic"
  echo "                                      # decoding. We remove pronunciations with probabilities less than this value"
  echo "                                      # after normalizing the probs s.t. the max-prob is 1.0 for each word."
  echo "  --variant-counts-ratio <int>        # This ratio parameter determines the maximum number of pronunciation"
  echo "                                      # candidates we will keep for each word, after pruning according to lattice statistics from"
  echo "                                      # the first iteration of lattice generation. See steps/dict/internal/prune_pron_candidates.py"
  echo "                                      # for details."
  echo "  --variant-counts-no-acoustics <int> # how many g2p-prons per word we want to include for each words unseen in acoustic training data."
  echo "  --alpha <float>,<float>,<float>     # scaling factors used in the greedy pronunciation selection framework, "
  echo "                                      # see steps/dict/select_prons_greedy.py for details."
  echo "  --beta <int>,<int>,<int>            # smoothing factors used in the greedy pronunciation selection framework, "
  echo "                                      # see steps/dict/select_prons_greedy.py for details."
  echo "  --delta <float>                     # a floor value used in the greedy pronunciation selection framework, "
  echo "                                      # see steps/dict/select_prons_greedy.py for details."
  echo "  --num-gauss                         # number of gaussians for the re-trained SAT model (on top of <src-mdl-dir>)."
  echo "  --num-leaves                        # number of leaves for the re-trained SAT model (on top of <src-mdl-dir>)."
  echo "  --retrain-src-mdl                   # true if you want to re-train the src_mdl before phone decoding (default true)."
  exit 1
fi

# Log the exact invocation so a run can be reproduced from its log.
echo "$0 $@"

# Positional arguments, in the order given in the usage message.
ref_dict=$1
target_vocab=$2
data=$3
src_mdl_dir=$4
ref_lang=$5
dest_dict=$6

# --oov-symbol has no default, so refuse to continue without it.
[ -n "$oov_symbol" ] || { echo "$0: the --oov-symbol option is required."; exit 1; }

# Most intermediate outputs go to $dir: the optional 7th argument when one was
# passed (even if empty), otherwise a dir named after <src-mdl-dir>.
dir=${7-${src_mdl_dir}_lex_learn_work}

mkdir -p $dir
# Stage 0: derive, under $dir, the files every later stage reads: training word
# counts, non-scored words/entries, the cleaned reference lexicon and vocab, the
# target vocab, the target number of prons per word, and the G2P lexicons
# (with and without probabilities, plus a top-N pruned copy).
if [ $stage -le 0 ]; then
  echo "$0: Some preparatory work."
  # Get the word counts of training data.
  awk '{for (n=2;n<=NF;n++) counts[$n]++;} END{for (w in counts) printf "%s %d\n",w, counts[w];}' \
    $data/text | sort > $dir/train_counts.txt

  # Get the non-scored entries and exclude them from the reference lexicon/vocab, and target_vocab.
  steps/cleanup/internal/get_non_scored_words.py $ref_lang > $dir/non_scored_words
  awk 'NR==FNR{a[$1] = 1; next} {if($1 in a) print $0}' $dir/non_scored_words \
    $ref_dict/lexicon.txt > $dir/non_scored_entries

  # Remove non-scored-words from the reference lexicon. The trailing awk '$1=$1'
  # rebuilds each line so that fields are separated by single spaces.
  awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $0}' $dir/non_scored_words \
    $ref_dict/lexicon.txt | tr -s '\t' ' ' | awk '$1=$1' > $dir/ref_lexicon.txt

  cat $dir/ref_lexicon.txt | awk '{print $1}' | sort | uniq > $dir/ref_vocab.txt
  awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $0}' $dir/non_scored_words \
    $target_vocab | sort | uniq > $dir/target_vocab.txt

  # From the reference lexicon, we estimate the target_num_prons_per_word as,
  # round(avg. # prons per word in the reference lexicon). This'll be used as
  # the upper bound of # pron variants per word when we apply G2P or select prons to
  # construct the learned lexicon in later stages.
  # print(...) with parentheses is valid under both Python 2 and Python 3; the
  # former "print int(...)" statement is a syntax error under Python 3.
  python -c 'import sys; print(int(round(float(sys.argv[1])/float(sys.argv[2]))))' \
    `wc -l $dir/ref_lexicon.txt | awk '{print $1}'` `wc -l $dir/ref_vocab.txt | awk '{print $1}'` \
    > $dir/target_num_prons_per_word || exit 1;

  if [ -z "$lexiconp_g2p" ]; then
    # create an empty list of g2p generated prons, if it's not given.
    touch $dir/lexicon_g2p.txt
    touch $dir/lexiconp_g2p.txt
  else
    # Exchange the 1st column (word) and 2nd column (prob) and remove pronunciations
    # which are already in the reference lexicon. The result is space-separated
    # with the layout "<prob> <word> <phones>".
    cat $lexiconp_g2p | awk '{a=$1;b=$2; $1="";$2="";print b" "a$0}' | \
      awk 'NR==FNR{a[$0] = 1; next} {w=$2;for (n=3;n<=NF;n++) w=w" "$n; if(!(w in a)) print $0}' \
      $dir/ref_lexicon.txt - > $dir/lexiconp_g2p.txt 2>/dev/null

    # make a copy where we remove the first column (probabilities), giving
    # "<word> <phones>" lines. This is done with awk rather than "cut -f":
    # cut splits on TAB by default and would pass these space-separated lines
    # through unchanged, leaving the probability in the word column.
    awk 'NF >= 2 {line = $2; for (n = 3; n <= NF; n++) line = line " " $n; print line}' \
      $dir/lexiconp_g2p.txt > $dir/lexicon_g2p.txt 2>/dev/null
  fi
  # Keep at most target_num_prons_per_word G2P prons per word.
  variant_counts=`cat $dir/target_num_prons_per_word` || exit 1;
  $cmd $dir/log/prune_g2p_lexicon.log steps/dict/prons_to_lexicon.py \
    --top-N=$variant_counts $dir/lexiconp_g2p.txt \
    $dir/lexicon_g2p_variant_counts${variant_counts}.txt || exit 1;
fi

# Stage 1 (only when $retrain_src_mdl is true): build a dict/lang covering the
# target vocab (reference lexicon plus G2P prons for OOVs), align the training
# data with the source model, and train a new SAT model on top of it.
if [ $stage -le 1 ] && $retrain_src_mdl; then
  echo "$0: Expand the reference lexicon to cover all words in the target vocab. and then"
  echo "   ... re-train the source acoustic model for phonetic decoding. "

  # Validate the SAT training sizes up front, so that a missing option is
  # reported before the lang preparation and alignment below are run.
  if [ -z "$num_leaves" ] || [ -z "$num_gauss" ] ; then
    echo "num_leaves and num_gauss need to be specified." && exit 1;
  fi

  mkdir -p $dir/dict_expanded_target_vocab
  cp $ref_dict/{extra_questions.txt,optional_silence.txt,nonsilence_phones.txt,silence_phones.txt} \
    $dir/dict_expanded_target_vocab  2>/dev/null
  rm $dir/dict_expanded_target_vocab/lexiconp.txt $dir/dict_expanded_target_vocab/lexicon.txt 2>/dev/null

  # Get the oov words list (w.r.t ref vocab) which are in the target vocab.
  awk 'NR==FNR{a[$1] = 1; next} !($1 in a)' $dir/ref_lexicon.txt \
    $dir/target_vocab.txt | sort | uniq > $dir/oov_target_vocab.txt

  # Assign pronunciations from lexicon_g2p.txt to oov_target_vocab. For words which
  # cannot be found in lexicon_g2p.txt, we simply ignore them.
  awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/oov_target_vocab.txt \
    $dir/lexicon_g2p.txt > $dir/lexicon_g2p_oov_target_vocab.txt

  cat $dir/lexicon_g2p_oov_target_vocab.txt $dir/ref_lexicon.txt | \
    awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/target_vocab.txt - | \
    cat $dir/non_scored_entries - |
    sort | uniq > $dir/dict_expanded_target_vocab/lexicon.txt

  # The oov symbol is quoted so that it reaches prepare_lang.sh as one literal
  # argument, with no word splitting or pathname expansion applied to it.
  utils/prepare_lang.sh --phone-symbol-table $ref_lang/phones.txt $dir/dict_expanded_target_vocab \
    "$oov_symbol" $dir/lang_expanded_target_vocab_tmp $dir/lang_expanded_target_vocab || exit 1;

  # Use $train_cmd when the caller's environment provides it; otherwise fall
  # back to the job command given with --cmd, so that an empty --cmd is never
  # handed to the alignment/training scripts.
  sat_cmd="${train_cmd:-$cmd}"

  # Align the acoustic training data using the given src_mdl_dir.
  alidir=${src_mdl_dir}_ali_$(basename $data)
  steps/align_fmllr.sh --nj $nj --cmd "$sat_cmd" \
    $data $dir/lang_expanded_target_vocab $src_mdl_dir $alidir || exit 1;

  # Train another SAT system on the given data and put it in $dir/${src_mdl_dir}_retrained
  # this model will be used for phonetic decoding and lattice alignment later on.
  steps/train_sat.sh --cmd "$sat_cmd" $num_leaves $num_gauss \
    $data $dir/lang_expanded_target_vocab $alidir $dir/${src_mdl_dir}_retrained || exit 1;
fi

# Stage 2: build dict_expanded_train / lang_expanded_train, a dictionary that gives
# every word of the acoustic training data (train_counts.txt) some pronunciation.
if [ $stage -le 2 ]; then
  echo "$0: Expand the reference lexicon to cover all words seen in,"
  echo "  ... acoustic training data, and prepare corresponding dict and lang directories."
  echo "  ... This is needed when generate pron candidates from phonetic decoding."
  # Seed the new dict dir with the phone-set files of the reference dict and remove any
  # lexicon left there by a previous run. Error output of cp/rm is silenced (2>/dev/null).
  mkdir -p $dir/dict_expanded_train
  cp $ref_dict/{extra_questions.txt,optional_silence.txt,nonsilence_phones.txt,silence_phones.txt} \
    $dir/dict_expanded_train 2>/dev/null
  rm $dir/dict_expanded_train/lexiconp.txt $dir/dict_expanded_train/lexicon.txt 2>/dev/null

  # Get the list of OOV words (w.r.t. the ref. vocab) which are in the training data:
  # the first awk keeps words of train_counts.txt that are absent from ref_lexicon.txt,
  # the second one additionally drops the words listed in non_scored_words.
  awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $1}' $dir/ref_lexicon.txt \
    $dir/train_counts.txt | awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $0}' \
    $dir/non_scored_words - | sort > $dir/oov_train.txt || exit 1; 

  # OOV rate = c/(b+c), where b sums the counts (2nd column of train_counts.txt) of
  # words found in ref_vocab.txt and c sums the counts of all other words.
  awk 'NR==FNR{a[$1] = 1; next} {if(($1 in a)) b+=$2; else c+=$2} END{print c/(b+c)}' \
    $dir/ref_vocab.txt $dir/train_counts.txt > $dir/train_oov_rate || exit 1;

  echo "OOV rate (w.r.t. the reference lexicon) of the acoustic training data is:"
  cat $dir/train_oov_rate

  # Assign pronunciations from lexicon_g2p to oov_train. For words which
  # cannot be found in lexicon_g2p, we simply assign oov_symbol's pronunciation
  # (like NSN) to them, in order to get phonetic decoding pron candidates for them later on.
  # The G2P lexicon used here is the one named after the value stored in
  # $dir/target_num_prons_per_word.
  variant_counts=`cat $dir/target_num_prons_per_word` || exit 1;
  awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/oov_train.txt \
    $dir/lexicon_g2p_variant_counts${variant_counts}.txt > $dir/g2p_prons_for_oov_train.txt || exit 1;

  # Get the pronunciation of oov_symbol.
  # NOTE(review): only the 2nd field of the matching non_scored_entries line(s) is taken,
  # i.e. this assumes the oov symbol's pronunciation is a single phone -- confirm.
  oov_pron=`cat $dir/non_scored_entries | grep $oov_symbol | awk '{print $2}'`
  # For oov words in training data for which we don't even have G2P pron candidates,
  # we simply assign them the pronunciation of the oov symbol (like <unk>),
  # so that we can get pronunciations for them from phonetic decoding.
  awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $1}' $dir/g2p_prons_for_oov_train.txt \
    $dir/oov_train.txt | awk -v op="$oov_pron" '{print $0" "op}' > $dir/oov_train_no_pron.txt || exit 1;

  # Merge the three pronunciation sources, keep only the words seen in training
  # (first column of train_counts.txt), then append the non-scored entries.
  cat $dir/oov_train_no_pron.txt $dir/g2p_prons_for_oov_train.txt $dir/ref_lexicon.txt | \
    awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/train_counts.txt - | \
    cat - $dir/non_scored_entries | \
    sort | uniq > $dir/dict_expanded_train/lexicon.txt || exit 1;

  utils/prepare_lang.sh $dir/dict_expanded_train $oov_symbol \
    $dir/lang_expanded_train_tmp $dir/lang_expanded_train || exit 1;
fi

# Stage 3: run steps/cleanup/debug_lexicon.sh on the training data with the expanded
# lang dir; its output directory is $dir/phonetic_decoding, from which the later stages
# read prons.txt, word.ctm and phone_map.txt.
if [ $stage -le 3 ]; then
  echo "$0: Generate pronunciation candidates from phonetic decoding on acoustic training data.."
  # Use the retrained model directory when $retrain_src_mdl is true, else the source model.
  if $retrain_src_mdl; then mdl_dir=$dir/${src_mdl_dir}_retrained; else mdl_dir=$src_mdl_dir; fi
  steps/cleanup/debug_lexicon.sh  --nj $nj \
    --cmd "$decode_cmd" $data $dir/lang_expanded_train \
    $mdl_dir $dir/dict_expanded_train/lexicon.txt $dir/phonetic_decoding || exit 1;
fi

# Stage 4: build the iteration-1 combined lexicon (reference + G2P + phonetic decoding),
# generate lattices on sub-segmented training data and collect pronunciation statistics.
if [ $stage -le 4 ]; then
  echo "$0: Combine the reference lexicon and pronunciations from phone-decoding/G2P into one"
  echo "  ... lexicon, and run lattice alignment using this lexicon on acoustic training data"
  echo "  ... to collect acoustic evidence."
  # We first prune the phonetic decoding generated prons relative to the largest count, by setting "min_prob",
  # and only leave prons who are not present in the reference lexicon / g2p-generated lexicon.
  cat $dir/ref_lexicon.txt $dir/lexicon_g2p.txt | sort -u > $dir/phonetic_decoding/filter_lexicon.txt

  # Stop here if the conversion fails: everything below reads lexicon_pd_with_eps.txt,
  # and the pipelines that consume it only report the exit status of their last command.
  $cmd $dir/phonetic_decoding/log/prons_to_lexicon.log steps/dict/prons_to_lexicon.py \
    --min-prob=$min_prob --filter-lexicon=$dir/phonetic_decoding/filter_lexicon.txt \
    $dir/phonetic_decoding/prons.txt $dir/lexicon_pd_with_eps.txt || exit 1;

  # We abandon phonetic-decoding candidates for infrequent words: words with a training
  # count below 3, and words from oov_train_no_pron.txt with a training count below 10.
  awk '{if($2 < 3) print $1}' $dir/train_counts.txt > $dir/pd_candidates_to_exclude.txt
  awk 'NR==FNR{a[$1] = $2; next} {if(a[$1]<10) print $1}' $dir/train_counts.txt \
    $dir/oov_train_no_pron.txt >> $dir/pd_candidates_to_exclude.txt

  # Drop entries containing <eps>, <UNK>, <unk> or a bracketed token, and (when the
  # exclusion list is non-empty) the words listed in it.
  if [ -s $dir/pd_candidates_to_exclude.txt ]; then
    cat $dir/lexicon_pd_with_eps.txt | grep -vP "<eps>|<UNK>|<unk>|\[.*\]" | \
      awk 'NR==FNR{a[$0] = 1; next} {if(!($1 in a)) print $0}' $dir/pd_candidates_to_exclude.txt - | \
      sort | uniq > $dir/lexicon_pd.txt || exit 1;
  else
    cat $dir/lexicon_pd_with_eps.txt | grep -vP "<eps>|<UNK>|<unk>|\[.*\]" | \
      sort | uniq > $dir/lexicon_pd.txt || exit 1;
  fi

  # Combine the reference lexicon, pronunciations from G2P and phonetic decoding into one lexicon.
  mkdir -p $dir/dict_combined_iter1
  cp $ref_dict/{extra_questions.txt,optional_silence.txt,nonsilence_phones.txt,silence_phones.txt} \
    $dir/dict_combined_iter1/ 2>/dev/null
  rm $dir/dict_combined_iter1/lexiconp.txt $dir/dict_combined_iter1/lexicon.txt 2>/dev/null

  # Filter out words which don't appear in the acoustic training data
  cat $dir/lexicon_pd.txt $dir/lexicon_g2p.txt \
    $dir/ref_lexicon.txt | tr -s '\t' ' ' | \
    awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/train_counts.txt - | \
    cat $dir/non_scored_entries - | \
    sort | uniq > $dir/dict_combined_iter1/lexicon.txt

  utils/prepare_lang.sh --phone-symbol-table $ref_lang/phones.txt \
    $dir/dict_combined_iter1 $oov_symbol \
    $dir/lang_combined_iter1_tmp $dir/lang_combined_iter1 || exit 1;

  # Generate lattices for the acoustic training data with the combined lexicon.
  if $retrain_src_mdl; then mdl_dir=$dir/${src_mdl_dir}_retrained; else mdl_dir=$src_mdl_dir; fi

  # Get the vocab for words for which we want to learn pronunciations.
  if $learn_iv_prons; then
    # If we want to learn the prons of IV words (w.r.t. ref_vocab), the learn_vocab is just the intersection of
    # target_vocab and the vocab of words seen in acoustic training data (first col. of train_counts.txt)
    awk 'NR==FNR{a[$1] = 1; next} {if($1 in a) print $1}' $dir/target_vocab.txt $dir/train_counts.txt \
      > $dir/learn_vocab.txt
  else
    # Exclude words from the ref_vocab if we don't want to learn the pronunciations of IV words.
    awk 'NR==FNR{a[$1] = 1; next} {if($1 in a) print $1}' $dir/target_vocab.txt $dir/train_counts.txt | \
      awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $1}' $dir/ref_vocab.txt - > $dir/learn_vocab.txt
  fi

  # In order to get finer lattice stats of alternative prons, we want to make lattices deeper.
  # To speed up lattice generation, we use a ctm to create sub-utterances and a sub-segmentation
  # for each instance of a word within learn_vocab (or a string of consecutive words within learn_vocab),
  # including a single out-of-learn-vocab word at the boundary if present.
  mkdir -p $dir/resegmentation
  steps/dict/internal/get_subsegments.py $dir/phonetic_decoding/word.ctm $dir/learn_vocab.txt \
    $dir/resegmentation/subsegments $dir/resegmentation/text || exit 1;
  utils/data/subsegment_data_dir.sh $data $dir/resegmentation/subsegments $dir/resegmentation/text \
    $dir/resegmentation/data || exit 1;
  steps/compute_cmvn_stats.sh $dir/resegmentation/data || exit 1;

  steps/align_fmllr_lats.sh --beam 20 --retry-beam 50 --final-beam 30 --acoustic-scale 0.05 --cmd "$decode_cmd" --nj $nj \
    $dir/resegmentation/data $dir/lang_combined_iter1 $mdl_dir $dir/lats_iter1 || exit 1;

  # Get arc level information from the lattice.
  $cmd JOB=1:$nj $dir/lats_iter1/log/get_arc_info.JOB.log \
    lattice-align-words $dir/lang_combined_iter1/phones/word_boundary.int \
    $dir/lats_iter1/final.mdl \
    "ark:gunzip -c $dir/lats_iter1/lat.JOB.gz |" ark:- \| \
    lattice-arc-post --acoustic-scale=0.1 $dir/lats_iter1/final.mdl ark:- - \| \
    utils/int2sym.pl -f 5 $dir/lang_combined_iter1/words.txt \| \
    utils/int2sym.pl -f 6- $dir/lang_combined_iter1/phones.txt '>' \
    $dir/lats_iter1/arc_info_sym.JOB.txt || exit 1;

  # Compute soft counts (pron_stats) of every particular word-pronunciation pair by
  # summing up arc level information over all utterances. We'll use this to prune
  # pronunciation candidates before the next iteration of lattice generation.
  cat $dir/lats_iter1/arc_info_sym.*.txt | steps/dict/get_pron_stats.py - \
    $dir/phonetic_decoding/phone_map.txt $dir/lats_iter1/pron_stats.txt || exit 1;

  # Accumulate utterance-level pronunciation posteriors (into arc_stats) by summing up
  # posteriors of arcs representing the same word & pronunciation and starting
  # from roughly the same location. See steps/dict/internal/sum_arc_info.py for details.
  for i in `seq 1 $nj`;do
    cat $dir/lats_iter1/arc_info_sym.${i}.txt | sort -n -k1 -k2 -k3r | \
      steps/dict/internal/sum_arc_info.py - $dir/phonetic_decoding/phone_map.txt $dir/lats_iter1/arc_info_summed.${i}.txt
  done
  cat $dir/lats_iter1/arc_info_summed.*.txt | sort -k1 -k2 > $dir/lats_iter1/arc_stats.txt

  # Prune the phonetic_decoding lexicon so that any pronunciation that only has non-zero posterior at one word example will be removed.
  # The pruned lexicon is put in $dir/lats_iter1. After further pruning in the next stage it'll be put back to $dir.
  # (The key built from arc_stats.txt is "<word> <phones>", i.e. field 1 plus fields 5..NF.)
  awk 'NR==FNR{w=$1;for (n=5;n<=NF;n++) w=w" "$n;a[w]+=1;next} {if($0 in a && a[$0]>1) print $0}' \
    $dir/lats_iter1/arc_stats.txt $dir/lexicon_pd.txt > $dir/lats_iter1/lexicon_pd_pruned.txt
fi

# Stage 5: re-generate lattices (with a wider beam and a pruned combined lexicon) and
# re-collect pronunciation statistics, then split them for parallel pron selection.
if [ $stage -le 5 ]; then
  echo "$0: Prune the pronunciation candidates generated from G2P/phonetic decoding, and re-do lattice-alignment."
  mkdir -p $dir/dict_combined_iter2
  cp $ref_dict/{extra_questions.txt,optional_silence.txt,nonsilence_phones.txt,silence_phones.txt} \
    $dir/dict_combined_iter2/ 2>/dev/null
  rm $dir/dict_combined_iter2/lexiconp.txt $dir/dict_combined_iter2/lexicon.txt 2>/dev/null

  # Prune away pronunciations which have low acoustic evidence from the first pass of lattice generation.
  # Stop if pruning fails: its two output lexicons are inputs of the iteration-2 lexicon
  # below, and that pipeline would not report their absence through its exit status.
  $cmd $dir/lats_iter1/log/prune_pron_candidates.log steps/dict/internal/prune_pron_candidates.py \
    --variant-counts-ratio $variant_counts_ratio \
    $dir/lats_iter1/pron_stats.txt $dir/lats_iter1/lexicon_pd_pruned.txt $dir/lexiconp_g2p.txt $dir/ref_lexicon.txt \
    $dir/lexicon_pd_pruned.txt $dir/lexicon_g2p_pruned.txt || exit 1;

  # Filter out words which don't appear in the acoustic training data.
  cat $dir/lexicon_pd_pruned.txt $dir/lexicon_g2p_pruned.txt \
    $dir/ref_lexicon.txt | tr -s '\t' ' ' | \
    awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/train_counts.txt - | \
    cat $dir/non_scored_entries - | \
    sort | uniq > $dir/dict_combined_iter2/lexicon.txt

  utils/prepare_lang.sh --phone-symbol-table $ref_lang/phones.txt \
    $dir/dict_combined_iter2 $oov_symbol \
    $dir/lang_combined_iter2_tmp $dir/lang_combined_iter2 || exit 1;

  # Re-generate lattices with a wider beam, so that we'll get deeper lattices.
  if $retrain_src_mdl; then mdl_dir=$dir/${src_mdl_dir}_retrained; else mdl_dir=$src_mdl_dir; fi
  steps/align_fmllr_lats.sh  --beam 30 --retry-beam 60 --final-beam 50 --acoustic-scale 0.05 --cmd "$decode_cmd" --nj $nj \
    $dir/resegmentation/data $dir/lang_combined_iter2 $mdl_dir $dir/lats_iter2 || exit 1;

  # Get arc level information from the lattice as we did in the last stage.
  $cmd JOB=1:$nj $dir/lats_iter2/log/get_arc_info.JOB.log \
    lattice-align-words $dir/lang_combined_iter2/phones/word_boundary.int \
    $dir/lats_iter2/final.mdl \
    "ark:gunzip -c $dir/lats_iter2/lat.JOB.gz |" ark:- \| \
    lattice-arc-post --acoustic-scale=0.1 $dir/lats_iter2/final.mdl ark:- - \| \
    utils/int2sym.pl -f 5 $dir/lang_combined_iter2/words.txt \| \
    utils/int2sym.pl -f 6- $dir/lang_combined_iter2/phones.txt '>' \
    $dir/lats_iter2/arc_info_sym.JOB.txt || exit 1;

  # Compute soft counts (pron_stats) of every particular word-pronunciation pair as
  # we did in the last stage. The stats will only be used as diagnostics.
  cat $dir/lats_iter2/arc_info_sym.*.txt | steps/dict/get_pron_stats.py - \
    $dir/phonetic_decoding/phone_map.txt $dir/lats_iter2/pron_stats.txt || exit 1;

  # Accumulate utterance-level pronunciation posteriors as we did in the last stage.
  for i in `seq 1 $nj`;do
    cat $dir/lats_iter2/arc_info_sym.${i}.txt | sort -n -k1 -k2 -k3r | \
      steps/dict/internal/sum_arc_info.py - $dir/phonetic_decoding/phone_map.txt $dir/lats_iter2/arc_info_summed.${i}.txt
  done
  cat $dir/lats_iter2/arc_info_summed.*.txt | sort -k1 -k2 > $dir/lats_iter2/arc_stats.txt

  # The pron_stats are the acoustic evidence which the likelihood-reduction-based pronunciation
  # selection procedure will be based on.
  # Split the utterance-level pronunciation posterior stats into $nj_select_prons pieces,
  # so that the following pronunciation selection stage can be parallelized.
  # utt2word maps "<word>-<utt>" to "<word>"; passing it as --utt2spk to split_scp.pl
  # presumably keeps all entries of one word in the same piece.
  numsplit=$nj_select_prons
  awk '{print $1"-"$2" "$1}' $dir/lats_iter2/arc_stats.txt > $dir/lats_iter2/utt2word
  utt2words=$(for n in `seq $numsplit`; do echo $dir/lats_iter2/utt2word.$n; done)
  utils/split_scp.pl --utt2spk=$dir/lats_iter2/utt2word $dir/lats_iter2/utt2word $utt2words || exit 1
  # For every piece, recover the "<word> <utt>" pairs and select the matching lines of
  # arc_stats.txt. The pieces are processed in background subshells; wait for all of them.
  for n in `seq $numsplit`; do
    (cat $dir/lats_iter2/utt2word.$n | awk '{$1=substr($1,length($2)+2);print $2" "$1}' - > $dir/lats_iter2/word2utt.$n
     awk 'NR==FNR{a[$0] = 1; next} {b=$1" "$2; if(b in a) print $0}' $dir/lats_iter2/word2utt.$n \
       $dir/lats_iter2/arc_stats.txt > $dir/lats_iter2/arc_stats.${n}.txt
    ) &
  done
  wait
fi

# Stage 6: select pronunciations per piece of arc stats, merge the per-piece results
# and write the learned OOV lexicon plus the edits file for in-vocab words.
if [ $stage -le 6 ]; then
  echo "$0: Select pronunciations according to the acoustic evidence from lattice alignment."
  # Given the acoustic evidence (soft-counts), we use a Bayesian framework to select pronunciations
  # from three exclusive candidate sources: reference (hand-derived) lexicon, G2P and phonetic decoding.
  # The posteriors for all candidate prons for all words are printed into pron_posteriors.txt
  # For words which are out of the ref. vocab, the learned prons are written into out_of_ref_vocab_prons_learned.txt.
  # Among them, for words without acoustic evidence, we just ignore them (even if pron candidates from G2P were provided).
  # For words in the ref. vocab, we instead output a human readable & editable "edits" file called
  # ref_lexicon_edits.txt, which records all proposed changes to the prons (if any). Also, a
  # summary is printed into the log file.

  $cmd JOB=1:$nj_select_prons $dir/lats_iter2/log/generate_learned_lexicon.JOB.log \
    steps/dict/select_prons_greedy.py \
      --alpha=${alpha} --beta=${beta} \
      --delta=${delta} \
      $ref_dict/silence_phones.txt $dir/lats_iter2/arc_stats.JOB.txt $dir/train_counts.txt $dir/ref_lexicon.txt \
      $dir/lexicon_g2p_pruned.txt $dir/lexicon_pd_pruned.txt \
      $dir/lats_iter2/learned_lexicon.JOB.txt || exit 1;

  # Concatenate the per-job lexicons, then delete the per-job files.
  cat $dir/lats_iter2/learned_lexicon.*.txt > $dir/lats_iter2/learned_lexicon.txt
  rm $dir/lats_iter2/learned_lexicon.*.txt

  # merge_learned_lexicons.py writes out_of_ref_vocab_prons_learned.txt and
  # ref_lexicon_edits.txt directly into $dir/lats_iter2, which is where the following
  # stages read them from, so no further copy is needed (the former "cp" of the edits
  # file onto itself only produced a "same file" error message).
  $cmd $dir/lats_iter2/log/lexicon_learning_summary.log \
    steps/dict/merge_learned_lexicons.py \
      $dir/lats_iter2/arc_stats.txt $dir/train_counts.txt $dir/ref_lexicon.txt \
      $dir/lexicon_g2p_pruned.txt $dir/lexicon_pd_pruned.txt \
      $dir/lats_iter2/learned_lexicon.txt \
      $dir/lats_iter2/out_of_ref_vocab_prons_learned.txt $dir/lats_iter2/ref_lexicon_edits.txt || exit 1;

  # Remove some stuff that takes up space and is unlikely to be useful later on.
  if $cleanup; then
    rm -r $dir/lats_iter*/{fsts*,lat*} 2>/dev/null
  fi
fi

# Stage 7: assemble $dest_dict/lexicon0.txt, a lexicon restricted to target_vocab.txt that
# also covers target words for which no pronunciation was learned from acoustics.
if [ $stage -le 7 ]; then
  echo "$0: Expand the learned lexicon further to cover words in target vocab that are."
  echo "  ... not seen in acoustic training data."
  # Seed the destination dict with the reference phone-set files and remove any lexicon
  # already present there. Error output of cp/rm is silenced (2>/dev/null).
  mkdir -p $dest_dict
  cp $ref_dict/{extra_questions.txt,optional_silence.txt,nonsilence_phones.txt,silence_phones.txt} \
    $dest_dict  2>/dev/null
  rm $dest_dict/lexiconp.txt $dest_dict/lexicon.txt 2>/dev/null
  # Get the list of oov (w.r.t. ref vocab) without acoustic evidence, which are in the
  # target vocab. We'll just assign to them pronunciations from lexicon_g2p, if any.
  cat $dir/lats_iter2/out_of_ref_vocab_prons_learned.txt $dir/ref_lexicon.txt | \
    awk 'NR==FNR{a[$1] = 1; next} !($1 in a)' - \
    $dir/target_vocab.txt | sort | uniq > $dir/oov_no_acoustics.txt || exit 1;
  
  # From here on variant_counts is the number of G2P variants to keep (--top-N) for
  # words without acoustic evidence.
  variant_counts=$variant_counts_no_acoustics
  
  $cmd $dir/log/prune_g2p_lexicon.log steps/dict/prons_to_lexicon.py \
    --top-N=$variant_counts $dir/lexiconp_g2p.txt \
    $dir/lexicon_g2p_variant_counts${variant_counts}.txt || exit 1;
  
  awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/oov_no_acoustics.txt \
    $dir/lexicon_g2p_variant_counts${variant_counts}.txt > $dir/g2p_prons_for_oov_no_acoustics.txt|| exit 1;

  # Get the pronunciation of oov_symbol.
  # NOTE(review): only the 2nd field of the matching non_scored_entries line(s) is taken,
  # i.e. this assumes the oov symbol's pronunciation is a single phone -- confirm.
  oov_pron=`cat $dir/non_scored_entries | grep $oov_symbol | awk '{print $2}'` || exit 1;
  # For oov words in target_vocab for which we don't even have G2P pron candidates,
  # we simply assign them the pronunciation of the oov symbol (like <unk>).
  # When the G2P file is empty, every word of oov_no_acoustics.txt gets that pronunciation.
  if [ -s $dir/g2p_prons_for_oov_no_acoustics.txt ]; then
    awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $1}' $dir/g2p_prons_for_oov_no_acoustics.txt \
      $dir/oov_no_acoustics.txt | awk -v op="$oov_pron" '{print $0" "op}' > $dir/oov_target_vocab_no_pron.txt || exit 1;
  else
    awk -v op="$oov_pron" '{print $0" "op}' $dir/oov_no_acoustics.txt > $dir/oov_target_vocab_no_pron.txt || exit 1
  fi

  # We concatenate these lexicons together: the G2P lexicon for oov words without acoustics,
  # the learned lexicon for oov words with acoustics, the oov-symbol fallback entries, and
  # the original reference lexicon (for this last part, later on we'll apply the recommended
  # changes using steps/dict/apply_lexicon_edits.py).
  cat $dir/g2p_prons_for_oov_no_acoustics.txt $dir/lats_iter2/out_of_ref_vocab_prons_learned.txt \
    $dir/oov_target_vocab_no_pron.txt $dir/ref_lexicon.txt | tr -s '\t' ' ' | sort | uniq > $dest_dict/lexicon.temp

  # Keep only the words of the target vocab.
  awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/target_vocab.txt \
    $dest_dict/lexicon.temp | sort | uniq > $dest_dict/lexicon.nosil

  # Add the non-scored entries to obtain the lexicon the edits will be applied to.
  cat $dir/non_scored_entries $dest_dict/lexicon.nosil | sort | uniq >$dest_dict/lexicon0.txt
fi

# Stage 8: apply the (possibly hand-edited) edits file to lexicon0.txt, producing the
# final $dest_dict/lexicon.txt, and keep a copy of the edits next to it.
if [ $stage -le 8 ]; then
  echo "$0: Apply the ref_lexicon_edits file to the reference lexicon."
  echo "  ... The user can inspect/modify the edits file and then re-run:"
  # Inside double quotes a backslash is kept literally before ">", so the redirection in
  # the hint is written unescaped; "\\" still prints the single line-continuation backslash.
  echo "  ... steps/dict/apply_lexicon_edits.py $dest_dict/lexicon0.txt $dir/lats_iter2/ref_lexicon_edits.txt  - | \\"
  echo "  ...   sort -u > $dest_dict/lexicon.txt to re-produce the final learned lexicon."
  cp $dir/lats_iter2/ref_lexicon_edits.txt $dest_dict/lexicon_edits.txt 2>/dev/null
  steps/dict/apply_lexicon_edits.py $dest_dict/lexicon0.txt $dir/lats_iter2/ref_lexicon_edits.txt - | \
    sort | uniq > $dest_dict/lexicon.txt || exit 1;
fi

echo "Lexicon learning ends successfully. Please refer to $dir/lats_iter2/log/lexicon_learning_summary.log"
echo "  for a summary. The learned lexicon, whose vocab matches the target_vocab, is $dest_dict/lexicon.txt"


================================================
FILE: egs/steps/dict/merge_learned_lexicons.py
================================================
#!/usr/bin/env python

# Copyright 2018  Xiaohui Zhang
# Apache 2.0.

from __future__ import print_function
from collections import defaultdict
import argparse
import sys
import math

def GetArgs():
    """Parse the command line and open all files named on it.

    The full command line is echoed to stderr first, so that it ends up in
    the log of the calling script.

    Returns:
        The argparse namespace after it has been passed through CheckArgs,
        i.e. with a ``*_handle`` attribute for every input and output file.
    """
    # Adjacent string literals are concatenated verbatim, so every fragment
    # that is followed by another one ends with an explicit space.
    parser = argparse.ArgumentParser(
        description = "Convert a learned lexicon produced by steps/dict/select_prons_greedy.py "
        "into a lexicon for OOV words (w.r.t. ref. vocab) and a human editable lexicon-edit file "
        "for in-vocab words, and generate detailed summaries of the lexicon learning results. "
        "The inputs are a learned lexicon, an arc-stats file, and three source lexicons "
        "(phonetic-decoding(PD)/G2P/ref). The outputs are: a learned lexicon for OOVs "
        "(learned_lexicon_oov), and a lexicon_edits file (ref_lexicon_edits) containing "
        "suggested modifications of prons, for in-vocab words.",
        epilog = "See steps/dict/learn_lexicon_greedy.sh for example.")
    parser.add_argument("arc_stats_file", metavar = "<arc-stats-file>", type = str,
                        help = "File containing word-pronunciation statistics obtained from lattices; "
                        "each line must be <word> <utt-id> <start-frame> <count> <phones>")
    parser.add_argument("word_counts_file", metavar = "<counts-file>", type = str,
                        help = "File containing word counts in acoustic training data; "
                        "each line must be <word> <count>.")
    parser.add_argument("ref_lexicon", metavar = "<reference-lexicon>", type = str,
                        help = "The reference lexicon (most probably hand-derived). "
                        "Each line must be <word> <phones>")
    parser.add_argument("g2p_lexicon", metavar = "<g2p-expanded-lexicon>", type = str,
                        help = "Candidate pronunciations from G2P results. "
                        "Each line must be <word> <phones>")
    parser.add_argument("pd_lexicon", metavar = "<prons-in-acoustic-evidence>", type = str,
                        help = "Candidate pronunciations from phonetic decoding results. "
                        "Each line must be <word> <phones>")
    parser.add_argument("learned_lexicon", metavar = "<learned-lexicon>", type = str,
                        help = "Learned lexicon. "
                        "Each line must be <word> <phones>")
    parser.add_argument("learned_lexicon_oov", metavar = "<learned-lexicon-oov>", type = str,
                        help = "Output file which is the learned lexicon for words out of the ref. vocab.")
    parser.add_argument("ref_lexicon_edits", metavar = "<lexicon-edits>", type = str,
                        help = "Output file containing human-readable & editable pronunciation info (and the "
                        "accept/reject decision made by our algorithm) for those words in ref. vocab, "
                        "to which any change has been recommended. The info for each word is like: "
                        "------------ an 4086.0 -------------- "
                        "R  | Y |  2401.6 |  AH N "
                        "R  | Y |  640.8 |  AE N "
                        "P  | Y |  1035.5 |  IH N "
                        "R(ef), P(hone-decoding) represents the pronunciation source, "
                        "Y/N means the recommended decision of including this pron or not, "
                        "and the numbers are soft counts accumulated from lattice-align-word outputs. "
                        "See the function WriteEditsAndSummary for more details.")

    print (' '.join(sys.argv), file=sys.stderr)

    args = parser.parse_args()
    args = CheckArgs(args)

    return args

def CheckArgs(args):
    """Open every file named in args and attach the handles to args.

    For each path attribute ``X`` an attribute ``X_handle`` is added. The
    arc-stats file is read from stdin when its path is "-"; the two output
    files (learned_lexicon_oov, ref_lexicon_edits) are opened for writing.

    Returns:
        The same namespace object, with the handle attributes set.
    """
    if args.arc_stats_file != "-":
        args.arc_stats_file_handle = open(args.arc_stats_file)
    else:
        args.arc_stats_file_handle = sys.stdin

    # Remaining inputs first, then the outputs, in a fixed order.
    input_attrs = ("word_counts_file", "ref_lexicon", "g2p_lexicon",
                   "pd_lexicon", "learned_lexicon")
    for attr in input_attrs:
        setattr(args, attr + "_handle", open(getattr(args, attr)))

    output_attrs = ("learned_lexicon_oov", "ref_lexicon_edits")
    for attr in output_attrs:
        setattr(args, attr + "_handle", open(getattr(args, attr), "w"))

    return args

def ReadArcStats(arc_stats_file_handle):
    """Read utterance-level pronunciation statistics.

    Each non-empty line must be ``<word> <utt-id> <start-frame> <count> <phones>``.

    Args:
        arc_stats_file_handle: an iterable of text lines (e.g. an open file).

    Returns:
        A pair ``(stats, stats_summed)``:
        ``stats[word][(utt, start_frame)][phones]`` is the count found on the
        line (a later line with the same key overwrites an earlier one), and
        ``stats_summed[(word, phones)]`` is the sum of the counts over all lines.

    Raises:
        Exception: if a non-empty line has fewer than five fields.
    """
    stats = defaultdict(lambda : defaultdict(dict))
    stats_summed = defaultdict(float)
    # Only used in the error message; handles such as StringIO have no name.
    source_name = str(getattr(arc_stats_file_handle, 'name', '<arc-stats-file>'))
    for line in arc_stats_file_handle:
        splits = line.strip().split()

        if len(splits) == 0:
            continue

        if len(splits) < 5:
            raise Exception('Invalid format of line ' + line
                                + ' in ' + source_name)
        word = splits[0]
        utt = splits[1]
        start_frame = int(splits[2])
        count = float(splits[3])
        phones = ' '.join(splits[4:])
        stats[word][(utt, start_frame)][phones] = count
        stats_summed[(word, phones)] += count
    return stats, stats_summed

def ReadWordCounts(word_counts_file_handle):
    """Read ``<word> <count>`` lines into a dict mapping word to int count.

    Fields after the second one are ignored. A line with fewer than two
    fields (an empty line included) raises an Exception; a repeated word
    keeps the count of its last line.
    """
    word_to_count = {}
    for raw_line in word_counts_file_handle:
        fields = raw_line.split()
        if len(fields) < 2:
            raise Exception('Invalid format of line ' + raw_line
                                + ' in counts file.')
        word_to_count[fields[0]] = int(fields[1])
    return word_to_count

def ReadLexicon(args, lexicon_file_handle, counts):
    """Read ``<word> <phones>`` lines into a dict mapping word to a set of prons.

    Words that are not keys of ``counts`` (i.e. not seen in the training
    data) are skipped, since prons are only learned for words with acoustic
    examples. Empty lines are ignored; a line holding a word but no phones
    raises an Exception. ``args`` is not used.
    """
    prons_by_word = defaultdict(set)
    for raw_line in lexicon_file_handle:
        fields = raw_line.split()
        if not fields:
            continue
        if len(fields) == 1:
            raise Exception('Invalid format of line ' + raw_line
                                + ' in lexicon file.')
        word, phone_seq = fields[0], fields[1:]
        if word in counts:
            prons_by_word[word].add(' '.join(phone_seq))
    return prons_by_word

def WriteEditsAndSummary(args, learned_lexicon, ref_lexicon, pd_lexicon, g2p_lexicon, counts, stats, stats_summed):
    """Write the lexicon-edits file and print a learning summary to stderr.

    Args:
        args: namespace whose ``ref_lexicon_edits_handle`` is the writable
            handle of the edits file.
        learned_lexicon, ref_lexicon, pd_lexicon, g2p_lexicon: dicts mapping
            a word to the set of its prons (each pron a space-joined string).
        counts: dict mapping a word to its count in the training data; only
            its keys are used, for the number of OOV words in the summary.
        stats: ``stats[word][(utt, start_frame)][pron]``, as returned by
            ReadArcStats; it is indexed with every learned word.
        stats_summed: ``stats_summed[(word, pron)]``, the summed soft counts.

    A word of the reference lexicon gets a section in the edits file when
    one of its reference prons was not selected, or when one of its selected
    prons comes from phonetic decoding or G2P. Sections are ordered by
    decreasing number of examples of the word in ``stats``.

    Raises:
        KeyError: if a selected pron of a word written to the edits file is
            in none of the three source lexicons.
    """
    threshold = 2
    # "words" contains four bins into which each word is classified, according to
    # whether its count is > threshold and whether it is an OOV w.r.t. the
    # reference lexicon: [0] frequent IV, [1] infrequent IV, [2] frequent OOV,
    # [3] infrequent OOV. Each bin maps a string of source flags to a set of words.
    words = [defaultdict(set) for _ in range(4)]

    src = {}  # (word, pron) -> 'P', 'R' or 'G': source of a selected pron.
    edits_handle = args.ref_lexicon_edits_handle
    print("# Note: This file contains pronunciation info for words who have candidate "
          "prons from G2P/phonetic-decoding accepted in the learned lexicon"
          ", sorted by their counts in acoustic training data, "
          ,file=edits_handle)
    print("# 1st Col: source of the candidate pron: G(2P) / P(hone-decoding) / R(eference)."
          ,file=edits_handle)
    print("# 2nd Col: accepted or not in the learned lexicon (Y/N).", file=edits_handle)
    print("# 3rd Col: soft counts from lattice-alignment (not augmented by prior-counts)."
          ,file=edits_handle)
    print("# 4th Col: the pronunciation cadidate.", file=edits_handle)

    # words which are to be printed into the edits file.
    words_to_edit = []
    num_prons_tot = 0
    for word in learned_lexicon:
        num_prons_tot += len(learned_lexicon[word])
        # This count could be smaller than the count read from the dict "counts",
        # since in each sub-utterance, multiple occurrences (which is rare) of the
        # same word are compressed into one. We use this count here so that in the
        # edit-file, soft counts for each word sum up to one.
        count = len(stats[word])
        # "flags" contains three binary indicators telling where this word's
        # selected prons come from: [0] phonetic decoding, [1] reference, [2] G2P.
        # A pron found in several lexicons is attributed to the first match.
        flags = ['0', '0', '0']
        for pron in learned_lexicon[word]:
            if word in pd_lexicon and pron in pd_lexicon[word]:
                flags[0] = '1'
                src[(word, pron)] = 'P'
            elif word in ref_lexicon and pron in ref_lexicon[word]:
                flags[1] = '1'
                src[(word, pron)] = 'R'
            elif word in g2p_lexicon and pron in g2p_lexicon[word]:
                flags[2] = '1'
                src[(word, pron)] = 'G'
        if word in ref_lexicon:
            all_ref_prons_accepted = all(pron in learned_lexicon[word]
                                         for pron in ref_lexicon[word])
            if not all_ref_prons_accepted or flags[0] == '1' or flags[2] == '1':
                words_to_edit.append((word, count))
            if count > threshold:
                words[0][flags[0] + flags[1] + flags[2]].add(word)
            else:
                words[1][flags[0] + flags[1] + flags[2]].add(word)
        else:
            if count > threshold:
                words[2][flags[0] + flags[2]].add(word)
            else:
                words[3][flags[0] + flags[2]].add(word)

    words_to_edit_sorted = sorted(words_to_edit, key=lambda entry: entry[1], reverse=True)
    for word, count in words_to_edit_sorted:
        print("------------",word, "%2.1f" % count, "--------------", file=edits_handle)
        learned_prons = []
        for pron in learned_lexicon[word]:
            learned_prons.append((src[(word, pron)], 'Y', stats_summed[(word, pron)], pron))
        # Rejected reference prons are listed too, marked 'N'.
        for pron in ref_lexicon[word]:
            if pron not in learned_lexicon[word]:
                learned_prons.append(('R', 'N', stats_summed[(word, pron)], pron))
        learned_prons_sorted = sorted(learned_prons, key=lambda item: item[2], reverse=True)
        for item in learned_prons_sorted:
            print('{} | {} |  {:.2f} | {}'.format(item[0], item[1], item[2], item[3]), file=edits_handle)

    num_oovs_with_acoustic_evidence = len(set(learned_lexicon.keys()).difference(set(ref_lexicon.keys())))
    num_oovs = len(set(counts.keys()).difference(set(ref_lexicon.keys())))
    num_ivs = len(learned_lexicon) - num_oovs_with_acoustic_evidence
    # An empty learned lexicon must not abort the summary with a division by zero.
    if learned_lexicon:
        avg_num_prons = float(num_prons_tot)/float(len(learned_lexicon))
    else:
        avg_num_prons = 0.0
    print("Average num. prons per word in the learned lexicon is {}".format(avg_num_prons), file=sys.stderr)
    # print("Here are the words whose reference pron candidates were all declined", words[0]['100'], file=sys.stderr)
    print("-------------------------------------------------Summary------------------------------------------", file=sys.stderr)
    print("We have acoustic evidence for {} out of {} in-vocab (w.r.t the reference lexicon) words from the acoustic training data.".format(num_ivs, len(ref_lexicon)), file=sys.stderr)
    print("  Among those frequent words whose counts in the training text > ", threshold, ":", file=sys.stderr)
    num_freq_ivs_from_all_sources = len(words[0]['111']) + len(words[0]['110']) + len(words[0]['011'])
    num_freq_ivs_from_g2p_or_phonetic_decoding = len(words[0]['101']) + len(words[0]['001']) + len(words[0]['100'])
    num_freq_ivs_from_ref = len(words[0]['010'])
    num_infreq_ivs_from_all_sources = len(words[1]['111']) + len(words[1]['110']) + len(words[1]['011'])
    num_infreq_ivs_from_g2p_or_phonetic_decoding = len(words[1]['101']) + len(words[1]['001']) + len(words[1]['100'])
    num_infreq_ivs_from_ref = len(words[1]['010'])
    print('    {} words\' selected prons came from the reference lexicon, G2P/phonetic-decoding.'.format(num_freq_ivs_from_all_sources), file=sys.stderr)
    print('    {} words\' selected prons come from G2P/phonetic-decoding-generated.'.format(num_freq_ivs_from_g2p_or_phonetic_decoding), file=sys.stderr)
    print('    {} words\' selected prons came from the reference lexicon only.'.format(num_freq_ivs_from_ref), file=sys.stderr)
    print('  For those words whose counts in the training text <= {}:'.format(threshold), file=sys.stderr)
    print('    {} words\' selected prons came from the reference lexicon, G2P/phonetic-decoding.'.format(num_infreq_ivs_from_all_sources), file=sys.stderr)
    print('    {} words\' selected prons come from G2P/phonetic-decoding-generated.'.format(num_infreq_ivs_from_g2p_or_phonetic_decoding), file=sys.stderr)
    print('    {} words\' selected prons came from the reference lexicon only.'.format(num_infreq_ivs_from_ref), file=sys.stderr)
    print("---------------------------------------------------------------------------------------------------", file=sys.stderr)
    num_freq_oovs_from_both_sources = len(words[2]['11'])
    num_freq_oovs_from_phonetic_decoding = len(words[2]['10'])
    num_freq_oovs_from_g2p = len(words[2]['01'])
    num_infreq_oovs_from_both_sources = len(words[3]['11'])
    num_infreq_oovs_from_phonetic_decoding = len(words[3]['10'])
    num_infreq_oovs_from_g2p = len(words[3]['01'])
    print('We have acoustic evidence for {} out of {} OOV (w.r.t the reference lexicon) words from the acoustic training data.'.format(num_oovs_with_acoustic_evidence, num_oovs), file=sys.stderr)
    print('  Among those words whose counts in the training text > {}:'.format(threshold), file=sys.stderr)
    print('    {} words\' selected prons came from G2P and phonetic-decoding.'.format(num_freq_oovs_from_both_sources), file=sys.stderr)
    print('    {} words\' selected prons came from phonetic decoding only.'.format(num_freq_oovs_from_phonetic_decoding), file=sys.stderr)
    print('    {} words\' selected prons came from G2P only.'.format(num_freq_oovs_from_g2p), file=sys.stderr)
    print('  For those words whose counts in the training text <= {}:'.format(threshold), file=sys.stderr)
    print('    {} words\' selected prons came from G2P and phonetic-decoding.'.format(num_infreq_oovs_from_both_sources), file=sys.stderr)
    print('    {} words\' selected prons came from phonetic decoding only.'.format(num_infreq_oovs_from_phonetic_decoding), file=sys.stderr)
    print('    {} words\' selected prons came from G2P only.'.format(num_infreq_oovs_from_g2p), file=sys.stderr)

def WriteLearnedLexiconOov(learned_lexicon, ref_lexicon, file_handle):
    """Write the learned prons of words that are absent from the reference lexicon.

    Args:
        learned_lexicon: mapping from word to an iterable of pronunciations.
        ref_lexicon: container of reference words; only membership is tested.
        file_handle: writable text handle.  One "<word> <pron>" line is
            written per pronunciation, and the handle is closed on return.
    """
    # dict.items() works on both Python 2 and Python 3, whereas
    # dict.iteritems() only exists on Python 2.
    for word, prons in learned_lexicon.items():
        if word in ref_lexicon:
            continue
        for pron in prons:
            print('{0} {1}'.format(word, pron), file=file_handle)
    file_handle.close()

def Main():
    """Load every input, then emit the OOV lexicon and the edits/summary."""
    args = GetArgs()

    # The word counts are read first because each lexicon reader takes them
    # as an argument.
    word_counts = ReadWordCounts(args.word_counts_file_handle)
    reference = ReadLexicon(args, args.ref_lexicon_handle, word_counts)
    from_g2p = ReadLexicon(args, args.g2p_lexicon_handle, word_counts)
    from_phonetic_decoding = ReadLexicon(args, args.pd_lexicon_handle, word_counts)
    arc_stats, arc_stats_summed = ReadArcStats(args.arc_stats_file_handle)
    learned = ReadLexicon(args, args.learned_lexicon_handle, word_counts)

    # Learned prons of words outside the reference vocabulary go to
    # learned_lexicon_oov.
    WriteLearnedLexiconOov(learned, reference, args.learned_lexicon_oov_handle)
    # Edits go to ref_lexicon_edits; the summary goes to stderr.
    WriteEditsAndSummary(args, learned, reference, from_phonetic_decoding,
                         from_g2p, word_counts, arc_stats, arc_stats_summed)

# Run the tool only when this file is executed as a script (not on import).
if __name__ == "__main__":
    Main()


================================================
FILE: egs/steps/dict/prons_to_lexicon.py
================================================
#!/usr/bin/env python

# Copyright 2016  Vimal Manohar
#           2016  Xiaohui Zhang
# Apache 2.0.

# we're using python 3.x style print but want it to work in python 2.x,
from __future__ import print_function
from collections import defaultdict
import argparse
import sys

class StrToBoolAction(argparse.Action):
    """argparse action that turns the shell-style strings "true"/"false"
    into the Python booleans True/False on the destination attribute."""

    def __call__(self, parser, namespace, values, option_string=None):
        # Try each accepted spelling in turn; anything else is an error.
        for spelling, flag in (("true", True), ("false", False)):
            if values == spelling:
                setattr(namespace, self.dest, flag)
                return
        raise Exception("Unknown value {0} for --{1}".format(values, self.dest))

def GetArgs():
    """Define and parse the command-line interface of prons_to_lexicon.py.

    The invoking command line is echoed to stderr, the arguments are parsed,
    and the resulting namespace is passed through CheckArgs before being
    returned.
    """
    parser = argparse.ArgumentParser(description = "Converts pronunciation statistics (from phonetic decoding or g2p) "
                                     "into a lexicon for. We prune the pronunciations "
                                     "based on a provided stats file, and optionally filter out entries which are present "
                                     "in a filter lexicon.",
                                     epilog = "e.g. steps/dict/prons_to_lexicon.py --min-prob=0.4 \\"
                                     "--filter-lexicon=exp/tri3_lex_0.4_work/phone_decode/filter_lexicon.txt \\"
                                     "exp/tri3_lex_0.4_work/phone_decode/prons.txt \\"
                                     "exp/tri3_lex_0.4_work/lexicon_phone_decoding.txt"
                                     "See steps/dict/learn_lexicon_greedy.sh for examples in detail.")

    # The two boolean options take the literal strings "true"/"false";
    # StrToBoolAction converts them to Python booleans.
    parser.add_argument("--set-sum-to-one", type = str, default = False,
                        action = StrToBoolAction, choices = ["true", "false"],
                        help = "If normalize lexicon such that the sum of "
                        "probabilities is 1.")
    parser.add_argument("--set-max-to-one", type = str, default = True,
                        action = StrToBoolAction, choices = ["true", "false"],
                        help = "If normalize lexicon such that the max "
                        "probability is 1.")
    parser.add_argument("--top-N", type = int, default = 0,
                        help = "If non-zero, we just take the top N pronunciations (according to stats/pron-probs) for each word.")
    parser.add_argument("--min-prob", type = float, default = 0.1,
                        help = "Remove pronunciation with probabilities less "
                        "than this value after normalization.")
    parser.add_argument("--filter-lexicon", metavar='<filter-lexicon>', type = str, default = '',
                        help = "Exclude entries in this filter lexicon from the output lexicon."
                        "each line must be <word> <phones>")
    parser.add_argument("stats_file", metavar='<stats-file>', type = str,
                        help = "Input lexicon file containing pronunciation statistics/probs in the first column."
                        "each line must be <counts> <word> <phones>")
    parser.add_argument("out_lexicon", metavar='<out-lexicon>', type = str,
                        help = "Output lexicon.")

    # Echo the full command line to stderr.
    print (' '.join(sys.argv), file = sys.stderr)

    args = parser.parse_args()
    # CheckArgs attaches the file handles and validates the option combination.
    args = CheckArgs(args)

    return args

def CheckArgs(args):
    """Validate the parsed options and attach the file handles to ``args``.

    Sets ``args.stats_file_handle`` and ``args.out_lexicon_handle`` always,
    and ``args.filter_lexicon_handle`` only when a filter lexicon was given.
    A file name of "-" selects stdin for the two inputs and stdout for the
    output.

    Raises:
        Exception: if set_max_to_one and set_sum_to_one have the same value.
    """
    # Validate the flags first so that an invalid invocation does not
    # truncate the output file.
    if args.set_max_to_one == args.set_sum_to_one:
        raise Exception("Cannot have both "
            "set-max-to-one and set-sum-to-one as true or false.")

    if args.stats_file == "-":
        args.stats_file_handle = sys.stdin
    else:
        args.stats_file_handle = open(args.stats_file)

    # Compare by value: "is not ''" tests object identity, which is an
    # implementation detail and a SyntaxWarning on Python >= 3.8.
    if args.filter_lexicon != '':
        if args.filter_lexicon == "-":
            # The filter lexicon is read, so "-" has to mean stdin.
            args.filter_lexicon_handle = sys.stdin
        else:
            args.filter_lexicon_handle = open(args.filter_lexicon)

    if args.out_lexicon == "-":
        args.out_lexicon_handle = sys.stdout
    else:
        args.out_lexicon_handle = open(args.out_lexicon, "w")

    return args

def ReadStats(args):
    """Accumulate pronunciation counts from ``args.stats_file_handle``.

    Each usable line is "<count> <word> <phone> [<phone> ...]"; lines with
    fewer than three fields are skipped.

    Returns:
        A two-element list ``[pron_counts, word_totals]`` where pron_counts
        maps (word, phones) to the summed count and word_totals maps word to
        the summed count over all of its pronunciations.
    """
    pron_counts = {}
    word_totals = {}
    for raw_line in args.stats_file_handle:
        fields = raw_line.strip().split()
        if len(fields) >= 3:
            word = fields[1]
            count = float(fields[0])
            key = (word, ' '.join(fields[2:]))
            pron_counts[key] = pron_counts.get(key, 0) + count
            word_totals[word] = word_totals.get(word, 0) + count

    return [pron_counts, word_totals]

def ReadLexicon(lexicon_file_handle):
    """Read "<word> <phones>" lines into a set of (word, phones) pairs.

    A falsy handle yields an empty set.  Blank lines are ignored; a line
    with a word but no phones raises Exception.
    """
    entries = set()
    if not lexicon_file_handle:
        return entries
    for raw_line in lexicon_file_handle.readlines():
        fields = raw_line.strip().split()
        if not fields:
            continue
        if len(fields) == 1:
            raise Exception('Invalid format of line ' + raw_line
                                + ' in lexicon file.')
        entries.add((fields[0], ' '.join(fields[1:])))
    return entries

def ConvertWordCountsToProbs(args, lexicon, word_count):
    """Turn per-pronunciation counts into per-word relative frequencies.

    Args:
        args: unused; kept for interface compatibility.
        lexicon: dict mapping (word, phones) to a count.
        word_count: dict mapping word to the total count of that word.

    Returns:
        dict mapping word to a list of (phones, prob) pairs, where prob is
        the pronunciation count divided by the word's total count.
    """
    word_probs = {}
    # dict.items() works on Python 2 and 3; dict.iteritems() is Python 2 only.
    for entry, count in lexicon.items():
        word, phones = entry[0], entry[1]
        prob = float(count) / float(word_count[word])
        word_probs.setdefault(word, []).append((phones, prob))

    return word_probs

def ConvertWordProbsToLexicon(word_probs):
    """Flatten {word: [(phones, prob), ...]} into {(word, phones): prob}.

    Probabilities of repeated (word, phones) pairs are summed.
    """
    lexicon = {}
    # dict.items() works on Python 2 and 3; dict.iteritems() is Python 2 only.
    for word, prons in word_probs.items():
        for pron in prons:
            key = (word, pron[0])
            lexicon[key] = lexicon.get(key, 0) + pron[1]
    return lexicon

def NormalizeLexicon(lexicon, set_max_to_one = True,
                     set_sum_to_one = False, min_prob = 0):
    """Normalize pronunciation probabilities per word, in place.

    Args:
        lexicon: dict mapping (word, phones) to a probability; its values
            are overwritten.
        set_max_to_one: divide each prob by the largest prob of its word.
        set_sum_to_one: otherwise, if true, divide by the sum over the word.
        min_prob: probs below this value after normalization are set to 0.
    """
    # Per word: (sum of probs, max of probs).
    word_stats = {}
    # dict.items() works on Python 2 and 3; dict.iteritems() is Python 2 only.
    for entry, prob in lexicon.items():
        total, largest = word_stats.get(entry[0], (0, 0))
        word_stats[entry[0]] = (total + prob, max(largest, prob))

    # Only values of existing keys are reassigned below, so mutating the
    # dict while iterating over it is safe.
    for entry, prob in lexicon.items():
        total, largest = word_stats[entry[0]]
        if set_max_to_one:
            denominator = largest
        elif set_sum_to_one:
            denominator = total
        else:
            denominator = None
        # Skip the division when no normalization was requested or when all
        # probs of the word are zero (which used to raise ZeroDivisionError).
        if denominator:
            prob = prob / denominator
        if prob < min_prob:
            prob = 0
        lexicon[entry] = prob

def TakeTopN(lexicon, top_N):
    """Keep only the top_N most probable pronunciations of each word.

    Works in place: every (word, phones) entry ranked below the top_N for
    its word gets its value set to 0; the kept entries are left untouched.

    Args:
        lexicon: dict mapping (word, phones) to a probability.
        top_N: number of pronunciations to keep per word.
    """
    prons_by_word = defaultdict(list)
    # dict.items() works on Python 2 and 3; dict.iteritems() is Python 2 only.
    for entry, prob in lexicon.items():
        prons_by_word[entry[0]].append((entry[1], prob))
    for word, prons in prons_by_word.items():
        ranked = sorted(prons, reverse=True, key=lambda pron: pron[1])
        # max() keeps a non-positive top_N meaning "zero out everything".
        for phones, _ in ranked[max(top_N, 0):]:
            lexicon[(word, phones)] = 0
        
def WriteLexicon(args, lexicon, filter_lexicon):
    """Write the surviving pronunciations and print a summary to stderr.

    Entries whose probability is 0 count as removed; entries present in
    filter_lexicon count as filtered.  Everything else is written to
    ``args.out_lexicon_handle`` as "<word> <phones>".

    Args:
        args: namespace providing out_lexicon_handle and min_prob.
        lexicon: dict mapping (word, phones) to a probability.
        filter_lexicon: container of (word, phones) pairs to exclude.
    """
    words = set()
    num_removed = 0
    num_filtered = 0
    # dict.items() works on Python 2 and 3; dict.iteritems() is Python 2 only.
    for entry, prob in lexicon.items():
        if prob == 0:
            num_removed += 1
            continue
        if entry in filter_lexicon:
            num_filtered += 1
            continue
        words.add(entry[0])
        print("{0} {1}".format(entry[0], entry[1]),
                file = args.out_lexicon_handle)
    print ("Before pruning, the total num. pronunciations is: {}".format(len(lexicon)), file=sys.stderr)
    print ("Removed {0} pronunciations by setting min_prob {1}".format(num_removed, args.min_prob), file=sys.stderr)
    print ("Filtered out {} pronunciations in the filter lexicon.".format(num_filtered), file=sys.stderr)
    num_prons_from_phone_decoding = len(lexicon) - num_removed - num_filtered
    # The trailing space keeps the two concatenated literals from running
    # together ("phone decodingis").
    print ("Num. pronunciations in the output lexicon, which solely come from phone decoding "
           "is {0}. num. words is {1}".format(num_prons_from_phone_decoding, len(words)), file=sys.stderr)

def Main():
    """Convert pronunciation statistics into a pruned output lexicon."""
    args = GetArgs()

    [lexicon, word_count] = ReadStats(args)

    word_probs = ConvertWordCountsToProbs(args, lexicon, word_count)

    lexicon = ConvertWordProbsToLexicon(word_probs)
    filter_lexicon = set()
    # Compare by value: "is not ''" tests object identity, which is an
    # implementation detail and a SyntaxWarning on Python >= 3.8.
    if args.filter_lexicon != '':
        filter_lexicon = ReadLexicon(args.filter_lexicon_handle)
    # Top-N selection and probability normalization are alternatives: with
    # --top-N > 0 the normalization / min-prob pruning branch is not run.
    if args.top_N > 0:
        TakeTopN(lexicon, args.top_N)
    else:
        NormalizeLexicon(lexicon, set_max_to_one = args.set_max_to_one,
                         set_sum_to_one = args.set_sum_to_one,
                         min_prob = args.min_prob)
    WriteLexicon(args, lexicon, filter_lexicon)
    args.out_lexicon_handle.close()

# Run the tool only when this file is executed as a script (not on import).
if __name__ == "__main__":
    Main()


================================================
FILE: egs/steps/dict/prune_pron_candidates.py
================================================
#!/usr/bin/env python

# Copyright 2016  Xiaohui Zhang
# Apache 2.0.

from __future__ import print_function
from __future__ import division
from collections import defaultdict
import argparse
import sys
import math

def GetArgs():
    """Define and parse the command-line interface of prune_pron_candidates.py.

    The invoking command line is echoed to stderr, the arguments are parsed,
    and the resulting namespace is passed through CheckArgs before being
    returned.
    """
    parser = argparse.ArgumentParser(description = "Prune pronunciation candidates based on soft-counts from lattice-alignment"
                                     "outputs, and a reference lexicon. Basically, for each word we sort all pronunciation"
                                     "cadidates according to their soft-counts, and then select the top r * N candidates"
                                     "(For words in the reference lexicon, N = # pron variants given by the reference"
                                     "lexicon; For oov words, N = avg. # pron variants per word in the reference lexicon)."
                                     "r is a user-specified constant, like 2.",
                                     epilog = "See steps/dict/learn_lexicon_greedy.sh for example")

    # The default is given as a string; argparse converts string defaults
    # through the declared type (float).
    parser.add_argument("--r", type = float, default = "2.0",
                        help = "a user-specified ratio parameter which determines how many"
                        "pronunciation candidates we want to keep for each word.")
    parser.add_argument("pron_stats", metavar = "<pron-stats>", type = str,
                        help = "File containing soft-counts of all pronounciation candidates; "
                        "each line must be <soft-counts> <word> <phones>")
    parser.add_argument("ref_lexicon", metavar = "<ref-lexicon>", type = str,
                        help = "Reference lexicon file, where we obtain # pron variants for"
                        "each word, based on which we prune the pron candidates.")
    parser.add_argument("pruned_prons", metavar = "<pruned-prons>", type = str,
                        help = "A file in lexicon format, which contains prons we want to" 
                        "prune away from the pron_stats file.")

    # Echo the full command line to stderr.
    print (' '.join(sys.argv), file=sys.stderr)

    args = parser.parse_args()
    # CheckArgs opens the files and attaches the handles to args.
    args = CheckArgs(args)

    return args

def CheckArgs(args):
    """Open the two input files and the output, storing the handles on args.

    A pruned_prons value of "-" selects stdout; any other value is opened
    for writing.
    """
    write_to_stdout = args.pruned_prons == "-"
    args.pron_stats_handle = open(args.pron_stats)
    args.ref_lexicon_handle = open(args.ref_lexicon)
    args.pruned_prons_handle = (sys.stdout if write_to_stdout
                                else open(args.pruned_prons, "w"))
    return args

def ReadStats(pron_stats_handle):
    """Read "<soft-count> <word> <phones>" lines into per-word candidate lists.

    Returns:
        defaultdict mapping word to a list of (phones, count) pairs sorted by
        ascending count, so that list.pop() yields the highest count first.

    Raises:
        Exception: for a non-blank line with fewer than two fields.
    """
    stats = defaultdict(list)
    for raw_line in pron_stats_handle.readlines():
        fields = raw_line.strip().split()
        if not fields:
            continue
        if len(fields) < 2:
            raise Exception('Invalid format of line ' + raw_line
                                + ' in stats file.')
        soft_count = float(fields[0])
        stats[fields[1]].append((' '.join(fields[2:]), soft_count))

    for candidates in stats.values():
        candidates.sort(key=lambda pair: pair[1])
    return stats

def ReadLexicon(ref_lexicon_handle):
    """Read the reference lexicon into {word: set of pronunciations}.

    Both "<word> <phones>" and "<word> <prob> <phones>" lines are accepted:
    when the second field parses as a float it is taken to be a
    pronunciation probability and left out of the pronunciation.

    Raises:
        Exception: for a non-blank line with fewer than two fields.
    """
    ref_lexicon = defaultdict(set)
    for line in ref_lexicon_handle.readlines():
        splits = line.strip().split()
        if len(splits) == 0:
            continue
        if len(splits) < 2:
            raise Exception('Invalid format of line ' + line
                                + ' in lexicon file.')
        word = splits[0]
        # Probe the second field.  Without this float() call the ValueError
        # branch is unreachable and the first phone of every plain
        # "<word> <phones>" entry is silently dropped.
        # NOTE(review): a phone whose name parses as a float (e.g. "1") would
        # be mistaken for a probability column.
        try:
            float(splits[1])
        except ValueError:
            phones = ' '.join(splits[1:])
        else:
            phones = ' '.join(splits[2:])
        ref_lexicon[word].add(phones)
    return ref_lexicon

def PruneProns(args, stats, ref_lexicon):
    """Print the pronunciation candidates that should be pruned away.

    For each word the highest-count candidates are popped off the end of its
    list in ``stats`` until r * N non-reference candidates have been taken
    (N = number of reference prons of the word, or the rounded-up average
    number of prons per reference word for words outside the reference
    lexicon).  Whatever non-reference candidates remain are written to
    ``args.pruned_prons_handle`` as "<word> <pron>".  ``stats`` is consumed
    in the process.
    """
    # Average number of pronunciation variants per reference word, rounded up.
    total_ref_prons = sum(len(prons) for prons in ref_lexicon.values())
    avg_variants_counts_ref = math.ceil(float(total_ref_prons) / float(len(ref_lexicon)))

    def is_outside_reference(word, pron):
        # Membership is tested before indexing so that a defaultdict
        # reference lexicon never gains new keys.
        return word not in ref_lexicon or pron not in ref_lexicon[word]

    for word, candidates in stats.items():
        if word in ref_lexicon:
            budget = args.r * len(ref_lexicon[word])
        else:
            budget = args.r * avg_variants_counts_ref
        taken = 0
        # Reference prons popped here are dropped without using up budget.
        while taken < budget and candidates:
            pron, _ = candidates.pop()
            if is_outside_reference(word, pron):
                taken += 1

    for word, candidates in stats.items():
        for pron, _ in candidates:
            if is_outside_reference(word, pron):
                print('{0} {1}'.format(word, pron), file=args.pruned_prons_handle)

def Main():
    """Read the reference lexicon and pron stats, then print the pruned prons."""
    args = GetArgs()
    reference = ReadLexicon(args.ref_lexicon_handle)
    candidate_stats = ReadStats(args.pron_stats_handle)
    PruneProns(args, candidate_stats, reference)

# Run the tool only when this file is executed as a script (not on import).
if __name__ == "__main__":
    Main()


================================================
FILE: egs/steps/dict/select_prons_bayesian.py
================================================
#!/usr/bin/env python

# Copyright 2016  Xiaohui Zhang
# Apache 2.0.

from __future__ import print_function
from __future__ import division
from collections import defaultdict
import argparse
import sys
import math

def GetArgs():
    """Define and parse the command-line interface of select_prons_bayesian.py.

    The invoking command line is echoed to stderr, the arguments are parsed,
    and the resulting namespace is passed through CheckArgs before being
    returned.
    """
    parser = argparse.ArgumentParser(description = "Use a Bayesian framework to select"
                                     "pronunciation candidates from three sources: reference lexicon"
                                     ", G2P lexicon and phonetic-decoding lexicon. The inputs are a word-stats file,"
                                     "a pron-stats file, and three source lexicons (ref/G2P/phonetic-decoding)."
                                     "We assume the pronunciations for each word follow a Categorical distribution"
                                     "with Dirichlet priors. Thus, with user-specified prior counts (parameterized by"
                                     "prior-mean and prior-count-tot) and observed counts from the pron-stats file, "
                                     "we can compute posterior for each pron, and select candidates with highest"
                                     "posteriors, until we hit user-specified variants-prob-mass/counts thresholds."
                                     "The outputs are: a file specifiying posteriors of all candidate (pron_posteriors),"
                                     "a learned lexicon for words out of the ref. vocab (learned_lexicon_oov),"
                                     "and a lexicon_edits file containing suggested modifications of prons, for"
                                     "words within the ref. vocab (ref_lexicon_edits).",
                                     epilog = "See steps/dict/learn_lexicon_bayesian.sh for example.")
    # NOTE: the default "0,0,0" is rejected by CheckArgs (each mean must lie
    # strictly between 0 and 1), so this option has to be given explicitly.
    parser.add_argument("--prior-mean", type = str, default = "0,0,0",
                        help = "Mean of priors (summing up to 1) assigned to three exclusive n"
                        "pronunciatio sources: reference lexicon, g2p, and phonetic decoding. We "
                        "recommend setting a larger prior mean for the reference lexicon, e.g. '0.6,0.2,0.2'")
    parser.add_argument("--prior-counts-tot", type = float, default = 15.0,
                        help = "Total amount of prior counts we add to all pronunciation candidates of"
                        "each word. By timing it with the prior mean of a source, and then dividing"
                        "by the number of candidates (for a word) from this source, we get the"
                        "prior counts we actually add to each candidate.")
    parser.add_argument("--variants-prob-mass", type = float, default = 0.7,
                        help = "For each word, we pick up candidates (from all three sources)"
                        "with highest posteriors until the total prob mass hit this amount.")
    parser.add_argument("--variants-prob-mass-ref", type = float, default = 0.9,
                        help = "For each word, after the total prob mass of selected candidates "
                        "hit variants-prob-mass, we continue to pick up reference candidates"
                        "with highest posteriors until the total prob mass hit this amount (must >= variants-prob-mass).")
    parser.add_argument("--variants-counts", type = int, default = 1,
                        help = "Generate upto this many variants of prons for each word out"
                        "of the ref. lexicon.")
    # Positional arguments: six input files followed by three output files.
    parser.add_argument("silence_file", metavar = "<silphonetic-file>", type = str,
                        help = "File containing a list of silence phones.")
    parser.add_argument("pron_stats_file", metavar = "<stats-file>", type = str,
                        help = "File containing pronunciation statistics from lattice alignment; "
                        "each line must be <count> <word> <phones>.")
    parser.add_argument("word_counts_file", metavar = "<counts-file>", type = str,
                        help = "File containing word counts in acoustic training data; "
                        "each line must be <word> <count>.")
    parser.add_argument("ref_lexicon", metavar = "<reference-lexicon>", type = str,
                        help = "The reference lexicon (most probably hand-derived)."
                        "Each line must be <word> <phones>")
    parser.add_argument("g2p_lexicon", metavar = "<g2p-expanded-lexicon>", type = str,
                        help = "Candidate ronouciations from G2P results."
                        "Each line must be <word> <phones>")
    parser.add_argument("phonetic_decoding_lexicon", metavar = "<prons-in-acoustic-evidence>", type = str,
                        help = "Candidate ronouciations from phonetic decoding results."
                        "Each line must be <word> <phones>")
    parser.add_argument("pron_posteriors", metavar = "<pron-posteriors>", type = str,
                        help = "Output file containing posteriors of all candidate prons for each word,"
                        "based on which we select prons to construct the learned lexicon."
                        "each line is <word> <pronunciation-source: one of R(ef)/G(2P)/P(hone-decoding)> <posterior> <pronunciation> ")
    parser.add_argument("learned_lexicon_oov", metavar = "<learned-lexicon-oov>", type = str,
                        help = "Output file which is the learned lexicon for words out of the ref. vocab.")
    parser.add_argument("ref_lexicon_edits", metavar = "<lexicon-edits>", type = str,
                        help = "Output file containing human-readable & editable pronounciation info (and the"
                        "accept/reject decision made by our algorithm) for those words in ref. vocab," 
                        "to which any change has been recommended. The info for each word is like:" 
                        "------------ an 4086.0 --------------"
                        "R  | Y |  2401.6 |  AH N"
                        "R  | Y |  640.8 |  AE N"
                        "P  | Y |  1035.5 |  IH N"
                        "R(ef), P(hone-decoding) represents the pronunciation source"
                        "Y/N means the recommended decision of including this pron or not"
                        "and the numbers are soft counts accumulated from lattice-align-word outputs. "
                        "See the function WriteEditsAndSummary for more details.")


    # Echo the full command line to stderr.
    print (' '.join(sys.argv), file=sys.stderr)

    args = parser.parse_args()
    # CheckArgs opens the files and parses --prior-mean into a list of floats.
    args = CheckArgs(args)

    return args

def CheckArgs(args):
    """Validate --prior-mean and open every input/output file.

    ``args.prior_mean`` is replaced by a list of three floats, and one
    ``*_handle`` attribute is attached per file argument.  A pron_stats_file
    of "-" selects stdin.

    Raises:
        Exception: if --prior-mean does not consist of exactly three
            comma-separated values, or if a value is not strictly between
            0 and 1.
        ValueError: if a prior mean is not a number.
    """
    # Validate the prior before opening anything, so that a bad invocation
    # neither leaks handles nor truncates the three output files.
    prior_mean = args.prior_mean.strip().split(',')
    # Compare by value: "is not 3" tests object identity, which is an
    # implementation detail and a SyntaxWarning on Python >= 3.8.
    if len(prior_mean) != 3:
        raise Exception('Invalid Dirichlet prior mean ', args.prior_mean)
    parsed_prior_mean = []
    for value in prior_mean:
        mean = float(value)
        if mean <= 0 or mean >= 1:
            raise Exception('Dirichlet prior mean', value, 'is invalid, it must be between 0 and 1.')
        parsed_prior_mean.append(mean)

    args.silence_file_handle = open(args.silence_file)
    if args.pron_stats_file == "-":
        args.pron_stats_file_handle = sys.stdin
    else:
        args.pron_stats_file_handle = open(args.pron_stats_file)
    args.word_counts_file_handle = open(args.word_counts_file)
    args.ref_lexicon_handle = open(args.ref_lexicon)
    args.g2p_lexicon_handle = open(args.g2p_lexicon)
    args.phonetic_decoding_lexicon_handle = open(args.phonetic_decoding_lexicon)
    args.pron_posteriors_handle = open(args.pron_posteriors, "w")
    args.learned_lexicon_oov_handle = open(args.learned_lexicon_oov, "w")
    args.ref_lexicon_edits_handle = open(args.ref_lexicon_edits, "w")

    args.prior_mean = parsed_prior_mean

    return args

def ReadPronStats(pron_stats_file_handle):
    """Read "<count> <word> <phones>" lines into {(word, phones): count}.

    Blank lines are skipped; a later line for the same (word, phones) pair
    overwrites the earlier count.

    Raises:
        Exception: for a non-blank line with fewer than two fields.
    """
    stats = {}
    for raw_line in pron_stats_file_handle.readlines():
        fields = raw_line.strip().split()
        if not fields:
            continue
        if len(fields) < 2:
            raise Exception('Invalid format of line ' + raw_line
                                + ' in stats file.')
        soft_count = float(fields[0])
        stats[(fields[1], ' '.join(fields[2:]))] = soft_count
    return stats

def ReadWordCounts(word_counts_file_handle):
    """Read "<word> <count>" lines into {word: int count}.

    Raises:
        Exception: for any line with fewer than two fields; blank lines are
            not exempt.
    """
    counts = {}
    for raw_line in word_counts_file_handle.readlines():
        fields = raw_line.strip().split()
        if len(fields) < 2:
            raise Exception('Invalid format of line ' + raw_line
                                + ' in counts file.')
        counts[fields[0]] = int(fields[1])
    return counts

def ReadLexicon(args, lexicon_file_handle, counts):
    """Read "<word> <phones>" lines into {word: set of pronunciations}.

    Words that are not in ``counts`` are dropped: pronunciations are only
    learned for words that have acoustic examples.  ``args`` is not used.

    Raises:
        Exception: for a non-blank line with fewer than two fields.
    """
    lexicon = defaultdict(set)
    for raw_line in lexicon_file_handle.readlines():
        fields = raw_line.strip().split()
        if not fields:
            continue
        if len(fields) < 2:
            raise Exception('Invalid format of line ' + raw_line
                                + ' in lexicon file.')
        word = fields[0]
        if word in counts:
            lexicon[word].add(' '.join(fields[1:]))
    return lexicon

def FilterPhoneticDecodingLexicon(args, phonetic_decoding_lexicon, stats):
    """Drop phonetic-decoding candidates that contain a silence phone.

    The silence phones are read, one per line, from
    ``args.silence_file_handle``.  Each rejected (word, pron) pair is removed
    from both ``phonetic_decoding_lexicon`` and ``stats`` (in place), and a
    warning is printed to stderr.

    Returns:
        The tuple (phonetic_decoding_lexicon, stats).
    """
    silence_phones = set(line.strip() for line in args.silence_file_handle)

    # Collect first and remove afterwards, so that the pron sets are not
    # modified while they are being iterated.
    to_reject = set()
    for word, prons in phonetic_decoding_lexicon.items():
        for pron in prons:
            if not any(phone in silence_phones for phone in pron.split()):
                continue
            candidate = (word, pron)
            # A candidate without stats is reported with a soft-count of 0.
            soft_count = stats.pop(candidate, 0)
            to_reject.add(candidate)
            print('WARNING: removing the candidate pronunciation from phonetic-decoding: {0}: '
                  '"{1}" whose soft-count from lattice-alignment is {2}, cause it contains at'
                  ' least one silence phone.'.format(word, pron, soft_count), file=sys.stderr)

    for word, pron in to_reject:
        phonetic_decoding_lexicon[word].remove(pron)
    return phonetic_decoding_lexicon, stats

def ComputePriorCounts(args, counts, ref_lexicon, g2p_lexicon, phonetic_decoding_lexicon):
    """Compute, for every word in ``counts``, the prior counts per source.

    The prior mean of a source that has no entry for the word is set to
    zero, the remaining means are re-normalized to sum to one, and the result
    is scaled by ``args.prior_counts_tot``.  A word found in no lexicon gets
    all-zero prior counts and a warning on stderr.

    Returns:
        defaultdict(list) mapping word to [ref, g2p, phonetic-decoding]
        prior counts.
    """
    sources = (ref_lexicon, g2p_lexicon, phonetic_decoding_lexicon)
    prior_counts = defaultdict(list)
    for word in counts:
        means = [args.prior_mean[i] if word in sources[i] else 0
                 for i in range(3)]
        total = sum(means)
        if total == 0:
            # No source knows this word: keep the all-zero means.
            print('WARNING: word {} appears in train_counts but not in any lexicon.'.format(word), file=sys.stderr)
        else:
            means = [float(mean) / total for mean in means]
        prior_counts[word] = [mean * args.prior_counts_tot for mean in means]
    return prior_counts

def ComputePosteriors(args, stats, ref_lexicon, g2p_lexicon, phonetic_decoding_lexicon, prior_counts):
    """Compute the posterior of every candidate pronunciation of every word.

    Each candidate's observed soft count (0 when absent from ``stats``) is
    augmented by its source's prior count for the word, divided evenly among
    that source's candidates for the word; the augmented counts are then
    normalized per word.  One "<word> <source> <posterior> <pron>" line per
    candidate is written to ``args.pron_posteriors_handle`` and a short
    summary goes to stderr.

    Returns:
        defaultdict(list) mapping word to (pron, posterior) pairs sorted by
        ascending posterior.
    """
    sources = (ref_lexicon, g2p_lexicon, phonetic_decoding_lexicon)

    # word -> list of (pron, augmented soft count), later normalized in place.
    posteriors = defaultdict(list)
    for source_index, source in enumerate(sources):
        for word, prons in source.items():
            for pron in prons:
                # Prior share of this candidate plus its observed soft count.
                prior_share = float(prior_counts[word][source_index]) / len(source[word])
                posteriors[word].append((pron, prior_share + stats.get((word, pron), 0)))

    num_prons_from_ref, num_prons_from_g2p, num_prons_from_phonetic_decoding = [
        sum(len(source[word]) for word in source) for source in sources]
    print ("---------------------------------------------------------------------------------------------------", file=sys.stderr)
    print ('Total num. words is {}:'.format(len(posteriors)), file=sys.stderr)
    print ('{0} candidate prons came from the reference lexicon; {1} came from G2P;{2} came from'
           'phonetic_decoding'.format(num_prons_from_ref, num_prons_from_g2p, num_prons_from_phonetic_decoding), file=sys.stderr)
    print ("---------------------------------------------------------------------------------------------------", file=sys.stderr)

    # Sum of the augmented soft counts of each word, computed for all words
    # before any entry is rewritten.
    count_sum = defaultdict(float)
    for word, candidates in posteriors.items():
        count_sum[word] = sum([count for _, count in candidates])

    for word, candidates in posteriors.items():
        normalized = []
        for pron, count in candidates:
            post = float(count) / count_sum[word]
            normalized.append((pron, post))
            # G2P takes precedence over phonetic decoding; anything found in
            # neither of them is labelled as a reference pron.
            if word in g2p_lexicon and pron in g2p_lexicon[word]:
                source = 'G'
            elif word in phonetic_decoding_lexicon and pron in phonetic_decoding_lexicon[word]:
                source = 'P'
            else:
                source = 'R'
            print(word, source, "%3.2f" % post, pron, file=args.pron_posteriors_handle)
        # Replace the list contents in place, sorted by ascending posterior.
        candidates[:] = sorted(normalized, key=lambda pair: pair[1])
    return posteriors

def SelectPronsBayesian(args, counts, posteriors, ref_lexicon, g2p_lexicon, phonetic_decoding_lexicon):
    """Select the pronunciations of each word by decreasing posterior.

    For every word, candidates are popped off the END of its list in
    ``posteriors`` (so the lists are expected to be sorted by ascending
    posterior, as ComputePosteriors leaves them) and added to the learned
    lexicon until either the variant-count limit or the probability-mass
    limit is reached.  A second pass keeps popping and accepts only
    reference prons until ``args.variants_prob_mass_ref`` is reached.
    The lists in ``posteriors`` are consumed in the process, and a summary
    is printed to stderr.

    Returns:
        defaultdict(set) mapping word to the set of selected pronunciations.
    """
    # Counters of selected prons per source, used only for the summary.
    reference_selected = 0
    g2p_selected = 0
    phonetic_decoding_selected = 0
    learned_lexicon = defaultdict(set)

    for word, entry in posteriors.items():
        num_variants = 0
        post_tot = 0.0
        # Limits for words outside the reference lexicon; overridden below
        # for reference words.
        variants_counts = args.variants_counts
        variants_prob_mass = args.variants_prob_mass
        if word in ref_lexicon:
            # the variants count of the current word's prons in the ref lexicon.
            variants_counts_ref = len(ref_lexicon[word])
            # For words who don't appear in acoustic training data at all, we simply accept all ref prons.
            # For words in ref. vocab that do appear, the max num. variants is
            # 1.5 times the number of reference variants, rounded up.
            if counts.get(word, 0) > 0:
                variants_counts = math.ceil(1.5 * variants_counts_ref)
            else:
                variants_counts = variants_counts_ref
                variants_prob_mass = 1.0
        last_post = 0.0
        # First pass: accept candidates from any source.
        while ((num_variants < variants_counts and post_tot < variants_prob_mass)
               or (len(entry) > 0 and entry[-1][1] == last_post)): # this condition
               # means the posterior of the current pron is the same as the one we just included,
               # so tied candidates are accepted even beyond the limits.
            try:
                pron, post = entry.pop()
                last_post = post
            except IndexError:
                # No candidates left for this word.
                break
            post_tot += post
            learned_lexicon[word].add(pron)
            num_variants += 1
            # Attribute the pron to a source; the reference lexicon takes
            # precedence over G2P, and phonetic decoding is the fallback.
            if word in ref_lexicon and pron in ref_lexicon[word]:
                reference_selected += 1
            elif word in g2p_lexicon and pron in g2p_lexicon[word]:
                g2p_selected += 1
            else:
                phonetic_decoding_selected += 1

        # Second pass: keep popping, but accept reference prons only;
        # non-reference candidates popped here are discarded.
        while (num_variants < variants_counts and post_tot < args.variants_prob_mass_ref):
            try:
                pron, post = entry.pop()
            except IndexError:
                break
            if word in ref_lexicon and pron in ref_lexicon[word]:
                post_tot += post
                learned_lexicon[word].add(pron)
                num_variants += 1
                reference_selected += 1

    num_prons_tot = reference_selected + g2p_selected + phonetic_decoding_selected
    print('---------------------------------------------------------------------------------------------------', file=sys.stderr)
    print ('Num. words in the learned lexicon: {0} num. selected prons: {1}'.format(len(learned_lexicon), num_prons_tot), file=sys.stderr)
    print ('{0} selected prons came from reference candidate prons; {1} came from G2P candidate prons;'
           '{2} came from phonetic-decoding candidate prons.'.format(reference_selected, g2p_selected, phonetic_decoding_selected), file=sys.stderr) 
    return learned_lexicon

def WriteEditsAndSummary(args, learned_lexicon, ref_lexicon, phonetic_decoding_lexicon, g2p_lexicon, counts, stats):
    """Write the reference-lexicon edits file and print a selection summary.

    All four lexicons are indexed as lexicon[word] -> collection of prons.

    Args:
        args: object whose ``ref_lexicon_edits_handle`` attribute is a writable
            text handle receiving the edits report.
        learned_lexicon: the selected pronunciations per word.
        ref_lexicon: the reference pronunciations per word.
        phonetic_decoding_lexicon: phonetic-decoding candidates per word.
        g2p_lexicon: G2P candidates per word.
        counts: word -> count in the acoustic training data; a missing word
            is treated as count 0.
        stats: (word, pron) -> soft count from lattice alignment; a missing
            pair is reported as 0.

    The summary is printed to stderr.
    """
    threshold = 3
    # "words" contains four bins into which each learned word is classified,
    # according to whether its count > threshold and whether it is an OOV
    # w.r.t. the reference lexicon:
    #   0: in-vocab, frequent     1: in-vocab, infrequent
    #   2: OOV, frequent          3: OOV, infrequent
    # Inside a bin the key is the string of source flags (see below).
    words = [defaultdict(set) for i in range(4)]

    src = {}
    edits = args.ref_lexicon_edits_handle
    print("# Note: This file contains pronunciation info for words who have candidate "
          "prons from G2P/phonetic-decoding accepted in the learned lexicon, "
          "sorted by their counts in acoustic training data.", file=edits)
    print("# 1st Col: source of the candidate pron: G(2P) / P(hone-decoding) / R(eference).",
          file=edits)
    print("# 2nd Col: accepted or not in the learned lexicon (Y/N).", file=edits)
    print("# 3rd Col: soft counts from lattice-alignment (not augmented by prior-counts).",
          file=edits)
    print("# 4th Col: the pronunciation candidate.", file=edits)

    # words which are to be printed into the edits file.
    words_to_edit = []
    # Number of learned words absent from the reference lexicon. Counted
    # directly: len(learned) - len(ref) is wrong whenever some reference words
    # did not make it into the learned lexicon.
    num_oovs = 0
    for word in learned_lexicon:
        count = counts.get(word, 0)
        # "flags" holds three binary indicators telling where this word's
        # selected prons come from: [phonetic-decoding, reference, G2P].
        flags = ['0', '0', '0']
        for pron in learned_lexicon[word]:
            # A pron present in several sources keeps the last label assigned
            # here, i.e. G takes precedence over R, which takes precedence over P.
            if word in phonetic_decoding_lexicon and pron in phonetic_decoding_lexicon[word]:
                flags[0] = '1'
                src[(word, pron)] = 'P'
            if word in ref_lexicon and pron in ref_lexicon[word]:
                flags[1] = '1'
                src[(word, pron)] = 'R'
            if word in g2p_lexicon and pron in g2p_lexicon[word]:
                flags[2] = '1'
                src[(word, pron)] = 'G'
        if word in ref_lexicon:
            all_ref_prons_accepted = all(pron in learned_lexicon[word]
                                         for pron in ref_lexicon[word])
            if not all_ref_prons_accepted or flags[0] == '1' or flags[2] == '1':
                # Use the defaulted count: indexing counts[word] directly would
                # raise KeyError for a word missing from the counts file.
                words_to_edit.append((word, count))
            words[0 if count > threshold else 1][''.join(flags)].add(word)
        else:
            num_oovs += 1
            # OOVs cannot have reference prons, so only two flags are used.
            words[2 if count > threshold else 3][flags[0] + flags[2]].add(word)

    words_to_edit_sorted = sorted(words_to_edit, key=lambda entry: entry[1], reverse=True)
    for word, count in words_to_edit_sorted:
        print("------------", word, "%2.1f" % count, "--------------", file=edits)
        for pron in learned_lexicon[word]:
            print(src[(word, pron)], ' | Y | ', "%2.1f | " % stats.get((word, pron), 0), pron,
                  file=edits)
        for pron in ref_lexicon[word]:
            if pron not in learned_lexicon[word]:
                soft_count = stats.get((word, pron), 0)
                print('R  | N |  {:.2f} | {} '.format(soft_count, pron), file=edits)
    print("Here are the words whose reference pron candidates were all declined", words[0]['100'], file=sys.stderr)
    print("-------------------------------------------------Summary------------------------------------------", file=sys.stderr)
    print("In the learned lexicon, out of those", len(ref_lexicon), "words from the vocab of the reference lexicon:", file=sys.stderr)
    print("  For those frequent words whose counts in the training text > ", threshold, ":", file=sys.stderr)
    num_freq_ivs_from_all_sources = len(words[0]['111']) + len(words[0]['110']) + len(words[0]['011'])
    num_freq_ivs_from_g2p_or_phonetic_decoding = len(words[0]['101']) + len(words[0]['001']) + len(words[0]['100'])
    num_freq_ivs_from_ref = len(words[0]['010'])
    num_infreq_ivs_from_all_sources = len(words[1]['111']) + len(words[1]['110']) + len(words[1]['011'])
    num_infreq_ivs_from_g2p_or_phonetic_decoding = len(words[1]['101']) + len(words[1]['001']) + len(words[1]['100'])
    num_infreq_ivs_from_ref = len(words[1]['010'])
    print(' {} words\' selected prons came from the reference lexicon, G2P/phonetic-decoding.'.format(num_freq_ivs_from_all_sources), file=sys.stderr)
    print(' {} words\' selected prons come from G2P/phonetic-decoding-generated.'.format(num_freq_ivs_from_g2p_or_phonetic_decoding), file=sys.stderr)
    print(' {} words\' selected prons came from the reference lexicon only.'.format(num_freq_ivs_from_ref), file=sys.stderr)
    print('  For those words whose counts in the training text <= {}:'.format(threshold), file=sys.stderr)
    print(' {} words\' selected prons came from the reference lexicon, G2P/phonetic-decoding.'.format(num_infreq_ivs_from_all_sources), file=sys.stderr)
    print(' {} words\' selected prons come from G2P/phonetic-decoding-generated.'.format(num_infreq_ivs_from_g2p_or_phonetic_decoding), file=sys.stderr)
    print(' {} words\' selected prons came from the reference lexicon only.'.format(num_infreq_ivs_from_ref), file=sys.stderr)
    print("---------------------------------------------------------------------------------------------------", file=sys.stderr)
    num_freq_oovs_from_both_sources = len(words[2]['11'])
    num_freq_oovs_from_phonetic_decoding = len(words[2]['10'])
    num_freq_oovs_from_g2p = len(words[2]['01'])
    num_infreq_oovs_from_both_sources = len(words[3]['11'])
    num_infreq_oovs_from_phonetic_decoding = len(words[3]['10'])
    num_infreq_oovs_from_g2p = len(words[3]['01'])
    print('  In the learned lexicon, out of those {} OOV words (w.r.t the reference lexicon):'.format(num_oovs), file=sys.stderr)
    print('  For those words whose counts in the training text > {}:'.format(threshold), file=sys.stderr)
    print('    {} words\' selected prons came from G2P and phonetic-decoding.'.format(num_freq_oovs_from_both_sources), file=sys.stderr)
    print('    {} words\' selected prons came from phonetic decoding only.'.format(num_freq_oovs_from_phonetic_decoding), file=sys.stderr)
    print('    {} words\' selected prons came from G2P only.'.format(num_freq_oovs_from_g2p), file=sys.stderr)
    print('  For those words whose counts in the training text <= {}:'.format(threshold), file=sys.stderr)
    print('    {} words\' selected prons came from G2P and phonetic-decoding.'.format(num_infreq_oovs_from_both_sources), file=sys.stderr)
    print('    {} words\' selected prons came from phonetic decoding only.'.format(num_infreq_oovs_from_phonetic_decoding), file=sys.stderr)
    print('    {} words\' selected prons came from G2P only.'.format(num_infreq_oovs_from_g2p), file=sys.stderr)

def WriteLearnedLexiconOov(learned_lexicon, ref_lexicon, file_handle):
    """Write '<word> <pron>' lines for learned words absent from ref_lexicon.

    Words that appear in ref_lexicon are skipped entirely. The handle is
    closed before returning.
    """
    for entry in learned_lexicon.items():
        word = entry[0]
        if word in ref_lexicon:
            continue
        for pron in entry[1]:
            line = '{0} {1}'.format(word, pron)
            print(line, file=file_handle)
    file_handle.close()

def Main():
    """Run the Bayesian pronunciation-selection pipeline end to end."""
    args = GetArgs()

    # Inputs: word counts first (the lexicon reader takes them as an
    # argument), then the three candidate lexicons and the pron statistics.
    word_counts = ReadWordCounts(args.word_counts_file_handle)
    reference = ReadLexicon(args, args.ref_lexicon_handle, word_counts)
    g2p = ReadLexicon(args, args.g2p_lexicon_handle, word_counts)
    phonetic = ReadLexicon(args, args.phonetic_decoding_lexicon_handle, word_counts)
    pron_stats = ReadPronStats(args.pron_stats_file_handle)
    phonetic, pron_stats = FilterPhoneticDecodingLexicon(args, phonetic, pron_stats)

    # Priors -> posteriors -> selected pronunciations.
    priors = ComputePriorCounts(args, word_counts, reference, g2p, phonetic)
    posteriors = ComputePosteriors(args, pron_stats, reference, g2p, phonetic, priors)
    learned = SelectPronsBayesian(args, word_counts, posteriors, reference, g2p, phonetic)

    # Outputs: learned prons of words outside the reference vocabulary go to
    # learned_lexicon_oov; the edits go to ref_lexicon_edits and the summary
    # to stderr.
    WriteLearnedLexiconOov(learned, reference, args.learned_lexicon_oov_handle)
    WriteEditsAndSummary(args, learned, reference, phonetic, g2p, word_counts, pron_stats)


if __name__ == "__main__":
    Main()


================================================
FILE: egs/steps/dict/select_prons_greedy.py
================================================
#!/usr/bin/env python

# Copyright 2018  Xiaohui Zhang
# Apache 2.0.

from __future__ import print_function
from collections import defaultdict
import argparse
import sys
import math

def GetArgs():
    """Parse the command line and return the validated argument namespace.

    The full command line is echoed to stderr for logging. The parsed
    namespace is passed through CheckArgs() before being returned.
    """
    # Adjacent string literals are concatenated verbatim, so every fragment
    # that is followed by another one ends with an explicit space.
    parser = argparse.ArgumentParser(
        description = "Use a greedy framework to select pronunciation candidates "
        "from three sources: a reference lexicon, G2P lexicon and phonetic-decoding "
        "(PD) lexicon. Basically, this script implements the Alg. 1 in the paper: "
        "Acoustic data-driven lexicon learning based on a greedy pronunciation "
        "selection framework, by X. Zhang, V. Manohar, D. Povey and S. Khudanpur, "
        "Interspeech 2017. The inputs are an arc-stats file, containing "
        "acoustic evidence (tau_{uwb} in the paper) and three source lexicons "
        "(phonetic-decoding(PD)/G2P/ref). The output is the learned lexicon for "
        "all words in the arc_stats (acoustic evidence) file.",
        epilog = "See steps/dict/learn_lexicon_greedy.sh for example.")
    parser.add_argument("--alpha", type = str, default = "0,0,0",
                        help = "Scaling factors for the likelihood reduction threshold "
                        "of three pronunciation candidate sources: phonetic-decoding (PD), "
                        "G2P and reference. The valid range of each dimension is [0, 1], and "
                        "a large value means we prune pronunciations from this source more "
                        "aggressively. Setting a dimension to zero means we never want to remove "
                        "pronunciations from that source. See Section 4.3 in the paper for details.")
    parser.add_argument("--beta", type = str, default = "0,0,0",
                        help = "Smoothing factors for the likelihood reduction term "
                        "of three pronunciation candidate sources: phonetic-decoding (PD), "
                        "G2P and reference. The valid range of each dimension is [0, 100], and "
                        "a large value means we prune pronunciations from this source more "
                        "aggressively. See Section 4.3 in the paper for details.")
    parser.add_argument("--delta", type = float, default = 0.000000001,
                        help = "Floor value of the pronunciation posterior statistics. "
                        "The valid range is (0, 0.01). "
                        "See Section 3 in the paper for details.")
    parser.add_argument("silence_phones_file", metavar = "<silphone-file>", type = str,
                        help = "File containing a list of silence phones.")
    parser.add_argument("arc_stats_file", metavar = "<arc-stats-file>", type = str,
                        help = "File containing word-pronunciation statistics obtained from lattices; "
                        "each line must be <word> <utt-id> <start-frame> <count> <phones>")
    parser.add_argument("word_counts_file", metavar = "<counts-file>", type = str,
                        help = "File containing word counts in acoustic training data; "
                        "each line must be <word> <count>.")
    parser.add_argument("ref_lexicon", metavar = "<reference-lexicon>", type = str,
                        help = "The reference lexicon (most probably hand-derived). "
                        "Each line must be <word> <phones>")
    parser.add_argument("g2p_lexicon", metavar = "<g2p-expanded-lexicon>", type = str,
                        help = "Candidate pronunciations from G2P results. "
                        "Each line must be <word> <phones>")
    parser.add_argument("pd_lexicon", metavar = "<phonetic-decoding-lexicon>", type = str,
                        help = "Candidate pronunciations from phonetic decoding results. "
                        "Each line must be <word> <phones>")
    parser.add_argument("learned_lexicon", metavar = "<learned-lexicon>", type = str,
                        help = "Learned lexicon.")

    print (' '.join(sys.argv), file=sys.stderr)

    args = parser.parse_args()
    args = CheckArgs(args)

    return args

def _ParseFactorTriple(name, text, upper):
    """Parse 'a,b,c' into three floats, each required to lie in [0, upper]."""
    fields = text.strip().split(',')
    if len(fields) != 3:
        raise Exception('Invalid {0} {1}: expected three comma-separated '
                        'values.'.format(name, text))
    values = [float(field) for field in fields]
    for value in values:
        if value < 0 or value > upper:
            raise Exception('{0} {1} is invalid, it must be within '
                            '[0, {2}].'.format(name, value, upper))
    return values


def CheckArgs(args):
    """Validate --alpha/--beta/--delta and open the input/output files.

    On success ``args.alpha`` and ``args.beta`` are replaced by lists of three
    floats ordered [PD, G2P, ref], and a ``*_handle`` attribute is attached for
    every file argument. An arc-stats path of "-" selects stdin.

    All numeric validation happens before any file is opened, so invalid
    options can no longer truncate an existing learned-lexicon file.

    Raises:
        Exception: if alpha/beta are not three values within range, or delta
            is out of range.
        ValueError: if an alpha/beta component is not a number.
    """
    alpha = _ParseFactorTriple('alpha', args.alpha, 1)
    # The absolute likelihood loss (search for loss_abs) is supposed to be positive.
    # But it could be negative near zero because of numerical precision limit.
    # In this case, even if alpha is set to be zero, which means we never want to
    # remove pronunciation from that source, the quality score (search for q_b)
    # could still be negative, which means this pron could be potentially removed.
    # To prevent this, we set alpha as a negative value near zero to ensure
    # q_b is always positive.
    args.alpha = [-1e-3 if value == 0 else value for value in alpha]
    print("[alpha_{pd}, alpha_{g2p}, alpha_{ref}] is: ", args.alpha)

    args.beta = _ParseFactorTriple('beta', args.beta, 100)
    print("[beta_{pd}, beta_{g2p}, beta_{ref}] is: ", args.beta)

    # NOTE(review): the enforced upper bound is 0.1 while the message and the
    # --delta help text both say 0.01. The bound is left untouched so that
    # existing invocations keep working; confirm which value is intended.
    if args.delta <= 0 or args.delta > 0.1:
        raise Exception('delta {0} is invalid, it must be within '
                        '(0, 0.01).'.format(args.delta))
    print("delta is: ", args.delta)

    args.silence_phones_file_handle = open(args.silence_phones_file)
    if args.arc_stats_file == "-":
        args.arc_stats_file_handle = sys.stdin
    else:
        args.arc_stats_file_handle = open(args.arc_stats_file)
    args.word_counts_file_handle = open(args.word_counts_file)
    args.ref_lexicon_handle = open(args.ref_lexicon)
    args.g2p_lexicon_handle = open(args.g2p_lexicon)
    args.pd_lexicon_handle = open(args.pd_lexicon)
    # Opened last: "w" truncates the target, so do it only once every input
    # has been opened successfully.
    args.learned_lexicon_handle = open(args.learned_lexicon, "w")

    return args

def ReadArcStats(arc_stats_file_handle):
    """Read per-example pronunciation soft counts from an arc-stats file.

    Each non-blank line must be
        <word> <utt-id> <start-frame> <count> <phone> [<phone> ...]

    Returns:
        (stats, stats_summed) where
        stats[word][(utt, start_frame)][phones] is the soft count read for
        that example (a later duplicate line overwrites an earlier one), and
        stats_summed[(word, phones)] is the sum of the counts over all lines.
        ``phones`` is the space-joined phone sequence.

    Raises:
        Exception: if a non-blank line has fewer than five fields.
    """
    stats = defaultdict(lambda : defaultdict(dict))
    stats_summed = defaultdict(float)
    for line in arc_stats_file_handle:
        splits = line.split()

        if not splits:
            continue

        if len(splits) < 5:
            # Report the handle's name when it has one; the original message
            # referenced an undefined variable and raised NameError instead.
            source = getattr(arc_stats_file_handle, 'name', '<arc-stats-file>')
            raise Exception('Invalid format of line ' + line
                                + ' in ' + str(source))
        word = splits[0]
        utt = splits[1]
        start_frame = int(splits[2])
        count = float(splits[3])
        phones = ' '.join(splits[4:])
        stats[word][(utt, start_frame)][phones] = count
        stats_summed[(word, phones)] += count
    return stats, stats_summed

def ReadWordCounts(word_counts_file_handle):
    """Read '<word> <count>' lines into a dict mapping word -> int count.

    Blank lines are skipped, matching the other readers in this script.
    Fields after the count are ignored. A later line for the same word
    overwrites an earlier one.

    Raises:
        Exception: if a non-blank line has fewer than two fields.
        ValueError: if the count field is not an integer.
    """
    counts = {}
    for line in word_counts_file_handle:
        splits = line.split()
        if not splits:
            continue
        if len(splits) < 2:
            raise Exception('Invalid format of line ' + line
                                + ' in counts file.')
        counts[splits[0]] = int(splits[1])
    return counts

def ReadLexicon(args, lexicon_file_handle, counts):
    """Load '<word> <phones...>' lines into a defaultdict(set) of prons.

    Only words present in ``counts`` are kept: pronunciations are learned
    solely for words that have acoustic examples. Blank lines are ignored;
    a line holding a single field raises an Exception. ``args`` is unused.
    """
    entries = defaultdict(set)
    for raw in lexicon_file_handle.readlines():
        fields = raw.strip().split()
        if not fields:
            continue
        if len(fields) == 1:
            raise Exception('Invalid format of line ' + raw
                                + ' in lexicon file.')
        head, tail = fields[0], fields[1:]
        if head in counts:
            entries[head].add(' '.join(tail))
    return entries

def FilterPhoneticDecodingLexicon(args, pd_lexicon):
    """Drop phonetic-decoding candidates that contain a silence phone.

    Silence phones are read one per line from
    ``args.silence_phones_file_handle``. ``pd_lexicon`` (word -> set of prons)
    is modified in place and also returned; a word whose candidates are all
    rejected keeps an empty set.
    """
    silphones = set()
    for line in args.silence_phones_file_handle:
        silphones.add(line.strip())
    # Collect first and remove afterwards: the sets must not change size
    # while they are being iterated. items() works on Python 2 and 3 alike,
    # unlike iteritems().
    rejected_candidates = [
        (word, pron)
        for word, prons in pd_lexicon.items()
        for pron in prons
        if any(phone in silphones for phone in pron.split())
    ]
    for word, pron in rejected_candidates:
        pd_lexicon[word].remove(pron)
    return pd_lexicon

# One iteration of Expectation-Maximization computation (Eq. 3-4 in the paper).
def OneEMIter(args, word, stats, prons, pron_probs, debug=False):
    """Run a single EM update of the pronunciation probabilities of ``word``.

    ``pron_probs`` is first normalised to sum to one *in place* (the caller's
    list is modified). For every acoustic example of the word, the soft count
    of each candidate (floored at ``args.delta``) is weighted by its current
    probability; the per-example posteriors are accumulated and averaged over
    the examples.

    Returns:
        (new_pron_probs, avg_log_like): a new list of updated probabilities
        and the log-likelihood averaged over the word's examples.
    """
    candidates = prons[word]
    examples = stats[word]
    num_prons = len(candidates)
    posterior_acc = [0.0] * num_prons

    # In-place normalisation of the incoming probabilities.
    total = sum(pron_probs)
    for idx, value in enumerate(pron_probs):
        pron_probs[idx] = value / total

    log_like = 0.0
    for (utt, start_frame), evidence in examples.items():
        floored = []
        for phones in candidates:
            observed = evidence.get(phones, 0)
            floored.append(args.delta if observed < args.delta else observed)
        joint = [count * weight for count, weight in zip(floored, pron_probs)]
        norm = sum(joint)
        for idx in range(num_prons):
            posterior_acc[idx] += joint[idx] / norm
        log_like += math.log(norm)

    scale = 1.0 / float(len(examples))
    pron_probs = [scale * acc for acc in posterior_acc]
    log_like = scale * log_like
    if debug:
        print("Log_like of the word: ", log_like, "pron probs: ", pron_probs)
    return pron_probs, log_like

def SelectPronsGreedy(args, stats, counts, ref_lexicon, g2p_lexicon, pd_lexicon, dianostic_info=False):
    """Greedily prune pronunciation candidates (Alg. 1 in the paper).

    For every word that has acoustic evidence in ``stats``, the candidates
    from the three lexicons are pooled. The candidate whose removal has the
    most negative quality score is removed, one at a time, until no candidate
    has a negative score or a single candidate is left.

    Args:
        args: needs ``alpha`` and ``beta`` (three floats each, ordered
            [PD, G2P, ref]) and ``delta`` (soft-count floor).
        stats: stats[word][(utt, start_frame)][phones] -> soft count.
        counts: unused here.
        ref_lexicon, g2p_lexicon, pd_lexicon: word -> collection of prons.
        dianostic_info: when True, print progress to stdout and
            parameter-tuning warnings to stderr.

    Returns:
        defaultdict(set) mapping each processed word to its surviving prons.
        Words without an entry in ``stats`` are not included.
    """
    prons = defaultdict(list) # Put all possible prons from three source lexicons into this dictionary
    src = {} # Source of each (word, pron) pair: 'P' = phonetic-decoding, 'G' = G2P, 'R' = reference
    learned_lexicon = defaultdict(set) # Put all selected prons in this dictionary
    for lexicon in ref_lexicon, g2p_lexicon, pd_lexicon:
        for word in lexicon:
            for pron in lexicon[word]:
                prons[word].append(pron)
    for word in prons:
        for pron in prons[word]:
            # Later assignments win, so the precedence is R over G over P.
            if word in pd_lexicon and pron in pd_lexicon[word]:
                src[(word, pron)] = 'P'
            if word in g2p_lexicon and pron in g2p_lexicon[word]:
                src[(word, pron)] = 'G'
            if word in ref_lexicon and pron in ref_lexicon[word]:
                src[(word, pron)] = 'R'

    # Position of each source inside args.alpha / args.beta.
    source_index = {'P': 0, 'G': 1, 'R': 2}

    for word in prons:
        if word not in stats:
            continue
        n = len(prons[word])
        pron_probs = [1/float(n) for i in range(n)]
        if dianostic_info:
            print("pronunciations of word '{}': {}".format(word, prons[word]))
        active_indexes = set(range(n))
        num_examples = float(len(stats[word])) # M_w in the paper

        deleted_prons = [] # indexes of prons to be deleted
        soft_counts_normalized = []
        while len(active_indexes) > 1:
            log_like = 1.0
            log_like_last = -1.0
            num_iters = 0
            while abs(log_like - log_like_last) > 1e-7:
                num_iters += 1
                log_like_last = log_like
                pron_probs, log_like = OneEMIter(args, word, stats, prons, pron_probs, False)
                if log_like_last == 1.0 and len(soft_counts_normalized) == 0: # the first iteration
                    soft_counts_normalized = pron_probs
                    if dianostic_info:
                        print("Avg.(over all egs) soft counts: {}".format(soft_counts_normalized))
            if dianostic_info:
                print("\n Log_like after {} iters of EM: {}, estimated pron_probs: {} \n".format(
                        num_iters, log_like, pron_probs))
            candidates_to_delete = []
            # (smoothed loss, absolute loss, threshold) of every deletion
            # candidate, so that the diagnostics below describe the pron that
            # is actually removed and not the last one examined.
            candidate_info = {}

            for i in active_indexes:
                pron_probs_mod = [p for p in pron_probs]
                pron_probs_mod[i] = 0.0
                for j in range(len(pron_probs_mod)):
                    if j in active_indexes and j != i:
                        pron_probs_mod[j] += 0.01
                total_mod = sum(pron_probs_mod)
                pron_probs_mod = [s / total_mod for s in pron_probs_mod]
                log_like2 = 1.0
                log_like2_last = -1.0
                num_iters2 = 0
                # Running EM until convergence
                while abs(log_like2 - log_like2_last) > 0.001 :
                    num_iters2 += 1
                    log_like2_last = log_like2
                    pron_probs_mod, log_like2 = OneEMIter(args, word, stats,
                                                          prons, pron_probs_mod, False)

                loss_abs = log_like - log_like2 # absolute likelihood loss before normalization
                # (supposed to be positive, but could be negative near zero because of numerical precision limit).
                log_delta = math.log(args.delta)
                thr = -log_delta
                loss = loss_abs
                source = src[(word, prons[word][i])]
                if dianostic_info:
                    print("\n set the pron_prob of '{}' whose source is {}, to zero results in {}"
                    " loss in avg. log-likelihood; Num. iters until converging:{}. ".format(
                      prons[word][i], source, loss, num_iters2))
                # Compute quality score
                #   q_b = loss_abs * M_w / (M_w + beta_s(b)) + alpha_s(b) * log_delta
                # See Sec. 4.3 and Alg. 1 in the paper.
                k = source_index[source]
                thr *= args.alpha[k]
                loss *= num_examples / (num_examples + args.beta[k])
                if loss - thr < 0: # loss - thr here is just q_b
                    if dianostic_info:
                        print("Smoothed log-like loss {} is smaller than threshold {} so that the quality"
                              "score {} is negative, adding the pron to the list of candidates to delete"
                              ". ".format(loss, thr, loss-thr))
                    candidates_to_delete.append((loss-thr, i))
                    candidate_info[i] = (loss, loss_abs, thr)
            if len(candidates_to_delete) == 0:
                break
            # Lowest quality score first; min() returns the first of equal
            # scores, exactly like taking element 0 of a stable sort.
            deleted_candidate = min(candidates_to_delete, key=lambda candidate: candidate[0])
            deleted_index = deleted_candidate[1]

            active_indexes.remove(deleted_index)
            pron_probs[deleted_index] = 0.0
            for i in range(len(pron_probs)):
                if i in active_indexes:
                    pron_probs[i] += 0.01
            total = sum(pron_probs)
            pron_probs = [s / total for s in pron_probs]
            source = src[(word, prons[word][deleted_index])]
            pron = prons[word][deleted_index]
            soft_count = soft_counts_normalized[deleted_index]
            quality_score = deleted_candidate[0]
            # This part of diagnostic info provides hints to the user on how to adjust the parameters.
            if dianostic_info:
                loss, loss_abs, thr = candidate_info[deleted_index]
                print("removed pron {}, from source {} with quality score {:.5f}".format(
                        pron, source, quality_score))
                # Literal braces are doubled so that str.format() does not
                # treat "{pd}" / "{g2p}" as named replacement fields.
                if (source == 'P' and soft_count > 0.7 and len(stats[word]) > 5):
                    print("WARNING: alpha_{{pd}} or beta_{{pd}} may be too large!"
                          "    For the word '{}' whose count is {}, the candidate "
                          "    pronunciation from phonetic decoding '{}' with normalized "
                          "    soft count {} (out of 1) is rejected. It shouldn't have been"
                          "    rejected if alpha_{{pd}} is smaller than {}".format(
                            word, len(stats[word]), pron, soft_count, -loss / log_delta),
                          file=sys.stderr)
                    if loss_abs > thr:
                        print("    or beta_{{pd}} is smaller than {}".format(
                                (loss_abs / thr - 1) * len(stats[word])), file=sys.stderr)
                if (source == 'G' and soft_count > 0.7 and len(stats[word]) > 5):
                    print("WARNING: alpha_{{g2p}} or beta_{{g2p}} may be too large!"
                          "    For the word '{}' whose count is {}, the candidate "
                          "    pronunciation from G2P '{}' with normalized "
                          "    soft count {} (out of 1) is rejected. It shouldn't have been"
                          "    rejected if alpha_{{g2p}} is smaller than {} ".format(
                            word, len(stats[word]), pron, soft_count, -loss / log_delta),
                          file=sys.stderr)
                    if loss_abs > thr:
                        print("    or beta_{{g2p}} is smaller than {}.".format((
                                loss_abs / thr - 1) * len(stats[word])), file=sys.stderr)
            deleted_prons.append(deleted_index)
        for i in range(len(prons[word])):
            if i not in deleted_prons:
                learned_lexicon[word].add(prons[word][i])

    return learned_lexicon

def WriteLearnedLexicon(learned_lexicon, file_handle):
    """Write every learned pronunciation as a '<word> <pron>' line.

    The handle is closed before returning.
    """
    # items() instead of iteritems(): the latter does not exist on Python 3.
    for word, prons in learned_lexicon.items():
        for pron in prons:
            print('{0} {1}'.format(word, pron), file=file_handle)
    file_handle.close()

def Main():
    """Run the greedy pronunciation-selection pipeline end to end."""
    args = GetArgs()

    # Inputs: word counts first (the lexicon reader takes them as an
    # argument), then the three candidate lexicons and the per-example
    # acoustic evidence.
    word_counts = ReadWordCounts(args.word_counts_file_handle)
    reference = ReadLexicon(args, args.ref_lexicon_handle, word_counts)
    g2p = ReadLexicon(args, args.g2p_lexicon_handle, word_counts)
    phonetic = ReadLexicon(args, args.pd_lexicon_handle, word_counts)
    arc_stats, _summed = ReadArcStats(args.arc_stats_file_handle)
    phonetic = FilterPhoneticDecodingLexicon(args, phonetic)

    # Prune the pooled candidates, then write every surviving pronunciation
    # to the learned lexicon.
    learned = SelectPronsGreedy(args, arc_stats, word_counts, reference, g2p, phonetic)
    WriteLearnedLexicon(learned, args.learned_lexicon_handle)


if __name__ == "__main__":
    Main()


================================================
FILE: egs/steps/dict/train_g2p.sh
================================================
#!/usr/bin/env bash
# Copyright 2014  Johns Hopkins University (Author: Yenda Trmal)
# Copyright 2016  Xiaohui Zhang
# Apache 2.0

# This script trains a Sequitur G2P model from a lexicon. The model is
# trained for $iters iterations, each one starting from the previous
# iteration's model; the last model is linked as g2p.model.final and then
# tested on the training lexicon.

# Begin configuration section.
iters=5
stage=0
encoding='utf-8'
only_words=true
cmd=run.pl
# a list of silence phones, like data/local/dict/silence_phones.txt
silence_phones=
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. utils/parse_options.sh || exit 1;

set -u
set -e

if [ $# != 2 ]; then
   echo "Usage: $0 [options] <lexicon-in> <work-dir>"
   echo "    where <lexicon-in> is the training lexicon (one pronunciation per "
   echo "    word per line, with lines like 'hello h uh l ow') and"
   echo "    <work-dir> is directory where the models will be stored"
   echo "e.g.: train_g2p.sh --silence-phones data/local/dict/silence_phones.txt data/local/dict/lexicon.txt exp/g2p/"
   echo ""
   echo "main options (for others, see top of script file)"
   echo "  --iters <int>                                    # How many iterations. Relates to N-ngram order"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   echo "  --silence-phones <silphones-list>                # e.g. data/local/dict/silence_phones.txt."
   echo "                                                   # A list of silence phones, one or more per line"
   echo "                                                   # Relates to  --only-words option"
   echo "  --only-words (true|false)    (default: true)     # If true, exclude silence words, i.e."
   echo "                                                   # words with 1 phone which is a silence."
   exit 1;
fi

lexicon=$1
wdir=$2


mkdir -p $wdir/log

[ ! -f $lexicon ] && echo "$0: Training lexicon does not exist." && exit 1

# Optionally remove words that are mapped to a single silence phone from the lexicon.
if $only_words && [ -n "$silence_phones" ]; then
  # The path is quoted so that a file name containing spaces is still passed
  # to awk as a single -v assignment.
  awk -v s="$silence_phones" \
    'BEGIN{while((getline<s)>0) {for(i=1;i<=NF;i++) sil[$i]=1;}}
    {if (!(NF == 2 && $2 in sil)) print;}' $lexicon > $wdir/lexicon_onlywords.txt
  lexicon=$wdir/lexicon_onlywords.txt
fi

if ! g2p=`which g2p.py` ; then
  echo "Sequitur was not found !"
  echo "Go to $KALDI_ROOT/tools and execute extras/install_sequitur.sh"
  exit 1
fi

echo "Training the G2P model (iter 0)"

if [ $stage -le 0 ]; then
  $cmd $wdir/log/g2p.0.log \
    g2p.py -S --encoding $encoding --train $lexicon --devel 5% --write-model $wdir/g2p.model.0
fi

# Each further iteration ramps up the model written by the previous one.
for i in `seq 0 $((iters - 2))`; do

  echo "Training the G2P model (iter $((i + 1)) )"

  if [ $stage -le $i ]; then
    $cmd $wdir/log/g2p.$((i + 1)).log \
      g2p.py -S --encoding $encoding --model $wdir/g2p.model.$i --ramp-up --train $lexicon --devel 5% --write-model $wdir/g2p.model.$((i + 1))
  fi

done

! (set -e; cd $wdir; ln -sf g2p.model.$((iters - 1)) g2p.model.final ) && echo "Problem finalizing training... " && exit 1

# The loop above ends with i = iters - 2, so the test stage is number $iters.
# Using $iters directly also works for --iters 1, where the loop body never
# runs and $i would be unbound under 'set -u'.
if [ $stage -le $iters ]; then
  echo "Running test..."
  $cmd $wdir/log/test.log \
    g2p.py --encoding $encoding --model $wdir/g2p.model.final --test $lexicon
fi


================================================
FILE: egs/steps/dict/train_g2p_phonetisaurus.sh
================================================
#!/usr/bin/env bash

# Copyright 2017  Intellisist, Inc. (Author: Navneeth K)
#           2017  Xiaohui Zhang
#           2018  Ruizhe Huang
# Apache License 2.0

# This script trains a g2p model using Phonetisaurus.

stage=0
encoding='utf-8'
only_words=true
silence_phones=

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. utils/parse_options.sh || exit 1;

set -u
set -e

if [ $# != 2 ]; then
  echo "Usage: $0 [options] <lexicon-in> <work-dir>"
  echo "    where <lexicon-in> is the training lexicon (one pronunciation per "
  echo "    word per line, with lines like 'hello h uh l ow') and"
  echo "    <work-dir> is directory where the models will be stored"
  echo "e.g.: $0 --silence-phones data/local/dict/silence_phones.txt data/local/dict/lexicon.txt exp/g2p/"
  echo ""
  echo "main options (for others, see top of script file)"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --silence-phones <silphones-list>                # e.g. data/local/dict/silence_phones.txt."
  echo "                                                   # A list of silence phones, one or more per line"
  echo "                                                   # Relates to  --only-words option"
  echo "  --only-words (true|false)    (default: true)     # If true, exclude silence words, i.e."
  echo "                                                   # words with one or multiple phones which are all silence."
  exit 1;
fi

lexicon=$1
wdir=$2

# A missing lexicon is an error: exit non-zero (a bare 'exit' would return
# the status of the preceding echo, i.e. success).
[ ! -f $lexicon ] && echo "Cannot find $lexicon" && exit 1

# Test the command inside the 'if': with 'set -e' in effect, a failing plain
# assignment from `which` would abort the script before the message below
# could be printed.
if ! isuconv=`which uconv` ; then
  echo "uconv was not found. You must install the icu4c package."
  exit 1;
fi

if ! phonetisaurus=`which phonetisaurus-apply` ; then
  echo "Phonetisarus was not found !"
  echo "Go to $KALDI_ROOT/tools and execute extras/install_phonetisaurus.sh"
  exit 1
fi

mkdir -p $wdir


# For input lexicon, remove pronunciations containing non-utf-8-encodable characters,
# and optionally remove words whose whole pronunciation is listed in the
# silence phones file.
if [ $stage -le 0 ]; then
  if $only_words && [ -n "$silence_phones" ]; then
    awk 'NR==FNR{a[$1] = 1; next} {s=$2;for(i=3;i<=NF;i++) s=s" "$i; if(!(s in a)) print $1" "s}' \
      $silence_phones $lexicon | \
      awk '{printf("%s\t",$1); for (i=2;i<NF;i++){printf("%s ",$i);} printf("%s\n",$NF);}' | \
      uconv -f "$encoding"  -t "$encoding" -x Any-NFC - | awk 'NF > 0'> $wdir/lexicon_tab_separated.txt
  else
    awk '{printf("%s\t",$1); for (i=2;i<NF;i++){printf("%s ",$i);} printf("%s\n",$NF);}' $lexicon | \
      uconv -f "$encoding" -t "$encoding" -x Any-NFC - | awk 'NF > 0'> $wdir/lexicon_tab_separated.txt
  fi
fi

if [ $stage -le 1 ]; then
  # Align lexicon stage. Lexicon is assumed to have first column tab separated
  phonetisaurus-align --input=$wdir/lexicon_tab_separated.txt --ofile=${wdir}/aligned_lexicon.corpus || exit 1;
fi

if [ $stage -le 2 ]; then
  # Convert aligned lexicon to arpa using make_kn_lm.py, a re-implementation of srilm's ngram-count functionality.
  ./utils/lang/make_kn_lm.py -ngram-order 7 -text ${wdir}/aligned_lexicon.corpus -lm ${wdir}/aligned_lexicon.arpa
fi

if [ $stage -le 3 ]; then
  # Convert the arpa file to FST.
  phonetisaurus-arpa2wfst --lm=${wdir}/aligned_lexicon.arpa --ofile=${wdir}/model.fst
fi


================================================
FILE: egs/steps/get_ctm.sh
================================================
#!/usr/bin/env bash
# Copyright Johns Hopkins University (Author: Daniel Povey) 2012.  Apache 2.0.

# This script produces CTM files from a decoding directory that has lattices                                                                         
# present. It does this for a range of language model weights; see also 
# get_ctm_fast.sh which does it for just one LM weight and also supports
# the word insertion penalty, and get_ctm_conf.sh which outputs CTM files
# with confidence scores.


# begin configuration section.
cmd=run.pl
stage=0
frame_shift=0.01
min_lmwt=5    # first LM weight of the LMWT range a CTM is written for
max_lmwt=20   # last LM weight of the LMWT range a CTM is written for
use_segments=true # if we have a segments file, use it to convert
                  # the segments to be relative to the original files.
print_silence=false   # passed to nbest-to-ctm as --print-silence
#end configuration section.

echo "$0 $@"  # Print the command line for logging

if [ -f ./path.sh ]; then
  . ./path.sh
fi
. parse_options.sh || exit 1;

# Exactly three positional arguments are required; otherwise print the usage
# text and give up.
if [ $# != 3 ]; then
  printf '%s\n' \
    "Usage: $0 [options] <data-dir> <lang-dir|graph-dir> <decode-dir>" \
    " Options:" \
    "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes." \
    "    --stage (0|1|2)                 # start scoring script from part-way through." \
    "    --use-segments (true|false)     # use segments and reco2file_and_channel files " \
    "                                    # to produce a ctm relative to the original audio" \
    "                                    # files, with channel information (typically needed" \
    "                                    # for NIST scoring)." \
    "    --frame-shift (default=0.01)    # specify this if your lattices have a frame-shift" \
    "                                    # not equal to 0.01 seconds" \
    "e.g.:" \
    "$0 data/train data/lang exp/tri4a/decode/" \
    "See also: steps/get_train_ctm.sh, steps/get_ctm_fast.sh, steps/get_ctm_conf.sh"

  exit 1;
fi

data=$1
lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied.
dir=$3

# The acoustic model is looked up in the parent of the decoding directory.
model=$dir/../final.mdl


# Refuse to continue unless the word table, the model and the first lattice
# archive are all present.
for required in $lang/words.txt $model $dir/lat.1.gz; do
  if [ ! -f $required ]; then
    echo "$0: expecting file $required to exist"
    exit 1;
  fi
done

name=$(basename $data) # e.g. eval2000

mkdir -p $dir/scoring/log

if [ $stage -le 0 ]; then
  # Last element of the pipeline: plain "cat" by default, or convert_ctm.pl
  # when a segments file exists and --use-segments is true.
  filter_cmd=cat
  if [ -f $data/segments ] && $use_segments; then
    reco_map=$data/reco2file_and_channel
    if [ ! -f $reco_map ]; then
      echo "$0: expecting file $reco_map to exist"
      exit 1;
    fi
    filter_cmd="utils/convert_ctm.pl $data/segments $data/reco2file_and_channel"
  fi

  # Space-separated list of all lattice archives (one per decoding job); every
  # name, including the last one, is followed by a single space.
  nj=$(cat $dir/num_jobs)
  lats=""
  for n in $(seq $nj); do
    lats="$lats$dir/lat.$n.gz "
  done

  # Pick the word-alignment step.  It is expanded unquoted further down, so a
  # "|" inside the string reaches $cmd as a separate pipe argument.
  if [ -f $lang/phones/word_boundary.int ]; then
    align_cmd="lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:-"
  elif [ -f $lang/phones/align_lexicon.int ]; then
    align_cmd="lattice-align-words-lexicon $lang/phones/align_lexicon.int $model ark:- ark:- | lattice-1best ark:- ark:-"
  else
    echo "$0: neither $lang/phones/word_boundary.int nor $lang/phones/align_lexicon.int exists: cannot align."
    exit 1;
  fi

  # One job per LM weight: best path -> word alignment -> CTM -> word symbols
  # -> optional conversion to recording-relative times.
  $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/get_ctm.LMWT.log \
    set -o pipefail '&&' mkdir -p $dir/score_LMWT/ '&&' \
    lattice-1best --lm-scale=LMWT "ark:gunzip -c $lats|" ark:- \| \
    $align_cmd \| \
    nbest-to-ctm --frame-shift=$frame_shift --print-silence=$print_silence ark:- - \| \
    utils/int2sym.pl -f 5 $lang/words.txt \| \
    $filter_cmd '>' $dir/score_LMWT/$name.ctm || exit 1;
fi


================================================
FILE: egs/steps/get_ctm_conf_fast.sh
================================================
#!/usr/bin/env bash
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
#           2017  Vimal Manohar
#           2018  Xiaohui Zhang
#           2018  Music Technology Group, Universitat Pompeu Fabra.
# Apache 2.0

# This script produces CTM files with confidence scores
# from a decoding directory that has lattices
# present. It does this for one LM weight and also supports 
# the word insertion penalty.
# This is similar to get_ctm_conf.sh, but gets the CTM at the utterance-level.
# It can be faster than steps/get_ctm_conf.sh --use-segments false as it splits
# the process across many jobs. 

# begin configuration section.
cmd=run.pl
stage=0              # accepted as an option, but not consulted by this script
frame_shift=0.01
lmwt=10              # used as --inv-acoustic-scale for pruning and MBR decoding
wip=0.0              # word insertion penalty given to lattice-add-penalty
print_silence=false  # accepted as an option, but not consulted by this script
#end configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1;

# Four positional arguments are mandatory: the CTM output directory is a
# separate argument from the decoding directory.
if [ $# -ne 4 ]; then
  echo "Usage: $0 [options] <data-dir> <lang-dir|graph-dir> <decode-dir> <ctm-out-dir>"
  echo " Options:"
  echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
  echo "    --stage (0|1|2)                 # start scoring script from part-way through."
  echo "    --frame-shift (default=0.01)    # specify this if your lattices have a frame-shift"
  echo "                                    # not equal to 0.01 seconds"
  echo "    --lmwt (default=10)             # inverse acoustic scale (LM weight)"
  echo "    --wip (default=0.0)             # word insertion penalty"
  echo "e.g.:"
  echo "$0 data/train data/lang exp/tri4a/decode/ exp/tri4a/decode/ctm_conf"
  echo "See also: steps/get_ctm.sh, steps/get_ctm_conf.sh"
  exit 1;
fi

data=$1
lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied.
decode_dir=$3
dir=$4

# Prefer a model stored in the decoding directory itself; otherwise fall back
# to the parent directory.
if [ -f $decode_dir/final.mdl ]; then
  model=$decode_dir/final.mdl
else
  model=$decode_dir/../final.mdl # assume model one level up from decoding dir.
fi

for f in $lang/words.txt $model $decode_dir/lat.1.gz; do
  [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1;
done

mkdir -p $dir

# Reuse the job count of the decoding run and record it in the output dir.
nj=$(cat $decode_dir/num_jobs)
echo $nj > $dir/num_jobs

# Per decoding job: add the word insertion penalty, prune the lattice, align
# it to word boundaries, write a CTM with confidences (MBR decoding) and map
# word ids in field 5 back to symbols.  The only difference between the two
# branches is the word-alignment tool, which depends on which file the lang
# directory provides.
if [ -f $lang/phones/word_boundary.int ]; then
  $cmd JOB=1:$nj $dir/log/get_ctm.JOB.log \
    set -o pipefail '&&' \
    lattice-add-penalty --word-ins-penalty=$wip "ark:gunzip -c $decode_dir/lat.JOB.gz|" ark:- \| \
    lattice-prune --inv-acoustic-scale=$lmwt --beam=5 ark:- ark:- \| \
    lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \
    lattice-to-ctm-conf --frame-shift=$frame_shift --decode-mbr=true --inv-acoustic-scale=$lmwt ark:- - \| \
    utils/int2sym.pl -f 5 $lang/words.txt \
    '>' $dir/ctm.JOB || exit 1;
elif [ -f $lang/phones/align_lexicon.int ]; then
  # The job has to be submitted through $cmd exactly as in the branch above.
  # Without the "$cmd JOB=1:$nj <log>" prefix the shell would run "set"
  # locally with the rest of the line as positional parameters, and no
  # ctm.JOB file would be written.
  $cmd JOB=1:$nj $dir/log/get_ctm.JOB.log \
    set -o pipefail '&&' \
    lattice-add-penalty --word-ins-penalty=$wip "ark:gunzip -c $decode_dir/lat.JOB.gz|" ark:- \| \
    lattice-prune --inv-acoustic-scale=$lmwt --beam=5 ark:- ark:- \| \
    lattice-align-words-lexicon $lang/phones/align_lexicon.int $model ark:- ark:- \| \
    lattice-to-ctm-conf --frame-shift=$frame_shift --decode-mbr=true --inv-acoustic-scale=$lmwt ark:- - \| \
    utils/int2sym.pl -f 5 $lang/words.txt \
    '>' $dir/ctm.JOB || exit 1;
else
  echo "$0: neither $lang/phones/word_boundary.int nor $lang/phones/align_lexicon.int exists: cannot align."
  exit 1;
fi

# Concatenate the per-job CTMs, in job order, into a single file.
for n in `seq $nj`; do
  cat $dir/ctm.$n
done > $dir/ctm


================================================
FILE: egs/steps/get_ctm_fast.sh
================================================
#!/usr/bin/env bash
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
#           2017  Vimal Manohar
#           2018  Xiaohui Zhang
#           2018  Music Technology Group, Universitat Pompeu Fabra.
# Apache 2.0

# This script produces CTM files from a decoding directory that has lattices
# present. It does this for one LM weight and also supports 
# the word insertion penalty.
# This is similar to get_ctm.sh, but gets the CTM at the utterance-level.
# It can be faster than steps/get_ctm.sh --use-segments false as it splits
# the process across many jobs. 

# begin configuration section.
cmd=run.pl
stage=0              # accepted as an option, but not consulted by this script
frame_shift=0.01
lmwt=10              # passed to lattice-1best as --lm-scale
wip=0.0              # passed to lattice-1best as --word-ins-penalty
print_silence=false  # passed to nbest-to-ctm as --print-silence
#end configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1;

# Four positional arguments are mandatory: the CTM output directory is a
# separate argument from the decoding directory.
if [ $# -ne 4 ]; then
  echo "Usage: $0 [options] <data-dir> <lang-dir|graph-dir> <decode-dir> <ctm-out-dir>"
  echo " Options:"
  echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
  echo "    --stage (0|1|2)                 # start scoring script from part-way through."
  echo "    --frame-shift (default=0.01)    # specify this if your lattices have a frame-shift"
  echo "                                    # not equal to 0.01 seconds"
  echo "    --lmwt (default=10)             # language model weight"
  echo "    --wip (default=0.0)             # word insertion penalty"
  echo "e.g.:"
  echo "$0 data/train data/lang exp/tri4a/decode/ exp/tri4a/decode/ctm"
  echo "See also: steps/get_ctm.sh, steps/get_ctm_conf.sh"
  exit 1;
fi

data=$1
lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied.
decode_dir=$3
dir=$4

# Prefer a model stored in the decoding directory itself; otherwise fall back
# to the parent directory.
if [ -f $decode_dir/final.mdl ]; then
  model=$decode_dir/final.mdl
else
  model=$decode_dir/../final.mdl # assume model one level up from decoding dir.
fi

for f in $lang/words.txt $model $decode_dir/lat.1.gz; do
  [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1;
done

mkdir -p $dir

# Reuse the job count of the decoding run and record it in the output dir.
nj=$(cat $decode_dir/num_jobs)
echo $nj > $dir/num_jobs

# Select the word-alignment step according to what the lang directory offers.
# The string is expanded unquoted below, so the "|" in the lexicon variant
# reaches $cmd as a separate pipe argument.
if [ -f $lang/phones/word_boundary.int ]; then
  align_cmd="lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:-"
elif [ -f $lang/phones/align_lexicon.int ]; then
  align_cmd="lattice-align-words-lexicon $lang/phones/align_lexicon.int $model ark:- ark:- | lattice-1best ark:- ark:-"
else
  echo "$0: neither $lang/phones/word_boundary.int nor $lang/phones/align_lexicon.int exists: cannot align."
  exit 1;
fi

# Per decoding job: best path -> word alignment -> CTM -> word symbols.
$cmd JOB=1:$nj $dir/log/get_ctm.JOB.log \
  set -o pipefail '&&' \
  lattice-1best --lm-scale=$lmwt --word-ins-penalty=$wip "ark:gunzip -c $decode_dir/lat.JOB.gz|" ark:- \| \
  $align_cmd \| \
  nbest-to-ctm --frame-shift=$frame_shift --print-silence=$print_silence ark:- - \| \
  utils/int2sym.pl -f 5 $lang/words.txt \
  '>' $dir/ctm.JOB || exit 1;

# Merge the per-job outputs, in job order, into one CTM.
for job in $(seq $nj); do
  cat $dir/ctm.$job
done > $dir/ctm


================================================
FILE: egs/steps/get_fmllr_basis.sh
================================================
#!/usr/bin/env bash

# Copyright 2012   Carnegie Mellon University (Author: Yajie Miao)
#                  Johns Hopkins University (Author: Daniel Povey)

# Decoding script that computes basis for basis-fMLLR (see decode_fmllr_basis.sh).
# This can be on top of delta+delta-delta, or LDA+MLLT features.

stage=0
# Parameters in alignment of training data
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
per_utt=true # If true, then treat each utterance as a separate speaker for purposes of
  # basis training... this is recommended if the number of actual speakers in your
  # training set is less than (feature-dim) * (feature-dim+1).
silence_weight=0.01 # weight given to silence phones by weight-silence-post
cmd=run.pl
# End configuration section

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

# Exactly three positional arguments are required.  The example line names
# this script (it previously pointed at a different script).
if [ $# != 3 ]; then
   echo "Usage: steps/get_fmllr_basis.sh [options] <data-dir> <lang-dir> <exp-dir>"
   echo " e.g.: steps/get_fmllr_basis.sh data/train_si84 data/lang exp/tri3b/"
   echo "Note: we currently assume that this is the same data you trained the model with."
   echo "main options (for others, see top of script file)"
   echo "  --config <config-file>                   # config containing options"
   echo "  --cmd <cmd>                              # how to run jobs (e.g. run.pl, queue.pl)"
   exit 1;
fi

data=$1
lang=$2
dir=$3   # must contain num_jobs, final.mdl and ali.*.gz; fmllr.basis is written here

nj=`cat $dir/num_jobs` || exit 1;
sdata=$data/split$nj;
# (Re)split the data unless the split directory exists and feats.scp is older
# than it.
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;

# All model-side files come from $dir: this script has a single experiment
# directory and never defines $srcdir, so reading "$srcdir/..." would silently
# look at the filesystem root.
splice_opts=`cat $dir/splice_opts 2>/dev/null` # frame-splicing options.
cmvn_opts=`cat $dir/cmvn_opts 2>/dev/null`

silphonelist=`cat $lang/phones/silence.csl` || exit 1;

for f in $data/feats.scp $dir/final.mdl $dir/ali.1.gz; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

utils/lang/check_phones_compatible.sh $lang/phones.txt $dir/phones.txt || exit 1;
# Set up the unadapted features "$sifeats".
# The presence of final.mat selects splice+LDA features; otherwise deltas.
if [ -f $dir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type";
case $feat_type in
  delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
  lda) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |";;
  *) echo "Invalid feature type $feat_type" && exit 1;
esac

# Set up the adapted features "$feats" for training set.
# If $dir holds per-job transforms (trans.JOB) they are applied on top of the
# unadapted features; otherwise the unadapted features are used as they are.
if [ -f $dir/trans.1 ]; then
  feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |";
else
  feats="$sifeats";
fi


if $per_utt; then
  spk2utt_opt=  # treat each utterance as separate speaker when computing basis.
  echo "Doing per-utterance adaptation for purposes of computing the basis."
else
  echo "Doing per-speaker adaptation for purposes of computing the basis."
  # Count the speakers in the un-split data directory.  The split directory is
  # only ever addressed through its per-job sub-directories ($sdata/JOB/...),
  # so counting lines of "$sdata/spk2utt" would make the warning fire always.
  [ $(cat $data/spk2utt | wc -l) -lt $((41*40)) ] && \
    echo "Warning: number of speakers is small, might be better to use --per-utt=true."
  spk2utt_opt="--spk2utt=ark:$sdata/JOB/spk2utt"
fi

# Note: we get Gaussian level alignments with the "final.mdl" and the
# speaker adapted features.
# Per job: alignments -> posteriors -> down-weight silence -> Gaussian-level
# posteriors (on "$feats") -> basis-fMLLR statistics (on "$sifeats").
$cmd JOB=1:$nj $dir/log/basis_acc.JOB.log \
  ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \
  weight-silence-post $silence_weight $silphonelist $dir/final.mdl ark:- ark:- \| \
  gmm-post-to-gpost $dir/final.mdl "$feats" ark:- ark:- \| \
  gmm-basis-fmllr-accs-gpost $spk2utt_opt \
    $dir/final.mdl "$sifeats" ark,s,cs:- $dir/basis.acc.JOB || exit 1;

# Compute the basis matrices.
$cmd $dir/log/basis_training.log \
  gmm-basis-fmllr-training $dir/final.mdl $dir/fmllr.basis $dir/basis.acc.* || exit 1;
# The per-job accumulators are no longer needed once the basis is written.
rm $dir/basis.acc.* 2>/dev/null

exit 0;


================================================
FILE: egs/steps/get_lexicon_probs.sh
================================================
#!/usr/bin/env bash
# Copyright 2013  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0


# From a training or alignment directory, and an original lexicon.txt and lang/
# directory, obtain a new lexicon with pronunciation probabilities.
# Note: this script is currently deprecated, the recipes are using a different
# script in utils/dict_dir_add_pronprobs.sh.


# Begin configuration section.
stage=0
smooth_count=1.0 # Amount of count to add corresponding to each original lexicon entry;
                 # this corresponds to add-one smoothing of the pron-probs.
max_one=true   # If true, normalize the pron-probs so the maximum value for each word is 1.0,
               # rather than summing to one.  This is quite standard.

# End configuration options.

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh # source the path.
. parse_options.sh || exit 1;

# Six positional arguments are required.  The option list below only names
# options that have a matching variable in the configuration section: the
# smoothing option is spelled after its variable (smooth_count), and no --cmd
# option is listed because this script defines no "cmd" variable and runs all
# of its commands directly.
if [ $# != 6 ]; then
   echo "Usage: steps/get_lexicon_probs.sh <data-dir> <lang-dir> <src-dir|ali-dir> <old-lexicon> <exp-dir> <new-lexicon>"
   echo "e.g.: steps/get_lexicon_probs.sh data/train data/lang exp/tri5 data/local/lexicon.txt \\"
   echo "                      exp/tri5_lexprobs data/local_withprob/lexicon.txt"
   echo "Note: we assume you ran using word-position-dependent phones but both the old and new lexicon will not have"
   echo "these markings.  We also assume the new lexicon will have pron-probs but the old one does not; this limitation"
   echo "of the script can be removed later."
   echo "Main options (for others, see top of script file)"
   echo "  --config <config-file>                           # config containing options"
   echo "  --stage <stage>                                  # used to control partial re-running."
   echo "  --max-one <true|false>                           # If true, normalize so max prob of each"
   echo "                                                   # word is one.  Default: true"
   echo "  --smooth-count <smooth-count>                    # Amount to smooth each count by (default: 1.0)"
   exit 1;
fi

data=$1
lang=$2
srcdir=$3
old_lexicon=$4
dir=$5
new_lexicon=$6

oov=`cat $lang/oov.int` || exit 1;
nj=`cat $srcdir/num_jobs` || exit 1;

for f in $data/text $lang/L.fst $lang/phones/word_boundary.int $srcdir/ali.1.gz $old_lexicon; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

mkdir -p $dir/log
utils/split_data.sh $data $nj # Make sure split data-dir exists.
sdata=$data/split$nj

utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

# Stage 0: count how often each (word, pronunciation) pair occurs in the
# alignments of $srcdir and write the counts to $dir/lexicon_counts.txt.
#
# Steps of the pipeline below, in order:
#   * all ali.N.gz archives are decompressed and combined with the word-id
#     transcripts of $data/text (OOV words mapped to $oov) by linear-to-nbest;
#   * lattice-align-words and lattice-to-phone-lattice --replace-words=false
#     produce text-format output on stdout;
#   * the first awk counts identical "<field 3> <field 4>" pairs over all
#     lines that have exactly four fields
#     (NOTE(review): presumably field 3 is the word id and field 4 a string
#     of the form "0,0,<phone>_<phone>..." -- confirm against the lattice
#     text format);
#   * "0,0," is stripped, the columns are reordered to
#     "<word> <count> <phones>", and "_" separators become spaces;
#   * phone ids (fields 3 onwards) are mapped to symbols, trailing _I/_E/_B/_S
#     markers are removed from them, and finally the word id in field 1 is
#     mapped to its symbol.
# Whatever the sub-shell still prints (stdout of the pipeline itself goes to
# lexicon_counts.txt) is shown and copied to $dir/log/get_fsts.log.
if [ $stage -le 0 ]; then

  ( ( for n in `seq $nj`; do gunzip -c $srcdir/ali.$n.gz; done ) | \
    linear-to-nbest ark:- "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $data/text |" '' '' ark:- | \
    lattice-align-words $lang/phones/word_boundary.int $srcdir/final.mdl ark:- ark:- | \
    lattice-to-phone-lattice --replace-words=false $srcdir/final.mdl ark:- ark,t:- | \
    awk '{ if (NF == 4) { word_phones = sprintf("%s %s", $3, $4); count[word_phones]++; } } 
        END { for(key in count) { print count[key], key; } }' | \
          sed s:0,0,:: | awk '{print $2, $1, $3;}' | sed 's/_/ /g' | \
          utils/int2sym.pl -f 3- $lang/phones.txt  | \
          sed -E 's/_I( |$)/ /g' |  sed -E 's/_E( |$)/ /g' | sed -E 's/_B( |$)/ /g' | sed -E 's/_S( |$)/ /g' | \
          utils/int2sym.pl -f 1 $lang/words.txt > $dir/lexicon_counts.txt
  ) 2>&1 | tee $dir/log/get_fsts.log

fi

# Sanity check: the old lexicon must not already contain pron-probs.  awk
# leaves with status 1 as soon as the second field of some line is not a
# number in the range (0, 1]; only if every line passes does it exit with 0,
# which triggers the error below.  The upper bound is inclusive because 1.0 is
# a perfectly valid pronunciation probability (it is in fact the value of the
# best pronunciation of every word when normalizing with --max-one true).
cat $old_lexicon | awk '{if (!($2 > 0.0 && $2 <= 1.0)) { exit(1); }}' && \
  echo "Error: old lexicon $old_lexicon appears to have pron-probs; we don't expect this." && \
  exit 1;

# Make sure the directory that will receive the new lexicon exists.
mkdir -p `dirname $new_lexicon` || exit 1;

# Stage 1: turn the counts of stage 0 into pronunciation probabilities.
# The perl program reads the old lexicon (every entry is credited with
# $smooth_count), adds the observed counts read from stdin (lines starting
# with the word <eps> are filtered out first), normalizes per word and prints
# "<word> TAB <prob> TAB <original pronunciation>" to $new_lexicon.
if [ $stage -le 1 ]; then
  grep -v -w '^<eps>' $dir/lexicon_counts.txt | \
  perl -e ' ($old_lexicon, $smooth_count, $max_one) = @ARGV;
    ($smooth_count >= 0) || die "Invalid smooth_count $smooth_count";
    ($max_one eq "true" || $max_one eq "false") || die "Invalid max_one variable $max_one";
    open(O, "<$old_lexicon")||die "Opening old-lexicon file $old_lexicon";
    while(<O>) {
      $_ =~ m/(\S+)\s+(.+)/ || die "Bad old-lexicon line $_";
      $word = $1;
      $orig_pron = $2;
      # Remember the mapping from canonical prons to original prons: in the case of
      # syllable based systems we want to remember the locations of tabs in
      # the original lexicon.
      $pron = join(" ", split(" ", $orig_pron));
      $orig_pron{$word,$pron} = $orig_pron;
      $count{$word,$pron} += $smooth_count;
      $tot_count{$word} += $smooth_count;
    }
    # Add the counts observed in the alignments: "<word> <count> <phones...>".
    while (<STDIN>) {
      $_ =~ m/(\S+)\s+(\S+)\s+(.+)/ || die "Bad new-lexicon line $_";
      $word = $1;
      $this_count = $2;
      $pron = join(" ", split(" ", $3));
      $count{$word,$pron} += $this_count;
      $tot_count{$word} += $this_count;
    }
    if ($max_one eq "true") {  # replace $tot_count{$word} with max count
       # of any pron.
      %tot_count = (); # set to empty assoc array (an empty list; assigning
                       # {} would store a hash reference as a bogus key).
      foreach $key (keys %count) {
        ($word, $pron) = split($; , $key); # $; is separator for strings that index assoc. arrays.
        $this_count = $count{$key};
        if (!defined $tot_count{$word} || $this_count > $tot_count{$word}) {
          $tot_count{$word} = $this_count;
        }
      }
    }
    # Every counted pron must have been present in the old lexicon; its
    # probability is its count divided by the normalizer of its word.
    foreach $key (keys %count) {
       ($word, $pron) = split($; , $key); # $; is separator for strings that index assoc. arrays.
       $this_orig_pron = $orig_pron{$key};
       if (!defined $this_orig_pron) { die "Word $word and pron $pron did not appear in original lexicon."; }
       if (!defined $tot_count{$word}) { die "Tot-count not defined for word $word."; }
       $prob = $count{$key} / $tot_count{$word};
       print "$word\t$prob\t$this_orig_pron\n";  # Output happens here.
    } '  $old_lexicon $smooth_count $max_one > $new_lexicon || exit 1;
fi

exit 0;

# NOTE(review): everything from here to the end of this script is unreachable,
# because the script has already left through the unconditional "exit 0;"
# that precedes this section.  The code aligns data in two passes with fMLLR
# transforms estimated in between, which is unrelated to computing lexicon
# probabilities -- presumably a leftover copied from an alignment script.  It
# also relies on variables that this script never sets (for example
# $cmvn_opts, $boost_silence, $use_graphs, $cmd, $scale_opts, $beam,
# $retry_beam, $silphonelist and $fmllr_update_type), so it could not run
# as-is even if it were reached.
echo $nj > $dir/num_jobs
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;

cp $srcdir/{tree,final.mdl} $dir || exit 1;
cp $srcdir/final.occs $dir;
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options.


if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"

case $feat_type in
  delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
  lda) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
    cp $srcdir/final.mat $dir    
   ;;
  *) echo "Invalid feature type $feat_type" && exit 1;
esac

## Set up model and alignment model.
mdl=$srcdir/final.mdl
if [ -f $srcdir/final.alimdl ]; then
  alimdl=$srcdir/final.alimdl
else
  alimdl=$srcdir/final.mdl
fi
[ ! -f $mdl ] && echo "$0: no such model $mdl" && exit 1;
alimdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $alimdl - |"
mdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $mdl - |"


## Work out where we're getting the graphs from.
if $use_graphs; then
  [ "$nj" != "`cat $srcdir/num_jobs`" ] && \
    echo "$0: you specified --use-graphs true, but #jobs mismatch." && exit 1;
  [ ! -f $srcdir/fsts.1.gz ] && echo "No graphs in $srcdir" && exit 1;
  graphdir=$srcdir
else
  graphdir=$dir
  if [ $stage -le 0 ]; then
    echo "$0: compiling training graphs"
    tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|";   
    $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log  \
      compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/final.mdl  $lang/L.fst "$tra" \
        "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
  fi
fi


if [ $stage -le 1 ]; then
  echo "$0: aligning data in $data using $alimdl and speaker-independent features."
  $cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \
    gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$alimdl_cmd" \
    "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$sifeats" "ark:|gzip -c >$dir/pre_ali.JOB.gz" || exit 1;
fi

if [ $stage -le 2 ]; then
  echo "$0: computing fMLLR transforms"
  if [ "$alimdl" != "$mdl" ]; then
    $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \
      ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \
      weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \
      gmm-post-to-gpost $alimdl "$sifeats" ark:- ark:- \| \
      gmm-est-fmllr-gpost --fmllr-update-type=$fmllr_update_type \
      --spk2utt=ark:$sdata/JOB/spk2utt $mdl "$sifeats" \
      ark,s,cs:- ark:$dir/trans.JOB || exit 1;
  else
    $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \
      ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \
      weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \
      gmm-est-fmllr --fmllr-update-type=$fmllr_update_type \
      --spk2utt=ark:$sdata/JOB/spk2utt $mdl "$sifeats" \
      ark,s,cs:- ark:$dir/trans.JOB || exit 1;
  fi
fi

feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |"

if [ $stage -le 3 ]; then
  echo "$0: doing final alignment."
  $cmd JOB=1:$nj $dir/log/align_pass2.JOB.log \
    gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$mdl_cmd" \
    "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi

rm $dir/pre_ali.*.gz

echo "$0: done aligning data."

utils/summarize_warnings.pl $dir/log

exit 0;


================================================
FILE: egs/steps/get_prons.sh
================================================
#!/usr/bin/env bash
# Copyright  2014  Johns Hopkins University (Author: Daniel Povey)
#            2014  Guoguo Chen
# Apache 2.0

# Begin configuration section.
cmd=run.pl   # how the per-job commands are launched
stage=1      # first stage to run; earlier stages are skipped
lmwt=10      # LM scale for lattice-1best; only relevant for lattice input
# End configuration options.

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then
  . ./path.sh # source the path.
fi
. parse_options.sh || exit 1;

# Anything other than exactly three positional arguments prints the usage.
if [ $# -ne 3 ]; then
   printf '%s\n' \
     "usage: $0 <data-dir> <lang-dir> <dir>" \
     "e.g.:  $0 data/train data/lang exp/tri3" \
     "or:  $0 data/train data/lang exp/tri3/decode_dev" \
     "This script writes files prons.*.gz in the directory provided, which must" \
     "contain alignments (ali.*.gz) or lattices (lat.*.gz).  These files are as" \
     "output by nbest-to-prons (see its usage message)." \
     "Main options (for others, see top of script file)" \
     "  --config <config-file>                           # config containing options" \
     "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs." \
     "  --lmwt <lm-weight>                               # scale for LM, only applicable" \
     "                                                   # for lattice input (default: 10)"
   exit 1;
fi

# As the usage message of nbest-to-prons says, its output has lines that can be interpreted as
#  <utterance-id> <begin-frame> <num-frames> <word> <phone1> <phone2> ... <phoneN>
# where the words and phones are integer ids; you could convert these into text form using a command like:
# gunzip -c prons.*.gz | utils/int2sym.pl -f 4 words.txt | utils/int2sym.pl -f 5- phones.txt


data=$1
lang=$2
dir=$3

# These inputs are needed whichever branch is taken later on.
for required in $data/utt2spk $lang/words.txt $dir/num_jobs; do
  if [ ! -f $required ]; then
    echo "$0: expected file $required to exist"
    exit 1;
  fi
done

nj=$(cat $dir/num_jobs) || exit 1;
sdata=$data/split$nj
oov=$(cat $lang/oov.int) || exit 1;
# Split the data directory unless an existing split is newer than feats.scp.
if ! [[ -d $sdata && $data/feats.scp -ot $sdata ]]; then
  split_data.sh $data $nj || exit 1;
fi


# Locate the model: in $dir itself, or one level up (e.g. decoding directories).
if [ -f $dir/final.mdl ]; then
  mdl=$dir/final.mdl
elif [ -f $dir/../final.mdl ]; then
  mdl=$dir/../final.mdl
else
  echo "$0: expected $dir/final.mdl or $dir/../final.mdl to exist."
  exit 1;
fi

# Word alignment uses word_boundary.int when it is there and falls back to
# align_lexicon.int otherwise.
if [ -f $lang/phones/word_boundary.int ]; then
  align_words_cmd="lattice-align-words $lang/phones/word_boundary.int $mdl ark:- ark:-"
elif [ -f $lang/phones/align_lexicon.int ]; then
  align_words_cmd="lattice-align-words-lexicon $lang/phones/align_lexicon.int $mdl ark:- ark:-"
else
  echo "$0: expected either $lang/phones/word_boundary.int or $lang/phones/align_lexicon.int to exist."
  exit 1;
fi

# Decide where the word sequences come from.  Alignments are preferred when
# both alignments and lattices are present.  The producing command is kept in
# an array so that its empty-string arguments survive unchanged.
if [ -f $dir/ali.1.gz ]; then
  echo "$0: $dir/ali.1.gz exists, so starting from alignments."
  nbest_src=(linear-to-nbest "ark:gunzip -c $dir/ali.JOB.gz|"
             "ark:sym2int.pl --map-oov $oov -f 2- $lang/words.txt <$sdata/JOB/text |"
             "" "" ark:-)
elif [ -f $dir/lat.1.gz ]; then
  echo "$0: $dir/lat.1.gz exists, so starting from lattices."
  nbest_src=(lattice-1best --lm-scale=$lmwt "ark:gunzip -c $dir/lat.JOB.gz|" ark:-)
else
  echo "$0: expected either $dir/ali.1.gz or $dir/lat.1.gz to exist."
  exit 1;
fi

# Stage 1: word-align the chosen input and write one prons.JOB.gz per job,
# after clearing any archives left over from an earlier run.
if [ $stage -le 1 ]; then
  rm $dir/prons.*.gz 2>/dev/null
  $cmd JOB=1:$nj $dir/log/nbest_to_prons.JOB.log \
    "${nbest_src[@]}" \| $align_words_cmd \| \
    nbest-to-prons $mdl ark:- "|gzip -c >$dir/prons.JOB.gz" || exit 1;
fi


# Stage 2: blank out the first three fields of every prons line and count how
# often each remaining "<word> <phones...>" text occurs (still integer ids).
if [ $stage -le 2 ]; then
  gunzip -c $dir/prons.*.gz | \
    awk '{ $1 = ""; $2 = ""; $3 = ""; seen[$0]++ }
         END { for (entry in seen) print seen[entry], entry }' > $dir/pron_counts.int || exit 1;
fi

# Stage 3: map word ids (field 2) and phone ids (fields 3 onwards) to symbols
# and sort by decreasing count.
if [ $stage -le 3 ]; then
  utils/int2sym.pl -f 2 $lang/words.txt < $dir/pron_counts.int | \
    utils/int2sym.pl -f 3- $lang/phones.txt | sort -nr > $dir/pron_counts.txt
fi

# Stage 4: only if the lang directory has word_boundary.int,
# remove the _B, _I, _S, _E markers from phones; this is often convenient
# if we want to go back to a word-position-independent source lexicon.
if [ $stage -le 4 ] && [ -f $lang/phones/word_boundary.int ]; then
  cat $dir/pron_counts.txt | perl -ane 'for ($i = 2; $i < @F; $i++) { $F[$i] =~ s/_[BISE]$//; }
    print join(" ", @F) . "\n"; ' >$dir/pron_counts_nowb.txt
fi

# Stage 5: derive three files from prons.*.gz:
#   pron_perutt_nowb.txt         one line per utterance of tab-separated
#                                "word pron" entries between <s> and </s>
#   pron_bigram_counts_nowb.txt  counts of adjacent "word pron" entries,
#                                computed after dropping <eps> entries
#   sil_counts_nowb.txt          per "word pron" entry, how often an <eps>
#                                entry / a non-<eps> entry precedes and
#                                follows it
if [ $stage -le 5 ]; then
  # Here we figure the count of silence before and after words (actually prons)
  # 1. Create a text like file, but instead of putting words, we write
  #    "word pron" pairs. We change the format of prons.*.gz from pron-per-line
  #    to utterance-per-line (with "word pron" pairs tab-separated), and add
  #    <s> and </s> at the begin and end of each sentence. The _B, _I, _S, _E
  #    markers are removed from phones.
  #    (The cut keeps field 1, the utterance id, and fields 4 onwards, i.e.
  #    the word followed by its phones; a change of utterance id in the input
  #    closes the current output line and opens the next one.)
  gunzip -c $dir/prons.*.gz | utils/int2sym.pl -f 4 $lang/words.txt | \
    utils/int2sym.pl -f 5- $lang/phones.txt | cut -d ' ' -f 1,4- | awk '
    BEGIN { utter_id = ""; }
    {
      if (utter_id == "") { utter_id = $1; printf("%s\t<s>", utter_id); }
      else if (utter_id != $1) {
        printf "\t</s>\n"; utter_id = $1; printf("%s\t<s>", utter_id);
      }
      printf("\t%s", $2);
      for (n = 3; n <= NF; n++) { sub("_[BISE]$", "", $n); printf(" %s", $n); }
    }
    END { printf "\t</s>\n"; }' > $dir/pron_perutt_nowb.txt

  # 2. Collect bigram counts for words. To be more specific, we are actually
  #    collecting counts for "v ? w", where "?" represents silence or
  #    non-silence.
  #    (The first perl command deletes every tab-terminated entry that starts
  #    with <eps>; the second one counts each pair of neighbouring columns,
  #    starting at column 1 so that the utterance id in column 0 is skipped.
  #    Output lines are "<count> TAB <entry> TAB <next entry>".)
  cat $dir/pron_perutt_nowb.txt | perl -ape 's/<eps>[^\t]*\t//g;' | perl -e '
    while (<>) {
      chomp; @col = split("\t");
      for($i = 1; $i < scalar(@col) - 1; $i += 1) {
        $bigram{$col[$i] . "\t" . $col[$i + 1]} += 1;
      }
    }
    foreach $key (keys %bigram) {
      print "$bigram{$key}\t$key\n";
    }' > $dir/pron_bigram_counts_nowb.txt

  # 3. Collect bigram counts for silence and words. the count file has 4 fields
  #    for counts, followed by the "word pron" pair. All fields are separated by
  #    spaces:
  #    <sil-before-count> <nonsil-before-count> <sil-after-count> <nonsil-after-count> <word> <phone1> <phone2 >...
  #    (The utterance id is removed by "cut -f 2-" first.  An entry counts as
  #    silence when it starts with "<eps> "; counts that were never touched
  #    are forced to 0 before printing, and entries are printed in sorted
  #    order.)
  cat $dir/pron_perutt_nowb.txt | cut -f 2- | perl -e '
    %sil_wpron = (); %nonsil_wpron = (); %wpron_sil = (); %wpron_nonsil = ();
    %words = ();
    while (<STDIN>) {
      chomp;
      @col = split(/[\t]+/, $_); @col >= 2 || die "'$0': bad line \"$_\"\n";
      for ($n = 0; $n < @col - 1; $n++) {
        # First word is not silence, collect the wpron_sil and wpron_nonsil
        # stats.
        if ($col[$n] !~ m/^<eps> /) {
          if ($col[$n + 1] =~ m/^<eps> /) { $wpron_sil{$col[$n]} += 1; }
          else { $wpron_nonsil{$col[$n]} += 1; }
          $words{$col[$n]} = 1;
        }
        # Second word is not silence, collect the sil_wpron and nonsil_wpron
        # stats.
        if ($col[$n + 1] !~ m/^<eps> /) {
          if ($col[$n] =~ m/^<eps> /) { $sil_wpron{$col[$n + 1]} += 1; }
          else { $nonsil_wpron{$col[$n + 1]} += 1; }
          $words{$col[$n + 1]} = 1;
        }
      }
    }
    foreach $wpron (sort keys %words) {
      $sil_wpron{$wpron} += 0; $nonsil_wpron{$wpron} += 0;
      $wpron_sil{$wpron} += 0; $wpron_nonsil{$wpron} += 0;;
      print "$sil_wpron{$wpron} $nonsil_wpron{$wpron} ";
      print "$wpron_sil{$wpron} $wpron_nonsil{$wpron} $wpron\n";
    }
  '> $dir/sil_counts_nowb.txt
fi

# Final summary of the files that were produced; pron_counts_nowb.txt is only
# mentioned when word_boundary.int exists, matching the condition of stage 4.
echo "$0: done writing prons to $dir/prons.*.gz, silence counts in "
echo "$0: $dir/sil_counts_nowb.txt and pronunciation counts in "
echo "$0: $dir/pron_counts.{int,txt}"
if [ -f $lang/phones/word_boundary.int ]; then
  echo "$0: ... and also in $dir/pron_counts_nowb.txt"
fi

exit 0;


================================================
FILE: egs/steps/get_train_ctm.sh
================================================
#!/usr/bin/env bash
# Copyright Johns Hopkins University (Author: Daniel Povey) 2012.  Apache 2.0.

# This script produces CTM files from a training directory that has alignments
# present.


# begin configuration section.
# The usage message below documents command-line options (--cmd, --stage,
# --use-segments, --frame-shift) that correspond to these variables;
# presumably parse_options.sh (sourced below) maps --some-name to some_name.
cmd=run.pl
frame_shift=0.01
stage=0
use_segments=true # if we have a segments file, use it to convert
                  # the segments to be relative to the original files.
print_silence=false # if true, will print <eps> (optional-silence) arcs.

#end configuration section.

echo "$0 $@"  # Print the command line for logging

# path.sh is optional; parse_options.sh is required (the script exits if
# sourcing it fails).
[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1;

# Exactly three or four positional arguments are accepted; anything else
# prints the usage message and exits with status 1.
if [ $# -ne 3 ] && [ $# -ne 4 ]; then
  echo "Usage: $0 [options] <data-dir> <lang-dir> <ali-dir|model-dir> [<output-dir>]"
  echo "(<output-dir> defaults to  <ali-dir|model-dir>.)"
  echo " Options:"
  echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
  echo "    --stage (0|1)                   # start this script from part-way through."
  echo "    --use-segments (true|false)     # use segments and reco2file_and_channel files "
  echo "                                    # to produce a ctm relative to the original audio"
  echo "                                    # files, with channel information (typically needed"
  echo "                                    # for NIST scoring)."
  echo "    --frame-shift (default=0.01)    # specify this if your alignments have a frame-shift"
  echo "                                    # not equal to 0.01 seconds"
  echo "    --print-silence (true|false)    # if true, also print <eps> (optional-silence) arcs"
  echo "                                    # in the ctm (default=false)."
  echo "e.g.:"
  echo "$0 data/train data/lang exp/tri3a_ali"
  echo "Produces ctm in: exp/tri3a_ali/ctm"
  exit 1;
fi

# Positional arguments.
data=$1
lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied.
ali_dir=$3
dir=$4
# The output directory is optional and defaults to the alignment directory.
# The expansion is quoted so that the test is well-formed both when the value
# is empty and when it contains whitespace.
if [ -z "$dir" ]; then
  dir=$ali_dir
fi


model=$ali_dir/final.mdl # the model is expected directly inside <ali-dir|model-dir>.


# Fail early if any required input file is missing.
for f in $lang/words.txt $model $ali_dir/ali.1.gz $lang/oov.int; do
  [ ! -f "$f" ] && echo "$0: expecting file $f to exist" && exit 1;
done

oov=`cat $lang/oov.int` || exit 1;     # passed to sym2int.pl --map-oov in stage 0.
nj=`cat $ali_dir/num_jobs` || exit 1;  # number of ali.JOB.gz archives / parallel jobs.
split_data.sh $data $nj || exit 1;
sdata=$data/split$nj

mkdir -p $dir/log || exit 1;

# Stage 0: for each of the $nj jobs, turn the alignments in ali.JOB.gz plus the
# integerized transcripts into a per-job CTM, written to $dir/ctm.JOB.gz.
# The pipe and redirection characters are escaped/quoted so that they are
# passed as arguments to $cmd instead of being interpreted by this shell
# (presumably $cmd then runs the whole pipeline once per JOB).
if [ $stage -le 0 ]; then
  if [ -f $lang/phones/word_boundary.int ]; then
    # Word alignment driven by word-boundary information for the phones.
    $cmd JOB=1:$nj $dir/log/get_ctm.JOB.log \
      set -o pipefail '&&' linear-to-nbest "ark:gunzip -c $ali_dir/ali.JOB.gz|" \
      "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $sdata/JOB/text |" \
      '' '' ark:- \| \
      lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \
      nbest-to-ctm --frame-shift=$frame_shift --print-silence=$print_silence ark:- - \| \
      utils/int2sym.pl -f 5 $lang/words.txt \| \
      gzip -c '>' $dir/ctm.JOB.gz || exit 1
  else
    # No word_boundary.int: fall back to lexicon-based word alignment, which
    # needs align_lexicon.int.
    if [ ! -f $lang/phones/align_lexicon.int ]; then
      echo "$0: neither $lang/phones/word_boundary.int nor $lang/phones/align_lexicon.int exists: cannot align."
      exit 1;
    fi
    # Same pipeline as above except for the alignment program and an extra
    # lattice-1best step before conversion to CTM.
    $cmd JOB=1:$nj $dir/log/get_ctm.JOB.log \
      set -o pipefail '&&' linear-to-nbest "ark:gunzip -c $ali_dir/ali.JOB.gz|" \
      "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $sdata/JOB/text |" \
      '' '' ark:- \| \
      lattice-align-words-lexicon $lang/phones/align_lexicon.int $model ark:- ark:- \| \
      lattice-1best ark:- ark:- \| \
      nbest-to-ctm --frame-shift=$frame_shift --print-silence=$print_silence ark:- - \| \
      utils/int2sym.pl -f 5 $lang/words.txt \| \
      gzip -c '>' $dir/ctm.JOB.gz || exit 1
  fi
fi

# Stage 1: merge the per-job CTMs into a single $dir/ctm and delete the
# per-job files.  When a segments file exists and --use-segments is true, the
# merged CTM is additionally passed through utils/convert_ctm.pl together with
# the segments and reco2file_and_channel files.
if [ $stage -le 1 ]; then
  if [ -f $data/segments ] && $use_segments; then
    f=$data/reco2file_and_channel
    if [ ! -f $f ]; then
      echo "$0: expecting file $f to exist"
      exit 1;
    fi
    for n in $(seq $nj); do
      gunzip -c $dir/ctm.$n.gz
    done | utils/convert_ctm.pl $data/segments $f > $dir/ctm || exit 1;
  else
    for n in $(seq $nj); do
      gunzip -c $dir/ctm.$n.gz
    done > $dir/ctm || exit 1;
  fi
  rm $dir/ctm.*.gz
fi


================================================
FILE: egs/steps/info/chain_dir_info.pl
================================================
#!/usr/bin/perl -w

use Fcntl;

# we may at some point support options.

$debug = 0;  # set to 1 (via --debug) for debugging the script itself.

# Only look at $ARGV[0] when there is at least one argument: under -w,
# comparing an undefined value would print an "uninitialized value" warning
# in front of the usage message.
if (@ARGV > 0 && $ARGV[0] eq "--debug") {
  $debug = 1;
  shift @ARGV;
}

if (@ARGV == 0) {
  print STDERR "Usage: steps/info/chain_dir_info.pl [--debug] <chain-dir1> [<chain-dir2> ... ]\n" .
               "e.g: steps/info/chain_dir_info.pl exp/chain/tdnn_sp\n" .
               "This script extracts some important information from the logs\n" .
               "and displays it on a single (rather long) line.\n" .
               "The --debug option is just to debug the script itself.\n" .
               "This program exits with status 0 if it seems like the arguments\n" .
               "really were of the expected directory type, and 1 otherwise.\n";
  exit(1);
}

if (@ARGV > 1) {
  # repeatedly invoke this program with each of the remaining args; the exit
  # status is 1 if any of the invocations failed.
  $exit_status = 0;
  @debug_opt = $debug ? ("--debug") : ();
  foreach $dir (@ARGV) {
    # The list form of system() does not go through the shell, so directory
    # names containing spaces or shell metacharacters are passed on intact.
    if (system($0, @debug_opt, $dir) != 0) {
      $exit_status = 1;
    }
  }
  exit($exit_status);
}

# From here on exactly one directory is being processed.
$nnet_dir = shift @ARGV;

# Returns the directory entries of $nnet_dir/log exactly as readdir reports
# them, or the empty list if that directory cannot be opened.
sub list_all_log_files {
  opendir(my $log_dh, "$nnet_dir/log") || return ();
  my @entries = readdir($log_dh);
  closedir($log_dh);
  return @entries;
}


# Returns 1 if the diagnostics are finished on iteration $iter, else 0.
# "Finished" means that compute_prob_train.$iter.log and
# compute_prob_valid.$iter.log can both be opened and that each of them
# contains a line matching "Overall log-probability".
sub diagnostics_are_finished_on_iter {
  my $iter = shift @_;
  my $ans = 1;
  foreach my $subset ("train", "valid") {
    # a log that cannot be opened counts as "not finished".
    open(F, "<$nnet_dir/log/compute_prob_${subset}.$iter.log") || return 0;
    $found_loglike = 0;
    while (<F>) {
      $found_loglike = 1 if m/Overall log-probability/;
    }
    $ans = 0 unless $found_loglike;
    close(F);
  }
  return $ans;
}

# Get the number of iterations; iterations are numbered 0 .. num-iters-1 and
# are counted by looking for train.<iter>.1.log in %log_file_hash.
# Dies if not even train.0.1.log exists.
# The returned number may be slightly smaller than the number of training
# iterations found: we look back over at most the last 7 iterations for the
# most recent one whose compute_prob_train / compute_prob_valid diagnostics
# have finished, and report that iteration plus one.
sub get_num_iters {
  my $num_train_iters = 0;
  $num_train_iters++ while defined $log_file_hash{"train.$num_train_iters.1.log"};
  if ($num_train_iters == 0) {
    die "$nnet_dir does not seem to be an nnet3 neural net training directory.";
  }
  my $newest = $num_train_iters - 1;
  my $oldest = $newest - 6;
  if ($oldest < 0) { $oldest = 0; }
  # walk backwards from the newest iteration until the diagnostics are done.
  foreach my $candidate (reverse($oldest .. $newest)) {
    if (diagnostics_are_finished_on_iter($candidate)) {
      return $candidate + 1;
    }
    if ($debug) {
      print STDERR "nnet3_dir_info.pl: diagnostics not finished running on iteration $candidate\n";
    }
  }
  # OK, something's not right, just return the original iteration count.
  return $num_train_iters;
}

# Returns the number of parallel jobs on iteration 0, i.e. the largest N such
# that train.0.1.log .. train.0.N.log are all present in %log_file_hash.
# Dies if there is no train.0.1.log.
sub get_num_jobs_initial {
  my $num_jobs = 0;
  $num_jobs++ while defined $log_file_hash{"train.0." . ($num_jobs + 1) . ".log"};
  die "$nnet_dir does not seem to be an nnet3 neural net training directory."
    if $num_jobs == 0;
  return $num_jobs;
}


# Returns the number of parallel jobs on the last iteration ($num_iters - 1),
# counted the same way as in get_num_jobs_initial.  Dies if no job log exists
# for that iteration.
sub get_num_jobs_final {  # expects $num_iters to exist as a global variable.
  my $final_iter = $num_iters - 1;
  my $num_jobs = 0;
  $num_jobs++ while defined $log_file_hash{"train.$final_iter." . ($num_jobs + 1) . ".log"};
  die "$nnet_dir does not seem to be an nnet3 neural net training directory."
    if $num_jobs == 0;
  return $num_jobs;
}

# Returns a string with info about the combination stage, taken from the
# first matching line of $nnet_dir/log/combine.log, or the empty string if
# there was no combination stage (no such log, it cannot be opened, or it
# contains no matching line).
sub get_combine_info {
  if (!defined $log_file_hash{"combine.log"}) { return ""; }
  my $fh;
  if (!open($fh, "<$nnet_dir/log/combine.log")) { return ""; }
  my $info = "";
  while (<$fh>) {
    if (m/Combining nnets, objective function changed from (\S+) to (\S+)/) {
      $info = sprintf(" combine=%.3f->%.3f", $1, $2);
      last;
    } elsif (m/Combining (\S+) nnets, objective function changed from (\S+) to (\S+)/) {
      $info = sprintf(" combine=%.3f->%.3f (over %d)", $2, $3, $1);
      last;
    }
  }
  # the handle is closed on every path, including when nothing matched.
  close($fh);
  return $info;
}

# Formats a number for printing: two decimal places if its absolute value is
# at least 1, three decimal places otherwise.
sub format_float_as_string {
  my ($value) = @_;
  my $format = (abs($value) >= 1.0) ? "%.2f" : "%.3f";
  return sprintf($format, $value);
}

# this is used in get_logprob_and_accuracy_info to format strings like
# ' logprob:train/valid[32,48,final]=(-0.093,-0.085,-0.081/-0.112,-0.104,-0.101)'.
# Arguments: the name to print (e.g. 'logprob'), a reference to the array of
# iteration strings, and references to the hashes mapping iteration string to
# the train value and to the valid value.  Only iterations that have both a
# train and a valid value are printed; if there are none, returns "".
sub get_printed_string {
  my ($name, $iters_array_ref, $train_hash_ref, $valid_hash_ref) = @_;
  my @iters_to_print =
    grep { defined($train_hash_ref->{$_}) && defined($valid_hash_ref->{$_}) }
         @$iters_array_ref;
  return "" if @iters_to_print == 0;
  my $joined_iters = join(",", @iters_to_print);
  my $joined_train_values =
    join(",", map { format_float_as_string($train_hash_ref->{$_}) } @iters_to_print);
  my $joined_valid_values =
    join(",", map { format_float_as_string($valid_hash_ref->{$_}) } @iters_to_print);
  return " ${name}:train/valid[$joined_iters]=($joined_train_values/$joined_valid_values)";
}


# invoke this as get_logprob_and_accuracy_info($iter1, $iter2,..) where $iterN
# is the string-valued iteration, e.g. "92", or "final", or "combined", such
# that we expect $nnet_dir/log/compute_prob_{train,valid}.$iterN.log to exist.
# An iteration is only parsed if both logs are listed in %log_file_hash and
# both can be opened.  Returns the "xent" and "logprob" strings produced by
# get_printed_string, concatenated.
sub get_logprob_and_accuracy_info {
  my @iters_array = @_;
  my (%iter_to_train_logprob, %iter_to_train_penalty, %iter_to_train_xent);
  my (%iter_to_valid_logprob, %iter_to_valid_penalty, %iter_to_valid_xent);

  foreach my $iter (@iters_array) {
    next unless defined $log_file_hash{"compute_prob_train.$iter.log"};
    next unless defined $log_file_hash{"compute_prob_valid.$iter.log"};
    next unless open(F, "<$nnet_dir/log/compute_prob_train.$iter.log");
    next unless open(G, "<$nnet_dir/log/compute_prob_valid.$iter.log");
    # the train log (F) and then the valid log (G) are parsed identically,
    # each into its own set of hashes.
    foreach my $spec ([\*F, \%iter_to_train_logprob, \%iter_to_train_penalty, \%iter_to_train_xent],
                      [\*G, \%iter_to_valid_logprob, \%iter_to_valid_penalty, \%iter_to_valid_xent]) {
      my ($fh, $logprob, $penalty, $xent) = @$spec;
      while (<$fh>) {
        if (m/Overall log-probability for 'output' is (\S+) \+ (\S+)/) {
          # log-prob printed together with a penalty term.
          $logprob->{$iter} = $1;
          $penalty->{$iter} = $2;
        } elsif (m/Overall log-probability for 'output' is (\S+)/) {
          $logprob->{$iter} = $1;
          $penalty->{$iter} = 0.0;
        } elsif (m/Overall log-probability for 'output-xent' is (\S+) per frame/) {
          $xent->{$iter} = $1;
        }
      }
      close($fh);
    }
  }
  # we don't do anything with the l2 penalties.
  $ans = get_printed_string("xent", \@iters_array, \%iter_to_train_xent,
                            \%iter_to_valid_xent) .
         get_printed_string("logprob", \@iters_array, \%iter_to_train_logprob,
                            \%iter_to_valid_logprob);
  return $ans;
}

# invoke this as get_progress_info($iter), e.g. set $iter to the last
# iteration number.  Reads $nnet_dir/log/progress.$iter.log and returns a
# string with the number of parameters, the largest clipped-proportion (only
# if above 0.1) and the input/ivector/output dimensions, as far as they were
# found; returns "" if the log is not listed or cannot be opened.
sub get_progress_info {
  my $iter = shift @_;
  return "" unless defined $log_file_hash{"progress.$iter.log"};
  return "" unless open(F, "<$nnet_dir/log/progress.$iter.log");

  my $num_parameters = "0";
  my ($input_dim, $ivector_dim, $output_dim) = (0, 0, 0);
  my $max_clipped_proportion = 0.0;
  while (<F>) {
    # the checks are independent: every one is applied to every line.
    if (m/clipped-proportion=([^,]+)/ && $1 > $max_clipped_proportion) {
      $max_clipped_proportion = $1;
    }
    if (m/^num-parameters: (\S+)/) {
      $num_parameters = sprintf("%.1fM", $1 / 1000000.0);
    }
    if (m/^output-node.* name=output .*dim=(\S+)/) { $output_dim = $1; }
    if (m/^input-node.* name=input .*dim=(\S+)/) { $input_dim = $1; }
    if (m/^input-node.* name=ivector .*dim=(\S+)/) { $ivector_dim = $1; }
  }
  close(F);

  $ans = "";
  $ans .= " num-params=$num_parameters" if $num_parameters ne "0";
  if ($max_clipped_proportion > 0.3) {
    # starred for emphasis; this generally isn't good.
    $ans .= " **max-clipped-proportion=$max_clipped_proportion**";
  } elsif ($max_clipped_proportion > 0.1) {
    $ans .= " max-clipped-proportion=$max_clipped_proportion";
  }
  if ($output_dim > 0) {
    if ($input_dim > 0 && $ivector_dim > 0) {
      $ans .= " dim=$input_dim+$ivector_dim->$output_dim";
    } elsif ($input_dim > 0) {
      $ans .= " dim=$input_dim->$output_dim";
    } else {
      $ans .= " output-dim=$output_dim";
    }
  }
  return $ans;
}

# Returns true if we seem to have finished training, i.e. if a
# compute_prob_train log for the "final" or the "combined" model is listed in
# %log_file_hash; otherwise returns false.
sub finished_training {
  return 1 if defined $log_file_hash{"compute_prob_train.final.log"};
  return defined $log_file_hash{"compute_prob_train.combined.log"};
}

# Main program: collect the names of the log files, then assemble and print a
# one-line summary of the training directory.
@log_files = list_all_log_files();
if (@log_files == 0) {  exit(1); }
# %log_file_hash is the set of file names found in $nnet_dir/log; the
# subroutines above use it to test whether a given log exists.  Note that it
# is the hash that has to be cleared here, not a scalar of the same name.
%log_file_hash = ();
foreach $f (@log_files) { $log_file_hash{$f} = 1; }

$num_iters = get_num_iters();
$num_jobs_initial = get_num_jobs_initial();
$num_jobs_final = get_num_jobs_final();
$last_iter = $num_iters - 1;
$two_thirds_iter = int($last_iter * 0.666);

$output_string = "$nnet_dir: num-iters=$num_iters";

$output_string .= " nj=$num_jobs_initial..$num_jobs_final";

$output_string .= get_progress_info("$last_iter");

$output_string .= get_combine_info();


# note: IIRC some of the scripts use the name 'combined' for the model after
# combination, and some 'final', so we try both; only one of these will
# actually produce any output.


@iters_array = ("$two_thirds_iter", "$last_iter", "final", "combined");

$output_string .= get_logprob_and_accuracy_info(@iters_array);

print "$output_string\n";

exit(0);


================================================
FILE: egs/steps/info/gmm_dir_info.pl
================================================
#!/usr/bin/perl -w

use Fcntl;

# we may at some point support options.

$debug = 0;  # set to 1 (via --debug) for debugging the script itself.

# Only look at $ARGV[0] when there is at least one argument: under -w,
# comparing an undefined value would print an "uninitialized value" warning
# in front of the usage message.
if (@ARGV > 0 && $ARGV[0] eq "--debug") {
  $debug = 1;
  shift @ARGV;
}

if (@ARGV == 0) {
  print STDERR "Usage: steps/info/gmm_dir_info.pl [--debug] <gmm-dir1> [<gmm-dir2> ... ]\n" .
               "e.g: steps/info/gmm_dir_info.pl exp/tri3\n" .
               "This script extracts some important information from the logs\n" .
               "and displays it on a single (rather long) line.\n" .
               "The --debug option is just to debug the script itself.\n" .
               "This program exits with status 0 if it seems like the argument\n" .
               "really was a GMM dir, and 1 otherwise.\n";
  exit(1);
}

if (@ARGV > 1) {
  # repeatedly invoke this program with each of the remaining args; the exit
  # status is 1 if any of the invocations failed.
  $exit_status = 0;
  @debug_opt = $debug ? ("--debug") : ();
  foreach $dir (@ARGV) {
    # The list form of system() does not go through the shell, so directory
    # names containing spaces or shell metacharacters are passed on intact.
    if (system($0, @debug_opt, $dir) != 0) {
      $exit_status = 1;
    }
  }
  exit($exit_status);
}


# From here on exactly one directory is being processed.
$gmm_dir = shift @ARGV;

# Returns the directory entries of $gmm_dir/log exactly as readdir reports
# them, or the empty list if that directory cannot be opened.
sub list_all_log_files {
  opendir(my $log_dh, "$gmm_dir/log") || return ();
  my @entries = readdir($log_dh);
  closedir($log_dh);
  return @entries;
}


# Returns the number of jobs, read from the first line of $gmm_dir/num_jobs.
# Prints a message to STDERR and exits with status 1 if $gmm_dir is not a
# directory, if the file cannot be opened, or if its first line is not a
# positive integer.
sub get_num_jobs {
  if (! -d $gmm_dir) {
    print STDERR "steps/info/gmm_dir_info.pl: no such directory $gmm_dir\n";
    exit(1);
  }
  my $fh;
  if (!open($fh, "<$gmm_dir/num_jobs")) {
    print STDERR "steps/info/gmm_dir_info.pl: no such file $gmm_dir/num_jobs\n";
    # without this exit we would go on to read from an unopened handle.
    exit(1);
  }
  my $num_jobs = <$fh>;
  close($fh);
  # $num_jobs is undefined if the file is empty.
  if (!defined $num_jobs || $num_jobs !~ m/^\s*\d+\s*$/ || $num_jobs <= 0) {
    print STDERR "steps/info/gmm_dir_info.pl: bad contents of file $gmm_dir/num_jobs\n";
    exit(1);
  }
  return 0 + $num_jobs;  # force conversion to integer.
}

# this function returns a string containing info from the last set of alignment
# jobs.  it may be empty if no alignment info was found, or if it didn't have the
# expected contents.
# It uses the globals @log_files, $gmm_dir, $num_jobs and $debug.
sub get_last_align_info {
  # find the highest alignment iteration for which job 1 left a log.
  my $max_align_iter = -1;
  foreach my $f (@log_files) {
    if ($f =~ m:^align\.(\d+)\.1\.log$: && $1 > $max_align_iter) {
      $max_align_iter = $1;
    }
  }
  if ($debug) {
    print STDERR "max-align-iter=$max_align_iter\n";
  }
  if ($max_align_iter == -1) { return ""; }  # something went wrong; return no info.

  my $num_utts = 0;
  my $num_utts_err = 0;
  my $num_utts_retry = 0;
  my $num_frames = 0;
  my $tot_loglike = 0;
  if ($debug) {
    print STDERR "Starting reading alignment logs\n";
  }
  for (my $j = 1; $j <= $num_jobs; $j++) {
    my $fh;
    if (open($fh, "<", "${gmm_dir}/log/align.$max_align_iter.$j.log")) {
      # we only need the summary lines, which are printed at the end of the
      # log, so skip to the last 10000 bytes.  Note the argument order of
      # seek: (FILEHANDLE, POSITION, WHENCE).  The window is deliberately
      # generous because other programs run by the same job may print their
      # own summary lines at the end of the log as well.  If the file is
      # shorter than the window the seek fails and we simply read from the
      # start; a partial first line cannot match any of the patterns below,
      # since each of them starts with a fixed word.
      seek($fh, -10000, Fcntl::SEEK_END);
      while (<$fh>) {
        if (m/Overall log-likelihood per frame is (\S+) over (\S+) frames./) {
          # accumulate a frame-weighted sum so that we can average over jobs.
          $tot_loglike += $1 * $2;
          $num_frames += $2;
        } elsif (m/Retried (\S+) out of (\S+) utterances/) {
          $num_utts_retry += $1;
          $num_utts += $2;
        } elsif (m/Done \S+, errors on (\S+)/) {
          $num_utts_err += $1;
        }
      }
      close($fh);
    }
  }
  if ($debug) {
    print STDERR "Done reading alignment logs\n";
  }
  if ($num_utts == 0 || $num_frames == 0) { return ""; }  # something went wrong.

  # note: the number of hours of data, e.g. "3.23h data", assumes 100 frames
  # per second, which is almost always true for GMM-based systems.
  return sprintf(" align prob=%.2f over %.2fh [retry=%.1f%%, fail=%.1f%%]",
                 ($tot_loglike / $num_frames), ($num_frames / 360000.0),
                 ($num_utts_retry * 100.0 / $num_utts), ($num_utts_err * 100.0 / $num_utts));
}


# this function returns a string containing info from the last update
# job.  Right now it includes info about the num-states and num-gauss
# and the percentage of Gaussians that had variances floored; if no alignment
# info is available we also say how much data was used and give the
# log-likelihood.
# the string may be empty if no such job was found.
# The optional argument is the string returned by get_last_align_info(); if
# the function is called without arguments, the global $align_info is used.
sub get_last_update_info {
  my $prev_align_info = @_ ? $_[0] : $align_info;
  $prev_align_info = "" unless defined $prev_align_info;

  # find the highest iteration that has an update log.
  my $max_update_iter = -1;
  foreach my $f (@log_files) {
    if ($f =~ m:^update\.(\d+)\.log$: && $1 > $max_update_iter) {
      $max_update_iter = $1;
    }
  }
  if ($debug) {
    print STDERR "max-update-iter=$max_update_iter\n";
  }
  if ($max_update_iter == -1) { return ""; }  # something went wrong; return no info.


  my $num_gauss = 0;           # total number of Gaussians before splitting,
                               # taken from the variance-flooring line.
  my $num_gauss_floored = 0;   # number of Gaussians with at least one variance floored.
  my $num_gauss_removed = 0;   # number of Gaussians removed due to low-counts
                               # [NOTE: nothing below parses this from the log yet].
  my $num_gauss_after_split = 0;  # total number of Gaussians after splitting [will
                                  # usually be same as before, on last iter.]
  my $num_states = 0;  # total number of states [pdf-ids]
  my $num_frames = 0;  # total number of frames.
  my $loglike = 0;  # log-likelihood [from auxf].

  my $fh;
  if (!open($fh, "<${gmm_dir}/log/update.$max_update_iter.log")) {
    return "";  # something went wrong.
  }
  while (<$fh>) {
    if (m/variance elements floored in (\S+) Gaussians, out of (\S+)/) {
      $num_gauss_floored = $1;
      $num_gauss = $2;
    } elsif (m/Overall avg like per frame = (\S+) over (\S+) frames/) {
      $loglike = $1;
      $num_frames = $2;
    } elsif (m/Split (\S+) states .+ split #Gauss from \S+ to (\S+)/) {
      $num_states = $1;
      $num_gauss_after_split = $2;
    }
  }
  close($fh);
  my $ans = "";

  if ($prev_align_info eq "") {
    # add some info that we'd otherwise get from the alignment jobs.
    if ($num_frames != 0) {
      # add info about how much data we trained on.
      $ans .= sprintf(" %.2fh data", $num_frames / 360000.0);
    }
    if ($loglike != 0) {
      $ans .= sprintf(" log-like=%.2f", $loglike);
    }
  }

  if ($num_states != 0) {
    $ans .= sprintf(" states=%d", $num_states);
  }

  # the next line is really just in case there was no splitting done-- in that
  # case we get the num-gauss from the line about the variance flooring.
  my $max_num_gauss = ($num_gauss > $num_gauss_after_split ? $num_gauss : $num_gauss_after_split);
  if ($max_num_gauss > 0) { $ans .= " gauss=$max_num_gauss"; }

  if ($num_gauss > 0 && $num_gauss_removed > 0) {
    $ans .= sprintf(" lowcount-gauss-removed=%d", $num_gauss_removed);
  }

  if ($num_gauss > 0 && $num_gauss_floored > 0) {
    # percentage of Gaussians that had at least one variance floored.
    $ans .= sprintf(" gauss-floored=%.2f%%", $num_gauss_floored * 100.0 / $num_gauss);
  }
  return $ans;
}


# Returns a string like " fmllr-impr=3.21 over 4.56h" summarizing the fMLLR
# estimation logs (fmllr.<iter>.<job>.log), or "" if there is nothing to
# report.  The per-frame auxf improvement is averaged over the jobs of each
# fMLLR iteration and summed over the iterations; the amount of data is taken
# from the last (numerically highest) iteration.
# It uses the globals @log_files, $gmm_dir, $num_jobs and $debug.
sub get_fmllr_info {
  my %fmllr_num_frames = ();  # maps from fmllr iteration to num-frames
  my %fmllr_auxf_impr = ();  # maps from fmllr iteration to total auxf impr times num-frames.
  foreach my $log_file (@log_files) {
    if ($log_file =~ m/^fmllr\.(\d+)\.(\d+)\.log$/) {
      my $iter = $1;
      my $job_number = $2;
      # logs from jobs above $num_jobs are ignored.
      my $fh;
      if ($job_number <= $num_jobs && open($fh, "<$gmm_dir/log/$log_file")) {
        while (<$fh>) {
          if (m/Overall fMLLR auxf impr per frame is (\S+) over (\S+) frames/) {
            $fmllr_num_frames{$iter} += $2;
            $fmllr_auxf_impr{$iter} += $1 * $2;
          }
        }
        close($fh);
      }
    }
  }
  my $tot_auxf_impr = 0.0;
  my $num_frames = 0.0;
  # the fMLLR auxf impr will be summed over the fMLLR iterations.  The
  # iterations are sorted numerically (a plain sort would put "9" after "10"),
  # so that the num-frames really come from the final iteration.
  foreach my $iter (sort { $a <=> $b } keys %fmllr_auxf_impr) {
    if ($debug) {
      print STDERR "fmllr iter $iter: $fmllr_auxf_impr{$iter} / $fmllr_num_frames{$iter}\n";
    }
    # an iteration that saw no frames contributes nothing (and would divide by zero).
    next if $fmllr_num_frames{$iter} == 0;
    $tot_auxf_impr += $fmllr_auxf_impr{$iter} / $fmllr_num_frames{$iter};
    $num_frames = $fmllr_num_frames{$iter};  # take the num-frames from the final iteration.
  }
  if ($tot_auxf_impr != 0.0 && $num_frames != 0.0) {
    return sprintf(" fmllr-impr=%.2f over %.2fh", $tot_auxf_impr, $num_frames / 360000.0);
  } else {
    return "";
  }
}

# Returns a string like " mllt:impr,logdet=1.23,-4.56" built from the
# mupdate.<iter>.log files, or "" if either total is zero.
sub get_mllt_info {
  # note: both the objective improvement and logdet are summed over
  # all the iterations of MLLT update.
  my ($mllt_objf_impr, $mllt_logdet) = (0.0, 0.0);

  foreach my $mupdate_log (grep { m/^mupdate.\d+.log$/ } @log_files) {
    open(F, "<$gmm_dir/log/$mupdate_log") || next;
    while (<F>) {
      next unless m/Overall objective function improvement for MLLT is (\S+) over \S+ frames, logdet is (\S+)/;
      $mllt_objf_impr += $1;
      $mllt_logdet += $2;
    }
    close(F);
  }
  if ($mllt_objf_impr == 0.0 || $mllt_logdet == 0.0) {
    return "";
  }
  return sprintf(" mllt:impr,logdet=%.2f,%.2f", $mllt_objf_impr, $mllt_logdet);
}

# Returns " tree-impr=<x>" using the last matching line of
# $gmm_dir/log/build_tree.log, or "" if that log cannot be opened or has no
# such line.
sub get_tree_info {
  $ans = "";
  open(F, "<$gmm_dir/log/build_tree.log") || return $ans;
  while (<F>) {
    next unless m/Including just phones that were split, improvement is (\S+) per frame/;
    $ans = sprintf(" tree-impr=%.2f", $1);
  }
  close(F);
  return $ans;
}

# Returns " lda-sum=<x>" using the last matching line of
# $gmm_dir/log/lda_est.log, or "" if that log cannot be opened or has no such
# line.
sub get_lda_info {
  $ans = "";
  open(F, "<$gmm_dir/log/lda_est.log") || return $ans;
  while (<F>) {
    next unless m/Sum of selected singular values is (\S+)/;
    $ans = sprintf(" lda-sum=%.2f", $1);
  }
  close(F);
  return $ans;
}


# Main program: print a one-line summary of the GMM directory.
@log_files = list_all_log_files();
exit(1) if @log_files == 0;

$num_jobs = get_num_jobs();  # will crash on failure.

# this prefix is all we print if none of the logs yields any information.
$output_string = "$gmm_dir: nj=$num_jobs";
$insufficient_output_string = $output_string;

# the alignment info is kept in a variable of its own because
# get_last_update_info() needs it as well.
$align_info = get_last_align_info();
$output_string .= $align_info;
$output_string .= get_last_update_info($align_info);
$output_string .= get_fmllr_info();
$output_string .= get_tree_info();
$output_string .= get_lda_info();
$output_string .= get_mllt_info();

print "$output_string\n";

# if we only had "$gmm_dir: nj=$num_jobs", then it's probably not a GMM dir:
# exit with status 1.
exit(1) if $output_string eq $insufficient_output_string;

exit(0);


================================================
FILE: egs/steps/info/nnet2_dir_info.pl
================================================
#!/usr/bin/perl -w

use Fcntl;

# we may at some point support options.

$debug = 0;  # set to 1 (via --debug) for debugging the script itself.

# Only look at $ARGV[0] when there is at least one argument: under -w,
# comparing an undefined value would print an "uninitialized value" warning
# in front of the usage message.
if (@ARGV > 0 && $ARGV[0] eq "--debug") {
  $debug = 1;
  shift @ARGV;
}

if (@ARGV == 0) {
  print STDERR "Usage: steps/info/nnet2_dir_info.pl [--debug] <nnet2-dir1> [<nnet2-dir2> ... ]\n" .
               "e.g: steps/info/nnet2_dir_info.pl exp/nnet2/nnet_a\n" .
               "This script extracts some important information from the logs\n" .
               "and displays it on a single (rather long) line.\n" .
               "The --debug option is just to debug the script itself.\n" .
               "This program exits with status 0 if it seems like the arguments\n" .
               "really were of the expected directory type, and 1 otherwise.\n";
  exit(1);
}

if (@ARGV > 1) {
  # repeatedly invoke this program with each of the remaining args; the exit
  # status is 1 if any of the invocations failed.
  $exit_status = 0;
  @debug_opt = $debug ? ("--debug") : ();
  foreach $dir (@ARGV) {
    # The list form of system() does not go through the shell, so directory
    # names containing spaces or shell metacharacters are passed on intact.
    if (system($0, @debug_opt, $dir) != 0) {
      $exit_status = 1;
    }
  }
  exit($exit_status);
}

# From here on exactly one directory is being processed.
$nnet_dir = shift @ARGV;

# Returns the directory entries of $nnet_dir/log exactly as readdir reports
# them, or the empty list if that directory cannot be opened.
sub list_all_log_files {
  opendir(my $log_dh, "$nnet_dir/log") || return ();
  my @entries = readdir($log_dh);
  closedir($log_dh);
  return @entries;
}


# Returns 1 if the diagnostics are finished on iteration $iter, else 0.
# "Finished" means that compute_prob_train.$iter.log and
# compute_prob_valid.$iter.log can both be opened and that each of them
# contains a line matching "Overall log-likelihood".
sub diagnostics_are_finished_on_iter {
  my $iter = shift @_;
  my $ans = 1;
  foreach my $subset ("train", "valid") {
    # a log that cannot be opened counts as "not finished".
    open(F, "<$nnet_dir/log/compute_prob_${subset}.$iter.log") || return 0;
    $found_loglike = 0;
    while (<F>) {
      $found_loglike = 1 if m/Overall log-likelihood/;
    }
    $ans = 0 unless $found_loglike;
    close(F);
  }
  return $ans;
}

# Get the number of iterations; iterations are numbered 0 .. num-iters-1 and
# are counted by looking for train.<iter>.1.log in %log_file_hash.
# Dies if not even train.0.1.log exists.
# The returned number may be slightly smaller than the number of training
# iterations found: we look back over at most the last 7 iterations for the
# most recent one whose compute_prob_train / compute_prob_valid diagnostics
# have finished, and report that iteration plus one.
sub get_num_iters {
  my $num_train_iters = 0;
  $num_train_iters++ while defined $log_file_hash{"train.$num_train_iters.1.log"};
  if ($num_train_iters == 0) {
    die "$nnet_dir does not seem to be an nnet3 neural net training directory.";
  }
  my $newest = $num_train_iters - 1;
  my $oldest = $newest - 6;
  if ($oldest < 0) { $oldest = 0; }
  # walk backwards from the newest iteration until the diagnostics are done.
  foreach my $candidate (reverse($oldest .. $newest)) {
    if (diagnostics_are_finished_on_iter($candidate)) {
      return $candidate + 1;
    }
    if ($debug) {
      print STDERR "nnet3_dir_info.pl: diagnostics not finished running on iteration $candidate\n";
    }
  }
  # OK, something's not right, just return the original iteration count.
  return $num_train_iters;
}

# Returns the number of parallel jobs on iteration 0, i.e. the largest N such
# that train.0.1.log .. train.0.N.log are all present in %log_file_hash.
# Dies if there is no train.0.1.log.
sub get_num_jobs_initial {
  my $num_jobs = 0;
  $num_jobs++ while defined $log_file_hash{"train.0." . ($num_jobs + 1) . ".log"};
  die "$nnet_dir does not seem to be an nnet3 neural net training directory."
    if $num_jobs == 0;
  return $num_jobs;
}


# Returns the number of parallel jobs on the last iteration ($num_iters - 1),
# counted the same way as in get_num_jobs_initial.  Dies if no job log exists
# for that iteration.
sub get_num_jobs_final {  # expects $num_iters to exist as a global variable.
  my $final_iter = $num_iters - 1;
  my $num_jobs = 0;
  $num_jobs++ while defined $log_file_hash{"train.$final_iter." . ($num_jobs + 1) . ".log"};
  die "$nnet_dir does not seem to be an nnet3 neural net training directory."
    if $num_jobs == 0;
  return $num_jobs;
}

# Returns a string with info about the combination stage, taken from the
# first matching line of $nnet_dir/log/combine.log, or the empty string if
# there was no combination stage (no such log, it cannot be opened, or it
# contains no matching line).
sub get_combine_info {
  if (!defined $log_file_hash{"combine.log"}) { return ""; }
  my $fh;
  if (!open($fh, "<$nnet_dir/log/combine.log")) { return ""; }
  my $info = "";
  while (<$fh>) {
    if (m/Combining nnets, objective function changed from (\S+) to (\S+)/) {
      $info = sprintf(" combine=%.2f->%.2f", $1, $2);
      last;
    }
  }
  # the handle is closed on every path, including when nothing matched.
  close($fh);
  return $info;
}

# this is used in get_loglike_and_accuracy_info to format strings like
# ' loglike:train/valid[32,48,final]=(-2.43,-2.32,-2.21/-2.84,-2.71,-2.68)'.
# Arguments: the name to print (e.g. 'loglike'), a reference to the array of
# iteration strings, and references to the hashes mapping iteration string to
# the train value and to the valid value.  Values are printed with two
# decimal places.  Only iterations that have both a train and a valid value
# are printed; if there are none, returns "".
sub get_printed_string {
  my ($name, $iters_array_ref, $train_hash_ref, $valid_hash_ref) = @_;
  my @iters_to_print =
    grep { defined($train_hash_ref->{$_}) && defined($valid_hash_ref->{$_}) }
         @$iters_array_ref;
  return "" if @iters_to_print == 0;
  my $joined_iters = join(",", @iters_to_print);
  my $joined_train_values =
    join(",", map { sprintf("%.2f", $train_hash_ref->{$_}) } @iters_to_print);
  my $joined_valid_values =
    join(",", map { sprintf("%.2f", $valid_hash_ref->{$_}) } @iters_to_print);
  return " ${name}:train/valid[$joined_iters]=($joined_train_values/$joined_valid_values)";
}

# invoke this as get_loglike_and_accuracy_info($iter1, $iter2,..) where $iterN
# is the string-valued iteration, e.g. "92", or "final", or "combined", such
# that we expect $nnet_dir/log/compute_prob_{train,valid}.$iterN.log to exist.
# An iteration is only parsed if both logs are listed in %log_file_hash and
# both can be opened.  Returns the "loglike" and "accuracy" strings produced
# by get_printed_string, concatenated.
sub get_loglike_and_accuracy_info {
  my @iters_array = @_;
  my (%iter_to_train_loglike, %iter_to_valid_loglike);
  my (%iter_to_train_accuracy, %iter_to_valid_accuracy);

  foreach my $iter (@iters_array) {
    next unless defined $log_file_hash{"compute_prob_train.$iter.log"};
    next unless defined $log_file_hash{"compute_prob_valid.$iter.log"};
    next unless open(F, "<$nnet_dir/log/compute_prob_train.$iter.log");
    next unless open(G, "<$nnet_dir/log/compute_prob_valid.$iter.log");
    # the train log (F) and then the valid log (G) are parsed identically,
    # each into its own pair of hashes.
    foreach my $spec ([\*F, \%iter_to_train_loglike, \%iter_to_train_accuracy],
                      [\*G, \%iter_to_valid_loglike, \%iter_to_valid_accuracy]) {
      my ($fh, $loglike, $accuracy) = @$spec;
      while (<$fh>) {
        next unless m/average probability is (\S+) and accuracy is (\S+) with total weight \S+/;
        $loglike->{$iter} = $1;
        $accuracy->{$iter} = $2;
      }
      close($fh);
    }
  }
  $ans = get_printed_string("loglike", \@iters_array, \%iter_to_train_loglike,
                            \%iter_to_valid_loglike) .
         get_printed_string("accuracy", \@iters_array, \%iter_to_train_accuracy,
                            \%iter_to_valid_accuracy);
  return $ans;
}

# invoke this as get_progress_info($iter), e.g. set $iter to the last
# iteration number.  Reads $nnet_dir/log/progress.$iter.log and returns a
# string with the number of parameters and the input/output dimensions, as
# far as they were found; returns "" if the log is not listed or cannot be
# opened.
sub get_progress_info {
  my $iter = shift @_;
  return "" unless defined $log_file_hash{"progress.$iter.log"};
  return "" unless open(F, "<$nnet_dir/log/progress.$iter.log");

  my $num_parameters = "0";
  my ($input_dim, $output_dim) = (0, 0);
  while (<F>) {
    # the checks are independent: every one is applied to every line.
    if (m/^parameter-dim (\S+)/) {
      $num_parameters = sprintf("%.1fM", $1 / 1000000.0);
    }
    if (m/^input-dim (\S+)/) { $input_dim = $1; }
    if (m/^output-dim (\S+)/) { $output_dim = $1; }
  }
  close(F);

  $ans = "";
  $ans .= " num-params=$num_parameters" if $num_parameters ne "0";
  if ($output_dim > 0) {
    if ($input_dim > 0) {
      $ans .= " dim=$input_dim->$output_dim";
    } else {
      $ans .= " output-dim=$output_dim";
    }
  }
  return $ans;
}

# Returns true if we seem to have finished training, i.e. if a
# compute_prob_train log for the "final" or the "combined" model is listed in
# %log_file_hash; otherwise returns false.
sub finished_training {
  return 1 if defined $log_file_hash{"compute_prob_train.final.log"};
  return defined $log_file_hash{"compute_prob_train.combined.log"};
}

# Main program: list the log files, work out the iteration and job counts,
# and print a one-line summary for $nnet_dir.
@log_files = list_all_log_files();
if (@log_files == 0) {  exit(1); }
# Reset the hash (not a scalar of the same name) that the subroutines index
# by log-file name.
%log_file_hash = ();
foreach $f (@log_files) { $log_file_hash{$f} = 1; }

$num_iters = get_num_iters();
$num_jobs_initial = get_num_jobs_initial();
$num_jobs_final = get_num_jobs_final();
$last_iter = $num_iters - 1;
$two_thirds_iter = int($last_iter * 0.666);

$output_string = "$nnet_dir: num-iters=$num_iters";

$output_string .= " nj=$num_jobs_initial..$num_jobs_final";

$output_string .= get_progress_info("$last_iter");

$output_string .= get_combine_info();


# note: IIRC some of the scripts use the name 'combined' for the model after
# combination, and some 'final', so we try both; only one of these will
# actually produce any output.


@iters_array = ("$two_thirds_iter", "$last_iter", "final", "combined");

$output_string .= get_loglike_and_accuracy_info(@iters_array);

print "$output_string\n";

exit(0);


================================================
FILE: egs/steps/info/nnet3_dir_info.pl
================================================
#!/usr/bin/perl -w

use Fcntl;

# we may at some point support options.

$debug = 0;  # we set it to 1 for debugging the script itself.

# Check @ARGV first so that running with no arguments reaches the usage
# message without an "uninitialized value" warning under -w.
if (@ARGV > 0 && $ARGV[0] eq "--debug") {
  $debug = 1;
  shift @ARGV;
}

if (@ARGV == 0) {
  print STDERR "Usage: steps/info/nnet3_dir_info.pl [--debug] <nnet3-dir1> [<nnet3-dir2> ... ]\n" .
               "e.g: steps/info/nnet3_dir_info.pl exp/nnet3/tdnn_sp\n" .
               "This script extracts some important information from the logs\n" .
               "and displays it on a single (rather long) line.\n" .
               "The --debug option is just to debug the script itself.\n" .
               "This program exits with status 0 if it seems like the arguments\n" .
               "really were of the expected directory type, and 1 otherwise.\n";
  exit(1);
}

if (@ARGV > 1) {
  # repeatedly invoke this program with each of the remaining args; the
  # overall exit status is 1 if any of the invocations failed.
  $exit_status = 0;
  my @debug_opt = $debug ? ("--debug") : ();
  foreach $dir (@ARGV) {
    # the list form of system() bypasses the shell, so directory names
    # containing spaces or shell metacharacters are passed through intact.
    if (system($0, @debug_opt, $dir) != 0) {
      $exit_status = 1;
    }
  }
  exit($exit_status);
}

# from this point we can assume we're invoked with one argument.
$nnet_dir = shift @ARGV;

# Returns the names of all entries in $nnet_dir/log (as given by readdir),
# or the empty list if that directory cannot be opened.
sub list_all_log_files {
  opendir(my $log_dh, "$nnet_dir/log") or return ();
  my @entries = readdir($log_dh);
  closedir($log_dh);
  return @entries;
}


# Returns 1 if the diagnostics are finished on iteration $iter, else 0.
# "Finished" means that both compute_prob_train.$iter.log and
# compute_prob_valid.$iter.log can be opened and each contains a line
# matching 'Overall log-likelihood'.
sub diagnostics_are_finished_on_iter {
  my ($iter) = @_;
  my $ans = 1;
  foreach my $subset ("train", "valid") {
    # a log that cannot be opened means "not finished", immediately.
    open(F, "<$nnet_dir/log/compute_prob_${subset}.$iter.log") or return 0;
    $found_loglike = 0;
    while (my $line = <F>) {
      $found_loglike = 1 if $line =~ m/Overall log-likelihood/;
    }
    close(F);
    $ans = 0 unless $found_loglike;
  }
  return $ans;
}

# Returns the number of training iterations (iterations are numbered from 0
# to num-iters-1), found by counting consecutive train.<iter>.1.log entries
# in %log_file_hash.  Dies if there are none.
# The value returned may be slightly smaller than the real count: we look
# back over up to 7 of the most recent iterations for one whose
# compute_prob_{train,valid} diagnostics have finished, and report that
# iteration as the last one.
sub get_num_iters {
  my $iter = 0;
  $iter++ while (defined $log_file_hash{"train.$iter.1.log"});
  if ($iter == 0) {
    die "$nnet_dir does not seem to be an nnet3 neural net training directory.";
  }
  my $last_iter = $iter - 1;
  my $earliest_candidate = $last_iter - 6;
  if ($earliest_candidate < 0) { $earliest_candidate = 0; }
  # walk backwards from the newest iteration to the oldest candidate.
  foreach my $candidate (reverse($earliest_candidate .. $last_iter)) {
    if (diagnostics_are_finished_on_iter($candidate)) {
      return $candidate + 1;
    }
    if ($debug) {
      print STDERR "nnet3_dir_info.pl: diagnostics not finished running on iteration $candidate\n";
    }
  }
  # none of the candidates had finished diagnostics; fall back to the raw count.
  return $iter;
}

# Returns the number of parallel jobs on iteration 0, i.e. the largest N
# such that train.0.1.log .. train.0.N.log are all in %log_file_hash.
# Dies if N would be zero.
sub get_num_jobs_initial {
  my $num_jobs = 0;
  $num_jobs++ while (defined $log_file_hash{"train.0." . ($num_jobs + 1) . ".log"});
  if ($num_jobs == 0) {
    die "$nnet_dir does not seem to be an nnet3 neural net training directory.";
  }
  return $num_jobs;
}


# Returns the number of parallel jobs on the last iteration, i.e. the
# largest N such that train.<last>.1.log .. train.<last>.N.log are all in
# %log_file_hash, where <last> is $num_iters - 1.  Expects $num_iters to
# exist as a global variable.  Dies if N would be zero.
sub get_num_jobs_final {
  my $final_iter = $num_iters - 1;
  my $num_jobs = 0;
  $num_jobs++
    while (defined $log_file_hash{"train.$final_iter." . ($num_jobs + 1) . ".log"});
  if ($num_jobs == 0) {
    die "$nnet_dir does not seem to be an nnet3 neural net training directory.";
  }
  return $num_jobs;
}

# Returns a string with info about the combination stage, e.g.
# " combine=-0.12->-0.11" or " combine=-0.12->-0.11 (over 5)", taken from
# the first matching line of log/combine.log; returns the empty string if
# there was no combination stage or no such line could be found.
sub get_combine_info {
  if (!defined $log_file_hash{"combine.log"} ||
      !open(F, "<$nnet_dir/log/combine.log")) {
    return "";
  }
  my $info = "";
  while (<F>) {
    if (m/Combining nnets, objective function changed from (\S+) to (\S+)/) {
      $info = sprintf(" combine=%.2f->%.2f", $1, $2);
      last;
    } elsif (m/Combining (\S+) nnets, objective function changed from (\S+) to (\S+)/) {
      $info = sprintf(" combine=%.2f->%.2f (over %d)", $2, $3, $1);
      last;
    }
  }
  # close on every path, including the one where no line matched.
  close(F);
  return $info;
}

# Formats $value with 2, 3 or 4 decimal places.  More digits are used when
# the value is close to zero, or (only when $name is "accuracy") when it is
# close to one, so that small differences remain visible.
sub number_to_string {
  my ($value, $name) = @_;
  my $magnitude = abs($value);
  my $is_accuracy = ($name eq "accuracy");
  my $precision = 2;
  if ($magnitude < 0.02 || ($is_accuracy && $magnitude > 0.97)) {
    $precision = 4;
  } elsif ($magnitude < 0.2 || ($is_accuracy && $magnitude > 0.7)) {
    $precision = 3;
  }
  return sprintf("%.${precision}f", $value);  # format is e.g. "%.2f"
}

# Used by get_loglike_and_accuracy_info to format strings like
# ' loglike:train/valid[32,48,final]=(-2.43,-2.32,-2.21/-2.84,-2.71,-2.68)'.
# Arguments: $name (e.g. 'loglike'), a reference to the array of iteration
# strings, and references to the train and valid hashes mapping
# iteration-string to value.  Only iterations present in both hashes are
# printed; if there are none, the empty string is returned.
sub get_printed_string {
  my ($name, $iters_array_ref, $train_hash_ref, $valid_hash_ref) = @_;
  my @iters_to_print = grep {
    defined($train_hash_ref->{$_}) && defined($valid_hash_ref->{$_})
  } @$iters_array_ref;
  return "" unless @iters_to_print;

  my $joined_iters = join(",", @iters_to_print);
  my $joined_train_values =
    join(",", map { number_to_string($train_hash_ref->{$_}, $name) } @iters_to_print);
  my $joined_valid_values =
    join(",", map { number_to_string($valid_hash_ref->{$_}, $name) } @iters_to_print);
  return " ${name}:train/valid[$joined_iters]=($joined_train_values/$joined_valid_values)";
}

# Invoke this as get_loglike_and_accuracy_info($iter1, $iter2, ..) where
# $iterN is a string-valued iteration, e.g. "92", or "final", or "combined",
# such that we expect $nnet_dir/log/compute_prob_{train,valid}.$iterN.log to
# exist.  Iterations whose logs are not listed in %log_file_hash, or cannot
# be opened, are skipped.  Returns the concatenation of the 'loglike' and
# 'accuracy' strings produced by get_printed_string.
sub get_loglike_and_accuracy_info {
  my @iters_array = @_;
  my (%iter_to_train_loglike, %iter_to_valid_loglike);
  my (%iter_to_train_accuracy, %iter_to_valid_accuracy);

  foreach my $iter (@iters_array) {
    my $train_log = "compute_prob_train.$iter.log";
    my $valid_log = "compute_prob_valid.$iter.log";
    next unless (defined $log_file_hash{$train_log} &&
                 defined $log_file_hash{$valid_log});
    # both logs have to open before anything is recorded for this iteration.
    next unless (open(F, "<$nnet_dir/log/$train_log") &&
                 open(G, "<$nnet_dir/log/$valid_log"));
    while (my $line = <F>) {
      if ($line =~ m/Overall log-likelihood for 'output' is (\S+) per frame/) {
        $iter_to_train_loglike{$iter} = $1;
      } elsif ($line =~ m/Overall accuracy for 'output' is (\S+) per frame/) {
        $iter_to_train_accuracy{$iter} = $1;
      }
    }
    close(F);
    while (my $line = <G>) {
      if ($line =~ m/Overall log-likelihood for 'output' is (\S+) per frame/) {
        $iter_to_valid_loglike{$iter} = $1;
      } elsif ($line =~ m/Overall accuracy for 'output' is (\S+) per frame/) {
        $iter_to_valid_accuracy{$iter} = $1;
      }
    }
    close(G);
  }

  $ans = get_printed_string("loglike", \@iters_array, \%iter_to_train_loglike,
                            \%iter_to_valid_loglike);
  $ans .= get_printed_string("accuracy", \@iters_array, \%iter_to_train_accuracy,
                             \%iter_to_valid_accuracy);
  return $ans;
}

# Summarizes the model described in log/progress.<iter>.log.
# Call as get_progress_info($iter), normally with the last iteration number.
# Returns a string that may contain the parameter count, the largest
# clipped-proportion seen (only if above 0.1; emphasized with ** if above
# 0.3) and the input/ivector/output dimensions.  Returns "" if the log is
# not listed in %log_file_hash or cannot be opened.
sub get_progress_info {
  my ($iter) = @_;
  my $log_name = "progress.$iter.log";
  return "" unless defined $log_file_hash{$log_name};
  return "" unless open(F, "<$nnet_dir/log/$log_name");

  my $num_parameters = "0";
  my ($output_dim, $input_dim, $ivector_dim) = (0, 0, 0);
  my $max_clipped_proportion = 0.0;
  while (<F>) {
    # each pattern is tested independently on every line.
    if (m/clipped-proportion=([^,]+)/ && $1 > $max_clipped_proportion) {
      $max_clipped_proportion = $1;
    }
    if (m/^num-parameters: (\S+)/) {
      $num_parameters = sprintf("%.1fM", $1 / 1000000.0);
    }
    if (m/^output-node.* name=output .*dim=(\S+)/) { $output_dim = $1; }
    if (m/^input-node.* name=input .*dim=(\S+)/) { $input_dim = $1; }
    if (m/^input-node.* name=ivector .*dim=(\S+)/) { $ivector_dim = $1; }
  }
  close(F);

  $ans = ($num_parameters ne "0") ? " num-params=$num_parameters" : "";
  if ($max_clipped_proportion > 0.3) {
    $ans .= " **max-clipped-proportion=$max_clipped_proportion**";  # for emphasis; this generally isn't good.
  } elsif ($max_clipped_proportion > 0.1) {
    $ans .= " max-clipped-proportion=$max_clipped_proportion";
  }
  if ($output_dim > 0) {
    if ($input_dim > 0) {
      if ($ivector_dim > 0) {
        $ans .= " dim=$input_dim+$ivector_dim->$output_dim";
      } else {
        $ans .= " dim=$input_dim->$output_dim";
      }
    } else {
      $ans .= " output-dim=$output_dim";
    }
  }
  return $ans;
}

# Returns true if training seems to have finished, i.e. the log listing
# contains the diagnostics for either the 'final' or the 'combined' model;
# returns false otherwise.
sub finished_training {
  my $final_done = defined $log_file_hash{"compute_prob_train.final.log"};
  my $combined_done = defined $log_file_hash{"compute_prob_train.combined.log"};
  return $final_done || $combined_done;
}

# Main program: list the log files, work out the iteration and job counts,
# and print a one-line summary for $nnet_dir.  Exits with status 1 (and no
# output) if the log directory is missing or empty.
@log_files = list_all_log_files();
if (@log_files == 0) {  exit(1); }
# Reset the hash (not a scalar of the same name) that the subroutines index
# by log-file name.
%log_file_hash = ();
foreach $f (@log_files) { $log_file_hash{$f} = 1; }

$num_iters = get_num_iters();
$num_jobs_initial = get_num_jobs_initial();
$num_jobs_final = get_num_jobs_final();
$last_iter = $num_iters - 1;
$two_thirds_iter = int($last_iter * 0.666);

$output_string = "$nnet_dir: num-iters=$num_iters";

$output_string .= " nj=$num_jobs_initial..$num_jobs_final";

$output_string .= get_progress_info("$last_iter");

$output_string .= get_combine_info();


# note: IIRC some of the scripts use the name 'combined' for the model after
# combination, and some 'final', so we try both; only one of these will
# actually produce any output.


@iters_array = ("$two_thirds_iter", "$last_iter", "final", "combined");

$output_string .= get_loglike_and_accuracy_info(@iters_array);

print "$output_string\n";

exit(0);


================================================
FILE: egs/steps/info/nnet3_disc_dir_info.pl
================================================
#!/usr/bin/perl -w

use Fcntl;

# we may at some point support options.

$debug = 0;  # we set it to 1 for debugging the script itself.

# Check @ARGV first so that running with no arguments reaches the usage
# message without an "uninitialized value" warning under -w.
if (@ARGV > 0 && $ARGV[0] eq "--debug") {
  $debug = 1;
  shift @ARGV;
}

if (@ARGV == 0) {
  print STDERR "Usage: steps/info/nnet3_disc_dir_info.pl [--debug] <nnet3-disc-dir1> [<nnet3-disc-dir2> ... ]\n" .
               "e.g: steps/info/nnet3_disc_dir_info.pl exp/nnet3/tdnn_sp_smbr\n" .
               "This script extracts some important information from the logs\n" .
               "and displays it on a few lines.\n" .
               "The --debug option is just to debug the script itself.\n" .
               "This program exits with status 0 if it seems like the argument\n" .
               "really was a discriminative-training dir, and 1 otherwise.\n";
  exit(1);
}

if (@ARGV > 1) {
  # repeatedly invoke this program with each of the remaining args; the
  # overall exit status is 1 if any of the invocations failed.
  $exit_status = 0;
  my @debug_opt = $debug ? ("--debug") : ();
  foreach $dir (@ARGV) {
    # the list form of system() bypasses the shell, so directory names
    # containing spaces or shell metacharacters are passed through intact.
    if (system($0, @debug_opt, $dir) != 0) {
      $exit_status = 1;
    }
  }
  exit($exit_status);
}

# from this point we can assume we're invoked with one argument.
$nnet_dir = shift @ARGV;

# This function returns an array of iteration numbers, one
# for each epoch that has already completed (but including
# epoch zero)... e.g.
# it might return (0, 194, 388, 582).
# This is done by reading the soft links, e.g. epoch1.mdl ->194.mdl,
# for n = 0, 1, 2, ... until a link is missing.  Dies if epoch0.mdl is not a
# link, or if a link target does not look like <number>.mdl.
sub get_iters_for_epochs {
  my @ans = ();
  for (my $n = 0; 1; $n++) {
    my $epoch_link = "$nnet_dir/epoch$n.mdl";
    if (! -l $epoch_link) {
      if (@ans == 0) {
        die "$nnet_dir does not seem to be a discriminative-training dir " .
          "(expected a link $nnet_dir/epoch0.mdl)";
      }
      return @ans;
    }
    my $link_name = readlink($epoch_link);
    # the dot is escaped so that only a literal ".mdl" suffix is accepted.
    if ($link_name =~ m/^(\d+)\.mdl/) {
      push @ans, $1;
    } else {
      die "unexpected link name $nnet_dir/epoch$n.mdl -> $link_name";
    }
  }
}


# Returns the number of parallel jobs on iteration 0, i.e. the largest N
# such that $nnet_dir/log/train.0.1.log .. train.0.N.log all exist as plain
# files.  Dies if train.0.1.log does not exist.
sub get_num_jobs {
  my $num_jobs = 0;
  $num_jobs++ while (-f "$nnet_dir/log/train.0." . ($num_jobs + 1) . ".log");
  if ($num_jobs == 0) {
    die "$nnet_dir does not seem to be a discriminative-training dir " .
      "(expected $nnet_dir/log/train.0.1.log to exist)";
  }
  return $num_jobs;
}

# Returns a string describing the effective learning rate and possibly
# any last-layer-factor, parsed from the first usable '--edits' line of
# $nnet_dir/log/convert.log.  The effective learning rate is the actual
# learning rate divided by the number of jobs.  Returns "lrate=??" if
# nothing could be parsed; dies if convert.log does not exist.
sub get_effective_learning_rate_str {
  my $convert_log = "$nnet_dir/log/convert.log";
  if (! -f $convert_log) {
    die("Expected file $convert_log to exist");
  }
  open(F, "<$convert_log");
  my $result = "lrate=??";  # used if we could not parse it from the log.
  while (<F>) {
    next unless m/--edits/;
    if (m/set-learning-rate learning-rate=(\S+); set-learning-rate name=output.affine learning-rate=([^"']+)["']/) {
      my ($learning_rate, $output_learning_rate) = ($1, $2);
      my $last_layer_factor = sprintf("%.2f", $output_learning_rate / $learning_rate);
      my $effective_learning_rate = sprintf("%.3g", $learning_rate / get_num_jobs());
      $result = "effective-lrate=$effective_learning_rate;last-layer-factor=$last_layer_factor";
      last;
    } elsif (m/set-learning-rate learning-rate=([^"']+)["']/) {
      my $learning_rate = $1;
      my $effective_learning_rate = sprintf("%.3g", $learning_rate / get_num_jobs());
      $result = "effective-lrate=$effective_learning_rate";
      last;
    }
  }
  close(F);
  return $result;
}


# Helper for get_objf_str: reads one compute_objf_*.log file and returns the
# pair (objf, count), each formatted with two decimals, or "??" for any
# value not found.  If the file cannot be opened a warning is printed and
# ("??", "??") is returned without attempting to read.
sub _read_objf_and_count {
  my ($log_path) = @_;
  my ($objf, $count) = ("??", "??");
  my $fh;
  if (!open($fh, "<", $log_path)) {
    print STDERR "$0: warning: Expected file $log_path to exist\n";
    return ($objf, $count);
  }
  while (my $line = <$fh>) {
    if ($line =~ m/num\+den count.+is (\S+) per frame/) { $count = sprintf("%.2f", $1); }
    if ($line =~ m/Overall.+ is (\S+) per frame/) { $objf = sprintf("%.2f", $1); }
  }
  close($fh);
  return ($objf, $count);
}

# Returns a string with some info about the objective function, like:
# iters-per-epoch=123;epoch[0,1,2,3,4]:train-objf=[0.89,0.92,0.93,0.94],valid-objf=[...],train-counts=[...],valid-counts=[...]
# the "counts" are the average num+den occupation counts in the lattices; it's a measure of how much confusability
# there still is in the lattices.
# Dies if no epoch beyond epoch zero has finished.
sub get_objf_str {
  my @iters_for_epochs = get_iters_for_epochs();
  if (@iters_for_epochs == 1) {
    die("No epochs have finished in directory $nnet_dir")
  }
  my $iters_per_epoch = $iters_for_epochs[1] - $iters_for_epochs[0];
  my $ans = "iters-per-epoch=$iters_per_epoch";
  $ans .= ";epoch[" . join(",", 0..$#iters_for_epochs) . "]:";
  my (@train_objfs, @train_counts, @valid_objfs, @valid_counts);
  foreach my $epoch_iter (@iters_for_epochs) {
    # the diagnostics for the iteration an epoch ends on will not exist, so
    # use the one before it; work on a copy so the epoch list is not changed.
    my $iter = ($epoch_iter > 0) ? $epoch_iter - 1 : $epoch_iter;
    my ($train_objf, $train_count) =
      _read_objf_and_count("$nnet_dir/log/compute_objf_train.$iter.log");
    my ($valid_objf, $valid_count) =
      _read_objf_and_count("$nnet_dir/log/compute_objf_valid.$iter.log");
    push @train_objfs, $train_objf;
    push @train_counts, $train_count;
    push @valid_objfs, $valid_objf;
    push @valid_counts, $valid_count;
  }
  $ans .= "train-objf=[" . join(",", @train_objfs) .
       "],valid-objf=[" . join(",", @valid_objfs) .
       "],train-counts=[" . join(",", @train_counts) .
       "],valid-counts=[" . join(",", @valid_counts) . "]";
  return $ans;
}


# Main program: print the ';'-separated summary (number of jobs, learning
# rate info, objective function info) for $nnet_dir.
$output_string = join(";",
                      "$nnet_dir:num-jobs=" . get_num_jobs(),
                      get_effective_learning_rate_str(),
                      get_objf_str());

print "$output_string\n";

exit(0);


================================================
FILE: egs/steps/libs/__init__.py
================================================


# Copyright 2016    Vimal Manohar
# Apache 2.0.

""" This package contains modules and subpackages used in kaldi scripts.
"""

from . import common

__all__ = ["common"]


================================================
FILE: egs/steps/libs/common.py
================================================


# Copyright 2016 Vijayaditya Peddinti.
#           2016 Vimal Manohar
#           2017 Johns Hopkins University (author: Daniel Povey)
# Apache 2.0

""" This module contains several utility functions and classes that are
commonly used in many kaldi python scripts.
"""

from __future__ import print_function
from __future__ import division
import argparse
import logging
import math
import os
import subprocess
import sys
import threading

try:
    import thread as thread_module
except:
    import _thread as thread_module

logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())


def send_mail(message, subject, email_id):
    """ Launches a shell pipeline that echoes 'message' into the 'mail'
        command with the given 'subject', addressed to 'email_id'.  The
        process is not waited for.  If launching it raises, the error is
        logged at info level and otherwise ignored.
    """
    try:
        mail_command = 'echo "{message}" | mail -s "{subject}" {email}'.format(
            message=message,
            subject=subject,
            email=email_id)
        subprocess.Popen(mail_command, shell=True)
    except Exception as err:
        logger.info("Unable to send mail due to error:\n {error}".format(
                        error=str(err)))


def str_to_bool(value):
    """ Converts the shell-style strings "true"/"false" to True/False.
        Raises ValueError for anything else.
    """
    if value != "true" and value != "false":
        raise ValueError
    return value == "true"


class StrToBoolAction(argparse.Action):
    """ An argparse action that stores a shell-style boolean ("true" or
        "false") as the corresponding python bool.  Any other value raises
        an Exception naming the value and the option. """

    def __call__(self, parser, namespace, values, option_string=None):
        try:
            parsed = str_to_bool(values)
        except ValueError:
            raise Exception(
                "Unknown value {0} for --{1}".format(values, self.dest))
        setattr(namespace, self.dest, parsed)


class NullstrToNoneAction(argparse.Action):
    """ An argparse action that stores None when the option value is empty
    or only whitespace, and stores the value unchanged otherwise.  Shell
    scripts pass an empty string for an unset variable; None is the more
    natural representation in python. """

    def __call__(self, parser, namespace, values, option_string=None):
        is_blank = values.strip() == ""
        setattr(namespace, self.dest, None if is_blank else values)


class smart_open(object):
    """
    A context manager for opening files, for use with the "with" construct.
    It behaves like the python open() function, except that the filename
    "-" stands for sys.stdout (mode "w") or sys.stdin (mode "r").  The
    standard streams are not closed on exit; real files are.

    e.g.: with smart_open(filename, 'w') as fh:
            print ("foo", file=fh)
    """
    def __init__(self, filename, mode="r"):
        self.filename = filename
        self.mode = mode
        assert self.mode == "w" or self.mode == "r"

    def __enter__(self):
        wants_std_stream = self.filename == "-" and self.mode in ("w", "r")
        if not wants_std_stream:
            self.file_handle = open(self.filename, self.mode)
        elif self.mode == "w":
            self.file_handle = sys.stdout
        else:
            self.file_handle = sys.stdin
        return self.file_handle

    def __exit__(self, *args):
        if self.filename == "-":
            return
        self.file_handle.close()


def check_if_cuda_compiled():
    """ Runs the 'cuda-compiled' program and returns False if it exits with
        status 1, and True for any other exit status.
    """
    process = subprocess.Popen("cuda-compiled")
    process.communicate()
    return process.returncode != 1


def execute_command(command):
    """ Runs a kaldi job in the foreground and waits for it to complete; raises an
        exception if its return status is nonzero.  The command is executed in
        'shell' mode so 'command' can involve things like pipes.  Often,
        'command' will start with 'run.pl' or 'queue.pl'.  The stdout and stderr
        are merged with the calling process's stdout and stderr so they will
        appear on the screen.

        See also: get_command_stdout, background_command
    """
    p = subprocess.Popen(command, shell=True)
    p.communicate()
    # Compare by value: 'is not 0' tests object identity, which is an
    # implementation detail for ints and a SyntaxWarning in newer pythons.
    if p.returncode != 0:
        raise Exception("Command exited with status {0}: {1}".format(
                p.returncode, command))


def get_command_stdout(command, require_zero_status = True):
    """ Executes a command and returns its stdout output as a string.  The
        command is executed with shell=True, so it may contain pipes and
        other shell constructs.

        If require_zero_status is True, this function will raise an exception if
        the command has nonzero exit status.  If False, it just logs a warning
        if the exit status is nonzero.

        See also: execute_command, background_command
    """
    p = subprocess.Popen(command, shell=True,
                         stdout=subprocess.PIPE)

    stdout = p.communicate()[0]
    # Compare by value: 'is not 0' tests object identity, which is an
    # implementation detail for ints and a SyntaxWarning in newer pythons.
    if p.returncode != 0:
        output = "Command exited with status {0}: {1}".format(
            p.returncode, command)
        if require_zero_status:
            raise Exception(output)
        else:
            logger.warning(output)
    # communicate() gives bytes on python 3 and str on python 2.
    return stdout if isinstance(stdout, str) else stdout.decode()


def wait_for_background_commands():
    """ Joins every thread reported by threading.enumerate() other than the
        calling one.  Typically called at the end of a program that has
        launched background threads, so that it waits for them (and the
        child processes they watch) before exiting."""
    for other in threading.enumerate():
        if other == threading.current_thread():
            continue
        other.join()

def background_command(command, require_zero_status = False):
    """Starts 'command' without waiting for it, like running with '&' in the
       shell, and hands the process to a separate thread that waits for it
       to finish.  'command' is executed in 'shell' mode, so it may contain
       pipes and other shell constructs.  Set require_zero_status to True if
       the program should die when the command eventually returns with
       nonzero status.

       Returns the Thread object created, in case you want to wait for that
       specific command to finish.  For example:
             thread = background_command('foo | bar')
             # do something else while waiting for it to finish
             thread.join()

       See also:
         - wait_for_background_commands(), which can be used
           at the end of the program to wait for all these commands to terminate.
         - execute_command() and get_command_stdout(), which allow you to
           execute commands in the foreground.

    """

    process = subprocess.Popen(command, shell=True)
    waiter = threading.Thread(
        target=background_command_waiter,
        kwargs={"command": command,
                "popen_object": process,
                "require_zero_status": require_zero_status})
    # daemon, so that it exits if the main thread is terminated abnormally.
    waiter.daemon = True
    waiter.start()
    return waiter


def background_command_waiter(command, popen_object, require_zero_status):
    """ This is the function that is called from background_command, in
        a separate thread.  It waits for 'popen_object' to finish; on a
        nonzero exit status it logs the failure, and if require_zero_status
        is True it also interrupts the main thread."""

    popen_object.communicate()
    # Compare by value: 'is not 0' tests object identity, which is an
    # implementation detail for ints and a SyntaxWarning in newer pythons.
    if popen_object.returncode != 0:
        message = "Command exited with status {0}: {1}".format(
            popen_object.returncode, command)
        if require_zero_status:
            logger.error(message)
            # thread.interrupt_main() sends a KeyboardInterrupt to the main
            # thread, which will generally terminate the program.
            thread_module.interrupt_main()
        else:
            logger.warning(message)


def get_number_of_leaves_from_tree(alidir):
    """ Returns the number following 'num-pdfs' in the output of
        'tree-info {alidir}/tree'.  Raises an Exception if it is 0.
    """
    tree_info = get_command_stdout(
        "tree-info {0}/tree 2>/dev/null | grep num-pdfs".format(alidir))
    fields = tree_info.split()
    assert(fields[0] == "num-pdfs")
    num_leaves = int(fields[1])
    if not num_leaves:
        raise Exception("Number of leaves is 0")
    return num_leaves


def get_number_of_leaves_from_model(dir):
    """ Returns the number of pdfs reported by 'am-info {dir}/final.mdl',
        taken from a line of the form 'number of pdfs 7115'.  Raises an
        Exception if it is 0.
    """
    am_info = get_command_stdout(
        "am-info {0}/final.mdl 2>/dev/null | grep -w pdfs".format(dir))
    fields = am_info.split()
    # e.g. fields == ['number', 'of', 'pdfs', '7115']
    assert(fields[0:3] == ["number", "of", "pdfs"])
    num_leaves = int(fields[3])
    if not num_leaves:
        raise Exception("Number of leaves is 0")
    return num_leaves


def get_number_of_jobs(alidir):
    """ Returns the integer on the first line of '{alidir}/num_jobs'.
        If the file cannot be read or does not hold an integer, the error
        is logged and SystemExit(1) is raised.
    """
    try:
        # 'with' makes sure the file is closed again, also on errors.
        with open('{0}/num_jobs'.format(alidir)) as num_jobs_file:
            num_jobs = int(num_jobs_file.readline().strip())
    except (IOError, ValueError):
        logger.error("Exception while reading the "
                     "number of alignment jobs: ", exc_info=True)
        raise SystemExit(1)
    return num_jobs


def get_ivector_dim(ivector_dir=None):
    """ Returns the dimension printed by 'feat-to-dim' for
        '{ivector_dir}/ivector_online.scp', or 0 if ivector_dir is None.
    """
    if ivector_dir is not None:
        command = ("feat-to-dim --print-args=false "
                   "scp:{dir}/ivector_online.scp -".format(dir=ivector_dir))
        return int(get_command_stdout(command))
    return 0

def get_ivector_extractor_id(ivector_dir=None):
    """ Returns the stripped output of 'steps/nnet2/get_ivector_id.sh
        {ivector_dir}', or None if ivector_dir is None or the script prints
        nothing.
    """
    if ivector_dir is None:
        return None
    stdout_val = get_command_stdout(
        "steps/nnet2/get_ivector_id.sh {dir}".format(dir=ivector_dir))

    # Test for None before calling strip(); the other order could never
    # reach the None check.
    if stdout_val is None:
        return None
    extractor_id = stdout_val.strip()
    if extractor_id == "":
        return None

    return extractor_id

def get_feat_dim(feat_dir):
    """ Returns the dimension printed by 'feat-to-dim' for
        '{feat_dir}/feats.scp', or 0 if feat_dir is None.
    """
    if feat_dir is not None:
        command = ("feat-to-dim --print-args=false "
                   "scp:{data}/feats.scp -".format(data=feat_dir))
        return int(get_command_stdout(command))
    return 0


def get_feat_dim_from_scp(feat_scp):
    """ Returns the dimension printed by 'feat-to-dim' for the scp file
        'feat_scp'.
    """
    command = ("feat-to-dim --print-args=false "
               "scp:{feat_scp} -".format(feat_scp=feat_scp))
    return int(get_command_stdout(command))


def read_kaldi_matrix(matrix_file):
    """This function reads a kaldi matrix stored in text format from
    'matrix_file' and stores it as a list of rows, where each row is a list.
    Each entry is parsed as a float and then truncated to an int.

    Raises an Exception if the file cannot be read, or if it does not start
    with '[' and end with ']' (this includes an empty file).
    """
    try:
        # 'with' makes sure the file is closed again, also on errors.
        with open(matrix_file) as matrix_fh:
            lines = [x.split() for x in matrix_fh.readlines()]
    except IOError:
        raise Exception("Error while reading the kaldi matrix file "
                        "{0}".format(matrix_file))

    # Check for missing brackets before indexing, so that an empty file is
    # reported as a format problem rather than as a bare IndexError.
    if (not lines or not lines[0] or not lines[-1]
            or lines[0][0] != "[" or lines[-1][-1] != "]"):
        raise Exception(
            "Kaldi matrix file has incorrect format, "
            "only text format matrix files can be read by this script")
    lines[0] = lines[0][1:]
    lines[-1] = lines[-1][:-1]
    return [[int(float(x)) for x in row] for row in lines]


def write_kaldi_matrix(output_file, matrix):
    """This function writes the matrix stored as a list of lists
    into 'output_file' in kaldi matrix text format.

    Raises an Exception if the matrix is empty or its rows differ in
    length.  The matrix is checked before the file is opened, so an invalid
    matrix never leaves a truncated output file behind.
    """
    if len(matrix) == 0:
        raise Exception("Matrix is empty")
    num_cols = len(matrix[0])
    for row in matrix:
        if num_cols != len(row):
            raise Exception("All the rows of a matrix are expected to "
                            "have the same length")

    with open(output_file, 'w') as f:
        f.write("[ ")
        f.write("\n".join(" ".join([str(x) for x in row]) for row in matrix))
        f.write(" ]")


def write_matrix_ascii(file_or_fd, mat, key=None):
    """Writes 'mat', a list of lists, in kaldi matrix text format.
    'file_or_fd' is either a filename, which is opened for writing and
    closed afterwards, or an already opened file object, which is left open.
    If 'key' is given, it is written in front of the opening bracket, as the
    index field of an archive entry.
    Raises an Exception if the rows differ in length.
    """
    try:
        fd = open(file_or_fd, 'w')
    except TypeError:
        # not something open() accepts, so treat it as an opened file object.
        fd = file_or_fd

    try:
        # ark-files have keys (utterance-id) in front of the bracket.
        header = " [" if key is None else "{0} [".format(key)
        print (header, file=fd)

        expected_cols = None
        for index, row in enumerate(mat):
            line = ' '.join(["{0:f}".format(value) for value in row])
            if expected_cols is None:
                expected_cols = len(row)
            elif len(row) != expected_cols:
                raise Exception("All the rows of a matrix are expected to "
                                "have the same length")

            is_last_row = (index == len(mat) - 1)
            if is_last_row:
                line += " ]"
            print (line, file=fd)
    finally:
        if fd is not file_or_fd:
            fd.close()


def read_matrix_ascii(file_or_fd):
    """This function reads a matrix in kaldi matrix text format
    and stores it as a list of lists of floats.
    'file_or_fd' is either a filename, which is opened and closed again
    here, or an already opened file object, which is left open and
    positioned just after the matrix.

    Raises RuntimeError if the input does not start with ' [' or ends
    before the closing ']' is seen.
    """
    try:
        fd = open(file_or_fd, 'r')
        fname = file_or_fd
    except TypeError:
        # 'file_or_fd' is opened file descriptor; it may not have a name
        # (e.g. an in-memory stream), and the name is only used in messages.
        fd = file_or_fd
        fname = getattr(file_or_fd, 'name', '<stream>')

    try:
        first = fd.read(2)
        if first != ' [':
            logger.error(
                "Kaldi matrix file %s has incorrect format, "
                "only text format matrix files can be read by this script",
                fname)
            raise RuntimeError

        rows = []
        while True:
            line = fd.readline()
            if len(line) == 0:
                # readline() keeps returning '' at EOF, so without raising
                # here this loop would never terminate.
                logger.error("Kaldi matrix file %s has incorrect format; "
                             "got EOF before end of matrix", fname)
                raise RuntimeError
            arr = line.strip().split()
            if len(arr) == 0:
                continue  # skip empty line
            if arr[-1] != ']':
                rows.append([float(x) for x in arr])  # not last line
            else:
                rows.append([float(x) for x in arr[:-1]])  # lastline
                return rows
    finally:
        # only close what was opened here.
        if fd is not file_or_fd:
            fd.close()


def read_key(fd):
    """ [str] = read_key(fd)
    Read the utterance-key from the opened ark/stream descriptor 'fd'.
    Characters are consumed one at a time up to and including the first
    space (or up to the end of the input).  Returns the key with surrounding
    whitespace stripped, or None if nothing but whitespace was read.
    """
    chars = []
    while True:
        char = fd.read(1)
        if char == '' or char == ' ':
            break
        chars.append(char)
    key = ''.join(chars).strip()
    return key if key != '' else None   # None means end of file,


def read_mat_ark(file_or_fd):
    """This function reads a kaldi matrix archive in text format
    and yields (key, matrix) pairs, where the key is the utterance-id.
    'file_or_fd' is either a filename, which is opened and closed again
    here, or an already opened file object, which is left open.

    Example usage:
    mat_dict = { key: mat for key, mat in read_mat_ark(file) }
    """
    try:
        fd = open(file_or_fd, 'r')
    except TypeError:
        # 'file_or_fd' is opened file descriptor.  Its name is not needed
        # here, so streams without a 'name' attribute are accepted too.
        fd = file_or_fd

    try:
        key = read_key(fd)
        while key:
            mat = read_matrix_ascii(fd)
            yield key, mat
            key = read_key(fd)
    finally:
        if fd is not file_or_fd:
            fd.close()


def force_symlink(file1, file2):
    """Create the symbolic link 'file2' pointing to 'file1', replacing
    'file2' if it already exists.

    Raises:
        OSError: if the link cannot be created for a reason other than
            'file2' already existing, or if replacing 'file2' fails.
    """
    import errno
    try:
        os.symlink(file1, file2)
    except OSError as e:
        if e.errno != errno.EEXIST:
            # Any other failure (missing directory, permission denied, ...)
            # used to be ignored silently, leaving no link behind.
            raise
        os.remove(file2)
        os.symlink(file1, file2)


def compute_lifter_coeffs(lifter, dim):
    """Return a list with 'dim' cepstral liftering coefficients.

    Coefficient i equals 1 + 0.5 * lifter * sin(pi * i / lifter).
    """
    return [1.0 + 0.5 * lifter * math.sin(math.pi * i / float(lifter))
            for i in range(dim)]


def compute_idct_matrix(K, N, cepstral_lifter=0):
    """Build the inverse-DCT matrix as a list of N rows with K entries each.

    Column 0 of every row holds sqrt(1/N); column k > 0 of row n holds
    sqrt(2/N) * cos(pi/N * (n + 0.5) * k).  When 'cepstral_lifter' is
    non-zero, column k is additionally divided by the k-th coefficient
    returned by compute_lifter_coeffs(cepstral_lifter, K).
    """
    idct = [[0] * K for _ in range(N)]

    # normalizer for X_0
    dc_scale = math.sqrt(1.0 / float(N))
    for row in idct:
        row[0] = dc_scale

    # normalizer for other elements
    ac_scale = math.sqrt(2.0 / float(N))
    for n, row in enumerate(idct):
        for k in range(1, K):
            row[k] = ac_scale * math.cos(math.pi / float(N) * (n + 0.5) * k)

    if cepstral_lifter != 0:
        lifter_coeffs = compute_lifter_coeffs(cepstral_lifter, K)
        for row in idct:
            for k in range(0, K):
                row[k] = float(row[k]) / lifter_coeffs[k]

    return idct


def write_idct_matrix(feat_dim, cepstral_lifter, file_path):
    """Generate the feat_dim x feat_dim IDCT matrix and write it to
    'file_path' using write_kaldi_matrix().

    A zero is appended to every row before writing, so the written matrix
    has feat_dim + 1 columns.
    """
    rows = compute_idct_matrix(feat_dim, feat_dim, cepstral_lifter)
    # the extra zero column is the bias of the fixed affine component
    for row in rows:
        row.append(0)
    write_kaldi_matrix(file_path, rows)


================================================
FILE: egs/steps/libs/nnet3/__init__.py
================================================


# Copyright 2016    Johns Hopkins University (Dan Povey)
#           2016    Vimal Manohar
#           2016    Vijayaditya Peddinti
#           2016    Yiming Wang
# Apache 2.0.


# This package contains the Python functions that facilitate the use of the
# nnet3 toolkit. It has two sub-modules:
#   xconfig : library for parsing high-level descriptions of neural networks
#   train   : library for training scripts


================================================
FILE: egs/steps/libs/nnet3/report/__init__.py
================================================


# Copyright 2016    Vimal Manohar
# Apache 2.0.

from . import log_parse

__all__ = ["log_parse"]


================================================
FILE: egs/steps/libs/nnet3/report/log_parse.py
================================================


# Copyright 2016    Vijayaditya Peddinti
#                   Vimal Manohar
# Apache 2.0.

from __future__ import division
from __future__ import print_function
import traceback
import datetime
import logging
import re

import libs.common as common_lib

logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())

# Regular expressions used to pull non-linearity statistics out of the lines
# that grep returns from progress.*.log.  All of them capture, in order: the
# iteration number, the component name and the component type, followed by
# (percentiles, mean, stddev) triples.
# The fragments are raw strings so that the regex escapes (\[ \( \. \-) are
# not interpreted as (invalid) Python string escapes.

# LstmNonlinearity components: value-avg and deriv-avg triples for each of the
# five gates i_t, f_t, c_t, o_t and m_t (3 + 5 * 6 = 33 groups).
g_lstmp_nonlin_regex_pattern = ''.join([r".*progress.([0-9]+).log:component name=(.+) ",
    r"type=(.*)Component,.*",
    r"i_t_sigmoid.*",
    r"value-avg=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*",
    r"deriv-avg=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*",
    r"f_t_sigmoid.*",
    r"value-avg=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*",
    r"deriv-avg=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*",
    r"c_t_tanh.*",
    r"value-avg=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*",
    r"deriv-avg=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*",
    r"o_t_sigmoid.*",
    r"value-avg=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*",
    r"deriv-avg=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*",
    r"m_t_tanh.*",
    r"value-avg=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*",
    r"deriv-avg=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\]"])

# Ordinary components: one value-avg and one deriv-avg triple (9 groups).
g_normal_nonlin_regex_pattern = ''.join([r".*progress.([0-9]+).log:component name=(.+) ",
    r"type=(.*)Component,.*",
    r"value-avg=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*",
    r"deriv-avg=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\]"])

# Ordinary components whose log line also carries oderiv-rms (12 groups).
g_normal_nonlin_regex_pattern_with_oderiv = ''.join([r".*progress.([0-9]+).log:component name=(.+) ",
    r"type=(.*)Component,.*",
    r"value-avg=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*",
    r"deriv-avg=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*",
    r"oderiv-rms=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\]"])

class KaldiLogParseException(Exception):
    """Raised when there is a problem parsing the Kaldi log files.

    Derive from this class if callers need to tell more specific parsing
    failures apart.
    """
    def __init__(self, message = None):
        # A whitespace-only message carries no detail, so it is reported the
        # same way as a missing message.
        details = message
        if details is not None:
            if details.strip() == "":
                details = None

        text = ("There was an error while trying to parse the logs."
                " Details : \n{0}\n").format(details)
        super(KaldiLogParseException, self).__init__(text)

# This function is used to fill stats_per_component_per_iter table with the
# results of regular expression.

def _parse_nonlin_stat_groups(groups, start):
    """Decode one statistic whose three regex groups (percentile list, mean,
    stddev) begin at index 'start' of 'groups'.

    Returns the tuple (mean, stddev, p5, p50, p95), where p5/p50/p95 are the
    entries at positions 4, 6 and 9 of the 13-entry percentile list.
    """
    mean = float(groups[start + 1])
    stddev = float(groups[start + 2])
    percentiles = re.split(',| ', groups[start])
    assert len(percentiles) == 13
    # NOTE(review): with the percentile layout shown in the example log line
    # (0,1,2,5 10,20,50,80,90 95,98,99,100), position 4 is the 10th
    # percentile rather than the 5th -- confirm before changing the index.
    return (mean, stddev,
            float(percentiles[4]), float(percentiles[6]),
            float(percentiles[9]))


def fill_nonlin_stats_table_with_regex_result(groups, gate_index, stats_table):
    """Add the statistics of one regex match to 'stats_table'.

    Args:
        groups: tuple of regex groups.  groups[0:3] are the iteration, the
            component name and the component type; they are followed by
            (percentiles, mean, stddev) triples: value-avg and deriv-avg for
            every gate, or value-avg, deriv-avg and oderiv-rms in the
            12-group layout.
        gate_index: which gate's value-avg/deriv-avg pair to read; every gate
            occupies 6 consecutive groups.  Use 0 for single-gate components.
        stats_table: dict that is updated in place.  It maps
            component name -> {'type': component type,
                               'stats': {iteration: [numbers]}}.
            Statistics for an (component, iteration) pair that is already
            present are appended to the existing list, which is how the gates
            of one LSTM component end up concatenated.

    The appended numbers are
        [value_mean, value_stddev, deriv_mean, deriv_stddev,
         value_5th, value_50th, value_95th,
         deriv_5th, deriv_50th, deriv_95th]
    and, when oderiv-rms is present,
        [value_mean, value_stddev, deriv_mean, deriv_stddev,
         oderiv_mean, oderiv_stddev,
         value_5th, value_50th, value_95th,
         deriv_5th, deriv_50th, deriv_95th,
         oderiv_5th, oderiv_50th, oderiv_95th].
    """
    iteration = int(groups[0])
    component_name = groups[1]
    component_type = groups[2]
    base = 3 + gate_index * 6

    # for value-avg
    (value_mean, value_stddev,
     value_5th, value_50th, value_95th) = _parse_nonlin_stat_groups(groups,
                                                                    base)
    # for deriv-avg
    (deriv_mean, deriv_stddev,
     deriv_5th, deriv_50th, deriv_95th) = _parse_nonlin_stat_groups(groups,
                                                                    base + 3)

    # oderiv-rms exists only in the 12-group layout (3 header groups plus
    # value-avg, deriv-avg and oderiv-rms).  Merely testing for "more than 9
    # groups" would also select the 33-group LSTM layout, which has no
    # oderiv-rms: the next gate's numbers would be stored as oderiv and the
    # last gate would index past the end of 'groups'.
    if len(groups) == 12:
        # for oderiv-rms
        (oderiv_mean, oderiv_stddev,
         oderiv_5th, oderiv_50th, oderiv_95th) = _parse_nonlin_stat_groups(
             groups, base + 6)
        row = [value_mean,  value_stddev,
               deriv_mean,  deriv_stddev,
               oderiv_mean, oderiv_stddev,
               value_5th,  value_50th,  value_95th,
               deriv_5th,  deriv_50th,  deriv_95th,
               oderiv_5th, oderiv_50th, oderiv_95th]
    else:
        row = [value_mean,  value_stddev,
               deriv_mean,  deriv_stddev,
               value_5th,  value_50th,  value_95th,
               deriv_5th,  deriv_50th,  deriv_95th]

    try:
        stats_per_iter = stats_table[component_name]['stats']
    except KeyError:
        # first time this component is seen
        stats_per_iter = {}
        stats_table[component_name] = {'type': component_type,
                                       'stats': stats_per_iter}

    if iteration in stats_per_iter:
        stats_per_iter[iteration].extend(row)
    else:
        stats_per_iter[iteration] = row

def parse_progress_logs_for_nonlinearity_stats(exp_dir):

    """ Parse progress logs for mean and std stats for non-linearities.
    e.g. for a line that is parsed from progress.*.log:
    exp/nnet3/lstm_self_repair_ld5_sp/log/progress.9.log:component name=Lstm3_i
    type=SigmoidComponent, dim=1280, self-repair-scale=1e-05, count=1.96e+05,
    value-avg=[percentiles(0,1,2,5 10,20,50,80,90
    95,98,99,100)=(0.05,0.09,0.11,0.15 0.19,0.27,0.50,0.72,0.83
    0.88,0.92,0.94,0.99), mean=0.502, stddev=0.23],
    deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90
    95,98,99,100)=(0.009,0.04,0.05,0.06 0.08,0.10,0.14,0.17,0.18
    0.19,0.20,0.20,0.21), mean=0.134, stddev=0.0397]

    The log lines are obtained by grepping {exp_dir}/log/progress.*.log.
    Lines that also report oderiv are preferred; only when grep returns none
    of those are lines with just value-avg and deriv-avg used.  Returns the
    table filled by fill_nonlin_stats_table_with_regex_result().
    """

    log_glob = "%s/log/progress.*.log" % (exp_dir)
    stats_table = {}

    matching_lines = common_lib.get_command_stdout(
        'grep -e "value-avg.*deriv-avg.*oderiv" {0}'.format(log_glob),
        require_zero_status = False)

    if matching_lines:
        # cases with oderiv-rms
        line_pattern = g_normal_nonlin_regex_pattern_with_oderiv
    else:
        # cases with only value-avg and deriv-avg
        matching_lines = common_lib.get_command_stdout(
            'grep -e "value-avg.*deriv-avg" {0}'.format(log_glob),
            require_zero_status = False)
        line_pattern = g_normal_nonlin_regex_pattern
    line_regex = re.compile(line_pattern)

    for line in matching_lines.split("\n"):
        match = line_regex.search(line)
        if match is None:
            continue
        # groups = ('9', 'Lstm3_i', 'Sigmoid', '0.05...0.99', '0.502', '0.23',
        # '0.009...0.21', '0.134', '0.0397')
        groups = match.groups()
        if groups[2] != 'LstmNonlinearity':
            fill_nonlin_stats_table_with_regex_result(groups, 0, stats_table)
            continue

        # LSTM components report five gates on one line, so the line is
        # matched again with the dedicated pattern and every gate is added.
        lstm_match = re.compile(g_lstmp_nonlin_regex_pattern).search(line)
        groups = lstm_match.groups()
        assert len(groups) == 33
        for gate_index in range(5):
            fill_nonlin_stats_table_with_regex_result(groups, gate_index,
                                                      stats_table)
    return stats_table


def parse_difference_string(string):
    """Turn whitespace-separated 'name:value' tokens into a dictionary that
    maps every name to float(value).

    A later token with the same name overrides an earlier one.
    """
    fields_per_token = (token.split(":") for token in string.split())
    return {fields[0]: float(fields[1]) for fields in fields_per_token}


class MalformedClippedProportionLineException(Exception):
    """Raised for a progress-log line from which no valid clipped-proportion
    value could be extracted; the offending line is included in the message.
    """
    def __init__(self, line):
        description = ("Malformed line encountered while trying to "
                       "extract clipped-proportions.\n{0}".format(line))
        super(MalformedClippedProportionLineException, self).__init__(
            description)


def parse_progress_logs_for_clipped_proportion(exp_dir):
    """ Parse progress logs for clipped proportion stats.

    e.g. for a line that is parsed from progress.*.log:
    exp/chain/cwrnn_trial2_ld5_sp/log/progress.245.log:component
    name=BLstm1_forward_c type=ClipGradientComponent, dim=512,
    norm-based-clipping=true, clipping-threshold=30,
    clipped-proportion=0.000565527,
    self-repair-clipped-proportion-threshold=0.01, self-repair-target=0,
    self-repair-scale=1

    Returns a dictionary with three views of the same data:
        'table': list of rows; the first row is
            ["iteration"] + sorted component names, every following row is
            [iteration, value per component or None if not available].
        'cp_per_component_per_iter': {iteration: {component: proportion}}
        'cp_per_iter_per_component': {component: [[iteration, proportion]]}

    Raises MalformedClippedProportionLineException for a non-blank line that
    does not match the expected format or reports a proportion above 1.
    """

    progress_log_files = "%s/log/progress.*.log" % (exp_dir)
    progress_log_lines = common_lib.get_command_stdout(
        'grep -e "{0}" {1}'.format(
            "clipped-proportion", progress_log_files),
        require_zero_status=False)
    # raw strings keep the regex escapes from being read as string escapes
    parse_regex = re.compile(r".*progress\.([0-9]+)\.log:component "
                             r"name=(.*) type=.* "
                             r"clipped-proportion=([0-9\.e\-]+)")

    cp_per_component_per_iter = {}
    component_names = set()
    for line in progress_log_lines.split("\n"):
        mat_obj = parse_regex.search(line)
        if mat_obj is None:
            if line.strip() == "":
                continue
            raise MalformedClippedProportionLineException(line)
        groups = mat_obj.groups()
        iteration = int(groups[0])
        name = groups[1]
        clipped_proportion = float(groups[2])
        if clipped_proportion > 1:
            raise MalformedClippedProportionLineException(line)
        cp_per_component_per_iter.setdefault(
            iteration, {})[name] = clipped_proportion
        component_names.add(name)
    component_names = sorted(component_names)

    # re arranging the data into an array
    # and into an cp_per_iter_per_component
    cp_per_iter_per_component = {name: [] for name in component_names}
    data = [["iteration"] + component_names]
    # only iterations that were actually seen produce a row
    for iteration in sorted(cp_per_component_per_iter):
        comp_dict = cp_per_component_per_iter[iteration]
        row = [iteration]
        for component in component_names:
            if component in comp_dict:
                row.append(comp_dict[component])
                cp_per_iter_per_component[component].append(
                    [iteration, comp_dict[component]])
            else:
                # if clipped proportion is not available for a particular
                # component it is set to None
                # this usually happens during layer-wise discriminative
                # training
                row.append(None)
        data.append(row)

    return {'table': data,
            'cp_per_component_per_iter': cp_per_component_per_iter,
            'cp_per_iter_per_component': cp_per_iter_per_component}


def parse_progress_logs_for_param_diff(exp_dir, pattern):
    """ Parse progress logs for per-component parameter differences.

    e.g. for a line that is parsed from progress.*.log:
    exp/chain/cwrnn_trial2_ld5_sp/log/progress.245.log:LOG
    (nnet3-show-progress:main():nnet3-show-progress.cc:144) Relative parameter
    differences per layer are [ Cwrnn1_T3_W_r:0.0171537
    Cwrnn1_T3_W_x:1.33338e-07 Cwrnn1_T2_W_r:0.048075 Cwrnn1_T2_W_x:1.34088e-07
    Cwrnn1_T1_W_r:0.0157277 Cwrnn1_T1_W_x:0.0212704 Final_affine:0.0321521
    Cwrnn2_T3_W_r:0.0212082 Cwrnn2_T3_W_x:1.33691e-07 Cwrnn2_T2_W_r:0.0212978
    Cwrnn2_T2_W_x:1.33401e-07 Cwrnn2_T1_W_r:0.014976 Cwrnn2_T1_W_x:0.0233588
    Cwrnn3_T3_W_r:0.0237165 Cwrnn3_T3_W_x:1.33184e-07 Cwrnn3_T2_W_r:0.0239754
    Cwrnn3_T2_W_x:1.3296e-07 Cwrnn3_T1_W_r:0.0194809 Cwrnn3_T1_W_x:0.0271934 ]

    Args:
        exp_dir: experiment directory; {exp_dir}/log/progress.*.log is read.
        pattern: "Relative parameter differences" or "Parameter differences".

    Returns a dictionary with
        'progress_per_component': {component: {iteration: difference}}
        'component_names': sorted list of all component names seen
        'max_iter': the largest iteration number seen

    Raises Exception for an unknown 'pattern', and ValueError (from max())
    when no log line could be parsed.
    """

    if pattern not in set(["Relative parameter differences",
                           "Parameter differences"]):
        raise Exception("Unknown value for pattern : {0}".format(pattern))

    progress_log_files = "%s/log/progress.*.log" % (exp_dir)
    progress_per_iter = {}
    component_names = set()
    progress_log_lines = common_lib.get_command_stdout(
        'grep -e "{0}" {1}'.format(pattern, progress_log_files))
    # raw strings keep the regex escapes from being read as string escapes
    parse_regex = re.compile((r".*progress\.([0-9]+)\.log:"
                              r"LOG.*{0}.*\[(.*)\]").format(pattern))
    for line in progress_log_lines.split("\n"):
        mat_obj = parse_regex.search(line)
        if mat_obj is None:
            continue
        iteration, difference_string = mat_obj.groups()
        differences = parse_difference_string(difference_string)
        component_names.update(differences)
        progress_per_iter[int(iteration)] = differences

    component_names = sorted(component_names)
    # rearranging the parameter differences available per iter
    # into parameter differences per component
    progress_per_component = {cn: {} for cn in component_names}

    max_iter = max(progress_per_iter.keys())
    total_missing_iterations = 0
    gave_user_warning = False
    for iteration in sorted(progress_per_iter):
        component_dict = progress_per_iter[iteration]

        for component_name in component_names:
            if component_name in component_dict:
                progress_per_component[component_name][
                    iteration] = component_dict[component_name]
            else:
                # the component was not found this iteration, may be because
                # of layerwise discriminative training
                total_missing_iterations += 1

        # Without any component there is nothing that can be missing, and the
        # average below would divide by zero.
        if component_names and not gave_user_warning:
            missing_per_component = (total_missing_iterations
                                     / len(component_names))
            if missing_per_component > 20:
                logger.warning("There are more than {0} missing iterations per "
                               "component. Something might be wrong.".format(
                                    missing_per_component))
                gave_user_warning = True

    return {'progress_per_component': progress_per_component,
            'component_names': component_names,
            'max_iter': max_iter}


def get_train_times(exp_dir):
    """Return {iteration: seconds} parsed from the '# Accounting: time=...'
    lines of {exp_dir}/log/train.<iteration>.<job>.log.

    When an iteration has several job logs, the largest of their times is
    reported for that iteration.
    """
    train_log_files = "%s/log/" % (exp_dir)
    train_log_names = "train.*.log"
    train_log_lines = common_lib.get_command_stdout(
        'find {0} -name "{1}" | xargs grep -H -e Accounting'.format(train_log_files,train_log_names))
    # raw strings keep the regex escapes from being read as string escapes
    parse_regex = re.compile(r".*train\.([0-9]+)\.([0-9]+)\.log:# "
                             r"Accounting: time=([0-9]+) thread.*")

    # iteration -> {job: seconds}
    job_times = {}
    for line in train_log_lines.split('\n'):
        mat_obj = parse_regex.search(line)
        if mat_obj is None:
            continue
        iteration, job, seconds = mat_obj.groups()
        job_times.setdefault(int(iteration), {})[int(job)] = float(seconds)

    return {iteration: max(times.values())
            for iteration, times in job_times.items()}

def parse_prob_logs(exp_dir, key='accuracy', output="output"):
    """Parse the compute_prob_train/compute_prob_valid logs of 'exp_dir'.

    Args:
        exp_dir: experiment directory; the files
            {exp_dir}/log/compute_prob_{train,valid}.*.log are read.
        key: name of the quantity reported after "Overall" in the log line,
            e.g. 'accuracy' or 'log-probability'.
        output: name of the network output the line must refer to.

    Returns a list of (iteration, train value, valid value) tuples, sorted
    by iteration, for the iterations present in both sets of logs.

    Raises KaldiLogParseException when the train logs, the valid logs or
    their common iterations yield nothing.
    """
    train_prob_files = "%s/log/compute_prob_train.*.log" % (exp_dir)
    valid_prob_files = "%s/log/compute_prob_valid.*.log" % (exp_dir)
    train_prob_strings = common_lib.get_command_stdout(
        'grep -e {0} {1}'.format(key, train_prob_files))
    valid_prob_strings = common_lib.get_command_stdout(
        'grep -e {0} {1}'.format(key, valid_prob_files))

    # LOG
    # (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:149)
    # Overall log-probability for 'output' is -0.399395 + -0.013437 = -0.412832
    # per frame, over 20000 fra

    # LOG
    # (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:144)
    # Overall log-probability for 'output' is -0.307255 per frame, over 20000
    # frames.

    # raw strings keep the regex escapes from being read as string escapes
    parse_regex = re.compile(
        (r".*compute_prob_.*\.([0-9]+).log:LOG "
         r".nnet3.*compute-prob.*:PrintTotalStats..:"
         r"nnet.*diagnostics.cc:[0-9]+. Overall ([a-zA-Z\-]+) for "
         r"'{output}'.*is ([0-9.\-e]+) .*per frame").format(output=output))

    def objf_per_iteration(log_text):
        # {iteration: value string} for the lines that report 'key'
        objf = {}
        for line in log_text.split('\n'):
            mat_obj = parse_regex.search(line)
            if mat_obj is not None:
                iteration, name, value = mat_obj.groups()
                if name == key:
                    objf[int(iteration)] = value
        return objf

    train_objf = objf_per_iteration(train_prob_strings)
    if not train_objf:
        raise KaldiLogParseException("Could not find any lines with {k} in "
                " {l}".format(k=key, l=train_prob_files))

    valid_objf = objf_per_iteration(valid_prob_strings)
    if not valid_objf:
        raise KaldiLogParseException("Could not find any lines with {k} in "
                " {l}".format(k=key, l=valid_prob_files))

    iters = sorted(set(valid_objf).intersection(train_objf))
    if not iters:
        raise KaldiLogParseException("Could not find any common iterations with"
                " key {k} in both {tl} and {vl}".format(
                    k=key, tl=train_prob_files, vl=valid_prob_files))
    return [(int(x), float(train_objf[x]), float(valid_objf[x]))
            for x in iters]

def parse_rnnlm_prob_logs(exp_dir, key='objf'):
    """Parse the RNNLM training and validation logs of 'exp_dir'.

    Training values come from {exp_dir}/log/train.<iteration>.1.log and
    validation values from {exp_dir}/log/compute_prob.<iteration>.log; in
    both cases the number after "exact = (...) =" is used.

    Args:
        exp_dir: experiment directory.
        key: name of the quantity reported after "Overall" in the log line.

    Returns a list of (iteration, train value, valid value) tuples, sorted
    by iteration, for the iterations present in both sets of logs.

    Raises KaldiLogParseException when the train logs, the valid logs or
    their common iterations yield nothing.
    """
    train_prob_files = "%s/log/train.*.*.log" % (exp_dir)
    valid_prob_files = "%s/log/compute_prob.*.log" % (exp_dir)
    train_prob_strings = common_lib.get_command_stdout(
        'grep -e {0} {1}'.format(key, train_prob_files))
    valid_prob_strings = common_lib.get_command_stdout(
        'grep -e {0} {1}'.format(key, valid_prob_files))

    # LOG
    # (rnnlm-train[5.3.36~8-2ec51]:PrintStatsOverall():rnnlm-core-training.cc:118)
    # Overall objf is (-4.426 + -0.008287) = -4.435 over 4.503e+06 words (weighted)
    # in 1117 minibatches; exact = (-4.426 + 0) = -4.426

    # LOG
    # (rnnlm-compute-prob[5.3.36~8-2ec51]:PrintStatsOverall():rnnlm-core-training.cc:118)
    # Overall objf is (-4.677 + -0.002067) = -4.679 over 1.08e+05 words (weighted)
    # in 27 minibatches; exact = (-4.677 + 0.002667) = -4.674

    # raw strings keep the regex escapes from being read as string escapes
    parse_regex_train = re.compile(
        r".*train\.([0-9]+).1.log:LOG "
        r".rnnlm-train.*:PrintStatsOverall..:"
        r"rnnlm.*training.cc:[0-9]+. Overall ([a-zA-Z\-]+) is "
        r".*exact = \(.+\) = ([0-9.\-\+e]+)")

    parse_regex_valid = re.compile(
        r".*compute_prob\.([0-9]+).log:LOG "
        r".rnnlm.*compute-prob.*:PrintStatsOverall..:"
        r"rnnlm.*training.cc:[0-9]+. Overall ([a-zA-Z\-]+) is "
        r".*exact = \(.+\) = ([0-9.\-\+e]+)")

    def objf_per_iteration(log_text, parse_regex):
        # {iteration: value string} for the lines that report 'key'
        objf = {}
        for line in log_text.split('\n'):
            mat_obj = parse_regex.search(line)
            if mat_obj is not None:
                iteration, name, value = mat_obj.groups()
                if name == key:
                    objf[int(iteration)] = value
        return objf

    train_objf = objf_per_iteration(train_prob_strings, parse_regex_train)
    if not train_objf:
        raise KaldiLogParseException("Could not find any lines with {k} in "
                " {l}".format(k=key, l=train_prob_files))

    valid_objf = objf_per_iteration(valid_prob_strings, parse_regex_valid)
    if not valid_objf:
        raise KaldiLogParseException("Could not find any lines with {k} in "
                " {l}".format(k=key, l=valid_prob_files))

    iters = sorted(set(valid_objf).intersection(train_objf))
    if not iters:
        raise KaldiLogParseException("Could not find any common iterations with"
                " key {k} in both {tl} and {vl}".format(
                    k=key, tl=train_prob_files, vl=valid_prob_files))
    return [(int(x), float(train_objf[x]), float(valid_objf[x]))
            for x in iters]


def generate_acc_logprob_report(exp_dir, key="accuracy", output="output"):
    """Build a tab-separated per-iteration report of training time and
    train/valid objective values for 'exp_dir'.

    Args:
        exp_dir: experiment directory whose logs are parsed.
        key: quantity to report; "rnnlm_objective" selects
            parse_rnnlm_prob_logs(exp_dir, 'objf'), any other value is passed
            to parse_prob_logs().
        output: network output name passed to parse_prob_logs().

    Returns the list [report, times, data]: the report text, the dictionary
    returned by get_train_times() and the list of
    (iteration, train value, valid value) tuples.

    This is best effort: an error while parsing the logs is logged as a
    warning and the affected part of the report is left empty.
    """
    # 'except Exception' keeps the best-effort behaviour for parsing errors
    # while letting KeyboardInterrupt and SystemExit through.
    try:
        times = get_train_times(exp_dir)
    except Exception:
        tb = traceback.format_exc()
        logger.warning("Error getting info from logs, exception was: " + tb)
        times = {}

    report = []
    report.append("%Iter\tduration\ttrain_objective\tvalid_objective\tdifference")
    try:
        if key == "rnnlm_objective":
            data = list(parse_rnnlm_prob_logs(exp_dir, 'objf'))
        else:
            data = list(parse_prob_logs(exp_dir, key, output))
    except Exception:
        tb = traceback.format_exc()
        logger.warning("Error getting info from logs, exception was: " + tb)
        data = []
    for x in data:
        try:
            report.append("%d\t%s\t%g\t%g\t%g" % (x[0], str(times[x[0]]),
                                                  x[1], x[2], x[2]-x[1]))
        except (KeyError, IndexError):
            # no training time for this iteration (or a malformed entry)
            continue

    total_time = sum(times.values())
    report.append("Total training time is {0}\n".format(
                    str(datetime.timedelta(seconds=total_time))))
    return ["\n".join(report), times, data]


================================================
FILE: egs/steps/libs/nnet3/train/__init__.py
================================================

# Copyright 2016 Vimal Manohar
# Apache 2.0

""" This library has classes and methods commonly used for training nnet3
neural networks.

It has separate submodules for frame-level objectives and chain objective:
frame_level_objf -- For both recurrent and non-recurrent architectures
chain_objf -- LF-MMI objective training
"""


================================================
FILE: egs/steps/libs/nnet3/train/chain_objf/__init__.py
================================================


# Copyright 2016    Vimal Manohar
# Apache 2.0.

""" This is a subpackage containing modules for training of
deep neural network acoustic model with chain objective.
"""

from . import acoustic_model

__all__ = ["acoustic_model"]


================================================
FILE: egs/steps/libs/nnet3/train/chain_objf/acoustic_model.py
================================================


# Copyright 2016    Vijayaditya Peddinti.
#           2016    Vimal Manohar
# Apache 2.0.

""" This is a module with methods which will be used by scripts for training of
deep neural network acoustic model with chain objective.
"""
from __future__ import division
from __future__ import print_function

import logging
import math
import os
import sys

import libs.common as common_lib
import libs.nnet3.train.common as common_train_lib

logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())


def create_phone_lm(dir, tree_dir, run_opts, lm_opts=None):
    """Create a phone LM for chain training

    This method trains a phone LM for chain training using the alignments
    in "tree_dir"

    Args:
        dir: experiment directory; the log goes to {dir}/log/make_phone_lm.log
            and the LM is written to {dir}/phone_lm.fst.
        tree_dir: directory holding num_jobs, final.mdl and ali.<job>.gz.
        run_opts: object whose 'command' attribute is the job-submission
            command used as the prefix of the executed command line.
        lm_opts: extra options for chain-est-phone-lm, or None for none.

    Raises Exception when the number of alignment jobs cannot be read from
    {tree_dir}/num_jobs or is not a positive integer.
    """
    try:
        # the 'with' block makes sure the file is closed again
        with open(tree_dir + "/num_jobs", 'r') as f:
            num_ali_jobs = int(f.readline())
        # explicit check instead of 'assert', which is removed under -O
        if num_ali_jobs <= 0:
            raise ValueError(
                "non-positive number of jobs: {0}".format(num_ali_jobs))
    except Exception:
        raise Exception("""There was an error getting the number of alignment
                        jobs from {0}/num_jobs""".format(tree_dir))

    alignments=' '.join(['{0}/ali.{1}.gz'.format(tree_dir, job)
                         for job in range(1, num_ali_jobs + 1)])

    # '\\|' yields a literal backslash-pipe, so the pipe is passed on to
    # {command} instead of being interpreted by the invoking shell.
    common_lib.execute_command(
        """{command} {dir}/log/make_phone_lm.log \
            gunzip -c {alignments} \\| \
            ali-to-phones {tree_dir}/final.mdl ark:- ark:- \\| \
            chain-est-phone-lm {lm_opts} ark:- {dir}/phone_lm.fst""".format(
                command=run_opts.command, dir=dir,
                alignments=alignments,
                lm_opts=lm_opts if lm_opts is not None else '',
                tree_dir=tree_dir))


def create_denominator_fst(dir, tree_dir, run_opts):
    """Run the two commands that produce the denominator FST files in 'dir'.

    First {tree_dir}/final.mdl is copied to {dir}/0.trans_mdl with
    copy-transition-model; then chain-make-den-fst is run through
    run_opts.command on {dir}/tree, {dir}/0.trans_mdl and {dir}/phone_lm.fst
    to write {dir}/den.fst and {dir}/normalization.fst, logging to
    {dir}/log/make_den_fst.log.
    """
    copy_model_cmd = """copy-transition-model {tree_dir}/final.mdl \
                {dir}/0.trans_mdl""".format(tree_dir=tree_dir, dir=dir)
    common_lib.execute_command(copy_model_cmd)

    make_den_fst_cmd = """{command} {dir}/log/make_den_fst.log \
                   chain-make-den-fst {dir}/tree {dir}/0.trans_mdl \
                   {dir}/phone_lm.fst \
                   {dir}/den.fst {dir}/normalization.fst""".format(
                       command=run_opts.command, dir=dir)
    common_lib.execute_command(make_den_fst_cmd)


def generate_chain_egs(dir, data, lat_dir, egs_dir,
                       left_context, right_context,
                       run_opts, stage=0,
                       left_tolerance=None, right_tolerance=None,
                       left_context_initial=-1, right_context_final=-1,
                       frame_subsampling_factor=3,
                       alignment_subsampling_factor=3,
                       online_ivector_dir=None,
                       frames_per_iter=20000, frames_per_eg_str="20", srand=0,
                       egs_opts=None, cmvn_opts=None):
    """Wrapper for steps/nnet3/chain/get_egs.sh

    See options in that script.

    The arguments are substituted into the command line of the script;
    run_opts.egs_command is passed as --cmd, and the optional arguments that
    are None (tolerances, online_ivector_dir, egs_opts, cmvn_opts) are
    substituted as empty strings.
    """

    def or_empty(value):
        # optional arguments left at None become an empty string
        return value if value is not None else ''

    substitutions = dict(
        command=run_opts.egs_command,
        cmvn_opts=or_empty(cmvn_opts),
        ivector_dir=or_empty(online_ivector_dir),
        left_context=left_context,
        right_context=right_context,
        left_context_initial=left_context_initial,
        right_context_final=right_context_final,
        left_tolerance=or_empty(left_tolerance),
        right_tolerance=or_empty(right_tolerance),
        frame_subsampling_factor=frame_subsampling_factor,
        alignment_subsampling_factor=alignment_subsampling_factor,
        stage=stage, frames_per_iter=frames_per_iter,
        frames_per_eg_str=frames_per_eg_str, srand=srand,
        data=data, lat_dir=lat_dir, dir=dir, egs_dir=egs_dir,
        egs_opts=or_empty(egs_opts))

    get_egs_cmd = """steps/nnet3/chain/get_egs.sh {egs_opts} \
                --cmd "{command}" \
                --cmvn-opts "{cmvn_opts}" \
                --online-ivector-dir "{ivector_dir}" \
                --left-context {left_context} \
                --right-context {right_context} \
                --left-context-initial {left_context_initial} \
                --right-context-final {right_context_final} \
                --left-tolerance '{left_tolerance}' \
                --right-tolerance '{right_tolerance}' \
                --frame-subsampling-factor {frame_subsampling_factor} \
                --alignment-subsampling-factor {alignment_subsampling_factor} \
                --stage {stage} \
                --frames-per-iter {frames_per_iter} \
                --frames-per-eg {frames_per_eg_str} \
                --srand {srand} \
                {data} {dir} {lat_dir} {egs_dir}""".format(**substitutions)

    common_lib.execute_command(get_egs_cmd)


def train_new_models(dir, iter, srand, num_jobs,
                     num_archives_processed, num_archives,
                     raw_model_string, egs_dir,
                     apply_deriv_weights,
                     min_deriv_time, max_deriv_time_relative,
                     l2_regularize, xent_regularize, leaky_hmm_coefficient,
                     momentum, max_param_change,
                     shuffle_buffer_size, num_chunk_per_minibatch_str,
                     frame_subsampling_factor, run_opts, train_opts,
                     backstitch_training_scale=0.0, backstitch_training_interval=1,
                     use_multitask_egs=False):
    """
    Called from train_one_iteration(), this method trains new models
    with 'num_jobs' jobs, and
    writes files like exp/tdnn_a/24.{1,2,3,..<num_jobs>}.raw

    We cannot easily use a single parallel SGE job to do the main training,
    because the computation of which archive and which --frame option
    to use for each job is a little complex, so we spawn each one separately.
    This is no longer true for RNNs as we do not use the --frame option,
    but we use the same script for consistency with the FF-DNN code.

    One 'nnet3-chain-train' command is launched in the background per job
    (job = 1..num_jobs); the function returns once every background command
    has been joined.  Job 'job' reads {egs_dir}/cegs.<archive_index>.<ark|scp>,
    logs to {dir}/log/train.{iter}.{job}.log and writes
    {dir}/{iter+1}.{job}.raw.

    Notes on values derived here rather than passed through:
      * --srand given to the binaries is 'iter + srand'.
      * --l2-regularize-factor is 1.0/num_jobs.
      * --verbose=1 is only added when iter is a positive multiple of 20.
      * --read-cache is added when iter > 0; --write-cache only for job 1.
      * max_deriv_time_relative is truncated with int() before being passed.

    use_multitask_egs : True, if different examples used to train multiple
                        tasks or outputs, e.g.multilingual training.
                        multilingual egs can be generated using get_egs.sh and
                        steps/nnet3/multilingual/allocate_multilingual_examples.py,
                        those are the top-level scripts.
    """

    # Optional --optimization.* flags; each is only emitted when the caller
    # supplied a value.
    deriv_time_opts = []
    if min_deriv_time is not None:
        deriv_time_opts.append("--optimization.min-deriv-time={0}".format(
                                    min_deriv_time))
    if max_deriv_time_relative is not None:
        deriv_time_opts.append("--optimization.max-deriv-time-relative={0}".format(
                                    int(max_deriv_time_relative)))

    threads = []
    # the GPU timing info is only printed if we use the --verbose=1 flag; this
    # slows down the computation slightly, so don't accumulate it on every
    # iteration.  Don't do it on iteration 0 either, because we use a smaller
    # than normal minibatch size, and people may get confused thinking it's
    # slower for iteration 0 because of the verbose option.
    verbose_opt = ("--verbose=1" if iter % 20 == 0 and iter > 0 else "")

    for job in range(1, num_jobs+1):
        # k is a zero-based index that we will derive the other indexes from.
        k = num_archives_processed + job - 1
        # work out the 1-based archive index.
        archive_index = (k % num_archives) + 1
        # previous : frame_shift = (k/num_archives) % frame_subsampling_factor
        frame_shift = ((archive_index + k//num_archives)
                       % frame_subsampling_factor)

        multitask_egs_opts = common_train_lib.get_multitask_egs_opts(
            egs_dir,
            egs_prefix="cegs.",
            archive_index=archive_index,
            use_multitask_egs=use_multitask_egs)
        # Multitask egs are read through an scp index, ordinary egs directly
        # from the ark archive.
        scp_or_ark = "scp" if use_multitask_egs else "ark"
        # Read the cache written for this iteration (none exists for
        # iteration 0); only job 1 writes the cache for the next iteration.
        cache_io_opts = (("--read-cache={dir}/cache.{iter}".format(dir=dir,
                                                                  iter=iter)
                          if iter > 0 else "") +
                         (" --write-cache={0}/cache.{1}".format(dir, iter + 1)
                          if job == 1 else ""))

        # Launch the training job in the background; require_zero_status=True
        # is passed so that a non-zero exit status is treated as an error by
        # common_lib.background_command.
        thread = common_lib.background_command(
            """{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \
                    nnet3-chain-train {parallel_train_opts} {verbose_opt} \
                    --apply-deriv-weights={app_deriv_wts} \
                    --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \
                    {cache_io_opts}  --xent-regularize={xent_reg} \
                    {deriv_time_opts} \
                    --print-interval=10 --momentum={momentum} \
                    --max-param-change={max_param_change} \
                    --backstitch-training-scale={backstitch_training_scale} \
                    --backstitch-training-interval={backstitch_training_interval} \
                    --l2-regularize-factor={l2_regularize_factor} {train_opts} \
                    --srand={srand} \
                    "{raw_model}" {dir}/den.fst \
                    "ark,bg:nnet3-chain-copy-egs {multitask_egs_opts} \
                        --frame-shift={fr_shft} \
                        {scp_or_ark}:{egs_dir}/cegs.{archive_index}.{scp_or_ark} ark:- | \
                        nnet3-chain-shuffle-egs --buffer-size={buf_size} \
                        --srand={srand} ark:- ark:- | nnet3-chain-merge-egs \
                        --minibatch-size={num_chunk_per_mb} ark:- ark:- |" \
                    {dir}/{next_iter}.{job}.raw""".format(
                        command=run_opts.command,
                        train_queue_opt=run_opts.train_queue_opt,
                        dir=dir, iter=iter, srand=iter + srand,
                        next_iter=iter + 1, job=job,
                        deriv_time_opts=" ".join(deriv_time_opts),
                        app_deriv_wts=apply_deriv_weights,
                        fr_shft=frame_shift, l2=l2_regularize,
                        train_opts=train_opts,
                        xent_reg=xent_regularize, leaky=leaky_hmm_coefficient,
                        cache_io_opts=cache_io_opts,
                        parallel_train_opts=run_opts.parallel_train_opts,
                        verbose_opt=verbose_opt,
                        momentum=momentum, max_param_change=max_param_change,
                        backstitch_training_scale=backstitch_training_scale,
                        backstitch_training_interval=backstitch_training_interval,
                        l2_regularize_factor=1.0/num_jobs,
                        raw_model=raw_model_string,
                        egs_dir=egs_dir, archive_index=archive_index,
                        buf_size=shuffle_buffer_size,
                        num_chunk_per_mb=num_chunk_per_minibatch_str,
                        multitask_egs_opts=multitask_egs_opts,
                        scp_or_ark=scp_or_ark),
            require_zero_status=True)

        threads.append(thread)

    # Block until every per-job training command has finished.
    for thread in threads:
        thread.join()


def train_one_iteration(dir, iter, srand, egs_dir,
                        num_jobs, num_archives_processed, num_archives,
                        learning_rate, shrinkage_value,
                        num_chunk_per_minibatch_str,
                        apply_deriv_weights, min_deriv_time,
                        max_deriv_time_relative,
                        l2_regularize, xent_regularize,
                        leaky_hmm_coefficient,
                        momentum, max_param_change, shuffle_buffer_size,
                        frame_subsampling_factor,
                        run_opts, dropout_edit_string="", train_opts="",
                        backstitch_training_scale=0.0, backstitch_training_interval=1,
                        use_multitask_egs=False):
    """ Called from steps/nnet3/chain/train.py for one iteration for
    neural network training with LF-MMI objective

    Steps performed, in order:
      1. Check the random seed against {dir}/srand (created on first use);
         a mismatch only produces a warning and 'srand' is still used.
      2. Start background diagnostics on {dir}/{iter}.mdl (train/valid
         objectives, and progress logs when iter > 0).
      3. Train 'num_jobs' models with train_new_models().
      4. On iteration 0 pick the best job's model; on later iterations
         average the accepted models.  Either way {dir}/{iter+1}.mdl is
         produced.
      5. Delete the per-job raw models and the cache file of this iteration.

    Raises:
        IOError, ValueError: re-raised if {dir}/srand exists but cannot be
            read or parsed as an integer.
        Exception: if a per-job raw model cannot be deleted, or if
            {dir}/{iter+1}.mdl is missing or empty at the end.
    """

    # check if different iterations use the same random seed
    srand_file = '{0}/srand'.format(dir)
    if os.path.exists(srand_file):
        try:
            # Use a context manager so the handle is closed even if parsing
            # the seed fails.
            with open(srand_file) as f:
                saved_srand = int(f.readline().strip())
        except (IOError, ValueError):
            logger.error("Exception while reading the random seed "
                         "for training")
            raise
        if srand != saved_srand:
            logger.warning("The random seed provided to this iteration "
                           "(srand={0}) is different from the one saved last "
                           "time (srand={1}). Using srand={0}.".format(
                               srand, saved_srand))
    else:
        with open(srand_file, 'w') as f:
            f.write(str(srand))

    # Sets off some background jobs to compute train and
    # validation set objectives
    compute_train_cv_probabilities(
        dir=dir, iter=iter, egs_dir=egs_dir,
        l2_regularize=l2_regularize, xent_regularize=xent_regularize,
        leaky_hmm_coefficient=leaky_hmm_coefficient, run_opts=run_opts,
        use_multitask_egs=use_multitask_egs)

    if iter > 0:
        # Runs in the background
        compute_progress(dir, iter, run_opts)

    do_average = (iter > 0)

    # Input model for the training jobs: the current model converted to raw
    # form with the learning rate set and the parameters scaled by the
    # shrinkage value.
    raw_model_string = ("nnet3-am-copy --raw=true --learning-rate={0} "
                        "--scale={1} {2}/{3}.mdl - |".format(
                            learning_rate, shrinkage_value, dir, iter))

    if do_average:
        cur_num_chunk_per_minibatch_str = num_chunk_per_minibatch_str
        cur_max_param_change = max_param_change
    else:
        # on iteration zero, use a smaller minibatch size (and we will later
        # choose the output of just one of the jobs): the model-averaging isn't
        # always helpful when the model is changing too fast (i.e. it can worsen
        # the objective function), and the smaller minibatch size will help to
        # keep the update stable.
        cur_num_chunk_per_minibatch_str = common_train_lib.halve_minibatch_size_str(
            num_chunk_per_minibatch_str)
        cur_max_param_change = float(max_param_change) / math.sqrt(2)

    raw_model_string = raw_model_string + dropout_edit_string
    train_new_models(dir=dir, iter=iter, srand=srand, num_jobs=num_jobs,
                     num_archives_processed=num_archives_processed,
                     num_archives=num_archives,
                     raw_model_string=raw_model_string,
                     egs_dir=egs_dir,
                     apply_deriv_weights=apply_deriv_weights,
                     min_deriv_time=min_deriv_time,
                     max_deriv_time_relative=max_deriv_time_relative,
                     l2_regularize=l2_regularize,
                     xent_regularize=xent_regularize,
                     leaky_hmm_coefficient=leaky_hmm_coefficient,
                     momentum=momentum,
                     max_param_change=cur_max_param_change,
                     shuffle_buffer_size=shuffle_buffer_size,
                     num_chunk_per_minibatch_str=cur_num_chunk_per_minibatch_str,
                     frame_subsampling_factor=frame_subsampling_factor,
                     run_opts=run_opts, train_opts=train_opts,
                     # linearly increase backstitch_training_scale during the
                     # first few iterations (hard-coded as 15)
                     backstitch_training_scale=(backstitch_training_scale *
                         iter / 15 if iter < 15 else backstitch_training_scale),
                     backstitch_training_interval=backstitch_training_interval,
                     use_multitask_egs=use_multitask_egs)

    # Decide from the training logs which jobs' models are usable and which
    # one has the best objective.
    [models_to_average, best_model] = common_train_lib.get_successful_models(
         num_jobs, '{0}/log/train.{1}.%.log'.format(dir, iter))
    nnets_list = []
    for n in models_to_average:
        nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n))

    if do_average:
        # average the output of the different jobs.
        common_train_lib.get_average_nnet_model(
            dir=dir, iter=iter,
            nnets_list=" ".join(nnets_list),
            run_opts=run_opts)

    else:
        # choose the best model from different jobs
        common_train_lib.get_best_nnet_model(
            dir=dir, iter=iter,
            best_model_index=best_model,
            run_opts=run_opts)

    # The per-job raw models are no longer needed once they have been
    # averaged / selected into {iter+1}.mdl.
    try:
        for i in range(1, num_jobs + 1):
            os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i))
    except OSError:
        raise Exception("Error while trying to delete the raw models")

    new_model = "{0}/{1}.mdl".format(dir, iter + 1)

    if not os.path.isfile(new_model):
        raise Exception("Could not find {0}, at the end of "
                        "iteration {1}".format(new_model, iter))
    elif os.stat(new_model).st_size == 0:
        raise Exception("{0} has size 0. Something went wrong in "
                        "iteration {1}".format(new_model, iter))
    # The cache read by this iteration's jobs is obsolete now.
    if os.path.exists("{0}/cache.{1}".format(dir, iter)):
        os.remove("{0}/cache.{1}".format(dir, iter))


def check_for_required_files(feat_dir, tree_dir, lat_dir=None):
    """Verify that the input files needed for chain training are present.

    Checks feats.scp in 'feat_dir' and ali.1.gz, final.mdl and tree in
    'tree_dir'.  When 'lat_dir' is given, lat.1.gz, final.mdl and num_jobs
    in that directory are checked as well.

    Raises:
        Exception: naming the first expected path that is not a regular file.
    """
    required = [
        '{0}/feats.scp'.format(feat_dir),
        '{0}/ali.1.gz'.format(tree_dir),
        '{0}/final.mdl'.format(tree_dir),
        '{0}/tree'.format(tree_dir),
    ]
    if lat_dir is not None:
        required.extend((
            '{0}/lat.1.gz'.format(lat_dir),
            '{0}/final.mdl'.format(lat_dir),
            '{0}/num_jobs'.format(lat_dir),
        ))

    # Report only the first missing path, in the order listed above.
    missing = next((path for path in required if not os.path.isfile(path)),
                   None)
    if missing is not None:
        raise Exception('Expected {0} to exist.'.format(missing))


def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts,
                                   max_lda_jobs=None, rand_prune=4.0,
                                   lda_opts=None, use_multitask_egs=False):
    """ Estimate the LDA-like preconditioning matrix from cegs and write it
    to {dir}/lda.mat (also symlinked as {dir}/configs/lda.mat).

    This mirrors the function of the same name in
    libs.nnet3.train.frame_level_objf.common, except that it reads cegs
    instead of egs files.

    'num_lda_jobs' is capped at 'max_lda_jobs' when the latter is given.
    Raises Exception if a per-job stats file cannot be removed.
    """
    if max_lda_jobs is not None:
        num_lda_jobs = min(num_lda_jobs, max_lda_jobs)

    multitask_egs_opts = common_train_lib.get_multitask_egs_opts(
        egs_dir,
        egs_prefix="cegs.",
        archive_index="JOB",
        use_multitask_egs=use_multitask_egs)
    scp_or_ark = "scp" if use_multitask_egs else "ark"
    egs_rspecifier = (
        "ark:nnet3-chain-copy-egs {multitask_egs_opts} "
        "{scp_or_ark}:{egs_dir}/cegs.JOB.{scp_or_ark} ark:- |"
        "".format(egs_dir=egs_dir, scp_or_ark=scp_or_ark,
                  multitask_egs_opts=multitask_egs_opts))

    # Step 1: accumulate stats (same format as LDA stats), one file per job.
    acc_stats_cmd = """{command} JOB=1:{num_lda_jobs} {dir}/log/get_lda_stats.JOB.log \
                nnet3-chain-acc-lda-stats --rand-prune={rand_prune} \
                {dir}/init.raw "{egs_rspecifier}" \
                {dir}/JOB.lda_stats""".format(
                    command=run_opts.command,
                    num_lda_jobs=num_lda_jobs,
                    dir=dir,
                    egs_rspecifier=egs_rspecifier,
                    rand_prune=rand_prune)
    common_lib.execute_command(acc_stats_cmd)

    # Step 2: sum dir/{1..num_lda_jobs}.lda_stats into dir/lda_stats.
    lda_stat_files = []
    for job_id in range(1, num_lda_jobs + 1):
        lda_stat_files.append('{0}/{1}.lda_stats'.format(dir, job_id))

    sum_stats_cmd = """{command} {dir}/log/sum_transform_stats.log \
                sum-lda-accs {dir}/lda_stats {lda_stat_files}""".format(
                    command=run_opts.command,
                    dir=dir, lda_stat_files=" ".join(lda_stat_files))
    common_lib.execute_command(sum_stats_cmd)

    # The per-job stats are redundant once summed.
    try:
        for stats_path in lda_stat_files:
            os.remove(stats_path)
    except OSError:
        raise Exception("There was error while trying to remove "
                        "lda stat files.")

    # Step 3: this computes a fixed affine transform computed in the way we
    # described in Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's
    # a scaled variant of an LDA transform but without dimensionality
    # reduction.
    transform_cmd = """{command} {dir}/log/get_transform.log \
                nnet-get-feature-transform {lda_opts} {dir}/lda.mat \
                {dir}/lda_stats""".format(
                    command=run_opts.command, dir=dir,
                    lda_opts="" if lda_opts is None else lda_opts)
    common_lib.execute_command(transform_cmd)

    common_lib.force_symlink("../lda.mat", "{0}/configs/lda.mat".format(dir))


def prepare_initial_acoustic_model(dir, run_opts, srand=-1, input_model=None):
    """ Create {dir}/0.mdl by attaching the transition model
        {dir}/0.trans_mdl to an initial raw network.

        Without 'input_model', the initial network is first prepared via
        common_train_lib.prepare_initial_network() and {dir}/0.raw is used
        as the raw model.  With 'input_model', that preparation step is
        skipped and 'input_model' is used as the raw model instead.
    """
    if input_model is None:
        common_train_lib.prepare_initial_network(dir, run_opts,
                                                 srand=srand)
        raw_mdl = '{0}/0.raw'.format(dir)
    else:
        raw_mdl = input_model

    # A 'chain' acoustic model is the transition model followed by the raw
    # nnet; nnet3-am-init combines the two into 0.mdl.
    init_cmd = """{command} {dir}/log/init_mdl.log \
                nnet3-am-init {dir}/0.trans_mdl {raw_mdl} \
                {dir}/0.mdl""".format(command=run_opts.command, dir=dir,
                                      raw_mdl=raw_mdl)
    common_lib.execute_command(init_cmd)


def compute_train_cv_probabilities(dir, iter, egs_dir, l2_regularize,
                                   xent_regularize, leaky_hmm_coefficient,
                                   run_opts,
                                   use_multitask_egs=False):
    """ Launch two background 'nnet3-chain-compute-prob' commands for
    {dir}/{iter}.mdl: one on the valid_diagnostic egs and one on the
    train_diagnostic egs.

    Logs go to {dir}/log/compute_prob_valid.{iter}.log and
    {dir}/log/compute_prob_train.{iter}.log.  The function does not wait for
    the commands to finish.
    """
    if use_multitask_egs:
        scp_or_ark, egs_suffix = "scp", ".scp"
    else:
        scp_or_ark, egs_suffix = "ark", ".cegs"

    # Format arguments that are identical for both diagnostic commands.
    shared_args = dict(command=run_opts.command, dir=dir, iter=iter,
                       model='{0}/{1}.mdl'.format(dir, iter),
                       l2=l2_regularize, leaky=leaky_hmm_coefficient,
                       xent_reg=xent_regularize,
                       egs_dir=egs_dir,
                       scp_or_ark=scp_or_ark, egs_suffix=egs_suffix)

    # Validation-set objective.
    valid_egs_opts = common_train_lib.get_multitask_egs_opts(
        egs_dir,
        egs_prefix="valid_diagnostic.",
        use_multitask_egs=use_multitask_egs)
    common_lib.background_command(
        """{command} {dir}/log/compute_prob_valid.{iter}.log \
                nnet3-chain-compute-prob --l2-regularize={l2} \
                --leaky-hmm-coefficient={leaky} --xent-regularize={xent_reg} \
                {model} {dir}/den.fst \
                "ark,bg:nnet3-chain-copy-egs {multitask_egs_opts} {scp_or_ark}:{egs_dir}/valid_diagnostic{egs_suffix} \
                    ark:- | nnet3-chain-merge-egs --minibatch-size=1:64 ark:- ark:- |" \
        """.format(multitask_egs_opts=valid_egs_opts, **shared_args))

    # Training-subset objective.
    train_egs_opts = common_train_lib.get_multitask_egs_opts(
        egs_dir,
        egs_prefix="train_diagnostic.",
        use_multitask_egs=use_multitask_egs)
    common_lib.background_command(
        """{command} {dir}/log/compute_prob_train.{iter}.log \
                nnet3-chain-compute-prob --l2-regularize={l2} \
                --leaky-hmm-coefficient={leaky} --xent-regularize={xent_reg} \
                {model} {dir}/den.fst \
                "ark,bg:nnet3-chain-copy-egs {multitask_egs_opts} {scp_or_ark}:{egs_dir}/train_diagnostic{egs_suffix} \
                    ark:- | nnet3-chain-merge-egs --minibatch-size=1:64 ark:- ark:- |" \
        """.format(multitask_egs_opts=train_egs_opts, **shared_args))


def compute_progress(dir, iter, run_opts):
    """ Launch background commands that log how the model changed between
    iteration 'iter - 1' and iteration 'iter'.

    progress.{iter}.log is always produced; when 'iter' is a positive
    multiple of 10, full_progress.{iter}.log and full_info.{iter}.log are
    produced as well.  The function does not wait for the commands.
    """
    fmt_args = dict(command=run_opts.command,
                    dir=dir,
                    iter=iter,
                    model='{0}/{1}.mdl'.format(dir, iter),
                    prev_model='{0}/{1}.mdl'.format(dir, iter - 1))

    common_lib.background_command(
        """{command} {dir}/log/progress.{iter}.log \
                nnet3-am-info {model} '&&' \
                nnet3-show-progress --use-gpu=no {prev_model} {model}
        """.format(**fmt_args))

    # The detailed logs below are only written every 10 iterations.
    if iter % 10 != 0 or not iter > 0:
        return

    # full_progress.X.log contains some diagnostics of the difference in
    # parameters, printed in the same format as from nnet3-info.
    common_lib.background_command(
        """{command} {dir}/log/full_progress.{iter}.log \
            nnet3-show-progress --use-gpu=no --verbose=2 {prev_model} {model}
        """.format(**fmt_args))

    # full_info.X.log is just the nnet3-info of the model, with the
    # --verbose=2 option which includes stats on the singular values of the
    # parameter matrices.
    common_lib.background_command(
        """{command} {dir}/log/full_info.{iter}.log \
            nnet3-info --verbose=2 {model}
        """.format(**fmt_args))


def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_str,
                   egs_dir, leaky_hmm_coefficient, l2_regularize,
                   xent_regularize, run_opts,
                   max_objective_evaluations=30,
                   use_multitask_egs=False):
    """ Function to do model combination

    In the nnet3 setup, the logic
    for doing averaging of subsets of the models in the case where
    there are too many models to reliably estimate interpolation
    factors (max_models_combine) is moved into the nnet3-combine.

    The models {dir}/<iter>.mdl for every iteration in 'models_to_combine'
    (plus 'num_iters') that exist on disk are combined with
    nnet3-chain-combine on the 'combine' egs, and the result is written to
    {dir}/final.mdl.  Afterwards the train/valid objectives of the final
    model are computed in the background.

    Note: 'models_to_combine' must be a set; 'num_iters' is added to it in
    place, so the caller's set is modified.  'xent_regularize' is only used
    for the final diagnostics, not for the combination command itself.
    """
    raw_model_strings = []
    logger.info("Combining {0} models.".format(models_to_combine))

    # The last iteration's model always takes part in the combination.
    models_to_combine.add(num_iters)

    for iter in sorted(models_to_combine):
        model_file = '{0}/{1}.mdl'.format(dir, iter)
        if os.path.exists(model_file):
            # we used to copy them with nnet3-am-copy --raw=true, but now
            # the raw-model-reading code discards the other stuff itself.
            raw_model_strings.append(model_file)
        else:
            print("{0}: warning: model file {1} does not exist "
                  "(final combination)".format(sys.argv[0], model_file))

    scp_or_ark = "scp" if use_multitask_egs else "ark"
    egs_suffix = ".scp" if use_multitask_egs else ".cegs"

    multitask_egs_opts = common_train_lib.get_multitask_egs_opts(
                             egs_dir,
                             egs_prefix="combine.",
                             use_multitask_egs=use_multitask_egs)

    # We reverse the order of the raw model strings so that the freshest one
    # goes first.  This is important for systems that include batch
    # normalization-- it means that the freshest batch-norm stats are used.
    # Since the batch-norm stats are not technically parameters, they are not
    # combined in the combination code, they are just obtained from the first
    # model.
    raw_model_strings = list(reversed(raw_model_strings))

    # NOTE: the pipe between nnet3-chain-combine and nnet3-am-copy must reach
    # the job wrapper as a literal '\|'; it is written as '\\|' here because
    # '\|' is not a valid Python escape sequence.
    common_lib.execute_command(
        """{command} {combine_queue_opt} {dir}/log/combine.log \
                nnet3-chain-combine \
                --max-objective-evaluations={max_objective_evaluations} \
                --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \
                --verbose=3 {combine_gpu_opt} {dir}/den.fst {raw_models} \
                "ark,bg:nnet3-chain-copy-egs {multitask_egs_opts} {scp_or_ark}:{egs_dir}/combine{egs_suffix} ark:- | \
                    nnet3-chain-merge-egs --minibatch-size={num_chunk_per_mb} \
                    ark:- ark:- |" - \\| \
                nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl \
                {dir}/final.mdl""".format(
                    command=run_opts.command,
                    combine_queue_opt=run_opts.combine_queue_opt,
                    combine_gpu_opt=run_opts.combine_gpu_opt,
                    max_objective_evaluations=max_objective_evaluations,
                    l2=l2_regularize, leaky=leaky_hmm_coefficient,
                    dir=dir, raw_models=" ".join(raw_model_strings),
                    num_chunk_per_mb=num_chunk_per_minibatch_str,
                    num_iters=num_iters,
                    egs_dir=egs_dir,
                    multitask_egs_opts=multitask_egs_opts,
                    scp_or_ark=scp_or_ark, egs_suffix=egs_suffix))

    # Compute the probability of the final, combined model with
    # the same subset we used for the previous compute_probs, as the
    # different subsets will lead to different probs.
    compute_train_cv_probabilities(
        dir=dir, iter='final', egs_dir=egs_dir,
        l2_regularize=l2_regularize, xent_regularize=xent_regularize,
        leaky_hmm_coefficient=leaky_hmm_coefficient,
        run_opts=run_opts,
        use_multitask_egs=use_multitask_egs)


================================================
FILE: egs/steps/libs/nnet3/train/common.py
================================================


# Copyright 2016    Vijayaditya Peddinti.
#           2016    Vimal Manohar
# Apache 2.0

"""This module contains classes and methods common to training of
nnet3 neural networks.
"""
from __future__ import division

import argparse
import glob
import logging
import os
import math
import re
import shutil

import libs.common as common_lib
from libs.nnet3.train.dropout_schedule import *

logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())


class RunOpts(object):
    """Plain container for the run options used when launching commands.

    Holds run options like queue.pl and run.pl, along with their memory
    and parallel training options for various types of commands such
    as the ones for training, parallel-training, running on GPU etc.
    Every field starts out as None and is expected to be filled in by the
    code that creates the instance.
    """

    def __init__(self):
        # Job-submission wrapper prepended to every command.
        self.command = None
        # Options used for the training jobs.
        self.train_queue_opt = self.parallel_train_opts = None
        # Options used for the model-combination stage.
        self.combine_gpu_opt = self.combine_queue_opt = None
        # Options named for the prior stage.
        self.prior_gpu_opt = self.prior_queue_opt = None


def get_outputs_list(model_file, get_raw_nnet_from_am=True):
    """ Return the names of the output nodes of an nnet3 model as a list.

        The names are extracted from the 'output-node' lines printed by
        nnet3-am-info (when 'get_raw_nnet_from_am' is True, i.e. 'model_file'
        is an acoustic model) or by nnet3-info (for a raw model).
        It will normally return ['output'].
    """
    if get_raw_nnet_from_am:
        info_cmd = (
            "nnet3-am-info --print-args=false {0} | "
            "grep -e 'output-node' | cut -f2 -d' ' | cut -f2 -d'=' ")
    else:
        info_cmd = (
            "nnet3-info --print-args=false {0} | "
            "grep -e 'output-node' | cut -f2 -d' ' | cut -f2 -d'=' ")

    stdout = common_lib.get_command_stdout(info_cmd.format(model_file))
    # One whitespace-separated token per output node.
    return stdout.split()


def get_multitask_egs_opts(egs_dir, egs_prefix="",
                           archive_index=-1,
                           use_multitask_egs=False):
    """ Build the extra nnet3 copy-egs options for a multitask (or
        multilingual) training setup.

        Returns the empty string ('') if use_multitask_egs == False.
        Otherwise returns "<outputs-opt> <weights-opt>", where
          <outputs-opt> is '--outputs=ark:<file>' if
              {egs_dir}/{egs_prefix}output{suffix}.ark exists, else '';
          <weights-opt> is '--weights=ark:<file>' if
              {egs_dir}/{egs_prefix}weight{suffix}.ark exists, else ''.
        The two parts are always joined by a single space, so the result is
        ' ' when neither file exists.  {suffix} is '.{archive_index}', or
        empty when archive_index is -1.  For example:
        '--outputs=ark:foo/egs/output.3.ark --weights=ark:foo/egs/weight.3.ark'

        Caution: archive_index is usually an integer, but may be a string ("JOB")
        in some cases.
    """
    if not use_multitask_egs:
        return ""

    name_parts = dict(
        egs_dir=egs_dir,
        egs_prefix=egs_prefix,
        egs_suffix="" if archive_index == -1 else ".{0}".format(archive_index))

    # Each option is emitted only when its ark file is actually present.
    output_file_name = ("{egs_dir}/{egs_prefix}output{egs_suffix}.ark"
                        "".format(**name_parts))
    if os.path.isfile(output_file_name):
        output_rename_opt = "--outputs=ark:{output_file_name}".format(
            output_file_name=output_file_name)
    else:
        output_rename_opt = ""

    weight_file_name = ("{egs_dir}/{egs_prefix}weight{egs_suffix}.ark"
                        "".format(**name_parts))
    if os.path.isfile(weight_file_name):
        weight_opt = ("--weights=ark:{weight_file_name}"
                      "".format(weight_file_name=weight_file_name))
    else:
        weight_opt = ""

    return "{output_rename_opt} {weight_opt}".format(
        output_rename_opt=output_rename_opt,
        weight_opt=weight_opt)


def get_successful_models(num_models, log_file_pattern,
                          difference_threshold=1.0):
    """Select the per-job models that are good enough to be averaged.

    For each job n in 1..num_models, the log file obtained by replacing '%'
    in 'log_file_pattern' with n is scanned (from the end) for the last
    "Overall average objective function for 'output' is ..." line, and the
    last number of the reported expression is taken as that job's objective.
    A job whose log has no such line gets the objective -100000.0.

    Args:
        num_models: number of jobs; must be positive.
        log_file_pattern: log file path containing '%' as job placeholder.
        difference_threshold: a model is accepted if its objective is no
            more than this much below the best one.

    Returns:
        [accepted_models, best_model] where accepted_models is the list of
        1-based job indexes accepted for averaging and best_model is the
        1-based index of the job with the highest objective.
    """
    assert num_models > 0

    # Raw strings: '\-' is a regex escape, not a Python string escape.
    parse_regex = re.compile(
        r"LOG .* Overall average objective function for "
        r"'output' is ([0-9e.\-+= ]+) over ([0-9e.\-+]+) frames")
    objf = []
    for i in range(num_models):
        model_num = i + 1
        logfile = re.sub('%', str(model_num), log_file_pattern)
        with open(logfile, 'r') as f:
            lines = f.readlines()
        this_objf = -100000.0
        # we search from the end as this would result in
        # lesser number of regex searches. Python regex is slow !
        for line in reversed(lines):
            mat_obj = parse_regex.search(line)
            if mat_obj is not None:
                # The matched text may be an expression such as
                # "a + b = c"; the final token is the overall value.
                this_objf = float(mat_obj.group(1).split()[-1])
                break
        objf.append(this_objf)

    max_index = objf.index(max(objf))
    accepted_models = [i + 1 for i in range(num_models)
                       if (objf[max_index] - objf[i]) <= difference_threshold]

    if len(accepted_models) != num_models:
        logger.warning("Only {0}/{1} of the models have been accepted "
                       "for averaging, based on log files {2}.".format(
                           len(accepted_models),
                           num_models, log_file_pattern))

    return [accepted_models, max_index + 1]


def get_average_nnet_model(dir, iter, nnets_list, run_opts,
                           get_raw_nnet_from_am=True):
    """Average the per-job models of iteration 'iter' with nnet3-average.

    Args:
        dir: experiment directory.
        iter: current iteration; the result belongs to iteration iter + 1.
        nnets_list: space-separated string of the raw models to average.
        run_opts: run options; 'command' is used as the job wrapper.
        get_raw_nnet_from_am: if True, the averaged raw network is piped
            into nnet3-am-copy, which puts it into {dir}/{iter}.mdl and
            writes {dir}/{iter+1}.mdl.  If False, the averaged network is
            written directly to {dir}/{iter+1}.raw.

    The log is written to {dir}/log/average.{iter}.log.
    """

    next_iter = iter + 1
    if get_raw_nnet_from_am:
        # '\\|' renders as a literal '\|' so that the pipe is passed through
        # to the job wrapper ('\|' alone is an invalid Python escape).
        out_model = ("""- \\| nnet3-am-copy --set-raw-nnet=-  \
                        {dir}/{iter}.mdl {dir}/{next_iter}.mdl""".format(
                            dir=dir, iter=iter,
                            next_iter=next_iter))
    else:
        out_model = "{dir}/{next_iter}.raw".format(
            dir=dir, next_iter=next_iter)

    common_lib.execute_command(
        """{command} {dir}/log/average.{iter}.log \
                nnet3-average {nnets_list} \
                {out_model}""".format(command=run_opts.command,
                                      dir=dir,
                                      iter=iter,
                                      nnets_list=nnets_list,
                                      out_model=out_model))


def get_best_nnet_model(dir, iter, best_model_index, run_opts,
                        get_raw_nnet_from_am=True):
    """Copy the selected per-job nnet to become the model for iteration iter + 1.

    Runs nnet3-copy on {dir}/{iter+1}.{best_model_index}.raw through
    run_opts.command, logging to {dir}/log/select.{iter}.log.

    Args:
        dir: experiment directory holding the models and the log/ directory.
        iter: current iteration number; output is written for iter + 1.
        best_model_index: index used in the input file name
            {dir}/{iter+1}.{best_model_index}.raw.
        run_opts: object whose 'command' attribute is the job-launch prefix.
        get_raw_nnet_from_am: if True, the copied raw nnet is piped into
            nnet3-am-copy --set-raw-nnet, which reads {dir}/{iter}.mdl and
            writes {dir}/{iter+1}.mdl; otherwise the nnet is written directly
            to {dir}/{iter+1}.raw.
    """
    next_iter = iter + 1
    best_model = "{dir}/{next_iter}.{best_model_index}.raw".format(
        dir=dir,
        next_iter=next_iter,
        best_model_index=best_model_index)

    if get_raw_nnet_from_am:
        # '\\|' yields a literal backslash-pipe in the command string; a bare
        # '\|' would be an invalid escape sequence in Python.
        out_model = ("""- \\| nnet3-am-copy --set-raw-nnet=- \
                        {dir}/{iter}.mdl {dir}/{next_iter}.mdl""".format(
                            dir=dir, iter=iter, next_iter=next_iter))
    else:
        out_model = "{dir}/{next_iter}.raw".format(dir=dir,
                                                   next_iter=next_iter)

    common_lib.execute_command(
        """{command} {dir}/log/select.{iter}.log \
                nnet3-copy {best_model} \
                {out_model}""".format(command=run_opts.command,
                                      dir=dir, iter=iter,
                                      best_model=best_model,
                                      out_model=out_model))


def validate_chunk_width(chunk_width):
    """Validate a chunk-width string; returns a boolean.

    A valid chunk-width is a string representing either an integer, like
    '20', or a comma-separated list of integers like '20,30,16'.  Each
    integer must be >= 1, with -1 accepted as the only other value.
    Anything that is not a str is rejected.
    """
    if not isinstance(chunk_width, str):
        return False
    for elem in chunk_width.split(","):
        try:
            width = int(elem)
        except ValueError:
            # not an integer at all (this includes the empty string).
            return False
        if width < 1 and width != -1:
            return False
    return True


def principal_chunk_width(chunk_width):
    """Return the principal chunk-width of a chunk-width string, as an int.

    The principal chunk-width is the first element of the comma-separated
    list, e.g. 20 for "20" and 50 for "50,70,40".  Raises Exception if the
    string does not pass validate_chunk_width().
    """
    if validate_chunk_width(chunk_width):
        first_field, _, _ = chunk_width.partition(",")
        return int(first_field)
    raise Exception("Invalid chunk-width {0}".format(chunk_width))


def validate_range_str(range_str):
    """Helper function used inside validate_minibatch_size_str().

    Returns True if range_str is a comma-separated list of positive
    integers and colon-separated ranges of integers, like '128',
    '128,256', or '64:128,256'.  In a range 'lo:hi', lo must be positive
    and hi must not be smaller than lo.  Returns False otherwise, including
    when range_str is not a str.
    """
    if not isinstance(range_str, str):
        return False
    for r in range_str.split(","):
        # a range may be either e.g. '64', or '128:256'
        try:
            c = [int(x) for x in r.split(":")]
        except ValueError:
            return False
        # c should be either e.g. [ 128 ], or  [64,128].
        if len(c) == 1:
            if c[0] <= 0:
                return False
        elif len(c) == 2:
            if c[0] <= 0 or c[1] < c[0]:
                return False
        else:
            return False
    return True


def validate_minibatch_size_str(minibatch_size_str):
    """Validate a minibatch-size string (returns bool).

    A minibatch-size string might either be an integer, like '256',
    a comma-separated set of integers or ranges like '128,256' or
    '64:128,256',  or a rule like '128=64:128/256=32,64', whose format
    is: eg-length1=size-range1/eg-length2=size-range2/....
    where a size-range is a comma-separated list of either integers like '16'
    or ranges like '16:32'.  An arbitrary eg will be mapped to the size-range
    for the closest of the listed eg-lengths (the eg-length is defined
    as the number of input frames, including context frames).

    Anything that is not a str is rejected.
    """
    if not isinstance(minibatch_size_str, str):
        return False
    a = minibatch_size_str.split("/")

    for elem in a:
        b = elem.split('=')
        # We expect b to have length 2 in the normal case.
        if len(b) != 2:
            # one-element 'b' is OK if len(a) is 1 (so there is only
            # one choice)... this would mean somebody just gave "25"
            # or something like that for the minibatch size.
            if len(a) == 1 and len(b) == 1:
                return validate_range_str(elem)
            return False
        # check that the thing before the '=' sign is a positive integer
        try:
            if int(b[0]) <= 0:
                return False
        except ValueError:
            return False  # not an integer at all.

        if not validate_range_str(b[1]):
            return False
    return True


def halve_range_str(range_str):
    """Helper function used inside halve_minibatch_size_str().

    Returns range_str with every integer halved (integer division), with
    any result below one replaced by one; the ',' and ':' structure is kept.
    E.g. '16'->'8', '16,32'->'8,16', '64:128'->'32:64'.
    """

    def _halve(number_str):
        return str(max(1, int(number_str)//2))

    return ','.join(
        ":".join(_halve(bound) for bound in piece.split(":"))
        for piece in range_str.split(","))


def halve_minibatch_size_str(minibatch_size_str):
    """Halve a minibatch-size string accepted by validate_minibatch_size_str.

    Only the minibatch sizes are halved (via halve_range_str); the part
    before each '=' sign is copied through unchanged.  Raises Exception if
    the string does not pass validate_minibatch_size_str().
    """

    if not validate_minibatch_size_str(minibatch_size_str):
        raise Exception("Invalid minibatch-size string '{0}'".format(minibatch_size_str))

    halved_rules = []
    for rule in minibatch_size_str.split("/"):
        fields = rule.split('=')
        if len(fields) == 1:
            # No '=' sign: the rule is just a size-range.
            return halve_range_str(rule)
        assert len(fields) == 2
        eg_length, size_range = fields
        halved_rules.append(
            '{0}={1}'.format(eg_length, halve_range_str(size_range)))
    return '/'.join(halved_rules)


def copy_egs_properties_to_exp_dir(egs_dir, dir):
    """Copy the egs property files that exist in egs_dir into dir.

    Files that are not present in egs_dir are skipped.  An IOError during
    copying is logged and re-raised.
    """
    property_files = ('cmvn_opts', 'splice_opts', 'info/final.ie.id',
                      'final.mat', 'global_cmvn.stats', 'online_cmvn')
    try:
        for rel_path in property_files:
            src_path = '{dir}/{file}'.format(dir=egs_dir, file=rel_path)
            if not os.path.isfile(src_path):
                continue
            shutil.copy(src_path, dir)
    except IOError:
        logger.error("Error while trying to copy egs "
                     "property files to {dir}".format(dir=dir))
        raise


def parse_generic_config_vars_file(var_file):
    """Parse a file of 'name=value' lines into a dictionary.

    'model_left_context' / 'left_context' are stored as int under the key
    'model_left_context', and 'model_right_context' / 'right_context' as int
    under 'model_right_context'.  'num_hidden_layers' is not stored; a value
    greater than 1 raises an Exception.  Every other field is stored as a
    stripped string under its own name.

    Raises:
        Exception: if a line has no '=' sign, or a value that must be an
            integer cannot be parsed as one.
    """
    variables = {}
    try:
        # 'with' guarantees the handle is closed even if parsing fails.
        with open(var_file, 'r') as var_file_handle:
            for line in var_file_handle:
                parts = line.split('=')
                field_name = parts[0].strip()
                field_value = parts[1].strip()
                if field_name in ['model_left_context', 'left_context']:
                    variables['model_left_context'] = int(field_value)
                elif field_name in ['model_right_context', 'right_context']:
                    variables['model_right_context'] = int(field_value)
                elif field_name == 'num_hidden_layers':
                    if int(field_value) > 1:
                        raise Exception(
                            "You have num_hidden_layers={0} (real meaning: your config files "
                            "are intended to do discriminative pretraining).  Since Kaldi 5.2, "
                            "this is no longer supported --> use newer config-creation scripts, "
                            "i.e. xconfig_to_configs.py.".format(field_value))
                else:
                    variables[field_name] = field_value

        return variables
    except (ValueError, IndexError):
        # ValueError: a non-integer where an integer is required.
        # IndexError: a line without an '=' sign.
        # Both are reported by the exception raised below.
        pass

    raise Exception('Error while parsing the file {0}'.format(var_file))


def get_input_model_info(input_model):
    """ Returns a dictionary with the left/right model contexts of input_model.

        The keys are "model_left_context" and "model_right_context"; each is
        present only if the corresponding 'left-context' / 'right-context'
        line was found in the first four lines printed by nnet3-info.
        This function is useful when using the --trainer.input-model option
        instead of initializing the model using configs.
    """
    key_for_field = {'left-context': 'model_left_context',
                     'right-context': 'model_right_context'}
    info = {}
    try:
        stdout = common_lib.get_command_stdout(
            "nnet3-info {0} | head -4 ".format(input_model))
        # stdout looks like this
        # left-context: 7
        # right-context: 0
        # num-parameters: 90543902
        # modulus: 1
        for line in stdout.split("\n"):
            fields = line.split(":")
            if len(fields) == 2:
                field_name = fields[0].strip()
                if field_name in key_for_field:
                    info[key_for_field[field_name]] = int(fields[1].strip())

    except ValueError:
        # a value that is not an integer ends the parse; whatever was
        # collected so far is returned.
        pass
    return info


def _read_egs_info_line(egs_dir, name):
    """Return the first line of {egs_dir}/info/{name}, closing the file."""
    with open('{0}/info/{1}'.format(egs_dir, name)) as info_file:
        return info_file.readline()


def verify_egs_dir(egs_dir, feat_dim, ivector_dim, ivector_extractor_id,
                   left_context, right_context,
                   left_context_initial=-1, right_context_final=-1):
    """Check that the egs in egs_dir are compatible with the current setup.

    Reads the files under {egs_dir}/info and compares them with the
    arguments.

    Args:
        egs_dir: directory containing an 'info' sub-directory.
        feat_dim: expected feature dimension; 0 disables the feature
            dimension check.
        ivector_dim: expected ivector dimension (a missing or unreadable
            info/ivector_dim counts as 0).
        ivector_extractor_id: id of the ivector extractor, or None.
        left_context, right_context: minimum context the egs must have.
        left_context_initial, right_context_final: must equal the values
            in the egs exactly (a missing info file counts as -1).

    Returns:
        [egs_left_context, egs_right_context, frames_per_eg_str,
         num_archives]

    Raises:
        Exception: on any mismatch between the egs and the arguments, or
            an invalid frames_per_eg.
        IOError, ValueError: if a required info file is missing or
            malformed (logged, then re-raised).
    """
    # Errors that just mean "this optional info file is absent or unusable".
    optional_file_errors = (IOError, OSError, ValueError)
    try:
        egs_feat_dim = int(_read_egs_info_line(egs_dir, 'feat_dim'))

        egs_ivector_id = None
        try:
            egs_ivector_id = _read_egs_info_line(egs_dir,
                                                 'final.ie.id').strip()
            if egs_ivector_id == "":
                egs_ivector_id = None
        except optional_file_errors:
            # it could actually happen that the file is not there
            # for example in cases where the egs were dumped by
            # an older version of the script
            pass

        try:
            egs_ivector_dim = int(_read_egs_info_line(egs_dir, 'ivector_dim'))
        except optional_file_errors:
            egs_ivector_dim = 0
        egs_left_context = int(_read_egs_info_line(egs_dir, 'left_context'))
        egs_right_context = int(_read_egs_info_line(egs_dir, 'right_context'))
        try:
            egs_left_context_initial = int(
                _read_egs_info_line(egs_dir, 'left_context_initial'))
        except optional_file_errors:
            # older scripts didn't write this, treat it as -1 in that case.
            egs_left_context_initial = -1
        try:
            egs_right_context_final = int(
                _read_egs_info_line(egs_dir, 'right_context_final'))
        except optional_file_errors:
            # older scripts didn't write this, treat it as -1 in that case.
            egs_right_context_final = -1

        # if feat_dim was supplied as 0, it means the --feat-dir option was not
        # supplied to the script, so we simply don't know what the feature dim is.
        if (feat_dim != 0 and feat_dim != egs_feat_dim) or (ivector_dim != egs_ivector_dim):
            raise Exception("There is mismatch between featdim/ivector_dim of "
                            "the current experiment and the provided "
                            "egs directory")

        if (((egs_ivector_id is None) and (ivector_extractor_id is not None)) or
            ((egs_ivector_id is not None) and (ivector_extractor_id is None))):
            logger.warning("The ivector ids are used inconsistently. It's your "
                          "responsibility to make sure the ivector extractor "
                          "has been used consistently")
            logger.warning("ivector id for egs: {0} in dir {1}".format(egs_ivector_id, egs_dir))
            logger.warning("ivector id for extractor: {0}".format(ivector_extractor_id))
        elif ((egs_ivector_dim > 0) and (egs_ivector_id is None) and (ivector_extractor_id is None)):
            logger.warning("The ivector ids are not used. It's your "
                          "responsibility to make sure the ivector extractor "
                          "has been used consistently")
        elif ivector_extractor_id != egs_ivector_id:
            raise Exception("The egs were generated using a different ivector "
                            "extractor. id1 = {0}, id2={1}".format(
                                ivector_extractor_id, egs_ivector_id))

        if (egs_left_context < left_context or
            egs_right_context < right_context):
            raise Exception('The egs have insufficient (l,r) context ({0},{1}) '
                            'versus expected ({2},{3})'.format(
                                egs_left_context, egs_right_context,
                                left_context, right_context))

        # the condition on the initial/final context is an equality condition,
        # not an inequality condition, as there is no mechanism to 'correct' the
        # context (by subtracting context) while copying the egs, like there is
        # for the regular left-right context.  If the user is determined to use
        # previously dumped egs, they may be able to slightly adjust the
        # --egs.chunk-left-context-initial and --egs.chunk-right-context-final
        # options to make things matched up.  [note: the model l/r context gets
        # added in, so you have to correct for changes in that.]
        if (egs_left_context_initial != left_context_initial or
            egs_right_context_final != right_context_final):
            raise Exception('The egs have incorrect initial/final (l,r) context '
                            '({0},{1}) versus expected ({2},{3}).  See code from '
                            'where this exception was raised for more info'.format(
                                egs_left_context_initial, egs_right_context_final,
                                left_context_initial, right_context_final))

        frames_per_eg_str = _read_egs_info_line(egs_dir,
                                                'frames_per_eg').rstrip()
        if not validate_chunk_width(frames_per_eg_str):
            raise Exception("Invalid frames_per_eg in directory {0}/info".format(
                    egs_dir))
        num_archives = int(_read_egs_info_line(egs_dir, 'num_archives'))

        return [egs_left_context, egs_right_context,
                frames_per_eg_str, num_archives]
    except (IOError, ValueError):
        logger.error("The egs dir {0} has missing or "
                     "malformed files.".format(egs_dir))
        raise


def compute_presoftmax_prior_scale(dir, alidir, num_jobs, run_opts,
                                   presoftmax_prior_scale_power=-0.25):
    """Compute the presoftmax prior scale vector and write it under dir.

    Steps:
      1. For each of num_jobs jobs, run ali-to-post on
         {alidir}/ali.JOB.gz piped into post-to-tacc --per-pdf=true,
         writing {dir}/pdf_counts.JOB.
      2. Sum the per-job counts into {dir}/pdf_counts with vector-sum and
         delete the per-job files.
      3. Smooth and scale the counts with
         smooth_presoftmax_prior_scale_vector() (smooth=0.01).
      4. Write the result to {dir}/presoftmax_prior_scale.vec and link it
         at {dir}/configs/presoftmax_prior_scale.vec.

    Args:
        dir: experiment directory (must contain log/ and configs/).
        alidir: alignment directory with ali.JOB.gz and final.mdl.
        num_jobs: number of alignment jobs / archives.
        run_opts: object whose 'command' attribute is the job-launch prefix.
        presoftmax_prior_scale_power: exponent applied to the smoothed
            counts.
    """

    # getting the raw pdf count
    # ('\\|' yields a literal backslash-pipe in the command string; a bare
    # '\|' would be an invalid escape sequence in Python.)
    common_lib.execute_command(
        """{command} JOB=1:{num_jobs} {dir}/log/acc_pdf.JOB.log \
                ali-to-post "ark:gunzip -c {alidir}/ali.JOB.gz|" ark:- \\| \
                post-to-tacc --per-pdf=true  {alidir}/final.mdl ark:- \
                {dir}/pdf_counts.JOB""".format(command=run_opts.command,
                                               num_jobs=num_jobs,
                                               dir=dir,
                                               alidir=alidir))

    common_lib.execute_command(
        """{command} {dir}/log/sum_pdf_counts.log \
                vector-sum --binary=false {dir}/pdf_counts.* {dir}/pdf_counts \
        """.format(command=run_opts.command, dir=dir))

    # the per-job files are no longer needed once they have been summed.
    for file in glob.glob('{0}/pdf_counts.*'.format(dir)):
        os.remove(file)
    pdf_counts = common_lib.read_kaldi_matrix('{0}/pdf_counts'.format(dir))[0]
    scaled_counts = smooth_presoftmax_prior_scale_vector(
        pdf_counts,
        presoftmax_prior_scale_power=presoftmax_prior_scale_power,
        smooth=0.01)

    output_file = "{0}/presoftmax_prior_scale.vec".format(dir)
    common_lib.write_kaldi_matrix(output_file, [scaled_counts])
    common_lib.force_symlink("../presoftmax_prior_scale.vec",
                             "{0}/configs/presoftmax_prior_scale.vec".format(
                                dir))


def smooth_presoftmax_prior_scale_vector(pdf_counts,
                                         presoftmax_prior_scale_power=-0.25,
                                         smooth=0.01):
    """Turn raw pdf counts into prior scales that average to one.

    Each count is smoothed by adding smooth * (average count), raised to
    presoftmax_prior_scale_power, and the resulting vector is renormalized
    so that its elements sum to len(pdf_counts).

    Args:
        pdf_counts: non-empty sequence of counts.
        presoftmax_prior_scale_power: exponent applied to the smoothed
            counts.
        smooth: fraction of the average count added to every count.

    Returns:
        A list of floats of the same length as pdf_counts.
    """
    num_pdfs = len(pdf_counts)
    average_count = float(sum(pdf_counts)) / num_pdfs
    scales = [math.pow(count + smooth * average_count,
                       presoftmax_prior_scale_power)
              for count in pdf_counts]
    # computed once, rather than once per element of the result.
    scales_total = sum(scales)
    return [x * float(num_pdfs) / scales_total for x in scales]


def prepare_initial_network(dir, run_opts, srand=-3, input_model=None):
    """Create the initial network {dir}/0.raw.

    If input_model is given it is simply copied to {dir}/0.raw.  Otherwise
    nnet3-init is run through run_opts.command with
    {dir}/configs/final.config; when {dir}/configs/init.config exists,
    {dir}/init.raw is passed to nnet3-init as the starting network.
    """
    if input_model is not None:
        shutil.copy(input_model, "{0}/0.raw".format(dir))
        return

    if os.path.exists(dir+"/configs/init.config"):
        cmd_template = """{command} {dir}/log/add_first_layer.log \
                    nnet3-init --srand={srand} {dir}/init.raw \
                    {dir}/configs/final.config {dir}/0.raw"""
    else:
        cmd_template = """{command} {dir}/log/init_model.log \
           nnet3-init --srand={srand} {dir}/configs/final.config {dir}/0.raw"""

    common_lib.execute_command(
        cmd_template.format(command=run_opts.command, srand=srand, dir=dir))


def get_model_combine_iters(num_iters, num_epochs,
                            num_archives, max_models_combine,
                            num_jobs_final):
    """ Returns the set of iterations whose models are used in the final
        model-averaging phase.  (note: it's a weighted average where the
        weights are worked out from a subset of training data.)

        The candidate window covers the last
            min(int(iters_per_epoch_final / 2) + 1, int(num_iters / 2))
        iterations, where iters_per_epoch_final is
        num_archives / num_jobs_final: half an epoch (plus one so it is
        never zero), capped at half of all iterations.  If that window is
        larger than max_models_combine it is subsampled with a fixed stride
        and the last iteration is always included; otherwise the last
        min(max_models_combine, num_iters // 2) iterations are used."""

    iters_per_epoch_final = float(num_archives) / num_jobs_final
    half_epoch_iters = int(iters_per_epoch_final/2) + 1
    half_of_all_iters = int(num_iters/2)
    candidate_count = min(half_epoch_iters, half_of_all_iters)

    if candidate_count <= max_models_combine:
        window = min(max_models_combine, num_iters//2)
        return set(range(num_iters - window + 1, num_iters + 1))

    # too many candidates: keep every stride-th model of the window.
    stride = int(float(candidate_count) / max_models_combine)
    selected = set(range(num_iters - candidate_count + 1,
                         num_iters + 1, stride))
    selected.add(num_iters)
    return selected


def get_current_num_jobs(it, num_it, start, step, end):
    """Get number of jobs for iteration number 'it' of range('num_it').

    The ideal value is interpolated linearly from 'start' towards 'end'.
    Below 'step' it is rounded to the nearest integer; otherwise it is
    rounded to the nearest multiple of 'step'.
    """
    target = float(start) + (end - start) * float(it) / num_it
    if target >= step:
        return step * int(0.5 + target / step)
    return int(0.5 + target)


def get_learning_rate(iter, num_jobs, num_iters, num_archives_processed,
                      num_archives_to_process,
                      initial_effective_lrate, final_effective_lrate):
    """Return the learning rate for iteration 'iter'.

    The effective learning rate decays exponentially from
    initial_effective_lrate to final_effective_lrate as
    num_archives_processed goes from 0 to num_archives_to_process; from the
    last iteration onwards it is final_effective_lrate.  The returned value
    is the effective rate multiplied by num_jobs.
    """
    if iter + 1 < num_iters:
        log_ratio = math.log(
            float(final_effective_lrate) / initial_effective_lrate)
        decay = math.exp(
            num_archives_processed * log_ratio / num_archives_to_process)
        effective_rate = initial_effective_lrate * decay
    else:
        effective_rate = final_effective_lrate

    return num_jobs * effective_rate


def should_do_shrinkage(iter, model_file, shrink_saturation_threshold,
                        get_raw_nnet_from_am=True):
    """Decide whether shrinkage should be applied on this iteration.

    Always True on iteration 0.  Otherwise the info of model_file
    (nnet3-am-info if get_raw_nnet_from_am, else nnet3-info) is piped into
    steps/nnet3/get_saturation.pl, and the result is True when the reported
    saturation exceeds shrink_saturation_threshold.

    Raises:
        Exception: if the script's output is not a single number in [0, 1].
    """

    if iter == 0:
        return True

    if get_raw_nnet_from_am:
        output = common_lib.get_command_stdout(
            "nnet3-am-info {0} 2>/dev/null | "
            "steps/nnet3/get_saturation.pl".format(model_file))
    else:
        output = common_lib.get_command_stdout(
            "nnet3-info 2>/dev/null {0} | "
            "steps/nnet3/get_saturation.pl".format(model_file))
    output = output.strip().split("\n")
    try:
        # explicit checks rather than 'assert', so that the validation is
        # still performed when Python runs with -O.
        if len(output) != 1:
            raise ValueError("expected exactly one line of output")
        saturation = float(output[0])
        if not 0 <= saturation <= 1:
            raise ValueError("saturation is outside [0, 1]")
    except ValueError:
        raise Exception("Something went wrong, could not get "
                        "saturation from the output '{0}' of "
                        "get_saturation.pl on the info of "
                        "model {1}".format(output, model_file))
    return saturation > shrink_saturation_threshold


def remove_nnet_egs(egs_dir):
    """Run steps/nnet2/remove_egs.sh on egs_dir."""
    cleanup_cmd = "steps/nnet2/remove_egs.sh {egs_dir}".format(
        egs_dir=egs_dir)
    common_lib.execute_command(cleanup_cmd)


def clean_nnet_dir(nnet_dir, num_iters, egs_dir,
                   preserve_model_interval=100,
                   remove_egs=True,
                   get_raw_nnet_from_am=True):
    """Clean up after training.

    Optionally removes the egs (remove_nnet_egs), then calls remove_model()
    for every iteration in range(num_iters), so that only the models at
    multiples of preserve_model_interval are kept.  IOError / OSError are
    logged and re-raised.
    """
    try:
        if remove_egs:
            remove_nnet_egs(egs_dir)

        for model_iter in range(num_iters):
            remove_model(nnet_dir, model_iter, num_iters,
                         models_to_combine=None,
                         preserve_model_interval=preserve_model_interval,
                         get_raw_nnet_from_am=get_raw_nnet_from_am)
    except (IOError, OSError):
        logger.error("Error while cleaning up the nnet directory")
        raise


def remove_model(nnet_dir, iter, num_iters, models_to_combine=None,
                 preserve_model_interval=100,
                 get_raw_nnet_from_am=True):
    """Delete the model file of iteration 'iter', unless it must be kept.

    A model is kept when 'iter' is a multiple of preserve_model_interval or
    is listed in models_to_combine.  The file removed is
    {nnet_dir}/{iter}.mdl if get_raw_nnet_from_am, else
    {nnet_dir}/{iter}.raw; a file that does not exist is ignored.
    num_iters is not used.
    """
    if (iter % preserve_model_interval == 0
            or (models_to_combine is not None
                and iter in models_to_combine)):
        return

    extension = 'mdl' if get_raw_nnet_from_am else 'raw'
    model_path = '{0}/{1}.{2}'.format(nnet_dir, iter, extension)
    if os.path.isfile(model_path):
        os.remove(model_path)


def positive_int(arg):
    """argparse 'type' callable: convert arg to an int that must be > 0.

    Raises argparse.ArgumentTypeError if the value is zero or negative;
    a value that int() cannot convert raises int()'s own error.
    """
    parsed = int(arg)
    if parsed > 0:
        return parsed
    raise argparse.ArgumentTypeError("must be positive int: '%s'" % arg)


class CommonParser(object):
    """Parser for parsing common options related to nnet3 training.

    This argument parser adds common options related to nnet3 training
    such as egs creation, training optimization options.
    These are used in the nnet3 train scripts
    in steps/nnet3/train*.py and steps/nnet3/chain/train.py

    The populated argparse.ArgumentParser is available as the ``parser``
    attribute of the instance.  It is created with add_help=False, which
    is what argparse expects of a parser passed through ``parents=[...]``.

    Each instance owns its own parser.  (A parser shared at class level
    would make a second instantiation in the same process fail, because
    argparse rejects re-registration of an existing option string.)

    Arguments:
        include_chunk_context: if true, the --egs.chunk-*-context options
            are registered as well.
        default_chunk_left_context: default value of the
            --egs.chunk-left-context option.
    """

    def __init__(self,
                 include_chunk_context=True,
                 default_chunk_left_context=0):
        # Per-instance parser; see the class docstring for the rationale.
        self.parser = argparse.ArgumentParser(add_help=False)

        # feat options
        self.parser.add_argument("--feat.online-ivector-dir", type=str,
                                 dest='online_ivector_dir', default=None,
                                 action=common_lib.NullstrToNoneAction,
                                 help="""directory with the ivectors extracted
                                 in an online fashion.""")
        self.parser.add_argument("--feat.cmvn-opts", type=str,
                                 dest='cmvn_opts', default=None,
                                 action=common_lib.NullstrToNoneAction,
                                 help="A string specifying '--norm-means' "
                                 "and '--norm-vars' values")

        # egs extraction options.  there is no point adding the chunk context
        # option for non-RNNs (by which we mean basic TDNN-type topologies), as
        # it wouldn't affect anything, so we disable them if we know in advance
        # that we're not supporting RNN-type topologies (as in train_dnn.py).
        if include_chunk_context:
            self.parser.add_argument("--egs.chunk-left-context", type=int,
                                     dest='chunk_left_context',
                                     default=default_chunk_left_context,
                                     help="""Number of additional frames of input
                                 to the left of the input chunk. This extra
                                 context will be used in the estimation of RNN
                                 state before prediction of the first label. In
                                 the case of FF-DNN this extra context will be
                                 used to allow for frame-shifts""")
            self.parser.add_argument("--egs.chunk-right-context", type=int,
                                     dest='chunk_right_context', default=0,
                                     help="""Number of additional frames of input
                                     to the right of the input chunk. This extra
                                     context will be used in the estimation of
                                     bidirectional RNN state before prediction of
                                 the first label.""")
            self.parser.add_argument("--egs.chunk-left-context-initial", type=int,
                                     dest='chunk_left_context_initial', default=-1,
                                     help="""Number of additional frames of input
                                 to the left of the *first* input chunk extracted
                                 from an utterance.  If negative, defaults to
                                 the same as --egs.chunk-left-context""")
            self.parser.add_argument("--egs.chunk-right-context-final", type=int,
                                     dest='chunk_right_context_final', default=-1,
                                     help="""Number of additional frames of input
                                 to the right of the *last* input chunk extracted
                                 from an utterance.  If negative, defaults to the
                                 same as --egs.chunk-right-context""")
        self.parser.add_argument("--egs.dir", type=str, dest='egs_dir',
                                 default=None,
                                 action=common_lib.NullstrToNoneAction,
                                 help="""Directory with egs. If specified this
                                 directory will be used rather than extracting
                                 egs""")
        self.parser.add_argument("--egs.stage", type=int, dest='egs_stage',
                                 default=0,
                                 help="Stage at which get_egs.sh should be "
                                 "restarted")
        self.parser.add_argument("--egs.opts", type=str, dest='egs_opts',
                                 default=None,
                                 action=common_lib.NullstrToNoneAction,
                                 help="""String to provide options directly
                                 to steps/nnet3/get_egs.sh script""")

        # trainer options
        self.parser.add_argument("--trainer.srand", type=int, dest='srand',
                                 default=0,
                                 help="""Sets the random seed for model
                                 initialization and egs shuffling.
                                 Warning: This random seed does not control all
                                 aspects of this experiment.  There might be
                                 other random seeds used in other stages of the
                                 experiment like data preparation (e.g. volume
                                 perturbation).""")
        self.parser.add_argument("--trainer.num-epochs", type=float,
                                 dest='num_epochs', default=8.0,
                                 help="Number of epochs to train the model")
        self.parser.add_argument("--trainer.shuffle-buffer-size", type=int,
                                 dest='shuffle_buffer_size', default=5000,
                                 help=""" Controls randomization of the samples
                                 on each iteration. If 0 or a large value the
                                 randomization is complete, but this will
                                 consume memory and cause spikes in disk I/O.
                                 Smaller is easier on disk and memory but less
                                 random.  It's not a huge deal though, as
                                 samples are anyway randomized right at the
                                 start.  (the point of this is to get data in
                                 different minibatches on different iterations,
                                 since in the preconditioning method, 2 samples
                                 in the same minibatch can affect each others'
                                 gradients.""")
        self.parser.add_argument("--trainer.max-param-change", type=float,
                                 dest='max_param_change', default=2.0,
                                 help="""The maximum change in parameters
                                 allowed per minibatch, measured in Frobenius
                                 norm over the entire model""")
        self.parser.add_argument("--trainer.samples-per-iter", type=int,
                                 dest='samples_per_iter', default=400000,
                                 help="This is really the number of egs in "
                                 "each archive.")
        self.parser.add_argument("--trainer.lda.rand-prune", type=float,
                                 dest='rand_prune', default=4.0,
                                 help="Value used in preconditioning "
                                 "matrix estimation")
        self.parser.add_argument("--trainer.lda.max-lda-jobs", type=int,
                                 dest='max_lda_jobs', default=10,
                                 help="Max number of jobs used for "
                                 "LDA stats accumulation")
        self.parser.add_argument("--trainer.presoftmax-prior-scale-power",
                                 type=float,
                                 dest='presoftmax_prior_scale_power',
                                 default=-0.25,
                                 help="Scale on presofmax prior")
        self.parser.add_argument("--trainer.optimization.proportional-shrink", type=float,
                                 dest='proportional_shrink', default=0.0,
                                 help="""If nonzero, this will set a shrinkage (scaling)
                        factor for the parameters, whose value is set as:
                        shrink-value=(1.0 - proportional-shrink * learning-rate), where
                        'learning-rate' is the learning rate being applied
                        on the current iteration, which will vary from
                        initial-effective-lrate*num-jobs-initial to
                        final-effective-lrate*num-jobs-final.
                        Unlike for train_rnn.py, this is applied unconditionally,
                        it does not depend on saturation of nonlinearities.
                        Can be used to roughly approximate l2 regularization.""")

        # Parameters for the optimization
        self.parser.add_argument(
            "--trainer.optimization.initial-effective-lrate", type=float,
            dest='initial_effective_lrate', default=0.0003,
            help="Learning rate used during the initial iteration")
        self.parser.add_argument(
            "--trainer.optimization.final-effective-lrate", type=float,
            dest='final_effective_lrate', default=0.00003,
            help="Learning rate used during the final iteration")
        self.parser.add_argument("--trainer.optimization.num-jobs-initial",
                                 type=int, dest='num_jobs_initial', default=1,
                                 help="Number of neural net jobs to run in "
                                 "parallel at the start of training")
        self.parser.add_argument("--trainer.optimization.num-jobs-final",
                                 type=int, dest='num_jobs_final', default=8,
                                 help="Number of neural net jobs to run in "
                                 "parallel at the end of training")
        self.parser.add_argument("--trainer.optimization.num-jobs-step",
            type=positive_int,  metavar='N', dest='num_jobs_step', default=1,
            help="""Number of jobs increment, when exceeds this number. For
            example, if N=3, the number of jobs may progress as 1, 2, 3, 6, 9...""")
        self.parser.add_argument("--trainer.optimization.max-models-combine",
                                 "--trainer.max-models-combine",
                                 type=int, dest='max_models_combine',
                                 default=20,
                                 help="""The maximum number of models used in
                                 the final model combination stage.  These
                                 models will themselves be averages of
                                 iteration-number ranges""")
        self.parser.add_argument("--trainer.optimization.max-objective-evaluations",
                                 "--trainer.max-objective-evaluations",
                                 type=int, dest='max_objective_evaluations',
                                 default=30,
                                 help="""The maximum number of objective
                                 evaluations in order to figure out the
                                 best number of models to combine. It helps to
                                 speedup if the number of models provided to the
                                 model combination binary is quite large (e.g.
                                 several hundred).""")
        self.parser.add_argument("--trainer.optimization.do-final-combination",
                                 dest='do_final_combination', type=str,
                                 action=common_lib.StrToBoolAction,
                                 choices=["true", "false"], default=True,
                                 help="""Set this to false to disable the final
                                 'combine' stage (in this case we just use the
                                 last-numbered model as the final.mdl).""")
        self.parser.add_argument("--trainer.optimization.combine-sum-to-one-penalty",
                                 type=float, dest='combine_sum_to_one_penalty', default=0.0,
                                 help="""This option is deprecated and does nothing.""")
        self.parser.add_argument("--trainer.optimization.momentum", type=float,
                                 dest='momentum', default=0.0,
                                 help="""Momentum used in update computation.
                                 Note: we implemented it in such a way that it
                                 doesn't increase the effective learning
                                 rate.""")
        self.parser.add_argument("--trainer.dropout-schedule", type=str,
                                 action=common_lib.NullstrToNoneAction,
                                 dest='dropout_schedule', default=None,
                                 help="""Use this to specify the dropout
                                 schedule.  You specify a piecewise linear
                                 function on the domain [0,1], where 0 is the
                                 start and 1 is the end of training; the
                                 function-argument (x) rises linearly with the
                                 amount of data you have seen, not iteration
                                 number (this improves invariance to
                                 num-jobs-{initial-final}).  E.g. '0,0.2,0'
                                 means 0 at the start; 0.2 after seeing half
                                 the data; and 0 at the end.  You may specify
                                 the x-value of selected points, e.g.
                                 '0,0.2@0.25,0' means that the 0.2
                                 dropout-proportion is reached a quarter of the
                                 way through the data.   The start/end x-values
                                 are at x=0/x=1, and other unspecified x-values
                                 are interpolated between known x-values.  You
                                 may specify different rules for different
                                 component-name patterns using 'pattern1=func1
                                 pattern2=func2', e.g. 'relu*=0,0.1,0
                                 lstm*=0,0.2,0'.  More general should precede
                                 less general patterns, as they are applied
                                 sequentially.""")
        self.parser.add_argument("--trainer.add-option", type=str,
                                 dest='train_opts', action='append', default=[],
                                 help="""You can use this to add arbitrary options that
                                 will be passed through to the core training code (nnet3-train
                                 or nnet3-chain-train)""")
        self.parser.add_argument("--trainer.optimization.backstitch-training-scale",
                                 type=float, dest='backstitch_training_scale',
                                 default=0.0, help="""scale of parameters changes
                                 used in backstitch training step.""")
        self.parser.add_argument("--trainer.optimization.backstitch-training-interval",
                                 type=int, dest='backstitch_training_interval',
                                 default=1, help="""the interval of minibatches
                                 that backstitch training is applied on.""")
        self.parser.add_argument("--trainer.compute-per-dim-accuracy",
                                 dest='compute_per_dim_accuracy',
                                 type=str, choices=['true', 'false'],
                                 default=False,
                                 action=common_lib.StrToBoolAction,
                                 help="Compute train and validation "
                                 "accuracy per-dim")

        # General options
        self.parser.add_argument("--stage", type=int, default=-4,
                                 help="Specifies the stage of the experiment "
                                 "to execution from")
        self.parser.add_argument("--exit-stage", type=int, default=None,
                                 help="If specified, training exits before "
                                 "running this stage")
        self.parser.add_argument("--cmd", type=str, dest="command",
                                 action=common_lib.NullstrToNoneAction,
                                 help="""Specifies the script to launch jobs.
                                 e.g. queue.pl for launching on SGE cluster
                                        run.pl for launching on local machine
                                 """, default="queue.pl")
        self.parser.add_argument("--egs.cmd", type=str, dest="egs_command",
                                 action=common_lib.NullstrToNoneAction,
                                 help="Script to launch egs jobs")
        self.parser.add_argument("--use-gpu", type=str,
                                 choices=["true", "false", "yes", "no", "wait"],
                                 help="Use GPU for training. "
                                 "Note 'true' and 'false' are deprecated.",
                                 default="yes")
        self.parser.add_argument("--cleanup", type=str,
                                 action=common_lib.StrToBoolAction,
                                 choices=["true", "false"], default=True,
                                 help="Clean up models after training")
        self.parser.add_argument("--cleanup.remove-egs", type=str,
                                 dest='remove_egs', default=True,
                                 action=common_lib.StrToBoolAction,
                                 choices=["true", "false"],
                                 help="If true, remove egs after experiment")
        self.parser.add_argument("--cleanup.preserve-model-interval",
                                 dest="preserve_model_interval",
                                 type=int, default=100,
                                 help="""Determines iterations for which models
                                 will be preserved during cleanup.
                                 If mod(iter,preserve_model_interval) == 0
                                 model will be preserved.""")

        self.parser.add_argument("--reporting.email", dest="email",
                                 type=str, default=None,
                                 action=common_lib.NullstrToNoneAction,
                                 help=""" Email-id to report about the progress
                                 of the experiment.  NOTE: It assumes the
                                 machine on which the script is being run can
                                 send emails from command line via. mail
                                 program. The Kaldi mailing list will not
                                 support this feature.  It might require local
                                 expertise to setup. """)
        self.parser.add_argument("--reporting.interval",
                                 dest="reporting_interval",
                                 type=float, default=0.1,
                                 help="""Frequency with which reports have to
                                 be sent, measured in terms of fraction of
                                 iterations.
                                 If 0 and reporting mail has been specified
                                 then only failure notifications are sent""")


import unittest

class SelfTest(unittest.TestCase):
    """Unit tests for the minibatch-size / chunk-width / num-jobs helpers
    defined in this module."""

    def test_halve_minibatch_size_str(self):
        """Every number in a minibatch-size descriptor is halved, with a
        floor of 1; the 'N=' keys are left untouched."""
        self.assertEqual('32', halve_minibatch_size_str('64'))
        self.assertEqual('32,8:16', halve_minibatch_size_str('64,16:32'))
        self.assertEqual('1', halve_minibatch_size_str('1'))
        self.assertEqual('128=32/256=20,40:50', halve_minibatch_size_str('128=64/256=40,80:100'))


    def test_validate_chunk_width(self):
        """Single values and comma-separated lists are valid chunk widths."""
        for s in [ '64', '64,25,128' ]:
            self.assertTrue(validate_chunk_width(s), s)


    def test_validate_minibatch_size_str(self):
        """Well-formed descriptors are accepted, malformed ones rejected."""
        # Good descriptors.
        for s in [ '32', '32,64', '1:32', '1:32,64', '64,1:32', '1:5,10:15',
                   '128=64:128/256=32,64', '1=2/3=4', '1=1/2=2/3=3/4=4' ]:
            self.assertTrue(validate_minibatch_size_str(s), s)
        # Bad descriptors.
        # NOTE: '12=1:2//13=1:3' and '14=/15=15' are two separate cases; a
        # missing comma between them would silently merge them into one
        # string through implicit literal concatenation.
        for s in [ None, 42, (43,), '', '1:', ':2', '3,', ',4', '5:6,', ',7:8',
                   '9=', '10=10/', '11=11/11', '12=1:2//13=1:3', '14=/15=15',
                   '16/17=17', '/18=18', '/18', '//19', '/' ]:
            self.assertFalse(validate_minibatch_size_str(s), s)


    def test_get_current_num_jobs(self):
        """The job count grows from the initial value towards the final one
        over the iterations, honouring the requested step."""
        niters = 12
        self.assertEqual([2, 3, 3, 4, 4, 5, 6, 6, 7, 7, 8, 8],
                         [get_current_num_jobs(i, niters, 2, 1, 9)
                              for i in range(niters)])
        self.assertEqual([2, 3, 3, 3, 3, 6, 6, 6, 6, 6, 9, 9],
                         [get_current_num_jobs(i, niters, 2, 3, 9)
                              for i in range(niters)])


# When this file is executed directly (not imported), run the unittest
# test cases defined in this module.
if __name__ == '__main__':
    unittest.main()


================================================
FILE: egs/steps/libs/nnet3/train/dropout_schedule.py
================================================
#! /usr/bin/env python

# Copyright 2016    Vimal Manohar
# Apache 2.0

"""This module contains methods related to scheduling dropout.
See _self_test() for examples of how the functions work.
"""

import logging

# Module-level logger.  A NullHandler (which discards records) is attached
# here; presumably the calling training script configures the real handlers.
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())


# Debug switch: when set to True, the parsed dropout schedules and the
# dropout proportions chosen on each iteration are written to the log.
_debug_dropout = False

def _parse_dropout_option(dropout_option):
    """Parses the string option to --trainer.dropout-schedule and
    returns a list of dropout schedules for different component name patterns.
    Calls _parse_dropout_string() function for each component name pattern
    in the option.

    Arguments:
        dropout_option: The string option passed to --trainer.dropout-schedule.
            See its help for details.
            See _self_test() for examples.
        num_archive_to_process: See _parse_dropout_string() for details.

    Returns a list of (component_name, dropout_schedule) tuples,
    where dropout_schedule is itself a list of
    (data_fraction, dropout_proportion) tuples sorted in reverse order of
    data_fraction.
    A data fraction of 0 corresponds to beginning of training
    and 1 corresponds to all data.
    """
    components = dropout_option.strip().split(' ')
    dropout_schedule = []
    for component in components:
        parts = component.split('=')

        if len(parts) == 2:
            component_name = parts[0]
            this_dropout_str = parts[1]
        elif len(parts) == 1:
            component_name = '*'
            this_dropout_str = parts[0]
        else:
            raise Exception("The dropout schedule must be specified in the "
                            "format 'pattern1=func1 patter2=func2' where "
                            "the pattern can be omitted for a global function "
                            "for all components.\n"
                            "Got {0} in {1}".format(component, dropout_option))

        this_dropout_values = _parse_dropout_string(this_dropout_str)
        dropout_schedule.append((component_name, this_dropout_values))

    if _debug_dropout:
        logger.info("Dropout schedules for component names is as follows:")
        logger.info("<component-name-pattern>: [(num_archives_processed), "
                    "(dropout_proportion) ...]")
        for name, schedule in dropout_schedule:
            logger.info("{0}: {1}".format(name, schedule))

    return dropout_schedule


def _parse_dropout_string(dropout_str):
    """Parses the dropout schedule from the string corresponding to a
    single component in --trainer.dropout-schedule.
    This is a module-internal function called by parse_dropout_function().

    Arguments:
        dropout_str: Specifies dropout schedule for a particular component
            name pattern.
            See help for the option --trainer.dropout-schedule.

    Returns a list of (data_fraction_processed, dropout_proportion) tuples
    sorted in descending order of num_archives_processed.
    A data fraction of 1 corresponds to all data.
    """
    dropout_values = []
    parts = dropout_str.strip().split(',')

    try:
        if len(parts) < 2:
            raise Exception("dropout proportion string must specify "
                            "at least the start and end dropouts")

        # Starting dropout proportion
        dropout_values.append((0, float(parts[0])))
        for i in range(1, len(parts) - 1):
            value_x_pair = parts[i].split('@')
            if len(value_x_pair) == 1:
                # Dropout proportion at half of training
                dropout_proportion = float(value_x_pair[0])
                data_fraction = 0.5
            else:
                assert len(value_x_pair) == 2

                dropout_proportion = float(value_x_pair[0])
                data_fraction = float(value_x_pair[1])

            if (data_fraction < dropout_values[-1][0]
                    or data_fraction > 1.0):
                logger.error(
                    "Failed while parsing value %s in dropout-schedule. "
                    "dropout-schedule must be in incresing "
                    "order of data fractions.", value_x_pair)
                raise ValueError

            dropout_values.append((data_fraction, float(dropout_proportion)))

        dropout_values.append((1.0, float(parts[-1])))
    except Exception:
        logger.error("Unable to parse dropout proportion string %s. "
                     "See help for option "
                     "--trainer.dropout-schedule.", dropout_str)
        raise

    # reverse sort so that its easy to retrieve the dropout proportion
    # for a particular data fraction
    dropout_values.reverse()
    for data_fraction, proportion in dropout_values:
        assert data_fraction <= 1.0 and data_fraction >= 0.0
        assert proportion <= 1.0 and proportion >= 0.0

    return dropout_values


def _get_component_dropout(dropout_schedule, data_fraction):
    """Retrieve dropout proportion from schedule when data_fraction
    proportion of data is seen. This value is obtained by using a
    piecewise linear function on the dropout schedule.
    This is a module-internal function called by _get_dropout_proportions().

    See help for --trainer.dropout-schedule for how the dropout value
    is obtained from the options.

    Arguments:
        dropout_schedule: A list of (data_fraction, dropout_proportion) values
            sorted in descending order of data_fraction.
        data_fraction: The fraction of data seen until this stage of
            training.
    """
    if data_fraction == 0:
        # Dropout at start of the iteration is in the last index of
        # dropout_schedule
        assert dropout_schedule[-1][0] == 0
        return dropout_schedule[-1][1]
    try:
        # Find lower bound of the data_fraction. This is the
        # lower end of the piecewise linear function.
        (dropout_schedule_index, initial_data_fraction,
         initial_dropout) = next((i, tup[0], tup[1])
                                 for i, tup in enumerate(dropout_schedule)
                                 if tup[0] <= data_fraction)
    except StopIteration:
        raise RuntimeError(
            "Could not find data_fraction in dropout schedule "
            "corresponding to data_fraction {0}.\n"
            "Maybe something wrong with the parsed "
            "dropout schedule {1}.".format(data_fraction, dropout_schedule))

    if dropout_schedule_index == 0:
        assert dropout_schedule[0][0] == 1 and data_fraction == 1
        return dropout_schedule[0][1]

    # The upper bound of data_fraction is at the index before the
    # lower bound.
    final_data_fraction, final_dropout = dropout_schedule[
        dropout_schedule_index - 1]

    if final_data_fraction == initial_data_fraction:
        assert data_fraction == initial_data_fraction
        return initial_dropout

    assert (data_fraction >= initial_data_fraction
            and data_fraction < final_data_fraction)

    return ((data_fraction - initial_data_fraction)
            * (final_dropout - initial_dropout)
            / (final_data_fraction - initial_data_fraction)
            + initial_dropout)


def _get_dropout_proportions(dropout_schedule, data_fraction):
    """Returns dropout proportions based on the dropout_schedule for the
    fraction of data seen at this stage of training.  Returns a list of
    pairs (pattern, dropout_proportion); for instance, it might return
    the list ['*', 0.625] meaning a dropout proportion of 0.625 is to
    be applied to all dropout components.

    Returns None if dropout_schedule is None.

    dropout_schedule might be (in the sample case using the default pattern of
    '*'): '0.1,0.5@0.5,0.1', meaning a piecewise linear function that starts at
    0.1 when data_fraction=0.0, rises to 0.5 when data_fraction=0.5, and falls
    again to 0.1 when data_fraction=1.0.   It can also contain space-separated
    items of the form 'pattern=schedule', for instance:
       '*=0.0,0.5,0.0 lstm.*=0.0,0.3@0.75,0.0'
    The more specific patterns should go later, otherwise they will be overridden
    by the less specific patterns' commands.

    Calls _get_component_dropout() for the different component name patterns
    in dropout_schedule.

    Arguments:
        dropout_schedule: Value for the --trainer.dropout-schedule option.
            See help for --trainer.dropout-schedule.
            See _self_test() for examples.
        data_fraction: The fraction of data seen until this stage of
            training.

    """
    if dropout_schedule is None:
        return None
    dropout_schedule = _parse_dropout_option(dropout_schedule)
    dropout_proportions = []
    for component_name, component_dropout_schedule in dropout_schedule:
        dropout_proportions.append(
            (component_name, _get_component_dropout(
                component_dropout_schedule, data_fraction)))
    return dropout_proportions

def get_dropout_edit_option(dropout_schedule, data_fraction, iter_):
    """Build the --edits option that sets the dropout proportions.

    The returned string is meant to be passed to nnet3-copy (or
    nnet3-am-copy) and looks like
    "--edits='set-dropout-proportion name=* proportion=0.625'"
    with one set-dropout-proportion directive per component-name pattern,
    joined by ';'.  If no dropout is being used (dropout_schedule is None)
    the empty string is returned.

    Arguments:
        dropout_schedule: Value for the --trainer.dropout-schedule option.
            See help for --trainer.dropout-schedule.
        data_fraction: real number in [0,1] that says how far along
            in training we are; values above 1.0 are treated as 1.0.
        iter_: iteration number (needed for debug printing only)
    See ReadEditConfig() in nnet3/nnet-utils.h to see how
    set-dropout-proportion directive works.
    """
    data_fraction = 1.0 if data_fraction > 1.0 else data_fraction

    if dropout_schedule is None:
        return ""

    proportions = _get_dropout_proportions(dropout_schedule, data_fraction)

    directives = [
        "set-dropout-proportion name={0} proportion={1}".format(
            pattern, proportion)
        for pattern, proportion in proportions]
    summary = [
        "pattern/dropout-proportion={0}/{1}".format(pattern, proportion)
        for pattern, proportion in proportions]

    if _debug_dropout:
        logger.info("On iteration %d, %s", iter_, ', '.join(summary))

    return "--edits='{0}'".format(";".join(directives))


def get_dropout_edit_string(dropout_schedule, data_fraction, iter_):
    """Return an nnet3-copy command (ending in a pipe) whose --edits
    option sets the dropout proportions for the current point in training.

    E.g. if _get_dropout_proportions(dropout_schedule, data_fraction)
    returns [('*', 0.625)],  this will return the string:
     "nnet3-copy --edits='set-dropout-proportion name=* proportion=0.625' - - |"

    If no dropout is being used (dropout_schedule is None), the empty
    string is returned.

    Arguments:
        dropout_schedule: Value for the --trainer.dropout-schedule option.
            See help for --trainer.dropout-schedule.
            See _self_test() for examples.
        data_fraction: real number in [0,1] that says how far along
            in training we are.  Values above 1.0 are clamped to 1.0,
            as get_dropout_edit_option() does.
        iter_: iteration number (needed for debug printing only)

    See ReadEditConfig() in nnet3/nnet-utils.h to see how
    set-dropout-proportion directive works.
    """

    if dropout_schedule is None:
        return ""

    # Keep the fraction inside the documented [0,1] range, consistently with
    # get_dropout_edit_option(); done after the None check so that the
    # no-dropout path stays exactly as permissive as before.
    if data_fraction > 1.0:
        data_fraction = 1.0

    dropout_proportions = _get_dropout_proportions(
        dropout_schedule, data_fraction)

    # One set-dropout-proportion directive per component-name pattern.
    edit_config_lines = [
        "set-dropout-proportion name={0} proportion={1}".format(
            component_name, dropout_proportion)
        for component_name, dropout_proportion in dropout_proportions]

    if _debug_dropout:
        dropout_info = [
            "pattern/dropout-proportion={0}/{1}".format(
                component_name, dropout_proportion)
            for component_name, dropout_proportion in dropout_proportions]
        logger.info("On iteration %d, %s", iter_, ', '.join(dropout_info))

    return ("""nnet3-copy --edits='{edits}' - - |""".format(
        edits=";".join(edit_config_lines)))


def _self_test():
    """Exercise the schedule parser and the proportion lookup.

    Invoked when the module is executed as a standalone script.  Each case
    below pairs a dropout-schedule string with the parse result expected
    from _parse_dropout_option() and with the proportions expected from
    _get_dropout_proportions() at a few data fractions.
    """

    def assert_approx_equal(list1, list2):
        """Check that two lists of (name, proportion) pairs agree: names
        exactly, proportions to within 1e-8."""
        assert len(list1) == len(list2)
        for entry1, entry2 in zip(list1, list2):
            assert len(entry1) == 2
            assert len(entry2) == 2
            assert entry1[0] == entry2[0]
            assert abs(entry1[1] - entry2[1]) < 1e-8

    # (schedule string, expected parse, [(data_fraction, expected proportions)])
    cases = [
        ('*=0.0,0.5,0.0 lstm.*=0.0,0.3@0.75,0.0',
         [('*', [(1.0, 0.0), (0.5, 0.5), (0.0, 0.0)]),
          ('lstm.*', [(1.0, 0.0), (0.75, 0.3), (0.0, 0.0)])],
         [(0.75, [('*', 0.25), ('lstm.*', 0.3)]),
          (0.5, [('*', 0.5), ('lstm.*', 0.2)]),
          (0.25, [('*', 0.25), ('lstm.*', 0.1)])]),
        ('0.0,0.3,0.0',
         [('*', [(1.0, 0.0), (0.5, 0.3), (0.0, 0.0)])],
         [(0.5, [('*', 0.3)]),
          (0.0, [('*', 0.0)]),
          (1.0, [('*', 0.0)]),
          (0.25, [('*', 0.15)])]),
        ('0.0,0.5@0.25,0.0,0.6@0.75,0.0',
         [('*', [(1.0, 0.0), (0.75, 0.6), (0.5, 0.0),
                 (0.25, 0.5), (0.0, 0.0)])],
         [(0.25, [('*', 0.5)]),
          (0.1, [('*', 0.2)])]),
        ('lstm.*=0.0,0.3,0.0@0.75,1.0',
         [('lstm.*', [(1.0, 1.0), (0.75, 0.0), (0.5, 0.3), (0.0, 0.0)])],
         [(0.25, [('lstm.*', 0.15)]),
          (0.5, [('lstm.*', 0.3)]),
          (0.9, [('lstm.*', 0.6)])]),
    ]

    for schedule, expected_parse, checks in cases:
        assert _parse_dropout_option(schedule) == expected_parse
        for data_fraction, expected_proportions in checks:
            assert_approx_equal(
                _get_dropout_proportions(schedule, data_fraction),
                expected_proportions)


# When this module is executed directly (rather than imported), run the
# self-test defined above.
if __name__ == '__main__':
    try:
        _self_test()
    except Exception:
        # Log a short marker and re-raise, so the original exception and
        # traceback still reach the caller.
        logger.error("Failed self test")
        raise


================================================
FILE: egs/steps/libs/nnet3/train/frame_level_objf/__init__.py
================================================


# Copyright 2016 Vimal Manohar
# Apache 2.0

""" This library has classes and methods commonly used for training nnet3
neural networks with frame-level objectives.
"""

from . import common
from . import raw_model
from . import acoustic_model

__all__ = ["common", "raw_model", "acoustic_model"]


================================================
FILE: egs/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py
================================================


# Copyright 2016    Vijayaditya Peddinti.
#           2016    Vimal Manohar
# Apache 2.0.

""" This is a module with method which will be used by scripts for
training of deep neural network acoustic model with frame-level objective.
"""

import logging

import libs.common as common_lib
import libs.nnet3.train.common as common_train_lib


logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())


def generate_egs(data, alidir, egs_dir,
                 left_context, right_context,
                 run_opts, stage=0,
                 left_context_initial=-1, right_context_final=-1,
                 online_ivector_dir=None,
                 samples_per_iter=20000, frames_per_eg_str="20", srand=0,
                 egs_opts=None, cmvn_opts=None):
    """ Wrapper around steps/nnet3/get_egs.sh

    Generates targets from alignment directory 'alidir', which contains
    the model final.mdl and alignments.

    The optional arguments egs_opts, cmvn_opts and online_ivector_dir are
    substituted into the command line as empty strings when they are None.
    The command itself is run through common_lib.execute_command, with
    run_opts.egs_command supplied as the script's --cmd value.
    """

    def _blank_if_none(value):
        # Unset options are rendered as empty strings on the command line.
        return '' if value is None else value

    get_egs_command = """steps/nnet3/get_egs.sh {egs_opts} \
                --cmd "{command}" \
                --cmvn-opts "{cmvn_opts}" \
                --online-ivector-dir "{ivector_dir}" \
                --left-context {left_context} \
                --right-context {right_context} \
                --left-context-initial {left_context_initial} \
                --right-context-final {right_context_final} \
                --stage {stage} \
                --samples-per-iter {samples_per_iter} \
                --frames-per-eg {frames_per_eg_str} \
                --srand {srand} \
                {data} {alidir} {egs_dir}
        """

    substitutions = {
        'egs_opts': _blank_if_none(egs_opts),
        'command': run_opts.egs_command,
        'cmvn_opts': _blank_if_none(cmvn_opts),
        'ivector_dir': _blank_if_none(online_ivector_dir),
        'left_context': left_context,
        'right_context': right_context,
        'left_context_initial': left_context_initial,
        'right_context_final': right_context_final,
        'stage': stage,
        'samples_per_iter': samples_per_iter,
        'frames_per_eg_str': frames_per_eg_str,
        'srand': srand,
        'data': data,
        'alidir': alidir,
        'egs_dir': egs_dir,
    }

    common_lib.execute_command(get_egs_command.format(**substitutions))


def prepare_initial_acoustic_model(dir, alidir, run_opts,
                                   srand=-3, input_model=None):
    """ Adds the first layer; this will also add in the lda.mat and
        presoftmax_prior_scale.vec. It will also prepare the acoustic model
        with the transition model.
        If 'input_model' is specified, no initial network preparation(adding
        the first layer) is done and this model is used as initial 'raw' model
        instead of '0.raw' model to prepare '0.mdl' as acoustic model by adding the
        transition model.

    Arguments:
        dir: experiment directory; the log is written to
            {dir}/log/init_mdl.log and the model to {dir}/0.mdl.
        alidir: alignment directory; {alidir}/final.mdl and
            {alidir}/ali.*.gz are read.
        run_opts: options object; run_opts.command is the job-submission
            command prefix.
        srand: random seed forwarded to
            common_train_lib.prepare_initial_network (only used when
            'input_model' is None).
        input_model: optional path of an existing raw model to use in place
            of {dir}/0.raw.
    """

    if input_model is None:
        common_train_lib.prepare_initial_network(dir, run_opts,
                                                 srand=srand)
        raw_mdl = '{0}/0.raw'.format(dir)
    else:
        raw_mdl = input_model

    # Convert to .mdl, train the transitions, set the priors.
    # NOTE: the pipe is written as '\\|' so that the command string contains
    # a literal backslash followed by '|' (as before) without relying on the
    # invalid escape sequence '\|', which newer Python versions warn about.
    common_lib.execute_command(
        """{command} {dir}/log/init_mdl.log \
                nnet3-am-init {alidir}/final.mdl {raw_mdl} - \\| \
                nnet3-am-train-transitions - \
                "ark:gunzip -c {alidir}/ali.*.gz|" {dir}/0.mdl
        """.format(command=run_opts.command,
                   dir=dir, alidir=alidir,
                   raw_mdl=raw_mdl))


================================================
FILE: egs/steps/libs/nnet3/train/frame_level_objf/common.py
================================================

# Copyright 2016 Vijayaditya Peddinti.
#           2016 Vimal Manohar
#           2017 Johns Hopkins University (author: Daniel Povey)
# Apache 2.0.

""" This is a module with methods which will be used by scripts for training of
deep neural network acoustic model and raw model (i.e., generic neural
network without transition model) with frame-level objectives.
"""

from __future__ import print_function
from __future__ import division
import glob
import logging
import math
import os
import random
import time

import libs.common as common_lib
import libs.nnet3.train.common as common_train_lib

logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())


def train_new_models(dir, iter, srand, num_jobs,
                     num_archives_processed, num_archives,
                     raw_model_string, egs_dir,
                     momentum, max_param_change,
                     shuffle_buffer_size, minibatch_size_str,
                     image_augmentation_opts,
                     run_opts, frames_per_eg=-1,
                     min_deriv_time=None, max_deriv_time_relative=None,
                     use_multitask_egs=False, train_opts="",
                     backstitch_training_scale=0.0, backstitch_training_interval=1):
    """ Called from train_one_iteration(); runs one iteration of training
    as 'num_jobs' separate jobs, which write files like
    exp/tdnn_a/24.{1,2,3,..<num_jobs>}.raw

    Each job is spawned separately rather than as one parallel SGE job,
    because working out which archive and which --frame option each job
    should use is a little complex.  That is no longer needed for RNNs (the
    --frame option is not used there), but the same script is kept for
    consistency with the FF-DNN code.

    Selected args:
        frames_per_eg:
            In (non-chain) nnet3 training, frames_per_eg is normally the
            number of output (supervised) frames in each training example.
            However, this argument should only be set to that number
            (greater than zero) if you intend to train on a single frame of
            each example, on each minibatch.  When it is >0, each training
            job selects a different frame of the dumped example to train on,
            through the --frame=n option of nnet3-copy-egs.
            At the default value (-1) the entire sequence of frames is used
            for supervision.  This is suitable for RNN training, where it
            helps to amortize the cost of computing the activations for the
            frames of context needed for the recurrence.
        use_multitask_egs : True, if different examples used to train multiple
            tasks or outputs, e.g.multilingual training.  multilingual egs can
            be generated using get_egs.sh and
            steps/nnet3/multilingual/allocate_multilingual_examples.py, those
            are the top-level scripts.
    """

    chunk_level_training = not frames_per_eg > 0

    # Only the deriv-time options that were actually supplied are passed on.
    deriv_time_opts = " ".join(
        option.format(value)
        for option, value in (
            ("--optimization.min-deriv-time={0}", min_deriv_time),
            ("--optimization.max-deriv-time-relative={0}",
             max_deriv_time_relative))
        if value is not None)

    # the GPU timing info is only printed if we use the --verbose=1 flag; this
    # slows down the computation slightly, so don't accumulate it on every
    # iteration.  Don't do it on iteration 0 either, because we use a smaller
    # than normal minibatch size, and people may get confused thinking it's
    # slower for iteration 0 because of the verbose option.
    if iter > 0 and iter % 20 == 0:
        verbose_opt = "--verbose=1"
    else:
        verbose_opt = ""

    scp_or_ark = "scp" if use_multitask_egs else "ark"

    egs_template = (
        """ark,bg:nnet3-copy-egs {frame_opts} {multitask_egs_opts} \
            {scp_or_ark}:{egs_dir}/egs.{archive_index}.{scp_or_ark} ark:- | \
            nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} \
            --srand={srand} ark:- ark:- | {aug_cmd} \
            nnet3-merge-egs --minibatch-size={minibatch_size} ark:- ark:- |""")

    train_template = (
        """{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \
                    nnet3-train {parallel_train_opts} {cache_io_opts} \
                     {verbose_opt} --print-interval=10 \
                    --momentum={momentum} \
                    --max-param-change={max_param_change} \
                    --backstitch-training-scale={backstitch_training_scale} \
                    --l2-regularize-factor={l2_regularize_factor} \
                    --backstitch-training-interval={backstitch_training_interval} \
                    --srand={srand} {train_opts} \
                    {deriv_time_opts} "{raw_model}" "{egs_rspecifier}" \
                    {dir}/{next_iter}.{job}.raw""")

    workers = []

    for job in range(1, num_jobs + 1):
        # k is a zero-based index that we will derive the other indexes from.
        k = num_archives_processed + job - 1

        # work out the 1-based archive index.
        archive_index = (k % num_archives) + 1

        if chunk_level_training:
            frame_opts = ""
        else:
            frame_opts = "--frame={0}".format(
                (k // num_archives + archive_index) % frames_per_eg)

        # The cache is read on every iteration after the first; only job 1
        # writes the cache for the next iteration.
        read_cache_opt = (
            "--read-cache={dir}/cache.{iter}".format(dir=dir, iter=iter)
            if iter > 0 else "")
        write_cache_opt = (
            " --write-cache={0}/cache.{1}".format(dir, iter + 1)
            if job == 1 else "")
        cache_io_opts = read_cache_opt + write_cache_opt

        image_augmentation_cmd = ''
        if image_augmentation_opts:
            image_augmentation_cmd = (
                'nnet3-egs-augment-image --srand={srand} {aug_opts} ark:- ark:- |'.format(
                    srand=k+srand,
                    aug_opts=image_augmentation_opts))

        multitask_egs_opts = common_train_lib.get_multitask_egs_opts(
            egs_dir,
            egs_prefix="egs.",
            archive_index=archive_index,
            use_multitask_egs=use_multitask_egs)

        egs_rspecifier = egs_template.format(
            frame_opts=frame_opts,
            multitask_egs_opts=multitask_egs_opts,
            scp_or_ark=scp_or_ark,
            egs_dir=egs_dir,
            archive_index=archive_index,
            shuffle_buffer_size=shuffle_buffer_size,
            srand=iter+srand,
            aug_cmd=image_augmentation_cmd,
            minibatch_size=minibatch_size_str)

        train_command = train_template.format(
            command=run_opts.command,
            train_queue_opt=run_opts.train_queue_opt,
            dir=dir, iter=iter, job=job,
            next_iter=iter + 1,
            parallel_train_opts=run_opts.parallel_train_opts,
            cache_io_opts=cache_io_opts,
            verbose_opt=verbose_opt,
            momentum=momentum,
            max_param_change=max_param_change,
            backstitch_training_scale=backstitch_training_scale,
            l2_regularize_factor=1.0/num_jobs,
            backstitch_training_interval=backstitch_training_interval,
            srand=iter + srand,
            train_opts=train_opts,
            deriv_time_opts=deriv_time_opts,
            raw_model=raw_model_string,
            egs_rspecifier=egs_rspecifier)

        # note: the thread waits on that process's completion.
        workers.append(common_lib.background_command(
            train_command, require_zero_status=True))

    for worker in workers:
        worker.join()


def train_one_iteration(dir, iter, srand, egs_dir,
                        num_jobs, num_archives_processed, num_archives,
                        learning_rate, minibatch_size_str,
                        momentum, max_param_change, shuffle_buffer_size,
                        run_opts, image_augmentation_opts=None,
                        frames_per_eg=-1,
                        min_deriv_time=None, max_deriv_time_relative=None,
                        shrinkage_value=1.0, dropout_edit_string="",  train_opts="",
                        get_raw_nnet_from_am=True, use_multitask_egs=False,
                        backstitch_training_scale=0.0, backstitch_training_interval=1,
                        compute_per_dim_accuracy=False):
    """ Called from steps/nnet3/train_*.py scripts for one iteration of neural
    network training

    The steps are: check (or record) the random seed in {dir}/srand, launch
    the diagnostic jobs, train 'num_jobs' models with train_new_models(),
    then either average them (iter > 0) or keep the best one (iter == 0),
    delete the per-job models, and verify that the new model exists and is
    non-empty.

    Selected args:
        frames_per_eg: The default value -1 implies chunk_level_training, which
            is particularly applicable to RNN training. If it is > 0, then it
            implies frame-level training, which is applicable for DNN training.
            If it is > 0, then each parallel SGE job created, a different frame
            numbered 0..frames_per_eg-1 is used.
        shrinkage_value: If value is 1.0, no shrinkage is done; otherwise
            parameter values are scaled by this value.
        dropout_edit_string: string appended to the nnet3-copy command that
            reads the current model (empty string for no dropout edits).
        get_raw_nnet_from_am: If True, then the network is read and stored as
            acoustic model i.e. along with transition model e.g. 10.mdl
            as against a raw network e.g. 10.raw when the value is False.

    Raises:
        IOError, ValueError: if {dir}/srand exists but cannot be read or
            parsed as an integer.
        OSError: if a per-job model cannot be deleted.
        Exception: if the new model is missing or has size 0.
    """

    # check if different iterations use the same random seed
    srand_path = '{0}/srand'.format(dir)
    if os.path.exists(srand_path):
        try:
            # 'with' guarantees the file handle is closed even if parsing
            # the seed fails.
            with open(srand_path) as srand_file:
                saved_srand = int(srand_file.readline().strip())
        except (IOError, ValueError):
            logger.error("Exception while reading the random seed "
                         "for training")
            raise
        if srand != saved_srand:
            logger.warning("The random seed provided to this iteration "
                           "(srand={0}) is different from the one saved last "
                           "time (srand={1}). Using srand={0}.".format(
                               srand, saved_srand))
    else:
        with open(srand_path, 'w') as f:
            f.write(str(srand))

    # Sets off some background jobs to compute train and
    # validation set objectives
    compute_train_cv_probabilities(
        dir=dir, iter=iter, egs_dir=egs_dir,
        run_opts=run_opts,
        get_raw_nnet_from_am=get_raw_nnet_from_am,
        use_multitask_egs=use_multitask_egs,
        compute_per_dim_accuracy=compute_per_dim_accuracy)

    if iter > 0:
        # Runs in the background
        compute_progress(dir=dir, iter=iter, egs_dir=egs_dir,
                         run_opts=run_opts,
                         get_raw_nnet_from_am=get_raw_nnet_from_am)

    do_average = (iter > 0)

    # Command that reads the current model, sets the learning rate and
    # applies the shrinkage scale; the dropout edit command (if any) is
    # chained after it.
    raw_model_string = ("nnet3-copy --learning-rate={lr} --scale={s} "
                        "{dir}/{iter}.{suf} - |".format(
                            lr=learning_rate, s=shrinkage_value,
                            suf="mdl" if get_raw_nnet_from_am else "raw",
                            dir=dir, iter=iter))

    raw_model_string = raw_model_string + dropout_edit_string

    if do_average:
        cur_minibatch_size_str = minibatch_size_str
        cur_max_param_change = max_param_change
    else:
        # on iteration zero, use a smaller minibatch size (and we will later
        # choose the output of just one of the jobs): the model-averaging isn't
        # always helpful when the model is changing too fast (i.e. it can worsen
        # the objective function), and the smaller minibatch size will help to
        # keep the update stable.
        cur_minibatch_size_str = common_train_lib.halve_minibatch_size_str(minibatch_size_str)
        cur_max_param_change = float(max_param_change) / math.sqrt(2)

    train_new_models(dir=dir, iter=iter, srand=srand, num_jobs=num_jobs,
                     num_archives_processed=num_archives_processed,
                     num_archives=num_archives,
                     raw_model_string=raw_model_string, egs_dir=egs_dir,
                     momentum=momentum, max_param_change=cur_max_param_change,
                     shuffle_buffer_size=shuffle_buffer_size,
                     minibatch_size_str=cur_minibatch_size_str,
                     run_opts=run_opts,
                     frames_per_eg=frames_per_eg,
                     min_deriv_time=min_deriv_time,
                     max_deriv_time_relative=max_deriv_time_relative,
                     image_augmentation_opts=image_augmentation_opts,
                     use_multitask_egs=use_multitask_egs,
                     train_opts=train_opts,
                     backstitch_training_scale=backstitch_training_scale,
                     backstitch_training_interval=backstitch_training_interval)

    [models_to_average, best_model] = common_train_lib.get_successful_models(
         num_jobs, '{0}/log/train.{1}.%.log'.format(dir, iter))
    nnets_list = ["{0}/{1}.{2}.raw".format(dir, iter + 1, n)
                  for n in models_to_average]

    if do_average:
        # average the output of the different jobs.
        common_train_lib.get_average_nnet_model(
            dir=dir, iter=iter,
            nnets_list=" ".join(nnets_list),
            run_opts=run_opts,
            get_raw_nnet_from_am=get_raw_nnet_from_am)

    else:
        # choose the best model from different jobs
        common_train_lib.get_best_nnet_model(
            dir=dir, iter=iter,
            best_model_index=best_model,
            run_opts=run_opts,
            get_raw_nnet_from_am=get_raw_nnet_from_am)

    # The per-job models are no longer needed once they have been combined.
    try:
        for i in range(1, num_jobs + 1):
            os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i))
    except OSError:
        logger.error("Error while trying to delete the raw models")
        raise

    if get_raw_nnet_from_am:
        new_model = "{0}/{1}.mdl".format(dir, iter + 1)
    else:
        new_model = "{0}/{1}.raw".format(dir, iter + 1)

    if not os.path.isfile(new_model):
        raise Exception("Could not find {0}, at the end of "
                        "iteration {1}".format(new_model, iter))
    elif os.stat(new_model).st_size == 0:
        raise Exception("{0} has size 0. Something went wrong in "
                        "iteration {1}".format(new_model, iter))

    # The cache file read during this iteration is not needed any more.
    if os.path.exists("{0}/cache.{1}".format(dir, iter)):
        os.remove("{0}/cache.{1}".format(dir, iter))


def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts,
                                   max_lda_jobs=None, rand_prune=4.0,
                                   lda_opts=None, use_multitask_egs=False):
    """Accumulate LDA-style statistics over the egs and derive lda.mat.

    Runs nnet3-acc-lda-stats as 'num_lda_jobs' parallel jobs (capped at
    'max_lda_jobs' when that is given), sums the per-job statistics into
    {dir}/lda_stats, removes the per-job files, runs
    nnet-get-feature-transform to write {dir}/lda.mat, and finally links
    {dir}/configs/lda.mat to "../lda.mat".

    Raises:
        OSError: if one of the per-job statistics files cannot be removed.
    """
    if max_lda_jobs is not None and num_lda_jobs > max_lda_jobs:
        num_lda_jobs = max_lda_jobs

    scp_or_ark = "scp" if use_multitask_egs else "ark"
    multitask_egs_opts = common_train_lib.get_multitask_egs_opts(
        egs_dir,
        egs_prefix="egs.",
        archive_index="JOB",
        use_multitask_egs=use_multitask_egs)

    rspecifier_template = ("ark:nnet3-copy-egs {multitask_egs_opts} "
                           "{scp_or_ark}:{egs_dir}/egs.JOB.{scp_or_ark} ark:- |")
    egs_rspecifier = rspecifier_template.format(
        multitask_egs_opts=multitask_egs_opts,
        scp_or_ark=scp_or_ark,
        egs_dir=egs_dir)

    # Write stats with the same format as stats for LDA.
    acc_stats_command = (
        """{command} JOB=1:{num_lda_jobs} {dir}/log/get_lda_stats.JOB.log \
                nnet3-acc-lda-stats --rand-prune={rand_prune} \
                {dir}/init.raw "{egs_rspecifier}" \
                {dir}/JOB.lda_stats""".format(
                    command=run_opts.command,
                    num_lda_jobs=num_lda_jobs,
                    dir=dir,
                    rand_prune=rand_prune,
                    egs_rspecifier=egs_rspecifier))
    common_lib.execute_command(acc_stats_command)

    # the above command would have generated dir/{1..num_lda_jobs}.lda_stats
    lda_stat_files = []
    for job in range(1, num_lda_jobs + 1):
        lda_stat_files.append('{0}/{1}.lda_stats'.format(dir, job))

    sum_stats_command = (
        """{command} {dir}/log/sum_transform_stats.log \
                sum-lda-accs {dir}/lda_stats {lda_stat_files}""".format(
                    command=run_opts.command,
                    dir=dir, lda_stat_files=" ".join(lda_stat_files)))
    common_lib.execute_command(sum_stats_command)

    # The per-job statistics have been summed, so they can be deleted.
    try:
        for stats_path in lda_stat_files:
            os.remove(stats_path)
    except OSError:
        logger.error("There was error while trying to remove "
                     "lda stat files.")
        raise

    # this computes a fixed affine transform computed in the way we described
    # in Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled
    # variant of an LDA transform but without dimensionality reduction.
    transform_command = (
        """{command} {dir}/log/get_transform.log \
                nnet-get-feature-transform {lda_opts} {dir}/lda.mat \
                {dir}/lda_stats""".format(
                    command=run_opts.command, dir=dir,
                    lda_opts="" if lda_opts is None else lda_opts))
    common_lib.execute_command(transform_command)

    common_lib.force_symlink("../lda.mat", "{0}/configs/lda.mat".format(dir))


def compute_train_cv_probabilities(dir, iter, egs_dir, run_opts,
                                   get_raw_nnet_from_am=True,
                                   use_multitask_egs=False,
                                   compute_per_dim_accuracy=False):
    """Launch background jobs that run nnet3-compute-prob on the
    valid_diagnostic and train_diagnostic examples in 'egs_dir'.

    Arguments:
        dir: experiment directory; logs go to
            {dir}/log/compute_prob_{valid,train}.{iter}.log.
        iter: iteration number; selects the model {dir}/{iter}.mdl or
            {dir}/{iter}.raw.
        egs_dir: directory containing the diagnostic egs.
        run_opts: options object; run_opts.command is the job-submission
            command prefix.
        get_raw_nnet_from_am: if True the .mdl file is used, otherwise
            the .raw file.
        use_multitask_egs: if True the diagnostic examples are read from
            .scp files (with multitask options) instead of .egs archives.
        compute_per_dim_accuracy: if True, --compute-per-dim-accuracy is
            passed to nnet3-compute-prob for both the validation and the
            training diagnostics.
    """
    if get_raw_nnet_from_am:
        model = "{dir}/{iter}.mdl".format(dir=dir, iter=iter)
    else:
        model = "{dir}/{iter}.raw".format(dir=dir, iter=iter)

    scp_or_ark = "scp" if use_multitask_egs else "ark"
    egs_suffix = ".scp" if use_multitask_egs else ".egs"
    egs_rspecifier = ("{0}:{1}/valid_diagnostic{2}".format(
        scp_or_ark, egs_dir, egs_suffix))

    opts = []
    if compute_per_dim_accuracy:
        opts.append("--compute-per-dim-accuracy")

    multitask_egs_opts = common_train_lib.get_multitask_egs_opts(
                             egs_dir,
                             egs_prefix="valid_diagnostic.",
                             use_multitask_egs=use_multitask_egs)

    # {opts} was previously missing from this command even though 'opts'
    # was passed to format(), so the per-dim-accuracy flag never reached the
    # validation job; it now matches the training command below.
    common_lib.background_command(
        """ {command} {dir}/log/compute_prob_valid.{iter}.log \
                nnet3-compute-prob {opts} "{model}" \
                "ark,bg:nnet3-copy-egs {multitask_egs_opts} \
                    {egs_rspecifier} ark:- | \
                    nnet3-merge-egs --minibatch-size=1:64 ark:- \
                    ark:- |" """.format(command=run_opts.command,
                                        dir=dir,
                                        iter=iter,
                                        egs_rspecifier=egs_rspecifier,
                                        opts=' '.join(opts), model=model,
                                        multitask_egs_opts=multitask_egs_opts))

    egs_rspecifier = ("{0}:{1}/train_diagnostic{2}".format(
        scp_or_ark, egs_dir, egs_suffix))

    multitask_egs_opts = common_train_lib.get_multitask_egs_opts(
                             egs_dir,
                             egs_prefix="train_diagnostic.",
                             use_multitask_egs=use_multitask_egs)

    common_lib.background_command(
        """{command} {dir}/log/compute_prob_train.{iter}.log \
                nnet3-compute-prob {opts} "{model}" \
                "ark,bg:nnet3-copy-egs {multitask_egs_opts} \
                    {egs_rspecifier} ark:- | \
                    nnet3-merge-egs --minibatch-size=1:64 ark:- \
                    ark:- |" """.format(command=run_opts.command,
                                        dir=dir,
                                        iter=iter,
                                        egs_rspecifier=egs_rspecifier,
                                        opts=' '.join(opts), model=model,
                                        multitask_egs_opts=multitask_egs_opts))


def compute_progress(dir, iter, egs_dir,
                     run_opts,
                     get_raw_nnet_from_am=True):
    """Launch background jobs reporting on the model of iteration 'iter'.

    A job running nnet3-info followed by nnet3-show-progress (comparing the
    models of iterations iter-1 and iter) is always started.  When 'iter' is
    a positive multiple of 10, two further jobs with --verbose=2 output are
    started as well.  The 'egs_dir' argument is not used by this function.
    """
    if get_raw_nnet_from_am:
        suffix = "mdl"
    else:
        suffix = "raw"
    model = '{0}/{1}.{2}'.format(dir, iter, suffix)
    prev_model = '{0}/{1}.{2}'.format(dir, iter - 1, suffix)

    # Values shared by all the command templates below.
    fields = dict(command=run_opts.command, dir=dir, iter=iter,
                  model=model, prev_model=prev_model)

    progress_command = (
            """{command} {dir}/log/progress.{iter}.log \
                    nnet3-info {model} '&&' \
                    nnet3-show-progress --use-gpu=no {prev_model} {model} """)
    common_lib.background_command(progress_command.format(**fields))

    if not (iter > 0 and iter % 10 == 0):
        return

    # Every 10 iters, print some more detailed information.
    # full_progress.X.log contains some diagnostics of the difference in
    # parameters, printed in the same format as from nnet3-info.
    full_progress_command = (
            """{command} {dir}/log/full_progress.{iter}.log \
            nnet3-show-progress --use-gpu=no --verbose=2 {prev_model} {model}
        """)
    common_lib.background_command(full_progress_command.format(**fields))

    # full_info.X.log is just the nnet3-info of the model, with the --verbose=2
    # option which includes stats on the singular values of the parameter matrices.
    full_info_command = (
            """{command} {dir}/log/full_info.{iter}.log \
            nnet3-info --verbose=2 {model}
        """)
    common_lib.background_command(full_info_command.format(**fields))


def combine_models(dir, num_iters, models_to_combine, egs_dir,
                   minibatch_size_str,
                   run_opts,
                   chunk_width=None, get_raw_nnet_from_am=True,
                   max_objective_evaluations=30,
                   use_multitask_egs=False,
                   compute_per_dim_accuracy=False):
    """ Combines the models of several iterations with nnet3-combine.

    In the nnet3 setup, the logic for averaging subsets of the models, in the
    case where there are too many models to reliably estimate interpolation
    factors (max_models_combine), is moved into nnet3-combine.

    Args:
        dir: experiment directory containing the '<iter>.mdl' / '<iter>.raw'
            files and the log/ directory.
        num_iters: index of the last iteration; it is always combined.
        models_to_combine: set of iteration indices.  NOTE: num_iters is
            added to this set in place.
        egs_dir: directory containing the 'combine' examples.
        minibatch_size_str: value substituted into
            'nnet3-merge-egs --minibatch-size=1:<value>'.
        run_opts: object providing 'command', 'combine_queue_opt' and
            'combine_gpu_opt'.
        chunk_width: not used by this function.
        get_raw_nnet_from_am: if True, '.mdl' files are combined and the
            result is written to {dir}/combined.mdl through nnet3-am-copy;
            otherwise '.raw' files are combined into {dir}/final.raw.
        max_objective_evaluations: forwarded to nnet3-combine.
        use_multitask_egs: if True the examples are read from
            'scp:{egs_dir}/combine.scp' instead of
            'ark:{egs_dir}/combine.egs'.
        compute_per_dim_accuracy: forwarded to
            compute_train_cv_probabilities().

    Raises:
        Exception: if one of the model files to combine does not exist.
    """
    logger.info("Combining {0} models.".format(models_to_combine))

    # The final iteration always takes part in the combination.
    models_to_combine.add(num_iters)

    extension = "mdl" if get_raw_nnet_from_am else "raw"
    model_files = []
    for model_iter in sorted(models_to_combine):
        candidate = '{0}/{1}.{2}'.format(dir, model_iter, extension)
        if not os.path.exists(candidate):
            raise Exception('Model file {0} missing'.format(candidate))
        model_files.append(candidate)

    if not get_raw_nnet_from_am:
        out_model = '{dir}/final.raw'.format(dir=dir)
    else:
        out_model = ("| nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl "
                     "{dir}/combined.mdl".format(dir=dir, num_iters=num_iters))

    # We put the freshest model first.  This is important for systems that
    # include batch normalization-- it means that the freshest batch-norm
    # stats are used.  Since the batch-norm stats are not technically
    # parameters, they are not combined in the combination code, they are
    # just obtained from the first model.
    model_files.reverse()

    if use_multitask_egs:
        egs_rspecifier = "scp:{0}/combine.scp".format(egs_dir)
    else:
        egs_rspecifier = "ark:{0}/combine.egs".format(egs_dir)

    multitask_egs_opts = common_train_lib.get_multitask_egs_opts(
        egs_dir,
        egs_prefix="combine.",
        use_multitask_egs=use_multitask_egs)

    combine_cmd = """{command} {combine_queue_opt} {dir}/log/combine.log \
                nnet3-combine {combine_gpu_opt} \
                --max-objective-evaluations={max_objective_evaluations} \
                --verbose=3 {raw_models} \
                "ark,bg:nnet3-copy-egs {multitask_egs_opts} \
                    {egs_rspecifier} ark:- | \
                      nnet3-merge-egs --minibatch-size=1:{mbsize} ark:- ark:- |" \
                "{out_model}"
        """
    common_lib.execute_command(combine_cmd.format(
        command=run_opts.command,
        combine_queue_opt=run_opts.combine_queue_opt,
        combine_gpu_opt=run_opts.combine_gpu_opt,
        dir=dir,
        raw_models=" ".join(model_files),
        max_objective_evaluations=max_objective_evaluations,
        egs_rspecifier=egs_rspecifier,
        mbsize=minibatch_size_str,
        out_model=out_model,
        multitask_egs_opts=multitask_egs_opts))

    # Compute the probability of the final, combined model with
    # the same subset we used for the previous compute_probs, as the
    # different subsets will lead to different probs.
    prob_kwargs = dict(dir=dir, egs_dir=egs_dir, run_opts=run_opts,
                       use_multitask_egs=use_multitask_egs,
                       compute_per_dim_accuracy=compute_per_dim_accuracy)
    if get_raw_nnet_from_am:
        prob_kwargs['iter'] = 'combined'
    else:
        prob_kwargs['iter'] = 'final'
        prob_kwargs['get_raw_nnet_from_am'] = False
    compute_train_cv_probabilities(**prob_kwargs)


def get_realign_iters(realign_times, num_iters,
                      num_jobs_initial, num_jobs_final):
    """ Takes the realign_times string and identifies the approximate
        iterations at which realignments have to be done.

    Args:
        realign_times: a space separated string of values strictly between
            0 and 1.
        num_iters: total number of training iterations.
        num_jobs_initial: number of jobs used at the start of training.
        num_jobs_final: number of jobs used at the end of training.

    Returns:
        A list of integer iteration indices, one per value in realign_times
        (an empty list if realign_times contains no values).

    When the two job counts are equal, each value t maps to
    int(0.5 + num_iters * t).  Otherwise t is first mapped to the fraction
        (sqrt((1 - t) * initial^2 + t * final^2) - initial) / (final - initial)
    which is then scaled by num_iters and truncated.
    NOTE(review): this warping presumably compensates for the number of jobs
    changing from num_jobs_initial to num_jobs_final during training --
    confirm against the training schedule.
    """

    realign_iters = []
    for realign_time in realign_times.split():
        realign_time = float(realign_time)
        assert 0 < realign_time < 1
        if num_jobs_initial == num_jobs_final:
            realign_iter = int(0.5 + num_iters * realign_time)
        else:
            interpolated_jobs = math.sqrt(
                (1 - realign_time) * math.pow(num_jobs_initial, 2)
                + realign_time * math.pow(num_jobs_final, 2))
            # This must be a true division: the quotient is a fraction
            # between 0 and 1, so a floor division would always give 0.
            fraction = ((interpolated_jobs - num_jobs_initial)
                        / (num_jobs_final - num_jobs_initial))
            realign_iter = fraction * num_iters
        realign_iters.append(int(realign_iter))

    return realign_iters


def align(dir, data, lang, run_opts, iter=None,
          online_ivector_dir=None):
    """ Runs steps/nnet3/align.sh and returns the alignment directory.

    Args:
        dir: experiment directory passed to align.sh; the alignment
            directory is created below it.
        data: data directory passed to align.sh.
        lang: lang directory passed to align.sh.
        run_opts: object providing 'realign_use_gpu', 'realign_num_jobs',
            'realign_command' and 'realign_queue_opt'.
        iter: if not None, it is passed as --iter and the alignment
            directory is named 'ali_iter_<iter>' instead of 'ali'.
        online_ivector_dir: passed as --online-ivector-dir (an empty string
            is passed when it is None).

    Returns:
        The path of the alignment directory, '{dir}/ali' or
        '{dir}/ali_iter_{iter}'.
    """
    has_iter = iter is not None
    alidir = '{dir}/ali{ali_suffix}'.format(
        dir=dir,
        ali_suffix="_iter_{0}".format(iter) if has_iter else "")

    gpu_note = " using gpu " if run_opts.realign_use_gpu else " "
    logger.info("Aligning the data{gpu}with {num_jobs} jobs.".format(
        gpu=gpu_note,
        num_jobs=run_opts.realign_num_jobs))

    align_cmd_template = """steps/nnet3/align.sh --nj {num_jobs_align} \
                --cmd "{align_cmd} {align_queue_opt}" \
                --use-gpu {align_use_gpu} \
                --online-ivector-dir "{online_ivector_dir}" \
                --iter "{iter}" {data} {lang} {dir} {alidir}"""

    if online_ivector_dir is None:
        ivector_dir_opt = ""
    else:
        ivector_dir_opt = online_ivector_dir

    common_lib.execute_command(align_cmd_template.format(
        dir=dir,
        align_use_gpu="yes" if run_opts.realign_use_gpu else "no",
        align_cmd=run_opts.realign_command,
        align_queue_opt=run_opts.realign_queue_opt,
        num_jobs_align=run_opts.realign_num_jobs,
        online_ivector_dir=ivector_dir_opt,
        iter=iter if has_iter else "",
        alidir=alidir,
        lang=lang,
        data=data))
    return alidir


def realign(dir, iter, feat_dir, lang, prev_egs_dir, cur_egs_dir,
            prior_subset_size, num_archives,
            run_opts, online_ivector_dir=None):
    """ Placeholder for the realignment stage; it always raises.

    The realignment stage has not been implemented for nnet3, so calling
    this function fails unconditionally, whatever the arguments are.  The
    signature is kept so that existing callers keep working.

    The pipeline that was sketched for this stage is:
      1. compute_average_posterior() on prev_egs_dir to get
         {dir}/post.{iter}.vec;
      2. adjust_am_priors() to re-adjust the priors of {dir}/{iter}.mdl
         in place;
      3. align() on feat_dir / lang to produce a new alignment directory;
      4. steps/nnet3/relabel_egs.sh to relabel prev_egs_dir into
         cur_egs_dir with the new alignments.

    Raises:
        NotImplementedError: always (it is a subclass of Exception, so
            callers catching Exception are unaffected).
    """
    raise NotImplementedError(
        "Realignment stage has not been implemented in nnet3")


def adjust_am_priors(dir, input_model, avg_posterior_vector, output_model,
                     run_opts):
    """ Runs nnet3-am-adjust-priors on a model.

    Args:
        dir: experiment directory; the log is written to
            {dir}/log/adjust_priors.final.log.
        input_model: model read by nnet3-am-adjust-priors.
        avg_posterior_vector: file holding the posterior vector passed to
            nnet3-am-adjust-priors.
        output_model: where the adjusted model is written.
        run_opts: object whose 'command' attribute prefixes the job.
    """
    adjust_cmd = """{command} {dir}/log/adjust_priors.final.log \
                nnet3-am-adjust-priors "{input_model}" {avg_posterior_vector} \
                "{output_model}" """
    fields = {
        'command': run_opts.command,
        'dir': dir,
        'input_model': input_model,
        'avg_posterior_vector': avg_posterior_vector,
        'output_model': output_model,
    }
    common_lib.execute_command(adjust_cmd.format(**fields))


def compute_average_posterior(dir, iter, egs_dir, num_archives,
                              prior_subset_size,
                              run_opts, get_raw_nnet_from_am=True):
    """ Computes the average posterior of the network

    Runs run_opts.num_jobs_compute_prior jobs, each of which pipes a subset
    of an egs archive through nnet3-compute-from-egs and writes
    {dir}/post.{iter}.JOB.vec.  The per-job vectors are then combined with
    vector-sum into {dir}/post.{iter}.vec and removed.

    Args:
        dir: experiment directory (models, logs and output vectors).
        iter: iteration whose model '{dir}/{iter}.mdl' (or '.raw') is used.
        egs_dir: directory containing the 'egs.*.ark' archives.
        num_archives: number of archives in egs_dir.  If there are more
            jobs than archives every job reads 'egs.1.ark'; otherwise the
            archive index is the 'JOB' placeholder (presumably replaced by
            the job number by run_opts.command -- verify).
        prior_subset_size: value passed to 'nnet3-subset-egs --n'.
        run_opts: object providing 'command', 'num_jobs_compute_prior',
            'prior_queue_opt' and 'prior_gpu_opt'.
        get_raw_nnet_from_am: if True the '.mdl' model is used, otherwise
            the '.raw' one.

    Returns:
        The path of the combined vector, '{dir}/post.{iter}.vec'.
    """
    partial_vec_pattern = '{0}/post.{1}.*.vec'.format(dir, iter)

    # Remove per-job vectors left over from an earlier run, so that they are
    # not picked up by the vector-sum wildcard below.
    for stale_vec in glob.glob(partial_vec_pattern):
        os.remove(stale_vec)

    if run_opts.num_jobs_compute_prior > num_archives:
        egs_part = 1
    else:
        egs_part = 'JOB'

    suffix = "mdl" if get_raw_nnet_from_am else "raw"
    model = "{0}/{1}.{2}".format(dir, iter, suffix)

    # The pipes are written as '\\|' so that the command string contains a
    # literal '\|'; a bare '\|' is an invalid escape sequence in Python.
    common_lib.execute_command(
        """{command} JOB=1:{num_jobs_compute_prior} {prior_queue_opt} \
                {dir}/log/get_post.{iter}.JOB.log \
                nnet3-copy-egs \
                ark:{egs_dir}/egs.{egs_part}.ark ark:- \\| \
                nnet3-subset-egs --srand=JOB --n={prior_subset_size} \
                ark:- ark:- \\| \
                nnet3-merge-egs --minibatch-size=128 ark:- ark:- \\| \
                nnet3-compute-from-egs {prior_gpu_opt} --apply-exp=true \
                "{model}" ark:- ark:- \\| \
                matrix-sum-rows ark:- ark:- \\| vector-sum ark:- \
                {dir}/post.{iter}.JOB.vec""".format(
                    command=run_opts.command,
                    dir=dir, model=model,
                    num_jobs_compute_prior=run_opts.num_jobs_compute_prior,
                    prior_queue_opt=run_opts.prior_queue_opt,
                    iter=iter, prior_subset_size=prior_subset_size,
                    egs_dir=egs_dir, egs_part=egs_part,
                    prior_gpu_opt=run_opts.prior_gpu_opt))

    # make sure there is time for $dir/post.{iter}.*.vec to appear.
    time.sleep(5)
    avg_post_vec_file = "{dir}/post.{iter}.vec".format(dir=dir, iter=iter)
    common_lib.execute_command(
        """{command} {dir}/log/vector_sum.{iter}.log \
                vector-sum {dir}/post.{iter}.*.vec {output_file}
        """.format(command=run_opts.command,
                   dir=dir, iter=iter, output_file=avg_post_vec_file))

    # The per-job vectors are no longer needed once they have been summed.
    for partial_vec in glob.glob(partial_vec_pattern):
        os.remove(partial_vec)
    return avg_post_vec_file


================================================
FILE: egs/steps/libs/nnet3/train/frame_level_objf/raw_model.py
================================================


# Copyright 2016    Vijayaditya Peddinti.
#           2016    Vimal Manohar
# Apache 2.0.

""" This is a module with method which will be used by scripts for
training of deep neural network raw model (i.e. without acoustic model)
with frame-level objective.
"""

import logging

import libs.common as common_lib

logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())


def generate_egs_using_targets(data, targets_scp, egs_dir,
                               left_context, right_context,
                               run_opts, stage=0,
                               left_context_initial=-1, right_context_final=-1,
                               online_ivector_dir=None,
                               target_type='dense', num_targets=-1,
                               samples_per_iter=20000, frames_per_eg_str="20",
                               srand=0, egs_opts=None, cmvn_opts=None):
    """ Wrapper for calling steps/nnet3/get_egs_targets.sh

    The egs are generated directly from an scp file of targets rather than
    from alignments (which is what generate_egs() in module
    nnet3.train.frame_level_objf.acoustic_model does).

    Args:
        target_type: "dense" if the targets are in matrix format
                     "sparse" if the targets are in posterior format
        num_targets: must be explicitly specified for "sparse" targets.
            For "dense" targets, this option is ignored and the target dim
            is computed from the target matrix dimension
        run_opts: object whose 'egs_command' attribute is passed as --cmd.
        online_ivector_dir, egs_opts, cmvn_opts: passed as empty strings
            when they are None.
        For other options, see the file steps/nnet3/get_egs_targets.sh

    Raises:
        Exception: if target_type is not "dense" and num_targets was left
            at -1.
    """

    if target_type == 'dense':
        # The target dimension is read from the targets themselves.
        num_targets = common_lib.get_feat_dim_from_scp(targets_scp)
    elif num_targets == -1:
        raise Exception("--num-targets is required if "
                        "target-type is sparse")

    get_egs_cmd = """steps/nnet3/get_egs_targets.sh {egs_opts} \
                --cmd "{command}" \
                --cmvn-opts "{cmvn_opts}" \
                --online-ivector-dir "{ivector_dir}" \
                --left-context {left_context} \
                --right-context {right_context} \
                --left-context-initial {left_context_initial} \
                --right-context-final {right_context_final} \
                --stage {stage} \
                --samples-per-iter {samples_per_iter} \
                --frames-per-eg {frames_per_eg_str} \
                --srand {srand} \
                --target-type {target_type} \
                --num-targets {num_targets} \
                {data} {targets_scp} {egs_dir}
        """

    def _or_empty(value):
        # Optional arguments are rendered as an empty string when unset.
        return '' if value is None else value

    fields = {
        'command': run_opts.egs_command,
        'cmvn_opts': _or_empty(cmvn_opts),
        'ivector_dir': _or_empty(online_ivector_dir),
        'left_context': left_context,
        'right_context': right_context,
        'left_context_initial': left_context_initial,
        'right_context_final': right_context_final,
        'stage': stage,
        'samples_per_iter': samples_per_iter,
        'frames_per_eg_str': frames_per_eg_str,
        'srand': srand,
        'num_targets': num_targets,
        'data': data,
        'targets_scp': targets_scp,
        'target_type': target_type,
        'egs_dir': egs_dir,
        'egs_opts': _or_empty(egs_opts),
    }
    common_lib.execute_command(get_egs_cmd.format(**fields))


================================================
FILE: egs/steps/libs/nnet3/xconfig/__init__.py
================================================
# Copyright 2016    Johns Hopkins University (Dan Povey)
#           2016    Vijayaditya Peddinti
#           2016    Yiming Wang
# Apache 2.0.

"""This library has classes and methods to form neural network computation graphs,
in the nnet3 framework, using higher level abstractions called 'layers'
(e.g. sub-graphs like LSTMS ).

Note: We use the term 'layer', even though the computation graph can have a
highly non-linear structure, because other terms such as nodes/components are
already used in the C++ codebase of nnet3.

This is basically a config parser module, where the configs have very concise
descriptions of a neural network.

This module has methods to convert the xconfigs into configs interpretable by
the nnet3 C++ library.

It generates three different configs:
 'init.config' : which is the config with the info necessary for computing
               the preconditioning matrix i.e., LDA transform
               e.g.
                 input-node name=input dim=40
                 input-node name=ivector dim=100
                 output-node name=output input=Append(Offset(input, -2), Offset(input, -1), input, Offset(input, 1), Offset(input, 2), ReplaceIndex(ivector, t, 0)) objective=linear

 'ref.config' : which is a version of the config file used to generate
                a model for getting left and right context (it doesn't read
                anything for the LDA-like transform and/or
                presoftmax-prior-scale components)

 'final.config' : which has the actual config used to initialize the model used
                 in training i.e, it has file paths for LDA transform and
                 other initialization files
"""


__all__ = ["utils", "layers", "parser"]


================================================
FILE: egs/steps/libs/nnet3/xconfig/attention.py
================================================
# Copyright 2017    Johns Hopkins University (Dan Povey)
#           2017    Hossein Hadian
# Apache 2.0.

""" This module has the implementation of attention layers.
"""

from __future__ import print_function
from __future__ import division
import math
import re
import sys
from libs.nnet3.xconfig.basic_layers import XconfigLayerBase

# This class is for parsing lines like
#  'attention-renorm-layer num-heads=10 value-dim=50 key-dim=50 time-stride=3 num-left-inputs=5 num-right-inputs=2.'
#
# Parameters of the class, and their defaults:
#   input='[-1]'               [Descriptor giving the input of the layer.]
#   self-repair-scale=1.0e-05  [Affects relu, sigmoid and tanh layers.]
#   learning-rate-factor=1.0   [This can be used to make the affine component
#                               train faster or slower].
#   Documentation for the rest of the parameters (related to the
#   attention component) can be found in nnet-attention-component.h


class XconfigAttentionLayer(XconfigLayerBase):
    """ Xconfig layer made of an affine component followed by a chain of
    components, one of which is a RestrictedAttentionComponent.

    The layer type (the first token of the xconfig line, e.g.
    'attention-relu-renorm-layer') is split on '-'; every token before the
    final 'layer' names one component, added in that order after the affine
    component.  The output of the layer is the last component of the chain.
    """

    def __init__(self, first_token, key_to_value, prev_names = None):
        # Here we just list some likely combinations.. you can just add any
        # combinations you want to use, to this list.
        assert first_token in ['attention-renorm-layer',
                               'attention-relu-renorm-layer',
                               'attention-relu-batchnorm-layer',
                               'relu-renorm-attention-layer']
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        """ Sets self.config to the default value of every parameter. """
        # note: self.config['input'] is a descriptor, '[-1]' means output
        # the most recent layer.
        self.config = { 'input':'[-1]',
                        'dim': -1,
                        'max-change' : 0.75,
                        'self-repair-scale' : 1.0e-05,
                        'target-rms' : 1.0,
                        'learning-rate-factor' : 1.0,
                        'ng-affine-options' : '',
                        'l2-regularize': 0.0,
                        'num-left-inputs-required': -1,
                        'num-right-inputs-required': -1,
                        'output-context': True,
                        'time-stride': 1,
                        'num-heads': 1,
                        'key-dim': -1,
                        'key-scale': 0.0,
                        'value-dim': -1,
                        'num-left-inputs': -1,
                        'num-right-inputs': -1,
                        'dropout-proportion': 0.5}  # dropout-proportion only
                                                    # affects layers with
                                                    # 'dropout' in the name.

    def check_configs(self):
        """ Validates self.config and fills in the default key-scale.

        'value-dim', 'key-dim', 'num-left-inputs' and 'num-right-inputs'
        default to -1, so they are rejected unless set explicitly.  A
        'key-scale' left at 0.0 is replaced by 1 / sqrt(key-dim).

        Raises:
            RuntimeError: if a parameter has an invalid value.
        """
        if self.config['self-repair-scale'] < 0.0 or self.config['self-repair-scale'] > 1.0:
            raise RuntimeError("self-repair-scale has invalid value {0}"
                               .format(self.config['self-repair-scale']))
        if self.config['target-rms'] < 0.0:
            raise RuntimeError("target-rms has invalid value {0}"
                               .format(self.config['target-rms']))
        if self.config['learning-rate-factor'] <= 0.0:
            raise RuntimeError("learning-rate-factor has invalid value {0}"
                               .format(self.config['learning-rate-factor']))
        for conf in ['value-dim', 'key-dim',
                     'num-left-inputs', 'num-right-inputs']:
            if self.config[conf] < 0:
                raise RuntimeError("{0} has invalid value {1}"
                                   .format(conf, self.config[conf]))
        if self.config['key-scale'] == 0.0:
            key_dim = self.config['key-dim']
            if key_dim == 0:
                # The default scale 1/sqrt(key-dim) is undefined here; report
                # it like the other invalid values instead of failing with a
                # ZeroDivisionError.
                raise RuntimeError("key-dim has invalid value {0}"
                                   .format(key_dim))
            self.config['key-scale'] = 1.0 / math.sqrt(key_dim)

    def output_name(self, auxiliary_output=None):
        """ Returns the name of the output node, '<name>.<last component>'
        (e.g. 'layer3.renorm').  Auxiliary outputs are not supported.
        """
        # at a later stage we might want to expose even the pre-nonlinearity
        # vectors
        assert auxiliary_output is None

        split_layer_name = self.layer_type.split('-')
        assert split_layer_name[-1] == 'layer'
        last_nonlinearity = split_layer_name[-2]
        # return something like: layer3.renorm
        return '{0}.{1}'.format(self.name, last_nonlinearity)

    def attention_input_dim(self):
        """ Returns the dimension produced by the affine component:
        num-heads * (key-dim + value-dim + query-dim), where query-dim is
        key-dim + (num-left-inputs + num-right-inputs + 1).
        """
        context_dim = (self.config['num-left-inputs'] +
                       self.config['num-right-inputs'] + 1)
        num_heads = self.config['num-heads']
        key_dim = self.config['key-dim']
        value_dim = self.config['value-dim']
        query_dim = key_dim + context_dim
        return num_heads * (key_dim + value_dim + query_dim)

    def attention_output_dim(self):
        """ Returns the dimension after the attention component:
        num-heads * value-dim, plus num-heads * context-dim when
        'output-context' is true.
        """
        context_dim = (self.config['num-left-inputs'] +
                       self.config['num-right-inputs'] + 1)
        num_heads = self.config['num-heads']
        value_dim = self.config['value-dim']
        return (num_heads *
                (value_dim +
                 (context_dim if self.config['output-context'] else 0)))

    def output_dim(self, auxiliary_output = None):
        """ Returns the output dimension of the layer, which is
        attention_output_dim(); auxiliary_output is ignored.
        """
        return self.attention_output_dim()

    def get_full_config(self):
        """ Returns a list of (config_name, line) pairs, with every generated
        line listed for both the 'ref' and the 'final' config.
        """
        ans = []
        config_lines = self._generate_config()

        for line in config_lines:
            for config_name in ['ref', 'final']:
                # we do not support user specified matrices in this layer
                # so 'ref' and 'final' configs are the same.
                ans.append((config_name, line))
        return ans

    def _generate_config(self):
        """ Returns the config lines of the layer, using the 'input'
        descriptor as input and the tokens of the layer type (minus the
        trailing 'layer') as the list of components to add.
        """
        split_layer_name = self.layer_type.split('-')
        assert split_layer_name[-1] == 'layer'
        nonlinearities = split_layer_name[:-1]

        # by 'descriptor_final_string' we mean a string that can appear in
        # config-files, i.e. it contains the 'final' names of nodes.
        input_desc = self.descriptors['input']['final-string']
        input_dim = self.descriptors['input']['dim']

        # the child classes e.g. tdnn might want to process the input
        # before adding the other components

        return self._add_components(input_desc, input_dim, nonlinearities)

    def _add_components(self, input_desc, input_dim, nonlinearities):
        """ Generates the 'component' and 'component-node' lines.

        Args:
            input_desc: descriptor string used as input of the affine node.
            input_dim: input dimension of the affine component.
            nonlinearities: names of the components to chain after the
                affine component; each one of 'relu', 'attention',
                'sigmoid', 'tanh', 'renorm', 'batchnorm' or 'dropout'.

        Returns:
            The list of config lines.

        Raises:
            RuntimeError: if a name in nonlinearities is not supported.
        """
        # 'dim' is the dimension of the current end of the chain: the
        # attention input dim after the affine component, and the attention
        # output dim once the attention component has been added.
        dim = self.attention_input_dim()
        self_repair_scale = self.config['self-repair-scale']
        target_rms = self.config['target-rms']
        max_change = self.config['max-change']
        ng_affine_options = self.config['ng-affine-options']
        l2_regularize = self.config['l2-regularize']
        learning_rate_factor = self.config['learning-rate-factor']
        # These two options are only written when they differ from their
        # neutral value.
        learning_rate_option = ('learning-rate-factor={0}'.format(learning_rate_factor)
                                if learning_rate_factor != 1.0 else '')
        l2_regularize_option = ('l2-regularize={0} '.format(l2_regularize)
                                if l2_regularize != 0.0 else '')
        configs = []
        # First the affine node.
        line = ('component name={0}.affine'
                ' type=NaturalGradientAffineComponent'
                ' input-dim={1}'
                ' output-dim={2}'
                ' max-change={3}'
                ' {4} {5} {6}'
                ''.format(self.name, input_dim, dim,
                          max_change, ng_affine_options,
                          learning_rate_option, l2_regularize_option))
        configs.append(line)

        line = ('component-node name={0}.affine'
                ' component={0}.affine input={1}'
                ''.format(self.name, input_desc))
        configs.append(line)
        cur_node = '{0}.affine'.format(self.name)

        for nonlinearity in nonlinearities:
            if nonlinearity == 'relu':
                line = ('component name={0}.{1}'
                        ' type=RectifiedLinearComponent dim={2}'
                        ' self-repair-scale={3}'
                        ''.format(self.name, nonlinearity, dim,
                                  self_repair_scale))

            elif nonlinearity == 'attention':
                line = ('component name={0}.{1}'
                        ' type=RestrictedAttentionComponent'
                        ' value-dim={2}'
                        ' key-dim={3}'
                        ' num-left-inputs={4}'
                        ' num-right-inputs={5}'
                        ' num-left-inputs-required={6}'
                        ' num-right-inputs-required={7}'
                        ' output-context={8}'
                        ' time-stride={9}'
                        ' num-heads={10}'
                        ' key-scale={11}'
                        ''.format(self.name, nonlinearity,
                                  self.config['value-dim'],
                                  self.config['key-dim'],
                                  self.config['num-left-inputs'],
                                  self.config['num-right-inputs'],
                                  self.config['num-left-inputs-required'],
                                  self.config['num-right-inputs-required'],
                                  self.config['output-context'],
                                  self.config['time-stride'],
                                  self.config['num-heads'],
                                  self.config['key-scale']))
                # Components after the attention operate on its output.
                dim = self.attention_output_dim()

            elif nonlinearity == 'sigmoid':
                line = ('component name={0}.{1}'
                        ' type=SigmoidComponent dim={2}'
                        ' self-repair-scale={3}'
                        ''.format(self.name, nonlinearity, dim,
                                  self_repair_scale))

            elif nonlinearity == 'tanh':
                line = ('component name={0}.{1}'
                        ' type=TanhComponent dim={2}'
                        ' self-repair-scale={3}'
                        ''.format(self.name, nonlinearity, dim,
                                  self_repair_scale))

            elif nonlinearity == 'renorm':
                line = ('component name={0}.{1}'
                        ' type=NormalizeComponent dim={2}'
                        ' target-rms={3}'
                        ''.format(self.name, nonlinearity, dim,
                                  target_rms))

            elif nonlinearity == 'batchnorm':
                line = ('component name={0}.{1}'
                        ' type=BatchNormComponent dim={2}'
                        ' target-rms={3}'
                        ''.format(self.name, nonlinearity, dim,
                                  target_rms))

            elif nonlinearity == 'dropout':
                line = ('component name={0}.{1} type=DropoutComponent '
                        'dim={2} dropout-proportion={3}'.format(
                            self.name, nonlinearity, dim,
                            self.config['dropout-proportion']))

            else:
                raise RuntimeError("Unknown nonlinearity type: {0}"
                                   .format(nonlinearity))

            configs.append(line)
            # Each component gets a node of the same name, fed by the
            # previous node of the chain.
            line = ('component-node name={0}.{1}'
                    ' component={0}.{1} input={2}'
                    ''.format(self.name, nonlinearity, cur_node))

            configs.append(line)
            cur_node = '{0}.{1}'.format(self.name, nonlinearity)
        return configs


================================================
FILE: egs/steps/libs/nnet3/xconfig/basic_layers.py
================================================
# Copyright 2016    Johns Hopkins University (Dan Povey)
#           2016    Vijayaditya Peddinti
#           2017    Google Inc. (vpeddinti@google.com)
#           2017    Vimal Manohar
# Apache 2.0.

""" This module contains the parent class from which all layers are inherited
and some basic layer definitions.
"""

from __future__ import print_function
from __future__ import division
import math
import re
import sys
import libs.nnet3.xconfig.utils as xutils
import libs.common as common_lib


class XconfigLayerBase(object):
    """ A base-class for classes representing layers of xconfig files.

    The constructor validates the layer name and then runs a fixed sequence of
    hooks: set_default_configs(), set_configs(), set_derived_configs() and
    check_configs().  Child classes describe a concrete layer type by
    overriding the methods in the section marked "Functions that might be
    overridden by the child class" below.
    """

    def __init__(self, first_token, key_to_value, all_layers):
        """
         first_token: first token on the xconfig line, e.g. 'affine-layer'.
         key_to_value: dictionary with parameter values, e.g.
             { 'name': 'affine1',
               'input': 'Append(0, 1, 2, ReplaceIndex(ivector, t, 0))',
               'dim': 1024 }.
             The only required and 'special' values that are dealt with directly
             at this level, are 'name' and 'input'. The rest are put in
             self.config and are dealt with by the child classes' init functions.
             Values are converted to the type of the matching default in
             set_configs().
         all_layers: An array of objects inheriting XconfigLayerBase for all
                    previously parsed layers.

         Raises RuntimeError if 'name' is missing, is rejected by
         xutils.is_valid_line_name(), or is already used by a previously
         parsed layer whose type is not 'existing'.
        """

        self.layer_type = first_token
        if 'name' not in key_to_value:
            raise RuntimeError("Expected 'name' to be specified.")
        self.name = key_to_value['name']
        if not xutils.is_valid_line_name(self.name):
            raise RuntimeError("Invalid value: name={0}".format(
                key_to_value['name']))

        self._check_name_is_unique(all_layers)

        self.config = {}
        # the following, which should be overridden in the child class, sets
        # default config parameters in self.config.
        self.set_default_configs()
        # The following is not to be reimplemented in child classes;
        # it sets the config values to those specified by the user, and
        # parses any Descriptors.
        self.set_configs(key_to_value, all_layers)
        # This method sets the derived default config values,
        # i.e., config values that, when not specified, can be derived from
        # other values. It can be overridden in the child class.
        self.set_derived_configs()
        # the following, which should be overridden in the child class, checks
        # that the config parameters that have been set are reasonable.
        self.check_configs()

    def _check_name_is_unique(self, all_layers):
        """Raises RuntimeError if self.name is already taken by a layer in
        'all_layers' whose layer_type is not 'existing'.

        It is possible to have two layers with the same name in 'all_layers',
        if the layer type for one of them is 'existing'.  Layers of type
        'existing' correspond to the component-node names in an existing
        model which we are adding layers to; they are not present in any
        config file, so a new layer with the same name may exist alongside
        them.  E.g. it is possible to have an 'output-node' with name 'output'
        in the existing model, which is added to all_layers using layer type
        'existing', and an 'output-layer' with the same name 'output'.

        'all_layers' may be None, which is treated as "no previous layers".
        """

        if all_layers is None:
            return
        for prev_layer in all_layers:
            # Compare by value ('!='), not identity: 'is not' against a string
            # literal only works when both strings happen to be the same
            # object, which is not guaranteed for equal strings.
            if (self.name == prev_layer.name
                    and prev_layer.layer_type != 'existing'):
                raise RuntimeError("Name '{0}' is used for more than one "
                                   "layer.".format(self.name))

    def set_configs(self, key_to_value, all_layers):
        """ Sets the config variables.
            We broke this code out of __init__ for clarity.
            the child-class constructor will deal with the configuration values
            in a more specific way.

            After this call self.config holds the user-supplied values
            (converted with xutils.convert_value_to_type to the type of the
            corresponding default), and self.descriptors maps each name
            returned by get_input_descriptor_names() to a dict with the keys
            'string', 'normalized-string', 'final-string' and 'dim'.

            Raises RuntimeError for an unexpected config key, for a descriptor
            name that is not a config key, or if a normalized descriptor does
            not re-parse to the same string.
        """

        # First check that there are no keys that don't correspond to any config
        # parameter of this layer, and if so, raise an exception with an
        # informative message saying what configs are allowed.
        for key, value in key_to_value.items():
            if key == 'name' or key in self.config:
                continue
            configs = ' '.join([('{0}->"{1}"'.format(x, y) if isinstance(y, str)
                                 else '{0}->{1}'.format(x, y))
                                for x, y in self.config.items()])
            raise RuntimeError("Configuration value {0}={1} was not "
                               "expected in layer of type {2}; allowed "
                               "configs with their defaults: {3}"
                               "" .format(key, value, self.layer_type, configs))

        # Every remaining key is known to be in self.config (checked above);
        # the type of the default decides the type of the stored value.
        for key, value in key_to_value.items():
            if key != 'name':
                self.config[key] = xutils.convert_value_to_type(
                    key, type(self.config[key]), value)

        self.descriptors = dict()
        self.descriptor_dims = dict()
        # Parse Descriptors and get their dims and their 'final' string form.
        # in self.descriptors[key]
        for key in self.get_input_descriptor_names():
            if key not in self.config:
                raise RuntimeError("{0}: object of type {1} needs to override"
                                   " get_input_descriptor_names()."
                                   "".format(sys.argv[0], str(type(self))))

            descriptor_string = self.config[key]  # input string.
            assert isinstance(descriptor_string, str)
            desc = self.convert_to_descriptor(descriptor_string, all_layers)
            desc_dim = self.get_dim_for_descriptor(desc, all_layers)
            desc_norm_str = desc.str()

            # desc_output_str contains the "final" component names, those that
            # appear in the actual config file (i.e. not names like
            # 'layer.auxiliary_output'); that's how it differs from desc_norm_str.
            # Note: it's possible that the two strings might be the same in
            # many, even most, cases-- it depends whether
            # output_name(self, auxiliary_output)
            # returns self.get_name() + '.' + auxiliary_output
            # when auxiliary_output is not None.
            # That's up to the designer of the layer type.
            desc_output_str = self.get_string_for_descriptor(desc, all_layers)
            self.descriptors[key] = {'string': desc,
                                     'normalized-string': desc_norm_str,
                                     'final-string': desc_output_str,
                                     'dim': desc_dim}

            # the following helps to check the code by parsing it again.
            desc2 = self.convert_to_descriptor(desc_norm_str, all_layers)
            desc_norm_str2 = desc2.str()
            # if the following ever fails we'll have to do some debugging.
            if desc_norm_str != desc_norm_str2:
                raise RuntimeError("Likely code error: '{0}' != '{1}'"
                                   "".format(desc_norm_str, desc_norm_str2))

    def str(self):
        """Converts 'this' to a string which could be printed to
        an xconfig file; in xconfig_to_configs.py we actually expand all the
        lines to strings and write it as xconfig.expanded as a reference
        (so users can see any defaults).

        The result is '<layer_type> name=<name>' followed by 'key=value' for
        every entry of self.config, in sorted key order.
        """

        list_of_entries = ['{0} name={1}'.format(self.layer_type, self.name)]
        for key, value in sorted(self.config.items()):
            if isinstance(value, str) and '=' in value:
                # the value is a string that contains an '=' sign, so we need to
                # enclose it in double-quotes, otherwise we wouldn't be able to
                # parse from that output.
                if '"' in value:
                    print("Warning: config '{0}={1}' contains both double-quotes "
                          "and equals sign; it will not be possible to parse it "
                          "from the file.".format(key, value), file=sys.stderr)
                list_of_entries.append('{0}="{1}"'.format(key, value))
            else:
                list_of_entries.append('{0}={1}'.format(key, value))

        return ' '.join(list_of_entries)

    def __str__(self):
        """Same text as str(), so that print(layer) works."""
        return self.str()

    def normalize_descriptors(self):
        """Converts any config variables in self.config which correspond to
        Descriptors, into a 'normalized form' derived from parsing them as
        Descriptors, replacing things like [-1] with the actual layer names,
        and regenerating them as strings.  We stored this when the object was
        initialized, in self.descriptors; this function just copies them back
        to the config.
        """

        for key, desc_str_dict in self.descriptors.items():
            self.config[key] = desc_str_dict['normalized-string']

    def convert_to_descriptor(self, descriptor_string, all_layers):
        """Convenience function intended to be called from child classes,
        converts a string representing a descriptor ('descriptor_string')
        into an object of type Descriptor, and returns it. It needs 'self' and
        'all_layers' (where 'all_layers' is a list of objects of type
        XconfigLayerBase) so that it can work out a list of the names of other
        layers, and get dimensions from them.

        Raises RuntimeError if tokens are left over after the descriptor has
        been parsed.
        """

        prev_names = xutils.get_prev_names(all_layers, self)
        tokens = xutils.tokenize_descriptor(descriptor_string, prev_names)
        pos = 0
        (descriptor, pos) = xutils.parse_new_descriptor(tokens, pos, prev_names)
        # note: 'pos' should point to the 'end of string' marker
        # that terminates 'tokens'.
        if pos != len(tokens) - 1:
            raise RuntimeError("Parsing Descriptor, saw junk at end: {0}"
                               "".format(' '.join(tokens[pos:-1])))
        return descriptor

    def get_dim_for_descriptor(self, descriptor, all_layers):
        """Returns the dimension of a Descriptor object. This is a convenience
        function used in set_configs.
        """

        def layer_to_dim(name):
            # resolves a layer name appearing in the descriptor to its dim.
            return xutils.get_dim_from_layer_name(all_layers, self, name)

        return descriptor.dim(layer_to_dim)

    def get_string_for_descriptor(self, descriptor, all_layers):
        """Returns the 'final' string form of a Descriptor object,
        as could be used in config files. This is a convenience function
        provided for use in child classes.
        """

        def layer_to_string(name):
            # resolves a layer name appearing in the descriptor to the string
            # that should stand for it in the config file.
            return xutils.get_string_from_layer_name(all_layers, self, name)

        return descriptor.config_string(layer_to_string)

    def get_name(self):
        """Returns the name of this layer, e.g. 'affine1'.  It does not
        necessarily correspond to a component name.
        """

        return self.name

    ######  Functions that might be overridden by the child class: #####

    def set_default_configs(self):
        """Child classes should override this.
        """

        raise Exception("Child classes must override set_default_configs().")

    def set_derived_configs(self):
        """This is expected to be called after set_configs and before
        check_configs().

        If the layer has a 'dim' config that is not positive and an 'input'
        descriptor was parsed, 'dim' is set to the dimension of that input.
        Layers without an 'input' descriptor are left untouched, so that
        check_configs() can report a bad 'dim' itself.
        """
        if ('dim' in self.config and self.config['dim'] <= 0
                and 'input' in self.descriptors):
            self.config['dim'] = self.descriptors['input']['dim']

    def check_configs(self):
        """child classes should override this.
        """

        pass

    def get_input_descriptor_names(self):
        """This function, which may be (but usually will not have to be)
        overridden by child classes, returns a list of names of the input
        descriptors expected by this component. Typically this would just
        return ['input'] as most layers just have one 'input'. However some
        layers might require more inputs (e.g. cell state of previous LSTM layer
        in Highway LSTMs). It is used in the function 'set_configs()'.
        This implementation will work for layer types whose only
        Descriptor-valued config is 'input'.
        If a child class adds more inputs, or does not have an input
        (e.g. the XconfigInputLayer), it should override this function's
        implementation to something like: `return ['input', 'input2']`
        """

        return ['input']

    def auxiliary_outputs(self):
        """Returns a list of all auxiliary outputs that this layer supports.
        These are either 'None' for the regular output, or a string
        (e.g. 'projection' or 'memory_cell') for any auxiliary outputs that
        the layer might provide.  Most layer types will not need to override
        this.
        """

        return [None]

    def output_name(self, auxiliary_output=None):
        """Called with auxiliary_output is None, this returns the component-node
        name of the principal output of the layer (or if you prefer, the text
        form of a descriptor that gives you such an output; such as
        Append(some_node, some_other_node)).
        The 'auxiliary_output' argument is a text value that is designed for
        extensions to layers that have additional auxiliary outputs.
        For example, to implement a highway LSTM you need the memory-cell of a
        layer, so you might allow auxiliary_output='memory_cell' for such a
        layer type, and it would return the component node or a suitable
        Descriptor: something like 'lstm3.c_t'
        """

        raise Exception("Child classes must override output_name()")

    def output_dim(self, auxiliary_output=None):
        """The dimension that this layer outputs.  The 'auxiliary_output'
        parameter is for layer types which support auxiliary outputs.
        """

        raise Exception("Child classes must override output_dim()")

    def get_full_config(self):
        """This function returns lines destined for the 'full' config format, as
        would be read by the C++ programs. Since the program
        xconfig_to_configs.py writes several config files, this function returns
        a list of pairs of the form (config_file_basename, line),
        e.g. something like
         [  ('init', 'input-node name=input dim=40'),
            ('ref', 'input-node name=input dim=40') ]
        which would be written to config_dir/init.config and config_dir/ref.config.
        """

        raise Exception("Child classes must override get_full_config()")


class XconfigInputLayer(XconfigLayerBase):
    """This class is for lines like
    'input name=input dim=40'
    or
    'input name=ivector dim=100'
    in the config file.

    Parameters of the class, and their defaults:
        dim=-1    :   Dimension of the input; it must be set to a positive
            value (see check_configs()).
    """
    def __init__(self, first_token, key_to_value, prev_names=None):
        """
         first_token: must be 'input'.
         key_to_value: dictionary with the parameter values of the line.
         prev_names: forwarded unchanged to XconfigLayerBase.__init__ as its
             'all_layers' argument.
        """

        assert first_token == 'input'
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        """The only config of an input layer is its dimension."""

        self.config = {'dim': -1}

    def set_derived_configs(self):
        """Nothing can be derived for an input layer.

        The inherited implementation replaces a non-positive 'dim' by the
        dimension of the 'input' descriptor; this layer has no such descriptor
        (see get_input_descriptor_names()), so that lookup would fail before
        check_configs() gets the chance to report the problem.
        """

        pass

    def check_configs(self):
        """Raises RuntimeError unless 'dim' is positive."""

        if self.config['dim'] <= 0:
            raise RuntimeError("Dimension of input-layer '{0}' "
                               "should be positive.".format(self.name))

    def get_input_descriptor_names(self):
        """Returns an empty list: this layer has no Descriptor-valued config."""

        return []  # there is no 'input' field in self.config.

    def output_name(self, auxiliary_outputs=None):
        """Returns the layer's own name; auxiliary outputs are not supported."""

        # there are no auxiliary outputs as this layer will just pass the input
        assert auxiliary_outputs is None
        return self.name

    def output_dim(self, auxiliary_outputs=None):
        """Returns the configured 'dim'; auxiliary outputs are not supported."""

        # there are no auxiliary outputs as this layer will just pass the input
        assert auxiliary_outputs is None
        return self.config['dim']

    def get_full_config(self):
        """Returns the same 'input-node' line for the 'init', 'ref' and 'final'
        configs, as a list of (config_file_basename, line) pairs.
        """

        # unlike other layers the input layers need to be printed in
        # 'init.config' (which initializes the neural network prior to the LDA)
        line = 'input-node name={0} dim={1}'.format(self.name,
                                                    self.config['dim'])
        return [(config_name, line)
                for config_name in ['init', 'ref', 'final']]


class XconfigTrivialOutputLayer(XconfigLayerBase):
    """
    Handles xconfig lines such as
    'output name=output input=Append(input@-1, input@0, input@1, ReplaceIndex(ivector, t, 0))'
    These are outputs that are not really output "layers": no affine transform
    or nonlinearity is created, the line maps directly to an output-node in
    nnet3.

    Parameters of the class, and their defaults:
        input='[-1]'    :   Descriptor giving the input of the layer.
        dim=-1    :   Not read by the methods of this class; output_dim()
            reports the dimension of the input descriptor.
        objective-type=linear   :   the only other choice currently is
            'quadratic', for use in regression problems
        output-delay=0    :  Can be used to shift the frames on the output, equivalent
             to delaying labels by this many frames (positive value increases latency
             in online decoding but may help if you're using unidirectional LSTMs).
    """

    def __init__(self, first_token, key_to_value, prev_names=None):
        """'first_token' must be 'output'; the arguments are handed on to
        XconfigLayerBase.__init__, with 'prev_names' as its 'all_layers'.
        """

        assert first_token == 'output'
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        """Installs the default values listed in the class docstring."""

        defaults = {}
        # 'input' is a descriptor; '[-1]' means the output of the most recent
        # layer.
        defaults['input'] = '[-1]'
        defaults['dim'] = -1
        defaults['objective-type'] = 'linear'
        defaults['output-delay'] = 0
        self.config = defaults

    def check_configs(self):
        """Raises RuntimeError unless objective-type is 'linear' or
        'quadratic'.
        """

        objective = self.config['objective-type']
        if objective not in ('linear', 'quadratic'):
            raise RuntimeError(
                "In output, objective-type has invalid value {0}".format(
                    self.config['objective-type']))

    def output_name(self, auxiliary_outputs=None):
        """Returns the name of this layer; no auxiliary outputs exist."""

        # this layer just passes on the output of the previous layer, so
        # there is nothing auxiliary to ask for.
        assert auxiliary_outputs is None
        return self.name

    def output_dim(self, auxiliary_outputs=None):
        """Returns the dimension of the 'input' descriptor; no auxiliary
        outputs exist.
        """

        assert auxiliary_outputs is None
        # each value of self.descriptors is a dict; its 'dim' entry is the
        # dimension of the parsed descriptor.
        input_info = self.descriptors['input']
        return input_info['dim']

    def get_full_config(self):
        """Returns the 'output-node' line of this layer once for 'ref' and
        once for 'final', as a list of (config_file_basename, line) pairs.
        """

        # 'final-string' is the form of the descriptor that may appear in
        # config files, i.e. it contains the 'final' names of nodes.
        node_input = self.descriptors['input']['final-string']
        objective = self.config['objective-type']
        delay = self.config['output-delay']

        # a nonzero output-delay is realised by wrapping the input in Offset().
        if delay != 0:
            node_input = 'Offset({0}, {1})'.format(node_input, delay)

        node_line = 'output-node name={0} input={1} objective={2}'.format(
            self.name, node_input, objective)
        return [(config_name, node_line) for config_name in ('ref', 'final')]


class XconfigOutputLayer(XconfigLayerBase):
    """Handles xconfig lines such as
    'output-layer name=output dim=4257 input=Append(input@-1, input@0, input@1, ReplaceIndex(ivector, t, 0))'
    By default this includes a log-softmax component.  The parameters are
    initialized to zero, as this empirically tends to be the best approach for output layers.

    Parameters of the class, and their defaults:
        input='[-1]'    :   Descriptor giving the input of the layer.
        dim=-1    :   Output dimension of layer, will normally equal the number of pdfs.
        bottleneck-dim=-1    :   Bottleneck dimension of layer: if supplied
            (i.e. not negative), instead of an affine component we'll have a
            linear then affine, so a linear bottleneck, with the linear part
            constrained to be orthonormal.
        orthonormal-constraint=1.0    :   Only matters if bottleneck-dim is
            set; must be positive.
        include-log-softmax=true    :   setting it to false will omit the
            log-softmax component- useful for chain models.
        objective-type=linear   :   the only other choice currently is
            'quadratic', for use in regression problems
        learning-rate-factor=''    :   (effective default: 1.0) Learning rate
            factor for the final affine component, multiplies the standard
            learning rate. normally you'll leave this as-is, but for xent
            regularization output layers for chain models you'll want to set
            learning-rate-factor=(0.5/xent_regularize),
            normally learning-rate-factor=5.0 since xent_regularize is
            normally 0.1.
        max-change=1.5 :  Can be used to change the max-change parameter in the
            affine component; this affects how much the matrix can change on each
            iteration.
        l2-regularize=''    :   (effective default: 0.0) Set this to a nonzero
            value (e.g. 1.0e-05) to add l2 regularization on the parameter
            norm for the affine component.
        param-stddev=0.0, bias-stddev=0.0    :   Passed through to the affine
            component only; it tends to be beneficial to initialize the output
            layer with zero values, unlike the hidden layers.
        output-delay=0    :  Can be used to shift the frames on the output, equivalent
             to delaying labels by this many frames (positive value increases latency
             in online decoding but may help if you're using unidirectional LSTMs).
        ng-affine-options=''  :   Can be used supply non-default options to the affine
             layer (intended for the natural gradient but can be an arbitrary string
             to be added to the config line.  e.g. 'update-period=2'.).
        ng-linear-options=''  :   Options, like ng-affine-options, that are passed to
             the LinearComponent, only in bottleneck layers (i.e. if bottleneck-dim
             is supplied).
    """

    def __init__(self, first_token, key_to_value, prev_names=None):
        """'first_token' must be 'output-layer'; the arguments are handed on
        to XconfigLayerBase.__init__, with 'prev_names' as its 'all_layers'.
        """

        assert first_token == 'output-layer'
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        """Installs the default values listed in the class docstring."""

        defaults = {
            # a descriptor; '[-1]' means the output of the most recent layer.
            'input': '[-1]',
            'dim': -1,
            'bottleneck-dim': -1,
            # only matters if bottleneck-dim is set.
            'orthonormal-constraint': 1.0,
            # this would be false for chain models.
            'include-log-softmax': True,
            # see Nnet::ProcessOutputNodeConfigLine in nnet-nnet.cc for other
            # options.
            'objective-type': 'linear',
            'output-delay': 0,
            'ng-affine-options': '',
            # only affects bottleneck output layers.
            'ng-linear-options': '',
        }
        # Passed through to the affine component, and (in the bottleneck
        # case) the linear component.  An empty string means "leave the
        # option off the config line".
        defaults['learning-rate-factor'] = ''  # effective default: 1.0
        defaults['l2-regularize'] = ''         # effective default: 0.0
        defaults['max-change'] = 1.5
        # Passed through to the affine component only.  It tends to be
        # beneficial to initialize the output layer with zero values, unlike
        # the hidden layers.
        defaults['param-stddev'] = 0.0
        defaults['bias-stddev'] = 0.0
        self.config = defaults

    def check_configs(self):
        """Raises RuntimeError if 'dim' is -1 or lower, if objective-type is
        neither 'linear' nor 'quadratic', or if orthonormal-constraint is not
        positive.
        """

        dim = self.config['dim']
        if dim <= -1:
            raise RuntimeError(
                "In output-layer, dim has invalid value {0}".format(
                    self.config['dim']))

        if self.config['objective-type'] not in ('linear', 'quadratic'):
            raise RuntimeError(
                "In output-layer, objective-type has invalid value {0}".format(
                    self.config['objective-type']))

        if self.config['orthonormal-constraint'] <= 0.0:
            raise RuntimeError(
                "output-layer does not support negative (floating) "
                "orthonormal constraint; use a separate linear-component "
                "followed by batchnorm-component.")

    def auxiliary_outputs(self):
        """Returns ['affine'], plus 'log-softmax' when the log-softmax
        component is included.
        """

        if self.config['include-log-softmax']:
            return ['affine', 'log-softmax']
        return ['affine']

    def output_name(self, auxiliary_output=None):
        """Returns '<name>.<auxiliary_output>' for one of the names listed by
        auxiliary_outputs().

        Raises RuntimeError if auxiliary_output is None or is not a supported
        auxiliary output.
        """

        if auxiliary_output is None:
            # Note: nodes of type output-node in nnet3 may not be accessed in
            # Descriptors, so asking for the principal output doesn't make
            # sense.
            raise RuntimeError("Outputs of output-layer may not be used by other"
                               " layers")

        if auxiliary_output not in self.auxiliary_outputs():
            raise RuntimeError(
                "Unknown auxiliary output name {0}".format(auxiliary_output))

        return '{0}.{1}'.format(self.name, auxiliary_output)

    def output_dim(self, auxiliary_output=None):
        """Returns the configured 'dim' for any non-None auxiliary_output.

        Raises RuntimeError if auxiliary_output is None.
        """

        if auxiliary_output is not None:
            return self.config['dim']

        # Note: nodes of type output-node in nnet3 may not be accessed in
        # Descriptors, so asking for the principal output doesn't make sense.
        raise RuntimeError("Outputs of output-layer may not be used by other"
                           " layers")

    def get_full_config(self):
        """Returns every line of _generate_config() once for 'ref' and once
        for 'final', as a list of (config_file_basename, line) pairs.
        """

        # the 'ref' and 'final' configs get exactly the same lines.
        return [(config_name, line)
                for line in self._generate_config()
                for config_name in ('ref', 'final')]

    def _generate_config(self):
        """Returns the list of config lines for this layer: an optional
        linear bottleneck, the affine component, an optional log-softmax,
        and finally the output-node.

        Raises RuntimeError if bottleneck-dim is zero, or is not smaller than
        both the input dimension and the output dimension.
        """

        layer_name = self.name
        # 'final-string' is the form of the descriptor that may appear in
        # config files, i.e. it contains the 'final' names of nodes.
        input_info = self.descriptors['input']
        src_node = input_info['final-string']
        src_dim = input_info['dim']
        num_outputs = self.config['dim']
        bn_dim = self.config['bottleneck-dim']
        objective = self.config['objective-type']
        want_log_softmax = self.config['include-log-softmax']
        delay = self.config['output-delay']

        lines = []

        def emit(suffix, component_spec, node_input):
            # Adds the 'component' line and the matching 'component-node' line
            # for '<layer_name>.<suffix>'; returns the name of the new node.
            lines.append('component name={0}.{1} {2}'.format(
                layer_name, suffix, component_spec))
            lines.append('component-node name={0}.{1} component={0}.{1} '
                         'input={2}'.format(layer_name, suffix, node_input))
            return '{0}.{1}'.format(layer_name, suffix)

        # options whose value is the empty string are left off the line.
        affine_opts = self.config['ng-affine-options'] + ''.join(
            ' {0}={1}'.format(opt, self.config[opt])
            for opt in ('learning-rate-factor', 'l2-regularize', 'max-change',
                        'param-stddev', 'bias-stddev')
            if self.config[opt] != '')

        if bn_dim >= 0:
            if bn_dim == 0 or bn_dim >= src_dim or bn_dim >= num_outputs:
                raise RuntimeError(
                    "Bottleneck dim has value that does not make sense: "
                    "{0}".format(bn_dim))
            # This is the bottleneck case (it doesn't necessarily imply we
            # will be using the features from the bottleneck; it's just a
            # factorization of the matrix into two pieces without a
            # nonlinearity in between).
            # NOTE(review): 'l2-regularize' is forwarded to the linear
            # component here, although an earlier comment said it was left
            # out as useless given the orthonormality constraint -- confirm
            # which is intended.
            linear_opts = self.config['ng-linear-options'] + ''.join(
                ' {0}={1}'.format(opt, self.config[opt])
                for opt in ('learning-rate-factor', 'l2-regularize',
                            'max-change')
                if self.config[opt] != '')

            constraint = self.config['orthonormal-constraint']
            # note: by default the LinearComponent uses natural gradient.
            # NOTE(review): the line hard-codes max-change=0.75 and then also
            # carries the configured max-change via linear_opts -- confirm
            # which of the two the config reader honours.
            src_node = emit(
                'linear',
                'type=LinearComponent '
                'orthonormal-constraint={0} param-stddev={1} '
                'input-dim={2} output-dim={3} max-change=0.75 {4}'.format(
                    constraint, constraint / math.sqrt(src_dim),
                    src_dim, bn_dim, linear_opts),
                src_node)
            src_dim = bn_dim

        src_node = emit(
            'affine',
            'type=NaturalGradientAffineComponent '
            'input-dim={0} output-dim={1} {2}'.format(
                src_dim, num_outputs, affine_opts),
            src_node)

        if want_log_softmax:
            src_node = emit(
                'log-softmax',
                'type=LogSoftmaxComponent dim={0}'.format(num_outputs),
                src_node)

        # a nonzero output-delay is realised by wrapping the input in Offset().
        if delay != 0:
            src_node = 'Offset({0}, {1})'.format(src_node, delay)

        lines.append('output-node name={0} input={1} objective={2}'.format(
            layer_name, src_node, objective))
        return lines


class XconfigBasicLayer(XconfigLayerBase):
    """This class is for parsing lines like
     'relu-renorm-layer name=layer1 dim=1024 input=Append(-3,0,3)'
    or:
     'sigmoid-layer name=layer1 dim=1024 input=Append(-3,0,3)'
    which specify addition of an affine component and a sequence of non-linearities.
    Here, the name of the layer itself dictates the sequence of nonlinearities
    that are applied after the affine component; the name should contain some
    combination of 'relu', 'renorm', 'sigmoid', 'tanh', 'batchnorm', 'so'
    and 'dropout', and these nonlinearities will be added along with the
    affine component, in the order in which they appear in the name.

    The dimension specified is the output dim; the input dim is worked out from the input descriptor.
    This class supports only nonlinearity types that do not change the dimension; we can create
    another layer type to enable the use p-norm and similar dimension-reducing nonlinearities.

    See other configuration values below.

    Parameters of the class, and their defaults:
      input='[-1]'             [Descriptor giving the input of the layer.]
      dim=-1                   [Output dimension of layer, e.g. 1024.  Negative
                                values are rejected by check_configs(); a value
                                of 0 makes output_dim() fall back to the input
                                dimension.]
      bottleneck-dim=-1        [If you set this, a linear bottleneck is added, so
                                we project to first bottleneck-dim then to dim.  The
                                first of the two matrices is constrained to be
                                orthonormal.]
      self-repair-scale=1.0e-05  [Affects relu, sigmoid and tanh layers.]
      learning-rate-factor=1.0   [This can be used to make the affine component
                                  train faster or slower].
      add-log-stddev=False     [If true, the log of the stddev of the output of
                                renorm layer is appended as an
                                additional dimension of the layer's output]
      l2-regularize=0.0       [Set this to a nonzero value (e.g. 1.0e-05) to
                               add l2 regularization on the parameter norm for
                                this component.]
    """
    def __init__(self, first_token, key_to_value, prev_names=None):
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        """Install the default value of every option this layer accepts."""
        # note: self.config['input'] is a descriptor, '[-1]' means output
        # the most recent layer.
        self.config = {'input': '[-1]',
                       'dim': -1,
                       'bottleneck-dim': -1,  # Deprecated!  Use tdnnf-layer for
                                              # factorized TDNNs, or prefinal-layer
                                              # for bottlenecks just before the output.
                       'self-repair-scale': 1.0e-05,
                       'target-rms': 1.0,
                       'ng-affine-options': '',
                       'ng-linear-options': '',    # only affects bottleneck layers.
                       'dropout-proportion': 0.5,  # dropout-proportion only
                                                   # affects layers with
                                                   # 'dropout' in the name
                       'dropout-per-dim': False,  # if dropout-per-dim=true, the dropout
                                                  # mask is shared across time.
                       'dropout-per-dim-continuous':  False, # if you set this, it's
                                                    # like dropout-per-dim but with a
                                                    # continuous-valued (not zero-one) mask.
                       'add-log-stddev': False,
                       # the following are not really inspected by this level of
                       # code, just passed through to the affine component if
                       # their value is not ''.
                       'bias-stddev': '',
                       'l2-regularize': '',
                       'learning-rate-factor': '',
                       'max-change': 0.75 }

    def check_configs(self):
        """Validate the user-supplied options.

        Raises:
            RuntimeError: if 'dim', 'bottleneck-dim', 'self-repair-scale',
                'target-rms' or 'learning-rate-factor' is out of range.
        """
        if self.config['dim'] < 0:
            raise RuntimeError("dim has invalid value {0}".format(self.config['dim']))
        b = self.config['bottleneck-dim']
        # A negative bottleneck-dim means "no bottleneck"; otherwise it has to
        # be strictly between 0 and dim.
        if b >= 0 and (b >= self.config['dim'] or b == 0):
            raise RuntimeError("bottleneck-dim has an invalid value {0}".format(b))

        if self.config['self-repair-scale'] < 0.0 or self.config['self-repair-scale'] > 1.0:
            raise RuntimeError("self-repair-scale has invalid value {0}"
                               .format(self.config['self-repair-scale']))
        if self.config['target-rms'] < 0.0:
            raise RuntimeError("target-rms has invalid value {0}"
                               .format(self.config['target-rms']))
        # '' means the option was not given and is left to the component.
        if (self.config['learning-rate-factor'] != '' and
            self.config['learning-rate-factor'] <= 0.0):
            raise RuntimeError("learning-rate-factor has invalid value {0}"
                               .format(self.config['learning-rate-factor']))

    def output_name(self, auxiliary_output=None):
        """Return the name of the node that carries this layer's output.

        The output is the node of the last nonlinearity named in the layer
        type, e.g. 'layer3.renorm' for a 'relu-renorm-layer' named 'layer3'.
        """
        # at a later stage we might want to expose even the pre-nonlinearity
        # vectors
        assert auxiliary_output is None

        split_layer_name = self.layer_type.split('-')
        assert split_layer_name[-1] == 'layer'
        last_nonlinearity = split_layer_name[-2]
        # return something like: layer3.renorm
        return '{0}.{1}'.format(self.name, last_nonlinearity)

    def output_dim(self, auxiliary_output=None):
        """Return the output dimension of the layer.

        If 'dim' is not positive, the input dimension is used instead and
        stored back into self.config['dim'].  The resolved value is what gets
        returned (previously the stale non-positive value was returned even
        though the config had just been updated).
        """
        # If not set, the output-dim defaults to the input-dim.
        if self.config['dim'] <= 0:
            self.config['dim'] = self.descriptors['input']['dim']

        return self.config['dim']

    def get_full_config(self):
        """Return a list of (config-file-name, config-line) pairs."""
        ans = []
        config_lines = self._generate_config()

        for line in config_lines:
            for config_name in ['ref', 'final']:
                # we do not support user specified matrices in this layer
                # so 'ref' and 'final' configs are the same.
                ans.append((config_name, line))
        return ans

    def _generate_config(self):
        """Work out the nonlinearity sequence and input, then build the lines."""
        split_layer_name = self.layer_type.split('-')
        assert split_layer_name[-1] == 'layer'
        nonlinearities = split_layer_name[:-1]

        # by 'descriptor_final_string' we mean a string that can appear in
        # config-files, i.e. it contains the 'final' names of nodes.
        input_desc = self.descriptors['input']['final-string']
        input_dim = self.descriptors['input']['dim']

        # the child classes e.g. tdnn might want to process the input
        # before adding the other components

        return self._add_components(input_desc, input_dim, nonlinearities)

    def _add_components(self, input_desc, input_dim, nonlinearities):
        """Build the component / component-node lines for this layer.

        Args:
            input_desc: descriptor string used as input of the first component.
            input_dim: dimension of that input.
            nonlinearities: list of nonlinearity names, in application order.

        Returns:
            A list of config lines: an optional linear bottleneck, the affine
            component, then one component per nonlinearity, each followed by
            its component-node.

        Raises:
            RuntimeError: if add-log-stddev is set without a final 'renorm',
                or if an unknown nonlinearity name is encountered.
        """
        output_dim = self.output_dim()
        self_repair_scale = self.config['self-repair-scale']
        target_rms = self.config['target-rms']

        affine_options = self.config['ng-affine-options']
        for opt_name in [ 'max-change', 'learning-rate-factor',
                          'bias-stddev', 'l2-regularize' ]:
            value = self.config[opt_name]
            if value != '':
                affine_options += ' {0}={1}'.format(opt_name, value)

        # The output of the affine component needs to have one dimension fewer in order to
        # get the required output dim, if the final 'renorm' component has 'add-log-stddev' set
        # (since in that case it increases the dimension by one).
        if self.config['add-log-stddev']:
            output_dim -= 1
            if self.layer_type.split('-')[-2] != "renorm":
                raise RuntimeError("add-log-stddev cannot be true unless "
                                   "there is a final 'renorm' component.")

        configs = []
        cur_dim = input_dim
        cur_node = input_desc

        # First the affine node (or linear then affine, if bottleneck).
        if self.config['bottleneck-dim'] > 0:
            # The 'bottleneck-dim' option is deprecated and may eventually be
            # removed.  Best to use tdnnf-layer if you want factorized TDNNs.

            # This is the bottleneck case (it doesn't necessarily imply we
            # will be using the features from the bottleneck; it's just a factorization
            # of the matrix into two pieces without a nonlinearity in between).
            # We don't include the l2-regularize option because it's useless
            # given the orthonormality constraint.
            linear_options = self.config['ng-linear-options']
            for opt_name in [ 'max-change', 'learning-rate-factor' ]:
                value = self.config[opt_name]
                if value != '':
                    linear_options += ' {0}={1}'.format(opt_name, value)

            bottleneck_dim = self.config['bottleneck-dim']
            # note: by default the LinearComponent uses natural gradient.
            line = ('component name={0}.linear type=LinearComponent '
                    'input-dim={1} orthonormal-constraint=1.0 output-dim={2} {3}'
                    ''.format(self.name, input_dim, bottleneck_dim, linear_options))
            configs.append(line)
            line = ('component-node name={0}.linear component={0}.linear input={1}'
                    ''.format(self.name, cur_node))
            configs.append(line)
            cur_node = '{0}.linear'.format(self.name)
            cur_dim = bottleneck_dim


        line = ('component name={0}.affine type=NaturalGradientAffineComponent'
                ' input-dim={1} output-dim={2} {3}'
                ''.format(self.name, cur_dim, output_dim, affine_options))
        configs.append(line)
        line = ('component-node name={0}.affine component={0}.affine input={1}'
                ''.format(self.name, cur_node))
        configs.append(line)
        cur_node = '{0}.affine'.format(self.name)

        for i, nonlinearity in enumerate(nonlinearities):
            if nonlinearity == 'relu':
                line = ('component name={0}.{1} type=RectifiedLinearComponent dim={2}'
                        ' self-repair-scale={3}'
                        ''.format(self.name, nonlinearity, output_dim,
                                  self_repair_scale))

            elif nonlinearity == 'sigmoid':
                line = ('component name={0}.{1}'
                        ' type=SigmoidComponent dim={2}'
                        ' self-repair-scale={3}'
                        ''.format(self.name, nonlinearity, output_dim,
                                  self_repair_scale))

            elif nonlinearity == 'tanh':
                line = ('component name={0}.{1}'
                        ' type=TanhComponent dim={2}'
                        ' self-repair-scale={3}'
                        ''.format(self.name, nonlinearity, output_dim,
                                  self_repair_scale))

            elif nonlinearity == 'renorm':
                # add-log-stddev is only ever applied on the last
                # nonlinearity of the layer.
                add_log_stddev = "false"
                if i == len(nonlinearities) - 1:
                    add_log_stddev = ("true" if self.config['add-log-stddev']
                                      else "false")
                line = ('component name={0}.{1}'
                        ' type=NormalizeComponent dim={2}'
                        ' target-rms={3}'
                        ' add-log-stddev={4}'
                        ''.format(self.name, nonlinearity, output_dim,
                                  target_rms, add_log_stddev))

            elif nonlinearity == 'batchnorm':
                line = ('component name={0}.{1}'
                        ' type=BatchNormComponent dim={2} target-rms={3}'
                        ''.format(self.name, nonlinearity, output_dim,
                                  target_rms))

            elif nonlinearity == 'so':
                line = ('component name={0}.{1}'
                        ' type=ScaleAndOffsetComponent dim={2} max-change=0.5 '
                        ''.format(self.name, nonlinearity, output_dim))

            elif nonlinearity == 'dropout':
                if not (self.config['dropout-per-dim'] or
                        self.config['dropout-per-dim-continuous']):
                    line = ('component name={0}.{1} type=DropoutComponent '
                            'dim={2} dropout-proportion={3}'.format(
                                self.name, nonlinearity, output_dim,
                                self.config['dropout-proportion']))
                else:
                    continuous_opt='continuous=true' if self.config['dropout-per-dim-continuous'] else ''

                    line = ('component name={0}.dropout type=GeneralDropoutComponent '
                            'dim={1} dropout-proportion={2} {3}'.format(
                                self.name, output_dim, self.config['dropout-proportion'],
                                continuous_opt))
            else:
                raise RuntimeError("Unknown nonlinearity type: {0}"
                                   .format(nonlinearity))

            configs.append(line)
            line = ('component-node name={0}.{1}'
                    ' component={0}.{1} input={2}'
                    ''.format(self.name, nonlinearity, cur_node))

            configs.append(line)
            cur_node = '{0}.{1}'.format(self.name, nonlinearity)
        return configs


class XconfigFixedAffineLayer(XconfigLayerBase):
    """
    This class is for lines like
     'fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=foo/bar/lda.mat'

    The output dimension of the layer may be specified via 'dim=xxx', but if not specified,
    the dimension defaults to the same as the input.  Note: we don't attempt to read that
    file at the time the config is created, because in the recipes, that file is created
    after the config files.

    See other configuration values below.

    Parameters of the class, and their defaults:
      input='[-1]'             [Descriptor giving the input of the layer.]
      dim=-1                   [Output dimension of layer; a non-positive value
                                means the same as the input dim.]
      affine-transform-file='' [Must be specified.]
      delay=0                  [Optional delay for the output-node in init.config]
      write-init-config=True   [If true, an output-node is written to
                                init.config.]
    """
    def __init__(self, first_token, key_to_value, prev_names=None):
        assert first_token == 'fixed-affine-layer'
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        """Install the default value of every option this layer accepts."""
        # note: self.config['input'] is a descriptor, '[-1]' means output
        # the most recent layer.
        self.config = {'input': '[-1]',
                       'dim': -1,
                       'affine-transform-file': '',
                       'delay': 0,
                       'write-init-config': True}

    def check_configs(self):
        """Validate the user-supplied options.

        Raises:
            RuntimeError: if 'affine-transform-file' was not given.
        """
        # The default is the empty string (not None), so test for any empty
        # value; otherwise a missing file name is never detected and an empty
        # 'matrix=' ends up in final.config.
        if not self.config['affine-transform-file']:
            raise RuntimeError("affine-transform-file must be set.")

    def output_name(self, auxiliary_output=None):
        """Return the name of the single node this layer produces."""
        # Fixed affine layer computes only one vector, there are no intermediate
        # vectors.
        assert auxiliary_output is None
        return self.name

    def output_dim(self, auxiliary_output=None):
        """Return 'dim' if positive, otherwise the input dimension."""
        output_dim = self.config['dim']
        # If not set, the output-dim defaults to the input-dim.
        if output_dim <= 0:
            output_dim = self.descriptors['input']['dim']
        return output_dim

    def get_full_config(self):
        """Return a list of (config-file-name, config-line) pairs.

        Lines are tagged 'init', 'ref' or 'final' according to the config
        file they belong to.
        """
        ans = []

        # note: each value of self.descriptors is (descriptor, dim,
        # normalized-string, output-string).
        # by 'descriptor_final_string' we mean a string that can appear in
        # config-files, i.e. it contains the 'final' names of nodes.
        descriptor_final_string = self.descriptors['input']['final-string']
        input_dim = self.descriptors['input']['dim']
        output_dim = self.output_dim()
        transform_file = self.config['affine-transform-file']

        if self.config['write-init-config']:
            if self.config['delay'] != 0:
                # Route the input through a no-op component so that the
                # output-node can apply the requested offset to a named node.
                line = 'component name={0}.delayed type=NoOpComponent dim={1}'.format(self.name, input_dim)
                ans.append(('init', line))
                line = 'component-node name={0}.delayed component={0}.delayed input={1}'.format(self.name, descriptor_final_string)
                ans.append(('init', line))
                line = 'output-node name=output input=Offset({0}.delayed, {1})'.format(self.name, self.config['delay'])
                ans.append(('init', line))
            else:
                # to init.config we write an output-node with the name 'output' and
                # with a Descriptor equal to the descriptor that's the input to this
                # layer.  This will be used to accumulate stats to learn the LDA transform.
                line = 'output-node name=output input={0}'.format(descriptor_final_string)
                ans.append(('init', line))

        # write the 'real' component to final.config
        line = 'component name={0} type=FixedAffineComponent matrix={1}'.format(
            self.name, transform_file)
        ans.append(('final', line))
        # write a random version of the component, with the same dims, to ref.config
        line = 'component name={0} type=FixedAffineComponent input-dim={1} output-dim={2}'.format(
            self.name, input_dim, output_dim)
        ans.append(('ref', line))
        # the component-node gets written to final.config and ref.config.
        line = 'component-node name={0} component={0} input={1}'.format(
            self.name, descriptor_final_string)
        ans.append(('final', line))
        ans.append(('ref', line))
        return ans


class XconfigAffineLayer(XconfigLayerBase):
    """
    This class is for lines like
     'affine-layer name=affine dim=512 input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0))'

    It adds a single NaturalGradientAffineComponent with no nonlinearity.

    See other configuration values below.

    Parameters of the class, and their defaults:
      input='[-1]'             [Descriptor giving the input of the layer.]
      dim=-1                   [Output dimension of layer.  check_configs()
                                currently requires a positive value.]
      param-stddev=-1.0        [If negative, it is set to 1/sqrt(input-dim).]
      bias-stddev=1.0
      bias-mean=0.0
      max-change=0.75
      l2-regularize=0.0       [Set this to a nonzero value (e.g. 1.0e-05) to
                               add l2 regularization on the parameter norm
                               for the affine component.]
      learning-rate-factor=1.0 [Written to the component only when it differs
                                from 1.0.]
      ng-affine-options=''     [Extra options appended verbatim to the
                                component line.]
    """

    def __init__(self, first_token, key_to_value, prev_names=None):
        assert first_token == 'affine-layer'
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        """Install the default value of every option this layer accepts."""
        # note: self.config['input'] is a descriptor, '[-1]' means output
        # the most recent layer.
        # C++ component provides more options but I will just expose these for now
        # Note : The type of the parameter is determined based on the value assigned
        #        so please use decimal point if your parameter is a float
        self.config = {'input': '[-1]',
                       'dim': -1,
                       'param-stddev': -1.0,  # this has to be initialized to 1/sqrt(input_dim)
                       'bias-stddev': 1.0,
                       'bias-mean': 0.0,
                       'max-change': 0.75,
                       'l2-regularize': 0.0,
                       'learning-rate-factor': 1.0,
                       'ng-affine-options': ''}

    def set_derived_configs(self):
        """Fill in param-stddev from the input dimension when it was not set."""
        super(XconfigAffineLayer, self).set_derived_configs()
        if self.config['param-stddev'] < 0:
            self.config['param-stddev'] = 1.0 / math.sqrt(self.descriptors['input']['dim'])

    def check_configs(self):
        """Validate the user-supplied options.

        Raises:
            RuntimeError: if 'dim' is not positive.
        """
        if self.config['dim'] <= 0:
            raise RuntimeError("dim specified is invalid")

    def output_name(self, auxiliary_output=None):
        """Return the name of the single node this layer produces."""
        # affine layer computes only one vector, there are no intermediate
        # vectors.
        assert auxiliary_output is None
        return self.name

    def output_dim(self, auxiliary_output=None):
        """Return 'dim' if positive, otherwise the input dimension."""
        output_dim = self.config['dim']
        # If not set, the output-dim defaults to the input-dim.
        if output_dim <= 0:
            output_dim = self.descriptors['input']['dim']

        return output_dim

    def get_full_config(self):
        """Return a list of (config-file-name, config-line) pairs.

        The same component and component-node lines are emitted for both the
        'final' and the 'ref' config.
        """
        ans = []

        # note: each value of self.descriptors is (descriptor, dim,
        # normalized-string, output-string).
        # by 'descriptor_final_string' we mean a string that can appear in
        # config-files, i.e. it contains the 'final' names of nodes.
        descriptor_final_string = self.descriptors['input']['final-string']
        input_dim = self.descriptors['input']['dim']
        output_dim = self.output_dim()

        option_string = ''
        for key in ['param-stddev', 'bias-stddev', 'bias-mean', 'max-change',
                    'l2-regularize']:
            option_string += ' {0}={1}'.format(key, self.config[key])
        # learning-rate-factor used to be accepted but silently dropped.  It
        # is only written when it differs from this layer's default so that
        # configs generated with default settings stay unchanged.
        if self.config['learning-rate-factor'] != 1.0:
            option_string += ' learning-rate-factor={0}'.format(
                self.config['learning-rate-factor'])
        # Separate the free-form options with a space; without it they were
        # fused onto the preceding 'l2-regularize=...' value.
        if self.config['ng-affine-options']:
            option_string += ' ' + self.config['ng-affine-options']

        conf_lines = []
        # the component itself
        conf_lines.append('component name={n} type=NaturalGradientAffineComponent '
                          'input-dim={i} output-dim={o} {opts}'.format(n=self.name,
                                                                       i=input_dim,
                                                                       o=output_dim,
                                                                       opts=option_string))
        # the component-node gets written to final.config and ref.config.
        conf_lines.append('component-node name={0} component={0} input={1}'.format(self.name,
                                                                                   descriptor_final_string))

        # the config is same for both final and ref configs
        for conf_name in ['final', 'ref']:
            for line in conf_lines:
                ans.append((conf_name, line))
        return ans


class XconfigIdctLayer(XconfigLayerBase):
    """
    This class is for lines like
     'idct-layer name=idct dim=40 cepstral-lifter=22 affine-transform-file=foo/bar/idct.mat'

    This is used to convert input MFCC-features to Filterbank features. The
    affine transformation is written out to the file specified via
    'affine-transform-file=xxx'.
    The output dimension of the layer may be specified via 'dim=xxx', but if not specified,
    the dimension defaults to the same as the input.

    See other configuration values below.

    Parameters of the class, and their defaults:
      input='[-1]'             [Descriptor giving the input of the layer.]
      dim=-1                   [Output dimension of layer; a non-positive value
                                means the same as the input dim.]
      cepstral-lifter=22       [Apply liftering co-efficient.]
      affine-transform-file='' [Must be specified.]
      include-in-init=false     [You should set this to true if this precedes a
                                `fixed-affine-layer` that is to be initialized
                                 via LDA]
    """
    def __init__(self, first_token, key_to_value, prev_names=None):
        assert first_token == 'idct-layer'
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        """Install the default value of every option this layer accepts."""
        # note: self.config['input'] is a descriptor, '[-1]' means output
        # the most recent layer.
        self.config = {'input': '[-1]',
                       'dim': -1,
                       'cepstral-lifter': 22.0,
                       'affine-transform-file': '',
                       'include-in-init': False}

    def check_configs(self):
        """Validate the user-supplied options.

        Raises:
            RuntimeError: if 'affine-transform-file' was not given.
        """
        # The default is the empty string (not None), so test for any empty
        # value; _generate_config() writes the matrix to this path.
        if not self.config['affine-transform-file']:
            raise RuntimeError("affine-transform-file must be set.")

    def output_name(self, auxiliary_output=None):
        """Return the name of the single node this layer produces."""
        # Fixed affine layer computes only one vector, there are no intermediate
        # vectors.
        assert auxiliary_output is None
        return self.name

    def output_dim(self, auxiliary_output=None):
        """Return 'dim' if positive, otherwise the input dimension."""
        output_dim = self.config['dim']
        # If not set, the output-dim defaults to the input-dim.
        if output_dim <= 0:
            output_dim = self.descriptors['input']['dim']
        return output_dim

    def get_full_config(self):
        """Return a list of (config-file-name, config-line) pairs.

        Every line goes to 'ref' and 'final', and additionally to 'init' when
        include-in-init is set.
        """
        ans = []
        config_lines = self._generate_config()
        for line in config_lines:
            for config_name in ['ref', 'final']:
                # we do not support user specified matrices in this layer
                # so 'ref' and 'final' configs are the same.
                ans.append((config_name, line))
            if self.config['include-in-init']:
                ans.append(('init', line))
        return ans


    def _generate_config(self):
        """Write the IDCT matrix to disk and return the config lines using it.

        Side effect: the matrix (with an appended zero bias column) is written
        to the path given by 'affine-transform-file'.
        """

        # note: each value of self.descriptors is (descriptor, dim,
        # normalized-string, output-string).
        # by 'descriptor_final_string' we mean a string that can appear in
        # config-files, i.e. it contains the 'final' names of nodes.
        descriptor_final_string = self.descriptors['input']['final-string']
        input_dim = self.descriptors['input']['dim']
        output_dim = self.output_dim()
        transform_file = self.config['affine-transform-file']

        idct_mat = common_lib.compute_idct_matrix(
            input_dim, output_dim, self.config['cepstral-lifter'])
        # append a zero column to the matrix, this is the bias of the fixed
        # affine component
        for n in range(0, output_dim):
            idct_mat[n].append(0)
        common_lib.write_kaldi_matrix(transform_file, idct_mat)

        configs = []

        # the component reads the matrix that was just written
        line = 'component name={0} type=FixedAffineComponent matrix={1}'.format(
            self.name, transform_file)
        configs.append(line)
        line = 'component-node name={0} component={0} input={1}'.format(
            self.name, descriptor_final_string)
        configs.append(line)
        return configs


class XconfigExistingLayer(XconfigLayerBase):
    """
    This class is used to internally convert component-nodes in an existing
    model into lines like
    'existing name=tdnn1.affine dim=40'.

    Layers of this type are not presented in any actual xconfig or config
    files, but are created internally for all component nodes
    in an existing neural net model to use as input to other layers in xconfig.
    (i.e. get_model_component_info function, which is called in
     steps/nnet3/xconfig_to_configs.py, parses the name and
     dimension of component-nodes used in the existing model
     using the nnet3-info and returns a list of 'existing' layers.)

    This class is useful in cases like transferring existing model
    and using {input, output, component}-nodes in this model as
    input to new layers.
    """

    def __init__(self, first_token, key_to_value, prev_names=None):

        assert first_token == 'existing'
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)


    def set_default_configs(self):
        """Install the defaults; 'dim' is the only option of this layer."""
        self.config = { 'dim': -1}

    def check_configs(self):
        """Validate the user-supplied options.

        Raises:
            RuntimeError: if 'dim' is not positive.
        """
        if self.config['dim'] <= 0:
            # The two literals were previously joined without a space.
            raise RuntimeError("Dimension of existing-layer '{0}' "
                               "should be positive.".format(self.name))

    def get_input_descriptor_names(self):
        """Return an empty list: this layer has no input descriptors."""
        return []  # there is no 'input' field in self.config.

    def output_name(self, auxiliary_outputs=None):
        """Return the layer name, which is also the name of its output."""
        # there are no auxiliary outputs as this layer will just pass the input
        assert auxiliary_outputs is None
        return self.name

    def output_dim(self, auxiliary_outputs=None):
        """Return the configured 'dim'."""
        # there are no auxiliary outputs as this layer will just pass the input
        assert auxiliary_outputs is None
        return self.config['dim']

    def get_full_config(self):
        """Return an empty list; nothing is written for existing layers."""
        # unlike other layers the existing layers should not to be printed in
        # any '*.config'
        ans = []
        return ans


class XconfigSpecAugmentLayer(XconfigLayerBase):
    """This class is for parsing lines like
     'spec-augment-layer name=spec-augment freq-max-proportion=0.5 time-zeroed-proportion=0.2 time-mask-max-frames=10'

    which will produce a component of type GeneralDropoutComponent (to do the
    frequency-domain part) and then one of type SpecAugmentTimeMaskComponent (to
    do the time part).

    Parameters of the class, and their defaults:
      input='[-1]'             [Descriptor giving the input of the layer.]
      freq-max-proportion=0.5  [The maximum proportion of the frequency space that
                                might be zeroed out]
      time-zeroed-proportion=0.2  [The proportion of time frames that will be zeroed
                                  out]
      time-mask-max-frames=20   [The maximum length of a zeroed region in the time
                                axis, in frames.]
      include-in-init=false     [You should set this to true if this precedes a
                                `fixed-affine-layer` that is to be initialized
                                 via LDA]
    """
    def __init__(self, first_token, key_to_value, prev_names=None):
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        """Install the default value of every option this layer accepts."""
        self.config = {'input': '[-1]',
                       'freq-max-proportion': 0.5,
                       'time-zeroed-proportion': 0.2,
                       'time-mask-max-frames': 20,
                       'include-in-init': False}


    def check_configs(self):
        """Validate the user-supplied options.

        Explicit exceptions are used instead of 'assert' so that the checks
        still run under 'python -O' and report which option is wrong.

        Raises:
            RuntimeError: if freq-max-proportion or time-zeroed-proportion is
                not strictly between 0 and 1, or if time-mask-max-frames is
                smaller than 1.
        """
        for opt_name in ['freq-max-proportion', 'time-zeroed-proportion']:
            value = self.config[opt_name]
            if not (value > 0.0 and value < 1.0):
                raise RuntimeError("{0} has invalid value {1}"
                                   .format(opt_name, value))
        if not self.config['time-mask-max-frames'] >= 1:
            raise RuntimeError("time-mask-max-frames has invalid value {0}"
                               .format(self.config['time-mask-max-frames']))


    def output_name(self, auxiliary_output=None):
        """Return the name of the time-mask node, the last one in the layer."""
        assert auxiliary_output is None
        return '{0}.time-mask'.format(self.name)

    def output_dim(self, auxiliary_output=None):
        """Return the input dimension; this layer does not change it."""
        assert auxiliary_output is None
        input_dim = self.descriptors['input']['dim']
        return input_dim

    def get_full_config(self):
        """Return a list of (config-file-name, config-line) pairs.

        Every line goes to 'ref' and 'final', and additionally to 'init' when
        include-in-init is set.
        """
        ans = []
        config_lines = self._generate_config()

        for line in config_lines:
            for config_name in ['ref', 'final']:
                # we do not support user specified matrices in this layer
                # so 'ref' and 'final' configs are the same.
                ans.append((config_name, line))
            if self.config['include-in-init']:
                ans.append(('init', line))
        return ans

    def _generate_config(self):
        """Build the frequency-mask and time-mask component lines.

        The time-mask node takes the frequency-mask node as its input.
        """
        # by 'descriptor_final_string' we mean a string that can appear in
        # config-files, i.e. it contains the 'final' names of nodes.
        input_desc = self.descriptors['input']['final-string']
        input_dim = self.descriptors['input']['dim']
        freq_max_proportion = self.config['freq-max-proportion']
        time_zeroed_proportion = self.config['time-zeroed-proportion']
        time_mask_max_frames = self.config['time-mask-max-frames']

        configs = []
        line = ('component name={0}.freq-mask type=GeneralDropoutComponent dim={1} specaugment-max-proportion={2}'.format(
            self.name, input_dim, freq_max_proportion))
        configs.append(line)
        line = ('component-node name={0}.freq-mask component={0}.freq-mask input={1}'.format(
            self.name, input_desc))
        configs.append(line)
        line = ('component name={0}.time-mask type=SpecAugmentTimeMaskComponent dim={1} '
                'zeroed-proportion={2} time-mask-max-frames={3}'.format(
                    self.name, input_dim, time_zeroed_proportion, time_mask_max_frames))
        configs.append(line)
        line = ('component-node name={0}.time-mask component={0}.time-mask input={0}.freq-mask'.format(
            self.name))
        configs.append(line)
        return configs


def test_layers():
    """Check that certain config lines print back exactly as they were read."""
    round_trip_lines = ['input name=input dim=30']
    for config_line in round_trip_lines:
        parsed = config_line_to_object(config_line, [])
        assert str(parsed) == config_line


================================================
FILE: egs/steps/libs/nnet3/xconfig/composite_layers.py
================================================
# Copyright 2018    Johns Hopkins University (Dan Povey)
# Apache 2.0.

""" This module contains some composite layers, which is basically a catch-all
    term for things like TDNN-F that contain several affine or linear components.
"""
from __future__ import print_function
import math
import re
import sys
from libs.nnet3.xconfig.basic_layers import XconfigLayerBase

# This class is intended to implement an extension of the factorized TDNN
# (TDNN-F) that supports resnet-type 'bypass' connections.  It is for lines like
# the following:
#
# tdnnf-layer name=tdnnf2 dim=1024 bottleneck-dim=128 dropout-proportion=0.0 time-stride=3
#
# The line above would be roughly equivalent to the following four lines (except
# for different naming, and the use of TdnnComponent, for efficiency, in place
# of AffineComponent).  Assume that the previous layer (the default input) was tdnnf1:
#
#  linear-component name=tdnnf2.linear dim=128 orthonormal-constraint=-1.0 input=Append(Offset(-3, tdnnf1), tdnnf1)
#  relu-batchnorm-dropout-layer name=tdnnf2.affine dim=1024 dropout-proportion=0.0 \
#    dropout-per-dim-continuous=true input=Append(0,3)
#  no-op-component name=tdnnf2 input=Sum(Scale(0.66,tdnnf1), tdnnf2.affine)

#  Documentation of some of the important options:
#
#   - dropout-proportion
# This gets passed through to the dropout component.  If you don't set
# 'dropout-proportion', no dropout component will be included; it would be like
# using a relu-batchnorm-layer in place of a relu-batchnorm-dropout-layer.  You
# should only set 'dropout-proportion' if you intend to use dropout (it would
# usually be combined with the --dropout-schedule option to train.py).  If you
# use the --dropout-schedule option, the value doesn't really matter since it
# will be changed during training, and 0 is recommended.
#
#  - time-stride
# Controls the time offsets in the splicing, e.g. if you set time-stride to
# 1 instead of the 3 in the example, the time-offsets would be -1 and 1 instead
# of -3 and 3.
# If you set time-stride=0, as a special case no splicing over time will be
# performed (so no Append() expressions) and the second linear component (named
# tdnnf2l in the example) would be omitted, since it would add no modeling
# power.
# You can set time-stride to a negative number which will negate all the
# time indexes; it might potentially be useful to alternate negative and positive
# time-stride if you wanted to force the overall network to have symmetric
# context, since with positive time stride, this layer has more negative
# than positive time context (i.e. more left than right).
#
#  - bypass-scale

# A scale on the previous layer's output, used in bypass (resnet-type)
# connections.  Should not exceed 1.0.  The default is 0.66.  If you set it to
# zero, the layer won't use a bypass connection at all, so it would be like
# conventional TDNN-F (but we don't recommend this).  Note: the layer outputs
# are added together after the batchnorm so the model cannot control their
# relative magnitudes and this does actually affect what it can model.  When we
# experimented with having this scale trainable it did not seem to give an
# advantage.
#
#  - l2-regularize
# This is passed through to the linear and affine components.  You'll normally
# want this to be set to a nonzero value, e.g. 0.004.

class XconfigTdnnfLayer(XconfigLayerBase):
    """Xconfig implementation of 'tdnnf-layer' (factorized TDNN with bypass).

    The generated network is, in order: a bias-free linear TdnnComponent down
    to 'bottleneck-dim', an affine TdnnComponent up to 'dim', a ReLU, a
    BatchNorm, an optional GeneralDropoutComponent, and an optional
    NoOpComponent that holds the sum of the scaled layer input and the layer
    output (the bypass connection).
    """

    def __init__(self, first_token, key_to_value, prev_names=None):
        # This class only handles config lines that start with 'tdnnf-layer'.
        assert first_token == "tdnnf-layer"
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        """Install the default value of every option this layer accepts."""
        self.config = {
            'input': '[-1]',
            # 'dim' and 'bottleneck-dim' default to -1, which check_configs()
            # rejects, so in practice the user has to set them.
            'dim': -1,
            'bottleneck-dim': -1,
            'bypass-scale': 0.66,
            # -1.0 means "do not include a dropout component".
            'dropout-proportion': -1.0,
            'time-stride': 1,
            'l2-regularize': 0.0,
            'max-change': 0.75,
            'self-repair-scale': 1.0e-05,
            'context': 'default',
        }

    def set_derived_configs(self):
        """This layer has no derived configs."""
        pass

    def check_configs(self):
        """Validate the option values.

        Raises:
            RuntimeError: on the first option (or combination of options)
                found to be invalid.
        """
        cfg = self.config

        if cfg['bottleneck-dim'] <= 0:
            raise RuntimeError("bottleneck-dim must be set and >0.")
        if cfg['dim'] <= cfg['bottleneck-dim']:
            raise RuntimeError("dim must be greater than bottleneck-dim")

        # -1.0 is the "no dropout" marker; anything else must lie in [0, 1).
        dropout = cfg['dropout-proportion']
        if dropout != -1.0 and not (0.0 <= dropout < 1.0):
            raise RuntimeError("invalid value for dropout-proportion")

        if abs(cfg['bypass-scale']) > 1.0:
            raise RuntimeError("bypass-scale has invalid value")

        # The bypass adds the layer input to the layer output, so the two
        # must have the same dimension whenever the bypass is enabled.
        input_dim = self.descriptors['input']['dim']
        output_dim = cfg['dim']
        if output_dim != input_dim and cfg['bypass-scale'] != 0.0:
            raise RuntimeError('bypass-scale is nonzero but output-dim != input-dim: {0} != {1}'
                               ''.format(output_dim, input_dim))

        if cfg['context'] not in ('default', 'left-only', 'shift-left', 'none'):
            raise RuntimeError('context must be default, left-only shift-left or none, got {}'.format(
                cfg['context']))

    def output_name(self, auxiliary_output=None):
        """Return the name of the last node generated by this layer.

        That is '<name>.noop' when the bypass is enabled, otherwise
        '<name>.dropout' when dropout is enabled, otherwise
        '<name>.batchnorm'.
        """
        assert auxiliary_output is None
        if self.config['bypass-scale'] != 0.0:
            # The no-op component caches the bypass sum so that it does not
            # have to be recomputed.
            suffix = 'noop'
        elif self.config['dropout-proportion'] != -1.0:
            suffix = 'dropout'
        else:
            suffix = 'batchnorm'
        return '{0}.{1}'.format(self.name, suffix)

    def output_dim(self, auxiliary_output=None):
        """Return the output dimension, i.e. the 'dim' option."""
        return self.config['dim']

    def get_full_config(self):
        """Return (config-name, line) pairs; 'ref' and 'final' get every line."""
        return [(config_name, line)
                for line in self._generate_config()
                for config_name in ['ref', 'final']]

    def _generate_config(self):
        """Return the list of component / component-node config lines."""
        cfg = self.config
        layer = self.name
        in_dim = self.descriptors['input']['dim']
        in_descriptor = self.descriptors['input']['final-string']
        out_dim = cfg['dim']
        bottleneck = cfg['bottleneck-dim']
        bypass = cfg['bypass-scale']
        dropout = cfg['dropout-proportion']
        stride = cfg['time-stride']
        context = cfg['context']

        # Work out the time-offsets of the two TdnnComponents.
        if stride == 0 or context == 'none':
            # No splicing over time at all.
            linear_offsets = '0'
            affine_offsets = '0'
        else:
            linear_offsets = '{0},0'.format(-stride)
            if context == 'default':
                affine_offsets = '0,{0}'.format(stride)
            elif context == 'shift-left':
                affine_offsets = '{0},0'.format(-stride)
            else:
                assert context == 'left-only'
                affine_offsets = '0'

        l2 = cfg['l2-regularize']
        max_change = cfg['max-change']
        self_repair = cfg['self-repair-scale']

        lines = [
            # The linear component, from input-dim to bottleneck-dim.
            'component name={0}.linear type=TdnnComponent input-dim={1} '
            'output-dim={2} l2-regularize={3} max-change={4} use-bias=false '
            'time-offsets={5} orthonormal-constraint=-1.0'.format(
                layer, in_dim, bottleneck, l2, max_change, linear_offsets),
            'component-node name={0}.linear component={0}.linear '
            'input={1}'.format(layer, in_descriptor),

            # The affine component, from bottleneck-dim to output-dim.
            'component name={0}.affine type=TdnnComponent '
            'input-dim={1} output-dim={2} l2-regularize={3} max-change={4} '
            'time-offsets={5}'.format(
                layer, bottleneck, out_dim, l2, max_change, affine_offsets),
            'component-node name={0}.affine component={0}.affine '
            'input={0}.linear'.format(layer),

            # The ReLU.
            'component name={0}.relu type=RectifiedLinearComponent dim={1} '
            'self-repair-scale={2}'.format(layer, out_dim, self_repair),
            'component-node name={0}.relu component={0}.relu '
            'input={0}.affine'.format(layer),

            # The BatchNorm.
            'component name={0}.batchnorm type=BatchNormComponent '
            'dim={1}'.format(layer, out_dim),
            'component-node name={0}.batchnorm component={0}.batchnorm '
            'input={0}.relu'.format(layer),
        ]

        # Name suffix of the node that currently ends the chain.
        last = 'batchnorm'

        if dropout != -1:
            # This is not normal dropout.  The mask is shared across time and
            # (because of continuous=true) it is a continuously varying scale
            # with expected value 1 rather than a zero-or-one scale; the width
            # of the interval it is drawn from varies with dropout-proportion.
            lines.append('component name={0}.dropout type=GeneralDropoutComponent '
                         'dim={1} dropout-proportion={2} continuous=true'.format(
                             layer, out_dim, dropout))
            lines.append('component-node name={0}.dropout component={0}.dropout '
                         'input={0}.batchnorm'.format(layer))
            last = 'dropout'

        if bypass != 0.0:
            # Cache the weighted sum of the layer input and the layer output in
            # a NoOpComponent.  Exposing the Sum(...) descriptor directly as the
            # output would also work, but with many such layers in sequence
            # the sums would accumulate more and more terms with depth.
            lines.append('component name={0}.noop type=NoOpComponent '
                         'dim={1}'.format(layer, out_dim))
            lines.append('component-node name={0}.noop component={0}.noop '
                         'input=Sum(Scale({1}, {2}), {0}.{3})'.format(
                             layer, bypass, in_descriptor, last))

        return lines

# This is for lines like the following:
#  prefinal-layer name=prefinal-chain input=prefinal-l l2-regularize=0.02 big-dim=1024 small-dim=256
#
# which is equivalent to the following sequence of components (except for
# name differences):
#  relu-batchnorm-layer name=prefinal-chain input=prefinal-l l2-regularize=0.02 dim=1024
#  linear-component name=prefinal-chain-l dim=256 l2-regularize=0.02 orthonormal-constraint=-1.0
#  batchnorm-component name=prefinal-chain-batchnorm
#
# This layer is really just for convenience in writing config files: it doesn't
# do anything that's particularly hard or unusual, but it encapsulates a commonly
# repeated pattern.
class XconfigPrefinalLayer(XconfigLayerBase):
    """Xconfig implementation of 'prefinal-layer'.

    The generated network is, in order: an affine component from the input
    dimension to 'big-dim', a ReLU, a BatchNorm, a linear component from
    'big-dim' down to 'small-dim', and a second BatchNorm, whose node is the
    output of the layer.
    """

    def __init__(self, first_token, key_to_value, prev_names=None):
        # This class only handles config lines that start with 'prefinal-layer'.
        assert first_token == "prefinal-layer"
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        """Install the default value of every option this layer accepts."""
        self.config = {
            'input': '[-1]',
            # Both dims default to -1, which check_configs() rejects, so in
            # practice the user has to set them.
            'big-dim': -1,
            'small-dim': -1,
            'l2-regularize': 0.0,
            'max-change': 0.75,
            'self-repair-scale': 1.0e-05,
        }

    def set_derived_configs(self):
        """This layer has no derived configs."""
        pass

    def check_configs(self):
        """Validate the dims; raise RuntimeError if they are inconsistent."""
        small_dim = self.config['small-dim']
        if small_dim <= 0:
            raise RuntimeError("small-dim must be set and >0.")
        if self.config['big-dim'] <= small_dim:
            raise RuntimeError("big-dim must be greater than small-dim")

    def output_name(self, auxiliary_output=None):
        """Return the name of the output node, '<name>.batchnorm2'."""
        assert auxiliary_output is None
        return '{0}.batchnorm2'.format(self.name)

    def output_dim(self, auxiliary_output=None):
        """Return the output dimension, i.e. the 'small-dim' option."""
        return self.config['small-dim']

    def get_full_config(self):
        """Return (config-name, line) pairs; 'ref' and 'final' get every line."""
        return [(config_name, line)
                for line in self._generate_config()
                for config_name in ['ref', 'final']]

    def _generate_config(self):
        """Return the list of component / component-node config lines."""
        layer = self.name
        in_dim = self.descriptors['input']['dim']
        in_descriptor = self.descriptors['input']['final-string']
        small = self.config['small-dim']
        big = self.config['big-dim']
        l2 = self.config['l2-regularize']
        max_change = self.config['max-change']
        self_repair = self.config['self-repair-scale']

        return [
            # The affine component, from input-dim to big-dim.
            'component name={0}.affine type=NaturalGradientAffineComponent '
            'input-dim={1} output-dim={2} l2-regularize={3} max-change={4}'.format(
                layer, in_dim, big, l2, max_change),
            'component-node name={0}.affine component={0}.affine '
            'input={1}'.format(layer, in_descriptor),

            # The ReLU.
            'component name={0}.relu type=RectifiedLinearComponent dim={1} '
            'self-repair-scale={2}'.format(layer, big, self_repair),
            'component-node name={0}.relu component={0}.relu '
            'input={0}.affine'.format(layer),

            # The first BatchNorm, at big-dim.
            'component name={0}.batchnorm1 type=BatchNormComponent '
            'dim={1}'.format(layer, big),
            'component-node name={0}.batchnorm1 component={0}.batchnorm1 '
            'input={0}.relu'.format(layer),

            # The linear component, from big-dim to small-dim, with
            # orthonormal-constraint=-1 ("floating" orthonormal constraint).
            'component name={0}.linear type=LinearComponent '
            'input-dim={1} output-dim={2} l2-regularize={3} max-change={4} '
            'orthonormal-constraint=-1 '.format(
                layer, big, small, l2, max_change),
            'component-node name={0}.linear component={0}.linear '
            'input={0}.batchnorm1'.format(layer),

            # The second BatchNorm, at small-dim; this is the layer output.
            'component name={0}.batchnorm2 type=BatchNormComponent '
            'dim={1}'.format(layer, small),
            'component-node name={0}.batchnorm2 component={0}.batchnorm2 '
            'input={0}.linear'.format(layer),
        ]


================================================
FILE: egs/steps/libs/nnet3/xconfig/convolution.py
================================================
# Copyright 2018    Johns Hopkins University (Author: Dan Povey)
#           2016    Vijayaditya Peddinti
# Apache 2.0.


""" This module has the implementation of convolutional layers.
"""
from __future__ import print_function
from __future__ import division
import math
import re
import sys
from libs.nnet3.xconfig.basic_layers import XconfigLayerBase


# This class is for lines like the following:
#

#  conv-batchnorm-layer name=conv2 height-in=40 height-out=40 \
#      num-filters-out=64 height-offsets=-1,0,1 time-offsets=-1,0,1 \
#      required-time-offsets=0
#  or (with NormalizeLayer instead of batch-norm, and with subsampling on the height axis):
#  conv-renorm-layer name=conv3 height-in=40 height-out=20 \
#      height-subsample-out=2 num-filters-out=128 height-offsets=-1,0,1 \
#       time-offsets=-1,0,1 required-time-offsets=0
#
# You don't specify subsampling on the time axis explicitly, it's implicit
# in the 'time-offsets' which are the same as the splicing indexes in a TDNN,
# and which, unlike the height offsets, operate relative to a fixed clock,
# so that after subsampling by a factor of 2, we'd expect all time-offsets
# of subsequent layers to be a multiple of 2.  You don't specify the input
# num-filters either; it's worked out from the input height and the input dim.
#
# The layer-name encodes the use (or not) of batch normalization, so that if you
# want to skip batch normalization you could just call it 'conv-layer'.
#
# If batch-normalization is used, it's *spatial* batch-normalization, meaning
# that the offset and scale is specific to the output filter, but shared across
# all time and height offsets.
#
# Most of the configuration values mirror same-named values in class
# TimeHeightConvolutionComponent, and for a deeper understanding of what's going
# on you should look at the comment by its declaration, in
# src/nnet3/nnet-convolutional-component.h.
#
# Parameters of the class, and their defaults if they have defaults:
#
#   input='[-1]'             Descriptor giving the input of the layer.
#   height-in                The height of the input image, e.g. 40 if the input
#                            is MFCCs.  The num-filters-in is worked out as
#                            (dimension of input) / height-in.  If the preceding
#                            layer is a convolutional layer, height-in should be
#                            the same as the height-out of the preceding layer.
#   height-subsample-out=1   The height subsampling factor, will be e.g. 2 if you
#                            want to subsample by a factor of 2 on the height
#                            axis.
#   height-out               The height of the output image.  This will normally
#                            be <= (height-in / height-subsample-out).
#                            Zero-padding on the height axis may be implied by a
#                            combination of this and height-offsets, e.g. if
#                            height-out==height-in and height-subsample-out=1
#                            and height-offsets=-2,-1,0,1 then we'd be padding
#                            by 2 pixels on the bottom and 1 on the top; see
#                            comments in nnet-convolutional-component.h for more
#                            details.
#   height-offsets           The offsets on the height axis that define which
#                            inputs are required for each output pixel; will
#                            often be something like -1,0,1 (if zero-padding
#                            on height axis) or 0,1,2 otherwise.  These are
#                            comparable to TDNN splicing offsets; e.g. if
#                            height-offsets=-1,0,1 then height 10 at the output
#                            would take input from heights 9,10,11 at the input.
#   num-filters-out          The number of output filters.  The output dimension
#                            of this layer is num-filters-out * height-out; the
#                            filter dim varies the fastest (filter-stride == 1).
#   time-offsets             The input offsets on the time axis; these are
#                            interpreted just like the splicing indexes in TDNNs.
#                            E.g. if time-offsets=-2,0,2 then time 100 at the
#                            output would require times 98,100,102 at the input.
#   required-time-offsets    The subset of 'time-offsets' that are required in
#                            order to produce an output; if the set has fewer
#                            elements than 'time-offsets' then it implies some
#                            kind of zero-padding on the time axis is allowed.
#                            Defaults to the same as 'time-offsets'.  For speech
#                            tasks we recommend not to set this, as the normal
#                            padding approach is to pad with copies of the
#                            first/last frame, which is handled automatically in
#                            the calling code.
#   target-rms=1.0           Only applicable if the layer type is
#                            conv-batchnorm-layer or
#                            conv-renorm-layer.  This will affect the
#                            scaling of the output features (larger -> larger),
#                            and sometimes we set target-rms=0.5 for the layer
#                            prior to the final layer to make the final layer
#                            train more slowly.
#   self-repair-scale=2.0e-05  This affects the ReLu's.  It is a scale on the
#                            'self-repair' mechanism that nudges the inputs to the
#                            ReLUs into the appropriate range in cases where
#                            the unit is active either too little of the time
#                            (<10%) or too much of the time (>90%).
#
# The following initialization and natural-gradient related options are, if
# provided, passed through to the config file; if not, they are left at the
# defaults in the code.  See nnet-convolutional-component.h for more information.
#
#  param-stddev, bias-stddev, max-change, learning-rate-factor (float)
#  use-natural-gradient (bool)
#  rank-in, rank-out    (int)
#  num-minibatches-history (float)
#  alpha-in, alpha-out (float)
# the following is also passed into the convolution components, if specified:
#  l2-regularize (float)

class XconfigConvLayer(XconfigLayerBase):
    """Xconfig layer for convolutional layers, e.g. 'conv-relu-batchnorm-layer'.

    The layer type is a '-'-separated list of operations followed by 'layer'.
    Each of the operations 'conv', 'relu', 'batchnorm', 'renorm', 'dropout'
    and 'so' is turned into one component, and the components are chained in
    the order in which they are listed.  A trailing 'noconv' marks a layer
    that contains no convolution; it is not itself turned into a component.
    """

    def __init__(self, first_token, key_to_value, prev_names = None):
        # Every token before the final one (normally 'layer') must be a
        # known operation name.
        for operation in first_token.split('-')[:-1]:
            assert operation in ['conv', 'renorm', 'batchnorm', 'relu',
                                 'noconv', 'dropout', 'so']
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        """Install the default value of every option this layer accepts."""
        self.config = {'input':'[-1]',
                       'height-in':-1,
                       'height-subsample-out':1,
                       'height-out':-1,
                       'height-offsets':'',
                       'num-filters-out':-1,
                       'time-offsets':'',
                       'required-time-offsets':'',
                       'target-rms':1.0,
                       'self-repair-scale': 2.0e-05,
                       'self-repair-lower-threshold': 0.05,
                       # the following are not really inspected by this level of
                       # code, just passed through (but not if left at '').
                       'param-stddev':'', 'bias-stddev':'',
                       'max-change': 0.75, 'learning-rate-factor':'',
                       'use-natural-gradient':'',
                       'rank-in':'', 'rank-out':'', 'num-minibatches-history':'',
                       'alpha-in':'', 'alpha-out':'', 'l2-regularize':'',
                       'dropout-proportion': 0.5}

    def set_derived_configs(self):
        """Set 'num-filters-in' to (input dimension) / height-in.

        Raises:
            RuntimeError: if height-in is not positive or does not divide
                the input dimension.
        """
        input_dim = self.descriptors['input']['dim']
        height_in = self.config['height-in']
        if height_in <= 0:
            raise RuntimeError("height-in must be specified")
        if input_dim % height_in != 0:
            raise RuntimeError("Input dimension {0} is not a multiple of height-in={1}".format(
                input_dim, height_in))
        self.config['num-filters-in'] = input_dim // height_in


    def check_offsets_var(self, str):
        """Return True if `str` is a strictly increasing, comma-separated list
        of integers such as '-1,0,1'; return False otherwise.

        NOTE: the parameter name shadows the builtin `str`; it is kept as-is
        so that the method's signature does not change.
        """
        try:
            a = [ int(x) for x in str.split(",") ]
        except (ValueError, AttributeError):
            # ValueError: some element (possibly the empty string) is not an
            # integer.  AttributeError: the value is not a string at all.
            return False
        if len(a) == 0:
            return False
        # "Sorted and unique" means each element is strictly smaller than
        # the one that follows it.
        for i in range(len(a) - 1):
            if a[i] >= a[i+1]:
                return False
        return True

    def check_configs(self):
        """Do some basic checking of the configs.

        The component-level code does some more thorough checking, but if you
        set the height-out too small it prints it as a warning, which the user
        may not see, so at a minimum we want to check for that here.

        Raises:
            RuntimeError: on the first invalid option or combination found.
        """
        height_subsample_out = self.config['height-subsample-out']
        height_in = self.config['height-in']
        height_out = self.config['height-out']
        if height_subsample_out <= 0:
            raise RuntimeError("height-subsample-out has invalid value {0}.".format(
                height_subsample_out))
        # we already checked height-in in set_derived_configs.
        if height_out <= 0:
            raise RuntimeError("height-out has invalid value {0}.".format(
                height_out))
        if height_out * height_subsample_out > height_in:
            raise RuntimeError("The combination height-in={0}, height-out={1} and "
                               "height-subsample-out={2} does not look right "
                               "(height-out too large).".format(
                                   height_in, height_out, height_subsample_out))
        height_offsets = self.config['height-offsets']
        time_offsets = self.config['time-offsets']
        required_time_offsets = self.config['required-time-offsets']

        if not 'noconv' in self.layer_type.split('-'):
            # only check height-offsets, time-offsets and required-time-offsets if there
            # is actually a convolution in this layer.
            if not self.check_offsets_var(height_offsets):
                raise RuntimeError("height-offsets={0} is not valid".format(height_offsets))
            if not self.check_offsets_var(time_offsets):
                raise RuntimeError("time-offsets={0} is not valid".format(time_offsets))
            if required_time_offsets != "" and not self.check_offsets_var(required_time_offsets):
                raise RuntimeError("required-time-offsets={0} is not valid".format(
                    required_time_offsets))

        if height_out * height_subsample_out < \
           height_in - len(height_offsets.split(',')):
            # The values must actually be substituted into the message,
            # otherwise the user just sees the raw '{0}' placeholders.
            raise RuntimeError("The combination height-in={0}, height-out={1} and "
                               "height-subsample-out={2} and height-offsets={3} "
                               "does not look right (height-out too small).".format(
                                   height_in, height_out, height_subsample_out,
                                   height_offsets))

        if self.config['target-rms'] <= 0.0:
            # The config key is 'target-rms' (with a hyphen); looking up
            # 'target_rms' would raise KeyError instead of this error.
            raise RuntimeError("Config value target-rms={0} is not valid".format(
                self.config['target-rms']))

    def auxiliary_outputs(self):
        """This layer has no auxiliary outputs."""
        return []

    def output_name(self, auxiliary_output = None):
        """Return the name of the last node, e.g. '<name>.batchnorm'.

        The suffix is the last operation in the layer type, not counting a
        trailing 'noconv'.
        """
        assert auxiliary_output is None
        # note: the [:-1] is to remove the '-layer'.
        operations = self.layer_type.split('-')[:-1]
        if operations[-1] == 'noconv':
            operations = operations[:-1]
        assert len(operations) >= 1
        last_operation = operations[-1]
        assert last_operation in ['relu', 'conv', 'renorm', 'batchnorm', 'dropout', 'so']
        # we'll return something like 'layer1.batchnorm'.
        return '{0}.{1}'.format(self.name, last_operation)

    def output_dim(self, auxiliary_output = None):
        """Return the output dimension: num-filters-out * height-out."""
        assert auxiliary_output is None
        return self.config['num-filters-out'] * self.config['height-out']

    def get_full_config(self):
        """Return (config-name, line) pairs; 'ref' and 'final' get every line."""
        ans = []
        config_lines = self._generate_cnn_config()

        for line in config_lines:
            for config_name in ['ref', 'final']:
                # we do not support user specified matrices in CNN initialization
                # so 'ref' and 'final' configs are the same.
                ans.append((config_name, line))
        return ans

    # convenience function to generate the CNN config
    def _generate_cnn_config(self):
        """Return the component / component-node config lines for this layer.

        One component and one component-node are generated per operation in
        the layer type; each node takes the previous operation's node as its
        input (the first one takes the layer's input descriptor).

        Raises:
            RuntimeError: if the layer type contains an operation that is
                not handled here.
        """
        configs = []

        name = self.name

        # These 3 variables will be updated as we add components.
        cur_num_filters = self.config['num-filters-in']
        cur_height = self.config['height-in']
        cur_descriptor = self.descriptors['input']['final-string']

        # note: the [:-1] is to remove the '-layer'.
        operations = self.layer_type.split('-')[:-1]
        if operations[-1] == 'noconv':
            operations = operations[:-1]
        # e.g.:
        # operations = [ 'conv', 'relu', 'batchnorm' ]
        # or:
        # operations = [ 'relu', 'conv', 'renorm' ]

        for operation in operations:
            if operation == 'conv':
                # Pass through every option that has a value; options left at
                # '' are omitted so that the component's own defaults apply.
                a = []
                for opt_name in [
                        'param-stddev', 'bias-stddev', 'use-natural-gradient',
                        'max-change', 'rank-in', 'rank-out', 'num-minibatches-history',
                        'alpha-in', 'alpha-out', 'num-filters-in', 'num-filters-out',
                        'height-in','height-out', 'height-subsample-out',
                        'height-offsets', 'time-offsets', 'required-time-offsets',
                        'learning-rate-factor', 'l2-regularize' ]:
                    value = self.config[opt_name]
                    if value != '':
                        a.append('{0}={1}'.format(opt_name, value))
                conv_opts = ' '.join(a)

                configs.append('component name={0}.conv type=TimeHeightConvolutionComponent '
                               '{1}'.format(name, conv_opts))
                configs.append('component-node name={0}.conv component={0}.conv '
                               'input={1}'.format(name, cur_descriptor))
                # The convolution is the only operation that changes the
                # number of filters and the height.
                cur_num_filters = self.config['num-filters-out']
                cur_height = self.config['height-out']
            elif operation == 'batchnorm':
                configs.append('component name={0}.batchnorm  type=BatchNormComponent dim={1} '
                               'block-dim={2} target-rms={3}'.format(
                                   name, cur_num_filters * cur_height, cur_num_filters,
                                   self.config['target-rms']))
                configs.append('component-node name={0}.batchnorm component={0}.batchnorm '
                               'input={1}'.format(name, cur_descriptor))
            elif operation == 'renorm':
                configs.append('component name={0}.renorm type=NormalizeComponent '
                           'dim={1} target-rms={2}'.format(
                               name, cur_num_filters * cur_height,
                               self.config['target-rms']))
                configs.append('component-node name={0}.renorm component={0}.renorm '
                               'input={1}'.format(name, cur_descriptor))
            elif operation == 'relu':
                configs.append('component name={0}.relu type=RectifiedLinearComponent '
                               'dim={1} block-dim={2} self-repair-scale={3} '
                               'self-repair-lower-threshold={4}'.format(
                                   name, cur_num_filters * cur_height, cur_num_filters,
                                   self.config['self-repair-scale'],
                                   self.config['self-repair-lower-threshold']))
                configs.append('component-node name={0}.relu component={0}.relu '
                               'input={1}'.format(name, cur_descriptor))
            elif operation == 'dropout':
                configs.append('component name={0}.dropout type=DropoutComponent '
                           'dim={1} dropout-proportion={2}'.format(
                               name, cur_num_filters * cur_height,
                               self.config['dropout-proportion']))
                configs.append('component-node name={0}.dropout component={0}.dropout '
                               'input={1}'.format(name, cur_descriptor))
            elif operation == 'so':
                configs.append('component name={0}.so type=ScaleAndOffsetComponent '
                           'dim={1} block-dim={2}'.format(
                               name, cur_num_filters * cur_height, cur_num_filters))
                configs.append('component-node name={0}.so component={0}.so '
                               'input={1}'.format(name, cur_descriptor))
            else:
                raise RuntimeError("Un-handled operation type: " + operation)

            # The node just added becomes the input of the next operation.
            cur_descriptor = '{0}.{1}'.format(name, operation)

        return configs


# This class is for lines like the following:
#
# res-block name=res1 num-filters=64 height=32 time-period=1
#
# It implements a residual block as in ResNets, with pre-activation, and with
# some small differences-- basically, instead of adding the input to the output,
# we put a convolutional layer in there but initialize it to the unit matrix and
# if you want you can give it a relatively small (or even zero) learning rate
# and max-change.  And there is batch-norm in that path also.
#
# The number of filters is the same on the input and output; it is actually
# redundant to write it in the config file, because given that we know the
# height, we can work it out from the dimension of the input (as dimension =
# height * num-filters).  But we allow it to be specified anyway, for clarity.
#
# Note: the res-block does not support subsampling or changing the number of
# filters.  If you want to do that, we recommend that you should do it with a
# single relu-batchnorm-conv-layer.
#
# Here are the most important configuration values, with defaults shown if
# defaults exist:
#
# input='[-1]'    Descriptor giving the input of the layer.
# height          The input and output height of the image, e.g. 40.  Note: the width
#                 is associated with the time dimension and is dealt with
#                 implicitly, so it's not specified here.
# num-filters     The number of filters on the input and output, e.g. 64.
#                 It does not have to be specified; if it is not specified,
#                 we work it out from the input dimension.
# num-bottleneck-filters   If specified then this will be a 'bottleneck'
#                 ResBlock, in which there is a 1x1 convolution from
#                 num-filters->num-bottleneck-filters, a 3x3 convolution
#                 from num-bottleneck-filters->num-bottleneck-filters, and
#                 a 1x1 convolution from num-bottleneck-filters->num-filters.
#
# time-period=1   Think of this as the stride in the time dimension.  At the
#                 input of the network we will always have time-period=1; then
#                 after subsampling once in time we'd have time-period=2; then
#                 after subsampling again we'd have time-period=4.  Because of
#                 the way nnet3 works, subsampling on the time axis is an
#                 implicit, not explicit, operation.
# height-period=1  This will almost always be left at the default (1).  It is
#                 analogous to time-period, but because the height, unlike the
#                 time, is explicitly subsampled, in normal topologies this should
#                 be left at 1.
#
# bypass-source=noop
#                       The output of this component is Sum(convolution, x), and
#                       this option controls what 'x' is.  There are 4 options
#                       here: 'noop', 'input', 'relu' or 'batchnorm'.  'noop' is
#                       equivalent to 'input' in what it computes; it just
#                       inserts a 'noop' component in order to make the
#                       computation more efficient.  For both 'noop' and
#                       'input', x is the input to this component.  If
#                       bypass-source=relu then we use the relu of the
#                       input; if 'batchnorm', then we use the relu+batchnorm of
#                       the input.
# allow-zero-padding=true By default this will allow zero-padding in the time
#                       dimension, meaning that you don't need extra frames at
#                       the input to compute the output.  There may be ASR
#                       applications where you want to pad in the time dimension
#                       with repeats of the first or last frame (as we do for
#                       TDNNs), where it would be appropriate to write
#                       allow-zero-padding=false.  Note: the way we have
#                       set it up, it does zero-padding on the height axis
#                       regardless
#
# Less important config variables:
#  self-repair-scale=2.0e-05  This affects the ReLu's.  It is a scale on the
#                            'self-repair' mechanism that nudges the inputs to the
#                            ReLUs into the appropriate range in cases where
#                            the unit is active either too little of the time
#                            (<10%) or too much of the time (>90%).
#  max-change=0.75           Max-parameter-change constant (per minibatch)
#                            used for convolutional components.
#
#
# The following natural-gradient-related configuration variables are passed in
# to the convolution components, if specified:
#  use-natural-gradient (bool)
#  rank-in, rank-out    (int)
#  num-minibatches-history (float)
#  alpha-in, alpha-out (float)
# the following is also passed into the convolution components, if specified:
#  l2-regularize (float)
#

class XconfigResBlock(XconfigLayerBase):
    """Layer class for 'res-block' xconfig lines (pre-activation res-block).

    The main path is a sequence of relu -> batchnorm -> convolution stages:
    two stages normally, or three stages (1x1, 3x3, 1x1 kernels) when
    'num-bottleneck-filters' is positive.  The layer output is the sum of the
    last convolution and a bypass term selected by 'bypass-source'.  The
    output dimension equals the input dimension (height * num-filters).
    """

    # Options that are not inspected by this class; each one is copied into
    # every convolution component line unless it was left at ''.
    _CONV_PASSTHROUGH_OPTS = (
        'param-stddev', 'bias-stddev', 'use-natural-gradient',
        'max-change', 'rank-in', 'rank-out', 'num-minibatches-history',
        'alpha-in', 'alpha-out', 'l2-regularize')

    def __init__(self, first_token, key_to_value, prev_names = None):
        """Construct the layer; the arguments are forwarded unchanged to
        XconfigLayerBase.__init__."""
        assert first_token == 'res-block'
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        """Set self.config to the default value of every supported option."""
        self.config = {'input':'[-1]',
                       'height':-1,
                       'num-filters':-1,
                       'num-bottleneck-filters':-1,
                       'time-period':1,
                       'height-period':1,
                       'self-repair-scale': 2.0e-05,
                       'self-repair-lower-threshold1': 0.05,
                       'self-repair-lower-threshold2': 0.05,
                       'self-repair-lower-threshold3': 0.05,
                       'max-change': 0.75,
                       'allow-zero-padding': True,
                       'bypass-source' : 'noop',
                       # the following are not really inspected by this level of
                       # code, just passed through (but not if left at '').
                       'param-stddev':'', 'bias-stddev':'',
                       'use-natural-gradient':'',
                       'rank-in':'', 'rank-out':'',
                       'num-minibatches-history':'',
                       'alpha-in':'', 'alpha-out':'', 'l2-regularize':'' }

    def set_derived_configs(self):
        """Derive 'num-filters' from the input dim, or validate it if given.

        Raises:
            RuntimeError: if 'height' is not positive, if 'height' does not
                divide the input dimension, or if height * num-filters is
                not equal to the input dimension.
        """
        input_dim = self.descriptors['input']['dim']
        height = self.config['height']

        # 'height' defaults to -1; without this check an unspecified height
        # would silently yield a negative number of filters (and height=0
        # would die with ZeroDivisionError).
        if height <= 0:
            raise RuntimeError("You must specify a positive height for "
                               "res-block, got: {0}".format(height))

        cur_num_filters = self.config['num-filters']
        if cur_num_filters == -1:
            if input_dim % height != 0:
                raise RuntimeError("Specified image height {0} does not "
                                   "divide the input dim {1}".format(
                                       height, input_dim))
            # Floor division: under Python 3, '/' would give a float, which
            # would then be written into the config lines as e.g. 'dim=2048.0'.
            self.config['num-filters'] = input_dim // height
        elif input_dim != cur_num_filters * height:
            raise RuntimeError("Expected the input-dim to equal "
                               "height={0} * num-filters={1} = {2}, but "
                               "it is {3}".format(
                                   height, cur_num_filters,
                                   height * cur_num_filters,
                                   input_dim))

    def check_configs(self):
        """Validate 'bypass-source'.

        Raises:
            RuntimeError: if 'bypass-source' is not one of 'input', 'noop',
                'relu' or 'batchnorm'.
        """
        # we checked the dimensions in set_derived_configs.
        bypass_source = self.config['bypass-source']
        if bypass_source not in ['input', 'noop', 'relu', 'batchnorm']:
            raise RuntimeError("Expected bypass-source to be input, noop, "
                               "relu or batchnorm, got: {0}".format(
                                   bypass_source))

    def auxiliary_outputs(self):
        """This layer has no auxiliary outputs."""
        return []

    def output_name(self, auxiliary_output = None):
        """Return the descriptor string for the output of this layer.

        For bypass-source=noop this is the name of the no-op node (which
        already holds the sum); otherwise it is a Sum(...) descriptor of the
        last convolution and the chosen bypass term.
        """
        bypass_source = self.config['bypass-source']
        if bypass_source == 'noop':
            # we let the noop be the sum of the convolutional part and the
            # input, so just return the output of the no-op component.
            return '{0}.noop'.format(self.name)

        b = self.config['num-bottleneck-filters']
        conv = ('{0}.conv2' if b <= 0 else '{0}.conv3').format(self.name)
        if bypass_source == 'input':
            residual = self.descriptors['input']['final-string']
        elif bypass_source == 'relu':
            residual = '{0}.relu1'.format(self.name)
        else:
            assert bypass_source == 'batchnorm'
            residual = '{0}.batchnorm1'.format(self.name)

        return 'Sum({0}, {1})'.format(conv, residual)

    def output_dim(self, auxiliary_output = None):
        """Return the output dimension, which equals the input dimension."""
        assert auxiliary_output is None
        input_dim = self.descriptors['input']['dim']
        return input_dim

    def get_full_config(self):
        """Return a list of (config_name, line) pairs for this layer.

        Every generated line is emitted for both the 'ref' and the 'final'
        config.
        """
        ans = []
        b = self.config['num-bottleneck-filters']
        if b <= 0:
            config_lines = self._generate_normal_resblock_config()
        else:
            config_lines = self._generate_bottleneck_resblock_config()

        for line in config_lines:
            for config_name in ['ref', 'final']:
                # we do not support user specified matrices in CNN initialization
                # so 'ref' and 'final' configs are the same.
                ans.append((config_name, line))
        return ans

    # _append_resblock_stage is shared by the normal and the bottleneck
    # generators.  It appends to 'configs' the lines for
    #
    #   input_descriptor -> relu<n> -> batchnorm<n> -> conv<n>
    #
    # and returns the descriptor of conv<n>.  'filters_in' is the number of
    # filters seen by the relu, the batchnorm and the input of the
    # convolution; 'filters_out' is the number of filters the convolution
    # produces.  'height_offsets' and 'time_offsets' are the already-formatted
    # option values (e.g. '-1,0,1' or '0').
    def _append_resblock_stage(self, configs, n, input_descriptor,
                               filters_in, filters_out,
                               height_offsets, time_offsets):
        name = self.name
        height = self.config['height']
        dim_in = filters_in * height

        # the ReLU
        configs.append('component name={0}.relu{1} type=RectifiedLinearComponent '
                       'dim={2} block-dim={3} self-repair-scale={4} '
                       'self-repair-lower-threshold={5}'.format(
                           name, n, dim_in, filters_in,
                           self.config['self-repair-scale'],
                           self.config['self-repair-lower-threshold{0}'.format(n)]))
        configs.append('component-node name={0}.relu{1} component={0}.relu{1} '
                       'input={2}'.format(name, n, input_descriptor))

        # the batch-norm (the double space before 'type=' is kept so that the
        # generated text is unchanged).
        configs.append('component name={0}.batchnorm{1}  type=BatchNormComponent dim={2} '
                       'block-dim={3}'.format(name, n, dim_in, filters_in))
        configs.append('component-node name={0}.batchnorm{1} component={0}.batchnorm{1} '
                       'input={0}.relu{1}'.format(name, n))

        # the convolution.
        passthrough = ['{0}={1}'.format(opt_name, self.config[opt_name])
                       for opt_name in self._CONV_PASSTHROUGH_OPTS
                       if self.config[opt_name] != '']
        conv_opts = ('height-in={h} height-out={h} height-offsets={ho} time-offsets={to} '
                     'num-filters-in={fi} num-filters-out={fo} {r} {o}'.format(
                         h=height, ho=height_offsets, to=time_offsets,
                         fi=filters_in, fo=filters_out,
                         r=('required-time-offsets=0'
                            if self.config['allow-zero-padding'] else ''),
                         o=' '.join(passthrough)))
        configs.append('component name={0}.conv{1} type=TimeHeightConvolutionComponent '
                       '{2}'.format(name, n, conv_opts))
        configs.append('component-node name={0}.conv{1} component={0}.conv{1} '
                       'input={0}.batchnorm{1}'.format(name, n))

        return '{0}.conv{1}'.format(name, n)

    # _append_resblock_noop appends, only if bypass-source=noop, the
    # NoOpComponent whose input is the sum of the layer input and the last
    # convolution ('last_conv' is 'conv2' or 'conv3').
    def _append_resblock_noop(self, configs, last_conv):
        if self.config['bypass-source'] != 'noop':
            return
        name = self.name
        configs.append('component name={0}.noop dim={1} type=NoOpComponent'.format(
            name, self.descriptors['input']['dim']))
        configs.append('component-node name={0}.noop component={0}.noop '
                       'input=Sum({1}, {0}.{2})'.format(
                           name, self.descriptors['input']['final-string'],
                           last_conv))

    # _generate_normal_resblock_config is a convenience function to generate the
    # res-block config (the non-bottleneck version).
    #
    # The main path inside the res-block in the non-bottleneck case is as
    # follows:
    #
    # input -> relu1 -> batchnorm1 -> conv1 -> relu2 -> batchnorm2 -> conv2
    #
    # We put the relu before the batchnorm because we think it makes more sense;
    # because the Torch people seemed to find that this works better
    # (https://github.com/gcr/torch-residual-networks/issues/5);
    # and because in our batchnorm component we haven't implemented the beta and
    # gamma; these would be essential to having it work before relu, but
    # when before a convolution or linear component, they add no extra modeling
    # power.
    #
    # The output of the res-block is the sum of the last convolutional
    # component (conv2) with a bypass term.  The option 'bypass-source'
    # controls whether we sum with the raw input, or its relu or
    # relu+batchnorm.  If the term is going to be the raw input, we also offer
    # the value 'noop', which caches the output sum via a NoOpComponent--
    # because due to how nnet3 works, if we didn't do this, redundant summing
    # operations would take place.
    def _generate_normal_resblock_config(self):
        # get_full_config() sends every non-positive value here, so accept
        # all of them rather than only the default of -1.
        assert self.config['num-bottleneck-filters'] <= 0
        num_filters = self.config['num-filters']
        height_offsets = '-{hp},0,{hp}'.format(hp=self.config['height-period'])
        time_offsets = '-{p},0,{p}'.format(p=self.config['time-period'])

        configs = []
        # input -> relu1 -> batchnorm1 -> conv1 -> relu2 -> batchnorm2 -> conv2
        cur_descriptor = self.descriptors['input']['final-string']
        for n in [1, 2]:
            cur_descriptor = self._append_resblock_stage(
                configs, n, cur_descriptor, num_filters, num_filters,
                height_offsets, time_offsets)

        self._append_resblock_noop(configs, 'conv2')

        # Note: the function 'output_name' is responsible for returning the
        # descriptor corresponding to the output of the network.
        return configs


    # _generate_bottleneck_resblock_config is a convenience function to generate the
    # res-block config (this is the bottleneck version, where there is
    # a 3x3 kernel with a smaller number of filters than at the input and output,
    # sandwiched between two 1x1 kernels).
    #
    # The main path inside the res-block in the bottleneck case is as follows:
    #
    # input -> relu1 -> batchnorm1 -> conv1 -> relu2 -> batchnorm2 -> conv2 ->
    #   relu3 -> batchnorm3 -> conv3
    #
    # The output of the res-block is the sum of the last convolutional
    # component (conv3) with a bypass term.  The option 'bypass-source'
    # controls whether we sum with the raw input, or its relu or
    # relu+batchnorm.  If the term is going to be the raw input, we also offer
    # the value 'noop', which caches the output sum via a NoOpComponent--
    # because due to how nnet3 works, if we didn't do this, redundant summing
    # operations would take place.
    def _generate_bottleneck_resblock_config(self):
        num_filters = self.config['num-filters']
        num_bottleneck_filters = self.config['num-bottleneck-filters']
        assert num_bottleneck_filters > 0
        height_period = self.config['height-period']
        time_period = self.config['time-period']

        configs = []
        cur_descriptor = self.descriptors['input']['final-string']
        cur_num_filters = num_filters

        for n in [1, 2, 3]:
            # only the middle convolution looks at neighbouring positions;
            # the first and last ones are 1x1.
            height_offsets = ('-{hp},0,{hp}'.format(hp=height_period) if n == 2 else '0')
            time_offsets = ('-{t},0,{t}'.format(t=time_period) if n == 2 else '0')
            # conv1 and conv2 output the bottleneck width; conv3 goes back to
            # the full number of filters.
            next_num_filters = (num_filters if n == 3 else num_bottleneck_filters)

            cur_descriptor = self._append_resblock_stage(
                configs, n, cur_descriptor, cur_num_filters, next_num_filters,
                height_offsets, time_offsets)
            cur_num_filters = next_num_filters

        self._append_resblock_noop(configs, 'conv3')

        # Note: the function 'output_name' is responsible for returning the
        # descriptor corresponding to the output of the network.
        return configs


# This class is for lines like the following:
#
# res2-block name=res1 num-filters=64 height=32 time-period=1
#
# It is a residual block with post-activations, which does not support
# downsampling (strided convolution) or changing the number of filters;
# for that, see res2-downsample-block.
# It's a pretty standard res-block, more standard than "res-block" (XconfigResBlock).
#
# The number of filters is the same on the input and output; it is actually
# redundant to write it in the config file, because given that we know the
# height, we can work it out from the dimension of the input (as dimension =
# height * num-filters).  But we allow it to be specified anyway, for clarity.
#

# Here are the most important configuration values, with defaults shown if
# defaults exist:
#
# input='[-1]'    Descriptor giving the input of the layer.
# height          The input and output height of the image, e.g. 40.  Note: the width
#                 is associated with the time dimension and is dealt with
#                 implicitly, so it's not specified here.
# num-filters     The number of filters on the input and output, e.g. 64.
#                 It does not have to be specified; if it is not specified,
#                 we work it out from the input dimension.
# num-bottleneck-filters   If specified then this will be a 'bottleneck'
#                 ResBlock, in which there is a 1x1 convolution from
#                 num-filters->num-bottleneck-filters, a 3x3 convolution
#                 from num-bottleneck-filters->num-bottleneck-filters, and
#                 a 1x1 convolution from num-bottleneck-filters->num-filters.
# time-period=1   Think of this as the stride in the time dimension.  At the
#                 input of the network we will always have time-period=1; then
#                 after subsampling once in time we'd have time-period=2; then
#                 after subsampling again we'd have time-period=4.  Because of
#                 the way nnet3 works, subsampling on the time axis is an
#                 implicit, not explicit, operation.
# allow-zero-padding=true By default this will allow zero-padding in the time
#                       dimension, meaning that you don't need extra frames at
#                       the input to compute the output.  There may be ASR
#                       applications where you want to pad in the time dimension
#                       with repeats of the first or last frame (as we do for
#                       TDNNs), where it would be appropriate to write
#                       allow-zero-padding=false.  Note: the way we have
#                       set it up, it does zero-padding on the height axis
#                       regardless
#
# Less important config variables:
#  self-repair-scale=2.0e-05  This affects the ReLu's.  It is a scale on the
#                            'self-repair' mechanism that nudges the inputs to the
#                            ReLUs into the appropriate range in cases where
#                            the unit is active either too little of the time
#                            (<10%) or too much of the time (>90%).
#  max-change=0.75           Max-parameter-change constant (per minibatch)
#                            used for convolutional components.
#
#
# The following natural-gradient-related configuration variables are passed in
# to the convolution components, if specified:
#  use-natural-gradient (bool)
#  rank-in, rank-out    (int)
#  num-minibatches-history (float)
#  alpha-in, alpha-out (float)
# the following is also passed into the convolution components, if specified:
#  l2-regularize (float)

class XconfigRes2Block(XconfigLayerBase):
    """Implements the 'res2-block' xconfig layer.

    The layer expands to a residual block built from
    TimeHeightConvolutionComponent, BatchNormComponent,
    ScaleAndOffsetComponent and RectifiedLinearComponent.  If
    'num-bottleneck-filters' is <= 0 the two-convolution ("normal") variant is
    generated and the output node is '<name>.relu2'; otherwise the
    three-convolution bottleneck variant is generated and the output node is
    '<name>.relu3'.

    All integer quantities that are written into the generated config
    (number of input filters, input time period) are computed with floor
    division so that they stay integers under both Python 2 and Python 3;
    divisibility is checked before each such division, so it is exact.
    """

    # Options that are not inspected by this class but forwarded verbatim to
    # every convolution component, unless they were left at ''.
    _MISC_CONV_OPT_NAMES = [
        'param-stddev', 'bias-stddev', 'use-natural-gradient',
        'max-change', 'rank-in', 'rank-out', 'num-minibatches-history',
        'alpha-in', 'alpha-out', 'l2-regularize']

    def __init__(self, first_token, key_to_value, prev_names = None):
        assert first_token == 'res2-block'
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        """Install the default value of every option this layer accepts."""
        self.config = {'input':'[-1]',
                       'height':-1,  # sets height-in and height-out
                       'height-in':-1,
                       'height-out':-1,
                       'num-filters':-1, # interpreted as num-filters-out.
                       'num-bottleneck-filters':-1,
                       'time-period':1,
                       'self-repair-scale': 2.0e-05,
                       'self-repair-lower-threshold1': 0.05,
                       'self-repair-lower-threshold2': 0.05,
                       'self-repair-lower-threshold3': 0.05,
                       'max-change': 0.75,
                       'allow-zero-padding': True,
                       # the following are not really inspected by this level of
                       # code, just passed through (but not if left at '').
                       'param-stddev':'', 'bias-stddev':'',
                       'use-natural-gradient':'',
                       'rank-in':'', 'rank-out':'',
                       'num-minibatches-history':'',
                       'alpha-in':'', 'alpha-out':'',
                       'l2-regularize':'' }

    def set_derived_configs(self):
        """Resolve 'height-in'/'height-out' and validate them against the input.

        Either 'height' alone, or both 'height-in' and 'height-out', must be
        given.  When only 'height' is given it is copied to both.

        Raises:
            RuntimeError: if the height options are missing or inconsistent,
                or if 'height-in' does not divide the input dimension.
        """
        input_dim = self.descriptors['input']['dim']
        height = self.config['height']

        only_height = (height > 0 and self.config['height-in'] == -1 and
                       self.config['height-out'] == -1)
        both_explicit = (self.config['height-out'] > 0 and
                         self.config['height-in'] > 0)
        if not (only_height or both_explicit):
            raise RuntimeError("You must specify height, or height-in and height-out, for res2-block.")

        if not both_explicit:
            # The check above guarantees height > 0 on this path.
            self.config['height-in'] = height
            self.config['height-out'] = height

        height_in = self.config['height-in']
        if input_dim % height_in != 0:
            raise RuntimeError("Specified input image height {0} does not "
                               "divide the input dim {1}".format(
                                   height_in, input_dim))
        # 'num-filters' is intentionally not derived here: it is the number of
        # *output* filters and check_configs() requires the user to set it.

    def check_configs(self):
        """Raise RuntimeError if the mandatory 'num-filters' option is unset."""
        if self.config['num-filters'] == -1:
            raise RuntimeError("You must specify num-filters for res2-block.")

    def auxiliary_outputs(self):
        return []

    def output_name(self, auxiliary_output = None):
        """Return the name of the final ReLU node ('relu2' or 'relu3')."""
        b = self.config['num-bottleneck-filters']
        return ('{0}.relu2' if b <= 0 else '{0}.relu3').format(self.name)

    def output_dim(self, auxiliary_output = None):
        """Return height-out * num-filters, the dimension of the output node."""
        assert auxiliary_output is None
        return self.config['height-out'] * self.config['num-filters']

    def get_full_config(self):
        """Return a list of (config_name, line) pairs for 'ref' and 'final'."""
        ans = []
        b = self.config['num-bottleneck-filters']
        if b <= 0:
            config_lines = self._generate_normal_resblock_config()
        else:
            config_lines = self._generate_bottleneck_resblock_config()

        for line in config_lines:
            for config_name in ['ref', 'final']:
                # we do not support user specified matrices in CNN initialization
                # so 'ref' and 'final' configs are the same.
                ans.append((config_name, line))
        return ans

    def _misc_conv_opts(self):
        """Return the pass-through convolution options as one 'k=v ...' string."""
        opts = []
        for opt_name in self._MISC_CONV_OPT_NAMES:
            value = self.config[opt_name]
            if value != '':
                opts.append('{0}={1}'.format(opt_name, value))
        return ' '.join(opts)

    def _append_post_conv_configs(self, configs, n, num_filters, height,
                                  cur_descriptor, bypass_descriptor = None):
        """Append batchnorm -> scaleoffset -> relu for stage 'n' to 'configs'.

        'cur_descriptor' is the input of the batch-norm (the conv output).  If
        'bypass_descriptor' is not None, the ReLU reads the sum of the
        scale-and-offset output and the bypass.  Returns the name of the ReLU
        node.
        """
        name = self.name
        dim = num_filters * height
        for prefix, component_type in [('batchnorm', 'BatchNormComponent'),
                                       ('scaleoffset', 'ScaleAndOffsetComponent')]:
            # (the double space before 'type=' is kept so that the generated
            # lines are unchanged.)
            configs.append('component name={0}.{1}{2}  type={3} dim={4} '
                           'block-dim={5}'.format(
                               name, prefix, n, component_type, dim, num_filters))
            configs.append('component-node name={0}.{1}{2} component={0}.{1}{2} '
                           'input={3}'.format(name, prefix, n, cur_descriptor))
            cur_descriptor = '{0}.{1}{2}'.format(name, prefix, n)

        if bypass_descriptor is not None:
            # the bypass connection
            cur_descriptor = 'Sum({0}, {1})'.format(cur_descriptor, bypass_descriptor)

        # the ReLU
        configs.append('component name={0}.relu{1} type=RectifiedLinearComponent '
                       'dim={2} block-dim={3} self-repair-scale={4} '
                       'self-repair-lower-threshold={5}'.format(
                           name, n, dim, num_filters,
                           self.config['self-repair-scale'],
                           self.config['self-repair-lower-threshold{0}'.format(n)]))
        configs.append('component-node name={0}.relu{1} component={0}.relu{1} '
                       'input={2}'.format(name, n, cur_descriptor))
        return '{0}.relu{1}'.format(name, n)

    def _append_bypass_conv_configs(self, configs, height_in, height_out,
                                    height_subsample, num_filters_in,
                                    num_filters_out, input_descriptor,
                                    misc_conv_opts):
        """Append the 1x1 '<name>.conv_bypass' convolution to 'configs'.

        It is needed when the block subsamples on height or changes the number
        of filters, so that the bypass has the same dimension as the main path.
        """
        name = self.name
        conv_opts = ('height-in={hi} height-out={ho} height-offsets=0 '
                     'time-offsets=0 height-subsample-out={hs} '
                     'num-filters-in={fi} num-filters-out={fo} {o}'.format(
                         hi=height_in, ho=height_out, hs=height_subsample,
                         fi=num_filters_in, fo=num_filters_out, o=misc_conv_opts))
        configs.append('component name={0}.conv_bypass type=TimeHeightConvolutionComponent '
                       '{1}'.format(name, conv_opts))
        configs.append('component-node name={0}.conv_bypass component={0}.conv_bypass '
                       'input={1}'.format(name, input_descriptor))

    # _generate_normal_resblock_config is a convenience function to generate the
    # res-block config (the non-bottleneck version).
    #
    # The main path inside the res-block in the non-bottleneck case is as
    # follows:
    #
    # input -> conv1 -> batchnorm1 -> scaleoffset1 -> relu1 -> conv2 -> batchnorm2 -> scaleoffset2 -> relu2
    #
    # where the 'scaleoffsetN' are ScaleAndOffsetComponent, which conventionally would be
    # considered part of the BatchNorm.
    #
    # The relu2 actually sees the sum of the input and  'scaleoffset2'-- which gives us the bypass
    # connection.
    def _generate_normal_resblock_config(self):
        configs = []
        name = self.name
        # get_full_config() only dispatches here when this holds.
        assert self.config['num-bottleneck-filters'] <= 0
        input_dim = self.descriptors['input']['dim']
        height_in = self.config['height-in']
        height_out = self.config['height-out']
        time_period_out = self.config['time-period']
        if not input_dim % height_in == 0:
            raise RuntimeError("input-dim {0} does not divide height-in {1}".format(
                input_dim, height_in))
        # Floor division: exact (checked above) and keeps the value an int, so
        # that the config gets e.g. 'num-filters-in=64' and not '64.0'.
        num_filters_in = input_dim // height_in
        num_filters_out = self.config['num-filters']

        if height_out != height_in:
            # Tolerance check only; the quotient is never written to the
            # config, so this division is left exactly as it was.
            if height_out < height_in / 2 - 1 or height_out > height_in / 2 + 1:
                raise RuntimeError("Expected height-out to be about half height-in, or the same: "
                                   "height-in={0} height-out={1}".format(height_in, height_out))
            if not time_period_out % 2 == 0:
                raise RuntimeError("Expected time-period to be a multiple of 2 if you are subsampling "
                                   "on height.")
            # Exact (checked above); must stay an int because it is written
            # into 'time-offsets'.
            time_period_in = time_period_out // 2
            height_subsample = 2
        else:
            time_period_in = time_period_out
            height_subsample = 1

        cur_time_period = time_period_in
        cur_num_filters = num_filters_in
        cur_height = height_in

        input_descriptor = self.descriptors['input']['final-string']
        allow_zero_padding = self.config['allow-zero-padding']
        if height_subsample == 1 and num_filters_in == num_filters_out:
            bypass_descriptor = input_descriptor
        else:
            bypass_descriptor = '{0}.conv_bypass'.format(name)

        cur_descriptor = input_descriptor

        # get miscellaneous convolution options passed in from the xconfig line
        misc_conv_opts = self._misc_conv_opts()

        for n in [1, 2]:
            # the convolution.  Only the first one subsamples on height.
            conv_opts = ('height-in={hi} height-out={ho} height-offsets=-1,0,1 '
                         'height-subsample-out={hs} '
                         'time-offsets=-{p},0,{p} '
                         'num-filters-in={fi} num-filters-out={fo} {r} {o}'.format(
                             hi=cur_height, ho=height_out,
                             p=cur_time_period,
                             hs=(height_subsample if n == 1 else 1),
                             fi=cur_num_filters,
                             fo=num_filters_out,
                             r=('required-time-offsets=0' if allow_zero_padding else ''),
                             o=misc_conv_opts))

            configs.append('component name={0}.conv{1} type=TimeHeightConvolutionComponent '
                           '{2}'.format(name, n, conv_opts))
            configs.append('component-node name={0}.conv{1} component={0}.conv{1} '
                           'input={2}'.format(name, n, cur_descriptor))

            cur_num_filters = num_filters_out
            cur_height = height_out
            cur_time_period = time_period_out

            # batch-norm, scale-and-offset and ReLU; the last ReLU also sees
            # the bypass.
            cur_descriptor = self._append_post_conv_configs(
                configs, n, cur_num_filters, cur_height,
                '{0}.conv{1}'.format(name, n),
                bypass_descriptor if n == 2 else None)

        if bypass_descriptor != input_descriptor:
            # We need to add the 1x1 bypass convolution because we're either doing height
            # subsampling or changing the number of filters.
            self._append_bypass_conv_configs(
                configs, height_in, height_out, height_subsample,
                num_filters_in, num_filters_out, input_descriptor,
                misc_conv_opts)

        # Note: the function 'output_name' is responsible for returning the
        # descriptor corresponding to the output of the network, which in
        # this case would be '{0}.relu2'.format(name).
        return configs


    # _generate_bottleneck_resblock_config is a convenience function to generate the
    # res-block config (this is the bottleneck version, where there is
    # a 3x3 kernel with a smaller number of filters than at the input and output,
    # sandwiched between two 1x1 kernels.
    #
    # The main path inside the res-block in the bottleneck case is as follows:
    #
    # input -> conv1 -> batchnorm1 -> scaleoffset1 -> relu1 ->
    #          conv2 -> batchnorm2 -> scaleoffset2 -> relu2 ->
    #          conv3 -> batchnorm3 -> scaleoffset3 -> relu3
    #
    #  but the relu3 takes as its input the sum of 'input' and 'scaleoffset3'.
    #
    def _generate_bottleneck_resblock_config(self):
        configs = []

        name = self.name
        num_bottleneck_filters = self.config['num-bottleneck-filters']
        assert num_bottleneck_filters > 0
        input_dim = self.descriptors['input']['dim']
        height_in = self.config['height-in']
        height_out = self.config['height-out']
        input_descriptor = self.descriptors['input']['final-string']
        allow_zero_padding = self.config['allow-zero-padding']
        time_period_out = self.config['time-period']
        if not input_dim % height_in == 0:
            raise RuntimeError("input-dim={0} does not divide height-in={1}".format(
                input_dim, height_in))
        # Floor division: exact (checked above) and keeps the value an int, so
        # that the config gets e.g. 'num-filters-in=64' and not '64.0'.
        num_filters_in = input_dim // height_in
        num_filters_out = self.config['num-filters']

        if height_out != height_in:
            # Tolerance check only; the quotient is never written to the
            # config, so this division is left exactly as it was.
            if height_out < height_in / 2 - 1 or height_out > height_in / 2 + 1:
                raise RuntimeError("Expected height-out to be about half height-in, or the same: "
                                   "height-in={0} height-out={1}".format(height_in, height_out))
            height_subsample = 2
        else:
            height_subsample = 1

        cur_descriptor = input_descriptor
        cur_num_filters = num_filters_in
        cur_height = height_in
        if height_subsample == 1 and num_filters_in == num_filters_out:
            bypass_descriptor = input_descriptor
        else:
            bypass_descriptor = '{0}.conv_bypass'.format(name)

        # get miscellaneous convolution options passed in from the xconfig line
        misc_conv_opts = self._misc_conv_opts()

        for n in [1, 2, 3]:
            # the convolution.  conv1 and conv3 are 1x1; conv2 is the 3x3 one.
            height_offsets = ('-1,0,1' if n == 2 else '0')
            this_height_subsample = height_subsample if n == 1 else 1
            time_offsets = ('-{t},0,{t}'.format(t=time_period_out) if n == 2 else '0')
            next_num_filters = (num_filters_out if n == 3 else num_bottleneck_filters)

            conv_opts = ('height-in={h_in} height-out={h_out} height-offsets={ho} time-offsets={to} '
                         'num-filters-in={fi} num-filters-out={fo} height-subsample-out={hs} '
                         '{r} {o}'.format(
                             h_in=cur_height, h_out=height_out,
                             to=time_offsets, ho=height_offsets,
                             hs=this_height_subsample,
                             fi=cur_num_filters, fo=next_num_filters,
                             r=('required-time-offsets=0' if allow_zero_padding else ''),
                             o=misc_conv_opts))

            configs.append('component name={0}.conv{1} type=TimeHeightConvolutionComponent '
                           '{2}'.format(name, n, conv_opts))
            configs.append('component-node name={0}.conv{1} component={0}.conv{1} '
                           'input={2}'.format(name, n, cur_descriptor))

            cur_num_filters = next_num_filters
            cur_height = height_out

            # batch-norm, scale-and-offset and ReLU; the last ReLU also sees
            # the bypass.
            cur_descriptor = self._append_post_conv_configs(
                configs, n, cur_num_filters, cur_height,
                '{0}.conv{1}'.format(name, n),
                bypass_descriptor if n == 3 else None)

        if bypass_descriptor != input_descriptor:
            # We need to add the 1x1 bypass convolution because we're either doing height
            # subsampling or changing the number of filters.
            self._append_bypass_conv_configs(
                configs, height_in, height_out, height_subsample,
                num_filters_in, num_filters_out, input_descriptor,
                misc_conv_opts)

        # Note: the function 'output_name' is responsible for returning the
        # descriptor corresponding to the output of the network, which
        # in this case will be '{0}.relu3'.format(name).
        return configs


# This layer just maps to a single component, a SumBlockComponent.  It's for
# doing channel averaging at the end of neural networks.  See scripts for
# examples of how to use it.
# An example line using this layer is:
# channel-average-layer name=channel-average input=Append(2, 4, 6, 8) dim=64

# the configuration value 'dim' is the output dimension of this layer.
# The input dimension is expected to be a multiple of 'dim'.  The output
# will be the average of 'dim'-sized blocks of the input.
class ChannelAverageLayer(XconfigLayerBase):
    """xconfig layer 'channel-average-layer', backed by one SumBlockComponent.

    The single component is given scale = dim / input-dim, which turns the
    block sum into an average.
    """

    def __init__(self, first_token, key_to_value, prev_names = None):
        assert first_token == "channel-average-layer"
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        # 'dim' has no usable default; check_configs() insists on a value > 0.
        defaults = {}
        defaults['input'] = '[-1]'
        defaults['dim'] = -1
        self.config = defaults

    def set_derived_configs(self):
        # Nothing is derived for this layer.
        pass

    def check_configs(self):
        """Require dim > 0 and that dim divides the input dimension."""
        out_dim = self.config['dim']
        in_dim = self.descriptors['input']['dim']
        if out_dim <= 0:
            raise RuntimeError("dim must be specified and > 0.")
        if in_dim % out_dim != 0:
            raise RuntimeError("input-dim={0} is not a multiple of dim={1}".format(
                in_dim, out_dim))

    def auxiliary_outputs(self):
        return []

    def output_name(self, auxiliary_output = None):
        # The component-node is named after the layer itself.
        assert auxiliary_output is None
        return self.name

    def output_dim(self, auxiliary_output = None):
        assert auxiliary_output is None
        return self.config['dim']

    def get_full_config(self):
        """Return every generated line once for 'ref' and once for 'final'."""
        return [(config_name, line)
                for line in self._generate_channel_average_config()
                for config_name in ['ref', 'final']]

    def _generate_channel_average_config(self):
        """Return the component and component-node lines of this layer."""
        layer_name = self.name
        source = self.descriptors['input']
        in_dim = source['dim']
        out_dim = self.config['dim']
        # choose the scale that makes it an average rather than a sum.
        scale = out_dim * 1.0 / in_dim
        component_line = ('component name={0} type=SumBlockComponent input-dim={1} '
                          'output-dim={2} scale={3}'.format(layer_name, in_dim,
                                                            out_dim, scale))
        node_line = 'component-node name={0} component={0} input={1}'.format(
            layer_name, source['final-string'])
        return [component_line, node_line]


================================================
FILE: egs/steps/libs/nnet3/xconfig/gru.py
================================================
# Copyright 2016    Johns Hopkins University (Dan Povey)
#           2017    Gaofeng Cheng (UCAS)
#           2017    Lu Huang (THU)
#           2018    Hang Lyu
# Apache 2.0.


""" This module has the implementations of different GRU layers.
"""
from __future__ import print_function
import math
import re
import sys
from libs.nnet3.xconfig.basic_layers import XconfigLayerBase

# This class is for lines like
#   'gru-layer name=gru1 input=[-1] delay=-3'
# It generates a GRU sub-graph without output projections.
# The output dimension of the layer may be specified via 'cell-dim=xxx', but if not specified,
# the dimension defaults to the same as the input.
# See other configuration values below.
# decay-time is deprecated under GRU or PGRU, as I found the PGRUs do not need the decay-time option to get generalized to unseen sequence length
#
# Parameters of the class, and their defaults:
#   input='[-1]'             [Descriptor giving the input of the layer.]
#   cell-dim=-1              [Dimension of the cell]
#   delay=-1                 [Delay in the recurrent connections of the GRU/LSTM ]
#   clipping-threshold=30    [Similar to LSTMs, nnet3 GRUs use a gradient clipping component at the recurrent connections.
#                             This is the threshold used to decide if clipping has to be activated ]
#   zeroing-interval=20      [interval at which we (possibly) zero out the recurrent derivatives.]
#   zeroing-threshold=15     [We only zero out the derivs every zeroing-interval, if derivs exceed this value.]
#   self-repair-scale-nonlinearity=1e-5      [It is a constant scaling the self-repair vector computed in derived classes of NonlinearComponent,
#                                       i.e.,  SigmoidComponent, TanhComponent and RectifiedLinearComponent ]
#   ng-per-element-scale-options=' max-change=0.75'   [Additional options used for the diagonal matrices in the GRU/LSTM ]
#   ng-affine-options=' max-change=0.75 '             [Additional options used for the full matrices in the GRU/LSTM, can be used to do things like set biases to initialize to 1]
class XconfigGruLayer(XconfigLayerBase):
    """xconfig layer 'gru-layer': a GRU sub-graph without output projections.

    The output node is '<name>.s_t' and its dimension is 'cell-dim'.
    """

    def __init__(self, first_token, key_to_value, prev_names = None):
        assert first_token == "gru-layer"
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        self.config = {
            'input': '[-1]',
            'cell-dim': -1,  # this is a compulsory argument
            'clipping-threshold': 30.0,
            'delay': -1,
            'ng-per-element-scale-options': ' max-change=0.75',
            'ng-affine-options': ' max-change=0.75 ',
            'self-repair-scale-nonlinearity': 0.00001,
            'zeroing-interval': 20,
            'zeroing-threshold': 15.0,
        }

    def set_derived_configs(self):
        # An unset cell-dim falls back to the dimension of the input.
        if self.config['cell-dim'] <= 0:
            self.config['cell-dim'] = self.descriptors['input']['dim']

    def check_configs(self):
        """Validate cell-dim, delay and self-repair-scale-nonlinearity."""
        cell_dim = self.config['cell-dim']
        if cell_dim <= 0:
            raise RuntimeError("cell-dim has invalid value {0}.".format(cell_dim))

        if self.config['delay'] == 0:
            raise RuntimeError("delay cannot be zero")

        key = 'self-repair-scale-nonlinearity'
        value = self.config[key]
        if value < 0.0 or value > 1.0:
            raise RuntimeError("{0} has invalid value {1}.".format(key, value))

    def output_name(self, auxiliary_output = None):
        return '{0}.{1}'.format(self.name, 's_t')

    def output_dim(self, auxiliary_output = None):
        return self.config['cell-dim']

    def get_full_config(self):
        # we do not support user specified matrices in LSTM initialization
        # so 'ref' and 'final' configs are the same.
        return [(config_name, line)
                for line in self.generate_gru_config()
                for config_name in ['ref', 'final']]

    # convenience function to generate the GRU config
    def generate_gru_config(self):
        """Return the list of nnet3 config lines that make up this GRU layer."""
        cfg = self.config
        name = self.name
        delay = cfg['delay']
        cell_dim = cfg['cell-dim']
        affine_str = cfg['ng-affine-options']
        # in the below code we will just call descriptor_strings as descriptors for conciseness
        source = self.descriptors['input']
        input_dim = source['dim']
        input_descriptor = source['final-string']

        # options of the BackpropTruncationComponent on the recurrence.
        bptrunc_str = ("clipping-threshold={0}"
                       " zeroing-threshold={1}"
                       " zeroing-interval={2}"
                       " recurrence-interval={3}"
                       "".format(cfg['clipping-threshold'],
                                 cfg['zeroing-threshold'],
                                 cfg['zeroing-interval'], abs(delay)))

        repair_nonlin = cfg['self-repair-scale-nonlinearity']
        if repair_nonlin is not None:
            repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin)
        else:
            repair_nonlin_str = ''

        # Natural gradient per element scale parameters
        # TODO: decide if we want to keep exposing these options
        # (the resulting string is not used by any line generated below.)
        ng_per_element_scale_options = cfg['ng-per-element-scale-options']
        if not any(re.search(pattern, ng_per_element_scale_options) is not None
                   for pattern in ('param-mean', 'param-stddev')):
            ng_per_element_scale_options += " param-mean=0.0 param-stddev=1.0 "
        pes_str = ng_per_element_scale_options

        recurrent_connection = '{0}.s_t'.format(name)
        affine_input_dim = input_dim + cell_dim
        product_input_dim = 2 * cell_dim

        # formulation like:
        # z_t = \sigmoid ( x_t * U^z + h_{t-1} * W^z ) // update gate
        # r_t = \sigmoid ( x_t * U^r + h_{t-1} * W^r ) // reset gate
        # \tilde{h}_t = \tanh ( x_t * U^h + ( h_{t-1} \dot r_t ) * W^h )
        # h_t = ( 1 - z_t ) \dot \tilde{h}_t + z_t \dot h_{t-1}
        # y_t = h_t // y_t is the output
        configs = [
            "# Update gate control : W_z* matrics",
            "component name={0}.W_z.xs_z type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, affine_input_dim, cell_dim, affine_str),

            "# Reset gate control : W_r* matrics",
            "component name={0}.W_z.xs_r type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, affine_input_dim, cell_dim, affine_str),

            "# h related matrix : W_h* matrics",
            "component name={0}.W_h.UW type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, affine_input_dim, cell_dim , affine_str),

            "# Defining the non-linearities",
            "component name={0}.z type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str),
            "component name={0}.r type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str),
            "component name={0}.h type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str),

            "# Defining the components for other cell computations",
            "component name={0}.h1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, product_input_dim, cell_dim),
            "component name={0}.y1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, product_input_dim, cell_dim),
            "component name={0}.y2 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, product_input_dim, cell_dim),
            "component name={0}.y type=NoOpComponent dim={1}".format(name, cell_dim),

            "# z_t",
            "component-node name={0}.z_t_pre component={0}.W_z.xs_z input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay),
            "component-node name={0}.z_t component={0}.z input={0}.z_t_pre".format(name),

            "# r_t",
            "component-node name={0}.r_t_pre component={0}.W_z.xs_r input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay),
            "component-node name={0}.r_t component={0}.r input={0}.r_t_pre".format(name),

            "# h_t",
            "component-node name={0}.h1_t component={0}.h1 input=Append({0}.r_t, IfDefined(Offset({1}, {2})))".format(name, recurrent_connection, delay),
            "component-node name={0}.h_t_pre component={0}.W_h.UW input=Append({1}, {0}.h1_t)".format(name, input_descriptor),
            "component-node name={0}.h_t component={0}.h input={0}.h_t_pre".format(name),

            "# y_t",
            "# The following two lines are to implement (1 - z_t)",
            "component-node name={0}.y1_t component={0}.y1 input=Append({0}.h_t, Sum(Scale(-1.0,{0}.z_t), Const(1.0, {1})))".format(name, cell_dim),
            "component-node name={0}.y2_t component={0}.y2 input=Append(IfDefined(Offset({1}, {2})), {0}.z_t)".format(name, recurrent_connection, delay),
            "component-node name={0}.y_t component={0}.y input=Sum({0}.y1_t, {0}.y2_t)".format(name),

            "# s_t : recurrence",
            "component name={0}.s_r type=BackpropTruncationComponent dim={1} {2}".format(name, cell_dim, bptrunc_str),

            "# s_t will be output and recurrence",
            "component-node name={0}.s_t component={0}.s_r input={0}.y_t".format(name),
        ]
        return configs


# This class is for lines like
#   'pgru-layer name=pgru1 input=[-1] delay=-3'
# It generates a PGRU sub-graph with output projections. It can also generate
# outputs without projection, but you could use the XconfigGruLayer for this
# simple RNN.
# The output dimension of the layer may be specified via 'cell-dim=xxx', but if not specified,
# the dimension defaults to the same as the input.
# See other configuration values below.
#
# Parameters of the class, and their defaults:
#   input='[-1]'             [Descriptor giving the input of the layer.]
#   cell-dim=-1            [Dimension of the cell]
#   recurrent-projection-dim [Dimension of the projection used in recurrent connections, e.g. cell-dim/4]
#   non-recurrent-projection-dim   [Dimension of the projection in non-recurrent connections,
#                                   in addition to recurrent-projection-dim, e.g. cell-dim/4]
#   delay=-1                 [Delay in the recurrent connections of the GRU ]
#   clipping-threshold=30    [nnet3 GRU use a gradient clipping component at the recurrent connections.
#                             This is the threshold used to decide if clipping has to be activated ]
#   zeroing-interval=20      [interval at which we (possibly) zero out the recurrent derivatives.]
#   zeroing-threshold=15     [We only zero out the derivs every zeroing-interval, if derivs exceed this value.]
#   self-repair-scale-nonlinearity=1e-5      [It is a constant scaling the self-repair vector computed in derived classes of NonlinearComponent]
#                                       i.e.,  SigmoidComponent, TanhComponent and RectifiedLinearComponent ]
#   ng-per-element-scale-options=''   [Additional options used for the diagonal matrices in the GRU ]
#   ng-affine-options=''              [Additional options used for the full matrices in the GRU, can be used to do things like set biases to initialize to 1]

class XconfigPgruLayer(XconfigLayerBase):
    """Xconfig layer that expands a 'pgru-layer' line into nnet3 config lines.

    The generated sub-graph is a GRU whose cell output ('<name>.y_t') goes
    through an affine projection of dimension
    recurrent-projection-dim + non-recurrent-projection-dim ('<name>.sn_t').
    The first recurrent-projection-dim elements of that projection are passed
    through a BackpropTruncationComponent ('<name>.s_t') and fed back to the
    gates with the configured delay.

    The main output is '<name>.sn_t'; '<name>.h_t' (the tanh node, of
    dimension cell-dim) is exposed as an auxiliary output.
    """

    def __init__(self, first_token, key_to_value, prev_names = None):
        assert first_token == "pgru-layer"
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        """Set self.config to the default value of every supported option."""
        self.config = {'input' : '[-1]',
                        'cell-dim' : -1, # this is a compulsory argument
                        'recurrent-projection-dim' : -1,  # defaults to cell-dim / 4
                        'non-recurrent-projection-dim' : -1, # defaults to
                                                             # recurrent-projection-dim
                        'clipping-threshold' : 30.0,
                        'delay' : -1,
                        'ng-per-element-scale-options' : ' max-change=0.75 ',
                        'ng-affine-options' : ' max-change=0.75 ',
                        'self-repair-scale-nonlinearity' : 0.00001,
                        'zeroing-interval' : 20,
                        'zeroing-threshold' : 15.0
                       }

    def set_derived_configs(self):
        """Fill in projection dims that were left at their non-positive default."""
        if self.config['recurrent-projection-dim'] <= 0:
            # Floor division keeps the dimension an integer under Python 3 as
            # well; with '/' the generated config would contain e.g. 'dim=256.0'.
            self.config['recurrent-projection-dim'] = self.config['cell-dim'] // 4

        if self.config['non-recurrent-projection-dim'] <= 0:
            self.config['non-recurrent-projection-dim'] = \
               self.config['recurrent-projection-dim']

    def check_configs(self):
        """Validate the option values; raise RuntimeError on the first bad one."""
        for key in ['cell-dim', 'recurrent-projection-dim',
                    'non-recurrent-projection-dim']:
            if self.config[key] <= 0:
                raise RuntimeError("{0} has invalid value {1}.".format(
                    key, self.config[key]))

        if self.config['delay'] == 0:
            raise RuntimeError("delay cannot be zero")

        if (self.config['recurrent-projection-dim'] +
            self.config['non-recurrent-projection-dim'] >
            self.config['cell-dim']):
            raise RuntimeError("recurrent+non-recurrent projection dim exceeds "
                                "cell dim.")
        for key in ['self-repair-scale-nonlinearity']:
            if self.config[key] < 0.0 or self.config[key] > 1.0:
                # Report the offending key (the old message printed the layer
                # type in its place and never named the option).
                raise RuntimeError("{0} has invalid value {1}."
                                   .format(key, self.config[key]))

    def auxiliary_outputs(self):
        """Return the names of the nodes usable as auxiliary outputs."""
        return ['h_t']

    def output_name(self, auxiliary_output = None):
        """Return the node name of the main output or of an auxiliary output.

        Raises Exception if auxiliary_output is not in auxiliary_outputs().
        """
        node_name = 'sn_t'
        if auxiliary_output is not None:
            if auxiliary_output in self.auxiliary_outputs():
                node_name = auxiliary_output
            else:
                raise Exception("In {0} of type {1}, unknown auxiliary output name {2}".format(
                    self.name, self.layer_type, auxiliary_output))

        return '{0}.{1}'.format(self.name, node_name)

    def output_dim(self, auxiliary_output = None):
        """Return the dimension of the main output or of an auxiliary output.

        Raises Exception if auxiliary_output is not in auxiliary_outputs().
        """
        if auxiliary_output is not None:
            if auxiliary_output not in self.auxiliary_outputs():
                raise Exception("In {0} of type {1}, unknown auxiliary output name {2}".format(
                    self.name, self.layer_type, auxiliary_output))
            if auxiliary_output == 'h_t':
                # h_t is produced by the TanhComponent of dimension cell-dim.
                return self.config['cell-dim']
            # add code for other auxiliary_outputs here when we decide to expose them

        return self.config['recurrent-projection-dim'] + self.config['non-recurrent-projection-dim']

    def get_full_config(self):
        """Return a list of (config_name, line) pairs for 'ref' and 'final'."""
        ans = []
        config_lines = self.generate_pgru_config()

        for line in config_lines:
            for config_name in ['ref', 'final']:
                # we do not support user specified matrices in GRU initialization
                # so 'ref' and 'final' configs are the same.
                ans.append((config_name, line))
        return ans

    # convenience function to generate the PGRU config
    def generate_pgru_config(self):
        """Return the list of nnet3 config lines that make up this layer."""

        # assign some variables to reduce verbosity
        name = self.name
        # in the below code we will just call descriptor_strings as descriptors for conciseness
        input_dim = self.descriptors['input']['dim']
        input_descriptor = self.descriptors['input']['final-string']
        cell_dim = self.config['cell-dim']
        rec_proj_dim = self.config['recurrent-projection-dim']
        nonrec_proj_dim = self.config['non-recurrent-projection-dim']
        delay = self.config['delay']
        repair_nonlin = self.config['self-repair-scale-nonlinearity']
        repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else ''
        bptrunc_str = ("clipping-threshold={0}"
                      " zeroing-threshold={1}"
                      " zeroing-interval={2}"
                      " recurrence-interval={3}"
                      "".format(self.config['clipping-threshold'],
                                self.config['zeroing-threshold'],
                                self.config['zeroing-interval'],
                                abs(delay)))
        affine_str = self.config['ng-affine-options']
        # NOTE: 'ng-per-element-scale-options' is accepted as an option but no
        # component generated below uses it.

        # formulation like:
        # z_t = \sigmoid ( x_t * U^z + s_{t-1} * W^z ) // update gate
        # r_t = \sigmoid ( x_t * U^r + s_{t-1} * W^r ) // reset gate
        # \tilde{h}_t = \tanh ( x_t * U^h + ( s_{t-1} \dot r_t ) * W^h )
        # h_t = ( 1 - z_t ) \dot \tilde{h}_t + z_t \dot h_{t-1}
        # y_t = h_t * W^y
        # s_t = y_t (0:rec_proj_dim-1)
        #
        # Mapping onto the node names used below: node 'h_t' holds \tilde{h}_t,
        # node 'y_t' holds the h_t of the formulas, node 'sn_t' holds the
        # projected output (y_t of the formulas) and node 's_t' is its
        # recurrent part after backprop truncation.

        configs = []
        configs.append("# Update gate control : W_z* matrics")
        configs.append("component name={0}.W_z.xs_z type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str))

        configs.append("# Reset gate control : W_r* matrics")
        configs.append("component name={0}.W_z.xs_r type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, rec_proj_dim, affine_str))

        configs.append("# h related matrix : W_h* matrics")
        configs.append("component name={0}.W_h.UW type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim , affine_str))

        configs.append("# Defining the non-linearities")
        configs.append("component name={0}.z type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str))
        configs.append("component name={0}.r type=SigmoidComponent dim={1} {2}".format(name, rec_proj_dim, repair_nonlin_str))
        configs.append("component name={0}.h type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str))

        configs.append("# Defining the components for other cell computations")
        configs.append("component name={0}.h1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * rec_proj_dim, rec_proj_dim))
        configs.append("component name={0}.y1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim))
        configs.append("component name={0}.y2 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim))
        configs.append("component name={0}.y type=NoOpComponent dim={1}".format(name, cell_dim))

        # s_t feeds the gates; y_t feeds the cell-state interpolation.
        recurrent_connection = '{0}.s_t'.format(name)
        recurrent_connection_y = '{0}.y_t'.format(name)

        configs.append("# z_t")
        configs.append("component-node name={0}.z_t_pre component={0}.W_z.xs_z input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay))
        configs.append("component-node name={0}.z_t component={0}.z input={0}.z_t_pre".format(name))

        configs.append("# r_t")
        configs.append("component-node name={0}.r_t_pre component={0}.W_z.xs_r input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay))
        configs.append("component-node name={0}.r_t component={0}.r input={0}.r_t_pre".format(name))

        configs.append("# h_t")
        configs.append("component-node name={0}.h1_t component={0}.h1 input=Append({0}.r_t, IfDefined(Offset({1}, {2})))".format(name, recurrent_connection, delay))
        configs.append("component-node name={0}.h_t_pre component={0}.W_h.UW input=Append({1}, {0}.h1_t)".format(name, input_descriptor))
        configs.append("component-node name={0}.h_t component={0}.h input={0}.h_t_pre".format(name))

        # y1_t = h_t * (1 - z_t),  y2_t = y_{t+delay} * z_t,  y_t = y1_t + y2_t
        configs.append("component-node name={0}.y1_t component={0}.y1 input=Append({0}.h_t, Sum(Scale(-1.0,{0}.z_t), Const(1.0, {1})))".format(name, cell_dim))
        configs.append("component-node name={0}.y2_t component={0}.y2 input=Append(IfDefined(Offset({1}, {2})), {0}.z_t)".format(name, recurrent_connection_y, delay))

        configs.append("component-node name={0}.y_t component={0}.y input=Sum({0}.y1_t, {0}.y2_t)".format(name))

        configs.append("# s_t recurrent")
        configs.append("component name={0}.W_s.ys type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str))
        configs.append("component name={0}.s_r type=BackpropTruncationComponent dim={1} {2}".format(name, rec_proj_dim, bptrunc_str))

        configs.append("# s_t and n_t : sn_t will be the output")
        configs.append("component-node name={0}.sn_t component={0}.W_s.ys input={0}.y_t".format(name))
        configs.append("dim-range-node name={0}.s_t_preclip input-node={0}.sn_t dim-offset=0 dim={1}".format(name, rec_proj_dim))
        configs.append("component-node name={0}.s_t component={0}.s_r input={0}.s_t_preclip".format(name))

        return configs


# This class is for lines like
#   'norm-pgru-layer name=norm-pgru1 input=[-1] delay=-3'

# Different from the vanilla PGRU, the NormPGRU uses batchnorm in the forward direction
# and renorm in the recurrence.

# 'cell-dim=xxx' is compulsory (check_configs rejects values <= 0). The output
# dimension of the layer is recurrent-projection-dim + non-recurrent-projection-dim.
# See other configuration values below.
#
# Parameters of the class, and their defaults:
#   input='[-1]'             [Descriptor giving the input of the layer.]
#   cell-dim=-1            [Dimension of the cell]
#   recurrent-projection-dim [Dimension of the projection used in recurrent connections, e.g. cell-dim/4]
#   non-recurrent-projection-dim   [Dimension of the projection in non-recurrent connections,
#                                   in addition to recurrent-projection-dim, e.g. cell-dim/4]
#   delay=-1                 [Delay in the recurrent connections of the GRU ]
#   clipping-threshold=30    [nnet3 GRU use a gradient clipping component at the recurrent connections.
#                             This is the threshold used to decide if clipping has to be activated ]
#   zeroing-interval=20      [interval at which we (possibly) zero out the recurrent derivatives.]
#   zeroing-threshold=15     [We only zero out the derivs every zeroing-interval, if derivs exceed this value.]
#   self-repair-scale-nonlinearity=1e-5      [It is a constant scaling the self-repair vector computed in derived classes of NonlinearComponent]
#                                       i.e.,  SigmoidComponent, TanhComponent and RectifiedLinearComponent ]
#   ng-per-element-scale-options=''   [Additional options used for the diagonal matrices in the GRU ]
#   ng-affine-options=''              [Additional options used for the full matrices in the GRU, can be used to do things like set biases to initialize to 1]

class XconfigNormPgruLayer(XconfigLayerBase):
    """Xconfig layer that expands a 'norm-pgru-layer' line into nnet3 config lines.

    Same sub-graph as the projected GRU, except that the projected output is
    passed through a BatchNormComponent before being exposed as
    '<name>.sn_t', and the recurrent part of the projection is passed through
    a NormalizeComponent before backprop truncation ('<name>.s_t').
    Optional DropoutComponents are applied to the update and reset gates when
    'dropout-proportion' is not -1.0.

    The main output is '<name>.sn_t'; '<name>.h_t' (the tanh node, of
    dimension cell-dim) is exposed as an auxiliary output.
    """

    def __init__(self, first_token, key_to_value, prev_names = None):
        assert first_token == "norm-pgru-layer"
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        """Set self.config to the default value of every supported option."""
        self.config = {'input' : '[-1]',
                        'cell-dim' : -1, # this is a compulsory argument
                        'recurrent-projection-dim' : -1,  # defaults to cell-dim / 4
                        'non-recurrent-projection-dim' : -1, # defaults to
                                                             # recurrent-projection-dim
                        'clipping-threshold' : 30.0,
                        'delay' : -1,
                        'ng-per-element-scale-options' : ' max-change=0.75 ',
                        'ng-affine-options' : ' max-change=0.75 ',
                        'self-repair-scale-nonlinearity' : 0.00001,
                        'zeroing-interval' : 20,
                        'zeroing-threshold' : 15.0,
                        'dropout-proportion' : -1.0, # If -1.0, no dropout components will be added
                        'dropout-per-frame' : True # If False, regular dropout, not per frame.
                       }

    def set_derived_configs(self):
        """Fill in projection dims that were left at their non-positive default."""
        if self.config['recurrent-projection-dim'] <= 0:
            # Floor division keeps the dimension an integer under Python 3 as
            # well; with '/' the generated config would contain e.g. 'dim=256.0'.
            self.config['recurrent-projection-dim'] = self.config['cell-dim'] // 4

        if self.config['non-recurrent-projection-dim'] <= 0:
            self.config['non-recurrent-projection-dim'] = \
               self.config['recurrent-projection-dim']

    def check_configs(self):
        """Validate the option values; raise RuntimeError on the first bad one."""
        for key in ['cell-dim', 'recurrent-projection-dim',
                    'non-recurrent-projection-dim']:
            if self.config[key] <= 0:
                raise RuntimeError("{0} has invalid value {1}.".format(
                    key, self.config[key]))

        if self.config['delay'] == 0:
            raise RuntimeError("delay cannot be zero")

        if (self.config['recurrent-projection-dim'] +
            self.config['non-recurrent-projection-dim'] >
            self.config['cell-dim']):
            raise RuntimeError("recurrent+non-recurrent projection dim exceeds "
                                "cell dim.")
        for key in ['self-repair-scale-nonlinearity']:
            if self.config[key] < 0.0 or self.config[key] > 1.0:
                # Report the offending key (the old message printed the layer
                # type in its place and never named the option).
                raise RuntimeError("{0} has invalid value {1}."
                                   .format(key, self.config[key]))
        # -1.0 is the sentinel for "no dropout"; anything else must lie in [0, 1].
        if ((self.config['dropout-proportion'] > 1.0 or
             self.config['dropout-proportion'] < 0.0) and
             self.config['dropout-proportion'] != -1.0 ):
             raise RuntimeError("dropout-proportion has invalid value {0}."
                                .format(self.config['dropout-proportion']))

    def auxiliary_outputs(self):
        """Return the names of the nodes usable as auxiliary outputs."""
        return ['h_t']

    def output_name(self, auxiliary_output = None):
        """Return the node name of the main output or of an auxiliary output.

        Raises Exception if auxiliary_output is not in auxiliary_outputs().
        """
        node_name = 'sn_t'
        if auxiliary_output is not None:
            if auxiliary_output in self.auxiliary_outputs():
                node_name = auxiliary_output
            else:
                raise Exception("In {0} of type {1}, unknown auxiliary output name {2}".format(
                    self.name, self.layer_type, auxiliary_output))

        return '{0}.{1}'.format(self.name, node_name)

    def output_dim(self, auxiliary_output = None):
        """Return the dimension of the main output or of an auxiliary output.

        Raises Exception if auxiliary_output is not in auxiliary_outputs().
        """
        if auxiliary_output is not None:
            if auxiliary_output not in self.auxiliary_outputs():
                raise Exception("In {0} of type {1}, unknown auxiliary output name {2}".format(
                    self.name, self.layer_type, auxiliary_output))
            if auxiliary_output == 'h_t':
                # h_t is produced by the TanhComponent of dimension cell-dim.
                return self.config['cell-dim']
            # add code for other auxiliary_outputs here when we decide to expose them

        return self.config['recurrent-projection-dim'] + self.config['non-recurrent-projection-dim']

    def get_full_config(self):
        """Return a list of (config_name, line) pairs for 'ref' and 'final'."""
        ans = []
        config_lines = self.generate_pgru_config()

        for line in config_lines:
            for config_name in ['ref', 'final']:
                # we do not support user specified matrices in GRU initialization
                # so 'ref' and 'final' configs are the same.
                ans.append((config_name, line))
        return ans

    # convenience function to generate the Norm-PGRU config
    def generate_pgru_config(self):
        """Return the list of nnet3 config lines that make up this layer."""

        # assign some variables to reduce verbosity
        name = self.name
        # in the below code we will just call descriptor_strings as descriptors for conciseness
        input_dim = self.descriptors['input']['dim']
        input_descriptor = self.descriptors['input']['final-string']
        cell_dim = self.config['cell-dim']
        rec_proj_dim = self.config['recurrent-projection-dim']
        nonrec_proj_dim = self.config['non-recurrent-projection-dim']
        delay = self.config['delay']
        repair_nonlin = self.config['self-repair-scale-nonlinearity']
        repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else ''
        bptrunc_str = ("clipping-threshold={0}"
                      " zeroing-threshold={1}"
                      " zeroing-interval={2}"
                      " recurrence-interval={3}"
                      "".format(self.config['clipping-threshold'],
                                self.config['zeroing-threshold'],
                                self.config['zeroing-interval'],
                                abs(delay)))
        affine_str = self.config['ng-affine-options']
        # NOTE: 'ng-per-element-scale-options' is accepted as an option but no
        # component generated below uses it.
        dropout_proportion = self.config['dropout-proportion']
        dropout_per_frame = 'true' if self.config['dropout-per-frame'] else 'false'
        use_dropout = dropout_proportion != -1.0

        # formulation like:
        # z_t = \sigmoid ( x_t * U^z + s_{t-1} * W^z ) // update gate
        # r_t = \sigmoid ( x_t * U^r + s_{t-1} * W^r ) // reset gate
        # \tilde{h}_t = \tanh ( x_t * U^h + ( s_{t-1} \dot r_t ) * W^h )
        # h_t = ( 1 - z_t ) \dot \tilde{h}_t + z_t \dot h_{t-1}
        # y_t_tmp = h_t * W^y
        # s_t = renorm ( y_t_tmp (0:rec_proj_dim-1) )
        # y_t = batchnorm ( y_t_tmp )
        #
        # Mapping onto the node names used below: node 'h_t' holds \tilde{h}_t,
        # node 'y_t' holds the h_t of the formulas, 'sn_nobatchnorm_t' holds
        # y_t_tmp, 'sn_t' holds the batch-normalized output and 's_t' is the
        # renormalized recurrent part after backprop truncation.

        configs = []
        configs.append("# Update gate control : W_z* matrics")
        configs.append("component name={0}.W_z.xs_z type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str))

        configs.append("# Reset gate control : W_r* matrics")
        configs.append("component name={0}.W_z.xs_r type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, rec_proj_dim, affine_str))

        configs.append("# h related matrix : W_h* matrics")
        configs.append("component name={0}.W_h.UW type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim , affine_str))

        if use_dropout:
            configs.append("component name={0}.dropout_z type=DropoutComponent dim={1} "
                           "dropout-proportion={2} dropout-per-frame={3}"
                           .format(name, cell_dim, dropout_proportion, dropout_per_frame))
            configs.append("component name={0}.dropout_r type=DropoutComponent dim={1} "
                           "dropout-proportion={2} dropout-per-frame={3}"
                           .format(name, rec_proj_dim, dropout_proportion, dropout_per_frame))

        configs.append("# Defining the non-linearities")
        configs.append("component name={0}.z type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str))
        configs.append("component name={0}.r type=SigmoidComponent dim={1} {2}".format(name, rec_proj_dim, repair_nonlin_str))
        configs.append("component name={0}.h type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str))

        configs.append("# Defining the components for other cell computations")
        configs.append("component name={0}.h1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * rec_proj_dim, rec_proj_dim))
        configs.append("component name={0}.y1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim))
        configs.append("component name={0}.y2 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim))
        configs.append("component name={0}.y type=NoOpComponent dim={1}".format(name, cell_dim))

        # s_t feeds the gates; y_t feeds the cell-state interpolation.
        recurrent_connection = '{0}.s_t'.format(name)
        recurrent_connection_y = '{0}.y_t'.format(name)

        configs.append("# z_t")
        configs.append("component-node name={0}.z_t_pre component={0}.W_z.xs_z input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay))
        if use_dropout:
            configs.append("component-node name={0}.z_predrop_t component={0}.z input={0}.z_t_pre".format(name))
            configs.append("component-node name={0}.z_t component={0}.dropout_z input={0}.z_predrop_t".format(name))
        else:
            configs.append("component-node name={0}.z_t component={0}.z input={0}.z_t_pre".format(name))

        configs.append("# r_t")
        configs.append("component-node name={0}.r_t_pre component={0}.W_z.xs_r input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay))
        if use_dropout:
            configs.append("component-node name={0}.r_predrop_t component={0}.r input={0}.r_t_pre".format(name))
            configs.append("component-node name={0}.r_t component={0}.dropout_r input={0}.r_predrop_t".format(name))
        else:
            configs.append("component-node name={0}.r_t component={0}.r input={0}.r_t_pre".format(name))

        configs.append("# h_t")
        configs.append("component-node name={0}.h1_t component={0}.h1 input=Append({0}.r_t, IfDefined(Offset({1}, {2})))".format(name, recurrent_connection, delay))
        configs.append("component-node name={0}.h_t_pre component={0}.W_h.UW input=Append({1}, {0}.h1_t)".format(name, input_descriptor))
        configs.append("component-node name={0}.h_t component={0}.h input={0}.h_t_pre".format(name))

        # y1_t = h_t * (1 - z_t),  y2_t = y_{t+delay} * z_t,  y_t = y1_t + y2_t
        configs.append("component-node name={0}.y1_t component={0}.y1 input=Append({0}.h_t, Sum(Scale(-1.0,{0}.z_t), Const(1.0, {1})))".format(name, cell_dim))
        configs.append("component-node name={0}.y2_t component={0}.y2 input=Append(IfDefined(Offset({1}, {2})), {0}.z_t)".format(name, recurrent_connection_y, delay))
        configs.append("component-node name={0}.y_t component={0}.y input=Sum({0}.y1_t, {0}.y2_t)".format(name))

        configs.append("# s_t recurrent")
        configs.append("component name={0}.W_s.ys type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str))
        configs.append("component name={0}.s_r type=BackpropTruncationComponent dim={1} {2}".format(name, rec_proj_dim, bptrunc_str))

        configs.append("component name={0}.batchnorm type=BatchNormComponent dim={1} target-rms=1.0".format(name, rec_proj_dim + nonrec_proj_dim))
        configs.append("component name={0}.renorm type=NormalizeComponent dim={1} target-rms=1.0".format(name, rec_proj_dim))

        configs.append("# s_t and n_t : sn_t will be the output")
        configs.append("component-node name={0}.sn_nobatchnorm_t component={0}.W_s.ys input={0}.y_t".format(name))
        configs.append("dim-range-node name={0}.s_t_preclip input-node={0}.sn_nobatchnorm_t dim-offset=0 dim={1}".format(name, rec_proj_dim))
        configs.append("component-node name={0}.sn_t component={0}.batchnorm input={0}.sn_nobatchnorm_t".format(name))

        configs.append("component-node name={0}.s_renorm_t component={0}.renorm input={0}.s_t_preclip".format(name))
        configs.append("component-node name={0}.s_t component={0}.s_r input={0}.s_renorm_t".format(name))

        return configs


# This class is for lines like
#   'opgru-layer name=opgru1 input=[-1] delay=-3'
# It generates an OPGRU sub-graph with output projections.
# 'cell-dim=xxx' is compulsory (check_configs rejects values <= 0). The output
# dimension of the layer is recurrent-projection-dim + non-recurrent-projection-dim.
# See other configuration values below.
#
# Parameters of the class, and their defaults:
#   input='[-1]'             [Descriptor giving the input of the layer.]
#   cell-dim=-1            [Dimension of the cell]
#   recurrent-projection-dim [Dimension of the projection used in recurrent connections, e.g. cell-dim/4]
#   non-recurrent-projection-dim   [Dimension of the projection in non-recurrent connections,
#                                   in addition to recurrent-projection-dim, e.g. cell-dim/4]
#   delay=-1                 [Delay in the recurrent connections of the GRU ]
#   clipping-threshold=30    [nnet3 GRU use a gradient clipping component at the recurrent connections.
#                             This is the threshold used to decide if clipping has to be activated ]
#   zeroing-interval=20      [interval at which we (possibly) zero out the recurrent derivatives.]
#   zeroing-threshold=15     [We only zero out the derivs every zeroing-interval, if derivs exceed this value.]
#   self-repair-scale-nonlinearity=1e-5      [It is a constant scaling the self-repair vector computed in derived classes of NonlinearComponent]
#                                       i.e.,  SigmoidComponent, TanhComponent and RectifiedLinearComponent ]
#   ng-per-element-scale-options=''   [Additional options used for the diagonal matrices in the GRU ]
#   ng-affine-options=''              [Additional options used for the full matrices in the GRU, can be used to do things like set biases to initialize to 1]
class XconfigOpgruLayer(XconfigLayerBase):
    def __init__(self, first_token, key_to_value, prev_names = None):
        # This class only handles 'opgru-layer' lines; everything else about
        # construction is delegated to XconfigLayerBase.
        assert first_token == "opgru-layer"
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        """Set self.config to the default value of every supported option."""
        defaults = {}
        defaults['input'] = '[-1]'
        # cell-dim is a compulsory argument
        defaults['cell-dim'] = -1
        # defaults to cell-dim / 4
        defaults['recurrent-projection-dim'] = -1
        # defaults to recurrent-projection-dim
        defaults['non-recurrent-projection-dim'] = -1
        defaults['clipping-threshold'] = 30.0
        defaults['delay'] = -1
        defaults['ng-per-element-scale-options'] = ' max-change=0.75 '
        defaults['ng-affine-options'] = ' max-change=0.75 '
        defaults['self-repair-scale-nonlinearity'] = 0.00001
        defaults['zeroing-interval'] = 20
        defaults['zeroing-threshold'] = 15.0
        self.config = defaults

    def set_derived_configs(self):
        """Fill in projection dims that were left at their non-positive default."""
        if self.config['recurrent-projection-dim'] <= 0:
            # Floor division keeps the dimension an integer under Python 3 as
            # well; with '/' the generated config would contain e.g. 'dim=256.0'.
            self.config['recurrent-projection-dim'] = self.config['cell-dim'] // 4

        if self.config['non-recurrent-projection-dim'] <= 0:
            self.config['non-recurrent-projection-dim'] = \
               self.config['recurrent-projection-dim']

    def check_configs(self):
        """Validate the option values; raise RuntimeError on the first bad one."""
        for key in ['cell-dim', 'recurrent-projection-dim',
                    'non-recurrent-projection-dim']:
            if self.config[key] <= 0:
                raise RuntimeError("{0} has invalid value {1}.".format(
                    key, self.config[key]))

        if self.config['delay'] == 0:
            raise RuntimeError("delay cannot be zero")

        if (self.config['recurrent-projection-dim'] +
            self.config['non-recurrent-projection-dim'] >
            self.config['cell-dim']):
            raise RuntimeError("recurrent+non-recurrent projection dim exceeds "
                                "cell dim.")
        for key in ['self-repair-scale-nonlinearity']:
            if self.config[key] < 0.0 or self.config[key] > 1.0:
                # Report the offending key (the old message printed the layer
                # type in its place and never named the option).
                raise RuntimeError("{0} has invalid value {1}."
                                   .format(key, self.config[key]))

    def auxiliary_outputs(self):
        # Node names (without the layer-name prefix) that output_name() and
        # output_dim() accept as their auxiliary_output argument.
        return ['h_t']

    def output_name(self, auxiliary_output = None):
        """Return the node name of the main output or of an auxiliary output.

        With no argument this is '<name>.sn_t'; otherwise it is
        '<name>.<auxiliary_output>'.

        Raises Exception if auxiliary_output is not in auxiliary_outputs().
        """
        node_name = 'sn_t'
        if auxiliary_output is not None:
            if auxiliary_output in self.auxiliary_outputs():
                node_name = auxiliary_output
            else:
                # The old message reused placeholder {1} and so never printed
                # the offending output name.
                raise Exception("In {0} of type {1}, unknown auxiliary output name {2}".format(
                    self.name, self.layer_type, auxiliary_output))

        return '{0}.{1}'.format(self.name, node_name)

    def output_dim(self, auxiliary_output = None):
        """Return the dimension of the main output or of an auxiliary output.

        Raises Exception if auxiliary_output is not in auxiliary_outputs().
        """
        if auxiliary_output is not None:
            if auxiliary_output not in self.auxiliary_outputs():
                raise Exception("In {0} of type {1}, unknown auxiliary output name {2}".format(
                    self.name, self.layer_type, auxiliary_output))
            # The old code tested an undefined local ('node_name') against
            # 'c_t', so any valid auxiliary request raised NameError.
            if auxiliary_output == 'h_t':
                # assumes the h_t node has dimension cell-dim (the '<name>.h'
                # TanhComponent is declared with dim=cell-dim) -- TODO confirm
                return self.config['cell-dim']
            # add code for other auxiliary_outputs here when we decide to expose them

        return self.config['recurrent-projection-dim'] + self.config['non-recurrent-projection-dim']

    def get_full_config(self):
        """Return a list of (config_name, line) pairs for this layer.

        Every generated line is emitted twice, once for 'ref' and once for
        'final', because user-specified matrices are not supported here and
        the two configs are therefore identical.
        """
        targets = ['ref', 'final']
        generated = self.generate_pgru_config()
        return [(target, cfg_line)
                for cfg_line in generated
                for target in targets]

    # convenience function to generate the OPGRU config
    def generate_pgru_config(self):
        """Build the nnet3 config lines describing this OPGRU layer.

        Reads the layer name, the 'input' descriptor (its 'dim' and
        'final-string') and the layer's config values, and returns a list of
        strings: 'component', 'component-node' and 'dim-range-node' lines,
        interleaved with '#' comment lines.

        Formulation implemented by the generated graph:
          z_t = \\sigmoid ( x_t * U^z + s_{t-1} * W^z )   // update gate
          o_t = \\sigmoid ( x_t * U^o + s_{t-1} * W^o )   // output gate
          \\tilde{h}_t = \\tanh ( x_t * U^h + h_{t-1} \\dot W^h )  // W^h is a learnable vector
          h_t = ( 1 - z_t ) \\dot \\tilde{h}_t + z_t \\dot h_{t-1}
          y_t = ( h_t \\dot o_t ) * W^y
          s_t = y_t(0:rec_proj_dim-1)
        """
        cfg = self.config
        layer = self.name
        # descriptor strings are simply called descriptors below
        in_dim = self.descriptors['input']['dim']
        in_desc = self.descriptors['input']['final-string']
        cell_dim = cfg['cell-dim']
        rec_dim = cfg['recurrent-projection-dim']
        nonrec_dim = cfg['non-recurrent-projection-dim']
        delay = cfg['delay']

        # Self-repair option shared by the sigmoid/tanh components.
        repair_scale = cfg['self-repair-scale-nonlinearity']
        if repair_scale is None:
            repair_opt = ''
        else:
            repair_opt = "self-repair-scale={0:.10f}".format(repair_scale)

        # Options of the BackpropTruncationComponent on the recurrence.
        truncation_opts = ("clipping-threshold={0}"
                           " zeroing-threshold={1}"
                           " zeroing-interval={2}"
                           " recurrence-interval={3}").format(
                               cfg['clipping-threshold'],
                               cfg['zeroing-threshold'],
                               cfg['zeroing-interval'],
                               abs(delay))
        affine_opts = cfg['ng-affine-options']
        scale_opts = cfg['ng-per-element-scale-options']

        # Natural gradient per element scale parameters: supply an
        # initialization only when the user specified neither of the two.
        # TODO: decide if we want to keep exposing these options
        if not (re.search('param-mean', scale_opts)
                or re.search('param-stddev', scale_opts)):
            scale_opts += " param-mean=0.0 param-stddev=1.0 "

        # Both gates see the input appended with the delayed recurrent part.
        gate_in_dim = in_dim + rec_dim
        # Every ElementwiseProductComponent takes two appended cell-sized inputs.
        product_in_dim = 2 * cell_dim
        rec_node = '{0}.s_t'.format(layer)
        rec_node_y = '{0}.y_t'.format(layer)

        lines = []

        # ---- component definitions ----
        lines.extend([
            "# Update gate control : W_z* matrics",
            "component name={0}.W_z.xs_z type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(layer, gate_in_dim, cell_dim, affine_opts),
            "# Output gate control : W_r* matrics",
            "component name={0}.W_z.xs_o type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(layer, gate_in_dim, cell_dim, affine_opts),
            "# h related matrix : W_h* matrics",
            "component name={0}.W_h.UW type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(layer, in_dim, cell_dim, affine_opts),
            "component name={0}.W_h.UW_elementwise type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(layer, cell_dim, scale_opts),
        ])
        lines.extend([
            "# Defining the non-linearities",
            "component name={0}.z type=SigmoidComponent dim={1} {2}".format(layer, cell_dim, repair_opt),
            "component name={0}.o type=SigmoidComponent dim={1} {2}".format(layer, cell_dim, repair_opt),
            "component name={0}.h type=TanhComponent dim={1} {2}".format(layer, cell_dim, repair_opt),
        ])
        lines.extend([
            "# Defining the components for other cell computations",
            "component name={0}.o1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(layer, product_in_dim, cell_dim),
            "component name={0}.y1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(layer, product_in_dim, cell_dim),
            "component name={0}.y2 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(layer, product_in_dim, cell_dim),
            "component name={0}.y type=NoOpComponent dim={1}".format(layer, cell_dim),
        ])

        # ---- gate nodes ----
        lines.extend([
            "# z_t",
            "component-node name={0}.z_t_pre component={0}.W_z.xs_z input=Append({1}, IfDefined(Offset({2}, {3})))".format(layer, in_desc, rec_node, delay),
            "component-node name={0}.z_t component={0}.z input={0}.z_t_pre".format(layer),
            "# o_t",
            "component-node name={0}.o_t_pre component={0}.W_z.xs_o input=Append({1}, IfDefined(Offset({2}, {3})))".format(layer, in_desc, rec_node, delay),
            "component-node name={0}.o_t component={0}.o input={0}.o_t_pre".format(layer),
        ])

        # ---- candidate state and cell output nodes ----
        lines.extend([
            "# h_t",
            "component-node name={0}.h_t_pre component={0}.W_h.UW input={1}".format(layer, in_desc),
            "component-node name={0}.h_t_pre2 component={0}.W_h.UW_elementwise input=IfDefined(Offset({1}, {2}))".format(layer, rec_node_y, delay),
            "component-node name={0}.h_t component={0}.h input=Sum({0}.h_t_pre, {0}.h_t_pre2)".format(layer),
            "component-node name={0}.y1_t component={0}.y1 input=Append({0}.h_t, Sum(Scale(-1.0,{0}.z_t), Const(1.0, {1})))".format(layer, cell_dim),
            "component-node name={0}.y2_t component={0}.y2 input=Append(IfDefined(Offset({1}, {2})), {0}.z_t)".format(layer, rec_node_y, delay),
            "component-node name={0}.y_t component={0}.y input=Sum({0}.y1_t, {0}.y2_t)".format(layer),
            "component-node name={0}.y_o_t component={0}.o1 input=Append({0}.o_t, {0}.y_t)".format(layer),
        ])

        # ---- output projection and the truncated recurrence ----
        lines.extend([
            "# s_t recurrent",
            "component name={0}.W_s.ys type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(layer, cell_dim, rec_dim + nonrec_dim, affine_opts),
            "component name={0}.s_r type=BackpropTruncationComponent dim={1} {2}".format(layer, rec_dim, truncation_opts),
            "# s_t and n_t : sn_t will be the output",
            "component-node name={0}.sn_t component={0}.W_s.ys input={0}.y_o_t".format(layer),
            "dim-range-node name={0}.s_t_preclip input-node={0}.sn_t dim-offset=0 dim={1}".format(layer, rec_dim),
            "component-node name={0}.s_t component={0}.s_r input={0}.s_t_preclip".format(layer),
        ])

        return lines

# This class is for lines like
#   'norm-opgru-layer name=norm-opgru1 input=[-1] delay=-3'
# It generates a norm-OPGRU sub-graph with output projections.

# Different from the vanilla OPGRU, the NormOPGRU uses batchnorm in the forward direction
# and renorm in the recurrence.

# The cell dimension must be specified via 'cell-dim=xxx' (it is a compulsory argument); the
# output dimension of the layer is recurrent-projection-dim + non-recurrent-projection-dim.
# See other configuration values below.
#
# Parameters of the class, and their defaults:
#   input='[-1]'             [Descriptor giving the input of the layer.]
#   cell-dim=-1            [Dimension of the cell]
#   recurrent-projection-dim [Dimension of the projection used in recurrent connections, e.g. cell-dim/4]
#   non-recurrent-projection-dim   [Dimension of the projection in non-recurrent connections,
#                                   in addition to recurrent-projection-dim, e.g. cell-dim/4]
#   delay=-1                 [Delay in the recurrent connections of the GRU ]
#   clipping-threshold=30    [nnet3 GRU use a gradient clipping component at the recurrent connections.
#                             This is the threshold used to decide if clipping has to be activated ]
#   zeroing-interval=20      [interval at which we (possibly) zero out the recurrent derivatives.]
#   zeroing-threshold=15     [We only zero out the derivs every zeroing-interval, if derivs exceed this value.]
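#   self-repair-scale-nonlinearity=1e-5      [It is a constant scaling the self-repair vector computed in derived classes of NonlinearComponent,
#                                       i.e.,  SigmoidComponent, TanhComponent and RectifiedLinearComponent ]
#   ng-per-element-scale-options=' max-change=0.75 '   [Additional options used for the diagonal matrices in the GRU ]
#   ng-affine-options=' max-change=0.75 '              [Additional options used for the full matrices in the GRU, can be used to do things like set biases to initialize to 1]
#   dropout-proportion=-1.0  [If -1.0, no dropout components will be added]
#   dropout-per-frame=true   [If false, regular dropout, not per frame]
#   l2-regularize=0.0        [If nonzero, passed as 'l2-regularize=xxx' to the affine components]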
class XconfigNormOpgruLayer(XconfigLayerBase):
    """Xconfig layer for lines starting with 'norm-opgru-layer'.

    Generates the nnet3 config lines of an OPGRU sub-graph with output
    projections, in which the layer output (sn_t) goes through a
    BatchNormComponent and the recurrent part (s_t) goes through a
    NormalizeComponent followed by a BackpropTruncationComponent.
    """

    def __init__(self, first_token, key_to_value, prev_names = None):
        assert first_token == "norm-opgru-layer"
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        """Install the default value of every option this layer accepts."""
        self.config = {'input' : '[-1]',
                        'cell-dim' : -1, # this is a compulsory argument
                        'recurrent-projection-dim' : -1,  # defaults to cell-dim / 4
                        'non-recurrent-projection-dim' : -1, # defaults to
                                                             # recurrent-projection-dim
                        'clipping-threshold' : 30.0,
                        'delay' : -1,
                        'ng-per-element-scale-options' : ' max-change=0.75 ',
                        'ng-affine-options' : ' max-change=0.75 ',
                        'self-repair-scale-nonlinearity' : 0.00001,
                        'zeroing-interval' : 20,
                        'zeroing-threshold' : 15.0,
                        'dropout-proportion' : -1.0, # If -1.0, no dropout components will be added
                        'l2-regularize': 0.0,
                        'dropout-per-frame' : True  # If false, regular dropout, not per frame.
                       }

    def set_derived_configs(self):
        """Fill in the projection dimensions that were left unset (<= 0)."""
        if self.config['recurrent-projection-dim'] <= 0:
            # Floor division keeps the dimension an integer under Python 3;
            # true division would produce a float (e.g. 256.0) that ends up
            # in the generated 'dim=' / 'input-dim=' / 'output-dim=' fields.
            self.config['recurrent-projection-dim'] = self.config['cell-dim'] // 4

        if self.config['non-recurrent-projection-dim'] <= 0:
            self.config['non-recurrent-projection-dim'] = \
               self.config['recurrent-projection-dim']

    def check_configs(self):
        """Validate the config values.

        Raises:
            RuntimeError: if a dimension is not positive, the delay is zero,
                the two projection dims together exceed cell-dim, or
                self-repair-scale-nonlinearity / dropout-proportion are out
                of range.
        """
        for key in ['cell-dim', 'recurrent-projection-dim',
                    'non-recurrent-projection-dim']:
            if self.config[key] <= 0:
                raise RuntimeError("{0} has invalid value {1}.".format(
                    key, self.config[key]))

        if self.config['delay'] == 0:
            raise RuntimeError("delay cannot be zero")

        if (self.config['recurrent-projection-dim'] +
            self.config['non-recurrent-projection-dim'] >
            self.config['cell-dim']):
            raise RuntimeError("recurrent+non-recurrent projection dim exceeds "
                                "cell dim.")
        for key in ['self-repair-scale-nonlinearity']:
            if self.config[key] < 0.0 or self.config[key] > 1.0:
                # Report the offending option name and its value.
                raise RuntimeError("{0} has invalid value {1}."
                                   .format(key, self.config[key]))
        # -1.0 is the sentinel for "no dropout"; otherwise require [0, 1].
        if ((self.config['dropout-proportion'] > 1.0 or
             self.config['dropout-proportion'] < 0.0) and
             self.config['dropout-proportion'] != -1.0 ):
             raise RuntimeError("dropout-proportion has invalid value {0}."
                                .format(self.config['dropout-proportion']))

    def auxiliary_outputs(self):
        """Return the names of the auxiliary outputs this layer exposes."""
        return ['h_t']

    def output_name(self, auxiliary_output = None):
        """Return the node name of the main output or of an auxiliary output.

        Raises:
            Exception: if auxiliary_output is given but not one of
                auxiliary_outputs().
        """
        node_name = 'sn_t'
        if auxiliary_output is not None:
            if auxiliary_output in self.auxiliary_outputs():
                node_name = auxiliary_output
            else:
                raise Exception("In {0} of type {1}, unknown auxiliary output name {2}".format(self.name, self.layer_type, auxiliary_output))

        return '{0}.{1}'.format(self.name, node_name)

    def output_dim(self, auxiliary_output = None):
        """Return the dimension of the main output or of an auxiliary output.

        Raises:
            Exception: if auxiliary_output is given but not one of
                auxiliary_outputs().
        """
        if auxiliary_output is not None:
            if auxiliary_output in self.auxiliary_outputs():
                # The h_t node is produced by a TanhComponent of dim cell-dim.
                if auxiliary_output == 'h_t':
                    return self.config['cell-dim']
                # add code for other auxiliary_outputs here when we decide to expose them
            else:
                raise Exception("In {0} of type {1}, unknown auxiliary output name {2}".format(self.name, self.layer_type, auxiliary_output))

        return self.config['recurrent-projection-dim'] + self.config['non-recurrent-projection-dim']

    def get_full_config(self):
        """Return (config_name, line) pairs for the 'ref' and 'final' configs."""
        # we do not support user specified matrices in LSTM initialization
        # so 'ref' and 'final' configs are the same.
        return [(config_name, line)
                for line in self.generate_pgru_config()
                for config_name in ['ref', 'final']]

    # convenience function to generate the Norm-OPGRU config
    def generate_pgru_config(self):
        """Return the list of nnet3 config lines describing this layer."""

        # assign some variables to reduce verbosity
        name = self.name
        # in the below code we will just call descriptor_strings as descriptors for conciseness
        input_dim = self.descriptors['input']['dim']
        input_descriptor = self.descriptors['input']['final-string']
        cell_dim = self.config['cell-dim']
        rec_proj_dim = self.config['recurrent-projection-dim']
        nonrec_proj_dim = self.config['non-recurrent-projection-dim']
        delay = self.config['delay']
        repair_nonlin = self.config['self-repair-scale-nonlinearity']
        repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else ''
        bptrunc_str = ("clipping-threshold={0}"
                      " zeroing-threshold={1}"
                      " zeroing-interval={2}"
                      " recurrence-interval={3}"
                      "".format(self.config['clipping-threshold'],
                                self.config['zeroing-threshold'],
                                self.config['zeroing-interval'],
                                abs(delay)))
        affine_str = self.config['ng-affine-options']
        pes_str = self.config['ng-per-element-scale-options']
        dropout_proportion = self.config['dropout-proportion']
        dropout_per_frame = 'true' if self.config['dropout-per-frame'] else 'false'

        # Only emit the l2-regularize option when it is nonzero.
        l2_regularize = self.config['l2-regularize']
        l2_regularize_option = ('l2-regularize={0} '.format(l2_regularize)
                                if l2_regularize != 0.0 else '')

        # Natural gradient per element scale parameters
        # TODO: decide if we want to keep exposing these options
        if re.search('param-mean', pes_str) is None and \
           re.search('param-stddev', pes_str) is None:
            pes_str += " param-mean=0.0 param-stddev=1.0 "

        # formulation for OPGRU like:
        # z_t = \sigmoid ( x_t * U^z + s_{t-1} * W^z ) // update gate
        # o_t = \sigmoid ( x_t * U^o + s_{t-1} * W^o ) // output gate
        # \tilde{h}_t = \tanh ( x_t * U^h + h_{t-1} \dot W^h ) // W^h is learnable vector
        # h_t = ( 1 - z_t ) \dot \tilde{h}_t + z_t \dot h_{t-1}
        # y_t_tmp = ( h_t \dot o_t) * W^y
        # s_t = renorm ( y_t_tmp(0:rec_proj_dim-1) )
        # y_t = batchnorm ( y_t_tmp )

        configs = []
        configs.append("# Update gate control : W_z* matrics")
        configs.append("component name={0}.W_z.xs_z type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3} {4}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str, l2_regularize_option))

        configs.append("# Output gate control : W_r* matrics")
        configs.append("component name={0}.W_z.xs_o type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3} {4}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str, l2_regularize_option))

        configs.append("# h related matrix : W_h* matrics")
        configs.append("component name={0}.W_h.UW type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3} {4}".format(name, input_dim , cell_dim , affine_str, l2_regularize_option))
        configs.append("component name={0}.W_h.UW_elementwise type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim , pes_str))

        configs.append("# Defining the non-linearities")
        configs.append("component name={0}.z type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str))
        configs.append("component name={0}.o type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str))
        configs.append("component name={0}.h type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str))

        configs.append("# Defining the components for other cell computations")
        configs.append("component name={0}.o1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim))
        configs.append("component name={0}.y1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim))
        configs.append("component name={0}.y2 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim))
        configs.append("component name={0}.y type=NoOpComponent dim={1}".format(name, cell_dim))

        # A single dropout component is defined and used by both gate nodes below.
        if dropout_proportion != -1.0:
            configs.append("component name={0}.dropout type=DropoutComponent dim={1} "
                           "dropout-proportion={2} dropout-per-frame={3}"
                           .format(name, cell_dim, dropout_proportion, dropout_per_frame))

        recurrent_connection = '{0}.s_t'.format(name)
        recurrent_connection_y = '{0}.y_t'.format(name)

        configs.append("# z_t")
        configs.append("component-node name={0}.z_t_pre component={0}.W_z.xs_z input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay))
        if dropout_proportion != -1.0:
            configs.append("component-node name={0}.z_predrop_t component={0}.z input={0}.z_t_pre".format(name))
            configs.append("component-node name={0}.z_t component={0}.dropout input={0}.z_predrop_t".format(name))
        else:
            configs.append("component-node name={0}.z_t component={0}.z input={0}.z_t_pre".format(name))

        configs.append("# o_t")
        configs.append("component-node name={0}.o_t_pre component={0}.W_z.xs_o input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay))
        if dropout_proportion != -1.0:
            configs.append("component-node name={0}.o_predrop_t component={0}.o input={0}.o_t_pre".format(name))
            configs.append("component-node name={0}.o_t component={0}.dropout input={0}.o_predrop_t".format(name))
        else:
            configs.append("component-node name={0}.o_t component={0}.o input={0}.o_t_pre".format(name))

        configs.append("# h_t")
        configs.append("component-node name={0}.h_t_pre component={0}.W_h.UW input={1}".format(name, input_descriptor))
        configs.append("component-node name={0}.h_t_pre2 component={0}.W_h.UW_elementwise input=IfDefined(Offset({1}, {2}))".format(name, recurrent_connection_y, delay))
        configs.append("component-node name={0}.h_t component={0}.h input=Sum({0}.h_t_pre, {0}.h_t_pre2)".format(name))

        configs.append("# The following two lines are to implement (1 - z_t)")
        configs.append("component-node name={0}.y1_t component={0}.y1 input=Append({0}.h_t, Sum(Scale(-1.0,{0}.z_t), Const(1.0, {1})))".format(name, cell_dim))
        configs.append("component-node name={0}.y2_t component={0}.y2 input=Append(IfDefined(Offset({1}, {2})), {0}.z_t)".format(name, recurrent_connection_y, delay))
        configs.append("component-node name={0}.y_t component={0}.y input=Sum({0}.y1_t, {0}.y2_t)".format(name))
        configs.append("component-node name={0}.y_o_t component={0}.o1 input=Append({0}.o_t, {0}.y_t)".format(name))

        configs.append("# s_t recurrent")
        configs.append("component name={0}.W_s.ys type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3} {4}".format(name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str, l2_regularize_option))
        configs.append("component name={0}.s_r type=BackpropTruncationComponent dim={1} {2}".format(name, rec_proj_dim, bptrunc_str))
        configs.append("component name={0}.batchnorm type=BatchNormComponent dim={1} target-rms=1.0".format(name, rec_proj_dim + nonrec_proj_dim))
        configs.append("component name={0}.renorm type=NormalizeComponent dim={1} target-rms=1.0".format(name, rec_proj_dim))

        # The batch-normalized projection is the layer output; the recurrence
        # instead takes the first rec_proj_dim dims of the un-normalized
        # projection, renormalizes them and applies backprop truncation.
        configs.append("# s_t and n_t : sn_t will be the output")
        configs.append("component-node name={0}.sn_nobatchnorm_t component={0}.W_s.ys input={0}.y_o_t".format(name))
        configs.append("component-node name={0}.sn_t component={0}.batchnorm input={0}.sn_nobatchnorm_t".format(name))
        configs.append("dim-range-node name={0}.s_t_preclip input-node={0}.sn_nobatchnorm_t dim-offset=0 dim={1}".format(name, rec_proj_dim))
        configs.append("component-node name={0}.s_t_preclip_renorm component={0}.renorm input={0}.s_t_preclip".format(name))
        configs.append("component-node name={0}.s_t component={0}.s_r input={0}.s_t_preclip_renorm".format(name))

        return configs

# This class is for lines like
#   'fast-gru-layer name=gru1 input=[-1] delay=-3'
# It generates a GRU sub-graph without output projections.
# The output dimension of the layer may be specified via 'cell-dim=xxx', but if not specified,
# the dimension defaults to the same as the input.
# See other configuration values below.
# decay-time is deprecated under GRU or PGRU, as I found the PGRUs do not need the decay-time option to get generalized to unseen sequence length
#
# Parameters of the class, and their defaults:
#   input='[-1]'             [Descriptor giving the input of the layer.]
#   cell-dim=-1              [Dimension of the cell]
#   delay=-1                 [Delay in the recurrent connections of the GRU/LSTM ]
#   clipping-threshold=30    [similar to LSTMs ,nnet3 GRUs use a gradient clipping component at the recurrent connections.
#                             This is the threshold used to decide if clipping has to be activated ]
#   zeroing-interval=20      [interval at which we (possibly) zero out the recurrent derivatives.]
#   zeroing-threshold=15     [We only zero out the derivs every zeroing-interval, if derivs exceed this value.]
#   self-repair-scale-nonlinearity=1e-5      [It is a constant scaling the self-repair vector computed in derived classes of NonlinearComponent,
#                                       i.e.,  SigmoidComponent, TanhComponent and RectifiedLinearComponent ]
#   ng-per-element-scale-options=''     [Additional options used for the diagonal matrices in the GRU/LSTM ]
#   gru-nonlinearity-options=' max-change=0.75' [options for GruNonlinearityComponent, see below for detail]
#   ng-affine-options=''                [Additional options used for the full matrices in the GRU/LSTM, can be used to do things like set biases to initialize to 1]
class XconfigFastGruLayer(XconfigLayerBase):
    """Xconfig layer for lines starting with 'fast-gru-layer'.

    Generates the nnet3 config lines of a GRU sub-graph without output
    projections, built around a GruNonlinearityComponent.
    """

    def __init__(self, first_token, key_to_value, prev_names = None):
        assert first_token == "fast-gru-layer"
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        """Install the default value of every option this layer accepts."""
        defaults = {
            'input': '[-1]',
            'cell-dim': -1,  # this is a compulsory argument
            'clipping-threshold': 30.0,
            'delay': -1,
            'ng-per-element-scale-options': ' max-change=0.75',
            'ng-affine-options': ' max-change=0.75 ',
            'self-repair-scale-nonlinearity': 0.00001,
            'zeroing-interval': 20,
            'zeroing-threshold': 15.0,
            # To set 'self-repair-scale', 'self-repair-threshold' or
            # 'param-stddev' of the GruNonlinearityComponent (by default
            # 1.0e-05, 0.2 and 1.0 / sqrt(d) with d the cell-dim), add e.g.
            # 'self-repair-scale=xxx' to gru-nonlinearity-options.
            # See src/nnet3/nnet-combined-component.h for detail.
            'gru-nonlinearity-options': ' max-change=0.75',
        }
        self.config = defaults

    def set_derived_configs(self):
        """Fall back to the input dimension when cell-dim was not given."""
        cell_dim = self.config['cell-dim']
        if cell_dim <= 0:
            self.config['cell-dim'] = self.descriptors['input']['dim']

    def check_configs(self):
        """Validate the config values; raises RuntimeError on a bad value."""
        cell_dim = self.config['cell-dim']
        if cell_dim <= 0:
            raise RuntimeError("cell-dim has invalid value {0}.".format(cell_dim))

        if self.config['delay'] == 0:
            raise RuntimeError("delay cannot be zero")

        scale_key = 'self-repair-scale-nonlinearity'
        scale = self.config[scale_key]
        if scale < 0.0 or scale > 1.0:
            raise RuntimeError("{0} has invalid value {1}.".format(scale_key, scale))

    def output_name(self, auxiliary_output = None):
        """Return the name of the output node; auxiliary_output is ignored."""
        return '{0}.{1}'.format(self.name, 'y_t')

    def output_dim(self, auxiliary_output = None):
        """Return the output dimension (cell-dim); auxiliary_output is ignored."""
        return self.config['cell-dim']

    def get_full_config(self):
        """Return (config_name, line) pairs for the 'ref' and 'final' configs."""
        # User specified matrices are not supported at initialization, so the
        # 'ref' and 'final' configs get the very same lines.
        pairs = []
        for text in self.generate_gru_config():
            pairs.append(('ref', text))
            pairs.append(('final', text))
        return pairs

    # convenience function to generate the GRU config
    def generate_gru_config(self):
        """Return the list of nnet3 config lines describing this layer.

        Formulation:
          z_t = \\sigmoid ( U^z x_t + W^z y_{t-1} )   # update gate
          r_t = \\sigmoid ( U^r x_t + W^r y_{t-1} )   # reset gate
          h_t = \\tanh ( U^h x_t + W^h ( y_{t-1} \\dot r_t ) )
          y_t = ( 1 - z_t ) \\dot h_t  +  z_t \\dot y_{t-1}

        Naming convention: <layer-name>.W_<outputname>.<inputname>, e.g.
        Gru1.W_i.xr for a matrix providing output to gate i and operating on
        an appended vector [x,r].

        Notation: to be consistent with nnet-combined-component.cc, the
        paper's "\\tilde{h_t}" and "h_t" are called "h_t" and "c_t" here.
        """
        cfg = self.config
        layer = self.name
        # descriptor strings are simply called descriptors below
        in_dim = self.descriptors['input']['dim']
        in_desc = self.descriptors['input']['final-string']
        cell_dim = cfg['cell-dim']
        delay = cfg['delay']

        # Options of the BackpropTruncationComponent on the recurrence.
        truncation_opts = ("clipping-threshold={0}"
                           " zeroing-threshold={1}"
                           " zeroing-interval={2}"
                           " recurrence-interval={3}").format(
                               cfg['clipping-threshold'],
                               cfg['zeroing-threshold'],
                               cfg['zeroing-interval'],
                               abs(delay))

        # Self-repair option shared by the two sigmoid components.
        repair_scale = cfg['self-repair-scale-nonlinearity']
        if repair_scale is None:
            repair_opt = ''
        else:
            repair_opt = "self-repair-scale={0:.10f}".format(repair_scale)

        affine_opts = cfg['ng-affine-options']
        # option string for the GruNonlinearityComponent
        nonlin_opts = cfg['gru-nonlinearity-options']

        # Both gates see the input appended with the delayed recurrent part.
        gate_in_dim = in_dim + cell_dim
        rec_node = '{0}.s_t'.format(layer)

        return [
            "### Begin Gru layer '{0}'".format(layer),
            "# Update gate control : W_z* matrices",
            "component name={0}.W_z.xh type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(layer, gate_in_dim, cell_dim, affine_opts),
            "# Reset gate control : W_r* matrices",
            "component name={0}.W_r.xh type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(layer, gate_in_dim, cell_dim, affine_opts),

            "# hpart_t related matrix : W_hpart matrice",
            "component name={0}.W_hpart.x type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(layer, in_dim, cell_dim, affine_opts),

            "# Defining the non-linearities for z_t and r_t",
            "component name={0}.z type=SigmoidComponent dim={1} {2}".format(layer, cell_dim, repair_opt),
            "component name={0}.r type=SigmoidComponent dim={1} {2}".format(layer, cell_dim, repair_opt),

            "# z_t",
            "component-node name={0}.z_t_pre component={0}.W_z.xh input=Append({1}, IfDefined(Offset({2}, {3})))".format(layer, in_desc, rec_node, delay),
            "component-node name={0}.z_t component={0}.z input={0}.z_t_pre".format(layer),
            "# r_t",
            "component-node name={0}.r_t_pre component={0}.W_r.xh input=Append({1}, IfDefined(Offset({2}, {3})))".format(layer, in_desc, rec_node, delay),
            "component-node name={0}.r_t component={0}.r input={0}.r_t_pre".format(layer),

            "# hpart_t",
            "component-node name={0}.hpart_t component={0}.W_hpart.x input={1}".format(layer, in_desc),

            "# y_t",
            "# Note: the output of GruNonlinearityComponent is (h_t, c_t), we just get the second half. Otherwise, in non-projection gru layer, y_t = c_t",
            "component name={0}.gru_nonlin type=GruNonlinearityComponent cell-dim={1} {2}".format(layer, cell_dim, nonlin_opts),
            "component-node name={0}.gru_nonlin_t component={0}.gru_nonlin input=Append({0}.z_t, {0}.r_t, {0}.hpart_t, IfDefined(Offset({1}, {2})))".format(layer, rec_node, delay),
            "dim-range-node name={0}.y_t input-node={0}.gru_nonlin_t dim-offset={1} dim={1}".format(layer, cell_dim),

            "# s_t : recurrence",
            "# Note: in non-projection gru layer, the recurrent part equals the output, namely y_t.",
            "component name={0}.s_r type=BackpropTruncationComponent dim={1} {2}".format(layer, cell_dim, truncation_opts),
            "component-node name={0}.s_t component={0}.s_r input={0}.y_t".format(layer),
        ]


# This class is for lines like
#   'fast-pgru-layer name=pgru1 input=[-1] delay=-3'
# It generates a PGRU sub-graph with output projections. It can also generate
# outputs without projection, but you could use the XconfigGruLayer for this
# simple RNN.
# The output dimension of the layer may be specified via 'cell-dim=xxx', but if not specified,
# the dimension defaults to the same as the input.
# See other configuration values below.
#
# Parameters of the class, and their defaults:
#   input='[-1]'             [Descriptor giving the input of the layer.]
#   cell-dim=-1            [Dimension of the cell]
#   recurrent-projection-dim [Dimension of the projection used in recurrent connections, e.g. cell-dim/4]
#   non-recurrent-projection-dim   [Dimension of the projection in non-recurrent connections,
#                                   in addition to recurrent-projection-dim, e.g. cell-dim/4]
#   delay=-1                 [Delay in the recurrent connections of the GRU ]
#   clipping-threshold=30    [nnet3 GRU use a gradient clipping component at the recurrent connections.
#                             This is the threshold used to decide if clipping has to be activated ]
#   zeroing-interval=20      [interval at which we (possibly) zero out the recurrent derivatives.]
#   zeroing-threshold=15     [We only zero out the derivs every zeroing-interval, if derivs exceed this value.]
#   self-repair-scale-nonlinearity=1e-5      [It is a constant scaling the self-repair vector computed in derived classes of NonlinearComponent,
#                                       i.e.,  SigmoidComponent, TanhComponent and RectifiedLinearComponent ]
#   ng-per-element-scale-options=''   [Additional options used for the diagonal matrices in the GRU ]
#   gru-nonlinearity-options=' max-change=0.75' [options for GruNonlinearityComponent, see below for detail]
#   ng-affine-options=''              [Additional options used for the full matrices in the GRU, can be used to do things like set biases to initialize to 1]
class XconfigFastPgruLayer(XconfigLayerBase):
    """Xconfig layer for 'fast-pgru-layer' lines (projected GRU).

    The layer's main output is the node '<name>.y_t', whose dimension is
    recurrent-projection-dim + non-recurrent-projection-dim.  The cell
    state '<name>.c_t' (dimension cell-dim) is available as the auxiliary
    output 'c_t'.
    """

    def __init__(self, first_token, key_to_value, prev_names = None):
        assert first_token == "fast-pgru-layer"
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        """Set self.config to the default value of every supported option."""
        self.config = {'input' : '[-1]',
                        'cell-dim' : -1, # this is a compulsory argument
                        'recurrent-projection-dim' : -1,  # defaults to cell-dim / 4
                        'non-recurrent-projection-dim' : -1, # defaults to
                                                             # recurrent-projection-dim
                        'clipping-threshold' : 30.0,
                        'delay' : -1,
                        # kept in the option set for backward compatibility;
                        # generate_pgru_config() does not use it.
                        'ng-per-element-scale-options' : ' max-change=0.75 ',
                        'ng-affine-options' : ' max-change=0.75 ',
                        'self-repair-scale-nonlinearity' : 0.00001,
                        'zeroing-interval' : 20,
                        'zeroing-threshold' : 15.0,
                        # if you want to set 'self-repair-scale', ' self-repair-threshold'
                        # or 'param-stddev' for GruNonlinearityComponent
                        # For default, they are 1.0e-05, 0.2 and  1.0 / sqrt(d) where d is cell-dim.
                        # you can add something like 'self-repair-scale=xxx' to gru-nonlinearity-options.
                        # you can also see src/nnet3/nnet-combined-component.h for detail
                        'gru-nonlinearity-options' : ' max-change=0.75'
                       }

    def set_derived_configs(self):
        """Fill in the projection dims that were left unset (<= 0).

        recurrent-projection-dim falls back to cell-dim // 4, and
        non-recurrent-projection-dim falls back to recurrent-projection-dim.
        """
        if self.config['recurrent-projection-dim'] <= 0:
            # Floor division keeps the dimension an integer; plain '/' would
            # yield a float under Python 3 and end up as e.g. 'dim=256.0' in
            # the generated config lines.
            self.config['recurrent-projection-dim'] = self.config['cell-dim'] // 4

        if self.config['non-recurrent-projection-dim'] <= 0:
            self.config['non-recurrent-projection-dim'] = \
               self.config['recurrent-projection-dim']

    def check_configs(self):
        """Validate the option values.

        Raises:
            RuntimeError: if a dimension is not positive, delay is zero, the
                two projection dims together exceed cell-dim, or
                self-repair-scale-nonlinearity is outside [0, 1].
        """
        for key in ['cell-dim', 'recurrent-projection-dim',
                    'non-recurrent-projection-dim']:
            if self.config[key] <= 0:
                raise RuntimeError("{0} has invalid value {1}.".format(
                    key, self.config[key]))

        if self.config['delay'] == 0:
            raise RuntimeError("delay cannot be zero")

        if (self.config['recurrent-projection-dim'] +
            self.config['non-recurrent-projection-dim'] >
            self.config['cell-dim']):
            raise RuntimeError("recurrent+non-recurrent projection dim exceeds "
                                "cell dim.")
        for key in ['self-repair-scale-nonlinearity']:
            if self.config[key] < 0.0 or self.config[key] > 1.0:
                # Report the offending option name (not the layer type),
                # consistent with the dimension check above.
                raise RuntimeError("{0} has invalid value {1}."
                                   .format(key, self.config[key]))

    def auxiliary_outputs(self):
        """Return the names of the auxiliary outputs this layer exposes."""
        return ['c_t']

    def output_name(self, auxiliary_output = None):
        """Return the node name of the requested output.

        With auxiliary_output=None this is '<name>.y_t'; otherwise it is
        '<name>.<auxiliary_output>'.

        Raises:
            Exception: if auxiliary_output is not in auxiliary_outputs().
        """
        node_name = 'y_t'
        if auxiliary_output is not None:
            if auxiliary_output in self.auxiliary_outputs():
                node_name = auxiliary_output
            else:
                raise Exception("In {0} of type {1}, unknown auxiliary output name {2}".format(
                    self.name, self.layer_type, auxiliary_output))

        return '{0}.{1}'.format(self.name, node_name)

    def output_dim(self, auxiliary_output = None):
        """Return the dimension of the requested output.

        'c_t' has dimension cell-dim; the main output has dimension
        recurrent-projection-dim + non-recurrent-projection-dim.

        Raises:
            Exception: if auxiliary_output is not in auxiliary_outputs().
        """
        if auxiliary_output is not None:
            if auxiliary_output in self.auxiliary_outputs():
                if auxiliary_output == 'c_t':
                    return self.config['cell-dim']
                # add code for other auxiliary_outputs here when we decide to expose them
            else:
                raise Exception("In {0} of type {1}, unknown auxiliary output name {2}".format(
                    self.name, self.layer_type, auxiliary_output))

        return self.config['recurrent-projection-dim'] + self.config['non-recurrent-projection-dim']

    def get_full_config(self):
        """Return a list of (config_name, line) pairs.

        Every generated line is emitted for both the 'ref' and the 'final'
        config, in that order.
        """
        # we do not support user specified matrices in GRU initialization
        # so 'ref' and 'final' configs are the same.
        return [(config_name, line)
                for line in self.generate_pgru_config()
                for config_name in ['ref', 'final']]

    # convenience function to generate the PGRU config
    def generate_pgru_config(self):
        """Return the list of nnet3 config lines describing this PGRU layer."""

        # assign some variables to reduce verbosity
        name = self.name
        # in the below code we will just call descriptor_strings as descriptors for conciseness
        input_dim = self.descriptors['input']['dim']
        input_descriptor = self.descriptors['input']['final-string']
        cell_dim = self.config['cell-dim']
        rec_proj_dim = self.config['recurrent-projection-dim']
        nonrec_proj_dim = self.config['non-recurrent-projection-dim']
        delay = self.config['delay']
        repair_nonlin = self.config['self-repair-scale-nonlinearity']
        repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else ''
        bptrunc_str = ("clipping-threshold={0}"
                      " zeroing-threshold={1}"
                      " zeroing-interval={2}"
                      " recurrence-interval={3}"
                      "".format(self.config['clipping-threshold'],
                                self.config['zeroing-threshold'],
                                self.config['zeroing-interval'],
                                abs(delay)))
        affine_str = self.config['ng-affine-options']

        # string for GruNonlinearityComponent
        gru_nonlin_str = self.config['gru-nonlinearity-options']

        # formulation like:
        # z_t = \sigmoid ( U^z x_t + W^z s_{t-1} )   # update gate
        # r_t = \sigmoid ( U^r x_t + W^r s_{t-1} )   # reset gate
        # h_t = \tanh ( U^h x_t + W^h ( s_{t-1} \dot r_t ) )
        # c_t = ( 1 - z_t ) \dot h_t  +  z_t \dot c_{t-1}
        # y_t = W^y c_t  # dim(y_t) = recurrent_dim + non_recurrent_dim.
                         #  This is the output of the GRU.
        # s_t = y_t[0:recurrent_dim-1]  # dimension range of y_t
                                        # dim(s_t) = recurrent_dim.
        # Note:
        # naming convention:
        # <layer-name>.W_<outputname>.<inputname> e.g. Gru1.W_i.xr for matrix
        # providing output to gate i and operating on an appended vector [x,r]
        # notation convention:
        # In order to be consistent with the notations which are used in
        # nnet-combined-component.cc, we map "\tilde{h_t}" and "h_t" which are
        # used in paper to "h_t" and "c_t"

        configs = []
        configs.append("### Begin Gru layer '{0}'".format(name))
        configs.append("# Update gate control : W_z* matrices")
        configs.append("component name={0}.W_z.xs type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str))
        configs.append("# Reset gate control : W_r* matrices")
        configs.append("component name={0}.W_r.xs type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, rec_proj_dim, affine_str))

        configs.append("# hpart_t related matrix : W_hpart matric")
        configs.append("component name={0}.W_hpart.x type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim, cell_dim , affine_str))

        configs.append("# Defining the non-linearities")
        configs.append("component name={0}.z type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str))
        configs.append("component name={0}.r type=SigmoidComponent dim={1} {2}".format(name, rec_proj_dim, repair_nonlin_str))

        # node that carries the recurrence (s_t) back into the gates
        recurrent_connection = '{0}.s_t'.format(name)

        configs.append("# z_t and r_t")
        configs.append("component-node name={0}.z_t_pre component={0}.W_z.xs input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay))
        configs.append("component-node name={0}.z_t component={0}.z input={0}.z_t_pre".format(name))
        configs.append("component-node name={0}.r_t_pre component={0}.W_r.xs input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay))
        configs.append("component-node name={0}.r_t component={0}.r input={0}.r_t_pre".format(name))

        configs.append("# hpart_t")
        configs.append("component-node name={0}.hpart_t component={0}.W_hpart.x input={1}".format(name, input_descriptor))

        configs.append("# c_t")
        configs.append("# Note: the output of GruNonlinearityComponent is (h_t, c_t), we use the second half.")
        configs.append("component name={0}.gru_nonlin type=GruNonlinearityComponent cell-dim={1} recurrent-dim={2} {3}".format(name, cell_dim, rec_proj_dim, gru_nonlin_str))
        configs.append("component-node name={0}.gru_nonlin_t component={0}.gru_nonlin input=Append({0}.z_t, {0}.r_t, {0}.hpart_t, IfDefined(Offset({0}.c_t, {2})), IfDefined(Offset({1}, {2})))".format(name, recurrent_connection, delay))
        configs.append("dim-range-node name={0}.c_t input-node={0}.gru_nonlin_t dim-offset={1} dim={1}".format(name, cell_dim))

        configs.append("# the projected matrix W_y.c and y_t")
        configs.append("component name={0}.W_y.c type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str))
        configs.append("component-node name={0}.y_t component={0}.W_y.c input={0}.c_t".format(name))

        configs.append("# s_t : recurrence")
        configs.append("component name={0}.s_r type=BackpropTruncationComponent dim={1} {2}".format(name, rec_proj_dim, bptrunc_str))
        configs.append("dim-range-node name={0}.s_t_pre input-node={0}.y_t dim-offset=0 dim={1}".format(name, rec_proj_dim))
        configs.append("component-node name={0}.s_t component={0}.s_r input={0}.s_t_pre".format(name))
        return configs


# This class is for lines like
#   'fast-norm-pgru-layer name=pgru1 input=[-1] delay=-3'

# Different from the vanilla PGRU, the NormPGRU uses batchnorm in the forward direction
# and renorm in the recurrence.

# The cell dimension must be specified via 'cell-dim=xxx' (it is compulsory).
# The output dimension of the layer is
# recurrent-projection-dim + non-recurrent-projection-dim.
# See other configuration values below.
#
# Parameters of the class, and their defaults:
#   input='[-1]'             [Descriptor giving the input of the layer.]
#   cell-dim=-1            [Dimension of the cell; compulsory, must be positive]
#   recurrent-projection-dim=-1   [Dimension of the projection used in recurrent connections; defaults to cell-dim/4]
#   non-recurrent-projection-dim=-1   [Dimension of the projection in non-recurrent connections,
#                                   in addition to recurrent-projection-dim; defaults to recurrent-projection-dim]
#   delay=-1                 [Delay in the recurrent connections of the GRU; must be nonzero]
#   clipping-threshold=30    [nnet3 GRUs use a gradient clipping component at the recurrent connections.
#                             This is the threshold used to decide if clipping has to be activated ]
#   zeroing-interval=20      [interval at which we (possibly) zero out the recurrent derivatives.]
#   zeroing-threshold=15     [We only zero out the derivs every zeroing-interval, if derivs exceed this value.]
#   self-repair-scale-nonlinearity=1e-5      [A constant scaling the self-repair vector computed in derived classes of NonlinearComponent,
#                                       i.e.,  SigmoidComponent, TanhComponent and RectifiedLinearComponent ]
#   ng-per-element-scale-options=' max-change=0.75 '   [Additional options used for the diagonal matrices in the GRU ]
#   gru-nonlinearity-options=' max-change=0.75' [options for GruNonlinearityComponent, see below for detail]
#   ng-affine-options=' max-change=0.75 '              [Additional options used for the full matrices in the GRU, can be used to do things like set biases to initialize to 1]
#   dropout-proportion=-1.0  [Dropout proportion applied to the z_t and r_t gates; if -1.0, no dropout components are added]
#   dropout-per-frame=True   [If True, dropout is per frame; if False, regular dropout]
class XconfigFastNormPgruLayer(XconfigLayerBase):
    """Xconfig layer for 'fast-norm-pgru-layer' lines (normalized PGRU).

    Compared with the plain PGRU, the projected output is batch-normalized
    to form the layer output '<name>.y_t', and the recurrent part is
    renormalized (NormalizeComponent) before being fed back.  Dropout
    components on the z_t and r_t gates are added when dropout-proportion
    is not -1.0.  The cell state '<name>.c_t' (dimension cell-dim) is
    available as the auxiliary output 'c_t'.
    """

    def __init__(self, first_token, key_to_value, prev_names = None):
        assert first_token == "fast-norm-pgru-layer"
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        """Set self.config to the default value of every supported option."""
        self.config = {'input' : '[-1]',
                        'cell-dim' : -1, # this is a compulsory argument
                        'recurrent-projection-dim' : -1,  # defaults to cell-dim / 4
                        'non-recurrent-projection-dim' : -1, # defaults to
                                                             # recurrent-projection-dim
                        'clipping-threshold' : 30.0,
                        'delay' : -1,
                        # kept in the option set for backward compatibility;
                        # generate_pgru_config() does not use it.
                        'ng-per-element-scale-options' : ' max-change=0.75 ',
                        'ng-affine-options' : ' max-change=0.75 ',
                        'self-repair-scale-nonlinearity' : 0.00001,
                        'zeroing-interval' : 20,
                        'zeroing-threshold' : 15.0,
                        # if you want to set 'self-repair-scale', ' self-repair-threshold'
                        # or 'param-stddev' for GruNonlinearityComponent
                        # For default, they are 1.0e-05, 0.2 and  1.0 / sqrt(d) where d is cell-dim.
                        # you can add something like 'self-repair-scale=xxx' to gru-nonlinearity-options.
                        # you can also see src/nnet3/nnet-combined-component.h for detail
                        'gru-nonlinearity-options' : ' max-change=0.75',
                        'dropout-proportion' : -1.0,  # If -1.0, no dropout components will be added
                        'dropout-per-frame' : True  # If False, regular dropout, not per frame
                       }

    def set_derived_configs(self):
        """Fill in the projection dims that were left unset (<= 0).

        recurrent-projection-dim falls back to cell-dim // 4, and
        non-recurrent-projection-dim falls back to recurrent-projection-dim.
        """
        if self.config['recurrent-projection-dim'] <= 0:
            # Floor division keeps the dimension an integer; plain '/' would
            # yield a float under Python 3 and end up as e.g. 'dim=256.0' in
            # the generated config lines.
            self.config['recurrent-projection-dim'] = self.config['cell-dim'] // 4

        if self.config['non-recurrent-projection-dim'] <= 0:
            self.config['non-recurrent-projection-dim'] = \
               self.config['recurrent-projection-dim']

    def check_configs(self):
        """Validate the option values.

        Raises:
            RuntimeError: if a dimension is not positive, delay is zero, the
                two projection dims together exceed cell-dim,
                self-repair-scale-nonlinearity is outside [0, 1], or
                dropout-proportion is neither -1.0 nor within [0, 1].
        """
        for key in ['cell-dim', 'recurrent-projection-dim',
                    'non-recurrent-projection-dim']:
            if self.config[key] <= 0:
                raise RuntimeError("{0} has invalid value {1}.".format(
                    key, self.config[key]))

        if self.config['delay'] == 0:
            raise RuntimeError("delay cannot be zero")

        if (self.config['recurrent-projection-dim'] +
            self.config['non-recurrent-projection-dim'] >
            self.config['cell-dim']):
            raise RuntimeError("recurrent+non-recurrent projection dim exceeds "
                                "cell dim.")
        for key in ['self-repair-scale-nonlinearity']:
            if self.config[key] < 0.0 or self.config[key] > 1.0:
                # Report the offending option name (not the layer type),
                # consistent with the dimension check above.
                raise RuntimeError("{0} has invalid value {1}."
                                   .format(key, self.config[key]))
        # -1.0 is the sentinel meaning "no dropout"; anything else must be
        # a valid proportion.
        if ((self.config['dropout-proportion'] > 1.0 or
             self.config['dropout-proportion'] < 0.0) and
             self.config['dropout-proportion'] != -1.0 ):
             raise RuntimeError("dropout-proportion has invalid value {0}."
                                .format(self.config['dropout-proportion']))

    def auxiliary_outputs(self):
        """Return the names of the auxiliary outputs this layer exposes."""
        return ['c_t']

    def output_name(self, auxiliary_output = None):
        """Return the node name of the requested output.

        With auxiliary_output=None this is '<name>.y_t'; otherwise it is
        '<name>.<auxiliary_output>'.

        Raises:
            Exception: if auxiliary_output is not in auxiliary_outputs().
        """
        node_name = 'y_t'
        if auxiliary_output is not None:
            if auxiliary_output in self.auxiliary_outputs():
                node_name = auxiliary_output
            else:
                raise Exception("In {0} of type {1}, unknown auxiliary output name {2}".format(
                    self.name, self.layer_type, auxiliary_output))

        return '{0}.{1}'.format(self.name, node_name)

    def output_dim(self, auxiliary_output = None):
        """Return the dimension of the requested output.

        'c_t' has dimension cell-dim; the main output has dimension
        recurrent-projection-dim + non-recurrent-projection-dim.

        Raises:
            Exception: if auxiliary_output is not in auxiliary_outputs().
        """
        if auxiliary_output is not None:
            if auxiliary_output in self.auxiliary_outputs():
                if auxiliary_output == 'c_t':
                    return self.config['cell-dim']
                # add code for other auxiliary_outputs here when we decide to expose them
            else:
                raise Exception("In {0} of type {1}, unknown auxiliary output name {2}".format(
                    self.name, self.layer_type, auxiliary_output))

        return self.config['recurrent-projection-dim'] + self.config['non-recurrent-projection-dim']

    def get_full_config(self):
        """Return a list of (config_name, line) pairs.

        Every generated line is emitted for both the 'ref' and the 'final'
        config, in that order.
        """
        # we do not support user specified matrices in GRU initialization
        # so 'ref' and 'final' configs are the same.
        return [(config_name, line)
                for line in self.generate_pgru_config()
                for config_name in ['ref', 'final']]

    # convenience function to generate the Norm-PGRU config
    def generate_pgru_config(self):
        """Return the list of nnet3 config lines describing this Norm-PGRU layer."""

        # assign some variables to reduce verbosity
        name = self.name
        # in the below code we will just call descriptor_strings as descriptors for conciseness
        input_dim = self.descriptors['input']['dim']
        input_descriptor = self.descriptors['input']['final-string']
        cell_dim = self.config['cell-dim']
        rec_proj_dim = self.config['recurrent-projection-dim']
        nonrec_proj_dim = self.config['non-recurrent-projection-dim']
        delay = self.config['delay']
        repair_nonlin = self.config['self-repair-scale-nonlinearity']
        repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else ''
        bptrunc_str = ("clipping-threshold={0}"
                      " zeroing-threshold={1}"
                      " zeroing-interval={2}"
                      " recurrence-interval={3}"
                      "".format(self.config['clipping-threshold'],
                                self.config['zeroing-threshold'],
                                self.config['zeroing-interval'],
                                abs(delay)))
        affine_str = self.config['ng-affine-options']
        dropout_proportion = self.config['dropout-proportion']
        dropout_per_frame = 'true' if self.config['dropout-per-frame'] else 'false'
        # -1.0 is the sentinel meaning "do not add dropout components"
        use_dropout = dropout_proportion != -1.0

        # string for GruNonlinearityComponent
        gru_nonlin_str = self.config['gru-nonlinearity-options']

        # formulation like:
        # z_t = \sigmoid ( U^z x_t + W^z s_{t-1} )   # update gate
        # r_t = \sigmoid ( U^r x_t + W^r s_{t-1} )   # reset gate
        # h_t = \tanh ( U^h x_t + W^h ( s_{t-1} \dot r_t ) )
        # c_t = ( 1 - z_t ) \dot h_t  +  z_t \dot c_{t-1}
        # y_t_tmp = W^y c_t
        # s_t = renorm ( y_t_tmp[0:rec_proj_dim-1] ) # dim(s_t) = recurrent_dim.
        # y_t = batchnorm ( y_t_tmp )  # dim(y_t) = recurrent_dim + non_recurrent_dim.
                                       # This is the output of the GRU.
        # Note:
        # naming convention:
        # <layer-name>.W_<outputname>.<inputname> e.g. Gru1.W_i.xr for matrix
        # providing output to gate i and operating on an appended vector [x,r]
        # notation convention:
        # In order to be consistent with the notations which are used in
        # nnet-combined-component.cc, we map "\tilde{h_t}" and "h_t" which are
        # used in paper to "h_t" and "c_t"

        configs = []
        configs.append("### Begin Gru layer '{0}'".format(name))
        configs.append("# Update gate control : W_z* matrices")
        configs.append("component name={0}.W_z.xs type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str))
        configs.append("# Reset gate control : W_r* matrices")
        configs.append("component name={0}.W_r.xs type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, rec_proj_dim, affine_str))

        configs.append("# hpart_t related matrix : W_hpart matric")
        configs.append("component name={0}.W_hpart.x type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim, cell_dim , affine_str))

        configs.append("# Defining the non-linearities")
        configs.append("component name={0}.z type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str))
        configs.append("component name={0}.r type=SigmoidComponent dim={1} {2}".format(name, rec_proj_dim, repair_nonlin_str))

        if use_dropout:
            configs.append("# Defining the dropout component")
            configs.append("component name={0}.dropout_z type=DropoutComponent dim={1} "
                           "dropout-proportion={2} dropout-per-frame={3}"
                           .format(name, cell_dim, dropout_proportion, dropout_per_frame))
            configs.append("component name={0}.dropout_r type=DropoutComponent dim={1} "
                           "dropout-proportion={2} dropout-per-frame={3}"
                           .format(name, rec_proj_dim, dropout_proportion, dropout_per_frame))

        # node that carries the recurrence (s_t) back into the gates
        recurrent_connection = '{0}.s_t'.format(name)

        configs.append("# z_t")
        configs.append("component-node name={0}.z_t_pre component={0}.W_z.xs input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay))
        if use_dropout:
            # with dropout, the sigmoid output goes through dropout_z first
            configs.append("component-node name={0}.z_t_predrop component={0}.z input={0}.z_t_pre".format(name))
            configs.append("component-node name={0}.z_t component={0}.dropout_z input={0}.z_t_predrop".format(name))
        else:
            configs.append("component-node name={0}.z_t component={0}.z input={0}.z_t_pre".format(name))

        configs.append("# r_t")
        configs.append("component-node name={0}.r_t_pre component={0}.W_r.xs input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay))
        if use_dropout:
            # with dropout, the sigmoid output goes through dropout_r first
            configs.append("component-node name={0}.r_t_predrop component={0}.r input={0}.r_t_pre".format(name))
            configs.append("component-node name={0}.r_t component={0}.dropout_r input={0}.r_t_predrop".format(name))
        else:
            configs.append("component-node name={0}.r_t component={0}.r input={0}.r_t_pre".format(name))

        configs.append("# hpart_t")
        configs.append("component-node name={0}.hpart_t component={0}.W_hpart.x input={1}".format(name, input_descriptor))

        configs.append("# c_t")
        configs.append("# Note: the output of GruNonlinearityComponent is (h_t, c_t), we use the second half.")
        configs.append("component name={0}.gru_nonlin type=GruNonlinearityComponent cell-dim={1} recurrent-dim={2} {3}".format(name, cell_dim, rec_proj_dim, gru_nonlin_str))
        configs.append("component-node name={0}.gru_nonlin_t component={0}.gru_nonlin input=Append({0}.z_t, {0}.r_t, {0}.hpart_t, IfDefined(Offset({0}.c_t, {2})), IfDefined(Offset({1}, {2})))".format(name, recurrent_connection, delay))
        configs.append("dim-range-node name={0}.c_t input-node={0}.gru_nonlin_t dim-offset={1} dim={1}".format(name, cell_dim))

        configs.append("# the projected matrix W_y.c and y_t_tmp")
        configs.append("component name={0}.W_y.c type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str))
        configs.append("component-node name={0}.y_t_tmp component={0}.W_y.c input={0}.c_t".format(name))

        configs.append("# s_t : recurrence")
        configs.append("component name={0}.renorm type=NormalizeComponent dim={1} target-rms=1.0".format(name, rec_proj_dim))
        configs.append("component name={0}.s_r type=BackpropTruncationComponent dim={1} {2}".format(name, rec_proj_dim, bptrunc_str))
        configs.append("dim-range-node name={0}.s_t_pre input-node={0}.y_t_tmp dim-offset=0 dim={1}".format(name, rec_proj_dim))
        configs.append("component-node name={0}.s_t_renorm component={0}.renorm input={0}.s_t_pre".format(name))
        configs.append("component-node name={0}.s_t component={0}.s_r input={0}.s_t_renorm".format(name))

        configs.append("# y_t : output")
        configs.append("component name={0}.batchnorm type=BatchNormComponent dim={1} target-rms=1.0".format(name, rec_proj_dim + nonrec_proj_dim))
        configs.append("component-node name={0}.y_t component={0}.batchnorm input={0}.y_t_tmp".format(name))
        return configs


# This class is for lines like
#   'fast-opgru-layer name=opgru1 input=[-1] delay=-3'
# It generates an OPGRU sub-graph with output projections. It can also generate
# outputs without projection, but you could use the XconfigGruLayer for this
# simple RNN.
# The cell dimension is specified via 'cell-dim=xxx'; it is a compulsory
# argument.
# See other configuration values below.
#
# Parameters of the class, and their defaults:
#   input='[-1]'             [Descriptor giving the input of the layer.]
#   cell-dim=-1            [Dimension of the cell; compulsory]
#   recurrent-projection-dim=-1   [Dimension of the projection used in recurrent connections; defaults to cell-dim/4]
#   non-recurrent-projection-dim=-1   [Dimension of the projection in non-recurrent connections,
#                                   in addition to recurrent-projection-dim; defaults to recurrent-projection-dim]
#   delay=-1                 [Delay in the recurrent connections of the GRU ]
#   clipping-threshold=30    [nnet3 GRUs use a gradient clipping component at the recurrent connections.
#                             This is the threshold used to decide if clipping has to be activated ]
#   zeroing-interval=20      [interval at which we (possibly) zero out the recurrent derivatives.]
#   zeroing-threshold=15     [We only zero out the derivs every zeroing-interval, if derivs exceed this value.]
#   self-repair-scale-nonlinearity=1e-5      [A constant scaling the self-repair vector computed in derived classes of NonlinearComponent,
#                                       i.e.,  SigmoidComponent, TanhComponent and RectifiedLinearComponent ]
#   ng-per-element-scale-options=' max-change=0.75 '   [Additional options used for the diagonal matrices in the GRU ]
#   gru-nonlinearity-options=' max-change=0.75' [options for GruNonlinearityComponent, see below for detail]
#   ng-affine-options=' max-change=0.75 '              [Additional options used for the full matrices in the GRU, can be used to do things like set biases to initialize to 1]
class XconfigFastOpgruLayer(XconfigLayerBase):
    """Xconfig layer that expands a 'fast-opgru-layer' line into nnet3 config lines.

    The generated sub-graph is built around an OutputGruNonlinearityComponent
    and ends in an affine projection.  The main output is the node
    '<name>.y_t'; the cell state '<name>.c_t' is exposed as an auxiliary
    output.
    """

    def __init__(self, first_token, key_to_value, prev_names = None):
        """Check the layer keyword and defer all parsing to XconfigLayerBase."""
        assert first_token == "fast-opgru-layer"
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        """Install the default value of every option this layer accepts."""
        self.config = {'input' : '[-1]',
                        'cell-dim' : -1, # this is a compulsory argument
                        'recurrent-projection-dim' : -1,  # defaults to cell-dim / 4
                        'non-recurrent-projection-dim' : -1, # defaults to
                                                             # recurrent-projection-dim
                        'clipping-threshold' : 30.0,
                        'delay' : -1,
                        'ng-per-element-scale-options' : ' max-change=0.75 ',
                        'ng-affine-options' : ' max-change=0.75 ',
                        'self-repair-scale-nonlinearity' : 0.00001,
                        'zeroing-interval' : 20,
                        'zeroing-threshold' : 15.0,
                        # if you want to set 'self-repair-scale', ' self-repair-threshold'
                        # or 'param-stddev' for GruNonlinearityComponent
                        # For default, they are 1.0e-05, 0.2 and  1.0 / sqrt(d) where d is cell-dim.
                        # you can add something like 'self-repair-scale=xxx' to gru-nonlinearity-options.
                        # you can also see src/nnet3/nnet-combined-component.h for detail
                        'gru-nonlinearity-options' : ' max-change=0.75'
                       }

    def set_derived_configs(self):
        """Fill in the projection dimensions that the user left unset.

        recurrent-projection-dim falls back to cell-dim // 4 and
        non-recurrent-projection-dim falls back to recurrent-projection-dim.
        """
        if self.config['recurrent-projection-dim'] <= 0:
            # Floor division keeps the dimension an integer on Python 3 as
            # well; true division would yield e.g. 'dim=256.0' in the
            # generated config lines.
            self.config['recurrent-projection-dim'] = self.config['cell-dim'] // 4

        if self.config['non-recurrent-projection-dim'] <= 0:
            self.config['non-recurrent-projection-dim'] = \
               self.config['recurrent-projection-dim']

    def check_configs(self):
        """Validate the option values.

        Raises:
            RuntimeError: if a dimension is not positive, if delay is zero,
                if the two projection dims together exceed cell-dim, or if
                self-repair-scale-nonlinearity lies outside [0, 1].
        """
        for key in ['cell-dim', 'recurrent-projection-dim',
                    'non-recurrent-projection-dim']:
            if self.config[key] <= 0:
                raise RuntimeError("{0} has invalid value {1}.".format(
                    key, self.config[key]))

        if self.config['delay'] == 0:
            raise RuntimeError("delay cannot be zero")

        if (self.config['recurrent-projection-dim'] +
            self.config['non-recurrent-projection-dim'] >
            self.config['cell-dim']):
            raise RuntimeError("recurrent+non-recurrent projection dim exceeds "
                                "cell dim.")
        for key in ['self-repair-scale-nonlinearity']:
            if self.config[key] < 0.0 or self.config[key] > 1.0:
                # Report the offending option name together with its value.
                raise RuntimeError("{0} has invalid value {1}."
                                   .format(key, self.config[key]))

    def auxiliary_outputs(self):
        """Return the names of the auxiliary outputs this layer exposes."""
        return ['c_t']

    def output_name(self, auxiliary_output = None):
        """Return the node name of the main output or of an auxiliary output.

        Raises:
            Exception: if auxiliary_output is not one of auxiliary_outputs().
        """
        node_name = 'y_t'
        if auxiliary_output is not None:
            if auxiliary_output in self.auxiliary_outputs():
                node_name = auxiliary_output
            else:
                raise Exception("In {0} of type {1}, unknown auxiliary output name {2}"
                                .format(self.name, self.layer_type, auxiliary_output))

        return '{0}.{1}'.format(self.name, node_name)

    def output_dim(self, auxiliary_output = None):
        """Return the dimension of the main output or of an auxiliary output.

        Raises:
            Exception: if auxiliary_output is not one of auxiliary_outputs().
        """
        if auxiliary_output is not None:
            if auxiliary_output not in self.auxiliary_outputs():
                raise Exception("In {0} of type {1}, unknown auxiliary output name {2}"
                                .format(self.name, self.layer_type, auxiliary_output))
            if auxiliary_output == 'c_t':
                return self.config['cell-dim']
            # add code for other auxiliary_outputs here when we decide to expose them

        return self.config['recurrent-projection-dim'] + self.config['non-recurrent-projection-dim']

    def get_full_config(self):
        """Return (config_name, line) pairs for the 'ref' and 'final' configs.

        Both configs receive exactly the same lines, in the order
        (ref, line1), (final, line1), (ref, line2), ...
        """
        return [(config_name, line)
                for line in self.generate_pgru_config()
                for config_name in ['ref', 'final']]

    # convenience function to generate the OPGRU config
    def generate_pgru_config(self):
        """Build and return the list of nnet3 config lines for this layer."""

        # assign some variables to reduce verbosity
        name = self.name
        # in the below code we will just call descriptor_strings as descriptors for conciseness
        input_dim = self.descriptors['input']['dim']
        input_descriptor = self.descriptors['input']['final-string']
        cell_dim = self.config['cell-dim']
        rec_proj_dim = self.config['recurrent-projection-dim']
        nonrec_proj_dim = self.config['non-recurrent-projection-dim']
        delay = self.config['delay']
        repair_nonlin = self.config['self-repair-scale-nonlinearity']
        repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else ''
        bptrunc_str = ("clipping-threshold={0}"
                      " zeroing-threshold={1}"
                      " zeroing-interval={2}"
                      " recurrence-interval={3}"
                      "".format(self.config['clipping-threshold'],
                                self.config['zeroing-threshold'],
                                self.config['zeroing-interval'],
                                abs(delay)))
        affine_str = self.config['ng-affine-options']
        # NOTE: 'ng-per-element-scale-options' is still accepted as an option,
        # but none of the lines generated below use it, so it is not read here.

        # string for GruNonlinearityComponent
        gru_nonlin_str = self.config['gru-nonlinearity-options']

        # formulation like:
        # z_t = \sigmoid ( U^z x_t + W^z s_{t-1} )   # update gate
        # o_t = \sigmoid ( U^o x_t + W^o s_{t-1} )   # output gate
        # h_t = \tanh ( U^h x_t + W^h \dot c_{t-1} )
        # c_t = ( 1 - z_t ) \dot h_t  +  z_t \dot c_{t-1}
        # y_t = ( c_t \dot o_t ) W^y  # dim(y_t) = recurrent_dim + non_recurrent_dim.
                                      #  This is the output of the GRU.
        # s_t = y_t[0:recurrent_dim-1]  # dimension range of y_t
                                        # dim(s_t) = recurrent_dim.
        # Note:
        # naming convention:
        # <layer-name>.W_<outputname>.<inputname> e.g. Gru1.W_i.xr for matrix
        # providing output to gate i and operating on an appended vector [x,r]
        # notation convention:
        # In order to be consistent with the notations which are used in
        # nnet-combined-component.cc, we map "\tilde{h_t}" and "h_t" which are
        # used in paper to "h_t" and "c_t"

        configs = []
        configs.append("### Begin Gru layer '{0}'".format(name))
        configs.append("# Update gate control : W_z* matrices")
        configs.append("component name={0}.W_z.xs type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str))
        configs.append("# Reset gate control : W_o* matrices")
        configs.append("component name={0}.W_o.xs type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str))


        configs.append("# hpart_t related matrix : W_hpart matric")
        configs.append("component name={0}.W_hpart.x type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim, cell_dim , affine_str))

        configs.append("# Defining the non-linearities")
        configs.append("component name={0}.z type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str))
        configs.append("component name={0}.o type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str))

        recurrent_connection = '{0}.s_t'.format(name)

        configs.append("# z_t and o_t")
        configs.append("component-node name={0}.z_t_pre component={0}.W_z.xs input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay))
        configs.append("component-node name={0}.z_t component={0}.z input={0}.z_t_pre".format(name))
        configs.append("component-node name={0}.o_t_pre component={0}.W_o.xs input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay))
        configs.append("component-node name={0}.o_t component={0}.o input={0}.o_t_pre".format(name))

        configs.append("# hpart_t")
        configs.append("component-node name={0}.hpart_t component={0}.W_hpart.x input={1}".format(name, input_descriptor))

        configs.append("# c_t")
        configs.append("# Note: the output of OutputGruNonlinearityComponent is (h_t, c_t), we use the second half.")
        configs.append("component name={0}.gru_nonlin type=OutputGruNonlinearityComponent cell-dim={1} {2}".format(name, cell_dim, gru_nonlin_str))
        configs.append("component-node name={0}.gru_nonlin_t component={0}.gru_nonlin input=Append({0}.z_t, {0}.hpart_t, IfDefined(Offset({0}.c_t, {1})))".format(name, delay))
        configs.append("dim-range-node name={0}.c_t input-node={0}.gru_nonlin_t dim-offset={1} dim={1}".format(name, cell_dim))

        configs.append("# the projected matrix W_y.cdoto and y_t")
        configs.append("component name={0}.cdoto type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim))
        configs.append("component-node name={0}.cdoto component={0}.cdoto input=Append({0}.c_t, {0}.o_t)".format(name))
        configs.append("component name={0}.W_y.cdoto type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str))
        configs.append("component-node name={0}.y_t component={0}.W_y.cdoto input={0}.cdoto".format(name))

        configs.append("# s_t recurrence")
        configs.append("component name={0}.s_r type=BackpropTruncationComponent dim={1} {2}".format(name, rec_proj_dim, bptrunc_str))
        configs.append("dim-range-node name={0}.s_t_preclip input-node={0}.y_t dim-offset=0 dim={1}".format(name, rec_proj_dim))
        configs.append("component-node name={0}.s_t component={0}.s_r input={0}.s_t_preclip".format(name))

        return configs


# This class is for lines like
#   'fast-norm-opgru-layer name=opgru1 input=[-1] delay=-3'

# Different from the vanilla OPGRU, the NormOPGRU uses batchnorm in the forward direction
# and renorm in the recurrence.

# The cell dimension must be specified via 'cell-dim=xxx' (it is a compulsory argument);
# the output dimension is recurrent-projection-dim + non-recurrent-projection-dim.
# See other configuration values below.
#
# Parameters of the class, and their defaults:
#   input='[-1]'             [Descriptor giving the input of the layer.]
#   cell-dim=-1            [Dimension of the cell; compulsory, must be set to a positive value]
#   recurrent-projection-dim=-1 [Dimension of the projection used in recurrent connections; defaults to cell-dim/4]
#   non-recurrent-projection-dim   [Dimension of the projection in non-recurrent connections,
#                                   in addition to recurrent-projection-dim, e.g. cell-dim/4]
#   delay=-1                 [Delay in the recurrent connections of the GRU ]
#   clipping-threshold=30    [nnet3 GRU use a gradient clipping component at the recurrent connections.
#                             This is the threshold used to decide if clipping has to be activated ]
#   zeroing-interval=20      [interval at which we (possibly) zero out the recurrent derivatives.]
#   zeroing-threshold=15     [We only zero out the derivs every zeroing-interval, if derivs exceed this value.]
#   self-repair-scale-nonlinearity=1e-5      [It is a constant scaling the self-repair vector computed in derived classes of NonlinearComponent,
#                                       i.e.,  SigmoidComponent, TanhComponent and RectifiedLinearComponent ]
#   ng-per-element-scale-options=' max-change=0.75 '   [Additional options used for the diagonal matrices in the GRU ]
#   gru-nonlinearity-options=' max-change=0.75' [options for GruNonlinearityComponent, see below for detail]
#   ng-affine-options=' max-change=0.75 '              [Additional options used for the full matrices in the GRU, can be used to do things like set biases to initialize to 1]
#   dropout-proportion=-1.0  [If -1.0, no dropout components are added; otherwise must lie in [0, 1] ]
#   dropout-per-frame=true   [If false, regular dropout is used instead of per-frame dropout ]
class XconfigFastNormOpgruLayer(XconfigLayerBase):
    """Xconfig layer that expands a 'fast-norm-opgru-layer' line into nnet3 config lines.

    The generated sub-graph is built around an OutputGruNonlinearityComponent.
    The projected activations pass through a NormalizeComponent before they
    are fed back through the recurrence, and through a BatchNormComponent to
    form the main output '<name>.y_t'.  The cell state '<name>.c_t' is
    exposed as an auxiliary output.  A DropoutComponent is applied to the
    gates when dropout-proportion is not -1.0.
    """

    def __init__(self, first_token, key_to_value, prev_names = None):
        """Check the layer keyword and defer all parsing to XconfigLayerBase."""
        assert first_token == "fast-norm-opgru-layer"
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        """Install the default value of every option this layer accepts."""
        self.config = {'input' : '[-1]',
                        'cell-dim' : -1, # this is a compulsory argument
                        'recurrent-projection-dim' : -1,  # defaults to cell-dim / 4
                        'non-recurrent-projection-dim' : -1, # defaults to
                                                             # recurrent-projection-dim
                        'clipping-threshold' : 30.0,
                        'delay' : -1,
                        'ng-per-element-scale-options' : ' max-change=0.75 ',
                        'ng-affine-options' : ' max-change=0.75 ',
                        'self-repair-scale-nonlinearity' : 0.00001,
                        'zeroing-interval' : 20,
                        'zeroing-threshold' : 15.0,
                        # if you want to set 'self-repair-scale', ' self-repair-threshold'
                        # or 'param-stddev' for GruNonlinearityComponent
                        # For default, they are 1.0e-05, 0.2 and  1.0 / sqrt(d) where d is cell-dim.
                        # you can add something like 'self-repair-scale=xxx' to gru-nonlinearity-options.
                        # you can also see src/nnet3/nnet-combined-component.h for detail
                        'gru-nonlinearity-options' : ' max-change=0.75',
                        'dropout-proportion' : -1.0,  # If -1.0, no dropout components will be added
                        'dropout-per-frame' : True  # If False, regular dropout, not per frame
                       }

    def set_derived_configs(self):
        """Fill in the projection dimensions that the user left unset.

        recurrent-projection-dim falls back to cell-dim // 4 and
        non-recurrent-projection-dim falls back to recurrent-projection-dim.
        """
        if self.config['recurrent-projection-dim'] <= 0:
            # Floor division keeps the dimension an integer on Python 3 as
            # well; true division would yield e.g. 'dim=256.0' in the
            # generated config lines.
            self.config['recurrent-projection-dim'] = self.config['cell-dim'] // 4

        if self.config['non-recurrent-projection-dim'] <= 0:
            self.config['non-recurrent-projection-dim'] = \
               self.config['recurrent-projection-dim']

    def check_configs(self):
        """Validate the option values.

        Raises:
            RuntimeError: if a dimension is not positive, if delay is zero,
                if the two projection dims together exceed cell-dim, if
                self-repair-scale-nonlinearity lies outside [0, 1], or if
                dropout-proportion is neither -1.0 nor within [0, 1].
        """
        for key in ['cell-dim', 'recurrent-projection-dim',
                    'non-recurrent-projection-dim']:
            if self.config[key] <= 0:
                raise RuntimeError("{0} has invalid value {1}.".format(
                    key, self.config[key]))

        if self.config['delay'] == 0:
            raise RuntimeError("delay cannot be zero")

        if (self.config['recurrent-projection-dim'] +
            self.config['non-recurrent-projection-dim'] >
            self.config['cell-dim']):
            raise RuntimeError("recurrent+non-recurrent projection dim exceeds "
                                "cell dim.")
        for key in ['self-repair-scale-nonlinearity']:
            if self.config[key] < 0.0 or self.config[key] > 1.0:
                # Report the offending option name together with its value.
                raise RuntimeError("{0} has invalid value {1}."
                                   .format(key, self.config[key]))
        # -1.0 is the sentinel meaning "no dropout"; anything else must be a
        # proportion in [0, 1].
        if ((self.config['dropout-proportion'] > 1.0 or
             self.config['dropout-proportion'] < 0.0) and
                self.config['dropout-proportion'] != -1.0):
            raise RuntimeError("dropout-proportion has invalid value {0}."
                               .format(self.config['dropout-proportion']))

    def auxiliary_outputs(self):
        """Return the names of the auxiliary outputs this layer exposes."""
        return ['c_t']

    def output_name(self, auxiliary_output = None):
        """Return the node name of the main output or of an auxiliary output.

        Raises:
            Exception: if auxiliary_output is not one of auxiliary_outputs().
        """
        node_name = 'y_t'
        if auxiliary_output is not None:
            if auxiliary_output in self.auxiliary_outputs():
                node_name = auxiliary_output
            else:
                raise Exception("In {0} of type {1}, unknown auxiliary output name {2}"
                                .format(self.name, self.layer_type, auxiliary_output))

        return '{0}.{1}'.format(self.name, node_name)

    def output_dim(self, auxiliary_output = None):
        """Return the dimension of the main output or of an auxiliary output.

        Raises:
            Exception: if auxiliary_output is not one of auxiliary_outputs().
        """
        if auxiliary_output is not None:
            if auxiliary_output not in self.auxiliary_outputs():
                raise Exception("In {0} of type {1}, unknown auxiliary output name {2}"
                                .format(self.name, self.layer_type, auxiliary_output))
            if auxiliary_output == 'c_t':
                return self.config['cell-dim']
            # add code for other auxiliary_outputs here when we decide to expose them

        return self.config['recurrent-projection-dim'] + self.config['non-recurrent-projection-dim']

    def get_full_config(self):
        """Return (config_name, line) pairs for the 'ref' and 'final' configs.

        Both configs receive exactly the same lines, in the order
        (ref, line1), (final, line1), (ref, line2), ...
        """
        return [(config_name, line)
                for line in self.generate_pgru_config()
                for config_name in ['ref', 'final']]

    # convenience function to generate the Norm-OPGRU config
    def generate_pgru_config(self):
        """Build and return the list of nnet3 config lines for this layer."""

        # assign some variables to reduce verbosity
        name = self.name
        # in the below code we will just call descriptor_strings as descriptors for conciseness
        input_dim = self.descriptors['input']['dim']
        input_descriptor = self.descriptors['input']['final-string']
        cell_dim = self.config['cell-dim']
        rec_proj_dim = self.config['recurrent-projection-dim']
        nonrec_proj_dim = self.config['non-recurrent-projection-dim']
        delay = self.config['delay']
        repair_nonlin = self.config['self-repair-scale-nonlinearity']
        repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else ''
        bptrunc_str = ("clipping-threshold={0}"
                      " zeroing-threshold={1}"
                      " zeroing-interval={2}"
                      " recurrence-interval={3}"
                      "".format(self.config['clipping-threshold'],
                                self.config['zeroing-threshold'],
                                self.config['zeroing-interval'],
                                abs(delay)))
        affine_str = self.config['ng-affine-options']
        # NOTE: 'ng-per-element-scale-options' is still accepted as an option,
        # but none of the lines generated below use it, so it is not read here.
        dropout_proportion = self.config['dropout-proportion']
        dropout_per_frame = 'true' if self.config['dropout-per-frame'] else 'false'
        # -1.0 is the sentinel meaning "do not add dropout components".
        use_dropout = dropout_proportion != -1.0

        # string for GruNonlinearityComponent
        gru_nonlin_str = self.config['gru-nonlinearity-options']

        # formulation like:
        # z_t = \sigmoid ( U^z x_t + W^z s_{t-1} )   # update gate
        # o_t = \sigmoid ( U^o x_t + W^o s_{t-1} )   # output gate
        # h_t = \tanh ( U^h x_t + W^h \dot c_{t-1} )
        # c_t = ( 1 - z_t ) \dot h_t  +  z_t \dot c_{t-1}
        # y_t_tmp = ( c_t \dot o_t ) W^y
        # s_t = renorm ( y_t_tmp[0:rec_proj_dim-1] ) # dim(s_t) = recurrent_dim.
        # y_t = batchnorm ( y_t_tmp )  # dim(y_t) = recurrent_dim + non_recurrent_dim.
                                       # This is the output of the GRU.
        # Note:
        # naming convention:
        # <layer-name>.W_<outputname>.<inputname> e.g. Gru1.W_i.xr for matrix
        # providing output to gate i and operating on an appended vector [x,r]
        # notation convention:
        # In order to be consistent with the notations which are used in
        # nnet-combined-component.cc, we map "\tilde{h_t}" and "h_t" which are
        # used in paper to "h_t" and "c_t"

        configs = []
        configs.append("### Begin Gru layer '{0}'".format(name))
        configs.append("# Update gate control : W_z* matrices")
        configs.append("component name={0}.W_z.xs type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str))
        configs.append("# Reset gate control : W_o* matrices")
        configs.append("component name={0}.W_o.xs type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str))


        configs.append("# hpart_t related matrix : W_hpart matric")
        configs.append("component name={0}.W_hpart.x type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim, cell_dim , affine_str))

        configs.append("# Defining the non-linearities")
        configs.append("component name={0}.z type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str))
        configs.append("component name={0}.o type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str))

        if use_dropout:
            configs.append("# Defining the dropout component")
            configs.append("component name={0}.dropout type=DropoutComponent dim={1} "
                           "dropout-proportion={2} dropout-per-frame={3}"
                           .format(name, cell_dim, dropout_proportion, dropout_per_frame))

        recurrent_connection = '{0}.s_t'.format(name)

        configs.append("# z_t")
        configs.append("component-node name={0}.z_t_pre component={0}.W_z.xs input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay))
        if use_dropout:
            configs.append("component-node name={0}.z_t_predrop component={0}.z input={0}.z_t_pre".format(name))
            configs.append("component-node name={0}.z_t component={0}.dropout input={0}.z_t_predrop".format(name))
        else:
            configs.append("component-node name={0}.z_t component={0}.z input={0}.z_t_pre".format(name))

        configs.append("# o_t")
        configs.append("component-node name={0}.o_t_pre component={0}.W_o.xs input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay))
        if use_dropout:
            configs.append("component-node name={0}.o_t_predrop component={0}.o input={0}.o_t_pre".format(name))
            configs.append("component-node name={0}.o_t component={0}.dropout input={0}.o_t_predrop".format(name))
        else:
            configs.append("component-node name={0}.o_t component={0}.o input={0}.o_t_pre".format(name))

        configs.append("# hpart_t")
        configs.append("component-node name={0}.hpart_t component={0}.W_hpart.x input={1}".format(name, input_descriptor))

        configs.append("# c_t")
        configs.append("# Note: the output of OutputGruNonlinearityComponent is (h_t, c_t), we use the second half.")
        configs.append("component name={0}.gru_nonlin type=OutputGruNonlinearityComponent cell-dim={1} {2}".format(name, cell_dim, gru_nonlin_str))
        configs.append("component-node name={0}.gru_nonlin_t component={0}.gru_nonlin input=Append({0}.z_t, {0}.hpart_t, IfDefined(Offset({0}.c_t, {1})))".format(name, delay))
        configs.append("dim-range-node name={0}.c_t input-node={0}.gru_nonlin_t dim-offset={1} dim={1}".format(name, cell_dim))

        configs.append("# the projected matrix W_y.cdoto and y_t_tmp")
        configs.append("component name={0}.cdoto type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim))
        configs.append("component-node name={0}.cdoto component={0}.cdoto input=Append({0}.c_t, {0}.o_t)".format(name))
        configs.append("component name={0}.W_y.cdoto type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str))
        configs.append("component-node name={0}.y_t_tmp component={0}.W_y.cdoto input={0}.cdoto".format(name))

        configs.append("# s_t : recurrence")
        configs.append("component name={0}.renorm type=NormalizeComponent dim={1} target-rms=1.0".format(name, rec_proj_dim))
        configs.append("component name={0}.s_r type=BackpropTruncationComponent dim={1} {2}".format(name, rec_proj_dim, bptrunc_str))
        configs.append("dim-range-node name={0}.s_t_pre input-node={0}.y_t_tmp dim-offset=0 dim={1}".format(name, rec_proj_dim))
        configs.append("component-node name={0}.s_t_renorm component={0}.renorm input={0}.s_t_pre".format(name))
        configs.append("component-node name={0}.s_t component={0}.s_r input={0}.s_t_renorm".format(name))

        configs.append("# y_t : output")
        configs.append("component name={0}.batchnorm type=BatchNormComponent dim={1} target-rms=1.0".format(name, rec_proj_dim + nonrec_proj_dim))
        configs.append("component-node name={0}.y_t component={0}.batchnorm input={0}.y_t_tmp".format(name))

        return configs


================================================
FILE: egs/steps/libs/nnet3/xconfig/layers.py
================================================
# Copyright 2016    Johns Hopkins University (Dan Povey)
#           2016    Vijayaditya Peddinti
#           2016    Yiming Wang
# Apache 2.0.

from .basic_layers import *
from .convolution import *
from .attention import *
from .lstm import *
from .gru import *
from .stats_layer import *
from .trivial_layers import *
from .composite_layers import *


================================================
FILE: egs/steps/libs/nnet3/xconfig/lstm.py
================================================
# Copyright 2016    Johns Hopkins University (Dan Povey)
#           2016    Vijayaditya Peddinti
#           2016    Yiming Wang
# Apache 2.0.


""" This module has the implementations of different LSTM layers.
"""
from __future__ import print_function
import math
import re
import sys
from libs.nnet3.xconfig.basic_layers import XconfigLayerBase


# This class is for lines like
#   'lstm-layer name=lstm1 input=[-1] delay=-3'
# It generates an LSTM sub-graph without output projections.
# The output dimension of the layer may be specified via 'cell-dim=xxx', but if not specified,
# the dimension defaults to the same as the input.
# See other configuration values below.
#
# Parameters of the class, and their defaults:
#   input='[-1]'             [Descriptor giving the input of the layer.]
#   cell-dim=-1              [Dimension of the cell]
#   delay=-1                 [Delay in the recurrent connections of the LSTM ]
#   clipping-threshold=30    [nnet3 LSTMs use a gradient clipping component at the recurrent connections.
#                             This is the threshold used to decide if clipping has to be activated ]
#   zeroing-interval=20      [interval at which we (possibly) zero out the recurrent derivatives.]
#   zeroing-threshold=15     [We only zero out the derivs every zeroing-interval, if derivs exceed this value.]
#   self-repair-scale-nonlinearity=1e-5      [It is a constant scaling the self-repair vector computed in derived classes of NonlinearComponent,
#                                       i.e.,  SigmoidComponent, TanhComponent and RectifiedLinearComponent ]
#   ng-per-element-scale-options=' max-change=0.75'     [Additional options used for the diagonal matrices in the LSTM ]
#   ng-affine-options=' max-change=0.75 '                [Additional options used for the full matrices in the LSTM, can be used to do things like set biases to initialize to 1]
#   decay-time=-1            [If >0, an approximate maximum on how many frames
#                            can be remembered via summation into the cell
#                            contents c_t; enforced by putting a scaling factor
#                            of recurrence_scale = 1 - abs(delay)/decay_time on
#                            the recurrence, i.e. the term c_{t-1} in the LSTM
#                            equations.  E.g. setting this to 20 means no more
#                            than about 20 frames' worth of history,
#                            i.e. history since about t = t-20, can be
#                            accumulated in c_t.]
#   l2-regularize=0.0        [Constant controlling l2 regularization for this layer]
class XconfigLstmLayer(XconfigLayerBase):
    """xconfig layer for 'lstm-layer' lines (LSTM without output projections).

    The layer expands into a list of nnet3 'component' / 'component-node'
    config lines.  Its main output is the node '<name>.m_t'; the cell state
    '<name>.c_t' is exposed as the auxiliary output 'c_t'.  Both have
    dimension 'cell-dim'.
    """

    def __init__(self, first_token, key_to_value, prev_names = None):
        """Initialize the layer; `first_token` must be 'lstm-layer'."""
        assert first_token == "lstm-layer"
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        """Set self.config to the default value of every supported option."""
        self.config = {'input':'[-1]',
                        'cell-dim' : -1, # this is a compulsory argument
                        'clipping-threshold' : 30.0,
                        'delay' : -1,
                        'ng-per-element-scale-options' : ' max-change=0.75',
                        'ng-affine-options' : ' max-change=0.75 ',
                        'self-repair-scale-nonlinearity' : 0.00001,
                        'zeroing-interval' : 20,
                        'zeroing-threshold' : 15.0,
                       'l2-regularize': 0.0,
                        'decay-time':  -1.0
                        }

    def set_derived_configs(self):
        """If 'cell-dim' was not given (<= 0), default it to the input dim."""
        if self.config['cell-dim'] <= 0:
            self.config['cell-dim'] = self.descriptors['input']['dim']

    def check_configs(self):
        """Validate the config values.

        Raises:
            RuntimeError: if 'cell-dim' is not positive, 'delay' is zero, or
                'self-repair-scale-nonlinearity' is outside [0, 1].
        """
        key = 'cell-dim'
        if self.config['cell-dim'] <= 0:
            raise RuntimeError("cell-dim has invalid value {0}.".format(self.config[key]))

        if self.config['delay'] == 0:
            raise RuntimeError("delay cannot be zero")

        for key in ['self-repair-scale-nonlinearity']:
            if self.config[key] < 0.0 or self.config[key] > 1.0:
                raise RuntimeError("{0} has invalid value {1}.".format(key, self.config[key]))

    def auxiliary_outputs(self):
        """Return the names of the auxiliary outputs this layer exposes."""
        return ['c_t']

    def output_name(self, auxiliary_output = None):
        """Return the node name of the requested output.

        With no argument this is '<name>.m_t'; with a known auxiliary output
        it is '<name>.<auxiliary_output>'.

        Raises:
            RuntimeError: if `auxiliary_output` is not a known auxiliary output.
        """
        node_name = 'm_t'
        if auxiliary_output is not None:
            if auxiliary_output in self.auxiliary_outputs():
                node_name = auxiliary_output
            else:
                raise RuntimeError("Unknown auxiliary output name {0}".format(auxiliary_output))

        return '{0}.{1}'.format(self.name, node_name)

    def output_dim(self, auxiliary_output = None):
        """Return the dimension of the requested output ('cell-dim').

        Raises:
            RuntimeError: if `auxiliary_output` is not a known auxiliary output.
        """
        if auxiliary_output is not None:
            if auxiliary_output in self.auxiliary_outputs():
                # Compare the argument itself; this used to test an undefined
                # local ('node_name') and raised NameError for 'c_t'.
                if auxiliary_output == 'c_t':
                    return self.config['cell-dim']
                # add code for other auxiliary_outputs here when we decide to expose them
            else:
                raise RuntimeError("Unknown auxiliary output name {0}".format(auxiliary_output))

        return self.config['cell-dim']

    def get_full_config(self):
        """Return a list of (config-name, line) pairs for this layer.

        Every generated line is emitted once for 'ref' and once for 'final'.
        """
        ans = []
        config_lines = self._generate_lstm_config()

        for line in config_lines:
            for config_name in ['ref', 'final']:
                # we do not support user specified matrices in LSTM initialization
                # so 'ref' and 'final' configs are the same.
                ans.append((config_name, line))
        return ans

    # convenience function to generate the LSTM config
    def _generate_lstm_config(self):
        """Build and return the list of nnet3 config lines for this LSTM."""

        # assign some variables to reduce verbosity
        name = self.name
        # in the below code we will just call descriptor_strings as descriptors for conciseness
        input_dim = self.descriptors['input']['dim']
        input_descriptor = self.descriptors['input']['final-string']
        cell_dim = self.config['cell-dim']
        delay = self.config['delay']
        decay_time = self.config['decay-time']
        # we expect decay_time to be either -1, or large, like 10 or 50.
        # NOTE(review): decay-time == 0 is not treated as "disabled" and would
        # divide by zero here.
        recurrence_scale = (1.0 if decay_time < 0 else
                            1.0 - (abs(delay) / decay_time))
        assert recurrence_scale > 0   # or user may have set decay-time much
                                      # too small.
        # Options shared by both BackpropTruncationComponents (c and r).
        bptrunc_str = ("clipping-threshold={0}"
                      " zeroing-threshold={1}"
                      " zeroing-interval={2}"
                      " recurrence-interval={3}"
                      " scale={4}"
                      "".format(self.config['clipping-threshold'],
                                self.config['zeroing-threshold'],
                                self.config['zeroing-interval'],
                                abs(delay), recurrence_scale))
        repair_nonlin = self.config['self-repair-scale-nonlinearity']
        repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else ''
        affine_str = self.config['ng-affine-options']
        # Natural gradient per element scale parameters
        ng_per_element_scale_options = self.config['ng-per-element-scale-options']
        # Only add the default initialization if the user specified neither
        # param-mean nor param-stddev.
        if re.search('param-mean', ng_per_element_scale_options) is None and \
           re.search('param-stddev', ng_per_element_scale_options) is None:
           ng_per_element_scale_options += " param-mean=0.0 param-stddev=1.0 "
        pes_str = ng_per_element_scale_options
        l2_regularize = self.config['l2-regularize']
        # The l2-regularize option is only written out when it is nonzero.
        l2_regularize_option = ('l2-regularize={0} '.format(l2_regularize)
                                if l2_regularize != 0.0 else '')


        configs = []

        # To see the equations implemented here, see
        # eqs (1)-(6) in https://arxiv.org/abs/1402.1128
        # naming convention:
        # <layer-name>.W_<outputname>.<input_name> e.g. Lstm1.W_i.xr for matrix
        # providing output to gate i and operating on an appended vector [x,r]

        configs.append("### Begin LTSM layer '{0}'".format(name))
        configs.append("# Input gate control : W_i* matrices")
        configs.append("component name={0}.W_i.xr type=NaturalGradientAffineComponent input-dim={1} "
                       "output-dim={2} {3} {4}".format(name, input_dim + cell_dim, cell_dim,
                                                       affine_str, l2_regularize_option))
        configs.append("# note : the cell outputs pass through a diagonal matrix")
        configs.append("component name={0}.w_i.c type=NaturalGradientPerElementScaleComponent "
                       "dim={1} {2} {3} ".format(name, cell_dim, pes_str,
                                                 l2_regularize_option))
        configs.append("# Forget gate control : W_f* matrices")
        configs.append("component name={0}.W_f.xr type=NaturalGradientAffineComponent input-dim={1} "
                       "output-dim={2} {3} {4}".format(name, input_dim + cell_dim, cell_dim,
                                                       affine_str, l2_regularize_option))
        configs.append("# note : the cell outputs pass through a diagonal matrix")
        configs.append("component name={0}.w_f.c type=NaturalGradientPerElementScaleComponent "
                       "dim={1} {2} {3}".format(name, cell_dim, pes_str, l2_regularize_option))

        configs.append("#  Output gate control : W_o* matrices")
        configs.append("component name={0}.W_o.xr type=NaturalGradientAffineComponent input-dim={1} "
                       "output-dim={2} {3} {4}".format(name, input_dim + cell_dim, cell_dim,
                                                       affine_str, l2_regularize_option))
        configs.append("# note : the cell outputs pass through a diagonal matrix")
        configs.append("component name={0}.w_o.c type=NaturalGradientPerElementScaleComponent "
                       " dim={1} {2} {3}".format(name, cell_dim, pes_str,
                                                 l2_regularize_option))

        configs.append("# Cell input matrices : W_c* matrices")
        configs.append("component name={0}.W_c.xr type=NaturalGradientAffineComponent input-dim={1} "
                       "output-dim={2} {3} {4}".format(name, input_dim + cell_dim, cell_dim,
                                                       affine_str, l2_regularize_option))


        configs.append("# Defining the non-linearities")
        configs.append("component name={0}.i type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str))
        configs.append("component name={0}.f type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str))
        configs.append("component name={0}.o type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str))
        configs.append("component name={0}.g type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str))
        configs.append("component name={0}.h type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str))

        configs.append("# Defining the components for other cell computations")
        configs.append("component name={0}.c1 type=ElementwiseProductComponent input-dim={1} output-dim={2}"
                       "".format(name, 2 * cell_dim, cell_dim))
        configs.append("component name={0}.c2 type=ElementwiseProductComponent input-dim={1} output-dim={2}"
                       "".format(name, 2 * cell_dim, cell_dim))
        configs.append("component name={0}.m type=ElementwiseProductComponent input-dim={1} output-dim={2}"
                       "".format(name, 2 * cell_dim, cell_dim))
        configs.append("component name={0}.c type=BackpropTruncationComponent dim={1} {2}"
                       "".format(name, cell_dim, bptrunc_str))

        # c1_t and c2_t defined below
        configs.append("component-node name={0}.c_t component={0}.c input=Sum({0}.c1_t, {0}.c2_t)".format(name))
        # The cell state from 'delay' frames away, used by the i/f gates and c1_t.
        delayed_c_t_descriptor = "IfDefined(Offset({0}.c_t, {1}))".format(name, delay)

        configs.append("# i_t")
        configs.append("component-node name={0}.i1_t component={0}.W_i.xr input=Append({1}, IfDefined(Offset({0}.r_t, {2})))"
                       "".format(name, input_descriptor, delay))
        configs.append("component-node name={0}.i2_t component={0}.w_i.c  input={1}".format(name, delayed_c_t_descriptor))
        configs.append("component-node name={0}.i_t component={0}.i input=Sum({0}.i1_t, {0}.i2_t)".format(name))

        configs.append("# f_t")
        configs.append("component-node name={0}.f1_t component={0}.W_f.xr input=Append({1}, IfDefined(Offset({0}.r_t, {2})))"
                       "".format(name, input_descriptor, delay))
        configs.append("component-node name={0}.f2_t component={0}.w_f.c  input={1}".format(name, delayed_c_t_descriptor))
        configs.append("component-node name={0}.f_t component={0}.f input=Sum({0}.f1_t, {0}.f2_t)".format(name))

        configs.append("# o_t")
        configs.append("component-node name={0}.o1_t component={0}.W_o.xr input=Append({1}, IfDefined(Offset({0}.r_t, {2})))"
                       "".format(name, input_descriptor, delay))
        # unlike i_t and f_t, the output gate reads the current (undelayed) c_t.
        configs.append("component-node name={0}.o2_t component={0}.w_o.c input={0}.c_t".format(name))
        configs.append("component-node name={0}.o_t component={0}.o input=Sum({0}.o1_t, {0}.o2_t)".format(name))

        configs.append("# h_t")
        configs.append("component-node name={0}.h_t component={0}.h input={0}.c_t".format(name))

        configs.append("# g_t")
        configs.append("component-node name={0}.g1_t component={0}.W_c.xr input=Append({1}, IfDefined(Offset({0}.r_t, {2})))"
                       "".format(name, input_descriptor, delay))
        configs.append("component-node name={0}.g_t component={0}.g input={0}.g1_t".format(name))

        configs.append("# parts of c_t")
        configs.append("component-node name={0}.c1_t component={0}.c1  input=Append({0}.f_t, {1})"
                       "".format(name, delayed_c_t_descriptor))
        configs.append("component-node name={0}.c2_t component={0}.c2 input=Append({0}.i_t, {0}.g_t)"
                       "".format(name))

        configs.append("# m_t")
        configs.append("component-node name={0}.m_t component={0}.m input=Append({0}.o_t, {0}.h_t)"
                       "".format(name))

        # add the recurrent connections
        configs.append("component name={0}.r type=BackpropTruncationComponent dim={1} {2}"
                       "".format(name, cell_dim, bptrunc_str))
        configs.append("component-node name={0}.r_t component={0}.r input={0}.m_t".format(name))
        configs.append("### End LTSM layer '{0}'".format(name))
        return configs


# This class is for lines like
#   'lstmp-layer name=lstm1 input=[-1] delay=-3'
# (you can also use the name 'lstmp-batchnorm-layer' if you want it to be followed
# by batchnorm).
# It generates an LSTM sub-graph with output projections. It can also generate
# outputs without projection, but you could use the XconfigLstmLayer for this
# simple LSTM.
# The cell dimension must be specified via 'cell-dim=xxx'.  The output dimension
# of the layer is recurrent-projection-dim + non-recurrent-projection-dim.
# See other configuration values below.
#
# Parameters of the class, and their defaults:
#   input='[-1]'             [Descriptor giving the input of the layer.]
#   cell-dim=-1            [Dimension of the cell]
#   recurrent-projection-dim [Dimension of the projection used in recurrent connections, e.g. cell-dim/4]
#   non-recurrent-projection-dim   [Dimension of the projection in non-recurrent connections,
#                                   in addition to recurrent-projection-dim, e.g. cell-dim/4]
#   delay=-1                 [Delay in the recurrent connections of the LSTM ]
#   clipping-threshold=30    [nnet3 LSTMs use a gradient clipping component at the recurrent connections.
#                             This is the threshold used to decide if clipping has to be activated ]
#   zeroing-interval=20      [interval at which we (possibly) zero out the recurrent derivatives.]
#   zeroing-threshold=15     [We only zero out the derivs every zeroing-interval, if derivs exceed this value.]
#   self-repair-scale-nonlinearity=1e-5      [A constant scaling the self-repair vector computed in derived classes of NonlinearComponent,
#                                       i.e.,  SigmoidComponent, TanhComponent and RectifiedLinearComponent ]
#   ng-per-element-scale-options=''   [Additional options used for the diagonal matrices in the LSTM ]
#   ng-affine-options=''              [Additional options used for the full matrices in the LSTM, can be used to do things like set biases to initialize to 1]
#   decay-time=-1            [If >0, an approximate maximum on how many frames
#                            can be remembered via summation into the cell
#                            contents c_t; enforced by putting a scaling factor
#                            of recurrence_scale = 1 - abs(delay)/decay_time on
#                            the recurrence, i.e. the term c_{t-1} in the LSTM
#                            equations.  E.g. setting this to 20 means no more
#                            than about 20 frames' worth of history,
#                            i.e. history since about t = t-20, can be
#                            accumulated in c_t.]
#  l2-regularize=0.0         Constant controlling l2 regularization for this layer
class XconfigLstmpLayer(XconfigLayerBase):
    """xconfig layer for 'lstmp-layer' / 'lstmp-batchnorm-layer' lines.

    Generates an LSTM sub-graph with output projections.  The main output is
    the node '<name>.rp_t' (or '<name>.rp_t_batchnorm' for the batchnorm
    variant), of dimension recurrent-projection-dim +
    non-recurrent-projection-dim.  The cell state '<name>.c_t' is exposed as
    the auxiliary output 'c_t', of dimension 'cell-dim'.
    """

    def __init__(self, first_token, key_to_value, prev_names = None):
        """Initialize the layer; `first_token` selects the plain or batchnorm variant."""
        # lstmp-batchnorm-layer is like lstmp-layer but followed by a batchnorm
        # component.
        assert first_token in ["lstmp-layer", "lstmp-batchnorm-layer"]
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        """Set self.config to the default value of every supported option."""
        self.config = {'input' : '[-1]',
                        'cell-dim' : -1, # this is a compulsory argument
                        'recurrent-projection-dim' : -1,  # defaults to cell-dim / 4
                        'non-recurrent-projection-dim' : -1, # defaults to
                                                             # recurrent-projection-dim
                        'clipping-threshold' : 30.0,
                        'delay' : -1,
                        'ng-per-element-scale-options' : ' max-change=0.75 ',
                        'ng-affine-options' : ' max-change=0.75 ',
                        'self-repair-scale-nonlinearity' : 0.00001,
                        'zeroing-interval' : 20,
                        'zeroing-threshold' : 15.0,
                        'dropout-proportion' : -1.0, # If -1.0, no dropout components will be added
                        'dropout-per-frame' : False,  # If false, regular dropout, not per frame.
                        'decay-time':  -1.0,
                       'l2-regularize': 0.0,
                       }

    def set_derived_configs(self):
        """Fill in the projection dims that were not given explicitly.

        recurrent-projection-dim defaults to cell-dim // 4, and
        non-recurrent-projection-dim defaults to recurrent-projection-dim.
        """
        if self.config['recurrent-projection-dim'] <= 0:
            # Floor division keeps the dimension an integer; with true division
            # a float such as 256.0 would be written into the generated config.
            self.config['recurrent-projection-dim'] = self.config['cell-dim'] // 4

        if self.config['non-recurrent-projection-dim'] <= 0:
            self.config['non-recurrent-projection-dim'] = \
               self.config['recurrent-projection-dim']

    def check_configs(self):
        """Validate the config values.

        Raises:
            RuntimeError: if any of the three dims is not positive, 'delay' is
                zero, the two projection dims together exceed 'cell-dim',
                'self-repair-scale-nonlinearity' is outside [0, 1], or
                'dropout-proportion' is neither -1.0 nor inside [0, 1].
        """
        for key in ['cell-dim', 'recurrent-projection-dim',
                    'non-recurrent-projection-dim']:
            if self.config[key] <= 0:
                raise RuntimeError("{0} has invalid value {1}.".format(
                    key, self.config[key]))

        if self.config['delay'] == 0:
            raise RuntimeError("delay cannot be zero")

        if (self.config['recurrent-projection-dim'] +
            self.config['non-recurrent-projection-dim'] >
            self.config['cell-dim']):
            raise RuntimeError("recurrent+non-recurrent projection dim exceeds "
                                "cell dim.")
        for key in ['self-repair-scale-nonlinearity']:
            if self.config[key] < 0.0 or self.config[key] > 1.0:
                # Report the offending key (not the layer type) with its value.
                raise RuntimeError("{0} has invalid value {1}."
                                   .format(key, self.config[key]))

        if ((self.config['dropout-proportion'] > 1.0 or
             self.config['dropout-proportion'] < 0.0) and
             self.config['dropout-proportion'] != -1.0 ):
             raise RuntimeError("dropout-proportion has invalid value {0}."
                                .format(self.config['dropout-proportion']))

    def auxiliary_outputs(self):
        """Return the names of the auxiliary outputs this layer exposes."""
        return ['c_t']

    def output_name(self, auxiliary_output = None):
        """Return the node name of the requested output.

        With no argument this is '<name>.rp_t', or '<name>.rp_t_batchnorm' for
        'lstmp-batchnorm-layer'; with a known auxiliary output it is
        '<name>.<auxiliary_output>'.

        Raises:
            Exception: if `auxiliary_output` is not a known auxiliary output.
        """
        node_name = ( 'rp_t_batchnorm' if self.layer_type == 'lstmp-batchnorm-layer'
                      else 'rp_t' )
        if auxiliary_output is not None:
            if auxiliary_output in self.auxiliary_outputs():
                node_name = auxiliary_output
            else:
                raise Exception("In {0} of type {1}, unknown auxiliary output name {2}".format(
                    self.name, self.layer_type, auxiliary_output))

        return '{0}.{1}'.format(self.name, node_name)

    def output_dim(self, auxiliary_output = None):
        """Return the dimension of the requested output.

        The main output has dimension recurrent-projection-dim +
        non-recurrent-projection-dim; the 'c_t' auxiliary output has
        dimension 'cell-dim'.

        Raises:
            Exception: if `auxiliary_output` is not a known auxiliary output.
        """
        if auxiliary_output is not None:
            if auxiliary_output in self.auxiliary_outputs():
                # Compare the argument itself; this used to test an undefined
                # local ('node_name') and raised NameError for 'c_t'.
                if auxiliary_output == 'c_t':
                    return self.config['cell-dim']
                # add code for other auxiliary_outputs here when we decide to expose them
            else:
                raise Exception("In {0} of type {1}, unknown auxiliary output name {2}".format(
                    self.name, self.layer_type, auxiliary_output))

        return self.config['recurrent-projection-dim'] + self.config['non-recurrent-projection-dim']

    def get_full_config(self):
        """Return a list of (config-name, line) pairs for this layer.

        Every generated line is emitted once for 'ref' and once for 'final'.
        """
        ans = []
        config_lines = self._generate_lstm_config()

        for line in config_lines:
            for config_name in ['ref', 'final']:
                # we do not support user specified matrices in LSTM initialization
                # so 'ref' and 'final' configs are the same.
                ans.append((config_name, line))
        return ans

    # convenience function to generate the LSTM config
    def _generate_lstm_config(self):
        """Build and return the list of nnet3 config lines for this LSTMP."""

        # assign some variables to reduce verbosity
        name = self.name
        # in the below code we will just call descriptor_strings as descriptors for conciseness
        input_dim = self.descriptors['input']['dim']
        input_descriptor = self.descriptors['input']['final-string']
        cell_dim = self.config['cell-dim']
        rec_proj_dim = self.config['recurrent-projection-dim']
        nonrec_proj_dim = self.config['non-recurrent-projection-dim']
        delay = self.config['delay']
        repair_nonlin = self.config['self-repair-scale-nonlinearity']
        repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else ''
        decay_time = self.config['decay-time']
        # we expect decay_time to be either -1, or large, like 10 or 50.
        # NOTE(review): decay-time == 0 is not treated as "disabled" and would
        # divide by zero here.
        recurrence_scale = (1.0 if decay_time < 0 else
                            1.0 - (abs(delay) / decay_time))
        assert recurrence_scale > 0   # or user may have set decay-time much
                                      # too small.
        # Options shared by both BackpropTruncationComponents (c and r).
        bptrunc_str = ("clipping-threshold={0}"
                      " zeroing-threshold={1}"
                      " zeroing-interval={2}"
                      " recurrence-interval={3}"
                      " scale={4}"
                      "".format(self.config['clipping-threshold'],
                                self.config['zeroing-threshold'],
                                self.config['zeroing-interval'],
                                abs(delay), recurrence_scale))
        affine_str = self.config['ng-affine-options']
        pes_str = self.config['ng-per-element-scale-options']
        dropout_proportion = self.config['dropout-proportion']
        dropout_per_frame = 'true' if self.config['dropout-per-frame'] else 'false'

        # Natural gradient per element scale parameters
        # Only add the default initialization if the user specified neither
        # param-mean nor param-stddev.
        if re.search('param-mean', pes_str) is None and \
           re.search('param-stddev', pes_str) is None:
           pes_str += " param-mean=0.0 param-stddev=1.0 "
        l2_regularize = self.config['l2-regularize']
        # The l2-regularize option is only written out when it is nonzero.
        l2_regularize_option = ('l2-regularize={0} '.format(l2_regularize)
                                if l2_regularize != 0.0 else '')

        configs = []

        # the equations implemented here are from Sak et. al. "Long Short-Term
        # Memory Recurrent Neural Network Architectures for Large Scale Acoustic
        # Modeling"
        # https://arxiv.org/pdf/1402.1128.pdf
        # See equations (7) to (14).
        # naming convention <layer-name>.W_<outputname>.<input_name>
        # e.g. Lstm1.W_i.xr for matrix providing output to gate i and operating
        # on an appended vector [x,r]
        configs.append("# Input gate control : W_i* matrices")
        configs.append("component name={0}.W_i.xr type=NaturalGradientAffineComponent input-dim={1} "
                       "output-dim={2} {3} {4}".format(name, input_dim + rec_proj_dim,
                                                       cell_dim, affine_str, l2_regularize_option))
        configs.append("# note : the cell outputs pass through a diagonal matrix")
        configs.append("component name={0}.w_i.c type=NaturalGradientPerElementScaleComponent "
                       "dim={1} {2} {3}".format(name, cell_dim, pes_str,
                                                l2_regularize_option))
        configs.append("# Forget gate control : W_f* matrices")
        configs.append("component name={0}.W_f.xr type=NaturalGradientAffineComponent input-dim={1} "
                       "output-dim={2} {3} {4}".format(name, input_dim + rec_proj_dim, cell_dim,
                                                       affine_str, l2_regularize_option))
        configs.append("# note : the cell outputs pass through a diagonal matrix")
        configs.append("component name={0}.w_f.c type=NaturalGradientPerElementScaleComponent  "
                       "dim={1} {2} {3}".format(name, cell_dim, pes_str, l2_regularize_option))

        configs.append("#  Output gate control : W_o* matrices")
        configs.append("component name={0}.W_o.xr type=NaturalGradientAffineComponent input-dim={1} "
                       "output-dim={2} {3} {4}".format(name, input_dim + rec_proj_dim, cell_dim,
                                                       affine_str, l2_regularize_option))
        configs.append("# note : the cell outputs pass through a diagonal matrix")
        configs.append("component name={0}.w_o.c type=NaturalGradientPerElementScaleComponent "
                       "dim={1} {2} {3}".format(name, cell_dim, pes_str, l2_regularize_option))

        configs.append("# Cell input matrices : W_c* matrices")
        configs.append("component name={0}.W_c.xr type=NaturalGradientAffineComponent input-dim={1} "
                       "output-dim={2} {3} {4}".format(name, input_dim + rec_proj_dim, cell_dim,
                                                       affine_str, l2_regularize_option))

        configs.append("# Defining the non-linearities")
        configs.append("component name={0}.i type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str))
        configs.append("component name={0}.f type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str))
        configs.append("component name={0}.o type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str))
        configs.append("component name={0}.g type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str))
        configs.append("component name={0}.h type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str))
        # A single dropout component is shared by the i, f and o gates below.
        if dropout_proportion != -1.0:
            configs.append("component name={0}.dropout type=DropoutComponent dim={1} "
                           "dropout-proportion={2} dropout-per-frame={3}"
                           .format(name, cell_dim, dropout_proportion, dropout_per_frame))
        configs.append("# Defining the components for other cell computations")
        configs.append("component name={0}.c1 type=ElementwiseProductComponent input-dim={1} output-dim={2}"
                       "".format(name, 2 * cell_dim, cell_dim))
        configs.append("component name={0}.c2 type=ElementwiseProductComponent input-dim={1} output-dim={2}"
                       "".format(name, 2 * cell_dim, cell_dim))
        configs.append("component name={0}.m type=ElementwiseProductComponent input-dim={1} output-dim={2}"
                       "".format(name, 2 * cell_dim, cell_dim))
        configs.append("component name={0}.c type=BackpropTruncationComponent dim={1} {2}"
                       "".format(name, cell_dim, bptrunc_str))

        # c1_t and c2_t defined below
        configs.append("component-node name={0}.c_t component={0}.c input=Sum({0}.c1_t, {0}.c2_t)".format(name))
        # The cell state from 'delay' frames away, used by the i/f gates and c1_t.
        delayed_c_t_descriptor = "IfDefined(Offset({0}.c_t, {1}))".format(name, delay)

        recurrent_connection = '{0}.r_t'.format(name)
        configs.append("# i_t")
        configs.append("component-node name={0}.i1_t component={0}.W_i.xr input=Append({1}, IfDefined(Offset({2}, {3})))"
                       "".format(name, input_descriptor, recurrent_connection, delay))
        configs.append("component-node name={0}.i2_t component={0}.w_i.c  input={1}".format(name, delayed_c_t_descriptor))
        if dropout_proportion != -1.0:
            configs.append("component-node name={0}.i_t_predrop component={0}.i input=Sum({0}.i1_t, {0}.i2_t)".format(name))
            configs.append("component-node name={0}.i_t component={0}.dropout input={0}.i_t_predrop".format(name))
        else:
            configs.append("component-node name={0}.i_t component={0}.i input=Sum({0}.i1_t, {0}.i2_t)".format(name))

        configs.append("# f_t")
        configs.append("component-node name={0}.f1_t component={0}.W_f.xr input=Append({1}, IfDefined(Offset({2}, {3})))"
                       "".format(name, input_descriptor, recurrent_connection, delay))
        configs.append("component-node name={0}.f2_t component={0}.w_f.c  input={1}".format(name, delayed_c_t_descriptor))
        if dropout_proportion != -1.0:
            configs.append("component-node name={0}.f_t_predrop component={0}.f input=Sum({0}.f1_t, {0}.f2_t)".format(name))
            configs.append("component-node name={0}.f_t component={0}.dropout input={0}.f_t_predrop".format(name))
        else:
            configs.append("component-node name={0}.f_t component={0}.f input=Sum({0}.f1_t, {0}.f2_t)".format(name))

        configs.append("# o_t")
        configs.append("component-node name={0}.o1_t component={0}.W_o.xr input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay))
        # unlike i_t and f_t, the output gate reads the current (undelayed) c_t.
        configs.append("component-node name={0}.o2_t component={0}.w_o.c input={0}.c_t".format(name))
        if dropout_proportion != -1.0:
            configs.append("component-node name={0}.o_t_predrop component={0}.o input=Sum({0}.o1_t, {0}.o2_t)".format(name))
            configs.append("component-node name={0}.o_t component={0}.dropout input={0}.o_t_predrop".format(name))
        else:
            configs.append("component-node name={0}.o_t component={0}.o input=Sum({0}.o1_t, {0}.o2_t)".format(name))

        configs.append("# h_t")
        configs.append("component-node name={0}.h_t component={0}.h input={0}.c_t".format(name))

        configs.append("# g_t")
        configs.append("component-node name={0}.g1_t component={0}.W_c.xr input=Append({1}, IfDefined(Offset({2}, {3})))"
                       "".format(name, input_descriptor, recurrent_connection, delay))
        configs.append("component-node name={0}.g_t component={0}.g input={0}.g1_t".format(name))

        configs.append("# parts of c_t")
        configs.append("component-node name={0}.c1_t component={0}.c1  input=Append({0}.f_t, {1})".format(name, delayed_c_t_descriptor))
        configs.append("component-node name={0}.c2_t component={0}.c2 input=Append({0}.i_t, {0}.g_t)".format(name))

        configs.append("# m_t")
        configs.append("component-node name={0}.m_t component={0}.m input=Append({0}.o_t, {0}.h_t)".format(name))

        # add the recurrent connections
        configs.append("# projection matrices : Wrm and Wpm")
        configs.append("component name={0}.W_rp.m type=NaturalGradientAffineComponent input-dim={1} "
                       "output-dim={2} {3} {4}".format(name, cell_dim, rec_proj_dim + nonrec_proj_dim,
                                                       affine_str, l2_regularize_option))
        configs.append("component name={0}.r type=BackpropTruncationComponent dim={1} {2}"
                       "".format(name, rec_proj_dim, bptrunc_str))

        configs.append("# r_t and p_t : rp_t will be the output (if we're not doing batchnorm)")
        configs.append("component-node name={0}.rp_t component={0}.W_rp.m input={0}.m_t"
                       "".format(name))
        # Only the first rec_proj_dim dims of rp_t feed the recurrence.
        configs.append("dim-range-node name={0}.r_t_preclip input-node={0}.rp_t dim-offset=0 "
                       "dim={1}".format(name, rec_proj_dim))
        configs.append("component-node name={0}.r_t component={0}.r input={0}.r_t_preclip".format(name))

        if self.layer_type == "lstmp-batchnorm-layer":
            # Add the batchnorm component, if requested to include batchnorm.
            configs.append("component name={0}.rp_t_batchnorm type=BatchNormComponent dim={1} ".format(
                name, rec_proj_dim + nonrec_proj_dim))
            configs.append("component-node name={0}.rp_t_batchnorm component={0}.rp_t_batchnorm "
                           "input={0}.rp_t".format(name))

        return configs


# This class is for lines like
#   'fast-lstm-layer name=lstm1 input=[-1] delay=-3'
# (you can also use the name 'fast-lstm-batchnorm-layer' if you want it to be followed
# by batchnorm).
# It generates an LSTM sub-graph without output projections.
# Unlike 'lstm-layer', the core nonlinearities of the LSTM are done in a special-purpose
# component (LstmNonlinearityComponent), and most of the affine parts of the LSTM are combined
# into one.
#
# The output dimension of the layer may be specified via 'cell-dim=xxx', but if not specified,
# the dimension defaults to the same as the input.
# See other configuration values below.
#
# Parameters of the class, and their defaults:
#   input='[-1]'             [Descriptor giving the input of the layer.]
#   cell-dim=-1              [Dimension of the cell]
#   delay=-1                 [Delay in the recurrent connections of the LSTM ]
#   clipping-threshold=30    [nnet3 LSTMs use a gradient clipping component at the recurrent connections.
#                             This is the threshold used to decide if clipping has to be activated ]
#   zeroing-interval=20      [interval at which we (possibly) zero out the recurrent derivatives.]
#   zeroing-threshold=15     [We only zero out the derivs every zeroing-interval, if derivs exceed this value.]
#   lstm-nonlinearity-options=' max-change=0.75 '  [Options string to pass into the LSTM nonlinearity component.]
#   ng-affine-options=' max-change=1.5 '           [Additional options used for the full matrices in the LSTM, can be used to
#                                      do things like set biases to initialize to 1]
#   decay-time=-1            [If >0, an approximate maximum on how many frames
#                            can be remembered via summation into the cell
#                            contents c_t; enforced by putting a scaling factor
#                            of recurrence_scale = 1 - abs(delay)/decay_time on
#                            the recurrence, i.e. the term c_{t-1} in the LSTM
#                            equations.  E.g. setting this to 20 means no more
#                            than about 20 frames' worth of history,
#                            i.e. history since about t = t-20, can be
#                            accumulated in c_t.]
#  l2-regularize=0.0         Constant controlling l2 regularization for this layer
class XconfigFastLstmLayer(XconfigLayerBase):
    """Xconfig layer for 'fast-lstm-layer' and 'fast-lstm-batchnorm-layer'.

    Generates the nnet3 config lines for an (unprojected) LSTM whose
    nonlinearities are computed by a single LstmNonlinearityComponent and
    whose gate matrices are combined into one affine component (W_all).

    The main output is node '<name>.m' (or '<name>.m_batchnorm' for the
    batchnorm variant).  The cell state is available as the auxiliary
    output 'c' (node '<name>.c'); that node is only written to the config
    if some other layer actually asked for it.
    """

    def __init__(self, first_token, key_to_value, prev_names = None):
        assert first_token in ["fast-lstm-layer", "fast-lstm-batchnorm-layer"]
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        """Set self.config to the default value of every supported option."""
        self.config = {'input':'[-1]',
                        # if left <= 0, set_derived_configs() replaces this
                        # with the dimension of the input.
                        'cell-dim' : -1,
                        'clipping-threshold' : 30.0,
                        'zeroing-interval' : 20,
                        'zeroing-threshold' : 15.0,
                        'delay' : -1,
                        # if you want to set 'self-repair-scale' (c.f. the
                        # self-repair-scale-nonlinearity config value in older LSTM layers), you can
                        # add 'self-repair-scale=xxx' to
                        # lstm-nonlinearity-options.
                        'lstm-nonlinearity-options' : ' max-change=0.75',
                        # the affine layer contains 4 of our old layers -> use a
                        # larger max-change than the normal value of 0.75.
                        'ng-affine-options' : ' max-change=1.5',
                        'l2-regularize': 0.0,
                        'decay-time':  -1.0
                        }
        # Becomes True as soon as output_name('c') or output_dim('c') is
        # called, i.e. as soon as something refers to the 'c' output.
        self.c_needed = False

    def set_derived_configs(self):
        """Default 'cell-dim' to the input dimension if it was not given."""
        if self.config['cell-dim'] <= 0:
            self.config['cell-dim'] = self.descriptors['input']['dim']

    def check_configs(self):
        """Raise RuntimeError if 'cell-dim' is not positive or 'delay' is 0."""
        key = 'cell-dim'
        if self.config['cell-dim'] <= 0:
            raise RuntimeError("cell-dim has invalid value {0}.".format(self.config[key]))
        if self.config['delay'] == 0:
            raise RuntimeError("delay cannot be zero")


    def auxiliary_outputs(self):
        """Return the names of the auxiliary outputs this layer offers."""
        return ['c']

    def output_name(self, auxiliary_output = None):
        """Return the name of the node providing the requested output.

        With auxiliary_output=None this is the main output; with 'c' it is
        the cell-state node (and the layer remembers that the node has to
        be generated).  Any other name raises RuntimeError.
        """
        node_name = ('m_batchnorm' if self.layer_type == 'fast-lstm-batchnorm-layer'
                      else 'm')
        if auxiliary_output is not None:
            if auxiliary_output == 'c':
                node_name = 'c'
                self.c_needed = True
            else:
                raise RuntimeError("Unknown auxiliary output name {0}".format(auxiliary_output))
        return '{0}.{1}'.format(self.name, node_name)

    def output_dim(self, auxiliary_output = None):
        """Return the dimension of the requested output (always 'cell-dim').

        Asking for the 'c' output marks it as needed; any auxiliary output
        name other than 'c' raises RuntimeError.
        """
        if auxiliary_output is not None:
            if auxiliary_output == 'c':
                self.c_needed = True
                return self.config['cell-dim']
                # add code for other auxiliary_outputs here when we decide to expose them
            else:
                raise RuntimeError("Unknown auxiliary output name {0}".format(auxiliary_output))
        return self.config['cell-dim']

    def get_full_config(self):
        """Return a list of (config-name, line) pairs for this layer.

        Each generated line is emitted for both the 'ref' and the 'final'
        config.
        """
        ans = []
        config_lines = self._generate_lstm_config()

        for line in config_lines:
            for config_name in ['ref', 'final']:
                # we do not support user specified matrices in LSTM initialization
                # so 'ref' and 'final' configs are the same.
                ans.append((config_name, line))
        return ans

    # convenience function to generate the LSTM config
    def _generate_lstm_config(self):
        """Return the list of nnet3 config lines describing this layer."""

        # assign some variables to reduce verbosity
        name = self.name
        # in the below code we will just call descriptor_strings as descriptors for conciseness
        input_dim = self.descriptors['input']['dim']
        input_descriptor = self.descriptors['input']['final-string']
        cell_dim = self.config['cell-dim']
        delay = self.config['delay']
        affine_str = self.config['ng-affine-options']
        l2_regularize = self.config['l2-regularize']
        l2_regularize_option = ('l2-regularize={0} '.format(l2_regularize)
                                if l2_regularize != 0.0 else '')
        decay_time = self.config['decay-time']
        # we expect decay_time to be either -1, or large, like 10 or 50.
        recurrence_scale = (1.0 if decay_time < 0 else
                            1.0 - (abs(delay) / decay_time))
        # a non-positive scale means the user set decay-time much too small.
        assert recurrence_scale > 0, (
            "decay-time={0} is too small for delay={1}".format(decay_time, delay))
        bptrunc_str = ("clipping-threshold={0}"
                      " zeroing-threshold={1}"
                      " zeroing-interval={2}"
                      " recurrence-interval={3}"
                      " scale={4}"
                      "".format(self.config['clipping-threshold'],
                                self.config['zeroing-threshold'],
                                self.config['zeroing-interval'],
                                abs(delay), recurrence_scale))
        lstm_str = self.config['lstm-nonlinearity-options']


        configs = []

        # the equations implemented here are equations (1) through (6) of
        # https://arxiv.org/pdf/1402.1128.pdf.
        # naming convention
        # <layer-name>.W_<outputname>.<input_name> e.g. Lstm1.W_i.xr for matrix
        # providing output to gate i and operating on an appended vector [x,r]
        configs.append("### Begin LTSM layer '{0}'".format(name))
        configs.append("# Gate control: contains W_i, W_f, W_c and W_o matrices as blocks.")

        configs.append("component name={0}.W_all type=NaturalGradientAffineComponent input-dim={1} "
                       "output-dim={2} {3} {4}".format(name, input_dim + cell_dim, cell_dim * 4,
                                                       affine_str, l2_regularize_option))

        configs.append("# The core LSTM nonlinearity, implemented as a single component.")
        configs.append("# Input = (i_part, f_part, c_part, o_part, c_{t-1}), output = (c_t, m_t)")
        configs.append("# See cu-math.h:ComputeLstmNonlinearity() for details.")
        configs.append("component name={0}.lstm_nonlin type=LstmNonlinearityComponent "
                       "cell-dim={1} {2} {3}".format(name, cell_dim, lstm_str,
                                                     l2_regularize_option))

        configs.append("# Component for backprop truncation, to avoid gradient blowup in long training examples.")
        configs.append("component name={0}.cm_trunc type=BackpropTruncationComponent dim={1} "
                       "{2}".format(name, 2 * cell_dim, bptrunc_str))

        configs.append("###  Nodes for the components above.")
        configs.append("component-node name={0}.W_all component={0}.W_all input=Append({1}, "
                       "IfDefined(Offset({0}.m_trunc, {2})))".format(
                           name, input_descriptor, delay))

        configs.append("component-node name={0}.lstm_nonlin component={0}.lstm_nonlin "
                       "input=Append({0}.W_all, IfDefined(Offset({0}.c_trunc, {1})))".format(
                           name, delay))
        if self.c_needed:
            # Only emit the '.c' node when something refers to it; an unused
            # node generates a warning.  c_t is the first cell_dim outputs
            # of the nonlinearity (output = (c_t, m_t)).
            configs.append("dim-range-node name={0}.c input-node={0}.lstm_nonlin "
                           "dim-offset=0 dim={1}".format(name, cell_dim))
        configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin dim-offset={1} dim={1}".format(name, cell_dim))
        configs.append("component-node name={0}.cm_trunc component={0}.cm_trunc input={0}.lstm_nonlin".format(name))
        configs.append("dim-range-node name={0}.c_trunc input-node={0}.cm_trunc dim-offset=0 dim={1}".format(name, cell_dim))
        configs.append("dim-range-node name={0}.m_trunc input-node={0}.cm_trunc dim-offset={1} dim={1}".format(name, cell_dim))

        if self.layer_type == "fast-lstm-batchnorm-layer":
            # Add the batchnorm component, if requested to include batchnorm.
            configs.append("component name={0}.m_batchnorm type=BatchNormComponent dim={1} ".format(
                name, cell_dim))
            configs.append("component-node name={0}.m_batchnorm component={0}.m_batchnorm "
                           "input={0}.m".format(name))
        configs.append("### End LTSM layer '{0}'".format(name))
        return configs


# This class is for lines like
#   'lstmb-layer name=lstm1 input=[-1] delay=-3'
#
# LSTMB is not something we've published; it's LSTM with a bottleneck in the
# middle of the W_all matrix (where W_all is a matrix that combines the 8 full
# matrices of standard LSTM).  W_all is factored into W_all_a and W_all_b, where
# W_all_a is constrained to have orthonormal rows (this keeps it training stably).
#
# It also contains a couple of other improvements: W_all_b is followed by
# trainable ScaleAndOffsetComponent (this is a bit like the idea from the
# publication "Self-stabilized deep neural network" by Ghahremani and Droppo).
# And the LSTM is followed by a batchnorm component (this is by default; it's not
# part of the layer name, like lstmb-batchnorm-layer).

#
# The output dimension of the layer may be specified via 'cell-dim=xxx', but if not specified,
# the dimension defaults to the same as the input.
# See other configuration values below.
#
# Parameters of the class, and their defaults:
#   input='[-1]'             [Descriptor giving the input of the layer.]
#   cell-dim=-1              [Dimension of the cell]
#   bottleneck-dim=-1        [Bottleneck dim, should be less than cell-dim plus the input dim.]
#   delay=-1                 [Delay in the recurrent connections of the LSTM ]
#   clipping-threshold=30    [nnet3 LSTMs use a gradient clipping component at the recurrent connections.
#                             This is the threshold used to decide if clipping has to be activated ]
#   zeroing-interval=20      [interval at which we (possibly) zero out the recurrent derivatives.]
#   zeroing-threshold=15     [We only zero out the derivs every zeroing-interval, if derivs exceed this value.]
#   lstm-nonlinearity-options=' max-change=0.75 '  [Options string to pass into the LSTM nonlinearity component.]
#   ng-affine-options=' max-change=1.5 '           [Additional options used for the full matrices in the LSTM, can be used to
#                                      do things like set biases to initialize to 1]
#   decay-time=-1            [If >0, an approximate maximum on how many frames
#                            can be remembered via summation into the cell
#                            contents c_t; enforced by putting a scaling factor
#                            of recurrence_scale = 1 - abs(delay)/decay_time on
#                            the recurrence, i.e. the term c_{t-1} in the LSTM
#                            equations.  E.g. setting this to 20 means no more
#                            than about 20 frames' worth of history,
#                            i.e. history since about t = t-20, can be
#                            accumulated in c_t.]
#  l2-regularize=0.0         Constant controlling l2 regularization for this layer
class XconfigLstmbLayer(XconfigLayerBase):
    """Xconfig layer for 'lstmb-layer' lines.

    Like the fast LSTM layer, but the combined gate matrix is factored
    into W_all_a (a LinearComponent with an orthonormal constraint) and
    W_all_b, followed by a ScaleAndOffsetComponent.  The output of the
    layer is always the batchnorm node '<name>.m_batchnorm'.
    """

    def __init__(self, first_token, key_to_value, prev_names = None):
        assert first_token == 'lstmb-layer'
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        """Set self.config to the default value of every supported option."""
        defaults = {}
        defaults['input'] = '[-1]'
        # if left <= 0, set_derived_configs() uses the input dimension.
        defaults['cell-dim'] = -1
        # required: check_configs() rejects values <= 0.
        defaults['bottleneck-dim'] = -1
        defaults['clipping-threshold'] = 30.0
        defaults['zeroing-interval'] = 20
        defaults['zeroing-threshold'] = 15.0
        defaults['orthonormal-constraint'] = 1.0
        defaults['delay'] = -1
        defaults['lstm-nonlinearity-options'] = ' max-change=0.75'
        # scale applied to m_trunc where it is fed back into W_all_a
        # (to balance its size with the input).
        defaults['self-scale'] = 1.0
        # the affine layer contains 4 of our old layers -> use a
        # larger max-change than the normal value of 0.75.
        defaults['ng-affine-options'] = ' max-change=1.5'
        defaults['l2-regularize'] = 0.0
        defaults['decay-time'] = -1.0
        self.config = defaults

    def set_derived_configs(self):
        """Default 'cell-dim' to the input dimension if it was not given."""
        if self.config['cell-dim'] > 0:
            return
        self.config['cell-dim'] = self.descriptors['input']['dim']

    def check_configs(self):
        """Raise RuntimeError on a non-positive dim or a zero delay."""
        for key in ('cell-dim', 'bottleneck-dim'):
            value = self.config[key]
            if value <= 0:
                raise RuntimeError("{0} has invalid value {1}.".format(key, value))
        if self.config['delay'] == 0:
            raise RuntimeError("delay cannot be zero")

    def auxiliary_outputs(self):
        """This layer exposes no auxiliary outputs."""
        return []

    def output_name(self, auxiliary_output = None):
        """Return the name of the (batchnorm) output node of the layer."""
        assert auxiliary_output is None
        return '{0}.{1}'.format(self.name, 'm_batchnorm')

    def output_dim(self, auxiliary_output = None):
        """Return the output dimension, which equals 'cell-dim'."""
        assert auxiliary_output is None
        return self.config['cell-dim']

    def get_full_config(self):
        """Return (config-name, line) pairs, each line for 'ref' and 'final'."""
        # we do not support user specified matrices in LSTM initialization
        # so 'ref' and 'final' configs are the same.
        return [(config_name, line)
                for line in self._generate_lstm_config()
                for config_name in ['ref', 'final']]

    # convenience function to generate the LSTM config
    def _generate_lstm_config(self):
        """Return the list of nnet3 config lines describing this layer."""
        cfg = self.config
        layer = self.name
        in_dim = self.descriptors['input']['dim']
        in_desc = self.descriptors['input']['final-string']
        cdim = cfg['cell-dim']
        bdim = cfg['bottleneck-dim']
        m_scale = cfg['self-scale']
        offset = cfg['delay']
        affine_opts = cfg['ng-affine-options']
        nonlin_opts = cfg['lstm-nonlinearity-options']

        l2 = cfg['l2-regularize']
        l2_opt = ''
        if l2 != 0.0:
            l2_opt = 'l2-regularize={0} '.format(l2)

        # we expect decay-time to be either -1, or large, like 10 or 50.
        decay = cfg['decay-time']
        if decay < 0:
            rec_scale = 1.0
        else:
            rec_scale = 1.0 - (abs(offset) / decay)
        # a non-positive scale means decay-time was set much too small.
        assert rec_scale > 0

        trunc_opts = " ".join([
            "clipping-threshold={0}".format(cfg['clipping-threshold']),
            "zeroing-threshold={0}".format(cfg['zeroing-threshold']),
            "zeroing-interval={0}".format(cfg['zeroing-interval']),
            "recurrence-interval={0}".format(abs(offset)),
            "scale={0}".format(rec_scale),
        ])

        # See XconfigFastLstmLayer to understand what's going on here.  This
        # differs from that code by a factorization of the W_all matrix into
        # two pieces with a smaller dimension in between (the first piece
        # being constrained to have orthonormal rows).  No l2 regularization
        # is applied to W_all_a since, with the orthonormality constraint,
        # it would be meaningless.
        return [
            "### Begin LTSM layer '{0}'".format(layer),
            ("component name={0}.W_all_a type=LinearComponent input-dim={1} "
             "orthonormal-constraint={2} output-dim={3} {4}").format(
                 layer, in_dim + cdim, cfg['orthonormal-constraint'],
                 bdim, affine_opts),
            ("component name={0}.W_all_b type=LinearComponent input-dim={1} "
             "output-dim={2} {3} {4}").format(
                 layer, bdim, cdim * 4, affine_opts, l2_opt),
            ("component name={0}.W_all_b_so type=ScaleAndOffsetComponent dim={1} "
             "max-change=0.75").format(layer, cdim * 4),

            "# The core LSTM nonlinearity, implemented as a single component.",
            "# Input = (i_part, f_part, c_part, o_part, c_{t-1}), output = (c_t, m_t)",
            "# See cu-math.h:ComputeLstmNonlinearity() for details.",
            ("component name={0}.lstm_nonlin type=LstmNonlinearityComponent "
             "cell-dim={1} {2} {3}").format(layer, cdim, nonlin_opts, l2_opt),
            "# Component for backprop truncation, to avoid gradient blowup in long training examples.",
            "component name={0}.cm_trunc type=BackpropTruncationComponent dim={1} {2}".format(
                layer, 2 * cdim, trunc_opts),
            "component name={0}.m_batchnorm type=BatchNormComponent dim={1} ".format(
                layer, cdim),

            "###  Nodes for the components above.",
            ("component-node name={0}.W_all_a component={0}.W_all_a input=Append({1}, "
             "IfDefined(Offset(Scale({2}, {0}.m_trunc), {3})))").format(
                 layer, in_desc, m_scale, offset),
            ("component-node name={0}.W_all_b component={0}.W_all_b "
             "input={0}.W_all_a").format(layer),
            ("component-node name={0}.W_all_b_so component={0}.W_all_b_so "
             "input={0}.W_all_b").format(layer),
            ("component-node name={0}.lstm_nonlin component={0}.lstm_nonlin "
             "input=Append({0}.W_all_b_so, IfDefined(Offset({0}.c_trunc, {1})))").format(
                 layer, offset),
            ("dim-range-node name={0}.m input-node={0}.lstm_nonlin dim-offset={1} "
             "dim={1}").format(layer, cdim),
            "component-node name={0}.cm_trunc component={0}.cm_trunc input={0}.lstm_nonlin".format(layer),
            ("dim-range-node name={0}.c_trunc input-node={0}.cm_trunc dim-offset=0 "
             "dim={1}").format(layer, cdim),
            ("dim-range-node name={0}.m_trunc input-node={0}.cm_trunc dim-offset={1} "
             "dim={1}").format(layer, cdim),
            ("component-node name={0}.m_batchnorm component={0}.m_batchnorm "
             "input={0}.m").format(layer),
            "### End LTSM layer '{0}'".format(layer),
        ]


# This class is for lines like
#   'fast-lstmp-layer name=lstm1 input=[-1] delay=-3'
# or:
#   'fast-lstmp-layer name=lstm1 input=[-1] delay=-3 cell-dim=1024 recurrent-projection-dim=512 non-recurrent-projection-dim=512'
# (you can also use the name 'fast-lstmp-batchnorm-layer' if you want it to be followed
# by batchnorm).
# It generates an LSTM sub-graph with output projections (i.e. a projected LSTM, AKA LSTMP).
# Unlike 'lstmp-layer', the core nonlinearities of the LSTM are done in a special-purpose
# component (LstmNonlinearityComponent), and most of the affine parts of the LSTM are combined
# into one.
#
# 'cell-dim=xxx' must be specified (it has no usable default).  The output
# dimension of the layer is recurrent-projection-dim + non-recurrent-projection-dim.
# See other configuration values below.
#
# Parameters of the class, and their defaults:
#   input='[-1]'             [Descriptor giving the input of the layer.]
#   cell-dim=-1              [Dimension of the cell]
#   recurrent-projection-dim [Dimension of the projection used in recurrent connections, e.g. cell-dim/4]
#   non-recurrent-projection-dim   [Dimension of the projection in non-recurrent connections,
#                                   in addition to recurrent-projection-dim, e.g. cell-dim/4]
#   delay=-1                 [Delay in the recurrent connections of the LSTM ]
#   clipping-threshold=30    [nnet3 LSTMs use a gradient clipping component at the recurrent connections.
#                             This is the threshold used to decide if clipping has to be activated ]
#   zeroing-interval=20      [interval at which we (possibly) zero out the recurrent derivatives.]
#   zeroing-threshold=15     [We only zero out the derivs every zeroing-interval, if derivs exceed this value.]
#   lstm-nonlinearity-options=' max-change=0.75 '  [Options string to pass into the LSTM nonlinearity component.]
#   ng-affine-options=' max-change=1.5 '           [Additional options used for the full matrices in the LSTM, can be used to
#                                      do things like set biases to initialize to 1]
#   decay-time=-1            [If >0, an approximate maximum on how many frames
#                            can be remembered via summation into the cell
#                            contents c_t; enforced by putting a scaling factor
#                            of recurrence_scale = 1 - abs(delay)/decay_time on
#                            the recurrence, i.e. the term c_{t-1} in the LSTM
#                            equations.  E.g. setting this to 20 means no more
#                            than about 20 frames' worth of history,
#                            i.e. history since about t = t-20, can be
#                            accumulated in c_t.]
#  l2-regularize=0.0         Constant controlling l2 regularization for this layer
class XconfigFastLstmpLayer(XconfigLayerBase):
    """Xconfig layer for 'fast-lstmp-layer' and 'fast-lstmp-batchnorm-layer'.

    Generates the nnet3 config lines for a projected LSTM (LSTMP) whose
    nonlinearities are computed by a single LstmNonlinearityComponent.

    The main output is node '<name>.rp' (or '<name>.rp_batchnorm' for the
    batchnorm variant), of dimension recurrent-projection-dim +
    non-recurrent-projection-dim.  The cell state is available as the
    auxiliary output 'c_t', which is provided by node '<name>.c'.
    """

    def __init__(self, first_token, key_to_value, prev_names = None):
        assert first_token in ['fast-lstmp-layer', 'fast-lstmp-batchnorm-layer']
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        """Set self.config to the default value of every supported option."""
        self.config = {'input':'[-1]',
                        'cell-dim' : -1, # this is a compulsory argument
                        'recurrent-projection-dim' : -1,
                        'non-recurrent-projection-dim' : -1,
                        'clipping-threshold' : 30.0,
                        'delay' : -1,
                        # if you want to set 'self-repair-scale' (c.f. the
                        # self-repair-scale-nonlinearity config value in older LSTM layers), you can
                        # add 'self-repair-scale=xxx' to
                        # lstm-nonlinearity-options.
                        'lstm-nonlinearity-options' : ' max-change=0.75',
                        # the affine layer contains 4 of our old layers -> use a
                        # larger max-change than the normal value of 0.75.
                        'ng-affine-options' : ' max-change=1.5',
                        'l2-regularize': 0.0,
                        'decay-time':  -1.0,
                        'zeroing-interval' : 20,
                        'zeroing-threshold' : 15.0,
                        'dropout-proportion' : -1.0, # If -1.0, no dropout will
                                                     # be used)
                         }

    def set_derived_configs(self):
        """Fill in the projection dims that were not given explicitly.

        recurrent-projection-dim defaults to a quarter of cell-dim, and
        non-recurrent-projection-dim defaults to recurrent-projection-dim.
        """
        if self.config['recurrent-projection-dim'] <= 0:
            # Integer division: the value is written into the config as a
            # dimension, so it must stay an int (plain '/' gives a float in
            # Python 3).
            self.config['recurrent-projection-dim'] = self.config['cell-dim'] // 4

        if self.config['non-recurrent-projection-dim'] <= 0:
            self.config['non-recurrent-projection-dim'] = \
               self.config['recurrent-projection-dim']


    def check_configs(self):
        """Raise RuntimeError if any config value is invalid."""
        for key in ['cell-dim', 'recurrent-projection-dim',
                    'non-recurrent-projection-dim']:
            if self.config[key] <= 0:
                raise RuntimeError("{0} has invalid value {1}.".format(
                    key, self.config[key]))
        if self.config['delay'] == 0:
            raise RuntimeError("delay cannot be zero")
        if (self.config['recurrent-projection-dim'] +
            self.config['non-recurrent-projection-dim'] >
            self.config['cell-dim']):
            raise RuntimeError("recurrent+non-recurrent projection dim exceeds "
                                "cell dim")
        # dropout-proportion must be in [0, 1], or exactly -1.0 (= no dropout).
        if ((self.config['dropout-proportion'] > 1.0 or
             self.config['dropout-proportion'] < 0.0) and
             self.config['dropout-proportion'] != -1.0 ):
            raise RuntimeError("dropout-proportion has invalid value {0}.".format(self.config['dropout-proportion']))


    def auxiliary_outputs(self):
        """Return the names of the auxiliary outputs this layer offers."""
        return ['c_t']

    def output_name(self, auxiliary_output = None):
        """Return the name of the node providing the requested output.

        With auxiliary_output=None this is the main output node.  An
        unknown auxiliary output name raises RuntimeError.
        """
        node_name = ('rp_batchnorm' if self.layer_type == 'fast-lstmp-batchnorm-layer'
                     else 'rp')
        if auxiliary_output is not None:
            if auxiliary_output in self.auxiliary_outputs():
                # The cell state c_t is generated as node '<name>.c' (see
                # _generate_lstm_config); there is no node named 'c_t'.
                node_name = 'c' if auxiliary_output == 'c_t' else auxiliary_output
            else:
                raise RuntimeError("Unknown auxiliary output name {0}".format(auxiliary_output))

        return '{0}.{1}'.format(self.name, node_name)

    def output_dim(self, auxiliary_output = None):
        """Return the dimension of the requested output.

        The main output has dimension recurrent-projection-dim +
        non-recurrent-projection-dim; the 'c_t' auxiliary output has
        dimension cell-dim.  An unknown auxiliary output name raises
        RuntimeError.
        """
        if auxiliary_output is not None:
            if auxiliary_output in self.auxiliary_outputs():
                if auxiliary_output == 'c_t':
                    return self.config['cell-dim']
                # add code for other auxiliary_outputs here when we decide to expose them
            else:
                raise RuntimeError("Unknown auxiliary output name {0}".format(auxiliary_output))
        return self.config['recurrent-projection-dim'] + \
               self.config['non-recurrent-projection-dim']

    def get_full_config(self):
        """Return a list of (config-name, line) pairs for this layer.

        Each generated line is emitted for both the 'ref' and the 'final'
        config.
        """
        ans = []
        config_lines = self._generate_lstm_config()

        for line in config_lines:
            for config_name in ['ref', 'final']:
                # we do not support user specified matrices in LSTM initialization
                # so 'ref' and 'final' configs are the same.
                ans.append((config_name, line))
        return ans

    # convenience function to generate the LSTM config
    def _generate_lstm_config(self):
        """Return the list of nnet3 config lines describing this layer."""
        # assign some variables to reduce verbosity
        name = self.name
        # in the below code we will just call descriptor_strings as descriptors for conciseness
        input_dim = self.descriptors['input']['dim']
        input_descriptor = self.descriptors['input']['final-string']
        cell_dim = self.config['cell-dim']
        delay = self.config['delay']
        rec_proj_dim = self.config['recurrent-projection-dim']
        nonrec_proj_dim = self.config['non-recurrent-projection-dim']
        affine_str = self.config['ng-affine-options']
        decay_time = self.config['decay-time']
        # we expect decay_time to be either -1, or large, like 10 or 50.
        recurrence_scale = (1.0 if decay_time < 0 else
                            1.0 - (abs(delay) / decay_time))
        assert recurrence_scale > 0   # or user may have set decay-time much
                                      # too small.
        bptrunc_str = ("clipping-threshold={0}"
                      " zeroing-threshold={1}"
                      " zeroing-interval={2}"
                      " recurrence-interval={3}"
                      " scale={4}"
                      "".format(self.config['clipping-threshold'],
                                self.config['zeroing-threshold'],
                                self.config['zeroing-interval'],
                                abs(delay), recurrence_scale))

        lstm_str = self.config['lstm-nonlinearity-options']
        dropout_proportion = self.config['dropout-proportion']
        l2_regularize = self.config['l2-regularize']
        l2_regularize_option = ('l2-regularize={0} '.format(l2_regularize)
                                if l2_regularize != 0.0 else '')

        configs = []

        # the equations implemented here are from Sak et. al. "Long Short-Term
        # Memory Recurrent Neural Network Architectures for Large Scale Acoustic
        # Modeling"
        # https://arxiv.org/pdf/1402.1128.pdf
        # See equations (7) to (14).
        # naming convention
        # <layer-name>.W_<outputname>.<input_name> e.g. Lstm1.W_i.xr for matrix providing output to gate i and operating on an appended vector [x,r]
        configs.append("##  Begin LTSM layer '{0}'".format(name))
        configs.append("# Gate control: contains W_i, W_f, W_c and W_o matrices as blocks.")
        configs.append("component name={0}.W_all type=NaturalGradientAffineComponent input-dim={1} "
                       "output-dim={2} {3} {4}".format(
                           name, input_dim + rec_proj_dim, cell_dim * 4,
                           affine_str, l2_regularize_option))
        configs.append("# The core LSTM nonlinearity, implemented as a single component.")
        configs.append("# Input = (i_part, f_part, c_part, o_part, c_{t-1}), output = (c_t, m_t)")
        configs.append("# See cu-math.h:ComputeLstmNonlinearity() for details.")
        configs.append("component name={0}.lstm_nonlin type=LstmNonlinearityComponent cell-dim={1} "
                       "use-dropout={2} {3} {4}"
                       .format(name, cell_dim,
                               "true" if dropout_proportion != -1.0 else "false",
                               lstm_str, l2_regularize_option))
        configs.append("# Component for backprop truncation, to avoid gradient blowup in long training examples.")
        configs.append("component name={0}.cr_trunc type=BackpropTruncationComponent "
                       "dim={1} {2}".format(name, cell_dim + rec_proj_dim, bptrunc_str))
        if dropout_proportion != -1.0:
            configs.append("component name={0}.dropout_mask type=DropoutMaskComponent output-dim=3 "
                           "dropout-proportion={1} "
                           .format(name, dropout_proportion))
        configs.append("# Component specific to 'projected' LSTM (LSTMP), contains both recurrent")
        configs.append("# and non-recurrent projections")
        configs.append("component name={0}.W_rp type=NaturalGradientAffineComponent "
                       "input-dim={1} output-dim={2} {3} {4}".format(
                           name, cell_dim, rec_proj_dim + nonrec_proj_dim,
                           affine_str, l2_regularize_option))
        configs.append("###  Nodes for the components above.")
        configs.append("component-node name={0}.W_all component={0}.W_all input=Append({1}, "
                       "IfDefined(Offset({0}.r_trunc, {2})))".format(name, input_descriptor, delay))

        if dropout_proportion != -1.0:
            # note: the 'input' is a don't-care as the component never uses it; it's required
            # in component-node lines.
            configs.append("component-node name={0}.dropout_mask component={0}.dropout_mask "
                           "input={0}.dropout_mask".format(name))
            configs.append("component-node name={0}.lstm_nonlin component={0}.lstm_nonlin "
                           "input=Append({0}.W_all, IfDefined(Offset({0}.c_trunc, {1})), "
                           "{0}.dropout_mask)".format(name, delay))
        else:
            configs.append("component-node name={0}.lstm_nonlin component={0}.lstm_nonlin "
                           "input=Append({0}.W_all, IfDefined(Offset({0}.c_trunc, {1})))".format(
                               name, delay))
        configs.append("dim-range-node name={0}.c input-node={0}.lstm_nonlin "
                       "dim-offset=0 dim={1}".format(name, cell_dim))
        configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin "
                       "dim-offset={1} dim={1}".format(name, cell_dim))
        configs.append("# {0}.rp is the output node of this layer (if we're not "
                       "including batchnorm)".format(name))
        configs.append("component-node name={0}.rp component={0}.W_rp input={0}.m".format(name))
        configs.append("dim-range-node name={0}.r input-node={0}.rp dim-offset=0 "
                       "dim={1}".format(name, rec_proj_dim))
        configs.append("# Note: it's not 100% efficient that we have to stitch the c")
        configs.append("# and r back together to truncate them but it probably")
        configs.append("# makes the deriv truncation more accurate .")
        configs.append("component-node name={0}.cr_trunc component={0}.cr_trunc "
                       "input=Append({0}.c, {0}.r)".format(name))
        configs.append("dim-range-node name={0}.c_trunc input-node={0}.cr_trunc "
                       "dim-offset=0 dim={1}".format(name, cell_dim))
        configs.append("dim-range-node name={0}.r_trunc input-node={0}.cr_trunc "
                       "dim-offset={1} dim={2}".format(name, cell_dim, rec_proj_dim))
        if self.layer_type == "fast-lstmp-batchnorm-layer":
            # Add the batchnorm component, if requested to include batchnorm.
            configs.append("component name={0}.rp_batchnorm type=BatchNormComponent dim={1} ".format(
                name, rec_proj_dim + nonrec_proj_dim))
            configs.append("component-node name={0}.rp_batchnorm component={0}.rp_batchnorm "
                           "input={0}.rp".format(name))
        configs.append("### End LSTM Layer '{0}'".format(name))

        return configs


================================================
FILE: egs/steps/libs/nnet3/xconfig/parser.py
================================================
# Copyright 2016    Johns Hopkins University (Dan Povey)
#           2016    Vijayaditya Peddinti
# Apache 2.0.

""" This module contains the top level xconfig parsing functions.
"""

from __future__ import print_function

import logging
import sys
import libs.nnet3.xconfig.layers as xlayers
import libs.nnet3.xconfig.utils as xutils

import libs.common as common_lib


# Dispatch table from the first token of an xconfig line to the class that
# implements that layer type.  A new layer type must be registered here
# before the parser will accept it.
config_to_layer = {
    # Input and output nodes.
    'input': xlayers.XconfigInputLayer,
    'output': xlayers.XconfigTrivialOutputLayer,
    'output-layer': xlayers.XconfigOutputLayer,
    # Basic (affine + nonlinearity / normalization) layers.
    'relu-layer': xlayers.XconfigBasicLayer,
    'relu-renorm-layer': xlayers.XconfigBasicLayer,
    'relu-batchnorm-dropout-layer': xlayers.XconfigBasicLayer,
    'relu-dropout-layer': xlayers.XconfigBasicLayer,
    'relu-batchnorm-layer': xlayers.XconfigBasicLayer,
    'relu-batchnorm-so-layer': xlayers.XconfigBasicLayer,
    'batchnorm-so-relu-layer': xlayers.XconfigBasicLayer,
    'batchnorm-layer': xlayers.XconfigBasicLayer,
    'sigmoid-layer': xlayers.XconfigBasicLayer,
    'tanh-layer': xlayers.XconfigBasicLayer,
    'fixed-affine-layer': xlayers.XconfigFixedAffineLayer,
    'idct-layer': xlayers.XconfigIdctLayer,
    'affine-layer': xlayers.XconfigAffineLayer,
    # LSTM variants.
    'lstm-layer': xlayers.XconfigLstmLayer,
    'lstmp-layer': xlayers.XconfigLstmpLayer,
    'lstmp-batchnorm-layer': xlayers.XconfigLstmpLayer,
    'fast-lstm-layer': xlayers.XconfigFastLstmLayer,
    'fast-lstm-batchnorm-layer': xlayers.XconfigFastLstmLayer,
    'fast-lstmp-layer': xlayers.XconfigFastLstmpLayer,
    'fast-lstmp-batchnorm-layer': xlayers.XconfigFastLstmpLayer,
    'lstmb-layer': xlayers.XconfigLstmbLayer,
    # Statistics pooling.
    'stats-layer': xlayers.XconfigStatsLayer,
    # Convolutional layers and residual blocks.
    'relu-conv-layer': xlayers.XconfigConvLayer,
    'conv-layer': xlayers.XconfigConvLayer,
    'conv-relu-layer': xlayers.XconfigConvLayer,
    'conv-renorm-layer': xlayers.XconfigConvLayer,
    'relu-conv-renorm-layer': xlayers.XconfigConvLayer,
    'batchnorm-conv-layer': xlayers.XconfigConvLayer,
    'conv-relu-renorm-layer': xlayers.XconfigConvLayer,
    'batchnorm-conv-relu-layer': xlayers.XconfigConvLayer,
    'relu-batchnorm-conv-layer': xlayers.XconfigConvLayer,
    'relu-batchnorm-noconv-layer': xlayers.XconfigConvLayer,
    'relu-noconv-layer': xlayers.XconfigConvLayer,
    'conv-relu-batchnorm-layer': xlayers.XconfigConvLayer,
    'conv-relu-batchnorm-so-layer': xlayers.XconfigConvLayer,
    'conv-relu-batchnorm-dropout-layer': xlayers.XconfigConvLayer,
    'conv-relu-dropout-layer': xlayers.XconfigConvLayer,
    'res-block': xlayers.XconfigResBlock,
    'res2-block': xlayers.XconfigRes2Block,
    'channel-average-layer': xlayers.ChannelAverageLayer,
    # Attention layers.
    'attention-renorm-layer': xlayers.XconfigAttentionLayer,
    'attention-relu-renorm-layer': xlayers.XconfigAttentionLayer,
    'attention-relu-batchnorm-layer': xlayers.XconfigAttentionLayer,
    'relu-renorm-attention-layer': xlayers.XconfigAttentionLayer,
    # GRU variants.
    'gru-layer': xlayers.XconfigGruLayer,
    'pgru-layer': xlayers.XconfigPgruLayer,
    'opgru-layer': xlayers.XconfigOpgruLayer,
    'norm-pgru-layer': xlayers.XconfigNormPgruLayer,
    'norm-opgru-layer': xlayers.XconfigNormOpgruLayer,
    'fast-gru-layer': xlayers.XconfigFastGruLayer,
    'fast-pgru-layer': xlayers.XconfigFastPgruLayer,
    'fast-norm-pgru-layer': xlayers.XconfigFastNormPgruLayer,
    'fast-opgru-layer': xlayers.XconfigFastOpgruLayer,
    'fast-norm-opgru-layer': xlayers.XconfigFastNormOpgruLayer,
    # Composite layers.
    'tdnnf-layer': xlayers.XconfigTdnnfLayer,
    'prefinal-layer': xlayers.XconfigPrefinalLayer,
    'spec-augment-layer': xlayers.XconfigSpecAugmentLayer,
    # Layers that map to a single component.
    'renorm-component': xlayers.XconfigRenormComponent,
    'batchnorm-component': xlayers.XconfigBatchnormComponent,
    'no-op-component': xlayers.XconfigNoOpComponent,
    'linear-component': xlayers.XconfigLinearComponent,
    'affine-component': xlayers.XconfigAffineComponent,
    'scale-component': xlayers.XconfigPerElementScaleComponent,
    'dim-range-component': xlayers.XconfigDimRangeComponent,
    'offset-component': xlayers.XconfigPerElementOffsetComponent,
    'combine-feature-maps-layer': xlayers.XconfigCombineFeatureMapsLayer,
    'delta-layer': xlayers.XconfigDeltaLayer,
}

def xconfig_line_to_object(config_line, prev_layers=None):
    """Build the layer object described by a single xconfig line.

    'config_line' is one line of the xconfig file.  'prev_layers' is a list
    of objects corresponding to the preceding layers of the config file; it
    is handed unchanged to the layer constructor.

    Returns the layer object, or None if the line was empty after removing
    comments.  Raises RuntimeError if the first token of the line is not a
    registered layer type.  Any exception raised while parsing is logged
    together with the offending line and then re-raised.
    """
    try:
        parsed = xutils.parse_config_line(config_line)
        if parsed is None:
            return None
        layer_type, options = parsed
        if layer_type not in config_to_layer:
            raise RuntimeError("No such layer type '{0}'".format(layer_type))
        layer_class = config_to_layer[layer_type]
        return layer_class(layer_type, options, prev_layers)
    except Exception:
        # Show which line was at fault, then let the caller see the
        # original exception.
        logging.error(
            "***Exception caught while parsing the following xconfig line:\n"
            "*** {0}".format(config_line))
        raise


def get_model_component_info(model_filename):
    """
    This function reads existing model (*.raw or *.mdl) and returns array
    of XconfigExistingLayer one per {input,output}-node or component-node
    with same 'name' used in the raw model and 'dim' equal to 'output-dim'
    for component-node and 'dim' for {input,output}-node.

    e.g. layer in *.mdl -> corresponding 'XconfigExistingLayer' layer
         'input-node name=ivector dim=100' ->
         'existing name=ivector dim=100'
         'component-node name=tdnn1.affine ... input-dim=1000 '
         'output-dim=500' ->
         'existing name=tdnn1.affine dim=500'

    The process exits (sys.exit) if the model file cannot be opened, and
    RuntimeError is raised if no node could be extracted from the output
    of nnet3-info.
    """

    # Only check that the file can be opened; the content itself is read by
    # nnet3-info below.  The 'with' guarantees the handle is released.
    try:
        with open(model_filename, 'r'):
            pass
    except Exception as e:
        sys.exit("{0}: error reading model file '{1}'; error was {2}".format(
            sys.argv[0], model_filename, repr(e)))

    # use nnet3-info to get component names in the model.
    # NOTE(review): model_filename is interpolated into a shell command
    # unquoted; callers must not pass untrusted file names.
    out = common_lib.get_command_stdout("""nnet3-info {0} | grep '\\-node' """
                                        """ """.format(model_filename))

    # out contains all {output, input, component}-nodes used in model_filename
    # It can parse lines in out like:
    # i.e. input-node name=input dim=40
    #   component-node name=tdnn1.affine component=tdnn1.affine input=lda
    #   input-dim=300 output-dim=512
    all_layers = []
    seen_names = set()
    for line in out.split("\n"):
        # Reset per line, so that a line without 'name=' (e.g. the empty
        # string after the final newline) is skipped instead of reusing a
        # stale name or hitting an unbound variable on the first line.
        layer_name = None
        dim = -1
        for field in line.split(" "):
            key_value = field.split("=")
            if len(key_value) != 2:
                continue
            key, value = key_value
            if key == "name":           # name=**
                layer_name = value
            elif key == "dim":          # for input-node
                dim = int(value)
            elif key == "output-dim":   # for component-node
                dim = int(value)

        if layer_name is None or layer_name in seen_names:
            continue
        seen_names.add(layer_name)
        assert dim != -1, (
            "no 'dim' or 'output-dim' found for node '{0}'".format(layer_name))
        # Each layer gets its own option dict, so no state is shared
        # between the created layer objects.
        all_layers.append(xlayers.XconfigExistingLayer(
            'existing', {'name': layer_name, 'dim': dim}, all_layers))

    if len(all_layers) == 0:
        raise RuntimeError("{0}: model filename '{1}' is empty.".format(
            sys.argv[0], model_filename))
    return all_layers


def read_xconfig_file(xconfig_filename, existing_layers=None):
    """Read an xconfig file and return it as a list of layers.

    (Usually the variable name 'all_layers' is used elsewhere for this.)

    'existing_layers' contains some layers of type 'existing' (layers which
    are not really layers but are actual component node names from an
    existing neural net model, created using get_model_component_info).
    'existing' layers can be used as input to component-nodes in layers of
    the xconfig file.  Every layer parsed from the file is also appended to
    'existing_layers', so a list supplied by the caller is extended in
    place.

    The process exits (sys.exit) if the file cannot be opened.  RuntimeError
    is raised if the file yields no layers, and any exception from parsing a
    line is propagated.
    """
    if existing_layers is None:
        existing_layers = []
    try:
        f = open(xconfig_filename, 'r')
    except Exception as e:
        sys.exit("{0}: error reading xconfig file '{1}'; error was {2}".format(
            sys.argv[0], xconfig_filename, repr(e)))
    all_layers = []
    # 'with' closes the file even when a line fails to parse.
    with f:
        for line in f:
            # the next call will raise an easy-to-understand exception if
            # it fails.
            this_layer = xconfig_line_to_object(line, existing_layers)
            if this_layer is None:
                continue  # line was blank after removing comments.
            all_layers.append(this_layer)
            existing_layers.append(this_layer)
    if len(all_layers) == 0:
        raise RuntimeError("{0}: xconfig file '{1}' is empty".format(
            sys.argv[0], xconfig_filename))
    return all_layers


================================================
FILE: egs/steps/libs/nnet3/xconfig/stats_layer.py
================================================
# Copyright 2016    Johns Hopkins University (Author: Daniel Povey)
#           2016    Vimal Manohar
# Apache 2.0.

""" This module contains the statistics extraction and pooling layer.
"""

from __future__ import print_function
import re
from libs.nnet3.xconfig.basic_layers import XconfigLayerBase


class XconfigStatsLayer(XconfigLayerBase):
    """This class is for parsing lines like
    stats-layer name=tdnn1-stats config=mean+stddev(-99:3:9:99) input=tdnn1

    This adds statistics-pooling and statistics-extraction components.  An
    example string is 'mean(-99:3:9:99)', which means, compute the mean of
    data within a window of -99 to +99, with distinct means computed every 9
    frames (we round to get the appropriate one), and with the input extracted
    on multiples of 3 frames (so this will force the input to this layer to be
    evaluated every 3 frames).  Another example string is
    'mean+stddev(-99:3:9:99)', which will also cause the standard deviation to
    be computed.

    The dimension is worked out from the input. mean and stddev add a
    dimension of input_dim each to the output dimension. If counts is
    specified, an additional dimension is added to the output to store log
    counts.

    Parameters of the class, and their defaults:
        input='[-1]'    [Descriptor giving the input of the layer.]
        dim=-1      [Output dimension of layer. If provided, must match the
                     dimension computed from input]
        config=''   [Required. Defines what stats must be computed.]
    """
    def __init__(self, first_token, key_to_value, prev_names=None):
        assert first_token in ['stats-layer']
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        self.config = {'input': '[-1]',
                       'dim': -1,
                       'config': ''}

    def set_derived_configs(self):
        """Parse the 'config' string and derive the output dimension.

        Raises RuntimeError if 'config' is empty or malformed, or if a
        user-supplied 'dim' disagrees with the computed output dimension.
        """
        config_string = self.config['config']
        if config_string == '':
            raise RuntimeError("config has to be non-empty",
                                self.str())
        # Raw strings keep the regex escapes (\+, \(, \d) intact without
        # relying on Python passing unknown string escapes through.
        m = re.search(r"(mean|mean\+stddev|mean\+count|mean\+stddev\+count)"
                      r"\((-?\d+):(-?\d+):(-?\d+):(-?\d+)\)",
                      config_string)
        if m is None:
            raise RuntimeError("Invalid statistic-config string: {0}".format(
                config_string), self)

        self._output_stddev = (m.group(1) in ['mean+stddev',
                                              'mean+stddev+count'])
        self._output_log_counts = (m.group(1) in ['mean+count',
                                                  'mean+stddev+count'])
        # The window is written as (left:input-period:stats-period:right);
        # the left edge is given as a negative offset, so negate it.
        self._left_context = -int(m.group(2))
        self._input_period = int(m.group(3))
        self._stats_period = int(m.group(4))
        self._right_context = int(m.group(5))

        if self._output_stddev:
            output_dim = 2 * self.descriptors['input']['dim']
        else:
            output_dim = self.descriptors['input']['dim']
        if self._output_log_counts:
            output_dim = output_dim + 1

        if self.config['dim'] > 0 and self.config['dim'] != output_dim:
            raise RuntimeError(
                "Invalid dim supplied {0:d} != "
                "actual output dim {1:d}".format(
                    self.config['dim'], output_dim))
        self.config['dim'] = output_dim

    def check_configs(self):
        """Validate the parsed window, then run the base-class checks.

        Both contexts must be non-negative multiples of the stats period,
        both periods must be positive, and the stats period must be a
        multiple of the input period.
        """
        if not (self._left_context >= 0 and self._right_context >= 0
                and self._input_period > 0 and self._stats_period > 0
                and self._left_context % self._stats_period == 0
                and self._right_context % self._stats_period == 0
                and self._stats_period % self._input_period == 0):
            raise RuntimeError(
                "Invalid configuration of statistics-extraction: {0}".format(
                    self.config['config']), self)
        super(XconfigStatsLayer, self).check_configs()

    def _generate_config(self):
        """Return the config lines for the extraction and pooling nodes."""
        input_desc = self.descriptors['input']['final-string']
        input_dim = self.descriptors['input']['dim']

        configs = []
        configs.append(
            'component name={name}-extraction-{lc}-{rc} '
            'type=StatisticsExtractionComponent input-dim={dim} '
            'input-period={input_period} output-period={output_period} '
            'include-variance={var} '.format(
                name=self.name, lc=self._left_context, rc=self._right_context,
                dim=input_dim, input_period=self._input_period,
                output_period=self._stats_period,
                var='true' if self._output_stddev else 'false'))
        configs.append(
            'component-node name={name}-extraction-{lc}-{rc} '
            'component={name}-extraction-{lc}-{rc} input={input} '.format(
                name=self.name, lc=self._left_context, rc=self._right_context,
                input=input_desc))

        # Input dimension of the pooling component: input_dim (doubled when
        # variances are included) plus one extra column -- presumably the
        # count emitted by the extraction component; TODO confirm.
        stats_dim = 1 + input_dim * (2 if self._output_stddev else 1)
        configs.append(
            'component name={name}-pooling-{lc}-{rc} '
            'type=StatisticsPoolingComponent input-dim={dim} '
            'input-period={input_period} left-context={lc} right-context={rc} '
            'num-log-count-features={count} output-stddevs={var} '.format(
                name=self.name, lc=self._left_context, rc=self._right_context,
                dim=stats_dim, input_period=self._stats_period,
                count=1 if self._output_log_counts else 0,
                var='true' if self._output_stddev else 'false'))
        configs.append(
            'component-node name={name}-pooling-{lc}-{rc} '
            'component={name}-pooling-{lc}-{rc} '
            'input={name}-extraction-{lc}-{rc} '.format(
                name=self.name, lc=self._left_context, rc=self._right_context))
        return configs

    def output_name(self, auxiliary_output=None):
        # The pooling node is wrapped in Round(..., stats_period).
        return 'Round({name}-pooling-{lc}-{rc}, {period})'.format(
            name=self.name, lc=self._left_context,
            rc=self._right_context, period=self._stats_period)

    def output_dim(self, auxiliary_outputs=None):
        return self.config['dim']

    def get_full_config(self):
        """Return (config_name, line) pairs; 'ref' and 'final' are identical."""
        ans = []
        config_lines = self._generate_config()

        for line in config_lines:
            for config_name in ['ref', 'final']:
                ans.append((config_name, line))

        return ans


================================================
FILE: egs/steps/libs/nnet3/xconfig/trivial_layers.py
================================================
# Copyright 2016    Johns Hopkins University (Dan Povey)
#           2016    Vijayaditya Peddinti
#           2017    Google Inc. (vpeddinti@google.com)
#           2017    Vimal Manohar
# Apache 2.0.

""" This module contains layers that just map to a single component.
"""

from __future__ import print_function
import math
import re
import sys
from libs.nnet3.xconfig.basic_layers import XconfigLayerBase


class XconfigRenormComponent(XconfigLayerBase):
    """Layer that expands to exactly one NormalizeComponent.

    It handles xconfig lines such as
     'renorm-component name=renorm1 input=Append(-3,0,3)'

    Options and their defaults:
      input='[-1]'             [Descriptor giving the input of the layer.]
      target-rms=1.0           [The target RMS of the NormalizeComponent]
    """
    def __init__(self, first_token, key_to_value, prev_names=None):
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        self.config = {'input': '[-1]', 'target-rms': 1.0}

    def check_configs(self):
        assert self.config['target-rms'] > 0.0

    def output_name(self, auxiliary_output=None):
        assert auxiliary_output is None
        return self.name

    def output_dim(self, auxiliary_output=None):
        # The output has the same dimension as the input.
        assert auxiliary_output is None
        return self.descriptors['input']['dim']

    def get_full_config(self):
        # User-specified matrices are not supported by this layer, so the
        # 'ref' and 'final' configs receive the same lines.
        return [(config_name, line)
                for line in self._generate_config()
                for config_name in ('ref', 'final')]

    def _generate_config(self):
        # 'final-string' is the form of the input descriptor that can appear
        # in config files, i.e. it contains the 'final' names of nodes.
        descriptor = self.descriptors['input']
        source = descriptor['final-string']
        dim = descriptor['dim']
        rms = self.config['target-rms']

        component_line = (
            'component name={0} type=NormalizeComponent dim={1} target-rms={2}'.format(
                self.name, dim, rms))
        node_line = 'component-node name={0} component={0} input={1}'.format(
            self.name, source)
        return [component_line, node_line]


class XconfigBatchnormComponent(XconfigLayerBase):
    """Layer that expands to exactly one BatchNormComponent.

    It handles xconfig lines such as
     'batchnorm-component name=batchnorm input=Append(-3,0,3)'

    Options and their defaults:
      input='[-1]'             [Descriptor giving the input of the layer.]
      target-rms=1.0           [The target RMS of the BatchNormComponent]
      include-in-init=false     [You should set this to true if this precedes a
                                `fixed-affine-layer` that is to be initialized
                                 via LDA]
    """
    def __init__(self, first_token, key_to_value, prev_names=None):
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        self.config = {
            'input': '[-1]',
            'target-rms': 1.0,
            'include-in-init': False,
        }

    def check_configs(self):
        assert self.config['target-rms'] > 0.0

    def output_name(self, auxiliary_output=None):
        assert auxiliary_output is None
        return self.name

    def output_dim(self, auxiliary_output=None):
        # The output has the same dimension as the input.
        assert auxiliary_output is None
        return self.descriptors['input']['dim']

    def get_full_config(self):
        lines = self._generate_config()
        # User-specified matrices are not supported by this layer, so the
        # 'ref' and 'final' configs receive the same lines; the 'init'
        # config receives them too when 'include-in-init' is set.
        targets = ['ref', 'final']
        if self.config['include-in-init']:
            targets.append('init')

        pairs = []
        for line in lines:
            pairs.extend((config_name, line) for config_name in targets)
        return pairs

    def _generate_config(self):
        # 'final-string' is the form of the input descriptor that can appear
        # in config files, i.e. it contains the 'final' names of nodes.
        descriptor = self.descriptors['input']
        source = descriptor['final-string']
        dim = descriptor['dim']
        rms = self.config['target-rms']

        component_line = (
            'component name={0} type=BatchNormComponent dim={1} target-rms={2}'.format(
                self.name, dim, rms))
        node_line = 'component-node name={0} component={0} input={1}'.format(
            self.name, source)
        return [component_line, node_line]


class XconfigNoOpComponent(XconfigLayerBase):
    """Layer that expands to exactly one NoOpComponent.

    It handles xconfig lines such as
     'no-op-component name=noop1 input=Append(-3,0,3)'

    Options and their defaults:
      input='[-1]'             [Descriptor giving the input of the layer.]
    """
    def __init__(self, first_token, key_to_value, prev_names=None):
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        self.config = {'input': '[-1]'}

    def check_configs(self):
        # There is nothing to validate for this layer.
        pass

    def output_name(self, auxiliary_output=None):
        assert auxiliary_output is None
        return self.name

    def output_dim(self, auxiliary_output=None):
        # The output has the same dimension as the input.
        assert auxiliary_output is None
        return self.descriptors['input']['dim']

    def get_full_config(self):
        # User-specified matrices are not supported by this layer, so the
        # 'ref' and 'final' configs receive the same lines.
        return [(config_name, line)
                for line in self._generate_config()
                for config_name in ('ref', 'final')]

    def _generate_config(self):
        # 'final-string' is the form of the input descriptor that can appear
        # in config files, i.e. it contains the 'final' names of nodes.
        descriptor = self.descriptors['input']
        source = descriptor['final-string']
        dim = descriptor['dim']

        component_line = 'component name={0} type=NoOpComponent dim={1}'.format(
            self.name, dim)
        node_line = 'component-node name={0} component={0} input={1}'.format(
            self.name, source)
        return [component_line, node_line]


class XconfigDeltaLayer(XconfigLayerBase):
    """Layer that appends delta and delta-delta features to its input.

    It handles xconfig lines such as
     'delta-layer name=delta input=idct'

    Writing x(t) for the input at frame offset t, the generated descriptor
    appends three blocks:
      x(0)                          [the central frame]
      x(1) - x(-1)                  [delta, i.e. weights -1,0,1]
      x(-2) - 2*x(0) + x(2)         [delta-delta, i.e. weights 1,0,-2,0,1]
    and a BatchNormComponent is applied to the result, so the output
    dimension is three times the input dimension.

    Options and their defaults:
      input='[-1]'             [Descriptor giving the input of the layer]
    """
    def __init__(self, first_token, key_to_value, prev_names=None):
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        self.config = {'input': '[-1]'}

    def check_configs(self):
        # There is nothing to validate for this layer.
        pass

    def output_name(self, auxiliary_output=None):
        assert auxiliary_output is None
        return self.name

    def output_dim(self, auxiliary_output=None):
        # Central frame + delta + delta-delta.
        assert auxiliary_output is None
        return (3 * self.descriptors['input']['dim'])

    def get_full_config(self):
        # User-specified matrices are not supported by this layer, so the
        # 'ref' and 'final' configs receive the same lines.
        return [(config_name, line)
                for line in self._generate_config()
                for config_name in ('ref', 'final')]

    def _generate_config(self):
        # 'final-string' is the form of the input descriptor that can appear
        # in config files, i.e. it contains the 'final' names of nodes.  It
        # is also used as the prefix of the helper node names below.
        source = self.descriptors['input']['final-string']
        source_dim = self.descriptors['input']['dim']
        total_dim = self.output_dim()

        return [
            # Two full-width copies of the input; these are the nodes that
            # get scaled by -1.0 and -2.0 in the descriptor below.
            'dim-range-node name={0}_copy1 input-node={0} dim={1} dim-offset=0'.format(
                source, source_dim),
            'dim-range-node name={0}_copy2 input-node={0} dim={1} dim-offset=0'.format(
                source, source_dim),
            # NoOp node holding the central frame, delta and delta-delta.
            'component name={0}_2 type=NoOpComponent dim={1}'.format(
                source, total_dim),
            ('component-node name={0}_2 component={0}_2 input=Append(Offset({0},0),'
             ' Sum(Offset(Scale(-1.0,{0}_copy1),-1), Offset({0},1)), Sum(Offset({0},-2), Offset({0},2),'
             ' Offset(Scale(-2.0,{0}_copy2),0)))'.format(source)),
            # Batchnorm over the appended features; this is the output node.
            'component name={0} type=BatchNormComponent dim={1}'.format(
                self.name, total_dim),
            'component-node name={0} component={0} input={1}_2'.format(
                self.name, source),
        ]


class XconfigLinearComponent(XconfigLayerBase):
    """Layer that expands to exactly one LinearComponent.

    It handles xconfig lines such as
     'linear-component name=linear1 dim=1024 input=Append(-3,0,3)'
    In this example the output-dim is 1024; the input-dim is taken from the
    dimension of the input descriptor.

    Options and their defaults:
      input='[-1]'             [Descriptor giving the input of the layer.]
      dim=-1                   [Dimension of the output; must be set > 0]

    The following options are copied to the component's config line
    whenever their value is not the empty string:

      orthonormal-constraint=''
      max-change=0.75
      l2-regularize=''
      param-stddev=''
      learning-rate-factor=''

    """
    def __init__(self, first_token, key_to_value, prev_names=None):
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        self.config = {
            'input': '[-1]',
            'dim': -1,
            'orthonormal-constraint': '',
            'max-change': 0.75,
            'l2-regularize': '',
            'param-stddev': '',
            'learning-rate-factor': '',
        }

    def check_configs(self):
        if self.config['dim'] <= 0:
            raise RuntimeError("'dim' must be specified and > 0.")

    def output_name(self, auxiliary_output=None):
        assert auxiliary_output is None
        return self.name

    def output_dim(self, auxiliary_output=None):
        assert auxiliary_output is None
        dim = self.config['dim']
        assert dim > 0
        return dim

    def get_full_config(self):
        # User-specified matrices are not supported by this layer, so the
        # 'ref' and 'final' configs receive the same lines.
        return [(config_name, line)
                for line in self._generate_config()
                for config_name in ('ref', 'final')]

    def _generate_config(self):
        # 'final-string' is the form of the input descriptor that can appear
        # in config files, i.e. it contains the 'final' names of nodes.
        source = self.descriptors['input']['final-string']
        in_dim = self.descriptors['input']['dim']
        out_dim = self.config['dim']

        # Pass through only the options that were given a value.
        passthrough = ('orthonormal-constraint', 'max-change', 'l2-regularize',
                       'param-stddev', 'learning-rate-factor')
        opts = ''.join(' {0}={1}'.format(opt_name, self.config[opt_name])
                       for opt_name in passthrough
                       if self.config[opt_name] != '')

        component_line = (
            'component name={0} type=LinearComponent input-dim={1} output-dim={2} '
            '{3}'.format(self.name, in_dim, out_dim, opts))
        node_line = 'component-node name={0} component={0} input={1}'.format(
            self.name, source)
        return [component_line, node_line]


class XconfigCombineFeatureMapsLayer(XconfigLayerBase):
    """Layer that expands to a single PermuteComponent.

      It handles xconfig lines such as
      'combine-feature-maps-layer name=combine_features1 height=40 num-filters1=1 num-filters2=4'
      or
      'combine-feature-maps-layer name=combine_features1 height=40 num-filters1=1 num-filters2=4 num-filters3=2'

      The input is expected to be two or three things appended together:
      the first of dimension height * num-filters1, the second of dimension
      height * num-filters2, and the third (if present) of dimension
      height * num-filters3.  The column map interleaves the filters, so
      that for each height index the output holds the num-filters1 values,
      then the num-filters2 values, then the num-filters3 values.  The
      output can be interpreted as a single feature map with the same
      height as the input and the sum of the num-filters.

      This is to be used in convolutional setups as part of how we combine the
      filterbank inputs with ivectors.
    """

    def __init__(self, first_token, key_to_value, prev_names=None):
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        self.config = {
            'input': '[-1]',
            'num-filters1': -1,
            'num-filters2': -1,
            'num-filters3': 0,
            'height': -1,
        }

    def check_configs(self):
        input_dim = self.descriptors['input']['dim']
        f1 = self.config['num-filters1']
        f2 = self.config['num-filters2']
        f3 = self.config['num-filters3']
        h = self.config['height']
        # num-filters3 is optional and may be 0; the others must be positive.
        if f1 <= 0 or f2 <= 0 or f3 < 0 or h <= 0:
            raise RuntimeError("invalid values of num-filters1, num-filters2 and/or height")
        expected_dim = (f1 + f2 + f3) * h
        if input_dim != expected_dim:
            raise RuntimeError("Expected input-dim={0} based on num-filters1={1}, num-filters2={2}, "
                               "num-filters3={3} and height={4}, but got input-dim={5}".format(
                                   expected_dim, f1, f2, f3, h, input_dim))

    def output_name(self, auxiliary_output=None):
        assert auxiliary_output is None
        return self.name

    def output_dim(self, auxiliary_output=None):
        # A permutation leaves the dimension unchanged.
        assert auxiliary_output is None
        return self.descriptors['input']['dim']

    def get_full_config(self):
        # User-specified matrices are not supported by this layer, so the
        # 'ref' and 'final' configs receive the same lines.
        return [(config_name, line)
                for line in self._generate_config()
                for config_name in ('ref', 'final')]

    def _generate_config(self):
        # 'final-string' is the form of the input descriptor that can appear
        # in config files, i.e. it contains the 'final' names of nodes.
        source = self.descriptors['input']['final-string']
        dim = self.descriptors['input']['dim']
        n1 = self.config['num-filters1']
        n2 = self.config['num-filters2']
        n3 = self.config['num-filters3']  # normally 0.
        height = self.config['height']
        assert dim == (n1 + n2 + n3) * height

        # Start column of the second and third appended blocks.
        start2 = height * n1
        start3 = height * (n1 + n2)

        column_map = []
        for h in range(height):
            column_map.extend(range(h * n1, (h + 1) * n1))
            column_map.extend(range(start2 + h * n2, start2 + (h + 1) * n2))
            column_map.extend(range(start3 + h * n3, start3 + (h + 1) * n3))

        component_line = 'component name={0} type=PermuteComponent column-map={1} '.format(
            self.name, ','.join(str(column) for column in column_map))
        node_line = 'component-node name={0} component={0} input={1}'.format(
            self.name, source)
        return [component_line, node_line]


class XconfigAffineComponent(XconfigLayerBase):
    """Layer type for xconfig lines of the form
     'affine-component name=linear1 dim=1024 input=Append(-3,0,3)'
    It generates a single component of type NaturalGradientAffineComponent
    (plus the component-node that uses it).  The output dimension is the 'dim'
    value (1024 in the example above) and the input dimension is the dimension
    of the input descriptor.

    Parameters of the class, and their defaults:
      input='[-1]'             [Descriptor giving the input of the layer.]
      dim=-1                   [Dimension of the output; must be set to a
                                value > 0.]

    The following are just passed through to the component's config line.
    Apart from max-change, they default to '' in this class, meaning that they
    are left out of the config line unless the user sets them; the values shown
    are the effective defaults as documented for this layer.

      orthonormal-constraint=0.0
      max-change=0.75
      l2-regularize=0.0

    'param-stddev' and 'bias-stddev' are passed through in the same way when
    they are set.
    """
    def __init__(self, first_token, key_to_value, prev_names=None):
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        # An empty string marks an option as "not set"; such options are
        # not written to the component's config line.
        self.config = {
            'input': '[-1]',
            'dim': -1,
            'orthonormal-constraint': '',
            'max-change': 0.75,
            'param-stddev': '',
            'bias-stddev': '',
            'l2-regularize': '',
        }

    def check_configs(self):
        dim = self.config['dim']
        if dim <= 0:
            raise RuntimeError("'dim' must be specified and > 0.")

    def output_name(self, auxiliary_output=None):
        # This layer has no auxiliary outputs.
        assert auxiliary_output is None
        return self.name

    def output_dim(self, auxiliary_output=None):
        # This layer has no auxiliary outputs.
        assert auxiliary_output is None
        dim = self.config['dim']
        assert dim > 0
        return dim

    def get_full_config(self):
        # we do not support user specified matrices in this layer
        # so 'ref' and 'final' configs are the same.
        return [(config_name, line)
                for line in self._generate_config()
                for config_name in ['ref', 'final']]

    def _generate_config(self):
        # 'final-string' is the descriptor string that can appear in
        # config-files, i.e. it contains the 'final' names of nodes.
        input_info = self.descriptors['input']
        input_desc = input_info['final-string']
        input_dim = input_info['dim']
        output_dim = self.config['dim']

        # Only options that were given a value are passed to the component.
        passthrough = ['orthonormal-constraint', 'max-change', 'l2-regularize',
                       'param-stddev', 'bias-stddev']
        opts = ''.join(' {0}={1}'.format(opt_name, self.config[opt_name])
                       for opt_name in passthrough
                       if self.config[opt_name] != '')

        component_line = (
            'component name={0} type=NaturalGradientAffineComponent input-dim={1} output-dim={2} '
            '{3}'.format(self.name, input_dim, output_dim, opts))
        node_line = ('component-node name={0} component={0} input={1}'.format(
            self.name, input_desc))
        return [component_line, node_line]


class XconfigPerElementScaleComponent(XconfigLayerBase):
    """Layer type for xconfig lines of the form
     'scale-component name=scale1 input=Append(-3,0,3)'
    It generates a single component of type
    NaturalGradientPerElementScaleComponent (plus the component-node that uses
    it).  Both the component's dimension and the layer's output dimension are
    the dimension of the input descriptor.

    Parameters of the class, and their defaults:
      input='[-1]'             [Descriptor giving the input of the layer.]

    The following are just passed through to the component's config line.
    Apart from max-change, they default to '' in this class, meaning that they
    are left out of the config line unless the user sets them; the values shown
    are the effective defaults as documented for this layer (mostly set in
    the code).

      max-change=0.75
      l2-regularize=0.0
      param-mean=1.0   # affects initialization
      param-stddev=0.0  # affects initialization
      learning-rate-factor=1.0
    """
    def __init__(self, first_token, key_to_value, prev_names=None):
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        # An empty string marks an option as "not set"; such options are
        # not written to the component's config line.
        self.config = {
            'input': '[-1]',
            'l2-regularize': '',
            'max-change': 0.75,
            'param-mean': '',
            'param-stddev': '',
            'learning-rate-factor': '',
        }

    def check_configs(self):
        # No validation is done for this layer.
        pass

    def output_name(self, auxiliary_output=None):
        # This layer has no auxiliary outputs.
        assert auxiliary_output is None
        return self.name

    def output_dim(self, auxiliary_output=None):
        # The output has the same dimension as the input.
        assert auxiliary_output is None
        return self.descriptors['input']['dim']

    def get_full_config(self):
        # we do not support user specified matrices in this layer
        # so 'ref' and 'final' configs are the same.
        return [(config_name, line)
                for line in self._generate_config()
                for config_name in ['ref', 'final']]

    def _generate_config(self):
        # 'final-string' is the descriptor string that can appear in
        # config-files, i.e. it contains the 'final' names of nodes.
        input_info = self.descriptors['input']
        input_desc = input_info['final-string']
        dim = input_info['dim']

        # Only options that were given a value are passed to the component.
        passthrough = ['learning-rate-factor', 'max-change', 'l2-regularize',
                       'param-mean', 'param-stddev']
        opts = ''.join(' {0}={1}'.format(opt_name, self.config[opt_name])
                       for opt_name in passthrough
                       if self.config[opt_name] != '')

        component_line = (
            'component name={0} type=NaturalGradientPerElementScaleComponent dim={1} {2} '
            ''.format(self.name, dim, opts))
        node_line = ('component-node name={0} component={0} input={1}'.format(
            self.name, input_desc))
        return [component_line, node_line]

class XconfigPerElementOffsetComponent(XconfigLayerBase):
    """Layer type for xconfig lines of the form
     'offset-component name=offset1 input=Append(-3,0,3)'
    It generates a single component of type PerElementOffsetComponent (plus
    the component-node that uses it).  Both the component's dimension and the
    layer's output dimension are the dimension of the input descriptor.

    Parameters of the class, and their defaults:
      input='[-1]'             [Descriptor giving the input of the layer.]

    The following are just passed through to the component's config line.
    Apart from max-change, they default to '' in this class, meaning that they
    are left out of the config line unless the user sets them; the values shown
    are the effective defaults as documented for this layer (mostly set in
    the code).

      max-change=0.75
      l2-regularize=0.0
      param-mean=0.0   # affects initialization
      param-stddev=0.0  # affects initialization
      learning-rate-factor=1.0
    """
    def __init__(self, first_token, key_to_value, prev_names=None):
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        # An empty string marks an option as "not set"; such options are
        # not written to the component's config line.
        self.config = {
            'input': '[-1]',
            'l2-regularize': '',
            'max-change': 0.75,
            'param-mean': '',
            'param-stddev': '',
            'learning-rate-factor': '',
        }

    def check_configs(self):
        # No validation is done for this layer.
        pass

    def output_name(self, auxiliary_output=None):
        # This layer has no auxiliary outputs.
        assert auxiliary_output is None
        return self.name

    def output_dim(self, auxiliary_output=None):
        # The output has the same dimension as the input.
        assert auxiliary_output is None
        return self.descriptors['input']['dim']

    def get_full_config(self):
        # we do not support user specified matrices in this layer
        # so 'ref' and 'final' configs are the same.
        return [(config_name, line)
                for line in self._generate_config()
                for config_name in ['ref', 'final']]

    def _generate_config(self):
        # 'final-string' is the descriptor string that can appear in
        # config-files, i.e. it contains the 'final' names of nodes.
        input_info = self.descriptors['input']
        input_desc = input_info['final-string']
        dim = input_info['dim']

        # Only options that were given a value are passed to the component.
        passthrough = ['learning-rate-factor', 'max-change', 'l2-regularize',
                       'param-mean', 'param-stddev']
        opts = ''.join(' {0}={1}'.format(opt_name, self.config[opt_name])
                       for opt_name in passthrough
                       if self.config[opt_name] != '')

        component_line = (
            'component name={0} type=PerElementOffsetComponent dim={1} {2} '
            ''.format(self.name, dim, opts))
        node_line = ('component-node name={0} component={0} input={1}'.format(
            self.name, input_desc))
        return [component_line, node_line]


class XconfigDimRangeComponent(XconfigLayerBase):
    """This class is for parsing lines like
     'dim-range-component name=feature1 input=Append(-3,0,3) dim=40 dim-offset=0'
    which will produce a single dim-range-node that selects part of the input:
    'dim' consecutive dimensions starting at 'dim-offset'.

    Parameters of the class, and their defaults:
      input='[-1]'             [Descriptor giving the input of the layer.]
      dim=-1                   [Dimension of the output; must be > 0 and no
                                larger than the input dimension.]
      dim-offset=0             [Dimension offset into the input; must be >= 0,
                                and dim-offset + dim must not exceed the input
                                dimension.]
    """
    def __init__(self, first_token, key_to_value, prev_names=None):
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        self.config = {'input': '[-1]',
                       'dim': -1,
                       'dim-offset': 0 }

    def check_configs(self):
        """Check that the requested range lies inside the input.

        Raises RuntimeError if 'dim' or 'dim-offset' is out of range.
        """
        input_dim = self.descriptors['input']['dim']
        if self.config['dim'] <= 0:
            raise RuntimeError("'dim' must be specified and > 0.")
        elif self.config['dim'] > input_dim:
            raise RuntimeError("'dim' must be specified and lower than the input dim.")
        if self.config['dim-offset'] < 0 :
            raise RuntimeError("'dim-offset' must be specified and >= 0.")
        elif self.config['dim-offset'] + self.config['dim'] > input_dim:
            raise RuntimeError("'dim-offset' plus output dim must be lower than the input dim.")

    def output_name(self, auxiliary_output=None):
        # This layer has no auxiliary outputs.
        assert auxiliary_output is None
        return self.name

    def output_dim(self, auxiliary_output=None):
        """Return the output dimension, i.e. the 'dim' config value.

        If 'dim' is not positive, the input dimension is used instead and is
        stored in self.config['dim'].
        """
        assert auxiliary_output is None
        output_dim = self.config['dim']
        if output_dim <= 0:
            # Fall back to the input dimension, and return that value (not
            # the non-positive one that was just replaced in the config).
            output_dim = self.descriptors['input']['dim']
            self.config['dim'] = output_dim
        return output_dim

    def get_full_config(self):
        """Return the generated lines as (config-name, line) pairs."""
        ans = []
        config_lines = self._generate_config()

        for line in config_lines:
            for config_name in ['ref', 'final']:
                # we do not support user specified matrices in this layer
                # so 'ref' and 'final' configs are the same.
                ans.append((config_name, line))
        return ans

    def _generate_config(self):
        """Return the single 'dim-range-node' config line of this layer."""
        # 'final-string' is the descriptor string that can appear in
        # config-files, i.e. it contains the 'final' names of nodes.
        input_node = self.descriptors['input']['final-string']
        output_dim = self.config['dim']
        dim_offset = self.config['dim-offset']

        configs = []
        line = ('dim-range-node name={0} input-node={1} dim={2} dim-offset={3}'.format(
            self.name, input_node, output_dim, dim_offset))
        configs.append(line)
        return configs


================================================
FILE: egs/steps/libs/nnet3/xconfig/utils.py
================================================
# Copyright  2016  Johns Hopkins University (Author: Daniel Povey).
# License: Apache 2.0.

# This library contains various utilities that are involved in processing
# of xconfig -> config conversion.  It contains "generic" lower-level code
# while xconfig_layers.py contains the code specific to layer types.

from __future__ import print_function
from __future__ import division
import re
import sys


# [utility function used in xconfig_layers.py]
# Given a list of objects of type XconfigLayerBase ('all_layers'),
# including at least the layers preceding 'current_layer' (and maybe
# more layers), return the names of layers preceding 'current_layer'
# other than layers of type 'existing', which corresponds to component-node
# names from an existing model that we are adding layers to them.
# This will be used in parsing expressions like [-1] in descriptors
# (which is an alias for the previous layer).
def get_prev_names(all_layers, current_layer):
    """Return the names of the layers that precede 'current_layer'.

    Args:
      all_layers: list of layer objects; each must provide a 'layer_type'
          attribute and a get_name() method.  Iteration stops at the element
          that is 'current_layer' (compared by identity); if it is not in the
          list, all layers are considered.
      current_layer: the layer whose predecessors are wanted.

    Returns:
      List of names, in order, of the preceding layers whose layer_type is
      not 'existing'.

    Raises:
      RuntimeError: if a name occurs more than once among those layers.
    """
    prev_names = []
    seen_names = set()
    for layer in all_layers:
        if layer is current_layer:
            break

        # 'existing' layers are derived from an existing trained neural
        # network supplied via the existing-model option, that we are adding
        # layers to.  They are not considered as layers preceding
        # 'current_layer'.  The type is compared by value: an identity test
        # against a string literal only works if the string happens to be
        # interned.
        if layer.layer_type == 'existing':
            continue

        name = layer.get_name()
        if name in seen_names:
            raise RuntimeError("{0}: Layer name {1} is used more than once.".format(
                    sys.argv[0], name))
        seen_names.add(name)
        prev_names.append(name)
    return prev_names


# This is a convenience function to parse the auxiliary output name from the
# full layer name
def split_layer_name(full_layer_name):
    """Split a full layer name at its first '.'.

    Returns the list [layer_name, auxiliary_output], where layer_name is the
    part before the first dot and auxiliary_output is everything after it,
    or None if the name contains no dot.  E.g. 'lstm1.c' gives
    ['lstm1', 'c'] and 'lstm1' gives ['lstm1', None].
    """
    assert isinstance(full_layer_name, str)
    layer_name, dot, remainder = full_layer_name.partition('.')
    # we probably expect at most one dot in the name, but there is no harm
    # in allowing further dots inside the auxiliary_output.
    auxiliary_output = remainder if dot else None
    return [layer_name, auxiliary_output]

# [utility function used in xconfig_layers.py]
# this converts a layer-name like 'ivector' or 'input', or a sub-layer name like
# 'lstm2.memory_cell', into a dimension.  'all_layers' is a vector of objects
# inheriting from XconfigLayerBase.  'current_layer' is provided so that the
# function can make sure not to look in layers that appear *after* this layer
# (because that's not allowed).
def get_dim_from_layer_name(all_layers, current_layer, full_layer_name):
    """Return the dimension of the layer (or sub-layer) 'full_layer_name'.

    Args:
      all_layers: list of layer objects providing get_name(), output_dim()
          and auxiliary_outputs().
      current_layer: only layers that appear before this one (compared by
          identity) in 'all_layers' are searched.
      full_layer_name: a layer name like 'ivector', or a sub-layer name like
          'lstm2.memory_cell'.

    Returns:
      The value of output_dim() of the matching layer.

    Raises:
      RuntimeError: if the layer is not found among the preceding layers, or
          if it has no auxiliary output of the requested name.
    """
    layer_name, auxiliary_output = split_layer_name(full_layer_name)
    for layer in all_layers:
        if layer is current_layer:
            break

        # If 'all_layers' contains some 'existing' layers, i.e. layers which
        # are not really layers but are actual component names from an existing
        # neural net that we are adding components to, they may already be
        # of the form 'xxx.yyy', e.g. 'tdnn1.affine'.  In this case the name of
        # the layer in 'all_layers' won't be just the 'xxx' part (e.g. 'tdnn1'),
        # it will be the full thing, like 'tdnn1.affine'.
        # We will also use the if-statement immediately below this comment for
        # regular layers, e.g. where full_layer_name is something like 'tdnn2'.
        # The if-statement below the next one, that uses
        # auxiliary_output, will only be used in the (rare) case when we are
        # using auxiliary outputs, e.g. 'lstm1.c'.
        if layer.get_name() == full_layer_name:
            return layer.output_dim()

        if layer.get_name() == layer_name:
            if (auxiliary_output not in layer.auxiliary_outputs()
                    and auxiliary_output is not None):
                raise RuntimeError("Layer '{0}' has no such auxiliary output: "
                                   "'{1}' ({0}.{1})".format(layer_name,
                                                            auxiliary_output))
            return layer.output_dim(auxiliary_output)

    # No such layer was found among the preceding layers; say whether it
    # exists at all.
    if any(layer.get_name() == layer_name for layer in all_layers):
        raise RuntimeError("Layer '{0}' was requested before it appeared in "
                           "the xconfig file (circular dependencies or out-of-order "
                           "layers)".format(layer_name))
    else:
        raise RuntimeError("No such layer: '{0}'".format(layer_name))


# [utility function used in xconfig_layers.py]
# this converts a layer-name like 'ivector' or 'input', or a sub-layer name like
# 'lstm2.memory_cell', into a descriptor (usually, but not required to be a simple
# component-node name) that can appear in the generated config file.  'all_layers' is a vector of objects
# inheriting from XconfigLayerBase.  'current_layer' is provided so that the
# function can make sure not to look in layers that appear *after* this layer
# (because that's not allowed).
def get_string_from_layer_name(all_layers, current_layer, full_layer_name):
    """Return the config-file descriptor for the layer 'full_layer_name'.

    Args:
      all_layers: list of layer objects providing get_name(), output_name()
          and auxiliary_outputs().
      current_layer: only layers that appear before this one (compared by
          identity) in 'all_layers' are searched.
      full_layer_name: a layer name like 'ivector', or a sub-layer name like
          'lstm2.memory_cell'.

    Returns:
      The value of output_name() of the matching layer.

    Raises:
      RuntimeError: if the layer is not found among the preceding layers, or
          if it has no auxiliary output of the requested name.
    """
    layer_name, auxiliary_output = split_layer_name(full_layer_name)
    for layer in all_layers:
        if layer is current_layer:
            break

        # The following if-statement is needed to handle the case where the
        # layer is an 'existing' layer, derived from an existing trained
        # neural network supplied via the --existing-model option, that we are
        # adding layers to.  In this case the name of the layer will actually
        # be of the form xxx.yyy, e.g. 'tdnn1.affine'.
        # The code path will also be taken for regular (non-'existing') layer
        # names where the 'auxiliary_output' field is not used, which is actually
        # the normal case (e.g. when 'full_layer_name' is 'lstm1',
        # as opposed to, say, 'lstm1.c').
        if layer.get_name() == full_layer_name:
            return layer.output_name()

        if layer.get_name() == layer_name:
            if (auxiliary_output not in layer.auxiliary_outputs()
                    and auxiliary_output is not None):
                raise RuntimeError("Layer '{0}' has no such auxiliary output: "
                                   "'{1}' ({0}.{1})".format(
                    layer_name, auxiliary_output))
            return layer.output_name(auxiliary_output)

    # No such layer was found among the preceding layers; say whether it
    # exists at all.
    if any(layer.get_name() == layer_name for layer in all_layers):
        raise RuntimeError("Layer '{0}' was requested before it appeared in "
                           "the xconfig file (circular dependencies or out-of-order "
                           "layers)".format(layer_name))
    else:
        raise RuntimeError("No such layer: '{0}'".format(layer_name))


# This function, used in converting string values in config lines to
# configuration values in self.config in layers, attempts to
# convert 'string_value' to an instance dest_type (which is of type Type)
# 'key' is only needed for printing errors.
def convert_value_to_type(key, dest_type, string_value):
    """Convert the string 'string_value' to an instance of 'dest_type'.

    Args:
      key: name of the configuration value; only used in error messages.
      dest_type: one of the types bool, int, float or str.
      string_value: the string to convert.  For bool, only 'True', 'true',
          'False' and 'false' are accepted.

    Returns:
      The converted value.  For any other 'dest_type' nothing is converted
      and None is returned.

    Raises:
      RuntimeError: if 'string_value' cannot be converted to 'dest_type'.
    """
    if dest_type is bool:
        if string_value in ("True", "true"):
            return True
        elif string_value in ("False", "false"):
            return False
        else:
            raise RuntimeError("Invalid configuration value {0}={1} (expected bool)".format(
                key, string_value))
    elif dest_type is int:
        try:
            return int(string_value)
        except (TypeError, ValueError):
            raise RuntimeError("Invalid configuration value {0}={1} (expected int)".format(
                key, string_value))
    elif dest_type is float:
        try:
            return float(string_value)
        except (TypeError, ValueError):
            raise RuntimeError("Invalid configuration value {0}={1} (expected float)".format(
                key, string_value))
    elif dest_type is str:
        return string_value
    # Unsupported destination types are not converted.
    return None


# This class parses and stores a Descriptor-- expression
# like Append(Offset(input, -3), input) and so on.
# For the full range of possible expressions, see the comment at the
# top of src/nnet3/nnet-descriptor.h.
# Note: as an extension to the descriptor format used in the C++
# code, we can have e.g. input@-3 meaning Offset(input, -3);
# and if bare integer numbers appear where a descriptor was expected,
# they are interpreted as Offset(prev_layer, -3) where 'prev_layer'
# is the previous layer in the config file.

# Also, in any place a raw input/layer/output name can appear, we accept things
# like [-1] meaning the previous input/layer/output's name, or [-2] meaning the
# last-but-one input/layer/output, and so on.
class Descriptor(object):
    """Parsed form of a Descriptor expression such as
    'Append(Offset(input, -3), input)'.

    Attributes:
      operator: the operator name as a string (e.g. 'Offset', 'Append',
          'Sum'), or None for the base case of a bare layer name.
      items: the arguments of the operator (Descriptors, strings or numbers);
          for the base case, a one-element list holding the layer name.
    """
    def __init__(self,
                 descriptor_string = None,
                 prev_names = None):
        """Parse 'descriptor_string', or create an empty Descriptor if it is None.

        'prev_names' is passed on to the tokenizer and parser, which use it
        to resolve references to previous layers.  Raises RuntimeError if
        the string cannot be parsed.
        """
        # self.operator is a string that may be 'Offset', 'Append',
        # 'Sum', 'Failover', 'IfDefined', 'Switch', 'Round',
        # 'ReplaceIndex'; it also may be None, representing the base-case
        # (where it's just a layer name)

        # self.items will be whatever items are
        # inside the parentheses, e.g. if this is Sum(foo bar),
        # then items will be [d1, d2], where d1 is a Descriptor for
        # 'foo' and d2 is a Descriptor for 'bar'.  However, there are
        # cases where elements of self.items are strings or integers,
        # for instance in an expression 'ReplaceIndex(ivector, x, 0)',
        # self.items would be [d, 'x', 0], where d is a Descriptor
        # for 'ivector'.  In the case where self.operator is None (where
        # this Descriptor represents just a bare layer name), self.
        # items contains the name of the input layer as a string.
        self.operator = None
        self.items = None

        if descriptor_string is not None:
            try:
                tokens = tokenize_descriptor(descriptor_string, prev_names)
                pos = 0
                (d, pos) = parse_new_descriptor(tokens, pos, prev_names)
                # note: 'pos' should point to the 'end of string' marker
                # that terminates 'tokens'.
                if pos != len(tokens) - 1:
                    raise RuntimeError("Parsing Descriptor, saw junk at end: " +
                                    ' '.join(tokens[pos:-1]))
                # copy members from d.
                self.operator = d.operator
                self.items = d.items
            except RuntimeError as e:
                # Imported here so that the name is certain to be available
                # when the original error is reported.
                import traceback
                traceback.print_tb(sys.exc_info()[2])
                raise RuntimeError("Error parsing Descriptor '{0}', specific error was: {1}".format(
                    descriptor_string, repr(e)))

    # This is like the str() function, but it uses the layer_to_string function
    # (which is a function from strings to strings) to convert layer names (or
    # in general sub-layer names of the form 'foo.bar') to the component-node
    # (or, in general, descriptor) names that appear in the final config file.
    # This mechanism gives those designing layer types the freedom to name their
    # nodes as they want.
    def config_string(self, layer_to_string):
        if self.operator is None:
            assert len(self.items) == 1 and isinstance(self.items[0], str)
            return layer_to_string(self.items[0])
        else:
            assert isinstance(self.operator, str)
            return self.operator + '(' + ', '.join(
                    [ item.config_string(layer_to_string) if isinstance(item, Descriptor) else str(item)
                      for item in self.items]) + ')'

    # Returns the descriptor as a string, using the layer names as they were
    # given (no conversion to names of nodes in the final config file).
    def str(self):
        if self.operator is None:
            assert len(self.items) == 1 and isinstance(self.items[0], str)
            return self.items[0]
        else:
            assert isinstance(self.operator, str)
            return self.operator + '(' + ', '.join([str(item) for item in self.items]) + ')'

    def __str__(self):
        return self.str()

    # This function returns the dimension (i.e. the feature dimension) of the
    # descriptor.  It takes 'layer_to_dim' which is a function from
    # layer-names (including sub-layer names, like lstm1.memory_cell) to
    # dimensions, e.g. you might have layer_to_dim('ivector') = 100, or
    # layer_to_dim('affine1') = 1024.
    # note: layer_to_dim will raise an exception if a nonexistent layer or
    # sub-layer is requested.
    # Raises RuntimeError if the operator is unknown, or if the arguments of
    # Sum, Failover, IfDefined or Switch have different dimensions.
    def dim(self, layer_to_dim):
        if self.operator is None:
            # base-case: self.items = [ layer_name ] (or sub-layer name, like
            # 'lstm.memory_cell').
            return layer_to_dim(self.items[0])
        elif self.operator in [ 'Sum', 'Failover', 'IfDefined', 'Switch' ]:
            # these are all operators for which all args are descriptors
            # and must have the same dim.
            dim = self.items[0].dim(layer_to_dim)
            for desc in self.items[1:]:
                next_dim = desc.dim(layer_to_dim)
                if next_dim != dim:
                    raise RuntimeError("In descriptor {0}, different fields have different "
                                       "dimensions: {1} != {2}".format(self.str(), dim, next_dim))
            return dim
        elif self.operator in [  'Offset', 'Round', 'ReplaceIndex' ]:
            # for these operators, only the 1st arg is relevant.
            return self.items[0].dim(layer_to_dim)
        elif self.operator == 'Append':
            return sum([ x.dim(layer_to_dim) for x in self.items])
        elif self.operator == 'Scale':
            # e.g. Scale(2.0, lstm1).  Return dim of 2nd arg.
            return self.items[1].dim(layer_to_dim)
        elif self.operator == 'Const':
            # e.g. Const(0.5, 512).  Return 2nd arg, which is an int.
            return self.items[1]
        else:
            raise RuntimeError("Unknown operator {0}".format(self.operator))


# This just checks that seen_item == expected_item, and raises an
# exception if not.
def expect_token(expected_item, seen_item, what_parsing):
    """Raise RuntimeError unless 'seen_item' equals 'expected_item'.

    'what_parsing' names the construct being parsed and is only used in the
    error message.
    """
    if seen_item == expected_item:
        return
    message = "parsing {0}, expected '{1}' but got '{2}'".format(
        what_parsing, expected_item, seen_item)
    raise RuntimeError(message)

# returns true if 'name' is valid as the name of a line (input, layer or output);
# this is the same as IsValidname() in the nnet3 code.
def is_valid_line_name(name):
    """Return True if 'name' is valid as the name of a line (input, layer or output).

    A valid name is a string whose first character is a letter or underscore
    and whose remaining characters are all letters, digits, '_', '-' or '.'.
    """
    # The pattern is anchored at the end (\Z) so that the whole name is
    # checked, not just a valid-looking prefix of it.
    return (isinstance(name, str)
            and re.match(r'^[a-zA-Z_][-a-zA-Z_0-9.]*\Z', name) is not None)

# This function for parsing Descriptors takes an array of tokens as produced
# by tokenize_descriptor.  It parses a descriptor
# starting from position pos >= 0 of the array 'tokens', and
# returns a new position in the array that reflects any tokens consumed while
# parsing the descriptor.
# It returns a pair (d, pos) where d is the newly parsed Descriptor,
# and 'pos' is the new position after consuming the relevant input.
# 'prev_names' is so that we can find the most recent layer name for
# expressions like Append(-3, 0, 3) which is shorthand for the most recent
# layer spliced at those time offsets.
def parse_new_descriptor(tokens, pos, prev_names):
    """Parse one Descriptor from 'tokens', starting at index 'pos'.

    Args:
      tokens: list of string tokens, as produced by tokenize_descriptor.
      pos: index (>= 0) into 'tokens' at which the descriptor starts.
      prev_names: list of the names of the preceding layers.  It is needed to
          interpret bare integers, as in Append(-3, 0, 3), which stand for
          the most recent layer at those time offsets.

    Returns:
      A pair (d, pos) where d is the newly parsed Descriptor and 'pos' is
      the position after the tokens that were consumed.

    Raises:
      RuntimeError: if the tokens do not form a valid descriptor.
    """
    first_token = tokens[pos]
    pos += 1
    d = Descriptor()

    # when reading this function, be careful to note the indent level,
    # there is an if-statement within an if-statement.
    if first_token in [ 'Offset', 'Round', 'ReplaceIndex', 'Append', 'Sum',
                        'Switch', 'Failover', 'IfDefined' ]:
        expect_token('(', tokens[pos], first_token + '()')
        pos += 1
        d.operator = first_token
        # the 1st argument of all these operators is a Descriptor.
        (desc, pos) = parse_new_descriptor(tokens, pos, prev_names)
        d.items = [desc]

        if first_token == 'Offset':
            # Offset(descriptor, t_offset) or Offset(descriptor, t_offset, x_offset)
            expect_token(',', tokens[pos], 'Offset()')
            pos += 1
            try:
                t_offset = int(tokens[pos])
            except ValueError:
                raise RuntimeError("Parsing Offset(), expected integer, got " + tokens[pos])
            pos += 1
            d.items.append(t_offset)
            if tokens[pos] == ')':
                return (d, pos + 1)
            elif tokens[pos] != ',':
                raise RuntimeError("Parsing Offset(), expected ')' or ',', got " + tokens[pos])
            pos += 1
            try:
                x_offset = int(tokens[pos])
            except ValueError:
                raise RuntimeError("Parsing Offset(), expected integer, got " + tokens[pos])
            pos += 1
            d.items.append(x_offset)
            expect_token(')', tokens[pos], 'Offset()')
            pos += 1
        elif first_token in [ 'Append', 'Sum', 'Switch', 'Failover', 'IfDefined' ]:
            # These take a comma-separated list of descriptors.
            while True:
                if tokens[pos] == ')':
                    # check num-items is correct for some special cases.
                    if first_token == 'Failover' and len(d.items) != 2:
                        raise RuntimeError("Parsing Failover(), expected 2 items but got {0}".format(len(d.items)))
                    if first_token == 'IfDefined' and len(d.items) != 1:
                        raise RuntimeError("Parsing IfDefined(), expected 1 item but got {0}".format(len(d.items)))
                    pos += 1
                    break
                elif tokens[pos] == ',':
                    pos += 1  # consume the comma.
                else:
                    raise RuntimeError("Parsing {0}(), expected ')' or ',', got {1}".format(
                        first_token, tokens[pos]))

                (desc, pos) = parse_new_descriptor(tokens, pos, prev_names)
                d.items.append(desc)
        elif first_token == 'Round':
            # Round(descriptor, t_modulus), where t_modulus must be > 0.
            expect_token(',', tokens[pos], 'Round()')
            pos += 1
            try:
                t_modulus = int(tokens[pos])
            except ValueError:
                raise RuntimeError("Parsing Round(), expected integer, got " + tokens[pos])
            if t_modulus <= 0:
                raise RuntimeError("Parsing Round(), expected positive integer, got " + tokens[pos])
            pos += 1
            d.items.append(t_modulus)
            expect_token(')', tokens[pos], 'Round()')
            pos += 1
        elif first_token == 'ReplaceIndex':
            # ReplaceIndex(descriptor, 'x' or 't', new_value)
            expect_token(',', tokens[pos], 'ReplaceIndex()')
            pos += 1
            if tokens[pos] in [ 'x', 't' ]:
                d.items.append(tokens[pos])
                pos += 1
            else:
                raise RuntimeError("Parsing ReplaceIndex(), expected 'x' or 't', got " +
                                tokens[pos])
            expect_token(',', tokens[pos], 'ReplaceIndex()')
            pos += 1
            try:
                new_value = int(tokens[pos])
            except ValueError:
                raise RuntimeError("Parsing ReplaceIndex(), expected integer, got " + tokens[pos])
            pos += 1
            d.items.append(new_value)
            expect_token(')', tokens[pos], 'ReplaceIndex()')
            pos += 1
        else:
            raise RuntimeError("code error")
    elif first_token in ['Scale', 'Const' ]:
        # Parsing something like 'Scale(2.0, lstm1)' or 'Const(1.0, 512)'
        expect_token('(', tokens[pos], first_token + '()')
        pos += 1
        d.operator = first_token
        # First arg of Scale() and Const() is a float: the scale or value,
        # respectively.
        try:
            value = float(tokens[pos])
        except ValueError:
            raise RuntimeError("Parsing {0}, expected float, got {1}".format(
                first_token, tokens[pos]))
        pos += 1
        d.items = [value]
        # Consume the comma.
        expect_token(',', tokens[pos], first_token + '()')
        pos += 1
        if first_token == 'Scale':
            # Second arg of Scale() is a Descriptor.
            (desc, pos) = parse_new_descriptor(tokens, pos, prev_names)
            d.items.append(desc)
        else:
            assert first_token == 'Const'
            # Second arg of Const() is an int: the dimension.
            try:
                dim = int(tokens[pos])
            except ValueError:
                raise RuntimeError("Parsing Const() expression, expected int, got {0}".format(
                    tokens[pos]))
            pos += 1
            d.items.append(dim)
        expect_token(')', tokens[pos], first_token)
        pos += 1
    elif first_token in [ 'end of string', '(', ')', ',', '@' ]:
        raise RuntimeError("Expected descriptor, got " + first_token)
    elif is_valid_line_name(first_token) or first_token == '[':
        # This section parses a raw input/layer/output name, e.g. "affine2"
        # (which must start with an alphabetic character or underscore),
        # optionally followed by an offset like '@-3'.

        d.operator = None
        d.items = [first_token]

        # If the layer-name is followed by '@', then
        # we're parsing something like 'affine1@-3' which
        # is syntactic sugar for 'Offset(affine1, -3)'.
        if tokens[pos] == '@':
            pos += 1
            try:
                offset_t = int(tokens[pos])
            except ValueError:
                raise RuntimeError("Parse error parsing {0}@{1}".format(
                    first_token, tokens[pos]))
            pos += 1
            if offset_t != 0:
                inner_d = d
                d = Descriptor()
                # e.g. foo@3 is equivalent to 'Offset(foo, 3)'.
                d.operator = 'Offset'
                d.items = [ inner_d, offset_t ]
    else:
        # the last possible case is that 'first_token' is just an integer i,
        # which can appear in things like Append(-3, 0, 3).
        # See if the token is an integer.
        # In this case, it's interpreted as the name of previous layer
        # (with that time offset applied).
        try:
            offset_t = int(first_token)
        except ValueError:
            raise RuntimeError("Parsing descriptor, expected descriptor but got " +
                            first_token)
        assert isinstance(prev_names, list)
        if len(prev_names) < 1:
            raise RuntimeError("Parsing descriptor, could not interpret '{0}' because "
                            "there is no previous layer".format(first_token))
        d.operator = None
        # the layer name is the name of the most recent layer.
        d.items = [prev_names[-1]]
        if offset_t != 0:
            inner_d = d
            d = Descriptor()
            d.operator = 'Offset'
            d.items = [ inner_d, offset_t ]
    return (d, pos)


# This function takes a string 'descriptor_string' which might
# look like 'Append([-1], [-2], input)', and a list of previous layer
# names like prev_names = ['foo', 'bar', 'baz'], and replaces
# the integers in brackets with the previous layers.  -1 means
# the most recent previous layer ('baz' in this case), -2
# means the last layer but one ('bar' in this case), and so on.
# It will throw an exception if the number is out of range.
# If there are no such expressions in the string, it's OK if
# prev_names == None (this is useful for testing).
def replace_bracket_expressions_in_descriptor(descriptor_string,
                                              prev_names = None):
    """Replace bracketed references such as '[-1]' with previous-layer names.

    'descriptor_string' is split on '[' and ']'.  Each '[<int>]' expression is
    replaced by prev_names[<int>]; all other text is copied through unchanged.
    The integer must be negative and must satisfy -int <= len(prev_names).

    prev_names may be None only if the string contains no bracket expressions.

    Raises RuntimeError for an unmatched ']', a '[' too close to the end of
    the string, a non-integer or out-of-range offset, or a '[<int>' that is
    not closed by ']'.  Raises AssertionError if a bracket expression is
    present but prev_names is not a list.
    """
    # Because of the capturing group, re.split() keeps the brackets as fields:
    # even indexes hold plain text, odd indexes hold '[' or ']'.
    fields = re.split(r'(\[|\])\s*', descriptor_string)
    out_fields = []
    i = 0
    while i < len(fields):
        f = fields[i]
        i += 1
        if f == ']':
            raise RuntimeError("Unmatched ']' in descriptor")
        elif f == '[':
            if i + 2 >= len(fields):
                raise RuntimeError("Error tokenizing string '{0}': '[' found too close "
                                "to the end of the descriptor.".format(descriptor_string))
            assert isinstance(prev_names, list)
            # Only a failed integer conversion is caught here; anything else
            # (e.g. KeyboardInterrupt) propagates untouched.
            try:
                offset = int(fields[i])
            except ValueError:
                offset = None
            if offset is None or not (offset < 0 and -offset <= len(prev_names)):
                raise RuntimeError("Error tokenizing string '{0}': expression [{1}] has an "
                                "invalid or out of range offset.".format(descriptor_string, fields[i]))
            # The field after the integer is always a bracket; it has to be the
            # closing one, otherwise something like '[-1[' would be accepted.
            if fields[i + 1] != ']':
                raise RuntimeError("Error tokenizing string '{0}': expected ']' after "
                                "'[{1}'.".format(descriptor_string, fields[i]))
            i += 2  # consume the int and the ']'.
            out_fields.append(prev_names[offset])
        else:
            out_fields.append(f)
    return ''.join(out_fields)

# tokenizes 'descriptor_string' into the tokens that may be part of Descriptors.
# Note: for convenience in parsing, we add the token 'end of string' to the
# end of this list.
# The argument 'prev_names' (for the names of previous layers and input and
# output nodes) is needed to process expressions like [-1] meaning the most
# recent layer, or [-2] meaning the last layer but one.
# The default None for prev_names is only supplied for testing purposes.
# Called with 'Append(-1, 0, 1)' this would return
# [ 'Append', '(', '-1', ',', '0', ',', '1', ')', 'end of string' ].
# for a more complicated example: if you call
#   tokenize_descriptor('Append(-1, 0, 1, [-2]@0)', prev_names = ['a', 'b', 'c', 'd'])
# the [-2] would get replaced with prev_names[-2] = 'c', returning:
#  [ 'Append', '(', '-1', ',', '0', ',', '1', ',', 'c', '@', '0', ')', 'end of string' ]
def tokenize_descriptor(descriptor_string,
                       prev_names = None):
    """Split a descriptor string into its tokens.

    Bracket expressions such as '[-1]' are first expanded into layer names
    using 'prev_names'.  The result is then split on '(', ')', '@', ',' and
    whitespace; the punctuation characters are kept as tokens while empty and
    whitespace-only pieces are dropped.  The sentinel token 'end of string'
    is appended to the returned list.
    """
    expanded = replace_bracket_expressions_in_descriptor(descriptor_string,
                                                         prev_names)
    # The capturing group makes re.split() return the separators themselves,
    # which is how characters like '(' and ')' survive as tokens.
    pieces = re.split(r'(\(|\)|@|,|\s)\s*', expanded)
    blank = re.compile(r'^\s*$')
    tokens = [piece for piece in pieces if blank.match(piece) is None]
    tokens.append('end of string')
    return tokens


# This function parses a line in a config file, something like
# affine-layer name=affine1 input=Append(-3, 0, 3)
# and returns a pair,
# (first_token, fields), as (string, dict) e.g. in this case
# ('affine-layer', {'name':'affine1', 'input':'Append(-3, 0, 3)'}).
# Note: spaces are allowed in the field names but = signs are
# disallowed, except when quoted with double quotes,
# which is why it's possible to parse them.
# This function also removes comments (anything after '#').
# As a special case, this function will return None if the line
# is empty after removing spaces.
def parse_config_line(orig_config_line):
    """Parse one xconfig line into its first token and its key=value options.

    Anything after '#' is discarded as a comment.  The first
    whitespace-separated field is the first token (it must match
    '^[a-z][-a-z0-9]+$'); the remainder is parsed as 'name=value' pairs.
    Values may contain spaces, and may contain '=' signs when enclosed in
    double quotes.

    Returns:
        None if the line is empty after removing the comment and whitespace;
        otherwise the pair (first_token, dict mapping option name -> value).

    Raises:
        RuntimeError: if the line contains a disallowed character, the first
            field does not look like a layer type, double quotes are not
            paired, the options cannot be split into name=value pairs, a name
            does not start with a letter or '_', or a name is repeated.
    """
    # Remove comments.
    # note: splitting on '#' will always give at least one field...  python
    # treats splitting on space as a special case that may give zero fields.
    config_line = orig_config_line.split('#')[0]
    # Note: this set of allowed characters may have to be expanded in future.
    # The pattern is a raw string because it contains backslash sequences
    # ('\.', '\-', '\(', '\s') that are not valid Python string escapes.
    x = re.search(r'[^a-zA-Z0-9\.\-\(\)@_=,/+:\s"]', config_line)
    if x is not None:
        bad_char = x.group(0)
        if bad_char == "'":
            raise RuntimeError("Xconfig line has disallowed character ' (use "
                               "double quotes for strings containing = signs)")
        else:
            raise RuntimeError("Xconfig line has disallowed character: {0}"
                               .format(bad_char))

    # Now split on space; later we may splice things back together.
    fields = config_line.split()
    if len(fields) == 0:
        return None   # Line was only whitespace after removing comments.
    first_token = fields[0]
    # if first_token does not look like 'foo-bar' or 'foo-bar2', then die.
    if re.match(r'^[a-z][-a-z0-9]+$', first_token) is None:
        raise RuntimeError("Error parsing config line (first field doesn't look right).")

    # Everything after the first field, re-joined with single spaces.
    # It can be of the form 'a=1 b=" x=1 y=2 " c=Append( i1, i2)'
    rest_of_line = ' '.join(fields[1:])
    positions = [m.start() for m in re.finditer('"', rest_of_line)]
    if len(positions) % 2 != 0:
        raise RuntimeError("Double-quotes should occur in pairs")

    # Replace all the equals signs inside the "-enclosed strings with question
    # marks ('?') [an arbitrary character that cannot otherwise be present,
    # since the allowed-character check above rejects it], and replace the
    # quotation marks themselves with spaces.  Later on the question marks
    # are converted back to equals signs in the values of the dict.
    for start, end in zip(positions[0::2], positions[1::2]):
        inside_quotes = rest_of_line[start + 1:end].replace('=', '?')
        # Spaces take the place of the quotes so that the length of
        # rest_of_line, and hence the remaining 'positions', stay valid.
        new_rest_of_line = (rest_of_line[:start] + ' ' + inside_quotes + ' '
                            + rest_of_line[end + 1:])
        assert len(new_rest_of_line) == len(rest_of_line)
        rest_of_line = new_rest_of_line

    # suppose rest_of_line is: 'input=Append(foo, bar) foo=bar'
    # then after the below we'll get
    # other_fields = ['', 'input', 'Append(foo, bar)', 'foo', 'bar']
    ans_dict = dict()
    other_fields = re.split(r'\s*([-a-zA-Z0-9_]*)=', rest_of_line)
    if not (other_fields[0] == '' and len(other_fields) % 2 == 1):
        raise RuntimeError("Could not parse config line.")
    names_and_values = other_fields[1:]
    for var_name, var_value in zip(names_and_values[0::2],
                                   names_and_values[1::2]):
        if re.match(r'[a-zA-Z_]', var_name) is None:
            raise RuntimeError("Expected variable name '{0}' to start with alphabetic character or _, "
                            "in config line {1}".format(var_name, orig_config_line))
        if var_name in ans_dict:
            raise RuntimeError("Config line has multiply defined variable {0}: {1}".format(
                var_name, orig_config_line))
        # Replace any '?' characters that we inserted above with the original
        # '=' characters.  The strip() removes initial and final spaces that
        # may have been introduced when the double quotes were replaced.
        ans_dict[var_name] = var_value.replace('?', '=').strip()
    return (first_token, ans_dict)


def test_library():
    """Self-test for the tokenizer, Descriptor parsing and parse_config_line.

    The tokenizer checks and the first three Descriptor checks are hard
    assertions.  Descriptor round-trip mismatches in the two loops are only
    reported with print().  The parse_config_line() results are printed for
    visual inspection.
    """
    tokenize_test = lambda x: tokenize_descriptor(x)[:-1]  # remove 'end of string'
    assert tokenize_test("hi") == ['hi']
    assert tokenize_test("hi there") == ['hi', 'there']
    assert tokenize_test("hi,there") == ['hi', ',', 'there']
    assert tokenize_test("hi@-1,there") == ['hi', '@', '-1', ',', 'there']
    assert tokenize_test("hi(there)") == ['hi', '(', 'there', ')']
    assert tokenize_descriptor("[-1]@2", ['foo', 'bar'])[:-1] == ['bar', '@', '2' ]
    assert tokenize_descriptor("[-2].special@2", ['foo', 'bar'])[:-1] == ['foo.special', '@', '2' ]

    assert Descriptor('foo').str() == 'foo'
    assert Descriptor('Sum(foo,bar)').str() == 'Sum(foo, bar)'
    assert Descriptor('Sum(Offset(foo,1),Offset(foo,0))').str() == 'Sum(Offset(foo, 1), Offset(foo, 0))'
    for x in [ 'Append(foo, Sum(bar, Offset(baz, 1)))', 'Failover(foo, Offset(bar, -1))',
               'IfDefined(Round(baz, 3))', 'Switch(foo1, Offset(foo2, 2), Offset(foo3, 3))',
               'IfDefined(ReplaceIndex(ivector, t, 0))', 'ReplaceIndex(foo, x, 0)' ]:
        actual = Descriptor(x).str()
        if not actual == x:
            print("Error: '{0}' != '{1}'".format(actual, x))

    prev_names = ['last_but_one_layer', 'prev_layer']
    for x, y in [ ('Sum(foo,bar)', 'Sum(foo, bar)'),
                  ('Sum(foo1,bar-3_4)', 'Sum(foo1, bar-3_4)'),
                  ('Append(input@-3, input@0, input@3)',
                   'Append(Offset(input, -3), input, Offset(input, 3))'),
                  ('Append(-3,0,3)',
                   'Append(Offset(prev_layer, -3), prev_layer, Offset(prev_layer, 3))'),
                  ('[-1]', 'prev_layer'),
                  ('Scale(2.0,foo)', 'Scale(2.0, foo)'),
                  ('Const(0.5,500)', 'Const(0.5, 500)'),
                  ('[-2]', 'last_but_one_layer'),
                  ('[-2]@3',
                   'Offset(last_but_one_layer, 3)') ]:
        # Report the value that was actually compared.  These inputs refer to
        # previous layers, so the descriptor must be built with prev_names
        # when formatting the message too.
        actual = Descriptor(x, prev_names).str()
        if not actual == y:
            print("Error: '{0}' != '{1}'".format(actual, y))


    print(parse_config_line('affine-layer input=Append(foo, bar) foo=bar'))
    print(parse_config_line('affine-layer x="y z" input=Append(foo, bar) foo=bar opt2="a=1 b=2"'))
    print(parse_config_line('affine-layer1 input=Append(foo, bar) foo=bar'))
    print(parse_config_line('affine-layer'))

# Run the self-test when this file is executed directly as a script.
if __name__ == "__main__":
    test_library()


================================================
FILE: egs/steps/lmrescore.sh
================================================
#!/usr/bin/env bash

# Language-model rescoring of lattices: removes the scores of the old G.fst
# and adds the scores of the new G.fst.  Reads $indir/lat.*.gz and writes
# $outdir/lat.*.gz, then (unless --skip-scoring true) runs local/score.sh.

set -e -o pipefail

# Begin configuration section.
mode=4  # mode can be 1 through 5.  They should all give roughly similar results.
        # See the comments in the case statement for more details.
cmd=run.pl
skip_scoring=false
self_loop_scale=0.1  # only matters for mode 4.
acoustic_scale=0.1   # only matters for mode 5.
scoring_opts=
# End configuration section.

echo "$0 $@"  # Print the command line for logging

. ./utils/parse_options.sh

if [ $# != 5 ]; then
   echo "Do language model rescoring of lattices (remove old LM, add new LM)"
   echo "Usage: steps/lmrescore.sh [options] <old-lang-dir> <new-lang-dir> <data-dir> <input-decode-dir> <output-decode-dir>"
   echo "Options:"
   echo " --cmd   <cmd-string>       # How to run commands (e.g. run.pl, queue.pl)"
   echo " --mode  (1|2|3|4|5)        # Mode of LM rescoring to use (default: 4)."
   echo "                            # These should give very similar results."
   echo " --self-loop-scale  <scale> # Self-loop-scale, only relevant in mode 4."
   echo "                            # Default: 0.1."
   echo " --acoustic-scale  <scale>  # Acoustic scale, only relevant in mode 5."
   echo "                            # Default: 0.1."
   exit 1;
fi

[ -f path.sh ] && . ./path.sh;

oldlang=$1
newlang=$2
data=$3
indir=$4
outdir=$5

oldlm=$oldlang/G.fst
newlm=$newlang/G.fst
# Sanity checks on the inputs.  A vocabulary mismatch is only a warning;
# missing LMs or lattices are fatal.
! cmp $oldlang/words.txt $newlang/words.txt && echo "Warning: vocabularies may be incompatible."
[ ! -f $oldlm ] && echo Missing file $oldlm && exit 1;
[ ! -f $newlm ] && echo Missing file $newlm && exit 1;
! ls $indir/lat.*.gz >/dev/null && echo "No lattices input directory $indir" && exit 1;

if ! cmp -s $oldlang/words.txt $newlang/words.txt; then
  echo "$0: $oldlang/words.txt and $newlang/words.txt differ: make sure you know what you are doing.";
fi

# The trailing '|' makes these pipe-style inputs for the lattice-lmrescore
# calls in mode 1.
oldlmcommand="fstproject --project_output=true $oldlm |"
newlmcommand="fstproject --project_output=true $newlm |"

mkdir -p $outdir/log

# Integer id of the '#0' symbol in the new words.txt; passed as --phi-label
# in modes 3 and 4.
phi=`grep -w '#0' $newlang/words.txt | awk '{print $2}'`

if [ "$mode" == 4 ]; then
  # we have to prepare $outdir/Ldet.fst in this case: determinized
  # lexicon (determinized on phones), with disambig syms removed.
  # take L_disambig.fst; get rid of transition with "#0 #0" on it; determinize
  # with epsilon removal; remove disambiguation symbols.
  fstprint $newlang/L_disambig.fst | awk '{if($4 != '$phi'){print;}}' | fstcompile | \
    fstdeterminizestar | fstrmsymbols $newlang/phones/disambig.int >$outdir/Ldet.fst || exit 1;
fi

nj=`cat $indir/num_jobs` || exit 1;
cp $indir/num_jobs $outdir


#for lat in $indir/lat.*.gz; do
#  number=`basename $lat | cut -d. -f2`;
#  newlat=$outdir/`basename $lat`

case "$mode" in
  1) # 1 is inexact, it's the original way of doing it.
    $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \
      lattice-lmrescore --lm-scale=-1.0 "ark:gunzip -c $indir/lat.JOB.gz|" "$oldlmcommand" ark:-  \| \
      lattice-lmrescore --lm-scale=1.0 ark:- "$newlmcommand" "ark,t:|gzip -c>$outdir/lat.JOB.gz" \
      || exit 1;
    ;;
  2)  # 2 is equivalent to 1, but using more basic operations, combined.
    $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \
      gunzip -c $indir/lat.JOB.gz \| \
      lattice-scale --acoustic-scale=-1 --lm-scale=-1 ark:- ark:- \| \
      lattice-compose ark:- "fstproject --project_output=true $oldlm |" ark:- \| \
      lattice-determinize ark:- ark:- \| \
      lattice-scale --acoustic-scale=-1 --lm-scale=-1 ark:- ark:- \| \
      lattice-compose ark:- "fstproject --project_output=true $newlm |" ark:- \| \
      lattice-determinize ark:- ark:- \| \
      gzip -c \>$outdir/lat.JOB.gz || exit 1;
    ;;
  3) # 3 is "exact" in that we remove the old LM scores accepting any path
     # through G.fst (which is what we want as that happened in lattice
     # generation), but we add the new one with "phi matcher", only taking
     # backoff arcs if an explicit arc did not exist.
    $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \
      gunzip -c $indir/lat.JOB.gz \| \
      lattice-scale --acoustic-scale=-1 --lm-scale=-1 ark:- ark:- \| \
      lattice-compose ark:- "fstproject --project_output=true $oldlm |" ark:- \| \
      lattice-determinize ark:- ark:- \| \
      lattice-scale --acoustic-scale=-1 --lm-scale=-1 ark:- ark:- \| \
      lattice-compose --phi-label=$phi ark:- $newlm ark:- \| \
      lattice-determinize ark:- ark:- \| \
      gzip -c \>$outdir/lat.JOB.gz || exit 1;
    ;;
  4) # 4 is also exact (like 3), but instead of subtracting the old LM-scores,
     # it removes the old graph scores entirely and adds in the lexicon,
     # grammar and transition weights.
    mdl=`dirname $indir`/final.mdl
    [ ! -f $mdl ] && echo No such model $mdl && exit 1;
    [[ -f `dirname $indir`/frame_subsampling_factor && "$self_loop_scale" == 0.1 ]] && \
      echo "$0: WARNING: chain models need '--self-loop-scale 1.0'";
    $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \
      gunzip -c $indir/lat.JOB.gz \| \
      lattice-scale --lm-scale=0.0 ark:- ark:- \| \
      lattice-to-phone-lattice $mdl ark:- ark:- \| \
      lattice-compose ark:- $outdir/Ldet.fst ark:- \| \
      lattice-determinize ark:- ark:- \| \
      lattice-compose --phi-label=$phi ark:- $newlm ark:- \| \
      lattice-add-trans-probs --transition-scale=1.0 --self-loop-scale=$self_loop_scale \
      $mdl ark:- ark:- \| \
      gzip -c \>$outdir/lat.JOB.gz  || exit 1;
    ;;
  5) # Mode 5 uses the binary lattice-lmrescore-pruned to do the LM rescoring
    # within a single program.  There are options for pruning, but these won't
    # normally need to be modified; the pruned aspect is more necessary for
    # RNNLM rescoring or when the lattices are extremely deep.

    [[ -f `dirname $indir`/frame_subsampling_factor && "$acoustic_scale" == 0.1 ]] && \
      echo "$0: WARNING: chain models need '--acoustic-scale 1.0'";

    $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \
      lattice-lmrescore-pruned --acoustic-scale=$acoustic_scale "$oldlm" "$newlm" \
      "ark:gunzip -c $indir/lat.JOB.gz|" "ark:|gzip -c >$outdir/lat.JOB.gz" || exit 1;
    ;;
  *) # Any other value: fail loudly instead of silently writing no lattices
     # and then scoring an empty output directory.
    echo "$0: invalid mode '$mode' (expected 1, 2, 3, 4 or 5)."
    exit 1;
    ;;
esac

# Ldet.fst only exists in mode 4; ignore the failure otherwise.
rm $outdir/Ldet.fst 2>/dev/null || true

if ! $skip_scoring ; then
  [ ! -x local/score.sh ] && \
    echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
  local/score.sh $scoring_opts --cmd "$cmd" $data $newlang $outdir
else
  echo "Not scoring because requested so..."
fi

exit 0;


================================================
FILE: egs/steps/lmrescore_const_arpa.sh
================================================
#!/usr/bin/env bash

# Copyright 2014  Guoguo Chen
# Apache 2.0

# This script rescores lattices with the ConstArpaLm format language model.
# The old LM is read from <old-lang-dir>/G.fst and the new one from
# <new-lang-dir>/G.carpa; lattices are read from <input-decode-dir>/lat.*.gz
# and written to <output-decode-dir>/lat.*.gz.

# Begin configuration section.
cmd=run.pl
skip_scoring=false
stage=1          # stage 1 = rescoring, stage 2 = scoring.
scoring_opts=
# End configuration section.

echo "$0 $@"  # Print the command line for logging

. ./utils/parse_options.sh

if [ $# != 5 ]; then
   echo "Does language model rescoring of lattices (remove old LM, add new LM)"
   echo "Usage: $0 [options] <old-lang-dir> <new-lang-dir> \\"
   echo "                   <data-dir> <input-decode-dir> <output-decode-dir>"
   echo "options: [--cmd (run.pl|queue.pl [queue opts])]"
   exit 1;
fi

[ -f path.sh ] && . ./path.sh;

oldlang=$1
newlang=$2
data=$3
indir=$4
outdir=$5

oldlm=$oldlang/G.fst
newlm=$newlang/G.carpa
# Sanity checks on the inputs.  A vocabulary mismatch is only a warning;
# missing LMs or lattices are fatal.
! cmp $oldlang/words.txt $newlang/words.txt &&\
  echo "$0: Warning: vocabularies may be incompatible."
[ ! -f $oldlm ] && echo "$0: Missing file $oldlm" && exit 1;
[ ! -f $newlm ] && echo "$0: Missing file $newlm" && exit 1;
! ls $indir/lat.*.gz >/dev/null &&\
  echo "$0: No lattices input directory $indir" && exit 1;

if ! cmp -s $oldlang/words.txt $newlang/words.txt; then
  echo "$0: $oldlang/words.txt and $newlang/words.txt differ: make sure you know what you are doing.";
fi

# The trailing '|' makes this a pipe-style input for lattice-lmrescore.
oldlmcommand="fstproject --project_output=true $oldlm |"

mkdir -p $outdir/log
nj=`cat $indir/num_jobs` || exit 1;
cp $indir/num_jobs $outdir

# Stage 1: apply the old LM with scale -1.0 (removing its scores), then apply
# the const-arpa LM with scale 1.0, one job per input lattice archive.
if [ $stage -le 1 ]; then
  $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \
    lattice-lmrescore --lm-scale=-1.0 \
    "ark:gunzip -c $indir/lat.JOB.gz|" "$oldlmcommand" ark:-  \| \
    lattice-lmrescore-const-arpa --lm-scale=1.0 \
    ark:- "$newlm" "ark,t:|gzip -c>$outdir/lat.JOB.gz" || exit 1;
fi

# Stage 2: score the rescored lattices.  Note that the 'else' branch is also
# taken when scoring is skipped because --stage is greater than 2.
if ! $skip_scoring && [ $stage -le 2 ]; then
  err_msg="Not scoring because local/score.sh does not exist or not executable."
  [ ! -x local/score.sh ] && echo $err_msg && exit 1;
  local/score.sh --cmd "$cmd" $scoring_opts $data $newlang $outdir
else
  echo "Not scoring because requested so..."
fi

exit 0;


================================================
FILE: egs/steps/lmrescore_const_arpa_undeterminized.sh
================================================
#!/usr/bin/env bash

# Copyright 2014  Guoguo Chen
#           2017  Vimal Manohar
# Apache 2.0

# This script rescores non-compact, (possibly) undeterminized lattices with the
# ConstArpaLm format language model.
# This is similar to steps/lmrescore_const_arpa.sh, but expects
# non-compact lattices as input.
# This works by first determinizing the lattice and rescoring it with
# const ARPA LM, followed by composing it with the original lattice to add the
# new LM scores.

# If you use the option "--write-compact false" it outputs non-compact lattices;
# the purpose is to add in LM scores while leaving the frame-by-frame acoustic
# scores in the same position that they were in in the input, undeterminized
# lattices. This is important in our 'chain' semi-supervised training recipes,
# where it helps us to split lattices while keeping the scores at the edges of
# the split points correct.

# Begin configuration section.
cmd=run.pl
skip_scoring=false
stage=1              # stage 1 = rescoring, stage 2 = scoring.
scoring_opts=
write_compact=true   # If set to false, writes lattice in non-compact format.
                     # This retains the acoustic scores on the arcs of the lattice.
                     # Useful for another stage of LM rescoring.
acwt=0.1  # used for pruning and determinization
beam=8.0  # beam used in determinization

# End configuration section.

echo "$0 $@"  # Print the command line for logging

. ./utils/parse_options.sh

if [ $# != 5 ]; then
  cat <<EOF
   Does language model rescoring of non-compact undeterminized lattices 
   (remove old LM, add new LM). This script expects the input lattices 
   to be in non-compact format.
   Usage: $0 [options] <old-lang-dir> <new-lang-dir> \\
                      <data-dir> <input-decode-dir> <output-decode-dir>
   options: [--cmd (run.pl|queue.pl [queue opts])]
   See also: steps/lmrescore_const_arpa.sh 
EOF
   exit 1;
fi

[ -f path.sh ] && . ./path.sh;

oldlang=$1
newlang=$2
data=$3
indir=$4
outdir=$5

oldlm=$oldlang/G.fst
newlm=$newlang/G.carpa
# Sanity checks on the inputs.  A vocabulary mismatch is only a warning;
# missing LMs or lattices are fatal.
! cmp $oldlang/words.txt $newlang/words.txt &&\
  echo "$0: Warning: vocabularies may be incompatible."
[ ! -f $oldlm ] && echo "$0: Missing file $oldlm" && exit 1;
[ ! -f $newlm ] && echo "$0: Missing file $newlm" && exit 1;
! ls $indir/lat.*.gz >/dev/null &&\
  echo "$0: No lattices input directory $indir" && exit 1;

if ! cmp -s $oldlang/words.txt $newlang/words.txt; then
  echo "$0: $oldlang/words.txt and $newlang/words.txt differ: make sure you know what you are doing.";
fi

# The trailing '|' makes this a pipe-style input for lattice-lmrescore.
oldlmcommand="fstproject --project_output=true $oldlm |"

mkdir -p $outdir/log
nj=`cat $indir/num_jobs` || exit 1;
cp $indir/num_jobs $outdir

# Original (input) lattices, read a second time by the final lattice-compose.
lats_rspecifier="ark:gunzip -c $indir/lat.JOB.gz |"
  
# Destination of the rescored lattices.
lats_wspecifier="ark:| gzip -c > $outdir/lat.JOB.gz" 

# Stage 1: determinize (pruned) the input lattices, zero their LM and
# acoustic scores, apply the old LM with scale -1.0 and the const-arpa LM
# with scale 1.0, then project the result and compose it with the original
# lattices so that the score difference is added to them.
if [ $stage -le 1 ]; then
  $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \
    lattice-determinize-pruned --acoustic-scale=$acwt --beam=$beam \
      "ark:gunzip -c $indir/lat.JOB.gz |" ark:- \| \
    lattice-scale --lm-scale=0.0 --acoustic-scale=0.0 ark:- ark:- \| \
    lattice-lmrescore --lm-scale=-1.0 ark:- "$oldlmcommand" ark:- \| \
    lattice-lmrescore-const-arpa --lm-scale=1.0 \
      ark:- "$newlm" ark:- \| \
    lattice-project ark:- ark:- \| \
    lattice-compose --write-compact=$write_compact \
      "$lats_rspecifier" \
      ark,s,cs:- "$lats_wspecifier" || exit 1
fi

# Stage 2: score the rescored lattices.  Note that the 'else' branch is also
# taken when scoring is skipped because --stage is greater than 2.
if ! $skip_scoring && [ $stage -le 2 ]; then
  err_msg="Not scoring because local/score.sh does not exist or not executable."
  [ ! -x local/score.sh ] && echo $err_msg && exit 1;
  local/score.sh --cmd "$cmd" $scoring_opts $data $newlang $outdir
else
  echo "Not scoring because requested so..."
fi

exit 0;


================================================
FILE: egs/steps/lmrescore_rnnlm_lat.sh
================================================
#!/usr/bin/env bash

# Copyright 2015  Guoguo Chen
#           2017  Hainan Xu
# Apache 2.0

# This script rescores lattices with RNNLM.  See also rnnlmrescore.sh which is
# an older script using n-best lists.
# The old LM (G.fst, or G.carpa if present) is applied with scale -$weight and
# the RNNLM with scale $weight.

# Begin configuration section.
cmd=run.pl
skip_scoring=false
max_ngram_order=4
acwt=0.1
weight=0.5  # Interpolation weight for RNNLM.
rnnlm_ver=  # Set to "cuedrnnlm" to use lattice-lmrescore-cuedrnnlm.
# End configuration section.

echo "$0 $@"  # Print the command line for logging

. ./utils/parse_options.sh

if [ $# != 5 ]; then
   echo "Does language model rescoring of lattices (remove old LM, add new LM)"
   echo "with RNNLM."
   echo ""
   echo "Usage: $0 [options] <old-lang-dir> <rnnlm-dir> \\"
   echo "                   <data-dir> <input-decode-dir> <output-decode-dir>"
   echo " e.g.: $0 data/lang_tg ./rnnlm data/test \\"
   echo "                   exp/tri3/test_tg exp/tri3/test_rnnlm"
   echo "options: [--cmd (run.pl|queue.pl [queue opts])]"
   exit 1;
fi

[ -f path.sh ] && . ./path.sh;

oldlang=$1
rnnlm_dir=$2
data=$3
indir=$4
outdir=$5

rescoring_binary=lattice-lmrescore-rnnlm

first_arg=ark:$rnnlm_dir/unk.probs # this is for mikolov's rnnlm
extra_arg=

if [ "$rnnlm_ver" == "cuedrnnlm" ]; then
  # Different binary, word list as first argument, and extra options derived
  # from files in the RNNLM directory.
  layer_string=`cat $rnnlm_dir/layer_string | sed "s=:= =g"`
  total_size=`wc -l $rnnlm_dir/unigram.counts | awk '{print $1}'`
  rescoring_binary="lattice-lmrescore-cuedrnnlm"
  cat $rnnlm_dir/rnnlm.input.wlist.index | tail -n +2 | awk '{print $1-1,$2}' > $rnnlm_dir/rnn.wlist
  extra_arg="--full-voc-size=$total_size --layer-sizes=\"$layer_string\""
  first_arg=$rnnlm_dir/rnn.wlist
fi

# Prefer the const-arpa LM over G.fst when both exist.
oldlm=$oldlang/G.fst
if [ -f $oldlang/G.carpa ]; then
  oldlm=$oldlang/G.carpa
fi

[ ! -f $oldlm ] && echo "$0: expecting either $oldlang/G.fst or $oldlang/G.carpa to exist" && exit 1;
[ ! -f $rnnlm_dir/rnnlm ] && echo "$0: Missing file $rnnlm_dir/rnnlm" && exit 1;
[ ! -f $rnnlm_dir/unk.probs ] &&\
  echo "$0: Missing file $rnnlm_dir/unk.probs" && exit 1;
[ ! -f $oldlang/words.txt ] &&\
  echo "$0: Missing file $oldlang/words.txt" && exit 1;
! ls $indir/lat.*.gz >/dev/null &&\
  echo "$0: No lattices input directory $indir" && exit 1;
# awk does the floating-point range check on the interpolation weight.
awk -v n=$0 -v w=$weight 'BEGIN {if (w < 0 || w > 1) {
  print n": Interpolation weight should be in the range of [0, 1]"; exit 1;}}' \
  || exit 1;

oldlm_command="fstproject --project_output=true $oldlm |"

mkdir -p $outdir/log
nj=`cat $indir/num_jobs` || exit 1;
cp $indir/num_jobs $outdir

# The old LM is applied with the negated interpolation weight.
oldlm_weight=`perl -e "print -1.0 * $weight;"`
if [ "$oldlm" == "$oldlang/G.fst" ]; then
  # Old LM is an FST: apply it with lattice-lmrescore.
  $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \
    lattice-lmrescore --lm-scale=$oldlm_weight \
    "ark:gunzip -c $indir/lat.JOB.gz|" "$oldlm_command" ark:-  \| \
    $rescoring_binary $extra_arg --lm-scale=$weight \
    --max-ngram-order=$max_ngram_order \
    $first_arg $oldlang/words.txt ark:- "$rnnlm_dir/rnnlm" \
    "ark,t:|gzip -c>$outdir/lat.JOB.gz" || exit 1;
else
  # Old LM is a const-arpa LM: apply it with lattice-lmrescore-const-arpa.
  $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \
    lattice-lmrescore-const-arpa --lm-scale=$oldlm_weight \
    "ark:gunzip -c $indir/lat.JOB.gz|" "$oldlm" ark:-  \| \
    $rescoring_binary $extra_arg --lm-scale=$weight \
    --max-ngram-order=$max_ngram_order \
    $first_arg $oldlang/words.txt ark:- "$rnnlm_dir/rnnlm" \
    "ark,t:|gzip -c>$outdir/lat.JOB.gz" || exit 1;
fi
if ! $skip_scoring ; then
  err_msg="Not scoring because local/score.sh does not exist or not executable."
  [ ! -x local/score.sh ] && echo $err_msg && exit 1;
  local/score.sh --cmd "$cmd" $data $oldlang $outdir
else
  echo "$0: Not scoring because --skip-scoring was specified."
fi

exit 0;


================================================
FILE: egs/steps/make_denlats.sh
================================================
#!/usr/bin/env bash
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.

# Create denominator lattices for MMI/MPE training.
# Creates its output in $dir/lat.*.gz
# The lattices are generated by decoding the training data with a unigram
# grammar built from the training transcripts ($data/text).

# Begin configuration section.
nj=4
cmd=run.pl
sub_split=1
beam=13.0
lattice_beam=7.0
acwt=0.1
max_active=5000
transform_dir=
max_mem=20000000 # This will stop the processes getting too large.
# This is in bytes, but not "real" bytes-- you have to multiply
# by something like 5 or 10 to get real bytes (not sure why so large)
num_threads=1
parallel_opts= # ignored now
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

if [ $# != 4 ]; then
   echo "Usage: steps/make_denlats.sh [options] <data-dir> <lang-dir> <src-dir> <exp-dir>"
   echo "  e.g.: steps/make_denlats.sh data/train data/lang exp/tri1 exp/tri1_denlats"
   echo "Works for (delta|lda) features, and (with --transform-dir option) such features"
   echo " plus transforms."
   echo ""
   echo "Main options (for others, see top of script file)"
   echo "  --config <config-file>                           # config containing options"
   echo "  --nj <nj>                                        # number of parallel jobs"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   echo "  --sub-split <n-split>                            # e.g. 40; use this for "
   echo "                           # large databases so your jobs will be smaller and"
   echo "                           # will (individually) finish reasonably soon."
   echo "  --transform-dir <transform-dir>   # directory to find fMLLR transforms."
   echo "  --num-threads  <n>                # number of threads per decoding job"
   exit 1;
fi

data=$1
lang=$2
srcdir=$3
dir=$4

sdata=$data/split$nj
# Feature options are taken from the source model directory when present.
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
delta_opts=`cat $srcdir/delta_opts 2>/dev/null`
# Suffix appended to the decoder name, giving gmm-latgen-faster-parallel when
# more than one thread is requested.
thread_string=
[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads"

mkdir -p $dir/log
# Re-split the data unless an up-to-date split directory already exists.
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs

utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

oov=`cat $lang/oov.int` || exit 1;

mkdir -p $dir

# Work on a private copy of the lang directory, because G.fst is overwritten
# below.
cp -RH $lang $dir/

# Compute grammar FST which corresponds to unigram decoding graph.
new_lang="$dir/"$(basename "$lang")
echo "Making unigram grammar FST in $new_lang"
cat $data/text | utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt | \
  awk '{for(n=2;n<=NF;n++){ printf("%s ", $n); } printf("\n"); }' | \
  utils/make_unigram_grammar.pl | fstcompile | fstarcsort --sort_type=ilabel > $new_lang/G.fst \
   || exit 1;

# mkgraph.sh expects a whole directory "lang", so put everything in one directory...
# it gets L_disambig.fst and G.fst (among other things) from $dir/lang, and
# final.mdl from $srcdir; the output HCLG.fst goes in $dir/graph.

echo "Compiling decoding graph in $dir/dengraph"
if [ -s $dir/dengraph/HCLG.fst ] && [ $dir/dengraph/HCLG.fst -nt $srcdir/final.mdl ]; then
   echo "Graph $dir/dengraph/HCLG.fst already exists: skipping graph creation."
else
  utils/mkgraph.sh $new_lang $srcdir $dir/dengraph || exit 1;
fi

# The presence of final.mat selects spliced+LDA features over delta features.
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"

case $feat_type in
  delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";;
  lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
    cp $srcdir/final.mat $dir
   ;;
  *) echo "Invalid feature type $feat_type" && exit 1;
esac

if [ ! -z "$transform_dir" ]; then # add transforms to features...
  echo "$0: using fMLLR transforms from $transform_dir"
  [ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist." && exit 1;
  [ "`cat $transform_dir/num_jobs`" -ne "$nj" ] \
    && echo "$0: mismatch in number of jobs with $transform_dir" && exit 1;
  [ -f $srcdir/final.mat ] && ! cmp $transform_dir/final.mat $srcdir/final.mat && \
     echo "$0: LDA transforms differ between $srcdir and $transform_dir"
  feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
else
  if [ -f $srcdir/final.alimdl ]; then
    echo "$0: you seem to have a SAT system but you did not supply the --transform-dir option.";
    exit 1;
  fi
fi


# if this job is interrupted by the user, we want any background jobs to be
# killed too.
cleanup() {
  local pids=$(jobs -pr)
  [ -n "$pids" ] && kill $pids
}
trap "cleanup" INT QUIT TERM EXIT


if [ $sub_split -eq 1 ]; then
  $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode_den.JOB.log \
   gmm-latgen-faster$thread_string --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \
    --max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $srcdir/final.mdl  \
     $dir/dengraph/HCLG.fst "$feats" "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1;
else
  # each job from 1 to $nj is split into multiple pieces (sub-split), and we aim
  # to have at most two jobs running at each time.  The idea is that if we have stragglers
  # from one job, we can be processing another one at the same time.
  rm $dir/.error 2>/dev/null

  prev_pid=
  # The loop runs one extra iteration (n = nj+1) that starts no new job; it
  # only waits for and merges the output of the last subset.
  for n in `seq $((nj+1))`; do
    if [ $n -gt $nj ]; then
      this_pid=
    elif [ -f $dir/.done.$n ] && [ $dir/.done.$n -nt $srcdir/final.mdl ]; then
      echo "Not processing subset $n as already done (delete $dir/.done.$n if not)";
      this_pid=
    else
      split_data.sh --per-utt $sdata/$n $sub_split || exit 1;
      mkdir -p $dir/log/$n
      mkdir -p $dir/part
      # Point the feature pipeline at subset $n: fix the transform archive to
      # trans.$n and redirect the per-job data paths to the per-utterance
      # sub-split directories.
      feats_subset=`echo $feats | sed "s/trans.JOB/trans.$n/g" | sed s:JOB/:$n/split${sub_split}utt/JOB/:g`

      $cmd --num-threads $num_threads JOB=1:$sub_split $dir/log/$n/decode_den.JOB.log \
        gmm-latgen-faster$thread_string --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \
        --max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $srcdir/final.mdl  \
          $dir/dengraph/HCLG.fst "$feats_subset" "ark:|gzip -c >$dir/lat.$n.JOB.gz" || touch $dir/.error &
      this_pid=$!
    fi
    if [ ! -z "$prev_pid" ]; then  # Wait for the previous job; merge the previous set of lattices.
      wait $prev_pid
      [ -f $dir/.error ] && echo "$0: error generating denominator lattices" && exit 1;
      rm $dir/.merge_error 2>/dev/null
      echo Merging archives for data subset $prev_n
      for k in `seq $sub_split`; do
        gunzip -c $dir/lat.$prev_n.$k.gz || touch $dir/.merge_error;
      done | gzip -c > $dir/lat.$prev_n.gz || touch $dir/.merge_error;
      [ -f $dir/.merge_error ] && echo "$0: Merging lattices for subset $prev_n failed (or maybe some other error)" && exit 1;
      rm $dir/lat.$prev_n.*.gz
      touch $dir/.done.$prev_n
    fi
    prev_n=$n
    prev_pid=$this_pid
  done
fi


echo "$0: done generating denominator lattices."


================================================
FILE: egs/steps/make_denlats_sgmm2.sh
================================================
#!/usr/bin/env bash
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
#           2014  Guoguo Chen

# Create denominator lattices for MMI/MPE training, with SGMM models.  If the
# features have fMLLR transforms you have to supply the --transform-dir option.
# It gets any speaker vectors from the "alignment dir" ($alidir).  Note: this is
# possibly a slight mismatch because the speaker vectors come from supervised
# adaptation.

# Begin configuration section.
# Defaults for the options shown in the usage message below.  They are set
# before parse_options.sh is sourced.
# Number of parallel jobs; also selects the $data/split$nj directory.
nj=4
# Job launcher (run.pl, or queue.pl plus queue options).
cmd=run.pl
# If not 1, every one of the $nj jobs is split per utterance into this many
# pieces which are decoded separately and merged afterwards.
sub_split=1
# Decoder settings, passed to sgmm2-latgen-faster as --beam, --lattice-beam,
# --acoustic-scale and --max-active respectively.
beam=13.0
lattice_beam=7.0
acwt=0.1
max_active=5000
# Directory with fMLLR transforms (trans.JOB); empty means "no transforms".
transform_dir=
max_mem=20000000 # This will stop the processes getting too large.
# Threads per decoding job; a value above 1 selects the "-parallel" binary.
num_threads=1
parallel_opts=  # ignored now.
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

# Exactly four positional arguments are required; otherwise print usage.
if [ $# != 4 ]; then
   echo "Usage: steps/make_denlats_sgmm2.sh [options] <data-dir> <lang-dir> <src-dir|alidir> <exp-dir>"
   echo "  e.g.: steps/make_denlats_sgmm2.sh data/train data/lang exp/sgmm4a_ali exp/sgmm4a_denlats"
   echo "Works for (delta|lda) features, and (with --transform-dir option) such features"
   echo " plus transforms."
   echo ""
   echo "Main options (for others, see top of script file)"
   echo "  --config <config-file>                           # config containing options"
   echo "  --nj <nj>                                        # number of parallel jobs"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   echo "  --sub-split <n-split>                            # e.g. 40; use this for "
   echo "                           # large databases so your jobs will be smaller and"
   echo "                           # will (individually) finish reasonably soon."
   echo "  --transform-dir <transform-dir>   # directory to find fMLLR transforms."
   echo "  --num-threads  <n>                # number of threads per decoding job"
   exit 1;
fi

# Positional arguments.
data=$1
lang=$2
alidir=$3 # could also be $srcdir, but only if no vectors supplied.
dir=$4

sdata=$data/split$nj
# Feature options are read from the directory given as the third argument.
# Either file may be missing, in which case the option string stays empty.
splice_opts=`cat $alidir/splice_opts 2>/dev/null`
# This script never defines $srcdir, so the CMVN options must come from
# $alidir; reading $srcdir/cmvn_opts would look at /cmvn_opts and silently
# drop them.
cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null`
# Start from an empty suffix so a value inherited from the environment cannot
# end up in the decoder binary name.
thread_string=
if [ $num_threads -gt 1 ]; then
  # the -parallel becomes part of the binary name we decode with.
  thread_string="-parallel --num-threads=$num_threads"
fi

mkdir -p $dir/log
# Re-split the data only if the split directory is missing or older than
# feats.scp.
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs

# Integer id that out-of-vocabulary words in the transcripts are mapped to.
oov=`cat $lang/oov.int` || exit 1;

mkdir -p $dir

utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;

# Work on a private copy of the lang directory, since G.fst is overwritten.
cp -RH $lang $dir/

# Compute grammar FST which corresponds to unigram decoding graph.
new_lang="$dir/"$(basename "$lang")
echo "$0: Making unigram grammar FST in $new_lang"
cat $data/text | utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt | \
  awk '{for(n=2;n<=NF;n++){ printf("%s ", $n); } printf("\n"); }' | \
  utils/make_unigram_grammar.pl | fstcompile | fstarcsort --sort_type=ilabel > $new_lang/G.fst \
   || exit 1;

# mkgraph.sh expects a whole directory "lang", so put everything in one directory...
# it gets L_disambig.fst and G.fst (among other things) from $dir/lang, and
# final.mdl from $alidir; the output HCLG.fst goes in $dir/graph.

echo "$0: Compiling decoding graph in $dir/dengraph"
# Reuse an existing graph only if it is newer than the model it was built
# from.  The model lives in $alidir; $srcdir is not defined in this script, and
# comparing against the non-existent /final.mdl would always reuse the graph.
if [ -s $dir/dengraph/HCLG.fst ] && [ $dir/dengraph/HCLG.fst -nt $alidir/final.mdl ]; then
   echo "$0: Graph $dir/dengraph/HCLG.fst already exists: skipping graph creation."
else
  utils/mkgraph.sh $new_lang $alidir $dir/dengraph || exit 1;
fi

# An LDA matrix in the alignment directory means LDA features, else deltas.
feat_type=delta
if [ -f $alidir/final.mat ]; then
  feat_type=lda
fi
echo "$0: feature type is $feat_type"

# Both feature types start with the same per-speaker CMVN stage.
cmvn_feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"
case $feat_type in
  delta)
    feats="$cmvn_feats add-deltas ark:- ark:- |"
    ;;
  lda)
    feats="$cmvn_feats splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
    cp $alidir/final.mat $dir
    ;;
  *)
    echo "$0: Invalid feature type $feat_type"
    exit 1
    ;;
esac

# Optionally append per-speaker fMLLR transforms to the feature pipeline.
if [ -n "$transform_dir" ]; then
  echo "$0: using fMLLR transforms from $transform_dir"
  if [ ! -f $transform_dir/trans.1 ]; then
    echo "Expected $transform_dir/trans.1 to exist."
    exit 1
  fi
  if [ ! -f $transform_dir/num_jobs ]; then
    echo "Expected $transform_dir/num_jobs to exist."
    exit 1
  fi
  if [ "$(cat $transform_dir/num_jobs)" -ne "$nj" ]; then
    echo "$0: mismatch in number of jobs with $transform_dir"
    exit 1
  fi
  # Differing LDA matrices are only reported, not treated as fatal.
  if [ -f $alidir/final.mat ] && ! cmp $transform_dir/final.mat $alidir/final.mat; then
    echo "$0: LDA transforms differ between $alidir and $transform_dir"
  fi
  feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
else
  echo "$0: Assuming you don't have a SAT system, since no --transform-dir option supplied "
fi

# Gaussian pre-selection archives are mandatory.
if [ ! -f $alidir/gselect.1.gz ]; then
  echo "$0: no such file $alidir/gselect.1.gz"
  exit 1
fi
gselect_opt="--gselect=ark,s,cs:gunzip -c $alidir/gselect.JOB.gz|"

# Speaker vectors are used when present; a final.alimdl without vectors is
# treated as a usage error.
if [ -f $alidir/vecs.1 ]; then
  spkvecs_opt="--spk-vecs=ark:$alidir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk"
  if [ "$(cat $alidir/num_jobs)" -ne "$nj" ]; then
    echo "$0: mismatch in number of jobs with $alidir"
    exit 1
  fi
elif [ -f $alidir/final.alimdl ]; then
  echo "$0: You seem to have an SGMM system with speaker vectors,"
  echo "yet we can't find speaker vectors.  Perhaps you supplied"
  echo "the model director instead of the alignment directory?"
  exit 1
fi

# if this job is interrupted by the user, we want any background jobs to be
# killed too.
cleanup() {
  # PIDs of the background jobs of this shell that are still running.
  local running
  running=$(jobs -pr)
  # Nothing left to kill: report a non-zero status and stop here.
  if [ -z "$running" ]; then
    return 1
  fi
  kill $running
}
trap "cleanup" INT QUIT TERM EXIT

# Generate the denominator lattices.  Without sub-splitting, one decoding job
# per data split writes $dir/lat.JOB.gz directly.
if [ $sub_split -eq 1 ]; then
  $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode_den.JOB.log \
    sgmm2-latgen-faster$thread_string $spkvecs_opt "$gselect_opt" --beam=$beam \
    --lattice-beam=$lattice_beam --acoustic-scale=$acwt \
    --max-mem=$max_mem --max-active=$max_active \
    --word-symbol-table=$lang/words.txt $alidir/final.mdl  \
    $dir/dengraph/HCLG.fst "$feats" "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1;
else
  # each job from 1 to $nj is split into multiple pieces (sub-split), and we aim
  # to have at most two jobs running at each time.  The idea is that if we have
  # stragglers from one job, we can be processing another one at the same time.
  rm $dir/.error 2>/dev/null

  # The loop runs one extra iteration (n = nj+1) that launches nothing and only
  # waits for and merges the output of the last real subset.
  prev_pid=
  for n in `seq $[nj+1]`; do
    if [ $n -gt $nj ]; then
      this_pid=
    elif [ -f $dir/.done.$n ] && [ $dir/.done.$n -nt $alidir/final.mdl ]; then
      # A .done marker newer than the model means this subset can be skipped.
      echo "$0: Not processing subset $n as already done (delete $dir/.done.$n if not)";
      this_pid=
    else
      # NOTE(review): sdata2 and $dir/part are set up here but not used
      # anywhere else in this block.
      sdata2=$data/split$nj/$n/split${sub_split}utt;
      split_data.sh --per-utt $sdata/$n $sub_split || exit 1;
      mkdir -p $dir/log/$n
      mkdir -p $dir/part
      # Point the pipelines at subset $n: transforms, speaker vectors and
      # gselect stay per-job ($n), while features are read from the
      # per-utterance sub-split, still indexed by JOB.
      feats_subset=`echo $feats | sed "s/trans.JOB/trans.$n/g" | sed s:JOB/:$n/split${sub_split}utt/JOB/:g`
      spkvecs_opt_subset=`echo $spkvecs_opt | sed "s/JOB/$n/g"`
      gselect_opt_subset=`echo $gselect_opt | sed "s/JOB/$n/g"`
      # Decode in the background; a failure is signalled through $dir/.error.
      $cmd --num-threads $num_threads JOB=1:$sub_split $dir/log/$n/decode_den.JOB.log \
        sgmm2-latgen-faster$thread_string \
        $spkvecs_opt_subset "$gselect_opt_subset" \
        --beam=$beam --lattice-beam=$lattice_beam \
        --acoustic-scale=$acwt --max-mem=$max_mem --max-active=$max_active \
        --word-symbol-table=$lang/words.txt $alidir/final.mdl  \
        $dir/dengraph/HCLG.fst "$feats_subset" \
        "ark:|gzip -c >$dir/lat.$n.JOB.gz" || touch $dir/.error &
      this_pid=$!
    fi
    if [ ! -z "$prev_pid" ]; then # Wait for the previous job to merge lattices.
      wait $prev_pid
      [ -f $dir/.error ] && \
        echo "$0: error generating denominator lattices" && exit 1;
      rm $dir/.merge_error 2>/dev/null
      echo "$0: Merging archives for data subset $prev_n"
      # Concatenate the sub-split archives into one gzipped archive per subset.
      for k in `seq $sub_split`; do
        gunzip -c $dir/lat.$prev_n.$k.gz || touch $dir/.merge_error;
      done | gzip -c > $dir/lat.$prev_n.gz || touch $dir/.merge_error;
      [ -f $dir/.merge_error ] && \
        echo "$0: Merging lattices for subset $prev_n failed" && exit 1;
      # Drop the pieces and record that this subset is complete.
      rm $dir/lat.$prev_n.*.gz
      touch $dir/.done.$prev_n
    fi
    prev_n=$n
    prev_pid=$this_pid
  done
fi


echo "$0: done generating denominator lattices with SGMMs."


================================================
FILE: egs/steps/make_fbank.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2016  Karel Vesely
# Copyright 2012-2016  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# To be run from .. (one directory up from here)
# see ../run.sh for example

# Begin configuration section.
# Defaults for the options shown in the usage message below.  They are set
# before parse_options.sh is sourced.
# Number of parallel jobs the input is split into.
nj=4
# Job launcher (run.pl, or queue.pl plus queue options).
cmd=run.pl
# Config file passed to compute-fbank-feats via --config.
fbank_config=conf/fbank.conf
# Passed to copy-feats as --compress.
compress=true
write_utt2num_frames=true  # If true writes utt2num_frames.
# If true, writes the utt2dur file.
write_utt2dur=true
# End configuration section.

echo "$0 $@"  # Print the command line for logging.

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# Between one and three positional arguments are accepted; the usage text goes
# to stderr.
if [ $# -lt 1 ] || [ $# -gt 3 ]; then
  cat >&2 <<EOF
Usage: $0 [options] <data-dir> [<log-dir> [<fbank-dir>] ]
 e.g.: $0 data/train
Note: <log-dir> defaults to <data-dir>/log, and
      <fbank-dir> defaults to <data-dir>/data
Options:
  --fbank-config <config-file>         # config passed to compute-fbank-feats.
  --nj <nj>                            # number of parallel jobs.
  --cmd <run.pl|queue.pl <queue opts>> # how to run jobs.
  --write-utt2num-frames <true|false>  # If true, write utt2num_frames file.
  --write-utt2dur <true|false>         # If true, write utt2dur file.
EOF
   exit 1;
fi

data=$1
# The optional second and third arguments fall back to locations under the
# data directory when they are not given.
logdir=${2-$data/log}
fbankdir=${3-$data/data}


# Turn $fbankdir into an absolute path (prefix $PWD unless it starts with /).
fbankdir=$(perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $fbankdir ${PWD})

# The data directory's base name becomes part of the archive names.
name=$(basename $data)

mkdir -p $fbankdir || exit 1;
mkdir -p $logdir || exit 1;

# Keep a copy of any previous feats.scp before it gets regenerated.
if [ -f $data/feats.scp ]; then
  mkdir -p $data/.backup
  echo "$0: moving $data/feats.scp to $data/.backup"
  mv $data/feats.scp $data/.backup
fi

scp=$data/wav.scp

# Inputs that must exist before any work is started.
required="$scp $fbank_config"

for f in $required; do
  [ -f $f ] && continue
  echo "$0: no such file $f"
  exit 1
done

utils/validate_data_dir.sh --no-text --no-feats $data || exit 1;

# Select VTLN warp factors: per-speaker ones take precedence over per-utterance
# ones.  With neither file present the option string is explicitly emptied so
# that a vtln_opts value inherited from the environment cannot leak into the
# compute-fbank-feats command line.
if [ -f $data/spk2warp ]; then
  echo "$0 [info]: using VTLN warp factors from $data/spk2warp"
  vtln_opts="--vtln-map=ark:$data/spk2warp --utt2spk=ark:$data/utt2spk"
elif [ -f $data/utt2warp ]; then
  echo "$0 [info]: using VTLN warp factors from $data/utt2warp"
  vtln_opts="--vtln-map=ark:$data/utt2warp"
else
  vtln_opts=""
fi

for n in $(seq 1 $nj); do
  # the next command does nothing unless $fbankdir/storage/ exists, see
  # utils/create_data_link.pl for more info.
  utils/create_data_link.pl $fbankdir/raw_fbank_$name.$n.ark
done

# Per-job side outputs are off unless the corresponding switch is true.
write_num_frames_opt=
if $write_utt2num_frames; then
  write_num_frames_opt="--write-num-frames=ark,t:$logdir/utt2num_frames.JOB"
fi

write_utt2dur_opt=
if $write_utt2dur; then
  write_utt2dur_opt="--write-utt2dur=ark,t:$logdir/utt2dur.JOB"
fi

# Compute the filterbank features in $nj parallel jobs.  Each job writes one
# ark/scp pair named raw_fbank_$name.JOB under $fbankdir.
if [ -f $data/segments ]; then
  echo "$0 [info]: segments file exists: using that."
  # One segments list per job.
  split_segments=
  for n in $(seq $nj); do
    split_segments="$split_segments $logdir/segments.$n"
  done

  utils/split_scp.pl $data/segments $split_segments || exit 1;
  rm $logdir/.error 2>/dev/null

  $cmd JOB=1:$nj $logdir/make_fbank_${name}.JOB.log \
    extract-segments scp,p:$scp $logdir/segments.JOB ark:- \| \
    compute-fbank-feats $vtln_opts $write_utt2dur_opt --verbose=2 \
      --config=$fbank_config ark:- ark:- \| \
    copy-feats --compress=$compress $write_num_frames_opt ark:- \
     ark,scp:$fbankdir/raw_fbank_$name.JOB.ark,$fbankdir/raw_fbank_$name.JOB.scp \
     || exit 1;
else
  echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance."
  # One wav list per job.  The lists carry the data set name (wav_${name}.N.scp)
  # so that the cleanup at the end of this script, which removes
  # $logdir/wav_${name}.*.scp, actually deletes them, and so that two data sets
  # sharing one log directory do not overwrite each other's lists.
  split_scps=""
  for n in $(seq $nj); do
    split_scps="$split_scps $logdir/wav_${name}.$n.scp"
  done

  utils/split_scp.pl $scp $split_scps || exit 1;

  $cmd JOB=1:$nj $logdir/make_fbank_${name}.JOB.log \
    compute-fbank-feats $vtln_opts $write_utt2dur_opt --verbose=2 \
     --config=$fbank_config scp,p:$logdir/wav_${name}.JOB.scp ark:- \| \
    copy-feats --compress=$compress $write_num_frames_opt ark:- \
     ark,scp:$fbankdir/raw_fbank_$name.JOB.ark,$fbankdir/raw_fbank_$name.JOB.scp \
     || exit 1;
fi


# NOTE(review): nothing visible in this script creates $logdir/.error.$name,
# so this check may be vestigial -- confirm.
if [ -f $logdir/.error.$name ]; then
  echo "$0: Error producing filterbank features for $name:"
  tail $logdir/make_fbank_${name}.1.log
  exit 1;
fi

# concatenate the .scp files together.
for n in $(seq $nj); do
  cat $fbankdir/raw_fbank_$name.$n.scp || exit 1
done > $data/feats.scp || exit 1

# Merge the per-job utt2num_frames files, if they were requested.
if $write_utt2num_frames; then
  for n in $(seq $nj); do
    cat $logdir/utt2num_frames.$n || exit 1
  done > $data/utt2num_frames || exit 1
fi

# Merge the per-job utt2dur files, if they were requested.
if $write_utt2dur; then
  for n in $(seq $nj); do
    cat $logdir/utt2dur.$n || exit 1
  done > $data/utt2dur || exit 1
fi

# Store frame_shift and fbank_config along with features.
# The first --frame-shift=<int> line of the config is multiplied by 0.001 and
# printed with three decimals; if there is no such line, 0.01 is written.
frame_shift=$(perl -ne 'if (/^--frame-shift=(\d+)/) {
                          printf "%.3f", 0.001 * $1; exit; }' $fbank_config)
echo ${frame_shift:-'0.01'} > $data/frame_shift
mkdir -p $data/conf && cp $fbank_config $data/conf/fbank.conf || exit 1

# Remove the per-job temporary lists; errors for missing files are discarded.
rm $logdir/wav_${name}.*.scp  $logdir/segments.* \
   $logdir/utt2num_frames.* $logdir/utt2dur.* 2>/dev/null

# Compare the number of feature entries with the number of utterances.
nf=$(wc -l < $data/feats.scp)
nu=$(wc -l < $data/utt2spk)
# Any mismatch only produces a warning ...
if [ $nf -ne $nu ]; then
  echo "$0: It seems not all of the feature files were successfully procesed" \
       "($nf != $nu); consider using utils/fix_data_dir.sh $data"
fi

# ... but losing more than nu/20 (integer division) utterances is fatal.
if (( nf < nu - nu/20 )); then
  echo "$0: Less than 95% the features were successfully generated."\
       "Probably a serious error."
  exit 1
fi

echo "$0: Succeeded creating filterbank features for $name"


================================================
FILE: egs/steps/make_fbank_pitch.sh
================================================
#!/usr/bin/env bash

# Copyright 2013  The Shenzhen Key Laboratory of Intelligent Media and Speech,
#                 PKU-HKUST Shenzhen Hong Kong Institution (Author: Wei Shi)
#           2016  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# Combine filterbank and pitch features together
# Note: This file is based on make_fbank.sh and make_pitch_kaldi.sh

# Begin configuration section.
# Defaults for the options shown in the usage message below.  They are set
# before parse_options.sh is sourced.
# Number of parallel jobs the input is split into.
nj=4
# Job launcher (run.pl, or queue.pl plus queue options).
cmd=run.pl
# Config files passed via --config to compute-fbank-feats and
# compute-kaldi-pitch-feats respectively.
fbank_config=conf/fbank.conf
pitch_config=conf/pitch.conf
# Optional config for process-kaldi-pitch-feats; empty means none is passed.
pitch_postprocess_config=
# Passed to paste-feats as --length-tolerance.
paste_length_tolerance=2
# Passed to copy-feats as --compress.
compress=true
write_utt2num_frames=true  # If true writes utt2num_frames.
# If true, writes the utt2dur file.
write_utt2dur=true
# End configuration section.

echo "$0 $@"  # Print the command line for logging.

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# Between one and three positional arguments are accepted; the usage text goes
# to stderr.
if [ $# -lt 1 ] || [ $# -gt 3 ]; then
  cat >&2 <<EOF
Usage: $0 [options] <data-dir> [<log-dir> [<fbank-dir>] ]
 e.g.: $0 data/train
Note: <log-dir> defaults to <data-dir>/log, and
      <fbank-dir> defaults to <data-dir>/data
Options:
  --fbank-config <fbank-config-file>   # config passed to compute-fbank-feats.
  --pitch-config <pitch-config-file>   # config passed to compute-kaldi-pitch-feats.
  --pitch-postprocess-config <postprocess-config-file> # config passed to process-kaldi-pitch-feats.
  --paste-length-tolerance <tolerance> # length tolerance passed to paste-feats.
  --nj <nj>                            # number of parallel jobs.
  --cmd <run.pl|queue.pl <queue opts>> # how to run jobs.
  --write-utt2num-frames <true|false>  # If true, write utt2num_frames file.
  --write-utt2dur <true|false>         # If true, write utt2dur file.
EOF
   exit 1;
fi

data=$1
# The optional second and third arguments fall back to locations under the
# data directory when they are not given.
logdir=${2-$data/log}
fbank_pitch_dir=${3-$data/data}


# Turn $fbank_pitch_dir into an absolute path (prefix $PWD unless it starts
# with /).
fbank_pitch_dir=$(perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $fbank_pitch_dir ${PWD})

# The data directory's base name becomes part of the archive names.
name=$(basename $data)

mkdir -p $fbank_pitch_dir || exit 1;
mkdir -p $logdir || exit 1;

# Keep a copy of any previous feats.scp before it gets regenerated.
if [ -f $data/feats.scp ]; then
  mkdir -p $data/.backup
  echo "$0: moving $data/feats.scp to $data/.backup"
  mv $data/feats.scp $data/.backup
fi

scp=$data/wav.scp

# Inputs that must exist before any work is started.
required="$scp $fbank_config $pitch_config"

for f in $required; do
  [ -f $f ] && continue
  echo "$0: no such file $f"
  exit 1
done

# utils/validate_data_dir.sh --no-text --no-feats $data || exit 1;

# Only pass a post-processing config when one was supplied.
postprocess_config_opt=
if [ -n "$pitch_postprocess_config" ]; then
  postprocess_config_opt="--config=$pitch_postprocess_config"
fi

# Select VTLN warp factors: per-speaker ones take precedence over per-utterance
# ones.  With neither file present the option string is explicitly emptied so
# that a vtln_opts value inherited from the environment cannot leak into the
# compute-fbank-feats command line.
if [ -f $data/spk2warp ]; then
  echo "$0 [info]: using VTLN warp factors from $data/spk2warp"
  vtln_opts="--vtln-map=ark:$data/spk2warp --utt2spk=ark:$data/utt2spk"
elif [ -f $data/utt2warp ]; then
  echo "$0 [info]: using VTLN warp factors from $data/utt2warp"
  vtln_opts="--vtln-map=ark:$data/utt2warp"
else
  vtln_opts=""
fi

for n in $(seq 1 $nj); do
  # the next command does nothing unless $fbank_pitch_dir/storage/ exists, see
  # utils/create_data_link.pl for more info.
  utils/create_data_link.pl $fbank_pitch_dir/raw_fbank_pitch_$name.$n.ark
done

# Per-job side outputs are off unless the corresponding switch is true.
write_num_frames_opt=
if $write_utt2num_frames; then
  write_num_frames_opt="--write-num-frames=ark,t:$logdir/utt2num_frames.JOB"
fi

write_utt2dur_opt=
if $write_utt2dur; then
  write_utt2dur_opt="--write-utt2dur=ark,t:$logdir/utt2dur.JOB"
fi

# Compute filterbank and pitch features in $nj parallel jobs and paste them
# together.  Each job writes one ark/scp pair named raw_fbank_pitch_$name.JOB
# under $fbank_pitch_dir.
if [ -f $data/segments ]; then
  echo "$0 [info]: segments file exists: using that."
  # One segments list per job.
  split_segments=
  for n in $(seq $nj); do
    split_segments="$split_segments $logdir/segments.$n"
  done

  utils/split_scp.pl $data/segments $split_segments || exit 1;
  rm $logdir/.error 2>/dev/null

  # Two piped rspecifiers, both starting from the extracted segments: one for
  # the filterbank stream and one for the post-processed pitch stream.
  fbank_feats="ark:extract-segments scp,p:$scp $logdir/segments.JOB ark:- |\
    compute-fbank-feats $vtln_opts $write_utt2dur_opt --verbose=2 \
      --config=$fbank_config ark:- ark:- |"
  pitch_feats="ark,s,cs:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | \
    compute-kaldi-pitch-feats --verbose=2 --config=$pitch_config ark:- ark:- | \
    process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |"

  # Paste the two streams frame by frame, then write the combined features.
  $cmd JOB=1:$nj $logdir/make_fbank_pitch_${name}.JOB.log \
    paste-feats --length-tolerance=$paste_length_tolerance \
      "$fbank_feats" "$pitch_feats" ark:- \| \
    copy-feats --compress=$compress $write_num_frames_opt ark:- \
      ark,scp:$fbank_pitch_dir/raw_fbank_pitch_$name.JOB.ark,$fbank_pitch_dir/raw_fbank_pitch_$name.JOB.scp \
     || exit 1;

else
  echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance."
  # One wav list per job, named after the data set.
  split_scps=
  for n in $(seq $nj); do
    split_scps="$split_scps $logdir/wav_${name}.$n.scp"
  done

  utils/split_scp.pl $scp $split_scps || exit 1;

  # Same two streams as above, but read straight from the per-job wav lists.
  fbank_feats="ark:compute-fbank-feats $vtln_opts $write_utt2dur_opt \
   --verbose=2 --config=$fbank_config scp,p:$logdir/wav_${name}.JOB.scp ark:- |"
  pitch_feats="ark,s,cs:compute-kaldi-pitch-feats --verbose=2 \
      --config=$pitch_config scp,p:$logdir/wav_${name}.JOB.scp ark:- | \
    process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |"

  $cmd JOB=1:$nj $logdir/make_fbank_pitch_${name}.JOB.log \
    paste-feats --length-tolerance=$paste_length_tolerance \
      "$fbank_feats" "$pitch_feats" ark:- \| \
    copy-feats --compress=$compress $write_num_frames_opt ark:- \
      ark,scp:$fbank_pitch_dir/raw_fbank_pitch_$name.JOB.ark,$fbank_pitch_dir/raw_fbank_pitch_$name.JOB.scp \
      || exit 1;
fi


# NOTE(review): nothing visible in this script creates $logdir/.error.$name,
# so this check may be vestigial -- confirm.
if [ -f $logdir/.error.$name ]; then
  echo "$0: Error producing filterbank and pitch features for $name:"
  tail $logdir/make_fbank_pitch_${name}.1.log
  exit 1;
fi

# Concatenate the .scp files together.
for n in $(seq $nj); do
  cat $fbank_pitch_dir/raw_fbank_pitch_$name.$n.scp || exit 1
done > $data/feats.scp || exit 1

# Merge the per-job utt2num_frames files, if they were requested.
if $write_utt2num_frames; then
  for n in $(seq $nj); do
    cat $logdir/utt2num_frames.$n || exit 1
  done > $data/utt2num_frames || exit 1
fi

# Merge the per-job utt2dur files, if they were requested.
if $write_utt2dur; then
  for n in $(seq $nj); do
    cat $logdir/utt2dur.$n || exit 1
  done > $data/utt2dur || exit 1
fi

# Store frame_shift, fbank_config and pitch_config along with features.
# The first --frame-shift=<int> line of the fbank config is multiplied by 0.001
# and printed with three decimals; if there is no such line, 0.01 is written.
frame_shift=$(perl -ne 'if (/^--frame-shift=(\d+)/) {
                          printf "%.3f", 0.001 * $1; exit; }' $fbank_config)
echo ${frame_shift:-'0.01'} > $data/frame_shift
mkdir -p $data/conf &&
  cp $fbank_config $data/conf/fbank.conf &&
  cp $pitch_config $data/conf/pitch.conf || exit 1

# Remove the per-job temporary lists; errors for missing files are discarded.
rm $logdir/wav_${name}.*.scp  $logdir/segments.* \
   $logdir/utt2num_frames.* $logdir/utt2dur.* 2>/dev/null

# Compare the number of feature entries with the number of utterances.
nf=$(wc -l < $data/feats.scp)
nu=$(wc -l < $data/utt2spk)
# Any mismatch only produces a warning ...
if [ $nf -ne $nu ]; then
  echo "$0: It seems not all of the feature files were successfully procesed" \
       "($nf != $nu); consider using utils/fix_data_dir.sh $data"
fi

# ... but losing more than nu/20 (integer division) utterances is fatal.
if (( nf < nu - nu/20 )); then
  echo "$0: Less than 95% the features were successfully generated."\
       "Probably a serious error."
  exit 1
fi

echo "$0: Succeeded creating filterbank and pitch features for $name"


================================================
FILE: egs/steps/make_index.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Guoguo Chen)
# Apache 2.0

# Begin configuration section.
# Defaults for the options shown in the usage message below.  They are set
# before parse_options.sh is sourced.
model= # You can specify the model to use
# Job launcher (run.pl, or queue.pl plus queue options).
cmd=run.pl
# Acoustic and LM scales passed to lattice-scale.
acwt=0.083333
lmwt=1.0
# Passed to lattice-to-kws-index as --max-silence-frames.
max_silence_frames=50
# Passed to kws-index-union (--max-states) and lattice-to-kws-index
# (--max-states-scale) respectively.
max_states=1000000
max_states_scale=4
max_expand=180 # limit memory blowup in lattice-align-words
# Passed as --strict to lattice-to-kws-index and kws-index-union.
strict=true
# Passed to lattice-add-penalty as --word-ins-penalty.
word_ins_penalty=0
silence_word=  # Specify this only if you did so in kws_setup
skip_optimization=false     # If you only search for a few thousand keywords, you can probably
                            # skip the optimization; but if you're going to search for
                            # millions of keywords, you'd better set this option to
                            # false and do the optimization on the final index.
frame_subsampling_factor=   # We will try to autodetect this. You should specify
                            # the right value if your directory structure is
                            # non-standard
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

# Exactly four positional arguments are required; otherwise print usage.
if [ $# != 4 ]; then
   echo "Usage: steps/make_index.sh [options] <kws-data-dir> <lang-dir> <decode-dir> <kws-dir>"
   echo "... where <decode-dir> is where you have the lattices, and is assumed to be"
   echo " a sub-directory of the directory where the model is."
   echo "e.g.: steps/make_index.sh data/kws data/lang exp/sgmm2_5a_mmi/decode/ exp/sgmm2_5a_mmi/decode/kws/"
   echo ""
   echo "main options (for others, see top of script file)"
   echo "  --acwt <float>                                   # acoustic scale used for lattice"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   echo "  --lmwt <float>                                   # lm scale used for lattice"
   echo "  --model <model>                                  # which model to use"
   echo "                                                   # speaker-adapted decoding"
   echo "  --max-silence-frames <int>                       # maximum #frames for silence"
   exit 1;
fi


# Positional arguments.
kwsdatadir=$1
langdir=$2
decodedir=$3
kwsdir=$4
# The model directory is taken to be the parent of the decoding directory.
srcdir=$(dirname $decodedir)

mkdir -p $kwsdir/log
# Use the same number of jobs as the decoding that produced the lattices.
nj=$(cat $decodedir/num_jobs) || exit 1
echo $nj > $kwsdir/num_jobs

# Prefer the utter_id file and fall back to utt.map.
utter_id=$kwsdatadir/utter_id
[ -f $utter_id ] || utter_id=$kwsdatadir/utt.map


# Without --model <mdl> on the command line, use the model next to the
# decoding directory.
model=${model:-$srcdir/final.mdl}

for f in $model $decodedir/lat.1.gz $utter_id; do
  if [ ! -f $f ]; then
    echo "$0: Error: no such file $f"
    exit 1
  fi
done

echo "$0: Using model: $model"

# Translate the optional silence word into the --silence-label option.
# silence_opt starts out empty so that a value inherited from the environment
# cannot leak into the lattice-align-words command line.
silence_opt=
if [ -n "$silence_word" ]; then
  # Look the word up by an exact match on the first column of words.txt.
  # A word-regex grep over whole lines can match several entries (for example
  # both "sil" and "<sil>"), or match an id column, which yields a multi-line
  # value.  Appending "" to w forces a string comparison rather than a numeric
  # one.
  silence_int=$(awk -v w="$silence_word" '$1 == w "" { print $2; exit }' $langdir/words.txt)
  if [ -z "$silence_int" ]; then
    echo "$0: Error: could not find integer representation of silence word $silence_word"
    exit 1
  fi
  silence_opt="--silence-label=$silence_int"
fi

# Autodetect the frame subsampling factor unless it was given explicitly: read
# it from the directory above the decoding directory, else assume 1.
if [ -z "$frame_subsampling_factor" ]; then
  frame_subsampling_factor=1
  if [ -f $decodedir/../frame_subsampling_factor ]; then
    frame_subsampling_factor=$(cat $decodedir/../frame_subsampling_factor)
  fi
  echo "$0: Frame subsampling factor autodetected: $frame_subsampling_factor"
fi

# Build one KWS index per decoding job.  Word alignment uses the word-boundary
# file when it exists and the alignment lexicon otherwise; apart from the
# alignment tool the two pipelines are identical:
#   add word insertion penalty -> align words -> scale scores ->
#   convert to KWS index -> union/optimize -> gzip to $kwsdir/index.JOB.gz
word_boundary=$langdir/phones/word_boundary.int
align_lexicon=$langdir/phones/align_lexicon.int
if [ -f $word_boundary ] ; then
  $cmd JOB=1:$nj $kwsdir/log/index.JOB.log \
    lattice-add-penalty --word-ins-penalty=$word_ins_penalty "ark:gzip -cdf $decodedir/lat.JOB.gz|" ark:- \| \
      lattice-align-words $silence_opt --max-expand=$max_expand $word_boundary $model  ark:- ark:- \| \
      lattice-scale --acoustic-scale=$acwt --lm-scale=$lmwt ark:- ark:- \| \
      lattice-to-kws-index --max-states-scale=$max_states_scale --allow-partial=true \
      --frame-subsampling-factor=$frame_subsampling_factor \
      --max-silence-frames=$max_silence_frames --strict=$strict ark:$utter_id ark:- ark:- \| \
      kws-index-union --skip-optimization=$skip_optimization --strict=$strict --max-states=$max_states \
      ark:- "ark:|gzip -c > $kwsdir/index.JOB.gz" || exit 1
elif [ -f $align_lexicon ]; then
  $cmd JOB=1:$nj $kwsdir/log/index.JOB.log \
    lattice-add-penalty --word-ins-penalty=$word_ins_penalty "ark:gzip -cdf $decodedir/lat.JOB.gz|" ark:- \| \
      lattice-align-words-lexicon $silence_opt --max-expand=$max_expand $align_lexicon $model  ark:- ark:- \| \
      lattice-scale --acoustic-scale=$acwt --lm-scale=$lmwt ark:- ark:- \| \
      lattice-to-kws-index --max-states-scale=$max_states_scale --allow-partial=true \
      --frame-subsampling-factor=$frame_subsampling_factor \
      --max-silence-frames=$max_silence_frames --strict=$strict ark:$utter_id ark:- ark:- \| \
      kws-index-union --skip-optimization=$skip_optimization --strict=$strict --max-states=$max_states \
      ark:- "ark:|gzip -c > $kwsdir/index.JOB.gz" || exit 1
else
  # Neither alignment resource is available: nothing can be indexed.
  echo "$0: Error: cannot find either word-boundary file $word_boundary or alignment lexicon $align_lexicon"
  exit 1
fi

exit 0;


================================================
FILE: egs/steps/make_mfcc.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2016  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# To be run from .. (one directory up from here)
# see ../run.sh for example

# Begin configuration section.
# Defaults for the options shown in the usage message below.  They are set
# before parse_options.sh is sourced.
# Number of parallel jobs the input is split into.
nj=4
# Job launcher (run.pl, or queue.pl plus queue options).
cmd=run.pl
# Config file passed to compute-mfcc-feats via --config.
mfcc_config=conf/mfcc.conf
# Passed to copy-feats as --compress.
compress=true
write_utt2num_frames=true  # If true writes utt2num_frames.
# If true, writes the utt2dur file.
write_utt2dur=true
# End configuration section.

echo "$0 $@"  # Print the command line for logging.

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# Between one and three positional arguments are accepted; the usage text goes
# to stderr.
if [ $# -lt 1 ] || [ $# -gt 3 ]; then
  cat >&2 <<EOF
Usage: $0 [options] <data-dir> [<log-dir> [<mfcc-dir>] ]
 e.g.: $0 data/train
Note: <log-dir> defaults to <data-dir>/log, and
      <mfcc-dir> defaults to <data-dir>/data.
Options:
  --mfcc-config <config-file>          # config passed to compute-mfcc-feats.
  --nj <nj>                            # number of parallel jobs.
  --cmd <run.pl|queue.pl <queue opts>> # how to run jobs.
  --write-utt2num-frames <true|false>  # If true, write utt2num_frames file.
  --write-utt2dur <true|false>         # If true, write utt2dur file.
EOF
   exit 1;
fi

data=$1
# The optional second and third arguments fall back to locations under the
# data directory when they are not given.
logdir=${2-$data/log}
mfccdir=${3-$data/data}

# Turn $mfccdir into an absolute path (prefix $PWD unless it starts with /).
mfccdir=$(perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $mfccdir ${PWD})

# The data directory's base name becomes part of the archive names.
name=$(basename $data)

mkdir -p $mfccdir || exit 1;
mkdir -p $logdir || exit 1;

# Keep a copy of any previous feats.scp before it gets regenerated.
if [ -f $data/feats.scp ]; then
  mkdir -p $data/.backup
  echo "$0: moving $data/feats.scp to $data/.backup"
  mv $data/feats.scp $data/.backup
fi

scp=$data/wav.scp

# Inputs that must exist before any work is started.
required="$scp $mfcc_config"

for f in $required; do
  [ -f $f ] && continue
  echo "$0: no such file $f"
  exit 1
done

utils/validate_data_dir.sh --no-text --no-feats $data || exit 1;

# VTLN warp factors: per-speaker ones win over per-utterance ones; with
# neither file present no VTLN options are passed.
vtln_opts=""
if [ -f $data/spk2warp ]; then
  echo "$0 [info]: using VTLN warp factors from $data/spk2warp"
  vtln_opts="--vtln-map=ark:$data/spk2warp --utt2spk=ark:$data/utt2spk"
elif [ -f $data/utt2warp ]; then
  echo "$0 [info]: using VTLN warp factors from $data/utt2warp"
  vtln_opts="--vtln-map=ark:$data/utt2warp"
fi

for n in $(seq 1 $nj); do
  # the next command does nothing unless $mfccdir/storage/ exists, see
  # utils/create_data_link.pl for more info.
  utils/create_data_link.pl $mfccdir/raw_mfcc_$name.$n.ark
done


# Per-job side outputs are off unless the corresponding switch is true.
write_num_frames_opt=
if $write_utt2num_frames; then
  write_num_frames_opt="--write-num-frames=ark,t:$logdir/utt2num_frames.JOB"
fi

write_utt2dur_opt=
if $write_utt2dur; then
  write_utt2dur_opt="--write-utt2dur=ark,t:$logdir/utt2dur.JOB"
fi

# Compute the MFCC features in $nj parallel jobs.  Each job writes one ark/scp
# pair named raw_mfcc_$name.JOB under $mfccdir.
if [ -f $data/segments ]; then
  echo "$0 [info]: segments file exists: using that."

  # One segments list per job.
  split_segments=
  for n in $(seq $nj); do
    split_segments="$split_segments $logdir/segments.$n"
  done

  utils/split_scp.pl $data/segments $split_segments || exit 1;
  rm $logdir/.error 2>/dev/null

  # Cut the segments out of the recordings, compute MFCCs and store them.
  $cmd JOB=1:$nj $logdir/make_mfcc_${name}.JOB.log \
    extract-segments scp,p:$scp $logdir/segments.JOB ark:- \| \
    compute-mfcc-feats $vtln_opts $write_utt2dur_opt --verbose=2 \
      --config=$mfcc_config ark:- ark:- \| \
    copy-feats --compress=$compress $write_num_frames_opt ark:- \
      ark,scp:$mfccdir/raw_mfcc_$name.JOB.ark,$mfccdir/raw_mfcc_$name.JOB.scp \
     || exit 1;

else
  echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance."
  # One wav list per job, named after the data set.
  split_scps=
  for n in $(seq $nj); do
    split_scps="$split_scps $logdir/wav_${name}.$n.scp"
  done

  utils/split_scp.pl $scp $split_scps || exit 1;


  # add ,p to the input rspecifier so that we can just skip over
  # utterances that have bad wave data.

  $cmd JOB=1:$nj $logdir/make_mfcc_${name}.JOB.log \
    compute-mfcc-feats $vtln_opts $write_utt2dur_opt --verbose=2 \
      --config=$mfcc_config scp,p:$logdir/wav_${name}.JOB.scp ark:- \| \
    copy-feats $write_num_frames_opt --compress=$compress ark:- \
      ark,scp:$mfccdir/raw_mfcc_$name.JOB.ark,$mfccdir/raw_mfcc_$name.JOB.scp \
      || exit 1;
fi


if [ -f $logdir/.error.$name ]; then
  echo "$0: Error producing MFCC features for $name:"
  tail $logdir/make_mfcc_${name}.1.log
  exit 1;
fi

# concatenate the .scp files together.
for n in $(seq $nj); do
  cat $mfccdir/raw_mfcc_$name.$n.scp || exit 1
done > $data/feats.scp || exit 1

if $write_utt2num_frames; then
  for n in $(seq $nj); do
    cat $logdir/utt2num_frames.$n || exit 1
  done > $data/utt2num_frames || exit 1
fi

if $write_utt2dur; then
  for n in $(seq $nj); do
    cat $logdir/utt2dur.$n || exit 1
  done > $data/utt2dur || exit 1
fi

# Store frame_shift and mfcc_config along with features.
frame_shift=$(perl -ne 'if (/^--frame-shift=(\d+)/) {
                          printf "%.3f", 0.001 * $1; exit; }' $mfcc_config)
echo ${frame_shift:-'0.01'} > $data/frame_shift
mkdir -p $data/conf && cp $mfcc_config $data/conf/mfcc.conf || exit 1

rm $logdir/wav_${name}.*.scp  $logdir/segments.* \
   $logdir/utt2num_frames.* $logdir/utt2dur.* 2>/dev/null

nf=$(wc -l < $data/feats.scp)
nu=$(wc -l < $data/utt2spk)
if [ $nf -ne $nu ]; then
  echo "$0: It seems not all of the feature files were successfully procesed" \
       "($nf != $nu); consider using utils/fix_data_dir.sh $data"
fi

if (( nf < nu - nu/20 )); then
  echo "$0: Less than 95% the features were successfully generated."\
       "Probably a serious error."
  exit 1
fi


echo "$0: Succeeded creating MFCC features for $name"


================================================
FILE: egs/steps/make_mfcc_pitch.sh
================================================
#!/usr/bin/env bash

# Copyright 2013  The Shenzhen Key Laboratory of Intelligent Media and Speech,
#                 PKU-HKUST Shenzhen Hong Kong Institution (Author: Wei Shi)
#           2016  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# Combine MFCC and pitch features together
# Note: This file is based on make_mfcc.sh and make_pitch_kaldi.sh

# Begin configuration section.
# Default option values; the usage text below lists the command-line flags
# that correspond to them.
nj=4                          # number of parallel jobs.
cmd=run.pl                    # how to run jobs.
mfcc_config=conf/mfcc.conf    # config passed to compute-mfcc-feats.
pitch_config=conf/pitch.conf  # config passed to compute-kaldi-pitch-feats.
pitch_postprocess_config=     # config passed to process-kaldi-pitch-feats.
paste_length_tolerance=2      # length tolerance passed to paste-feats.
compress=true                 # value given to copy-feats --compress.
write_utt2num_frames=true     # If true writes utt2num_frames.
write_utt2dur=true            # If true writes utt2dur.
# End configuration section.

echo "$0 $@"  # Print the command line for logging.

[ -f path.sh ] && . ./path.sh
. parse_options.sh || exit 1

# One to three positional arguments are accepted; anything else prints the
# usage text on stderr and aborts.
if (( $# < 1 || $# > 3 )); then
  cat >&2 <<EOF
Usage: $0 [options] <data-dir> [<log-dir> [<mfcc-dir>] ]
 e.g.: $0 data/train
Note: <log-dir> defaults to <data-dir>/log, and
      <mfcc-dir> defaults to <data-dir>/data
Options:
  --mfcc-config <mfcc-config-file>     # config passed to compute-mfcc-feats.
  --pitch-config <pitch-config-file>   # config passed to compute-kaldi-pitch-feats.
  --pitch-postprocess-config <postprocess-config-file> # config passed to process-kaldi-pitch-feats.
  --paste-length-tolerance <tolerance> # length tolerance passed to paste-feats.
  --nj <nj>                            # number of parallel jobs.
  --cmd <run.pl|queue.pl <queue opts>> # how to run jobs.
  --write-utt2num-frames <true|false>  # If true, write utt2num_frames file.
  --write-utt2dur <true|false>         # If true, write utt2dur file.
EOF
  exit 1
fi

data=$1
# "${N-default}" (no colon) falls back only when the positional argument is
# absent, so an explicitly passed value is always honoured.
logdir=${2-$data/log}
mfcc_pitch_dir=${3-$data/data}

# Turn $mfcc_pitch_dir into an absolute pathname: anything that does not
# start with "/" is prefixed with the current working directory.
mfcc_pitch_dir=$(perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $mfcc_pitch_dir ${PWD})

# The last path component of the data directory becomes part of the archive
# and log file names.
name=$(basename $data)

mkdir -p $mfcc_pitch_dir || exit 1
mkdir -p $logdir || exit 1

# Keep a copy of any previous feats.scp instead of silently overwriting it.
if [ -f $data/feats.scp ]; then
  mkdir -p $data/.backup
  echo "$0: moving $data/feats.scp to $data/.backup"
  mv $data/feats.scp $data/.backup
fi

scp=$data/wav.scp

# Files that must exist before any job is launched.
required="$scp $mfcc_config $pitch_config"

for f in $required; do
  if [ ! -f $f ]; then
    echo "$0: no such file $f"
    exit 1;
  fi
done

utils/validate_data_dir.sh --no-text --no-feats $data || exit 1;

# Only hand a --config option to process-kaldi-pitch-feats when a
# post-processing config was actually supplied.
if [ ! -z "$pitch_postprocess_config" ]; then
  postprocess_config_opt="--config=$pitch_postprocess_config";
else
  postprocess_config_opt=
fi

# VTLN warp factors: a per-speaker map (spk2warp) takes precedence over a
# per-utterance map (utt2warp).
if [ -f $data/spk2warp ]; then
  echo "$0 [info]: using VTLN warp factors from $data/spk2warp"
  vtln_opts="--vtln-map=ark:$data/spk2warp --utt2spk=ark:$data/utt2spk"
elif [ -f $data/utt2warp ]; then
  echo "$0 [info]: using VTLN warp factors from $data/utt2warp"
  vtln_opts="--vtln-map=ark:$data/utt2warp"
else
  # Reset explicitly (as make_mfcc.sh and make_plp.sh do) so that a
  # vtln_opts value inherited from the calling environment can never end up
  # on the compute-mfcc-feats command line.
  vtln_opts=
fi

for n in $(seq $nj); do
  # the next command does nothing unless $mfcc_pitch_dir/storage/ exists, see
  # utils/create_data_link.pl for more info.
  utils/create_data_link.pl $mfcc_pitch_dir/raw_mfcc_pitch_$name.$n.ark
done

# Optional per-job side outputs.  The literal token JOB is left in the file
# name; it is part of the command line later handed to $cmd with JOB=1:$nj.
if $write_utt2num_frames; then
  write_num_frames_opt="--write-num-frames=ark,t:$logdir/utt2num_frames.JOB"
else
  write_num_frames_opt=
fi

if $write_utt2dur; then
  write_utt2dur_opt="--write-utt2dur=ark,t:$logdir/utt2dur.JOB"
else
  write_utt2dur_opt=
fi

# Feature extraction.  Two input layouts are handled:
#   * $data/segments exists: audio is cut out of the recordings listed in
#     wav.scp with extract-segments, one split of the segments file per job;
#   * otherwise wav.scp itself is split into one piece per job.
# In both cases MFCC and pitch features are produced by two piped
# rspecifiers, joined by paste-feats and written by copy-feats as an
# ark/scp pair per job.
if [ -f $data/segments ]; then
  echo "$0 [info]: segments file exists: using that."
  # Build the list of per-job output names for split_scp.pl.
  split_segments=
  for n in $(seq $nj); do
    split_segments="$split_segments $logdir/segments.$n"
  done

  utils/split_scp.pl $data/segments $split_segments || exit 1;
  # NOTE(review): this removes $logdir/.error, whereas the check after the
  # jobs looks for $logdir/.error.$name -- confirm which one is intended.
  rm $logdir/.error 2>/dev/null

  # Piped-input rspecifiers; the literal JOB token stays in the strings and
  # is part of the command given to $cmd (invoked with JOB=1:$nj).
  mfcc_feats="ark:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | \
    compute-mfcc-feats $vtln_opts $write_utt2dur_opt --verbose=2 \
      --config=$mfcc_config ark:- ark:- |"
  pitch_feats="ark,s,cs:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | \
    compute-kaldi-pitch-feats --verbose=2 --config=$pitch_config ark:- ark:- | \
    process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |"

  # The pipe between paste-feats and copy-feats is escaped (\|) so that it
  # is passed to $cmd as an argument rather than interpreted by this shell.
  $cmd JOB=1:$nj $logdir/make_mfcc_pitch_${name}.JOB.log \
    paste-feats --length-tolerance=$paste_length_tolerance \
      "$mfcc_feats" "$pitch_feats" ark:- \| \
    copy-feats --compress=$compress $write_num_frames_opt ark:- \
      ark,scp:$mfcc_pitch_dir/raw_mfcc_pitch_$name.JOB.ark,$mfcc_pitch_dir/raw_mfcc_pitch_$name.JOB.scp \
     || exit 1;

else
  echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance."
  # Build the list of per-job wav.scp pieces for split_scp.pl.
  split_scps=
  for n in $(seq $nj); do
    split_scps="$split_scps $logdir/wav_${name}.$n.scp"
  done

  utils/split_scp.pl $scp $split_scps || exit 1;

  # Same two piped rspecifiers as above, reading the per-job wav.scp piece
  # directly instead of going through extract-segments.
  mfcc_feats="ark:compute-mfcc-feats $vtln_opts $write_utt2dur_opt --verbose=2 \
    --config=$mfcc_config scp,p:$logdir/wav_${name}.JOB.scp ark:- |"
  pitch_feats="ark,s,cs:compute-kaldi-pitch-feats --verbose=2 \
      --config=$pitch_config scp,p:$logdir/wav_${name}.JOB.scp ark:- | \
    process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |"

  $cmd JOB=1:$nj $logdir/make_mfcc_pitch_${name}.JOB.log \
    paste-feats --length-tolerance=$paste_length_tolerance \
      "$mfcc_feats" "$pitch_feats" ark:- \| \
    copy-feats --compress=$compress $write_num_frames_opt ark:- \
      ark,scp:$mfcc_pitch_dir/raw_mfcc_pitch_$name.JOB.ark,$mfcc_pitch_dir/raw_mfcc_pitch_$name.JOB.scp \
      || exit 1;
fi


# Abort if an error marker for this data set is present, showing the end of
# the first job's log as a hint.
if [ -f $logdir/.error.$name ]; then
  echo "$0: Error producing MFCC and pitch features for $name:"
  tail $logdir/make_mfcc_pitch_${name}.1.log
  exit 1
fi

# Merge the per-job .scp files, in job order, into the data directory.
for job in $(seq $nj); do
  cat $mfcc_pitch_dir/raw_mfcc_pitch_$name.$job.scp || exit 1
done > $data/feats.scp || exit 1

# Merge the optional per-job side outputs the same way.
if $write_utt2num_frames; then
  for job in $(seq $nj); do
    cat $logdir/utt2num_frames.$job || exit 1
  done > $data/utt2num_frames || exit 1
fi

if $write_utt2dur; then
  for job in $(seq $nj); do
    cat $logdir/utt2dur.$job || exit 1
  done > $data/utt2dur || exit 1
fi

# Record the frame shift in seconds (the config value is multiplied by
# 0.001; 0.01 is written when the MFCC config has no --frame-shift line)
# and keep copies of both configs next to the features.
frame_shift=$(perl -ne 'if (/^--frame-shift=(\d+)/) {
                          printf "%.3f", 0.001 * $1; exit; }' $mfcc_config)
echo ${frame_shift:-'0.01'} > $data/frame_shift
{ mkdir -p $data/conf &&
    cp $mfcc_config $data/conf/mfcc.conf &&
    cp $pitch_config $data/conf/pitch.conf; } || exit 1

# Remove per-job temporaries; whichever kinds were not created are ignored.
rm $logdir/wav_${name}.*.scp  $logdir/segments.* \
   $logdir/utt2num_frames.* $logdir/utt2dur.* 2>/dev/null

# Compare the number of feature entries with the number of utterances: any
# mismatch gets a warning, a shortfall of more than nu/20 is fatal.
nf=$(wc -l < $data/feats.scp)
nu=$(wc -l < $data/utt2spk)
if (( nf != nu )); then
  echo "$0: It seems not all of the feature files were successfully procesed" \
       "($nf != $nu); consider using utils/fix_data_dir.sh $data"
fi

if [ $nf -lt $(( nu - nu/20 )) ]; then
  echo "$0: Less than 95% the features were successfully generated."\
       "Probably a serious error."
  exit 1
fi

echo "$0: Succeeded creating MFCC and pitch features for $name"


================================================
FILE: egs/steps/make_mfcc_pitch_online.sh
================================================
#!/usr/bin/env bash

# Copyright 2013  The Shenzhen Key Laboratory of Intelligent Media and Speech,
#                 PKU-HKUST Shenzhen Hong Kong Institution (Author: Wei Shi)
#           2014-2016  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# Combine MFCC and online-pitch features together
# Note: This file is based on make_mfcc_pitch.sh

# Begin configuration section.
# Default option values; the usage text below lists the command-line flags
# that correspond to them.
nj=4                                        # number of parallel jobs.
cmd=run.pl                                  # how to run jobs.
mfcc_config=conf/mfcc.conf                  # config passed to compute-mfcc-feats.
online_pitch_config=conf/online_pitch.conf  # config passed to compute-and-process-kaldi-pitch-feats.
paste_length_tolerance=2                    # length tolerance passed to paste-feats.
compress=true                               # value given to copy-feats --compress.
write_utt2num_frames=true                   # If true writes utt2num_frames.
write_utt2dur=true                          # If true writes utt2dur.
# End configuration section.

echo "$0 $@"  # Print the command line for logging.

[ -f path.sh ] && . ./path.sh
. parse_options.sh || exit 1

# One to three positional arguments are accepted; anything else prints the
# usage text on stderr and aborts.
case $# in
  1|2|3)
    ;;
  *)
    cat >&2 <<EOF
Usage: $0 [options] <data-dir> [<log-dir> [<mfcc-dir>] ]
 e.g.: $0 data/train
Note: <log-dir> defaults to <data-dir>/log, and
      <mfcc-dir> defaults to <data-dir>/data
Options:
  --mfcc-config <mfcc-config-file>     # config passed to compute-mfcc-feats [conf/mfcc.conf]
  --online-pitch-config <online-pitch-config-file> # config passed to compute-and-process-kaldi-pitch-feats [conf/online_pitch.conf]
  --paste-length-tolerance <tolerance> # length tolerance passed to paste-feats.
  --nj <nj>                            # number of parallel jobs.
  --cmd <run.pl|queue.pl <queue opts>> # how to run jobs.
  --write-utt2num-frames <true|false>  # If true, write utt2num_frames file.
  --write-utt2dur <true|false>         # If true, write utt2dur file.
EOF
    exit 1
    ;;
esac

data=$1
# "${N-default}" (no colon) falls back only when the positional argument is
# absent, so an explicitly passed value is always honoured.
logdir=${2-$data/log}
mfcc_pitch_dir=${3-$data/data}

# Turn $mfcc_pitch_dir into an absolute pathname: anything that does not
# start with "/" is prefixed with the current working directory.
mfcc_pitch_dir=$(perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $mfcc_pitch_dir ${PWD})

# The last path component of the data directory becomes part of the archive
# and log file names.
name=$(basename $data)

mkdir -p $mfcc_pitch_dir || exit 1
mkdir -p $logdir || exit 1

# Keep a copy of any previous feats.scp instead of silently overwriting it.
if [ -f $data/feats.scp ]; then
  mkdir -p $data/.backup
  echo "$0: moving $data/feats.scp to $data/.backup"
  mv $data/feats.scp $data/.backup
fi

scp=$data/wav.scp

# Files that must exist before any job is launched.
required="$scp $mfcc_config $online_pitch_config"

for f in $required; do
  if [ ! -f $f ]; then
    echo "$0: no such file $f"
    exit 1;
  fi
done
utils/validate_data_dir.sh --no-text --no-feats $data || exit 1;

# VTLN warp factors: a per-speaker map (spk2warp) takes precedence over a
# per-utterance map (utt2warp).
if [ -f $data/spk2warp ]; then
  echo "$0 [info]: using VTLN warp factors from $data/spk2warp"
  vtln_opts="--vtln-map=ark:$data/spk2warp --utt2spk=ark:$data/utt2spk"
elif [ -f $data/utt2warp ]; then
  echo "$0 [info]: using VTLN warp factors from $data/utt2warp"
  vtln_opts="--vtln-map=ark:$data/utt2warp"
else
  # Reset explicitly (as make_mfcc.sh and make_plp.sh do) so that a
  # vtln_opts value inherited from the calling environment can never end up
  # on the compute-mfcc-feats command line.
  vtln_opts=
fi

for n in $(seq $nj); do
  # the next command does nothing unless $mfcc_pitch_dir/storage/ exists, see
  # utils/create_data_link.pl for more info.
  utils/create_data_link.pl $mfcc_pitch_dir/raw_mfcc_online_pitch_$name.$n.ark
done

# Optional per-job side outputs.  The literal token JOB is left in the file
# name; it is part of the command line later handed to $cmd with JOB=1:$nj.
if $write_utt2num_frames; then
  write_num_frames_opt="--write-num-frames=ark,t:$logdir/utt2num_frames.JOB"
else
  write_num_frames_opt=
fi

if $write_utt2dur; then
  write_utt2dur_opt="--write-utt2dur=ark,t:$logdir/utt2dur.JOB"
else
  write_utt2dur_opt=
fi

# Feature extraction.  Two input layouts are handled:
#   * $data/segments exists: audio is cut out of the recordings listed in
#     wav.scp with extract-segments, one split of the segments file per job;
#   * otherwise wav.scp itself is split into one piece per job.
# In both cases MFCC features and pitch features (from
# compute-and-process-kaldi-pitch-feats) are produced by two piped
# rspecifiers, joined by paste-feats and written by copy-feats as an
# ark/scp pair per job.
if [ -f $data/segments ]; then
  echo "$0 [info]: segments file exists: using that."
  # Build the list of per-job output names for split_scp.pl.
  split_segments=
  for n in $(seq $nj); do
    split_segments="$split_segments $logdir/segments.$n"
  done

  utils/split_scp.pl $data/segments $split_segments || exit 1;
  # NOTE(review): this removes $logdir/.error, whereas the check after the
  # jobs looks for $logdir/.error.$name -- confirm which one is intended.
  rm $logdir/.error 2>/dev/null

  # Piped-input rspecifiers; the literal JOB token stays in the strings and
  # is part of the command given to $cmd (invoked with JOB=1:$nj).
  mfcc_feats="ark:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | \
    compute-mfcc-feats $vtln_opts $write_utt2dur_opt --verbose=2 \
      --config=$mfcc_config ark:- ark:- |"
  pitch_feats="ark,s,cs:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | \
    compute-and-process-kaldi-pitch-feats --verbose=2 \
      --config=$online_pitch_config ark:- ark:- |"

  # NOTE(review): the log name is make_mfcc_pitch_*, the same pattern that
  # make_mfcc_pitch.sh uses, although the archives are named
  # raw_mfcc_online_pitch_* -- confirm this sharing is intended.
  $cmd JOB=1:$nj $logdir/make_mfcc_pitch_${name}.JOB.log \
    paste-feats --length-tolerance=$paste_length_tolerance \
      "$mfcc_feats" "$pitch_feats" ark:- \| \
    copy-feats --compress=$compress $write_num_frames_opt ark:- \
      ark,scp:$mfcc_pitch_dir/raw_mfcc_online_pitch_$name.JOB.ark,$mfcc_pitch_dir/raw_mfcc_online_pitch_$name.JOB.scp \
     || exit 1;

else
  echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance."
  # Build the list of per-job wav.scp pieces for split_scp.pl.
  split_scps=
  for n in $(seq $nj); do
    split_scps="$split_scps $logdir/wav_${name}.$n.scp"
  done

  utils/split_scp.pl $scp $split_scps || exit 1;

  # Same two piped rspecifiers as above, reading the per-job wav.scp piece
  # directly instead of going through extract-segments.
  mfcc_feats="ark:compute-mfcc-feats $vtln_opts $write_utt2dur_opt --verbose=2 \
    --config=$mfcc_config scp,p:$logdir/wav_${name}.JOB.scp ark:- |"
  pitch_feats="ark,s,cs:compute-and-process-kaldi-pitch-feats --verbose=2 \
    --config=$online_pitch_config scp,p:$logdir/wav_${name}.JOB.scp ark:- |"

  # The pipe between paste-feats and copy-feats is escaped (\|) so that it
  # is passed to $cmd as an argument rather than interpreted by this shell.
  $cmd JOB=1:$nj $logdir/make_mfcc_pitch_${name}.JOB.log \
    paste-feats --length-tolerance=$paste_length_tolerance \
      "$mfcc_feats" "$pitch_feats" ark:- \| \
    copy-feats --compress=$compress $write_num_frames_opt ark:- \
      ark,scp:$mfcc_pitch_dir/raw_mfcc_online_pitch_$name.JOB.ark,$mfcc_pitch_dir/raw_mfcc_online_pitch_$name.JOB.scp \
      || exit 1;
fi


# Abort if an error marker for this data set is present, showing the end of
# the first job's log as a hint.
if [ -f $logdir/.error.$name ]; then
  echo "$0: Error producing MFCC and online-pitch features for $name:"
  tail $logdir/make_mfcc_pitch_${name}.1.log
  exit 1
fi

# Merge the per-job .scp files, in job order, into the data directory.
for job in $(seq $nj); do
  cat $mfcc_pitch_dir/raw_mfcc_online_pitch_$name.$job.scp || exit 1
done > $data/feats.scp || exit 1

# Merge the optional per-job side outputs the same way.
if $write_utt2num_frames; then
  for job in $(seq $nj); do
    cat $logdir/utt2num_frames.$job || exit 1
  done > $data/utt2num_frames || exit 1
fi

if $write_utt2dur; then
  for job in $(seq $nj); do
    cat $logdir/utt2dur.$job || exit 1
  done > $data/utt2dur || exit 1
fi

# Record the frame shift in seconds (the config value is multiplied by
# 0.001; 0.01 is written when the MFCC config has no --frame-shift line)
# and keep copies of both configs next to the features.
frame_shift=$(perl -ne 'if (/^--frame-shift=(\d+)/) {
                          printf "%.3f", 0.001 * $1; exit; }' $mfcc_config)
echo ${frame_shift:-'0.01'} > $data/frame_shift
{ mkdir -p $data/conf &&
    cp $mfcc_config $data/conf/mfcc.conf &&
    cp $online_pitch_config $data/conf/online_pitch.conf; } || exit 1

# Remove per-job temporaries; whichever kinds were not created are ignored.
rm $logdir/wav_${name}.*.scp  $logdir/segments.* \
   $logdir/utt2num_frames.* $logdir/utt2dur.* 2>/dev/null

# Compare the number of feature entries with the number of utterances: any
# mismatch gets a warning, a shortfall of more than nu/20 is fatal.
nf=$(wc -l < $data/feats.scp)
nu=$(wc -l < $data/utt2spk)
if (( nf != nu )); then
  echo "$0: It seems not all of the feature files were successfully procesed" \
       "($nf != $nu); consider using utils/fix_data_dir.sh $data"
fi

if [ $nf -lt $(( nu - nu/20 )) ]; then
  echo "$0: Less than 95% the features were successfully generated."\
       "Probably a serious error."
  exit 1
fi

echo "$0: Succeeded creating MFCC and online-pitch features for $name"


================================================
FILE: egs/steps/make_phone_graph.sh
================================================
#!/usr/bin/env bash

# e.g.: steps/make_phone_graph.sh data/lang exp/tri2_ali_100k_nodup/ exp/tri2

# Copyright 2013  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.

# This script makes a phone-based LM, without smoothing to unigram, that
# is to be used for segmentation, and uses that together with a model to
# make a decoding graph.
# Uses SRILM.
# See also utils/lang/make_phone_bigram_lm.sh.

# Begin configuration section.
stage=0        # first stage to run; stages below this value are skipped.
cmd=run.pl
N=3  # change N and P for non-trigram systems.
P=1
tscale=1.0 # transition scale.
loopscale=0.1 # scale for self-loops.
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

# Exactly three positional arguments are required.  The usage text uses
# placeholders rather than $dir / $alidir because those variables are only
# assigned after this check.
if [ $# -ne 3 ]; then
  echo "Usage: $0  [options] <lang-dir> <alignment-dir> <model-dir>"
  echo " e.g.: $0 data/lang exp/tri3b_ali exp/tri4b_seg"
  echo "Makes the graph in <model-dir>/phone_graph, corresponding to the model in <model-dir>"
  echo "The alignments from <alignment-dir> are used to train the phone LM."
  exit 1;
fi

lang=$1
alidir=$2
dir=$3


# Inputs that must exist before any stage is run.
for f in $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $dir/final.mdl; do
  if [ ! -f $f ]; then
    echo "$0: expected $f to exist"
    exit 1;
  fi
done

# Locate SRILM's ngram-count: first on the PATH, then in the SRILM directory
# under $KALDI_ROOT/tools that matches the machine type.
loc=`which ngram-count`;
# Quoted so the test stays well-formed even if the path contains spaces.
if [ -z "$loc" ]; then
  if uname -a | grep 64 >/dev/null; then # some kind of 64 bit...
    sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64
  else
    sdir=$KALDI_ROOT/tools/srilm/bin/i686
  fi
  if [ -f $sdir/ngram-count ]; then
    echo Using SRILM tools from $sdir
    export PATH=$PATH:$sdir
  else
    echo You appear to not have SRILM tools installed, either on your path,
    echo or installed in $sdir.  See tools/install_srilm.sh for installation
    echo instructions.
    exit 1
  fi
fi

# From here on any failing command aborts the script.
set -e # exit on error status

mkdir -p $dir/phone_graph

utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt

# Stage 0: turn the alignments into phone sequences.  The awk step prints
# every field except the first of each line, and int2sym.pl maps the
# integer ids to symbols using $lang/phones.txt.
if [ $stage -le 0 ]; then
  echo "$0: creating phone LM-training data"
  gunzip -c $alidir/ali.*gz | ali-to-phones $alidir/final.mdl ark:- ark,t:- | \
    awk '{for (x=2; x <= NF; x++) printf("%s ", $x); printf("\n"); }' | \
    utils/int2sym.pl $lang/phones.txt > $dir/phone_graph/train_phones.txt
fi

# Stage 1: train the phone LM with SRILM's ngram-count.
# NOTE(review): the LM order is fixed at 3 here and is not derived from the
# N option above (which is passed to fstcomposecontext as the context size).
if [ $stage -le 1 ]; then
  echo "$0: building ARPA LM"
  ngram-count -text $dir/phone_graph/train_phones.txt -order 3  \
    -addsmooth1 1 -kndiscount2 -kndiscount3 -interpolate -lm $dir/phone_graph/arpa.gz
fi

# Set the unigram and unigram-backoff log-probs to -99.  we'll later remove the
# arcs from the FST.  This is to avoid CLG blowup, and to increase speed.

# Stage 2: within the 1-grams section of the ARPA file, every three-field
# line has its first and third fields replaced by -99; all other lines are
# copied through unchanged.
if [ $stage -le 2 ]; then
  echo "$0: removing unigrams from ARPA LM"

  gunzip -c $dir/phone_graph/arpa.gz | \
    awk '/\\1-grams/{state=1;} /\\2-grams:/{ state=2; }
       {if(state == 1 && NF == 3) { printf("-99\t%s\t-99\n", $2); } else {print;}}' | \
         gzip -c >$dir/phone_graph/arpa_noug.gz
fi

# Stage 3: compile the ARPA LM into an FST.  The fstprint/awk/fstcompile
# round trip keeps only lines with fewer than five fields or a fifth field
# below 100.0 (presumably dropping the arcs given -99 above -- confirm);
# fstconnect then runs on the result.  A non-stochastic result only
# produces an info message.
if [ $stage -le 3 ]; then
  echo "$0: creating G_phones.fst from ARPA"
  gunzip -c $dir/phone_graph/arpa_noug.gz | \
    arpa2fst --disambig-symbol=#0 --read-symbol-table=$lang/phones.txt - - | \
    fstprint | awk '{if (NF < 5 || $5 < 100.0) { print; }}' | fstcompile | \
    fstconnect > $dir/phone_graph/G_phones.fst
  fstisstochastic $dir/phone_graph/G_phones.fst || echo "[info]: G_phones not stochastic."
fi


# Stage 4: compose the phone LM with phonetic context (context size $N,
# central position $P) and determinize, writing CLG.fst plus the ilabels
# and disambiguation-symbol files.  A non-stochastic result only produces
# an info message.
if [ $stage -le 4 ]; then
  echo "$0: creating CLG."

  fstcomposecontext --context-size=$N --central-position=$P \
   --read-disambig-syms=$lang/phones/disambig.int \
   --write-disambig-syms=$dir/phone_graph/disambig_ilabels_${N}_${P}.int \
    $dir/phone_graph/ilabels_${N}_${P} < $dir/phone_graph/G_phones.fst | \
      fstdeterminize >$dir/phone_graph/CLG.fst
  fstisstochastic $dir/phone_graph/CLG.fst  || echo "[info]: CLG not stochastic."
fi

# Stage 5: build Ha.fst with make-h-transducer from the ilabels, the tree
# and the model in $dir, using $tscale as the transition scale.
if [ $stage -le 5 ]; then
  echo "$0: creating Ha.fst"
  make-h-transducer --disambig-syms-out=$dir/phone_graph/disambig_tid.int \
    --transition-scale=$tscale $dir/phone_graph/ilabels_${N}_${P} $dir/tree $dir/final.mdl \
       > $dir/phone_graph/Ha.fst
fi

# Stage 6: compose Ha with CLG, then determinize, remove the
# disambiguation symbols listed in disambig_tid.int, remove epsilons
# locally and minimize, giving HCLGa.fst.
if [ $stage -le 6 ]; then
  echo "$0: creating HCLGa.fst"
  fsttablecompose $dir/phone_graph/Ha.fst $dir/phone_graph/CLG.fst | \
      fstdeterminizestar --use-log=true | \
      fstrmsymbols $dir/phone_graph/disambig_tid.int | fstrmepslocal | \
      fstminimizeencoded > $dir/phone_graph/HCLGa.fst || exit 1;
  fstisstochastic $dir/phone_graph/HCLGa.fst || echo "HCLGa is not stochastic"
fi

# Stage 7: add self-loops (scaled by $loopscale) to obtain the final
# HCLG.fst, and copy the phone symbol table and phones/ directory next to it.
if [ $stage -le 7 ]; then
  add-self-loops --self-loop-scale=$loopscale --reorder=true \
    $dir/final.mdl < $dir/phone_graph/HCLGa.fst > $dir/phone_graph/HCLG.fst || exit 1;

  # The stochasticity test is only run when both scales are the string
  # "1.0"; with the default loopscale=0.1 it is therefore skipped.
  if [ $tscale == 1.0 -a $loopscale == 1.0 ]; then
    # No point doing this test if transition-scale not 1, as it is bound to fail.
    fstisstochastic $dir/phone_graph/HCLG.fst || echo "[info]: final HCLG is not stochastic."
  fi

  # $lang/phones.txt is the symbol table that corresponds to the output
  # symbols on the graph; decoding scripts expect it as words.txt.
  cp $lang/phones.txt $dir/phone_graph/words.txt
  cp -r $lang/phones $dir/phone_graph/
fi


================================================
FILE: egs/steps/make_plp.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2016  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# To be run from .. (one directory up from here)
# see ../run.sh for example

# Begin configuration section.
# Default option values; the usage text below lists the command-line flags
# that correspond to them.
nj=4                        # number of parallel jobs.
cmd=run.pl                  # how to run jobs.
plp_config=conf/plp.conf    # config passed to compute-plp-feats.
compress=true               # value given to copy-feats --compress.
write_utt2num_frames=true   # If true writes utt2num_frames.
write_utt2dur=true          # If true writes utt2dur.
# End configuration section.

echo "$0 $@"  # Print the command line for logging.

[ -f path.sh ] && . ./path.sh
. parse_options.sh || exit 1

# One to three positional arguments are accepted; anything else prints the
# usage text on stderr and aborts.
if (( $# < 1 || $# > 3 )); then
  cat >&2 <<EOF
Usage: $0 [options] <data-dir> [<log-dir> [<plp-dir>] ]
 e.g.: $0 data/train
Note: <log-dir> defaults to <data-dir>/log, and
      <plp-dir> defaults to <data-dir>/data
Options:
  --plp-config <config-file>           # config passed to compute-plp-feats.
  --nj <nj>                            # number of parallel jobs.
  --cmd <run.pl|queue.pl <queue opts>> # how to run jobs.
  --write-utt2num-frames <true|false>  # If true, write utt2num_frames file.
  --write-utt2dur <true|false>         # If true, write utt2dur file.
EOF
  exit 1
fi

data=$1
# "${N-default}" (no colon) falls back only when the positional argument is
# absent, so an explicitly passed value is always honoured.
logdir=${2-$data/log}
plpdir=${3-$data/data}

# Turn $plpdir into an absolute pathname: anything that does not start with
# "/" is prefixed with the current working directory.
plpdir=$(perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $plpdir ${PWD})

# The last path component of the data directory becomes part of the archive
# and log file names.
name=$(basename $data)

mkdir -p $plpdir || exit 1
mkdir -p $logdir || exit 1

# Keep a copy of any previous feats.scp instead of silently overwriting it.
if [ -f $data/feats.scp ]; then
  mkdir -p $data/.backup
  echo "$0: moving $data/feats.scp to $data/.backup"
  mv $data/feats.scp $data/.backup
fi

scp=$data/wav.scp

# Files that must exist before any job is launched.
required="$scp $plp_config"

for f in $required; do
  [ -f $f ] || { echo "$0: no such file $f"; exit 1; }
done
utils/validate_data_dir.sh --no-text --no-feats $data || exit 1

# VTLN warp factors: empty unless a map is present; a per-speaker map
# (spk2warp) takes precedence over a per-utterance map (utt2warp).
vtln_opts=
if [ -f $data/spk2warp ]; then
  echo "$0 [info]: using VTLN warp factors from $data/spk2warp"
  vtln_opts="--vtln-map=ark:$data/spk2warp --utt2spk=ark:$data/utt2spk"
elif [ -f $data/utt2warp ]; then
  echo "$0 [info]: using VTLN warp factors from $data/utt2warp"
  vtln_opts="--vtln-map=ark:$data/utt2warp"
fi

for job in $(seq $nj); do
  # the next command does nothing unless $plpdir/storage/ exists, see
  # utils/create_data_link.pl for more info.
  utils/create_data_link.pl $plpdir/raw_plp_$name.$job.ark
done

# Optional per-job side outputs, empty unless enabled.  The literal token
# JOB is left in the file name; it is part of the command line later handed
# to $cmd with JOB=1:$nj.
write_num_frames_opt=
if $write_utt2num_frames; then
  write_num_frames_opt="--write-num-frames=ark,t:$logdir/utt2num_frames.JOB"
fi

write_utt2dur_opt=
if $write_utt2dur; then
  write_utt2dur_opt="--write-utt2dur=ark,t:$logdir/utt2dur.JOB"
fi

# Feature extraction.  Two input layouts are handled:
#   * $data/segments exists: audio is cut out of the recordings listed in
#     wav.scp with extract-segments, one split of the segments file per job;
#   * otherwise wav.scp itself is split into one piece per job.
# In both cases compute-plp-feats output is piped into copy-feats, which
# writes an ark/scp pair per job.  The pipes are escaped (\|) so that they
# are passed to $cmd as arguments rather than interpreted by this shell.
if [ -f $data/segments ]; then
  echo "$0 [info]: segments file exists: using that."
  # Build the list of per-job output names for split_scp.pl.
  split_segments=
  for n in $(seq $nj); do
    split_segments="$split_segments $logdir/segments.$n"
  done

  utils/split_scp.pl $data/segments $split_segments || exit 1;
  # NOTE(review): this removes $logdir/.error, whereas the check after the
  # jobs looks for $logdir/.error.$name -- confirm which one is intended.
  rm $logdir/.error 2>/dev/null

  $cmd JOB=1:$nj $logdir/make_plp_${name}.JOB.log \
    extract-segments scp,p:$scp $logdir/segments.JOB ark:- \| \
    compute-plp-feats $vtln_opts $write_utt2dur_opt --verbose=2 \
      --config=$plp_config ark:- ark:- \| \
    copy-feats --compress=$compress $write_num_frames_opt ark:- \
      ark,scp:$plpdir/raw_plp_$name.JOB.ark,$plpdir/raw_plp_$name.JOB.scp \
     || exit 1;

else
  echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance."
  # Build the list of per-job wav.scp pieces for split_scp.pl.
  split_scps=
  for n in $(seq $nj); do
    split_scps="$split_scps $logdir/wav_${name}.$n.scp"
  done

  utils/split_scp.pl $scp $split_scps || exit 1;

  $cmd JOB=1:$nj $logdir/make_plp_${name}.JOB.log \
    compute-plp-feats $vtln_opts $write_utt2dur_opt --verbose=2 \
      --config=$plp_config scp,p:$logdir/wav_${name}.JOB.scp ark:- \| \
    copy-feats --compress=$compress $write_num_frames_opt ark:- \
      ark,scp:$plpdir/raw_plp_$name.JOB.ark,$plpdir/raw_plp_$name.JOB.scp \
      || exit 1;

fi


# Abort if an error marker for this data set is present, showing the end of
# the first job's log as a hint.
if [ -f $logdir/.error.$name ]; then
  echo "$0: Error producing PLP features for $name:"
  tail $logdir/make_plp_${name}.1.log
  exit 1;
fi

# concatenate the .scp files together.
# The trailing "|| exit 1" catches a failure to open $data/feats.scp for
# writing, matching the other make_*.sh feature scripts.
for n in $(seq $nj); do
  cat $plpdir/raw_plp_$name.$n.scp || exit 1
done > $data/feats.scp || exit 1

# Merge the optional per-job side outputs the same way.
if $write_utt2num_frames; then
  for n in $(seq $nj); do
    cat $logdir/utt2num_frames.$n || exit 1
  done > $data/utt2num_frames || exit 1
fi

if $write_utt2dur; then
  for n in $(seq $nj); do
    cat $logdir/utt2dur.$n || exit 1
  done > $data/utt2dur || exit 1
fi

# Store frame_shift and plp_config along with features.
# The config value is multiplied by 0.001 to give seconds; 0.01 is written
# when the PLP config has no --frame-shift line.
frame_shift=$(perl -ne 'if (/^--frame-shift=(\d+)/) {
                          printf "%.3f", 0.001 * $1; exit; }' $plp_config)
echo ${frame_shift:-'0.01'} > $data/frame_shift
mkdir -p $data/conf && cp $plp_config $data/conf/plp.conf || exit 1

# Remove per-job temporaries; whichever kinds were not created are ignored.
rm $logdir/wav_${name}.*.scp  $logdir/segments.* \
   $logdir/utt2num_frames.* $logdir/utt2dur.* 2>/dev/null

# Compare the number of feature entries with the number of utterances: any
# mismatch gets a warning, a shortfall of more than nu/20 is fatal.
nf=$(wc -l < $data/feats.scp)
nu=$(wc -l < $data/utt2spk)
if [ $nf -ne $nu ]; then
  echo "$0: It seems not all of the feature files were successfully procesed" \
       "($nf != $nu); consider using utils/fix_data_dir.sh $data"
fi

if (( nf < nu - nu/20 )); then
  echo "$0: Less than 95% the features were successfully generated."\
       "Probably a serious error."
  exit 1
fi

echo "$0: Succeeded creating PLP features for $name"


================================================
FILE: egs/steps/make_plp_pitch.sh
================================================
#!/usr/bin/env bash

# Copyright 2013  The Shenzhen Key Laboratory of Intelligent Media and Speech,
#                 PKU-HKUST Shenzhen Hong Kong Institution (Author: Wei Shi)
#           2016  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# Combine PLP and pitch features together
# Note: This file is based on make_plp.sh and make_pitch_kaldi.sh

# Begin configuration section.
# Default option values; the usage text below lists the command-line flags
# that correspond to them.
nj=4                          # number of parallel jobs.
cmd=run.pl                    # how to run jobs.
plp_config=conf/plp.conf      # config passed to compute-plp-feats.
pitch_config=conf/pitch.conf  # config passed to compute-kaldi-pitch-feats.
pitch_postprocess_config=     # config passed to process-kaldi-pitch-feats.
paste_length_tolerance=2      # length tolerance passed to paste-feats.
compress=true                 # value given to copy-feats --compress.
write_utt2num_frames=true     # If true writes utt2num_frames.
write_utt2dur=true            # If true writes utt2dur.
# End configuration section.

echo "$0 $@"  # Print the command line for logging.

[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1

# One to three positional arguments are accepted; anything else prints the
# usage text on stderr and aborts.
if (( $# < 1 || $# > 3 )); then
  cat >&2 <<EOF
Usage: $0 [options] <data-dir> [<log-dir> [<plp-dir>] ]
 e.g.: $0 data/train
Note: <log-dir> defaults to <data-dir>/log, and
      <plp-dir> defaults to <data-dir>/data
Options:
  --plp-config <plp-config-file>       # config passed to compute-plp-feats.
  --pitch-config <pitch-config-file>   # config passed to compute-kaldi-pitch-feats.
  --pitch-postprocess-config <postprocess-config-file> # config passed to process-kaldi-pitch-feats.
  --paste-length-tolerance <tolerance> # length tolerance passed to paste-feats.
  --nj <nj>                            # number of parallel jobs.
  --cmd <run.pl|queue.pl <queue opts>> # how to run jobs.
  --write-utt2num-frames <true|false>  # If true, write utt2num_frames file.
  --write-utt2dur <true|false>         # If true, write utt2dur file.
EOF
  exit 1
fi

data=$1
# "${N-default}" (no colon) falls back only when the positional argument is
# absent, so an explicitly passed value is always honoured.
logdir=${2-$data/log}
plp_pitch_dir=${3-$data/data}

# Turn $plp_pitch_dir into an absolute pathname: anything that does not
# start with "/" is prefixed with the current working directory.
plp_pitch_dir=$(perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $plp_pitch_dir ${PWD})

# The last path component of the data directory becomes part of the archive
# and log file names.
name=$(basename $data)

mkdir -p $plp_pitch_dir || exit 1
mkdir -p $logdir || exit 1

# Keep a copy of any previous feats.scp instead of silently overwriting it.
if [ -f $data/feats.scp ]; then
  mkdir -p $data/.backup
  echo "$0: moving $data/feats.scp to $data/.backup"
  mv $data/feats.scp $data/.backup
fi

scp=$data/wav.scp

# Files that must exist before any job is launched.
required="$scp $plp_config $pitch_config"

for f in $required; do
  if [ ! -f $f ]; then
    echo "$0: no such file $f"
    exit 1;
  fi
done
utils/validate_data_dir.sh --no-text --no-feats $data || exit 1;

# Only hand a --config option to process-kaldi-pitch-feats when a
# post-processing config was actually supplied.
if [ ! -z "$pitch_postprocess_config" ]; then
  postprocess_config_opt="--config=$pitch_postprocess_config";
else
  postprocess_config_opt=
fi

# VTLN warp factors: a per-speaker map (spk2warp) takes precedence over a
# per-utterance map (utt2warp).
if [ -f $data/spk2warp ]; then
  echo "$0 [info]: using VTLN warp factors from $data/spk2warp"
  vtln_opts="--vtln-map=ark:$data/spk2warp --utt2spk=ark:$data/utt2spk"
elif [ -f $data/utt2warp ]; then
  echo "$0 [info]: using VTLN warp factors from $data/utt2warp"
  vtln_opts="--vtln-map=ark:$data/utt2warp"
else
  # Reset explicitly (as make_plp.sh does) so that a vtln_opts value
  # inherited from the calling environment can never end up on the
  # compute-plp-feats command line.
  vtln_opts=
fi

for n in $(seq $nj); do
  # the next command does nothing unless $plp_pitch_dir/storage/ exists, see
  # utils/create_data_link.pl for more info.
  utils/create_data_link.pl $plp_pitch_dir/raw_plp_pitch_$name.$n.ark
done

# Optional per-job side outputs.  The literal token JOB is left in the file
# name; it is part of the command line later handed to $cmd with JOB=1:$nj.
if $write_utt2num_frames; then
  write_num_frames_opt="--write-num-frames=ark,t:$logdir/utt2num_frames.JOB"
else
  write_num_frames_opt=
fi

if $write_utt2dur; then
  write_utt2dur_opt="--write-utt2dur=ark,t:$logdir/utt2dur.JOB"
else
  write_utt2dur_opt=
fi

# Feature extraction.  Two input layouts are handled:
#   * $data/segments exists: audio is cut out of the recordings listed in
#     wav.scp with extract-segments, one split of the segments file per job;
#   * otherwise wav.scp itself is split into one piece per job.
# In both cases PLP and pitch features are produced by two piped
# rspecifiers, joined by paste-feats and written by copy-feats as an
# ark/scp pair per job.
if [ -f $data/segments ]; then
  echo "$0 [info]: segments file exists: using that."
  # Build the list of per-job output names for split_scp.pl.
  split_segments=
  for n in $(seq $nj); do
    split_segments="$split_segments $logdir/segments.$n"
  done

  utils/split_scp.pl $data/segments $split_segments || exit 1;
  # NOTE(review): this removes $logdir/.error, whereas the check after the
  # jobs looks for $logdir/.error.$name -- confirm which one is intended.
  rm $logdir/.error 2>/dev/null

  # Piped-input rspecifiers; the literal JOB token stays in the strings and
  # is part of the command given to $cmd (invoked with JOB=1:$nj).
  plp_feats="ark:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | \
    compute-plp-feats $vtln_opts $write_utt2dur_opt --verbose=2 \
      --config=$plp_config ark:- ark:- |"
  pitch_feats="ark,s,cs:extract-segments scp,p:$scp $logdir/segments.JOB ark:- | \
    compute-kaldi-pitch-feats --verbose=2 --config=$pitch_config ark:- ark:- | \
    process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |"

  # The pipe between paste-feats and copy-feats is escaped (\|) so that it
  # is passed to $cmd as an argument rather than interpreted by this shell.
  $cmd JOB=1:$nj $logdir/make_plp_pitch_${name}.JOB.log \
    paste-feats --length-tolerance=$paste_length_tolerance \
      "$plp_feats" "$pitch_feats" ark:- \| \
    copy-feats --compress=$compress $write_num_frames_opt ark:- \
      ark,scp:$plp_pitch_dir/raw_plp_pitch_$name.JOB.ark,$plp_pitch_dir/raw_plp_pitch_$name.JOB.scp \
     || exit 1;

else
  echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance."
  # Build the list of per-job wav.scp pieces for split_scp.pl.
  split_scps=
  for n in $(seq $nj); do
    split_scps="$split_scps $logdir/wav_${name}.$n.scp"
  done

  utils/split_scp.pl $scp $split_scps || exit 1;

  # Same two piped rspecifiers as above, reading the per-job wav.scp piece
  # directly instead of going through extract-segments.
  plp_feats="ark:compute-plp-feats $vtln_opts $write_utt2dur_opt --verbose=2 \
    --config=$plp_config scp,p:$logdir/wav_${name}.JOB.scp ark:- |"
  pitch_feats="ark,s,cs:compute-kaldi-pitch-feats --verbose=2 \
      --config=$pitch_config scp,p:$logdir/wav_${name}.JOB.scp ark:- | \
    process-kaldi-pitch-feats $postprocess_config_opt ark:- ark:- |"

  $cmd JOB=1:$nj $logdir/make_plp_pitch_${name}.JOB.log \
    paste-feats --length-tolerance=$paste_length_tolerance \
      "$plp_feats" "$pitch_feats" ark:- \| \
    copy-feats --compress=$compress $write_num_frames_opt ark:- \
      ark,scp:$plp_pitch_dir/raw_plp_pitch_$name.JOB.ark,$plp_pitch_dir/raw_plp_pitch_$name.JOB.scp \
      || exit 1;
fi


# Abort if an error marker file exists for this dataset, printing the tail of
# the first job's log as a hint.
# NOTE(review): the marker is presumably created by an earlier step that is
# not visible here -- confirm.
if [ -f $logdir/.error.$name ]; then
  echo "$0: Error producing PLP and pitch features for $name:"
  tail $logdir/make_plp_pitch_${name}.1.log
  exit 1;
fi

# Concatenate the .scp files together.
# Jobs are visited in order 1..$nj; a per-job file that cannot be read, or a
# failure to write the merged file, aborts the script.
for n in $(seq $nj); do
  cat $plp_pitch_dir/raw_plp_pitch_$name.$n.scp || exit 1
done > $data/feats.scp || exit 1

# Optionally merge the per-job utt2num_frames files into the data directory.
if $write_utt2num_frames; then
  for n in $(seq $nj); do
    cat $logdir/utt2num_frames.$n || exit 1
  done > $data/utt2num_frames || exit 1
fi

# Optionally merge the per-job utt2dur files into the data directory.
if $write_utt2dur; then
  for n in $(seq $nj); do
    cat $logdir/utt2dur.$n || exit 1
  done > $data/utt2dur || exit 1
fi

# Store frame_shift, plp_config and pitch_config along with features.
# The frame shift is taken from the first line of the PLP config that starts
# with "--frame-shift=<integer>" and is multiplied by 0.001 (presumably
# milliseconds -> seconds); if no such line is found, 0.01 is written instead.
frame_shift=$(perl -ne 'if (/^--frame-shift=(\d+)/) {
                          printf "%.3f", 0.001 * $1; exit; }' $plp_config)
echo ${frame_shift:-'0.01'} > $data/frame_shift
# Keep copies of both configs next to the features; any failure in the
# mkdir/cp chain aborts the script.
mkdir -p $data/conf &&
  cp $plp_config $data/conf/plp.conf &&
  cp $pitch_config $data/conf/pitch.conf || exit 1

# Remove the temporary per-job files.  Errors from rm (e.g. for patterns that
# match nothing) are silenced and do not stop the script here.
rm $logdir/wav_${name}.*.scp  $logdir/segments.* \
   $logdir/utt2num_frames.* $logdir/utt2dur.* 2>/dev/null

# Compare the number of feature entries with the number of utterances.
num_feats=$(wc -l < $data/feats.scp)
num_utts=$(wc -l < $data/utt2spk)

# Any mismatch only produces a warning ...
if [ $num_feats -ne $num_utts ]; then
  echo "$0: It seems not all of the feature files were successfully procesed" \
       "($num_feats != $num_utts); consider using utils/fix_data_dir.sh $data"
fi

# ... but losing more than one utterance in twenty (integer arithmetic) is
# treated as fatal.
min_required=$(( num_utts - num_utts/20 ))
if (( num_feats < min_required )); then
  echo "$0: Less than 95% the features were successfully generated. Probably a serious error."
  exit 1
fi

echo "$0: Succeeded creating PLP and pitch features for $name"


================================================
FILE: egs/steps/nnet/align.sh
================================================
#!/usr/bin/env bash
# Copyright 2012-2015 Brno University of Technology (author: Karel Vesely)
# Apache 2.0

# Aligns 'data' to sequences of transition-ids using Neural Network based acoustic model.
# Optionally produces alignment in lattice format, this is handy to get word alignment.

# Begin configuration section.
# NOTE(review): these defaults are presumably overridable from the command
# line as --name value via parse_options.sh (sourced below) -- confirm.
nj=4          # number of parallel jobs (JOB=1:$nj)
cmd=run.pl    # job launcher used to run the per-job commands
stage=0       # the alignment step runs only when stage <= 0
# Begin configuration.
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" # passed to align-compiled-mapped
beam=10        # passed as --beam to align-compiled-mapped
retry_beam=40  # passed as --retry-beam to align-compiled-mapped
nnet_forward_opts="--no-softmax=true --prior-scale=1.0" # passed to nnet-forward
ivector=            # rx-specifier with i-vectors (ark-with-vectors),
text= # (optional) transcripts we align to; defaults to the 'text' file of each data split,

align_to_lats=false # optionally produce alignment in lattice format
 # options used only when align_to_lats=true:
 lats_decode_opts="--acoustic-scale=0.1 --beam=20 --lattice_beam=10"
 lats_graph_scales="--transition-scale=1.0 --self-loop-scale=0.1"

use_gpu="no" # yes|no|optional (passed to nnet-forward as --use-gpu)
# End configuration options.

[ $# -gt 0 ] && echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh # source the path.
# NOTE(review): parse_options.sh presumably consumes the leading --option
# arguments and leaves only the positional ones -- confirm.
. parse_options.sh || exit 1;

# From here on: abort on command failure, on use of an unset variable, and on
# failure of any stage of a pipeline.
set -euo pipefail

# Exactly four positional arguments are required; otherwise print usage.
if [ $# != 4 ]; then
   echo "usage: $0 <data-dir> <lang-dir> <src-dir> <align-dir>"
   echo "e.g.:  $0 data/train data/lang exp/tri1 exp/tri1_ali"
   echo "main options (for others, see top of script file)"
   echo "  --config <config-file>                           # config containing options"
   echo "  --nj <nj>                                        # number of parallel jobs"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   exit 1;
fi

# Positional arguments: data dir, lang dir, source model dir, output dir.
data=$1
lang=$2
srcdir=$3
dir=$4

mkdir -p $dir/log
echo $nj > $dir/num_jobs
sdata=$data/split$nj
# Split the data unless the split directory already exists and feats.scp is
# older than it.
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;

# The phone set of the lang dir must be compatible with the one of the model.
utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt
cp $lang/phones.txt $dir

cp $srcdir/{tree,final.mdl} $dir || exit 1;

# Select default locations to model files
nnet=$srcdir/final.nnet;
class_frame_counts=$srcdir/ali_train_pdf.counts
feature_transform=$srcdir/final.feature_transform
model=$dir/final.mdl

# Check that files exist
for f in $sdata/1/feats.scp $lang/L.fst $nnet $model $feature_transform $class_frame_counts; do
  [ ! -f $f ] && echo "$0: missing file $f" && exit 1;
done
# Without --text the per-split 'text' files are used, so they must exist.
# (The message names the file that is actually missing, not the stale loop
# variable of the check above.)
[ -z "$text" -a ! -f $sdata/1/text ] && echo "$0: missing file $sdata/1/text" && exit 1


# PREPARE FEATURE EXTRACTION PIPELINE
# The pipeline is built up as one rx-specifier string in $feats; the literal
# token JOB is kept in it and replaced per job when the command is launched.
# import config,
online_cmvn_opts=
cmvn_opts=
delta_opts=
D=$srcdir
[ -e $D/online_cmvn_opts ] && online_cmvn_opts=$(cat $D/online_cmvn_opts)
[ -e $D/cmvn_opts ] && cmvn_opts=$(cat $D/cmvn_opts)
[ -e $D/delta_opts ] && delta_opts=$(cat $D/delta_opts)
#
# Create the feature stream,
feats="ark,s,cs:copy-feats scp:$sdata/JOB/feats.scp ark:- |"
# apply-cmvn-online (optional),
[ -n "$online_cmvn_opts" -a ! -f $D/global_cmvn_stats.mat ] && echo "$0: Missing $D/global_cmvn_stats.mat" && exit 1
# The spk2utt map is taken from the same data split as the features.  (The
# previously referenced variable 'srcdata' is never set in this script, which
# aborts under 'set -u' as soon as online CMVN is enabled.)
[ -n "$online_cmvn_opts" ] && feats="$feats apply-cmvn-online $online_cmvn_opts --spk2utt=ark:$sdata/JOB/spk2utt $D/global_cmvn_stats.mat ark:- ark:- |"
# apply-cmvn (optional),
[ -n "$cmvn_opts" -a ! -f $sdata/1/cmvn.scp ] && echo "$0: Missing $sdata/1/cmvn.scp" && exit 1
[ -n "$cmvn_opts" ] && feats="$feats apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp ark:- ark:- |"
# add-deltas (optional),
[ -n "$delta_opts" ] && feats="$feats add-deltas $delta_opts ark:- ark:- |"

# add-ivector (optional),
if [ -e $D/ivector_dim ]; then
  # Quoted: the rx-specifier may contain spaces (e.g. a piped command).
  [ -z "$ivector" ] && echo "Missing --ivector, they were used in training!" && exit 1
  # Get the tool,
  ivector_append_tool=append-vector-to-feats # default,
  [ -e $D/ivector_append_tool ] && ivector_append_tool=$(cat $D/ivector_append_tool)
  # Check dims,
  # (the dimension of job 1's features with and without the i-vector appended
  # gives the i-vector dimension, which must match the one stored in $D)
  feats_job_1=$(sed 's:JOB:1:g' <(echo $feats))
  dim_raw=$(feat-to-dim "$feats_job_1" -)
  dim_raw_and_ivec=$(feat-to-dim "$feats_job_1 $ivector_append_tool ark:- '$ivector' ark:- |" -)
  dim_ivec=$((dim_raw_and_ivec - dim_raw))
  [ $dim_ivec != "$(cat $D/ivector_dim)" ] && \
    echo "Error, i-vector dim. mismatch (expected $(cat $D/ivector_dim), got $dim_ivec in '$ivector')" && \
    exit 1
  # Append to feats,
  feats="$feats $ivector_append_tool ark:- '$ivector' ark:- |"
fi

# nnet-forward,
feats="$feats nnet-forward $nnet_forward_opts --feature-transform=$feature_transform --class-frame-counts=$class_frame_counts --use-gpu=$use_gpu $nnet ark:- ark:- |"
#
#

echo "$0: aligning data '$data' using nnet/model '$srcdir', putting alignments in '$dir'"

# Map oovs in reference transcription,
# (the integer id from $lang/oov.int is handed to sym2int.pl via --map-oov)
oov=`cat $lang/oov.int` || exit 1;
# Without --text, each job reads the 'text' file of its own data split.
[ -z "$text" ] && text=$sdata/JOB/text
tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $text |";
# We could just use align-mapped in the next line, but it's less efficient as it compiles the
# training graphs one by one.
# Each job compiles the training graphs for its transcripts and pipes them to
# the aligner, which writes gzipped text-format alignments to ali.JOB.gz.
if [ $stage -le 0 ]; then
  $cmd JOB=1:$nj $dir/log/align.JOB.log \
    compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/final.mdl $lang/L.fst "$tra" ark:- \| \
    align-compiled-mapped $scale_opts --beam=$beam --retry-beam=$retry_beam $dir/final.mdl ark:- \
      "$feats" "ark,t:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi

# Optionally align to lattice format (handy to get word alignment)
# Note: this step is not gated by $stage; it runs whenever align_to_lats is "true".
if [ "$align_to_lats" == "true" ]; then
  echo "$0: aligning also to lattices '$dir/lat.*.gz'"
  $cmd JOB=1:$nj $dir/log/align_lat.JOB.log \
    compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $lats_graph_scales $dir/tree $dir/final.mdl  $lang/L.fst "$tra" ark:- \| \
    latgen-faster-mapped $lats_decode_opts --word-symbol-table=$lang/words.txt $dir/final.mdl ark:- \
      "$feats" "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1;
fi

echo "$0: done aligning data."


================================================
FILE: egs/steps/nnet/decode.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2015 Brno University of Technology (author: Karel Vesely), Daniel Povey
# Apache 2.0

# Begin configuration section.
# NOTE(review): these defaults are presumably overridable from the command
# line as --name value via parse_options.sh (sourced below) -- confirm.
nnet=               # non-default location of DNN (optional)
feature_transform=  # non-default location of feature_transform (optional)
model=              # non-default location of transition model (optional)
class_frame_counts= # non-default location of PDF counts (optional)
srcdir=             # non-default location of DNN-dir (decouples model dir from decode dir)
ivector=            # rx-specifier with i-vectors (ark-with-vectors),

blocksoftmax_dims=   # 'csl' with block-softmax dimensions: dim1,dim2,dim3,...
blocksoftmax_active= # '1' for the 1st block,

stage=0 # stage=1 skips lattice generation
nj=4          # number of parallel jobs (JOB=1:$nj)
cmd=run.pl    # job launcher used to run the per-job commands

acwt=0.10 # note: only really affects pruning (scoring is on lattices).
beam=13.0             # passed as --beam to the lattice generator
lattice_beam=8.0      # passed as --lattice-beam to the lattice generator
min_active=200        # passed as --min-active to the lattice generator
max_active=7000 # limit of active tokens
max_mem=50000000 # approx. limit to memory consumption during minimization in bytes
nnet_forward_opts="--no-softmax=true --prior-scale=1.0" # passed to nnet-forward

skip_scoring=false    # when true, local/score.sh is not run
scoring_opts="--min-lmwt 4 --max-lmwt 15" # forwarded to local/score.sh

num_threads=1 # if >1, will use latgen-faster-parallel
parallel_opts=   # Ignored now.
use_gpu="no" # yes|no|optional (passed to nnet-forward as --use-gpu)
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
# NOTE(review): parse_options.sh presumably consumes the leading --option
# arguments and leaves only the positional ones -- confirm.
. parse_options.sh || exit 1;

# From here on: abort on command failure, on use of an unset variable, and on
# failure of any stage of a pipeline.
set -euo pipefail

# Exactly three positional arguments are required; otherwise print usage.
if [ $# != 3 ]; then
   echo "Usage: $0 [options] <graph-dir> <data-dir> <decode-dir>"
   echo "... where <decode-dir> is assumed to be a sub-directory of the directory"
   echo " where the DNN and transition model is."
   echo "e.g.: $0 exp/dnn1/graph_tgpr data/test exp/dnn1/decode_tgpr"
   echo ""
   echo "This script works on plain or modified features (CMN,delta+delta-delta),"
   echo "which are then sent through feature-transform. It works out what type"
   echo "of features you used from content of srcdir."
   echo ""
   echo "main options (for others, see top of script file)"
   echo "  --config <config-file>                           # config containing options"
   echo "  --nj <nj>                                        # number of parallel jobs"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   echo ""
   echo "  --nnet <nnet>                                    # non-default location of DNN (opt.)"
   echo "  --srcdir <dir>                                   # non-default dir with DNN/models, can be different"
   echo "                                                   # from parent dir of <decode-dir>' (opt.)"
   echo ""
   echo "  --acwt <float>                                   # select acoustic scale for decoding"
   echo "  --scoring-opts <opts>                            # options forwarded to local/score.sh"
   echo "  --num-threads <N>                                # N>1: run multi-threaded decoder"
   exit 1;
fi


# Positional arguments: decoding graph dir, data dir, output (decode) dir.
graphdir=$1
data=$2
dir=$3
# Default model directory one level up from decoding directory.
# (Operands are quoted so the test and dirname also work for paths that
# contain whitespace.)
[ -z "$srcdir" ] && srcdir=$(dirname "$dir")
sdata=$data/split$nj;

mkdir -p $dir/log

# Split the data unless the split directory already exists and feats.scp is
# older than it.
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs

# Select default locations to model files (if not already set externally)
[ -z "$nnet" ] && nnet=$srcdir/final.nnet
[ -z "$model" ] && model=$srcdir/final.mdl
[ -z "$feature_transform" -a -e $srcdir/final.feature_transform ] && feature_transform=$srcdir/final.feature_transform
#
[ -z "$class_frame_counts" -a -f $srcdir/prior_counts ] && class_frame_counts=$srcdir/prior_counts # priority,
[ -z "$class_frame_counts" ] && class_frame_counts=$srcdir/ali_train_pdf.counts

# Check that files exist,
# ($feature_transform is deliberately left unquoted: when it is empty it
# simply drops out of the list.)
for f in $sdata/1/feats.scp $nnet $model $feature_transform $class_frame_counts $graphdir/HCLG.fst; do
  [ ! -f $f ] && echo "$0: missing file $f" && exit 1;
done

# Possibly use multi-threaded decoder
# (the string is appended directly to the name of the decoder binary)
thread_string=
[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads"


# PREPARE FEATURE EXTRACTION PIPELINE
# The pipeline is built up as one rx-specifier string in $feats; the literal
# token JOB is kept in it and replaced per job when the command is launched.
# import config,
online_cmvn_opts=
cmvn_opts=
delta_opts=
D=$srcdir
[ -e $D/online_cmvn_opts ] && online_cmvn_opts=$(cat $D/online_cmvn_opts)
[ -e $D/cmvn_opts ] && cmvn_opts=$(cat $D/cmvn_opts)
[ -e $D/delta_opts ] && delta_opts=$(cat $D/delta_opts)
#
# Create the feature stream,
feats="ark,s,cs:copy-feats scp:$sdata/JOB/feats.scp ark:- |"
# apply-cmvn-online (optional),
[ -n "$online_cmvn_opts" -a ! -f $D/global_cmvn_stats.mat ] && echo "$0: Missing $D/global_cmvn_stats.mat" && exit 1
# The spk2utt map is taken from the same data split as the features.  (The
# previously referenced variable 'srcdata' is never set in this script, which
# aborts under 'set -u' as soon as online CMVN is enabled.)
[ -n "$online_cmvn_opts" ] && feats="$feats apply-cmvn-online $online_cmvn_opts --spk2utt=ark:$sdata/JOB/spk2utt $D/global_cmvn_stats.mat ark:- ark:- |"
# apply-cmvn (optional),
[ -n "$cmvn_opts" -a ! -f $sdata/1/cmvn.scp ] && echo "$0: Missing $sdata/1/cmvn.scp" && exit 1
[ -n "$cmvn_opts" ] && feats="$feats apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp ark:- ark:- |"
# add-deltas (optional),
[ -n "$delta_opts" ] && feats="$feats add-deltas $delta_opts ark:- ark:- |"

# add-ivector (optional),
if [ -e $D/ivector_dim ]; then
  # Quoted: the rx-specifier may contain spaces (e.g. a piped command).
  [ -z "$ivector" ] && echo "Missing --ivector, they were used in training!" && exit 1
  # Get the tool,
  ivector_append_tool=append-vector-to-feats # default,
  [ -e $D/ivector_append_tool ] && ivector_append_tool=$(cat $D/ivector_append_tool)
  # Check dims,
  # (the dimension of job 1's features with and without the i-vector appended
  # gives the i-vector dimension, which must match the one stored in $D)
  feats_job_1=$(sed 's:JOB:1:g' <(echo $feats))
  dim_raw=$(feat-to-dim "$feats_job_1" -)
  dim_raw_and_ivec=$(feat-to-dim "$feats_job_1 $ivector_append_tool ark:- '$ivector' ark:- |" -)
  dim_ivec=$((dim_raw_and_ivec - dim_raw))
  [ $dim_ivec != "$(cat $D/ivector_dim)" ] && \
    echo "Error, i-vector dim. mismatch (expected $(cat $D/ivector_dim), got $dim_ivec in '$ivector')" && \
    exit 1
  # Append to feats,
  feats="$feats $ivector_append_tool ark:- '$ivector' ark:- |"
fi

# select a block from blocksoftmax,
if [ ! -z "$blocksoftmax_dims" ]; then
  # blocksoftmax_dims is a csl: dim1,dim2,dim3,... (':' is accepted as a
  # separator too); blocksoftmax_active is the 1-based index of the block to keep.
  [ -z "$blocksoftmax_active" ] && echo "$0 Missing option --blocksoftmax-active N" && exit 1
  # getting dims,
  #  dim_total = sum of all block dims,
  #  dim_block = dim of the active block,
  #  offset    = sum of the dims of the blocks preceding the active one,
  dim_total=$(awk -F'[:,]' '{ for(i=1;i<=NF;i++) { sum += $i }; print sum; }' <(echo $blocksoftmax_dims))
  dim_block=$(awk -F'[:,]' -v active=$blocksoftmax_active '{ print $active; }' <(echo $blocksoftmax_dims))
  offset=$(awk -F'[:,]' -v active=$blocksoftmax_active '{ sum=0; for(i=1;i<active;i++) { sum += $i }; print sum; }' <(echo $blocksoftmax_dims))
  # create components which select a block,
  # (a <Copy> of indices offset+1 .. offset+dim_block, followed by a <Softmax>)
  nnet-initialize <(echo "<Copy> <InputDim> $dim_total <OutputDim> $dim_block <BuildVector> $((1+offset)):$((offset+dim_block)) </BuildVector>";
                    echo "<Softmax> <InputDim> $dim_block <OutputDim> $dim_block") $dir/copy_and_softmax.nnet
  # nnet is assembled on-the fly, <BlockSoftmax> is removed, while <Copy> + <Softmax> is added,
  # ($nnet becomes a piped command instead of a file name from here on)
  nnet="nnet-concat 'nnet-copy --remove-last-components=1 $nnet - |' $dir/copy_and_softmax.nnet - |"
fi

# Run the decoding in the queue,
# Each job pipes the nnet-forward output into the lattice generator and writes
# gzipped lattices to lat.JOB.gz.  One thread more than num_threads is
# requested from $cmd (presumably to account for nnet-forward -- confirm).
if [ $stage -le 0 ]; then
  $cmd --num-threads $((num_threads+1)) JOB=1:$nj $dir/log/decode.JOB.log \
    nnet-forward $nnet_forward_opts --feature-transform=$feature_transform --class-frame-counts=$class_frame_counts --use-gpu=$use_gpu "$nnet" "$feats" ark:- \| \
    latgen-faster-mapped$thread_string --min-active=$min_active --max-active=$max_active --max-mem=$max_mem --beam=$beam \
    --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \
    $model $graphdir/HCLG.fst ark:- "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
fi

# Run the scoring
# A missing or non-executable local/score.sh is treated as an error (exit 1),
# unless scoring is skipped altogether.
if ! $skip_scoring ; then
  [ ! -x local/score.sh ] && \
    echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
  local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir || exit 1;
fi

exit 0;


================================================
FILE: egs/steps/nnet/ivector/extract_ivectors.sh
================================================
#!/usr/bin/env bash

# Copyright     2013  Daniel Povey
#               2016  Brno University of Technology (author: Karel Vesely)
# Apache 2.0.


# This script computes iVectors in the same format as extract_ivectors_online.sh,
# except that they are actually not really computed online, they are first computed
# per speaker and just duplicated many times.
# This is mainly intended for use in decoding, where you want the best possible
# quality of iVectors.
#
# This setup also makes it possible to use a previous decoding or alignment, to
# down-weight silence in the stats (default is --silence-weight 0.0).
#
# This is for when you use the "online-decoding" setup in an offline task, and
# you want the best possible results.


# Begin configuration section.
# NOTE(review): these defaults are presumably overridable from the command
# line as --name value via parse_options.sh (sourced below) -- confirm.
nj=30          # number of parallel extraction jobs (JOB=1:$nj)
cmd="run.pl"   # job launcher used to run the per-job commands
stage=0        # allows skipping the earlier steps on a partial re-run
num_gselect=5 # Gaussian-selection using diagonal model: number of Gaussians to select
min_post=0.025 # Minimum posterior to use (posteriors below this are pruned out)

posterior_scale=0.1 # Scale on the acoustic posteriors, intended to account for
                    # inter-frame correlations.  Making this small during iVector
                    # extraction is equivalent to scaling up the prior, and
                    # will tend to produce smaller iVectors where data-counts are
                    # small.  It's not so important that this match the value
                    # used when training the iVector extractor, but more important
                    # that this match the value used when you do real online decoding
                    # with the neural nets trained with these iVectors.

max_count=100       # Interpret this as a number of frames times posterior scale...
                    # this config ensures that once the count exceeds this (i.e.
                    # 1000 frames, or 10 seconds, by default), we start to scale
                    # down the stats, accentuating the prior term.   This seems quite
                    # important for some reason.

silence_weight=0.0  # weight given to silence phones by weight-silence-post
acwt=0.1  # used if input is a decode dir, to get best path from lattices.
mdl=final  # change this if decode directory did not have ../final.mdl present.

# End configuration section.

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;


# Four or five positional arguments are accepted (the 4th of five is the
# optional source of silence weights); otherwise print usage.
# The usage text is kept in line with the script: the example includes the
# mandatory <lang> argument, --nj shows the real default, and only options
# that have a matching configuration variable are listed.
if [ $# != 4 ] && [ $# != 5 ]; then
  echo "Usage: $0 [options] <data> <lang> <extractor-dir> [<alignment-dir>|<decode-dir>|<weights-archive>] <ivector-dir>"
  echo " e.g.: $0 data/test data/lang exp/nnet2_online/extractor exp/tri3/decode_test exp/nnet2_online/ivectors_test"
  echo "main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --nj <n|30>                                      # Number of jobs (also see num-processes and num-threads)"
  echo "                                                   # Ignored if <alignment-dir> or <decode-dir> supplied."
  echo "  --stage <stage|0>                                # To control partial reruns"
  echo "  --num-gselect <n|5>                              # Number of Gaussians to select using"
  echo "                                                   # diagonal model."
  echo "  --min-post <float;default=0.025>                 # Pruning threshold for posteriors"
  echo "  --posterior-scale <float;default=0.1>            # Scale on posteriors in iVector extraction; "
  echo "                                                   # affects strength of prior term."

  exit 1;
fi

# From here on: abort on command failure, on use of an unset variable and on
# failure of any pipeline stage; also trace every command (-x).
set -euxo pipefail

# The alignment/decode dir (or weights archive) is optional.  It has to be
# initialised to an empty value: with 'set -u' active, the later tests of
# "$ali_or_decode_dir" would otherwise abort the script in the 4-argument form.
ali_or_decode_dir=
if [ $# -eq 4 ]; then
  data=$1
  lang=$2
  srcdir=$3
  dir=$4
else # 5 arguments
  data=$1
  lang=$2
  srcdir=$3
  ali_or_decode_dir=$4
  dir=$5
fi

# Required inputs: features, the i-vector extractor, the diagonal UBM and the
# phone table of the lang dir.
for f in $data/feats.scp $srcdir/final.ie $srcdir/final.dubm $lang/phones.txt; do
  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
done

mkdir -p $dir/log
# Colon-separated list of silence phones, handed to weight-silence-post.
silphonelist=$(cat $lang/phones/silence.csl) || exit 1;

# Prepare per-frame weights ($dir/weights.gz) that down-weight silence.  The
# source is detected from the content of the optional argument: an alignment
# dir (ali.1.gz), a decode dir (lat.1.gz) or a ready-made gzipped archive.
if [ ! -z "$ali_or_decode_dir" ]; then

  if [ -f $ali_or_decode_dir/ali.1.gz ]; then
    if [ ! -f $ali_or_decode_dir/${mdl}.mdl ]; then
      echo "$0: expected $ali_or_decode_dir/${mdl}.mdl to exist."
      exit 1;
    fi
    nj_orig=$(cat $ali_or_decode_dir/num_jobs) || exit 1;

    if [ $stage -le 0 ]; then
      rm $dir/weights.*.gz 2>/dev/null || true

      # Use the same model that was checked for above (${mdl}.mdl), so that
      # --mdl is honoured for alignment dirs as it is for decode dirs.
      $cmd JOB=1:$nj_orig  $dir/log/ali_to_post.JOB.log \
        gunzip -c $ali_or_decode_dir/ali.JOB.gz \| \
        ali-to-post ark:- ark:- \| \
        weight-silence-post $silence_weight $silphonelist $ali_or_decode_dir/${mdl}.mdl ark:- ark:- \| \
        post-to-weights ark:- "ark:|gzip -c >$dir/weights.JOB.gz" || exit 1;

      # put all the weights in one archive.
      for j in $(seq $nj_orig); do gunzip -c $dir/weights.$j.gz; done | gzip -c >$dir/weights.gz || exit 1;
      rm $dir/weights.*.gz || exit 1;
    fi

  elif [ -f $ali_or_decode_dir/lat.1.gz ]; then
    nj_orig=$(cat $ali_or_decode_dir/num_jobs) || exit 1;
    # For a decode dir the model is expected one level up.
    if [ ! -f $ali_or_decode_dir/../${mdl}.mdl ]; then
      echo "$0: expected $ali_or_decode_dir/../${mdl}.mdl to exist."
      exit 1;
    fi


    if [ $stage -le 0 ]; then
      rm $dir/weights.*.gz 2>/dev/null || true

      # The best path through each lattice supplies the alignment.
      $cmd JOB=1:$nj_orig  $dir/log/lat_to_post.JOB.log \
        lattice-best-path --acoustic-scale=$acwt "ark:gunzip -c $ali_or_decode_dir/lat.JOB.gz|" ark:/dev/null ark:- \| \
        ali-to-post ark:- ark:- \| \
        weight-silence-post $silence_weight $silphonelist $ali_or_decode_dir/../${mdl}.mdl ark:- ark:- \| \
        post-to-weights ark:- "ark:|gzip -c >$dir/weights.JOB.gz" || exit 1;

      # put all the weights in one archive.
      for j in $(seq $nj_orig); do gunzip -c $dir/weights.$j.gz; done | gzip -c >$dir/weights.gz || exit 1;
      rm $dir/weights.*.gz || exit 1;
    fi

  elif [ -f $ali_or_decode_dir ] && gunzip -c $ali_or_decode_dir >/dev/null; then
    # A regular file that gunzip can read is taken to be the weights archive.
    cp $ali_or_decode_dir $dir/weights.gz || exit 1;

  else
    echo "$0: expected ali.1.gz or lat.1.gz to exist in $ali_or_decode_dir";
    exit 1;
  fi
fi

# Split the data directory into $nj parts for the parallel jobs.
sdata=$data/split$nj;
utils/split_data.sh $data $nj || exit 1;

# The same feature pipeline is used for the UBM posteriors (gmm_feats) and
# for the i-vector extractor (feats).
gmm_feats="ark,s,cs:copy-feats scp:$sdata/JOB/feats.scp ark:- |"
feats="$gmm_feats"

# (here originally was the sub-speaker hack),
this_sdata=$sdata

# Per-speaker i-vectors,
# (ivector-extract is given --spk2utt of the job's data split; when weights
# were prepared, the posteriors are first passed through weight-post)
if [ $stage -le 2 ]; then
  if [ ! -z "$ali_or_decode_dir" ]; then
    $cmd JOB=1:$nj $dir/log/extract_ivectors.JOB.log \
      gmm-global-get-post --n=$num_gselect --min-post=$min_post $srcdir/final.dubm "$gmm_feats" ark:- \| \
      weight-post ark:- "ark,s,cs:gunzip -c $dir/weights.gz|" ark:- \| \
      ivector-extract --acoustic-weight=$posterior_scale --compute-objf-change=true \
        --max-count=$max_count --spk2utt=ark:$this_sdata/JOB/spk2utt \
      $srcdir/final.ie "$feats" ark,s,cs:- ark:$dir/ivectors_spk.JOB.ark
  else
    $cmd JOB=1:$nj $dir/log/extract_ivectors.JOB.log \
      gmm-global-get-post --n=$num_gselect --min-post=$min_post $srcdir/final.dubm "$gmm_feats" ark:- \| \
      ivector-extract --acoustic-weight=$posterior_scale --compute-objf-change=true \
        --max-count=$max_count --spk2utt=ark:$this_sdata/JOB/spk2utt \
      $srcdir/final.ie "$feats" ark,s,cs:- ark:$dir/ivectors_spk.JOB.ark
  fi
fi

# Per-utterance i-vectors,
# (same commands as above but without --spk2utt; note that the log file name
# and the output archive differ from the per-speaker step)
if [ $stage -le 3 ]; then
  if [ ! -z "$ali_or_decode_dir" ]; then
    $cmd JOB=1:$nj $dir/log/extract_ivectors_utt.JOB.log \
      gmm-global-get-post --n=$num_gselect --min-post=$min_post $srcdir/final.dubm "$gmm_feats" ark:- \| \
      weight-post ark:- "ark,s,cs:gunzip -c $dir/weights.gz|" ark:- \| \
      ivector-extract --acoustic-weight=$posterior_scale --compute-objf-change=true --max-count=$max_count \
      $srcdir/final.ie "$feats" ark,s,cs:- ark:$dir/ivectors_utt.JOB.ark
  else
    $cmd JOB=1:$nj $dir/log/extract_ivectors_utt.JOB.log \
      gmm-global-get-post --n=$num_gselect --min-post=$min_post $srcdir/final.dubm "$gmm_feats" ark:- \| \
      ivector-extract --acoustic-weight=$posterior_scale --compute-objf-change=true --max-count=$max_count \
      $srcdir/final.ie "$feats" ark,s,cs:- ark:$dir/ivectors_utt.JOB.ark
  fi
fi

# The merged archives are written under the absolute path of $dir, so the
# .scp files (written under $dir) refer to them by absolute location.
absdir=$(utils/make_absolute.sh $dir)
if [ $stage -le 4 ]; then
  echo "$0: merging iVectors across jobs"
  # Concatenate the per-job archives into one ark/scp pair each, then delete
  # the per-job files.
  copy-vector "ark:cat $dir/ivectors_spk.*.ark |" ark,scp:$absdir/ivectors_spk.ark,$dir/ivectors_spk.scp
  rm $dir/ivectors_spk.*.ark
  copy-vector "ark:cat $dir/ivectors_utt.*.ark |" ark,scp:$absdir/ivectors_utt.ark,$dir/ivectors_utt.scp
  rm $dir/ivectors_utt.*.ark
fi

# duplicate the `speaker' i-vector to all `utterances' of that speaker,
if [ $stage -le 5 ]; then
  # filter utt2spk (remove speakers with no iVector),
  # (the awk BEGIN block loads the speaker ids from ivectors_spk.scp; only
  # utt2spk lines whose 2nd field is one of them are kept)
  awk -v ivec_spk=$dir/ivectors_spk.scp \
    'BEGIN{ while(getline < ivec_spk) { spk_has_ivec[$1] = 1; }} { spk=$2; if(spk_has_ivec[spk]) { print $0 }}' \
    $data/utt2spk >$dir/utt2spk.filt
  # expand the list of i-vectors,
  # (field 2, the speaker id, is mapped through ivectors_spk.scp, giving an
  # scp indexed by utterance)
  utils/apply_map.pl -f 2 $dir/ivectors_spk.scp <$dir/utt2spk.filt >$dir/ivectors_spk-as-utt.scp
fi

echo "$0: done extracting iVectors (per-speaker, per-sentence) into '$dir'"


================================================
FILE: egs/steps/nnet/ivector/train_diag_ubm.sh
================================================
#!/usr/bin/env bash

# Copyright   2012  Johns Hopkins University (Author: Daniel Povey)
#             2013  Daniel Povey
#             2016  Brno University of Technology (Author: Karel Vesely)
# Apache 2.0.

# This script trains a diagonal UBM that we'll use in online iVector estimation,
# where the online-estimated iVector will be used as a secondary input to a deep
# neural net for single-pass DNN-based decoding.

# This script was modified from ../../sre08/v1/sid/train_diag_ubm.sh.
# It trains a diagonal UBM on top of input features. We use the original features,
# assuming they are already normalized (or transformed).

# This script does not use the trained model from the source directory to
# initialize the diagonal GMM; instead, we initialize the GMM using
# gmm-global-init-from-feats, which sets the means to random data points and
# then does some iterations of E-M in memory.  After the in-memory
# initialization we train for a few iterations in parallel.
# Note that there is a slight mismatch in that the source LDA+MLLT matrix
# (final.mat) will have been estimated using standard CMVN, and we're using
# online CMVN.  We don't think this will have much effect.


# Begin configuration section.
# NOTE(review): these defaults are presumably overridable from the command
# line as --name value via parse_options.sh (sourced below) -- confirm.
nj=4          # number of parallel jobs (JOB=1:$nj)
cmd=run.pl    # job launcher used to run the commands
num_iters=4   # number of parallel E-M training passes after initialization
stage=-2      # first stage to run (-2: in-memory init, -1: gselect, 0..: training pass)
num_gselect=30 # Number of Gaussian-selection indices to use while training
               # the model.
num_frames=500000 # number of frames to keep in memory for initialization
num_iters_init=20 # number of E-M iterations of the in-memory initialization
initial_gauss_proportion=0.5 # Start with half the target number of Gaussians
subsample=2 # subsample all features with this periodicity, in the main E-M phase.
cleanup=true # not referenced anywhere else in this script
min_gaussian_weight=0.0001 # passed as --min-gaussian-weight to the GMM tools
remove_low_count_gaussians=true # set this to false if you need #gauss to stay fixed.
num_threads=8 # threads used (and requested from $cmd) for the initialization
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

# Exactly three positional arguments are required; otherwise print usage.
# The defaults shown for --num-iters, --subsample and --num-threads match the
# values set in the configuration section at the top of this script.
if [ $# != 3 ]; then
  echo "Usage: $0  <data> <num-gauss> <output-dir>"
  echo " e.g.: $0 data/train 1024 exp/diag_ubm"
  echo "Options: "
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --nj <num-jobs|4>                                # number of parallel jobs to run."
  echo "  --num-iters <niter|4>                            # number of iterations of parallel "
  echo "                                                   # training (default: $num_iters)"
  echo "  --stage <stage|-2>                               # stage to do partial re-run from."
  echo "  --num-gselect <n|30>                             # Number of Gaussians per frame to"
  echo "                                                   # limit computation to, for speed"
  echo " --subsample <n|2>                                 # In main E-M phase, use every n"
  echo "                                                   # frames (a speedup)"
  echo "  --num-frames <n|500000>                          # Maximum num-frames to keep in memory"
  echo "                                                   # for model initialization"
  echo "  --num-iters-init <n|20>                          # Number of E-M iterations for model"
  echo "                                                   # initialization"
  echo " --initial-gauss-proportion <proportion|0.5>       # Proportion of Gaussians to start with"
  echo "                                                   # in initialization phase (then split)"
  echo " --num-threads <n|8>                               # number of threads to use in initialization"
  echo "                                                   # phase (must match with parallel-opts option)"
  echo " --min-gaussian-weight <weight|0.0001>             # min Gaussian weight allowed in GMM"
  echo "                                                   # initialization (this relatively high"
  echo "                                                   # value keeps counts fairly even)"
  exit 1;
fi

# From here on: abort on command failure, on use of an unset variable, and on
# failure of any stage of a pipeline.
set -euo pipefail

# Positional arguments: data dir, target number of Gaussians, output dir.
data=$1
num_gauss=$2
dir=$3

! [ $num_gauss -gt 0 ] && echo "Bad num-gauss $num_gauss" && exit 1;

# Validate the input before creating any directory or splitting the data, so
# that a bad data dir fails fast with this script's own message.
for f in $data/feats.scp; do
   [ ! -f "$f" ] && echo "$0: expecting file $f to exist" && exit 1
done

sdata=$data/split$nj
mkdir -p $dir/log
utils/split_data.sh $data $nj || exit 1;

# Note: there is no point subsampling all_feats, because gmm-global-init-from-feats
# effectively does subsampling itself (it keeps a random subset of the features).
all_feats="ark,s,cs:copy-feats scp:$data/feats.scp ark:- |"
# Per-job features for the parallel passes, keeping every $subsample-th frame.
feats="ark,s,cs:copy-feats scp:$sdata/JOB/feats.scp ark:- | subsample-feats --n=$subsample ark:- ark:- |"

# Number of Gaussians the initialization starts from (rounded down).
num_gauss_init=$(perl -e "print int($initial_gauss_proportion * $num_gauss); ");
! [ $num_gauss_init -gt 0 ] && echo "Invalid num-gauss-init $num_gauss_init" && exit 1;

# Stage -2: initialize the model (0.dubm) from the whole, unsplit feature set
# in a single multi-threaded job.
if [ $stage -le -2 ]; then
  echo "$0: initializing model from E-M in memory, "
  echo "$0: starting from $num_gauss_init Gaussians, reaching $num_gauss;"
  echo "$0: for $num_iters_init iterations, using at most $num_frames frames of data"

  $cmd --num-threads $num_threads $dir/log/gmm_init.log \
    gmm-global-init-from-feats --num-threads=$num_threads --num-frames=$num_frames \
     --min-gaussian-weight=$min_gaussian_weight \
     --num-gauss=$num_gauss --num-gauss-init=$num_gauss_init --num-iters=$num_iters_init \
    "$all_feats" $dir/0.dubm
fi

# Store Gaussian selection indices on disk-- this speeds up the training passes.
# (computed once with the initial model 0.dubm, one gzipped archive per job)
if [ $stage -le -1 ]; then
  echo "Getting Gaussian-selection info"
  $cmd JOB=1:$nj $dir/log/gselect.JOB.log \
    gmm-gselect --n=$num_gselect $dir/0.dubm "$feats" \
      "ark:|gzip -c >$dir/gselect.JOB.gz"
fi

echo "$0: will train for $num_iters iterations, in parallel over"
echo "$0: $nj machines, parallelized with '$cmd'"

# Passes are numbered 0 .. num_iters-1; pass x turns $x.dubm into $((x+1)).dubm.
last_pass=$((num_iters - 1))
for x in $(seq 0 $last_pass); do
  echo "$0: Training pass $x"
  if [ $stage -le $x ]; then
    next_pass=$((x + 1))

    # Low-count Gaussians may only be removed in the final pass, otherwise the
    # stored gselect info would not be valid any more.
    if [ $x -ge $last_pass ]; then
      opt="--remove-low-count-gaussians=$remove_low_count_gaussians"
    else
      opt="--remove-low-count-gaussians=false"
    fi

    # Accumulate stats (one accumulator file per job).
    $cmd JOB=1:$nj $dir/log/acc.${x}.JOB.log \
      gmm-global-acc-stats "--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" \
      $dir/$x.dubm "$feats" $dir/$x.JOB.acc

    # Sum the accumulators and re-estimate the model.
    $cmd $dir/log/update.${x}.log \
      gmm-global-est $opt --min-gaussian-weight=$min_gaussian_weight $dir/${x}.dubm "gmm-global-sum-accs - $dir/${x}.*.acc|" \
      $dir/${next_pass}.dubm

    # The accumulators and the previous model are no longer needed.
    rm $dir/$x.*.acc $dir/$x.dubm
  fi
done

# Drop the Gaussian-selection files and publish the last model as final.dubm.
rm $dir/gselect.*.gz
mv $dir/$num_iters.dubm $dir/final.dubm

exit 0 # Done!


================================================
FILE: egs/steps/nnet/ivector/train_ivector_extractor.sh
================================================
#!/usr/bin/env bash

# Copyright   2013  Daniel Povey
#             2016  Brno University of Technology (Author: Karel Vesely)
# Apache 2.0.

# This script is modified from ^/egs/sre08/v1/sid/train_ivector_extractor.sh.
# It trains an iVector extractor for use in DNN training.

# This script trains the i-vector extractor.  Note: there are 3 separate levels
# of parallelization: num_threads, num_processes, and num_jobs.  This may seem a
# bit excessive.  It has to do with minimizing memory usage and disk I/O,
# subject to various constraints.  The "num_threads" is how many threads a
# program uses; the "num_processes" is the number of separate processes a single
# job spawns, and then sums the accumulators in memory.  Our recommendation:
#  - Set num_threads to the minimum of (4, or how many virtual cores your machine has).
#    (because of needing to lock various global quantities, the program can't
#    use many more than 4 threads with good CPU utilization).
#  - Set num_processes to the number of virtual cores on each machine you have, divided by
#    num_threads.  E.g. 4, if you have 16 virtual cores.   If you're on a shared queue
#    that's busy with other people's jobs, it may be wise to set it to rather less
#    than this maximum though, or your jobs won't get scheduled.  And if memory is
#    tight you need to be careful; in our normal setup, each process uses about 5G.
#  - Set num_jobs to as many of the jobs (each using $num_threads * $num_processes CPUs)
#    your queue will let you run at one time, but don't go much more than 10 or 20, or
#    summing the accumulators will possibly get slow.  If you have a lot of data, you
#    may want more jobs, though.

# Begin configuration section.
nj=10   # this is the number of separate queue jobs we run, but each one
        # contains num_processes sub-jobs.. the real number of threads we
        # run is nj * num_processes * num_threads, and the number of
        # separate pieces of data is nj * num_processes.
num_threads=4
num_processes=2 # each job runs this many processes, each with --num-threads threads
cmd="run.pl"    # job launcher; every step below is started through it.
stage=-4  # A step runs only if $stage is <= its step number:
          #   -2 = extractor initialization, -1 = posterior computation,
          #   0 .. num_iters-1 = the individual training passes.
ivector_dim=100 # dimension of the extracted i-vector
num_iters=10    # number of accumulate/update passes of the training loop
num_gselect=5 # Gaussian-selection using diagonal model: number of Gaussians to select
posterior_scale=0.1 # Scale on the acoustic posteriors, intended to account for
                    # inter-frame correlations.
min_post=0.025 # Minimum posterior to use (posteriors below this are pruned out)
               # caution: you should use the same value in the online-estimation
               # code.
subsample=2  # This speeds up the training: training on every 2nd feature
             # (configurable) Since the features are highly correlated across
             # frames, we don't expect to lose too much from this.
parallel_opts=  # ignored now.
cleanup=true    # if true, the summed accumulator of a pass is deleted once the
                # model of that pass has been updated.
# End configuration section.

echo "$0 $@"  # Print the command line for logging

# Optional environment setup, followed by command-line option parsing.
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;


# Exactly three positional arguments are required; otherwise print the usage
# text and stop.  The defaults quoted in the text mirror the values set in the
# configuration section of this script.
if [ $# != 3 ]; then
  echo "Usage: $0 <data> <diagonal-ubm-dir> <extractor-dir>"
  echo " e.g.: $0 data/train exp/nnet2_online/diag_ubm/ exp/nnet2_online/extractor"
  echo "main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --num-iters <#iters|10>                          # Number of iterations of E-M"
  echo "  --nj <n|10>                                      # Number of jobs (also see num-processes and num-threads)"
  echo "  --num-processes <n|2>                            # Number of processes for each queue job (relates"
  echo "                                                   # to summing accs in memory)"
  echo "  --num-threads <n|4>                              # Number of threads for each process (can't be usefully"
  echo "                                                   # increased much above 4)"
  echo "  --stage <stage|-4>                               # To control partial reruns"
  echo "  --num-gselect <n|5>                              # Number of Gaussians to select using"
  echo "                                                   # diagonal model."
  exit 1;
fi

# Stop on command errors, unset variables and failing pipeline stages;
# -x additionally traces every command that is executed.
set -euxo pipefail

data=$1    # data directory, must contain feats.scp
srcdir=$2  # diagonal-UBM directory, must contain final.dubm
dir=$3     # output directory of this script

for f in $srcdir/final.dubm $data/feats.scp; do
  [ ! -f $f ] && echo "No such file $f" && exit 1;
done

# Set various variables.
mkdir -p $dir/log
nj_full=$[$nj*$num_processes]  # total number of data pieces (one per process)
sdata=$data/split$nj_full;
utils/split_data.sh $data $nj_full

# Keep a copy of the diagonal UBM next to the extractor.
cp $srcdir/final.dubm $dir

## Set up features.
# 'JOB' is a placeholder that is replaced by the piece number later on.
gmm_feats="ark,s,cs:copy-feats scp:$sdata/JOB/feats.scp ark:- | subsample-feats --n=$subsample ark:- ark:- |"
feats="$gmm_feats"

# Initialize the i-vector extractor using the input GMM, which is converted to
# full because that's what the i-vector extractor expects.  Note: we have to do
# --use-weights=false to disable regression of the log weights on the ivector,
# because that would make the online estimation of the ivector difficult (since
# the online/real-time ivector estimation is the whole point of this script).
if [ $stage -le -2 ]; then
  $cmd $dir/log/init.log \
    ivector-extractor-init --ivector-dim=$ivector_dim --use-weights=false \
     "gmm-global-to-fgmm $dir/final.dubm -|" $dir/0.ie
fi

# Do Gaussian selection and posterior extraction

# if we subsample frame, modify the posterior-scale; this is likely
# to make the original posterior-scale (before subsampling) suitable.
modified_posterior_scale=$(perl -e "print $posterior_scale * $subsample;");

if [ $stage -le -1 ]; then
  # Remember how many pieces the posteriors were computed for, so that a later
  # partial rerun can verify it uses the same split.
  echo $nj_full > $dir/num_jobs
  echo "$0: doing Gaussian selection and posterior computation"
  $cmd JOB=1:$nj_full $dir/log/post.JOB.log \
    gmm-global-get-post --n=$num_gselect --min-post=$min_post $dir/final.dubm "$gmm_feats" ark:- \| \
    scale-post ark:- $modified_posterior_scale "ark:|gzip -c >$dir/post.JOB.gz"
else
  # make sure we at least have the right number of post.*.gz files.
  if ! [ $nj_full -eq $(cat $dir/num_jobs) ]; then
    echo "Num-jobs mismatch $nj_full versus $(cat $dir/num_jobs)"
    exit 1
  fi
fi

# Training loop: pass $x reads $dir/$x.ie and writes $dir/$[$x+1].ie.
# Passes with a number below $stage are skipped (partial rerun).
x=0
while [ $x -lt $num_iters ]; do
  if [ $stage -le $x ]; then
    # Remove a stale error marker of an earlier run, if there is one.
    rm $dir/.error 2>/dev/null || true

    Args=() # bash array of training commands for 1:nj, that put accs to stdout.
    # Entry $j is the accumulation command for data piece $j: the JOB
    # placeholder in the feature and posterior specifiers is replaced by $j.
    for j in $(seq $nj_full); do
      Args[$j]=`echo "ivector-extractor-acc-stats --num-threads=$num_threads $dir/$x.ie '$feats' 'ark,s,cs:gunzip -c $dir/post.JOB.gz|' -|" | sed s/JOB/$j/g`
    done

    echo "Accumulating stats (pass $x)"
    # Queue job $g gets the $num_processes consecutive commands starting at
    # index $start; all queue jobs are launched in the background.
    for g in $(seq $nj); do
      start=$[$num_processes*($g-1)+1]
      $cmd --num-threads $[$num_threads*$num_processes] $dir/log/acc.$x.$g.log \
        ivector-extractor-sum-accs --parallel=true "${Args[@]:$start:$num_processes}" \
          $dir/acc.$x.$g || touch $dir/.error &
    done
    wait
    # A failed background job leaves the .error marker behind.
    [ -f $dir/.error ] && echo "Error accumulating stats on iteration $x" && exit 1;

    # Collect the per-job accumulator files into one argument list.
    accs=""
    for j in $(seq $nj); do
      accs+="$dir/acc.$x.$j "
    done
    echo "Summing accs (pass $x)"
    $cmd $dir/log/sum_acc.$x.log \
      ivector-extractor-sum-accs $accs $dir/acc.$x

    echo "Updating model (pass $x)"
    nt=$[$num_threads*$num_processes] # use the same number of threads that
                                      # each accumulation process uses, since we
                                      # can be sure the queue will support this many.
                                      #
                                      # The parallel-opts was either specified by
                                      # the user or we computed it correctly in
                                      # the previous stages
    $cmd --num-threads $[$num_threads*$num_processes] $dir/log/update.$x.log \
      ivector-extractor-est --num-threads=$nt $dir/$x.ie $dir/acc.$x $dir/$[$x+1].ie
    # The per-job accumulators are no longer needed once they have been summed.
    rm $dir/acc.$x.*

    if $cleanup; then
      rm $dir/acc.$x
      # rm $dir/$x.ie
    fi
  fi
  x=$[$x+1]
done

# After the loop $x equals $num_iters; point final.ie at the last model
# (replacing a link left over from an earlier run).
rm $dir/final.ie 2>/dev/null || true
ln -s $x.ie $dir/final.ie


================================================
FILE: egs/steps/nnet/make_bn_feats.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2015 Brno University of Technology (author: Karel Vesely)
# Apache 2.0
# To be run from .. (one directory up from here)
# see ../run.sh for example

# Begin configuration section.
nj=4                 # number of parallel jobs
cmd=run.pl           # job launcher used for the forward pass
remove_last_components=4 # remove N last components from the nnet
nnet_forward_opts=   # extra options passed verbatim to nnet-forward
use_gpu=no           # passed to nnet-forward as --use-gpu
htk_save=false       # 'false': write ark/scp features; anything else: write HTK files
ivector=            # rx-specifier with i-vectors (ark-with-vectors),
# End configuration section.

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# Stop on command errors, unset variables and failing pipeline stages.
set -euo pipefail

if [ $# != 5 ]; then
   echo "usage: $0 [options] <tgt-data-dir> <src-data-dir> <nnet-dir> <log-dir> <abs-path-to-bn-feat-dir>";
   echo "options: "
   echo "  --cmd 'queue.pl <queue opts>'   # how to run jobs."
   echo "  --nj <nj>                       # number of parallel jobs"
   echo "  --remove-last-components <N>    # number of NNet Components to remove from the end"
   echo "  --use-gpu (no|yes|optional)     # forwarding on GPU"
   exit 1;
fi

# NOTE(review): path.sh has already been sourced above, before option parsing.
if [ -f path.sh ]; then . ./path.sh; fi

data=$1      # target data dir (receives the new feats.scp)
srcdata=$2   # source data dir with the input features
nndir=$3     # dir with final.nnet and final.feature_transform
logdir=$4    # dir for the job logs
bnfeadir=$5  # dir where the feature archives are stored

######## CONFIGURATION

# copy the dataset metadata from srcdata.
# (feats.scp and cmvn.scp are removed again: feats.scp is re-created below
# from the network outputs.)
mkdir -p $data $logdir $bnfeadir || exit 1;
utils/copy_data_dir.sh $srcdata $data; rm -f $data/{feats,cmvn}.scp 2>/dev/null

# make $bnfeadir an absolute pathname.
[ '/' != ${bnfeadir:0:1} ] && bnfeadir=$PWD/$bnfeadir

required="$srcdata/feats.scp $nndir/final.nnet $nndir/final.feature_transform"
for f in $required; do
  [ ! -f $f ] && echo "$0: Missing $f" && exit 1;
done

name=$(basename $srcdata)
sdata=$srcdata/split$nj
# Re-split the source data unless an up-to-date split directory exists.
[[ -d $sdata && $srcdata/feats.scp -ot $sdata ]] || split_data.sh $srcdata $nj || exit 1;

# Concat feature transform with trimmed MLP:
nnet=$bnfeadir/feature_extractor.nnet
nnet-concat $nndir/final.feature_transform "nnet-copy --remove-last-components=$remove_last_components $nndir/final.nnet - |" $nnet 2>$logdir/feature_extractor.log || exit 1
nnet-info $nnet >$data/feature_extractor.nnet-info

echo "Creating bn-feats into $data"

# PREPARE FEATURE EXTRACTION PIPELINE
# import config,
# (each option string stays empty unless the nnet dir provides the file)
online_cmvn_opts=
cmvn_opts=
delta_opts=
D=$nndir
[ -e $D/online_cmvn_opts ] && online_cmvn_opts=$(cat $D/online_cmvn_opts)
[ -e $D/cmvn_opts ] && cmvn_opts=$(cat $D/cmvn_opts)
[ -e $D/delta_opts ] && delta_opts=$(cat $D/delta_opts)
#
# Create the feature stream,
# ('JOB' is substituted with the job number when the pipeline is run)
feats="ark,s,cs:copy-feats scp:$sdata/JOB/feats.scp ark:- |"
# apply-cmvn-online (optional),
[ -n "$online_cmvn_opts" -a ! -f $nndir/global_cmvn_stats.mat ] && echo "$0: Missing $nndir/global_cmvn_stats.mat" && exit 1
[ -n "$online_cmvn_opts" ] && feats="$feats apply-cmvn-online $online_cmvn_opts --spk2utt=ark:$srcdata/spk2utt $nndir/global_cmvn_stats.mat ark:- ark:- |"
# apply-cmvn (optional),
# NOTE(review): the existence check looks at the split dir, while the pipeline
# itself reads the unsplit $srcdata/cmvn.scp and $srcdata/utt2spk.
[ -n "$cmvn_opts" -a ! -f $sdata/1/cmvn.scp ] && echo "$0: Missing $sdata/1/cmvn.scp" && exit 1
[ -n "$cmvn_opts" ] && feats="$feats apply-cmvn $cmvn_opts --utt2spk=ark:$srcdata/utt2spk scp:$srcdata/cmvn.scp ark:- ark:- |"
# add-deltas (optional),
[ -n "$delta_opts" ] && feats="$feats add-deltas $delta_opts ark:- ark:- |"

# add-ivector (optional),
# The presence of $D/ivector_dim marks a network that was trained with
# i-vectors appended to the features; --ivector is then mandatory.
if [ -e $D/ivector_dim ]; then
  # "$ivector" is quoted: an rx-specifier normally contains spaces, which
  # would otherwise make the test fail with 'too many arguments'.
  [ -z "$ivector" ] && echo "Missing --ivector, they were used in training!" && exit 1
  # Get the tool,
  ivector_append_tool=append-vector-to-feats # default,
  [ -e $D/ivector_append_tool ] && ivector_append_tool=$(cat $D/ivector_append_tool)
  # Check dims,
  # (the dimension is measured on job 1 with and without the appended
  # i-vector; the difference must match the dimension used in training)
  feats_job_1=$(sed 's:JOB:1:g' <(echo $feats))
  dim_raw=$(feat-to-dim "$feats_job_1" -)
  dim_raw_and_ivec=$(feat-to-dim "$feats_job_1 $ivector_append_tool ark:- '$ivector' ark:- |" -)
  dim_ivec=$((dim_raw_and_ivec - dim_raw))
  [ $dim_ivec != "$(cat $D/ivector_dim)" ] && \
    echo "Error, i-vector dim. mismatch (expected $(cat $D/ivector_dim), got $dim_ivec in '$ivector')" && \
    exit 1
  # Append to feats,
  feats="$feats $ivector_append_tool ark:- '$ivector' ark:- |"
fi

if [ $htk_save == false ]; then
  # Run the forward pass,
  # (each job writes its own archive plus the matching scp index)
  $cmd JOB=1:$nj $logdir/make_bnfeats.JOB.log \
    nnet-forward $nnet_forward_opts --use-gpu=$use_gpu $nnet "$feats" \
    ark,scp:$bnfeadir/raw_bnfea_$name.JOB.ark,$bnfeadir/raw_bnfea_$name.JOB.scp \
    || exit 1;
  # concatenate the .scp files
  for ((n=1; n<=nj; n++)); do
    cat $bnfeadir/raw_bnfea_$name.$n.scp >> $data/feats.scp
  done

  # check sentence counts,
  N0=$(cat $srcdata/feats.scp | wc -l)
  N1=$(cat $data/feats.scp | wc -l)
  [[ "$N0" != "$N1" ]] && echo "$0: sentence-count mismatch, $srcdata $N0, $data $N1" && exit 1
  echo "Succeeded creating MLP-BN features '$data'"

else # htk_save == true
  # Run the forward pass saving HTK features,
  $cmd JOB=1:$nj $logdir/make_bnfeats_htk.JOB.log \
    mkdir -p $data/htkfeats/JOB \; \
    nnet-forward $nnet_forward_opts --use-gpu=$use_gpu $nnet "$feats" ark:- \| \
    copy-feats-to-htk --output-dir=$data/htkfeats/JOB ark:- || exit 1
  # Make list of htk features,
  # (the pattern is quoted so that the shell cannot expand it against files
  # in the current directory before 'find' sees it)
  find $data/htkfeats -name '*.fea' >$data/htkfeats.scp

  # Check sentence counts,
  # (count the entries of the list; running 'find' on the list file itself
  # would always report exactly one path)
  N0=$(cat $srcdata/feats.scp | wc -l)
  N1=$(cat $data/htkfeats.scp | wc -l)
  [[ "$N0" != "$N1" ]] && echo "$0: sentence-count mismatch, $srcdata $N0, $data/htk* $N1" && exit 1
  echo "Succeeded creating MLP-BN features '$data/htkfeats.scp'"
fi


================================================
FILE: egs/steps/nnet/make_denlats.sh
================================================
#!/usr/bin/env bash
# Copyright 2012-2013 Brno University of Technology (author: Karel Vesely), Daniel Povey
# Apache 2.0.

# Create denominator lattices for MMI/MPE/sMBR training.
# Creates its output in $dir/lat.*.ark,$dir/lat.scp
# The lattices are uncompressed, we need random access for DNN training.

# Begin configuration section.
nj=4                # number of parallel jobs
cmd=run.pl          # job launcher used for decoding
sub_split=1         # if not 1, every job's data is split further into this many pieces
beam=13.0           # passed to latgen-faster-mapped as --beam
lattice_beam=7.0    # passed to latgen-faster-mapped as --lattice-beam
acwt=0.1            # passed to latgen-faster-mapped as --acoustic-scale
max_active=5000     # passed to latgen-faster-mapped as --max-active
nnet=               # network to use; defaults to <src-dir>/final.nnet when empty
nnet_forward_opts="--no-softmax=true --prior-scale=1.0"
max_mem=20000000 # This will stop the processes getting too large.
# This is in bytes, but not "real" bytes-- you have to multiply
# by something like 5 or 10 to get real bytes (not sure why so large)
# End configuration section.
use_gpu=no # yes|no|optional
parallel_opts="--num-threads 2"  # extra options handed to $cmd for the decoding jobs
ivector=         # rx-specifier with i-vectors (ark-with-vectors),

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

# Stop on command errors, unset variables and failing pipeline stages.
set -euo pipefail

if [ $# != 4 ]; then
   echo "Usage: steps/$0 [options] <data-dir> <lang-dir> <src-dir> <exp-dir>"
   echo "  e.g.: steps/$0 data/train data/lang exp/tri1 exp/tri1_denlats"
   echo "Works for plain features (or CMN, delta), forwarded through feature-transform."
   echo ""
   echo "Main options (for others, see top of script file)"
   echo "  --config <config-file>                           # config containing options"
   echo "  --nj <nj>                                        # number of parallel jobs"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   echo "  --sub-split <n-split>                            # e.g. 40; use this for "
   echo "                           # large databases so your jobs will be smaller and"
   echo "                           # will (individually) finish reasonably soon."
   exit 1;
fi

data=$1    # data dir with the features and the transcripts ('text')
lang=$2    # lang dir (words.txt, oov.int, phones.txt, ...)
srcdir=$3  # dir with the trained model and network
dir=$4     # output dir for the graph and the lattices

sdata=$data/split$nj
mkdir -p $dir/log
# Re-split the data unless an up-to-date split directory exists.
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs

# Integer id that out-of-vocabulary words are mapped to below.
oov=`cat $lang/oov.int` || exit 1;

mkdir -p $dir

utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt

# Work on a private copy of the lang dir, because its G.fst is replaced below.
cp -r $lang $dir/

# Compute grammar FST which corresponds to unigram decoding graph.
new_lang="$dir/"$(basename "$lang")
echo "Making unigram grammar FST in $new_lang"
# The transcripts are mapped to word ids (field 1, the utterance id, is
# dropped by awk) and turned into a unigram grammar.
cat $data/text | utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt | \
  awk '{for(n=2;n<=NF;n++){ printf("%s ", $n); } printf("\n"); }' | \
  utils/make_unigram_grammar.pl | fstcompile | fstarcsort --sort_type=ilabel > $new_lang/G.fst \
   || exit 1;

# mkgraph.sh expects a whole directory "lang", so put everything in one directory...
# it gets L_disambig.fst and G.fst (among other things) from $dir/lang, and
# final.mdl from $srcdir; the output HCLG.fst goes in $dir/graph.

echo "Compiling decoding graph in $dir/dengraph"
# The graph is reused when it is non-empty and newer than the model.
if [ -s $dir/dengraph/HCLG.fst ] && [ $dir/dengraph/HCLG.fst -nt $srcdir/final.mdl ]; then
   echo "Graph $dir/dengraph/HCLG.fst already exists: skipping graph creation."
else
  utils/mkgraph.sh $new_lang $srcdir $dir/dengraph || exit 1;
fi


cp $srcdir/{tree,final.mdl} $dir

# Select default locations to model files
[ -z "$nnet" ] && nnet=$srcdir/final.nnet;
class_frame_counts=$srcdir/ali_train_pdf.counts
feature_transform=$srcdir/final.feature_transform
model=$dir/final.mdl

# Check that files exist
for f in $sdata/1/feats.scp $nnet $model $feature_transform $class_frame_counts; do
  [ ! -f $f ] && echo "$0: missing file $f" && exit 1;
done


# PREPARE FEATURE EXTRACTION PIPELINE
# import config,
# (each option string stays empty unless $srcdir provides the matching file;
# the newer *_opts files take precedence over the backward-compatible ones)
cmvn_opts=
delta_opts=
D=$srcdir
[ -e $D/norm_vars ] && cmvn_opts="--norm-means=true --norm-vars=$(cat $D/norm_vars)" # Bwd-compatibility,
[ -e $D/cmvn_opts ] && cmvn_opts=$(cat $D/cmvn_opts)
[ -e $D/delta_order ] && delta_opts="--delta-order=$(cat $D/delta_order)" # Bwd-compatibility,
[ -e $D/delta_opts ] && delta_opts=$(cat $D/delta_opts)
#
# Create the feature stream,
# ('JOB' is substituted with the job number when the pipeline is run)
feats="ark,s,cs:copy-feats scp:$sdata/JOB/feats.scp ark:- |"
# apply-cmvn (optional),
[ ! -z "$cmvn_opts" -a ! -f $sdata/1/cmvn.scp ] && echo "$0: Missing $sdata/1/cmvn.scp" && exit 1
[ ! -z "$cmvn_opts" ] && feats="$feats apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp ark:- ark:- |"
# add-deltas (optional),
[ ! -z "$delta_opts" ] && feats="$feats add-deltas $delta_opts ark:- ark:- |"
# add-pytel transform (optional),
[ -e $D/pytel_transform.py ] && feats="$feats /bin/env python $D/pytel_transform.py |"

# add-ivector (optional),
# The presence of $D/ivector_dim marks a network that was trained with
# i-vectors appended to the features; --ivector is then mandatory.
if [ -e $D/ivector_dim ]; then
  # "$ivector" is quoted: an rx-specifier normally contains spaces, which
  # would otherwise make the test fail with 'too many arguments'.
  [ -z "$ivector" ] && echo "Missing --ivector, they were used in training!" && exit 1
  # Get the tool,
  ivector_append_tool=append-vector-to-feats # default,
  [ -e $D/ivector_append_tool ] && ivector_append_tool=$(cat $D/ivector_append_tool)
  # Check dims,
  # (the dimension is measured on job 1 with and without the appended
  # i-vector; the difference must match the dimension used in training)
  feats_job_1=$(sed 's:JOB:1:g' <(echo $feats))
  dim_raw=$(feat-to-dim "$feats_job_1" -)
  dim_raw_and_ivec=$(feat-to-dim "$feats_job_1 $ivector_append_tool ark:- '$ivector' ark:- |" -)
  dim_ivec=$((dim_raw_and_ivec - dim_raw))
  [ $dim_ivec != "$(cat $D/ivector_dim)" ] && \
    echo "Error, i-vector dim. mismatch (expected $(cat $D/ivector_dim), got $dim_ivec in '$ivector')" && \
    exit 1
  # Append to feats,
  feats="$feats $ivector_append_tool ark:- '$ivector' ark:- |"
fi

# nnet-forward,
feats="$feats nnet-forward $nnet_forward_opts --feature-transform=$feature_transform --class-frame-counts=$class_frame_counts --use-gpu=$use_gpu $nnet ark:- ark:- |"

# Background decoding jobs must not outlive this script: whenever the script
# is interrupted or exits, terminate whatever jobs are still running.
cleanup() {
  local running
  running=$(jobs -pr)   # process ids of the running background jobs
  if [ -n "$running" ]; then
    # a job may finish on its own in the meantime; never fail because of that
    kill $running || true
  fi
}
trap cleanup INT QUIT TERM EXIT


echo "$0: generating denlats from data '$data', putting lattices in '$dir'"
#1) Generate the denominator lattices
if [ $sub_split -eq 1 ]; then
  # Prepare 'scp' for storing lattices separately and gzipped
  # (one line per utterance: '<utt> | gzip -c ><dir>/lat<n>/<utt>.gz', with
  # any '/' in the utterance id replaced by '_' in the file name)
  for n in `seq $nj`; do
    [ ! -d $dir/lat$n ] && mkdir $dir/lat$n;
    cat $sdata/$n/feats.scp | \
    awk -v dir=$dir -v n=$n '{ utt=$1; utt_noslash=utt; gsub("/","_",utt_noslash);
                               printf("%s | gzip -c >%s/lat%d/%s.gz\n", utt, dir, n, utt_noslash); }'
  done >$dir/lat.store_separately_as_gz.scp
  # Generate the lattices
  $cmd $parallel_opts JOB=1:$nj $dir/log/decode_den.JOB.log \
    latgen-faster-mapped --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \
      --max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $srcdir/final.mdl  \
      $dir/dengraph/HCLG.fst "$feats" "scp:$dir/lat.store_separately_as_gz.scp" || exit 1;
else
  # each job from 1 to $nj is split into multiple pieces (sub-split), and we aim
  # to have at most two jobs running at each time.  The idea is that if we have stragglers
  # from one job, we can be processing another one at the same time.
  rm -f $dir/.error

  prev_pid=
  # The loop runs one extra round (n = nj+1) that only waits for the last job.
  for n in `seq $[nj+1]`; do
    if [ $n -gt $nj ]; then
      this_pid=
    elif [ -f $dir/.done.$n ] && [ $dir/.done.$n -nt $srcdir/final.mdl ]; then
      echo "Not processing subset $n as already done (delete $dir/.done.$n if not)";
      this_pid=
    else
      sdata2=$data/split$nj/$n/split${sub_split}utt;
      split_data.sh --per-utt $sdata/$n $sub_split || exit 1;
      mkdir -p $dir/log/$n
      mkdir -p $dir/part
      # Redirect the feature pipeline from piece $n to its sub-pieces.
      feats_subset=$(echo $feats | sed s:JOB/:$n/split${sub_split}utt/JOB/:g)
      # Prepare 'scp' for storing lattices separately and gzipped
      for k in `seq $sub_split`; do
        [ ! -d $dir/lat$n/$k ] && mkdir -p $dir/lat$n/$k;
        cat $sdata2/$k/feats.scp | \
        awk -v dir=$dir -v n=$n -v k=$k '{ utt=$1; utt_noslash=utt; gsub("/","_",utt_noslash);
                                           printf("%s | gzip -c >%s/lat%d/%d/%s.gz\n", utt, dir, n, k, utt_noslash); }'
      done >$dir/lat.${n}.store_separately_as_gz.scp
      # Generate lattices
      # (runs in the background; a failure is recorded in $dir/.error, the
      # marker that is checked after 'wait' below)
      $cmd $parallel_opts JOB=1:$sub_split $dir/log/$n/decode_den.JOB.log \
        latgen-faster-mapped --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \
          --max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $srcdir/final.mdl  \
          $dir/dengraph/HCLG.fst "$feats_subset" scp:$dir/lat.$n.store_separately_as_gz.scp || touch $dir/.error &
      this_pid=$!
    fi
    if [ ! -z "$prev_pid" ]; then  # Wait for the previous job; merge the previous set of lattices.
      wait $prev_pid
      [ -f $dir/.error ] && echo "$0: error generating denominator lattices" && exit 1;
      touch $dir/.done.$prev_n
    fi
    prev_n=$n
    prev_pid=$this_pid
  done
fi

#2) Generate 'scp' for reading the lattices
# make $dir an absolute pathname.
[ '/' != ${dir:0:1} ] && dir=$PWD/$dir
# Every '<path>/<name>.gz' found below lat<n> becomes the scp line
# '<name> gunzip -c <path>/<name>.gz |'; the list is sorted afterwards.
for n in `seq $nj`; do
  find $dir/lat${n} -name "*.gz" | perl -ape 's:.*/([^/]+)\.gz$:$1 gunzip -c $& |:; '
done | sort >$dir/lat.scp
# An empty list means that no lattice was produced.
[ -s $dir/lat.scp ] || exit 1

echo "$0: done generating denominator lattices."


================================================
FILE: egs/steps/nnet/make_fmllr_feats.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2015  Brno University of Technology (author: Karel Vesely),
#                 
# Apache 2.0.
#
# This script dumps fMLLR features in a new data directory, 
# which is later used for neural network training/testing.

# Begin configuration section.
nj=4                # number of parallel jobs
cmd=run.pl          # job launcher used for dumping the features
transform_dir=      # dir with fMLLR transforms (trans.*)
raw_transform_dir=  # dir with raw-fMLLR transforms (raw_trans.*)
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

# Stop on command errors, unset variables and failing pipeline stages.
set -euo pipefail

if [ $# != 5 ]; then
   echo "Usage: $0 [options] <tgt-data-dir> <src-data-dir> <gmm-dir> <log-dir> <fea-dir>"
   echo "e.g.: $0 data-fmllr/train data/train exp/tri5a exp/make_fmllr_feats/log plp/processed/"
   echo ""
   echo "This script dumps fMLLR features to disk, so it can be used for NN training."
   echo "It automoatically figures out the 'feature-type' of the source GMM systems."
   echo ""
   echo "main options (for others, see top of script file)"
   echo "  --config <config-file>                           # config containing options"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs"
   echo "  --nj <nj>                                        # number of parallel jobs"
   echo "  --transform-dir <transform-dir>                  # dir with fMLLR transforms"
   echo "  --raw-transform-dir <transform-dir>              # dir with raw-fMLLR transforms"
   exit 1;
fi

data=$1     # target data dir (receives the new feats.scp)
srcdata=$2  # source data dir with the input features
gmmdir=$3   # GMM dir providing cmvn/delta/splice options and final.mat
logdir=$4   # dir for the job logs
feadir=$5   # dir where the feature archives are stored

sdata=$srcdata/split$nj;

# Get the config,
# (each option string is empty when the GMM dir does not provide the file)
D=$gmmdir
[ -f $D/cmvn_opts ] && cmvn_opts=$(cat $D/cmvn_opts) || cmvn_opts=
[ -f $D/delta_opts ] && delta_opts=$(cat $D/delta_opts) || delta_opts=
[ -f $D/splice_opts ] && splice_opts=$(cat $D/splice_opts) || splice_opts=

mkdir -p $data $logdir $feadir
# Re-split the source data unless an up-to-date split directory exists.
[[ -d $sdata && $srcdata/feats.scp -ot $sdata ]] || split_data.sh $srcdata $nj || exit 1;

# Check files exist,
for f in $sdata/1/feats.scp $sdata/1/cmvn.scp; do
  [ ! -f $f ] && echo "$0: Missing $f" && exit 1;
done
[ ! -z "$transform_dir" -a ! -f $transform_dir/trans.1 ] && \
  echo "$0: Missing $transform_dir/trans.1" && exit 1;
[ ! -z "$raw_transform_dir" -a ! -f $raw_transform_dir/raw_trans.1 ] && \
  echo "$0: Missing $raw_transform_dir/raw_trans.1" && exit 1;

# Figure-out the feature-type,
#  - raw_fmllr   : --raw-transform-dir was given,
#  - lda_fmllr   : --transform-dir was given and the GMM dir has final.mat,
#  - delta_fmllr : --transform-dir was given and there is no final.mat,
#  - otherwise the type stays [UNKNOWN] and the 'case' below exits.
feat_type="[UNKNOWN]"
[ -z "$raw_transform_dir" -a ! -f $gmmdir/final.mat -a ! -z "$transform_dir" ] && feat_type=delta_fmllr
[ -z "$raw_transform_dir" -a -f $gmmdir/final.mat -a ! -z "$transform_dir" ] && feat_type=lda_fmllr
[ ! -z "$raw_transform_dir" ] && feat_type=raw_fmllr
echo "$0: feature type is $feat_type";

# Hand-code the feature pipeline,
# ('JOB' is substituted with the job number when the pipeline is run)
case $feat_type in
  delta_fmllr) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk \"ark:cat $transform_dir/trans.* |\" ark:- ark:- |";;
  lda_fmllr) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $gmmdir/final.mat ark:- ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk \"ark:cat $transform_dir/trans.* |\" ark:- ark:- |";;
  raw_fmllr) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$raw_transform_dir/raw_trans.JOB ark:- ark:- |";;
  *) echo "Invalid feature type $feat_type" && exit 1;
esac

# Prepare the output dir,
# The copied feats/cmvn lists describe the source features, so they are
# removed; '-f' keeps a missing list from aborting the script under 'set -e'.
utils/copy_data_dir.sh $srcdata $data; rm -f $data/{feats,cmvn}.scp 2>/dev/null
# Make $feadir an absolute pathname,
[ '/' != ${feadir:0:1} ] && feadir=$PWD/$feadir

# Store the output-features,
# (each job writes its own archive plus the matching scp index)
name=`basename $data`
$cmd JOB=1:$nj $logdir/make_fmllr_feats.JOB.log \
  copy-feats "$feats" \
  ark,scp:$feadir/feats_fmllr_$name.JOB.ark,$feadir/feats_fmllr_$name.JOB.scp || exit 1;

# Merge the scp,
for n in $(seq 1 $nj); do
  cat $feadir/feats_fmllr_$name.$n.scp 
done > $data/feats.scp

echo "$0: Done!, type $feat_type, $srcdata --> $data, using : raw-trans ${raw_transform_dir:-None}, gmm $gmmdir, trans ${transform_dir:-None}"

exit 0;


================================================
FILE: egs/steps/nnet/make_fmmi_feats.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2015  Brno University of Technology (author: Karel Vesely),
#
# Apache 2.0
#
# This script dumps fMMI features in a new data directory, 
# which is later used for neural network training/testing.

# Begin configuration section.
iter=final      # model iteration: $gmmdir/$iter.fmpe is the transform that is applied
nj=4            # number of parallel jobs
cmd=run.pl      # job launcher used for all steps
ngselect=2; # Just use the 2 top Gaussians for fMMI/fMPE.  Should match train.
transform_dir=  # optional dir with fMLLR transforms (trans.JOB)
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

# Stop on command errors, unset variables and failing pipeline stages.
set -euo pipefail

if [ $# != 5 ]; then
   echo "Usage: $0 [options] <tgt-data-dir> <src-data-dir> <gmm-dir> <log-dir> <fea-dir>"
   echo "e.g.: $0 data-fmmi/train data/train exp/tri5a_fmmi_b0.1 data-fmmi/train/_log data-fmmi/train/_data "
   echo ""
   echo "This script works on CMN + (delta+delta-delta | LDA+MLLT) features; it works out"
   echo "what type of features you used (assuming it's one of these two)"
   echo "You can also use fMLLR features-- you have to supply --transform-dir option."
   echo ""
   echo "main options (for others, see top of script file)"
   echo "  --config <config-file>                           # config containing options"
   echo "  --nj <nj>                                        # number of parallel jobs"
   echo "  --iter <iter>                                    # Iteration of model to test."
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   echo "  --transform-dir <transform-dir>                  # where to find fMLLR transforms."
   exit 1;
fi

data=$1     # target data dir (receives the new feats.scp)
srcdata=$2  # source data dir with the input features
gmmdir=$3   # GMM dir providing $iter.fmpe, the cmvn/splice options and final.mat
logdir=$4   # dir for the job logs
feadir=$5   # dir where gselect info and feature archives are stored

sdata=$srcdata/split$nj;

# Get the config,
# (each option string is empty when the GMM dir does not provide the file)
D=$gmmdir
[ -f $D/cmvn_opts ] && cmvn_opts=$(cat $D/cmvn_opts) || cmvn_opts=
[ -f $D/splice_opts ] && splice_opts=$(cat $D/splice_opts) || splice_opts=

mkdir -p $data $logdir $feadir
# Re-split the source data unless an up-to-date split directory exists.
[[ -d $sdata && $srcdata/feats.scp -ot $sdata ]] || split_data.sh $srcdata $nj || exit 1;

for f in $sdata/1/feats.scp $sdata/1/cmvn.scp $gmmdir/$iter.fmpe; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

# LDA features are assumed when the GMM dir contains final.mat.
if [ -f $gmmdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type";

# ('JOB' is substituted with the job number when the pipeline is run)
case $feat_type in
  delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
  lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $gmmdir/final.mat ark:- ark:- |";;
  *) echo "Invalid feature type $feat_type" && exit 1;
esac

if [ ! -z "$transform_dir" ]; then # add transforms to features...
  echo "Using fMLLR transforms from $transform_dir"
  # The pipeline reads trans.JOB for every job, so the transforms must exist
  # and must have been estimated with the same number of jobs; stop here
  # instead of continuing with transforms that do not match the data split.
  [ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist." && exit 1
  [ "`cat $transform_dir/num_jobs`" -ne $nj ] && \
     echo "Mismatch in number of jobs with $transform_dir" && exit 1;
  feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
fi

# Get Gaussian selection info.
$cmd JOB=1:$nj $logdir/gselect.JOB.log \
  gmm-gselect --n=$ngselect $gmmdir/$iter.fmpe "$feats" \
  "ark:|gzip -c >$feadir/gselect.JOB.gz" || exit 1;

# prepare the dir
# Copy the plain metadata files of the source data dir.  $srcdata also holds
# the split$nj directory, which 'cp' (without -r) skips while returning a
# non-zero status; '|| true' keeps that from aborting the script under
# 'set -e'.  The copied feats/cmvn lists describe the source features, so
# they are removed ('-f': they may be absent).
cp $srcdata/* $data 2>/dev/null || true
rm -f $data/{feats,cmvn}.scp

# make $feadir an absolute pathname.
feadir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $feadir ${PWD}`

# forward the feats
# (each job writes its own archive plus the matching scp index)
$cmd JOB=1:$nj $logdir/make_fmmi_feats.JOB.log \
  fmpe-apply-transform $gmmdir/$iter.fmpe "$feats" "ark,s,cs:gunzip -c $feadir/gselect.JOB.gz|"  \
  ark,scp:$feadir/feats_fmmi.JOB.ark,$feadir/feats_fmmi.JOB.scp || exit 1;

# merge the feats to single SCP
for n in $(seq 1 $nj); do
  cat $feadir/feats_fmmi.$n.scp 
done > $data/feats.scp

echo "$0 finished... $srcdata -> $data ($gmmdir)"

exit 0;


================================================
FILE: egs/steps/nnet/make_priors.sh
================================================
#!/bin/bash 

# Copyright 2012-2015 Brno University of Technology (author: Karel Vesely)
# Apache 2.0
# To be run from .. (one directory up from here)
# see ../run.sh for example

# Begin configuration section.
nj=4         # number of parallel jobs
cmd=run.pl   # job launcher used for the forward pass
use_gpu=no   # passed to nnet-forward as --use-gpu
ivector=     # rx-specifier with i-vectors; needed when the nnet dir has 'ivector_dim'
# End configuration section.

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# Stop on command errors, unset variables and failing pipeline stages.
set -euo pipefail

# The usage text lists only options that are defined in the configuration
# section of this script.
if [ $# != 2 ]; then
   echo "usage: $0 [options] <data-dir> <nnet-dir>";
   echo "options: "
   echo "  --cmd 'queue.pl <queue opts>'   # how to run jobs."
   echo "  --nj <nj>                       # number of parallel jobs"
   echo "  --ivector <rx-specifier>        # i-vectors to append to the features"
   echo "  --use-gpu (no|yes|optional)     # forwarding on GPU"
   exit 1;
fi

if [ -f path.sh ]; then . ./path.sh; fi

data=$1   # data dir whose features are forwarded through the network
nndir=$2  # nnet dir; the prior statistics are written here

######## CONFIGURATION

required="$data/feats.scp $nndir/final.nnet $nndir/final.feature_transform"
for f in $required; do
  [ ! -f $f ] && echo "$0: Missing $f" && exit 1;
done

sdata=$data/split$nj
# Re-split the data unless an up-to-date split directory exists.
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;

echo "Accumulating prior stats by forwarding '$data' with '$nndir'"

# We estimate priors on 10k utterances, selected randomly from the splitted data,
# (N is the per-job share: 10000 divided by the number of jobs, rounded down)
N=$((10000/nj))

# PREPARE FEATURE EXTRACTION PIPELINE
# import config,
# (each option string stays empty unless the nnet dir provides the matching
# file; the newer *_opts files take precedence over the backward-compatible ones)
cmvn_opts=
delta_opts=
D=$nndir
[ -e $D/norm_vars ] && cmvn_opts="--norm-means=true --norm-vars=$(cat $D/norm_vars)" # Bwd-compatibility,
[ -e $D/cmvn_opts ] && cmvn_opts=$(cat $D/cmvn_opts)
[ -e $D/delta_order ] && delta_opts="--delta-order=$(cat $D/delta_order)" # Bwd-compatibility,
[ -e $D/delta_opts ] && delta_opts=$(cat $D/delta_opts)
#
# Create the feature stream,
# (each job shuffles its feature list with a fixed seed and keeps the first
# $N entries; 'JOB' is substituted with the job number when the pipeline runs)
feats="ark:cat $sdata/JOB/feats.scp | utils/shuffle_list.pl --srand 777 | head -n$N | copy-feats scp:- ark:- |"
# apply-cmvn (optional),
[ ! -z "$cmvn_opts" -a ! -f $sdata/1/cmvn.scp ] && echo "$0: Missing $sdata/1/cmvn.scp" && exit 1
[ ! -z "$cmvn_opts" ] && feats="$feats apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp ark:- ark:- |"
# add-deltas (optional),
[ ! -z "$delta_opts" ] && feats="$feats add-deltas $delta_opts ark:- ark:- |"
# add-pytel transform (optional),
[ -e $D/pytel_transform.py ] && feats="$feats /bin/env python $D/pytel_transform.py |"

# add-ivector (optional),
if [ -e $D/ivector_dim ]; then
  [ -z $ivector ] && echo "Missing --ivector, they were used in training!" && exit 1
  # Get the tool, 
  ivector_append_tool=append-vector-to-feats # default,
  [ -e $D/ivector_append_tool ] && ivector_append_tool=$(cat $D/ivector_append_tool)
  # Check dims,
  feats_job_1=$(sed 's:JOB:1:g' <(echo $feats))
  dim_raw=$(feat-to-dim "$feats_job_1" -)
  dim_raw_and_ivec=$(feat-to-dim "$feats_job_1 $ivector_append_tool ark:- '$ivector' ark:- |" -)
  dim_ivec=$((dim_raw_and_ivec - dim_raw))
  [ $dim_ivec != "$(cat $D/ivector_dim)" ] && \
    echo "Error, i-vector dim. mismatch (expected $(cat $D/ivector_dim), got $dim_ivec in '$ivector')" && \
    exit 1
  # Append to feats,
  feats="$feats $ivector_append_tool ark:- '$ivector' ark:- |"
fi

# Run the forward pass,
$cmd JOB=1:$nj $nndir/log/prior_stats.JOB.log \
  nnet-forward --use-gpu=$use_gpu --feature-transform=$nndir/final.feature_transform $nndir/final.nnet "$feats" ark:- \| \
  compute-cmvn-stats --binary=false ark:- $nndir/JOB.prior_cmvn_stats || exit 1

sum-matrices --binary=false $nndir/prior_cmvn_stats $nndir/*.prior_cmvn_stats 2>$nndir/log/prior_sum_matrices.log || exit 1
rm $nndir/*.prior_cmvn_stats

awk 'NR==2{ $NF=""; print "[",$0,"]"; }' $nndir/prior_cmvn_stats >$nndir/prior_counts || exit 1
    
echo "Succeeded creating prior counts '$nndir/prior_counts' from '$data'" 


================================================
FILE: egs/steps/nnet/pretrain_dbn.sh
================================================
#!/usr/bin/env bash
# Copyright 2013-2015 Brno University of Technology (author: Karel Vesely)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# To be run from ../../
#
# Restricted Boltzmann Machine (RBM) pre-training by Contrastive Divergence
# algorithm (CD-1). A stack of RBMs forms a Deep Belief Network (DBN).
#
# This script by default pre-trains on plain features (ie. saved fMLLR features),
# building a 'feature_transform' containing +/-5 frame splice and global CMVN.
#
# There is also a support for adding speaker-based CMVN, deltas, i-vectors,
# or passing custom 'feature_transform' or its prototype.
#

# Begin configuration.
# Defaults of the command-line options; 'parse_options.sh' (sourced below)
# is given the chance to override them.

# topology, initialization,
nn_depth=6             # number of hidden layers,
hid_dim=2048           # number of neurons per layer,
param_stddev_first=0.1 # init parameters in 1st RBM (its <ParamStddev>),
param_stddev=0.1 # init parameters in other RBMs (their <ParamStddev>),
input_vis_type=gauss # type of visible nodes on DBN input (the script tests for 'gauss' and 'bern'),

# number of iterations,
rbm_iter=1            # number of pre-training epochs (Gaussian-Bernoulli RBM has 2x more)

# pre-training opts,
rbm_lrate=0.4         # RBM learning rate
rbm_lrate_low=0.01    # lower RBM learning rate (for Gaussian units)
rbm_l2penalty=0.0002  # L2 penalty (increases RBM-mixing rate)
rbm_extra_opts=       # extra options, passed as they are to 'rbm-train-cd1-frmshuff',

# data processing,
copy_feats=true     # resave the features to tmpdir,
copy_feats_tmproot=/tmp/kaldi.XXXX # sets tmproot for 'copy-feats' (template given to 'mktemp -d'),
copy_feats_compress=true # compress feats while resaving

# feature processing,
splice=5            # (default) splice features both-ways along time axis,
cmvn_opts=          # (optional) adds 'apply-cmvn' to input feature pipeline, see opts,
delta_opts=         # (optional) adds 'add-deltas' to input feature pipeline, see opts,
ivector=            # (optional) adds 'append-vector-to-feats', the option is rx-filename for the 2nd stream,
ivector_append_tool=append-vector-to-feats # (optional) the tool for appending ivectors,

feature_transform_proto= # (optional) use this prototype for 'feature_transform',
feature_transform=  # (optional) directly use this 'feature_transform',

# misc.
verbose=1 # enable per-cache reports
skip_cuda_check=false # 'true' disables the 'cuda-gpu-available' check,

# End configuration.

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh;
. parse_options.sh || exit 1;

# From here on: stop at the first failing command, unset variable or broken pipe,
set -euo pipefail

# Exactly two positional arguments are expected, otherwise print the help,
if [ $# != 2 ]; then
   echo "Usage: $0 <data> <exp-dir>"
   echo " e.g.: $0 data/train exp/rbm_pretrain"
   echo "main options (for others, see top of script file)"
   echo "  --config <config-file>           # config containing options"
   echo ""
   echo "  --nn-depth <N>                   # number of RBM layers"
   echo "  --hid-dim <N>                    # number of hidden units per layer"
   echo "  --rbm-iter <N>                   # number of CD-1 iterations per layer"
   echo "                                   # can be used to subsample large datasets"
   echo "  --rbm-lrate <float>              # learning-rate for Bernoulli-Bernoulli RBMs"
   echo "  --rbm-lrate-low <float>          # learning-rate for Gaussian-Bernoulli RBM"
   echo ""
   echo "  --cmvn-opts  <string>            # add 'apply-cmvn' to input feature pipeline"
   echo "  --delta-opts <string>            # add 'add-deltas' to input feature pipeline"
   echo "  --splice <N>                     # splice +/-N frames of input features"
   echo "  --ivector <rx-filename>          # append i-vectors to the input features"
   echo "  --copy-feats <bool>              # copy features to /tmp, lowers storage stress"
   echo ""
   echo "  --feature-transform-proto <file> # use this prototype for 'feature_transform'"
   echo "  --feature-transform <file>       # directly use this 'feature_transform'"
   exit 1;
fi

data=$1
dir=$2

# The feature list is the only mandatory input file,
for f in "$data/feats.scp"; do
  if [ ! -f "$f" ]; then
    echo "$0: no such file $f"
    exit 1
  fi
done

echo "# INFO"
echo "$0 : Pre-training Deep Belief Network as a stack of RBMs"
# The paths are passed as '%s' arguments, so that '%' or '\' in them is printed as it is,
printf "\t dir       : %s \n" "$dir"
printf "\t Train-set : %s '%s'\n" "$data" "$(cat $data/feats.scp | wc -l)"
echo

# Nothing to do when the complete DBN is already there,
[ -e $dir/${nn_depth}.dbn ] && echo "$0 Skipping, already have $dir/${nn_depth}.dbn" && exit 0

# check if CUDA compiled in and GPU is available,
if ! $skip_cuda_check; then cuda-gpu-available || exit 1; fi

mkdir -p $dir/log

###### PREPARE FEATURES ######
echo
echo "# PREPARING FEATURES"
if [ "$copy_feats" == "true" ]; then
  # re-save the features to local disk into /tmp/,
  tmpdir=$(mktemp -d $copy_feats_tmproot)
  # the tmpdir is listed and removed when the script ends or is interrupted,
  trap "echo \"# Removing features tmpdir $tmpdir @ $(hostname)\"; ls $tmpdir; rm -r $tmpdir" INT QUIT TERM EXIT
  copy-feats --compress=$copy_feats_compress scp:$data/feats.scp ark,scp:$tmpdir/train.ark,$dir/train_sorted.scp || exit 1
else
  # or copy the list,
  cp $data/feats.scp $dir/train_sorted.scp
fi
# shuffle the list (fixed seed, the order is reproducible),
utils/shuffle_list.pl --srand 777 <$dir/train_sorted.scp >$dir/train.scp

# create a 10k utt subset for global cmvn estimates,
head -n 10000 $dir/train.scp > $dir/train.scp.10k

# for debugging, add list with non-local features,
utils/shuffle_list.pl --srand 777 <$data/feats.scp >$dir/train.scp_non_local

###### OPTIONALLY IMPORT FEATURE SETTINGS ######
# When a ready 'feature_transform' is given, the feature options stored
# next to it replace the values from the command line.
ivector_dim= # no ivectors,
if [ -n "$feature_transform" ]; then
  D=$(dirname "$feature_transform")
  echo "# importing feature settings from dir '$D'"
  [ -e "$D/cmvn_opts" ] && cmvn_opts=$(cat "$D/cmvn_opts")
  [ -e "$D/delta_opts" ] && delta_opts=$(cat "$D/delta_opts")
  [ -e "$D/ivector_dim" ] && ivector_dim=$(cat "$D/ivector_dim")
  [ -e "$D/ivector_append_tool" ] && ivector_append_tool=$(cat "$D/ivector_append_tool")
  echo "# cmvn_opts='$cmvn_opts' delta_opts='$delta_opts' ivector_dim='$ivector_dim'"
fi

###### PREPARE FEATURE PIPELINE ######
# read the features
feats_tr="ark:copy-feats scp:$dir/train.scp ark:- |"

# optionally add per-speaker CMVN
if [ -n "$cmvn_opts" ]; then
  echo "+ 'apply-cmvn' with '$cmvn_opts' using statistics : $data/cmvn.scp"
  [ ! -r $data/cmvn.scp ] && echo "Missing $data/cmvn.scp" && exit 1;
  [ ! -r $data/utt2spk ] && echo "Missing $data/utt2spk" && exit 1;
  feats_tr="$feats_tr apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp ark:- ark:- |"
else
  echo "# 'apply-cmvn' not used,"
fi

# optionally add deltas
if [ -n "$delta_opts" ]; then
  feats_tr="$feats_tr add-deltas $delta_opts ark:- ark:- |"
  echo "# + 'add-deltas' with '$delta_opts'"
fi

# keep track of the config,
if [ -n "$cmvn_opts" ]; then echo "$cmvn_opts" >$dir/cmvn_opts; fi
if [ -n "$delta_opts" ]; then echo "$delta_opts" >$dir/delta_opts; fi
#

# temporary pipeline reading the 10k subset; the quoted pattern is matched
# literally, so only the input list of 'copy-feats' is exchanged,
scp_all="scp:$dir/train.scp "
scp_10k="scp:$dir/train.scp.10k "
feats_tr_10k=${feats_tr/"$scp_all"/$scp_10k}

# get feature dim,
feat_dim=$(feat-to-dim "$feats_tr" -)
echo "# feature dim : $feat_dim (input of 'feature_transform')"

# Now we start building 'feature_transform' which goes right in front of a NN.
# The forwarding is computed on a GPU before the frame shuffling is applied.
#
# Same GPU is used both for 'feature_transform' and the NN training.
# So it has to be done by a single process (we are using exclusive mode).
# This also reduces the CPU-GPU uploads/downloads to minimum.

if [ -n "$feature_transform" ]; then
  # A ready transform is copied into the experiment dir and used from there,
  echo "# importing 'feature_transform' from '$feature_transform'"
  tmp=$dir/imported_$(basename "$feature_transform")
  cp "$feature_transform" $tmp; feature_transform=$tmp
else
  # Make default proto with splice,
  if [ -n "$feature_transform_proto" ]; then
    echo "# importing custom 'feature_transform_proto' from : $feature_transform_proto"
  else
    echo "+ default 'feature_transform_proto' with splice +/-$splice frames"
    feature_transform_proto=$dir/splice${splice}.proto
    echo "<Splice> <InputDim> $feat_dim <OutputDim> $(((2*splice+1)*feat_dim)) <BuildVector> -$splice:$splice </BuildVector>" >$feature_transform_proto
  fi

  # Initialize 'feature-transform' from a prototype,
  feature_transform=$dir/tr_$(basename "$feature_transform_proto" .proto).nnet
  nnet-initialize --binary=false "$feature_transform_proto" $feature_transform

  # Renormalize the MLP input to zero mean and unit variance,
  feature_transform_old=$feature_transform
  feature_transform=${feature_transform%.nnet}_cmvn-g.nnet
  echo "# compute normalization stats from 10k sentences"
  nnet-forward --print-args=true --use-gpu=yes $feature_transform_old \
    "$feats_tr_10k" ark:- |\
    compute-cmvn-stats ark:- $dir/cmvn-g.stats
  echo "# + normalization of NN-input at '$feature_transform'"
  nnet-concat --print-args=false --binary=false $feature_transform_old \
    "cmvn-to-nnet $dir/cmvn-g.stats -|" $feature_transform
fi

# '$ivector' is an rx-filename and may hold a piped command with spaces,
# it has to be quoted, otherwise the test fails and this section is skipped,
if [ -n "$ivector" ]; then
  echo
  echo "# ADDING IVECTOR FEATURES"
  # The iVectors are concatenated 'as they are' directly to the input of the neural network,
  # To do this, we paste the features, and use <ParallelComponent> where the 1st component
  # contains the transform and 2nd network contains <Copy> component.

  echo "# getting dims,"
  dim_raw=$(feat-to-dim "$feats_tr" -)
  dim_raw_and_ivec=$(feat-to-dim "$feats_tr $ivector_append_tool ark:- '$ivector' ark:- |" -)
  dim_ivec=$((dim_raw_and_ivec - dim_raw))
  echo "# dims, feats-raw $dim_raw, ivectors $dim_ivec,"

  # Should we do something with 'feature_transform'?
  if [ -n "$ivector_dim" ]; then
    # No, the 'ivector_dim' comes from dir with 'feature_transform' with iVec forwarding,
    echo "# assuming we got '$feature_transform' with ivector forwarding,"
    if [ "$ivector_dim" != "$dim_ivec" ]; then
      echo -n "Error, i-vector dimensionality mismatch!"
      echo " (expected $ivector_dim, got $dim_ivec in $ivector)"
      exit 1
    fi
  else
    # Yes, adjust the transform to do ``iVec forwarding'',
    feature_transform_old=$feature_transform
    feature_transform=${feature_transform%.nnet}_ivec_copy.nnet
    echo "# setting up ivector forwarding into '$feature_transform',"
    dim_transformed=$(feat-to-dim "$feats_tr nnet-forward $feature_transform_old ark:- ark:- |" -)
    # 2nd nested network, it only copies the i-vector part of the input,
    nnet-initialize --print-args=false <(echo "<Copy> <InputDim> $dim_ivec <OutputDim> $dim_ivec <BuildVector> 1:$dim_ivec </BuildVector>") $dir/tr_ivec_copy.nnet
    # the old transform and the copy network are nested side by side,
    nnet-initialize --print-args=false <(echo "<ParallelComponent> <InputDim> $((dim_raw+dim_ivec)) <OutputDim> $((dim_transformed+dim_ivec)) <NestedNnetFilename> $feature_transform_old $dir/tr_ivec_copy.nnet </NestedNnetFilename>") $feature_transform
  fi
  echo $dim_ivec >$dir/ivector_dim # mark down the iVec dim!
  echo $ivector_append_tool >$dir/ivector_append_tool

  # pasting the iVecs to the features,
  echo "# + ivector input '$ivector'"
  feats_tr="$feats_tr $ivector_append_tool ark:- '$ivector' ark:- |"
fi

###### Show the final 'feature_transform' in the log,
echo
echo "### Showing the final 'feature_transform':"
nnet-info $feature_transform
echo "###"

###### MAKE LINK TO THE FINAL feature_transform, so the other scripts will find it ######
# 'rm -f' also removes a dangling symlink, which '[ -f ... ]' would not see
# and which would make 'ln -s' fail,
rm -f $dir/final.feature_transform
(cd $dir; ln -s $(basename $feature_transform) final.feature_transform )
feature_transform=$dir/final.feature_transform


###### GET THE DIMENSIONS ######
# input dim of the 1st RBM is the output dim of the 'feature_transform',
num_fea=$(feat-to-dim --print-args=false "$feats_tr nnet-forward --use-gpu=no $feature_transform ark:- ark:- |" - 2>/dev/null)
num_hid=$hid_dim


###### PERFORM THE PRE-TRAINING ######
# Pipeline reading the 10k subset, used for the cmvn stats of the upper RBMs;
# the quoted pattern is matched literally, so only the input list is exchanged,
scp_all="scp:$dir/train.scp "
scp_10k="scp:$dir/train.scp.10k "
feats_tr_10k=${feats_tr/"$scp_all"/$scp_10k}

for depth in $(seq 1 $nn_depth); do
  echo
  echo "# PRE-TRAINING RBM LAYER $depth"
  RBM=$dir/$depth.rbm
  DBN=$dir/$depth.dbn

  # The layer is finished only when both the RBM and the DBN stack exist,
  if [ -f $RBM ] && [ -f $DBN ]; then
    echo "RBM '$RBM' already trained, skipping."
    continue
  fi

  if [ -f $RBM ]; then
    # A previous run stopped between the training and the stacking,
    echo "RBM '$RBM' already trained, only re-creating '$DBN'."
  elif [ "$depth" == "1" ]; then
    # The first RBM needs special treatment, because of Gaussian input nodes,
    # This is usually Gaussian-Bernoulli RBM (not if CNN layers are part of input transform)
    # initialize,
    echo "# initializing '$RBM.init'"
    echo "<Rbm> <InputDim> $num_fea <OutputDim> $num_hid <VisibleType> $input_vis_type <HiddenType> bern <ParamStddev> $param_stddev_first" > $RBM.proto
    nnet-initialize $RBM.proto $RBM.init 2>$dir/log/nnet-initialize.$depth.log || exit 1
    # pre-train,
    num_iter=$rbm_iter
    if [ "$input_vis_type" == "gauss" ]; then num_iter=$((2*rbm_iter)); fi # 2x more epochs for Gaussian input
    if [ "$input_vis_type" == "bern" ]; then rbm_lrate_low=$rbm_lrate; fi # original lrate for Bernoulli input
    echo "# pretraining '$RBM' (input $input_vis_type, lrate $rbm_lrate_low, iters $num_iter)"
    rbm-train-cd1-frmshuff --learn-rate=$rbm_lrate_low --l2-penalty=$rbm_l2penalty \
      --num-iters=$num_iter --verbose=$verbose \
      --feature-transform=$feature_transform \
      $rbm_extra_opts \
      $RBM.init "$feats_tr" $RBM 2>$dir/log/rbm.$depth.log || exit 1
  else
    # This is Bernoulli-Bernoulli RBM,
    # cmvn stats for init,
    echo "# computing cmvn stats '$dir/$depth.cmvn' for RBM initialization"
    if [ ! -f $dir/$depth.cmvn ]; then
      nnet-forward --print-args=false --use-gpu=yes \
        "nnet-concat $feature_transform $dir/$((depth-1)).dbn - |" \
        "$feats_tr_10k" ark:- | \
      compute-cmvn-stats --print-args=false ark:- - | \
      cmvn-to-nnet --print-args=false - $dir/$depth.cmvn || exit 1
    else
      echo "# compute-cmvn-stats already done, skipping."
    fi
    # initialize,
    echo "initializing '$RBM.init'"
    echo "<Rbm> <InputDim> $num_hid <OutputDim> $num_hid <VisibleType> bern <HiddenType> bern <ParamStddev> $param_stddev <VisibleBiasCmvnFilename> $dir/$depth.cmvn" > $RBM.proto
    nnet-initialize $RBM.proto $RBM.init 2>$dir/log/nnet-initialize.$depth.log || exit 1
    # pre-train, the input goes through the transform and the DBN built so far,
    echo "pretraining '$RBM' (lrate $rbm_lrate, iters $rbm_iter)"
    rbm-train-cd1-frmshuff --learn-rate=$rbm_lrate --l2-penalty=$rbm_l2penalty \
      --num-iters=$rbm_iter --verbose=$verbose \
      --feature-transform="nnet-concat $feature_transform $dir/$((depth-1)).dbn - |" \
      $rbm_extra_opts \
      $RBM.init "$feats_tr" $RBM 2>$dir/log/rbm.$depth.log || exit 1
  fi

  # Create DBN stack,
  if [ "$depth" == "1" ]; then
    echo "# converting RBM to $DBN"
    rbm-convert-to-nnet $RBM $DBN
  else
    echo "# appending RBM to $DBN"
    nnet-concat $dir/$((depth-1)).dbn "rbm-convert-to-nnet $RBM - |"  $DBN
  fi

done

echo
echo "# REPORT"
echo "# RBM pre-training progress (line per-layer)"
# 'grep' returns 1 when no line matches, the report must not fail a finished run,
grep progress $dir/log/rbm.*.log || echo "# (no 'progress' lines found in $dir/log/rbm.*.log)"
echo

echo "Pre-training finished."

sleep 3
exit 0


================================================
FILE: egs/steps/nnet/train.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2017  Brno University of Technology (author: Karel Vesely)
# Apache 2.0

# Begin configuration.
# Defaults of the command-line options; 'parse_options.sh' (sourced below)
# is given the chance to override them.

config=             # config, also forwarded to 'train_scheduler.sh',

# topology, initialization,
network_type=dnn    # select type of neural network (dnn,cnn1d,cnn2d,lstm),
hid_layers=4        # nr. of hidden layers (before softmax or bottleneck),
hid_dim=1024        # number of neurons per layer,
bn_dim=             # (optional) adds bottleneck and one more hidden layer to the NN,
dbn=                # (optional) prepend layers to the initialized NN,

proto_opts=         # adds options to 'make_nnet_proto.py',
cnn_proto_opts=     # adds options to 'make_cnn_proto.py',

nnet_init=          # (optional) use this pre-initialized NN,
nnet_proto=         # (optional) use this NN prototype for initialization,

# feature processing,
splice=5            # (default) splice features both-ways along time axis,
online_cmvn_opts=   # (optional) adds 'apply-cmvn-online' to input feature pipeline, see opts,
cmvn_opts=          # (optional) adds 'apply-cmvn' to input feature pipeline, see opts,
delta_opts=         # (optional) adds 'add-deltas' to input feature pipeline, see opts,
ivector=            # (optional) adds 'append-vector-to-feats', the option is rx-filename for the 2nd stream,
ivector_append_tool=append-vector-to-feats # (optional) the tool for appending ivectors,

feat_type=plain       # type of input features (plain|traps|transf),
traps_dct_basis=11    # (feat_type=traps) nr. of DCT basis, 11 is good with splice=10,
transf=               # (feat_type=transf) import this linear transform,
splice_after_transf=5 # (feat_type=transf) splice after the linear transform,

feature_transform_proto= # (optional) use this prototype for 'feature_transform',
feature_transform=  # (optional) directly use this 'feature_transform',

# labels,
labels=            # (optional) specify non-default training targets,
                   # (targets need to be in posterior format, see 'ali-to-post', 'feat-to-post'),
num_tgt=           # (optional) specify number of NN outputs, to be used with 'labels=',

# training scheduler,
learn_rate=0.008   # initial learning rate,
scheduler_opts=    # options, passed to the training scheduler,
train_tool=        # optionally change the training tool,
train_tool_opts=   # options for the training tool,
frame_weights=     # per-frame weights for gradient weighting,
utt_weights=       # per-utterance weights (scalar for --frame-weights),

# data processing, misc.
copy_feats=true     # resave the train/cv features into /tmp (enabled by default),
copy_feats_tmproot=/tmp/kaldi.XXXX # sets tmproot for 'copy-feats',
copy_feats_compress=true # compress feats while resaving
feats_std=1.0       # given as '--std-dev' to 'cmvn-to-nnet' in the global normalization,

split_feats=        # split the training data into N portions, one portion will be one 'epoch',
                    # (empty = no splitting)

seed=777            # seed value used for data-shuffling, nn-initialization, and training,
skip_cuda_check=false      # 'true' disables the 'cuda-gpu-available' check,
skip_phoneset_check=false  # 'true' disables the 'check_phones_compatible.sh' checks,

# End configuration.

echo "$0 $@"  # Log the invocation,

# Environment first, then the options given on the command line,
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1

set -euo pipefail

# Six positional arguments are mandatory, otherwise show the help and stop,
if [ $# -ne 6 ]; then
  cat <<EOF
Usage: $0 <data-train> <data-dev> <lang-dir> <ali-train> <ali-dev> <exp-dir>
 e.g.: $0 data/train data/cv data/lang exp/mono_ali_train exp/mono_ali_cv exp/mono_nnet

 Training data : <data-train>,<ali-train> (for optimizing cross-entropy)
 Held-out data : <data-dev>,<ali-dev> (for learn-rate scheduling, model selection)
 note.: <ali-train>,<ali-dev> can point to same directory, or 2 separate directories.

main options (for others, see top of script file)
  --config <config-file>   # config containing options

  --network-type (dnn,cnn1d,cnn2d,lstm)  # type of neural network
  --nnet-proto <file>      # use this NN prototype
  --feature-transform <file> # re-use this input feature transform

  --feat-type (plain|traps|transf) # type of input features
  --cmvn-opts  <string>            # add 'apply-cmvn' to input feature pipeline
  --delta-opts <string>            # add 'add-deltas' to input feature pipeline
  --splice <N>                     # splice +/-N frames of input features

  --learn-rate <float>     # initial leaning-rate
  --copy-feats <bool>      # copy features to /tmp, lowers storage stress

EOF
  exit 1
fi

data=$1
data_cv=$2
lang=$3
alidir=$4
alidir_cv=$5
dir=$6

# Without '--labels' the supervision is taken from the alignment dirs,
# so the model and the alignments have to be there,
if [ -z "$labels" ]; then
  silphonelist=$(cat $lang/phones/silence.csl)
  for f in $alidir/final.mdl $alidir/ali.1.gz $alidir_cv/ali.1.gz; do
    if [ ! -f $f ]; then
      echo "$0: no such file $f"
      exit 1
    fi
  done
fi

# The feature lists of both sets are always needed,
for f in $data/feats.scp $data_cv/feats.scp; do
  if [ ! -f $f ]; then
    echo "$0: no such file $f"
    exit 1
  fi
done

echo
echo "# INFO"
echo "$0 : Training Neural Network"
# The paths are passed as '%s' arguments, so that '%' or '\' in them is printed as it is,
printf "\t dir       : %s \n" "$dir"
printf "\t Train-set : %s %s, %s \n" "$data" "$(cat $data/feats.scp | wc -l)" "$alidir"
printf "\t CV-set    : %s %s %s \n" "$data_cv" "$(cat $data_cv/feats.scp | wc -l)" "$alidir_cv"
echo

mkdir -p $dir/{log,nnet}

# The phone table of the lang dir is compared with both alignment dirs
# and copied into the experiment dir,
if ! $skip_phoneset_check; then
  utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt
  utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir_cv/phones.txt
  cp $lang/phones.txt $dir
fi

# skip when already trained,
if [ -e $dir/final.nnet ]; then
  echo "SKIPPING TRAINING... ($0)"
  echo "nnet already trained : $dir/final.nnet ($(readlink $dir/final.nnet))"
  exit 0
fi

# check if CUDA compiled in and GPU is available,
if ! $skip_cuda_check; then cuda-gpu-available || exit 1; fi

###### PREPARE ALIGNMENTS ######
# Defines the rx-filenames of the training targets 'labels_tr', 'labels_cv'.
# In the default branch it also stores the pdf-counts, the transition model
# and the tree in '$dir'.
echo
echo "# PREPARING ALIGNMENTS"
if [ ! -z "$labels" ]; then
  # targets given by '--labels', the same rx-filename serves both sets,
  echo "Using targets '$labels' (by force)"
  labels_tr="$labels"
  labels_cv="$labels"
else
  echo "Using PDF targets from dirs '$alidir' '$alidir_cv'"
  # training targets in posterior format,
  # (the cv alignments are converted with the model from '$alidir'),
  labels_tr="ark:ali-to-pdf $alidir/final.mdl \"ark:gunzip -c $alidir/ali.*.gz |\" ark:- | ali-to-post ark:- ark:- |"
  labels_cv="ark:ali-to-pdf $alidir/final.mdl \"ark:gunzip -c $alidir_cv/ali.*.gz |\" ark:- | ali-to-post ark:- ark:- |"
  # training targets for analyze-counts,
  labels_tr_pdf="ark:ali-to-pdf $alidir/final.mdl \"ark:gunzip -c $alidir/ali.*.gz |\" ark:- |"
  labels_tr_phn="ark:ali-to-phones --per-frame=true $alidir/final.mdl \"ark:gunzip -c $alidir/ali.*.gz |\" ark:- |"

  # get pdf-counts, used later for decoding/aligning,
  # ('num_pdf' is the 4th field of the 'hmm-info' line containing 'pdfs';
  #  the weight options are added only when the variables are non-empty),
  num_pdf=$(hmm-info $alidir/final.mdl | awk '/pdfs/{print $4}')
  analyze-counts --verbose=1 --binary=false --counts-dim=$num_pdf \
    ${frame_weights:+ "--frame-weights=$frame_weights"} \
    ${utt_weights:+ "--utt-weights=$utt_weights"} \
    "$labels_tr_pdf" $dir/ali_train_pdf.counts 2>$dir/log/analyze_counts_pdf.log
  # copy the old transition model, will be needed by decoder,
  copy-transition-model --binary=false $alidir/final.mdl $dir/final.mdl
  # copy the tree
  cp $alidir/tree $dir/tree

  # make phone counts for analysis,
  # (done only if '$lang/phones.txt' exists, the counts go to /dev/null,
  #  the messages of the tool are kept in the log file),
  [ -e $lang/phones.txt ] && analyze-counts --verbose=1 --symbol-table=$lang/phones.txt --counts-dim=$num_pdf \
    ${frame_weights:+ "--frame-weights=$frame_weights"} \
    ${utt_weights:+ "--utt-weights=$utt_weights"} \
    "$labels_tr_phn" /dev/null 2>$dir/log/analyze_counts_phones.log
fi

###### PREPARE FEATURES ######
echo
echo "# PREPARING FEATURES"
if [ "$copy_feats" == "true" ]; then
  echo "# re-saving features to local disk,"
  tmpdir=$(mktemp -d $copy_feats_tmproot)
  # The cleanup is registered before anything is written, so the tmpdir is
  # removed also when 'copy-feats' fails and 'set -e' terminates the script,
  trap "echo '# Removing features tmpdir $tmpdir @ $(hostname)'; ls $tmpdir; rm -r $tmpdir" EXIT
  copy-feats --compress=$copy_feats_compress scp:$data/feats.scp ark,scp:$tmpdir/train.ark,$dir/train_sorted.scp
  copy-feats --compress=$copy_feats_compress scp:$data_cv/feats.scp ark,scp:$tmpdir/cv.ark,$dir/cv.scp
else
  # or copy the list,
  cp $data/feats.scp $dir/train_sorted.scp
  cp $data_cv/feats.scp $dir/cv.scp
fi
# shuffle the list,
utils/shuffle_list.pl --srand ${seed:-777} <$dir/train_sorted.scp >$dir/train.scp

# create a 10k utt subset for global cmvn estimates,
head -n 10000 $dir/train.scp > $dir/train.scp.10k

# split the list into 'train.1.scp' .. 'train.N.scp' (N = split_feats),
if [ -n "$split_feats" ]; then
  scps= # 1..split_feats,
  for (( ii=1; ii<=$split_feats; ii++ )); do scps="$scps $dir/train.${ii}.scp"; done
  utils/split_scp.pl $dir/train.scp $scps
fi

# for debugging, add lists with non-local features,
utils/shuffle_list.pl --srand ${seed:-777} <$data/feats.scp >$dir/train.scp_non_local
cp $data_cv/feats.scp $dir/cv.scp_non_local

###### OPTIONALLY IMPORT FEATURE SETTINGS (from pre-training) ######
# When a ready 'feature_transform' is given, the feature options stored
# next to it replace the values from the command line.
ivector_dim= # no ivectors,
if [ -n "$feature_transform" ]; then
  D=$(dirname $feature_transform)
  echo "# importing feature settings from dir '$D'"
  [ -e $D/online_cmvn_opts ] && online_cmvn_opts=$(cat $D/online_cmvn_opts)
  [ -e $D/cmvn_opts ] && cmvn_opts=$(cat $D/cmvn_opts)
  [ -e $D/delta_opts ] && delta_opts=$(cat $D/delta_opts)
  [ -e $D/ivector_dim ] && ivector_dim=$(cat $D/ivector_dim)
  [ -e $D/ivector_append_tool ] && ivector_append_tool=$(cat $D/ivector_append_tool)
  echo "# cmvn_opts='$cmvn_opts' delta_opts='$delta_opts' ivector_dim='$ivector_dim'"
fi

###### PREPARE FEATURE PIPELINE ######
# read the features,
feats_tr="ark:copy-feats scp:$dir/train.scp ark:- |"
feats_cv="ark:copy-feats scp:$dir/cv.scp ark:- |"

# optionally add per-speaker CMVN,
if [ -n "$online_cmvn_opts" ] && [ -n "$cmvn_opts" ]; then
  echo "Error: use \$online_cmvn_opts or \$cmvn_opts, not both!"
  exit 1
fi
if [ -n "$online_cmvn_opts" ]; then
  echo "# + 'apply-cmvn-online' with '$online_cmvn_opts' is used,"
  # the global stats are the sum of the per-speaker stats of the training set,
  [ ! -r $data/cmvn.scp ] && echo "Missing $data/cmvn.scp" && exit 1;
  global_cmvn_stats=$dir/global_cmvn_stats.mat
  matrix-sum --binary=false scp:$data/cmvn.scp $global_cmvn_stats
  feats_tr="$feats_tr apply-cmvn-online $online_cmvn_opts $global_cmvn_stats ark:- ark:- |"
  feats_cv="$feats_cv apply-cmvn-online $online_cmvn_opts $global_cmvn_stats ark:- ark:- |"
elif [ -n "$cmvn_opts" ]; then
  echo "# + 'apply-cmvn' with '$cmvn_opts' using statistics : $data/cmvn.scp, $data_cv/cmvn.scp"
  # all four inputs of the two 'apply-cmvn' commands have to be readable,
  for f in $data/cmvn.scp $data_cv/cmvn.scp $data/utt2spk $data_cv/utt2spk; do
    [ ! -r $f ] && echo "Missing $f" && exit 1;
  done
  feats_tr="$feats_tr apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp ark:- ark:- |"
  feats_cv="$feats_cv apply-cmvn $cmvn_opts --utt2spk=ark:$data_cv/utt2spk scp:$data_cv/cmvn.scp ark:- ark:- |"
else
  echo "# 'apply-cmvn' is not used,"
fi

# optionally add deltas,
if [ -n "$delta_opts" ]; then
  feats_tr="$feats_tr add-deltas $delta_opts ark:- ark:- |"
  feats_cv="$feats_cv add-deltas $delta_opts ark:- ark:- |"
  echo "# + 'add-deltas' with '$delta_opts'"
fi

# keep track of the config,
[ -n "$online_cmvn_opts" ] && echo "$online_cmvn_opts" >$dir/online_cmvn_opts
[ -n "$cmvn_opts" ] && echo "$cmvn_opts" >$dir/cmvn_opts
[ -n "$delta_opts" ] && echo "$delta_opts" >$dir/delta_opts
#

# temporary pipeline with first 10k,
feats_tr_10k="${feats_tr/train.scp/train.scp.10k}"

# get feature dim,
feat_dim=$(feat-to-dim "$feats_tr_10k" -)
echo "# feature dim : $feat_dim (input of 'feature_transform')"

# Now we start building 'feature_transform' which goes right in front of a NN.
# The forwarding is computed on a GPU before the frame shuffling is applied.
#
# Same GPU is used both for 'feature_transform' and the NN training.
# So it has to be done by a single process (we are using exclusive mode).
# This also reduces the CPU-GPU uploads/downloads to minimum.

if [ -n "$feature_transform" ]; then
  # A ready transform is copied into the experiment dir and used from there,
  echo "# importing 'feature_transform' from '$feature_transform'"
  tmp=$dir/imported_$(basename $feature_transform)
  cp $feature_transform $tmp; feature_transform=$tmp
else
  # Make default proto with splice,
  if [ -n "$feature_transform_proto" ]; then
    echo "# importing custom 'feature_transform_proto' from '$feature_transform_proto'"
  else
    echo "# + default 'feature_transform_proto' with splice +/-$splice frames,"
    feature_transform_proto=$dir/splice${splice}.proto
    echo "<Splice> <InputDim> $feat_dim <OutputDim> $(((2*splice+1)*feat_dim)) <BuildVector> -$splice:$splice </BuildVector>" >$feature_transform_proto
  fi

  # Initialize 'feature-transform' from a prototype,
  feature_transform=$dir/tr_$(basename $feature_transform_proto .proto).nnet
  nnet-initialize --binary=false $feature_transform_proto $feature_transform

  # Choose further processing of spliced features
  # (each branch appends a suffix to the name of the transform file),
  echo "# feature type : $feat_type"
  case $feat_type in
    plain)
      # nothing is added to the transform made from the prototype,
    ;;
    traps)
      #generate hamming+dct transform
      feature_transform_old=$feature_transform
      feature_transform=${feature_transform%.nnet}_hamm_dct${traps_dct_basis}.nnet
      echo "# + Hamming DCT transform (t$((splice*2+1)),dct${traps_dct_basis}) into '$feature_transform'"
      #prepare matrices with time-transposed hamming and dct
      utils/nnet/gen_hamm_mat.py --fea-dim=$feat_dim --splice=$splice > $dir/hamm.mat
      utils/nnet/gen_dct_mat.py --fea-dim=$feat_dim --splice=$splice --dct-basis=$traps_dct_basis > $dir/dct.mat
      #put everything together
      compose-transforms --binary=false $dir/dct.mat $dir/hamm.mat - | \
        transf-to-nnet - - | \
        nnet-concat --binary=false $feature_transform_old - $feature_transform
    ;;
    transf)
      feature_transform_old=$feature_transform
      feature_transform=${feature_transform%.nnet}_transf_splice${splice_after_transf}.nnet
      # the linear transform defaults to 'final.mat' of the alignment dir,
      [ -z $transf ] && transf=$alidir/final.mat
      [ ! -f $transf ] && echo "Missing transf $transf" && exit 1
      # 'feat_dim' is re-computed at the output of the linear transform,
      # it is the input dim of the 2nd splicing,
      feat_dim=$(feat-to-dim "$feats_tr_10k nnet-forward 'nnet-concat $feature_transform_old \"transf-to-nnet $transf - |\" - |' ark:- ark:- |" -)
      nnet-concat --binary=false $feature_transform_old \
        "transf-to-nnet $transf - |" \
        "utils/nnet/gen_splice.py --fea-dim=$feat_dim --splice=$splice_after_transf |" \
        $feature_transform
    ;;
    *)
      echo "Unknown feature type $feat_type"
      exit 1;
    ;;
  esac

  # keep track of feat_type,
  echo $feat_type > $dir/feat_type

  # Renormalize the MLP input to zero mean and unit variance,
  # (the stats are accumulated on the 10k subset, forwarded through the transform built so far),
  feature_transform_old=$feature_transform
  feature_transform=${feature_transform%.nnet}_cmvn-g.nnet
  echo "# compute normalization stats from 10k sentences"
  nnet-forward --print-args=true --use-gpu=yes $feature_transform_old \
    "$feats_tr_10k" ark:- |\
    compute-cmvn-stats ark:- $dir/cmvn-g.stats
  echo "# + normalization of NN-input at '$feature_transform'"
  nnet-concat --binary=false $feature_transform_old \
    "cmvn-to-nnet --std-dev=$feats_std $dir/cmvn-g.stats -|" $feature_transform
fi

# '$ivector' is an rx-filename and may hold a piped command with spaces,
# it has to be quoted, otherwise the test fails and this section is skipped,
if [ -n "$ivector" ]; then
  echo
  echo "# ADDING IVECTOR FEATURES"
  # The iVectors are concatenated 'as they are' directly to the input of the neural network,
  # To do this, we paste the features, and use <ParallelComponent> where the 1st component
  # contains the transform and 2nd network contains <Copy> component.

  echo "# getting dims,"
  dim_raw=$(feat-to-dim "$feats_tr_10k" -)
  dim_raw_and_ivec=$(feat-to-dim "$feats_tr_10k $ivector_append_tool ark:- '$ivector' ark:- |" -)
  dim_ivec=$((dim_raw_and_ivec - dim_raw))
  echo "# dims, feats-raw $dim_raw, ivectors $dim_ivec,"

  # Should we do something with 'feature_transform'?
  if [ -n "$ivector_dim" ]; then
    # No, the 'ivector_dim' comes from dir with 'feature_transform' with iVec forwarding,
    echo "# assuming we got '$feature_transform' with ivector forwarding,"
    if [ "$ivector_dim" != "$dim_ivec" ]; then
      echo -n "Error, i-vector dimensionality mismatch!"
      echo " (expected $ivector_dim, got $dim_ivec in $ivector)"
      exit 1
    fi
  else
    # Yes, adjust the transform to do ``iVec forwarding'',
    feature_transform_old=$feature_transform
    feature_transform=${feature_transform%.nnet}_ivec_copy.nnet
    echo "# setting up ivector forwarding into '$feature_transform',"
    dim_transformed=$(feat-to-dim "$feats_tr_10k nnet-forward $feature_transform_old ark:- ark:- |" -)
    # 2nd nested network, it only copies the i-vector part of the input,
    nnet-initialize --print-args=false <(echo "<Copy> <InputDim> $dim_ivec <OutputDim> $dim_ivec <BuildVector> 1:$dim_ivec </BuildVector>") $dir/tr_ivec_copy.nnet
    # the old transform and the copy network are nested side by side,
    nnet-initialize --print-args=false <(echo "<ParallelComponent> <InputDim> $((dim_raw+dim_ivec)) <OutputDim> $((dim_transformed+dim_ivec)) \
                                               <NestedNnetFilename> $feature_transform_old $dir/tr_ivec_copy.nnet </NestedNnetFilename>") $feature_transform
  fi
  echo $dim_ivec >$dir/ivector_dim # mark down the iVec dim!
  echo $ivector_append_tool >$dir/ivector_append_tool

  # pasting the iVecs to the features,
  echo "# + ivector input '$ivector'"
  feats_tr="$feats_tr $ivector_append_tool ark:- '$ivector' ark:- |"
  feats_cv="$feats_cv $ivector_append_tool ark:- '$ivector' ark:- |"
fi

###### Show the final 'feature_transform' in the log,
echo
echo "### Showing the final 'feature_transform':"
nnet-info $feature_transform
echo "###"

###### MAKE LINK TO THE FINAL feature_transform, so the other scripts will find it ######
[ -f $dir/final.feature_transform ] && unlink $dir/final.feature_transform
(cd $dir; ln -s $(basename $feature_transform) final.feature_transform )
feature_transform=$dir/final.feature_transform


###### INITIALIZE THE NNET ######
echo
echo "# NN-INITIALIZATION"
if [ ! -z $nnet_init ]; then
  echo "# using pre-initialized network '$nnet_init'"
elif [ ! -z $nnet_proto ]; then
  echo "# initializing NN from prototype '$nnet_proto'";
  nnet_init=$dir/nnet.init; log=$dir/log/nnet_initialize.log
  nnet-initialize --seed=$seed $nnet_proto $nnet_init
else
  echo "# getting input/output dims :"
  # input-dim,
  get_dim_from=$feature_transform
  [ ! -z "$dbn" ] && get_dim_from="nnet-concat $feature_transform '$dbn' -|"
  num_fea=$(feat-to-dim "$feats_tr_10k nnet-forward \"$get_dim_from\" ark:- ark:- |" -)

  # output-dim,
  [ -z $num_tgt ] && \
    num_tgt=$(hmm-info --print-args=false $alidir/final.mdl | grep pdfs | awk '{ print $NF }')

  # make network prototype,
  nnet_proto=$dir/nnet.proto
  echo "# genrating network prototype $nnet_proto"
  case "$network_type" in
    dnn)
      utils/nnet/make_nnet_proto.py $proto_opts \
        ${bn_dim:+ --bottleneck-dim=$bn_dim} \
        $num_fea $num_tgt $hid_layers $hid_dim >$nnet_proto
      ;;
    cnn1d)
      delta_order=$([ -z $delta_opts ] && echo "0" || { echo $delta_opts | tr ' ' '\n' | grep "delta[-_]order" | sed 's:^.*=::'; })
      echo "Debug : $delta_opts, delta_order $delta_order"
      utils/nnet/make_cnn_proto.py $cnn_proto_opts \
        --splice=$splice --delta-order=$delta_order --dir=$dir \
        $num_fea >$nnet_proto
      cnn_fea=$(cat $nnet_proto | grep -v '^$' | tail -n1 | awk '{ print $5; }')
      utils/nnet/make_nnet_proto.py $proto_opts \
        --no-smaller-input-weights \
        ${bn_dim:+ --bottleneck-dim=$bn_dim} \
        "$cnn_fea" $num_tgt $hid_layers $hid_dim >>$nnet_proto
      ;;
    lstm)
      utils/nnet/make_lstm_proto.py $proto_opts \
        $num_fea $num_tgt >$nnet_proto
      ;;
    blstm)
      utils/nnet/make_blstm_proto.py $proto_opts \
        $num_fea $num_tgt >$nnet_proto
      ;;
    *) echo "Unknown : --network-type $network_type" && exit 1;
  esac

  # initialize,
  nnet_init=$dir/nnet.init
  echo "# initializing the NN '$nnet_proto' -> '$nnet_init'"
  nnet-initialize --seed=$seed $nnet_proto $nnet_init

  # optionally prepend dbn to the initialization,
  if [ ! -z "$dbn" ]; then
    nnet_init_old=$nnet_init; nnet_init=$dir/nnet_dbn_dnn.init
    nnet-concat "$dbn" $nnet_init_old $nnet_init
  fi
fi


###### TRAIN ######
echo
echo "# RUNNING THE NN-TRAINING SCHEDULER"
steps/nnet/train_scheduler.sh \
  ${scheduler_opts} \
  ${train_tool:+ --train-tool "$train_tool"} \
  ${train_tool_opts:+ --train-tool-opts "$train_tool_opts"} \
  ${feature_transform:+ --feature-transform $feature_transform} \
  ${split_feats:+ --split-feats $split_feats} \
  --learn-rate $learn_rate \
  ${frame_weights:+ --frame-weights "$frame_weights"} \
  ${utt_weights:+ --utt-weights "$utt_weights"} \
  ${config:+ --config $config} \
  $nnet_init "$feats_tr" "$feats_cv" "$labels_tr" "$labels_cv" $dir

echo "$0: Successfuly finished. '$dir'"

sleep 3
exit 0


================================================
FILE: egs/steps/nnet/train_mmi.sh
================================================
#!/usr/bin/env bash
# Copyright 2013-2015  Brno University of Technology (author: Karel Vesely)
# Apache 2.0.

# Sequence-discriminative MMI/BMMI training of DNN.
# 4 iterations (by default) of Stochastic Gradient Descent with per-utterance updates.
# Boosting of paths with more errors (BMMI) gets activated by '--boost <float>' option.

# For the numerator we have a fixed alignment rather than a lattice--
# this actually follows from the way lattices are defined in Kaldi, which
# is to have a single path for each word (output-symbol) sequence.


# Begin configuration section.
# (the usage message below lists the matching '--option' names),
cmd=run.pl
num_iters=4
boost=0.0              # '0.0' disables the boosting,
acwt=0.1
lmwt=1.0
learn_rate=0.00001
halving_factor=1.0     # '1.0' disables the learn-rate halving,
drop_frames=true
verbose=0              # 0 No GPU time-stats, 1 with GPU time-stats (slower),
ivector=

seed=777               # seed value used for training data shuffling
skip_cuda_check=false
# End configuration section

# Log the command line,
echo "$0 $@"

# Import the environment (optional) and parse the command-line options,
if [ -f ./path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1

set -euo pipefail

# Exactly 6 positional arguments are expected, otherwise print the usage,
if [ $# -ne 6 ]; then
  cat <<EOF
Usage: $0 <data> <lang> <srcdir> <ali> <denlats> <exp>
 e.g.: $0 data/train_all data/lang exp/tri3b_dnn exp/tri3b_dnn_ali exp/tri3b_dnn_denlats exp/tri3b_dnn_mmi
Main options (for others, see top of script file)
  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs.
  --config <config-file>                           # config containing options
  --num-iters <N>                                  # number of iterations to run
  --acwt <float>                                   # acoustic score scaling
  --lmwt <float>                                   # linguistic score scaling
  --learn-rate <float>                             # learning rate for NN training
  --drop-frames <bool>                             # drop frames num/den completely disagree
  --boost <boost-weight>                           # (e.g. 0.1), for boosted MMI.  (default 0)
EOF
  exit 1
fi

# Positional arguments, in the order given by the usage message,
data=$1
lang=$2
srcdir=$3
alidir=$4
denlatdir=$5
dir=$6

# Check that all the required input files exist,
for f in $data/feats.scp $denlatdir/lat.scp \
         $alidir/{tree,final.mdl,ali.1.gz} \
         $srcdir/{final.nnet,final.feature_transform}; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

# check if CUDA compiled in,
if ! $skip_cuda_check; then cuda-compiled || { echo "Error, CUDA not compiled-in!"; exit 1; } fi

mkdir -p $dir/log

# The phone table of 'lang' is checked against both the source model and the alignments,
utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt
utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt
cp $lang/phones.txt $dir

cp $alidir/{final.mdl,tree} $dir

silphonelist=$(cat $lang/phones/silence.csl)


#Get the files we will need
# 'final.nnet' may be a symlink: resolve one level of it. A relative link target
# is relative to '$srcdir', an absolute target must be used as it is.
nnet_link=$(readlink $srcdir/final.nnet || echo final.nnet)
case "$nnet_link" in
  /*) nnet=$nnet_link ;;
  *)  nnet=$srcdir/$nnet_link ;;
esac
# (test the file itself, the variable is never an empty string),
[ ! -f "$nnet" ] && echo "Error nnet '$nnet' does not exist!" && exit 1;
cp $nnet $dir/0.nnet; nnet=$dir/0.nnet

class_frame_counts=$srcdir/ali_train_pdf.counts
[ ! -f "$class_frame_counts" ] && echo "Error class_frame_counts '$class_frame_counts' does not exist!" && exit 1;
cp $srcdir/ali_train_pdf.counts $dir

feature_transform=$srcdir/final.feature_transform
if [ ! -f $feature_transform ]; then
  echo "Missing feature_transform '$feature_transform'"
  exit 1
fi
cp $feature_transform $dir/final.feature_transform

# The transition model was copied from '$alidir' above,
model=$dir/final.mdl
[ ! -f "$model" ] && echo "Error transition model '$model' does not exist!" && exit 1;


# Shuffle the feature list to make the GD stochastic!
# By shuffling features, we have to use lattices with random access (indexed by .scp file).
cat $data/feats.scp | utils/shuffle_list.pl --srand $seed >$dir/train.scp

###
### PREPARE FEATURE EXTRACTION PIPELINE
###
# import config from the source directory, the new-style files ('cmvn_opts',
# 'delta_opts') take precedence over the old-style ones ('norm_vars', 'delta_order'),
cmvn_opts=
delta_opts=
D=$srcdir
[ -e $D/norm_vars ] && cmvn_opts="--norm-means=true --norm-vars=$(cat $D/norm_vars)" # Bwd-compatibility,
[ -e $D/cmvn_opts ] && cmvn_opts=$(cat $D/cmvn_opts)
[ -e $D/delta_order ] && delta_opts="--delta-order=$(cat $D/delta_order)" # Bwd-compatibility,
[ -e $D/delta_opts ] && delta_opts=$(cat $D/delta_opts)
#
# Create the feature stream, each optional stage is appended as one more pipe segment,
feats="ark,o:copy-feats scp:$dir/train.scp ark:- |"
# apply-cmvn (optional),
[ ! -z "$cmvn_opts" -a ! -f $data/cmvn.scp ] && echo "$0: Missing $data/cmvn.scp" && exit 1
[ ! -z "$cmvn_opts" ] && feats="$feats apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp ark:- ark:- |"
# add-deltas (optional),
[ ! -z "$delta_opts" ] && feats="$feats add-deltas $delta_opts ark:- ark:- |"
# add-pytel transform (optional),
[ -e $D/pytel_transform.py ] && feats="$feats /bin/env python $D/pytel_transform.py |"

# add-ivector (optional),
if [ -e $D/ivector_dim ]; then
  # The '$ivector' has to be quoted: it is an rspecifier, which can contain spaces,
  [ -z "$ivector" ] && echo "Missing --ivector, they were used in training!" && exit 1
  # Get the tool,
  ivector_append_tool=append-vector-to-feats # default,
  [ -e $D/ivector_append_tool ] && ivector_append_tool=$(cat $D/ivector_append_tool)
  # Check dims, the i-vector dim is the difference of dims with and without the i-vectors,
  dim_raw=$(feat-to-dim "$feats" -)
  dim_raw_and_ivec=$(feat-to-dim "$feats $ivector_append_tool ark:- '$ivector' ark:- |" -)
  dim_ivec=$((dim_raw_and_ivec - dim_raw))
  [ $dim_ivec != "$(cat $D/ivector_dim)" ] && \
    echo "Error, i-vector dim. mismatch (expected $(cat $D/ivector_dim), got $dim_ivec in '$ivector')" && \
    exit 1
  # Append to feats,
  feats="$feats $ivector_append_tool ark:- '$ivector' ark:- |"
fi

### Record the setup, so the feature pipeline can be rebuilt from '$dir',
[ ! -z "$cmvn_opts" ] && echo $cmvn_opts >$dir/cmvn_opts
[ ! -z "$delta_opts" ] && echo $delta_opts >$dir/delta_opts
[ -e $D/pytel_transform.py ] && cp $D/pytel_transform.py $dir/pytel_transform.py
[ -e $D/ivector_dim ] && cp $D/ivector_dim $dir/ivector_dim
[ -e $D/ivector_append_tool ] && cp $D/ivector_append_tool $dir/ivector_append_tool
###

###
### Prepare the alignments
###
# Assuming all alignments will fit into memory
# (the rspecifier decompresses all the 'ali.*.gz' files of '$alidir' into one stream),
ali="ark:gunzip -c $alidir/ali.*.gz |"


###
### Prepare the lattices
###
# The lattices are indexed by SCP (they are not gzipped because of the random access in SGD)
lats="scp:$denlatdir/lat.scp"

# Optionally apply boosting
# (skipped for both spellings of zero: '0.0' and '0'),
if [[ "$boost" != "0.0" && "$boost" != 0 ]]; then
  # make lattice scp with same order as the shuffled feature scp,
  # (awk reads 2 files, the variable 'r' is re-assigned before each file, so it tells
  #  which file the current line comes from; lat.scp is stored, train.scp sets the order),
  awk '{ if(r==0) { utt_id=$1; latH[$1]=$0; } # lat.scp
         if(r==1) { if(latH[$1] != "") { print latH[$1]; } } # train.scp
  }' r=0 $denlatdir/lat.scp r=1 $dir/train.scp > $dir/lat.scp
  # get the list of alignments,
  # (only the 1st column, the utterance key, is kept),
  ali-to-phones $alidir/final.mdl "$ali" ark,t:- | awk '{print $1;}' > $dir/ali.lst
  # remove from features sentences which have no lattice or no alignment,
  # (so that the mmi training tool does not blow-up due to lattice caching),
  # the unfiltered list is kept as 'train.scp_unfilt',
  mv $dir/train.scp $dir/train.scp_unfilt
  awk '{ if(r==0) { latH[$1]="1"; } # lat.scp
         if(r==1) { aliH[$1]="1"; } # ali.lst
         if(r==2) { if((latH[$1] != "") && (aliH[$1] != "")) { print $0; } } # train.scp_
  }' r=0 $dir/lat.scp r=1 $dir/ali.lst r=2 $dir/train.scp_unfilt > $dir/train.scp
  # create the lat pipeline,
  # (replaces the plain 'scp:' lattice input by the output of 'lattice-boost-ali'),
  lats="ark,o:lattice-boost-ali --b=$boost --silence-phones=$silphonelist $alidir/final.mdl scp:$dir/lat.scp '$ali' ark:- |"
fi
###
###
###

# Run several iterations of the MMI/BMMI training
# (each pass starts from the network produced by the previous pass),
cur_mdl=$nnet
x=1
while [ $x -le $num_iters ]; do
  echo "Pass $x (learnrate $learn_rate)"
  if [ -f $dir/$x.nnet ]; then
    # allows to resume an interrupted training,
    echo "Skipped, file $dir/$x.nnet exists"
  else
    $cmd $dir/log/mmi.$x.log \
     nnet-train-mmi-sequential \
       --feature-transform=$feature_transform \
       --class-frame-counts=$class_frame_counts \
       --acoustic-scale=$acwt \
       --lm-scale=$lmwt \
       --learn-rate=$learn_rate \
       --drop-frames=$drop_frames \
       --verbose=$verbose \
       $cur_mdl $alidir/final.mdl "$feats" "$lats" "$ali" $dir/$x.nnet
  fi
  cur_mdl=$dir/$x.nnet

  #report the progress
  # (the report is informative only: with 'set -e -o pipefail' a 'grep' without
  #  a match would otherwise terminate the whole training),
  grep -B 2 MMI-objective $dir/log/mmi.$x.log | sed -e 's|^[^)]*)[^)]*)||' || \
    echo "$0: Warning, no 'MMI-objective' found in $dir/log/mmi.$x.log"

  x=$((x+1))
  learn_rate=$(awk "BEGIN{print($learn_rate*$halving_factor)}")

done

# Link the network from the last pass as 'final.nnet',
# ('rm -f' removes also a dangling symlink, which the '[ -e final.nnet ]' test does not see),
(cd $dir; rm -f final.nnet; ln -s $((x-1)).nnet final.nnet)

echo "MMI/BMMI training finished"

if [ -e $dir/prior_counts ]; then
  echo "Priors are already re-estimated, skipping... ($dir/prior_counts)"
else
  echo "Re-estimating priors by forwarding 10k utterances from training set."
  # the '$train_cmd' is expected to be defined by './cmd.sh',
  . ./cmd.sh
  nj=$(cat $alidir/num_jobs)
  steps/nnet/make_priors.sh --cmd "$train_cmd" --nj $nj \
    ${ivector:+ --ivector "$ivector"} $data $dir
fi

echo "$0: Done. '$dir'"
exit 0


================================================
FILE: egs/steps/nnet/train_mpe.sh
================================================
#!/usr/bin/env bash
# Copyright 2013-2017  Brno University of Technology (author: Karel Vesely)
# Apache 2.0.

# Sequence-discriminative MPE/sMBR training of DNN.
# 4 iterations (by default) of Stochastic Gradient Descent with per-utterance updates.
# We select between MPE/sMBR optimization by '--do-smbr <bool>' option.

# For the numerator we have a fixed alignment rather than a lattice--
# this actually follows from the way lattices are defined in Kaldi, which
# is to have a single path for each word (output-symbol) sequence.


# Begin configuration section.
# (the usage message below lists the matching '--option' names),
cmd=run.pl
num_iters=4
acwt=0.1
lmwt=1.0
learn_rate=0.00001
momentum=0.0
halving_factor=1.0     # '1.0' disables the learn-rate halving,
do_smbr=true
one_silence_class=true # if true : all the `silphones' are mapped to a single class in the Forward-backward of sMBR/MPE,
                       # (this prevents the sMBR from WER explosion, which was happening with some data).
                       # if false : the silphone-frames are always counted as 'wrong' in the calculation of the approximate accuracies,
silphonelist=          # this overrides default silphone-list (for selecting a subset of sil-phones)

unkphonelist=          # dummy deprecated option, for backward compatibility,
exclude_silphones=     # dummy deprecated option, for backward compatibility,

verbose=0              # 0 No GPU time-stats, 1 with GPU time-stats (slower),
ivector=
nnet=                  # For non-default location of nnet,

seed=777               # seed value used for training data shuffling
skip_cuda_check=false
# End configuration section

# Log the command line,
echo "$0 $@"

# Import the environment (optional) and parse the command-line options,
if [ -f ./path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1

set -euo pipefail

# Exactly 6 positional arguments are expected, otherwise print the usage,
if [ $# -ne 6 ]; then
  cat <<EOF
Usage: $0 <data> <lang> <srcdir> <ali> <denlats> <exp>
 e.g.: $0 data/train_all data/lang exp/tri3b_dnn exp/tri3b_dnn_ali exp/tri3b_dnn_denlats exp/tri3b_dnn_smbr
Main options (for others, see top of script file)
  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs.
  --config <config-file>                           # config containing options
  --num-iters <N>                                  # number of iterations to run
  --acwt <float>                                   # acoustic score scaling
  --lmwt <float>                                   # linguistic score scaling
  --learn-rate <float>                             # learning rate for NN training
  --do-smbr <bool>                                 # do sMBR training, otherwise MPE
EOF
  exit 1
fi

# Positional arguments, in the order given by the usage message,
data=$1
lang=$2
srcdir=$3
alidir=$4
denlatdir=$5
dir=$6

# Check that all the required input files exist,
for f in $data/feats.scp $denlatdir/lat.scp \
         $alidir/{tree,final.mdl,ali.1.gz} \
         $srcdir/{final.nnet,final.feature_transform}; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

# check if CUDA compiled in,
if ! $skip_cuda_check; then cuda-compiled || { echo "Error, CUDA not compiled-in!"; exit 1; } fi

mkdir -p $dir/log

# The phone table of 'lang' is checked against both the source model and the alignments,
utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt
utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt
cp $lang/phones.txt $dir

cp $alidir/{final.mdl,tree} $dir

[ -z "$silphonelist" ] && silphonelist=$(cat $lang/phones/silence.csl) # Default 'silphonelist',

#Get the files we will need
if [ -z "$nnet" ]; then
  # No '--nnet' was given, use 'final.nnet' from '$srcdir'. It may be a symlink:
  # resolve one level of it. A relative link target is relative to '$srcdir',
  # an absolute target must be used as it is.
  nnet_link=$(readlink $srcdir/final.nnet || echo final.nnet)
  case "$nnet_link" in
    /*) nnet=$nnet_link ;;
    *)  nnet=$srcdir/$nnet_link ;;
  esac
fi
# (test the file itself, the variable is never an empty string at this point),
[ ! -f "$nnet" ] && echo "Error nnet '$nnet' does not exist!" && exit 1;
cp $nnet $dir/0.nnet; nnet=$dir/0.nnet

class_frame_counts=$srcdir/ali_train_pdf.counts
[ ! -f "$class_frame_counts" ] && echo "Error class_frame_counts '$class_frame_counts' does not exist!" && exit 1;
cp $srcdir/ali_train_pdf.counts $dir

feature_transform=$srcdir/final.feature_transform
if [ ! -f $feature_transform ]; then
  echo "Missing feature_transform '$feature_transform'"
  exit 1
fi
cp $feature_transform $dir/final.feature_transform

# The transition model was copied from '$alidir' above,
model=$dir/final.mdl
[ ! -f "$model" ] && echo "Error transition model '$model' does not exist!" && exit 1;

# Shuffle the feature list to make the GD stochastic!
# By shuffling features, we have to use lattices with random access (indexed by .scp file).
cat $data/feats.scp | utils/shuffle_list.pl --srand $seed > $dir/train.scp

# The deprecated options are accepted, they only produce a warning,
[ -n "$unkphonelist" ] && echo "WARNING: The option '--unkphonelist' is now deprecated. Please remove it from your recipe..."
[ -n "$exclude_silphones" ] && echo "WARNING: The option '--exclude-silphones' is now deprecated. Please remove it from your recipe..."

###
### PREPARE FEATURE EXTRACTION PIPELINE
###
# import config from the source directory, the new-style files ('cmvn_opts',
# 'delta_opts') take precedence over the old-style ones ('norm_vars', 'delta_order'),
cmvn_opts=
delta_opts=
D=$srcdir
[ -e $D/norm_vars ] && cmvn_opts="--norm-means=true --norm-vars=$(cat $D/norm_vars)" # Bwd-compatibility,
[ -e $D/cmvn_opts ] && cmvn_opts=$(cat $D/cmvn_opts)
[ -e $D/delta_order ] && delta_opts="--delta-order=$(cat $D/delta_order)" # Bwd-compatibility,
[ -e $D/delta_opts ] && delta_opts=$(cat $D/delta_opts)
#
# Create the feature stream, each optional stage is appended as one more pipe segment,
feats="ark,o:copy-feats scp:$dir/train.scp ark:- |"
# apply-cmvn (optional),
[ ! -z "$cmvn_opts" -a ! -f $data/cmvn.scp ] && echo "$0: Missing $data/cmvn.scp" && exit 1
[ ! -z "$cmvn_opts" ] && feats="$feats apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp ark:- ark:- |"
# add-deltas (optional),
[ ! -z "$delta_opts" ] && feats="$feats add-deltas $delta_opts ark:- ark:- |"
# add-pytel transform (optional),
[ -e $D/pytel_transform.py ] && feats="$feats /bin/env python $D/pytel_transform.py |"

# add-ivector (optional),
if [ -e $D/ivector_dim ]; then
  # The '$ivector' has to be quoted: it is an rspecifier, which can contain spaces,
  [ -z "$ivector" ] && echo "Missing --ivector, they were used in training!" && exit 1
  # Get the tool,
  ivector_append_tool=append-vector-to-feats # default,
  [ -e $D/ivector_append_tool ] && ivector_append_tool=$(cat $D/ivector_append_tool)
  # Check dims, the i-vector dim is the difference of dims with and without the i-vectors,
  dim_raw=$(feat-to-dim "$feats" -)
  dim_raw_and_ivec=$(feat-to-dim "$feats $ivector_append_tool ark:- '$ivector' ark:- |" -)
  dim_ivec=$((dim_raw_and_ivec - dim_raw))
  [ $dim_ivec != "$(cat $D/ivector_dim)" ] && \
    echo "Error, i-vector dim. mismatch (expected $(cat $D/ivector_dim), got $dim_ivec in '$ivector')" && \
    exit 1
  # Append to feats,
  feats="$feats $ivector_append_tool ark:- '$ivector' ark:- |"
fi

### Record the setup, so the feature pipeline can be rebuilt from '$dir',
[ ! -z "$cmvn_opts" ] && echo $cmvn_opts >$dir/cmvn_opts
[ ! -z "$delta_opts" ] && echo $delta_opts >$dir/delta_opts
[ -e $D/pytel_transform.py ] && cp {$D,$dir}/pytel_transform.py
[ -e $D/ivector_dim ] && cp {$D,$dir}/ivector_dim
[ -e $D/ivector_append_tool ] && cp $D/ivector_append_tool $dir/ivector_append_tool
###

###
### Prepare the alignments
###
# Assuming all alignments will fit into memory
# (the rspecifier decompresses all the 'ali.*.gz' files of '$alidir' into one stream),
ali="ark:gunzip -c $alidir/ali.*.gz |"


###
### Prepare the lattices
###
# The lattices are indexed by SCP (they are not gzipped because of the random access in SGD)
lats="scp:$denlatdir/lat.scp"


# Run several iterations of the MPE/sMBR training
# (each pass starts from the network produced by the previous pass),
cur_mdl=$nnet
x=1
while [ $x -le $num_iters ]; do
  echo "Pass $x (learnrate $learn_rate)"
  if [ -f $dir/$x.nnet ]; then
    # allows to resume an interrupted training,
    echo "Skipped, file $dir/$x.nnet exists"
  else
    #train
    $cmd $dir/log/mpe.$x.log \
     nnet-train-mpe-sequential \
       --feature-transform=$feature_transform \
       --class-frame-counts=$class_frame_counts \
       --acoustic-scale=$acwt \
       --lm-scale=$lmwt \
       --learn-rate=$learn_rate \
       --momentum=$momentum \
       --do-smbr=$do_smbr \
       --verbose=$verbose \
       --one-silence-class=$one_silence_class \
       ${silphonelist:+ --silence-phones=$silphonelist} \
       $cur_mdl $alidir/final.mdl "$feats" "$lats" "$ali" $dir/$x.nnet
  fi
  cur_mdl=$dir/$x.nnet

  #report the progress
  # (the report is informative only: with 'set -e -o pipefail' a 'grep' without
  #  a match would otherwise terminate the whole training),
  grep -B 2 "Overall average frame-accuracy" $dir/log/mpe.$x.log | sed -e 's|.*)||' || \
    echo "$0: Warning, no 'Overall average frame-accuracy' found in $dir/log/mpe.$x.log"

  x=$((x+1))
  learn_rate=$(awk "BEGIN{print($learn_rate*$halving_factor)}")

done

# Link the network from the last pass as 'final.nnet',
# ('rm -f' removes also a dangling symlink, which the '[ -e final.nnet ]' test does not see),
(cd $dir; rm -f final.nnet; ln -s $((x-1)).nnet final.nnet)


echo "MPE/sMBR training finished"

if [ -e $dir/prior_counts ]; then
  echo "Priors are already re-estimated, skipping... ($dir/prior_counts)"
else
  echo "Re-estimating priors by forwarding 10k utterances from training set."
  # the '$train_cmd' is expected to be defined by './cmd.sh',
  . ./cmd.sh
  nj=$(cat $alidir/num_jobs)
  steps/nnet/make_priors.sh --cmd "$train_cmd" --nj $nj \
    ${ivector:+ --ivector "$ivector"} $data $dir
fi

echo "$0: Done. '$dir'"
exit 0


================================================
FILE: egs/steps/nnet/train_scheduler.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2017  Brno University of Technology (author: Karel Vesely)
# Apache 2.0

# Schedules epochs and controls learning rate during the neural network training

# Begin configuration.

# training options,
learn_rate=0.008
momentum=0
l1_penalty=0
l2_penalty=0

# data processing,
train_tool="nnet-train-frmshuff"
train_tool_opts="--minibatch-size=256 --randomizer-size=32768 --randomizer-seed=777"
feature_transform=

split_feats= # int -> number of splits 'feats.scp -> feats.${i}.scp', starting from feats.1.scp,
             # (data are already shuffled and split to N parts),
             # empty -> no splitting,

# learn rate scheduling,
max_iters=20
min_iters=0 # keep training, disable weight rejection, start learn-rate halving as usual,
keep_lr_iters=0 # fix learning rate for N initial epochs, disable weight rejection,
dropout_schedule= # dropout-rates for N initial epochs, for example: 0.1,0.1,0.1,0.1,0.1,0.0
start_halving_impr=0.01
end_halving_impr=0.001
halving_factor=0.5

# misc,
verbose=0 # 0 No GPU time-stats, 1 with GPU time-stats (slower),
frame_weights=
utt_weights=

# End configuration.

# Log the command line,
echo "$0 $@"

# Import the environment (optional) and parse the command-line options,
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1

set -euo pipefail

# Exactly 6 positional arguments are expected, otherwise print the usage,
if [ $# -ne 6 ]; then
  cat <<EOF
Usage: $0 <mlp-init> <feats-tr> <feats-cv> <labels-tr> <labels-cv> <exp-dir>
 e.g.: $0 0.nnet scp:train.scp scp:cv.scp ark:labels_tr.ark ark:labels_cv.ark exp/dnn1
main options (for others, see top of script file)
  --config <config-file>  # config containing options
EOF
  exit 1
fi

# Positional arguments, in the order given by the usage message,
mlp_init=$1
feats_tr=$2
feats_cv=$3
labels_tr=$4
labels_cv=$5
dir=$6

# Create the output directories, if they are missing,
if [ ! -d $dir ]; then mkdir $dir; fi
if [ ! -d $dir/log ]; then mkdir $dir/log; fi
if [ ! -d $dir/nnet ]; then mkdir $dir/nnet; fi

# Split the comma-separated dropout schedule into an array (one rate per epoch),
dropout_array=(${dropout_schedule//,/ })

# Skip training
if [ -e $dir/final.nnet ]; then
  echo "'$dir/final.nnet' exists, skipping training"
  exit 0
fi

##############################
# start training

# choose mlp to start with,
# ('mlp_base' is the file name of the initial network, without directory and last suffix),
mlp_best=$mlp_init
mlp_base=${mlp_init##*/}
mlp_base=${mlp_base%.*}

# optionally resume training from the best epoch, using saved learning-rate,
if [ -e $dir/.mlp_best ]; then mlp_best=$(cat $dir/.mlp_best); fi
if [ -e $dir/.learn_rate ]; then learn_rate=$(cat $dir/.learn_rate); fi

# cross-validation on original network,
# (the log starts with the hostname, the stderr of the tool is appended),
log=$dir/log/iter00.initial.log
hostname >$log
$train_tool --cross-validate=true --randomize=false --verbose=$verbose $train_tool_opts \
  ${feature_transform:+ --feature-transform=$feature_transform} \
  ${frame_weights:+ "--frame-weights=$frame_weights"} \
  ${utt_weights:+ "--utt-weights=$utt_weights"} \
  "$feats_cv" "$labels_cv" $mlp_best \
  2>> $log

# The initial loss and its type are the 4th and 5th field of the last 'AvgLoss:' line,
loss=$(grep "AvgLoss:" $dir/log/iter00.initial.log | tail -n 1 | awk '{ print $4; }')
loss_type=$(grep "AvgLoss:" $dir/log/iter00.initial.log | tail -n 1 | awk '{ print $5; }')
echo "CROSSVAL PRERUN AVG.LOSS $(printf "%.4f" $loss) $loss_type"

# resume lr-halving,
halving=0
if [ -e $dir/.halving ]; then halving=$(cat $dir/.halving); fi
# training,
# One pass of the loop is one epoch: train, cross-validate, accept or reject the
# new network, then update the learning rate or stop.
for iter in $(seq -w $max_iters); do
  echo -n "ITERATION $iter: "
  mlp_next=$dir/nnet/${mlp_base}_iter${iter}

  # 'seq -w' pads the epoch number with zeros (i.e. '08'), the shell arithmetic
  # would read such a string as an (invalid) octal number. Get the base-10 value,
  # '$iter' is still used for the file names and for the '[ ... -le ... ]' tests.
  iter_num=$((10#$iter))

  # skip iteration (epoch) if already done,
  [ -e $dir/.done_iter$iter ] && echo -n "skipping... " && ls $mlp_next* && continue

  # set dropout-rate from the schedule,
  # (empty for the epochs beyond the end of the schedule),
  dropout_rate=${dropout_array[$((iter_num-1))]-}
  if [ -n "$dropout_rate" ]; then
    nnet-copy --dropout-rate=$dropout_rate $mlp_best ${mlp_best}.dropout_rate${dropout_rate}
    mlp_best=${mlp_best}.dropout_rate${dropout_rate}
  fi

  # select the split,
  feats_tr_portion="$feats_tr" # no split?
  if [ -n "$split_feats" ]; then
    # cycle over the splits 1..split_feats,
    portion=$((1 + iter_num % split_feats))
    feats_tr_portion="${feats_tr/train.scp/train.${portion}.scp}"
  fi

  # training,
  log=$dir/log/iter${iter}.tr.log; hostname>$log
  $train_tool --cross-validate=false --randomize=true --verbose=$verbose $train_tool_opts \
    --learn-rate=$learn_rate --momentum=$momentum \
    --l1-penalty=$l1_penalty --l2-penalty=$l2_penalty \
    ${feature_transform:+ --feature-transform=$feature_transform} \
    ${frame_weights:+ "--frame-weights=$frame_weights"} \
    ${utt_weights:+ "--utt-weights=$utt_weights"} \
    "$feats_tr_portion" "$labels_tr" $mlp_best $mlp_next \
    2>> $log || exit 1;

  # the loss is the 4th field of the last 'AvgLoss:' line of the log,
  tr_loss=$(cat $dir/log/iter${iter}.tr.log | grep "AvgLoss:" | tail -n 1 | awk '{ print $4; }')
  echo -n "TRAIN AVG.LOSS $(printf "%.4f" $tr_loss), (lrate$(printf "%.6g" $learn_rate)), "

  # cross-validation,
  log=$dir/log/iter${iter}.cv.log; hostname>$log
  $train_tool --cross-validate=true --randomize=false --verbose=$verbose $train_tool_opts \
    ${feature_transform:+ --feature-transform=$feature_transform} \
    ${frame_weights:+ "--frame-weights=$frame_weights"} \
    ${utt_weights:+ "--utt-weights=$utt_weights"} \
    "$feats_cv" "$labels_cv" $mlp_next \
    2>>$log || exit 1;

  loss_new=$(cat $dir/log/iter${iter}.cv.log | grep "AvgLoss:" | tail -n 1 | awk '{ print $4; }')
  echo -n "CROSSVAL AVG.LOSS $(printf "%.4f" $loss_new), "

  # accept or reject?
  loss_prev=$loss
  if [ 1 == $(awk "BEGIN{print($loss_new < $loss ? 1:0);}") -o $iter -le $keep_lr_iters -o $iter -le $min_iters ]; then
    # accepting: the loss was better, or we had fixed learn-rate, or we had fixed epoch-number,
    loss=$loss_new
    mlp_best=$dir/nnet/${mlp_base}_iter${iter}_learnrate${learn_rate}_tr$(printf "%.4f" $tr_loss)_cv$(printf "%.4f" $loss_new)
    [ $iter -le $min_iters ] && mlp_best=${mlp_best}_min-iters-$min_iters
    [ $iter -le $keep_lr_iters ] && mlp_best=${mlp_best}_keep-lr-iters-$keep_lr_iters
    mv $mlp_next $mlp_best
    echo "nnet accepted ($(basename $mlp_best))"
    # (stored for resuming of the training),
    echo $mlp_best > $dir/.mlp_best
  else
    # rejecting, the next epoch starts from the previous best network,
    mlp_reject=$dir/nnet/${mlp_base}_iter${iter}_learnrate${learn_rate}_tr$(printf "%.4f" $tr_loss)_cv$(printf "%.4f" $loss_new)_rejected
    mv $mlp_next $mlp_reject
    echo "nnet rejected ($(basename $mlp_reject))"
  fi

  # create .done file, the iteration (epoch) is completed,
  touch $dir/.done_iter$iter

  # continue with original learn-rate,
  [ $iter -le $keep_lr_iters ] && continue

  # stopping criterion,
  # (relative improvement of the cross-validation loss, it is 0 after a rejection),
  rel_impr=$(awk "BEGIN{print(($loss_prev-$loss)/$loss_prev);}")
  if [ 1 == $halving -a 1 == $(awk "BEGIN{print($rel_impr < $end_halving_impr ? 1:0);}") ]; then
    if [ $iter -le $min_iters ]; then
      echo we were supposed to finish, but we continue as min_iters : $min_iters
      continue
    fi
    echo finished, too small rel. improvement $rel_impr
    break
  fi

  # start learning-rate fade-out when improvement is low,
  if [ 1 == $(awk "BEGIN{print($rel_impr < $start_halving_impr ? 1:0);}") ]; then
    halving=1
    echo $halving >$dir/.halving
  fi

  # reduce the learning-rate,
  if [ 1 == $halving ]; then
    learn_rate=$(awk "BEGIN{print($learn_rate*$halving_factor)}")
    echo $learn_rate >$dir/.learn_rate
  fi
done

# select the best network,
# (it is an error, if the best network is still the initial one),
if [ $mlp_best == $mlp_init ]; then
  echo "$0: Error training neural network..."
  exit 1
fi

# Two relative symlinks are created:
# 'nnet/<best>_final_' -> '<best>', and 'final.nnet' -> 'nnet/<best>_final_',
mlp_final=${mlp_best}_final_
best_name=$(basename $mlp_best)
final_name=$(basename $mlp_final)
( cd $dir/nnet; ln -s $best_name $final_name; )
( cd $dir; ln -s nnet/$final_name final.nnet; )
echo "$0: Succeeded training the Neural Network : '$dir/final.nnet'"


================================================
FILE: egs/steps/nnet2/adjust_priors.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2014  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
# Copyright (c) 2015, Johns Hopkins University (Yenda Trmal <jtrmal@gmail.com>)
# License: Apache 2.0

# Begin configuration section.
cmd=run.pl
iter=final   # the model '$dir/${iter}.mdl' gets adjusted,
# End configuration section


# Log the command line,
echo "$0 $@"

[ -f path.sh ] && . ./path.sh
. parse_options.sh || exit 1

# Exactly 2 positional arguments are expected, otherwise print the usage,
if [ $# -ne 2 ]; then
  cat <<EOF
Usage: $0 [opts] <degs-dir> <nnet-dir>
 e.g.: $0 exp/tri4_mpe_degs exp/tri4_mpe

Performs priors adjustment either on the final iteration
or iteration of choice of the training. The adjusted model
filename will be suffixed by "adj", i.e. for the final
iteration final.mdl will become final.adj.mdl

Main options (for others, see top of script file)
  --config <config-file>                           # config file containing options
  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs.
  --iter <iteration|final>                         # which iteration to be adjusted
EOF
  exit 1
fi

degs_dir=$1
dir=$2

src_model=$dir/${iter}.mdl

# Both the model and the 1st archive with the examples for priors have to exist,
for required in $src_model $degs_dir/priors_egs.1.ark; do
  if [ ! -f $required ]; then
    echo "$0: Expecting $required to exist."
    exit 1
  fi
done

# The number of 'priors_egs.*.ark' archives, one parallel job is run per archive,
if ! num_archives_priors=$(cat $degs_dir/info/num_archives_priors); then
  echo "Could not find $degs_dir/info/num_archives_priors."
  exit 1
fi

# Per archive: compute the network outputs, sum them over the rows of the matrices,
# and sum the resulting vectors into a single vector 'post.${iter}.JOB.vec',
if ! $cmd JOB=1:$num_archives_priors $dir/log/get_post.${iter}.JOB.log \
  nnet-compute-from-egs "nnet-to-raw-nnet $src_model -|" \
  ark:$degs_dir/priors_egs.JOB.ark ark:- \| \
  matrix-sum-rows ark:- ark:- \| \
  vector-sum ark:- $dir/post.${iter}.JOB.vec; then
  echo "Error in getting posteriors for adjusting priors."
  echo "See $dir/log/get_post.${iter}.*.log"
  exit 1
fi


# Sum the per-job vectors into 'post.${iter}.vec',
if ! $cmd $dir/log/sum_post.${iter}.log \
  vector-sum $dir/post.${iter}.*.vec $dir/post.${iter}.vec; then
  echo "Error in summing posteriors. See $dir/log/sum_post.${iter}.log"
  exit 1
fi

# The per-job vectors are not needed anymore,
rm -f $dir/post.${iter}.*.vec

echo "Re-adjusting priors based on computed posteriors for iter $iter"
if ! $cmd $dir/log/adjust_priors.${iter}.log \
  nnet-adjust-priors $src_model $dir/post.${iter}.vec $dir/${iter}.adj.mdl; then
  echo "Error in adjusting priors. See $dir/log/adjust_priors.${iter}.log"
  exit 1
fi

echo "Done adjusting priors (on $src_model)"


================================================
FILE: egs/steps/nnet2/align.sh
================================================
#!/usr/bin/env bash
# Copyright 2012  Brno University of Technology (Author: Karel Vesely)
#           2013  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0

# Computes training alignments using DNN

# Begin configuration section.
# Defaults for the options of this script; they are set before
# parse_options.sh is sourced below (the usage text lists --config, --nj
# and --cmd as the main ones).
cmd=run.pl            # job dispatcher used for the alignment stage
nj=4                  # number of parallel jobs / data splits
iter=final            # aligns with $srcdir/<iter>.mdl
use_gpu=no            # forwarded as --use-gpu to nnet-align-compiled
beam=10               # forwarded as --beam
retry_beam=40         # forwarded as --retry-beam
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
transform_dir=        # if set, per-speaker transforms are read from here
online_ivector_dir=   # if set, online iVectors are appended to the features
feat_type=            # raw or lda; chosen from $srcdir/final.mat when empty
# End configuration options.

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then
  . ./path.sh  # source the path.
fi
. parse_options.sh || exit 1

if [ $# -ne 4 ]; then
   echo "Usage: $0 [--transform-dir <transform-dir>] <data-dir> <lang-dir> <src-dir> <align-dir>"
   echo "e.g.: $0 data/train data/lang exp/nnet4 exp/nnet4_ali"
   echo "main options (for others, see top of script file)"
   echo "  --config <config-file>                           # config containing options"
   echo "  --nj <nj>                                        # number of parallel jobs"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   exit 1
fi

# Positional arguments.
data=$1
lang=$2
srcdir=$3
dir=$4

# Integer id of the OOV word, used when mapping transcripts to integers.
oov=$(cat $lang/oov.int) || exit 1
mkdir -p $dir/log
echo $nj > $dir/num_jobs
sdata=$data/split$nj
# Split the data unless a split directory already exists and feats.scp is
# older than it.
if ! [[ -d $sdata && $data/feats.scp -ot $sdata ]]; then
  split_data.sh $data $nj || exit 1
fi


# Check that every input file is present before starting.  The two iVector
# files are only required when online_ivector_dir is set.
extra_files=
if [ -n "$online_ivector_dir" ]; then
  extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"
fi
for required in $srcdir/tree $srcdir/${iter}.mdl $data/feats.scp $lang/L.fst $extra_files; do
  if [ ! -f $required ]; then
    echo "$0: no such file $required" && exit 1
  fi
done

# The phone table of the lang directory must be compatible with the one in
# the model directory; a copy is kept in the output directory.
if ! utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt; then
  exit 1
fi
cp $lang/phones.txt $dir || exit 1

# Keep the tree and the model that was used next to the alignments.
cp $srcdir/tree $srcdir/${iter}.mdl $dir || exit 1


## Set up features.  $feats is built up as one piped rspecifier string; the
## literal token JOB inside it refers to the per-job split directory
## $sdata/JOB (presumably substituted per job by $cmd, see the JOB=1:$nj
## invocation further down).
# If the caller did not force a feature type, pick lda when the model
# directory has an LDA matrix (final.mat) and raw otherwise.
if [ -z "$feat_type" ]; then
  if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=raw; fi
fi
echo "$0: feature type is $feat_type"

# cmvn_opts/splice_opts are optional: a missing file yields an empty option
# string and the copy errors are discarded.
cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
cp $srcdir/cmvn_opts $srcdir/splice_opts $dir 2>/dev/null

case $feat_type in
  # raw: CMVN only.
  raw) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"
   ;;
  # lda: CMVN, then frame splicing, then the LDA transform.  final.mat is
  # copied into $dir and the pipeline reads that copy.
  lda)
    splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
    cp $srcdir/splice_opts $dir 2>/dev/null
    cp $srcdir/final.mat $dir || exit 1;
    feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
    ;;
  *) echo "$0: invalid feature type $feat_type" && exit 1;
esac

# Optional per-speaker transforms from --transform-dir, appended to $feats.
if [ ! -z "$transform_dir" ]; then
  echo "$0: using transforms from $transform_dir"
  [ ! -s $transform_dir/num_jobs ] && \
    echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1;
  nj_orig=$(cat $transform_dir/num_jobs)

  # The transform files are named raw_trans.N for raw features, trans.N otherwise.
  if [ $feat_type == "raw" ]; then trans=raw_trans;
  else trans=trans; fi
  # For LDA features the transforms are only valid if they were estimated on
  # top of the same LDA matrix as the model uses.
  if [ $feat_type == "lda" ] && ! cmp $transform_dir/final.mat $srcdir/final.mat; then
    echo "$0: LDA transforms differ between $srcdir and $transform_dir"
    exit 1;
  fi
  if [ ! -f $transform_dir/$trans.1 ]; then
    echo "$0: expected $transform_dir/$trans.1 to exist (--transform-dir option)"
    exit 1;
  fi
  if [ $nj -ne $nj_orig ]; then
    # Copy the transforms into an archive with an index.
    # (The job counts differ, so trans.JOB would not line up with our splits;
    # the scp index lets each job look up the transforms it needs.)
    for n in $(seq $nj_orig); do cat $transform_dir/$trans.$n; done | \
       copy-feats ark:- ark,scp:$dir/$trans.ark,$dir/$trans.scp || exit 1;
    feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/$trans.scp ark:- ark:- |"
  else
    # number of jobs matches with alignment dir.
    feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/$trans.JOB ark:- ark:- |"
  fi
fi

# Optional online iVectors: filtered to the utterances of the job, repeated
# ivector_period times and pasted onto the features.
if [ ! -z "$online_ivector_dir" ]; then
  ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1;
  # note: subsample-feats, with negative n, will repeat each feature -n times.
  feats="$feats paste-feats --length-tolerance=$ivector_period ark:- 'ark,s,cs:utils/filter_scp.pl $sdata/JOB/utt2spk $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |' ark:- |"
fi

echo "$0: aligning data in $data using model from $srcdir, putting alignments in $dir"

# Per-job transcripts mapped to integer word ids; words that are not in
# words.txt are mapped to $oov.
transcripts="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|"
model=$srcdir/${iter}.mdl

# Compile the training graphs and align them against the network output in
# one pipeline; the alignments are written gzipped, one file per job.
$cmd JOB=1:$nj $dir/log/align.JOB.log \
  compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int \
    $dir/tree $model $lang/L.fst "$transcripts" ark:- \| \
  nnet-align-compiled $scale_opts \
    --use-gpu=$use_gpu --beam=$beam --retry-beam=$retry_beam \
    $model ark:- "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1

# Diagnostics on the new alignments; its exit status is not checked.
steps/diagnostic/analyze_alignments.sh --cmd "$cmd" $lang $dir

echo "$0: done aligning data."


================================================
FILE: egs/steps/nnet2/check_ivectors_compatible.sh
================================================
#!/usr/bin/env bash
# Copyright (c) 2016, Johns Hopkins University (Yenda Trmal <jtrmal@gmail.com>)
# License: Apache 2.0

# Begin configuration section.
# End configuration section

# Compares the iVector ids that steps/nnet2/get_ivector_id.sh reports for two
# directories.
# Exit status:
#   0  the ids are equal, or at least one directory has no id (a warning is
#      printed to stderr in that case)
#   1  both directories have an id and the ids differ, or the script was
#      called with the wrong number of arguments

#echo >&2 "$0 $@"  # Print the command line for logging
if [ $# != 2 ] ; then
  echo >&2 "Usage: $0  <first-dir> <second-dir>"
  echo >&2 " e.g.: $0 exp/nnet3/extractor exp/nnet3/ivectors_dev10h.pem"
  # Without this exit the script used to carry on with empty directory
  # names and report a warning with status 0.
  exit 1
fi

dir_a=$1
dir_b=$2

# An empty id means the directory carries no id.
id_a=$(steps/nnet2/get_ivector_id.sh $dir_a)
id_b=$(steps/nnet2/get_ivector_id.sh $dir_b)

if [ ! -z "$id_a" ] && [ ! -z "${id_b}" ] ; then
  # Both ids are known: they have to be identical.
  if [ "${id_a}" == "${id_b}" ]; then
    exit 0
  else
    echo >&2 "$0: ERROR: iVector id ${id_a} in $dir_a and the iVector id ${id_b} in $dir_b do not match"
    echo >&2 "$0: ERROR: that means that the systems are not compatible."
    exit 1
  fi
elif [ -z "$id_a" ] && [ -z "${id_b}" ] ; then
    # Neither directory has an id: nothing to compare, only warn.
    echo >&2 "$0: WARNING: The directories do not contain iVector ID."
    echo >&2 "$0: WARNING: That means it's you who's reponsible for keeping "
    echo >&2 "$0: WARNING: the directories compatible"
    exit 0
else
    # Only one directory has an id: still not comparable, only warn.
    echo >&2 "$0: WARNING: One of the directories do not contain iVector ID."
    echo >&2 "$0: WARNING: That means it's you who's reponsible for keeping "
    echo >&2 "$0: WARNING: the directories compatible"
    exit 0
fi


================================================
FILE: egs/steps/nnet2/convert_lda_to_raw.sh
================================================
#!/usr/bin/env bash

# Copyright 2014    Johns Hopkins University (Author: Daniel Povey).
# Apache 2.0.

# This script converts nnet2 models which expect splice+LDA as the input, into
# models which expect raw features (e.g. MFCC) as the input.  If you include
# the option --global-cmvn-stats <matrix>, it will also remove CMVN from the model
# by including it as part of the neural net.


# Begin configuration section
cmd=run.pl
cleanup=true          # if true, $dir/final.mat and $dir/lda.nnet are removed at the end
global_cmvn_stats=    # if set, CMVN is moved into the network using these stats
# learning_rate and max_change only matter if the converted model is trained
# further, which is unlikely; the tiny learning rate is just a safe default
# that should be tuned or set by anyone who does want to train.
learning_rate=0.00001
max_change=5.0
# End configuration section.

echo "$0 $@"  # Print the command line for logging

if [ -f ./path.sh ]; then
  . ./path.sh  # source the path.
fi
. parse_options.sh || exit 1


if [ $# -ne 2 ]; then
  echo "Usage: $0 [options] <src-nnet-dir> <dest-nnet-dir>"
  echo "e.g.: $0 --global-cmvn-stats global_cmvn.mat exp/dnn4b_nnet2 exp/dnn4b_nnet2_raw"
  echo "Options include"
  echo "   --global-cmvn-stats <stats-file>         # Filename of globally summed CMVN stats, if"
  echo "                                            # you want to push the CMVN inside the nnet"
  echo "                                            # (it won't any longer be speaker specific)"
  exit 1
fi

src=$1
dir=$2

mkdir -p $dir/log || exit 1

# All four inputs are mandatory for the conversion.
for needed in $src/final.mdl $src/final.mat $src/splice_opts $src/cmvn_opts; do
  if [ ! -f $needed ]; then
    echo "$0: expected file $needed to exist" && exit 1
  fi
done

# phones.txt is optional, so a failed copy is silently ignored.
cp $src/phones.txt $dir 2>/dev/null

mkdir -p $dir/log

# nnet.config collects the config lines for a few trivial neural-network
# components that are inserted in front of the main network: optionally the
# global-CMVN bias/scale components added in this section, followed by the
# splice and LDA components that are appended later in this script.
echo -n >$dir/nnet.config

if [ ! -z "$global_cmvn_stats" ]; then
  [ ! -f $src/cmvn_opts ] && \
    echo "$0: expected $src/cmvn_opts to exist" && exit 1;
  norm_vars=false
  if grep 'norm-means=false' $src/cmvn_opts; then
    echo "$0: if --norm-means=false, don't supply the --global-cmvn-stats option to this script"
    exit 1;
  elif grep 'norm-vars=true' $src/cmvn_opts; then
    echo "$0: warning: this script has not been tested with --norm-vars=true in CMVN options"
    norm_vars=true
  fi


  # First add to the config, layers that will do the same transform as cepstral
  # mean and variance normalization using these global stats.  We do this as
  # first an added offset (FixedBiasComponent), then, only if norm-vars=true
  # in the CMVN options, a scaling (FixedScaleComponent).

  $cmd $dir/log/copy_feats.log \
    copy-feats --binary=false "$global_cmvn_stats" $dir/global_cmvn_stats.txt || exit 1;
  # The perl program reads the text-format stats from stdin (an opening "["
  # line, a row of per-dimension sums ending with the frame count, and a row
  # of sums of squares ending with "0 ]") and writes the negated means to
  # bias.txt and the inverse standard deviations to scales.txt.
  # The first-line check is a pattern match: the former numeric comparison
  # ($line0 == "[\n") was true for any non-numeric line and never fired.
  cat $dir/global_cmvn_stats.txt | \
    perl -e ' $line0 = <STDIN>; $line0 =~ /^\s*\[\s*$/ || die "expected first line to be [, got $line0";
    $line1 = <STDIN>; $line2 = <STDIN>; @L1 = split(" ",$line1); @L2 = split(" ",$line2);
    ($bias_out, $scale_out) = @ARGV;
    open(B, ">$bias_out") || die "opening bias-out file $bias_out";
    open(S, ">$scale_out") || die "opening scale-out file $scale_out";
    pop @L2; pop @L2; # remove the " 0 ]"
    $count = pop @L1;  # last element of line 1 is total count.
    ($count > 0.0) || die "Bad count $count";
    $dim = @L1;
    $dim == scalar @L2 || die "Bad dimension of second line of CMVN stats @L2";
    print B "[ ";  print S "[ ";
    for ($x = 0; $x < $dim; $x++) {
      $mean = $L1[$x] / $count;  $var = ($L2[$x] / $count) - ($mean * $mean);
      $var > 0.0 || die "Bad variance $var for dimension $x";  # sqrt/division below need var > 0
      $bias = -$mean;  print B "$bias ";
      $scale = 1.0 / sqrt($var); $scale > 0 || die "Bad scale $scale";  print S "$scale ";
    }
    print B "]\n";  print S "]\n"; ' $dir/bias.txt $dir/scales.txt || exit 1;
  echo "FixedBiasComponent bias=$dir/bias.txt" >> $dir/nnet.config
  if $norm_vars; then
    echo "FixedScaleComponent scales=$dir/scales.txt" >> $dir/nnet.config
  fi
  # CMVN now lives inside the network, so the feature pipeline must not
  # apply it a second time.
  echo "--norm-means=false --norm-vars=false" >$dir/cmvn_opts || exit 1;
else
  cp $src/cmvn_opts $dir/ || exit 1;
fi

# The raw feature dimension is derived from the LDA matrix: the word count of
# the second line of its text form is either the spliced dimension or the
# spliced dimension plus one.
spliced_dim=$(copy-matrix --binary=false $src/final.mat - | head -n 2 | tail -n 1 | wc -w) || exit 1


splice_opts=$(cat $src/splice_opts) || exit 1
# Number of frames spliced together, measured by splicing a one-element
# matrix and reading back the resulting number of columns.
num_splice=$(echo "foo [ 1.0 ]" | splice-feats $splice_opts ark:- ark:- | feat-to-dim ark:- -)

# Left and right context are needed separately; the defaults in the
# splice-feats code are 4 and 4.
left_context=4
right_context=4
for opt in $(cat $src/splice_opts); do
  if [[ $opt == *left-context* ]]; then
    left_context=$(echo $opt | cut -d= -f2) || exit 1
  fi
  if [[ $opt == *right-context* ]]; then
    right_context=$(echo $opt | cut -d= -f2) || exit 1
  fi
done
# Cross-check our reading of the options against what the binary did.
if ! [ $num_splice -eq $(($left_context+1+$right_context)) ]; then
  echo "$0: num-splice worked out from the binaries differs from our interpreation of the options:"
  echo "$num_splice != $left_context + 1 + $right_context"
  exit 1
fi

remainder=$(($spliced_dim%$num_splice))
if [ $remainder -eq 1 ]; then
  # One extra column: the matrix already includes the offset term.
  spliced_dim=$(($spliced_dim-1))
  cp $src/final.mat $dir/
elif [ $remainder -eq 0 ]; then
  # No offset column: append a zero bias term to every row, because the
  # AffineComponent expects one.
  copy-matrix --binary=false $src/final.mat - | \
    awk '{if ($NF == "]") { $NF = "0"; print $0, "]"; } else { if (NF > 1) { print $0, "0"; } else {print;}}}' >$dir/final.mat
else
  echo "$0: Cannot make sense of spliced dimension $spliced_dim and num-splice=$num_splice"
  exit 1
fi
feat_dim=$(($spliced_dim/$num_splice))
echo "SpliceComponent input-dim=$feat_dim left-context=$left_context right-context=$right_context" >>$dir/nnet.config

# AffineComponentPreconditioned is used because it is easier to configure
# than AffineComponentPreconditionedOnline.
echo "AffineComponentPreconditioned alpha=4.0 learning-rate=$learning_rate max-change=$max_change matrix=$dir/final.mat" >>$dir/nnet.config


# Build the small front-end network from nnet.config ...
$cmd $dir/log/nnet_init.log \
  nnet-init $dir/nnet.config $dir/lda.nnet || exit 1

# ... and insert it at position 0 of the source model.
$cmd $dir/log/nnet_insert.log \
  nnet-insert --insert-at=0 --randomize-next-component=false \
   $src/final.mdl $dir/lda.nnet $dir/final.mdl || exit 1

if $cleanup; then
  rm $dir/final.mat $dir/lda.nnet
fi


================================================
FILE: egs/steps/nnet2/convert_nnet1_to_nnet2.sh
================================================
#!/usr/bin/env bash

# Copyright 2014    Johns Hopkins University (Author: Daniel Povey).
# Apache 2.0.

# This script converts nnet1 into nnet2 models.
# Note, it doesn't support all possible types of nnet1 models.

# Begin configuration section
cleanup=true   # if true, remove the intermediate *.raw / final_noprior.mdl files at the end
cmd=run.pl
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;


if [ $# -ne 2 ]; then
  echo "Usage: $0 [options] <src-nnet1-dir> <dest-nnet2-dir>"
  echo "e.g.: $0 exp/dnn4b_pretrain-dbn_dnn exp/dnn4b_nnet2"
  exit 1;
fi

src=$1
dir=$2

mkdir -p $dir/log || exit 1;

for f in $src/final.mdl $src/final.feature_transform $src/ali_train_pdf.counts; do
  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
done

# phones.txt is optional; a failed copy is ignored.
cp $src/phones.txt $dir 2>/dev/null

# 0.raw: the nnet1 feature transform converted to a raw nnet2 network.
$cmd $dir/log/convert_feature_transform.log \
  nnet1-to-raw-nnet $src/final.feature_transform $dir/0.raw || exit 1;


# 1.raw: the main network, taken from final.nnet if present, otherwise from
# final.dbn with a freshly initialized affine+softmax output layer on top.
if [ -f $src/final.nnet ]; then
  echo "$0: $src/final.nnet exists, using it as input."
  $cmd $dir/log/convert_model.log \
    nnet1-to-raw-nnet $src/final.nnet $dir/1.raw || exit 1;
elif [ -f $src/final.dbn ]; then
  echo "$0: $src/final.dbn exists, using it as input."
  num_leaves=$(am-info $src/final.mdl | grep -w pdfs | awk '{print $NF}') || exit 1;
  if [ -z "$num_leaves" ]; then
    echo "$0: could not work out the number of pdfs from $src/final.mdl"
    exit 1;
  fi
  # Read the output dimension from the DBN being converted.  This used to
  # query the hard-coded path exp/dnn4b_pretrain-dbn/6.dbn, which only exists
  # in one particular recipe.
  dbn_output_dim=$(nnet-info $src/final.dbn | grep component | tail -n 1 | sed s:,::g | awk '{print $NF}') || exit 1;
  if [ -z "$dbn_output_dim" ]; then
    echo "$0: could not work out the output dimension of $src/final.dbn"
    exit 1;
  fi

  cat > $dir/final_layer.conf <<EOF
AffineComponent input-dim=$dbn_output_dim output-dim=$num_leaves learning-rate=0.001
SoftmaxComponent dim=$num_leaves
EOF
  $cmd $dir/log/convert_model.log \
    nnet1-to-raw-nnet $src/final.dbn - \| \
    raw-nnet-concat - "raw-nnet-init $dir/final_layer.conf -|" $dir/1.raw || exit 1;
else
  echo "$0: expected either $src/final.nnet or $src/final.dbn to exist"
  # Stop here: without 1.raw the concatenation below cannot succeed.
  exit 1;
fi

# Feature transform followed by the main network.
$cmd $dir/log/append_model.log \
  raw-nnet-concat $dir/0.raw $dir/1.raw $dir/concat.raw || exit 1;

# Attach the transition model of the source final.mdl.
$cmd $dir/log/init_model.log \
  nnet-am-init $src/final.mdl $dir/concat.raw $dir/final_noprior.mdl || exit 1;

# Set the priors from the pdf counts of the training alignment.
$cmd $dir/log/set_priors.log \
  nnet-adjust-priors $dir/final_noprior.mdl $src/ali_train_pdf.counts $dir/final.mdl || exit 1;

if $cleanup; then
  rm $dir/0.raw $dir/1.raw $dir/concat.raw $dir/final_noprior.mdl
fi


================================================
FILE: egs/steps/nnet2/create_appended_model.sh
================================================
#!/usr/bin/env bash

#  Copyright 2014  Johns Hopkins University (Author: Daniel Povey)
#  Apache 2.0.

# This script is for use with "retrain_fast.sh"; it combines the original model
# that you trained on top of, with the single layer model you trained, so that
# you can do joint backpropagation.

# Begin configuration options.
cmd=run.pl
cleanup=true  # if true, remove the intermediate $dir/first_nnet.raw at the end.
              # This was used below without ever being declared, so it was
              # always empty and could not be set from the command line.
# End configuration options.


echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

if [ $# != 3 ]; then
  echo "Usage: $0 <original-nnet-dir> <new-nnet-dir> <combined-nnet-dir>"
  echo "where <original-nnet-dir> will typically be a normal neural net from another corpus,"
  echo "and <new-nnet-dir> will usually be a single-layer neural net trained on top of it by"
  echo "dumping the activations (e.g. using steps/online/nnet2/dump_nnet_activations.sh, I"
  echo "think no such script exists for non-online), and then training using"
  echo "steps/nnet2/retrain_fast.sh."
  echo "e.g.: $0 ../../swbd/s5b/exp/nnet2_online/nnet_gpu_online exp/nnet2_swbd_online/nnet_gpu_online exp/nnet2_swbd_online/nnet_gpu_online_combined"
  # The usage text used to fall through into the main body.
  exit 1;
fi


src1=$1
src2=$2
dir=$3

for f in $src1/final.mdl $src2/tree $src2/final.mdl; do
   [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1;
done


mkdir -p $dir/log

# Work out how many leading components of the original model to keep: all of
# them except its output layers.
info=$dir/nnet_info
nnet-am-info $src1/final.mdl >$info || exit 1;
nc=$(grep num-components $info | awk '{print $2}');
if [ -z "$nc" ]; then
  echo "$0: could not read num-components from $info"
  exit 1;
fi
if grep SumGroupComponent $info >/dev/null; then
  nc_truncate=$(($nc-3))  # we did mix-up: remove AffineComponent,
                          # SumGroupComponent, SoftmaxComponent
else
                          # we didn't mix-up:
  nc_truncate=$(($nc-2))  # remove AffineComponent, SoftmaxComponent
fi

# first_nnet.raw: the truncated original model as a raw network.
$cmd $dir/log/get_raw_nnet.log \
 nnet-to-raw-nnet --truncate=$nc_truncate $src1/final.mdl $dir/first_nnet.raw || exit 1;

# Put the truncated original network in front of the new model.
$cmd $dir/log/append_nnet.log \
  nnet-insert --randomize-next-component=false --insert-at=0 \
  $src2/final.mdl $dir/first_nnet.raw $dir/final.mdl || exit 1;

if $cleanup; then
  rm $dir/first_nnet.raw
fi

# Copy the tree etc.,

cp $src2/tree $dir || exit 1;

# Copy feature-related things from src1 where we built the initial model.
# Note: if you've done anything like mess with the feature-extraction configs,
# or changed the feature type, you have to keep track of that yourself.
for f in final.mat cmvn_opts splice_opts; do
  if [ -f $src1/$f ]; then
    cp $src1/$f $dir || exit 1;
  fi
done

echo "$0: created appended model in $dir"


================================================
FILE: egs/steps/nnet2/decode.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2013  Johns Hopkins University (Author: Daniel Povey).
# Apache 2.0.

# This script does decoding with a neural-net.  If the neural net was built on
# top of fMLLR transforms from a conventional system, you should provide the
# --transform-dir option.

# Begin configuration section.
cmd=run.pl
stage=1               # stages with a lower number are skipped
nj=4                  # number of decoding jobs.  If --transform-dir set, must match that number!
iter=final            # decodes with <model-dir>/<iter>.mdl
transform_dir=        # dir to find fMLLR transforms.
feat_type=            # raw or lda; chosen from final.mat when empty
online_ivector_dir=   # if set, online iVectors are appended to the features
ivector_scale=1.0     # scale applied to the iVectors before appending them
acwt=0.1              # Just a default value, used for adaptation and beam-pruning..
beam=15.0
max_active=7000
min_active=200
lattice_beam=8.0      # Beam we use in lattice generation.
minimize=false
num_threads=1         # if >1, will use nnet-latgen-faster-parallel
parallel_opts=        # ignored now.
scoring_opts=         # passed through to local/score.sh
skip_scoring=false
# End configuration section.

echo "$0 $@"  # Print the command line for logging

if [ -f ./path.sh ]; then
  . ./path.sh  # source the path.
fi
. parse_options.sh || exit 1

if [ $# -ne 3 ]; then
  echo "Usage: $0 [options] <graph-dir> <data-dir> <decode-dir>"
  echo " e.g.: $0 --transform-dir exp/tri3b/decode_dev93_tgpr \\"
  echo "      exp/tri3b/graph_tgpr data/test_dev93 exp/tri4a_nnet/decode_dev93_tgpr"
  echo "main options (for others, see top of script file)"
  echo "  --transform-dir <decoding-dir>           # directory of previous decoding"
  echo "                                           # where we can find transforms for SAT systems."
  echo "  --config <config-file>                   # config containing options"
  echo "  --nj <nj>                                # number of parallel jobs"
  echo "  --cmd <cmd>                              # Command to run in parallel with"
  echo "  --beam <beam>                            # Decoding beam; default 15.0"
  echo "  --iter <iter>                            # Iteration of model to decode; default is final."
  echo "  --scoring-opts <string>                  # options to local/score.sh"
  echo "  --num-threads <n>                        # number of threads to use, default 1."
  echo "  --parallel-opts <opts>                   # e.g. '--num-threads 4' if you supply --num-threads 4"
  exit 1
fi

graphdir=$1
data=$2
dir=$3
# The model directory is assumed to be the parent of the decoding directory.
srcdir=$(dirname $dir)
model=$srcdir/$iter.mdl


# The iVector files are only required when online_ivector_dir is set.
if [ -n "$online_ivector_dir" ]; then
  extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"
fi

for required in $graphdir/HCLG.fst $data/feats.scp $model $extra_files; do
  if [ ! -f $required ]; then
    echo "$0: no such file $required" && exit 1
  fi
done

sdata=$data/split$nj
cmvn_opts=$(cat $srcdir/cmvn_opts) || exit 1
# Suffix glued onto the decoder binary name to select its parallel variant.
thread_string=
if [ $num_threads -gt 1 ]; then
  thread_string="-parallel --num-threads=$num_threads"
fi

mkdir -p $dir/log
# Split the data unless a split directory already exists and feats.scp is
# older than it.
if ! [[ -d $sdata && $data/feats.scp -ot $sdata ]]; then
  split_data.sh $data $nj || exit 1
fi
echo $nj > $dir/num_jobs

## Set up features.  $feats is built up as one piped rspecifier string; the
## literal token JOB inside it refers to the per-job split directory
## $sdata/JOB (presumably substituted per job by $cmd, see the JOB=1:$nj
## invocation in the decoding stage).
# If the caller did not force a feature type, pick lda when the model
# directory has an LDA matrix (final.mat) and raw otherwise.
if [ -z "$feat_type" ]; then
  if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=raw; fi
  echo "$0: feature type is $feat_type"
fi

# splice_opts is optional; a missing file yields an empty option string.
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`

case $feat_type in
  # raw: CMVN, plus delta features when the model directory has a
  # delta_order file.
  raw) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"
  if [ -f $srcdir/delta_order ]; then
    delta_order=`cat $srcdir/delta_order 2>/dev/null`
    feats="$feats add-deltas --delta-order=$delta_order ark:- ark:- |"
  fi
    ;;
  # lda: CMVN, then frame splicing, then the LDA transform of the model dir.
  lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
    ;;
  *) echo "$0: invalid feature type $feat_type" && exit 1;
esac
# Optional per-speaker transforms from --transform-dir, appended to $feats.
if [ ! -z "$transform_dir" ]; then
  echo "$0: using transforms from $transform_dir"
  [ ! -s $transform_dir/num_jobs ] && \
    echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1;
  nj_orig=$(cat $transform_dir/num_jobs)

  # The transform files are named raw_trans.N for raw features, trans.N otherwise.
  if [ $feat_type == "raw" ]; then trans=raw_trans;
  else trans=trans; fi
  # For LDA features the model's final.mat must equal the one in
  # $transform_dir or the one in its parent directory; the error is raised
  # only when it differs from both.
  if [ $feat_type == "lda" ] && \
    ! cmp $transform_dir/../final.mat $srcdir/final.mat && \
    ! cmp $transform_dir/final.mat $srcdir/final.mat; then
    echo "$0: LDA transforms differ between $srcdir and $transform_dir"
    exit 1;
  fi
  if [ ! -f $transform_dir/$trans.1 ]; then
    echo "$0: expected $transform_dir/$trans.1 to exist (--transform-dir option)"
    exit 1;
  fi
  if [ $nj -ne $nj_orig ]; then
    # Copy the transforms into an archive with an index.
    # (The job counts differ, so trans.JOB would not line up with our splits;
    # the scp index lets each job look up the transforms it needs.)
    for n in $(seq $nj_orig); do cat $transform_dir/$trans.$n; done | \
       copy-feats ark:- ark,scp:$dir/$trans.ark,$dir/$trans.scp || exit 1;
    feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/$trans.scp ark:- ark:- |"
  else
    # number of jobs matches with alignment dir.
    feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/$trans.JOB ark:- ark:- |"
  fi
# No --transform-dir given: warn if the training log shows that the model was
# trained on transformed features.
elif grep 'transform-feats --utt2spk' $srcdir/log/train.1.log >&/dev/null; then
  echo "$0: **WARNING**: you seem to be using a neural net system trained with transforms,"
  echo "  but you are not providing the --transform-dir option in test time."
fi
##

# Optional online iVectors: filtered to the utterances of the job, repeated
# ivector_period times, scaled by ivector_scale and pasted onto the features.
if [ ! -z "$online_ivector_dir" ]; then
  ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1;
  # note: subsample-feats, with negative n, will repeat each feature -n times.
  feats="$feats paste-feats --length-tolerance=$ivector_period ark:- 'ark,s,cs:utils/filter_scp.pl $sdata/JOB/utt2spk $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- | copy-matrix --scale=$ivector_scale ark:- ark:-|' ark:- |"
fi

# Stage 1: lattice generation, one gzipped lattice archive per job.
if [ $stage -le 1 ]; then
  $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode.JOB.log \
    nnet-latgen-faster$thread_string \
     --minimize=$minimize --max-active=$max_active --min-active=$min_active --beam=$beam \
     --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \
     --word-symbol-table=$graphdir/words.txt "$model" \
     $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
fi

# Stage 2: lattice diagnostics.  The --iter option for this stage is kept in
# its own variable so that it cannot leak into the scoring stage.
if [ $stage -le 2 ]; then
  lats_iter_opt=
  [ ! -z "$iter" ] && lats_iter_opt="--iter $iter"
  steps/diagnostic/analyze_lats.sh --cmd "$cmd" $lats_iter_opt $graphdir $dir
fi

# The output of this script is the files "lat.*.gz"-- we'll rescore this at
# different acoustic scales to get the final output.

# Stage 3: scoring.  local/score.sh only receives --iter for a non-final
# iteration.  Previously the variable set in stage 2 was reused here, so
# "--iter final" was passed whenever stage 2 had run and omitted when the
# script was started with --stage 3.
if [ $stage -le 3 ]; then
  if ! $skip_scoring ; then
    [ ! -x local/score.sh ] && \
      echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
    echo "score best paths"
    score_iter_opt=
    [ "$iter" != "final" ] && score_iter_opt="--iter $iter"
    local/score.sh $score_iter_opt $scoring_opts --cmd "$cmd" $data $graphdir $dir
    echo "score confidence and timing with sclite"
  fi
fi
echo "Decoding done."
exit 0;


================================================
FILE: egs/steps/nnet2/dump_bottleneck_features.sh
================================================
#!/usr/bin/env bash

#           2014  Pegah Ghahremani
# Apache 2.0


# Begin configuration section.
feat_type=   # raw, delta or lda; when empty it is chosen below from $nnetdir/final.mat
stage=1
nj=4
cmd=run.pl

# Begin configuration.
transform_dir=   # if set, per-speaker transforms (trans.N) are read from here

# End configuration options.

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh # source the path.
. parse_options.sh || exit 1;

# The script takes exactly five positional arguments; the example used to
# list six (a stale alignment directory), which made it fail this very check
# when copied verbatim.
if [ $# != 5 ]; then
   echo "usage: steps/nnet2/dump_bottleneck_features.sh <input-data-dir> <output-data-dir> <bnf-nnet-dir> <archive-dir> <log-dir>"
   echo "e.g.:  steps/nnet2/dump_bottleneck_features.sh data/train data/train_bnf exp_bnf/bnf_net mfcc exp_bnf/dump_bnf"
   echo "main options (for others, see top of script file)"
   echo "  --config <config-file>                           # config containing options"
   echo "  --nj <nj>                                        # number of parallel jobs"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   exit 1;
fi

data=$1
bnf_data=$2
nnetdir=$3
archivedir=$4
dir=$5

# The bottleneck network is expected as final.raw in nnetdir.
bnf_nnet=$nnetdir/final.raw
if [ ! -f $bnf_nnet ] ; then
  echo "No such file $bnf_nnet";
  exit 1;
fi

## Set up input features of nnet
if [ -z "$feat_type" ]; then
  if [ -f $nnetdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
fi
echo "$0: feature type is $feat_type"

# A forced lda feature type still needs the LDA matrix.
if [ "$feat_type" == "lda" ] && [ ! -f $nnetdir/final.mat ]; then
  echo "$0: no such file $nnetdir/final.mat"
  exit 1
fi

name=`basename $data`
sdata=$data/split$nj

mkdir -p $dir/log
mkdir -p $bnf_data
# NOTE(review): this records the job count inside the network directory
# rather than in $dir or $bnf_data -- confirm that this is intended.
echo $nj > $nnetdir/num_jobs
# Both option files are optional; a missing file yields an empty string.
splice_opts=`cat $nnetdir/splice_opts 2>/dev/null`
delta_opts=`cat $nnetdir/delta_opts 2>/dev/null`
# Split the data unless a split directory already exists and feats.scp is
# older than it.
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;

# Build the per-job input feature rspecifier ($feats).  The literal token JOB
# refers to the per-job split directory $sdata/JOB (presumably substituted per
# job by $cmd, see the JOB=1:$nj invocation below).  All variants apply CMVN
# with --norm-vars=false and then differ in what follows: nothing (raw),
# delta features (delta), or splicing plus the LDA transform (lda).
case $feat_type in
  raw) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |";;
  delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";;
  lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $nnetdir/final.mat ark:- ark:- |"
   ;;
  *) echo "Invalid feature type $feat_type" && exit 1;
esac

# Optional per-speaker transforms from --transform-dir, appended to $feats.
if [ ! -z "$transform_dir" ]; then
  echo "Using transforms from $transform_dir"
  [ ! -f $transform_dir/trans.1 ] && echo "No such file $transform_dir/trans.1" && exit 1;
  transform_nj=`cat $transform_dir/num_jobs` || exit 1;
  if [ "$nj" != "$transform_nj" ]; then
    # Job counts differ, so trans.JOB would not line up with our splits:
    # concatenate all transforms into one temporary archive that every job reads.
    for n in $(seq $transform_nj); do cat $transform_dir/trans.$n; done >$dir/trans.ark
    feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.ark ark:- ark:- |"
  else
    # Same number of jobs: each job reads the matching trans.JOB directly.
    feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |"
  fi
fi


# Stage 1: run the features through the bottleneck network and store the
# result compressed, as one ark/scp pair per job.
if [ $stage -le 1 ]; then
  echo "Making BNF scp and ark."
  bnf_out=$archivedir/raw_bnfeat_$name.JOB
  $cmd JOB=1:$nj $dir/log/make_bnf_$name.JOB.log \
    nnet-compute $bnf_nnet "$feats" ark:- \| \
    copy-feats --compress=true ark:- ark,scp:$bnf_out.ark,$bnf_out.scp || exit 1
fi

# The concatenated transform archive, if one was made, is temporary.
rm $dir/trans.ark 2>/dev/null

# Every input utterance must have produced exactly one output entry.
num_in=$(cat $data/feats.scp | wc -l)
num_out=$(cat $archivedir/raw_bnfeat_$name.*.scp | wc -l)
if [[ "$num_in" != "$num_out" ]]; then
  echo "Error happens when generating BNF for $name (Original:$num_in  BNF:$num_out)"
  exit 1
fi

# Concatenate the per-job scp files, in job order, into the feats.scp of the
# new data directory.
for job in $(seq $nj); do
  cat $archivedir/raw_bnfeat_$name.$job.scp
done > $bnf_data/feats.scp

# Carry over whichever of the usual data-directory files exist.
for meta in segments spk2utt text utt2spk wav.scp char.stm glm kws reco2file_and_channel stm; do
  if [ -e $data/$meta ]; then
    cp -r $data/$meta $bnf_data/$meta
  fi
done

echo "$0: computing CMVN stats."
steps/compute_cmvn_stats.sh $bnf_data $dir $archivedir

echo "$0: done making BNF feats.scp."

exit 0


================================================
FILE: egs/steps/nnet2/get_egs.sh
================================================
#!/usr/bin/env bash

# Copyright 2012 Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
# This script, which will generally be called from other neural-net training
# scripts, extracts the training examples used to train the neural net (and also
# the validation examples used for diagnostics), and puts them in separate archives.

# Begin configuration section.
cmd=run.pl
feat_type=
num_utts_subset=300    # number of utterances in validation and training
                       # subsets used for shrinkage and diagnostics
num_valid_frames_combine=0 # #valid frames for combination weights at the very end.
num_train_frames_combine=10000 # # train frames for the above.
num_frames_diagnostic=4000 # number of frames for "compute_prob" jobs
samples_per_iter=200000 # each iteration of training, see this many samples
                        # per job.  This is just a guideline; it will pick a number
                        # that divides the number of samples in the entire data.
transform_dir=     # If supplied, overrides alidir
num_jobs_nnet=16    # Number of neural net jobs to run in parallel
stage=0
io_opts="--max-jobs-run 5" # for jobs with a lot of I/O, limits the number running at one time.
splice_width=4 # meaning +- 4 frames on each side for second LDA
left_context=
right_context=
random_copy=false
online_ivector_dir=
ivector_randomize_prob=0.0 # if >0.0, randomizes iVectors during training with
                           # this prob per iVector.
cmvn_opts=  # can be used for specifying CMVN options, if feature type is not lda.

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;


# Exactly four positional arguments are required; otherwise print the usage
# message and exit with status 1.  The defaults quoted in the message are kept
# in sync with the values in the configuration section (samples_per_iter=200000
# and num_valid_frames_combine=0).
if [ $# != 4 ]; then
  echo "Usage: steps/nnet2/get_egs.sh [opts] <data> <lang> <ali-dir> <exp-dir>"
  echo " e.g.: steps/nnet2/get_egs.sh data/train data/lang exp/tri3_ali exp/tri4_nnet"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl;utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --num-jobs-nnet <num-jobs;16>                    # Number of parallel jobs to use for main neural net"
  echo "                                                   # training (will affect results as well as speed; try 8, 16)"
  echo "                                                   # Note: if you increase this, you may want to also increase"
  echo "                                                   # the learning rate."
  echo "  --samples-per-iter <#samples;200000>             # Number of samples of data to process per iteration, per"
  echo "                                                   # process."
  echo "  --feat-type <lda|raw>                            # (by default it tries to guess).  The feature type you want"
  echo "                                                   # to use as input to the neural net."
  echo "  --splice-width <width;4>                         # Number of frames on each side to append for feature input"
  echo "  --left-context <width;4>                         # Number of frames on left side to append for feature input, overrides splice-width"
  echo "  --right-context <width;4>                        # Number of frames on right side to append for feature input, overrides splice-width"
  echo "  --num-frames-diagnostic <#frames;4000>           # Number of frames used in computing (train,valid) diagnostics"
  echo "  --num-valid-frames-combine <#frames;0>           # Number of frames used in getting combination weights at the"
  echo "                                                   # very end."
  echo "  --stage <stage|0>                                # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."

  exit 1;
fi

# Positional arguments.
data=$1
lang=$2    # consulted for the L.fst existence check and for phones.txt.
alidir=$3
dir=$4

# Unless given explicitly, each context falls back to the splice width.
left_context=${left_context:-$splice_width}
right_context=${right_context:-$splice_width}


# Make sure every input we rely on is present before doing any work.
if [ ! -z "$online_ivector_dir" ]; then
  extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"
fi

for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree $extra_files; do
  if [ ! -f $f ]; then
    echo "$0: no such file $f"
    exit 1;
  fi
done


# The alignment directory records how many jobs it was created with; the data
# directory is split the same way.
nj=$(cat $alidir/num_jobs) || exit 1;

sdata=$data/split$nj
utils/split_data.sh $data $nj

mkdir -p $dir/log
cp $alidir/tree $dir

# The phone set of the lang directory must match the alignments; keep a copy.
utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

# Randomly pick the validation utterances.
awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | \
  head -n $num_utts_subset > $dir/valid_uttlist || exit 1;

if [ -f $data/utt2uniq ]; then
  echo "File $data/utt2uniq exists, so augmenting valid_uttlist to"
  echo "include all perturbed versions of the same 'real' utterances."
  # Map each chosen utterance to its unique id, then expand every unique id
  # back to all utterances that share it.
  mv $dir/valid_uttlist $dir/valid_uttlist.tmp
  utils/utt2spk_to_spk2utt.pl $data/utt2uniq > $dir/uniq2utt
  utils/apply_map.pl $data/utt2uniq < $dir/valid_uttlist.tmp | \
    sort | uniq | utils/apply_map.pl $dir/uniq2utt | \
    awk '{for (i = 1; i <= NF; i++) print $i;}' | sort  > $dir/valid_uttlist
  rm $dir/uniq2utt $dir/valid_uttlist.tmp
fi

# Randomly pick a training subset of the same size, disjoint from validation.
awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlist | \
  utils/shuffle_list.pl | head -n $num_utts_subset > $dir/train_subset_uttlist || exit 1;

# Transforms are looked up in the alignment directory unless told otherwise.
transform_dir=${transform_dir:-$alidir}

## Set up features.
# This section builds three read-specifier strings that are later handed to
# nnet-get-egs:
#   feats              - per-job features ("JOB" is a placeholder substituted by
#                        $cmd) with the validation utterances excluded;
#   valid_feats        - features of the utterances in valid_uttlist;
#   train_subset_feats - features of the utterances in train_subset_uttlist.
# If no feature type was requested, choose "lda" when the alignment dir has a
# final.mat and there are no raw_trans.* files, otherwise "raw".
if [ -z $feat_type ]; then
  if [ -f $alidir/final.mat ] && [ ! -f $transform_dir/raw_trans.1 ]; then feat_type=lda; else feat_type=raw; fi
fi
echo "$0: feature type is $feat_type"

case $feat_type in
  # raw: only apply-cmvn (with the user-supplied $cmvn_opts), which are also
  # written to $dir/cmvn_opts.
  raw) feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- |"
    valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |"
    train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |"
    echo $cmvn_opts >$dir/cmvn_opts
   ;;
  # lda: apply-cmvn, splice-feats and transform-feats with final.mat, all using
  # the options copied from the alignment directory.  A user-supplied
  # --cmvn-opts is rejected here.
  lda)
    splice_opts=`cat $alidir/splice_opts 2>/dev/null`
    cp $alidir/{splice_opts,cmvn_opts,final.mat} $dir || exit 1;
    [ ! -z "$cmvn_opts" ] && \
       echo "You cannot supply --cmvn-opts option if feature type is LDA." && exit 1;
    cmvn_opts=$(cat $dir/cmvn_opts)
    feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
    valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
    train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
    ;;
  *) echo "$0: invalid feature type $feat_type" && exit 1;
esac

# For non-raw features, append the transforms trans.* found in $transform_dir
# (per job for feats; all of them concatenated for the two subsets).
if [ -f $transform_dir/trans.1 ] && [ $feat_type != "raw" ]; then
  echo "$0: using transforms from $transform_dir"
  feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
  valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $transform_dir/trans.*|' ark:- ark:- |"
  train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $transform_dir/trans.*|' ark:- ark:- |"
fi
# For raw features, do the same with the raw_trans.* transforms.
if [ -f $transform_dir/raw_trans.1 ] && [ $feat_type == "raw" ]; then
  echo "$0: using raw-fMLLR transforms from $transform_dir"
  feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/raw_trans.JOB ark:- ark:- |"
  valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $transform_dir/raw_trans.*|' ark:- ark:- |"
  train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $transform_dir/raw_trans.*|' ark:- ark:- |"
fi
# Optionally paste online iVectors onto the features.  The iVector dimension is
# passed to nnet-get-egs via --const-feat-dim, and ivector_period is used both
# as the subsample-feats argument and as the paste-feats length tolerance.
if [ ! -z "$online_ivector_dir" ]; then
  # NOTE(review): feats_one is not referenced again in the visible remainder of
  # this script.
  feats_one="$(echo "$feats" | sed s:JOB:1:g)"
  ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1;
  ivectors_opt="--const-feat-dim=$ivector_dim"
  ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1;
  feats="$feats paste-feats --length-tolerance=$ivector_period ark:- 'ark,s,cs:utils/filter_scp.pl $sdata/JOB/utt2spk $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- | ivector-randomize --randomize-prob=$ivector_randomize_prob ark:- ark:- |' ark:- |"
  valid_feats="$valid_feats paste-feats --length-tolerance=$ivector_period ark:- 'ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- | ivector-randomize --randomize-prob=$ivector_randomize_prob ark:- ark:- |' ark:- |"
  train_subset_feats="$train_subset_feats paste-feats --length-tolerance=$ivector_period ark:- 'ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- | ivector-randomize --randomize-prob=$ivector_randomize_prob ark:- ark:- |' ark:- |"
fi

# Count the training frames once and cache the result in $dir/num_frames, so
# that a later run with --stage > 0 can simply read it back.
if [ $stage -le 0 ]; then
  echo "$0: working out number of frames of training data"
  num_frames=$(steps/nnet2/get_num_frames.sh $data)
  echo $num_frames > $dir/num_frames
else
  num_frames=$(cat $dir/num_frames) || exit 1;
fi

# Number of iterations per epoch: frames / (samples_per_iter * num_jobs_nnet),
# rounded to the nearest integer and never less than one.
iters_per_epoch=$(perl -e "print int($num_frames/($samples_per_iter * $num_jobs_nnet) + 0.5);") || exit 1;
if [ $iters_per_epoch -eq 0 ]; then
  iters_per_epoch=1
fi
# The samples-per-iteration figure that actually results from that choice.
samples_per_iter_real=$(($num_frames/($num_jobs_nnet*$iters_per_epoch)))
echo "$0: Every epoch, splitting the data up into $iters_per_epoch iterations,"
echo "$0: giving samples-per-iteration of $samples_per_iter_real (you requested $samples_per_iter)."

# Making soft links to storage directories.  This is a no-op unless
# the subdirectory $dir/egs/storage/ exists.  See utils/create_split_dir.pl
# Links are requested for every archive name the later stages will write:
# final and temporary archives per (job, iteration), and the original
# archives per (job, alignment split).
for job in $(seq 1 $num_jobs_nnet); do
  for iter in $(seq 0 $(($iters_per_epoch-1))); do
    utils/create_data_link.pl $dir/egs/egs.$job.$iter.ark
    utils/create_data_link.pl $dir/egs/egs_tmp.$job.$iter.ark
  done
  for split in $(seq 1 $nj); do
    utils/create_data_link.pl $dir/egs/egs_orig.$job.$split.ark
  done
done

# remove <path>...
# Delete each of the given paths.  When a path is a symbolic link, the file
# named by utils/make_absolute.sh for that link is deleted first (presumably
# the link target -- confirm against that helper), then the link itself.
# Arguments are taken one by one via "$@" and quoted throughout, so paths
# containing whitespace or glob characters are not re-split or re-expanded.
# The return status is that of the last 'rm' (0 when called with no paths).
remove () {
  local path
  for path in "$@"; do
    [ -L "$path" ] && rm "$(utils/make_absolute.sh "$path")"
    rm "$path"
  done
}

# Context options shared by every nnet-get-egs call below.
nnet_context_opts="--left-context=$left_context --right-context=$right_context"
mkdir -p $dir/egs

# Stage 2: build the small validation / training-subset example sets used for
# diagnostics and for the final combination.
if [ $stage -le 2 ]; then
  echo "Getting validation and training subset examples."
  rm $dir/.error 2>/dev/null
  echo "$0: extracting validation and training-subset alignments."
  # Gather the alignments of just the selected utterances into
  # $dir/ali_special.gz.  pipefail is enabled only for this pipeline so that a
  # failure in any of its stages triggers the "|| exit 1".
  set -o pipefail;
  for id in $(seq $nj); do gunzip -c $alidir/ali.$id.gz; done | \
    copy-int-vector ark:- ark,t:- | \
    utils/filter_scp.pl <(cat $dir/valid_uttlist $dir/train_subset_uttlist) | \
    gzip -c >$dir/ali_special.gz || exit 1;
  set +o pipefail; # unset the pipefail option.

  # Dump all examples for both subsets in parallel; a failing job leaves the
  # marker file $dir/.error, which is checked after the wait.
  $cmd $dir/log/create_valid_subset.log \
    nnet-get-egs $ivectors_opt $nnet_context_opts "$valid_feats" \
    "ark,s,cs:gunzip -c $dir/ali_special.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \
     "ark:$dir/egs/valid_all.egs" || touch $dir/.error &
  $cmd $dir/log/create_train_subset.log \
    nnet-get-egs $ivectors_opt $nnet_context_opts "$train_subset_feats" \
     "ark,s,cs:gunzip -c $dir/ali_special.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \
     "ark:$dir/egs/train_subset_all.egs" || touch $dir/.error &
  wait;
  [ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1
  echo "Getting subsets of validation examples for diagnostics and combination."
  # Draw the "combine" and "diagnostic" subsets from each full set, again as
  # four background jobs.
  # NOTE(review): these jobs also touch $dir/.error on failure, but the marker
  # is not re-checked after the second wait; only the non-empty test on the
  # output files below guards this step.
  $cmd $dir/log/create_valid_subset_combine.log \
    nnet-subset-egs --n=$num_valid_frames_combine ark:$dir/egs/valid_all.egs \
        ark:$dir/egs/valid_combine.egs || touch $dir/.error &
  $cmd $dir/log/create_valid_subset_diagnostic.log \
    nnet-subset-egs --n=$num_frames_diagnostic ark:$dir/egs/valid_all.egs \
    ark:$dir/egs/valid_diagnostic.egs || touch $dir/.error &

  $cmd $dir/log/create_train_subset_combine.log \
    nnet-subset-egs --n=$num_train_frames_combine ark:$dir/egs/train_subset_all.egs \
    ark:$dir/egs/train_combine.egs || touch $dir/.error &
  $cmd $dir/log/create_train_subset_diagnostic.log \
    nnet-subset-egs --n=$num_frames_diagnostic ark:$dir/egs/train_subset_all.egs \
    ark:$dir/egs/train_diagnostic.egs || touch $dir/.error &
  wait
  # combine.egs is the validation part followed by the training part.
  cat $dir/egs/valid_combine.egs $dir/egs/train_combine.egs > $dir/egs/combine.egs

  # Each of the three final files must exist and be non-empty.
  for f in $dir/egs/{combine,train_diagnostic,valid_diagnostic}.egs; do
    [ ! -s $f ] && echo "No examples in file $f" && exit 1;
  done
  # Drop the intermediate files.
  rm $dir/egs/valid_all.egs $dir/egs/train_subset_all.egs $dir/egs/{train,valid}_combine.egs $dir/ali_special.gz
fi

# Stage 3: dump the training examples, one set of archives per alignment job.
if [ $stage -le 3 ]; then
  # Record the layout of the egs directory for scripts that consume it.
  echo $num_jobs_nnet >$dir/egs/num_jobs_nnet
  echo $iters_per_epoch >$dir/egs/iters_per_epoch
  echo $samples_per_iter_real >$dir/egs/samples_per_iter

  echo "Creating training examples";

  # Build the list of output archives, one per neural-net job ("JOB" is left
  # for $cmd to substitute).  No shuffling happens at this point.
  egs_list=
  for job in $(seq 1 $num_jobs_nnet); do
    egs_list+=" ark:$dir/egs/egs_orig.$job.JOB.ark"
  done

  echo "Generating training examples on disk"
  # The examples will go round-robin to egs_list.
  $cmd $io_opts JOB=1:$nj $dir/log/get_egs.JOB.log \
    nnet-get-egs $ivectors_opt $nnet_context_opts "$feats" \
    "ark,s,cs:gunzip -c $alidir/ali.JOB.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" ark:- \| \
    nnet-copy-egs ark:- $egs_list || exit 1;
fi

# Stage 4: for each neural-net job, merge its egs_orig.<job>.*.ark pieces (one
# per alignment split) and spread them over egs_tmp.<job>.<iter>.ark for
# iter = 0 .. iters_per_epoch-1.
if [ $stage -le 4 ]; then
  echo "$0: rearranging examples into parts for different parallel jobs"

  if [ $iters_per_epoch -eq 1 ]; then
    # Only one part per job, so plain concatenation is enough.
    echo "$0: Since iters-per-epoch == 1, just concatenating the data."
    for job in $(seq 1 $num_jobs_nnet); do
      cat $dir/egs/egs_orig.$job.*.ark > $dir/egs/egs_tmp.$job.0.ark || exit 1;
      remove $dir/egs/egs_orig.$job.*.ark
    done
  else
    # Several parts per job: let nnet-copy-egs distribute the examples over
    # the per-iteration archives.
    egs_list=
    for iter in $(seq 0 $(($iters_per_epoch-1))); do
      egs_list+=" ark:$dir/egs/egs_tmp.JOB.$iter.ark"
    done
    # A failure of the split aborts the script.
    $cmd $io_opts JOB=1:$num_jobs_nnet $dir/log/split_egs.JOB.log \
      nnet-copy-egs --random=$random_copy --srand=JOB \
        "ark:cat $dir/egs/egs_orig.JOB.*.ark|" $egs_list || exit 1;
    remove $dir/egs/egs_orig.*.*.ark  2>/dev/null
  fi
fi

# Stage 5: shuffle the examples inside every per-iteration archive.  The
# archives are expected to be small enough to shuffle in memory.
if [ $stage -le 5 ]; then
  echo "Shuffling the order of training examples"
  echo "(in order to avoid stressing the disk, these won't all run at once)."

  last_iter=$(($iters_per_epoch-1))
  for iter in $(seq 0 $last_iter); do
    # Each (job, iteration) pair gets its own seed; the arithmetic is left
    # escaped so that it is evaluated after JOB has been substituted.
    $cmd $io_opts JOB=1:$num_jobs_nnet $dir/log/shuffle.$iter.JOB.log \
      nnet-shuffle-egs "--srand=\$[JOB+($num_jobs_nnet*$iter)]" \
      ark:$dir/egs/egs_tmp.JOB.$iter.ark ark:$dir/egs/egs.JOB.$iter.ark
    # The unshuffled inputs of this iteration are no longer needed.
    remove $dir/egs/egs_tmp.*.$iter.ark
  done
fi

echo "$0: Finished preparing training examples"


================================================
FILE: egs/steps/nnet2/get_egs2.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
#
# This script, which will generally be called from other neural-net training
# scripts, extracts the training examples used to train the neural net (and also
# the validation examples used for diagnostics), and puts them in separate archives.
#
# This script differs from get_egs.sh in that it dumps egs with several frames
# of labels, controlled by the frames_per_eg config variable (default: 8).  This
# takes many times less disk space because typically we have 4 to 7 frames of
# context on the left and right, and this ends up getting shared.  This is at
# the expense of slightly higher disk I/O during training time.
#
# We also have a simpler way of dividing the egs up into pieces, with one level
# of index, so we have $dir/egs.{1,2,3,...}.ark instead of having two levels of
# indexes.  The extra files we write to $dir that explain the structure are
# $dir/info/num_archives, which contains the number of files egs.*.ark,
# $dir/info/frames_per_eg, which contains the number of frames of labels per eg
# (e.g. 7), and $dir/info/egs_per_archive.  These replace the files
# iters_per_epoch, num_jobs_nnet and samples_per_iter that the previous script
# wrote to.  This script takes the directory where the "egs" are located as the
# argument, not the directory one level up.

# Begin configuration section.
# The variables below are defaults; parse_options.sh (sourced at the end of
# this section) lets the caller override them from the command line.
cmd=run.pl          # how to run jobs (see --cmd in the usage message).
feat_type=          # e.g. set it to "raw" to use raw MFCC
frames_per_eg=8   # number of frames of labels per example.  more->less disk space and
                  # less time preparing egs, but more I/O during training.
                  # note: the script may reduce this if reduce_frames_per_eg is true.
left_context=4    # amount of left-context per eg
right_context=4   # amount of right-context per eg
delta_order=      # delta feature order

reduce_frames_per_eg=true  # If true, this script may reduce the frames_per_eg
                           # if there is only one archive and even with the
                           # reduced frames_per_eg, the number of
                           # samples_per_iter that would result is less than or
                           # equal to the user-specified value.
num_utts_subset=300     # number of utterances in validation and training
                        # subsets used for shrinkage and diagnostics.
num_valid_frames_combine=0 # #valid frames for combination weights at the very end.
num_train_frames_combine=10000 # # train frames for the above.
num_frames_diagnostic=4000 # number of frames for "compute_prob" jobs
samples_per_iter=400000 # each iteration of training, see this many samples
                        # per job.  This is just a guideline; it will pick a number
                        # that divides the number of samples in the entire data.

transform_dir=     # If supplied, overrides alidir as the place to find fMLLR transforms
postdir=        # If supplied, we will use posteriors in it as soft training targets.

stage=0  # sections guarded by "[ $stage -le N ]" are skipped when stage > N.
io_opts="--max-jobs-run 5" # for jobs with a lot of I/O, limits the number running at one time.
random_copy=false  # NOTE(review): not referenced anywhere else in this script.
online_ivector_dir=  # can be used if we are including speaker information as iVectors.
cmvn_opts=  # can be used for specifying CMVN options, if feature type is not lda (if lda,
            # it doesn't make sense to use different options than were used as input to the
            # LDA transform).  This is used to turn off CMVN in the online-nnet experiments.

echo "$0 $@"  # Print the command line for logging

# Pick up the environment (if a path.sh exists in the working directory), then
# apply command-line overrides to the variables above.
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;


# Exactly three positional arguments are required; otherwise print the usage
# message and exit with status 1.  The defaults quoted in the message are kept
# in sync with the configuration section (num_valid_frames_combine=0).
if [ $# != 3 ]; then
  echo "Usage: $0 [opts] <data> <ali-dir> <egs-dir>"
  echo " e.g.: $0 data/train exp/tri3_ali exp/tri4_nnet/egs"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl;utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --samples-per-iter <#samples;400000>             # Number of samples of data to process per iteration, per"
  echo "                                                   # process."
  echo "  --feat-type <lda|raw>                            # (by default it tries to guess).  The feature type you want"
  echo "                                                   # to use as input to the neural net."
  echo "  --frames-per-eg <frames;8>                       # number of frames per eg on disk"
  echo "  --left-context <width;4>                         # Number of frames on left side to append for feature input"
  echo "  --right-context <width;4>                        # Number of frames on right side to append for feature input"
  echo "  --num-frames-diagnostic <#frames;4000>           # Number of frames used in computing (train,valid) diagnostics"
  echo "  --num-valid-frames-combine <#frames;0>           # Number of frames used in getting combination weights at the"
  echo "                                                   # very end."
  echo "  --stage <stage|0>                                # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."

  exit 1;
fi

# Positional arguments.
data=$1
alidir=$2
dir=$3


# Make sure every input we rely on is present before doing any work.
if [ ! -z "$online_ivector_dir" ]; then
  extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"
fi

for f in $data/feats.scp $alidir/ali.1.gz $alidir/final.mdl $alidir/tree $extra_files; do
  if [ ! -f $f ]; then
    echo "$0: no such file $f"
    exit 1;
  fi
done


# The alignment directory records how many jobs it was created with; the data
# directory is split the same way.
nj=$(cat $alidir/num_jobs) || exit 1;

sdata=$data/split$nj
utils/split_data.sh $data $nj

mkdir -p $dir/log $dir/info
cp $alidir/tree $dir

# Refuse to continue unless there are more than four times as many utterances
# as are set aside for each of the validation / training subsets.
num_utts=$(cat $data/utt2spk | wc -l)
min_utts=$(($num_utts_subset*4))
if ! [ $num_utts -gt $min_utts ]; then
  echo "$0: number of utterances $num_utts in your training data is too small versus --num-utts-subset=$num_utts_subset"
  echo "... you probably have so little data that it doesn't make sense to train a neural net."
  exit 1
fi

# Randomly pick the validation utterances.
awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | \
  head -n $num_utts_subset > $dir/valid_uttlist || exit 1;

if [ -f $data/utt2uniq ]; then
  echo "File $data/utt2uniq exists, so augmenting valid_uttlist to"
  echo "include all perturbed versions of the same 'real' utterances."
  # Map each chosen utterance to its unique id, then expand every unique id
  # back to all utterances that share it.
  mv $dir/valid_uttlist $dir/valid_uttlist.tmp
  utils/utt2spk_to_spk2utt.pl $data/utt2uniq > $dir/uniq2utt
  utils/apply_map.pl $data/utt2uniq < $dir/valid_uttlist.tmp | \
    sort | uniq | utils/apply_map.pl $dir/uniq2utt | \
    awk '{for (i = 1; i <= NF; i++) print $i;}' | sort  > $dir/valid_uttlist
  rm $dir/uniq2utt $dir/valid_uttlist.tmp
fi

# Randomly pick a training subset of the same size, disjoint from validation.
awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlist | \
  utils/shuffle_list.pl | head -n $num_utts_subset > $dir/train_subset_uttlist || exit 1;

# Transforms are looked up in the alignment directory unless told otherwise.
transform_dir=${transform_dir:-$alidir}

## Set up features.
# This section builds three read-specifier strings that are later handed to
# nnet-get-egs:
#   feats              - per-job features ("JOB" is a placeholder substituted by
#                        $cmd) with the validation utterances excluded;
#   valid_feats        - features of the utterances in valid_uttlist;
#   train_subset_feats - features of the utterances in train_subset_uttlist.
# If no feature type was requested, choose "lda" when the alignment dir has a
# final.mat and there are no raw_trans.* files, otherwise "raw".
if [ -z $feat_type ]; then
  if [ -f $alidir/final.mat ] && [ ! -f $transform_dir/raw_trans.1 ]; then feat_type=lda; else feat_type=raw; fi
fi
echo "$0: feature type is $feat_type"

case $feat_type in
  # raw: apply-cmvn (with the user-supplied $cmvn_opts, also written to
  # $dir/cmvn_opts), optionally followed by add-deltas when --delta-order is
  # given (the order is then recorded in $dir/delta_order).
  raw) feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- |"
    valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |"
    train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |"
    echo $cmvn_opts >$dir/cmvn_opts # caution: the top-level nnet training script should copy this to its own dir now.
    if [ ! -z "$delta_order" ]; then
      feats="$feats add-deltas --delta-order=$delta_order ark:- ark:- |"
      valid_feats="$valid_feats add-deltas --delta-order=$delta_order ark:- ark:- |"
      train_subset_feats="$train_subset_feats add-deltas --delta-order=$delta_order ark:- ark:- |"
      echo $delta_order >$dir/delta_order
    fi
   ;;
  # lda: apply-cmvn, splice-feats and transform-feats with final.mat, all using
  # the options copied from the alignment directory.  A user-supplied
  # --cmvn-opts is rejected here.
  lda)
    splice_opts=`cat $alidir/splice_opts 2>/dev/null`
    # caution: the top-level nnet training script should copy these to its own dir now.
    cp $alidir/{splice_opts,cmvn_opts,final.mat} $dir || exit 1;
    [ ! -z "$cmvn_opts" ] && \
       echo "You cannot supply --cmvn-opts option if feature type is LDA." && exit 1;
    cmvn_opts=$(cat $dir/cmvn_opts)
    feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
    valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
    train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
    ;;
  *) echo "$0: invalid feature type $feat_type" && exit 1;
esac

# For non-raw features, append the transforms trans.* found in $transform_dir
# (per job for feats; all of them concatenated for the two subsets).
if [ -f $transform_dir/trans.1 ] && [ $feat_type != "raw" ]; then
  echo "$0: using transforms from $transform_dir"
  feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
  valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $transform_dir/trans.*|' ark:- ark:- |"
  train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $transform_dir/trans.*|' ark:- ark:- |"
fi
# For raw features, do the same with the raw_trans.* transforms.
if [ -f $transform_dir/raw_trans.1 ] && [ $feat_type == "raw" ]; then
  echo "$0: using raw-fMLLR transforms from $transform_dir"
  feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/raw_trans.JOB ark:- ark:- |"
  valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $transform_dir/raw_trans.*|' ark:- ark:- |"
  train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $transform_dir/raw_trans.*|' ark:- ark:- |"
fi
# Optionally paste online iVectors onto the features.  The iVector dimension is
# recorded in $dir/info/ivector_dim (0 when no iVectors are used) and passed to
# nnet-get-egs via --const-feat-dim; ivector_period is used both as the
# subsample-feats argument and as the paste-feats length tolerance.
if [ ! -z "$online_ivector_dir" ]; then
  # NOTE(review): feats_one is not referenced anywhere else in this script.
  feats_one="$(echo "$feats" | sed s:JOB:1:g)"
  ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1;
  echo $ivector_dim > $dir/info/ivector_dim
  ivectors_opt="--const-feat-dim=$ivector_dim"
  ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1;
  feats="$feats paste-feats --length-tolerance=$ivector_period ark:- 'ark,s,cs:utils/filter_scp.pl $sdata/JOB/utt2spk $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |' ark:- |"
  valid_feats="$valid_feats paste-feats --length-tolerance=$ivector_period ark:- 'ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |' ark:- |"
  train_subset_feats="$train_subset_feats paste-feats --length-tolerance=$ivector_period ark:- 'ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |' ark:- |"
else
  echo 0 >$dir/info/ivector_dim
fi

# Count the training frames once and cache the result in
# $dir/info/num_frames, so that a later run with --stage > 0 can read it back.
if [ $stage -le 0 ]; then
  echo "$0: working out number of frames of training data"
  num_frames=$(steps/nnet2/get_num_frames.sh $data)
  echo $num_frames > $dir/info/num_frames
else
  num_frames=$(cat $dir/info/num_frames) || exit 1;
fi

# Number of archives; the + 1 rounds up rather than down (we assume the
# division is not exact).
num_archives=$(($num_frames/($frames_per_eg*$samples_per_iter)+1))
# For small data sets: as long as reduce_frames_per_eg is true and a single
# archive would still suffice with one frame fewer per example, take one frame
# off frames_per_eg.
reduced=false
while $reduce_frames_per_eg && [ $frames_per_eg -gt 1 ] && \
  [ $(($num_frames/(($frames_per_eg-1)*$samples_per_iter))) -eq 0 ]; do
  frames_per_eg=$(($frames_per_eg-1))
  num_archives=1
  reduced=true
done
if $reduced; then
  echo "$0: reduced frames_per_eg to $frames_per_eg because amount of data is small."
fi

echo $num_archives >$dir/info/num_archives
echo $frames_per_eg >$dir/info/frames_per_eg

# Number of examples per archive; it must not exceed the requested
# samples_per_iter.
egs_per_archive=$(($num_frames/($frames_per_eg*$num_archives)))
if ! [ $egs_per_archive -le $samples_per_iter ]; then
  echo "$0: script error: egs_per_archive=$egs_per_archive not <= samples_per_iter=$samples_per_iter"
  exit 1;
fi

echo $egs_per_archive > $dir/info/egs_per_archive

echo "$0: creating $num_archives archives, each with $egs_per_archive egs, with"
echo "$0:   $frames_per_eg labels per example, and (left,right) context = ($left_context,$right_context)"

# Making soft links to storage directories.  This is a no-op unless
# the subdirectory $dir/storage/ exists.  See utils/create_split_dir.pl
# Links are requested for every final archive and for every temporary
# (archive, alignment split) piece.
for archive in $(seq $num_archives); do
  utils/create_data_link.pl $dir/egs.$archive.ark
  for split in $(seq $nj); do
    utils/create_data_link.pl $dir/egs_orig.$archive.$split.ark
  done
done

# Context options shared by every nnet-get-egs call below.
nnet_context_opts="--left-context=$left_context --right-context=$right_context"

# Record the context that the examples are dumped with.
echo $left_context > $dir/info/left_context
echo $right_context > $dir/info/right_context
# Stage 2: build the small validation / training-subset example sets used for
# diagnostics and for the final combination.
if [ $stage -le 2 ]; then
  echo "$0: Getting validation and training subset examples."
  rm $dir/.error 2>/dev/null
  echo "$0: ... extracting validation and training-subset alignments."
  # Gather the alignments of just the selected utterances into
  # $dir/ali_special.gz.  pipefail is enabled only for this pipeline so that a
  # failure in any of its stages triggers the "|| exit 1".
  set -o pipefail;
  for id in $(seq $nj); do gunzip -c $alidir/ali.$id.gz; done | \
    copy-int-vector ark:- ark,t:- | \
    utils/filter_scp.pl <(cat $dir/valid_uttlist $dir/train_subset_uttlist) | \
    gzip -c >$dir/ali_special.gz || exit 1;
  set +o pipefail; # unset the pipefail option.

  # Dump all examples for both subsets in parallel; a failing job leaves the
  # marker file $dir/.error, which is checked after the wait.
  $cmd $dir/log/create_valid_subset.log \
    nnet-get-egs $ivectors_opt $nnet_context_opts "$valid_feats" \
    "ark,s,cs:gunzip -c $dir/ali_special.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \
     "ark:$dir/valid_all.egs" || touch $dir/.error &
  $cmd $dir/log/create_train_subset.log \
    nnet-get-egs $ivectors_opt $nnet_context_opts "$train_subset_feats" \
     "ark,s,cs:gunzip -c $dir/ali_special.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \
     "ark:$dir/train_subset_all.egs" || touch $dir/.error &
  wait;
  [ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1
  echo "... Getting subsets of validation examples for diagnostics and combination."
  # Draw the "combine" and "diagnostic" subsets from each full set, again as
  # four background jobs.
  # NOTE(review): these jobs also touch $dir/.error on failure, but the marker
  # is not re-checked after the second wait; only the non-empty test on the
  # output files below guards this step.
  $cmd $dir/log/create_valid_subset_combine.log \
    nnet-subset-egs --n=$num_valid_frames_combine ark:$dir/valid_all.egs \
        ark:$dir/valid_combine.egs || touch $dir/.error &
  $cmd $dir/log/create_valid_subset_diagnostic.log \
    nnet-subset-egs --n=$num_frames_diagnostic ark:$dir/valid_all.egs \
    ark:$dir/valid_diagnostic.egs || touch $dir/.error &

  $cmd $dir/log/create_train_subset_combine.log \
    nnet-subset-egs --n=$num_train_frames_combine ark:$dir/train_subset_all.egs \
    ark:$dir/train_combine.egs || touch $dir/.error &
  $cmd $dir/log/create_train_subset_diagnostic.log \
    nnet-subset-egs --n=$num_frames_diagnostic ark:$dir/train_subset_all.egs \
    ark:$dir/train_diagnostic.egs || touch $dir/.error &
  wait
  sleep 5  # wait for file system to sync.
  # combine.egs is the validation part followed by the training part.
  cat $dir/valid_combine.egs $dir/train_combine.egs > $dir/combine.egs

  # Each of the three final files must exist and be non-empty.
  for f in $dir/{combine,train_diagnostic,valid_diagnostic}.egs; do
    [ ! -s $f ] && echo "No examples in file $f" && exit 1;
  done
  # Drop the intermediate files.
  rm $dir/valid_all.egs $dir/train_subset_all.egs $dir/{train,valid}_combine.egs $dir/ali_special.gz
fi

# Stage 3: dump the training examples into egs_orig.<archive>.<split>.ark,
# where <archive> runs up to $num_archives and <split> up to $nj (the number
# of jobs in the original alignment directory).
if [ $stage -le 3 ]; then
  # One output per archive; "JOB" is left for $cmd to substitute.
  egs_list=
  for archive in $(seq $num_archives); do
    egs_list+=" ark:$dir/egs_orig.$archive.JOB.ark"
  done

  echo "$0: Generating training examples on disk"
  # The examples will go round-robin to egs_list.
  if [ ! -z $postdir ]; then
    # Targets come from the posteriors supplied in $postdir.
    $cmd $io_opts JOB=1:$nj $dir/log/get_egs.JOB.log \
      nnet-get-egs $ivectors_opt $nnet_context_opts --num-frames=$frames_per_eg "$feats" \
      scp:$postdir/post.JOB.scp ark:- \| \
      nnet-copy-egs ark:- $egs_list || exit 1;
  else
    # Targets are derived from the alignments in $alidir.
    $cmd $io_opts JOB=1:$nj $dir/log/get_egs.JOB.log \
      nnet-get-egs $ivectors_opt $nnet_context_opts --num-frames=$frames_per_eg "$feats" \
      "ark,s,cs:gunzip -c $alidir/ali.JOB.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" ark:- \| \
      nnet-copy-egs ark:- $egs_list || exit 1;
  fi
fi
# Stage 4: for every archive, concatenate its egs_orig.<archive>.*.ark pieces
# (one per alignment split), shuffle them and write egs.<archive>.ark.
if [ $stage -le 4 ]; then
  echo "$0: recombining and shuffling order of archives on disk"

  # Inputs of one shuffle job; "JOB" is left for $cmd to substitute.
  egs_list=
  for split in $(seq $nj); do
    egs_list+=" $dir/egs_orig.JOB.$split.ark"
  done

  $cmd $io_opts $extra_opts JOB=1:$num_archives $dir/log/shuffle.JOB.log \
    nnet-shuffle-egs --srand=JOB "ark:cat $egs_list|" ark:$dir/egs.JOB.ark  || exit 1;
fi

# Stage 5: delete the temporary egs_orig.* archives.  For a symbolic link the
# file named by utils/make_absolute.sh is removed first, then the link.
if [ $stage -le 5 ]; then
  echo "$0: removing temporary archives"
  for archive in $(seq $num_archives); do
    for split in $(seq $nj); do
      tmp_ark=$dir/egs_orig.$archive.$split.ark
      if [ -L $tmp_ark ]; then
        rm $(utils/make_absolute.sh $tmp_ark)
      fi
      rm $tmp_ark
    done
  done
fi

echo "$0: Finished preparing training examples"


================================================
FILE: egs/steps/nnet2/get_egs_discriminative2.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.

# This script dumps examples MPE or MMI or state-level minimum bayes risk (sMBR)
# training of neural nets.  Note: for "criterion", smbr > mpe > mmi in terms of
# compatibility of the dumped egs, meaning you can use the egs dumped with
# --criterion smbr for MPE or MMI, and egs dumped with --criterion mpe for MMI
# training.  The discriminative training program itself doesn't enforce this and
# it would let you mix and match them arbitrarily; we are speaking in terms of
# the correctness of the algorithm that splits the lattices into pieces.

# Begin configuration section.
# The variables below are this script's options together with their default
# values; the usage message further down lists the main ones (--cmd,
# --criterion, --stage, ...).
# NOTE(review): $feat_type is tested later in the script but is not declared
# here, so it is presumably not settable through parse_options.sh -- confirm.
cmd=run.pl
# Passed as --criterion to nnet-get-egs-discriminative.
criterion=smbr
drop_frames=false #  option relevant for MMI, affects how we dump examples.
samples_per_iter=400000 # measured in frames, not in "examples"
max_temp_archives=128 # maximum number of temp archives per input job, only
                      # affects the process of generating archives, not the
                      # final result.

# Parts of the script guarded by "[ $stage -le N ]" are skipped when --stage
# is larger than N, which allows resuming a partially completed run.
stage=0

# If true, the temporary degs_orig.* (and degs_temp.*) archives are removed
# at the end of the script.
cleanup=true
transform_dir= # If this is a SAT system, directory for transforms
# If set, must contain ivector_period and ivector_online.scp; the iVectors
# are then pasted onto the input features.
online_ivector_dir=

# Number of randomly chosen utterances written to $dir/priors_uttlist.
num_utts_subset=3000
# Number of priors_egs.*.ark archives the prior-adjustment examples are
# distributed over.
num_archives_priors=10

# End configuration section.


echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;


# Exactly six positional arguments are required; otherwise print the usage
# message and exit with status 1.  The defaults shown in the message mirror
# the configuration section at the top of the script.
if [ $# != 6 ]; then
  echo "Usage: $0 [opts] <data> <lang> <ali-dir> <denlat-dir> <src-model-file> <degs-dir>"
  echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet_denlats exp/tri4/final.mdl exp/tri4_mpe/degs"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs (probably would be good to add --max-jobs-run 5 or so if using"
  echo "                                                   # GridEngine (to avoid excessive NFS traffic)."
  echo "  --samples-per-iter <#samples|400000>             # Number of samples of data to process per iteration, per"
  echo "                                                   # process."
  echo "  --stage <stage|0>                                # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."
  echo "  --criterion <criterion|smbr>                     # Training criterion: may be smbr, mmi or mpfe"
  # The inner quotes are escaped so that the empty default is actually printed.
  echo "  --online-ivector-dir <dir|\"\">                    # Directory for online-estimated iVectors, used in the"
  echo "                                                   # online-neural-net setup.  (but you may want to use"
  echo "                                                   # steps/online/nnet2/get_egs_discriminative2.sh instead)"
  exit 1;
fi

# Positional arguments (see the usage message above).
data=$1
lang=$2
alidir=$3
denlatdir=$4
src_model=$5
dir=$6


# Files that are only required when online iVectors are used.
extra_files=
# The variable is quoted so that the test does not depend on how "[" parses
# a missing operand when the option is empty.
if [ -n "$online_ivector_dir" ]; then
  extra_files="$online_ivector_dir/ivector_period $online_ivector_dir/ivector_online.scp"
fi

# Check some files.  The script stops at the first one that is missing.
for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/num_jobs $alidir/tree \
         $denlatdir/lat.1.gz $denlatdir/num_jobs $src_model $extra_files; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

# Create the output directories: log files go to $dir/log, small metadata
# files (num_frames, num_archives, ...) to $dir/info.
mkdir -p $dir/log $dir/info || exit 1;

utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;

nj=$(cat $denlatdir/num_jobs) || exit 1; # $nj is the number of splits of the
                                         # denominator lattices; the data is
                                         # split the same way below.

# Number of splits of the alignments; it may differ from $nj.
nj_ali=$(cat $alidir/num_jobs) || exit 1;

sdata=$data/split$nj
utils/split_data.sh $data $nj

# Set up two ways of reading the alignments:
#  - ali_rspecifier:       per-job alignments for the main egs dumping ("JOB"
#                          is substituted by $cmd when the job runs);
#  - prior_ali_rspecifier: pdf-level alignments of the utterances listed in
#                          $dir/priors_uttlist.  It intentionally ends in an
#                          open pipe ("|"): a further command is appended
#                          where it is used.
if [ $nj_ali -eq $nj ]; then
  # Same split for lattices and alignments: read the per-job files directly.
  ali_rspecifier="ark,s,cs:gunzip -c $alidir/ali.JOB.gz |"
  alis=$(for n in $(seq $nj); do echo -n "$alidir/ali.$n.gz "; done)
  prior_ali_rspecifier="ark,s,cs:gunzip -c $alis | copy-int-vector ark:- ark,t:- | utils/filter_scp.pl $dir/priors_uttlist | ali-to-pdf $alidir/final.mdl ark,t:- ark:- |"
else
  # Different splits: go through a single indexed archive, $dir/ali.ark with
  # index $dir/ali.scp.  The archive is only (re)written when --stage <= 1,
  # so with a later stage it must already exist.
  ali_rspecifier="scp:$dir/ali.scp"
  prior_ali_rspecifier="ark,s,cs:utils/filter_scp.pl $dir/priors_uttlist $dir/ali.scp | ali-to-pdf $alidir/final.mdl scp:- ark:- |"
  if [ $stage -le 1 ]; then
    echo "$0: number of jobs in den-lats versus alignments differ: dumping them as single archive and index."
    alis=$(for n in $(seq $nj_ali); do echo -n "$alidir/ali.$n.gz "; done)
    $cmd $dir/log/copy_alignments.log \
      copy-int-vector "ark:gunzip -c $alis|" \
      ark,scp:$dir/ali.ark,$dir/ali.scp || exit 1;
  fi
fi

# Read the feature options recorded in the alignment directory.  splice_opts
# and cmvn_opts are optional there (errors are silenced); silence.csl is
# mandatory.
splice_opts=`cat $alidir/splice_opts 2>/dev/null`
# NOTE(review): silphonelist is not referenced again in this script; the
# command mainly serves as a check that silence.csl is readable.
silphonelist=`cat $lang/phones/silence.csl` || exit 1;
cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null`
# Copy the option files, the tree, the silence list and the source model into
# the output directory; the model is stored there as final.mdl.
cp $alidir/splice_opts $dir 2>/dev/null
cp $alidir/cmvn_opts $dir 2>/dev/null
cp $alidir/tree $dir
cp $lang/phones/silence.csl $dir/info/
cp $src_model $dir/final.mdl || exit 1

# Record the iVector dimension in $dir/info/ivector_dim (0 when no iVectors
# are used).
if [ ! -z "$online_ivector_dir" ]; then
  ivector_period=$(cat $online_ivector_dir/ivector_period)
  ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1;
  echo $ivector_dim >$dir/info/ivector_dim
  # the 'const_dim_opt' allows it to write only one iVector per example,
  # rather than one per time-index (presumably by averaging the iVector over
  # the frames of the example -- TODO confirm).
  const_dim_opt="--const-feat-dim=$ivector_dim"
else
  echo 0 > $dir/info/ivector_dim
fi

# Get the list of utterances used for prior adjustment: a random selection of
# at most $num_utts_subset utterance-ids, written to $dir/priors_uttlist.
awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_utts_subset \
    > $dir/priors_uttlist || exit 1;

## We don't support deltas here, only LDA or raw (mainly because deltas are less
## frequently used).
# Auto-detect the feature type unless $feat_type is already set: "lda" when the
# alignment directory has an LDA matrix (final.mat) and there are no raw fMLLR
# transforms, "raw" otherwise.  The raw transforms are looked for in
# --transform-dir when it was given and in the alignment directory otherwise
# (with an empty --transform-dir the path would otherwise be "/raw_trans.1").
if [ -z "$feat_type" ]; then
  if [ -f $alidir/final.mat ] && [ ! -f ${transform_dir:-$alidir}/raw_trans.1 ]; then feat_type=lda; else feat_type=raw; fi
fi
echo "$0: feature type is $feat_type"

# Build two feature pipelines:
#   feats        - per-job features ("JOB" is substituted when the job runs);
#   priors_feats - features of the utterances listed in $dir/priors_uttlist.
# Both end in an open pipe so that further stages can be appended.
case $feat_type in
  raw) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"
    priors_feats="ark,s,cs:utils/filter_scp.pl $dir/priors_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |"
   ;;
  lda)
    splice_opts=`cat $alidir/splice_opts 2>/dev/null`
    cp $alidir/final.mat $dir
    feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
    priors_feats="ark,s,cs:utils/filter_scp.pl $dir/priors_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
    ;;
  *) echo "$0: invalid feature type $feat_type" && exit 1;
esac

# When --transform-dir was not given, fall back to the alignment directory if
# it holds the kind of transform that matches the feature type (trans.* for
# LDA features, raw_trans.* for raw features).  The existence tests have to
# use $alidir: $transform_dir is empty in this branch.
if [ -z "$transform_dir" ]; then
  if [ $feat_type == "lda" ] && [ -f $alidir/trans.1 ]; then
    transform_dir=$alidir
  elif [ $feat_type == "raw" ] && [ -f $alidir/raw_trans.1 ]; then
    transform_dir=$alidir
  fi
fi

# Append the speaker transforms (if a transform directory is in use) to both
# feature pipelines.
if [ ! -z "$transform_dir" ]; then
  echo "$0: using transforms from $transform_dir"
  [ ! -s $transform_dir/num_jobs ] && \
    echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1;
  nj_orig=$(cat $transform_dir/num_jobs)

  # Transform file prefix: raw_trans.* for raw features, trans.* otherwise.
  if [ $feat_type == "raw" ]; then trans=raw_trans;
  else trans=trans; fi
  # For LDA features the transforms are only valid on top of the same LDA
  # matrix as the one in the alignment directory.
  if [ $feat_type == "lda" ] && ! cmp $transform_dir/final.mat $alidir/final.mat; then
    echo "$0: LDA transforms differ between $alidir and $transform_dir"
    exit 1;
  fi
  if [ ! -f $transform_dir/$trans.1 ]; then
    echo "$0: expected $transform_dir/$trans.1 to exist (--transform-dir option)"
    exit 1;
  fi
  if [ $nj -ne $nj_orig ]; then
    # The transforms were split differently from our data, so they cannot be
    # read per job.
    # Copy the transforms into an archive with an index.
    for n in $(seq $nj_orig); do cat $transform_dir/$trans.$n; done | \
      copy-feats ark:- ark,scp:$dir/$trans.ark,$dir/$trans.scp || exit 1;
    feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/$trans.scp ark:- ark:- |"
    priors_feats="$priors_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/$trans.scp ark:- ark:- |"
  else
    # number of jobs matches with alignment dir.
    feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/$trans.JOB ark:- ark:- |"
    # The priors pipeline is not split by job, so it reads the concatenation
    # of all the per-job transform files.
    tras=$(for n in $(seq $nj); do echo -n "$transform_dir/$trans.$n "; done)
    priors_feats="$priors_feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $tras |' ark:- ark:- |"
  fi
fi
if [ ! -z $online_ivector_dir ]; then
  # add iVectors to the features.
  # subsample-feats is given the negated iVector period; the length tolerance
  # of paste-feats is set to the same period.
  feats="$feats paste-feats --length-tolerance=$ivector_period ark:- 'ark,s,cs:utils/filter_scp.pl $sdata/JOB/utt2spk $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |' ark:- |"
  priors_feats="$priors_feats paste-feats --length-tolerance=$ivector_period ark:- 'ark,s,cs:utils/filter_scp.pl $dir/priors_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |' ark:- |"
fi


# Work out how many archives to create (or, when resuming at a later stage,
# read the numbers back from $dir/info).
if [ $stage -le 2 ]; then
  echo "$0: working out number of frames of training data"
  num_frames=$(steps/nnet2/get_num_frames.sh $data)

  echo $num_frames > $dir/info/num_frames

  # Total number of archives: the integer division is bumped by one on the
  # assumption that the frame count is not an exact multiple, i.e. we round up.
  num_archives=$(($num_frames/$samples_per_iter + 1))

  # Each input job may first be split into fewer than $num_archives temporary
  # pieces, so that no job keeps an excessive number of files open;
  # $archive_ratio is the number of final archives per temporary piece.
  archive_ratio=$(($num_archives/$max_temp_archives+1))
  num_archives_temp=$(($num_archives/$archive_ratio))
  # Adjust $num_archives slightly so that it becomes an exact multiple of
  # $archive_ratio.
  num_archives=$(($num_archives_temp*$archive_ratio))

  echo $num_archives >$dir/info/num_archives || exit 1
  echo $num_archives_temp >$dir/info/num_archives_temp || exit 1

  # Frames per archive, counted before any frames are discarded.
  frames_per_archive=$(($num_frames/$num_archives))
  echo $frames_per_archive > $dir/info/frames_per_archive
else
  num_archives=$(cat $dir/info/num_archives) || exit 1;
  num_archives_temp=$(cat $dir/info/num_archives_temp) || exit 1;
  frames_per_archive=$(cat $dir/info/frames_per_archive) || exit 1;
fi

echo "$0: Splitting the data up into $num_archives archives (using $num_archives_temp temporary pieces per input job)"
echo "$0: giving samples-per-iteration of $frames_per_archive (you requested $samples_per_iter)."

# The data links are (re)created on every run, whatever the value of --stage:
# a link that was deleted earlier may have to be put back.

if [ -d $dir/storage ]; then
  echo "$0: creating data links for distributed storage of degs"
  # See utils/create_split_dir.pl for how this 'storage' directory is created.

  # Temporary pieces: one per (input job, temporary archive) pair.
  for job in $(seq $nj); do
    for piece in $(seq $num_archives_temp); do
      utils/create_data_link.pl $dir/degs_orig.$job.$piece.ark
    done
  done

  # Final archives.
  for archive in $(seq $num_archives); do
    utils/create_data_link.pl $dir/degs.$archive.ark
  done

  # Un-shuffled intermediate archives; these only exist when the temporary
  # pieces have to be re-split into a larger number of final archives.
  if [ $num_archives_temp -ne $num_archives ]; then
    for archive in $(seq $num_archives); do
      utils/create_data_link.pl $dir/degs_temp.$archive.ark
    done
  fi
fi

# Remove a stale error marker from a previous run; the background job below
# creates this file on failure and it is checked after 'wait'.
rm $dir/.error 2>/dev/null
# Read the left and right context of the model from the output of nnet-am-info.
# NOTE(review): the status tested by '|| exit 1' is that of the last command
# of each pipeline (awk), unless pipefail is enabled elsewhere.
left_context=$(nnet-am-info $dir/final.mdl | grep '^left-context' | awk '{print $2}') || exit 1
right_context=$(nnet-am-info $dir/final.mdl | grep '^right-context' | awk '{print $2}') || exit 1

# The subshell below is started in the background ('&' after the closing
# parenthesis) so that the prior-adjustment egs are dumped while the main
# stages run.  An 'exit' inside it only terminates the subshell; failure is
# signalled to the main script through $dir/.error.
(

if [ $stage -le 10 ]; then

# Build the list of output archives, creating a data link for each one.
priors_egs_list=
for y in `seq $num_archives_priors`; do
  utils/create_data_link.pl $dir/priors_egs.$y.ark
  priors_egs_list="$priors_egs_list ark:$dir/priors_egs.$y.ark"
done

nnet_context_opts="--left-context=$left_context --right-context=$right_context"

echo "$0: dumping egs for prior adjustment in the background."

# $prior_ali_rspecifier ends in an open pipe, which is completed here with an
# ali-to-post stage.
# NOTE(review): $ivectors_opt is not assigned anywhere in this script, so it
# expands to nothing unless it is inherited from the environment.
$cmd $dir/log/create_priors_subset.log \
  nnet-get-egs $ivectors_opt $nnet_context_opts "$priors_feats" \
  "$prior_ali_rspecifier ali-to-post ark:- ark:- |" \
  ark:- \| nnet-copy-egs ark:- $priors_egs_list || \
  { touch $dir/.error; echo "Error in creating priors subset. See $dir/log/create_priors_subset.log"; exit 1; }

sleep 3;

echo $num_archives_priors >$dir/info/num_archives_priors

fi

) &

# Stage 3: one job per lattice split.  Each job turns its features,
# alignments and denominator lattices into discriminative training examples
# and distributes them over $num_archives_temp temporary archives named
# degs_orig.<job>.<n>.ark.
if [ $stage -le 3 ]; then
  echo "$0: getting initial training examples by splitting lattices"

  # "JOB" stays literal here; $cmd substitutes the job index at run time.
  degs_list=$(for n in $(seq $num_archives_temp); do echo -n "ark:$dir/degs_orig.JOB.$n.ark "; done)

  # The escaped pipe (\|) is passed on to $cmd so that both programs run as a
  # single pipeline inside each job.  $const_dim_opt is only set when online
  # iVectors are in use.
  $cmd JOB=1:$nj $dir/log/get_egs.JOB.log \
    nnet-get-egs-discriminative --criterion=$criterion --drop-frames=$drop_frames \
      "$src_model" "$feats" "$ali_rspecifier" "ark,s,cs:gunzip -c $denlatdir/lat.JOB.gz|" ark:- \| \
    nnet-copy-egs-discriminative $const_dim_opt ark:- $degs_list || exit 1;
  sleep 5;  # wait a bit so NFS has time to write files.
fi

# Stage 4: for each temporary archive index (JOB) gather the pieces written by
# all $nj input jobs.  When no re-splitting is needed they are shuffled
# straight into the final archives; otherwise they are re-split into the
# un-shuffled degs_temp.* archives, which the next stage shuffles.
if [ $stage -le 4 ]; then

  # Note the index order: degs_orig.<input job>.<temporary archive>.ark, with
  # JOB now standing for the temporary archive index.
  degs_list=$(for n in $(seq $nj); do echo -n "$dir/degs_orig.$n.JOB.ark "; done)

  if [ $num_archives -eq $num_archives_temp ]; then
    echo "$0: combining data into final archives and shuffling it"

    $cmd JOB=1:$num_archives $dir/log/shuffle.JOB.log \
      cat $degs_list \| nnet-shuffle-egs-discriminative --srand=JOB ark:- \
       ark:$dir/degs.JOB.ark || exit 1;
  else
    echo "$0: combining and re-splitting data into un-shuffled versions of final archives."

    # Recompute the number of final archives per temporary archive; this is
    # exact, because $num_archives was made a multiple of the ratio.
    archive_ratio=$[$num_archives/$num_archives_temp]
    ! [ $archive_ratio -gt 1 ] && echo "$0: Bad archive_ratio $archive_ratio" && exit 1;

    # note: the \$[ .. ] won't be evaluated until the job gets executed.  The
    # aim is to write to the archives with the final numbering, 1
    # ... num_archives, which is more than num_archives_temp.  The list with
    # \$[... ] expressions in it computes the set of final indexes for each
    # temporary index.
    degs_list_out=$(for n in $(seq $archive_ratio); do echo -n "ark:$dir/degs_temp.\$[((JOB-1)*$archive_ratio)+$n].ark "; done)
    # e.g. if dir=foo and archive_ratio=2, we'd have
    # degs_list_out='ark:foo/degs_temp.$[((JOB-1)*2)+1].ark ark:foo/degs_temp.$[((JOB-1)*2)+2].ark '

    $cmd JOB=1:$num_archives_temp $dir/log/resplit.JOB.log \
      cat $degs_list \| nnet-copy-egs-discriminative --srand=JOB ark:- \
      $degs_list_out || exit 1;
  fi
fi

# Stage 5 is only needed when the previous stage had to re-split the data:
# the un-shuffled degs_temp.* archives are shuffled into the final degs.*.
if [ $stage -le 5 ]; then
  if [ $num_archives -ne $num_archives_temp ]; then
    echo "$0: shuffling final archives."

    if ! $cmd JOB=1:$num_archives $dir/log/shuffle.JOB.log \
           nnet-shuffle-egs-discriminative --srand=JOB \
             ark:$dir/degs_temp.JOB.ark ark:$dir/degs.JOB.ark; then
      exit 1
    fi
  fi
fi

# Wait for the background job that dumps the prior-adjustment egs; it leaves
# $dir/.error behind when it fails.
wait;
if [ -f $dir/.error ]; then
  echo "Error detected while creating priors adjustment egs" && exit 1
fi

# Deletes one temporary archive.  When the archive is a symbolic link, the
# file it points to is deleted before the link itself.
remove_temp_archive() {
  if [ -L $1 ]; then
    rm $(utils/make_absolute.sh $1)
  fi
  rm $1
}

if $cleanup; then
  echo "$0: removing temporary archives."
  # Pieces written by the lattice-splitting jobs.
  for job in $(seq $nj); do
    for piece in $(seq $num_archives_temp); do
      remove_temp_archive $dir/degs_orig.$job.$piece.ark
    done
  done
  # Un-shuffled intermediate archives, present only after re-splitting.
  if [ $num_archives_temp -ne $num_archives ]; then
    for archive in $(seq $num_archives); do
      remove_temp_archive $dir/degs_temp.$archive.ark
    done
  fi
fi

echo "$0: Done."


================================================
FILE: egs/steps/nnet2/get_ivector_id.sh
================================================
#!/usr/bin/env bash
# Copyright (c) 2016, Johns Hopkins University (Yenda Trmal <jtrmal@gmail.com>)
# License: Apache 2.0

# Prints an identifier of the iVector extractor found in <directory>:
#  - the content of <directory>/final.ie.id when that file exists;
#  - otherwise the md5 sum of <directory>/final.ie, which is also cached in
#    final.ie.id on a best-effort basis;
#  - nothing at all when neither file exists.
# In all three cases the exit status is 0.  Only the identifier is written to
# stdout; usage errors go to stderr.

# Begin configuration section.
# End configuration section.
set -e -o pipefail
set -o nounset                              # Treat unset variables as an error

#echo >&2 "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;


if [ $# != 1 ]; then
  echo >&2 "Usage: $0 <directory>"
  echo >&2 " e.g.: $0 exp/nnet3/extractor"
  exit 1
fi

ivecdir=$1

# The directory is quoted throughout so that paths containing whitespace work.
if [ -f "$ivecdir/final.ie.id" ] ; then
  cat "$ivecdir/final.ie.id"
elif [ -f "$ivecdir/final.ie" ] ; then
  # note the creation of the id file can fail in case the extractor directory
  # is on read-only media or the user does not have access rights;
  # in that case we still print the id but simply do not cache it
  id=$(md5sum "$ivecdir/final.ie" | awk '{print $1}')
  echo "$id" > "$ivecdir/final.ie.id" || true
  echo "$id"
else
  exit 0
fi

exit 0


================================================
FILE: egs/steps/nnet2/get_lda.sh
================================================
#!/usr/bin/env bash

# Copyright 2012 Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
# This script, which will generally be called from other neural-net training
# scripts, accumulates LDA statistics over the spliced training features (using
# the alignments in <ali-dir>) and estimates the resulting feature transform,
# which it writes to <exp-dir>/lda.mat.

# Begin configuration section.
# The variables below are this script's options together with their default
# values.
cmd=run.pl

# Feature type ("raw" or "lda"); detected from the alignment directory when
# left empty.
feat_type=
# Parts of the script guarded by "[ $stage -le N ]" are skipped when --stage
# is larger than N.
stage=0
splice_width=4 # meaning +- 4 frames on each side for second LDA
left_context= # left context for second LDA
right_context= # right context for second LDA
# (left_context and right_context default to $splice_width when left empty.)
rand_prune=4.0 # Relates to a speedup we do for LDA.
within_class_factor=0.0001 # This affects the scaling of the transform rows...
                           # sorry for no explanation, you'll have to see the code.
transform_dir=     # If supplied, overrides alidir
num_feats=10000 # maximum number of feature files to use.  Beyond a certain point it just
                # gets silly to use more data.
lda_dim=  # This defaults to no dimension reduction.
# If set, must contain ivector_online.scp and ivector_period; the iVectors are
# then pasted onto the spliced features.
online_ivector_dir=
ivector_randomize_prob=0.0 # if >0.0, randomizes iVectors during training with
                           # this prob per iVector.
# NOTE(review): ivector_dir is not referenced anywhere else in this script.
ivector_dir=
cmvn_opts=  # allows you to specify options for CMVN, if feature type is not lda.

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;


# Exactly four positional arguments are required; otherwise print the usage
# message and exit with status 1.  Option names and defaults shown here mirror
# the configuration section at the top of the script.
if [ $# != 4 ]; then
  echo "Usage: steps/nnet2/get_lda.sh [opts] <data> <lang> <ali-dir> <exp-dir>"
  echo " e.g.: steps/nnet2/get_lda.sh data/train data/lang exp/tri3_ali exp/tri4_nnet"
  echo " This script accumulates LDA statistics on the spliced features and"
  echo " estimates the feature transform, which is written to <exp-dir>/lda.mat."
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --splice-width <width|4>                         # Number of frames on each side to append for feature input"
  echo "                                                   # (note: we splice processed, typically 40-dimensional frames)"
  echo "  --left-context <width|4>                         # Number of frames on left side to append for feature input, overrides splice-width"
  echo "  --right-context <width|4>                        # Number of frames on right side to append for feature input, overrides splice-width"
  echo "  --stage <stage|0>                                # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."
  echo "  --online-ivector-dir <dir|none>                  # Directory produced by"
  echo "                                                   # steps/online/nnet2/extract_ivectors_online.sh"
  exit 1;
fi

# Positional arguments (see the usage message above).
data=$1
lang=$2
alidir=$3
dir=$4

# An empty left/right context falls back to the symmetric splice width.
left_context=${left_context:-$splice_width}
right_context=${right_context:-$splice_width}

# The iVector files only have to exist when online iVectors are requested.
if [ ! -z "$online_ivector_dir" ]; then
  extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"
fi

# Make sure all the required input files are present; stop at the first one
# that is missing.
for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree $extra_files; do
  if [ ! -f $f ]; then
    echo "$0: no such file $f" && exit 1;
  fi
done


# Read a few values from the lang and alignment directories.
oov=$(cat $lang/oov.int)
num_leaves=$(gmm-info $alidir/final.mdl 2>/dev/null | awk '/number of pdfs/{print $NF}') || exit 1;
silphonelist=$(cat $lang/phones/silence.csl) || exit 1;

# The data is split the same way as the alignments (one split per alignment
# job).
nj=$(cat $alidir/num_jobs) || exit 1;
sdata=$data/split$nj
utils/split_data.sh $data $nj

# Prepare the output directory: log directory, job count and tree.
mkdir -p $dir/log
echo $nj > $dir/num_jobs
cp $alidir/tree $dir


# The phone sets of the lang directory and of the alignments must match; the
# phone table is stored alongside the output.
utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

# Transforms are taken from the alignment directory unless --transform-dir
# was given.
[ -z "$transform_dir" ] && transform_dir=$alidir

# Remember whether --cmvn-opts was given on the command line *before* falling
# back to the options recorded in the alignment directory.  The LDA branch
# below must only reject options supplied by the user; testing $cmvn_opts
# after the fallback would reject every alignment directory whose cmvn_opts
# file is non-empty.
if [ -z "$cmvn_opts" ]; then
  cmvn_opts_supplied=false
  cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null`
else
  cmvn_opts_supplied=true
fi
echo $cmvn_opts >$dir/cmvn_opts 2>/dev/null

## Set up features.  Note: these are different from the normal features
## because we have one rspecifier that has the features for the entire
## training set, not separate ones for each batch.
if [ -z "$feat_type" ]; then
  if [ -f $alidir/final.mat ] && ! [ -f $alidir/raw_trans.1 ]; then feat_type=lda; else feat_type=raw; fi
fi
echo "$0: feature type is $feat_type"


# If we have more than $num_feats feature files (default: 10k),
# we use a random subset.  This won't affect the transform much, and will
# spare us an unnecessary pass over the data.  Probably 10k is
# way too much, but for small datasets this phase is quite fast.
# N is the number of feature files kept per job.
N=$[$num_feats/$nj]

# Build the per-job feature pipeline ("JOB" is substituted when a job runs).
# It ends in an open pipe so that further stages can be appended.
case $feat_type in
  raw) feats="ark,s,cs:utils/subset_scp.pl --quiet $N $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- |"
    echo $cmvn_opts >$dir/cmvn_opts
   ;;
  lda) 
    splice_opts=`cat $alidir/splice_opts 2>/dev/null`
    cp $alidir/{splice_opts,cmvn_opts,final.mat} $dir || exit 1;
    # With LDA features the CMVN options are dictated by the alignment
    # directory, so an explicit --cmvn-opts is an error.
    if $cmvn_opts_supplied; then
      echo "You cannot supply --cmvn-opts option if feature type is LDA."
      exit 1;
    fi
    cmvn_opts=$(cat $dir/cmvn_opts)
     feats="ark,s,cs:utils/subset_scp.pl --quiet $N $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
    ;;
  *) echo "$0: invalid feature type $feat_type" && exit 1;
esac

# Append the per-job speaker transforms when the transform directory holds
# the kind that matches the feature type: trans.* for non-raw features,
# raw_trans.* for raw features.
if [ -f $transform_dir/trans.1 ] && [ $feat_type != "raw" ]; then
  echo "$0: using transforms from $transform_dir"
  feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
fi
if [ -f $transform_dir/raw_trans.1 ] && [ $feat_type == "raw" ]; then
  echo "$0: using raw-fMLLR transforms from $transform_dir"
  feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/raw_trans.JOB ark:- ark:- |"
fi


# Instantiate the pipeline for job 1 in order to query the feature dimension.
feats_one="$(echo "$feats" | sed s:JOB:1:g)"
# note: feat_dim is the raw, un-spliced feature dim without the iVectors.
feat_dim=$(feat-to-dim "$feats_one" -) || exit 1;
# by default: no dim reduction.

# Features with the left/right context frames spliced in; these are the input
# of the LDA accumulation.
spliced_feats="$feats splice-feats --left-context=$left_context --right-context=$right_context ark:- ark:- |"

if [ ! -z "$online_ivector_dir" ]; then
  ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1;
  # note: subsample-feats, with negative value of n, repeats each feature n times.
  spliced_feats="$spliced_feats paste-feats --length-tolerance=$ivector_period ark:- 'ark,s,cs:utils/filter_scp.pl $sdata/JOB/utt2spk $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- | ivector-randomize --randomize-prob=$ivector_randomize_prob ark:- ark:- |' ark:- |"
  ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1;
else
  ivector_dim=0
fi
echo $ivector_dim >$dir/ivector_dim

# Without an explicit --lda-dim the output dimension equals the dimension of
# the spliced features (including any iVector), i.e. no reduction.
if [ -z "$lda_dim" ]; then
  spliced_feats_one="$(echo "$spliced_feats" | sed s:JOB:1:g)"  
  lda_dim=$(feat-to-dim "$spliced_feats_one" -) || exit 1;
fi

# Stage 0: one job per alignment split.  The alignments are converted to
# posteriors, silence phones are given weight 0.0, and LDA statistics are
# accumulated on the spliced features into $dir/lda.JOB.acc.
if [ $stage -le 0 ]; then
  echo "$0: Accumulating LDA statistics."
  rm $dir/lda.*.acc 2>/dev/null # in case any left over from before.
  $cmd JOB=1:$nj $dir/log/lda_acc.JOB.log \
    ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
      weight-silence-post 0.0 $silphonelist $alidir/final.mdl ark:- ark:- \| \
      acc-lda --rand-prune=$rand_prune $alidir/final.mdl "$spliced_feats" ark,s,cs:- \
       $dir/lda.JOB.acc || exit 1;
fi

# Record the dimensions in the output directory.  These files are written
# regardless of --stage (ivector_dim was already written once further up).
echo $feat_dim > $dir/feat_dim
echo $lda_dim > $dir/lda_dim
echo $ivector_dim > $dir/ivector_dim

# Stage 1: sum the per-job statistics into $dir/lda.acc and delete the
# per-job files.
if [ $stage -le 1 ]; then
  sum-lda-accs $dir/lda.acc $dir/lda.*.acc 2>$dir/log/lda_sum.log || exit 1;
  rm $dir/lda.*.acc
fi

# Stage 2: estimate the transform $dir/lda.mat from the summed statistics.
if [ $stage -le 2 ]; then
  # There are various things that we sometimes (but not always) need
  # the within-class covariance and its Cholesky factor for, and we
  # write these to disk just in case.
  nnet-get-feature-transform --write-cholesky=$dir/cholesky.tpmat \
     --write-within-covar=$dir/within_covar.spmat \
     --within-class-factor=$within_class_factor --dim=$lda_dim \
      $dir/lda.mat $dir/lda.acc \
      2>$dir/log/lda_est.log || exit 1;
fi

echo "$0: Finished estimating LDA"


================================================
FILE: egs/steps/nnet2/get_lda_block.sh
================================================
#!/usr/bin/env bash

# Copyright 2012 Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
# This script, which will generally be called from other neural-net training
# scripts, accumulates LDA statistics over the spliced training features (using
# the alignments in <ali-dir>) and estimates a transform over overlapping blocks
# of feature dimensions, which it writes to <exp-dir>/lda.mat.

# Begin configuration section.
# The variables below are this script's options together with their default
# values.
cmd=run.pl

# Parts of the script guarded by "[ $stage -le N ]" are skipped when --stage
# is larger than N.
stage=0
splice_width=4 # meaning +- 4 frames on each side for second LDA
rand_prune=4.0 # Relates to a speedup we do for LDA.
within_class_factor=0.0001 # This affects the scaling of the transform rows...
                           # sorry for no explanation, you'll have to see the code.
# Each block covers $block_size consecutive feature dimensions; consecutive
# blocks start $block_shift dimensions apart (see the index generation below).
block_size=10
block_shift=5

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;


# Exactly four positional arguments are required; otherwise print the usage
# message and exit with status 1.  Option names and defaults shown here mirror
# the configuration section at the top of the script.
if [ $# != 4 ]; then
  echo "Usage: steps/nnet2/get_lda_block.sh [opts] <data> <lang> <ali-dir> <exp-dir>"
  echo " e.g.: steps/nnet2/get_lda_block.sh data/train data/lang exp/tri3_ali exp/tri4_nnet"
  echo " This script accumulates LDA statistics on the spliced features and"
  echo " estimates a block-wise feature transform, written to <exp-dir>/lda.mat."
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --splice-width <width|4>                         # Number of frames on each side to append for feature input"
  echo "                                                   # (note: we splice processed, typically 40-dimensional frames)"
  echo "  --block-size <size|10>                           # Number of consecutive feature dimensions per block"
  echo "  --block-shift <shift|5>                          # Offset between the starts of consecutive blocks"
  echo "  --stage <stage|0>                                # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."

  exit 1;
fi

# Positional arguments (see the usage message above).
data=$1
lang=$2
alidir=$3
dir=$4

# Make sure all the required input files are present; stop at the first one
# that is missing.
for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree; do
  if [ ! -f $f ]; then
    echo "$0: no such file $f" && exit 1;
  fi
done


# Read a few values from the lang and alignment directories.
oov=$(cat $lang/oov.int)
num_leaves=$(gmm-info $alidir/final.mdl 2>/dev/null | awk '/number of pdfs/{print $NF}') || exit 1;
silphonelist=$(cat $lang/phones/silence.csl) || exit 1;

# The data is split the same way as the alignments (one split per alignment
# job).
nj=$(cat $alidir/num_jobs) || exit 1;
sdata=$data/split$nj
utils/split_data.sh $data $nj

# Prepare the output directory and pick up the CMVN options recorded with the
# alignments (the file is optional, so errors are silenced).
mkdir -p $dir/log
echo $nj > $dir/num_jobs
cp $alidir/tree $dir
cmvn_opts=$(cat $alidir/cmvn_opts 2>/dev/null)

# The phone sets of the lang directory and of the alignments must match; the
# phone table is stored alongside the output.
utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

## Set up features.  Note: these are different from the normal features
## because we have one rspecifier that has the features for the entire
## training set, not separate ones for each batch.

# NOTE(review): $dir/valid_uttlist and $dir/train_subset_uttlist are read here
# but are not created anywhere in this script, so they presumably have to be
# provided by the caller -- confirm.
# feats:              per-job features, excluding the utterances listed in
#                     valid_uttlist ("JOB" is substituted when a job runs).
# train_subset_feats: features of the utterances in train_subset_uttlist;
#                     only used below to obtain the feature dimension.
feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- |"
train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |"

feat_dim=`feat-to-dim "$train_subset_feats" -` || exit 1;

# Stage 0: one job per alignment split.  The alignments are converted to
# posteriors, silence phones are given weight 0.0, and LDA statistics are
# accumulated on the features spliced with +-$splice_width frames into
# $dir/lda.JOB.acc.  'set -o pipefail' makes a job fail when any command of
# its pipeline fails.
if [ $stage -le 0 ]; then
  echo "$0: Accumulating LDA statistics."
  $cmd JOB=1:$nj $dir/log/lda_acc.JOB.log \
    set -o pipefail '&&' \
    ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
      weight-silence-post 0.0 $silphonelist $alidir/final.mdl ark:- ark:- \| \
      acc-lda --rand-prune=$rand_prune $alidir/final.mdl "$feats splice-feats --left-context=$splice_width --right-context=$splice_width ark:- ark:- |" ark,s,cs:- \
       $dir/lda.JOB.acc || exit 1;
fi

echo $feat_dim > $dir/feat_dim

# Start from an empty index file.  One line is written per block; with the
# default block size and shift it looks like:
# 0 1 2 3 4 5 6 7 8 9
# 5 6 7 8 9 10 11 12 13 14
# 10 ...
# where each listed dimension is expanded to its position in every frame of
# the spliced context window.
echo -n > $dir/indexes

block_start=0
num_blocks=0
context_length=$((1+2*($splice_width)))
# Offset from a dimension's index in the first spliced frame to its index in
# the last one.
last_frame_offset=$(($feat_dim*($context_length-1)))

while true; do
  # Write one block: for each of its feature dimensions, the indexes of that
  # dimension in all spliced frames (feat_dim apart), followed by a space.
  for n in $(seq $block_start $(($block_start+$block_size-1))); do
    echo -n $(seq $n $feat_dim $(($n+$last_frame_offset))) ''
  done >> $dir/indexes
  echo >> $dir/indexes
  num_blocks=$(($num_blocks+1))

  # Advance by the block shift, but never let a block run past the last
  # feature dimension: the final block is aligned with the end instead.
  candidate=$(($block_start+$block_shift))
  if [ $(($candidate+$block_size)) -gt $feat_dim ]; then
    candidate=$(($feat_dim-$block_size));
  fi
  # No forward progress means the previous block already reached the end.
  if [ $candidate -le $block_start ]; then break; fi
  block_start=$candidate
done
echo $num_blocks >$dir/num_blocks

# The output dimension is the total number of indexes over all the blocks.
lda_dim=$(wc -w < $dir/indexes)
echo $lda_dim > $dir/lda_dim

# Stage 1: estimate the block-wise transform $dir/lda.mat from the per-job
# statistics and the index file, then delete the statistics.
if [ $stage -le 1 ]; then
  if ! nnet-get-feature-transform-multi --within-class-factor=$within_class_factor \
         $dir/indexes $dir/lda.*.acc $dir/lda.mat 2>$dir/log/lda_est.log; then
    exit 1;
  fi
  rm $dir/lda.*.acc
fi

echo "$0: Finished estimating LDA"


================================================
FILE: egs/steps/nnet2/get_perturbed_feats.sh
================================================
#!/usr/bin/env bash


# begin configuration section
# The variables below are this script's options together with their default
# values; the usage message further down documents them.

cmd="run.pl"
num_copies=5  # support 3, 4 or 5 perturbed copies of the data.
stage=0
nj=8
cleanup=true
# One of fbank, mfcc or plp; selects the steps/make_<type>.sh script that
# extracts the features.
feature_type=fbank
# end configuration section

# Abort as soon as any command fails.
set -e
. utils/parse_options.sh 

# Exactly five positional arguments are required; otherwise print the usage
# message (one line per printf argument) and exit with status 1.
if [ $# -ne 5 ]; then
  printf '%s\n' \
    "Usage: $0 [options] <baseline-feature-config> <feature-storage-dir> <log-location> <input-data-dir> <output-data-dir> " \
    "e.g.: $0 conf/fbank_40.conf mfcc exp/perturbed_fbank_train data/train data/train_perturbed_fbank" \
    "Supported options: " \
    "--feature-type (fbank|mfcc|plp)  # Type of features we are making, default fbank" \
    "--cmd 'command-program'      # Mechanism to run jobs, e.g. run.pl" \
    "--num-copies <n>             # Number of perturbed copies of the data (support 3, 4 or 5), default 5" \
    "--stage <stage>              # Use for partial re-run" \
    "--cleanup (true|false)       # If false, do not clean up temp files (default: true)" \
    "--nj <num-jobs>              # How many jobs to use for feature extraction (default: 8)"
  exit 1;
fi

# Positional arguments (see the usage message above).
base_config=$1
featdir=$2
dir=$3 # dir/log* will contain log-files
inputdata=$4
data=$5

# Set pairs of (VTLN warp factor, time-warp factor)
# Aim to put these roughly in a circle centered at 1.0-1.0; the
# dynamic range of the VTLN warp factor will be 0.9 to 1.1 and
# of the time-warping factor will be 0.8 to 1.2.
# Each entry has the form <vtln-warp>-<time-warp>.
if [ $num_copies -eq 5 ]; then
  pairs="1.1-1.0 1.05-1.2 1.0-0.8 0.95-1.1 0.9-0.9" 
elif [ $num_copies -eq 4 ]; then
  pairs="1.1-1.0 1.0-0.8 1.0-1.2 0.9-1.0"
elif [ $num_copies -eq 3 ]; then
  pairs="1.1-1.1 1.0-0.8 0.9-1.1"
else
  echo "$0: unsupported --num-copies value: $num_copies (support 3, 4 or 5)"
  # Stop here: continuing with an empty list of pairs would skip feature
  # extraction altogether and only fail later, when the (non-existent)
  # perturbed copies are combined.
  exit 1;
fi

# Both the baseline feature config and the wav.scp of the input data must
# exist.
for f in $base_config $inputdata/wav.scp; do 
  [ -f $f ] || { echo "Expected file $f to exist"; exit 1; }
done

# Only the three feature types for which a steps/make_<type>.sh script is
# called below are accepted.
case "$feature_type" in
  fbank|mfcc|plp) ;;
  *)
    echo "$0: Invalid option --feature-type=$feature_type"
    exit 1;
    ;;
esac

mkdir -p $featdir
mkdir -p $dir/conf $dir/log

# Collects the data directories of all the perturbed copies.
all_feature_dirs=""

# Create one perturbed copy of the data per (VTLN warp, time warp) pair.
for pair in $pairs; do
  # Each pair has the form <vtln-warp>-<time-warp>.
  vtln_warp=${pair%%-*}
  time_warp=${pair#*-}
  # The frame shift written to the config is ten times the time-warp factor.
  fs=$(perl -e "print ($time_warp*10);")
  conf=$dir/conf/$pair.conf
  this_dir=$dir/$pair
  feature_data=${data}-$pair
  all_feature_dirs="$all_feature_dirs $feature_data"

  # Per-pair feature config: the baseline config followed by the perturbed
  # frame shift and VTLN warp options.
  ( cat $base_config; echo; echo "--frame-shift=$fs"; echo "--vtln-warp=$vtln_warp" ) > $conf

  echo "Making ${feature_type} features for VTLN-warp $vtln_warp and time-warp $time_warp"

  # Copy the input data directory, prefixing speaker and utterance ids with
  # the pair so that the copies can later be combined; then extract features
  # and CMVN statistics for the copy.
  utils/copy_data_dir.sh --spk-prefix ${pair}- --utt-prefix ${pair}- $inputdata $feature_data
  steps/make_${feature_type}.sh --${feature_type}-config $conf --nj "$nj" --cmd "$cmd" $feature_data $this_dir $featdir

  steps/compute_cmvn_stats.sh $feature_data $this_dir $featdir
done

# Merge all the perturbed copies into the output data directory.
utils/combine_data.sh $data $all_feature_dirs


# In the combined feature directory, create a file utt2uniq which maps
# our extended utterance-ids to "unique utterances".  This enables the
# script steps/nnet2/get_egs.sh to hold out data in a more proper way.
# For every line of the combined utt2spk, take the first field (the
# utterance-id), strip the first matching "<pair>-" prefix from it, and print
# "<utterance-id> <stripped-id>" to utt2uniq.
cat $data/utt2spk | \
   perl -e ' while(<STDIN>){ @A=split; $x=shift @A; $y=$x; 
     foreach $pair (@ARGV) { $y =~ s/^${pair}-// && last; } print "$x $y\n"; } ' $pairs \
  > $data/utt2uniq

# $cleanup is run as a command here, so it must hold "true" or "false".
if $cleanup; then
  echo "$0: Cleaning up temporary directories for ${feature_type} features."
  # Note, this just removes the .scp files and so on, not the data which is located in
  # $featdir and which is still needed.
  rm -r $all_feature_dirs
fi


================================================
FILE: egs/steps/nnet2/make_denlats.sh
================================================
#!/usr/bin/env bash
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.

# Create denominator lattices for MMI/MPE training.
# This version uses the neural-net models (version 2, i.e. the nnet2 code).
# Creates its output in $dir/lat.*.gz

# Begin configuration section.
nj=4
cmd=run.pl
sub_split=1
beam=13.0
lattice_beam=7.0
acwt=0.1
max_active=5000
transform_dir=
max_mem=20000000 # This will stop the processes getting too large.
# This is in bytes, but not "real" bytes-- you have to multiply
# by something like 5 or 10 to get real bytes (not sure why so large)
num_threads=1
online_ivector_dir=
parallel_opts= # ignored now
feat_type=  # you can set this in order to run on top of delta features, although we don't
            # normally want to do this.
# End configuration section.


echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

# Exactly four positional arguments are required; otherwise print usage.
# "$0" is used so the message shows the path the script was really invoked
# by (this file lives under steps/nnet2/, not directly under steps/).
if [ $# != 4 ]; then
  echo "Usage: $0 [options] <data-dir> <lang-dir> <src-dir> <exp-dir>"
  echo "  e.g.: $0 data/train data/lang exp/nnet4 exp/nnet4_denlats"
  echo "Works for (delta|lda) features, and (with --transform-dir option) such features"
  echo " plus transforms."
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config containing options"
  echo "  --nj <nj>                                        # number of parallel jobs"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --sub-split <n-split>                            # e.g. 40; use this for "
  echo "                           # large databases so your jobs will be smaller and"
  echo "                           # will (individually) finish reasonably soon."
  echo "  --transform-dir <transform-dir>   # directory to find fMLLR transforms."
  echo "  --num-threads  <n>                # number of threads per decoding job"
  exit 1;
fi

data=$1
lang=$2
srcdir=$3
dir=$4


# Verify the required input files exist.  The two online-ivector files are
# only required when --online-ivector-dir was given.
extra_files=
[ ! -z "$online_ivector_dir" ] && \
  extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"
for f in $data/feats.scp $lang/L.fst $srcdir/final.mdl $extra_files; do
  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1;
done

sdata=$data/split$nj
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
thread_string=
[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads"

mkdir -p $dir/log
split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs

utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1;

oov=`cat $lang/oov.int` || exit 1;

cp -rH $lang $dir/

# Compute grammar FST which corresponds to unigram decoding graph.
new_lang="$dir/"$(basename "$lang")

# mkgraph.sh expects a whole directory "lang", so put everything in one directory...
# it gets L_disambig.fst and G.fst (among other things) from $dir/lang, and
# final.mdl from $srcdir; the output HCLG.fst goes in $dir/graph.

# Build $dir/dengraph/HCLG.fst unless a non-empty one already exists that is
# newer than the model.  The grammar is a unigram G.fst estimated from the
# training transcripts in $data/text and written into the copied lang dir.
echo "Compiling decoding graph in $dir/dengraph"
if [ -s $dir/dengraph/HCLG.fst ] && [ $dir/dengraph/HCLG.fst -nt $srcdir/final.mdl ]; then
  echo "Graph $dir/dengraph/HCLG.fst already exists: skipping graph creation."
else
  echo "Making unigram grammar FST in $new_lang"
  # sym2int.pl maps fields 2.. (the words) to integers, mapping OOVs to $oov;
  # the awk step then drops field 1 (the utterance-id).
  cat $data/text | utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt | \
   awk '{for(n=2;n<=NF;n++){ printf("%s ", $n); } printf("\n"); }' | \
    utils/make_unigram_grammar.pl | fstcompile | fstarcsort --sort_type=ilabel > $new_lang/G.fst \
    || exit 1;
  utils/mkgraph.sh $new_lang $srcdir $dir/dengraph || exit 1;
fi
cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
cp $srcdir/cmvn_opts $dir 2>/dev/null

# If --feat-type was not given, infer it: "lda" when the source dir has a
# final.mat, otherwise "raw".
if [ -z "$feat_type" ]; then
  if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=raw; fi
fi
echo "$0: feature type is $feat_type"

# Build the feature pipeline string.  "JOB" is left as a literal placeholder
# in the rspecifier; it is rewritten per job later in this script (by sed for
# sub-splits) or presumably substituted by $cmd.
case $feat_type in
  delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
  raw) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"
   ;;
  lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
    # Keep a copy of the LDA matrix alongside the lattices.
    cp $srcdir/final.mat $dir
   ;;
  *) echo "Invalid feature type $feat_type" && exit 1;
esac

# Optionally append per-speaker transforms from --transform-dir to the
# feature pipeline.
if [ ! -z "$transform_dir" ]; then
  echo "$0: using transforms from $transform_dir"
  [ ! -s $transform_dir/num_jobs ] && \
    echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1;
  nj_orig=$(cat $transform_dir/num_jobs)

  # Raw features use raw_trans.*; delta/lda features use trans.*.
  if [ $feat_type == "raw" ]; then trans=raw_trans;
  else trans=trans; fi
  # For LDA features the transform dir must have used the same final.mat.
  if [ $feat_type == "lda" ] && ! cmp $transform_dir/final.mat $srcdir/final.mat; then
    echo "$0: LDA transforms differ between $srcdir and $transform_dir"
    exit 1;
  fi
  if [ ! -f $transform_dir/$trans.1 ]; then
    echo "$0: expected $transform_dir/$trans.1 to exist (--transform-dir option)"
    exit 1;
  fi
  if [ $nj -ne $nj_orig ]; then
    # The job counts differ, so the per-job transform files cannot be used
    # one-to-one: concatenate them into a single indexed archive instead.
    # Copy the transforms into an archive with an index.
    for n in $(seq $nj_orig); do cat $transform_dir/$trans.$n; done | \
       copy-feats ark:- ark,scp:$dir/$trans.ark,$dir/$trans.scp || exit 1;
    feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/$trans.scp ark:- ark:- |"
  else
    # number of jobs matches with alignment dir.
    feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/$trans.JOB ark:- ark:- |"
  fi
fi


# Optionally paste online ivectors onto the features.  The ivector scp is
# filtered to this job's utterances and each ivector is repeated
# $ivector_period times before being pasted.
if [ ! -z "$online_ivector_dir" ]; then
  ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1;
  # note: subsample-feats, with negative n, will repeat each feature -n times.
  feats="$feats paste-feats --length-tolerance=$ivector_period ark:- 'ark,s,cs:utils/filter_scp.pl $sdata/JOB/utt2spk $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |' ark:- |"
fi


# if this job is interrupted by the user, we want any background jobs to be
# killed too.
# Kill any still-running background jobs of this shell.
# "jobs -pr" lists the process ids of running jobs only; nothing is done
# when there are none.
cleanup() {
  local pids=$(jobs -pr)
  [ -n "$pids" ] && kill $pids
}
# Run cleanup on interrupt/quit/terminate signals and on normal exit.
# NOTE(review): the handler does not itself call exit, so after a trapped
# signal the script is not terminated by the handler -- confirm intended.
trap "cleanup" INT QUIT TERM EXIT


# Generate the denominator lattices.  With --sub-split 1 all $nj jobs are
# run in one $cmd call; otherwise each of the $nj data splits is further
# split per utterance and processed as its own background job.
if [ $sub_split -eq 1 ]; then
  $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode_den.JOB.log \
   nnet-latgen-faster$thread_string --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \
    --max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $srcdir/final.mdl  \
     $dir/dengraph/HCLG.fst "$feats" "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1;
else
  # each job from 1 to $nj is split into multiple pieces (sub-split), and we aim
  # to have at most two jobs running at each time.  The idea is that if we have stragglers
  # from one job, we can be processing another one at the same time.
  rm $dir/.error 2>/dev/null

  prev_pid=
  # The loop runs one extra iteration (n = nj+1) that launches nothing and
  # only waits for / merges the output of the last real subset.
  for n in `seq $[nj+1]`; do
    if [ $n -gt $nj ]; then
      this_pid=
    elif [ -f $dir/.done.$n ] && [ $dir/.done.$n -nt $srcdir/final.mdl ]; then
      echo "Not processing subset $n as already done (delete $dir/.done.$n if not)";
      this_pid=
    else
      sdata2=$data/split$nj/$n/split${sub_split}utt;
      split_data.sh --per-utt $sdata/$n $sub_split || exit 1;
      mkdir -p $dir/log/$n
      mkdir -p $dir/part
      # Point the pipeline at this subset: transforms come from trans.$n and
      # data paths from $n/split${sub_split}utt/JOB/.
      feats_subset=`echo $feats | sed "s/trans.JOB/trans.$n/g" | sed s:JOB/:$n/split${sub_split}utt/JOB/:g`

      # Runs in the background; a failure is recorded by creating $dir/.error.
      $cmd --num-threads $num_threads JOB=1:$sub_split $dir/log/$n/decode_den.JOB.log \
        nnet-latgen-faster$thread_string --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \
        --max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $srcdir/final.mdl  \
          $dir/dengraph/HCLG.fst "$feats_subset" "ark:|gzip -c >$dir/lat.$n.JOB.gz" || touch $dir/.error &
      this_pid=$!
    fi
    if [ ! -z "$prev_pid" ]; then  # Wait for the previous job; merge the previous set of lattices.
      wait $prev_pid
      [ -f $dir/.error ] && echo "$0: error generating denominator lattices" && exit 1;
      rm $dir/.merge_error 2>/dev/null
      echo Merging archives for data subset $prev_n
      # Concatenate the sub-split lattices into one gzipped archive per subset.
      for k in `seq $sub_split`; do
        gunzip -c $dir/lat.$prev_n.$k.gz || touch $dir/.merge_error;
      done | gzip -c > $dir/lat.$prev_n.gz || touch $dir/.merge_error;
      [ -f $dir/.merge_error ] && echo "$0: Merging lattices for subset $prev_n failed (or maybe some other error)" && exit 1;
      rm $dir/lat.$prev_n.*.gz
      # Marker file that lets a re-run skip this subset.
      touch $dir/.done.$prev_n
    fi
    prev_n=$n
    prev_pid=$this_pid
  done
fi


echo "$0: done generating denominator lattices."


================================================
FILE: egs/steps/nnet2/make_multisplice_configs.py
================================================
#!/usr/bin/env python
# Copyright 2014  Johns Hopkins University (Authors: Daniel Povey and Vijayaditya Peddinti).  Apache 2.0.

# Creates the nnet.config and hidden_*.config files used in train_pnorm_multisplice.sh
# Parses the splice string to generate relevant variables for get_egs.sh, get_lda.sh and nnet/hidden.config files

from __future__ import division
from __future__ import print_function
import re, argparse, sys, math, warnings

# returns the set of frame indices required to perform the convolution
# between sequences with frame indices in x and y
def get_convolution_index_set(x, y):
  """Return the sorted list of distinct sums a + b for a in x and b in y."""
  pairwise_sums = {a + b for a in x for b in y}
  return sorted(pairwise_sums)

def parse_splice_string(splice_string):
  """Parse a multisplice specification into per-layer contexts.

  splice_string is a sequence of "layer<index>/<offset>:<offset>:..." items,
  e.g. "layer0/-2:-1:0:1:2 layer2/-3:2".

  Returns a two-element list [contexts, variables]:
    contexts  -- dict mapping each layer index (int) to its ":"-joined offsets
    variables -- text assigning nnet_left_context, nnet_right_context,
                 first_left_context and first_right_context (as absolute
                 values); callers write it to the "vars" file.

  Raises Exception if an item cannot be parsed, if layer 0 has a
  non-contiguous context, or if layer 0 has a positive left context or a
  negative right context.
  """
  layerwise_splice_indexes = splice_string.split('layer')[1:]
  print(splice_string.split('layer'))
  contexts={}
  first_right_context = 0 # default value
  first_left_context = 0 # default value
  nnet_frame_indexes = [0] # frame indexes required by the network
                           # at the initial layer (will be used in
                           # determining the context for get_egs.sh)
  try:
    for cur_splice_indexes in layerwise_splice_indexes:
      layer_index, frame_indexes  = cur_splice_indexes.split("/")
      frame_indexes = [int(x) for x in frame_indexes.split(':')]
      layer_index = int(layer_index)
      assert(layer_index >= 0)
      if layer_index == 0:
        first_left_context = min(frame_indexes)
        first_right_context = max(frame_indexes)
        # Explicit checks rather than assert, so that the validation is not
        # silently dropped when Python runs with -O.
        if frame_indexes != list(range(first_left_context, first_right_context+1)):
          raise Exception('Currently the first splice component just accepts contiguous context.')
        if first_left_context > 0 or first_right_context < 0:
          raise Exception("""get_lda.sh script does not support postive left-context or negative right context.
          left context provided is %d and right context provided is %d.""" % (first_left_context, first_right_context))
      # convolve the current splice indices with the splice indices until last layer
      nnet_frame_indexes = get_convolution_index_set(frame_indexes, nnet_frame_indexes)
      cur_context = ":".join([str(x) for x in frame_indexes])
      contexts[layer_index] = cur_context
  except ValueError:
    # Report the string that was actually passed in; the previous code read a
    # module-level "params" object, which only exists when run as a script.
    raise Exception('Unknown format in splice_indexes variable: {0}'.format(splice_string))
  print(nnet_frame_indexes)
  max_left_context = min(nnet_frame_indexes)
  max_right_context = max(nnet_frame_indexes)
  return [contexts, ' nnet_left_context={0};\n nnet_right_context={1}\n first_left_context={2};\n first_right_context={3}\n'.format(abs(max_left_context), abs(max_right_context), abs(first_left_context), abs(first_right_context) )]

def create_config_files(output_dir, params):
  """Write the 'vars', 'nnet.config' and 'hidden_<i>.config' files.

  output_dir -- existing directory the files are written into.
  params     -- object with the attributes splice_indexes, num_hidden_layers,
                pnorm_input_dim, pnorm_output_dim, total_input_dim,
                ivector_dim, lda_mat, lda_dim, online_preconditioning_opts,
                initial_learning_rate, bias_stddev and num_targets
                (the parsed command-line arguments when run as a script).

  'vars' receives the context variables computed from the splice string.
  'nnet.config' describes the initial network (splice, fixed affine, one
  hidden layer, output layer).  One 'hidden_<i>.config' is written for each
  i in 1 .. num_hidden_layers-1.

  Raises Exception if a context is given for a layer index that is not below
  num_hidden_layers, or if no context is given for layer 0.
  """
  pnorm_p = 2
  pnorm_input_dim = params.pnorm_input_dim
  pnorm_output_dim = params.pnorm_output_dim
  contexts, context_variables = parse_splice_string(params.splice_indexes)
  with open("{0}/vars".format(output_dir), "w") as var_file:
    var_file.write(context_variables)

  # Explicit check instead of assert (asserts are removed under -O).  An empty
  # contexts dict is left for the layer-0 check below, which reports it with
  # a meaningful message instead of max() failing on an empty sequence.
  if contexts and max(contexts.keys()) >= params.num_hidden_layers:
    raise Exception("""Splice string provided is {2}.
    Number of hidden layers {0}, is less than the number of context specifications provided.
    Splicing is supported only until layer {1}.""".format(params.num_hidden_layers, params.num_hidden_layers - 1, params.splice_indexes))

  stddev=1.0/math.sqrt(pnorm_input_dim)
  if 0 not in contexts:
    raise Exception('A splice layer is expected to be the first layer. Provide a context for the first layer.')

  # When the p-norm input and output dimensions are equal a p-norm would not
  # reduce the dimension, so a rectified-linear component is used instead.
  nnet_config = ["SpliceComponent input-dim={0} context={1} const-component-dim={2}".format(params.total_input_dim, contexts[0], params.ivector_dim),
  "FixedAffineComponent matrix={0}".format(params.lda_mat),
  "AffineComponentPreconditionedOnline input-dim={0} output-dim={1} {2} learning-rate={3} param-stddev={4} bias-stddev={5}".format(params.lda_dim, pnorm_input_dim, params.online_preconditioning_opts, params.initial_learning_rate, stddev, params.bias_stddev),
  ("PnormComponent input-dim={0} output-dim={1} p={2}".format(pnorm_input_dim, pnorm_output_dim, pnorm_p) if pnorm_input_dim != pnorm_output_dim else "RectifiedLinearComponent dim={0}".format(pnorm_input_dim)),
  "NormalizeComponent dim={0}".format(pnorm_output_dim),
  "AffineComponentPreconditionedOnline input-dim={0} output-dim={1} {2} learning-rate={3} param-stddev=0 bias-stddev=0".format(pnorm_output_dim, params.num_targets, params.online_preconditioning_opts, params.initial_learning_rate),
  "SoftmaxComponent dim={0}".format(params.num_targets)]

  with open(("{0}/nnet.config").format(output_dir), "w") as nnet_config_file:
    nnet_config_file.write("\n".join(nnet_config))

  for i in range(1, params.num_hidden_layers): #just run till num_hidden_layers-1 since we do not add splice before the final affine transform
    lines=[]
    context_len = 1
    if i in contexts:
        # Adding the splice component as a context is provided
        lines.append("SpliceComponent input-dim=%d context=%s " % (pnorm_output_dim, contexts[i]))
        context_len = len(contexts[i].split(":"))
    # Add the hidden layer, which is a composition of an affine component, pnorm component and normalization component
    lines.append("AffineComponentPreconditionedOnline input-dim=%d output-dim=%d %s learning-rate=%f param-stddev=%f bias-stddev=%f"
        % ( pnorm_output_dim*context_len, pnorm_input_dim, params.online_preconditioning_opts, params.initial_learning_rate, stddev, params.bias_stddev))
    if pnorm_input_dim != pnorm_output_dim:
      lines.append("PnormComponent input-dim=%d output-dim=%d p=%d" % (pnorm_input_dim, pnorm_output_dim, pnorm_p))
    else:
      lines.append("RectifiedLinearComponent dim=%d" % (pnorm_input_dim))
      warnings.warn("Using the RectifiedLinearComponent, in place of the PnormComponent as pnorm_input_dim == pnorm_output_dim")
    lines.append("NormalizeComponent dim={0}".format(pnorm_output_dim))
    with open("{0}/hidden_{1}.config".format(output_dir, i), 'w') as out_file:
      out_file.write("\n".join(lines))


if __name__ == "__main__":
  # Echo the command line, parse the arguments, then dispatch on "mode".
  print(" ".join(sys.argv))
  parser = argparse.ArgumentParser()
  # (flag, type, help) of every optional argument, in registration order.
  option_specs = [
    ('--splice-indexes', str, 'string specifying the indexes for the splice layers throughout the network'),
    ('--total-input-dim', int, 'dimension of the input to the network'),
    ('--ivector-dim', int, 'dimension of the ivector portion of the neural network input'),
    ('--lda-mat', str, 'lda-matrix used after the first splice component'),
    ('--lda-dim', str, 'dimension of the lda output'),
    ('--pnorm-input-dim', int, 'dimension of input to pnorm layer'),
    ('--pnorm-output-dim', int, 'dimension of output of pnorm layer'),
    ('--online-preconditioning-opts', str, 'extra options for the AffineComponentPreconditionedOnline component'),
    ('--initial-learning-rate', float, ''),
    ('--num-targets', int, '#targets for the neural network '),
    ('--num-hidden-layers', int, '#hidden layers in the neural network '),
    ('--bias-stddev', float, 'standard deviation of r.v. used for bias component initialization'),
  ]
  for flag, arg_type, help_text in option_specs:
    parser.add_argument(flag, type=arg_type, help=help_text)
  parser.add_argument("mode", type=str, help="contexts|configs")
  parser.add_argument("output_dir", type=str, help="output directory to store the files")
  # "params" stays a module-level name: other code in this file may read it.
  params = parser.parse_args()

  print(params)
  if params.mode == "configs":
    # Write vars, nnet.config and the hidden_*.config files.
    create_config_files(params.output_dir, params)
  elif params.mode == "contexts":
    # Only the context variables are needed: write them to <output_dir>/vars.
    [context, context_variables] = parse_splice_string(params.splice_indexes)
    with open("{0}/vars".format(params.output_dir), "w") as var_file:
      var_file.write(context_variables)
  else:
    raise Exception("mode has to be in the set {contexts, configs}")


================================================
FILE: egs/steps/nnet2/relabel_egs.sh
================================================
#!/usr/bin/env bash

# Copyright 2014  Vimal Manohar. Apache 2.0.
# This script, which will generally be called during the neural-net training
# relabels existing examples with better labels obtained by realigning the data
# with the current nnet model

# Begin configuration section
cmd=run.pl
stage=0
extra_egs=        # Names of additional egs files that need to relabelled
                  # other than egs.*.*.ark, combine.egs, train_diagnostic.egs,
                  # valid_diagnostic.egs
iter=final
echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

if [ $# != 3 ]; then
  echo "Usage: steps/nnet2/relabel_egs.sh [opts] <ali-dir> <egs-in-dir> <egs-out-dir>"
  echo "  e.g: steps/nnet2/relabel_egs.sh exp/tri6_nnet/ali_1.5 exp/tri6_nnet/egs exp/tri6_nnet/egs_1.5"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl;utils/queue.pl <queue opts>) # how to run jobs."

  exit 1;
fi

alidir=$1
egs_in_dir=$2
dir=$3

model=$alidir/$iter.mdl

# Check some files.

for f in $alidir/ali.1.gz $model $egs_in_dir/egs.1.0.ark $egs_in_dir/combine.egs \
  $egs_in_dir/valid_diagnostic.egs $egs_in_dir/train_diagnostic.egs \
  $egs_in_dir/num_jobs_nnet $egs_in_dir/iters_per_epoch $egs_in_dir/samples_per_iter; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

num_jobs_nnet=`cat $egs_in_dir/num_jobs_nnet`
iters_per_epoch=`cat $egs_in_dir/iters_per_epoch`
samples_per_iter_real=`cat $egs_in_dir/samples_per_iter`
num_jobs_align=`cat $alidir/num_jobs`

mkdir -p $dir/log

echo $num_jobs_nnet > $dir/num_jobs_nnet
echo $iters_per_epoch > $dir/iters_per_epoch
echo $samples_per_iter_real > $dir/samples_per_iter

alignments=$(for n in $(seq $num_jobs_align); do echo -n "$alidir/ali.$n.gz "; done)

# Stage 0: relabel the training archives egs.<job>.<iter>.ark.
# One nnet-relabel-egs call per job handles all of that job's per-iteration
# archives, so the input/output lists are built once (while x is 1) with the
# literal placeholder JOB in place of the job index.
if [ $stage -le 0 ]; then
  egs_in=
  egs_out=
  for x in `seq 1 $num_jobs_nnet`; do
    for y in `seq 0 $[$iters_per_epoch-1]`; do
      utils/create_data_link.pl $dir/egs.$x.$y.ark
      if [ $x -eq 1 ]; then
        egs_in="$egs_in ark:$egs_in_dir/egs.JOB.$y.ark "
        egs_out="$egs_out ark:$dir/egs.JOB.$y.ark "
      fi
    done
  done

  # The first argument streams all alignment archives through ali-to-pdf
  # using $model; it is followed by the input archives, then the outputs.
  $cmd JOB=1:$num_jobs_nnet $dir/log/relabel_egs.JOB.log \
    nnet-relabel-egs "ark:gunzip -c $alignments | ali-to-pdf $model ark:- ark:- |" \
    $egs_in $egs_out || exit 1
fi

# Stage 1: relabel the combine/diagnostic egs files, plus any files named in
# --extra-egs, in a single nnet-relabel-egs call.
if [ $stage -le 1 ]; then
  egs_in=
  egs_out=
  for x in combine.egs valid_diagnostic.egs train_diagnostic.egs $extra_egs; do
    utils/create_data_link.pl $dir/$x
    egs_in="$egs_in ark:$egs_in_dir/$x"
    egs_out="$egs_out ark:$dir/$x"
  done

  $cmd $dir/log/relabel_egs_extra.log \
    nnet-relabel-egs "ark:gunzip -c $alignments | ali-to-pdf $model ark:- ark:- |" \
    $egs_in $egs_out || exit 1
fi

echo "$0: Finished relabeling training examples"


================================================
FILE: egs/steps/nnet2/relabel_egs2.sh
================================================
#!/usr/bin/env bash

# Copyright 2014  Vimal Manohar.
#           2014  Johns Hopkins University (author: Daniel Povey)
# Apache 2.0.
#
# This script, which will generally be called during the neural-net training 
# relabels existing examples with better labels obtained by realigning the data
# with the current nnet model.
# This script is as relabel_egs.sh, but is adapted to work with the newer
# egs format that is written by get_egs2.sh

# Begin configuration section
cmd=run.pl
stage=0
extra_egs=        # Names of additional egs files that need to relabelled 
                  # other than egs.*.*.ark, combine.egs, train_diagnostic.egs,
                  # valid_diagnostic.egs
iter=final
parallel_opts=
echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# Exactly three positional arguments are required; otherwise print usage.
# The message names this script (relabel_egs2.sh), not relabel_egs.sh, which
# handles the older egs directory format.
if [ $# != 3 ]; then
  echo "Usage: steps/nnet2/relabel_egs2.sh [opts] <ali-dir> <egs-in-dir> <egs-out-dir>"
  echo "  e.g: steps/nnet2/relabel_egs2.sh exp/tri6_nnet/ali_1.5 exp/tri6_nnet/egs exp/tri6_nnet/egs_1.5"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl;utils/queue.pl <queue opts>) # how to run jobs."

  exit 1;
fi

alidir=$1
egs_in_dir=$2
dir=$3

model=$alidir/$iter.mdl

# Check some files.

[ -f $egs_in_dir/iters_per_epoch ] && \
  echo "$0: this script does not work with the old egs directory format" && exit 1;

for f in $alidir/ali.1.gz $model $egs_in_dir/egs.1.ark $egs_in_dir/combine.egs \
  $egs_in_dir/valid_diagnostic.egs $egs_in_dir/train_diagnostic.egs \
  $egs_in_dir/info/num_archives; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

num_archives=$(cat $egs_in_dir/info/num_archives) || exit 1;
num_jobs_align=$(cat $alidir/num_jobs) || exit 1;

mkdir -p $dir/log

mkdir -p $dir/info
cp -r $egs_in_dir/info/*  $dir/info

alignments=$(for n in $(seq $num_jobs_align); do echo $alidir/ali.$n.gz; done)

# Stage 0: relabel the training archives egs.<n>.ark, one job per archive.
if [ $stage -le 0 ]; then
  for x in $(seq $num_archives); do
    # if $dir/storage exists, make the soft links that we'll
    # use to distribute the data across machines
    utils/create_data_link.pl $dir/egs.$x.ark
  done

  # The first argument streams all alignment archives through ali-to-pdf
  # using $model; it is followed by the input archive and the output archive.
  $cmd $parallel_opts JOB=1:$num_archives $dir/log/relabel_egs.JOB.log \
    nnet-relabel-egs "ark:gunzip -c $alignments | ali-to-pdf $model ark:- ark:- |" \
     ark:$egs_in_dir/egs.JOB.ark ark:$dir/egs.JOB.ark || exit 1
fi

# Stage 1: relabel the combine/diagnostic egs files, plus any files named in
# --extra-egs, in a single nnet-relabel-egs call.
if [ $stage -le 1 ]; then
  egs_in=
  egs_out=
  for x in combine.egs valid_diagnostic.egs train_diagnostic.egs $extra_egs; do
    utils/create_data_link.pl $dir/$x
    egs_in="$egs_in ark:$egs_in_dir/$x"
    egs_out="$egs_out ark:$dir/$x"
  done

  $cmd $dir/log/relabel_egs_extra.log \
    nnet-relabel-egs "ark:gunzip -c $alignments | ali-to-pdf $model ark:- ark:- |" \
    $egs_in $egs_out || exit 1
fi

echo "$0: Finished relabeling training examples"


================================================
FILE: egs/steps/nnet2/remove_egs.sh
================================================
#!/usr/bin/env bash

# Copyright 2014  Johns Hopkins University (Author: Daniel Povey).  
# Apache 2.0.

# This script removes the examples in an egs/ directory, e.g.
# steps/nnet2/remove_egs.sh exp/nnet4b/egs/
# We give it its own script because we need to be careful about
# things that are soft links to something in storage/ (i.e. remove the
# data that's linked to as well as the soft link), and we want to not
# delete the examples if someone has done "touch $dir/egs/.nodelete".


# Exactly one argument (the egs directory) is required.
if [ $# != 1 ]; then
  echo "Usage: $0 <egs-dir>"
  echo "e.g.: $0 data/nnet4b/egs/"
  echo "e.g.: $0 data/nnet4b_mpe/degs/"
  echo "This script is usually equivalent to 'rm <egs-dir>/egs.* <egs-dir>/degs.*' but it follows"
  echo "soft links to <egs-dir>/storage/; and it avoids deleting anything in the directory if"
  echo "someone did 'touch <egs-dir>/.nodelete'"
  exit 1;
fi

egs=$1

if [ ! -d "$egs" ]; then
  echo "$0: expected directory $egs to exist"
  exit 1;
fi

# A .nodelete marker protects the directory; this is not treated as an error.
if [ -f "$egs/.nodelete" ]; then
  echo "$0: not deleting egs in $egs since $egs/.nodelete exists"
  exit 0;
fi


# Remove every egs/degs/cegs archive.  For a soft link, the file it points to
# is removed first, then the link itself.  A pattern that matches nothing is
# left as a literal string by the shell; it is not a link, and the error from
# the final rm is discarded.
for f in "$egs"/egs.*.ark "$egs"/degs.*.ark "$egs"/cegs.*.ark; do
  if [ -L "$f" ]; then
    target=$(readlink "$f")
    # A relative link target is relative to the directory holding the link;
    # an absolute target must be used as it is.
    case "$target" in
      /*) ;;
      *) target=$(dirname "$f")/$target ;;
    esac
    rm "$target"  # this will print a warning if it fails.
  fi
  rm "$f" 2>/dev/null
done


echo "$0: Finished deleting examples in $egs"


================================================
FILE: egs/steps/nnet2/retrain_fast.sh
================================================
#!/usr/bin/env bash

# Copyright 2014  Johns Hopkins University (Author: Daniel Povey).
# Apache 2.0.

# retrain_fast.sh is a neural net training script that's intended to train
# a system on top of an already-trained neural network, whose activations have
# been dumped to disk.  All it really is is training a neural network with
# no hidden layers, so it's a simplified version of some of the other scripts.
# There is no get_lda stage, as we don't support any pre-scaling of the inputs.
# It uses the AffineComponentPreconditionedOnline components, which is why
# we name it _fast.

# Begin configuration section.
cmd=run.pl
num_epochs=4       # Number of epochs during which we reduce
                   # the learning rate; number of iterations is worked out from this.
num_epochs_extra=1 # Number of epochs after we stop reducing
                   # the learning rate.
num_iters_final=10 # Maximum number of final iterations to give to the
                   # optimization over the validation set (maximum)
initial_learning_rate=0.04
final_learning_rate=0.004

minibatch_size=128 # by default use a smallish minibatch size for neural net
                   # training; this controls instability which would otherwise
                   # be a problem with multi-threaded update.
samples_per_iter=200000 # each iteration of training, see this many samples
                        # per job.  This option is passed to get_egs.sh
num_jobs_nnet=16   # Number of neural net jobs to run in parallel.  This option
                   # is passed to get_egs.sh.
get_egs_stage=0

shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
                # on each iter.  You could set it to 0 or to a large value for complete
                # randomization, but this would both consume memory and cause spikes in
                # disk I/O.  Smaller is easier on disk and memory but less random.  It's
                # not a huge deal though, as samples are anyway randomized right at the start.
                # (the point of this is to get data in different minibatches on different iterations,
                # since in the preconditioning method, 2 samples in the same minibatch can
                # affect each others' gradients.

stage=-5

io_opts="--max-jobs-run 5" # for jobs with a lot of I/O, limits the number running at one time.

alpha=4.0   # relates to preconditioning.
update_period=4 # relates to online preconditioning: says how often we update the subspace.
num_samples_history=2000 # relates to online preconditioning
max_change_per_sample=0.075
precondition_rank_in=20  # relates to online preconditioning
precondition_rank_out=80 # relates to online preconditioning

# this relates to perturbed training.
min_target_objf_change=0.1
target_multiplier=0 #  Set this to e.g. 1.0 to enable perturbed training.

mix_up=0 # Number of components to mix up to (should be > #tree leaves, if
        # specified.)
num_threads=16
parallel_opts="--num-threads 16 --mem 1G" # by default we use 16 threads; this lets the queue know.
  # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.
combine_num_threads=8
combine_parallel_opts="--num-threads 8"  # queue options for the "combine" stage.
cleanup=true
egs_dir=
egs_opts=
prior_subset_size=10000 # 10k samples per job, for computing priors.  Should be
                        # more than enough.
# End configuration section.


echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# Exactly four positional arguments are required; otherwise print usage.
# The defaults shown after "|" mirror the values set in the configuration
# section at the top of this script.
# NOTE(review): --num-utts-subset and --num-frames-diagnostic are listed
# below but have no corresponding variable in the configuration section;
# confirm whether they can be given directly or must go through --egs-opts.
if [ $# != 4 ]; then
  echo "Usage: $0 [opts] <data> <lang> <ali-dir> <exp-dir>"
  echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --num-epochs <#epochs|4 >                        # Number of epochs of main training"
  echo "                                                   # while reducing learning rate (determines #iterations, together"
  echo "                                                   # with --samples-per-iter and --num-jobs-nnet)"
  echo "  --num-epochs-extra <#epochs-extra|1>             # Number of extra epochs of training"
  echo "                                                   # after learning rate fully reduced"
  echo "  --initial-learning-rate <initial-learning-rate|0.04> # Learning rate at start of training, e.g. 0.02 for small"
  echo "                                                       # data, 0.01 for large data"
  echo "  --final-learning-rate  <final-learning-rate|0.004>   # Learning rate at end of training, e.g. 0.004 for small"
  echo "                                                   # data, 0.001 for large data"
  echo "  --mix-up <#pseudo-gaussians|0>                   # Can be used to have multiple targets in final output layer,"
  echo "                                                   # per context-dependent state.  Try a number several times #states."
  echo "  --num-jobs-nnet <num-jobs|16>                    # Number of parallel jobs to use for main neural net"
  echo "                                                   # training (will affect results as well as speed; try 8, 16)"
  echo "                                                   # Note: if you increase this, you may want to also increase"
  echo "                                                   # the learning rate."
  echo "  --num-threads <num-threads|16>                   # Number of parallel threads per job (will affect results"
  echo "                                                   # as well as speed; may interact with batch size; if you increase"
  echo "                                                   # this, you may want to decrease the batch size."
  echo "  --parallel-opts <opts|\"--num-threads 16 --mem 1G\">      # extra options to pass to e.g. queue.pl for processes that"
  echo "                                                   # use multiple threads... "
  echo "  --io-opts <opts|\"--max-jobs-run 5\">                       # Options given to e.g. queue.pl for jobs that do a lot of I/O."
  echo "  --minibatch-size <minibatch-size|128>            # Size of minibatch to process (note: product with --num-threads"
  echo "                                                   # should not get too large, e.g. >2k)."
  echo "  --samples-per-iter <#samples|200000>             # Number of samples of data to process per iteration, per"
  echo "                                                   # process."
  echo "  --num-iters-final <#iters|10>                    # Number of final iterations to give to nnet-combine-fast to "
  echo "                                                   # interpolate parameters (the weights are learned with a validation set)"
  echo "  --num-utts-subset <#utts|300>                    # Number of utterances in subsets used for validation and diagnostics"
  echo "                                                   # (the validation subset is held out from training)"
  echo "  --num-frames-diagnostic <#frames|4000>           # Number of frames used in computing (train,valid) diagnostics"
  echo "  --stage <stage|-5>                               # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."


  exit 1;
fi

# Positional arguments: data directory, lang directory, alignment directory
# and the output (experiment) directory.
data=$1
lang=$2
alidir=$3
dir=$4

# Check some files.
# Abort before doing any work if a required input is missing.
for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done


# Set some variables.
# num_leaves is the "num-pdfs" value printed by tree-info; it is used further
# down as the output dimension of the network.
# NOTE(review): unless pipefail was enabled earlier in the script, the
# "|| exit 1" only sees the status of the last pipeline stage (awk), so a
# tree-info failure is in practice caught by the emptiness check that follows.
num_leaves=`tree-info $alidir/tree 2>/dev/null | grep num-pdfs | awk '{print $2}'` || exit 1
[ -z $num_leaves ] && echo "\$num_leaves is unset" && exit 1
[ "$num_leaves" -eq "0" ] && echo "\$num_leaves is 0" && exit 1

nj=`cat $alidir/num_jobs` || exit 1;  # number of jobs in alignment dir...
# in this dir we'll have just one job.
sdata=$data/split$nj
utils/split_data.sh $data $nj

# Create the output directory and record the job count and tree there.
mkdir -p $dir/log
echo $nj > $dir/num_jobs
cp $alidir/tree $dir


# The lang directory and the alignments must agree on the phone set; keep a
# copy of phones.txt alongside the model.
utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

# Dump training examples, unless we are resuming from a later stage or
# $egs_dir is already set (in which case the existing examples are reused).
# Left/right context are zero: each example is a single input frame.
if [ $stage -le -3 ] && [ -z "$egs_dir" ]; then
  echo "$0: calling get_egs.sh"
  steps/nnet2/get_egs.sh --feat-type raw --cmvn-opts "--norm-means=false --norm-vars=false" \
      --samples-per-iter $samples_per_iter --left-context 0 --right-context 0 \
      --num-jobs-nnet $num_jobs_nnet --stage $get_egs_stage \
      --cmd "$cmd" $egs_opts --io-opts "$io_opts" \
      $data $lang $alidir $dir || exit 1;
fi

# Default location of the examples when none was supplied.
[ -z $egs_dir ] && egs_dir=$dir/egs

# The egs directory records how it was dumped.  Its num_jobs_nnet always
# overrides the requested value; a warning is printed when the two differ.
iters_per_epoch=`cat $egs_dir/iters_per_epoch`  || exit 1;
! [ $num_jobs_nnet -eq `cat $egs_dir/num_jobs_nnet` ] && \
  echo "$0: Warning: using --num-jobs-nnet=`cat $egs_dir/num_jobs_nnet` from $egs_dir"
num_jobs_nnet=`cat $egs_dir/num_jobs_nnet` || exit 1;


# Stage -2: write the network config (one affine layer followed by a softmax
# over the tree leaves) and create the initial model 0.mdl from it.
if [ $stage -le -2 ]; then
  echo "$0: initializing neural net";

  # Input dimension is taken from the features themselves.
  feat_dim=$(feat-to-dim scp:$data/feats.scp -) || exit 1;

  # Options shared by the online-preconditioned affine component.
  online_preconditioning_opts="alpha=$alpha num-samples-history=$num_samples_history update-period=$update_period rank-in=$precondition_rank_in rank-out=$precondition_rank_out max-change-per-sample=$max_change_per_sample"

  # param-stddev=0 and bias-stddev=0: presumably the layer starts from all-zero
  # parameters -- TODO confirm against nnet-init.
  cat >$dir/nnet.config <<EOF
AffineComponentPreconditionedOnline input-dim=$feat_dim output-dim=$num_leaves $online_preconditioning_opts learning-rate=$initial_learning_rate param-stddev=0 bias-stddev=0
SoftmaxComponent dim=$num_leaves
EOF
  $cmd $dir/log/nnet_init.log \
    nnet-am-init $alidir/tree $lang/topo "nnet-init $dir/nnet.config -|" \
    $dir/0.mdl || exit 1;
fi

# Stage -1: estimate transition probabilities (and priors) from all alignment
# archives; 0.mdl is read and then overwritten in place.
if [ $stage -le -1 ]; then
  echo "Training transition probabilities and setting priors"
  $cmd $dir/log/train_trans.log \
    nnet-train-transitions $dir/0.mdl "ark:gunzip -c $alidir/ali.*.gz|" $dir/0.mdl \
    || exit 1;
fi

# Iteration schedule: the learning rate is reduced over the first
# num_iters_reduce iterations and then held constant for num_iters_extra more.
num_iters_reduce=$(( $num_epochs * $iters_per_epoch ))
num_iters_extra=$(( $num_epochs_extra * $iters_per_epoch ))
num_iters=$(( $num_iters_reduce + $num_iters_extra ))

# Report the schedule, one line per part of the message.
printf '%s\n' \
  "$0: Will train for $num_epochs + $num_epochs_extra epochs, equalling " \
  "$0: $num_iters_reduce + $num_iters_extra = $num_iters iterations, " \
  "$0: (while reducing learning rate) + (with constant learning rate)."


function set_target_objf_change {
  # nothing to do if $target_multiplier not set.
  [ "$target_multiplier" == "0" -o "$target_multiplier" == "0.0" ] && return;
  [ $x -le $finish_add_layers_iter ] && return;
  wait=2  # the compute_prob_{train,valid} from 2 iterations ago should
          # most likey be done even though we backgrounded them.
  [ $[$x-$wait] -le 0 ] && return;
  while true; do
    # Note: awk 'some-expression' is the same as: awk '{if(some-expression) print;}'
    train_prob=$(awk '(NF == 1)' < $dir/log/compute_prob_train.$[$x-$wait].log)
    valid_prob=$(awk '(NF == 1)' < $dir/log/compute_prob_valid.$[$x-$wait].log)
    if [ -z "$train_prob" ] || [ -z "$valid_prob" ]; then
      echo "$0: waiting until $dir/log/compute_prob_{train,valid}.$[$x-$wait].log are done"
      sleep 60
    else
      target_objf_change=$(perl -e '($train,$valid,$min_change,$multiplier)=@ARGV; if (!($train < 0.0) || !($valid < 0.0)) { print "0\n"; print STDERR "Error: invalid train or valid prob: $train_prob, $valid_prob\n"; exit(0); } else { print STDERR "train,valid=$train,$valid\n"; $proposed_target = $multiplier * ($train-$valid); if ($proposed_target < $min_change) { print "0"; } else { print $proposed_target; }}' -- "$train_prob" "$valid_prob" "$min_target_objf_change" "$target_multiplier")
      echo "On iter $x, (train,valid) probs from iter $[$x-$wait] were ($train_prob,$valid_prob), and setting target-objf-change to $target_objf_change."
      return;
    fi
  done
}

# Mixing-up, when requested, is done half-way through training.
mix_up_iter=$(( $num_iters / 2 ))

# Pick the trainer binary suffix and its threading option.  Start from the
# multi-threaded settings and switch to the "-simple" trainer when exactly one
# thread was requested (that is the variant that lets us use the GPU code).
parallel_suffix="-parallel"
parallel_train_opts="--num-threads=$num_threads"
if [ $num_threads -eq 1 ]; then
  parallel_suffix="-simple"
  parallel_train_opts=
  cuda-compiled || {
    echo "$0: WARNING: you are running with one thread but you have not compiled"
    echo "   for CUDA.  You may be running a setup optimized for GPUs.  If you have"
    echo "   GPUs and have nvcc installed, go to src/ and do ./configure; make"
  }
fi

# Initial state for the training loop: perturbed-training target and the
# iteration counter.
target_objf_change=0
x=0

# Main training loop.  Iteration x reads $dir/$x.mdl and writes
# $dir/$[x+1].mdl.  Iterations below $stage are skipped (only the counter
# advances), which is how a partially completed run is resumed.
while [ $x -lt $num_iters ]; do
  if [ $x -ge 0 ] && [ $stage -le $x ]; then
    # Set off jobs doing some diagnostics, in the background.
    $cmd $dir/log/compute_prob_valid.$x.log \
      nnet-compute-prob $dir/$x.mdl ark:$egs_dir/valid_diagnostic.egs &
    $cmd $dir/log/compute_prob_train.$x.log \
      nnet-compute-prob $dir/$x.mdl ark:$egs_dir/train_diagnostic.egs &
    # Progress between consecutive models is skipped right after a mix-up
    # (detected by the presence of the previous iteration's mix_up log).
    if [ $x -gt 0 ] && [ ! -f $dir/log/mix_up.$[$x-1].log ]; then
      $cmd $dir/log/progress.$x.log \
        nnet-show-progress --use-gpu=no $dir/$[$x-1].mdl $dir/$x.mdl \
          ark:$egs_dir/train_diagnostic.egs '&&' \
        nnet-am-info $dir/$x.mdl &
    fi

    echo "Training neural net (pass $x)"

    if [ $x -eq 0 ]; then
      # on iteration zero, use a smaller minibatch size and just one job: the
      # model-averaging doesn't seem to be helpful when the model is changing
      # too fast (i.e. it worsens the objective function), and the smaller
      # minibatch size will help to keep the update stable.
      # (All jobs are still run below; with do_average=false the single best
      # job's model is selected instead of averaging.)
      this_minibatch_size=$[$minibatch_size/2];
      do_average=false
    else
      this_minibatch_size=$minibatch_size
      do_average=true
    fi

    set_target_objf_change;  # only has effect if target_multiplier != 0
    # NOTE(review): perturb_suffix/perturb_opts are only ever set here, never
    # cleared, so once set they stay in effect on later iterations even if
    # target_objf_change goes back to 0 -- confirm this is intended.
    if [ "$target_objf_change" != "0" ]; then
      [ ! -f $dir/within_covar.spmat ] && \
        echo "$0: expected $dir/within_covar.spmat to exist." && exit 1;
      perturb_suffix="-perturbed"
      perturb_opts="--target-objf-change=$target_objf_change --within-covar=$dir/within_covar.spmat"
    fi

    # One training job per JOB index; each job shuffles its own egs archive for
    # this position within the epoch and writes $dir/$[x+1].JOB.mdl.
    $cmd $parallel_opts JOB=1:$num_jobs_nnet $dir/log/train.$x.JOB.log \
      nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x \
      ark:$egs_dir/egs.JOB.$[$x%$iters_per_epoch].ark ark:- \| \
       nnet-train$parallel_suffix$perturb_suffix $parallel_train_opts $perturb_opts \
        --minibatch-size=$this_minibatch_size --srand=$x $dir/$x.mdl \
        ark:- $dir/$[$x+1].JOB.mdl \
      || exit 1;

    # Per-job output models of this iteration.
    nnets_list=
    for n in `seq 1 $num_jobs_nnet`; do
      nnets_list="$nnets_list $dir/$[$x+1].$n.mdl"
    done

    # Learning rate for the next model: geometric interpolation from the
    # initial to the final rate over num_iters_reduce iterations, then constant
    # at the final rate.
    learning_rate=`perl -e '($x,$n,$i,$f)=@ARGV; print ($x >= $n ? $f : $i*exp($x*log($f/$i)/$n));' $[$x+1] $num_iters_reduce $initial_learning_rate $final_learning_rate`;

    if $do_average; then
      # Average the per-job models and stamp the new learning rate.
      $cmd $dir/log/average.$x.log \
        nnet-am-average $nnets_list - \| \
        nnet-am-copy --learning-rate=$learning_rate - $dir/$[$x+1].mdl || exit 1;
    else
      # Pick the job whose training log reports the highest (last)
      # log-prob-per-frame; defaults to job 1 if none reports a value.
      n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) {
          $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn";
          undef $logprob; while (<F>) { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } }
          close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob;
          $best_n=$n; } } print "$best_n\n"; ' $num_jobs_nnet $dir/log/train.$x.%d.log) || exit 1;
      [ -z "$n" ] && echo "Error getting best model" && exit 1;
      $cmd $dir/log/select.$x.log \
        nnet-am-copy --learning-rate=$learning_rate $dir/$[$x+1].$n.mdl $dir/$[$x+1].mdl || exit 1;
    fi

    if [ "$mix_up" -gt 0 ] && [ $x -eq $mix_up_iter ]; then
      # mix up.
      # The new model is mixed up in place.
      echo Mixing up from $num_leaves to $mix_up components
      $cmd $dir/log/mix_up.$x.log \
        nnet-am-mixup --min-count=10 --num-mixtures=$mix_up \
        $dir/$[$x+1].mdl $dir/$[$x+1].mdl || exit 1;
    fi
    # The per-job models are no longer needed once the combined one exists.
    rm $nnets_list
    [ ! -f $dir/$[$x+1].mdl ] && exit 1;
    # With cleanup enabled, drop model x-1 unless its index is a multiple of
    # 100 or it is late enough to be needed for the final combination.
    if [ -f $dir/$[$x-1].mdl ] && $cleanup && \
       [ $[($x-1)%100] -ne 0  ] && [ $[$x-1] -le $[$num_iters-$num_iters_final] ]; then
      rm $dir/$[$x-1].mdl
    fi
  fi
  x=$[$x+1]
done

# Now do combination.
# At the end, final.mdl will be a combination of the last e.g. 10 models.
nnets_list=()
# NOTE(review): the message below announces a new value for num_iters_final,
# but nothing assigns it, so $start is still computed from the original value.
# Confirm whether an assignment was intended before changing this.
if [ $num_iters_final -gt $num_iters_extra ]; then
  echo "Setting num_iters_final=$num_iters_extra"
fi
# Collect the last num_iters_final models, keeping only those produced after
# the mix-up iteration.  Skipped entries leave gaps in the array; the later
# "${nnets_list[@]}" expansion only yields the elements that were set.
start=$[$num_iters-$num_iters_final+1]
for x in `seq $start $num_iters`; do
  idx=$[$x-$start]
  if [ $x -gt $mix_up_iter ]; then
    nnets_list[$idx]=$dir/$x.mdl # "nnet-am-copy --remove-dropout=true $dir/$x.mdl - |"
  fi
done

# Combine the collected models into final.mdl and start diagnostics on it.
if [ $stage -le $num_iters ]; then
  echo "Doing final combination to produce final.mdl"
  # Below, use --use-gpu=no to disable nnet-combine-fast from using a GPU, as
  # if there are many models it can give out-of-memory error; set num-threads to 8
  # to speed it up (this isn't ideal...)
  # num_egs is the last field of the last line that nnet-copy-egs logs for
  # combine.egs; the minibatch size is num_egs split evenly across the combine
  # threads (rounded up) and capped at 512.
  num_egs=`nnet-copy-egs ark:$egs_dir/combine.egs ark:/dev/null 2>&1 | tail -n 1 | awk '{print $NF}'`
  mb=$[($num_egs+$combine_num_threads-1)/$combine_num_threads]
  [ $mb -gt 512 ] && mb=512
  # Setting --initial-model to a large value makes it initialize the combination
  # with the average of all the models.  It's important not to start with a
  # single model, or, due to the invariance to scaling that these nonlinearities
  # give us, we get zero diagonal entries in the fisher matrix that
  # nnet-combine-fast uses for scaling, which after flooring and inversion, has
  # the effect that the initial model chosen gets much higher learning rates
  # than the others.  This prevents the optimization from working well.
  $cmd $combine_parallel_opts $dir/log/combine.log \
    nnet-combine-fast --initial-model=100000 --num-lbfgs-iters=40 --use-gpu=no \
      --num-threads=$combine_num_threads \
      --verbose=3 --minibatch-size=$mb "${nnets_list[@]}" ark:$egs_dir/combine.egs \
      $dir/final.mdl || exit 1;

  # Compute the probability of the final, combined model with
  # the same subset we used for the previous compute_probs, as the
  # different subsets will lead to different probs.
  # Both jobs run in the background and are not waited for here.
  $cmd $dir/log/compute_prob_valid.final.log \
    nnet-compute-prob $dir/final.mdl ark:$egs_dir/valid_diagnostic.egs &
  $cmd $dir/log/compute_prob_train.final.log \
    nnet-compute-prob $dir/final.mdl ark:$egs_dir/train_diagnostic.egs &
fi

# Re-estimate the priors of final.mdl from its own average posteriors.
if [ $stage -le $[$num_iters+1] ]; then
  echo "Getting average posterior for purposes of adjusting the priors."
  # Note: this just uses CPUs, using a smallish subset of data.
  # Remove stale per-job vectors from an earlier run (none may exist).
  rm $dir/post.*.vec 2>/dev/null
  # Each job takes a subset of prior_subset_size examples from its first egs
  # archive, runs them through the raw network and sums the outputs into
  # post.JOB.vec.
  $cmd JOB=1:$num_jobs_nnet $dir/log/get_post.JOB.log \
    nnet-subset-egs --n=$prior_subset_size ark:$egs_dir/egs.JOB.0.ark ark:- \| \
    nnet-compute-from-egs "nnet-to-raw-nnet $dir/final.mdl -|" ark:- ark:- \| \
    matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.JOB.vec || exit 1;

  sleep 3;  # make sure there is time for $dir/post.*.vec to appear.

  # Sum the per-job vectors into a single post.vec, then discard them.
  $cmd $dir/log/vector_sum.log \
   vector-sum $dir/post.*.vec $dir/post.vec || exit 1;

  rm $dir/post.*.vec;

  echo "Re-adjusting priors based on computed posteriors"
  # final.mdl is read and overwritten in place.
  $cmd $dir/log/adjust_priors.log \
    nnet-adjust-priors $dir/final.mdl $dir/post.vec $dir/final.mdl || exit 1;
fi


sleep 2

echo Done

# With cleanup enabled, remove the examples, but only when they live in this
# experiment's own egs directory; an egs directory located elsewhere is kept.
if $cleanup; then
  echo Cleaning up data
  if [ $egs_dir == "$dir/egs" ]; then
    steps/nnet2/remove_egs.sh $dir/egs
  fi
fi


================================================
FILE: egs/steps/nnet2/retrain_simple2.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2014  Johns Hopkins University (Author: Daniel Povey).
#                2013  Xiaohui Zhang
#                2013  Guoguo Chen
#                2014  Vimal Manohar
# Apache 2.0.


# retrain_simple2.sh is a script for training a single-layer (softmax-only)
# neural network on top of activations dumped from an existing network; we'll
# later combine the networks to a single network.

# It differs from train_pnorm_simple2.sh in the same way that retrain_fast.sh
# differs from train_pnorm_fast.sh.


# Begin configuration section.
# Defaults for the options of this script.  Every variable here is assigned
# before parse_options.sh is sourced below.
cmd=run.pl
num_epochs=5      # Number of epochs of training;
initial_learning_rate=0.04
final_learning_rate=0.004
minibatch_size=128 # by default use a smallish minibatch size for neural net
                   # training; this controls instability which would otherwise
                   # be a problem with multi-threaded update.
samples_per_iter=400000 # each iteration of training, see this many samples
                        # per job.  This option is passed to get_egs.sh
num_jobs_nnet=4    # Number of neural net jobs to run in parallel.  This option
                   # is passed to get_egs.sh.
prior_subset_size=10000 # 10k samples per job, for computing priors.  Should be
                        # more than enough.
num_jobs_compute_prior=10 # these are single-threaded, run on CPU.
get_egs_stage=0    # passed as --stage to get_egs2.sh.


max_models_combine=20 # The "max_models_combine" is the maximum number of models we give
  # to the final 'combine' stage, but these models will themselves be averages of
  # iteration-number ranges.

shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
                # on each iter.  You could set it to 0 or to a large value for complete
                # randomization, but this would both consume memory and cause spikes in
                # disk I/O.  Smaller is easier on disk and memory but less random.  It's
                # not a huge deal though, as samples are anyway randomized right at the start.
                # (the point of this is to get data in different minibatches on different iterations,
                # since in the preconditioning method, 2 samples in the same minibatch can
                # affect each others' gradients.)

stage=-4  # stages/iterations below this value are skipped (used to resume a run).


alpha=4.0 # relates to online preconditioning.
update_period=4 # relates to online preconditioning: says how often we update the subspace.
num_samples_history=2000 # relates to online preconditioning
precondition_rank_in=20  # relates to online preconditioning
precondition_rank_out=80 # relates to online preconditioning
max_change_per_sample=0.075

mix_up=0 # Number of components to mix up to (should be > #tree leaves, if
        # specified.)
num_threads=16
parallel_opts="--num-threads 16 --mem 1G"
  # by default we use 16 threads; this lets the queue know.
  # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.
combine_num_threads=8
combine_parallel_opts="--num-threads 8"  # queue options for the "combine" stage.
cleanup=true  # if true, intermediate models and superseded egs dirs are removed.
egs_dir=      # if empty, examples are dumped to (and read from) <exp-dir>/egs.
egs_opts=     # extra options passed through to get_egs2.sh.
io_opts="--max-jobs-run 5" # for jobs with a lot of I/O, limits the number running at one time.
align_cmd=              # The cmd that is passed to steps/nnet2/align.sh
align_use_gpu=          # Passed to use_gpu in steps/nnet2/align.sh [yes/no]
realign_epochs=         # List of epochs, the beginning of which realignment is done
num_jobs_align=30       # Number of jobs for realignment
# End configuration section.


echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# Print usage and exit unless exactly four positional arguments remain after
# option parsing.  The defaults shown after "|" mirror the configuration
# section at the top of this script.
if [ $# != 4 ]; then
  echo "Usage: $0 [opts] <data> <lang> <ali-dir> <exp-dir>"
  echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --num-epochs <#epochs|5>                         # Number of epochs of training"
  echo "  --initial-learning-rate <initial-learning-rate|0.04> # Learning rate at start of training, e.g. 0.02 for small"
  echo "                                                       # data, 0.01 for large data"
  echo "  --final-learning-rate  <final-learning-rate|0.004>   # Learning rate at end of training, e.g. 0.004 for small"
  echo "                                                   # data, 0.001 for large data"
  echo "  --mix-up <#pseudo-gaussians|0>                   # Can be used to have multiple targets in final output layer,"
  echo "                                                   # per context-dependent state.  Try a number several times #states."
  echo "  --num-jobs-nnet <num-jobs|4>                     # Number of parallel jobs to use for main neural net"
  echo "                                                   # training (will affect results as well as speed; try 8, 16)"
  echo "                                                   # Note: if you increase this, you may want to also increase"
  echo "                                                   # the learning rate."
  echo "  --num-threads <num-threads|16>                   # Number of parallel threads per job (will affect results"
  echo "                                                   # as well as speed; may interact with batch size; if you increase"
  echo "                                                   # this, you may want to decrease the batch size."
  echo "  --parallel-opts <opts|\"--num-threads 16 --mem 1G\">      # extra options to pass to e.g. queue.pl for processes that"
  echo "                                                   # use multiple threads... "
  echo "  --io-opts <opts|\"--max-jobs-run 5\">                       # Options given to e.g. queue.pl for jobs that do a lot of I/O."
  echo "  --minibatch-size <minibatch-size|128>            # Size of minibatch to process (note: product with --num-threads"
  echo "                                                   # should not get too large, e.g. >2k)."
  echo "  --samples-per-iter <#samples|400000>             # Number of samples of data to process per iteration, per"
  echo "                                                   # process."
  echo "  --realign-epochs <list-of-epochs|\"\">           # A list of space-separated epoch indices the beginning of which"
  echo "                                                   # realignment is to be done"
  echo "  --align-cmd (utils/run.pl|utils/queue.pl <queue opts>) # passed to align.sh"
  echo "  --align-use-gpu (yes/no)                         # specify if gpu is to be used for realignment"
  echo "  --num-jobs-align <#njobs|30>                     # Number of jobs to perform realignment"
  echo "  --stage <stage|-4>                               # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."


  exit 1;
fi

# Positional arguments: data directory, lang directory, alignment directory
# and the output (experiment) directory.
data=$1
lang=$2
alidir=$3
dir=$4

# Realignment needs both an alignment command and an explicit GPU choice.
if [ ! -z "$realign_epochs" ]; then
  [ -z "$align_cmd" ] && echo "$0: realign_epochs specified but align_cmd not specified" && exit 1
  [ -z "$align_use_gpu" ] && echo "$0: realign_epochs specified but align_use_gpu not specified" && exit 1
fi

# Check some files.
# Abort before doing any work if a required input is missing.
for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done


# Set some variables.
# num_leaves is the "num-pdfs" value printed by tree-info; it is used further
# down as the output dimension of the network.
# NOTE(review): without pipefail the "|| exit 1" only sees awk's exit status,
# so a tree-info failure is in practice caught by the emptiness check below.
num_leaves=`tree-info $alidir/tree 2>/dev/null | grep num-pdfs | awk '{print $2}'` || exit 1
[ -z $num_leaves ] && echo "\$num_leaves is unset" && exit 1
[ "$num_leaves" -eq "0" ] && echo "\$num_leaves is 0" && exit 1

nj=`cat $alidir/num_jobs` || exit 1;  # number of jobs in alignment dir...
# in this dir we'll have just one job.
sdata=$data/split$nj
utils/split_data.sh $data $nj

# Create the output directory and record the job count and tree there.
mkdir -p $dir/log
echo $nj > $dir/num_jobs
cp $alidir/tree $dir

# The lang directory and the alignments must agree on the phone set; keep a
# copy of phones.txt alongside the model.
utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;


# Dump training examples into $dir/egs, unless we are resuming from a later
# stage or an existing egs directory was supplied in $egs_dir.
if [ $stage -le -3 ] && [ -z "$egs_dir" ]; then
  echo "$0: calling get_egs2.sh"
  # set --frames-per-eg to 1 because there is no context, so there
  # is no advantage in having multiple frames per eg.
  steps/nnet2/get_egs2.sh --feat-type raw \
    --frames-per-eg 1 --left-context 0 --right-context 0 \
    --cmvn-opts "--norm-means=false --norm-vars=false" \
    --io-opts "$io_opts" \
    --samples-per-iter $samples_per_iter --stage $get_egs_stage \
    --cmd "$cmd" $egs_opts $data $alidir $dir/egs || exit 1;
fi

# Default location of the examples when none was supplied.
if [ -z "$egs_dir" ]; then
  egs_dir=$dir/egs
fi

# Read the layout of the egs directory; each error message names the file
# that could not be read.
frames_per_eg=$(cat $egs_dir/info/frames_per_eg) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; }
num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/num_archives"; exit 1; }

# num_archives_expanded considers each separate label-position from
# 0..frames_per_eg-1 to be a separate archive.
num_archives_expanded=$[$num_archives*$frames_per_eg]

# There is no point in running more parallel jobs than there are (expanded)
# archives to hand out, so clamp the job count.
if [ $num_jobs_nnet -gt $num_archives_expanded ]; then
  echo "$0: --num-jobs-nnet cannot exceed num-archives*frames-per-eg which is $num_archives_expanded"
  echo "$0: setting --num-jobs-nnet to $num_archives_expanded"
  num_jobs_nnet=$num_archives_expanded
fi

# Stage -2: write the network config (one affine layer followed by a softmax
# over the tree leaves) and create the initial model 0.mdl from it.
if [ $stage -le -2 ]; then
  echo "$0: initializing neural net";
  # Input dimension is taken from the features themselves.
  feat_dim=$(feat-to-dim scp:$data/feats.scp -) || exit 1;

  # Options shared by the online-preconditioned affine component.
  online_preconditioning_opts="alpha=$alpha num-samples-history=$num_samples_history update-period=$update_period rank-in=$precondition_rank_in rank-out=$precondition_rank_out max-change-per-sample=$max_change_per_sample"

  # The parameter/bias stddevs are fixed at 0 in the config below, so no
  # initialization stddev needs to be computed here.
  cat >$dir/nnet.config <<EOF
AffineComponentPreconditionedOnline input-dim=$feat_dim output-dim=$num_leaves $online_preconditioning_opts learning-rate=$initial_learning_rate param-stddev=0 bias-stddev=0
SoftmaxComponent dim=$num_leaves
EOF

  $cmd $dir/log/nnet_init.log \
    nnet-am-init $alidir/tree $lang/topo "nnet-init $dir/nnet.config -|" \
    $dir/0.mdl || exit 1;
fi

# Stage -1: estimate transition probabilities (and priors) from all alignment
# archives; 0.mdl is read and then overwritten in place.
if [ $stage -le -1 ]; then
  echo "Training transition probabilities and setting priors"
  $cmd $dir/log/train_trans.log \
    nnet-train-transitions $dir/0.mdl "ark:gunzip -c $alidir/ali.*.gz|" $dir/0.mdl \
    || exit 1;
fi

# Choose num_iters so that, as nearly as integer division allows, the data is
# seen $num_epochs times:
#   num_iters * num_jobs_nnet == num_epochs * num_archives_expanded
num_iters=$(( ($num_epochs * $num_archives_expanded) / $num_jobs_nnet ))

printf '%s\n' "$0: Will train for $num_epochs epochs = $num_iters iterations"

# Mixing-up, when requested, is done half-way through training.
mix_up_iter=$(( $num_iters / 2 ))

# Pick the trainer binary suffix and its threading option.  Start from the
# multi-threaded settings and switch to the "-simple" trainer when exactly one
# thread was requested (that is the variant that lets us use the GPU code).
parallel_suffix="-parallel"
parallel_train_opts="--num-threads=$num_threads"
if [ $num_threads -eq 1 ]; then
  parallel_suffix="-simple"
  parallel_train_opts=
  cuda-compiled || {
    echo "$0: WARNING: you are running with one thread but you have not compiled"
    echo "   for CUDA.  You may be running a setup optimized for GPUs.  If you have"
    echo "   GPUs and have nvcc installed, go to src/ and do ./configure; make"
  }
fi


approx_iters_per_epoch=$(( $num_iters / $num_epochs ))
# Decide how many models the final nnet-combine-fast call should cover:
#   min(max(max_models_combine, iters_per_epoch),
#       2/3 * iters_after_mixup)
# Start from max_models_combine, raise it to one epoch's worth of iterations...
num_models_combine=$max_models_combine
[ $num_models_combine -lt $approx_iters_per_epoch ] && num_models_combine=$approx_iters_per_epoch
# ...then cap it at two thirds of the iterations that follow the mix-up.
iters_after_mixup_23=$(( (($num_iters - $mix_up_iter - 1) * 2) / 3 ))
[ $num_models_combine -gt $iters_after_mixup_23 ] && num_models_combine=$iters_after_mixup_23
# Index of the earliest model that takes part in the combination.
first_model_combine=$(( $num_iters - $num_models_combine + 1 ))

# Iteration counter for the training loop.
x=0

# Map each requested realignment epoch to the iteration at which it starts:
# realign_this_iter[<iteration>] holds the epoch value, and is empty for all
# other iterations.
for realign_epoch in $realign_epochs; do
  # compare the equation below with the equation we use to set num_iters above.
  # note, realign_epochs may be floating-point, which is why we don't use $[] to
  # do the math.
  realign_iter=$(perl -e 'print int(($ARGV[0]*$ARGV[1])/$ARGV[2]);' $realign_epoch $num_archives_expanded $num_jobs_nnet)
  realign_this_iter[$realign_iter]=$realign_epoch
done

# The egs directory currently in use; it changes each time we realign.
cur_egs_dir=$egs_dir

# Main training loop.  Iteration x reads $dir/$x.mdl and writes
# $dir/$[x+1].mdl.  Iterations below $stage are skipped, but the egs-directory
# bookkeeping at the top of the loop still runs for them so that a resumed run
# ends up pointing at the right (relabelled) egs directory.
while [ $x -lt $num_iters ]; do

  # A realignment is scheduled for this iteration: from here on the examples
  # come from a new, epoch-specific egs directory.
  if [ ! -z "${realign_this_iter[$x]}" ]; then
    prev_egs_dir=$cur_egs_dir
    cur_egs_dir=$dir/egs_${realign_this_iter[$x]}
  fi

  if [ $x -ge 0 ] && [ $stage -le $x ]; then
    if [ ! -z "${realign_this_iter[$x]}" ]; then
      epoch=${realign_this_iter[$x]}


      echo "Getting average posterior for purposes of adjusting the priors."
      # Note: this just uses CPUs, using a smallish subset of data.
      # always use the first egs archive, which makes the script simpler;
      # we're using different random subsets of it.
      rm $dir/post.$x.*.vec 2>/dev/null
      $cmd JOB=1:$num_jobs_compute_prior $dir/log/get_post.$x.JOB.log \
        nnet-copy-egs --srand=JOB --frame=random ark:$prev_egs_dir/egs.1.ark ark:- \| \
        nnet-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \
        nnet-compute-from-egs "nnet-to-raw-nnet $dir/$x.mdl -|" ark:- ark:- \| \
        matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1;

      sleep 3;  # make sure there is time for $dir/post.$x.*.vec to appear.

      $cmd $dir/log/vector_sum.$x.log \
        vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1;
      rm $dir/post.$x.*.vec;

      echo "Re-adjusting priors based on computed posteriors"
      # The current model is read and overwritten in place.
      $cmd $dir/log/adjust_priors.$x.log \
        nnet-adjust-priors $dir/$x.mdl $dir/post.$x.vec $dir/$x.mdl || exit 1;

      sleep 2

      # Realign the data with the current model, then relabel the previous
      # examples with the new alignments into the new egs directory.
      # NOTE(review): transform_dir and online_ivector_dir are not assigned in
      # this script's configuration section, so they are presumably passed as
      # empty strings -- confirm align.sh accepts that.
      steps/nnet2/align.sh --nj $num_jobs_align --cmd "$align_cmd" --use-gpu $align_use_gpu \
        --transform-dir "$transform_dir" --online-ivector-dir "$online_ivector_dir" \
        --iter $x $data $lang $dir $dir/ali_$epoch || exit 1

      steps/nnet2/relabel_egs2.sh --cmd "$cmd" --iter $x $dir/ali_$epoch \
        $prev_egs_dir $cur_egs_dir || exit 1

      # Only remove egs directories that live under this experiment directory
      # ($dir/egs, $dir/egs_<epoch>).  This is a glob match anchored at both
      # ends; a regex match (=~) here would be unanchored and would treat
      # "egs*" as "eg" followed by any number of "s".
      if $cleanup && [[ $prev_egs_dir == "$dir"/egs* ]]; then
        steps/nnet2/remove_egs.sh $prev_egs_dir
      fi
    fi

    # Set off jobs doing some diagnostics, in the background.
    # Use the egs dir from the previous iteration for the diagnostics
    $cmd $dir/log/compute_prob_valid.$x.log \
      nnet-compute-prob $dir/$x.mdl ark:$cur_egs_dir/valid_diagnostic.egs &
    $cmd $dir/log/compute_prob_train.$x.log \
      nnet-compute-prob $dir/$x.mdl ark:$cur_egs_dir/train_diagnostic.egs &
    # Progress between consecutive models is skipped right after a mix-up
    # (detected by the presence of the previous iteration's mix_up log).
    if [ $x -gt 0 ] && [ ! -f $dir/log/mix_up.$[$x-1].log ]; then
      $cmd $dir/log/progress.$x.log \
        nnet-show-progress --use-gpu=no $dir/$[$x-1].mdl $dir/$x.mdl \
        ark:$cur_egs_dir/train_diagnostic.egs '&&' \
        nnet-am-info $dir/$x.mdl &
    fi

    echo "Training neural net (pass $x)"

    mdl=$dir/$x.mdl

    if [ $x -eq 0 ]; then
      # on iteration zero, use a smaller minibatch
      # size and just one job: the model-averaging doesn't seem to be helpful
      # when the model is changing too fast (i.e. it worsens the objective
      # function), and the smaller minibatch size will help to keep
      # the update stable.
      # (All jobs are still run below; with do_average=false the single best
      # job's model is selected instead of averaging.)
      this_minibatch_size=$[$minibatch_size/2];
      do_average=false
    else
      this_minibatch_size=$minibatch_size
      do_average=true
    fi

    # Failure marker written by any training job that fails; clear a stale one.
    rm $dir/.error 2>/dev/null


    ( # this sub-shell is so that when we "wait" below,
      # we only wait for the training jobs that we just spawned,
      # not the diagnostic jobs that we spawned above.

      # We can't easily use a single parallel SGE job to do the main training,
      # because the computation of which archive and which --frame option
      # to use for each job is a little complex, so we spawn each one separately.
      for n in $(seq $num_jobs_nnet); do
        k=$[$x*$num_jobs_nnet + $n - 1]; # k is a zero-based index that we'll derive
                                         # the other indexes from.
        archive=$[($k%$num_archives)+1]; # work out the 1-based archive index.
        frame=$[(($k/$num_archives)%$frames_per_eg)]; # work out the 0-based frame
        # index; this increases more slowly than the archive index because the
        # same archive with different frame indexes will give similar gradients,
        # so we want to separate them in time.

        $cmd $parallel_opts $dir/log/train.$x.$n.log \
          nnet-train$parallel_suffix $parallel_train_opts \
          --minibatch-size=$this_minibatch_size --srand=$x $dir/$x.mdl \
          "ark,bg:nnet-copy-egs --frame=$frame ark:$cur_egs_dir/egs.$archive.ark ark:-|nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-|" \
          $dir/$[$x+1].$n.mdl || touch $dir/.error &
      done
      wait
    )
    # the error message below is not that informative, but $cmd will
    # have printed a more specific one.
    [ -f $dir/.error ] && echo "$0: error on iteration $x of training" && exit 1;

    # Per-job output models of this iteration.
    nnets_list=
    for n in `seq 1 $num_jobs_nnet`; do
      nnets_list="$nnets_list $dir/$[$x+1].$n.mdl"
    done

    # Learning rate for the next model: geometric interpolation from the
    # initial rate to the final rate over the whole run (num_iters iterations).
    learning_rate=`perl -e '($x,$n,$i,$f)=@ARGV; print ($x >= $n ? $f : $i*exp($x*log($f/$i)/$n));' $[$x+1] $num_iters $initial_learning_rate $final_learning_rate`;

    if $do_average; then
      # average the output of the different jobs.
      $cmd $dir/log/average.$x.log \
        nnet-am-average $nnets_list - \| \
        nnet-am-copy --learning-rate=$learning_rate - $dir/$[$x+1].mdl || exit 1;
    else
      # choose the best from the different jobs.
      # "Best" is the job whose training log reports the highest (last)
      # log-prob-per-frame; job 1 is used if none reports a value.
      n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) {
          $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn";
          undef $logprob; while (<F>) { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } }
          close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob;
          $best_n=$n; } } print "$best_n\n"; ' $num_jobs_nnet $dir/log/train.$x.%d.log) || exit 1;
      [ -z "$n" ] && echo "Error getting best model" && exit 1;
      $cmd $dir/log/select.$x.log \
        nnet-am-copy --learning-rate=$learning_rate $dir/$[$x+1].$n.mdl $dir/$[$x+1].mdl || exit 1;
    fi

    if [ "$mix_up" -gt 0 ] && [ $x -eq $mix_up_iter ]; then
      # mix up.
      # The new model is mixed up in place.
      echo Mixing up from $num_leaves to $mix_up components
      $cmd $dir/log/mix_up.$x.log \
        nnet-am-mixup --min-count=10 --num-mixtures=$mix_up \
        $dir/$[$x+1].mdl $dir/$[$x+1].mdl || exit 1;
    fi
    # The per-job models are no longer needed once the combined one exists.
    rm $nnets_list
    [ ! -f $dir/$[$x+1].mdl ] && exit 1;
    # With cleanup enabled, drop model x-1 unless its index is a multiple of
    # 100 or it is one of the models needed for the final combination.
    if [ -f $dir/$[$x-1].mdl ] && $cleanup && \
       [ $[($x-1)%100] -ne 0  ] && [ $[$x-1] -lt $first_model_combine ]; then
      rm $dir/$[$x-1].mdl
    fi
  fi
  x=$[$x+1]
done


if [ $stage -le $num_iters ]; then
  echo "Doing final combination to produce final.mdl"

  # Build 'nnets_list', the inputs handed to nnet-combine-fast.  Each entry is
  # either a model file or a piped command that averages several models.
  nnets_list=()
  if [ $max_models_combine -lt $num_models_combine ]; then
    # There are more models than we want to pass directly (e.g. > 20), so the
    # $num_models_combine models are split into $max_models_combine consecutive
    # groups and each argument becomes the average of one group.
    cur_offset=0 # offset from first_model_combine at which the next group starts.
    for group in $(seq $max_models_combine); do
      next_offset=$(( ($group * $num_models_combine) / $max_models_combine ))
      group_mdls=""
      for offset in $(seq $cur_offset $(( $next_offset - 1 ))); do
        mdl=$dir/$(( $first_model_combine + $offset )).mdl
        if [ ! -f $mdl ]; then
          echo "Expected $mdl to exist"
          exit 1
        fi
        group_mdls="$group_mdls $mdl"
      done
      nnets_list[$(( $group - 1 ))]="nnet-am-average $group_mdls - |"
      cur_offset=$next_offset
    done
  else
    # Few enough models: pass each model file as its own argument.
    nnets_list=
    last_index=$(( num_models_combine - 1 ))
    for idx in $(seq 0 $last_index); do
      mdl=$dir/$(( $first_model_combine + $idx )).mdl
      if [ ! -f $mdl ]; then
        echo "Expected $mdl to exist"
        exit 1
      fi
      nnets_list[$idx]=$mdl
    done
  fi


  # --use-gpu=no keeps nnet-combine-fast off the GPU, since with many models it
  # can run out of memory there; several threads are used instead to speed it
  # up (this isn't ideal...).
  # The minibatch size is the number of combination examples divided (rounding
  # up) by the number of threads, capped at 512.
  num_egs=$(nnet-copy-egs ark:$cur_egs_dir/combine.egs ark:/dev/null 2>&1 | tail -n 1 | awk '{print $NF}')
  mb=$(( ($num_egs + $combine_num_threads - 1) / $combine_num_threads ))
  if [ $mb -gt 512 ]; then
    mb=512
  fi
  # A large --initial-model value makes the combination start from the average
  # of all the models.  Starting from a single model works badly: because these
  # nonlinearities are invariant to scaling we get zero diagonal entries in the
  # Fisher matrix that nnet-combine-fast uses for scaling, and after flooring
  # and inversion the initial model ends up with a much higher learning rate
  # than the others, which stops the optimization from working well.
  $cmd $combine_parallel_opts $dir/log/combine.log \
    nnet-combine-fast --initial-model=100000 --num-lbfgs-iters=40 --use-gpu=no \
      --num-threads=$combine_num_threads \
      --verbose=3 --minibatch-size=$mb "${nnets_list[@]}" ark:$cur_egs_dir/combine.egs \
      $dir/final.mdl || exit 1;

  # Score the combined model on the same diagnostic subsets as the earlier
  # compute_prob jobs (different subsets would give different probs).  Both
  # jobs are left running in the background.
  $cmd $dir/log/compute_prob_valid.final.log \
    nnet-compute-prob $dir/final.mdl ark:$cur_egs_dir/valid_diagnostic.egs &
  $cmd $dir/log/compute_prob_train.final.log \
    nnet-compute-prob $dir/final.mdl ark:$cur_egs_dir/train_diagnostic.egs &
fi

if [ $stage -le $(( $num_iters + 1 )) ]; then
  echo "Getting average posterior for purposes of adjusting the priors."
  # Note: this just uses CPUs, using a smallish subset of data.
  post=$dir/post.$x  # common prefix of the per-job and the summed posterior vectors.
  rm $post.*.vec 2>/dev/null
  # Each job sums the network outputs over its own random subset of
  # $prior_subset_size examples and writes one vector.
  $cmd JOB=1:$num_jobs_compute_prior $dir/log/get_post.$x.JOB.log \
    nnet-copy-egs --frame=random --srand=JOB ark:$cur_egs_dir/egs.1.ark ark:- \| \
    nnet-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \
    nnet-compute-from-egs "nnet-to-raw-nnet $dir/final.mdl -|" ark:- ark:- \| \
    matrix-sum-rows ark:- ark:- \| vector-sum ark:- $post.JOB.vec || exit 1;

  sleep 3;  # make sure there is time for the per-job vectors to appear.

  # Add up the per-job vectors, then discard them.
  $cmd $dir/log/vector_sum.$x.log \
   vector-sum $post.*.vec $post.vec || exit 1;

  rm $post.*.vec;

  echo "Re-adjusting priors based on computed posteriors"
  # final.mdl is rewritten in place with the adjusted priors.
  $cmd $dir/log/adjust_priors.final.log \
    nnet-adjust-priors $dir/final.mdl $post.vec $dir/final.mdl || exit 1;
fi


# Training only counts as successful if final.mdl was produced; stop before
# any cleanup otherwise.
if ! [ -f $dir/final.mdl ]; then
  echo "$0: $dir/final.mdl does not exist."
  exit 1
fi

sleep 2

echo Done

if $cleanup; then
  echo Cleaning up data
  # The examples are only removed when their path matches the regular
  # expression "$dir/egs*".
  if [[ $cur_egs_dir =~ $dir/egs* ]]; then
    steps/nnet2/remove_egs.sh $cur_egs_dir
  fi

  echo Removing most of the models
  # Every 100th model and the last one ($num_iters.mdl) are kept; any other
  # numbered model that still exists is deleted.
  for x in $(seq 0 $num_iters); do
    [ $(( $x % 100 )) -ne 0 ] || continue
    [ $x -ne $num_iters ] || continue
    [ -f $dir/$x.mdl ] || continue
    rm $dir/$x.mdl
  done
fi


================================================
FILE: egs/steps/nnet2/retrain_tanh.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.

# This script is for training networks with tanh nonlinearities; it starts with
# a given model and supports increasing the hidden-layer dimension.  It is
# otherwise similar to train_tanh.sh

# Begin configuration section.
# Default values for the options described in the usage message below;
# parse_options.sh (sourced further down) is presumably what overrides them
# from the command line.
cmd=run.pl
num_epochs=15      # Number of epochs during which we reduce
                   # the learning rate; the number of iterations is worked out from this.
num_epochs_extra=5 # Number of epochs after we stop reducing
                   # the learning rate.
num_iters_final=20 # Maximum number of final iterations to give to the
                   # optimization over the validation set.
initial_learning_rate=0.04
final_learning_rate=0.004
softmax_learning_rate_factor=0.5 # Train this layer half as fast as the other layers.


minibatch_size=128 # by default use a smallish minibatch size for neural net
                   # training; this controls instability which would otherwise
                   # be a problem with multi-threaded update.  Note: it also
                   # interacts with the "preconditioned" update which generally
                   # works better with larger minibatch size, so it's not
                   # completely cost free.

shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
                # on each iter.  You could set it to 0 or to a large value for complete
                # randomization, but this would both consume memory and cause spikes in
                # disk I/O.  Smaller is easier on disk and memory but less random.  It's
                # not a huge deal though, as samples are anyway randomized right at the start.


stage=-5  # Stages/iterations below this value are skipped, so a partially
          # completed run can be resumed.


mix_up=0 # Number of components to mix up to (should be > #tree leaves, if
         # specified.)  Will do this at the start.
widen=0 # If greater than zero, the hidden-layer dimension is increased
        # to this value.  Will do this at the start.
bias_stddev=0.5 # will be used for widen
                # NOTE(review): not referenced anywhere else in this script.

num_threads=16
parallel_opts="--num-threads $num_threads"  # using a smallish #threads by default, out of stability concerns.
  # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.
cleanup=true
# End configuration section.


echo "$0 $@"  # Print the command line for logging

# Set up the environment (if path.sh exists) and apply command-line overrides
# to the configuration variables above.
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# Exactly three positional arguments are required; otherwise print the usage
# message and exit.  The defaults quoted in the message mirror the
# configuration section at the top of this script.
if [ $# != 3 ]; then
  echo "Usage: $0 [opts] <egs-dir> <old-nnet-dir> <exp-dir>"
  echo " e.g.: $0 --widen 1024 exp/tri4_nnet/egs exp/tri4_nnet exp/tri5_nnet"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --num-epochs <#epochs|15>                        # Number of epochs of main training"
  echo "                                                   # while reducing learning rate (determines #iterations, together"
  echo "                                                   # with --samples-per-iter and --num-jobs-nnet)"
  echo "  --num-epochs-extra <#epochs-extra|5>             # Number of extra epochs of training"
  echo "                                                   # after learning rate fully reduced"
  echo "  --initial-learning-rate <initial-learning-rate|0.04> # Learning rate at start of training, e.g. 0.02 for small"
  echo "                                                       # data, 0.01 for large data"
  echo "  --final-learning-rate  <final-learning-rate|0.004>   # Learning rate at end of training, e.g. 0.004 for small"
  echo "                                                   # data, 0.001 for large data"
  echo "  --mix-up <#pseudo-gaussians|0>                   # Can be used to have multiple targets in final output layer,"
  echo "                                                   # per context-dependent state.  Try a number several times #states."
  echo "  --widen <hidden-layer-dim|0>                     # If >0, increase the hidden-layer dimension to this value"
  echo "                                                   # before training starts."
  echo "  --num-threads <num-threads|16>                   # Number of parallel threads per job (will affect results"
  echo "                                                   # as well as speed; may interact with batch size; if you increase"
  echo "                                                   # this, you may want to decrease the batch size."
  echo "  --parallel-opts <opts|\"--num-threads 16\">            # extra options to pass to e.g. queue.pl for processes that"
  echo "                                                   # use multiple threads."
  echo "  --minibatch-size <minibatch-size|128>            # Size of minibatch to process (note: product with --num-threads"
  echo "                                                   # should not get too large, e.g. >2k)."
  echo "  --num-iters-final <#iters|20>                    # Number of final iterations to give to nnet-combine-fast to "
  echo "                                                   # interpolate parameters (the weights are learned with a validation set)"
  echo "  --stage <stage|-5>                               # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."
  exit 1;
fi

# Positional arguments: directory of training examples, directory holding the
# model to start from, and the output directory.
egs_dir=$1
nnet_dir=$2
dir=$3

# Both of these inputs must exist before we do anything else.
for required in $egs_dir/egs.1.0.ark $nnet_dir/final.mdl; do
  if [ ! -f $required ]; then
    echo "$0: no such file $required"
    exit 1
  fi
done

# The job count and iterations-per-epoch are taken from the egs directory.
num_jobs_nnet=$(cat $egs_dir/num_jobs_nnet) || exit 1;
iters_per_epoch=$(cat $egs_dir/iters_per_epoch)  || exit 1;

mkdir -p $dir/log

# Copy over the files that may not be present in the old directory
# (final.mat would be any LDA matrix); failures are silenced.
for optional in phones.txt splice_opts final.mat; do
  cp $nnet_dir/$optional $dir 2>/dev/null
done
# The tree is copied without silencing errors.
cp $nnet_dir/tree $dir


# Create the starting model $dir/0.mdl from the old final.mdl: mixed up to
# $mix_up components if requested (and stage <= -2), otherwise a plain copy.
# Note that the plain copy also happens whenever the first test fails, e.g.
# when resuming with a later --stage.
if [ $stage -le -2 ] && [ $mix_up -gt 0 ]; then
  echo Mixing up to $mix_up components
  # The iteration counter is not set yet at this point, so the log gets a
  # fixed name (the previous "mix_up.$x.log" expanded to "mix_up..log").
  $cmd $dir/log/mix_up.log \
    nnet-am-mixup --min-count=10 --num-mixtures=$mix_up \
      $nnet_dir/final.mdl $dir/0.mdl || exit 1;
else
  cp $nnet_dir/final.mdl $dir/0.mdl || exit 1;
fi

# Optionally widen the hidden layers of the starting model, in place.
if [ $stage -le -1 ] && [ $widen -gt 0 ]; then
  echo "$0: Widening nnet to hidden-layer-dim=$widen"
  $cmd $dir/log/widen.log \
    nnet-am-widen --hidden-layer-dim=$widen $dir/0.mdl $dir/0.mdl || exit 1;
fi

# Work out the number of iterations from the epoch counts: the learning rate
# is reduced over the first $num_iters_reduce iterations and then held
# constant for a further $num_iters_extra iterations.
num_iters_reduce=$[$num_epochs * $iters_per_epoch];
num_iters_extra=$[$num_epochs_extra * $iters_per_epoch];
num_iters=$[$num_iters_reduce+$num_iters_extra]

echo "$0: Will train for $num_epochs + $num_epochs_extra epochs, equalling "
echo "$0: $num_iters_reduce + $num_iters_extra = $num_iters iterations, "
echo "$0: (while reducing learning rate) + (with constant learning rate)."

# Main training loop.  Iteration x reads $dir/$x.mdl and writes
# $dir/$[x+1].mdl.  Iterations below $stage are skipped (only the counter is
# advanced), which allows an interrupted run to be resumed.
x=0
while [ $x -lt $num_iters ]; do
  if [ $x -ge 0 ] && [ $stage -le $x ]; then
    # Set off jobs doing some diagnostics, in the background.
    $cmd $dir/log/compute_prob_valid.$x.log \
      nnet-compute-prob $dir/$x.mdl ark:$egs_dir/valid_diagnostic.egs &
    $cmd $dir/log/compute_prob_train.$x.log \
      nnet-compute-prob $dir/$x.mdl ark:$egs_dir/train_diagnostic.egs &

    echo "Training neural net (pass $x)"

    # Run $num_jobs_nnet training jobs; job JOB shuffles and trains on
    # egs.JOB.<x mod iters_per_epoch>.ark and writes $dir/$[x+1].JOB.mdl.
    $cmd $parallel_opts JOB=1:$num_jobs_nnet $dir/log/train.$x.JOB.log \
      nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x \
      ark:$egs_dir/egs.JOB.$[$x%$iters_per_epoch].ark ark:- \| \
      nnet-train-parallel --num-threads=$num_threads \
         --minibatch-size=$minibatch_size --srand=$x $dir/$x.mdl \
        ark:- $dir/$[$x+1].JOB.mdl \
      || exit 1;

    # Collect the per-job output models of this iteration.
    nnets_list=
    for n in `seq 1 $num_jobs_nnet`; do
      nnets_list="$nnets_list $dir/$[$x+1].$n.mdl"
    done

    # Learning rate for the next model: interpolated geometrically from
    # $initial_learning_rate to $final_learning_rate over the first
    # $num_iters_reduce iterations, and equal to $final_learning_rate after that.
    learning_rate=`perl -e '($x,$n,$i,$f)=@ARGV; print ($x >= $n ? $f : $i*exp($x*log($f/$i)/$n));' $[$x+1] $num_iters_reduce $initial_learning_rate $final_learning_rate`;
    softmax_learning_rate=`perl -e "print $learning_rate * $softmax_learning_rate_factor;"`;
    # $dir/foo is a scratch file holding the nnet-am-info output of the first job's model.
    nnet-am-info $dir/$[$x+1].1.mdl > $dir/foo  2>/dev/null || exit 1
    nu=`cat $dir/foo | grep num-updatable-components | awk '{print $2}'`
    na=`cat $dir/foo | grep AffineComponent | wc -l` # number of last AffineComponent layer [one-based]
    # NOTE(review): this grep would also count any line containing
    # "FixedAffineComponent"; confirm that 'na' still indexes the intended
    # updatable component for models that contain such a component.
    # Build a colon-separated list with one learning rate per updatable
    # component; component number $na gets the reduced (softmax) rate.
    lr_string="$learning_rate"
    for n in `seq 2 $nu`; do 
      if [ $n -eq $na ]; then lr=$softmax_learning_rate;
      else lr=$learning_rate; fi
      lr_string="$lr_string:$lr"
    done
    
    # Average the per-job models and install the new learning rates.
    $cmd $dir/log/average.$x.log \
      nnet-am-average $nnets_list - \| \
      nnet-am-copy --learning-rates=$lr_string - $dir/$[$x+1].mdl || exit 1;

    # The per-job models are no longer needed once they have been averaged.
    rm $nnets_list
  fi
  x=$[$x+1]
done

# Final combination: final.mdl is built from the last $num_iters_final models
# (e.g. the last 10), a count that is capped at the number of
# constant-learning-rate iterations.
if [ $num_iters_extra -lt $num_iters_final ]; then
  echo "Setting num_iters_final=$num_iters_extra"
  num_iters_final=$num_iters_extra
fi
start=$(( $num_iters - $num_iters_final + 1 ))
nnets_list=
for x in $(seq $start $num_iters); do
  nnets_list+=" $dir/$x.mdl"
done

if [ $stage -le $num_iters ]; then
  # Minibatch size: the number of combination examples divided by the number
  # of threads, rounded up.
  num_egs=$(nnet-copy-egs ark:$egs_dir/combine.egs ark:/dev/null 2>&1 | tail -n 1 | awk '{print $NF}')
  mb=$(( ($num_egs + $num_threads - 1) / $num_threads ))
  $cmd $parallel_opts $dir/log/combine.log \
    nnet-combine-fast --use-gpu=no --num-threads=$num_threads --verbose=3 --minibatch-size=$mb \
    $nnets_list ark:$egs_dir/combine.egs $dir/final.mdl || exit 1;
fi

sleep 2; # make sure final.mdl exists.

# Score the combined model on the same diagnostic subsets that were used for
# the per-iteration compute_prob jobs (different subsets would give different
# probs).  Both jobs are started in the background.
for subset in valid train; do
  $cmd $dir/log/compute_prob_$subset.final.log \
    nnet-compute-prob $dir/final.mdl ark:$egs_dir/${subset}_diagnostic.egs &
done

echo Done

if $cleanup; then
  echo Removing most of the models
  # Every 10th model is kept, as are the models from $first_combined onwards
  # (the ones that were combined into final.mdl); the rest are deleted.
  first_combined=$(( $num_iters - $num_iters_final + 1 ))
  for x in $(seq 0 $num_iters); do
    [ $(( $x % 10 )) -ne 0 ] || continue
    [ $x -lt $first_combined ] || continue
    rm $dir/$x.mdl
  done
fi


================================================
FILE: egs/steps/nnet2/train_block.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
# this is as train_tanh.sh but for on top of fbank feats-- we have block-diagonal
# transforms for the first few layers, on separate frequency bands.
# Otherwise it's tanh.

# Begin configuration section.
# Default values for the options described in the usage message below;
# parse_options.sh (sourced further down) is presumably what overrides them
# from the command line.
cmd=run.pl
num_epochs=15      # Number of epochs during which we reduce
                   # the learning rate; the number of iterations is worked out from this.
num_epochs_extra=5 # Number of epochs after we stop reducing
                   # the learning rate.
num_iters_final=20 # Maximum number of final iterations to give to the
                   # optimization over the validation set.
initial_learning_rate=0.04
final_learning_rate=0.004
bias_stddev=0.0
shrink_interval=5 # shrink every $shrink_interval iters except while we are
                  # still adding layers, when we do it every iter.
                  # NOTE(review): the training loop in this script shrinks
                  # whenever the iteration is a multiple of this value and
                  # never adds layers; the second half of the comment looks stale.
shrink=true
num_frames_shrink=2000 # note: must be <= --num-frames-diagnostic option to get_egs.sh, if
                       # given.
softmax_learning_rate_factor=0.5 # Train this layer half as fast as the other layers.

hidden_layer_dim=300 #  You may want this larger, e.g. 1024 or 2048.

minibatch_size=128 # by default use a smallish minibatch size for neural net
                   # training; this controls instability which would otherwise
                   # be a problem with multi-threaded update.  Note: it also
                   # interacts with the "preconditioned" update which generally
                   # works better with larger minibatch size, so it's not
                   # completely cost free.

samples_per_iter=200000 # each iteration of training, see this many samples
                        # per job.  This option is passed to get_egs.sh
num_jobs_nnet=16   # Number of neural net jobs to run in parallel.  This option
                   # is passed to get_egs.sh.
get_egs_stage=0

shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
                # on each iter.  You could set it to 0 or to a large value for complete
                # randomization, but this would both consume memory and cause spikes in
                # disk I/O.  Smaller is easier on disk and memory but less random.  It's
                # not a huge deal though, as samples are anyway randomized right at the start.

add_layers_period=2 # by default, add new layers every 2 iterations.
                    # NOTE(review): not referenced anywhere else in this script.

num_block_layers=2   # number of block-diagonal hidden layers written to nnet.config.
num_normal_layers=2  # number of full affine hidden layers written to nnet.config.
block_size=10        # passed to get_lda_block.sh as --block-size.
block_shift=5        # passed to get_lda_block.sh as --block-shift.

stage=-5

io_opts="--max-jobs-run 5" # for jobs with a lot of I/O, limits the number running at one time.
splice_width=7 # meaning +- 7 frames on each side for second LDA
randprune=4.0 # speeds up LDA.
              # NOTE(review): not referenced anywhere else in this script.
alpha=4.0
max_change=10.0
mix_up=0 # Number of components to mix up to (should be > #tree leaves, if
        # specified.)
num_threads=16
parallel_opts="--num-threads 16 --mem 1G" # by default we use 16 threads; this lets the queue know.
  # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.
cleanup=true
egs_dir=   # if set, get_egs.sh is not called and examples are read from here.
lda_opts=  # extra options passed through to get_lda_block.sh.
egs_opts=  # extra options passed through to get_egs.sh.
# End configuration section.


echo "$0 $@"  # Print the command line for logging

# Set up the environment (if path.sh exists) and apply command-line overrides
# to the configuration variables above.
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;


# Exactly four positional arguments are required; otherwise print the usage
# message and exit.  The defaults quoted in the message mirror the
# configuration section at the top of this script.
# NOTE(review): --num-hidden-layers, --initial-num-hidden-layers, --lda-dim,
# --num-utts-subset, --num-frames-diagnostic and --num-valid-frames-combine
# are described below but have no default in the configuration section;
# confirm whether parse_options.sh accepts them.
if [ $# != 4 ]; then
  echo "Usage: $0 [opts] <data> <lang> <ali-dir> <exp-dir>"
  echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --num-epochs <#epochs|15>                        # Number of epochs of main training"
  echo "                                                   # while reducing learning rate (determines #iterations, together"
  echo "                                                   # with --samples-per-iter and --num-jobs-nnet)"
  echo "  --num-epochs-extra <#epochs-extra|5>             # Number of extra epochs of training"
  echo "                                                   # after learning rate fully reduced"
  echo "  --initial-learning-rate <initial-learning-rate|0.04> # Learning rate at start of training, e.g. 0.02 for small"
  echo "                                                       # data, 0.01 for large data"
  echo "  --final-learning-rate  <final-learning-rate|0.004>   # Learning rate at end of training, e.g. 0.004 for small"
  echo "                                                   # data, 0.001 for large data"
  echo "  --num-hidden-layers <#hidden-layers|2>           # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs"
  echo "  --initial-num-hidden-layers <#hidden-layers|1>   # Number of hidden layers to start with."
  echo "  --add-layers-period <#iters|2>                   # Number of iterations between adding hidden layers"
  echo "  --mix-up <#pseudo-gaussians|0>                   # Can be used to have multiple targets in final output layer,"
  echo "                                                   # per context-dependent state.  Try a number several times #states."
  echo "  --num-jobs-nnet <num-jobs|16>                    # Number of parallel jobs to use for main neural net"
  echo "                                                   # training (will affect results as well as speed; try 8, 16)"
  echo "                                                   # Note: if you increase this, you may want to also increase"
  echo "                                                   # the learning rate."
  echo "  --num-threads <num-threads|16>                   # Number of parallel threads per job (will affect results"
  echo "                                                   # as well as speed; may interact with batch size; if you increase"
  echo "                                                   # this, you may want to decrease the batch size."
  echo "  --parallel-opts <opts|\"--num-threads 16 --mem 1G\">      # extra options to pass to e.g. queue.pl for processes that"
  echo "                                                   # use multiple threads... "
  echo "  --io-opts <opts|\"--max-jobs-run 5\">                       # Options given to e.g. queue.pl for jobs that do a lot of I/O."
  echo "  --minibatch-size <minibatch-size|128>            # Size of minibatch to process (note: product with --num-threads"
  echo "                                                   # should not get too large, e.g. >2k)."
  echo "  --samples-per-iter <#samples|200000>             # Number of samples of data to process per iteration, per"
  echo "                                                   # process."
  echo "  --splice-width <width|7>                         # Number of frames on each side to append for feature input"
  echo "                                                   # (note: we splice processed, typically 40-dimensional frames"
  echo "  --lda-dim <dim|250>                              # Dimension to reduce spliced features to with LDA"
  echo "  --num-iters-final <#iters|20>                    # Number of final iterations to give to nnet-combine-fast to "
  echo "                                                   # interpolate parameters (the weights are learned with a validation set)"
  echo "  --num-utts-subset <#utts|300>                    # Number of utterances in subsets used for validation and diagnostics"
  echo "                                                   # (the validation subset is held out from training)"
  echo "  --num-frames-diagnostic <#frames|4000>           # Number of frames used in computing (train,valid) diagnostics"
  echo "  --num-valid-frames-combine <#frames|10000>       # Number of frames used in getting combination weights at the"
  echo "                                                   # very end."
  echo "  --stage <stage|-5>                               # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."

  exit 1;
fi

# Positional arguments: data directory, lang directory, alignment directory
# and output (experiment) directory.
data=$1
lang=$2
alidir=$3
dir=$4

# Check some files.
for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done


# Set some variables.
# Number of pdfs of the alignment model; used as the output dimension of the
# network and in the mix-up message.
num_leaves=`gmm-info $alidir/final.mdl 2>/dev/null | awk '/number of pdfs/{print $NF}'` || exit 1;

nj=`cat $alidir/num_jobs` || exit 1;  # number of jobs in alignment dir...
# in this dir we'll have just one job.
sdata=$data/split$nj
utils/split_data.sh $data $nj

mkdir -p $dir/log
echo $nj > $dir/num_jobs
cp $alidir/tree $dir

utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

# Get list of validation utterances.
# num_utts_subset has no default in the configuration section, so without a
# fallback the commands below would run "head -" with no count.  Fall back to
# 300, the default quoted in the usage message.
num_utts_subset=${num_utts_subset:-300}
awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -n $num_utts_subset \
    > $dir/valid_uttlist || exit 1;
# A same-sized list of training utterances, excluding the validation ones.
awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlist | \
     head -n $num_utts_subset > $dir/train_subset_uttlist || exit 1;


# Stage -4: estimate the block LDA transform.
if [ $stage -le -4 ]; then
  echo "$0: calling get_lda.sh"
  steps/nnet2/get_lda_block.sh --block-size $block_size --block-shift $block_shift \
    $lda_opts --splice-width $splice_width --cmd "$cmd" $data $lang $alidir $dir || exit 1;
fi

# Dimensions recorded in $dir by get_lda_block.sh.
feat_dim=$(cat $dir/feat_dim) || exit 1;
lda_dim=$(cat $dir/lda_dim) || exit 1;
num_blocks=$(cat $dir/num_blocks) || exit 1;

# Stage -3: dump the training examples, unless an existing egs directory was
# supplied through --egs-dir.
if [ $stage -le -3 ] && [ -z "$egs_dir" ]; then
  echo "$0: calling get_egs.sh"
  steps/nnet2/get_egs.sh --io-opts "$io_opts" --samples-per-iter $samples_per_iter \
      --num-jobs-nnet $num_jobs_nnet \
      --splice-width $splice_width --stage $get_egs_stage --cmd "$cmd" $egs_opts --feat-type raw \
      $data $lang $alidir $dir || exit 1;
fi

# With no --egs-dir given, the examples live under the experiment directory.
[ -z $egs_dir ] && egs_dir=$dir/egs

iters_per_epoch=$(cat $egs_dir/iters_per_epoch)  || exit 1;
# The number of jobs stored with the examples overrides --num-jobs-nnet; warn
# when the two differ.
if ! [ $num_jobs_nnet -eq $(cat $egs_dir/num_jobs_nnet) ]; then
  echo "$0: Warning: using --num-jobs-nnet=$(cat $egs_dir/num_jobs_nnet) from $egs_dir"
fi
num_jobs_nnet=$(cat $egs_dir/num_jobs_nnet)


# Stage -2: write the network configuration to $dir/nnet.config and create
# the initial model $dir/0.mdl from it.
if [ $stage -le -2 ]; then
  echo "$0: initializing neural net";

  # Size of each block in the block-diagonal hidden layers, computed as
  # int(sqrt(hidden_layer_dim^2 / num_blocks)).
  hidden_block_size=`perl -e "print int(sqrt(($hidden_layer_dim*$hidden_layer_dim)/$num_blocks));"`
  echo "Hidden block size is $hidden_block_size"
  hidden_block_dim=$[$hidden_block_size*$num_blocks]
  # Parameter standard deviations, each 1/sqrt of a dimension: block_size for
  # the first block layer, hidden_block_size for later block layers,
  # hidden_block_dim for the first full layer, hidden_layer_dim for the rest.
  block_stddev=`perl -e "print 1.0/sqrt($block_size);"`
  hidden_block_stddev=`perl -e "print 1.0/sqrt($hidden_block_size);"`
  first_hidden_layer_stddev=`perl -e "print 1.0/sqrt($hidden_block_dim);"`
  stddev=`perl -e "print 1.0/sqrt($hidden_layer_dim);"`


  # Input splicing, the fixed LDA transform, and the first block layer.
  cat >$dir/nnet.config <<EOF
SpliceComponent input-dim=$feat_dim left-context=$splice_width right-context=$splice_width
FixedAffineComponent matrix=$dir/lda.mat
BlockAffineComponentPreconditioned input-dim=$lda_dim output-dim=$hidden_block_dim alpha=$alpha learning-rate=$initial_learning_rate num-blocks=$num_blocks param-stddev=$block_stddev bias-stddev=$bias_stddev
TanhComponent dim=$hidden_block_dim
EOF
  # Remaining block layers (layers 2..num_block_layers).
  for n in `seq 2 $num_block_layers`; do
    cat >>$dir/nnet.config <<EOF
BlockAffineComponentPreconditioned input-dim=$hidden_block_dim output-dim=$hidden_block_dim alpha=$alpha num-blocks=$num_blocks learning-rate=$initial_learning_rate param-stddev=$hidden_block_stddev bias-stddev=$bias_stddev
TanhComponent dim=$hidden_block_dim
EOF
  done
  # First full affine layer, mapping from the block dimension to hidden_layer_dim.
  cat >>$dir/nnet.config <<EOF
AffineComponentPreconditioned input-dim=$hidden_block_dim output-dim=$hidden_layer_dim alpha=$alpha max-change=$max_change learning-rate=$initial_learning_rate param-stddev=$first_hidden_layer_stddev bias-stddev=$bias_stddev
TanhComponent dim=$hidden_layer_dim
EOF
  # Remaining full affine layers (layers 2..num_normal_layers).
  for n in `seq 2 $num_normal_layers`; do
  cat >>$dir/nnet.config <<EOF
AffineComponentPreconditioned input-dim=$hidden_layer_dim output-dim=$hidden_layer_dim alpha=$alpha max-change=$max_change learning-rate=$initial_learning_rate param-stddev=$stddev bias-stddev=$bias_stddev
TanhComponent dim=$hidden_layer_dim
EOF
  done

  # Output layer with one output per tree leaf; its parameters start at zero
  # (param-stddev=0, bias-stddev=0).
  cat >>$dir/nnet.config <<EOF
AffineComponentPreconditioned input-dim=$hidden_layer_dim output-dim=$num_leaves alpha=$alpha max-change=$max_change learning-rate=$initial_learning_rate param-stddev=0 bias-stddev=0
SoftmaxComponent dim=$num_leaves
EOF

  # Create the initial acoustic model from the tree, the topology and the
  # network built from nnet.config.
  $cmd $dir/log/nnet_init.log \
    nnet-am-init $alidir/tree $lang/topo "nnet-init $dir/nnet.config -|" \
    $dir/0.mdl || exit 1;
fi

# Stage -1: estimate transition probabilities and priors from the alignments,
# updating 0.mdl in place.
if [ $stage -le -1 ]; then
  echo "Training transition probabilities and setting priors"
  $cmd $dir/log/train_trans.log \
    nnet-train-transitions $dir/0.mdl "ark:gunzip -c $alidir/ali.*.gz|" $dir/0.mdl \
    || exit 1;
fi

# Iteration counts: the learning rate is reduced over the first
# $num_iters_reduce iterations and kept constant for $num_iters_extra more.
num_iters_reduce=$(( $num_epochs * $iters_per_epoch ))
num_iters_extra=$(( $num_epochs_extra * $iters_per_epoch ))
num_iters=$(( $num_iters_reduce + $num_iters_extra ))

echo "$0: Will train for $num_epochs + $num_epochs_extra epochs, equalling "
echo "$0: $num_iters_reduce + $num_iters_extra = $num_iters iterations, "
echo "$0: (while reducing learning rate) + (with constant learning rate)."

# Mixing up, when requested, happens at the halfway iteration.
mix_up_iter=$(( $num_iters / 2 ))

# Pick the training binary suffix: the multi-threaded one by default, or the
# "-simple" one when there is a single thread (which enables us to use GPU code).
train_suffix="-parallel --num-threads=$num_threads"
if [ $num_threads -eq 1 ]; then
  train_suffix="-simple"
fi

# Main training loop.  Iteration x reads $dir/$x.mdl and writes
# $dir/$[x+1].mdl.  Iterations below $stage are skipped (only the counter is
# advanced), which allows an interrupted run to be resumed.
x=0
while [ $x -lt $num_iters ]; do
  if [ $x -ge 0 ] && [ $stage -le $x ]; then
    # Set off jobs doing some diagnostics, in the background.
    $cmd $dir/log/compute_prob_valid.$x.log \
      nnet-compute-prob $dir/$x.mdl ark:$egs_dir/valid_diagnostic.egs &
    $cmd $dir/log/compute_prob_train.$x.log \
      nnet-compute-prob $dir/$x.mdl ark:$egs_dir/train_diagnostic.egs &
    # Compare against the previous model, except on the first iteration and
    # when a mix-up log exists for the previous iteration.
    if [ $x -gt 0 ] && [ ! -f $dir/log/mix_up.$[$x-1].log ]; then
      $cmd $dir/log/progress.$x.log \
        nnet-show-progress --use-gpu=no $dir/$[$x-1].mdl $dir/$x.mdl ark:$egs_dir/train_diagnostic.egs &
    fi

    echo "Training neural net (pass $x)"
    mdl=$dir/$x.mdl

    # Run $num_jobs_nnet training jobs; job JOB shuffles and trains on
    # egs.JOB.<x mod iters_per_epoch>.ark and writes $dir/$[x+1].JOB.mdl.
    $cmd $parallel_opts JOB=1:$num_jobs_nnet $dir/log/train.$x.JOB.log \
      nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x \
      ark:$egs_dir/egs.JOB.$[$x%$iters_per_epoch].ark ark:- \| \
      nnet-train$train_suffix \
         --minibatch-size=$minibatch_size --srand=$x "$mdl" \
        ark:- $dir/$[$x+1].JOB.mdl \
      || exit 1;

    # Collect the per-job output models of this iteration.
    nnets_list=
    for n in `seq 1 $num_jobs_nnet`; do
      nnets_list="$nnets_list $dir/$[$x+1].$n.mdl"
    done

    # Learning rate for the next model: interpolated geometrically from
    # $initial_learning_rate to $final_learning_rate over the first
    # $num_iters_reduce iterations, and equal to $final_learning_rate after that.
    learning_rate=`perl -e '($x,$n,$i,$f)=@ARGV; print ($x >= $n ? $f : $i*exp($x*log($f/$i)/$n));' $[$x+1] $num_iters_reduce $initial_learning_rate $final_learning_rate`;
    softmax_learning_rate=`perl -e "print $learning_rate * $softmax_learning_rate_factor;"`;
    # $dir/foo is a scratch file holding the nnet-am-info output of the first job's model.
    nnet-am-info $dir/$[$x+1].1.mdl > $dir/foo  2>/dev/null || exit 1
    nu=`cat $dir/foo | grep num-updatable-components | awk '{print $2}'`
    na=`cat $dir/foo | grep -v Fixed | grep AffineComponent | wc -l`
    # na is number of last updatable AffineComponent layer [one-based, counting only
    # updatable components.]
    # Build a colon-separated list with one learning rate per updatable
    # component; components number $na and $na-1 get the reduced (softmax) rate.
    lr_string="$learning_rate"
    for n in `seq 2 $nu`; do
      if [ $n -eq $na ] || [ $n -eq $[$na-1] ]; then lr=$softmax_learning_rate;
      else lr=$learning_rate; fi
      lr_string="$lr_string:$lr"
    done

    # Average the per-job models and install the new learning rates.
    $cmd $dir/log/average.$x.log \
      nnet-am-average $nnets_list - \| \
      nnet-am-copy --learning-rates=$lr_string - $dir/$[$x+1].mdl || exit 1;

    # When shrinking is enabled, every $shrink_interval-th iteration runs
    # nnet-combine-fast on the single new model, using a random subset of
    # $num_frames_shrink diagnostic examples, and rewrites the model in place.
    if $shrink && [ $[$x % $shrink_interval] -eq 0 ]; then
      mb=$[($num_frames_shrink+$num_threads-1)/$num_threads]
      $cmd $parallel_opts $dir/log/shrink.$x.log \
        nnet-subset-egs --n=$num_frames_shrink --randomize-order=true --srand=$x \
          ark:$egs_dir/train_diagnostic.egs ark:-  \| \
        nnet-combine-fast --num-threads=$num_threads --verbose=3 --minibatch-size=$mb \
          $dir/$[$x+1].mdl ark:- $dir/$[$x+1].mdl || exit 1;
    else
      # On other iters, do nnet-am-fix which is much faster and has roughly
      # the same effect.
      # (Its stderr goes to the log file; its exit status is not checked.)
      nnet-am-fix $dir/$[$x+1].mdl $dir/$[$x+1].mdl 2>$dir/log/fix.$x.log
    fi

    if [ "$mix_up" -gt 0 ] && [ $x -eq $mix_up_iter ]; then
      # mix up.
      echo Mixing up from $num_leaves to $mix_up components
      $cmd $dir/log/mix_up.$x.log \
        nnet-am-mixup --min-count=10 --num-mixtures=$mix_up \
        $dir/$[$x+1].mdl $dir/$[$x+1].mdl || exit 1;
    fi
    # The per-job models are no longer needed once they have been averaged.
    rm $nnets_list
  fi
  x=$[$x+1]
done

# Now do combination.
# At the end, final.mdl will be a combination of the last e.g. 10 models.
nnets_list=()
# Only models trained with the constant learning rate are combined, so cap
# num_iters_final at the number of such iterations.  (The message used to be
# printed without the assignment actually taking place.)
if [ $num_iters_final -gt $num_iters_extra ]; then
  echo "Setting num_iters_final=$num_iters_extra"
  num_iters_final=$num_iters_extra
fi
start=$[$num_iters-$num_iters_final+1]
for x in `seq $start $num_iters`; do
  idx=$[$x-$start]
  # Models from iterations up to and including the mix-up iteration are left
  # out of the combination.
  if [ $x -gt $mix_up_iter ]; then
    nnets_list[$idx]=$dir/$x.mdl # "nnet-am-copy --remove-dropout=true $dir/$x.mdl - |"
  fi
done

if [ $stage -le $num_iters ]; then
  # Below, use --use-gpu=no to disable nnet-combine-fast from using a GPU, as
  # if there are many models it can give out-of-memory error; set num-threads to 8
  # to speed it up (this isn't ideal...)
  this_num_threads=$num_threads
  [ $this_num_threads -lt 8 ] && this_num_threads=8
  # Minibatch size: the number of combination examples divided by the number
  # of threads (rounded up), capped at 512.
  num_egs=`nnet-copy-egs ark:$egs_dir/combine.egs ark:/dev/null 2>&1 | tail -n 1 | awk '{print $NF}'`
  mb=$[($num_egs+$this_num_threads-1)/$this_num_threads]
  [ $mb -gt 512 ] && mb=512
  $cmd $parallel_opts $dir/log/combine.log \
    nnet-combine-fast --use-gpu=no --num-threads=$this_num_threads \
      --verbose=3 --minibatch-size=$mb "${nnets_list[@]}" ark:$egs_dir/combine.egs \
      $dir/final.mdl || exit 1;
fi

# Compute the probability of the final, combined model with
# the same subset we used for the previous compute_probs, as the
# different subsets will lead to different probs.
# Both jobs are started in the background; the script does not wait for them,
# it only sleeps briefly below before reporting completion.
$cmd $dir/log/compute_prob_valid.final.log \
  nnet-compute-prob $dir/final.mdl ark:$egs_dir/valid_diagnostic.egs &
$cmd $dir/log/compute_prob_train.final.log \
  nnet-compute-prob $dir/final.mdl ark:$egs_dir/train_diagnostic.egs &

sleep 2

echo Done

# Optional cleanup: remove the training examples (only if they live under
# $dir) and most of the intermediate models.
if $cleanup; then
  echo Cleaning up data
  # Quote both operands and use the portable "=" so that an empty egs_dir
  # cannot turn the test into a syntax error.
  if [ "$egs_dir" = "$dir/egs" ]; then
    steps/nnet2/remove_egs.sh $dir/egs
  fi
  echo Removing most of the models
  # First iteration whose model takes part in the final combination; models
  # from here on are always kept.
  first_final_iter=$[$num_iters-$num_iters_final+1]
  for x in `seq 0 $num_iters`; do
    if [ $[$x%10] -ne 0 ] && [ $x -lt $first_final_iter ]; then
       # delete all but every 10th model; don't delete the ones which combine to form the final model.
      rm $dir/$x.mdl
    fi
  done
fi


================================================
FILE: egs/steps/nnet2/train_convnet_accel2.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2014  Johns Hopkins University (Author: Daniel Povey).
#                2013  Xiaohui Zhang
#                2013  Guoguo Chen
#                2014  Vimal Manohar
#                2015  Xingyu Na
# Apache 2.0.

# train_convnet_accel2.sh is modified from train_pnorm_accel2.sh. It prototypes
# the training of a ConvNet. The ConvNet is composed of 4 hidden layers. The first layer
# is a Convolutional1d component plus a Maxpooling component. The second layer
# is a single Convolutional1d component. The third and fourth layers are affine
# components with ReLU nonlinearities. Due to non-squashing output, normalize
# component is applied to all four layers. The number of hidden layers is hard
# coded now.

# train_pnorm_accel2.sh is a modified form of train_pnorm_simple2.sh (the "2"
# suffix is because they both use the "new" egs format, created by
# get_egs2.sh).  The "accel" part of the name refers to the fact that this
# script uses a number of jobs that can increase during training.  You can
# specify --num-jobs-initial and --num-jobs-final to control these separately.
# Also, in this script, the learning rates specified by --initial-learning-rate
# and --final-learning-rate are the "effective learning rates" (defined as the
# learning rate divided by the number of jobs), and the actual learning rates
# used will be the specified learning rates multiplied by the current number
# of jobs.  You'll want to set these lower than you normally would previously
# have set the learning rates, by a factor equal to the (previous) number of
# jobs.


# Begin configuration section.
# Every variable below is a default that can be overridden on the command line
# through parse_options.sh (e.g. --num-epochs 20 sets num_epochs).
cmd=run.pl
num_epochs=15      # Number of epochs of training;
                   # the number of iterations is worked out from this.
initial_effective_lrate=0.01
final_effective_lrate=0.001
bias_stddev=0.5
hidden_dim=3000
minibatch_size=128 # by default use a smallish minibatch size for neural net
                   # training; this controls instability which would otherwise
                   # be a problem with multi-threaded update.

samples_per_iter=400000 # each iteration of training, see this many samples
                        # per job.  This option is passed to get_egs2.sh
num_jobs_initial=1    # Number of neural net jobs to run in parallel at the start of training.
num_jobs_final=8      # Number of jobs to run in parallel at the end of training.

prior_subset_size=10000 # 10k samples per job, for computing priors.  Should be
                        # more than enough.
num_jobs_compute_prior=10 # these are single-threaded, run on CPU.
get_egs_stage=0
online_ivector_dir=


max_models_combine=20 # The "max_models_combine" is the maximum number of models we give
  # to the final 'combine' stage, but these models will themselves be averages of
  # iteration-number ranges.

shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
                # on each iter.  You could set it to 0 or to a large value for complete
                # randomization, but this would both consume memory and cause spikes in
                # disk I/O.  Smaller is easier on disk and memory but less random.  It's
                # not a huge deal though, as samples are anyway randomized right at the start.
                # (the point of this is to get data in different minibatches on different iterations,
                # since in the preconditioning method, 2 samples in the same minibatch can
                # affect each others' gradients.)
num_hidden_layers=4 # the network built by this script is hard-coded to four
                    # hidden layers (nnet.config plus replace.{1,2,3}.config).
add_layers_period=2 # by default, add new layers every 2 iterations.
stage=-3

splice_width=4 # meaning +- 4 frames on each side; used as the default for
               # left_context and right_context.
left_context= # if set, overrides splice-width
right_context= # if set, overrides splice-width.
randprune=4.0 # speeds up LDA.
              # NOTE(review): not referenced in the visible part of this script.
alpha=4.0 # relates to preconditioning.
update_period=4 # relates to online preconditioning: says how often we update the subspace.
num_samples_history=2000 # relates to online preconditioning
max_change_per_sample=0.075
precondition_rank_in=20  # relates to online preconditioning
precondition_rank_out=80 # relates to online preconditioning

num_filters1=128      # number of filters in the first convolutional layer
patch_step1=1         # patch step of the first convolutional layer
patch_dim1=7          # dim of convolutional kernel in the first layer
pool_size=3           # size of pooling after the first convolutional layer
num_filters2=256      # number of filters in the second convolutional layer
patch_dim2=4          # dim of convolutional kernel in the second layer
patch_step2=1         # patch step of the second convolutional layer

mix_up=0 # Number of components to mix up to (should be > #tree leaves, if
        # specified.)
num_threads=16
parallel_opts="--num-threads 16 --mem 1G"
  # by default we use 16 threads; this lets the queue know.
  # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.
combine_num_threads=8
combine_parallel_opts="--num-threads 8"  # queue options for the "combine" stage.
cleanup=true
egs_dir=
lda_opts=
lda_dim=
egs_opts=
delta_order=
io_opts="--max-jobs-run 5" # for jobs with a lot of I/O, limits the number running at one time.
transform_dir=     # If supplied, overrides alidir
postdir=
cmvn_opts=  # will be passed to get_egs2.sh as --cmvn-opts, if supplied.
            # only relevant for "raw" features, not lda.
feat_type=  # Can be used to force "raw" features.
align_cmd=              # The cmd that is passed to steps/nnet2/align.sh
align_use_gpu=          # Passed to use_gpu in steps/nnet2/align.sh [yes/no]
realign_times=          # List of times on which we realign.  Each time is
                        # floating point number strictly between 0 and 1, which
                        # will be multiplied by the num-iters to get an iteration
                        # number.
num_jobs_align=30       # Number of jobs for realignment
srand=0 # random seed used to initialize the nnet
# End configuration section.


echo "$0 $@"  # Print the command line for logging

# Set up the environment and let parse_options.sh override the configuration
# defaults above from any --option-name arguments.
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# Exactly four positional arguments are required; otherwise print the usage
# message (defaults shown here mirror the configuration section) and exit.
if [ $# != 4 ]; then
  echo "Usage: $0 [opts] <data> <lang> <ali-dir> <exp-dir>"
  echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --num-epochs <#epochs|15>                        # Number of epochs of training"
  echo "  --initial-effective-lrate <lrate|0.01> # effective learning rate at start of training,"
  echo "                                         # actual learning-rate is this times num-jobs."
  echo "  --final-effective-lrate <lrate|0.001>   # effective learning rate at end of training."
  echo "  --add-layers-period <#iters|2>                   # Number of iterations between adding hidden layers"
  echo "  --mix-up <#pseudo-gaussians|0>                   # Can be used to have multiple targets in final output layer,"
  echo "                                                   # per context-dependent state.  Try a number several times #states."
  echo "  --num-jobs-initial <num-jobs|1>                  # Number of parallel jobs to use for neural net training, at the start."
  echo "  --num-jobs-final <num-jobs|8>                    # Number of parallel jobs to use for neural net training, at the end"
  echo "  --num-threads <num-threads|16>                   # Number of parallel threads per job (will affect results"
  echo "                                                   # as well as speed; may interact with batch size; if you increase"
  echo "                                                   # this, you may want to decrease the batch size."
  echo "  --parallel-opts <opts|\"--num-threads 16 --mem 1G\">      # extra options to pass to e.g. queue.pl for processes that"
  echo "                                                   # use multiple threads... note, you might have to reduce --mem"
  echo "                                                   # versus your defaults, because it gets multiplied by the --num-threads argument."
  echo "  --io-opts <opts|\"--max-jobs-run 5\">                       # Options given to e.g. queue.pl for jobs that do a lot of I/O."
  echo "  --minibatch-size <minibatch-size|128>            # Size of minibatch to process (note: product with --num-threads"
  echo "                                                   # should not get too large, e.g. >2k)."
  echo "  --samples-per-iter <#samples|400000>             # Number of samples of data to process per iteration, per"
  echo "                                                   # process."
  echo "  --splice-width <width|4>                         # Number of frames on each side to append for feature input"
  echo "                                                   # (note: we splice processed, typically 40-dimensional frames)"
  echo "  --realign-times <list-of-times|\"\">               # A list of space-separated numbers strictly between 0 and 1;"
  echo "                                                   # each is multiplied by the number of iterations to get an"
  echo "                                                   # iteration on which realignment is done"
  echo "  --align-cmd (utils/run.pl|utils/queue.pl <queue opts>) # passed to align.sh"
  echo "  --align-use-gpu (yes/no)                         # specify is gpu is to be used for realignment"
  echo "  --num-jobs-align <#njobs|30>                     # Number of jobs to perform realignment"
  echo "  --stage <stage|-3>                               # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."
  echo "ConvNet configurations"
  echo "  --num-filters1 <num-filters1|128>                # number of filters in the first convolutional layer."
  echo "  --patch-step1 <patch-step1|1>                    # patch step of the first convolutional layer."
  echo "  --patch-dim1 <patch-dim1|7>                      # dim of convolutional kernel in the first layer."
  echo "                                                   # (note: (feat-dim - patch-dim1) % patch-step1 should be 0.)"
  echo "  --pool-size <pool-size|3>                        # size of pooling after the first convolutional layer."
  echo "                                                   # (note: (feat-dim - patch-dim1 + 1) % pool-size should be 0.)"
  echo "  --num-filters2 <num-filters2|256>                # number of filters in the second convolutional layer."
  echo "  --patch-dim2 <patch-dim2|4>                      # dim of convolutional kernel in the second layer."
  echo "  --patch-step2 <patch-step2|1>                    # patch step of the second convolutional layer."


  exit 1;
fi

# Positional arguments: data dir, lang dir, alignment dir, experiment dir.
data=$1; lang=$2
alidir=$3; dir=$4

# Realignment needs both the aligner command and the GPU switch.
if [ -n "$realign_times" ]; then
  if [ -z "$align_cmd" ]; then
    echo "$0: realign_times specified but align_cmd not specified"
    exit 1
  fi
  if [ -z "$align_use_gpu" ]; then
    echo "$0: realign_times specified but align_use_gpu not specified"
    exit 1
  fi
fi

# Every one of these inputs must already exist.
for required in $data/feats.scp $lang/L.fst $alidir/final.mdl $alidir/tree; do
  if [ ! -f $required ]; then
    echo "$0: no such file $required"
    exit 1
  fi
done

# Need either soft alignments (posteriors) or hard alignments.
if [ ! -f $postdir/post.1.scp ] && [ ! -f $alidir/ali.1.gz ]; then
  echo "$0: no (soft) alignments provided"
  exit 1
fi

# On interrupt, kill every background job that is still running.
trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM

# Number of pdfs (tree leaves) = output dimension of the network.
num_leaves=$(tree-info $alidir/tree 2>/dev/null | grep num-pdfs | awk '{print $2}') || exit 1
if [ -z $num_leaves ]; then
  echo "\$num_leaves is unset"
  exit 1
fi
if [ "$num_leaves" -eq "0" ]; then
  echo "\$num_leaves is 0"
  exit 1
fi

# Split the data the same way as the alignment directory was split.
nj=$(cat $alidir/num_jobs) || exit 1
sdata=$data/split$nj
utils/split_data.sh $data $nj

# Populate the experiment directory with the basics.
mkdir -p $dir/log
echo $nj > $dir/num_jobs
cp $alidir/tree $dir

# The phone sets of lang and alignment dirs must agree before we copy one.
if ! utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt; then
  exit 1
fi
if ! cp $lang/phones.txt $dir; then
  exit 1
fi

# Collect the optional arguments forwarded to get_egs2.sh; only options the
# user actually set are added.
extra_opts=()
if [ -n "$cmvn_opts" ]; then extra_opts+=(--cmvn-opts "$cmvn_opts"); fi
if [ -n "$feat_type" ]; then extra_opts+=(--feat-type $feat_type); fi
if [ -n "$delta_order" ]; then extra_opts+=(--delta-order $delta_order); fi
if [ -n "$online_ivector_dir" ]; then extra_opts+=(--online-ivector-dir $online_ivector_dir); fi

# Transforms default to the alignment dir; contexts default to splice_width.
transform_dir=${transform_dir:-$alidir}
extra_opts+=(--transform-dir $transform_dir)
left_context=${left_context:-$splice_width}
right_context=${right_context:-$splice_width}
extra_opts+=(--left-context $left_context --right-context $right_context)

# Record the feature dimension in the experiment dir and read it back.
feat-to-dim scp:$sdata/1/feats.scp - > $dir/feat_dim
if ! feat_dim=$(cat $dir/feat_dim); then
  exit 1
fi

# Dump training examples with get_egs2.sh unless the user supplied --egs-dir
# (or asked to start from a later stage).
if [ $stage -le -3 ] && [ -z "$egs_dir" ]; then
  echo "$0: calling get_egs2.sh"
  steps/nnet2/get_egs2.sh $egs_opts "${extra_opts[@]}"  --io-opts "$io_opts" \
    --postdir "$postdir" --samples-per-iter $samples_per_iter --stage $get_egs_stage \
    --cmd "$cmd" --feat-type "raw" $data $alidir $dir/egs || exit 1;
fi

# Resolve the egs directory first, so that the files copied below come from
# whichever directory is actually in use (a user-supplied --egs-dir included).
if [ -z "$egs_dir" ]; then
  egs_dir=$dir/egs
fi

# Keep the feature-processing settings used for the egs next to the model.
for f in cmvn_opts delta_order; do
  if [ -f $egs_dir/$f ]; then
    cp $egs_dir/$f $dir
  fi
done

frames_per_eg=$(cat $egs_dir/info/frames_per_eg) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; }
num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/num_archives"; exit 1; }

# num_archives_expanded considers each separate label-position from
# 0..frames_per_eg-1 to be a separate archive.
num_archives_expanded=$[$num_archives*$frames_per_eg]

# The error messages name the options exactly as they are spelled on the
# command line (--num-jobs-initial / --num-jobs-final).
[ $num_jobs_initial -gt $num_jobs_final ] && \
  echo "$0: --num-jobs-initial cannot exceed --num-jobs-final" && exit 1;

[ $num_jobs_final -gt $num_archives_expanded ] && \
  echo "$0: --num-jobs-final cannot exceed #archives $num_archives_expanded." && exit 1;

# Also rejects a non-numeric value, since the test itself then fails.
if ! [ $num_hidden_layers -ge 1 ]; then
  echo "Invalid num-hidden-layers $num_hidden_layers"
  exit 1
fi

# Stage -2: write the network configs and create the initial model 0.mdl.
# nnet.config describes the starting network (first conv layer + pooling);
# replace.{1,2,3}.config each describe a new top of the network that is
# swapped in later in training via nnet-replace-last-layers.
if [ $stage -le -2 ]; then
  echo "$0: initializing neural net";
  # Dimensions of the spliced input and of each convolution / pooling output.
  tot_splice=$[($delta_order+1)*($left_context+1+$right_context)]
  delta_feat_dim=$[($delta_order+1)*$feat_dim]
  tot_input_dim=$[$feat_dim*$tot_splice]
  num_patch1=$[1+($feat_dim-$patch_dim1)/$patch_step1]
  num_pool=$[$num_patch1/$pool_size]
  patch_stride2=$num_pool
  num_patch2=$[1+($patch_stride2-$patch_dim2)/$patch_step2]
  conv_out_dim1=$[$num_filters1*$num_patch1] # 128 x (36 - 7 + 1)
  pool_out_dim=$[$num_filters1*$num_pool]
  conv_out_dim2=$[$num_filters2*$num_patch2]

  online_preconditioning_opts="alpha=$alpha num-samples-history=$num_samples_history update-period=$update_period rank-in=$precondition_rank_in rank-out=$precondition_rank_out max-change-per-sample=$max_change_per_sample"

  # Actual initial learning rate = effective rate times the initial job count.
  initial_lrate=$(perl -e "print ($initial_effective_lrate*$num_jobs_initial);")
  stddev=`perl -e "print 1.0/sqrt($hidden_dim);"`
  # Initial network: splice -> conv1 -> maxpool -> normalize -> affine -> softmax.
  cat >$dir/nnet.config <<EOF
SpliceComponent input-dim=$delta_feat_dim left-context=$left_context right-context=$right_context
Convolutional1dComponent input-dim=$tot_input_dim output-dim=$conv_out_dim1 learning-rate=$initial_lrate param-stddev=$stddev bias-stddev=$bias_stddev patch-dim=$patch_dim1 patch-step=$patch_step1 patch-stride=$feat_dim
MaxpoolingComponent input-dim=$conv_out_dim1 output-dim=$pool_out_dim pool-size=$pool_size pool-stride=$num_filters1
NormalizeComponent dim=$pool_out_dim
AffineComponentPreconditionedOnline input-dim=$pool_out_dim output-dim=$num_leaves $online_preconditioning_opts learning-rate=$initial_lrate param-stddev=0 bias-stddev=0
SoftmaxComponent dim=$num_leaves
EOF

  # First replacement: second convolutional layer plus a new output layer.
  cat >$dir/replace.1.config <<EOF
Convolutional1dComponent input-dim=$pool_out_dim output-dim=$conv_out_dim2 learning-rate=$initial_lrate param-stddev=$stddev bias-stddev=$bias_stddev patch-dim=$patch_dim2 patch-step=$patch_step2 patch-stride=$patch_stride2 appended-conv=true
NormalizeComponent dim=$conv_out_dim2
AffineComponentPreconditionedOnline input-dim=$conv_out_dim2 output-dim=$num_leaves $online_preconditioning_opts learning-rate=$initial_lrate param-stddev=0 bias-stddev=0
SoftmaxComponent dim=$num_leaves
EOF

  # Second replacement: first affine + ReLU hidden layer on top of conv2.
  cat >$dir/replace.2.config <<EOF
AffineComponentPreconditionedOnline input-dim=$conv_out_dim2 output-dim=$hidden_dim $online_preconditioning_opts learning-rate=$initial_lrate param-stddev=$stddev bias-stddev=$bias_stddev
RectifiedLinearComponent dim=$hidden_dim
NormalizeComponent dim=$hidden_dim
AffineComponentPreconditionedOnline input-dim=$hidden_dim output-dim=$num_leaves $online_preconditioning_opts learning-rate=$initial_lrate param-stddev=0 bias-stddev=0
SoftmaxComponent dim=$num_leaves
EOF

  # Third replacement (replace.3.config): a further affine + ReLU hidden layer
  # of size hidden_dim, followed by a new output layer.
  cat >$dir/replace.3.config <<EOF
AffineComponentPreconditionedOnline input-dim=$hidden_dim output-dim=$hidden_dim $online_preconditioning_opts learning-rate=$initial_lrate param-stddev=$stddev bias-stddev=$bias_stddev
RectifiedLinearComponent dim=$hidden_dim
NormalizeComponent dim=$hidden_dim
AffineComponentPreconditionedOnline input-dim=$hidden_dim output-dim=$num_leaves $online_preconditioning_opts learning-rate=$initial_lrate param-stddev=0 bias-stddev=0
SoftmaxComponent dim=$num_leaves
EOF

  # Build the acoustic model from the tree, topology and the raw initial nnet.
  $cmd $dir/log/nnet_init.log \
    nnet-am-init $alidir/tree $lang/topo "nnet-init --srand=$srand $dir/nnet.config -|" \
    $dir/0.mdl || exit 1;
fi

# Stage -1: estimate transition probabilities (and priors) for 0.mdl from the
# alignments, overwriting 0.mdl in place.
if [ $stage -le -1 ]; then
  echo "Training transition probabilities and setting priors"
  if ! $cmd $dir/log/train_trans.log \
         nnet-train-transitions $dir/0.mdl "ark:gunzip -c $alidir/ali.*.gz|" $dir/0.mdl; then
    exit 1
  fi
fi

# set num_iters so that as close as possible, we process the data $num_epochs
# times, i.e. ($num_iters*$avg_num_jobs) == $num_epochs*$num_archives_expanded,
# where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2.

num_archives_to_process=$[$num_epochs*$num_archives_expanded]
num_archives_processed=0
num_iters=$[($num_archives_to_process*2)/($num_jobs_initial+$num_jobs_final)]

echo "$0: Will train for $num_epochs epochs = $num_iters iterations"

# Iteration by which all layer replacements are considered finished.
finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period]

! [ $num_iters -gt $[$finish_add_layers_iter+2] ] \
  && echo "$0: Insufficient epochs" && exit 1

# mix up at the iteration where we've processed about half the data; this keeps
# the overall training procedure fairly invariant to the number of initial and
# final jobs.
# j = initial, k = final, n = num-iters, x = half-of-data epoch,
# p is proportion of data we want to process (e.g. p=0.5 here).
# solve for x if the amount of data processed by epoch x is p
# times the amount by iteration n.
# put this in wolfram alpha:
# solve { x*j + (k-j)*x*x/(2*n) = p * (j*n + (k-j)*n/2), {x} }
# got: x = (j n-sqrt(-n^2 (j^2 (p-1)-k^2 p)))/(j-k) and j!=k and n!=0
# simplified manually to: n * (sqrt((1-p) j^2 + p k^2) - j)/(k-j)
# which is the expression evaluated below (with x = n*p when j == k); the
# result is rounded to the nearest integer.
mix_up_iter=$(perl -e '($j,$k,$n,$p)=@ARGV; print int(0.5 + ($j==$k ? $n*$p : $n*(sqrt((1-$p)*$j*$j+$p*$k*$k)-$j)/($k-$j))); ' $num_jobs_initial $num_jobs_final $num_iters 0.5)
# Note: this check is applied whether or not --mix-up was requested.
! [ $mix_up_iter -gt $finish_add_layers_iter ] && \
  echo "Mix-up-iter is $mix_up_iter, should be greater than $finish_add_layers_iter -> add more epochs?" \
  && exit 1;

# Pick the trainer binary suffix: nnet-train-parallel for multi-threaded CPU
# training, nnet-train-simple (which can use the GPU) for a single thread.
if ! [ $num_threads -eq 1 ]; then
  parallel_suffix="-parallel"
  parallel_train_opts="--num-threads=$num_threads"
else
  # this enables us to use GPU code if we have just one thread.
  parallel_suffix="-simple"
  parallel_train_opts=
  if ! cuda-compiled; then
    echo "$0: WARNING: you are running with one thread but you have not compiled"
    echo "   for CUDA.  You may be running a setup optimized for GPUs.  If you have"
    echo "   GPUs and have nvcc installed, go to src/ and do ./configure; make"
  fi
fi


approx_iters_per_epoch_final=$(( $num_archives_expanded / $num_jobs_final ))

# Work out how many models feed the final nnet-combine-fast invocation:
#   min(max(max_models_combine, approx_iters_per_epoch_final),
#       2/3 * iters_after_mixup)
# Lower bound: at least roughly one epoch's worth of iterations.
num_models_combine=$max_models_combine
[ $num_models_combine -lt $approx_iters_per_epoch_final ] && \
  num_models_combine=$approx_iters_per_epoch_final
# Upper bound: two thirds of the iterations that follow mix-up.
iters_after_mixup_23=$(( (($num_iters - $mix_up_iter - 1) * 2) / 3 ))
[ $num_models_combine -gt $iters_after_mixup_23 ] && \
  num_models_combine=$iters_after_mixup_23
# Models from this iteration onward take part in the combination.
first_model_combine=$(( $num_iters - $num_models_combine + 1 ))

# x is the iteration counter used by the main training loop.
x=0


# Build realign_this_iter: a sparse array indexed by iteration number whose
# value is the --realign-times entry that maps to that iteration.
for realign_time in $realign_times; do
  # Work out the iterations on which we will re-align, if the --realign-times
  # option was used.  This is slightly approximate.
  # An out-of-range value is a fatal configuration error: abort instead of
  # carrying on with a meaningless iteration number.
  if ! perl -e "exit($realign_time > 0.0 && $realign_time < 1.0 ? 0:1);"; then
    echo "Invalid --realign-times option $realign_times: elements must be strictly between 0 and 1.";
    exit 1
  fi
  # the next formula is based on the one for mix_up_iter above.
  realign_iter=$(perl -e '($j,$k,$n,$p)=@ARGV; print int(0.5 + ($j==$k ? $n*$p : $n*(sqrt((1-$p)*$j*$j+$p*$k*$k)-$j)/($k-$j))); ' $num_jobs_initial $num_jobs_final $num_iters $realign_time) || exit 1;
  realign_this_iter[$realign_iter]=$realign_time
done

# Main training loop.  Each pass x trains $dir/$x.mdl into $dir/$[x+1].mdl
# using this_num_jobs parallel jobs, each on a different (archive, frame) pair.
# The bookkeeping (job count, learning rate, egs dir, archives processed) is
# updated on every pass, even for passes skipped because of --stage, so that a
# restarted run sees the same schedule.
cur_egs_dir=$egs_dir
num_hid_added=1
while [ $x -lt $num_iters ]; do
  # Number of jobs grows linearly from num_jobs_initial to num_jobs_final.
  this_num_jobs=$(perl -e "print int(0.5+$num_jobs_initial+($num_jobs_final-$num_jobs_initial)*$x/$num_iters);")

  # Effective learning rate decays exponentially with the number of archives
  # processed; the actual rate is that times the current number of jobs.
  ilr=$initial_effective_lrate; flr=$final_effective_lrate; np=$num_archives_processed; nt=$num_archives_to_process;
  this_learning_rate=$(perl -e  "print (($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt))*$this_num_jobs);");

  # TODO: remove this line.
  echo "On iteration $x, learning rate is $this_learning_rate."

  # If this is a realignment iteration, switch to a new (relabelled) egs dir.
  if [ ! -z "${realign_this_iter[$x]}" ]; then
    prev_egs_dir=$cur_egs_dir
    cur_egs_dir=$dir/egs_${realign_this_iter[$x]}
  fi

  if [ $x -ge 0 ] && [ $stage -le $x ]; then
    if [ ! -z "${realign_this_iter[$x]}" ]; then
      time=${realign_this_iter[$x]}


      echo "Getting average posterior for purposes of adjusting the priors."
      # Note: this just uses CPUs, using a smallish subset of data.
      # always use the first egs archive, which makes the script simpler;
      # we're using different random subsets of it.
      rm $dir/post.$x.*.vec 2>/dev/null
      $cmd JOB=1:$num_jobs_compute_prior $dir/log/get_post.$x.JOB.log \
        nnet-copy-egs --srand=JOB --frame=random ark:$prev_egs_dir/egs.1.ark ark:- \| \
        nnet-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \
        nnet-compute-from-egs "nnet-to-raw-nnet $dir/$x.mdl -|" ark:- ark:- \| \
        matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1;

      sleep 3;  # make sure there is time for $dir/post.$x.*.vec to appear.

      $cmd $dir/log/vector_sum.$x.log \
        vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1;
      rm $dir/post.$x.*.vec;

      echo "Re-adjusting priors based on computed posteriors"
      $cmd $dir/log/adjust_priors.$x.log \
        nnet-adjust-priors $dir/$x.mdl $dir/post.$x.vec $dir/$x.mdl || exit 1;

      sleep 2

      # Realign the data with the current model, then relabel the previous
      # egs with the new alignments to produce the new egs dir.
      steps/nnet2/align.sh --nj $num_jobs_align --cmd "$align_cmd" --use-gpu $align_use_gpu \
        --transform-dir "$transform_dir" --online-ivector-dir "$online_ivector_dir" \
        --iter $x $data $lang $dir $dir/ali_$time || exit 1

      steps/nnet2/relabel_egs2.sh --cmd "$cmd" --iter $x $dir/ali_$time \
        $prev_egs_dir $cur_egs_dir || exit 1

      if $cleanup && [[ $prev_egs_dir =~ $dir/egs* ]]; then
        steps/nnet2/remove_egs.sh $prev_egs_dir
      fi
    fi

    # Set off jobs doing some diagnostics, in the background.
    # Use the egs dir from the previous iteration for the diagnostics
    $cmd $dir/log/compute_prob_valid.$x.log \
      nnet-compute-prob $dir/$x.mdl ark:$cur_egs_dir/valid_diagnostic.egs &
    $cmd $dir/log/compute_prob_train.$x.log \
      nnet-compute-prob $dir/$x.mdl ark:$cur_egs_dir/train_diagnostic.egs &
    if [ $x -gt 0 ] && [ ! -f $dir/log/mix_up.$[$x-1].log ]; then
      # Give the filesystem a moment if the model written by the previous
      # pass is not visible yet.  The model lives under $dir, so test there.
      [ ! -f $dir/$x.mdl ] && sleep 10;
      $cmd $dir/log/progress.$x.log \
        nnet-show-progress --use-gpu=no $dir/$[$x-1].mdl $dir/$x.mdl \
        ark:$cur_egs_dir/train_diagnostic.egs '&&' \
        nnet-am-info $dir/$x.mdl &
    fi

    echo "Training neural net (pass $x)"

    if [ $x -gt 0 ] && \
      [ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \
      [ $[($x-1) % $add_layers_period] -eq 0 ]; then
      do_average=false # if we've just replaced the top layers, don't do averaging; take the best.
      mdl="nnet-init --srand=$x $dir/replace.$num_hid_added.config - | nnet-replace-last-layers $dir/$x.mdl - - | nnet-am-copy --learning-rate=$this_learning_rate - -|"
      num_hid_added=$[$num_hid_added+1]
    else
      do_average=true
      if [ $x -eq 0 ]; then do_average=false; fi # on iteration 0, pick the best, don't average.
      mdl="nnet-am-copy --learning-rate=$this_learning_rate $dir/$x.mdl -|"
    fi
    if $do_average; then
      this_minibatch_size=$minibatch_size
    else
      # on iteration zero or when we just changed the layers, use a smaller
      # minibatch size and pick the single best job's model instead of
      # averaging: the model-averaging doesn't seem to be helpful
      # when the model is changing too fast (i.e. it worsens the objective
      # function), and the smaller minibatch size will help to keep
      # the update stable.
      this_minibatch_size=$[$minibatch_size/2];
    fi

    rm $dir/.error 2>/dev/null

    ( # this sub-shell is so that when we "wait" below,
      # we only wait for the training jobs that we just spawned,
      # not the diagnostic jobs that we spawned above.

      # We can't easily use a single parallel SGE job to do the main training,
      # because the computation of which archive and which --frame option
      # to use for each job is a little complex, so we spawn each one separately.
      for n in $(seq $this_num_jobs); do
        k=$[$num_archives_processed + $n - 1]; # k is a zero-based index that we'll derive
                                         # the other indexes from.
        archive=$[($k%$num_archives)+1]; # work out the 1-based archive index.
        frame=$[(($k/$num_archives)%$frames_per_eg)]; # work out the 0-based frame
        # index; this increases more slowly than the archive index because the
        # same archive with different frame indexes will give similar gradients,
        # so we want to separate them in time.

        $cmd $parallel_opts $dir/log/train.$x.$n.log \
          nnet-train$parallel_suffix $parallel_train_opts \
          --minibatch-size=$this_minibatch_size --srand=$x "$mdl" \
          "ark,bg:nnet-copy-egs --frame=$frame ark:$cur_egs_dir/egs.$archive.ark ark:-|nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-|" \
          $dir/$[$x+1].$n.mdl || touch $dir/.error &
      done
      wait
    )
    # the error message below is not that informative, but $cmd will
    # have printed a more specific one.
    [ -f $dir/.error ] && echo "$0: error on iteration $x of training" && exit 1;

    # List of the per-job models produced on this pass.
    nnets_list=
    for n in `seq 1 $this_num_jobs`; do
      nnets_list="$nnets_list $dir/$[$x+1].$n.mdl"
    done

    if $do_average; then
      # average the output of the different jobs.
      $cmd $dir/log/average.$x.log \
        nnet-am-average $nnets_list $dir/$[$x+1].mdl ||  exit 1;
    else
      # choose the best from the different jobs: scan each job's training log
      # for its last log-prob-per-frame and keep the job with the highest one.
      # The job count passed in must be the number of jobs run on this pass.
      n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) {
          $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn";
          undef $logprob; while (<F>) { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } }
          close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob;
          $best_n=$n; } } print "$best_n\n"; ' $this_num_jobs $dir/log/train.$x.%d.log) || exit 1;
      [ -z "$n" ] && echo "Error getting best model" && exit 1;
      cp $dir/$[$x+1].$n.mdl $dir/$[$x+1].mdl || exit 1;
    fi

    if [ "$mix_up" -gt 0 ] && [ $x -eq $mix_up_iter ]; then
      # mix up.
      echo Mixing up from $num_leaves to $mix_up components
      $cmd $dir/log/mix_up.$x.log \
        nnet-am-mixup --min-count=10 --num-mixtures=$mix_up \
        $dir/$[$x+1].mdl $dir/$[$x+1].mdl || exit 1;
    fi
    # The per-job models are no longer needed once the merged model exists.
    rm $nnets_list
    [ ! -f $dir/$[$x+1].mdl ] && exit 1;
    # Remove the model from the pass before this one, except every 100th
    # model and the models needed for the final combination.
    if [ -f $dir/$[$x-1].mdl ] && $cleanup && \
       [ $[($x-1)%100] -ne 0  ] && [ $[$x-1] -lt $first_model_combine ]; then
      rm $dir/$[$x-1].mdl
    fi
  fi
  x=$[$x+1]
  num_archives_processed=$[$num_archives_processed+$this_num_jobs]
done


# Final stage: combine the last num_models_combine models into final.mdl,
# normalize it, and start diagnostics on it.
if [ $stage -le $num_iters ]; then
  echo "Doing final combination to produce final.mdl"

  # Now do combination.
  nnets_list=()
  # the if..else..fi statement below sets 'nnets_list'.
  if [ $max_models_combine -lt $num_models_combine ]; then
    # The number of models to combine is too large, e.g. > 20.  In this case,
    # each argument to nnet-combine-fast will be an average of multiple models.
    cur_offset=0 # current offset from first_model_combine.
    for n in $(seq $max_models_combine); do
      # Group n covers offsets cur_offset .. next_offset-1, which splits the
      # num_models_combine models into max_models_combine consecutive ranges.
      next_offset=$[($n*$num_models_combine)/$max_models_combine]
      sub_list=""
      for o in $(seq $cur_offset $[$next_offset-1]); do
        iter=$[$first_model_combine+$o]
        mdl=$dir/$iter.mdl
        [ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1;
        sub_list="$sub_list $mdl"
      done
      # Each entry is a piped command that averages one range of models.
      nnets_list[$[$n-1]]="nnet-am-average $sub_list - |"
      cur_offset=$next_offset
    done
  else
    # Few enough models: pass each model file directly.
    nnets_list=
    for n in $(seq 0 $[num_models_combine-1]); do
      iter=$[$first_model_combine+$n]
      mdl=$dir/$iter.mdl
      [ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1;
      nnets_list[$n]=$mdl
    done
  fi


  # Below, use --use-gpu=no to disable nnet-combine-fast from using a GPU, as
  # if there are many models it can give out-of-memory error; use
  # combine_num_threads threads to speed it up (this isn't ideal...)
  # num_egs is the last field of the last line logged by nnet-copy-egs.
  num_egs=`nnet-copy-egs ark:$cur_egs_dir/combine.egs ark:/dev/null 2>&1 | tail -n 1 | awk '{print $NF}'`
  # Minibatch size: examples split evenly (rounding up) over the threads,
  # capped at 512.
  mb=$[($num_egs+$combine_num_threads-1)/$combine_num_threads]
  [ $mb -gt 512 ] && mb=512
  # Setting --initial-model to a large value makes it initialize the combination
  # with the average of all the models.  It's important not to start with a
  # single model, or, due to the invariance to scaling that these nonlinearities
  # give us, we get zero diagonal entries in the fisher matrix that
  # nnet-combine-fast uses for scaling, which after flooring and inversion, has
  # the effect that the initial model chosen gets much higher learning rates
  # than the others.  This prevents the optimization from working well.
  $cmd $combine_parallel_opts $dir/log/combine.log \
    nnet-combine-fast --initial-model=100000 --num-lbfgs-iters=40 --use-gpu=no \
      --num-threads=$combine_num_threads \
      --verbose=3 --minibatch-size=$mb "${nnets_list[@]}" ark:$cur_egs_dir/combine.egs \
      $dir/final.mdl || exit 1;

  # Normalize stddev for affine or block affine layers that are followed by a
  # ReLU layer and then a normalize layer.
  $cmd $dir/log/normalize.log \
    nnet-normalize-stddev $dir/final.mdl $dir/final.mdl || exit 1;

  # Compute the probability of the final, combined model with
  # the same subset we used for the previous compute_probs, as the
  # different subsets will lead to different probs.
  # (These two jobs run in the background.)
  $cmd $dir/log/compute_prob_valid.final.log \
    nnet-compute-prob $dir/final.mdl ark:$cur_egs_dir/valid_diagnostic.egs &
  $cmd $dir/log/compute_prob_train.final.log \
    nnet-compute-prob $dir/final.mdl ark:$cur_egs_dir/train_diagnostic.egs &
fi

# Final stage: estimate the average posterior of final.mdl on a subset of the
# training egs and use it to re-adjust the priors stored in final.mdl.
# NOTE(review): $x is set earlier in the script (outside this excerpt); here it
# is only used to name the post.$x.* files and the log files.
if [ $stage -le $[$num_iters+1] ]; then
  echo "Getting average posterior for purposes of adjusting the priors."
  # Note: this just uses CPUs, using a smallish subset of data.
  # Remove any per-job vectors left over from an earlier run, since the
  # post.$x.*.vec glob below sums whatever files match.
  rm $dir/post.$x.*.vec 2>/dev/null
  # One job per JOB index, each writing $dir/post.$x.JOB.vec.  The "\|" are
  # escaped pipes: they are handed to $cmd rather than run by this shell.
  $cmd JOB=1:$num_jobs_compute_prior $dir/log/get_post.$x.JOB.log \
    nnet-copy-egs --frame=random --srand=JOB ark:$cur_egs_dir/egs.1.ark ark:- \| \
    nnet-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \
    nnet-compute-from-egs "nnet-to-raw-nnet $dir/final.mdl -|" ark:- ark:- \| \
    matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1;

  sleep 3;  # make sure there is time for $dir/post.$x.*.vec to appear.

  # Sum the per-job vectors into a single $dir/post.$x.vec.
  $cmd $dir/log/vector_sum.$x.log \
   vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1;

  # The per-job vectors are no longer needed once they have been summed.
  rm $dir/post.$x.*.vec;

  echo "Re-adjusting priors based on computed posteriors"
  # final.mdl is both input and output here, i.e. it is overwritten in place.
  $cmd $dir/log/adjust_priors.final.log \
    nnet-adjust-priors $dir/final.mdl $dir/post.$x.vec $dir/final.mdl || exit 1;
fi


# Stop here, before any cleanup, if training did not produce a final model.
if ! [ -f $dir/final.mdl ]; then
  echo "$0: $dir/final.mdl does not exist."
  exit 1;
fi

sleep 2

echo Done

if $cleanup; then
  echo Cleaning up data
  # The right-hand side is an (unquoted) regular expression, so this is a
  # "contains $dir/eg followed by zero or more s" match on the egs path.
  if [[ $cur_egs_dir =~ $dir/egs* ]]; then
    steps/nnet2/remove_egs.sh $cur_egs_dir
  fi

  echo Removing most of the models
  # Keep every 100th model and the model of the last iteration; delete the
  # other iteration models that exist on disk.
  for x in $(seq 0 $num_iters); do
    if [ $(($x%100)) -eq 0 ] || [ $x -eq $num_iters ] || [ ! -f $dir/$x.mdl ]; then
      continue
    fi
    rm $dir/$x.mdl
  done
fi


================================================
FILE: egs/steps/nnet2/train_discriminative.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.

# This script does MPE or MMI or state-level minimum bayes risk (sMBR) training
# of neural nets. 

# Begin configuration section.
# Every variable below can be overridden on the command line via
# parse_options.sh (e.g. --num-epochs 6), so each one must stay defined here.
cmd=run.pl
num_epochs=4       # Number of epochs of training
learning_rate=0.00002
effective_lrate=    # If supplied, overrides the learning rate, which gets set to effective_lrate * num_jobs_nnet.
acoustic_scale=0.1  # acoustic scale for MMI/MPFE/SMBR training.
criterion=smbr
boost=0.0       # option relevant for MMI
drop_frames=false #  option relevant for MMI
one_silence_class=true # Option relevant for MPE/SMBR
num_jobs_nnet=4    # Number of neural net jobs to run in parallel.  Note: this
                   # will interact with the learning rates (if you decrease
                   # this, you'll have to decrease the learning rate, and vice
                   # versa).
samples_per_iter=400000 # measured in frames, not in "examples"

modify_learning_rates=true
last_layer_factor=1.0  # relates to modify-learning-rates
first_layer_factor=1.0 # relates to modify-learning-rates
shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
                # on each iter.  You could set it to 0 or to a large value for complete
                # randomization, but this would both consume memory and cause spikes in
                # disk I/O.  Smaller is easier on disk and memory but less random.  It's
                # not a huge deal though, as samples are anyway randomized right at the start.
                # (Kept so that --shuffle-buffer-size is still accepted; it is not
                # referenced anywhere else in this script.)


stage=-8

io_opts="--max-jobs-run 5" # for jobs with a lot of I/O, limits the number running at one time.

num_threads=16  # this is the default but you may want to change it, e.g. to 1 if
                # using GPUs.
parallel_opts="--num-threads 16 --mem 1G" # by default we use 16 threads; this lets the queue know.
  # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.
transform_dir= # If this is a SAT system, directory for transforms
cleanup=true   # If true, remove the training examples and most intermediate models at the end.
degs_dir=      # If supplied, use the discriminative examples already in this directory
               # instead of creating them in <exp-dir>/degs.
retroactive=false   # passed to nnet-modify-learning-rates as --retroactive
online_ivector_dir= # If supplied, directory containing ivector_online.scp and ivector_period.
# End configuration section.


echo "$0 $@"  # Print the command line for logging

# path.sh (if present) sets up PATH; parse_options.sh then consumes any
# "--name value" options, overriding the variables of the configuration section.
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;


# Exactly six positional arguments are required; otherwise print the usage
# message and exit.  The defaults shown below mirror the configuration section
# at the top of this script.
if [ $# != 6 ]; then
  echo "Usage: $0 [opts] <data> <lang> <ali-dir> <denlat-dir> <src-model-file> <exp-dir>"
  echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet_denlats exp/tri4/final.mdl exp/tri4_mpe"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --num-epochs <#epochs|4>                        # Number of epochs of training"
  echo "  --learning-rate <learning-rate|0.00002>          # Learning rate to use"
  echo "  --effective-lrate <effective-learning-rate>      # If supplied, learning rate will be set to"
  echo "                                                   # this value times num-jobs-nnet."
  echo "  --num-jobs-nnet <num-jobs|4>                     # Number of parallel jobs to use for main neural net"
  echo "                                                   # training (will affect results as well as speed; try 8, 16)"
  echo "                                                   # Note: if you increase this, you may want to also increase"
  echo "                                                   # the learning rate."
  echo "  --num-threads <num-threads|16>                   # Number of parallel threads per job (will affect results"
  echo "                                                   # as well as speed; may interact with batch size; if you increase"
  echo "                                                   # this, you may want to decrease the batch size)."
  echo "  --parallel-opts <opts|\"--num-threads 16 --mem 1G\">      # extra options to pass to e.g. queue.pl for processes that"
  echo "                                                   # use multiple threads... "
  echo "  --io-opts <opts|\"--max-jobs-run 5\">                       # Options given to e.g. queue.pl for jobs that do a lot of I/O."
  echo "  --samples-per-iter <#samples|400000>             # Number of samples of data to process per iteration, per"
  echo "                                                   # process."
  echo "  --stage <stage|-8>                               # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."
  echo "  --criterion <criterion|smbr>                     # Training criterion: may be smbr, mmi or mpfe"
  echo "  --boost <boost|0.0>                              # Boosting factor for MMI (e.g., 0.1)"
  echo "  --modify-learning-rates <true,false|true>        # If true, modify learning rates to try to equalize relative"
  echo "                                                   # changes across layers."
  echo "  --degs-dir <dir|"">                              # Directory for discriminative examples, e.g. exp/foo/degs"
  echo "  --drop-frames <true,false|false>                 # Option that affects MMI training: if true, we exclude gradients from frames"
  echo "                                                   # where the numerator transition-id is not in the denominator lattice."
  echo "  --one-silence-class <true,false|true>            # Option that affects MPE/SMBR training (will tend to reduce insertions)"
  echo "  --online-ivector-dir <dir|"">                    # Directory for online-estimated iVectors, used in the"
  echo "                                                   # online-neural-net setup."
  exit 1;
fi

# Positional arguments (see the usage message).
data=$1
lang=$2
alidir=$3
denlatdir=$4
src_model=$5
dir=$6


# When online iVectors are used, these two files must exist as well.
extra_files=
if [ ! -z "$online_ivector_dir" ]; then
  extra_files="$online_ivector_dir/ivector_period $online_ivector_dir/ivector_online.scp"
fi

# Check some files.
for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/num_jobs $alidir/tree \
         $denlatdir/lat.1.gz $denlatdir/num_jobs $src_model $extra_files; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

nj=$(cat $alidir/num_jobs) || exit 1; # caution: $nj is the number of
                                      # splits of the denlats and alignments, but
                                      # num_jobs_nnet is the number of nnet training
                                      # jobs we run in parallel.
# The alignments and the denominator lattices are read with the same JOB index
# later on, so both directories must have been split the same way.
if ! [ "$nj" == "$(cat $denlatdir/num_jobs)" ]; then
  echo "Number of jobs mismatch: $nj versus $(cat $denlatdir/num_jobs)"
  exit 1;
fi

mkdir -p $dir/log || exit 1;
# $dir/degs is only needed when we create the examples ourselves.
if [ -z "$degs_dir" ]; then
  mkdir -p $dir/degs || exit 1;
fi

utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

# The feature pipelines below read $sdata/JOB/..., so stop if the split fails.
sdata=$data/split$nj
utils/split_data.sh $data $nj || exit 1;

# function to remove egs that might be soft links.
# Usage: remove <file> [<file> ...]
# For a soft link, the file it points to (as resolved by
# utils/make_absolute.sh) is removed first, then the link itself.
# The loop variable is local so that calling this function does not overwrite
# the script-level variable "x", which is used as the iteration counter.
remove () {
  local f
  for f in "$@"; do
    [ -L "$f" ] && rm "$(utils/make_absolute.sh "$f")"
    rm "$f"
  done
}

# The silence phone list is mandatory; splice_opts and cmvn_opts are optional
# files in the alignment directory (a missing file just gives an empty value).
silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
splice_opts=$(cat $alidir/splice_opts 2>/dev/null)
cmvn_opts=$(cat $alidir/cmvn_opts 2>/dev/null)

# Keep copies of the optional option files and of the tree in the output dir.
for opts_file in splice_opts cmvn_opts; do
  cp $alidir/$opts_file $dir 2>/dev/null
done
cp $alidir/tree $dir

if [ -n "$online_ivector_dir" ]; then
  ivector_period=$(cat $online_ivector_dir/ivector_period)
  ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1;
  # the 'const_dim_opt' allows it to write only one iVector per example,
  # rather than one per time-index... it has to average over
  const_dim_opt="--const-feat-dim=$ivector_dim"
fi

## Set up features.
## Don't support deltas, only LDA or raw (mainly because deltas are less frequently used).
## On exit from this section, $feats is the feature-extraction pipeline (with
## JOB as a placeholder for the split index) that is handed to
## nnet-get-egs-discriminative.

# If --transform-dir was not given but the alignment directory contains
# transforms (trans.1 or raw_trans.1), take the transforms from there.
# This previously tested $transform_dir/trans.1 while $transform_dir was known
# to be empty (i.e. it looked for /trans.1), so the fallback could never fire.
# It is done before the feature type is chosen because that choice depends on
# whether $transform_dir contains raw_trans.1.
if [ -z "$transform_dir" ]; then
  if [ -f $alidir/trans.1 ] || [ -f $alidir/raw_trans.1 ]; then
    transform_dir=$alidir
  fi
fi

# "lda" if the alignment dir has an LDA matrix and the transforms (if any) are
# not raw-feature transforms; otherwise "raw".
if [ -z "$feat_type" ]; then
  if [ -f $alidir/final.mat ] && \
     { [ -z "$transform_dir" ] || [ ! -f $transform_dir/raw_trans.1 ]; }; then
    feat_type=lda
  else
    feat_type=raw
  fi
fi
echo "$0: feature type is $feat_type"

case $feat_type in
  raw) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"
   ;;
  lda)
    splice_opts=`cat $alidir/splice_opts 2>/dev/null`
    cp $alidir/final.mat $dir || exit 1;
    feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
    ;;
  *) echo "$0: invalid feature type $feat_type" && exit 1;
    ;;
esac

if [ ! -z "$transform_dir" ]; then
  echo "$0: using transforms from $transform_dir"
  [ ! -s $transform_dir/num_jobs ] && \
    echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1;
  nj_orig=$(cat $transform_dir/num_jobs)

  # raw features use raw_trans.*, LDA features use trans.*.
  if [ $feat_type == "raw" ]; then trans=raw_trans;
  else trans=trans; fi
  if [ $feat_type == "lda" ] && ! cmp $transform_dir/final.mat $alidir/final.mat; then
    echo "$0: LDA transforms differ between $alidir and $transform_dir"
    exit 1;
  fi
  if [ ! -f $transform_dir/$trans.1 ]; then
    echo "$0: expected $transform_dir/$trans.1 to exist (--transform-dir option)"
    exit 1;
  fi
  if [ $nj -ne $nj_orig ]; then
    # Copy the transforms into an archive with an index.
    for n in $(seq $nj_orig); do cat $transform_dir/$trans.$n; done | \
       copy-feats ark:- ark,scp:$dir/$trans.ark,$dir/$trans.scp || exit 1;
    feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/$trans.scp ark:- ark:- |"
  else
    # number of jobs matches with alignment dir.
    feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/$trans.JOB ark:- ark:- |"
  fi
fi
if [ ! -z "$online_ivector_dir" ]; then
  # add iVectors to the features.
  feats="$feats paste-feats --length-tolerance=$ivector_period ark:- 'ark,s,cs:utils/filter_scp.pl $sdata/JOB/utt2spk $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |' ark:- |"
fi


# Work out how many iterations make up one epoch.  With a user-supplied
# --degs-dir the value is read from that directory; otherwise it is computed
# from the number of frames (stage <= -8) or re-read from $dir (later stages).
if [ -n "$degs_dir" ]; then
  iters_per_epoch=$(cat $degs_dir/iters_per_epoch) || exit 1;
  if [ -z "$iters_per_epoch" ]; then
    exit 1;
  fi
  echo "$0: Every epoch, splitting the data up into $iters_per_epoch iterations"
else
  if [ $stage -le -8 ]; then
    echo "$0: working out number of frames of training data"
    num_frames=$(steps/nnet2/get_num_frames.sh $data)
    echo $num_frames > $dir/num_frames
    # Round num_frames / (samples_per_iter * num_jobs_nnet) to the nearest
    # integer, but use at least one iteration per epoch.
    iters_per_epoch=$(perl -e "print int($num_frames/($samples_per_iter * $num_jobs_nnet) + 0.5);") || exit 1;
    if [ $iters_per_epoch -eq 0 ]; then
      iters_per_epoch=1
    fi
    echo $iters_per_epoch > $dir/degs/iters_per_epoch  || exit 1;
  else
    num_frames=$(cat $dir/num_frames) || exit 1;
    iters_per_epoch=$(cat $dir/degs/iters_per_epoch) || exit 1;
  fi

  # The number of frames each job actually sees per iteration (for the log only).
  samples_per_iter_real=$(($num_frames/($num_jobs_nnet*$iters_per_epoch)))
  echo "$0: Every epoch, splitting the data up into $iters_per_epoch iterations,"
  echo "$0: giving samples-per-iteration of $samples_per_iter_real (you requested $samples_per_iter)."
fi


# The data links are (re)created whatever the stage is, because a link that was
# deleted earlier may have to be recreated.  This only applies when we create
# the degs ourselves and a 'storage' directory exists under $dir/degs.
if [ -z "$degs_dir" ]; then
  if [ -d $dir/degs/storage ]; then
    echo "$0: creating data links for distributed storage of degs"
    # See utils/create_split_dir.pl for how this 'storage' directory
    # is created.
    last_part=$(($iters_per_epoch-1))
    for job in $(seq $num_jobs_nnet); do
      # one original archive per (training job, data split) pair...
      for split in $(seq $nj); do
        utils/create_data_link.pl $dir/degs/degs_orig.$job.$split.ark
      done
      # ...and a temporary plus a final archive per (training job, part).
      for part in $(seq 0 $last_part); do
        utils/create_data_link.pl $dir/degs/degs_tmp.$job.$part.ark
        utils/create_data_link.pl $dir/degs/degs.$job.$part.ark
      done
    done
  fi
fi


# Stage -7: create the initial model $dir/0.mdl from the source model, with the
# requested learning rate and a switched preconditioning setup.
if [ $stage -le -7 ]; then
  echo "$0: Copying initial model and modifying preconditioning setup"

  # Note, the baseline model probably had preconditioning, and we'll keep it;
  # but we want online preconditioning with a larger number of samples of
  # history, since in this setup the frames are only randomized at the segment
  # level so they are highly correlated.  It might make sense to tune this a
  # little, later on, although I doubt it matters once the --num-samples-history
  # is large enough.

  # --effective-lrate, when given, replaces --learning-rate by
  # num_jobs_nnet * effective_lrate.
  if [ ! -z "$effective_lrate" ]; then
    learning_rate=$(perl -e "print ($num_jobs_nnet*$effective_lrate);")
    echo "$0: setting learning rate to $learning_rate = --num-jobs-nnet * --effective-lrate."
  fi
  # The "\|" is an escaped pipe: it is handed to $cmd rather than being run by
  # this shell, so the two programs are piped together inside the job.
  $cmd $dir/log/convert.log \
    nnet-am-copy --learning-rate=$learning_rate "$src_model" - \| \
    nnet-am-switch-preconditioning  --num-samples-history=50000 - $dir/0.mdl || exit 1;
fi


# Stage -6 (only when we create the degs ourselves): dump the discriminative
# training examples, distributing the output of each data split JOB over one
# archive per training job.
if [ $stage -le -6 ] && [ -z "$degs_dir" ]; then
  echo "$0: getting initial training examples by splitting lattices"

  # Output archives degs_orig.<train-job>.JOB.ark, one per training job.
  egs_list=
  for n in $(seq 1 $num_jobs_nnet); do
    egs_list+=" ark:$dir/degs/degs_orig.$n.JOB.ark"
  done

  # "\|" is an escaped pipe that is handed to $cmd rather than run here.
  $cmd $io_opts JOB=1:$nj $dir/log/get_egs.JOB.log \
    nnet-get-egs-discriminative --criterion=$criterion --drop-frames=$drop_frames \
      $dir/0.mdl "$feats" \
    "ark,s,cs:gunzip -c $alidir/ali.JOB.gz |" \
    "ark,s,cs:gunzip -c $denlatdir/lat.JOB.gz|" ark:- \| \
    nnet-copy-egs-discriminative $const_dim_opt ark:- $egs_list || exit 1;
fi

# Stage -5 (only when we create the degs ourselves): regroup the examples from
# "one archive per data split" into "one archive per part of the epoch".
if [ $stage -le -5 ] && [ -z "$degs_dir" ]; then
  echo "$0: rearranging examples into parts for different parallel jobs"

  # combine all the "degs_orig.JOB.*.ark" (over the $nj splits of the data) and
  # then split into multiple parts degs_tmp.JOB.*.ark for different parts of the
  # data, 0 .. $iters_per_epoch-1.

  if [ $iters_per_epoch -eq 1 ]; then
    echo "Since iters-per-epoch == 1, just concatenating the data."
    for n in `seq 1 $num_jobs_nnet`; do
      cat $dir/degs/degs_orig.$n.*.ark > $dir/degs/degs_tmp.$n.0.ark || exit 1;
      remove $dir/degs/degs_orig.$n.*.ark  # don't "|| exit 1", due to NFS bugs...
    done
  else # We'll have to split it up using nnet-copy-egs.
    # One output archive per part of the epoch; JOB is the training-job index.
    egs_list=
    for n in `seq 0 $[$iters_per_epoch-1]`; do
      egs_list="$egs_list ark:$dir/degs/degs_tmp.JOB.$n.ark"
    done
    $cmd $io_opts JOB=1:$num_jobs_nnet $dir/log/split_egs.JOB.log \
      nnet-copy-egs-discriminative --srand=JOB \
        "ark:cat $dir/degs/degs_orig.JOB.*.ark|" $egs_list || exit 1;
    # The originals are no longer needed once they have been redistributed.
    remove $dir/degs/degs_orig.*.*.ark
  fi
fi


# Stage -4 (only when we create the degs ourselves): shuffle each temporary
# archive degs_tmp.JOB.<n>.ark into the final degs.JOB.<n>.ark.
if [ $stage -le -4 ] && [ -z "$degs_dir" ]; then
  # Next, shuffle the order of the examples in each of those files.
  # Each one should not be too large, so we can do this in memory.
  # Then combine the examples together to form suitable-size minibatches
  # (for discriminative examples, it's one example per minibatch, so we
  # have to combine the lattices).
  echo "Shuffling the order of training examples"
  echo "(in order to avoid stressing the disk, these won't all run at once)."

  # note, the exit status of the "remove" below is not checked; this is a
  # workaround for NFS bugs we encountered running this script with Debian-7,
  # NFS-v4.
  # Also, we should note that we used to do nnet-combine-egs-discriminative
  # at this stage, but if iVectors are used this would expand the size of
  # the examples on disk (because they could no longer be stored in the spk_info
  # variable of the discriminative example, no longer being constant), so
  # now we do the nnet-combine-egs-discriminative operation on the fly during
  # training.
  for n in `seq 0 $[$iters_per_epoch-1]`; do
    # The "\$[...]" is escaped so that it is evaluated inside the job, after
    # JOB has been replaced by the job index; this gives every (job, part)
    # pair its own random seed.
    $cmd $io_opts JOB=1:$num_jobs_nnet $dir/log/shuffle.$n.JOB.log \
      nnet-shuffle-egs-discriminative "--srand=\$[JOB+($num_jobs_nnet*$n)]" \
      ark:$dir/degs/degs_tmp.JOB.$n.ark ark:$dir/degs/degs.JOB.$n.ark || exit 1;
    remove $dir/degs/degs_tmp.*.$n.ark
  done
fi

# From here on $degs_dir always names the directory holding the examples:
# the user-supplied one, or $dir/degs when none was given.
[ -z "$degs_dir" ] && degs_dir=$dir/degs

num_iters=$(($num_epochs * $iters_per_epoch));

echo "$0: Will train for $num_epochs epochs = $num_iters iterations"

# Choose the training binary variant: with more than one thread we use
# nnet-train-discriminative-parallel, with exactly one thread the "-simple"
# binary, which enables us to use GPU code.
if ! [ $num_threads -eq 1 ]; then
  train_suffix="-parallel --num-threads=$num_threads"
else
  train_suffix="-simple"
fi


# Main training loop.  Iteration x reads $dir/x.mdl, trains one model per job
# on part (x mod iters_per_epoch) of the examples, and averages the per-job
# models into $dir/(x+1).mdl.  Iterations below $stage are skipped.
x=0
while [ $x -lt $num_iters ]; do
  if [ $stage -le $x ]; then

    echo "Training neural net (pass $x)"

    next_iter=$(($x+1))
    part=$(($x%$iters_per_epoch))

    $cmd $parallel_opts JOB=1:$num_jobs_nnet $dir/log/train.$x.JOB.log \
      nnet-train-discriminative$train_suffix --silence-phones=$silphonelist \
       --criterion=$criterion --drop-frames=$drop_frames \
       --one-silence-class=$one_silence_class --boost=$boost \
       --acoustic-scale=$acoustic_scale $dir/$x.mdl \
       "ark,bg:nnet-combine-egs-discriminative ark:$degs_dir/degs.JOB.$part.ark ark:- |" \
        $dir/$next_iter.JOB.mdl \
      || exit 1;

    # The per-job models written by the jobs above.
    nnets_list=
    for n in $(seq $num_jobs_nnet); do
      nnets_list="$nnets_list $dir/$next_iter.$n.mdl"
    done

    $cmd $dir/log/average.$x.log \
      nnet-am-average $nnets_list $dir/$next_iter.mdl || exit 1;

    # Optionally adjust the learning rates of the averaged model, given the
    # model from before this iteration; the averaged model is rewritten in place.
    if $modify_learning_rates; then
      $cmd $dir/log/modify_learning_rates.$x.log \
        nnet-modify-learning-rates --retroactive=$retroactive \
        --last-layer-factor=$last_layer_factor \
        --first-layer-factor=$first_layer_factor \
        $dir/$x.mdl $dir/$next_iter.mdl $dir/$next_iter.mdl || exit 1;
    fi
    # Only the averaged model is kept.
    rm $nnets_list
  fi

  x=$(($x+1))
done

# $x now equals $num_iters; make final.mdl a soft link to the last model.
rm $dir/final.mdl 2>/dev/null
ln -s $x.mdl $dir/final.mdl


echo Done

if $cleanup; then
  echo Cleaning up data

  echo Removing training examples
  # only remove if directory is not a soft link.
  if [ -d $dir/degs ]; then
    if [ ! -L $dir/degs ]; then
      remove $dir/degs/degs.*
    fi
  fi

  echo Removing most of the models
  # delete all but the epoch-final models, i.e. those whose iteration number
  # is a multiple of iters_per_epoch.
  for x in $(seq 0 $num_iters); do
    [ $(($x%$iters_per_epoch)) -ne 0 ] && rm $dir/$x.mdl 2>/dev/null
  done
fi

# epoch<n>.mdl is a soft link to the model at the end of epoch n
# (epoch0.mdl being the initial model 0.mdl).
for n in $(seq 0 $num_epochs); do
  x=$(($n*$iters_per_epoch))
  rm $dir/epoch$n.mdl 2>/dev/null
  ln -s $x.mdl $dir/epoch$n.mdl
done


================================================
FILE: egs/steps/nnet2/train_discriminative2.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2014  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.

# This script does MPE or MMI or state-level minimum bayes risk (sMBR) training.
# This version (2) of the script uses a newer format for the discriminative-training
# egs, as obtained by steps/nnet2/get_egs_discriminative2.sh.

# Begin configuration section.
# Every variable below can be overridden on the command line via
# parse_options.sh (e.g. --num-epochs 6).
cmd=run.pl
num_epochs=4       # Number of epochs of training
learning_rate=0.00002
effective_lrate=    # If supplied, overrides the learning rate, which gets set to effective_lrate * num_jobs_nnet.
acoustic_scale=0.1  # acoustic scale for MMI/MPFE/SMBR training.
boost=0.0       # option relevant for MMI

criterion=smbr
drop_frames=false #  option relevant for MMI
one_silence_class=true # option relevant for MPE/SMBR
num_jobs_nnet=4    # Number of neural net jobs to run in parallel.  Note: this
                   # will interact with the learning rates (if you decrease
                   # this, you'll have to decrease the learning rate, and vice
                   # versa).

modify_learning_rates=true
last_layer_factor=1.0  # relates to modify-learning-rates
first_layer_factor=1.0 # relates to modify-learning-rates
shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
                # on each iter.  You could set it to 0 or to a large value for complete
                # randomization, but this would both consume memory and cause spikes in
                # disk I/O.  Smaller is easier on disk and memory but less random.  It's
                # not a huge deal though, as samples are anyway randomized right at the start.
                # (This variable is not referenced anywhere else in this script.)


stage=-3

adjust_priors=false # If true, re-adjust the priors of epoch-boundary models using
                    # <degs-dir>/priors_egs.*.ark (see the training loop below).
num_threads=16  # this is the default but you may want to change it, e.g. to 1 if
                # using GPUs.
parallel_opts="--num-threads 16 --mem 1G"
  # by default we use 16 threads; this lets the queue know.
  # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.

cleanup=true      # If true, remove all but the epoch-final models at the end.
retroactive=false # passed to nnet-modify-learning-rates as --retroactive
remove_egs=false  # If true (and cleanup is true), also delete the egs in <degs-dir>.
src_model=  # will default to $degs_dir/final.mdl
# End configuration section.


echo "$0 $@"  # Print the command line for logging

# path.sh (if present) sets up PATH; parse_options.sh then consumes any
# "--name value" options, overriding the variables of the configuration section.
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;


# Exactly two positional arguments are required; otherwise print the usage
# message and exit.  The defaults shown below mirror the configuration section
# at the top of this script.
if [ $# != 2 ]; then
  echo "Usage: $0 [opts] <degs-dir> <exp-dir>"
  echo " e.g.: $0 exp/tri4_mpe_degs exp/tri4_mpe"
  echo ""
  echo "You have to first call get_egs_discriminative2.sh to dump the egs."
  echo "Caution: the options 'drop-frames' and 'criterion' are taken here"
  echo "even though they were required also by get_egs_discriminative2.sh,"
  echo "and they should normally match."
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --num-epochs <#epochs|4>                        # Number of epochs of training"
  echo "  --learning-rate <learning-rate|0.00002>          # Learning rate to use"
  echo "  --effective-lrate <effective-learning-rate>      # If supplied, learning rate will be set to"
  echo "                                                   # this value times num-jobs-nnet."
  echo "  --num-jobs-nnet <num-jobs|4>                     # Number of parallel jobs to use for main neural net"
  echo "                                                   # training (will affect results as well as speed; try 8, 16)"
  echo "                                                   # Note: if you increase this, you may want to also increase"
  echo "                                                   # the learning rate.  Also note: if there are fewer archives"
  echo "                                                   # of egs than this, it will get reduced automatically."
  echo "  --num-threads <num-threads|16>                   # Number of parallel threads per job (will affect results"
  echo "                                                   # as well as speed; may interact with batch size; if you increase"
  echo "                                                   # this, you may want to decrease the batch size).  With GPU, must be 1."
  echo "  --parallel-opts <opts|\"--num-threads 16 --mem 1G\">      # extra options to pass to e.g. queue.pl for processes that"
  echo "                                                   # use multiple threads... "
  echo "  --stage <stage|-3>                               # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."
  echo "  --criterion <criterion|smbr>                     # Training criterion: may be smbr, mmi or mpfe"
  echo "  --boost <boost|0.0>                              # Boosting factor for MMI (e.g., 0.1)"
  echo "  --drop-frames <true,false|false>                 # Option that affects MMI training: if true, we exclude gradients from frames"
  echo "                                                   # where the numerator transition-id is not in the denominator lattice."
  echo "  --one-silence-class <true,false|true>            # Option that affects MPE/SMBR training (will tend to reduce insertions)"
  echo "  --modify-learning-rates <true,false|true>        # If true, modify learning rates to try to equalize relative"
  echo "                                                   # changes across layers."
  echo "  --adjust-priors <true,false|false>               # If true, re-adjust the priors of epoch-boundary models using"
  echo "                                                   # <degs-dir>/priors_egs.*.ark."
  echo "  --remove-egs <true,false|false>                  # If true (and --cleanup is true), remove the egs in <degs-dir>"
  echo "                                                   # at the end."
  exit 1;
fi

# Positional arguments (see the usage message).
degs_dir=$1
dir=$2

# The source model defaults to the final.mdl stored with the examples.
if [ -z "$src_model" ]; then
  src_model=$degs_dir/final.mdl
fi

# Check some files.
for f in $degs_dir/degs.1.ark $degs_dir/info/num_archives $degs_dir/info/silence.csl \
         $degs_dir/info/frames_per_archive $src_model; do
  if [ ! -f $f ]; then
    echo "$0: no such file $f"
    exit 1;
  fi
done

mkdir -p $dir/log || exit 1;

# phones.txt is copied on a best-effort basis; the other files are copied only
# if present in the degs dir, and then a failed copy is fatal.
cp $degs_dir/phones.txt $dir 2>/dev/null
for f in splice_opts cmvn_opts tree final.mat; do
  [ -f $degs_dir/$f ] || continue
  cp $degs_dir/$f $dir/ || exit 1;
done

silphonelist=$(cat $degs_dir/info/silence.csl) || exit 1;


num_archives=$(cat $degs_dir/info/num_archives) || exit 1;

# Each job needs its own archive on every iteration, so there cannot be more
# jobs than archives.
if [ $num_jobs_nnet -gt $num_archives ]; then
  echo "$0: num-jobs-nnet $num_jobs_nnet exceeds number of archives $num_archives,"
  echo " ... setting it to $num_archives."
  num_jobs_nnet=$num_archives
fi

num_iters=$((($num_epochs*$num_archives)/$num_jobs_nnet))

echo "$0: Will train for $num_epochs epochs = $num_iters iterations"

# iter_to_epoch maps the iteration number at which an epoch ends to the
# (1-based) number of that epoch; it is unset for all other iterations.
for e in $(seq 1 $num_epochs); do
  x=$((($e*$num_archives)/$num_jobs_nnet)) # gives the iteration number.
  iter_to_epoch[$x]=$e
done

# Stage -1: create the initial model $dir/0.mdl from the source model, with the
# requested learning rate and a switched preconditioning setup.
if [ $stage -le -1 ]; then
  echo "$0: Copying initial model and modifying preconditioning setup"

  # Note, the baseline model probably had preconditioning, and we'll keep it;
  # but we want online preconditioning with a larger number of samples of
  # history, since in this setup the frames are only randomized at the segment
  # level so they are highly correlated.  It might make sense to tune this a
  # little, later on, although I doubt it matters once the --num-samples-history
  # is large enough.

  # --effective-lrate, when given, replaces --learning-rate by
  # num_jobs_nnet * effective_lrate.
  if [ ! -z "$effective_lrate" ]; then
    learning_rate=$(perl -e "print ($num_jobs_nnet*$effective_lrate);")
    echo "$0: setting learning rate to $learning_rate = --num-jobs-nnet * --effective-lrate."
  fi

  # The "\|" is an escaped pipe: it is handed to $cmd rather than being run by
  # this shell, so the two programs are piped together inside the job.
  $cmd $dir/log/convert.log \
    nnet-am-copy --learning-rate=$learning_rate "$src_model" - \| \
    nnet-am-switch-preconditioning  --num-samples-history=50000 - $dir/0.mdl || exit 1;
fi


# Choose the training binary variant: "-simple" for a single thread, otherwise
# "-parallel" with the requested number of threads.
if [ $num_threads -eq 1 ]; then
 train_suffix="-simple" # this enables us to use GPU code if
                        # we have just one thread.
else
  train_suffix="-parallel --num-threads=$num_threads"
fi

# Main training loop.  Iteration x reads $dir/x.mdl, trains one model per job
# (each job on its own archive of examples) and averages the per-job models
# into $dir/(x+1).mdl.  Iterations below $stage are skipped.
#
# $dir/.error is a marker file touched by the background prior-adjustment jobs
# when they fail.  It is cleared once, here; the background jobs themselves
# must not delete it, since that could erase a failure recorded by another
# background job that is still running or has just finished.
rm -f $dir/.error 2>/dev/null
x=0
while [ $x -lt $num_iters ]; do
  if [ $stage -le $x ]; then

    echo "Training neural net (pass $x)"

    # The \$ below delays the evaluation of the expression until the script runs (and JOB
    # will be replaced by the job-id).  That expression in $[..] is responsible for
    # choosing the archive indexes to use for each job on each iteration... we cycle through
    # all archives.

    $cmd $parallel_opts JOB=1:$num_jobs_nnet $dir/log/train.$x.JOB.log \
      nnet-combine-egs-discriminative \
        "ark:$degs_dir/degs.\$[((JOB-1+($x*$num_jobs_nnet))%$num_archives)+1].ark" ark:- \| \
      nnet-train-discriminative$train_suffix --silence-phones=$silphonelist \
       --criterion=$criterion --drop-frames=$drop_frames \
       --one-silence-class=$one_silence_class \
       --boost=$boost --acoustic-scale=$acoustic_scale \
       $dir/$x.mdl ark:- $dir/$[$x+1].JOB.mdl || exit 1;

    nnets_list=$(for n in $(seq $num_jobs_nnet); do echo $dir/$[$x+1].$n.mdl; done)

    # below use run.pl instead of a generic $cmd for these very quick stages,
    # so that we don't run the risk of waiting for a possibly hard-to-get GPU.
    run.pl $dir/log/average.$x.log \
      nnet-am-average $nnets_list $dir/$[$x+1].mdl || exit 1;

    if $modify_learning_rates; then
      run.pl $dir/log/modify_learning_rates.$x.log \
        nnet-modify-learning-rates --retroactive=$retroactive \
        --last-layer-factor=$last_layer_factor \
        --first-layer-factor=$first_layer_factor \
        $dir/$x.mdl $dir/$[$x+1].mdl $dir/$[$x+1].mdl || exit 1;
    fi
    rm $nnets_list
  fi
  # If x is the iteration number at which an epoch ended, re-adjust the priors
  # of $dir/$x.mdl (in place) in a background subshell while training goes on.
  # NOTE(review): because the loop stops at x = num_iters - 1, this block is
  # never reached for the model of the last epoch ($dir/$num_iters.mdl, i.e.
  # final.mdl) -- confirm whether that model is meant to be adjusted as well.
  if $adjust_priors && [ ! -z "${iter_to_epoch[$x]}" ]; then
    if [ ! -f $degs_dir/priors_egs.1.ark ]; then
      echo "$0: Expecting $degs_dir/priors_egs.1.ark to exist since --adjust-priors was true."
      echo "$0: Run this script with --adjust-priors false to not adjust priors"
      exit 1
    fi
    (
    e=${iter_to_epoch[$x]}
    num_archives_priors=`cat $degs_dir/info/num_archives_priors` || { touch $dir/.error; echo "Could not find $degs_dir/info/num_archives_priors. Set --adjust-priors false to not adjust priors"; exit 1; }

    $cmd JOB=1:$num_archives_priors $dir/log/get_post.epoch$e.JOB.log \
      nnet-compute-from-egs "nnet-to-raw-nnet $dir/$x.mdl -|" \
      ark:$degs_dir/priors_egs.JOB.ark ark:- \| \
      matrix-sum-rows ark:- ark:- \| \
      vector-sum ark:- $dir/post.epoch$e.JOB.vec || \
      { touch $dir/.error; echo "Error in getting posteriors for adjusting priors. See $dir/log/get_post.epoch$e.*.log"; exit 1; }

    sleep 3;

    $cmd $dir/log/sum_post.epoch$e.log \
      vector-sum $dir/post.epoch$e.*.vec $dir/post.epoch$e.vec || \
      { touch $dir/.error; echo "Error in summing posteriors. See $dir/log/sum_post.epoch$e.log"; exit 1; }

    rm $dir/post.epoch$e.*.vec

    echo "Re-adjusting priors based on computed posteriors for iter $x"
    $cmd $dir/log/adjust_priors.epoch$e.log \
      nnet-adjust-priors $dir/$x.mdl $dir/post.epoch$e.vec $dir/$x.mdl \
      || { touch $dir/.error; echo "Error in adjusting priors. See $dir/log/adjust_priors.epoch$e.log"; exit 1; }
    ) &
  fi

  [ -f $dir/.error ] && exit 1

  x=$[$x+1]
done

# Let any background prior-adjustment jobs finish before we declare success,
# so that their failures are noticed and they are not still rewriting
# epoch models when the script ends.
wait
[ -f $dir/.error ] && exit 1

# $x now equals $num_iters; make final.mdl a soft link to the last model.
rm $dir/final.mdl 2>/dev/null
ln -s $x.mdl $dir/final.mdl

echo Done

# Link epoch<e>.mdl to the model at the end of epoch e (epoch 0 being the
# initial model), and collect those iteration numbers, space-separated, in
# epoch_final_iters.
epoch_final_iters=
for e in $(seq 0 $num_epochs); do
  x=$((($e*$num_archives)/$num_jobs_nnet)) # gives the iteration number.
  epoch_final_iters+=" $x"
  ln -sf $x.mdl $dir/epoch$e.mdl
done


# function to remove egs that might be soft links.
# Usage: remove <file> [<file> ...]
# For a soft link, the file it points to (as resolved by
# utils/make_absolute.sh) is removed first, then the link itself.
# The loop variable is local so that calling this function does not overwrite
# the script-level variable "x", which is used as the iteration counter.
remove () {
  local f
  for f in "$@"; do
    [ -L "$f" ] && rm "$(utils/make_absolute.sh "$f")"
    rm "$f"
  done
}

if $cleanup && $remove_egs; then  # note: this is false by default.
  echo Removing training examples
  # The globs already cover every archive, so a single pass is enough;
  # repeating it once per archive only made 'rm' fail on files that were
  # already gone.
  remove $degs_dir/degs.*
  remove $degs_dir/priors_egs.*
fi


if $cleanup; then
  echo Removing most of the models
  for iter in $(seq 0 $num_iters); do
    # Models at epoch boundaries are listed in $epoch_final_iters: keep those.
    echo $epoch_final_iters | grep -qw $iter && continue
    rm $dir/$iter.mdl 2>/dev/null
  done
fi


================================================
FILE: egs/steps/nnet2/train_discriminative_multilang2.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2014  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.

# This script does MPE or MMI or state-level minimum bayes risk (sMBR) training,
# in the multi-language or at least multi-model setting where you have multiple "degs" directories.
# The input "degs" directories must be dumped by one of the get_egs_discriminative2.sh scripts.

# Begin configuration section.
# Defaults for the command-line options (see the usage message below).
cmd=run.pl         # Job launcher used for every "$cmd ..." invocation below.
num_epochs=4       # Number of epochs of training (measured on language 0).
learning_rate=0.00002 # Passed to nnet-am-copy when the initial models are created.
acoustic_scale=0.1  # acoustic scale for MMI/MPFE/SMBR training.
boost=0.0       # option relevant for MMI

criterion=smbr    # Training criterion: may be smbr, mmi or mpfe.
drop_frames=false #  option relevant for MMI
one_silence_class=true # option relevant for MPE/SMBR
num_jobs_nnet="4 4"    # Number of neural net jobs to run in parallel, one per
                       # language..  Note: this will interact with the learning
                       # rates (if you decrease this, you'll have to decrease
                       # the learning rate, and vice versa).

modify_learning_rates=true # If true, nnet-modify-learning-rates is run on the
                           # language-0 model on each iteration.
last_layer_factor=1.0  # relates to modify-learning-rates
first_layer_factor=1.0 # relates to modify-learning-rates
shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
                # on each iter.  You could set it to 0 or to a large value for complete
                # randomization, but this would both consume memory and cause spikes in
                # disk I/O.  Smaller is easier on disk and memory but less random.  It's
                # not a huge deal though, as samples are anyway randomized right at the start.
                # NOTE: this variable is not referenced anywhere else in this script.


stage=-3        # The model-initialization step runs only if stage <= -1, and
                # training iteration x runs only if stage <= x.


num_threads=16  # this is the default but you may want to change it, e.g. to 1 if
                # using GPUs.
cleanup=true    # If true, remove the per-iteration *.tmp.mdl files and, at the
                # end, every model that is not at an epoch boundary.
retroactive=false # Passed as --retroactive to nnet-modify-learning-rates.
remove_egs=false  # NOTE: not referenced anywhere else in this script.
src_models=  # can be used to override the defaults of <degs-dir1>/final.mdl <degs-dir2>/final.mdl .. etc.
             # set this to a space-separated list.
# End configuration section.


echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;


# We need at least two degs directories plus the output directory; otherwise
# print the usage message and exit.  The defaults quoted in the help text
# mirror the configuration section at the top of this script.  (The help used
# to advertise a --parallel-opts option, but this script neither defines nor
# uses such a variable, so it is no longer listed.)
if [ $# -lt 3 ]; then
  echo "Usage: $0 [opts] <degs-dir1> <degs-dir2> ... <degs-dirN>  <exp-dir>"
  echo " e.g.: $0 exp/tri4_mpe_degs exp_other_lang/tri4_mpe_degs exp/tri4_mpe_multilang"
  echo ""
  echo "You have to first call get_egs_discriminative2.sh to dump the egs."
  echo "Caution: the options 'drop_frames' and 'criterion' are taken here"
  echo "even though they were required also by get_egs_discriminative2.sh,"
  echo "and they should normally match."
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --num-epochs <#epochs|4>                        # Number of epochs of training (measured on language 0)"
  echo "  --learning-rate <learning-rate|0.00002>          # Learning rate to use"
  echo "  --num-jobs-nnet <num-jobs|4 4>                   # Number of parallel jobs to use for main neural net:"
  echo "                                                   # space separated list of num-jobs per language. Affects"
  echo "                                                   # relative weighting."
  echo "  --num-threads <num-threads|16>                   # Number of parallel threads per job (will affect results"
  echo "                                                   # as well as speed; may interact with batch size; if you increase"
  echo "                                                   # this, you may want to decrease the batch size.  With GPU, must be 1."
  echo "  --stage <stage|-3>                               # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."
  echo "  --criterion <criterion|smbr>                     # Training criterion: may be smbr, mmi or mpfe"
  echo "  --boost <boost|0.0>                              # Boosting factor for MMI (e.g., 0.1)"
  echo "  --drop-frames <true,false|false>                 # Option that affects MMI training: if true, we exclude gradients from frames"
  echo "                                                   # where the numerator transition-id is not in the denominator lattice."
  echo "  --modify-learning-rates <true,false|true>        # If true, modify learning rates to try to equalize relative"
  echo "                                                   # changes across layers."
  exit 1;
fi

# Positional arguments: all but the last are degs directories (one per
# language); the last one is the output experiment directory.
argv=("$@")
num_args=${#argv[@]}
num_lang=$(($num_args-1))
dir=${argv[$num_lang]}

# One --num-jobs-nnet entry is required per language.
num_jobs_nnet_array=($num_jobs_nnet)
if ! [ "${#num_jobs_nnet_array[@]}" -eq "$num_lang" ]; then
  echo "$0: --num-jobs-nnet option must have size equal to the number of languages"
  exit 1
fi

for lang in $(seq 0 $(($num_lang-1))); do
  degs_dir[$lang]=${argv[$lang]}
done

# Source models: either the explicit --src-models list (one per language), or
# final.mdl inside each degs directory.
if [ -n "$src_models" ]; then
  src_model_array=($src_models)
  if ! [ "${#src_model_array[@]}" -eq "$num_lang" ]; then
    echo "$0: --src-models option must have size equal to the number of languages"
    exit 1
  fi
else
  for lang in $(seq 0 $(($num_lang-1))); do
    src_model_array[$lang]=${degs_dir[$lang]}/final.mdl
  done
fi

mkdir -p $dir/log || exit 1

# Per-language setup: validate inputs, record archive counts and silence
# phones, clamp the job count, and copy auxiliary files to $dir/<lang>/.
for lang in $(seq 0 $(($num_lang-1))); do
  this_degs_dir=${degs_dir[$lang]}
  this_num_jobs_nnet=${num_jobs_nnet_array[$lang]}
  mdl=${src_model_array[$lang]}

  # All of these must exist before we start.
  for f in $this_degs_dir/degs.1.ark \
           $this_degs_dir/info/num_archives \
           $this_degs_dir/info/silence.csl \
           $this_degs_dir/info/frames_per_archive \
           $mdl; do
    if [ ! -f $f ]; then
      echo "$0: no such file $f"
      exit 1
    fi
  done
  mkdir -p $dir/$lang/log || exit 1

  # The job count must be a positive integer.
  if ! [ $this_num_jobs_nnet -gt 0 ]; then
    echo "Bad num-jobs-nnet option '$num_jobs_nnet'"
    exit 1
  fi

  this_num_archives=$(cat $this_degs_dir/info/num_archives) || exit 1
  num_archives_array[$lang]=$this_num_archives
  silphonelist_array[$lang]=$(cat $this_degs_dir/info/silence.csl) || exit 1

  # Never run more jobs than there are archives for this language.
  if [ $this_num_jobs_nnet -gt $this_num_archives ]; then
    echo "$0: num-jobs-nnet $this_num_jobs_nnet exceeds number of archives $this_num_archives"
    echo " ... for language $lang; setting it to $this_num_archives."
    num_jobs_nnet_array[$lang]=$this_num_archives
  fi

  # Carry over whichever of these optional files the degs directory has.
  for f in splice_opts cmvn_opts tree final.mat; do
    [ -f $this_degs_dir/$f ] || continue
    cp $this_degs_dir/$f $dir/$lang/ || exit 1
  done

  # NOTE(review): this only links 'conf' when it is a regular file, and the
  # link goes into $dir rather than $dir/$lang -- confirm that is intended.
  if [ -f $this_degs_dir/conf ]; then
    ln -sf $(utils/make_absolute.sh $this_degs_dir/conf) $dir/ || exit 1
  fi
done


# The number of iterations is derived from language 0 alone: enough
# iterations for its jobs to cycle through its archives $num_epochs times.
num_archives0=$(cat ${degs_dir[0]}/info/num_archives) || exit 1
num_jobs_nnet0=${num_jobs_nnet_array[0]}

if ! [ $num_epochs -gt 0 ]; then
  echo "Error: num-epochs $num_epochs is not valid"
  exit 1
fi

num_iters=$((($num_epochs*$num_archives0)/$num_jobs_nnet0))

echo "$0: Will train for $num_epochs epochs = $num_iters iterations (measured on language 0)"

# Purely informational: report how many epochs that amounts to for each of
# the other languages.
for lang in $(seq 1 $(($num_lang-1))); do
  this_degs_dir=${degs_dir[$lang]}
  this_num_archives=${num_archives_array[$lang]}
  this_num_epochs=$((($num_iters*${num_jobs_nnet_array[$lang]})/$this_num_archives))
  echo "$0: $num_iters iterations is approximately $this_num_epochs epochs for language $lang"
done


if [ $stage -le -1 ]; then
  echo "$0: Copying initial models and modifying preconditioning setups"

  # Each language's source model becomes $dir/<lang>/0.mdl, with the learning
  # rate set to $learning_rate and the preconditioning switched to the online
  # variant with a long sample history (frames are only randomized at the
  # segment level in this setup, so they are highly correlated).
  for lang in $(seq 0 $(($num_lang-1))); do
    $cmd $dir/$lang/log/convert.log \
      nnet-am-copy --learning-rate=$learning_rate ${src_model_array[$lang]} - \| \
      nnet-am-switch-preconditioning --num-samples-history=50000 - $dir/$lang/0.mdl \
      || exit 1
  done
fi


# Suffix appended to "nnet-train-discriminative": the multi-threaded trainer
# by default, or the "-simple" one (which can use GPU code) when exactly one
# thread is requested.
train_suffix="-parallel --num-threads=$num_threads"
if [ $num_threads -eq 1 ]; then
  train_suffix="-simple"
fi


# Weights for the cross-language average, as a colon-separated list with one
# entry per language.  These are taken from the per-language job counts that
# are actually used (i.e. after any clamping to the number of archives), not
# from the raw --num-jobs-nnet string, so the weights match the number of
# jobs that contributed to each language's model.
weights_csl=$(IFS=:; echo "${num_jobs_nnet_array[*]}")

x=0
while [ $x -lt $num_iters ]; do
  if [ $stage -le $x ]; then

    echo "Training neural net (pass $x)"


    rm $dir/.error 2>/dev/null

    # Train all languages in parallel; each language runs in its own
    # background subshell and flags failure by touching $dir/.error.
    for lang in $(seq 0 $[$num_lang-1]); do
      this_num_jobs_nnet=${num_jobs_nnet_array[$lang]}
      this_num_archives=${num_archives_array[$lang]}
      this_degs_dir=${degs_dir[$lang]}
      this_silphonelist=${silphonelist_array[$lang]}

      # The \$ below delays the evaluation of the expression until the script runs (and JOB
      # will be replaced by the job-id).  That expression in $[..] is responsible for
      # choosing the archive indexes to use for each job on each iteration... we cycle through
      # all archives.

      (
        $cmd JOB=1:$this_num_jobs_nnet $dir/$lang/log/train.$x.JOB.log \
          nnet-combine-egs-discriminative \
          "ark:$this_degs_dir/degs.\$[((JOB-1+($x*$this_num_jobs_nnet))%$this_num_archives)+1].ark" ark:- \| \
          nnet-train-discriminative$train_suffix --silence-phones=$this_silphonelist \
           --criterion=$criterion --drop-frames=$drop_frames \
           --one-silence-class=$one_silence_class \
           --boost=$boost --acoustic-scale=$acoustic_scale \
           $dir/$lang/$x.mdl ark:- $dir/$lang/$[$x+1].JOB.mdl || exit 1;

        nnets_list=$(for n in $(seq $this_num_jobs_nnet); do echo $dir/$lang/$[$x+1].$n.mdl; done)

        # produce an average just within this language.
        $cmd $dir/$lang/log/average.$x.log \
          nnet-am-average $nnets_list $dir/$lang/$[$x+1].tmp.mdl || exit 1;

        rm $nnets_list
      ) || touch $dir/.error &
    done
    wait
    [ -f $dir/.error ] && echo "$0: error on pass $x" && exit 1


    # apply the modify-learning-rates thing to the model for the zero'th language;
    # we'll use the resulting learning rates for the other languages.
    if $modify_learning_rates; then
      $cmd $dir/log/modify_learning_rates.$x.log \
        nnet-modify-learning-rates --retroactive=$retroactive \
        --last-layer-factor=$last_layer_factor \
        --first-layer-factor=$first_layer_factor \
        $dir/0/$x.mdl $dir/0/$[$x+1].tmp.mdl $dir/0/$[$x+1].tmp.mdl || exit 1;
    fi

    nnets_list=$(for lang in $(seq 0 $[$num_lang-1]); do echo $dir/$lang/$[$x+1].tmp.mdl; done)

    # the next command produces the cross-language averaged model containing the
    # final layer corresponding to language zero.  Note, if we did modify-learning-rates,
    # it will also have the modified learning rates.
    $cmd $dir/log/average.$x.log \
      nnet-am-average --weights=$weights_csl --skip-last-layer=true \
      $nnets_list $dir/0/$[$x+1].mdl || exit 1;

    # we'll transfer these learning rates to the other models.  Stop here if
    # they cannot be read, rather than passing an empty --learning-rates
    # option to nnet-am-copy below.
    learning_rates=$(nnet-am-info --print-learning-rates=true $dir/0/$[$x+1].mdl 2>/dev/null)
    if [ -z "$learning_rates" ]; then
      echo "$0: could not get learning rates from $dir/0/$[$x+1].mdl"
      exit 1
    fi

    for lang in $(seq 1 $[$num_lang-1]); do
      # the next command takes the averaged hidden parameters from language zero, and
      # the last layer from language $lang.  It's not really doing averaging.
      # we use nnet-am-copy to transfer the learning rates from model zero.
      $cmd $dir/$lang/log/combine_average.$x.log \
        nnet-am-average --weights=0.0:1.0 --skip-last-layer=true \
          $dir/$lang/$[$x+1].tmp.mdl $dir/0/$[$x+1].mdl - \| \
        nnet-am-copy --learning-rates=$learning_rates - $dir/$lang/$[$x+1].mdl || exit 1;
    done

    $cleanup && rm $dir/*/$[$x+1].tmp.mdl

  fi

  x=$[$x+1]
done


# Iteration numbers that fall on epoch boundaries.  These depend only on
# language 0's archive and job counts, so work them out once.
epoch_final_iters=
for e in $(seq 0 $num_epochs); do
  epoch_iter[$e]=$((($e*$num_archives0)/$num_jobs_nnet0))
  epoch_final_iters="$epoch_final_iters ${epoch_iter[$e]}"
done

for lang in $(seq 0 $(($num_lang-1))); do
  # final.mdl is the model from the last iteration (the training loop above
  # leaves its counter equal to $num_iters).
  rm $dir/$lang/final.mdl 2>/dev/null
  ln -s $num_iters.mdl $dir/$lang/final.mdl

  # epoch<e>.mdl links for every epoch boundary.
  for e in $(seq 0 $num_epochs); do
    ln -sf ${epoch_iter[$e]}.mdl $dir/$lang/epoch$e.mdl
  done

  if $cleanup; then
    echo "Removing most of the models for language $lang"
    for iter in $(seq 0 $num_iters); do
      # Keep epoch-boundary models, delete everything else.
      echo $epoch_final_iters | grep -qw $iter && continue
      rm $dir/$lang/$iter.mdl 2>/dev/null
    done
  fi
done


echo Done


================================================
FILE: egs/steps/nnet2/train_more.sh
================================================
#!/usr/bin/env bash

# Copyright 2014  Johns Hopkins University (Author: Daniel Povey). 
# Apache 2.0.


# This script further trains an already-existing neural network,
# given an existing model and an examples (egs/) directory.
# The number of parallel jobs (--num-jobs-nnet) is determined by the
# egs directory.

# Begin configuration section.
# Defaults for the command-line options (see the usage message below).
cmd=run.pl         # Job launcher used for every "$cmd ..." invocation below.
num_epochs=10      # Number of epochs of training; number of iterations is
                   # worked out from this.
num_iters_final=20 # Maximum number of final iterations to give to the
                  # optimization over the validation set.
learning_rate_factor=1.0 # You can use this to gradually decrease the learning
                         # rate during training (e.g. use 0.2); the initial
                         # learning rates are as specified in the model, but it
                         # will decrease slightly on each iteration to achieve
                         # this ratio.

combine=true # controls whether or not to do the final model combination.
combine_regularizer=1.0e-14 # Small regularizer so that parameters won't go crazy.

minibatch_size=128 # by default use a smallish minibatch size for neural net
                   # training; this controls instability which would otherwise
                   # be a problem with multi-threaded update.  Note: it also
                   # interacts with the "preconditioned" update which generally
                   # works better with larger minibatch size, so it's not
                   # completely cost free.

shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
                # on each iter.  You could set it to 0 or to a large value for complete
                # randomization, but this would both consume memory and cause spikes in
                # disk I/O.  Smaller is easier on disk and memory but less random.  It's
                # not a huge deal though, as samples are anyway randomized right at the start.
mix_up=0        # If > 0, nnet-am-mixup is run with --num-mixtures=$mix_up
                # half way through training (at iteration num_iters/2).
stage=-5        # Training iteration x runs only if stage <= x; the final
                # combination and prior-adjustment steps are gated similarly.
num_threads=16  # Threads per training job; with 1, the "-simple" trainer is used.
parallel_opts="--num-threads 16 --mem 1G" # by default we use 16 threads; this lets the queue know.
   # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.
cleanup=true    # If true, most intermediate models are deleted at the end.
remove_egs=false # If true, steps/nnet2/remove_egs.sh is called at the end.
prior_subset_size=10000 # 10k samples per job, for computing priors.  Should be
                        # more than enough.
# End configuration section.


echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# Exactly three positional arguments are required; otherwise print the usage
# message and exit.  The defaults quoted in the help text mirror the
# configuration section at the top of this script.
if [ $# != 3 ]; then
  echo "Usage: $0 [opts] <input-model> <egs-dir> <exp-dir>"
  echo " e.g.: $0 exp/nnet4c/final.mdl exp/nnet4c/egs exp/nnet5c/"
  echo "see also the older script update_nnet.sh which creates the egs itself"
  echo "You probably now want to use train_more2.sh, which uses the newer,"
  echo "more compact egs format."
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --num-epochs <#epochs|10>                        # Number of epochs of training"
  echo "                                                   # (determines #iterations, together with the"
  echo "                                                   # iters_per_epoch file in the egs directory)"
  echo "  --learning-rate-factor <factor|1.0>              # Factor (e.g. 0.2) by which to change learning rate"
  echo "                                                   # during the course of training"
  echo "  --num-threads <num-threads|16>                   # Number of parallel threads per job (will affect results"
  echo "                                                   # as well as speed; may interact with batch size; if you increase"
  echo "                                                   # this, you may want to decrease the batch size."
  echo "  --parallel-opts <opts|\"--num-threads 16 --mem 1G\">      # extra options to pass to e.g. queue.pl for processes that"
  echo "                                                   # use multiple threads... "
  echo "  --minibatch-size <minibatch-size|128>            # Size of minibatch to process (note: product with --num-threads"
  echo "                                                   # should not get too large, e.g. >2k)."
  echo "  --num-iters-final <#iters|20>                    # Number of final iterations to give to nnet-combine-fast to "
  echo "                                                   # interpolate parameters (the weights are learned with a validation set)"
  echo "  --mix-up <#mix|0>                                # If specified, add quasi-targets, analogous to a mixture of Gaussians vs."
  echo "                                                   # single Gaussians.  Only do this if not already mixed-up."
  echo "  --combine <true or false|true>                   # If true, do the final nnet-combine-fast stage."
  echo "  --stage <stage|-5>                               # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."
  exit 1;
fi

input_mdl=$1
egs_dir=$2
dir=$3

# The input model and the first egs archive must both be present.
for f in $input_mdl $egs_dir/egs.1.0.ark; do
  if [ ! -f $f ]; then
    echo "$0: expected file $f to exist."
    exit 1
  fi
done

mkdir -p $dir/log

# Bring along, from the input model's directory, whichever of these files
# exist; things like decoding may need them in the experiment directory.
input_dir=$(dirname $input_mdl)
for f in tree splice_opts cmvn_opts final.mat; do
  [ -f $input_dir/$f ] || continue
  cp $input_dir/$f $dir/
done

# The egs directory dictates both the iterations per epoch and the number of
# parallel jobs.
iters_per_epoch=$(cat $egs_dir/iters_per_epoch) || exit 1
num_jobs_nnet=$(cat $egs_dir/num_jobs_nnet) || exit 1

num_iters=$(($num_epochs * $iters_per_epoch))
# Per-iteration factor whose product over all iterations equals
# $learning_rate_factor.
per_iter_learning_rate_factor=$(perl -e "print ($learning_rate_factor ** (1.0 / $num_iters));")

echo "$0: Will train for $num_epochs epochs, equalling $num_iters iterations."

# Mixing up (if requested) happens half way through training.
mix_up_iter=$(($num_iters/2))

# Suffix appended to "nnet-train": the multi-threaded trainer by default, or
# the "-simple" one (which can use GPU code) when exactly one thread is
# requested.
train_suffix="-parallel --num-threads=$num_threads"
if [ $num_threads -eq 1 ]; then
  train_suffix="-simple"
  if ! cuda-compiled; then
    echo "$0: WARNING: you are running with one thread but you have not compiled"
    echo "   for CUDA.  You may be running a setup optimized for GPUs.  If you have"
    echo "   GPUs and have nvcc installed, go to src/ and do ./configure; make"
  fi
fi

# The input model is iteration 0.
cp $input_mdl $dir/0.mdl || exit 1;

x=0

# Main training loop: iteration x reads $dir/$x.mdl and writes $dir/$[x+1].mdl.
while [ $x -lt $num_iters ]; do
  if [ $x -ge 0 ] && [ $stage -le $x ]; then
    # Set off jobs doing some diagnostics, in the background.
    $cmd $dir/log/compute_prob_valid.$x.log \
      nnet-compute-prob $dir/$x.mdl ark:$egs_dir/valid_diagnostic.egs &
    $cmd $dir/log/compute_prob_train.$x.log \
      nnet-compute-prob $dir/$x.mdl ark:$egs_dir/train_diagnostic.egs &
    # Skip the progress report right after a mix-up iteration.
    if [ $x -gt 0 ] && [ ! -f $dir/log/mix_up.$[$x-1].log ]; then
      $cmd $dir/log/progress.$x.log \
        nnet-show-progress --use-gpu=no $dir/$[$x-1].mdl $dir/$x.mdl ark:$egs_dir/train_diagnostic.egs &
    fi

    echo "Training neural net (pass $x)"


    # Each job trains on its own archive for this position within the epoch.
    $cmd $parallel_opts JOB=1:$num_jobs_nnet $dir/log/train.$x.JOB.log \
      nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x \
      ark:$egs_dir/egs.JOB.$[$x%$iters_per_epoch].ark ark:- \| \
      nnet-train$train_suffix --minibatch-size=$minibatch_size --srand=$x $dir/$x.mdl \
        ark:- $dir/$[$x+1].JOB.mdl \
      || exit 1;

    nnets_list=
    for n in `seq 1 $num_jobs_nnet`; do
      nnets_list="$nnets_list $dir/$[$x+1].$n.mdl"
    done

    # Average the per-job models and apply the per-iteration learning-rate decay.
    $cmd $dir/log/average.$x.log \
      nnet-am-average $nnets_list - \| \
      nnet-am-copy --learning-rate-factor=$per_iter_learning_rate_factor - $dir/$[$x+1].mdl || exit 1;

    if [ "$mix_up" -gt 0 ] && [ $x -eq $mix_up_iter ]; then
      # mix up.  (The message used to print $num_leaves, which this script
      # never sets, so only the target number of components is reported.)
      echo "Mixing up to $mix_up components"
      $cmd $dir/log/mix_up.$x.log \
        nnet-am-mixup --min-count=10 --num-mixtures=$mix_up \
         $dir/$[$x+1].mdl $dir/$[$x+1].mdl || exit 1;
    fi
    rm $nnets_list
  fi
  x=$[$x+1]
done

# Build the list of models handed to the final combination: final.mdl will be
# a combination of (at most) the last $num_iters_final models.
nnets_list=()

# Cannot combine more models than there are iterations ...
if [ $num_iters_final -gt $num_iters ]; then
  num_iters_final=$num_iters
fi
# ... and, when mixing up, no more than the iterations after the mix-up point.
if [ "$mix_up" -gt 0 ] && [ $num_iters_final -gt $(($num_iters-$mix_up_iter)) ]; then
  num_iters_final=$(($num_iters-$mix_up_iter))
fi

start=$(($num_iters-$num_iters_final+1))
for x in $(seq $start $num_iters); do
  # Only models from after the mix-up iteration are eligible.
  [ $x -gt $mix_up_iter ] || continue
  nnets_list[$(($x-$start))]=$dir/$x.mdl
done

# Produce final.mdl: either a learned combination of the models collected in
# nnets_list (followed by stddev normalization), or just a copy of the last
# model when --combine is false.
if [ $stage -le $num_iters ]; then
  if $combine; then
    echo "Doing final combination to produce final.mdl"
    # Below, use --use-gpu=no to disable nnet-combine-fast from using a GPU, as
    # if there are many models it can give out-of-memory error; use at least 8
    # threads to speed it up (this isn't ideal...)
    this_num_threads=$num_threads
    [ $this_num_threads -lt 8 ] && this_num_threads=8
    # The example count is taken from the last field of the last line that
    # nnet-copy-egs prints; the minibatch size is that count divided (rounding
    # up) by the number of threads, capped at 512.
    num_egs=`nnet-copy-egs ark:$egs_dir/combine.egs ark:/dev/null 2>&1 | tail -n 1 | awk '{print $NF}'`
    mb=$[($num_egs+$this_num_threads-1)/$this_num_threads]
    [ $mb -gt 512 ] && mb=512
    # Setting --initial-model to a large value makes it initialize the combination
    # with the average of all the models.  It's important not to start with a
    # single model, or, due to the invariance to scaling that these nonlinearities
    # give us, we get zero diagonal entries in the fisher matrix that
    # nnet-combine-fast uses for scaling, which after flooring and inversion, has
    # the effect that the initial model chosen gets much higher learning rates
    # than the others.  This prevents the optimization from working well.
    $cmd $parallel_opts $dir/log/combine.log \
      nnet-combine-fast --initial-model=100000 --num-lbfgs-iters=40 --use-gpu=no \
      --num-threads=$this_num_threads --regularizer=$combine_regularizer \
      --verbose=3 --minibatch-size=$mb "${nnets_list[@]}" ark:$egs_dir/combine.egs \
      $dir/final.mdl || exit 1;

    # Normalize stddev for affine or block affine layers that are followed by a
    # pnorm layer and then a normalize layer.
    $cmd $parallel_opts $dir/log/normalize.log \
      nnet-normalize-stddev $dir/final.mdl $dir/final.mdl || exit 1;

    # Compute the probability of the final, combined model with
    # the same subset we used for the previous compute_probs, as the
    # different subsets will lead to different probs.
    # (Both jobs are started in the background.)
    $cmd $dir/log/compute_prob_valid.final.log \
      nnet-compute-prob $dir/final.mdl ark:$egs_dir/valid_diagnostic.egs &
    $cmd $dir/log/compute_prob_train.final.log \
      nnet-compute-prob $dir/final.mdl ark:$egs_dir/train_diagnostic.egs &
  else
    echo "$0: --combine=false so just using last model."
    # $x is whatever the preceding loops left it at.
    cp $dir/$x.mdl $dir/final.mdl
  fi
fi

if [ $stage -le $(($num_iters+1)) ]; then
  echo "Getting average posterior for purposes of adjusting the priors."
  # This step just uses CPUs, on a smallish subset of each job's first archive.
  # Clear out any per-job vectors left from an earlier run first.
  rm $dir/post.*.vec 2>/dev/null

  # Per job: sum the network outputs over the subset into post.JOB.vec.
  $cmd JOB=1:$num_jobs_nnet $dir/log/get_post.JOB.log \
    nnet-subset-egs --n=$prior_subset_size ark:$egs_dir/egs.JOB.0.ark ark:- \| \
    nnet-compute-from-egs "nnet-to-raw-nnet $dir/final.mdl -|" ark:- ark:- \| \
    matrix-sum-rows ark:- ark:- \| \
    vector-sum ark:- $dir/post.JOB.vec \
    || exit 1

  # Give $dir/post.*.vec time to appear.
  sleep 3

  # Add up the per-job vectors, then discard them.
  $cmd $dir/log/vector_sum.log \
    vector-sum $dir/post.*.vec $dir/post.vec \
    || exit 1
  rm $dir/post.*.vec

  echo "Re-adjusting priors based on computed posteriors"
  $cmd $dir/log/adjust_priors.log \
    nnet-adjust-priors $dir/final.mdl $dir/post.vec $dir/final.mdl \
    || exit 1
fi


sleep 2

echo Done


# The examples used by this script live in the <egs-dir> given on the command
# line ($egs_dir); this script never creates $dir/egs, so that is the
# directory to hand to remove_egs.sh.
$remove_egs && steps/nnet2/remove_egs.sh $egs_dir

if $cleanup; then
  echo Removing most of the models
  for x in `seq 0 $num_iters`; do
    if [ $[$x%100] -ne 0 ] && [ $x -lt $[$num_iters-$num_iters_final+1] ]; then
       # delete all but every 100th model; don't delete the ones which combine to form the final model.
      rm $dir/$x.mdl
    fi
  done
fi


================================================
FILE: egs/steps/nnet2/train_more2.sh
================================================
#!/usr/bin/env bash

# Copyright 2014  Johns Hopkins University (Author: Daniel Povey). 
# Apache 2.0.

# This script further trains an already-existing neural network,
# given an existing model and an examples (egs/) directory.
# This version of the script expects an egs/ directory in the newer
# format, as created by get_egs2.sh.
#

# Begin configuration section.
# Defaults for the command-line options (see the usage message below).
cmd=run.pl
num_epochs=10      # Number of epochs of training; number of iterations is
                   # worked out from this.
num_iters_final=20 # Maximum number of final iterations to give to the
                  # optimization over the validation set.
learning_rate_factor=1.0 # You can use this to gradually decrease the learning
                         # rate during training (e.g. use 0.2); the initial
                         # learning rates are as specified in the model, but it
                         # will decrease slightly on each iteration to achieve
                         # this ratio.

combine=true # controls whether or not to do the final model combination.
combine_regularizer=1.0e-14 # Small regularizer so that parameters won't go crazy.
max_models_combine=20 # The "max_models_combine" is the maximum number of models we give
  # to the final 'combine' stage, but these models will themselves be averages of
  # iteration-number ranges.

minibatch_size=128 # by default use a smallish minibatch size for neural net
                   # training; this controls instability which would otherwise
                   # be a problem with multi-threaded update.  Note: it also
                   # interacts with the "preconditioned" update which generally
                   # works better with larger minibatch size, so it's not
                   # completely cost free.

shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
                # on each iter.  You could set it to 0 or to a large value for complete
                # randomization, but this would both consume memory and cause spikes in
                # disk I/O.  Smaller is easier on disk and memory but less random.  It's
                # not a huge deal though, as samples are anyway randomized right at the start.
num_jobs_nnet=4 # Number of neural-net jobs to run in parallel; reduced below
                # if it exceeds num-archives * frames-per-eg.
mix_up=0
stage=-5
num_threads=16  # Threads per training job; with 1, the "-simple" suffix is selected.
parallel_opts="--num-threads 16 --mem 1G" # by default we use 16 threads; this lets the queue know.
   # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.
combine_num_threads=8
cleanup=true
prior_subset_size=10000 # 10k samples per job, for computing priors.  Should be
                        # more than enough.
num_jobs_compute_prior=10 # these are single-threaded, run on CPU.
remove_egs=false
# End configuration section.


echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# Exactly three positional arguments are required; otherwise print the usage
# message and exit.  The defaults quoted in the help text mirror the
# configuration section at the top of this script.
if [ $# != 3 ]; then
  echo "Usage: $0 [opts] <input-model> <egs-dir> <exp-dir>"
  echo " e.g.: $0 exp/nnet4c/final.mdl exp/nnet4c/egs exp/nnet5c/"
  echo "see also the older script update_nnet.sh which creates the egs itself"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --num-epochs <#epochs|10>                        # Number of epochs of training"
  echo "                                                   # (determines #iterations, together with the number"
  echo "                                                   # of archives in the egs dir and --num-jobs-nnet)"
  echo "  --num-jobs-nnet <#jobs|4>                        # Number of neural-net jobs to run in parallel"
  echo "  --learning-rate-factor <factor|1.0>              # Factor (e.g. 0.2) by which to change learning rate"
  echo "                                                   # during the course of training"
  echo "  --num-threads <num-threads|16>                   # Number of parallel threads per job (will affect results"
  echo "                                                   # as well as speed; may interact with batch size; if you increase"
  echo "                                                   # this, you may want to decrease the batch size."
  echo "  --parallel-opts <opts|\"--num-threads 16 --mem 1G\">      # extra options to pass to e.g. queue.pl for processes that"
  echo "                                                   # use multiple threads... "
  echo "  --minibatch-size <minibatch-size|128>            # Size of minibatch to process (note: product with --num-threads"
  echo "                                                   # should not get too large, e.g. >2k)."
  echo "  --num-iters-final <#iters|20>                    # Number of final iterations to give to nnet-combine-fast to "
  echo "                                                   # interpolate parameters (the weights are learned with a validation set)"
  echo "  --mix-up <#mix|0>                                # If specified, add quasi-targets, analogous to a mixture of Gaussians vs."
  echo "                                                   # single Gaussians.  Only do this if not already mixed-up."
  echo "  --combine <true or false|true>                   # If true, do the final nnet-combine-fast stage."
  echo "  --stage <stage|-5>                               # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."
  exit 1;
fi

input_mdl=$1
egs_dir=$2
dir=$3

# The input model and the first egs archive must both be present.
for f in $input_mdl $egs_dir/egs.1.ark; do
  if [ ! -f $f ]; then
    echo "$0: expected file $f to exist."
    exit 1
  fi
done

mkdir -p $dir/log

# Bring along, from the input model's directory, whichever of these files
# exist; things like decoding may need them in the experiment directory.
input_dir=$(dirname $input_mdl)
for f in tree splice_opts cmvn_opts final.mat; do
  [ -f $input_dir/$f ] || continue
  cp $input_dir/$f $dir/
done

# Read the egs-directory metadata.  On failure, report the file that is
# actually missing (the num_archives message used to name frames_per_eg).
frames_per_eg=$(cat $egs_dir/info/frames_per_eg) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; }
num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/num_archives"; exit 1; }

# num_archives_expanded considers each separate label-position from
# 0..frames_per_eg-1 to be a separate archive.
num_archives_expanded=$[$num_archives*$frames_per_eg]

if [ $num_jobs_nnet -gt $num_archives_expanded ]; then
  echo "$0: --num-jobs-nnet cannot exceed num-archives*frames-per-eg which is $num_archives_expanded"
  echo "$0: setting --num-jobs-nnet to $num_archives_expanded"
  num_jobs_nnet=$num_archives_expanded
fi


# set num_iters so that as close as possible, we process the data $num_epochs
# times, i.e. $num_iters*$num_jobs_nnet == $num_epochs*$num_archives_expanded
num_iters=$[($num_epochs*$num_archives_expanded)/$num_jobs_nnet]

echo "$0: Will train for $num_epochs epochs = $num_iters iterations"

per_iter_learning_rate_factor=$(perl -e "print ($learning_rate_factor ** (1.0 / $num_iters));")

mix_up_iter=$[$num_iters/4]  # mix up after only a short way into training, as
                             # most likely the net is already quite well trained.

if [ $num_threads -eq 1 ]; then
  parallel_suffix="-simple" # this enables us to use GPU code if
                         # we have just one thread.
  parallel_train_opts=
  if ! cuda-compiled; then
    echo "$0: WARNING: you are running with one thread but you have not compiled"
    echo "   for CUDA.  You may be running a setup optimized for GPUs.  If you have"
    echo "   GPUs and have nvcc installed, go to src/ and do ./configure; make"
  fi
else
  parallel_suffix="-parallel"
  parallel_train_opts="--num-threads=$num_threads"
fi


approx_iters_per_epoch=$[$num_iters/$num_epochs]
# First work out how many models we want to combine over in the final
# nnet-combine-fast invocation.  This equals
# min(max(max_models_combine, iters_per_epoch),
#     2/3 * iters_after_mixup)
num_models_combine=$max_models_combine
if [ $num_models_combine -lt $approx_iters_per_epoch ]; then
  num_models_combine=$approx_iters_per_epoch
fi
iters_after_mixup_23=$[(($num_iters-$mix_up_iter-1)*2)/3]
if [ $num_models_combine -gt $iters_after_mixup_23 ]; then
  num_models_combine=$iters_after_mixup_23
fi
first_model_combine=$[$num_iters-$num_models_combine+1]

cp $input_mdl $dir/0.mdl || exit 1;

x=0

# Main training loop.  Iteration $x reads $dir/$x.mdl, runs $num_jobs_nnet
# training jobs in parallel (each on one archive / frame-offset pair), averages
# the resulting models and writes $dir/$[$x+1].mdl.  Iterations below $stage
# are skipped so that a partially completed run can be resumed.
while [ $x -lt $num_iters ]; do
  if [ $x -ge 0 ] && [ $stage -le $x ]; then
    # Set off jobs doing some diagnostics, in the background.
    $cmd $dir/log/compute_prob_valid.$x.log \
      nnet-compute-prob $dir/$x.mdl ark:$egs_dir/valid_diagnostic.egs &
    $cmd $dir/log/compute_prob_train.$x.log \
      nnet-compute-prob $dir/$x.mdl ark:$egs_dir/train_diagnostic.egs &
    # Progress is not reported for the iteration right after a mix-up (detected
    # via the presence of the previous iteration's mix_up log).
    if [ $x -gt 0 ] && [ ! -f $dir/log/mix_up.$[$x-1].log ]; then
      $cmd $dir/log/progress.$x.log \
        nnet-show-progress --use-gpu=no $dir/$[$x-1].mdl $dir/$x.mdl ark:$egs_dir/train_diagnostic.egs &
    fi
    
    echo "Training neural net (pass $x)"

    # $dir/.error is the flag file the background training jobs touch on failure.
    rm $dir/.error 2>/dev/null
    ( # this sub-shell is so that when we "wait" below,
      # we only wait for the training jobs that we just spawned,
      # not the diagnostic jobs that we spawned above.
      
      # We can't easily use a single parallel SGE job to do the main training,
      # because the computation of which archive and which --frame option
      # to use for each job is a little complex, so we spawn each one separately.
      for n in $(seq $num_jobs_nnet); do
        k=$[$x*$num_jobs_nnet + $n - 1]; # k is a zero-based index that we'll derive
                                         # the other indexes from.
        archive=$[($k%$num_archives)+1]; # work out the 1-based archive index.
        frame=$[(($k/$num_archives)%$frames_per_eg)]; # work out the 0-based frame
        # index; this increases more slowly than the archive index because the
        # same archive with different frame indexes will give similar gradients,
        # so we want to separate them in time.

        $cmd $parallel_opts $dir/log/train.$x.$n.log \
          nnet-train$parallel_suffix $parallel_train_opts \
          --minibatch-size=$minibatch_size --srand=$x $dir/$x.mdl \
          "ark,bg:nnet-copy-egs --frame=$frame ark:$egs_dir/egs.$archive.ark ark:-|nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-|" \
          $dir/$[$x+1].$n.mdl || touch $dir/.error &
      done
      wait
    )
    # the error message below is not that informative, but $cmd will
    # have printed a more specific one.
    [ -f $dir/.error ] && echo "$0: error on iteration $x of training" && exit 1;

    # Collect the per-job models written above so they can be averaged.
    nnets_list=
    for n in `seq 1 $num_jobs_nnet`; do
      nnets_list="$nnets_list $dir/$[$x+1].$n.mdl"
    done     

    # Average the per-job models and scale the learning rates by the
    # per-iteration factor computed before the loop.
    $cmd $dir/log/average.$x.log \
      nnet-am-average $nnets_list - \| \
      nnet-am-copy --learning-rate-factor=$per_iter_learning_rate_factor - $dir/$[$x+1].mdl || exit 1;

    if [ "$mix_up" -gt 0 ] && [ $x -eq $mix_up_iter ]; then
      # mix up.
      # NOTE(review): $num_leaves is not assigned in the part of the script
      # visible here; confirm it is set earlier, otherwise this message prints
      # an empty value.
      echo Mixing up from $num_leaves to $mix_up components
      $cmd $dir/log/mix_up.$x.log \
        nnet-am-mixup --min-count=10 --num-mixtures=$mix_up \
         $dir/$[$x+1].mdl $dir/$[$x+1].mdl || exit 1;
    fi
    # The per-job models are no longer needed once averaged.
    rm $nnets_list
  fi
  x=$[$x+1]
done


if [ $stage -le $num_iters ]; then
  echo "Doing final combination to produce final.mdl"

  # Now do combination.
  nnets_list=()
  # the if..else..fi statement below sets 'nnets_list'.
  if [ $max_models_combine -lt $num_models_combine ]; then
    # The number of models to combine is too large, e.g. > 20.  In this case,
    # each argument to nnet-combine-fast will be an average of multiple models.
    cur_offset=0 # current offset from first_model_combine.
    for n in $(seq $max_models_combine); do
      next_offset=$[($n*$num_models_combine)/$max_models_combine]
      sub_list="" 
      for o in $(seq $cur_offset $[$next_offset-1]); do
        iter=$[$first_model_combine+$o]
        mdl=$dir/$iter.mdl
        [ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1;
        sub_list="$sub_list $mdl"
      done
      nnets_list[$[$n-1]]="nnet-am-average $sub_list - |"
      cur_offset=$next_offset
    done
  else
    nnets_list=
    for n in $(seq 0 $[num_models_combine-1]); do
      iter=$[$first_model_combine+$n]
      mdl=$dir/$iter.mdl
      [ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1;
      nnets_list[$n]=$mdl
    done
  fi


  # Below, use --use-gpu=no to disable nnet-combine-fast from using a GPU, as
  # if there are many models it can give out-of-memory error; set num-threads to 8
  # to speed it up (this isn't ideal...)
  num_egs=`nnet-copy-egs ark:$egs_dir/combine.egs ark:/dev/null 2>&1 | tail -n 1 | awk '{print $NF}'`
  mb=$[($num_egs+$combine_num_threads-1)/$combine_num_threads]
  [ $mb -gt 512 ] && mb=512
  # Setting --initial-model to a large value makes it initialize the combination
  # with the average of all the models.  It's important not to start with a
  # single model, or, due to the invariance to scaling that these nonlinearities
  # give us, we get zero diagonal entries in the fisher matrix that
  # nnet-combine-fast uses for scaling, which after flooring and inversion, has
  # the effect that the initial model chosen gets much higher learning rates
  # than the others.  This prevents the optimization from working well.
  $cmd $combine_parallel_opts $dir/log/combine.log \
    nnet-combine-fast --initial-model=100000 --num-lbfgs-iters=40 --use-gpu=no \
      --num-threads=$combine_num_threads \
      --verbose=3 --minibatch-size=$mb "${nnets_list[@]}" ark:$egs_dir/combine.egs \
      $dir/final.mdl || exit 1;

  # Normalize stddev for affine or block affine layers that are followed by a
  # pnorm layer and then a normalize layer.
  $cmd $dir/log/normalize.log \
    nnet-normalize-stddev $dir/final.mdl $dir/final.mdl || exit 1;

  # Compute the probability of the final, combined model with
  # the same subset we used for the previous compute_probs, as the
  # different subsets will lead to different probs.
  $cmd $dir/log/compute_prob_valid.final.log \
    nnet-compute-prob $dir/final.mdl ark:$egs_dir/valid_diagnostic.egs &
  $cmd $dir/log/compute_prob_train.final.log \
    nnet-compute-prob $dir/final.mdl ark:$egs_dir/train_diagnostic.egs &
fi

if [ $stage -le $[$num_iters+1] ]; then
  echo "Getting average posterior for purposes of adjusting the priors."
  # Note: this just uses CPUs, using a smallish subset of data.
  rm $dir/post.$x.*.vec 2>/dev/null
  $cmd JOB=1:$num_jobs_compute_prior $dir/log/get_post.$x.JOB.log \
    nnet-copy-egs --frame=random --srand=JOB ark:$egs_dir/egs.1.ark ark:- \| \
    nnet-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \
    nnet-compute-from-egs "nnet-to-raw-nnet $dir/final.mdl -|" ark:- ark:- \| \
    matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1;

  sleep 3;  # make sure there is time for $dir/post.$x.*.vec to appear.

  $cmd $dir/log/vector_sum.$x.log \
   vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1;

  rm $dir/post.$x.*.vec;

  echo "Re-adjusting priors based on computed posteriors"
  $cmd $dir/log/adjust_priors.final.log \
    nnet-adjust-priors $dir/final.mdl $dir/post.$x.vec $dir/final.mdl || exit 1;
fi


if [ ! -f $dir/final.mdl ]; then
  echo "$0: $dir/final.mdl does not exist."
  # we don't want to clean up if the training didn't succeed.
  exit 1;
fi

sleep 2

echo Done

# Optional cleanup: remove the egs (only when --remove-egs is true and the egs
# directory path matches $dir/egs*) and most of the intermediate models.
if $cleanup; then
  echo Cleaning up data
  if $remove_egs && [[ $egs_dir =~ $dir/egs* ]]; then
    steps/nnet2/remove_egs.sh $egs_dir
  fi

  echo Removing most of the models
  for x in `seq 0 $num_iters`; do
    if [ $[$x%100] -ne 0 ] && [ $x -ne $num_iters ] && [ -f $dir/$x.mdl ]; then
       # delete all but every 100th model and the last one ($num_iters.mdl).
       # NOTE(review): the condition above does not protect the other models
       # that were combined to form final.mdl; they are deleted as well.
      rm $dir/$x.mdl
    fi
  done
fi


================================================
FILE: egs/steps/nnet2/train_multilang2.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2014  Johns Hopkins University (Author: Daniel Povey). 
#           2013  Xiaohui Zhang
#           2013  Guoguo Chen
#           2014  Vimal Manohar
#           2014  Vijayaditya Peddinti
# Apache 2.0.


# train_multilang2.sh is for multi-language training of neural nets.  It
# takes multiple egs directories which must be created by get_egs2.sh, and the
# corresponding alignment directories (only needed for training the transition
# models).

# for the n languages, we share all the hidden layers but there are separate
# final layers.  On each iteration of training we average the hidden layers
# across all jobs of all languages, but average the parameters of the final,
# output layer only within each language.  The script starts from a partially
# trained model from the first language (language 0 in the directory-numbering
# scheme).  See egs/rm/s5/local/online/run_nnet2_wsj_joint.sh for example.
#
# This script requires you to supply a neural net partially trained for the 1st
# language, by one of the regular training scripts, to be used as the initial
# neural net (for use by other languages, we'll discard the last layer); it
# should not have been subject to "mix-up" (since this script does mix-up), or
# combination (since it would increase the parameter range to a too-large value
# which isn't compatible with our normal learning rate schedules).


# Begin configuration section.
cmd=run.pl
num_epochs=10      # Number of epochs of training (for first language);
                   # the number of iterations is worked out from this.
initial_learning_rate=0.04
final_learning_rate=0.004

minibatch_size=128 # by default use a smallish minibatch size for neural net
                   # training; this controls instability which would otherwise
                   # be a problem with multi-threaded update. 

num_jobs_nnet="2 2"    # Number of neural net jobs to run in parallel.  This option
                       # is passed to get_egs.sh.  Array must be same length
                       # as number of separate languages.
num_jobs_compute_prior=10 # these are single-threaded, run on CPU.

max_models_combine=20 # The "max_models_combine" is the maximum number of models we give
  # to the final 'combine' stage, but these models will themselves be averages of
  # iteration-number ranges.

shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
                # on each iter.  You could set it to 0 or to a large value for complete
                # randomization, but this would both consume memory and cause spikes in
                # disk I/O.  Smaller is easier on disk and memory but less random.  It's
                # not a huge deal though, as samples are anyway randomized right at the start.
                # (the point of this is to get data in different minibatches on different iterations,
                # since in the preconditioning method, 2 samples in the same minibatch can
                # affect each others' gradients.

prior_subset_size=10000 # 10k samples per job, for computing priors.  Should be
                        # more than enough.

stage=-4


mix_up="0 0" # Number of components to mix up to (should be > #tree leaves, if
             # specified.)  An array, one per language.

num_threads=16  # default suitable for CPU-based training
parallel_opts="--num-threads 16 --mem 1G"  # default suitable for CPU-based training.
  # by default we use 16 threads; this lets the queue know.
  # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.
combine_num_threads=8
combine_parallel_opts="--num-threads 8"  # queue options for the "combine" stage.
cleanup=false # while testing, leaving cleanup=false.
# End configuration section.


echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# Print usage and exit unless we got an even number (>= 6) of positional
# arguments: one <ali> <egs> pair per language, then <input-model> <exp-dir>.
# The defaults shown below mirror the configuration section at the top of
# this script.
if [ $# -lt 6 -o $[$#%2] -ne 0 ]; then
  # num-args must be at least 6 and must be even.
  echo "Usage: $0 [opts] <ali0> <egs0> <ali1> <egs1> ... <aliN-1> <egsN-1> <input-model> <exp-dir>"
  echo " e.g.: $0 exp/tri6_ali exp/tri6_egs exp_lang2/tri6_ali exp_lang2/tri6_egs exp/dnn6a/10.mdl exp/tri6_multilang"
  echo ""
  echo "Note: <input-model> must correspond to the model/tree for <ali0> and <egs0>, and the"
  echo "num-epochs is computed for the zeroth language."
  echo ""
  echo "The --num-jobs-nnet should be an array saying how many jobs to allocate to each language,"
  echo "e.g. --num-jobs-nnet '2 4'"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --num-epochs <#epochs|10>                        # Number of epochs of training (figured from 1st corpus)"
  echo "  --initial-learning-rate <initial-learning-rate|0.04> # Learning rate at start of training, e.g. 0.02 for small"
  echo "                                                       # data, 0.01 for large data"
  echo "  --final-learning-rate  <final-learning-rate|0.004>   # Learning rate at end of training, e.g. 0.004 for small"
  echo "                                                   # data, 0.001 for large data"
  echo "  --mix-up <#pseudo-gaussians|\"0 0\">               # Can be used to have multiple targets in final output layer,"
  echo "                                                   # per context-dependent state.  Try a number several times #states."
  echo "                                                   # An array, one number per language."
  echo "  --num-jobs-nnet <num-jobs|\"2 2\">                 # Number of parallel jobs to use for main neural net"
  echo "                                                   # training, one number per language (will affect results as"
  echo "                                                   # well as speed).  Note: if you increase this, you may want"
  echo "                                                   # to also increase the learning rate."
  echo "  --num-threads <num-threads|16>                   # Number of parallel threads per job (will affect results"
  echo "                                                   # as well as speed; may interact with batch size; if you increase"
  echo "                                                   # this, you may want to decrease the batch size."
  echo "  --parallel-opts <opts|\"--num-threads 16 --mem 1G\">      # extra options to pass to e.g. queue.pl for processes that"
  echo "                                                   # use multiple threads... "
  echo "  --stage <stage|-4>                               # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."
  exit 1;
fi


argv=("$@") 
num_args=$#
num_lang=$[($num_args-2)/2]

dir=${argv[$num_args-1]}
input_model=${argv[$num_args-2]}

[ ! -f $input_model ] && echo "$0: Input model $input_model does not exist" && exit 1;


mkdir -p $dir/log

num_jobs_nnet_array=($num_jobs_nnet)
! [ "${#num_jobs_nnet_array[@]}" -eq "$num_lang" ] && \
  echo "$0: --num-jobs-nnet option must have size equal to the number of languages" && exit 1;
mix_up_array=($mix_up)
! [ "${#mix_up_array[@]}" -eq "$num_lang" ] && \
  echo "$0: --mix-up option must have size equal to the number of languages" && exit 1;


# Language index starts from 0.  For each language: record its alignment and
# egs directories (positional args 2*lang and 2*lang+1), check that the input
# files this script reads later exist, and copy the tree plus any
# feature-related option files into $dir/$lang.
for lang in $(seq 0 $[$num_lang-1]); do
  alidir[$lang]=${argv[$lang*2]}
  egs_dir[$lang]=${argv[$lang*2+1]}
  # info/num_archives is checked here too, since it is read further down with
  # a bare "|| exit 1" that would otherwise fail without any message.
  for f in ${egs_dir[$lang]}/info/frames_per_eg ${egs_dir[$lang]}/info/num_archives \
           ${egs_dir[$lang]}/egs.1.ark ${alidir[$lang]}/ali.1.gz ${alidir[$lang]}/tree; do
    [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
  done
  mkdir -p $dir/$lang/log
  cp ${alidir[$lang]}/tree $dir/$lang/ || exit 1;

  for f in ${egs_dir[$lang]}/{final.mat,cmvn_opts,splice_opts}; do
    # Copy any of these files that exist; a missing file is not an error.
    cp $f $dir/$lang/ 2>/dev/null 
  done
done


input_model_pdfs=$(nnet-am-info $input_model | grep '^output-dim' | awk '{print $2}')
alidir0_pdfs=$(tree-info ${alidir[0]}/tree | grep '^num-pdfs' | awk '{print $2}')
if ! [ $input_model_pdfs -eq $alidir0_pdfs ]; then
  echo "$0: expected num-pdfs from the input model $input_model to match"
  echo " .. the one used for the first alignment directory ${alidir[0]}, $input_model_pdfs != $alidir0_pdfs"
  exit 1;
fi


for x in final.mat cmvn_opts splice_opts; do
  if [ -f $dir/0/$x ]; then
    for lang in $(seq 1 $[$num_lang-1]); do
      if ! cmp $dir/0/$x $dir/$lang/$x; then
        echo "$0: warning: files $dir/0/$x and $dir/$lang/$x are not identical."
      fi
    done
  fi
done

# the input model is supposed to correspond to the first language.
# Copy it to $dir/0/0.mdl with the initial learning rate; stop right away if
# the copy fails, since everything below depends on that file.
nnet-am-copy --learning-rate=$initial_learning_rate $input_model $dir/0/0.mdl || exit 1;

# Refuse to mix up language 0 if the input model already contains a
# SumGroupComponent (i.e. it has already been mixed up).
if nnet-am-info --print-args=false $dir/0/0.mdl | grep SumGroupComponent 2>/dev/null; then
  if [ "${mix_up_array[0]}" != "0" ]; then
    echo "$0: Your input model already has mixtures, but you are asking to mix it up."
    echo " ... best to use a model without mixtures as input.  (e.g., earlier iter)."
    exit 1;
  fi
fi


if [ $stage -le -4 ]; then
  echo "$0: initializing models for other languages"
  for lang in $(seq 1 $[$num_lang-1]); do
    # create the initial models for the other languages.
    $cmd $dir/$lang/log/reinitialize.log \
      nnet-am-reinitialize $input_model ${alidir[$lang]}/final.mdl $dir/$lang/0.mdl || exit 1;
  done
fi

if [ $stage -le -3 ]; then
  echo "Training transition probabilities and setting priors"
  for lang in $(seq 0 $[$num_lang-1]); do
    $cmd $dir/$lang/log/train_trans.log \
      nnet-train-transitions $dir/$lang/0.mdl "ark:gunzip -c ${alidir[$lang]}/ali.*.gz|" $dir/$lang/0.mdl \
      || exit 1;
  done
fi

# Work out the number of iterations... the number of epochs refers to the
# first language (language zero) and this, together with the num-jobs-nnet for
# that language and details of the egs, determine the number of epochs.

frames_per_eg0=$(cat ${egs_dir[0]}/info/frames_per_eg) || exit 1;
num_archives0=$(cat ${egs_dir[0]}/info/num_archives) || exit 1;
# num_archives_expanded considers each separate label-position from
# 0..frames_per_eg-1 to be a separate archive.
num_archives_expanded0=$[$num_archives0*$frames_per_eg0]

# The number of jobs for language 0 cannot exceed the number of
# (archive, frame-offset) pairs available for that language.
if [ ${num_jobs_nnet_array[0]} -gt $num_archives_expanded0 ]; then
  echo "$0: --num-jobs-nnet[0] cannot exceed num-archives*frames-per-eg which is $num_archives_expanded0"
  exit 1;
fi

# set num_iters so that as close as possible, we process the data $num_epochs
# times, i.e. $num_iters*$num_jobs_nnet == $num_epochs*$num_archives_expanded
num_iters=$[($num_epochs*$num_archives_expanded0)/${num_jobs_nnet_array[0]}]

echo "$0: Will train for $num_epochs epochs (of language 0) = $num_iters iterations"

! [ $num_iters -gt 0 ] && exit 1;

# Work out the number of epochs we train for on the other languages... this is
# just informational.  The result goes into a separate variable so that
# $num_epochs (the epochs for language 0, used below to compute
# approx_iters_per_epoch) is not overwritten.
for lang in $(seq 1 $[$num_lang-1]); do
  frames_per_eg=$(cat ${egs_dir[$lang]}/info/frames_per_eg) || exit 1;
  num_archives=$(cat ${egs_dir[$lang]}/info/num_archives) || exit 1;
  num_archives_expanded=$[$num_archives*$frames_per_eg]
  this_num_epochs=$[($num_iters*${num_jobs_nnet_array[$lang]})/$num_archives_expanded]
  echo "$0: $num_iters iterations is approximately $this_num_epochs epochs for language $lang"
done

# do any mixing-up after half the iters.
mix_up_iter=$[$num_iters/2]

if [ $num_threads -eq 1 ]; then
  parallel_suffix="-simple" # this enables us to use GPU code if
                         # we have just one thread.
  parallel_train_opts=
  if ! cuda-compiled; then
    echo "$0: WARNING: you are running with one thread but you have not compiled"
    echo "   for CUDA.  You may be running a setup optimized for GPUs.  If you have"
    echo "   GPUs and have nvcc installed, go to src/ and do ./configure; make"
  fi
else
  parallel_suffix="-parallel"
  parallel_train_opts="--num-threads=$num_threads"
fi


approx_iters_per_epoch=$[$num_iters/$num_epochs]
# First work out how many models we want to combine over in the final
# nnet-combine-fast invocation.  This equals
# min(max(max_models_combine, iters_per_epoch),
#     2/3 * iters_after_mixup).
# We use the same numbers of iterations for all languages, even though it's just
# worked out for the first language.
num_models_combine=$max_models_combine
if [ $num_models_combine -lt $approx_iters_per_epoch ]; then
  num_models_combine=$approx_iters_per_epoch
fi
iters_after_mixup_23=$[(($num_iters-$mix_up_iter-1)*2)/3]
if [ $num_models_combine -gt $iters_after_mixup_23 ]; then
  num_models_combine=$iters_after_mixup_23
fi
first_model_combine=$[$num_iters-$num_models_combine+1]

x=0


# Main training loop.  Iteration $x reads $dir/<lang>/$x.mdl for every
# language and produces $dir/<lang>/$[$x+1].mdl:
#   1. train ${num_jobs_nnet_array[lang]} jobs per language in parallel;
#   2. average the job outputs within each language (-> $[$x+1].tmp.mdl);
#   3. average all layers except the last across languages, keeping each
#      language's own last layer;
#   4. optionally mix up, then remove temporary and old models.
# Iterations below $stage are skipped so that a run can be resumed.
while [ $x -lt $num_iters ]; do

  if [ $x -ge 0 ] && [ $stage -le $x ]; then
    for lang in $(seq 0 $[$num_lang-1]); do
      # Set off jobs doing some diagnostics, in the background.
      $cmd $dir/$lang/log/compute_prob_valid.$x.log \
        nnet-compute-prob $dir/$lang/$x.mdl ark:${egs_dir[$lang]}/valid_diagnostic.egs &
      $cmd $dir/$lang/log/compute_prob_train.$x.log \
        nnet-compute-prob $dir/$lang/$x.mdl ark:${egs_dir[$lang]}/train_diagnostic.egs &
      # Progress is not reported for the iteration right after a mix-up
      # (detected via the previous iteration's mix_up log).
      if [ $x -gt 0 ] && [ ! -f $dir/$lang/log/mix_up.$[$x-1].log ]; then
        $cmd $dir/$lang/log/progress.$x.log \
          nnet-show-progress --use-gpu=no $dir/$lang/$[$x-1].mdl $dir/$lang/$x.mdl \
          ark:${egs_dir[$lang]}/train_diagnostic.egs '&&' \
           nnet-am-info $dir/$lang/$x.mdl &
      fi
    done

    echo "Training neural net (pass $x)"

    if [ $x -eq 0 ]; then
      # on iteration zero, use a smaller minibatch size and only one quarter of the
      # normal amount of training data: this will help, respectively, to ensure stability
      # and to stop the models from moving so far that averaging hurts.
      this_minibatch_size=$[$minibatch_size/2];
      this_keep_proportion=0.25
    else
      this_minibatch_size=$minibatch_size
      this_keep_proportion=1.0
      # use half the examples on iteration 1, out of a concern that the model-averaging
      # might not work if we move too far before getting close to convergence.
      [ $x -eq 1 ] && this_keep_proportion=0.5
    fi

    # $dir/.error is the flag file that background jobs touch on failure.
    rm $dir/.error 2>/dev/null


    ( # this sub-shell is so that when we "wait" below,
      # we only wait for the training jobs that we just spawned,
      # not the diagnostic jobs that we spawned above.

      # We can't easily use a single parallel SGE job to do the main training,
      # because the computation of which archive and which --frame option
      # to use for each job is a little complex, so we spawn each one separately.


      for lang in $(seq 0 $[$num_lang-1]); do
        this_num_jobs_nnet=${num_jobs_nnet_array[$lang]}
        this_frames_per_eg=$(cat ${egs_dir[$lang]}/info/frames_per_eg) || exit 1;
        this_num_archives=$(cat ${egs_dir[$lang]}/info/num_archives) || exit 1;

        ! [ $this_num_jobs_nnet -gt 0 -a $this_frames_per_eg -gt 0 -a $this_num_archives -gt 0 ] && exit 1

        for n in $(seq $this_num_jobs_nnet); do
          k=$[$x*$this_num_jobs_nnet + $n - 1]; # k is a zero-based index that we'll derive
                                                # the other indexes from.
          archive=$[($k%$this_num_archives)+1]; # work out the 1-based archive index.
          frame=$[(($k/$this_num_archives)%$this_frames_per_eg)];

          $cmd $parallel_opts $dir/$lang/log/train.$x.$n.log \
            nnet-train$parallel_suffix $parallel_train_opts \
            --minibatch-size=$this_minibatch_size --srand=$x $dir/$lang/$x.mdl \
            "ark,bg:nnet-copy-egs --keep-proportion=$this_keep_proportion --frame=$frame ark:${egs_dir[$lang]}/egs.$archive.ark ark:-|nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-|" \
            $dir/$lang/$[$x+1].$n.mdl || touch $dir/.error &
        done
      done
      wait
    )
    # the error message below is not that informative, but $cmd will
    # have printed a more specific one.
    [ -f $dir/.error ] && echo "$0: error on iteration $x of training" && exit 1;


    # Learning rate for the model produced by this iteration: geometric
    # interpolation from the initial to the final rate over $num_iters.
    learning_rate=`perl -e '($x,$n,$i,$f)=@ARGV; print ($x >= $n ? $f : $i*exp($x*log($f/$i)/$n));' $[$x+1] $num_iters $initial_learning_rate $final_learning_rate`;

    (
      # First average within each language.  Use a sub-shell so "wait" won't
      # wait for the diagnostic jobs.
      for lang in $(seq 0 $[$num_lang-1]); do
        this_num_jobs_nnet=${num_jobs_nnet_array[$lang]}
        nnets_list=$(for n in `seq 1 $this_num_jobs_nnet`; do echo $dir/$lang/$[$x+1].$n.mdl; done)
        # average the output of the different jobs.
        $cmd $dir/$lang/log/average.$x.log \
          nnet-am-average $nnets_list - \| \
          nnet-am-copy --learning-rate=$learning_rate - $dir/$lang/$[$x+1].tmp.mdl || touch $dir/.error &
      done
      wait
    )
    # Check for errors in the parent shell (same pattern as for the training
    # jobs above): an "exit" inside the sub-shell would only leave the
    # sub-shell and the script would carry on.
    [ -f $dir/.error ] && echo "$0: error averaging models on iteration $x of training" && exit 1;

    # Remove the models we just averaged.
    for lang in $(seq 0 $[$num_lang-1]); do
      this_num_jobs_nnet=${num_jobs_nnet_array[$lang]}
      for n in `seq 1 $this_num_jobs_nnet`; do rm $dir/$lang/$[$x+1].$n.mdl; done
    done


    # Now average across languages.
    nnets_list=$(for lang in $(seq 0 $[$num_lang-1]); do echo $dir/$lang/$[$x+1].tmp.mdl; done)
    weights_csl=$(echo $num_jobs_nnet | sed 's/ /:/g') # get as colon separated list.

    # the next command produces the cross-language averaged model containing the
    # final layer corresponding to language zero.
    $cmd $dir/log/average.$x.log \
      nnet-am-average --weights=$weights_csl --skip-last-layer=true \
      $nnets_list $dir/0/$[$x+1].mdl || exit 1;

    for lang in $(seq 1 $[$num_lang-1]); do
      # the next command takes the averaged hidden parameters from language zero, and
      # the last layer from language $lang.  It's not really doing averaging.
      $cmd $dir/$lang/log/combine_average.$x.log \
        nnet-am-average --weights=0.0:1.0 --skip-last-layer=true \
          $dir/$lang/$[$x+1].tmp.mdl $dir/0/$[$x+1].mdl $dir/$lang/$[$x+1].mdl || exit 1;
    done

    # The per-language temporary models are always removed once the final
    # models for this iteration exist.  (There used to be an additional
    # "$cleanup && rm" of the same files just before this, which made this rm
    # print "No such file" errors whenever --cleanup was true.)
    rm $nnets_list

    if [ $x -eq $mix_up_iter ]; then
      # mix up, for each language that requested it.
      for lang in $(seq 0 $[$num_lang-1]); do
        this_mix_up=${mix_up_array[$lang]}
        if [ $this_mix_up -gt 0 ]; then
          echo "$0: for language $lang, mixing up to $this_mix_up components"
          $cmd $dir/$lang/log/mix_up.$x.log \
            nnet-am-mixup --min-count=10 --num-mixtures=$this_mix_up \
             $dir/$lang/$[$x+1].mdl $dir/$lang/$[$x+1].mdl || exit 1;
        fi
      done
    fi

    # Check that the new model exists for each language and, if cleaning up,
    # remove the model from two iterations back unless it is a multiple of 100
    # or falls in the range used for the final combination.
    for lang in $(seq 0 $[$num_lang-1]); do
      [ ! -f $dir/$lang/$[$x+1].mdl ] && echo "No such file $dir/$lang/$[$x+1].mdl" && exit 1;
      if [ -f $dir/$lang/$[$x-1].mdl ] && $cleanup && \
        [ $[($x-1)%100] -ne 0  ] && [ $[$x-1] -lt $first_model_combine ]; then
        rm $dir/$lang/$[$x-1].mdl
      fi
    done
  fi
  x=$[$x+1]
done


if [ $stage -le $num_iters ]; then
  echo "$0: Doing combination to produce final models"


  rm $dir/.error 2>/dev/null
  for lang in $(seq 0 $[$num_lang-1]); do
    nnets_list=()
    # the if..else..fi statement below sets 'nnets_list'.
    if [ $max_models_combine -lt $num_models_combine ]; then
      # The number of models to combine is too large, e.g. > 20.  In this case,
      # each argument to nnet-combine-fast will be an average of multiple models.
      cur_offset=0 # current offset from first_model_combine.
      for n in $(seq $max_models_combine); do
        next_offset=$[($n*$num_models_combine)/$max_models_combine]
        sub_list="" 
        for o in $(seq $cur_offset $[$next_offset-1]); do
          iter=$[$first_model_combine+$o]
          mdl=$dir/$lang/$iter.mdl
          [ ! -f $mdl ] && echo "$0: Expected $mdl to exist" && exit 1;
          sub_list="$sub_list $mdl"
        done
        nnets_list[$[$n-1]]="nnet-am-average $sub_list - |"
        cur_offset=$next_offset
      done
    else
      nnets_list=
      for n in $(seq 0 $[num_models_combine-1]); do
        iter=$[$first_model_combine+$n]
        mdl=$dir/$lang/$iter.mdl
        [ ! -f $mdl ] && echo "$0: Expected $mdl to exist" && exit 1;
        nnets_list[$n]=$mdl
      done
    fi

    # Below, use --use-gpu=no to disable nnet-combine-fast from using a GPU, as
    # if there are many models it can give out-of-memory error; set num-threads
    # to 8 to speed it up (this isn't ideal...)
    num_egs=`nnet-copy-egs ark:${egs_dir[$lang]}/combine.egs ark:/dev/null 2>&1 | tail -n 1 | awk '{print $NF}'`

    mb=$[($num_egs+$combine_num_threads-1)/$combine_num_threads]
    [ $mb -gt 512 ] && mb=512
    # Setting --initial-model to a large value makes it initialize the combination
    # with the average of all the models.  It's important not to start with a
    # single model, or, due to the invariance to scaling that these nonlinearities
    # give us, we get zero diagonal entries in the fisher matrix that
    # nnet-combine-fast uses for scaling, which after flooring and inversion, has
    # the effect that the initial model chosen gets much higher learning rates
    # than the others.  This prevents the optimization from working well.
    $cmd $combine_parallel_opts $dir/$lang/log/combine.log \
      nnet-combine-fast --initial-model=100000 --num-lbfgs-iters=40 --use-gpu=no \
        --num-threads=$combine_num_threads \
        --verbose=3 --minibatch-size=$mb "${nnets_list[@]}" ark:${egs_dir[$lang]}/combine.egs \
      - \| nnet-normalize-stddev - $dir/$lang/final.mdl || touch $dir/.error &
  done
  wait
  
  [ -f $dir/.error ] && echo "$0: error doing model combination" && exit 1;
fi


# Final diagnostics: for each language, evaluate final.mdl on both the
# validation and the training diagnostic subsets.  Every job is started in the
# background and a single 'wait' collects them all.
if [ $stage -le $(($num_iters+1)) ]; then
  for lang in $(seq 0 $(($num_lang-1))); do
    for subset in valid train; do
      $cmd $dir/$lang/log/compute_prob_${subset}.final.log \
        nnet-compute-prob $dir/$lang/final.mdl ark:${egs_dir[$lang]}/${subset}_diagnostic.egs &
    done
  done
  wait
fi

if [ $stage -le $[$num_iters+2] ]; then
  # Re-estimate the priors of each language's final model from the average
  # posterior over a smallish random subset of the first egs archive.
  # Note: this just uses CPUs.

  # Phase 1: launch the per-language posterior jobs in the background.
  for lang in $(seq 0 $[$num_lang-1]); do
    echo "$0: Getting average posterior for purposes of adjusting the priors (language $lang)."
    rm $dir/$lang/.error 2>/dev/null
    # Remove stale per-job vectors from an earlier (possibly aborted) run.
    # The jobs below write post.JOB.vec and the summation further down globs
    # post.*.vec, so the cleanup has to use the same pattern; otherwise
    # leftovers (e.g. from a run with a larger --num-jobs-compute-prior) would
    # be summed in.  This glob does not match the final post.vec.
    rm $dir/$lang/post.*.vec 2>/dev/null
    $cmd JOB=1:$num_jobs_compute_prior $dir/$lang/log/get_post.JOB.log \
      nnet-copy-egs --frame=random --srand=JOB ark:${egs_dir[$lang]}/egs.1.ark ark:- \| \
      nnet-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \
      nnet-compute-from-egs "nnet-to-raw-nnet $dir/$lang/final.mdl -|" ark:- ark:- \| \
      matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/$lang/post.JOB.vec || touch $dir/$lang/.error &
  done
  echo "$0: ... waiting for jobs for all languages to complete."
  wait
  sleep 3;  # make sure there is time for $dir/$lang/post.*.vec to appear.

  # Phase 2: per language, sum the per-job vectors and adjust the priors of
  # final.mdl in place.
  for lang in $(seq 0 $[$num_lang-1]); do
    [ -f $dir/$lang/.error ] && \
      echo "$0: error getting posteriors for adjusting the priors for language $lang" && exit 1;

    $cmd $dir/$lang/log/vector_sum.log \
      vector-sum $dir/$lang/post.*.vec $dir/$lang/post.vec || exit 1;

    rm $dir/$lang/post.*.vec;

    echo "Re-adjusting priors based on computed posteriors for language $lang"
    $cmd $dir/$lang/log/adjust_priors.final.log \
      nnet-adjust-priors $dir/$lang/final.mdl $dir/$lang/post.vec $dir/$lang/final.mdl || exit 1;
  done
fi


# Verify that a final model was produced for every language before reporting
# success (and before any cleanup is attempted).
for lang in $(seq 0 $[$num_lang-1]); do
  if [ ! -f $dir/$lang/final.mdl ]; then
    # Report the per-language path that was actually tested.
    echo "$0: $dir/$lang/final.mdl does not exist."
    # we don't want to clean up if the training didn't succeed.
    exit 1;
  fi
done

sleep 2

echo Done

# Optional cleanup: remove egs stored under $dir and most intermediate models.
if $cleanup; then
  echo Cleaning up data
  # NOTE(review): elsewhere in this script egs_dir is indexed per language
  # (${egs_dir[$lang]}); a plain $egs_dir only refers to element 0, so only the
  # first language's egs directory is considered here -- confirm whether that
  # is intended.
  if [[ $egs_dir =~ $dir/egs* ]]; then
    steps/nnet2/remove_egs.sh $egs_dir
  fi

  echo Removing most of the models
  # Prune the intermediate models of every language.  Without the outer loop,
  # $lang would still hold the last value assigned by the preceding loop, so
  # only the last language's models would be removed.
  for lang in $(seq 0 $[$num_lang-1]); do
    for x in `seq 0 $num_iters`; do
      if [ $[$x%100] -ne 0 ] && [ $x -ne $num_iters ] && [ -f $dir/$lang/$x.mdl ]; then
        # delete all but every 100th model and the model of the last iteration.
        rm $dir/$lang/$x.mdl
      fi
    done
  done
fi

exit 0


================================================
FILE: egs/steps/nnet2/train_multisplice_accel2.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2014  Johns Hopkins University (Author: Daniel Povey).
#           2013  Xiaohui Zhang
#           2013  Guoguo Chen
#           2014  Vimal Manohar
#           2014  Vijayaditya Peddinti
# Apache 2.0.

# train_multisplice_accel2.sh is a modified version of
# train_pnorm_multisplice2.sh (still using pnorm).  The "accel" refers to the
# fact that we increase the number of jobs during training (from
# --num-jobs-initial to --num-jobs-final).  We dropped "pnorm" from the name as
# it was getting too long.


# Begin configuration section.
# Default values for the options of this script; the usage message further
# down lists the main ones as --option-name flags handled by parse_options.sh.
cmd=run.pl
num_epochs=15      # Number of epochs of training;
                   # the number of iterations is worked out from this.
initial_effective_lrate=0.01  # effective learning rate at the start of training;
                              # it is multiplied by the current number of jobs
                              # to obtain the actual learning rate.
final_effective_lrate=0.001   # effective learning rate at the end of training.
bias_stddev=0.5
pnorm_input_dim=3000
pnorm_output_dim=300
presoftmax_prior_scale_power=-0.25 # use the specified power value on the priors (inverse priors)
                                   # to scale the pre-softmax outputs
                                   # (the exact string "0.0" disables this).
minibatch_size=128 # by default use a smallish minibatch size for neural net
                   # training; this controls instability which would otherwise
                   # be a problem with multi-threaded update.

samples_per_iter=400000 # each iteration of training, see this many samples
                        # per job.  This option is passed to get_egs2.sh
num_jobs_initial=1  # Number of neural net jobs to run in parallel at the start of training
num_jobs_final=8   # Number of neural net jobs to run in parallel at the end of training
prior_subset_size=10000 # 10k samples per job, for computing priors.  Should be
                        # more than enough.
num_jobs_compute_prior=10 # these are single-threaded, run on CPU.
get_egs_stage=0    # passed as --stage to get_egs2.sh
fix_nnet=false     # if true, nnet-am-fix is run on the model after each
                   # iteration in the first half of training.
min_average=0.05   # passed to nnet-am-fix as --min-average-deriv
max_average=0.95   # passed to nnet-am-fix as --max-average-deriv
online_ivector_dir=
remove_egs=true  # set to false to disable removing egs.

max_models_combine=20 # The "max_models_combine" is the maximum number of models we give
  # to the final 'combine' stage, but these models will themselves be averages of
  # iteration-number ranges.

shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
                # on each iter.  You could set it to 0 or to a large value for complete
                # randomization, but this would both consume memory and cause spikes in
                # disk I/O.  Smaller is easier on disk and memory but less random.  It's
                # not a huge deal though, as samples are anyway randomized right at the start.
                # (The point of this is to get data in different minibatches on different
                # iterations, since in the preconditioning method, 2 samples in the same
                # minibatch can affect each others' gradients.)

add_layers_period=2 # by default, add new layers every 2 iterations.
num_hidden_layers=3
stage=-4
exit_stage=-100 # you can set this to terminate the training early.  Exits before running this stage

splice_indexes="layer0/-4:-3:-2:-1:0:1:2:3:4 layer2/-5:-1:3"
# Format : layer<hidden_layer>/<frame_indices>....layer<hidden_layer>/<frame_indices> "
# note: hidden layers which are composed of one or more components,
# so hidden layer indexing is different from component count


io_opts="--max-jobs-run 5" # for jobs with a lot of I/O, limits the number running at one time.
randprune=4.0 # speeds up LDA.
alpha=4.0 # relates to preconditioning.
update_period=4 # relates to online preconditioning: says how often we update the subspace.
num_samples_history=2000 # relates to online preconditioning
max_change_per_sample=0.075
precondition_rank_in=20  # relates to online preconditioning
precondition_rank_out=80 # relates to online preconditioning

mix_up=0 # Number of components to mix up to (should be > #tree leaves, if
        # specified.)  This script no longer performs mix-up; a value > 0
        # only triggers a warning during training.
num_threads=16
parallel_opts="--num-threads 16 --mem 1G"
  # by default we use 16 threads; this lets the queue know.
  # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.
combine_num_threads=8
combine_parallel_opts="--num-threads 8"  # queue options for the "combine" stage.
cleanup=true
egs_dir=           # If supplied, examples are read from here instead of being
                   # generated in $dir/egs.
lda_opts=
lda_dim=
egs_opts=
transform_dir=     # If supplied, overrides alidir
cmvn_opts=  # will be passed to get_lda.sh and get_egs2.sh, if supplied.
            # only relevant for "raw" features, not lda.
feat_type=  # Can be used to force "raw" features.
align_cmd=              # The cmd that is passed to steps/nnet2/align.sh
align_use_gpu=          # Passed to use_gpu in steps/nnet2/align.sh [yes/no]
realign_times=          # List of times on which we realign.  Each time is
                        # floating point number strictly between 0 and 1, which
                        # will be multiplied by the num-iters to get an iteration
                        # number.
num_jobs_align=30       # Number of jobs for realignment
# End configuration section.
frames_per_eg=8 # to be passed on to get_egs2.sh

# On INT/QUIT/TERM, kill every background job that is still running so that
# interrupted training does not leave orphaned processes behind.
trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM

echo "$0 $@"  # Print the command line for logging

# Source the local environment set-up if present, then let parse_options.sh
# process the command-line options.
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# Print the arguments that remain after option parsing.
echo $@
# Exactly four positional arguments are required; otherwise print the usage
# message and exit.  The defaults shown below are kept in sync with the
# configuration section at the top of this script.
if [ $# != 4 ]; then
  echo "Usage: $0 [opts] <data> <lang> <ali-dir> <exp-dir>"
  echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --num-epochs <#epochs|15>                        # Number of epochs of training"
  echo "  --initial-effective-lrate <lrate|0.01>           # effective learning rate at start of training."
  echo "  --final-effective-lrate <lrate|0.001>            # effective learning rate at end of training."
  echo "  --num-hidden-layers <#hidden-layers|3>           # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs"
  echo "  --add-layers-period <#iters|2>                   # Number of iterations between adding hidden layers"
  echo "  --presoftmax-prior-scale-power <power|-0.25>     # use the specified power value on the priors (inverse priors)"
  echo "                                                   # to scale the pre-softmax outputs."
  echo "                                                   # (set to 0.0 to disable the presoftmax element scale)"
  echo "  --mix-up <#pseudo-gaussians|0>                   # This option now does nothing; please remove it."
  echo "  --num-jobs-initial <num-jobs|1>                  # Number of parallel jobs to use for neural net training, at the start."
  echo "  --num-jobs-final <num-jobs|8>                    # Number of parallel jobs to use for neural net training, at the end"
  echo "  --num-threads <num-threads|16>                   # Number of parallel threads per job (will affect results"
  echo "                                                   # as well as speed; may interact with batch size; if you increase"
  echo "                                                   # this, you may want to decrease the batch size."
  echo "  --parallel-opts <opts|\"--num-threads 16 --mem 1G\">      # extra options to pass to e.g. queue.pl for processes that"
  echo "                                                   # use multiple threads... "
  echo "  --io-opts <opts|\"--max-jobs-run 5\">                       # Options given to e.g. queue.pl for jobs that do a lot of I/O."
  echo "  --minibatch-size <minibatch-size|128>            # Size of minibatch to process (note: product with --num-threads"
  echo "                                                   # should not get too large, e.g. >2k)."
  echo "  --samples-per-iter <#samples|400000>             # Number of samples of data to process per iteration, per"
  echo "                                                   # process."
  echo "  --splice-indexes <string|\"layer0/-4:-3:-2:-1:0:1:2:3:4 layer2/-5:-1:3\"> "
  echo "                                                   # Frame indices used for each splice layer."
  echo "                                                   # Format : layer<hidden_layer_index>/<frame_indices>....layer<hidden_layer>/<frame_indices> "
  echo "                                                   # (note: we splice processed, typically 40-dimensional frames)"
  echo "  --lda-dim <dim|''>                               # Dimension to reduce spliced features to with LDA"
  echo "  --realign-times <list-of-times|\"\">             # A list of space-separated floating point numbers between 0.0 and"
  echo "                                                   # 1.0 to specify how far through training realignment is to be done"
  echo "  --align-cmd (utils/run.pl|utils/queue.pl <queue opts>) # passed to align.sh"
  echo "  --align-use-gpu (yes/no)                         # specify whether a GPU is to be used for realignment"
  echo "  --num-jobs-align <#njobs|30>                     # Number of jobs to perform realignment"
  echo "  --stage <stage|-4>                               # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."


  exit 1;
fi

# Positional arguments: <data> <lang> <ali-dir> <exp-dir>.
data=$1; lang=$2; alidir=$3; dir=$4

# Re-alignment needs both an alignment command and an explicit GPU setting.
if [ -n "$realign_times" ]; then
  if [ -z "$align_cmd" ]; then
    echo "$0: realign_times specified but align_cmd not specified" && exit 1
  fi
  if [ -z "$align_use_gpu" ]; then
    echo "$0: realign_times specified but align_use_gpu not specified" && exit 1
  fi
fi

# Check some files: every input listed here must already exist.
for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree; do
  if [ ! -f $f ]; then
    echo "$0: no such file $f" && exit 1
  fi
done


# Set some variables.
# num_leaves is the value on the "num-pdfs" line printed by tree-info; it is
# later passed as --num-targets when the network configs are generated.
num_leaves=`tree-info $alidir/tree 2>/dev/null | grep num-pdfs | awk '{print $2}'` || exit 1
[ -z $num_leaves ] && echo "\$num_leaves is unset" && exit 1
[ "$num_leaves" -eq "0" ] && echo "\$num_leaves is 0" && exit 1

nj=`cat $alidir/num_jobs` || exit 1;  # number of jobs in alignment dir...
# in this dir we'll have just one job.
sdata=$data/split$nj
utils/split_data.sh $data $nj

# Create the experiment directory and record the job count and the tree there.
mkdir -p $dir/log
echo $nj > $dir/num_jobs
cp $alidir/tree $dir

# process the splice_inds string, to get a layer-wise context string
# to be processed by the nnet-components
# this would be mainly used by SpliceComponent|SpliceMaxComponent
python steps/nnet2/make_multisplice_configs.py contexts --splice-indexes "$splice_indexes" $dir || exit -1;
# The python script is expected to have written shell variable assignments to
# $dir/vars; they are echoed for the log and then evaluated in this shell.
context_string=$(cat $dir/vars) || exit -1
echo $context_string
eval $context_string || exit -1; #
  # initializes variables used by get_lda.sh and get_egs2.sh
  # get_lda.sh : first_left_context, first_right_context,
  # get_egs2.sh : nnet_left_context & nnet_right_context

# Options shared by the get_lda.sh and get_egs2.sh calls below; each one is
# only added when the corresponding configuration variable is non-empty.
extra_opts=()
[ ! -z "$cmvn_opts" ] && extra_opts+=(--cmvn-opts "$cmvn_opts")
[ ! -z "$feat_type" ] && extra_opts+=(--feat-type $feat_type)
[ ! -z "$online_ivector_dir" ] && extra_opts+=(--online-ivector-dir $online_ivector_dir)
# transform_dir defaults to the alignment directory.
[ -z "$transform_dir" ] && transform_dir=$alidir
extra_opts+=(--transform-dir $transform_dir)

# Stage -4: run get_lda.sh with the context of the first splice layer.
if [ $stage -le -4 ]; then
  echo "$0: calling get_lda.sh"
  steps/nnet2/get_lda.sh $lda_opts "${extra_opts[@]}" --left-context $first_left_context --right-context $first_right_context --cmd "$cmd" $data $lang $alidir $dir || exit 1;
fi
# these files will have been written by get_lda.sh
# (they are read unconditionally, so they must already exist when the script
# is restarted from a later stage; note that lda_dim from the command line is
# overwritten here by the value stored in $dir/lda_dim).
feat_dim=$(cat $dir/feat_dim) || exit 1;
ivector_dim=$(cat $dir/ivector_dim) || exit 1;
lda_dim=$(cat $dir/lda_dim) || exit 1;

# Stage -3: dump training examples into $dir/egs, unless an existing examples
# directory was supplied via egs_dir.
if [ $stage -le -3 ] && [ -z "$egs_dir" ]; then

  # The examples need the full left/right context of the network.
  extra_opts+=(--left-context $nnet_left_context )
  extra_opts+=(--right-context $nnet_right_context )
  echo "$0: calling get_egs2.sh"
  # NOTE(review): $egs_opts appears twice in the command line below (before
  # "${extra_opts[@]}" and again after --cmd); presumably the first occurrence
  # is redundant -- confirm before removing either one.
  steps/nnet2/get_egs2.sh $egs_opts "${extra_opts[@]}" \
      --samples-per-iter $samples_per_iter --stage $get_egs_stage \
      --io-opts "$io_opts" \
      --cmd "$cmd" $egs_opts \
      --frames-per-eg $frames_per_eg \
      $data $alidir $dir/egs || exit 1;
fi

# Default to the examples generated by this script.
[ -z $egs_dir ] && egs_dir=$dir/egs
# confirm that the provided egs_dir has the necessary context
egs_left_context=$(cat $egs_dir/info/left_context) || exit 1
egs_right_context=$(cat $egs_dir/info/right_context) || exit 1
([[ $egs_left_context -lt $nnet_left_context ]] || [[ $egs_right_context -lt $nnet_right_context ]]) &&
  echo "$0: Provided egs_dir $egs_dir does not have sufficient context to train the neural network." && exit 1;

# frames_per_eg is re-read from the egs directory, so the value stored there
# overrides the one from the configuration section.
frames_per_eg=$(cat $egs_dir/info/frames_per_eg) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; }
# Name the file that was actually being read in the error message.
num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/num_archives"; exit 1; }

# num_archives_expanded considers each separate label-position from
# 0..frames_per_eg-1 to be a separate archive.
num_archives_expanded=$[$num_archives*$frames_per_eg]

# The messages below use the option names this script really accepts
# (--num-jobs-initial / --num-jobs-final).
[ $num_jobs_initial -gt $num_jobs_final ] && \
  echo "$0: --num-jobs-initial cannot exceed --num-jobs-final" && exit 1;

[ $num_jobs_final -gt $num_archives_expanded ] && \
  echo "$0: --num-jobs-final cannot exceed #archives $num_archives_expanded." && exit 1;

if ! [ $num_hidden_layers -ge 1 ]; then
  echo "Invalid num-hidden-layers $num_hidden_layers"
  exit 1
fi

# Stage -2: generate the network config files and create the initial model
# $dir/0.mdl.
if [ $stage -le -2 ]; then
  echo "$0: initializing neural net";
  lda_mat=$dir/lda.mat
  # The network input is the feature vector plus the iVector (if any).
  tot_input_dim=$[$feat_dim+$ivector_dim]

  online_preconditioning_opts="alpha=$alpha num-samples-history=$num_samples_history update-period=$update_period rank-in=$precondition_rank_in rank-out=$precondition_rank_out max-change-per-sample=$max_change_per_sample"

  # The actual learning rate is the effective one times the number of jobs.
  initial_lrate=$(perl -e "print ($initial_effective_lrate*$num_jobs_initial);")

  # create the config files for nnet initialization
  python steps/nnet2/make_multisplice_configs.py  \
    --splice-indexes "$splice_indexes"  \
    --total-input-dim $tot_input_dim  \
    --ivector-dim $ivector_dim  \
    --lda-mat "$lda_mat"  \
    --lda-dim $lda_dim  \
    --pnorm-input-dim $pnorm_input_dim  \
    --pnorm-output-dim  $pnorm_output_dim \
    --online-preconditioning-opts "$online_preconditioning_opts"  \
    --initial-learning-rate $initial_lrate \
    --bias-stddev  $bias_stddev  \
    --num-hidden-layers $num_hidden_layers \
    --num-targets  $num_leaves  \
    configs  $dir || exit -1;

  # Build the initial acoustic model from $dir/nnet.config, the tree and the
  # topology.
  $cmd $dir/log/nnet_init.log \
    nnet-am-init $alidir/tree $lang/topo "nnet-init $dir/nnet.config -|" \
    $dir/0.mdl || exit 1;
fi
# NOTE(review): '[ $fix_nnet ]' only tests that the string is non-empty, which
# holds for both "true" and "false"; as written, fix_nnet is therefore forced
# to true whenever the pnorm input and output dimensions are equal.  Confirm
# whether that is the intended behaviour.
if [ $pnorm_input_dim -eq $pnorm_output_dim ] && [ $fix_nnet ]; then fix_nnet=true;fi
# Stage -1: train the transition probabilities and set the priors of 0.mdl
# from the alignments, then (unless presoftmax_prior_scale_power is exactly
# "0.0") insert a FixedScaleComponent before the softmax layer.
if [ $stage -le -1 ]; then
  echo "Training transition probabilities and setting priors"
  $cmd $dir/log/train_trans.log \
    nnet-train-transitions $dir/0.mdl "ark:gunzip -c $alidir/ali.*.gz|" $dir/0.mdl \
    || exit 1;

  if [ "$presoftmax_prior_scale_power" != "0.0" ]; then
    echo "prepare initial vector for FixedScaleComponent before softmax"
    echo "use priors^$presoftmax_prior_scale_power and rescale to average 1"

    # obtains raw pdf count
    $cmd JOB=1:$nj $dir/log/acc_pdf.JOB.log \
      ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
      post-to-tacc --per-pdf=true --binary=false $alidir/final.mdl ark:- $dir/JOB.pacc || exit 1;
    # Concatenate the per-job counts (one line per job) and drop the parts.
    cat $dir/*.pacc > $dir/pacc
    rm $dir/*.pacc
    # The awk program below sums fields 2..NF-1 over all lines of pacc (the
    # first and last field of each line are skipped), smooths every count by
    # 1% of the integer average count, normalises to priors, raises them to
    # the given power and divides by their mean so that the scales average 1.
    # The result is written as a single bracketed vector.
    awk -v power=$presoftmax_prior_scale_power \
      '{ for(i=2; i<=NF-1; i++) {sum[i]+=$i} }
      END {
        for (i=2; i<=NF-1; i++) {total+=sum[i]}
        ave_pdf=int(total/(NF-2)); total+=0.01*ave_pdf*(NF-2)
        for (i=2; i<=NF-1; i++) {rescale+=((sum[i]+0.01*ave_pdf)/total)^power}
        rescale/=(NF-2)
        printf " [ "; for (i=2; i<=NF-1; i++) {printf("%f ", ((sum[i]+0.01*ave_pdf)/total)^power/rescale)}; print "]"
      }' $dir/pacc > $dir/presoftmax_prior_scale_vecfile

    echo "FixedScaleComponent scales=$dir/presoftmax_prior_scale_vecfile" > $dir/per_element.config
    echo "insert an additional layer of FixedScaleComponent before softmax"
    # inp is the second field of the 'Softmax' line printed by nnet-am-info;
    # it is used as the insertion position for the new component.
    inp=`nnet-am-info $dir/0.mdl | grep 'Softmax' | awk '{print $2}'`
    nnet-init $dir/per_element.config - | nnet-insert --insert-at=$inp --randomize-next-component=false $dir/0.mdl - $dir/0.mdl
  fi
fi

# set num_iters so that as close as possible, we process the data $num_epochs
# times, i.e. $num_iters*$avg_num_jobs == $num_epochs*$num_archives_expanded,
# where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2.

num_archives_to_process=$[$num_epochs*$num_archives_expanded]
num_archives_processed=0
num_iters=$[($num_archives_to_process*2)/($num_jobs_initial+$num_jobs_final)]

# Iteration by which all hidden layers have been added.  This has to be
# assigned BEFORE the check below: with the variable still unset the
# arithmetic expression collapses to "+2" and the check only verifies
# num_iters > 2.
finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period]

! [ $num_iters -gt $[$finish_add_layers_iter+2] ] \
  && echo "$0: Insufficient epochs" && exit 1


# mix_up_iter is the iteration where we've processed about half the data; this
# keeps the overall training procedure fairly invariant to the number of
# initial and final jobs.  (This script no longer mixes up; the value is still
# used for the sanity check below and for choosing how many models to combine.)
# j = initial, k = final, n = num-iters, x = half-of-data epoch,
# p is proportion of data we want to process (e.g. p=0.5 here).
# solve for x if the amount of data processed by epoch x is p
# times the amount by iteration n.
# put this in wolfram alpha:
# solve { x*j + (k-j)*x*x/(2*n) = p * (j*n + (k-j)*n/2), {x} }
# got: x = (j n-sqrt(-n^2 (j^2 (p-1)-k^2 p)))/(j-k) and j!=k and n!=0
# simplified manually to: n * (sqrt((1-p)j^2 + p k^2) - j)/(k-j)
# (for j == k the perl code below uses n*p instead); the result is rounded to
# the nearest integer.
mix_up_iter=$(perl -e '($j,$k,$n,$p)=@ARGV; print int(0.5 + ($j==$k ? $n*$p : $n*(sqrt((1-$p)*$j*$j+$p*$k*$k)-$j)/($k-$j))); ' $num_jobs_initial $num_jobs_final $num_iters 0.5)
! [ $mix_up_iter -gt $finish_add_layers_iter ] && \
  echo "Mix-up-iter is $mix_up_iter, should be greater than $finish_add_layers_iter -> add more epochs?" \
  && exit 1;

echo "$0: Will train for $num_epochs epochs = $num_iters iterations"
echo "$0: Will not do mix up"

# Choose the training binary suffix and its threading option from num_threads.
if ! [ $num_threads -eq 1 ]; then
  # More than one thread: use the multi-threaded trainer.
  parallel_suffix="-parallel"
  parallel_train_opts="--num-threads=$num_threads"
else
  # Exactly one thread: the "-simple" trainer enables us to use GPU code.
  parallel_suffix="-simple"
  parallel_train_opts=
  cuda-compiled || {
    echo "$0: WARNING: you are running with one thread but you have not compiled"
    echo "   for CUDA.  You may be running a setup optimized for GPUs.  If you have"
    echo "   GPUs and have nvcc installed, go to src/ and do ./configure; make"
  }
fi


# Work out how many models go into the final nnet-combine-fast invocation:
#   min(max(max_models_combine, approx_iters_per_epoch_final),
#       2/3 * iters_after_mixup)
# and from that the first iteration whose model takes part in the combination.
approx_iters_per_epoch_final=$(($num_archives_expanded / $num_jobs_final))

# Lower bound: at least about one (final-stage) epoch worth of iterations.
num_models_combine=$max_models_combine
[ $num_models_combine -lt $approx_iters_per_epoch_final ] && \
   num_models_combine=$approx_iters_per_epoch_final

# Upper bound: two thirds of the iterations that follow mix_up_iter.
iters_after_mixup_23=$(( (($num_iters - $mix_up_iter - 1) * 2) / 3 ))
[ $num_models_combine -gt $iters_after_mixup_23 ] && \
  num_models_combine=$iters_after_mixup_23

first_model_combine=$(($num_iters - $num_models_combine + 1))

# x is the iteration counter of the main training loop.
x=0


# Map every element of --realign-times onto an iteration number and remember
# it in realign_this_iter[<iteration>]=<time>.
for realign_time in $realign_times; do
  # Work out the iterations on which we will re-align, if the --realign-times
  # option was used.  This is slightly approximate.
  # An out-of-range element is a usage error: report it and stop, rather than
  # printing the message and carrying on with a meaningless iteration number.
  ! perl -e "exit($realign_time > 0.0 && $realign_time < 1.0 ? 0:1);" && \
    echo "Invalid --realign-times option $realign_times: elements must be strictly between 0 and 1." && \
    exit 1;
  # the next formula is based on the one for mix_up_iter above.
  realign_iter=$(perl -e '($j,$k,$n,$p)=@ARGV; print int(0.5 + ($j==$k ? $n*$p : $n*(sqrt((1-$p)*$j*$j+$p*$k*$k)-$j)/($k-$j))); ' $num_jobs_initial $num_jobs_final $num_iters $realign_time) || exit 1;
  realign_this_iter[$realign_iter]=$realign_time
done

# Examples directory currently in use; it changes after each re-alignment.
cur_egs_dir=$egs_dir

# Main training loop: one pass per iteration x = 0 .. num_iters-1.  Iterations
# below $stage are skipped, but the bookkeeping (job count, learning rate,
# current egs dir, archives processed) is still advanced so that a restarted
# run continues with consistent values.
while [ $x -lt $num_iters ]; do
  [ $x -eq $exit_stage ] && echo "$0: Exiting early due to --exit-stage $exit_stage" && exit 0;
  # nnet-am-fix is only applied during the first half of training.
  if [ $x -gt $[$num_iters/2] ]; then fix_nnet=false; fi
  # The number of jobs grows linearly from num_jobs_initial to num_jobs_final.
  this_num_jobs=$(perl -e "print int(0.5+$num_jobs_initial+($num_jobs_final-$num_jobs_initial)*$x/$num_iters);")

  # The effective learning rate decays exponentially with the number of
  # archives processed (the last iteration uses the final rate exactly); the
  # actual rate is the effective rate times the number of jobs.
  ilr=$initial_effective_lrate; flr=$final_effective_lrate; np=$num_archives_processed; nt=$num_archives_to_process;
  this_learning_rate=$(perl -e "print (($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt))*$this_num_jobs);");

  echo "On iteration $x, learning rate is $this_learning_rate."

  # On a re-alignment iteration switch to a new egs directory (done even when
  # the iteration itself is skipped because of $stage).
  if [ ! -z "${realign_this_iter[$x]}" ]; then
    prev_egs_dir=$cur_egs_dir
    cur_egs_dir=$dir/egs_${realign_this_iter[$x]}
  fi

  if [ $x -ge 0 ] && [ $stage -le $x ]; then
    if [ ! -z "${realign_this_iter[$x]}" ]; then
      time=${realign_this_iter[$x]}

      echo "Getting average posterior for purposes of adjusting the priors."
      # Note: this just uses CPUs, using a smallish subset of data.
      # always use the first egs archive, which makes the script simpler;
      # we're using different random subsets of it.
      rm $dir/post.$x.*.vec 2>/dev/null
      $cmd JOB=1:$num_jobs_compute_prior $dir/log/get_post.$x.JOB.log \
        nnet-copy-egs --srand=JOB --frame=random ark:$prev_egs_dir/egs.1.ark ark:- \| \
        nnet-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \
        nnet-compute-from-egs "nnet-to-raw-nnet $dir/$x.mdl -|" ark:- ark:- \| \
        matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1;

      sleep 3;  # make sure there is time for $dir/post.$x.*.vec to appear.

      $cmd $dir/log/vector_sum.$x.log \
        vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1;
      rm $dir/post.$x.*.vec;

      echo "Re-adjusting priors based on computed posteriors"
      $cmd $dir/log/adjust_priors.$x.log \
        nnet-adjust-priors $dir/$x.mdl $dir/post.$x.vec $dir/$x.mdl || exit 1;

      sleep 2

      # Re-align the data with the current model, then relabel the previous
      # examples with the new alignments to create the new egs directory.
      steps/nnet2/align.sh --nj $num_jobs_align --cmd "$align_cmd" --use-gpu $align_use_gpu \
        --transform-dir "$transform_dir" --online-ivector-dir "$online_ivector_dir" \
        --iter $x $data $lang $dir $dir/ali_$time || exit 1

      steps/nnet2/relabel_egs2.sh --cmd "$cmd" --iter $x $dir/ali_$time \
        $prev_egs_dir $cur_egs_dir || exit 1

      if $cleanup && [[ $prev_egs_dir =~ $dir/egs* ]]; then
        steps/nnet2/remove_egs.sh $prev_egs_dir
      fi
    fi

    # Set off jobs doing some diagnostics, in the background.
    # The diagnostic subsets are read from the current egs dir.
    $cmd $dir/log/compute_prob_valid.$x.log \
      nnet-compute-prob $dir/$x.mdl ark:$cur_egs_dir/valid_diagnostic.egs &
    $cmd $dir/log/compute_prob_train.$x.log \
      nnet-compute-prob $dir/$x.mdl ark:$cur_egs_dir/train_diagnostic.egs &
    if [ $x -gt 0 ] && [ ! -f $dir/log/mix_up.$[$x-1].log ]; then
      $cmd $dir/log/progress.$x.log \
        nnet-show-progress --use-gpu=no $dir/$[$x-1].mdl $dir/$x.mdl \
        ark:$cur_egs_dir/train_diagnostic.egs '&&' \
        nnet-am-info $dir/$x.mdl &
    fi

    echo "Training neural net (pass $x)"

    if [ $x -gt 0 ] && \
      [ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \
      [ $[$x%$add_layers_period] -eq 0 ]; then
      do_average=false # if we've just added a layer, don't do averaging; take the best.
      cur_num_hidden_layers=$[$x/$add_layers_period];
      inp=`nnet-am-info $dir/$x.mdl | grep 'Softmax' | awk '{print $2}'`

      # Insert the new hidden layer before the final layers: one position
      # further back when the FixedScaleComponent is present before the softmax.
      if [ "$presoftmax_prior_scale_power" != "0.0" ]; then
        inp=$[$inp-2]
      else
        inp=$[$inp-1]
      fi

      mdl="nnet-init --srand=$x $dir/hidden_${cur_num_hidden_layers}.config - | nnet-insert --insert-at=$inp $dir/$x.mdl - - | nnet-am-copy --learning-rate=$this_learning_rate - -|"
    else
      do_average=true
      if [ $x -eq 0 ]; then do_average=false; fi # on iteration 0, pick the best, don't average.
      mdl="nnet-am-copy --learning-rate=$this_learning_rate $dir/$x.mdl -|"
    fi
    if $do_average; then
      this_minibatch_size=$minibatch_size
    else
      # on iteration zero or when we just added a layer, use a smaller minibatch
      # size and pick the best job's model instead of averaging: the
      # model-averaging doesn't seem to be helpful when the model is changing
      # too fast (i.e. it worsens the objective function), and the smaller
      # minibatch size will help to keep the update stable.
      this_minibatch_size=$[$minibatch_size/2];
    fi

    rm $dir/.error 2>/dev/null


    ( # this sub-shell is so that when we "wait" below,
      # we only wait for the training jobs that we just spawned,
      # not the diagnostic jobs that we spawned above.

      # We can't easily use a single parallel SGE job to do the main training,
      # because the computation of which archive and which --frame option
      # to use for each job is a little complex, so we spawn each one separately.
      for n in $(seq $this_num_jobs); do
        k=$[$num_archives_processed + $n - 1]; # k is a zero-based index that we'll derive
                                               # the other indexes from.
        archive=$[($k%$num_archives)+1]; # work out the 1-based archive index.
        frame=$[(($k/$num_archives)%$frames_per_eg)]; # work out the 0-based frame
        # index; this increases more slowly than the archive index because the
        # same archive with different frame indexes will give similar gradients,
        # so we want to separate them in time.

        $cmd $parallel_opts $dir/log/train.$x.$n.log \
          nnet-train$parallel_suffix $parallel_train_opts \
          --minibatch-size=$this_minibatch_size --srand=$x "$mdl" \
          "ark,bg:nnet-copy-egs --frame=$frame ark:$cur_egs_dir/egs.$archive.ark ark:-|nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-|" \
          $dir/$[$x+1].$n.mdl || touch $dir/.error &
      done
      wait
    )
    # the error message below is not that informative, but $cmd will
    # have printed a more specific one.
    [ -f $dir/.error ] && echo "$0: error on iteration $x of training" && exit 1;

    # List of the per-job models written by this iteration.
    nnets_list=
    for n in `seq 1 $this_num_jobs`; do
      nnets_list="$nnets_list $dir/$[$x+1].$n.mdl"
    done

    if $do_average; then
      # average the output of the different jobs.
      $cmd $dir/log/average.$x.log \
        nnet-am-average $nnets_list $dir/$[$x+1].mdl || exit 1;
    else
      # choose the best from the different jobs: the one whose training log
      # reports the highest (last) log-prob-per-frame.  The job count passed to
      # perl must be $this_num_jobs, the number of jobs actually run on this
      # iteration; with an undefined variable in its place the log pattern
      # shifts into the job-count slot, the loop never runs and job 1 always
      # wins.
      n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) {
          $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn";
          undef $logprob; while (<F>) { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } }
          close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob;
          $best_n=$n; } } print "$best_n\n"; ' $this_num_jobs $dir/log/train.$x.%d.log) || exit 1;
      [ -z "$n" ] && echo "Error getting best model" && exit 1;
      cp $dir/$[$x+1].$n.mdl $dir/$[$x+1].mdl || exit 1;
    fi
    if $fix_nnet; then
      # do nnet-am-fix to fix some pathology in the network
      nnet-am-fix --max-average-deriv=$max_average --min-average-deriv=$min_average $dir/$[$x+1].mdl $dir/$[$x+1].mdl 2>$dir/log/fix.$x.log || exit 1;
    fi

    if [ "$mix_up" -gt 0 ] && [ $x -eq $mix_up_iter ]; then
      echo "Warning: the mix up opertion is disabled!"
      echo "    Ignore mix up leaves number specified"
    fi
    # The per-job models are no longer needed once the combined model exists.
    rm $nnets_list
    [ ! -f $dir/$[$x+1].mdl ] && exit 1;
    # Remove the model from two iterations back, unless it is a multiple of
    # 100 or is needed for the final combination.
    if [ -f $dir/$[$x-1].mdl ] && $cleanup && \
       [ $[($x-1)%100] -ne 0  ] && [ $[$x-1] -lt $first_model_combine ]; then
      rm $dir/$[$x-1].mdl
    fi
  fi
  x=$[$x+1]
  num_archives_processed=$[$num_archives_processed+$this_num_jobs]
done


# Combine the models of the last num_models_combine iterations into
# $dir/final.mdl, normalise it, and start the final diagnostics.
if [ $stage -le $num_iters ]; then
  echo "Doing final combination to produce final.mdl"

  # Now do combination.
  nnets_list=()
  # the if..else..fi statement below sets 'nnets_list'.
  if [ $max_models_combine -lt $num_models_combine ]; then
    # The number of models to combine is too large, e.g. > 20.  In this case,
    # each argument to nnet-combine-fast will be an average of multiple models.
    cur_offset=0 # current offset from first_model_combine.
    for n in $(seq $max_models_combine); do
      # Group n covers the offsets cur_offset .. next_offset-1.
      next_offset=$[($n*$num_models_combine)/$max_models_combine]
      sub_list=""
      for o in $(seq $cur_offset $[$next_offset-1]); do
        iter=$[$first_model_combine+$o]
        mdl=$dir/$iter.mdl
        [ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1;
        sub_list="$sub_list $mdl"
      done
      # Each list entry is a piped command that averages one group of models.
      nnets_list[$[$n-1]]="nnet-am-average $sub_list - |"
      cur_offset=$next_offset
    done
  else
    # Few enough models: pass every model file directly.
    nnets_list=
    for n in $(seq 0 $[num_models_combine-1]); do
      iter=$[$first_model_combine+$n]
      mdl=$dir/$iter.mdl
      [ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1;
      nnets_list[$n]=$mdl
    done
  fi


  # Below, use --use-gpu=no to disable nnet-combine-fast from using a GPU, as
  # if there are many models it can give out-of-memory error; set num-threads to 8
  # to speed it up (this isn't ideal...)
  # num_egs is taken from the last field of the last line that nnet-copy-egs
  # prints; the minibatch size is the per-thread share of the combination
  # examples (rounded up), capped at 512.
  num_egs=`nnet-copy-egs ark:$cur_egs_dir/combine.egs ark:/dev/null 2>&1 | tail -n 1 | awk '{print $NF}'`
  mb=$[($num_egs+$combine_num_threads-1)/$combine_num_threads]
  [ $mb -gt 512 ] && mb=512
  # Setting --initial-model to a large value makes it initialize the combination
  # with the average of all the models.  It's important not to start with a
  # single model, or, due to the invariance to scaling that these nonlinearities
  # give us, we get zero diagonal entries in the fisher matrix that
  # nnet-combine-fast uses for scaling, which after flooring and inversion, has
  # the effect that the initial model chosen gets much higher learning rates
  # than the others.  This prevents the optimization from working well.
  $cmd $combine_parallel_opts $dir/log/combine.log \
    nnet-combine-fast --initial-model=100000 --num-lbfgs-iters=40 --use-gpu=no \
      --num-threads=$combine_num_threads \
      --verbose=3 --minibatch-size=$mb "${nnets_list[@]}" ark:$cur_egs_dir/combine.egs \
      $dir/final.mdl || exit 1;

  # Normalize stddev for affine or block affine layers that are followed by a
  # pnorm layer and then a normalize layer.
  $cmd $dir/log/normalize.log \
    nnet-normalize-stddev $dir/final.mdl $dir/final.mdl || exit 1;

  # Compute the probability of the final, combined model with
  # the same subset we used for the previous compute_probs, as the
  # different subsets will lead to different probs.
  # (Both jobs are started in the background; this block does not wait for
  # them.)
  $cmd $dir/log/compute_prob_valid.final.log \
    nnet-compute-prob $dir/final.mdl ark:$cur_egs_dir/valid_diagnostic.egs &
  $cmd $dir/log/compute_prob_train.final.log \
    nnet-compute-prob $dir/final.mdl ark:$cur_egs_dir/train_diagnostic.egs &
fi

# Stage num_iters+1: re-estimate the priors of final.mdl from its average
# posterior on a subset of the training examples, then rewrite final.mdl in
# place with the adjusted priors.
if [ $stage -le $[$num_iters+1] ]; then
  echo "Getting average posterior for purposes of adjusting the priors."
  # Note: this just uses CPUs, using a smallish subset of data.
  # Remove per-job posterior files left over from an earlier run so that the
  # glob passed to vector-sum below only matches files written by this run.
  rm $dir/post.$x.*.vec 2>/dev/null
  # Each of the $num_jobs_compute_prior jobs takes a different random subset
  # (seeded by its JOB index) of the first egs archive, runs it through the
  # raw network and sums the outputs into $dir/post.$x.JOB.vec.  The pipes are
  # escaped (\|) so that they are passed on to $cmd rather than being
  # interpreted by this shell.
  $cmd JOB=1:$num_jobs_compute_prior $dir/log/get_post.$x.JOB.log \
    nnet-copy-egs --frame=random --srand=JOB ark:$cur_egs_dir/egs.1.ark ark:- \| \
    nnet-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \
    nnet-compute-from-egs "nnet-to-raw-nnet $dir/final.mdl -|" ark:- ark:- \| \
    matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1;

  sleep 3;  # make sure there is time for $dir/post.$x.*.vec to appear.

  # Sum the per-job vectors into a single posterior vector.
  $cmd $dir/log/vector_sum.$x.log \
   vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1;

  # The per-job vectors are no longer needed once they have been summed.
  rm $dir/post.$x.*.vec;

  echo "Re-adjusting priors based on computed posteriors"
  $cmd $dir/log/adjust_priors.final.log \
    nnet-adjust-priors $dir/final.mdl $dir/post.$x.vec $dir/final.mdl || exit 1;
fi


# Refuse to go any further (and in particular to clean up) unless training
# actually produced a final model.
if [ ! -f $dir/final.mdl ]; then
  echo "$0: $dir/final.mdl does not exist."
  exit 1;
fi

sleep 2

echo Done

if $cleanup; then
  echo Cleaning up data
  # Only remove the egs when asked to, and only when the egs directory
  # matches the $dir/egs* pattern.
  if $remove_egs; then
    if [[ $cur_egs_dir =~ $dir/egs* ]]; then
      steps/nnet2/remove_egs.sh $cur_egs_dir
    fi
  fi

  echo Removing most of the models
  # Keep every 100th iteration's model and the model of iteration num_iters;
  # every other intermediate model that still exists is deleted.
  for x in $(seq 0 $num_iters); do
    if [ $(($x%100)) -ne 0 ] && [ $x -ne $num_iters ] && [ -f $dir/$x.mdl ]; then
      rm $dir/$x.mdl
    fi
  done
fi


================================================
FILE: egs/steps/nnet2/train_multisplice_ensemble.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2014  Johns Hopkins University (Author: Daniel Povey).
#           2013  Xiaohui Zhang
#           2013  Guoguo Chen
#           2014  Vimal Manohar
#           2014  Vijayaditya Peddinti
# Apache 2.0.

# This script trains an ensemble of --ensemble-size networks jointly, using
# nnet-train-ensemble.  It appears to be derived from
# train_multisplice_accel2.sh, which is a modified version of
# train_pnorm_multisplice2.sh (still using pnorm).  The "accel" refers to the
# fact that we increase the number of jobs during training (from
# --num-jobs-initial to --num-jobs-final).  We dropped "pnorm" from the name as
# it was getting too long.


# Begin configuration section.
# Every variable below is a default; the values can be overridden from the
# command line (the variables are set before parse_options.sh is sourced).
cmd=run.pl
num_epochs=15      # Number of epochs of training;
                   # the number of iterations is worked out from this.
initial_effective_lrate=0.01
final_effective_lrate=0.001
bias_stddev=0.5
pnorm_input_dim=3000
pnorm_output_dim=300
minibatch_size=128 # by default use a smallish minibatch size for neural net
                   # training; this controls instability which would otherwise
                   # be a problem with multi-threaded update.

samples_per_iter=400000 # each iteration of training, see this many samples
                        # per job.  This option is passed to get_egs2.sh
num_jobs_initial=1  # Number of neural net jobs to run in parallel at the start of training
num_jobs_final=8   # Number of neural net jobs to run in parallel at the end of training
prior_subset_size=10000 # 10k samples per job, for computing priors.  Should be
                        # more than enough.
num_jobs_compute_prior=10 # these are single-threaded, run on CPU.
get_egs_stage=0
online_ivector_dir=
remove_egs=true  # set to false to disable removing egs.

max_models_combine=20 # The "max_models_combine" is the maximum number of models we give
  # to the final 'combine' stage, but these models will themselves be averages of
  # iteration-number ranges.

shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
                # on each iter.  You could set it to 0 or to a large value for complete
                # randomization, but this would both consume memory and cause spikes in
                # disk I/O.  Smaller is easier on disk and memory but less random.  It's
                # not a huge deal though, as samples are anyway randomized right at the start.
                # (The point of this is to get data in different minibatches on different
                # iterations, since in the preconditioning method, 2 samples in the same
                # minibatch can affect each others' gradients.)

add_layers_period=2 # by default, add new layers every 2 iterations.
num_hidden_layers=3
stage=-4
exit_stage=-100 # you can set this to terminate the training early.  Exits before running this stage

splice_indexes="layer0/-4:-3:-2:-1:0:1:2:3:4 layer2/-5:-1:3"
# Format : layer<hidden_layer>/<frame_indices>....layer<hidden_layer>/<frame_indices> "
# note: hidden layers which are composed of one or more components,
# so hidden layer indexing is different from component count


io_opts="--max-jobs-run 5" # for jobs with a lot of I/O, limits the number running at one time.
randprune=4.0 # speeds up LDA.
alpha=4.0 # relates to preconditioning.
update_period=4 # relates to online preconditioning: says how often we update the subspace.
num_samples_history=2000 # relates to online preconditioning
max_change_per_sample=0.075
precondition_rank_in=20  # relates to online preconditioning
precondition_rank_out=80 # relates to online preconditioning

mix_up=0 # Number of components to mix up to (should be > #tree leaves, if
        # specified.)
num_threads=16
parallel_opts="--num-threads 16 --mem 1G"
  # by default we use 16 threads; this lets the queue know.
  # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.
combine_num_threads=8
combine_parallel_opts="--num-threads 8"  # queue options for the "combine" stage.
cleanup=true
egs_dir=
lda_opts=
lda_dim=
egs_opts=
transform_dir=     # If supplied, overrides alidir
postdir=
cmvn_opts=  # will be passed to get_lda.sh and get_egs2.sh, if supplied.
            # only relevant for "raw" features, not lda.
feat_type=  # Can be used to force "raw" features.
align_cmd=              # The cmd that is passed to steps/nnet2/align.sh
align_use_gpu=          # Passed to use_gpu in steps/nnet2/align.sh [yes/no]
realign_times=          # List of times on which we realign.  Each time is
                        # floating point number strictly between 0 and 1, which
                        # will be multiplied by the num-iters to get an iteration
                        # number.
num_jobs_align=30       # Number of jobs for realignment
srand=0 # random seed used to initialize the nnet
        # NOTE(review): nothing in this script appears to read $srand; the
        # networks are initialized with --srand=JOB -- confirm before relying on it.
initial_beta=0.1 # value of the --beta option of nnet-train-ensemble at the start of
                 # training; it is interpolated linearly over the iterations ...
final_beta=3     # ... up to this value on the last iteration.
ensemble_size=4  # number of networks in the ensemble; models are written as
                 # <iter>.<member>.mdl with member in 1..ensemble_size.
# End configuration section.

# If the script is interrupted, kill every background job that is still
# running so that no training or diagnostic processes are left behind.
trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# Exactly four positional arguments are required; otherwise print the usage
# message.  The defaults quoted below are the ones set in the configuration
# section at the top of this script.
if [ $# != 4 ]; then
  echo "Usage: $0 [opts] <data> <lang> <ali-dir> <exp-dir>"
  echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --num-epochs <#epochs|15>                        # Number of epochs of training"
  echo "  --initial-effective-lrate <lrate|0.01>           # effective learning rate at start of training."
  echo "  --final-effective-lrate <lrate|0.001>            # effective learning rate at end of training."
  echo "  --num-hidden-layers <#hidden-layers|3>           # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs"
  echo "  --add-layers-period <#iters|2>                   # Number of iterations between adding hidden layers"
  echo "  --mix-up <#pseudo-gaussians|0>                   # Can be used to have multiple targets in final output layer,"
  echo "                                                   # per context-dependent state.  Try a number several times #states."
  echo "  --num-jobs-initial <num-jobs|1>                  # Number of parallel jobs to use for neural net training, at the start."
  echo "  --num-jobs-final <num-jobs|8>                    # Number of parallel jobs to use for neural net training, at the end"
  echo "  --num-threads <num-threads|16>                   # Number of parallel threads per job (will affect results"
  echo "                                                   # as well as speed; may interact with batch size; if you increase"
  echo "                                                   # this, you may want to decrease the batch size."
  echo "  --parallel-opts <opts|\"--num-threads 16 --mem 1G\">      # extra options to pass to e.g. queue.pl for processes that"
  echo "                                                   # use multiple threads... "
  echo "  --io-opts <opts|\"--max-jobs-run 5\">                       # Options given to e.g. queue.pl for jobs that do a lot of I/O."
  echo "  --minibatch-size <minibatch-size|128>            # Size of minibatch to process (note: product with --num-threads"
  echo "                                                   # should not get too large, e.g. >2k)."
  echo "  --samples-per-iter <#samples|400000>             # Number of samples of data to process per iteration, per"
  echo "                                                   # process."
  echo "  --splice-indexes <string|\"layer0/-4:-3:-2:-1:0:1:2:3:4 layer2/-5:-1:3\"> "
  echo "                                                   # Frame indices used for each splice layer."
  echo "                                                   # Format : layer<hidden_layer_index>/<frame_indices>....layer<hidden_layer>/<frame_indices> "
  echo "                                                   # (note: we splice processed, typically 40-dimensional frames)"
  echo "  --lda-dim <dim|''>                               # Dimension to reduce spliced features to with LDA"
  echo "  --ensemble-size <#models|4>                      # Number of networks trained jointly with nnet-train-ensemble"
  echo "  --initial-beta <beta|0.1>                        # --beta given to nnet-train-ensemble at the start of training"
  echo "  --final-beta <beta|3>                            # --beta given to nnet-train-ensemble at the end of training"
  echo "  --realign-times <list-of-times|''>               # A list of space-separated numbers strictly between 0 and 1;"
  echo "                                                   # each is multiplied by the number of iterations to get an"
  echo "                                                   # iteration at which realignment is to be done"
  echo "  --align-cmd (utils/run.pl|utils/queue.pl <queue opts>) # passed to align.sh"
  echo "  --align-use-gpu (yes/no)                         # specify is gpu is to be used for realignment"
  echo "  --num-jobs-align <#njobs|30>                     # Number of jobs to perform realignment"
  echo "  --stage <stage|-4>                               # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."


  exit 1;
fi

data=$1
lang=$2
alidir=$3
dir=$4

# Realignment needs both the command used to run align.sh and an explicit
# choice of whether it should use a GPU.
if [ ! -z "$realign_times" ]; then
  [ -z "$align_cmd" ] && echo "$0: realign_times specified but align_cmd not specified" && exit 1
  [ -z "$align_use_gpu" ] && echo "$0: realign_times specified but align_use_gpu not specified" && exit 1
fi

# Make sure the inputs we are going to read are actually there.
for f in $data/feats.scp $lang/L.fst $alidir/final.mdl $alidir/tree; do
  if [ ! -f $f ]; then
    echo "$0: no such file $f"
    exit 1;
  fi
done

# We need either posteriors in $postdir or hard alignments in $alidir.
if [ ! -f $postdir/post.1.scp ] && [ ! -f $alidir/ali.1.gz ]; then
  echo "$0: no (soft) alignments provided"
  exit 1;
fi

# Number of pdfs in the tree; this becomes the number of network targets.
num_leaves=$(tree-info $alidir/tree 2>/dev/null | grep num-pdfs | awk '{print $2}') || exit 1
if [ -z $num_leaves ]; then
  echo "\$num_leaves is unset"
  exit 1
fi
if [ "$num_leaves" -eq "0" ]; then
  echo "\$num_leaves is 0"
  exit 1
fi

# Split the data the same way the alignment directory was split.
nj=$(cat $alidir/num_jobs) || exit 1;
sdata=$data/split$nj
utils/split_data.sh $data $nj

# Set up the experiment directory.
mkdir -p $dir/log
echo $nj > $dir/num_jobs
cp $alidir/tree $dir

# Turn the --splice-indexes string into per-layer context information for the
# nnet components (mainly SpliceComponent|SpliceMaxComponent).  The helper
# writes shell variable assignments to $dir/vars, which we print and eval.
python steps/nnet2/make_multisplice_configs.py contexts --splice-indexes "$splice_indexes" $dir || exit -1;
context_string=$(cat $dir/vars) || exit -1
echo $context_string
# The eval defines the variables used further down:
#   get_lda.sh  : first_left_context, first_right_context
#   get_egs2.sh : nnet_left_context, nnet_right_context
eval $context_string || exit -1;

# Options shared by get_lda.sh and get_egs2.sh; optional ones are only added
# when the corresponding variable is non-empty.
extra_opts=()
if [ -n "$cmvn_opts" ]; then
  extra_opts+=(--cmvn-opts "$cmvn_opts")
fi
if [ -n "$feat_type" ]; then
  extra_opts+=(--feat-type $feat_type)
fi
if [ -n "$online_ivector_dir" ]; then
  extra_opts+=(--online-ivector-dir $online_ivector_dir)
fi
# The transform directory defaults to the alignment directory.
if [ -z "$transform_dir" ]; then
  transform_dir=$alidir
fi
extra_opts+=(--transform-dir $transform_dir)

if [ $stage -le -4 ]; then
  echo "$0: calling get_lda.sh"
  steps/nnet2/get_lda.sh $lda_opts "${extra_opts[@]}" \
    --left-context $first_left_context --right-context $first_right_context \
    --cmd "$cmd" $data $lang $alidir $dir || exit 1;
fi
# The three files read here will have been written by get_lda.sh.
feat_dim=$(cat $dir/feat_dim) || exit 1;
ivector_dim=$(cat $dir/ivector_dim) || exit 1;
lda_dim=$(cat $dir/lda_dim) || exit 1;

# Dump the training examples, unless an existing egs directory was supplied
# with --egs-dir.
if [ $stage -le -3 ] && [ -z "$egs_dir" ]; then

  extra_opts+=(--left-context $nnet_left_context )
  extra_opts+=(--right-context $nnet_right_context )
  echo "$0: calling get_egs2.sh"
  # $egs_opts is passed once only; it used to be repeated after --cmd.
  steps/nnet2/get_egs2.sh $egs_opts "${extra_opts[@]}" \
      --postdir "$postdir" \
      --samples-per-iter $samples_per_iter --stage $get_egs_stage \
      --io-opts "$io_opts" \
      --cmd "$cmd" \
      $data $alidir $dir/egs || exit 1;
fi

if [ -z "$egs_dir" ]; then
  egs_dir=$dir/egs
fi

frames_per_eg=$(cat $egs_dir/info/frames_per_eg) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; }
num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/num_archives"; exit 1; }

# num_archives_expanded considers each separate label-position from
# 0..frames_per_eg-1 to be a separate archive.
num_archives_expanded=$[$num_archives*$frames_per_eg]

# The messages below name the options as they are actually spelled on the
# command line (--num-jobs-initial / --num-jobs-final).
[ $num_jobs_initial -gt $num_jobs_final ] && \
  echo "$0: --num-jobs-initial cannot exceed --num-jobs-final" && exit 1;

[ $num_jobs_final -gt $num_archives_expanded ] && \
  echo "$0: --num-jobs-final cannot exceed #archives $num_archives_expanded." && exit 1;

if ! [ $num_hidden_layers -ge 1 ]; then
  echo "Invalid num-hidden-layers $num_hidden_layers"
  exit 1
fi

# Stage -2: write the network config files and initialize one model per
# ensemble member ($dir/0.<member>.mdl).
if [ $stage -le -2 ]; then
  echo "$0: initializing neural net";
  lda_mat=$dir/lda.mat
  # The network input is the base features plus the iVector.
  tot_input_dim=$[$feat_dim+$ivector_dim]

  online_preconditioning_opts="alpha=$alpha num-samples-history=$num_samples_history update-period=$update_period rank-in=$precondition_rank_in rank-out=$precondition_rank_out max-change-per-sample=$max_change_per_sample"

  # The actual learning rate is the effective learning rate times the number
  # of parallel jobs.
  initial_lrate=$(perl -e "print ($initial_effective_lrate*$num_jobs_initial);")

  # create the config files for nnet initialization
  python steps/nnet2/make_multisplice_configs.py  \
    --splice-indexes "$splice_indexes"  \
    --total-input-dim $tot_input_dim  \
    --ivector-dim $ivector_dim  \
    --lda-mat "$lda_mat"  \
    --lda-dim $lda_dim  \
    --pnorm-input-dim $pnorm_input_dim  \
    --pnorm-output-dim  $pnorm_output_dim \
    --online-preconditioning-opts "$online_preconditioning_opts"  \
    --initial-learning-rate $initial_lrate \
    --bias-stddev  $bias_stddev  \
    --num-hidden-layers $num_hidden_layers \
    --num-targets  $num_leaves  \
    configs  $dir || exit -1;

    # One job per ensemble member; each member is initialized from the same
    # nnet.config with a different random seed (its JOB index).
    $cmd $parallel_opts JOB=1:$ensemble_size $dir/log/nnet_init.JOB.log \
      nnet-am-init $alidir/tree $lang/topo "nnet-init --srand=JOB $dir/nnet.config -|" \
      $dir/0.JOB.mdl || exit 1;
fi

# Stage -1: train the transition probabilities (and set the priors) of each
# ensemble member from the alignments, rewriting $dir/0.<member>.mdl in place.
if [ $stage -le -1 ]; then
  echo "Training transition probabilities and setting priors"
  $cmd $parallel_opts JOB=1:$ensemble_size $dir/log/train_trans.JOB.log \
    nnet-train-transitions $dir/0.JOB.mdl "ark:gunzip -c $alidir/ali.*.gz|" $dir/0.JOB.mdl \
    || exit 1;
fi

# set num_iters so that as close as possible, we process the data $num_epochs
# times, i.e. $num_iters*$avg_num_jobs) == $num_epochs*$num_archives_expanded,
# where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2.

num_archives_to_process=$[$num_epochs*$num_archives_expanded]
num_archives_processed=0
num_iters=$[($num_archives_to_process*2)/($num_jobs_initial+$num_jobs_final)]

# Iteration by which all hidden layers have been added.  This has to be set
# before the check below reads it: with the variable still unset the check
# silently degenerated into "num_iters > 2".
finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period]

! [ $num_iters -gt $[$finish_add_layers_iter+2] ] \
  && echo "$0: Insufficient epochs" && exit 1


# mix up at the iteration where we've processed about half the data; this keeps
# the overall training procedure fairly invariant to the number of initial and
# final jobs.
# j = initial, k = final, n = num-iters, x = half-of-data epoch,
# p is proportion of data we want to process (e.g. p=0.5 here).
# solve for x if the amount of data processed by epoch x is p
# times the amount by iteration n.
# put this in wolfram alpha:
# solve { x*j + (k-j)*x*x/(2*n) = p * (j*n + (k-j)*n/2), {x} }
# got: x = (j n-sqrt(-n^2 (j^2 (p-1)-k^2 p)))/(j-k) and j!=k and n!=0
# simplified manually to: n * (sqrt((1-p)j^2 + p k^2) - j)/(k-j)
# (for j == k the expression is undefined and n*p is used instead).
mix_up_iter=$(perl -e '($j,$k,$n,$p)=@ARGV; print int(0.5 + ($j==$k ? $n*$p : $n*(sqrt((1-$p)*$j*$j+$p*$k*$k)-$j)/($k-$j))); ' $num_jobs_initial $num_jobs_final $num_iters 0.5)
! [ $mix_up_iter -gt $finish_add_layers_iter ] && \
  echo "Mix-up-iter is $mix_up_iter, should be greater than $finish_add_layers_iter -> add more epochs?" \
  && exit 1;

echo "$0: Will train for $num_epochs epochs = $num_iters iterations"
[ $mix_up -gt 0 ] && echo "$0: Will mix up on iteration $mix_up_iter"

# Choose the training-program suffix and thread options from --num-threads.
if ! [ $num_threads -eq 1 ]; then
  # More than one thread: multi-threaded training.
  parallel_suffix="-parallel"
  parallel_train_opts="--num-threads=$num_threads"
else
  # A single thread lets us use the GPU code.
  parallel_suffix="-simple"
  parallel_train_opts=
  cuda-compiled || {
    echo "$0: WARNING: you are running with one thread but you have not compiled"
    echo "   for CUDA.  You may be running a setup optimized for GPUs.  If you have"
    echo "   GPUs and have nvcc installed, go to src/ and do ./configure; make"
  }
fi


# Decide how many of the last models go into the final nnet-combine-fast
# call:
#   min(max(max_models_combine, approx_iters_per_epoch_final),
#       2/3 * iters_after_mixup)
approx_iters_per_epoch_final=$(($num_archives_expanded/$num_jobs_final))

# Lower bound: at least about one epoch's worth of iterations at the final
# number of jobs, and at least max_models_combine.
num_models_combine=$max_models_combine
[ $num_models_combine -lt $approx_iters_per_epoch_final ] && \
  num_models_combine=$approx_iters_per_epoch_final

# Upper bound: two thirds of the iterations that come after mix-up.
iters_after_mixup_23=$(( (($num_iters-$mix_up_iter-1)*2)/3 ))
[ $num_models_combine -gt $iters_after_mixup_23 ] && \
  num_models_combine=$iters_after_mixup_23

# First iteration whose model takes part in the final combination.
first_model_combine=$(($num_iters-$num_models_combine+1))

x=0


# Work out the iterations on which we will re-align, if the --realign-times
# option was used.  This is slightly approximate.  realign_this_iter is
# indexed by iteration number and holds the corresponding realign time.
for realign_time in $realign_times; do
  # Each element must be strictly between 0 and 1.  Stop here if it is not:
  # previously the message was printed but training carried on with the
  # invalid value.
  if ! perl -e "exit($realign_time > 0.0 && $realign_time < 1.0 ? 0:1);"; then
    echo "Invalid --realign-times option $realign_times: elements must be strictly between 0 and 1.";
    exit 1;
  fi
  # the next formula is based on the one for mix_up_iter above.
  realign_iter=$(perl -e '($j,$k,$n,$p)=@ARGV; print int(0.5 + ($j==$k ? $n*$p : $n*(sqrt((1-$p)*$j*$j+$p*$k*$k)-$j)/($k-$j))); ' $num_jobs_initial $num_jobs_final $num_iters $realign_time) || exit 1;
  realign_this_iter[$realign_iter]=$realign_time
done

# Directory the training examples are currently read from; it changes when
# the examples are relabeled after a realignment.
cur_egs_dir=$egs_dir

# Main training loop.  Iteration x reads the models $dir/$x.<member>.mdl and
# writes $dir/$[x+1].<member>.mdl for every ensemble member.  Iterations below
# --stage are skipped, but x and num_archives_processed are still advanced so
# that a restarted run sees the same schedule.
while [ $x -lt $num_iters ]; do
  [ $x -eq $exit_stage ] && echo "$0: Exiting early due to --exit-stage $exit_stage" && exit 0;

  # Number of parallel jobs on this iteration: interpolated linearly between
  # num_jobs_initial and num_jobs_final and rounded to the nearest integer.
  this_num_jobs=$(perl -e "print int(0.5+$num_jobs_initial+($num_jobs_final-$num_jobs_initial)*$x/$num_iters);")

  # The effective learning rate decays exponentially from the initial to the
  # final value as a function of the archives processed so far (the last
  # iteration uses the final value); it is then multiplied by the number of
  # jobs to get the actual learning rate.
  ilr=$initial_effective_lrate; flr=$final_effective_lrate; np=$num_archives_processed; nt=$num_archives_to_process;
  this_learning_rate=$(perl -e "print (($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt))*$this_num_jobs);");

  echo "On iteration $x, learning rate is $this_learning_rate."

  # On a realignment iteration, switch to a new egs directory named after the
  # realign time.  This is done even for skipped iterations so that
  # cur_egs_dir is right after a restart.
  if [ ! -z "${realign_this_iter[$x]}" ]; then
    prev_egs_dir=$cur_egs_dir
    cur_egs_dir=$dir/egs_${realign_this_iter[$x]}
  fi

  if [ $x -ge 0 ] && [ $stage -le $x ]; then
    if [ ! -z "${realign_this_iter[$x]}" ]; then
      time=${realign_this_iter[$x]}

      # NOTE(review): this realignment branch reads and writes $dir/$x.mdl,
      # whereas the training code below only produces per-member models
      # $dir/$x.<member>.mdl -- confirm that $dir/$x.mdl exists before using
      # --realign-times with this script.
      echo "Getting average posterior for purposes of adjusting the priors."
      # Note: this just uses CPUs, using a smallish subset of data.
      # always use the first egs archive, which makes the script simpler;
      # we're using different random subsets of it.
      rm $dir/post.$x.*.vec 2>/dev/null
      $cmd JOB=1:$num_jobs_compute_prior $dir/log/get_post.$x.JOB.log \
        nnet-copy-egs --srand=JOB --frame=random ark:$prev_egs_dir/egs.1.ark ark:- \| \
        nnet-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \
        nnet-compute-from-egs "nnet-to-raw-nnet $dir/$x.mdl -|" ark:- ark:- \| \
        matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1;

      sleep 3;  # make sure there is time for $dir/post.$x.*.vec to appear.

      $cmd $dir/log/vector_sum.$x.log \
        vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1;
      rm $dir/post.$x.*.vec;

      echo "Re-adjusting priors based on computed posteriors"
      $cmd $dir/log/adjust_priors.$x.log \
        nnet-adjust-priors $dir/$x.mdl $dir/post.$x.vec $dir/$x.mdl || exit 1;

      sleep 2

      steps/nnet2/align.sh --nj $num_jobs_align --cmd "$align_cmd" --use-gpu $align_use_gpu \
        --transform-dir "$transform_dir" --online-ivector-dir "$online_ivector_dir" \
        --iter $x $data $lang $dir $dir/ali_$time || exit 1

      steps/nnet2/relabel_egs2.sh --cmd "$cmd" --iter $x $dir/ali_$time \
        $prev_egs_dir $cur_egs_dir || exit 1

      if $cleanup && [[ $prev_egs_dir =~ $dir/egs* ]]; then
        steps/nnet2/remove_egs.sh $prev_egs_dir
      fi
    fi

    # Set off jobs doing some diagnostics, in the background.
    # Use the egs dir from the previous iteration for the diagnostics.
    # Only the first ensemble member is evaluated.
    $cmd $dir/log/compute_prob_valid.$x.log \
      nnet-compute-prob $dir/$x.1.mdl ark:$cur_egs_dir/valid_diagnostic.egs &
    $cmd $dir/log/compute_prob_train.$x.log \
      nnet-compute-prob $dir/$x.1.mdl ark:$cur_egs_dir/train_diagnostic.egs &
    # Skip the progress report right after mixing up.  The mix-up logs are
    # written per ensemble member (mix_up.<iter>.<member>.log), so test the log
    # of member 1, whose models are the ones compared here.
    if [ $x -gt 0 ] && [ ! -f $dir/log/mix_up.$[$x-1].1.log ]; then
      $cmd $dir/log/progress.$x.log \
        nnet-show-progress --use-gpu=no $dir/$[$x-1].1.mdl $dir/$x.1.mdl \
        ark:$cur_egs_dir/train_diagnostic.egs '&&' \
        nnet-am-info $dir/$x.1.mdl &
    fi

    echo "Training neural net (pass $x)"

    # mdl[i] is the input model specifier ("rxfilename") for ensemble member i.
    declare -A mdl
    if [ $x -gt 0 ] && \
      [ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \
      [ $[$x%$add_layers_period] -eq 0 ]; then
      do_average=false # we are adding a layer: don't do averaging, take the best.
      cur_num_hidden_layers=$[$x/$add_layers_period];
      for i in `seq 1 $ensemble_size`; do
        mdl[$i]="nnet-init --srand=$[$x+$i] $dir/hidden_${cur_num_hidden_layers}.config - | nnet-insert $dir/$x.$i.mdl - - | nnet-am-copy --learning-rate=$this_learning_rate - -|"
      done
    else
      do_average=true
      if [ $x -eq 0 ]; then do_average=false; fi # on iteration 0, pick the best, don't average.
      for i in `seq 1 $ensemble_size`; do
        mdl[$i]="nnet-am-copy --learning-rate=$this_learning_rate $dir/$x.$i.mdl -|"
      done
    fi
    if $do_average; then
      this_minibatch_size=$minibatch_size
    else
      # on iteration zero or when we just added a layer, use a smaller minibatch
      # size: the model-averaging doesn't seem to be helpful
      # when the model is changing too fast (i.e. it worsens the objective
      # function), and the smaller minibatch size will help to keep
      # the update stable.
      this_minibatch_size=$[$minibatch_size/2];
    fi

    rm $dir/.error 2>/dev/null


    ( # this sub-shell is so that when we "wait" below,
      # we only wait for the training jobs that we just spawned,
      # not the diagnostic jobs that we spawned above.

      # The input models and beta are the same for every job of this
      # iteration, so work them out once.  Each input specifier is wrapped in
      # single quotes so that it is passed on to $cmd as one argument.
      nnets_ensemble_in=
      for i in `seq 1 $ensemble_size`; do
        nnets_ensemble_in="$nnets_ensemble_in '${mdl[$i]}'"
      done

      # beta goes linearly from initial_beta to final_beta over the iterations.
      beta=`perl -e '($x,$n,$i,$f)=@ARGV; print ($i+$x*($f-$i)/$n);' $[$x+1] $num_iters $initial_beta $final_beta`;

      # We can't easily use a single parallel SGE job to do the main training,
      # because the computation of which archive and which --frame option
      # to use for each job is a little complex, so we spawn each one separately.
      for n in $(seq $this_num_jobs); do
        k=$[$num_archives_processed + $n - 1]; # k is a zero-based index that we'll derive
                                               # the other indexes from.
        archive=$[($k%$num_archives)+1]; # work out the 1-based archive index.
        frame=$[(($k/$num_archives)%$frames_per_eg)]; # work out the 0-based frame
        # index; this increases more slowly than the archive index because the
        # same archive with different frame indexes will give similar gradients,
        # so we want to separate them in time.

        # Job n writes one output model per ensemble member.
        nnets_ensemble_out=
        for i in `seq 1 $ensemble_size`; do
          nnets_ensemble_out="${nnets_ensemble_out} $dir/$[$x+1].$n.$i.mdl "
        done

        $cmd $parallel_opts $dir/log/train.$x.$n.log \
          nnet-train-ensemble \
          --minibatch-size=$this_minibatch_size --srand=$x \
          --beta=$beta $nnets_ensemble_in \
          "ark,bg:nnet-copy-egs --frame=$frame ark:$cur_egs_dir/egs.$archive.ark ark:-|nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-|" \
          ark:- $nnets_ensemble_out || touch $dir/.error &
      done
      wait
    )
    # the error message below is not that informative, but $cmd will
    # have printed a more specific one.
    [ -f $dir/.error ] && echo "$0: error on iteration $x of training" && exit 1;

    # Merge the outputs of the parallel jobs, separately for each member.
    for i in `seq 1 $ensemble_size`; do
      nnets_list=
      for n in `seq 1 $this_num_jobs`; do
        nnets_list="$nnets_list $dir/$[$x+1].$n.$i.mdl"
      done

      if $do_average; then
        # average the output of the different jobs.  The log name includes
        # the member index so that members do not overwrite each other's log.
        $cmd $dir/log/average.$x.$i.log \
          nnet-am-average $nnets_list $dir/$[$x+1].$i.mdl ||  exit 1;
      else
        # choose the best from the different jobs: the job whose training log
        # reports the highest log-prob-per-frame (its last occurrence).  The
        # number of jobs is $this_num_jobs; passing an undefined variable here
        # shifted the arguments and made perl always answer "1".
        n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) {
            $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn";
            undef $logprob; while (<F>) { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } }
            close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob;
            $best_n=$n; } } print "$best_n\n"; ' $this_num_jobs $dir/log/train.$x.%d.log) || exit 1;
        [ -z "$n" ] && echo "Error getting best model" && exit 1;
        cp $dir/$[$x+1].$n.$i.mdl $dir/$[$x+1].$i.mdl || exit 1;
      fi

      if [ "$mix_up" -gt 0 ] && [ $x -eq $mix_up_iter ]; then
        # mix up.
        echo Mixing up from $num_leaves to $mix_up components
        $cmd $dir/log/mix_up.$x.$i.log \
          nnet-am-mixup --min-count=10 --num-mixtures=$mix_up \
          $dir/$[$x+1].$i.mdl $dir/$[$x+1].$i.mdl || exit 1;
      fi
      # The per-job models are no longer needed.
      rm $nnets_list
      [ ! -f $dir/$[$x+1].$i.mdl ] && exit 1;
      # Remove the model from two iterations back, unless it is a multiple of
      # 100 or takes part in the final combination.
      if [ -f $dir/$[$x-1].$i.mdl ] && $cleanup && \
         [ $[($x-1)%100] -ne 0  ] && [ $[$x-1] -lt $first_model_combine ]; then
        rm $dir/$[$x-1].$i.mdl
      fi
    done
  fi
  x=$[$x+1]
  num_archives_processed=$[$num_archives_processed+$this_num_jobs]
done


# Stage num_iters: for every ensemble member, combine the models of its last
# num_models_combine iterations into $dir/final.<member>.mdl.
if [ $stage -le $num_iters ]; then
  echo "Doing final combination to produce final.mdl"

  # Clear any stale failure marker once, before any job is started.  It must
  # not be removed while jobs are running, or one member's failure could be
  # erased by the next loop iteration.
  rm $dir/.error 2>/dev/null

  # Minibatch size: spread the combination examples over the combine threads
  # (rounding up), capped at 512.  This does not depend on the ensemble
  # member, so it is computed once.
  num_egs=`nnet-copy-egs ark:$cur_egs_dir/combine.egs ark:/dev/null 2>&1 | tail -n 1 | awk '{print $NF}'`
  mb=$[($num_egs+$combine_num_threads-1)/$combine_num_threads]
  [ $mb -gt 512 ] && mb=512

  ( # sub-shell, so that the "wait" below only waits for the combination jobs.
    for i in `seq 1 $ensemble_size`; do
      # Now do combination.
      nnets_list=()
      # the if..else..fi statement below sets 'nnets_list'.
      if [ $max_models_combine -lt $num_models_combine ]; then
        # The number of models to combine is too large, e.g. > 20.  In this case,
        # each argument to nnet-combine-fast will be an average of multiple models.
        cur_offset=0 # current offset from first_model_combine.
        for n in $(seq $max_models_combine); do
          next_offset=$[($n*$num_models_combine)/$max_models_combine]
          sub_list=""
          for o in $(seq $cur_offset $[$next_offset-1]); do
            iter=$[$first_model_combine+$o]
            this_mdl=$dir/$iter.$i.mdl
            # "exit" only leaves the sub-shell; its status is checked below.
            [ ! -f $this_mdl ] && echo "Expected $this_mdl to exist" && exit 1;
            sub_list="$sub_list $this_mdl"
          done
          nnets_list[$[$n-1]]="nnet-am-average $sub_list - |"
          cur_offset=$next_offset
        done
      else
        for n in $(seq 0 $[num_models_combine-1]); do
          iter=$[$first_model_combine+$n]
          this_mdl=$dir/$iter.$i.mdl
          [ ! -f $this_mdl ] && echo "Expected $this_mdl to exist" && exit 1;
          nnets_list[$n]=$this_mdl
        done
      fi
      # Below, use --use-gpu=no to disable nnet-combine-fast from using a GPU, as
      # if there are many models it can give out-of-memory error; set num-threads to 8
      # to speed it up (this isn't ideal...)
      # Setting --initial-model to a large value makes it initialize the combination
      # with the average of all the models.  It's important not to start with a
      # single model, or, due to the invariance to scaling that these nonlinearities
      # give us, we get zero diagonal entries in the fisher matrix that
      # nnet-combine-fast uses for scaling, which after flooring and inversion, has
      # the effect that the initial model chosen gets much higher learning rates
      # than the others.  This prevents the optimization from working well.

      # The members are combined in parallel; a failing job leaves a marker
      # file that is checked once all of them have finished.
      $cmd $combine_parallel_opts  $dir/log/combine.$i.log \
        nnet-combine-fast --initial-model=100000 --num-lbfgs-iters=40 --use-gpu=no \
          --num-threads=$combine_num_threads \
          --verbose=3 --minibatch-size=$mb "${nnets_list[@]}" ark:$cur_egs_dir/combine.egs \
          $dir/final.$i.mdl || touch $dir/.error &
    done
    wait
  ) || exit 1  # a model that was expected to exist is missing.
  [ -f $dir/.error ] && echo "$0: error when combining models." && exit 1;

  # Normalize stddev for affine or block affine layers that are followed by a
  # pnorm layer and then a normalize layer.
  $cmd JOB=1:$ensemble_size $dir/log/normalize.JOB.log \
    nnet-normalize-stddev $dir/final.JOB.mdl $dir/final.JOB.mdl || exit 1;

  # Compute the probability of the final, combined model with
  # the same subset we used for the previous compute_probs, as the
  # different subsets will lead to different probs.
  # Only the first ensemble member is evaluated; the jobs run in the background.
  $cmd $dir/log/compute_prob_valid.final.log \
    nnet-compute-prob $dir/final.1.mdl ark:$cur_egs_dir/valid_diagnostic.egs &
  $cmd $dir/log/compute_prob_train.final.log \
    nnet-compute-prob $dir/final.1.mdl ark:$cur_egs_dir/train_diagnostic.egs &
fi

# Final stage: for every ensemble member i, estimate the average posterior on
# a subset of the training examples and use it to re-adjust the priors of that
# member's final model ($dir/final.$i.mdl is rewritten in place).
# NOTE(review): $x is not assigned anywhere in this block; the post.$x.* and
# *.$x.log file names rely on whatever value earlier code left in $x -- confirm.
if [ $stage -le $[$num_iters+1] ]; then
  for i in `seq 1 $ensemble_size`; do
    echo "Getting average posterior for purposes of adjusting the priors."
    # Note: this just uses CPUs, using a smallish subset of data.
    # Remove per-job vectors left over from a previous pass or run.
    rm $dir/post.$x.*.vec 2>/dev/null
    # Each of the $num_jobs_compute_prior jobs writes one summed posterior
    # vector, $dir/post.$x.JOB.vec, computed with final.$i.mdl.
    $cmd JOB=1:$num_jobs_compute_prior $dir/log/get_post.$x.JOB.log \
      nnet-copy-egs --frame=random --srand=JOB ark:$cur_egs_dir/egs.1.ark ark:- \| \
      nnet-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \
      nnet-compute-from-egs "nnet-to-raw-nnet $dir/final.$i.mdl -|" ark:- ark:- \| \
      matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1;

    sleep 3;  # make sure there is time for $dir/post.$x.*.vec to appear.

    # Sum the per-job vectors into a single $dir/post.$x.vec.
    $cmd $dir/log/vector_sum.$x.log \
     vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1;

    rm $dir/post.$x.*.vec;

    echo "Re-adjusting priors based on computed posteriors"
    # The log path below does not contain $i, so every pass of this loop
    # writes to the same adjust_priors.final.log.
    $cmd $dir/log/adjust_priors.final.log \
      nnet-adjust-priors $dir/final.$i.mdl $dir/post.$x.vec $dir/final.$i.mdl || exit 1;
  done
fi
# The first ensemble member becomes the exported final model.
cp $dir/final.1.mdl $dir/final.mdl


# Stop here, before any cleanup, if the final model was not produced:
# we don't want to clean up if the training didn't succeed.
if ! [ -f $dir/final.mdl ]; then
  echo "$0: $dir/final.mdl does not exist."
  exit 1
fi

sleep 2

echo Done

if $cleanup; then
  echo Cleaning up data
  if $remove_egs; then
    if [[ $cur_egs_dir =~ $dir/egs* ]]; then
      steps/nnet2/remove_egs.sh $cur_egs_dir
    fi
  fi
  echo Removing most of the models
  for x in $(seq 0 $num_iters); do
    # Keep every 100th iteration, and keep the models of the last iteration
    # (the ones which combine to form the final model).
    if [ $(( $x % 100 )) -eq 0 ] || [ $x -eq $num_iters ]; then
      continue
    fi
    for i in $(seq 1 $ensemble_size); do
      if [ -f $dir/$x.$i.mdl ]; then
        rm $dir/$x.$i.mdl
      fi
    done
  done
fi


================================================
FILE: egs/steps/nnet2/train_pnorm.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).
#           2013  Xiaohui Zhang
#           2013  Guoguo Chen
#           2014  Vimal Manohar
# Apache 2.0.


# This script trains neural network with pnorm nonlinearities.
# The difference with train_tanh.sh is that, instead of setting
# hidden_layer_size, you should set pnorm_input_dim and pnorm_output_dim.
# Also the P value (the order of the p-norm) should be set.
#
# [Vimal Manohar - Oct 2014]
# The script now supports realignment during training, which can be done by
# specifying realign_epochs.

# Begin configuration section.
# Every variable below is a default; after parse_options.sh is sourced further
# down, the values seen by the rest of the script may have been overridden
# from the command line.

# Job-dispatch wrapper used to launch every step (run.pl, queue.pl, ...).
cmd=run.pl
num_epochs=15      # Number of epochs during which we reduce
                   # the learning rate; number of iterations is worked out from this.
num_epochs_extra=5 # Number of epochs after we stop reducing
                   # the learning rate.
num_iters_final=20 # Number of final iterations whose models are handed to
                   # the final combination (nnet-combine-fast).
# Learning rate written into the initial network config and the start/end
# points of the per-iteration learning-rate schedule.
initial_learning_rate=0.04
final_learning_rate=0.004
# bias-stddev given to the hidden AffineComponentPreconditioned layers.
bias_stddev=0.5
softmax_learning_rate_factor=1.0 # In the default setting keep the same learning rate.

combine_regularizer=1.0e-14 # Small regularizer so that parameters won't go crazy.
# Dimensions and order of the PnormComponent of each hidden layer.
pnorm_input_dim=3000
pnorm_output_dim=300
p=2
minibatch_size=128 # by default use a smallish minibatch size for neural net
                   # training; this controls instability which would otherwise
                   # be a problem with multi-threaded update.  Note: it also
                   # interacts with the "preconditioned" update which generally
                   # works better with larger minibatch size, so it's not
                   # completely cost free.

samples_per_iter=200000 # each iteration of training, see this many samples
                        # per job.  This option is passed to get_egs.sh
num_jobs_nnet=16   # Number of neural net jobs to run in parallel.  This option
                   # is passed to get_egs.sh.
# Passed to get_egs.sh as its --stage.
get_egs_stage=0

shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
                # on each iter.  You could set it to 0 or to a large value for complete
                # randomization, but this would both consume memory and cause spikes in
                # disk I/O.  Smaller is easier on disk and memory but less random.  It's
                # not a huge deal though, as samples are anyway randomized right at the start.

add_layers_period=2 # by default, add new layers every 2 iterations.
num_hidden_layers=3
# Steps guarded by a stage number lower than this value are skipped, which
# allows resuming a partially completed run.
stage=-5

io_opts="--max-jobs-run 5" # for jobs with a lot of I/O, limits the number running at one time.
splice_width=4 # meaning +- 4 frames on each side for second LDA
# NOTE(review): randprune is not referenced anywhere else in this script.
randprune=4.0 # speeds up LDA.
# alpha and max-change values given to every AffineComponentPreconditioned.
alpha=4.0
max_change=10.0
mix_up=0 # Number of components to mix up to (should be > #tree leaves, if
        # specified.)
num_threads=16
parallel_opts="--num-threads 16 --mem 1G" # by default we use 16 threads; this lets the queue know.
  # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.
# If true, examples and most intermediate models are deleted at the end.
cleanup=true
# If set, get_egs.sh is not called and examples are read from this directory.
egs_dir=
# Extra options passed to get_lda.sh.
lda_opts=
# NOTE(review): this value is overwritten from $dir/lda_dim after the
# get_lda.sh stage and is not passed to get_lda.sh by this script.
lda_dim=
# Extra options passed to get_egs.sh.
egs_opts=
transform_dir=     # If supplied, overrides alidir
cmvn_opts=  # will be passed to get_lda.sh and get_egs.sh, if supplied.
            # only relevant for "raw" features, not lda.
feat_type=  # Can be used to force "raw" features.
prior_subset_size=10000 # 10k samples per job, for computing priors.  Should be
                        # more than enough.
align_cmd=              # The cmd that is passed to steps/nnet2/align.sh
align_use_gpu=          # Passed to use_gpu in steps/nnet2/align.sh [yes/no]
realign_epochs=         # List of epochs, the beginning of which realignment is done
num_jobs_align=30       # Number of jobs for realignment

# End configuration section.

echo "$0 $@"  # Print the command line for logging

# Pick up the Kaldi environment if present, then let parse_options.sh turn
# --some-option arguments into assignments to the configuration variables.
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# Exactly four positional arguments are required; otherwise print the usage
# message and exit.  The defaults quoted in the message mirror the values in
# the configuration section at the top of this script.
if [ $# != 4 ]; then
  echo "Usage: $0 [opts] <data> <lang> <ali-dir> <exp-dir>"
  echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --num-epochs <#epochs|15>                        # Number of epochs of main training"
  echo "                                                   # while reducing learning rate (determines #iterations, together"
  echo "                                                   # with --samples-per-iter and --num-jobs-nnet)"
  echo "  --num-epochs-extra <#epochs-extra|5>             # Number of extra epochs of training"
  echo "                                                   # after learning rate fully reduced"
  echo "  --initial-learning-rate <initial-learning-rate|0.04> # Learning rate at start of training, e.g. 0.02 for small"
  echo "                                                       # data, 0.01 for large data"
  echo "  --final-learning-rate  <final-learning-rate|0.004>   # Learning rate at end of training, e.g. 0.004 for small"
  echo "                                                   # data, 0.001 for large data"
  echo "  --num-hidden-layers <#hidden-layers|3>           # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs"
  echo "  --add-layers-period <#iters|2>                   # Number of iterations between adding hidden layers"
  echo "  --mix-up <#pseudo-gaussians|0>                   # Can be used to have multiple targets in final output layer,"
  echo "                                                   # per context-dependent state.  Try a number several times #states."
  echo "  --num-jobs-nnet <num-jobs|16>                    # Number of parallel jobs to use for main neural net"
  echo "                                                   # training (will affect results as well as speed; try 8, 16)"
  echo "                                                   # Note: if you increase this, you may want to also increase"
  echo "                                                   # the learning rate."
  echo "  --num-threads <num-threads|16>                   # Number of parallel threads per job (will affect results"
  echo "                                                   # as well as speed; may interact with batch size; if you increase"
  echo "                                                   # this, you may want to decrease the batch size."
  echo "  --parallel-opts <opts|\"--num-threads 16 --mem 1G\">      # extra options to pass to e.g. queue.pl for processes that"
  echo "                                                   # use multiple threads... "
  echo "  --io-opts <opts|\"--max-jobs-run 5\">                       # Options given to e.g. queue.pl for jobs that do a lot of I/O."
  echo "  --minibatch-size <minibatch-size|128>            # Size of minibatch to process (note: product with --num-threads"
  echo "                                                   # should not get too large, e.g. >2k)."
  echo "  --samples-per-iter <#samples|200000>             # Number of samples of data to process per iteration, per"
  echo "                                                   # process."
  echo "  --splice-width <width|4>                         # Number of frames on each side to append for feature input"
  echo "                                                   # (note: we splice processed, typically 40-dimensional frames)"
  echo "  --lda-dim <dim|250>                              # Dimension to reduce spliced features to with LDA"
  echo "  --num-iters-final <#iters|20>                    # Number of final iterations to give to nnet-combine-fast to "
  echo "                                                   # interpolate parameters (the weights are learned with a validation set)"
  echo "  --egs-opts <opts>                                # Extra options to pass to get_egs.sh"
  echo "  --lda-opts <opts>                                # Extra options to pass to get_lda.sh"
  echo "  --realign-epochs <list-of-epochs|\"\">           # A list of space-separated epoch indices the beginning of which"
  echo "                                                   # realignment is to be done"
  echo "  --align-cmd (utils/run.pl|utils/queue.pl <queue opts>) # passed to align.sh"
  echo "  --align-use-gpu (yes/no)                         # specify if gpu is to be used for realignment"
  echo "  --num-jobs-align <#njobs|30>                     # Number of jobs to perform realignment"
  echo "  --stage <stage|-5>                               # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."

  exit 1;
fi

# Positional arguments: data dir, lang dir, alignment dir, experiment dir.
data=$1
lang=$2
alidir=$3
dir=$4

# Realignment needs both the alignment command and the GPU setting.
if [ -n "$realign_epochs" ]; then
  if [ -z "$align_cmd" ]; then
    echo "$0: realign_epochs specified but align_cmd not specified"
    exit 1
  fi
  if [ -z "$align_use_gpu" ]; then
    echo "$0: realign_epochs specified but align_use_gpu not specified"
    exit 1
  fi
fi

# Every input file this script reads directly must already exist.
for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree; do
  if ! [ -f $f ]; then
    echo "$0: no such file $f"
    exit 1
  fi
done


# Number of pdfs in the tree; it becomes the output dimension of the network.
num_leaves=$(tree-info $alidir/tree 2>/dev/null | awk '/num-pdfs/ {print $2}') || exit 1
if [ -z $num_leaves ]; then
  echo "\$num_leaves is unset"
  exit 1
fi
if [ "$num_leaves" -eq "0" ]; then
  echo "\$num_leaves is 0"
  exit 1
fi

# Split the data the same way the alignment directory was split.
nj=$(cat $alidir/num_jobs) || exit 1
sdata=$data/split$nj
utils/split_data.sh $data $nj

# Seed the experiment directory with what later stages expect to find there.
mkdir -p $dir/log
echo $nj > $dir/num_jobs
cp $alidir/tree $dir

utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1
cp $lang/phones.txt $dir || exit 1

# Options shared by the get_lda.sh and get_egs.sh calls.  The optional ones
# are appended only when the corresponding variable is non-empty.
extra_opts=()
if [ -n "$cmvn_opts" ]; then
  extra_opts+=(--cmvn-opts "$cmvn_opts")
fi
if [ -n "$feat_type" ]; then
  extra_opts+=(--feat-type $feat_type)
fi
if [ -n "$online_ivector_dir" ]; then
  extra_opts+=(--online-ivector-dir $online_ivector_dir)
fi
# Transforms default to the ones in the alignment directory.
if [ -z "$transform_dir" ]; then
  transform_dir=$alidir
fi
extra_opts+=(--transform-dir $transform_dir --splice-width $splice_width)

# Stage -4: estimate the LDA transform.
if [ $stage -le -4 ]; then
  echo "$0: calling get_lda.sh"
  if ! steps/nnet2/get_lda.sh $lda_opts "${extra_opts[@]}" --cmd "$cmd" $data $lang $alidir $dir; then
    exit 1
  fi
fi

# these files will have been written by get_lda.sh
feat_dim=$(cat $dir/feat_dim) || exit 1
lda_dim=$(cat $dir/lda_dim) || exit 1

# Stage -3: dump training examples, unless the caller supplied --egs-dir.
# NOTE(review): $egs_opts appears twice on the get_egs.sh command line.
if [ $stage -le -3 ]; then
  if [ -z "$egs_dir" ]; then
    echo "$0: calling get_egs.sh"
    if ! steps/nnet2/get_egs.sh $egs_opts "${extra_opts[@]}" \
        --samples-per-iter $samples_per_iter \
        --num-jobs-nnet $num_jobs_nnet --stage $get_egs_stage \
        --cmd "$cmd" $egs_opts --io-opts "$io_opts" \
        $data $lang $alidir $dir; then
      exit 1
    fi
  fi
fi
[ -z $egs_dir ] && egs_dir=$dir/egs

# The examples directory dictates the iterations per epoch and the number of
# parallel training jobs; warn when the latter differs from what was requested.
iters_per_epoch=$(cat $egs_dir/iters_per_epoch) || exit 1
if ! [ $num_jobs_nnet -eq $(cat $egs_dir/num_jobs_nnet) ]; then
  echo "$0: Warning: using --num-jobs-nnet=$(cat $egs_dir/num_jobs_nnet) from $egs_dir"
fi
num_jobs_nnet=$(cat $egs_dir/num_jobs_nnet) || exit 1


# At least one hidden layer is required.
[ $num_hidden_layers -ge 1 ] || {
  echo "Invalid num-hidden-layers $num_hidden_layers"
  exit 1
}

# Stage -2: write the network configs and create the initial model 0.mdl.
if [ $stage -le -2 ]; then
  echo "$0: initializing neural net";

  lda_mat=$dir/lda.mat
  ext_lda_dim=$lda_dim
  ext_feat_dim=$feat_dim

  # Parameter stddev of the hidden affine layers: 1/sqrt(pnorm_input_dim).
  stddev=$(perl -e "print 1.0/sqrt($pnorm_input_dim);")

  # hidden.config holds the part of the config corresponding to a single
  # hidden layer; the training loop needs it to add new layers.
  cat >$dir/hidden.config <<EOF
AffineComponentPreconditioned input-dim=$pnorm_output_dim output-dim=$pnorm_input_dim alpha=$alpha max-change=$max_change learning-rate=$initial_learning_rate param-stddev=$stddev bias-stddev=$bias_stddev
PnormComponent input-dim=$pnorm_input_dim output-dim=$pnorm_output_dim p=$p
NormalizeComponent dim=$pnorm_output_dim
EOF

  # nnet.config describes the initial network: splicing, the fixed LDA
  # transform, one hidden layer and the softmax output layer.
  cat >$dir/nnet.config <<EOF
SpliceComponent input-dim=$ext_feat_dim left-context=$splice_width right-context=$splice_width
FixedAffineComponent matrix=$lda_mat
AffineComponentPreconditioned input-dim=$ext_lda_dim output-dim=$pnorm_input_dim alpha=$alpha max-change=$max_change learning-rate=$initial_learning_rate param-stddev=$stddev bias-stddev=$bias_stddev
PnormComponent input-dim=$pnorm_input_dim output-dim=$pnorm_output_dim p=$p
NormalizeComponent dim=$pnorm_output_dim
AffineComponentPreconditioned input-dim=$pnorm_output_dim output-dim=$num_leaves alpha=$alpha max-change=$max_change learning-rate=$initial_learning_rate param-stddev=0 bias-stddev=0
SoftmaxComponent dim=$num_leaves
EOF

  if ! $cmd $dir/log/nnet_init.log \
      nnet-am-init $alidir/tree $lang/topo "nnet-init $dir/nnet.config -|" \
      $dir/0.mdl; then
    exit 1
  fi
fi

# Stage -1: train the transition model from the alignments; 0.mdl is
# rewritten in place.
if [ $stage -le -1 ]; then
  echo "Training transition probabilities and setting priors"
  if ! $cmd $dir/log/train_trans.log \
      nnet-train-transitions $dir/0.mdl "ark:gunzip -c $alidir/ali.*.gz|" $dir/0.mdl; then
    exit 1
  fi
fi

# Iteration budget: a phase with a decreasing learning rate followed by a
# phase with a constant one.
num_iters_reduce=$(( $num_epochs * $iters_per_epoch ))
num_iters_extra=$(( $num_epochs_extra * $iters_per_epoch ))
num_iters=$(( $num_iters_reduce + $num_iters_extra ))

echo "$0: Will train for $num_epochs + $num_epochs_extra epochs, equalling "
echo "$0: $num_iters_reduce + $num_iters_extra = $num_iters iterations, "
echo "$0: (while reducing learning rate) + (with constant learning rate)."

# Mixing up happens halfway between the iteration at which the last hidden
# layer has been added and the end of training.
finish_add_layers_iter=$(( $num_hidden_layers * $add_layers_period ))
mix_up_iter=$(( ($num_iters + $finish_add_layers_iter) / 2 ))

# Multi-threaded training is the default; with exactly one thread switch to
# the "-simple" trainer, which enables us to use GPU code.
train_suffix="-parallel --num-threads=$num_threads"
if [ $num_threads -eq 1 ]; then
  train_suffix="-simple"
  cuda-compiled || {
    echo "$0: WARNING: you are running with one thread but you have not compiled"
    echo "   for CUDA.  You may be running a setup optimized for GPUs.  If you have"
    echo "   GPUs and have nvcc installed, go to src/ and do ./configure; make"
  }
fi

x=0

# Map each realignment epoch to the iteration at which it starts;
# realign_this_iter is indexed by iteration and holds the epoch.
for realign_epoch in $realign_epochs; do
  realign_iter=$(perl -e 'print int($ARGV[0] * $ARGV[1]);' $realign_epoch $iters_per_epoch)
  realign_this_iter[$realign_iter]=$realign_epoch
done

cur_egs_dir=$egs_dir

# Main training loop: iteration x reads $dir/$x.mdl and produces
# $dir/$[x+1].mdl.  Iterations below $stage are skipped, except for the
# bookkeeping of the examples directory, which is always replayed.
while [ $x -lt $num_iters ]; do

  # On a realignment iteration, switch to a new examples directory named
  # after the epoch and remember the one used so far.
  if [ ! -z "${realign_this_iter[$x]}" ]; then
    prev_egs_dir=$cur_egs_dir
    cur_egs_dir=$dir/egs_${realign_this_iter[$x]}
  fi

  if [ $x -ge 0 ] && [ $stage -le $x ]; then
    if [ ! -z "${realign_this_iter[$x]}" ]; then
      epoch=${realign_this_iter[$x]}

      # Before realigning, re-estimate the priors of the current model from
      # its average posterior over a subset of the previous examples.
      echo "Getting average posterior for purposes of adjusting the priors."
      # Note: this just uses CPUs, using a smallish subset of data.
      rm $dir/post.*.vec 2>/dev/null
      $cmd JOB=1:$num_jobs_nnet $dir/log/get_post.JOB.log \
        nnet-subset-egs --n=$prior_subset_size ark:$prev_egs_dir/egs.JOB.0.ark ark:- \| \
        nnet-compute-from-egs "nnet-to-raw-nnet $dir/$x.mdl -|" ark:- ark:- \| \
        matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.JOB.vec || exit 1;

      sleep 3;  # make sure there is time for $dir/post.*.vec to appear.

      $cmd $dir/log/vector_sum.log \
        vector-sum $dir/post.*.vec $dir/post.vec || exit 1;

      rm $dir/post.*.vec;

      echo "Re-adjusting priors based on computed posteriors"
      $cmd $dir/log/adjust_priors.$x.log \
        nnet-adjust-priors $dir/$x.mdl $dir/post.vec $dir/$x.mdl || exit 1;

      sleep 2

      # Realign the data with the current model, then relabel the previous
      # examples with the new alignments into the new examples directory.
      steps/nnet2/align.sh --nj $num_jobs_align --cmd "$align_cmd" --use-gpu $align_use_gpu \
        --transform-dir "$transform_dir" \
        --iter $x $data $lang $dir $dir/ali_$epoch || exit 1

      steps/nnet2/relabel_egs.sh --cmd "$cmd" --iter $x $dir/ali_$epoch \
        $prev_egs_dir $cur_egs_dir || exit 1

      # NOTE(review): =~ is an unanchored regex match here, so "egs*" means
      # "eg" followed by any number of "s", anywhere in the path.
      if $cleanup && [[ $prev_egs_dir =~ $dir/egs* ]]; then
        steps/nnet2/remove_egs.sh $prev_egs_dir
      fi
    fi

    # Set off jobs doing some diagnostics, in the background.
    # Use the egs dir from the previous iteration for the diagnostics
    # NOTE(review): the commands below read from $cur_egs_dir, which on a
    # realignment iteration has already been switched to the new directory.
    $cmd $dir/log/compute_prob_valid.$x.log \
      nnet-compute-prob $dir/$x.mdl ark:$cur_egs_dir/valid_diagnostic.egs &
    $cmd $dir/log/compute_prob_train.$x.log \
      nnet-compute-prob $dir/$x.mdl ark:$cur_egs_dir/train_diagnostic.egs &
    # The progress report is skipped for the first iteration and when the
    # previous iteration wrote a mix-up log.
    if [ $x -gt 0 ] && [ ! -f $dir/log/mix_up.$[$x-1].log ]; then
      $cmd $dir/log/progress.$x.log \
        nnet-show-progress --use-gpu=no $dir/$[$x-1].mdl $dir/$x.mdl \
        ark:$cur_egs_dir/train_diagnostic.egs '&&' \
        nnet-am-info $dir/$x.mdl &
    fi

    echo "Training neural net (pass $x)"
    # While hidden layers remain to be added, every add_layers_period-th
    # iteration trains a model with a freshly initialised layer from
    # hidden.config inserted; otherwise the current model is trained as is.
    if [ $x -gt 0 ] && \
      [ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \
      [ $[($x-1) % $add_layers_period] -eq 0 ]; then
      mdl="nnet-init --srand=$x $dir/hidden.config - | nnet-insert $dir/$x.mdl - - |"
    else
      mdl=$dir/$x.mdl
    fi


    # Each of the num_jobs_nnet jobs trains on its own archive for this
    # iteration and writes $dir/$[x+1].JOB.mdl.
    $cmd $parallel_opts JOB=1:$num_jobs_nnet $dir/log/train.$x.JOB.log \
      nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x \
      ark:$cur_egs_dir/egs.JOB.$[$x%$iters_per_epoch].ark ark:- \| \
      nnet-train$train_suffix \
         --minibatch-size=$minibatch_size --srand=$x "$mdl" \
        ark:- $dir/$[$x+1].JOB.mdl \
      || exit 1;

    # Space-separated list of the per-job models produced above.
    nnets_list=
    for n in `seq 1 $num_jobs_nnet`; do
      nnets_list="$nnets_list $dir/$[$x+1].$n.mdl"
    done

    # Learning rate for the next model: exponential interpolation from the
    # initial to the final rate over num_iters_reduce iterations, then the
    # final rate.
    learning_rate=`perl -e '($x,$n,$i,$f)=@ARGV; print ($x >= $n ? $f : $i*exp($x*log($f/$i)/$n));' $[$x+1] $num_iters_reduce $initial_learning_rate $final_learning_rate`;
    softmax_learning_rate=`perl -e "print $learning_rate * $softmax_learning_rate_factor;"`;
    # $dir/foo is a scratch file holding the nnet-am-info output; it is not
    # removed by this loop.
    nnet-am-info $dir/$[$x+1].1.mdl > $dir/foo  2>/dev/null || exit 1
    nu=`cat $dir/foo | grep num-updatable-components | awk '{print $2}'`
    na=`cat $dir/foo | grep -v Fixed | grep AffineComponent | wc -l`
    # na is number of last updatable AffineComponent layer [one-based, counting only
    # updatable components.]
    # Build a colon-separated list with one rate per updatable component;
    # components na and na-1 get the softmax learning rate.
    lr_string="$learning_rate"
    for n in `seq 2 $nu`; do
      if [ $n -eq $na ] || [ $n -eq $[$na-1] ]; then lr=$softmax_learning_rate;
      else lr=$learning_rate; fi
      lr_string="$lr_string:$lr"
    done

    # Average the per-job models and set the learning rates of the result.
    $cmd $dir/log/average.$x.log \
      nnet-am-average $nnets_list - \| \
      nnet-am-copy --learning-rates=$lr_string - $dir/$[$x+1].mdl || exit 1;

    if [ "$mix_up" -gt 0 ] && [ $x -eq $mix_up_iter ]; then
      # mix up.
      echo Mixing up from $num_leaves to $mix_up components
      $cmd $dir/log/mix_up.$x.log \
        nnet-am-mixup --min-count=10 --num-mixtures=$mix_up \
        $dir/$[$x+1].mdl $dir/$[$x+1].mdl || exit 1;
    fi
    # The per-job models are no longer needed once they have been averaged.
    rm $nnets_list
  fi
  x=$[$x+1]
done

# Now do combination.
# At the end, final.mdl will be a combination of the last $num_iters_final
# models.  This block only builds the list of those models (nnets_list).
nnets_list=()
# The message below announces that num_iters_final is capped at the number of
# constant-learning-rate iterations; apply the cap so that both the model list
# built here and the later cleanup honour it.
if [ $num_iters_final -gt $num_iters_extra ]; then
  echo "Setting num_iters_final=$num_iters_extra"
  num_iters_final=$num_iters_extra
fi
# Always keep at least the last model, otherwise the list would be empty
# (e.g. when --num-epochs-extra is 0) and the combination would have no input.
if [ $num_iters_final -lt 1 ]; then
  num_iters_final=1
fi
start=$[$num_iters-$num_iters_final+1]
for x in `seq $start $num_iters`; do
  idx=$[$x-$start]
  # Only models produced after the mix-up iteration are combined.
  if [ $x -gt $mix_up_iter ]; then
    nnets_list[$idx]=$dir/$x.mdl # "nnet-am-copy --remove-dropout=true $dir/$x.mdl - |"
  fi
done

# Combine the selected models into final.mdl, normalize it, and launch the
# final diagnostics in the background.
if [ $stage -le $num_iters ]; then
  echo "Doing final combination to produce final.mdl"
  # Below, use --use-gpu=no to disable nnet-combine-fast from using a GPU, as
  # if there are many models it can give out-of-memory error; use at least 8
  # threads to speed it up (this isn't ideal...)
  combine_threads=$num_threads
  if [ $combine_threads -lt 8 ]; then
    combine_threads=8
  fi
  # Minibatch size: the number of combination examples divided by the number
  # of threads (rounded up), capped at 512.
  num_egs=$(nnet-copy-egs ark:$cur_egs_dir/combine.egs ark:/dev/null 2>&1 | tail -n 1 | awk '{print $NF}')
  mb=$(( ($num_egs + $combine_threads - 1) / $combine_threads ))
  if [ $mb -gt 512 ]; then
    mb=512
  fi
  # Setting --initial-model to a large value makes it initialize the combination
  # with the average of all the models.  It's important not to start with a
  # single model, or, due to the invariance to scaling that these nonlinearities
  # give us, we get zero diagonal entries in the fisher matrix that
  # nnet-combine-fast uses for scaling, which after flooring and inversion, has
  # the effect that the initial model chosen gets much higher learning rates
  # than the others.  This prevents the optimization from working well.
  if ! $cmd $parallel_opts $dir/log/combine.log \
      nnet-combine-fast --initial-model=100000 --num-lbfgs-iters=40 --use-gpu=no \
        --num-threads=$combine_threads --regularizer=$combine_regularizer \
        --verbose=3 --minibatch-size=$mb "${nnets_list[@]}" ark:$cur_egs_dir/combine.egs \
        $dir/final.mdl; then
    exit 1
  fi

  # Normalize stddev for affine or block affine layers that are followed by a
  # pnorm layer and then a normalize layer.
  if ! $cmd $parallel_opts $dir/log/normalize.log \
      nnet-normalize-stddev $dir/final.mdl $dir/final.mdl; then
    exit 1
  fi

  # Compute the probability of the final, combined model with
  # the same subset we used for the previous compute_probs, as the
  # different subsets will lead to different probs.
  $cmd $dir/log/compute_prob_valid.final.log \
    nnet-compute-prob $dir/final.mdl ark:$cur_egs_dir/valid_diagnostic.egs &
  $cmd $dir/log/compute_prob_train.final.log \
    nnet-compute-prob $dir/final.mdl ark:$cur_egs_dir/train_diagnostic.egs &
fi

# Last stage: re-estimate the priors of final.mdl from its average posterior
# over a subset of the training examples; final.mdl is rewritten in place.
if [ $stage -le $(( $num_iters + 1 )) ]; then
  echo "Getting average posterior for purposes of adjusting the priors."
  # Note: this just uses CPUs, using a smallish subset of data.
  rm $dir/post.*.vec 2>/dev/null
  # One summed posterior vector per job.
  if ! $cmd JOB=1:$num_jobs_nnet $dir/log/get_post.JOB.log \
      nnet-subset-egs --n=$prior_subset_size ark:$cur_egs_dir/egs.JOB.0.ark ark:- \| \
      nnet-compute-from-egs "nnet-to-raw-nnet $dir/final.mdl -|" ark:- ark:- \| \
      matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.JOB.vec; then
    exit 1
  fi

  sleep 3  # make sure there is time for $dir/post.*.vec to appear.

  # Sum the per-job vectors into post.vec, then drop them.
  if ! $cmd $dir/log/vector_sum.log \
      vector-sum $dir/post.*.vec $dir/post.vec; then
    exit 1
  fi

  rm $dir/post.*.vec

  echo "Re-adjusting priors based on computed posteriors"
  if ! $cmd $dir/log/adjust_priors.final.log \
      nnet-adjust-priors $dir/final.mdl $dir/post.vec $dir/final.mdl; then
    exit 1
  fi
fi


sleep 2

echo Done

# Optional cleanup: remove the examples and most intermediate models.
if $cleanup; then
  echo Cleaning up data
  [[ $cur_egs_dir =~ $dir/egs* ]] && steps/nnet2/remove_egs.sh $cur_egs_dir
  echo Removing most of the models
  # Models from this iteration onwards are the ones which combine to form the
  # final model; they are kept, as is every 100th model.
  first_kept=$(( $num_iters - $num_iters_final + 1 ))
  for x in $(seq 0 $num_iters); do
    if [ $(( $x % 100 )) -eq 0 ]; then
      continue
    fi
    if [ $x -lt $first_kept ]; then
      rm $dir/$x.mdl
    fi
  done
fi


================================================
FILE: egs/steps/nnet2/train_pnorm_accel2.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2014  Johns Hopkins University (Author: Daniel Povey).
#                2013  Xiaohui Zhang
#                2013  Guoguo Chen
#                2014  Vimal Manohar
# Apache 2.0.

# train_pnorm_accel2.sh is a modified form of train_pnorm_simple2.sh (the "2"
# suffix is because they both use the "new" egs format, created by
# get_egs2.sh).  The "accel" part of the name refers to the fact that this
# script uses a number of jobs that can increase during training.  You can
# specify --initial-num-jobs and --final-num-jobs to control these separately.
# Also, in this script, the learning rates specified by --initial-learning-rate
# and --final-learning-rate are the "effective learning rates" (defined as the
# learning rate divided by the number of jobs), and the actual learning rates
# used will be the specified learning rates multiplied by the current number
# of jobs.  You'll want to set these lower than you normally would previously
# have set the learning rates, by a factor equal to the (previous) number of
# jobs.


# Begin configuration section.
# Every variable below is a default that can be overridden from the command
# line once parse_options.sh has been sourced further down.

# Job-dispatch wrapper, see --cmd in the usage message (run.pl, queue.pl, ...).
cmd=run.pl
num_epochs=15      # Number of epochs of training;
                   # the number of iterations is worked out from this.
# Effective learning rates at the start and end of training; as explained in
# the header of this script, the actual learning rate is the effective one
# multiplied by the current number of jobs.
initial_effective_lrate=0.01
final_effective_lrate=0.001
# NOTE(review): bias_stddev, pnorm_input_dim, pnorm_output_dim and p are
# presumably used when writing the network config -- the code that uses them
# is outside the part of the script reviewed here.
bias_stddev=0.5
pnorm_input_dim=3000
pnorm_output_dim=300
p=2
minibatch_size=128 # by default use a smallish minibatch size for neural net
                   # training; this controls instability which would otherwise
                   # be a problem with multi-threaded update.

samples_per_iter=400000 # each iteration of training, see this many samples
                        # per job.  This option is passed to get_egs.sh
num_jobs_initial=1    # Number of neural net jobs to run in parallel at the start of training.
num_jobs_final=8      # Number of jobs to run in parallel at the end of training.

prior_subset_size=10000 # 10k samples per job, for computing priors.  Should be
                        # more than enough.
num_jobs_compute_prior=10 # these are single-threaded, run on CPU.
get_egs_stage=0
online_ivector_dir=


max_models_combine=20 # The "max_models_combine" is the maximum number of models we give
  # to the final 'combine' stage, but these models will themselves be averages of
  # iteration-number ranges.

shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
                # on each iter.  You could set it to 0 or to a large value for complete
                # randomization, but this would both consume memory and cause spikes in
                # disk I/O.  Smaller is easier on disk and memory but less random.  It's
                # not a huge deal though, as samples are anyway randomized right at the start.
                # (the point of this is to get data in different minibatches on different iterations,
                # since in the preconditioning method, 2 samples in the same minibatch can
                # affect each others' gradients.)

add_layers_period=2 # by default, add new layers every 2 iterations.
num_hidden_layers=3
# Default matches the --stage entry of the usage message: used to resume a
# partially-completed training process.
stage=-4

splice_width=4 # meaning +- 4 frames on each side for second LDA
left_context= # if set, overrides splice-width
right_context= # if set, overrides splice-width.
randprune=4.0 # speeds up LDA.
alpha=4.0 # relates to preconditioning.
update_period=4 # relates to online preconditioning: says how often we update the subspace.
num_samples_history=2000 # relates to online preconditioning
max_change_per_sample=0.075
precondition_rank_in=20  # relates to online preconditioning
precondition_rank_out=80 # relates to online preconditioning

mix_up=0 # Number of components to mix up to (should be > #tree leaves, if
        # specified.)
num_threads=16
parallel_opts="--num-threads 16 --mem 1G"
  # by default we use 16 threads; this lets the queue know.
  # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.
combine_num_threads=8
combine_parallel_opts="--num-threads 8"  # queue options for the "combine" stage.
cleanup=true
egs_dir=
lda_opts=
lda_dim=
egs_opts=
io_opts="--max-jobs-run 5" # for jobs with a lot of I/O, limits the number running at one time.
transform_dir=     # If supplied, overrides alidir
postdir=
cmvn_opts=  # will be passed to get_lda.sh and get_egs.sh, if supplied.
            # only relevant for "raw" features, not lda.
feat_type=  # Can be used to force "raw" features.
align_cmd=              # The cmd that is passed to steps/nnet2/align.sh
align_use_gpu=          # Passed to use_gpu in steps/nnet2/align.sh [yes/no]
realign_times=          # List of times on which we realign.  Each time is
                        # floating point number strictly between 0 and 1, which
                        # will be multiplied by the num-iters to get an iteration
                        # number.
num_jobs_align=30       # Number of jobs for realignment
srand=0 # random seed used to initialize the nnet
# End configuration section.


echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# Print the usage text and abort unless exactly four positional arguments
# (<data> <lang> <ali-dir> <exp-dir>) were given.
if [ $# != 4 ]; then
  echo "Usage: $0 [opts] <data> <lang> <ali-dir> <exp-dir>"
  echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --num-epochs <#epochs|15>                        # Number of epochs of training"
  echo "  --initial-effective-lrate <lrate|0.02> # effective learning rate at start of training,"
  echo "                                         # actual learning-rate is this times num-jobs."
  echo "  --final-effective-lrate <lrate|0.004>   # effective learning rate at end of training."
  echo "  --num-hidden-layers <#hidden-layers|2>           # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs"
  echo "  --add-layers-period <#iters|2>                   # Number of iterations between adding hidden layers"
  echo "  --mix-up <#pseudo-gaussians|0>                   # Can be used to have multiple targets in final output layer,"
  echo "                                                   # per context-dependent state.  Try a number several times #states."
  echo "  --num-jobs-initial <num-jobs|1>                  # Number of parallel jobs to use for neural net training, at the start."
  echo "  --num-jobs-final <num-jobs|8>                    # Number of parallel jobs to use for neural net training, at the end"
  echo "  --num-threads <num-threads|16>                   # Number of parallel threads per job (will affect results"
  echo "                                                   # as well as speed; may interact with batch size; if you increase"
  echo "                                                   # this, you may want to decrease the batch size."
  echo "  --parallel-opts <opts|\"--num-threads 16 --mem 1G\">      # extra options to pass to e.g. queue.pl for processes that"
  echo "                                                   # use multiple threads... "
  echo "  --io-opts <opts|\"--max-jobs-run 10\">                      # Options given to e.g. queue.pl for jobs that do a lot of I/O."
  echo "  --minibatch-size <minibatch-size|128>            # Size of minibatch to process (note: product with --num-threads"
  echo "                                                   # should not get too large, e.g. >2k)."
  echo "  --samples-per-iter <#samples|400000>             # Number of samples of data to process per iteration, per"
  echo "                                                   # process."
  echo "  --splice-width <width|4>                         # Number of frames on each side to append for feature input"
  echo "                                                   # (note: we splice processed, typically 40-dimensional frames)"
  echo "  --lda-dim <dim|250>                              # Dimension to reduce spliced features to with LDA"
  echo "  --realign-times <list-of-times|\"\">             # A list of space-separated floating point numbers between 0.0 and"
  echo "                                                   # 1.0 to specify how far through training realignment is to be done"
  echo "  --align-cmd (utils/run.pl|utils/queue.pl <queue opts>) # passed to align.sh"
  echo "  --align-use-gpu (yes/no)                         # specify if gpu is to be used for realignment"
  echo "  --num-jobs-align <#njobs|30>                     # Number of jobs to perform realignment"
  echo "  --stage <stage|-4>                               # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."


  exit 1;
fi

# Positional arguments, in the order given by the usage text:
# <data> <lang> <ali-dir> <exp-dir>.
data=$1
lang=$2
alidir=$3
dir=$4

# Realignment was requested: both the alignment command and the GPU choice
# have to be supplied along with it.
if [ -n "$realign_times" ]; then
  if [ -z "$align_cmd" ]; then
    echo "$0: realign_times specified but align_cmd not specified"
    exit 1
  fi
  if [ -z "$align_use_gpu" ]; then
    echo "$0: realign_times specified but align_use_gpu not specified"
    exit 1
  fi
fi

# Each of these inputs must already exist as a regular file.
for f in $data/feats.scp $lang/L.fst $alidir/final.mdl $alidir/tree; do
  if [ ! -f $f ]; then
    echo "$0: no such file $f"
    exit 1
  fi
done

# Need at least one of: posteriors under $postdir, or alignments under $alidir.
if [ ! -f $postdir/post.1.scp ] && [ ! -f $alidir/ali.1.gz ]; then
  echo "$0: no (soft) alignments provided"
  exit 1
fi

# On INT/QUIT/TERM, kill whatever background jobs are still running.
trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM

# num_leaves is the num-pdfs value printed by tree-info; it is used as the
# output dimension of the network config written later.
num_leaves=$(tree-info $alidir/tree 2>/dev/null | grep num-pdfs | awk '{print $2}') || exit 1
if [ -z $num_leaves ]; then
  echo "\$num_leaves is unset"
  exit 1
fi
if [ "$num_leaves" -eq "0" ]; then
  echo "\$num_leaves is 0"
  exit 1
fi

# Split the data using the same job count as the alignment directory.
nj=$(cat $alidir/num_jobs) || exit 1
sdata=$data/split$nj
utils/split_data.sh $data $nj

# Create the output directory and seed it with the job count and the tree.
mkdir -p $dir/log
echo $nj > $dir/num_jobs
cp $alidir/tree $dir

# The lang directory's phone table must be compatible with the alignments'.
utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1
cp $lang/phones.txt $dir || exit 1

# Collect the options that are forwarded to both get_lda.sh and get_egs2.sh.
extra_opts=()
if [ -n "$cmvn_opts" ]; then
  extra_opts+=(--cmvn-opts "$cmvn_opts")
fi
if [ -n "$feat_type" ]; then
  extra_opts+=(--feat-type $feat_type)
fi
if [ -n "$online_ivector_dir" ]; then
  extra_opts+=(--online-ivector-dir $online_ivector_dir)
fi
# The transform directory falls back to the alignment directory, and each
# context width falls back to the symmetric splice width.
transform_dir=${transform_dir:-$alidir}
left_context=${left_context:-$splice_width}
right_context=${right_context:-$splice_width}
extra_opts+=(--transform-dir $transform_dir)
extra_opts+=(--left-context $left_context --right-context $right_context)

# Stage -4: run get_lda.sh on the data/lang/alignment directories, writing
# into $dir.
if [ $stage -le -4 ]; then
  echo "$0: calling get_lda.sh"
  steps/nnet2/get_lda.sh $lda_opts "${extra_opts[@]}" --cmd "$cmd" $data $lang $alidir $dir || exit 1;
fi

# these files will have been written by get_lda.sh
feat_dim=$(cat $dir/feat_dim) || exit 1;
ivector_dim=$(cat $dir/ivector_dim) || exit 1;
lda_dim=$(cat $dir/lda_dim) || exit 1;

# Stage -3: dump training examples into $dir/egs, but only when no existing
# examples directory was supplied through $egs_dir.
if [ $stage -le -3 ] && [ -z "$egs_dir" ]; then
  echo "$0: calling get_egs2.sh"
  steps/nnet2/get_egs2.sh $egs_opts "${extra_opts[@]}"  --io-opts "$io_opts" \
    --postdir "$postdir" --samples-per-iter $samples_per_iter --stage $get_egs_stage \
    --cmd "$cmd" $egs_opts $data $alidir $dir/egs || exit 1;
fi

if [ -z "$egs_dir" ]; then
  egs_dir=$dir/egs
fi

# Each failure message names the file that was actually being read.
frames_per_eg=$(cat $egs_dir/info/frames_per_eg) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; }
num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/num_archives"; exit 1; }

# num_archives_expanded considers each separate label-position from
# 0..frames_per_eg-1 to be a separate archive.
num_archives_expanded=$[$num_archives*$frames_per_eg]

# Sanity checks on the job counts.  The messages use the option names exactly
# as they are spelled in the usage text (--num-jobs-initial, --num-jobs-final).
[ $num_jobs_initial -gt $num_jobs_final ] && \
  echo "$0: --num-jobs-initial cannot exceed --num-jobs-final" && exit 1;

[ $num_jobs_final -gt $num_archives_expanded ] && \
  echo "$0: --num-jobs-final cannot exceed #archives $num_archives_expanded." && exit 1;

# At least one hidden layer is required.
if ! [ $num_hidden_layers -ge 1 ]; then
  echo "Invalid num-hidden-layers $num_hidden_layers"
  exit 1
fi

# Stage -2: write the network configs and create the initial model 0.mdl.
if [ $stage -le -2 ]; then
  echo "$0: initializing neural net";
  lda_mat=$dir/lda.mat
  # Input to the splice component is the base features plus the iVector.
  tot_input_dim=$[$feat_dim+$ivector_dim]

  # Options appended to every AffineComponentPreconditionedOnline line below.
  online_preconditioning_opts="alpha=$alpha num-samples-history=$num_samples_history update-period=$update_period rank-in=$precondition_rank_in rank-out=$precondition_rank_out max-change-per-sample=$max_change_per_sample"

  # Actual starting learning rate = effective rate times the initial number of jobs.
  initial_lrate=$(perl -e "print ($initial_effective_lrate*$num_jobs_initial);")
  # Parameter stddev for the hidden affine layers: 1/sqrt(pnorm_input_dim).
  stddev=`perl -e "print 1.0/sqrt($pnorm_input_dim);"`
  # nnet.config: splice -> fixed (LDA) transform -> one affine/pnorm/normalize
  # hidden layer -> output affine layer (zero-initialized) -> softmax.
  cat >$dir/nnet.config <<EOF
SpliceComponent input-dim=$tot_input_dim left-context=$left_context right-context=$right_context const-component-dim=$ivector_dim
FixedAffineComponent matrix=$lda_mat
AffineComponentPreconditionedOnline input-dim=$lda_dim output-dim=$pnorm_input_dim $online_preconditioning_opts learning-rate=$initial_lrate param-stddev=$stddev bias-stddev=$bias_stddev
PnormComponent input-dim=$pnorm_input_dim output-dim=$pnorm_output_dim p=$p
NormalizeComponent dim=$pnorm_output_dim
AffineComponentPreconditionedOnline input-dim=$pnorm_output_dim output-dim=$num_leaves $online_preconditioning_opts learning-rate=$initial_lrate param-stddev=0 bias-stddev=0
SoftmaxComponent dim=$num_leaves
EOF

  # to hidden.config it will write the part of the config corresponding to a
  # single hidden layer; we need this to add new layers.
  cat >$dir/hidden.config <<EOF
AffineComponentPreconditionedOnline input-dim=$pnorm_output_dim output-dim=$pnorm_input_dim $online_preconditioning_opts learning-rate=$initial_lrate param-stddev=$stddev bias-stddev=$bias_stddev
PnormComponent input-dim=$pnorm_input_dim output-dim=$pnorm_output_dim p=$p
NormalizeComponent dim=$pnorm_output_dim
EOF
  # Build 0.mdl from the tree, the topology and the network that nnet-init
  # creates from nnet.config.
  $cmd $dir/log/nnet_init.log \
    nnet-am-init $alidir/tree $lang/topo "nnet-init --srand=$srand $dir/nnet.config -|" \
    $dir/0.mdl || exit 1;
fi

# Stage -1: run nnet-train-transitions over all alignment archives in $alidir;
# 0.mdl is both the input and the output, i.e. it is overwritten in place.
if [ $stage -le -1 ]; then
  echo "Training transition probabilities and setting priors"
  $cmd $dir/log/train_trans.log \
    nnet-train-transitions $dir/0.mdl "ark:gunzip -c $alidir/ali.*.gz|" $dir/0.mdl \
    || exit 1;
fi

# Choose num_iters so that, as closely as possible, the data is processed
# $num_epochs times, i.e.
#   num_iters * avg_num_jobs == num_epochs * num_archives_expanded
# where avg_num_jobs = (num_jobs_initial + num_jobs_final) / 2.
num_archives_processed=0
num_archives_to_process=$(($num_epochs*$num_archives_expanded))
num_iters=$(( ($num_archives_to_process*2) / ($num_jobs_initial+$num_jobs_final) ))

echo "$0: Will train for $num_epochs epochs = $num_iters iterations"

# Iteration by which all hidden layers will have been added.
finish_add_layers_iter=$(($num_hidden_layers * $add_layers_period))

if ! [ $num_iters -gt $(($finish_add_layers_iter+2)) ]; then
  echo "$0: Insufficient epochs"
  exit 1
fi

# Mix up on the iteration where about half of the data has been processed;
# this keeps the overall procedure fairly invariant to the initial and final
# number of jobs.
# With j = initial jobs, k = final jobs, n = num-iters and p = proportion of
# the data to have processed (p = 0.5 here), x is the solution of
#   x*j + (k-j)*x*x/(2*n) = p * (j*n + (k-j)*n/2)
# which is
#   x = n * (sqrt((1-p)*j^2 + p*k^2) - j) / (k-j)      for j != k
#   x = n * p                                          for j == k
# and that is what the perl expression below evaluates (rounded to nearest).
mix_up_iter=$(perl -e '($j,$k,$n,$p)=@ARGV; print int(0.5 + ($j==$k ? $n*$p : $n*(sqrt((1-$p)*$j*$j+$p*$k*$k)-$j)/($k-$j))); ' $num_jobs_initial $num_jobs_final $num_iters 0.5)
if ! [ $mix_up_iter -gt $finish_add_layers_iter ]; then
  echo "Mix-up-iter is $mix_up_iter, should be greater than $finish_add_layers_iter -> add more epochs?"
  exit 1
fi

# Choose the nnet-train binary variant.  Multi-threaded is the default; with
# exactly one thread the "-simple" binary is used instead, which enables the
# GPU code.
parallel_suffix="-parallel"
parallel_train_opts="--num-threads=$num_threads"
if [ $num_threads -eq 1 ]; then
  parallel_suffix="-simple"
  parallel_train_opts=
  if ! cuda-compiled; then
    echo "$0: WARNING: you are running with one thread but you have not compiled"
    echo "   for CUDA.  You may be running a setup optimized for GPUs.  If you have"
    echo "   GPUs and have nvcc installed, go to src/ and do ./configure; make"
  fi
fi


approx_iters_per_epoch_final=$(($num_archives_expanded/$num_jobs_final))
iters_after_mixup_23=$(( (($num_iters-$mix_up_iter-1)*2)/3 ))

# Number of models to combine in the final nnet-combine-fast invocation:
#   min(max(max_models_combine, approx_iters_per_epoch_final),
#       2/3 * iters_after_mixup)
num_models_combine=$max_models_combine
if [ $approx_iters_per_epoch_final -gt $num_models_combine ]; then
  num_models_combine=$approx_iters_per_epoch_final
fi
if [ $iters_after_mixup_23 -lt $num_models_combine ]; then
  num_models_combine=$iters_after_mixup_23
fi
# First iteration whose model takes part in the final combination.
first_model_combine=$(($num_iters-$num_models_combine+1))

# Iteration counter for the main training loop.
x=0


# Work out the iterations on which we will re-align, if the --realign-times
# option was used.  This is slightly approximate.  realign_this_iter[iter]
# holds the requested fraction for that iteration and is empty otherwise.
for realign_time in $realign_times; do
  # Reject anything that is not strictly inside (0, 1) and stop, rather than
  # printing the complaint and carrying on with an unusable value.
  if ! perl -e "exit($realign_time > 0.0 && $realign_time < 1.0 ? 0:1);"; then
    echo "Invalid --realign-times option $realign_times: elements must be strictly between 0 and 1.";
    exit 1
  fi
  # the next formula is based on the one for mix_up_iter above.
  realign_iter=$(perl -e '($j,$k,$n,$p)=@ARGV; print int(0.5 + ($j==$k ? $n*$p : $n*(sqrt((1-$p)*$j*$j+$p*$k*$k)-$j)/($k-$j))); ' $num_jobs_initial $num_jobs_final $num_iters $realign_time) || exit 1;
  realign_this_iter[$realign_iter]=$realign_time
done

# Examples directory currently in use; it changes whenever we realign.
cur_egs_dir=$egs_dir

# Main training loop: iteration x reads $dir/$x.mdl and writes $dir/$[$x+1].mdl.
# The loop always runs from x=0 so that the bookkeeping (job count, learning
# rate, archives processed, current egs dir) is right when resuming via
# --stage; the actual work is only done on iterations with stage <= x.
while [ $x -lt $num_iters ]; do
  # The number of jobs is interpolated linearly from initial to final.
  this_num_jobs=$(perl -e "print int(0.5+$num_jobs_initial+($num_jobs_final-$num_jobs_initial)*$x/$num_iters);")

  # The effective learning rate decays exponentially with the amount of data
  # processed (the last iteration uses the final rate exactly); the actual
  # rate is the effective rate times the number of jobs.
  ilr=$initial_effective_lrate; flr=$final_effective_lrate; np=$num_archives_processed; nt=$num_archives_to_process;
  this_learning_rate=$(perl -e  "print (($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt))*$this_num_jobs);");

  # TODO: remove this line.
  echo "On iteration $x, learning rate is $this_learning_rate."

  # On a realignment iteration switch to a new egs dir.  This is done outside
  # the stage check so that cur_egs_dir is also correct when resuming.
  if [ ! -z "${realign_this_iter[$x]}" ]; then
    prev_egs_dir=$cur_egs_dir
    cur_egs_dir=$dir/egs_${realign_this_iter[$x]}
  fi

  if [ $x -ge 0 ] && [ $stage -le $x ]; then
    if [ ! -z "${realign_this_iter[$x]}" ]; then
      time=${realign_this_iter[$x]}


      echo "Getting average posterior for purposes of adjusting the priors."
      # Note: this just uses CPUs, using a smallish subset of data.
      # always use the first egs archive, which makes the script simpler;
      # we're using different random subsets of it.
      rm $dir/post.$x.*.vec 2>/dev/null
      $cmd JOB=1:$num_jobs_compute_prior $dir/log/get_post.$x.JOB.log \
        nnet-copy-egs --srand=JOB --frame=random ark:$prev_egs_dir/egs.1.ark ark:- \| \
        nnet-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \
        nnet-compute-from-egs "nnet-to-raw-nnet $dir/$x.mdl -|" ark:- ark:- \| \
        matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1;

      sleep 3;  # make sure there is time for $dir/post.$x.*.vec to appear.

      $cmd $dir/log/vector_sum.$x.log \
        vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1;
      rm $dir/post.$x.*.vec;

      echo "Re-adjusting priors based on computed posteriors"
      $cmd $dir/log/adjust_priors.$x.log \
        nnet-adjust-priors $dir/$x.mdl $dir/post.$x.vec $dir/$x.mdl || exit 1;

      sleep 2

      # Realign with the current model, then relabel the previous examples
      # with the new alignments to produce the new egs dir.
      steps/nnet2/align.sh --nj $num_jobs_align --cmd "$align_cmd" --use-gpu $align_use_gpu \
        --transform-dir "$transform_dir" --online-ivector-dir "$online_ivector_dir" \
        --iter $x $data $lang $dir $dir/ali_$time || exit 1

      steps/nnet2/relabel_egs2.sh --cmd "$cmd" --iter $x $dir/ali_$time \
        $prev_egs_dir $cur_egs_dir || exit 1

      if $cleanup && [[ $prev_egs_dir =~ $dir/egs* ]]; then
        steps/nnet2/remove_egs.sh $prev_egs_dir
      fi
    fi

    # Set off jobs doing some diagnostics, in the background.
    # They read the diagnostic examples from the current egs dir.
    $cmd $dir/log/compute_prob_valid.$x.log \
      nnet-compute-prob $dir/$x.mdl ark:$cur_egs_dir/valid_diagnostic.egs &
    $cmd $dir/log/compute_prob_train.$x.log \
      nnet-compute-prob $dir/$x.mdl ark:$cur_egs_dir/train_diagnostic.egs &
    if [ $x -gt 0 ] && [ ! -f $dir/log/mix_up.$[$x-1].log ]; then
      # Give the model file a moment to appear.  The model lives in $dir, so
      # that is where the existence check has to look.
      [ ! -f $dir/$x.mdl ] && sleep 10;
      $cmd $dir/log/progress.$x.log \
        nnet-show-progress --use-gpu=no $dir/$[$x-1].mdl $dir/$x.mdl \
        ark:$cur_egs_dir/train_diagnostic.egs '&&' \
        nnet-am-info $dir/$x.mdl &
    fi

    echo "Training neural net (pass $x)"

    if [ $x -gt 0 ] && \
      [ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \
      [ $[($x-1) % $add_layers_period] -eq 0 ]; then
      do_average=false # a new hidden layer is inserted on this iteration: don't average, take the best.
      mdl="nnet-init --srand=$x $dir/hidden.config - | nnet-insert $dir/$x.mdl - - | nnet-am-copy --learning-rate=$this_learning_rate - -|"
    else
      do_average=true
      if [ $x -eq 0 ]; then do_average=false; fi # on iteration 0, pick the best, don't average.
      mdl="nnet-am-copy --learning-rate=$this_learning_rate $dir/$x.mdl -|"
    fi
    if $do_average; then
      this_minibatch_size=$minibatch_size
    else
      # on iteration zero or when we just added a layer, use a smaller minibatch
      # size and pick the single best job's model instead of averaging: the
      # model-averaging doesn't seem to be helpful when the model is changing
      # too fast (i.e. it worsens the objective function), and the smaller
      # minibatch size will help to keep the update stable.
      this_minibatch_size=$[$minibatch_size/2];
    fi

    rm $dir/.error 2>/dev/null

    ( # this sub-shell is so that when we "wait" below,
      # we only wait for the training jobs that we just spawned,
      # not the diagnostic jobs that we spawned above.

      # We can't easily use a single parallel SGE job to do the main training,
      # because the computation of which archive and which --frame option
      # to use for each job is a little complex, so we spawn each one separately.
      for n in $(seq $this_num_jobs); do
        k=$[$num_archives_processed + $n - 1]; # k is a zero-based index that we'll derive
                                         # the other indexes from.
        archive=$[($k%$num_archives)+1]; # work out the 1-based archive index.
        frame=$[(($k/$num_archives)%$frames_per_eg)]; # work out the 0-based frame
        # index; this increases more slowly than the archive index because the
        # same archive with different frame indexes will give similar gradients,
        # so we want to separate them in time.

        $cmd $parallel_opts $dir/log/train.$x.$n.log \
          nnet-train$parallel_suffix $parallel_train_opts \
          --minibatch-size=$this_minibatch_size --srand=$x "$mdl" \
          "ark,bg:nnet-copy-egs --frame=$frame ark:$cur_egs_dir/egs.$archive.ark ark:-|nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-|" \
          $dir/$[$x+1].$n.mdl || touch $dir/.error &
      done
      wait
    )
    # the error message below is not that informative, but $cmd will
    # have printed a more specific one.
    [ -f $dir/.error ] && echo "$0: error on iteration $x of training" && exit 1;

    nnets_list=
    for n in `seq 1 $this_num_jobs`; do
      nnets_list="$nnets_list $dir/$[$x+1].$n.mdl"
    done

    if $do_average; then
      # average the output of the different jobs.
      $cmd $dir/log/average.$x.log \
        nnet-am-average $nnets_list $dir/$[$x+1].mdl ||  exit 1;
    else
      # choose the best from the different jobs.  Exactly $this_num_jobs jobs
      # were run on this iteration (logs train.$x.1.log .. train.$x.N.log), so
      # that is the count the selection has to scan.
      n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) {
          $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn";
          undef $logprob; while (<F>) { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } }
          close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob;
          $best_n=$n; } } print "$best_n\n"; ' $this_num_jobs $dir/log/train.$x.%d.log) || exit 1;
      [ -z "$n" ] && echo "Error getting best model" && exit 1;
      cp $dir/$[$x+1].$n.mdl $dir/$[$x+1].mdl || exit 1;
    fi

    if [ "$mix_up" -gt 0 ] && [ $x -eq $mix_up_iter ]; then
      # mix up.
      echo Mixing up from $num_leaves to $mix_up components
      $cmd $dir/log/mix_up.$x.log \
        nnet-am-mixup --min-count=10 --num-mixtures=$mix_up \
        $dir/$[$x+1].mdl $dir/$[$x+1].mdl || exit 1;
    fi
    rm $nnets_list
    [ ! -f $dir/$[$x+1].mdl ] && exit 1;
    # Delete the model from two iterations back, except every 100th model and
    # the models that will take part in the final combination.
    if [ -f $dir/$[$x-1].mdl ] && $cleanup && \
       [ $[($x-1)%100] -ne 0  ] && [ $[$x-1] -lt $first_model_combine ]; then
      rm $dir/$[$x-1].mdl
    fi
  fi
  x=$[$x+1]
  num_archives_processed=$[$num_archives_processed+$this_num_jobs]
done


# Final stage: combine the last models into final.mdl with nnet-combine-fast.
if [ $stage -le $num_iters ]; then
  echo "Doing final combination to produce final.mdl"

  # Build 'nnets_list', the list of model arguments for nnet-combine-fast.
  nnets_list=()
  if [ $max_models_combine -lt $num_models_combine ]; then
    # More models than max_models_combine: spread them over that many groups
    # and hand each group to nnet-combine-fast as an "nnet-am-average ... - |"
    # pipe, so each argument is the average of several models.
    cur_offset=0 # offset from first_model_combine at which the next group starts.
    for n in $(seq $max_models_combine); do
      next_offset=$(( ($n*$num_models_combine)/$max_models_combine ))
      sub_list=""
      for o in $(seq $cur_offset $(($next_offset-1))); do
        iter=$(($first_model_combine+$o))
        mdl=$dir/$iter.mdl
        if [ ! -f $mdl ]; then
          echo "Expected $mdl to exist"
          exit 1
        fi
        sub_list+=" $mdl"
      done
      nnets_list[$(($n-1))]="nnet-am-average $sub_list - |"
      cur_offset=$next_offset
    done
  else
    # Few enough models: pass each model file directly.
    nnets_list=
    for n in $(seq 0 $((num_models_combine-1))); do
      iter=$(($first_model_combine+$n))
      mdl=$dir/$iter.mdl
      if [ ! -f $mdl ]; then
        echo "Expected $mdl to exist"
        exit 1
      fi
      nnets_list[$n]=$mdl
    done
  fi


  # The number of combination examples is taken from the last field of the
  # last line that nnet-copy-egs prints; the minibatch size is that count
  # divided (rounding up) by the number of threads, capped at 512.
  num_egs=$(nnet-copy-egs ark:$cur_egs_dir/combine.egs ark:/dev/null 2>&1 | tail -n 1 | awk '{print $NF}')
  mb=$(( ($num_egs+$combine_num_threads-1)/$combine_num_threads ))
  if [ $mb -gt 512 ]; then
    mb=512
  fi
  # --use-gpu=no stops nnet-combine-fast from using a GPU, since with many
  # models it can give an out-of-memory error; multiple threads are used to
  # speed it up instead (this isn't ideal...).
  # Setting --initial-model to a large value makes it initialize the combination
  # with the average of all the models.  It's important not to start with a
  # single model, or, due to the invariance to scaling that these nonlinearities
  # give us, we get zero diagonal entries in the fisher matrix that
  # nnet-combine-fast uses for scaling, which after flooring and inversion, has
  # the effect that the initial model chosen gets much higher learning rates
  # than the others.  This prevents the optimization from working well.
  $cmd $combine_parallel_opts $dir/log/combine.log \
    nnet-combine-fast --initial-model=100000 --num-lbfgs-iters=40 --use-gpu=no \
      --num-threads=$combine_num_threads \
      --verbose=3 --minibatch-size=$mb "${nnets_list[@]}" ark:$cur_egs_dir/combine.egs \
      $dir/final.mdl || exit 1

  # Normalize stddev for affine or block affine layers that are followed by a
  # pnorm layer and then a normalize layer.
  $cmd $dir/log/normalize.log \
    nnet-normalize-stddev $dir/final.mdl $dir/final.mdl || exit 1

  # Score the combined model on the same diagnostic subsets that the
  # per-iteration compute_prob jobs used, so the numbers are comparable.
  # Both jobs run in the background.
  $cmd $dir/log/compute_prob_valid.final.log \
    nnet-compute-prob $dir/final.mdl ark:$cur_egs_dir/valid_diagnostic.egs &
  $cmd $dir/log/compute_prob_train.final.log \
    nnet-compute-prob $dir/final.mdl ark:$cur_egs_dir/train_diagnostic.egs &
fi

# Last stage: re-estimate the priors of final.mdl from its average posterior.
# Temporary and log file names use $x, which still holds the value the
# training loop's counter ended on.
if [ $stage -le $[$num_iters+1] ]; then
  echo "Getting average posterior for purposes of adjusting the priors."
  # Note: this just uses CPUs, using a smallish subset of data.
  # Remove leftovers from an earlier run so the glob below only sees new files.
  rm $dir/post.$x.*.vec 2>/dev/null
  # Each job takes its own random subset (seeded by the job index) of the
  # first egs archive, runs final.mdl over it and sums the outputs into one
  # vector per job.
  $cmd JOB=1:$num_jobs_compute_prior $dir/log/get_post.$x.JOB.log \
    nnet-copy-egs --frame=random --srand=JOB ark:$cur_egs_dir/egs.1.ark ark:- \| \
    nnet-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \
    nnet-compute-from-egs "nnet-to-raw-nnet $dir/final.mdl -|" ark:- ark:- \| \
    matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1;

  sleep 3;  # make sure there is time for $dir/post.$x.*.vec to appear.

  # Add the per-job vectors together into a single vector.
  $cmd $dir/log/vector_sum.$x.log \
   vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1;

  rm $dir/post.$x.*.vec;

  echo "Re-adjusting priors based on computed posteriors"
  # final.mdl is both input and output here, i.e. it is updated in place.
  $cmd $dir/log/adjust_priors.final.log \
    nnet-adjust-priors $dir/final.mdl $dir/post.$x.vec $dir/final.mdl || exit 1;
fi


# Bail out before any cleanup if training did not produce a final model:
# we don't want to clean up if the training didn't succeed.
[ -f $dir/final.mdl ] || {
  echo "$0: $dir/final.mdl does not exist."
  exit 1
}

sleep 2

echo Done

if $cleanup; then
  echo Cleaning up data
  # Only remove the examples when the directory name matches the $dir/egs
  # pattern.
  if [[ $cur_egs_dir =~ $dir/egs* ]]; then
    steps/nnet2/remove_egs.sh $cur_egs_dir
  fi

  echo Removing most of the models
  # Delete the intermediate models, keeping every 100th one and the model
  # numbered $num_iters.
  for x in $(seq 0 $num_iters); do
    [ $(($x%100)) -ne 0 ] || continue
    [ $x -ne $num_iters ] || continue
    [ -f $dir/$x.mdl ] || continue
    rm $dir/$x.mdl
  done
fi


================================================
FILE: egs/steps/nnet2/train_pnorm_bottleneck_fast.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2014  Johns Hopkins University (Author: Daniel Povey).
#           2014  Pegah Ghahremani
# Apache 2.0.


# train_pnorm_fast.sh is a new, improved version of train_pnorm.sh, which uses
# the 'online' preconditioning method.  For GPUs it's about two times faster
# than before (although that's partly due to optimizations that will also help
# the old recipe), and for CPUs it gives better performance than the old method
# (I believe); also, the difference in optimization performance between CPU and
# GPU is almost gone.  The old train_pnorm.sh script is now deprecated.
# We made this a separate script because not all of the options that the
# old script accepted, are still accepted.

# Begin configuration section.
# Default values for the script's options.  parse_options.sh is sourced right
# after this section; presumably it lets each variable be overridden from the
# command line -- confirm against parse_options.sh.
cmd=run.pl
num_epochs=15      # Number of epochs during which we reduce
                   # the learning rate; number of iterations is worked out from this.
num_epochs_extra=5 # Number of epochs after we stop reducing
                   # the learning rate.
num_iters_final=20 # Maximum number of final iterations to give to the
                   # optimization over the validation set (maximum)
initial_learning_rate=0.04
final_learning_rate=0.004
bias_stddev=0.5
pnorm_input_dim=3000
pnorm_output_dim=300
bottleneck_dim=42  # bottleneck layer dimension
p=2
minibatch_size=128 # by default use a smallish minibatch size for neural net
                   # training; this controls instability which would otherwise
                   # be a problem with multi-threaded update.

samples_per_iter=200000 # each iteration of training, see this many samples
                        # per job.  This option is passed to get_egs.sh
num_jobs_nnet=16   # Number of neural net jobs to run in parallel.  This option
                   # is passed to get_egs.sh.
get_egs_stage=0
online_ivector_dir=

shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
                # on each iter.  You could set it to 0 or to a large value for complete
                # randomization, but this would both consume memory and cause spikes in
                # disk I/O.  Smaller is easier on disk and memory but less random.  It's
                # not a huge deal though, as samples are anyway randomized right at the start.
                # (the point of this is to get data in different minibatches on different iterations,
                # since in the preconditioning method, 2 samples in the same minibatch can
                # affect each others' gradients.)

add_layers_period=2 # by default, add new layers every 2 iterations.
num_hidden_layers=3
stage=-5

io_opts="--max-jobs-run 5" # for jobs with a lot of I/O, limits the number running at one time.
splice_width=4 # meaning +- 4 frames on each side for second LDA
randprune=4.0 # speeds up LDA.
alpha=4.0 # relates to preconditioning.
update_period=4 # relates to online preconditioning: says how often we update the subspace.
num_samples_history=2000 # relates to online preconditioning
max_change_per_sample=0.075
precondition_rank_in=20  # relates to online preconditioning
precondition_rank_out=80 # relates to online preconditioning

# this relates to perturbed training.
min_target_objf_change=0.1
target_multiplier=0 #  Set this to e.g. 1.0 to enable perturbed training.

mix_up=0 # Number of components to mix up to (should be > #tree leaves, if
        # specified.)
num_threads=16
parallel_opts="--num-threads 16 --mem 1G" # by default we use 16 threads; this lets the queue know.
  # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.
combine_num_threads=8
combine_parallel_opts="--num-threads 8"  # queue options for the "combine" stage.
cleanup=true
egs_dir=
lda_opts=
lda_dim=
egs_opts=
transform_dir=     # If supplied, overrides alidir
cmvn_opts=  # will be passed to get_lda.sh and get_egs.sh, if supplied.
            # only relevant for "raw" features, not lda.
feat_type=  # Can be used to force "raw" features.
prior_subset_size=10000 # 10k samples per job, for computing priors.  Should be
                        # more than enough.
# NOTE(review): the next line is a plain string assignment, so the variable
# holds the text "3-2" rather than a number, and it is expanded here, before
# parse_options.sh runs, so it does not follow a later change of
# num_hidden_layers.  Presumably it is only used in arithmetic context
# further down -- confirm.
bottleneck_layer_num=$num_hidden_layers-2 # bottleneck layer number between hidden layer
                                          # eg. 2000|2000|420|2000 bottleneck_layer_num = 2
# End configuration section.


echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# Print the usage text and abort unless exactly four positional arguments
# (<data> <lang> <ali-dir> <exp-dir>) were given.  The defaults shown after
# each '|' are kept in step with the configuration section at the top of this
# script.
if [ $# != 4 ]; then
  echo "Usage: $0 [opts] <data> <lang> <ali-dir> <exp-dir>"
  echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --num-epochs <#epochs|15>                        # Number of epochs of main training"
  echo "                                                   # while reducing learning rate (determines #iterations, together"
  echo "                                                   # with --samples-per-iter and --num-jobs-nnet)"
  echo "  --num-epochs-extra <#epochs-extra|5>             # Number of extra epochs of training"
  echo "                                                   # after learning rate fully reduced"
  echo "  --initial-learning-rate <initial-learning-rate|0.04> # Learning rate at start of training, e.g. 0.02 for small"
  echo "                                                       # data, 0.01 for large data"
  echo "  --final-learning-rate  <final-learning-rate|0.004>   # Learning rate at end of training, e.g. 0.004 for small"
  echo "                                                   # data, 0.001 for large data"
  echo "  --num-hidden-layers <#hidden-layers|3>           # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs"
  echo "  --add-layers-period <#iters|2>                   # Number of iterations between adding hidden layers"
  echo "  --mix-up <#pseudo-gaussians|0>                   # Can be used to have multiple targets in final output layer,"
  echo "                                                   # per context-dependent state.  Try a number several times #states."
  echo "  --num-jobs-nnet <num-jobs|16>                    # Number of parallel jobs to use for main neural net"
  echo "                                                   # training (will affect results as well as speed; try 8, 16)"
  echo "                                                   # Note: if you increase this, you may want to also increase"
  echo "                                                   # the learning rate."
  echo "  --num-threads <num-threads|16>                   # Number of parallel threads per job (will affect results"
  echo "                                                   # as well as speed; may interact with batch size; if you increase"
  echo "                                                   # this, you may want to decrease the batch size."
  echo "  --parallel-opts <opts|\"--num-threads 16 --mem 1G\">      # extra options to pass to e.g. queue.pl for processes that"
  echo "                                                   # use multiple threads... "
  echo "  --io-opts <opts|\"--max-jobs-run 5\">                       # Options given to e.g. queue.pl for jobs that do a lot of I/O."
  echo "  --minibatch-size <minibatch-size|128>            # Size of minibatch to process (note: product with --num-threads"
  echo "                                                   # should not get too large, e.g. >2k)."
  echo "  --samples-per-iter <#samples|200000>             # Number of samples of data to process per iteration, per"
  echo "                                                   # process."
  echo "  --splice-width <width|4>                         # Number of frames on each side to append for feature input"
  echo "                                                   # (note: we splice processed, typically 40-dimensional frames)"
  echo "  --lda-dim <dim|250>                              # Dimension to reduce spliced features to with LDA"
  echo "  --num-iters-final <#iters|20>                    # Number of final iterations to give to nnet-combine-fast to "
  echo "                                                   # interpolate parameters (the weights are learned with a validation set)"
  # NOTE(review): the configuration section defines no first_component_power
  # variable, so this option may not actually be accepted -- confirm.
  echo "  --first-component-power <power|1.0>              # Power applied to output of first p-norm layer... setting this to"
  echo "                                                   # 0.5 seems to help under some circumstances."
  echo "  --stage <stage|-5>                               # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."


  exit 1;
fi

# Positional arguments: data dir, lang dir, alignment dir, output dir.
data=$1
lang=$2
alidir=$3
dir=$4

# Refuse to start unless every required input file is present.
for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree; do
  if [ ! -f $f ]; then
    echo "$0: no such file $f" && exit 1;
  fi
done


# Derived quantities.
# truncate_comp_num is the component count handed to
# "nnet-to-raw-nnet --truncate" at the very end of this script.
truncate_comp_num=$((3*$num_hidden_layers+1))
# Number of pdfs (tree leaves) = dimension of the network's output layer.
num_leaves=$(tree-info $alidir/tree 2>/dev/null | grep num-pdfs | awk '{print $2}') || exit 1
if [ -z $num_leaves ]; then
  echo "\$num_leaves is unset" && exit 1
fi
if [ "$num_leaves" -eq "0" ]; then
  echo "\$num_leaves is 0" && exit 1
fi

# Split the data the same way the alignment directory was split.
nj=$(cat $alidir/num_jobs) || exit 1;
sdata=$data/split$nj
utils/split_data.sh $data $nj

# Populate the output directory with what later stages expect to find there.
mkdir -p $dir/log
echo $nj > $dir/num_jobs
cp $alidir/tree $dir

# The phone set of the lang dir must match the one the alignments were made with.
utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

# Feature-related options that are forwarded unchanged to both get_lda.sh and
# get_egs.sh.  Optional ones are only added when the variable is non-empty.
extra_opts=()
if [ ! -z "$cmvn_opts" ]; then
  extra_opts+=(--cmvn-opts "$cmvn_opts")
fi
if [ ! -z "$feat_type" ]; then
  extra_opts+=(--feat-type $feat_type)
fi
if [ ! -z "$online_ivector_dir" ]; then
  extra_opts+=(--online-ivector-dir $online_ivector_dir)
fi
# The transform directory falls back to the alignment directory.
transform_dir=${transform_dir:-$alidir}
extra_opts+=(--transform-dir $transform_dir --splice-width $splice_width)

# Stage -4: estimate the LDA-like input transform.
if [ $stage -le -4 ]; then
  echo "$0: calling get_lda.sh"
  steps/nnet2/get_lda.sh $lda_opts "${extra_opts[@]}" --cmd "$cmd" $data $lang $alidir $dir || exit 1;
fi

# get_lda.sh leaves these dimension files in $dir; they are needed below even
# when the stage itself was skipped.
feat_dim=$(cat $dir/feat_dim) || exit 1;
ivector_dim=$(cat $dir/ivector_dim) || exit 1;
lda_dim=$(cat $dir/lda_dim) || exit 1;

# Stage -3: dump training examples, unless the caller supplied --egs-dir.
if [ $stage -le -3 ] && [ -z "$egs_dir" ]; then
  echo "$0: calling get_egs.sh"
  if [ ! -z $spk_vecs_dir ]; then
    egs_opts="$egs_opts --spk-vecs-dir $spk_vecs_dir"
  fi
  steps/nnet2/get_egs.sh $egs_opts "${extra_opts[@]}" \
      --samples-per-iter $samples_per_iter \
      --num-jobs-nnet $num_jobs_nnet --stage $get_egs_stage \
      --cmd "$cmd" $egs_opts --io-opts "$io_opts" \
      $data $lang $alidir $dir || exit 1;
fi

# Default location of the examples when none was given.
egs_dir=${egs_dir:-$dir/egs}

# The examples directory dictates how many iterations make up an epoch and how
# many parallel jobs there are; the latter overrides --num-jobs-nnet.
iters_per_epoch=$(cat $egs_dir/iters_per_epoch)  || exit 1;
if ! [ $num_jobs_nnet -eq $(cat $egs_dir/num_jobs_nnet) ]; then
  echo "$0: Warning: using --num-jobs-nnet=$(cat $egs_dir/num_jobs_nnet) from $egs_dir"
fi
num_jobs_nnet=$(cat $egs_dir/num_jobs_nnet) || exit 1;


# At least one hidden layer is required; the layer-adding schedule and
# truncate_comp_num are both derived from $num_hidden_layers.
if ! [ $num_hidden_layers -ge 1 ]; then
  echo "Invalid num-hidden-layers $num_hidden_layers"
  exit 1
fi

# Stage -2: write the network config files and create the initial model 0.mdl.
# Three configs are written:
#   nnet.config   - the initial network (splice, fixed LDA transform, one
#                   p-norm hidden layer, output affine layer, softmax);
#   hidden.config - one extra p-norm hidden layer, inserted during training;
#   bnf.config    - the bottleneck layer followed by one more p-norm hidden
#                   layer, inserted as the last layer addition.
if [ $stage -le -2 ]; then
  echo "$0: initializing neural net";
  lda_mat=$dir/lda.mat
  # The spliced input consists of the base features plus the iVector (if any).
  tot_input_dim=$[$feat_dim+$ivector_dim]

  # Options shared by every AffineComponentPreconditionedOnline line below.
  online_preconditioning_opts="alpha=$alpha num-samples-history=$num_samples_history update-period=$update_period rank-in=$precondition_rank_in rank-out=$precondition_rank_out max-change-per-sample=$max_change_per_sample"

  # Parameter stddev for layers whose output dimension is $pnorm_input_dim.
  stddev=`perl -e "print 1.0/sqrt($pnorm_input_dim);"`
  cat >$dir/nnet.config <<EOF
SpliceComponent input-dim=$tot_input_dim left-context=$splice_width right-context=$splice_width const-component-dim=$ivector_dim
FixedAffineComponent matrix=$lda_mat
AffineComponentPreconditionedOnline input-dim=$lda_dim output-dim=$pnorm_input_dim $online_preconditioning_opts learning-rate=$initial_learning_rate param-stddev=$stddev bias-stddev=$bias_stddev
PnormComponent input-dim=$pnorm_input_dim output-dim=$pnorm_output_dim p=$p
NormalizeComponent dim=$pnorm_output_dim
AffineComponentPreconditionedOnline input-dim=$pnorm_output_dim output-dim=$num_leaves $online_preconditioning_opts learning-rate=$initial_learning_rate param-stddev=0 bias-stddev=0
SoftmaxComponent dim=$num_leaves
EOF

  # to hidden.config it will write the part of the config corresponding to a
  # single hidden layer; we need this to add new layers.
  cat >$dir/hidden.config <<EOF
AffineComponentPreconditionedOnline input-dim=$pnorm_output_dim output-dim=$pnorm_input_dim $online_preconditioning_opts learning-rate=$initial_learning_rate param-stddev=$stddev bias-stddev=$bias_stddev
PnormComponent input-dim=$pnorm_input_dim output-dim=$pnorm_output_dim p=$p
NormalizeComponent dim=$pnorm_output_dim
EOF

# The bottleneck p-norm layer reduces 10 * bottleneck_dim inputs down to
# bottleneck_dim outputs.
bnf_input_dim=$((10 * $bottleneck_dim))
bnf_output_dim=$bottleneck_dim
echo bnf_input_dim = $bnf_input_dim
  bottleneck_stddev=`perl -e "print 1.0/sqrt($bnf_input_dim);"`
  # To bnf.config it will write the part of the config corresponding to the
  # bottleneck layer (plus the hidden layer that follows it); we need this to
  # add the bottleneck layer.
  cat >$dir/bnf.config <<EOF
AffineComponentPreconditionedOnline input-dim=$pnorm_output_dim output-dim=$bnf_input_dim $online_preconditioning_opts learning-rate=$initial_learning_rate param-stddev=$bottleneck_stddev bias-stddev=$bias_stddev
PnormComponent input-dim=$bnf_input_dim output-dim=$bnf_output_dim p=$p
NormalizeComponent dim=$bnf_output_dim
AffineComponentPreconditionedOnline input-dim=$bnf_output_dim output-dim=$pnorm_input_dim $online_preconditioning_opts learning-rate=$initial_learning_rate param-stddev=$stddev bias-stddev=$bias_stddev
PnormComponent input-dim=$pnorm_input_dim output-dim=$pnorm_output_dim  p=$p
NormalizeComponent dim=$pnorm_output_dim
EOF
  # Build the initial acoustic model from the tree, the topology and the raw
  # network described by nnet.config.
  $cmd $dir/log/nnet_init.log \
    nnet-am-init $alidir/tree $lang/topo "nnet-init $dir/nnet.config -|" \
    $dir/0.mdl || exit 1;
fi

# Stage -1: estimate transition probabilities and priors from the alignments,
# updating 0.mdl in place.
if [ $stage -le -1 ]; then
  echo "Training transition probabilities and setting priors"
  $cmd $dir/log/train_trans.log \
    nnet-train-transitions $dir/0.mdl "ark:gunzip -c $alidir/ali.*.gz|" $dir/0.mdl \
    || exit 1;
fi

# Iteration budget: a phase in which the learning rate decays, followed by a
# phase at constant learning rate.
num_iters_reduce=$(($num_epochs * $iters_per_epoch))
num_iters_extra=$(($num_epochs_extra * $iters_per_epoch))
num_iters=$(($num_iters_reduce+$num_iters_extra))

echo "$0: Will train for $num_epochs + $num_epochs_extra epochs, equalling "
echo "$0: $num_iters_reduce + $num_iters_extra = $num_iters iterations, "
echo "$0: (while reducing learning rate) + (with constant learning rate)."


function set_target_objf_change {
  # nothing to do if $target_multiplier not set.
  [ "$target_multiplier" == "0" -o "$target_multiplier" == "0.0" ] && return;
  [ $x -le $finish_add_layers_iter ] && return;
  wait=2  # the compute_prob_{train,valid} from 2 iterations ago should
          # most likey be done even though we backgrounded them.
  [ $[$x-$wait] -le 0 ] && return;
  while true; do
    # Note: awk 'some-expression' is the same as: awk '{if(some-expression) print;}'
    train_prob=$(awk '(NF == 1)' < $dir/log/compute_prob_train.$[$x-$wait].log)
    valid_prob=$(awk '(NF == 1)' < $dir/log/compute_prob_valid.$[$x-$wait].log)
    if [ -z "$train_prob" ] || [ -z "$valid_prob" ]; then
      echo "$0: waiting until $dir/log/compute_prob_{train,valid}.$[$x-$wait].log are done"
      sleep 60
    else
      target_objf_change=$(perl -e '($train,$valid,$min_change,$multiplier)=@ARGV; if (!($train < 0.0) || !($valid < 0.0)) { print "0\n"; print STDERR "Error: invalid train or valid prob: $train_prob, $valid_prob\n"; exit(0); } else { print STDERR "train,valid=$train,$valid\n"; $proposed_target = $multiplier * ($train-$valid); if ($proposed_target < $min_change) { print "0"; } else { print $proposed_target; }}' -- "$train_prob" "$valid_prob" "$min_target_objf_change" "$target_multiplier")
      echo "On iter $x, (train,valid) probs from iter $[$x-$wait] were ($train_prob,$valid_prob), and setting target-objf-change to $target_objf_change."
      return;
    fi
  done
}

# Iteration by which the layer-adding schedule is treated as finished.
finish_add_layers_iter=$(($num_hidden_layers * $add_layers_period))
# Mixing up happens halfway between the end of layer adding and the end of
# training.
mix_up_iter=$((($num_iters + $finish_add_layers_iter)/2))

# Select the training binary variant: nnet-train-simple (which can use a GPU)
# for a single thread, nnet-train-parallel otherwise.
if [ $num_threads -eq 1 ]; then
  parallel_train_opts=
  parallel_suffix="-simple"
  cuda-compiled || {
    echo "$0: WARNING: you are running with one thread but you have not compiled"
    echo "   for CUDA.  You may be running a setup optimized for GPUs.  If you have"
    echo "   GPUs and have nvcc installed, go to src/ and do ./configure; make"
  }
else
  parallel_train_opts="--num-threads=$num_threads"
  parallel_suffix="-parallel"
fi

# Loop counter and the perturbed-training target (0 = perturbation disabled).
x=0
target_objf_change=0

# Main training loop.  Iteration x reads $dir/$x.mdl and writes
# $dir/$[x+1].mdl; iterations below $stage are skipped so that an interrupted
# run can be resumed.
while [ $x -lt $num_iters ]; do
  if [ $x -ge 0 ] && [ $stage -le $x ]; then
    # Set off jobs doing some diagnostics, in the background.
    $cmd $dir/log/compute_prob_valid.$x.log \
      nnet-compute-prob $dir/$x.mdl ark:$egs_dir/valid_diagnostic.egs &
    $cmd $dir/log/compute_prob_train.$x.log \
      nnet-compute-prob $dir/$x.mdl ark:$egs_dir/train_diagnostic.egs &
    # Progress against the previous model is skipped right after a mix-up
    # (detected by the presence of the previous iteration's mix_up log).
    if [ $x -gt 0 ] && [ ! -f $dir/log/mix_up.$[$x-1].log ]; then
      $cmd $dir/log/progress.$x.log \
        nnet-show-progress --use-gpu=no $dir/$[$x-1].mdl $dir/$x.mdl \
          ark:$egs_dir/train_diagnostic.egs '&&' \
        nnet-am-info $dir/$x.mdl &
    fi

    echo "Training neural net (pass $x)"

    # Layer-adding schedule: a layer is inserted on iterations 1,
    # 1+add_layers_period, ... up to (num_hidden_layers-1)*add_layers_period.
    # The last of those insertions uses bnf.config (the bottleneck layer);
    # the earlier ones use hidden.config.
    if [ $x -gt 0 ] && \
      [ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \
      [ $[($x-1) % $add_layers_period] -eq 0 ]; then
      if [ $[($x-1) / $add_layers_period] -eq $[($num_hidden_layers-2)] ]; then
        echo bnf layer with x = $x
        mdl="nnet-init --srand=$x $dir/bnf.config - | nnet-insert $dir/$x.mdl - - |"
      else
        mdl="nnet-init --srand=$x $dir/hidden.config - | nnet-insert $dir/$x.mdl - - |"
      fi
    else
      mdl=$dir/$x.mdl
    fi
    if [ $x -eq 0 ] || [ "$mdl" != "$dir/$x.mdl" ]; then
      # on iteration zero or when we just added a layer, use a smaller minibatch
      # size and just one job: the model-averaging doesn't seem to be helpful
      # when the model is changing too fast (i.e. it worsens the objective
      # function), and the smaller minibatch size will help to keep
      # the update stable.
      this_minibatch_size=$[$minibatch_size/2];
      do_average=false
    else
      this_minibatch_size=$minibatch_size
      do_average=true
    fi

    set_target_objf_change;  # only has effect if target_multiplier != 0
    # A non-zero target switches to the "-perturbed" training binary.
    # NOTE(review): perturb_suffix/perturb_opts are not reset in this loop, so
    # once set they stay in effect even if target_objf_change later returns to
    # 0 -- confirm whether that is intended.
    if [ "$target_objf_change" != "0" ]; then
      [ ! -f $dir/within_covar.spmat ] && \
        echo "$0: expected $dir/within_covar.spmat to exist." && exit 1;
      perturb_suffix="-perturbed"
      perturb_opts="--target-objf-change=$target_objf_change --within-covar=$dir/within_covar.spmat"
    fi

    # One training job per egs archive; job JOB writes $dir/$[x+1].JOB.mdl.
    # The archive index cycles through the iters_per_epoch archives.
    $cmd $parallel_opts JOB=1:$num_jobs_nnet $dir/log/train.$x.JOB.log \
      nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x \
      ark:$egs_dir/egs.JOB.$[$x%$iters_per_epoch].ark ark:- \| \
       nnet-train$parallel_suffix$perturb_suffix $parallel_train_opts $perturb_opts \
        --minibatch-size=$this_minibatch_size --srand=$x "$mdl" \
        ark:- $dir/$[$x+1].JOB.mdl \
      || exit 1;

    # The per-job models produced above.
    nnets_list=
    for n in `seq 1 $num_jobs_nnet`; do
      nnets_list="$nnets_list $dir/$[$x+1].$n.mdl"
    done

    # Learning rate for the next model: geometric decay from the initial to
    # the final rate over num_iters_reduce iterations, constant afterwards.
    learning_rate=`perl -e '($x,$n,$i,$f)=@ARGV; print ($x >= $n ? $f : $i*exp($x*log($f/$i)/$n));' $[$x+1] $num_iters_reduce $initial_learning_rate $final_learning_rate`;

    if $do_average; then
      # Normal case: average the parameters of all jobs.
      $cmd $dir/log/average.$x.log \
        nnet-am-average $nnets_list - \| \
        nnet-am-copy --learning-rate=$learning_rate - $dir/$[$x+1].mdl || exit 1;
    else
      # No averaging: keep the single job whose training log reports the best
      # log-prob-per-frame (the last such value in each log is used).
      n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) {
          $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn";
          undef $logprob; while (<F>) { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } }
          close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob;
          $best_n=$n; } } print "$best_n\n"; ' $num_jobs_nnet $dir/log/train.$x.%d.log) || exit 1;
      [ -z "$n" ] && echo "Error getting best model" && exit 1;
      $cmd $dir/log/select.$x.log \
        nnet-am-copy --learning-rate=$learning_rate $dir/$[$x+1].$n.mdl $dir/$[$x+1].mdl || exit 1;
    fi

    if [ "$mix_up" -gt 0 ] && [ $x -eq $mix_up_iter ]; then
      # mix up.
      echo Mixing up from $num_leaves to $mix_up components
      $cmd $dir/log/mix_up.$x.log \
        nnet-am-mixup --min-count=10 --num-mixtures=$mix_up \
        $dir/$[$x+1].mdl $dir/$[$x+1].mdl || exit 1;
    fi
    # The per-job models are no longer needed once the new model exists.
    rm $nnets_list
    [ ! -f $dir/$[$x+1].mdl ] && exit 1;
    # Remove the model from the previous iteration, except every 100th one and
    # those that fall within the last num_iters_final iterations (needed for
    # the final combination).
    if [ -f $dir/$[$x-1].mdl ] && $cleanup && \
       [ $[($x-1)%100] -ne 0  ] && [ $[$x-1] -le $[$num_iters-$num_iters_final] ]; then
      rm $dir/$[$x-1].mdl
    fi
  fi
  x=$[$x+1]
done

# Now do combination.
# At the end, final.mdl will be a combination of the last e.g. 10 models.
# Collect into nnets_list the models of the last $num_iters_final iterations
# that were produced after the mix-up iteration.
nnets_list=()
# Do not reach back further than the constant-learning-rate phase.  The
# original code only printed this message without applying the new value.
# The clamp is skipped when there is no such phase at all, because it would
# otherwise leave nothing to combine.
if [ $num_iters_final -gt $num_iters_extra ] && [ $num_iters_extra -gt 0 ]; then
  echo "Setting num_iters_final=$num_iters_extra"
  num_iters_final=$num_iters_extra
fi
start=$[$num_iters-$num_iters_final+1]
for x in `seq $start $num_iters`; do
  idx=$[$x-$start]
  # Models from before the mix-up are not combined with later ones.
  if [ $x -gt $mix_up_iter ]; then
    nnets_list[$idx]=$dir/$x.mdl # "nnet-am-copy --remove-dropout=true $dir/$x.mdl - |"
  fi
done

# Combine the collected models into final.mdl, with weights optimized on
# combine.egs.
if [ $stage -le $num_iters ]; then
  echo "Doing final combination to produce final.mdl"
  # nnet-combine-fast is run with --use-gpu=no because with many models it can
  # run out of GPU memory; it uses $combine_num_threads CPU threads instead
  # (this isn't ideal...).
  # The minibatch size is chosen so that the combination examples are spread
  # over the threads, capped at 512.
  num_egs=$(nnet-copy-egs ark:$egs_dir/combine.egs ark:/dev/null 2>&1 | tail -n 1 | awk '{print $NF}')
  mb=$((($num_egs+$combine_num_threads-1)/$combine_num_threads))
  if [ $mb -gt 512 ]; then
    mb=512
  fi
  # Setting --initial-model to a large value makes it initialize the combination
  # with the average of all the models.  It's important not to start with a
  # single model, or, due to the invariance to scaling that these nonlinearities
  # give us, we get zero diagonal entries in the fisher matrix that
  # nnet-combine-fast uses for scaling, which after flooring and inversion, has
  # the effect that the initial model chosen gets much higher learning rates
  # than the others.  This prevents the optimization from working well.
  $cmd $combine_parallel_opts $dir/log/combine.log \
    nnet-combine-fast --initial-model=100000 --num-lbfgs-iters=40 --use-gpu=no \
      --num-threads=$combine_num_threads \
      --verbose=3 --minibatch-size=$mb "${nnets_list[@]}" ark:$egs_dir/combine.egs \
      $dir/final.mdl || exit 1;

  # Normalize stddev for affine or block affine layers that are followed by a
  # pnorm layer and then a normalize layer.
  $cmd $dir/log/normalize.log \
    nnet-normalize-stddev $dir/final.mdl $dir/final.mdl || exit 1;

  # Diagnostics for the combined model, run in the background on the same
  # subsets as the per-iteration compute_prob jobs so the numbers are
  # comparable (different subsets would lead to different probs).
  $cmd $dir/log/compute_prob_valid.final.log \
    nnet-compute-prob $dir/final.mdl ark:$egs_dir/valid_diagnostic.egs &
  $cmd $dir/log/compute_prob_train.final.log \
    nnet-compute-prob $dir/final.mdl ark:$egs_dir/train_diagnostic.egs &
fi

# Final stage: re-estimate the priors of final.mdl from the average posterior
# the network produces on a subset of the training examples.
if [ $stage -le $[$num_iters+1] ]; then
  echo "Getting average posterior for purposes of adjusting the priors."
  # Note: this just uses CPUs, using a smallish subset of data.
  # Remove leftovers from a previous run so they are not summed in below.
  rm $dir/post.*.vec 2>/dev/null
  # Each job takes $prior_subset_size examples from its first egs archive,
  # runs them through the network and sums the outputs into post.JOB.vec.
  $cmd JOB=1:$num_jobs_nnet $dir/log/get_post.JOB.log \
    nnet-subset-egs --n=$prior_subset_size ark:$egs_dir/egs.JOB.0.ark ark:- \| \
    nnet-compute-from-egs "nnet-to-raw-nnet $dir/final.mdl -|" ark:- ark:- \| \
    matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.JOB.vec || exit 1;

  sleep 3;  # make sure there is time for $dir/post.*.vec to appear.

  # Sum the per-job vectors into a single posterior vector.
  $cmd $dir/log/vector_sum.log \
   vector-sum $dir/post.*.vec $dir/post.vec || exit 1;

  rm $dir/post.*.vec;

  echo "Re-adjusting priors based on computed posteriors"
  # final.mdl is updated in place.
  $cmd $dir/log/adjust_priors.log \
    nnet-adjust-priors $dir/final.mdl $dir/post.vec $dir/final.mdl || exit 1;
fi


sleep 2

echo Done

# Remove the training examples, but only if this script created them (i.e. they
# live in $dir/egs rather than in a directory supplied by the caller).
if $cleanup; then
  echo Cleaning up data
  if [ "$egs_dir" == "$dir/egs" ]; then
    steps/nnet2/remove_egs.sh $dir/egs
  fi
fi

# Export the network truncated to its first $truncate_comp_num components as
# final.raw.  A missing final.mdl or a failed conversion must be reported
# through the exit status, otherwise callers would take the run for a success.
if [ -f $dir/final.mdl ]; then
  nnet-to-raw-nnet --truncate=$truncate_comp_num $dir/final.mdl $dir/final.raw || exit 1;
else
  echo "$0: we require final.mdl in source dir $dir"
  exit 1
fi


================================================
FILE: egs/steps/nnet2/train_pnorm_ensemble.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).
#           2013  Guoguo Chen
#           2014  Xiaohui Zhang
# Apache 2.0.


# This script trains an ensemble of neural networks with pnorm nonlinearities.
# An ensemble of nets are first differently initialized, and then trained using the
# same data during each iteration. In each training iteration, one term is added to
# the objf, which is beta times the cross-entropy between the current net's posterior
# output and the geometrically averaged posterior outputs of the ensemble of nets.
# The beta values obey an exponentially increasing schedule (determined by initial_beta
# and final_beta).

# Begin configuration section.
# Every variable below can be overridden from the command line as
# --variable-name value (see parse_options.sh, sourced further down).
cmd=run.pl
num_epochs=15      # Number of epochs during which we reduce
                   # the learning rate; the number of iterations is worked out from this.
num_epochs_extra=5 # Number of epochs after we stop reducing
                   # the learning rate.
num_iters_final=20 # Maximum number of final iterations to give to the
                   # optimization over the validation set.
initial_learning_rate=0.04
final_learning_rate=0.004
bias_stddev=0.5
softmax_learning_rate_factor=1.0 # In the default setting keep the same learning rate.

combine_regularizer=1.0e-14 # Small regularizer so that parameters won't go crazy.
pnorm_input_dim=3000
pnorm_output_dim=300
p=2
minibatch_size=128 # by default use a smallish minibatch size for neural net
                   # training; this controls instability which would otherwise
                   # be a problem with multi-threaded update.  Note: it also
                   # interacts with the "preconditioned" update which generally
                   # works better with larger minibatch size, so it's not
                   # completely cost free.

samples_per_iter=200000 # each iteration of training, see this many samples
                        # per job.  This option is passed to get_egs.sh
num_jobs_nnet=16   # Number of neural net jobs to run in parallel.  This option
                   # is passed to get_egs.sh.
get_egs_stage=0

shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
                # on each iter.  You could set it to 0 or to a large value for complete
                # randomization, but this would both consume memory and cause spikes in
                # disk I/O.  Smaller is easier on disk and memory but less random.  It's
                # not a huge deal though, as samples are anyway randomized right at the start.

add_layers_period=2 # by default, add new layers every 2 iterations.
num_hidden_layers=3
stage=-5

io_opts="--max-jobs-run 5" # for jobs with a lot of I/O, limits the number running at one time.
splice_width=4 # meaning +- 4 frames on each side for second LDA
randprune=4.0 # speeds up LDA.
alpha=4.0
max_change=10.0
mix_up=0 # Number of components to mix up to (should be > #tree leaves, if
        # specified.)
num_threads=16
parallel_opts="--num-threads 16 --mem 1G" # by default we use 16 threads; this lets the queue know.
  # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.
cleanup=true
egs_dir=      # if set, use existing examples from here instead of calling get_egs.sh
lda_opts=     # extra options passed to get_lda.sh
egs_opts=     # extra options passed to get_egs.sh
initial_beta=0.1 # value of --beta given to nnet-train-ensemble at the start of training
final_beta=6     # value of --beta reached at the end of training
ensemble_size=2  # number of networks in the ensemble
# End configuration section.


echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# Exactly four positional arguments are required; otherwise print the usage.
# The defaults quoted below mirror the configuration section at the top of this
# script.
# NOTE(review): --initial-num-hidden-layers, --lda-dim, --num-utts-subset,
# --num-frames-diagnostic and --num-valid-frames-combine are listed here but
# have no matching variable in this script's configuration section -- confirm
# whether parse_options.sh accepts them.
if [ $# != 4 ]; then
  echo "Usage: $0 [opts] <data> <lang> <ali-dir> <exp-dir>"
  echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --num-epochs <#epochs|15>                        # Number of epochs of main training"
  echo "                                                   # while reducing learning rate (determines #iterations, together"
  echo "                                                   # with --samples-per-iter and --num-jobs-nnet)"
  echo "  --num-epochs-extra <#epochs-extra|5>             # Number of extra epochs of training"
  echo "                                                   # after learning rate fully reduced"
  echo "  --initial-learning-rate <initial-learning-rate|0.04> # Learning rate at start of training, e.g. 0.02 for small"
  echo "                                                       # data, 0.01 for large data"
  echo "  --final-learning-rate  <final-learning-rate|0.004>   # Learning rate at end of training, e.g. 0.004 for small"
  echo "                                                   # data, 0.001 for large data"
  echo "  --num-hidden-layers <#hidden-layers|3>           # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs"
  echo "  --initial-num-hidden-layers <#hidden-layers|1>   # Number of hidden layers to start with."
  echo "  --add-layers-period <#iters|2>                   # Number of iterations between adding hidden layers"
  echo "  --mix-up <#pseudo-gaussians|0>                   # Can be used to have multiple targets in final output layer,"
  echo "                                                   # per context-dependent state.  Try a number several times #states."
  echo "  --num-jobs-nnet <num-jobs|16>                    # Number of parallel jobs to use for main neural net"
  echo "                                                   # training (will affect results as well as speed; try 8, 16)"
  echo "                                                   # Note: if you increase this, you may want to also increase"
  echo "                                                   # the learning rate."
  echo "  --num-threads <num-threads|16>                   # Number of parallel threads per job (will affect results"
  echo "                                                   # as well as speed; may interact with batch size; if you increase"
  echo "                                                   # this, you may want to decrease the batch size."
  echo "  --parallel-opts <opts|\"--num-threads 16 --mem 1G\">      # extra options to pass to e.g. queue.pl for processes that"
  echo "                                                   # use multiple threads... "
  echo "  --io-opts <opts|\"--max-jobs-run 5\">                       # Options given to e.g. queue.pl for jobs that do a lot of I/O."
  echo "  --minibatch-size <minibatch-size|128>            # Size of minibatch to process (note: product with --num-threads"
  echo "                                                   # should not get too large, e.g. >2k)."
  echo "  --samples-per-iter <#samples|200000>             # Number of samples of data to process per iteration, per"
  echo "                                                   # process."
  echo "  --splice-width <width|4>                         # Number of frames on each side to append for feature input"
  echo "                                                   # (note: we splice processed, typically 40-dimensional frames)"
  echo "  --lda-dim <dim|250>                              # Dimension to reduce spliced features to with LDA"
  echo "  --num-iters-final <#iters|20>                    # Number of final iterations to give to nnet-combine-fast to "
  echo "                                                   # interpolate parameters (the weights are learned with a validation set)"
  echo "  --num-utts-subset <#utts|300>                    # Number of utterances in subsets used for validation and diagnostics"
  echo "                                                   # (the validation subset is held out from training)"
  echo "  --num-frames-diagnostic <#frames|4000>           # Number of frames used in computing (train,valid) diagnostics"
  echo "  --num-valid-frames-combine <#frames|10000>       # Number of frames used in getting combination weights at the"
  echo "                                                   # very end."
  echo "  --ensemble-size <#nets|2>                        # Number of networks trained jointly in the ensemble"
  echo "  --initial-beta <beta|0.1>                        # Weight (beta) of the ensemble term of the objf at the start"
  echo "                                                   # of training"
  echo "  --final-beta <beta|6>                            # Weight (beta) of the ensemble term at the end of training"
  echo "  --stage <stage|-5>                               # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."

  exit 1;
fi

# Positional arguments: data dir, lang dir, alignment dir, output dir.
data=$1
lang=$2
alidir=$3
dir=$4

# Refuse to start unless every required input file is present.
for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree; do
  if [ ! -f $f ]; then
    echo "$0: no such file $f" && exit 1;
  fi
done


# Number of pdfs (tree leaves) = dimension of the network's output layer.
num_leaves=$(tree-info $alidir/tree 2>/dev/null | grep num-pdfs | awk '{print $2}') || exit 1
if [ -z $num_leaves ]; then
  echo "\$num_leaves is unset" && exit 1
fi
if [ "$num_leaves" -eq "0" ]; then
  echo "\$num_leaves is 0" && exit 1
fi

# Split the data the same way the alignment directory was split.
nj=$(cat $alidir/num_jobs) || exit 1;
sdata=$data/split$nj
utils/split_data.sh $data $nj

# Populate the output directory; splice_opts is optional in the alignment
# directory, hence the silenced errors.
mkdir -p $dir/log
echo $nj > $dir/num_jobs
splice_opts=$(cat $alidir/splice_opts 2>/dev/null)
cp $alidir/splice_opts $dir 2>/dev/null
cp $alidir/tree $dir

# The phone set of the lang dir must match the one the alignments were made with.
utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

# Stage -4: estimate the LDA-like input transform.
if [ $stage -le -4 ]; then
  echo "$0: calling get_lda.sh"
  steps/nnet2/get_lda.sh $lda_opts --splice-width $splice_width --cmd "$cmd" $data $lang $alidir $dir || exit 1;
fi

# get_lda.sh leaves these dimension files in $dir; they are needed below even
# when the stage itself was skipped.
feat_dim=$(cat $dir/feat_dim) || exit 1;
lda_dim=$(cat $dir/lda_dim) || exit 1;

# Stage -3: dump training examples, unless the caller supplied --egs-dir.
if [ $stage -le -3 ] && [ -z "$egs_dir" ]; then
  echo "$0: calling get_egs.sh"
  steps/nnet2/get_egs.sh --samples-per-iter $samples_per_iter --num-jobs-nnet $num_jobs_nnet \
      --splice-width $splice_width --stage $get_egs_stage --cmd "$cmd" $egs_opts --io-opts "$io_opts" \
      $data $lang $alidir $dir || exit 1;
fi

# Default location of the examples when none was given.
egs_dir=${egs_dir:-$dir/egs}

# The examples directory dictates how many iterations make up an epoch and how
# many parallel jobs there are; the latter overrides --num-jobs-nnet.
iters_per_epoch=$(cat $egs_dir/iters_per_epoch)  || exit 1;
if ! [ $num_jobs_nnet -eq $(cat $egs_dir/num_jobs_nnet) ]; then
  echo "$0: Warning: using --num-jobs-nnet=$(cat $egs_dir/num_jobs_nnet) from $egs_dir"
fi
num_jobs_nnet=$(cat $egs_dir/num_jobs_nnet) || exit 1;


# At least one hidden layer is required.
[ $num_hidden_layers -ge 1 ] || {
  echo "Invalid num-hidden-layers $num_hidden_layers"
  exit 1
}

# Stage -2: write the network configs and create the initial models
# 0.1.mdl ... 0.$ensemble_size.mdl, one per ensemble member.
#   nnet.config   - the initial network (splice, fixed LDA transform, one
#                   p-norm hidden layer, output affine layer, softmax);
#   hidden.config - one extra p-norm hidden layer, inserted during training.
if [ $stage -le -2 ]; then
  echo "$0: initializing neural net";

  lda_mat=$dir/lda.mat
  ext_lda_dim=$lda_dim
  ext_feat_dim=$feat_dim

  # Parameter stddev for layers whose output dimension is $pnorm_input_dim.
  stddev=`perl -e "print 1.0/sqrt($pnorm_input_dim);"`
  cat >$dir/nnet.config <<EOF
SpliceComponent input-dim=$ext_feat_dim left-context=$splice_width right-context=$splice_width
FixedAffineComponent matrix=$lda_mat
AffineComponentPreconditioned input-dim=$ext_lda_dim output-dim=$pnorm_input_dim alpha=$alpha max-change=$max_change learning-rate=$initial_learning_rate param-stddev=$stddev bias-stddev=$bias_stddev
PnormComponent input-dim=$pnorm_input_dim output-dim=$pnorm_output_dim p=$p
NormalizeComponent dim=$pnorm_output_dim
AffineComponentPreconditioned input-dim=$pnorm_output_dim output-dim=$num_leaves alpha=$alpha max-change=$max_change learning-rate=$initial_learning_rate param-stddev=0 bias-stddev=0
SoftmaxComponent dim=$num_leaves
EOF

  # to hidden.config it will write the part of the config corresponding to a
  # single hidden layer; we need this to add new layers.
  cat >$dir/hidden.config <<EOF
AffineComponentPreconditioned input-dim=$pnorm_output_dim output-dim=$pnorm_input_dim alpha=$alpha max-change=$max_change learning-rate=$initial_learning_rate param-stddev=$stddev bias-stddev=$bias_stddev
PnormComponent input-dim=$pnorm_input_dim output-dim=$pnorm_output_dim p=$p
NormalizeComponent dim=$pnorm_output_dim
EOF
  # The job array already covers every ensemble member (JOB=1:$ensemble_size),
  # and each member is seeded with its own index, so a single invocation is
  # enough.  The original wrapped this in a loop over the ensemble as well,
  # which re-created the very same models $ensemble_size times.
  $cmd $parallel_opts JOB=1:$ensemble_size $dir/log/nnet_init.JOB.log \
    nnet-am-init $alidir/tree $lang/topo "nnet-init --srand=JOB $dir/nnet.config -|" \
    $dir/0.JOB.mdl || exit 1;
fi

# Stage -1: estimate transition probabilities and priors from the alignments
# for every ensemble member, updating each 0.JOB.mdl in place.
if [ $stage -le -1 ]; then
  echo "Training transition probabilities and setting priors"
  $cmd $parallel_opts JOB=1:$ensemble_size $dir/log/train_trans.JOB.log \
      nnet-train-transitions $dir/0.JOB.mdl "ark:gunzip -c $alidir/ali.*.gz|" $dir/0.JOB.mdl \
      || exit 1;
fi

# Iteration budget: a phase in which the learning rate decays, followed by a
# phase at constant learning rate.
num_iters_reduce=$(($num_epochs * $iters_per_epoch))
num_iters_extra=$(($num_epochs_extra * $iters_per_epoch))
num_iters=$(($num_iters_reduce+$num_iters_extra))

echo "$0: Will train for $num_epochs + $num_epochs_extra epochs, equalling "
echo "$0: $num_iters_reduce + $num_iters_extra = $num_iters iterations, "
echo "$0: (while reducing learning rate) + (with constant learning rate)."

# Mixing up happens halfway between the point where all hidden layers have
# been added and the end of training.
finish_add_layers_iter=$(($num_hidden_layers*$add_layers_period))
mix_up_iter=$((($num_iters + $finish_add_layers_iter)/2))

# With a single thread the setup is assumed to target GPUs; stop if the
# binaries were built without CUDA.  The exit status must be non-zero here: a
# bare "exit" would return the status of the preceding echo (0) and callers
# would take the aborted run for a success.
if [ $num_threads -eq 1 ]; then
  if ! cuda-compiled; then
    echo "$0: WARNING: you are running with one thread but you have not compiled"
    echo "   for CUDA.  You may be running a setup optimized for GPUs.  If you have"
    echo "   GPUs and have nvcc installed, go to src/ and do ./configure; make"
    exit 1
  fi
fi

x=0

# Main training loop.  Iteration x reads $dir/$x.<i>.mdl for every ensemble
# member i and writes $dir/$[x+1].<i>.mdl; iterations below $stage are skipped
# so that an interrupted run can be resumed.
while [ $x -lt $num_iters ]; do
  if [ $x -ge 0 ] && [ $stage -le $x ]; then
    # Set off jobs doing some diagnostics, in the background.
    # Only the first ensemble member ($x.1.mdl) is evaluated.
    $cmd $dir/log/compute_prob_valid.$x.log \
      nnet-compute-prob $dir/$x.1.mdl ark:$egs_dir/valid_diagnostic.egs &
    $cmd $dir/log/compute_prob_train.$x.log \
      nnet-compute-prob $dir/$x.1.mdl ark:$egs_dir/train_diagnostic.egs &
    if [ $x -gt 0 ] && [ ! -f $dir/log/mix_up.$[$x-1].log ]; then
      $cmd $dir/log/progress.$x.log \
        nnet-show-progress --use-gpu=no $dir/$[$x-1].1.mdl $dir/$x.1.mdl ark:$egs_dir/train_diagnostic.egs &
    fi

    # mdl[i] holds the input model (a file name or a pipe command) for
    # ensemble member i.
    declare -A mdl
    echo "Training neural net (pass $x)"
    # Layer-adding schedule: on iterations 1, 1+add_layers_period, ... up to
    # (num_hidden_layers-1)*add_layers_period a new hidden layer is inserted
    # into every member, each with its own random seed.
    if [ $x -gt 0 ] && \
      [ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \
      [ $[($x-1) % $add_layers_period] -eq 0 ]; then
      for i in `seq 1 $ensemble_size`; do
        mdl[$i]="nnet-init --srand=$[$x+$i] $dir/hidden.config - | nnet-insert $dir/$x.$i.mdl - - |"
      done
    else
      for i in `seq 1 $ensemble_size`; do
        mdl[$i]=$dir/$x.$i.mdl
      done
    fi

    # Argument lists for nnet-train-ensemble: all input models (single-quoted,
    # as they may be pipe commands) and all output models of one job.
    nnets_ensemble_in=
    nnets_ensemble_out=
    for i in `seq 1 $ensemble_size`; do
      nnets_ensemble_in="$nnets_ensemble_in '${mdl[$i]}'"
      nnets_ensemble_out="${nnets_ensemble_out} $dir/$[$x+1].JOB.$i.mdl "
    done

    # beta for this iteration, interpolated between initial_beta and final_beta.
    # NOTE(review): the expression below is a linear interpolation, whereas the
    # header of this script describes an exponentially increasing schedule.
    beta=`perl -e '($x,$n,$i,$f)=@ARGV; print ($i+$x*($f-$i)/$n);' $[$x+1] $num_iters $initial_beta $final_beta`;

    # One training job per egs archive; job JOB writes $dir/$[x+1].JOB.<i>.mdl
    # for every ensemble member i.
    $cmd $parallel_opts JOB=1:$num_jobs_nnet $dir/log/train.$x.JOB.log \
      nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x \
      ark:$egs_dir/egs.JOB.$[$x%$iters_per_epoch].ark ark:- \| \
      nnet-train-ensemble \
         --minibatch-size=$minibatch_size --srand=$x --beta=$beta $nnets_ensemble_in \
        ark:- $nnets_ensemble_out \
      || exit 1;

    # Learning rate for the next model: geometric decay from the initial to
    # the final rate over num_iters_reduce iterations, constant afterwards.
    learning_rate=`perl -e '($x,$n,$i,$f)=@ARGV; print ($x >= $n ? $f : $i*exp($x*log($f/$i)/$n));' $[$x+1] $num_iters_reduce $initial_learning_rate $final_learning_rate`;
    softmax_learning_rate=`perl -e "print $learning_rate * $softmax_learning_rate_factor;"`;
    # Inspect one of the freshly trained models to find out how many updatable
    # components it has; $dir/foo is a scratch file.
    nnet-am-info $dir/$[$x+1].1.1.mdl > $dir/foo  2>/dev/null || exit 1
    nu=`cat $dir/foo | grep num-updatable-components | awk '{print $2}'`
    na=`cat $dir/foo | grep -v Fixed | grep AffineComponent | wc -l`
    # na is number of last updatable AffineComponent layer [one-based, counting only
    # updatable components.]
    # Build the colon-separated list of per-component learning rates:
    # components na-1 and na get the softmax learning rate, all others the
    # regular one.
    lr_string="$learning_rate"
    for n in `seq 2 $nu`; do
      if [ $n -eq $na ] || [ $n -eq $[$na-1] ]; then lr=$softmax_learning_rate;
      else lr=$learning_rate; fi
      lr_string="$lr_string:$lr"
    done

    # For each ensemble member, average the models of all jobs, set the new
    # learning rates and delete the per-job models.
    for i in `seq 1 $ensemble_size`; do
      nnets_list=
      for n in `seq 1 $num_jobs_nnet`; do
        nnets_list="$nnets_list $dir/$[$x+1].$n.$i.mdl"
      done
      $cmd $dir/log/average.$x.$i.log \
        nnet-am-average $nnets_list - \| \
        nnet-am-copy --learning-rates=$lr_string - $dir/$[$x+1].$i.mdl || exit 1;
      rm $nnets_list
      if [ "$mix_up" -gt 0 ] && [ $x -eq $mix_up_iter ]; then
        # mix up.
        echo Mixing up from $num_leaves to $mix_up components
        $cmd $dir/log/mix_up.$x.$i.log \
          nnet-am-mixup --min-count=10 --num-mixtures=$mix_up \
          $dir/$[$x+1].$i.mdl $dir/$[$x+1].$i.mdl || exit 1;
      fi
    done
  fi
  x=$[$x+1]
done

# Now do combination.
# At the end, final.mdl will be a combination of the last e.g. 10 models.

# Combine the last models of every ensemble member into final.$i.mdl, then
# publish member 1 as final.mdl.
#
# Warn once (instead of once per ensemble member) when more final iterations
# were requested than there are constant-learning-rate iterations.  This script
# never actually changed num_iters_final at this point, so the message now says
# so; the selection loop below only picks iterations later than $mix_up_iter.
if [ $num_iters_final -gt $num_iters_extra ]; then
  echo "$0: Warning: num_iters_final=$num_iters_final is larger than num_iters_extra=$num_iters_extra; leaving it unchanged."
fi

for i in `seq 1 $ensemble_size`; do
  # Models of ensemble member $i that are handed to nnet-combine-fast.
  nnets_list=()
  start=$[$num_iters-$num_iters_final+1]
  for x in `seq $start $num_iters`; do
    idx=$[$x-$start]
    if [ $x -gt $mix_up_iter ]; then
      nnets_list[$idx]=$dir/$x.$i.mdl # "nnet-am-copy --remove-dropout=true $dir/$x.mdl - |"
    fi
  done

  if [ $stage -le $num_iters ]; then
    # Below, use --use-gpu=no to disable nnet-combine-fast from using a GPU, as
    # if there are many models it can give out-of-memory error; set num-threads to 8
    # to speed it up (this isn't ideal...)
    this_num_threads=$num_threads
    [ $this_num_threads -lt 8 ] && this_num_threads=8
    # The last field of the last line printed by nnet-copy-egs is taken as the
    # number of examples; the minibatch size is that count split over the
    # threads (rounded up), capped at 512.
    num_egs=`nnet-copy-egs ark:$egs_dir/combine.egs ark:/dev/null 2>&1 | tail -n 1 | awk '{print $NF}'`
    mb=$[($num_egs+$this_num_threads-1)/$this_num_threads]
    [ $mb -gt 512 ] && mb=512
    # Setting --initial-model to a large value makes it initialize the combination
    # with the average of all the models.  It's important not to start with a
    # single model, or, due to the invariance to scaling that these nonlinearities
    # give us, we get zero diagonal entries in the fisher matrix that
    # nnet-combine-fast uses for scaling, which after flooring and inversion, has
    # the effect that the initial model chosen gets much higher learning rates
    # than the others.  This prevents the optimization from working well.
    # (--initial-model and --num-lbfgs-iters used to be given twice on this
    # command line; each is now passed once.)
    $cmd $parallel_opts $dir/log/combine.$i.log \
      nnet-combine-fast --initial-model=100000 --num-lbfgs-iters=40 --use-gpu=no \
        --num-threads=$this_num_threads --regularizer=$combine_regularizer \
        --verbose=3 --minibatch-size=$mb "${nnets_list[@]}" ark:$egs_dir/combine.egs \
        $dir/final.$i.mdl || exit 1;

    # Normalize stddev for affine or block affine layers that are followed by a
    # pnorm layer and then a normalize layer.
    $cmd $parallel_opts $dir/log/normalize.$i.log \
      nnet-normalize-stddev $dir/final.$i.mdl $dir/final.$i.mdl || exit 1;
  fi
  # Compute the probability of the final, combined model with
  # the same subset we used for the previous compute_probs, as the
  # different subsets will lead to different probs.
  # (Both diagnostics run in the background.)
  $cmd $dir/log/compute_prob_valid.final.$i.log \
    nnet-compute-prob $dir/final.$i.mdl ark:$egs_dir/valid_diagnostic.egs &
  $cmd $dir/log/compute_prob_train.final.$i.log \
    nnet-compute-prob $dir/final.$i.mdl ark:$egs_dir/train_diagnostic.egs &
done
# The first ensemble member becomes the model that callers pick up as final.mdl.
cp $dir/final.1.mdl $dir/final.mdl || exit 1;

sleep 2

echo Done

# Optional cleanup: drop the training examples (only when they live inside
# $dir) and most of the intermediate per-member models.
if $cleanup; then
  echo Cleaning up data
  if [ $egs_dir == "$dir/egs" ]; then
    steps/nnet2/remove_egs.sh $dir/egs
  fi
  echo Removing most of the models
  # First iteration whose models feed the final combination; those are kept.
  first_kept=$(($num_iters-$num_iters_final+1))
  for iter in $(seq 0 $num_iters); do
    # Keep every 10th model, and everything from $first_kept onwards.
    if [ $(($iter%10)) -eq 0 ] || [ $iter -ge $first_kept ]; then
      continue
    fi
    for member in $(seq 1 $ensemble_size); do
      rm $dir/$iter.$member.mdl
    done
  done
fi


================================================
FILE: egs/steps/nnet2/train_pnorm_fast.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2014  Johns Hopkins University (Author: Daniel Povey).
#           2013  Xiaohui Zhang
#           2013  Guoguo Chen
# Apache 2.0.


# train_pnorm_fast.sh is a new, improved version of train_pnorm.sh, which uses
# the 'online' preconditioning method.  For GPUs it's about two times faster
# than before (although that's partly due to optimizations that will also help
# the old recipe), and for CPUs it gives better performance than the old method
# (I believe); also, the difference in optimization performance between CPU and
# GPU is almost gone.  The old train_pnorm.sh script is now deprecated.
# We made this a separate script because not all of the options that the
# old script accepted, are still accepted.

# Begin configuration section.
# Every variable below is a default; the script sources parse_options.sh right
# after this section, and the usage message documents the matching --options.
cmd=run.pl
num_epochs=15      # Number of epochs during which we reduce
                   # the learning rate; number of iterations is worked out from this.
num_epochs_extra=5 # Number of epochs after we stop reducing
                   # the learning rate.
num_iters_final=20 # Maximum number of final iterations (models) to give to the
                   # optimization over the validation set (nnet-combine-fast).
initial_learning_rate=0.04
final_learning_rate=0.004
bias_stddev=0.5
pnorm_input_dim=3000
pnorm_output_dim=300
p=2                # the "p" written into every PnormComponent line of the nnet config.
presoftmax_prior_scale_power=-0.25 # use the specified power value on the priors (inverse priors)
                                   # to scale the pre-softmax outputs
minibatch_size=128 # by default use a smallish minibatch size for neural net
                   # training; this controls instability which would otherwise
                   # be a problem with multi-threaded update.

samples_per_iter=200000 # each iteration of training, see this many samples
                        # per job.  This option is passed to get_egs.sh
num_jobs_nnet=16   # Number of neural net jobs to run in parallel.  This option
                   # is passed to get_egs.sh.
get_egs_stage=0    # passed to get_egs.sh as --stage.
online_ivector_dir=  # if non-empty, passed to get_lda.sh and get_egs.sh as --online-ivector-dir.

shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
                # on each iter.  You could set it to 0 or to a large value for complete
                # randomization, but this would both consume memory and cause spikes in
                # disk I/O.  Smaller is easier on disk and memory but less random.  It's
                # not a huge deal though, as samples are anyway randomized right at the start.
                # (The point of this is to get data in different minibatches on different iterations,
                # since in the preconditioning method, 2 samples in the same minibatch can
                # affect each others' gradients.)

add_layers_period=2 # by default, add new layers every 2 iterations.
num_hidden_layers=3
stage=-5

io_opts="--max-jobs-run 15" # for jobs with a lot of I/O, limits the number running at one time.
                            # Passed to get_egs.sh as --io-opts.
splice_width=4 # meaning +- 4 frames on each side for second LDA
randprune=4.0 # speeds up LDA.  NOTE(review): not referenced anywhere else in this script.
alpha=4.0 # relates to preconditioning.
update_period=4 # relates to online preconditioning: says how often we update the subspace.
num_samples_history=2000 # relates to online preconditioning
max_change_per_sample=0.075
precondition_rank_in=20  # relates to online preconditioning
precondition_rank_out=80 # relates to online preconditioning

# this relates to perturbed training.
min_target_objf_change=0.1
target_multiplier=0 #  Set this to e.g. 1.0 to enable perturbed training.

mix_up=0 # Number of components to mix up to (should be > #tree leaves, if
        # specified.)  NOTE: mix-up is disabled in this script; a positive
        # value only makes the training loop print a warning.
num_threads=16
parallel_opts="--num-threads 16 --mem 1G" # by default we use 16 threads; this lets the queue know.
  # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.
combine_num_threads=8
combine_parallel_opts="--num-threads 8"  # queue options for the "combine" stage.
cleanup=true
egs_dir=           # If empty, get_egs.sh is run and $dir/egs is used.
lda_opts=          # Extra options passed verbatim to get_lda.sh.
lda_dim=           # Overwritten with the contents of $dir/lda_dim once get_lda.sh has run.
egs_opts=          # Extra options passed verbatim to get_egs.sh.
transform_dir=     # If supplied, overrides alidir
cmvn_opts=  # will be passed to get_lda.sh and get_egs.sh, if supplied.
            # only relevant for "raw" features, not lda.
feat_type=  # Can be used to force "raw" features.
prior_subset_size=10000 # 10k samples per job, for computing priors.  Should be
                        # more than enough.
# End configuration section.


echo "$0 $@"  # Print the command line for logging

# Pick up the Kaldi environment (if a path.sh is present) and let
# parse_options.sh override the configuration defaults from --options.
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# Exactly four positional arguments are required; otherwise print the usage
# message and stop.  The defaults quoted below mirror the configuration
# section at the top of this script.
if [ $# != 4 ]; then
  echo "Usage: $0 [opts] <data> <lang> <ali-dir> <exp-dir>"
  echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --num-epochs <#epochs|15>                        # Number of epochs of main training"
  echo "                                                   # while reducing learning rate (determines #iterations, together"
  echo "                                                   # with --samples-per-iter and --num-jobs-nnet)"
  echo "  --num-epochs-extra <#epochs-extra|5>             # Number of extra epochs of training"
  echo "                                                   # after learning rate fully reduced"
  echo "  --initial-learning-rate <initial-learning-rate|0.04> # Learning rate at start of training, e.g. 0.02 for small"
  echo "                                                       # data, 0.01 for large data"
  echo "  --final-learning-rate  <final-learning-rate|0.004>   # Learning rate at end of training, e.g. 0.004 for small"
  echo "                                                   # data, 0.001 for large data"
  echo "  --num-hidden-layers <#hidden-layers|3>           # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs"
  echo "  --add-layers-period <#iters|2>                   # Number of iterations between adding hidden layers"
  echo "  --mix-up <#pseudo-gaussians|0>                   # This option now does nothing; please remove it."
  echo "  --presoftmax-prior-scale-power <power|-0.25>     # use the specified power value on the priors (inverse priors) "
  echo "                                                   # to scale the pre-softmax outputs."
  echo "                                                   # (set to 0.0 to disable the presoftmax element scale)"
  echo "  --num-jobs-nnet <num-jobs|16>                    # Number of parallel jobs to use for main neural net"
  echo "                                                   # training (will affect results as well as speed; try 8, 16)"
  echo "                                                   # Note: if you increase this, you may want to also increase"
  echo "                                                   # the learning rate."
  echo "  --num-threads <num-threads|16>                   # Number of parallel threads per job (will affect results"
  echo "                                                   # as well as speed; may interact with batch size; if you increase"
  echo "                                                   # this, you may want to decrease the batch size."
  echo "  --parallel-opts <opts|\"--num-threads 16 --mem 1G\">      # extra options to pass to e.g. queue.pl for processes that"
  echo "                                                   # use multiple threads... "
  echo "  --io-opts <opts|\"--max-jobs-run 15\">                      # Options given to e.g. queue.pl for jobs that do a lot of I/O."
  echo "  --minibatch-size <minibatch-size|128>            # Size of minibatch to process (note: product with --num-threads"
  echo "                                                   # should not get too large, e.g. >2k)."
  echo "  --samples-per-iter <#samples|200000>             # Number of samples of data to process per iteration, per"
  echo "                                                   # process."
  echo "  --splice-width <width|4>                         # Number of frames on each side to append for feature input"
  echo "                                                   # (note: we splice processed, typically 40-dimensional frames)"
  echo "  --lda-dim <dim|250>                              # Dimension to reduce spliced features to with LDA"
  echo "  --num-iters-final <#iters|20>                    # Number of final iterations to give to nnet-combine-fast to "
  echo "                                                   # interpolate parameters (the weights are learned with a validation set)"
  echo "  --first-component-power <power|1.0>              # Power applied to output of first p-norm layer... setting this to"
  echo "                                                   # 0.5 seems to help under some circumstances."
  echo "  --stage <stage|-5>                               # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."


  exit 1;
fi

# Positional arguments: <data> <lang> <ali-dir> <exp-dir>.
data=$1; lang=$2; alidir=$3; dir=$4

# Refuse to start unless every required input file is present.
for required in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree; do
  if [ ! -f $required ]; then
    echo "$0: no such file $required" && exit 1
  fi
done


# Set some variables.
# num_leaves is the second field of the "num-pdfs" line printed by tree-info.
# The status of the pipeline is that of awk, so a failing tree-info is caught
# by the emptiness test below rather than by "|| exit 1".
num_leaves=`tree-info $alidir/tree 2>/dev/null | grep num-pdfs | awk '{print $2}'` || exit 1
[ -z "$num_leaves" ] && echo "\$num_leaves is unset" && exit 1
[ "$num_leaves" -eq "0" ] && echo "\$num_leaves is 0" && exit 1

nj=`cat $alidir/num_jobs` || exit 1;  # number of jobs in alignment dir...
# in this dir we'll have just one job.
sdata=$data/split$nj
utils/split_data.sh $data $nj || exit 1;

# Create the experiment directory and copy over what later stages expect to
# find there; stop immediately if any of this fails.
mkdir -p $dir/log || exit 1;
echo $nj > $dir/num_jobs
cp $alidir/tree $dir || exit 1;

utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

# Options shared by get_lda.sh and get_egs.sh.  An array is used so that
# values containing spaces (e.g. --cmvn-opts) stay a single argument.
extra_opts=()
[ -n "$cmvn_opts" ] && extra_opts+=(--cmvn-opts "$cmvn_opts")
[ -n "$feat_type" ] && extra_opts+=(--feat-type $feat_type)
[ -n "$online_ivector_dir" ] && extra_opts+=(--online-ivector-dir $online_ivector_dir)
[ -z "$transform_dir" ] && transform_dir=$alidir
extra_opts+=(--transform-dir $transform_dir)
extra_opts+=(--splice-width $splice_width)

if [ $stage -le -4 ]; then
  echo "$0: calling get_lda.sh"
  steps/nnet2/get_lda.sh $lda_opts "${extra_opts[@]}" --cmd "$cmd" $data $lang $alidir $dir || exit 1;
fi

# these files will have been written by get_lda.sh
feat_dim=$(cat $dir/feat_dim) || exit 1;
ivector_dim=$(cat $dir/ivector_dim) || exit 1;
lda_dim=$(cat $dir/lda_dim) || exit 1;

# Examples are only dumped when the caller did not supply --egs-dir.
if [ $stage -le -3 ] && [ -z "$egs_dir" ]; then
  echo "$0: calling get_egs.sh"
  # $egs_opts used to appear twice on this command line.  Only the later
  # occurrence is kept, so user-supplied options still come after (and,
  # assuming the usual last-one-wins option parsing, still override) the
  # explicit --samples-per-iter/--num-jobs-nnet/--stage/--cmd values.
  steps/nnet2/get_egs.sh "${extra_opts[@]}" \
      --samples-per-iter $samples_per_iter \
      --num-jobs-nnet $num_jobs_nnet --stage $get_egs_stage \
      --cmd "$cmd" $egs_opts --io-opts "$io_opts" \
      $data $lang $alidir $dir || exit 1;
fi

# Default location of the training examples when --egs-dir was not given.
if [ -z "$egs_dir" ]; then
  egs_dir=$dir/egs
fi

iters_per_epoch=`cat $egs_dir/iters_per_epoch`  || exit 1;
# The number of jobs is dictated by how the examples were dumped.  Read it
# once (failing early if the file is missing) and warn if it differs from the
# value requested on the command line.
egs_num_jobs_nnet=`cat $egs_dir/num_jobs_nnet` || exit 1;
if ! [ "$num_jobs_nnet" -eq "$egs_num_jobs_nnet" ]; then
  echo "$0: Warning: using --num-jobs-nnet=$egs_num_jobs_nnet from $egs_dir"
fi
num_jobs_nnet=$egs_num_jobs_nnet


if ! [ $num_hidden_layers -ge 1 ]; then
  echo "Invalid num-hidden-layers $num_hidden_layers"
  exit 1
fi

# Stage -2: write the network configuration files and create the initial
# model $dir/0.mdl.
if [ $stage -le -2 ]; then
  echo "$0: initializing neural net";
  lda_mat=$dir/lda.mat
  # Input to the splicing component: features plus the ivector dimension read
  # from $dir/ivector_dim.
  tot_input_dim=$[$feat_dim+$ivector_dim]

  # Options appended to every AffineComponentPreconditionedOnline line below.
  online_preconditioning_opts="alpha=$alpha num-samples-history=$num_samples_history update-period=$update_period rank-in=$precondition_rank_in rank-out=$precondition_rank_out max-change-per-sample=$max_change_per_sample"

  # Parameter stddev for the hidden affine layers: 1/sqrt(pnorm_input_dim).
  stddev=`perl -e "print 1.0/sqrt($pnorm_input_dim);"`
  # nnet.config describes the initial network: splicing, the fixed LDA
  # transform, a single hidden layer (affine -> pnorm -> normalize), and an
  # output affine layer (initialized with zero stddev) followed by softmax.
  # NOTE: do not put comments inside the here-documents; every line between
  # "<<EOF" and "EOF" is written to the config file.
  cat >$dir/nnet.config <<EOF
SpliceComponent input-dim=$tot_input_dim left-context=$splice_width right-context=$splice_width const-component-dim=$ivector_dim
FixedAffineComponent matrix=$lda_mat
AffineComponentPreconditionedOnline input-dim=$lda_dim output-dim=$pnorm_input_dim $online_preconditioning_opts learning-rate=$initial_learning_rate param-stddev=$stddev bias-stddev=$bias_stddev
PnormComponent input-dim=$pnorm_input_dim output-dim=$pnorm_output_dim p=$p
NormalizeComponent dim=$pnorm_output_dim
AffineComponentPreconditionedOnline input-dim=$pnorm_output_dim output-dim=$num_leaves $online_preconditioning_opts learning-rate=$initial_learning_rate param-stddev=0 bias-stddev=0
SoftmaxComponent dim=$num_leaves
EOF

  # to hidden.config it will write the part of the config corresponding to a
  # single hidden layer; we need this to add new layers.
  cat >$dir/hidden.config <<EOF
AffineComponentPreconditionedOnline input-dim=$pnorm_output_dim output-dim=$pnorm_input_dim $online_preconditioning_opts learning-rate=$initial_learning_rate param-stddev=$stddev bias-stddev=$bias_stddev
PnormComponent input-dim=$pnorm_input_dim output-dim=$pnorm_output_dim p=$p
NormalizeComponent dim=$pnorm_output_dim
EOF
  # Build 0.mdl from the tree, the topology and the freshly initialized net.
  $cmd $dir/log/nnet_init.log \
    nnet-am-init $alidir/tree $lang/topo "nnet-init $dir/nnet.config -|" \
    $dir/0.mdl || exit 1;
fi

# Stage -1: train the transition probabilities / priors of 0.mdl and, unless
# presoftmax_prior_scale_power is "0.0", insert a FixedScaleComponent in front
# of the softmax.
if [ $stage -le -1 ]; then
  echo "Training transition probabilities and setting priors"
  $cmd $dir/log/train_trans.log \
    nnet-train-transitions $dir/0.mdl "ark:gunzip -c $alidir/ali.*.gz|" $dir/0.mdl \
    || exit 1;

  if [ "$presoftmax_prior_scale_power" != "0.0" ]; then
    echo "prepare vector assignment for FixedScaleComponent before softmax"
    echo "(use priors^$presoftmax_prior_scale_power and rescale to average 1)"

    # obtains raw pdf count
    $cmd JOB=1:$nj $dir/log/acc_pdf.JOB.log \
      ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
      post-to-tacc --per-pdf=true --binary=false $alidir/final.mdl ark:- $dir/JOB.pacc || exit 1;
    cat $dir/*.pacc > $dir/pacc
    rm $dir/*.pacc
    # The awk program sums fields 2..NF-1 over all lines of pacc (one line per
    # job; the first and last fields are skipped, presumably the "[" and "]"
    # of the text vector).  It then smooths each sum with 1% of the average
    # count, turns the sums into relative frequencies, raises them to
    # "power" and divides by their mean so that the scales average to 1.
    awk -v power=$presoftmax_prior_scale_power \
      '{ for(i=2; i<=NF-1; i++) {sum[i]+=$i} }
      END {
        for (i=2; i<=NF-1; i++) {total+=sum[i]}
        ave_pdf=int(total/(NF-2)); total+=0.01*ave_pdf*(NF-2)
        for (i=2; i<=NF-1; i++) {rescale+=((sum[i]+0.01*ave_pdf)/total)^power}
        rescale/=(NF-2)
        printf " [ "; for (i=2; i<=NF-1; i++) {printf("%f ", ((sum[i]+0.01*ave_pdf)/total)^power/rescale)}; print "]"
      }' $dir/pacc > $dir/presoftmax_prior_scale_vecfile

    echo "FixedScaleComponent scales=$dir/presoftmax_prior_scale_vecfile" > $dir/per_element.config
    echo "insert an additional layer of FixedScaleComponent before softmax"
    # Second field of the "Softmax" line of nnet-am-info: used as the
    # insertion position.
    inp=`nnet-am-info $dir/0.mdl | grep 'Softmax' | awk '{print $2}'`
    [ -z "$inp" ] && echo "$0: could not locate the Softmax component in $dir/0.mdl" && exit 1;
    # The layer-adding code in the training loop assumes this component is
    # present, so a failed insertion must stop the script instead of being
    # silently ignored.
    nnet-init $dir/per_element.config - | \
      nnet-insert --insert-at=$inp --randomize-next-component=false $dir/0.mdl - $dir/0.mdl || exit 1;
  fi

fi

# Translate epochs into iterations: first the phase in which the learning rate
# is reduced, then the phase in which it stays constant.
num_iters_reduce=$(($num_epochs * $iters_per_epoch))
num_iters_extra=$(($num_epochs_extra * $iters_per_epoch))
num_iters=$(($num_iters_reduce + $num_iters_extra))

# Report the training schedule.
printf '%s\n' \
  "$0: Will train for $num_epochs + $num_epochs_extra epochs, equalling " \
  "$0: $num_iters_reduce + $num_iters_extra = $num_iters iterations, " \
  "$0: (while reducing learning rate) + (with constant learning rate)." \
  "$0: Will not do mix up"

# set_target_objf_change: sets the global $target_objf_change for iteration $x
# of perturbed training.
#
# Reads the globals x, dir, target_multiplier, min_target_objf_change and
# finish_add_layers_iter.  It leaves $target_objf_change untouched when
# perturbed training is disabled (target_multiplier is "0" or "0.0"), while
# layers are still being added, or when there is no iteration $x-2 to look at.
# Otherwise it blocks (polling every 60 seconds) until the train/valid
# diagnostics of iteration $x-2 are available and sets $target_objf_change to
# target_multiplier * (train_prob - valid_prob), or to 0 when that value is
# below min_target_objf_change or a probability is not negative.
function set_target_objf_change {
  # nothing to do if $target_multiplier not set.
  if [ "$target_multiplier" == "0" ] || [ "$target_multiplier" == "0.0" ]; then
    return
  fi
  [ $x -le $finish_add_layers_iter ] && return;
  local lag=2  # the compute_prob_{train,valid} from 2 iterations ago should
               # most likely be done even though we backgrounded them.
  local ref_iter=$[$x-$lag]
  [ $ref_iter -le 0 ] && return;
  while true; do
    # Note: awk 'some-expression' is the same as: awk '{if(some-expression) print;}'
    # i.e. keep only the lines of the log that consist of a single field.
    train_prob=$(awk '(NF == 1)' < $dir/log/compute_prob_train.$ref_iter.log)
    valid_prob=$(awk '(NF == 1)' < $dir/log/compute_prob_valid.$ref_iter.log)
    if [ -z "$train_prob" ] || [ -z "$valid_prob" ]; then
      echo "$0: waiting until $dir/log/compute_prob_{train,valid}.$ref_iter.log are done"
      sleep 60
    else
      # The error message used to interpolate $train_prob/$valid_prob, which
      # are shell names that do not exist inside the Perl program (so it
      # printed empty values); it now reports Perl's own $train/$valid.
      target_objf_change=$(perl -e '
        ($train, $valid, $min_change, $multiplier) = @ARGV;
        if (!($train < 0.0) || !($valid < 0.0)) {
          print "0\n";
          print STDERR "Error: invalid train or valid prob: $train, $valid\n";
          exit(0);
        } else {
          print STDERR "train,valid=$train,$valid\n";
          $proposed_target = $multiplier * ($train-$valid);
          if ($proposed_target < $min_change) { print "0"; } else { print $proposed_target; }
        }' -- "$train_prob" "$valid_prob" "$min_target_objf_change" "$target_multiplier")
      echo "On iter $x, (train,valid) probs from iter $ref_iter were ($train_prob,$valid_prob), and setting target-objf-change to $target_objf_change."
      return;
    fi
  done
}

# Iteration by which all hidden layers have been added.
finish_add_layers_iter=$(($num_hidden_layers * $add_layers_period))
# This is when we decide to mix up from: halfway between when we've finished
# adding the hidden layers and the end of training.
mix_up_iter=$((($num_iters + $finish_add_layers_iter)/2))

# Default to the multi-threaded trainer; switch to the single-threaded one
# (which can use GPU code) when exactly one thread is requested.
parallel_suffix="-parallel"
parallel_train_opts="--num-threads=$num_threads"
if [ $num_threads -eq 1 ]; then
  parallel_suffix="-simple"
  parallel_train_opts=
  cuda-compiled || {
    echo "$0: WARNING: you are running with one thread but you have not compiled"
    echo "   for CUDA.  You may be running a setup optimized for GPUs.  If you have"
    echo "   GPUs and have nvcc installed, go to src/ and do ./configure; make"
  }
fi

# Loop state: iteration counter and the perturbed-training target.
x=0
target_objf_change=0

# Main training loop.  Iteration $x reads $dir/$x.mdl and writes
# $dir/$[$x+1].mdl; iterations below $stage are skipped so that an interrupted
# run can be resumed.
while [ $x -lt $num_iters ]; do
  if [ $x -ge 0 ] && [ $stage -le $x ]; then
    # Set off jobs doing some diagnostics, in the background.
    $cmd $dir/log/compute_prob_valid.$x.log \
      nnet-compute-prob $dir/$x.mdl ark:$egs_dir/valid_diagnostic.egs &
    $cmd $dir/log/compute_prob_train.$x.log \
      nnet-compute-prob $dir/$x.mdl ark:$egs_dir/train_diagnostic.egs &
    if [ $x -gt 0 ] && [ ! -f $dir/log/mix_up.$[$x-1].log ]; then
      $cmd $dir/log/progress.$x.log \
        nnet-show-progress --use-gpu=no $dir/$[$x-1].mdl $dir/$x.mdl \
          ark:$egs_dir/train_diagnostic.egs '&&' \
        nnet-am-info $dir/$x.mdl &
    fi

    echo "Training neural net (pass $x)"

    # On the iterations where a hidden layer is due, the model is a pipe that
    # inserts a freshly initialized layer (hidden.config) in front of the
    # output layer; otherwise it is simply the current model file.
    if [ $x -gt 0 ] && \
      [ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \
      [ $[($x-1) % $add_layers_period] -eq 0 ]; then

      inp=`nnet-am-info $dir/$x.mdl | grep 'Softmax' | awk '{print $2}'`
      # Step back over the final affine layer, and additionally over the
      # FixedScaleComponent when the presoftmax scale is in use.
      if [ "$presoftmax_prior_scale_power" != "0.0" ]; then
        inp=$[$inp-2]
      else
        inp=$[$inp-1]
      fi
      mdl="nnet-init --srand=$x $dir/hidden.config - | nnet-insert --insert-at=$inp $dir/$x.mdl - - |"

    else
      mdl=$dir/$x.mdl
    fi
    if [ $x -eq 0 ] || [ "$mdl" != "$dir/$x.mdl" ]; then
      # on iteration zero or when we just added a layer, use a smaller minibatch
      # size and just one job: the model-averaging doesn't seem to be helpful
      # when the model is changing too fast (i.e. it worsens the objective
      # function), and the smaller minibatch size will help to keep
      # the update stable.
      this_minibatch_size=$[$minibatch_size/2];
      do_average=false
    else
      this_minibatch_size=$minibatch_size
      do_average=true
    fi

    set_target_objf_change;  # only has effect if target_multiplier != 0
    if [ "$target_objf_change" != "0" ]; then
      [ ! -f $dir/within_covar.spmat ] && \
        echo "$0: expected $dir/within_covar.spmat to exist." && exit 1;
      perturb_suffix="-perturbed"
      perturb_opts="--target-objf-change=$target_objf_change --within-covar=$dir/within_covar.spmat"
    else
      # The target can drop back to 0 on a later iteration; without this reset
      # the previous iteration's "-perturbed" binary and its stale
      # --target-objf-change value would keep being used.
      perturb_suffix=
      perturb_opts=
    fi

    $cmd $parallel_opts JOB=1:$num_jobs_nnet $dir/log/train.$x.JOB.log \
      nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x \
      ark:$egs_dir/egs.JOB.$[$x%$iters_per_epoch].ark ark:- \| \
       nnet-train$parallel_suffix$perturb_suffix $parallel_train_opts $perturb_opts \
        --minibatch-size=$this_minibatch_size --srand=$x "$mdl" \
        ark:- $dir/$[$x+1].JOB.mdl \
      || exit 1;

    # The per-job models written by the training command above.
    nnets_list=
    for n in `seq 1 $num_jobs_nnet`; do
      nnets_list="$nnets_list $dir/$[$x+1].$n.mdl"
    done

    # Learning rate for the next iteration: exponential interpolation from the
    # initial to the final rate over num_iters_reduce iterations, then constant.
    learning_rate=`perl -e '($x,$n,$i,$f)=@ARGV; print ($x >= $n ? $f : $i*exp($x*log($f/$i)/$n));' $[$x+1] $num_iters_reduce $initial_learning_rate $final_learning_rate`;

    if $do_average; then
      $cmd $dir/log/average.$x.log \
        nnet-am-average $nnets_list - \| \
        nnet-am-copy --learning-rate=$learning_rate - $dir/$[$x+1].mdl || exit 1;
    else
      # No averaging: pick the job whose training log reports the highest
      # log-prob-per-frame (the last such value in each log is used).
      n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) {
          $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn";
          undef $logprob; while (<F>) { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } }
          close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob;
          $best_n=$n; } } print "$best_n\n"; ' $num_jobs_nnet $dir/log/train.$x.%d.log) || exit 1;
      [ -z "$n" ] && echo "Error getting best model" && exit 1;
      $cmd $dir/log/select.$x.log \
        nnet-am-copy --learning-rate=$learning_rate $dir/$[$x+1].$n.mdl $dir/$[$x+1].mdl || exit 1;
    fi

    if [ "$mix_up" -gt 0 ] && [ $x -eq $mix_up_iter ]; then
      echo "Warning: the mix up opertion is disabled!"
      echo "    Ignore mix up leaves number specified"
    fi
    rm $nnets_list
    [ ! -f $dir/$[$x+1].mdl ] && exit 1;
    # Remove the model from the previous iteration unless it is a multiple of
    # 100 or may still be needed for the final combination.
    if [ -f $dir/$[$x-1].mdl ] && $cleanup && \
       [ $[($x-1)%100] -ne 0  ] && [ $[$x-1] -le $[$num_iters-$num_iters_final] ]; then
      rm $dir/$[$x-1].mdl
    fi
  fi
  x=$[$x+1]
done

# Now do combination.
# At the end, final.mdl will be a combination of the last e.g. 10 models.
nnets_list=()
# The message here used to claim "Setting num_iters_final=..." although the
# variable was never modified.  Behaviour is unchanged; the message now states
# what really happens.  Models from before the constant-learning-rate phase
# can therefore take part, but never ones from iteration $mix_up_iter or
# earlier (see the filter in the loop below).
if [ $num_iters_final -gt $num_iters_extra ]; then
  echo "$0: Warning: num_iters_final=$num_iters_final is larger than num_iters_extra=$num_iters_extra; leaving it unchanged."
fi
start=$[$num_iters-$num_iters_final+1]
for x in `seq $start $num_iters`; do
  idx=$[$x-$start]
  # Skipped indexes simply stay unset; "${nnets_list[@]}" expands only to the
  # elements that were assigned.
  if [ $x -gt $mix_up_iter ]; then
    nnets_list[$idx]=$dir/$x.mdl # "nnet-am-copy --remove-dropout=true $dir/$x.mdl - |"
  fi
done

# Combine the selected models into final.mdl, normalize it, and start the
# final diagnostics in the background.
if [ $stage -le $num_iters ]; then
  echo "Doing final combination to produce final.mdl"
  # Below, use --use-gpu=no to disable nnet-combine-fast from using a GPU, as
  # if there are many models it can give out-of-memory error; set num-threads to 8
  # to speed it up (this isn't ideal...)
  #
  # Minibatch size: the example count (last field of the last line printed by
  # nnet-copy-egs) divided over the threads, rounded up and capped at 512.
  num_egs=$(nnet-copy-egs ark:$egs_dir/combine.egs ark:/dev/null 2>&1 | tail -n 1 | awk '{print $NF}')
  mb=$((($num_egs+$combine_num_threads-1)/$combine_num_threads))
  if [ $mb -gt 512 ]; then
    mb=512
  fi
  # Setting --initial-model to a large value makes it initialize the combination
  # with the average of all the models.  It's important not to start with a
  # single model, or, due to the invariance to scaling that these nonlinearities
  # give us, we get zero diagonal entries in the fisher matrix that
  # nnet-combine-fast uses for scaling, which after flooring and inversion, has
  # the effect that the initial model chosen gets much higher learning rates
  # than the others.  This prevents the optimization from working well.
  $cmd $combine_parallel_opts $dir/log/combine.log \
    nnet-combine-fast \
      --initial-model=100000 --num-lbfgs-iters=40 --use-gpu=no \
      --num-threads=$combine_num_threads \
      --verbose=3 --minibatch-size=$mb \
      "${nnets_list[@]}" ark:$egs_dir/combine.egs $dir/final.mdl \
    || exit 1;

  # Normalize stddev for affine or block affine layers that are followed by a
  # pnorm layer and then a normalize layer.
  $cmd $dir/log/normalize.log \
    nnet-normalize-stddev $dir/final.mdl $dir/final.mdl \
    || exit 1;

  # Compute the probability of the final, combined model with
  # the same subset we used for the previous compute_probs, as the
  # different subsets will lead to different probs.
  $cmd $dir/log/compute_prob_valid.final.log \
    nnet-compute-prob $dir/final.mdl ark:$egs_dir/valid_diagnostic.egs &
  $cmd $dir/log/compute_prob_train.final.log \
    nnet-compute-prob $dir/final.mdl ark:$egs_dir/train_diagnostic.egs &
fi

# Last stage: sum the network's posteriors over a subset of the training
# examples and use the result to re-adjust the priors stored in final.mdl.
if [ $stage -le $(($num_iters+1)) ]; then
  echo "Getting average posterior for purposes of adjusting the priors."
  # Note: this just uses CPUs, using a smallish subset of data.
  rm $dir/post.*.vec 2>/dev/null
  # One job per egs archive; each writes its summed posteriors to post.JOB.vec.
  $cmd JOB=1:$num_jobs_nnet $dir/log/get_post.JOB.log \
    nnet-subset-egs --n=$prior_subset_size ark:$egs_dir/egs.JOB.0.ark ark:- \| \
    nnet-compute-from-egs "nnet-to-raw-nnet $dir/final.mdl -|" ark:- ark:- \| \
    matrix-sum-rows ark:- ark:- \| \
    vector-sum ark:- $dir/post.JOB.vec \
    || exit 1;

  sleep 3;  # make sure there is time for $dir/post.*.vec to appear.

  # Add up the per-job vectors, then discard them.
  $cmd $dir/log/vector_sum.log \
    vector-sum $dir/post.*.vec $dir/post.vec \
    || exit 1;

  rm $dir/post.*.vec;

  echo "Re-adjusting priors based on computed posteriors"
  $cmd $dir/log/adjust_priors.log \
    nnet-adjust-priors $dir/final.mdl $dir/post.vec $dir/final.mdl \
    || exit 1;
fi


# Short pause before the final message (the final compute_prob jobs above were
# started in the background and are not waited for).
sleep 2

echo Done

# Optional cleanup.  The training examples are removed only when they live in
# $dir/egs, i.e. never when the caller pointed --egs-dir somewhere else.
# NOTE: this is the last statement of the script, so its form determines the
# script's exit status; keep the nested "if" rather than a "[ ... ] && cmd".
if $cleanup; then
  echo Cleaning up data
  if [ $egs_dir == "$dir/egs" ]; then
    steps/nnet2/remove_egs.sh $dir/egs
  fi
fi


================================================
FILE: egs/steps/nnet2/train_pnorm_multisplice.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2014  Johns Hopkins University (Author: Daniel Povey).
#           2013  Xiaohui Zhang
#           2013  Guoguo Chen
#           2014  Vimal Manohar
#           2014  Vijayaditya Peddinti
# Apache 2.0.

# train_pnorm_multisplice.sh is a modified version of train_pnorm_simple.sh.
# Like train_pnorm_fast.sh, it uses the `online' preconditioning,
# which is faster (especially on GPUs).  The difference is that the
# learning-rate schedule is simpler, with the learning rate exponentially
# decreasing during training, and no phase where the learning rate is constant.
#
# Also, the final model-combination is done a bit differently: we combine models
# over typically a whole epoch, and because that would be too many iterations to
# easily be able to combine over, we arrange the iterations into groups (20
# groups by default) and average over each group.
#
# [Vimal Manohar - Oct 2014]
# The script now supports realignment during training, which can be done by
# specifying realign_epochs.

# Begin configuration section.
cmd=run.pl
num_epochs=15      # Number of epochs of training;
                   # the number of iterations is worked out from this.
initial_learning_rate=0.04
final_learning_rate=0.004
bias_stddev=0.5
pnorm_input_dim=3000
pnorm_output_dim=300
minibatch_size=128 # by default use a smallish minibatch size for neural net
                   # training; this controls instability which would otherwise
                   # be a problem with multi-threaded update.

samples_per_iter=400000 # each iteration of training, see this many samples
                        # per job.  This option is passed to get_egs.sh
num_jobs_nnet=16   # Number of neural net jobs to run in parallel.  This option
                   # is passed to get_egs.sh.
get_egs_stage=0
online_ivector_dir=


max_models_combine=20 # The "max_models_combine" is the maximum number of models we give
  # to the final 'combine' stage, but these models will themselves be averages of
  # iteration-number ranges.

shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
                # on each iter.  You could set it to 0 or to a large value for complete
                # randomization, but this would both consume memory and cause spikes in
                # disk I/O.  Smaller is easier on disk and memory but less random.  It's
                # not a huge deal though, as samples are anyway randomized right at the start.
                # (the point of this is to get data in different minibatches on different iterations,
                # since in the preconditioning method, 2 samples in the same minibatch can
                # affect each others' gradients.

add_layers_period=2 # by default, add new layers every 2 iterations.
num_hidden_layers=3
stage=-4

io_opts="--max-jobs-run 5" # for jobs with a lot of I/O, limits the number running at one time.   These don't
splice_indexes="layer0/-4:-3:-2:-1:0:1:2:3:4 layer2/-5:-1:3"
# Format : layer<hidden_layer>/<frame_indices>....layer<hidden_layer>/<frame_indices> "
# note: hidden layers which are composed of one or more components,
# so hidden layer indexing is different from component count

randprune=4.0 # speeds up LDA.
alpha=4.0 # relates to preconditioning.
update_period=4 # relates to online preconditioning: says how often we update the subspace.
num_samples_history=2000 # relates to online preconditioning
max_change_per_sample=0.075
precondition_rank_in=20  # relates to online preconditioning
precondition_rank_out=80 # relates to online preconditioning

mix_up=0 # Number of components to mix up to (should be > #tree leaves, if
        # specified.)
num_threads=16
parallel_opts="--num-threads 16 --mem 1G"
  # by default we use 16 threads; this lets the queue know.
  # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.
combine_num_threads=8
combine_parallel_opts="--num-threads 8"  # queue options for the "combine" stage.
cleanup=true
egs_dir=
lda_opts=
lda_dim=
egs_opts=
transform_dir=     # If supplied, overrides alidir
cmvn_opts=  # will be passed to get_lda.sh and get_egs.sh, if supplied.
            # only relevant for "raw" features, not lda.
feat_type=  # Can be used to force "raw" features.
prior_subset_size=10000 # 10k samples per job, for computing priors.  Should be
                        # more than enough.
align_cmd=              # The cmd that is passed to steps/nnet2/align.sh
align_use_gpu=          # Passed to use_gpu in steps/nnet2/align.sh [yes/no]
realign_epochs=         # List of epochs, the beginning of which realignment is done
num_jobs_align=30       # Number of jobs for realignment
# End configuration section.


echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# Print the usage message and exit unless exactly four positional arguments
# (<data> <lang> <ali-dir> <exp-dir>) remain after option parsing.
# The defaults quoted for --num-hidden-layers, --num-jobs-nnet, --io-opts and
# --splice-indexes mirror the values set in the configuration section of this
# script (num_hidden_layers=3, num_jobs_nnet=16, io_opts="--max-jobs-run 5",
# and the two-layer splice_indexes string).
if [ $# != 4 ]; then
  echo "Usage: $0 [opts] <data> <lang> <ali-dir> <exp-dir>"
  echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --num-epochs <#epochs|15>                        # Number of epochs of training"
  echo "  --initial-learning-rate <initial-learning-rate|0.02> # Learning rate at start of training, e.g. 0.02 for small"
  echo "                                                       # data, 0.01 for large data"
  echo "  --final-learning-rate  <final-learning-rate|0.004>   # Learning rate at end of training, e.g. 0.004 for small"
  echo "                                                   # data, 0.001 for large data"
  echo "  --num-hidden-layers <#hidden-layers|3>           # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs"
  echo "  --add-layers-period <#iters|2>                   # Number of iterations between adding hidden layers"
  echo "  --mix-up <#pseudo-gaussians|0>                   # Can be used to have multiple targets in final output layer,"
  echo "                                                   # per context-dependent state.  Try a number several times #states."
  echo "  --num-jobs-nnet <num-jobs|16>                    # Number of parallel jobs to use for main neural net"
  echo "                                                   # training (will affect results as well as speed; try 8, 16)"
  echo "                                                   # Note: if you increase this, you may want to also increase"
  echo "                                                   # the learning rate."
  echo "  --num-threads <num-threads|16>                   # Number of parallel threads per job (will affect results"
  echo "                                                   # as well as speed; may interact with batch size; if you increase"
  echo "                                                   # this, you may want to decrease the batch size."
  echo "  --parallel-opts <opts|\"--num-threads 16 --mem 1G\">      # extra options to pass to e.g. queue.pl for processes that"
  echo "                                                   # use multiple threads... "
  echo "  --io-opts <opts|\"--max-jobs-run 5\">                       # Options given to e.g. queue.pl for jobs that do a lot of I/O."
  echo "  --minibatch-size <minibatch-size|128>            # Size of minibatch to process (note: product with --num-threads"
  echo "                                                   # should not get too large, e.g. >2k)."
  echo "  --samples-per-iter <#samples|400000>             # Number of samples of data to process per iteration, per"
  echo "                                                   # process."
  echo "  --splice-indexes <string|\"layer0/-4:-3:-2:-1:0:1:2:3:4 layer2/-5:-1:3\"> "
  echo "                                                   # Frame indices used for each splice layer."
  echo "                                                   # Format : layer<hidden_layer_index>/<frame_indices>....layer<hidden_layer>/<frame_indices> "
  echo "                                                   # (note: we splice processed, typically 40-dimensional frames"
  echo "  --lda-dim <dim|''>                               # Dimension to reduce spliced features to with LDA"
  echo "  --realign-epochs <list-of-epochs|''>             # A list of space-separated epoch indices the beginning of which"
  echo "                                                   # realignment is to be done"
  echo "  --align-cmd (utils/run.pl|utils/queue.pl <queue opts>) # passed to align.sh"
  echo "  --align-use-gpu (yes/no)                         # specify is gpu is to be used for realignment"
  echo "  --num-jobs-align <#njobs|30>                     # Number of jobs to perform realignment"
  echo "  --stage <stage|-4>                               # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."


  exit 1;
fi

data=$1
lang=$2
alidir=$3
dir=$4

# Realignment needs both an alignment command and an explicit GPU choice.
if [ -n "$realign_epochs" ]; then
  if [ -z "$align_cmd" ]; then
    echo "$0: realign_epochs specified but align_cmd not specified" && exit 1
  fi
  if [ -z "$align_use_gpu" ]; then
    echo "$0: realign_epochs specified but align_use_gpu not specified" && exit 1
  fi
fi

# Make sure the required input files are present before doing any work.
for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree; do
  if [ ! -f $f ]; then
    echo "$0: no such file $f" && exit 1;
  fi
done


# Set some variables.
# num_leaves is the "num-pdfs" value reported by tree-info for the alignment
# directory's tree; it must be present and non-zero.
num_leaves=`tree-info $alidir/tree 2>/dev/null | grep num-pdfs | awk '{print $2}'` || exit 1
[ -z $num_leaves ] && echo "\$num_leaves is unset" && exit 1
[ "$num_leaves" -eq "0" ] && echo "\$num_leaves is 0" && exit 1

nj=`cat $alidir/num_jobs` || exit 1;  # number of jobs in alignment dir...
# in this dir we'll have just one job.
sdata=$data/split$nj
utils/split_data.sh $data $nj

# Create the experiment directory and record the job count and tree in it.
mkdir -p $dir/log
echo $nj > $dir/num_jobs
cp $alidir/tree $dir

# The phone set of the lang directory must agree with the one used for the
# alignments; keep a copy of phones.txt in the experiment directory.
utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

# process the splice_inds string, to get a layer-wise context string
# to be processed by the nnet-components
# this would be mainly used by SpliceComponent|SpliceMaxComponent
# The "contexts" mode of the helper writes shell variable assignments to
# $dir/vars, which are then echoed and eval'ed into this shell.
python steps/nnet2/make_multisplice_configs.py contexts --splice-indexes "$splice_indexes" $dir || exit -1;
context_string=$(cat $dir/vars) || exit -1
echo $context_string
# NOTE(review): eval executes whatever $dir/vars contains; it is assumed to be
# exactly what make_multisplice_configs.py wrote just above.
eval $context_string || exit -1; #
  # initializes variables used by get_lda.sh and get_egs.sh
  # get_lda.sh : first_left_context, first_right_context,
  # get_egs.sh : nnet_left_context & nnet_right_context

# Options shared by get_lda.sh and get_egs.sh; optional ones are only added
# when the corresponding variable is non-empty.
extra_opts=()
[ ! -z "$cmvn_opts" ] && extra_opts+=(--cmvn-opts "$cmvn_opts")
[ ! -z "$feat_type" ] && extra_opts+=(--feat-type $feat_type)
[ ! -z "$online_ivector_dir" ] && extra_opts+=(--online-ivector-dir $online_ivector_dir)
# The transform directory defaults to the alignment directory.
[ -z "$transform_dir" ] && transform_dir=$alidir
extra_opts+=(--transform-dir $transform_dir)

# Stage -4: estimate the LDA-like input transform.
if [ $stage -le -4 ]; then
  echo "$0: calling get_lda.sh"
  steps/nnet2/get_lda.sh $lda_opts "${extra_opts[@]}" --left-context $first_left_context --right-context $first_right_context --cmd "$cmd" $data $lang $alidir $dir || exit 1;
fi
# these files will have been written by get_lda.sh
# NOTE(review): the --lda-dim option value is not passed in the get_lda.sh
# call above and is overwritten here by the value read back from $dir/lda_dim;
# use --lda-opts to influence the dimension -- TODO confirm intended.
feat_dim=$(cat $dir/feat_dim) || exit 1;
ivector_dim=$(cat $dir/ivector_dim) || exit 1;
lda_dim=$(cat $dir/lda_dim) || exit 1;

# Stage -3: dump training examples with get_egs.sh, unless a pre-built egs
# directory was supplied via --egs-dir.
if [ $stage -le -3 ] && [ -z "$egs_dir" ]; then

  # get_egs.sh additionally needs the network's left/right context.
  extra_opts+=(--left-context $nnet_left_context )
  extra_opts+=(--right-context $nnet_right_context )
  echo "$0: calling get_egs.sh"
  # $egs_opts is passed once, after the other options (it used to be given
  # twice on the same command line).  Keeping the trailing occurrence means
  # user-supplied egs options still come last among the options.
  steps/nnet2/get_egs.sh "${extra_opts[@]}" \
      --samples-per-iter $samples_per_iter \
      --num-jobs-nnet $num_jobs_nnet --stage $get_egs_stage \
      --cmd "$cmd" $egs_opts \
      $data $lang $alidir $dir || exit 1;
fi

# Without --egs-dir, the examples are expected in $dir/egs.
if [ -z "$egs_dir" ]; then
  egs_dir=$dir/egs
fi

# Read back the per-epoch iteration count stored in the egs directory.
iters_per_epoch=$(cat $egs_dir/iters_per_epoch)  || exit 1;
# The egs directory dictates the number of parallel jobs: warn when it differs
# from the requested --num-jobs-nnet, then adopt the stored value.
[ $num_jobs_nnet -eq $(cat $egs_dir/num_jobs_nnet) ] || \
  echo "$0: Warning: using --num-jobs-nnet=$(cat $egs_dir/num_jobs_nnet) from $egs_dir"
num_jobs_nnet=$(cat $egs_dir/num_jobs_nnet) || exit 1;


# At least one hidden layer is required.
[ $num_hidden_layers -ge 1 ] || {
  echo "Invalid num-hidden-layers $num_hidden_layers"
  exit 1
}

# Stage -2: write the network config files and initialize the model 0.mdl.
if [ $stage -le -2 ]; then
  echo "$0: initializing neural net";
  lda_mat=$dir/lda.mat
  # Network input dimension: base features plus the iVector (if any).
  tot_input_dim=$[$feat_dim+$ivector_dim]

  # Settings of the online-preconditioning update, handed as one string to the
  # config generator.
  online_preconditioning_opts="alpha=$alpha num-samples-history=$num_samples_history update-period=$update_period rank-in=$precondition_rank_in rank-out=$precondition_rank_out max-change-per-sample=$max_change_per_sample"

  # create the config files for nnet initialization
  # (the loop further down reads $dir/nnet.config and $dir/hidden_<n>.config)
  python steps/nnet2/make_multisplice_configs.py  \
    --splice-indexes "$splice_indexes"  \
    --total-input-dim $tot_input_dim  \
    --ivector-dim $ivector_dim  \
    --lda-mat "$lda_mat"  \
    --lda-dim $lda_dim  \
    --pnorm-input-dim $pnorm_input_dim  \
    --pnorm-output-dim  $pnorm_output_dim \
    --online-preconditioning-opts "$online_preconditioning_opts"  \
    --initial-learning-rate $initial_learning_rate  \
    --bias-stddev  $bias_stddev  \
    --num-hidden-layers $num_hidden_layers \
    --num-targets  $num_leaves  \
    configs  $dir || exit -1;

  # Build the initial acoustic model from the tree, the topology and the raw
  # network initialized from nnet.config.
  $cmd $dir/log/nnet_init.log \
    nnet-am-init $alidir/tree $lang/topo "nnet-init $dir/nnet.config -|" \
    $dir/0.mdl || exit 1;
fi

cur_num_hidden_layer=1  # counts the number of hidden layers in the network
                        # this is different from the number of components
                        # in the network, each hidden layer is composed of
                        # affine comp. + pnorm comp. + normalization comp.
                        # optionally a splice component is also added
                        # (it also selects which hidden_<n>.config is used
                        # when the training loop inserts the next layer)


# Stage -1: estimate transition probabilities (and priors) from the
# alignments, updating 0.mdl in place.
if [ $stage -le -1 ]; then
  echo "Training transition probabilities and setting priors"
  $cmd $dir/log/train_trans.log \
    nnet-train-transitions $dir/0.mdl "ark:gunzip -c $alidir/ali.*.gz|" $dir/0.mdl \
    || exit 1;
fi

# Total number of training iterations.
num_iters=$(($num_epochs * $iters_per_epoch));

echo "$0: Will train for $num_epochs epochs = $num_iters iterations"

# Mixing up is scheduled halfway between the iteration at which all hidden
# layers have been added and the end of training.
finish_add_layers_iter=$(($num_hidden_layers * $add_layers_period))
mix_up_iter=$(( ($num_iters + $finish_add_layers_iter) / 2 ))

# Choose the trainer binary suffix and its threading option.  With exactly one
# thread the "-simple" trainer is used, which lets us run GPU code.
if [ $num_threads -eq 1 ]; then
  parallel_train_opts=
  parallel_suffix="-simple"
  cuda-compiled || {
    echo "$0: WARNING: you are running with one thread but you have not compiled"
    echo "   for CUDA.  You may be running a setup optimized for GPUs.  If you have"
    echo "   GPUs and have nvcc installed, go to src/ and do ./configure; make"
  }
else
  parallel_train_opts="--num-threads=$num_threads"
  parallel_suffix="-parallel"
fi

# Number of models handed to the final nnet-combine-fast invocation:
#   min(max(max_models_combine, iters_per_epoch), 2/3 * iters_after_mixup)
num_models_combine=$max_models_combine
[ $num_models_combine -lt $iters_per_epoch ] && num_models_combine=$iters_per_epoch
iters_after_mixup_23=$(( (($num_iters - $mix_up_iter - 1) * 2) / 3 ))
[ $num_models_combine -gt $iters_after_mixup_23 ] && num_models_combine=$iters_after_mixup_23
first_model_combine=$(($num_iters - $num_models_combine + 1))

# Map "iteration index -> epoch" for every epoch at whose start we realign.
for realign_epoch in $realign_epochs; do
  realign_iter=$(perl -e 'print int($ARGV[0] * $ARGV[1]);' $realign_epoch $iters_per_epoch)
  realign_this_iter[$realign_iter]=$realign_epoch
done

# Iteration counter, and the egs directory currently in use.
x=0
cur_egs_dir=$egs_dir

# Main training loop: one pass per iteration x in [0, num_iters).
# Each pass optionally realigns and relabels the egs, starts background
# diagnostics, trains $num_jobs_nnet models in parallel from $x.mdl, merges
# them into $[$x+1].mdl (by averaging, or by picking the best one), optionally
# mixes up, and removes models that are no longer needed.
while [ $x -lt $num_iters ]; do

  # The egs-directory bookkeeping is done even for iterations skipped via
  # --stage, so that a resumed run uses the right (relabeled) egs directory.
  if [ ! -z "${realign_this_iter[$x]}" ]; then
    prev_egs_dir=$cur_egs_dir
    cur_egs_dir=$dir/egs_${realign_this_iter[$x]}
  fi

  if [ $x -ge 0 ] && [ $stage -le $x ]; then
    if [ ! -z "${realign_this_iter[$x]}" ]; then
      epoch=${realign_this_iter[$x]}

      echo "Getting average posterior for purposes of adjusting the priors."
      # Note: this just uses CPUs, using a smallish subset of data.
      rm $dir/post.$x.*.vec 2>/dev/null
      $cmd JOB=1:$num_jobs_nnet $dir/log/get_post.$x.JOB.log \
        nnet-subset-egs --n=$prior_subset_size ark:$prev_egs_dir/egs.JOB.0.ark ark:- \| \
        nnet-compute-from-egs "nnet-to-raw-nnet $dir/$x.mdl -|" ark:- ark:- \| \
        matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1;

      sleep 3;  # make sure there is time for $dir/post.$x.*.vec to appear.

      $cmd $dir/log/vector_sum.$x.log \
        vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1;

      rm $dir/post.$x.*.vec;

      echo "Re-adjusting priors based on computed posteriors"
      $cmd $dir/log/adjust_priors.$x.log \
        nnet-adjust-priors $dir/$x.mdl $dir/post.$x.vec $dir/$x.mdl || exit 1;

      sleep 2

      # Realign the data with the current model, then relabel the previous
      # egs with the new alignments to produce the new egs directory.
      steps/nnet2/align.sh --nj $num_jobs_align --cmd "$align_cmd" --use-gpu $align_use_gpu \
        --transform-dir "$transform_dir" --online-ivector-dir "$online_ivector_dir" \
        --iter $x $data $lang $dir $dir/ali_$epoch || exit 1

      steps/nnet2/relabel_egs.sh --cmd "$cmd" --iter $x $dir/ali_$epoch \
        $prev_egs_dir $cur_egs_dir || exit 1

      # Only remove egs directories that live under $dir (named egs*).  This
      # is a glob prefix match with $dir taken literally; the former
      # "=~ $dir/egs*" was an unanchored regular expression, which also
      # matched e.g. ".../eg_foo" and mis-handled regex metacharacters in $dir.
      if $cleanup && [[ $prev_egs_dir == "$dir"/egs* ]]; then
        steps/nnet2/remove_egs.sh $prev_egs_dir
      fi
    fi

    # Set off jobs doing some diagnostics, in the background.
    # They read the diagnostic egs from $cur_egs_dir.
    $cmd $dir/log/compute_prob_valid.$x.log \
      nnet-compute-prob $dir/$x.mdl ark:$cur_egs_dir/valid_diagnostic.egs &
    $cmd $dir/log/compute_prob_train.$x.log \
      nnet-compute-prob $dir/$x.mdl ark:$cur_egs_dir/train_diagnostic.egs &
    # Progress relative to the previous model is skipped right after a mix-up.
    if [ $x -gt 0 ] && [ ! -f $dir/log/mix_up.$[$x-1].log ]; then
      $cmd $dir/log/progress.$x.log \
        nnet-show-progress --use-gpu=no $dir/$[$x-1].mdl $dir/$x.mdl \
        ark:$cur_egs_dir/train_diagnostic.egs '&&' \
        nnet-am-info $dir/$x.mdl &
    fi

    echo "Training neural net (pass $x)"

    # A new hidden layer is inserted on iterations 1, 1+add_layers_period, ...
    # until the requested number of hidden layers has been reached.
    if [ $x -gt 0 ] && \
      [ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \
      [ $[($x-1) % $add_layers_period] -eq 0 ]; then
      # Derive the index of the layer config from the iteration number rather
      # than from a counter that is only advanced on executed iterations;
      # otherwise a run resumed with --stage in the middle of layer addition
      # would pick the wrong hidden_<n>.config.  For an uninterrupted run the
      # value is the same as the counter had.
      cur_num_hidden_layer=$[($x-1)/$add_layers_period+1]
      mdl="nnet-init --srand=$x $dir/hidden_${cur_num_hidden_layer}.config - | nnet-insert $dir/$x.mdl - - |"
      cur_num_hidden_layer=$((cur_num_hidden_layer + 1))
    else
      mdl=$dir/$x.mdl
    fi
    if [ $x -eq 0 ] || [ "$mdl" != "$dir/$x.mdl" ]; then
      # on iteration zero or when we just added a layer, use a smaller minibatch
      # size and just one job: the model-averaging doesn't seem to be helpful
      # when the model is changing too fast (i.e. it worsens the objective
      # function), and the smaller minibatch size will help to keep
      # the update stable.
      this_minibatch_size=$[$minibatch_size/2];
      do_average=false
    else
      this_minibatch_size=$minibatch_size
      do_average=true
    fi

    # Train one model per job; job JOB reads archive
    # egs.JOB.<x mod iters_per_epoch>.ark and writes $[$x+1].JOB.mdl.
    $cmd $parallel_opts JOB=1:$num_jobs_nnet $dir/log/train.$x.JOB.log \
      nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x \
      ark:$cur_egs_dir/egs.JOB.$[$x%$iters_per_epoch].ark ark:- \| \
       nnet-train$parallel_suffix $parallel_train_opts \
        --minibatch-size=$this_minibatch_size --srand=$x "$mdl" \
        ark:- $dir/$[$x+1].JOB.mdl \
      || exit 1;

    # Collect the per-job models written above.
    nnets_list=
    for n in `seq 1 $num_jobs_nnet`; do
      nnets_list="$nnets_list $dir/$[$x+1].$n.mdl"
    done

    # Learning rate for the next model: decays geometrically from the initial
    # to the final learning rate over num_iters iterations.
    learning_rate=`perl -e '($x,$n,$i,$f)=@ARGV; print ($x >= $n ? $f : $i*exp($x*log($f/$i)/$n));' $[$x+1] $num_iters $initial_learning_rate $final_learning_rate`;

    if $do_average; then
      # average the output of the different jobs.
      $cmd $dir/log/average.$x.log \
        nnet-am-average $nnets_list - \| \
        nnet-am-copy --learning-rate=$learning_rate - $dir/$[$x+1].mdl || exit 1;
    else
      # choose the best from the different jobs.
      # (the job whose training log reports the highest log-prob-per-frame)
      n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) {
          $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn";
          undef $logprob; while (<F>) { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } }
          close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob;
          $best_n=$n; } } print "$best_n\n"; ' $num_jobs_nnet $dir/log/train.$x.%d.log) || exit 1;
      [ -z "$n" ] && echo "Error getting best model" && exit 1;
      $cmd $dir/log/select.$x.log \
        nnet-am-copy --learning-rate=$learning_rate $dir/$[$x+1].$n.mdl $dir/$[$x+1].mdl || exit 1;
    fi

    if [ "$mix_up" -gt 0 ] && [ $x -eq $mix_up_iter ]; then
      # mix up.
      echo Mixing up from $num_leaves to $mix_up components
      $cmd $dir/log/mix_up.$x.log \
        nnet-am-mixup --min-count=10 --num-mixtures=$mix_up \
        $dir/$[$x+1].mdl $dir/$[$x+1].mdl || exit 1;
    fi
    # The per-job models are no longer needed once the merged model exists.
    rm $nnets_list
    [ ! -f $dir/$[$x+1].mdl ] && exit 1;
    # Drop the model from two iterations back, except every 100th model and
    # those from first_model_combine onwards (needed for the final combination).
    if [ -f $dir/$[$x-1].mdl ] && $cleanup && \
       [ $[($x-1)%100] -ne 0  ] && [ $[$x-1] -lt $first_model_combine ]; then
      rm $dir/$[$x-1].mdl
    fi
  fi
  x=$[$x+1]
done


# Stage num_iters: combine the last models into final.mdl.
if [ $stage -le $num_iters ]; then
  echo "Doing final combination to produce final.mdl"

  # Now do combination.
  nnets_list=()
  # the if..else..fi statement below sets 'nnets_list'.
  if [ $max_models_combine -lt $num_models_combine ]; then
    # The number of models to combine is too large, e.g. > 20.  In this case,
    # each argument to nnet-combine-fast will be an average of multiple models.
    cur_offset=0 # current offset from first_model_combine.
    for n in $(seq $max_models_combine); do
      # Models at offsets [cur_offset, next_offset) form the n-th group.
      next_offset=$[($n*$num_models_combine)/$max_models_combine]
      sub_list=""
      for o in $(seq $cur_offset $[$next_offset-1]); do
        iter=$[$first_model_combine+$o]
        mdl=$dir/$iter.mdl
        [ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1;
        sub_list="$sub_list $mdl"
      done
      # The group is passed as a piped command that averages its members.
      nnets_list[$[$n-1]]="nnet-am-average $sub_list - |"
      cur_offset=$next_offset
    done
  else
    # Few enough models: pass each of them directly.
    # NOTE(review): the plain assignment below sets element 0 of the array to
    # the empty string; it is overwritten as long as num_models_combine >= 1.
    nnets_list=
    for n in $(seq 0 $[num_models_combine-1]); do
      iter=$[$first_model_combine+$n]
      mdl=$dir/$iter.mdl
      [ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1;
      nnets_list[$n]=$mdl
    done
  fi


  # Below, use --use-gpu=no to disable nnet-combine-fast from using a GPU, as
  # if there are many models it can give out-of-memory error; set num-threads to 8
  # to speed it up (this isn't ideal...)
  # num_egs is taken from the last field of the last line that nnet-copy-egs
  # prints; the minibatch size splits the combine egs evenly over the threads
  # (rounded up) and is capped at 512.
  num_egs=`nnet-copy-egs ark:$cur_egs_dir/combine.egs ark:/dev/null 2>&1 | tail -n 1 | awk '{print $NF}'`
  mb=$[($num_egs+$combine_num_threads-1)/$combine_num_threads]
  [ $mb -gt 512 ] && mb=512
  # Setting --initial-model to a large value makes it initialize the combination
  # with the average of all the models.  It's important not to start with a
  # single model, or, due to the invariance to scaling that these nonlinearities
  # give us, we get zero diagonal entries in the fisher matrix that
  # nnet-combine-fast uses for scaling, which after flooring and inversion, has
  # the effect that the initial model chosen gets much higher learning rates
  # than the others.  This prevents the optimization from working well.
  $cmd $combine_parallel_opts $dir/log/combine.log \
    nnet-combine-fast --initial-model=100000 --num-lbfgs-iters=40 --use-gpu=no \
      --num-threads=$combine_num_threads \
      --verbose=3 --minibatch-size=$mb "${nnets_list[@]}" ark:$cur_egs_dir/combine.egs \
      $dir/final.mdl || exit 1;

  # Normalize stddev for affine or block affine layers that are followed by a
  # pnorm layer and then a normalize layer.
  $cmd $dir/log/normalize.log \
    nnet-normalize-stddev $dir/final.mdl $dir/final.mdl || exit 1;

  # Compute the probability of the final, combined model with
  # the same subset we used for the previous compute_probs, as the
  # different subsets will lead to different probs.
  # (both jobs run in the background)
  $cmd $dir/log/compute_prob_valid.final.log \
    nnet-compute-prob $dir/final.mdl ark:$cur_egs_dir/valid_diagnostic.egs &
  $cmd $dir/log/compute_prob_train.final.log \
    nnet-compute-prob $dir/final.mdl ark:$cur_egs_dir/train_diagnostic.egs &
fi

# Stage num_iters+1: re-estimate the priors of final.mdl from average
# posteriors.  $x still holds the value left by the training loop and is only
# used here to name the intermediate post.* files and logs.
if [ $stage -le $[$num_iters+1] ]; then
  echo "Getting average posterior for purposes of adjusting the priors."
  # Note: this just uses CPUs, using a smallish subset of data.
  rm $dir/post.$x.*.vec 2>/dev/null
  $cmd JOB=1:$num_jobs_nnet $dir/log/get_post.$x.JOB.log \
    nnet-subset-egs --n=$prior_subset_size ark:$cur_egs_dir/egs.JOB.0.ark ark:- \| \
    nnet-compute-from-egs "nnet-to-raw-nnet $dir/final.mdl -|" ark:- ark:- \| \
    matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1;

  sleep 3;  # make sure there is time for $dir/post.$x.*.vec to appear.

  # Sum the per-job posterior vectors into a single vector.
  $cmd $dir/log/vector_sum.$x.log \
    vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1;

  rm $dir/post.$x.*.vec;

  echo "Re-adjusting priors based on computed posteriors"
  $cmd $dir/log/adjust_priors.final.log \
    nnet-adjust-priors $dir/final.mdl $dir/post.$x.vec $dir/final.mdl || exit 1;
fi


# Training only counts as successful if final.mdl was produced.
if [ ! -f $dir/final.mdl ]; then
  echo "$0: $dir/final.mdl does not exist."
  # we don't want to clean up if the training didn't succeed.
  exit 1;
fi

sleep 2

echo Done

# Optional cleanup of the egs and of most intermediate models.
if $cleanup; then
  echo Cleaning up data
  # Only remove an egs directory that lives under $dir (named egs*).  This is
  # a glob prefix match with $dir taken literally; the former "=~ $dir/egs*"
  # was an unanchored regular expression, which could also match unrelated
  # paths (e.g. ".../eg_foo") and mis-handled regex metacharacters in $dir.
  if [[ $cur_egs_dir == "$dir"/egs* ]]; then
    steps/nnet2/remove_egs.sh $cur_egs_dir
  fi

  echo Removing most of the models
  for x in `seq 0 $num_iters`; do
    if [ $[$x%100] -ne 0 ] && [ $x -ne $num_iters ] && [ -f $dir/$x.mdl ]; then
      # delete all but every 100th model and the last one ($num_iters.mdl);
      # the models that were combined into final.mdl are removed as well.
      rm $dir/$x.mdl
    fi
  done

fi


================================================
FILE: egs/steps/nnet2/train_pnorm_multisplice2.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2014  Johns Hopkins University (Author: Daniel Povey).
#           2013  Xiaohui Zhang
#           2013  Guoguo Chen
#           2014  Vimal Manohar
#           2014  Vijayaditya Peddinti
# Apache 2.0.

# train_pnorm_multisplice2.sh is a modified version of
# train_pnorm_simple2.sh. This script creates neural net architectures with
# multiple levels of splicing.  You can also compare it with
# train_pnorm_multisplice.sh; it differs from that script by using the newer,
# more compact multi-frame egs format that is dumped by get_egs2.sh.


# Begin configuration section.
cmd=run.pl
num_epochs=15      # Number of epochs of training;
                   # the number of iterations is worked out from this.
initial_learning_rate=0.04
final_learning_rate=0.004
bias_stddev=0.5
pnorm_input_dim=3000
pnorm_output_dim=300
minibatch_size=128 # by default use a smallish minibatch size for neural net
                   # training; this controls instability which would otherwise
                   # be a problem with multi-threaded update.

samples_per_iter=400000 # each iteration of training, see this many samples
                        # per job.  This option is passed to get_egs2.sh
num_jobs_nnet=4    # Number of neural net jobs to run in parallel.
                   # (Not among the options given to get_egs2.sh below.)
prior_subset_size=10000 # 10k samples per job, for computing priors.  Should be
                        # more than enough.
num_jobs_compute_prior=10 # these are single-threaded, run on CPU.
get_egs_stage=0    # passed to get_egs2.sh as its --stage
online_ivector_dir=   # if set, forwarded as --online-ivector-dir

max_models_combine=20 # The "max_models_combine" is the maximum number of models we give
  # to the final 'combine' stage, but these models will themselves be averages of
  # iteration-number ranges.

shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
                # on each iter.  You could set it to 0 or to a large value for complete
                # randomization, but this would both consume memory and cause spikes in
                # disk I/O.  Smaller is easier on disk and memory but less random.  It's
                # not a huge deal though, as samples are anyway randomized right at the start.
                # (the point of this is to get data in different minibatches on different iterations,
                # since in the preconditioning method, 2 samples in the same minibatch can
                # affect each others' gradients.

add_layers_period=2 # by default, add new layers every 2 iterations.
num_hidden_layers=3
stage=-4

splice_indexes="layer0/-4:-3:-2:-1:0:1:2:3:4 layer2/-5:-1:3"
# Format : layer<hidden_layer>/<frame_indices>....layer<hidden_layer>/<frame_indices> "
# note: hidden layers which are composed of one or more components,
# so hidden layer indexing is different from component count


io_opts="--max-jobs-run 5" # Options for jobs with a lot of I/O; limits the number running at one time.  Passed to get_egs2.sh as --io-opts.
randprune=4.0 # speeds up LDA.
alpha=4.0 # relates to preconditioning.
update_period=4 # relates to online preconditioning: says how often we update the subspace.
num_samples_history=2000 # relates to online preconditioning
max_change_per_sample=0.075
precondition_rank_in=20  # relates to online preconditioning
precondition_rank_out=80 # relates to online preconditioning

mix_up=0 # Number of components to mix up to (should be > #tree leaves, if
        # specified.)
num_threads=16
parallel_opts="--num-threads 16 --mem 1G"
  # by default we use 16 threads; this lets the queue know.
  # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.
combine_num_threads=8
combine_parallel_opts="--num-threads 8"  # queue options for the "combine" stage.
cleanup=true
egs_dir=
lda_opts=
lda_dim=
egs_opts=
transform_dir=     # If supplied, overrides alidir
cmvn_opts=  # will be passed to get_lda.sh and get_egs.sh, if supplied.
            # only relevant for "raw" features, not lda.
feat_type=  # Can be used to force "raw" features.
align_cmd=              # The cmd that is passed to steps/nnet2/align.sh
align_use_gpu=          # Passed to use_gpu in steps/nnet2/align.sh [yes/no]
realign_epochs=         # List of epochs, the beginning of which realignment is done
num_jobs_align=30       # Number of jobs for realignment
# End configuration section.


echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# Print the usage message and exit unless exactly four positional arguments
# (<data> <lang> <ali-dir> <exp-dir>) remain after option parsing.
# The defaults quoted below mirror the configuration section of this script
# (initial_learning_rate=0.04, num_hidden_layers=3, num_jobs_nnet=4,
# io_opts="--max-jobs-run 5", and the two-layer splice_indexes string).
if [ $# != 4 ]; then
  echo "Usage: $0 [opts] <data> <lang> <ali-dir> <exp-dir>"
  echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --num-epochs <#epochs|15>                        # Number of epochs of training"
  echo "  --initial-learning-rate <initial-learning-rate|0.04> # Learning rate at start of training, e.g. 0.02 for small"
  echo "                                                       # data, 0.01 for large data"
  echo "  --final-learning-rate  <final-learning-rate|0.004>   # Learning rate at end of training, e.g. 0.004 for small"
  echo "                                                   # data, 0.001 for large data"
  echo "  --num-hidden-layers <#hidden-layers|3>           # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs"
  echo "  --add-layers-period <#iters|2>                   # Number of iterations between adding hidden layers"
  echo "  --mix-up <#pseudo-gaussians|0>                   # Can be used to have multiple targets in final output layer,"
  echo "                                                   # per context-dependent state.  Try a number several times #states."
  echo "  --num-jobs-nnet <num-jobs|4>                     # Number of parallel jobs to use for main neural net"
  echo "                                                   # training (will affect results as well as speed; try 8, 16)"
  echo "                                                   # Note: if you increase this, you may want to also increase"
  echo "                                                   # the learning rate."
  echo "  --num-threads <num-threads|16>                   # Number of parallel threads per job (will affect results"
  echo "                                                   # as well as speed; may interact with batch size; if you increase"
  echo "                                                   # this, you may want to decrease the batch size."
  echo "  --parallel-opts <opts|\"--num-threads 16 --mem 1G\">      # extra options to pass to e.g. queue.pl for processes that"
  echo "                                                   # use multiple threads... "
  echo "  --io-opts <opts|\"--max-jobs-run 5\">                       # Options given to e.g. queue.pl for jobs that do a lot of I/O."
  echo "  --minibatch-size <minibatch-size|128>            # Size of minibatch to process (note: product with --num-threads"
  echo "                                                   # should not get too large, e.g. >2k)."
  echo "  --samples-per-iter <#samples|400000>             # Number of samples of data to process per iteration, per"
  echo "                                                   # process."
  echo "  --splice-indexes <string|\"layer0/-4:-3:-2:-1:0:1:2:3:4 layer2/-5:-1:3\"> "
  echo "                                                   # Frame indices used for each splice layer."
  echo "                                                   # Format : layer<hidden_layer_index>/<frame_indices>....layer<hidden_layer>/<frame_indices> "
  echo "                                                   # (note: we splice processed, typically 40-dimensional frames"
  echo "  --lda-dim <dim|''>                               # Dimension to reduce spliced features to with LDA"
  echo "  --realign-epochs <list-of-epochs|''>             # A list of space-separated epoch indices the beginning of which"
  echo "                                                   # realignment is to be done"
  echo "  --align-cmd (utils/run.pl|utils/queue.pl <queue opts>) # passed to align.sh"
  echo "  --align-use-gpu (yes/no)                         # specify is gpu is to be used for realignment"
  echo "  --num-jobs-align <#njobs|30>                     # Number of jobs to perform realignment"
  echo "  --stage <stage|-4>                               # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."


  exit 1;
fi

data=$1
lang=$2
alidir=$3
dir=$4

# Realignment needs both an alignment command and an explicit GPU choice.
if [ -n "$realign_epochs" ]; then
  if [ -z "$align_cmd" ]; then
    echo "$0: realign_epochs specified but align_cmd not specified" && exit 1
  fi
  if [ -z "$align_use_gpu" ]; then
    echo "$0: realign_epochs specified but align_use_gpu not specified" && exit 1
  fi
fi

# Make sure the required input files are present before doing any work.
for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree; do
  if [ ! -f $f ]; then
    echo "$0: no such file $f" && exit 1;
  fi
done


# Set some variables.
# num_leaves is the "num-pdfs" value reported by tree-info for the alignment
# directory's tree; it must be present and non-zero.
num_leaves=`tree-info $alidir/tree 2>/dev/null | grep num-pdfs | awk '{print $2}'` || exit 1
[ -z $num_leaves ] && echo "\$num_leaves is unset" && exit 1
[ "$num_leaves" -eq "0" ] && echo "\$num_leaves is 0" && exit 1

nj=`cat $alidir/num_jobs` || exit 1;  # number of jobs in alignment dir...
# in this dir we'll have just one job.
sdata=$data/split$nj
utils/split_data.sh $data $nj

# Create the experiment directory and record the job count and tree in it.
mkdir -p $dir/log
echo $nj > $dir/num_jobs
cp $alidir/tree $dir

# The phone set of the lang directory must agree with the one used for the
# alignments; keep a copy of phones.txt in the experiment directory.
utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;
# process the splice_inds string, to get a layer-wise context string
# to be processed by the nnet-components
# this would be mainly used by SpliceComponent|SpliceMaxComponent
# The "contexts" mode of the helper writes shell variable assignments to
# $dir/vars, which are then echoed and eval'ed into this shell.
python steps/nnet2/make_multisplice_configs.py contexts --splice-indexes "$splice_indexes" $dir || exit -1;
context_string=$(cat $dir/vars) || exit -1
echo $context_string
# NOTE(review): eval executes whatever $dir/vars contains; it is assumed to be
# exactly what make_multisplice_configs.py wrote just above.
eval $context_string || exit -1; #
  # initializes variables used by get_lda.sh and get_egs2.sh
  # get_lda.sh  : first_left_context, first_right_context,
  # get_egs2.sh : nnet_left_context & nnet_right_context

# Options shared by get_lda.sh and get_egs2.sh; optional ones are only added
# when the corresponding variable is non-empty.
extra_opts=()
[ ! -z "$cmvn_opts" ] && extra_opts+=(--cmvn-opts "$cmvn_opts")
[ ! -z "$feat_type" ] && extra_opts+=(--feat-type $feat_type)
[ ! -z "$online_ivector_dir" ] && extra_opts+=(--online-ivector-dir $online_ivector_dir)
# The transform directory defaults to the alignment directory.
[ -z "$transform_dir" ] && transform_dir=$alidir
extra_opts+=(--transform-dir $transform_dir)

# Stage -4: estimate the LDA-like input transform.
if [ $stage -le -4 ]; then
  echo "$0: calling get_lda.sh"
  steps/nnet2/get_lda.sh $lda_opts "${extra_opts[@]}" --left-context $first_left_context --right-context $first_right_context --cmd "$cmd" $data $lang $alidir $dir || exit 1;
fi
# these files will have been written by get_lda.sh
# NOTE(review): the --lda-dim option value is not passed in the get_lda.sh
# call above and is overwritten here by the value read back from $dir/lda_dim;
# use --lda-opts to influence the dimension -- TODO confirm intended.
feat_dim=$(cat $dir/feat_dim) || exit 1;
ivector_dim=$(cat $dir/ivector_dim) || exit 1;
lda_dim=$(cat $dir/lda_dim) || exit 1;

if [ $stage -le -3 ] && [ -z "$egs_dir" ]; then

  extra_opts+=(--left-context $nnet_left_context )
  extra_opts+=(--right-context $nnet_right_context )
  echo "$0: calling get_egs2.sh"
  steps/nnet2/get_egs2.sh $egs_opts "${extra_opts[@]}" \
      --samples-per-iter $samples_per_iter --stage $get_egs_stage \
      --io-opts "$io_opts" \
      --cmd "$cmd" $egs_opts \
      $data $alidir $dir/egs || exit 1;
fi

if [ -z $egs_dir ]; then
  egs_dir=$dir/egs
fi

# Read the two values this script needs from $egs_dir/info.  Each failure
# message names the file that could not actually be read.
frames_per_eg=$(cat $egs_dir/info/frames_per_eg) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; }
num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/num_archives"; exit 1; }

# num_archives_expanded considers each separate label-position from
# 0..frames_per_eg-1 to be a separate archive.
num_archives_expanded=$[$num_archives*$frames_per_eg]

if [ $num_jobs_nnet -gt $num_archives_expanded ]; then
  echo "$0: --num-jobs-nnet cannot exceed num-archives*frames-per-eg which is $num_archives_expanded"
  echo "$0: setting --num-jobs-nnet to $num_archives_expanded"
  num_jobs_nnet=$num_archives_expanded
fi

if ! [ $num_hidden_layers -ge 1 ]; then
  echo "Invalid num-hidden-layers $num_hidden_layers"
  exit 1
fi

if [ $stage -le -2 ]; then
  echo "$0: initializing neural net";
  lda_mat=$dir/lda.mat
  tot_input_dim=$[$feat_dim+$ivector_dim]

  online_preconditioning_opts="alpha=$alpha num-samples-history=$num_samples_history update-period=$update_period rank-in=$precondition_rank_in rank-out=$precondition_rank_out max-change-per-sample=$max_change_per_sample"

  # create the config files for nnet initialization
  python steps/nnet2/make_multisplice_configs.py  \
    --splice-indexes "$splice_indexes"  \
    --total-input-dim $tot_input_dim  \
    --ivector-dim $ivector_dim  \
    --lda-mat "$lda_mat"  \
    --lda-dim $lda_dim  \
    --pnorm-input-dim $pnorm_input_dim  \
    --pnorm-output-dim  $pnorm_output_dim \
    --online-preconditioning-opts "$online_preconditioning_opts"  \
    --initial-learning-rate $initial_learning_rate  \
    --bias-stddev  $bias_stddev  \
    --num-hidden-layers $num_hidden_layers \
    --num-targets  $num_leaves  \
    configs  $dir || exit -1;

  $cmd $dir/log/nnet_init.log \
    nnet-am-init $alidir/tree $lang/topo "nnet-init $dir/nnet.config -|" \
    $dir/0.mdl || exit 1;
fi


if [ $stage -le -1 ]; then
  echo "Training transition probabilities and setting priors"
  $cmd $dir/log/train_trans.log \
    nnet-train-transitions $dir/0.mdl "ark:gunzip -c $alidir/ali.*.gz|" $dir/0.mdl \
    || exit 1;
fi

# set num_iters so that as close as possible, we process the data $num_epochs
# times, i.e. $num_iters*$num_jobs_nnet == $num_epochs*$num_archives_expanded
num_iters=$[($num_epochs*$num_archives_expanded)/$num_jobs_nnet]

echo "$0: Will train for $num_epochs epochs = $num_iters iterations"

finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period]
# This is when we decide to mix up from: halfway between when we've finished
# adding the hidden layers and the end of training.
mix_up_iter=$[($num_iters + $finish_add_layers_iter)/2]

# Choose the nnet-train binary suffix and its thread option.  Start from the
# multi-threaded settings and override them for the single-thread case.
parallel_suffix="-parallel"
parallel_train_opts="--num-threads=$num_threads"
if [ $num_threads -eq 1 ]; then
  # With just one thread the "-simple" binary is used, which enables us to
  # use GPU code.
  parallel_suffix="-simple"
  parallel_train_opts=
  cuda-compiled || {
    echo "$0: WARNING: you are running with one thread but you have not compiled"
    echo "   for CUDA.  You may be running a setup optimized for GPUs.  If you have"
    echo "   GPUs and have nvcc installed, go to src/ and do ./configure; make"
  }
fi


approx_iters_per_epoch=$(($num_iters/$num_epochs))
# Number of models handed to the final nnet-combine-fast invocation:
#   min(max(max_models_combine, approx_iters_per_epoch),
#       2/3 * iters_after_mixup)
iters_after_mixup_23=$(( (($num_iters-$mix_up_iter-1)*2)/3 ))
num_models_combine=$max_models_combine
# raise to at least one (approximate) epoch worth of iterations ...
[ $num_models_combine -lt $approx_iters_per_epoch ] && num_models_combine=$approx_iters_per_epoch
# ... then cap at two thirds of the iterations that follow mix-up.
[ $num_models_combine -gt $iters_after_mixup_23 ] && num_models_combine=$iters_after_mixup_23
# Index of the first model that takes part in the combination.
first_model_combine=$(($num_iters-$num_models_combine+1))

x=0

for realign_epoch in $realign_epochs; do
  # compare the equation below with the equation we use to set num_iters above.
  # note, realign_epochs may be floating-point, which is why we don't use $[] to
  # do the math.
  realign_iter=$(perl -e 'print int(($ARGV[0]*$ARGV[1])/$ARGV[2]);' $realign_epoch $num_archives_expanded $num_jobs_nnet)
  realign_this_iter[$realign_iter]=$realign_epoch
done

cur_egs_dir=$egs_dir

while [ $x -lt $num_iters ]; do

  if [ ! -z "${realign_this_iter[$x]}" ]; then
    prev_egs_dir=$cur_egs_dir
    cur_egs_dir=$dir/egs_${realign_this_iter[$x]}
  fi

  if [ $x -ge 0 ] && [ $stage -le $x ]; then
    if [ ! -z "${realign_this_iter[$x]}" ]; then
      epoch=${realign_this_iter[$x]}

      echo "Getting average posterior for purposes of adjusting the priors."
      # Note: this just uses CPUs, using a smallish subset of data.
      # always use the first egs archive, which makes the script simpler;
      # we're using different random subsets of it.
      rm $dir/post.$x.*.vec 2>/dev/null
      $cmd JOB=1:$num_jobs_compute_prior $dir/log/get_post.$x.JOB.log \
        nnet-copy-egs --srand=JOB --frame=random ark:$prev_egs_dir/egs.1.ark ark:- \| \
        nnet-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \
        nnet-compute-from-egs "nnet-to-raw-nnet $dir/$x.mdl -|" ark:- ark:- \| \
        matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1;

      sleep 3;  # make sure there is time for $dir/post.$x.*.vec to appear.

      $cmd $dir/log/vector_sum.$x.log \
        vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1;
      rm $dir/post.$x.*.vec;

      echo "Re-adjusting priors based on computed posteriors"
      $cmd $dir/log/adjust_priors.$x.log \
        nnet-adjust-priors $dir/$x.mdl $dir/post.$x.vec $dir/$x.mdl || exit 1;

      sleep 2

      steps/nnet2/align.sh --nj $num_jobs_align --cmd "$align_cmd" --use-gpu $align_use_gpu \
        --transform-dir "$transform_dir" --online-ivector-dir "$online_ivector_dir" \
        --iter $x $data $lang $dir $dir/ali_$epoch || exit 1

      steps/nnet2/relabel_egs2.sh --cmd "$cmd" --iter $x $dir/ali_$epoch \
        $prev_egs_dir $cur_egs_dir || exit 1

      if $cleanup && [[ $prev_egs_dir =~ $dir/egs* ]]; then
        steps/nnet2/remove_egs.sh $prev_egs_dir
      fi
    fi

    # Set off jobs doing some diagnostics, in the background.
    # Use the egs dir from the previous iteration for the diagnostics
    $cmd $dir/log/compute_prob_valid.$x.log \
      nnet-compute-prob $dir/$x.mdl ark:$cur_egs_dir/valid_diagnostic.egs &
    $cmd $dir/log/compute_prob_train.$x.log \
      nnet-compute-prob $dir/$x.mdl ark:$cur_egs_dir/train_diagnostic.egs &
    if [ $x -gt 0 ] && [ ! -f $dir/log/mix_up.$[$x-1].log ]; then
      $cmd $dir/log/progress.$x.log \
        nnet-show-progress --use-gpu=no $dir/$[$x-1].mdl $dir/$x.mdl \
        ark:$cur_egs_dir/train_diagnostic.egs '&&' \
        nnet-am-info $dir/$x.mdl &
    fi

    echo "Training neural net (pass $x)"

    if [ $x -gt 0 ] && \
      [ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \
      [ $[$x%$add_layers_period] -eq 0 ]; then
      cur_num_hidden_layers=$[$x/$add_layers_period];
      mdl="nnet-init --srand=$x $dir/hidden_${cur_num_hidden_layers}.config - | nnet-insert $dir/$x.mdl - - |"
    else
      mdl=$dir/$x.mdl
    fi
    if [ $x -eq 0 ] || [ "$mdl" != "$dir/$x.mdl" ]; then
      # on iteration zero or when we just added a layer, use a smaller minibatch
      # size and just one job: the model-averaging doesn't seem to be helpful
      # when the model is changing too fast (i.e. it worsens the objective
      # function), and the smaller minibatch size will help to keep
      # the update stable.
      this_minibatch_size=$[$minibatch_size/2];
      do_average=false
    else
      this_minibatch_size=$minibatch_size
      do_average=true
    fi

    rm $dir/.error 2>/dev/null


    ( # this sub-shell is so that when we "wait" below,
      # we only wait for the training jobs that we just spawned,
      # not the diagnostic jobs that we spawned above.

      # We can't easily use a single parallel SGE job to do the main training,
      # because the computation of which archive and which --frame option
      # to use for each job is a little complex, so we spawn each one separately.
      for n in $(seq $num_jobs_nnet); do
        k=$[$x*$num_jobs_nnet + $n - 1]; # k is a zero-based index that we'll derive
                                         # the other indexes from.
        archive=$[($k%$num_archives)+1]; # work out the 1-based archive index.
        frame=$[(($k/$num_archives)%$frames_per_eg)]; # work out the 0-based frame
        # index; this increases more slowly than the archive index because the
        # same archive with different frame indexes will give similar gradients,
        # so we want to separate them in time.

        $cmd $parallel_opts $dir/log/train.$x.$n.log \
          nnet-train$parallel_suffix $parallel_train_opts \
          --minibatch-size=$this_minibatch_size --srand=$x "$mdl" \
          "ark,bg:nnet-copy-egs --frame=$frame ark:$cur_egs_dir/egs.$archive.ark ark:-|nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-|" \
          $dir/$[$x+1].$n.mdl || touch $dir/.error &
      done
      wait
    )
    # the error message below is not that informative, but $cmd will
    # have printed a more specific one.
    [ -f $dir/.error ] && echo "$0: error on iteration $x of training" && exit 1;

    nnets_list=
    for n in `seq 1 $num_jobs_nnet`; do
      nnets_list="$nnets_list $dir/$[$x+1].$n.mdl"
    done

    learning_rate=`perl -e '($x,$n,$i,$f)=@ARGV; print ($x >= $n ? $f : $i*exp($x*log($f/$i)/$n));' $[$x+1] $num_iters $initial_learning_rate $final_learning_rate`;

    if $do_average; then
      # average the output of the different jobs.
      $cmd $dir/log/average.$x.log \
        nnet-am-average $nnets_list - \| \
        nnet-am-copy --learning-rate=$learning_rate - $dir/$[$x+1].mdl || exit 1;
    else
      # choose the best from the different jobs.
      n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) {
          $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn";
          undef $logprob; while (<F>) { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } }
          close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob;
          $best_n=$n; } } print "$best_n\n"; ' $num_jobs_nnet $dir/log/train.$x.%d.log) || exit 1;
      [ -z "$n" ] && echo "Error getting best model" && exit 1;
      $cmd $dir/log/select.$x.log \
        nnet-am-copy --learning-rate=$learning_rate $dir/$[$x+1].$n.mdl $dir/$[$x+1].mdl || exit 1;
    fi

    if [ "$mix_up" -gt 0 ] && [ $x -eq $mix_up_iter ]; then
      # mix up.
      echo Mixing up from $num_leaves to $mix_up components
      $cmd $dir/log/mix_up.$x.log \
        nnet-am-mixup --min-count=10 --num-mixtures=$mix_up \
        $dir/$[$x+1].mdl $dir/$[$x+1].mdl || exit 1;
    fi
    rm $nnets_list
    [ ! -f $dir/$[$x+1].mdl ] && exit 1;
    if [ -f $dir/$[$x-1].mdl ] && $cleanup && \
       [ $[($x-1)%100] -ne 0  ] && [ $[$x-1] -lt $first_model_combine ]; then
      rm $dir/$[$x-1].mdl
    fi
  fi
  x=$[$x+1]
done


if [ $stage -le $num_iters ]; then
  echo "Doing final combination to produce final.mdl"

  # Now do combination.
  nnets_list=()
  # the if..else..fi statement below sets 'nnets_list'.
  if [ $max_models_combine -lt $num_models_combine ]; then
    # The number of models to combine is too large, e.g. > 20.  In this case,
    # each argument to nnet-combine-fast will be an average of multiple models.
    cur_offset=0 # current offset from first_model_combine.
    for n in $(seq $max_models_combine); do
      next_offset=$[($n*$num_models_combine)/$max_models_combine]
      sub_list=""
      for o in $(seq $cur_offset $[$next_offset-1]); do
        iter=$[$first_model_combine+$o]
        mdl=$dir/$iter.mdl
        [ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1;
        sub_list="$sub_list $mdl"
      done
      nnets_list[$[$n-1]]="nnet-am-average $sub_list - |"
      cur_offset=$next_offset
    done
  else
    nnets_list=
    for n in $(seq 0 $[num_models_combine-1]); do
      iter=$[$first_model_combine+$n]
      mdl=$dir/$iter.mdl
      [ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1;
      nnets_list[$n]=$mdl
    done
  fi


  # Below, use --use-gpu=no to disable nnet-combine-fast from using a GPU, as
  # if there are many models it can give out-of-memory error; set num-threads to 8
  # to speed it up (this isn't ideal...)
  num_egs=`nnet-copy-egs ark:$cur_egs_dir/combine.egs ark:/dev/null 2>&1 | tail -n 1 | awk '{print $NF}'`
  mb=$[($num_egs+$combine_num_threads-1)/$combine_num_threads]
  [ $mb -gt 512 ] && mb=512
  # Setting --initial-model to a large value makes it initialize the combination
  # with the average of all the models.  It's important not to start with a
  # single model, or, due to the invariance to scaling that these nonlinearities
  # give us, we get zero diagonal entries in the fisher matrix that
  # nnet-combine-fast uses for scaling, which after flooring and inversion, has
  # the effect that the initial model chosen gets much higher learning rates
  # than the others.  This prevents the optimization from working well.
  $cmd $combine_parallel_opts $dir/log/combine.log \
    nnet-combine-fast --initial-model=100000 --num-lbfgs-iters=40 --use-gpu=no \
      --num-threads=$combine_num_threads \
      --verbose=3 --minibatch-size=$mb "${nnets_list[@]}" ark:$cur_egs_dir/combine.egs \
      $dir/final.mdl || exit 1;

  # Normalize stddev for affine or block affine layers that are followed by a
  # pnorm layer and then a normalize layer.
  $cmd $dir/log/normalize.log \
    nnet-normalize-stddev $dir/final.mdl $dir/final.mdl || exit 1;

  # Compute the probability of the final, combined model with
  # the same subset we used for the previous compute_probs, as the
  # different subsets will lead to different probs.
  $cmd $dir/log/compute_prob_valid.final.log \
    nnet-compute-prob $dir/final.mdl ark:$cur_egs_dir/valid_diagnostic.egs &
  $cmd $dir/log/compute_prob_train.final.log \
    nnet-compute-prob $dir/final.mdl ark:$cur_egs_dir/train_diagnostic.egs &
fi

if [ $stage -le $[$num_iters+1] ]; then
  echo "Getting average posterior for purposes of adjusting the priors."
  # Note: this just uses CPUs, using a smallish subset of data.
  rm $dir/post.$x.*.vec 2>/dev/null
  $cmd JOB=1:$num_jobs_compute_prior $dir/log/get_post.$x.JOB.log \
    nnet-copy-egs --frame=random --srand=JOB ark:$cur_egs_dir/egs.1.ark ark:- \| \
    nnet-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \
    nnet-compute-from-egs "nnet-to-raw-nnet $dir/final.mdl -|" ark:- ark:- \| \
    matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1;

  sleep 3;  # make sure there is time for $dir/post.$x.*.vec to appear.

  $cmd $dir/log/vector_sum.$x.log \
   vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1;

  rm $dir/post.$x.*.vec;

  echo "Re-adjusting priors based on computed posteriors"
  $cmd $dir/log/adjust_priors.final.log \
    nnet-adjust-priors $dir/final.mdl $dir/post.$x.vec $dir/final.mdl || exit 1;
fi


if [ ! -f $dir/final.mdl ]; then
  echo "$0: $dir/final.mdl does not exist."
  # we don't want to clean up if the training didn't succeed.
  exit 1;
fi

sleep 2

echo Done

# Optional cleanup, run only after final.mdl has been confirmed to exist.
# $cleanup is executed as a command, so it is expected to hold "true" or "false".
if $cleanup; then
  echo Cleaning up data
  # Only remove the egs when their path matches "$dir/egs...".
  # NOTE(review): '=~' is a regex match, so 'egs*' means "eg" followed by any
  # number of "s" and the match is not anchored; a glob was presumably
  # intended -- confirm before relying on this as a safety check.
  if [[ $cur_egs_dir =~ $dir/egs* ]]; then
    steps/nnet2/remove_egs.sh $cur_egs_dir
  fi

  echo Removing most of the models
  for x in `seq 0 $num_iters`; do
    if [ $[$x%100] -ne 0 ] && [ $x -ne $num_iters ] && [ -f $dir/$x.mdl ]; then
       # Delete every numbered model except each 100th one and the last
       # ($num_iters).  Note that models which were combined to form final.mdl
       # are not exempt here: no condition on first_model_combine is tested.
      rm $dir/$x.mdl
    fi
  done
fi


================================================
FILE: egs/steps/nnet2/train_pnorm_simple.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2014  Johns Hopkins University (Author: Daniel Povey).
#           2013  Xiaohui Zhang
#           2013  Guoguo Chen
#           2014  Vimal Manohar
# Apache 2.0.


# train_pnorm_simple.sh is a modified version of train_pnorm_fast.sh.  Like
# train_pnorm_fast.sh, it uses the `online' preconditioning, which is faster
# (especially on GPUs).  The difference is that the learning-rate schedule is
# simpler, with the learning rate exponentially decreasing during training,
# and no phase where the learning rate is constant.
#
# Also, the final model-combination is done a bit differently: we combine models
# over typically a whole epoch, and because that would be too many iterations to
# easily be able to combine over, we arrange the iterations into groups (20
# groups by default) and average over each group.
#
# [Vimal Manohar - Oct 2014]
# The script now supports realignment during training, which can be done by
# specifying realign_epochs.

# Begin configuration section.
# Defaults for every option of this script.  They are assigned before
# parse_options.sh is sourced further down.
cmd=run.pl
num_epochs=15      # Number of epochs of training;
                   # the number of iterations is worked out from this.
initial_learning_rate=0.04
final_learning_rate=0.004
bias_stddev=0.5
pnorm_input_dim=3000
pnorm_output_dim=300
p=2                # written as "p=" on the PnormComponent lines of the nnet configs.
minibatch_size=128 # by default use a smallish minibatch size for neural net
                   # training; this controls instability which would otherwise
                   # be a problem with multi-threaded update.

samples_per_iter=400000 # each iteration of training, see this many samples
                        # per job.  This option is passed to get_egs.sh
num_jobs_nnet=16   # Number of neural net jobs to run in parallel.  This option
                   # is passed to get_egs.sh.
get_egs_stage=0
online_ivector_dir=


max_models_combine=20 # The "max_models_combine" is the maximum number of models we give
  # to the final 'combine' stage, but these models will themselves be averages of
  # iteration-number ranges.

shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
                # on each iter.  You could set it to 0 or to a large value for complete
                # randomization, but this would both consume memory and cause spikes in
                # disk I/O.  Smaller is easier on disk and memory but less random.  It's
                # not a huge deal though, as samples are anyway randomized right at the start.
                # (The point of this is to get data in different minibatches on different
                # iterations, since in the preconditioning method, 2 samples in the same
                # minibatch can affect each other's gradients.)

add_layers_period=2 # by default, add new layers every 2 iterations.
num_hidden_layers=3
stage=-4

io_opts="--max-jobs-run 5" # for jobs with a lot of I/O, limits the number running at one time.
splice_width=4 # meaning +- 4 frames on each side for second LDA
randprune=4.0 # speeds up LDA.
alpha=4.0 # relates to preconditioning.
update_period=4 # relates to online preconditioning: says how often we update the subspace.
num_samples_history=2000 # relates to online preconditioning
max_change_per_sample=0.075
precondition_rank_in=20  # relates to online preconditioning
precondition_rank_out=80 # relates to online preconditioning

mix_up=0 # Number of components to mix up to (should be > #tree leaves, if
        # specified.)
num_threads=16
parallel_opts="--num-threads 16 --mem 1G"
  # by default we use 16 threads; this lets the queue know.
  # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.
combine_num_threads=8
combine_parallel_opts="--num-threads 8"  # queue options for the "combine" stage.
cleanup=true
egs_dir=
lda_opts=
lda_dim=           # note: overwritten later in this script with the contents of $dir/lda_dim.
egs_opts=
transform_dir=     # If supplied, overrides alidir
cmvn_opts=  # will be passed to get_lda.sh and get_egs.sh, if supplied.
            # only relevant for "raw" features, not lda.
feat_type=  # Can be used to force "raw" features.
prior_subset_size=10000 # 10k samples per job, for computing priors.  Should be
                        # more than enough.
align_cmd=              # The cmd that is passed to steps/nnet2/align.sh
align_use_gpu=          # Passed to use_gpu in steps/nnet2/align.sh [yes/no]
realign_epochs=         # List of epochs, the beginning of which realignment is done
num_jobs_align=30       # Number of jobs for realignment
# End configuration section.


echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# Print the usage message and abort unless exactly four positional arguments
# (<data> <lang> <ali-dir> <exp-dir>) remain after option parsing.
# The defaults quoted after '|' are kept in sync with the configuration
# section at the top of this script.
if [ $# != 4 ]; then
  echo "Usage: $0 [opts] <data> <lang> <ali-dir> <exp-dir>"
  echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --num-epochs <#epochs|15>                        # Number of epochs of training"
  echo "  --initial-learning-rate <initial-learning-rate|0.04> # Learning rate at start of training, e.g. 0.02 for small"
  echo "                                                       # data, 0.01 for large data"
  echo "  --final-learning-rate  <final-learning-rate|0.004>   # Learning rate at end of training, e.g. 0.004 for small"
  echo "                                                   # data, 0.001 for large data"
  echo "  --num-hidden-layers <#hidden-layers|3>           # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs"
  echo "  --add-layers-period <#iters|2>                   # Number of iterations between adding hidden layers"
  echo "  --mix-up <#pseudo-gaussians|0>                   # Can be used to have multiple targets in final output layer,"
  echo "                                                   # per context-dependent state.  Try a number several times #states."
  echo "  --num-jobs-nnet <num-jobs|16>                    # Number of parallel jobs to use for main neural net"
  echo "                                                   # training (will affect results as well as speed; try 8, 16)"
  echo "                                                   # Note: if you increase this, you may want to also increase"
  echo "                                                   # the learning rate."
  echo "  --num-threads <num-threads|16>                   # Number of parallel threads per job (will affect results"
  echo "                                                   # as well as speed; may interact with batch size; if you increase"
  echo "                                                   # this, you may want to decrease the batch size."
  echo "  --parallel-opts <opts|\"--num-threads 16 --mem 1G\">      # extra options to pass to e.g. queue.pl for processes that"
  echo "                                                   # use multiple threads... "
  echo "  --io-opts <opts|\"--max-jobs-run 5\">                       # Options given to e.g. queue.pl for jobs that do a lot of I/O."
  echo "  --minibatch-size <minibatch-size|128>            # Size of minibatch to process (note: product with --num-threads"
  echo "                                                   # should not get too large, e.g. >2k)."
  echo "  --samples-per-iter <#samples|400000>             # Number of samples of data to process per iteration, per"
  echo "                                                   # process."
  echo "  --splice-width <width|4>                         # Number of frames on each side to append for feature input"
  echo "                                                   # (note: we splice processed, typically 40-dimensional frames)"
  echo "  --lda-dim <dim|''>                               # Dimension to reduce spliced features to with LDA"
  echo "  --realign-epochs <list-of-epochs|\"\">           # A list of space-separated epoch indices the beginning of which"
  echo "                                                   # realignment is to be done"
  echo "  --align-cmd (utils/run.pl|utils/queue.pl <queue opts>) # passed to align.sh"
  echo "  --align-use-gpu (yes/no)                         # specify if gpu is to be used for realignment"
  echo "  --num-jobs-align <#njobs|30>                     # Number of jobs to perform realignment"
  echo "  --stage <stage|-4>                               # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."


  exit 1;
fi

data=$1
lang=$2
alidir=$3
dir=$4

if [ ! -z "$realign_epochs" ]; then
  [ -z "$align_cmd" ] && echo "$0: realign_epochs specified but align_cmd not specified" && exit 1
  [ -z "$align_use_gpu" ] && echo "$0: realign_epochs specified but align_use_gpu not specified" && exit 1
fi

# Check some files.
for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done


# Set some variables.
# Number of tree leaves, taken from the second field of the "num-pdfs" line
# printed by tree-info for the alignment directory's tree.
num_leaves=`tree-info $alidir/tree 2>/dev/null | grep num-pdfs | awk '{print $2}'` || exit 1
# The expansion is quoted so that an empty or multi-word value is tested as one
# string; unquoted, '[' would misparse its arguments and the check would be
# silently bypassed.
[ -z "$num_leaves" ] && echo "\$num_leaves is unset" && exit 1
[ "$num_leaves" -eq "0" ] && echo "\$num_leaves is 0" && exit 1

nj=`cat $alidir/num_jobs` || exit 1;  # number of jobs in alignment dir...
# in this dir we'll have just one job.
sdata=$data/split$nj
utils/split_data.sh $data $nj

mkdir -p $dir/log
echo $nj > $dir/num_jobs
cp $alidir/tree $dir

utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

extra_opts=()
[ ! -z "$cmvn_opts" ] && extra_opts+=(--cmvn-opts "$cmvn_opts")
[ ! -z "$feat_type" ] && extra_opts+=(--feat-type $feat_type)
[ ! -z "$online_ivector_dir" ] && extra_opts+=(--online-ivector-dir $online_ivector_dir)
[ -z "$transform_dir" ] && transform_dir=$alidir
extra_opts+=(--transform-dir $transform_dir)
extra_opts+=(--splice-width $splice_width)

if [ $stage -le -4 ]; then
  echo "$0: calling get_lda.sh"
  steps/nnet2/get_lda.sh $lda_opts "${extra_opts[@]}" --cmd "$cmd" $data $lang $alidir $dir || exit 1;
fi

# these files will have been written by get_lda.sh
feat_dim=$(cat $dir/feat_dim) || exit 1;
ivector_dim=$(cat $dir/ivector_dim) || exit 1;
lda_dim=$(cat $dir/lda_dim) || exit 1;

if [ $stage -le -3 ] && [ -z "$egs_dir" ]; then
  echo "$0: calling get_egs.sh"
  steps/nnet2/get_egs.sh $egs_opts "${extra_opts[@]}" \
      --samples-per-iter $samples_per_iter \
      --num-jobs-nnet $num_jobs_nnet --stage $get_egs_stage \
      --cmd "$cmd" $egs_opts --io-opts "$io_opts" \
      $data $lang $alidir $dir || exit 1;
fi

if [ -z $egs_dir ]; then
  egs_dir=$dir/egs
fi

iters_per_epoch=`cat $egs_dir/iters_per_epoch`  || exit 1;
! [ $num_jobs_nnet -eq `cat $egs_dir/num_jobs_nnet` ] && \
  echo "$0: Warning: using --num-jobs-nnet=`cat $egs_dir/num_jobs_nnet` from $egs_dir"
num_jobs_nnet=`cat $egs_dir/num_jobs_nnet` || exit 1;


if ! [ $num_hidden_layers -ge 1 ]; then
  echo "Invalid num-hidden-layers $num_hidden_layers"
  exit 1
fi

if [ $stage -le -2 ]; then
  echo "$0: initializing neural net";
  lda_mat=$dir/lda.mat
  tot_input_dim=$[$feat_dim+$ivector_dim]

  online_preconditioning_opts="alpha=$alpha num-samples-history=$num_samples_history update-period=$update_period rank-in=$precondition_rank_in rank-out=$precondition_rank_out max-change-per-sample=$max_change_per_sample"

  stddev=`perl -e "print 1.0/sqrt($pnorm_input_dim);"`
  cat >$dir/nnet.config <<EOF
SpliceComponent input-dim=$tot_input_dim left-context=$splice_width right-context=$splice_width const-component-dim=$ivector_dim
FixedAffineComponent matrix=$lda_mat
AffineComponentPreconditionedOnline input-dim=$lda_dim output-dim=$pnorm_input_dim $online_preconditioning_opts learning-rate=$initial_learning_rate param-stddev=$stddev bias-stddev=$bias_stddev
PnormComponent input-dim=$pnorm_input_dim output-dim=$pnorm_output_dim p=$p
NormalizeComponent dim=$pnorm_output_dim
AffineComponentPreconditionedOnline input-dim=$pnorm_output_dim output-dim=$num_leaves $online_preconditioning_opts learning-rate=$initial_learning_rate param-stddev=0 bias-stddev=0
SoftmaxComponent dim=$num_leaves
EOF

  # to hidden.config it will write the part of the config corresponding to a
  # single hidden layer; we need this to add new layers.
  cat >$dir/hidden.config <<EOF
AffineComponentPreconditionedOnline input-dim=$pnorm_output_dim output-dim=$pnorm_input_dim $online_preconditioning_opts learning-rate=$initial_learning_rate param-stddev=$stddev bias-stddev=$bias_stddev
PnormComponent input-dim=$pnorm_input_dim output-dim=$pnorm_output_dim p=$p
NormalizeComponent dim=$pnorm_output_dim
EOF
  $cmd $dir/log/nnet_init.log \
    nnet-am-init $alidir/tree $lang/topo "nnet-init $dir/nnet.config -|" \
    $dir/0.mdl || exit 1;
fi

if [ $stage -le -1 ]; then
  echo "Training transition probabilities and setting priors"
  $cmd $dir/log/train_trans.log \
    nnet-train-transitions $dir/0.mdl "ark:gunzip -c $alidir/ali.*.gz|" $dir/0.mdl \
    || exit 1;
fi

num_iters=$[$num_epochs * $iters_per_epoch];

echo "$0: Will train for $num_epochs epochs = $num_iters iterations"

finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period]
# This is when we decide to mix up from: halfway between when we've finished
# adding the hidden layers and the end of training.
mix_up_iter=$[($num_iters + $finish_add_layers_iter)/2]

# Choose the nnet-train binary suffix and its thread option.  Start from the
# multi-threaded settings and override them for the single-thread case.
parallel_suffix="-parallel"
parallel_train_opts="--num-threads=$num_threads"
if [ $num_threads -eq 1 ]; then
  # With just one thread the "-simple" binary is used, which enables us to
  # use GPU code.
  parallel_suffix="-simple"
  parallel_train_opts=
  cuda-compiled || {
    echo "$0: WARNING: you are running with one thread but you have not compiled"
    echo "   for CUDA.  You may be running a setup optimized for GPUs.  If you have"
    echo "   GPUs and have nvcc installed, go to src/ and do ./configure; make"
  }
fi

# Number of models handed to the final nnet-combine-fast invocation:
#   min(max(max_models_combine, iters_per_epoch),
#       2/3 * iters_after_mixup)
iters_after_mixup_23=$(( (($num_iters-$mix_up_iter-1)*2)/3 ))
num_models_combine=$max_models_combine
# raise to at least one epoch worth of iterations ...
[ $num_models_combine -lt $iters_per_epoch ] && num_models_combine=$iters_per_epoch
# ... then cap at two thirds of the iterations that follow mix-up.
[ $num_models_combine -gt $iters_after_mixup_23 ] && num_models_combine=$iters_after_mixup_23
# Index of the first model that takes part in the combination.
first_model_combine=$(($num_iters-$num_models_combine+1))

x=0

for realign_epoch in $realign_epochs; do
  realign_iter=`perl -e 'print int($ARGV[0] * $ARGV[1]);' $realign_epoch $iters_per_epoch`
  realign_this_iter[$realign_iter]=$realign_epoch
done

cur_egs_dir=$egs_dir

# Main training loop: one pass per iteration x in [0, num_iters).  Each pass
# optionally realigns the data, launches diagnostics in the background, trains
# $num_jobs_nnet models in parallel and merges them into $dir/$[$x+1].mdl.
while [ $x -lt $num_iters ]; do

  # Track the examples directory even for iterations that are skipped because
  # of --stage, so that a resumed run uses the right (relabelled) egs.
  if [ ! -z "${realign_this_iter[$x]}" ]; then
    prev_egs_dir=$cur_egs_dir
    cur_egs_dir=$dir/egs_${realign_this_iter[$x]}
  fi

  if [ $x -ge 0 ] && [ $stage -le $x ]; then
    if [ ! -z "${realign_this_iter[$x]}" ]; then
      epoch=${realign_this_iter[$x]}

      echo "Getting average posterior for purposes of adjusting the priors."
      # Note: this just uses CPUs, using a smallish subset of data.
      rm $dir/post.$x.*.vec 2>/dev/null
      $cmd JOB=1:$num_jobs_nnet $dir/log/get_post.$x.JOB.log \
        nnet-subset-egs --n=$prior_subset_size ark:$prev_egs_dir/egs.JOB.0.ark ark:- \| \
        nnet-compute-from-egs "nnet-to-raw-nnet $dir/$x.mdl -|" ark:- ark:- \| \
        matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1;

      sleep 3;  # make sure there is time for $dir/post.$x.*.vec to appear.

      $cmd $dir/log/vector_sum.log \
        vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1;

      rm $dir/post.$x.*.vec;

      echo "Re-adjusting priors based on computed posteriors"
      # The summed posterior is written to post.$x.vec by the vector-sum call
      # above, so that is the file that must be read here.
      $cmd $dir/log/adjust_priors.$x.log \
        nnet-adjust-priors $dir/$x.mdl $dir/post.$x.vec $dir/$x.mdl || exit 1;

      sleep 2

      steps/nnet2/align.sh --nj $num_jobs_align --cmd "$align_cmd" --use-gpu $align_use_gpu \
        --transform-dir "$transform_dir" --online-ivector-dir "$online_ivector_dir" \
        --iter $x $data $lang $dir $dir/ali_$epoch || exit 1

      steps/nnet2/relabel_egs.sh --cmd "$cmd" --iter $x $dir/ali_$epoch \
        $prev_egs_dir $cur_egs_dir || exit 1

      if $cleanup && [[ $prev_egs_dir =~ $dir/egs* ]]; then
        steps/nnet2/remove_egs.sh $prev_egs_dir
      fi
    fi

    # Set off jobs doing some diagnostics, in the background.
    # Use the egs dir from the previous iteration for the diagnostics
    $cmd $dir/log/compute_prob_valid.$x.log \
      nnet-compute-prob $dir/$x.mdl ark:$cur_egs_dir/valid_diagnostic.egs &
    $cmd $dir/log/compute_prob_train.$x.log \
      nnet-compute-prob $dir/$x.mdl ark:$cur_egs_dir/train_diagnostic.egs &
    # Progress is not reported right after a mix-up iteration (detected via
    # the presence of that iteration's mix_up log).
    if [ $x -gt 0 ] && [ ! -f $dir/log/mix_up.$[$x-1].log ]; then
      $cmd $dir/log/progress.$x.log \
        nnet-show-progress --use-gpu=no $dir/$[$x-1].mdl $dir/$x.mdl \
        ark:$cur_egs_dir/train_diagnostic.egs '&&' \
        nnet-am-info $dir/$x.mdl &
    fi

    echo "Training neural net (pass $x)"

    # Every $add_layers_period iterations (until all hidden layers are in),
    # the input model becomes a pipe that inserts a newly initialised hidden
    # layer into the current model.
    if [ $x -gt 0 ] && \
      [ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \
      [ $[($x-1) % $add_layers_period] -eq 0 ]; then
      mdl="nnet-init --srand=$x $dir/hidden.config - | nnet-insert $dir/$x.mdl - - |"
    else
      mdl=$dir/$x.mdl
    fi
    if [ $x -eq 0 ] || [ "$mdl" != "$dir/$x.mdl" ]; then
      # on iteration zero or when we just added a layer, use a smaller minibatch
      # size and just one job: the model-averaging doesn't seem to be helpful
      # when the model is changing too fast (i.e. it worsens the objective
      # function), and the smaller minibatch size will help to keep
      # the update stable.
      this_minibatch_size=$[$minibatch_size/2];
      do_average=false
    else
      this_minibatch_size=$minibatch_size
      do_average=true
    fi

    $cmd $parallel_opts JOB=1:$num_jobs_nnet $dir/log/train.$x.JOB.log \
      nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x \
      ark:$cur_egs_dir/egs.JOB.$[$x%$iters_per_epoch].ark ark:- \| \
       nnet-train$parallel_suffix $parallel_train_opts \
        --minibatch-size=$this_minibatch_size --srand=$x "$mdl" \
        ark:- $dir/$[$x+1].JOB.mdl \
      || exit 1;

    # The per-job models produced by this iteration.
    nnets_list=
    for n in `seq 1 $num_jobs_nnet`; do
      nnets_list="$nnets_list $dir/$[$x+1].$n.mdl"
    done

    # Learning rate decays exponentially from the initial to the final value
    # over num_iters iterations and is then held at the final value.
    learning_rate=`perl -e '($x,$n,$i,$f)=@ARGV; print ($x >= $n ? $f : $i*exp($x*log($f/$i)/$n));' $[$x+1] $num_iters $initial_learning_rate $final_learning_rate`;

    if $do_average; then
      # average the output of the different jobs.
      $cmd $dir/log/average.$x.log \
        nnet-am-average $nnets_list - \| \
        nnet-am-copy --learning-rate=$learning_rate - $dir/$[$x+1].mdl || exit 1;
    else
      # choose the best from the different jobs.
      n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) {
          $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn";
          undef $logprob; while (<F>) { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } }
          close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob;
          $best_n=$n; } } print "$best_n\n"; ' $num_jobs_nnet $dir/log/train.$x.%d.log) || exit 1;
      [ -z "$n" ] && echo "Error getting best model" && exit 1;
      $cmd $dir/log/select.$x.log \
        nnet-am-copy --learning-rate=$learning_rate $dir/$[$x+1].$n.mdl $dir/$[$x+1].mdl || exit 1;
    fi

    if [ "$mix_up" -gt 0 ] && [ $x -eq $mix_up_iter ]; then
      # mix up.
      echo Mixing up from $num_leaves to $mix_up components
      $cmd $dir/log/mix_up.$x.log \
        nnet-am-mixup --min-count=10 --num-mixtures=$mix_up \
        $dir/$[$x+1].mdl $dir/$[$x+1].mdl || exit 1;
    fi
    rm $nnets_list
    [ ! -f $dir/$[$x+1].mdl ] && exit 1;
    # Remove the model from two iterations back, unless it is a multiple of
    # 100 or belongs to the range used for the final combination.
    if [ -f $dir/$[$x-1].mdl ] && $cleanup && \
       [ $[($x-1)%100] -ne 0  ] && [ $[$x-1] -lt $first_model_combine ]; then
      rm $dir/$[$x-1].mdl
    fi
  fi
  x=$[$x+1]
done


# Final stage of training proper: combine the last $num_models_combine
# iteration models into $dir/final.mdl, normalize it and launch diagnostics.
if [ $stage -le $num_iters ]; then
  echo "Doing final combination to produce final.mdl"

  # Now do combination.
  nnets_list=()
  # the if..else..fi statement below sets 'nnets_list'.
  if [ $max_models_combine -lt $num_models_combine ]; then
    # The number of models to combine is too large, e.g. > 20.  In this case,
    # each argument to nnet-combine-fast will be an average of multiple models.
    cur_offset=0 # current offset from first_model_combine.
    for n in $(seq $max_models_combine); do
      next_offset=$[($n*$num_models_combine)/$max_models_combine]
      sub_list=""
      for o in $(seq $cur_offset $[$next_offset-1]); do
        iter=$[$first_model_combine+$o]
        mdl=$dir/$iter.mdl
        [ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1;
        sub_list="$sub_list $mdl"
      done
      nnets_list[$[$n-1]]="nnet-am-average $sub_list - |"
      cur_offset=$next_offset
    done
  else
    # Few enough models: pass each one directly.  (nnets_list is already an
    # empty array; re-assigning it as a scalar would leave a spurious empty
    # first element if the loop below did not run.)
    for n in $(seq 0 $[num_models_combine-1]); do
      iter=$[$first_model_combine+$n]
      mdl=$dir/$iter.mdl
      [ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1;
      nnets_list[$n]=$mdl
    done
  fi


  # Below, use --use-gpu=no to disable nnet-combine-fast from using a GPU, as
  # if there are many models it can give out-of-memory error; set num-threads to 8
  # to speed it up (this isn't ideal...)
  # The example count is taken from the last field of the last line that
  # nnet-copy-egs prints; make sure it really is a positive integer before
  # using it in arithmetic, so a failure is reported clearly.
  num_egs=`nnet-copy-egs ark:$cur_egs_dir/combine.egs ark:/dev/null 2>&1 | tail -n 1 | awk '{print $NF}'`
  if ! [[ "$num_egs" =~ ^[0-9]+$ ]] || [ "$num_egs" -eq 0 ]; then
    echo "$0: could not get the number of examples in $cur_egs_dir/combine.egs (got '$num_egs')"
    exit 1;
  fi
  # Minibatch size: examples split evenly over the threads, capped at 512.
  mb=$[($num_egs+$combine_num_threads-1)/$combine_num_threads]
  [ $mb -gt 512 ] && mb=512
  # Setting --initial-model to a large value makes it initialize the combination
  # with the average of all the models.  It's important not to start with a
  # single model, or, due to the invariance to scaling that these nonlinearities
  # give us, we get zero diagonal entries in the fisher matrix that
  # nnet-combine-fast uses for scaling, which after flooring and inversion, has
  # the effect that the initial model chosen gets much higher learning rates
  # than the others.  This prevents the optimization from working well.
  $cmd $combine_parallel_opts $dir/log/combine.log \
    nnet-combine-fast --initial-model=100000 --num-lbfgs-iters=40 --use-gpu=no \
      --num-threads=$combine_num_threads \
      --verbose=3 --minibatch-size=$mb "${nnets_list[@]}" ark:$cur_egs_dir/combine.egs \
      $dir/final.mdl || exit 1;

  # Normalize stddev for affine or block affine layers that are followed by a
  # pnorm layer and then a normalize layer.
  $cmd $dir/log/normalize.log \
    nnet-normalize-stddev $dir/final.mdl $dir/final.mdl || exit 1;

  # Compute the probability of the final, combined model with
  # the same subset we used for the previous compute_probs, as the
  # different subsets will lead to different probs.
  $cmd $dir/log/compute_prob_valid.final.log \
    nnet-compute-prob $dir/final.mdl ark:$cur_egs_dir/valid_diagnostic.egs &
  $cmd $dir/log/compute_prob_train.final.log \
    nnet-compute-prob $dir/final.mdl ark:$cur_egs_dir/train_diagnostic.egs &
fi

# Last stage: re-estimate the priors of final.mdl from its average posterior
# over a subset of the training examples.  At this point $x has the value the
# training loop left it with, and is only used to name the temporary
# post.$x.* files and logs.
if [ $stage -le $[$num_iters+1] ]; then
  echo "Getting average posterior for purposes of adjusting the priors."
  # Note: this just uses CPUs, using a smallish subset of data.
  rm $dir/post.$x.*.vec 2>/dev/null
  # One job per training job; each writes its summed posterior to
  # post.$x.JOB.vec.
  $cmd JOB=1:$num_jobs_nnet $dir/log/get_post.$x.JOB.log \
    nnet-subset-egs --n=$prior_subset_size ark:$cur_egs_dir/egs.JOB.0.ark ark:- \| \
    nnet-compute-from-egs "nnet-to-raw-nnet $dir/final.mdl -|" ark:- ark:- \| \
    matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1;

  sleep 3;  # make sure there is time for $dir/post.$x.*.vec to appear.

  # Sum the per-job vectors into post.$x.vec, then drop the per-job files.
  $cmd $dir/log/vector_sum.log \
   vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1;

  rm $dir/post.$x.*.vec;

  echo "Re-adjusting priors based on computed posteriors"
  # final.mdl is rewritten in place.
  $cmd $dir/log/adjust_priors.final.log \
    nnet-adjust-priors $dir/final.mdl $dir/post.$x.vec $dir/final.mdl || exit 1;
fi


# Training only counts as successful if final.mdl was produced; bail out
# before any cleanup otherwise so that nothing useful for debugging is lost.
if [ ! -f $dir/final.mdl ]; then
  echo "$0: $dir/final.mdl does not exist."
  exit 1
fi

sleep 2

echo Done

if $cleanup; then
  echo Cleaning up data
  # Only remove the examples if they live under the experiment directory.
  if [[ $cur_egs_dir =~ $dir/egs* ]]; then
    steps/nnet2/remove_egs.sh $cur_egs_dir
  fi

  echo Removing most of the models
  # Keep every 100th model and the last iteration's model; delete the rest.
  for x in $(seq 0 $num_iters); do
    if [ $x -ne $num_iters ] && [ $(($x % 100)) -ne 0 ] && [ -f $dir/$x.mdl ]; then
      rm $dir/$x.mdl
    fi
  done

fi


================================================
FILE: egs/steps/nnet2/train_pnorm_simple2.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2014  Johns Hopkins University (Author: Daniel Povey).
#                2013  Xiaohui Zhang
#                2013  Guoguo Chen
#                2014  Vimal Manohar
# Apache 2.0.


# train_pnorm_simple2.sh is as train_pnorm_simple.sh but it uses the "new" egs
# format, created by get_egs2.sh.

# train_pnorm_simple.sh is a modified version of train_pnorm_fast.sh.  Like
# train_pnorm_fast.sh, it uses the `online' preconditioning, which is faster
# (especially on GPUs).  The difference is that the learning-rate schedule is
# simpler, with the learning rate exponentially decreasing during training,
# and no phase where the learning rate is constant.
#
# Also, the final model-combination is done a bit differently: we combine models
# over typically a whole epoch, and because that would be too many iterations to
# easily be able to combine over, we arrange the iterations into groups (20
# groups by default) and average over each group.
#
# [Vimal Manohar - Oct 2014]
# The script now supports realignment during training, which can be done by
# specifying realign_epochs.

# Begin configuration section.
cmd=run.pl
num_epochs=15      # Number of epochs of training;
                   # the number of iterations is worked out from this.
initial_learning_rate=0.04
final_learning_rate=0.004
bias_stddev=0.5
pnorm_input_dim=3000
pnorm_output_dim=300
p=2
presoftmax_prior_scale_power=-0.25 # use the specified power value on the priors (inverse priors)
                                   # to scale the pre-softmax outputs
minibatch_size=128 # by default use a smallish minibatch size for neural net
                   # training; this controls instability which would otherwise
                   # be a problem with multi-threaded update.

samples_per_iter=400000 # each iteration of training, see this many samples
                        # per job.  This option is passed to get_egs.sh
num_jobs_nnet=4    # Number of neural net jobs to run in parallel.  This option
                   # is passed to get_egs.sh.
prior_subset_size=10000 # 10k samples per job, for computing priors.  Should be
                        # more than enough.
num_jobs_compute_prior=10 # these are single-threaded, run on CPU.
get_egs_stage=0
online_ivector_dir=


max_models_combine=20 # The "max_models_combine" is the maximum number of models we give
  # to the final 'combine' stage, but these models will themselves be averages of
  # iteration-number ranges.

shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
                # on each iter.  You could set it to 0 or to a large value for complete
                # randomization, but this would both consume memory and cause spikes in
                # disk I/O.  Smaller is easier on disk and memory but less random.  It's
                # not a huge deal though, as samples are anyway randomized right at the start.
                # (The point of this is to get data in different minibatches on different iterations,
                # since in the preconditioning method, 2 samples in the same minibatch can
                # affect each other's gradients.)

add_layers_period=2 # by default, add new layers every 2 iterations.
num_hidden_layers=3
stage=-4

splice_width=4 # meaning +- 4 frames on each side for second LDA
left_context= # if set, overrides splice-width
right_context= # if set, overrides splice-width.
randprune=4.0 # speeds up LDA.
alpha=4.0 # relates to preconditioning.
update_period=4 # relates to online preconditioning: says how often we update the subspace.
num_samples_history=2000 # relates to online preconditioning
max_change_per_sample=0.075
precondition_rank_in=20  # relates to online preconditioning
precondition_rank_out=80 # relates to online preconditioning

mix_up=0 # Number of components to mix up to (should be > #tree leaves, if
        # specified.)
num_threads=16
parallel_opts="--num-threads 16 --mem 1G"
  # by default we use 16 threads; this lets the queue know.
  # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.
combine_num_threads=8
combine_parallel_opts="--num-threads 8"  # queue options for the "combine" stage.
cleanup=true
egs_dir=
lda_opts=
lda_dim=
egs_opts=
io_opts="--max-jobs-run 5" # for jobs with a lot of I/O, limits the number running at one time.
transform_dir=     # If supplied, overrides alidir
cmvn_opts=  # will be passed to get_lda.sh and get_egs.sh, if supplied.
            # only relevant for "raw" features, not lda.
feat_type=  # Can be used to force "raw" features.
align_cmd=              # The cmd that is passed to steps/nnet2/align.sh
align_use_gpu=          # Passed to use_gpu in steps/nnet2/align.sh [yes/no]
realign_epochs=         # List of epochs, the beginning of which realignment is done
num_jobs_align=30       # Number of jobs for realignment
# End configuration section.


echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# Exactly four positional arguments are required; otherwise print the usage
# message and exit.  The defaults shown after '|' are kept in sync with the
# values in the configuration section at the top of this script.
if [ $# != 4 ]; then
  echo "Usage: $0 [opts] <data> <lang> <ali-dir> <exp-dir>"
  echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --num-epochs <#epochs|15>                        # Number of epochs of training"
  echo "  --initial-learning-rate <initial-learning-rate|0.04> # Learning rate at start of training, e.g. 0.02 for small"
  echo "                                                       # data, 0.01 for large data"
  echo "  --final-learning-rate  <final-learning-rate|0.004>   # Learning rate at end of training, e.g. 0.004 for small"
  echo "                                                   # data, 0.001 for large data"
  echo "  --num-hidden-layers <#hidden-layers|3>           # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs"
  echo "  --add-layers-period <#iters|2>                   # Number of iterations between adding hidden layers"
  echo "  --mix-up <#pseudo-gaussians|0>                   # This option now does nothing; please remove it."
  echo "  --presoftmax-prior-scale-power <power|-0.25>     # use the specified power value on the priors (inverse priors) "
  echo "                                                   # to scale the pre-softmax outputs."
  echo "                                                   # (set to 0.0 to disable the presoftmax element scale)"
  echo "  --num-jobs-nnet <num-jobs|4>                     # Number of parallel jobs to use for main neural net"
  echo "                                                   # training (will affect results as well as speed; try 8, 16)"
  echo "                                                   # Note: if you increase this, you may want to also increase"
  echo "                                                   # the learning rate."
  echo "  --num-threads <num-threads|16>                   # Number of parallel threads per job (will affect results"
  echo "                                                   # as well as speed; may interact with batch size; if you increase"
  echo "                                                   # this, you may want to decrease the batch size."
  echo "  --parallel-opts <opts|\"--num-threads 16 --mem 1G\">      # extra options to pass to e.g. queue.pl for processes that"
  echo "                                                   # use multiple threads... "
  echo "  --io-opts <opts|\"--max-jobs-run 5\">                       # Options given to e.g. queue.pl for jobs that do a lot of I/O."
  echo "  --minibatch-size <minibatch-size|128>            # Size of minibatch to process (note: product with --num-threads"
  echo "                                                   # should not get too large, e.g. >2k)."
  echo "  --samples-per-iter <#samples|400000>             # Number of samples of data to process per iteration, per"
  echo "                                                   # process."
  echo "  --splice-width <width|4>                         # Number of frames on each side to append for feature input"
  echo "                                                   # (note: we splice processed, typically 40-dimensional frames"
  echo "  --lda-dim <dim|250>                              # Dimension to reduce spliced features to with LDA"
  echo "  --realign-epochs <list-of-epochs|\"\">           # A list of space-separated epoch indices the beginning of which"
  echo "                                                   # realignment is to be done"
  echo "  --align-cmd (utils/run.pl|utils/queue.pl <queue opts>) # passed to align.sh"
  echo "  --align-use-gpu (yes/no)                         # specify is gpu is to be used for realignment"
  echo "  --num-jobs-align <#njobs|30>                     # Number of jobs to perform realignment"
  echo "  --stage <stage|-4>                               # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."


  exit 1;
fi

# Positional arguments.
data=$1
lang=$2
alidir=$3
dir=$4

# Realignment needs to know how to run the aligner and whether to use a GPU.
if [ -n "$realign_epochs" ]; then
  if [ -z "$align_cmd" ]; then
    echo "$0: realign_epochs specified but align_cmd not specified" && exit 1
  fi
  if [ -z "$align_use_gpu" ]; then
    echo "$0: realign_epochs specified but align_use_gpu not specified" && exit 1
  fi
fi

# All of these inputs must exist before anything is started.
for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree; do
  if [ ! -f $f ]; then
    echo "$0: no such file $f" && exit 1
  fi
done


# Number of pdfs (tree leaves), which is the output dimension of the network.
num_leaves=$(tree-info $alidir/tree 2>/dev/null | grep num-pdfs | awk '{print $2}') || exit 1
if [ -z $num_leaves ]; then
  echo "\$num_leaves is unset" && exit 1
fi
if [ "$num_leaves" -eq "0" ]; then
  echo "\$num_leaves is 0" && exit 1
fi

# Number of jobs used in the alignment directory; the data is split the same
# way.
nj=$(cat $alidir/num_jobs) || exit 1
sdata=$data/split$nj
utils/split_data.sh $data $nj

# Set up the experiment directory with the files later stages expect.
mkdir -p $dir/log
echo $nj > $dir/num_jobs
cp $alidir/tree $dir

utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1
cp $lang/phones.txt $dir || exit 1

# Fill in defaults that depend on other options.
transform_dir=${transform_dir:-$alidir}
left_context=${left_context:-$splice_width}
right_context=${right_context:-$splice_width}

# Options shared by get_lda.sh and get_egs2.sh; optional ones are only
# forwarded when they were actually supplied.
extra_opts=()
if [ -n "$cmvn_opts" ]; then
  extra_opts+=(--cmvn-opts "$cmvn_opts")
fi
if [ -n "$feat_type" ]; then
  extra_opts+=(--feat-type $feat_type)
fi
if [ -n "$online_ivector_dir" ]; then
  extra_opts+=(--online-ivector-dir $online_ivector_dir)
fi
extra_opts+=(--transform-dir $transform_dir)
extra_opts+=(--left-context $left_context --right-context $right_context)

# Stage -4: estimate the LDA transform.
if [ $stage -le -4 ]; then
  echo "$0: calling get_lda.sh"
  steps/nnet2/get_lda.sh $lda_opts "${extra_opts[@]}" --cmd "$cmd" $data $lang $alidir $dir || exit 1;
fi

# these files will have been written by get_lda.sh
feat_dim=$(cat $dir/feat_dim) || exit 1;
ivector_dim=$(cat $dir/ivector_dim) || exit 1;
lda_dim=$(cat $dir/lda_dim) || exit 1;

# Stage -3: dump training examples, unless an existing egs dir was supplied.
# $egs_opts is passed exactly once (it used to be repeated after --cmd, which
# only made the called script parse the same options a second time).
if [ $stage -le -3 ] && [ -z "$egs_dir" ]; then
  echo "$0: calling get_egs2.sh"
  steps/nnet2/get_egs2.sh $egs_opts "${extra_opts[@]}"  --io-opts "$io_opts" \
    --samples-per-iter $samples_per_iter --stage $get_egs_stage \
    --cmd "$cmd" $data $alidir $dir/egs || exit 1;
fi

# Default location of the examples when --egs-dir was not given.
if [ -z "$egs_dir" ]; then
  egs_dir=$dir/egs
fi

# Read the layout of the examples directory; each error message names the
# file that could not be read.
frames_per_eg=$(cat $egs_dir/info/frames_per_eg) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; }
num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/num_archives"; exit 1; }

# num_archives_expanded considers each separate label-position from
# 0..frames_per_eg-1 to be a separate archive.
num_archives_expanded=$[$num_archives*$frames_per_eg]

# There cannot be more parallel jobs than (expanded) archives to train on.
if [ $num_jobs_nnet -gt $num_archives_expanded ]; then
  echo "$0: --num-jobs-nnet cannot exceed num-archives*frames-per-eg which is $num_archives_expanded"
  echo "$0: setting --num-jobs-nnet to $num_archives_expanded"
  num_jobs_nnet=$num_archives_expanded
fi

# At least one hidden layer is required (this also rejects non-numeric values).
if ! [ $num_hidden_layers -ge 1 ]; then
  echo "Invalid num-hidden-layers $num_hidden_layers"
  exit 1
fi

# Stage -2: write the network configuration files and create the initial
# model $dir/0.mdl, which has a single hidden (p-norm) layer.
if [ $stage -le -2 ]; then
  echo "$0: initializing neural net";
  lda_mat=$dir/lda.mat
  # Input to the splicing layer: the features plus the iVector part (which is
  # declared as the const-component-dim of the SpliceComponent below).
  tot_input_dim=$[$feat_dim+$ivector_dim]

  # Options common to every AffineComponentPreconditionedOnline layer.
  online_preconditioning_opts="alpha=$alpha num-samples-history=$num_samples_history update-period=$update_period rank-in=$precondition_rank_in rank-out=$precondition_rank_out max-change-per-sample=$max_change_per_sample"

  # Parameter stddev for hidden affine layers: 1/sqrt(pnorm_input_dim).
  stddev=`perl -e "print 1.0/sqrt($pnorm_input_dim);"`
  # Full initial network: splice -> fixed LDA -> one hidden layer -> output
  # affine layer (initialised to zero) -> softmax.
  cat >$dir/nnet.config <<EOF
SpliceComponent input-dim=$tot_input_dim left-context=$left_context right-context=$right_context const-component-dim=$ivector_dim
FixedAffineComponent matrix=$lda_mat
AffineComponentPreconditionedOnline input-dim=$lda_dim output-dim=$pnorm_input_dim $online_preconditioning_opts learning-rate=$initial_learning_rate param-stddev=$stddev bias-stddev=$bias_stddev
PnormComponent input-dim=$pnorm_input_dim output-dim=$pnorm_output_dim p=$p
NormalizeComponent dim=$pnorm_output_dim
AffineComponentPreconditionedOnline input-dim=$pnorm_output_dim output-dim=$num_leaves $online_preconditioning_opts learning-rate=$initial_learning_rate param-stddev=0 bias-stddev=0
SoftmaxComponent dim=$num_leaves
EOF

  # to hidden.config it will write the part of the config corresponding to a
  # single hidden layer; we need this to add new layers.
  cat >$dir/hidden.config <<EOF
AffineComponentPreconditionedOnline input-dim=$pnorm_output_dim output-dim=$pnorm_input_dim $online_preconditioning_opts learning-rate=$initial_learning_rate param-stddev=$stddev bias-stddev=$bias_stddev
PnormComponent input-dim=$pnorm_input_dim output-dim=$pnorm_output_dim p=$p
NormalizeComponent dim=$pnorm_output_dim
EOF
  # Build the acoustic model from the tree, the topology and nnet.config.
  $cmd $dir/log/nnet_init.log \
    nnet-am-init $alidir/tree $lang/topo "nnet-init $dir/nnet.config -|" \
    $dir/0.mdl || exit 1;
fi

# Stage -1: train the transition probabilities of the initial model and,
# unless disabled with --presoftmax-prior-scale-power 0.0, insert a
# FixedScaleComponent (priors^power, rescaled to average 1) before the softmax.
if [ $stage -le -1 ]; then
  echo "Training transition probabilities and setting priors"
  $cmd $dir/log/train_trans.log \
    nnet-train-transitions $dir/0.mdl "ark:gunzip -c $alidir/ali.*.gz|" $dir/0.mdl \
    || exit 1;

  if [ "$presoftmax_prior_scale_power" != "0.0" ]; then
    echo "prepare vector assignment for FixedScaleComponent before softmax"
    echo "(use priors^$presoftmax_prior_scale_power and rescale to average 1)"

    # obtains raw pdf count
    $cmd JOB=1:$nj $dir/log/acc_pdf.JOB.log \
      ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
      post-to-tacc --per-pdf=true --binary=false $alidir/final.mdl ark:- $dir/JOB.pacc || exit 1;
    cat $dir/*.pacc > $dir/pacc
    rm $dir/*.pacc
    # Sum the per-job counts per pdf, smooth them with 1% of the average
    # count, raise the resulting priors to the requested power and rescale so
    # that the scales average to 1.
    awk -v power=$presoftmax_prior_scale_power \
      '{ for(i=2; i<=NF-1; i++) {sum[i]+=$i} }
      END {
        for (i=2; i<=NF-1; i++) {total+=sum[i]}
        ave_pdf=int(total/(NF-2)); total+=0.01*ave_pdf*(NF-2)
        for (i=2; i<=NF-1; i++) {rescale+=((sum[i]+0.01*ave_pdf)/total)^power}
        rescale/=(NF-2)
        printf " [ "; for (i=2; i<=NF-1; i++) {printf("%f ", ((sum[i]+0.01*ave_pdf)/total)^power/rescale)}; print "]"
      }' $dir/pacc > $dir/presoftmax_prior_scale_vecfile

    echo "FixedScaleComponent scales=$dir/presoftmax_prior_scale_vecfile" > $dir/per_element.config
    echo "insert an additional layer of FixedScaleComponent before softmax"
    # Position of the Softmax component, as reported by nnet-am-info.
    inp=`nnet-am-info $dir/0.mdl | grep 'Softmax' | awk '{print $2}'`
    [ -z "$inp" ] && echo "$0: could not find the Softmax component in $dir/0.mdl" && exit 1
    # Stop here if the insertion fails: the layer-adding code of the training
    # loop assumes that this component is present whenever the power is not 0.0.
    nnet-init $dir/per_element.config - | \
      nnet-insert --insert-at=$inp --randomize-next-component=false $dir/0.mdl - $dir/0.mdl || exit 1;
  fi
fi

# Choose num_iters so that, as closely as possible, the data is processed
# $num_epochs times, i.e. num_iters*num_jobs_nnet == num_epochs*num_archives_expanded.
num_iters=$(( ($num_epochs * $num_archives_expanded) / $num_jobs_nnet ))

echo "$0: Will train for $num_epochs epochs = $num_iters iterations"
echo "$0: Will not do mix up"

# Iteration by which all hidden layers have been added.
finish_add_layers_iter=$(($num_hidden_layers * $add_layers_period))
# Halfway point between the last layer addition and the end of training.
mix_up_iter=$(( ($num_iters + $finish_add_layers_iter) / 2 ))

# Choose the training binary: "-simple" (single thread, can use the GPU code)
# or "-parallel" (multi-threaded).
if [ $num_threads -eq 1 ]; then
  parallel_suffix="-simple"
  parallel_train_opts=
  if ! cuda-compiled; then
    echo "$0: WARNING: you are running with one thread but you have not compiled"
    echo "   for CUDA.  You may be running a setup optimized for GPUs.  If you have"
    echo "   GPUs and have nvcc installed, go to src/ and do ./configure; make"
  fi
else
  parallel_suffix="-parallel"
  parallel_train_opts="--num-threads=$num_threads"
fi


approx_iters_per_epoch=$(($num_iters / $num_epochs))
# Number of models handed to the final nnet-combine-fast invocation:
#   min(max(max_models_combine, iters_per_epoch), 2/3 * iters_after_mixup)
num_models_combine=$max_models_combine
[ $num_models_combine -lt $approx_iters_per_epoch ] && num_models_combine=$approx_iters_per_epoch
iters_after_mixup_23=$(( (($num_iters - $mix_up_iter - 1) * 2) / 3 ))
[ $num_models_combine -gt $iters_after_mixup_23 ] && num_models_combine=$iters_after_mixup_23
first_model_combine=$(($num_iters - $num_models_combine + 1))

# Map each realignment epoch to the iteration at which it takes place, using
# the same relation as for num_iters above.  Perl does the arithmetic because
# realign_epochs may contain floating-point values.
for realign_epoch in $realign_epochs; do
  realign_iter=$(perl -e 'print int(($ARGV[0]*$ARGV[1])/$ARGV[2]);' $realign_epoch $num_archives_expanded $num_jobs_nnet)
  realign_this_iter[$realign_iter]=$realign_epoch
done

# Training starts at iteration 0 using the examples directory set up so far.
cur_egs_dir=$egs_dir
x=0

while [ $x -lt $num_iters ]; do

  if [ ! -z "${realign_this_iter[$x]}" ]; then
    prev_egs_dir=$cur_egs_dir
    cur_egs_dir=$dir/egs_${realign_this_iter[$x]}
  fi

  if [ $x -ge 0 ] && [ $stage -le $x ]; then
    if [ ! -z "${realign_this_iter[$x]}" ]; then
      epoch=${realign_this_iter[$x]}


      echo "Getting average posterior for purposes of adjusting the priors."
      # Note: this just uses CPUs, using a smallish subset of data.
      # always use the first egs archive, which makes the script simpler;
      # we're using different random subsets of it.
      rm $dir/post.$x.*.vec 2>/dev/null
      $cmd JOB=1:$num_jobs_compute_prior $dir/log/get_post.$x.JOB.log \
        nnet-copy-egs --srand=JOB --frame=random ark:$prev_egs_dir/egs.1.ark ark:- \| \
        nnet-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \
        nnet-compute-from-egs "nnet-to-raw-nnet $dir/$x.mdl -|" ark:- ark:- \| \
        matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1;

      sleep 3;  # make sure there is time for $dir/post.$x.*.vec to appear.

      $cmd $dir/log/vector_sum.$x.log \
        vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1;
      rm $dir/post.$x.*.vec;

      echo "Re-adjusting priors based on computed posteriors"
      $cmd $dir/log/adjust_priors.$x.log \
        nnet-adjust-priors $dir/$x.mdl $dir/post.$x.vec $dir/$x.mdl || exit 1;

      sleep 2

      steps/nnet2/align.sh --nj $num_jobs_align --cmd "$align_cmd" --use-gpu $align_use_gpu \
        --transform-dir "$transform_dir" --online-ivector-dir "$online_ivector_dir" \
        --iter $x $data $lang $dir $dir/ali_$epoch || exit 1

      steps/nnet2/relabel_egs2.sh --cmd "$cmd" --iter $x $dir/ali_$epoch \
        $prev_egs_dir $cur_egs_dir || exit 1

      if $cleanup && [[ $prev_egs_dir =~ $dir/egs* ]]; then
        steps/nnet2/remove_egs.sh $prev_egs_dir
      fi
    fi

    # Set off jobs doing some diagnostics, in the background.
    # Use the egs dir from the previous iteration for the diagnostics
    $cmd $dir/log/compute_prob_valid.$x.log \
      nnet-compute-prob $dir/$x.mdl ark:$cur_egs_dir/valid_diagnostic.egs &
    $cmd $dir/log/compute_prob_train.$x.log \
      nnet-compute-prob $dir/$x.mdl ark:$cur_egs_dir/train_diagnostic.egs &
    if [ $x -gt 0 ] && [ ! -f $dir/log/mix_up.$[$x-1].log ]; then
      $cmd $dir/log/progress.$x.log \
        nnet-show-progress --use-gpu=no $dir/$[$x-1].mdl $dir/$x.mdl \
        ark:$cur_egs_dir/train_diagnostic.egs '&&' \
        nnet-am-info $dir/$x.mdl &
    fi

    echo "Training neural net (pass $x)"

    if [ $x -gt 0 ] && \
      [ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \
      [ $[($x-1) % $add_layers_period] -eq 0 ]; then

      inp=`nnet-am-info $dir/$x.mdl | grep 'Softmax' | awk '{print $2}'`
      if [ "$presoftmax_prior_scale_power" != "0.0" ]; then
        inp=$[$inp-2]
      else
        inp=$[$inp-1]
      fi
      mdl="nnet-init --srand=$x $dir/hidden.config - | nnet-insert --insert-at=$inp $dir/$x.mdl - - |"
    else
      mdl=$dir/$x.mdl
    fi
    if [ $x -eq 0 ] || [ "$mdl" != "$dir/$x.mdl" ]; then
      # on iteration zero or when we just added a layer, use a smaller minibatch
      # size and just one job: the model-averaging doesn't seem to be helpful
      # when the model is changing too fast (i.e. it worsens the objective
      # function), and the smaller minibatch size will help to keep
      # the update stable.
      this_minibatch_size=$[$minibatch_size/2];
      do_average=false
    else
      this_minibatch_size=$minibatch_size
      do_average=true
    fi

    rm $dir/.error 2>/dev/null


    ( # this sub-shell is so that when we "wait" below,
      # we only wait for the training jobs that we just spawned,
      # not the diagnostic jobs that we spawned above.

      # We can't easily use a single parallel SGE job to do the main training,
      # because the computation of which archive and which --frame option
      # to use for each job is a little complex, so we spawn each one separately.
      for n in $(seq $num_jobs_nnet); do
        k=$[$x*$num_jobs_nnet + $n - 1]; # k is a zero-based index that we'll derive
                                         # the other indexes from.
        archive=$[($k%$num_archives)+1]; # work out the 1-based archive index.
        frame=$[(($k/$num_archives)%$frames_per_eg)]; # work out the 0-based frame
        # index; this increases more slowly than the archive index because the
        # same archive with different frame indexes will give similar gradients,
        # so we want to separate them in time.

        $cmd $parallel_opts $dir/log/train.$x.$n.log \
          nnet-train$parallel_suffix $parallel_train_opts \
          --minibatch-size=$this_minibatch_size --srand=$x "$mdl" \
          "ark,bg:nnet-copy-egs --frame=$frame ark:$cur_egs_dir/egs.$archive.ark ark:-|nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-|" \
          $dir/$[$x+1].$n.mdl || touch $dir/.error &
      done
      wait
    )
    # the error message below is not that informative, but $cmd will
    # have printed a more specific one.
    [ -f $dir/.error ] && echo "$0: error on iteration $x of training" && exit 1;

    nnets_list=
    for n in `seq 1 $num_jobs_nnet`; do
      nnets_list="$nnets_list $dir/$[$x+1].$n.mdl"
    done

    learning_rate=`perl -e '($x,$n,$i,$f)=@ARGV; print ($x >= $n ? $f : $i*exp($x*log($f/$i)/$n));' $[$x+1] $num_iters $initial_learning_rate $final_learning_rate`;

    if $do_average; then
      # average the output of the different jobs.
      $cmd $dir/log/average.$x.log \
        nnet-am-average $nnets_list - \| \
        nnet-am-copy --learning-rate=$learning_rate - $dir/$[$x+1].mdl || exit 1;
    else
      # choose the best from the different jobs.
      n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) {
          $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn";
          undef $logprob; while (<F>) { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } }
          close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob;
          $best_n=$n; } } print "$best_n\n"; ' $num_jobs_nnet $dir/log/train.$x.%d.log) || exit 1;
      [ -z "$n" ] && echo "Error getting best model" && exit 1;
      $cmd $dir/log/select.$x.log \
        nnet-am-copy --learning-rate=$learning_rate $dir/$[$x+1].$n.mdl $dir/$[$x+1].mdl || exit 1;
    fi

    if [ "$mix_up" -gt 0 ] && [ $x -eq $mix_up_iter ]; then
      echo "Warning: the mix up opertion is disabled!"
      echo "    Ignore mix up leaves number specified"
    fi
    rm $nnets_list
    [ ! -f $dir/$[$x+1].mdl ] && exit 1;
    if [ -f $dir/$[$x-1].mdl ] && $cleanup && \
       [ $[($x-1)%100] -ne 0  ] && [ $[$x-1] -lt $first_model_combine ]; then
      rm $dir/$[$x-1].mdl
    fi
  fi
  x=$[$x+1]
done


# Final combination stage: build $dir/final.mdl from the models starting at
# iteration $first_model_combine.  Skipped when --stage is larger than
# $num_iters.
if [ $stage -le $num_iters ]; then
  echo "Doing final combination to produce final.mdl"

  # Now do combination.
  nnets_list=()
  # the if..else..fi statement below sets 'nnets_list'; each array element
  # becomes one input argument of nnet-combine-fast.
  if [ $max_models_combine -lt $num_models_combine ]; then
    # The number of models to combine is too large, e.g. > 20.  In this case,
    # each argument to nnet-combine-fast will be an average of multiple models:
    # the $num_models_combine models are cut into $max_models_combine
    # consecutive groups and each group is read through an nnet-am-average pipe.
    cur_offset=0 # current offset from first_model_combine.
    for n in $(seq $max_models_combine); do
      # offset one past the last model of group n (integer division).
      next_offset=$[($n*$num_models_combine)/$max_models_combine]
      sub_list=""
      for o in $(seq $cur_offset $[$next_offset-1]); do
        iter=$[$first_model_combine+$o]
        mdl=$dir/$iter.mdl
        [ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1;
        sub_list="$sub_list $mdl"
      done
      # array index is zero-based while n is one-based.
      nnets_list[$[$n-1]]="nnet-am-average $sub_list - |"
      cur_offset=$next_offset
    done
  else
    # Few enough models: pass each model file directly.
    # Note: a plain assignment to an array name only sets element 0 (to the
    # empty string); the loop below starts at n=0 and overwrites it.
    nnets_list=
    for n in $(seq 0 $[num_models_combine-1]); do
      iter=$[$first_model_combine+$n]
      mdl=$dir/$iter.mdl
      [ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1;
      nnets_list[$n]=$mdl
    done
  fi


  # Below, use --use-gpu=no to disable nnet-combine-fast from using a GPU, as
  # if there are many models it can give out-of-memory error; run it with
  # $combine_num_threads threads to speed it up (this isn't ideal...)
  # num_egs is taken from the last field of the last line that nnet-copy-egs
  # prints; the minibatch size is num_egs split evenly over the threads
  # (rounded up) and capped at 512.
  num_egs=`nnet-copy-egs ark:$cur_egs_dir/combine.egs ark:/dev/null 2>&1 | tail -n 1 | awk '{print $NF}'`
  mb=$[($num_egs+$combine_num_threads-1)/$combine_num_threads]
  [ $mb -gt 512 ] && mb=512
  # Setting --initial-model to a large value makes it initialize the combination
  # with the average of all the models.  It's important not to start with a
  # single model, or, due to the invariance to scaling that these nonlinearities
  # give us, we get zero diagonal entries in the fisher matrix that
  # nnet-combine-fast uses for scaling, which after flooring and inversion, has
  # the effect that the initial model chosen gets much higher learning rates
  # than the others.  This prevents the optimization from working well.
  $cmd $combine_parallel_opts $dir/log/combine.log \
    nnet-combine-fast --initial-model=100000 --num-lbfgs-iters=40 --use-gpu=no \
      --num-threads=$combine_num_threads \
      --verbose=3 --minibatch-size=$mb "${nnets_list[@]}" ark:$cur_egs_dir/combine.egs \
      $dir/final.mdl || exit 1;

  # Normalize stddev for affine or block affine layers that are followed by a
  # pnorm layer and then a normalize layer.  final.mdl is rewritten in place.
  $cmd $dir/log/normalize.log \
    nnet-normalize-stddev $dir/final.mdl $dir/final.mdl || exit 1;

  # Compute the probability of the final, combined model with
  # the same subset we used for the previous compute_probs, as the
  # different subsets will lead to different probs.
  # Both jobs are started in the background and are not waited for here.
  $cmd $dir/log/compute_prob_valid.final.log \
    nnet-compute-prob $dir/final.mdl ark:$cur_egs_dir/valid_diagnostic.egs &
  $cmd $dir/log/compute_prob_train.final.log \
    nnet-compute-prob $dir/final.mdl ark:$cur_egs_dir/train_diagnostic.egs &
fi

# Prior re-adjustment stage: estimate the average posterior of final.mdl on a
# subset of the training examples and use it to reset the model's priors.
# Runs for any --stage up to $num_iters+1.
if [ $stage -le $[$num_iters+1] ]; then
  echo "Getting average posterior for purposes of adjusting the priors."
  # Note: this just uses CPUs, using a smallish subset of data.
  # $x is whatever the training loop above left in it -- presumably
  # $num_iters; TODO confirm against the loop condition.
  # Remove stale per-job vectors so the wildcard below only sees fresh ones.
  rm $dir/post.$x.*.vec 2>/dev/null
  # Each job takes a different random subset (seeded by JOB) of archive 1,
  # runs the raw network on it, and sums the output rows into one vector.
  $cmd JOB=1:$num_jobs_compute_prior $dir/log/get_post.$x.JOB.log \
    nnet-copy-egs --frame=random --srand=JOB ark:$cur_egs_dir/egs.1.ark ark:- \| \
    nnet-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \
    nnet-compute-from-egs "nnet-to-raw-nnet $dir/final.mdl -|" ark:- ark:- \| \
    matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1;

  sleep 3;  # make sure there is time for $dir/post.$x.*.vec to appear.

  # Add up the per-job vectors into a single posterior sum.
  $cmd $dir/log/vector_sum.$x.log \
   vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1;

  rm $dir/post.$x.*.vec;

  echo "Re-adjusting priors based on computed posteriors"
  # final.mdl is both input and output, i.e. it is rewritten in place.
  $cmd $dir/log/adjust_priors.final.log \
    nnet-adjust-priors $dir/final.mdl $dir/post.$x.vec $dir/final.mdl || exit 1;
fi


# Refuse to go on (and in particular to clean up) unless training actually
# produced a final model.
if [ ! -f $dir/final.mdl ]; then
  echo "$0: $dir/final.mdl does not exist."
  # we don't want to clean up if the training didn't succeed.
  exit 1;
fi

sleep 2

echo Done

# Optional cleanup: drop the training examples and most intermediate models.
if $cleanup; then
  echo Cleaning up data
  # NOTE(review): '=~' is a regex match, so "$dir/egs*" means "$dir/eg"
  # followed by zero or more 's', unanchored, with regex metacharacters in
  # $dir active.  This looks like it was meant as a "starts with $dir/egs"
  # check -- confirm before relying on it to protect shared egs directories.
  if [[ $cur_egs_dir =~ $dir/egs* ]]; then
    steps/nnet2/remove_egs.sh $cur_egs_dir
  fi

  echo Removing most of the models
  for x in `seq 0 $num_iters`; do
    # Keep every 100th model and the model of iteration $num_iters; only
    # touch files that actually exist.
    if [ $[$x%100] -ne 0 ] && [ $x -ne $num_iters ] && [ -f $dir/$x.mdl ]; then
       # delete all but every 100th model and the last one.
      rm $dir/$x.mdl
    fi
  done
fi


================================================
FILE: egs/steps/nnet2/train_tanh.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.

# This script trains a fairly vanilla network with tanh nonlinearities.

# Begin configuration section.
cmd=run.pl         # Job launcher that prefixes every command run by this
                   # script (see --cmd in the usage message).
num_epochs=15      # Number of epochs during which we reduce
                   # the learning rate; the number of iterations is worked out
                   # from this.
num_epochs_extra=5 # Number of epochs after we stop reducing
                   # the learning rate.
num_iters_final=20 # Maximum number of final iterations (i.e. of the last
                   # models) to give to the optimization that produces
                   # final.mdl.
initial_learning_rate=0.04 # learning rate at the first iteration.
final_learning_rate=0.004  # learning rate reached after $num_epochs epochs.
bias_stddev=0.5    # bias-stddev written into the hidden-layer config lines.
shrink_interval=5 # shrink on iterations whose index is a multiple of
                  # $shrink_interval; all other iterations run the cheaper
                  # nnet-am-fix instead.
shrink=true       # if false, never shrink (nnet-am-fix runs on every iter).
num_frames_shrink=2000 # note: must be <= --num-frames-diagnostic option to get_egs.sh, if
                       # given.
final_learning_rate_factor=0.5 # Train the two last layers of parameters half as
                               # fast as the other layers.

hidden_layer_dim=300 #  You may want this larger, e.g. 1024 or 2048.

minibatch_size=128 # by default use a smallish minibatch size for neural net
                   # training; this controls instability which would otherwise
                   # be a problem with multi-threaded update.  Note: it also
                   # interacts with the "preconditioned" update which generally
                   # works better with larger minibatch size, so it's not
                   # completely cost free.

samples_per_iter=200000 # each iteration of training, see this many samples
                        # per job.  This option is passed to get_egs.sh
num_jobs_nnet=16   # Number of neural net jobs to run in parallel.  This option
                   # is passed to get_egs.sh.
get_egs_stage=0    # passed to get_egs.sh as its --stage.

shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
                # on each iter.  You could set it to 0 or to a large value for complete
                # randomization, but this would both consume memory and cause spikes in
                # disk I/O.  Smaller is easier on disk and memory but less random.  It's
                # not a huge deal though, as samples are anyway randomized right at the start.

add_layers_period=2 # by default, add new layers every 2 iterations.
num_hidden_layers=3 # final number of hidden layers; must be >= 1.
modify_learning_rates=false # if true, run nnet-modify-learning-rates after
                            # each iteration once all layers have been added.
last_layer_factor=0.1 # relates to modify_learning_rates.
first_layer_factor=1.0 # relates to modify_learning_rates.
stage=-5 # lets a partially completed run be resumed; steps whose stage is
         # below this value are skipped.

io_opts="--max-jobs-run 5" # for jobs with a lot of I/O, limits the number running at one time.
                           # Passed to get_egs.sh as --io-opts.
splice_width=4 # meaning +- 4 frames on each side for second LDA; also the
               # left/right context of the network's SpliceComponent.
randprune=4.0 # speeds up LDA.
              # NOTE(review): not referenced anywhere else in this script.
alpha=4.0       # alpha= value of the AffineComponentPreconditioned layers.
max_change=10.0 # max-change= value of the AffineComponentPreconditioned layers.
mix_up=0 # Number of components to mix up to (should be > #tree leaves, if
         # specified.)
num_threads=16 # threads per training job; 1 selects nnet-train-simple.
parallel_opts="--num-threads 16 --mem 1G" # by default we use 16 threads; this lets the queue know.
  # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.
cleanup=true # if true, remove $dir/egs and most intermediate models at the end.
egs_dir=     # if set, use the examples in this directory instead of calling
             # get_egs.sh.
lda_opts=    # extra options for get_lda.sh.
egs_opts=    # extra options for get_egs.sh.
transform_dir= # directory with feature transforms; defaults to the alignment
               # directory when left empty.
cmvn_opts=  # will be passed to get_lda.sh and get_egs.sh, if supplied.
            # only relevant for "raw" features, not lda.
feat_type=  # can be used to force "raw" feature type.
prior_subset_size=10000 # 10k samples per job, for computing priors.  Should be
                        # more than enough.
# End configuration section.


echo "$0 $@"  # Print the command line for logging

# Pick up the recipe environment if there is one, then let parse_options.sh
# process the --option arguments.
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;


# Exactly four positional arguments are required; otherwise print the usage
# message and stop.  The defaults quoted in the message are kept in sync with
# the configuration section at the top of this script.
# NOTE(review): --initial-num-hidden-layers and --lda-dim are listed below but
# this script defines no initial_num_hidden_layers variable and reads lda_dim
# from $dir/lda_dim -- confirm whether these two entries should be dropped.
if [ $# != 4 ]; then
  echo "Usage: $0 [opts] <data> <lang> <ali-dir> <exp-dir>"
  echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --num-epochs <#epochs|15>                        # Number of epochs of main training"
  echo "                                                   # while reducing learning rate (determines #iterations, together"
  echo "                                                   # with --samples-per-iter and --num-jobs-nnet)"
  echo "  --num-epochs-extra <#epochs-extra|5>             # Number of extra epochs of training"
  echo "                                                   # after learning rate fully reduced"
  echo "  --initial-learning-rate <initial-learning-rate|0.04> # Learning rate at start of training, e.g. 0.02 for small"
  echo "                                                       # data, 0.01 for large data"
  echo "  --final-learning-rate  <final-learning-rate|0.004>   # Learning rate at end of training, e.g. 0.004 for small"
  echo "                                                   # data, 0.001 for large data"
  echo "  --num-hidden-layers <#hidden-layers|3>           # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs"
  echo "  --initial-num-hidden-layers <#hidden-layers|1>   # Number of hidden layers to start with."
  echo "  --add-layers-period <#iters|2>                   # Number of iterations between adding hidden layers"
  echo "  --mix-up <#pseudo-gaussians|0>                   # Can be used to have multiple targets in final output layer,"
  echo "                                                   # per context-dependent state.  Try a number several times #states."
  echo "  --num-jobs-nnet <num-jobs|16>                    # Number of parallel jobs to use for main neural net"
  echo "                                                   # training (will affect results as well as speed; try 8, 16)"
  echo "                                                   # Note: if you increase this, you may want to also increase"
  echo "                                                   # the learning rate."
  echo "  --num-threads <num-threads|16>                   # Number of parallel threads per job (will affect results"
  echo "                                                   # as well as speed; may interact with batch size; if you increase"
  echo "                                                   # this, you may want to decrease the batch size."
  echo "  --parallel-opts <opts|\"--num-threads 16 --mem 1G\">      # extra options to pass to e.g. queue.pl for processes that"
  echo "                                                   # use multiple threads... "
  echo "  --io-opts <opts|\"--max-jobs-run 5\">                       # Options given to e.g. queue.pl for jobs that do a lot of I/O."
  echo "  --minibatch-size <minibatch-size|128>            # Size of minibatch to process (note: product with --num-threads"
  echo "                                                   # should not get too large, e.g. >2k)."
  echo "  --samples-per-iter <#samples|200000>             # Number of samples of data to process per iteration, per"
  echo "                                                   # process."
  echo "  --splice-width <width|4>                         # Number of frames on each side to append for feature input"
  echo "                                                   # (note: we splice processed, typically 40-dimensional frames)"
  echo "  --lda-dim <dim|250>                              # Dimension to reduce spliced features to with LDA"
  echo "  --num-iters-final <#iters|20>                    # Number of final iterations to give to nnet-combine-fast to "
  echo "                                                   # interpolate parameters (the weights are learned with a validation set)"
  echo "  --stage <stage|-5>                               # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."

  exit 1;
fi

# Positional arguments: data dir, lang dir, alignment dir, output dir.
data=$1
lang=$2
alidir=$3
dir=$4

# All of these inputs must exist before any work is started.
for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree; do
  if [ ! -f $f ]; then
    echo "$0: no such file $f" && exit 1
  fi
done


# Number of output classes, taken from the "number of pdfs" line of am-info.
num_leaves=$(am-info $alidir/final.mdl 2>/dev/null | awk '/number of pdfs/{print $NF}') || exit 1

# The data is split the same way the alignments were (number of jobs in the
# alignment dir); in this dir we'll have just one job.
nj=$(cat $alidir/num_jobs) || exit 1
sdata=$data/split$nj
utils/split_data.sh $data $nj

# Output directory skeleton, plus a copy of the tree the alignments refer to.
mkdir -p $dir/log
cp $alidir/tree $dir

# The phone set of the lang dir must agree with the one used for alignment.
utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1
cp $lang/phones.txt $dir || exit 1

# Options handed to both get_lda.sh and get_egs.sh.  The optional ones are
# added only when they have a value.
extra_opts=()
if [ -n "$cmvn_opts" ]; then
  extra_opts+=(--cmvn-opts "$cmvn_opts")
fi
if [ -n "$feat_type" ]; then
  extra_opts+=(--feat-type $feat_type)
fi
if [ -n "$online_ivector_dir" ]; then
  extra_opts+=(--online-ivector-dir $online_ivector_dir)
fi
# Without an explicit transform dir, fall back to the alignment directory.
if [ -z "$transform_dir" ]; then
  transform_dir=$alidir
fi
extra_opts+=(--transform-dir $transform_dir --splice-width $splice_width)

# Stage -4: estimate the LDA transform (writes lda.mat, feat_dim and lda_dim
# into $dir, which are read further down).
if [ $stage -le -4 ]; then
  echo "$0: calling get_lda.sh"
  steps/nnet2/get_lda.sh $lda_opts "${extra_opts[@]}" --cmd "$cmd" $data $lang $alidir $dir || exit 1;
fi

# these files will have been written by get_lda.sh
feat_dim=$(cat $dir/feat_dim) || exit 1
lda_dim=$(cat $dir/lda_dim) || exit 1

# Stage -3: dump the training examples, unless the caller supplied --egs-dir.
# $egs_opts is given once, after the options this script sets itself, so a
# user-supplied value still takes precedence exactly as before (the earlier,
# duplicated copy of $egs_opts was always overridden by this one).
if [ $stage -le -3 ] && [ -z "$egs_dir" ]; then
  echo "$0: calling get_egs.sh"
  steps/nnet2/get_egs.sh "${extra_opts[@]}" \
      --samples-per-iter $samples_per_iter \
      --num-jobs-nnet $num_jobs_nnet --stage $get_egs_stage \
      --cmd "$cmd" $egs_opts --io-opts "$io_opts" \
      $data $lang $alidir $dir || exit 1;
fi

# Quoted so that an empty value or a path containing spaces is tested as a
# single word.
if [ -z "$egs_dir" ]; then
  egs_dir=$dir/egs
fi

iters_per_epoch=$(cat $egs_dir/iters_per_epoch) || exit 1

# The egs directory dictates the number of parallel jobs.  Read it once and
# fail immediately if it is missing, instead of first comparing against an
# empty value and printing a blank warning.
egs_num_jobs_nnet=$(cat $egs_dir/num_jobs_nnet) || exit 1
if [ $num_jobs_nnet -ne $egs_num_jobs_nnet ]; then
  echo "$0: Warning: using --num-jobs-nnet=$egs_num_jobs_nnet from $egs_dir"
fi
num_jobs_nnet=$egs_num_jobs_nnet


# At least one hidden layer is needed: the initial network already has one.
if ! [ $num_hidden_layers -ge 1 ]; then
  echo "Invalid num-hidden-layers $num_hidden_layers"
  exit 1
fi

# Stage -2: write the network configs and create the initial model 0.mdl.
if [ $stage -le -2 ]; then
  echo "$0: initializing neural net";

  lda_mat=$dir/lda.mat
  # Dimensions read from the files get_lda.sh wrote; used unchanged as the
  # input dims of the splice layer and the first affine layer.
  ext_lda_dim=$lda_dim
  ext_feat_dim=$feat_dim

  # Parameter stddev for the hidden layers: 1/sqrt(hidden_layer_dim).
  stddev=`perl -e "print 1.0/sqrt($hidden_layer_dim);"`
  # Initial network: splice -> fixed LDA transform -> one tanh hidden layer ->
  # output affine layer (initialized to all zeros) -> softmax over the pdfs.
  cat >$dir/nnet.config <<EOF
SpliceComponent input-dim=$ext_feat_dim left-context=$splice_width right-context=$splice_width
FixedAffineComponent matrix=$lda_mat
AffineComponentPreconditioned input-dim=$ext_lda_dim output-dim=$hidden_layer_dim alpha=$alpha max-change=$max_change learning-rate=$initial_learning_rate param-stddev=$stddev bias-stddev=$bias_stddev
TanhComponent dim=$hidden_layer_dim
AffineComponentPreconditioned input-dim=$hidden_layer_dim output-dim=$num_leaves alpha=$alpha max-change=$max_change learning-rate=$initial_learning_rate param-stddev=0 bias-stddev=0
SoftmaxComponent dim=$num_leaves
EOF

  # to hidden.config it will write the part of the config corresponding to a
  # single hidden layer; we need this to add new layers.
  cat >$dir/hidden.config <<EOF
AffineComponentPreconditioned input-dim=$hidden_layer_dim output-dim=$hidden_layer_dim alpha=$alpha max-change=$max_change learning-rate=$initial_learning_rate param-stddev=$stddev bias-stddev=$bias_stddev
TanhComponent dim=$hidden_layer_dim
EOF
  # Build the acoustic model from the tree, the topology and the raw network
  # that nnet-init creates from nnet.config.
  $cmd $dir/log/nnet_init.log \
    nnet-am-init $alidir/tree $lang/topo "nnet-init $dir/nnet.config -|" \
    $dir/0.mdl || exit 1;
fi

# Stage -1: estimate transition probabilities and priors from all alignment
# archives; 0.mdl is rewritten in place.
if [ $stage -le -1 ]; then
  echo "Training transition probabilities and setting priors"
  $cmd $dir/log/train_trans.log \
    nnet-train-transitions $dir/0.mdl "ark:gunzip -c $alidir/ali.*.gz|" $dir/0.mdl \
    || exit 1;
fi

# Turn the epoch counts into iteration counts: first the phase in which the
# learning rate is decayed, then the phase in which it is held constant.
num_iters_reduce=$(($num_epochs * $iters_per_epoch))
num_iters_extra=$(($num_epochs_extra * $iters_per_epoch))
num_iters=$(($num_iters_reduce + $num_iters_extra))

echo "$0: Will train for $num_epochs + $num_epochs_extra epochs, equalling "
echo "$0: $num_iters_reduce + $num_iters_extra = $num_iters iterations, "
echo "$0: (while reducing learning rate) + (with constant learning rate)."

# Milestones of the schedule:
#  - finish_add_layers_iter: iteration by which all hidden layers are in place;
#  - first_modify_iter: one add-layers period later, the first iteration on
#    which learning rates may be modified;
#  - mix_up_iter: halfway between finish_add_layers_iter and the end of
#    training, which is when we mix up.
finish_add_layers_iter=$(($num_hidden_layers * $add_layers_period))
first_modify_iter=$(($finish_add_layers_iter + $add_layers_period))
mix_up_iter=$((($num_iters + $finish_add_layers_iter) / 2))

# Choose the trainer binary suffix.  A single thread selects "-simple", which
# enables us to use GPU code; anything else selects the multi-threaded
# "-parallel" trainer.
if [ $num_threads -eq 1 ]; then
  train_suffix="-simple"
  if ! cuda-compiled; then
    echo "$0: WARNING: you are running with one thread but you have not compiled"
    echo "   for CUDA.  You may be running a setup optimized for GPUs.  If you have"
    echo "   GPUs and have nvcc installed, go to src/ and do ./configure; make"
  fi
else
  train_suffix="-parallel --num-threads=$num_threads"
fi


# Main training loop.  Iteration x reads $dir/$x.mdl and writes
# $dir/$[x+1].mdl.  Iterations below --stage are skipped so that a partially
# completed run can be resumed.
x=0
while [ $x -lt $num_iters ]; do
  if [ $x -ge 0 ] && [ $stage -le $x ]; then
    # Set off jobs doing some diagnostics, in the background.
    $cmd $dir/log/compute_prob_valid.$x.log \
      nnet-compute-prob $dir/$x.mdl ark:$egs_dir/valid_diagnostic.egs &
    $cmd $dir/log/compute_prob_train.$x.log \
      nnet-compute-prob $dir/$x.mdl ark:$egs_dir/train_diagnostic.egs &
    # Compare against the previous model, except right after a mix-up (the
    # presence of the previous iteration's mix_up log is used as the marker).
    if [ $x -gt 0 ] && [ ! -f $dir/log/mix_up.$[$x-1].log ]; then
      $cmd $dir/log/progress.$x.log \
        nnet-show-progress --use-gpu=no $dir/$[$x-1].mdl $dir/$x.mdl ark:$egs_dir/train_diagnostic.egs &
    fi

    echo "Training neural net (pass $x)"
    # On iterations 1, 1+add_layers_period, ... up to
    # (num_hidden_layers-1)*add_layers_period, the input model is a pipe that
    # inserts a freshly initialized hidden layer (hidden.config) into $x.mdl;
    # otherwise the model file is used as it is.
    if [ $x -gt 0 ] && \
      [ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \
      [ $[($x-1) % $add_layers_period] -eq 0 ]; then
      mdl="nnet-init --srand=$x $dir/hidden.config - | nnet-insert $dir/$x.mdl - - |"
    else
      mdl=$dir/$x.mdl
    fi


    # One training job per JOB index; job JOB reads archive
    # egs.JOB.(x mod iters_per_epoch).ark, shuffled with seed x, and writes
    # its own model $[x+1].JOB.mdl.
    $cmd $parallel_opts JOB=1:$num_jobs_nnet $dir/log/train.$x.JOB.log \
      nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x \
      ark:$egs_dir/egs.JOB.$[$x%$iters_per_epoch].ark ark:- \| \
      nnet-train$train_suffix \
         --minibatch-size=$minibatch_size --srand=$x "$mdl" \
        ark:- $dir/$[$x+1].JOB.mdl \
      || exit 1;

    # The per-job models that get averaged below.
    nnets_list=
    for n in `seq 1 $num_jobs_nnet`; do
      nnets_list="$nnets_list $dir/$[$x+1].$n.mdl"
    done

    # Learning rate for the next model: initial*(final/initial)^(iter/n) while
    # iter < n = num_iters_reduce, and the final learning rate from then on.
    learning_rate=`perl -e '($x,$n,$i,$f)=@ARGV; print ($x >= $n ? $f : $i*exp($x*log($f/$i)/$n));' $[$x+1] $num_iters_reduce $initial_learning_rate $final_learning_rate`;
    last_layer_learning_rate=`perl -e "print $learning_rate * $final_learning_rate_factor;"`;
    # $dir/foo is a scratch file holding the nnet-am-info output of job 1's model.
    nnet-am-info $dir/$[$x+1].1.mdl > $dir/foo  2>/dev/null || exit 1
    # nu: value printed on the num-updatable-components line.
    nu=`cat $dir/foo | grep num-updatable-components | awk '{print $2}'`
    na=`cat $dir/foo | grep -v Fixed | grep AffineComponent | wc -l`
    # na is number of last updatable AffineComponent layer [one-based, counting only
    # updatable components.]
    # The last two layers will get this (usually lower) learning rate.
    # lr_string is a colon-separated list with one rate per updatable component.
    lr_string="$learning_rate"
    for n in `seq 2 $nu`; do
      if [ $n -eq $na ] || [ $n -eq $[$na-1] ]; then lr=$last_layer_learning_rate;
      else lr=$learning_rate; fi
      lr_string="$lr_string:$lr"
    done

    # Average the per-job models and stamp the new learning rates on the result.
    $cmd $dir/log/average.$x.log \
      nnet-am-average $nnets_list - \| \
      nnet-am-copy --learning-rates=$lr_string - $dir/$[$x+1].mdl || exit 1;

    # Optionally rebalance the per-layer learning rates, once all layers have
    # been in place for one add-layers period; the new model is rewritten in place.
    if $modify_learning_rates && [ $x -ge $first_modify_iter ]; then
      $cmd $dir/log/modify_learning_rates.$x.log \
        nnet-modify-learning-rates --last-layer-factor=$last_layer_factor \
          --first-layer-factor=$first_layer_factor --average-learning-rate=$learning_rate \
        $dir/$x.mdl $dir/$[$x+1].mdl $dir/$[$x+1].mdl || exit 1;
    fi

    # Every $shrink_interval-th iteration: run nnet-combine-fast on the single
    # new model with a random subset of the training diagnostic examples.
    if $shrink && [ $[$x % $shrink_interval] -eq 0 ]; then
      # minibatch size: the subset split evenly over the threads, rounded up.
      mb=$[($num_frames_shrink+$num_threads-1)/$num_threads]
      $cmd $parallel_opts $dir/log/shrink.$x.log \
        nnet-subset-egs --n=$num_frames_shrink --randomize-order=true --srand=$x \
          ark:$egs_dir/train_diagnostic.egs ark:-  \| \
        nnet-combine-fast --use-gpu=no --num-threads=$num_threads --verbose=3 --minibatch-size=$mb \
          $dir/$[$x+1].mdl ark:- $dir/$[$x+1].mdl || exit 1;
    else
      # On other iters, do nnet-am-fix which is much faster and has roughly
      # the same effect.  Its exit status is not checked.
      nnet-am-fix $dir/$[$x+1].mdl $dir/$[$x+1].mdl 2>$dir/log/fix.$x.log
    fi

    # Mix up once, on iteration mix_up_iter, and only if --mix-up was given.
    if [ "$mix_up" -gt 0 ] && [ $x -eq $mix_up_iter ]; then
      # mix up.
      echo Mixing up from $num_leaves to $mix_up components
      $cmd $dir/log/mix_up.$x.log \
        nnet-am-mixup --min-count=10 --num-mixtures=$mix_up \
        $dir/$[$x+1].mdl $dir/$[$x+1].mdl || exit 1;
    fi
    # The per-job models are no longer needed once they have been averaged.
    rm $nnets_list
  fi
  x=$[$x+1]
done

# Now do combination.
# At the end, final.mdl will be a combination of the last e.g. 10 models.
# nnets_list collects the models handed to nnet-combine-fast further down.
nnets_list=()
# Only models trained with the constant (final) learning rate should be
# combined, so cap num_iters_final at the length of that phase.  The cap is
# now actually applied (previously it was only announced), and it never goes
# below 1 so that the last model is always available for combination even
# with --num-epochs-extra 0.
if [ $num_iters_final -gt $num_iters_extra ]; then
  num_iters_final=$num_iters_extra
  [ $num_iters_final -lt 1 ] && num_iters_final=1
  echo "Setting num_iters_final=$num_iters_final"
fi
# First iteration whose model takes part in the combination.
start=$[$num_iters-$num_iters_final+1]
for x in `seq $start $num_iters`; do
  idx=$[$x-$start]
  # Models from before the mix-up are left out.  Skipped indices simply stay
  # unset, and "${nnets_list[@]}" expands to the set elements only.
  if [ $x -gt $mix_up_iter ]; then
    nnets_list[$idx]=$dir/$x.mdl
  fi
done

# Combination stage: merge the selected models into $dir/final.mdl.
if [ $stage -le $num_iters ]; then
  # Below, use --use-gpu=no to disable nnet-combine-fast from using a GPU, as
  # if there are many models it can give out-of-memory error; use at least 8
  # threads to speed it up (this isn't ideal...)
  if [ $num_threads -lt 8 ]; then
    this_num_threads=8
  else
    this_num_threads=$num_threads
  fi

  # Number of combination examples: last field of the last line printed by
  # nnet-copy-egs.
  num_egs=$(nnet-copy-egs ark:$egs_dir/combine.egs ark:/dev/null 2>&1 | tail -n 1 | awk '{print $NF}')

  # Minibatch size: the examples split evenly over the threads (rounded up),
  # but never more than 512.
  mb=$((($num_egs + $this_num_threads - 1) / $this_num_threads))
  if [ $mb -gt 512 ]; then
    mb=512
  fi

  $cmd $parallel_opts $dir/log/combine.log \
    nnet-combine-fast --use-gpu=no --num-threads=$this_num_threads \
      --verbose=3 --minibatch-size=$mb "${nnets_list[@]}" ark:$egs_dir/combine.egs \
      $dir/final.mdl || exit 1;

  # Compute the probability of the final, combined model with
  # the same subset we used for the previous compute_probs, as the
  # different subsets will lead to different probs.
  # Both diagnostics run in the background.
  $cmd $dir/log/compute_prob_valid.final.log \
    nnet-compute-prob $dir/final.mdl ark:$egs_dir/valid_diagnostic.egs &
  $cmd $dir/log/compute_prob_train.final.log \
    nnet-compute-prob $dir/final.mdl ark:$egs_dir/train_diagnostic.egs &
fi

# Prior re-adjustment stage: estimate the average posterior of final.mdl on a
# subset of the training examples and use it to reset the model's priors.
# Runs for any --stage up to $num_iters+1.
if [ $stage -le $[$num_iters+1] ]; then
  echo "Getting average posterior for purposes of adjusting the priors."
  # Note: this just uses CPUs, using a smallish subset of data.
  # Remove stale per-job vectors so the wildcard below only sees fresh ones.
  rm $dir/post.*.vec 2>/dev/null
  # Each job takes $prior_subset_size examples from its first archive
  # (egs.JOB.0.ark), runs the raw network on them and sums the output rows
  # into one vector.
  $cmd JOB=1:$num_jobs_nnet $dir/log/get_post.JOB.log \
    nnet-subset-egs --n=$prior_subset_size ark:$egs_dir/egs.JOB.0.ark ark:- \| \
    nnet-compute-from-egs "nnet-to-raw-nnet $dir/final.mdl -|" ark:- ark:- \| \
    matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.JOB.vec || exit 1;

  sleep 3;  # make sure there is time for $dir/post.*.vec to appear.

  # Add up the per-job vectors into $dir/post.vec.
  $cmd $dir/log/vector_sum.log \
   vector-sum $dir/post.*.vec $dir/post.vec || exit 1;

  rm $dir/post.*.vec;

  echo "Re-adjusting priors based on computed posteriors"
  # final.mdl is both input and output, i.e. it is rewritten in place.
  $cmd $dir/log/adjust_priors.log \
    nnet-adjust-priors $dir/final.mdl $dir/post.vec $dir/final.mdl || exit 1;
fi

sleep 2

echo Done

# Optional cleanup: remove the training examples this run keeps under
# $dir/egs, and most of the intermediate models.
if $cleanup; then
  echo Cleaning up data
  # Only examples in our own $dir/egs are removed; a directory supplied via
  # --egs-dir that lives elsewhere is left alone.  Both sides are quoted so
  # the comparison stays well-formed for any path.
  if [ "$egs_dir" == "$dir/egs" ]; then
    steps/nnet2/remove_egs.sh $dir/egs
  fi
  echo Removing most of the models
  for x in `seq 0 $num_iters`; do
    # delete all but every 100th model; don't delete the ones which combine
    # to form the final model.  Models that are already gone (e.g. iterations
    # skipped with --stage on a resumed run) are ignored instead of making rm
    # print an error for each of them.
    if [ $[$x%100] -ne 0 ] && [ $x -lt $[$num_iters-$num_iters_final+1] ] && \
       [ -f $dir/$x.mdl ]; then
      rm $dir/$x.mdl
    fi
  done
fi


================================================
FILE: egs/steps/nnet2/train_tanh_bottleneck.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
#      2014  Pegah Ghahremani
# This script trains a fairly vanilla network with tanh nonlinearities to generate bottleneck features

# Begin configuration section.
cmd=run.pl       # Job launcher (see --cmd in the usage message).
num_epochs=15    # Number of epochs during which we reduce
                   # the learning rate; the number of iterations is worked out
                   # from this.
num_epochs_extra=5 # Number of epochs after we stop reducing
                   # the learning rate.
num_iters_final=20 # Maximum number of final iterations to give to the
                   # optimization over the validation set.
initial_learning_rate=0.04
final_learning_rate=0.004
bias_stddev=0.5
shrink_interval=5 # shrink every $shrink_interval iters except while we are
                  # still adding layers, when we do it every iter.
shrink=true
num_frames_shrink=2000 # note: must be <= --num-frames-diagnostic option to get_egs.sh, if
                       # given.
final_learning_rate_factor=0.5 # Train the two last layers of parameters half as
                               # fast as the other layers.

hidden_layer_dim=1024 #  You may want this larger, e.g. 2048.

bottleneck_dim=42  # bottleneck layer dimension
minibatch_size=128 # by default use a smallish minibatch size for neural net
                   # training; this controls instability which would otherwise
                   # be a problem with multi-threaded update.  Note: it also
                   # interacts with the "preconditioned" update which generally
                   # works better with larger minibatch size, so it's not
                   # completely cost free.

samples_per_iter=200000 # each iteration of training, see this many samples
                        # per job.  This option is passed to get_egs.sh
num_jobs_nnet=16   # Number of neural net jobs to run in parallel.  This option
                   # is passed to get_egs.sh.
get_egs_stage=0

shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
                # on each iter.  You could set it to 0 or to a large value for complete
                # randomization, but this would both consume memory and cause spikes in
                # disk I/O.  Smaller is easier on disk and memory but less random.  It's
                # not a huge deal though, as samples are anyway randomized right at the start.

add_layers_period=2 # by default, add new layers every 2 iterations.
num_hidden_layers=3
# NOTE(review): a plain assignment performs no arithmetic, so with the default
# above this variable holds the text "3-2" (it only evaluates to 1 where it is
# later expanded inside an arithmetic expression).  It is also expanded here,
# before parse_options.sh runs, so it presumably does not follow a
# --num-hidden-layers given on the command line -- confirm against its uses.
bottleneck_layer_num=$num_hidden_layers-2 # bottleneck layer number between hidden layer
                                        # eg. 1024|1024|42|1024 bottleneck_layer_num = 2

modify_learning_rates=false
last_layer_factor=0.1 # relates to modify_learning_rates.
first_layer_factor=1.0 # relates to modify_learning_rates.
stage=-5

io_opts="--max-jobs-run 5" # for jobs with a lot of I/O, limits the number running at one time.
splice_width=4 # meaning +- 4 frames on each side for second LDA
randprune=4.0 # speeds up LDA.
alpha=4.0
max_change=10.0
mix_up=0 # Number of components to mix up to (should be > #tree leaves, if
        # specified.)
num_threads=16
parallel_opts="--num-threads 16 --mem 1G" # by default we use 16 threads; this lets the queue know.
  # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.
combine_opts="--mem 12G" # presumably queue options for the final combination
                         # job -- TODO confirm where it is used.
cleanup=true
egs_dir=
lda_opts=
egs_opts=
transform_dir=
nj=
# End configuration section.


# Record the exact invocation in the log.
echo "$0 $@"

# Source the recipe environment when it is present, then hand the --option
# arguments to parse_options.sh; give up if that fails.
if [ -f path.sh ]; then
  . ./path.sh
fi
. parse_options.sh || exit 1


# Exactly four positional arguments are required; otherwise print the usage
# text and fail.  The defaults quoted for --io-opts and --stage mirror the
# values assigned in the configuration section of this script.
if [ $# != 4 ]; then
  echo "Usage: $0 [opts] <data> <lang> <ali-dir> <exp-dir>"
  echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --num-epochs <#epochs|15>                        # Number of epochs of main training"
  echo "                                                   # while reducing learning rate (determines #iterations, together"
  echo "                                                   # with --samples-per-iter and --num-jobs-nnet)"
  echo "  --num-epochs-extra <#epochs-extra|5>             # Number of extra epochs of training"
  echo "                                                   # after learning rate fully reduced"
  echo "  --initial-learning-rate <initial-learning-rate|0.02> # Learning rate at start of training, e.g. 0.02 for small"
  echo "                                                       # data, 0.01 for large data"
  echo "  --final-learning-rate  <final-learning-rate|0.004>   # Learning rate at end of training, e.g. 0.004 for small"
  echo "                                                   # data, 0.001 for large data"
  echo "  --num-hidden-layers <#hidden-layers|2>           # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs"
  echo "  --initial-num-hidden-layers <#hidden-layers|1>   # Number of hidden layers to start with."
  echo "  --add-layers-period <#iters|2>                   # Number of iterations between adding hidden layers"
  echo "  --mix-up <#pseudo-gaussians|0>                   # Can be used to have multiple targets in final output layer,"
  echo "                                                   # per context-dependent state.  Try a number several times #states."
  echo "  --num-jobs-nnet <num-jobs|8>                     # Number of parallel jobs to use for main neural net"
  echo "                                                   # training (will affect results as well as speed; try 8, 16)"
  echo "                                                   # Note: if you increase this, you may want to also increase"
  echo "                                                   # the learning rate."
  echo "  --num-threads <num-threads|16>                   # Number of parallel threads per job (will affect results"
  echo "                                                   # as well as speed; may interact with batch size; if you increase"
  echo "                                                   # this, you may want to decrease the batch size.)"
  echo "  --parallel-opts <opts|\"--num-threads 16 --mem 1G\">      # extra options to pass to e.g. queue.pl for processes that"
  echo "                                                   # use multiple threads... "
  echo "  --io-opts <opts|\"--max-jobs-run 5\">                       # Options given to e.g. queue.pl for jobs that do a lot of I/O."
  echo "  --minibatch-size <minibatch-size|128>            # Size of minibatch to process (note: product with --num-threads"
  echo "                                                   # should not get too large, e.g. >2k)."
  echo "  --samples-per-iter <#samples|200000>             # Number of samples of data to process per iteration, per"
  echo "                                                   # process."
  echo "  --splice-width <width|4>                         # Number of frames on each side to append for feature input"
  echo "                                                   # (note: we splice processed, typically 40-dimensional frames)"
  echo "  --lda-dim <dim|250>                              # Dimension to reduce spliced features to with LDA"
  echo "  --num-iters-final <#iters|10>                    # Number of final iterations to give to nnet-combine-fast to "
  echo "                                                   # interpolate parameters (the weights are learned with a validation set)"
  echo "  --num-utts-subset <#utts|300>                    # Number of utterances in subsets used for validation and diagnostics"
  echo "                                                   # (the validation subset is held out from training)"
  echo "  --num-frames-diagnostic <#frames|4000>           # Number of frames used in computing (train,valid) diagnostics"
  echo "  --num-valid-frames-combine <#frames|10000>       # Number of frames used in getting combination weights at the"
  echo "                                                   # very end."
  echo "  --stage <stage|-5>                               # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."

  exit 1;
fi

# Positional arguments: training data dir, lang dir, alignment dir and the
# experiment (output) dir.
data=$1
lang=$2
alidir=$3
dir=$4

# Abort early if any of the inputs this recipe reads is missing.
for required in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree; do
  if [ ! -f $required ]; then
    echo "$0: no such file $required"
    exit 1;
  fi
done


# Set some variables.
# Number of pdfs (output targets) is parsed from the "number of pdfs" line that
# am-info prints for the alignment model.  Note the "|| exit 1" only sees the
# exit status of awk, the last command in the pipeline.
num_leaves=`am-info $alidir/final.mdl 2>/dev/null | awk '/number of pdfs/{print $NF}'` || exit 1;

nj=`cat $alidir/num_jobs` || exit 1;  # number of jobs in alignment dir...
# in this dir we'll have just one job.
sdata=$data/split$nj
utils/split_data.sh $data $nj

mkdir -p $dir/log
# The files below are optional in the alignment dir: read/copy errors are
# silenced (2>/dev/null) and not checked.
splice_opts=`cat $alidir/splice_opts 2>/dev/null`
cp $alidir/final.mat $dir 2>/dev/null
cp $alidir/splice_opts $dir 2>/dev/null
cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null`
cp $alidir/cmvn_opts $dir 2>/dev/null
cp $alidir/tree $dir

# Refuse to continue if the lang dir and the alignment dir disagree on the
# phone set; keep a copy of phones.txt next to the model.
utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

# Component count later handed to nnet-to-raw-nnet --truncate at the end of the
# script (the same value is computed again just before the training loop).
truncate_comp_num=$[2*$num_hidden_layers+1]
# Stage -4: estimate the LDA-like input transform.
if [ $stage -le -4 ]; then
  echo "$0: calling get_lda.sh"
  steps/nnet2/get_lda.sh $lda_opts --splice-width $splice_width --cmd "$cmd" $data $lang $alidir $dir || exit 1;
fi

# these files will have been written by get_lda.sh
feat_dim=`cat $dir/feat_dim` || exit 1;
lda_dim=`cat $dir/lda_dim` || exit 1;

# Stage -3: dump training examples, unless the caller supplied a ready-made
# --egs-dir.
if [ $stage -le -3 ] && [ -z "$egs_dir" ]; then
  echo "$0: calling get_egs.sh"
  # Forward --transform-dir only when one was given.  (The assignment must not
  # carry a leading '$': "$transform_dir_opt=..." is expanded and then run as a
  # command, so the option was never actually passed on.)
  transform_dir_opt=
  [ -n "$transform_dir" ] && transform_dir_opt="--transform-dir $transform_dir"
  steps/nnet2/get_egs.sh $transform_dir_opt --samples-per-iter $samples_per_iter \
      --num-jobs-nnet $num_jobs_nnet --splice-width $splice_width --stage $get_egs_stage \
      --cmd "$cmd" $egs_opts --io-opts "$io_opts" \
      $data $lang $alidir $dir || exit 1;
fi

# Default location of the examples when --egs-dir was not given.
if [ -z $egs_dir ]; then
  egs_dir=$dir/egs
fi

# The examples directory is authoritative for these two values: the number of
# jobs stored there replaces whatever --num-jobs-nnet was requested (with a
# warning if they differ).
iters_per_epoch=`cat $egs_dir/iters_per_epoch`  || exit 1;
! [ $num_jobs_nnet -eq `cat $egs_dir/num_jobs_nnet` ] && \
  echo "$0: Warning: using --num-jobs-nnet=`cat $egs_dir/num_jobs_nnet` from $egs_dir"
num_jobs_nnet=`cat $egs_dir/num_jobs_nnet` || exit 1;


# At least one hidden layer is required.
if ! [ $num_hidden_layers -ge 1 ]; then
  echo "Invalid num-hidden-layers $num_hidden_layers"
  exit 1
fi

# Stage -2: write the component configs and create the initial model 0.mdl.
if [ $stage -le -2 ]; then
  echo "$0: initializing neural net";

  lda_mat=$dir/lda.mat

  # Parameter stddev for layers fed by a hidden layer: 1/sqrt(hidden_layer_dim).
  stddev=`perl -e "print 1.0/sqrt($hidden_layer_dim);"`
  # nnet.config: the initial network with a single tanh hidden layer; the
  # output affine layer starts at zero (param-stddev=0 bias-stddev=0).
  cat >$dir/nnet.config <<EOF
SpliceComponent input-dim=$feat_dim left-context=$splice_width right-context=$splice_width const-component-dim=0
FixedAffineComponent matrix=$lda_mat
AffineComponentPreconditioned input-dim=$lda_dim output-dim=$hidden_layer_dim alpha=$alpha max-change=$max_change learning-rate=$initial_learning_rate param-stddev=$stddev bias-stddev=$bias_stddev
TanhComponent dim=$hidden_layer_dim
AffineComponentPreconditioned input-dim=$hidden_layer_dim output-dim=$num_leaves alpha=$alpha max-change=$max_change learning-rate=$initial_learning_rate param-stddev=0 bias-stddev=0
SoftmaxComponent dim=$num_leaves
EOF

  # to hidden.config it will write the part of the config corresponding to a
  # single hidden layer; we need this to add new layers.
  cat >$dir/hidden.config <<EOF
AffineComponentPreconditioned input-dim=$hidden_layer_dim output-dim=$hidden_layer_dim alpha=$alpha max-change=$max_change learning-rate=$initial_learning_rate param-stddev=$stddev bias-stddev=$bias_stddev
TanhComponent dim=$hidden_layer_dim
EOF
  # Parameter stddev for the layer fed by the bottleneck: 1/sqrt(bottleneck_dim).
  bottleneck_stddev=`perl -e "print 1.0/sqrt($bottleneck_dim);"`
  # To bnf.config it will write the part of the config corresponding to a
  # bottleneck layer (hidden -> bottleneck affine, bottleneck -> hidden affine,
  # then tanh); we need this to add the bottleneck layer.
  cat >$dir/bnf.config <<EOF
AffineComponentPreconditioned input-dim=$hidden_layer_dim output-dim=$bottleneck_dim alpha=$alpha max-change=$max_change learning-rate=$initial_learning_rate param-stddev=$stddev bias-stddev=$bias_stddev
AffineComponentPreconditioned input-dim=$bottleneck_dim output-dim=$hidden_layer_dim alpha=$alpha max-change=$max_change learning-rate=$initial_learning_rate param-stddev=$bottleneck_stddev bias-stddev=$bias_stddev
TanhComponent dim=$hidden_layer_dim
EOF
  # Build the acoustic model from the tree, the topology and the freshly
  # initialised raw network (piped in from nnet-init).
  $cmd $dir/log/nnet_init.log \
    nnet-am-init $alidir/tree $lang/topo "nnet-init $dir/nnet.config -|" \
    $dir/0.mdl || exit 1;
fi

# Stage -1: update 0.mdl in place using all alignment archives of $alidir.
if [ $stage -le -1 ]; then
  echo "Training transition probabilities and setting priors"
  $cmd $dir/log/train_trans.log \
    nnet-train-transitions $dir/0.mdl "ark:gunzip -c $alidir/ali.*.gz|" $dir/0.mdl \
    || exit 1;
fi

# Translate epochs into iterations: the learning rate is decayed over the first
# num_iters_reduce iterations and held constant for num_iters_extra more.
num_iters_reduce=$[$num_epochs * $iters_per_epoch];
num_iters_extra=$[$num_epochs_extra * $iters_per_epoch];
num_iters=$[$num_iters_reduce+$num_iters_extra]
echo num_iters = $num_iters
echo "$0: Will train for $num_epochs + $num_epochs_extra epochs, equalling "
echo "$0: $num_iters_reduce + $num_iters_extra = $num_iters iterations, "
echo "$0: (while reducing learning rate) + (with constant learning rate)."

# This is when we decide to mix up from: halfway between when we've finished
# adding the hidden layers and the end of training.
finish_add_layers_iter=$[($num_hidden_layers-$initial_num_hidden_layers+1)*$add_layers_period]
# First iteration on which nnet-modify-learning-rates may run (only used when
# modify_learning_rates=true).
first_modify_iter=$[$finish_add_layers_iter + $add_layers_period]
mix_up_iter=$[($num_iters + $finish_add_layers_iter)/2]
# Same value as computed before the get_lda.sh stage; consumed by the final
# nnet-to-raw-nnet --truncate call.
truncate_comp_num=$[2*$num_hidden_layers+1]
if [ $num_threads -eq 1 ]; then
  train_suffix="-simple" # this enables us to use GPU code if
                         # we have just one thread.
else
  train_suffix="-parallel --num-threads=$num_threads"
fi

# Main training loop.  Iteration x reads $dir/$x.mdl and writes $dir/$[x+1].mdl.
# Iterations below $stage are skipped, which is how a partially completed run
# is resumed.
x=0
while [ $x -lt $num_iters ]; do
  if [ $x -ge 0 ] && [ $stage -le $x ]; then
    # Set off jobs doing some diagnostics, in the background.
    $cmd $dir/log/compute_prob_valid.$x.log \
      nnet-compute-prob $dir/$x.mdl ark:$egs_dir/valid_diagnostic.egs &
    $cmd $dir/log/compute_prob_train.$x.log \
      nnet-compute-prob $dir/$x.mdl ark:$egs_dir/train_diagnostic.egs &
    # The progress comparison is skipped right after a mix-up iteration
    # (detected by the presence of the previous iteration's mix_up log).
    if [ $x -gt 0 ] && [ ! -f $dir/log/mix_up.$[$x-1].log ]; then
      $cmd $dir/log/progress.$x.log \
        nnet-show-progress --use-gpu=no $dir/$[$x-1].mdl $dir/$x.mdl ark:$egs_dir/train_diagnostic.egs &
    fi

    echo "Training neural net (pass $x)"
    # A layer is inserted on iterations 1, 1+add_layers_period, ... up to
    # (num_hidden_layers-1)*add_layers_period.  The insertion whose index
    # (x-1)/add_layers_period equals num_hidden_layers-2 uses bnf.config (the
    # bottleneck); every other insertion uses hidden.config.
    if [ $x -gt 0 ] && \
      [ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \
      [ $[($x-1) % $add_layers_period] -eq 0 ]; then
      if [ $[($x-1) / $add_layers_period] -eq $[($num_hidden_layers-2)] ]; then
        echo bnf layer with x = $x
        mdl="nnet-init --srand=$x $dir/bnf.config - | nnet-insert $dir/$x.mdl - - |"
      else
        mdl="nnet-init --srand=$x $dir/hidden.config - | nnet-insert $dir/$x.mdl - - |"
      fi
    else
      mdl=$dir/$x.mdl
    fi

    # One training job per egs archive; each job writes its own
    # $[x+1].JOB.mdl.  The archive index cycles with x % iters_per_epoch.
    $cmd $parallel_opts JOB=1:$num_jobs_nnet $dir/log/train.$x.JOB.log \
      nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x \
      ark:$egs_dir/egs.JOB.$[$x%$iters_per_epoch].ark ark:- \| \
      nnet-train$train_suffix \
         --minibatch-size=$minibatch_size --srand=$x "$mdl" \
        ark:- $dir/$[$x+1].JOB.mdl \
      || exit 1;

    # Collect the per-job models so they can be averaged and later removed.
    nnets_list=
    for n in `seq 1 $num_jobs_nnet`; do
      nnets_list="$nnets_list $dir/$[$x+1].$n.mdl"
    done

    # Learning rate for the next iteration: geometric interpolation from
    # initial_learning_rate to final_learning_rate over num_iters_reduce
    # iterations, then constant at final_learning_rate.
    learning_rate=`perl -e '($x,$n,$i,$f)=@ARGV; print ($x >= $n ? $f : $i*exp($x*log($f/$i)/$n));' $[$x+1] $num_iters_reduce $initial_learning_rate $final_learning_rate`;
    last_layer_learning_rate=`perl -e "print $learning_rate * $final_learning_rate_factor;"`;
    # $dir/foo is a scratch file holding the nnet-am-info dump of job 1's model.
    nnet-am-info $dir/$[$x+1].1.mdl > $dir/foo  2>/dev/null || exit 1
    nu=`cat $dir/foo | grep num-updatable-components | awk '{print $2}'`
    na=`cat $dir/foo | grep -v Fixed | grep AffineComponent | wc -l`
    # na is number of last updatable AffineComponent layer [one-based, counting only
    # updatable components.]
    # The last two layers will get this (usually lower) learning rate.
    # lr_string is a colon-separated list with one rate per updatable component.
    lr_string="$learning_rate"
    for n in `seq 2 $nu`; do
      if [ $n -eq $na ] || [ $n -eq $[$na-1] ]; then lr=$last_layer_learning_rate;
      else lr=$learning_rate; fi
      lr_string="$lr_string:$lr"
    done

    # Average the per-job models and stamp the new learning rates on the result.
    $cmd $dir/log/average.$x.log \
      nnet-am-average $nnets_list - \| \
      nnet-am-copy --learning-rates=$lr_string - $dir/$[$x+1].mdl || exit 1;

    # Optional per-layer learning-rate adjustment, only once all layers have
    # been in place for at least one add_layers_period.
    if $modify_learning_rates && [ $x -ge $first_modify_iter ]; then
      $cmd $dir/log/modify_learning_rates.$x.log \
        nnet-modify-learning-rates --last-layer-factor=$last_layer_factor \
          --first-layer-factor=$first_layer_factor --average-learning-rate=$learning_rate \
        $dir/$x.mdl $dir/$[$x+1].mdl $dir/$[$x+1].mdl || exit 1;
    fi

    # Every shrink_interval iterations run nnet-combine-fast on the single new
    # model, using num_frames_shrink examples drawn from the training
    # diagnostic set.
    if $shrink && [ $[$x % $shrink_interval] -eq 0 ]; then
      mb=$[($num_frames_shrink+$num_threads-1)/$num_threads]
      $cmd $parallel_opts $dir/log/shrink.$x.log \
        nnet-subset-egs --n=$num_frames_shrink --randomize-order=true --srand=$x \
          ark:$egs_dir/train_diagnostic.egs ark:-  \| \
        nnet-combine-fast --use-gpu=no --num-threads=$num_threads --verbose=3 --minibatch-size=$mb \
          $dir/$[$x+1].mdl ark:- $dir/$[$x+1].mdl || exit 1;
    else
      # On other iters, do nnet-am-fix which is much faster and has roughly
      # the same effect.
      # NOTE(review): the exit status of nnet-am-fix is not checked.
      nnet-am-fix $dir/$[$x+1].mdl $dir/$[$x+1].mdl 2>$dir/log/fix.$x.log
    fi

    if [ "$mix_up" -gt 0 ] && [ $x -eq $mix_up_iter ]; then
      # mix up.
      echo Mixing up from $num_leaves to $mix_up components
      $cmd $dir/log/mix_up.$x.log \
        nnet-am-mixup --min-count=10 --num-mixtures=$mix_up \
        $dir/$[$x+1].mdl $dir/$[$x+1].mdl || exit 1;
    fi
    # The per-job models are no longer needed once $[x+1].mdl exists.
    rm $nnets_list
  fi
  x=$[$x+1]
done

# Now do combination.
# At the end, final.mdl will be a combination of the last e.g. 10 models.
nnets_list=()
# NOTE(review): this branch only prints the message; num_iters_final itself is
# not modified here, so the range below still spans num_iters_final models.
# Confirm whether clamping to num_iters_extra was intended.
if [ $num_iters_final -gt $num_iters_extra ]; then
  echo "Setting num_iters_final=$num_iters_extra"
fi
start=$[$num_iters-$num_iters_final+1]
# Only models produced after the mix-up iteration are combined; earlier ones in
# the range are left out of the array.
for x in `seq $start $num_iters`; do
  idx=$[$x-$start]
  if [ $x -gt $mix_up_iter ]; then
    nnets_list[$idx]=$dir/$x.mdl # "nnet-am-copy --remove-dropout=true $dir/$x.mdl - |"
  fi
done

if [ $stage -le $num_iters ]; then
  # Below, use --use-gpu=no to disable nnet-combine-fast from using a GPU, as
  # if there are many models it can give out-of-memory error; set num-threads to 8
  # to speed it up (this isn't ideal...)
  this_num_threads=$num_threads
  [ $this_num_threads -lt 8 ] && this_num_threads=8
  # Number of examples in combine.egs, taken from the last field of the last
  # line that nnet-copy-egs logs.
  num_egs=`nnet-copy-egs ark:$egs_dir/combine.egs ark:/dev/null 2>&1 | tail -n 1 | awk '{print $NF}'`
  # Split the examples evenly over the threads, capping the minibatch at 512.
  mb=$[($num_egs+$this_num_threads-1)/$this_num_threads]
  [ $mb -gt 512 ] && mb=512
  $cmd $parallel_opts $combine_opts $dir/log/combine.log \
    nnet-combine-fast --use-gpu=no --num-threads=$this_num_threads \
      --verbose=3 --minibatch-size=$mb "${nnets_list[@]}" ark:$egs_dir/combine.egs \
      $dir/final.mdl || exit 1;
fi

# Compute the probability of the final, combined model with
# the same subset we used for the previous compute_probs, as the
# different subsets will lead to different probs.
# Both jobs are started in the background; the script does not wait for them
# beyond the short sleep below.
$cmd $dir/log/compute_prob_valid.final.log \
  nnet-compute-prob $dir/final.mdl ark:$egs_dir/valid_diagnostic.egs &
$cmd $dir/log/compute_prob_train.final.log \
  nnet-compute-prob $dir/final.mdl ark:$egs_dir/train_diagnostic.egs &

sleep 2

echo Done

# Optional cleanup: drop the examples we generated ourselves and thin out the
# intermediate models.
if $cleanup; then
  echo Cleaning up data
  # Only remove the egs when they live inside our own experiment directory.
  if [ $egs_dir == "$dir/egs" ]; then
    steps/nnet2/remove_egs.sh $dir/egs
  fi
  echo Removing most of the models
  # Models from this iteration onwards fall in the combination range and are kept.
  first_kept=$(($num_iters-$num_iters_final+1))
  for iter in $(seq 0 $num_iters); do
    # Keep every 10th model.
    [ $(($iter%10)) -ne 0 ] || continue
    # Keep the ones which combine to form the final model.
    [ $iter -lt $first_kept ] || continue
    rm $dir/$iter.mdl
  done
fi

# Export the bottleneck feature extractor: final.mdl is converted to a raw
# network with --truncate=$truncate_comp_num (2*num_hidden_layers+1;
# presumably this keeps the components up to the bottleneck -- confirm against
# nnet-to-raw-nnet).  A missing final.mdl or a failed conversion now makes the
# script exit non-zero instead of reporting success.
if [ ! -f $dir/final.mdl ]; then
  echo "$0: we require final.mdl in source dir $dir"
  exit 1
fi
nnet-to-raw-nnet --truncate=$truncate_comp_num $dir/final.mdl $dir/final.raw || exit 1


================================================
FILE: egs/steps/nnet2/train_tanh_fast.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2014  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.

# This script trains a fairly vanilla network with tanh nonlinearities.

# train_tanh_fast.sh is a new, improved version of train_tanh.sh, which uses
# the 'online' preconditioning method.  For GPUs it's about two times faster
# than before (although that's partly due to optimizations that will also help
# the old recipe), and for CPUs it gives better performance than the old method
# (I believe); also, the difference in optimization performance between CPU and
# GPU is almost gone.  The old train_tanh.sh script is now deprecated.
# We made this a separate script because not all of the options that the
# old script accepted, are still accepted.


# Begin configuration section.
# These are the defaults; parse_options.sh is sourced right after this section
# and the usage text refers to them as --<name> options.
cmd=run.pl
num_epochs=15      # Number of epochs during which we reduce
                   # the learning rate; the number of iterations is worked out from this.
num_epochs_extra=5 # Number of epochs after we stop reducing
                   # the learning rate.
num_iters_final=20 # Maximum number of final iterations to give to the
                   # optimization over the validation set.
initial_learning_rate=0.04
final_learning_rate=0.004
bias_stddev=0.5
shrink_interval=5 # shrink every $shrink_interval iters except while we are
                  # still adding layers, when we do it every iter.
shrink=true
num_frames_shrink=2000 # note: must be <= --num-frames-diagnostic option to get_egs.sh, if
                       # given.
final_learning_rate_factor=0.5 # Train the two last layers of parameters half as
                               # fast as the other layers, by default.

hidden_layer_dim=300 #  You may want this larger, e.g. 1024 or 2048.

minibatch_size=128 # by default use a smallish minibatch size for neural net
                   # training; this controls instability which would otherwise
                   # be a problem with multi-threaded update.

samples_per_iter=200000 # each iteration of training, see this many samples
                        # per job.  This option is passed to get_egs.sh.
num_jobs_nnet=8    # Number of neural net jobs to run in parallel.  This option
                   # is passed to get_egs.sh.
get_egs_stage=0

shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of
                # the samples on each iter.  You could set it to 0 or to a large
                # value for complete randomization, but this would both consume
                # memory and cause spikes in disk I/O.  Smaller is easier on
                # disk and memory but less random.  It's not a huge deal though,
                # as samples are anyway randomized right at the start.

add_layers_period=2 # by default, add new layers every 2 iterations.
num_hidden_layers=3 # This is an important configuration value that you might
                    # want to tune.
stage=-5

io_opts="--max-jobs-run 5" # for jobs with a lot of I/O, limits the number running at one time.
splice_width=4 # meaning +- 4 frames on each side for second LDA
randprune=4.0 # speeds up LDA.
alpha=4.0 # relates to preconditioning.
update_period=4 # relates to online preconditioning: says how often we update the subspace.
num_samples_history=2000 # relates to online preconditioning
max_change_per_sample=0.075
# we make the [input, output] ranks less different for the tanh setup than for
# the pnorm setup, as we don't have the difference in dimensions to deal with.
precondition_rank_in=30  # relates to online preconditioning
precondition_rank_out=60 # relates to online preconditioning
mix_up=0 # Number of components to mix up to (should be > #tree leaves, if
         # specified.)
num_threads=16
parallel_opts="--num-threads 16 --mem 1G" # by default we use 16 threads; this lets the queue know.
  # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.
combine_parallel_opts="--num-threads 8"  # queue options for the "combine" stage.
combine_num_threads=8
cleanup=true
egs_dir=
lda_opts=
egs_opts=
transform_dir=
cmvn_opts=  # will be passed to get_lda.sh and get_egs.sh, if supplied.
            # only relevant for "raw" features, not lda.
feat_type=  # Can be used to force "raw" features.
prior_subset_size=10000 # 10k samples per job, for computing priors.  Should be
                        # more than enough.
# End configuration section.


echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;


# Exactly four positional arguments are required; otherwise print the usage
# text and fail.  The defaults quoted below mirror the configuration section at
# the top of this script, and only options this script actually defines are
# listed.
if [ $# != 4 ]; then
  echo "Usage: $0 [opts] <data> <lang> <ali-dir> <exp-dir>"
  echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --num-epochs <#epochs|15>                        # Number of epochs of main training"
  echo "                                                   # while reducing learning rate (determines #iterations, together"
  echo "                                                   # with --samples-per-iter and --num-jobs-nnet)"
  echo "  --num-epochs-extra <#epochs-extra|5>             # Number of extra epochs of training"
  echo "                                                   # after learning rate fully reduced"
  echo "  --initial-learning-rate <initial-learning-rate|0.04> # Learning rate at start of training, e.g. 0.02 for small"
  echo "                                                       # data, 0.01 for large data"
  echo "  --final-learning-rate  <final-learning-rate|0.004>   # Learning rate at end of training, e.g. 0.004 for small"
  echo "                                                   # data, 0.001 for large data"
  echo "  --num-hidden-layers <#hidden-layers|3>           # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs"
  echo "  --add-layers-period <#iters|2>                   # Number of iterations between adding hidden layers"
  echo "  --mix-up <#pseudo-gaussians|0>                   # Can be used to have multiple targets in final output layer,"
  echo "                                                   # per context-dependent state.  Try a number several times #states."
  echo "  --num-jobs-nnet <num-jobs|8>                     # Number of parallel jobs to use for main neural net"
  echo "                                                   # training (will affect results as well as speed; try 8, 16)"
  echo "                                                   # Note: if you increase this, you may want to also increase"
  echo "                                                   # the learning rate."
  echo "  --num-threads <num-threads|16>                   # Number of parallel threads per job (will affect results"
  echo "                                                   # as well as speed; may interact with batch size; if you increase"
  echo "                                                   # this, you may want to decrease the batch size.)"
  echo "  --parallel-opts <opts|\"--num-threads 16 --mem 1G\">      # extra options to pass to e.g. queue.pl for processes that"
  echo "                                                   # use multiple threads... "
  echo "  --io-opts <opts|\"--max-jobs-run 5\">                       # Options given to e.g. queue.pl for jobs that do a lot of I/O."
  echo "  --minibatch-size <minibatch-size|128>            # Size of minibatch to process (note: product with --num-threads"
  echo "                                                   # should not get too large, e.g. >2k)."
  echo "  --samples-per-iter <#samples|200000>             # Number of samples of data to process per iteration, per"
  echo "                                                   # process."
  echo "  --splice-width <width|4>                         # Number of frames on each side to append for feature input"
  echo "                                                   # (note: we splice processed, typically 40-dimensional frames)"
  echo "  --lda-dim <dim|250>                              # Dimension to reduce spliced features to with LDA"
  echo "  --num-iters-final <#iters|20>                    # Number of final iterations to give to nnet-combine-fast to "
  echo "                                                   # interpolate parameters (the weights are learned with a validation set)"
  echo "  --stage <stage|-5>                               # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."

  exit 1;
fi

# Positional arguments: training data dir, lang dir, alignment dir and the
# experiment (output) dir.
data=$1
lang=$2
alidir=$3
dir=$4

# Abort early if any of the inputs this recipe reads is missing.
for required in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree; do
  if [ ! -f $required ]; then
    echo "$0: no such file $required"
    exit 1;
  fi
done


# Set some variables.
# Number of pdfs (output targets) is parsed from the "number of pdfs" line that
# am-info prints for the alignment model.  Note the "|| exit 1" only sees the
# exit status of awk, the last command in the pipeline.
num_leaves=`am-info $alidir/final.mdl 2>/dev/null | awk '/number of pdfs/{print $NF}'` || exit 1;

nj=`cat $alidir/num_jobs` || exit 1;  # number of jobs in alignment dir...
# in this dir we'll have just one job.
sdata=$data/split$nj
utils/split_data.sh $data $nj

mkdir -p $dir/log
cp $alidir/tree $dir

# Refuse to continue if the lang dir and the alignment dir disagree on the
# phone set; keep a copy of phones.txt next to the model.
utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

# Options shared by get_lda.sh and get_egs.sh, kept in an array so that values
# containing spaces (e.g. --cmvn-opts) survive as single arguments.
extra_opts=()
[ ! -z "$cmvn_opts" ] && extra_opts+=(--cmvn-opts "$cmvn_opts")
[ ! -z "$feat_type" ] && extra_opts+=(--feat-type $feat_type)
# NOTE(review): online_ivector_dir is not initialised in the configuration
# section of this script, so it is only non-empty if it arrives from the
# environment or a sourced file -- confirm how it is meant to be set.
[ ! -z "$online_ivector_dir" ] && extra_opts+=(--online-ivector-dir $online_ivector_dir)
# The transform dir falls back to the alignment dir, so --transform-dir is
# always passed.
[ -z "$transform_dir" ] && transform_dir=$alidir
extra_opts+=(--transform-dir $transform_dir)
extra_opts+=(--splice-width $splice_width)

# Stage -4: estimate the LDA-like input transform.
if [ $stage -le -4 ]; then
  echo "$0: calling get_lda.sh"
  steps/nnet2/get_lda.sh $lda_opts "${extra_opts[@]}" --cmd "$cmd" $data $lang $alidir $dir || exit 1;
fi

# these files will have been written by get_lda.sh
feat_dim=$(cat $dir/feat_dim) || exit 1;
ivector_dim=$(cat $dir/ivector_dim) || exit 1;
lda_dim=$(cat $dir/lda_dim) || exit 1;

# Stage -3: dump training examples, unless the caller supplied a ready-made
# --egs-dir.  $egs_opts is passed exactly once, after the options this script
# sets itself (the position of its last occurrence in the original command
# line, so the effective option order is unchanged).
if [ $stage -le -3 ] && [ -z "$egs_dir" ]; then
  echo "$0: calling get_egs.sh"
  steps/nnet2/get_egs.sh "${extra_opts[@]}" \
      --samples-per-iter $samples_per_iter \
      --num-jobs-nnet $num_jobs_nnet --stage $get_egs_stage \
      --cmd "$cmd" $egs_opts --io-opts "$io_opts" \
      $data $lang $alidir $dir || exit 1;
fi
# Default location of the examples when --egs-dir was not given.
if [ -z $egs_dir ]; then
  egs_dir=$dir/egs
fi

# The examples directory is authoritative for these two values: the number of
# jobs stored there replaces whatever --num-jobs-nnet was requested (with a
# warning if they differ).
iters_per_epoch=`cat $egs_dir/iters_per_epoch`  || exit 1;
! [ $num_jobs_nnet -eq `cat $egs_dir/num_jobs_nnet` ] && \
  echo "$0: Warning: using --num-jobs-nnet=`cat $egs_dir/num_jobs_nnet` from $egs_dir"
num_jobs_nnet=`cat $egs_dir/num_jobs_nnet` || exit 1;


# At least one hidden layer is required.
if ! [ $num_hidden_layers -ge 1 ]; then
  echo "Invalid num-hidden-layers $num_hidden_layers"
  exit 1
fi

# Stage -2: write the component configs and create the initial model 0.mdl.
if [ $stage -le -2 ]; then
  echo "$0: initializing neural net";

  # The splice component sees the base features plus the iVector block
  # (ivector_dim as written by get_lda.sh), which is passed through as the
  # constant (unspliced) part of the input.
  lda_mat=$dir/lda.mat
  tot_input_dim=$[$feat_dim+$ivector_dim]

  # Options shared by every online-preconditioned affine component below.
  online_preconditioning_opts="alpha=$alpha num-samples-history=$num_samples_history update-period=$update_period rank-in=$precondition_rank_in rank-out=$precondition_rank_out max-change-per-sample=$max_change_per_sample"

  # Parameter stddev for the hidden layers: 1/sqrt(hidden_layer_dim).
  stddev=`perl -e "print 1.0/sqrt($hidden_layer_dim);"`
  # nnet.config: the initial network with a single tanh hidden layer; the
  # output affine layer starts at zero (param-stddev=0 bias-stddev=0).
  cat >$dir/nnet.config <<EOF
SpliceComponent input-dim=$tot_input_dim left-context=$splice_width right-context=$splice_width const-component-dim=$ivector_dim
FixedAffineComponent matrix=$lda_mat
AffineComponentPreconditionedOnline input-dim=$lda_dim output-dim=$hidden_layer_dim $online_preconditioning_opts learning-rate=$initial_learning_rate param-stddev=$stddev bias-stddev=$bias_stddev
TanhComponent dim=$hidden_layer_dim
AffineComponentPreconditionedOnline input-dim=$hidden_layer_dim output-dim=$num_leaves $online_preconditioning_opts learning-rate=$initial_learning_rate param-stddev=0 bias-stddev=0
SoftmaxComponent dim=$num_leaves
EOF

  # to hidden.config it will write the part of the config corresponding to a
  # single hidden layer; we need this to add new layers.
  cat >$dir/hidden.config <<EOF
AffineComponentPreconditionedOnline input-dim=$hidden_layer_dim output-dim=$hidden_layer_dim $online_preconditioning_opts learning-rate=$initial_learning_rate param-stddev=$stddev bias-stddev=$bias_stddev
TanhComponent dim=$hidden_layer_dim
EOF
  # Build the acoustic model from the tree, the topology and the freshly
  # initialised raw network (piped in from nnet-init).
  $cmd $dir/log/nnet_init.log \
    nnet-am-init $alidir/tree $lang/topo "nnet-init $dir/nnet.config -|" \
    $dir/0.mdl || exit 1;
fi

# Stage -1: update 0.mdl in place using all alignment archives of $alidir.
if [ $stage -le -1 ]; then
  echo "Training transition probabilities and setting priors"
  $cmd $dir/log/train_trans.log \
    nnet-train-transitions $dir/0.mdl "ark:gunzip -c $alidir/ali.*.gz|" $dir/0.mdl \
    || exit 1;
fi

# Translate epochs into iterations: the learning rate is decayed over the first
# num_iters_reduce iterations and held constant for num_iters_extra more.
num_iters_reduce=$(( $num_epochs * $iters_per_epoch ))
num_iters_extra=$(( $num_epochs_extra * $iters_per_epoch ))
num_iters=$(( $num_iters_reduce + $num_iters_extra ))

echo "$0: Will train for $num_epochs + $num_epochs_extra epochs, equalling "
echo "$0: $num_iters_reduce + $num_iters_extra = $num_iters iterations, "
echo "$0: (while reducing learning rate) + (with constant learning rate)."

# Mix-up happens halfway between the iteration on which the last hidden layer
# has been added and the end of training.
finish_add_layers_iter=$(( $num_hidden_layers * $add_layers_period ))
mix_up_iter=$(( ($num_iters + $finish_add_layers_iter) / 2 ))

# Multi-threaded training is the default; with exactly one thread switch to the
# "-simple" trainer, which is the one that can use GPU code.
train_suffix="-parallel --num-threads=$num_threads"
if [ $num_threads -eq 1 ]; then
  train_suffix="-simple"
  if ! cuda-compiled; then
    echo "$0: WARNING: you are running with one thread but you have not compiled"
    echo "   for CUDA.  You may be running a setup optimized for GPUs.  If you have"
    echo "   GPUs and have nvcc installed, go to src/ and do ./configure; make"
  fi
fi


x=0
while [ $x -lt $num_iters ]; do
  if [ $x -ge 0 ] && [ $stage -le $x ]; then
    # Set off jobs doing some diagnostics, in the background.
    $cmd $dir/log/compute_prob_valid.$x.log \
      nnet-compute-prob $dir/$x.mdl ark:$egs_dir/valid_diagnostic.egs &
    $cmd $dir/log/compute_prob_train.$x.log \
      nnet-compute-prob $dir/$x.mdl ark:$egs_dir/train_diagnostic.egs &
    if [ $x -gt 0 ] && [ ! -f $dir/log/mix_up.$[$x-1].log ]; then
      $cmd $dir/log/progress.$x.log \
        nnet-show-progress --use-gpu=no $dir/$[$x-1].mdl $dir/$x.mdl \
         ark:$egs_dir/train_diagnostic.egs '&&' \
         nnet-am-info $dir/$x.mdl &
    fi

    echo "Training neural net (pass $x)"
    if [ $x -gt 0 ] && \
      [ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \
      [ $[($x-1) % $add_layers_period] -eq 0 ]; then
      mdl="nnet-init --srand=$x $dir/hidden.config - | nnet-insert $dir/$x.mdl - - |"
    else
      mdl=$dir/$x.mdl
    fi

    if [ $x -eq 0 ] || [ "$mdl" != "$dir/$x.mdl" ]; then
      # on iteration zero or when we just added a layer, use a smaller minibatch
      # size and just one job: the model-averaging doesn't seem to be helpful
      # when the model is changing too fast (i.e. it worsens the objective
      # function), and the smaller minibatch size will help to keep
      # the update stable.
      this_minibatch_size=$[$minibatch_size/2];
      do_average=false
    else
      this_minibatch_size=$minibatch_size
      do_average=true
    fi

    $cmd $parallel_opts JOB=1:$num_jobs_nnet $dir/log/train.$x.JOB.log \
      nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x \
      ark:$egs_dir/egs.JOB.$[$x%$iters_per_epoch].ark ark:- \| \
      nnet-train$train_suffix \
         --minibatch-size=$this_minibatch_size --srand=$x "$mdl" \
        ark:- $dir/$[$x+1].JOB.mdl \
      || exit 1;

    nnets_list=
    for n in `seq 1 $num_jobs_nnet`; do
      nnets_list="$nnets_list $dir/$[$x+1].$n.mdl"
    done

    learning_rate=`perl -e '($x,$n,$i,$f)=@ARGV; print ($x >= $n ? $f : $i*exp($x*log($f/$i)/$n));' $[$x+1] $num_iters_reduce $initial_learning_rate $final_learning_rate`;
    last_layer_learning_rate=`perl -e "print $learning_rate * $final_learning_rate_factor;"`;
    nnet-am-info $dir/$[$x+1].1.mdl > $dir/foo  2>/dev/null || exit 1
    nu=`cat $dir/foo | grep num-updatable-components | awk '{print $2}'`
    na=`cat $dir/foo | grep -v Fixed | grep AffineComponent | wc -l`
    # na is number of last updatable AffineComponent layer [one-based, counting only
    # updatable components.]
    # The last two layers will get this (usually lower) learning rate.
    lr_string="$learning_rate"
    for n in `seq 2 $nu`; do
      if [ $n -eq $na ] || [ $n -eq $[$na-1] ]; then lr=$last_layer_learning_rate;
      else lr=$learning_rate; fi
      lr_string="$lr_string:$lr"
    done

    if $do_average; then
      $cmd $dir/log/average.$x.log \
        nnet-am-average $nnets_list - \| \
        nnet-am-copy --learning-rates=$lr_string - $dir/$[$x+1].mdl || exit 1;
    else
      n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) {
          $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn";
          undef $logprob; while (<F>) { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } }
          close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob;
          $best_n=$n; } } print "$best_n\n"; ' $num_jobs_nnet $dir/log/train.$x.%d.log) || exit 1;
      [ -z "$n" ] && echo "Error getting best model" && exit 1;
      $cmd $dir/log/select.$x.log \
        nnet-am-copy --learning-rates=$lr_string $dir/$[$x+1].$n.mdl $dir/$[$x+1].mdl || exit 1;
    fi

    if $shrink && [ $[$x % $shrink_interval] -eq 0 ]; then
      mb=$[($num_frames_shrink+$num_threads-1)/$num_threads]
      $cmd $combine_parallel_opts $dir/log/shrink.$x.log \
        nnet-subset-egs --n=$num_frames_shrink --randomize-order=true --srand=$x \
          ark:$egs_dir/train_diagnostic.egs ark:-  \| \
        nnet-combine-fast --use-gpu=no --num-threads=$combine_num_threads \
          --verbose=3 --minibatch-size=$mb \
          $dir/$[$x+1].mdl ark:- $dir/$[$x+1].mdl || exit 1;
    else
      # On other iters, do nnet-am-fix which is much faster and has roughly
      # the same effect.
      nnet-am-fix $dir/$[$x+1].mdl $dir/$[$x+1].mdl 2>$dir/log/fix.$x.log
    fi

    if [ "$mix_up" -gt 0 ] && [ $x -eq $mix_up_iter ]; then
      # mix up.
      echo Mixing up from $num_leaves to $mix_up components
      $cmd $dir/log/mix_up.$x.log \
        nnet-am-mixup --min-count=10 --num-mixtures=$mix_up \
        $dir/$[$x+1].mdl $dir/$[$x+1].mdl || exit 1;
    fi
    rm $nnets_list
  fi
  x=$[$x+1]
done

# Now do combination.
# At the end, final.mdl will be a combination of the last e.g. 10 models.
nnets_list=()
if [ $num_iters_final -gt $num_iters_extra ]; then
  # Apply the clamp that the message announces; previously the value was only
  # printed, so more models than intended were handed to the combination.
  echo "Setting num_iters_final=$num_iters_extra"
  num_iters_final=$num_iters_extra
fi
# Index of the first model that takes part in the final combination.
start=$[$num_iters-$num_iters_final+1]
for x in `seq $start $num_iters`; do
  idx=$[$x-$start]
  # Only models produced after the mix-up iteration are combined.
  if [ $x -gt $mix_up_iter ]; then
    nnets_list[$idx]=$dir/$x.mdl # "nnet-am-copy --remove-dropout=true $dir/$x.mdl - |"
  fi
done

# Final stage of training proper: merge the models collected in nnets_list
# into final.mdl, then launch diagnostics on the result.
if [ $stage -le $num_iters ]; then
  echo "Doing final combination to produce final.mdl"
  # nnet-combine-fast is told not to use a GPU (--use-gpu=no): with many
  # models it can run out of GPU memory.  Multiple CPU threads are used
  # instead to speed it up (this isn't ideal...)
  num_egs=$(nnet-copy-egs ark:$egs_dir/combine.egs ark:/dev/null 2>&1 | tail -n 1 | awk '{print $NF}')
  # Spread the examples over the threads (rounding up), capped at 512.
  mb=$(( ($num_egs + $combine_num_threads - 1) / $combine_num_threads ))
  if [ $mb -gt 512 ]; then
    mb=512
  fi
  $cmd $combine_parallel_opts $dir/log/combine.log \
    nnet-combine-fast --use-gpu=no --num-threads=$combine_num_threads \
      --verbose=3 --minibatch-size=$mb "${nnets_list[@]}" ark:$egs_dir/combine.egs \
      $dir/final.mdl || exit 1;

  # Score the combined model on the same diagnostic subsets that were used for
  # the per-iteration compute_prob jobs, since different subsets would give
  # different probabilities.  Both jobs run in the background.
  for subset in valid train; do
    $cmd $dir/log/compute_prob_$subset.final.log \
      nnet-compute-prob $dir/final.mdl ark:$egs_dir/${subset}_diagnostic.egs &
  done
fi

# Stage num_iters+1: compute the average posterior of final.mdl over a subset
# of the training examples and use it to re-adjust the model's priors.
if [ $stage -le $[$num_iters+1] ]; then
  echo "Getting average posterior for purposes of adjusting the priors."
  # Note: this just uses CPUs, using a smallish subset of data.
  # Remove per-job vectors left over from an earlier run; the error output is
  # discarded because there may be none.
  rm $dir/post.*.vec 2>/dev/null
  # One job per training job: take $prior_subset_size examples from that job's
  # archive egs.JOB.0.ark, run them through the raw network extracted from
  # final.mdl, and sum the resulting rows into $dir/post.JOB.vec.
  $cmd JOB=1:$num_jobs_nnet $dir/log/get_post.JOB.log \
    nnet-subset-egs --n=$prior_subset_size ark:$egs_dir/egs.JOB.0.ark ark:- \| \
    nnet-compute-from-egs "nnet-to-raw-nnet $dir/final.mdl -|" ark:- ark:- \| \
    matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.JOB.vec || exit 1;

  sleep 3;  # make sure there is time for $dir/post.*.vec to appear.

  # Add the per-job sums together into a single vector.
  $cmd $dir/log/vector_sum.log \
   vector-sum $dir/post.*.vec $dir/post.vec || exit 1;

  # The per-job vectors are no longer needed once post.vec exists.
  rm $dir/post.*.vec;

  echo "Re-adjusting priors based on computed posteriors"
  # final.mdl is overwritten in place with the prior-adjusted model.
  $cmd $dir/log/adjust_priors.log \
    nnet-adjust-priors $dir/final.mdl $dir/post.vec $dir/final.mdl || exit 1;
fi

# Short pause before reporting completion; note that this is a fixed delay,
# not a wait on the background jobs.
sleep 2

echo Done

# Optional cleanup of training examples and intermediate models.
if $cleanup; then
  echo Cleaning up data
  # Only remove the examples if they live inside this experiment directory,
  # i.e. they were not supplied from elsewhere via $egs_dir.
  if [ $egs_dir == "$dir/egs" ]; then
    steps/nnet2/remove_egs.sh $dir/egs
  fi
  echo Removing most of the models
  for x in `seq 0 $num_iters`; do
    if [ $[$x%100] -ne 0 ] && [ $x -lt $[$num_iters-$num_iters_final+1] ]; then
       # delete all but every 100th model; don't delete the ones which combine to form the final model.
      rm $dir/$x.mdl
    fi
  done
fi


================================================
FILE: egs/steps/nnet2/update_nnet.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).
#           2013  Xiaohui Zhang
#           2013  Guoguo Chen
#           2013  Johns Hopkins University (Author: Jan Trmal)
#           2013  Vimal Manohar
# Apache 2.0.


# This script updates an existing neural network model without initializing it.

# Begin configuration section.
cmd=run.pl
num_epochs=20      # Number of epochs to train for; the number of iterations
                   # is worked out from this (num_epochs * iters_per_epoch).
num_iters_final=20 # Maximum number of final iterations to give to the
                   # optimization over the validation set.
learning_rates="0.0008:0.0008:0.0008:0" # Colon-separated list passed to
                   # nnet-am-copy --learning-rates when 0.mdl is created from
                   # the source model; it is not changed during training.

combine_regularizer=1.0e-14 # Small regularizer so that parameters won't go crazy.
minibatch_size=128 # by default use a smallish minibatch size for neural net
                   # training; this controls instability which would otherwise
                   # be a problem with multi-threaded update.  Note: it also
                   # interacts with the "preconditioned" update which generally
                   # works better with larger minibatch size, so it's not
                   # completely cost free.

samples_per_iter=200000 # each iteration of training, see this many samples
                        # per job.  This option is passed to get_egs.sh
num_jobs_nnet=16   # Number of neural net jobs to run in parallel.  This option
                   # is passed to get_egs.sh.
get_egs_stage=0    # Passed to get_egs.sh as its --stage.

shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
                # on each iter.  You could set it to 0 or to a large value for complete
                # randomization, but this would both consume memory and cause spikes in
                # disk I/O.  Smaller is easier on disk and memory but less random.  It's
                # not a huge deal though, as samples are anyway randomized right at the start.

stage=-5

io_opts="--max-jobs-run 5" # for jobs with a lot of I/O, limits the number running at one time.
                           # Passed on to get_egs.sh.
splice_width=4 # meaning +- 4 frames on each side for second LDA
# NOTE(review): randprune, alpha, max_change and mix_up are not referenced
# anywhere else in this script; presumably they are kept only so that the
# corresponding command-line options remain accepted -- confirm.
randprune=4.0 # speeds up LDA.
alpha=4.0
max_change=10.0
mix_up=0 # Number of components to mix up to (should be > #tree leaves, if
        # specified.)
num_threads=16
parallel_opts="--num-threads 16 --mem 1G" # by default we use 16 threads; this lets the queue know.
  # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.
cleanup=false
egs_dir=           # If supplied, existing examples are used and get_egs.sh is not called.
egs_opts=          # Extra options passed verbatim to get_egs.sh.
transform_dir=     # If supplied, overrides alidir
# End configuration section.


echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# Exactly five positional arguments are required; otherwise print the usage
# text and exit.  The defaults quoted below match the values set in the
# configuration section at the top of this script.
if [ $# != 5 ]; then
  echo "Usage: $0 [opts] <data> <lang> <ali-dir> <model-dir> <exp-dir>"
  echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet exp/tri4b_nnet"
  echo "See also the more recent script train_more.sh which requires the egs"
  echo "directory."
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --num-epochs <#epochs|20>                        # Number of epochs of main training"
  echo "                                                   # while reducing learning rate (determines #iterations, together"
  echo "                                                   # with --samples-per-iter and --num-jobs-nnet)"
  echo "  --num-jobs-nnet <num-jobs|16>                    # Number of parallel jobs to use for main neural net"
  echo "                                                   # training (will affect results as well as speed; try 8, 16)"
  echo "                                                   # Note: if you increase this, you may want to also increase"
  echo "                                                   # the learning rate."
  echo "  --num-threads <num-threads|16>                   # Number of parallel threads per job (will affect results"
  echo "                                                   # as well as speed; may interact with batch size; if you increase"
  echo "                                                   # this, you may want to decrease the batch size."
  echo "  --parallel-opts <opts|\"--num-threads 16 --mem 1G\">      # extra options to pass to e.g. queue.pl for processes that"
  echo "                                                   # use multiple threads... "
  echo "  --io-opts <opts|\"--max-jobs-run 5\">                       # Options given to e.g. queue.pl for jobs that do a lot of I/O."
  echo "  --minibatch-size <minibatch-size|128>            # Size of minibatch to process (note: product with --num-threads"
  echo "                                                   # should not get too large, e.g. >2k)."
  echo "  --samples-per-iter <#samples|200000>             # Number of samples of data to process per iteration, per"
  echo "                                                   # process."
  echo "  --splice-width <width|4>                         # Number of frames on each side to append for feature input"
  echo "                                                   # (note: we splice processed, typically 40-dimensional frames)"
  echo "  --num-iters-final <#iters|20>                    # Number of final iterations to give to nnet-combine-fast to "
  echo "                                                   # interpolate parameters (the weights are learned with a validation set)"
  echo "  --num-utts-subset <#utts|300>                    # Number of utterances in subsets used for validation and diagnostics"
  echo "                                                   # (the validation subset is held out from training)"
  echo "  --num-frames-diagnostic <#frames|4000>           # Number of frames used in computing (train,valid) diagnostics"
  echo "  --num-valid-frames-combine <#frames|10000>       # Number of frames used in getting combination weights at the"
  echo "                                                   # very end."
  echo "  --stage <stage|-5>                               # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."
  echo "  --transform-dir                                  # Directory with fMLLR transforms. Overrides alidir if provided."

  exit 1;
fi

# Positional arguments.
data=$1
lang=$2
alidir=$3
sdir=$4
dir=$5

# Bail out early if any of the required inputs is missing.
for required in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree; do
  if [ ! -f $required ]; then
    echo "$0: no such file $required"
    exit 1
  fi
done


# Number of pdfs of the model in the alignment directory.
num_leaves=$(gmm-info $alidir/final.mdl 2>/dev/null | awk '/number of pdfs/{print $NF}') || exit 1;

# Number of jobs that were used in the alignment directory; the data is split
# the same way and the count is recorded in the output directory.
nj=$(cat $alidir/num_jobs) || exit 1;
sdata=$data/split$nj
utils/split_data.sh $data $nj

mkdir -p $dir/log
echo $nj > $dir/num_jobs

# splice_opts is optional in the alignment directory, hence the silenced errors.
splice_opts=$(cat $alidir/splice_opts 2>/dev/null)
cp $alidir/splice_opts $dir 2>/dev/null
cp $alidir/tree $dir

# The phone set of the lang directory has to agree with both the alignment
# directory and the directory of the source model.
for phones_dir in $alidir $sdir; do
  utils/lang/check_phones_compatible.sh $lang/phones.txt $phones_dir/phones.txt || exit 1;
done
cp $lang/phones.txt $dir || exit 1;

# Transforms are taken from the alignment directory unless --transform-dir
# was given.
if [ -z "$transform_dir" ]; then
  transform_dir=$alidir
fi

# Dump training examples with get_egs.sh, unless the caller supplied an
# existing examples directory via --egs-dir.
if [ $stage -le -3 ] && [ -z "$egs_dir" ]; then
  echo "$0: calling get_egs.sh"
  steps/nnet2/get_egs.sh --samples-per-iter $samples_per_iter --num-jobs-nnet $num_jobs_nnet \
      --splice-width $splice_width --stage $get_egs_stage --cmd "$cmd" $egs_opts --io-opts "$io_opts" --transform-dir "$transform_dir" \
      $data $lang $alidir $dir || exit 1;
fi

# Quoted so that the test is well-formed whatever $egs_dir contains (the
# unquoted form only worked for the empty case by accident).
if [ -z "$egs_dir" ]; then
  egs_dir=$dir/egs
fi

iters_per_epoch=$(cat $egs_dir/iters_per_epoch) || exit 1;

# The number of parallel jobs is dictated by how the examples were dumped.
# Read it once and fail right away if it is missing, instead of first printing
# a warning that contains an empty value.
egs_num_jobs_nnet=$(cat $egs_dir/num_jobs_nnet) || exit 1;
if ! [ "$num_jobs_nnet" -eq "$egs_num_jobs_nnet" ]; then
  echo "$0: Warning: using --num-jobs-nnet=$egs_num_jobs_nnet from $egs_dir"
fi
num_jobs_nnet=$egs_num_jobs_nnet


# Create the starting model 0.mdl from the existing network, overriding its
# learning rates with --learning-rates.
if [ $stage -le -2 ]; then
  echo "$0: using existing neural net";
  source_model=$sdir/final.mdl
  # Fail here with a clear message rather than later inside a training job.
  [ ! -f $source_model ] && echo "$0: no such file $source_model" && exit 1;
  nnet-am-copy --learning-rates=${learning_rates} $source_model $dir/0.mdl || exit 1;
fi


num_iters=$[$num_epochs * $iters_per_epoch];

echo "$0: Will train for $num_epochs epochs, equalling $num_iters iterations"


# Choose the training binary: nnet-train-simple for a single thread,
# nnet-train-parallel otherwise.
if [ $num_threads -eq 1 ]; then
  train_suffix="-simple" # this enables us to use GPU code if
                         # we have just one thread.
else
  train_suffix="-parallel --num-threads=$num_threads"
fi

# Main training loop: iteration x reads $dir/x.mdl and writes $dir/(x+1).mdl.
for ((x = 0; x < num_iters; x++)); do
  # Iterations below --stage are assumed to be done already; skip them.
  [ $stage -le $x ] || continue

  next=$(($x + 1))

  # Set off jobs doing some diagnostics, in the background.
  for subset in valid train; do
    $cmd $dir/log/compute_prob_$subset.$x.log \
      nnet-compute-prob $dir/$x.mdl ark:$egs_dir/${subset}_diagnostic.egs &
  done

  # From the second iteration on, also report progress relative to the
  # previous model.
  if [ $x -gt 0 ]; then
    $cmd $dir/log/progress.$x.log \
      nnet-show-progress --use-gpu=no $dir/$(($x - 1)).mdl $dir/$x.mdl ark:$egs_dir/train_diagnostic.egs &
  fi

  echo "Training neural net (pass $x)"
  mdl=$dir/$x.mdl

  # Each parallel job shuffles its own archive for this iteration and trains
  # from the current model, writing a per-job model.
  $cmd $parallel_opts JOB=1:$num_jobs_nnet $dir/log/train.$x.JOB.log \
    nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x \
    ark:$egs_dir/egs.JOB.$(($x % $iters_per_epoch)).ark ark:- \| \
    nnet-train$train_suffix \
       --minibatch-size=$minibatch_size --srand=$x "$mdl" \
      ark:- $dir/$next.JOB.mdl \
    || exit 1;

  # Collect the per-job models, average them into the next model, then
  # delete the per-job files.
  nnets_list=
  for ((n = 1; n <= num_jobs_nnet; n++)); do
    nnets_list+=" $dir/$next.$n.mdl"
  done

  $cmd $dir/log/average.$x.log \
    nnet-am-average $nnets_list $dir/$next.mdl || exit 1;

  rm $nnets_list
done

# Now do combination.
# At the end, final.mdl will be a combination of the last e.g. 10 models.
nnets_list=()
if [ $num_iters_final -gt $num_iters ]; then
  # Apply the clamp that the message announces.  Without the assignment,
  # 'start' below drops to zero or below and the list contains models that
  # were never produced by the training loop (e.g. $dir/-3.mdl).
  echo "Setting num_iters_final=$num_iters"
  num_iters_final=$num_iters
fi
# Index of the first model that takes part in the final combination.
start=$[$num_iters-$num_iters_final+1]
for x in `seq $start $num_iters`; do
  idx=$[$x-$start]
  nnets_list[$idx]=$dir/$x.mdl # "nnet-am-copy --remove-dropout=true $dir/$x.mdl - |"
done

# Combine the models collected in nnets_list into final.mdl.
if [ $stage -le $num_iters ]; then
  # nnet-combine-fast is run with --use-gpu=no, as with many models it can
  # give an out-of-memory error on a GPU; to speed it up at least 8 threads
  # are used (this isn't ideal...)
  this_num_threads=$num_threads
  if [ $this_num_threads -lt 8 ]; then
    this_num_threads=8
  fi
  num_egs=$(nnet-copy-egs ark:$egs_dir/combine.egs ark:/dev/null 2>&1 | tail -n 1 | awk '{print $NF}')
  # Spread the examples over the threads (rounding up), capped at 512.
  mb=$(( ($num_egs + $this_num_threads - 1) / $this_num_threads ))
  if [ $mb -gt 512 ]; then
    mb=512
  fi
  # A large --initial-model value makes the combination start from the
  # average of all the models.  Starting from a single model is to be avoided:
  # because these nonlinearities are invariant to scaling, the Fisher matrix
  # that nnet-combine-fast uses for scaling would get zero diagonal entries,
  # and after flooring and inversion the chosen initial model would end up
  # with much higher learning rates than the others, which prevents the
  # optimization from working well.
  $cmd $parallel_opts $dir/log/combine.log \
    nnet-combine-fast --initial-model=100000 --num-lbfgs-iters=40 --use-gpu=no \
      --num-threads=$this_num_threads --regularizer=$combine_regularizer \
      --verbose=3 --minibatch-size=$mb "${nnets_list[@]}" ark:$egs_dir/combine.egs \
      $dir/final.mdl || exit 1;
fi

# Score the final, combined model on the same diagnostic subsets that the
# per-iteration compute_prob jobs used, because different subsets would give
# different probabilities.  Both jobs run in the background.
for subset in valid train; do
  $cmd $dir/log/compute_prob_$subset.final.log \
    nnet-compute-prob $dir/final.mdl ark:$egs_dir/${subset}_diagnostic.egs &
done

sleep 2

echo Done

if $cleanup; then
  echo Cleaning up data
  # Examples are only removed when they live inside this experiment directory.
  if [ $egs_dir == "$dir/egs" ]; then
    echo Removing training examples
    steps/nnet2/remove_egs.sh $dir/egs
  fi
  echo Removing most of the models
  # Models from this iteration onwards went into the final combination and
  # are kept; so is every 10th model before that.
  first_combined=$(($num_iters - $num_iters_final + 1))
  for ((x = 0; x <= num_iters; x++)); do
    if [ $(($x % 10)) -eq 0 ] || [ $x -ge $first_combined ]; then
      continue
    fi
    rm $dir/$x.mdl
  done
fi


================================================
FILE: egs/steps/nnet3/adjust_priors.sh
================================================
#!/usr/bin/env bash

. ./path.sh

# This script computes the DNN output averaged over a small subset of
# training egs and stores it in post.$iter.vec.
# This is used for the purpose of adjusting the nnet priors.
# When --use-raw-nnet is false, then the computed priors is added into the
# nnet model; hence the term adjust priors.
# When --use-raw-nnet is true, the computed priors is not added into the
# nnet model and left in the file post.$iter.vec.

cmd=run.pl
prior_subset_size=20000   # 20k samples per job, for computing priors.
num_jobs_compute_prior=10 # these are single-threaded, run on CPU.
use_gpu=false             # if true, we run on GPU.
egs_type=egs              # Compute from $egs_type.*.ark in $egs_dir
                          # If --egs-type is degs, then the program
                          # nnet3-discriminative-compute-from-egs is used
                          # instead of nnet3-compute-from-egs.
use_raw_nnet=false        # If raw nnet, the averaged posterior is computed
                          # and stored in post.$iter.vec; but there is no
                          # adjusting of priors
minibatch_size=256        # Passed as --minibatch-size when merging the egs.
iter=final                # Model to use: $dir/$iter.mdl (or $dir/$iter.raw).

. utils/parse_options.sh

echo "$0 $@"  # Print the command line for logging

# Both positional arguments are mandatory.  The example shows both of them;
# it used to list only the experiment directory.
if [ $# -ne 2 ]; then
  echo "Usage: $0 [opts] <exp-dir> <egs-dir>"
  echo " e.g.: $0 exp/nnet3_sad_snr/tdnn_train_100k_whole_1k_splice2_2_relu500 exp/nnet3_sad_snr/tdnn_train_100k_whole_1k_splice2_2_relu500/egs"
  exit 1
fi

dir=$1
egs_dir=$2

# Options for the compute binary and for the queue, depending on --use-gpu.
if $use_gpu; then
  prior_gpu_opt="--use-gpu=yes"
  prior_queue_opt="--gpu 1"
else
  prior_gpu_opt="--use-gpu=no"
  prior_queue_opt=""
fi

# The first archive and the archive count must both be present.
for f in $egs_dir/$egs_type.1.ark $egs_dir/info/num_archives; do
  if [ ! -f $f ]; then
    echo "$f not found"
    exit 1
  fi
done

# A raw network is read directly; an acoustic model is converted to a raw
# network on the fly through a piped command.
if $use_raw_nnet; then
  model=$dir/$iter.raw
else
  model="nnet3-am-copy --raw=true $dir/$iter.mdl - |"
fi

# Remove per-job vectors left over from an earlier run.
rm -f $dir/post.$iter.*.vec 2>/dev/null

# The message now names the file that is actually being read (it used to
# mention info/frames_per_eg).
num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: could not read $egs_dir/info/num_archives"; exit 1; }
# There is one job per archive at most.
if [ $num_jobs_compute_prior -gt $num_archives ]; then
  num_jobs_compute_prior=$num_archives
fi


# The regular and the discriminative ("degs") case run the same pipeline;
# they differ only in the prefix of the program names and in the archive
# that is read.
if [ $egs_type != "degs" ]; then
  egs_prog=nnet3
  egs_rspecifier=ark:$egs_dir/$egs_type.JOB.ark
else
  egs_prog=nnet3-discriminative
  egs_rspecifier=ark:$egs_dir/degs.JOB.ark
fi

# Per job: take a subset of the archive, merge it into minibatches, compute
# the network output with --apply-exp=true and sum it into
# $dir/post.$iter.JOB.vec.
$cmd JOB=1:$num_jobs_compute_prior $prior_queue_opt $dir/log/get_post.$iter.JOB.log \
  ${egs_prog}-copy-egs $egs_rspecifier ark:- \| \
  ${egs_prog}-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \
  ${egs_prog}-merge-egs --minibatch-size=$minibatch_size ark:- ark:- \| \
  ${egs_prog}-compute-from-egs $prior_gpu_opt --apply-exp=true \
  "$model" ark:- ark:- \| \
  matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$iter.JOB.vec || exit 1;

sleep 3;  # make sure there is time for $dir/post.$iter.*.vec to appear.

# Add the per-job sums together into a single vector.
$cmd $dir/log/vector_sum.$iter.log \
  vector-sum $dir/post.$iter.*.vec $dir/post.$iter.vec || exit 1;

# For an acoustic model, write a prior-adjusted copy as ${iter}_adj.mdl.  A
# failure is now propagated; previously it was masked by the exit status of
# the trailing 'rm -f', so callers could not notice the missing output.
if ! $use_raw_nnet; then
  run.pl $dir/log/adjust_priors.$iter.log \
    nnet3-am-adjust-priors $dir/$iter.mdl $dir/post.$iter.vec $dir/${iter}_adj.mdl || exit 1;
fi

rm -f $dir/post.$iter.*.vec;


================================================
FILE: egs/steps/nnet3/align.sh
================================================
#!/usr/bin/env bash
# Copyright 2012  Brno University of Technology (Author: Karel Vesely)
#           2013  Johns Hopkins University (Author: Daniel Povey)
#           2015  Vijayaditya Peddinti
#           2016  Vimal Manohar
# Apache 2.0

# Computes training alignments using nnet3 DNN
# Warning: this script uses GPUs by default, and this is generally not
# an efficient use of GPUs. Set --use-gpu false to make it run on CPU.

# Begin configuration section.
nj=4
cmd=run.pl
# Begin configuration.
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
beam=10
retry_beam=40
iter=final
use_gpu=true
frames_per_chunk=50
extra_left_context=0
extra_right_context=0
extra_left_context_initial=-1
extra_right_context_final=-1
online_ivector_dir=
graphs_scp=
# End configuration options.

echo "$0 $@"  # Print the command line for logging

# Pick up the environment if a path.sh is present, then parse the options.
if [ -f path.sh ]; then
  . ./path.sh
fi
. parse_options.sh || exit 1;

# Four positional arguments are required.
if [ $# -ne 4 ]; then
  cat <<EOF
Usage: $0 <data-dir> <lang-dir> <src-dir> <align-dir>
e.g.: $0 data/train data/lang exp/nnet4 exp/nnet4_ali
Warning: this script uses GPUs by default, and this is generally not
an efficient use of GPUs. Set --use-gpu false to make it run on CPU.

main options (for others, see top of script file)
  --config <config-file>                           # config containing options
  --nj <nj>                                        # number of parallel jobs
  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs.
EOF
  exit 1;
fi

data=$1
lang=$2
srcdir=$3
dir=$4

oov=$(cat $lang/oov.int) || exit 1;
mkdir -p $dir/log
echo $nj > $dir/num_jobs

# Split the data directory, unless an existing split is newer than feats.scp.
sdata=$data/split${nj}
if ! [[ -d $sdata && $data/feats.scp -ot $sdata ]]; then
  split_data.sh $data $nj || exit 1;
fi

# CPU settings are the baseline; --use-gpu true switches both the queue
# option and the option given to the alignment binary.
queue_opt=""
gpu_opt="--use-gpu=no"
if $use_gpu; then
  queue_opt="--gpu 1"
  gpu_opt="--use-gpu=wait"
fi

# With online i-vectors, check that they are compatible with the model and
# add their files to the list of required inputs.
extra_files=
if [ -n "$online_ivector_dir" ]; then
  steps/nnet2/check_ivectors_compatible.sh $srcdir $online_ivector_dir || exit 1
  extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"
fi

for required in $srcdir/tree $srcdir/${iter}.mdl $data/feats.scp $lang/L.fst $extra_files; do
  if [ ! -f $required ]; then
    echo "$0: no such file $required"
    exit 1
  fi
done

# Keep a copy of the tree and the model next to the alignments.
cp $srcdir/tree $srcdir/${iter}.mdl $dir || exit 1;

utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;
## Set up features.  Note: these are different from the normal features
## because we have one rspecifier that has the features for the entire
## training set, not separate ones for each batch.
echo "$0: feature type is raw"

# cmvn_opts is optional in the source directory, hence the silenced errors.
cmvn_opts=$(cat $srcdir/cmvn_opts 2>/dev/null)
cp $srcdir/cmvn_opts $dir 2>/dev/null

feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"

ivector_opts=
if [ -n "$online_ivector_dir" ]; then
  ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1;
  ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period"
fi

echo "$0: aligning data in $data using model from $srcdir, putting alignments in $dir"

# The value scale_opts has when the caller did not override it.
default_scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"

frame_subsampling_opt=
if [ -f $srcdir/frame_subsampling_factor ]; then
  # e.g. for 'chain' systems
  frame_subsampling_factor=$(cat $srcdir/frame_subsampling_factor)
  frame_subsampling_opt="--frame-subsampling-factor=$frame_subsampling_factor"
  cp $srcdir/frame_subsampling_factor $dir
  # Only a hint: the script carries on after a one-second pause.
  if [ "$frame_subsampling_factor" -gt 1 ] && [ "$scale_opts" == "$default_scale_opts" ]; then
    echo "$0: frame-subsampling-factor is not 1 (so likely a chain system),"
    echo "...  but the scale opts are the defaults.  You probably want"
    echo "--scale-opts '--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0'"
    sleep 1
  fi
fi

# Input for graph compilation: word-id transcripts by default, or the
# caller's graphs (filtered to this job's utterances) if --graphs-scp is given.
if [ -z "$graphs_scp" ]; then
  prog=compile-train-graphs
  tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|";
else
  if [ ! -f $graphs_scp ]; then
    echo "Could not find graphs $graphs_scp"
    exit 1
  fi
  prog=compile-train-graphs-fsts
  tra="scp:utils/filter_scp.pl $sdata/JOB/utt2spk $graphs_scp |"
fi

# Per job: compile the training graphs with $prog and pipe them straight into
# nnet3-align-compiled, which writes gzipped alignments to $dir/ali.JOB.gz.
# The tree is read from the copy in $dir, the model from $srcdir.
$cmd $queue_opt JOB=1:$nj $dir/log/align.JOB.log \
  $prog --read-disambig-syms=$lang/phones/disambig.int $dir/tree \
  $srcdir/${iter}.mdl  $lang/L.fst "$tra" ark:- \| \
  nnet3-align-compiled $scale_opts $ivector_opts $frame_subsampling_opt \
  --frames-per-chunk=$frames_per_chunk \
  --extra-left-context=$extra_left_context \
  --extra-right-context=$extra_right_context \
  --extra-left-context-initial=$extra_left_context_initial \
  --extra-right-context-final=$extra_right_context_final \
  $gpu_opt --beam=$beam --retry-beam=$retry_beam \
  $srcdir/${iter}.mdl ark:- "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;

# Run the alignment diagnostics; its exit status is not checked here.
steps/diagnostic/analyze_alignments.sh --cmd "$cmd" $lang $dir

echo "$0: done aligning data."


================================================
FILE: egs/steps/nnet3/align_lats.sh
================================================
#!/usr/bin/env bash
# Copyright 2012  Brno University of Technology (Author: Karel Vesely)
#           2013  Johns Hopkins University (Author: Daniel Povey)
#           2015  Vijayaditya Peddinti
#           2016  Vimal Manohar
#           2017  Pegah Ghahremani
# Apache 2.0

# Computes training alignments using nnet3 DNN, with output to lattices.

# Begin configuration section.
nj=4
cmd=run.pl
stage=-1   # Stages 0, 1 and 2 below are graph compilation, lattice
           # generation and (optionally) alignment extraction.
# Begin configuration.
scale_opts="--transition-scale=1.0 --self-loop-scale=0.1"  # Given to the graph compilation program.
acoustic_scale=0.1  # Used for lattice generation and for lattice-best-path.
beam=20             # Used as both --beam and --lattice-beam.
iter=final
frames_per_chunk=50
extra_left_context=0
extra_right_context=0
extra_left_context_initial=-1
extra_right_context_final=-1
online_ivector_dir=
graphs_scp=
generate_ali_from_lats=false # If true, alignments are generated from the lattices.
# End configuration options.

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh # source the path.
. parse_options.sh || exit 1;

# Four positional arguments are required.
if [ $# != 4 ]; then
   echo "Usage: $0 <data-dir> <lang-dir> <src-dir> <align-dir>"
   echo "e.g.: $0 data/train data/lang exp/nnet4 exp/nnet4_ali"
   echo "main options (for others, see top of script file)"
   echo "  --config <config-file>                           # config containing options"
   echo "  --nj <nj>                                        # number of parallel jobs"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   exit 1;
fi

data=$1
lang=$2
srcdir=$3
dir=$4

oov=$(cat $lang/oov.int) || exit 1;
mkdir -p $dir/log
echo $nj > $dir/num_jobs

# Split the data directory, unless an existing split is newer than feats.scp.
sdata=$data/split${nj}
if ! [[ -d $sdata && $data/feats.scp -ot $sdata ]]; then
  split_data.sh $data $nj || exit 1;
fi

# With online i-vectors, check that they are compatible with the model and
# add their files to the list of required inputs.
extra_files=
if [ -n "$online_ivector_dir" ]; then
  steps/nnet2/check_ivectors_compatible.sh $srcdir $online_ivector_dir || exit 1
  extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"
fi

for required in $srcdir/tree $srcdir/${iter}.mdl $data/feats.scp $lang/L.fst $extra_files; do
  if [ ! -f $required ]; then
    echo "$0: no such file $required"
    exit 1
  fi
done

# Keep a copy of the tree and the model next to the lattices.
cp $srcdir/tree $srcdir/${iter}.mdl $dir || exit 1;

utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;
## Set up features.  Note: these are different from the normal features
## because we have one rspecifier that has the features for the entire
## training set, not separate ones for each batch.
echo "$0: feature type is raw"

# cmvn_opts is optional in the source directory, hence the silenced errors.
cmvn_opts=$(cat $srcdir/cmvn_opts 2>/dev/null)
cp $srcdir/cmvn_opts $dir 2>/dev/null

feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"

ivector_opts=
if [ -n "$online_ivector_dir" ]; then
  ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1;
  ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period"
fi

echo "$0: aligning data in $data using model from $srcdir, putting alignments in $dir"

# If the source model was trained with frame subsampling, pass the factor on
# to the decoder and record it in the output directory.
frame_subsampling_opt=
if [ -f $srcdir/frame_subsampling_factor ]; then
  # e.g. for 'chain' systems
  frame_subsampling_factor=$(cat $srcdir/frame_subsampling_factor)
  frame_subsampling_opt="--frame-subsampling-factor=$frame_subsampling_factor"
  cp $srcdir/frame_subsampling_factor $dir
  if [[ $frame_subsampling_factor -gt 1 ]]; then
    # Assume a chain system, check argument sanity.
    # NOTE(review): despite the "ERROR" wording, this branch only prints the
    # message; the script carries on with the options as given.
    if [[ ! ($scale_opts == *--self-loop-scale=1.0* &&
             $scale_opts == *--transition-scale=1.0* &&
             $acoustic_scale = '1.0') ]]; then
      echo "$0: ERROR: frame-subsampling-factor is not 1, assuming a chain system."
      echo "... You should pass the following options to this script:"
      echo "  --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0'" \
           "--acoustic_scale 1.0"
    fi
  fi
fi

# Input for graph compilation: the caller's graphs (filtered to this job's
# utterances) if --graphs-scp is given, otherwise word-id transcripts.
if [ ! -z "$graphs_scp" ]; then
  if [ ! -f $graphs_scp ]; then
    echo "Could not find graphs $graphs_scp" && exit 1
  fi
  tra="scp:utils/filter_scp.pl $sdata/JOB/utt2spk $graphs_scp |"
  prog=compile-train-graphs-fsts
else
  tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|";
  prog=compile-train-graphs
fi

if [ $stage -le 0 ]; then
  ## because nnet3-latgen-faster doesn't support adding the transition-probs to the
  ## graph itself, we need to bake them into the compiled graphs.  This means we can't reuse previously compiled graphs,
  ## because the other scripts write them without transition probs.
  $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
    $prog --read-disambig-syms=$lang/phones/disambig.int \
    $scale_opts \
    $dir/tree $srcdir/${iter}.mdl  $lang/L.fst "$tra" \
    "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1
fi

if [ $stage -le 1 ]; then
  # Warning: nnet3-latgen-faster doesn't support a retry-beam so you may get more
  # alignment errors (however, it does have a default min-active=200 so this
  # will tend to reduce alignment errors).
  # --allow_partial=false makes sure we reach the end of the decoding graph.
  # --word-determinize=false makes sure we retain the alternative pronunciations of
  #   words (including alternatives regarding optional silences).
  #  --lattice-beam=$beam keeps all the alternatives that were within the beam,
  #    it means we do no pruning of the lattice (lattices from a training transcription
  #    will be small anyway).
  $cmd JOB=1:$nj $dir/log/generate_lattices.JOB.log \
    nnet3-latgen-faster --acoustic-scale=$acoustic_scale $ivector_opts $frame_subsampling_opt \
    --frames-per-chunk=$frames_per_chunk \
    --extra-left-context=$extra_left_context \
    --extra-right-context=$extra_right_context \
    --extra-left-context-initial=$extra_left_context_initial \
    --extra-right-context-final=$extra_right_context_final \
    --beam=$beam --lattice-beam=$beam \
    --allow-partial=false --word-determinize=false \
    $srcdir/${iter}.mdl "ark:gunzip -c $dir/fsts.JOB.gz |" \
    "$feats" "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1;
fi

if [ $stage -le 2 ] && $generate_ali_from_lats; then
  # If generate_ali_from_lats is true, ali.JOB.gz is written into $dir, next to
  # the lattices it is derived from.  lattice-best-path reads lat.JOB.gz; its
  # first output is discarded (ark:/dev/null) and only the second output is
  # kept and gzipped as ali.JOB.gz.
  $cmd JOB=1:$nj $dir/log/generate_alignments.JOB.log \
    lattice-best-path --acoustic-scale=$acoustic_scale "ark:gunzip -c $dir/lat.JOB.gz |" \
    ark:/dev/null "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi
echo "$0: done generating lattices from training transcripts."


================================================
FILE: egs/steps/nnet3/chain/align_lats.sh
================================================
#!/bin/bash
# Copyright 2012  Brno University of Technology (Author: Karel Vesely)
#           2013  Johns Hopkins University (Author: Daniel Povey)
#           2015  Vijayaditya Peddinti
#           2016  Vimal Manohar
#           2017  Pegah Ghahremani
# Apache 2.0

# Computes training alignments using nnet3 DNN, with output to lattices.

# Begin configuration section.
# These are defaults; parse_options.sh (sourced below) lets the caller
# override them from the command line.
nj=4  # number of parallel jobs (used as JOB=1:$nj)
cmd=run.pl  # program used to launch the jobs
stage=-1  # a stage N below runs only when $stage <= N
# Begin configuration.
scale_opts="--transition-scale=1.0 --self-loop-scale=1.0"  # passed to the graph compiler in stage 0
acoustic_scale=1.0  # --acoustic-scale given to nnet3-latgen-faster
post_decode_acwt=10.0  # --acoustic-scale given to lattice-scale on the decoder output
beam=20  # used for both --beam and --lattice-beam
iter=final  # the model read is $srcdir/${iter}.mdl
frames_per_chunk=50  # --frames-per-chunk for nnet3-latgen-faster
extra_left_context=0  # --extra-left-context for nnet3-latgen-faster
extra_right_context=0  # --extra-right-context for nnet3-latgen-faster
extra_left_context_initial=-1  # --extra-left-context-initial for nnet3-latgen-faster
extra_right_context_final=-1  # --extra-right-context-final for nnet3-latgen-faster
online_ivector_dir=  # if non-empty, online i-vectors are read from this directory
graphs_scp=  # if non-empty, scp of graphs to use instead of compiling from the transcripts
# End configuration options.

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh # source the path.
. parse_options.sh || exit 1;

if [ $# != 4 ]; then
   echo "Usage: $0 <data-dir> <lang-dir> <src-dir> <align-dir>"
   echo "e.g.: $0 data/train data/lang exp/nnet4 exp/nnet4_ali"
   echo "main options (for others, see top of script file)"
   echo "  --config <config-file>                           # config containing options"
   echo "  --nj <nj>                                        # number of parallel jobs"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   exit 1;
fi

data=$1
lang=$2
srcdir=$3
dir=$4

oov=`cat $lang/oov.int` || exit 1;
mkdir -p $dir/log
echo $nj > $dir/num_jobs
sdata=$data/split${nj}
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || \
   split_data.sh $data $nj || exit 1;

extra_files=
if [ ! -z "$online_ivector_dir" ]; then
  steps/nnet2/check_ivectors_compatible.sh $srcdir $online_ivector_dir || exit 1
  extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"
fi

for f in $srcdir/tree $srcdir/${iter}.mdl $data/feats.scp $lang/L.fst $extra_files; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

cp $srcdir/{tree,${iter}.mdl} $dir || exit 1;

utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;
## Set up features.  Note: these are different from the normal features
## because we have one rspecifier that has the features for the entire
## training set, not separate ones for each batch.
echo "$0: feature type is raw"

cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
cp $srcdir/cmvn_opts $dir 2>/dev/null

feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"

ivector_opts=
if [ ! -z "$online_ivector_dir" ]; then
  ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1;
  ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period"
fi

echo "$0: aligning data in $data using model from $srcdir, putting alignments in $dir"

# Pick up the frame-subsampling factor of the source model, if it has one, and
# pass it on to the decoder.  The file is also copied to $dir so that the
# output directory records the factor.
frame_subsampling_opt=
if [ -f $srcdir/frame_subsampling_factor ]; then
  # e.g. for 'chain' systems
  frame_subsampling_factor=$(cat $srcdir/frame_subsampling_factor)
  frame_subsampling_opt="--frame-subsampling-factor=$frame_subsampling_factor"
  cp $srcdir/frame_subsampling_factor $dir
  # The defaults of this script already use --self-loop-scale=1.0, so the
  # warning can only be triggered by an explicit --scale-opts.  Match the
  # offending option as a substring so that option order does not matter.
  if [ "$frame_subsampling_factor" -gt 1 ] && \
     [[ "$scale_opts" == *--self-loop-scale=0.1* ]]; then
    echo "$0: frame-subsampling-factor is not 1 (so likely a chain system),"
    echo "...  but --scale-opts uses --self-loop-scale=0.1 (the non-chain value).  You probably want"
    echo "--scale-opts '--transition-scale=1.0 --self-loop-scale=1.0'"
    sleep 1
  fi
fi

if [ ! -z "$graphs_scp" ]; then
  if [ ! -f $graphs_scp ]; then
    echo "Could not find graphs $graphs_scp" && exit 1
  fi
  tra="scp:utils/filter_scp.pl $sdata/JOB/utt2spk $graphs_scp |"
  prog=compile-train-graphs-fsts
else
  tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|";
  prog=compile-train-graphs
fi

if [ $stage -le 0 ]; then
  ## because nnet3-latgen-faster doesn't support adding the transition-probs to the
  ## graph itself, we need to bake them into the compiled graphs.  This means we can't reuse previously compiled graphs,
  ## because the other scripts write them without transition probs.
  $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
    $prog --read-disambig-syms=$lang/phones/disambig.int \
    $scale_opts \
    $dir/tree $srcdir/${iter}.mdl  $lang/L.fst "$tra" \
    "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1
fi

if [ $stage -le 1 ]; then
  # Decode each utterance against its own training graph (fsts.JOB.gz from
  # stage 0) and write the result as a lattice.
  # Warning: nnet3-latgen-faster doesn't support a retry-beam so you may get more
  # alignment errors (however, it does have a default min-active=200 so this
  # will tend to reduce alignment errors).
  # --allow-partial=false makes sure we reach the end of the decoding graph.
  # --word-determinize=false makes sure we retain the alternative pronunciations of
  #   words (including alternatives regarding optional silences).
  #  --lattice-beam=$beam keeps all the alternatives that were within the beam,
  #    it means we do no pruning of the lattice (lattices from a training transcription
  #    will be small anyway).
  # The decoder output is piped through lattice-scale with
  # --acoustic-scale=$post_decode_acwt before being gzipped into lat.JOB.gz.
  $cmd JOB=1:$nj $dir/log/generate_lattices.JOB.log \
    nnet3-latgen-faster --acoustic-scale=$acoustic_scale $ivector_opts $frame_subsampling_opt \
    --frames-per-chunk=$frames_per_chunk \
    --extra-left-context=$extra_left_context \
    --extra-right-context=$extra_right_context \
    --extra-left-context-initial=$extra_left_context_initial \
    --extra-right-context-final=$extra_right_context_final \
    --beam=$beam --lattice-beam=$beam \
    --allow-partial=false --word-determinize=false \
    $srcdir/${iter}.mdl "ark:gunzip -c $dir/fsts.JOB.gz |" \
    "$feats" "ark:|lattice-scale --acoustic-scale=$post_decode_acwt ark:- ark:- | gzip -c >$dir/lat.JOB.gz" || exit 1;
fi

echo "$0: done generating lattices from training transcripts."


================================================
FILE: egs/steps/nnet3/chain/build_tree.sh
================================================
#!/usr/bin/env bash
# Copyright 2012-2015  Johns Hopkins University (Author: Daniel Povey).
#  Apache 2.0.


# This script builds a tree for use in the 'chain' systems (although the script
# itself is pretty generic and doesn't use any 'chain' binaries).  This is just
# like the first stages of a standard system, like 'train_sat.sh', except it
# does 'convert-ali' to convert alignments to a monophone topology just created
# from the 'lang' directory (in case the topology is different from where you
# got the system's alignments from), and it stops after the tree-building and
# model-initialization stage, without re-estimating the Gaussians or training
# the transitions.


# Begin configuration section.
stage=-5
exit_stage=-100 # you can use this to require it to exit at the
                # beginning of a specific stage.  Not all values are
                # supported.
cmd=run.pl
context_opts=  # e.g. set this to "--context-width 5 --central-position 2" for quinphone.
cluster_thresh=-1  # for build-tree control final bottom-up clustering of leaves
frame_subsampling_factor=1
alignment_subsampling_factor=
leftmost_questions_truncate=-1  # note: this option is deprecated and has no effect
tree_stats_opts=
cluster_phones_opts=
repeat_frames=false
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh
. parse_options.sh || exit 1;

if [ $# != 5 ]; then
  echo "Usage: $0 <#leaves> <data> <lang> <ali-dir> <exp-dir>"
  echo " e.g.: $0 --frame-subsampling-factor 3 \\"
  echo "   --context-opts '--context-width=2 --central-position=1'  \\"
  echo "    3500 data/train_si84 data/lang_chain exp/tri3b_ali_si284_sp exp/chain/tree_a_sp"
  echo "Main options (for others, see top of script file)"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --config <config-file>                           # config containing options"
  echo "  --stage <stage>                                  # stage to do partial re-run from."
  echo "  --repeat-frames <true|false>                     # Only affects alignment conversion at"
  echo "                                                   # the end. If true, generate an "
  echo "                                                   # alignment using the frame-subsampled "
  echo "                                                   # topology that is repeated "
  echo "                                                   # --frame-subsampling-factor times "
  echo "                                                   # and interleaved, to be the same "
  echo "                                                   # length as the original alignment "
  echo "                                                   # (useful for cross-entropy training "
  echo "                                                   # of reduced frame rate systems)."
  echo "  --context-opts <option-string>                   # Options controlling phonetic context;"
  echo "                                                   # we suggest '--context-width=2 --central-position=1',"
  echo "                                                   # which is left bigram."
  echo "  --frame-subsampling-factor <factor>              # Factor (e.g. 3) controlling frame subsampling"
  echo "                                                   # at the neural net output, so the frame rate at"
  echo "                                                   # the output is less than at the input."
  exit 1;
fi

numleaves=$1
data=$2
lang=$3
alidir=$4
dir=$5

# Make sure the inputs this script reads exist before doing any work.  The
# message is prefixed with $0 so that it names this script (it used to say
# "train_sat.sh").
for f in $data/feats.scp $lang/phones.txt $alidir/final.mdl $alidir/ali.1.gz; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

oov=`cat $lang/oov.int`
nj=`cat $alidir/num_jobs` || exit 1;
silphonelist=`cat $lang/phones/silence.csl`
ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1;
splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options.
cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null`
delta_opts=`cat $alidir/delta_opts 2>/dev/null`

mkdir -p $dir/log
cp $alidir/splice_opts $dir 2>/dev/null # frame-splicing options.
cp $alidir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option.
cp $alidir/delta_opts $dir 2>/dev/null # delta option.

utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

echo $nj >$dir/num_jobs
if [ -f $alidir/per_utt ]; then
  sdata=$data/split${nj}utt
  utils/split_data.sh --per-utt $data $nj
else
  sdata=$data/split$nj
  utils/split_data.sh $data $nj
fi

# Set up features.

if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"

## Set up speaker-independent features.
case $feat_type in
  delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";;
  lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
    cp $alidir/final.mat $dir
    cp $alidir/full.mat $dir 2>/dev/null
    ;;
  *) echo "$0: invalid feature type $feat_type" && exit 1;
esac

# Add fMLLR transforms if available
if [ -f $alidir/trans.1 ]; then
  echo "$0: Using transforms from $alidir"
  feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$alidir/trans.JOB ark:- ark:- |"
fi

# Do subsampling of feats, if needed
if [ $frame_subsampling_factor -gt 1 ]; then
  feats="$feats subsample-feats --n=$frame_subsampling_factor ark:- ark:- |"
fi

if [ -z $alignment_subsampling_factor ]; then
  alignment_subsampling_factor=$frame_subsampling_factor
fi

if [ $stage -le -5 ]; then
  echo "$0: Initializing monophone model (for alignment conversion, in case topology changed)"

  # The phone sets are required by gmm-init-mono; say why we stop instead of
  # exiting without any message.
  if [ ! -f $lang/phones/sets.int ]; then
    echo "$0: no such file $lang/phones/sets.int"
    exit 1;
  fi
  shared_phones_opt="--shared-phones=$lang/phones/sets.int"
  # get feature dimension (from the features of job 1)
  example_feats="`echo $feats | sed s/JOB/1/g`";
  if ! feat_dim=$(feat-to-dim "$example_feats" - 2>/dev/null) || [ -z "$feat_dim" ]; then
    feat-to-dim "$example_feats" - # to see the error message.
    echo "error getting feature dimension"
    exit 1;
  fi
  # Only the first 10 utterances of the features are given to gmm-init-mono.
  $cmd JOB=1 $dir/log/init_mono.log \
    gmm-init-mono $shared_phones_opt "--train-feats=$feats subset-feats --n=10 ark:- ark:-|" $lang/topo $feat_dim \
      $dir/mono.mdl $dir/mono.tree || exit 1;
fi

if [ $stage -le -4 ]; then
  # Get tree stats.
  echo "$0: Accumulating tree stats"
  $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \
     convert-ali --frame-subsampling-factor=$alignment_subsampling_factor \
         $alidir/final.mdl $dir/mono.mdl $dir/mono.tree "ark:gunzip -c $alidir/ali.JOB.gz|" ark:-  \| \
      acc-tree-stats $context_opts $tree_stats_opts --ci-phones=$ciphonelist $dir/mono.mdl \
         "$feats" ark:- $dir/JOB.treeacc || exit 1;
  [ "`ls $dir/*.treeacc | wc -w`" -ne "$nj" ] && echo "$0: Wrong #tree-accs" && exit 1;
  $cmd $dir/log/sum_tree_acc.log \
    sum-tree-stats $dir/treeacc $dir/*.treeacc || exit 1;
  rm $dir/*.treeacc
fi

# train_tree is not among the configuration variables of this script, so it is
# normally unset; default it to "true" explicitly instead of relying on an
# empty expansion being treated as a successful (empty) command.
if [ $stage -le -3 ] && ${train_tree:-true}; then
  echo "$0: Getting questions for tree clustering."
  # preparing questions, roots file...
  $cmd $dir/log/questions.log \
     cluster-phones $cluster_phones_opts $context_opts $dir/treeacc \
     $lang/phones/sets.int $dir/questions.int || exit 1;
  # Append the extra questions from the lang directory to the clustered ones.
  cat $lang/phones/extra_questions.int >> $dir/questions.int
  $cmd $dir/log/compile_questions.log \
    compile-questions $context_opts $lang/topo \
      $dir/questions.int $dir/questions.qst || exit 1;

  echo "$0: Building the tree"
  $cmd $dir/log/build_tree.log \
    build-tree $context_opts --verbose=1 --max-leaves=$numleaves \
    --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \
    $dir/questions.qst $lang/topo $dir/tree || exit 1;
fi

if [ $stage -le -2 ]; then
  echo "$0: Initializing the model"
  # Run directly (not through $cmd); stderr is captured in init_model.log.
  gmm-init-model  --write-occs=$dir/1.occs  \
    $dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/log/init_model.log || exit 1;
  # Print any 'no stats' lines from the log, followed by a notice.
  grep 'no stats' $dir/log/init_model.log && echo "This is a bad warning.";
  # The accumulated tree stats are deleted here, so the stages that read
  # $dir/treeacc (-3 and -2) need stage -4 to be run again first.
  rm $dir/treeacc
fi

if [ $stage -le -1 ]; then
  # Convert the alignments to the new tree.  Note: we likely will not use these
  # converted alignments in the chain system directly, but they could be useful
  # for other purposes.
  echo "$0: Converting alignments from $alidir to use current tree"
  $cmd JOB=1:$nj $dir/log/convert.JOB.log \
    convert-ali --repeat-frames=$repeat_frames \
      --frame-subsampling-factor=$alignment_subsampling_factor \
      $alidir/final.mdl $dir/1.mdl $dir/tree \
      "ark:gunzip -c $alidir/ali.JOB.gz|" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi

# Make the initialized model available as final.mdl; abort instead of
# reporting success if the copy fails (e.g. 1.mdl was never created).
cp $dir/1.mdl $dir/final.mdl || exit 1

echo $0: Done building tree


================================================
FILE: egs/steps/nnet3/chain/build_tree_multiple_sources.sh
================================================
#!/usr/bin/env bash
# Copyright 2012-2015  Johns Hopkins University (Author: Daniel Povey).
#           2017  Vimal Manohar
#  Apache 2.0.

# This script is similar to steps/nnet3/chain/build_tree.sh but supports 
# getting statistics from multiple alignment sources.


# Begin configuration section.
stage=-5
exit_stage=-100 # you can use this to require it to exit at the
                # beginning of a specific stage.  Not all values are
                # supported.
cmd=run.pl
use_fmllr=true  # If true, fmllr transforms will be applied from the alignment directories.
                # Otherwise, no fmllr will be applied even if alignment directory contains trans.*
context_opts=  # e.g. set this to "--context-width 5 --central-position 2" for quinphone.
cluster_thresh=-1  # for build-tree control final bottom-up clustering of leaves
frame_subsampling_factor=1  # frame subsampling factor of output w.r.t. to the input features
tree_stats_opts=
cluster_phones_opts=
repeat_frames=false
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh
. parse_options.sh || exit 1;

if [ $# -lt 5 ]; then
  echo "Usage: steps/nnet3/chain/build_tree_multiple_sources.sh <#leaves> <lang> <data1> <ali-dir1> [<data2> <ali-dir2> ... <data> <ali-dirN>] <exp-dir>"
  echo " e.g.: steps/nnet3/chain/build_tree_multiple_sources.sh 15000 data/lang data/train_sup exp/tri3_ali data/train_unsup exp/tri3/best_path_train_unsup exp/tree_semi"
  echo "Main options (for others, see top of script file)"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --config <config-file>                           # config containing options"
  echo "  --stage <stage>                                  # stage to do partial re-run from."
  echo "  --repeat-frames <true|false>                     # Only affects alignment conversion at"
  echo "                                                   # the end. If true, generate an "
  echo "                                                   # alignment using the frame-subsampled "
  echo "                                                   # topology that is repeated "
  echo "                                                   # --frame-subsampling-factor times "
  echo "                                                   # and interleaved, to be the same "
  echo "                                                   # length as the original alignment "
  echo "                                                   # (useful for cross-entropy training "
  echo "                                                   # of reduced frame rate systems)."
  exit 1;
fi

# Positional arguments: <#leaves> <lang> <data1> <ali-dir1> ... <exp-dir>
numleaves=$1
lang=$2
dir=${@: -1}  # last argument to the script
shift 2;
data_and_alidirs=( $@ )  # read the remaining arguments into an array
unset data_and_alidirs[${#data_and_alidirs[@]}-1]  # 'pop' the last argument which is odir
num_sys=$[${#data_and_alidirs[@]}]  # number of <data>/<ali-dir> arguments left (halved below)

# What is left must be a sequence of <data> <ali-dir> pairs.
if (( $num_sys % 2 != 0 )); then
  echo "$0: The data and alignment arguments must be an even number of arguments."
  exit 1
fi

num_sys=$((num_sys / 2))  # number of systems (data/ali-dir pairs) to combine

# Scratch data directory, created inside the output directory.
data=$dir/data_tmp
mkdir -p $data

mkdir -p $dir
# Element 1 of the array is the first <ali-dir>.  The rest of the script reads
# final.mdl, num_jobs, phones.txt and the feature options from this directory.
alidir=`echo ${data_and_alidirs[1]}`

# Split the interleaved <data> <ali-dir> list into two parallel arrays:
# even positions are data directories, odd positions are alignment directories.
datadirs=()
alidirs=()
for n in `seq 0 $[num_sys-1]`; do
  datadirs[$n]=${data_and_alidirs[$[2*n]]}
  alidirs[$n]=${data_and_alidirs[$[2*n+1]]}
done

utils/combine_data.sh $data ${datadirs[@]} || exit 1

for f in $data/feats.scp $lang/phones.txt $alidir/final.mdl $alidir/ali.1.gz; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

oov=`cat $lang/oov.int`
nj=`cat $alidir/num_jobs` || exit 1;
silphonelist=`cat $lang/phones/silence.csl`
ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1;
sdata=$data/split$nj;
splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options.
cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null` || exit 1
delta_opts=`cat $alidir/delta_opts 2>/dev/null`

mkdir -p $dir/log
cp $alidir/splice_opts $dir 2>/dev/null # frame-splicing options.
cp $alidir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option.
cp $alidir/delta_opts $dir 2>/dev/null # delta option.

utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

echo $nj >$dir/num_jobs
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;

# Set up features.
if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi

echo "$0: feature type is $feat_type"

feats=()
feats_one=()
for n in `seq 0 $[num_sys-1]`; do
  this_nj=$(cat ${alidirs[$n]}/num_jobs) || exit 1
  this_sdata=${datadirs[$n]}/split$this_nj
  [[ -d $this_sdata && ${datadirs[$n]}/feats.scp -ot $this_sdata ]] || split_data.sh ${datadirs[$n]} $this_nj || exit 1;
  ## Set up speaker-independent features.
  case $feat_type in
    delta) feats[$n]="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$this_sdata/JOB/utt2spk scp:$this_sdata/JOB/cmvn.scp scp:$this_sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |"
      feats_one[$n]="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$this_sdata/1/utt2spk scp:$this_sdata/1/cmvn.scp scp:$this_sdata/1/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";;
    lda) feats[$n]="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$this_sdata/JOB/utt2spk scp:$this_sdata/JOB/cmvn.scp scp:$this_sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
      feats_one[$n]="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$this_sdata/1/utt2spk scp:$this_sdata/1/cmvn.scp scp:$this_sdata/1/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
      cp $alidir/final.mat $dir
      cp $alidir/full.mat $dir 2>/dev/null
      ;;
    *) echo "$0: invalid feature type $feat_type" && exit 1;
  esac
  
  if $use_fmllr; then
    if [ ! -f ${alidirs[$n]}/trans.1 ]; then
      echo "$0: Could not find fMLLR transforms in ${alidirs[$n]}"
      exit 1
    fi

    echo "$0: Using transforms from ${alidirs[$n]}"
    feats[$n]="${feats[$n]} transform-feats --utt2spk=ark:$this_sdata/JOB/utt2spk ark,s,cs:${alidirs[$n]}/trans.JOB ark:- ark:- |"
    feats_one[$n]="${feats_one[$n]} transform-feats --utt2spk=ark:$this_sdata/1/utt2spk ark,s,cs:${alidirs[$n]}/trans.1 ark:- ark:- |"
  fi

  # Do subsampling of feats, if needed
  if [ $frame_subsampling_factor -gt 1 ]; then
    feats[$n]="${feats[$n]} subsample-feats --n=$frame_subsampling_factor ark:- ark:- |"
    feats_one[$n]="${feats_one[$n]} subsample-feats --n=$frame_subsampling_factor ark:- ark:- |"
  fi
done

if [ $stage -le -5 ]; then
  echo "$0: Initializing monophone model (for alignment conversion, in case topology changed)"

  # The phone sets are required by gmm-init-mono; say why we stop instead of
  # exiting without any message.
  if [ ! -f $lang/phones/sets.int ]; then
    echo "$0: no such file $lang/phones/sets.int"
    exit 1;
  fi
  shared_phones_opt="--shared-phones=$lang/phones/sets.int"
  # get feature dimension (from job 1 of the first source)
  example_feats="`echo ${feats[0]} | sed s/JOB/1/g`";
  if ! feat_dim=$(feat-to-dim "$example_feats" - 2>/dev/null) || [ -z "$feat_dim" ]; then
    feat-to-dim "$example_feats" - # to see the error message.
    echo "error getting feature dimension"
    exit 1;
  fi

  # Concatenate the features of split 1 of every source into one archive, so
  # that the monophone model is initialized from all of the sources.
  for n in `seq 0 $[num_sys-1]`; do
    copy-feats "${feats_one[$n]}" ark:-
  done | copy-feats ark:- ark:$dir/tmp.ark || exit 1

  $cmd $dir/log/init_mono.log \
    gmm-init-mono $shared_phones_opt \
      "--train-feats=ark:subset-feats --n=10 ark:$dir/tmp.ark ark:- |" $lang/topo $feat_dim \
    $dir/mono.mdl $dir/mono.tree || exit 1
fi


if [ $stage -le -4 ]; then
  # Get tree stats.

  for n in `seq 0 $[num_sys-1]`; do
    echo "$0: Accumulating tree stats"
    this_data=${datadirs[$n]}
    this_alidir=${alidirs[$n]}
    this_nj=$(cat $this_alidir/num_jobs) || exit 1
    this_frame_subsampling_factor=1
    if [ -f $this_alidir/frame_subsampling_factor ]; then
      this_frame_subsampling_factor=$(cat $this_alidir/frame_subsampling_factor)
    fi

    if (( $frame_subsampling_factor % $this_frame_subsampling_factor != 0 )); then
      echo "$0: frame-subsampling-factor=$frame_subsampling_factor is not "
      echo "divisible by $this_frame_subsampling_factor (that of $this_alidir)"
      exit 1
    fi

    this_frame_subsampling_factor=$((frame_subsampling_factor / this_frame_subsampling_factor))
    $cmd JOB=1:$this_nj $dir/log/acc_tree.$n.JOB.log \
       convert-ali --frame-subsampling-factor=$this_frame_subsampling_factor \
           $this_alidir/final.mdl $dir/mono.mdl $dir/mono.tree "ark:gunzip -c $this_alidir/ali.JOB.gz|" ark:-  \| \
        acc-tree-stats $context_opts $tree_stats_opts --ci-phones=$ciphonelist $dir/mono.mdl \
           "${feats[$n]}" ark:- $dir/$n.JOB.treeacc || exit 1;
    [ "`ls $dir/$n.*.treeacc | wc -w`" -ne "$this_nj" ] && echo "$0: Wrong #tree-accs for data $n $this_data" && exit 1;
  done

  $cmd $dir/log/sum_tree_acc.log \
    sum-tree-stats $dir/treeacc $dir/*.treeacc || exit 1;
  rm $dir/*.treeacc
fi

# train_tree is not among the configuration variables of this script, so it is
# normally unset; default it to "true" explicitly instead of relying on an
# empty expansion being treated as a successful (empty) command.
if [ $stage -le -3 ] && ${train_tree:-true}; then
  echo "$0: Getting questions for tree clustering."
  # preparing questions, roots file...
  $cmd $dir/log/questions.log \
     cluster-phones $cluster_phones_opts $context_opts $dir/treeacc \
     $lang/phones/sets.int $dir/questions.int || exit 1;
  # Append the extra questions from the lang directory to the clustered ones.
  cat $lang/phones/extra_questions.int >> $dir/questions.int
  $cmd $dir/log/compile_questions.log \
    compile-questions \
      $context_opts $lang/topo $dir/questions.int $dir/questions.qst || exit 1;

  echo "$0: Building the tree"
  $cmd $dir/log/build_tree.log \
    build-tree $context_opts --verbose=1 --max-leaves=$numleaves \
    --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \
    $dir/questions.qst $lang/topo $dir/tree || exit 1;
fi

if [ $stage -le -2 ]; then
  echo "$0: Initializing the model"
  gmm-init-model  --write-occs=$dir/1.occs  \
    $dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/log/init_model.log || exit 1;
  grep 'no stats' $dir/log/init_model.log && echo "This is a bad warning.";
  rm $dir/treeacc
fi

if [ $stage -le -1 ]; then
  # Convert the alignments to the new tree.  Note: we likely will not use these
  # converted alignments in the chain system directly, but they could be useful
  # for other purposes.

  for n in `seq 0 $[num_sys-1]`; do
    this_alidir=${alidirs[$n]}
    this_nj=$(cat $this_alidir/num_jobs) || exit 1

    # An alignment directory without a frame_subsampling_factor file is
    # treated as having factor 1.
    this_frame_subsampling_factor=1
    if [ -f $this_alidir/frame_subsampling_factor ]; then
      this_frame_subsampling_factor=$(cat $this_alidir/frame_subsampling_factor)
    fi

    if (( $frame_subsampling_factor % $this_frame_subsampling_factor != 0 )); then
      echo "$0: frame-subsampling-factor=$frame_subsampling_factor is not "
      echo "divisible by $this_frame_subsampling_factor (that of $this_alidir)"
      exit 1
    fi

    echo "$0: frame-subsampling-factor for $this_alidir is $this_frame_subsampling_factor"

    # From here on the variable holds the ratio between the requested factor
    # and the factor of this alignment directory; that ratio is what
    # convert-ali is given.
    this_frame_subsampling_factor=$((frame_subsampling_factor / this_frame_subsampling_factor))
    echo "$0: Converting alignments from $this_alidir to use current tree"
    $cmd JOB=1:$this_nj $dir/log/convert.$n.JOB.log \
      convert-ali --repeat-frames=$repeat_frames \
        --frame-subsampling-factor=$this_frame_subsampling_factor \
        $this_alidir/final.mdl $dir/1.mdl $dir/tree "ark:gunzip -c $this_alidir/ali.JOB.gz |" \
        ark,scp:$dir/ali.$n.JOB.ark,$dir/ali.$n.JOB.scp || exit 1

    # Merge the per-job scp files of this source into one.
    for i in `seq $this_nj`; do
      cat $dir/ali.$n.$i.scp
    done > $dir/ali.$n.scp || exit 1
  done

  # Merge all sources into a single scp sorted by utterance id.
  for n in `seq 0 $[num_sys-1]`; do
    cat $dir/ali.$n.scp
  done | sort -k1,1 > $dir/ali.scp || exit 1

  # Re-split the converted alignments according to the split of the combined
  # data directory, so that ali.JOB.gz matches $data/split$nj/JOB.
  utils/split_data.sh $data $nj || exit 1
  $cmd JOB=1:$nj $dir/log/copy_alignments.JOB.log \
    copy-int-vector "scp:utils/filter_scp.pl $data/split$nj/JOB/utt2spk $dir/ali.scp |" \
    "ark:| gzip -c > $dir/ali.JOB.gz" || exit 1
fi

# Make the initialized model available as final.mdl; abort instead of
# reporting success if the copy fails (e.g. 1.mdl was never created).
cp $dir/1.mdl $dir/final.mdl || exit 1

echo $0: Done building tree


================================================
FILE: egs/steps/nnet3/chain/e2e/README.txt
================================================
The scripts related to end2end chain training are in this directory
Currently it has 3 scripts:

** prepare_e2e.sh, which is almost equivalent
to the regular chain recipe's build_tree.sh (i.e. it creates the tree and
the transition model), except that it does not require any previously
trained models (in other words, it does what stages -3 and -2
of steps/train_mono.sh do).

** get_egs_e2e.sh: this is similar to chain/get_egs.sh except it
uses training FSTs (instead of lattices) to generate end2end egs.

** train_e2e.py: this is very similar to chain/train.py but
with fewer stages (e.g. it does not compute the preconditioning matrix)


For details please see the comments at top of local/chain/e2e/run_flatstart_*.sh
and also src/chain/chain-generic-numerator.h.


================================================
FILE: egs/steps/nnet3/chain/e2e/compute_biphone_stats.py
================================================
#!/usr/bin/env python3

# Copyright    2018 Hossein Hadian
# Apache 2.0

import argparse
from os.path import join
import sys
import copy
import random

# Command-line interface: one positional lang directory plus a true/false
# switch controlling whether counts are pooled over shared phones.
parser = argparse.ArgumentParser(description="""This script reads
    sequences of phone ids from std input and counts mono/biphone stats
    and writes the results to std out. The output can be used with
    gmm-init-biphone to create a better tree. The first part of the
    outupt is biphone counts with this format for each line:
    <phone-id> <phone-id> <count>
    and the second part of the output is monophone counts with the
    following format:
    <phone-id> <count>""")
parser.add_argument('langdir', type=str)
parser.add_argument(
    '--shared-phones',
    type=str,
    choices=['true', 'false'],
    default='true',
    help="If true, stats will be collected for shared phones.",
)

args = parser.parse_args()
# The option arrives as the string 'true' or 'false'; turn it into a bool.
args.shared_phones = (args.shared_phones == 'true')

# Load the phone sets from <langdir>/phones/sets.int.  Each line is one set;
# every phone of a set is mapped to the first phone listed on that line.
phone_sets = []
phones = []
phone_to_shard_phone = {0: 0}  # 0 stands for the no-left-context case
sets_path = join(args.langdir, 'phones/sets.int')
with open(sets_path, 'r', encoding='latin-1') as f:
    for raw_line in f:
        phone_set = raw_line.strip().split()
        phone_sets.append(phone_set)
        phones.extend(phone_set)
        phone_to_shard_phone.update(
            (member, phone_set[0]) for member in phone_set)

summary = 'Loaded {} phone-sets containing {} phones.'.format(
    len(phone_sets), len(phones))
print(summary, file=sys.stderr)

# Accumulate counts from the phone sequences on stdin.
#   biphone_counts: (left-phone, phone) -> count, where left-phone is 0 for
#                   the first counted phone of a line (no left context).
#   mono_counts:    phone -> count.
# With --shared-phones=true both members of a pair are first replaced by the
# representative phone of their set, so counts are pooled per set.
biphone_counts = {}
mono_counts = {}
for line in sys.stdin:
    line = line.strip().split()
    if not line:
        # Blank line (e.g. an utterance whose transcript is empty): there is
        # nothing to count, and indexing the first field would raise.
        continue
    # NOTE(review): the first field is skipped as if it were an utterance
    # key, although the script description speaks of bare phone-id
    # sequences -- confirm against the callers' input format.
    line_phones = line[1:]
    for pair in zip([0] + line_phones, line_phones):  # 0 is for the no left-context case
        if args.shared_phones:
            pair = (phone_to_shard_phone[pair[0]], phone_to_shard_phone[pair[1]])
        biphone_counts[pair] = biphone_counts.get(pair, 0) + 1
        mono_counts[pair[1]] = mono_counts.get(pair[1], 0) + 1

# Write the results: first one "<left> <phone> <count>" line per biphone with
# a non-zero count, then one "<phone> <count>" line per phone with a non-zero
# count.  With shared phones enabled, each phone reports the count stored
# under the representative phone of its set.
left_contexts = [0] + phones
for phone1 in left_contexts:
    for phone2 in phones:
        if args.shared_phones:
            lookup = (phone_to_shard_phone[phone1], phone_to_shard_phone[phone2])
        else:
            lookup = (phone1, phone2)
        count = biphone_counts.get(lookup, 0)
        if count:
            print('{} {} {}'.format(phone1, phone2, count))
for phone in phones:
    if args.shared_phones:
        shared = phone_to_shard_phone[phone]
    else:
        shared = phone
    count = mono_counts.get(shared, 0)
    if count:
        print('{} {}'.format(phone, count))


================================================
FILE: egs/steps/nnet3/chain/e2e/get_egs_e2e.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey)
# Copyright   2017  Hossein Hadian
# Apache 2.0.
#


# This is similar to chain/get_egs.sh except it
# uses training FSTs (instead of lattices) to generate end2end egs.
# It calls the nnet3-chain-e2e-get-egs binary.


# Begin configuration section.
# These are the default values; the usage message further down lists the
# matching command-line options.
cmd=run.pl  # job launcher, invoked below as: $cmd [JOB=1:N] <log-file> <command ...>
normalize_egs=true  # if true, <chain-dir>/normalization.fst is applied to the egs.
frame_subsampling_factor=3 # frames-per-second of features we train on divided
                           # by frames-per-second at output of chain model
left_context=4    # amount of left-context per eg (i.e. extra frames of input features
                  # not present in the output supervision).
right_context=4   # amount of right-context per eg.
left_context_initial=-1    # if >=0, left-context for first chunk of an utterance
right_context_final=-1     # if >=0, right-context for last chunk of an utterance
compress=true   # set this to false to disable compression (e.g. if you want to see whether
                # results are affected).

num_utts_subset=1400     # number of utterances in validation and training
                        # subsets used for shrinkage and diagnostics.
                        # The script exits with an error if fewer utterances
                        # than this are available for either subset.
num_valid_egs_combine=0  # #validation examples for combination weights at the very end.
num_train_egs_combine=1000 # number of train examples for the above.
num_egs_diagnostic=700 # number of examples for "compute_prob" jobs
frames_per_iter=400000 # each iteration of training, see this many frames per
                       # job, measured at the sampling rate of the features
                       # used.  This is just a guideline; it will pick a number
                       # that divides the number of samples in the entire data.

stage=0       # each stage below runs only if $stage is <= that stage's number,
              # so a larger value resumes a partially completed run.
nj=15         # This should be set to the maximum number of jobs you are
              # comfortable to run in parallel; you can increase it if your disk
              # speed is greater and you have more machines.
max_shuffle_jobs_run=50  # the shuffle jobs now include the nnet3-chain-normalize-egs command,
                         # which is fairly CPU intensive, so we can run quite a few at once
                         # without overloading the disks.
srand=0     # rand seed for nnet3-chain-e2e-get-egs, nnet3-chain-copy-egs and nnet3-chain-shuffle-egs
online_ivector_dir=  # can be used if we are including speaker information as iVectors.
cmvn_opts=  # can be used for specifying CMVN options, if feature type is not lda (if lda,
            # it doesn't make sense to use different options than were used as input to the
            # LDA transform).  This is used to turn off CMVN in the online-nnet experiments.
online_cmvn=false # Set to 'true' to replace 'apply-cmvn' by 'apply-cmvn-online' in the nnet3 input.
                  # The configuration is passed externally via '$cmvn_opts' given to train.py,
                  # typically as: --cmvn-opts="--config conf/online_cmvn.conf".
                  # The global_cmvn.stats are computed by this script from the features.
                  # Note: the online cmvn for the ivector extractor is controlled separately in
                  #       steps/online/nnet2/train_ivector_extractor.sh by --online-cmvn-iextractor


echo "$0 $@"  # Print the command line for logging

# Source path.sh if it is present, then let parse_options.sh handle the
# --option arguments.
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;


# Exactly four positional arguments are required; otherwise print the usage
# message (whose example and defaults match the configuration section) and exit.
if [ $# != 4 ]; then
  echo "Usage: $0 [opts] <data> <chain-dir> <fsts-dir> <egs-dir>"
  echo " e.g.: $0 data/train exp/chain/e2e exp/chain/e2e_tree exp/chain/e2e/egs"
  echo ""
  echo "From <chain-dir>, 0.trans_mdl (the transition-model), tree (the tree)"
  echo "and normalization.fst (the normalization FST, derived from the denominator FST)"
  echo "are read."
  echo "From <fsts-dir>, num_jobs and fst.*.scp (the training FSTs) are read."
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --nj <nj>                                        # The maximum number of jobs you want to run in"
  echo "                                                   # parallel (increase this only if you have good disk and"
  echo "                                                   # network speed).  default=15"
  echo "  --cmd (utils/run.pl;utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --frames-per-iter <#samples;400000>              # Number of frames of data to process per iteration, per"
  echo "                                                   # process."
  echo "  --frame-subsampling-factor <factor;3>            # factor by which num-frames at nnet output is reduced "
  echo "  --left-context <int;4>                           # Number of frames on left side to append for feature input"
  echo "  --right-context <int;4>                          # Number of frames on right side to append for feature input"
  echo "  --left-context-initial <int;-1>                  # If >= 0, left-context for first chunk of an utterance"
  echo "  --right-context-final <int;-1>                   # If >= 0, right-context for last chunk of an utterance"
  echo "  --num-egs-diagnostic <#egs;700>                  # Number of egs used in computing (train,valid) diagnostics"
  echo "  --num-valid-egs-combine <#egs;0>                 # Number of egs used in getting combination weights at the"
  echo "                                                   # very end."
  echo "  --stage <stage|0>                                # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."

  exit 1;
fi

# Positional arguments.
data=$1       # data directory (feats.scp, allowed_lengths.txt, ...)
chaindir=$2   # provides 0.trans_mdl, tree and normalization.fst
fstdir=$3     # provides num_jobs and the fst.*.scp training FSTs
dir=$4        # output directory for the egs

# Make sure every required input file exists before doing any work.  The
# iVector files are only required when an iVector directory was given.
if [ -n "$online_ivector_dir" ]; then
  extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"
fi

for f in $data/feats.scp $data/allowed_lengths.txt \
         $chaindir/0.trans_mdl $chaindir/tree $chaindir/normalization.fst \
         $extra_files; do
  if [ ! -f $f ]; then
    echo "$0: no such file $f" && exit 1;
  fi
done

# Split the data directory into $nj parts; per-job files live in $sdata/JOB.
sdata=$data/split$nj
utils/split_data.sh $data $nj

mkdir -p $dir/log $dir/info

# Choose the validation utterances and an equally sized subset of the
# remaining (training) utterances.

frame_shift=$(utils/data/get_frame_shift.sh $data)
utils/data/get_utt2dur.sh $data

# allowed_lengths.txt holds one length per line; join them with commas.
frames_per_eg=$(tr '\n' , < $data/allowed_lengths.txt | sed 's/,$//')

if [ ! -f "$data/utt2len" ]; then
  feat-to-len scp:$data/feats.scp ark,t:$data/utt2len
fi

# Validation list: the first $num_utts_subset utterance ids after shuffling.
awk '{print $1}' $data/utt2len | \
  utils/shuffle_list.pl 2>/dev/null | head -$num_utts_subset > $dir/valid_uttlist


len_uttlist=$(wc -l $dir/valid_uttlist | awk '{print $1}')
if [ $len_uttlist -lt $num_utts_subset ]; then
  echo "Number of utterances which have length at least $frames_per_eg is really low. Please check your data." && exit 1;
fi

if [ -f $data/utt2uniq ]; then  # this matters if you use data augmentation.
  # Map each chosen utterance to its 'unique' id and back to all utterances
  # sharing that id, so every perturbed copy lands in the validation list.
  # Because of this step the list may again contain utterances with lengths
  # less than frames_per_eg.
  echo "File $data/utt2uniq exists, so augmenting valid_uttlist to"
  echo "include all perturbed versions of the same 'real' utterances."
  mv $dir/valid_uttlist $dir/valid_uttlist.tmp
  utils/utt2spk_to_spk2utt.pl $data/utt2uniq > $dir/uniq2utt
  utils/apply_map.pl $data/utt2uniq < $dir/valid_uttlist.tmp | \
    sort | uniq | utils/apply_map.pl $dir/uniq2utt | \
    awk '{for(n=1;n<=NF;n++) print $n;}' | sort  > $dir/valid_uttlist
  rm $dir/uniq2utt $dir/valid_uttlist.tmp
fi

# Training subset: shuffle whatever is not in the validation list and keep
# the first $num_utts_subset utterance ids.
awk '{print $1}' $data/utt2len | \
   utils/filter_scp.pl --exclude $dir/valid_uttlist | \
   utils/shuffle_list.pl 2>/dev/null | head -$num_utts_subset > $dir/train_subset_uttlist
len_uttlist=$(wc -l $dir/train_subset_uttlist | awk '{print $1}')
if [ $len_uttlist -lt $num_utts_subset ]; then
  echo "Number of utterances which have length at least $frames_per_eg is really low. Please check your data." && exit 1;
fi

## Set up features.
# Three feature rspecifiers are built below:
#   feats              - per-job training features (JOB is substituted later),
#                        with the validation utterances excluded;
#   valid_feats        - features of the validation utterances only;
#   train_subset_feats - features of the training-subset utterances only.

# get the global_cmvn stats for online-cmvn,
if $online_cmvn; then
  # create global_cmvn.stats,
  #
  # caution: the top-level nnet training script should copy
  # 'global_cmvn.stats' and 'online_cmvn' to its own dir.
  if ! matrix-sum --binary=false scp:$data/cmvn.scp - >$dir/global_cmvn.stats 2>/dev/null; then
    echo "$0: Error summing cmvn stats"
    exit 1
  fi
  # marker file recording that online CMVN is in use
  touch $dir/online_cmvn
else
  # remove a stale marker file left by an earlier run with online CMVN
  [ -f $dir/online_cmvn ] && rm $dir/online_cmvn
fi

# create the feature pipelines,
if ! $online_cmvn; then
  # the original front-end with 'apply-cmvn',
  echo "$0: feature type is raw, with 'apply-cmvn'"
  feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- |"
  valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |"
  train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |"
else
  # the alternative front-end with 'apply-cmvn-online',
  # - the $cmvn_opts can be set to '--config=conf/online_cmvn.conf' which is the setup of ivector-extractor,
  echo "$0: feature type is raw, with 'apply-cmvn-online'"
  feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn-online $cmvn_opts --spk2utt=ark:$sdata/JOB/spk2utt $dir/global_cmvn.stats scp:- ark:- |"
  valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn-online $cmvn_opts --spk2utt=ark:$data/spk2utt $dir/global_cmvn.stats scp:- ark:- |"
  train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn-online $cmvn_opts --spk2utt=ark:$data/spk2utt $dir/global_cmvn.stats scp:- ark:- |"
fi
echo $cmvn_opts >$dir/cmvn_opts # caution: the top-level nnet training script should copy this to its own dir now.

# Record the iVector dimension (0 when no iVectors are used) and, when an
# iVector directory was given, build the options passed to the egs binary.
if [ ! -z "$online_ivector_dir" ]; then
  ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1;
  echo $ivector_dim > $dir/info/ivector_dim
  steps/nnet2/get_ivector_id.sh $online_ivector_dir > $dir/info/final.ie.id || exit 1
  ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1;
  ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period"
else
  ivector_opts=""
  echo 0 >$dir/info/ivector_dim
fi

# Work out (or, when resuming past stage 1, read back) the total number of
# frames and the feature dimension.
if [ $stage -le 1 ]; then
  echo "$0: working out number of frames of training data"
  num_frames=$(steps/nnet2/get_num_frames.sh $data)
  echo $num_frames > $dir/info/num_frames
  echo "$0: working out feature dim"
  # use the pipeline of job 1 to probe the dimension
  feats_one="$(echo $feats | sed s/JOB/1/g)"
  if ! feat_dim=$(feat-to-dim "$feats_one" - 2>/dev/null); then
    echo "Command failed (getting feature dim): feat-to-dim \"$feats_one\""
    exit 1
  fi
  echo $feat_dim > $dir/info/feat_dim
else
  num_frames=$(cat $dir/info/num_frames) || exit 1;
  feat_dim=$(cat $dir/info/feat_dim) || exit 1;
fi

# the + 1 is to round up, not down... we assume it doesn't divide exactly.
num_archives=$(($num_frames/$frames_per_iter+1))

# If $num_archives (plus a few spare handles) exceeds the number of files we
# allow a process to keep open, write a smaller number of larger archives
# first ($num_archives_intermediate) and split each of them into
# $archives_multiple final archives later.
max_open_filehandles=500 #$(ulimit -n) || exit 1
num_archives_intermediate=$num_archives
archives_multiple=1
while [ $(($num_archives_intermediate+4)) -gt $max_open_filehandles ]; do
  archives_multiple=$(($archives_multiple+1))
  num_archives_intermediate=$(($num_archives/$archives_multiple)) || exit 1;
done
# now make sure num_archives is an exact multiple of archives_multiple.
num_archives=$(($archives_multiple*$num_archives_intermediate)) || exit 1;

echo $num_archives >$dir/info/num_archives
echo $frames_per_eg >$dir/info/frames_per_eg
# Work out the number of egs per archive
egs_per_archive=$(($num_frames/($frames_per_eg*$num_archives))) || exit 1;
if ! [ $egs_per_archive -le $frames_per_iter ]; then
  echo "$0: script error: egs_per_archive=$egs_per_archive not <= frames_per_iter=$frames_per_iter" \
  && exit 1;
fi

echo $egs_per_archive > $dir/info/egs_per_archive

echo "$0: creating $num_archives archives, each with $egs_per_archive egs, with"
echo "$0:   $frames_per_eg labels per example, and (left,right) context = ($left_context,$right_context)"
if [ $left_context_initial -ge 0 ] || [ $right_context_final -ge 0 ]; then
  echo "$0:   ... and (left-context-initial,right-context-final) = ($left_context_initial,$right_context_final)"
fi


if [ -e $dir/storage ]; then
  # A 'storage' entry means the archives are distributed over several
  # locations: create soft links for the final and the temporary archives.
  # See utils/create_split_dir.pl.
  echo "$0: creating data links"
  utils/create_data_link.pl $(for x in $(seq $num_archives); do echo $dir/cegs.$x.ark; done)
  for x in $(seq $num_archives_intermediate); do
    utils/create_data_link.pl $(for y in $(seq $nj); do echo $dir/cegs_orig.$y.$x.ark; done)
  done
fi


# Options shared by every nnet3-chain-e2e-get-egs call; the initial/final
# context options are only added when they were set (>= 0).
egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress"
if [ $left_context_initial -ge 0 ]; then
  egs_opts="$egs_opts --left-context-initial=$left_context_initial"
fi
if [ $right_context_final -ge 0 ]; then
  egs_opts="$egs_opts --right-context-final=$right_context_final"
fi


# Record the context settings next to the egs.
echo $left_context > $dir/info/left_context
echo $right_context > $dir/info/right_context
echo $left_context_initial > $dir/info/left_context_initial
echo $right_context_final > $dir/info/right_context_final

# Concatenate the per-job FST lists of <fsts-dir> into a single fst.scp.
num_fst_jobs=$(cat $fstdir/num_jobs) || exit 1;
for id in $(seq $num_fst_jobs); do
  cat $fstdir/fst.$id.scp
done > $fstdir/fst.scp

# Stage 3: create the validation and training-subset examples, then carve
# the diagnostic and combination subsets out of them.
if [ $stage -le 3 ]; then
  echo "$0: Getting validation and training subset examples."
  # $dir/.error is the failure flag touched by the background jobs below;
  # clear any flag left over from a previous run.
  rm $dir/.error 2>/dev/null

  # do the filtering just once, as fst.scp may be long.
  utils/filter_scp.pl <(cat $dir/valid_uttlist $dir/train_subset_uttlist) \
    <$fstdir/fst.scp >$fstdir/fst_special.scp
  # The normalization FST is passed to the egs binary only if requested.
  if $normalize_egs; then
    norm_opt=$chaindir/normalization.fst
  else
    norm_opt=
  fi
  # The two jobs below run in the background (note the trailing '&'); a
  # failure is recorded by touching $dir/.error, checked after 'wait'.
  $cmd $dir/log/create_valid_subset.log \
    utils/filter_scp.pl $dir/valid_uttlist $fstdir/fst_special.scp \| \
    fstcopy scp:- ark:- \| \
    nnet3-chain-e2e-get-egs $ivector_opts --srand=$srand \
      $egs_opts $norm_opt \
      "$valid_feats" ark,s,cs:- $chaindir/0.trans_mdl "ark:$dir/valid_all.cegs" || touch $dir/.error &
  $cmd $dir/log/create_train_subset.log \
    utils/filter_scp.pl $dir/train_subset_uttlist $fstdir/fst_special.scp \| \
    fstcopy scp:- ark:- \| \
    nnet3-chain-e2e-get-egs $ivector_opts --srand=$srand \
      $egs_opts $norm_opt \
      "$train_subset_feats" ark,s,cs:- $chaindir/0.trans_mdl "ark:$dir/train_subset_all.cegs" || touch $dir/.error &
  wait;
  [ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1
  echo "... Getting subsets of validation examples for diagnostics and combination."
  # Four more background jobs: a 'combine' and a 'diagnostic' subset from
  # each of the validation and training-subset archives.
  $cmd $dir/log/create_valid_subset_combine.log \
    nnet3-chain-subset-egs --n=$num_valid_egs_combine ark:$dir/valid_all.cegs \
    ark:$dir/valid_combine.cegs || touch $dir/.error &
  $cmd $dir/log/create_valid_subset_diagnostic.log \
    nnet3-chain-subset-egs --n=$num_egs_diagnostic ark:$dir/valid_all.cegs \
    ark:$dir/valid_diagnostic.cegs || touch $dir/.error &

  $cmd $dir/log/create_train_subset_combine.log \
    nnet3-chain-subset-egs --n=$num_train_egs_combine ark:$dir/train_subset_all.cegs \
    ark:$dir/train_combine.cegs || touch $dir/.error &
  $cmd $dir/log/create_train_subset_diagnostic.log \
    nnet3-chain-subset-egs --n=$num_egs_diagnostic ark:$dir/train_subset_all.cegs \
    ark:$dir/train_diagnostic.cegs || touch $dir/.error &
  wait
  # NOTE(review): $dir/.error is not re-checked after this second 'wait';
  # failures here are only caught by the non-empty-file check below.
  sleep 5  # wait for file system to sync.
  cat $dir/valid_combine.cegs $dir/train_combine.cegs > $dir/combine.cegs

  # Every archive needed later must exist and be non-empty.
  for f in $dir/{combine,train_diagnostic,valid_diagnostic}.cegs; do
    [ ! -s $f ] && echo "No examples in file $f" && exit 1;
  done

  #rm $dir/valid_all.cegs $dir/train_subset_all.cegs $dir/{train,valid}_combine.cegs
  #exit 0
fi

# Report the archive layout computed earlier.
echo "num_archives_intermediate:" $num_archives_intermediate
echo "num_archives: $num_archives"
echo "archives_multiple: $archives_multiple"

# Stage 4: dump the training examples of each data split into temporary
# archives.
if [ $stage -le 4 ]; then
  # create cegs_orig.*.*.ark; the first index goes to $nj,
  # the second to $num_archives_intermediate.

  # One output wspecifier per intermediate archive; JOB is filled in per job.
  egs_list=
  for n in $(seq $num_archives_intermediate); do
    egs_list="$egs_list ark:$dir/cegs_orig.JOB.$n.ark"
  done
  echo "$0: Generating training examples on disk"

  # The examples will go round-robin to egs_list.  Note: we omit the
  # 'normalization.fst' argument while creating temporary egs: the phase of egs
  # preparation that involves the normalization FST is quite CPU-intensive and
  # it's more convenient to do it later, in the 'shuffle' stage.  Otherwise to
  # make it efficient we need to use a large 'nj', like 40, and in that case
  # there can be too many small files to deal with, because the total number of
  # files is the product of 'nj' by 'num_archives_intermediate', which might be
  # quite large.
  # The '\$[JOB+$srand]' is escaped so that it is evaluated inside each job,
  # after JOB has been substituted, giving every job its own seed.
  $cmd JOB=1:$nj $dir/log/get_egs.JOB.log \
    utils/filter_scp.pl $sdata/JOB/utt2spk $fstdir/fst.scp \| \
    fstcopy scp:- ark:- \| \
    nnet3-chain-e2e-get-egs $ivector_opts --srand=\$[JOB+$srand] $egs_opts \
     "$feats" ark,s,cs:- $chaindir/0.trans_mdl ark:- \| \
    nnet3-chain-copy-egs --random=true --srand=\$[JOB+$srand] ark:- $egs_list || exit 1;
fi

# Stage 5: merge the temporary archives over the $nj data splits, optionally
# apply the normalization FST, shuffle, and write the final cegs.*.ark files.
if [ $stage -le 5 ]; then
  echo "$0: recombining and shuffling order of archives on disk"
  # combine all the "cegs_orig.*.JOB.ark" (over the $nj splits of the data) and
  # shuffle the order, writing to the cegs.JOB.ark

  # the input is a concatenation over the input jobs.
  egs_list=
  for n in $(seq $nj); do
    egs_list="$egs_list $dir/cegs_orig.$n.JOB.ark"
  done

  if [ $archives_multiple == 1 ]; then # normal case.
    if $normalize_egs; then
      $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \
        nnet3-chain-normalize-egs $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \
        nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] ark:- ark:$dir/cegs.JOB.ark  || exit 1;
    else
      $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \
        nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] "ark:cat $egs_list|" ark:$dir/cegs.JOB.ark  || exit 1;
    fi
  else
    # we need to shuffle the 'intermediate archives' and then split into the
    # final archives.  we create soft links to manage this splitting, because
    # otherwise managing the output names is quite difficult (and we don't want
    # to submit separate queue jobs for each intermediate archive, because then
    # the --max-jobs-run option is hard to enforce).
    output_archives="$(for y in $(seq $archives_multiple); do echo ark:$dir/cegs.JOB.$y.ark; done)"
    for x in $(seq $num_archives_intermediate); do
      for y in $(seq $archives_multiple); do
        archive_index=$[($x-1)*$archives_multiple+$y]
        # cegs.intermediate_archive.{1,2,...}.ark will point to cegs.archive.ark
        ln -sf cegs.$archive_index.ark $dir/cegs.$x.$y.ark || exit 1
      done
    done
    # Honor --normalize-egs here as well, exactly as in the single-archive
    # case above: the normalization FST is applied only when requested.
    if $normalize_egs; then
      $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \
        nnet3-chain-normalize-egs $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \
        nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] ark:- ark:- \| \
        nnet3-chain-copy-egs ark:- $output_archives || exit 1;
    else
      $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \
        nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] "ark:cat $egs_list|" ark:- \| \
        nnet3-chain-copy-egs ark:- $output_archives || exit 1;
    fi
  fi
fi

# Stage 6: clean up the temporary files created by the earlier stages.
if [ $stage -le 6 ]; then
  echo "$0: removing temporary archives"
  (
    cd $dir
    # For every 'cegs_orig' entry that 'ls -l' shows as a soft link
    # ("name -> target"), remove both the link and the file it points to.
    for f in $(ls -l . | grep 'cegs_orig' | awk '{ X=NF-1; Y=NF-2; if ($X == "->")  print $Y, $NF; }'); do rm $f; done
    # the next statement removes them if we weren't using the soft links to a
    # 'storage' directory.
    rm cegs_orig.*.ark 2>/dev/null
  )
  if [ $archives_multiple -gt 1 ]; then
    # there are some extra soft links that we should delete.
    for f in $dir/cegs.*.*.ark; do rm $f; done
  fi
  echo "$0: removing temporary alignments"
  # NOTE(review): nothing in this script writes $dir/ali.{ark,scp}; this looks
  # like a leftover, and errors are silenced in case the files do not exist.
  rm $dir/ali.{ark,scp} 2>/dev/null

fi

echo "$0: Finished preparing training examples"


================================================
FILE: egs/steps/nnet3/chain/e2e/prepare_e2e.sh
================================================
#!/usr/bin/env bash
# Copyright 2017  Hossein Hadian
# Apache 2.0

# To be run from ..
# Flat start chain model training.

# This script initializes a trivial tree and transition model
# for flat-start chain training. It then generates the training
# graphs for the training data.

# Begin configuration section.
# These are the default values; the usage message below lists some of the
# matching command-line options.
cmd=run.pl            # job launcher, invoked as: $cmd [JOB=1:N] <log-file> <command ...>
nj=4                  # number of parallel jobs; also written to <exp-dir>/num_jobs
stage=0               # a stage below runs only if $stage is <= its number
shared_phones=true    # If true, --shared-phones=<lang-dir>/phones/sets.int is
                      # passed to gmm-init-$type
treedir=              # If specified, the tree and model will be copied from there
                      # note that it may not be flat start anymore.
type=mono             # Can be either mono or biphone -- either way
                      # the resulting tree is full (i.e. it doesn't do any tying)
ci_silence=false      # If true, silence phones will be treated as context independent

# options passed to compile-train-graphs (and saved to <exp-dir>/scale_opts)
scale_opts="--transition-scale=0.0 --self-loop-scale=0.0"
tie=false             # If true, gmm-init-biphone will do some tying when
                      # creating the full biphone tree (it won't be full anymore).
                      # Specifically, it will revert to monophone if the data
                      # counts for a biphone are smaller than min_biphone_count.
                      # If the monophone count is also smaller than min_monophone_count,
                      # it will revert to a shared global phone. Note that this
                      # only affects biphone models (i.e., type=biphone) which
                      # use the special chain topology.
min_biphone_count=100    # see 'tie' above
min_monophone_count=20   # see 'tie' above
# End configuration section.

echo "$0 $@"  # Print the command line for logging

# Source path.sh when present, then hand the --option arguments to
# parse_options.sh.
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# Three positional arguments are expected; anything else prints the usage.
if [ $# -ne 3 ]; then
  cat <<'EOF'
Usage: steps/prepare_e2e.sh [options] <data-dir> <lang-dir> <exp-dir>
 e.g.: steps/prepare_e2e.sh data/train data/lang_chain exp/chain/e2e_tree
main options (for others, see top of script file)
  --config <config-file>                           # config containing options
  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs.
  --type <mono | biphone>                          # context dependency type
  --tie <true | false>                             # enable/disable count-based tying
EOF
  exit 1;
fi

# Positional arguments.
data=$1   # data directory (text, feats.scp, ...)
lang=$2   # lang directory (phones.txt, topo, L.fst, words.txt, ...)
dir=$3    # output (experiment) directory

if [[ "$type" != "mono" && "$type" != "biphone" ]]; then
  echo "'type' should be either mono or biphone."
  exit 1;
fi

oov_sym=`cat $lang/oov.int` || exit 1;

mkdir -p $dir/log

echo $scale_opts > $dir/scale_opts  # just for easier reference (it is in the logs too)
echo $nj > $dir/num_jobs
# Re-split the data unless an up-to-date split directory already exists.
sdata=$data/split$nj;
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;

cp $lang/phones.txt $dir || exit 1;

# Say why we stop instead of exiting silently.
if [ ! -f $lang/phones/sets.int ]; then
  echo "$0: expected file $lang/phones/sets.int to exist"
  exit 1;
fi

# Optional arguments for gmm-init-$type.  They are reset first so that values
# inherited from the environment cannot leak in when the switch is false.
shared_phones_opt=
if $shared_phones; then
  shared_phones_opt="--shared-phones=$lang/phones/sets.int"
fi

ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1;
ci_opt=
if $ci_silence; then
  ci_opt="--ci-phones=$ciphonelist"
fi

# For count-based tying of biphone trees, collect phone statistics from the
# training transcripts (converted to phone ids with no optional silence).
tie_opts=
if $tie && [[ "$type" = "biphone" ]]; then
  # Stop if the last command of the pipeline fails, rather than handing an
  # incomplete phone-stats.txt to gmm-init-biphone.
  cat $data/text | steps/chain/e2e/text_to_phones.py --edge-silprob 0 \
                                                     --between-silprob 0 \
                                                     $lang | \
    cut -d' ' -f 2- | utils/sym2int.pl $lang/phones.txt | \
    steps/chain/e2e/compute_biphone_stats.py $lang >$dir/phone-stats.txt || exit 1;
  tie_opts="--min-biphone-count=$min_biphone_count \
--min-monophone-count=$min_monophone_count --phone-counts=$dir/phone-stats.txt"
fi

# Stage 0: create (or copy) the initial model and tree, then derive the
# transition model from it.
if [ $stage -le 0 ]; then
  # Quoted so the test is well-formed whether or not treedir is set.
  if [ -z "$treedir" ]; then
    echo "$0: Initializing $type system."
    # feat dim does not matter here. Just set it to 10
    $cmd $dir/log/init_${type}_mdl_tree.log \
         gmm-init-$type $tie_opts $ci_opt $shared_phones_opt $lang/topo 10 \
         $dir/0.mdl $dir/tree || exit 1;
  else
    echo "$0: Copied tree/mdl from $treedir." >$dir/log/init_mdl_tree.log
    cp $treedir/final.mdl $dir/0.mdl || exit 1;
    cp $treedir/tree $dir || exit 1;
  fi
  # Stop here if the transition model cannot be written.
  copy-transition-model $dir/0.mdl $dir/0.trans_mdl || exit 1;
  # for consistency with scripts which require a final.mdl; -f lets the
  # stage be re-run when the link already exists.
  ln -sf 0.mdl $dir/final.mdl || exit 1;
fi

# Stage 1: compile one training graph per utterance, in $nj parallel jobs.
lex=$lang/L.fst  # lexicon FST handed to compile-train-graphs
if [ $stage -le 1 ]; then
  echo "$0: Compiling training graphs"
  # The transcripts of each split are converted to word ids on the fly
  # (out-of-vocabulary words are mapped to $oov_sym); job JOB writes its
  # graphs to fst.JOB.ark with the index fst.JOB.scp.
  $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
    compile-train-graphs $scale_opts --read-disambig-syms=$lang/phones/disambig.int \
    $dir/tree $dir/0.mdl $lex \
    "ark:sym2int.pl --map-oov $oov_sym -f 2- $lang/words.txt < $sdata/JOB/text|" \
    "ark,scp:$dir/fst.JOB.ark,$dir/fst.JOB.scp" || exit 1;
fi

echo "$0: Done"


================================================
FILE: egs/steps/nnet3/chain/e2e/text_to_phones.py
================================================
#!/usr/bin/env python

# Copyright    2017 Hossein Hadian
# Apache 2.0


""" This reads data/train/text from standard input, converts the word transcriptions
    to phone transcriptions using the provided lexicon,
    and writes them to standard output.
"""
from __future__ import print_function

import argparse
from os.path import join
import sys
import copy
import random

# Command-line interface: the lang directory plus the two probabilities with
# which the optional-silence phone is inserted.
_DESCRIPTION = """This script reads
    data/train/text from std input and converts the word transcriptions
    to phone transcriptions using the provided lexicon"""

parser = argparse.ArgumentParser(description=_DESCRIPTION)
parser.add_argument('langdir', type=str)
parser.add_argument(
    '--edge-silprob',
    type=float,
    default=0.8,
    help="""Probability of optional silence at the beginning
                    and end.""")
parser.add_argument(
    '--between-silprob',
    type=float,
    default=0.2,
    help="Probability of optional silence between the words.")


args = parser.parse_args()

# optional silence: first line of <langdir>/phones/optional_silence.txt
with open(join(args.langdir, "phones/optional_silence.txt")) as f:
    sil = f.readline().strip()

# the word that replaces out-of-vocabulary words: first line of <langdir>/oov.txt
with open(join(args.langdir, "oov.txt")) as f:
    oov_word = f.readline().strip()


# load the lexicon: word -> list of phones.  The first field of a line is
# the word, the second field is ignored, the remaining fields are the phones.
lexicon = {}
with open(join(args.langdir, "phones/align_lexicon.txt")) as f:
    for line in f:
        parts = line.split()
        if not parts:
            continue  # tolerate blank lines instead of raising IndexError
        lexicon[parts[0]] = parts[2:]  # ignore parts[1]

# Convert every "<key> <word> <word> ..." line on stdin into
# "<key> <phone> <phone> ..." on stdout.  The optional-silence phone is
# inserted at random: with probability --edge-silprob before the first and
# after the last word, and with probability --between-silprob between words.
n_tot = 0   # number of words seen
n_fail = 0  # number of words missing from the lexicon
for line in sys.stdin:
    line = line.strip().split()
    if not line:
        continue  # blank line: no key to echo, skip it instead of crashing
    key = line[0]
    word_trans = line[1:]   # word-level transcription
    phone_trans = []        # phone-level transcription
    if random.random() < args.edge_silprob:
        phone_trans.append(sil)
    last_index = len(word_trans) - 1
    for i, word in enumerate(word_trans):
        n_tot += 1
        if word in lexicon:
            pronunciation = lexicon[word]
        else:
            n_fail += 1
            # Only the first few OOVs are reported, to keep the log short.
            if n_fail < 20:
                sys.stderr.write("{} not found in lexicon, replacing with {}\n".format(word, oov_word))
            elif n_fail == 20:
                sys.stderr.write("Not warning about OOVs any more.\n")
            pronunciation = lexicon[oov_word]
        # extend() copies the phones, so the lexicon entry is never modified
        phone_trans.extend(pronunciation)
        # after the last word the edge probability applies
        prob = args.between_silprob if i < last_index else args.edge_silprob
        if random.random() < prob:
            phone_trans.append(sil)
    print(key + " " + " ".join(phone_trans))

sys.stderr.write("Done. {} out of {} were OOVs.\n".format(n_fail, n_tot))


================================================
FILE: egs/steps/nnet3/chain/e2e/train_e2e.py
================================================
#!/usr/bin/env python

# Copyright 2016    Vijayaditya Peddinti.
#           2016    Vimal Manohar
#           2017    Hossein Hadian
# Apache 2.0.

""" This script does flat-start chain training and is based on
    steps/nnet3/chain/train.py.
"""

import argparse
import logging
import os
import pprint
import shutil
import sys
import traceback

sys.path.insert(0, 'steps')
import libs.nnet3.train.common as common_train_lib
import libs.common as common_lib
import libs.nnet3.train.chain_objf.acoustic_model as chain_lib
import libs.nnet3.report.log_parse as nnet3_log_parse


# Configure the 'libs' logger: INFO level, one stream handler, and a format
# showing time, source location, function name and level for every message.
_LOG_FORMAT = ("%(asctime)s [%(pathname)s:%(lineno)s - "
               "%(funcName)s - %(levelname)s ] %(message)s")

logger = logging.getLogger('libs')
handler = logging.StreamHandler()
formatter = logging.Formatter(_LOG_FORMAT)
handler.setFormatter(formatter)
for _target in (logger, handler):
    _target.setLevel(logging.INFO)
logger.addHandler(handler)
logger.info('Starting chain model trainer (train.py)')


def get_args():
    """ Parse the command-line arguments for end-to-end chain training.

    Compulsory arguments are added as named (required) options for
    readability.

    The common options are defined in the object
    libs.nnet3.train.common.CommonParser.parser.
    See steps/libs/nnet3/train/common.py

    Returns:
        [args, run_opts] as returned by process_args(), which is called on
        the parsed arguments before returning.
    """

    # conflict_handler='resolve' allows an option added below to replace a
    # same-named option inherited from the common parent parser.
    parser = argparse.ArgumentParser(
        description="""Trains RNN and DNN acoustic models using the 'chain'
        objective function.""",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        conflict_handler='resolve',
        parents=[common_train_lib.CommonParser().parser])

    # egs extraction options
    parser.add_argument("--egs.chunk-width", type=str, dest='chunk_width',
                        default="20",
                        help="""Number of frames per chunk in the examples
                        used to train the RNN.   Caution: if you double this you
                        should halve --trainer.samples-per-iter.  May be
                        a comma-separated list of alternatives: first width
                        is the 'principal' chunk-width, used preferentially""")

    # chain options
    parser.add_argument("--chain.lm-opts", type=str, dest='lm_opts',
                        default=None, action=common_lib.NullstrToNoneAction,
                        help="options to be be passed to chain-est-phone-lm")
    parser.add_argument("--chain.l2-regularize", type=float,
                        dest='l2_regularize', default=0.0,
                        help="""Weight of regularization function which is the
                        l2-norm of the output of the network. It should be used
                        without the log-softmax layer for the outputs.  As
                        l2-norm of the log-softmax outputs can dominate the
                        objective function.""")
    parser.add_argument("--chain.xent-regularize", type=float,
                        dest='xent_regularize', default=0.0,
                        help="Weight of regularization function which is the "
                        "cross-entropy cost the outputs.")
    parser.add_argument("--chain.leaky-hmm-coefficient", type=float,
                        dest='leaky_hmm_coefficient', default=0.00001,
                        help="")
    parser.add_argument("--chain.apply-deriv-weights", type=str,
                        dest='apply_deriv_weights', default=True,
                        action=common_lib.StrToBoolAction,
                        choices=["true", "false"],
                        help="")
    parser.add_argument("--chain.frame-subsampling-factor", type=int,
                        dest='frame_subsampling_factor', default=3,
                        help="ratio of frames-per-second of features we "
                        "train on, to chain model's output")
    parser.add_argument("--chain.alignment-subsampling-factor", type=int,
                        dest='alignment_subsampling_factor',
                        default=3,
                        help="ratio of frames-per-second of input "
                        "alignments to chain model's output")
    # Deprecated option; process_args() maps it onto deriv_truncate_margin.
    parser.add_argument("--chain.left-deriv-truncate", type=int,
                        dest='left_deriv_truncate',
                        default=None,
                        help="Deprecated. Kept for back compatibility")


    # trainer options
    parser.add_argument("--trainer.num-epochs", type=float, dest='num_epochs',
                        default=10.0,
                        help="Number of epochs to train the model")
    parser.add_argument("--trainer.frames-per-iter", type=int,
                        dest='frames_per_iter', default=800000,
                        help="""Each iteration of training, see this many
                        [input] frames per job.  This option is passed to
                        get_egs.sh.  Aim for about a minute of training
                        time""")
    parser.add_argument("--trainer.num-chunk-per-minibatch", type=str,
                        dest='num_chunk_per_minibatch', default='128',
                        help="""Number of sequences to be processed in
                        parallel every minibatch.  May be a more general
                        rule as accepted by the --minibatch-size option of
                        nnet3-merge-egs; run that program without args to see
                        the format.""")

    # Parameters for the optimization
    parser.add_argument("--trainer.optimization.initial-effective-lrate",
                        type=float, dest='initial_effective_lrate',
                        default=0.0002,
                        help="Learning rate used during the initial iteration")
    parser.add_argument("--trainer.optimization.final-effective-lrate",
                        type=float, dest='final_effective_lrate',
                        default=0.00002,
                        help="Learning rate used during the final iteration")
    parser.add_argument("--trainer.optimization.shrink-value", type=float,
                        dest='shrink_value', default=1.0,
                        help="""Scaling factor used for scaling the parameter
                        matrices when the derivative averages are below the
                        shrink-threshold at the non-linearities.  E.g. 0.99.
                        Only applicable when the neural net contains sigmoid or
                        tanh units.""")
    parser.add_argument("--trainer.optimization.shrink-saturation-threshold",
                        type=float,
                        dest='shrink_saturation_threshold', default=0.40,
                        help="""Threshold that controls when we apply the
                        'shrinkage' (i.e. scaling by shrink-value).  If the
                        saturation of the sigmoid and tanh nonlinearities in
                        the neural net (as measured by
                        steps/nnet3/get_saturation.pl) exceeds this threshold
                        we scale the parameter matrices with the
                        shrink-value.""")
    # RNN-specific training options
    parser.add_argument("--trainer.deriv-truncate-margin", type=int,
                        dest='deriv_truncate_margin', default=None,
                        help="""(Relevant only for recurrent models). If
                        specified, gives the margin (in input frames) around
                        the 'required' part of each chunk that the derivatives
                        are backpropagated to. If unset, the derivatives are
                        backpropagated all the way to the boundaries of the
                        input data. E.g. 8 is a reasonable setting. Note: the
                        'required' part of the chunk is defined by the model's
                        {left,right}-context.""")

    # General options
    parser.add_argument("--feat-dir", type=str, required=True,
                        help="Directory with features used for training "
                        "the neural network.")
    parser.add_argument("--tree-dir", type=str, required=True,
                        help="""Directory containing the tree to use for this
                        model (we also expect 0.mdl and fsts.* in that
                        directory""")
    parser.add_argument("--dir", type=str, required=True,
                        help="Directory to store the models and "
                        "all other files.")

    # Echo the invocation to stdout, first as a single string and then as
    # the raw argument list.
    print(' '.join(sys.argv))
    print(sys.argv)

    args = parser.parse_args()

    # Validate the parsed options and derive the job-running options.
    [args, run_opts] = process_args(args)

    return [args, run_opts]


def process_args(args):
    """ Validate the options returned by get_args() and build the run options.

    Args:
        args: the argparse Namespace parsed in get_args().  Its use_gpu
            attribute may be rewritten ("true"/"false" -> "yes"/"no"), and
            deriv_truncate_margin is overwritten when the deprecated
            --chain.left-deriv-truncate option was given.

    Returns:
        [args, run_opts], where run_opts is a common_train_lib.RunOpts object
        filled with the queue/GPU option strings and the commands to use.

    Raises:
        Exception: if an option has an invalid value, or if {dir}/configs
            does not exist.
    """
    chunk_width_ok = common_train_lib.validate_chunk_width(args.chunk_width)
    if not chunk_width_ok:
        raise Exception("--egs.chunk-width has an invalid value")

    minibatch_ok = common_train_lib.validate_minibatch_size_str(
        args.num_chunk_per_minibatch)
    if not minibatch_ok:
        raise Exception("--trainer.num-chunk-per-minibatch has an invalid value")

    if args.chunk_left_context < 0:
        raise Exception("--egs.chunk-left-context should be non-negative")
    if args.chunk_right_context < 0:
        raise Exception("--egs.chunk-right-context should be non-negative")

    # Map the deprecated option onto its replacement and tell the user.
    deprecated_truncate = args.left_deriv_truncate
    if deprecated_truncate is not None:
        args.deriv_truncate_margin = -deprecated_truncate
        logger.warning(
            "--chain.left-deriv-truncate (deprecated) is set by user, and "
            "--trainer.deriv-truncate-margin is set to negative of that "
            "value={0}. We recommend using the option "
            "--trainer.deriv-truncate-margin.".format(
                args.deriv_truncate_margin))

    config_dir = args.dir + "/configs"
    if not os.path.exists(config_dir):
        raise Exception("This scripts expects the directory specified with "
                        "--dir={0} to exist and have a configs/ directory which "
                        "is the output of make_configs.py script".format(args.dir))

    # set the options corresponding to args.use_gpu
    run_opts = common_train_lib.RunOpts()

    # Accept "true"/"false" as synonyms of "yes"/"no".
    if args.use_gpu == "true":
        args.use_gpu = "yes"
    elif args.use_gpu == "false":
        args.use_gpu = "no"

    if args.use_gpu in ("yes", "wait"):
        if not common_lib.check_if_cuda_compiled():
            logger.warning(
                """You are running with one thread but you have not compiled
                   for CUDA.  You may be running a setup optimized for GPUs.
                   If you have GPUs and have nvcc installed, go to src/ and do
                   ./configure; make""")
        queue_opt = "--gpu 1"
        gpu_opt = "--use-gpu={}".format(args.use_gpu)
    else:
        logger.warning("Without using a GPU this will be very slow. "
                       "nnet3 does not yet support multiple threads.")
        queue_opt = ""
        gpu_opt = "--use-gpu=no"

    # Training and model combination share the same queue and GPU settings.
    run_opts.train_queue_opt = queue_opt
    run_opts.parallel_train_opts = gpu_opt
    run_opts.combine_queue_opt = queue_opt
    run_opts.combine_gpu_opt = gpu_opt

    run_opts.command = args.command
    # The egs command falls back to the general command when not given.
    if args.egs_command is not None:
        run_opts.egs_command = args.egs_command
    else:
        run_opts.egs_command = args.command

    return [args, run_opts]


def train(args, run_opts):
    """ The main function for training.

    Checks the required inputs, copies the tree-dir files into args.dir,
    creates the denominator FST, initializes the network, generates the
    end-to-end egs (unless --egs-dir is given), verifies the egs, runs the
    training iterations, combines the models into final.mdl, optionally
    cleans up, and writes {dir}/accuracy.report.

    Integer quantities (the egs contexts and the number of iterations) are
    computed with floor division so that they stay integers under Python 3
    as well as Python 2.

    Args:
        args: a Namespace object with the required parameters
            obtained from the function process_args()
        run_opts: RunOpts object obtained from the process_args()

    Raises:
        Exception: if a required input file is missing, if the model context
            variables are not defined in {dir}/configs/vars, if
            num_jobs_final exceeds the expanded number of archives, or if
            proportional-shrink yields a shrink value of 0.5 or less.
    """

    arg_string = pprint.pformat(vars(args))
    logger.info("Arguments for the experiment\n{0}".format(arg_string))

    # Check files
    files = ['{0}/feats.scp'.format(args.feat_dir), '{0}/fst.1.scp'.format(args.tree_dir),
             '{0}/final.mdl'.format(args.tree_dir), '{0}/tree'.format(args.tree_dir),
             '{0}/phone_lm.fst'.format(args.tree_dir),
             '{0}/num_jobs'.format(args.tree_dir)]
    for file in files:
        if not os.path.isfile(file):
            raise Exception('Expected {0} to exist.'.format(file))

    # Set some variables.
    num_jobs = common_lib.get_number_of_jobs(args.tree_dir)
    feat_dim = common_lib.get_feat_dim(args.feat_dir)
    ivector_dim = common_lib.get_ivector_dim(args.online_ivector_dir)
    ivector_id = common_lib.get_ivector_extractor_id(args.online_ivector_dir)
    logger.info("feat-dim: {}, ivector-dim: {}".format(feat_dim, ivector_dim))

    # split the training data into parts for individual jobs
    # we will use the same number of jobs as that used for compiling FSTs
    common_lib.execute_command("utils/split_data.sh {0} {1}".format(
            args.feat_dir, num_jobs))
    shutil.copy('{0}/tree'.format(args.tree_dir), args.dir)
    shutil.copy('{0}/phones.txt'.format(args.tree_dir), args.dir)
    shutil.copy('{0}/phone_lm.fst'.format(args.tree_dir), args.dir)
    shutil.copy('{0}/0.trans_mdl'.format(args.tree_dir), args.dir)
    with open('{0}/num_jobs'.format(args.dir), 'w') as f:
        f.write(str(num_jobs))

    config_dir = '{0}/configs'.format(args.dir)
    var_file = '{0}/vars'.format(config_dir)

    variables = common_train_lib.parse_generic_config_vars_file(var_file)

    # Set some variables.
    try:
        model_left_context = variables['model_left_context']
        model_right_context = variables['model_right_context']
    except KeyError as e:
        raise Exception("KeyError {0}: Variables need to be defined in "
                        "{1}".format(str(e), '{0}/configs'.format(args.dir)))

    # Total context = extra chunk context requested by the user + the
    # context of the model itself.  A value of -1 for the initial/final
    # context is passed through unchanged.
    left_context = args.chunk_left_context + model_left_context
    right_context = args.chunk_right_context + model_right_context
    left_context_initial = (args.chunk_left_context_initial + model_left_context if
                            args.chunk_left_context_initial >= 0 else -1)
    right_context_final = (args.chunk_right_context_final + model_right_context if
                           args.chunk_right_context_final >= 0 else -1)

    # Stage -5: create the denominator FST.
    if (args.stage <= -5):
        logger.info("Creating denominator FST")
        chain_lib.create_denominator_fst(args.dir, args.tree_dir, run_opts)

    # Stage -4: initialize the raw network from configs/final.config.
    if (args.stage <= -4):
        logger.info("Initializing a basic network...")
        common_lib.execute_command(
            """{command} {dir}/log/nnet_init.log \
                    nnet3-init --srand=-2 {dir}/configs/final.config \
                    {dir}/init.raw""".format(command=run_opts.command,
                                             dir=args.dir))

    # Use floor division ('//') so the contexts stay integers under Python 3
    # too: they are passed as --left-context/--right-context options to
    # get_egs_e2e.sh and handed to verify_egs_dir() below.
    half_subsampling = args.frame_subsampling_factor // 2
    egs_left_context = left_context + half_subsampling
    egs_right_context = right_context + half_subsampling
    egs_left_context_initial = (left_context_initial + half_subsampling if
                                left_context_initial >= 0 else -1)
    egs_right_context_final = (right_context_final + half_subsampling if
                               right_context_final >= 0 else -1)

    # Stage -3: generate the egs, unless an existing egs dir was supplied.
    default_egs_dir = '{0}/egs'.format(args.dir)
    if (args.stage <= -3) and args.egs_dir is None:
        logger.info("Generating end-to-end egs...")
        common_lib.execute_command(
            """steps/nnet3/chain/e2e/get_egs_e2e.sh {egs_opts} \
                    --cmd "{command}" \
                    --cmvn-opts "{cmvn_opts}" \
                    --online-ivector-dir "{ivector_dir}" \
                    --left-context {left_context} \
                    --right-context {right_context} \
                    --left-context-initial {left_context_initial} \
                    --right-context-final {right_context_final} \
                    --frame-subsampling-factor {frame_subsampling_factor} \
                    --stage {stage} \
                    --frames-per-iter {frames_per_iter} \
                    --srand {srand} \
                    {data} {dir} {fst_dir} {egs_dir}""".format(
                        command=run_opts.egs_command,
                        cmvn_opts=args.cmvn_opts if args.cmvn_opts is not None else '',
                        ivector_dir=(args.online_ivector_dir
                                     if args.online_ivector_dir is not None
                                     else ''),
                        left_context=egs_left_context,
                        right_context=egs_right_context,
                        left_context_initial=egs_left_context_initial,
                        right_context_final=egs_right_context_final,
                        frame_subsampling_factor=args.frame_subsampling_factor,
                        stage=args.egs_stage, frames_per_iter=args.frames_per_iter,
                        srand=args.srand,
                        data=args.feat_dir, dir=args.dir, fst_dir=args.tree_dir,
                        egs_dir=default_egs_dir,
                        egs_opts=args.egs_opts if args.egs_opts is not None else ''))

    if args.egs_dir is None:
        egs_dir = default_egs_dir
    else:
        egs_dir = args.egs_dir

    [egs_left_context, egs_right_context,
     frames_per_eg_str, num_archives] = (
        common_train_lib.verify_egs_dir(egs_dir, feat_dim,
                                        ivector_dim, ivector_id,
                                        egs_left_context, egs_right_context,
                                        egs_left_context_initial,
                                        egs_right_context_final))

    num_archives_expanded = num_archives * args.frame_subsampling_factor

    if (args.num_jobs_final > num_archives_expanded):
        raise Exception('num_jobs_final cannot exceed the '
                        'expanded number of archives')

    # copy the properties of the egs to dir for
    # use during decoding
    logger.info("Copying the properties from {0} to {1}".format(egs_dir, args.dir))
    common_train_lib.copy_egs_properties_to_exp_dir(egs_dir, args.dir)


    if (args.stage <= -1):
        logger.info("Preparing the initial acoustic model.")
        chain_lib.prepare_initial_acoustic_model(args.dir, run_opts)

    with open("{0}/frame_subsampling_factor".format(args.dir), "w") as f:
        f.write(str(args.frame_subsampling_factor))

    # set num_iters so that as close as possible, we process the data
    # $num_epochs times, i.e. $num_iters*$avg_num_jobs) ==
    # $num_epochs*$num_archives, where
    # avg_num_jobs=(num_jobs_initial+num_jobs_final)/2.
    num_archives_to_process = int(args.num_epochs * num_archives_expanded)
    num_archives_processed = 0
    # Floor division: num_iters must be an integer because it is used with
    # range() below ('/' would produce a float under Python 3).
    num_iters = ((num_archives_to_process * 2)
                 // (args.num_jobs_initial + args.num_jobs_final))

    models_to_combine = common_train_lib.get_model_combine_iters(
        num_iters, args.num_epochs,
        num_archives_expanded, args.max_models_combine,
        args.num_jobs_final)

    min_deriv_time = None
    max_deriv_time_relative = None
    if args.deriv_truncate_margin is not None:
        min_deriv_time = -args.deriv_truncate_margin - model_left_context
        max_deriv_time_relative = \
           args.deriv_truncate_margin + model_right_context

    logger.info("Training will run for {0} epochs = "
                "{1} iterations".format(args.num_epochs, num_iters))

    for iter in range(num_iters):

        percent = num_archives_processed * 100.0 / num_archives_to_process
        epoch = (num_archives_processed * args.num_epochs
                 / num_archives_to_process)

        if (args.exit_stage is not None) and (iter == args.exit_stage):
            logger.info("Exiting early due to --exit-stage {0}".format(iter))
            return

        current_num_jobs = common_train_lib.get_current_num_jobs(
            iter, num_iters,
            args.num_jobs_initial, args.num_jobs_step, args.num_jobs_final)

        # Iterations before args.stage are skipped (only the bookkeeping at
        # the bottom of the loop is done for them).
        if args.stage <= iter:
            model_file = "{dir}/{iter}.mdl".format(dir=args.dir, iter=iter)

            lrate = common_train_lib.get_learning_rate(iter, current_num_jobs,
                                                       num_iters,
                                                       num_archives_processed,
                                                       num_archives_to_process,
                                                       args.initial_effective_lrate,
                                                       args.final_effective_lrate)
            shrinkage_value = 1.0 - (args.proportional_shrink * lrate)
            if shrinkage_value <= 0.5:
                raise Exception("proportional-shrink={0} is too large, it gives "
                                "shrink-value={1}".format(args.proportional_shrink,
                                                          shrinkage_value))
            # Use the (smaller) user-specified shrink value only when
            # should_do_shrinkage() says so for the current model.
            if args.shrink_value < shrinkage_value:
                shrinkage_value = (args.shrink_value
                                   if common_train_lib.should_do_shrinkage(
                                        iter, model_file,
                                        args.shrink_saturation_threshold)
                                   else shrinkage_value)

            shrink_info_str = ''
            if shrinkage_value != 1.0:
                shrink_info_str = 'shrink: {0:0.5f}'.format(shrinkage_value)
            logger.info("Iter: {0}/{1}   Jobs: {2}   "
                        "Epoch: {3:0.2f}/{4:0.1f} ({5:0.1f}% complete)   "
                        "lr: {6:0.6f}   {7}".format(iter, num_iters - 1,
                                                    current_num_jobs,
                                                    epoch, args.num_epochs,
                                                    percent,
                                                    lrate, shrink_info_str))

            chain_lib.train_one_iteration(
                dir=args.dir,
                iter=iter,
                srand=args.srand,
                egs_dir=egs_dir,
                num_jobs=current_num_jobs,
                num_archives_processed=num_archives_processed,
                num_archives=num_archives,
                learning_rate=lrate,
                dropout_edit_string=common_train_lib.get_dropout_edit_string(
                    args.dropout_schedule,
                    float(num_archives_processed) / num_archives_to_process,
                    iter),
                shrinkage_value=shrinkage_value,
                num_chunk_per_minibatch_str=args.num_chunk_per_minibatch,
                apply_deriv_weights=args.apply_deriv_weights,
                min_deriv_time=min_deriv_time,
                max_deriv_time_relative=max_deriv_time_relative,
                l2_regularize=args.l2_regularize,
                xent_regularize=args.xent_regularize,
                leaky_hmm_coefficient=args.leaky_hmm_coefficient,
                momentum=args.momentum,
                max_param_change=args.max_param_change,
                shuffle_buffer_size=args.shuffle_buffer_size,
                frame_subsampling_factor=args.frame_subsampling_factor,
                run_opts=run_opts)


            if args.cleanup:
                # do a clean up everything but the last 2 models, under certain
                # conditions
                common_train_lib.remove_model(
                    args.dir, iter-2, num_iters, models_to_combine,
                    args.preserve_model_interval)

            if args.email is not None:
                reporting_iter_interval = num_iters * args.reporting_interval
                if iter % reporting_iter_interval == 0:
                    # lets do some reporting
                    [report, times, data] = (
                        nnet3_log_parse.generate_acc_logprob_report(
                            args.dir, "log-probability"))
                    message = report
                    subject = ("Update : Expt {dir} : "
                               "Iter {iter}".format(dir=args.dir, iter=iter))
                    common_lib.send_mail(message, subject, args.email)

        num_archives_processed = num_archives_processed + current_num_jobs


    if args.stage <= num_iters:
        logger.info("Doing final combination to produce final.mdl")
        chain_lib.combine_models(
            dir=args.dir, num_iters=num_iters,
            models_to_combine=models_to_combine,
            num_chunk_per_minibatch_str=args.num_chunk_per_minibatch,
            egs_dir=egs_dir,
            leaky_hmm_coefficient=args.leaky_hmm_coefficient,
            l2_regularize=args.l2_regularize,
            xent_regularize=args.xent_regularize,
            run_opts=run_opts)


    if args.cleanup:
        logger.info("Cleaning up the experiment directory "
                    "{0}".format(args.dir))
        remove_egs = args.remove_egs
        if args.egs_dir is not None:
            # this egs_dir was not created by this experiment so we will not
            # delete it
            remove_egs = False

        common_train_lib.clean_nnet_dir(
            args.dir, num_iters, egs_dir,
            preserve_model_interval=args.preserve_model_interval,
            remove_egs=remove_egs)

    # do some reporting
    [report, times, data] = nnet3_log_parse.generate_acc_logprob_report(
        args.dir, "log-probability")
    if args.email is not None:
        common_lib.send_mail(report, "Update : Expt {0} : "
                                     "complete".format(args.dir), args.email)

    with open("{dir}/accuracy.report".format(dir=args.dir), "w") as f:
        f.write(report)

    common_lib.execute_command("steps/info/chain_dir_info.pl "
                                 "{0}".format(args.dir))


def main():
    """ Parse the arguments, run training, and exit with status 1 on failure.

    Any exception raised during training (including KeyboardInterrupt) is
    reported by e-mail when --email was given; a traceback is printed for
    everything except KeyboardInterrupt.
    """
    args, run_opts = get_args()
    try:
        train(args, run_opts)
        common_lib.wait_for_background_commands()
    except BaseException as err:
        # look for BaseException so we catch KeyboardInterrupt, which is
        # what we get when a background thread dies.
        if args.email is not None:
            notice = ("Training session for experiment {dir} "
                      "died due to an error.").format(dir=args.dir)
            # The same text is used as both the body and the subject.
            common_lib.send_mail(notice, notice, args.email)
        interrupted = isinstance(err, KeyboardInterrupt)
        if not interrupted:
            traceback.print_exc()
        sys.exit(1)

if __name__ == "__main__":
    main()


================================================
FILE: egs/steps/nnet3/chain/gen_topo.pl
================================================
#!/usr/bin/env perl

# Copyright 2012  Johns Hopkins University (author: Daniel Povey)

# Generate a topology file.  This allows control of the number of states in the
# non-silence HMMs, and in the silence HMMs.  This is a modified version of
# 'utils/gen_topo.pl' that generates a different type of topology, one that we
# believe should be useful in the 'chain' model.  Note: right now it doesn't
# have any real options, and it treats silence and nonsilence the same.  The
# intention is that you write different versions of this script, or add options,
# if you experiment with it.

# Exactly two positional arguments are required: the colon-separated lists of
# nonsilence and silence phone ids.
unless (@ARGV == 2) {
  print STDERR "Usage: utils/gen_topo.pl <colon-separated-nonsilence-phones> <colon-separated-silence-phones>\n";
  print STDERR "e.g.:  utils/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n";
  exit (1);
}

($nonsil_phones, $sil_phones) = @ARGV;

# Turn the colon separators into spaces, then make sure each list consists
# only of digits and spaces and starts with a digit.
$nonsil_phones =~ tr/:/ /;
$sil_phones =~ tr/:/ /;
die "$0: bad arguments @ARGV\n" unless $nonsil_phones =~ m/^\d[ \d]+$/;
die "$0: bad arguments @ARGV\n" unless $sil_phones =~ m/^\d[ \d]*$/;

# A single topology entry is emitted for all phones (nonsilence followed by
# silence).
# The two <State> lines for states 0 and 1 may look like a bug, but they are as
# intended.  State 0 has no self-loop, it happens exactly once.  And it can go
# either to state 1 (with a self-loop) or to state 2, so we can have zero or
# more instances of state 1 following state 0.
# We make the transition-probs 0.5 so they normalize, to keep the code happy.
# In fact, we always set the transition probability scale to 0.0 in the 'chain'
# code, so they are never used.
print <<"END_OF_TOPOLOGY";
<Topology>
<TopologyEntry>
<ForPhones>
$nonsil_phones $sil_phones
</ForPhones>
<State> 0 <PdfClass> 0 <Transition> 1 0.5 <Transition> 2 0.5 </State>
<State> 1 <PdfClass> 1 <Transition> 1 0.5 <Transition> 2 0.5 </State>
<State> 2 </State>
</TopologyEntry>
</Topology>
END_OF_TOPOLOGY


================================================
FILE: egs/steps/nnet3/chain/gen_topo.py
================================================
#!/usr/bin/env python

# Copyright 2012  Johns Hopkins University (author: Daniel Povey)

# This script was modified around 11.11.2016, when the code was extended to
# support having a different pdf-class on the self loop.

# Generate a topology file.  This allows control of the number of states in the
# non-silence HMMs, and in the silence HMMs.  This is a modified version of
# 'utils/gen_topo.pl' that generates a different type of topology, one that we
# believe should be useful in the 'chain' model.  Note: right now it doesn't
# have any real options, and it treats silence and nonsilence the same.  The
# intention is that you write different versions of this script, or add options,
# if you experiment with it.

from __future__ import print_function
import argparse


# Command-line interface: two positional arguments, each a colon-separated
# list of integer phone ids.
parser = argparse.ArgumentParser(
    description=("Usage: steps/nnet3/chain/gen_topo.py "
                 "<colon-separated-nonsilence-phones> <colon-separated-silence-phones>"
                 "e.g.:  steps/nnet3/chain/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n"),
    epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage.")
parser.add_argument(
    "nonsilence_phones", type=str,
    help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9")
parser.add_argument(
    "silence_phones", type=str,
    help="List of silence phones as integers, separated by colons, e.g. 1:2:3")

args = parser.parse_args()

# Parse both lists as integers; the silence phones are listed first in the
# output.
silence_phones = list(map(int, args.silence_phones.split(":")))
nonsilence_phones = list(map(int, args.nonsilence_phones.split(":")))
all_phones = silence_phones + nonsilence_phones

# We make the transition-probs 0.5 so they normalize, to keep the code happy.
# In fact, we always set the transition probability scale to 0.0 in the 'chain'
# code, so they are never used.
# Note: the <ForwardPdfClass> will actually happen on the incoming arc because
# we always build the graph with "reorder=true".
topology_lines = [
    "<Topology>",
    "<TopologyEntry>",
    "<ForPhones>",
    " ".join(map(str, all_phones)),
    "</ForPhones>",
    "<State> 0 <ForwardPdfClass> 0 <SelfLoopPdfClass> 1 <Transition> 0 0.5 <Transition> 1 0.5 </State>",
    "<State> 1 </State>",
    "</TopologyEntry>",
    "</Topology>",
]
for topology_line in topology_lines:
    print(topology_line)


================================================
FILE: egs/steps/nnet3/chain/gen_topo2.py
================================================
#!/usr/bin/env python

# Copyright 2012  Johns Hopkins University (author: Daniel Povey)

# Generate a topology file.  This allows control of the number of states in the
# non-silence HMMs, and in the silence HMMs.  This is a modified version of
# 'utils/gen_topo.pl' that generates a different type of topology, one that we
# believe should be useful in the 'chain' model.  Note: right now it doesn't
# have any real options, and it treats silence and nonsilence the same.  The
# intention is that you write different versions of this script, or add options,
# if you experiment with it.

from __future__ import print_function
import argparse


def build_parser():
    """Return the argparse parser describing this script's command line.

    The script takes two positional arguments, both colon-separated lists of
    integer phone ids: the non-silence phones, then the silence phones.
    """
    parser = argparse.ArgumentParser(
        description="Usage: steps/nnet3/chain/gen_topo2.py "
                    "<colon-separated-nonsilence-phones> <colon-separated-silence-phones> "
                    "e.g.:  steps/nnet3/chain/gen_topo2.py 4:5:6:7:8:9:10 1:2:3\n",
        epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage.")
    parser.add_argument("nonsilence_phones", type=str,
                        help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9")
    parser.add_argument("silence_phones", type=str,
                        help="List of silence phones as integers, separated by colons, e.g. 1:2:3")
    return parser


def topology_lines(nonsilence_phones, silence_phones):
    """Return the topology file as a list of text lines (no newlines).

    Both arguments are colon-separated strings of integer phone ids.  Silence
    phones are listed first in the <ForPhones> section, followed by the
    non-silence phones; both share the same topology entry.

    Raises ValueError if any field is not an integer.
    """
    silence = [int(x) for x in silence_phones.split(":")]
    nonsilence = [int(x) for x in nonsilence_phones.split(":")]
    all_phones = silence + nonsilence

    # the pdf-classes are as follows:
    #  pdf-class 0 is in a 1-frame sequence, the initial and final state.
    #  pdf-class 1 is in a sequence with >=3 frames, the 'middle' states.  (important that
    #   it be numbered 1, which is the default list of pdf-classes used in 'cluster-phones').
    #  pdf-class 2 is the initial-state in a sequence with >= 2 frames.
    #  pdf-class 3 is the final-state in a sequence with >= 2 frames.
    # state 0 is nonemitting in this topology.
    return [
        "<Topology>",
        "<TopologyEntry>",
        "<ForPhones>",
        " ".join([str(x) for x in all_phones]),
        "</ForPhones>",
        "<State> 0 <Transition> 1 0.5 <Transition> 2 0.5 </State>",  # initial nonemitting state.
        "<State> 1 <PdfClass> 0 <Transition> 5 1.0 </State>",  # 1-frame sequence.
        "<State> 2 <PdfClass> 2 <Transition> 3 0.5 <Transition> 4 0.5 </State>",  # 2 or more frames
        "<State> 3 <PdfClass> 1 <Transition> 3 0.5 <Transition> 4 0.5 </State>",  # 3 or more frames
        "<State> 4 <PdfClass> 3 <Transition> 5 1.0 </State>",  # 2 or more frames.
        "<State> 5 </State>",  # final nonemitting state
        "</TopologyEntry>",
        "</Topology>",
    ]


def main():
    """Parse the command line and print the topology to stdout."""
    args = build_parser().parse_args()
    for line in topology_lines(args.nonsilence_phones, args.silence_phones):
        print(line)


if __name__ == "__main__":
    main()


================================================
FILE: egs/steps/nnet3/chain/gen_topo3.py
================================================
#!/usr/bin/env python

# Copyright 2012  Johns Hopkins University (author: Daniel Povey)

# Generate a topology file.  This allows control of the number of states in the
# non-silence HMMs, and in the silence HMMs.  This is a modified version of
# 'utils/gen_topo.pl' that generates a different type of topology, one that we
# believe should be useful in the 'chain' model.  Note: right now it doesn't
# have any real options, and it treats silence and nonsilence the same.  The
# intention is that you write different versions of this script, or add options,
# if you experiment with it.

from __future__ import print_function
import argparse


def build_parser():
    """Return the argparse parser describing this script's command line.

    The script takes two positional arguments, both colon-separated lists of
    integer phone ids: the non-silence phones, then the silence phones.
    """
    parser = argparse.ArgumentParser(
        description="Usage: steps/nnet3/chain/gen_topo3.py "
                    "<colon-separated-nonsilence-phones> <colon-separated-silence-phones> "
                    "e.g.:  steps/nnet3/chain/gen_topo3.py 4:5:6:7:8:9:10 1:2:3\n",
        epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage.")
    parser.add_argument("nonsilence_phones", type=str,
                        help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9")
    parser.add_argument("silence_phones", type=str,
                        help="List of silence phones as integers, separated by colons, e.g. 1:2:3")
    return parser


def topology_lines(nonsilence_phones, silence_phones):
    """Return the topology file as a list of text lines (no newlines).

    Both arguments are colon-separated strings of integer phone ids.  Silence
    phones are listed first in the <ForPhones> section, followed by the
    non-silence phones; both share the same topology entry.

    Raises ValueError if any field is not an integer.
    """
    silence = [int(x) for x in silence_phones.split(":")]
    nonsilence = [int(x) for x in nonsilence_phones.split(":")]
    all_phones = silence + nonsilence

    # A single emitting state (pdf-class 0) with a self-loop, followed by the
    # final nonemitting state.
    return [
        "<Topology>",
        "<TopologyEntry>",
        "<ForPhones>",
        " ".join([str(x) for x in all_phones]),
        "</ForPhones>",
        "<State> 0 <PdfClass> 0 <Transition> 0 0.5 <Transition> 1 0.5 </State>",
        "<State> 1 </State>",
        "</TopologyEntry>",
        "</Topology>",
    ]


def main():
    """Parse the command line and print the topology to stdout."""
    args = build_parser().parse_args()
    for line in topology_lines(args.nonsilence_phones, args.silence_phones):
        print(line)


if __name__ == "__main__":
    main()


================================================
FILE: egs/steps/nnet3/chain/gen_topo4.py
================================================
#!/usr/bin/env python

# Copyright 2012  Johns Hopkins University (author: Daniel Povey)

# Generate a topology file.  This allows control of the number of states in the
# non-silence HMMs, and in the silence HMMs.  This is a modified version of
# 'utils/gen_topo.pl' that generates a different type of topology, one that we
# believe should be useful in the 'chain' model.  Note: right now it doesn't
# have any real options, and it treats silence and nonsilence the same.  The
# intention is that you write different versions of this script, or add options,
# if you experiment with it.

from __future__ import print_function
import argparse


def build_parser():
    """Return the argparse parser describing this script's command line.

    The script takes two positional arguments, both colon-separated lists of
    integer phone ids: the non-silence phones, then the silence phones.
    """
    parser = argparse.ArgumentParser(
        description="Usage: steps/nnet3/chain/gen_topo4.py "
                    "<colon-separated-nonsilence-phones> <colon-separated-silence-phones> "
                    "e.g.:  steps/nnet3/chain/gen_topo4.py 4:5:6:7:8:9:10 1:2:3\n",
        epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage.")
    parser.add_argument("nonsilence_phones", type=str,
                        help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9")
    parser.add_argument("silence_phones", type=str,
                        help="List of silence phones as integers, separated by colons, e.g. 1:2:3")
    return parser


def topology_lines(nonsilence_phones, silence_phones):
    """Return the topology file as a list of text lines (no newlines).

    Both arguments are colon-separated strings of integer phone ids.  Silence
    phones are listed first in the <ForPhones> section, followed by the
    non-silence phones; both share the same topology entry.

    Raises ValueError if any field is not an integer.
    """
    silence = [int(x) for x in silence_phones.split(":")]
    nonsilence = [int(x) for x in nonsilence_phones.split(":")]
    all_phones = silence + nonsilence

    return [
        "<Topology>",
        "<TopologyEntry>",
        "<ForPhones>",
        " ".join([str(x) for x in all_phones]),
        "</ForPhones>",
        # state 0 is obligatory (occurs once).  The trailing space inside the
        # string is kept so the output stays identical to earlier versions.
        "<State> 0 <PdfClass> 0 <Transition> 1 0.3333 <Transition> 2 0.3333 <Transition> 3 0.3333 </State> ",
        # state 1 is used only when >2 frames
        "<State> 1 <PdfClass> 1 <Transition> 1 0.5 <Transition> 2 0.5 </State>",
        # state 2 is used only when >=2 frames (and occurs once)
        "<State> 2 <PdfClass> 2 <Transition> 3 1.0 </State>",
        "<State> 3 </State>",  # final nonemitting state
        "</TopologyEntry>",
        "</Topology>",
    ]


def main():
    """Parse the command line and print the topology to stdout."""
    args = build_parser().parse_args()
    for line in topology_lines(args.nonsilence_phones, args.silence_phones):
        print(line)


if __name__ == "__main__":
    main()


================================================
FILE: egs/steps/nnet3/chain/gen_topo5.py
================================================
#!/usr/bin/env python

# Copyright 2012  Johns Hopkins University (author: Daniel Povey)

# Generate a topology file.  This allows control of the number of states in the
# non-silence HMMs, and in the silence HMMs.  This is a modified version of
# 'utils/gen_topo.pl' that generates a different type of topology, one that we
# believe should be useful in the 'chain' model.  Note: right now it doesn't
# have any real options, and it treats silence and nonsilence the same.  The
# intention is that you write different versions of this script, or add options,
# if you experiment with it.

from __future__ import print_function
import argparse


def build_parser():
    """Return the argparse parser describing this script's command line.

    The script takes two positional arguments, both colon-separated lists of
    integer phone ids: the non-silence phones, then the silence phones.
    """
    parser = argparse.ArgumentParser(
        description="Usage: steps/nnet3/chain/gen_topo5.py "
                    "<colon-separated-nonsilence-phones> <colon-separated-silence-phones> "
                    "e.g.:  steps/nnet3/chain/gen_topo5.py 4:5:6:7:8:9:10 1:2:3\n",
        epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage.")
    parser.add_argument("nonsilence_phones", type=str,
                        help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9")
    parser.add_argument("silence_phones", type=str,
                        help="List of silence phones as integers, separated by colons, e.g. 1:2:3")
    return parser


def topology_lines(nonsilence_phones, silence_phones):
    """Return the topology file as a list of text lines (no newlines).

    Both arguments are colon-separated strings of integer phone ids.  Silence
    phones are listed first in the <ForPhones> section, followed by the
    non-silence phones; both share the same topology entry.

    Raises ValueError if any field is not an integer.
    """
    silence = [int(x) for x in silence_phones.split(":")]
    nonsilence = [int(x) for x in nonsilence_phones.split(":")]
    all_phones = silence + nonsilence

    return [
        "<Topology>",
        "<TopologyEntry>",
        "<ForPhones>",
        " ".join([str(x) for x in all_phones]),
        "</ForPhones>",
        # state 0 is nonemitting
        "<State> 0 <Transition> 1 0.5 <Transition> 2 0.5 </State>",
        # state 1 is for when we traverse it in 1 state
        "<State> 1 <PdfClass> 0 <Transition> 4 1.0 </State>",
        # state 2 is for when we traverse it in >1 state, for the first state.
        "<State> 2 <PdfClass> 2 <Transition> 3 1.0 </State>",
        # state 3 is for the self-loop.  Use pdf-class 1 here so that the default
        # phone-class clustering (which uses only pdf-class 1 by default) gets only
        # stats from longer phones.
        "<State> 3 <PdfClass> 1 <Transition> 3 0.5 <Transition> 4 0.5 </State>",
        "<State> 4 </State>",
        "</TopologyEntry>",
        "</Topology>",
    ]


def main():
    """Parse the command line and print the topology to stdout."""
    args = build_parser().parse_args()
    for line in topology_lines(args.nonsilence_phones, args.silence_phones):
        print(line)


if __name__ == "__main__":
    main()


================================================
FILE: egs/steps/nnet3/chain/gen_topo_orig.py
================================================
#!/usr/bin/env python

# Copyright 2012  Johns Hopkins University (author: Daniel Povey)

# This file is as ./gen_topo.py used to be (before we extended the transition-model
# code to support having a different self-loop pdf-class).  It is included
# here for baseline and testing purposes.


# Generate a topology file.  This allows control of the number of states in the
# non-silence HMMs, and in the silence HMMs.  This is a modified version of
# 'utils/gen_topo.pl' that generates a different type of topology, one that we
# believe should be useful in the 'chain' model.  Note: right now it doesn't
# have any real options, and it treats silence and nonsilence the same.  The
# intention is that you write different versions of this script, or add options,
# if you experiment with it.

from __future__ import print_function
import argparse


def build_parser():
    """Return the argparse parser describing this script's command line.

    The script takes two positional arguments, both colon-separated lists of
    integer phone ids: the non-silence phones, then the silence phones.
    """
    parser = argparse.ArgumentParser(
        description="Usage: steps/nnet3/chain/gen_topo_orig.py "
                    "<colon-separated-nonsilence-phones> <colon-separated-silence-phones> "
                    "e.g.:  steps/nnet3/chain/gen_topo_orig.py 4:5:6:7:8:9:10 1:2:3\n",
        epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage.")
    parser.add_argument("nonsilence_phones", type=str,
                        help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9")
    parser.add_argument("silence_phones", type=str,
                        help="List of silence phones as integers, separated by colons, e.g. 1:2:3")
    return parser


def topology_lines(nonsilence_phones, silence_phones):
    """Return the topology file as a list of text lines (no newlines).

    Both arguments are colon-separated strings of integer phone ids.  Silence
    phones are listed first in the <ForPhones> section, followed by the
    non-silence phones; both share the same topology entry.

    Raises ValueError if any field is not an integer.
    """
    silence = [int(x) for x in silence_phones.split(":")]
    nonsilence = [int(x) for x in nonsilence_phones.split(":")]
    all_phones = silence + nonsilence

    # The two emitting states below may look like a bug, but they are as
    # intended.  State 0 has no self-loop, it happens exactly once.  And it can
    # go either to state 1 (with a self-loop) or to state 2, so we can have
    # zero or more instances of state 1 following state 0.
    # We make the transition-probs 0.5 so they normalize, to keep the code happy.
    # In fact, we always set the transition probability scale to 0.0 in the 'chain'
    # code, so they are never used.
    return [
        "<Topology>",
        "<TopologyEntry>",
        "<ForPhones>",
        " ".join([str(x) for x in all_phones]),
        "</ForPhones>",
        "<State> 0 <PdfClass> 0 <Transition> 1 0.5 <Transition> 2 0.5 </State>",
        "<State> 1 <PdfClass> 1 <Transition> 1 0.5 <Transition> 2 0.5 </State>",
        "<State> 2 </State>",
        "</TopologyEntry>",
        "</Topology>",
    ]


def main():
    """Parse the command line and print the topology to stdout."""
    args = build_parser().parse_args()
    for line in topology_lines(args.nonsilence_phones, args.silence_phones):
        print(line)


if __name__ == "__main__":
    main()


================================================
FILE: egs/steps/nnet3/chain/get_egs.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
#
# This script, which will generally be called from other neural-net training
# scripts, extracts the training examples used to train the 'chain' system
# (and also the validation examples used for diagnostics), and puts them in
# separate archives.
#
# This script dumps egs with many frames of labels, controlled by the
# frames_per_eg config variable (default: 25), plus left and right context.
# Because CTC training involves alignment of data, we can't meaningfully train
# frame by frame.   The supervision approach involves the time alignment, though--
# it is just applied in a loose way, where each symbol can appear in the
# frame-range that it was in in the alignment, extended by a certain margin.
#


# Begin configuration section.
cmd=run.pl         # wrapper used to launch jobs (see --cmd in the usage message).
frames_per_eg=25   # number of feature frames per example (not counting added context).
                   # more->less disk space and less time preparing egs, but more
                   # I/O during training.
frames_overlap_per_eg=0  # number of supervised frames of overlap that we aim for per eg.
                  # can be useful to avoid wasted data if you're using --left-deriv-truncate
                  # and --right-deriv-truncate.
frame_subsampling_factor=3 # frames-per-second of features we train on divided
                           # by frames-per-second at output of chain model
alignment_subsampling_factor=3 # frames-per-second of input alignments divided
                               # by frames-per-second at output of chain model
left_context=4    # amount of left-context per eg (i.e. extra frames of input features
                  # not present in the output supervision).
right_context=4   # amount of right-context per eg.
constrained=true  # 'constrained=true' is the traditional setup; 'constrained=false'
                  # gives you the 'unconstrained' egs creation in which the time
                  # boundaries are not enforced inside chunks.

left_context_initial=-1    # if >=0, left-context for first chunk of an utterance
right_context_final=-1     # if >=0, right-context for last chunk of an utterance
compress=true   # set this to false to disable compression (e.g. if you want to see whether
                # results are affected).

num_utts_subset=300     # number of utterances in validation and training
                        # subsets used for shrinkage and diagnostics.
num_valid_egs_combine=0  # #validation examples for combination weights at the very end.
num_train_egs_combine=1000 # number of train examples for the above.
num_egs_diagnostic=400 # number of egs kept for the diagnostic subsets (passed as --n
                       # to nnet3-chain-subset-egs below); used by "compute_prob" jobs.
frames_per_iter=400000 # each iteration of training, see this many frames per
                       # job, measured at the sampling rate of the features
                       # used.  This is just a guideline; it will pick a number
                       # that divides the number of samples in the entire data.

right_tolerance=  # chain right tolerance == max label delay.
left_tolerance=   # if set, passed as --left-tolerance to chain-get-supervision.

stage=0                 # lets you resume a partially-completed run from a later stage.
max_jobs_run=15         # This should be set to the maximum number of nnet3-chain-get-egs jobs you are
                        # comfortable to run in parallel; you can increase it if your disk
                        # speed is greater and you have more machines.
max_shuffle_jobs_run=50  # the shuffle jobs now include the nnet3-chain-normalize-egs command,
                         # which is fairly CPU intensive, so we can run quite a few at once
                         # without overloading the disks.
srand=0     # rand seed for nnet3-chain-get-egs, nnet3-chain-copy-egs and nnet3-chain-shuffle-egs
online_ivector_dir=  # can be used if we are including speaker information as iVectors.
cmvn_opts=  # can be used for specifying CMVN options, if feature type is not lda (if lda,
            # it doesn't make sense to use different options than were used as input to the
            # LDA transform).  This is used to turn off CMVN in the online-nnet experiments.
online_cmvn=false # Set to 'true' to replace 'apply-cmvn' by 'apply-cmvn-online' in the nnet3 input.
                  # The configuration is passed externally via '$cmvn_opts' given to train.py,
                  # typically as: --cmvn-opts="--config conf/online_cmvn.conf".
                  # The global_cmvn.stats are computed by this script from the features.
                  # Note: the online cmvn for the ivector extractor is controlled separately in
                  #       steps/online/nnet2/train_ivector_extractor.sh by --online-cmvn-iextractor
lattice_lm_scale=     # If supplied, the graph/lm weight of the lattices will be
                      # used (with this scale) in generating supervisions
                      # This is 0 by default for conventional supervised training,
                      # but may be close to 1 for the unsupervised part of the data
                      # in semi-supervised training. The optimum is usually
                      # 0.5 for unsupervised data.
lattice_prune_beam=         # If supplied, the lattices will be pruned to this beam,
                            # before being used to get supervisions.
acwt=0.1   # Acoustic scale; only affects lattice pruning / 1-best selection.
deriv_weights_scp=      # If supplied, adds per-frame weights to the supervision.
generate_egs_scp=false  # If true, also write .scp index files next to the egs archives.

echo "$0 $@"  # Print the command line for logging

# Pick up the environment (if path.sh exists) and let parse_options.sh
# override any of the configuration variables above from --name value pairs.
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;


# Exactly four positional arguments are required; otherwise print the usage
# message and exit.  The defaults quoted below mirror the values set in the
# configuration section at the top of this script.
if [ $# != 4 ]; then
  echo "Usage: $0 [opts] <data> <chain-dir> <lattice-dir> <egs-dir>"
  echo " e.g.: $0 data/train exp/tri4_nnet exp/tri3_lats exp/tri4_nnet/egs"
  echo ""
  echo "From <chain-dir>, 0.trans_mdl (the transition-model), tree (the tree)"
  echo "and normalization.fst (the normalization FST, derived from the denominator FST)"
  echo "are read."
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --max-jobs-run <max-jobs-run>                    # The maximum number of jobs you want to run in"
  echo "                                                   # parallel (increase this only if you have good disk and"
  echo "                                                   # network speed).  default=15"
  echo "  --cmd (utils/run.pl;utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --frames-per-iter <#samples;400000>              # Number of frames of data to process per iteration, per"
  echo "                                                   # process."
  echo "  --frame-subsampling-factor <factor;3>            # factor by which num-frames at nnet output is reduced "
  echo "  --frames-per-eg <frames;25>                      # number of supervised frames per eg on disk"
  echo "  --frames-overlap-per-eg <frames;0>               # number of supervised frames of overlap between egs"
  echo "  --left-context <int;4>                           # Number of frames on left side to append for feature input"
  echo "  --right-context <int;4>                          # Number of frames on right side to append for feature input"
  echo "  --left-context-initial <int;-1>                  # If >= 0, left-context for first chunk of an utterance"
  echo "  --right-context-final <int;-1>                   # If >= 0, right-context for last chunk of an utterance"
  echo "  --num-egs-diagnostic <#egs;400>                  # Number of egs used in computing (train,valid) diagnostics"
  echo "  --num-valid-egs-combine <#egs;0>                 # Number of egs used in getting combination weights at the"
  echo "                                                   # very end."
  echo "  --lattice-lm-scale <float>                       # If supplied, the graph/lm weight of the lattices will be "
  echo "                                                   # used (with this scale) in generating supervisions"
  echo "  --lattice-prune-beam <float>                     # If supplied, the lattices will be pruned to this beam, "
  echo "                                                   # before being used to get supervisions."
  echo "  --acwt <float;0.1>                               # Acoustic scale -- affects pruning"
  echo "  --deriv-weights-scp <str>                        # If supplied, adds per-frame weights to the supervision."
  echo "  --generate-egs-scp <bool;false>                  # Generates scp files -- Required if the egs will be "
  echo "                                                   # used for multilingual/multitask training."
  echo "  --stage <stage;0>                                # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."

  exit 1;
fi

# Positional arguments, in the order given in the usage message:
# <data> <chain-dir> <lattice-dir> <egs-dir>.
data=$1
chaindir=$2
latdir=$3
dir=$4

# Check some files.
# The iVector files are only required when --online-ivector-dir was given.
[ ! -z "$online_ivector_dir" ] && \
  extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"

# Fail early, naming the first missing input file.
for f in $data/feats.scp $latdir/lat.1.gz $latdir/final.mdl \
         $chaindir/{0.trans_mdl,tree,normalization.fst} $extra_files; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

# Split the data into as many pieces as there are lattice archives (the
# count is read from $latdir/num_jobs).  If $latdir/per_utt exists the split
# is done with --per-utt and lives in split${nj}utt instead of split$nj.
nj=$(cat $latdir/num_jobs) || exit 1
if [ -f $latdir/per_utt ]; then
  sdata=$data/split${nj}utt
  utils/split_data.sh --per-utt $data $nj
else
  sdata=$data/split$nj
  utils/split_data.sh $data $nj
fi

# Output locations for logs and for the small info/* metadata files.
mkdir -p $dir/log $dir/info

# Get list of validation utterances.
# NOTE(review): frame_shift is not referenced in the part of the script
# visible here -- confirm whether it is used further down.
frame_shift=$(utils/data/get_frame_shift.sh $data) || exit 1

if [ -f $data/utt2uniq ]; then
  # Must hold out all augmented versions of the same utterance.
  # Whole groups from utt2uniq are emitted in shuffled order until at least
  # $num_utts_subset utterances have been printed, so the list may end up
  # slightly longer than $num_utts_subset.
  echo "$0: File $data/utt2uniq exists, so ensuring the hold-out set" \
       "includes all perturbed versions of the same source utterance."
  utils/utt2spk_to_spk2utt.pl $data/utt2uniq 2>/dev/null | \
      utils/shuffle_list.pl 2>/dev/null | \
    awk -v max_utt=$num_utts_subset '{
        for (n=2;n<=NF;n++) print $n;
        printed += NF-1;
        if (printed >= max_utt) exit(0); }' |
    sort > $dir/valid_uttlist
else
  # No utt2uniq: take the first $num_utts_subset ids of the shuffled list.
  awk '{print $1}' $data/utt2spk | \
    utils/shuffle_list.pl 2>/dev/null | \
    head -$num_utts_subset > $dir/valid_uttlist
fi
len_valid_uttlist=$(wc -l < $dir/valid_uttlist)

# Training-diagnostic subset: $num_utts_subset shuffled utterances that are
# not in the validation list.
awk '{print $1}' $data/utt2spk | \
   utils/filter_scp.pl --exclude $dir/valid_uttlist | \
   utils/shuffle_list.pl 2>/dev/null | \
   head -$num_utts_subset > $dir/train_subset_uttlist
len_trainsub_uttlist=$(wc -l <$dir/train_subset_uttlist)

# Either list being shorter than requested means there is not enough data.
if [[ $len_valid_uttlist -lt $num_utts_subset ||
      $len_trainsub_uttlist -lt $num_utts_subset ]]; then
  echo "$0: Number of utterances is very small. Please check your data." && exit 1;
fi

echo "$0: Holding out $len_valid_uttlist utterances in validation set and" \
     "$len_trainsub_uttlist in training diagnostic set, out of total" \
     "$(wc -l < $data/utt2spk)."


echo "$0: creating egs.  To ensure they are not deleted later you can do:  touch $dir/.nodelete"

## Set up features.

# get the global_cmvn stats for online-cmvn,
if $online_cmvn; then
  # create global_cmvn.stats,
  #
  # caution: the top-level nnet training script should copy
  # 'global_cmvn.stats' and 'online_cmvn' to its own dir.
  if ! matrix-sum --binary=false scp:$data/cmvn.scp - >$dir/global_cmvn.stats 2>/dev/null; then
    echo "$0: Error summing cmvn stats"
    exit 1
  fi
  # marker file recording that online CMVN is in use.
  touch $dir/online_cmvn
else
  # remove a stale marker possibly left by an earlier run with online CMVN.
  [ -f $dir/online_cmvn ] && rm $dir/online_cmvn
fi

# create the feature pipelines,
# Three rspecifiers are built in either mode:
#   feats              - per-job training features, excluding the validation
#                        utterances ('JOB' is a placeholder replaced later),
#   valid_feats        - features of the validation utterances,
#   train_subset_feats - features of the training-diagnostic utterances.
if ! $online_cmvn; then
  # the original front-end with 'apply-cmvn',
  echo "$0: feature type is raw, with 'apply-cmvn'"
  feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- |"
  valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |"
  train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |"
else
  # the alternative front-end with 'apply-cmvn-online',
  # - the $cmvn_opts can be set to '--config=conf/online_cmvn.conf' which is the setup of ivector-extractor,
  # - for the two subsets the spk2utt map is derived on the fly from the
  #   filtered utt2spk, hence the nested (escaped-quote) rspecifiers.
  echo "$0: feature type is raw, with 'apply-cmvn-online'"
  feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn-online $cmvn_opts --spk2utt=ark:$sdata/JOB/spk2utt $dir/global_cmvn.stats scp:- ark:- |"
  valid_spk2utt="ark:utils/filter_scp.pl $dir/valid_uttlist $data/utt2spk | utils/utt2spk_to_spk2utt.pl |"
  valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn-online $cmvn_opts --spk2utt=\"$valid_spk2utt\" $dir/global_cmvn.stats scp:- ark:- |"
  train_subset_spk2utt="ark:utils/filter_scp.pl $dir/train_subset_uttlist $data/utt2spk | utils/utt2spk_to_spk2utt.pl |"
  train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn-online $cmvn_opts --spk2utt=\"$train_subset_spk2utt\" $dir/global_cmvn.stats scp:- ark:- |"
fi
echo $cmvn_opts >$dir/cmvn_opts # caution: the top-level nnet training script should copy this to its own dir now.


# Record the number of pdfs, taken from the 'num-pdfs' line of tree-info.
tree-info $chaindir/tree | grep num-pdfs | awk '{print $2}' > $dir/info/num_pdfs || exit 1

# iVector settings: when an iVector dir is given, record the iVector dim and
# extractor id and build the options handed to nnet3-chain-get-egs;
# otherwise record a dim of 0 and pass no extra options.
if [ ! -z "$online_ivector_dir" ]; then
  ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1;
  echo $ivector_dim > $dir/info/ivector_dim
  steps/nnet2/get_ivector_id.sh $online_ivector_dir > $dir/info/final.ie.id || exit 1
  ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1;
  ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period"
else
  ivector_opts=""
  echo 0 >$dir/info/ivector_dim
fi

# Stage 1 computes the frame count and feature dim and caches them under
# $dir/info; when resuming from a later stage the cached values are read back.
if [ $stage -le 1 ]; then
  echo "$0: working out number of frames of training data"
  num_frames=$(steps/nnet2/get_num_frames.sh $data)
  echo $num_frames > $dir/info/num_frames
  echo "$0: working out feature dim"
  # instantiate the per-job pipeline for job 1 to probe the dimension.
  feats_one="$(echo $feats | sed s/JOB/1/g)"
  if ! feat_dim=$(feat-to-dim "$feats_one" - 2>/dev/null); then
    echo "Command failed (getting feature dim): feat-to-dim \"$feats_one\""
    exit 1
  fi
  echo $feat_dim > $dir/info/feat_dim
else
  num_frames=$(cat $dir/info/num_frames) || exit 1;
  feat_dim=$(cat $dir/info/feat_dim) || exit 1;
fi

# the + 1 is to round up, not down... we assume it doesn't divide exactly.
num_archives=$[$num_frames/$frames_per_iter+1]

# We may have to first create a smaller number of larger archives, with number
# $num_archives_intermediate, if $num_archives is more than the maximum number
# of open filehandles that the system allows per process (ulimit -n).
# This sometimes gives a misleading answer as GridEngine sometimes changes the
# limit, so we limit it to 512.
max_open_filehandles=$(ulimit -n) || exit 1
[ $max_open_filehandles -gt 512 ] && max_open_filehandles=512
num_archives_intermediate=$num_archives
archives_multiple=1
# Increase the divisor until the intermediate count, plus a margin of 4,
# fits within the filehandle limit.
while [ $[$num_archives_intermediate+4] -gt $max_open_filehandles ]; do
  archives_multiple=$[$archives_multiple+1]
  num_archives_intermediate=$[$num_archives/$archives_multiple] || exit 1;
done
# now make sure num_archives is an exact multiple of archives_multiple.
# (the integer division above may have truncated, so this can lower num_archives.)
num_archives=$[$archives_multiple*$num_archives_intermediate] || exit 1;

echo $num_archives >$dir/info/num_archives
echo $frames_per_eg >$dir/info/frames_per_eg
# Work out the number of egs per archive
egs_per_archive=$[$num_frames/($frames_per_eg*$num_archives)] || exit 1;
# Sanity check on the arithmetic above: abort if the result exceeds frames_per_iter.
! [ $egs_per_archive -le $frames_per_iter ] && \
  echo "$0: script error: egs_per_archive=$egs_per_archive not <= frames_per_iter=$frames_per_iter" \
  && exit 1;

echo $egs_per_archive > $dir/info/egs_per_archive

echo "$0: creating $num_archives archives, each with $egs_per_archive egs, with"
echo "$0:   $frames_per_eg labels per example, and (left,right) context = ($left_context,$right_context)"
# Only mention the initial/final context when at least one of them was set.
if [ $left_context_initial -ge 0 ] || [ $right_context_final -ge 0 ]; then
  echo "$0:   ... and (left-context-initial,right-context-final) = ($left_context_initial,$right_context_final)"
fi


if [ -e $dir/storage ]; then
  # Make soft links to storage directories, if distributing this way..  See
  # utils/create_split_dir.pl.
  # Links are made for the final cegs.*.ark archives and for the
  # cegs_orig.<job>.<archive>.ark intermediate archives.
  echo "$0: creating data links"
  utils/create_data_link.pl $(for x in $(seq $num_archives); do echo $dir/cegs.$x.ark; done)
  for x in $(seq $num_archives_intermediate); do
    utils/create_data_link.pl $(for y in $(seq $nj); do echo $dir/cegs_orig.$y.$x.ark; done)
  done
fi

# Options for nnet3-chain-get-egs.  The initial/final context options and the
# deriv-weights rspecifier are appended only when they were set.
egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress"
[ $left_context_initial -ge 0 ] && egs_opts="$egs_opts --left-context-initial=$left_context_initial"
[ $right_context_final -ge 0 ] && egs_opts="$egs_opts --right-context-final=$right_context_final"

[ ! -z "$deriv_weights_scp" ] && egs_opts="$egs_opts --deriv-weights-rspecifier=scp:$deriv_weights_scp"

# Options for chain-get-supervision; the tolerances are added only if non-empty.
chain_supervision_all_opts="--lattice-input=true --frame-subsampling-factor=$alignment_subsampling_factor"
[ ! -z $right_tolerance ] && \
  chain_supervision_all_opts="$chain_supervision_all_opts --right-tolerance=$right_tolerance"

[ ! -z $left_tolerance ] && \
  chain_supervision_all_opts="$chain_supervision_all_opts --left-tolerance=$left_tolerance"

# Unconstrained egs: the supervision is not converted to pdfs, and
# nnet3-chain-get-egs is additionally given the transition model.
if ! $constrained; then
  chain_supervision_all_opts="$chain_supervision_all_opts --convert-to-pdfs=false"
  trans_mdl_opt=--transition-model=$chaindir/0.trans_mdl
else
  trans_mdl_opt=
fi


# Per-job lattice rspecifier.  If a prune beam was supplied the lattices are
# pruned on the fly; a beam of exactly "0" or "0.0" selects the 1-best path
# (lattice-1best) instead of calling lattice-prune.
lats_rspecifier="ark:gunzip -c $latdir/lat.JOB.gz |"
if [ ! -z $lattice_prune_beam ]; then
  if [ "$lattice_prune_beam" == "0" ] || [ "$lattice_prune_beam" == "0.0" ]; then
    lats_rspecifier="$lats_rspecifier lattice-1best --acoustic-scale=$acwt ark:- ark:- |"
  else
    lats_rspecifier="$lats_rspecifier lattice-prune --acoustic-scale=$acwt --beam=$lattice_prune_beam ark:- ark:- |"
  fi
fi

normalization_fst_scale=1.0

# With --lattice-lm-scale the supervision gets --lm-scale and the
# normalization FST scale becomes (1 - lattice_lm_scale); the scale must lie
# in [0, 1), otherwise the perl snippet reports an error and the script exits.
if [ ! -z "$lattice_lm_scale" ]; then
  chain_supervision_all_opts="$chain_supervision_all_opts --lm-scale=$lattice_lm_scale"

  normalization_fst_scale=$(perl -e "
  if ($lattice_lm_scale >= 1.0 || $lattice_lm_scale < 0) {
    print STDERR \"Invalid --lattice-lm-scale $lattice_lm_scale\";
    exit(1);
  }
  print (1.0 - $lattice_lm_scale);") || exit 1
fi

# Record the context settings alongside the egs.
echo $left_context > $dir/info/left_context
echo $right_context > $dir/info/right_context
echo $left_context_initial > $dir/info/left_context_initial
echo $right_context_final > $dir/info/right_context_final

if [ $stage -le 2 ]; then
  echo "$0: Getting validation and training subset examples in background."
  rm $dir/.error 2>/dev/null

  (
    $cmd --max-jobs-run 6 JOB=1:$nj $dir/log/lattice_copy.JOB.log \
      lattice-copy --include="cat $dir/valid_uttlist $dir/train_subset_uttlist |" --ignore-missing \
      "$lats_rspecifier" \
      ark,scp:$dir/lat_special.JOB.ark,$dir/lat_special.JOB.scp || exit 1

    for id in $(seq $nj); do cat $dir/lat_special.$id.scp; done > $dir/lat_special.scp

    $cmd $dir/log/create_valid_subset.log \
      utils/filter_scp.pl $dir/valid_uttlist $dir/lat_special.scp \| \
      lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \
      chain-get-supervision $chain_supervision_all_opts $chaindir/tree $chaindir/0.trans_mdl \
        ark:- ark:- \| \
      nnet3-chain-get-egs $ivector_opts --srand=$srand \
         $egs_opts --normalization-fst-scale=$normalization_fst_scale \
         $trans_mdl_opt $chaindir/normalization.fst \
        "$valid_feats" ark,s,cs:- "ark:$dir/valid_all.cegs" || exit 1
    $cmd $dir/log/create_train_subset.log \
      utils/filter_scp.pl $dir/train_subset_uttlist $dir/lat_special.scp \| \
      lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \
      chain-get-supervision $chain_supervision_all_opts \
        $chaindir/tree $chaindir/0.trans_mdl ark:- ark:- \| \
      nnet3-chain-get-egs $ivector_opts --srand=$srand \
        $egs_opts --normalization-fst-scale=$normalization_fst_scale \
        $trans_mdl_opt $chaindir/normalization.fst \
        "$train_subset_feats" ark,s,cs:- "ark:$dir/train_subset_all.cegs" || exit 1
    sleep 5  # wait for file system to sync.
    echo "$0: Getting subsets of validation examples for diagnostics and combination."
    if $generate_egs_scp; then
      valid_diagnostic_output="ark,scp:$dir/valid_diagnostic.cegs,$dir/valid_diagnostic.scp"
      train_diagnostic_output="ark,scp:$dir/train_diagnostic.cegs,$dir/train_diagnostic.scp"
    else
      valid_diagnostic_output="ark:$dir/valid_diagnostic.cegs"
      train_diagnostic_output="ark:$dir/train_diagnostic.cegs"
    fi
    $cmd $dir/log/create_valid_subset_combine.log \
      nnet3-chain-subset-egs --n=$num_valid_egs_combine ark:$dir/valid_all.cegs \
      ark:$dir/valid_combine.cegs || exit 1
    $cmd $dir/log/create_valid_subset_diagnostic.log \
      nnet3-chain-subset-egs --n=$num_egs_diagnostic ark:$dir/valid_all.cegs \
      $valid_diagnostic_output || exit 1

    $cmd $dir/log/create_train_subset_combine.log \
      nnet3-chain-subset-egs --n=$num_train_egs_combine ark:$dir/train_subset_all.cegs \
      ark:$dir/train_combine.cegs || exit 1
    $cmd $dir/log/create_train_subset_diagnostic.log \
      nnet3-chain-subset-egs --n=$num_egs_diagnostic ark:$dir/train_subset_all.cegs \
      $train_diagnostic_output || exit 1
    sleep 5  # wait for file system to sync.
    # Merge the validation and training 'combine' subsets into one archive
    # (plus an scp index when $generate_egs_scp is true).  Like every other
    # command in this subshell, a failure must leave the subshell with a
    # non-zero status so that the trailing '|| touch $dir/.error' fires.
    if $generate_egs_scp; then
      cat $dir/valid_combine.cegs $dir/train_combine.cegs | \
        nnet3-chain-copy-egs ark:- ark,scp:$dir/combine.cegs,$dir/combine.scp || exit 1
    else
      cat $dir/valid_combine.cegs $dir/train_combine.cegs > $dir/combine.cegs || exit 1
    fi

    for f in $dir/{combine,train_diagnostic,valid_diagnostic}.cegs; do
      [ ! -s $f ] && echo "$0: No examples in file $f" && exit 1;
    done
    rm $dir/valid_all.cegs $dir/train_subset_all.cegs $dir/{train,valid}_combine.cegs
  ) || touch $dir/.error &
fi

if [ $stage -le 4 ]; then
  # create cegs_orig.*.*.ark; the first index goes to $nj,
  # the second to $num_archives_intermediate.

  egs_list=
  for n in $(seq $num_archives_intermediate); do
    egs_list="$egs_list ark:$dir/cegs_orig.JOB.$n.ark"
  done
  echo "$0: Generating training examples on disk"

  # The examples will go round-robin to egs_list.  Note: we omit the
  # 'normalization.fst' argument while creating temporary egs: the phase of egs
  # preparation that involves the normalization FST is quite CPU-intensive and
  # it's more convenient to do it later, in the 'shuffle' stage.  Otherwise to
  # make it efficient we need to use a large 'nj', like 40, and in that case
  # there can be too many small files to deal with, because the total number of
  # files is the product of 'nj' by 'num_archives_intermediate', which might be
  # quite large.

  $cmd --max-jobs-run $max_jobs_run JOB=1:$nj $dir/log/get_egs.JOB.log \
    lattice-align-phones --replace-output-symbols=true $latdir/final.mdl \
      "$lats_rspecifier" ark:- \| \
    chain-get-supervision $chain_supervision_all_opts \
      $chaindir/tree $chaindir/0.trans_mdl ark:- ark:- \| \
    nnet3-chain-get-egs $ivector_opts --srand=\$[JOB+$srand] $egs_opts \
      --num-frames-overlap=$frames_overlap_per_eg $trans_mdl_opt \
     "$feats" ark,s,cs:- ark:- \| \
    nnet3-chain-copy-egs --random=true --srand=\$[JOB+$srand] ark:- $egs_list || exit 1;
fi

if [ -f $dir/.error ]; then
  echo "$0: Error detected while creating train/valid egs" && exit 1
fi

if [ $stage -le 5 ]; then
  echo "$0: recombining and shuffling order of archives on disk"
  # combine all the "cegs_orig.*.JOB.ark" (over the $nj splits of the data) and
  # shuffle the order, writing to the cegs.JOB.ark

  # the input is a concatenation over the input jobs.
  egs_list=
  for n in $(seq $nj); do
    egs_list="$egs_list $dir/cegs_orig.$n.JOB.ark"
  done

  if [ $archives_multiple == 1 ]; then # normal case.
    if $generate_egs_scp; then
      output_archive="ark,scp:$dir/cegs.JOB.ark,$dir/cegs.JOB.scp"
    else
      output_archive="ark:$dir/cegs.JOB.ark"
    fi
    $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G \
      JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \
      nnet3-chain-normalize-egs --normalization-fst-scale=$normalization_fst_scale \
        $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \
      nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] ark:- $output_archive || exit 1;

    if $generate_egs_scp; then
      #concatenate cegs.JOB.scp in single cegs.scp
      for j in $(seq $num_archives_intermediate); do
        cat $dir/cegs.$j.scp || exit 1;
      done > $dir/cegs.scp || exit 1;
      for f in $dir/cegs.*.scp; do rm $f; done
    fi
  else
    # we need to shuffle the 'intermediate archives' and then split into the
    # final archives.  we create soft links to manage this splitting, because
    # otherwise managing the output names is quite difficult (and we don't want
    # to submit separate queue jobs for each intermediate archive, because then
    # the --max-jobs-run option is hard to enforce).
    if $generate_egs_scp; then
      output_archives="$(for y in $(seq $archives_multiple); do echo ark,scp:$dir/cegs.JOB.$y.ark,$dir/cegs.JOB.$y.scp; done)"
    else
      output_archives="$(for y in $(seq $archives_multiple); do echo ark:$dir/cegs.JOB.$y.ark; done)"
    fi
    for x in $(seq $num_archives_intermediate); do
      for y in $(seq $archives_multiple); do
        archive_index=$[($x-1)*$archives_multiple+$y]
        # egs.intermediate_archive.{1,2,...}.ark will point to egs.archive.ark
        ln -sf cegs.$archive_index.ark $dir/cegs.$x.$y.ark || exit 1
      done
    done
    $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G \
      JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \
      nnet3-chain-normalize-egs --normalization-fst-scale=$normalization_fst_scale \
        $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \
      nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] ark:- ark:- \| \
      nnet3-chain-copy-egs ark:- $output_archives || exit 1;

    if $generate_egs_scp; then
      #concatenate cegs.JOB.scp in single cegs.scp
      rm -rf $dir/cegs.scp
      for j in $(seq $num_archives_intermediate); do
        for y in $(seq $archives_multiple); do
          cat $dir/cegs.$j.$y.scp || exit 1;
        done
      done > $dir/cegs.scp || exit 1;
      for f in $dir/cegs.*.*.scp; do rm $f; done
    fi
  fi
fi

wait
if [ -f $dir/.error ]; then
  echo "$0: Error detected while creating train/valid egs" && exit 1
fi

if [ $stage -le 6 ]; then
  echo "$0: Removing temporary archives, alignments and lattices"
  (
    cd $dir
    for f in $(ls -l . | grep 'cegs_orig' | awk '{ X=NF-1; Y=NF-2; if ($X == "->")  print $Y, $NF; }'); do rm $f; done
    # the next statement removes them if we weren't using the soft links to a
    # 'storage' directory.
    rm cegs_orig.*.ark 2>/dev/null
  )
  if ! $generate_egs_scp && [ $archives_multiple -gt 1 ]; then
    # there are some extra soft links that we should delete.
    for f in $dir/cegs.*.*.ark; do rm $f; done
  fi
  rm $dir/ali.{ark,scp} 2>/dev/null
  rm $dir/lat_special.*.{ark,scp} 2>/dev/null
fi

echo "$0: Finished preparing training examples"


================================================
FILE: egs/steps/nnet3/chain/get_model_context.sh
================================================
#!/bin/bash

# Copyright   2019  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
#             2019  Idiap Research Institute (Author: Srikanth Madikeri)
#
# This script computes the total left and right context needed for example (eg)
# creation from a set of 'chain' models.
# See the usage message for more information about input and output formats.

# Begin configuration section.
frame_subsampling_factor=1   # The total frame subsampling factor of the bottom
                             # + top model, i.e. the relative difference in
                             # frame rate between the input of the bottom model
                             # and the output of the top model.  Would normally
                             # be 3.  In this script the value is only
                             # validated (must be >= 1) and copied to the
                             # output info file.

langs=default                # the list of languages.  This script checks that
                             # in the dir (first arg to the script), each
                             # language exists as a nonempty $lang.mdl.
                             # NOTE(review): an earlier version of this
                             # comment said the script warns about unexpected
                             # model files; no such check is implemented below.
# End configuration section

echo "$0 $@"  # Print the command line for logging

# Set up the environment (if path.sh is present in the current directory) and
# parse command-line options such as --langs and --frame-subsampling-factor.
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;


# Exactly two positional arguments are required; otherwise print the usage
# message on stderr and fail.  The sample output below mirrors the fields this
# script actually writes to <output-info-file>, and only options that are
# defined in the configuration section are listed.
if [ $# != 2 ]; then
  cat 1>&2 <<EOF
Usage: $0 [opts] <model-dir> <output-info-file>
This script works out some acoustic-context-related information,
and writes it, along with the options provided to the script,
to the <output-info-file> provided.  An example of what
<output-info-file> might contain after this script is called, is:
frame_subsampling_factor 3
langs default
model_left_context 22
model_right_context 22
  e.g.: $0 --frame-subsampling-factor 3 \\
          --langs 'default' exp/chaina/tdnn1a_sp/0 exp/chaina/tdnn1a_sp/0/info.txt
 Options:
     --frame-subsampling-factor    # (default: 1)  Total frame subsampling factor,
                                   # i.e. ratio of frame rate of input
                                   # features vs. alignments and decoding
                                   # (e.g. 3).
     --langs                       # The list of languages (must be in quotes,
                                   # to be parsed as a single arg).  May be
                                   # 'default' or e.g. 'english french'
EOF
  exit 1;
fi


# Positional arguments: the model directory and the info file to write.
dir=$1
info_file=$2

# die on error or undefined variable.
set -e -u

if [ ! -d $dir ]; then
  echo 1>&2 "$0: expected directory $dir to exist"
  exit 1
fi

# $langs may hold several space-separated names, so it must be quoted here:
# unquoted, '[ -z english french ]' is a malformed test that prints a
# "binary operator expected" error instead of evaluating cleanly.
if [ -z "$langs" ]; then
  echo 1>&2 "$0: list of languages (--langs option) is empty"
  exit 1
fi

# Quoting keeps the test well-formed even if the option was given as an empty
# string; a non-integer value makes the test fail and triggers the message.
if  ! [ "$frame_subsampling_factor" -ge 1 ]; then
  echo 1>&2 "$0: there was a problem with the options --frame-subsampling-factor=$frame_subsampling_factor"
  exit 1
fi

# Scratch directory for the per-language nnet3-am-info output.
mkdir -p $dir/temp

# Work out the left/right context over all languages.  Each model is queried
# with nnet3-am-info; the largest context found is kept, so that the value
# written out is sufficient for every model in the set (previously only the
# context of the last language in the list was used).
left_context=
right_context=
for lang in $langs; do
  if [ ! -s $dir/$lang.mdl ]; then
    echo 1>&2 "$0: expected file $dir/$lang.mdl to exist and be nonempty (check --langs option)"
    exit 1
  fi
  nnet3-am-info $dir/$lang.mdl > $dir/temp/$lang.info
  this_left_context=$(grep '^left-context:' $dir/temp/$lang.info | awk '{print $2}')
  this_right_context=$(grep '^right-context:' $dir/temp/$lang.info | awk '{print $2}')

  # The pipelines above succeed even when grep matches nothing (the exit
  # status is awk's), so check explicitly that we obtained single integers.
  for ctx in "$this_left_context" "$this_right_context"; do
    if ! [[ "$ctx" =~ ^-?[0-9]+$ ]]; then
      echo 1>&2 "$0: could not get left/right context from $dir/temp/$lang.info"
      exit 1
    fi
  done

  if [ -z "$left_context" ] || [ "$this_left_context" -gt "$left_context" ]; then
    left_context=$this_left_context
  fi
  if [ -z "$right_context" ] || [ "$this_right_context" -gt "$right_context" ]; then
    right_context=$this_right_context
  fi
done

# Guards against a --langs value that contained only whitespace, in which
# case the loop above never ran.
if [ -z "$left_context" ] || [ -z "$right_context" ]; then
  echo 1>&2 "$0: no languages were processed (check --langs option)"
  exit 1
fi


cat >$info_file <<EOF
frame_subsampling_factor $frame_subsampling_factor
langs $langs
model_left_context $left_context
model_right_context $right_context
EOF


echo "$0: Finished getting model context"


================================================
FILE: egs/steps/nnet3/chain/get_phone_post.sh
================================================
#!/usr/bin/env bash
# Copyright 2012-2015  Johns Hopkins University (Author: Daniel Povey).
#  Apache 2.0.


# This script obtains phone posteriors from a trained chain model, using either
# the xent output or the forward-backward posteriors from the denominator fst.
# The phone posteriors will be in matrices where the column index can be
# interpreted as phone-index - 1.

# You may want to mess with the compression options.  Be careful: with the current
# settings, you might sometimes get exact zeros as the posterior values.

# CAUTION!  This script isn't very suitable for dumping features from recurrent
# architectures such as LSTMs, because it doesn't support setting the chunk size
# and left and right context.  (Those would have to be passed into nnet3-compute
# or nnet3-chain-compute-post).

# Begin configuration section.
stage=0  # Stage to (re)start from; stages 0, 1 and 2 are used below.

nj=1  # Number of jobs to run.
cmd=run.pl  # How to run jobs (run.pl, queue.pl, ... plus queue options).
remove_word_position_dependency=false  # If true, remove word-position-dependency
                                       # info when dumping posteriors.
use_xent_output=false  # If true, use the cross-entropy output of the network;
                       # otherwise the chain denominator FST is used.
online_ivector_dir=  # Directory with online-computed ivectors for the data
                     # (leave empty if the model does not use ivectors).
use_gpu=false  # Set to true to run the computation on GPUs.
count_smoothing=1.0  # this should be some small number, I don't think it's critical;
                     # it will mainly affect the probability we assign to phones that
                     # were never seen in training.  note: this is added to the raw
                     # transition-id occupation counts, so 1.0 means, add a single
                     # frame's count to each transition-id's counts.

# End configuration section.

# From here on: exit on the first failing command, and treat any reference to
# an unset variable as an error.
set -e -u
echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh
. parse_options.sh || exit 1;

# Exactly five positional arguments are required (tree dir, model dir, lang
# dir, data dir, output dir); otherwise print the usage message and fail.
if [ $# != 5 ]; then
  echo "Usage: $0 <chain-tree-dir> <chain-model-dir> <lang-dir> <data-dir> <phone-post-dir>"
  echo " e.g.: $0 --remove-word-position-dependency true --online-ivector-dir exp/nnet3/ivectors_test_eval92_hires \\"
  echo "       exp/chain/tree_a_sp exp/chain/tdnn1a_sp data/lang data/test_eval92_hires exp/chain/tdnn1a_sp_post_eval92"
  echo " ... you'll normally want to set the --nj and --cmd options as well."
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --cmd (run.pl|queue.pl|... <queue opts>)    # how to run jobs."
  echo "  --config <config-file>                      # config containing options"
  echo "  --stage <stage>                             # stage to do partial re-run from."
  echo "  --nj <N>                                    # Number of parallel jobs to run, default:1"
  echo "  --remove-word-position-dependency <bool>    # If true, remove word-position-dependency"
  echo "                                              # info when dumping posteriors (default: false)"
  echo "  --use-xent-output <bool>                    # If true, use the cross-entropy output of the"
  echo "                                              # neural network when dumping posteriors"
  echo "                                              # (default: false, will use chain denominator FST)"
  echo "  --online-ivector-dir <dir>                  # Directory where we dumped online-computed"
  echo "                                              # ivectors corresponding to the data in <data>"
  echo "  --use-gpu <bool>                            # Set to true to use GPUs (not recommended as the"
  echo "                                              # binary is very poorly optimized for GPU use)."
  exit 1;
fi


# Positional arguments.
tree_dir=$1
model_dir=$2
lang=$3
data=$4
dir=$5


# Check that all required inputs exist.  $model_dir/cmvn_opts is included
# because it is read unconditionally below; the message names this script
# ($0) rather than a hard-coded, unrelated script name.
for f in $tree_dir/tree $tree_dir/final.mdl $tree_dir/ali.1.gz $tree_dir/num_jobs \
         $model_dir/final.mdl $model_dir/frame_subsampling_factor $model_dir/den.fst \
         $model_dir/cmvn_opts $data/feats.scp $lang/phones.txt; do
  if [ ! -f $f ]; then
    echo "$0: no such file $f"
    exit 1
  fi
done

# Split the data per utterance into $nj pieces, unless an up-to-date split
# directory already exists.
sdata=$data/split${nj}utt
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh --per-utt $data $nj || exit 1;

use_ivector=false

# Feature pipeline for each job: apply CMVN with the options the model was
# trained with.  'JOB' is a placeholder that is replaced per job by $cmd.
cmvn_opts=$(cat $model_dir/cmvn_opts)
feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"

# Optional online ivectors: check they are compatible with the model and build
# the corresponding options for the nnet3 binaries.
if [ ! -z "$online_ivector_dir" ];then
  steps/nnet2/check_ivectors_compatible.sh $model_dir $online_ivector_dir || exit 1;
  ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1;
  ivector_feats="scp:utils/filter_scp.pl $sdata/JOB/utt2spk $online_ivector_dir/ivector_online.scp |"
  ivector_opts="--online-ivector-period=$ivector_period --online-ivectors='$ivector_feats'"
else
  ivector_opts=
fi

# Choose queue and binary options for GPU vs. CPU computation.
# Both branches must set 'gpu_queue_opts' (with the trailing 's'): that is the
# name expanded in the $cmd invocations of stage 2, and because the script
# runs under 'set -u' an unset variable there would abort the script.
if $use_gpu; then
  gpu_queue_opts="--gpu 1"
  gpu_opt="--use-gpu=yes"
  if ! cuda-compiled; then
    echo "$0: WARNING: you are running with one thread but you have not compiled"
    echo "   for CUDA.  You may be running a setup optimized for GPUs.  If you have"
    echo "   GPUs and have nvcc installed, go to src/ and do ./configure; make"
    exit 1
  fi
else
  gpu_queue_opts=
  gpu_opt="--use-gpu=no"
fi
frame_subsampling_factor=$(cat $model_dir/frame_subsampling_factor)

# Create the output directory and record the frame subsampling factor there.
mkdir -p $dir/log
cp $model_dir/frame_subsampling_factor $dir/

# Stage 0: accumulate transition-id occupation counts ($dir/tacc) from the
# alignments in $tree_dir, unless an up-to-date $dir/tacc is already there.
if [ $stage -le 0 ]; then
  if [ -f $dir/tacc ] && ! [ $dir/tacc -ot $tree_dir/ali.1.gz ]; then
    echo "$0: skipping creation of $dir/tacc since it already exists."
  else
    echo "$0: obtaining transition-id counts in $dir/tacc"
    # The alignments were written by this many jobs; process them in parallel.
    ali_nj=$(cat $tree_dir/num_jobs)

    $cmd JOB=1:$ali_nj $dir/log/acc_taccs.JOB.log \
       ali-to-post "ark:gunzip -c $tree_dir/ali.JOB.gz|" ark:- \| \
       post-to-tacc $tree_dir/final.mdl ark:- $dir/tacc.JOB

    # Collect the per-job count files, then sum them into a single text vector.
    tacc_parts=
    for job in $(seq $ali_nj); do
      tacc_parts="$tacc_parts $dir/tacc.$job"
    done

    $cmd $dir/log/sum_taccs.log \
         vector-sum --binary=false $tacc_parts $dir/tacc

    rm $dir/tacc.*
  fi
fi


# Either way, $dir/phones.txt will be a symbol table for the phones that
# we are dumping (although the matrices we dump won't contain anything
# for symbol 0 which is <eps>).
#
# The two cases are kept strictly apart: when word-position dependency is
# being removed, the plain copy of $lang/phones.txt must never be written,
# not even when this stage is skipped via --stage; otherwise a partial re-run
# would overwrite the symbol table with the position-dependent phone set.
if $remove_word_position_dependency; then
  if [ $stage -le 1 ]; then
    echo "$0: creating $dir/phone_map.int"
    # presumably this also writes the position-independent $dir/phones.txt
    # (per the comment above) -- confirm against the helper script.
    utils/lang/get_word_position_phone_map.pl $lang $dir
  fi
else
  grep -v '^#' $lang/phones.txt > $dir/phones.txt
fi

# Stage 1: build $dir/transform.mat, a text matrix with one row per phone and
# one column per pdf-id, from the transition model and the counts in $dir/tacc.
if [ $stage -le 1 ]; then
  # we want the phones in integer form as it's safer for processing by script.
  # $dir/fake_phones.txt will just contain e.g. "0 0\n1 1\n....", it's used
  # to force show-transitions to print the phones as integers.
  awk '{print $2,$2}' <$lang/phones.txt >$dir/fake_phones.txt


  # The format of the 'show-transitions' command below is like the following:
  #show-transitions tempdir/phone_map.int exp/chain/tree_a_sp/final.mdl
  #Transition-state 1: phone = 1 hmm-state = 0 forward-pdf = 0 self-loop-pdf = 51
  # Transition-id = 1 p = 0.5 [self-loop]
  # Transition-id = 2 p = 0.5 [0 -> 1]
  #Transition-state 2: phone = 10 hmm-state = 0 forward-pdf = 0 self-loop-pdf = 51
  # Transition-id = 3 p = 0.5 [self-loop]
  # Transition-id = 4 p = 0.5 [0 -> 1]

  # The following inline script processes that info about the transition model
  # into the file $dir/phones_and_pdfs.txt, which has a line for each transition-id
  # (starting from number 1), and the format of each line is
  # <phone-id> <pdf-id>
  # The first pattern handles 'Transition-state' lines that carry a single
  # 'pdf = N' field (used for both forward and self-loop transitions); the
  # second handles lines with separate forward-pdf and self-loop-pdf fields.
  show-transitions $dir/fake_phones.txt $tree_dir/final.mdl | \
    perl -ane ' if(m/Transition-state.* phone = (\d+) pdf = (\d+)/) { $phone = $1; $forward_pdf = $2; $self_loop_pdf = $2; }
        if(m/Transition-state.* phone = (\d+) .* forward-pdf = (\d+) self-loop-pdf = (\d+)/) {
          $phone = $1; $forward_pdf = $2; $self_loop_pdf = $3; }
        if(m/Transition-id/) {  if (m/self-loop/) { print "$phone $self_loop_pdf\n"; }
            else { print "$phone $forward_pdf\n" } } ' > $dir/phones_and_pdfs.txt


  # The following command just separates the 'tacc' file into a similar format
  # to $dir/phones_and_pdfs.txt, with one count per line, and a line per transition-id
  # starting from number 1.  We skip the first two fields which are "[ 0" (the 0 is
  # for transition-id=0, since transition-ids are 1-based), and the last field which is "]".
  awk '{ for (n=3;n<NF;n++) print $n; }' <$dir/tacc  >$dir/transition_counts.txt

  # Both files must have exactly one line per transition-id, otherwise the
  # 'paste' below would pair counts with the wrong phone/pdf.
  num_lines1=$(wc -l <$dir/phones_and_pdfs.txt)
  num_lines2=$(wc -l <$dir/transition_counts.txt)
  if [ $num_lines1 -ne $num_lines2 ]; then
    echo "$0: mismatch in num-lines between phones_and_pdfs.txt and transition_counts.txt: $num_lines1 vs $num_lines2"
    exit 1
  fi

  # after 'paste', the format of the data will be
  # <phone-id> <pdf-id> <data-count>
  # we add the count smoothing at this point.
  paste $dir/phones_and_pdfs.txt $dir/transition_counts.txt | \
     awk -v s=$count_smoothing '{print $1, $2, (s+$3);}' > $dir/combined_info.txt

  if $remove_word_position_dependency; then
    # map the phones to word-position-independent phones; you can see $dir/phones.txt
    # to interpret the final output.
    utils/apply_map.pl -f 1 $dir/phone_map.int <$dir/combined_info.txt > $dir/temp.txt
    mv $dir/temp.txt $dir/combined_info.txt
  fi

  # Write the matrix in text form: row index is the phone (1..num_phones),
  # column index is the pdf-id (0..num_pdfs-1), and each entry is
  # count(pdf, phone) / count(pdf), i.e. the counts are normalized per pdf.
  # NOTE(review): a pdf-id below the maximum that never appears in
  # combined_info.txt has no entry in pdf_count, so the division below would
  # be a division by zero in awk -- confirm this cannot happen in practice.
  awk 'BEGIN{num_phones=1;num_pdfs=1;} { phone=$1; pdf=$2; count=$3; pdf_count[pdf] += count; counts[pdf,phone] += count;
       if (phone>num_phones) num_phones=phone; if (pdf>=num_pdfs) num_pdfs = pdf + 1; }
       END{ print "[ "; for(phone=1;phone<=num_phones;phone++) {
          for (pdf=0;pdf<num_pdfs;pdf++) printf("%.3f ", counts[pdf,phone]/pdf_count[pdf]);
           print ""; } print "]"; }' <$dir/combined_info.txt >$dir/transform.mat

fi


# Stage 2: run the network over the data with $nj parallel jobs and write the
# compressed phone posteriors to $dir/phone_post.JOB.{ark,scp}.
if [ $stage -le 2 ]; then

  # note: --compression-method=3 is kTwoByteAuto: Each element is stored in two
  # bytes as a uint16, with the representable range of values chosen
  # automatically with the minimum and maximum elements of the matrix as its
  # edges.
  compress_opts="--compress=true --compression-method=3"

  if $use_xent_output; then
    # This block uses the 'output-xent' output of the nnet.

    # The model is edited on the fly: the 'output' node is removed and
    # 'output-xent' is renamed to 'output', so nnet3-compute evaluates the
    # cross-entropy output.
    model="nnet3-copy '--edits-config=echo remove-output-nodes name=output; echo rename-node old-name=output-xent new-name=output|' $model_dir/final.mdl -|"

    # The network output is mapped through $dir/transform.mat (built in
    # stage 1) before being compressed and written out.
    $cmd $gpu_queue_opts JOB=1:$nj $dir/log/get_phone_post.JOB.log \
       nnet3-compute $gpu_opt $ivector_opts \
       --frame-subsampling-factor=$frame_subsampling_factor --apply-exp=true \
       "$model" "$feats" ark:- \| \
       transform-feats $dir/transform.mat ark:- ark:- \| \
       copy-feats $compress_opts ark:- ark,scp:$dir/phone_post.JOB.ark,$dir/phone_post.JOB.scp
  else
    # This block is when we are using the 'chain' output (recommended as the posteriors
    # will be much more accurate).
    # Here the transform matrix is passed directly to the binary, together
    # with the denominator FST.
    $cmd $gpu_queue_opts JOB=1:$nj $dir/log/get_phone_post.JOB.log \
       nnet3-chain-compute-post $gpu_opt $ivector_opts --transform-mat=$dir/transform.mat \
          --frame-subsampling-factor=$frame_subsampling_factor \
        $model_dir/final.mdl $model_dir/den.fst "$feats" ark:- \| \
       copy-feats $compress_opts ark:- ark,scp:$dir/phone_post.JOB.ark,$dir/phone_post.JOB.scp
  fi

  # presumably gives the file system time to sync before the per-job scp
  # files are read back -- TODO confirm.
  sleep 5
  # Make a single .scp file, for convenience.
  for n in $(seq $nj); do cat $dir/phone_post.$n.scp; done > $dir/phone_post.scp

fi


================================================
FILE: egs/steps/nnet3/chain/make_weighted_den_fst.sh
================================================
#!/usr/bin/env bash

# Copyright 2017 Vimal Manohar
#           2017 Pegah Ghahremani
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This script creates denominator FST (den.fst) and normalization.fst for
# chain training. It additionally copies the transition model and tree from the
# first alignment directory to the chain directory.
# Alternatively, if the --am-dir option is used, the transition model and tree
# are taken from there instead of the first alignment directory.
# This script can accept multiple sources of alignments with same phone sets
# that can be weighted to estimate phone LM.
# You can use the --num-repeats option to repeat some source data more than
# once when training the LM for the denominator FST.

# Make a pipeline fail if any command in it fails, not only the last one.
set -o pipefail

# begin configuration section.
cmd=run.pl   # How to run jobs (run.pl, queue.pl, ...).
stage=0      # Stage to (re)start from; stages 1 to 4 are used below.
num_repeats= # Comma-separated list of positive integer multiplicities, one
             # for each input alignment directory.  The alignments from
             # each source will be scaled by the corresponding value when
             # training the LM.
             # If not specified, weight '1' is used for all data sources.

am_dir=      # Base AM directory to take the tree and transition model from.
             # If empty, the first alignment directory is used.
lm_opts='--num-extra-lm-states=2000'  # Options passed to chain-est-phone-lm.
#end configuration section.


[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1;

# At least one alignment directory and the output directory are required;
# otherwise print the usage message and fail.
if [ $# -lt 2 ]; then
  echo "Usage: $0 [options] <ali-dir1> [<ali-dir2> ...] <out-dir>";
  echo "e.g.: $0 exp/tri1_ali exp/tri2_ali exp/chain/tdnn_1a_sp";
  echo "Options: "
  echo " --cmd (run.pl|queue.pl...)      # Specify how to run jobs.";
  echo "--lm-opts                        # Options for phone LM generation";
  echo "--num-repeats                    # Comma-separated list of positive integer"
  echo "                                 # multiplicities, one for each input"
  echo "                                 # alignment directory.  The alignments"
  echo "                                 # from each source will be scaled by"
  echo "                                 # the corresponding value when training"
  echo "                                 # the LM.  If not specified, weight '1'"
  echo "                                 # is used for all data sources."
  echo "--am-dir                         # Path to the base AM directory. Set this"
  echo "                                 # when the AM you will be training from"
  echo "                                 # isn't necessarily the one which created"
  echo "                                 # the alignments. If this is not set, the"
  echo "                                 # tree and transition model from the first"
  echo "                                 # ali-dir will be copied to out-dir."
  exit 1;
fi

dir=${@: -1}   # the working directory: last argument to the script
# All arguments except the last one are alignment directories.  Slicing the
# quoted positional parameters avoids both word splitting of the arguments and
# the unquoted 'unset array[index]' idiom, whose argument is subject to
# pathname expansion and could match a file in the current directory.
ali_dirs=( "${@:1:$#-1}" )
num_alignments=${#ali_dirs[@]}    # number of alignment dirs to combine

# Default source of the tree and transition model: the first alignment dir.
if [ -z "$am_dir" ]; then
  am_dir=${ali_dirs[0]}
fi

mkdir -p $dir/log

# Go through each alignment directory and make sure the phones match.
# num_jobs is checked too, since it is read for every alignment directory
# when the phone sequences are extracted further down.
for n in $(seq 0 $((num_alignments-1))); do
  ali_dir=${ali_dirs[$n]}
  for f in $ali_dir/ali.1.gz $ali_dir/final.mdl $ali_dir/tree $ali_dir/num_jobs; do
    [ ! -f $f ] && echo "$0: Expected file $f to exist" && exit 1;
  done
  utils/lang/check_phones_compatible.sh ${am_dir}/phones.txt \
    ${ali_dirs[$n]}/phones.txt || exit 1;
done

# Make sure we have the AM and tree in the am_dir.
for f in $am_dir/final.mdl $am_dir/tree; do
  [ ! -f $f ] && echo "$0: Expected file $f to exist" && exit 1;
done

cp $am_dir/tree $dir || exit 1

# Turn the --num-repeats option into an array with one entry per alignment
# directory.
if [ -z "$num_repeats" ]; then
  # If 'num_repeats' is not specified, set num_repeats_array to e.g. (1 1 1).
  num_repeats_array=( $(for n in $(seq $num_alignments); do echo 1; done) )
else
  num_repeats_array=(${num_repeats//,/ })
  # Keep the element count in its own variable: $num_repeats must retain the
  # value the user passed, because it is printed in the error messages (here
  # and in the per-directory check that follows).
  num_repeats_count=${#num_repeats_array[@]}
  if [ $num_repeats_count -ne $num_alignments ]; then
    echo "$0: too many or too few elements in --num-repeats option: '$num_repeats'"
    exit 1
  fi
fi

# Convert each alignment directory to phone sequences ($dir/phones.N.gz) and
# build up the list of those files for LM estimation; a file is listed once
# per repeat requested for its source via --num-repeats.
all_phones=""
for n in $(seq 0 $((num_alignments-1))); do
  src_dir=${ali_dirs[$n]}
  repeats=${num_repeats_array[$n]}
  num_jobs=$(cat $src_dir/num_jobs)
  phones_gz=$dir/phones.$n.gz

  # A non-integer entry makes the numeric test itself fail, so it is rejected
  # here as well.
  [ "$repeats" -ge 0 ] || {
    echo "Expected comma-separated list of integers for --num-repeats option, got '$num_repeats'"
    exit 1
  }

  if [ $stage -le 1 ]; then
    for j in $(seq $num_jobs); do
      gunzip -c $src_dir/ali.$j.gz
    done | \
      ali-to-phones $src_dir/final.mdl ark:- "ark:|gzip -c >$phones_gz" || exit 1;
  fi

  if ! [ -s $phones_gz ]; then
    echo "$dir/phones.$n.gz is empty or does not exist"
    exit 1
  fi

  for r in $(seq $repeats); do
    all_phones="$all_phones $phones_gz"
  done
done

# Stage 2: estimate the phone LM from the (possibly repeated) phone sequence
# files listed in $all_phones, then delete those temporary files.
if [ $stage -le 2 ]; then
  $cmd $dir/log/make_phone_lm_fst.log \
    gunzip -c $all_phones \| \
    chain-est-phone-lm $lm_opts ark:- $dir/phone_lm.fst || exit 1;
  rm $dir/phones.*.gz
fi

# Stage 3: copy the transition model from $am_dir into $dir/0.trans_mdl.
if [ $stage -le 3 ]; then
  copy-transition-model $am_dir/final.mdl $dir/0.trans_mdl || exit 1;
fi

# Stage 4: create den.fst and normalization.fst from the tree, the transition
# model and the phone LM.
if [ $stage -le 4 ]; then
  $cmd $dir/log/make_den_fst.log \
    chain-make-den-fst $dir/tree $dir/0.trans_mdl \
    $dir/phone_lm.fst \
    $dir/den.fst $dir/normalization.fst || exit 1
fi

echo "Successfully created {den,normalization}.fst"

exit 0


================================================
FILE: egs/steps/nnet3/chain/multilingual/combine_egs.sh
================================================
#!/usr/bin/env bash

# Copyright 2017     Pegah Ghahremani
#           2017-18  Vimal Manohar
# Apache 2.0

# This script generates examples for multilingual training of 'chain' 
# models using separate input egs dir per language as input.
# This script is similar to steps/nnet3/multilingual/combine_egs.sh, but 
# works on 'chain' egs. This is also useful for semi-supervised training,
# where supervised and unsupervised datasets are treated as different 
# languages.

# This scripts produces 3 sets of files --
# cegs.*.scp, cegs.output.*.ark, cegs.weight.*.ark
#
# cegs.*.scp are the SCP files of the training examples.
# cegs.weight.*.ark map from the key of the example to the language-specific
# weight of that example.
# cegs.output.*.ark map from the key of the example to the name of
# the output-node in the neural net for that specific language, e.g.
# 'output-2'.
#
# Begin configuration section.
cmd=run.pl              # How to run jobs (run.pl, queue.pl, ...).
block_size=256          # This is the number of consecutive egs that we take from
                        # each source, and it only affects the locality of disk
                        # access.
lang2weight=            # array of weights, one per input language, to scale example's output
                        # w.r.t its input language during training.  If non-empty
                        # it is passed on as --lang2weight to
                        # allocate_multilingual_examples.py.
stage=0                 # Stage to (re)start from; stages 0 and 1 are used below.

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# At least <num-input-langs>, one egs dir and the output dir are required;
# otherwise print the usage message and fail.  The defaults shown match the
# configuration section of this script.
if [ $# -lt 3 ]; then
  cat <<EOF
  This script generates examples for multilingual training of neural network
  using separate input egs dir per language as input.
  See top of the script for details.

  Usage: $0 [opts] <num-input-langs,N> <lang1-egs-dir> ...<langN-egs-dir> <multilingual-egs-dir>
   e.g.: $0 [opts] 2 exp/lang1/egs exp/lang2/egs exp/multi/egs

  Options:
      --cmd (utils/run.pl|utils/queue.pl <queue opts>)  # how to run jobs.
      --block-size <int|256>      # it is the number of consecutive egs that we take from
                                  # each source, and it only affects the locality of disk
                                  # access. This does not have to be the actual minibatch size
      --lang2weight <string>      # weights, one per input language, used to scale the
                                  # examples of each language during training (passed
                                  # through to allocate_multilingual_examples.py)
      --stage <int|0>             # stage to do partial re-run from.
EOF
  exit 1;
fi

num_langs=$1

shift 1
args=("$@")
# The last argument is the multilingual output directory.  '${@: -1}' is used
# rather than a negative array subscript, which older bash versions reject.
megs_dir=${@: -1} # multilingual directory

# Validate the argument count before touching the file system, so that a
# wrong invocation does not create directories under an unintended path.
if [ ${#args[@]} != $((num_langs+1)) ]; then
  echo "$0: num of input example dirs provided is not compatible with num_langs $num_langs."
  echo "Usage:$0 [opts] <num-input-langs,N> <lang1-egs-dir> ...<langN-egs-dir> <multilingual-egs-dir>"
  echo "Usage:$0 [opts] 2 exp/lang1/egs exp/lang2/egs exp/multi/egs"
  exit 1;
fi
mkdir -p $megs_dir
mkdir -p $megs_dir/info

# Files that every input egs dir must provide, and the lists of scp files that
# are accumulated over the languages further down.
required="cegs.scp combine.scp train_diagnostic.scp valid_diagnostic.scp"
train_scp_list=
train_diagnostic_scp_list=
valid_diagnostic_scp_list=
combine_scp_list=

# read parameters from ${args[0]}/info and cmvn_opts
# to write in multilingual egs_dir.
check_params="info/feat_dim info/ivector_dim info/left_context info/right_context info/left_context_initial info/right_context_final cmvn_opts"
# Fail right away if ivector_dim cannot be read, instead of running the
# numeric test below on an empty string.
ivec_dim=$(cat ${args[0]}/info/ivector_dim) || exit 1;
if [ $ivec_dim -ne 0 ];then check_params="$check_params info/final.ie.id"; fi

for param in $check_params info/frames_per_eg; do
  cat ${args[0]}/$param > $megs_dir/$param || exit 1;
done

# Walk over the input egs dirs: verify the required files, add up the number
# of archives, extend the per-type scp lists, and compare each parameter file
# against the copy already written to $megs_dir.
tot_num_archives=0
for lang in $(seq 0 $((num_langs-1))); do
  multi_egs_dir[$lang]=${args[$lang]}
  this_egs_dir=${multi_egs_dir[$lang]}

  for f in $required; do
    if ! [ -f $this_egs_dir/$f ]; then
      echo "$0: no such file ${multi_egs_dir[$lang]}/$f."
      exit 1;
    fi
  done

  num_archives=$(cat $this_egs_dir/info/num_archives)
  tot_num_archives=$((tot_num_archives+num_archives))

  train_scp_list="$train_scp_list ${args[$lang]}/cegs.scp"
  train_diagnostic_scp_list="$train_diagnostic_scp_list ${args[$lang]}/train_diagnostic.scp"
  valid_diagnostic_scp_list="$valid_diagnostic_scp_list ${args[$lang]}/valid_diagnostic.scp"
  combine_scp_list="$combine_scp_list ${args[$lang]}/combine.scp"

  # check parameter dimension to be the same in all egs dirs
  for f in $check_params; do
    if ! [ -f $megs_dir/$f ] || ! [ -f $this_egs_dir/$f ]; then
      # A missing parameter file is only reported, it is not fatal.
      echo "$0: file $f does not exits in $megs_dir or ${multi_egs_dir[$lang]}/$f ."
      continue
    fi
    f1=$(cat $megs_dir/$f)
    f2=$(cat $this_egs_dir/$f)
    if [ "$f1" != "$f2" ]; then
      echo "$0: mismatch for $f in $megs_dir vs. ${multi_egs_dir[$lang]}($f1 vs. $f2)."
      exit 1;
    fi
  done
done

# Only pass --lang2weight on when the option was actually given.
if [ -n "$lang2weight" ]; then
  egs_opt="--lang2weight '$lang2weight'"
fi

# Stage 0: allocate the training examples of all languages to
# $tot_num_archives multilingual archives (cegs.*.scp and companions).
if [ $stage -le 0 ]; then
  echo "$0: allocating multilingual examples for training."
  # Generate cegs.*.scp for multilingual setup.
  $cmd $megs_dir/log/allocate_multilingual_examples_train.log \
    steps/nnet3/multilingual/allocate_multilingual_examples.py $egs_opt \
      --num-archives $tot_num_archives \
      --block-size $block_size \
      --egs-prefix "cegs." \
      $train_scp_list $megs_dir || exit 1;
fi

# Stage 1: same for the combine and diagnostic subsets, one archive each.
if [ $stage -le 1 ]; then
  echo "$0: combine combine.scp examples from all langs in $megs_dir/combine.scp."
  # Generate combine.scp for multilingual setup.
  $cmd $megs_dir/log/allocate_multilingual_examples_combine.log \
    steps/nnet3/multilingual/allocate_multilingual_examples.py $egs_opt \
      --num-archives 1 \
      --block-size $block_size \
      --egs-prefix "combine." \
      $combine_scp_list $megs_dir || exit 1;

  echo "$0: combine train_diagnostic.scp examples from all langs in $megs_dir/train_diagnostic.scp."
  # Generate train_diagnostic.scp for multilingual setup.
  $cmd $megs_dir/log/allocate_multilingual_examples_train_diagnostic.log \
    steps/nnet3/multilingual/allocate_multilingual_examples.py $egs_opt \
      --num-archives 1 \
      --block-size $block_size \
      --egs-prefix "train_diagnostic." \
      $train_diagnostic_scp_list $megs_dir || exit 1;


  echo "$0: combine valid_diagnostic.scp examples from all langs in $megs_dir/valid_diagnostic.scp."
  # Generate valid_diagnostic.scp for multilingual setup.
  $cmd $megs_dir/log/allocate_multilingual_examples_valid_diagnostic.log \
    steps/nnet3/multilingual/allocate_multilingual_examples.py $egs_opt \
      --num-archives 1 \
      --block-size $block_size \
      --egs-prefix "valid_diagnostic." \
      $valid_diagnostic_scp_list $megs_dir || exit 1;

  # Drop the archive index '1' from the single-archive outputs just created.
  # This is done inside the stage that produces the files, so that a run
  # resumed with a later --stage does not fail on files already renamed.
  for egs_type in combine train_diagnostic valid_diagnostic; do
    mv $megs_dir/${egs_type}.output.1.ark $megs_dir/${egs_type}.output.ark || exit 1;
    mv $megs_dir/${egs_type}.weight.1.ark $megs_dir/${egs_type}.weight.ark || exit 1;
    mv $megs_dir/${egs_type}.1.scp $megs_dir/${egs_type}.scp || exit 1;
  done
fi

# Give the info files of the training egs their final names.  On a resumed
# run they may already have been renamed; that is accepted, but having
# neither the source nor the destination file is an error.
for info_name in num_archives num_tasks; do
  if [ -f $megs_dir/info/cegs.$info_name ]; then
    mv $megs_dir/info/cegs.$info_name $megs_dir/info/$info_name || exit 1;
  elif [ ! -f $megs_dir/info/$info_name ]; then
    echo "$0: expected file $megs_dir/info/cegs.$info_name to exist (has stage 0 been run?)"
    exit 1;
  fi
done
echo "$0: Finished preparing multilingual training example."


================================================
FILE: egs/steps/nnet3/chain/train.py
================================================
#!/usr/bin/env python

# Copyright 2016    Vijayaditya Peddinti.
#           2016    Vimal Manohar
# Apache 2.0.

""" This script is based on steps/nnet3/chain/train.sh
"""
from __future__ import division
from __future__ import print_function

import argparse
import logging
import os
import pprint
import shutil
import sys
import traceback

sys.path.insert(0, 'steps')
import libs.nnet3.train.common as common_train_lib
import libs.common as common_lib
import libs.nnet3.train.chain_objf.acoustic_model as chain_lib
import libs.nnet3.report.log_parse as nnet3_log_parse


# Console handler: a StreamHandler with no explicit stream writes to stderr.
# Each record is stamped with time, source location and level.
handler = logging.StreamHandler()
handler.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - "
                              "%(funcName)s - %(levelname)s ] %(message)s")
handler.setFormatter(formatter)

# The logger is named 'libs' (not __name__), matching the top-level package
# of the libs.* modules imported above.
logger = logging.getLogger('libs')
logger.setLevel(logging.INFO)
logger.addHandler(handler)
logger.info('Starting chain model trainer (train.py)')


def get_args():
    """Parse the command line and return the processed options.

    The compulsory arguments are declared as named (``--option``) arguments
    for readability.

    The common options are defined in the object
    libs.nnet3.train.common.CommonParser.parser.
    See steps/libs/nnet3/train/common.py

    Returns:
        The two-element list ``[args, run_opts]`` produced by process_args().
    """

    # conflict_handler='resolve' lets the options declared below override
    # same-named options inherited from the common parent parser.
    parser = argparse.ArgumentParser(
        description="""Trains RNN and DNN acoustic models using the 'chain'
        objective function.""",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        conflict_handler='resolve',
        parents=[common_train_lib.CommonParser().parser])

    # egs extraction options
    parser.add_argument("--egs.chunk-width", type=str, dest='chunk_width',
                        default="20",
                        help="""Number of frames per chunk in the examples
                        used to train the RNN.   Caution: if you double this you
                        should halve --trainer.samples-per-iter.  May be
                        a comma-separated list of alternatives: first width
                        is the 'principal' chunk-width, used preferentially""")
    parser.add_argument("--egs.nj", type=int, required=False,
                        default=0, dest="egs_nj",
                        help="""Number of jobs to use when generating egs.
                        Default: the same number as used for tree generation.
                        You probably do not need to tweak this, unless you
                        want to adapt a neural network on some different,
                        smaller-size data.""")

    # chain options
    parser.add_argument("--chain.lm-opts", type=str, dest='lm_opts',
                        default=None, action=common_lib.NullstrToNoneAction,
                        help="options to be passed to chain-est-phone-lm")
    parser.add_argument("--chain.l2-regularize", type=float,
                        dest='l2_regularize', default=0.0,
                        help="""Weight of regularization function which is the
                        l2-norm of the output of the network. It should be used
                        without the log-softmax layer for the outputs, as the
                        l2-norm of the log-softmax outputs can dominate the
                        objective function.""")
    parser.add_argument("--chain.xent-regularize", type=float,
                        dest='xent_regularize', default=0.0,
                        help="Weight of regularization function which is the "
                        "cross-entropy cost of the outputs.")
    # The next four options used to have an empty help string, which makes
    # argparse omit them (and their defaults) from the --help listing.
    parser.add_argument("--chain.right-tolerance", type=int,
                        dest='right_tolerance', default=5,
                        help="Right tolerance passed to the chain egs "
                        "generation (get_egs.sh).")
    parser.add_argument("--chain.left-tolerance", type=int,
                        dest='left_tolerance', default=5,
                        help="Left tolerance passed to the chain egs "
                        "generation (get_egs.sh).")
    parser.add_argument("--chain.leaky-hmm-coefficient", type=float,
                        dest='leaky_hmm_coefficient', default=0.00001,
                        help="Leaky-HMM coefficient passed to the chain "
                        "training, diagnostic and model-combination steps.")
    parser.add_argument("--chain.apply-deriv-weights", type=str,
                        dest='apply_deriv_weights', default=True,
                        action=common_lib.StrToBoolAction,
                        choices=["true", "false"],
                        help="Passed to each training iteration as its "
                        "apply_deriv_weights setting.")
    parser.add_argument("--chain.frame-subsampling-factor", type=int,
                        dest='frame_subsampling_factor', default=3,
                        help="ratio of frames-per-second of features we "
                        "train on, to chain model's output")
    parser.add_argument("--chain.alignment-subsampling-factor", type=int,
                        dest='alignment_subsampling_factor',
                        default=3,
                        help="ratio of frames-per-second of input "
                        "alignments to chain model's output")
    parser.add_argument("--chain.left-deriv-truncate", type=int,
                        dest='left_deriv_truncate',
                        default=None,
                        help="Deprecated. Kept for back compatibility")

    # trainer options
    parser.add_argument("--trainer.input-model", type=str,
                        dest='input_model', default=None,
                        action=common_lib.NullstrToNoneAction,
                        help="If specified, this model is used as initial "
                             "'raw' model (0.raw in the script) instead of "
                             "initializing the model from the xconfig. "
                             "Also configs dir is not expected to exist "
                             "and left/right context is computed from this "
                             "model.")
    parser.add_argument("--trainer.num-epochs", type=float, dest='num_epochs',
                        default=10.0,
                        help="Number of epochs to train the model")
    parser.add_argument("--trainer.frames-per-iter", type=int,
                        dest='frames_per_iter', default=800000,
                        help="""Each iteration of training, see this many
                        [input] frames per job.  This option is passed to
                        get_egs.sh.  Aim for about a minute of training
                        time""")

    parser.add_argument("--trainer.num-chunk-per-minibatch", type=str,
                        dest='num_chunk_per_minibatch', default='128',
                        help="""Number of sequences to be processed in
                        parallel every minibatch.  May be a more general
                        rule as accepted by the --minibatch-size option of
                        nnet3-merge-egs; run that program without args to see
                        the format.""")

    # Parameters for the optimization
    parser.add_argument("--trainer.optimization.initial-effective-lrate",
                        type=float, dest='initial_effective_lrate',
                        default=0.0002,
                        help="Learning rate used during the initial iteration")
    parser.add_argument("--trainer.optimization.final-effective-lrate",
                        type=float, dest='final_effective_lrate',
                        default=0.00002,
                        help="Learning rate used during the final iteration")
    parser.add_argument("--trainer.optimization.shrink-value", type=float,
                        dest='shrink_value', default=1.0,
                        help="""Scaling factor used for scaling the parameter
                        matrices when the derivative averages are below the
                        shrink-threshold at the non-linearities.  E.g. 0.99.
                        Only applicable when the neural net contains sigmoid or
                        tanh units.""")
    parser.add_argument("--trainer.optimization.shrink-saturation-threshold",
                        type=float,
                        dest='shrink_saturation_threshold', default=0.40,
                        help="""Threshold that controls when we apply the
                        'shrinkage' (i.e. scaling by shrink-value).  If the
                        saturation of the sigmoid and tanh nonlinearities in
                        the neural net (as measured by
                        steps/nnet3/get_saturation.pl) exceeds this threshold
                        we scale the parameter matrices with the
                        shrink-value.""")
    # RNN-specific training options
    parser.add_argument("--trainer.deriv-truncate-margin", type=int,
                        dest='deriv_truncate_margin', default=None,
                        help="""(Relevant only for recurrent models). If
                        specified, gives the margin (in input frames) around
                        the 'required' part of each chunk that the derivatives
                        are backpropagated to. If unset, the derivatives are
                        backpropagated all the way to the boundaries of the
                        input data. E.g. 8 is a reasonable setting. Note: the
                        'required' part of the chunk is defined by the model's
                        {left,right}-context.""")

    # General options
    parser.add_argument("--feat-dir", type=str, required=True,
                        help="Directory with features used for training "
                        "the neural network.")
    parser.add_argument("--tree-dir", type=str, required=True,
                        help="""Directory containing the tree to use for this
                        model (we also expect final.mdl and ali.*.gz in that
                        directory)""")
    parser.add_argument("--lat-dir", type=str, required=True,
                        help="Directory with numerator lattices "
                        "used for training the neural network.")
    parser.add_argument("--dir", type=str, required=True,
                        help="Directory to store the models and "
                        "all other files.")

    # Echo the invocation (joined, then as a list) so it is recorded in the
    # job's standard output.
    print(' '.join(sys.argv))
    print(sys.argv)

    args = parser.parse_args()

    [args, run_opts] = process_args(args)

    return [args, run_opts]


def process_args(args):
    """Validate the options from get_args() and build the run-time options.

    Args:
        args: the argparse Namespace produced by the parser in get_args().
            ``deriv_truncate_margin`` and ``use_gpu`` may be rewritten in
            place.

    Returns:
        The two-element list ``[args, run_opts]`` where run_opts is a
        common_train_lib.RunOpts carrying the queue/GPU options and commands.

    Raises:
        Exception: if an option value is invalid, --dir does not exist, or
            neither {dir}/configs nor an existing --trainer.input-model is
            available.
    """

    chunk_width_ok = common_train_lib.validate_chunk_width(args.chunk_width)
    if not chunk_width_ok:
        raise Exception("--egs.chunk-width has an invalid value")

    minibatch_ok = common_train_lib.validate_minibatch_size_str(
        args.num_chunk_per_minibatch)
    if not minibatch_ok:
        raise Exception("--trainer.num-chunk-per-minibatch has an invalid value")

    if args.chunk_left_context < 0:
        raise Exception("--egs.chunk-left-context should be non-negative")

    if args.chunk_right_context < 0:
        raise Exception("--egs.chunk-right-context should be non-negative")

    # The deprecated option, when given, overrides the margin with its negation.
    deprecated_truncate = args.left_deriv_truncate
    if deprecated_truncate is not None:
        args.deriv_truncate_margin = -deprecated_truncate
        logger.warning(
            "--chain.left-deriv-truncate (deprecated) is set by user, and "
            "--trainer.deriv-truncate-margin is set to negative of that "
            "value={0}. We recommend using the option "
            "--trainer.deriv-truncate-margin.".format(
                args.deriv_truncate_margin))

    if not os.path.exists(args.dir):
        raise Exception("Directory specified with --dir={0} "
                        "does not exist.".format(args.dir))

    # The network comes either from {dir}/configs or from an existing input
    # model; at least one of the two has to be present.
    if not (os.path.exists(args.dir + "/configs")
            or (args.input_model is not None
                and os.path.exists(args.input_model))):
        raise Exception("Either --trainer.input-model option should be supplied, "
                        "and exist; or the {0}/configs directory should exist."
                        "".format(args.dir))

    # set the options corresponding to args.use_gpu
    run_opts = common_train_lib.RunOpts()

    # Map the boolean-style spellings onto "yes"/"no".
    if args.use_gpu == "true":
        args.use_gpu = "yes"
    elif args.use_gpu == "false":
        args.use_gpu = "no"

    if args.use_gpu in ("yes", "wait"):
        cuda_available = common_lib.check_if_cuda_compiled()
        if not cuda_available:
            logger.warning(
                """You are running with one thread but you have not compiled
                   for CUDA.  You may be running a setup optimized for GPUs.
                   If you have GPUs and have nvcc installed, go to src/ and do
                   ./configure; make""")

        gpu_queue_opt = "--gpu 1"
        run_opts.train_queue_opt = gpu_queue_opt
        run_opts.parallel_train_opts = "--use-gpu={}".format(args.use_gpu)
        run_opts.combine_queue_opt = gpu_queue_opt
        run_opts.combine_gpu_opt = "--use-gpu={}".format(args.use_gpu)
    else:
        logger.warning("Without using a GPU this will be very slow. "
                       "nnet3 does not yet support multiple threads.")

        cpu_flag = "--use-gpu=no"
        run_opts.train_queue_opt = ""
        run_opts.parallel_train_opts = cpu_flag
        run_opts.combine_queue_opt = ""
        run_opts.combine_gpu_opt = cpu_flag

    run_opts.command = args.command
    # The egs command falls back to the general command when not given.
    if args.egs_command is not None:
        run_opts.egs_command = args.egs_command
    else:
        run_opts.egs_command = args.command

    return [args, run_opts]


def train(args, run_opts):
    """ The main function for training.

    Runs the whole chain-model training pipeline inside args.dir.  A stage is
    executed when args.stage is less than or equal to its number:

        -6  create the phone language model
        -5  copy the tree into args.dir and create the denominator FST
        -4  create init.raw from configs/init.config (only when that file
            exists and no input model was given)
        -3  generate the egs (only when args.egs_dir was not supplied)
        -2  compute the preconditioning matrix (same conditions as -4)
        -1  prepare the initial acoustic model
        0 .. num_iters-1   one training iteration each
        num_iters          final model combination, or symlinking the last
                           model to final.mdl

    Afterwards the experiment directory is optionally cleaned up, the
    log-probability report is written to {dir}/accuracy.report (and mailed
    when args.email is set) and steps/info/chain_dir_info.pl is run.

    Args:
        args: a Namespace object with the required parameters
            obtained from the function process_args()
        run_opts: RunOpts object obtained from the process_args()

    Raises:
        Exception: if the model context variables are missing, the files
            needed for egs generation are absent, num_jobs_final exceeds the
            expanded number of archives, no valid_diagnostic file exists in
            the egs dir, or proportional-shrink yields a shrink value <= 0.5.
    """

    arg_string = pprint.pformat(vars(args))
    logger.info("Arguments for the experiment\n{0}".format(arg_string))

    # Check files
    # (the lattice dir is only checked when we have to generate egs ourselves)
    chain_lib.check_for_required_files(args.feat_dir, args.tree_dir,
                                       args.lat_dir if args.egs_dir is None
                                       else None)

    # Copy phones.txt from tree-dir to dir. Later, steps/nnet3/decode.sh will
    # use it to check compatibility between training and decoding phone-sets.
    shutil.copy('{0}/phones.txt'.format(args.tree_dir), args.dir)

    # Set some variables.
    # A non-positive --egs.nj means "take the job count from the tree dir".
    if args.egs_nj <= 0:
        num_jobs = common_lib.get_number_of_jobs(args.tree_dir)
    else:
        num_jobs = args.egs_nj
    feat_dim = common_lib.get_feat_dim(args.feat_dir)
    ivector_dim = common_lib.get_ivector_dim(args.online_ivector_dir)
    ivector_id = common_lib.get_ivector_extractor_id(args.online_ivector_dir)

    # split the training data into parts for individual jobs
    # (num_jobs pieces: the job count read from the tree dir, unless
    # --egs.nj overrides it)
    common_lib.execute_command("utils/split_data.sh {0} {1}"
                               "".format(args.feat_dir, num_jobs))
    with open('{0}/num_jobs'.format(args.dir), 'w') as f:
        f.write(str(num_jobs))

    if args.input_model is None:
        config_dir = '{0}/configs'.format(args.dir)
        var_file = '{0}/vars'.format(config_dir)

        variables = common_train_lib.parse_generic_config_vars_file(var_file)
    else:
        # If args.input_model is specified, the model left and right contexts
        # are computed using input_model.
        variables = common_train_lib.get_input_model_info(args.input_model)

    # Set some variables.
    try:
        model_left_context = variables['model_left_context']
        model_right_context = variables['model_right_context']
    except KeyError as e:
        raise Exception("KeyError {0}: Variables need to be defined in "
                        "{1}".format(str(e), '{0}/configs'.format(args.dir)))

    # Total context = extra chunk context requested by the user + the context
    # the model itself needs.  For the initial/final variants a negative
    # user value is propagated as -1.
    left_context = args.chunk_left_context + model_left_context
    right_context = args.chunk_right_context + model_right_context
    left_context_initial = (args.chunk_left_context_initial + model_left_context if
                            args.chunk_left_context_initial >= 0 else -1)
    right_context_final = (args.chunk_right_context_final + model_right_context if
                           args.chunk_right_context_final >= 0 else -1)

    # Initialize as "raw" nnet, prior to training the LDA-like preconditioning
    # matrix.  This first config just does any initial splicing that we do;
    # we do this as it's a convenient way to get the stats for the 'lda-like'
    # transform.
    # (That initialization is the stage -4 block below; stages -6 and -5
    # first build the phone LM and the denominator FST.)
    if (args.stage <= -6):
        logger.info("Creating phone language-model")
        chain_lib.create_phone_lm(args.dir, args.tree_dir, run_opts,
                                  lm_opts=args.lm_opts)

    if (args.stage <= -5):
        logger.info("Creating denominator FST")
        shutil.copy('{0}/tree'.format(args.tree_dir), args.dir)
        chain_lib.create_denominator_fst(args.dir, args.tree_dir, run_opts)

    if ((args.stage <= -4) and
            os.path.exists("{0}/configs/init.config".format(args.dir))
            and (args.input_model is None)):
        logger.info("Initializing a basic network for estimating "
                    "preconditioning matrix")
        common_lib.execute_command(
            """{command} {dir}/log/nnet_init.log \
            nnet3-init --srand=-2 {dir}/configs/init.config \
            {dir}/init.raw""".format(command=run_opts.command,
                                     dir=args.dir))

    egs_left_context = left_context + args.frame_subsampling_factor // 2
    egs_right_context = right_context + args.frame_subsampling_factor // 2
    # note: the '+ args.frame_subsampling_factor // 2' is to allow for the
    # fact that we'll be shifting the data slightly during training to give
    # variety to the training data.
    egs_left_context_initial = (left_context_initial +
                                args.frame_subsampling_factor // 2 if
                                left_context_initial >= 0 else -1)
    egs_right_context_final = (right_context_final +
                               args.frame_subsampling_factor // 2 if
                               right_context_final >= 0 else -1)

    default_egs_dir = '{0}/egs'.format(args.dir)

    # Warn about egs-generation options that were changed from the values
    # compared against here but are ignored because egs are supplied.
    if (args.egs_dir is not None) and (args.cmvn_opts != "--norm-means=false --norm-vars=false"):
        logger.warning("the --feat.cmvn-opts option has no effect because we are not dumping egs")

    if (args.egs_dir is not None) and (args.frames_per_iter != 800000):
        logger.warning("the --trainer.frames-per-iter option has no effect because we are not dumping egs")

    if ((args.stage <= -3) and args.egs_dir is None):
        logger.info("Generating egs")
        if (not os.path.exists("{0}/den.fst".format(args.dir)) or
                not os.path.exists("{0}/normalization.fst".format(args.dir)) or
                not os.path.exists("{0}/tree".format(args.dir))):
            raise Exception("Chain egs generation expects {0}/den.fst, "
                            "{0}/normalization.fst and {0}/tree "
                            "to exist.".format(args.dir))
        # this is where get_egs.sh is called.
        chain_lib.generate_chain_egs(
            dir=args.dir, data=args.feat_dir,
            lat_dir=args.lat_dir, egs_dir=default_egs_dir,
            left_context=egs_left_context,
            right_context=egs_right_context,
            left_context_initial=egs_left_context_initial,
            right_context_final=egs_right_context_final,
            run_opts=run_opts,
            left_tolerance=args.left_tolerance,
            right_tolerance=args.right_tolerance,
            frame_subsampling_factor=args.frame_subsampling_factor,
            alignment_subsampling_factor=args.alignment_subsampling_factor,
            frames_per_eg_str=args.chunk_width,
            srand=args.srand,
            egs_opts=args.egs_opts,
            cmvn_opts=args.cmvn_opts,
            online_ivector_dir=args.online_ivector_dir,
            frames_per_iter=args.frames_per_iter,
            stage=args.egs_stage)

    if args.egs_dir is None:
        egs_dir = default_egs_dir
    else:
        egs_dir = args.egs_dir

    # verify_egs_dir returns the contexts, chunk-width string and archive
    # count for the egs dir; the contexts computed above are overwritten.
    [egs_left_context, egs_right_context,
     frames_per_eg_str, num_archives] = (
         common_train_lib.verify_egs_dir(egs_dir, feat_dim,
                                         ivector_dim, ivector_id,
                                         egs_left_context, egs_right_context,
                                         egs_left_context_initial,
                                         egs_right_context_final))
    # NOTE(review): this check is an assert, so it disappears under python -O.
    assert(args.chunk_width == frames_per_eg_str)
    # Each archive is counted once per frame shift (see the data-shifting
    # note above), hence the multiplication by frame_subsampling_factor.
    num_archives_expanded = num_archives * args.frame_subsampling_factor

    if (args.num_jobs_final > num_archives_expanded):
        raise Exception('num_jobs_final cannot exceed the '
                        'expanded number of archives')

    # copy the properties of the egs to dir for
    # use during decoding
    logger.info("Copying the properties from {0} to {1}".format(egs_dir, args.dir))
    common_train_lib.copy_egs_properties_to_exp_dir(egs_dir, args.dir)

    # An egs dir without valid_diagnostic.cegs but with valid_diagnostic.scp
    # is treated as a multitask egs dir.
    if not os.path.exists('{0}/valid_diagnostic.cegs'.format(egs_dir)):
        if (not os.path.exists('{0}/valid_diagnostic.scp'.format(egs_dir))):
            raise Exception('Neither {0}/valid_diagnostic.cegs nor '
                            '{0}/valid_diagnostic.scp exist.'
                            'This script expects one of them.'.format(egs_dir))
        use_multitask_egs = True
    else:
        use_multitask_egs = False

    if ((args.stage <= -2) and (os.path.exists(args.dir+"/configs/init.config"))
            and (args.input_model is None)):
        logger.info('Computing the preconditioning matrix for input features')

        chain_lib.compute_preconditioning_matrix(
            args.dir, egs_dir, num_archives, run_opts,
            max_lda_jobs=args.max_lda_jobs,
            rand_prune=args.rand_prune,
            use_multitask_egs=use_multitask_egs)

    if (args.stage <= -1):
        logger.info("Preparing the initial acoustic model.")
        chain_lib.prepare_initial_acoustic_model(args.dir, run_opts,
                                                 input_model=args.input_model)

    # Written on every run, regardless of args.stage.
    with open("{0}/frame_subsampling_factor".format(args.dir), "w") as f:
        f.write(str(args.frame_subsampling_factor))

    # set num_iters so that as close as possible, we process the data
    # $num_epochs times, i.e. ($num_iters*$avg_num_jobs) ==
    # $num_epochs*$num_archives, where
    # avg_num_jobs=(num_jobs_initial+num_jobs_final)/2.
    num_archives_to_process = int(args.num_epochs * num_archives_expanded)
    num_archives_processed = 0
    num_iters = ((num_archives_to_process * 2)
                 // (args.num_jobs_initial + args.num_jobs_final))

    # If do_final_combination is True, compute the set of models_to_combine.
    # Otherwise, models_to_combine will be none.
    if args.do_final_combination:
        models_to_combine = common_train_lib.get_model_combine_iters(
            num_iters, args.num_epochs,
            num_archives_expanded, args.max_models_combine,
            args.num_jobs_final)
    else:
        models_to_combine = None

    # Derivative-truncation limits; left as None unless a truncate margin
    # was given.
    min_deriv_time = None
    max_deriv_time_relative = None
    if args.deriv_truncate_margin is not None:
        min_deriv_time = -args.deriv_truncate_margin - model_left_context
        max_deriv_time_relative = \
           args.deriv_truncate_margin + model_right_context

    logger.info("Training will run for {0} epochs = "
                "{1} iterations".format(args.num_epochs, num_iters))

    for iter in range(num_iters):
        if (args.exit_stage is not None) and (iter == args.exit_stage):
            logger.info("Exiting early due to --exit-stage {0}".format(iter))
            return

        current_num_jobs = common_train_lib.get_current_num_jobs(
            iter, num_iters,
            args.num_jobs_initial, args.num_jobs_step, args.num_jobs_final)

        # Iterations below args.stage are skipped (resuming a run), but the
        # archive counter at the bottom of the loop still advances for them.
        if args.stage <= iter:
            model_file = "{dir}/{iter}.mdl".format(dir=args.dir, iter=iter)

            lrate = common_train_lib.get_learning_rate(iter, current_num_jobs,
                                                       num_iters,
                                                       num_archives_processed,
                                                       num_archives_to_process,
                                                       args.initial_effective_lrate,
                                                       args.final_effective_lrate)
            # Start from the proportional shrink; the explicit shrink-value
            # replaces it only when it is smaller and should_do_shrinkage()
            # says shrinkage applies to this model.
            shrinkage_value = 1.0 - (args.proportional_shrink * lrate)
            if shrinkage_value <= 0.5:
                raise Exception("proportional-shrink={0} is too large, it gives "
                                "shrink-value={1}".format(args.proportional_shrink,
                                                          shrinkage_value))
            if args.shrink_value < shrinkage_value:
                shrinkage_value = (args.shrink_value
                                   if common_train_lib.should_do_shrinkage(
                                       iter, model_file,
                                       args.shrink_saturation_threshold)
                                   else shrinkage_value)

            # Progress figures, used only for the log line below.
            percent = num_archives_processed * 100.0 / num_archives_to_process
            epoch = (num_archives_processed * args.num_epochs
                     / num_archives_to_process)
            shrink_info_str = ''
            if shrinkage_value != 1.0:
                shrink_info_str = 'shrink: {0:0.5f}'.format(shrinkage_value)
            logger.info("Iter: {0}/{1}   Jobs: {2}   "
                        "Epoch: {3:0.2f}/{4:0.1f} ({5:0.1f}% complete)   "
                        "lr: {6:0.6f}   {7}".format(iter, num_iters - 1,
                                                    current_num_jobs,
                                                    epoch, args.num_epochs,
                                                    percent,
                                                    lrate, shrink_info_str))

            chain_lib.train_one_iteration(
                dir=args.dir,
                iter=iter,
                srand=args.srand,
                egs_dir=egs_dir,
                num_jobs=current_num_jobs,
                num_archives_processed=num_archives_processed,
                num_archives=num_archives,
                learning_rate=lrate,
                dropout_edit_string=common_train_lib.get_dropout_edit_string(
                    args.dropout_schedule,
                    float(num_archives_processed) / num_archives_to_process,
                    iter),
                train_opts=' '.join(args.train_opts),
                shrinkage_value=shrinkage_value,
                num_chunk_per_minibatch_str=args.num_chunk_per_minibatch,
                apply_deriv_weights=args.apply_deriv_weights,
                min_deriv_time=min_deriv_time,
                max_deriv_time_relative=max_deriv_time_relative,
                l2_regularize=args.l2_regularize,
                xent_regularize=args.xent_regularize,
                leaky_hmm_coefficient=args.leaky_hmm_coefficient,
                momentum=args.momentum,
                max_param_change=args.max_param_change,
                shuffle_buffer_size=args.shuffle_buffer_size,
                frame_subsampling_factor=args.frame_subsampling_factor,
                run_opts=run_opts,
                backstitch_training_scale=args.backstitch_training_scale,
                backstitch_training_interval=args.backstitch_training_interval,
                use_multitask_egs=use_multitask_egs)

            if args.cleanup:
                # do a clean up everything but the last 2 models, under certain
                # conditions
                common_train_lib.remove_model(
                    args.dir, iter-2, num_iters, models_to_combine,
                    args.preserve_model_interval)

            if args.email is not None:
                # NOTE(review): args.reporting_interval comes from the common
                # parser (not visible here); presumably a fraction of the
                # total iterations.  A report is mailed only when iter is an
                # exact multiple of the product below - confirm this is the
                # intended schedule.
                reporting_iter_interval = num_iters * args.reporting_interval
                if iter % reporting_iter_interval == 0:
                    # lets do some reporting
                    [report, times, data] = (
                        nnet3_log_parse.generate_acc_logprob_report(
                            args.dir, "log-probability"))
                    message = report
                    subject = ("Update : Expt {dir} : "
                               "Iter {iter}".format(dir=args.dir, iter=iter))
                    common_lib.send_mail(message, subject, args.email)

        # Each iteration accounts for one archive per parallel job.
        num_archives_processed = num_archives_processed + current_num_jobs

    if args.stage <= num_iters:
        if args.do_final_combination:
            logger.info("Doing final combination to produce final.mdl")
            chain_lib.combine_models(
                dir=args.dir, num_iters=num_iters,
                models_to_combine=models_to_combine,
                num_chunk_per_minibatch_str=args.num_chunk_per_minibatch,
                egs_dir=egs_dir,
                leaky_hmm_coefficient=args.leaky_hmm_coefficient,
                l2_regularize=args.l2_regularize,
                xent_regularize=args.xent_regularize,
                run_opts=run_opts,
                max_objective_evaluations=args.max_objective_evaluations,
                use_multitask_egs=use_multitask_egs)
        else:
            # Without combination, final.mdl is a symlink to the last model
            # and the diagnostics are computed for that model directly.
            logger.info("Copying the last-numbered model to final.mdl")
            common_lib.force_symlink("{0}.mdl".format(num_iters),
                                     "{0}/final.mdl".format(args.dir))
            chain_lib.compute_train_cv_probabilities(
                dir=args.dir, iter=num_iters, egs_dir=egs_dir,
                l2_regularize=args.l2_regularize, xent_regularize=args.xent_regularize,
                leaky_hmm_coefficient=args.leaky_hmm_coefficient,
                run_opts=run_opts,
                use_multitask_egs=use_multitask_egs)
            common_lib.force_symlink("compute_prob_valid.{iter}.log"
                                     "".format(iter=num_iters),
                                     "{dir}/log/compute_prob_valid.final.log".format(
                                         dir=args.dir))

    if args.cleanup:
        logger.info("Cleaning up the experiment directory "
                    "{0}".format(args.dir))
        remove_egs = args.remove_egs
        if args.egs_dir is not None:
            # this egs_dir was not created by this experiment so we will not
            # delete it
            remove_egs = False

        # leave the last-two-numbered models, for diagnostic reasons.
        common_train_lib.clean_nnet_dir(
            args.dir, num_iters - 1, egs_dir,
            preserve_model_interval=args.preserve_model_interval,
            remove_egs=remove_egs)

    # do some reporting
    # (the report file is always written; the mail is only sent when an
    # address was given)
    [report, times, data] = nnet3_log_parse.generate_acc_logprob_report(
        args.dir, "log-probability")
    if args.email is not None:
        common_lib.send_mail(report, "Update : Expt {0} : "
                                     "complete".format(args.dir), args.email)

    with open("{dir}/accuracy.report".format(dir=args.dir), "w") as f:
        f.write(report)

    common_lib.execute_command("steps/info/chain_dir_info.pl "
                               "{0}".format(args.dir))


def main():
    """Script entry point: parse options, train, and exit 1 on any failure.

    Runs ``train`` and then waits for outstanding background commands.  If
    anything is raised along the way, an e-mail notification is sent (when
    ``args.email`` is set), the traceback is printed unless the failure was
    a ``KeyboardInterrupt``, and the process exits with status 1.
    """
    args, run_opts = get_args()
    try:
        train(args, run_opts)
        common_lib.wait_for_background_commands()
    except BaseException as err:
        # BaseException rather than Exception, so that KeyboardInterrupt is
        # caught as well: that is what we get when a background thread dies.
        if args.email is not None:
            notice = ("Training session for experiment {dir} "
                      "died due to an error.".format(dir=args.dir))
            common_lib.send_mail(notice, notice, args.email)
        if isinstance(err, KeyboardInterrupt):
            # No traceback for an interrupt; just signal failure.
            sys.exit(1)
        traceback.print_exc()
        sys.exit(1)

# Only start training when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()


================================================
FILE: egs/steps/nnet3/chain/train_tdnn.sh
================================================
#!/usr/bin/env bash

# THIS SCRIPT IS DEPRECATED, see ./train.py

# note, TDNN is the same as what we used to call multisplice.
# This version of the script, nnet3/chain/train_tdnn.sh, is for 'chain' systems.

# Copyright 2012-2015  Johns Hopkins University (Author: Daniel Povey).
#           2013  Xiaohui Zhang
#           2013  Guoguo Chen
#           2014  Vimal Manohar
#           2014  Vijayaditya Peddinti
# Apache 2.0.


# Begin configuration section.
cmd=run.pl
num_epochs=10      # Number of epochs of training;
                   # the number of iterations is worked out from this.
                   # Be careful with this: we actually go over the data
                   # num-epochs * frame-subsampling-factor times, due to
                   # using different data-shifts.
apply_deriv_weights=true
initial_effective_lrate=0.0002
final_effective_lrate=0.00002
extra_left_context=0  # actually for recurrent setups.
pnorm_input_dim=3000
pnorm_output_dim=300
relu_dim=  # you can use this to make it use ReLU's instead of p-norms.

jesus_opts=  # opts to steps/nnet3/make_jesus_configs.py.
             # If nonempty, assumes you want to use the jesus nonlinearity,
             # and you should supply various options to that script in
             # this string.
rand_prune=4.0 # Relates to a speedup we do for LDA.
minibatch_size=512  # This default is suitable for GPU-based training.
                    # Set it to 128 for multi-threaded CPU-based training.
lm_opts=   # options to chain-est-phone-lm
l2_regularize=0.0
leaky_hmm_coefficient=0.00001
xent_regularize=0.0
frames_per_iter=800000  # each iteration of training, see this many [input]
                        # frames per job.  This option is passed to get_egs.sh.
                        # Aim for about a minute of training time
right_tolerance=5  # tolerance at the same frame-rate as the alignment directory.
left_tolerance=5    # tolerance at the same frame-rate as the alignment directory.
num_jobs_initial=1  # Number of neural net jobs to run in parallel at the start of training
num_jobs_final=8   # Number of neural net jobs to run in parallel at the end of training
frame_subsampling_factor=3  # ratio of frames-per-second of features we train
                            # on, to chain model's output
alignment_subsampling_factor=3  # ratio of frames-per-second of input alignments
                                # to chain model's output
get_egs_stage=0    # can be used for rerunning after partial
online_ivector_dir=
max_param_change=2.0
remove_egs=true  # set to false to disable removing egs after training is done.

max_models_combine=20 # The "max_models_combine" is the maximum number of models we give
  # to the final 'combine' stage, but these models will themselves be averages of
  # iteration-number ranges.
ngram_order=3

shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
                # on each iter.  You could set it to 0 or to a large value for complete
                # randomization, but this would both consume memory and cause spikes in
                # disk I/O.  Smaller is easier on disk and memory but less random.  It's
                # not a huge deal though, as samples are anyway randomized right at the start.
                # (the point of this is to get data in different minibatches on different iterations,
                # since in the preconditioning method, 2 samples in the same minibatch can
                # affect each others' gradients.
final_layer_normalize_target=1.0  # you can set this to less than one if you
                                  # think the final layer is learning too fast
                                  # compared with the other layers.
add_layers_period=2 # by default, add new layers every 2 iterations.
stage=-7
exit_stage=-100 # you can set this to terminate the training early.  Exits before running this stage


# count space-separated fields in splice_indexes to get num-hidden-layers.
splice_indexes="-4,-3,-2,-1,0,1,2,3,4  0  -2,2  0  -4,4 0"

# Format : layer<hidden_layer>/<frame_indices>....layer<hidden_layer>/<frame_indices> "
# note: hidden layers which are composed of one or more components,
# so hidden layer indexing is different from component count

randprune=4.0 # speeds up LDA.
use_gpu=true    # if true, we run on GPU.
cleanup=true
egs_dir=
max_lda_jobs=20  # use no more than 20 jobs for the LDA accumulation.
lda_opts=
egs_opts=
transform_dir=     # If supplied, this dir used instead of latdir to find transforms.
cmvn_opts=  # will be passed to get_lda.sh and get_egs.sh, if supplied.
            # only relevant for "raw" features, not lda.
frames_per_eg=25   # number of frames of output per chunk.  To be passed on to get_egs.sh.
left_deriv_truncate=   # number of time-steps to avoid using the deriv of, on the left.
right_deriv_truncate=  # number of time-steps to avoid using the deriv of, on the right.

# End configuration section.

trap 'for pid in $(jobs -pr); do kill -TERM $pid; done' INT QUIT TERM


echo "$0: THIS SCRIPT IS DEPRECATED"
echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# Exactly four positional arguments are required; otherwise print the usage
# message and exit.  The defaults quoted in the help text mirror the values
# assigned in the configuration section at the top of this script, and only
# options that have a corresponding variable there are listed (any other
# option would be rejected by parse_options.sh).
if [ $# != 4 ]; then
  echo "Usage: $0 [opts] <data> <tree-dir> <phone-lattice-dir> <exp-dir>"
  echo " e.g.: $0 data/train exp/chain/tri3b_tree exp/tri3_latali exp/chain/tdnn_a"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --num-epochs <#epochs|10>                        # Number of epochs of training"
  echo "  --initial-effective-lrate <lrate|0.0002>         # effective learning rate at start of training."
  echo "  --final-effective-lrate <lrate|0.00002>          # effective learning rate at end of training."
  echo "  --add-layers-period <#iters|2>                   # Number of iterations between adding hidden layers"
  echo "  --num-jobs-initial <num-jobs|1>                  # Number of parallel jobs to use for neural net training, at the start."
  echo "  --num-jobs-final <num-jobs|8>                    # Number of parallel jobs to use for neural net training, at the end"
  echo "  --minibatch-size <minibatch-size|512>            # Size of minibatch to process."
  echo "  --frames-per-iter <#frames|800000>               # Number of frames of data to process per iteration, per"
  echo "                                                   # process."
  echo "  --splice-indexes <string|\"-4,-3,-2,-1,0,1,2,3,4  0  -2,2  0  -4,4 0\">"
  echo "                                                   # Frame offsets spliced at each hidden layer: one space-separated"
  echo "                                                   # field per hidden layer, comma-separated offsets within a field."
  echo "  --stage <stage|-7>                               # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."


  exit 1;
fi

data=$1
treedir=$2
latdir=$3
dir=$4


# Check some files.
for f in $data/feats.scp $treedir/ali.1.gz $treedir/final.mdl $treedir/tree \
    $latdir/lat.1.gz $latdir/final.mdl $latdir/num_jobs $latdir/splice_opts; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

# Create the experiment directory before copying anything into it.  If $dir
# did not exist yet, "cp <file> $dir" would create a regular *file* called
# $dir and every later step that writes below $dir would fail.
mkdir -p $dir/log

# Copy phones.txt from tree-dir to dir. Later, steps/nnet3/decode.sh will
# use it to check compatibility between training and decoding phone-sets.
cp $treedir/phones.txt $dir

# Set some variables.
nj=`cat $treedir/num_jobs` || exit 1;  # number of jobs in alignment dir...

# Split the data directory into $nj pieces.
sdata=$data/split$nj
utils/split_data.sh $data $nj

echo $nj > $dir/num_jobs
cp $treedir/tree $dir


# Work out the feature dimension and the iVector dimension; both are passed
# to the config-generation scripts and later compared with the egs directory.
if ! feat_dim=$(feat-to-dim --print-args=false scp:$data/feats.scp -); then
  echo "$0: Error getting feature dim"
  exit 1
fi

# No online iVector directory means no iVector input (dimension 0).
if [ -n "$online_ivector_dir" ]; then
  ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1;
else
  ivector_dim=0
fi

# Stage -7: estimate the phone language model.  All alignments in the tree
# directory are unzipped, converted to phone sequences with ali-to-phones and
# fed to chain-est-phone-lm (with any user-supplied $lm_opts); the result is
# written to $dir/phone_lm.fst.
if  [ $stage -le -7 ]; then
  echo "$0: creating phone language-model"

  $cmd $dir/log/make_phone_lm.log \
    chain-est-phone-lm $lm_opts \
     "ark:gunzip -c $treedir/ali.*.gz | ali-to-phones $treedir/final.mdl ark:- ark:- |" \
     $dir/phone_lm.fst || exit 1
fi

# Stage -6: copy the transition model out of the tree directory's final.mdl
# into $dir/0.trans_mdl, then build $dir/den.fst and $dir/normalization.fst
# from the tree, that transition model and the phone LM.
if [ $stage -le -6 ]; then
  echo "$0: creating denominator FST"
  copy-transition-model $treedir/final.mdl $dir/0.trans_mdl
  $cmd $dir/log/make_den_fst.log \
    chain-make-den-fst $dir/tree $dir/0.trans_mdl $dir/phone_lm.fst \
       $dir/den.fst $dir/normalization.fst || exit 1;
fi

# work out num-leaves: take the last field of the am-info output line that
# contains the word "pdfs".  This runs regardless of $stage, so 0.trans_mdl
# must already exist when resuming from a later stage.
num_leaves=$(am-info $dir/0.trans_mdl | grep -w pdfs | awk '{print $NF}') || exit 1;
[ $num_leaves -gt 0 ] || exit 1;

# Stage -5: generate the network config files under $dir/configs and create
# the initial "raw" network $dir/init.raw.
if [ $stage -le -5 ]; then
  echo "$0: creating neural net configs";

  # If --jesus-opts was given, use the "jesus" config generator (run through
  # $cmd so its output is logged); otherwise use the plain TDNN generator.
  if [ ! -z "$jesus_opts" ]; then
    $cmd $dir/log/make_configs.log \
       python steps/nnet3/make_jesus_configs.py \
      --xent-regularize=$xent_regularize \
      --include-log-softmax=false \
      --splice-indexes "$splice_indexes"  \
      --feat-dim $feat_dim \
      --ivector-dim $ivector_dim  \
       $jesus_opts \
      --num-targets $num_leaves \
      $dir/configs || exit 1;
  else
    # The comparison below is a string comparison, so only the literal
    # value "0.0" is accepted here.
    [ $xent_regularize != "0.0" ] && \
      echo "$0: --xent-regularize option not supported by tdnn/make_configs.py." && exit 1;
    # Use ReLU layers if --relu-dim was given, p-norm layers otherwise.
    if [ ! -z "$relu_dim" ]; then
      dim_opts="--relu-dim $relu_dim"
    else
      dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim  $pnorm_output_dim"
    fi

    # NOTE(review): $pool_opts is not assigned anywhere in this script, so it
    # expands to nothing unless it comes from the environment or a sourced
    # config file -- confirm whether it is still needed.
    python steps/nnet3/tdnn/make_configs.py $pool_opts \
      --include-log-softmax=false \
      --final-layer-normalize-target $final_layer_normalize_target \
      --splice-indexes "$splice_indexes"  \
      --feat-dim $feat_dim \
      --ivector-dim $ivector_dim  \
      $dim_opts \
      --num-targets $num_leaves \
      --use-presoftmax-prior-scale false \
      $dir/configs || exit 1;
  fi

  # Initialize as "raw" nnet, prior to training the LDA-like preconditioning
  # matrix.  This first config just does any initial splicing that we do;
  # we do this as it's a convenient way to get the stats for the 'lda-like'
  # transform.
  $cmd $dir/log/nnet_init.log \
    nnet3-init --srand=-2 $dir/configs/init.config $dir/init.raw || exit 1;
fi

# Pull in the variables written by the config-generating script; this is
# expected to define
#   left_context=(something)
#   right_context=(something)
#   num_hidden_layers=(something)
. $dir/configs/vars || exit 1;

# Older config-generating scripts write left_context / right_context rather
# than model_left_context / model_right_context; fall back to those names
# when the newer ones are not set.
if [ -z $model_left_context ]; then
  model_left_context=$left_context
fi
if [ -z $model_right_context ]; then
  model_right_context=$right_context
fi

# num_hidden_layers must be a positive integer.
if ! [ "$num_hidden_layers" -gt 0 ]; then
  echo "$0: Expected num_hidden_layers to be defined"
  exit 1;
fi

# Without an explicit --transform-dir, transforms are taken from the lattice
# directory.
if [ -z "$transform_dir" ]; then
  transform_dir=$latdir
fi

# Stage -4: dump training examples into $dir/egs, unless the user supplied an
# existing examples directory via --egs-dir.
if [ $stage -le -4 ] && [ -z "$egs_dir" ]; then
  # Optional arguments are collected in an array so that values containing
  # spaces (e.g. $cmvn_opts) survive as single arguments.
  extra_opts=()
  [ ! -z "$cmvn_opts" ] && extra_opts+=(--cmvn-opts "$cmvn_opts")
  [ ! -z "$online_ivector_dir" ] && extra_opts+=(--online-ivector-dir $online_ivector_dir)
  extra_opts+=(--transform-dir $transform_dir)
  # we need a bit of extra left-context and right-context to allow for frame
  # shifts (we use shifted version of the data for more variety).
  extra_opts+=(--left-context $[$model_left_context+$frame_subsampling_factor/2+$extra_left_context])
  extra_opts+=(--right-context $[$model_right_context+$frame_subsampling_factor/2])
  echo "$0: calling get_egs.sh"
  steps/nnet3/chain/get_egs.sh $egs_opts "${extra_opts[@]}" \
      --frames-per-iter $frames_per_iter --stage $get_egs_stage \
      --cmd "$cmd" \
      --right-tolerance "$right_tolerance" \
      --left-tolerance "$left_tolerance" \
      --frames-per-eg $frames_per_eg \
      --frame-subsampling-factor $frame_subsampling_factor \
      --alignment-subsampling-factor $alignment_subsampling_factor \
      $data $dir $latdir $dir/egs || exit 1;
fi

# Default to the examples directory created by this script.
[ -z $egs_dir ] && egs_dir=$dir/egs

# The feature and iVector dimensions of the egs must match the ones the
# network configs were generated for.
if [ "$feat_dim" != "$(cat $egs_dir/info/feat_dim)" ]; then
  echo "$0: feature dimension mismatch with egs in $egs_dir: $feat_dim vs $(cat $egs_dir/info/feat_dim)";
  exit 1;
fi
if [ "$ivector_dim" != "$(cat $egs_dir/info/ivector_dim)" ]; then
  echo "$0: ivector dimension mismatch with egs in $egs_dir: $ivector_dim vs $(cat $egs_dir/info/ivector_dim)";
  exit 1;
fi

# copy any of the following that exist, to $dir.
cp $egs_dir/{cmvn_opts,splice_opts,final.mat} $dir 2>/dev/null

# confirm that the egs_dir has the necessary context (especially important if
# the --egs-dir option was used on the command line).
egs_left_context=$(cat $egs_dir/info/left_context) || exit -1
egs_right_context=$(cat $egs_dir/info/right_context) || exit -1
( [ $egs_left_context -lt $model_left_context ] || \
  [ $egs_right_context -lt $model_right_context ] ) && \
   echo "$0: egs in $egs_dir have too little context" && exit -1;

# Note: frames_per_eg is overwritten here with the value stored in the egs
# directory, so the egs directory wins over the command-line option.
frames_per_eg=$(cat $egs_dir/info/frames_per_eg) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; }
num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/num_archives"; exit 1; }

# Each archive is used once per frame shift, so the effective number of
# archives is multiplied by the frame-subsampling factor.
num_archives_expanded=$[$num_archives*$frame_subsampling_factor]

[ $num_jobs_initial -gt $num_jobs_final ] && \
  echo "$0: --num-jobs-initial cannot exceed --num-jobs-final" && exit 1;

[ $num_jobs_final -gt $num_archives_expanded ] && \
  echo "$0: --num-jobs-final cannot exceed #archives $num_archives_expanded." && exit 1;

# Stage -3: estimate the LDA-like preconditioning matrix for the input
# features and link it into the configs directory.
if [ $stage -le -3 ]; then
  echo "$0: getting preconditioning matrix for input features."
  # Use one job per archive, capped at $max_lda_jobs.
  num_lda_jobs=$num_archives
  [ $num_lda_jobs -gt $max_lda_jobs ] && num_lda_jobs=$max_lda_jobs

  # Write stats with the same format as stats for LDA.
  $cmd JOB=1:$num_lda_jobs $dir/log/get_lda_stats.JOB.log \
      nnet3-chain-acc-lda-stats --rand-prune=$rand_prune \
         $dir/init.raw "ark:$egs_dir/cegs.JOB.ark" $dir/JOB.lda_stats || exit 1;

  # Sum the per-job statistics into $dir/lda_stats, then delete the per-job
  # files.
  all_lda_accs=$(for n in $(seq $num_lda_jobs); do echo $dir/$n.lda_stats; done)
  $cmd $dir/log/sum_transform_stats.log \
    sum-lda-accs $dir/lda_stats $all_lda_accs || exit 1;

  rm $all_lda_accs || exit 1;

  # this computes a fixed affine transform computed in the way we described in
  # Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled variant
  # of an LDA transform but without dimensionality reduction.
  $cmd $dir/log/get_transform.log \
     nnet-get-feature-transform $lda_opts $dir/lda.mat $dir/lda_stats || exit 1;

  # Relative symlink so that configs/lda.mat resolves to $dir/lda.mat.
  ln -sf ../lda.mat $dir/configs/lda.mat
fi

# Stage -1: build the iteration-0 model ($dir/0.raw and $dir/0.mdl).
if [ $stage -le -1 ]; then
  # Add the first layer; this will add in the lda.mat and
  # presoftmax_prior_scale.vec.

  echo "$0: creating initial raw model"
  $cmd $dir/log/add_first_layer.log \
       nnet3-init --srand=-1 $dir/init.raw $dir/configs/layer1.config $dir/0.raw || exit 1;


  # The 'chain' acoustic model 0.mdl is created by nnet3-am-init from the
  # transition model (0.trans_mdl) and the raw nnet (0.raw) produced above.

  echo "$0: creating initial model"
  $cmd $dir/log/init_model.log \
    nnet3-am-init $dir/0.trans_mdl $dir/0.raw $dir/0.mdl || exit 1;
fi

# Record the frame-subsampling factor in the experiment directory.
echo $frame_subsampling_factor >$dir/frame_subsampling_factor || exit 1;

# set num_iters so that as close as possible, we process the data $num_epochs
# times, i.e. $num_iters*$avg_num_jobs) == $num_epochs*$num_archives_expanded
# where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2.

num_archives_to_process=$[$num_epochs*$num_archives_expanded]
num_archives_processed=0
num_iters=$[($num_archives_to_process*2)/($num_jobs_initial+$num_jobs_final)]

# Iteration by which all hidden layers have been added.  This must be
# computed *before* the check below: with the variable still unset the
# arithmetic would silently evaluate to 2 and the check would be meaningless.
finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period]

# Require at least a few iterations after the last layer has been added.
! [ $num_iters -gt $[$finish_add_layers_iter+2] ] \
  && echo "$0: Insufficient epochs" && exit 1

echo "$0: Will train for $num_epochs epochs = $num_iters iterations"

# Choose the queue options and program options for GPU vs. CPU training.
if $use_gpu; then
  parallel_suffix=""
  train_queue_opt="--gpu 1"
  combine_queue_opt="--gpu 1"
  prior_gpu_opt="--use-gpu=yes"
  prior_queue_opt="--gpu 1"
  parallel_train_opts=
  # Refuse to continue with --use-gpu true if the binaries were built
  # without CUDA support.
  if ! cuda-compiled; then
    echo "$0: WARNING: you are running with one thread but you have not compiled"
    echo "   for CUDA.  You may be running a setup optimized for GPUs.  If you have"
    echo "   GPUs and have nvcc installed, go to src/ and do ./configure; make"
    exit 1
  fi
else
  echo "$0: without using a GPU this will be very slow.  nnet3 does not yet support multiple threads."
  parallel_train_opts="--use-gpu=no"
  # num_threads has no default in the configuration section of this script;
  # fall back to 1 so that "--num-threads" never ends up without a value
  # (which would make it swallow the next argument, the log file name).
  train_queue_opt="--num-threads ${num_threads:-1}"
  combine_queue_opt=""  # the combine stage will be quite slow if not using
                        # GPU, as we didn't enable that program to use
                        # multiple threads.
  prior_gpu_opt="--use-gpu=no"
  prior_queue_opt=""
fi


# Work out how many iterations to combine over in the final combination
# stage (the combination may subsample from these if the number exceeds
# max_models_combine).  The number we use is:
#   min(max(max_models_combine, approx_iters_per_epoch_final),
#       1/2 * iters_after_last_layer_added)
approx_iters_per_epoch_final=$(($num_archives_expanded/$num_jobs_final))
half_iters_after_add_layers=$((($num_iters-$finish_add_layers_iter)/2))

num_iters_combine=$max_models_combine
# Lower bound: roughly one epoch's worth of iterations at the final job count.
[ $num_iters_combine -lt $approx_iters_per_epoch_final ] && \
  num_iters_combine=$approx_iters_per_epoch_final
# Upper bound: half of the iterations after the last layer was added.
[ $num_iters_combine -gt $half_iters_after_add_layers ] && \
  num_iters_combine=$half_iters_after_add_layers

# Index of the first model that takes part in the final combination.
first_model_combine=$(($num_iters-$num_iters_combine+1))

# Iteration counter for the main training loop.
x=0

# Optional derivative-truncation options passed on to the training program.
deriv_time_opts=
if [ ! -z "$left_deriv_truncate" ]; then
  deriv_time_opts="--optimization.min-deriv-time=$left_deriv_truncate"
fi
if [ ! -z "$right_deriv_truncate" ]; then
  deriv_time_opts="$deriv_time_opts --optimization.max-deriv-time=$((frames_per_eg - right_deriv_truncate))"
fi

# Main training loop: one pass per iteration x = 0 .. num_iters-1.  Each pass
# launches diagnostics in the background, trains $this_num_jobs parallel jobs
# starting from $dir/$x.mdl and merges their outputs into $dir/$[x+1].mdl.
# Iterations below $stage are skipped, but the job-count / archive-count
# bookkeeping is still advanced so that a resumed run uses the same schedule.
while [ $x -lt $num_iters ]; do
  [ $x -eq $exit_stage ] && echo "$0: Exiting early due to --exit-stage $exit_stage" && exit 0;

  # The number of parallel jobs is interpolated linearly from
  # num_jobs_initial to num_jobs_final over the course of training.
  this_num_jobs=$(perl -e "print int(0.5+$num_jobs_initial+($num_jobs_final-$num_jobs_initial)*$x/$num_iters);")

  # The effective learning rate decays exponentially with the number of
  # archives processed (and is pinned to the final value on the last
  # iteration); it is multiplied by the number of jobs.
  ilr=$initial_effective_lrate; flr=$final_effective_lrate; np=$num_archives_processed; nt=$num_archives_to_process;
  this_learning_rate=$(perl -e "print (($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt))*$this_num_jobs);");

  echo "On iteration $x, learning rate is $this_learning_rate."


  if [ $x -ge 0 ] && [ $stage -le $x ]; then

    # Set off jobs doing some diagnostics, in the background.
    # Use the egs dir from the previous iteration for the diagnostics
    $cmd $dir/log/compute_prob_valid.$x.log \
      nnet3-chain-compute-prob --l2-regularize=$l2_regularize --leaky-hmm-coefficient=$leaky_hmm_coefficient --xent-regularize=$xent_regularize \
          "nnet3-am-copy --raw=true $dir/$x.mdl -|" $dir/den.fst \
          "ark,bg:nnet3-chain-merge-egs ark:$egs_dir/valid_diagnostic.cegs ark:- |" &
    $cmd $dir/log/compute_prob_train.$x.log \
      nnet3-chain-compute-prob --l2-regularize=$l2_regularize --leaky-hmm-coefficient=$leaky_hmm_coefficient  --xent-regularize=$xent_regularize \
          "nnet3-am-copy --raw=true $dir/$x.mdl -|" $dir/den.fst \
          "ark,bg:nnet3-chain-merge-egs ark:$egs_dir/train_diagnostic.cegs ark:- |" &

    if [ $x -gt 0 ]; then
      # This doesn't use the egs, it only shows the relative change in model parameters.
      $cmd $dir/log/progress.$x.log \
        nnet3-show-progress --use-gpu=no "nnet3-am-copy --raw=true $dir/$[$x-1].mdl - |" \
                  "nnet3-am-copy --raw=true $dir/$x.mdl - |" '&&' \
        nnet3-am-info $dir/$x.mdl &
    fi

    echo "Training neural net (pass $x)"

    # Every $add_layers_period iterations, until all hidden layers are in
    # place, a new layer is added by applying the next layerN.config.
    if [ $x -gt 0 ] && \
      [ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \
      [ $[$x%$add_layers_period] -eq 0 ]; then
      do_average=false # if we've just mixed up, don't do averaging but take the
                       # best.
      cur_num_hidden_layers=$[1+$x/$add_layers_period]
      config=$dir/configs/layer$cur_num_hidden_layers.config
      mdl="nnet3-am-copy --raw=true --learning-rate=$this_learning_rate $dir/$x.mdl - | nnet3-init --srand=$x - $config - |"
      cache_io_opts=""
    else
      do_average=true
      if [ $x -eq 0 ]; then do_average=false; fi # on iteration 0, pick the best, don't average.
      mdl="nnet3-am-copy --raw=true --learning-rate=$this_learning_rate $dir/$x.mdl -|"
      cache_io_opts="--read-cache=$dir/cache.$x"
    fi
    if $do_average; then
      this_minibatch_size=$minibatch_size
      this_max_param_change=$max_param_change
    else
      # on iteration zero or when we just added a layer, use a smaller minibatch
      # size (and we will later choose the output of just one of the jobs): the
      # model-averaging isn't always helpful when the model is changing too fast
      # (i.e. it can worsen the objective function), and the smaller minibatch
      # size will help to keep the update stable.
      this_minibatch_size=$[$minibatch_size/2];
      this_max_param_change=$(perl -e "print ($max_param_change/sqrt(2));")
    fi

    rm $dir/.error 2>/dev/null


    (
      trap 'for pid in $(jobs -pr); do kill -TERM $pid; done' INT QUIT TERM
      # this sub-shell is so that when we "wait" below,
      # we only wait for the training jobs that we just spawned,
      # not the diagnostic jobs that we spawned above.

      # We can't easily use a single parallel SGE job to do the main training,
      # because the computation of which archive and which --frame option
      # to use for each job is a little complex, so we spawn each one separately.
      for n in $(seq $this_num_jobs); do
        k=$[$num_archives_processed + $n - 1]; # k is a zero-based index that we'll derive
                                               # the other indexes from.
        archive=$[($k%$num_archives)+1]; # work out the 1-based archive index.
        frame_shift=$[($k/$num_archives)%$frame_subsampling_factor];
        if [ $n -eq 1 ]; then
          # opts for computation cache (storing compiled computation).
          this_cache_io_opts="$cache_io_opts --write-cache=$dir/cache.$[$x+1]"
        else
          this_cache_io_opts="$cache_io_opts"
        fi
        $cmd $train_queue_opt $dir/log/train.$x.$n.log \
          nnet3-chain-train --apply-deriv-weights=$apply_deriv_weights \
             --l2-regularize=$l2_regularize --leaky-hmm-coefficient=$leaky_hmm_coefficient --xent-regularize=$xent_regularize \
              $this_cache_io_opts $parallel_train_opts $deriv_time_opts \
             --max-param-change=$this_max_param_change \
            --print-interval=10 "$mdl" $dir/den.fst \
          "ark,bg:nnet3-chain-copy-egs --frame-shift=$frame_shift ark:$egs_dir/cegs.$archive.ark ark:- | nnet3-chain-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-chain-merge-egs --minibatch-size=$this_minibatch_size ark:- ark:- |" \
          $dir/$[$x+1].$n.raw || touch $dir/.error &
      done
      wait
    )
    # the error message below is not that informative, but $cmd will
    # have printed a more specific one.
    [ -f $dir/.error ] && echo "$0: error on iteration $x of training" && exit 1;

    # Collect the raw models of the jobs that get_successful_models.py
    # reports as usable.
    models_to_average=$(steps/nnet3/get_successful_models.py --difference-threshold 0.1 $this_num_jobs $dir/log/train.$x.%.log)
    nnets_list=
    for n in $models_to_average; do
      nnets_list="$nnets_list $dir/$[$x+1].$n.raw"
    done

    if $do_average; then
      # average the output of the different jobs.
      $cmd $dir/log/average.$x.log \
        nnet3-average $nnets_list - \| \
        nnet3-am-copy --set-raw-nnet=- $dir/$x.mdl $dir/$[$x+1].mdl || exit 1;
    else
      # choose the best from the different jobs: the one whose training log
      # reports the highest (last) log-prob-per-frame.  The job count passed
      # to perl must be $this_num_jobs; an unset variable here would shift the
      # arguments and make the selection always return job 1.
      n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) {
          $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn";
          undef $logprob; while (<F>) { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } }
          close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob;
          $best_n=$n; } } print "$best_n\n"; ' $this_num_jobs $dir/log/train.$x.%d.log) || exit 1;
      [ -z "$n" ] && echo "Error getting best model" && exit 1;
      $cmd $dir/log/select.$x.log \
        nnet3-am-copy --set-raw-nnet=$dir/$[$x+1].$n.raw  $dir/$x.mdl $dir/$[$x+1].mdl || exit 1;
    fi

    rm $nnets_list
    [ ! -f $dir/$[$x+1].mdl ] && exit 1;
    # Remove the model from two iterations back, unless it is a multiple of
    # 10 or belongs to the range used for the final combination.
    if [ -f $dir/$[$x-1].mdl ] && $cleanup && \
       [ $[($x-1)%10] -ne 0  ] && [ $[$x-1] -lt $first_model_combine ]; then
      rm $dir/$[$x-1].mdl
    fi
  fi
  rm $dir/cache.$x 2>/dev/null
  x=$[$x+1]
  num_archives_processed=$[$num_archives_processed+$this_num_jobs]
done


# Final stage: combine the last $num_iters_combine models into final.mdl and
# launch diagnostics on the combined model.
if [ $stage -le $num_iters ]; then
  echo "Doing final combination to produce final.mdl"

  # Now do combination.  In the nnet3 setup, the logic
  # for doing averaging of subsets of the models in the case where
  # there are too many models to reliably estimate interpolation
  # factors (max_models_combine) is moved into the nnet3-combine
  nnets_list=()
  for n in $(seq 0 $[num_iters_combine-1]); do
    iter=$[$first_model_combine+$n]
    # Report the file that is actually missing (not the previous command
    # string held in $mdl).
    [ ! -f $dir/$iter.mdl ] && echo "Expected $dir/$iter.mdl to exist" && exit 1;
    mdl="nnet3-am-copy --raw=true $dir/$iter.mdl - |"
    nnets_list[$n]="$mdl";
  done

  # The queue options for the combination job come from $combine_queue_opt,
  # which was set according to --use-gpu earlier in the script.

  $cmd $combine_queue_opt $dir/log/combine.log \
    nnet3-chain-combine --num-iters=40  --l2-regularize=$l2_regularize --leaky-hmm-coefficient=$leaky_hmm_coefficient \
       --enforce-sum-to-one=true --enforce-positive-weights=true \
       --verbose=3 $dir/den.fst "${nnets_list[@]}" "ark,bg:nnet3-chain-merge-egs --minibatch-size=$minibatch_size ark:$egs_dir/combine.cegs ark:-|" \
       "|nnet3-am-copy --set-raw-nnet=- $dir/$first_model_combine.mdl $dir/final.mdl" || exit 1;


  # Compute the probability of the final, combined model with
  # the same subset we used for the previous compute_probs, as the
  # different subsets will lead to different probs.
  # Both jobs are started in the background.
  $cmd $dir/log/compute_prob_valid.final.log \
    nnet3-chain-compute-prob --l2-regularize=$l2_regularize --leaky-hmm-coefficient=$leaky_hmm_coefficient --xent-regularize=$xent_regularize \
           "nnet3-am-copy --raw=true $dir/final.mdl - |" $dir/den.fst \
    "ark,bg:nnet3-chain-merge-egs ark:$egs_dir/valid_diagnostic.cegs ark:- |" &
  $cmd $dir/log/compute_prob_train.final.log \
    nnet3-chain-compute-prob --l2-regularize=$l2_regularize --leaky-hmm-coefficient=$leaky_hmm_coefficient --xent-regularize=$xent_regularize \
      "nnet3-am-copy --raw=true $dir/final.mdl - |" $dir/den.fst \
    "ark,bg:nnet3-chain-merge-egs ark:$egs_dir/train_diagnostic.cegs ark:- |" &
fi

# Training is only considered successful if final.mdl was produced.
if [ ! -f $dir/final.mdl ]; then
  echo "$0: $dir/final.mdl does not exist."
  # we don't want to clean up if the training didn't succeed.
  exit 1;
fi

# NOTE(review): the final diagnostic jobs above were started in the
# background and are not waited for; presumably this short pause is meant to
# let them open their inputs before cleanup may remove the egs -- confirm.
sleep 2

echo Done

if $cleanup; then
  echo Cleaning up data
  # Only remove egs that live under this experiment directory.
  # NOTE(review): '=~' is an unanchored regex match, so "egs*" means "eg"
  # followed by zero or more "s" rather than a glob on the prefix.
  if $remove_egs && [[ $egs_dir =~ $dir/egs* ]]; then
    steps/nnet2/remove_egs.sh $egs_dir
  fi

  echo Removing most of the models
  for x in `seq 0 $num_iters`; do
    if [ $[$x%100] -ne 0 ] && [ $x -ne $num_iters ] && [ -f $dir/$x.mdl ]; then
       # delete all but every 100th model; don't delete the ones which combine to form the final model.
      rm $dir/$x.mdl
    fi
  done
fi

# Print a one-line summary of the experiment directory.
steps/info/chain_dir_info.pl $dir

exit 0


================================================
FILE: egs/steps/nnet3/chain2/combine_egs.sh
================================================
#!/bin/bash
#
# Copyright 2020 Srikanth Madikeri (Idiap Research Institute)
# Apache 2.0
#
# This script combines egs folder generated with chain2 recipes to prepare a single egs folder
# for multilingual training

echo "$0 $@"  # Print the command line for logging
. ./cmd.sh
set -e

# Begin configuration section
cmd=
block_size=256
stage=0
frames_per_job=1500000  
left_context=13
right_context=9
# TODO: add lang2weight support
lang2weight=            # array of weights one per input language to scale example's output
                        # w.r.t its input language during training.
lang_list=

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

[[ -f local.conf ]] && . local.conf

if [ $# -lt 3 ]; then
  cat <<EOF
  This script generates examples for multilingual LF-MMI training.
  The input egs directories are generated with chain2 get_egs scripts.

  Usage: $0 [opts] <num-input-langs,N> <lang1-egs-dir> ...<langN-egs-dir> <multilingual-egs-dir>
   e.g.: $0 [opts] 2 exp/lang1/egs exp/lang2/egs exp/multi/egs

  Options:
      --cmd (utils/run.pl|utils/queue.pl <queue opts>)  # how to run jobs.
EOF
  exit 1;
fi

num_langs=$1
if [ $# != $[$num_langs+2] ]; then
  echo "$0: num of input example dirs provided is not compatible with num_langs $num_langs."
  echo "Usage:$0 [opts] <num-input-langs,N> <lang1-egs-dir> ...<langN-egs-dir> <multilingual-egs-dir>"
  echo "Usage:$0 [opts] 2 exp/lang1/egs exp/lang2/egs exp/multi/egs"
  exit 1;
fi
megs_dir=${@: -1} # multilingual directory
mkdir -p $megs_dir
shift 1
args=("$@")

# Files every input egs directory must provide.
required="info.txt train.scp train_subset.scp heldout_subset.scp"
train_scp_list=
train_diagnostic_scp_list=
valid_diagnostic_scp_list=
combine_scp_list=

# Parameters that must agree across all languages.
# we don't copy lang because there wont be a single lang
check_params="feat_dim left_context right_context left_context_initial right_context_final"
# info.txt only carries an ivector_dim line when i-vectors were used, so the
# value may be empty; guard the numeric test instead of letting `[` fail with
# "unary operator expected".  '-m 1' keeps a single value even if the key is
# listed more than once.
ivec_dim=$(fgrep -m 1 ivector_dim ${args[0]}/info.txt | awk '{print $2}')
if [ -n "$ivec_dim" ] && [ "$ivec_dim" -ne 0 ]; then
  check_params="$check_params ivector_dim final.ie.id"
fi

# Start the multilingual info.txt from the first language's values.
echo "dir_type randomized_chain_egs" > $megs_dir/info.txt
# frames_per_chunk is not included in check_params because we allow different
# values for different languages
# NOTE: the awk pattern is a prefix match, so e.g. 'left_context' also copies
# a 'left_context_initial' line and 'frames_per_chunk' also copies a
# 'frames_per_chunk_avg' line when the first language's info.txt has them.
for param in $check_params frames_per_chunk; do
    awk "/^$param/" ${args[0]}/info.txt
done >> $megs_dir/info.txt
echo "langs ${lang_list[@]}" >> $megs_dir/info.txt

# Validate every input egs directory and work out how many scp files the
# combined directory will be split into.
tot_num_archives=0
for lang in $(seq 0 $[$num_langs-1]);do
  multi_egs_dir[$lang]=${args[$lang]}
  for f in $required; do
    if [ ! -f ${multi_egs_dir[$lang]}/$f ]; then
      echo "$0: no such file ${multi_egs_dir[$lang]}/$f" && exit 1;
    fi
  done
  num_chunks=$(fgrep num_chunks ${multi_egs_dir[$lang]}/info.txt | awk '{print $2}')
  curr_frames_per_chunk_avg=`awk '/^frames_per_chunk_avg/  {print $2;}' ${multi_egs_dir[$lang]}/info.txt`
  # Fail with a clear message rather than a bash arithmetic syntax error when
  # either value is missing from info.txt.
  if [ -z "$num_chunks" ] || [ -z "$curr_frames_per_chunk_avg" ]; then
    echo "$0: could not read num_chunks and/or frames_per_chunk_avg from ${multi_egs_dir[$lang]}/info.txt"
    exit 1;
  fi
  # Each language contributes (total frames / frames_per_job) + 1 archives.
  tot_num_archives=$[tot_num_archives+((num_chunks*curr_frames_per_chunk_avg)/frames_per_job+1)]
  train_diagnostic_scp_list="$train_diagnostic_scp_list ${args[$lang]}/train_subset.scp"
  # The held-out diagnostic subset is named heldout_subset.scp (see $required).
  valid_diagnostic_scp_list="$valid_diagnostic_scp_list ${args[$lang]}/heldout_subset.scp"
  # Every parameter in $check_params must match the value already written to
  # the multilingual info.txt (taken from the first language).
  for f in $check_params; do
    if [ `grep -c "^$f" ${multi_egs_dir[$lang]}/info.txt` -ge 1 ]; then
      f1=$(fgrep -m 1 $f $megs_dir/info.txt | awk '{print $2}')
      f2=$(fgrep -m 1 $f ${multi_egs_dir[$lang]}/info.txt | awk '{print $2}')
      if [ "$f1" != "$f2" ]  ; then
        echo "$0: mismatch for $f in $megs_dir vs. ${multi_egs_dir[$lang]}($f1 vs. $f2)."
        exit 1;
      fi
    else
      # A missing parameter is only reported, not treated as fatal.
      echo "$0: parameter $f does not exist in $megs_dir or ${multi_egs_dir[$lang]}/$f ."
    fi
  done
done
num_scp_files=$tot_num_archives
echo "num_scp_files $num_scp_files" >> $megs_dir/info.txt
# Build a sed program with one substitution per language: any key containing
# "lang=<name>" is replaced by that language's weight, taken from the matching
# comma-separated field of $lang2weight.
sed_cmd=
for lang in $(seq 0 $((num_langs - 1))); do
    lang_name=${lang_list[$lang]}
    weight_field=$((lang + 1))
    weight=$(echo $lang2weight | tr ',' ' ' | cut -d ' ' -f$weight_field)
    sed_cmd+=" s/.*lang=${lang_name}.*/$weight/;"
done

# All outputs below are written relative to the multilingual egs directory.
dir=$megs_dir/
# Stage 0: build $num_scp_files training scp files that mix all languages,
# plus a per-example weight archive for each of them.
if [ $stage -le 0 ]; then
    echo "$0: Creating $num_scp_files scp files."
    for lang in $(seq 0 $[$num_langs-1]);do
        lang_name=${lang_list[$lang]}
        mkdir -p $dir/temp_${lang_name}/
        # randomize, append language name as a query and split input scp into
        # $num_scp_files blocks.
        # index() is used instead of the regex /?/: a leading '?' in an
        # extended regular expression is undefined by POSIX, so awk
        # implementations may reject it or interpret it differently.
        utils/shuffle_list.pl ${args[$lang]}/train.scp | \
            awk -v lang_name="$lang_name" \
                '{if (index($1, "?") == 0){$1=$1"?lang=" lang_name; print;} else {$1=$1"&lang=" lang_name; print;}}' > $dir/temp_${lang_name}/train.shuffled.scp
        utils/split_scp.pl $dir/temp_${lang_name}/train.shuffled.scp \
            $(for i in $(seq $num_scp_files); do echo $dir/temp_${lang_name}/train.$i.scp; done) || exit 1
        # split each block into sub-blocks
        for i in `seq $num_scp_files`; do
            utils/split_scp.pl <(utils/shuffle_list.pl $dir/temp_${lang_name}/train.$i.scp) \
                $(for j in $(seq $num_scp_files); do echo $dir/temp_${lang_name}/train.$i.$j.scp; done)
        done
    done

    for j in `seq $num_scp_files`; do
        # Gather the j-th sub-block of every block of every language.
        input_list=$(for lang in $(seq 0 $[$num_langs-1]);do lang_name=${lang_list[$lang]}; echo $dir/temp_${lang_name}/train.*.$j.scp; done)
        # the shuffling is probably not required because we will do it once again before
        # merging examples
        cat $input_list | utils/shuffle_list.pl > $dir/train.$j.scp
        # Weight archive: column 1 is the example key, column 2 the weight that
        # $sed_cmd derives from the key's "lang=" query.
        sed "$sed_cmd" < <(awk '{print $1}' $dir/train.$j.scp) > $dir/train.weight.$j.ark.col2
        paste -d ' ' <(awk '{print $1}' $dir/train.$j.scp) $dir/train.weight.$j.ark.col2 > $dir/train.weight.$j.ark
        rm $dir/train.weight.$j.ark.col2
    done
fi

# Stage 1: concatenate the diagnostic subsets of all languages, tagging every
# key with its language, and write the matching weight archives.
if [ $stage -le 1 ]; then
    for subset_file  in train_subset heldout_subset; do
        for lang in $(seq 0 $[$num_langs-1]);do
            lang_name=${lang_list[$lang]}
            # index() is used instead of the regex /?/: a leading '?' in an
            # extended regular expression is undefined by POSIX, so awk
            # implementations may reject it or interpret it differently.
            cat ${args[$lang]}/${subset_file}.scp  | \
            awk -v lang_name="$lang_name" \
                '{if (index($1, "?") == 0){$1=$1"?lang=" lang_name; print;} else {$1=$1"&lang=" lang_name; print;}}'
        done > $dir/${subset_file}.scp
        # Column 1 is the example key, column 2 the weight that $sed_cmd
        # derives from the key's "lang=" query.
        sed "$sed_cmd" < <(awk '{print $1}' $dir/${subset_file}.scp) > $dir/${subset_file}.weight.ark.col2
        paste -d ' ' <(awk '{print $1}' $dir/${subset_file}.scp) $dir/${subset_file}.weight.ark.col2 > $dir/${subset_file}.weight.ark
        rm $dir/${subset_file}.weight.ark.col2
    done
fi

# Stage 2: remove the per-language scratch directories created in stage 0.
if [ $stage -le 2 ]; then
    echo "$0: Clean up"
    for lang in $(seq 0 $((num_langs - 1))); do
        lang_name=${lang_list[$lang]}
        tmp_lang_dir=$dir/temp_${lang_name}/
        rm -r $tmp_lang_dir
    done
fi

echo "$0: Finished preparing multilingual training example."


================================================
FILE: egs/steps/nnet3/chain2/compute_preconditioning_matrix.sh
================================================
#!/bin/bash

# Copyright 2019 Idiap Research Institute (Author: Srikanth Madikeri).  Apache 2.0.

# Configuration defaults.  They must be declared before parse_options.sh is
# sourced so that matching --option values on the command line can replace them.
rand_prune=4.0        # passed as --rand-prune to nnet3-chain-acc-lda-stats
nj=8                  # number of parallel jobs, i.e. train.1.scp .. train.$nj.scp are read
cmd=run.pl
lda_acc_opts=         # extra options for nnet3-chain-acc-lda-stats
lda_transform_opts=   # extra options for nnet-get-feature-transform
lda_sum_opts=         # extra options for sum-lda-accs
egs_opts=             # extra options for nnet3-chain-copy-egs
stage=0
use_scp=true          # true: read $egs/train.JOB.scp; false: read $egs/train.JOB.ark

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# Exactly three positional arguments are required.
if [ $# -ne 3 ]; then
    echo "Usage: $0 [opts] <model> <egs-folder> <lda-output-folder>" 
    echo "e.g. $0 exp/chain/tdnn1a_sp/configs/init.raw exp/chain/tdnn1a_sp/egs/ exp/chain/tdnn1a_sp"
    echo ""
    echo "This script computes pre-conditioning matrix given the model (usually init.raw file from the config folder),"
    echo "egs-folder which has train.*.scp files to be used to train LDA, and"
    echo "lda-output-folder that will contain lda.mat file."
    echo ""
    echo "Main options (for others, see top of script file)"
    echo "  --cmd (utils/run.pl;utils/queue.pl <queue opts>) # how to run jobs."
    echo "  --nj <int;8> # number of jobs. this is also the number of train.*.scp files in egs/"
    echo "  --lda-acc-opts # options to be passed to nnet3-chain-acc-lda-stats"
    echo "  --lda-sum-opts # options to be passed to sum-lda-accs"
    echo "  --lda-transform-opts # options to be passed to nnet-get-feature-transform"
    exit 1;
fi

model=$1
egs=$2
ldafolder=$3

# Create the output folder on demand.
if [ ! -d $ldafolder ]; then
    echo "Creating $ldafolder"
    mkdir -p $ldafolder || exit 1
fi


# Stage 0: accumulate LDA statistics, one job per train.JOB.{scp,ark}.
if [ $stage -le 0 ]; then
    # Examples are piped through nnet3-chain-copy-egs; only the input
    # specifier differs between the scp and the ark case.
    if $use_scp; then
        egs_input="scp:$egs/train.JOB.scp"
    else
        egs_input="ark:$egs/train.JOB.ark"
    fi
    egs_rspecifier="ark:nnet3-chain-copy-egs $egs_opts $egs_input ark:- |"

    echo "$0: Accumulating LDA stats"
    $cmd JOB=1:$nj $ldafolder/log/acc.JOB.log \
        nnet3-chain-acc-lda-stats $lda_acc_opts --rand-prune=${rand_prune} \
        $model "${egs_rspecifier}" \
        $ldafolder/JOB.lda_stats || exit 1
fi

# Stage 1: sum the per-job statistics into a single file and drop the parts.
if [ $stage -le 1 ]; then
    echo "$0: Summing LDA stats"
    lda_stats_files=
    for i in $(seq 1 $nj); do
        lda_stats_files+=" $ldafolder/$i.lda_stats"
    done

    $cmd $ldafolder/log/sum_transform_stats.log \
        sum-lda-accs $lda_sum_opts $ldafolder/lda_stats $lda_stats_files || exit 1
    rm $lda_stats_files
fi

# Stage 2: turn the summed statistics into lda.mat and expose it to the
# network configs through a relative symlink.
if [ $stage -le 2 ]; then
    echo "$0: Computing LDA transform"
    $cmd $ldafolder/log/get_transform.log \
        nnet-get-feature-transform $lda_transform_opts \
        $ldafolder/lda.mat $ldafolder/lda_stats || exit 1

    rm $ldafolder/lda_stats
    # lda.mat is in $ldafolder, i.e. one up from $ldafolder/configs.
    # Make sure the link's directory exists and stop on failure, so that a
    # missing link is not reported as success.
    mkdir -p $ldafolder/configs || exit 1
    ln -sf ../lda.mat $ldafolder/configs/lda.mat || exit 1
fi

echo "$0: Finished computing LDA transform"
exit 0;


================================================
FILE: egs/steps/nnet3/chain2/get_raw_egs.sh
================================================
#!/bin/bash

# Copyright   2019  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
# Copyright   2019  Idiap Research Institute (Author: Srikanth Madikeri).  Apache 2.0.
#
# This script dumps 'raw' egs for 'chain' training.  What 'raw' means in this
# context is that they need to be further processed to merge egs of the same
# speaker, etc.  So they won't be directly consumed by training, but
# by the script process_egs.sh.


# Begin configuration section.
# These defaults must be declared before parse_options.sh is sourced so that
# matching --option values on the command line can replace them.
cmd=run.pl
frames_per_chunk=150  # Number of frames (at feature frame rate) per example.  You
                      # are allowed to make this a comma-separated list,
                      # e.g. 150,110,100, meaning that a range of eg widths are
                      # allowed (but this may not be as helpful when using our
                      # adaptation framework, since it will tend to split up
                      # utterances into separate minibatches).

frame_subsampling_factor=3 # frames-per-second of features we train on divided
                           # by frames-per-second at output of chain model
alignment_subsampling_factor=3 # frames-per-second of input alignments divided
                               # by frames-per-second at output of chain model
constrained=true  # 'constrained=true' is the traditional setup; 'constrained=false'
                  # gives you the 'unconstrained' egs creation in which the time
                  # boundaries are not enforced inside chunks.
left_context=0    # amount of left-context per eg (i.e. extra frames of input
                  # features not present in the output supervision).  Would
                  # normally depend on the model context, plus desired 'extra'
                  # context (e.g. for LSTM).
right_context=0   # amount of right-context per eg.

left_context_initial=-1   # if >=0, left-context for first chunk of an utterance.
right_context_final=-1     # if >=0, right-context for last chunk of an utterance.

compress=true   # set this to false to disable compression (e.g. if you want to
                # see whether results are affected).  Note: if the features on
                # disk were originally compressed, nnet3-chain-get-egs will dump
                # compressed features regardless (since there is no further loss
                # in that case).

lang=default   # the language name.  will usually be 'default' in single-language
               # setups.  Required because it's part of the name of some of
               # the input files.

right_tolerance=  # chain right tolerance == max label delay.  Only relevant if
                  # constrained=true.  At frame rate of alignments.  Code
                  # default is 5.
left_tolerance=   # chain left tolerance (versus alignments from lattices).
                  # Only relevant if constrained=true.  At frame rate of
                  # alignments.  Code default is 5.

stage=0
max_jobs_run=40         # This should be set to the maximum number of
                        # nnet3-chain-get-egs jobs you are comfortable to run in
                        # parallel; you can increase it if your disk speed is
                        # greater and you have more machines.


srand=0         # rand seed for nnet3-chain-get-egs, nnet3-chain-copy-egs and nnet3-chain-shuffle-egs
online_ivector_dir=  # can be used if we are including speaker information as iVectors.
cmvn_opts=  # can be used for specifying CMVN options, if feature type is not lda (if lda,
            # it doesn't make sense to use different options than were used as input to the
            # LDA transform).  This is used to turn off CMVN in the online-nnet experiments.

lattice_lm_scale=     # If supplied, the graph/lm weight of the lattices will be
                      # used (with this scale) in generating supervisions
                      # This is 0 by default for conventional supervised training,
                      # but may be close to 1 for the unsupervised part of the data
                      # in semi-supervised training. The optimum is usually
                      # 0.5 for unsupervised data.
lattice_prune_beam=        # If supplied, the lattices will be pruned to this beam,
                           # before being used to get supervisions.

acwt=0.1   # For pruning.  Should be, for instance, 1.0 for chain lattices.
deriv_weights_scp=  # If supplied, passed to nnet3-chain-get-egs as
                    # --deriv-weights-rspecifier=scp:<this file>.

# end configuration section

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;


# Exactly four positional arguments are required; otherwise print the usage
# message and exit with status 1.  The usage text is kept in sync with the
# defaults and input paths actually used by this script.
if [ $# != 4 ]; then
  echo "Usage: $0 [opts] <data> <chain-dir> <lattice-dir> <raw-egs-dir>"
  echo " e.g.: $0 data/train exp/chain/tdnn1a_sp exp/tri3_lats exp/chain/tdnn1a_sp/raw_egs"
  echo ""
  echo "From <chain-dir>, init/<lang>_trans.mdl (for the transition-model), <lang>.tree (the tree), "
  echo "   den_fsts/<lang>.den.fst, and den_fsts/<lang>.normalization.fst (the normalization "
  echo "   FST, derived from the denominator FST) are read (where <lang> is specified"
  echo "   by the --lang option; its default value is 'default')"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options (alternative to this"
  echo "                                                   # command line)"
  echo "  --max-jobs-run <max-jobs-run>                    # The maximum number of jobs you want to run in"
  echo "                                                   # parallel (increase this only if you have good disk and"
  echo "                                                   # network speed).  default=40"
  echo "  --cmd (utils/run.pl;utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --frame-subsampling-factor <factor;3>            # factor by which num-frames at nnet output is reduced "
  echo "  --lang       <language-name;'default'>           # Name of the language, determines names of some inputs."
  echo "  --frames-per-chunk <frames;150>                  # number of supervised frames per chunk on disk"
  echo "                                                   # ... may be a comma separated list, but we advise a single"
  echo "                                                   #  number in most cases, due to interaction with the need "
  echo "                                                   # to group egs from the same speaker into groups."
  echo "  --left-context <int;0>                           # Number of frames on left side to append for feature input"
  echo "  --right-context <int;0>                          # Number of frames on right side to append for feature input"
  echo "  --left-context-initial <int;-1>                  # Left-context for first chunk of an utterance"
  echo "  --right-context-final <int;-1>                   # Right-context for last chunk of an utterance"
  echo "  --lattice-lm-scale <float>                       # If supplied, the graph/lm weight of the lattices will be "
  echo "                                                   # used (with this scale) in generating supervisions"
  echo "  --lattice-prune-beam <float>                     # If supplied, the lattices will be pruned to this beam, "
  echo "                                                   # before being used to get supervisions."
  echo "  --acwt <float;0.1>                               # Acoustic scale -- should be acoustic scale at which the "
  echo "                                                   # supervision lattices are to be interpreted.  Affects pruning"
  echo "  --deriv-weights-scp <str>                        # If supplied, adds per-frame weights to the supervision."
  echo "                                                   # (e.g., might be relevant for unsupervised training)."
  echo "  --stage <stage|0>                                # Used to run this script from somewhere in"
  echo "                                                   # the middle."
  exit 1;
fi

data=$1
chaindir=$2
latdir=$3
dir=$4

# Inputs read from <chain-dir>; their names depend on --lang.
tree=$chaindir/${lang}.tree
trans_mdl=$chaindir/init/${lang}_trans.mdl
normalization_fst=$chaindir/den_fsts/${lang}.normalization.fst
den_fst=$chaindir/den_fsts/${lang}.den.fst

# The i-vector files are only required when an i-vector dir was given.
extra_files=
[ ! -z "$online_ivector_dir" ] && \
  extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"

# Fail early if any input is missing.  $trans_mdl is part of the list because
# it is copied into $dir/misc below and that copy is read by stage 0.
for f in $data/feats.scp $latdir/lat.1.gz $latdir/final.mdl \
         $tree $trans_mdl $normalization_fst $den_fst $extra_files; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

# Use the same number of jobs (and the same per-utterance vs. default split)
# as the lattice directory.
nj=$(cat $latdir/num_jobs) || exit 1
if [ -f $latdir/per_utt ]; then
  sdata=$data/split${nj}utt
  utils/split_data.sh --per-utt $data $nj
else
  sdata=$data/split$nj
  utils/split_data.sh $data $nj
fi

mkdir -p $dir/log  $dir/misc

# Keep copies of the inputs next to the egs.
cp $tree $dir/misc/
copy-transition-model $trans_mdl $dir/misc/${lang}.trans_mdl || exit 1
cp $normalization_fst $den_fst $dir/misc/
cp $data/utt2spk $dir/misc/
if [ -f $data/utt2uniq ]; then
  cp $data/utt2uniq $dir/misc/
elif [ -f $dir/misc/utt2uniq ]; then
  # Remove a stale copy left over from an earlier run with different data.
  rm $dir/misc/utt2uniq
fi

if [ -e $dir/storage ]; then
  # Make soft links to storage directories, if distributing this way..  See
  # utils/create_split_dir.pl.
  echo "$0: creating data links"
  utils/create_data_link.pl $(for x in $(seq $nj); do echo $dir/cegs.$x.ark; done)
fi


# Lattice reader for each job; optionally reduced to the 1-best path (beam of
# 0 or 0.0) or pruned to the requested beam.
# All option tests below quote their variable so that they do not depend on
# how `[` happens to parse a missing argument.
lats_rspecifier="ark:gunzip -c $latdir/lat.JOB.gz |"
if [ ! -z "$lattice_prune_beam" ]; then
  if [ "$lattice_prune_beam" == "0" ] || [ "$lattice_prune_beam" == "0.0" ]; then
    lats_rspecifier="$lats_rspecifier lattice-1best --acoustic-scale=$acwt ark:- ark:- |"
  else
    lats_rspecifier="$lats_rspecifier lattice-prune --acoustic-scale=$acwt --beam=$lattice_prune_beam ark:- ark:- |"
  fi
fi

# Options for nnet3-chain-get-egs.
egs_opts="--long-key=true --left-context=$left_context --right-context=$right_context --num-frames=$frames_per_chunk --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress"
[ $left_context_initial -ge 0 ] && egs_opts="$egs_opts --left-context-initial=$left_context_initial"
[ $right_context_final -ge 0 ] && egs_opts="$egs_opts --right-context-final=$right_context_final"

[ ! -z "$deriv_weights_scp" ] && egs_opts="$egs_opts --deriv-weights-rspecifier=scp:$deriv_weights_scp"


# Options for chain-get-supervision.
chain_supervision_all_opts="--lattice-input=true --frame-subsampling-factor=$alignment_subsampling_factor"
[ ! -z "$right_tolerance" ] && \
  chain_supervision_all_opts="$chain_supervision_all_opts --right-tolerance=$right_tolerance"

[ ! -z "$left_tolerance" ] && \
  chain_supervision_all_opts="$chain_supervision_all_opts --left-tolerance=$left_tolerance"

if ! $constrained; then
  # e2e supervision
  chain_supervision_all_opts="$chain_supervision_all_opts --convert-to-pdfs=false"
  # NOTE(review): this path ($chaindir/0.trans_mdl) differs from the
  # transition model read elsewhere in this script
  # ($chaindir/init/${lang}_trans.mdl) -- confirm which one is intended.
  egs_opts="$egs_opts --transition-model=$chaindir/0.trans_mdl"
fi

if [ ! -z "$lattice_lm_scale" ]; then
  chain_supervision_all_opts="$chain_supervision_all_opts --lm-scale=$lattice_lm_scale"

  # The normalization FST gets the complementary scale (1 - lm-scale); scales
  # outside [0, 1) are rejected.
  normalization_fst_scale=$(perl -e "
  if ($lattice_lm_scale >= 1.0 || $lattice_lm_scale < 0) {
    print STDERR \"Invalid --lattice-lm-scale $lattice_lm_scale\"; exit(1);
  }
  print (1.0 - $lattice_lm_scale);") || exit 1
  egs_opts="$egs_opts --normalization-fst-scale=$normalization_fst_scale"
fi

if [ ! -z "$online_ivector_dir" ]; then
  ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1;
  ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period"
else
  ivector_opts=""
fi

# Feature reader for each job.  apply-cmvn is only inserted when $cmvn_opts
# contains the word "true"; otherwise the raw features are used.
feats="scp:$sdata/JOB/feats.scp"
if [ ! -z "$cmvn_opts" ]; then
    if [ ! -f $data/cmvn.scp ]; then
        echo "Cannot find $data/cmvn.scp. But cmvn_opts=$cmvn_opts"
        exit 1
    fi
    if [ `echo $cmvn_opts | fgrep -c true` -eq 1 ]; then
        feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"
    fi
fi

# Stage 0: dump the raw egs, one archive/scp pair per job.  For each job the
# lattices are piped through lattice-align-phones, chain-get-supervision and
# nnet3-chain-get-egs.  The pipe symbols are escaped (\|) so that they are
# handed to $cmd as part of the job command instead of being interpreted here;
# likewise \$[JOB+$srand] is left for the job to evaluate, which gives each job
# its own random seed.
if [ $stage -le 0 ]; then
  $cmd --max-jobs-run $max_jobs_run JOB=1:$nj $dir/log/get_egs.JOB.log \
       lattice-align-phones --replace-output-symbols=true $latdir/final.mdl \
       "$lats_rspecifier" ark:- \| \
       chain-get-supervision $chain_supervision_all_opts \
       $dir/misc/${lang}.tree $dir/misc/${lang}.trans_mdl ark:- ark:- \| \
       nnet3-chain-get-egs $ivector_opts --srand=\$[JOB+$srand] $egs_opts \
       "$normalization_fst" "$feats" ark,s,cs:- \
       ark,scp:$dir/cegs.JOB.ark,$dir/cegs.JOB.scp || exit 1;
fi


# Stage 1: collect statistics from the stage-0 logs and write $dir/info.txt.
if [ $stage -le 1 ]; then
  num_input_frames=$(steps/nnet2/get_num_frames.sh $data)
  # Sum, over all job logs, the number of chunks and (chunks * average chunk
  # length); prints "<frames> <chunks>".
  frames_and_chunks=$(for n in $(seq $nj); do cat $dir/log/get_egs.$n.log; done | \
           perl -e '$nc=0; $nf=0; while(<STDIN>) {
     if (m/Split .+ into (\d+) chunks/) { $this_nc = $1;  }
     if (m/Average chunk length was (\d+.\d+) frames/) { $nf += $1 * $this_nc;  $nc += $this_nc; }
    } print "$nf $nc"; ')
    echo $frames_and_chunks
  num_chunks=$(echo $frames_and_chunks | awk '{print $2}')
  # Without any chunk the division below would fail with a bash arithmetic
  # error; stop with an explicit message instead.
  if [ -z "$num_chunks" ] || [ "$num_chunks" -le 0 ]; then
    echo "$0: found no chunks in $dir/log/get_egs.*.log; cannot compute frames_per_chunk_avg"
    exit 1
  fi
  frames_per_chunk_avg=$[num_input_frames/num_chunks]
  feat_dim=$(feat-to-dim scp:$sdata/1/feats.scp -)
  num_leaves=$(tree-info $tree | awk '/^num-pdfs/ {print $2}')
  # Unset (negative) edge contexts are recorded as the regular contexts.
  if [ $left_context_initial -lt 0 ]; then
    left_context_initial=$left_context
  fi
  if [ $right_context_final -lt 0 ]; then
    right_context_final=$right_context
  fi

  cat >$dir/info.txt <<EOF
dir_type raw_chain_egs
num_input_frames $num_input_frames
num_chunks $num_chunks
lang $lang
feat_dim $feat_dim
num_leaves $num_leaves
frames_per_chunk $frames_per_chunk
frames_per_chunk_avg $frames_per_chunk_avg
left_context $left_context
left_context_initial $left_context_initial
right_context $right_context
right_context_final $right_context_final
EOF

  # The i-vector fields are only written when an i-vector dir was given.
  if [ ! -z "$online_ivector_dir" ]; then
      ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1;
      echo ivector_dim $ivector_dim >> $dir/info.txt
      steps/nnet2/get_ivector_id.sh $online_ivector_dir || exit 1
      echo final.ie.id `cat $online_ivector_dir/final.ie.id` >> $dir/info.txt
      if [ ! -f $online_ivector_dir/ivector_period ]; then
        echo "$0: $online_ivector_dir/ivector_period does not exist"
        exit 1
      fi
      ivector_period=$(cat $online_ivector_dir/ivector_period)
      ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period"
  else
      ivector_opts=""
  fi

  # A line with a key but no value means one of the commands above produced
  # no output.
  if ! cat $dir/info.txt | awk '{if (NF == 1) exit(1);}'; then
    echo "$0: we failed to obtain at least one of the fields in $dir/info.txt"
    exit 1
  fi
fi


# Stage 2: concatenate the per-job scp files, in job order, into all.scp.
if [ $stage -le 2 ]; then
  for n in $(seq $nj); do
    cat $dir/cegs.$n.scp
  done > $dir/all.scp
fi

echo "$0: Finished preparing raw egs"


================================================
FILE: egs/steps/nnet3/chain2/internal/get_best_model.sh
================================================
#!/bin/bash

# Copyright   2019  Idiap Research Institute (Author: Srikanth Madikeri).  Apache 2.0.
# This script is the equivalent of get_successful_models function in the python library.
# It takes a list of models and returns either the best model (the default) or a list of
# models to average.

# Options (can be overridden on the command line via parse_options.sh).
# NOTE(review): difference_threshold is not referenced below, and nothing is
# printed when models_to_average is true (the only branch is the
# "! $models_to_average" one) -- confirm whether that mode is still to be
# implemented.
models_to_average=false
difference_threshold=1.0
output=output


# echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

if [ $# -lt 1 ]; then
    echo "Usage: $0: [options] <model-1-log> <model-2-log> .... <model-N-log>"
    echo "where <model-n> is one of the n models to choose from."
    echo ""
    echo "--models-to-average: when true, returns the models to be averaged rather than the single best model"
    echo "--difference-threshold: used to reject models. models with objf < max-value - difference_threshold are rejected"
    echo "--output: the objf of the this output layer is used for model selection"
    echo ""
    exit 1;
fi

if ! $models_to_average; then
    # A single candidate is the best by definition.  What is printed is the
    # second-to-last dot-separated component of the log's base name.
    if [ $# -eq 1 ]; then
        basename $1 | tr '.' ' ' | awk '{ print $(NF-1) }'
        exit 0;
    fi
    model_log_list=$(for arg in $*; do echo $arg; done)
    first_log=$1
    # Locate the objective value: count the words before the first '=' on the
    # matching line of the first log and add 2, i.e. the word following the
    # '=' sign.  Assumes the line contains a space-separated '=' -- TODO confirm
    # against the log format.
    log_line=`fgrep -m 1 "Overall average objective function for '$output' is" $first_log`
    colno=`echo $log_line | cut -d '=' -f1 | wc -w`
    ((colno+=2))
    # With several files fgrep prefixes each match with "<file>:".  Keep that
    # first field and the objective column, split the file name off at ':',
    # sort numerically by objective and take the file with the largest value.
    # Assumes the first field holds exactly one ':' (none inside the path).
    filename=$(fgrep -m 1 "Overall average objective function for '$output' is" $model_log_list | \
        cut -d ' ' -f1,$colno | tr ':' ' ' | \
        awk '{print $1,$3}' | \
        sort -k2,2 -g | tail -1 | cut -d ' ' -f1)
    # Print the second-to-last dot-separated component of the winning log's
    # base name.
    basename $filename | tr '.' ' ' | awk '{ print $(NF-1) }'
fi


================================================
FILE: egs/steps/nnet3/chain2/internal/get_train_schedule.py
================================================
#!/usr/bin/env python3

# Copyright 2019    Johns Hopkins University (author: Daniel Povey)
# Copyright         Hossein Hadian
# Copyright   2019  Idiap Research Institute (Author: Srikanth Madikeri).  


# Apache 2.0.

""" This script outputs information about a neural net training schedule,
    to be used by ../train.sh, in the form of lines that can be selected
    and sourced by the shell.
"""

import argparse
import sys

sys.path.insert(0, 'steps')
import libs.nnet3.train.common as common_train_lib
import libs.common as common_lib

def get_args():
    """Parse the command line of this script and return the options.

    The raw ``sys.argv`` is echoed to stderr before parsing.  The options
    ``--num-scp-files`` and ``--schedule-out`` are mandatory.
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""Output training schedule information to be consumed by ../train.sh""")

    cli.add_argument(
        "--frame-subsampling-factor", default=3, type=int,
        help="""Frame subsampling factor for the combined model
                        (bottom+top), will normally be 3.  Required here in order
                        to deal with frame-shifted versions of the input.""")

    # Learning-rate schedule end points.
    cli.add_argument(
        "--initial-effective-lrate", default=0.001, type=float,
        help="""Effective learning rate used on the first iteration,
                        determines schedule via geometric interpolation with
                        --final-effective-lrate.   Actual learning rate is
                        this times the num-jobs on that iteration.""")
    cli.add_argument(
        "--final-effective-lrate", default=0.0001, type=float,
        help="""Learning rate used on the final iteration, see
                        --initial-effective-lrate for more documentation.""")

    # Parallelism at the start and at the end of training.
    cli.add_argument(
        "--num-jobs-initial", default=1, type=int,
        help="""Number of parallel neural net jobs to use at
                        the start of training""")
    cli.add_argument(
        "--num-jobs-final", default=1, type=int,
        help="""Number of parallel neural net jobs to use at
                        the end of training.  Would normally
                        be >= --num-jobs-initial""")

    cli.add_argument(
        "--num-epochs", default=4.0, type=float,
        help="""The number of epochs to train for.
                        Note: the 'real' number of times we see each
                        utterance is this number times --frame-subsampling-factor
                        (to cover frame-shifted copies of the data), times
                        the value of --num-repeats given to process_egs.sh,
                        times any factor arising from data augmentation.""")
    cli.add_argument(
        "--dropout-schedule", type=str,
        help="""Use this to specify the dropout schedule (how the dropout probability varies
                        with time, 0 == no dropout).  You specify a piecewise
                        linear function on the domain [0,1], where 0 is the
                        start and 1 is the end of training; the
                        function-argument (x) rises linearly with the amount of
                        data you have seen, not iteration number (this improves
                        invariance to num-jobs-{initial-final}).  E.g. '0,0.2,0'
                        means 0 at the start; 0.2 after seeing half the data;
                        and 0 at the end.  You may specify the x-value of
                        selected points, e.g.  '0,0.2@0.25,0' means that the 0.2
                        dropout-proportion is reached a quarter of the way
                        through the data.  The start/end x-values are at
                        x=0/x=1, and other unspecified x-values are interpolated
                        between known x-values.  You may specify different rules
                        for different component-name patterns using
                        'pattern1=func1 pattern2=func2', e.g. 'relu*=0,0.1,0
                        lstm*=0,0.2,0'.  More general should precede less
                        general patterns, as they are applied sequentially.""")

    # Mandatory options.
    cli.add_argument(
        "--num-scp-files", default=0, required=True, type=int,
        help="""The number of .scp files in the egs dir.""")
    cli.add_argument(
        "--schedule-out", required=True, type=str,
        help="""Output file containing the training schedule.  The output
                        is lines, one per training iteration.
                        Each line (one per iteration) is a list of ;-separated commands setting shell
                        variables.  Currently the following variables are set:
                        iter, num_jobs, inv_num_jobs, scp_indexes, frame_shifts, dropout_opt, lrate.
                        """)

    print(sys.argv, file=sys.stderr)
    return cli.parse_args()

def get_schedules(args):
    """Write the training schedule to ``args.schedule_out``.

    One line is written per training iteration.  Each line is a list of
    ;-separated shell assignments setting iter, num_jobs, inv_num_jobs,
    scp_indexes, frame_shifts, dropout_opt and lrate.

    The number of jobs is interpolated linearly (rounded to nearest) between
    ``args.num_jobs_initial`` and ``args.num_jobs_final``; iteration zero is
    not special-cased.  An empty ``args.dropout_schedule`` is replaced by
    ``None`` on ``args`` before it is used.
    """
    scp_count = args.num_scp_files
    subsampling = args.frame_subsampling_factor
    jobs_first = args.num_jobs_initial
    jobs_last = args.num_jobs_final

    # Every scp file is visited once per frame shift and per epoch.
    total_scps = int(args.num_epochs * (scp_count * subsampling))
    # The average number of jobs per iteration is (jobs_first + jobs_last) / 2.
    num_iters = (total_scps * 2) // (jobs_first + jobs_last)
    scps_done = 0

    with open(args.schedule_out, 'w', encoding='latin-1') as schedule_file:
        for iteration in range(num_iters):
            num_jobs = int(0.5 + jobs_first
                           + (jobs_last - jobs_first)
                           * float(iteration) / num_iters)

            lrate = common_train_lib.get_learning_rate(
                iteration, num_jobs, num_iters,
                scps_done, total_scps,
                args.initial_effective_lrate, args.final_effective_lrate)

            if args.dropout_schedule == "":
                args.dropout_schedule = None
            data_fraction = float(scps_done) / max(1, (total_scps - jobs_last))
            dropout_opt = common_train_lib.get_dropout_edit_option(
                args.dropout_schedule, data_fraction, iteration)

            scp_indexes = []
            shifts = []
            # k is the zero-based count of scp files consumed so far; the
            # 1-based scp index and the frame shift are both derived from it.
            # Frame shifts stay in [0, subsampling - 1]; they are not
            # re-centred around zero.
            for k in range(scps_done, scps_done + num_jobs):
                passes, position = divmod(k, scp_count)
                scp_index = position + 1
                shifts.append(str((scp_index + passes) % subsampling))
                scp_indexes.append(str(scp_index))

            print("""iter={iter}; num_jobs={nj}; inv_num_jobs={nj_inv}; scp_indexes=(pad {indexes}); frame_shifts=(pad {shifts}); dropout_opt="{opt}"; lrate={lrate}""".format(
                iter=iteration, nj=num_jobs, nj_inv=(1.0 / num_jobs),
                indexes=' '.join(scp_indexes), shifts=' '.join(shifts),
                opt=dropout_opt, lrate=lrate), file=schedule_file)
            scps_done += num_jobs


def main():
    """Script entry point: parse the options and hand them to get_schedules."""
    get_schedules(get_args())

if __name__ == "__main__":
    main()


================================================
FILE: egs/steps/nnet3/chain2/process_egs.sh
================================================
#!/bin/bash

# Copyright   2019  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
# Copyright   2019  Idiap Research Institute (Author: Srikanth Madikeri).  Apache 2.0.
#
# This script takes nnet examples dumped by steps/chain/get_raw_egs.sh and
# combines the chunks into groups by speaker (to the extent possible; it may
# need to combine speakers in some cases), locally randomizes the result, and
# dumps the resulting egs to disk.  Chunks of these will later be globally
# randomized (at the scp level) by steps/chaina/randomize_egs.sh


# Begin configuration section.
# Every variable below can be overridden on the command line via
# parse_options.sh (e.g. --num-utts-subset 500 sets num_utts_subset).
cmd=run.pl
num_repeats=1  # number of times we repeat the same chunks with different
               # grouping.
compress=true   # set this to false to disable compression (e.g. if you want to see whether
                # results are affected).

num_utts_subset=300     # number of utterances in validation and training
                        # subsets used for shrinkage and diagnostics.


shuffle_buffer_size=5000   # Size of buffer (containing grouped egs) to use
                           # for random shuffle.

stage=0
nj=5             # the number of parallel jobs to run.
srand=0

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;


# Exactly two positional arguments are required; otherwise print the usage
# message and stop.  The option names and defaults shown here are kept in sync
# with the configuration variables above.
if [ $# != 2 ]; then
  echo "Usage: $0 [opts] <raw-egs-dir> <processed-egs-dir>"
  echo " e.g.: $0 exp/chaina/tdnn1a_sp/raw_egs exp/chaina/tdnn1a_sp/processed_egs"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options (alternative to this"
  echo "                                                   # command line)"
  echo "  --cmd (utils/run.pl;utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --num-repeats <n;1>                              # Number of times we group the same chunks into different"
  echo "                                                   # groups.  For now only the values 1 and 2 are"
  echo "                                                   # recommended, due to the very simple way we choose"
  echo "                                                   # the groups (it's consecutive)."
  echo "  --nj       <num-jobs;5>                          # Number of jobs to run in parallel.  Usually quite a"
  echo "                                                   # small number, as we'll be limited by disk access"
  echo "                                                   # speed."
  echo "  --compress <bool;true>                           # True if you want the egs to be compressed"
  echo "                                                   # (e.g. you may set to false for debugging purposes, to"
  echo "                                                   # check that the compression is not hurting)."
  echo "  --num-utts-subset <n;300>                        # Number of utterances to select for each of"
  echo "                                                   # train_subset.scp and heldout_subset.scp."
  echo "                                                   # These will be used for diagnostics."
  echo "  --stage <stage|0>                                # Used to run this script from somewhere in"
  echo "                                                   # the middle."
  exit 1;
fi

# Positional arguments: input (raw egs) and output (processed egs) directories.
raw_egs_dir=$1
dir=$2

# die on error or undefined variable.
set -e -u

# Refuse to continue if the input directory does not look like raw chain egs.
if ! steps/chain2/validate_raw_egs.sh $raw_egs_dir; then
  echo "$0: failed to validate input directory $raw_egs_dir"
  exit 1
fi


mkdir -p $dir/temp $dir/log


# Stage 0: choose which egs go to the held-out subset, the train diagnostic
# subset, and the main training list.  All three are written to $dir/temp as
# lists of lines taken from $raw_egs_dir/all.scp.
if [ $stage -le 0 ]; then
  echo "$0: choosing heldout_subset and train_subset"

  # NOTE: utt2uniq_opt is assigned here but is not referenced anywhere else
  # in this script.
  utt2uniq_opt=
  if [ -f $raw_egs_dir/misc/utt2uniq ]; then
      utt2uniq_opt="--utt2uniq=$raw_egs_dir/misc/utt2uniq"
      echo "$0: File $raw_egs_dir/misc/utt2uniq exists, so ensuring the hold-out set" \
           "includes all perturbed versions of the same source utterance."
      # Invert utt2uniq, shuffle the resulting lines, and print fields 2..NF
      # of each line until at least $num_utts_subset ids have been printed;
      # those ids are then used as fixed-string patterns to select lines of
      # all.scp.
      utils/utt2spk_to_spk2utt.pl $raw_egs_dir/misc/utt2uniq 2>/dev/null | \
          utils/shuffle_list.pl 2>/dev/null | \
            awk -v max_utt=$num_utts_subset '{
                for (n=2;n<=NF;n++) print $n;
                printed += NF-1;
                if (printed >= max_utt) nextfile; }' \
          | fgrep -f - $raw_egs_dir/all.scp | sort -k1,1 > $dir/temp/heldout_subset.list
  else
      # No utt2uniq: take $num_utts_subset randomly chosen ids from the first
      # column of utt2spk and select the matching lines of all.scp.
      awk '{print $1}' $raw_egs_dir/misc/utt2spk | \
        utils/shuffle_list.pl 2>/dev/null | \
        head -$num_utts_subset |  fgrep -f - $raw_egs_dir/all.scp | sort -k1,1 > $dir/temp/heldout_subset.list
  fi

  # Train diagnostic subset: same procedure, after filtering the id list
  # against heldout_subset.list.
  # NOTE(review): heldout_subset.list holds lines of all.scp, whereas the ids
  # being filtered come from utt2spk; whether their first fields coincide
  # depends on the key format of all.scp -- confirm that the exclusion
  # actually removes the held-out utterances.
  awk '{print $1}' $raw_egs_dir/misc/utt2spk | \
     utils/filter_scp.pl --exclude $dir/temp/heldout_subset.list | \
     utils/shuffle_list.pl 2>/dev/null | \
     head -$num_utts_subset | fgrep -f - $raw_egs_dir/all.scp | sort -k1,1 > $dir/temp/train_subset.list

  # Main training list: every all.scp line matched by an id that survived the
  # same exclusion filter (not shuffled, not truncated).
  awk '{print $1}' $raw_egs_dir/misc/utt2spk | \
     utils/filter_scp.pl --exclude $dir/temp/heldout_subset.list | fgrep -f - $raw_egs_dir/all.scp > $dir/temp/train.list
  fi
# These run regardless of $stage, so the two list files must already exist
# when the script is resumed with --stage > 0.  The resulting counts are not
# used further down in this script.
len_valid_uttlist=$(wc -l < $dir/temp/heldout_subset.list)
len_trainsub_uttlist=$(wc -l <$dir/temp/train_subset.list)

# Stage 1: copy the selected egs into new archives under $dir, shuffling them
# with nnet3-chain-shuffle-egs on the way.
if [ $stage -le 1 ]; then

  # The two diagnostic subsets are small, so each is handled by a single job.
  for name in heldout_subset train_subset; do
    echo "$0: merging and shuffling $name egs"

    cp $dir/temp/${name}.list $dir/temp/${name}.scp

    $cmd $dir/log/shuffle_${name}_egs.log \
      nnet3-chain-shuffle-egs --srand=$srand scp:$dir/temp/${name}.scp ark,scp:$dir/${name}.ark,$dir/${name}.scp
  done

  # Split up the training list into multiple smaller lists, as it could be long.
  utils/split_scp.pl $dir/temp/train.list  $(for j in $(seq $nj); do echo $dir/temp/train.$j.scp; done)

  if [ -e $dir/storage ]; then
    # Make soft links to storage directories, if distributing this way..  See
    # utils/create_split_dir.pl.
    echo "$0: creating data links"
    utils/create_data_link.pl $(for j in $(seq $nj); do echo $dir/train.$j.ark; done) || true
  fi

  # One shuffle job per split; each job gets its own seed (JOB+$srand, expanded
  # by $cmd at run time) and writes train.JOB.{ark,scp}.
  $cmd JOB=1:$nj $dir/log/shuffle_train_egs.JOB.log \
     nnet3-chain-shuffle-egs --buffer-size=$shuffle_buffer_size \
         --srand=\$[JOB+$srand] scp:$dir/temp/train.JOB.scp ark,scp:$dir/train.JOB.ark,$dir/train.JOB.scp || exit 1;
  # Concatenate the per-job scp files into the top-level train.scp.
  cat $(for j in $(seq $nj); do echo $dir/train.$j.scp; done) > $dir/train.scp
fi

# Derive $dir/info.txt from the raw egs' info.txt: dir_type is rewritten,
# num_input_frames and num_chunks are multiplied by num_repeats, every other
# line is copied through, and a num_repeats line is appended at the end.
cat $raw_egs_dir/info.txt  | awk  -v num_repeats=$num_repeats \
   '
  /^dir_type / { print "dir_type processed_chain_egs"; next; }
  /^num_input_frames / { print "num_input_frames "$2 * num_repeats; next; } # approximate; ignores held-out egs.
  /^num_chunks / { print "num_chunks " $2 * num_repeats; next; }
   {print;}
  END{print "num_repeats " num_repeats;}' >$dir/info.txt


# Sanity check: a line with a single field means a key was written without a
# value.
if ! cat $dir/info.txt | awk '{if (NF == 1) exit(1);}'; then
  echo "$0: we failed to obtain at least one of the fields in $dir/info.txt"
  exit 1
fi

# Carry the misc/ directory of the raw egs over to the output directory.
cp -r $raw_egs_dir/misc/ $dir/


echo "$0: Finished processing egs"


================================================
FILE: egs/steps/nnet3/chain2/randomize_egs.sh
================================================
#!/bin/bash

# Copyright   2019  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
# Copyright   2019  Idiap Research Institute (Author: Srikanth Madikeri).  Apache 2.0.
#
# This script takes nnet examples dumped by steps/chain/process_egs.sh,
# globally randomizes the egs, and divides into multiple .scp files.  This is
# the form of egs which is consumed by the training script.  All this is done
# only by manipulating the contents of .scp files.  To keep locality of disk
# access, we only randomize blocks of egs (e.g.  blocks containing 128 groups of
# sequences).  This doesn't defeat randomization, because both process_egs.sh
# and the training script use nnet3-shuffle-egs to do more local randomization.

# Later on, we'll have a multilingual/multi-input-dir version of this script
# that combines egs from various data sources and possibly multiple languages.
# This version assumes there is just one language.

# Begin configuration section.
# All variables in this section can be overridden from the command line via
# parse_options.sh.
cmd=run.pl

# NOTE: groups_per_block is declared and documented in the usage message, but
# it is not referenced by the rest of this script.
groups_per_block=128     # The 'groups' are the egs in the scp file from
                         # process_egs.sh, containing '--chunks-per-group' sequences
                         # each.
# Number of pieces train.scp is first split into before each piece is
# distributed over the output scp files.
num_blocks=256

frames_per_job=3000000   # The number of frames of data we want to process per
                         # training job (will determine how long each job takes,
                         # and the frequency of model averaging.  This was
                         # previously called --frames-per-iter, but
                         # --frames-per-job is clearer as each job does this
                         # many.

# NOTE: num_groups_combine is likewise not referenced by the rest of this
# script; no combine.scp is written here.
num_groups_combine=1000  # the number of groups from the training set that we
                         # randomly choose as input to nnet3-chain-combine;
                         # these will go to combine.scp.  train_subset.scp and
                         # heldout_subset.scp are, for now, just copied over
                         # from the input.

# Later we may provide a mechanism to change the language name; for now we
# just copy it from the input.


# NOTE: srand and stage are accepted as options but are not read again below.
srand=0
stage=0

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;


# Exactly two positional arguments are required; otherwise print usage and
# stop.
if [ $# != 2 ]; then
  echo "Usage: $0 [opts] <processed-egs-dir> <randomized-egs-dir>"
  echo " e.g.: $0 --frames-per-job 2000000 exp/chain/tdnn1a_sp/processed_egs exp/chain/tdnn1a_sp/egs"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options (alternative to this"
  echo "                                                   # command line)"
  echo "  --cmd (utils/run.pl;utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --groups-per-block <n;128>                       # The number of groups (i.e. previously merged egs"
  echo "                                                   # containing --chunks-per-group chunks) to to consider "
  echo "                                                   # as one block, where whole blocks are randomized;"
  echo "                                                   # smaller means more complete randomization but less"
  echo "                                                   # local disk access."
  echo "  --frames-per-job <n;3000000>                     # The number of input frames (not counting context)"
  echo "                                                   # that we aim to have in each scp file after"
  echo "                                                   # randomization and splitting."
  echo "  --num-groups-combine <n;1000>                    # The number of randomly chosen groups to"
  echo "                                                   # put in the subset in 'combine.scp' which will"
  echo "                                                   # be used in nnet3-chain-combine to decide which"
  echo "                                                   # models to average over."
  echo "  --stage <stage|0>                                # Used to run this script from somewhere in"
  echo "                                                   # the middle."
  echo "  --srand <srand|0>                                # Random seed, affects randomization."
  exit 1;
fi

# Positional arguments: input (processed egs) and output (randomized egs)
# directories.
processed_egs_dir=$1
dir=$2

# die on error or undefined variable.
set -e -u

if ! steps/chain2/validate_processed_egs.sh $processed_egs_dir; then
  echo "$0: could not validate input directory $processed_egs_dir"
  exit 1
fi

# Work out how many groups per job and how many frames per job we'll have

info_in=$processed_egs_dir/info.txt

# num_scp_files is the number of archives
# The three values below are read from the 'key value' lines of info.txt.
# num_input_frames is read but not used in the arithmetic that follows.
num_input_frames=$(awk '/^num_input_frames/ { nif=$2; print nif}' $info_in)
frames_per_chunk_avg=$(awk '/^frames_per_chunk_avg/ { fpc=$2; print fpc}' $info_in)
num_chunks=$(awk '/^num_chunks/ { nc=$2; print nc}' $info_in)
# Integer division plus one, so the result is at least 1 for non-negative
# inputs; the -eq 0 guard on the next line is a safety net only.
num_scp_files=$[(num_chunks * frames_per_chunk_avg)/frames_per_job +1]
[ $num_scp_files -eq 0 ] && num_scp_files=1

frames_per_scp_file=$[(num_chunks*frames_per_chunk_avg)/num_scp_files] # because it may be slightly different from frames_per_job


mkdir -p $dir/temp

# Start from a clean misc/ directory so that no stale files survive a re-run.
if [ -d $dir/misc ]; then
  rm -r $dir/misc
fi

mkdir -p $dir/misc
cp $processed_egs_dir/misc/* $dir/misc

# Randomization, done purely on scp files:
#  1. shuffle train.scp and split it into $num_blocks pieces;
#  2. shuffle each piece and split it into $num_scp_files sub-pieces;
#  3. output file j is the shuffled concatenation of sub-piece j of every
#     piece.
utils/shuffle_list.pl  $processed_egs_dir/train.scp > $dir/temp/train.scp
utils/split_scp.pl $dir/temp/train.scp $(for i in $(seq $num_blocks); do echo $dir/temp/train.$i.scp; done)
for i in `seq $num_blocks`; do
    utils/split_scp.pl <(utils/shuffle_list.pl $dir/temp/train.$i.scp) $(for j in $(seq $num_scp_files); do echo $dir/temp/train.$i.$j.scp; done)
done
for j in `seq $num_scp_files`; do
    cat $dir/temp/train.*.$j.scp | utils/shuffle_list.pl > $dir/train.$j.scp
done
# Clean up in the background; the 'wait' at the end of the script joins it.
rm -rf $dir/temp &

# The diagnostic subsets are passed through unchanged.
cp $processed_egs_dir/heldout_subset.scp $processed_egs_dir/train_subset.scp $dir/


# note: there is only one language in $processed_egs_dir (any
# merging would be done at the randomization stage but that is not supported yet).

lang=$(awk '/^lang / { print $2; }' <$processed_egs_dir/info.txt)

# We'll store info files per language, containing the part of the information
# that is language-specific, plus a single global info.txt containing stuff that
# is not language specific.
# This will get more complicated once we actually support multiple languages,
# and when we allow multiple input processed egs dirs for the same language.

# Per-language info: the input info.txt minus its dir_type, lang and feat_dim
# lines, with a new dir_type line placed first.
grep -v -E '^dir_type|^lang|^feat_dim' <$processed_egs_dir/info.txt | \
  cat <(echo "dir_type randomized_chain_egs") - > $dir/info_$lang.txt


# Global info file.
cat <<EOF >$dir/info.txt
dir_type randomized_chain_egs
num_scp_files $num_scp_files
langs $lang
frames_per_scp_file $frames_per_scp_file
EOF
# frames_per_job, after rounding, becomes frames_per_scp_file.

# note: frames_per_chunk_avg will be present in the info.txt file as well as
# the per-language files.
grep -E '^feat_dim|^frames_per_chunk_avg' <$processed_egs_dir/info.txt >>$dir/info.txt


# Sanity check: a line with a single field means a key was written without a
# value.
if ! cat $dir/info.txt | awk '{if (NF == 1) exit(1);}'; then
  echo "$0: we failed to obtain at least one of the fields in $dir/info.txt"
  exit 1
fi


# Join the background removal of $dir/temp before reporting success.
wait;
echo "$0: Finished randomizing egs"


================================================
FILE: egs/steps/nnet3/chain2/train.sh
================================================
#!/usr/bin/env bash

# Copyright   2019  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
# Copyright   2019  Idiap Research Institute (Author: Srikanth Madikeri).  Apache 2.0.


# Begin configuration section
# All variables in this section can be overridden from the command line via
# parse_options.sh.
stage=-2
cmd=run.pl
gpu_cmd_opt=   # recomputed from $use_gpu further down
leaky_hmm_coefficient=0.1
xent_regularize=0.1
apply_deriv_weights=false   # you might want to set this to true in unsupervised training
                            # scenarios.
# NOTE: memory_compression_level is not referenced elsewhere in this script.
memory_compression_level=2  # Enables us to use larger minibatch size than we
                            # otherwise could, but may not be optimal for speed
                            # (--> set to 0 if you have plenty of memory.
dropout_schedule=
srand=0
max_param_change=2.0    # Passed to the training jobs as --max-param-change.
                        # NOTE(review): an earlier comment called this default
                        # "smaller than normal (normally 2.0)", which does not
                        # match the value set here.
use_gpu=yes   # can be "yes", "no", "optional", "wait"
print_interval=10
momentum=0.0
parallel_train_opts=
verbose_opt=

# NOTE: common_opts is not referenced elsewhere in this script.
common_opts=           # Options passed through to nnet3-chain-train and nnet3-chain-combine

num_epochs=4.0   #  Note: each epoch may actually contain multiple repetitions of
                 #  the data, for various reasons:
                 #    using the --num-repeats option in process_egs.sh
                 #    data augmentation
                 #    different data shifts (this includes 3 different shifts
                 #    of the data if frame_subsampling_factor=3 (see $dir/init/info.txt)

num_jobs_initial=1
num_jobs_final=1
initial_effective_lrate=0.001
final_effective_lrate=0.0001
minibatch_size=32  # This is how you set the minibatch size.

# NOTE: max_iters_combine is not referenced elsewhere in this script; only
# max_models_combine bounds the set of models that are combined.
max_iters_combine=80
max_models_combine=20
diagnostic_period=5    # Get diagnostics every this-many iterations

shuffle_buffer_size=1000  # This "buffer_size" variable controls randomization of the groups
                          # on each iter.


l2_regularize=   # if non-empty, passed to the trainer as --l2-regularize
out_of_range_regularize=0.01
multilingual_eg=false   # executed as a command below, so it must be 'true' or 'false'

# End configuration section


echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;


# Exactly two positional arguments are required; otherwise print usage and
# stop.
if [ $# != 2 ]; then
  echo "Usage: $0  [options] <egs-dir>  <model-dir>"
  echo " e.g.: $0 exp/chain/tdnn1a_sp/egs  exp/chain/tdnn1a_sp"
  echo ""
  echo "This is the default script to train acoustic models for chain2 recipes."
  echo "The script requires two arguments:"
  echo "<egs-dir>: directory where egs files are stored"
  echo "<model-dir>: directory where the final model will be stored"
  echo ""
  echo "See the top of the script to check possible options to pass to it."
  exit 1
fi

# Positional arguments: randomized egs directory and model/experiment
# directory.
egs_dir=$1
dir=$2

set -e -u  # die on failed command or undefined variable

steps/chain2/validate_randomized_egs.sh $egs_dir

for f in $dir/init/info.txt; do
  if [ ! -f $f ]; then
    echo "$0: expected file $f to exist"
    exit 1
  fi
done
# Merge the egs info into the model's info file.  Note that this appends on
# every invocation, so keys may appear more than once; the 'langs' lookup
# below therefore keeps only the last match.
cat $egs_dir/info.txt >> $dir/init/info.txt


frame_subsampling_factor=$(awk '/^frame_subsampling_factor/ {print $2}' <$dir/init/info.txt)
num_scp_files=$(awk '/^num_scp_files/ {print $2}' <$egs_dir/info.txt)

# Stage -2: write $dir/schedule.txt, one line of shell variable assignments
# per training iteration.
if [ $stage -le -2 ]; then
    echo "$0: Generating training schedule"
    steps/chain2/internal/get_train_schedule.py \
      --frame-subsampling-factor=$frame_subsampling_factor \
      --num-jobs-initial=$num_jobs_initial \
      --num-jobs-final=$num_jobs_final \
      --num-epochs=$num_epochs \
      --dropout-schedule="$dropout_schedule" \
      --num-scp-files=$num_scp_files \
      --initial-effective-lrate=$initial_effective_lrate \
      --final-effective-lrate=$final_effective_lrate \
      --schedule-out=$dir/schedule.txt
fi


if [ "$use_gpu" != "no" ]; then gpu_cmd_opt="--gpu 1"; else gpu_cmd_opt=""; fi

# One schedule line per iteration.
num_iters=$(wc -l <$dir/schedule.txt)

echo "$0: will train for $num_epochs epochs = $num_iters iterations"

# source the 1st line of schedule.txt in the shell; this sets
# lrate and dropout_opt, among other variables.
. <(head -n 1 $dir/schedule.txt)
langs=$(awk '/^langs/ { $1=""; print; }' <$dir/init/info.txt | tail -1)
num_langs=$(echo $langs | wc -w)

mkdir -p $dir/log

# Copy models with initial learning rate and dropout options from $dir/init to $dir/0
if [ $stage -le -1 ]; then
  echo "$0: Copying transition model"
  if [ $num_langs -eq 1 ]; then
      echo "$0: Num langs is 1"
      cp $dir/init/default.raw $dir/0.raw
      if [ -f $dir/init/default_trans.mdl ]; then
          cp $dir/init/default_trans.mdl $dir/0_trans.mdl
      fi
  else
      echo "$0: Num langs is $num_langs"
      cp $dir/init/multi.raw $dir/0.raw
  fi
fi


# Only pass --l2-regularize when a value was given.  The variable is quoted so
# that the test is well-formed when it is empty.
l2_regularize_opt=""
if [ -n "$l2_regularize" ]; then
    l2_regularize_opt="--l2-regularize=$l2_regularize"
fi

# x is the iteration to start from: 0, or $stage when resuming.
x=0
if [ $stage -gt $x ]; then x=$stage; fi

# Combine at most half of the iterations' models, taken from the end.
[ $max_models_combine -gt $[num_iters/2] ] && max_models_combine=$[num_iters/2];
combine_start_iter=$[num_iters+1-max_models_combine]

# Main training loop: iteration x reads $dir/$x.raw and writes
# $dir/$[x+1].raw.
while [ $x -lt $num_iters ]; do
  # Source some variables from schedule.txt.  The effect will be something
  # like the following:
  # iter=0; num_jobs=2; inv_num_jobs=0.5; scp_indexes=(pad 1 2); frame_shifts=(pad 1 2); dropout_opt="--edits='set-dropout-proportion name=* proportion=0.0'" lrate=0.002
  . <(grep "^iter=$x;" $dir/schedule.txt)

  echo "$0: training, iteration $x of $num_iters, num-jobs is $num_jobs"

  next_x=$[$x+1]
  den_fst_dir=$egs_dir/misc
  model_out_prefix=$dir/${next_x}
  model_out=${model_out_prefix}.mdl
  multilingual_eg_opts=
  if $multilingual_eg; then
       multilingual_eg_opts="--multilingual-eg=true"
  fi

  # for the first 4 iterations, plus every $diagnostic_period iterations, launch
  # some diagnostic processes.  We don't do this on iteration 0, because
  # the batchnorm stats wouldn't be ready
  if [ $x -gt 0 ] && [ $[x%diagnostic_period] -eq 0 -o $x -lt 5 ]; then

    [ -f $dir/.error_diagnostic ] && rm $dir/.error_diagnostic
    for name in train heldout; do
      egs_opts=
      if $multilingual_eg; then
          weight_rspecifier=$egs_dir/diagnostic_${name}.weight.ark
          [[ -f $weight_rspecifier ]] && egs_opts="--weights=ark:$weight_rspecifier"
      fi
      $cmd $gpu_cmd_opt $dir/log/diagnostic_${name}.$x.log \
         nnet3-chain-train2 --use-gpu=$use_gpu \
            --leaky-hmm-coefficient=$leaky_hmm_coefficient \
            --xent-regularize=$xent_regularize \
            --out-of-range-regularize=$out_of_range_regularize \
            $l2_regularize_opt \
            --print-interval=10  \
           "nnet3-copy --learning-rate=$lrate $dir/${x}.raw - |" $den_fst_dir \
           "ark:nnet3-chain-copy-egs $egs_opts scp:$egs_dir/${name}_subset.scp ark:- | nnet3-chain-merge-egs $multilingual_eg_opts --minibatch-size=1:64 ark:- ark:-|" \
           $dir/${next_x}_${name}.mdl || touch $dir/.error_diagnostic &

       # Make sure we do not run more than $num_jobs_final at once
       [ $num_jobs_final -eq 1 ] && wait

    done
    wait
  fi

  if [ $x -gt 0 ]; then
    # This doesn't use the egs, it only shows the relative change in model parameters.
    $cmd $dir/log/progress.$x.log \
      nnet3-show-progress --use-gpu=no $dir/$(($x-1)).raw $dir/${x}.raw '&&' \
        nnet3-info $dir/${x}.raw &
  fi

  cache_io_opt="--write-cache=$dir/cache.$next_x"
  if [ $x -gt 0 -a -f $dir/cache.$x ]; then
      cache_io_opt="$cache_io_opt --read-cache=$dir/cache.$x"
  fi
  # Clear any marker left behind by a previous failed run; otherwise the check
  # after 'wait' below would abort even though this iteration succeeded.
  rm -f $dir/.error
  # Launch one training job per entry of the schedule for this iteration.
  # scp_indexes and frame_shifts are arrays whose element 0 is the literal
  # 'pad', so they are indexed from 1.
  for j in $(seq $num_jobs); do
    scp_index=${scp_indexes[$j]}
    frame_shift=${frame_shifts[$j]}

    egs_opts=
    if $multilingual_eg; then
        weight_rspecifier=$egs_dir/train.weight.$scp_index.ark
        [[ -f $weight_rspecifier ]] && egs_opts="--weights=ark:$weight_rspecifier"
    fi
    $cmd $gpu_cmd_opt $dir/log/train.$x.$j.log \
         nnet3-chain-train2  \
             $parallel_train_opts $verbose_opt \
             --out-of-range-regularize=$out_of_range_regularize \
             $cache_io_opt \
             --use-gpu=$use_gpu --apply-deriv-weights=$apply_deriv_weights \
             --leaky-hmm-coefficient=$leaky_hmm_coefficient --xent-regularize=$xent_regularize \
             --print-interval=$print_interval --max-param-change=$max_param_change \
             --momentum=$momentum \
             --l2-regularize-factor=$inv_num_jobs \
             $l2_regularize_opt \
             --srand=$srand \
             "nnet3-copy --learning-rate=$lrate $dir/${x}.raw - |" $den_fst_dir \
             "ark:nnet3-chain-copy-egs $egs_opts --frame-shift=$frame_shift scp:$egs_dir/train.$scp_index.scp ark:- | nnet3-chain-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:- | nnet3-chain-merge-egs $multilingual_eg_opts --minibatch-size=$minibatch_size ark:- ark:-|" \
             ${model_out_prefix}.$j.raw || touch $dir/.error &
  done
  # Join the training jobs (and the progress job, if one was started).
  wait
  if [ -f $dir/.error ]; then
    echo "$0: error detected training on iteration $x"
    exit 1
  fi
  if [ $x -ge 1 ]; then
      # From iteration 1 on, the per-job models are averaged.
      models_to_average=$(for j in `seq $num_jobs`; do echo ${model_out_prefix}.$j.raw; done)
      $cmd $dir/log/average.$x.log \
          nnet3-average $models_to_average $dir/$next_x.raw  || exit 1;
      rm $models_to_average
  else
      # On iteration 0 a single job's model is picked instead of averaging,
      # using the index reported by get_best_model.sh from the training logs.
      lang=$(echo $langs | awk '{print $1}')
      model_index=`steps/nnet3/chain2/internal/get_best_model.sh --output output-${lang} $dir/log/train.$x.*.log`
      cp ${model_out_prefix}.$model_index.raw $dir/$next_x.raw
      rm ${model_out_prefix}.*.raw
  fi
  # The diagnostic jobs above create their failure marker directly in $dir,
  # so that is where it has to be looked for.  The marker is consumed before
  # exiting so that a stale file cannot block a later re-run.
  if [ -f $dir/.error_diagnostic ]; then
    rm $dir/.error_diagnostic
    echo "$0: error getting diagnostics on iter $x"
    exit 1
  fi

  # Housekeeping: drop the cache that was just consumed, old models that will
  # not take part in the final combination, and the diagnostic outputs.
  if [ -f $dir/cache.$x ]; then
      rm $dir/cache.$x
  fi
  delete_iter=$[x-2]
  if [ $delete_iter -lt $combine_start_iter ]; then
      if [ -f $dir/$delete_iter.raw ]; then
          rm $dir/$delete_iter.raw
      fi
  fi
  if [ -f $dir/${next_x}_train.mdl ]; then
      rm $dir/${next_x}_{train,heldout}.mdl
  fi
  x=$[x+1]
done


# Final stage: combine the models of the last iterations into final.raw (and
# final.mdl for the single-language case), then run the diagnostics on it.
if [ $stage -le $num_iters ]; then
  echo "$0: doing model combination"
  den_fst_dir=$egs_dir/misc
  # Set this here rather than relying on the training loop: when the script is
  # resumed with --stage >= num_iters the loop body never runs, and under
  # 'set -u' an unset variable would abort the script.
  multilingual_eg_opts=
  if $multilingual_eg; then
    multilingual_eg_opts="--multilingual-eg=true"
  fi
  input_models=$(for x in $(seq $combine_start_iter $num_iters); do echo $dir/${x}.raw; done)
  output_model_dir=$dir/final

   $cmd $gpu_cmd_opt $dir/log/combine.log \
      nnet3-chain-combine2 --use-gpu=$use_gpu \
        --leaky-hmm-coefficient=$leaky_hmm_coefficient \
        --print-interval=10  \
        $den_fst_dir $input_models \
        "ark:nnet3-chain-merge-egs $multilingual_eg_opts  scp:$egs_dir/train_subset.scp ark:-|" \
        $dir/final.raw || exit 1;
   if ! $multilingual_eg; then
       # Rename the output nodes and attach the transition model to obtain
       # final.mdl.
       nnet3-copy  --edits="rename-node old-name=output new-name=output-dummy; rename-node old-name=output-default new-name=output" \
          $dir/final.raw - | \
          nnet3-am-init $dir/0_trans.mdl - $dir/final.mdl
   fi

   # Compute the probability of the final, combined model with
   # the same subset we used for the previous diagnostic processes, as the
   # different subsets will lead to different probs.
   [ -f $dir/.error_diagnostic ] && rm $dir/.error_diagnostic
   for name in train heldout; do
     egs_opts=
     if $multilingual_eg; then
       weight_rspecifier=$egs_dir/diagnostic_${name}.weight.ark
       [[ -f $weight_rspecifier ]] && egs_opts="--weights=ark:$weight_rspecifier"
     fi
     $cmd $gpu_cmd_opt $dir/log/diagnostic_${name}.final.log \
       nnet3-chain-train2 --use-gpu=$use_gpu \
         --leaky-hmm-coefficient=$leaky_hmm_coefficient \
         --xent-regularize=$xent_regularize \
         --out-of-range-regularize=$out_of_range_regularize \
         $l2_regularize_opt \
         --print-interval=10  \
         $dir/final.raw  $den_fst_dir \
         "ark:nnet3-chain-copy-egs $egs_opts scp:$egs_dir/${name}_subset.scp ark:- | nnet3-chain-merge-egs $multilingual_eg_opts --minibatch-size=1:64 ark:- ark:-|" \
         $dir/final_${name}.mdl || touch $dir/.error_diagnostic &
   done
   # The diagnostic jobs were started in the background; wait for them so that
   # their logs are complete and their outputs can be cleaned up before the
   # script goes on.
   wait
   if [ -f $dir/.error_diagnostic ]; then
     # The combined model itself was produced, so this is reported but is not
     # treated as fatal.
     echo "$0: warning: error getting diagnostics for the final model"
   fi

   # -f: one of the two files may be missing if its diagnostic job failed.
   if [ -f $dir/final_train.mdl ]; then
     rm -f $dir/final_{train,heldout}.mdl
   fi
fi

# In the single-language case the combination stage must have produced
# final.mdl.  $multilingual_eg holds the word 'true' or 'false' and is run as
# a command here, as everywhere else in this script; testing it as a string
# would never fire because both words are non-empty.
if ! $multilingual_eg && [[ ! -f $dir/final.mdl ]]; then
  echo "$0: $dir/final.mdl does not exist."
  # we don't want to clean up if the training didn't succeed.
  exit 1;
fi

sleep 2

echo "$0: done"

# Print a one-line summary of the experiment directory.
steps/info/chain_dir_info.pl $dir

exit 0


================================================
FILE: egs/steps/nnet3/chain2/validate_processed_egs.sh
================================================
#!/bin/bash

# Copyright   2019  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
# Copyright   2019  Idiap Research Institute (Author: Srikanth Madikeri).  Apache 2.0.
#
# This script validates a directory containing 'processed' egs for 'chain'
# training, i.e. the output of process_egs.sh.  It also helps to document the
# expectations on such a directory.


if [ -f path.sh ]; then . ./path.sh; fi


# Exactly one argument is required.  Stop after printing the usage message
# instead of going on to validate an empty directory name.
if [ $# != 1 ]; then
  echo "Usage: $0  <processed-egs-dir>"
  echo " e.g.: $0 exp/chain/tdnn1a_sp/processed_egs"
  echo ""
  echo "Validates that the processed-egs dir has the expected format"
  exit 1
fi

dir=$1

# Note: the .ark files are not actually consumed directly downstream (only via
# the top-level .scp files), but we check them anyway for now.
for f in $dir/train.scp $dir/info.txt \
         $dir/heldout_subset.{ark,scp} $dir/train_subset.{ark,scp} \
         $dir/train.1.scp $dir/train.1.ark; do
  if ! [ -f $f -a -s $f ]; then
    echo "$0: expected file $f to exist and be nonempty."
    exit 1
  fi
done


# The command substitution is quoted so that a missing dir_type line compares
# as an empty string (and fails the check) rather than producing a malformed
# test expression whose error status would let the check pass.
if [ "$(awk '/^dir_type/ { print $2; }' <$dir/info.txt)" != "processed_chain_egs" ]; then
  grep dir_type $dir/info.txt
  echo "$0: dir_type should be processed_chain_egs in $dir/info.txt"
  exit 1
fi

lang=$(awk '/^lang / {print $2; }' <$dir/info.txt)

# Per-language files expected under misc/.
for f in $dir/misc/$lang.{trans_mdl,normalization.fst,den.fst}; do
  if ! [ -f $f -a -s $f ]; then
    echo "$0: expected file $f to exist and be nonempty."
    exit 1
  fi
done

echo "$0: sucessfully validated processed egs in $dir"


================================================
FILE: egs/steps/nnet3/chain2/validate_randomized_egs.sh
================================================
#!/bin/bash

# Copyright   2019  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
# Copyright   2019  Idiap Research Institute (Author: Srikanth Madikeri).  Apache 2.0.
#
# This script validates a directory containing 'randomized' egs for 'chain'
# training, i.e. the output of randomize_egs.sh (this is the final form of the
# egs which is consumed by the training script).  It also helps to document the
# expectations on such a directory.


if [ -f path.sh ]; then . ./path.sh; fi


# Exactly one argument is required.  Stop after printing the usage message
# instead of going on to validate an empty directory name.
if [ $# != 1 ]; then
  echo "Usage: $0  <randomized-egs-dir>"
  echo " e.g.: $0 exp/chain/tdnn1a_sp/egs"
  echo ""
  echo "Validates that the final (randomized) egs dir has the expected format"
  exit 1
fi

dir=$1

# Note: the .ark files are not actually consumed directly downstream (only via
# the top-level .scp files), but we check them anyway for now.
for f in $dir/train.1.scp $dir/info.txt \
         $dir/heldout_subset.scp $dir/train_subset.scp; do
  if ! [ -f $f -a -s $f ]; then
    echo "$0: expected file $f to exist and be nonempty."
    exit 1
  fi
done


# The command substitution is quoted so that a missing dir_type line compares
# as an empty string (and fails the check) rather than producing a malformed
# test expression whose error status would let the check pass.
if [ "$(awk '/^dir_type/ { print $2; }' <$dir/info.txt)" != "randomized_chain_egs" ]; then
  grep dir_type $dir/info.txt
  echo "$0: dir_type should be randomized_chain_egs in $dir/info.txt"
  exit 1
fi

langs=$(awk '/^langs / {$1 = ""; print; }' <$dir/info.txt)
num_scp_files=$(awk '/^num_scp_files / { print $2; }' <$dir/info.txt)

if [ -z "$langs" ]; then
  echo "$0: expecting the list of languages to be nonempty in $dir/info.txt"
  exit 1
fi

# Without this check a missing num_scp_files line would make the loop over
# train.N.scp below run zero times and the directory would pass validation.
if [ -z "$num_scp_files" ]; then
  echo "$0: expecting num_scp_files to be set in $dir/info.txt"
  exit 1
fi

# Per-language files: the misc/ models and the per-language info file.
for lang in $langs; do
  for f in $dir/misc/$lang.{trans_mdl,normalization.fst,den.fst} $dir/info_${lang}.txt; do
    if ! [ -f $f -a -s $f ]; then
      echo "$0: expected file $f to exist and be nonempty."
      exit 1
    fi
  done
done

# Every scp file announced in info.txt must be present.
for i in $(seq $num_scp_files); do
  if ! [ -s $dir/train.$i.scp ]; then
    echo "$0: expected file $dir/train.$i.scp to exist and be nonempty."
    exit 1
  fi
done


echo "$0: sucessfully validated randomized egs in $dir"


================================================
FILE: egs/steps/nnet3/chain2/validate_raw_egs.sh
================================================
#!/bin/bash

# Copyright   2019  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
# Copyright   2019  Idiap Research Institute (Author: Srikanth Madikeri).  Apache 2.0.
#
# This script validates a directory containing 'raw' egs for 'chain' training.
# It also helps to document the expectations on such a directory.


if [ -f path.sh ]; then . ./path.sh; fi


# Exactly one argument is required.  Stop after printing the usage message
# instead of going on to validate an empty directory name.
if [ $# != 1 ]; then
  echo "Usage: $0  <raw-egs-dir>"
  echo " e.g.: $0 exp/chaina/tdnn1a_sp/raw_egs"
  echo ""
  echo "Validates that the raw-egs dir has the expected format"
  exit 1
fi

dir=$1

# Files that every raw egs directory must contain.
for f in $dir/all.scp $dir/cegs.1.ark $dir/info.txt \
         $dir/misc/utt2spk; do
  if ! [ -s $f ]; then
    echo "$0: expected file $f to exist and be nonempty."
    exit 1
  fi
done


# The command substitution is quoted so that a missing dir_type line compares
# as an empty string (and fails the check) rather than producing a malformed
# test expression whose error status would let the check pass.
if [ "$(awk '/^dir_type/ { print $2; }' <$dir/info.txt)" != "raw_chain_egs" ]; then
  grep dir_type $dir/info.txt
  echo "$0: dir_type should be raw_chain_egs in $dir/info.txt"
  exit 1
fi

lang=$(awk '/^lang / {print $2; }' <$dir/info.txt)

# Per-language files expected under misc/.
for f in $dir/misc/$lang.{trans_mdl,normalization.fst,den.fst}; do
  if ! [ -s $f ]; then
    echo "$0: expected file $f to exist and be nonempty."
    exit 1
  fi
done

echo "$0: sucessfully validated raw egs in $dir"


================================================
FILE: egs/steps/nnet3/components.py
================================================
#!/usr/bin/env python
# Note: this file is part of some nnet3 config-creation tools that are now deprecated.

from __future__ import print_function
import os
import argparse
import sys
import warnings
import copy
from operator import itemgetter

def GetSumDescriptor(inputs):
    """Fold descriptor strings into one nested ``Sum(a, b)`` descriptor.

    The descriptors are repeatedly combined in pairs until a single
    descriptor remains.  Entries that are empty or whitespace-only are
    ignored while pairing.

    Args:
        inputs: list of descriptor strings.  It is not modified.

    Returns:
        A one-element list holding the combined descriptor.

    Raises:
        ValueError: if ``inputs`` is empty or contains only blank entries
            (the previous implementation looped forever in that case).
    """
    # Work on a copy: the reduction consumes its working list with pop(),
    # which used to empty the caller's list as a side effect.
    sum_descriptors = list(inputs)
    while len(sum_descriptors) != 1:
        cur_sum_descriptors = []
        pair = []
        while sum_descriptors:
            value = sum_descriptors.pop()
            if value.strip() != '':
                pair.append(value)
            if len(pair) == 2:
                cur_sum_descriptors.append("Sum({0}, {1})".format(pair[0], pair[1]))
                pair = []
        if pair:
            # Odd one out is carried over unchanged to the next round.
            cur_sum_descriptors.append(pair[0])
        if not cur_sum_descriptors:
            raise ValueError("GetSumDescriptor: no non-empty descriptors to sum")
        sum_descriptors = cur_sum_descriptors
    return sum_descriptors

# adds the input nodes and returns the descriptor
def AddInputLayer(config_lines, feat_dim, splice_indexes=[0], ivector_dim=0):
    """Declare the network input node(s) and build the spliced input descriptor.

    An ``input`` node of dimension ``feat_dim`` is always declared; an
    ``ivector`` node is declared as well when ``ivector_dim`` is positive.
    The resulting descriptor is printed to stdout.

    Args:
        config_lines: dict with 'components' and 'component-nodes' lists;
            lines are appended to 'components'.
        feat_dim: dimension of the feature input.
        splice_indexes: frame offsets to splice together (0 means the
            current frame).
        ivector_dim: dimension of the ivector input; 0 disables it.

    Returns:
        dict with the 'descriptor' string and its total 'dimension'.
    """
    components = config_lines['components']
    # Looked up (so a missing key still fails here) but not written to.
    component_nodes = config_lines['component-nodes']

    components.append('input-node name=input dim=' + str(feat_dim))

    pieces = []
    for offset in splice_indexes:
        if offset != 0:
            pieces.append('Offset(input, {0})'.format(offset))
        else:
            pieces.append('input')
    total_dim = len(splice_indexes) * feat_dim

    if ivector_dim > 0:
        components.append('input-node name=ivector dim=' + str(ivector_dim))
        pieces.append('ReplaceIndex(ivector, t, 0)')
        total_dim += ivector_dim

    # A single piece is used as-is; several are wrapped in Append(...).
    splice_descriptor = "Append({0})".format(", ".join(pieces)) if len(pieces) > 1 else pieces[0]
    print(splice_descriptor)
    return {'descriptor': splice_descriptor,
            'dimension': total_dim}

def AddNoOpLayer(config_lines, name, input):
    """Append a NoOpComponent named ``<name>_noop`` fed by ``input``.

    Args:
        config_lines: dict with 'components' and 'component-nodes' lists.
        name: prefix for the component and node names.
        input: dict with the 'descriptor' and 'dimension' of the input.

    Returns:
        dict describing the new node; the dimension is that of the input.
    """
    component_defs = config_lines['components']
    node_defs = config_lines['component-nodes']

    component_line = 'component name={0}_noop type=NoOpComponent dim={1}'.format(name, input['dimension'])
    component_defs.append(component_line)
    node_line = 'component-node name={0}_noop component={0}_noop input={1}'.format(name, input['descriptor'])
    node_defs.append(node_line)

    result = {}
    result['descriptor'] = '{0}_noop'.format(name)
    result['dimension'] = input['dimension']
    return result

def AddLdaLayer(config_lines, name, input, lda_file):
    """Add an LDA layer; this simply delegates to AddFixedAffineLayer.

    ``lda_file`` is passed on as the matrix file of the fixed affine
    component, and that function's result is returned unchanged.
    """
    lda_output = AddFixedAffineLayer(config_lines, name, input, lda_file)
    return lda_output

def AddFixedAffineLayer(config_lines, name, input, matrix_file):
    """Append a FixedAffineComponent named ``<name>_fixaffine``.

    Args:
        config_lines: dict with 'components' and 'component-nodes' lists.
        name: prefix for the component and node names.
        input: dict with the 'descriptor' and 'dimension' of the input.
        matrix_file: value written to the component's ``matrix=`` option.

    Returns:
        dict describing the new node.  The reported dimension is copied
        from the input; the matrix file is not inspected here.
    """
    component_defs = config_lines['components']
    node_defs = config_lines['component-nodes']

    component_line = 'component name={0}_fixaffine type=FixedAffineComponent matrix={1}'.format(name, matrix_file)
    component_defs.append(component_line)
    node_line = 'component-node name={0}_fixaffine component={0}_fixaffine input={1}'.format(name, input['descriptor'])
    node_defs.append(node_line)

    result = {}
    result['descriptor'] = '{0}_fixaffine'.format(name)
    result['dimension'] = input['dimension']
    return result


def AddBlockAffineLayer(config_lines, name, input, output_dim, num_blocks):
    """Append a BlockAffineComponent named ``<name>_block_affine``.

    Both the input dimension and ``output_dim`` must be divisible by
    ``num_blocks``; this is checked with an assertion.

    Args:
        config_lines: dict with 'components' and 'component-nodes' lists.
        name: prefix for the component and node names.
        input: dict with the 'descriptor' and 'dimension' of the input.
        output_dim: output dimension of the component.
        num_blocks: number of blocks of the component.

    Returns:
        dict describing the new node, with dimension ``output_dim``.
    """
    component_defs = config_lines['components']
    node_defs = config_lines['component-nodes']

    input_divides = input['dimension'] % num_blocks == 0
    assert input_divides and (output_dim % num_blocks == 0)

    component_line = 'component name={0}_block_affine type=BlockAffineComponent input-dim={1} output-dim={2} num-blocks={3}'.format(name, input['dimension'], output_dim, num_blocks)
    component_defs.append(component_line)
    node_line = 'component-node name={0}_block_affine component={0}_block_affine input={1}'.format(name, input['descriptor'])
    node_defs.append(node_line)

    result = {'descriptor': '{0}_block_affine'.format(name)}
    result['dimension'] = output_dim
    return result

def AddPermuteLayer(config_lines, name, input, column_map):
    """Append a PermuteComponent named ``<name>_permute``.

    Args:
        config_lines: dict with 'components' and 'component-nodes' lists.
        name: prefix for the component and node names.
        input: dict with the 'descriptor' and 'dimension' of the input.
        column_map: iterable of values written, comma-separated, to the
            component's ``column-map=`` option.

    Returns:
        dict describing the new node; the dimension is that of the input.
    """
    component_defs = config_lines['components']
    node_defs = config_lines['component-nodes']

    column_map_string = ",".join(map(str, column_map))
    component_defs.append('component name={0}_permute type=PermuteComponent column-map={1}'.format(name, column_map_string))
    node_line = 'component-node name={0}_permute component={0}_permute input={1}'.format(name, input['descriptor'])
    node_defs.append(node_line)

    result = {'descriptor': '{0}_permute'.format(name)}
    result['dimension'] = input['dimension']
    return result

def AddAffineLayer(config_lines, name, input, output_dim, ng_affine_options = "", max_change_per_component = 0.75):
    """Append a NaturalGradientAffineComponent named ``<name>_affine``.

    Args:
        config_lines: dict with 'components' and 'component-nodes' lists.
        name: prefix for the component and node names.
        input: dict with the 'descriptor' and 'dimension' of the input.
        output_dim: output dimension of the affine component.
        ng_affine_options: extra option text copied into the component line.
        max_change_per_component: value for the ``max-change`` option, or
            None to leave that option out.

    Returns:
        dict describing the new node, with dimension ``output_dim``.
    """
    component_defs = config_lines['components']
    node_defs = config_lines['component-nodes']

    # Per-component max-change option
    if max_change_per_component is not None:
        max_change_text = "max-change={0:.2f}".format(max_change_per_component)
    else:
        max_change_text = ''

    affine_line = "component name={0}_affine type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3} {4}".format(name, input['dimension'], output_dim, ng_affine_options, max_change_text)
    component_defs.append(affine_line)
    node_line = "component-node name={0}_affine component={0}_affine input={1}".format(name, input['descriptor'])
    node_defs.append(node_line)

    result = {'descriptor': '{0}_affine'.format(name)}
    result['dimension'] = output_dim
    return result

def AddAffRelNormLayer(config_lines, name, input, output_dim, ng_affine_options = " bias-stddev=0 ", norm_target_rms = 1.0, self_repair_scale = None, max_change_per_component = 0.75):
    """Append an affine -> ReLU -> renormalize stack.

    Three components (``<name>_affine``, ``<name>_relu``, ``<name>_renorm``)
    and the three nodes chaining them are appended to ``config_lines``.

    Args:
        config_lines: dict with 'components' and 'component-nodes' lists.
        name: prefix for the component and node names.
        input: dict with the 'descriptor' and 'dimension' of the input.
        output_dim: output dimension of the affine component; also the
            dimension of the ReLU and normalize components.
        ng_affine_options: extra option text for the affine component.
        norm_target_rms: value for the normalize component's ``target-rms``.
        self_repair_scale: value for the ReLU's ``self-repair-scale``
            option, or None to leave that option out.
        max_change_per_component: value for the affine component's
            ``max-change`` option, or None to leave that option out.

    Returns:
        dict describing the ``<name>_renorm`` node, dimension ``output_dim``.
    """
    component_defs = config_lines['components']
    node_defs = config_lines['component-nodes']

    # self_repair_scale is a constant scaling the self-repair vector computed in RectifiedLinearComponent
    if self_repair_scale is not None:
        self_repair_text = "self-repair-scale={0:.10f}".format(self_repair_scale)
    else:
        self_repair_text = ''
    # Per-component max-change option
    if max_change_per_component is not None:
        max_change_text = "max-change={0:.2f}".format(max_change_per_component)
    else:
        max_change_text = ''

    component_defs.extend([
        "component name={0}_affine type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3} {4}".format(name, input['dimension'], output_dim, ng_affine_options, max_change_text),
        "component name={0}_relu type=RectifiedLinearComponent dim={1} {2}".format(name, output_dim, self_repair_text),
        "component name={0}_renorm type=NormalizeComponent dim={1} target-rms={2}".format(name, output_dim, norm_target_rms),
    ])

    node_defs.extend([
        "component-node name={0}_affine component={0}_affine input={1}".format(name, input['descriptor']),
        "component-node name={0}_relu component={0}_relu input={0}_affine".format(name),
        "component-node name={0}_renorm component={0}_renorm input={0}_relu".format(name),
    ])

    result = {'descriptor': '{0}_renorm'.format(name)}
    result['dimension'] = output_dim
    return result

def AddAffPnormLayer(config_lines, name, input, pnorm_input_dim, pnorm_output_dim, ng_affine_options = " bias-stddev=0 ", norm_target_rms = 1.0):
    """Append an affine -> p-norm -> renormalize stack.

    Three components (``<name>_affine``, ``<name>_pnorm``, ``<name>_renorm``)
    and the three nodes chaining them are appended to ``config_lines``.

    Args:
        config_lines: dict with 'components' and 'component-nodes' lists.
        name: prefix for the component and node names.
        input: dict with the 'descriptor' and 'dimension' of the input.
        pnorm_input_dim: output dimension of the affine component and input
            dimension of the p-norm component.
        pnorm_output_dim: output dimension of the p-norm component; also the
            dimension of the normalize component.
        ng_affine_options: extra option text for the affine component.
        norm_target_rms: value for the normalize component's ``target-rms``.

    Returns:
        dict describing the ``<name>_renorm`` node, with dimension
        ``pnorm_output_dim``.
    """
    component_defs = config_lines['components']
    node_defs = config_lines['component-nodes']

    component_defs.extend([
        "component name={0}_affine type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input['dimension'], pnorm_input_dim, ng_affine_options),
        "component name={0}_pnorm type=PnormComponent input-dim={1} output-dim={2}".format(name, pnorm_input_dim, pnorm_output_dim),
        "component name={0}_renorm type=NormalizeComponent dim={1} target-rms={2}".format(name, pnorm_output_dim, norm_target_rms),
    ])

    node_defs.extend([
        "component-node name={0}_affine component={0}_affine input={1}".format(name, input['descriptor']),
        "component-node name={0}_pnorm component={0}_pnorm input={0}_affine".format(name),
        "component-node name={0}_renorm component={0}_renorm input={0}_pnorm".format(name),
    ])

    result = {'descriptor': '{0}_renorm'.format(name)}
    result['dimension'] = pnorm_output_dim
    return result

def AddConvolutionLayer(config_lines, name, input,
                       input_x_dim, input_y_dim, input_z_dim,
                       filt_x_dim, filt_y_dim,
                       filt_x_step, filt_y_step,
                       num_filters, input_vectorization,
                       param_stddev = None, bias_stddev = None,
                       filter_bias_file = None,
                       is_updatable = True):
    """Append a ConvolutionComponent named ``<name>_conv``.

    The input dimension must equal ``input_x_dim * input_y_dim *
    input_z_dim`` (checked with an assertion).  The component is initialised
    from ``filter_bias_file`` (``matrix=`` option) when that is given, and
    with ``num-filters=`` otherwise.  ``param_stddev``, ``bias_stddev`` and
    ``is_updatable`` are accepted but not used by this function.

    Args:
        config_lines: dict with 'components' and 'component-nodes' lists.
        name: prefix for the component and node names.
        input: dict with the 'descriptor' and 'dimension' of the input.
        input_x_dim, input_y_dim, input_z_dim: dimensions of the input.
        filt_x_dim, filt_y_dim: filter sizes along x and y.
        filt_x_step, filt_y_step: filter steps along x and y.
        num_filters: number of filters; always used for the output size.
        input_vectorization: value of ``input-vectorization-order``.

    Returns:
        dict with the node 'descriptor' (``<name>_conv_t``), the flattened
        output 'dimension', the '3d-dim' list
        ``[num_x_steps, num_y_steps, num_filters]`` and 'vectorization'
        set to 'zyx'.
    """
    expected_input_dim = input_x_dim * input_y_dim * input_z_dim
    assert input['dimension'] == expected_input_dim
    component_defs = config_lines['components']
    node_defs = config_lines['component-nodes']

    conv_line = ("component name={name}_conv type=ConvolutionComponent "
                 "input-x-dim={input_x_dim} input-y-dim={input_y_dim} input-z-dim={input_z_dim} "
                 "filt-x-dim={filt_x_dim} filt-y-dim={filt_y_dim} "
                 "filt-x-step={filt_x_step} filt-y-step={filt_y_step} "
                 "input-vectorization-order={vector_order}".format(
                     name = name,
                     input_x_dim = input_x_dim, input_y_dim = input_y_dim, input_z_dim = input_z_dim,
                     filt_x_dim = filt_x_dim, filt_y_dim = filt_y_dim,
                     filt_x_step = filt_x_step, filt_y_step = filt_y_step,
                     vector_order = input_vectorization))
    # Either load the filters from a file or let them be initialised.
    if filter_bias_file is None:
        conv_line += " num-filters={0}".format(num_filters)
    else:
        conv_line += " matrix={0}".format(filter_bias_file)

    component_defs.append(conv_line)
    node_defs.append("component-node name={0}_conv_t component={0}_conv input={1}".format(name, input['descriptor']))

    # Number of filter positions along each axis.
    x_positions = 1 + (input_x_dim - filt_x_dim) // filt_x_step
    y_positions = 1 + (input_y_dim - filt_y_dim) // filt_y_step
    flattened_dim = x_positions * y_positions * num_filters
    return {'descriptor':  '{0}_conv_t'.format(name),
            'dimension': flattened_dim,
            '3d-dim': [x_positions, y_positions, num_filters],
            'vectorization': 'zyx'}

# The Maxpooling component assumes input vectorizations of type zyx
def AddMaxpoolingLayer(config_lines, name, input,
                      input_x_dim, input_y_dim, input_z_dim,
                      pool_x_size, pool_y_size, pool_z_size,
                      pool_x_step, pool_y_step, pool_z_step):
    """Append a MaxpoolingComponent named ``<name>_maxp``.

    Args:
        config_lines: dict with 'components' and 'component-nodes' lists.
        name: prefix for the component and node names.
        input: dict with the 'descriptor' and 'dimension' of the input; the
            dimension must equal the product of the three input dims
            (checked with an assertion).
        input_x_dim, input_y_dim, input_z_dim: dimensions of the input.
        pool_x_size, pool_y_size, pool_z_size: pool size along each axis.
        pool_x_step, pool_y_step, pool_z_step: pool step along each axis.

    Returns:
        dict with the node 'descriptor' (``<name>_maxp_t``), the flattened
        output 'dimension', the '3d-dim' list of pool counts per axis and
        'vectorization' set to 'zyx'.

    Raises:
        Exception: if an input dim is below 1, a pool size exceeds the
            matching input dim, or a pool step exceeds the matching pool
            size.
    """
    input_dims = (input_x_dim, input_y_dim, input_z_dim)
    pool_sizes = (pool_x_size, pool_y_size, pool_z_size)
    pool_steps = (pool_x_step, pool_y_step, pool_z_step)

    if any(dim < 1 for dim in input_dims):
        raise Exception("non-positive maxpooling input size ({0}, {1}, {2})".
                 format(input_x_dim, input_y_dim, input_z_dim))
    if any(size > dim for size, dim in zip(pool_sizes, input_dims)):
        raise Exception("invalid maxpooling pool size vs. input size")
    if any(step > size for step, size in zip(pool_steps, pool_sizes)):
        raise Exception("invalid maxpooling pool step vs. pool size")

    assert(input['dimension'] == input_x_dim * input_y_dim * input_z_dim)
    component_defs = config_lines['components']
    node_defs = config_lines['component-nodes']

    maxp_line = ('component name={name}_maxp type=MaxpoolingComponent '
                 'input-x-dim={input_x_dim} input-y-dim={input_y_dim} input-z-dim={input_z_dim} '
                 'pool-x-size={pool_x_size} pool-y-size={pool_y_size} pool-z-size={pool_z_size} '
                 'pool-x-step={pool_x_step} pool-y-step={pool_y_step} pool-z-step={pool_z_step} '.
                 format(name = name,
                 input_x_dim = input_x_dim, input_y_dim = input_y_dim, input_z_dim = input_z_dim,
                 pool_x_size = pool_x_size, pool_y_size = pool_y_size, pool_z_size = pool_z_size,
                 pool_x_step = pool_x_step, pool_y_step = pool_y_step, pool_z_step = pool_z_step))
    component_defs.append(maxp_line)

    node_defs.append('component-node name={0}_maxp_t component={0}_maxp input={1}'.format(name, input['descriptor']))

    # Number of pools along each axis.
    x_pools = 1 + (input_x_dim - pool_x_size) // pool_x_step
    y_pools = 1 + (input_y_dim - pool_y_size) // pool_y_step
    z_pools = 1 + (input_z_dim - pool_z_size) // pool_z_step
    flattened_dim = x_pools * y_pools * z_pools

    return {'descriptor':  '{0}_maxp_t'.format(name),
            'dimension': flattened_dim,
            '3d-dim': [x_pools, y_pools, z_pools],
            'vectorization': 'zyx'}


def AddSoftmaxLayer(config_lines, name, input):
    """Append a LogSoftmaxComponent named ``<name>_log_softmax``.

    Args:
        config_lines: dict with 'components' and 'component-nodes' lists.
        name: prefix for the component and node names.
        input: dict with the 'descriptor' and 'dimension' of the input.

    Returns:
        dict describing the new node; the dimension is that of the input.
    """
    component_defs = config_lines['components']
    node_defs = config_lines['component-nodes']

    component_line = "component name={0}_log_softmax type=LogSoftmaxComponent dim={1}".format(name, input['dimension'])
    component_defs.append(component_line)
    node_line = "component-node name={0}_log_softmax component={0}_log_softmax input={1}".format(name, input['descriptor'])
    node_defs.append(node_line)

    result = {'descriptor': '{0}_log_softmax'.format(name)}
    result['dimension'] = input['dimension']
    return result


def AddSigmoidLayer(config_lines, name, input, self_repair_scale = None):
    """Append a SigmoidComponent named ``<name>_sigmoid``.

    Args:
        config_lines: dict with 'components' and 'component-nodes' lists.
        name: prefix for the component and node names.
        input: dict with the 'descriptor' and 'dimension' of the input.
        self_repair_scale: value for the component's ``self-repair-scale``
            option, or None to leave that option out.

    Returns:
        dict describing the new node; the dimension is that of the input.
    """
    components = config_lines['components']
    component_nodes = config_lines['component-nodes']

    component_line = "component name={0}_sigmoid type=SigmoidComponent dim={1}".format(name, input['dimension'])
    # self_repair_scale is a constant scaling the self-repair vector computed in SigmoidComponent.
    # The option used to be formatted but never inserted into the component
    # line; it is appended only when given, so the line produced for the
    # default (None) is unchanged.
    if self_repair_scale is not None:
        component_line += " self-repair-scale={0:.10f}".format(self_repair_scale)
    components.append(component_line)
    component_nodes.append("component-node name={0}_sigmoid component={0}_sigmoid input={1}".format(name, input['descriptor']))
    return {'descriptor':  '{0}_sigmoid'.format(name),
            'dimension': input['dimension']}

def AddOutputLayer(config_lines, input, label_delay = None, suffix=None, objective_type = "linear"):
    """Append an output-node line to ``config_lines['component-nodes']``.

    The node is named ``output``, or ``output-<suffix>`` when ``suffix`` is
    given.  When ``label_delay`` is given, the input descriptor is wrapped
    in ``Offset(<descriptor>,<label_delay>)``.

    Args:
        config_lines: dict with 'components' and 'component-nodes' lists.
        input: dict whose 'descriptor' is the input of the output node.
        label_delay: offset applied to the input, or None for no offset.
        suffix: optional suffix for the node name.
        objective_type: value written to the ``objective=`` option.

    Returns:
        None.
    """
    # Looked up (so a missing key still fails here) but not written to.
    components = config_lines['components']
    node_defs = config_lines['component-nodes']

    node_name = 'output'
    if suffix is not None:
        node_name = '{0}-{1}'.format(node_name, suffix)

    if label_delay is not None:
        node_line = 'output-node name={0} input=Offset({1},{2}) objective={3}'.format(node_name, input['descriptor'], label_delay, objective_type)
    else:
        node_line = 'output-node name={0} input={1} objective={2}'.format(node_name, input['descriptor'], objective_type)
    node_defs.append(node_line)

def AddFinalLayer(config_lines, input, output_dim,
        ng_affine_options = " param-stddev=0 bias-stddev=0 ",
        max_change_per_component = 1.5,
        label_delay=None,
        use_presoftmax_prior_scale = False,
        prior_scale_file = None,
        include_log_softmax = True,
        add_final_sigmoid = False,
        name_affix = None,
        objective_type = "linear"):
    """Append the final affine layer, its optional nonlinearity and the output node.

    The layer prefix is ``Final``, or ``Final-<name_affix>`` when
    ``name_affix`` is given.  After the affine layer:

    * with ``include_log_softmax``, a log-softmax layer is added, preceded
      by a FixedScaleComponent reading ``prior_scale_file`` when
      ``use_presoftmax_prior_scale`` is set;
    * otherwise, with ``add_final_sigmoid``, a sigmoid layer is added.

    The output node is then created from the last layer, using
    ``name_affix`` as its suffix.

    Args:
        config_lines: dict with 'components' and 'component-nodes' lists.
        input: dict with the 'descriptor' and 'dimension' of the input.
        output_dim: output dimension of the final affine layer.
        ng_affine_options: extra option text for the affine component.
        max_change_per_component: ``max-change`` value for the affine layer.
        label_delay: passed on to AddOutputLayer.
        use_presoftmax_prior_scale: insert a fixed-scale component before
            the log-softmax.
        prior_scale_file: value of the fixed-scale component's ``scales=``.
        include_log_softmax: add the log-softmax layer.
        add_final_sigmoid: add a sigmoid layer (only consulted when
            ``include_log_softmax`` is false).
        name_affix: optional affix for the layer prefix and output suffix.
        objective_type: passed on to AddOutputLayer.

    Returns:
        None.
    """
    component_defs = config_lines['components']
    node_defs = config_lines['component-nodes']

    final_node_prefix = 'Final' if name_affix is None else 'Final-' + str(name_affix)

    last_output = AddAffineLayer(config_lines,
            final_node_prefix , input, output_dim,
            ng_affine_options, max_change_per_component)

    if include_log_softmax:
        if use_presoftmax_prior_scale :
            component_defs.append('component name={0}-fixed-scale type=FixedScaleComponent scales={1}'.format(final_node_prefix, prior_scale_file))
            node_defs.append('component-node name={0}-fixed-scale component={0}-fixed-scale input={1}'.format(final_node_prefix,
                last_output['descriptor']))
            # The softmax now reads from the fixed-scale node.
            last_output['descriptor'] = "{0}-fixed-scale".format(final_node_prefix)
        last_output = AddSoftmaxLayer(config_lines, final_node_prefix, last_output)
    elif add_final_sigmoid:
        # Useful when you need the final outputs to be probabilities
        # between 0 and 1.
        # Usually used with an objective-type such as "quadratic"
        last_output = AddSigmoidLayer(config_lines, final_node_prefix, last_output)

    # we use the same name_affix as a prefix in for affine/scale nodes but as a
    # suffix for output node
    AddOutputLayer(config_lines, last_output, label_delay, suffix = name_affix, objective_type = objective_type)

def AddLstmLayer(config_lines,
                 name, input, cell_dim,
                 recurrent_projection_dim = 0,
                 non_recurrent_projection_dim = 0,
                 clipping_threshold = 30.0,
                 zeroing_threshold = 15.0,
                 zeroing_interval = 20,
                 ng_per_element_scale_options = "",
                 ng_affine_options = "",
                 lstm_delay = -1,
                 self_repair_scale_nonlinearity = None,
                 max_change_per_component = 0.75):
    """Append the components and nodes of one LSTM layer to ``config_lines``.

    Component definitions go to ``config_lines['components']`` and node
    definitions to ``config_lines['component-nodes']``; every name is
    prefixed with ``name`` (stripped of surrounding whitespace).

    Args:
        config_lines: dict with 'components' and 'component-nodes' lists.
        name: prefix for all component and node names of the layer.
        input: dict with the 'descriptor' and 'dimension' of the input.
        cell_dim: dimension of the gates and of the cell.
        recurrent_projection_dim: dimension of the recurrent projection; 0
            means no projection, in which case the recurrence uses ``m_t``
            with dimension ``cell_dim``.
        non_recurrent_projection_dim: dimension of the non-recurrent
            projection; only used when a recurrent projection is also
            requested.
        clipping_threshold, zeroing_threshold, zeroing_interval: options
            written to the BackpropTruncationComponent lines.
        ng_per_element_scale_options: extra option text for the
            per-element-scale components.
        ng_affine_options: extra option text for the affine components.
        lstm_delay: time offset used for the recurrent connections; its
            absolute value is written as ``recurrence-interval``.
        self_repair_scale_nonlinearity: ``self-repair-scale`` value for the
            sigmoid/tanh components, or None to leave the option out.
        max_change_per_component: ``max-change`` value for the updatable
            components, or None to leave the option out.

    Returns:
        dict with the 'descriptor' and 'dimension' of the layer output:
        ``<name>_rp_t`` (recurrent + non-recurrent dims) when both
        projections are requested, ``<name>_r_t`` with the recurrent
        projection dim when only that projection is requested, and
        ``<name>_r_t`` with ``cell_dim`` otherwise.
    """
    assert(recurrent_projection_dim >= 0 and non_recurrent_projection_dim >= 0)
    components = config_lines['components']
    component_nodes = config_lines['component-nodes']

    input_descriptor = input['descriptor']
    input_dim = input['dimension']
    name = name.strip()

    # Without a recurrent projection the recurrence feeds back m_t, whose
    # dimension is cell_dim; with one it feeds back the projected r_t.
    if (recurrent_projection_dim == 0):
        add_recurrent_projection = False
        recurrent_projection_dim = cell_dim
        recurrent_connection = "m_t"
    else:
        add_recurrent_projection = True
        recurrent_connection = "r_t"
    if (non_recurrent_projection_dim == 0):
        add_non_recurrent_projection = False
    else:
        add_non_recurrent_projection = True

    # self_repair_scale_nonlinearity is a constant scaling the self-repair vector computed in derived classes of NonlinearComponent,
    # i.e.,  SigmoidComponent, TanhComponent and RectifiedLinearComponent
    self_repair_nonlinearity_string = "self-repair-scale={0:.10f}".format(self_repair_scale_nonlinearity) if self_repair_scale_nonlinearity is not None else ''
    # Natural gradient per element scale parameters
    ng_per_element_scale_options += " param-mean=0.0 param-stddev=1.0 "
    # Per-component max-change option
    max_change_options = "max-change={0:.2f}".format(max_change_per_component) if max_change_per_component is not None else ''
    # Parameter Definitions W*(* replaced by - to have valid names)
    components.append("# Input gate control : W_i* matrices")
    components.append("component name={0}_W_i-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3} {4}".format(name, input_dim + recurrent_projection_dim, cell_dim, ng_affine_options, max_change_options))
    components.append("# note : the cell outputs pass through a diagonal matrix")
    components.append("component name={0}_w_ic type=NaturalGradientPerElementScaleComponent  dim={1} {2} {3}".format(name, cell_dim, ng_per_element_scale_options, max_change_options))

    components.append("# Forget gate control : W_f* matrices")
    components.append("component name={0}_W_f-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3} {4}".format(name, input_dim + recurrent_projection_dim, cell_dim, ng_affine_options, max_change_options))
    components.append("# note : the cell outputs pass through a diagonal matrix")
    components.append("component name={0}_w_fc type=NaturalGradientPerElementScaleComponent  dim={1} {2} {3}".format(name, cell_dim, ng_per_element_scale_options, max_change_options))

    components.append("#  Output gate control : W_o* matrices")
    components.append("component name={0}_W_o-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3} {4}".format(name, input_dim + recurrent_projection_dim, cell_dim, ng_affine_options, max_change_options))
    components.append("# note : the cell outputs pass through a diagonal matrix")
    components.append("component name={0}_w_oc type=NaturalGradientPerElementScaleComponent  dim={1} {2} {3}".format(name, cell_dim, ng_per_element_scale_options, max_change_options))

    components.append("# Cell input matrices : W_c* matrices")
    components.append("component name={0}_W_c-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3} {4}".format(name, input_dim + recurrent_projection_dim, cell_dim, ng_affine_options, max_change_options))


    components.append("# Defining the non-linearities")
    components.append("component name={0}_i type=SigmoidComponent dim={1} {2}".format(name, cell_dim, self_repair_nonlinearity_string))
    components.append("component name={0}_f type=SigmoidComponent dim={1} {2}".format(name, cell_dim, self_repair_nonlinearity_string))
    components.append("component name={0}_o type=SigmoidComponent dim={1} {2}".format(name, cell_dim, self_repair_nonlinearity_string))
    components.append("component name={0}_g type=TanhComponent dim={1} {2}".format(name, cell_dim, self_repair_nonlinearity_string))
    components.append("component name={0}_h type=TanhComponent dim={1} {2}".format(name, cell_dim, self_repair_nonlinearity_string))

    components.append("# Defining the cell computations")
    components.append("component name={0}_c1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim))
    components.append("component name={0}_c2 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim))
    components.append("component name={0}_m type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim))
    components.append("component name={0}_c type=BackpropTruncationComponent dim={1} "
        "clipping-threshold={2} zeroing-threshold={3} zeroing-interval={4} "
        "recurrence-interval={5}".format(name, cell_dim, clipping_threshold, zeroing_threshold,
        zeroing_interval, abs(lstm_delay)))

    # c1_t and c2_t defined below
    component_nodes.append("component-node name={0}_c_t component={0}_c input=Sum({0}_c1_t, {0}_c2_t)".format(name))
    # Descriptor of the cell value at the delayed time step, reused by the
    # input gate, the forget gate and the first part of c_t.
    c_tminus1_descriptor = "IfDefined(Offset({0}_c_t, {1}))".format(name, lstm_delay)

    component_nodes.append("# i_t")
    component_nodes.append("component-node name={0}_i1 component={0}_W_i-xr input=Append({1}, IfDefined(Offset({0}_{2}, {3})))".format(name, input_descriptor, recurrent_connection, lstm_delay))
    component_nodes.append("component-node name={0}_i2 component={0}_w_ic  input={1}".format(name, c_tminus1_descriptor))
    component_nodes.append("component-node name={0}_i_t component={0}_i input=Sum({0}_i1, {0}_i2)".format(name))

    component_nodes.append("# f_t")
    component_nodes.append("component-node name={0}_f1 component={0}_W_f-xr input=Append({1}, IfDefined(Offset({0}_{2}, {3})))".format(name, input_descriptor, recurrent_connection, lstm_delay))
    component_nodes.append("component-node name={0}_f2 component={0}_w_fc  input={1}".format(name, c_tminus1_descriptor))
    component_nodes.append("component-node name={0}_f_t component={0}_f input=Sum({0}_f1,{0}_f2)".format(name))

    component_nodes.append("# o_t")
    component_nodes.append("component-node name={0}_o1 component={0}_W_o-xr input=Append({1}, IfDefined(Offset({0}_{2}, {3})))".format(name, input_descriptor, recurrent_connection, lstm_delay))
    # Unlike the input and forget gates, the output gate reads the current c_t.
    component_nodes.append("component-node name={0}_o2 component={0}_w_oc input={0}_c_t".format(name))
    component_nodes.append("component-node name={0}_o_t component={0}_o input=Sum({0}_o1, {0}_o2)".format(name))

    component_nodes.append("# h_t")
    component_nodes.append("component-node name={0}_h_t component={0}_h input={0}_c_t".format(name))

    component_nodes.append("# g_t")
    component_nodes.append("component-node name={0}_g1 component={0}_W_c-xr input=Append({1}, IfDefined(Offset({0}_{2}, {3})))".format(name, input_descriptor, recurrent_connection, lstm_delay))
    component_nodes.append("component-node name={0}_g_t component={0}_g input={0}_g1".format(name))

    component_nodes.append("# parts of c_t")
    component_nodes.append("component-node name={0}_c1_t component={0}_c1  input=Append({0}_f_t, {1})".format(name, c_tminus1_descriptor))
    component_nodes.append("component-node name={0}_c2_t component={0}_c2 input=Append({0}_i_t, {0}_g_t)".format(name))

    component_nodes.append("# m_t")
    component_nodes.append("component-node name={0}_m_t component={0}_m input=Append({0}_o_t, {0}_h_t)".format(name))

    # add the recurrent connections
    if (add_recurrent_projection and add_non_recurrent_projection):
        # One affine component produces both projections; the first
        # recurrent_projection_dim outputs are split off as r_t.
        components.append("# projection matrices : Wrm and Wpm")
        components.append("component name={0}_W-m type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3} {4}".format(name, cell_dim, recurrent_projection_dim + non_recurrent_projection_dim, ng_affine_options, max_change_options))
        components.append("component name={0}_r type=BackpropTruncationComponent dim={1} "
            "clipping-threshold={2} zeroing-threshold={3} zeroing-interval={4} "
            "recurrence-interval={5}".format(name, recurrent_projection_dim, clipping_threshold,
            zeroing_threshold, zeroing_interval, abs(lstm_delay)))
        component_nodes.append("# r_t and p_t")
        component_nodes.append("component-node name={0}_rp_t component={0}_W-m input={0}_m_t".format(name))
        component_nodes.append("dim-range-node name={0}_r_t_preclip input-node={0}_rp_t dim-offset=0 dim={1}".format(name, recurrent_projection_dim))
        component_nodes.append("component-node name={0}_r_t component={0}_r input={0}_r_t_preclip".format(name))
        output_descriptor = '{0}_rp_t'.format(name)
        output_dim = recurrent_projection_dim + non_recurrent_projection_dim

    elif add_recurrent_projection:
        components.append("# projection matrices : Wrm")
        components.append("component name={0}_Wrm type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3} {4}".format(
            name, cell_dim, recurrent_projection_dim, ng_affine_options, max_change_options))
        components.append("component name={0}_r type=BackpropTruncationComponent dim={1} "
            "clipping-threshold={2} zeroing-threshold={3} zeroing-interval={4} "
            "recurrence-interval={5}".format(name, recurrent_projection_dim, clipping_threshold,
            zeroing_threshold, zeroing_interval, abs(lstm_delay)))
        component_nodes.append("# r_t")
        component_nodes.append("component-node name={0}_r_t_preclip component={0}_Wrm input={0}_m_t".format(name))
        component_nodes.append("component-node name={0}_r_t component={0}_r input={0}_r_t_preclip".format(name))
        output_descriptor = '{0}_r_t'.format(name)
        output_dim = recurrent_projection_dim

    else:
        # No recurrent projection: a requested non-recurrent projection is
        # not added in this case either.
        components.append("component name={0}_r type=BackpropTruncationComponent dim={1} "
            "clipping-threshold={2} zeroing-threshold={3} zeroing-interval={4} "
            "recurrence-interval={5}".format(name, cell_dim, clipping_threshold,
            zeroing_threshold, zeroing_interval, abs(lstm_delay)))
        component_nodes.append("component-node name={0}_r_t component={0}_r input={0}_m_t".format(name))
        output_descriptor = '{0}_r_t'.format(name)
        output_dim = cell_dim

    return {
            'descriptor': output_descriptor,
            'dimension':output_dim
            }

def AddBLstmLayer(config_lines,
                  name, input, cell_dim,
                  recurrent_projection_dim = 0,
                  non_recurrent_projection_dim = 0,
                  clipping_threshold = 1.0,
                  zeroing_threshold = 3.0,
                  zeroing_interval = 20,
                  ng_per_element_scale_options = "",
                  ng_affine_options = "",
                  lstm_delay = [-1,1],
                  self_repair_scale_nonlinearity = None,
                  max_change_per_component = 0.75):
    """Append a bidirectional LSTM layer built from two AddLstmLayer calls.

    A ``<name>_forward`` layer is added with delay ``lstm_delay[0]`` and a
    ``<name>_backward`` layer with delay ``lstm_delay[1]``; both read the
    same ``input`` and share all other options.

    Args:
        lstm_delay: two-element sequence; the first entry must be negative
            and the second positive (checked with an assertion).
        All other arguments are passed unchanged to AddLstmLayer.

    Returns:
        dict whose 'descriptor' is ``Append(<forward>, <backward>)`` and
        whose 'dimension' is the sum of the two layers' dimensions.
    """
    assert(len(lstm_delay) == 2 and lstm_delay[0] < 0 and lstm_delay[1] > 0)

    # Options common to both directions.
    shared_options = dict(config_lines = config_lines,
                          input = input,
                          cell_dim = cell_dim,
                          recurrent_projection_dim = recurrent_projection_dim,
                          non_recurrent_projection_dim = non_recurrent_projection_dim,
                          clipping_threshold = clipping_threshold,
                          zeroing_threshold = zeroing_threshold,
                          zeroing_interval = zeroing_interval,
                          ng_per_element_scale_options = ng_per_element_scale_options,
                          ng_affine_options = ng_affine_options,
                          self_repair_scale_nonlinearity = self_repair_scale_nonlinearity,
                          max_change_per_component = max_change_per_component)

    # The forward layer is emitted first, then the backward one.
    forward_result = AddLstmLayer(name = "{0}_forward".format(name),
                                  lstm_delay = lstm_delay[0],
                                  **shared_options)
    backward_result = AddLstmLayer(name = "{0}_backward".format(name),
                                   lstm_delay = lstm_delay[1],
                                   **shared_options)

    combined_descriptor = 'Append({0}, {1})'.format(forward_result['descriptor'], backward_result['descriptor'])
    combined_dim = forward_result['dimension'] + backward_result['dimension']

    return {
            'descriptor': combined_descriptor,
            'dimension':combined_dim
            }


================================================
FILE: egs/steps/nnet3/compute_output.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2015  Johns Hopkins University (Author: Daniel Povey).
#                2016  Vimal Manohar
# Apache 2.0.

# This script does forward propagation through a neural network.

# Begin configuration section.
# Defaults for the script options; parse_options.sh is sourced below
# (presumably it lets each of these be overridden as --name value; see the
# usage message for the main ones).
stage=1
nj=4 # number of jobs.
cmd=run.pl
use_gpu=false
frames_per_chunk=50
iter=final
extra_left_context=0
extra_right_context=0
extra_left_context_initial=-1
extra_right_context_final=-1
frame_subsampling_factor=1
compress=false    # Specifies whether the output should be compressed before
                  # dumping to disk
online_ivector_dir=
output_name=      # Dump outputs for this output-node
apply_exp=false  # Apply exp i.e. write likelihoods instead of log-likelihoods
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

if [ $# -ne 3 ]; then
  echo "Usage: $0 [options] <data-dir> <nnet-dir> <output-dir>"
  echo "e.g.:   steps/nnet3/compute_output.sh --nj 8 \\"
  echo "--online-ivector-dir exp/nnet3/ivectors_test_eval92 \\"
  echo "    data/test_eval92_hires exp/nnet3/tdnn exp/nnet3/tdnn/output"
  echo "main options (for others, see top of script file)"
  echo "  --config <config-file>                   # config containing options"
  echo "  --nj <nj>                                # number of parallel jobs"
  echo "  --cmd <cmd>                              # Command to run in parallel with"
  echo "  --iter <iter>                            # Iteration of model to decode; default is final."
  exit 1;
fi

data=$1
srcdir=$2
dir=$3

mkdir -p $dir/log

# convert $dir to absolute pathname
fdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $dir ${PWD}`

# Prefer the raw network $iter.raw; fall back to $iter.mdl when it is absent.
model=$srcdir/$iter.raw
if [ ! -f $srcdir/$iter.raw ]; then
  echo "$0: WARNING: no such file $srcdir/$iter.raw. Trying $srcdir/$iter.mdl instead."
  model=$srcdir/$iter.mdl
fi

# The ivector files are only required when an online ivector dir is given.
[ ! -z "$online_ivector_dir" ] && \
  extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"

for f in $data/feats.scp $model $extra_files; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

# To dump a node other than 'output', replace the model by a piped
# nnet3-copy command that removes the existing 'output' node and renames
# $output_name to 'output'.
if [ ! -z "$output_name" ] && [ "$output_name" != "output" ]; then
  echo "$0: Using output-name $output_name"
  model="nnet3-copy --edits='remove-output-nodes name=output;rename-node old-name=$output_name new-name=output' $model - |"
fi

sdata=$data/split$nj;
cmvn_opts=`cat $srcdir/cmvn_opts` || exit 1;

# Split the data into $nj parts unless an up-to-date split already exists.
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs

## Set up features.
if [ -f $srcdir/final.mat ]; then
  echo "$0: ERROR: lda feature type is no longer supported." && exit 1
fi
# Per-job feature pipeline: CMVN applied to the split features.  JOB is left
# as a literal placeholder here; it appears again in the JOB=1:$nj argument
# given to $cmd below.
feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"

if [ ! -z "$online_ivector_dir" ]; then
  ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1;
  ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period"
fi

frame_subsampling_opt=
if [ $frame_subsampling_factor -ne 1 ]; then
  # e.g. for 'chain' systems
  frame_subsampling_opt="--frame-subsampling-factor=$frame_subsampling_factor"
fi

# Where the network output is written: through copy-matrix --apply-exp when
# likelihoods are requested, otherwise through copy-feats (optionally
# compressed).  Either way one ark/scp pair is written per job.
if $apply_exp; then
  output_wspecifier="ark:| copy-matrix --apply-exp ark:- ark,scp:$dir/output.JOB.ark,$dir/output.JOB.scp"
else
  output_wspecifier="ark:| copy-feats --compress=$compress ark:- ark,scp:$dir/output.JOB.ark,$dir/output.JOB.scp"
fi

gpu_opt="--use-gpu=no"
gpu_queue_opt=

# With a GPU, "--gpu 1" is passed to $cmd and the binary name becomes
# nnet3-compute-batch (via $suffix); otherwise plain nnet3-compute is run.
if $use_gpu; then
  gpu_queue_opt="--gpu 1"
  suffix="-batch"
  gpu_opt="--use-gpu=yes"
else
  gpu_opt="--use-gpu=no"
fi

if [ $stage -le 2 ]; then
  $cmd $gpu_queue_opt JOB=1:$nj $dir/log/compute_output.JOB.log \
    nnet3-compute$suffix $gpu_opt $ivector_opts $frame_subsampling_opt \
     --frames-per-chunk=$frames_per_chunk \
     --extra-left-context=$extra_left_context \
     --extra-right-context=$extra_right_context \
     --extra-left-context-initial=$extra_left_context_initial \
     --extra-right-context-final=$extra_right_context_final \
     "$model" "$feats" "$output_wspecifier" || exit 1;
fi

# Concatenate the per-job scp files, in job order, into a single output.scp.
for n in $(seq $nj); do
  cat $dir/output.$n.scp
done > $dir/output.scp

exit 0;


================================================
FILE: egs/steps/nnet3/convert_nnet2_to_nnet3.py
================================================
#!/usr/bin/env python

# Copyright 2017    Joachim Fainberg.

# This script converts nnet2 models into nnet3 models.
# It requires knowledge of valid components which
# can be modified in the configuration section below.

from __future__ import print_function
import argparse, os, tempfile, logging, sys, shutil, fileinput, re
from collections import defaultdict, namedtuple
import numpy as np
sys.path.insert(0, 'steps/')
import libs.nnet3.train.common as common_train_lib
import libs.common as common_lib

# Begin configuration section
# Components and their corresponding node names

# Maps each nnet2 component token to the base name used for the matching
# nnet3 node; Nnet3Model.add_component() appends a per-name counter to it
# (e.g. "affine" -> "affine3").
NODE_NAMES = {
    "<AffineComponent>":"affine",
    "<AffineComponentPreconditioned>":"affine",
    "<AffineComponentPreconditionedOnline>":"affine",
    "<BlockAffineComponent>":"affine",
    "<BlockAffineComponentPreconditioned>":"affine",
    "<SigmoidComponent>":"nonlin",
    "<TanhComponent>":"nonlin",
    "<PowerComponent>":"nonlin",
    "<RectifiedLinearComponent>":"nonlin",
    "<SoftHingeComponent>":"nonlin",
    "<PnormComponent>":"nonlin",
    "<NormalizeComponent>":"renorm",
    "<MaxoutComponent>":"maxout",
    "<MaxpoolingComponent>":"maxpool",
    "<ScaleComponent>":"rescale",
    "<DropoutComponent>":"dropout",
    "<SoftmaxComponent>":"softmax",
    "<LogSoftmaxComponent>":"log-softmax",
    "<FixedScaleComponent>":"fixed-scale",
    "<FixedAffineComponent>":"fixed-affine",
    "<FixedLinearComponent>":"fixed-linear",
    "<FixedBiasComponent>":"fixed-bias",
    "<PermuteComponent>":"permute",
    "<AdditiveNoiseComponent>":"noise",
    "<Convolutional1dComponent>":"conv",
    "<SumGroupComponent>":"sum-group",
    "<DctComponent>":"dct",
    "<SpliceComponent>":"splice",
    "<SpliceMaxComponent>":"splice"
}

# Tokens selected by substring match on the component name; parse_component()
# routes these to the splice / affine parsers respectively.  Note that the
# "Affine" match also picks up the Block* and Fixed* affine variants.
SPLICE_COMPONENTS = [c for c in NODE_NAMES if "Splice" in c]
AFFINE_COMPONENTS = [c for c in NODE_NAMES if "Affine" in c]

# Every component token the converter accepts; parse_component() raises
# LookupError for anything not listed here.
KNOWN_COMPONENTS = list(NODE_NAMES.keys())
# End configuration section

# Module logger: INFO and above go to a StreamHandler, each record prefixed
# with timestamp, file name, line number, function name and level.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s [%(filename)s:%(lineno)s - "
                              "%(funcName)s - %(levelname)s ] %(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)

def GetArgs():
    """Parse the command line and prepare the directories it names.

    Echoes the invoking command line to stdout, parses ``sys.argv`` and
    creates ``nnet3_dir`` and (when non-empty) ``tmpdir`` if they do not
    exist yet.

    Returns:
        The parsed ``argparse.Namespace`` with the attributes ``tmpdir``,
        ``skip_cleanup``, ``model``, ``binary``, ``nnet2_dir`` and
        ``nnet3_dir``.
    """
    arg_parser = argparse.ArgumentParser(
        description="Converts nnet2 into nnet3 models.",
        epilog="e.g. steps/nnet3/convert_nnet2_to_nnet3.py \n"
               "                  exp/tri4_nnet2 exp/tri4_nnet3")

    # Optional arguments.
    arg_parser.add_argument(
        "--tmpdir", type=str, default="./",
        help="Custom location for the temporary directory.")
    arg_parser.add_argument(
        "--skip-cleanup", action='store_true',
        help="Will not remove the temporary directory.")
    arg_parser.add_argument(
        "--model", type=str, default='final.mdl',
        help="Choose a specific model to convert.")
    arg_parser.add_argument(
        "--binary", type=str, default="true", choices=["true", "false"],
        help="Whether to write the model in binary or not.")

    # Positional arguments: source nnet2 directory, destination nnet3 one.
    arg_parser.add_argument(
        "nnet2_dir", metavar="src-nnet2-dir", type=str, help="")
    arg_parser.add_argument(
        "nnet3_dir", metavar="src-nnet3-dir", type=str, help="")

    print(' '.join(sys.argv))

    args = arg_parser.parse_args()

    # The output directory is always needed; the temporary one only when a
    # non-empty location was given.
    wanted_dirs = [args.nnet3_dir]
    if args.tmpdir:
        wanted_dirs.append(args.tmpdir)
    for directory in wanted_dirs:
        if not os.path.exists(directory):
            os.makedirs(directory)

    return args

class Nnet3Model(object):
    """Collects the pieces of an nnet3 model while an nnet2 model is parsed.

    Components are registered one at a time with add_component();
    write_config() then emits an nnet3 config file and write_model() runs
    the nnet3-init / nnet3-am-init / nnet3-am-adjust-priors commands that
    combine the config, the transition model and the priors into the final
    model file.
    """

    # Record for one parsed component: node name (or "splice"), component
    # type and parameters.  Defined once here instead of being rebuilt on
    # every add_component() call.
    _Component = namedtuple("Component", "ident component pairs")

    def __init__(self):
        self.input_dim = -1         # set from the first <InputDim> seen
        self.output_dim = -1
        self.ivector_dim = 0        # set from <ConstComponentDim> while still 0
        self.counts = defaultdict(int)  # node base name -> number used so far
        self.num_components = 0     # count announced in the nnet2 header
        self.components_read = 0    # count actually passed to add_component()
        self.config = ""            # path of the config written by write_config()
        self.transition_model = ""  # path of the transition model text file
        self.priors = ""            # path of the priors vector text file
        self.components = []

    def add_component(self, component, pairs):
        """Register one parsed nnet2 component.

        Args:
            component: the nnet2 component token, e.g. "<PnormComponent>".
            pairs: dictionary of token -> value for that component.  Note
                that a "<P>" entry of a <PnormComponent> is removed from
                this dictionary in place.

        Splice components are stored with their raw ``pairs`` dictionary
        under the ident "splice"; all other components are stored with a
        numbered node name (e.g. "affine3"), the component type without
        angle brackets, and ``pairs`` rendered as "key=value" strings.
        """
        self.components_read += 1

        if "<InputDim>" in pairs and self.input_dim == -1:
            self.input_dim = int(pairs["<InputDim>"])

        if "<ConstComponentDim>" in pairs and self.ivector_dim == 0:
            self.ivector_dim = int(pairs["<ConstComponentDim>"])

        # remove nnet2 specific tokens and catch descriptors
        if component == "<PnormComponent>" and "<P>" in pairs:
            pairs.pop("<P>")
        elif component in SPLICE_COMPONENTS:
            # Splices become input descriptors in write_config(), not
            # components of their own, so keep the raw dictionary.
            self.components.append(
                self._Component("splice", component, pairs))
            return

        # format pairs: {'<InputDim>':43} -> ['input-dim=43']
        config_pairs = ["{0}={1}".format(token_to_string(key), pairs[key])
                        for key in pairs]

        # keep track of layer type number (e.g. affine3)
        node_name = NODE_NAMES[component]
        self.counts[node_name] += 1
        ident = node_name + str(self.counts[node_name])

        # <PnormComponent> -> PnormComponent
        component_type = component[1:-1]

        self.components.append(
            self._Component(ident, component_type, config_pairs))

    def write_config(self, filename):
        """Write the nnet3 config describing all added components.

        The file lists one ``component`` line per non-splice component,
        then the input node(s), one ``component-node`` line per component
        chained in the order they were added, and a final output node with
        a linear objective.  The path is remembered in ``self.config``.
        """
        logger.info("Writing config to {0}".format(filename))

        self.config = filename
        with open(filename, 'w') as f:
            for component in self.components:
                if component.ident == "splice":
                    continue
                config_string = ' '.join(component.pairs)

                f.write("component name={name} type={comp_type} {config_string}"
                        "\n".format(name=component.ident,
                                    comp_type=component.component,
                                    config_string=config_string))

            f.write("\n# Component nodes\n")
            if self.ivector_dim != 0:
                # The recorded input dimension includes the ivector part,
                # which gets its own input node.
                f.write("input-node name=input dim={0}\n".format(self.input_dim-self.ivector_dim))
                f.write("input-node name=ivector dim={0}\n".format(self.ivector_dim))
            else:
                f.write("input-node name=input dim={0}\n".format(self.input_dim))

            previous_component = "input"
            for component in self.components:
                if component.ident == "splice":
                    # Create splice string for the next node
                    previous_component = make_splice_string(
                        previous_component,
                        component.pairs["<Context>"],
                        component.pairs["<ConstComponentDim>"])
                    continue
                f.write("component-node name={name} component={name} "
                        "input={inp}\n".format(name=component.ident,
                                               inp=previous_component))
                previous_component = component.ident

            logger.warning("Assuming linear objective.")
            f.write("output-node name=output input={inp} objective={obj}"
                    "\n".format(inp=previous_component, obj='linear'))

    def write_model(self, model, binary="true"):
        """Build the final nnet3 model file from the written config.

        Args:
            model: path of the nnet3 model to write.
            binary: "true" or "false", passed to the last command as
                ``--binary``.

        Intermediate files are written to the module-level ``tmpdir``.

        Raises:
            IOError: if the config file from write_config() does not exist.
        """
        if not os.path.exists(self.config):
            raise IOError("Config file {0} does not exist.".format(self.config))

        raw_model = os.path.join(tmpdir, "nnet3.raw")
        no_prior_model = os.path.join(tmpdir, "nnet3_no_prior.mdl")

        # write raw model
        common_lib.execute_command("nnet3-init --binary=true {0} {1}"
            .format(self.config, raw_model))

        # add transition model
        common_lib.execute_command("nnet3-am-init --binary=true {0} {1} {2}"
            .format(self.transition_model, raw_model, no_prior_model))

        # add priors
        common_lib.execute_command("nnet3-am-adjust-priors "
                                     "--binary={0} {1} {2} {3}"
            .format(binary, no_prior_model, self.priors, model))

def parse_nnet2_to_nnet3(line_buffer):
    """Build an Nnet3Model from the text form of an nnet2 model.

    ``line_buffer`` is an iterator over the lines of the nnet2 text model.
    It is handed to the specialised parsers in turn: transition model,
    nnet header, each component up to the "</Components>" line, and
    finally the priors.

    Returns the populated Nnet3Model.
    """
    nnet3 = Nnet3Model()

    # <TransitionModel> ...
    nnet3.transition_model = parse_transition_model(line_buffer)

    # <Nnet> <NumComponents> ...
    line, nnet3.num_components = parse_nnet2_header(line_buffer)

    # One component per iteration until the closing tag is reached.
    while not line.startswith("</Components>"):
        component, pairs = parse_component(line, line_buffer)
        nnet3.add_component(component, pairs)
        line = next(line_buffer)

    nnet3.priors = parse_priors(line, line_buffer)

    # A mismatch is only reported, not treated as fatal.
    if nnet3.components_read != nnet3.num_components:
        logger.error("Did not read all components succesfully: {0}/{1}"
                     .format(nnet3.components_read, nnet3.num_components))

    return nnet3

def parse_transition_model(line_buffer):
    """Copy the transition model section into a text file.

    Reads lines from ``line_buffer``, starting with the one that must begin
    with "<TransitionModel>", and writes them verbatim to
    ``<tmpdir>/transition_model`` up to and including the line that starts
    with "</TransitionModel>".

    Returns the path of the written file.
    """
    first_line = next(line_buffer)
    assert first_line.startswith("<TransitionModel>")

    out_path = os.path.join(tmpdir, "transition_model")

    with open(out_path, 'w') as out:
        out.write(first_line)

        reached_end = False
        while not reached_end:
            current = next(line_buffer)
            out.write(current)
            reached_end = current.startswith("</TransitionModel>")

    return out_path

def parse_nnet2_header(line_buffer):
    """Parse the "<Nnet> <NumComponents> N <Components> ..." header line.

    Returns a tuple ``(rest_of_line, num_components)`` where ``rest_of_line``
    is whatever follows the "<Components>" token on the header line.
    """
    header = next(line_buffer)
    assert header.startswith("<Nnet>")

    after_nnet = consume_token("<Nnet>", header)
    # Second whitespace-separated field is the component count.
    num_components = int(after_nnet.split()[1])
    after_count = after_nnet.partition(str(num_components))[2]
    remainder = consume_token("<Components>", after_count)

    return remainder, num_components
                
def parse_component(line, line_buffer):
    """Parse one nnet2 component starting at ``line``.

    The first token of ``line`` selects the parser.  After parsing,
    ``line_buffer`` is advanced past the component's closing tag.

    Returns a tuple ``(component_token, pairs)``.

    Raises:
        LookupError: if the component token is not in KNOWN_COMPONENTS.
    """
    component = line.split()[0]

    if component in SPLICE_COMPONENTS:
        # The splice parser reads an extra line and hands it back.
        line, pairs = parse_splice_component(component, line, line_buffer)
    elif component in AFFINE_COMPONENTS:
        pairs = parse_affine_component(component, line, line_buffer)
    else:
        dedicated_parsers = {
            "<FixedScaleComponent>": parse_fixed_scale_component,
            "<FixedBiasComponent>": parse_fixed_bias_component,
            "<SumGroupComponent>": parse_sum_group_component,
        }
        parser = dedicated_parsers.get(component)
        if parser is None:
            if component not in KNOWN_COMPONENTS:
                raise LookupError(
                    "Unrecognised component, {0}.".format(component))
            parser = parse_standard_component
        pairs = parser(component, line, line_buffer)

    parse_end_of_component(component, line, line_buffer)

    return component, pairs

def parse_standard_component(component, line, line_buffer):
    """Parse a component whose parameters are simple "<Token> value" pairs.

    Args:
        component: the component token expected at the start of ``line``.
        line: the text line holding the component.
        line_buffer: unused; kept so all component parsers share one
            signature.

    Returns a dictionary mapping each "<Token>" to its value string.  Only
    tokens followed by a plain word/number are captured, so bracketed
    statistics such as ValueSum and DerivSum are ignored.
    """
    line = consume_token(component, line)
    # Raw string: "\w" in a normal string literal is an invalid escape
    # sequence and triggers a warning on newer Python versions.
    pairs = re.findall(r"(<\w+>) ([\w.-]+)", line)

    return dict(pairs)

def parse_fixed_scale_component(component, line, line_buffer):
    """Parse a <FixedScaleComponent> and dump its scales to a file.

    The vector following the "<Scales>" token on ``line`` is written, in
    "[ ... ]" form, to a new temporary file inside ``tmpdir``.

    Args:
        component: the component token expected at the start of ``line``.
        line: the text line holding the component.
        line_buffer: unused; kept for a uniform parser signature.

    Returns ``{"<Scales>": filename}``.
    """
    line = consume_token(component, line)
    line = consume_token("<Scales>", line)

    scales = np.array([parse_vector(line)])

    # mkstemp() returns an already-open descriptor; wrap it so it is closed
    # instead of leaking one descriptor per component.
    fd, filename = tempfile.mkstemp(dir=tmpdir)
    with os.fdopen(fd, 'w') as f:
        f.write("[ ")
        np.savetxt(f, scales, newline='')
        f.write(" ]")

    return {"<Scales>" : filename}

def parse_sum_group_component(component, line, line_buffer):
    """Parse a <SumGroupComponent>.

    Returns ``{"<Sizes>": sizes}`` where ``sizes`` is the bracketed list
    following the "<Sizes>" token, with the brackets removed and every
    space replaced by a comma.
    """
    after_sizes = consume_token("<Sizes>", consume_token(component, line))
    bare_list = after_sizes.strip().strip("[]").strip()

    return {"<Sizes>" : bare_list.replace(' ', ',')}

def parse_fixed_bias_component(component, line, line_buffer):
    """Parse a <FixedBiasComponent> and dump its bias vector to a file.

    The vector following the "<Bias>" token on ``line`` is written, in
    "[ ... ]" form, to a new temporary file inside ``tmpdir``.

    Args:
        component: the component token expected at the start of ``line``.
        line: the text line holding the component.
        line_buffer: unused; kept for a uniform parser signature.

    Returns ``{"<Bias>": filename}``.
    """
    line = consume_token(component, line)
    line = consume_token("<Bias>", line)

    bias = np.array([parse_vector(line)])

    # mkstemp() returns an already-open descriptor; wrap it so it is closed
    # instead of leaking one descriptor per component.
    fd, filename = tempfile.mkstemp(dir=tmpdir)
    with os.fdopen(fd, 'w') as f:
        f.write("[ ")
        np.savetxt(f, bias, newline='')
        f.write(" ]")

    return {"<Bias>" : filename}

def parse_splice_component(component, line, line_buffer):
    """Parse a <SpliceComponent>.

    Reads the input dimension and context list from ``line`` and the
    "<ConstComponentDim>" value from the following line of ``line_buffer``.

    Returns a tuple ``(line, pairs)``: ``line`` is the remainder of that
    following line after the "<ConstComponentDim>" token, and ``pairs``
    holds "<InputDim>" (string), "<Context>" (list of strings) and
    "<ConstComponentDim>" (int).

    Raises:
        NotImplementedError: for <SpliceMaxComponent>.
    """
    if component == "<SpliceMaxComponent>":
        raise NotImplementedError("Script doesn't support SpliceMaxComponent.")

    after_dim_token = consume_token("<InputDim>",
                                    consume_token(component, line))
    input_dim, _, after_dim = after_dim_token.strip().partition(' ')

    # Drop the surrounding brackets, keep the offsets as strings.
    context = consume_token("<Context>", after_dim).strip()[1:-1].split()

    # Context vector adds newline, so the constant dimension is on the
    # next line.
    tail = consume_token("<ConstComponentDim>", next(line_buffer))
    const_component_dim = int(tail.strip().split()[0])

    pairs = {
        "<InputDim>" : input_dim,
        "<Context>" : context,
        "<ConstComponentDim>" : const_component_dim,
    }
    return tail, pairs

def parse_end_of_component(component, line, line_buffer):
    """Advance ``line_buffer`` past the closing tag of ``component``.

    If ``line`` already contains the closing tag nothing is read;
    otherwise lines are consumed until one containing it is found.
    """
    # "<FooComponent>" -> "</FooComponent>"
    closing_tag = "</" + component[1:]

    current = line
    while True:
        if closing_tag in current:
            return
        current = next(line_buffer)

def parse_affine_component(component, line, line_buffer):
    """Parse an affine component and dump its parameters to a file.

    Simple "<Token> value" pairs are taken from ``line``; the weight rows
    and the bias vector are read from ``line_buffer``, joined into a single
    matrix with the bias as the last column, and written in "[ ... ]" form
    to a new temporary file inside ``tmpdir``.

    Args:
        component: the component token (not otherwise used here).
        line: first line of the component; must contain "<LinearParams>".
        line_buffer: iterator positioned just after ``line``.

    Returns the pairs dictionary with an added "<Matrix>" entry holding the
    file name.
    """
    assert ("<LinearParams>" in line)

    # Raw string: "\w" in a normal string literal is an invalid escape
    # sequence and triggers a warning on newer Python versions.
    pairs = dict(re.findall(r"(<\w+>) ([\w.-]+)", line))

    # read the linear params and bias and convert it to a matrix
    weights = parse_weights(line_buffer)
    bias = parse_bias(next(line_buffer))

    matrix = np.concatenate([weights, bias.T], axis=1)

    # write matrix and return pairs with filename; mkstemp() returns an
    # already-open descriptor, so wrap it to make sure it gets closed.
    fd, filename = tempfile.mkstemp(dir=tmpdir)
    with os.fdopen(fd, 'w') as f:
        f.write("[ ")
        np.savetxt(f, matrix)
        f.write(" ]")

    pairs["<Matrix>"] = filename

    return pairs

def parse_weights(line_buffer):
    """Read the rows of a weight matrix from ``line_buffer``.

    Lines ending in "[" are skipped; every other line is parsed as one
    row, and the line ending in "]" is the last row read.

    Returns the rows as a numpy array.
    """
    rows = []

    while True:
        text = next(line_buffer).strip()

        if text.endswith("["):
            continue

        rows.append(parse_vector(text))
        if text.endswith("]"):
            break

    return np.array(rows)

def parse_bias(line):
    """Parse a bias line into a numpy array holding one row.

    A leading "<BiasParams>" token is removed when present.
    """
    payload = line
    if "<BiasParams>" in payload:
        payload = consume_token("<BiasParams>", payload)

    row = parse_vector(payload)
    return np.array([row])

def parse_vector(line):
    """Convert a whitespace-separated list of numbers into a float32 array.

    Surrounding whitespace and any leading/trailing square brackets are
    ignored.
    """
    fields = line.strip().strip("[]").split()
    return np.array(list(map(float, fields)), dtype="float32")

def parse_priors(line, line_buffer):
    """Write the priors vector found on ``line`` to ``<tmpdir>/priors``.

    Everything after the first "[" on ``line`` is parsed as the vector and
    written back out in "[ ... ]" form.  ``line_buffer`` is not read.

    Returns the path of the written file.
    """
    _, _, after_bracket = line.partition('[')
    values = parse_vector(after_bracket)
    out_path = os.path.join(tmpdir, "priors")

    with open(out_path, 'w') as out:
        out.write("[ ")
        np.savetxt(out, values, newline=' ')
        out.write(" ]")

    return out_path

def token_to_string(token):
    """Turn an nnet2 token into a lowercase, hyphen-separated option name.

    The first and last characters (the angle brackets) are dropped and a
    hyphen is inserted at each word boundary of the CamelCase name.

    E.g. <InputDim> -> input-dim
    """
    word_start = r"((?<=[a-z])[A-Z]|(?<!\A)[A-Z](?=[a-z]))"
    bare_name = token[1:-1]
    hyphenated = re.sub(word_start, lambda m: "-" + m.group(1), bare_name)
    return hyphenated.lower()

def consume_token(token, line):
    """Return the part of ``line`` that follows the first ``token``.

    An error is logged (but nothing is raised) when ``token`` is not the
    first whitespace-separated field of ``line``.
    """
    leading_field = line.split(None, 1)[0]
    if leading_field != token:
        logger.error("Unexpected token, expected '{0}', got '{1}'."
              .format(token, line.split(None, 1)[0]))

    _, _, remainder = line.partition(token)
    return remainder

def make_splice_string(nodename, context, const_component_dim=0):
    """Generates splice string from a list of context.

    Args:
        nodename: name of the node being spliced.
        context: list of frame offsets (numbers or strings).
        const_component_dim: when greater than zero, the ivector input is
            appended to the splice.

    E.g. make_splice_string("renorm4", [-4, 4])
    returns "Append(Offset(renorm4, -4), Offset(renorm4, 4))"
    """
    # isinstance() rather than an exact type comparison, so that list
    # subclasses are accepted as well.
    assert isinstance(context, list), "context argument must be a list"
    parts = ["Offset({0}, {1})".format(nodename, offset) for offset in context]
    if const_component_dim > 0:
        parts.append("ReplaceIndex(ivector, t, 0)")
    return "Append(" + ", ".join(parts) + ")"

# Working directory for intermediate files.  Main() replaces this with a
# freshly created temporary directory before the parse/write functions that
# read it are called.
tmpdir = ""

def Main():
    """Convert the nnet2 model named on the command line into an nnet3 one.

    Steps: create a temporary working directory, dump the nnet2 model as
    text with preconditioning removed, parse it, write the nnet3 config and
    model, then remove the working directory unless --skip-cleanup was
    given.
    """
    global tmpdir

    args = GetArgs()
    source_model = os.path.join(args.nnet2_dir, args.model)
    target_model = os.path.join(args.nnet3_dir, args.model)

    logger.info("Converting nnet2 model {0} to nnet3 model {1}"
                .format(source_model, target_model))

    tmpdir = tempfile.mkdtemp(dir=args.tmpdir)

    # Convert nnet2 model to text and remove preconditioning
    copy_command = ("nnet-am-copy "
                    "--remove-preconditioning=true --binary=false {0}/{1} {2}/{1}"
                    .format(args.nnet2_dir, args.model, tmpdir))
    common_lib.execute_command(copy_command)

    # Parse nnet2 and return nnet3 object
    with open(os.path.join(tmpdir, args.model)) as text_model:
        nnet3 = parse_nnet2_to_nnet3(text_model)

    # Write model
    nnet3.write_config(os.path.join(tmpdir, "config"))
    nnet3.write_model(target_model, binary=args.binary)

    if args.skip_cleanup:
        logger.info("Not removing temporary directory {0}".format(tmpdir))
    else:
        shutil.rmtree(tmpdir)

    logger.info("Wrote nnet3 model to {0}".format(target_model))

# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    Main()


================================================
FILE: egs/steps/nnet3/decode.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2015  Johns Hopkins University (Author: Daniel Povey).
# Apache 2.0.

# This script does decoding with a neural-net.

# Begin configuration section.
stage=1
nj=4 # number of decoding jobs.
acwt=0.1  # Just a default value, used for adaptation and beam-pruning..
post_decode_acwt=1.0  # can be used in 'chain' systems to scale acoustics by 10 so the
                      # regular scoring script works.
cmd=run.pl
beam=15.0
frames_per_chunk=50
max_active=7000
min_active=200
ivector_scale=1.0
lattice_beam=8.0 # Beam we use in lattice generation.
iter=final
num_threads=1 # if >1, will use gmm-latgen-faster-parallel
use_gpu=false # If true, will use a GPU, with nnet3-latgen-faster-batch.
              # In that case it is recommended to set num-threads to a large
              # number, e.g. 20 if you have that many free CPU slots on a GPU
              # node, and to use a small number of jobs.
scoring_opts=
skip_diagnostics=false
skip_scoring=false
extra_left_context=0
extra_right_context=0
extra_left_context_initial=-1
extra_right_context_final=-1
online_ivector_dir=
minimize=false
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. utils/parse_options.sh || exit 1;

# Exactly three positional arguments are required; otherwise print the usage
# text and exit with an error.
if [ $# -ne 3 ]; then
  echo "Usage: $0 [options] <graph-dir> <data-dir> <decode-dir>"
  echo "e.g.:   steps/nnet3/decode.sh --nj 8 \\"
  echo "--online-ivector-dir exp/nnet2_online/ivectors_test_eval92 \\"
  # \$dir is escaped: the variable is not set at this point, so an unescaped
  # reference would print an empty string in the example.
  echo "    exp/tri4b/graph_bg data/test_eval92_hires \$dir/decode_bg_eval92"
  echo "main options (for others, see top of script file)"
  echo "  --config <config-file>                   # config containing options"
  echo "  --nj <nj>                                # number of parallel jobs"
  echo "  --cmd <cmd>                              # Command to run in parallel with"
  echo "  --beam <beam>                            # Decoding beam; default 15.0"
  echo "  --iter <iter>                            # Iteration of model to decode; default is final."
  echo "  --scoring-opts <string>                  # options to local/score.sh"
  echo "  --num-threads <n>                        # number of threads to use, default 1."
  echo "  --use-gpu <true|false>                   # default: false.  If true, we recommend"
  echo "                                           # to use large --num-threads as the graph"
  echo "                                           # search becomes the limiting factor."
  exit 1;
fi

graphdir=$1
data=$2
dir=$3
srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory.
model=$srcdir/$iter.mdl


extra_files=
if [ ! -z "$online_ivector_dir" ]; then
  steps/nnet2/check_ivectors_compatible.sh $srcdir $online_ivector_dir || exit 1
  extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"
fi

utils/lang/check_phones_compatible.sh {$srcdir,$graphdir}/phones.txt || exit 1

for f in $graphdir/HCLG.fst $data/feats.scp $model $extra_files; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

sdata=$data/split$nj;
if [ -f $srcdir/cmvn_opts ]; then
    cmvn_opts=`cat $srcdir/cmvn_opts`
else
    cmvn_opts="--norm-means=false --norm-vars=false"
fi
# thread_string is appended to the decoder binary name and its options;
# queue_opt is passed to $cmd.  Both start empty so that the single-threaded
# CPU case does not pick up a queue_opt value inherited from the environment.
thread_string=
queue_opt=
if $use_gpu; then
  if [ $num_threads -eq 1 ]; then
    echo "$0: **Warning: we recommend to use --num-threads > 1 for GPU-based decoding."
  fi
  thread_string="-batch --num-threads=$num_threads"
  queue_opt="--num-threads $num_threads --gpu 1"
elif [ $num_threads -gt 1 ]; then
  thread_string="-parallel --num-threads=$num_threads"
  queue_opt="--num-threads $num_threads"
fi

mkdir -p $dir/log
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs

## Set up features.
if [ -f $srcdir/online_cmvn ]; then online_cmvn=true
else online_cmvn=false; fi

if ! $online_cmvn; then
  echo "$0: feature type is raw"
  feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"
else
  echo "$0: feature type is raw (apply-cmvn-online)"
  feats="ark,s,cs:apply-cmvn-online $cmvn_opts --spk2utt=ark:$sdata/JOB/spk2utt $srcdir/global_cmvn.stats scp:$sdata/JOB/feats.scp ark:- |"
fi

if [ ! -z "$online_ivector_dir" ]; then
  ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1;
  ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period"
fi

if [ "$post_decode_acwt" == 1.0 ]; then
  lat_wspecifier="ark:|gzip -c >$dir/lat.JOB.gz"
else
  lat_wspecifier="ark:|lattice-scale --acoustic-scale=$post_decode_acwt ark:- ark:- | gzip -c >$dir/lat.JOB.gz"
fi

frame_subsampling_opt=
if [ -f $srcdir/frame_subsampling_factor ]; then
  # e.g. for 'chain' systems
  frame_subsampling_opt="--frame-subsampling-factor=$(cat $srcdir/frame_subsampling_factor)"
elif [ -f $srcdir/init/info.txt ]; then
    frame_subsampling_factor=$(awk '/^frame_subsampling_factor/ {print $2}' <$srcdir/init/info.txt)
    if [ ! -z $frame_subsampling_factor ]; then
        frame_subsampling_opt="--frame-subsampling-factor=$frame_subsampling_factor"
    fi
fi

if [ $stage -le 1 ]; then
  $cmd $queue_opt JOB=1:$nj $dir/log/decode.JOB.log \
    nnet3-latgen-faster$thread_string $ivector_opts $frame_subsampling_opt \
     --frames-per-chunk=$frames_per_chunk \
     --extra-left-context=$extra_left_context \
     --extra-right-context=$extra_right_context \
     --extra-left-context-initial=$extra_left_context_initial \
     --extra-right-context-final=$extra_right_context_final \
     --minimize=$minimize --max-active=$max_active --min-active=$min_active --beam=$beam \
     --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \
     --word-symbol-table=$graphdir/words.txt "$model" \
     $graphdir/HCLG.fst "$feats" "$lat_wspecifier" || exit 1;
fi


if [ $stage -le 2 ]; then
  if ! $skip_diagnostics ; then
    [ ! -z $iter ] && iter_opt="--iter $iter"
    steps/diagnostic/analyze_lats.sh --cmd "$cmd" $iter_opt $graphdir $dir
  fi
fi


# The output of this script is the files "lat.*.gz"-- we'll rescore this at
# different acoustic scales to get the final output.
if [ $stage -le 3 ]; then
  if ! $skip_scoring ; then
    [ ! -x local/score.sh ] && \
      echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
    echo "score best paths"
    [ "$iter" != "final" ] && iter_opt="--iter $iter"
    local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir
    echo "score confidence and timing with sclite"
  fi
fi
echo "Decoding done."
exit 0;


================================================
FILE: egs/steps/nnet3/decode_grammar.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2015  Johns Hopkins University (Author: Daniel Povey).
# Apache 2.0.

# This is a version of ./decode.sh that allows you to decode with a GrammarFst.
# See kaldi-asr.org/doc/grammar.html for an overview of what this is about.

# Begin configuration section.
stage=1
nj=4 # number of decoding jobs.
acwt=0.1  # Just a default value, used for adaptation and beam-pruning..
post_decode_acwt=1.0  # can be used in 'chain' systems to scale acoustics by 10 so the
                      # regular scoring script works.
cmd=run.pl
beam=15.0
frames_per_chunk=50
max_active=7000
min_active=200
ivector_scale=1.0
lattice_beam=8.0 # Beam we use in lattice generation.
iter=final
scoring_opts=
skip_diagnostics=false
skip_scoring=false
extra_left_context=0
extra_right_context=0
extra_left_context_initial=-1
extra_right_context_final=-1
online_ivector_dir=
minimize=false
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. utils/parse_options.sh || exit 1;

# Exactly three positional arguments are required; otherwise print the usage
# text and exit with an error.
if [ $# -ne 3 ]; then
  echo "Usage: $0 [options] <graph-dir> <data-dir> <decode-dir>"
  # Use $0 so the example names this script rather than decode.sh.
  echo "e.g.:   $0 --nj 8 \\"
  echo "--online-ivector-dir exp/nnet2_online/ivectors_test_eval92 \\"
  # \$dir is escaped: the variable is not set at this point, so an unescaped
  # reference would print an empty string in the example.
  echo "    exp/tri4b/graph_bg data/test_eval92_hires \$dir/decode_bg_eval92"
  echo "main options (for others, see top of script file)"
  echo "  --config <config-file>                   # config containing options"
  echo "  --nj <nj>                                # number of parallel jobs"
  echo "  --cmd <cmd>                              # Command to run in parallel with"
  echo "  --beam <beam>                            # Decoding beam; default 15.0"
  echo "  --iter <iter>                            # Iteration of model to decode; default is final."
  echo "  --scoring-opts <string>                  # options to local/score.sh"
  exit 1;
fi

graphdir=$1
data=$2
dir=$3
srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory.
model=$srcdir/$iter.mdl


extra_files=
if [ ! -z "$online_ivector_dir" ]; then
  steps/nnet2/check_ivectors_compatible.sh $srcdir $online_ivector_dir || exit 1
  extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"
fi

utils/lang/check_phones_compatible.sh {$srcdir,$graphdir}/phones.txt || exit 1

for f in $graphdir/HCLG.gra $data/feats.scp $model $extra_files; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

sdata=$data/split$nj;
cmvn_opts=`cat $srcdir/cmvn_opts` || exit 1;

mkdir -p $dir/log
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs


## Set up features.
echo "$0: feature type is raw"

feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"

if [ ! -z "$online_ivector_dir" ]; then
  ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1;
  ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period"
fi

if [ "$post_decode_acwt" == 1.0 ]; then
  lat_wspecifier="ark:|gzip -c >$dir/lat.JOB.gz"
else
  lat_wspecifier="ark:|lattice-scale --acoustic-scale=$post_decode_acwt ark:- ark:- | gzip -c >$dir/lat.JOB.gz"
fi

frame_subsampling_opt=
if [ -f $srcdir/frame_subsampling_factor ]; then
  # e.g. for 'chain' systems
  frame_subsampling_opt="--frame-subsampling-factor=$(cat $srcdir/frame_subsampling_factor)"
fi

if [ $stage -le 1 ]; then
  $cmd JOB=1:$nj $dir/log/decode.JOB.log \
    nnet3-latgen-grammar $ivector_opts $frame_subsampling_opt \
     --frames-per-chunk=$frames_per_chunk \
     --extra-left-context=$extra_left_context \
     --extra-right-context=$extra_right_context \
     --extra-left-context-initial=$extra_left_context_initial \
     --extra-right-context-final=$extra_right_context_final \
     --minimize=$minimize --max-active=$max_active --min-active=$min_active --beam=$beam \
     --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \
     --word-symbol-table=$graphdir/words.txt "$model" \
     $graphdir/HCLG.gra "$feats" "$lat_wspecifier" || exit 1;
fi


if [ $stage -le 2 ]; then
  if ! $skip_diagnostics ; then
    [ ! -z $iter ] && iter_opt="--iter $iter"
    steps/diagnostic/analyze_lats.sh --cmd "$cmd" $iter_opt $graphdir $dir
  fi
fi


# The output of this script is the files "lat.*.gz"-- we'll rescore this at
# different acoustic scales to get the final output.
if [ $stage -le 3 ]; then
  if ! $skip_scoring ; then
    [ ! -x local/score.sh ] && \
      echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
    echo "score best paths"
    [ "$iter" != "final" ] && iter_opt="--iter $iter"
    local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir
    echo "score confidence and timing with sclite"
  fi
fi
echo "Decoding done."
exit 0;


================================================
FILE: egs/steps/nnet3/decode_lookahead.sh
================================================
#!/bin/bash

# Copyright 2019       Alpha Cephei Inc (Author: Nickolay Shmmyrev).
# Copyright 2012-2015  Johns Hopkins University (Author: Daniel Povey).
# Apache 2.0.

# This script does decoding with a neural-net with lookahead composition of HCL and G graphs.

# Begin configuration section.
stage=1
nj=4 # number of decoding jobs.
acwt=0.1  # Just a default value, used for adaptation and beam-pruning..
post_decode_acwt=1.0  # can be used in 'chain' systems to scale acoustics by 10 so the
                      # regular scoring script works.
cmd=run.pl
beam=15.0
frames_per_chunk=50
max_active=7000
min_active=200
ivector_scale=1.0
lattice_beam=8.0 # Beam we use in lattice generation.
iter=final
use_gpu=false # If true, will use a GPU, with nnet3-latgen-faster-batch.
              # In that case it is recommended to set num-threads to a large
              # number, e.g. 20 if you have that many free CPU slots on a GPU
              # node, and to use a small number of jobs.
scoring_opts=
skip_diagnostics=false
skip_scoring=false
extra_left_context=0
extra_right_context=0
extra_left_context_initial=-1
extra_right_context_final=-1
online_ivector_dir=
minimize=false
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. utils/parse_options.sh || exit 1;

if [ $# -ne 3 ]; then
  echo "Usage: $0 [options] <graph-dir> <data-dir> <decode-dir>"
  echo "e.g.:   $0 --nj 8 \\"
  echo "--online-ivector-dir exp/nnet2_online/ivectors_test_eval92 \\"
  echo "    exp/tri4b/graph_bg data/test_eval92_hires $dir/decode_bg_eval92"
  echo "main options (for others, see top of script file)"
  echo "  --config <config-file>                   # config containing options"
  echo "  --nj <nj>                                # number of parallel jobs"
  echo "  --cmd <cmd>                              # Command to run in parallel with"
  echo "  --beam <beam>                            # Decoding beam; default 15.0"
  echo "  --iter <iter>                            # Iteration of model to decode; default is final."
  echo "  --scoring-opts <string>                  # options to local/score.sh"
  echo "  --num-threads <n>                        # number of threads to use, default 1."
  echo "  --use-gpu <true|false>                   # default: false.  If true, we recommend"
  echo "                                           # to use large --num-threads as the graph"
  echo "                                           # search becomes the limiting factor."
  exit 1;
fi

graphdir=$1
data=$2
dir=$3
srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory.
model=$srcdir/$iter.mdl


extra_files=
if [ ! -z "$online_ivector_dir" ]; then
  steps/nnet2/check_ivectors_compatible.sh $srcdir $online_ivector_dir || exit 1
  extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"
fi

#utils/lang/check_phones_compatible.sh {$srcdir,$graphdir}/phones.txt || exit 1

for f in $graphdir/HCLr.fst $graphdir/Gr.fst $graphdir/disambig_tid.int $data/feats.scp $model $extra_files; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

sdata=$data/split$nj;
cmvn_opts=`cat $srcdir/cmvn_opts` || exit 1;
thread_string=

mkdir -p $dir/log
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs


## Set up features.
echo "$0: feature type is raw"

feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"

if [ ! -z "$online_ivector_dir" ]; then
  ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1;
  ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period"
fi

if [ "$post_decode_acwt" == 1.0 ]; then
  lat_wspecifier="ark:|gzip -c >$dir/lat.JOB.gz"
else
  lat_wspecifier="ark:|lattice-scale --acoustic-scale=$post_decode_acwt ark:- ark:- | gzip -c >$dir/lat.JOB.gz"
fi

frame_subsampling_opt=
if [ -f $srcdir/frame_subsampling_factor ]; then
  # e.g. for 'chain' systems
  frame_subsampling_opt="--frame-subsampling-factor=$(cat $srcdir/frame_subsampling_factor)"
fi

if [ $stage -le 1 ]; then
  $cmd $queue_opt JOB=1:$nj $dir/log/decode.JOB.log \
    nnet3-latgen-faster-lookahead $ivector_opts $frame_subsampling_opt \
     --frames-per-chunk=$frames_per_chunk \
     --extra-left-context=$extra_left_context \
     --extra-right-context=$extra_right_context \
     --extra-left-context-initial=$extra_left_context_initial \
     --extra-right-context-final=$extra_right_context_final \
     --minimize=$minimize --max-active=$max_active --min-active=$min_active --beam=$beam \
     --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \
     --word-symbol-table=$graphdir/words.txt "$model" \
     $graphdir/HCLr.fst $graphdir/Gr.fst $graphdir/disambig_tid.int "$feats" "$lat_wspecifier" || exit 1;
fi


if [ $stage -le 2 ]; then
  if ! $skip_diagnostics ; then
    [ ! -z $iter ] && iter_opt="--iter $iter"
    steps/diagnostic/analyze_lats.sh --cmd "$cmd" $iter_opt $graphdir $dir
  fi
fi


# The output of this script is the files "lat.*.gz"-- we'll rescore this at
# different acoustic scales to get the final output.
if [ $stage -le 3 ]; then
  if ! $skip_scoring ; then
    [ ! -x local/score.sh ] && \
      echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
    echo "score best paths"
    [ "$iter" != "final" ] && iter_opt="--iter $iter"
    local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir
    echo "score confidence and timing with sclite"
  fi
fi
echo "Decoding done."
exit 0;


================================================
FILE: egs/steps/nnet3/decode_looped.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2015  Johns Hopkins University (Author: Daniel Povey).
# Apache 2.0.


# This is like decode.sh except it uses "looped" decoding.  This is an nnet3
# mechanism for reusing previously computed activations when we evaluate the
# neural net for successive chunks of data.  It is applicable to TDNNs and LSTMs
# and similar forward-recurrent topologies, but not to backward-recurrent
# topologies like BLSTMs.  Be careful because the script itself does not have a
# way to figure out what kind of topology you are using.
#
# Also be aware that this decoding mechanism means that you have effectively
# unlimited context within the utterance.  Unless your models were trained (at
# least partly) on quite large chunk-sizes, e.g. 100 or more (although the
# longer the BLSTM recurrence the larger chunk-size you'd need in training),
# there is a possibility that this effectively infinite left-context will cause
# a mismatch with the training condition.  Also, for recurrent topologies, you may want to make sure
# that the --extra-left-context-initial matches the --egs.chunk-left-context-initial
# that you trained with.  [Note: if not specified during training, it defaults to
# the same as the regular --extra-left-context.]

# This script does decoding with a neural-net.

# Begin configuration section.
stage=1
nj=4 # number of decoding jobs.
acwt=0.1  # Just a default value, used for adaptation and beam-pruning..
post_decode_acwt=1.0  # can be used in 'chain' systems to scale acoustics by 10 so the
                      # regular scoring script works.
cmd=run.pl
beam=15.0
frames_per_chunk=50
max_active=7000
min_active=200
ivector_scale=1.0
lattice_beam=8.0 # Beam we use in lattice generation.
iter=final
scoring_opts=
skip_diagnostics=false
skip_scoring=false
extra_left_context_initial=0
online_ivector_dir=
minimize=false
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

if [ $# -ne 3 ]; then
  echo "Usage: $0 [options] <graph-dir> <data-dir> <decode-dir>"
  echo "e.g.:   steps/nnet3/decode.sh --nj 8 \\"
  echo "--online-ivector-dir exp/nnet2_online/ivectors_test_eval92 \\"
  echo "    exp/tri4b/graph_bg data/test_eval92_hires $dir/decode_bg_eval92"
  echo "main options (for others, see top of script file)"
  echo "  --config <config-file>                   # config containing options"
  echo "  --nj <nj>                                # number of parallel jobs"
  echo "  --cmd <cmd>                              # Command to run in parallel with"
  echo "  --beam <beam>                            # Decoding beam; default 15.0"
  echo "  --iter <iter>                            # Iteration of model to decode; default is final."
  echo "  --scoring-opts <string>                  # options to local/score.sh"
  exit 1;
fi

graphdir=$1
data=$2
dir=$3
srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory.
model=$srcdir/$iter.mdl


[ ! -z "$online_ivector_dir" ] && \
  extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"

for f in $graphdir/HCLG.fst $data/feats.scp $model $extra_files; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

sdata=$data/split$nj;
cmvn_opts=`cat $srcdir/cmvn_opts` || exit 1;

mkdir -p $dir/log
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs


## Set up features.
echo "$0: feature type is raw"

splice_opts=`cat $srcdir/splice_opts 2>/dev/null`

feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"

if [ ! -z "$online_ivector_dir" ]; then
  ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1;
  ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period"
fi

if [ "$post_decode_acwt" == 1.0 ]; then
  lat_wspecifier="ark:|gzip -c >$dir/lat.JOB.gz"
else
  lat_wspecifier="ark:|lattice-scale --acoustic-scale=$post_decode_acwt ark:- ark:- | gzip -c >$dir/lat.JOB.gz"
fi

frame_subsampling_opt=
if [ -f $srcdir/frame_subsampling_factor ]; then
  # e.g. for 'chain' systems
  frame_subsampling_opt="--frame-subsampling-factor=$(cat $srcdir/frame_subsampling_factor)"
fi

if [ $stage -le 1 ]; then
  $cmd JOB=1:$nj $dir/log/decode.JOB.log \
    nnet3-latgen-faster-looped $ivector_opts $frame_subsampling_opt \
     --frames-per-chunk=$frames_per_chunk \
     --extra-left-context-initial=$extra_left_context_initial \
     --minimize=$minimize --max-active=$max_active --min-active=$min_active --beam=$beam \
     --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \
     --word-symbol-table=$graphdir/words.txt "$model" \
     $graphdir/HCLG.fst "$feats" "$lat_wspecifier" || exit 1;
fi


if [ $stage -le 2 ]; then
  if ! $skip_diagnostics ; then
    [ ! -z $iter ] && iter_opt="--iter $iter"
    steps/diagnostic/analyze_lats.sh --cmd "$cmd" $iter_opt $graphdir $dir
  fi
fi


# The output of this script is the files "lat.*.gz"-- we'll rescore this at
# different acoustic scales to get the final output.
# Stage 3: score the lattices (lat.*.gz) with the recipe's local/score.sh,
# unless --skip-scoring true was given.
if [ $stage -le 3 ]; then
  if ! $skip_scoring ; then
    # A missing or non-executable scorer is treated as a fatal error.
    [ ! -x local/score.sh ] && \
      echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
    echo "score best paths"
    # A bare '[ "$iter" != "final" ]' test used to sit here; its result was
    # never used, so it has been removed.
    local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir
    echo "score confidence and timing with sclite"
  fi
fi
echo "Decoding done."
exit 0;


================================================
FILE: egs/steps/nnet3/decode_score_fusion.sh
================================================
#!/usr/bin/env bash

# Copyright 2018        Tien-Hong Lo

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# Script for system combination using output of the neural networks.
# This calls nnet3-compute, matrix-sum and latgen-faster-mapped to create a system combination.
set -euo pipefail
# begin configuration section.
cmd=run.pl

# Neural Network
stage=0
iter=final
nj=30
output_name="output"
ivector_scale=1.0
apply_exp=false  # Apply exp i.e. write likelihoods instead of log-likelihoods
compress=false    # Specifies whether the output should be compressed before
                  # dumping to disk
use_gpu=false
skip_diagnostics=false
extra_left_context=0
extra_right_context=0
extra_left_context_initial=-1
extra_right_context_final=-1
online_ivector_dir=
frame_subsampling_factor=
frames_per_chunk=150
average=true

# Decode
beam=15.0 # prune the lattices prior to MBR decoding, for speed.
max_active=7000
min_active=200
acwt=0.1  # Just a default value, used for adaptation and beam-pruning..
post_decode_acwt=1.0  # can be used in 'chain' systems to scale acoustics by 10 so the
                      # regular scoring script works.
lattice_beam=8.0 # Beam we use in lattice generation.
num_threads=1 # if >1, will use latgen-faster--map-parallel
min_lmwt=5
max_lmwt=15
parallel_opts="--num-threads 3"
scoring_opts=
minimize=false
skip_scoring=false

word_determinize=false  # If set to true, then output lattice does not retain
                        # alternate paths for a sequence of words (with alternate pronunciations).
                        # Setting to true is the default in steps/nnet3/decode.sh.
                        # However, setting this to false
                        # is useful for generating semi-supervised training
                        # supervision and frame-level confidences.
write_compact=true   # If set to false, then writes the lattice in non-compact format,
                     # retaining the acoustic scores on each arc. This is
                     # required to be false for LM rescoring undeterminized
                     # lattices (when --word-determinize is false)
#end configuration section.

[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1;


if [ $# -lt 5 ]; then
  echo "Usage: $0 [options] <data-dir> <graph-dir> <nnet3-dir> <nnet3-dir2> [<nnet3-dir3> ... ] <output-dir>"
  echo "e.g.:   steps/nnet3/decode_score_fusion.sh --nj 8 \\"
  echo "    --online-ivector-dir exp/nnet3/ivectors_test \\"
  echo "    data/test_hires exp/nnet3/tdnn/graph exp/nnet3/tdnn/output exp/nnet3/tdnn1/output .. \\"
  echo "    exp/nnet3/tdnn_comb/decode_test"
  echo "main options (for others, see top of script file)"
  echo "  --config <config-file>                   # config containing options"
  echo "  --nj <nj>                                # number of parallel jobs"
  echo "  --cmd <cmd>                              # Command to run in parallel with"
  echo "  --iter <iter>                            # Iteration of model to decode; default is final."
  exit 1;
fi

echo "$0 $@"

data=$1
graphdir=$2
dir=${@: -1}  # last argument to the script
shift 2;
model_dirs=( $@ )  # read the remaining arguments into an array
unset model_dirs[${#model_dirs[@]}-1]  # 'pop' the last argument which is odir
num_sys=${#model_dirs[@]}  # number of systems to combine

for f in $graphdir/words.txt $graphdir/phones/word_boundary.int ; do
  [ ! -f $f ] && echo "$0: file $f does not exist" && exit 1;
done

# Online i-vector setup.
# Both variables are initialised unconditionally: this script runs under
# 'set -u', so expanding them later without --online-ivector-dir would
# otherwise abort with an "unbound variable" error.
extra_files=
ivector_opts=
if [ ! -z "$online_ivector_dir" ]; then
  # Files that are checked for existence for every system further below.
  extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"
  ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1;
  ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period"
fi

# assign frame_subsampling_factor automatically if empty
if [ -z $frame_subsampling_factor ]; then
   frame_subsampling_factor=`cat ${model_dirs[0]}/frame_subsampling_factor` || exit 1;
fi

# check if standard chain system or not.
if [ $frame_subsampling_factor -eq 3 ]; then
   if [ $acwt != 1.0 ] || [ $post_decode_acwt != 10.0 ]; then
     echo -e '\n\n'
     echo "$0 WARNING: In standard chain system, acwt = 1.0, post_decode_acwt = 10.0"
     echo "$0 WARNING: Your acwt = $acwt, post_decode_acwt = $post_decode_acwt"
     echo "$0 WARNING: This is OK if you know what you are doing."
     echo -e '\n\n'
   fi
fi

frame_subsampling_opt=
if [ $frame_subsampling_factor -ne 1 ]; then
  # e.g. for 'chain' systems
  frame_subsampling_opt="--frame-subsampling-factor=$frame_subsampling_factor"
fi

# Possibly use multi-threaded decoder
thread_string=
[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads"

mkdir -p $dir/temp

# For every system to be combined: check it is compatible with the first
# system (same transitions, same frame-subsampling factor), check that its
# input files exist, and build the nnet3-compute rspecifier that matrix-sum
# reads in the decoding stage.
for i in `seq 0 $[num_sys-1]`; do
  srcdir=${model_dirs[$i]}

  model=$srcdir/$iter.mdl
  if [ ! -f $model ]; then
    echo "$0: Error: no such file $model" && exit 1;
  fi

  # check that they have the same tree
  show-transitions $graphdir/phones.txt $model > $dir/temp/transition.${i}.txt
  # diff runs as an 'if' condition so that its non-zero status on a mismatch
  # cannot abort the script (set -e with pipefail) before the message below
  # is printed.
  if ! diff -q $dir/temp/transition.0.txt $dir/temp/transition.${i}.txt >/dev/null; then
    echo "$0 tree must be the same."
    exit 1;
  fi

  # check that they have the same frame-subsampling-factor
  if [ $frame_subsampling_factor -ne `cat $srcdir/frame_subsampling_factor` ]; then
    echo "$0 frame_subsampling_factor must be the same.\\"
    echo "Default:$frame_subsampling_factor \\"
    echo "In $srcdir:`cat $srcdir/frame_subsampling_factor`"
    exit 1;
  fi

  for f in $data/feats.scp $model $extra_files; do
    [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
  done

  # When a non-default output node is requested, replace the model path by an
  # nnet3-copy pipe that renames that node to 'output'.
  if [ ! -z "$output_name" ] && [ "$output_name" != "output" ]; then
    echo "$0: Using output-name $output_name"
    model="nnet3-copy --edits='remove-output-nodes name=output;rename-node old-name=$output_name new-name=output' $model - |"
  fi

  ## Set up features.
  if [ -f $srcdir/final.mat ]; then
    echo "$0: Error: lda feature type is no longer supported." && exit 1
  fi

  sdata=$data/split$nj;
  cmvn_opts=`cat $srcdir/cmvn_opts` || exit 1;

  feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"

  if $apply_exp; then
    output_wspecifier="ark:| copy-matrix --apply-exp ark:- ark:-"
  else
    output_wspecifier="ark:| copy-feats --compress=$compress ark:- ark:-"
  fi

  gpu_opt="--use-gpu=no"
  gpu_queue_opt=

  if $use_gpu; then
    gpu_queue_opt="--gpu 1"
    gpu_opt="--use-gpu=yes"
  fi

  echo "$i $model";
  # One piped rspecifier per system; JOB is left in place for $cmd to
  # substitute when the decoding stage is launched.
  models[$i]="ark,s,cs:nnet3-compute $gpu_opt $ivector_opts $frame_subsampling_opt \
     --frames-per-chunk=$frames_per_chunk \
     --extra-left-context=$extra_left_context \
     --extra-right-context=$extra_right_context \
     --extra-left-context-initial=$extra_left_context_initial \
     --extra-right-context-final=$extra_right_context_final \
     '$model' '$feats' '$output_wspecifier' |"
done

# remove tempdir
rm -rf $dir/temp

# split data to nj
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs


# Assume the nnet trained by 
# the same tree and frame subsampling factor.
mkdir -p $dir/log

if [ -f $model ]; then
  echo "$0: $model exists, copy model to $dir/../"
  cp $model $dir/../
fi

if [ -f $srcdir/frame_shift ]; then
  cp $srcdir/frame_shift $dir/../
  echo "$0: $srcdir/frame_shift exists, copy $srcdir/frame_shift to $dir/../"
elif [ -f $srcdir/frame_subsampling_factor ]; then
  cp $srcdir/frame_subsampling_factor $dir/../
  echo "$0: $srcdir/frame_subsampling_factor exists, copy $srcdir/frame_subsampling_factor to $dir/../"
fi

lat_wspecifier="ark:|"
extra_opts=
if ! $write_compact; then
  extra_opts="--determinize-lattice=false"
  lat_wspecifier="ark:| lattice-determinize-phone-pruned --beam=$lattice_beam --acoustic-scale=$acwt --minimize=$minimize --word-determinize=$word_determinize --write-compact=false $model ark:- ark:- |"
fi

if [ "$post_decode_acwt" == 1.0 ]; then
  lat_wspecifier="$lat_wspecifier gzip -c >$dir/lat.JOB.gz"
else
  lat_wspecifier="$lat_wspecifier lattice-scale --acoustic-scale=$post_decode_acwt --write-compact=$write_compact ark:- ark:- | gzip -c >$dir/lat.JOB.gz"
fi


if [ $stage -le 0 ]; then  
  $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode.JOB.log \
     matrix-sum --average=$average "${models[@]}" ark:- \| \
     latgen-faster-mapped$thread_string --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \
     --minimize=$minimize --max-active=$max_active --min-active=$min_active --beam=$beam \
     --word-symbol-table=$graphdir/words.txt ${extra_opts} "$model" \
     $graphdir/HCLG.fst ark:- "$lat_wspecifier"
fi

if [ $stage -le 1 ]; then
  if ! $skip_diagnostics ; then
    [ ! -z $iter ] && iter_opt="--iter $iter"
    steps/diagnostic/analyze_lats.sh --cmd "$cmd" $iter_opt $graphdir $dir
  fi
fi

# Final stage: score the lattices with the recipe's local/score.sh, unless
# --skip-scoring true was given.
if ! $skip_scoring ; then
  if [ $stage -le 2 ]; then
    # A missing or non-executable scorer is treated as a fatal error.
    [ ! -x local/score.sh ] && \
    echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
    echo "score best paths"
    [ "$iter" != "final" ] && iter_opt="--iter $iter"
    # Prepend --min_lmwt rather than overwriting scoring_opts, so that options
    # given through --scoring-opts are no longer silently discarded.
    scoring_opts="--min_lmwt $min_lmwt $scoring_opts"
    local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir
    echo "score confidence and timing with sclite"
  fi
fi


exit 0


================================================
FILE: egs/steps/nnet3/decode_semisup.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2015  Johns Hopkins University (Author: Daniel Povey).
# Apache 2.0.

# This script does decoding with a neural-net.

# Begin configuration section.
stage=1
nj=4 # number of decoding jobs.
acwt=0.1  # Just a default value, used for adaptation and beam-pruning..
post_decode_acwt=1.0  # can be used in 'chain' systems to scale acoustics by 10 so the
                      # regular scoring script works.
cmd=run.pl
beam=15.0
frames_per_chunk=50
max_active=7000
min_active=200
ivector_scale=1.0
lattice_beam=8.0 # Beam we use in lattice generation.
iter=final
num_threads=1 # if >1, will use gmm-latgen-faster-parallel
scoring_opts=
skip_diagnostics=false
skip_scoring=false
extra_left_context=0
extra_right_context=0
extra_left_context_initial=-1
extra_right_context_final=-1
online_ivector_dir=
minimize=false
word_determinize=false  # If set to true, then output lattice does not retain
                        # alternate paths for a sequence of words (with alternate pronunciations).
                        # Setting to true is the default in steps/nnet3/decode.sh.
                        # However, setting this to false
                        # is useful for generating semi-supervised training
                        # supervision and frame-level confidences.
write_compact=true   # If set to false, then writes the lattice in non-compact format,
                     # retaining the acoustic scores on each arc. This is
                     # required to be false for LM rescoring undeterminized
                     # lattices (when --word-determinize is false)
                     # Useful for semi-supervised training with rescored lattices.
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. utils/parse_options.sh || exit 1;

# Exactly three positional arguments are required; otherwise print the usage
# message and exit with an error.
# The message names this script ($0) and lists only options that exist in the
# configuration section above; the previous text advertised --transform-dir,
# which this script never defines or reads.
if [ $# -ne 3 ]; then
  echo "Usage: $0 [options] <graph-dir> <data-dir> <decode-dir>"
  echo "e.g.:   $0 --nj 8 \\"
  echo "--online-ivector-dir exp/nnet2_online/ivectors_test_eval92 \\"
  # \$dir is escaped so the example shows a literal placeholder; the variable
  # itself is not set yet at this point in the script.
  echo "    exp/tri4b/graph_bg data/test_eval92_hires \$dir/decode_bg_eval92"
  echo "main options (for others, see top of script file)"
  echo "  --config <config-file>                   # config containing options"
  echo "  --nj <nj>                                # number of parallel jobs"
  echo "  --cmd <cmd>                              # Command to run in parallel with"
  echo "  --beam <beam>                            # Decoding beam; default 15.0"
  echo "  --iter <iter>                            # Iteration of model to decode; default is final."
  echo "  --scoring-opts <string>                  # options to local/score.sh"
  echo "  --num-threads <n>                        # number of threads to use, default 1."
  exit 1;
fi

graphdir=$1
data=$2
dir=$3
srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory.
model=$srcdir/$iter.mdl


extra_files=
if [ ! -z "$online_ivector_dir" ]; then
  steps/nnet2/check_ivectors_compatible.sh $srcdir $online_ivector_dir || exit 1
  extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"
fi

utils/lang/check_phones_compatible.sh {$srcdir,$graphdir}/phones.txt || exit 1

for f in $graphdir/HCLG.fst $data/feats.scp $model $extra_files; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

sdata=$data/split$nj;
cmvn_opts=`cat $srcdir/cmvn_opts` || exit 1;
thread_string=
[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads"

mkdir -p $dir/log
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs


## Set up features.
echo "$0: feature type is raw"

feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"

if [ ! -z "$online_ivector_dir" ]; then
  ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1;
  ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period"
fi

extra_opts=
lat_wspecifier="ark:|"
if ! $write_compact; then
  extra_opts="--determinize-lattice=false"
  lat_wspecifier="ark:| lattice-determinize-phone-pruned --beam=$lattice_beam --acoustic-scale=$acwt --minimize=$minimize --word-determinize=$word_determinize --write-compact=false $model ark:- ark:- |"
fi

if [ "$post_decode_acwt" == 1.0 ]; then
  lat_wspecifier="$lat_wspecifier gzip -c >$dir/lat.JOB.gz"
else
  lat_wspecifier="$lat_wspecifier lattice-scale --acoustic-scale=$post_decode_acwt --write-compact=$write_compact ark:- ark:- | gzip -c >$dir/lat.JOB.gz"
fi

frame_subsampling_opt=
if [ -f $srcdir/frame_subsampling_factor ]; then
  # e.g. for 'chain' systems
  frame_subsampling_opt="--frame-subsampling-factor=$(cat $srcdir/frame_subsampling_factor)"
fi

# Copy the model as it is required when generating egs
cp $model $dir/  || exit 1

if [ $stage -le 1 ]; then
  $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode.JOB.log \
    nnet3-latgen-faster$thread_string $ivector_opts $frame_subsampling_opt \
     --frames-per-chunk=$frames_per_chunk \
     --extra-left-context=$extra_left_context \
     --extra-right-context=$extra_right_context \
     --extra-left-context-initial=$extra_left_context_initial \
     --extra-right-context-final=$extra_right_context_final \
     --minimize=$minimize --word-determinize=$word_determinize \
     --max-active=$max_active --min-active=$min_active --beam=$beam \
     --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \
     --word-symbol-table=$graphdir/words.txt ${extra_opts} "$model" \
     $graphdir/HCLG.fst "$feats" "$lat_wspecifier" || exit 1;
fi

if [ $stage -le 2 ]; then
  if ! $skip_diagnostics ; then
    [ ! -z $iter ] && iter_opt="--iter $iter"
    steps/diagnostic/analyze_lats.sh --cmd "$cmd" $iter_opt $graphdir $dir
  fi
fi


# The output of this script is the files "lat.*.gz"-- we'll rescore this at
# different acoustic scales to get the final output.
if [ $stage -le 3 ]; then
  if ! $skip_scoring ; then
    [ ! -x local/score.sh ] && \
      echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
    echo "score best paths"
    [ "$iter" != "final" ] && iter_opt="--iter $iter"
    local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir
    echo "score confidence and timing with sclite"
  fi
fi
echo "Decoding done."
exit 0;


================================================
FILE: egs/steps/nnet3/dot/descriptor_parser.py
================================================
#!/usr/bin/env python

# we're using python 3.x style print but want it to work in python 2.x,
from __future__ import print_function
import pprint
import re
import sys

start_identifier = "("
end_identifier = ")"

def ParseSubsegmentsAndArguments(segment_endpoints, sub_segments, arguments, input_string):
    """Name the sub-segments of one segment and collect its plain arguments.

    segment_endpoints: two-element sequence [first, last] of inclusive indices
        into input_string delimiting the text of the segment.
    sub_segments: list of dicts, each with an 'endpoints' entry giving the
        indices of the opening and closing identifier of a nested segment.
        Each dict gets its 'name' entry set in place.
    arguments: list to which the plain (non-function) argument names are
        appended in place when sub_segments is non-empty.
    input_string: the full descriptor string.

    Returns the pair (sub_segments, arguments).  When sub_segments is empty,
    a fresh empty list is returned together with a new list holding the
    whitespace-separated tokens of the segment text with commas removed.
    """
    first, last = segment_endpoints[0], segment_endpoints[1]

    # Leaf segment: nothing nested, so every token is a plain argument.
    if len(sub_segments) == 0:
        leaf_text = input_string[first:last + 1]
        return [], [re.sub(',', '', token.strip()) for token in leaf_text.split()]

    # Stitch together the text outside the nested segments.  The opening
    # identifier of each nested segment is kept, so that a function name
    # shows up as a token ending in "(".
    pieces = []
    cursor = first
    for child in sub_segments:
        child_span = child['endpoints']
        pieces.append(input_string[cursor:child_span[0] + 1])
        cursor = child_span[1] + 1
    pieces.append(input_string[cursor:last + 1])

    next_child = 0
    for token in ''.join(pieces).split(','):
        token = token.strip()
        if token[-1] != "(":
            arguments.append(token)
            continue
        # this token is the name of the next nested function
        sub_segments[next_child]['name'] = token[:-1]
        next_child += 1
    return sub_segments, arguments

def IdentifyNestedSegments(input_string):
    """Break a descriptor string into its nested, parenthesised segments.

    The string is scanned once.  Every closing identifier completes a
    segment, represented as a dict with the keys 'name', 'endpoints' (indices
    of its opening and closing identifier), 'sub_segments' and 'arguments'.
    Names are filled in by ParseSubsegmentsAndArguments when the enclosing
    segment (or the top level) is processed.

    Returns the two-element list [segments, arguments] for the top level.

    Raises Exception if plain arguments appear next to a top-level segment,
    or if there is more than one top-level segment.  A closing identifier
    without a matching opening one makes the stack pop fail with IndexError.
    """
    open_positions = []   # stack of indices of not-yet-closed start identifiers
    completed = []        # stack of finished segments not yet claimed by a parent
    for position, character in enumerate(input_string):
        if character == start_identifier:
            open_positions.append(position)
        if character == end_identifier:
            # new segment has been found
            span = [open_positions.pop(), position]
            # Its children are the most recently completed segments lying
            # strictly inside it; they sit on top of the stack with the last
            # child uppermost, so each one is put at the front of the list.
            children = []
            while len(completed) > 0:
                candidate_span = completed[-1]['endpoints']
                if not (candidate_span[0] > span[0] and candidate_span[1] < span[1]):
                    break
                children.insert(0, completed.pop())

            children, inner_arguments = ParseSubsegmentsAndArguments(
                [span[0] + 1, span[1] - 1], children, [], input_string)
            completed.append({
                'name': '',
                'endpoints': span,
                'sub_segments': children,
                'arguments': inner_arguments
            })

    # Whatever is left on the stack belongs to the top level of the string.
    top_segments, top_arguments = ParseSubsegmentsAndArguments(
        [0, len(input_string)], completed, [], input_string)
    if top_arguments and top_segments:
        raise Exception('Arguments not expected outside top level braces : {0}'.format(input_string))
    if len(top_segments) > 1:
        raise Exception('only one parent segment expected : {0}'.format(input_string))

    return [top_segments, top_arguments]

if __name__ == "__main__":
    # Small demonstration: parse a nested descriptor and a bare name, and
    # pretty-print what IdentifyNestedSegments returns for each.
    demo_descriptors = (
        "Append(Offset-2(input, -2), Offset-1(input, -1), input, Offset+1(input, 1), Offset+2(input, 2), ReplaceIndex(ivector, t, 0))",
        "Wx",
    )
    for descriptor in demo_descriptors:
        pprint.pprint(IdentifyNestedSegments(descriptor))


================================================
FILE: egs/steps/nnet3/dot/nnet3_to_dot.py
================================================
#!/usr/bin/env python

# Copyright      2015  Johns Hopkins University (Author: Vijayaditya Peddinti)
# Apache 2.0

# script to convert nnet3-am-info output to a dot graph


# we're using python 3.x style print but want it to work in python 2.x,
from __future__ import print_function
import re
import os
import argparse
import sys
import math
import warnings
import descriptor_parser
import pprint

# dot attributes per nnet3 node/component type.  The two network-boundary
# node types only set a shape; every listed component type is drawn as a
# filled shape in its own colour.
node_attributes = {
    'input-node': {'shape': 'oval'},
    'output-node': {'shape': 'oval'},
}
node_attributes.update(
    (component_type, {'color': fill_color, 'shape': outline, 'style': 'filled'})
    for component_type, fill_color, outline in (
        ('NaturalGradientAffineComponent', 'lightgrey', 'box'),
        ('NaturalGradientPerElementScaleComponent', 'lightpink', 'box'),
        ('ConvolutionComponent', 'lightpink', 'box'),
        ('FixedScaleComponent', 'blueviolet', 'box'),
        ('FixedAffineComponent', 'darkolivegreen1', 'box'),
        ('SigmoidComponent', 'bisque', 'rectangle'),
        ('TanhComponent', 'bisque', 'rectangle'),
        ('NormalizeComponent', 'aquamarine', 'rectangle'),
        ('RectifiedLinearComponent', 'bisque', 'rectangle'),
        ('ClipGradientComponent', 'bisque', 'rectangle'),
        ('ElementwiseProductComponent', 'green', 'rectangle'),
        ('LogSoftmaxComponent', 'cyan', 'rectangle'),
    )
)

def GetDotNodeName(name_string, is_component = False):
    """Return the dot label and a dot-compatible identifier for an nnet3 name.

    This function is required as dot does not allow all the component names
    allowed by nnet3.  Identified incompatibilities:
      1. dot does not allow hyphen(-) and dot(.) in names
      2. Nnet3 names can be shared among components and component nodes;
         dot does not allow common names

    name_string: the nnet3 name; it is returned unchanged as the label.
    is_component: if True, a suffix is added to the identifier so that it
        differs from the identifier built for the same name without it.

    Returns a dict with the keys "label" and "node".
    """
    # Plain literal replacement: no regular expression is needed, and this
    # avoids the invalid "\." escape sequence of the earlier pattern string.
    node_name_string = name_string.replace("-", "hyphen").replace(".", "_dot_")
    if is_component:
        # NOTE(review): '+=' repeats the name before "_component" (e.g.
        # "aa_component" for "a"); this looks unintended, but is kept so that
        # existing identifiers do not change -- confirm before simplifying.
        node_name_string += node_name_string.strip() + "_component"
    return {"label":name_string, "node":node_name_string}

def ProcessAppendDescriptor(segment, parent_node_name, affix, edge_attributes = None):
    """Emit the dot statements for an Append(...) descriptor.

    The descriptor is drawn as one Mrecord node named 'Append_<affix>' with
    one port per input.  Nested descriptors and plain arguments are wired to
    their ports, and the record is then connected to parent_node_name.

    segment: dict with 'sub_segments' (nested descriptor dicts, each with a
        'name') and 'arguments' (names of the nodes referenced directly).
    parent_node_name: name of the node that consumes the appended input.
    affix: string used to build the name of the record node.
    edge_attributes: optional dict; its 'label' and 'style' entries are put
        on the edge from the record to the parent.

    Returns the list of dot statements.
    """
    dot_graph = []
    names = []
    desc_name = 'Append_{0}'.format(affix)
    desc_node = GetDotNodeName(desc_name)['node']

    sub_segments = segment['sub_segments']
    for i, sub_segment in enumerate(sub_segments):
        part_name = "{0}{1}{2}".format(desc_name, sub_segment['name'], i)
        names.append("<{0}> part {1}".format(GetDotNodeName(part_name)['node'], i))
        # The affix carries the part index (as ProcessSumDescriptor does) so
        # that two nested descriptors of the same kind get distinct node
        # names instead of being merged into a single dot node.
        dot_graph += DescriptorSegmentToDot(sub_segment,
                                            "{0}:{1}".format(desc_name, part_name),
                                            "{0}_{1}".format(desc_name, i))

    # ports for the direct arguments are numbered after the nested ones
    part_index = len(sub_segments)
    for i, argument in enumerate(segment['arguments']):
        part_name = "{0}{1}{2}".format(desc_name, argument, part_index + i)
        part_node = GetDotNodeName(part_name)['node']
        names.append("<{0}> part {1}".format(part_node, part_index + i))
        dot_graph.append("{0} -> {1}:{2}".format(GetDotNodeName(argument)['node'], desc_node, part_node))

    label = "|".join(names)
    label = "{{"+label+"}|Append}"
    dot_graph.append('{0} [shape=Mrecord, label="{1}"];'.format(desc_node, label))

    attr_string = ''
    if edge_attributes is not None:
        if 'label' in edge_attributes:
            attr_string += " label={0} ".format(edge_attributes['label'])
        if 'style' in edge_attributes:
            attr_string += ' style={0} '.format(edge_attributes['style'])

    dot_string = '{0} -> {1} [tailport=s]'.format(desc_node, GetDotNodeName(parent_node_name)['node'])

    # extra attributes go into a second attribute list on the same edge
    if attr_string != '':
        dot_string += ' [{0}] '.format(attr_string)
    dot_graph.append(dot_string)

    return dot_graph

def ProcessRoundDescriptor(segment, parent_node_name, affix, edge_attributes = None):
    """Emit the dot edge for a Round(<input>, <value>) descriptor.

    The descriptor is drawn as a single labelled edge from the input node
    (first argument) to parent_node_name; the second argument is shown in
    the edge label.

    segment: dict with 'arguments' and 'sub_segments'.
    affix: unused; accepted so that all Process*Descriptor functions share
        one signature.
    edge_attributes: optional dict; its 'label' is prepended to the edge
        label and its 'style' is added as an edge attribute.

    Raises Exception if the descriptor contains nested descriptors.
    Returns the list of dot statements.
    """
    # Validate before touching the arguments, so that an unsupported nested
    # descriptor is reported with this message rather than with an incidental
    # IndexError from the argument lookups below.
    if segment['sub_segments']:
        raise Exception("Round can just deal with forwarding descriptor, no sub-segments allowed")

    label = 'Round ({0})'.format(segment['arguments'][1])
    style = None
    if edge_attributes is not None:
        if 'label' in edge_attributes:
            label = "{0} {1}".format(edge_attributes['label'], label)
        if 'style' in edge_attributes:
            style = 'style={0}'.format(edge_attributes['style'])

    attr_string = 'label="{0}"'.format(label)
    if style is not None:
        attr_string += ' {0}'.format(style)

    return ['{0}->{1} [ {2} ]'.format(GetDotNodeName(segment['arguments'][0])['node'],
                                      GetDotNodeName(parent_node_name)['node'],
                                      attr_string)]


def ProcessOffsetDescriptor(segment, parent_node_name, affix, edge_attributes = None):
    """Emit the dot edge for an Offset(<input>, <offset>) descriptor.

    The descriptor is drawn as a single labelled edge from the input node
    (first argument) to parent_node_name; the second argument is shown in
    the edge label.

    segment: dict with 'arguments' and 'sub_segments'.
    affix: unused; accepted so that all Process*Descriptor functions share
        one signature.
    edge_attributes: optional dict; its 'label' is prepended to the edge
        label and its 'style' is added as an edge attribute.

    Raises Exception if the descriptor contains nested descriptors.
    Returns the list of dot statements.
    """
    # Validate before touching the arguments, so that an unsupported nested
    # descriptor is reported with this message rather than with an incidental
    # IndexError from the argument lookups below.
    if segment['sub_segments']:
        raise Exception("Offset can just deal with forwarding descriptor, no sub-segments allowed")

    label = 'Offset ({0})'.format(segment['arguments'][1])
    style = None
    if edge_attributes is not None:
        if 'label' in edge_attributes:
            label = "{0} {1}".format(edge_attributes['label'], label)
        if 'style' in edge_attributes:
            style = 'style={0}'.format(edge_attributes['style'])

    attr_string = 'label="{0}"'.format(label)
    if style is not None:
        attr_string += ' {0}'.format(style)

    return ['{0}->{1} [ {2} ]'.format(GetDotNodeName(segment['arguments'][0])['node'],
                                      GetDotNodeName(parent_node_name)['node'],
                                      attr_string)]

def ProcessSumDescriptor(segment, parent_node_name, affix, edge_attributes = None):
    """Emit the dot statements for a Sum(...) descriptor.

    A red Mrecord node named 'Sum_<affix>' is created with one port per
    input.  Nested descriptors are rendered recursively towards their port,
    plain arguments get a direct edge to their port, and the record is
    finally linked to parent_node_name.

    edge_attributes: optional dict; its 'label' and 'style' entries are put
        on the edge from the record to the parent.
    Returns the list of dot statements.
    """
    sum_name = 'Sum_{0}'.format(affix)
    sum_node = GetDotNodeName(sum_name)['node']
    nested = segment['sub_segments']

    statements = []
    ports = []

    # one port per nested descriptor, rendered recursively
    for index, child in enumerate(nested):
        port_name = "{0}{1}{2}".format(sum_name, child['name'], index)
        ports.append("<{0}> part {1}".format(GetDotNodeName(port_name)['node'], index))
        statements.extend(DescriptorSegmentToDot(child,
                                                 "{0}:{1}".format(sum_name, port_name),
                                                 "{0}_{1}".format(sum_name, index)))

    # one port per direct argument, numbered after the nested descriptors
    for index, source in enumerate(segment['arguments'], len(nested)):
        port_node = GetDotNodeName("{0}{1}{2}".format(sum_name, source, index))['node']
        ports.append("<{0}> part {1}".format(port_node, index))
        statements.append("{0} -> {1}:{2}".format(GetDotNodeName(source)['node'], sum_node, port_node))

    record_label = '{{' + "|".join(ports) + '}|Sum}'
    statements.append('{0} [shape=Mrecord, label="{1}", color=red];'.format(sum_node, record_label))

    # attributes inherited from the caller (e.g. an enclosing IfDefined)
    inherited = ''
    if edge_attributes is not None:
        for key in ('label', 'style'):
            if key in edge_attributes:
                inherited += ' {0}={1} '.format(key, edge_attributes[key])

    statements.append('{0} -> {1} [{2} tailport=s ] '.format(sum_node,
                                                             GetDotNodeName(parent_node_name)['node'],
                                                             inherited))
    return statements

def ProcessReplaceIndexDescriptor(segment, parent_node_name, affix, edge_attributes = None):
    """Emit the dot edge for a ReplaceIndex(<input>, <a>, <b>) descriptor.

    The descriptor is drawn as a single labelled edge from the input node
    (first argument) to parent_node_name; the second and third arguments are
    shown in the edge label.

    segment: dict with 'arguments' and 'sub_segments'.
    affix: unused; accepted so that all Process*Descriptor functions share
        one signature.
    edge_attributes: optional dict; its 'label' is prepended to the edge
        label and its 'style' is added as an edge attribute.

    Raises Exception if the descriptor contains nested descriptors.
    Returns the list of dot statements.
    """
    # Validate before touching the arguments, so that an unsupported nested
    # descriptor is reported with this message rather than with an incidental
    # IndexError from the argument lookups below.
    if segment['sub_segments']:
        raise Exception("ReplaceIndex can just deal with forwarding descriptor, no sub-segments allowed")

    label = 'ReplaceIndex({0}, {1})'.format(segment['arguments'][1], segment['arguments'][2])
    style = None
    if edge_attributes is not None:
        if 'label' in edge_attributes:
            label = "{0} {1}".format(edge_attributes['label'], label)
        if 'style' in edge_attributes:
            style = 'style={0}'.format(edge_attributes['style'])

    attr_string = 'label="{0}"'.format(label)
    if style is not None:
        attr_string += ' {0}'.format(style)

    return ['{0}->{1} [{2}]'.format(GetDotNodeName(segment['arguments'][0])['node'],
                                    GetDotNodeName(parent_node_name)['node'],
                                    attr_string)]

def ProcessIfDefinedDescriptor(segment, parent_node_name, affix, edge_attributes = None):
    """Emit the dot statements for an IfDefined(...) descriptor.

    IfDefined is not drawn as a node: it only decorates the edge(s) reaching
    parent_node_name with a dotted style and an "IfDefined" label.  Because
    it communicates through edge_attributes itself, it refuses to be called
    with edge_attributes already set.

    Returns the list of dot statements (starting with a '#' marker line).
    """
    if edge_attributes is not None:
        raise Exception("edge_attributes was not None, this means an IfDefined descriptor was calling the current IfDefined descriptor. This is not allowed")

    statements = ['#ProcessIfDefinedDescriptor']

    # only the first nested descriptor is rendered; it inherits the decoration
    nested = segment['sub_segments']
    if nested:
        statements.extend(DescriptorSegmentToDot(nested[0], parent_node_name, parent_node_name,
                                                 edge_attributes={'style':'dotted', 'label':'IfDefined'}))

    # a plain node argument becomes a decorated edge directly
    direct = segment['arguments']
    if direct:
        source_node = GetDotNodeName(direct[0])['node']
        target_node = GetDotNodeName(parent_node_name)['node']
        statements.append('{0} -> {1} [style=dotted, label="IfDefined"]'.format(source_node, target_node))

    return statements

def DescriptorSegmentToDot(segment, parent_node_name, affix, edge_attributes = None):
    """Dispatch a descriptor segment to the matching Process*Descriptor.

    segment is a dictionary which corresponds to a descriptor; its 'name'
    selects the handler.  "Scale" is accepted but produces no dot statements.
    Any other unknown name raises an Exception.

    Returns the list of dot statements produced by the handler.
    """
    kind = segment['name']
    dot_graph = []

    if kind == "Scale":
        # recognized, but nothing is drawn for it
        return dot_graph

    handlers = {
        "Append": ProcessAppendDescriptor,
        "Offset": ProcessOffsetDescriptor,
        "Sum": ProcessSumDescriptor,
        "IfDefined": ProcessIfDefinedDescriptor,
        "ReplaceIndex": ProcessReplaceIndexDescriptor,
        "Round": ProcessRoundDescriptor,
    }
    if kind not in handlers:
        raise Exception('Descriptor {0}, is not recognized by this script. Please add Process{0}Descriptor method'.format(segment['name']))

    dot_graph.extend(handlers[kind](segment, parent_node_name, affix, edge_attributes))
    return dot_graph

def Nnet3DescriptorToDot(descriptor, parent_node_name):
    """Turn the input descriptor of a node into dot statements.

    The descriptor string is split by descriptor_parser.IdentifyNestedSegments
    into nested segments and plain arguments.  Nested segments are rendered
    through DescriptorSegmentToDot; otherwise a single plain argument becomes
    a direct edge to parent_node_name.
    """
    segments, arguments = descriptor_parser.IdentifyNestedSegments(descriptor)

    statements = []
    if segments:
        for segment in segments:
            statements.extend(DescriptorSegmentToDot(segment, parent_node_name, parent_node_name))
        return statements

    if arguments:
        # without nesting, the descriptor must be exactly one node name
        assert(len(arguments) == 1)
        source_node = GetDotNodeName(arguments[0])['node']
        target_node = GetDotNodeName(parent_node_name)['node']
        statements.append("{0} -> {1}".format(source_node, target_node))
    return statements

def ParseNnet3String(string):
    """Parse one nnet3 config/info line into its type and key/value fields.

    Lines that do not start with one of the known config types yield
    [None, None].  Otherwise the result is [config_type, fields], where fields
    maps each 'key' of the 'key=value' tokens to its value.  Tokens without
    '=' are continuations of the preceding value (values may contain spaces,
    e.g. 'input=Offset(a, 5)'); a single trailing comma is removed from each
    value.

    Raises Exception('Malformed config line ...') when a token contains more
    than one '=' or when keys and values do not pair up.
    """
    if re.search('^input-node|^component|^output-node|^component-node|^dim-range-node', string.strip()) is None:
        return [None, None]

    tokens = string.split()
    config_type = tokens[0]

    # flat becomes [key, value, key, value, ...]; pending is the value that
    # is still being accumulated
    flat = []
    pending = ''
    for token in tokens[1:]:
        if re.search('=', token) is None:
            pending += ' ' + token
            continue
        if pending.strip() != '':
            flat.append(pending)
        pieces = token.split('=')
        if len(pieces) != 2:
            raise Exception('Malformed config line {0}'.format(string))
        flat.append(pieces[0])
        pending = pieces[1]
    flat.append(pending)

    # consume (value, key) pairs from the end of the flat list
    parsed_string = {}
    try:
        while flat:
            raw_value = flat.pop().strip()
            raw_key = flat.pop()
            parsed_string[raw_key.strip()] = re.sub(',$', '', raw_value).strip()
    except IndexError:
        raise Exception('Malformed config line {0}'.format(string))
    return [config_type, parsed_string]

# sample component config line
# component name=L0_lda type=FixedAffineComponent, input-dim=300, output-dim=300, linear-params-stddev=0.00992724, bias-params-stddev=0.573973
def Nnet3ComponentToDot(component_config, component_attributes = None):
    """Return the dot statement (in a one-element list) for a component.

    component_config: dict of the fields parsed from a 'component' line; it
        must contain 'name'.
    component_attributes: optional iterable of the field names to show in the
        node label.  Fields absent from component_config are ignored and
        duplicates are shown once.  When None, every field is shown.

    The label lists the fields in the order given by component_attributes
    (or in component_config's own order when it is None), so the output does
    not depend on set/hash ordering.  Node styling comes from
    node_attributes, keyed by the component 'type'; components without a
    known type are left unstyled.
    """
    if component_attributes is None:
        component_attributes = component_config.keys()

    label_parts = []
    already_printed = set()
    for key in component_attributes:
        if key in component_config and key not in already_printed:
            already_printed.add(key)
            label_parts.append('{0} = {1}\\n'.format(key, component_config[key]))
    label = ''.join(label_parts)

    # a missing 'type' or an unknown component type simply means no styling
    attributes = node_attributes.get(component_config.get('type'), {})
    attr_string = ''.join(' {0}={1} '.format(key, attributes[key]) for key in attributes)

    return ['{0} [label="{1}" {2}]'.format(GetDotNodeName(component_config['name'], is_component = True)['node'], label, attr_string)]


# input-node name=input dim=40
def Nnet3InputToDot(parsed_config):
    """Return the dot statement (in a one-element list) for an input-node.

    parsed_config must provide 'name' and 'dim'; both are shown in the label.
    """
    node_id = GetDotNodeName(parsed_config['name'])['node']
    statement = '{0} [ label="{1}\\ndim={2}"]'.format(node_id,
                                                     parsed_config['name'],
                                                     parsed_config['dim'])
    return [statement]

# output-node name=output input=Final_log_softmax dim=3940 objective=linear
#output-node name=output input=Offset(Final_log_softmax, 5) dim=3940 objective=linear
def Nnet3OutputToDot(parsed_config):
    """Return the dot statements for an output-node.

    The edges described by the 'input' descriptor come first, followed by the
    node declaration labelled with the node 'name' and its 'objective'.
    """
    output_name = parsed_config['name']
    statements = list(Nnet3DescriptorToDot(parsed_config['input'], output_name))
    node_id = GetDotNodeName(output_name)['node']
    statements.append('{0} [ label="{1}\\nobjective={2}"]'.format(node_id,
                                                                 output_name,
                                                                 parsed_config['objective']))
    return statements

# dim-range-node name=Lstm1_r_t input-node=Lstm1_rp_t dim-offset=0 dim=256
def Nnet3DimrangeToDot(parsed_config):
    """Return the dot statements for a dim-range-node.

    The node is declared as a rectangle and receives one edge from its
    'input-node'; the edge's tail label records the selected range as
    dimrange(<dim-offset>, <dim>).
    """
    target = GetDotNodeName(parsed_config['name'])
    declaration = '{0} [shape=rectangle, label="{1}"]'.format(target['node'], target['label'])

    source_node = GetDotNodeName(parsed_config['input-node'])['node']
    edge = '{0} -> {1} [taillabel="dimrange({2}, {3})"]'.format(source_node,
                                                               target['node'],
                                                               parsed_config['dim-offset'],
                                                               parsed_config['dim'])
    return [declaration, edge]

def Nnet3ComponentNodeToDot(parsed_config):
    """Return the dot statements for a component-node.

    In order: the edges described by the 'input' descriptor, the box-shaped
    node declaration, and a heavily weighted edge from the node's component
    to the node itself.
    """
    node_name = parsed_config['name']
    statements = list(Nnet3DescriptorToDot(parsed_config['input'], node_name))

    node = GetDotNodeName(node_name)
    statements.append('{0} [ label="{1}", shape=box ]'.format(node['node'], node['label']))

    component_id = GetDotNodeName(parsed_config['component'], is_component = True)['node']
    statements.append('{0} -> {1} [ weight=10 ]'.format(component_id, node['node']))
    return statements

def GroupConfigs(configs, node_prefixes = None):
    """Partition parsed configs into groups by node-name prefix.

    We make the assumption that nodes belonging to the same sub-graph have a
    common prefix.

    configs: list of [config_type, parsed_config] pairs; parsed_config must
        contain 'name'.
    node_prefixes: optional list of prefixes, tried in order.  A config goes
        to the first prefix its name starts with.

    Returns a dict mapping each prefix to its (possibly empty) list of
    configs, plus the key None for the configs that matched no prefix.
    """
    if node_prefixes is None:
        node_prefixes = []

    grouped_configs = {}
    remaining = configs
    for node_prefix in node_prefixes:
        group = []
        rest = []
        for config in remaining:
            # Literal prefix comparison: building a regex from the prefix
            # would let characters such as '.' match anything, e.g. the
            # prefix 'tdnn1.' would also capture 'tdnn10.affine'.
            if config[1]['name'].startswith(node_prefix):
                group.append(config)
            else:
                rest.append(config)
        remaining = rest
        grouped_configs[node_prefix] = group
    grouped_configs[None] = remaining

    return grouped_configs

def ParseConfigLines(lines, node_prefixes = None, component_attributes = None ):
    """Convert nnet3 config/info lines into the lines of a dot digraph.

    lines: iterable of text lines; lines ParseNnet3String does not recognize
        are skipped.
    node_prefixes: optional list of name prefixes; the configs sharing a
        prefix are drawn inside one 'cluster' subgraph labelled with it.
    component_attributes: optional list of component fields to show in the
        component labels (passed on to Nnet3ComponentToDot).

    Returns the list of dot lines, from 'digraph nnet3graph {' to the
    closing '}'.
    """
    if node_prefixes is None:
        node_prefixes = []

    configs = []
    for line in lines:
        config_type, parsed_config = ParseNnet3String(line)
        if config_type is not None:
            configs.append([config_type, parsed_config])

    dot_graph = ['digraph nnet3graph {']

    # process the config lines, one group (sub-graph) at a time
    grouped_configs = GroupConfigs(configs, node_prefixes)
    for group, group_configs in grouped_configs.items():
        if not group_configs:
            continue
        if group is not None:
            # subgraphs prefixed with cluster will be treated differently by
            # dot.  The prefix goes through GetDotNodeName because dot does
            # not accept '-' or '.' in an unquoted identifier; the untouched
            # prefix is still used for the (quoted) label below.
            dot_graph.append('subgraph cluster_{0} '.format(GetDotNodeName(group)['node']) + "{")
            dot_graph.append('color=blue')

        for config_type, parsed_config in group_configs:
            if config_type == 'input-node':
                dot_graph += Nnet3InputToDot(parsed_config)
            elif config_type == 'output-node':
                dot_graph += Nnet3OutputToDot(parsed_config)
            elif config_type == 'component-node':
                dot_graph += Nnet3ComponentNodeToDot(parsed_config)
            elif config_type == 'dim-range-node':
                dot_graph += Nnet3DimrangeToDot(parsed_config)
            elif config_type == 'component':
                dot_graph += Nnet3ComponentToDot(parsed_config, component_attributes)

        if group is not None:
            dot_graph.append('label = "{0}"'.format(group))
            dot_graph.append('}')

    dot_graph.append('}')

    return dot_graph

if __name__ == "__main__":
    # Reads the nnet3 description from stdin and writes the dot graph to the
    # file named on the command line.
    parser = argparse.ArgumentParser(description="Converts the output of nnet3-am-info "
                                                 "to dot graph. The output has to be compiled"
                                                 " with dot to generate a displayable graph",
                                    epilog="See steps/nnet3/nnet3_to_dot.sh for example.")
    parser.add_argument("--component-attributes", type=str,
                        help="Attributes of the components which should be displayed in the dot-graph "
                             "e.g. --component-attributes name,type,input-dim,output-dim", default=None)
    parser.add_argument("--node-prefixes", type=str,
                        help="list of prefixes. Nnet3 components/component-nodes with the same prefix"
                        " will be clustered together in the dot-graph"
                        " --node-prefixes Lstm1,Lstm2,Layer1", default=None)

    parser.add_argument("dotfile", help="name of the dot output file")

    # echo the command line for logging purposes
    print(' '.join(sys.argv), file=sys.stderr)

    args = parser.parse_args()
    component_attributes = None
    if args.component_attributes is not None:
        component_attributes = args.component_attributes.split(',')
    node_prefixes = []
    if args.node_prefixes is not None:
        node_prefixes = args.node_prefixes.split(',')

    lines = sys.stdin.readlines()
    dot_graph = ParseConfigLines(lines, component_attributes = component_attributes, node_prefixes = node_prefixes)

    # the context manager closes the file even if the write fails
    with open(args.dotfile, "w") as dotfile_handle:
        dotfile_handle.write("\n".join(dot_graph))


================================================
FILE: egs/steps/nnet3/get_degs.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2016   Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
# Copyright 2014-2015   Vimal Manohar

# Decodes denlats and dumps egs for discriminative training, in one script
# (avoids writing the non-compact lattices to disk, which can use a lot of disk
# space).


# Begin configuration section.
cmd=run.pl
max_copy_jobs=5  # Limit disk I/O

# feature options
online_ivector_dir=

# example splitting and context options
frames_per_eg=150 # number of frames of labels per example.
                  # Note: may in general be a comma-separated string of alternative
                  # durations; the first one (the principal num-frames) is preferred.
frames_overlap_per_eg=30 # number of supervised frames of overlap that we aim for per eg.
                  # can be useful to avoid wasted data if you're using --left-deriv-truncate
                  # and --right-deriv-truncate.
looped=false       # Set to true to enable looped decoding [can
                   # be a bit faster, for forward-recurrent models like LSTMs.]

# .. these context options also affect decoding.
extra_left_context=0    # amount of left-context per eg, past what is required by the model
                        # (only useful for recurrent networks like LSTMs/BLSTMs)
extra_right_context=0   # amount of right-context per eg, past what is required by the model
                        # (only useful for backwards-recurrent networks like BLSTMs)
extra_left_context_initial=-1    # if >= 0, the --extra-left-context to use at
                                 # the start of utterances.  Recommend 0 if you
                                 # used 0 for the baseline DNN training; if <0,
                                 # defaults to same as extra_left_context
extra_right_context_final=-1     # if >= 0, the --extra-right-context to use at
                                 # the end of utterances.  Recommend 0 if you
                                 # used 0 for the baseline DNN training; if <0,
                                 # defaults to same as extra_right_context

compress=true   # set this to false to disable lossy compression of features
                # dumped with egs (e.g. if you want to see whether results are
                # affected).

num_utts_subset=80     # number of utterances in validation and training
                       # subsets used for diagnostics.
num_egs_subset=800     # number of egs (maximum) for the validation and training
                       # subsets used for diagnostics.
frames_per_iter=1000000 # each iteration of training, see this many frames
                        # per job.  This is just a guideline; it will pick a number
                        # that divides the number of samples in the entire data.
cleanup=true

stage=0
nj=200

# By default this script uses final.mdl in <srcdir>, this configures it.
iter=final


# decoding-graph option
self_loop_scale=0.1  # for decoding graph.. should be 1.0 for chain models.

# options relating to decoding.
frames_per_chunk_decoding=150
beam=13.0
lattice_beam=7.0
acwt=0.1
max_active=5000
min_active=200
max_mem=20000000 # This will stop the processes getting too large.
# This is in bytes, but not "real" bytes-- you have to multiply
# by something like 5 or 10 to get real bytes (not sure why so large)
num_threads=1

# affects whether we invoke lattice-determinize-non-compact after decoding
# discriminative-get-supervision.
determinize_before_split=true


# End configuration section.


echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;


if [ $# != 5 ]; then
  echo "Usage: $0 [opts] <data> <lang> <src-dir> <ali-dir> <degs-dir>"
  echo " e.g.: $0 data/train data/lang exp/nnet3/tdnn_a exp/nnet3/tdnn_a_ali exp/nnet3/tdnn_a_degs"
  echo ""
  echo "For options, see top of script file.  Standard options:"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs (probably would be good to add --max-jobs-run 5 or so if using"
  echo "                                                   # GridEngine (to avoid excessive NFS traffic)."
  echo "  --stage <stage|-8>                               # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."
  echo "  --online-ivector-dir <dir|"">                    # Directory for online-estimated iVectors, used in the"
  echo "                                                   # online-neural-net setup."
  echo "  --nj <nj|200>                                    # number of jobs to submit to the queue."
  echo "  --num-threads <n|1>                              # number of threads per decoding job"
  exit 1;
fi

data=$1
lang=$2
srcdir=$3
alidir=$4
dir=$5


extra_files=
[ ! -z $online_ivector_dir ] && \
  extra_files="$extra_files $online_ivector_dir/ivector_period $online_ivector_dir/ivector_online.scp"

# Check some files.
for f in $data/feats.scp $lang/L.fst $lang/phones/silence.csl $srcdir/${iter}.mdl $srcdir/tree \
      $srcdir/cmvn_opts $alidir/ali.1.gz $alidir/num_jobs $extra_files; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

mkdir -p $dir/log $dir/info || exit 1;

utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1;
utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;


utils/split_data.sh --per-utt $data $nj
sdata=$data/split${nj}utt


## Set up features.
echo "$0: feature type is raw"


cmvn_opts=$(cat $srcdir/cmvn_opts) || exit 1

feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"

cp $srcdir/{splice_opts,cmvn_opts} $dir 2>/dev/null || true

## set iVector options
if [ ! -z "$online_ivector_dir" ]; then
  online_ivector_period=$(cat $online_ivector_dir/ivector_period)
  ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$online_ivector_period"
fi

## set frame-subsampling-factor option and copy file
if [ -f $srcdir/frame_subsampling_factor ]; then
  frame_subsampling_factor=$(cat $srcdir/frame_subsampling_factor) || exit 1
  # e.g. for 'chain' systems
  frame_subsampling_opt="--frame-subsampling-factor=$frame_subsampling_factor"
  cp $srcdir/frame_subsampling_factor $dir
  if [ $frame_subsampling_factor -ne 1 ] && [ "$self_loop_scale" == "0.1" ]; then
    echo "$0: warning: frame_subsampling_factor is not 1 (so likely a chain system),"
    echo "...  but self-loop-scale is 0.1.  Make sure this is not a mistake."
    sleep 1
  fi
else
  frame_subsampling_factor=1
fi

if [ "$self_loop_scale" == "1.0" ] && [ "$acwt" == 0.1 ]; then
  echo "$0: warning: you set --self-loop-scale=1.0 (so likely a chain system)",
  echo " ... but the acwt is still 0.1 (you probably want --acwt 1.0)"
  sleep 1
fi

## Make the decoding graph.
# Stage 0: copy the lang directory into $dir, replace its G.fst by a unigram
# grammar estimated from the training transcripts ($data/text), and compile
# the denominator decoding graph into $dir/dengraph.
if [ $stage -le 0 ]; then
  new_lang="$dir/"$(basename "$lang")
  rm -r $new_lang 2>/dev/null
  cp -rH $lang $dir
  echo "$0: Making unigram grammar FST in $new_lang"
  # Take the OOV symbol from the lang directory given on the command line
  # (not from a hard-coded data/lang), and stop if it cannot be read.
  oov=$(cat $lang/oov.txt) || exit 1
  cat $data/text | utils/sym2int.pl --map-oov "$oov" -f 2- $lang/words.txt | \
   awk '{for(n=2;n<=NF;n++){ printf("%s ", $n); } printf("\n"); }' | \
    utils/make_unigram_grammar.pl | fstcompile | fstarcsort --sort_type=ilabel > $new_lang/G.fst \
    || exit 1;

  utils/mkgraph.sh --self-loop-scale $self_loop_scale $new_lang $srcdir $dir/dengraph || exit 1;
fi

# copy alignments into ark,scp format which allows us to use different num-jobs
# from the alignment, and is also convenient for getting priors.
if [ $stage -le 1 ]; then
  echo "$0: Copying input alignments"
  nj_ali=$(cat $alidir/num_jobs)
  alis=$(for n in $(seq $nj_ali); do echo -n "$alidir/ali.$n.gz "; done)
  $cmd $dir/log/copy_alignments.log \
     copy-int-vector "ark:gunzip -c $alis|" \
     ark,scp:$dir/ali.ark,$dir/ali.scp || exit 1;
fi

[ -f $dir/ali.scp ] || { echo "$0: expected $dir/ali.scp to exist"; exit 1; }

if [ $stage -le 2 ]; then
  echo "$0: working out number of frames of training data"
  num_frames=$(steps/nnet2/get_num_frames.sh $data)
  echo $num_frames > $dir/info/num_frames
  echo "$0: working out feature dim"
  feats_one="$(echo $feats | sed s:JOB:1:g)"
  if feat_dim=$(feat-to-dim "$feats_one" - 2>/dev/null); then
    echo $feat_dim > $dir/info/feat_dim
  else # run without stderr redirection to show the error.
    feat-to-dim "$feats_one" -; exit 1
  fi
else
  num_frames=$(cat $dir/info/num_frames)
fi
if ! [ "$num_frames" -gt 0 ]; then
  echo "$0: bad num-frames=$num_frames"; exit 1
fi

# copy the model to the degs directory.
cp $srcdir/${iter}.mdl $dir/final.mdl || exit 1

# Create some info in $dir/info

# Work out total number of archives. Add one on the assumption the
# num-frames won't divide exactly, and we want to round up.
num_archives=$[num_frames/frames_per_iter+1]

echo $num_archives >$dir/info/num_archives
echo $frame_subsampling_factor >$dir/info/frame_subsampling_factor
cp $lang/phones/silence.csl $dir/info/

# the first field in frames_per_eg (which is a comma-separated list of numbers)
# is the 'principal' frames-per-eg, and for purposes of working out the number
# of archives we assume that this will be the average number of frames per eg.
frames_per_eg_principal=$(echo $frames_per_eg | cut -d, -f1)


# read 'mof' as max_open_filehandles.
# When splitting up the scp files, we don't want to have to hold too many
# files open at once.  If the number of archives we have to write exceeds
# 256 (or less if ulimit -n is smaller), we split in two stages.
mof=$(ulimit -n) || exit 1
# the next step helps work around inconsistency between different machines on a
# cluster.  It's unlikely that the allowed number of open filehandles would ever
# be less than 256.
if [ $mof -gt 256 ]; then mof=256; fi
# allocate mof minus 3 for the max allowed outputs, because of
# stdin,stderr,stdout.  this will normally come to 253.  We'll do a two-stage
# splitting if the needed number of scp files is larger than this.
num_groups=$[(num_archives+(mof-3)-1)/(mof-3)]
group_size=$[(num_archives+num_groups-1)/num_groups]
if [ $num_groups -gt 1 ]; then
  new_num_archives=$[group_size*num_groups]
  [ $new_num_archives -ne $num_archives ] && \
    echo "$0: rounding up num-archives from $num_archives to $new_num_archives for easier splitting"
  num_archives=$new_num_archives
  echo $new_num_archives >$dir/info/num_archives
fi


if [ -e $dir/storage ]; then
  # Make soft links to storage directories, if distributing this way..  See
  # utils/create_split_dir.pl.
  echo "$0: creating data links"
  utils/create_data_link.pl $(for x in $(seq $num_archives); do echo $dir/degs.$x.ark; done)
  utils/create_data_link.pl $(for x in $(seq $num_archives); do echo $dir/degs.$x.scp; done)
  utils/create_data_link.pl $(for y in $(seq $nj); do echo $dir/degs_orig.$y.ark; done)
  utils/create_data_link.pl $(for y in $(seq $nj); do echo $dir/degs_orig.$y.scp; done)
  utils/create_data_link.pl $(for y in $(seq $nj); do echo $dir/degs_orig_filtered.$y.scp; done)
fi


extra_context_opts="--extra-left-context=$extra_left_context --extra-right-context=$extra_right_context --extra-left-context-initial=$extra_left_context_initial --extra-right-context-final=$extra_right_context_final"

# work out absolute context opts, --left-context and so on [need model context]
model_left_context=$(nnet3-am-info $srcdir/${iter}.mdl | grep "^left-context:" | awk '{print $2}')
model_right_context=$(nnet3-am-info $srcdir/${iter}.mdl | grep "^right-context:" | awk '{print $2}')
left_context=$[model_left_context+extra_left_context+frame_subsampling_factor/2]
right_context=$[model_right_context+extra_right_context+frame_subsampling_factor/2]
context_opts="--left-context=$left_context --right-context=$right_context"
if [ $extra_left_context_initial -ge 0 ]; then
  left_context_initial=$[model_left_context+extra_left_context_initial+frame_subsampling_factor/2]
  context_opts="$context_opts --left-context-initial=$left_context_initial"
fi
if [ $extra_right_context_final -ge 0 ]; then
  right_context_final=$[model_right_context+extra_right_context_final+frame_subsampling_factor/2]
  context_opts="$context_opts --right-context-final=$right_context_final"
fi

##
if [ $num_threads -eq 1 ]; then
  if $looped; then
    decoder="nnet3-latgen-faster-looped"
    [ $extra_left_context_initial -ge 0 ] && \
      decoder="$decoder --extra-left-context-initial=$extra_left_context_initial"
  else
    decoder="nnet3-latgen-faster $extra_context_opts"
  fi
  threads_cmd_opt=
else
  $looped && { echo "$0: --num-threads must be one if you use looped decoding"; exit 1; }
  threads_cmd_opt="--num-threads $num_threads"
  decoder="nnet3-latgen-faster-parallel --num-threads=$num_threads $extra_context_opts"
  true
fi

# set the command to determinize lattices, if specified.
if $determinize_before_split; then
  lattice_determinize_cmd="lattice-determinize-non-compact --acoustic-scale=$acwt --max-mem=$max_mem --minimize=true --prune=true --beam=$lattice_beam ark:- ark:-"
else
  lattice_determinize_cmd="cat"
fi

if [ $stage -le 3 ]; then
  echo "$0: decoding and dumping egs"
  $cmd $threads_cmd_opt JOB=1:$nj $dir/log/decode_and_get_egs.JOB.log \
     $decoder \
     $ivector_opts $frame_subsampling_opt \
    --frames-per-chunk=$frames_per_chunk_decoding \
    --determinize-lattice=false \
    --max-active=$max_active --min-active=$min_active --beam=$beam \
    --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=false \
    --word-symbol-table=$lang/words.txt $dir/final.mdl  \
    $dir/dengraph/HCLG.fst "$feats" ark:- \| \
    $lattice_determinize_cmd  \| \
    nnet3-discriminative-get-egs --acoustic-scale=$acwt --compress=$compress \
      $frame_subsampling_opt --num-frames=$frames_per_eg \
      --num-frames-overlap=$frames_overlap_per_eg \
      $ivector_opts $context_opts \
      $dir/final.mdl "$feats"  "ark,s,cs:-" \
      "scp:utils/filter_scp.pl $sdata/JOB/utt2spk $dir/ali.scp |" \
      ark,scp:$dir/degs_orig.JOB.ark,$dir/degs_orig.JOB.scp || exit 1
fi


if [ $stage -le 4 ]; then
  echo "$0: getting validation utterances."

  ## Get list of validation utterances.
  awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_utts_subset \
   > $dir/valid_uttlist || exit 1;

  if [ -f $data/utt2uniq ]; then  # this matters if you use data augmentation.
    echo "File $data/utt2uniq exists, so augmenting valid_uttlist to"
    echo "include all perturbed versions of the same 'real' utterances."
    mv $dir/valid_uttlist $dir/valid_uttlist.tmp
    utils/utt2spk_to_spk2utt.pl $data/utt2uniq > $dir/uniq2utt
    cat $dir/valid_uttlist.tmp | utils/apply_map.pl $data/utt2uniq | \
      sort | uniq | utils/apply_map.pl $dir/uniq2utt | \
      awk '{for(n=1;n<=NF;n++) print $n;}' | sort  > $dir/valid_uttlist
    rm $dir/uniq2utt $dir/valid_uttlist.tmp
  fi

  # the following awk statement turns 'foo123' into something like
  # '^foo123-[0-9]\+ ' which is a grep expression that matches the lines in the
  # .scp file that correspond to an utterance in valid_uttlist.
  cat $dir/valid_uttlist | awk '{printf("^%s-[0-9]\\+ \n", $1);}' \
     >$dir/valid_uttlist.regexps || exit 1

  # remove the validation utterances from degs_orig.*.scp to produce
  # degs_orig_filtered.*.scp.
  # note: the '||' true is in case the grep returns nonzero status for
  # some splits, because they were all validation utterances.
  $cmd JOB=1:$nj $dir/log/filter_and_shuffle.JOB.log \
     grep -v -f $dir/valid_uttlist.regexps $dir/degs_orig.JOB.scp '>' \
     $dir/degs_orig_filtered.JOB.scp '||' true || exit 1

  # extract just the validation utterances from degs_orig.*.scp to produce
  # degs_valid.*.scp.
  $cmd JOB=1:$nj $dir/log/extract_validation_egs.JOB.log \
    grep -f $dir/valid_uttlist.regexps $dir/degs_orig.JOB.scp '>' \
    $dir/degs_valid.JOB.scp '||' true || exit 1

  for j in $(seq $nj); do
    cat $dir/degs_valid.$j.scp; rm $dir/degs_valid.$j.scp;
  done | utils/shuffle_list.pl | head -n$num_utts_subset >$dir/valid_diagnostic.scp || exit 1

  [ -s $dir/valid_diagnostic.scp ] || { echo "$0: error getting validation egs"; exit 1; }
fi


# function/pseudo-command to randomly shuffle input lines using a small buffer size
# Reads lines from stdin and writes them to stdout.  At most 1000 lines
# ($bufsz) are buffered: whenever the buffer becomes full, one randomly chosen
# buffered line is printed and its slot is refilled with the last buffered
# line.  The lines still buffered at end of input are shuffled and printed.
# The random number generator is seeded with a fixed value (srand(0)).
function shuffle {
    perl -e ' use List::Util qw(shuffle); srand(0);
       $bufsz=1000; @A = (); while(<STDIN>) { push @A, $_; if (@A == $bufsz) {
       $n=int(rand()*$bufsz); print $A[$n]; $A[$n] = $A[$bufsz-1]; pop @A; }}
       @A = shuffle(@A); print @A; '
}
# funtion/pseudo-command to put input lines round robin to command line args.
function round_robin {
  perl -e '@F=(); foreach $a (@ARGV) { my $f; open($f, ">$a") || die "opening file $a"; push @F, $f; }
         $N=@F; $N>0||die "No output files"; $n=0;
         while (<STDIN>) { $fh=$F[$n%$N]; $n++; print $fh $_ || die "error printing"; } ' $*
}


# Stage 5: shuffle the filtered per-job scp files and distribute their lines
# over the per-archive scp files degs.*.scp.  If $num_groups is greater than 1
# this is done in two passes, via the intermediate files degs_group.*.scp.
if [ $stage -le 5 ]; then
  echo "$0: rearranging scp files"

  if [ $num_groups -eq 1 ]; then
    # output directly to the archive files.
    outputs=$(for n in $(seq $num_archives); do echo $dir/degs.$n.scp; done)
  else
    # output to intermediate 'group' files.
    outputs=$(for g in $(seq $num_groups); do echo $dir/degs_group.$g.scp; done)
  fi

  # We can't use UNIX's split command because of compatibility issues (BSD
  # version very different from GNU version), so we use 'round_robin' which is
  # a bash function that calls an inline perl script.
  for j in $(seq $nj); do cat $dir/degs_orig_filtered.$j.scp; done | \
    shuffle | round_robin $outputs || exit 1

  if [ $num_groups -gt 1 ]; then
    # second pass: split group g into the archives
    # group_size*(g-1)+1 ... group_size*g.
    for g in $(seq $num_groups); do
      first=$((1 + group_size * (g - 1)))
      last=$((group_size * g))
      outputs=$(for n in $(seq $first $last); do echo $dir/degs.$n.scp; done)
      # check the status here too, as is done for the first pass above.
      shuffle < $dir/degs_group.$g.scp | round_robin $outputs || exit 1
    done
  fi
fi

# Stage 6: create train_diagnostic.scp from the degs.*.scp files.
if [ $stage -le 6 ]; then
  echo "$0: getting train-subset scp"
  # Rather than cat'ing all the degs files, shuffling and taking the head, we
  # only take this many lines from the top and from the tail of each degs file
  # (which is quicker).
  lines_per_archive=$(($num_egs_subset/$num_archives + 1))

  # utils/shuffle_list.pl provides a complete shuffle (ok since the amount of
  # data is small).  note: shuf is not available on mac by default.
  for archive in $(seq $num_archives); do
    archive_scp=$dir/degs.$archive.scp
    head -n$lines_per_archive $archive_scp
    tail -n$lines_per_archive $archive_scp
  done | utils/shuffle_list.pl | head -n$num_utts_subset >$dir/train_diagnostic.scp

  if [ ! -s $dir/train_diagnostic.scp ]; then
    echo "$0: error getting train_diagnostic.scp"
    exit 1
  fi
fi

# Stage 7: copy the examples listed in the scp files into archives: one
# degs.JOB.ark per degs.JOB.scp, plus the train and valid diagnostic archives.
if [ $stage -le 7 ]; then
  echo "$0: creating final archives"
  # at most $max_copy_jobs of the per-archive copy jobs run at the same time.
  $cmd --max-jobs-run "$max_copy_jobs" JOB=1:$num_archives \
    $dir/log/copy_archives.JOB.log \
    nnet3-discriminative-copy-egs scp:$dir/degs.JOB.scp ark:$dir/degs.JOB.ark \
    || exit 1

  # the diagnostic subsets are always copied locally with run.pl.
  for subset in train valid; do
    run.pl $dir/log/copy_${subset}_subset.log \
      nnet3-discriminative-copy-egs scp:$dir/${subset}_diagnostic.scp \
      ark:$dir/${subset}_diagnostic.degs || exit 1
  done
fi

# Final stage (only if $cleanup is true): remove the intermediate files.
# For a file that is a soft link, the path printed by utils/make_absolute.sh
# is removed first (presumably the link target -- confirm in that script), and
# then the file itself.
if [ $stage -le 10 ] && $cleanup; then
  echo "$0: cleaning up temporary files."
  for job in $(seq $nj); do
    for tmpfile in $dir/degs_orig.$job.ark $dir/degs_orig.$job.scp \
                   $dir/degs_orig_filtered.$job.scp; do
      if [ -L $tmpfile ]; then
        rm $(utils/make_absolute.sh $tmpfile)
      fi
      rm $tmpfile
    done
  done
  # these may not all exist, so error messages are discarded.
  rm $dir/degs_group.*.scp $dir/valid_diagnostic.scp $dir/train_diagnostic.scp 2>/dev/null
  rm $dir/ali.ark $dir/ali.scp 2>/dev/null
  for archive in $(seq $num_archives); do
    tmpfile=$dir/degs.$archive.scp
    if [ -L $tmpfile ]; then
      rm $(utils/make_absolute.sh $tmpfile)
    fi
    rm $tmpfile
  done
fi


echo "$0: Finished decoding and preparing training examples"

exit 0


================================================
FILE: egs/steps/nnet3/get_egs.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2016 Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
#
# This script, which will generally be called from other neural-net training
# scripts, extracts the training examples used to train the neural net (and also
# the validation examples used for diagnostics), and puts them in separate archives.
#
# This script dumps egs with several frames of labels, controlled by the
# frames_per_eg config variable (default: 8).  This takes many times less disk
# space because typically we have 4 to 7 frames of context on the left and
# right, and this ends up getting shared.  This is at the expense of slightly
# higher disk I/O while training.

# Make the exit status of a pipeline nonzero if any command in it fails, and
# ignore SIGPIPE in this shell.
set -o pipefail
trap "" PIPE

# Begin configuration section.
# The variables below are the defaults; parse_options.sh (sourced further
# down) is what processes the command-line options.
cmd=run.pl        # how to run jobs (see --cmd in the usage message).
frame_subsampling_factor=1  # passed to nnet3-get-egs as --frame-subsampling-factor;
                            # also written to $dir/info if it is not 1.
frames_per_eg=8   # number of frames of labels per example.  more->less disk space and
                  # less time preparing egs, but more I/O during training.
                  # Note: may in general be a comma-separated string of alternative
                  # durations (more useful when using large chunks, e.g. for BLSTMs);
                  # the first one (the principal num-frames) is preferred.
left_context=4    # amount of left-context per eg (i.e. extra frames of input features
                  # not present in the output supervision).
right_context=4   # amount of right-context per eg.
left_context_initial=-1    # if >=0, left-context for first chunk of an utterance
right_context_final=-1     # if >=0, right-context for last chunk of an utterance
compress=true   # set this to false to disable compression (e.g. if you want to see whether
                # results are affected).

num_utts_subset=300     # number of utterances in validation and training
                        # subsets used for shrinkage and diagnostics.
num_valid_frames_combine=0 # #valid frames for combination weights at the very end.
num_train_frames_combine=60000 # # train frames for the above.
num_frames_diagnostic=10000 # number of frames for "compute_prob" jobs
samples_per_iter=400000 # this is the target number of egs in each archive of egs
                        # (prior to merging egs).  We probably should have called
                        # it egs_per_iter. This is just a guideline; it will pick
                        # a number that divides the number of samples in the
                        # entire data.

stage=0
nj=6         # This should be set to the maximum number of jobs you are
             # comfortable to run in parallel; you can increase it if your disk
             # speed is greater and you have more machines.
srand=0     # rand seed for nnet3-copy-egs and nnet3-shuffle-egs
online_ivector_dir=  # can be used if we are including speaker information as iVectors.
cmvn_opts=  # can be used for specifying CMVN options, if feature type is not lda (if lda,
            # it doesn't make sense to use different options than were used as input to the
            # LDA transform).  This is used to turn off CMVN in the online-nnet experiments.
online_cmvn=false # Set to 'true' to replace 'apply-cmvn' by 'apply-cmvn-online' in the nnet3 input.
                  # The configuration is passed externally via '$cmvn_opts' given to train.py,
                  # typically as: --cmvn-opts="--config conf/online_cmvn.conf".
                  # The global_cmvn.stats are computed by this script from the features.
                  # Note: the online cmvn for the ivector extractor is controlled separately in
                  #       steps/online/nnet2/train_ivector_extractor.sh by --online-cmvn-iextractor

generate_egs_scp=false # If true, it will generate egs.JOB.*.scp per egs archive
# End configuration section.

echo "$0 $@"  # Print the command line for logging

# path.sh is optional; parse_options.sh must be sourced after the defaults
# above have been set.
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# Exactly three positional arguments are required; otherwise print the usage
# message and exit with status 1.  The defaults shown in the message are kept
# in sync with the configuration section at the top of the script.
if [ $# != 3 ]; then
  echo "Usage: $0 [opts] <data> <ali-dir> <egs-dir>"
  echo " e.g.: $0 data/train exp/tri3_ali exp/tri4_nnet/egs"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --nj <nj>                                        # The maximum number of jobs you want to run in"
  echo "                                                   # parallel (increase this only if you have good disk and"
  echo "                                                   # network speed).  default=6"
  echo "  --cmd (utils/run.pl;utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --samples-per-iter <#samples;400000>             # Target number of egs per archive (option is badly named)"
  echo "  --frames-per-eg <frames;8>                       # number of frames per eg on disk"
  echo "                                                   # May be either a single number or a comma-separated list"
  echo "                                                   # of alternatives (useful when training LSTMs, where the"
  echo "                                                   # frames-per-eg is the chunk size, to get variety of chunk"
  echo "                                                   # sizes).  The first in the list is preferred and is used"
  echo "                                                   # when working out the number of archives etc."
  echo "  --left-context <int;4>                           # Number of frames on left side to append for feature input"
  echo "  --right-context <int;4>                          # Number of frames on right side to append for feature input"
  echo "  --left-context-initial <int;-1>                  # If >= 0, left-context for first chunk of an utterance"
  echo "  --right-context-final <int;-1>                   # If >= 0, right-context for last chunk of an utterance"
  echo "  --num-frames-diagnostic <#frames;10000>          # Number of frames used in computing (train,valid) diagnostics"
  echo "  --num-valid-frames-combine <#frames;0>           # Number of validation frames used in getting combination"
  echo "                                                   # weights at the very end."
  echo "  --num-train-frames-combine <#frames;60000>       # Number of training frames used in getting combination"
  echo "                                                   # weights at the very end."
  echo "  --stage <stage|0>                                # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."

  exit 1;
fi

# Positional arguments: data directory, alignment directory, output egs directory.
data=$1
alidir=$2
dir=$3

# Check some files.
# extra_files is reset first so that a value inherited from the environment
# cannot end up in the list of required files.
extra_files=
[ ! -z "$online_ivector_dir" ] && \
  extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"

for f in $data/feats.scp $alidir/ali.1.gz $alidir/final.mdl $alidir/tree $extra_files; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

# the per-job feature pipelines set up later read from $sdata/JOB/.
sdata=$data/split$nj
utils/split_data.sh $data $nj

mkdir -p $dir/log $dir/info
cp $alidir/tree $dir

# number of ali.*.gz files in the alignment directory.
num_ali_jobs=$(cat $alidir/num_jobs) || exit 1;


# We need more than 4 * num_utts_subset utterances in the data.
num_utts=$(cat $data/utt2spk | wc -l)
if ! [ $num_utts -gt $[$num_utts_subset*4] ]; then
  echo "$0: number of utterances $num_utts in your training data is too small versus --num-utts-subset=$num_utts_subset"
  echo "... you probably have so little data that it doesn't make sense to train a neural net."
  exit 1
fi

# Choose the held-out validation utterances: $num_utts_subset utterance-ids
# taken from a shuffled list of all utterances in the data directory.
valid_list=$dir/valid_uttlist
awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl 2>/dev/null \
  | head -n $num_utts_subset > $valid_list

if [ -f $data/utt2uniq ]; then  # this matters if you use data augmentation.
  echo "File $data/utt2uniq exists, so augmenting valid_uttlist to"
  echo "include all perturbed versions of the same 'real' utterances."
  # Map the chosen utterances to their 'uniq' ids and back to every utterance
  # sharing one of those ids.
  mv $valid_list $valid_list.tmp
  utils/utt2spk_to_spk2utt.pl $data/utt2uniq > $dir/uniq2utt
  utils/apply_map.pl $data/utt2uniq < $valid_list.tmp \
    | sort | uniq \
    | utils/apply_map.pl $dir/uniq2utt \
    | awk '{for(n=1;n<=NF;n++) print $n;}' \
    | sort > $valid_list
  rm $dir/uniq2utt $valid_list.tmp
fi

# The training subset has the same target size, and is drawn from the
# utterances that are not in the validation list.
awk '{print $1}' $data/utt2spk \
  | utils/filter_scp.pl --exclude $valid_list \
  | utils/shuffle_list.pl 2>/dev/null \
  | head -n $num_utts_subset > $dir/train_subset_uttlist

echo "$0: creating egs.  To ensure they are not deleted later you can do:  touch $dir/.nodelete"

## Set up features.
# Three feature rspecifiers are defined: $feats (per-job training data, with
# the validation utterances excluded; 'JOB' is a placeholder), $valid_feats and
# $train_subset_feats (the two small subsets, read from the whole data dir).

if $online_cmvn; then
  # Online CMVN needs the global CMVN stats, so sum them up first.
  #
  # caution: the top-level nnet training script should copy
  # 'global_cmvn.stats' and 'online_cmvn' to its own dir.
  if ! matrix-sum --binary=false scp:$data/cmvn.scp - >$dir/global_cmvn.stats 2>/dev/null; then
    echo "$0: Error summing cmvn stats"
    exit 1
  fi
  touch $dir/online_cmvn

  # the alternative front-end with 'apply-cmvn-online',
  # - the $cmvn_opts can be set to '--config=conf/online_cmvn.conf' which is the setup of ivector-extractor,
  echo "$0: feature type is raw, with 'apply-cmvn-online'"
  feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn-online $cmvn_opts --spk2utt=ark:$sdata/JOB/spk2utt  $dir/global_cmvn.stats scp:- ark:- |"
  valid_spk2utt="ark:utils/filter_scp.pl $dir/valid_uttlist $data/utt2spk | utils/utt2spk_to_spk2utt.pl |"
  valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn-online $cmvn_opts --spk2utt=\"$valid_spk2utt\" $dir/global_cmvn.stats scp:- ark:- |"
  train_subset_spk2utt="ark:utils/filter_scp.pl $dir/train_subset_uttlist $data/utt2spk | utils/utt2spk_to_spk2utt.pl |"
  train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn-online $cmvn_opts --spk2utt=\"$train_subset_spk2utt\" $dir/global_cmvn.stats scp:- ark:- |"
else
  # remove a marker file that an earlier run with online CMVN may have left.
  if [ -f $dir/online_cmvn ]; then
    rm $dir/online_cmvn
  fi

  # the original front-end with 'apply-cmvn',
  echo "$0: feature type is raw, with 'apply-cmvn'"
  feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- |"
  # both subsets share the same CMVN stage over the whole data directory.
  whole_data_cmvn="apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |"
  valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | $whole_data_cmvn"
  train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | $whole_data_cmvn"
fi
echo $cmvn_opts >$dir/cmvn_opts # caution: the top-level nnet training script should copy this to its own dir now.


# iVector options for nnet3-get-egs (empty if no online iVectors are used);
# the iVector dimension is recorded in $dir/info/ivector_dim (0 if unused).
if [ -n "$online_ivector_dir" ]; then
  ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1;
  echo $ivector_dim > $dir/info/ivector_dim
  steps/nnet2/get_ivector_id.sh $online_ivector_dir > $dir/info/final.ie.id || exit 1
  ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1;
  ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period"
else
  echo 0 >$dir/info/ivector_dim
  ivector_opts=
fi

# Stage 1 computes the number of frames and the feature dimension and stores
# them under $dir/info; when the stage is skipped they are read back from there.
if [ $stage -gt 1 ]; then
  num_frames=$(cat $dir/info/num_frames) || exit 1;
  feat_dim=$(cat $dir/info/feat_dim) || exit 1;
else
  echo "$0: working out number of frames of training data"
  num_frames=$(steps/nnet2/get_num_frames.sh $data)
  echo $num_frames > $dir/info/num_frames
  echo "$0: working out feature dim"
  # use the feature pipeline of job 1 to find the dimension.
  feats_one=$(echo $feats | sed s/JOB/1/g)
  feat_dim=$(feat-to-dim "$feats_one" - 2>/dev/null)
  if [ $? -ne 0 ]; then
    # run again without redirection to show the error.
    feat-to-dim "$feats_one" -
    exit 1
  fi
  echo $feat_dim > $dir/info/feat_dim
fi


# frames_per_eg is a comma-separated list of numbers; its first field is the
# 'principal' frames-per-eg.  For purposes of working out the number of
# archives we assume that this will be the average number of frames per eg.
frames_per_eg_principal=${frames_per_eg%%,*}

# the + 1 is to round up, not down... we assume it doesn't divide exactly.
num_archives=$(($num_frames/($frames_per_eg_principal*$samples_per_iter)+1))
if [ $num_archives -eq 1 ]; then
  echo "*** $0: warning: the --frames-per-eg is too large to generate one archive with"
  echo "*** as many as --samples-per-iter egs in it.  Consider reducing --frames-per-eg."
  sleep 4
fi

# If $num_archives exceeds the number of filehandles a process may have open
# (ulimit -n), we first create a smaller number ($num_archives_intermediate) of
# larger archives, each of which is later split into $archives_multiple final
# archives.  ulimit sometimes gives a misleading answer as GridEngine
# sometimes changes that somehow, so the limit is capped at 512.
max_open_filehandles=$(ulimit -n) || exit 1
if [ $max_open_filehandles -gt 512 ]; then
  max_open_filehandles=512
fi
archives_multiple=1
num_archives_intermediate=$num_archives
while [ $(($num_archives_intermediate+4)) -gt $max_open_filehandles ]; do
  archives_multiple=$(($archives_multiple+1))
  num_archives_intermediate=$(($num_archives/$archives_multiple+1))
done
# now make sure num_archives is an exact multiple of archives_multiple.
num_archives=$(($archives_multiple*$num_archives_intermediate))

echo $num_archives >$dir/info/num_archives
echo $frames_per_eg >$dir/info/frames_per_eg

# Work out the number of egs per archive; by construction it should not exceed
# samples_per_iter.
egs_per_archive=$(($num_frames/($frames_per_eg_principal*$num_archives)))
if ! [ $egs_per_archive -le $samples_per_iter ]; then
  echo "$0: script error: egs_per_archive=$egs_per_archive not <= samples_per_iter=$samples_per_iter"
  exit 1
fi

echo $egs_per_archive > $dir/info/egs_per_archive

echo "$0: creating $num_archives archives, each with $egs_per_archive egs, with"
echo "$0:   $frames_per_eg labels per example, and (left,right) context = ($left_context,$right_context)"
if [ $left_context_initial -ge 0 ] || [ $right_context_final -ge 0 ]; then
  echo "$0:   ... and (left-context-initial,right-context-final) = ($left_context_initial,$right_context_final)"
fi


if [ -e $dir/storage ]; then
  # A 'storage' entry in the egs dir means the archives are to be distributed
  # over several storage directories, so create the soft links for them.  See
  # utils/create_split_dir.pl.
  echo "$0: creating data links"
  final_archives=
  for archive in $(seq $num_archives); do
    final_archives="$final_archives $dir/egs.$archive.ark"
  done
  utils/create_data_link.pl $final_archives
  for archive in $(seq $num_archives_intermediate); do
    orig_archives=
    for job in $(seq $nj); do
      orig_archives="$orig_archives $dir/egs_orig.$job.$archive.ark"
    done
    utils/create_data_link.pl $orig_archives
  done
fi

# Stage 2: uncompress all alignment files into one archive with an scp index.
if [ $stage -le 2 ]; then
  echo "$0: copying data alignments"
  for ali_job in $(seq $num_ali_jobs); do
    gunzip -c $alidir/ali.$ali_job.gz
  done | copy-int-vector ark:- ark,scp:$dir/ali.ark,$dir/ali.scp || exit 1;
fi

# Options shared by all the nnet3-get-egs calls below; the context options for
# the first/last chunk of an utterance are only passed on if they are >= 0.
egs_opts="--left-context=$left_context --right-context=$right_context --compress=$compress --num-frames=$frames_per_eg"
if [ $left_context_initial -ge 0 ]; then
  egs_opts="$egs_opts --left-context-initial=$left_context_initial"
fi
if [ $right_context_final -ge 0 ]; then
  egs_opts="$egs_opts --right-context-final=$right_context_final"
fi

# record the context settings in $dir/info, one file per setting.
for setting in left_context right_context left_context_initial right_context_final; do
  echo ${!setting} > $dir/info/$setting
done

# number of pdfs, taken from the 'num-pdfs' line printed by tree-info; it is
# also needed by stage 4.
num_pdfs=$(tree-info --print-args=false $alidir/tree | grep num-pdfs | awk '{print $2}')

# Stage 3: dump the examples of the validation and training subsets, then
# select from them the examples used for diagnostics (valid_diagnostic.egs,
# train_diagnostic.egs) and for the final model combination (combine.egs).
if [ $stage -le 3 ]; then
  echo "$0: Getting validation and training subset examples."
  rm $dir/.error 2>/dev/null
  echo "$0: ... extracting validation and training-subset alignments."


  # do the filtering just once, as ali.scp may be long.
  utils/filter_scp.pl <(cat $dir/valid_uttlist $dir/train_subset_uttlist) \
    <$dir/ali.scp >$dir/ali_special.scp

  # the two jobs run in the background; a failure is signalled by creating
  # $dir/.error, which is checked after the 'wait'.
  $cmd $dir/log/create_valid_subset.log \
    utils/filter_scp.pl $dir/valid_uttlist $dir/ali_special.scp \| \
    ali-to-pdf $alidir/final.mdl scp:- ark:- \| \
    ali-to-post ark:- ark:- \| \
    nnet3-get-egs --num-pdfs=$num_pdfs --frame-subsampling-factor=$frame_subsampling_factor \
      $ivector_opts $egs_opts "$valid_feats" \
      ark,s,cs:- "ark:$dir/valid_all.egs" || touch $dir/.error &
  $cmd $dir/log/create_train_subset.log \
    utils/filter_scp.pl $dir/train_subset_uttlist $dir/ali_special.scp \| \
    ali-to-pdf $alidir/final.mdl scp:- ark:- \| \
    ali-to-post ark:- ark:- \| \
    nnet3-get-egs --num-pdfs=$num_pdfs --frame-subsampling-factor=$frame_subsampling_factor \
      $ivector_opts $egs_opts "$train_subset_feats" \
      ark,s,cs:- "ark:$dir/train_subset_all.egs" || touch $dir/.error &
  wait;
  [ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1
  echo "... Getting subsets of validation examples for diagnostics and combination."
  if $generate_egs_scp; then
    valid_diagnostic_output="ark,scp:$dir/valid_diagnostic.egs,$dir/valid_diagnostic.scp"
    train_diagnostic_output="ark,scp:$dir/train_diagnostic.egs,$dir/train_diagnostic.scp"
  else
    valid_diagnostic_output="ark:$dir/valid_diagnostic.egs"
    train_diagnostic_output="ark:$dir/train_diagnostic.egs"
  fi
  # the number of egs to select is the requested number of frames divided by
  # the principal frames-per-eg.
  # NOTE(review): these four jobs also create $dir/.error on failure, but it is
  # not checked after the 'wait' below; only the non-empty check on the output
  # files catches problems here.
  $cmd $dir/log/create_valid_subset_combine.log \
    nnet3-subset-egs --n=$[$num_valid_frames_combine/$frames_per_eg_principal] ark:$dir/valid_all.egs \
      ark:$dir/valid_combine.egs || touch $dir/.error &
  $cmd $dir/log/create_valid_subset_diagnostic.log \
    nnet3-subset-egs --n=$[$num_frames_diagnostic/$frames_per_eg_principal] ark:$dir/valid_all.egs \
    $valid_diagnostic_output || touch $dir/.error &

  $cmd $dir/log/create_train_subset_combine.log \
    nnet3-subset-egs --n=$[$num_train_frames_combine/$frames_per_eg_principal] ark:$dir/train_subset_all.egs \
      ark:$dir/train_combine.egs || touch $dir/.error &
  $cmd $dir/log/create_train_subset_diagnostic.log \
    nnet3-subset-egs --n=$[$num_frames_diagnostic/$frames_per_eg_principal] ark:$dir/train_subset_all.egs \
    $train_diagnostic_output || touch $dir/.error &
  wait
  sleep 5  # wait for file system to sync.
  # combine.egs is written exactly once, by whichever branch applies.
  if $generate_egs_scp; then
    cat $dir/valid_combine.egs $dir/train_combine.egs  | \
    nnet3-copy-egs ark:- ark,scp:$dir/combine.egs,$dir/combine.scp
    # the *_combine.egs files above are written without an scp, so normally
    # there is nothing to remove here; -f keeps this silent.
    rm -f $dir/{train,valid}_combine.scp
  else
    cat $dir/valid_combine.egs $dir/train_combine.egs > $dir/combine.egs
  fi
  for f in $dir/{combine,train_diagnostic,valid_diagnostic}.egs; do
    [ ! -s $f ] && echo "No examples in file $f" && exit 1;
  done
  rm $dir/valid_all.egs $dir/train_subset_all.egs $dir/{train,valid}_combine.egs
fi

# Stage 4: dump the training examples, one job per data split.
if [ $stage -le 4 ]; then
  # The outputs are egs_orig.<job>.<n>.ark, where <job> goes up to $nj and <n>
  # up to $num_archives_intermediate ('JOB' is a placeholder in the names
  # below).
  egs_list=
  for archive in $(seq $num_archives_intermediate); do
    egs_list+=" ark:$dir/egs_orig.JOB.$archive.ark"
  done
  echo "$0: Generating training examples on disk"
  # the alignments of this job's utterances, converted to pdf-level posteriors.
  ali_rspecifier="ark,s,cs:filter_scp.pl $sdata/JOB/utt2spk $dir/ali.scp | ali-to-pdf $alidir/final.mdl scp:- ark:- | ali-to-post ark:- ark:- |"
  # The examples will go round-robin to egs_list.
  $cmd JOB=1:$nj $dir/log/get_egs.JOB.log \
    nnet3-get-egs --num-pdfs=$num_pdfs --frame-subsampling-factor=$frame_subsampling_factor \
      $ivector_opts $egs_opts "$feats" "$ali_rspecifier" ark:- \| \
    nnet3-copy-egs --random=true --srand=\$[JOB+$srand] ark:- $egs_list || exit 1;
fi

# Stage 5: for each intermediate archive index, concatenate the corresponding
# egs_orig.*.JOB.ark files of all $nj data splits, shuffle the examples, and
# write the final archive(s).
if [ $stage -le 5 ]; then
  echo "$0: recombining and shuffling order of archives on disk"
  # combine all the "egs_orig.*.JOB.ark" (over the $nj splits of the data) and
  # shuffle the order, writing to the egs.JOB.ark

  # the input is a concatenation over the input jobs.
  egs_list=
  for n in $(seq $nj); do
    egs_list="$egs_list $dir/egs_orig.$n.JOB.ark"
  done

  if [ $archives_multiple == 1 ]; then # normal case.
    if $generate_egs_scp; then
      output_archive="ark,scp:$dir/egs.JOB.ark,$dir/egs.JOB.scp"
    else
      output_archive="ark:$dir/egs.JOB.ark"
    fi
    $cmd --max-jobs-run $nj JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \
      nnet3-shuffle-egs --srand=\$[JOB+$srand] "ark:cat $egs_list|" $output_archive  || exit 1;

    if $generate_egs_scp; then
      #concatenate egs.JOB.scp in single egs.scp
      rm $dir/egs.scp 2> /dev/null || true
      for j in $(seq $num_archives_intermediate); do
        cat $dir/egs.$j.scp || exit 1;
      done > $dir/egs.scp || exit 1;
      for f in $dir/egs.*.scp; do rm $f; done
    fi
  else
    # we need to shuffle the 'intermediate archives' and then split into the
    # final archives.  we create soft links to manage this splitting, because
    # otherwise managing the output names is quite difficult (and we don't want
    # to submit separate queue jobs for each intermediate archive, because then
    # the --max-jobs-run option is hard to enforce).
    if $generate_egs_scp; then
      output_archives="$(for y in $(seq $archives_multiple); do echo ark,scp:$dir/egs.JOB.$y.ark,$dir/egs.JOB.$y.scp; done)"
    else
      output_archives="$(for y in $(seq $archives_multiple); do echo ark:$dir/egs.JOB.$y.ark; done)"
    fi
    for x in $(seq $num_archives_intermediate); do
      for y in $(seq $archives_multiple); do
        archive_index=$[($x-1)*$archives_multiple+$y]
        # egs.intermediate_archive.{1,2,...}.ark will point to egs.archive.ark
        ln -sf egs.$archive_index.ark $dir/egs.$x.$y.ark || exit 1
      done
    done
    $cmd --max-jobs-run $nj JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \
      nnet3-shuffle-egs --srand=\$[JOB+$srand] "ark:cat $egs_list|" ark:- \| \
      nnet3-copy-egs ark:- $output_archives || exit 1;

    if $generate_egs_scp; then
      #concatenate egs.JOB.scp in single egs.scp
      rm $dir/egs.scp 2> /dev/null || true
      for j in $(seq $num_archives_intermediate); do
        # each intermediate archive j was split into $archives_multiple
        # pieces, so that is the range of the second index of egs.$j.$y.scp.
        for y in $(seq $archives_multiple); do
          cat $dir/egs.$j.$y.scp || exit 1;
        done
      done > $dir/egs.scp || exit 1;
      for f in $dir/egs.*.*.scp; do rm $f; done
    fi
  fi
fi

# The frame subsampling factor is only recorded when it differs from 1.
if [ $frame_subsampling_factor -ne 1 ]; then
  echo $frame_subsampling_factor > $dir/info/frame_subsampling_factor
fi

# Stage 6: delete the temporary files created by the earlier stages.
if [ $stage -le 6 ]; then
  echo "$0: removing temporary archives"
  for job in $(seq $nj); do
    for archive in $(seq $num_archives_intermediate); do
      tmp_archive=$dir/egs_orig.$job.$archive.ark
      # for a soft link, first remove the path printed by
      # utils/make_absolute.sh (presumably the link target -- confirm there).
      if [ -L $tmp_archive ]; then
        rm $(utils/make_absolute.sh $tmp_archive)
      fi
      rm $tmp_archive
    done
  done
  if [ $archives_multiple -gt 1 ]; then
    # there are some extra soft links that we should delete.
    for link in $dir/egs.*.*.ark; do
      rm $link
    done
  fi
  echo "$0: removing temporary alignments"
  # Errors are ignored below because the alignment copies might not exist.
  rm $dir/ali.ark $dir/ali.scp 2>/dev/null
fi

echo "$0: Finished preparing training examples"


================================================
FILE: egs/steps/nnet3/get_egs_discriminative.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2016   Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
# Copyright 2014-2015   Vimal Manohar

# Note: you may find it more convenient to use the newer script get_degs.sh, which
# combines decoding and example-creation in one step without writing lattices.

# This script dumps examples MPE or MMI or state-level minimum bayes risk (sMBR)
# training of neural nets.
# Criterion supported are mpe, smbr and mmi

# Begin configuration section.
# The variables below are the defaults; parse_options.sh (sourced after this
# section) is what processes the command-line options.
cmd=run.pl        # how to run jobs (see --cmd in the usage message).
frames_per_eg=150 # number of frames of labels per example.  more->less disk space and
                  # less time preparing egs, but more I/O during training.
                  # Note: may in general be a comma-separated string of alternative
                  # durations; the first one (the principal num-frames) is preferred.
frames_overlap_per_eg=30 # number of supervised frames of overlap that we aim for per eg.
                  # can be useful to avoid wasted data if you're using --left-deriv-truncate
                  # and --right-deriv-truncate.
frame_subsampling_factor=1 # ratio between input and output frame-rate of nnet.
                           # this should be read from the nnet. For now, it is taken as an option
left_context=4    # amount of left-context per eg (i.e. extra frames of input features
                  # not present in the output supervision).
right_context=4   # amount of right-context per eg.
left_context_initial=-1    # if >=0, left-context for first chunk of an utterance
right_context_final=-1     # if >=0, right-context for last chunk of an utterance
adjust_priors=true  # NOTE(review): presumably enables dumping of the egs used to
                    # adjust the priors (see num_priors_subset) -- confirm below.
compress=true   # set this to false to disable compression (e.g. if you want to see whether
                # results are affected).
num_utts_subset=80     # number of utterances in validation and training
                        # subsets used for shrinkage and diagnostics.

frames_per_iter=400000 # each iteration of training, see this many frames
                       # per job.  This is just a guideline; it will pick a number
                       # that divides the number of samples in the entire data.

acwt=0.1  # NOTE(review): presumably the acoustic scale applied to the
          # denominator lattices -- confirm where it is used.

stage=0
max_jobs_run=15          # NOTE(review): presumably limits the number of jobs run
max_shuffle_jobs_run=15  # in parallel (egs dumping / shuffling) -- confirm below.

online_ivector_dir=  # if set, must contain ivector_period and ivector_online.scp.
cmvn_opts=  # can be used for specifying CMVN options, if feature type is not lda (if lda,
            # it doesn't make sense to use different options than were used as input to the
            # LDA transform).  This is used to turn off CMVN in the online-nnet experiments.

num_priors_subset=1000  #  number of utterances used to calibrate the per-state
                        #  priors.  Note: these don't have to be held out from
                        #  the training data.
num_archives_priors=10

# End configuration section.


echo "$0 $@"  # Print the command line for logging

# path.sh is optional; parse_options.sh must be sourced after the defaults
# above have been set.
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;


# Exactly six positional arguments are required; otherwise print the usage
# message and exit with status 1.  Option names and defaults shown here match
# the configuration section at the top of the script.
if [ $# != 6 ]; then
  echo "Usage: $0 [opts] <data> <lang> <ali-dir> <denlat-dir> <src-model-file> <degs-dir>"
  echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet_denlats exp/tri4/final.mdl exp/tri4_mpe/degs"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs (probably would be good to add --max-jobs-run 5 or so if using"
  echo "                                                   # GridEngine (to avoid excessive NFS traffic)."
  echo "  --frames-per-iter <#frames|400000>               # Number of frames of data to process per iteration, per"
  echo "                                                   # job (just a guideline)."
  echo "  --stage <stage|0>                                # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."
  echo "  --online-ivector-dir <dir|\"\">                    # Directory for online-estimated iVectors, used in the"
  echo "                                                   # online-neural-net setup."
  echo "  --left-context <int;4>                           # Number of frames on left side to append for feature input"
  echo "  --right-context <int;4>                          # Number of frames on right side to append for feature input"
  echo "  --left-context-initial <int;-1>                  # If >= 0, left-context for first chunk of an utterance"
  echo "  --right-context-final <int;-1>                   # If >= 0, right-context for last chunk of an utterance"
  exit 1;
fi

# Positional arguments: data dir, lang dir, alignment dir, denominator-lattice
# dir, source model file, output degs dir.
data=$1
lang=$2
alidir=$3
denlatdir=$4
src_model=$5
dir=$6

# The iVector files are only required if --online-ivector-dir was given.  The
# variable is quoted so that the test is also well-formed when it is empty or
# contains whitespace.
extra_files=
[ ! -z "$online_ivector_dir" ] && \
  extra_files="$online_ivector_dir/ivector_period $online_ivector_dir/ivector_online.scp"

# Check some files.
for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/num_jobs $alidir/tree \
         $denlatdir/lat.1.gz $denlatdir/num_jobs $src_model $extra_files; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

mkdir -p $dir/log $dir/info || exit 1;

# the phone sets of the lang dir and of the alignments must be compatible.
utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

# use the same number of jobs as the denominator lattices were created with.
nj=$(cat $denlatdir/num_jobs) || exit 1;

sdata=$data/split$nj
utils/split_data.sh $data $nj

# Choose the held-out validation utterances: $num_utts_subset utterance-ids
# taken from a shuffled list of all utterances in the data directory.
valid_list=$dir/valid_uttlist
awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl \
  | head -n $num_utts_subset > $valid_list || exit 1;

if [ -f $data/utt2uniq ]; then  # this matters if you use data augmentation.
  echo "File $data/utt2uniq exists, so augmenting valid_uttlist to"
  echo "include all perturbed versions of the same 'real' utterances."
  # Map the chosen utterances to their 'uniq' ids and back to every utterance
  # sharing one of those ids.
  mv $valid_list $valid_list.tmp
  utils/utt2spk_to_spk2utt.pl $data/utt2uniq > $dir/uniq2utt
  utils/apply_map.pl $data/utt2uniq < $valid_list.tmp \
    | sort | uniq \
    | utils/apply_map.pl $dir/uniq2utt \
    | awk '{for(n=1;n<=NF;n++) print $n;}' \
    | sort > $valid_list
  rm $dir/uniq2utt $valid_list.tmp
fi

# The training subset has the same target size, and is drawn from the
# utterances that are not in the validation list.
awk '{print $1}' $data/utt2spk \
  | utils/filter_scp.pl --exclude $valid_list \
  | utils/shuffle_list.pl \
  | head -n $num_utts_subset > $dir/train_subset_uttlist || exit 1;

# Stage 1: merge the per-job gzipped alignment archives from $alidir into one
# archive in $dir, with an scp index (ali.scp) that later stages filter by
# utterance id.
if [ $stage -le 1 ]; then
  nj_ali=$(cat $alidir/num_jobs)
  # Space-separated list of all ali.N.gz files; gunzip -c streams them back to
  # back into a single archive.
  alis=$(for n in $(seq $nj_ali); do echo -n "$alidir/ali.$n.gz "; done)
  $cmd $dir/log/copy_alignments.log \
    copy-int-vector "ark:gunzip -c $alis|" \
    ark,scp:$dir/ali.ark,$dir/ali.scp || exit 1;
fi

# Rspecifier producing pdf-level alignments for the prior-computation
# utterances only.  NOTE: the string is consumed as-is (and extended) when the
# priors egs are dumped, so it deliberately ends with a pipe.
prior_ali_rspecifier="ark,s,cs:utils/filter_scp.pl $dir/priors_uttlist $dir/ali.scp | ali-to-pdf $alidir/final.mdl scp:- ark:- |"

# Keep copies of the tree, the silence phone list and the source model in the
# output directory.
silphonelist=`cat $lang/phones/silence.csl` || exit 1;
cp $alidir/tree $dir
cp $lang/phones/silence.csl $dir/info/
cp $src_model $dir/final.mdl || exit 1

# Get list of utterances for prior computation (validation utterances are
# excluded).
awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlist | \
  utils/shuffle_list.pl | head -$num_priors_subset \
  > $dir/priors_uttlist || exit 1;

# Feature rspecifiers, all with CMVN applied:
#   feats              - per-job training features; the literal "JOB" is a
#                        placeholder, replaced later (e.g. by sed for job 1, or
#                        presumably by $cmd for JOB=1:$nj runs) and validation
#                        utterances are excluded;
#   valid_feats        - validation utterances only;
#   train_subset_feats - training-subset utterances only;
#   priors_feats       - prior-computation utterances only.
feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- |"
valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |"
train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |"
priors_feats="ark,s,cs:utils/filter_scp.pl $dir/priors_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |"
# Record the CMVN options that were used for these egs.
echo $cmvn_opts > $dir/cmvn_opts

# Options handed to the egs-dumping binaries for online iVectors; stays empty
# unless an online iVector directory has been configured.
ivector_opts=""
if [ ! -z $online_ivector_dir ]; then
  ivector_period=$(cat $online_ivector_dir/ivector_period)
  ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1;
  # Record the iVector dimension and the extractor id next to the egs.
  echo $ivector_dim >$dir/info/ivector_dim
  steps/nnet2/get_ivector_id.sh $online_ivector_dir > $dir/info/final.ie.id || exit 1
  ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period"
fi

# Stage 2: count the frames of training data and determine the feature
# dimension; both values are recorded under $dir/info.
if [ $stage -le 2 ]; then
  echo "$0: working out number of frames of training data"
  num_frames=$(steps/nnet2/get_num_frames.sh $data)
  echo $num_frames > $dir/info/num_frames
  echo "$0: working out feature dim"
  # Instantiate the per-job feature rspecifier for job 1.
  feats_one="$(echo $feats | sed s:JOB:1:g)"
  if feat_dim=$(feat-to-dim "$feats_one" - 2>/dev/null); then
    echo $feat_dim > $dir/info/feat_dim
  else # run without stderr redirection to show the error.
    feat-to-dim "$feats_one" -; exit 1
  fi
else
  # Resuming from a later stage: the archive arithmetic below still needs
  # num_frames, so reload the value recorded by the earlier run instead of
  # leaving the variable empty.
  num_frames=$(cat $dir/info/num_frames) || exit 1;
fi

# Total number of archives.  One is added on the assumption that num_frames is
# not an exact multiple of frames_per_iter, i.e. we round up.
num_archives=$(($num_frames / $frames_per_iter + 1))

# A single process cannot keep more than `ulimit -n` files open.  If
# $num_archives exceeds that (with a margin of 4), first write a smaller number
# ($num_archives_intermediate) of larger archives; each of them is later split
# into $archives_multiple final archives.
max_open_filehandles=$(ulimit -n) || exit 1
archives_multiple=1
num_archives_intermediate=$num_archives
while [ $(($num_archives_intermediate + 4)) -gt $max_open_filehandles ]; do
  archives_multiple=$(($archives_multiple + 1))
  num_archives_intermediate=$(($num_archives / $archives_multiple)) || exit 1;
done
# Force num_archives to be an exact multiple of archives_multiple.
num_archives=$(($archives_multiple * $num_archives_intermediate)) || exit 1;

echo $num_archives >$dir/info/num_archives
echo $frames_per_eg >$dir/info/frames_per_eg

# frames_per_eg may be a comma-separated list; its first field is the
# 'principal' value and is taken as the average number of frames per eg when
# sizing the archives.
frames_per_eg_principal=$(echo $frames_per_eg | cut -d, -f1)

# Number of egs that end up in each archive.
egs_per_archive=$(($num_frames / ($frames_per_eg_principal * $num_archives))) || exit 1;
if ! [ $egs_per_archive -le $frames_per_iter ]; then
  echo "$0: script error: egs_per_archive=$egs_per_archive not <= frames_per_iter=$frames_per_iter"
  exit 1;
fi

echo $egs_per_archive > $dir/info/egs_per_archive

echo "$0: creating $num_archives archives, each with $egs_per_archive egs, with"
echo "$0:   $frames_per_eg labels per example, and (left,right) context = ($left_context,$right_context)"
# The edge-specific contexts are only reported when at least one is enabled.
if [ $left_context_initial -ge 0 ] || [ $right_context_final -ge 0 ]; then
  echo "$0:   ... and (left-context-initial,right-context-final) = ($left_context_initial,$right_context_final)"
fi


if [ -e $dir/storage ]; then
  # A storage directory exists, so the archives are distributed over several
  # locations: create the soft links up front (see utils/create_split_dir.pl).
  echo "$0: creating data links"
  # Links for the final archives ...
  utils/create_data_link.pl $(for x in $(seq $num_archives); do
                               echo $dir/degs.$x.ark
                             done)
  # ... and for the temporary per-job archives (job index first, then the
  # intermediate archive index).
  for x in $(seq $num_archives_intermediate); do
    utils/create_data_link.pl $(for y in $(seq $nj); do
                                 echo $dir/degs_orig.$y.$x.ark
                               done)
  done
fi

# Stage 3: extract, from each job's gzipped lattice archive, only the lattices
# of the validation and training-subset utterances.  The result is written
# uncompact with an scp index per job, then the per-job indexes are merged into
# lat_special.scp.
if [ $stage -le 3 ]; then
  echo "$0: copying training lattices"

  # --ignore-missing: a given job's archive holds only part of the utterances
  # named in the include list.
  $cmd --max-jobs-run 6 JOB=1:$nj $dir/log/lattice_copy.JOB.log \
    lattice-copy --write-compact=false --include="cat $dir/valid_uttlist $dir/train_subset_uttlist |" --ignore-missing \
    "ark:gunzip -c $denlatdir/lat.JOB.gz|" ark,scp:$dir/lat_special.JOB.ark,$dir/lat_special.JOB.scp || exit 1;

  for id in $(seq $nj); do cat $dir/lat_special.$id.scp; done > $dir/lat_special.scp
fi


# With frame subsampling the egs are later shifted slightly to the left or
# right during training, so that (e.g.) all shifts of the data modulo 3 are
# seen.  The left/right context is therefore widened by half the subsampling
# factor so that the full context the model requires is always available.
left_context=$((left_context + frame_subsampling_factor / 2))
right_context=$((right_context + frame_subsampling_factor / 2))

egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --compress=$compress --frame-subsampling-factor=$frame_subsampling_factor --acoustic-scale=$acwt"

# The priors egs do not use the overlap option, but they do use the same
# num-frames per eg: that is much more efficient for recurrent models with a
# lot of context, and since no SGD is done on them short chunks bring no
# benefit.
priors_egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --compress=$compress"

# The utterance-edge contexts are widened and passed on only when enabled
# (i.e. non-negative).
if [ $left_context_initial -ge 0 ]; then
  left_context_initial=$((left_context_initial + frame_subsampling_factor / 2))
  egs_opts="$egs_opts --left-context-initial=$left_context_initial"
  priors_egs_opts="$priors_egs_opts --left-context-initial=$left_context_initial"
fi
if [ $right_context_final -ge 0 ]; then
  right_context_final=$((right_context_final + frame_subsampling_factor / 2))
  egs_opts="$egs_opts --right-context-final=$right_context_final"
  priors_egs_opts="$priors_egs_opts --right-context-final=$right_context_final"
fi

# Record the effective (widened) context values and the subsampling factor.
echo $left_context > $dir/info/left_context
echo $right_context > $dir/info/right_context
echo $left_context_initial > $dir/info/left_context_initial
echo $right_context_final > $dir/info/right_context_final

echo $frame_subsampling_factor > $dir/info/frame_subsampling_factor


# Prior adjustment is switched off whenever frame subsampling is in use.
if [ "$frame_subsampling_factor" != 1 ] && $adjust_priors; then
  echo "$0: setting --adjust-priors false since adjusting priors is not supported (and does not make sense) for chain models"
  adjust_priors=false
fi

# Dump the egs used for prior adjustment.  The whole group runs as a
# background subshell (note the trailing '&'), so an 'exit 1' in here only
# terminates the subshell; failure of the dumping command itself is signalled
# to the main script by touching $dir/.error.
(
  if $adjust_priors && [ $stage -le 10 ]; then
    # The merged alignment archive may be missing (e.g. when stage 1 was
    # skipped); recreate it, since prior_ali_rspecifier reads $dir/ali.scp.
    if [ ! -f $dir/ali.scp ]; then
      nj_ali=$(cat $alidir/num_jobs)
      alis=$(for n in $(seq $nj_ali); do echo -n "$alidir/ali.$n.gz "; done)
      $cmd $dir/log/copy_alignments.log \
        copy-int-vector "ark:gunzip -c $alis|" \
        ark,scp:$dir/ali.ark,$dir/ali.scp || exit 1;
    fi

    # Build the list of output archives priors_egs.1.ark .. priors_egs.N.ark,
    # creating a data link for each one.
    priors_egs_list=
    for y in `seq $num_archives_priors`; do
      utils/create_data_link.pl $dir/priors_egs.$y.ark
      priors_egs_list="$priors_egs_list ark:$dir/priors_egs.$y.ark"
    done

    echo "$0: dumping egs for prior adjustment in the background."

    # Number of pdfs, taken from the last field of the 'pdfs' line of am-info.
    num_pdfs=`am-info $alidir/final.mdl | grep pdfs | awk '{print $NF}' 2>/dev/null` || exit 1

    # The escaped pipe (\|) is passed through to $cmd, so that the pipeline
    # runs inside the submitted job rather than in this shell.
    $cmd $dir/log/create_priors_subset.log \
      nnet3-get-egs --num-pdfs=$num_pdfs $ivector_opts $priors_egs_opts "$priors_feats" \
      "$prior_ali_rspecifier ali-to-post ark:- ark:- |" \
      ark:- \| nnet3-copy-egs ark:- $priors_egs_list || \
      { touch $dir/.error; echo "Error in creating priors subset. See $dir/log/create_priors_subset.log"; exit 1; }

    sleep 3;

    echo $num_archives_priors >$dir/info/num_archives_priors
  else
    # No priors egs were dumped; record that explicitly.
    echo 0 > $dir/info/num_archives_priors
  fi
) &

# Stage 4: dump the diagnostic examples (validation and training subset).
# The two dumping jobs run in parallel in the background; a failure of either
# one is recorded by touching $dir/.error, which is checked after 'wait'.
if [ $stage -le 4 ]; then
  echo "$0: Getting validation and training subset examples."
  rm $dir/.error 2>/dev/null
  echo "$0: ... extracting validation and training-subset alignments."

  #utils/filter_scp.pl <(cat $dir/valid_uttlist $dir/train_subset_uttlist) \
  #  <$dir/lat.scp >$dir/lat_special.scp

  # Restrict the alignment index to the validation + training-subset
  # utterances.
  utils/filter_scp.pl <(cat $dir/valid_uttlist $dir/train_subset_uttlist) \
    <$dir/ali.scp >$dir/ali_special.scp

  $cmd $dir/log/create_valid_subset.log \
    nnet3-discriminative-get-egs $ivector_opts $egs_opts \
    $dir/final.mdl "$valid_feats" scp:$dir/lat_special.scp \
    scp:$dir/ali_special.scp "ark:$dir/valid_diagnostic.degs" || touch $dir/.error &

  $cmd $dir/log/create_train_subset.log \
    nnet3-discriminative-get-egs $ivector_opts $egs_opts \
    $dir/final.mdl "$train_subset_feats" scp:$dir/lat_special.scp \
    scp:$dir/ali_special.scp  "ark:$dir/train_diagnostic.degs" || touch $dir/.error &
  # NOTE: a bare 'wait' waits for every background child of this shell, which
  # includes the priors subshell started earlier.
  wait;
  [ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1
  echo "... Getting subsets of validation examples for diagnostics and combination."

  # Both diagnostic archives must exist and be non-empty.
  for f in $dir/{train_diagnostic,valid_diagnostic}.degs; do
    [ ! -s $f ] && echo "No examples in file $f" && exit 1;
  done
fi

# Stage 5: dump the training examples, one job per data split.
if [ $stage -le 5 ]; then
  # create degs_orig.*.*.ark; the first index goes to $nj,
  # the second to $num_archives_intermediate.

  # Output wspecifiers for one job: one archive per intermediate archive
  # index.  "JOB" is left as a literal placeholder here.
  degs_list=
  for n in $(seq $num_archives_intermediate); do
    degs_list="$degs_list ark:$dir/degs_orig.JOB.$n.ark"
  done
  echo "$0: Generating training examples on disk"

  # The examples will go round-robin to degs_list.
  # To make it efficient we need to use a large 'nj', like 40, and in that case
  # there can be too many small files to deal with, because the total number of
  # files is the product of 'nj' by 'num_archives_intermediate', which might be
  # quite large.
  # The alignments are restricted to the utterances of the job's own data
  # split by filtering ali.scp with that split's utt2spk.
  $cmd --max-jobs-run $max_jobs_run JOB=1:$nj $dir/log/get_egs.JOB.log \
    nnet3-discriminative-get-egs $ivector_opts $egs_opts \
      --num-frames-overlap=$frames_overlap_per_eg \
      $dir/final.mdl "$feats" "ark,s,cs:gunzip -c $denlatdir/lat.JOB.gz |" \
      "scp:utils/filter_scp.pl $sdata/JOB/utt2spk $dir/ali.scp |" ark:- \| \
    nnet3-discriminative-copy-egs --random=true --srand=JOB ark:- $degs_list || exit 1;
fi

# Stage 6: for every intermediate archive index, concatenate the pieces
# written by all $nj jobs, shuffle them, and write the final archives.
if [ $stage -le 6 ]; then
  echo "$0: recombining and shuffling order of archives on disk"
  # combine all the "degs_orig.*.JOB.ark" (over the $nj splits of the data) and
  # shuffle the order, writing to the degs.JOB.ark

  # the input is a concatenation over the input jobs.
  # (Here "JOB" stands for the intermediate archive index, not the data split.)
  degs_list=
  for n in $(seq $nj); do
    degs_list="$degs_list $dir/degs_orig.$n.JOB.ark"
  done

  if [ $archives_multiple == 1 ]; then # normal case.
    $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \
      nnet3-discriminative-shuffle-egs --srand=JOB "ark:cat $degs_list|" ark:$dir/degs.JOB.ark  || exit 1;
  else
    # we need to shuffle the 'intermediate archives' and then split into the
    # final archives.  we create soft links to manage this splitting, because
    # otherwise managing the output names is quite difficult (and we don't want
    # to submit separate queue jobs for each intermediate archive, because then
    # the --max-jobs-run option is hard to enforce).
    output_archives=$(for y in $(seq $archives_multiple); do echo -n "ark:$dir/degs.JOB.$y.ark "; done)
    for x in $(seq $num_archives_intermediate); do
      for y in $(seq $archives_multiple); do
        # Final archive index for part $y of intermediate archive $x.
        archive_index=$[($x-1)*$archives_multiple+$y]
        # degs.intermediate_archive.{1,2,...}.ark will point to degs.archive.ark
        ln -sf degs.$archive_index.ark $dir/degs.$x.$y.ark || exit 1
      done
    done
    # Writing through the degs.JOB.$y.ark links fills the final
    # degs.<archive_index>.ark files.
    $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \
      nnet3-discriminative-shuffle-egs --srand=JOB "ark:cat $degs_list|" ark:- \| \
      nnet3-discriminative-copy-egs ark:- $output_archives || exit 1;
  fi
fi

# Stage 7: delete the temporary files created by the earlier stages.
if [ $stage -le 7 ]; then
  echo "$0: removing temporary archives"
  for x in $(seq $nj); do
    for y in $(seq $num_archives_intermediate); do
      file=$dir/degs_orig.$x.$y.ark
      # If the archive is a soft link into a storage directory, remove the
      # link target first and then the link itself.
      [ -L $file ] && rm $(utils/make_absolute.sh $file)
      rm $file
    done
  done
  if [ $archives_multiple -gt 1 ]; then
    # there are some extra soft links that we should delete.
    for f in $dir/degs.*.*.ark; do rm $f; done
  fi
  echo "$0: removing temporary lattices"
  # The temporary lattices are written as lat_special.* (per-job ark/scp plus
  # the merged scp); the lat.* pattern alone never matched them.  -f keeps the
  # cleanup quiet when a pattern matches nothing.
  rm -f $dir/lat.* $dir/lat_special.*
  # The background priors job reads $dir/ali.scp, so make sure it has finished
  # before the alignments are deleted.
  wait
  echo "$0: removing temporary alignments"
  rm $dir/ali.{ark,scp} 2>/dev/null
fi

# Wait for any background job that is still running (needed when the cleanup
# stage above was skipped).
wait

echo "$0: Finished preparing training examples"


================================================
FILE: egs/steps/nnet3/get_egs_targets.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey).
#           2015-2016 Vimal Manohar
# Apache 2.0.

# This script is similar to steps/nnet3/get_egs.sh but used
# when getting general targets (not from alignment directory) for raw nnet
#
# This script, which will generally be called from other neural-net training
# scripts, extracts the training examples used to train the neural net (and also
# the validation examples used for diagnostics), and puts them in separate archives.
#
# This script dumps egs with several frames of labels, controlled by the
# frames_per_eg config variable (default: 8).  This takes many times less disk
# space because typically we have 4 to 7 frames of context on the left and
# right, and this ends up getting shared.  This is at the expense of slightly
# higher disk I/O while training.

# A pipeline fails if any of its commands fails, not only the last one.
set -o pipefail
# Ignore SIGPIPE in this shell.
trap "" PIPE

# Begin configuration section.
# Every variable below can be overridden from the command line as
# --<name-with-dashes> <value>; see the usage message and parse_options.sh.
cmd=run.pl
target_type=sparse  # dense to have dense targets,
                    # sparse to have posteriors targets
num_targets=        # required for target-type=sparse with raw nnet
frame_subsampling_factor=1
length_tolerance=2
frames_per_eg=8   # number of frames of labels per example.  more->less disk space and
                  # less time preparing egs, but more I/O during training.
                  # Note: may in general be a comma-separated string of alternative
                  # durations (more useful when using large chunks, e.g. for BLSTMs);
                  # the first one (the principal num-frames) is preferred.
left_context=4    # amount of left-context per eg (i.e. extra frames of input features
                  # not present in the output supervision).
right_context=4   # amount of right-context per eg.
left_context_initial=-1    # if >=0, left-context for first chunk of an utterance
right_context_final=-1     # if >=0, right-context for last chunk of an utterance
compress=true   # set this to false to disable compression (e.g. if you want to see whether
                # results are affected).
num_utts_subset=300     # number of utterances in validation and training
                        # subsets used for shrinkage and diagnostics.
num_utts_subset_valid=  # number of utterances in validation
                        # subsets used for shrinkage and diagnostics
                        # if provided, overrides num-utts-subset
num_utts_subset_train=  # number of utterances in training
                        # subsets used for shrinkage and diagnostics.
                        # if provided, overrides num-utts-subset
num_valid_frames_combine=0 # number of validation frames used for the combination
                           # weights at the very end.
num_train_frames_combine=60000 # number of training frames used for the same purpose.
num_frames_diagnostic=10000 # number of frames for "compute_prob" jobs
samples_per_iter=400000 # this is the target number of egs in each archive of egs
                        # (prior to merging egs).  We probably should have called
                        # it egs_per_iter. This is just a guideline; it will pick
                        # a number that divides the number of samples in the
                        # entire data.

stage=0
nj=6         # This should be set to the maximum number of jobs you are
             # comfortable to run in parallel; you can increase it if your disk
             # speed is greater and you have more machines.
srand=0      # offset added to the per-job random seeds used when distributing
             # and shuffling the egs.
online_ivector_dir=  # can be used if we are including speaker information as iVectors.
cmvn_opts=  # can be used for specifying CMVN options, if feature type is not lda (if lda,
            # it doesn't make sense to use different options than were used as input to the
            # LDA transform).  This is used to turn off CMVN in the online-nnet experiments.
generate_egs_scp=false # If true, it will generate egs.JOB.*.scp per egs archive

echo "$0 $@"  # Print the command line for logging

# Set up the environment (if a path.sh exists) and parse the --option
# arguments into the variables above.
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# Exactly three positional arguments are required; otherwise print the usage
# message and stop.  The defaults quoted after ';' or '|' mirror the values set
# in the configuration section of this script.
if [ $# != 3 ]; then
  echo "Usage: $0 [opts] <data> <targets-scp> <egs-dir>"
  echo " e.g.: $0 data/train data/train/snr_targets.scp exp/tri4_nnet/egs"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --nj <nj>                                        # The maximum number of jobs you want to run in"
  echo "                                                   # parallel (increase this only if you have good disk and"
  echo "                                                   # network speed).  default=6"
  echo "  --cmd (utils/run.pl;utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --samples-per-iter <#samples;400000>             # Target number of egs per archive (option is badly named)"
  echo "  --frames-per-eg <frames;8>                       # number of frames per eg on disk"
  echo "                                                   # May be either a single number or a comma-separated list"
  echo "                                                   # of alternatives (useful when training LSTMs, where the"
  echo "                                                   # frames-per-eg is the chunk size, to get variety of chunk"
  echo "                                                   # sizes).  The first in the list is preferred and is used"
  echo "                                                   # when working out the number of archives etc."
  echo "  --left-context <int;4>                           # Number of frames on left side to append for feature input"
  echo "  --right-context <int;4>                          # Number of frames on right side to append for feature input"
  echo "  --left-context-initial <int;-1>                  # If >= 0, left-context for first chunk of an utterance"
  echo "  --right-context-final <int;-1>                   # If >= 0, right-context for last chunk of an utterance"
  echo "  --num-frames-diagnostic <#frames;10000>          # Number of frames used in computing (train,valid) diagnostics"
  echo "  --num-valid-frames-combine <#frames;0>           # Number of frames used in getting combination weights at the"
  echo "                                                   # very end."
  echo "  --stage <stage|0>                                # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."

  exit 1;
fi

# Positional arguments: data directory, scp file of targets, output egs dir.
data=$1
targets_scp=$2
dir=$3

# Check some files.
# extra_files is reset explicitly so that a value inherited from the calling
# environment cannot sneak into the list of required files.
extra_files=
[ ! -z "$online_ivector_dir" ] && \
  extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"

for f in $data/feats.scp $targets_scp $extra_files; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

sdata=$data/split$nj
utils/split_data.sh $data $nj

# Everything below writes into these directories, so stop right away if they
# cannot be created.
mkdir -p $dir/log $dir/info || exit 1;

# The per-subset sizes fall back to the common --num-utts-subset value.
[ -z "$num_utts_subset_valid" ] && num_utts_subset_valid=$num_utts_subset
[ -z "$num_utts_subset_train" ] && num_utts_subset_train=$num_utts_subset

# Require more than four times as many utterances as the validation subset.
num_utts=$(cat $data/utt2spk | wc -l)
if ! [ $num_utts -gt $[$num_utts_subset_valid*4] ]; then
  echo "$0: number of utterances $num_utts in your training data is too small versus --num-utts-subset=$num_utts_subset"
  echo "... you probably have so little data that it doesn't make sense to train a neural net."
  exit 1
fi

# Get list of validation utterances: the first $num_utts_subset_valid entries
# of a shuffled list of utterance ids, stored sorted.
awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl 2>/dev/null | head -$num_utts_subset_valid | sort \
    > $dir/valid_uttlist

if [ -f $data/utt2uniq ]; then  # this matters if you use data augmentation.
  echo "File $data/utt2uniq exists, so augmenting valid_uttlist to"
  echo "include all perturbed versions of the same 'real' utterances."
  mv $dir/valid_uttlist $dir/valid_uttlist.tmp
  # Invert utt2uniq so that each 'unique' id maps to all of its utterances.
  utils/utt2spk_to_spk2utt.pl $data/utt2uniq > $dir/uniq2utt
  # utterance -> unique id -> every utterance sharing that unique id,
  # written one utterance id per line.
  cat $dir/valid_uttlist.tmp | utils/apply_map.pl $data/utt2uniq | \
    sort | uniq | utils/apply_map.pl $dir/uniq2utt | \
    awk '{for(n=1;n<=NF;n++) print $n;}' | sort  > $dir/valid_uttlist
  rm $dir/uniq2utt $dir/valid_uttlist.tmp
fi

# Training-subset utterances: drawn from everything that is NOT in the
# validation list.
awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlist | \
   utils/shuffle_list.pl 2>/dev/null | head -$num_utts_subset_train | sort > $dir/train_subset_uttlist

## Set up features.
echo "$0: feature type is raw"

# Feature rspecifiers, all with CMVN applied:
#   feats              - per-job training features ("JOB" is a literal
#                        placeholder that is substituted later), with the
#                        validation utterances excluded;
#   valid_feats        - validation utterances only;
#   train_subset_feats - training-subset utterances only.
feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- |"
valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |"
train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |"
echo $cmvn_opts >$dir/cmvn_opts # caution: the top-level nnet training script should copy this to its own dir now.

# Online iVectors are optional.  When a directory is given, record the
# extractor id and the iVector dimension and build the options for the
# egs-dumping binaries; otherwise record a dimension of 0.
if [ ! -z "$online_ivector_dir" ]; then
  steps/nnet2/get_ivector_id.sh $online_ivector_dir > $dir/info/final.ie.id || exit 1
  ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1
  echo $ivector_dim > $dir/info/ivector_dim
  ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1;

  ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period"
else
  ivector_opts=""
  echo 0 >$dir/info/ivector_dim
fi

# Stage 1: count the frames of training data and determine the feature
# dimension, recording both under $dir/info.  When resuming from a later stage
# the recorded values are read back instead.
if [ $stage -le 1 ]; then
  echo "$0: working out number of frames of training data"
  num_frames=$(steps/nnet2/get_num_frames.sh $data)
  echo $num_frames > $dir/info/num_frames
  echo "$0: working out feature dim"
  # Instantiate the per-job feature rspecifier for job 1.
  feats_one="$(echo $feats | sed s:JOB:1:g)"
  if feat_dim=$(feat-to-dim "$feats_one" - 2>/dev/null); then
    echo $feat_dim > $dir/info/feat_dim
  else # run without stderr redirection to show the error.
    feat-to-dim "$feats_one" -; exit 1
  fi
else
  num_frames=$(cat $dir/info/num_frames) || exit 1;
  feat_dim=$(cat $dir/info/feat_dim) || exit 1;
fi


# frames_per_eg may be a comma-separated list; its first field is the
# 'principal' value and is taken as the average number of frames per eg when
# sizing the archives.
frames_per_eg_principal=$(echo $frames_per_eg | cut -d, -f1)

# Number of archives; the + 1 rounds up, on the assumption that the division
# is not exact.
num_archives=$(($num_frames / ($frames_per_eg_principal * $samples_per_iter) + 1))
if [ $num_archives -eq 1 ]; then
  echo "*** $0: warning: the --frames-per-eg is too large to generate one archive with"
  echo "*** as many as --samples-per-iter egs in it.  Consider reducing --frames-per-eg."
  sleep 4
fi

# A single process cannot keep more than `ulimit -n` files open.  If
# $num_archives exceeds that (with a margin of 4), first write a smaller number
# ($num_archives_intermediate) of larger archives; each of them is later split
# into $archives_multiple final archives.  The limit is capped at 512 because
# the reported value can be misleading (GridEngine sometimes changes it).
max_open_filehandles=$(ulimit -n) || exit 1
if [ $max_open_filehandles -gt 512 ]; then
  max_open_filehandles=512
fi
archives_multiple=1
num_archives_intermediate=$num_archives
while [ $(($num_archives_intermediate + 4)) -gt $max_open_filehandles ]; do
  archives_multiple=$(($archives_multiple + 1))
  num_archives_intermediate=$(($num_archives / $archives_multiple + 1));
done
# Force num_archives to be an exact multiple of archives_multiple.
num_archives=$(($archives_multiple * $num_archives_intermediate))

echo $num_archives >$dir/info/num_archives
echo $frames_per_eg >$dir/info/frames_per_eg

# Number of egs that end up in each archive.
egs_per_archive=$(($num_frames / ($frames_per_eg_principal * $num_archives)))
if ! [ $egs_per_archive -le $samples_per_iter ]; then
  echo "$0: script error: egs_per_archive=$egs_per_archive not <= samples_per_iter=$samples_per_iter"
  exit 1;
fi

echo $egs_per_archive > $dir/info/egs_per_archive

echo "$0: creating $num_archives archives, each with $egs_per_archive egs, with"
echo "$0:   $frames_per_eg labels per example, and (left,right) context = ($left_context,$right_context)"
# The edge-specific contexts are only reported when at least one is enabled.
if [ $left_context_initial -ge 0 ] || [ $right_context_final -ge 0 ]; then
  echo "$0:   ... and (left-context-initial,right-context-final) = ($left_context_initial,$right_context_final)"
fi


if [ -e $dir/storage ]; then
  # A storage directory exists, so the archives are distributed over several
  # locations: create the soft links up front (see utils/create_split_dir.pl).
  echo "$0: creating data links"
  # Links for the final archives ...
  utils/create_data_link.pl $(for x in $(seq $num_archives); do
                               echo $dir/egs.$x.ark
                             done)
  # ... and for the temporary per-job archives (job index first, then the
  # intermediate archive index).
  for x in $(seq $num_archives_intermediate); do
    utils/create_data_link.pl $(for y in $(seq $nj); do
                                 echo $dir/egs_orig.$y.$x.ark
                               done)
  done
fi

# Options shared by every egs-dumping command below.  The utterance-edge
# contexts are only passed on when enabled (i.e. non-negative).
egs_opts="--left-context=$left_context --right-context=$right_context --compress=$compress --num-frames=$frames_per_eg"
if [ $left_context_initial -ge 0 ]; then
  egs_opts="$egs_opts --left-context-initial=$left_context_initial"
fi
if [ $right_context_final -ge 0 ]; then
  egs_opts="$egs_opts --right-context-final=$right_context_final"
fi

# Record the context values next to the egs.
echo $left_context > $dir/info/left_context
echo $right_context > $dir/info/right_context
echo $left_context_initial > $dir/info/left_context_initial
echo $right_context_final > $dir/info/right_context_final

# Split the targets scp the same way as the data: one file per job, holding
# only the utterances of that job's data split.
for n in $(seq $nj); do
  utils/filter_scp.pl $sdata/$n/utt2spk $targets_scp > $dir/targets.$n.scp
done

# Per-job targets file; "JOB" is a literal placeholder.
targets_scp_split=$dir/targets.JOB.scp

# For dense targets the number of targets is read off the targets themselves;
# for any other type it must have been supplied via --num-targets.
if [ $target_type == "dense" ]; then
  num_targets=$(feat-to-dim "scp:$targets_scp" - 2>/dev/null) || exit 1
fi

if [ -z "$num_targets" ]; then
  echo "$0: num-targets is not set"
  exit 1
fi

# Choose the egs-dumping program and the target rspecifiers according to the
# target type:
#   dense  - targets are read directly from the (filtered) scp as matrices;
#   sparse - targets are piped through ali-to-post first.
# In both cases the training targets exclude the validation utterances.
case $target_type in
  "dense")
    get_egs_program="nnet3-get-egs-dense-targets --num-targets=$num_targets"
    targets="scp,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $targets_scp_split |"
    valid_targets="scp,s,cs:utils/filter_scp.pl $dir/valid_uttlist $targets_scp |"
    train_subset_targets="scp,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $targets_scp |"
    ;;
  "sparse")
    get_egs_program="nnet3-get-egs --num-pdfs=$num_targets"
    targets="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $targets_scp_split | ali-to-post scp:- ark:- |"
    valid_targets="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $targets_scp | ali-to-post scp:- ark:- |"
    train_subset_targets="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $targets_scp | ali-to-post scp:- ark:- |"
    ;;
  *)
    # Catch-all branch.  (A pattern spelled 'default' would only match the
    # literal string "default", so unknown types would slip through.)
    echo "$0: Unknown --target-type $target_type. Choices are dense and sparse"
    exit 1
    ;;
esac

# Stage 3: dump all egs of the validation and training-subset utterances, then
# select from them the subsets used for diagnostics and for the final
# combination.  Background jobs signal failure by touching $dir/.error.
if [ $stage -le 3 ]; then
  echo "$0: Getting validation and training subset examples."
  rm -f $dir/.error 2>/dev/null
  $cmd $dir/log/create_valid_subset.log \
    $get_egs_program --frame-subsampling-factor=$frame_subsampling_factor \
    --length-tolerance=$length_tolerance \
    $ivector_opts $egs_opts "$valid_feats" \
    "$valid_targets" \
    "ark:$dir/valid_all.egs" || touch $dir/.error &
  $cmd $dir/log/create_train_subset.log \
    $get_egs_program --frame-subsampling-factor=$frame_subsampling_factor \
    --length-tolerance=$length_tolerance \
    $ivector_opts $egs_opts "$train_subset_feats" \
    "$train_subset_targets" \
    "ark:$dir/train_subset_all.egs" || touch $dir/.error &
  wait;

  [ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1
  echo "... Getting subsets of validation examples for diagnostics and combination."
  # The diagnostic subsets optionally get an scp index as well.
  if $generate_egs_scp; then
    valid_diagnostic_output="ark,scp:$dir/valid_diagnostic.egs,$dir/valid_diagnostic.scp"
    train_diagnostic_output="ark,scp:$dir/train_diagnostic.egs,$dir/train_diagnostic.scp"
  else
    valid_diagnostic_output="ark:$dir/valid_diagnostic.egs"
    train_diagnostic_output="ark:$dir/train_diagnostic.egs"
  fi
  # Subset sizes are given in frames; dividing by the principal frames-per-eg
  # converts them to a number of egs.
  $cmd $dir/log/create_valid_subset_combine.log \
    nnet3-subset-egs --n=$[$num_valid_frames_combine/$frames_per_eg_principal] ark:$dir/valid_all.egs \
    ark:$dir/valid_combine.egs || touch $dir/.error &
  $cmd $dir/log/create_valid_subset_diagnostic.log \
    nnet3-subset-egs --n=$[$num_frames_diagnostic/$frames_per_eg_principal] ark:$dir/valid_all.egs \
    $valid_diagnostic_output || touch $dir/.error &

  $cmd $dir/log/create_train_subset_combine.log \
    nnet3-subset-egs --n=$[$num_train_frames_combine/$frames_per_eg_principal] ark:$dir/train_subset_all.egs \
    ark:$dir/train_combine.egs || touch $dir/.error &
  $cmd $dir/log/create_train_subset_diagnostic.log \
    nnet3-subset-egs --n=$[$num_frames_diagnostic/$frames_per_eg_principal] ark:$dir/train_subset_all.egs \
    $train_diagnostic_output || touch $dir/.error &
  wait
  sleep 5  # wait for file system to sync.
  # Build combine.egs exactly once: with an scp index when requested,
  # otherwise as a plain concatenation of the two combine subsets.
  if $generate_egs_scp; then
    cat $dir/valid_combine.egs $dir/train_combine.egs  | \
    nnet3-copy-egs ark:- ark,scp:$dir/combine.egs,$dir/combine.scp || \
      { echo "$0: error creating $dir/combine.egs"; exit 1; }
    # The combine subsets are written as plain archives above, so these
    # indexes normally do not exist; -f keeps the cleanup quiet.
    rm -f $dir/{train,valid}_combine.scp
  else
    cat $dir/valid_combine.egs $dir/train_combine.egs > $dir/combine.egs
  fi
  # All three result archives must exist and be non-empty.
  for f in $dir/{combine,train_diagnostic,valid_diagnostic}.egs; do
    [ ! -s $f ] && echo "No examples in file $f" && exit 1;
  done
  rm $dir/valid_all.egs $dir/train_subset_all.egs $dir/{train,valid}_combine.egs
fi

# Stage 4: dump the training examples, one job per data split.
if [ $stage -le 4 ]; then
  # create egs_orig.*.*.ark; the first index goes to $nj,
  # the second to $num_archives_intermediate.

  # Output wspecifiers for one job: one archive per intermediate archive
  # index.  "JOB" is left as a literal placeholder here.
  egs_list=
  for n in $(seq $num_archives_intermediate); do
    egs_list="$egs_list ark:$dir/egs_orig.JOB.$n.ark"
  done
  echo "$0: Generating training examples on disk"
  # The examples will go round-robin to egs_list.
  # The '|' and '$[...]' are escaped so that this shell passes them on
  # unexpanded; presumably they are evaluated inside each submitted job after
  # $cmd has substituted JOB, giving every job its own seed (JOB + $srand).
  $cmd JOB=1:$nj $dir/log/get_egs.JOB.log \
    $get_egs_program --frame-subsampling-factor=$frame_subsampling_factor \
    --length-tolerance=$length_tolerance \
    $ivector_opts $egs_opts "$feats" "$targets" \
    ark:- \| \
    nnet3-copy-egs --random=true --srand=\$[JOB+$srand] ark:- $egs_list || exit 1;
fi

if [ $stage -le 5 ]; then
  echo "$0: recombining and shuffling order of archives on disk"
  # combine all the "egs_orig.*.JOB.ark" (over the $nj splits of the data) and
  # shuffle the order, writing to the egs.JOB.ark

  # the input is a concatenation over the input jobs.
  egs_list=
  for n in $(seq $nj); do
    egs_list="$egs_list $dir/egs_orig.$n.JOB.ark"
  done

  if [ $archives_multiple == 1 ]; then # normal case.
    if $generate_egs_scp; then
      output_archive="ark,scp:$dir/egs.JOB.ark,$dir/egs.JOB.scp"
    else
      output_archive="ark:$dir/egs.JOB.ark"
    fi
    $cmd --max-jobs-run $nj JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \
      nnet3-shuffle-egs --srand=\$[JOB+$srand] "ark:cat $egs_list|" $output_archive  || exit 1;

    if $generate_egs_scp; then
      #concatenate egs.JOB.scp in single egs.scp
      rm $dir/egs.scp 2> /dev/null || true
      for j in $(seq $num_archives_intermediate); do
        cat $dir/egs.$j.scp || exit 1;
      done > $dir/egs.scp || exit 1;
      for f in $dir/egs.*.scp; do rm $f; done
    fi
  else
    # we need to shuffle the 'intermediate archives' and then split into the
    # final archives.  we create soft links to manage this splitting, because
    # otherwise managing the output names is quite difficult (and we don't want
    # to submit separate queue jobs for each intermediate archive, because then
    # the --max-jobs-run option is hard to enforce).
    if $generate_egs_scp; then
      output_archives="$(for y in $(seq $archives_multiple); do echo ark,scp:$dir/egs.JOB.$y.ark,$dir/egs.JOB.$y.scp; done)"
    else
      output_archives="$(for y in $(seq $archives_multiple); do echo ark:$dir/egs.JOB.$y.ark; done)"
    fi
    for x in $(seq $num_archives_intermediate); do
      for y in $(seq $archives_multiple); do
        archive_index=$[($x-1)*$archives_multiple+$y]
        # egs.intermediate_archive.{1,2,...}.ark will point to egs.archive.ark
        ln -sf egs.$archive_index.ark $dir/egs.$x.$y.ark || exit 1
      done
    done
    $cmd --max-jobs-run $nj JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \
      nnet3-shuffle-egs --srand=\$[JOB+$srand] "ark:cat $egs_list|" ark:- \| \
      nnet3-copy-egs ark:- $output_archives || exit 1;

    if $generate_egs_scp; then
      #concatenate egs.JOB.$y.scp in single egs.scp
      rm $dir/egs.scp 2> /dev/null || true
      for j in $(seq $num_archives_intermediate); do
        # The second index of egs.$j.$y.scp runs over the $archives_multiple
        # outputs written for each intermediate archive (see output_archives
        # above), so the inner loop must use $archives_multiple and not
        # $num_archives_intermediate.
        for y in $(seq $archives_multiple); do
          cat $dir/egs.$j.$y.scp || exit 1;
        done
      done > $dir/egs.scp || exit 1;
      for f in $dir/egs.*.*.scp; do rm $f; done
    fi
  fi
fi

# Record the frame subsampling factor under $dir/info, but only when it
# differs from 1.
if [ $frame_subsampling_factor -ne 1 ]; then
  echo $frame_subsampling_factor > $dir/info/frame_subsampling_factor
fi

# Block until any background jobs started by this shell have finished.
wait

if [ $stage -le 6 ]; then
  echo "$0: removing temporary archives"
  # Delete every egs_orig.<job>.<intermediate-archive>.ark written in stage 4.
  for x in $(seq $nj); do
    for y in $(seq $num_archives_intermediate); do
      file=$dir/egs_orig.$x.$y.ark
      # If the archive is a symbolic link, first remove the path printed by
      # utils/make_absolute.sh (presumably the storage the link refers to --
      # TODO confirm what make_absolute.sh returns for a link).
      [ -L $file ] && rm $(utils/make_absolute.sh $file)
      rm $file
    done
  done
  if [ $archives_multiple -gt 1 ]; then
    # there are some extra soft links that we should delete.
    for f in $dir/egs.*.*.ark; do rm $f; done
  fi
  echo "$0: removing temporary stuff"
  # -f plus the stderr redirect keep this quiet when no targets.*.scp exist.
  rm -f $dir/targets.*.scp 2>/dev/null
fi

echo "$0: Finished preparing training examples"


================================================
FILE: egs/steps/nnet3/get_saturation.pl
================================================
#!/usr/bin/env perl

# This program parses the output of nnet3-am-info or nnet3-info,
# and prints out a number between zero and one that reflects
# how saturated the (sigmoid and tanh) nonlinearities are, on average
# over the model.
#
# This is based on the 'avg-deriv' (average-derivative) values printed
# out for the sigmoid and tanh components.  The 'saturation' of such a component
# is defined as (1.0 - its avg-deriv / the maximum possible derivative of that nonlinearity),
# where the denominator is 1.0 for tanh and 0.25 for sigmoid.
# This program averages the saturation over all the sigmoid/tanh nonlinearities
# it finds in the network (each gate of an LSTM nonlinearity counts separately).
#
# It parses the Info() output of components of type SigmoidComponent,
# TanhComponent, LstmNonlinearityComponent and *GruNonlinearityComponent.  If it
# finds no such components in the input stream it prints 0.0 and exits with
# status 0.  It prints an error message to stderr and exits with status 1 only
# if the averaged saturation falls outside the range [0, 1].

# Usage: nnet3-am-info 10.mdl | steps/nnet3/get_saturation.pl
# or: nnet3-info 10.raw | steps/nnet3/get_saturation.pl

use warnings;

# Running totals accumulated over every nonlinearity found on stdin.
my $num_nonlinearities = 0;
my $total_saturation = 0.0;

# Records one nonlinearity whose average derivative is $avg_deriv and whose
# maximum possible derivative is $max_deriv (0.25 for sigmoid, 1.0 for tanh).
sub add_saturation {
  my ($avg_deriv, $max_deriv) = @_;
  $num_nonlinearities += 1;
  $total_saturation += 1.0 - ($avg_deriv / $max_deriv);
}

# Captures the "mean=" field of the deriv-avg statistics on a line that
# describes a single component.  An example of such a line:
# component name=Lstm1_f type=SigmoidComponent, dim=1280, count=5.02e+05,
# value-avg=[percentiles(0,1,2,5 10,20,50,80,90
# 95,98,99,100)=(0.06,0.17,0.19,0.24 0.28,0.33,0.44,0.62,0.79
# 0.96,0.99,1.0,1.0), mean=0.482, stddev=0.198],
# deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90
# 95,98,99,100)=(0.0001,0.003,0.004,0.03 0.12,0.18,0.22,0.24,0.25
# 0.25,0.25,0.25,0.25), mean=0.198, stddev=0.0591]
my $deriv_mean_regex = qr/deriv-avg=[^m]+mean=([^,]+),/;

# The nonlinearities inside an LstmNonlinearityComponent, in the order in
# which they are accumulated, each paired with its maximum derivative.
my @lstm_parts = (["i_t_sigmoid", 0.25], ["f_t_sigmoid", 0.25], ["o_t_sigmoid", 0.25],
                  ["c_t_tanh", 1.0], ["m_t_tanh", 1.0]);

while (<STDIN>) {
  my $max_deriv;
  if (m/type=SigmoidComponent/) {
    $max_deriv = 0.25;
  } elsif (m/type=TanhComponent/) {
    $max_deriv = 1.0;
  } elsif (m/type=LstmNonlinearityComponent/) {
    # An example of a line like this is right at the bottom of this program, it's extremely long.
    my $num_unparsed = 0;
    foreach my $part (@lstm_parts) {
      my ($part_name, $part_max_deriv) = @$part;
      if (m/${part_name}=[{][^}]+deriv-avg=[^}]+mean=([^,]+),/) {
        add_saturation($1, $part_max_deriv);
      } else {
        $num_unparsed += 1;
      }
    }
    if ($num_unparsed > 0) {
      print STDERR "Could not parse at least one of the avg-deriv values in the following info line: $_";
    }
    next;
  } elsif (m/type=.*GruNonlinearityComponent/) {
    $max_deriv = 1.0;
  } else {
    # Not a component type this program knows about.
    next;
  }

  # Sigmoid, tanh and GRU components share the same single-mean layout.
  if ($_ =~ $deriv_mean_regex) {
    add_saturation($1, $max_deriv);
  } else {
    print STDERR "$0: could not make sense of line (no deriv-avg?): $_";
  }
}


# With nothing to average, report zero saturation and succeed.
if ($num_nonlinearities == 0) {
  print "0.0\n";
  exit(0);
}

my $saturation = $total_saturation / $num_nonlinearities;
if ($saturation < 0.0 || $saturation > 1.0) {
  print STDERR "Bad saturation value: $saturation\n";
  exit(1);
}
print "$saturation\n";


# example line with LstmNonlinearityComponent that we parse:
# component name=lstm2.lstm_nonlin type=LstmNonlinearityComponent, input-dim=2560, output-dim=1024, learning-rate=0.002, max-change=0.75, cell-dim=512, w_ic-rms=0.9941, w_fc-rms=0.8901, w_oc-rms=0.9794, count=3.53e+05, i_t_sigmoid={ self-repair-lower-threshold=0.05, self-repair-scale=1e-05, self-repaired-proportion=0.0722299, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.04,0.08,0.09,0.12 0.17,0.25,0.46,0.76,0.87 0.91,0.96,0.96,1.0), mean=0.494, stddev=0.253], deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.0007,0.03,0.04,0.06 0.09,0.12,0.19,0.23,0.24 0.25,0.25,0.25,0.25), mean=0.179, stddev=0.0595] }, f_t_sigmoid={ self-repair-lower-threshold=0.05, self-repair-scale=1e-05, self-repaired-proportion=0.0688061, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.06,0.11,0.13,0.17 0.22,0.30,0.51,0.70,0.82 0.90,0.96,0.98,1.0), mean=0.509, stddev=0.219], deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.001,0.01,0.03,0.07 0.11,0.15,0.21,0.24,0.25 0.25,0.25,0.25,0.25), mean=0.194, stddev=0.0561] }, c_t_tanh={ self-repair-lower-threshold=0.2, self-repair-scale=1e-05, self-repaired-proportion=0.178459, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(-1.0,-0.98,-0.97,-0.92 -0.82,-0.65,-0.01,0.66,0.87 0.94,0.95,0.97,0.99), mean=0.00447, stddev=0.612], deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.003,0.02,0.04,0.10 0.14,0.25,0.65,0.84,0.90 0.94,0.97,0.97,0.98), mean=0.58, stddev=0.281] }, o_t_sigmoid={ self-repair-lower-threshold=0.05, self-repair-scale=1e-05, self-repaired-proportion=0.0608838, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.02,0.07,0.09,0.12 0.17,0.25,0.52,0.77,0.86 0.90,0.94,0.96,0.99), mean=0.514, stddev=0.256], deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.007,0.04,0.04,0.07 0.09,0.12,0.19,0.23,0.24 0.25,0.25,0.25,0.25), mean=0.175, stddev=0.0579] }, m_t_tanh={ self-repair-lower-threshold=0.2, self-repair-scale=1e-05, 
self-repaired-proportion=0.134653, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(-0.99,-0.95,-0.92,-0.85 -0.73,-0.51,0.02,0.48,0.73 0.86,0.96,0.98,1.0), mean=0.00581, stddev=0.522], deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.002,0.03,0.04,0.13 0.26,0.41,0.75,0.93,0.97 0.99,1.0,1.0,1.0), mean=0.672, stddev=0.272] }


================================================
FILE: egs/steps/nnet3/get_successful_models.py
================================================
#!/usr/bin/env python

from __future__ import print_function
import re
import os
import argparse
import sys
import warnings
import copy
import glob


if __name__ == "__main__":
    # Reads the final training objective from each job's log file and prints
    # (space-separated, on stdout) the 1-based numbers of the models whose
    # objective is within --difference-threshold of the best one.

    # we add compulsory arguments as named arguments for readability
    parser = argparse.ArgumentParser(description="Create a list of models suitable for averaging "
                                                 "based on their train objf values.",
                                     epilog="See steps/nnet3/lstm/train.sh for example.")

    parser.add_argument("--difference-threshold", type=float,
                        help="The threshold for discarding models, "
                        "when objf of the model differs more than this value from the best model "
                        "it is discarded.",
                        default=1.0)

    parser.add_argument("num_models", type=int,
                        help="Number of models.")

    # argparse applies %-formatting to help strings, so a literal '%' has to be
    # written as '%%'; otherwise printing the help text raises an error.
    parser.add_argument("logfile_pattern", type=str,
                        help="Pattern for identifying the log-file names. "
                        "It specifies the entire log file name, except for the job number, "
                        "which is replaced with '%%'. e.g. exp/nneet3/tdnn_sp/log/train.4.%%.log")


    args = parser.parse_args()

    assert(args.num_models > 0)

    # Raw string: "\-" is not a valid escape sequence in an ordinary literal.
    parse_regex = re.compile(r"LOG .* Overall average objective function for 'output' is ([0-9e.\-+]+) over ([0-9e.\-+]+) frames")
    loss = []
    for model_num in range(1, args.num_models + 1):
        logfile = args.logfile_pattern.replace('%', str(model_num))
        with open(logfile, 'r') as log_file:
            lines = log_file.readlines()
        # Sentinel used when the log contains no objective line; it is low
        # enough that such a model is normally rejected below.
        this_loss = -100000
        # we search from the end as this would result in
        # lesser number of regex searches. Python regex is slow !
        for line in reversed(lines):
            mat_obj = parse_regex.search(line)
            if mat_obj is not None:
                this_loss = float(mat_obj.group(1))
                break
        loss.append(this_loss)

    best_loss = max(loss)
    accepted_models = [index + 1 for index, model_loss in enumerate(loss)
                       if (best_loss - model_loss) <= args.difference_threshold]

    model_list = " ".join([str(x) for x in accepted_models])
    print(model_list)

    if len(accepted_models) != args.num_models:
        print("WARNING: Only {0}/{1} of the models have been accepted for averaging, based on log files {2}.".format(len(accepted_models), args.num_models, args.logfile_pattern), file=sys.stderr)
        print("         Using models {0}".format(model_list), file=sys.stderr)


================================================
FILE: egs/steps/nnet3/lstm/make_configs.py
================================================
#!/usr/bin/env python

# This script is deprecated, please use ../xconfig_to_configs.py

from __future__ import print_function
import os
import argparse
import sys
import warnings
import copy
import imp

nodes = imp.load_source('nodes', 'steps/nnet3/components.py')
sys.path.insert(0, 'steps')
import libs.common as common_lib

def GetArgs():
    """Build the command-line parser, parse sys.argv and validate the result.

    The full command line is echoed to stdout before parsing.  The parsed
    namespace is passed through CheckArgs(), and whatever CheckArgs() returns
    is returned to the caller.
    """
    # we add compulsory arguments as named arguments for readability
    parser = argparse.ArgumentParser(description="Writes config files and variables "
                                                 "for LSTMs creation and training",
                                     epilog="See steps/nnet3/lstm/train.sh for example.")

    # Only one of these arguments can be specified, and one of them has to
    # be compulsorily specified
    feat_group = parser.add_mutually_exclusive_group(required = True)
    feat_group.add_argument("--feat-dim", type=int,
                            help="Raw feature dimension, e.g. 13")
    feat_group.add_argument("--feat-dir", type=str,
                            help="Feature directory, from which we derive the feat-dim")

    # only one of these arguments can be specified
    ivector_group = parser.add_mutually_exclusive_group(required = False)
    ivector_group.add_argument("--ivector-dim", type=int,
                                help="iVector dimension, e.g. 100", default=0)
    ivector_group.add_argument("--ivector-dir", type=str,
                                help="iVector dir, which will be used to derive the ivector-dim  ", default=None)

    # Exactly one way of specifying the number of targets has to be given.
    num_target_group = parser.add_mutually_exclusive_group(required = True)
    num_target_group.add_argument("--num-targets", type=int,
                                  help="number of network targets (e.g. num-pdf-ids/num-leaves)")
    num_target_group.add_argument("--ali-dir", type=str,
                                  help="alignment directory, from which we derive the num-targets")
    num_target_group.add_argument("--tree-dir", type=str,
                                  help="directory with final.mdl, from which we derive the num-targets")

    # General neural network options
    # NOTE(review): --splice-indexes is declared both required and with a
    # default; since it is required, the default looks unused -- confirm.
    parser.add_argument("--splice-indexes", type=str,
                        help="Splice indexes at input layer, e.g. '-3,-2,-1,0,1,2,3'", required = True, default="0")
    parser.add_argument("--xent-regularize", type=float,
                        help="For chain models, if nonzero, add a separate output for cross-entropy "
                        "regularization (with learning-rate-factor equal to the inverse of this)",
                        default=0.0)
    parser.add_argument("--include-log-softmax", type=str, action=common_lib.StrToBoolAction,
                        help="add the final softmax layer ", default=True, choices = ["false", "true"])
    parser.add_argument("--max-change-per-component", type=float,
                        help="Enforces per-component max change (except for the final affine layer). "
                        "if 0 it would not be enforced.", default=0.75)
    parser.add_argument("--max-change-per-component-final", type=float,
                        help="Enforces per-component max change for the final affine layer. "
                        "if 0 it would not be enforced.", default=1.5)

    # LSTM options
    parser.add_argument("--num-lstm-layers", type=int,
                        help="Number of LSTM layers to be stacked", default=1)
    parser.add_argument("--cell-dim", type=int,
                        help="dimension of lstm-cell")
    parser.add_argument("--recurrent-projection-dim", type=int,
                        help="dimension of recurrent projection")
    parser.add_argument("--non-recurrent-projection-dim", type=int,
                        help="dimension of non-recurrent projection")
    parser.add_argument("--hidden-dim", type=int,
                        help="dimension of fully-connected layers")

    # Natural gradient options
    parser.add_argument("--ng-per-element-scale-options", type=str,
                        help="options to be supplied to NaturalGradientPerElementScaleComponent", default="")
    parser.add_argument("--ng-affine-options", type=str,
                        help="options to be supplied to NaturalGradientAffineComponent", default="")

    # Gradient clipper options
    parser.add_argument("--norm-based-clipping", type=str, action=common_lib.StrToBoolAction,
                        help="Outdated option retained for back compatibility, has no effect.",
                        default=True, choices = ["false", "true"])
    parser.add_argument("--clipping-threshold", type=float,
                        help="clipping threshold used in BackpropTruncation components, "
                        "if clipping-threshold=0 no clipping is done", default=30)
    parser.add_argument("--zeroing-threshold", type=float,
                        help="zeroing threshold used in BackpropTruncation components, "
                        "if zeroing-threshold=0 no periodic zeroing is done", default=15.0)
    parser.add_argument("--zeroing-interval", type=int,
                        help="zeroing interval used in BackpropTruncation components", default=20)
    parser.add_argument("--self-repair-scale-nonlinearity", type=float,
                        help="A non-zero value activates the self-repair mechanism in the sigmoid and tanh non-linearities of the LSTM", default=0.00001)
    parser.add_argument("--self-repair-scale-clipgradient", type=float,
                        help="Outdated option retained for back compatibility, has no effect.",
                        default=1.0)

    # Delay options
    parser.add_argument("--label-delay", type=int, default=None,
                        help="option to delay the labels to make the lstm robust")

    parser.add_argument("--lstm-delay", type=str, default=None,
                        help="option to have different delays in recurrence for each lstm")

    # The only positional argument: the output directory.
    parser.add_argument("config_dir",
                        help="Directory to write config files and variables")

    # Echo the invocation so that it ends up in the log.
    print(' '.join(sys.argv))

    args = parser.parse_args()
    args = CheckArgs(args)

    return args

def CheckArgs(args):
    """Validate the parsed options and fill in the values derived from them.

    Creates args.config_dir if it does not exist, derives feat_dim,
    num_targets and ivector_dim from the corresponding directories when those
    were given, range-checks the numeric options and converts args.lstm_delay
    into a list with one list of delays per LSTM layer.

    Returns the same namespace object, updated in place.  Invalid values
    either raise Exception or terminate through sys.exit, depending on the
    option.
    """
    output_dir = args.config_dir
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Quantities that were specified indirectly, through a directory.
    if args.feat_dir is not None:
        args.feat_dim = common_lib.get_feat_dim(args.feat_dir)

    # An alignment directory takes precedence over a tree directory; both are
    # handed to the same helper.
    tree_source = args.ali_dir if args.ali_dir is not None else args.tree_dir
    if tree_source is not None:
        args.num_targets = common_lib.get_number_of_leaves_from_tree(tree_source)

    if args.ivector_dir is not None:
        args.ivector_dim = common_lib.get_ivector_dim(args.ivector_dir)

    # Range checks.
    if not args.feat_dim > 0:
        raise Exception("feat-dim has to be postive")

    if not args.num_targets > 0:
        print(args.num_targets)
        raise Exception("num_targets has to be positive")

    if not args.ivector_dim >= 0:
        raise Exception("ivector-dim has to be non-negative")

    max_changes = (args.max_change_per_component,
                   args.max_change_per_component_final)
    if not all(max_change >= 0 for max_change in max_changes):
        raise Exception("max-change-per-component and max_change-per-component-final should be non-negative")

    if args.num_lstm_layers < 1:
        sys.exit("--num-lstm-layers has to be a positive integer")
    if args.clipping_threshold < 0 or args.zeroing_threshold < 0:
        sys.exit("--clipping-threshold and --zeroing-threshold have to be non-negative")
    if not args.zeroing_interval > 0:
        raise Exception("--zeroing-interval has to be positive")

    # Without an explicit --lstm-delay every layer gets a delay of -1.
    requested_delay = args.lstm_delay
    if requested_delay is None:
        args.lstm_delay = [[-1]] * args.num_lstm_layers
        return args

    try:
        args.lstm_delay = ParseLstmDelayString(requested_delay.strip())
    except ValueError:
        sys.exit("--lstm-delay has incorrect format value. Provided value is '{0}'".format(requested_delay))
    if len(args.lstm_delay) != args.num_lstm_layers:
        sys.exit("--lstm-delay: Number of delays provided has to match --num-lstm-layers")

    return args

def PrintConfig(file_name, config_lines):
    """Write one nnet3 config file.

    file_name: path of the file to (over)write.
    config_lines: dict with the keys 'components' and 'component-nodes', each
        holding a list of config lines (strings).

    The component lines are written first, followed by a "#Component nodes"
    separator and the component-node lines.
    """
    # The context manager closes the file even if a write or a key lookup
    # raises part-way through.
    with open(file_name, 'w') as f:
        f.write("\n".join(config_lines['components'])+"\n")
        f.write("\n#Component nodes\n")
        f.write("\n".join(config_lines['component-nodes'])+"\n")

def ParseSpliceString(splice_indexes, label_delay=None):
    """Parse a --splice-indexes string into per-layer splice offsets.

    splice_indexes: whitespace-separated layer entries, each a comma-separated
        list of integer offsets, e.g. "-3,-2,-1,0,1,2,3 0 0".  Only the first
        entry may splice; every later entry has to be exactly "0".  An empty
        string is treated as "0".
    label_delay: optional integer; when given, the context starts from
        left_context = -label_delay and right_context = label_delay before
        the splice offsets are added.

    Returns a dict with the keys 'left_context', 'right_context' (both
    clamped to be >= 0), 'splice_indexes' (list of lists of ints) and
    'num_hidden_layers' (the number of entries).

    Raises ValueError if an entry is not a sorted list of integers or if a
    layer other than the first one splices.
    """
    # Split on any run of whitespace so that repeated spaces do not produce
    # empty entries, and fall back to a single "0" layer for empty input.
    layer_specs = splice_indexes.split()
    if not layer_specs:
        layer_specs = ["0"]

    left_context = 0
    right_context = 0
    if label_delay is not None:
        left_context = -label_delay
        right_context = label_delay

    splice_array = []
    try:
        for i, layer_spec in enumerate(layer_specs):
            # int() raises ValueError on empty or non-numeric elements.
            indexes = [int(x) for x in layer_spec.split(",")]
            # Kept so that the script's stdout stays the same as before.
            print(indexes)

            if i > 0 and indexes != [0]:
                raise ValueError("elements of --splice-indexes splicing is only allowed initial layer.")

            if indexes != sorted(indexes):
                raise ValueError("elements of --splice-indexes must be sorted")

            left_context += -indexes[0]
            right_context += indexes[-1]
            splice_array.append(indexes)
    except ValueError as e:
        raise ValueError("invalid --splice-indexes argument '{0}': {1}".format(splice_indexes, e))

    left_context = max(0, left_context)
    right_context = max(0, right_context)

    return {'left_context':left_context,
            'right_context':right_context,
            'splice_indexes':splice_array,
            'num_hidden_layers':len(splice_array)
            }

def ParseLstmDelayString(lstm_delay):
    """Parse a --lstm-delay string into one list of delays per LSTM layer.

    lstm_delay: whitespace-separated layer entries, each either a single
        integer or a comma-separated pair optionally wrapped in brackets,
        e.g. "-1 [-1,1] -2" -> [[-1], [-1, 1], [-2]].

    A two-element entry has to contain one negative and one positive delay;
    it is returned with the negative delay first.

    Raises ValueError if the string is empty, an element is not an integer,
    or a two-element entry does not have delays of opposite sign.
    """
    # Split on any run of whitespace so that repeated spaces do not produce
    # empty entries.
    layer_specs = lstm_delay.split()
    lstm_delay_array = []
    try:
        if not layer_specs:
            raise ValueError("no delays were given")
        for layer_spec in layer_specs:
            # int() raises ValueError on empty or non-numeric elements.
            indexes = [int(x) for x in layer_spec.lstrip('[').rstrip(']').strip().split(",")]
            if len(indexes) == 2:
                if indexes[0] * indexes[1] >= 0:
                    raise ValueError('{0} is not a standard BLSTM mode. There should be a negative delay for the forward, and a positive delay for the backward.'.format(indexes))
                if indexes[0] > 0:
                    # always a negative delay followed by a positive delay
                    indexes.reverse()
            lstm_delay_array.append(indexes)
    except ValueError as e:
        raise ValueError("invalid --lstm-delay argument '{0}': {1}".format(lstm_delay, e))

    return lstm_delay_array


def MakeConfigs(config_dir, feat_dim, ivector_dim, num_targets,
                splice_indexes, lstm_delay, cell_dim, hidden_dim,
                recurrent_projection_dim, non_recurrent_projection_dim,
                num_lstm_layers, num_hidden_layers,
                norm_based_clipping, clipping_threshold, zeroing_threshold, zeroing_interval,
                ng_per_element_scale_options, ng_affine_options,
                label_delay, include_log_softmax, xent_regularize,
                self_repair_scale_nonlinearity, self_repair_scale_clipgradient,
                max_change_per_component, max_change_per_component_final):
    """Generate init.config and one layer<N>.config per layer in config_dir.

    init.config holds the input layer plus an output layer.  layer1.config
    holds the input layer, the LDA layer and the first LSTM layer; every
    later layer<N>.config holds only the lines added for that layer.  Each
    layer config also gets a final layer and, when xent_regularize is
    non-zero, a second final layer with the name affix 'xent'.

    Layers 1..num_lstm_layers are LSTM layers: bi-directional when the
    matching lstm_delay entry has two delays, uni-directional (using the
    first delay) otherwise.  Layers num_lstm_layers+1..num_hidden_layers are
    built with nodes.AddAffRelNormLayer.

    Only splice_indexes[0] is used.  norm_based_clipping and
    self_repair_scale_clipgradient are accepted but not referenced here.
    """

    config_lines = {'components':[], 'component-nodes':[]}

    # Maps output file name -> config lines; written out at the end.
    config_files={}
    prev_layer_output = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], ivector_dim)

    # Add the init config lines for estimating the preconditioning matrices
    # (a deep copy, so that the output layer added below does not leak into
    # config_lines).
    init_config_lines = copy.deepcopy(config_lines)
    # NOTE(review): both lines are inserted at index 0, so they end up in the
    # file in the reverse of the order in which they are inserted here.
    init_config_lines['components'].insert(0, '# Config file for initializing neural network prior to')
    init_config_lines['components'].insert(0, '# preconditioning matrix computation')
    nodes.AddOutputLayer(init_config_lines, prev_layer_output)
    config_files[config_dir + '/init.config'] = init_config_lines

    prev_layer_output = nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, config_dir + '/lda.mat')

    for i in range(num_lstm_layers):
        if len(lstm_delay[i]) == 2: # add a bi-directional LSTM layer
            prev_layer_output = nodes.AddBLstmLayer(config_lines = config_lines,
                                                    name = "BLstm{0}".format(i+1),
                                                    input = prev_layer_output,
                                                    cell_dim = cell_dim,
                                                    recurrent_projection_dim = recurrent_projection_dim,
                                                    non_recurrent_projection_dim = non_recurrent_projection_dim,
                                                    clipping_threshold = clipping_threshold,
                                                    zeroing_threshold = zeroing_threshold,
                                                    zeroing_interval = zeroing_interval,
                                                    ng_per_element_scale_options = ng_per_element_scale_options,
                                                    ng_affine_options = ng_affine_options,
                                                    lstm_delay = lstm_delay[i],
                                                    self_repair_scale_nonlinearity = self_repair_scale_nonlinearity,
                                                    max_change_per_component = max_change_per_component)
        else: # add a uni-directional LSTM layer
            prev_layer_output = nodes.AddLstmLayer(config_lines = config_lines,
                                                   name = "Lstm{0}".format(i+1),
                                                   input = prev_layer_output,
                                                   cell_dim = cell_dim,
                                                   recurrent_projection_dim = recurrent_projection_dim,
                                                   non_recurrent_projection_dim = non_recurrent_projection_dim,
                                                   clipping_threshold = clipping_threshold,
                                                   zeroing_threshold = zeroing_threshold,
                                                   zeroing_interval = zeroing_interval,
                                                   ng_per_element_scale_options = ng_per_element_scale_options,
                                                   ng_affine_options = ng_affine_options,
                                                   lstm_delay = lstm_delay[i][0],
                                                   self_repair_scale_nonlinearity = self_repair_scale_nonlinearity,
                                                   max_change_per_component = max_change_per_component)
        # make the intermediate config file for layerwise discriminative
        # training
        nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, ng_affine_options, max_change_per_component = max_change_per_component_final, label_delay = label_delay, include_log_softmax = include_log_softmax)


        if xent_regularize != 0.0:
            nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets,
                                include_log_softmax = True, label_delay = label_delay,
                                max_change_per_component = max_change_per_component_final,
                                name_affix = 'xent')

        # Store this layer's lines and start an empty set for the next layer.
        config_files['{0}/layer{1}.config'.format(config_dir, i+1)] = config_lines
        config_lines = {'components':[], 'component-nodes':[]}

    # The remaining hidden layers follow the LSTM layers.
    for i in range(num_lstm_layers, num_hidden_layers):
        prev_layer_output = nodes.AddAffRelNormLayer(config_lines, "L{0}".format(i+1),
                                               prev_layer_output, hidden_dim,
                                               ng_affine_options, self_repair_scale = self_repair_scale_nonlinearity, max_change_per_component = max_change_per_component)
        # make the intermediate config file for layerwise discriminative
        # training
        nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, ng_affine_options, max_change_per_component = max_change_per_component_final, label_delay = label_delay, include_log_softmax = include_log_softmax)

        if xent_regularize != 0.0:
            nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets,
                                include_log_softmax = True, label_delay = label_delay,
                                max_change_per_component = max_change_per_component_final,
                                name_affix = 'xent')

        config_files['{0}/layer{1}.config'.format(config_dir, i+1)] = config_lines
        config_lines = {'components':[], 'component-nodes':[]}

    # printing out the configs
    # init.config used to train lda-mllt train
    for key in config_files.keys():
        PrintConfig(key, config_files[key])


def ProcessSpliceIndexes(config_dir, splice_indexes, label_delay, num_lstm_layers):
    """Parse --splice-indexes and record the derived values in <config_dir>/vars.

    config_dir: existing directory in which the 'vars' file is written.
    splice_indexes: the raw --splice-indexes string.
    label_delay: label delay passed on to ParseSpliceString (may be None).
    num_lstm_layers: number of LSTM layers requested; it must not exceed the
        number of layers implied by splice_indexes.

    Returns the list [left_context, right_context, num_hidden_layers,
    splice_indexes], where splice_indexes is now the parsed list of lists.

    Raises Exception if more LSTM layers are requested than there are layers.
    """
    parsed_splice_output = ParseSpliceString(splice_indexes.strip(), label_delay)
    left_context = parsed_splice_output['left_context']
    right_context = parsed_splice_output['right_context']
    num_hidden_layers = parsed_splice_output['num_hidden_layers']
    splice_indexes = parsed_splice_output['splice_indexes']

    if num_hidden_layers < num_lstm_layers:
        # The LSTM layers are a subset of all layers, so there cannot be more
        # of them than splice-indexes entries.
        raise Exception("num-lstm-layers : number of lstm layers cannot be greater than number of layers, decided based on splice-indexes")

    # write the files used by other scripts like steps/nnet3/get_egs.sh
    # (the context manager closes the file even if a write fails)
    with open(config_dir + "/vars", "w") as f:
        print('model_left_context={}'.format(left_context), file=f)
        print('model_right_context={}'.format(right_context), file=f)
        print('num_hidden_layers={}'.format(num_hidden_layers), file=f)

    return [left_context, right_context, num_hidden_layers, splice_indexes]


def Main():
    """Entry point: parse arguments, write the 'vars' file and generate the configs."""
    args = GetArgs()

    # ProcessSpliceIndexes() also writes <config_dir>/vars; only the layer
    # count and the parsed splice indexes are needed from its return value.
    (_model_left_context,
     _model_right_context,
     num_hidden_layers,
     splice_indexes) = ProcessSpliceIndexes(args.config_dir,
                                            args.splice_indexes,
                                            args.label_delay,
                                            args.num_lstm_layers)

    MakeConfigs(
        config_dir=args.config_dir,
        feat_dim=args.feat_dim,
        ivector_dim=args.ivector_dim,
        num_targets=args.num_targets,
        splice_indexes=splice_indexes,
        lstm_delay=args.lstm_delay,
        cell_dim=args.cell_dim,
        hidden_dim=args.hidden_dim,
        recurrent_projection_dim=args.recurrent_projection_dim,
        non_recurrent_projection_dim=args.non_recurrent_projection_dim,
        num_lstm_layers=args.num_lstm_layers,
        num_hidden_layers=num_hidden_layers,
        norm_based_clipping=args.norm_based_clipping,
        clipping_threshold=args.clipping_threshold,
        zeroing_threshold=args.zeroing_threshold,
        zeroing_interval=args.zeroing_interval,
        ng_per_element_scale_options=args.ng_per_element_scale_options,
        ng_affine_options=args.ng_affine_options,
        label_delay=args.label_delay,
        include_log_softmax=args.include_log_softmax,
        xent_regularize=args.xent_regularize,
        self_repair_scale_nonlinearity=args.self_repair_scale_nonlinearity,
        self_repair_scale_clipgradient=args.self_repair_scale_clipgradient,
        max_change_per_component=args.max_change_per_component,
        max_change_per_component_final=args.max_change_per_component_final)

# Generate the configs only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    Main()


================================================
FILE: egs/steps/nnet3/lstm/train.sh
================================================
#!/usr/bin/env bash

# THIS SCRIPT IS DEPRECATED, see ../train_rnn.py

# Copyright 2012-2015  Johns Hopkins University (Author: Daniel Povey).
#           2013  Xiaohui Zhang
#           2013  Guoguo Chen
#           2014  Vimal Manohar
#           2014-2015  Vijayaditya Peddinti
# Apache 2.0.

# Terminology:
# sample - one input-output tuple, which is an input sequence and output sequence for LSTM
# frame  - one output label and the input context used to compute it

# Begin configuration section.
cmd=run.pl         # job launcher; the steps below are run as "$cmd <log-file> <command>".
num_epochs=10      # Number of epochs of training;
                   # the number of iterations is worked out from this.
initial_effective_lrate=0.0003  # effective learning rate at the start of training; the actual
                                # learning rate on an iteration is the effective rate
                                # multiplied by the number of jobs on that iteration.
final_effective_lrate=0.00003   # effective learning rate at the end of training.
num_jobs_initial=1 # Number of neural net jobs to run in parallel at the start of training
num_jobs_final=8   # Number of neural net jobs to run in parallel at the end of training
prior_subset_size=20000 # 20k samples per job, for computing priors.
num_jobs_compute_prior=10 # these are single-threaded, run on CPU.
get_egs_stage=0    # passed as --stage to get_egs.sh; can be used for rerunning
                   # after a partial run of the example generation.
online_ivector_dir=  # if non-empty, the iVector dimension is read from
                     # ivector_online.scp in this directory and the directory
                     # is passed on to get_egs.sh; if empty, ivector_dim is 0.
presoftmax_prior_scale_power=-0.25  # we haven't yet used pre-softmax prior scaling in the LSTM model
remove_egs=true  # set to false to disable removing egs after training is done.

max_models_combine=20 # The "max_models_combine" is the maximum number of models we give
  # to the final 'combine' stage, but these models will themselves be averages of
  # iteration-number ranges.

shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
                # on each iter.  You could set it to 0 or to a large value for complete
                # randomization, but this would both consume memory and cause spikes in
                # disk I/O.  Smaller is easier on disk and memory but less random.  It's
                # not a huge deal though, as samples are anyway randomized right at the start.
                # (The point of this is to get data in different minibatches on different
                # iterations, since in the preconditioning method, 2 samples in the same
                # minibatch can affect each others' gradients.)

add_layers_period=2 # by default, add new layers every 2 iterations.
stage=-6            # each stage N of the script runs only if stage <= N, so a larger
                    # value skips the earlier stages.
exit_stage=-100 # you can set this to terminate the training early.  Exits before running this stage

# count space-separated fields in splice_indexes to get num-hidden-layers.
splice_indexes="-2,-1,0,1,2 0 0"
# Format : "<frame_indices> <frame_indices> ... <frame_indices>", i.e. one
# space-separated field of comma-separated frame offsets per hidden layer.
# note: hidden layers which are composed of one or more components,
# so hidden layer indexing is different from component count

# LSTM parameters
num_lstm_layers=3
cell_dim=1024  # dimension of the LSTM cell
hidden_dim=1024  # the dimension of the fully connected hidden layer outputs
recurrent_projection_dim=256
non_recurrent_projection_dim=256
norm_based_clipping=true  # if true norm_based_clipping is used.
                          # In norm-based clipping the activation Jacobian matrix
                          # for the recurrent connections in the network is clipped
                          # to ensure that the individual row-norm (l2) does not increase
                          # beyond the clipping_threshold.
                          # If false, element-wise clipping is used.
clipping_threshold=30     # if norm_based_clipping is true this would be the maximum value of the row l2-norm,
                          # else this is the max-absolute value of each element in Jacobian.
chunk_width=20  # number of output labels in the sequence used to train an LSTM
                # Caution: if you double this you should halve --samples-per-iter.
chunk_left_context=40  # number of steps used in the estimation of LSTM state before prediction of the first label
chunk_right_context=0  # extra right context, added to the model's own right context when
                       # the examples are extracted (usually used in bi-directional LSTM case)
label_delay=5  # the lstm output is used to predict the label with the specified delay
lstm_delay=" -1 -2 -3 "  # the delay to be used in the recurrence of lstms
                         # "-1 -2 -3" means that a three layer stacked LSTM would use recurrence connections with
                         # delays -1, -2 and -3 at layer1 lstm, layer2 lstm and layer3 lstm respectively
                         # "[-1,1] [-2,2] [-3,3]" means a three layer stacked bi-directional LSTM would use recurrence
                         # connections with delay -1 for the forward, 1 for the backward at layer1,
                         # -2 for the forward, 2 for the backward at layer2, and so on at layer3
num_bptt_steps=    # this variable counts the number of time steps to back-propagate from the last label in the chunk
                   # if left empty it is set to chunk_width




# nnet3-train options
shrink=0.99  # this parameter would be used to scale the parameter matrices
shrink_threshold=0.15  # a value less than 0.25 that we compare the mean of
                       # 'deriv-avg' for sigmoid components with, and if it's
                       # less, we shrink.
max_param_change=2.0  # max param change per minibatch
num_chunk_per_minibatch=100  # number of sequences to be processed in parallel every mini-batch

samples_per_iter=20000 # this is really the number of egs in each archive.  Each eg has
                       # 'chunk_width' frames in it-- for chunk_width=20, this value (20k)
                       # is equivalent to the 400k number that we use as a default in
                       # regular DNN training.
momentum=0.5    # e.g. 0.5.  Note: we implemented it in such a way that
                # it doesn't increase the effective learning rate.
use_gpu=true    # if true, we run on GPU.
cleanup=true
egs_dir=        # if non-empty, example generation (get_egs.sh) is skipped and the
                # egs in this directory are used instead of $dir/egs.
max_lda_jobs=10  # use no more than 10 jobs for the LDA accumulation.
lda_opts=       # extra options passed to nnet-get-feature-transform.
egs_opts=       # extra options passed to get_egs.sh.
transform_dir=     # If supplied, this dir used instead of alidir to find transforms.
cmvn_opts=  # will be passed to get_egs.sh, if supplied.
            # only relevant for "raw" features, not lda.
feat_type=raw  # or set to 'lda' to use LDA features.
align_cmd=              # The cmd that is passed to steps/nnet3/align.sh
align_use_gpu=          # Passed to use_gpu in steps/nnet3/align.sh [yes/no]
realign_times=          # List of times on which we realign.  Each time is
                        # floating point number strictly between 0 and 1, which
                        # will be multiplied by the num-iters to get an iteration
                        # number.
num_jobs_align=30       # Number of jobs for realignment

rand_prune=4.0 # speeds up LDA.

# End configuration section.

# On INT/QUIT/TERM, kill (with SIGKILL) every background job of this shell
# that is still running.
trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM

echo "$0: THIS SCRIPT IS DEPRECATED"
echo "$0 $@"  # Print the command line for logging

# Source path.sh if it exists in the current directory, then let
# parse_options.sh process the command-line options (presumably overriding
# the configuration variables set above -- see utils/parse_options.sh).
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# Exactly four positional arguments are required; otherwise print the usage
# message and exit.  Only options that correspond to configuration variables
# defined at the top of this script are listed here.
if [ $# != 4 ]; then
  echo "Usage: $0 [opts] <data> <lang> <ali-dir> <exp-dir>"
  echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --num-epochs <#epochs|10>                        # Number of epochs of training"
  echo "  --initial-effective-lrate <lrate|0.0003>         # effective learning rate at start of training."
  echo "  --final-effective-lrate <lrate|0.00003>          # effective learning rate at end of training."
  echo "  --momentum <momentum|0.5>                        # Momentum constant: note, this is "
  echo "                                                   # implemented in such a way that it doesn't"
  echo "                                                   # increase the effective learning rate."
  echo "  --num-jobs-initial <num-jobs|1>                  # Number of parallel jobs to use for neural net training, at the start."
  echo "  --num-jobs-final <num-jobs|8>                    # Number of parallel jobs to use for neural net training, at the end"
  echo "  --splice-indexes <string|\"-2,-1,0,1,2 0 0\"> "
  echo "                                                   # Frame indices used for each splice layer."
  echo "                                                   # Format : <frame_indices> .... <frame_indices> "
  echo "                                                   # the number of fields determines the number of LSTM and non-recurrent layers"
  echo "                                                   # also see the --num-lstm-layers option"
  echo "                                                   # (note: we splice processed, typically 40-dimensional frames)"
  echo "  --realign-times <list-of-times|''>               # A list of space-separated numbers strictly between 0 and 1;"
  echo "                                                   # each is multiplied by the number of iterations to get an"
  echo "                                                   # iteration on which realignment is to be done"
  echo "  --align-cmd (utils/run.pl|utils/queue.pl <queue opts>) # passed to align.sh"
  echo "  --align-use-gpu (yes/no)                         # specify is gpu is to be used for realignment"
  echo "  --num-jobs-align <#njobs|30>                     # Number of jobs to perform realignment"
  echo "  --stage <stage|-6>                               # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."

  echo " ################### LSTM options ###################### "
  echo "  --num-lstm-layers <int|3>                        # number of LSTM layers"
  echo "  --cell-dim   <int|1024>                          # dimension of the LSTM cell"
  echo "  --hidden-dim      <int|1024>                     # the dimension of the fully connected hidden layer outputs"
  echo "  --recurrent-projection-dim  <int|256>            # the output dimension of the recurrent-projection-matrix"
  echo "  --non-recurrent-projection-dim  <int|256>        # the output dimension of the non-recurrent-projection-matrix"
  echo "  --chunk-left-context <int|40>                    # number of time-steps used in the estimation of the first LSTM state"
  echo "  --chunk-width <int|20>                           # number of output labels in the sequence used to train an LSTM"
  echo "                                                   # Caution: if you double this you should halve --samples-per-iter."
  echo "  --norm-based-clipping <bool|true>                # if true norm_based_clipping is used."
  echo "                                                   # In norm-based clipping the activation Jacobian matrix"
  echo "                                                   # for the recurrent connections in the network is clipped"
  echo "                                                   # to ensure that the individual row-norm (l2) does not increase"
  echo "                                                   # beyond the clipping_threshold."
  echo "                                                   # If false, element-wise clipping is used."
  echo "  --num-bptt-steps <int|>                          # this variable counts the number of time steps to back-propagate from the last label in the chunk"
  echo "                                                   # it defaults to chunk_width"
  echo "  --label-delay <int|5>                            # the lstm output is used to predict the label with the specified delay"

  echo "  --lstm-delay <str|\" -1 -2 -3 \">                # the delay to be used in the recurrence of lstms"
  echo "                                                   # \"-1 -2 -3\" means the a three layer stacked LSTM would use recurrence connections with "
  echo "                                                   # delays -1, -2 and -3 at layer1 lstm, layer2 lstm and layer3 lstm respectively"
  echo "  --clipping-threshold <int|30>                    # if norm_based_clipping is true this would be the maximum value of the row l2-norm,"
  echo "                                                   # else this is the max-absolute value of each element in Jacobian."

  echo " ################### LSTM specific training options ###################### "
  echo "  --num-chunk-per-minibatch <minibatch-size|100>   # Number of sequences to be processed in parallel in a minibatch"
  echo "  --samples-per-iter <#samples|20000>              # Number of egs in each archive of data.  This times --chunk-width is"
  echo "                                                   # the number of frames processed per iteration"
  echo "  --shrink <shrink|0.99>                           # if non-zero this parameter will be used to scale the parameter matrices"
  echo "  --shrink-threshold <threshold|0.15>              # a threshold (should be between 0.0 and 0.25) that controls when to"
  echo "                                                   # do parameter shrinking."
  echo " for more options see the script"
  exit 1;
fi

# Positional arguments: data dir, lang dir, alignment dir, output dir.
data=$1; lang=$2; alidir=$3; dir=$4

# Realignment needs both the alignment command and the GPU setting.
if [ -n "$realign_times" ]; then
  if [ -z "$align_cmd" ]; then
    echo "$0: realign_times specified but align_cmd not specified"
    exit 1
  fi
  if [ -z "$align_use_gpu" ]; then
    echo "$0: realign_times specified but align_use_gpu not specified"
    exit 1
  fi
fi

# Bail out on the first required input file that is missing.
for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree; do
  if [ ! -f $f ]; then
    echo "$0: no such file $f"
    exit 1;
  fi
done


# Set some variables.
# num_leaves is the "num-pdfs" value reported by tree-info for the alignment
# tree; it is passed as --num-targets when the configs are created.
num_leaves=$(tree-info $alidir/tree 2>/dev/null | grep num-pdfs | awk '{print $2}') || exit 1
# Quote the operand so the emptiness test does not depend on how a
# one-argument "[" happens to be evaluated.
[ -z "$num_leaves" ] && echo "\$num_leaves is unset" && exit 1
[ "$num_leaves" -eq "0" ] && echo "\$num_leaves is 0" && exit 1

nj=$(cat $alidir/num_jobs) || exit 1;  # number of jobs in alignment dir...
sdata=$data/split$nj
utils/split_data.sh $data $nj || exit 1;

mkdir -p $dir/log
echo $nj > $dir/num_jobs
cp $alidir/tree $dir || exit 1;

# The phone set of the lang dir must be compatible with that of the alignments.
utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;
# First work out the feature and iVector dimension, needed for the neural net
# config creation.
case $feat_type in
  raw)
    feat_dim=$(feat-to-dim --print-args=false scp:$data/feats.scp -) || \
      { echo "$0: Error getting feature dim"; exit 1; }
    ;;
  lda)
    # Without final.mat the dimension cannot be determined, so stop here
    # instead of continuing with an empty feat_dim.
    if [ ! -f $alidir/final.mat ]; then
      echo "$0: With --feat-type lda option, expect $alidir/final.mat to exist."
      exit 1
    fi
    # get num-rows in lda matrix, which is the lda feature dim.
    feat_dim=$(matrix-dim --print-args=false $alidir/final.mat | cut -f 1)
    ;;
  *)
    echo "$0: Bad --feat-type '$feat_type';"; exit 1;
    ;;
esac
# No online iVectors means an iVector dimension of zero.
if [ -z "$online_ivector_dir" ]; then
  ivector_dim=0
else
  ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1;
fi


# Stage -5: generate the network config files under $dir/configs with
# make_configs.py and initialize the raw network $dir/init.raw from
# $dir/configs/init.config.
if [ $stage -le -5 ]; then
  echo "$0: creating neural net configs";

  # create the config files for nnet initialization
  # note an additional space is added to splice_indexes to
  # avoid issues with the python ArgParser which can have
  # issues with negative arguments (due to minus sign)
  config_extra_opts=()
  # --lstm-delay is only passed when a value has been set.
  [ ! -z "$lstm_delay" ] && config_extra_opts+=(--lstm-delay "$lstm_delay")

  steps/nnet3/lstm/make_configs.py  "${config_extra_opts[@]}" \
    --splice-indexes "$splice_indexes " \
    --num-lstm-layers $num_lstm_layers \
    --feat-dim $feat_dim \
    --ivector-dim $ivector_dim \
    --cell-dim $cell_dim \
    --hidden-dim $hidden_dim \
    --recurrent-projection-dim $recurrent_projection_dim \
    --non-recurrent-projection-dim $non_recurrent_projection_dim \
    --norm-based-clipping $norm_based_clipping \
    --clipping-threshold $clipping_threshold \
    --num-targets $num_leaves \
    --label-delay $label_delay \
   $dir/configs || exit 1;
  # Initialize as "raw" nnet, prior to training the LDA-like preconditioning
  # matrix.  This first config just does any initial splicing that we do;
  # we do this as it's a convenient way to get the stats for the 'lda-like'
  # transform.
  $cmd $dir/log/nnet_init.log \
    nnet3-init --srand=-2 $dir/configs/init.config $dir/init.raw || exit 1;
fi
# The "vars" file written during config creation defines
#   model_left_context, model_right_context and num_hidden_layers.
source $dir/configs/vars || exit 1;

# Total context = chunk context + the model's own context.
left_context=$((chunk_left_context + model_left_context))
right_context=$((chunk_right_context + model_right_context))
context_opts="--left-context=$left_context --right-context=$right_context"

if ! [ "$num_hidden_layers" -gt 0 ]; then
  echo "$0: Expected num_hidden_layers to be defined"
  exit 1;
fi

# Transforms are looked up in the alignment dir unless --transform-dir was given.
transform_dir=${transform_dir:-$alidir}

# Stage -4: dump the training examples with get_egs.sh, unless an existing
# egs directory was supplied via --egs-dir.
if [ $stage -le -4 ] && [ -z "$egs_dir" ]; then
  extra_opts=()
  [ ! -z "$cmvn_opts" ] && extra_opts+=(--cmvn-opts "$cmvn_opts")
  [ ! -z "$feat_type" ] && extra_opts+=(--feat-type $feat_type)
  [ ! -z "$online_ivector_dir" ] && extra_opts+=(--online-ivector-dir $online_ivector_dir)
  extra_opts+=(--transform-dir $transform_dir)
  extra_opts+=(--left-context $left_context)
  extra_opts+=(--right-context $right_context)

  # Note: in RNNs we process sequences of labels rather than single label per sample
  echo "$0: calling get_egs.sh"
  # $egs_opts is passed once only (it used to be given twice); it is kept in
  # the later position so that it still takes precedence over the options
  # that precede it.
  steps/nnet3/get_egs.sh "${extra_opts[@]}" \
      --cmd "$cmd" $egs_opts \
      --stage $get_egs_stage \
      --samples-per-iter $samples_per_iter \
      --frames-per-eg $chunk_width \
      $data $alidir $dir/egs || exit 1;
fi

# Default to the egs directory created above.
[ -z "$egs_dir" ] && egs_dir=$dir/egs

# The dimensions recorded in the egs directory must match what the configs
# were created with.
if [ "$feat_dim" != "$(cat $egs_dir/info/feat_dim)" ]; then
  echo "$0: feature dimension mismatch with egs, $feat_dim vs $(cat $egs_dir/info/feat_dim)";
  exit 1;
fi
if [ "$ivector_dim" != "$(cat $egs_dir/info/ivector_dim)" ]; then
  echo "$0: ivector dimension mismatch with egs, $ivector_dim vs $(cat $egs_dir/info/ivector_dim)";
  exit 1;
fi

# copy any of the following that exist, to $dir.
cp $egs_dir/{cmvn_opts,splice_opts,final.mat} $dir 2>/dev/null

# confirm that the egs_dir has the necessary context (especially important if
# the --egs-dir option was used on the command line).
egs_left_context=$(cat $egs_dir/info/left_context) || exit -1
egs_right_context=$(cat $egs_dir/info/right_context) || exit -1
if [ $egs_left_context -lt $left_context ] || \
   [ $egs_right_context -lt $right_context ]; then
  echo "$0: egs in $egs_dir have too little context"
  exit -1;
fi

# From here on chunk_width is whatever the egs were actually dumped with.
chunk_width=$(cat $egs_dir/info/frames_per_eg) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; }
num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/num_archives"; exit 1; }

# The messages name the options as they are actually spelled on the command
# line (--num-jobs-initial / --num-jobs-final).
[ $num_jobs_initial -gt $num_jobs_final ] && \
  echo "$0: --num-jobs-initial cannot exceed --num-jobs-final" && exit 1;

[ $num_jobs_final -gt $num_archives ] && \
  echo "$0: --num-jobs-final cannot exceed #archives $num_archives." && exit 1;


# Stage -3: accumulate LDA-style statistics over (at most $max_lda_jobs of) the
# egs archives using the initial raw network, sum them, and estimate the
# preconditioning matrix $dir/lda.mat.
if [ $stage -le -3 ]; then
  echo "$0: getting preconditioning matrix for input features."
  # One job per archive, capped at max_lda_jobs.
  num_lda_jobs=$num_archives
  [ $num_lda_jobs -gt $max_lda_jobs ] && num_lda_jobs=$max_lda_jobs

  # Write stats with the same format as stats for LDA.
  $cmd JOB=1:$num_lda_jobs $dir/log/get_lda_stats.JOB.log \
      nnet3-acc-lda-stats --rand-prune=$rand_prune \
        $dir/init.raw "ark:$egs_dir/egs.JOB.ark" $dir/JOB.lda_stats || exit 1;

  # Sum the per-job statistics into $dir/lda_stats, then delete the per-job files.
  all_lda_accs=$(for n in $(seq $num_lda_jobs); do echo $dir/$n.lda_stats; done)
  $cmd $dir/log/sum_transform_stats.log \
    sum-lda-accs $dir/lda_stats $all_lda_accs || exit 1;

  rm $all_lda_accs || exit 1;

  # this computes a fixed affine transform computed in the way we described in
  # Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled variant
  # of an LDA transform but without dimensionality reduction.
  $cmd $dir/log/get_transform.log \
     nnet-get-feature-transform $lda_opts $dir/lda.mat $dir/lda_stats || exit 1;

  # Make the matrix visible under the configs directory via a relative symlink.
  ln -sf ../lda.mat $dir/configs/lda.mat
fi


# Stage -2: compute per-pdf counts from the alignments and turn them into the
# scaling vector $dir/presoftmax_prior_scale.vec.
if [ $stage -le -2 ]; then
  echo "$0: preparing initial vector for FixedScaleComponent before softmax"
  echo "  ... using priors^$presoftmax_prior_scale_power and rescaling to average 1"

  # obtains raw pdf count
  $cmd JOB=1:$nj $dir/log/acc_pdf.JOB.log \
     ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
     post-to-tacc --per-pdf=true  $alidir/final.mdl ark:- $dir/pdf_counts.JOB || exit 1;
  # Sum the per-job counts into a single text-format vector and remove the
  # per-job files.
  $cmd $dir/log/sum_pdf_counts.log \
       vector-sum --binary=false $dir/pdf_counts.* $dir/pdf_counts || exit 1;
  rm $dir/pdf_counts.*

  # The awk program below reads the counts from fields 2..NF-1 of the input
  # line (the first and last fields are skipped -- presumably the "[" and "]"
  # of the text vector), computes
  #   scale[i] = (count[i] + smooth * average_count)^power
  # and prints the scales multiplied by num_pdfs/sum(scale), so that they
  # average to 1, in the form " [ s0 s1 ... ]".
  awk -v power=$presoftmax_prior_scale_power -v smooth=0.01 \
     '{ for(i=2; i<=NF-1; i++) { count[i-2] = $i;  total += $i; }
        num_pdfs=NF-2;  average_count = total/num_pdfs;
        for (i=0; i<num_pdfs; i++) stot += (scale[i] = (count[i] + smooth * average_count)^power)
        printf " [ "; for (i=0; i<num_pdfs; i++) printf("%f ", scale[i]*num_pdfs/stot); print "]" }' \
     $dir/pdf_counts > $dir/presoftmax_prior_scale.vec
  # Make the vector visible under the configs directory via a relative symlink.
  ln -sf ../presoftmax_prior_scale.vec $dir/configs/presoftmax_prior_scale.vec
fi

# Stage -1: build the initial model $dir/0.mdl from $dir/init.raw and the
# first layer config.
if [ $stage -le -1 ]; then
  # Add the first layer; this will add in the lda.mat and
  # presoftmax_prior_scale.vec.
  $cmd $dir/log/add_first_layer.log \
       nnet3-init --srand=-3 $dir/init.raw $dir/configs/layer1.config $dir/0.raw || exit 1;

  # Convert to .mdl, train the transitions, set the priors.
  # (The transition model is taken from $alidir/final.mdl and the transitions
  # are trained on all the alignment archives $alidir/ali.*.gz.)
  $cmd $dir/log/init_mdl.log \
    nnet3-am-init $alidir/final.mdl $dir/0.raw - \| \
    nnet3-am-train-transitions - "ark:gunzip -c $alidir/ali.*.gz|" $dir/0.mdl || exit 1;
fi


# set num_iters so that as close as possible, we process the data $num_epochs
# times, i.e. $num_iters*$avg_num_jobs == $num_epochs*$num_archives,
# where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2.

num_archives_to_process=$(($num_epochs*$num_archives))
num_archives_processed=0
num_iters=$((($num_archives_to_process*2)/($num_jobs_initial+$num_jobs_final)))

# Iteration by which all hidden layers have been added.  This has to be
# computed BEFORE the check below: previously it was assigned afterwards, so
# the check silently compared num_iters against 2 instead.
finish_add_layers_iter=$(($num_hidden_layers * $add_layers_period))

# There must be some iterations left after the last layer has been added.
! [ $num_iters -gt $(($finish_add_layers_iter+2)) ] \
  && echo "$0: Insufficient epochs" && exit 1

echo "$0: Will train for $num_epochs epochs = $num_iters iterations"

# Choose the queue / GPU options for the training, combine and prior jobs.
if $use_gpu; then
  # GPU training makes no sense with binaries built without CUDA: stop early.
  cuda-compiled || {
    echo "$0: WARNING: you are running with one thread but you have not compiled"
    echo "   for CUDA.  You may be running a setup optimized for GPUs.  If you have"
    echo "   GPUs and have nvcc installed, go to src/ and do ./configure; make"
    exit 1
  }
  parallel_suffix=""
  parallel_train_opts=
  train_queue_opt="--gpu 1"
  combine_queue_opt="--gpu 1"
  prior_queue_opt="--gpu 1"
  prior_gpu_opt="--use-gpu=yes"
else
  echo "$0: without using a GPU this will be very slow.  nnet3 does not yet support multiple threads."
  parallel_train_opts="--use-gpu=no"
  # The combine stage will be quite slow without a GPU, as that program
  # was not enabled to use multiple threads.
  combine_queue_opt=""
  prior_queue_opt=""
  prior_gpu_opt="--use-gpu=no"
fi

# Work out how many iterations we want to combine over in the final
# nnet3-combine-fast invocation.  (We may end up subsampling from these if the
# number exceeds max_model_combine).  The number we use is:
# min(max(max_models_combine, approx_iters_per_epoch_final),
#     1/2 * iters_after_last_layer_added)
approx_iters_per_epoch_final=$(($num_archives/$num_jobs_final))

# max(max_models_combine, approx_iters_per_epoch_final)
if [ $max_models_combine -lt $approx_iters_per_epoch_final ]; then
  num_iters_combine=$approx_iters_per_epoch_final
else
  num_iters_combine=$max_models_combine
fi

# ... capped at half the iterations that remain after the last layer is added.
half_iters_after_add_layers=$((($num_iters-$finish_add_layers_iter)/2))
[ $num_iters_combine -gt $half_iters_after_add_layers ] && \
  num_iters_combine=$half_iters_after_add_layers

first_model_combine=$(($num_iters-$num_iters_combine+1))

# x is the iteration counter of the main training loop.
x=0

for realign_time in $realign_times; do
  # Work out the iterations on which we will re-align, if the --realign-times
  # option was used.  This is slightly approximate.
  # An out-of-range value is a fatal error (previously the message was
  # printed but the script carried on).
  if ! perl -e "exit($realign_time > 0.0 && $realign_time < 1.0 ? 0:1);"; then
    echo "Invalid --realign-times option $realign_times: elements must be strictly between 0 and 1.";
    exit 1
  fi
  # Map the fraction p to an iteration index, given j initial jobs, k final
  # jobs and n iterations: n*p when j == k, otherwise
  # n*(sqrt((1-p)*j^2 + p*k^2) - j)/(k - j), rounded to the nearest integer.
  # (Presumably this accounts for the number of jobs changing from j to k
  # over the course of training.)
  realign_iter=$(perl -e '($j,$k,$n,$p)=@ARGV; print int(0.5 + ($j==$k ? $n*$p : $n*(sqrt((1-$p)*$j*$j+$p*$k*$k)-$j)/($k-$j))); ' $num_jobs_initial $num_jobs_final $num_iters $realign_time) || exit 1;
  realign_this_iter[$realign_iter]=$realign_time
done

cur_egs_dir=$egs_dir
# Back-propagate through the whole chunk unless --num-bptt-steps was given.
[ -z "$num_bptt_steps" ] && num_bptt_steps=$chunk_width;
min_deriv_time=$((chunk_width - num_bptt_steps))
while [ $x -lt $num_iters ]; do
  [ $x -eq $exit_stage ] && echo "$0: Exiting early due to --exit-stage $exit_stage" && exit 0;

  this_num_jobs=$(perl -e "print int(0.5+$num_jobs_initial+($num_jobs_final-$num_jobs_initial)*$x/$num_iters);")

  ilr=$initial_effective_lrate; flr=$final_effective_lrate; np=$num_archives_processed; nt=$num_archives_to_process;
  this_effective_learning_rate=$(perl -e "print ($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt));");
  this_learning_rate=$(perl -e "print ($this_effective_learning_rate*$this_num_jobs);");

  if [ ! -z "${realign_this_iter[$x]}" ]; then
    prev_egs_dir=$cur_egs_dir
    cur_egs_dir=$dir/egs_${realign_this_iter[$x]}
  fi

  if [ $x -ge 0 ] && [ $stage -le $x ]; then
    # Set this_shrink value.
    if [ $x -eq 0 ] || nnet3-am-info --print-args=false $dir/$x.mdl | \
      perl -e "while(<>){ if (m/type=Sigmoid.+deriv-avg=.+mean=(\S+)/) { \$n++; \$tot+=\$1; } } exit(\$tot/\$n > $shrink_threshold);"; then
      this_shrink=$shrink; # e.g. avg-deriv of sigmoids was <= 0.125, so shrink.
    else
      this_shrink=1.0  # don't shrink: sigmoids are not over-saturated.
    fi
    echo "On iteration $x, learning rate is $this_learning_rate and shrink value is $this_shrink."

    if [ ! -z "${realign_this_iter[$x]}" ]; then
      time=${realign_this_iter[$x]}

      echo "Getting average posterior for purposes of adjusting the priors."
      # Note: this just uses CPUs, using a smallish subset of data.
      # always use the first egs archive, which makes the script simpler;
      # we're using different random subsets of it.
      rm $dir/post.$x.*.vec 2>/dev/null
      $cmd JOB=1:$num_jobs_compute_prior $dir/log/get_post.$x.JOB.log \
        nnet3-copy-egs --srand=JOB --frame=random $context_opts ark:$prev_egs_dir/egs.1.ark ark:- \| \
        nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \
        nnet3-merge-egs ark:- ark:- \| \
        nnet3-compute-from-egs --apply-exp=true "nnet3-am-copy --raw=true $dir/$x.mdl -|" ark:- ark:- \| \
        matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1;

      sleep 3;  # make sure there is time for $dir/post.$x.*.vec to appear.

      $cmd $dir/log/vector_sum.$x.log \
        vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1;
      rm $dir/post.$x.*.vec;

      echo "Re-adjusting priors based on computed posteriors"
      $cmd $dir/log/adjust_priors.$x.log \
        nnet3-am-adjust-priors $dir/$x.mdl $dir/post.$x.vec $dir/$x.mdl || exit 1;

      sleep 2

      steps/nnet3/align.sh --nj $num_jobs_align --cmd "$align_cmd" --use-gpu $align_use_gpu \
        --transform-dir "$transform_dir" --online-ivector-dir "$online_ivector_dir" \
        --iter $x $data $lang $dir $dir/ali_$time || exit 1

      steps/nnet3/relabel_egs.sh --cmd "$cmd" --iter $x $dir/ali_$time \
        $prev_egs_dir $cur_egs_dir || exit 1

      if $cleanup && [[ $prev_egs_dir =~ $dir/egs* ]]; then
        steps/nnet3/remove_egs.sh $prev_egs_dir
      fi
    fi

    # Set off jobs doing some diagnostics, in the background.
    # Use the egs dir from the previous iteration for the diagnostics
    $cmd $dir/log/compute_prob_valid.$x.log \
      nnet3-compute-prob "nnet3-am-copy --raw=true $dir/$x.mdl - |" \
            "ark,bg:nnet3-merge-egs ark:$cur_egs_dir/valid_diagnostic.egs ark:- |" &
    $cmd $dir/log/compute_prob_train.$x.log \
      nnet3-compute-prob "nnet3-am-copy --raw=true $dir/$x.mdl - |" \
           "ark,bg:nnet3-merge-egs ark:$cur_egs_dir/train_diagnostic.egs ark:- |" &

    if [ $x -gt 0 ]; then
      $cmd $dir/log/progress.$x.log \
        nnet3-info "nnet3-am-copy --raw=true $dir/$x.mdl - |" '&&' \
        nnet3-show-progress --use-gpu=no "nnet3-am-copy --raw=true $dir/$[$x-1].mdl - |" "nnet3-am-copy --raw=true $dir/$x.mdl - |" \
        "ark,bg:nnet3-merge-egs --minibatch-size=256 ark:$cur_egs_dir/train_diagnostic.egs ark:-|" &
    fi

    echo "Training neural net (pass $x)"

    if [ $x -gt 0 ] && \
      [ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \
      [ $[$x%$add_layers_period] -eq 0 ]; then
      do_average=false # if we've just mixed up, don't do averaging but take the
                       # best.
      cur_num_hidden_layers=$[1+$x/$add_layers_period]
      config=$dir/configs/layer$cur_num_hidden_layers.config
      raw="nnet3-am-copy --raw=true --learning-rate=$this_learning_rate $dir/$x.mdl - | nnet3-init --srand=$x - $config - |"
      cache_read_opt="" # an option for writing cache (storing pairs of nnet-computations
                        # and computation-requests) during training.
    else
      do_average=true
      if [ $x -eq 0 ]; then do_average=false; fi # on iteration 0, pick the best, don't average.
      raw="nnet3-am-copy --raw=true --learning-rate=$this_learning_rate $dir/$x.mdl -|"
      cache_read_opt="--read-cache=$dir/cache.$x"
    fi
    if $do_average; then
      this_num_chunk_per_minibatch=$num_chunk_per_minibatch
    else
      # on iteration zero or when we just added a layer, use a smaller minibatch
      # size (and we will later choose the output of just one of the jobs): the
      # model-averaging isn't always helpful when the model is changing too fast
      # (i.e. it can worsen the objective function), and the smaller minibatch
      # size will help to keep the update stable.
      this_num_chunk_per_minibatch=$[$num_chunk_per_minibatch/2];
    fi

    rm $dir/.error 2>/dev/null


    ( # this sub-shell is so that when we "wait" below,
      # we only wait for the training jobs that we just spawned,
      # not the diagnostic jobs that we spawned above.

      # We cannot easily use a single parallel SGE job to do the main training,
      # because the computation of which archive and which --frame option
      # to use for each job is a little complex, so we spawn each one separately.
      # this is no longer true for RNNs as we use do not use the --frame option
      # but we use the same script for consistency with FF-DNN code

      for n in $(seq $this_num_jobs); do
        k=$[$num_archives_processed + $n - 1]; # k is a zero-based index that we will derive
                                               # the other indexes from.
        archive=$[($k%$num_archives)+1]; # work out the 1-based archive index.
        if [ $n -eq 1 ]; then
          # an option for writing cache (storing pairs of nnet-computations and
          # computation-requests) during training.
          cache_write_opt=" --write-cache=$dir/cache.$[$x+1]"
        else
          cache_write_opt=""
        fi
        $cmd $train_queue_opt $dir/log/train.$x.$n.log \
          nnet3-train $parallel_train_opts $cache_read_opt $cache_write_opt --print-interval=10 --momentum=$momentum \
          --max-param-change=$max_param_change \
          --optimization.min-deriv-time=$min_deriv_time "$raw" \
          "ark,bg:nnet3-copy-egs $context_opts ark:$cur_egs_dir/egs.$archive.ark ark:- | nnet3-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-merge-egs --minibatch-size=$this_num_chunk_per_minibatch --measure-output-frames=false --discard-partial-minibatches=true ark:- ark:- |" \
          $dir/$[$x+1].$n.raw || touch $dir/.error &
      done
      wait
    )
    # the error message below is not that informative, but $cmd will
    # have printed a more specific one.
    [ -f $dir/.error ] && echo "$0: error on iteration $x of training" && exit 1;

    models_to_average=$(steps/nnet3/get_successful_models.py $this_num_jobs $dir/log/train.$x.%.log)
    nnets_list=
    for n in $models_to_average; do
      nnets_list="$nnets_list $dir/$[$x+1].$n.raw"
    done

    if $do_average; then
      # average the output of the different jobs.
      $cmd $dir/log/average.$x.log \
        nnet3-average $nnets_list - \| \
        nnet3-am-copy --scale=$this_shrink --set-raw-nnet=- $dir/$x.mdl $dir/$[$x+1].mdl || exit 1;
    else
      # choose the best from the different jobs.
      n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) {
          $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn";
          undef $logprob; while (<F>) { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } }
          close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob;
          $best_n=$n; } } print "$best_n\n"; ' $this_num_jobs $dir/log/train.$x.%d.log) || exit 1;
      [ -z "$n" ] && echo "Error getting best model" && exit 1;
      $cmd $dir/log/select.$x.log \
        nnet3-am-copy --scale=$this_shrink --set-raw-nnet=$dir/$[$x+1].$n.raw  $dir/$x.mdl $dir/$[$x+1].mdl || exit 1;
    fi

    nnets_list=
    for n in `seq 1 $this_num_jobs`; do
      nnets_list="$nnets_list $dir/$[$x+1].$n.raw"
    done

    rm $nnets_list
    [ ! -f $dir/$[$x+1].mdl ] && exit 1;
    if [ -f $dir/$[$x-1].mdl ] && $cleanup && \
       [ $[($x-1)%100] -ne 0  ] && [ $[$x-1] -lt $first_model_combine ]; then
      rm $dir/$[$x-1].mdl
    fi
  fi
  rm $dir/cache.$x 2>/dev/null
  x=$[$x+1]
  num_archives_processed=$[$num_archives_processed+$this_num_jobs]
done


# Final model combination: combine the last $num_iters_combine models
# (starting at iteration $first_model_combine) into $dir/combined.mdl, then
# launch diagnostics on the combined model in the background.
if [ $stage -le $num_iters ]; then
  echo "Doing final combination to produce final.mdl"

  # Now do combination.  In the nnet3 setup, the logic
  # for doing averaging of subsets of the models in the case where
  # there are too many models to reliably estimate interpolation
  # factors (max_models_combine) is moved into the nnet3-combine
  nnets_list=()
  for n in $(seq 0 $[num_iters_combine-1]); do
    iter=$[$first_model_combine+$n]
    mdl=$dir/$iter.mdl
    [ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1;
    # Each array element is a Kaldi "rxfilename" pipe producing the raw nnet.
    nnets_list[$n]="nnet3-am-copy --raw=true $mdl -|";
  done

  # Number of chunks per minibatch used for combination (1024 / chunk_width,
  # truncated to an integer).  print() is called as a function so that this
  # works with both Python 2 and Python 3; the old "print int(...)" statement
  # form is a syntax error under Python 3 and left the variable empty.
  combine_num_chunk_per_minibatch=$(python -c "print(int(1024.0/($chunk_width)))") || exit 1
  $cmd $combine_queue_opt $dir/log/combine.log \
    nnet3-combine --num-iters=40 \
       --enforce-sum-to-one=true --enforce-positive-weights=true \
       --verbose=3 "${nnets_list[@]}" "ark,bg:nnet3-merge-egs --measure-output-frames=false --minibatch-size=$combine_num_chunk_per_minibatch ark:$cur_egs_dir/combine.egs ark:-|" \
    "|nnet3-am-copy --set-raw-nnet=- $dir/$num_iters.mdl $dir/combined.mdl" || exit 1;

  # Compute the probability of the final, combined model with
  # the same subset we used for the previous compute_probs, as the
  # different subsets will lead to different probs.
  # Both jobs run in the background and are not waited for here.
  $cmd $dir/log/compute_prob_valid.final.log \
    nnet3-compute-prob "nnet3-am-copy --raw=true $dir/combined.mdl -|" \
    "ark,bg:nnet3-merge-egs --minibatch-size=256 ark:$cur_egs_dir/valid_diagnostic.egs ark:- |" &
  $cmd $dir/log/compute_prob_train.final.log \
    nnet3-compute-prob  "nnet3-am-copy --raw=true $dir/combined.mdl -|" \
    "ark,bg:nnet3-merge-egs --minibatch-size=256 ark:$cur_egs_dir/train_diagnostic.egs ark:- |" &
fi

# Prior re-adjustment: compute the average posterior of combined.mdl over a
# subset of the training egs and use it to adjust the priors, writing
# $dir/final.mdl.
# NOTE(review): $x is whatever value the training loop above left behind
# (presumably $num_iters) -- it is only used here to name intermediate files.
if [ $stage -le $[$num_iters+1] ]; then
  echo "Getting average posterior for purposes of adjusting the priors."
  # Note: this just uses CPUs, using a smallish subset of data.
  # Remove stale per-job posterior vectors from any earlier run.
  rm $dir/post.$x.*.vec 2>/dev/null
  # If there are more jobs than egs archives, every job reads archive 1
  # (each with a different --srand); otherwise job JOB reads archive JOB.
  if [ $num_jobs_compute_prior -gt $num_archives ]; then egs_part=1;
  else egs_part=JOB; fi
  # Per job: subset egs -> merge into minibatches -> forward pass with exp()
  # applied -> sum rows per minibatch -> sum into one vector per job.
  $cmd JOB=1:$num_jobs_compute_prior $prior_queue_opt $dir/log/get_post.$x.JOB.log \
    nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:$cur_egs_dir/egs.$egs_part.ark ark:- \| \
    nnet3-merge-egs --measure-output-frames=true --minibatch-size=128 ark:- ark:- \| \
    nnet3-compute-from-egs $prior_gpu_opt --apply-exp=true \
      "nnet3-am-copy --raw=true $dir/combined.mdl -|" ark:- ark:- \| \
    matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1;

  sleep 3;  # make sure there is time for $dir/post.$x.*.vec to appear.

  # Sum the per-job vectors into a single posterior vector.
  $cmd $dir/log/vector_sum.$x.log \
   vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1;

  # The per-job vectors are no longer needed once summed.
  rm $dir/post.$x.*.vec;

  echo "Re-adjusting priors based on computed posteriors"
  $cmd $dir/log/adjust_priors.final.log \
    nnet3-am-adjust-priors $dir/combined.mdl $dir/post.$x.vec $dir/final.mdl || exit 1;
fi


# Fail (before any cleanup happens) if training did not produce final.mdl.
if [ ! -f $dir/final.mdl ]; then
  echo "$0: $dir/final.mdl does not exist."
  # we don't want to clean up if the training didn't succeed.
  exit 1;
fi

sleep 2

echo Done

# Optional cleanup of egs and intermediate models, controlled by the
# $cleanup and $remove_egs boolean options.
if $cleanup; then
  echo Cleaning up data
  # Only remove the egs when their path matches the regex "$dir/egs*".
  # NOTE(review): this calls the nnet2 remove_egs.sh whereas the training loop
  # above calls steps/nnet3/remove_egs.sh -- confirm the two are equivalent.
  if $remove_egs && [[ $cur_egs_dir =~ $dir/egs* ]]; then
    steps/nnet2/remove_egs.sh $cur_egs_dir
  fi

  echo Removing most of the models
  for x in `seq 0 $num_iters`; do
    # Keep iteration numbers divisible by 100 and the last iteration.
    if [ $[$x%100] -ne 0 ] && [ $x -ne $num_iters ] && [ -f $dir/$x.mdl ]; then
       # delete all but every 100th model; don't delete the ones which combine to form the final model.
       rm $dir/$x.mdl
    fi
  done
fi


================================================
FILE: egs/steps/nnet3/make_bottleneck_features.sh
================================================
#!/usr/bin/env bash

# Copyright 2016 Pegah Ghahremani

# This script dumps bottleneck feature for model trained using nnet3.
# CAUTION!  This script isn't very suitable for dumping features from recurrent
# architectures such as LSTMs, because it doesn't support setting the chunk size
# and left and right context.  (Those would have to be passed into nnet3-compute).
# See also chain/get_phone_post.sh.

# Begin configuration section.
# These defaults can be overridden from the command line (--stage, --nj, ...)
# via parse_options.sh, which is sourced below.
stage=1
nj=4
cmd=queue.pl
use_gpu=false
ivector_dir=
compress=true
# End configuration options.

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh # source the path.
. parse_options.sh || exit 1;

# Between 4 and 6 positional arguments are accepted; the last two are optional.
if [[ ( $# -lt 4 ) || ( $# -gt 6 ) ]]; then
   echo "usage: steps/nnet3/make_bottleneck_features.sh <bnf-node-name> <input-data-dir> <bnf-data-dir> <nnet-dir> [<log-dir> [<bnfdir>] ]"
   echo "e.g.:  steps/nnet3/make_bottleneck_features.sh tdnn_bn.renorm data/train data/train_bnf exp/nnet3/tdnn_bnf exp_bnf/dump_bnf bnf"
   echo "Note: <log-dir> defaults to <bnf-data-dir>/log and <bnfdir> defaults to"
   echo " <bnf-data-dir>/data"
   echo "main options (for others, see top of script file)"
   echo "  --config <config-file>                           # config containing options"
   echo "  --nj <nj>                                        # number of parallel jobs"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   echo "  --ivector-dir                                    # directory for ivectors"
   exit 1;
fi
bnf_name=$1 # the component-node name in nnet3 model used for bottleneck feature extraction
data=$2     # input data directory (must contain feats.scp)
bnf_data=$3 # output data directory for the bottleneck features
nnetdir=$4  # directory holding the trained nnet3 model
# Optional 5th argument: log directory (default: <bnf-data-dir>/log).
if [ $# -gt 4 ]; then
  logdir=$5
else
  logdir=$bnf_data/log
fi
# Optional 6th argument: directory for the feature archives
# (default: <bnf-data-dir>/data).
if [ $# -gt 5 ]; then
  bnfdir=$6
else
  bnfdir=$bnf_data/data
fi

# Read the CMVN options saved with the model, then locate the network:
# prefer $nnetdir/final.raw and fall back to $nnetdir/final.mdl; exit with an
# error if neither exists.
cmvn_opts=`cat $nnetdir/cmvn_opts`;
bnf_nnet=$nnetdir/final.raw
if [ ! -f $bnf_nnet ] ; then
  if [ ! -f $nnetdir/final.mdl ]; then
    echo "$0: No such file $bnf_nnet or $nnetdir/final.mdl";
    exit 1;
  else
    bnf_nnet=$nnetdir/final.mdl
  fi
fi

# Choose the queue and nnet3-compute options for GPU or CPU execution.
# With --use-gpu true the binaries must have been compiled with CUDA support,
# otherwise the script aborts.
if $use_gpu; then
  compute_queue_opt="--gpu 1"
  compute_gpu_opt="--use-gpu=yes"
  if ! cuda-compiled; then
    # This is a fatal condition (we exit below), so report it as an error
    # rather than as a warning about thread counts.
    echo "$0: ERROR: --use-gpu is true but you have not compiled for CUDA."
    echo "   If you have GPUs and have nvcc installed, go to src/ and do"
    echo "   ./configure; make.  Otherwise re-run with --use-gpu false."
    exit 1
  fi
else
  echo "$0: without using a GPU this will be very slow.  nnet3 does not yet support multiple threads."
  # Explicitly empty, so a value inherited from the environment cannot leak
  # into the $cmd invocation later in the script.
  compute_queue_opt=
  compute_gpu_opt="--use-gpu=no"
fi


## Set up input features of nnet
name=`basename $data`   # used to name the output archives and log files
sdata=$data/split$nj    # per-job split of the input data directory

mkdir -p $logdir
mkdir -p $bnf_data
mkdir -p $bnfdir
echo $nj > $bnfdir/num_jobs

[ ! -f $data/feats.scp ] && echo >&2 "The file $data/feats.scp does not exist!" && exit 1;
# Re-split the data unless the split directory exists and feats.scp is older
# than it.
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;

# i-vectors are used only when --ivector-dir was given; in that case check
# that they are compatible with the model.
use_ivector=false
if [ ! -z "$ivector_dir" ];then
  use_ivector=true
  steps/nnet2/check_ivectors_compatible.sh $nnetdir $ivector_dir || exit 1;
fi

## Set up features.
# The presence of the file $nnetdir/online_cmvn selects online CMVN.
if [ -f $nnetdir/online_cmvn ]; then online_cmvn=true
else online_cmvn=false; fi

# "JOB" in the strings below is a placeholder substituted by $cmd with the
# job index.
if ! $online_cmvn; then
  echo "$0: feature type is raw"
  feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"
else
  echo "$0: feature type is raw (apply-cmvn-online)"
  feats="ark,s,cs:apply-cmvn-online $cmvn_opts --spk2utt=ark:$sdata/JOB/spk2utt $nnetdir/global_cmvn.stats scp:$sdata/JOB/feats.scp ark:- |"
fi
# Online i-vectors restricted to this job's utterances; only used below when
# $use_ivector is true.
ivector_feats="scp:utils/filter_scp.pl $sdata/JOB/utt2spk $ivector_dir/ivector_online.scp |"

# Stage 1: run the network with its output node redirected to the bottleneck
# component-node, and write the resulting features as ark/scp pairs (one per
# job) in $bnfdir.
if [ $stage -le 1 ]; then
  echo "$0: Generating bottleneck (BNF) features using $bnf_nnet model as output of "
  echo "    component-node with name $bnf_name."
  # Config that makes "output" read from the bottleneck node; nnet3-copy
  # applies it on the fly when the model is read.
  echo "output-node name=output input=$bnf_name" > $bnf_data/output.config
  modified_bnf_nnet="nnet3-copy --nnet-config=$bnf_data/output.config $bnf_nnet - |"
  ivector_opts=
  if $use_ivector; then
    ivector_period=$(cat $ivector_dir/ivector_period) || exit 1;
    ivector_opts="--online-ivector-period=$ivector_period --online-ivectors='$ivector_feats'"
  fi
  $cmd $compute_queue_opt JOB=1:$nj $logdir/make_bnf_$name.JOB.log \
    nnet3-compute $compute_gpu_opt $ivector_opts "$modified_bnf_nnet" "$feats" ark:- \| \
    copy-feats --compress=$compress ark:- ark,scp:$bnfdir/raw_bnfeat_$name.JOB.ark,$bnfdir/raw_bnfeat_$name.JOB.scp || exit 1;
fi


# Sanity check: the number of BNF utterances must equal the number of input
# utterances.
N0=$(cat $data/feats.scp | wc -l)
N1=$(cat $bnfdir/raw_bnfeat_$name.*.scp | wc -l)
if [[ "$N0" != "$N1" ]]; then
  echo "$0: Error generating BNF features for $name (original:$N0 utterances, BNF:$N1 utterances)"
  exit 1;
fi

# Concatenate feats.scp into bnf_data
for n in $(seq $nj); do  cat $bnfdir/raw_bnfeat_$name.$n.scp; done > $bnf_data/feats.scp

# Copy over whichever of the standard data-directory files exist in the input
# directory; missing ones are silently skipped.
for f in segments spk2utt text utt2spk wav.scp char.stm glm kws reco2file_and_channel stm; do
  [ -e $data/$f ] && cp -r $data/$f $bnf_data/$f
done

echo "$0: computing CMVN stats."
# Propagate a CMVN failure instead of reporting success with an incomplete
# data directory.
steps/compute_cmvn_stats.sh $bnf_data || exit 1;

echo "$0: done making BNF features."

exit 0;


================================================
FILE: egs/steps/nnet3/make_denlats.sh
================================================
#!/usr/bin/env bash
# Copyright 2012        Johns Hopkins University (Author: Daniel Povey)
#           2014-2015   Vimal Manohar
# Apache 2.0.

# Create denominator lattices for MMI/MPE training [deprecated].
# This version uses the neural-net models (version 3, i.e. the nnet3 code).
# Creates its output in $dir/lat.*.gz
# Note: the more recent discriminative training scripts will not use this
# script at all, they'll use get_degs.sh which combines the decoding
# and egs-dumping into one script (to save disk space and disk I/O).

# Begin configuration section.
# These defaults can be overridden from the command line via parse_options.sh,
# which is sourced below.
nj=4
cmd=run.pl
sub_split=1
beam=13.0
frames_per_chunk=50
lattice_beam=7.0
self_loop_scale=0.1
acwt=0.1
max_active=5000
min_active=200
max_mem=20000000 # This will stop the processes getting too large.
# This is in bytes, but not "real" bytes-- you have to multiply
# by something like 5 or 10 to get real bytes (not sure why so large)
num_threads=1 # number of threads of decoder [only applicable if not looped, for now]
online_ivector_dir=
determinize=true
minimize=false
ivector_scale=1.0
extra_left_context=0
extra_right_context=0
extra_left_context_initial=-1
extra_right_context_final=-1
# End configuration section.


echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

# Any --num-threads value given on the command line is overridden here.
num_threads=1 # Fixed to 1 for now

# Exactly four positional arguments are required.
if [ $# != 4 ]; then
  echo "Usage: steps/nnet3/make_denlats.sh [options] <data-dir> <lang-dir> <src-dir> <exp-dir>"
  echo "  e.g.: steps/nnet3/make_denlats.sh data/train data/lang exp/nnet4 exp/nnet4_denlats"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config containing options"
  echo "  --nj <nj>                                        # number of parallel jobs"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --sub-split <n-split>                            # e.g. 40; use this for "
  echo "                           # large databases so your jobs will be smaller and"
  echo "                           # will (individually) finish reasonably soon."
  echo "  --num-threads  <n>                # number of threads per decoding job"
  exit 1;
fi

data=$1    # data directory (feats.scp, text, ...)
lang=$2    # lang directory (L.fst, words.txt, phones.txt, oov.int)
srcdir=$3  # directory containing the source model final.mdl
dir=$4     # output directory for the graph and the lattices


# Check that all required input files exist; the i-vector files are required
# only when --online-ivector-dir was given.
extra_files=
[ ! -z "$online_ivector_dir" ] && \
  extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"
for f in $data/feats.scp $lang/L.fst $srcdir/final.mdl $extra_files; do
  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1;
done

sdata=$data/split$nj
# NOTE(review): splice_opts is read here but not referenced again in this script.
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
# num_threads is fixed to 1 above, so thread_string stays empty.
thread_string=
[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads"

mkdir -p $dir/log
split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs

utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1;

oov=`cat $lang/oov.int` || exit 1;

# Work on a private copy of the lang directory (symlinks dereferenced), since
# its G.fst is replaced below.
cp -rH $lang $dir/

# Compute grammar FST which corresponds to unigram decoding graph.
new_lang="$dir/"$(basename "$lang")

# mkgraph.sh expects a whole directory "lang", so put everything in one directory...
# it gets L_disambig.fst and G.fst (among other things) from $dir/lang, and
# final.mdl from $srcdir; the output HCLG.fst goes in $dir/graph.

echo "Compiling decoding graph in $dir/dengraph"
# Reuse an existing non-empty graph if it is newer than the model.
if [ -s $dir/dengraph/HCLG.fst ] && [ $dir/dengraph/HCLG.fst -nt $srcdir/final.mdl ]; then
  echo "Graph $dir/dengraph/HCLG.fst already exists: skipping graph creation."
else
  echo "Making unigram grammar FST in $new_lang"
  # Map the transcripts to integer word ids (OOVs mapped to $oov), drop the
  # utterance-id column, and build a unigram grammar from the result.
  cat $data/text | utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt | \
   awk '{for(n=2;n<=NF;n++){ printf("%s ", $n); } printf("\n"); }' | \
    utils/make_unigram_grammar.pl | fstcompile | fstarcsort --sort_type=ilabel > $new_lang/G.fst \
    || exit 1;
  utils/mkgraph.sh --self-loop-scale $self_loop_scale $new_lang $srcdir $dir/dengraph || exit 1;
fi
# cmvn_opts is optional: a missing file yields empty options.
cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
cp $srcdir/cmvn_opts $dir 2>/dev/null

echo "$0: feature type is raw"

# "JOB" is a placeholder substituted by $cmd with the job index.
feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"

# Kill any still-running background jobs of this shell.  Installed as a trap
# handler so that interrupting (or leaving) the script does not leave
# decoding jobs behind.
cleanup() {
  local running_jobs
  running_jobs=$(jobs -pr)
  # $running_jobs is deliberately unquoted: one PID per word for kill.
  [[ -n $running_jobs ]] && kill $running_jobs
}
trap cleanup INT QUIT TERM EXIT

# Optional decoder options; each variable is left unset when it does not apply.

# Online i-vector options, only when --online-ivector-dir was given.
if [ ! -z "$online_ivector_dir" ]; then
  ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1;
  ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period"
fi

# Pass the model's frame-subsampling factor to the decoder and record it in
# the output directory.
if [ -f $srcdir/frame_subsampling_factor ]; then
  # e.g. for 'chain' systems
  frame_subsampling_opt="--frame-subsampling-factor=$(cat $srcdir/frame_subsampling_factor)"
  cp $srcdir/frame_subsampling_factor $dir
fi

# When --determinize is true, the decoder output is piped through this command
# before being gzipped; otherwise the raw lattices are written directly.
lattice_determinize_cmd=
if $determinize; then
  lattice_determinize_cmd="lattice-determinize-non-compact --acoustic-scale=$acwt --max-mem=$max_mem --minimize=$minimize --prune=true --beam=$lattice_beam ark:- ark:- |"
fi

# Generate the denominator lattices.  With --sub-split 1 each of the $nj data
# splits is decoded by one job, writing $dir/lat.JOB.gz.  Otherwise each split
# is divided further per utterance, decoded piecewise, and the pieces are
# merged back into $dir/lat.<n>.gz.
if [ $sub_split -eq 1 ]; then
  $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode_den.JOB.log \
    nnet3-latgen-faster$thread_string $ivector_opts $frame_subsampling_opt \
    --frames-per-chunk=$frames_per_chunk \
    --extra-left-context=$extra_left_context \
    --extra-right-context=$extra_right_context \
    --extra-left-context-initial=$extra_left_context_initial \
    --extra-right-context-final=$extra_right_context_final \
    --minimize=false --determinize-lattice=false \
    --word-determinize=false --phone-determinize=false \
    --max-active=$max_active --min-active=$min_active --beam=$beam \
    --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=false \
    --max-mem=$max_mem --word-symbol-table=$lang/words.txt $srcdir/final.mdl  \
    $dir/dengraph/HCLG.fst "$feats" \
    "ark:|$lattice_determinize_cmd gzip -c >$dir/lat.JOB.gz" || exit 1
else

  # each job from 1 to $nj is split into multiple pieces (sub-split), and we aim
  # to have at most two jobs running at each time.  The idea is that if we have stragglers
  # from one job, we can be processing another one at the same time.
  rm $dir/.error 2>/dev/null

  prev_pid=
  # The loop runs one extra iteration (n = nj+1) that launches nothing and
  # only waits for, and merges, the last real subset.
  for n in `seq $[nj+1]`; do
    if [ $n -gt $nj ]; then
      this_pid=
    elif [ -f $dir/.done.$n ] && [ $dir/.done.$n -nt $srcdir/final.mdl ]; then
      # A .done marker newer than the model means this subset was completed
      # by an earlier run.
      echo "Not processing subset $n as already done (delete $dir/.done.$n if not)";
      this_pid=
    else
      sdata2=$data/split$nj/$n/split${sub_split}utt;
      split_data.sh --per-utt $sdata/$n $sub_split || exit 1;
      mkdir -p $dir/log/$n
      mkdir -p $dir/part
      # Point the feature pipeline at the per-utterance sub-split of subset $n.
      feats_subset=`echo $feats | sed s:JOB/:$n/split${sub_split}utt/JOB/:g`

      # Launched in the background; a failure is recorded by creating
      # $dir/.error, which is checked after the wait below.
      $cmd --num-threads $num_threads JOB=1:$sub_split $dir/log/$n/decode_den.JOB.log \
        nnet3-latgen-faster$thread_string $ivector_opts $frame_subsampling_opt \
        --frames-per-chunk=$frames_per_chunk \
        --extra-left-context=$extra_left_context \
        --extra-right-context=$extra_right_context \
        --extra-left-context-initial=$extra_left_context_initial \
        --extra-right-context-final=$extra_right_context_final \
        --minimize=false --determinize-lattice=false \
        --word-determinize=false --phone-determinize=false \
        --max-active=$max_active --min-active=$min_active --beam=$beam \
        --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=false \
        --max-mem=$max_mem --word-symbol-table=$lang/words.txt $srcdir/final.mdl  \
        $dir/dengraph/HCLG.fst "$feats_subset" \
        "ark:|$lattice_determinize_cmd gzip -c >$dir/lat.$n.JOB.gz" || touch $dir/.error &
      this_pid=$!
    fi
    if [ ! -z "$prev_pid" ]; then  # Wait for the previous job; merge the previous set of lattices.
      wait $prev_pid
      [ -f $dir/.error ] && echo "$0: error generating denominator lattices" && exit 1;
      rm $dir/.merge_error 2>/dev/null
      echo Merging archives for data subset $prev_n
      # Concatenate the decompressed pieces and recompress them as one archive.
      for k in `seq $sub_split`; do
        gunzip -c $dir/lat.$prev_n.$k.gz || touch $dir/.merge_error;
      done | gzip -c > $dir/lat.$prev_n.gz || touch $dir/.merge_error;
      [ -f $dir/.merge_error ] && echo "$0: Merging lattices for subset $prev_n failed (or maybe some other error)" && exit 1;
      rm $dir/lat.$prev_n.*.gz
      touch $dir/.done.$prev_n
    fi
    # Remember this iteration's subset and PID for the next iteration's merge.
    prev_n=$n
    prev_pid=$this_pid
  done
fi


echo "$0: done generating denominator lattices."


================================================
FILE: egs/steps/nnet3/make_tdnn_configs.py
================================================
#!/usr/bin/env python

# This script is deprecated, please use ../xconfig_to_configs.py

# we're using python 3.x style print but want it to work in python 2.x,
from __future__ import print_function
import re, os, argparse, sys, math, warnings


# Command-line interface.  Options are declared in a table (flag, keyword
# arguments for add_argument) and registered in order, followed by the single
# positional argument naming the output directory.
parser = argparse.ArgumentParser(
    description="Writes config files and variables "
                "for TDNNs creation and training",
    epilog="See steps/nnet3/train_tdnn.sh for example.")

_option_table = [
    ("--splice-indexes",
     dict(type=str,
          help="Splice indexes at each hidden layer, e.g. '-3,-2,-1,0,1,2,3 0 -2,2 0 -4,4 0 -8,8'")),
    ("--feat-dim",
     dict(type=int, help="Raw feature dimension, e.g. 13")),
    ("--ivector-dim",
     dict(type=int, help="iVector dimension, e.g. 100", default=0)),
    ("--include-log-softmax",
     dict(type=str, help="add the final softmax layer ",
          default="true", choices=["false", "true"])),
    ("--final-layer-normalize-target",
     dict(type=float,
          help="RMS target for final layer (set to <1 if final layer learns too fast",
          default=1.0)),
    ("--pnorm-input-dim",
     dict(type=int, help="input dimension to p-norm nonlinearities")),
    ("--pnorm-output-dim",
     dict(type=int, help="output dimension of p-norm nonlinearities")),
    ("--relu-dim",
     dict(type=int, help="dimension of ReLU nonlinearities")),
    ("--use-presoftmax-prior-scale",
     dict(type=str, help="if true, a presoftmax-prior-scale is added",
          choices=['true', 'false'], default="true")),
    ("--num-targets",
     dict(type=int,
          help="number of network targets (e.g. num-pdf-ids/num-leaves)")),
]
for _flag, _kwargs in _option_table:
    parser.add_argument(_flag, **_kwargs)
parser.add_argument("config_dir",
                    help="Directory to write config files and variables")

# Echo the full command line, for logging purposes.
print(' '.join(sys.argv))

args = parser.parse_args()

# Create the output directory if it does not exist yet.
if not os.path.exists(args.config_dir):
    os.makedirs(args.config_dir)

## Check arguments.
# Sets nonlin_input_dim / nonlin_output_dim (input and output dimension of
# each hidden layer's nonlinearity) and use_presoftmax_prior_scale.
if args.splice_indexes is None:
    sys.exit("--splice-indexes argument is required")
if args.feat_dim is None or not (args.feat_dim > 0):
    sys.exit("--feat-dim argument is required")
if args.num_targets is None or not (args.num_targets > 0):
    sys.exit("--num-targets argument is required")
if args.relu_dim is not None:
    # ReLU and p-norm nonlinearities are mutually exclusive.
    if args.pnorm_input_dim is not None or args.pnorm_output_dim is not None:
        sys.exit("--relu-dim argument not compatible with "
                 "--pnorm-input-dim or --pnorm-output-dim options")
    nonlin_input_dim = args.relu_dim
    nonlin_output_dim = args.relu_dim
else:
    # Test for None explicitly: comparing None with an int raises TypeError
    # on Python 3, which would hide the intended error message when the
    # p-norm options are omitted.
    if (args.pnorm_input_dim is None or args.pnorm_output_dim is None
            or not args.pnorm_input_dim > 0
            or not args.pnorm_output_dim > 0):
        sys.exit("--relu-dim not set, so expected --pnorm-input-dim and "
                 "--pnorm-output-dim to be provided.")
    nonlin_input_dim = args.pnorm_input_dim
    nonlin_output_dim = args.pnorm_output_dim

# The option is restricted to the strings "true"/"false" by argparse.
use_presoftmax_prior_scale = (args.use_presoftmax_prior_scale == "true")

## Parse --splice-indexes into splice_array, a list with one sorted list of
## integer offsets per hidden layer, e.g.
## splice_array = [ [ -3,-2,...3 ], [0], [-2,2], .. [ -8,8 ] ]
## and accumulate the total left/right context over all layers.
splice_array = []
left_context = 0
right_context = 0
layer_specs = args.splice_indexes.split()  # one whitespace-separated field per layer
if not layer_specs:
    sys.exit("invalid --splice-indexes argument, too short: "
             + args.splice_indexes)
try:
    for layer_spec in layer_specs:
        fields = layer_spec.split(",")
        if not fields:
            sys.exit("invalid --splice-indexes argument, too-short element: "
                     + args.splice_indexes)
        # int() raises ValueError on a malformed field; handled below.
        offsets = [int(field) for field in fields]
        if offsets != sorted(offsets):
            sys.exit("elements of --splice-indexes must be sorted: "
                     + args.splice_indexes)
        # The first offset is the leftmost one and the last the rightmost.
        left_context -= offsets[0]
        right_context += offsets[-1]
        splice_array.append(offsets)
except ValueError as err:
    sys.exit("invalid --splice-indexes argument " + args.splice_indexes + str(err))
# Contexts are clamped so that they are never negative.
left_context = max(left_context, 0)
right_context = max(right_context, 0)
num_hidden_layers = len(splice_array)
# Dimension of the spliced input: one copy of the features per offset of the
# first layer, plus the iVector.
input_dim = args.feat_dim * len(splice_array[0]) + args.ivector_dim

# Write <config_dir>/vars: variable assignments (left_context, right_context,
# num_hidden_layers), one per line.  The files are opened with context
# managers so that they are closed even if writing fails.
with open(args.config_dir + "/vars", "w") as f:
    print('left_context={}'.format(left_context), file=f)
    print('right_context={}'.format(right_context), file=f)
    # the initial l/r contexts are actually not needed.
    # print('initial_left_context=' + str(splice_array[0][0]), file=f)
    # print('initial_right_context=' + str(splice_array[0][-1]), file=f)
    print('num_hidden_layers={}'.format(num_hidden_layers), file=f)

# Write <config_dir>/init.config: a network consisting only of the input
# node(s) and an output node that appends the spliced input frames (and the
# iVector, if any).
with open(args.config_dir + "/init.config", "w") as f:
    print('# Config file for initializing neural network prior to', file=f)
    print('# preconditioning matrix computation', file=f)
    print('input-node name=input dim={}'.format(args.feat_dim), file=f)
    # One descriptor per first-layer splice offset; offset 0 is the plain input.
    # (Named so as not to shadow the builtin "list".)
    input_descriptors = [('Offset(input, {0})'.format(n) if n != 0 else 'input')
                         for n in splice_array[0]]
    if args.ivector_dim > 0:
        print('input-node name=ivector dim={}'.format(args.ivector_dim), file=f)
        input_descriptors.append('ReplaceIndex(ivector, t, 0)')
    # example of next line:
    # output-node name=output input="Append(Offset(input, -3), Offset(input, -2), Offset(input, -1), ... , Offset(input, 3), ReplaceIndex(ivector, t, 0))"
    print('output-node name=output input=Append({0})'.format(
        ", ".join(input_descriptors)), file=f)

# Write one config file per hidden layer: <config_dir>/layer<N>.config.
# Each file declares the components of layer N plus the final (output)
# components, followed by the component-nodes that wire them together.
# Layer 1 additionally declares the "lda" component and reads from the spliced
# input; later layers read from the spliced output of renorm<N-1>.
for layer in range(1, num_hidden_layers + 1):
    is_first_layer = (layer == 1)
    layer_splice = splice_array[layer - 1]
    softmax_requested = (args.include_log_softmax == "true")
    # The file is opened with a context manager so that it is closed even if
    # writing fails part-way through.
    with open(args.config_dir + "/layer{0}.config".format(layer), "w") as f:
        print('# Config file for layer {0} of the network'.format(layer), file=f)
        if is_first_layer:
            print('component name=lda type=FixedAffineComponent matrix={0}/lda.mat'.
                  format(args.config_dir), file=f)
        # Input dimension of this layer's affine component.
        if is_first_layer:
            cur_dim = input_dim
        else:
            cur_dim = nonlin_output_dim * len(layer_splice)

        print('# Note: param-stddev in next component defaults to 1/sqrt(input-dim).', file=f)
        print('component name=affine{0} type=NaturalGradientAffineComponent '
              'input-dim={1} output-dim={2} bias-stddev=0'.
              format(layer, cur_dim, nonlin_input_dim), file=f)
        if args.relu_dim is not None:
            print('component name=nonlin{0} type=RectifiedLinearComponent dim={1}'.
                  format(layer, args.relu_dim), file=f)
        else:
            print('# In nnet3 framework, p in P-norm is always 2.', file=f)
            print('component name=nonlin{0} type=PnormComponent input-dim={1} output-dim={2}'.
                  format(layer, args.pnorm_input_dim, args.pnorm_output_dim), file=f)
        # Only the last hidden layer uses the configurable target RMS.
        if layer < num_hidden_layers:
            target_rms = 1.0
        else:
            target_rms = args.final_layer_normalize_target
        print('component name=renorm{0} type=NormalizeComponent dim={1} target-rms={2}'.format(
            layer, nonlin_output_dim, target_rms), file=f)
        print('component name=final-affine type=NaturalGradientAffineComponent '
              'input-dim={0} output-dim={1} param-stddev=0 bias-stddev=0'.format(
                  nonlin_output_dim, args.num_targets), file=f)
        # printing out the next two, and their component-nodes, for layer > 1
        # is not really necessary as they will already exist, but it doesn't
        # hurt and makes the structure clearer.
        if softmax_requested:
            if use_presoftmax_prior_scale:
                print('component name=final-fixed-scale type=FixedScaleComponent '
                      'scales={0}/presoftmax_prior_scale.vec'.format(
                          args.config_dir), file=f)
            print('component name=final-log-softmax type=LogSoftmaxComponent dim={0}'.format(
                args.num_targets), file=f)

        print('# Now for the network structure', file=f)
        if is_first_layer:
            splices = [('Offset(input, {0})'.format(n) if n != 0 else 'input')
                       for n in layer_splice]
            if args.ivector_dim > 0:
                splices.append('ReplaceIndex(ivector, t, 0)')
            # e.g. orig_input = 'Append(Offset(input, -2), ... Offset(input, 2), ivector)'
            orig_input = 'Append({0})'.format(', '.join(splices))
            print('component-node name=lda component=lda input={0}'.format(orig_input),
                  file=f)
            cur_input = 'lda'
        else:
            # e.g. cur_input = 'Append(Offset(renorm1, -2), renorm1, Offset(renorm1, 2))'
            prev_renorm = 'renorm{0}'.format(layer - 1)
            splices = [('Offset({0}, {1})'.format(prev_renorm, n) if n != 0 else prev_renorm)
                       for n in layer_splice]
            cur_input = 'Append({0})'.format(', '.join(splices))
        # Note: the trailing space in the next line's output is kept as-is.
        print('component-node name=affine{0} component=affine{0} input={1} '.
              format(layer, cur_input), file=f)
        print('component-node name=nonlin{0} component=nonlin{0} input=affine{0}'.
              format(layer), file=f)
        print('component-node name=renorm{0} component=renorm{0} input=nonlin{0}'.
              format(layer), file=f)

        print('component-node name=final-affine component=final-affine input=renorm{0}'.
              format(layer), file=f)

        if softmax_requested:
            if use_presoftmax_prior_scale:
                print('component-node name=final-fixed-scale component=final-fixed-scale input=final-affine',
                      file=f)
                print('component-node name=final-log-softmax component=final-log-softmax '
                      'input=final-fixed-scale', file=f)
            else:
                print('component-node name=final-log-softmax component=final-log-softmax '
                      'input=final-affine', file=f)
            print('output-node name=output input=final-log-softmax', file=f)
        else:
            print('output-node name=output input=final-affine', file=f)

# component name=nonlin1 type=PnormComponent input-dim=$pnorm_input_dim output-dim=$pnorm_output_dim
# component name=renorm1 type=NormalizeComponent dim=$pnorm_output_dim
# component name=final-affine type=NaturalGradientAffineComponent input-dim=$pnorm_output_dim output-dim=$num_leaves param-stddev=0 bias-stddev=0
# component name=final-log-softmax type=LogSoftmaxComponent dim=$num_leaves


# ## Write file $config_dir/init.config to initialize the network, prior to computing the LDA matrix.
# ##will look like this, if we have iVectors:
# input-node name=input dim=13
# input-node name=ivector dim=100
# output-node name=output input="Append(Offset(input, -3), Offset(input, -2), Offset(input, -1), ... , Offset(input, 3), ReplaceIndex(ivector, t, 0))"

# ## Write file $config_dir/layer1.config that adds the LDA matrix, assumed to be in the config directory as
# ## lda.mat, the first hidden layer, and the output layer.
# component name=lda type=FixedAffineComponent matrix=$config_dir/lda.mat
# component name=affine1 type=NaturalGradientAffineComponent input-dim=$lda_input_dim output-dim=$pnorm_input_dim bias-stddev=0
# component name=nonlin1 type=PnormComponent input-dim=$pnorm_input_dim output-dim=$pnorm_output_dim
# component name=renorm1 type=NormalizeComponent dim=$pnorm_output_dim
# component name=final-affine type=NaturalGradientAffineComponent input-dim=$pnorm_output_dim output-dim=$num_leaves param-stddev=0 bias-stddev=0
# component name=final-log-softmax type=LogSoftmaxComponent dim=$num_leaves
# # InputOf(output) says use the same Descriptor of the current "output" node.
# component-node name=lda component=lda input=InputOf(output)
# component-node name=affine1 component=affine1 input=lda
# component-node name=nonlin1 component=nonlin1 input=affine1
# component-node name=renorm1 component=renorm1 input=nonlin1
# component-node name=final-affine component=final-affine input=renorm1
# component-node name=final-log-softmax component=final-log-softmax input=final-affine
# output-node name=output input=final-log-softmax


# ## Write file $config_dir/layer2.config that adds the second hidden layer.
# component name=affine2 type=NaturalGradientAffineComponent input-dim=$lda_input_dim output-dim=$pnorm_input_dim bias-stddev=0
# component name=nonlin2 type=PnormComponent input-dim=$pnorm_input_dim output-dim=$pnorm_output_dim
# component name=renorm2 type=NormalizeComponent dim=$pnorm_output_dim
# component name=final-affine type=NaturalGradientAffineComponent input-dim=$pnorm_output_dim output-dim=$num_leaves param-stddev=0 bias-stddev=0
# component-node name=affine2 component=affine2 input=Append(Offset(renorm1, -2), Offset(renorm1, 2))
# component-node name=nonlin2 component=nonlin2 input=affine2
# component-node name=renorm2 component=renorm2 input=nonlin2
# component-node name=final-affine component=final-affine input=renorm2
# component-node name=final-log-softmax component=final-log-softmax input=final-affine
# output-node name=output input=final-log-softmax


# ## ... etc.  In this example it would go up to $config_dir/layer5.config.


================================================
FILE: egs/steps/nnet3/multilingual/allocate_multilingual_examples.py
================================================
#!/usr/bin/env python3

# Copyright      2017 Pegah Ghahremani
#                2018 Hossein Hadian
#
# Apache 2.0.

""" This script generates examples for multilingual training of neural network.
    This script produces 3 sets of files --
    egs.*.scp, egs.output.*.ark, egs.weight.*.ark

    egs.*.scp are the SCP files of the training examples.
    egs.weight.*.ark map from the key of the example to the language-specific
    weight of that example.
    egs.output.*.ark map from the key of the example to the name of
    the output-node in the neural net for that specific language, e.g.
    'output-2'.

    --egs-prefix option can be used to generate train and diagnostics egs files.
    If --egs-prefix=train_diagnostic. is passed, then the files produced by the
    script will be named with the prefix "train_diagnostic."
    instead of "egs."
    i.e. the files produced are -- train_diagnostic.*.scp,
    train_diagnostic.output.*.ark and train_diagnostic.weight.*.ark.
    The other egs-prefix options used in the recipes are "valid_diagnostic."
    for validation examples and "combine." for examples used for model
    combination.

    For chain training egs, the --egs-prefix option should be "cegs."

    You can call this script as (e.g.):

    allocate_multilingual_examples.py [opts] example-scp-lists
        multilingual-egs-dir

    allocate_multilingual_examples.py --block-size 512
        --lang2weight  "0.2,0.8" exp/lang1/egs.scp exp/lang2/egs.scp
        exp/multi/egs

"""

import os, argparse, sys, random
import logging
import traceback

# Put the 'steps' directory (relative to the current working directory) at the
# front of the module search path.
sys.path.insert(0, 'steps')

# Configure the 'libs' logger: INFO and above are sent through a StreamHandler
# using a format that includes the timestamp, source location, function name
# and level of each record.
logger = logging.getLogger('libs')
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - "
                              "%(funcName)s - %(levelname)s ] %(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)
# Note: this message is emitted at import time, before any argument parsing.
logger.info('Start generating multilingual examples')


def get_args():
    """Parse this script's command line (taken from sys.argv).

    The raw sys.argv is echoed to stderr for logging before parsing.

    Returns:
        argparse.Namespace with the attributes num_archives (int or None),
        block_size (int), egs_prefix (str), lang2weight (str or None),
        egs_scp_lists (list of str) and egs_dir (str).
    """

    parser = argparse.ArgumentParser(
        description=""" This script generates examples for multilingual training
        of neural network by producing 3 sets of primary files
        as egs.*.scp, egs.output.*.ark, egs.weight.*.ark.
        egs.*.scp are the SCP files of the training examples.
        egs.weight.*.ark map from the key of the example to the language-specific
        weight of that example.
        egs.output.*.ark map from the key of the example to the name of
        the output-node in the neural net for that specific language, e.g.
        'output-2'.""",
        epilog="Called by steps/nnet3/multilingual/combine_egs.sh")

    # NOTE: adjacent string literals are concatenated verbatim, so every
    # fragment of a help text that is followed by another must end in a space.
    parser.add_argument("--num-archives", type=int, default=None,
                        help="Number of archives to split the data into. (Note: in reality they are not "
                        "archives, only scp files, but we use this notation by analogy with the "
                        "conventional egs-creating script).")
    parser.add_argument("--block-size", type=int, default=512,
                        help="This relates to locality of disk access. 'block-size' is "
                        "the average number of examples that are read consecutively "
                        "from each input scp file (and are written in the same order to the output scp files). "
                        "Smaller values lead to more random disk access (during "
                        "the nnet3 training process).")
    parser.add_argument("--egs-prefix", type=str, default="egs.",
                        help="This option can be used to add a prefix to the filenames "
                        "of the output files. For e.g. "
                        "if --egs-prefix=combine. , then the files produced "
                        "by this script will be "
                        "combine.output.*.ark, combine.weight.*.ark, and combine.*.scp")
    parser.add_argument("--lang2weight", type=str,
                        help="Comma-separated list of weights, one per language. "
                        "The language order is as egs_scp_lists.")
    # now the positional arguments
    parser.add_argument("egs_scp_lists", nargs='+',
                        help="List of egs.scp files per input language. "
                        "e.g. exp/lang1/egs/egs.scp exp/lang2/egs/egs.scp")
    parser.add_argument("egs_dir",
                        help="Name of output egs directory e.g. exp/tdnn_multilingual_sp/egs")

    print(sys.argv, file=sys.stderr)
    args = parser.parse_args()

    return args


def read_lines(file_handle, num_lines):
    """Read at most 'num_lines' lines from 'file_handle'.

    Each returned line has its leading and trailing whitespace stripped.
    Reading stops early when readline() returns an empty string (end of
    file), so the returned list may be shorter than 'num_lines'.
    """
    collected = []
    while len(collected) < num_lines:
        raw = file_handle.readline()
        if raw:
            collected.append(raw.strip())
        else:
            # Empty string from readline() means end of file.
            break
    return collected


def process_multilingual_egs(args):
    """Distribute the examples of all input languages over the output scp files.

    For each output archive i (1-based) three files are written into
    args.egs_dir, all named with args.egs_prefix:
        <prefix>i.scp         the example scp lines,
        <prefix>output.i.ark  "<eg-id> output-<lang-index>" per example,
        <prefix>weight.i.ark  "<eg-id> <lang-weight>" per example.
    In addition <egs_dir>/info/<prefix>num_tasks and
    <egs_dir>/info/<prefix>num_archives are written.

    Examples are copied in blocks of args.block_size consecutive lines; each
    block is taken from the language that currently has the largest fraction
    of its examples remaining.

    Args:
        args: namespace as returned by get_args(); the attributes read are
            egs_scp_lists, egs_dir, egs_prefix, num_archives, block_size and
            lang2weight.

    Raises:
        ValueError: if num_archives or block_size is not a positive integer,
            or the number of weights differs from the number of languages.
    """
    scp_lists = args.egs_scp_lists
    num_langs = len(scp_lists)
    num_archives = args.num_archives

    # Validate up front, before any file is created, so that the failure is
    # explicit rather than a division error further down.
    if num_archives is None or num_archives < 1:
        raise ValueError("--num-archives must be a positive integer, "
                         "got {0}".format(num_archives))
    if args.block_size < 1:
        raise ValueError("--block-size must be a positive integer, "
                         "got {0}".format(args.block_size))

    lang_to_num_examples = [0] * num_langs
    for lang, scp in enumerate(scp_lists):
        with open(scp) as fh:
            lang_to_num_examples[lang] = sum(1 for _ in fh)
        logger.info("Number of examples for language {0} "
                    "is {1}.".format(lang, lang_to_num_examples[lang]))

    # If weights are not provided, the weights are 1.0.
    if args.lang2weight is None:
        lang2weight = [1.0] * num_langs
    else:
        lang2weight = args.lang2weight.split(",")
        if len(lang2weight) != num_langs:
            raise ValueError("--lang2weight has {0} entries but there are {1} "
                             "input languages".format(len(lang2weight), num_langs))

    os.makedirs(os.path.join(args.egs_dir, 'info'), exist_ok=True)

    with open("{0}/info/{1}num_tasks".format(args.egs_dir, args.egs_prefix), "w") as fh:
        print("{0}".format(num_langs), file=fh)

    # Total number of egs in all languages
    tot_num_egs = sum(lang_to_num_examples)

    with open("{0}/info/{1}num_archives".format(args.egs_dir, args.egs_prefix), "w") as fh:
        print("{0}".format(num_archives), file=fh)

    logger.info("There are a total of {} examples in the input scp "
                "files.".format(tot_num_egs))
    logger.info("Number of blocks in each output archive will be approximately "
                "{}, and block-size is {}.".format(int(round(tot_num_egs / num_archives / args.block_size)),
                                                   args.block_size))
    for lang in range(num_langs):
        blocks_per_archive_this_lang = lang_to_num_examples[lang] / num_archives / args.block_size
        warning = ""
        if blocks_per_archive_this_lang < 1.0:
            warning = ("Warning: This means some of the output archives might "
                       "not include any examples from this lang.")
        # Guard against all inputs being empty.
        proportion = (float(lang_to_num_examples[lang]) / tot_num_egs
                      if tot_num_egs > 0 else 0.0)
        logger.info("The proportion of egs from lang {} is {:.2f}. The number of blocks "
                    "per archive for this lang is approximately {:.2f}. "
                    "{}".format(lang, proportion,
                                blocks_per_archive_this_lang,
                                warning))

    in_scp_file_handles = []
    try:
        for scp in scp_lists:
            in_scp_file_handles.append(open(scp, 'r'))

        num_remaining_egs = tot_num_egs
        lang_to_num_remaining_egs = list(lang_to_num_examples)
        # +1 is because we write to the last archive in two rounds
        for round_index in range(num_archives + 1):
            last_round = (round_index == num_archives)
            if not last_round:
                archive_index = round_index
                num_remaining_archives = num_archives - round_index
                num_remaining_blocks = float(num_remaining_egs) / args.block_size
                num_blocks_this_archive = int(round(num_remaining_blocks / num_remaining_archives))
                logger.info("Generating archive {} containing {} blocks...".format(archive_index, num_blocks_this_archive))
            else:
                # This is the second round for the last archive: append one
                # more block per language to it.
                # NOTE(review): only num_langs blocks are read here, which
                # presumably covers everything left over from rounding --
                # confirm that no examples can remain afterwards.
                archive_index = num_archives - 1
                num_blocks_this_archive = num_langs
                logger.info("Writing all the {} remaining egs to the last archive...".format(num_remaining_egs))

            mode = 'a' if last_round else 'w'
            scp_name = '{0}/{1}{2}.scp'.format(args.egs_dir, args.egs_prefix, archive_index + 1)
            output_name = "{0}/{1}output.{2}.ark".format(args.egs_dir, args.egs_prefix, archive_index + 1)
            weight_name = "{0}/{1}weight.{2}.ark".format(args.egs_dir, args.egs_prefix, archive_index + 1)
            with open(scp_name, mode) as out_scp_file_handle, \
                    open(output_name, mode) as eg_to_output_file_handle, \
                    open(weight_name, mode) as eg_to_weight_file_handle:
                for _ in range(num_blocks_this_archive):
                    # Find the lang with the highest proportion of remaining
                    # examples; a language with no examples at all counts as 0.
                    remaining_proportions = [
                        (float(remain) / tot) if tot > 0 else 0.0
                        for remain, tot in zip(lang_to_num_remaining_egs, lang_to_num_examples)]
                    lang_index, _ = max(enumerate(remaining_proportions), key=lambda a: a[1])

                    # Read 'block_size' examples from the selected lang and
                    # write them to the current output scp file:
                    example_lines = read_lines(in_scp_file_handles[lang_index], args.block_size)
                    for eg_line in example_lines:
                        eg_id = eg_line.split()[0]
                        print(eg_line, file=out_scp_file_handle)
                        print("{0} output-{1}".format(eg_id, lang_index), file=eg_to_output_file_handle)
                        print("{0} {1}".format(eg_id, lang2weight[lang_index]), file=eg_to_weight_file_handle)

                    num_remaining_egs -= len(example_lines)
                    lang_to_num_remaining_egs[lang_index] -= len(example_lines)
    finally:
        # Close the inputs even when writing fails part-way through.
        for handle in in_scp_file_handles:
            handle.close()

    logger.info("Finished generating {0}*.scp, {0}output.*.ark "
                "and {0}weight.*.ark files. Wrote a total of {1} examples "
                "to {2} archives.".format(args.egs_prefix,
                                          tot_num_egs - num_remaining_egs, num_archives))


def main():
    """Script entry point: parse the command line and generate the egs lists.

    Any exception raised along the way is reported with a traceback and the
    process exits with status 1.
    """
    try:
        process_multilingual_egs(get_args())
    except Exception:
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()


================================================
FILE: egs/steps/nnet3/multilingual/combine_egs.sh
================================================
#!/usr/bin/env bash

# Copyright 2017     Pegah Ghahremani
#           2017-18  Vimal Manohar
#           2018     Hossein Hadian
# Apache 2.0

# This script generates examples for multilingual training of neural network
# using separate input egs dir per language as input.
# This script produces 3 sets of files --
# egs.*.scp, egs.output.*.ark, egs.weight.*.ark
#
# egs.*.scp are the SCP files of the training examples.
# egs.weight.*.ark map from the key of the example to the language-specific
# weight of that example.
# egs.output.*.ark map from the key of the example to the name of
# the output-node in the neural net for that specific language, e.g.
# 'output-2'.
#
# Begin configuration section.
# Job runner used to launch the allocation script (first argument is a log file).
cmd=run.pl
block_size=256          # This is the number of consecutive egs that we take from
                        # each source, and it only affects the locality of disk
                        # access.
lang2weight=            # comma-separated weights, one per input language, to scale example's output
                        # w.r.t its input language during training.
# stage <= 0 also allocates the training egs; stage <= 1 allocates the
# combine/train_diagnostic/valid_diagnostic egs.
stage=0

echo "$0 $@"  # Print the command line for logging

# Source the environment setup if present, then let parse_options.sh process
# the command-line options.
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# At least <num-input-langs>, one egs dir and the output dir are required;
# otherwise print the usage message and exit with an error.
if [ $# -lt 3 ]; then
  cat <<EOF
  This script generates examples for multilingual training of neural network
  using separate input egs dir per language as input.
  See top of the script for details.

  Usage: $0 [opts] <num-input-langs,N> <lang1-egs-dir> ...<langN-egs-dir> <multilingual-egs-dir>
   e.g.: $0 [opts] 2 exp/lang1/egs exp/lang2/egs exp/multi/egs

  Options:
      --cmd (utils/run.pl|utils/queue.pl <queue opts>)  # how to run jobs.
      --block-size <int|256>      # it is the number of consecutive egs that we take from
                                  # each source, and it only affects the locality of disk
                                  # access. This does not have to be the actual minibatch size
      --lang2weight <string|>     # comma-separated list of weights, one per input language
      --stage <int|0>             # 0: allocate training, combine and diagnostic egs;
                                  # 1: skip the allocation of the training egs
EOF
  exit 1;
fi

# First positional argument: the number of input languages.
num_langs=$1

# The remaining arguments are the per-language egs dirs followed by the
# output directory.
shift 1
args=("$@")
megs_dir=${args[-1]} # multilingual directory (last argument; negative index needs a recent bash)
# Note: the output directories are created before the argument count is checked.
mkdir -p $megs_dir
mkdir -p $megs_dir/info
# Expect exactly num_langs input dirs plus the one output dir.
if [ ${#args[@]} != $[$num_langs+1] ]; then
  echo "$0: num of input example dirs provided is not compatible with num_langs $num_langs."
  echo "Usage:$0 [opts] <num-input-langs,N> <lang1-egs-dir> ...<langN-egs-dir> <multilingual-egs-dir>"
  echo "Usage:$0 [opts] 2 exp/lang1/egs exp/lang2/egs exp/multi/egs"
  exit 1;
fi

# Files that every input egs dir must provide.
required="egs.scp combine.scp train_diagnostic.scp valid_diagnostic.scp"
# Space-separated lists of the per-language scp files, filled in below.
train_scp_list=
train_diagnostic_scp_list=
valid_diagnostic_scp_list=
combine_scp_list=

# read parameters from $egs_dir[0]/info and cmvn_opts
# to write in multilingual egs_dir.
check_params="info/feat_dim info/ivector_dim info/left_context info/right_context info/left_context_initial info/right_context_final cmvn_opts"
ivec_dim=`cat ${args[0]}/info/ivector_dim`
# The iVector extractor id only needs to match when iVectors are in use.
if [ $ivec_dim -ne 0 ];then check_params="$check_params info/final.ie.id"; fi

# Copy the parameters of the first language into the multilingual egs dir;
# frames_per_eg is copied but not compared across languages.
for param in $check_params info/frames_per_eg; do
  cat ${args[0]}/$param > $megs_dir/$param || exit 1;
done

tot_num_archives=0
for lang in $(seq 0 $[$num_langs-1]);do
  multi_egs_dir[$lang]=${args[$lang]}
  for f in $required; do
    if [ ! -f ${multi_egs_dir[$lang]}/$f ]; then
      echo "$0: no such file ${multi_egs_dir[$lang]}/$f." && exit 1;
    fi
  done
  # The multilingual setup uses the sum of the per-language archive counts.
  num_archives=$(cat ${multi_egs_dir[$lang]}/info/num_archives)
  tot_num_archives=$[tot_num_archives+num_archives]
  train_scp_list="$train_scp_list ${args[$lang]}/egs.scp"
  train_diagnostic_scp_list="$train_diagnostic_scp_list ${args[$lang]}/train_diagnostic.scp"
  valid_diagnostic_scp_list="$valid_diagnostic_scp_list ${args[$lang]}/valid_diagnostic.scp"
  combine_scp_list="$combine_scp_list ${args[$lang]}/combine.scp"

  # check parameter dimension to be the same in all egs dirs
  for f in $check_params; do
    if [ -f $megs_dir/$f ] && [ -f ${multi_egs_dir[$lang]}/$f ]; then
      f1=$(cat $megs_dir/$f)
      f2=$(cat ${multi_egs_dir[$lang]}/$f)
      if [ "$f1" != "$f2" ]  ; then
        echo "$0: mismatch for $f in $megs_dir vs. ${multi_egs_dir[$lang]}($f1 vs. $f2)."
        exit 1;
      fi
    else
      # A missing parameter file is only reported; it is not treated as fatal.
      echo "$0: file $f does not exist in $megs_dir or ${multi_egs_dir[$lang]}."
    fi
  done
done

# Forward the per-language weights to the allocation script when given.
# NOTE(review): the single quotes are stored literally in $egs_opt; presumably
# $cmd passes the command line through a shell that strips them -- confirm.
if [ ! -z "$lang2weight" ]; then
  egs_opt="--lang2weight '$lang2weight'"
fi

if [ $stage -le 0 ]; then
  echo "$0: allocating multilingual examples for training."
  # Generate egs.*.scp for multilingual setup.
  $cmd $megs_dir/log/allocate_multilingual_examples_train.log \
    steps/nnet3/multilingual/allocate_multilingual_examples.py $egs_opt \
      --num-archives $tot_num_archives \
      --block-size $block_size \
      $train_scp_list $megs_dir || exit 1;
fi

if [ $stage -le 1 ]; then
  echo "$0: combine combine.scp examples from all langs in $megs_dir/combine.scp."
  # Generate combine.scp for multilingual setup.
  $cmd $megs_dir/log/allocate_multilingual_examples_combine.log \
    steps/nnet3/multilingual/allocate_multilingual_examples.py $egs_opt \
      --num-archives 1 \
      --block-size $block_size \
      --egs-prefix "combine." \
      $combine_scp_list $megs_dir || exit 1;

  echo "$0: combine train_diagnostic.scp examples from all langs in $megs_dir/train_diagnostic.scp."
  # Generate train_diagnostic.scp for multilingual setup.
  $cmd $megs_dir/log/allocate_multilingual_examples_train_diagnostic.log \
    steps/nnet3/multilingual/allocate_multilingual_examples.py $egs_opt \
      --num-archives 1 \
      --block-size $block_size \
      --egs-prefix "train_diagnostic." \
      $train_diagnostic_scp_list $megs_dir || exit 1;


  echo "$0: combine valid_diagnostic.scp examples from all langs in $megs_dir/valid_diagnostic.scp."
  # Generate valid_diagnostic.scp for multilingual setup.
  $cmd $megs_dir/log/allocate_multilingual_examples_valid_diagnostic.log \
    steps/nnet3/multilingual/allocate_multilingual_examples.py $egs_opt \
      --num-archives 1 \
      --block-size $block_size \
      --egs-prefix "valid_diagnostic." \
      $valid_diagnostic_scp_list $megs_dir || exit 1;

fi
# The single-archive runs above write <prefix>1.scp, <prefix>output.1.ark and
# <prefix>weight.1.ark; drop the archive index from those names.
# Note: these renames run regardless of $stage.
for egs_type in combine train_diagnostic valid_diagnostic; do
  mv $megs_dir/${egs_type}.output.1.ark $megs_dir/${egs_type}.output.ark || exit 1;
  mv $megs_dir/${egs_type}.weight.1.ark $megs_dir/${egs_type}.weight.ark || exit 1;
  mv $megs_dir/${egs_type}.1.scp $megs_dir/${egs_type}.scp || exit 1;
done
# The training run (default prefix "egs.") wrote its info files with that
# prefix; rename them to the plain names.
mv $megs_dir/info/egs.num_archives $megs_dir/info/num_archives || exit 1;
mv $megs_dir/info/egs.num_tasks $megs_dir/info/num_tasks || exit 1;
echo "$0: Finished preparing multilingual training example."


================================================
FILE: egs/steps/nnet3/nnet3_to_dot.sh
================================================
#!/usr/bin/env bash

# script showing use of nnet3_to_dot.py
# Copyright 2015  Johns Hopkins University (Author: Vijayaditya Peddinti).

# Begin configuration section.
# Attributes of each component to print in the graph.
component_attributes="name,type"
# Optional comma-separated prefixes used to cluster nodes in the graph.
node_prefixes=""
# Binary used to dump the textual description of the model.
info_bin=nnet3-am-info
echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

if [ $# != 3 ]; then
  # The rendered graph is produced with 'dot -Tpdf', so the third argument is
  # a PDF file.
  echo "Usage: $0 [opts] <nnet3-mdl-file> <output-dot-file> <output-pdf-file>"
  echo " e.g.: $0 exp/sdm1/nnet3/lstm_sp/0.mdl lstm.dot lstm.pdf"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --info-bin <nnet3-am-info|nnet3-info>        # Name of the binary to generate the nnet3 file"
  echo "  --component-attributes <string|name,type>     # attributes to be printed in nnet3 components"
  echo "  --node-prefixes <string|Lstm1,Lstm2>          # list of prefixes. Nnet3 components/component-nodes with the same prefix"
  echo "                                                # will be clustered together in the dot-graph"


  exit 1;
fi

model=$1
dot_file=$2
output_file=$3

# Only pass --node-prefixes when a non-empty value was given.
attr=${node_prefixes:+ --node-prefixes "$node_prefixes"}
$info_bin $model | \
  steps/nnet3/dot/nnet3_to_dot.py \
    --component-attributes "$component_attributes" \
    $attr $dot_file
echo "Generated the dot file $dot_file"

# The dot file is still useful without graphviz, so this check comes after it
# has been written.
command -v dot >/dev/null 2>&1 || { echo >&2 "This script requires dot but it's not installed. Please compile $dot_file with dot"; exit 1; }
dot -Tpdf $dot_file -o $output_file


================================================
FILE: egs/steps/nnet3/report/convert_model.py
================================================
#!/usr/bin/env python3

# This script dumps the parameters of (most components of) an nnet3 model as a
# pickled python dict.  (see documentation for the function 'read_model' below
# for more details).
#
# It also contains some utility function that you can get access by importing this
# file.
#
# In egs/mini_librispeech/s5/local/chain/diagnostic/report_example.py, you can
# find an example of the use of this script.
#
# Copyright 2017-2018    Daniel Povey
# Apache 2.0.


# This requires python 3.

import sys
import subprocess
import numpy as np
import pickle


def read_next_token(s, pos):
   """Find the next whitespace-delimited token of string 's' at or after
      index 'pos'.

      A token is a nonempty run of non-whitespace characters.  's' must be a
      'str' and 'pos' a non-negative 'int'.
      Returns the tuple
         (token, new_pos)
      where new_pos is the index one past the last character of the token.
      If nothing but whitespace lies between 'pos' and the end of 's', the
      token is None and new_pos is where the scan stopped (len(s) whenever
      pos <= len(s)).
   """
   assert isinstance(s, str) and isinstance(pos, int)
   assert pos >= 0
   length = len(s)
   start = pos
   # Advance over leading whitespace to the first character of the token.
   while start < length and s[start].isspace():
      start += 1
   if start >= length:
      # Nothing but whitespace was left.
      return (None, start)
   end = start
   # Advance to the first whitespace character after the token (or the end).
   while end < length and not s[end].isspace():
      end += 1
   return (s[start:end], end)

def check_for_newline(s, pos):
   """Consume the run of whitespace in string 's' that starts at index 'pos'
      and report whether it contained a newline.

      Returns the tuple
         (saw_newline, new_pos)
      where saw_newline is True iff a "\\n" was among the consumed characters
      and new_pos is the index of the first character not consumed, i.e.
      either new_pos >= len(s) or s[new_pos] is not whitespace.
   """
   assert isinstance(s, str) and isinstance(pos, int)
   assert pos >= 0
   end = pos
   while end < len(s) and s[end].isspace():
      end += 1
   # s[pos:end] is exactly the whitespace that was skipped.
   return ("\n" in s[pos:end], end)

def read_float(s, pos):
   """This function, given a string s (probably a long string, like a line or a file)
      and a position 'pos', tries to read a text-format floating point or integer,
      starting from this position, and returns the
      pair (float, new_position).
      If the next token is missing (end of string) or is not a number, it
      prints a warning to stderr and returns (None, new_position), where
      new_position is the position after the offending token.
   """
   orig_pos = pos
   (tok, pos) = read_next_token(s, pos)
   try:
      return (float(tok), pos)
   except (TypeError, ValueError):
      # TypeError: tok is None (end of string); ValueError: not a number.
      print("{0}: at file position {1}, expected float but got {2}".format(
         sys.argv[0], orig_pos, tok), file=sys.stderr)
      return (None, pos)

def read_int(s, pos):
   """This function, given a string s (probably a long string, like a line or a
      file) and a position 'pos', tries to read a text-format integer, starting
      from this position, and returns the
      pair (int, new_position).
      If the next token is missing (end of string) or is not an integer, it
      prints a warning to stderr and returns (None, new_position), where
      new_position is the position after the offending token.
   """
   orig_pos = pos
   (tok, pos) = read_next_token(s, pos)
   try:
      return (int(tok), pos)
   except (TypeError, ValueError):
      # TypeError: tok is None (end of string); ValueError: not an integer.
      # The message is formatted in a single call; formatting it twice raised
      # IndexError here instead of reporting the problem.
      print("{0}: at file position {1}, expected int but got {2}".format(
         sys.argv[0], orig_pos, tok), file=sys.stderr)
      return (None, pos)

def read_vector(s, pos):
   """This function, given a string s (probably a long string, like a line or a file)
      and a position 'pos', tries to read a text-format vector (something like "[ 1.0 2.0 3.0 ]")
      starting from this position, reads it as a 1-dimensional numpy array
      of dtype float32, and returns the pair (vector, new_position).
      If the opening "[" is missing, an element is not a number, or the string
      ends before the closing "]", it prints a warning to stderr and returns
      (None, new_position).
   """
   orig_pos = pos
   (tok, pos) = read_next_token(s, pos)
   if tok != '[':
      print("{0}: at file position {1}, expected vector but got {2}".format(
         sys.argv[0], orig_pos, tok), file=sys.stderr)
      return (None, pos)
   v = []
   while True:
      (tok, pos) = read_next_token(s, pos)
      if tok is None or tok == ']':
         break
      try:
         v.append(float(tok))
      except ValueError:
         print("{0}: at file position {1}, reading vector, expected float but got {2}".
            format(sys.argv[0], pos, tok), file=sys.stderr)
         return (None, pos)
   if tok is None:
      # Ran off the end of the string without seeing the closing bracket.
      print("{0}: encountered EOF while reading vector.".format(
         sys.argv[0]), file=sys.stderr)
      return (None, pos)
   return (np.array(v, dtype=np.float32), pos)


def read_matrix(s, pos):
   """This function, given a string s (probably a long string, like a line or a file)
      and a position 'pos', tries to read a text-format matrix
      (something like "[\\n 1.0 2.0\\n 3.0 4.0 ]")
      starting from this position, reads it as a numpy array of dtype float32
      (one row per text line), and returns the pair (matrix, new_position).

      If the opening "[" is missing or an element is not a number, it prints a
      warning to stderr and returns (None, new_position).  The same happens if
      the rows cannot be converted into a numpy array.  If the string ends
      before the closing "]", a warning is printed and the rows read so far
      are converted and returned.
   """
   orig_pos = pos
   (tok, pos) = read_next_token(s, pos)
   if tok != '[':
      print("{0}: at file position {1}, expected matrix but got {2}".format(
         sys.argv[0], orig_pos, tok), file=sys.stderr)
      return (None, pos)
   # m will be an array of arrays (python arrays, not numpy arrays).
   m = []
   while True:
      # At this point, assume we're ready to read a new vector
      # (terminated by newline or by "]").
      v = []
      while True:
         (tok, pos) = read_next_token(s, pos)
         if tok == ']' or tok is None:
            break
         try:
            v.append(float(tok))
         except ValueError:
            print("{0}: at file position {1}, reading matrix, expected float but got {2}".format(
               sys.argv[0], pos, tok), file=sys.stderr)
            return (None, pos)

         (saw_newline, pos) = check_for_newline(s, pos)
         if saw_newline:  # Newline terminates each row of the matrix.
            break
      if len(v) > 0:
         m.append(v)
      if tok is None:
         # End of string: stop here, otherwise this loop would never end.
         print("{0}: matrix starting at position {1} was unexpectedly terminated by EOF.".format(
            sys.argv[0], orig_pos), file=sys.stderr)
         break
      if tok == ']':
         break
   try:
      ans_mat = np.array(m, dtype=np.float32)
   except (TypeError, ValueError):
      # E.g. rows of unequal length.
      print("{0}: error converting matrix starting at position {1} into numpy array.".format(
         sys.argv[0], orig_pos), file=sys.stderr)
      ans_mat = None
   return (ans_mat, pos)


def is_component_type(component_type):
   """Tell whether 'component_type' looks like a component-type token.

   Returns True only for a 'str' of at least 13 characters that starts with
   "<" and ends with "Component>" (e.g. "<SigmoidComponent>"); returns False
   for anything else, including non-string values."""
   if not isinstance(component_type, str):
      return False
   if len(component_type) < 13:
      return False
   return (component_type.startswith("<") and
           component_type.endswith("Component>"))


def read_generic(s, pos, terminating_token, action_dict):
   """Generic token-driven parser for an object stored as text in string 's'.

     Tokens are read one at a time starting at position 'pos' until a
     terminating token (or the end of the string) is reached.  Returns a pair
      (d, new_pos)
     where d is a dict holding the fields that were recognized and new_pos is
     the position in 's' after the last token consumed.

     'terminating_token' is either a single token (a whitespace-delimited
         string such as "</RectifiedLinearComponent>") or a set of such
         tokens, any of which ends the object.
     'action_dict' maps a token to a pair (function, dict_key): when the
         token is seen, function(s, pos) is called to read the data that
         follows it and the result is stored in d[dict_key].  For instance:
             action_dict['<ParameterMatrix>'] = (read_matrix, 'params')
     Tokens that are not keys of 'action_dict' are skipped, so the dict does
     not need to cover everything in the object.  If the string ends before
     a terminating token is found, a warning is printed to stderr and the
     fields read so far are returned.
   """

   if not isinstance(terminating_token, str):
      terminating_tokens = terminating_token
      assert isinstance(terminating_tokens, set)
   else:
      terminating_tokens = {terminating_token}
   assert isinstance(action_dict, dict)

   # Fields of the object, keyed by the names given in action_dict.
   fields = {}
   start_pos = pos
   while True:
      (tok, pos) = read_next_token(s, pos)
      if tok in terminating_tokens:
         break
      if tok is None:
         print("{0}: error reading object starting at position {1}, got EOF "
               "while expecting one of: {2}".format(
                  sys.argv[0], start_pos, terminating_tokens), file=sys.stderr)
         break
      if tok not in action_dict:
         # Not something we know how to read; skip it.
         continue
      entry = action_dict[tok]
      assert isinstance(entry, tuple) and len(entry) == 2
      assert callable(entry[0]) and isinstance(entry[1], str)
      reader, key = entry
      (value, pos) = reader(s, pos)
      fields[key] = value
   return (fields, pos)


def get_action_dict(component_type):
   """Returns the 'action_dict' to pass to 'read_generic' when reading a
      component of the given type (a string such as '<SigmoidComponent>').

      The result maps a token to a pair (function, dict_key): 'function' is
      the reader to call for the data following that token, and 'dict_key' is
      the key under which the value is stored, e.g.
          action_dict['<ParameterMatrix>'] = (read_matrix, 'params')

      An empty dict is returned for component types we know nothing about, so
      that nothing is read from them.
   """
   assert is_component_type(component_type)

   # Drop the leading '<' and the trailing 'Component>' (10 characters), so
   # that '<SigmoidComponent>' becomes 'Sigmoid'.
   kind = component_type[1:-10]

   if kind in ('Sigmoid', 'Tanh', 'RectifiedLinear',
               'Softmax', 'LogSoftmax', 'NoOp'):
      actions = {}
      actions['<Dim>'] = (read_int, 'dim')
      actions['<BlockDim>'] = (read_int, 'block-dim')
      actions['<ValueAvg>'] = (read_vector, 'value-avg')
      actions['<DerivAvg>'] = (read_vector, 'deriv-avg')
      actions['<OderivRms>'] = (read_vector, 'oderiv-rms')
      actions['<Count>'] = (read_float, 'count')
      actions['<OderivCount>'] = (read_float, 'oderiv-count')
   elif kind in ('Affine', 'NaturalGradientAffine'):
      # '<LinearParams>' is stored as plain 'params' so that it lines up with
      # what we use for LinearComponent.
      actions = {}
      actions['<LinearParams>'] = (read_matrix, 'params')
      actions['<BiasParams>'] = (read_vector, 'bias')
   elif kind == 'Linear':
      actions = {'<Params>': (read_matrix, 'params')}
   elif kind == 'BatchNorm':
      actions = {}
      actions['<Dim>'] = (read_int, 'dim')
      actions['<Count>'] = (read_float, 'count')
      actions['<StatsMean>'] = (read_vector, 'stats-mean')
      actions['<StatsVar>'] = (read_vector, 'stats-var')
   else:
      # Unknown component type: read nothing from it.
      actions = {}
   return actions


def get_stdout_from_command(command):
   """ Executes a command and returns its stdout output as a string.  The
       command is executed with shell=True, so it may contain pipes and
       other shell constructs.  Raises an exception if the command exits
       with nonzero status.

       Because the command goes through the shell, callers must not build it
       from untrusted input.
    """
   p = subprocess.Popen(command, shell=True,
                        stdout=subprocess.PIPE)

   stdout = p.communicate()[0]
   # Compare by value: 'is not 0' tests object identity, which only works by
   # accident of small-integer caching and triggers a SyntaxWarning.
   if p.returncode != 0:
      raise Exception("Command exited with status {0}: {1}".format(
         p.returncode, command))
   return stdout.decode()


def read_component(s, pos):
   """Reads one component from the string 's', starting at offset 'pos'.

      The first token at 'pos' is expected to be a component type such as
      <RectifiedLinearComponent>.  Reading continues until just after the
      matching end-marker (e.g. </RectifiedLinearComponent>) or, if that is
      never seen, until just after the next <ComponentName> token.

      Returns the pair (d, new_pos): d is a dict from element-name to the
      object read (e.g. d['params'] might be a matrix), with the extra keys
      'type' and 'raw-type' describing the component type; new_pos is the
      offset in 's' after the component.  If the first token is not a
      component type, an error is printed to stderr, input is skipped up to
      and including the next <ComponentName> (or to EOF), and
      (None, new_pos) is returned.
   """
   (component_type, pos) = read_next_token(s, pos)

   if is_component_type(component_type):
      closing_tag = "</" + component_type[1:]
      actions = get_action_dict(component_type)
      (fields, pos) = read_generic(s, pos,
                                   { closing_tag, '<ComponentName>' },
                                   actions)
      if fields is not None:
         fields['type'] = component_type             # e.g. '<LinearComponent>'
         fields['raw-type'] = component_type[1:-10]  # e.g. 'Linear'
      return (fields, pos)

   print("{0}: error reading Component: at position {1}, expected <xxxxComponent>,"
         " got: {2}".format(sys.argv[0], pos, component_type), file=sys.stderr)
   # Resynchronize: discard tokens until the next component starts or the
   # input runs out.
   tok = ''
   while tok is not None and tok != '<ComponentName>':
      (tok, pos) = read_next_token(s, pos)
   return (None, pos)


def read_model(filename):
   """Reads an nnet3 model from the provided filename, and returns a dict
      from the component-name to a dict containing things we have read
      in for that component.

      The model is converted to text by running 'nnet3-copy --binary=false'
      through the shell, so 'filename' must be trusted.  Returns None (after
      printing a message to stderr) if the text ends before all the expected
      components have been seen; an exception from the command itself is
      propagated."""
   command = "nnet3-copy --binary=false {0} -".format(filename)
   s = get_stdout_from_command(command)
   # The model starts with some structural stuff (component-nodes, etc.) that we
   # won't be attempting to parse.  We start parsing when we reach
   # <NumComponents>.
   pos = 0
   while True:
      (tok, pos) = read_next_token(s, pos)
      if tok is None:
         print("{0}: unexpected EOF on output of command {1}".format(
            sys.argv[0], command), file=sys.stderr)
         return None
      if tok == "<NumComponents>":
         break
   # we just read <NumComponents>; the next token is the component count.
   (tok, pos) = read_next_token(s, pos)
   if tok is None:
      print("{0}: unexpected EOF on output of command {1}".format(
         sys.argv[0], command), file=sys.stderr)
      return None
   num_components = int(tok)  # shouldn't fail.
   # 'd', which we return, will be a dict from component-name
   # (e.g. 'tdnn1.affine'), to a dict containing elements of the component.
   d = dict()
   for _ in range(num_components):
      # read the components one by one...
      component_pos = pos
      (tok, pos) = read_next_token(s, pos)
      if tok is None:
         print("{0}: unexpected EOF on output of command {1}".format(
            sys.argv[0], command), file=sys.stderr)
         return None
      # We normally expect that tok will be '<ComponentName>', but if we read in
      # '<ComponentName>' while parsing the previous component (e.g. if its text form was
      # not terminated in the way we expected), then '<ComponentName>' has
      # already been consumed and the token we just read is the name itself.
      if tok == '<ComponentName>':
         component_pos = pos
         (component_name, pos) = read_next_token(s, pos)
      else:
         component_name = tok
      # At this point the type of the component will be printed: something like
      # <NaturalGradientAffineComponent>.  We let 'read_component' take it from
      # here, and it will read until the terminating </NaturalGradientAffineComponent>,
      # or, in the case of error, to EOF or the next <ComponentName> string.
      (component, pos) = read_component(s, pos)
      if component is not None:
         d[component_name] = component
      else:
         print("{0}: error reading component with name {1} at position {2}".format(
            sys.argv[0], component_name, component_pos), file=sys.stderr)

   return d

def compute_derived_quantities(model):
   """This function, given a model as returned by 'read_model', computes certain
       potentially-useful derived quantities inside components: things like row
       and column norms of parameter matrices, standard deviations of
       accumulated stats.

       The results are stored in place, as new keys of each component's dict:
         'row-norms', 'col-norms'  for Linear/Affine/NaturalGradientAffine;
         'col-norms-3'             additionally, when the number of columns is
                                   divisible by 3;
         'stats-stddev'            for BatchNorm.
       Nothing is returned.
   """
   assert isinstance(model, dict)
   for c in model.values():
      # 'c' represents the component; it's a dict.
      raw_component_type = c['raw-type']
      if raw_component_type in {'Linear', 'Affine', 'NaturalGradientAffine'}:
         params = c['params'] # this is the parameter matrix.
         # compute the row and column norms of the parameter matrix.
         c['row-norms'] = np.sqrt(np.sum(params * params, axis=1))
         c['col-norms'] = np.sqrt(np.sum(params * params, axis=0))
         size = c['col-norms'].size
         if size % 3 == 0:
            # if the input-dim of this layer is divisible by 3, then compute the
            # column-norms after reshaping... this is a kind of pooled column-norm
            # that makes sense for TDNNs or wherever we have used Append().
            # Integer division is required: reshape() rejects a float dimension.
            block = size // 3
            squared = np.power(c['col-norms'], 2).reshape(3, block)
            c['col-norms-3'] = np.sqrt(np.sum(squared, axis=0))
            assert c['col-norms-3'].shape == (block,)

      if raw_component_type == 'BatchNorm':
         stats_var = c['stats-var']
         c['stats-stddev'] = np.sqrt(stats_var)

def compute_progress(model1, model2):
   """This function, given two models assumed to come from two successive
      iterations of training, computes certain component-level quantities
      that relate to the rate of change of parameters, and stores them in
      'model1'.

      For each Linear/Affine/NaturalGradientAffine component present in both
      models with identically-shaped parameters, the following keys are added
      to the component's dict in 'model1':
        'row-change', 'col-change'          norms of the parameter difference;
        'rel-row-change', 'rel-col-change'  the same divided by 'row-norms' /
                                            'col-norms', if those are present
                                            (see compute_derived_quantities);
        'col-change-3'                      if the number of columns is
                                            divisible by 3;
        'rel-col-change-3'                  likewise, if 'col-norms-3' exists.
      Nothing is returned.
   """
   for component_name, c1 in model1.items():
      if component_name not in model2:
         continue
      c2 = model2[component_name]
      raw_component_type = c1['raw-type']
      if raw_component_type not in {'Linear', 'Affine', 'NaturalGradientAffine'}:
         continue
      params1 = c1['params']
      params2 = c2['params']
      if params1.shape != params2.shape:
         continue  # can't compare them if the shapes differ.
      params_diff = params1 - params2
      c1['row-change'] = np.sqrt(np.sum(params_diff * params_diff, axis=1))
      c1['col-change'] = np.sqrt(np.sum(params_diff * params_diff, axis=0))
      # compute relative change in rows and columns; epsilon avoids division
      # by zero for all-zero rows/columns.
      epsilon = 1.0e-20
      if 'row-norms' in c1:
         c1['rel-row-change'] = c1['row-change'] / (c1['row-norms'] + epsilon)
      if 'col-norms' in c1:
         c1['rel-col-change'] = c1['col-change'] / (c1['col-norms'] + epsilon)

      # 'col-change' has one entry per column, so its size is the input-dim;
      # using it (rather than 'col-norms') works even if the derived
      # quantities were never computed for this model.
      size = c1['col-change'].size
      if size % 3 == 0:
         # if the input-dim of this layer is divisible by 3, then pool (sum)
         # the column changes over 3 blocks... this makes sense for TDNNs or
         # wherever we have used Append().  Integer division is required:
         # reshape() rejects a float dimension.
         c1['col-change-3'] = np.sum(c1['col-change'].reshape(3, size // 3), axis=0)
         if 'col-norms-3' in c1:
            c1['rel-col-change-3'] = c1['col-change-3'] / (c1['col-norms-3'] + epsilon)


def test():
   """Self-test, run when the script is invoked with no arguments.

      Checks the low-level readers on small in-memory strings, then reads the
      hard-coded model 'exp/chain_cleaned/tdnn1c_sp_bi/final.mdl' (which needs
      'nnet3-copy' to be runnable), computes its derived quantities and
      prints the result.
   """
   assert sys.version_info.major >= 3

   # (input string, start offset, expected (token, new offset))
   token_cases = [
      ("", 0, (None, 0)),
      ("hello", 0, ("hello", 5)),
      ("hello there", 0, ("hello", 5)),
      ("hello there", 5, ("there", 11)),
      ("hello there", 6, ("there", 11)),
   ]
   for (text, start, expected) in token_cases:
      assert read_next_token(text, start) == expected

   (vec, pos) = read_vector(" [ 1 2 3 ] ", 0)
   assert pos == 10 and np.array_equal(np.array([1,2,3], dtype=np.float32), vec)

   # (input string, start offset, expected (saw newline, new offset))
   newline_cases = [
      ("hello ", 4, (False, 4)),
      ("hello ", 5, (False, 6)),
      ("hello \n", 5, (True, 7)),
      ("hello \nthere", 5, (True, 7)),
   ]
   for (text, start, expected) in newline_cases:
      assert check_for_newline(text, start) == expected

   (mat, pos) = read_matrix(" [\n 1 2 3\n 4 5 6 ] ", 0)
   assert pos == 18 and np.array_equal(np.array([[1,2,3],[4,5,6]], dtype=np.float32), mat)

   # An unknown token ('<ignore_this>' and its value) must be skipped.
   s = "  <ignore_this> 1 <some_vec> [ 1 2 3 ] <end>"
   actions = { '<some_vec>': (read_vector, 'some_vec') }
   (obj, pos) = read_generic(s, 0, "<end>", actions)
   assert pos == len(s)
   assert np.array_equal(obj['some_vec'], np.array([1, 2, 3], dtype=np.float32))

   model = read_model('exp/chain_cleaned/tdnn1c_sp_bi/final.mdl')
   compute_derived_quantities(model)
   print("model is: {0}".format(model))
   print("tested")


# Command-line entry point: read an nnet3 model and pickle the parsed dict.
if __name__ == '__main__':
   if len(sys.argv) == 1:
      # No arguments: run the self-test.  NOTE(review): control then falls
      # through to the usage check below, so the script still prints the usage
      # message and exits with status 1 -- confirm whether that is intended.
      test()

   if len(sys.argv) != 3:
      print("Usage: {0} <nnet3-model-in> <pickled-model-out>".format(
         sys.argv[0]), file=sys.stderr)
      sys.exit(1)

   m = read_model(sys.argv[1])
   if m is not None:
      try:
         # 'with' guarantees the output file is flushed and closed.
         with open(sys.argv[2], "wb") as f:
            pickle.dump(m, f)
      except Exception:
         # The message has two placeholders: program name and output path.
         print("{0}: error writing to {1}".format(
            sys.argv[0], sys.argv[2]), file=sys.stderr)


================================================
FILE: egs/steps/nnet3/report/generate_plots.py
================================================
#!/usr/bin/env python

# Copyright 2016    Vijayaditya Peddinti
#           2016    Vimal Manohar
# Apache 2.0.

from __future__ import division
import argparse
import errno
import logging
import os
import re
import sys
import warnings

sys.path.insert(0, 'steps')
import libs.nnet3.report.log_parse as log_parse
import libs.common as common_lib

try:
    import matplotlib as mpl
    mpl.use('Agg')
    import matplotlib.pyplot as plt
    import numpy as np
    from matplotlib.patches import Rectangle
    # matplotlib issue https://github.com/matplotlib/matplotlib/issues/12513
    # plt.subplot() generates a false-positive warning; suppress it for now.
    from matplotlib.cbook import MatplotlibDeprecationWarning
    warnings.filterwarnings('ignore', category=MatplotlibDeprecationWarning,
                            message='Adding an axes using the same arguments')
    g_plot = True
except ImportError:
    g_plot = False


# Configure logging for the script: each message is prefixed with the source
# file name, line number and level; INFO and above are emitted.
logging.basicConfig(format="%(filename)s:%(lineno)s:%(levelname)s:%(message)s",
                    level=logging.INFO)
# Module-level logger used by the functions below.
logger = logging.getLogger(__name__)


def get_args():
    """Parse and validate the command-line arguments of this script.

    Returns the argparse namespace.  Raises Exception if more than 6
    comparison directories (or more than 7 experiment directories in total)
    are given, or if both --is-chain and --is-rnnlm are true; asserts that
    --start-iter is at least 1.
    """
    # Keep the description's line breaks and also show option defaults.
    help_formatter = type('', (argparse.RawDescriptionHelpFormatter,
                               argparse.ArgumentDefaultsHelpFormatter), {})
    description = (
        "Parses the training logs and generates a variety of plots.\n"
        "e.g.: %(prog)s \\\n"
        "  exp/nnet3/tdnn exp/nnet3/tdnn1 exp/nnet3/tdnn2 exp/nnet3/tdnn/report.\n"
        "The report file 'report.pdf' will be generated in the <output_dir> directory.")
    parser = argparse.ArgumentParser(
        prog=sys.argv[0],  # By default, prog is set this to filename only.
        formatter_class=help_formatter,
        description=description)

    add = parser.add_argument
    add("--start-iter", type=int, metavar='N', default=1,
        help="Iteration from which plotting will start.")
    add("--is-chain", type=common_lib.str_to_bool, default='false', metavar='BOOL',
        help="Set to 'true' if <exp_dir>s contain chain models.")
    add("--is-rnnlm", type=common_lib.str_to_bool, default='false', metavar='BOOL',
        help="Set to 'true' if <exp_dir>s contain RNNLM.")
    add("--output-nodes", type=str, metavar='NODES',
        action=common_lib.NullstrToNoneAction,
        help="List of space separated <output-node>:<objective-type> entries, "
        "one for each output node")
    add("--comparison-dir", type=str, metavar='DIR', action='append',
        help="[DEPRECATED] Experiment directories for comparison. "
        "These will only be used for plots, not tables.")
    add("exp_dir", nargs='+',
        help="The first <exp_dir> is the current experiment directory, e.g. "
        "'exp/nnet3/tdnn'; the rest are up to 6 optional directories of other "
        "experiments to be graphed on same plots for comparison.")
    add("output_dir",
        help="output directory for reports, e.g. 'exp/nnet3/tdnn/report'")

    args = parser.parse_args()

    too_many_comparison_dirs = (args.comparison_dir is not None
                                and len(args.comparison_dir) > 6)
    too_many_exp_dirs = (args.exp_dir is not None
                         and len(args.exp_dir) > 7)
    if too_many_comparison_dirs or too_many_exp_dirs:
        raise Exception(
            "Up to 6 comparison directories may be specified. "
            "If you want to compare with more experiments, you would have to carefully tune "
            "the plot_colors variable which specified colors used for plotting.")
    assert args.start_iter >= 1
    if args.is_chain and args.is_rnnlm:
        raise Exception("Options --is-chain and --is-rnnlm cannot be both true.")
    return args


# One plot color per experiment directory, indexed by the directory's position
# in the list being plotted.  There are 7 entries, which is why get_args()
# limits the run to 7 directories (the main one plus 6 for comparison).
g_plot_colors = ['red', 'blue', 'green', 'black', 'magenta', 'yellow', 'cyan']

class LatexReport(object):
    """Accumulates LaTeX fragments for a report and compiles them with pdflatex.

    'pdf_file' is the path of the PDF to produce; the LaTeX source is written
    next to it, with the extension replaced by '.tex'.
    """

    def __init__(self, pdf_file):
        self.pdf_file = pdf_file
        # List of LaTeX fragments; joined with newlines when compiling.
        self.document = []
        self.document.append(r"""
\documentclass[prl,10pt,twocolumn]{revtex4}
\usepackage{graphicx}    % Used to import the graphics
\begin{document}
""")

    def add_figure(self, figure_pdf, title):
        """Append a full-page figure that includes 'figure_pdf', captioned 'title'.

        Underscores in the title are escaped for LaTeX.  This is the only
        replacement done so far; the list may need extending as other
        characters are found to break compilation.
        """
        # Plain string replacement: no regex is needed, and r"\_" avoids the
        # invalid escape sequence that "\_" is in a normal string literal.
        title = r"\texttt{" + title.replace("_", r"\_") + "}"
        fig_latex = r"""
%...
\newpage
\begin{figure}[h]
  \begin{center}
    \caption{""" + title + r"""}
    \includegraphics[width=\textwidth]{""" + figure_pdf + r"""}
  \end{center}
\end{figure}
\clearpage
%...
"""
        self.document.append(fig_latex)

    def close(self):
        """Terminate the document and compile it; returns what compile() returns."""
        self.document.append(r"\end{document}")
        return self.compile()

    def compile(self):
        """Write the .tex file and run pdflatex on it.

        Returns True on success, or False (after logging a warning) if the
        pdflatex command raised an exception.
        """
        root, _ = os.path.splitext(self.pdf_file)
        # dirname() is '' for a bare file name; pdflatex needs a real
        # directory for -output-directory, so fall back to the current one.
        dir_name = os.path.dirname(self.pdf_file) or "."
        latex_file = root + ".tex"
        with open(latex_file, "w") as lat_file:
            lat_file.write("\n".join(self.document))
        logger.info("Compiling the LaTeX report.")
        try:
            common_lib.execute_command(
                "pdflatex -interaction=batchmode "
                "-output-directory={0} {1}".format(dir_name, latex_file))
        except Exception as e:
            logger.warning("There was an error compiling LaTeX file %s. "
                           "Check report.log generated by pdflatex in the same directory. %s",
                           latex_file, e)
            return False
        return True


def latex_compliant_name(name_string):
    """Return 'name_string' rewritten so that LaTeX accepts it in a file name.

    This is needed because nnet3 allows component names that LaTeX does not.
    Incompatibilities handled so far:
        1. LaTeX does not allow a dot (.) in file names; every dot is
           replaced by '_dot_'.
    """
    # A literal replacement needs no regex, and avoids the invalid escape
    # sequence that "\." is in a normal (non-raw) string literal.
    return name_string.replace(".", "_dot_")


def generate_acc_logprob_plots(exp_dir, output_dir, plot, key='accuracy',
        file_basename='accuracy', comparison_dir=None,
        start_iter=1, latex_report=None, output_name='output'):
    """Write the train/valid report for 'key' and, optionally, plot it.

    The report for 'exp_dir' is written to <output_dir>/<file_basename>.log.
    If 'plot' is true, the train (dashed) and valid (solid) curves of
    'exp_dir' and of every directory in 'comparison_dir' are drawn on one
    figure, restricted to iterations >= start_iter, and saved as
    <output_dir>/<file_basename>_<output_name>.pdf; the figure is also added
    to 'latex_report' if one is given.  If any directory yields no data rows,
    a warning is logged and no plot is produced.
    """
    assert start_iter >= 1

    comparison_dir = [] if comparison_dir is None else comparison_dir
    dirs = [exp_dir] + comparison_dir

    fig = plt.figure() if plot else None
    plots = []
    try:
        for index, dir_name in enumerate(dirs):
            [report, _times, data] = log_parse.generate_acc_logprob_report(
                dir_name, key, output_name)
            if index == 0:
                # this is the main experiment directory
                with open("{0}/{1}.log".format(output_dir,
                                               file_basename), "w") as f:
                    f.write(report)

            if not plot:
                continue

            color_val = g_plot_colors[index]
            data = np.array(data)
            if data.shape[0] == 0:
                logger.warning("Couldn't find any rows for the "
                               "accuracy/log-probability plot, not generating it")
                return
            # Column 0 is the iteration; columns 1 and 2 are plotted as the
            # train and valid curves respectively.
            data = data[data[:, 0] >= start_iter, :]
            plot_handle, = plt.plot(data[:, 0], data[:, 1], color=color_val,
                                    linestyle="--",
                                    label="train {0}".format(dir_name))
            plots.append(plot_handle)
            plot_handle, = plt.plot(data[:, 0], data[:, 2], color=color_val,
                                    label="valid {0}".format(dir_name))
            plots.append(plot_handle)

        if plot:
            plt.xlabel('Iteration')
            plt.ylabel(key)
            # The legend is placed below the axes; it is pushed further down
            # as the number of directories (and so legend rows) grows.
            lgd = plt.legend(handles=plots, loc='lower center',
                             bbox_to_anchor=(0.5, -0.2 + len(dirs) * -0.1),
                             ncol=1, borderaxespad=0.)
            plt.grid(True)
            fig.suptitle("{0} plot for {1}".format(key, output_name))
            figfile_name = '{0}/{1}_{2}.pdf'.format(
                output_dir, file_basename,
                latex_compliant_name(output_name))
            plt.savefig(figfile_name, bbox_extra_artists=(lgd,),
                        bbox_inches='tight')
            if latex_report is not None:
                latex_report.add_figure(
                    figfile_name,
                    "Plot of {0} vs iterations for {1}".format(key, output_name))
    finally:
        # Release the figure; otherwise one figure per call stays open for
        # the lifetime of the process.
        if fig is not None:
            plt.close(fig)


# The names of the five gates of an LSTMP nonlinearity.  The position of a
# name in this list is the 'gate_index' used when plotting that gate.
g_lstm_gate = ['i_t_sigmoid', 'f_t_sigmoid', 'c_t_tanh', 'o_t_sigmoid', 'm_t_tanh']

# "extra" is an invisible placeholder legend handle.  Each legend entry is
# made of a handle (the line style) and a label (the description); entries
# that have a label but no line style use "extra" as their handle.
# It is only defined when matplotlib could be imported.
if g_plot:
    extra = Rectangle((0, 0), 1, 1, facecolor="w", fill=False, edgecolor='none', linewidth=0)

# Adds the legend entries of one experiment directory to the legend lists,
# in place.  'column_index' is 1-based.
def insert_a_column_legend(legend_handle, legend_label, lp, mp, hp,
        dir, prefix_length, column_index):
    # One entry per legend row: the directory label (with an invisible
    # handle), followed by the three line handles with empty labels.
    handles = (extra, lp, mp, hp)
    labels = ("[1]{0}".format(dir[prefix_length:]), "", "", "")
    for row, (row_handle, row_label) in enumerate(zip(handles, labels), 1):
        slot = column_index * row - 1
        legend_handle.insert(slot, row_handle)
        legend_label.insert(slot, row_label)


# This function is used to plot a normal nonlinearity component or a gate of lstmp
def plot_a_nonlin_component(fig, dirs, stat_tables_per_component_per_dir,
        component_name, common_prefix, prefix_length, component_type,
        start_iter, gate_index=0, with_oderiv=0):
    """Draw the percentile plots of one nonlinearity component (or LSTM gate).

    'fig' is cleared first.  For every directory in 'dirs' that has a stats
    table for 'component_name', the 5th/50th/95th percentile columns are drawn
    in stacked subplots: value and derivative, plus output-derivative when
    'with_oderiv' is true.  Each table row starts with the iteration number;
    rows before 'start_iter' are dropped.  'gate_index' selects the block of
    columns of one LSTM gate (0 for ordinary components).

    Returns the legend object, so the caller can pass it to savefig().
    """
    fig.clf()
    legend_handle = [extra, extra, extra, extra]
    legend_label = ["", '5th percentile', '50th percentile', '95th percentile']

    # Each panel is (y-label prefix, offset of its 5th-percentile column); the
    # 50th and 95th percentile columns follow immediately.  The offsets match
    # the column order of the nonlinstats_*.log headers.
    if with_oderiv:
        panels = [('Value', 7), ('Derivative', 10), ('Oderivative', 13)]
        legend_y_offset = -1.5
    else:
        panels = [('Value', 5), ('Derivative', 8)]
        legend_y_offset = -0.5
    num_panels = len(panels)
    # NOTE(review): the per-gate column stride is 10 in both modes, as in the
    # original code -- confirm that this is right for LSTM stats with oderiv.
    gate_offset = gate_index * 10

    # Handles used for the legend row of a directory that lacks this
    # component.  Starting from the invisible placeholder avoids an
    # UnboundLocalError when the very first directory lacks it; afterwards
    # the handles of the most recently drawn lines are reused, as before.
    lp = mp = hp = extra

    for index, dir_name in enumerate(dirs):
        color_val = g_plot_colors[index]
        column_index = index + 2
        try:
            iter_stats = (stat_tables_per_component_per_dir[dir_name][component_name])
        except KeyError:
            # this component is not available in this network so lets
            # not just plot it
            insert_a_column_legend(legend_handle, legend_label, lp, mp, hp,
                    dir_name, prefix_length, column_index)
            continue

        data = np.array(iter_stats)
        data = data[data[:, 0] >= start_iter, :]

        for panel_index, (panel_name, first_col) in enumerate(panels):
            ax = plt.subplot(num_panels, 1, panel_index + 1)
            col = gate_offset + first_col
            lp, = ax.plot(data[:, 0], data[:, col], color=color_val,
                    linestyle='--')
            mp, = ax.plot(data[:, 0], data[:, col + 1], color=color_val,
                    linestyle='-')
            hp, = ax.plot(data[:, 0], data[:, col + 2], color=color_val,
                    linestyle='--')
            if panel_index == 0:
                # The legend entries of a directory refer to the lines of
                # its top (value) panel.
                insert_a_column_legend(legend_handle, legend_label, lp, mp, hp,
                        dir_name, prefix_length, column_index)
            if panel_index == num_panels - 1:
                ax.set_xlabel('Iteration')
            ax.set_ylabel('{0}-{1}'.format(panel_name, component_type))
            ax.grid(True)

        if with_oderiv:
            plt.subplots_adjust(top=0.8, hspace = 1.0, bottom = -0.2)

    lgd = plt.legend(legend_handle, legend_label, loc='lower center',
            bbox_to_anchor=(0.5 , legend_y_offset + len(dirs) * -0.2),
            ncol=4, handletextpad = -2, title="[1]:{0}".format(common_prefix),
            borderaxespad=0.)
    plt.grid(True)

    return lgd


# This function is used to generate the statistic plots of nonlinearity component
# Mainly divided into the following steps:
# 1) With log_parse function, we get the statistics from each directory.
# 2) Convert the collected nonlinearity statistics into the tables. Each table
#    contains all the statistics in each component of each directory.
# 3) The statistics of each component are stored into corresponding log files.
#    Each line of the log file contains the statistics of one iteration.
# 4) Plot the "Per-dimension average-(value, derivative) percentiles" figure
#    for each nonlinearity component.
def generate_nonlin_stats_plots(exp_dir, output_dir, plot, comparison_dir=None,
                                start_iter=1, latex_report=None):
    """Write per-component nonlinearity statistics logs and, optionally, plots.

    For every component of 'exp_dir' a tab-separated table is written to
    <output_dir>/nonlinstats_<component>.log.  If 'plot' is true, one PDF per
    component (one per gate for LstmNonlinearity components) is saved in
    'output_dir', comparing 'exp_dir' with the directories in
    'comparison_dir', and each figure is added to 'latex_report' if given.
    """
    assert start_iter >= 1

    comparison_dir = [] if comparison_dir is None else comparison_dir
    dirs = [exp_dir] + comparison_dir

    # 1) Collect the raw statistics of every directory.
    stats_per_dir = {}
    for dir_name in dirs:
        stats_per_component_per_iter = (
            log_parse.parse_progress_logs_for_nonlinearity_stats(dir_name))
        for key in stats_per_component_per_iter:
            if len(stats_per_component_per_iter[key]['stats']) == 0:
                logger.warning("Couldn't find any rows for the "
                               "nonlin stats plot, not generating it")
        stats_per_dir[dir_name] = stats_per_component_per_iter

    # 2) Convert the nonlin stats into tables: one row per iteration, with
    #    the iteration number as the first column.
    stat_tables_per_component_per_dir = {}
    last_stats_row = None
    for dir_name in dirs:
        stat_tables_per_component = {}
        for component_name, comp_data in stats_per_dir[dir_name].items():
            comp_stats = comp_data['stats']
            iter_stats = []
            for iteration in sorted(comp_stats):
                last_stats_row = comp_stats[iteration]
                iter_stats.append([iteration] + last_stats_row)
            stat_tables_per_component[component_name] = iter_stats
        stat_tables_per_component_per_dir[dir_name] = stat_tables_per_component

    # A row of 15 statistics means the oderiv-rms statistics are present.  As
    # before, this is judged from the last row seen; tracking that row
    # explicitly avoids relying on leaked loop variables, which failed when
    # there were no components or the last component had no statistics.
    with_oderiv = 0
    if last_stats_row is not None and len(last_stats_row) == 15:
        with_oderiv = 1

    # 3) Write the tables of the main experiment directory.
    main_stat_tables = stat_tables_per_component_per_dir[exp_dir]
    if with_oderiv:
        # with oderiv-rms
        header = ("Iteration\tValueMean\tValueStddev\tDerivMean\tDerivStddev\t"
                  "OderivMean\tOderivStddev\t"
                  "Value_5th\tValue_50th\tValue_95th\t"
                  "Deriv_5th\tDeriv_50th\tDeriv_95th\t"
                  "Oderiv_5th\tOderiv_50th\tOderiv_95th\n")
    else:
        # without oderiv-rms
        header = ("Iteration\tValueMean\tValueStddev\tDerivMean\tDerivStddev\t"
                  "Value_5th\tValue_50th\tValue_95th\t"
                  "Deriv_5th\tDeriv_50th\tDeriv_95th\n")
    for component_name, iter_stats in main_stat_tables.items():
        with open("{dir}/nonlinstats_{comp_name}.log".format(
                    dir=output_dir, comp_name=component_name), "w") as f:
            f.write(header)
            f.write("\n".join(
                "\t".join(str(x) for x in row) for row in iter_stats))

    if not plot:
        return

    # 4) Plot.
    main_component_names = sorted(main_stat_tables)
    plot_component_names = set(main_component_names)
    for dir_name in dirs:
        plot_component_names = plot_component_names.intersection(
            stats_per_dir[dir_name])
    if sorted(plot_component_names) != main_component_names:
        logger.warning("The components in all the neural networks in the "
                       "given experiment dirs are not the same, so comparison plots are "
                       "provided only for common component names. Make sure that these are "
                       "comparable experiments before analyzing these plots.")

    fig = plt.figure()

    # The legend shows each directory relative to the longest common leading
    # path of all directories.
    common_prefix = os.path.commonprefix(dirs)
    # rfind() returns -1 when there is no '/'; clamp to 0 so that no
    # character is chopped off the prefix or the per-directory labels.
    prefix_length = max(common_prefix.rfind('/'), 0)
    common_prefix = common_prefix[0:prefix_length]

    for component_name in main_component_names:
        comp_type = stats_per_dir[exp_dir][component_name]['type']
        comp_name = latex_compliant_name(component_name)

        # Each job is (gate_index, component_type, title, figfile_name).
        if comp_type == 'LstmNonlinearity':
            jobs = []
            for gate_index, gate in enumerate(g_lstm_gate):
                jobs.append((
                    gate_index,
                    'Lstm-' + gate,
                    "Per-dimension average-(value, derivative) percentiles for "
                    "{component_name}-{gate}".format(
                        component_name=component_name, gate=gate),
                    '{dir}/nonlinstats_{comp_name}_{gate}.pdf'.format(
                        dir=output_dir, comp_name=comp_name, gate=gate)))
        else:
            if with_oderiv:
                title = ("Per-dimension average-(value, derivative) and rms-oderivative percentiles for "
                         "{component_name}".format(component_name=component_name))
            else:
                title = ("Per-dimension average-(value, derivative) percentiles for "
                         "{component_name}".format(component_name=component_name))
            jobs = [(0, comp_type, title,
                     '{dir}/nonlinstats_{comp_name}.pdf'.format(
                         dir=output_dir, comp_name=comp_name))]

        for gate_index, component_type, title, figfile_name in jobs:
            lgd = plot_a_nonlin_component(fig, dirs,
                    stat_tables_per_component_per_dir, component_name,
                    common_prefix, prefix_length, component_type, start_iter,
                    gate_index, with_oderiv)
            fig.suptitle(title)
            fig.savefig(figfile_name, bbox_extra_artists=(lgd,),
                        bbox_inches='tight')
            if latex_report is not None:
                latex_report.add_figure(figfile_name, title)

    # The figure is reused for every component; release it once done.
    plt.close(fig)


def generate_clipped_proportion_plots(exp_dir, output_dir, plot,
                                      comparison_dir=None, start_iter=1,
                                      latex_report=None):
    """Write the clipped-proportion table and optionally plot it per component.

    The statistics of every directory are obtained from
    log_parse.parse_progress_logs_for_clipped_proportion().  The 'table'
    entry of the main experiment directory is written, tab separated, to
    <output_dir>/clipped_proportion.log.  When `plot` is true, one PDF per
    component of the main experiment is saved to
    <output_dir>/clipped_proportion_<component>.pdf, with one curve for each
    directory that has statistics for that component.

    Args:
        exp_dir: main experiment directory.
        output_dir: directory where the table and the figures are written.
        plot: if true, figures are generated in addition to the table.
        comparison_dir: optional list of further experiment directories whose
            curves are drawn on the same axes.
        start_iter: first iteration shown in the plots; must be >= 1.
        latex_report: optional report object; every saved figure is
            registered with its add_figure() method.

    Returns:
        None.  Returns early (after a warning) when the main experiment
        directory has no clipped-proportion statistics.
    """
    assert(start_iter >= 1)

    comparison_dir = [] if comparison_dir is None else comparison_dir
    dirs = [exp_dir] + comparison_dir
    stats_per_dir = {}
    for cur_dir in dirs:
        try:
            stats_per_dir[cur_dir] = (
                log_parse.parse_progress_logs_for_clipped_proportion(cur_dir))
        except log_parse.MalformedClippedProportionLineException as e:
            # a malformed line is a real error, never silently skipped
            raise e
        except common_lib.KaldiCommandException as e:
            logger.warning("Could not extract the clipped proportions for %s, "
                           "this might be because there are no ClipGradientComponents.", cur_dir)
            continue
        if len(stats_per_dir[cur_dir]) == 0:
            logger.warning("Couldn't find any rows for the "
                           "clipped proportion plot, not generating it")
    try:
        main_cp_stats = stats_per_dir[exp_dir]['table']
    except KeyError:
        logger.warning("The main experiment directory %s does not have clipped proportions. "
                       "Not generating clipped proportion plots.", exp_dir)
        return

    # this is the main experiment directory
    iter_stat_report = "".join(
        "\t".join([str(x) for x in row]) + "\n" for row in main_cp_stats)
    # the context manager closes the file even if the write fails
    with open("{dir}/clipped_proportion.log".format(dir=output_dir), "w") as f:
        f.write(iter_stat_report)

    if plot:
        main_component_names = sorted(stats_per_dir[exp_dir]['cp_per_iter_per_component'])
        plot_component_names = set(main_component_names)
        for cur_dir in dirs:
            try:
                component_names = set(stats_per_dir[cur_dir]['cp_per_iter_per_component'])
                plot_component_names = (
                    plot_component_names.intersection(component_names))
            except KeyError:
                # no statistics were extracted for this directory
                continue
        plot_component_names = sorted(plot_component_names)
        if plot_component_names != main_component_names:
            logger.warning(
                "The components in all the neural networks in the given "
                "experiment dirs are not the same, so comparison plots are "
                "provided only for common component names. Make sure that these "
                "are comparable experiments before analyzing these plots.")

        fig = plt.figure()
        for component_name in main_component_names:
            fig.clf()
            plots = []
            # the colour is tied to the position of the directory in `dirs`,
            # so a directory keeps its colour even when others are skipped
            for index, cur_dir in enumerate(dirs):
                color_val = g_plot_colors[index]
                try:
                    iter_stats = stats_per_dir[cur_dir][
                        'cp_per_iter_per_component'][component_name]
                except KeyError:
                    # this component is not available in this network so lets
                    # not just plot it
                    continue

                data = np.array(iter_stats)
                data = data[data[:, 0] >= start_iter, :]
                ax = plt.subplot(111)
                mp, = ax.plot(data[:, 0], data[:, 1], color=color_val,
                              label="Clipped Proportion {0}".format(cur_dir))
                plots.append(mp)
                ax.set_ylabel('Clipped Proportion')
                ax.set_ylim([0, 1.2])
                ax.grid(True)
            lgd = plt.legend(handles=plots, loc='lower center',
                             bbox_to_anchor=(0.5, -0.5 + len(dirs) * -0.2),
                             ncol=1, borderaxespad=0.)
            plt.grid(True)
            fig.suptitle("Clipped-proportion value at {comp_name}".format(
                            comp_name=component_name))
            comp_name = latex_compliant_name(component_name)
            figfile_name = '{dir}/clipped_proportion_{comp_name}.pdf'.format(
                dir=output_dir, comp_name=comp_name)
            fig.savefig(figfile_name, bbox_extra_artists=(lgd,),
                        bbox_inches='tight')
            if latex_report is not None:
                latex_report.add_figure(
                    figfile_name,
                    "Clipped proportion at {0}".format(component_name))


def generate_parameter_diff_plots(exp_dir, output_dir, plot,
                                  comparison_dir=None, start_iter=1,
                                  latex_report=None):
    """Write parameter-difference tables and optionally plot them.

    For every directory, both "Parameter differences" and "Relative parameter
    differences" are obtained from
    log_parse.parse_progress_logs_for_param_diff().  The statistics of the
    main experiment directory are written to <output_dir>/parameter.diff and
    <output_dir>/relative_parameter.diff, one line per iteration, with "NA"
    for iterations that have no value for a component.  When `plot` is true,
    one two-panel PDF per component is saved to
    <output_dir>/param_diff_<component>.pdf.

    Args:
        exp_dir: main experiment directory.
        output_dir: directory where the tables and the figures are written.
        plot: if true, figures are generated in addition to the tables.
        comparison_dir: optional list of further experiment directories whose
            curves are drawn on the same axes.
        start_iter: must be >= 1; it is only validated in this function.
        latex_report: optional report object; every saved figure is
            registered with its add_figure() method.

    Raises:
        Exception: if, while plotting, a component has no statistics even in
            the main experiment directory.
    """
    # Parameter changes
    assert start_iter >= 1

    comparison_dir = [] if comparison_dir is None else comparison_dir
    dirs = [exp_dir] + comparison_dir
    key_file = {"Parameter differences": "parameter.diff",
                "Relative parameter differences": "relative_parameter.diff"}
    stats_per_dir = {}
    for cur_dir in dirs:
        stats_per_dir[cur_dir] = {}
        for key in key_file:
            stats_per_dir[cur_dir][key] = (
                log_parse.parse_progress_logs_for_param_diff(cur_dir, key))

    # write down the stats for the main experiment directory
    for diff_type in key_file:
        with open("{0}/{1}".format(output_dir, key_file[diff_type]), "w") as f:
            diff_per_component_per_iter = (
                stats_per_dir[exp_dir][diff_type]['progress_per_component'])
            component_names = (
                stats_per_dir[exp_dir][diff_type]['component_names'])
            max_iter = stats_per_dir[exp_dir][diff_type]['max_iter']
            f.write(" ".join(["Iteration"] + component_names)+"\n")
            total_missing_iterations = 0
            gave_user_warning = False
            for iteration in range(max_iter + 1):
                iter_data = [str(iteration)]
                for c in component_names:
                    try:
                        iter_data.append(
                            str(diff_per_component_per_iter[c][iteration]))
                    except KeyError:
                        total_missing_iterations += 1
                        iter_data.append("NA")
                # the ratio is undefined without components, so the check is
                # skipped in that case instead of dividing by zero
                if component_names and not gave_user_warning:
                    missing_per_component = (
                        float(total_missing_iterations) / len(component_names))
                    if missing_per_component > 20:
                        logger.warning("There are more than %.0f missing iterations per component. "
                                       "Something might be wrong.",
                                       missing_per_component)
                        gave_user_warning = True

                f.write(" ".join(iter_data) + "\n")

    if plot:
        # get the component names
        diff_type = list(key_file.keys())[0]
        main_component_names = sorted(stats_per_dir[exp_dir][diff_type]['progress_per_component'])
        plot_component_names = set(main_component_names)
        for cur_dir in dirs:
            try:
                component_names = set(stats_per_dir[cur_dir][diff_type]['progress_per_component'])
                plot_component_names = plot_component_names.intersection(component_names)
            except KeyError:
                continue
        plot_component_names = sorted(plot_component_names)
        if plot_component_names != main_component_names:
            logger.warning("The components in all the neural networks in the "
                           "given experiment dirs are not the same, "
                           "so comparison plots are provided only for common "
                           "component names. "
                           "Make sure that these are comparable experiments "
                           "before analyzing these plots.")

        assert main_component_names

        fig = plt.figure()
        logger.info("Plotting parameter differences for components: " +
                    ", ".join(main_component_names))

        for component_name in main_component_names:
            fig.clf()
            plots = []
            # the colour is tied to the position of the directory in `dirs`
            for index, cur_dir in enumerate(dirs):
                color_val = g_plot_colors[index]
                iter_stats = []
                try:
                    for plot_diff_type in ['Parameter differences',
                                           'Relative parameter differences']:
                        iter_stats.append(np.array(
                            sorted(stats_per_dir[cur_dir][plot_diff_type][
                                'progress_per_component'][
                                    component_name].items())))
                except KeyError as e:
                    # this component is not available in this network so lets
                    # not just plot it
                    if cur_dir == exp_dir:
                        raise Exception("No parameter differences were available even in the main "
                                        "experiment dir for the component {0}. Something went "
                                        "wrong: {1}.".format(component_name, e))
                    continue
                # upper panel: absolute differences (these handles feed the legend)
                ax = plt.subplot(211)
                mp, = ax.plot(iter_stats[0][:, 0], iter_stats[0][:, 1],
                              color=color_val,
                              label="Parameter Differences {0}".format(cur_dir))
                plots.append(mp)
                ax.set_ylabel('Parameter Differences')
                ax.grid(True)

                # lower panel: relative differences
                ax = plt.subplot(212)
                mp, = ax.plot(iter_stats[1][:, 0], iter_stats[1][:, 1],
                              color=color_val,
                              label="Relative Parameter "
                                    "Differences {0}".format(cur_dir))
                ax.set_xlabel('Iteration')
                ax.set_ylabel('Relative Parameter Differences')
                ax.grid(True)

            lgd = plt.legend(handles=plots, loc='lower center',
                             bbox_to_anchor=(0.5, -0.5 + len(dirs) * -0.2),
                             ncol=1, borderaxespad=0.)
            plt.grid(True)
            fig.suptitle("Parameter differences at {comp_name}".format(
                comp_name=component_name))
            comp_name = latex_compliant_name(component_name)
            figfile_name = '{dir}/param_diff_{comp_name}.pdf'.format(
                dir=output_dir, comp_name=comp_name)
            fig.savefig(figfile_name, bbox_extra_artists=(lgd,),
                        bbox_inches='tight')
            if latex_report is not None:
                latex_report.add_figure(
                    figfile_name,
                    "Parameter differences at {0}".format(component_name))


def generate_plots(exp_dir, output_dir, output_names, comparison_dir=None,
                   start_iter=1):
    """Generate every table/plot of the report for one experiment.

    Creates `output_dir` if needed, then produces the objective plots for
    each (output_name, objective_type) pair in `output_names`, followed by
    the non-linearity statistics, clipped-proportion and parameter-difference
    plots.  When plotting is enabled (module-level g_plot) the figures are
    collected in a LatexReport that is closed at the end.

    Args:
        exp_dir: main experiment directory.
        output_dir: directory that receives all generated files.
        output_names: iterable of (output_name, objective_type) pairs.
        comparison_dir: optional list of directories to compare against.
        start_iter: first iteration to plot.
    """
    try:
        os.makedirs(output_dir)
    except OSError as e:
        # an already existing directory is fine, anything else is an error
        if e.errno != errno.EEXIST or not os.path.isdir(output_dir):
            raise e

    latex_report = (LatexReport("{0}/report.pdf".format(output_dir))
                    if g_plot else None)

    # objective type -> list of (log message, key, file basename)
    known_objectives = {
        "linear": [
            ("Generating accuracy plots for '%s'",
             'accuracy', 'accuracy'),
            ("Generating log-likelihood plots for '%s'",
             'log-likelihood', 'loglikelihood'),
        ],
        "chain": [
            ("Generating log-probability plots for '%s'",
             'log-probability', 'log_probability'),
        ],
        "rnnlm_objective": [
            ("Generating RNNLM objective plots for '%s'",
             'rnnlm_objective', 'objective'),
        ],
    }

    def _plot_objective(key, file_basename, output_name):
        generate_acc_logprob_plots(
            exp_dir, output_dir, g_plot, key=key,
            file_basename=file_basename, comparison_dir=comparison_dir,
            start_iter=start_iter,
            latex_report=latex_report, output_name=output_name)

    for (output_name, objective_type) in output_names:
        jobs = known_objectives.get(objective_type)
        if jobs is None:
            # any other objective type is plotted from its 'objective' values
            logger.info("Generating %s objective plots for '%s'", objective_type, output_name)
            _plot_objective('objective', 'objective', output_name)
            continue
        for message, key, file_basename in jobs:
            logger.info(message, output_name)
            _plot_objective(key, file_basename, output_name)

    shared_kwargs = dict(comparison_dir=comparison_dir,
                         start_iter=start_iter, latex_report=latex_report)

    logger.info("Generating non-linearity stats plots")
    generate_nonlin_stats_plots(exp_dir, output_dir, g_plot, **shared_kwargs)

    logger.info("Generating clipped-proportion plots")
    generate_clipped_proportion_plots(exp_dir, output_dir, g_plot,
                                      **shared_kwargs)

    logger.info("Generating parameter difference plots")
    generate_parameter_diff_plots(exp_dir, output_dir, g_plot,
                                  **shared_kwargs)

    if g_plot and latex_report is not None:
        if latex_report.close():
            logger.info("Report file %s/report.pdf has been generated successfully.", output_dir)


def main():
    """Parse the command line and generate the report.

    The list of (output name, objective type) pairs comes from
    --output-nodes when given ("name:type" items separated by spaces), else
    from the chain / rnnlm flags, else defaults to a single linear output.
    The first experiment directory is the main one; the comparison
    directories are args.comparison_dir if given, otherwise the remaining
    experiment directories.
    """
    args = get_args()

    if not g_plot:
        logger.warning(
            "This script requires matplotlib and numpy.\n"
            "... Install these packages to generate plots.\n"
            "... If you are on a cluster where you do not have admin rights, use venv.\n"
            "... Generating text data table files only.")

    if args.output_nodes is not None:
        output_nodes = []
        for node_spec in args.output_nodes.split(' '):
            fields = node_spec.split(':')
            assert len(fields) == 2
            output_nodes.append(tuple(fields))
    elif args.is_chain:
        output_nodes = [('output', 'chain'), ('output-xent', 'chain')]
    elif args.is_rnnlm:
        output_nodes = [('output', 'rnnlm_objective')]
    else:
        output_nodes = [('output', 'linear')]

    if args.comparison_dir is not None:
        comparison_dir = args.comparison_dir
    elif len(args.exp_dir) == 1:
        comparison_dir = None
    elif len(args.exp_dir) > 1:
        # every directory after the first one is compared against the first
        comparison_dir = args.exp_dir[1:]
    else:
        # no experiment directory at all: nothing to do
        return

    generate_plots(args.exp_dir[0], args.output_dir, output_nodes,
                   comparison_dir=comparison_dir,
                   start_iter=args.start_iter)


# Run the report generation only when this file is executed as a script.
if __name__ == "__main__":
    main()


================================================
FILE: egs/steps/nnet3/report/summarize_compute_debug_timing.py
================================================
#!/usr/bin/env python


# Copyright 2016 Vijayaditya Peddinti.
# Apache 2.0.


# We use Python 3.x style print but want it to work in Python 2.x as well.
from __future__ import print_function
from __future__ import division
import sys
import re
import argparse

# expects the output of nnet3*train with --computation-debug=true
# will run faster if just the lines with "DebugAfterExecute" are provided
# <train-command> |grep DebugAfterExecute | steps/nnet3/report/summarize_compute_debug_timing.py

def GetArgs():
    """Parse the command-line arguments of the timing summary script.

    The full command line is echoed to stderr before parsing.

    Returns:
        The parsed argparse namespace, with `node_prefixes` set to an empty
        list when --node-prefixes was not given.

    Raises:
        NotImplementedError: if --node-prefixes is given; accumulation by
            prefix is not implemented yet.
    """
    parser = argparse.ArgumentParser(description="Summarizes the timing info from nnet3-*-train --computation.debug=true commands ")
    parser.add_argument("--node-prefixes", type=str,
                        help="list of prefixes. Execution times from nnet3 components with the same prefix"
                        " will be accumulated. Still distinguishes Propagate and BackPropagate commands"
                        " --node-prefixes Lstm1,Lstm2,Layer1", default=None)

    print(' '.join(sys.argv), file=sys.stderr)

    args = parser.parse_args()
    if args.node_prefixes is not None:
        # this will be implemented after https://github.com/kaldi-asr/kaldi/issues/944
        # (the value is then expected to be split on ',' into a list)
        raise NotImplementedError("--node-prefixes is not supported yet")

    args.node_prefixes = []
    return args
# get opening bracket position corresponding to the last closing bracket
def FindOpenParanthesisPosition(string):
    """Find the "(" that matches the final ")" of a string.

    The string is stripped of surrounding whitespace first, and the returned
    index refers to the stripped string.

    Args:
        string: text expected to end with a closing parenthesis.

    Returns:
        The index of the matching opening parenthesis, or None if the
        stripped string is empty or does not end with ")".

    Raises:
        Exception: if the string ends with ")" but has no matching "(".
    """
    string = string.strip()
    # endswith() is also safe for an empty string, unlike string[-1]
    if not string.endswith(")"):
        # we don't know how to deal with these strings
        return None

    # Scan right to left, counting closing brackets that are still unmatched;
    # the "(" that brings the count back to zero matches the last ")".
    depth = 0
    for string_index in range(len(string) - 1, -1, -1):
        char = string[string_index]
        if char == ")":
            depth += 1
        elif char == "(":
            depth -= 1
            if depth == 0:
                return string_index

    raise Exception("Malformed string: Could not find opening paranthesis\n\t{0}".format(string))

# input : LOG (nnet3-chain-train:DebugAfterExecute():nnet-compute.cc:144) c68: BLstm1_backward_W_i-xr.Propagate(NULL, m6212(3136:3199, 0:555), &m31(0:63, 0:1023))
# output : BLstm1_backward_W_i-xr.Propagate
def ExtractCommandName(command_string):
    """Return a concise name for the command described by a debug log prefix.

    The first two whitespace-separated tokens (the log header) are dropped,
    the trailing parenthesised argument list is removed if there is one, and
    everything up to and including the first ":" (the command index such as
    "c68:") is discarded.
    """
    # drop : LOG (nnet3-chain-train:DebugAfterExecute():nnet-compute.cc:144)
    tokens = command_string.split()
    command = " ".join(tokens[2:])
    # command = c68: BLstm1_backward_W_i-xr.Propagate(NULL, m6212(3136:3199, 0:555), &m31(0:63, 0:1023))
    args_start = FindOpenParanthesisPosition(command)
    if args_start is not None:
        # command = c68: BLstm1_backward_W_i-xr.Propagate
        command = command[:args_start]
    # keep what follows the first colon: BLstm1_backward_W_i-xr.Propagate
    return command.partition(":")[2].strip()

def Main():
    """Accumulate per-command execution times from stdin and print a summary.

    Only lines made of exactly three "|"-separated fields whose first field
    contains "DebugAfterExecute" are used.  The last field must have the form
    "time: <seconds> secs".  Times are summed per command name (see
    ExtractCommandName) and printed in decreasing order, together with the
    fraction of the total time.  If the total time is zero the fraction is
    reported as 0.0.
    """
    # Sample Line
    # LOG (nnet3-chain-train:DebugAfterExecute():nnet-compute.cc:144) c128: m19 = []  |               |        time: 0.0007689 secs

    command_times = {}
    for line in sys.stdin:
        parts = line.split("|")
        if len(parts) != 3:
            # we don't know how to deal with these lines
            continue
        if "DebugAfterExecute" not in parts[0]:
            # not a line printed in the DebugAfterExecute method
            continue

        # get the timing info
        time_parts = parts[-1].split()
        assert(len(time_parts) == 3 and time_parts[-1] == "secs" and time_parts[0] == "time:" )
        elapsed = float(time_parts[1])

        command = ExtractCommandName(parts[0])
        # store the time
        command_times[command] = command_times.get(command, 0.0) + elapsed

    total_time = sum(command_times.values())
    sorted_commands = sorted(command_times.items(), key = lambda x: x[1], reverse = True)
    for command, elapsed in sorted_commands:
        # guard against a zero total (e.g. every recorded time is 0)
        fraction = float(elapsed) / total_time if total_time else 0.0
        print("{c} : time {t} : fraction {f}".format(c=command, t=elapsed, f=fraction))


if __name__ == "__main__":
    # GetArgs() echoes the command line to stderr and validates the options;
    # its result is not passed on, since Main() takes no arguments.
    args = GetArgs()
    Main()


================================================
FILE: egs/steps/nnet3/tdnn/make_configs.py
================================================
#!/usr/bin/env python

# This script is deprecated, please use ../xconfig_to_configs.py

# We use Python 3.x style print but want it to work in Python 2.x as well.
from __future__ import print_function
from __future__ import division
import os
import argparse
import shlex
import sys
import warnings
import copy
import imp
import ast

nodes = imp.load_source('', 'steps/nnet3/components.py')
sys.path.insert(0, 'steps')
import libs.common as common_lib

def GetArgs():
    """Define and parse the command-line options of this script.

    The command line is echoed to stdout, the arguments are parsed and then
    passed through CheckArgs(), whose return value is returned.
    """
    # we add compulsory arguments as named arguments for readability
    parser = argparse.ArgumentParser(description="Writes config files and variables "
                                                 "for TDNNs creation and training",
                                     epilog="See steps/nnet3/tdnn/train.sh for example.")

    # Only one of these arguments can be specified, and one of them has to
    # be compulsorily specified
    feat_group = parser.add_mutually_exclusive_group(required = True)
    feat_group.add_argument("--feat-dim", type=int,
                            help="Raw feature dimension, e.g. 13")
    feat_group.add_argument("--feat-dir", type=str,
                            help="Feature directory, from which we derive the feat-dim")

    # only one of these arguments can be specified
    ivector_group = parser.add_mutually_exclusive_group(required = False)
    ivector_group.add_argument("--ivector-dim", type=int,
                                help="iVector dimension, e.g. 100", default=0)
    ivector_group.add_argument("--ivector-dir", type=str,
                                help="iVector dir, which will be used to derive the ivector-dim  ", default=None)

    # exactly one source for the number of targets has to be given
    num_target_group = parser.add_mutually_exclusive_group(required = True)
    num_target_group.add_argument("--num-targets", type=int,
                                  help="number of network targets (e.g. num-pdf-ids/num-leaves)")
    num_target_group.add_argument("--ali-dir", type=str,
                                  help="alignment directory, from which we derive the num-targets")
    num_target_group.add_argument("--tree-dir", type=str,
                                  help="directory with final.mdl, from which we derive the num-targets")

    # CNN options
    parser.add_argument('--cnn.layer', type=str, action='append', dest = "cnn_layer",
                        help="CNN parameters at each CNN layer, e.g. --filt-x-dim=3 --filt-y-dim=8 "
                        "--filt-x-step=1 --filt-y-step=1 --num-filters=256 --pool-x-size=1 --pool-y-size=3 "
                        "--pool-z-size=1 --pool-x-step=1 --pool-y-step=3 --pool-z-step=1, "
                        "when CNN layers are used, no LDA will be added", default = None)
    parser.add_argument("--cnn.bottleneck-dim", type=int, dest = "cnn_bottleneck_dim",
                        help="Output dimension of the linear layer at the CNN output "
                        "for dimension reduction, e.g. 256."
                        "The default zero means this layer is not needed.", default=0)
    parser.add_argument("--cnn.cepstral-lifter", type=float, dest = "cepstral_lifter",
                        help="The factor used for determining the liftering vector in the production of MFCC. "
                        "User has to ensure that it matches the lifter used in MFCC generation, "
                        "e.g. 22.0", default=22.0)

    # General neural network options
    parser.add_argument("--splice-indexes", type=str, required = True,
                        help="Splice indexes at each layer, e.g. '-3,-2,-1,0,1,2,3' "
                        "If CNN layers are used the first set of splice indexes will be used as input "
                        "to the first CNN layer and later splice indexes will be interpreted as indexes "
                        "for the TDNNs.")
    parser.add_argument("--add-lda", type=str, action=common_lib.StrToBoolAction,
                        help="If \"true\" an LDA matrix computed from the input features "
                        "(spliced according to the first set of splice-indexes) will be used as "
                        "the first Affine layer. This affine layer's parameters are fixed during training. "
                        "If --cnn.layer is specified this option will be forced to \"false\".",
                        default=True, choices = ["false", "true"])

    parser.add_argument("--include-log-softmax", type=str, action=common_lib.StrToBoolAction,
                        help="add the final softmax layer ", default=True, choices = ["false", "true"])
    parser.add_argument("--add-final-sigmoid", type=str, action=common_lib.StrToBoolAction,
                        help="add a final sigmoid layer as alternate to log-softmax-layer. "
                        "Can only be used if include-log-softmax is false. "
                        "This is useful in cases where you want the output to be "
                        "like probabilities between 0 and 1. Typically the nnet "
                        "is trained with an objective such as quadratic",
                        default=False, choices = ["false", "true"])

    parser.add_argument("--objective-type", type=str,
                        help = "the type of objective; i.e. quadratic or linear",
                        default="linear", choices = ["linear", "quadratic"])
    parser.add_argument("--xent-regularize", type=float,
                        help="For chain models, if nonzero, add a separate output for cross-entropy "
                        "regularization (with learning-rate-factor equal to the inverse of this)",
                        default=0.0)
    parser.add_argument("--xent-separate-forward-affine", type=str, action=common_lib.StrToBoolAction,
                        help="if using --xent-regularize, gives it separate last-but-one weight matrix",
                        default=False, choices = ["false", "true"])
    parser.add_argument("--final-layer-normalize-target", type=float,
                        help="RMS target for final layer (set to <1 if final layer learns too fast",
                        default=1.0)
    parser.add_argument("--max-change-per-component", type=float,
                        help="Enforces per-component max change (except for the final affine layer). "
                        "if 0 it would not be enforced.", default=0.75)
    parser.add_argument("--max-change-per-component-final", type=float,
                        help="Enforces per-component max change for the final affine layer. "
                        "if 0 it would not be enforced.", default=1.5)
    parser.add_argument("--subset-dim", type=int, default=0,
                        help="dimension of the subset of units to be sent to the central frame")
    parser.add_argument("--pnorm-input-dim", type=int,
                        help="input dimension to p-norm nonlinearities")
    parser.add_argument("--pnorm-output-dim", type=int,
                        help="output dimension of p-norm nonlinearities")
    # --relu-dim and --relu-dim-final cannot be combined
    relu_dim_group = parser.add_mutually_exclusive_group(required = False)
    relu_dim_group.add_argument("--relu-dim", type=int,
                        help="dimension of all ReLU nonlinearity layers")
    relu_dim_group.add_argument("--relu-dim-final", type=int,
                        help="dimension of the last ReLU nonlinearity layer. Dimensions increase geometrically from the first through the last ReLU layer.", default=None)
    parser.add_argument("--relu-dim-init", type=int,
                        help="dimension of the first ReLU nonlinearity layer. Dimensions increase geometrically from the first through the last ReLU layer.", default=None)

    parser.add_argument("--self-repair-scale-nonlinearity", type=float,
                        help="A non-zero value activates the self-repair mechanism in the sigmoid and tanh non-linearities of the LSTM", default=None)


    parser.add_argument("--use-presoftmax-prior-scale", type=str, action=common_lib.StrToBoolAction,
                        help="if true, a presoftmax-prior-scale is added",
                        choices=['true', 'false'], default = True)
    parser.add_argument("config_dir",
                        help="Directory to write config files and variables")

    # echo the full command line (to stdout)
    print(' '.join(sys.argv))

    args = parser.parse_args()
    # validate the options and derive the dependent values
    args = CheckArgs(args)

    return args

def CheckArgs(args):
    """Validate the parsed arguments and derive dependent settings.

    Creates args.config_dir if it does not exist, derives feat_dim,
    num_targets and ivector_dim from the corresponding directories when
    those are given, and sets nonlin_type, nonlin_input_dim,
    nonlin_output_dim, nonlin_output_dim_init and nonlin_output_dim_final
    from the --relu-dim* / --pnorm-* options.  add_lda is forced to False
    (with a warning) when CNN layers are requested.

    Args:
        args: the namespace returned by the argument parser.

    Returns:
        The same namespace, updated in place.

    Raises:
        Exception: if an option has an invalid value or options are combined
            in an unsupported way.
    """
    if not os.path.exists(args.config_dir):
        os.makedirs(args.config_dir)

    ## Check arguments.
    if args.feat_dir is not None:
        args.feat_dim = common_lib.get_feat_dim(args.feat_dir)

    if args.ali_dir is not None:
        args.num_targets = common_lib.get_number_of_leaves_from_tree(args.ali_dir)
    elif args.tree_dir is not None:
        args.num_targets = common_lib.get_number_of_leaves_from_tree(args.tree_dir)

    if args.ivector_dir is not None:
        args.ivector_dim = common_lib.get_ivector_dim(args.ivector_dir)

    if not args.feat_dim > 0:
        raise Exception("feat-dim has to be postive")

    if not args.num_targets > 0:
        print(args.num_targets)
        raise Exception("num_targets has to be positive")

    if not args.ivector_dim >= 0:
        raise Exception("ivector-dim has to be non-negative")

    if (args.subset_dim < 0):
        raise Exception("--subset-dim has to be non-negative")

    if args.relu_dim is not None:
        # all ReLU layers share a single dimension
        if (args.pnorm_input_dim is not None
                or args.pnorm_output_dim is not None
                or args.relu_dim_init is not None):
            raise Exception("--relu-dim argument not compatible with "
                            "--pnorm-input-dim or --pnorm-output-dim or --relu-dim-init options")
        args.nonlin_input_dim = args.relu_dim
        args.nonlin_output_dim = args.relu_dim
        args.nonlin_output_dim_final = None
        args.nonlin_output_dim_init = None
        args.nonlin_type = 'relu'

    elif args.relu_dim_final is not None:
        # ReLU layers whose dimension goes from relu_dim_init to relu_dim_final
        if args.pnorm_input_dim is not None or args.pnorm_output_dim is not None:
            raise Exception("--relu-dim-final argument not compatible with "
                            "--pnorm-input-dim or --pnorm-output-dim options")
        if args.relu_dim_init is None:
            raise Exception("--relu-dim-init argument should also be provided with --relu-dim-final")
        if args.relu_dim_init > args.relu_dim_final:
            raise Exception("--relu-dim-init has to be no larger than --relu-dim-final")
        args.nonlin_input_dim = None
        args.nonlin_output_dim = None
        args.nonlin_output_dim_final = args.relu_dim_final
        args.nonlin_output_dim_init = args.relu_dim_init
        args.nonlin_type = 'relu'

    else:
        # neither --relu-dim nor --relu-dim-final: p-norm nonlinearities
        if args.relu_dim_init is not None:
            raise Exception("--relu-dim-init argument should be provided together with "
                            "--relu-dim-final")
        # The options default to None; test for that explicitly, because
        # comparing None with an int raises TypeError on Python 3.
        if (args.pnorm_input_dim is None or args.pnorm_output_dim is None
                or not args.pnorm_input_dim > 0
                or not args.pnorm_output_dim > 0):
            raise Exception("--relu-dim not set, so expected --pnorm-input-dim and "
                            "--pnorm-output-dim to be provided.")
        args.nonlin_input_dim = args.pnorm_input_dim
        args.nonlin_output_dim = args.pnorm_output_dim
        if (args.nonlin_input_dim < args.nonlin_output_dim) or (args.nonlin_input_dim % args.nonlin_output_dim != 0):
            raise Exception("Invalid --pnorm-input-dim {0} and --pnorm-output-dim {1}".format(args.nonlin_input_dim, args.nonlin_output_dim))
        args.nonlin_output_dim_final = None
        args.nonlin_output_dim_init = None
        args.nonlin_type = 'pnorm'

    if args.add_final_sigmoid and args.include_log_softmax:
        raise Exception("--include-log-softmax and --add-final-sigmoid cannot both be true.")

    if args.xent_separate_forward_affine and args.add_final_sigmoid:
        raise Exception("It does not make sense to have --add-final-sigmoid=true when xent-separate-forward-affine is true")

    if args.add_lda and args.cnn_layer is not None:
        args.add_lda = False
        warnings.warn("--add-lda is set to false as CNN layers are used.")

    if not args.max_change_per_component >= 0 or not args.max_change_per_component_final >= 0:
        raise Exception("max-change-per-component and max_change-per-component-final should be non-negative")

    return args

def AddConvMaxpLayer(config_lines, name, input, args):
    """Add a convolution layer and, when requested, a max-pooling layer after it.

    config_lines: config container handed through to the nodes.* helpers.
    name: layer name handed through to the nodes.* helpers.
    input: descriptor dict of the previous layer; it must carry a '3d-dim'
        entry (indexed here as [0], [1], [2]) and a 'vectorization' entry.
    args: parsed per-layer CNN options (filt_*, num_filters, pool_*), e.g.
        one element of the list returned by ParseCnnString().

    Returns whatever the last nodes.* helper called here returned: the
    max-pooling output if any pool size is greater than 1, otherwise the
    convolution output.

    Raises Exception if 'input' has no '3d-dim' entry.
    """
    if '3d-dim' not in input:
        raise Exception("The input to AddConvMaxpLayer() needs '3d-dim' parameters.")

    in_dims = input['3d-dim']
    layer_output = nodes.AddConvolutionLayer(
        config_lines, name, input,
        in_dims[0], in_dims[1], in_dims[2],
        args.filt_x_dim, args.filt_y_dim,
        args.filt_x_step, args.filt_y_step,
        args.num_filters, input['vectorization'])

    # Pooling with size 1 along every axis would be a no-op, so skip the layer.
    wants_pooling = (args.pool_x_size > 1
                     or args.pool_y_size > 1
                     or args.pool_z_size > 1)
    if not wants_pooling:
        return layer_output

    # The pooling layer takes its input geometry from the convolution output.
    conv_dims = layer_output['3d-dim']
    return nodes.AddMaxpoolingLayer(
        config_lines, name, layer_output,
        conv_dims[0], conv_dims[1], conv_dims[2],
        args.pool_x_size, args.pool_y_size, args.pool_z_size,
        args.pool_x_step, args.pool_y_step, args.pool_z_step)

# The ivectors are processed through an affine layer parallel to the CNN layers,
# then concatenated with the CNN output and passed to the deeper part of the network.
def AddCnnLayers(config_lines, cnn_layer, cnn_bottleneck_dim, cepstral_lifter, config_dir, feat_dim, splice_indexes=[0], ivector_dim=0):
    """Add the Idct layer, the CNN layer stack and the optional ivector branch.

    config_lines: config container handed through to the nodes.* helpers.
    cnn_layer: list of per-layer option strings, parsed by ParseCnnString().
    cnn_bottleneck_dim: if > 0, an affine "cnn-bottleneck" layer of this
        dimension is added after the CNN layers.
    cepstral_lifter: handed to common_lib.write_idct_matrix().
    config_dir: directory (stripped of surrounding whitespace) in which
        idct.mat is written.
    feat_dim: dimension of the "input" node.
    splice_indexes: frame offsets spliced together to form the CNN input.
    ivector_dim: if > 0, an affine "ivector" layer is added and its output
        appended to the CNN output.

    Returns the descriptor dict of the last layer added.
    """
    per_layer_args = ParseCnnString(cnn_layer)

    # We use an Idct layer here to convert MFCC to FBANK features
    idct_matrix_file = config_dir.strip() + "/idct.mat"
    common_lib.write_idct_matrix(feat_dim, cepstral_lifter, idct_matrix_file)
    idct_output = nodes.AddFixedAffineLayer(
        config_lines, "Idct",
        {'descriptor':  "input",
         'dimension': feat_dim},
        idct_matrix_file)

    # Splice the Idct output over time; offset 0 needs no Offset() wrapper.
    spliced_descriptors = []
    for offset in splice_indexes:
        if offset != 0:
            spliced_descriptors.append(
                'Offset({0}, {1})'.format(idct_output['descriptor'], offset))
        else:
            spliced_descriptors.append(idct_output['descriptor'])

    num_spliced_frames = len(splice_indexes)
    layer_output = {
        'descriptor':  "Append({0})".format(", ".join(spliced_descriptors)),
        'dimension': num_spliced_frames * feat_dim,
        '3d-dim': [num_spliced_frames, feat_dim, 1],
        'vectorization': 'yzx',
    }

    for layer_index, layer_args in enumerate(per_layer_args):
        layer_output = AddConvMaxpLayer(config_lines, "L{0}".format(layer_index),
                                        layer_output, layer_args)

    if cnn_bottleneck_dim > 0:
        layer_output = nodes.AddAffineLayer(config_lines, "cnn-bottleneck",
                                            layer_output, cnn_bottleneck_dim, "")

    if ivector_dim > 0:
        ivector_output = nodes.AddAffineLayer(
            config_lines, "ivector",
            {'descriptor':  'ReplaceIndex(ivector, t, 0)',
             'dimension': ivector_dim},
            ivector_dim, "")
        # Concatenate the ivector branch onto the CNN output (updated in place).
        layer_output['descriptor'] = 'Append({0}, {1})'.format(
            layer_output['descriptor'], ivector_output['descriptor'])
        layer_output['dimension'] = layer_output['dimension'] + ivector_output['dimension']

    return layer_output

def PrintConfig(file_name, config_lines):
    """Write one config (components, then component nodes) to file_name.

    file_name: path of the config file; it is created or truncated.
    config_lines: dict with the keys 'components' and 'component-nodes',
        each a list of config-line strings.

    The file is opened in a 'with' block so the handle is closed even if a
    write or a key lookup raises.
    """
    with open(file_name, 'w') as f:
        f.write("\n".join(config_lines['components'])+"\n")
        f.write("\n#Component nodes\n")
        f.write("\n".join(config_lines['component-nodes'])+"\n")

def ParseCnnString(cnn_param_string_list):
    """Parse the per-layer CNN option strings.

    cnn_param_string_list: list of strings, one per CNN layer, each holding
        command-line style options (split with shlex), e.g.
        "--filt-x-dim=3 --filt-y-dim=8 --num-filters=256".

    Returns a list of argparse namespaces, one per input string.  The
    filter dimensions and the number of filters are mandatory; every step
    and pool-size option defaults to 1.
    """
    parser = argparse.ArgumentParser(description="cnn argument parser")

    # (option, is_required) in the order the options are registered.
    option_table = (
        ("--filt-x-dim", True),
        ("--filt-y-dim", True),
        ("--filt-x-step", False),
        ("--filt-y-step", False),
        ("--num-filters", True),
        ("--pool-x-size", False),
        ("--pool-y-size", False),
        ("--pool-z-size", False),
        ("--pool-x-step", False),
        ("--pool-y-step", False),
        ("--pool-z-step", False),
    )
    for option, is_required in option_table:
        if is_required:
            parser.add_argument(option, required=True, type=int)
        else:
            parser.add_argument(option, type=int, default=1)

    return [parser.parse_args(shlex.split(layer_string))
            for layer_string in cnn_param_string_list]

def ParseSpliceString(splice_indexes):
    """Parse a --splice-indexes string into per-layer offsets and model context.

    splice_indexes: whitespace-separated fields, one per hidden layer; each
        field is a comma-separated list of integer frame offsets in
        ascending order, e.g. "-2,-1,0,1,2 0 -3,3".

    Returns a dict with
        'left_context':  sum over layers of minus the first offset, floored at 0
        'right_context': sum over layers of the last offset, floored at 0
        'splice_indexes': list of integer lists, one per layer
        'num_hidden_layers': number of fields

    Raises Exception if the string has no fields, if an element is not an
    integer, or if the offsets of a layer are not sorted.
    """
    layer_fields = splice_indexes.split()
    if not layer_fields:
        raise Exception("invalid splice-indexes argument, too short: "
                        + splice_indexes)

    splice_array = []
    left_context = 0
    right_context = 0
    for field in layer_fields:
        # str.split(",") always yields at least one element, so an empty
        # element (e.g. "1,,2") surfaces as a ValueError from int() below.
        try:
            offsets = [int(offset_str) for offset_str in field.split(",")]
        except ValueError as e:
            # Keep the offending argument and the int() error apart.
            raise Exception("invalid splice-indexes argument "
                            + splice_indexes + ": " + str(e))
        if offsets != sorted(offsets):
            raise Exception("elements of splice-indexes must be sorted: "
                            + splice_indexes)
        left_context += -offsets[0]
        right_context += offsets[-1]
        splice_array.append(offsets)

    return {'left_context': max(0, left_context),
            'right_context': max(0, right_context),
            'splice_indexes': splice_array,
            'num_hidden_layers': len(splice_array)
            }

# The function signature of MakeConfigs is changed frequently as it is intended for local use in this script.
def MakeConfigs(config_dir, splice_indexes_string,
                cnn_layer, cnn_bottleneck_dim, cepstral_lifter,
                feat_dim, ivector_dim, num_targets, add_lda,
                nonlin_type, nonlin_input_dim, nonlin_output_dim, subset_dim,
                nonlin_output_dim_init, nonlin_output_dim_final,
                use_presoftmax_prior_scale,
                final_layer_normalize_target,
                include_log_softmax,
                add_final_sigmoid,
                xent_regularize,
                xent_separate_forward_affine,
                self_repair_scale,
                max_change_per_component, max_change_per_component_final,
                objective_type):
    """Write init.config, one layer<N>.config per hidden layer, and vars.

    All files are written under config_dir:
      * init.config holds the input layer plus an output layer and is used
        for estimating the preconditioning matrices;
      * layer<N>.config (N starting at 1) holds the lines added for hidden
        layer N, each followed by its own final layer(s), since the configs
        are generated for layer-wise discriminative training;
      * vars holds shell variable assignments (model_left_context,
        model_right_context, num_hidden_layers, num_targets, add_lda,
        include_log_softmax, objective_type) read by other scripts.

    splice_indexes_string is parsed with ParseSpliceString().  nonlin_type
    must be "relu" or "pnorm".  If nonlin_output_dim is None, the per-layer
    dimensions grow geometrically from nonlin_output_dim_init to
    nonlin_output_dim_final.  The remaining parameters are passed through to
    the nodes.* helpers and AddCnnLayers().

    Raises Exception on an unknown nonlin_type, on an inconsistent
    xent-separate-forward-affine setup, or when the init and final dims
    differ with a single hidden layer; raises AssertionError when subset_dim
    is used with a splice layer that has no 0 offset.
    """

    parsed_splice_output = ParseSpliceString(splice_indexes_string.strip())

    num_hidden_layers = parsed_splice_output['num_hidden_layers']
    splice_indexes = parsed_splice_output['splice_indexes']

    if xent_separate_forward_affine:
        if splice_indexes[-1] != [0]:
            raise Exception("--xent-separate-forward-affine option is supported only if the last-hidden layer has no splicing before it. Please use a splice-indexes with just 0 as the final splicing config.")

    prior_scale_file = '{0}/presoftmax_prior_scale.vec'.format(config_dir)

    config_lines = {'components':[], 'component-nodes':[]}

    config_files={}
    prev_layer_output = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], ivector_dim)

    # Add the init config lines for estimating the preconditioning matrices
    init_config_lines = copy.deepcopy(config_lines)
    init_config_lines['components'].insert(0, '# Config file for initializing neural network prior to')
    init_config_lines['components'].insert(0, '# preconditioning matrix computation')
    nodes.AddOutputLayer(init_config_lines, prev_layer_output)
    config_files[config_dir + '/init.config'] = init_config_lines

    if cnn_layer is not None:
        prev_layer_output = AddCnnLayers(config_lines, cnn_layer, cnn_bottleneck_dim, cepstral_lifter, config_dir,
                                         feat_dim, splice_indexes[0], ivector_dim)

    if add_lda:
        prev_layer_output = nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, config_dir + '/lda.mat')

    # we moved the first splice layer to before the LDA..
    # so the input to the first affine layer is going to [0] index
    splice_indexes[0] = [0]

    if not nonlin_output_dim is None:
        nonlin_output_dims = [nonlin_output_dim] * num_hidden_layers
    elif nonlin_output_dim_init < nonlin_output_dim_final and num_hidden_layers == 1:
        raise Exception("num-hidden-layers has to be greater than 1 if relu-dim-init and relu-dim-final is different.")
    else:
        # computes relu-dim for each hidden layer. They increase geometrically across layers
        factor = pow(float(nonlin_output_dim_final) / nonlin_output_dim_init, 1.0 / (num_hidden_layers - 1)) if num_hidden_layers > 1 else 1
        nonlin_output_dims = [int(round(nonlin_output_dim_init * pow(factor, i))) for i in range(0, num_hidden_layers)]
        assert(nonlin_output_dims[-1] >= nonlin_output_dim_final - 1 and nonlin_output_dims[-1] <= nonlin_output_dim_final + 1) # due to rounding error
        nonlin_output_dims[-1] = nonlin_output_dim_final # It ensures that the dim of the last hidden layer is exactly the same as what is specified

    for i in range(0, num_hidden_layers):
        # make the intermediate config file for layerwise discriminative training

        # prepare the spliced input
        if not (len(splice_indexes[i]) == 1 and splice_indexes[i][0] == 0):
            try:
                zero_index = splice_indexes[i].index(0)
            except ValueError:
                zero_index = None
            # I just assume the prev_layer_output_descriptor is a simple forwarding descriptor
            prev_layer_output_descriptor = prev_layer_output['descriptor']
            subset_output = prev_layer_output
            if subset_dim > 0:
                # if subset_dim is specified the script expects a zero in the splice indexes
                assert(zero_index is not None)
                subset_node_config = "dim-range-node name=Tdnn_input_{0} input-node={1} dim-offset={2} dim={3}".format(i, prev_layer_output_descriptor, 0, subset_dim)
                subset_output = {'descriptor' : 'Tdnn_input_{0}'.format(i),
                                 'dimension' : subset_dim}
                config_lines['component-nodes'].append(subset_node_config)
            appended_descriptors = []
            appended_dimension = 0
            for j in range(len(splice_indexes[i])):
                if j == zero_index:
                    # the 0 offset always uses the full previous output,
                    # the other offsets use the (possibly reduced) subset
                    appended_descriptors.append(prev_layer_output['descriptor'])
                    appended_dimension += prev_layer_output['dimension']
                    continue
                appended_descriptors.append('Offset({0}, {1})'.format(subset_output['descriptor'], splice_indexes[i][j]))
                appended_dimension += subset_output['dimension']
            prev_layer_output = {'descriptor' : "Append({0})".format(" , ".join(appended_descriptors)),
                                 'dimension'  : appended_dimension}
        else:
            # this is a normal affine node
            pass

        if xent_separate_forward_affine and i == num_hidden_layers - 1:
            if xent_regularize == 0.0:
                raise Exception("xent-separate-forward-affine=True is valid only if xent-regularize is non-zero")

            if nonlin_type == "relu" :
                # Use the per-layer dimension: nonlin_output_dim is None when
                # the relu dims are given as an init/final pair, and equals
                # nonlin_output_dims[i] otherwise.
                prev_layer_output_chain = nodes.AddAffRelNormLayer(config_lines, "Tdnn_pre_final_chain",
                                                                   prev_layer_output, nonlin_output_dims[i],
                                                                   norm_target_rms = final_layer_normalize_target,
                                                                   self_repair_scale = self_repair_scale,
                                                                   max_change_per_component = max_change_per_component)

                prev_layer_output_xent = nodes.AddAffRelNormLayer(config_lines, "Tdnn_pre_final_xent",
                                                                  prev_layer_output, nonlin_output_dims[i],
                                                                  norm_target_rms = final_layer_normalize_target,
                                                                  self_repair_scale = self_repair_scale,
                                                                  max_change_per_component = max_change_per_component)
            elif nonlin_type == "pnorm" :
                prev_layer_output_chain = nodes.AddAffPnormLayer(config_lines, "Tdnn_pre_final_chain",
                                                                 prev_layer_output, nonlin_input_dim, nonlin_output_dim,
                                                                 norm_target_rms = final_layer_normalize_target)

                prev_layer_output_xent = nodes.AddAffPnormLayer(config_lines, "Tdnn_pre_final_xent",
                                                                prev_layer_output, nonlin_input_dim, nonlin_output_dim,
                                                                norm_target_rms = final_layer_normalize_target)
            else:
                raise Exception("Unknown nonlinearity type")

            nodes.AddFinalLayer(config_lines, prev_layer_output_chain, num_targets,
                               max_change_per_component = max_change_per_component_final,
                               use_presoftmax_prior_scale = use_presoftmax_prior_scale,
                               prior_scale_file = prior_scale_file,
                               include_log_softmax = include_log_softmax)

            nodes.AddFinalLayer(config_lines, prev_layer_output_xent, num_targets,
                                ng_affine_options = " param-stddev=0 bias-stddev=0 learning-rate-factor={0} ".format(
                                    0.5 / xent_regularize),
                                max_change_per_component = max_change_per_component_final,
                                use_presoftmax_prior_scale = use_presoftmax_prior_scale,
                                prior_scale_file = prior_scale_file,
                                include_log_softmax = True,
                                name_affix = 'xent')
        else:
            if nonlin_type == "relu":
                prev_layer_output = nodes.AddAffRelNormLayer(config_lines, "Tdnn_{0}".format(i),
                                                            prev_layer_output, nonlin_output_dims[i],
                                                            norm_target_rms = 1.0 if i < num_hidden_layers -1 else final_layer_normalize_target,
                                                            self_repair_scale = self_repair_scale,
                                                            max_change_per_component = max_change_per_component)
            elif nonlin_type == "pnorm":
                prev_layer_output = nodes.AddAffPnormLayer(config_lines, "Tdnn_{0}".format(i),
                                                           prev_layer_output, nonlin_input_dim, nonlin_output_dim,
                                                           norm_target_rms = 1.0 if i < num_hidden_layers -1 else final_layer_normalize_target)
            else:
                raise Exception("Unknown nonlinearity type")
            # a final layer is added after each new layer as we are generating
            # configs for layer-wise discriminative training

            # add_final_sigmoid adds a sigmoid as a final layer as alternative
            # to log-softmax layer.
            # http://ufldl.stanford.edu/wiki/index.php/Softmax_Regression#Softmax_Regression_vs._k_Binary_Classifiers
            # This is useful when you need the final outputs to be probabilities between 0 and 1.
            # Usually used with an objective-type such as "quadratic".
            # Applications are k-binary classification such Ideal Ratio Mask prediction.
            nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets,
                               max_change_per_component = max_change_per_component_final,
                               use_presoftmax_prior_scale = use_presoftmax_prior_scale,
                               prior_scale_file = prior_scale_file,
                               include_log_softmax = include_log_softmax,
                               add_final_sigmoid = add_final_sigmoid,
                               objective_type = objective_type)
            if xent_regularize != 0.0:
                nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets,
                                    ng_affine_options = " param-stddev=0 bias-stddev=0 learning-rate-factor={0} ".format(
                                          0.5 / xent_regularize),
                                    max_change_per_component = max_change_per_component_final,
                                    use_presoftmax_prior_scale = use_presoftmax_prior_scale,
                                    prior_scale_file = prior_scale_file,
                                    include_log_softmax = True,
                                    name_affix = 'xent')

        # each layer config only holds the lines added since the previous one
        config_files['{0}/layer{1}.config'.format(config_dir, i+1)] = config_lines
        config_lines = {'components':[], 'component-nodes':[]}

    # the model context is the one accumulated over all splice layers
    left_context = int(parsed_splice_output['left_context'])
    right_context = int(parsed_splice_output['right_context'])

    # write the files used by other scripts like steps/nnet3/get_egs.sh
    with open(config_dir + "/vars", "w") as f:
        print('model_left_context={}'.format(left_context), file=f)
        print('model_right_context={}'.format(right_context), file=f)
        print('num_hidden_layers={}'.format(num_hidden_layers), file=f)
        print('num_targets={}'.format(num_targets), file=f)
        print('add_lda=' + ('true' if add_lda else 'false'), file=f)
        print('include_log_softmax=' + ('true' if include_log_softmax else 'false'), file=f)
        print('objective_type=' + objective_type, file=f)

    # printing out the configs
    # init.config used to train lda-mllt train
    for key in config_files:
        PrintConfig(key, config_files[key])

def Main():
    """Read the command-line options and generate the config files.

    Each MakeConfigs() keyword is filled from the attribute of the same
    name on the namespace returned by GetArgs(), with two exceptions:
    splice_indexes_string comes from args.splice_indexes and
    self_repair_scale from args.self_repair_scale_nonlinearity.
    """
    args = GetArgs()

    make_configs_options = {
        'config_dir': args.config_dir,
        'splice_indexes_string': args.splice_indexes,
        'feat_dim': args.feat_dim,
        'ivector_dim': args.ivector_dim,
        'num_targets': args.num_targets,
        'add_lda': args.add_lda,
        'cnn_layer': args.cnn_layer,
        'cnn_bottleneck_dim': args.cnn_bottleneck_dim,
        'cepstral_lifter': args.cepstral_lifter,
        'nonlin_type': args.nonlin_type,
        'nonlin_input_dim': args.nonlin_input_dim,
        'nonlin_output_dim': args.nonlin_output_dim,
        'subset_dim': args.subset_dim,
        'nonlin_output_dim_init': args.nonlin_output_dim_init,
        'nonlin_output_dim_final': args.nonlin_output_dim_final,
        'use_presoftmax_prior_scale': args.use_presoftmax_prior_scale,
        'final_layer_normalize_target': args.final_layer_normalize_target,
        'include_log_softmax': args.include_log_softmax,
        'add_final_sigmoid': args.add_final_sigmoid,
        'xent_regularize': args.xent_regularize,
        'xent_separate_forward_affine': args.xent_separate_forward_affine,
        'self_repair_scale': args.self_repair_scale_nonlinearity,
        'max_change_per_component': args.max_change_per_component,
        'max_change_per_component_final': args.max_change_per_component_final,
        'objective_type': args.objective_type,
    }
    MakeConfigs(**make_configs_options)

# Generate the configs only when this file is run as a script, not on import.
if __name__ == "__main__":
    Main()


================================================
FILE: egs/steps/nnet3/tdnn/train.sh
================================================
#!/usr/bin/env bash

# THIS SCRIPT IS DEPRECATED, see ../train_dnn.py

# note, TDNN is the same as what we used to call multisplice.

# Copyright 2012-2015  Johns Hopkins University (Author: Daniel Povey).
#           2013  Xiaohui Zhang
#           2013  Guoguo Chen
#           2014  Vimal Manohar
#           2014  Vijayaditya Peddinti
# Apache 2.0.


# Begin configuration section.
cmd=run.pl
num_epochs=15      # Number of epochs of training;
                   # the number of iterations is worked out from this.
initial_effective_lrate=0.01
final_effective_lrate=0.001
pnorm_input_dim=3000
pnorm_output_dim=300
relu_dim=  # you can use this to make it use ReLU's instead of p-norms.
rand_prune=4.0 # Relates to a speedup we do for LDA.
minibatch_size=512  # This default is suitable for GPU-based training.
                    # Set it to 128 for multi-threaded CPU-based training.
max_param_change=2.0  # max param change per minibatch
samples_per_iter=400000 # each iteration of training, see this many samples
                        # per job.  This option is passed to get_egs.sh
num_jobs_initial=1  # Number of neural net jobs to run in parallel at the start of training
num_jobs_final=8   # Number of neural net jobs to run in parallel at the end of training
prior_subset_size=20000 # 20k samples per job, for computing priors.
num_jobs_compute_prior=10 # these are single-threaded, run on CPU.
get_egs_stage=0    # can be used for rerunning after partial
online_ivector_dir=
presoftmax_prior_scale_power=-0.25
use_presoftmax_prior_scale=true
remove_egs=true  # set to false to disable removing egs after training is done.

max_models_combine=20 # The "max_models_combine" is the maximum number of models we give
  # to the final 'combine' stage, but these models will themselves be averages of
  # iteration-number ranges.

shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
                # on each iter.  You could set it to 0 or to a large value for complete
                # randomization, but this would both consume memory and cause spikes in
                # disk I/O.  Smaller is easier on disk and memory but less random.  It's
                # not a huge deal though, as samples are anyway randomized right at the start.
                # (the point of this is to get data in different minibatches on different iterations,
                # since in the preconditioning method, 2 samples in the same minibatch can
                # affect each others' gradients.

add_layers_period=2 # by default, add new layers every 2 iterations.
stage=-6
exit_stage=-100 # you can set this to terminate the training early.  Exits before running this stage

# count space-separated fields in splice_indexes to get num-hidden-layers.
splice_indexes="-4,-3,-2,-1,0,1,2,3,4  0  -2,2  0  -4,4 0"
# Format : layer<hidden_layer>/<frame_indices>....layer<hidden_layer>/<frame_indices> "
# note: hidden layers which are composed of one or more components,
# so hidden layer indexing is different from component count
chunk_training=false  # if true training is done with chunk randomization, rather than frame randomization

randprune=4.0 # speeds up LDA.
use_gpu=true    # if true, we run on GPU.
cleanup=true
egs_dir=
max_lda_jobs=10  # use no more than 10 jobs for the LDA accumulation.
lda_opts=
egs_opts=
transform_dir=     # If supplied, this dir used instead of alidir to find transforms.
cmvn_opts=  # will be passed to get_lda.sh and get_egs.sh, if supplied.
            # only relevant for "raw" features, not lda.
feat_type=raw  # or set to 'lda' to use LDA features.
align_cmd=              # The cmd that is passed to steps/nnet2/align.sh
align_use_gpu=          # Passed to use_gpu in steps/nnet2/align.sh [yes/no]
realign_times=          # List of times on which we realign.  Each time is
                        # floating point number strictly between 0 and 1, which
                        # will be multiplied by the num-iters to get an iteration
                        # number.
num_jobs_align=30       # Number of jobs for realignment
# End configuration section.
frames_per_eg=8 # to be passed on to get_egs.sh
subset_dim=0

trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM

echo "$0: THIS SCRIPT IS DEPRECATED"
echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

if [ $# != 4 ]; then
  echo "Usage: $0 [opts] <data> <lang> <ali-dir> <exp-dir>"
  echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --num-epochs <#epochs|15>                        # Number of epochs of training"
  echo "  --initial-effective-lrate <lrate|0.02> # effective learning rate at start of training."
  echo "  --final-effective-lrate <lrate|0.004>   # effective learning rate at end of training."
  echo "                                                   # data, 0.00025 for large data"
  echo "  --num-hidden-layers <#hidden-layers|2>           # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs"
  echo "  --add-layers-period <#iters|2>                   # Number of iterations between adding hidden layers"
  echo "  --presoftmax-prior-scale-power <power|-0.25>     # use the specified power value on the priors (inverse priors) to scale"
  echo "                                                   # the pre-softmax outputs (set to 0.0 to disable the presoftmax element scale)"
  echo "  --num-jobs-initial <num-jobs|1>                  # Number of parallel jobs to use for neural net training, at the start."
  echo "  --num-jobs-final <num-jobs|8>                    # Number of parallel jobs to use for neural net training, at the end"
  echo "  --num-threads <num-threads|16>                   # Number of parallel threads per job, for CPU-based training (will affect"
  echo "                                                   # results as well as speed; may interact with batch size; if you increase"
  echo "                                                   # this, you may want to decrease the batch size."
  echo "  --parallel-opts <opts|\"--num-threads 16 --mem 1G\">      # extra options to pass to e.g. queue.pl for processes that"
  echo "                                                   # use multiple threads... note, you might have to reduce --mem"
  echo "                                                   # versus your defaults, because it gets multiplied by the --num-threads argument."
  echo "  --minibatch-size <minibatch-size|128>            # Size of minibatch to process (note: product with --num-threads"
  echo "                                                   # should not get too large, e.g. >2k)."
  echo "  --samples-per-iter <#samples|400000>             # Number of samples of data to process per iteration, per"
  echo "                                                   # process."
  echo "  --splice-indexes <string|layer0/-4:-3:-2:-1:0:1:2:3:4> "
  echo "                                                   # Frame indices used for each splice layer."
  echo "                                                   # Format : layer<hidden_layer_index>/<frame_indices>....layer<hidden_layer>/<frame_indices> "
  echo "                                                   # (note: we splice processed, typically 40-dimensional frames"
  echo "  --lda-dim <dim|''>                               # Dimension to reduce spliced features to with LDA"
  echo "  --realign-times <list-of-times|\"\">             # A list of space-separated floating point numbers between 0.0 and"
  echo "                                                   # 1.0 to specify how far through training realignment is to be done"
  echo "  --align-cmd (utils/run.pl|utils/queue.pl <queue opts>) # passed to align.sh"
  echo "  --align-use-gpu (yes/no)                         # specify is gpu is to be used for realignment"
  echo "  --num-jobs-align <#njobs|30>                     # Number of jobs to perform realignment"
  echo "  --stage <stage|-4>                               # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."


  exit 1;
fi

data=$1
lang=$2
alidir=$3
dir=$4

if [ ! -z "$realign_times" ]; then
  [ -z "$align_cmd" ] && echo "$0: realign_times specified but align_cmd not specified" && exit 1
  [ -z "$align_use_gpu" ] && echo "$0: realign_times specified but align_use_gpu not specified" && exit 1
fi

# Check some files.
for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done


# Set some variables.
num_leaves=`tree-info $alidir/tree 2>/dev/null | grep num-pdfs | awk '{print $2}'` || exit 1
[ -z $num_leaves ] && echo "\$num_leaves is unset" && exit 1
[ "$num_leaves" -eq "0" ] && echo "\$num_leaves is 0" && exit 1

nj=`cat $alidir/num_jobs` || exit 1;  # number of jobs in alignment dir...
# in this dir we'll have just one job.
sdata=$data/split$nj
utils/split_data.sh $data $nj

mkdir -p $dir/log
echo $nj > $dir/num_jobs
cp $alidir/tree $dir

utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;
# First work out the feature and iVector dimension, needed for tdnn config creation.
case $feat_type in
  raw) feat_dim=$(feat-to-dim --print-args=false scp:$data/feats.scp -) || \
      { echo "$0: Error getting feature dim"; exit 1; }
    ;;
  # Without final.mat the feature dim cannot be determined, so stop here
  # instead of carrying on with an empty feat_dim.
  lda)  [ ! -f $alidir/final.mat ] && echo "$0: With --feat-type lda option, expect $alidir/final.mat to exist." && exit 1
   # get num-rows in lda matrix, which is the lda feature dim.
   feat_dim=$(matrix-dim --print-args=false $alidir/final.mat | cut -f 1)
    ;;
  *)
   echo "$0: Bad --feat-type '$feat_type';"; exit 1;
esac
if [ -z "$online_ivector_dir" ]; then
  ivector_dim=0
else
  ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1;
fi


if [ $stage -le -5 ]; then
  echo "$0: creating neural net configs";

  if [ ! -z "$relu_dim" ]; then
    dim_opts="--relu-dim $relu_dim"
  else
    dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim  $pnorm_output_dim"
  fi

  # create the config files for nnet initialization
  python steps/nnet3/tdnn/make_configs.py  \
    --splice-indexes "$splice_indexes"  \
    --subset-dim "$subset_dim" \
    --feat-dim $feat_dim \
    --ivector-dim $ivector_dim  \
     $dim_opts \
    --use-presoftmax-prior-scale $use_presoftmax_prior_scale \
    --num-targets  $num_leaves  \
   $dir/configs || exit 1;

  # Initialize as "raw" nnet, prior to training the LDA-like preconditioning
  # matrix.  This first config just does any initial splicing that we do;
  # we do this as it's a convenient way to get the stats for the 'lda-like'
  # transform.
  $cmd $dir/log/nnet_init.log \
    nnet3-init --srand=-2 $dir/configs/init.config $dir/init.raw || exit 1;
fi

# sourcing the "vars" below sets
# model_left_context=(something)
# model_right_context=(something)
# num_hidden_layers=(something)
. $dir/configs/vars || exit 1;

left_context=$model_left_context
right_context=$model_right_context

context_opts="--left-context=$left_context --right-context=$right_context"

! [ "$num_hidden_layers" -gt 0 ] && echo \
 "$0: Expected num_hidden_layers to be defined" && exit 1;

[ -z "$transform_dir" ] && transform_dir=$alidir


if [ $stage -le -4 ] && [ -z "$egs_dir" ]; then
  extra_opts=()
  [ ! -z "$cmvn_opts" ] && extra_opts+=(--cmvn-opts "$cmvn_opts")
  [ ! -z "$feat_type" ] && extra_opts+=(--feat-type $feat_type)
  [ ! -z "$online_ivector_dir" ] && extra_opts+=(--online-ivector-dir $online_ivector_dir)
  extra_opts+=(--transform-dir $transform_dir)
  extra_opts+=(--left-context $left_context)
  extra_opts+=(--right-context $right_context)
  echo "$0: calling get_egs.sh"
  steps/nnet3/get_egs.sh $egs_opts "${extra_opts[@]}" \
      --samples-per-iter $samples_per_iter --stage $get_egs_stage \
      --cmd "$cmd" $egs_opts \
      --frames-per-eg $frames_per_eg \
      $data $alidir $dir/egs || exit 1;
fi

# If no egs directory was supplied (see --egs-dir mentioned below), fall back
# to $dir/egs.  The expansion is quoted so that the test is well-formed both
# when the variable is empty and when the path contains whitespace.
[ -z "$egs_dir" ] && egs_dir=$dir/egs

if [ "$feat_dim" != "$(cat $egs_dir/info/feat_dim)" ]; then
  echo "$0: feature dimension mismatch with egs, $feat_dim vs $(cat $egs_dir/info/feat_dim)";
  exit 1;
fi
if [ "$ivector_dim" != "$(cat $egs_dir/info/ivector_dim)" ]; then
  echo "$0: ivector dimension mismatch with egs, $ivector_dim vs $(cat $egs_dir/info/ivector_dim)";
  exit 1;
fi

# copy any of the following that exist, to $dir.
cp $egs_dir/{cmvn_opts,splice_opts,final.mat} $dir 2>/dev/null

# confirm that the egs_dir has the necessary context (especially important if
# the --egs-dir option was used on the command line).
egs_left_context=$(cat $egs_dir/info/left_context) || exit -1
egs_right_context=$(cat $egs_dir/info/right_context) || exit -1
 ( [ $egs_left_context -lt $left_context ] || \
   [ $egs_right_context -lt $right_context ] ) && \
   echo "$0: egs in $egs_dir have too little context" && exit -1;

# Read the egs geometry from the egs dir's info/ directory; abort if either
# file cannot be read.  Each error message names the file that was missing.
frames_per_eg=$(cat $egs_dir/info/frames_per_eg) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; }
num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/num_archives"; exit 1; }

# With frame randomization, every label position 0..frames_per_eg-1 of an
# archive counts as its own archive, so the expanded count is
# num_archives*frames_per_eg.  With chunk training the count is unchanged.
case "$chunk_training" in
  true) num_archives_expanded=$num_archives ;;
  *)    num_archives_expanded=$(( $num_archives*$frames_per_eg )) ;;
esac

[ $num_jobs_initial -gt $num_jobs_final ] && \
  echo "$0: --initial-num-jobs cannot exceed --final-num-jobs" && exit 1;

[ $num_jobs_final -gt $num_archives_expanded ] && \
  echo "$0: --final-num-jobs cannot exceed #archives $num_archives_expanded." && exit 1;


if [ $stage -le -3 ]; then
  echo "$0: getting preconditioning matrix for input features."
  num_lda_jobs=$num_archives
  [ $num_lda_jobs -gt $max_lda_jobs ] && num_lda_jobs=$max_lda_jobs

  # Write stats with the same format as stats for LDA.
  $cmd JOB=1:$num_lda_jobs $dir/log/get_lda_stats.JOB.log \
      nnet3-acc-lda-stats --rand-prune=$rand_prune \
        $dir/init.raw "ark:$egs_dir/egs.JOB.ark" $dir/JOB.lda_stats || exit 1;

  all_lda_accs=$(for n in $(seq $num_lda_jobs); do echo $dir/$n.lda_stats; done)
  $cmd $dir/log/sum_transform_stats.log \
    sum-lda-accs $dir/lda_stats $all_lda_accs || exit 1;

  rm $all_lda_accs || exit 1;

  # this computes a fixed affine transform computed in the way we described in
  # Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled variant
  # of an LDA transform but without dimensionality reduction.
  $cmd $dir/log/get_transform.log \
     nnet-get-feature-transform $lda_opts $dir/lda.mat $dir/lda_stats || exit 1;

  ln -sf ../lda.mat $dir/configs/lda.mat
fi


if [ $stage -le -2 ]; then
  echo "$0: preparing initial vector for FixedScaleComponent before softmax"
  echo "  ... using priors^$presoftmax_prior_scale_power and rescaling to average 1"

  # obtains raw pdf count
  $cmd JOB=1:$nj $dir/log/acc_pdf.JOB.log \
     ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
     post-to-tacc --per-pdf=true  $alidir/final.mdl ark:- $dir/pdf_counts.JOB || exit 1;
  $cmd $dir/log/sum_pdf_counts.log \
       vector-sum --binary=false $dir/pdf_counts.* $dir/pdf_counts || exit 1;
  rm $dir/pdf_counts.*

  awk -v power=$presoftmax_prior_scale_power -v smooth=0.01 \
     '{ for(i=2; i<=NF-1; i++) { count[i-2] = $i;  total += $i; }
        num_pdfs=NF-2;  average_count = total/num_pdfs;
        for (i=0; i<num_pdfs; i++) stot += (scale[i] = (count[i] + smooth * average_count)^power)
        printf " [ "; for (i=0; i<num_pdfs; i++) printf("%f ", scale[i]*num_pdfs/stot); print "]" }' \
     $dir/pdf_counts > $dir/presoftmax_prior_scale.vec
  ln -sf ../presoftmax_prior_scale.vec $dir/configs/presoftmax_prior_scale.vec
fi

if [ $stage -le -1 ]; then
  # Add the first layer; this will add in the lda.mat and
  # presoftmax_prior_scale.vec.
  $cmd $dir/log/add_first_layer.log \
       nnet3-init --srand=-3 $dir/init.raw $dir/configs/layer1.config $dir/0.raw || exit 1;

  # Convert to .mdl, train the transitions, set the priors.
  $cmd $dir/log/init_mdl.log \
    nnet3-am-init $alidir/final.mdl $dir/0.raw - \| \
    nnet3-am-train-transitions - "ark:gunzip -c $alidir/ali.*.gz|" $dir/0.mdl || exit 1;
fi


# set num_iters so that as close as possible, we process the data $num_epochs
# times, i.e. ($num_iters*$avg_num_jobs) == $num_epochs*$num_archives_expanded,
# where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2.

num_archives_to_process=$[$num_epochs*$num_archives_expanded]
num_archives_processed=0
num_iters=$[($num_archives_to_process*2)/($num_jobs_initial+$num_jobs_final)]

# Number of iterations allotted to adding the hidden layers.  This has to be
# computed BEFORE the check below; otherwise the check would compare num_iters
# against an unset value (i.e. effectively against 2) and never catch a
# too-short training run.
finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period]

# We need more than finish_add_layers_iter+2 iterations in total.
! [ $num_iters -gt $[$finish_add_layers_iter+2] ] \
  && echo "$0: Insufficient epochs" && exit 1

echo "$0: Will train for $num_epochs epochs = $num_iters iterations"

if $use_gpu; then
  parallel_suffix=""
  train_queue_opt="--gpu 1"
  combine_queue_opt="--gpu 1"
  prior_gpu_opt="--use-gpu=yes"
  prior_queue_opt="--gpu 1"
  parallel_train_opts=
  if ! cuda-compiled; then
    echo "$0: WARNING: you are running with one thread but you have not compiled"
    echo "   for CUDA.  You may be running a setup optimized for GPUs.  If you have"
    echo "   GPUs and have nvcc installed, go to src/ and do ./configure; make"
    exit 1
  fi
else
  echo "$0: without using a GPU this will be very slow.  nnet3 does not yet support multiple threads."
  parallel_train_opts="--use-gpu=no"
  combine_queue_opt=""  # the combine stage will be quite slow if not using
                        # GPU, as we didn't enable that program to use
                        # multiple threads.
  prior_gpu_opt="--use-gpu=no"
  prior_queue_opt=""
fi


# Decide how many of the last iterations' models are handed to the final
# combination stage (they may be subsampled later if the number exceeds
# max_models_combine).  The value used is
#   min(max(max_models_combine, approx_iters_per_epoch_final),
#       1/2 * iters_after_last_layer_added)
approx_iters_per_epoch_final=$(( $num_archives_expanded/$num_jobs_final ))
half_iters_after_add_layers=$(( ($num_iters-$finish_add_layers_iter)/2 ))

# max(max_models_combine, approx_iters_per_epoch_final) ...
if [ $max_models_combine -lt $approx_iters_per_epoch_final ]; then
  num_iters_combine=$approx_iters_per_epoch_final
else
  num_iters_combine=$max_models_combine
fi
# ... capped at half the iterations that remain after the layers were added.
if [ $num_iters_combine -gt $half_iters_after_add_layers ]; then
  num_iters_combine=$half_iters_after_add_layers
fi

# Index of the first model that takes part in the combination.
first_model_combine=$(( $num_iters-$num_iters_combine+1 ))

x=0

# Work out the iterations on which we will re-align, if the --realign-times
# option was used.  This is slightly approximate.  The result is stored in the
# array realign_this_iter, indexed by iteration number, whose value is the
# corresponding element of $realign_times.
for realign_time in $realign_times; do
  # Each element must be a fraction strictly between 0 and 1.  Abort on an
  # invalid value instead of merely printing a message, since the iteration
  # computed from it below would be meaningless.
  if ! perl -e "exit($realign_time > 0.0 && $realign_time < 1.0 ? 0:1);"; then
    echo "Invalid --realign-times option $realign_times: elements must be strictly between 0 and 1.";
    exit 1;
  fi
  # Convert the fraction into an iteration index, given num_jobs_initial,
  # num_jobs_final and num_iters (the original note says this formula is based
  # on the one used for mix_up_iter).
  realign_iter=$(perl -e '($j,$k,$n,$p)=@ARGV; print int(0.5 + ($j==$k ? $n*$p : $n*(sqrt((1-$p)*$j*$j+$p*$k*$k)-$j)/($k-$j))); ' $num_jobs_initial $num_jobs_final $num_iters $realign_time) || exit 1;
  realign_this_iter[$realign_iter]=$realign_time
done

cur_egs_dir=$egs_dir

while [ $x -lt $num_iters ]; do
  [ $x -eq $exit_stage ] && echo "$0: Exiting early due to --exit-stage $exit_stage" && exit 0;

  this_num_jobs=$(perl -e "print int(0.5+$num_jobs_initial+($num_jobs_final-$num_jobs_initial)*$x/$num_iters);")

  ilr=$initial_effective_lrate; flr=$final_effective_lrate; np=$num_archives_processed; nt=$num_archives_to_process;
  this_learning_rate=$(perl -e "print (($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt))*$this_num_jobs);");

  echo "On iteration $x, learning rate is $this_learning_rate."

  if [ ! -z "${realign_this_iter[$x]}" ]; then
    prev_egs_dir=$cur_egs_dir
    cur_egs_dir=$dir/egs_${realign_this_iter[$x]}
  fi

  if [ $x -ge 0 ] && [ $stage -le $x ]; then
    if [ ! -z "${realign_this_iter[$x]}" ]; then
      time=${realign_this_iter[$x]}

      echo "Getting average posterior for purposes of adjusting the priors."
      # Note: this just uses CPUs, using a smallish subset of data.
      # always use the first egs archive, which makes the script simpler;
      # we're using different random subsets of it.
      rm $dir/post.$x.*.vec 2>/dev/null
      $cmd JOB=1:$num_jobs_compute_prior $dir/log/get_post.$x.JOB.log \
        nnet3-copy-egs --srand=JOB --frame=random $context_opts ark:$prev_egs_dir/egs.1.ark ark:- \| \
        nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \
        nnet3-merge-egs ark:- ark:- \| \
        nnet3-compute-from-egs --apply-exp=true "nnet3-am-copy --raw=true $dir/$x.mdl -|" ark:- ark:- \| \
        matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1;

      sleep 3;  # make sure there is time for $dir/post.$x.*.vec to appear.

      $cmd $dir/log/vector_sum.$x.log \
        vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1;
      rm $dir/post.$x.*.vec;

      echo "Re-adjusting priors based on computed posteriors"
      $cmd $dir/log/adjust_priors.$x.log \
        nnet3-am-adjust-priors $dir/$x.mdl $dir/post.$x.vec $dir/$x.mdl || exit 1;

      sleep 2

      steps/nnet3/align.sh --nj $num_jobs_align --cmd "$align_cmd" --use-gpu $align_use_gpu \
        --transform-dir "$transform_dir" --online-ivector-dir "$online_ivector_dir" \
        --iter $x $data $lang $dir $dir/ali_$time || exit 1

      steps/nnet3/relabel_egs.sh --cmd "$cmd" --iter $x $dir/ali_$time \
        $prev_egs_dir $cur_egs_dir || exit 1

      if $cleanup && [[ $prev_egs_dir =~ $dir/egs* ]]; then
        steps/nnet3/remove_egs.sh $prev_egs_dir
      fi
    fi

    # Set off jobs doing some diagnostics, in the background.
    # Use the egs dir from the previous iteration for the diagnostics
    $cmd $dir/log/compute_prob_valid.$x.log \
      nnet3-compute-prob "nnet3-am-copy --raw=true $dir/$x.mdl - |" \
            "ark,bg:nnet3-merge-egs ark:$cur_egs_dir/valid_diagnostic.egs ark:- |" &
    $cmd $dir/log/compute_prob_train.$x.log \
      nnet3-compute-prob "nnet3-am-copy --raw=true $dir/$x.mdl - |" \
           "ark,bg:nnet3-merge-egs ark:$cur_egs_dir/train_diagnostic.egs ark:- |" &

    if [ $x -gt 0 ]; then
      $cmd $dir/log/progress.$x.log \
        nnet3-show-progress --use-gpu=no "nnet3-am-copy --raw=true $dir/$[$x-1].mdl - |" "nnet3-am-copy --raw=true $dir/$x.mdl - |" \
        "ark,bg:nnet3-merge-egs ark:$cur_egs_dir/train_diagnostic.egs ark:-|" '&&' \
        nnet3-info "nnet3-am-copy --raw=true $dir/$x.mdl - |" &
    fi

    echo "Training neural net (pass $x)"

    if [ $x -gt 0 ] && \
      [ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \
      [ $[$x%$add_layers_period] -eq 0 ]; then
      do_average=false # if we've just mixed up, don't do averaging but take the
                       # best.
      cur_num_hidden_layers=$[1+$x/$add_layers_period]
      config=$dir/configs/layer$cur_num_hidden_layers.config
      raw="nnet3-am-copy --raw=true --learning-rate=$this_learning_rate $dir/$x.mdl - | nnet3-init --srand=$x - $config - |"
    else
      do_average=true
      if [ $x -eq 0 ]; then do_average=false; fi # on iteration 0, pick the best, don't average.
      raw="nnet3-am-copy --raw=true --learning-rate=$this_learning_rate $dir/$x.mdl -|"
    fi
    if $do_average; then
      this_minibatch_size=$minibatch_size
    else
      # on iteration zero or when we just added a layer, use a smaller minibatch
      # size (and we will later choose the output of just one of the jobs): the
      # model-averaging isn't always helpful when the model is changing too fast
      # (i.e. it can worsen the objective function), and the smaller minibatch
      # size will help to keep the update stable.
      this_minibatch_size=$[$minibatch_size/2];
    fi

    rm $dir/.error 2>/dev/null


    ( # this sub-shell is so that when we "wait" below,
      # we only wait for the training jobs that we just spawned,
      # not the diagnostic jobs that we spawned above.

      # We can't easily use a single parallel SGE job to do the main training,
      # because the computation of which archive and which --frame option
      # to use for each job is a little complex, so we spawn each one separately.
      for n in $(seq $this_num_jobs); do
        k=$[$num_archives_processed + $n - 1]; # k is a zero-based index that we'll derive
                                               # the other indexes from.
        archive=$[($k%$num_archives)+1]; # work out the 1-based archive index.
        frame=$[(($k/$num_archives)%$frames_per_eg)]; # work out the 0-based frame
        # index; this increases more slowly than the archive index because the
        # same archive with different frame indexes will give similar gradients,
        # so we want to separate them in time.

        $cmd $train_queue_opt $dir/log/train.$x.$n.log \
          nnet3-train $parallel_train_opts \
          --max-param-change=$max_param_change "$raw" \
          "ark,bg:nnet3-copy-egs --frame=$frame $context_opts ark:$cur_egs_dir/egs.$archive.ark ark:- | nnet3-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-merge-egs --minibatch-size=$this_minibatch_size --discard-partial-minibatches=true ark:- ark:- |" \
          $dir/$[$x+1].$n.raw || touch $dir/.error &
      done
      wait
    )
    # the error message below is not that informative, but $cmd will
    # have printed a more specific one.
    [ -f $dir/.error ] && echo "$0: error on iteration $x of training" && exit 1;

    nnets_list=
    for n in `seq 1 $this_num_jobs`; do
      nnets_list="$nnets_list $dir/$[$x+1].$n.raw"
    done

    # Turn the per-job outputs ($nnets_list) of this iteration into the next
    # model, $dir/(x+1).mdl, either by averaging them or by picking one.
    if $do_average; then
      # average the output of the different jobs.
      $cmd $dir/log/average.$x.log \
        nnet3-average $nnets_list - \| \
        nnet3-am-copy --set-raw-nnet=- $dir/$x.mdl $dir/$[$x+1].mdl || exit 1;
    else
      # choose the best from the different jobs: scan each job's training log
      # for the last reported log-prob-per-frame and keep the index of the job
      # with the highest value (job 1 if none is found).  The number of logs
      # to scan is $this_num_jobs, i.e. the number of jobs that were actually
      # launched on this iteration and that make up $nnets_list.
      n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) {
          $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn";
          undef $logprob; while (<F>) { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } }
          close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob;
          $best_n=$n; } } print "$best_n\n"; ' $this_num_jobs $dir/log/train.$x.%d.log) || exit 1;
      [ -z "$n" ] && echo "Error getting best model" && exit 1;
      $cmd $dir/log/select.$x.log \
        nnet3-am-copy --set-raw-nnet=$dir/$[$x+1].$n.raw  $dir/$x.mdl $dir/$[$x+1].mdl || exit 1;
    fi

    rm $nnets_list
    [ ! -f $dir/$[$x+1].mdl ] && exit 1;
    if [ -f $dir/$[$x-1].mdl ] && $cleanup && \
       [ $[($x-1)%100] -ne 0  ] && [ $[$x-1] -lt $first_model_combine ]; then
      rm $dir/$[$x-1].mdl
    fi
  fi
  x=$[$x+1]
  num_archives_processed=$[$num_archives_processed+$this_num_jobs]
done


if [ $stage -le $num_iters ]; then
  echo "Doing final combination to produce final.mdl"

  # Now do combination.  In the nnet3 setup, the logic
  # for doing averaging of subsets of the models in the case where
  # there are too many models to reliably estimate interpolation
  # factors (max_models_combine) is moved into the nnet3-combine program.
  nnets_list=()
  for n in $(seq 0 $[num_iters_combine-1]); do
    iter=$[$first_model_combine+$n]
    mdl=$dir/$iter.mdl
    [ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1;
    nnets_list[$n]="nnet3-am-copy --raw=true $mdl -|";
  done

  # Note: the nnet3-combine call below does not pass --use-gpu=no or a
  # num-threads option; whether it is queued for a GPU is decided by
  # $combine_queue_opt ("--gpu 1" when $use_gpu is true, empty otherwise).
  # (With many models, combining on a GPU can give an out-of-memory error.)

  $cmd $combine_queue_opt $dir/log/combine.log \
    nnet3-combine --num-iters=40 \
       --enforce-sum-to-one=true --enforce-positive-weights=true \
       --verbose=3 "${nnets_list[@]}" "ark,bg:nnet3-merge-egs --minibatch-size=1024 ark:$cur_egs_dir/combine.egs ark:-|" \
    "|nnet3-am-copy --set-raw-nnet=- $dir/$num_iters.mdl $dir/combined.mdl" || exit 1;

  # Compute the probability of the final, combined model with
  # the same subset we used for the previous compute_probs, as the
  # different subsets will lead to different probs.
  $cmd $dir/log/compute_prob_valid.final.log \
    nnet3-compute-prob "nnet3-am-copy --raw=true $dir/combined.mdl -|" \
    "ark,bg:nnet3-merge-egs ark:$cur_egs_dir/valid_diagnostic.egs ark:- |" &
  $cmd $dir/log/compute_prob_train.final.log \
    nnet3-compute-prob  "nnet3-am-copy --raw=true $dir/combined.mdl -|" \
    "ark,bg:nnet3-merge-egs ark:$cur_egs_dir/train_diagnostic.egs ark:- |" &
fi

if [ $stage -le $[$num_iters+1] ]; then
  echo "Getting average posterior for purposes of adjusting the priors."
  # Note: this just uses CPUs, using a smallish subset of data.
  if [ $num_jobs_compute_prior -gt $num_archives ]; then egs_part=1;
  else egs_part=JOB; fi
  rm $dir/post.$x.*.vec 2>/dev/null
  $cmd JOB=1:$num_jobs_compute_prior $prior_queue_opt $dir/log/get_post.$x.JOB.log \
    nnet3-copy-egs --frame=random $context_opts --srand=JOB ark:$cur_egs_dir/egs.$egs_part.ark ark:- \| \
    nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \
    nnet3-merge-egs ark:- ark:- \| \
    nnet3-compute-from-egs $prior_gpu_opt --apply-exp=true \
      "nnet3-am-copy --raw=true $dir/combined.mdl -|" ark:- ark:- \| \
    matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1;

  sleep 3;  # make sure there is time for $dir/post.$x.*.vec to appear.

  $cmd $dir/log/vector_sum.$x.log \
   vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1;

  rm $dir/post.$x.*.vec;

  echo "Re-adjusting priors based on computed posteriors"
  $cmd $dir/log/adjust_priors.final.log \
    nnet3-am-adjust-priors $dir/combined.mdl $dir/post.$x.vec $dir/final.mdl || exit 1;
fi


if [ ! -f $dir/final.mdl ]; then
  echo "$0: $dir/final.mdl does not exist."
  # we don't want to clean up if the training didn't succeed.
  exit 1;
fi

sleep 2

echo Done

# Optional cleanup after a successful run (final.mdl exists at this point):
# remove the egs directory and most of the intermediate models.
if $cleanup; then
  echo Cleaning up data
  # Only remove egs that live under $dir (i.e. not a user-supplied egs dir).
  # NOTE(review): =~ is a regex match, so 'egs*' means "eg" followed by zero
  # or more "s" and the pattern is unanchored; a glob match was presumably
  # intended.
  if $remove_egs && [[ $cur_egs_dir =~ $dir/egs* ]]; then
    steps/nnet2/remove_egs.sh $cur_egs_dir
  fi

  echo Removing most of the models
  for x in `seq 0 $num_iters`; do
    if [ $[$x%100] -ne 0 ] && [ $x -ne $num_iters ] && [ -f $dir/$x.mdl ]; then
      # delete every model except each 100th one and the last one
      # ($num_iters.mdl).  Note that this includes the models that were
      # combined to form the final model, other than those two cases.
      rm $dir/$x.mdl
    fi
  done
fi

steps/info/nnet3_dir_info.pl $dir

exit 0


================================================
FILE: egs/steps/nnet3/tdnn/train_raw_nnet.sh
================================================
#!/usr/bin/env bash

# THIS SCRIPT IS DEPRECATED, see ../train_raw_dnn.py

# note, TDNN is the same as what we used to call multisplice.
# THIS SCRIPT IS DEPRECATED, see ../train_raw_dnn.py

# Copyright 2012-2015  Johns Hopkins University (Author: Daniel Povey).
#           2013  Xiaohui Zhang
#           2013  Guoguo Chen
#           2014-2016  Vimal Manohar
#           2014  Vijayaditya Peddinti
# Apache 2.0.


# Begin configuration section.
cmd=run.pl
num_epochs=15      # Number of epochs of training;
                   # the number of iterations is worked out from this.
initial_effective_lrate=0.01
final_effective_lrate=0.001
rand_prune=4.0 # Relates to a speedup we do for LDA.
minibatch_size=512  # This default is suitable for GPU-based training.
                    # Set it to 128 for multi-threaded CPU-based training.
max_param_change=2.0  # max param change per minibatch
samples_per_iter=400000 # each iteration of training, see this many samples
                        # per job.  This option is passed to get_egs.sh
num_jobs_initial=1  # Number of neural net jobs to run in parallel at the start of training
num_jobs_final=8   # Number of neural net jobs to run in parallel at the end of training
prior_subset_size=20000 # 20k samples per job, for computing priors.
num_jobs_compute_prior=10 # these are single-threaded, run on CPU.
get_egs_stage=0    # can be used for rerunning after partial
online_ivector_dir=
remove_egs=true  # set to false to disable removing egs after training is done.

max_models_combine=20 # The "max_models_combine" is the maximum number of models we give
  # to the final 'combine' stage, but these models will themselves be averages of
  # iteration-number ranges.

shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
                # on each iter.  You could set it to 0 or to a large value for complete
                # randomization, but this would both consume memory and cause spikes in
                # disk I/O.  Smaller is easier on disk and memory but less random.  It's
                # not a huge deal though, as samples are anyway randomized right at the start.
                # (the point of this is to get data in different minibatches on different iterations,
                # since in the preconditioning method, 2 samples in the same minibatch can
                # affect each others' gradients.

add_layers_period=2 # by default, add new layers every 2 iterations.
stage=-6
exit_stage=-100 # you can set this to terminate the training early.  Exits before running this stage

chunk_training=false  # if true training is done with chunk randomization, rather than frame randomization

randprune=4.0 # speeds up LDA.
use_gpu=true    # if true, we run on GPU.
cleanup=true
egs_dir=
configs_dir=
max_lda_jobs=10  # use no more than 10 jobs for the LDA accumulation.
lda_opts=
egs_opts=
transform_dir=     # If supplied, this dir used instead of alidir to find transforms.
cmvn_opts=  # will be passed to get_lda.sh and get_egs.sh, if supplied.
frames_per_eg=8 # to be passed on to get_egs.sh

# Raw nnet training options i.e. without transition model
nj=4
dense_targets=true        # Use dense targets instead of sparse targets

# End configuration section.

trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM

echo "$0: THIS SCRIPT IS DEPRECATED"
echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# Exactly three positional arguments are required; otherwise print the usage
# message and exit.  The defaults shown here mirror the configuration section
# at the top of this script, and only options that have a variable in that
# section are listed (parse_options.sh does not accept others).
# <targets-scp> is a plain file name: the script checks it with "-f" and adds
# the "scp:" prefix itself where needed.
if [ $# != 3 ]; then
  echo "$0: THIS SCRIPT IS DEPRECATED, see ../train_raw_dnn.py"
  echo "Usage: $0 [opts] <data> <targets-scp> <exp-dir>"
  echo " e.g.: $0 data/train snr_targets/targets.scp exp/nnet3_snr_predictor"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --num-epochs <#epochs|15>                        # Number of epochs of training"
  echo "  --initial-effective-lrate <lrate|0.01>           # effective learning rate at start of training."
  echo "  --final-effective-lrate <lrate|0.001>            # effective learning rate at end of training."
  echo "  --add-layers-period <#iters|2>                   # Number of iterations between adding hidden layers"
  echo "  --num-jobs-initial <num-jobs|1>                  # Number of parallel jobs to use for neural net training, at the start."
  echo "  --num-jobs-final <num-jobs|8>                    # Number of parallel jobs to use for neural net training, at the end"
  echo "  --minibatch-size <minibatch-size|512>            # Size of minibatch to process."
  echo "  --samples-per-iter <#samples|400000>             # Number of samples of data to process per iteration, per"
  echo "                                                   # process."
  echo "  --stage <stage|-6>                               # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."


  exit 1;
fi

data=$1
targets_scp=$2
dir=$3

# Check some files.
for f in $data/feats.scp $targets_scp; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

# Split the data directory into $nj parts and create the experiment's log
# directory; $dir/num_jobs records the number of jobs used.  Stop right away
# if either step fails rather than continuing with a half-prepared setup.
sdata=$data/split$nj
utils/split_data.sh $data $nj || exit 1

mkdir -p $dir/log || exit 1
echo $nj > $dir/num_jobs


# First work out the feature and iVector dimension, needed for tdnn config creation.
feat_dim=$(feat-to-dim --print-args=false scp:$data/feats.scp -) || \
      { echo "$0: Error getting feature dim"; exit 1; }

if [ -z "$online_ivector_dir" ]; then
  ivector_dim=0
else
  ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1;
  steps/nnet2/get_ivector_id.sh $online_ivector_dir > $dir/final.ie.id || exit 1
fi

if [ ! -z "$configs_dir" ]; then
  cp -rT $configs_dir $dir/configs || exit 1
fi

if [ $stage -le -5 ]; then
  # Initialize as "raw" nnet, prior to training the LDA-like preconditioning
  # matrix.  This first config just does any initial splicing that we do;
  # we do this as it's a convenient way to get the stats for the 'lda-like'
  # transform.
  $cmd $dir/log/nnet_init.log \
    nnet3-init --srand=-2 $dir/configs/init.config $dir/init.raw || exit 1;
fi

# sourcing the "vars" below sets
# model_left_context=(something)
# model_right_context=(something)
# num_hidden_layers=(something)
# num_targets=(something)
# add_lda=(true|false)
# include_log_softmax=(true|false)
# objective_type=(something)
. $dir/configs/vars || exit 1;
left_context=$model_left_context
right_context=$model_right_context

[ -z "$num_targets" ] && echo "\$num_targets is not defined. Needs to be defined in $dir/configs/vars." && exit 1
[ -z "$add_lda" ] && echo "\$add_lda is not defined. Needs to be defined in $dir/configs/vars." && exit 1
[ -z "$include_log_softmax" ] && echo "\$include_log_softmax is not defined. Needs to be defined in $dir/configs/vars." && exit 1
[ -z "$objective_type" ] && echo "\$objective_type is not defined. Needs to be defined in $dir/configs/vars." && exit 1

context_opts="--left-context=$left_context --right-context=$right_context"

! [ "$num_hidden_layers" -gt 0 ] && echo \
 "$0: Expected num_hidden_layers to be defined" && exit 1;

if $dense_targets; then
  tmp_num_targets=`feat-to-dim scp:$targets_scp - 2>/dev/null` || exit 1

  if [ $tmp_num_targets -ne $num_targets ]; then
    echo "Mismatch between num-targets provided to script vs configs"
    exit 1
  fi
fi

if [ $stage -le -4 ] && [ -z "$egs_dir" ]; then
  extra_opts=()
  [ ! -z "$cmvn_opts" ] && extra_opts+=(--cmvn-opts "$cmvn_opts")
  [ ! -z "$online_ivector_dir" ] && extra_opts+=(--online-ivector-dir $online_ivector_dir)
  extra_opts+=(--transform-dir "$transform_dir")
  extra_opts+=(--left-context $left_context)
  extra_opts+=(--right-context $right_context)
  echo "$0: calling get_egs.sh"

  if $dense_targets; then
    target_type=dense
  else
    target_type=sparse
  fi

  steps/nnet3/get_egs_targets.sh $egs_opts "${extra_opts[@]}" \
    --samples-per-iter $samples_per_iter --stage $get_egs_stage \
    --cmd "$cmd" --nj $nj \
    --frames-per-eg $frames_per_eg \
    --target-type $target_type --num-targets $num_targets \
    $data $targets_scp $dir/egs || exit 1;
fi

# If no egs directory was supplied (--egs-dir), fall back to $dir/egs.  The
# expansion is quoted so that the test is well-formed both when the variable
# is empty and when the path contains whitespace.
[ -z "$egs_dir" ] && egs_dir=$dir/egs

if [ ! -z "$online_ivector_dir" ] ; then
  steps/nnet2/check_ivectors_compatible.sh $online_ivector_dir $egs_dir/info || exit 1
fi


if [ "$feat_dim" != "$(cat $egs_dir/info/feat_dim)" ]; then
  echo "$0: feature dimension mismatch with egs, $feat_dim vs $(cat $egs_dir/info/feat_dim)";
  exit 1;
fi
if [ "$ivector_dim" != "$(cat $egs_dir/info/ivector_dim)" ]; then
  echo "$0: ivector dimension mismatch with egs, $ivector_dim vs $(cat $egs_dir/info/ivector_dim)";
  exit 1;
fi

# copy any of the following that exist, to $dir.
cp $egs_dir/{cmvn_opts,splice_opts,final.mat} $dir 2>/dev/null

# confirm that the egs_dir has the necessary context (especially important if
# the --egs-dir option was used on the command line).
egs_left_context=$(cat $egs_dir/info/left_context) || exit -1
egs_right_context=$(cat $egs_dir/info/right_context) || exit -1
 ( [ $egs_left_context -lt $left_context ] || \
   [ $egs_right_context -lt $right_context ] ) && \
   echo "$0: egs in $egs_dir have too little context" && exit -1;

# Read the egs geometry from the egs dir's info/ directory; abort if either
# file cannot be read.  Each error message names the file that was missing.
frames_per_eg=$(cat $egs_dir/info/frames_per_eg) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; }
num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/num_archives"; exit 1; }

# num_archives_expanded considers each separate label-position from
# 0..frames_per_eg-1 to be a separate archive.
if [ "$chunk_training" == "true" ]; then
  num_archives_expanded=$num_archives
else
  num_archives_expanded=$[$num_archives*$frames_per_eg]
fi

[ $num_jobs_initial -gt $num_jobs_final ] && \
  echo "$0: --initial-num-jobs cannot exceed --final-num-jobs" && exit 1;

[ $num_jobs_final -gt $num_archives_expanded ] && \
  echo "$0: --final-num-jobs cannot exceed #archives $num_archives_expanded." && exit 1;


if $add_lda && [ $stage -le -3 ]; then
  echo "$0: getting preconditioning matrix for input features."
  num_lda_jobs=$num_archives
  [ $num_lda_jobs -gt $max_lda_jobs ] && num_lda_jobs=$max_lda_jobs

  # Write stats with the same format as stats for LDA.
  $cmd JOB=1:$num_lda_jobs $dir/log/get_lda_stats.JOB.log \
      nnet3-acc-lda-stats --rand-prune=$rand_prune \
        $dir/init.raw "ark:$egs_dir/egs.JOB.ark" $dir/JOB.lda_stats || exit 1;

  all_lda_accs=$(for n in $(seq $num_lda_jobs); do echo $dir/$n.lda_stats; done)
  $cmd $dir/log/sum_transform_stats.log \
    sum-lda-accs $dir/lda_stats $all_lda_accs || exit 1;

  rm $all_lda_accs || exit 1;

  # this computes a fixed affine transform computed in the way we described in
  # Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled variant
  # of an LDA transform but without dimensionality reduction.
  $cmd $dir/log/get_transform.log \
     nnet-get-feature-transform $lda_opts $dir/lda.mat $dir/lda_stats || exit 1;

  ln -sf ../lda.mat $dir/configs/lda.mat
fi


if [ $stage -le -1 ]; then
  # Add the first layer; this will add in the lda.mat
  $cmd $dir/log/add_first_layer.log \
       nnet3-init --srand=-3 $dir/init.raw $dir/configs/layer1.config $dir/0.raw || exit 1;

fi


# set num_iters so that as close as possible, we process the data $num_epochs
# times, i.e. $num_iters*$avg_num_jobs) == $num_epochs*$num_archives_expanded,
# where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2.

num_archives_to_process=$[$num_epochs*$num_archives_expanded]
num_archives_processed=0
num_iters=$[($num_archives_to_process*2)/($num_jobs_initial+$num_jobs_final)]

finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period]

! [ $num_iters -gt $[$finish_add_layers_iter+2] ] \
  && echo "$0: Insufficient epochs" && exit 1

echo "$0: Will train for $num_epochs epochs = $num_iters iterations"

if $use_gpu; then
  parallel_suffix=""
  train_queue_opt="--gpu 1"
  combine_queue_opt="--gpu 1"
  prior_gpu_opt="--use-gpu=yes"
  prior_queue_opt="--gpu 1"
  parallel_train_opts=
  if ! cuda-compiled; then
    echo "$0: WARNING: you are running with one thread but you have not compiled"
    echo "   for CUDA.  You may be running a setup optimized for GPUs.  If you have"
    echo "   GPUs and have nvcc installed, go to src/ and do ./configure; make"
    exit 1
  fi
else
  echo "$0: without using a GPU this will be very slow.  nnet3 does not yet support multiple threads."
  parallel_train_opts="--use-gpu=no"
  combine_queue_opt=""  # the combine stage will be quite slow if not using
                        # GPU, as we didn't enable that program to use
                        # multiple threads.
  prior_gpu_opt="--use-gpu=no"
  prior_queue_opt=""
fi


approx_iters_per_epoch_final=$[$num_archives_expanded/$num_jobs_final]
# First work out how many iterations we want to combine over in the final
# nnet3-combine-fast invocation.  (We may end up subsampling from these if the
# number exceeds max_model_combine).  The number we use is:
# min(max(max_models_combine, approx_iters_per_epoch_final),
#     1/2 * iters_after_last_layer_added)
num_iters_combine=$max_models_combine
if [ $num_iters_combine -lt $approx_iters_per_epoch_final ]; then
   num_iters_combine=$approx_iters_per_epoch_final
fi
half_iters_after_add_layers=$[($num_iters-$finish_add_layers_iter)/2]
if [ $num_iters_combine -gt $half_iters_after_add_layers ]; then
  num_iters_combine=$half_iters_after_add_layers
fi
first_model_combine=$[$num_iters-$num_iters_combine+1]

x=0


compute_accuracy=false
if [ "$objective_type" == "linear" ]; then
  compute_accuracy=true
fi

while [ $x -lt $num_iters ]; do
  [ $x -eq $exit_stage ] && echo "$0: Exiting early due to --exit-stage $exit_stage" && exit 0;

  this_num_jobs=$(perl -e "print int(0.5+$num_jobs_initial+($num_jobs_final-$num_jobs_initial)*$x/$num_iters);")

  ilr=$initial_effective_lrate; flr=$final_effective_lrate; np=$num_archives_processed; nt=$num_archives_to_process;
  this_learning_rate=$(perl -e "print (($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt))*$this_num_jobs);");

  echo "On iteration $x, learning rate is $this_learning_rate."

  if [ $x -ge 0 ] && [ $stage -le $x ]; then

    # Set off jobs doing some diagnostics, in the background.
    # Use the egs dir from the previous iteration for the diagnostics
    $cmd $dir/log/compute_prob_valid.$x.log \
      nnet3-compute-prob --compute-accuracy=$compute_accuracy $dir/$x.raw \
      "ark,bg:nnet3-merge-egs ark:$egs_dir/valid_diagnostic.egs ark:- |" &
    $cmd $dir/log/compute_prob_train.$x.log \
      nnet3-compute-prob --compute-accuracy=$compute_accuracy $dir/$x.raw \
      "ark,bg:nnet3-merge-egs ark:$egs_dir/train_diagnostic.egs ark:- |" &

    if [ $x -gt 0 ]; then
      $cmd $dir/log/progress.$x.log \
        nnet3-show-progress --use-gpu=no $dir/$[x-1].raw $dir/$x.raw \
        "ark,bg:nnet3-merge-egs ark:$egs_dir/train_diagnostic.egs ark:-|" '&&' \
        nnet3-info $dir/$x.raw &
    fi

    echo "Training neural net (pass $x)"

    if [ $x -gt 0 ] && \
      [ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \
      [ $[$x%$add_layers_period] -eq 0 ]; then
      do_average=false # if we've just mixed up, don't do averaging but take the
                       # best.
      cur_num_hidden_layers=$[1+$x/$add_layers_period]
      config=$dir/configs/layer$cur_num_hidden_layers.config
      raw="nnet3-copy --learning-rate=$this_learning_rate $dir/$x.raw - | nnet3-init --srand=$x - $config - |"
    else
      do_average=true
      if [ $x -eq 0 ]; then do_average=false; fi # on iteration 0, pick the best, don't average.
      raw="nnet3-copy --learning-rate=$this_learning_rate $dir/$x.raw -|"
    fi
    if $do_average; then
      this_minibatch_size=$minibatch_size
    else
      # on iteration zero or when we just added a layer, use a smaller minibatch
      # size (and we will later choose the output of just one of the jobs): the
      # model-averaging isn't always helpful when the model is changing too fast
      # (i.e. it can worsen the objective function), and the smaller minibatch
      # size will help to keep the update stable.
      this_minibatch_size=$[$minibatch_size/2];
    fi

    rm $dir/.error 2>/dev/null


    ( # this sub-shell is so that when we "wait" below,
      # we only wait for the training jobs that we just spawned,
      # not the diagnostic jobs that we spawned above.

      # We can't easily use a single parallel SGE job to do the main training,
      # because the computation of which archive and which --frame option
      # to use for each job is a little complex, so we spawn each one separately.
      for n in $(seq $this_num_jobs); do
        k=$[$num_archives_processed + $n - 1]; # k is a zero-based index that we'll derive
                                               # the other indexes from.
        archive=$[($k%$num_archives)+1]; # work out the 1-based archive index.
        frame=$[(($k/$num_archives)%$frames_per_eg)]; # work out the 0-based frame
        # index; this increases more slowly than the archive index because the
        # same archive with different frame indexes will give similar gradients,
        # so we want to separate them in time.

        $cmd $train_queue_opt $dir/log/train.$x.$n.log \
          nnet3-train $parallel_train_opts \
          --max-param-change=$max_param_change "$raw" \
          "ark,bg:nnet3-copy-egs --frame=$frame $context_opts ark:$egs_dir/egs.$archive.ark ark:- | nnet3-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-merge-egs --minibatch-size=$this_minibatch_size --discard-partial-minibatches=true ark:- ark:- |" \
          $dir/$[$x+1].$n.raw || touch $dir/.error &
      done
      wait
    )
    # the error message below is not that informative, but $cmd will
    # have printed a more specific one.
    [ -f $dir/.error ] && echo "$0: error on iteration $x of training" && exit 1;

    nnets_list=
    for n in `seq 1 $this_num_jobs`; do
      nnets_list="$nnets_list $dir/$[$x+1].$n.raw"
    done

    if $do_average; then
      # average the output of the different jobs.
      $cmd $dir/log/average.$x.log \
        nnet3-average $nnets_list $dir/$[x+1].raw || exit 1;
    else
      # choose the best from the different jobs.
      n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) {
          $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn";
          undef $logprob; while (<F>) { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } }
          close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob;
          $best_n=$n; } } print "$best_n\n"; ' $this_num_jobs $dir/log/train.$x.%d.log) || exit 1;
      [ -z "$n" ] && echo "Error getting best model" && exit 1;
      $cmd $dir/log/select.$x.log \
        nnet3-copy $dir/$[$x+1].$n.raw $dir/$[$x+1].raw || exit 1;
    fi

    rm $nnets_list
    [ ! -f $dir/$[$x+1].raw ] && exit 1;
    if [ -f $dir/$[$x-1].raw ] && $cleanup && \
       [ $[($x-1)%100] -ne 0  ] && [ $[$x-1] -lt $first_model_combine ]; then
      rm $dir/$[$x-1].raw
    fi
  fi
  x=$[$x+1]
  num_archives_processed=$[$num_archives_processed+$this_num_jobs]
done

if [ $stage -le $num_iters ]; then
  echo "Doing final combination to produce final.raw"

  # Now do combination.  In the nnet3 setup, the logic
  # for doing averaging of subsets of the models in the case where
  # there are too many models to reliably estimate interpolation
  # factors (max_models_combine) is moved into the nnet3-combine
  # program itself.
  nnets_list=()
  for n in $(seq 0 $[num_iters_combine-1]); do
    iter=$[$first_model_combine+$n]
    nnet=$dir/$iter.raw
    [ ! -f $nnet ] && echo "Expected $nnet to exist" && exit 1;
    nnets_list[$n]=$nnet
  done

  # Below, we use --use-gpu=no to disable nnet3-combine-fast from using a GPU,
  # as if there are many models it can give out-of-memory error; and we set
  # num-threads to 8 to speed it up (this isn't ideal...)

  $cmd $combine_queue_opt $dir/log/combine.log \
    nnet3-combine --num-iters=40 \
    --enforce-sum-to-one=true --enforce-positive-weights=true \
    --verbose=3 "${nnets_list[@]}" "ark,bg:nnet3-merge-egs --minibatch-size=1024 ark:$egs_dir/combine.egs ark:-|" \
    $dir/final.raw || exit 1;

  # Compute the probability of the final, combined model with
  # the same subset we used for the previous compute_probs, as the
  # different subsets will lead to different probs.
  $cmd $dir/log/compute_prob_valid.final.log \
    nnet3-compute-prob --compute-accuracy=$compute_accuracy $dir/final.raw \
    "ark,bg:nnet3-merge-egs ark:$egs_dir/valid_diagnostic.egs ark:- |" &
  $cmd $dir/log/compute_prob_train.final.log \
    nnet3-compute-prob --compute-accuracy=$compute_accuracy $dir/final.raw \
    "ark,bg:nnet3-merge-egs ark:$egs_dir/train_diagnostic.egs ark:- |" &
fi

if $include_log_softmax && [ $stage -le $[$num_iters+1] ]; then
  echo "Getting average posterior for purpose of using as prior to convert posteriors to likelihoods."
  # Note: this just uses CPUs, using a smallish subset of data.
  if [ $num_jobs_compute_prior -gt $num_archives ]; then egs_part=1;
  else egs_part=JOB; fi
  rm $dir/post.$x.*.vec 2>/dev/null
  $cmd JOB=1:$num_jobs_compute_prior $prior_queue_opt $dir/log/get_post.$x.JOB.log \
    nnet3-copy-egs --frame=random $context_opts --srand=JOB ark:$egs_dir/egs.$egs_part.ark ark:- \| \
    nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \
    nnet3-merge-egs ark:- ark:- \| \
    nnet3-compute-from-egs $prior_gpu_opt --apply-exp=true \
    $dir/final.raw ark:- ark:- \| \
    matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1;

  sleep 3;  # make sure there is time for $dir/post.$x.*.vec to appear.

  $cmd $dir/log/vector_sum.$x.log \
   vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1;

  rm -f $dir/post.$x.*.vec;

fi


if [ ! -f $dir/final.raw ]; then
  echo "$0: $dir/final.raw does not exist."
  # we don't want to clean up if the training didn't succeed.
  exit 1;
fi

sleep 2

echo Done

# Optional end-of-run cleanup: remove the egs (only when they live under $dir
# and --remove-egs is set) and most of the intermediate models.
if $cleanup; then
  echo Cleaning up data
  if $remove_egs && [[ $egs_dir =~ $dir/egs* ]]; then
    steps/nnet2/remove_egs.sh $egs_dir
  fi

  echo Removing most of the models
  for x in `seq 0 $num_iters`; do
    # Delete all but every 100th model; don't delete the ones which combine to
    # form the final model (iterations >= $first_model_combine), nor the last
    # iteration.  This script only ever writes $x.raw, so the existence test
    # must be on the .raw file: testing $x.mdl made this loop a no-op.
    if [ $[$x%100] -ne 0 ] && [ $x -ne $num_iters ] && \
       [ $x -lt $first_model_combine ] && [ -f $dir/$x.raw ]; then
      rm $dir/$x.raw
    fi
  done
fi


================================================
FILE: egs/steps/nnet3/train_discriminative.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2014  Johns Hopkins University (Author: Daniel Povey)
#           2014-2015  Vimal Manohar
# Apache 2.0.

set -o pipefail

# This script does MPE or MMI or state-level minimum bayes risk (sMBR) training
# using egs obtained by steps/nnet3/get_egs_discriminative.sh

# Begin configuration section.
cmd=run.pl
num_epochs=4       # Number of epochs of training;
                   # the number of iterations is worked out from this.
                   # Be careful with this: we actually go over the data
                   # num-epochs * frame-subsampling-factor times, due to
                   # using different data-shifts.
use_gpu=true
apply_deriv_weights=true
use_frame_shift=false
run_diagnostics=true
learning_rate=0.00002
max_param_change=2.0
scale_max_param_change=false # if this option is used, scale it by num-jobs.

effective_lrate=    # If supplied, overrides the learning rate, which gets set to effective_lrate * num_jobs_nnet.
acoustic_scale=0.1  # acoustic scale for MMI/MPFE/SMBR training.
boost=0.0       # option relevant for MMI

criterion=smbr
drop_frames=false #  option relevant for MMI
one_silence_class=true # option relevant for MPE/SMBR
num_jobs_nnet=4    # Number of neural net jobs to run in parallel.  Note: this
                   # will interact with the learning rates (if you decrease
                   # this, you'll have to decrease the learning rate, and vice
                   # versa).
regularization_opts=
minibatch_size=64  # This is the number of examples rather than the number of output frames.
last_layer_factor=1.0  # relates to modify-learning-rates [deprecated]
shuffle_buffer_size=1000 # This "buffer_size" variable controls randomization of the samples
                # on each iter.  You could set it to 0 or to a large value for complete
                # randomization, but this would both consume memory and cause spikes in
                # disk I/O.  Smaller is easier on disk and memory but less random.  It's
                # not a huge deal though, as samples are anyway randomized right at the start.


stage=-3

num_threads=16  # this is the default but you may want to change it, e.g. to 1 if
                # using GPUs.

cleanup=true
keep_model_iters=100
remove_egs=false
src_model=  # will default to $degs_dir/final.mdl

num_jobs_compute_prior=10

# NOTE(review): the two variables below are declared here but not referenced
# anywhere else in this script; the training command instead passes
# --optimization.min-deriv-time / --optimization.max-deriv-time-relative
# derived from the source model's left/right context.
min_deriv_time=0
max_deriv_time_relative=0
# End configuration section.


echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;


# Exactly two positional arguments (<degs-dir> <exp-dir>) are required;
# otherwise print the usage message and exit with status 1.
# The defaults quoted for --learning-rate, --num-jobs-nnet and
# --one-silence-class match the values set in the configuration section above.
if [ $# != 2 ]; then
  echo "Usage: $0 [opts] <degs-dir> <exp-dir>"
  echo " e.g.: $0 exp/nnet3/tdnn_sp_degs exp/nnet3/tdnn_sp_smbr"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --num-epochs <#epochs|4>                        # Number of epochs of training"
  echo "  --learning-rate <learning-rate|0.00002>          # Learning rate to use"
  echo "  --effective-lrate <effective-learning-rate>      # If supplied, learning rate will be set to"
  echo "                                                   # this value times num-jobs-nnet."
  echo "  --num-jobs-nnet <num-jobs|4>                     # Number of parallel jobs to use for main neural net"
  echo "                                                   # training (will affect results as well as speed; try 8, 16)"
  echo "                                                   # Note: if you increase this, you may want to also increase"
  echo "                                                   # the learning rate.  Also note: if there are fewer archives"
  echo "                                                   # of egs than this, it will get reduced automatically."
  echo "  --num-threads <num-threads|16>                   # Number of parallel threads per job (will affect results"
  echo "                                                   # as well as speed; may interact with batch size; if you increase"
  echo "                                                   # this, you may want to decrease the batch size.  With GPU, must be 1."
  echo "  --parallel-opts <opts|\"--num-threads 16 --mem 1G\">      # extra options to pass to e.g. queue.pl for processes that"
  echo "                                                   # use multiple threads... "
  echo "  --stage <stage|-3>                               # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."
  echo "  --criterion <criterion|smbr>                     # Training criterion: may be smbr, mmi or mpfe"
  echo "  --boost <boost|0.0>                              # Boosting factor for MMI (e.g., 0.1)"
  echo "  --drop-frames <true,false|false>                 # Option that affects MMI training: if true, we exclude gradients from frames"
  echo "                                                   # where the numerator transition-id is not in the denominator lattice."
  echo "  --one-silence-class <true,false|true>            # Option that affects MPE/SMBR training (will tend to reduce insertions)"
  echo "  --modify-learning-rates <true,false|false>       # If true, modify learning rates to try to equalize relative"
  echo "                                                   # changes across layers. [deprecated]"
  exit 1;
fi

degs_dir=$1
dir=$2

[ -z "$src_model" ] && src_model=$degs_dir/final.mdl

# Bail out early if any required input (first degs archive, the info files,
# or the source model) is missing.
for f in \
    $degs_dir/degs.1.ark \
    $degs_dir/info/num_archives \
    $degs_dir/info/silence.csl \
    $degs_dir/info/frame_subsampling_factor \
    $src_model; do
  if [ ! -f $f ]; then
    echo "$0: no such file $f" && exit 1;
  fi
done

mkdir -p $dir/log || exit 1;


model_left_context=$(nnet3-am-info $src_model | grep "^left-context:" | awk '{print $2}')
model_right_context=$(nnet3-am-info $src_model | grep "^right-context:" | awk '{print $2}')

# Copy the ivector information
if [ -f $degs_dir/info/final.ie.id ]; then
  cp $degs_dir/info/final.ie.id $dir/ 2>/dev/null || true
fi

# copy some things
for f in splice_opts cmvn_opts tree final.mat; do
  if [ -f $degs_dir/$f ]; then
    cp $degs_dir/$f $dir/ || exit 1;
  fi
done

silphonelist=`cat $degs_dir/info/silence.csl` || exit 1;

num_archives=$(cat $degs_dir/info/num_archives) || exit 1;
frame_subsampling_factor=$(cat $degs_dir/info/frame_subsampling_factor)

echo $frame_subsampling_factor > $dir/frame_subsampling_factor

if $use_frame_shift; then
  num_archives_expanded=$[$num_archives*$frame_subsampling_factor]
else
  num_archives_expanded=$num_archives
fi

# Never run more parallel jobs than there are (expanded) archives to hand out.
# The message reports the value actually assigned, $num_archives_expanded,
# which differs from $num_archives when --use-frame-shift is true.
if [ $num_jobs_nnet -gt $num_archives_expanded ]; then
  echo "$0: num-jobs-nnet $num_jobs_nnet exceeds number of archives $num_archives_expanded,"
  echo " ... setting it to $num_archives_expanded."
  num_jobs_nnet=$num_archives_expanded
fi

num_archives_to_process=$[$num_epochs*$num_archives_expanded]
num_archives_processed=0
num_iters=$[$num_archives_to_process/$num_jobs_nnet]

echo "$0: Will train for $num_epochs epochs = $num_iters iterations"

# Choose the queue and trainer options for CPU or GPU training.
if ! $use_gpu; then
  echo "$0: without using a GPU this will be very slow.  nnet3 does not yet support multiple threads."
  parallel_train_opts="--use-gpu=no"
else
  parallel_suffix=""
  train_queue_opt="--gpu 1"
  parallel_train_opts=
  # Asking for GPU training with binaries not compiled for CUDA is fatal.
  if ! cuda-compiled; then
    echo "$0: WARNING: you are running with one thread but you have not compiled"
    echo "   for CUDA.  You may be running a setup optimized for GPUs.  If you have"
    echo "   GPUs and have nvcc installed, go to src/ and do ./configure; make"
    exit 1
  fi
fi

# One "expanded" epoch is one pass over the $num_archives archives; when frame
# shifts are used there are frame_subsampling_factor passes per nominal epoch.
num_epochs_expanded=$num_epochs
if $use_frame_shift; then
  num_epochs_expanded=$((num_epochs * frame_subsampling_factor))
fi

# iter_to_epoch[x] = e records that expanded epoch e completes at iteration x.
for e in $(seq 1 $num_epochs_expanded); do
  x=$(( (e * num_archives) / num_jobs_nnet ))
  iter_to_epoch[$x]=$e
done

if [ $stage -le -1 ]; then
  echo "$0: Copying initial model and modifying preconditioning setup"

  # Note, the baseline model probably had preconditioning, and we'll keep it;
  # but we want online preconditioning with a larger number of samples of
  # history, since in this setup the frames are only randomized at the segment
  # level so they are highly correlated.  It might make sense to tune this a
  # little, later on, although I doubt it matters once the --num-samples-history
  # is large enough.

  if [ ! -z "$effective_lrate" ]; then
    learning_rate=$(perl -e "print ($num_jobs_nnet*$effective_lrate);")
    echo "$0: setting learning rate to $learning_rate = --num-jobs-nnet * --effective-lrate."
  fi


  # set the learning rate to $learning_rate, and
  # set the output-layer's learning rate to
  # $learning_rate times $last_layer_factor.
  edits_str="set-learning-rate learning-rate=$learning_rate"
  if [ "$last_layer_factor" != "1.0" ]; then
    last_layer_lrate=$(perl -e "print ($learning_rate*$last_layer_factor);") || exit 1
    edits_str="$edits_str; set-learning-rate name=output.affine learning-rate=$last_layer_lrate"
  fi

  $cmd $dir/log/convert.log \
    nnet3-am-copy --edits="$edits_str" "$src_model" $dir/0.mdl || exit 1;

  ln -sf 0.mdl $dir/epoch0.mdl
fi


rm $dir/.error 2>/dev/null

x=0

# Main training loop: one pass per iteration x in [0, num_iters).  Iterations
# below $stage skip the training body but still run the bookkeeping at the
# bottom (cache removal, counters), so the archive schedule is the same whether
# or not the run was resumed with --stage.
while [ $x -lt $num_iters ]; do
  if [ $stage -le $x ]; then
    if $run_diagnostics; then
      # Set off jobs doing some diagnostics, in the background: the objective
      # of the current model ($x.mdl) on the valid_diagnostic and
      # train_diagnostic degs.
      $cmd $dir/log/compute_objf_valid.$x.log \
        nnet3-discriminative-compute-objf  $regularization_opts \
        --silence-phones=$silphonelist \
        --criterion=$criterion --drop-frames=$drop_frames \
        --one-silence-class=$one_silence_class \
        --boost=$boost --acoustic-scale=$acoustic_scale \
        $dir/$x.mdl \
        "ark,bg:nnet3-discriminative-copy-egs ark:$degs_dir/valid_diagnostic.degs ark:- | nnet3-discriminative-merge-egs --minibatch-size=1:64 ark:- ark:- |" &
      $cmd $dir/log/compute_objf_train.$x.log \
        nnet3-discriminative-compute-objf  $regularization_opts \
        --silence-phones=$silphonelist \
        --criterion=$criterion --drop-frames=$drop_frames \
        --one-silence-class=$one_silence_class \
        --boost=$boost --acoustic-scale=$acoustic_scale \
        $dir/$x.mdl \
        "ark,bg:nnet3-discriminative-copy-egs ark:$degs_dir/train_diagnostic.degs ark:- | nnet3-discriminative-merge-egs --minibatch-size=1:64 ark:- ark:- |" &
    fi

    # From the second iteration on, also log (in the background) the progress
    # between the previous and the current model, followed by nnet3-info.
    if [ $x -gt 0 ]; then
      $cmd $dir/log/progress.$x.log \
        nnet3-show-progress --use-gpu=no "nnet3-am-copy --raw=true $dir/$[$x-1].mdl - |" "nnet3-am-copy --raw=true $dir/$x.mdl - |" \
        '&&' \
        nnet3-info "nnet3-am-copy --raw=true $dir/$x.mdl - |" &
    fi


    echo "Training neural net (pass $x)"

    # Every job reads the computation cache for this iteration; it is the file
    # that job 1 of the previous iteration wrote (see cache_write_opt below).
    cache_read_opt="--read-cache=$dir/cache.$x"

    ( # this sub-shell is so that when we "wait" below,
      # we only wait for the training jobs that we just spawned,
      # not the diagnostic jobs that we spawned above.

      # We can't easily use a single parallel SGE job to do the main training,
      # because the computation of which archive and which --frame option
      # to use for each job is a little complex, so we spawn each one separately.
      for n in `seq $num_jobs_nnet`; do
        k=$[$num_archives_processed + $n - 1]; # k is a zero-based index that we'll derive
                                               # the other indexes from.
        archive=$[($k%$num_archives)+1]; # work out the 1-based archive index.

        if [ $n -eq 1 ]; then
          # an option for writing cache (storing pairs of nnet-computations and
          # computation-requests) during training.
          cache_write_opt=" --write-cache=$dir/cache.$[$x+1]"
        else
          cache_write_opt=""
        fi

        # With frame shifts enabled the shift depends on both the archive
        # position (k % num_archives) and the pass number (k / num_archives),
        # taken modulo frame_subsampling_factor; otherwise no shift is applied.
        if $use_frame_shift; then
          frame_shift=$[(k%num_archives + k/num_archives) % frame_subsampling_factor]
        else
          frame_shift=0
        fi

        #archive=$[(($n+($x*$num_jobs_nnet))%$num_archives)+1]
        # Optionally scale the max-param-change limit by the number of jobs.
        if $scale_max_param_change; then
          this_max_param_change=$(perl -e "print ($max_param_change * $num_jobs_nnet);")
        else
          this_max_param_change=$max_param_change
        fi

        # Launch job n in the background; any failure is recorded by creating
        # $dir/.error, which is checked after the "wait" below.
        $cmd $train_queue_opt $dir/log/train.$x.$n.log \
          nnet3-discriminative-train $cache_read_opt $cache_write_opt \
          --apply-deriv-weights=$apply_deriv_weights \
          --optimization.min-deriv-time=-$model_left_context \
          --optimization.max-deriv-time-relative=$model_right_context \
            $parallel_train_opts \
          --max-param-change=$this_max_param_change \
          --silence-phones=$silphonelist \
          --criterion=$criterion --drop-frames=$drop_frames \
          --one-silence-class=$one_silence_class \
          --boost=$boost --acoustic-scale=$acoustic_scale $regularization_opts \
          $dir/$x.mdl \
          "ark,bg:nnet3-discriminative-copy-egs --frame-shift=$frame_shift ark:$degs_dir/degs.$archive.ark ark:- | nnet3-discriminative-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:- | nnet3-discriminative-merge-egs --minibatch-size=$minibatch_size ark:- ark:- |" \
          $dir/$[$x+1].$n.raw || touch $dir/.error &
      done
      wait
      [ -f $dir/.error ] && exit 1
    )
    # The exit inside the sub-shell only leaves the sub-shell, so the marker
    # file is checked again here to stop the script itself.
    [ -f $dir/.error ] && { echo "Found $dir/.error. See $dir/log/train.$x.*.log"; exit 1; }

    # The per-job raw models written by the training jobs above.
    nnets_list=$(for n in $(seq $num_jobs_nnet); do echo $dir/$[$x+1].$n.raw; done)

    # below use run.pl instead of a generic $cmd for these very quick stages,
    # so that we don't run the risk of waiting for a possibly hard-to-get GPU.
    # The averaged raw nnet is set into a copy of $x.mdl to give $[x+1].mdl.
    run.pl $dir/log/average.$x.log \
      nnet3-average $nnets_list - \| \
      nnet3-am-copy --set-raw-nnet=- $dir/$x.mdl $dir/$[$x+1].mdl || exit 1;

    rm $nnets_list
    [ ! -f $dir/$[$x+1].mdl ] && echo "$0: Did not create $dir/$[$x+1].mdl" && exit 1;
    # Incremental cleanup: drop the model from the previous iteration unless
    # its index is a multiple of $keep_model_iters or it closes an epoch.
    if [ -f $dir/$[$x-1].mdl ] && $cleanup && \
       [ $[($x-1)%$keep_model_iters] -ne 0  ] && \
       [ -z "${iter_to_epoch[$[$x-1]]}" ]; then
      rm $dir/$[$x-1].mdl
    fi

    # $dir/.error may also have been created by a background prior-adjustment
    # job started at the end of an earlier iteration (see below).
    [ -f $dir/.error ] && { echo "Found $dir/.error. Error on iteration $x"; exit 1; }
  fi

  # The cache for this iteration is no longer needed; advance the counters.
  rm $dir/cache.$x 2>/dev/null || true
  x=$[$x+1]
  num_archives_processed=$[num_archives_processed+num_jobs_nnet]

  # If the iteration just completed closes an epoch, link epoch$e.mdl to the
  # new model and re-estimate the priors for it in the background.  A failure
  # there is signalled through $dir/.error.
  if [ $stage -le $x ] && [ ! -z "${iter_to_epoch[$x]}" ]; then
    e=${iter_to_epoch[$x]}
    ln -sf $x.mdl $dir/epoch$e.mdl

    (
      rm $dir/.error 2> /dev/null

      steps/nnet3/adjust_priors.sh --egs-type degs \
        --num-jobs-compute-prior $num_jobs_compute_prior \
        --cmd "$cmd" --use-gpu false \
        --minibatch-size $minibatch_size \
        --use-raw-nnet false --iter epoch$e $dir $degs_dir \
        || { touch $dir/.error; echo "Error in adjusting priors. See errors above."; exit 1; }
    ) &
  fi

done

rm $dir/final.mdl 2>/dev/null
cp $dir/$x.mdl $dir/final.mdl

# Remove egs files that might be soft links: for a link, first delete the path
# that utils/make_absolute.sh reports for it, then delete the given path itself.
#
# The loop variable is local so that calling this function cannot clobber the
# script-level $x (the training iteration counter), and the arguments are
# quoted so that paths containing whitespace are handled as single files.
remove () {
  local f
  for f in "$@"; do
    [ -L "$f" ] && rm "$(utils/make_absolute.sh "$f")"
    rm "$f"
  done
}

if $cleanup && $remove_egs; then  # note: this is false by default.
  echo Removing training examples
  remove $degs_dir/degs.*
  remove $degs_dir/priors_egs.*
fi


if $cleanup; then
  echo Removing most of the models
  for x in `seq 1 $keep_model_iters $num_iters`; do
    if [ -z "${iter_to_epoch[$x]}" ]; then
      # if $x is not an epoch-final iteration..
      rm $dir/$x.mdl 2>/dev/null
    fi
  done
fi

wait
[ -f $dir/.error ] && { echo "Found $dir/.error."; exit 1; }

echo Done && exit 0


================================================
FILE: egs/steps/nnet3/train_dnn.py
================================================
#!/usr/bin/env python

# Copyright 2016    Vijayaditya Peddinti.
#           2016    Vimal Manohar
#           2017 Johns Hopkins University (author: Daniel Povey)
# Apache 2.0.

""" This script is based on steps/nnet3/tdnn/train.sh
"""

from __future__ import print_function
from __future__ import division
import argparse
import logging
import os
import pprint
import shutil
import sys
import traceback

sys.path.insert(0, 'steps')
import libs.nnet3.train.common as common_train_lib
import libs.common as common_lib
import libs.nnet3.train.frame_level_objf as train_lib
import libs.nnet3.report.log_parse as nnet3_log_parse


# Configure the logger named 'libs' to pass INFO-and-above records to a stream
# handler (StreamHandler() with no argument writes to sys.stderr), tagging each
# record with timestamp, source path/line, function name and level.
# NOTE(review): presumably the modules under steps/libs log as 'libs.*' and so
# inherit this handler -- confirm against those modules.
logger = logging.getLogger('libs')
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - "
                              "%(funcName)s - %(levelname)s ] %(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)
# Emitted at import time, i.e. as soon as the script starts.
logger.info('Starting DNN trainer (train_dnn.py)')


def get_args():
    """Build the option parser, parse the command line and post-process it.

    Mandatory inputs are declared as named options for readability.  Options
    shared with the other nnet3 training scripts are inherited from the parent
    parser libs.nnet3.train.common.CommonParser.parser
    (see steps/libs/nnet3/train/common.py); conflicting definitions are
    resolved in favour of the ones added here (conflict_handler='resolve').

    The raw command line is echoed to stderr before parsing.

    Returns:
        The two-element list [args, run_opts] produced by process_args().
    """
    common_parser = common_train_lib.CommonParser(include_chunk_context=False)
    parser = argparse.ArgumentParser(
        description="""Trains a feed forward DNN acoustic model using the
        cross-entropy objective.  DNNs include simple DNNs, TDNNs and CNNs.""",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        conflict_handler='resolve',
        parents=[common_parser.parser])

    # --- egs extraction options ---
    parser.add_argument(
        "--egs.frames-per-eg",
        type=int,
        dest='frames_per_eg',
        default=8,
        help="Number of output labels per example")

    # --- trainer options ---
    parser.add_argument("--trainer.input-model", type=str,
                        dest='input_model', default=None,
                        action=common_lib.NullstrToNoneAction,
                        help="""If specified, this model is used as initial
                        raw model (0.raw in the script) instead of initializing
                        the model from xconfig. Configs dir is not expected to
                        exist and left/right context is computed from this
                        model.""")
    parser.add_argument(
        "--trainer.prior-subset-size",
        type=int,
        dest='prior_subset_size',
        default=20000,
        help="Number of samples for computing priors")
    parser.add_argument(
        "--trainer.num-jobs-compute-prior",
        type=int,
        dest='num_jobs_compute_prior',
        default=10,
        help="The prior computation jobs are single threaded and run on the CPU")

    # --- optimization parameters ---
    parser.add_argument("--trainer.optimization.minibatch-size",
                        type=str, dest='minibatch_size', default='512',
                        help="""Size of the minibatch used in SGD training
                        (argument to nnet3-merge-egs); may be a more general
                        rule as accepted by the --minibatch-size option of
                        nnet3-merge-egs; run that program without args to see
                        the format.""")

    # --- general options ---
    parser.add_argument(
        "--feat-dir",
        type=str,
        required=False,
        help="Directory with features used for training the neural network.")
    parser.add_argument(
        "--lang",
        type=str,
        required=False,
        help="Language directory")
    parser.add_argument(
        "--ali-dir",
        type=str,
        required=True,
        help="Directory with alignments used for training the neural network.")
    parser.add_argument(
        "--dir",
        type=str,
        required=True,
        help="Directory to store the models and all other files.")

    # Echo the invocation twice: once space-joined, once as the raw list.
    print(' '.join(sys.argv), file=sys.stderr)
    print(sys.argv, file=sys.stderr)

    args, run_opts = process_args(parser.parse_args())
    return [args, run_opts]


def process_args(args):
    """ Process the options got from get_args()

    Validates the parsed command-line options and builds the RunOpts object
    that carries the queue/GPU settings used by the training steps.

    Args:
        args: Namespace returned by the argument parser.  args.use_gpu is
            rewritten in place: "true"/"false" become "yes"/"no".

    Returns:
        A two-element list [args, run_opts].

    Raises:
        Exception: if --egs.frames-per-eg is below 1, if the minibatch-size
            string is rejected by validate_minibatch_size_str, if --dir does
            not exist, or if neither {dir}/configs nor an existing
            --trainer.input-model is available.
    """

    if args.frames_per_eg < 1:
        raise Exception("--egs.frames-per-eg should have a minimum value of 1")

    # The value being validated comes from --trainer.optimization.minibatch-size
    # (dest='minibatch_size'), so that is the option the message must name.
    if not common_train_lib.validate_minibatch_size_str(args.minibatch_size):
        raise Exception("--trainer.optimization.minibatch-size has an invalid value")

    if (not os.path.exists(args.dir)):
        raise Exception("Directory specified with --dir={0} "
                        "does not exist.".format(args.dir))
    # We need a source for the network: either the configs directory or an
    # existing input model.
    if (not os.path.exists(args.dir + "/configs") and
        (args.input_model is None or not os.path.exists(args.input_model))):
        raise Exception("Either --trainer.input-model option should be supplied, "
                        "and exist; or the {0}/configs directory should exist."
                        "{0}/configs is the output of make_configs.py"
                        "".format(args.dir))

    # set the options corresponding to args.use_gpu
    run_opts = common_train_lib.RunOpts()
    # Accept the boolean-style spellings and map them to yes/no.
    if args.use_gpu in ["true", "false"]:
        args.use_gpu = ("yes" if args.use_gpu == "true" else "no")
    if args.use_gpu in ["yes", "wait"]:
        # Only a warning: training is still attempted with GPU options.
        if not common_lib.check_if_cuda_compiled():
            logger.warning(
                """You are running with one thread but you have not compiled
                   for CUDA.  You may be running a setup optimized for GPUs.
                   If you have GPUs and have nvcc installed, go to src/ and do
                   ./configure; make""")

        run_opts.train_queue_opt = "--gpu 1"
        run_opts.parallel_train_opts = "--use-gpu={}".format(args.use_gpu)
        run_opts.combine_gpu_opt = "--use-gpu={}".format(args.use_gpu)
        run_opts.combine_queue_opt = "--gpu 1"
        run_opts.prior_gpu_opt = "--use-gpu={}".format(args.use_gpu)
        run_opts.prior_queue_opt = "--gpu 1"

    else:
        logger.warning("Without using a GPU this will be very slow. "
                       "nnet3 does not yet support multiple threads.")

        run_opts.train_queue_opt = ""
        run_opts.parallel_train_opts = "--use-gpu=no"
        run_opts.combine_gpu_opt = "--use-gpu=no"
        run_opts.combine_queue_opt = ""
        run_opts.prior_gpu_opt = "--use-gpu=no"
        run_opts.prior_queue_opt = ""

    run_opts.command = args.command
    # The egs command falls back to the general command when not given.
    run_opts.egs_command = (args.egs_command
                            if args.egs_command is not None else
                            args.command)
    run_opts.num_jobs_compute_prior = args.num_jobs_compute_prior

    return [args, run_opts]


def train(args, run_opts):
    """ The main function for training.

    Runs the pipeline in the order written below: copy phones.txt and the
    tree from the alignment directory, optionally initialize a network from
    configs/init.config, generate egs (unless --egs-dir was supplied),
    compute the preconditioning matrix and the presoftmax prior scale,
    prepare the initial acoustic model, run the training iterations,
    optionally combine models, adjust the priors to write final.mdl, clean
    up, and write an accuracy report.  The preparatory steps and each
    iteration are gated on args.stage.

    Args:
        args: a Namespace object with the required parameters
            obtained from the function process_args()
        run_opts: RunOpts object obtained from the process_args()

    Returns:
        None.  Returns early, before combination, prior adjustment and
        reporting, when the loop reaches args.exit_stage.

    Raises:
        Exception: on missing config variables, a missing --feat-dir when
            egs must be generated, num_jobs_final exceeding the number of
            archives, or a proportional-shrink value that is too large.
    """

    arg_string = pprint.pformat(vars(args))
    logger.info("Arguments for the experiment\n{0}".format(arg_string))

    # Copy phones.txt from ali-dir to dir. Later, steps/nnet3/decode.sh will
    # use it to check compatibility between training and decoding phone-sets.
    shutil.copy('{0}/phones.txt'.format(args.ali_dir), args.dir)

    # Set some variables.
    # num_leaves = common_lib.get_number_of_leaves_from_tree(args.ali_dir)
    num_jobs = common_lib.get_number_of_jobs(args.ali_dir)
    feat_dim = common_lib.get_feat_dim(args.feat_dir)
    ivector_dim = common_lib.get_ivector_dim(args.online_ivector_dir)
    ivector_id = common_lib.get_ivector_extractor_id(args.online_ivector_dir)

    # split the training data into parts for individual jobs
    # we will use the same number of jobs as that used for alignment
    common_lib.execute_command("utils/split_data.sh {0} {1}".format(
        args.feat_dir, num_jobs))
    shutil.copy('{0}/tree'.format(args.ali_dir), args.dir)

    # Record the number of jobs in the experiment directory.
    with open('{0}/num_jobs'.format(args.dir), 'w') as f:
        f.write('{}'.format(num_jobs))

    if args.input_model is None:
        # No input model: read the model variables from {dir}/configs/vars.
        config_dir = '{0}/configs'.format(args.dir)
        var_file = '{0}/vars'.format(config_dir)

        variables = common_train_lib.parse_generic_config_vars_file(var_file)
    else:
        # If args.input_model is specified, the model left and right contexts
        # are computed using input_model.
        variables = common_train_lib.get_input_model_info(args.input_model)

    # Set some variables.
    try:
        model_left_context = variables['model_left_context']
        model_right_context = variables['model_right_context']
    except KeyError as e:
        raise Exception("KeyError {0}: Variables need to be defined in "
                        "{1}".format(str(e), '{0}/configs'.format(args.dir)))

    # The egs context is taken directly from the model context.
    left_context = model_left_context
    right_context = model_right_context

    # Initialize as "raw" nnet, prior to training the LDA-like preconditioning
    # matrix.  This first config just does any initial splicing that we do;
    # we do this as it's a convenient way to get the stats for the 'lda-like'
    # transform.

    if (args.stage <= -5) and os.path.exists(args.dir+"/configs/init.config") and \
       (args.input_model is None):
        logger.info("Initializing a basic network for estimating "
                    "preconditioning matrix")
        common_lib.execute_command(
            """{command} {dir}/log/nnet_init.log \
                    nnet3-init --srand=-2 {dir}/configs/init.config \
                    {dir}/init.raw""".format(command=run_opts.command,
                                             dir=args.dir))

    # Egs are generated into {dir}/egs only when no --egs-dir was supplied.
    default_egs_dir = '{0}/egs'.format(args.dir)
    if (args.stage <= -4) and args.egs_dir is None:
        logger.info("Generating egs")

        # NOTE(review): args.feat_dir has already been passed to
        # get_feat_dim and formatted into the split_data.sh command above,
        # so this check comes after its first uses -- confirm intended.
        if args.feat_dir is None:
            raise Exception("--feat-dir option is required if you don't supply --egs-dir")

        train_lib.acoustic_model.generate_egs(
            data=args.feat_dir, alidir=args.ali_dir, egs_dir=default_egs_dir,
            left_context=left_context, right_context=right_context,
            run_opts=run_opts,
            frames_per_eg_str=str(args.frames_per_eg),
            srand=args.srand,
            egs_opts=args.egs_opts,
            cmvn_opts=args.cmvn_opts,
            online_ivector_dir=args.online_ivector_dir,
            samples_per_iter=args.samples_per_iter,
            stage=args.egs_stage)

    if args.egs_dir is None:
        egs_dir = default_egs_dir
    else:
        egs_dir = args.egs_dir

    # Check the egs against the feature/ivector dims and model context, and
    # read back the properties of the egs directory.
    [egs_left_context, egs_right_context,
     frames_per_eg_str, num_archives] = (
         common_train_lib.verify_egs_dir(egs_dir, feat_dim,
                                         ivector_dim, ivector_id,
                                         left_context, right_context))
    # The egs must have been dumped with the same frames-per-eg as requested.
    assert str(args.frames_per_eg) == frames_per_eg_str

    if args.num_jobs_final > num_archives:
        raise Exception('num_jobs_final cannot exceed the number of archives '
                        'in the egs directory')

    # copy the properties of the egs to dir for
    # use during decoding
    common_train_lib.copy_egs_properties_to_exp_dir(egs_dir, args.dir)

    if args.stage <= -3 and os.path.exists(args.dir+"/configs/init.config") and (args.input_model is None):
        logger.info('Computing the preconditioning matrix for input features')

        train_lib.common.compute_preconditioning_matrix(
            args.dir, egs_dir, num_archives, run_opts,
            max_lda_jobs=args.max_lda_jobs,
            rand_prune=args.rand_prune)

    if args.stage <= -2 and (args.input_model is None):
        logger.info("Computing initial vector for FixedScaleComponent before"
                    " softmax, using priors^{prior_scale} and rescaling to"
                    " average 1".format(
                        prior_scale=args.presoftmax_prior_scale_power))

        common_train_lib.compute_presoftmax_prior_scale(
            args.dir, args.ali_dir, num_jobs, run_opts,
            presoftmax_prior_scale_power=args.presoftmax_prior_scale_power)

    if args.stage <= -1:
        logger.info("Preparing the initial acoustic model.")
        train_lib.acoustic_model.prepare_initial_acoustic_model(
            args.dir, args.ali_dir, run_opts,
            input_model=args.input_model)

    # set num_iters so that as close as possible, we process the data
    # $num_epochs times, i.e. $num_iters*$avg_num_jobs) ==
    # $num_epochs*$num_archives, where
    # avg_num_jobs=(num_jobs_initial+num_jobs_final)/2.
    num_archives_expanded = num_archives * args.frames_per_eg
    num_archives_to_process = int(args.num_epochs * num_archives_expanded)
    num_archives_processed = 0
    num_iters = int(num_archives_to_process * 2 / (args.num_jobs_initial + args.num_jobs_final))

    # If do_final_combination is True, compute the set of models_to_combine.
    # Otherwise, models_to_combine will be none.
    if args.do_final_combination:
        models_to_combine = common_train_lib.get_model_combine_iters(
            num_iters, args.num_epochs,
            num_archives_expanded, args.max_models_combine,
            args.num_jobs_final)
    else:
        models_to_combine = None

    logger.info("Training will run for {0} epochs = "
                "{1} iterations".format(args.num_epochs, num_iters))

    for iter in range(num_iters):
        if (args.exit_stage is not None) and (iter == args.exit_stage):
            logger.info("Exiting early due to --exit-stage {0}".format(iter))
            return

        current_num_jobs = common_train_lib.get_current_num_jobs(
            iter, num_iters,
            args.num_jobs_initial, args.num_jobs_step, args.num_jobs_final)

        # Iterations below args.stage are skipped, but the processed-archive
        # counter at the bottom of the loop is still advanced for them.
        if args.stage <= iter:
            lrate = common_train_lib.get_learning_rate(iter, current_num_jobs,
                                                       num_iters,
                                                       num_archives_processed,
                                                       num_archives_to_process,
                                                       args.initial_effective_lrate,
                                                       args.final_effective_lrate)
            # Shrinkage is tied to the current learning rate; values of 0.5
            # or below are rejected.
            shrinkage_value = 1.0 - (args.proportional_shrink * lrate)
            if shrinkage_value <= 0.5:
                raise Exception("proportional-shrink={0} is too large, it gives "
                                "shrink-value={1}".format(args.proportional_shrink,
                                                          shrinkage_value))

            # Progress figures used only for the log line below.
            percent = num_archives_processed * 100.0 / num_archives_to_process
            epoch = (num_archives_processed * args.num_epochs
                     / num_archives_to_process)
            shrink_info_str = ''
            if shrinkage_value != 1.0:
                shrink_info_str = 'shrink: {0:0.5f}'.format(shrinkage_value)
            logger.info("Iter: {0}/{1}   Jobs: {2}   "
                        "Epoch: {3:0.2f}/{4:0.1f} ({5:0.1f}% complete)   "
                        "lr: {6:0.6f}   {7}".format(iter, num_iters - 1,
                                                    current_num_jobs,
                                                    epoch, args.num_epochs,
                                                    percent,
                                                    lrate, shrink_info_str))

            train_lib.common.train_one_iteration(
                dir=args.dir,
                iter=iter,
                srand=args.srand,
                egs_dir=egs_dir,
                num_jobs=current_num_jobs,
                num_archives_processed=num_archives_processed,
                num_archives=num_archives,
                learning_rate=lrate,
                dropout_edit_string=common_train_lib.get_dropout_edit_string(
                    args.dropout_schedule,
                    float(num_archives_processed) / num_archives_to_process,
                    iter),
                train_opts=' '.join(args.train_opts),
                minibatch_size_str=args.minibatch_size,
                frames_per_eg=args.frames_per_eg,
                momentum=args.momentum,
                max_param_change=args.max_param_change,
                shrinkage_value=shrinkage_value,
                shuffle_buffer_size=args.shuffle_buffer_size,
                run_opts=run_opts)

            if args.cleanup:
                # clean up everything but the last 2 models, under certain
                # conditions
                common_train_lib.remove_model(
                    args.dir, iter-2, num_iters, models_to_combine,
                    args.preserve_model_interval)

            if args.email is not None:
                # NOTE(review): the modulo below raises ZeroDivisionError if
                # this product is 0 -- confirm the parser rules that out.
                reporting_iter_interval = num_iters * args.reporting_interval
                if iter % reporting_iter_interval == 0:
                    # lets do some reporting
                    [report, times, data] = (
                        nnet3_log_parse.generate_acc_logprob_report(args.dir))
                    message = report
                    subject = ("Update : Expt {dir} : "
                               "Iter {iter}".format(dir=args.dir, iter=iter))
                    common_lib.send_mail(message, subject, args.email)

        # Each iteration consumes one archive per parallel job.
        num_archives_processed = num_archives_processed + current_num_jobs

    if args.stage <= num_iters:
        if args.do_final_combination:
            logger.info("Doing final combination to produce final.mdl")
            train_lib.common.combine_models(
                dir=args.dir, num_iters=num_iters,
                models_to_combine=models_to_combine,
                egs_dir=egs_dir,
                minibatch_size_str=args.minibatch_size, run_opts=run_opts,
                max_objective_evaluations=args.max_objective_evaluations)

    if args.stage <= num_iters + 1:
        logger.info("Getting average posterior for purposes of "
                    "adjusting the priors.")

        # If args.do_final_combination is true, we will use the combined model.
        # Otherwise, we will use the last_numbered model.
        real_iter = 'combined' if args.do_final_combination else num_iters
        avg_post_vec_file = train_lib.common.compute_average_posterior(
            dir=args.dir, iter=real_iter,
            egs_dir=egs_dir, num_archives=num_archives,
            prior_subset_size=args.prior_subset_size, run_opts=run_opts)

        logger.info("Re-adjusting priors based on computed posteriors")
        combined_or_last_numbered_model = "{dir}/{iter}.mdl".format(dir=args.dir,
                iter=real_iter)
        final_model = "{dir}/final.mdl".format(dir=args.dir)
        train_lib.common.adjust_am_priors(args.dir, combined_or_last_numbered_model,
                avg_post_vec_file, final_model, run_opts)


    if args.cleanup:
        logger.info("Cleaning up the experiment directory "
                    "{0}".format(args.dir))
        remove_egs = args.remove_egs
        if args.egs_dir is not None:
            # this egs_dir was not created by this experiment so we will not
            # delete it
            remove_egs = False

        common_train_lib.clean_nnet_dir(
            nnet_dir=args.dir, num_iters=num_iters, egs_dir=egs_dir,
            preserve_model_interval=args.preserve_model_interval,
            remove_egs=remove_egs)

    # do some reporting
    [report, times, data] = nnet3_log_parse.generate_acc_logprob_report(args.dir)
    if args.email is not None:
        common_lib.send_mail(report, "Update : Expt {0} : "
                                     "complete".format(args.dir), args.email)

    # The report is always written to disk, whether or not it was mailed.
    with open("{dir}/accuracy.report".format(dir=args.dir), "w") as f:
        f.write(report)

    common_lib.execute_command("steps/info/nnet3_dir_info.pl "
                               "{0}".format(args.dir))


def main():
    """ Script entry point.

    Parses the options, runs train() and then waits for the background
    commands.  On any failure an optional notification mail is sent, the
    traceback is printed (except for KeyboardInterrupt) and the process
    exits with status 1.
    """
    args, run_opts = get_args()
    try:
        train(args, run_opts)
        common_lib.wait_for_background_commands()
    except BaseException as err:
        # BaseException (not Exception) so that KeyboardInterrupt, which is
        # what we get when a background thread dies, is handled here too.
        if args.email is not None:
            failure_note = ("Training session for experiment {dir} "
                            "died due to an error.".format(dir=args.dir))
            # The same text serves as both the mail body and the subject.
            common_lib.send_mail(failure_note, failure_note, args.email)

        interrupted = isinstance(err, KeyboardInterrupt)
        if not interrupted:
            traceback.print_exc()
        sys.exit(1)


# Run the trainer only when this file is executed as a script.
if __name__ == "__main__":
    main()


================================================
FILE: egs/steps/nnet3/train_raw_dnn.py
================================================
#!/usr/bin/env python

# Copyright 2016    Vijayaditya Peddinti.
#           2016    Vimal Manohar
# Apache 2.0.

""" This script is similar to steps/nnet3/train_dnn.py but trains a
raw neural network instead of an acoustic model.
"""

from __future__ import print_function
from __future__ import division
import argparse
import logging
import pprint
import os
import sys
import traceback

sys.path.insert(0, 'steps')
import libs.nnet3.train.common as common_train_lib
import libs.common as common_lib
import libs.nnet3.train.frame_level_objf as train_lib
import libs.nnet3.report.log_parse as nnet3_log_parse


# Configure the 'libs' logger at import time: INFO level, one stream handler
# (logging.StreamHandler with no argument writes to sys.stderr).
# Presumably the name 'libs' is chosen so that loggers in the imported
# libs.* modules share this handler -- confirm against those modules.
logger = logging.getLogger('libs')
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setLevel(logging.INFO)
# Each record shows time, source path and line, function name and level.
formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - "
                              "%(funcName)s - %(levelname)s ] %(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.info('Starting raw DNN trainer (train_raw_dnn.py)')


def get_args():
    """ Parse the command-line arguments and post-process them.

    The common options are defined in the object
    libs.nnet3.train.common.CommonParser.parser.
    See steps/libs/nnet3/train/common.py

    Returns:
        A two-element list [args, run_opts], where args is the parsed
        Namespace and run_opts is the object built by process_args().
    """

    # The common options come from the parent parser; with
    # conflict_handler='resolve' an option re-declared below replaces the
    # parent's definition instead of raising an error.
    parser = argparse.ArgumentParser(
        description="""Trains a feed forward raw DNN (without transition model)
        using frame-level objectives like cross-entropy and mean-squared-error.
        DNNs include simple DNNs, TDNNs and CNNs.""",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        conflict_handler='resolve',
        parents=[common_train_lib.CommonParser(include_chunk_context=False).parser])

    # egs extraction options
    parser.add_argument("--egs.frames-per-eg", type=int, dest='frames_per_eg',
                        default=8,
                        help="Number of output labels per example")
    parser.add_argument("--image.augmentation-opts", type=str,
                        dest='image_augmentation_opts',
                        default=None,
                        help="Image augmentation options")

    # trainer options
    parser.add_argument("--trainer.input-model", type=str,
                        dest='input_model', default=None,
                        action=common_lib.NullstrToNoneAction,
                        help="""If specified, this model is used as initial
                        raw model (0.raw in the script) instead of initializing
                        the model from xconfig. Configs dir is not expected to
                        exist and left/right context is computed from this
                        model.""")
    parser.add_argument("--trainer.prior-subset-size", type=int,
                        dest='prior_subset_size', default=20000,
                        help="Number of samples for computing priors")
    parser.add_argument("--trainer.num-jobs-compute-prior", type=int,
                        dest='num_jobs_compute_prior', default=10,
                        help="The prior computation jobs are single "
                        "threaded and run on the CPU")

    # Parameters for the optimization
    # Kept as a string because the value may be a general rule rather than
    # a single integer (see the help text).
    parser.add_argument("--trainer.optimization.minibatch-size",
                        type=str, dest='minibatch_size', default='512',
                        help="""Size of the minibatch used in SGD training
                        (argument to nnet3-merge-egs); may be a more general
                        rule as accepted by the --minibatch-size option of
                        nnet3-merge-egs; run that program without args to see
                        the format.""")
    parser.add_argument("--compute-average-posteriors",
                        type=str, action=common_lib.StrToBoolAction,
                        choices=["true", "false"], default=False,
                        help="""If true, then the average output of the
                        network is computed and dumped as post.final.vec""")

    # General options
    parser.add_argument("--nj", type=int, default=4,
                        help="Number of parallel jobs")
    parser.add_argument("--use-dense-targets", type=str,
                        action=common_lib.StrToBoolAction,
                        default=True, choices=["true", "false"],
                        help="Train neural network using dense targets")
    parser.add_argument("--feat-dir", type=str, required=False,
                        help="Directory with features used for training "
                        "the neural network.")
    parser.add_argument("--targets-scp", type=str, required=False,
                        help="""Targets for training neural network.
                        This is a kaldi-format SCP file of target matrices.
                        <utterance-id> <extended-filename-of-target-matrix>.
                        The target matrix's column dim must match 
                        the neural network output dim, and the
                        row dim must match the number of output frames 
                        i.e. after subsampling if "--frame-subsampling-factor" 
                        option is passed to --egs.opts.""")
    parser.add_argument("--dir", type=str, required=True,
                        help="Directory to store the models and "
                        "all other files.")

    # Echo the invocation, once as a joined string and once as the raw list.
    print(' '.join(sys.argv))
    print(sys.argv)

    args = parser.parse_args()

    # Validate the options and derive the run options from them.
    [args, run_opts] = process_args(args)

    return [args, run_opts]


def process_args(args):
    """ Check the parsed options and derive the run options from them.

    Args:
        args: Namespace produced by the parser in get_args().  Its use_gpu
            field is rewritten in place from "true"/"false" to "yes"/"no".

    Returns:
        The list [args, run_opts], run_opts being a freshly built RunOpts.

    Raises:
        Exception: when an option value or the experiment directory layout
            is invalid.
    """

    if args.frames_per_eg < 1:
        raise Exception("--egs.frames-per-eg should have a minimum value of 1")

    minibatch_ok = common_train_lib.validate_minibatch_size_str(
        args.minibatch_size)
    if not minibatch_ok:
        raise Exception("--trainer.optimization.minibatch-size has an invalid value")

    if not os.path.exists(args.dir):
        raise Exception("Directory specified with --dir={0} "
                        "does not exist.".format(args.dir))

    # Either the configs directory or an existing input model must be there.
    if not (os.path.exists(args.dir + "/configs")
            or (args.input_model is not None
                and os.path.exists(args.input_model))):
        raise Exception("Either --trainer.input-model option should be supplied, "
                        "and exist; or the {0}/configs directory should exist."
                        "{0}/configs is the output of make_configs.py"
                        "".format(args.dir))

    # Translate args.use_gpu into queue and binary options.
    run_opts = common_train_lib.RunOpts()
    if args.use_gpu in ("true", "false"):
        if args.use_gpu == "true":
            args.use_gpu = "yes"
        else:
            args.use_gpu = "no"

    if args.use_gpu in ("yes", "wait"):
        cuda_available = common_lib.check_if_cuda_compiled()
        if not cuda_available:
            logger.warning(
                """You are running with one thread but you have not compiled
                   for CUDA.  You may be running a setup optimized for GPUs.
                   If you have GPUs and have nvcc installed, go to src/ and do
                   ./configure; make""")

        gpu_flag = "--use-gpu={}".format(args.use_gpu)
        queue_flag = "--gpu 1"
    else:
        logger.warning("Without using a GPU this will be very slow. "
                       "nnet3 does not yet support multiple threads.")

        gpu_flag = "--use-gpu=no"
        queue_flag = ""

    # Training, combination and prior computation all share the same pair
    # of settings.
    run_opts.train_queue_opt = queue_flag
    run_opts.parallel_train_opts = gpu_flag
    run_opts.combine_gpu_opt = gpu_flag
    run_opts.combine_queue_opt = queue_flag
    run_opts.prior_gpu_opt = gpu_flag
    run_opts.prior_queue_opt = queue_flag

    run_opts.command = args.command
    # Fall back to the general command when no egs command was given.
    if args.egs_command is not None:
        run_opts.egs_command = args.egs_command
    else:
        run_opts.egs_command = args.command
    run_opts.num_jobs_compute_prior = args.num_jobs_compute_prior

    return [args, run_opts]


def train(args, run_opts):
    """ The main function for training.

    Args:
        args: a Namespace object with the required parameters
            obtained from the function process_args()
        run_opts: RunOpts object obtained from the process_args()
    """

    arg_string = pprint.pformat(vars(args))
    logger.info("Arguments for the experiment\n{0}".format(arg_string))

    # Set some variables.

    # note, feat_dim gets set to 0 if args.feat_dir is unset (None).
    feat_dim = common_lib.get_feat_dim(args.feat_dir)
    ivector_dim = common_lib.get_ivector_dim(args.online_ivector_dir)
    ivector_id = common_lib.get_ivector_extractor_id(args.online_ivector_dir)

    config_dir = '{0}/configs'.format(args.dir)
    var_file = '{0}/vars'.format(config_dir)

    if args.input_model is None:
        config_dir = '{0}/configs'.format(args.dir)
        var_file = '{0}/vars'.format(config_dir)

        variables = common_train_lib.parse_generic_config_vars_file(var_file)
    else:
        # If args.input_model is specified, the model left and right contexts
        # are computed using input_model.
        variables = common_train_lib.get_input_model_info(args.input_model)

    # Set some variables.
    try:
        model_left_context = variables['model_left_context']
        model_right_context = variables['model_right_context']

    except KeyError as e:
        raise Exception("KeyError {0}: Variables need to be defined in "
                        "{1}".format(str(e), '{0}/configs'.format(args.dir)))

    left_context = model_left_context
    right_context = model_right_context


    # Initialize as "raw" nnet, prior to training the LDA-like preconditioning
    # matrix.  This first config just does any initial splicing that we do;
    # we do this as it's a convenient way to get the stats for the 'lda-like'
    # transform.
    if (args.stage <= -4) and os.path.exists(args.dir+"/configs/init.config") and \
       (args.input_model is None):
        logger.info("Initializing the network for computing the LDA stats")
        common_lib.execute_command(
            """{command} {dir}/log/nnet_init.log \
                    nnet3-init --srand=-2 {dir}/configs/init.config \
                    {dir}/init.raw""".format(command=run_opts.command,
                                             dir=args.dir))

    default_egs_dir = '{0}/egs'.format(args.dir)
    if (args.stage <= -3) and args.egs_dir is None:
        if args.targets_scp is None or args.feat_dir is None:
            raise Exception("If you don't supply the --egs-dir option, the "
                            "--targets-scp and --feat-dir options are required.")

        logger.info("Generating egs")

        if args.use_dense_targets:
            target_type = "dense"
            try:
                num_targets = int(variables['num_targets'])
                if (common_lib.get_feat_dim_from_scp(args.targets_scp)
                        != num_targets):
                    raise Exception("Mismatch between num-targets provided to "
                                    "script vs configs")
            except KeyError as e:
                num_targets = -1
        else:
            target_type = "sparse"
            try:
                num_targets = int(variables['num_targets'])
            except KeyError as e:
                raise Exception("KeyError {0}: Variables need to be defined "
                                "in {1}".format(
                                    str(e), '{0}/configs'.format(args.dir)))

        train_lib.raw_model.generate_egs_using_targets(
            data=args.feat_dir, targets_scp=args.targets_scp,
            egs_dir=default_egs_dir,
            left_context=left_context, right_context=right_context,
            run_opts=run_opts,
            frames_per_eg_str=str(args.frames_per_eg),
            srand=args.srand,
            egs_opts=args.egs_opts,
            cmvn_opts=args.cmvn_opts,
            online_ivector_dir=args.online_ivector_dir,
            samples_per_iter=args.samples_per_iter,
            stage=args.egs_stage,
            target_type=target_type,
            num_targets=num_targets)

    if args.egs_dir is None:
        egs_dir = default_egs_dir
    else:
        egs_dir = args.egs_dir

    [egs_left_context, egs_right_context,
     frames_per_eg_str, num_archives] = (
         common_train_lib.verify_egs_dir(egs_dir, feat_dim,
                                         ivector_dim, ivector_id,
                                         left_context, right_context))
    assert str(args.frames_per_eg) == frames_per_eg_str

    if args.num_jobs_final > num_archives:
        raise Exception('num_jobs_final cannot exceed the number of archives '
                        'in the egs directory')

    # copy the properties of the egs to dir for
    # use during decoding
    common_train_lib.copy_egs_properties_to_exp_dir(egs_dir, args.dir)

    if args.stage <= -2 and os.path.exists(args.dir+"/configs/init.config") and \
       (args.input_model is None):
        logger.info('Computing the preconditioning matrix for input features')

        train_lib.common.compute_preconditioning_matrix(
            args.dir, egs_dir, num_archives, run_opts,
            max_lda_jobs=args.max_lda_jobs,
            rand_prune=args.rand_prune)

    if args.stage <= -1:
        logger.info("Preparing the initial network.")
        common_train_lib.prepare_initial_network(args.dir, run_opts, args.srand, args.input_model)

    # set num_iters so that as close as possible, we process the data
    # $num_epochs times, i.e. $num_iters*$avg_num_jobs) ==
    # $num_epochs*$num_archives, where
    # avg_num_jobs=(num_jobs_initial+num_jobs_final)/2.
    num_archives_expanded = num_archives * args.frames_per_eg
    num_archives_to_process = int(args.num_epochs * num_archives_expanded)
    num_archives_processed = 0
    num_iters = int((num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final))

    # If do_final_combination is True, compute the set of models_to_combine.
    # Otherwise, models_to_combine will be none.
    if args.do_final_combination:
        models_to_combine = common_train_lib.get_model_combine_iters(
            num_iters, args.num_epochs,
            num_archives_expanded, args.max_models_combine,
            args.num_jobs_final)
    else:
        models_to_combine = None

    if os.path.exists('{0}/valid_diagnostic.scp'.format(egs_dir)):
        if os.path.exists('{0}/valid_diagnostic.egs'.format(egs_dir)):
            raise Exception('both {0}/valid_diagnostic.egs and '
                            '{0}/valid_diagnostic.scp exist.'
                            'This script expects only one of them to exist.'
                            ''.format(egs_dir))
        use_multitask_egs = True
    else:
        if not os.path.exists('{0}/valid_diagnostic.egs'.format(egs_dir)):
            raise Exception('neither {0}/valid_diagnostic.egs nor '
                            '{0}/valid_diagnostic.scp exist.'
                            'This script expects one of them.'
                            ''.format(egs_dir))
        use_multitask_egs = False

    logger.info("Training will run for {0} epochs = "
                "{1} iterations".format(args.num_epochs, num_iters))

    for iter in range(num_iters):
        if (args.exit_stage is not None) and (iter == args.exit_stage):
            logger.info("Exiting early due to --exit-stage {0}".format(iter))
            return

        current_num_jobs = common_train_lib.get_current_num_jobs(
            iter, num_iters,
            args.num_jobs_initial, args.num_jobs_step, args.num_jobs_final)

        if args.stage <= iter:
            lrate = common_train_lib.get_learning_rate(iter, current_num_jobs,
                                                       num_iters,
                                                       num_archives_processed,
                                                       num_archives_to_process,
                                                       args.initial_effective_lrate,
                                                       args.final_effective_lrate)

            shrinkage_value = 1.0 - (args.proportional_shrink * lrate)
            if shrinkage_value <= 0.5:
                raise Exception("proportional-shrink={0} is too large, it gives "
                                "shrink-value={1}".format(args.proportional_shrink,
                                                          shrinkage_value))

            percent = num_archives_processed * 100.0 / num_archives_to_process
            epoch = (num_archives_processed * args.num_epochs
                     / num_archives_to_process)
            shrink_info_str = ''
            if shrinkage_value != 1.0:
                shrink_info_str = 'shrink: {0:0.5f}'.format(shrinkage_value)
            logger.info("Iter: {0}/{1}   Jobs: {2}   "
                        "Epoch: {3:0.2f}/{4:0.1f} ({5:0.1f}% complete)   "
                        "lr: {6:0.6f}   {7}".format(iter, num_iters - 1,
                                                    current_num_jobs,
                                                    epoch, args.num_epochs,
                                                    percent,
                                                    lrate, shrink_info_str))

            train_lib.common.train_one_iteration(
                dir=args.dir,
                iter=iter,
                srand=args.srand,
                egs_dir=egs_dir,
                num_jobs=current_num_jobs,
                num_archives_processed=num_archives_processed,
                num_archives=num_archives,
                learning_rate=lrate,
                dropout_edit_string=common_train_lib.get_dropout_edit_string(
                    args.dropout_schedule,
                    float(num_archives_processed) / num_archives_to_process,
                    iter),
                train_opts=' '.join(args.train_opts),
                minibatch_size_str=args.minibatch_size,
                frames_per_eg=args.frames_per_eg,
                momentum=args.momentum,
                max_param_change=args.max_param_change,
                shrinkage_value=shrinkage_value,
                shuffle_buffer_size=args.shuffle_buffer_size,
                run_opts=run_opts,
                get_raw_nnet_from_am=False,
                image_augmentation_opts=args.image_augmentation_opts,
                use_multitask_egs=use_multitask_egs,
                backstitch_training_scale=args.backstitch_training_scale,
                backstitch_training_interval=args.backstitch_training_interval)

            if args.cleanup:
                # do a clean up everything but the last 2 models, under certain
                # conditions
                common_train_lib.remove_model(
                    args.dir, iter-2, num_iters, models_to_combine,
                    args.preserve_model_interval,
                    get_raw_nnet_from_am=False)

            if args.email is not None:
                reporting_iter_interval = num_iters * args.reporting_interval
                if iter % reporting_iter_interval == 0:
                    # lets do some reporting
                    [report, times, data] = (
                        nnet3_log_parse.generate_acc_logprob_report(args.dir))
                    message = report
                    subject = ("Update : Expt {dir} : "
                               "Iter {iter}".format(dir=args.dir, iter=iter))
                    common_lib.send_mail(message, subject, args.email)

        num_archives_processed = num_archives_processed + current_num_jobs

    if args.stage <= num_iters:
        if args.do_final_combination:
            logger.info("Doing final combination to produce final.raw")
            train_lib.common.combine_models(
                dir=args.dir, num_iters=num_iters,
                models_to_combine=models_to_combine, egs_dir=egs_dir,
                minibatch_size_str=args.minibatch_size, run_opts=run_opts,
                get_raw_nnet_from_am=False,
                max_objective_evaluations=args.max_objective_evaluations,
                use_multitask_egs=use_multitask_egs)
        else:
            common_lib.force_symlink("{0}.raw".format(num_iters),
                                     "{0}/final.raw".format(args.dir))

    if args.compute_average_posteriors and args.stage <= num_iters + 1:
        logger.info("Getting average posterior for output-node 'output'.")
        train_lib.common.compute_average_posterior(
            dir=args.dir, iter='final', egs_dir=egs_dir,
            num_archives=num_archives,
            prior_subset_size=args.prior_subset_size, run_opts=run_opts,
            get_raw_nnet_from_am=False)

    if args.cleanup:
        logger.info("Cleaning up the experiment directory "
                    "{0}".format(args.dir))
        remove_egs = args.remove_egs
        if args.egs_dir is not None:
            # this egs_dir was not created by this experiment so we will not
            # delete it
            remove_egs = False

        common_train_lib.clean_nnet_dir(
            nnet_dir=args.dir, num_iters=num_iters, egs_dir=egs_dir,
            preserve_model_interval=args.preserve_model_interval,
            remove_egs=remove_egs,
            get_raw_nnet_from_am=False)

    # do some reporting
    outputs_list = common_train_lib.get_outputs_list("{0}/final.raw".format(
        args.dir), get_raw_nnet_from_am=False)
    if 'output' in outputs_list:
        [report, times, data] = nnet3_log_parse.generate_acc_logprob_report(
            args.dir)
        if args.email is not None:
            common_lib.send_mail(report, "Update : Expt {0} : "
                                         "complete".format(args.dir),
                                 args.email)

        with open("{dir}/accuracy.{output_name}.report".format(dir=args.dir,
                                                               output_name="output"),
                  "w") as f:
            f.write(report)

    common_lib.execute_command("steps/info/nnet3_dir_info.pl "
                               "{0}".format(args.dir))


def main():
    """ Script entry point: parse the options, train, and report failures.

    Options are parsed outside the try block, so option errors are not
    treated as training failures.  Anything raised by train() or while
    waiting for the background commands leads to an optional e-mail
    notification (when args.email is set) and an exit status of 1.
    """
    args, run_opts = get_args()
    try:
        train(args, run_opts)
        common_lib.wait_for_background_commands()
    except BaseException as err:
        # BaseException rather than Exception: a dying background thread
        # shows up here as a KeyboardInterrupt, which Exception would miss.
        notify_address = args.email
        if notify_address is not None:
            template = ("Training session for experiment {dir} "
                        "died due to an error.")
            failure_note = template.format(dir=args.dir)
            # The same text is used as both the body and the subject.
            common_lib.send_mail(failure_note, failure_note, notify_address)
        if isinstance(err, KeyboardInterrupt):
            # No traceback for the interrupt itself; just fail.
            sys.exit(1)
        traceback.print_exc()
        sys.exit(1)


# Run the trainer only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()


================================================
FILE: egs/steps/nnet3/train_raw_rnn.py
================================================
#!/usr/bin/env python


# Copyright 2016 Vijayaditya Peddinti.
#           2016 Vimal Manohar
#           2017 Johns Hopkins University (author: Daniel Povey)
# Apache 2.0.

""" This script is similar to steps/nnet3/train_rnn.py but trains a
raw neural network instead of an acoustic model.
"""
from __future__ import print_function
from __future__ import division
import argparse
import logging
import pprint
import os
import sys
import traceback

sys.path.insert(0, 'steps')
import libs.nnet3.train.common as common_train_lib
import libs.common as common_lib
import libs.nnet3.train.frame_level_objf as train_lib
import libs.nnet3.report.log_parse as nnet3_log_parse

# Console logging for the 'libs' logger at INFO level.  A bare
# StreamHandler() writes to stderr.  NOTE(review): presumably the libs.*
# modules imported above log through child loggers of 'libs' -- confirm.
formatter = logging.Formatter(
    "%(asctime)s [%(pathname)s:%(lineno)s - "
    "%(funcName)s - %(levelname)s ] %(message)s")

handler = logging.StreamHandler()
handler.setFormatter(formatter)
handler.setLevel(logging.INFO)

logger = logging.getLogger('libs')
logger.addHandler(handler)
logger.setLevel(logging.INFO)

# Emitted once, at import time, through the handler configured above.
logger.info('Starting RNN trainer (train_raw_rnn.py)')


def get_args():
    """ Parse the command-line options of this script.

    The common options are defined in the object
    libs.nnet3.train.common.CommonParser.parser.
    See steps/libs/nnet3/train/common.py

    The parser is created with conflict_handler='resolve', so an option
    declared here under the same name as one inherited from CommonParser
    replaces the inherited definition (several help strings below note which
    defaults are overridden this way).

    The raw command line is echoed to stdout before parsing.

    Returns:
        [args, run_opts]: the parsed Namespace and the RunOpts object, both
        as returned by process_args().
    """

    parser = argparse.ArgumentParser(
        description="""Trains a raw RNN (without transition model) using
        frame-level objectives like cross-entropy and mean-squared-error.
        RNNs include LSTMs, BLSTMs and GRUs.
        RNN acoustic model training differs from feed-forward DNN training in
        the following ways
            1. RNN acoustic models train on output chunks rather than
               individual outputs
            2. The training includes additional stage of shrinkage, where the
               parameters of the model are scaled when the derivative averages
               at the non-linearities are below a threshold.
            3. RNNs can also be trained with state preservation training""",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        conflict_handler='resolve',
        parents=[common_train_lib.CommonParser(default_chunk_left_context=40).parser])

    # egs extraction options
    # NOTE: chunk_width is kept as a string because it may be a
    # comma-separated list; it is validated in process_args().
    parser.add_argument("--egs.chunk-width", type=str, dest='chunk_width',
                        default="20",
                        help="""Number of frames per chunk in the examples
                        used to train the RNN.   Caution: if you double this you
                        should halve --trainer.samples-per-iter.  May be
                        a comma-separated list of alternatives: first width
                        is the 'principal' chunk-width, used preferentially""")

    # trainer options
    parser.add_argument("--trainer.input-model", type=str,
                        dest='input_model', default=None,
                        action=common_lib.NullstrToNoneAction,
                        help="""If specified, this model is used as initial
                        raw model (0.raw in the script) instead of initializing
                        the model from xconfig. Configs dir is not expected to
                        exist and left/right context is computed from this
                        model.""")
    parser.add_argument("--trainer.samples-per-iter", type=int,
                        dest='samples_per_iter', default=20000,
                        help="""This is really the number of egs in each
                        archive.  Each eg has 'chunk_width' frames in it--
                        for chunk_width=20, this value (20k) is equivalent
                        to the 400k number that we use as a default in
                        regular DNN training.
                        Overrides the default value in CommonParser.""")
    parser.add_argument("--trainer.prior-subset-size", type=int,
                        dest='prior_subset_size', default=20000,
                        help="Number of samples for computing priors")
    parser.add_argument("--trainer.num-jobs-compute-prior", type=int,
                        dest='num_jobs_compute_prior', default=10,
                        help="The prior computation jobs are single "
                        "threaded and run on the CPU")

    # Parameters for the optimization
    parser.add_argument("--trainer.optimization.momentum", type=float,
                        dest='momentum', default=0.5,
                        help="""Momentum used in update computation.
                        Note: we implemented it in such a way that
                        it doesn't increase the effective learning rate.
                        Overrides the default value in CommonParser""")
    parser.add_argument("--trainer.optimization.shrink-value", type=float,
                        dest='shrink_value', default=0.99,
                        help="""Scaling factor used for scaling the parameter
                        matrices when the derivative averages are below the
                        shrink-threshold at the non-linearities.  E.g. 0.99.
                        Only applicable when the neural net contains sigmoid or
                        tanh units.""")
    parser.add_argument("--trainer.optimization.shrink-saturation-threshold",
                        type=float,
                        dest='shrink_saturation_threshold', default=0.40,
                        help="""Threshold that controls when we apply the
                        'shrinkage' (i.e. scaling by shrink-value).  If the
                        saturation of the sigmoid and tanh nonlinearities in
                        the neural net (as measured by
                        steps/nnet3/get_saturation.pl) exceeds this threshold
                        we scale the parameter matrices with the
                        shrink-value.""")
    # RNN specific trainer options
    parser.add_argument("--trainer.rnn.num-chunk-per-minibatch", type=str,
                        dest='num_chunk_per_minibatch', default='100',
                        help="""Number of sequences to be processed in
                        parallel every minibatch.  May be a more general
                        rule as accepted by the --minibatch-size option of
                        nnet3-merge-egs; run that program without args to see
                        the format.""")
    parser.add_argument("--trainer.deriv-truncate-margin", type=int,
                        dest='deriv_truncate_margin', default=8,
                        help="""Margin (in input frames) around the 'required'
                        part of each chunk that the derivatives are
                        backpropagated to. E.g., 8 is a reasonable setting.
                        Note: the 'required' part of the chunk is defined by
                        the model's {left,right}-context.""")
    parser.add_argument("--compute-average-posteriors",
                        type=str, action=common_lib.StrToBoolAction,
                        choices=["true", "false"], default=False,
                        help="""If true, then the average output of the
                        network is computed and dumped as post.final.vec""")

    # General options
    parser.add_argument("--nj", type=int, default=4,
                        help="Number of parallel jobs")
    parser.add_argument("--use-dense-targets", type=str,
                        action=common_lib.StrToBoolAction,
                        default=True, choices=["true", "false"],
                        help="Train neural network using dense targets")
    parser.add_argument("--feat-dir", type=str, required=True,
                        help="Directory with features used for training "
                        "the neural network.")
    parser.add_argument("--targets-scp", type=str, required=True,
                        help="Target for training neural network.")
    parser.add_argument("--dir", type=str, required=True,
                        help="Directory to store the models and "
                        "all other files.")

    # Echo the invocation twice: once as a single space-joined line and once
    # as the raw argv list (which keeps the argument boundaries visible).
    print(' '.join(sys.argv))
    print(sys.argv)

    args = parser.parse_args()

    # Validate the options and derive the RunOpts object from them.
    [args, run_opts] = process_args(args)

    return [args, run_opts]


def process_args(args):
    """ Validate the options from get_args() and derive the run options.

    Args:
        args: the Namespace produced by the argument parser.  args.use_gpu
            is normalized in place ("true" -> "yes", "false" -> "no").

    Returns:
        [args, run_opts]: the (updated) Namespace and a RunOpts object
        carrying the queue/GPU options and the commands to run jobs with.

    Raises:
        Exception: if the chunk width or minibatch-size string is invalid,
            a chunk context is negative, args.dir does not exist, or
            neither {dir}/configs nor a usable --trainer.input-model exists.
    """

    chunk_width_ok = common_train_lib.validate_chunk_width(args.chunk_width)
    if not chunk_width_ok:
        raise Exception("--egs.chunk-width has an invalid value")

    minibatch_ok = common_train_lib.validate_minibatch_size_str(
        args.num_chunk_per_minibatch)
    if not minibatch_ok:
        raise Exception("--trainer.rnn.num-chunk-per-minibatch has an invalid value")

    if args.chunk_left_context < 0:
        raise Exception("--egs.chunk-left-context should be non-negative")

    if args.chunk_right_context < 0:
        raise Exception("--egs.chunk-right-context should be non-negative")

    if not os.path.exists(args.dir):
        raise Exception("Directory specified with --dir={0} "
                        "does not exist.".format(args.dir))

    # The network comes either from {dir}/configs or from an existing input
    # model; at least one of the two has to be available.
    if not os.path.exists(args.dir + "/configs"):
        input_model_usable = (args.input_model is not None
                              and os.path.exists(args.input_model))
        if not input_model_usable:
            raise Exception("Either --trainer.input-model option should be supplied, "
                            "and exist; or the {0}/configs directory should exist."
                            "{0}/configs is the output of make_configs.py"
                            "".format(args.dir))

    # set the options corresponding to args.use_gpu
    run_opts = common_train_lib.RunOpts()

    # Accept the boolean-style spellings as aliases of yes/no.
    if args.use_gpu == "true":
        args.use_gpu = "yes"
    elif args.use_gpu == "false":
        args.use_gpu = "no"

    if args.use_gpu in ("yes", "wait"):
        if not common_lib.check_if_cuda_compiled():
            logger.warning(
                """You are running with one thread but you have not compiled
                   for CUDA.  You may be running a setup optimized for GPUs.
                   If you have GPUs and have nvcc installed, go to src/ and do
                   ./configure; make""")

        gpu_flag = "--use-gpu={}".format(args.use_gpu)
        queue_opt = "--gpu 1"
    else:
        logger.warning("Without using a GPU this will be very slow. "
                       "nnet3 does not yet support multiple threads.")

        gpu_flag = "--use-gpu=no"
        queue_opt = ""

    # Training, combination and prior computation all share the same
    # GPU flag and the same queue option.
    run_opts.train_queue_opt = queue_opt
    run_opts.parallel_train_opts = gpu_flag
    run_opts.combine_gpu_opt = gpu_flag
    run_opts.combine_queue_opt = queue_opt
    run_opts.prior_gpu_opt = gpu_flag
    run_opts.prior_queue_opt = queue_opt

    run_opts.command = args.command
    # egs jobs fall back to the general command when no dedicated one is set.
    if args.egs_command is None:
        run_opts.egs_command = args.command
    else:
        run_opts.egs_command = args.egs_command
    run_opts.num_jobs_compute_prior = args.num_jobs_compute_prior

    return [args, run_opts]


def train(args, run_opts):
    """ The main function for training.

    Drives the whole run, with each step gated by args.stage:
      * stage <= -4: initialize the network used for the LDA stats (only if
        {dir}/configs/init.config exists and no input model is given);
      * stage <= -3: generate the egs (only if args.egs_dir is None);
      * stage <= -2: compute the preconditioning matrix (same conditions as
        the -4 step);
      * stage <= -1: prepare the initial network;
      * stage <= iter: run training iteration `iter`;
      * stage <= num_iters: combine the models into final.raw, or symlink
        the last model to final.raw when final combination is disabled;
      * stage <= num_iters + 1: optionally compute the average posterior.
    Cleanup and reporting follow.

    Args:
        args: a Namespace object with the required parameters
            obtained from the function process_args()
        run_opts: RunOpts object obtained from the process_args()

    Returns:
        None.  Returns early, skipping combination, posterior computation,
        cleanup and reporting, when iteration args.exit_stage is reached.

    Raises:
        Exception: if the model context variables are missing; if
            'num_targets' is missing for sparse targets or disagrees with
            the dimension of args.targets_scp for dense targets; if
            --egs.chunk-width does not match the egs directory; if
            num_jobs_final exceeds the number of archives; if the
            valid_diagnostic egs are missing or present in both forms; or
            if proportional-shrink yields a shrink value <= 0.5.
    """

    arg_string = pprint.pformat(vars(args))
    logger.info("Arguments for the experiment\n{0}".format(arg_string))

    # Set some variables.
    # These three are only used to verify the egs directory further down.
    feat_dim = common_lib.get_feat_dim(args.feat_dir)
    ivector_dim = common_lib.get_ivector_dim(args.online_ivector_dir)
    ivector_id = common_lib.get_ivector_extractor_id(args.online_ivector_dir)

    if args.input_model is None:
        config_dir = '{0}/configs'.format(args.dir)
        var_file = '{0}/vars'.format(config_dir)

        variables = common_train_lib.parse_generic_config_vars_file(var_file)
    else:
        # If args.input_model is specified, the model left and right contexts
        # are computed using input_model.
        variables = common_train_lib.get_input_model_info(args.input_model)

    # Set some variables.
    try:
        model_left_context = variables['model_left_context']
        model_right_context = variables['model_right_context']
    except KeyError as e:
        raise Exception("KeyError {0}: Variables need to be defined in "
                        "{1}".format(str(e), '{0}/configs'.format(args.dir)))

    # Context requested for the egs: the extra chunk context from the
    # options plus the context of the model itself.
    left_context = args.chunk_left_context + model_left_context
    right_context = args.chunk_right_context + model_right_context
    # A negative initial/final chunk context option is passed on as -1
    # instead of being added to the model context.
    left_context_initial = (args.chunk_left_context_initial + model_left_context if
                            args.chunk_left_context_initial >= 0 else -1)
    right_context_final = (args.chunk_right_context_final + model_right_context if
                           args.chunk_right_context_final >= 0 else -1)

    # Initialize as "raw" nnet, prior to training the LDA-like preconditioning
    # matrix.  This first config just does any initial splicing that we do;
    # we do this as it's a convenient way to get the stats for the 'lda-like'
    # transform.

    if (args.stage <= -4) and os.path.exists(args.dir+"/configs/init.config") and \
       (args.input_model is None):
        logger.info("Initializing the network for computing the LDA stats")
        common_lib.execute_command(
            """{command} {dir}/log/nnet_init.log \
                    nnet3-init --srand=-2 {dir}/configs/init.config \
                    {dir}/init.raw""".format(command=run_opts.command,
                                             dir=args.dir))

    default_egs_dir = '{0}/egs'.format(args.dir)
    # egs are only generated when the user did not supply an egs directory.
    if (args.stage <= -3) and args.egs_dir is None:
        logger.info("Generating egs")

        if args.use_dense_targets:
            target_type = "dense"
            try:
                num_targets = int(variables['num_targets'])
                if (common_lib.get_feat_dim_from_scp(args.targets_scp)
                        != num_targets):
                    raise Exception("Mismatch between num-targets provided to "
                                    "script vs configs")
            except KeyError as e:
                # For dense targets a missing 'num_targets' is tolerated and
                # -1 is passed on to the egs generation instead.
                num_targets = -1
        else:
            target_type = "sparse"
            try:
                num_targets = int(variables['num_targets'])
            except KeyError as e:
                # For sparse targets 'num_targets' is mandatory.
                raise Exception("KeyError {0}: Variables need to be defined "
                                "in {1}".format(
                                    str(e), '{0}/configs'.format(args.dir)))

        train_lib.raw_model.generate_egs_using_targets(
            data=args.feat_dir, targets_scp=args.targets_scp,
            egs_dir=default_egs_dir,
            left_context=left_context,
            right_context=right_context,
            left_context_initial=left_context_initial,
            right_context_final=right_context_final,
            run_opts=run_opts,
            frames_per_eg_str=args.chunk_width,
            srand=args.srand,
            egs_opts=args.egs_opts,
            cmvn_opts=args.cmvn_opts,
            online_ivector_dir=args.online_ivector_dir,
            samples_per_iter=args.samples_per_iter,
            stage=args.egs_stage,
            target_type=target_type,
            num_targets=num_targets)

    if args.egs_dir is None:
        egs_dir = default_egs_dir
    else:
        egs_dir = args.egs_dir

    # NOTE: egs_left_context and egs_right_context are not used again in
    # this function; only frames_per_eg_str and num_archives are.
    [egs_left_context, egs_right_context,
     frames_per_eg_str, num_archives] = (
         common_train_lib.verify_egs_dir(egs_dir, feat_dim,
                                         ivector_dim, ivector_id,
                                         left_context, right_context,
                                         left_context_initial,
                                         right_context_final))
    # Both sides are strings, so the chunk-width specification has to match
    # the one recorded for the egs exactly.
    if args.chunk_width != frames_per_eg_str:
        raise Exception("mismatch between --egs.chunk-width and the frames_per_eg "
                        "in the egs dir {0} vs {1}".format(args.chunk_width,
                                                           frames_per_eg_str))

    if args.num_jobs_final > num_archives:
        raise Exception('num_jobs_final cannot exceed the number of archives '
                        'in the egs directory')

    # copy the properties of the egs to dir for
    # use during decoding
    common_train_lib.copy_egs_properties_to_exp_dir(egs_dir, args.dir)

    if args.stage <= -2 and os.path.exists(args.dir+"/configs/init.config") and \
       (args.input_model is None):
        logger.info('Computing the preconditioning matrix for input features')

        train_lib.common.compute_preconditioning_matrix(
            args.dir, egs_dir, num_archives, run_opts,
            max_lda_jobs=args.max_lda_jobs,
            rand_prune=args.rand_prune)

    if args.stage <= -1:
        logger.info("Preparing the initial network.")
        common_train_lib.prepare_initial_network(args.dir, run_opts, args.srand, args.input_model)

    # set num_iters so that as close as possible, we process the data
    # $num_epochs times, i.e. $num_iters*$avg_num_jobs) ==
    # $num_epochs*$num_archives, where
    # avg_num_jobs=(num_jobs_initial+num_jobs_final)/2.
    num_archives_to_process = int(args.num_epochs * num_archives)
    num_archives_processed = 0
    num_iters = int((num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final))

    # If do_final_combination is True, compute the set of models_to_combine.
    # Otherwise, models_to_combine will be none.
    if args.do_final_combination:
        models_to_combine = common_train_lib.get_model_combine_iters(
            num_iters, args.num_epochs,
            num_archives, args.max_models_combine,
            args.num_jobs_final)
    else:
        models_to_combine = None

    # Exactly one of valid_diagnostic.scp / valid_diagnostic.egs must exist
    # in the egs directory; the .scp form selects multitask egs.
    if (os.path.exists('{0}/valid_diagnostic.scp'.format(egs_dir))):
        if (os.path.exists('{0}/valid_diagnostic.egs'.format(egs_dir))):
            raise Exception('both {0}/valid_diagnostic.egs and '
                            '{0}/valid_diagnostic.scp exist.'
                            'This script expects only one of them to exist.'
                            ''.format(egs_dir))
        use_multitask_egs = True
    else:
        if (not os.path.exists('{0}/valid_diagnostic.egs'
                               ''.format(egs_dir))):
            raise Exception('neither {0}/valid_diagnostic.egs nor '
                            '{0}/valid_diagnostic.scp exist.'
                            'This script expects one of them.'
                            ''.format(egs_dir))
        use_multitask_egs = False

    # Derivative truncation limits: the margin from the options widened by
    # the model's own left/right context.  Both stay None when no margin is
    # configured.
    min_deriv_time = None
    max_deriv_time_relative = None
    if args.deriv_truncate_margin is not None:
        min_deriv_time = -args.deriv_truncate_margin - model_left_context
        max_deriv_time_relative = \
           args.deriv_truncate_margin + model_right_context

    logger.info("Training will run for {0} epochs = "
                "{1} iterations".format(args.num_epochs, num_iters))

    for iter in range(num_iters):
        # Early exit skips everything after the loop as well (combination,
        # posteriors, cleanup and reporting).
        if (args.exit_stage is not None) and (iter == args.exit_stage):
            logger.info("Exiting early due to --exit-stage {0}".format(iter))
            return

        current_num_jobs = common_train_lib.get_current_num_jobs(
            iter, num_iters,
            args.num_jobs_initial, args.num_jobs_step, args.num_jobs_final)

        # Iterations below args.stage are not re-run, but the bookkeeping at
        # the bottom of the loop still advances for them.
        if args.stage <= iter:
            model_file = "{dir}/{iter}.raw".format(dir=args.dir, iter=iter)

            lrate = common_train_lib.get_learning_rate(iter, current_num_jobs,
                                                       num_iters,
                                                       num_archives_processed,
                                                       num_archives_to_process,
                                                       args.initial_effective_lrate,
                                                       args.final_effective_lrate)

            # shrinkage_value is a scale on the parameters.
            shrinkage_value = 1.0 - (args.proportional_shrink * lrate)
            if shrinkage_value <= 0.5:
                raise Exception("proportional-shrink={0} is too large, it gives "
                                "shrink-value={1}".format(args.proportional_shrink,
                                                          shrinkage_value))
            # args.shrink_value replaces the proportional value only when it
            # is the smaller of the two and should_do_shrinkage() says the
            # current model needs shrinking.
            if args.shrink_value < shrinkage_value:
                shrinkage_value = (args.shrink_value
                                   if common_train_lib.should_do_shrinkage(
                                           iter, model_file,
                                           args.shrink_saturation_threshold,
                                           get_raw_nnet_from_am=False)
                                   else shrinkage_value)

            # Progress figures, used only for the log message below.
            percent = num_archives_processed * 100.0 / num_archives_to_process
            epoch = (num_archives_processed * args.num_epochs
                     / num_archives_to_process)
            shrink_info_str = ''
            if shrinkage_value != 1.0:
                shrink_info_str = 'shrink: {0:0.5f}'.format(shrinkage_value)
            logger.info("Iter: {0}/{1}   Jobs: {2}   "
                        "Epoch: {3:0.2f}/{4:0.1f} ({5:0.1f}% complete)   "
                        "lr: {6:0.6f}   {7}".format(iter, num_iters - 1,
                                                    current_num_jobs,
                                                    epoch, args.num_epochs,
                                                    percent,
                                                    lrate, shrink_info_str))

            train_lib.common.train_one_iteration(
                dir=args.dir,
                iter=iter,
                srand=args.srand,
                egs_dir=egs_dir,
                num_jobs=current_num_jobs,
                num_archives_processed=num_archives_processed,
                num_archives=num_archives,
                learning_rate=lrate,
                dropout_edit_string=common_train_lib.get_dropout_edit_string(
                    args.dropout_schedule,
                    float(num_archives_processed) / num_archives_to_process,
                    iter),
                train_opts=' '.join(args.train_opts),
                shrinkage_value=shrinkage_value,
                minibatch_size_str=args.num_chunk_per_minibatch,
                min_deriv_time=min_deriv_time,
                max_deriv_time_relative=max_deriv_time_relative,
                momentum=args.momentum,
                max_param_change=args.max_param_change,
                shuffle_buffer_size=args.shuffle_buffer_size,
                run_opts=run_opts,
                get_raw_nnet_from_am=False,
                use_multitask_egs=use_multitask_egs,
                compute_per_dim_accuracy=args.compute_per_dim_accuracy)

            if args.cleanup:
                # do a clean up everything but the last 2 models, under certain
                # conditions
                common_train_lib.remove_model(
                    args.dir, iter-2, num_iters, models_to_combine,
                    args.preserve_model_interval,
                    get_raw_nnet_from_am=False)

            if args.email is not None:
                # A progress mail is sent whenever iter is a multiple of
                # num_iters * args.reporting_interval.
                reporting_iter_interval = num_iters * args.reporting_interval
                if iter % reporting_iter_interval == 0:
                    # lets do some reporting
                    [report, times, data] = (
                        nnet3_log_parse.generate_acc_logprob_report(args.dir))
                    message = report
                    subject = ("Update : Expt {dir} : "
                               "Iter {iter}".format(dir=args.dir, iter=iter))
                    common_lib.send_mail(message, subject, args.email)

        # Updated for skipped iterations too, so that a run resumed with
        # args.stage > 0 sees the same archive counts as an uninterrupted one.
        num_archives_processed = num_archives_processed + current_num_jobs

    if args.stage <= num_iters:
        if args.do_final_combination:
            logger.info("Doing final combination to produce final.raw")
            train_lib.common.combine_models(
                dir=args.dir, num_iters=num_iters,
                models_to_combine=models_to_combine, egs_dir=egs_dir,
                minibatch_size_str=args.num_chunk_per_minibatch,
                run_opts=run_opts, chunk_width=args.chunk_width,
                get_raw_nnet_from_am=False,
                compute_per_dim_accuracy=args.compute_per_dim_accuracy,
                max_objective_evaluations=args.max_objective_evaluations,
                use_multitask_egs=use_multitask_egs)
        else:
            # Without combination, final.raw is a symlink to {num_iters}.raw.
            common_lib.force_symlink("{0}.raw".format(num_iters),
                                     "{0}/final.raw".format(args.dir))

    if args.compute_average_posteriors and args.stage <= num_iters + 1:
        logger.info("Getting average posterior for purposes of "
                    "adjusting the priors.")
        train_lib.common.compute_average_posterior(
            dir=args.dir, iter='final', egs_dir=egs_dir,
            num_archives=num_archives,
            prior_subset_size=args.prior_subset_size, run_opts=run_opts,
            get_raw_nnet_from_am=False)

    if args.cleanup:
        logger.info("Cleaning up the experiment directory "
                    "{0}".format(args.dir))
        remove_egs = args.remove_egs
        if args.egs_dir is not None:
            # this egs_dir was not created by this experiment so we will not
            # delete it
            remove_egs = False

        common_train_lib.clean_nnet_dir(
            nnet_dir=args.dir, num_iters=num_iters, egs_dir=egs_dir,
            preserve_model_interval=args.preserve_model_interval,
            remove_egs=remove_egs,
            get_raw_nnet_from_am=False)

    # do some reporting
    # The report is always written to {dir}/accuracy.report and additionally
    # mailed when an address was given.
    [report, times, data] = nnet3_log_parse.generate_acc_logprob_report(args.dir)
    if args.email is not None:
        common_lib.send_mail(report, "Update : Expt {0} : "
                                     "complete".format(args.dir), args.email)

    with open("{dir}/accuracy.report".format(dir=args.dir), "w") as f:
        f.write(report)

    common_lib.execute_command("steps/info/nnet3_dir_info.pl "
                               "{0}".format(args.dir))


def main():
    """Script entry point: parse the options, run training, report failures.

    If training (or waiting for the background commands) raises anything,
    a notification mail is sent when --email was given, the traceback is
    printed unless the failure is a KeyboardInterrupt, and the process
    exits with status 1.
    """
    args, run_opts = get_args()
    try:
        train(args, run_opts)
        common_lib.wait_for_background_commands()
    except BaseException as err:
        # BaseException (not just Exception) is caught on purpose so that
        # KeyboardInterrupt -- which is what we get when a background
        # thread dies -- is handled here as well.
        if args.email is not None:
            notice = ("Training session for experiment "
                      "{dir} died due to an error.").format(dir=args.dir)
            common_lib.send_mail(notice, notice, args.email)
        interrupted = isinstance(err, KeyboardInterrupt)
        if not interrupted:
            traceback.print_exc()
        sys.exit(1)

# Run the trainer only when this file is executed as a script (not when it
# is imported as a module).
if __name__ == "__main__":
    main()


================================================
FILE: egs/steps/nnet3/train_rnn.py
================================================
#!/usr/bin/env python

# Copyright 2016    Vijayaditya Peddinti.
#           2016    Vimal Manohar
# Apache 2.0.

""" This script is based on steps/nnet3/lstm/train.sh
"""

from __future__ import print_function
from __future__ import division
import argparse
import logging
import os
import pprint
import shutil
import sys
import traceback

sys.path.insert(0, 'steps')
import libs.nnet3.train.common as common_train_lib
import libs.common as common_lib
import libs.nnet3.train.frame_level_objf as train_lib
import libs.nnet3.report.log_parse as nnet3_log_parse


# Configure the 'libs' logger: records of level INFO and above are sent to a
# stream handler and prefixed with a timestamp and the source location.
formatter = logging.Formatter(
    "%(asctime)s [%(pathname)s:%(lineno)s - %(funcName)s - %(levelname)s ] "
    "%(message)s")

handler = logging.StreamHandler()
handler.setFormatter(formatter)
handler.setLevel(logging.INFO)

logger = logging.getLogger('libs')
logger.addHandler(handler)
logger.setLevel(logging.INFO)
logger.info('Starting RNN trainer (train_rnn.py)')


def get_args():
    """ Build the command-line parser of the RNN trainer and parse sys.argv.

    Compulsory inputs are exposed as named options for readability.

    The options common to the nnet3 trainers come from the parent parser
    libs.nnet3.train.common.CommonParser.parser (see
    steps/libs/nnet3/train/common.py).  The parser is created with
    conflict_handler='resolve', so an option that is defined again here
    replaces the inherited definition.

    Returns:
        [args, run_opts] as returned by process_args().
    """

    description = """Trains an RNN acoustic model using the cross-entropy
        objective.  RNNs include LSTMs, BLSTMs and GRUs.
        RNN acoustic model training differs from feed-forward DNN training in
        the following ways
            1. RNN acoustic models train on output chunks rather than
               individual outputs
            2. The training includes additional stage of shrinkage, where
               the parameters of the model are scaled when the derivative
               averages at the non-linearities are below a threshold.
            3. RNNs can also be trained with state preservation training"""
    common_parser = common_train_lib.CommonParser(
        default_chunk_left_context=40)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        conflict_handler='resolve',
        parents=[common_parser.parser])
    add_option = parser.add_argument

    # ---- egs extraction options ----
    add_option(
        "--egs.chunk-width", dest='chunk_width', type=str, default="20",
        help="""Number of frames per chunk in the examples
                        used to train the RNN.   Caution: if you double this you
                        should halve --trainer.samples-per-iter.  May be
                        a comma-separated list of alternatives: first width
                        is the 'principal' chunk-width, used preferentially""")
    add_option(
        "--trainer.input-model", dest='input_model', type=str, default=None,
        action=common_lib.NullstrToNoneAction,
        help="""If specified, this model is used as initial
                        raw model (0.raw in the script) instead of initializing
                        the model from xconfig. Configs dir is not expected to
                        exist and left/right context is computed from this
                        model.""")
    add_option(
        "--trainer.samples-per-iter", dest='samples_per_iter', type=int,
        default=20000,
        help="""This is really the number of egs in each
                        archive.  Each eg has 'chunk_width' frames in it--
                        for chunk_width=20, this value (20k) is equivalent
                        to the 400k number that we use as a default in
                        regular DNN training.
                        Overrides the default value in CommonParser.""")
    add_option(
        "--trainer.prior-subset-size", dest='prior_subset_size', type=int,
        default=20000,
        help="Number of samples for computing priors")
    add_option(
        "--trainer.num-jobs-compute-prior", dest='num_jobs_compute_prior',
        type=int, default=10,
        help="The prior computation jobs are single "
             "threaded and run on the CPU")

    # ---- parameters for the optimization ----
    add_option(
        "--trainer.optimization.momentum", dest='momentum', type=float,
        default=0.5,
        help="""Momentum used in update computation.
                        Note: we implemented it in such a way that
                        it doesn't increase the effective learning rate.
                        Overrides the default value in CommonParser""")
    add_option(
        "--trainer.optimization.shrink-value", dest='shrink_value',
        type=float, default=0.99,
        help="""Scaling factor used for scaling the parameter
                        matrices when the derivative averages are below the
                        shrink-threshold at the non-linearities.  E.g. 0.99.
                        Only applicable when the neural net contains sigmoid or
                        tanh units.""")
    add_option(
        "--trainer.optimization.shrink-saturation-threshold",
        dest='shrink_saturation_threshold', type=float, default=0.40,
        help="""Threshold that controls when we apply the
                        'shrinkage' (i.e. scaling by shrink-value).  If the
                        saturation of the sigmoid and tanh nonlinearities in
                        the neural net (as measured by
                        steps/nnet3/get_saturation.pl) exceeds this threshold
                        we scale the parameter matrices with the
                        shrink-value.""")

    # ---- RNN specific trainer options ----
    add_option(
        "--trainer.rnn.num-chunk-per-minibatch",
        dest='num_chunk_per_minibatch', type=str, default='100',
        help="""Number of sequences to be processed in
                        parallel every minibatch.  May be a more general
                        rule as accepted by the --minibatch-size option of
                        nnet3-merge-egs; run that program without args to see
                        the format.""")
    add_option(
        "--trainer.deriv-truncate-margin", dest='deriv_truncate_margin',
        type=int, default=8,
        help="""Margin (in input frames) around the 'required'
                        part of each chunk that the derivatives are
                        backpropagated to. E.g., 8 is a reasonable setting.
                        Note: the 'required' part of the chunk is defined by
                        the model's {left,right}-context.""")

    # ---- general options ----
    add_option(
        "--feat-dir", type=str, required=False,
        help="Directory with features used for training "
             "the neural network.")
    add_option(
        "--lang", type=str, required=False,
        help="Language directory")
    add_option(
        "--ali-dir", type=str, required=True,
        help="Directory with alignments used for training "
             "the neural network.")
    add_option(
        "--dir", type=str, required=True,
        help="Directory to store the models and "
             "all other files.")

    # Echo the invocation, once as a joined string and once as a list.
    print(' '.join(sys.argv))
    print(sys.argv)

    args, run_opts = process_args(parser.parse_args())
    return [args, run_opts]


def process_args(args):
    """ Validate the parsed options and derive the job-running options.

    Args:
        args: the Namespace produced by the parser built in get_args().
            args.use_gpu is rewritten in place: "true" becomes "yes" and
            "false" becomes "no".

    Returns:
        [args, run_opts], where run_opts is a common_train_lib.RunOpts
        object filled in with the queue / GPU options and the job commands.

    Raises:
        Exception: if the chunk width or the minibatch-size string is
            invalid, if a chunk context is negative, if --dir does not
            exist, or if neither {dir}/configs nor an existing
            --trainer.input-model is available.
    """

    chunk_width_ok = common_train_lib.validate_chunk_width(args.chunk_width)
    if not chunk_width_ok:
        raise Exception("--egs.chunk-width has an invalid value")

    minibatch_ok = common_train_lib.validate_minibatch_size_str(
        args.num_chunk_per_minibatch)
    if not minibatch_ok:
        raise Exception(
            "--trainer.rnn.num-chunk-per-minibatch has an invalid value")

    if args.chunk_left_context < 0:
        raise Exception("--egs.chunk-left-context should be non-negative")

    if args.chunk_right_context < 0:
        raise Exception("--egs.chunk-right-context should be non-negative")

    if not os.path.exists(args.dir):
        raise Exception("Directory specified with --dir={0} "
                        "does not exist.".format(args.dir))

    # The network definition must come from somewhere: either the configs
    # directory or an existing input model.
    configs_dir = args.dir + "/configs"
    if not (os.path.exists(configs_dir)
            or (args.input_model is not None
                and os.path.exists(args.input_model))):
        raise Exception("Either --trainer.input-model option should be supplied, "
                        "and exist; or the {0}/configs directory should exist. "
                        "{0}/configs is the output of make_configs.py"
                        "".format(args.dir))

    # set the options corresponding to args.use_gpu
    run_opts = common_train_lib.RunOpts()
    if args.use_gpu == "true":
        args.use_gpu = "yes"
    elif args.use_gpu == "false":
        args.use_gpu = "no"

    if args.use_gpu in ("yes", "wait"):
        if not common_lib.check_if_cuda_compiled():
            logger.warning(
                """You are running with one thread but you have not compiled
                   for CUDA.  You may be running a setup optimized for GPUs.
                   If you have GPUs and have nvcc installed, go to src/ and do
                   ./configure; make""")
        gpu_flag = "--use-gpu={}".format(args.use_gpu)
        queue_flag = "--gpu 1"
    else:
        logger.warning("Without using a GPU this will be very slow. "
                       "nnet3 does not yet support multiple threads.")
        gpu_flag = "--use-gpu=no"
        queue_flag = ""

    # The training, combination and prior-computation jobs all share the
    # same GPU flag and the same queue option.
    run_opts.train_queue_opt = queue_flag
    run_opts.parallel_train_opts = gpu_flag
    run_opts.combine_gpu_opt = gpu_flag
    run_opts.combine_queue_opt = queue_flag
    run_opts.prior_gpu_opt = gpu_flag
    run_opts.prior_queue_opt = queue_flag

    run_opts.command = args.command
    # The egs jobs fall back to the general command when no dedicated one
    # was given.
    run_opts.egs_command = (args.command
                            if args.egs_command is None else
                            args.egs_command)
    run_opts.num_jobs_compute_prior = args.num_jobs_compute_prior

    return [args, run_opts]


def train(args, run_opts):
    """ The main function for training.

    Runs the whole staged recipe.  Steps whose stage number is below
    args.stage are skipped, so an interrupted run can be resumed:
        stage -5: initialize a basic network (init.raw)
        stage -4: generate the egs (only when --egs-dir was not given)
        stage -3: compute the preconditioning matrix
        stage -2: compute the presoftmax prior scale
        stage -1: prepare the initial acoustic model
        stage 0 .. num_iters-1: the training iterations
        stage num_iters: final model combination (if enabled)
        stage num_iters+1: average posterior computation and prior
            re-adjustment, which writes final.mdl
    followed by the optional clean-up and the accuracy report.

    Args:
        args: a Namespace object with the required parameters
            obtained from the function process_args()
        run_opts: RunOpts object obtained from the process_args()

    Raises:
        Exception: if the model context variables are missing, if egs have
            to be generated but --feat-dir was not supplied, if the egs dir
            does not match --egs.chunk-width, if num_jobs_final exceeds the
            number of archives, or if proportional-shrink is too large.
    """

    arg_string = pprint.pformat(vars(args))
    logger.info("Arguments for the experiment\n{0}".format(arg_string))

    # Copy phones.txt from ali-dir to dir. Later, steps/nnet3/decode.sh will
    # use it to check compatibility between training and decoding phone-sets.
    shutil.copy('{0}/phones.txt'.format(args.ali_dir), args.dir)

    # Set some variables.
    num_jobs = common_lib.get_number_of_jobs(args.ali_dir)
    feat_dim = common_lib.get_feat_dim(args.feat_dir)
    ivector_dim = common_lib.get_ivector_dim(args.online_ivector_dir)
    ivector_id = common_lib.get_ivector_extractor_id(args.online_ivector_dir)

    # split the training data into parts for individual jobs
    # we will use the same number of jobs as that used for alignment.
    # --feat-dir is optional (it is only mandatory when the egs have to be
    # generated, see the check before generate_egs below), so without it
    # there is nothing to split and we must not shell out with "None".
    if args.feat_dir is not None:
        common_lib.execute_command("utils/split_data.sh {0} {1}".format(
            args.feat_dir, num_jobs))
    shutil.copy('{0}/tree'.format(args.ali_dir), args.dir)

    with open('{0}/num_jobs'.format(args.dir), 'w') as f:
        f.write('{}'.format(num_jobs))

    if args.input_model is None:
        # The model contexts are read from the vars file in the configs dir.
        config_dir = '{0}/configs'.format(args.dir)
        var_file = '{0}/vars'.format(config_dir)

        variables = common_train_lib.parse_generic_config_vars_file(var_file)
    else:
        # If args.input_model is specified, the model left and right contexts
        # are computed using input_model.
        variables = common_train_lib.get_input_model_info(args.input_model)

    # Set some variables.
    try:
        model_left_context = variables['model_left_context']
        model_right_context = variables['model_right_context']
    except KeyError as e:
        raise Exception("KeyError {0}: Variables need to be defined in "
                        "{1}".format(str(e), '{0}/configs'.format(args.dir)))

    # Total context of an eg = extra chunk context + the model's own context.
    # A negative "initial"/"final" chunk context is passed on as -1.
    left_context = args.chunk_left_context + model_left_context
    right_context = args.chunk_right_context + model_right_context
    left_context_initial = (args.chunk_left_context_initial + model_left_context if
                            args.chunk_left_context_initial >= 0 else -1)
    right_context_final = (args.chunk_right_context_final + model_right_context if
                           args.chunk_right_context_final >= 0 else -1)

    # Initialize as "raw" nnet, prior to training the LDA-like preconditioning
    # matrix.  This first config just does any initial splicing that we do;
    # we do this as it's a convenient way to get the stats for the 'lda-like'
    # transform.

    if (args.stage <= -5) and (args.input_model is None):
        logger.info("Initializing a basic network for estimating "
                    "preconditioning matrix")
        common_lib.execute_command(
            """{command} {dir}/log/nnet_init.log \
                    nnet3-init --srand=-2 {dir}/configs/init.config \
                    {dir}/init.raw""".format(command=run_opts.command,
                                             dir=args.dir))

    default_egs_dir = '{0}/egs'.format(args.dir)
    if args.stage <= -4 and args.egs_dir is None:
        logger.info("Generating egs")

        if args.feat_dir is None:
            raise Exception("--feat-dir option is required if you don't supply --egs-dir")

        train_lib.acoustic_model.generate_egs(
            data=args.feat_dir, alidir=args.ali_dir,
            egs_dir=default_egs_dir,
            left_context=left_context,
            right_context=right_context,
            left_context_initial=left_context_initial,
            right_context_final=right_context_final,
            run_opts=run_opts,
            frames_per_eg_str=args.chunk_width,
            srand=args.srand,
            egs_opts=args.egs_opts,
            cmvn_opts=args.cmvn_opts,
            online_ivector_dir=args.online_ivector_dir,
            samples_per_iter=args.samples_per_iter,
            stage=args.egs_stage)

    if args.egs_dir is None:
        egs_dir = default_egs_dir
    else:
        egs_dir = args.egs_dir

    [egs_left_context, egs_right_context,
     frames_per_eg_str, num_archives] = (
         common_train_lib.verify_egs_dir(egs_dir, feat_dim,
                                         ivector_dim, ivector_id,
                                         left_context, right_context,
                                         left_context_initial, right_context_final))
    if args.chunk_width != frames_per_eg_str:
        raise Exception("mismatch between --egs.chunk-width and the frames_per_eg "
                        "in the egs dir {0} vs {1}".format(args.chunk_width,
                                                           frames_per_eg_str))

    if args.num_jobs_final > num_archives:
        raise Exception('num_jobs_final cannot exceed the number of archives '
                        'in the egs directory')

    # copy the properties of the egs to dir for
    # use during decoding
    common_train_lib.copy_egs_properties_to_exp_dir(egs_dir, args.dir)

    if args.stage <= -3 and (args.input_model is None):
        logger.info('Computing the preconditioning matrix for input features')

        train_lib.common.compute_preconditioning_matrix(
            args.dir, egs_dir, num_archives, run_opts,
            max_lda_jobs=args.max_lda_jobs,
            rand_prune=args.rand_prune)

    if args.stage <= -2 and (args.input_model is None):
        logger.info("Computing initial vector for FixedScaleComponent before"
                    " softmax, using priors^{prior_scale} and rescaling to"
                    " average 1".format(
                        prior_scale=args.presoftmax_prior_scale_power))

        common_train_lib.compute_presoftmax_prior_scale(
            args.dir, args.ali_dir, num_jobs, run_opts,
            presoftmax_prior_scale_power=args.presoftmax_prior_scale_power)

    if args.stage <= -1:
        logger.info("Preparing the initial acoustic model.")
        train_lib.acoustic_model.prepare_initial_acoustic_model(
            args.dir, args.ali_dir, run_opts,
            input_model=args.input_model)

    # set num_iters so that as close as possible, we process the data
    # $num_epochs times, i.e. $num_iters*$avg_num_jobs) ==
    # $num_epochs*$num_archives, where
    # avg_num_jobs=(num_jobs_initial+num_jobs_final)/2.
    num_archives_to_process = int(args.num_epochs * num_archives)
    num_archives_processed = 0
    num_iters = int((num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final))

    # If do_final_combination is True, compute the set of models_to_combine.
    # Otherwise, models_to_combine will be none.
    if args.do_final_combination:
        models_to_combine = common_train_lib.get_model_combine_iters(
            num_iters, args.num_epochs,
            num_archives, args.max_models_combine,
            args.num_jobs_final)
    else:
        models_to_combine = None

    # Limits of the time range the derivatives are backpropagated over; they
    # stay None when no truncation margin is configured.
    min_deriv_time = None
    max_deriv_time_relative = None
    if args.deriv_truncate_margin is not None:
        min_deriv_time = -args.deriv_truncate_margin - model_left_context
        max_deriv_time_relative = \
           args.deriv_truncate_margin + model_right_context

    logger.info("Training will run for {0} epochs = "
                "{1} iterations".format(args.num_epochs, num_iters))

    for iter in range(num_iters):
        if (args.exit_stage is not None) and (iter == args.exit_stage):
            logger.info("Exiting early due to --exit-stage {0}".format(iter))
            return

        current_num_jobs = common_train_lib.get_current_num_jobs(
            iter, num_iters,
            args.num_jobs_initial, args.num_jobs_step, args.num_jobs_final)

        if args.stage <= iter:
            model_file = "{dir}/{iter}.mdl".format(dir=args.dir, iter=iter)

            lrate = common_train_lib.get_learning_rate(iter, current_num_jobs,
                                                       num_iters,
                                                       num_archives_processed,
                                                       num_archives_to_process,
                                                       args.initial_effective_lrate,
                                                       args.final_effective_lrate)

            # Start from the proportional shrinkage; when the fixed
            # shrink-value is the stronger of the two it takes over, but it
            # is only applied if should_do_shrinkage() says so for this
            # model (otherwise no shrinkage at all, i.e. 1.0).
            shrinkage_value = 1.0 - (args.proportional_shrink * lrate)
            if shrinkage_value <= 0.5:
                raise Exception("proportional-shrink={0} is too large, it gives "
                                "shrink-value={1}".format(args.proportional_shrink,
                                                          shrinkage_value))
            if args.shrink_value < shrinkage_value:
                shrinkage_value = (args.shrink_value
                                   if common_train_lib.should_do_shrinkage(
                                           iter, model_file,
                                           args.shrink_saturation_threshold) else 1.0)

            percent = num_archives_processed * 100.0 / num_archives_to_process
            epoch = (num_archives_processed * args.num_epochs
                     / num_archives_to_process)
            shrink_info_str = ''
            if shrinkage_value != 1.0:
                shrink_info_str = 'shrink: {0:0.5f}'.format(shrinkage_value)
            logger.info("Iter: {0}/{1}   Jobs: {2}   "
                        "Epoch: {3:0.2f}/{4:0.1f} ({5:0.1f}% complete)   "
                        "lr: {6:0.6f}   {7}".format(iter, num_iters - 1,
                                                    current_num_jobs,
                                                    epoch, args.num_epochs,
                                                    percent,
                                                    lrate, shrink_info_str))

            train_lib.common.train_one_iteration(
                dir=args.dir,
                iter=iter,
                srand=args.srand,
                egs_dir=egs_dir,
                num_jobs=current_num_jobs,
                num_archives_processed=num_archives_processed,
                num_archives=num_archives,
                learning_rate=lrate,
                dropout_edit_string=common_train_lib.get_dropout_edit_string(
                    args.dropout_schedule,
                    float(num_archives_processed) / num_archives_to_process,
                    iter),
                train_opts=' '.join(args.train_opts),
                shrinkage_value=shrinkage_value,
                minibatch_size_str=args.num_chunk_per_minibatch,
                min_deriv_time=min_deriv_time,
                max_deriv_time_relative=max_deriv_time_relative,
                momentum=args.momentum,
                max_param_change=args.max_param_change,
                shuffle_buffer_size=args.shuffle_buffer_size,
                run_opts=run_opts,
                backstitch_training_scale=args.backstitch_training_scale,
                backstitch_training_interval=args.backstitch_training_interval,
                compute_per_dim_accuracy=args.compute_per_dim_accuracy)

            if args.cleanup:
                # clean up everything but the last 2 models, under certain
                # conditions
                common_train_lib.remove_model(
                    args.dir, iter-2, num_iters, models_to_combine,
                    args.preserve_model_interval)

            if args.email is not None:
                # NOTE(review): this interval is a float whenever
                # args.reporting_interval is fractional (true division is
                # enabled in this file), so the modulo test below may only
                # match iteration 0 -- confirm this is intended.
                reporting_iter_interval = num_iters * args.reporting_interval
                if iter % reporting_iter_interval == 0:
                    # lets do some reporting
                    [report, times, data] = (
                        nnet3_log_parse.generate_acc_logprob_report(args.dir))
                    message = report
                    subject = ("Update : Expt {dir} : "
                               "Iter {iter}".format(dir=args.dir, iter=iter))
                    common_lib.send_mail(message, subject, args.email)

        # Counted even for skipped iterations, so that a resumed run sees
        # the same progress (learning rate, archive schedule) as a full run.
        num_archives_processed = num_archives_processed + current_num_jobs

    if args.stage <= num_iters:
        if args.do_final_combination:
            logger.info("Doing final combination to produce final.mdl")
            train_lib.common.combine_models(
                dir=args.dir, num_iters=num_iters,
                models_to_combine=models_to_combine, egs_dir=egs_dir,
                run_opts=run_opts,
                minibatch_size_str=args.num_chunk_per_minibatch,
                chunk_width=args.chunk_width,
                max_objective_evaluations=args.max_objective_evaluations,
                compute_per_dim_accuracy=args.compute_per_dim_accuracy)

    if args.stage <= num_iters + 1:
        logger.info("Getting average posterior for purposes of "
                    "adjusting the priors.")

        # If args.do_final_combination is true, we will use the combined model.
        # Otherwise, we will use the last_numbered model.
        real_iter = 'combined' if args.do_final_combination else num_iters
        avg_post_vec_file = train_lib.common.compute_average_posterior(
            dir=args.dir, iter=real_iter, egs_dir=egs_dir,
            num_archives=num_archives,
            prior_subset_size=args.prior_subset_size, run_opts=run_opts)

        logger.info("Re-adjusting priors based on computed posteriors")
        combined_or_last_numbered_model = "{dir}/{iter}.mdl".format(dir=args.dir,
                iter=real_iter)
        final_model = "{dir}/final.mdl".format(dir=args.dir)
        train_lib.common.adjust_am_priors(args.dir, combined_or_last_numbered_model,
                                          avg_post_vec_file, final_model,
                                          run_opts)

    if args.cleanup:
        logger.info("Cleaning up the experiment directory "
                    "{0}".format(args.dir))
        remove_egs = args.remove_egs
        if args.egs_dir is not None:
            # this egs_dir was not created by this experiment so we will not
            # delete it
            remove_egs = False

        common_train_lib.clean_nnet_dir(
            nnet_dir=args.dir, num_iters=num_iters, egs_dir=egs_dir,
            preserve_model_interval=args.preserve_model_interval,
            remove_egs=remove_egs)

    # do some reporting
    [report, times, data] = nnet3_log_parse.generate_acc_logprob_report(args.dir)
    if args.email is not None:
        common_lib.send_mail(report, "Update : Expt {0} : "
                                     "complete".format(args.dir), args.email)

    with open("{dir}/accuracy.report".format(dir=args.dir), "w") as f:
        f.write(report)

    common_lib.execute_command("steps/info/nnet3_dir_info.pl "
                               "{0}".format(args.dir))


def main():
    """Script entry point: parse options, train, and report failures.

    Any exception that escapes training (KeyboardInterrupt included) makes
    the process exit with status 1.  When ``args.email`` is set, a failure
    notice is mailed before the traceback is printed.
    """
    args, run_opts = get_args()
    try:
        train(args, run_opts)
        common_lib.wait_for_background_commands()
    except BaseException as err:
        # BaseException rather than Exception is deliberate: it also covers
        # KeyboardInterrupt, which is what we get when a background thread
        # dies.
        if args.email is not None:
            notice = ("Training session for experiment {dir} "
                      "died due to an error.".format(dir=args.dir))
            common_lib.send_mail(notice, notice, args.email)
        interrupted = isinstance(err, KeyboardInterrupt)
        if not interrupted:
            traceback.print_exc()
        sys.exit(1)


# Run the training driver only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


================================================
FILE: egs/steps/nnet3/train_tdnn.sh
================================================
#!/usr/bin/env bash

# THIS SCRIPT IS DEPRECATED, see ./train_dnn.py

# note, TDNN is the same as what we used to call multisplice.

# Copyright 2012-2015  Johns Hopkins University (Author: Daniel Povey).
#           2013  Xiaohui Zhang
#           2013  Guoguo Chen
#           2014  Vimal Manohar
#           2014  Vijayaditya Peddinti
# Apache 2.0.


# Begin configuration section.
# Defaults for the options of this script; the usage message below refers
# the reader here for options it does not list.
cmd=run.pl
num_epochs=15      # Number of epochs of training;
                   # the number of iterations is worked out from this.
initial_effective_lrate=0.01
final_effective_lrate=0.001
pnorm_input_dim=3000
pnorm_output_dim=300
relu_dim=  # if set, --relu-dim is passed to make_tdnn_configs.py instead of
           # the two p-norm dimensions above.
rand_prune=4.0 # Passed as --rand-prune to nnet3-acc-lda-stats (a speedup for LDA).
minibatch_size=512  # This default is suitable for GPU-based training.
                    # Set it to 128 for multi-threaded CPU-based training.
max_param_change=2.0  # max param change per minibatch
samples_per_iter=400000 # each iteration of training, see this many samples
                        # per job.  This option is passed to get_egs.sh
num_jobs_initial=1  # Number of neural net jobs to run in parallel at the start of training
num_jobs_final=8   # Number of neural net jobs to run in parallel at the end of training
prior_subset_size=20000 # 20k samples per job, for computing priors.
num_jobs_compute_prior=10 # number of parallel jobs used when computing the
                          # average posterior for prior adjustment.
get_egs_stage=0    # passed to get_egs.sh as --stage; can be used for rerunning
                   # example generation after a partial run.
online_ivector_dir=  # if set, the iVector dimension is read from
                     # $online_ivector_dir/ivector_online.scp and the dir is
                     # passed on to get_egs.sh and align.sh.
presoftmax_prior_scale_power=-0.25  # power applied to the smoothed pdf counts
                                    # when building presoftmax_prior_scale.vec.
use_presoftmax_prior_scale=true  # passed to make_tdnn_configs.py.
remove_egs=true  # set to false to disable removing egs after training is done.

max_models_combine=20 # Starting value for the number of final-iteration models
  # given to the final 'combine' stage; see the num_iters_combine computation
  # further down for how it is adjusted.

shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
                # on each iter.  You could set it to 0 or to a large value for complete
                # randomization, but this would both consume memory and cause spikes in
                # disk I/O.  Smaller is easier on disk and memory but less random.  It's
                # not a huge deal though, as samples are anyway randomized right at the start.
                # The point of this is to get data in different minibatches on different
                # iterations, since in the preconditioning method, 2 samples in the same
                # minibatch can affect each other's gradients.

add_layers_period=2 # by default, add new layers every 2 iterations.
stage=-6
exit_stage=-100 # you can set this to terminate the training early.  Exits before running this stage

# count space-separated fields in splice_indexes to get num-hidden-layers.
splice_indexes="-4,-3,-2,-1,0,1,2,3,4  0  -2,2  0  -4,4 0"
# Format, as used by the default above: one space-separated field per hidden
# layer, each field a comma-separated list of frame offsets (presumably the
# offsets spliced at that layer's input -- confirm against
# make_tdnn_configs.py).
# note: hidden layers are composed of one or more components,
# so hidden layer indexing is different from component count

randprune=4.0 # NOTE(review): not referenced anywhere else in this script;
              # rand_prune above is the value actually used.
use_gpu=true    # if true, we run on GPU.
cleanup=true
egs_dir=
max_lda_jobs=10  # use no more than 10 jobs for the LDA accumulation.
lda_opts=
egs_opts=
transform_dir=     # If supplied, this dir used instead of alidir to find transforms.
cmvn_opts=  # passed to get_egs.sh as --cmvn-opts, if supplied.
            # only relevant for "raw" features, not lda.
feat_type=raw  # or set to 'lda' to use LDA features.
align_cmd=              # The cmd that is passed to steps/nnet3/align.sh
align_use_gpu=          # Passed as --use-gpu to steps/nnet3/align.sh [yes/no]
realign_times=          # List of times on which we realign.  Each time is
                        # floating point number strictly between 0 and 1, which
                        # will be multiplied by the num-iters to get an iteration
                        # number.
num_jobs_align=30       # Number of jobs for realignment
# End configuration section.
frames_per_eg=8 # passed on to get_egs.sh; later overwritten with the value
                # recorded in $egs_dir/info/frames_per_eg.

# If the script is interrupted, kill any background jobs it has started.
trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM

echo "$0: THIS SCRIPT IS DEPRECATED"
echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# The usage text below lists only options that have a matching variable in
# the configuration section, and the defaults shown are the ones set there.
if [ $# != 4 ]; then
  echo "Usage: $0 [opts] <data> <lang> <ali-dir> <exp-dir>"
  echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --num-epochs <#epochs|15>                        # Number of epochs of training"
  echo "  --initial-effective-lrate <lrate|0.01>           # effective learning rate at start of training."
  echo "  --final-effective-lrate <lrate|0.001>            # effective learning rate at end of training."
  echo "  --add-layers-period <#iters|2>                   # Number of iterations between adding hidden layers"
  echo "  --presoftmax-prior-scale-power <power|-0.25>     # use the specified power value on the priors (inverse priors) to scale"
  echo "                                                   # the pre-softmax outputs (set to 0.0 to disable the presoftmax element scale)"
  echo "  --num-jobs-initial <num-jobs|1>                  # Number of parallel jobs to use for neural net training, at the start."
  echo "  --num-jobs-final <num-jobs|8>                    # Number of parallel jobs to use for neural net training, at the end"
  echo "  --minibatch-size <minibatch-size|512>            # Size of minibatch to process (the default suits GPU-based training)."
  echo "  --samples-per-iter <#samples|400000>             # Number of samples of data to process per iteration, per"
  echo "                                                   # process."
  echo "  --splice-indexes <string|\"-4,-3,-2,-1,0,1,2,3,4  0  -2,2  0  -4,4 0\"> "
  echo "                                                   # Frame indices used for each splice layer."
  echo "                                                   # Format : one space-separated field per hidden layer, each a"
  echo "                                                   # comma-separated list of frame indices"
  echo "                                                   # (note: we splice processed, typically 40-dimensional frames)"
  echo "  --realign-times <list-of-times|\"\">             # A list of space-separated floating point numbers between 0.0 and"
  echo "                                                   # 1.0 to specify how far through training realignment is to be done"
  echo "  --align-cmd (utils/run.pl|utils/queue.pl <queue opts>) # passed to align.sh"
  echo "  --align-use-gpu (yes/no)                         # specify is gpu is to be used for realignment"
  echo "  --num-jobs-align <#njobs|30>                     # Number of jobs to perform realignment"
  echo "  --stage <stage|-6>                               # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."


  exit 1;
fi

# Positional arguments: data dir, lang dir, alignment dir, experiment dir.
data=$1
lang=$2
alidir=$3
dir=$4

# Realignment requires both the aligner command and its GPU setting.
if [ ! -z "$realign_times" ]; then
  [ -z "$align_cmd" ] && echo "$0: realign_times specified but align_cmd not specified" && exit 1
  [ -z "$align_use_gpu" ] && echo "$0: realign_times specified but align_use_gpu not specified" && exit 1
fi

# Check some files.
for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

# Create the experiment directory before copying anything into it; otherwise
# the first 'cp ... $dir' on a fresh run would create a regular file named
# $dir instead of placing the file inside the directory.
mkdir -p $dir/log || exit 1

# Copy phones.txt from ali-dir to dir. Later, steps/nnet3/decode.sh will
# use it to check compatibility between training and decoding phone-sets.
cp $alidir/phones.txt $dir

# Set some variables.
num_leaves=`tree-info $alidir/tree 2>/dev/null | grep num-pdfs | awk '{print $2}'` || exit 1
[ -z "$num_leaves" ] && echo "\$num_leaves is unset" && exit 1
[ "$num_leaves" -eq "0" ] && echo "\$num_leaves is 0" && exit 1

nj=`cat $alidir/num_jobs` || exit 1;  # number of jobs in alignment dir...
# in this dir we'll have just one job.
sdata=$data/split$nj
utils/split_data.sh $data $nj

echo $nj > $dir/num_jobs
cp $alidir/tree $dir

utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;
# First work out the feature and iVector dimension, needed for tdnn config creation.
case $feat_type in
  raw) feat_dim=$(feat-to-dim --print-args=false scp:$data/feats.scp -) || \
      { echo "$0: Error getting feature dim"; exit 1; }
    ;;
  lda)
    # The LDA matrix is required for this feature type; stop here if it is
    # missing rather than continuing with an empty feat_dim.
    if [ ! -f $alidir/final.mat ]; then
      echo "$0: With --feat-type lda option, expect $alidir/final.mat to exist."
      exit 1
    fi
    # get num-rows in lda matrix, which is the lda feature dim.
    feat_dim=$(matrix-dim --print-args=false $alidir/final.mat | cut -f 1)
    ;;
  *)
   echo "$0: Bad --feat-type '$feat_type';"; exit 1;
esac
# No iVector directory means no iVector input, i.e. dimension 0.
if [ -z "$online_ivector_dir" ]; then
  ivector_dim=0
else
  ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1;
fi


# Stage -5: generate the network config files and build the initial raw nnet.
if [ $stage -le -5 ]; then
  echo "$0: creating neural net configs";

  # Use a ReLU dimension if one was given, otherwise the p-norm dimensions.
  if [ ! -z "$relu_dim" ]; then
    dim_opts="--relu-dim $relu_dim"
  else
    dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim  $pnorm_output_dim"
  fi

  # create the config files for nnet initialization
  python steps/nnet3/make_tdnn_configs.py  \
    --splice-indexes "$splice_indexes"  \
    --feat-dim $feat_dim \
    --ivector-dim $ivector_dim  \
     $dim_opts \
    --use-presoftmax-prior-scale $use_presoftmax_prior_scale \
    --num-targets  $num_leaves  \
   $dir/configs || exit 1;

  # Initialize as "raw" nnet, prior to training the LDA-like preconditioning
  # matrix.  This first config just does any initial splicing that we do;
  # we do this as it's a convenient way to get the stats for the 'lda-like'
  # transform.
  $cmd $dir/log/nnet_init.log \
    nnet3-init --srand=-2 $dir/configs/init.config $dir/init.raw || exit 1;
fi

# sourcing the "vars" below sets
# left_context=(something)
# right_context=(something)
# num_hidden_layers=(something)
# This is done regardless of $stage, so $dir/configs/vars must already exist
# when the script is resumed from a later stage.
. $dir/configs/vars || exit 1;

context_opts="--left-context=$left_context --right-context=$right_context"

# Sanity check on what the vars file provided.
! [ "$num_hidden_layers" -gt 0 ] && echo \
 "$0: Expected num_hidden_layers to be defined" && exit 1;

# Transforms default to the alignment directory.
[ -z "$transform_dir" ] && transform_dir=$alidir


# Stage -4: dump training examples, unless an existing egs dir was supplied
# via --egs-dir.
if [ $stage -le -4 ] && [ -z "$egs_dir" ]; then
  extra_opts=()
  [ ! -z "$cmvn_opts" ] && extra_opts+=(--cmvn-opts "$cmvn_opts")
  [ ! -z "$feat_type" ] && extra_opts+=(--feat-type $feat_type)
  [ ! -z "$online_ivector_dir" ] && extra_opts+=(--online-ivector-dir $online_ivector_dir)
  extra_opts+=(--transform-dir $transform_dir)
  extra_opts+=(--left-context $left_context)
  extra_opts+=(--right-context $right_context)
  echo "$0: calling get_egs.sh"
  # $egs_opts is passed once, in the position of its last occurrence in the
  # earlier version of this command (which passed it twice).
  steps/nnet3/get_egs.sh "${extra_opts[@]}" \
      --samples-per-iter $samples_per_iter --stage $get_egs_stage \
      --cmd "$cmd" $egs_opts \
      --frames-per-eg $frames_per_eg \
      $data $alidir $dir/egs || exit 1;
fi

[ -z "$egs_dir" ] && egs_dir=$dir/egs

# The egs must have been dumped with the same input dimensions as the
# network we are about to train.
if [ "$feat_dim" != "$(cat $egs_dir/info/feat_dim)" ]; then
  echo "$0: feature dimension mismatch with egs, $feat_dim vs $(cat $egs_dir/info/feat_dim)";
  exit 1;
fi
if [ "$ivector_dim" != "$(cat $egs_dir/info/ivector_dim)" ]; then
  echo "$0: ivector dimension mismatch with egs, $ivector_dim vs $(cat $egs_dir/info/ivector_dim)";
  exit 1;
fi

# copy any of the following that exist, to $dir.
cp $egs_dir/{cmvn_opts,splice_opts,final.mat} $dir 2>/dev/null

# confirm that the egs_dir has the necessary context (especially important if
# the --egs-dir option was used on the command line).
egs_left_context=$(cat $egs_dir/info/left_context) || exit -1
egs_right_context=$(cat $egs_dir/info/right_context) || exit -1
 ( [ $egs_left_context -lt $left_context ] || \
   [ $egs_right_context -lt $right_context ] ) && \
   echo "$0: egs in $egs_dir have too little context" && exit -1;

# From here on frames_per_eg is whatever the egs were actually dumped with.
frames_per_eg=$(cat $egs_dir/info/frames_per_eg) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; }
num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/num_archives"; exit 1; }

# num_archives_expanded considers each separate label-position from
# 0..frames_per_eg-1 to be a separate archive.
num_archives_expanded=$[$num_archives*$frames_per_eg]

[ $num_jobs_initial -gt $num_jobs_final ] && \
  echo "$0: --num-jobs-initial cannot exceed --num-jobs-final" && exit 1;

[ $num_jobs_final -gt $num_archives_expanded ] && \
  echo "$0: --num-jobs-final cannot exceed #archives $num_archives_expanded." && exit 1;


# Stage -3: estimate the LDA-like preconditioning transform for the input
# features and link it into the configs directory.
if [ $stage -le -3 ]; then
  echo "$0: getting preconditioning matrix for input features."
  # One accumulation job per archive, capped at $max_lda_jobs.
  num_lda_jobs=$num_archives
  [ $num_lda_jobs -gt $max_lda_jobs ] && num_lda_jobs=$max_lda_jobs

  # Write stats with the same format as stats for LDA.
  $cmd JOB=1:$num_lda_jobs $dir/log/get_lda_stats.JOB.log \
      nnet3-acc-lda-stats --rand-prune=$rand_prune \
        $dir/init.raw "ark:$egs_dir/egs.JOB.ark" $dir/JOB.lda_stats || exit 1;

  # Sum the per-job stats into one file, then delete the per-job files.
  all_lda_accs=$(for n in $(seq $num_lda_jobs); do echo $dir/$n.lda_stats; done)
  $cmd $dir/log/sum_transform_stats.log \
    sum-lda-accs $dir/lda_stats $all_lda_accs || exit 1;

  rm $all_lda_accs || exit 1;

  # this computes a fixed affine transform computed in the way we described in
  # Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled variant
  # of an LDA transform but without dimensionality reduction.
  $cmd $dir/log/get_transform.log \
     nnet-get-feature-transform $lda_opts $dir/lda.mat $dir/lda_stats || exit 1;

  # The link target is relative to $dir/configs, i.e. it points at $dir/lda.mat.
  ln -sf ../lda.mat $dir/configs/lda.mat
fi


# Stage -2: build presoftmax_prior_scale.vec from the pdf counts of the
# alignments.
if [ $stage -le -2 ]; then
  echo "$0: preparing initial vector for FixedScaleComponent before softmax"
  echo "  ... using priors^$presoftmax_prior_scale_power and rescaling to average 1"

  # obtains raw pdf count
  $cmd JOB=1:$nj $dir/log/acc_pdf.JOB.log \
     ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
     post-to-tacc --per-pdf=true  $alidir/final.mdl ark:- $dir/pdf_counts.JOB || exit 1;
  # Sum the per-job counts into a single text-format vector.
  $cmd $dir/log/sum_pdf_counts.log \
       vector-sum --binary=false $dir/pdf_counts.* $dir/pdf_counts || exit 1;
  rm $dir/pdf_counts.*

  # The awk program below reads fields 2..NF-1 of the counts line as the
  # per-pdf counts (the first and last fields are skipped; presumably they
  # are the '[' and ']' of the text vector).  Each count is smoothed by
  # adding smooth * average_count and raised to the given power; the results
  # are then rescaled so that they sum to num_pdfs, i.e. average 1.
  awk -v power=$presoftmax_prior_scale_power -v smooth=0.01 \
     '{ for(i=2; i<=NF-1; i++) { count[i-2] = $i;  total += $i; }
        num_pdfs=NF-2;  average_count = total/num_pdfs;
        for (i=0; i<num_pdfs; i++) stot += (scale[i] = (count[i] + smooth * average_count)^power)
        printf " [ "; for (i=0; i<num_pdfs; i++) printf("%f ", scale[i]*num_pdfs/stot); print "]" }' \
     $dir/pdf_counts > $dir/presoftmax_prior_scale.vec
  # The link target is relative to $dir/configs.
  ln -sf ../presoftmax_prior_scale.vec $dir/configs/presoftmax_prior_scale.vec
fi

# Stage -1: add the first hidden layer and create the initial acoustic
# model $dir/0.mdl.
if [ $stage -le -1 ]; then
  # Add the first layer; this will add in the lda.mat and
  # presoftmax_prior_scale.vec.
  $cmd $dir/log/add_first_layer.log \
       nnet3-init --srand=-3 $dir/init.raw $dir/configs/layer1.config $dir/0.raw || exit 1;

  # Convert to .mdl, train the transitions, set the priors.
  # The transition model is taken from $alidir/final.mdl and trained on all
  # of the alignments in $alidir.
  $cmd $dir/log/init_mdl.log \
    nnet3-am-init $alidir/final.mdl $dir/0.raw - \| \
    nnet3-am-train-transitions - "ark:gunzip -c $alidir/ali.*.gz|" $dir/0.mdl || exit 1;
fi


# set num_iters so that as close as possible, we process the data $num_epochs
# times, i.e. $num_iters*$avg_num_jobs == $num_epochs*$num_archives_expanded,
# where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2.

num_archives_to_process=$[$num_epochs*$num_archives_expanded]
num_archives_processed=0
num_iters=$[($num_archives_to_process*2)/($num_jobs_initial+$num_jobs_final)]

# Iteration by which all hidden layers have been added.  This must be set
# before the check below: with the variable still unset, the arithmetic
# expansion would evaluate to 2 and the check would only require
# num_iters > 2.
finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period]

# Require at least a few iterations after the last layer has been added.
! [ $num_iters -gt $[$finish_add_layers_iter+2] ] \
  && echo "$0: Insufficient epochs" && exit 1

echo "$0: Will train for $num_epochs epochs = $num_iters iterations"

# Choose queue and program options depending on whether training runs on GPU.
# Note that train_queue_opt is only assigned in the GPU branch, and
# parallel_suffix is not referenced anywhere else in this script.
if $use_gpu; then
  parallel_suffix=""
  train_queue_opt="--gpu 1"
  combine_queue_opt="--gpu 1"
  prior_gpu_opt="--use-gpu=yes"
  prior_queue_opt="--gpu 1"
  parallel_train_opts=
  # Despite the word WARNING in the message, a build without CUDA is fatal
  # here: the script exits with status 1.
  if ! cuda-compiled; then
    echo "$0: WARNING: you are running with one thread but you have not compiled"
    echo "   for CUDA.  You may be running a setup optimized for GPUs.  If you have"
    echo "   GPUs and have nvcc installed, go to src/ and do ./configure; make"
    exit 1
  fi
else
  echo "$0: without using a GPU this will be very slow.  nnet3 does not yet support multiple threads."
  parallel_train_opts="--use-gpu=no"
  combine_queue_opt=""  # the combine stage will be quite slow if not using
                        # GPU, as we didn't enable that program to use
                        # multiple threads.
  prior_gpu_opt="--use-gpu=no"
  prior_queue_opt=""
fi


approx_iters_per_epoch_final=$[$num_archives_expanded/$num_jobs_final]
# First work out how many iterations we want to combine over in the final
# nnet3-combine invocation.  The number we use is:
# min(max(max_models_combine, approx_iters_per_epoch_final),
#     1/2 * iters_after_last_layer_added)
num_iters_combine=$max_models_combine
if [ $num_iters_combine -lt $approx_iters_per_epoch_final ]; then
   num_iters_combine=$approx_iters_per_epoch_final
fi
# Never combine over more than half of the iterations that follow the
# addition of the last hidden layer.
half_iters_after_add_layers=$[($num_iters-$finish_add_layers_iter)/2]
if [ $num_iters_combine -gt $half_iters_after_add_layers ]; then
  num_iters_combine=$half_iters_after_add_layers
fi
# Iteration number of the earliest model that takes part in the combination;
# the training loop's cleanup keeps models from this iteration onwards.
first_model_combine=$[$num_iters-$num_iters_combine+1]

# x is the iteration counter of the training loop.
x=0

# Build realign_this_iter[], indexed by iteration number, holding the
# --realign-times value that maps to that iteration.
for realign_time in $realign_times; do
  # Work out the iterations on which we will re-align, if the --realign-times
  # option was used.  This is slightly approximate.
  # Reject values outside (0, 1) outright instead of only printing a message
  # and carrying on with a meaningless iteration number.
  if ! perl -e "exit($realign_time > 0.0 && $realign_time < 1.0 ? 0:1);"; then
    echo "Invalid --realign-times option $realign_times: elements must be strictly between 0 and 1.";
    exit 1
  fi
  # The formula takes the increase in the number of jobs from
  # num_jobs_initial to num_jobs_final into account, so that the fraction
  # $realign_time refers to the amount of data processed rather than to the
  # iteration count.
  realign_iter=$(perl -e '($j,$k,$n,$p)=@ARGV; print int(0.5 + ($j==$k ? $n*$p : $n*(sqrt((1-$p)*$j*$j+$p*$k*$k)-$j)/($k-$j))); ' $num_jobs_initial $num_jobs_final $num_iters $realign_time) || exit 1;
  realign_this_iter[$realign_iter]=$realign_time
done

# Directory the training examples are currently read from; it changes after
# each realignment.
cur_egs_dir=$egs_dir

# Main training loop: one pass per iteration x.  Each iteration trains
# this_num_jobs models in parallel starting from $dir/$x.mdl and merges them
# (by averaging, or by picking the best one) into $dir/$[x+1].mdl.
# The bookkeeping (cur_egs_dir, num_archives_processed) is updated even for
# iterations skipped because of --stage, so that resuming works.
while [ $x -lt $num_iters ]; do
  [ $x -eq $exit_stage ] && echo "$0: Exiting early due to --exit-stage $exit_stage" && exit 0;

  # Number of parallel jobs, interpolated linearly between num_jobs_initial
  # and num_jobs_final and rounded to the nearest integer.
  this_num_jobs=$(perl -e "print int(0.5+$num_jobs_initial+($num_jobs_final-$num_jobs_initial)*$x/$num_iters);")

  # The effective learning rate decays exponentially from ilr to flr with
  # the number of archives processed (the last iteration uses flr exactly);
  # it is multiplied by the number of jobs to get the actual learning rate.
  ilr=$initial_effective_lrate; flr=$final_effective_lrate; np=$num_archives_processed; nt=$num_archives_to_process;
  this_learning_rate=$(perl -e "print (($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt))*$this_num_jobs);");

  echo "On iteration $x, learning rate is $this_learning_rate."

  # On a realignment iteration, switch to a new egs directory named after
  # the realign time.
  if [ ! -z "${realign_this_iter[$x]}" ]; then
    prev_egs_dir=$cur_egs_dir
    cur_egs_dir=$dir/egs_${realign_this_iter[$x]}
  fi

  if [ $x -ge 0 ] && [ $stage -le $x ]; then
    if [ ! -z "${realign_this_iter[$x]}" ]; then
      time=${realign_this_iter[$x]}

      echo "Getting average posterior for purposes of adjusting the priors."
      # Note: this just uses CPUs, using a smallish subset of data.
      # always use the first egs archive, which makes the script simpler;
      # we're using different random subsets of it.
      rm $dir/post.$x.*.vec 2>/dev/null
      $cmd JOB=1:$num_jobs_compute_prior $dir/log/get_post.$x.JOB.log \
        nnet3-copy-egs --srand=JOB --frame=random $context_opts ark:$prev_egs_dir/egs.1.ark ark:- \| \
        nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \
        nnet3-merge-egs ark:- ark:- \| \
        nnet3-compute-from-egs --apply-exp=true "nnet3-am-copy --raw=true $dir/$x.mdl -|" ark:- ark:- \| \
        matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1;

      sleep 3;  # make sure there is time for $dir/post.$x.*.vec to appear.

      $cmd $dir/log/vector_sum.$x.log \
        vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1;
      rm $dir/post.$x.*.vec;

      echo "Re-adjusting priors based on computed posteriors"
      $cmd $dir/log/adjust_priors.$x.log \
        nnet3-am-adjust-priors $dir/$x.mdl $dir/post.$x.vec $dir/$x.mdl || exit 1;

      sleep 2

      # Realign the data with the current model, then relabel the previous
      # egs with the new alignments to produce the new egs directory.
      steps/nnet3/align.sh --nj $num_jobs_align --cmd "$align_cmd" --use-gpu $align_use_gpu \
        --transform-dir "$transform_dir" --online-ivector-dir "$online_ivector_dir" \
        --iter $x $data $lang $dir $dir/ali_$time || exit 1

      steps/nnet3/relabel_egs.sh --cmd "$cmd" --iter $x $dir/ali_$time \
        $prev_egs_dir $cur_egs_dir || exit 1

      # NOTE(review): the final cleanup at the end of this script calls
      # steps/nnet2/remove_egs.sh instead; confirm that the nnet3 path used
      # here exists.
      if $cleanup && [[ $prev_egs_dir =~ $dir/egs* ]]; then
        steps/nnet3/remove_egs.sh $prev_egs_dir
      fi
    fi

    # Set off jobs doing some diagnostics, in the background.
    # These read the diagnostic egs from $cur_egs_dir.
    $cmd $dir/log/compute_prob_valid.$x.log \
      nnet3-compute-prob "nnet3-am-copy --raw=true $dir/$x.mdl - |" \
            "ark:nnet3-merge-egs ark:$cur_egs_dir/valid_diagnostic.egs ark:- |" &
    $cmd $dir/log/compute_prob_train.$x.log \
      nnet3-compute-prob "nnet3-am-copy --raw=true $dir/$x.mdl - |" \
           "ark:nnet3-merge-egs ark:$cur_egs_dir/train_diagnostic.egs ark:- |" &

    if [ $x -gt 0 ]; then
      $cmd $dir/log/progress.$x.log \
        nnet3-show-progress --use-gpu=no "nnet3-am-copy --raw=true $dir/$[$x-1].mdl - |" "nnet3-am-copy --raw=true $dir/$x.mdl - |" \
        "ark:nnet3-merge-egs ark:$cur_egs_dir/train_diagnostic.egs ark:-|" '&&' \
        nnet3-info "nnet3-am-copy --raw=true $dir/$x.mdl - |" &
    fi

    echo "Training neural net (pass $x)"

    # A new hidden layer is added every add_layers_period iterations until
    # all num_hidden_layers layers are present.
    if [ $x -gt 0 ] && \
      [ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \
      [ $[$x%$add_layers_period] -eq 0 ]; then
      do_average=false # if we've just added a layer, don't do averaging but
                       # take the best.
      cur_num_hidden_layers=$[1+$x/$add_layers_period]
      config=$dir/configs/layer$cur_num_hidden_layers.config
      raw="nnet3-am-copy --raw=true --learning-rate=$this_learning_rate $dir/$x.mdl - | nnet3-init --srand=$x - $config - |"
    else
      do_average=true
      if [ $x -eq 0 ]; then do_average=false; fi # on iteration 0, pick the best, don't average.
      raw="nnet3-am-copy --raw=true --learning-rate=$this_learning_rate $dir/$x.mdl -|"
    fi
    if $do_average; then
      this_minibatch_size=$minibatch_size
    else
      # on iteration zero or when we just added a layer, use a smaller minibatch
      # size (and we will later choose the output of just one of the jobs): the
      # model-averaging isn't always helpful when the model is changing too fast
      # (i.e. it can worsen the objective function), and the smaller minibatch
      # size will help to keep the update stable.
      this_minibatch_size=$[$minibatch_size/2];
    fi

    # $dir/.error is touched by any training job that fails.
    rm $dir/.error 2>/dev/null


    ( # this sub-shell is so that when we "wait" below,
      # we only wait for the training jobs that we just spawned,
      # not the diagnostic jobs that we spawned above.

      # We can't easily use a single parallel SGE job to do the main training,
      # because the computation of which archive and which --frame option
      # to use for each job is a little complex, so we spawn each one separately.
      for n in $(seq $this_num_jobs); do
        k=$[$num_archives_processed + $n - 1]; # k is a zero-based index that we'll derive
                                               # the other indexes from.
        archive=$[($k%$num_archives)+1]; # work out the 1-based archive index.
        frame=$[(($k/$num_archives)%$frames_per_eg)]; # work out the 0-based frame
        # index; this increases more slowly than the archive index because the
        # same archive with different frame indexes will give similar gradients,
        # so we want to separate them in time.

        $cmd $train_queue_opt $dir/log/train.$x.$n.log \
          nnet3-train $parallel_train_opts \
          --max-param-change=$max_param_change "$raw" \
          "ark:nnet3-copy-egs --frame=$frame $context_opts ark:$cur_egs_dir/egs.$archive.ark ark:- | nnet3-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-merge-egs --minibatch-size=$this_minibatch_size --discard-partial-minibatches=true ark:- ark:- |" \
          $dir/$[$x+1].$n.raw || touch $dir/.error &
      done
      wait
    )
    # the error message below is not that informative, but $cmd will
    # have printed a more specific one.
    [ -f $dir/.error ] && echo "$0: error on iteration $x of training" && exit 1;

    # The per-job output models of this iteration.
    nnets_list=
    for n in `seq 1 $this_num_jobs`; do
      nnets_list="$nnets_list $dir/$[$x+1].$n.raw"
    done

    if $do_average; then
      # average the output of the different jobs.
      $cmd $dir/log/average.$x.log \
        nnet3-average $nnets_list - \| \
        nnet3-am-copy --set-raw-nnet=- $dir/$x.mdl $dir/$[$x+1].mdl || exit 1;
    else
      # choose the best from the different jobs: scan each job's training log
      # for its last log-prob-per-frame value and keep the job with the
      # highest one.  The job count passed to perl must be $this_num_jobs,
      # the number of jobs actually run on this iteration; an unset variable
      # in that position would shift the arguments, so that no log is read
      # and job 1 is always selected.
      n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) {
          $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn";
          undef $logprob; while (<F>) { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } }
          close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob;
          $best_n=$n; } } print "$best_n\n"; ' $this_num_jobs $dir/log/train.$x.%d.log) || exit 1;
      [ -z "$n" ] && echo "Error getting best model" && exit 1;
      $cmd $dir/log/select.$x.log \
        nnet3-am-copy --set-raw-nnet=$dir/$[$x+1].$n.raw  $dir/$x.mdl $dir/$[$x+1].mdl || exit 1;
    fi

    rm $nnets_list
    [ ! -f $dir/$[$x+1].mdl ] && exit 1;
    # Remove the model from two iterations back, except every 100th model
    # and the models needed for the final combination.
    if [ -f $dir/$[$x-1].mdl ] && $cleanup && \
       [ $[($x-1)%100] -ne 0  ] && [ $[$x-1] -lt $first_model_combine ]; then
      rm $dir/$[$x-1].mdl
    fi
  fi
  x=$[$x+1]
  num_archives_processed=$[$num_archives_processed+$this_num_jobs]
done


# Combine the last num_iters_combine models into $dir/combined.mdl and
# compute diagnostics for the combined model.
if [ $stage -le $num_iters ]; then
  echo "Doing final combination to produce final.mdl"

  # Now do combination.  In the nnet3 setup, the logic
  # for doing averaging of subsets of the models in the case where
  # there are too many models to reliably estimate interpolation
  # factors (max_models_combine) is moved into the nnet3-combine
  # program.
  # Build one "rxfilename" per model, from iteration first_model_combine
  # onwards; every one of them must still exist.
  nnets_list=()
  for n in $(seq 0 $[num_iters_combine-1]); do
    iter=$[$first_model_combine+$n]
    mdl=$dir/$iter.mdl
    [ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1;
    nnets_list[$n]="nnet3-am-copy --raw=true $mdl -|";
  done

  # NOTE(review): an earlier comment here described passing --use-gpu=no and
  # a thread count to the combination program; the command below passes
  # neither.  The only GPU-related setting is the queue option
  # $combine_queue_opt.

  # The combined raw nnet is written into a copy of $dir/$num_iters.mdl,
  # giving $dir/combined.mdl.
  $cmd $combine_queue_opt $dir/log/combine.log \
    nnet3-combine --num-iters=40 \
       --enforce-sum-to-one=true --enforce-positive-weights=true \
       --verbose=3 "${nnets_list[@]}" "ark:nnet3-merge-egs --minibatch-size=1024 ark:$cur_egs_dir/combine.egs ark:-|" \
    "|nnet3-am-copy --set-raw-nnet=- $dir/$num_iters.mdl $dir/combined.mdl" || exit 1;

  # Compute the probability of the final, combined model with
  # the same subset we used for the previous compute_probs, as the
  # different subsets will lead to different probs.
  # Both jobs are started in the background; nothing later in this script
  # explicitly waits for them.
  $cmd $dir/log/compute_prob_valid.final.log \
    nnet3-compute-prob "nnet3-am-copy --raw=true $dir/combined.mdl -|" \
    "ark:nnet3-merge-egs ark:$cur_egs_dir/valid_diagnostic.egs ark:- |" &
  $cmd $dir/log/compute_prob_train.final.log \
    nnet3-compute-prob  "nnet3-am-copy --raw=true $dir/combined.mdl -|" \
    "ark:nnet3-merge-egs ark:$cur_egs_dir/train_diagnostic.egs ark:- |" &
fi

# Compute the average posterior of the combined model on a subset of the
# egs and use it to adjust the priors, writing $dir/final.mdl.
# At this point $x equals $num_iters if the training loop ran to completion,
# so the post.$x.* files are named after the final iteration.
if [ $stage -le $[$num_iters+1] ]; then
  echo "Getting average posterior for purposes of adjusting the priors."
  # GPU or CPU use for this step is selected by $prior_gpu_opt and
  # $prior_queue_opt; a smallish subset of data is used.
  # If there are fewer archives than jobs, every job reads archive 1;
  # otherwise job JOB reads archive JOB.
  if [ $num_jobs_compute_prior -gt $num_archives ]; then egs_part=1;
  else egs_part=JOB; fi
  rm $dir/post.$x.*.vec 2>/dev/null
  $cmd JOB=1:$num_jobs_compute_prior $prior_queue_opt $dir/log/get_post.$x.JOB.log \
    nnet3-copy-egs --frame=random $context_opts --srand=JOB ark:$cur_egs_dir/egs.$egs_part.ark ark:- \| \
    nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \
    nnet3-merge-egs ark:- ark:- \| \
    nnet3-compute-from-egs $prior_gpu_opt --apply-exp=true \
      "nnet3-am-copy --raw=true $dir/combined.mdl -|" ark:- ark:- \| \
    matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1;

  sleep 3;  # make sure there is time for $dir/post.$x.*.vec to appear.

  # Sum the per-job posterior vectors, then delete the per-job files.
  $cmd $dir/log/vector_sum.$x.log \
   vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1;

  rm $dir/post.$x.*.vec;

  echo "Re-adjusting priors based on computed posteriors"
  $cmd $dir/log/adjust_priors.final.log \
    nnet3-am-adjust-priors $dir/combined.mdl $dir/post.$x.vec $dir/final.mdl || exit 1;
fi


# Training only counts as successful if final.mdl was produced.
if [ ! -f $dir/final.mdl ]; then
  echo "$0: $dir/final.mdl does not exist."
  # we don't want to clean up if the training didn't succeed.
  exit 1;
fi

sleep 2

echo Done

if $cleanup; then
  echo Cleaning up data
  # Only remove egs whose path matches $dir/egs*.
  if $remove_egs && [[ $cur_egs_dir =~ $dir/egs* ]]; then
    steps/nnet2/remove_egs.sh $cur_egs_dir
  fi

  echo Removing most of the models
  for x in `seq 0 $num_iters`; do
    if [ $[$x%100] -ne 0 ] && [ $x -ne $num_iters ] && [ -f $dir/$x.mdl ]; then
       # delete all but every 100th model and the last numbered model.
       # NOTE(review): this also deletes the models that were combined to
       # form combined.mdl, so the final combination cannot be rerun after
       # cleanup.
      rm $dir/$x.mdl
    fi
  done
fi

# Print a summary of the experiment directory.
steps/info/nnet3_dir_info.pl $dir

exit 0


================================================
FILE: egs/steps/nnet3/xconfig_to_config.py
================================================
#!/usr/bin/env python3

# Copyright 2016-2018    Johns Hopkins University (Dan Povey)
#           2016    Vijayaditya Peddinti
#           2017    Google Inc. (vpeddinti@google.com)
# Apache 2.0.

# This is like xconfig_to_configs.py but with a simpler interface; it writes
# to a single named file.


import argparse
import os
import sys
from collections import defaultdict

sys.path.insert(0, 'steps/')
# the following is in case we weren't running this from the normal directory.
sys.path.insert(0, os.path.realpath(os.path.dirname(sys.argv[0])) + '/')

import libs.nnet3.xconfig.parser as xparser
import libs.common as common_lib


def get_args():
    """Parse the command-line options of this script.

    The full command line is echoed to stderr before parsing, so that it
    shows up in the logs even when argument parsing fails.

    Returns:
        The argparse namespace with the attributes xconfig_file,
        existing_model, config_file_out and nnet_edits.
    """
    # we add compulsory arguments as named arguments for readability
    parser = argparse.ArgumentParser(
        description="Reads an xconfig file and creates config files "
                    "for neural net creation and training",
        epilog='Search egs/*/*/local/{nnet3,chain}/*sh for examples')
    parser.add_argument('--xconfig-file', required=True,
                        help='Filename of input xconfig file')
    # NOTE: adjacent string literals are concatenated verbatim, so every
    # sentence below must carry its own trailing space.
    parser.add_argument('--existing-model',
                        help='Filename of previously trained neural net '
                             '(e.g. final.mdl) which is useful in case of '
                             'using nodes from list of component-nodes in '
                             'already trained model '
                             'to generate new config file for new model. '
                             'The context info is also generated using '
                             'a model generated by adding final.config '
                             'to the existing model. '
                             'e.g. In Transfer learning: generate new model using '
                             'component nodes in existing model.')
    parser.add_argument('--config-file-out', required=True,
                        help='Filename to write nnet config file.')
    parser.add_argument('--nnet-edits', type=str, default=None,
                        action=common_lib.NullstrToNoneAction,
                        help="""This option is useful in case the network you
                        are creating does not have an output node called
                        'output' (e.g. for multilingual setups).  You can set
                        this to an edit-string like: 'rename-node old-name=xxx
                        new-name=output' if node xxx plays the role of the
                        output node in this network.  This is only used for
                        computing the left/right context.""")

    print(' '.join(sys.argv), file=sys.stderr)

    args = parser.parse_args()

    return args


def write_config_file(config_file_out, all_layers):
    """Write the 'final' config lines of all layers to a single file.

    Args:
        config_file_out: name of the config file to create (overwritten if
            it already exists).
        all_layers: iterable of layer objects; each must provide
            get_full_config(), returning (config_basename, line) pairs.

    Raises:
        Whatever exception a layer's get_full_config() raises; it is
        re-raised after a diagnostic naming the offending layer has been
        printed to stderr.
    """
    # config_basename_to_lines is map from the basename of the
    # config, as a string (i.e. 'ref', 'all', 'init') to a list of
    # strings representing lines to put in the config file.
    config_basename_to_lines = defaultdict(list)

    for layer in all_layers:
        try:
            for config_basename, line in layer.get_full_config():
                config_basename_to_lines[config_basename].append(line)
        except Exception as e:
            print("{0}: error producing config lines from xconfig "
                  "line '{1}': error was: {2}".format(sys.argv[0],
                                                      str(layer), repr(e)),
                  file=sys.stderr)
            # we use raise rather than raise(e) as using a blank raise
            # preserves the backtrace
            raise

    with open(config_file_out, 'w') as f:
        # Record the command line as it was typed (space-joined), not as
        # the repr of the argv list.
        print('# This file was created by the command:\n'
              '# {0}'.format(' '.join(sys.argv)), file=f)
        # Only the lines destined for the 'final' config are written.
        for line in config_basename_to_lines['final']:
            print(line, file=f)


def main():
    """Read the xconfig file and write the resulting single config file.

    If --existing-model was given, its component info is passed to the
    xconfig reader as the list of already existing layers.
    """
    args = get_args()
    if args.existing_model is None:
        existing_layers = []
    else:
        existing_layers = xparser.get_model_component_info(args.existing_model)
    write_config_file(
        args.config_file_out,
        xparser.read_xconfig_file(args.xconfig_file, existing_layers))

if __name__ == '__main__':
    main()


# test:
# (echo 'input dim=40 name=input'; echo 'output name=output input=Append(-1,0,1)')  >xconfig; steps/nnet3/xconfig_to_config.py --xconfig-file=xconfig --config-file-out=foo


================================================
FILE: egs/steps/nnet3/xconfig_to_configs.py
================================================
#!/usr/bin/env python

# Copyright 2016    Johns Hopkins University (Dan Povey)
#           2016    Vijayaditya Peddinti
#           2017    Google Inc. (vpeddinti@google.com)
# Apache 2.0.

# we're using python 3.x style print but want it to work in python 2.x,
from __future__ import print_function
import argparse
import os
import sys
from collections import defaultdict

sys.path.insert(0, 'steps/')
# the following is in case we weren't running this from the normal directory.
sys.path.insert(0, os.path.realpath(os.path.dirname(sys.argv[0])) + '/')

import libs.nnet3.xconfig.parser as xparser
import libs.common as common_lib


def get_args():
    """Parse and check the command-line options of this script.

    The full command line is echoed to stderr before parsing, so that it
    shows up in the logs even when argument parsing fails.

    Returns:
        The argparse namespace (attributes xconfig_file, existing_model,
        config_dir and nnet_edits) after it has been passed through
        check_args().
    """
    # we add compulsory arguments as named arguments for readability
    parser = argparse.ArgumentParser(
        description="Reads an xconfig file and creates config files "
                    "for neural net creation and training",
        epilog='Search egs/*/*/local/{nnet3,chain}/*sh for examples')
    parser.add_argument('--xconfig-file', required=True,
                        help='Filename of input xconfig file')
    # NOTE: adjacent string literals are concatenated verbatim, so every
    # sentence below must carry its own trailing space.
    parser.add_argument('--existing-model',
                        help='Filename of previously trained neural net '
                             '(e.g. final.mdl) which is useful in case of '
                             'using nodes from list of component-nodes in '
                             'already trained model '
                             'to generate new config file for new model. '
                             'The context info is also generated using '
                             'a model generated by adding final.config '
                             'to the existing model. '
                             'e.g. In Transfer learning: generate new model using '
                             'component nodes in existing model.')
    parser.add_argument('--config-dir', required=True,
                        help='Directory to write config files and variables')
    parser.add_argument('--nnet-edits', type=str, default=None,
                        action=common_lib.NullstrToNoneAction,
                        help="""This option is useful in case the network you
                        are creating does not have an output node called
                        'output' (e.g. for multilingual setups).  You can set
                        this to an edit-string like: 'rename-node old-name=xxx
                        new-name=output' if node xxx plays the role of the
                        output node in this network.  This is only used for
                        computing the left/right context.""")

    print(' '.join(sys.argv), file=sys.stderr)

    args = parser.parse_args()
    args = check_args(args)

    return args


def check_args(args):
    """Make sure the output directory args.config_dir exists.

    The directory (including missing parents) is created when it is not
    already there.  The args object is returned unchanged.
    """
    config_dir = args.config_dir
    if os.path.exists(config_dir):
        return args
    os.makedirs(config_dir)
    return args


def backup_xconfig_file(xconfig_file, config_dir):
    """Write a copy of the xconfig file to config_dir/xconfig.

    The copy exists just to have a record of the original input.  It starts
    with a comment header naming the command line, followed by every input
    line with leading and trailing whitespace stripped.

    Args:
        xconfig_file: name of the xconfig file to read.
        config_dir: existing directory in which 'xconfig' is written.

    Raises:
        Exception: if the input file cannot be opened for reading or the
            output file cannot be opened for writing.
    """
    try:
        xconfig_file_in = open(xconfig_file)
    except (IOError, OSError):
        # report the file we failed to read (not the output directory)
        raise Exception('{0}: error opening file {1} for input'
                        ''.format(sys.argv[0], xconfig_file))

    # the 'with' blocks guarantee both handles are closed on any error.
    with xconfig_file_in:
        try:
            xconfig_file_out = open(config_dir + '/xconfig', 'w')
        except (IOError, OSError):
            raise Exception('{0}: error opening file '
                            '{1}/xconfig for output'.format(
                                sys.argv[0], config_dir))

        with xconfig_file_out:
            print("# This file was created by the command:\n"
                  "# {0}\n"
                  "# It is a copy of the source from which the config files in\n"
                  "# this directory were generated.\n".format(
                      ' '.join(sys.argv)),
                  file=xconfig_file_out)

            for line in xconfig_file_in:
                print(line.strip(), file=xconfig_file_out)


def write_expanded_xconfig_files(config_dir, all_layers):
    """Write config_dir/xconfig.expanded.1 and config_dir/xconfig.expanded.2.

    These files show some of the internal stages of processing the xconfig
    file before it is turned into config files: the first contains the
    string form of every layer as given, the second the string form after
    normalize_descriptors() has been called on each layer.

    Args:
        config_dir: existing directory in which the files are written.
        all_layers: sequence of layer objects (iterated twice); each layer
            is modified in place by the normalize_descriptors() call.

    Raises:
        Exception: if either output file cannot be opened for writing.
    """
    try:
        xconfig_file_out = open(config_dir + '/xconfig.expanded.1', 'w')
    except (IOError, OSError):
        raise Exception('{0}: error opening file '
                        '{1}/xconfig.expanded.1 for output'.format(
                            sys.argv[0], config_dir))

    # 'with' closes the file even if formatting a layer raises.
    with xconfig_file_out:
        print('# This file was created by the command:\n'
              '# ' + ' '.join(sys.argv) + '\n'
              '#It contains the same content as ./xconfig but it was parsed and\n'
              '#default config values were set.\n'
              '# See also ./xconfig.expanded.2\n', file=xconfig_file_out)

        for layer in all_layers:
            print('{}'.format(layer), file=xconfig_file_out)

    try:
        xconfig_file_out = open(config_dir + '/xconfig.expanded.2', 'w')
    except (IOError, OSError):
        raise Exception('{0}: error opening file '
                        '{1}/xconfig.expanded.2 for output'.format(
                            sys.argv[0], config_dir))

    with xconfig_file_out:
        print('# This file was created by the command:\n'
              '# ' + ' '.join(sys.argv) + '\n'
              '# It contains the same content as ./xconfig but it was parsed,\n'
              '# default config values were set, \n'
              '# and Descriptors (input=xxx) were normalized.\n'
              '# See also ./xconfig.expanded.1\n',
              file=xconfig_file_out)

        for layer in all_layers:
            # normalization happens only now, so that xconfig.expanded.1
            # above shows the layers before this step.
            layer.normalize_descriptors()
            print('{}'.format(layer), file=xconfig_file_out)


def get_config_headers():
    """Return the documentation headers of the generated config files.

    The result maps a config-file basename ('init', 'ref' or 'final') to
    the comment block that goes at the top of that file.  Every header
    starts by naming the command line that created the file.
    """
    created_by = ('# This file was created by the command:\n'
                  '# ' + ' '.join(sys.argv) + '\n')

    # A defaultdict(str) is used so that any config file not explicitly
    # listed below gets the empty string as its header.
    headers = defaultdict(str)
    headers['init'] = created_by + (
        '# It contains the input of the network and is used in\n'
        '# accumulating stats for an LDA-like transform of the\n'
        '# input features.\n')
    headers['ref'] = created_by + (
        '# It contains the entire neural network, but with those\n'
        '# components that would normally require fixed vectors/matrices\n'
        '# read from disk, replaced with random initialization\n'
        '# (this applies to the LDA-like transform and the\n'
        '# presoftmax-prior-scale, if applicable).  This file\n'
        '# is used only to work out the left-context and right-context\n'
        '# of the network.\n')
    headers['final'] = created_by + (
        '# It contains the entire neural network.\n')
    return headers


# This is where most of the work of this program happens.
def write_config_files(config_dir, all_layers):
    """Write config_dir/<basename>.config for every config the layers produce.

    This is where most of the work of this program happens.

    Args:
        config_dir: existing directory in which the config files are written.
        all_layers: iterable of layer objects; each must provide
            get_full_config(), returning (config_basename, line) pairs.

    Raises:
        RuntimeError: if a config other than 'init' contains no line that
            starts with 'output-node'.
        Any exception raised by a layer's get_full_config() or while writing
        a file is re-raised after a diagnostic has been printed to stderr.
    """
    # config_basename_to_lines is map from the basename of the
    # config, as a string (i.e. 'ref', 'all', 'init') to a list of
    # strings representing lines to put in the config file.
    config_basename_to_lines = defaultdict(list)

    config_basename_to_header = get_config_headers()

    for layer in all_layers:
        try:
            pairs = layer.get_full_config()
            for config_basename, line in pairs:
                config_basename_to_lines[config_basename].append(line)
        except Exception as e:
            print("{0}: error producing config lines from xconfig "
                  "line '{1}': error was: {2}".format(sys.argv[0],
                                                      str(layer), repr(e)),
                  file=sys.stderr)
            # we use raise rather than raise(e) as using a blank raise
            # preserves the backtrace
            raise

    # remove previous init.config, since a new one is not necessarily
    # written below.
    try:
        os.remove(config_dir + '/init.config')
    except OSError:
        pass

    for basename, lines in config_basename_to_lines.items():
        # count the lines that start with 'output-node':
        num_output_node_lines = sum(1 for line in lines
                                    if line.startswith('output-node'))
        if num_output_node_lines == 0:
            if basename == 'init':
                continue  # do not write the init.config
            # There is no active exception here, so a bare 'raise' would only
            # produce an unrelated "no active exception" error; raise an
            # explicit error carrying the diagnostic instead.
            raise RuntimeError(
                "{0}: error in xconfig file: no 'output-node' lines were "
                "produced for {1}.config; may be lack of an output "
                "layer".format(sys.argv[0], basename))

        header = config_basename_to_header[basename]
        filename = '{0}/{1}.config'.format(config_dir, basename)
        try:
            with open(filename, 'w') as f:
                print(header, file=f)
                for line in lines:
                    print(line, file=f)
        except Exception as e:
            print('{0}: error writing to config file {1}: error is {2}'
                  ''.format(sys.argv[0], filename, repr(e)), file=sys.stderr)
            # we use raise rather than raise(e) as using a blank raise
            # preserves the backtrace
            raise


def add_nnet_context_info(config_dir, nnet_edits=None,
                          existing_model=None):
    """Create the 'vars' file that specifies model_left_context, etc.

    Runs nnet3-init on config_dir/ref.config (on top of existing_model, if
    given) to create config_dir/ref.raw, then reads the left and right
    context from the output of nnet3-info and writes them to config_dir/vars.

    Args:
        config_dir: directory that contains ref.config; ref.raw and vars
            are written there.
        nnet_edits: optional edit-string; if given, it is applied with
            nnet3-copy --edits before nnet3-info is run.
        existing_model: optional model filename, passed as the first
            argument to nnet3-init.

    Raises:
        KeyError: if 'left-context' or 'right-context' is not found in the
            first four lines of the nnet3-info output.
    """

    common_lib.execute_command("nnet3-init {0} {1}/ref.config "
                               "{1}/ref.raw"
                               "".format(existing_model if
                                         existing_model is not None else "",
                                         config_dir))
    model = "{0}/ref.raw".format(config_dir)
    if nnet_edits is not None:
        model = "nnet3-copy --edits='{0}' {1} - |".format(nnet_edits,
                                                          model)
    out = common_lib.get_command_stdout('nnet3-info "{0}"'.format(model))
    # out looks like this
    # left-context: 7
    # right-context: 0
    # num-parameters: 90543902
    # modulus: 1
    # ...
    info = {}
    for line in out.split("\n")[:4]: # take 4 initial lines,
        parts = line.split(":")
        if len(parts) != 2:
            continue
        info[parts[0].strip()] = int(parts[1].strip())

    # Look the values up before opening the file, so that a missing key does
    # not leave a truncated 'vars' file behind.
    left_context = info['left-context']
    right_context = info['right-context']

    # Writing the 'vars' file:
    #   model_left_context=7
    #   model_right_context=0
    with open('{0}/vars'.format(config_dir), 'w') as vf:
        vf.write('model_left_context={0}\n'.format(left_context))
        vf.write('model_right_context={0}\n'.format(right_context))

def check_model_contexts(config_dir, nnet_edits=None, existing_model=None):
    """Check that init.config needs no more context than ref.config.

    For each of config_dir/init.config and config_dir/ref.config that
    exists, the raw model is created with nnet3-init and its left and right
    context are read from the output of nnet3-info.

    Args:
        config_dir: directory containing the config files; the .raw models
            are written there.
        nnet_edits: optional edit-string, applied with nnet3-copy --edits
            before nnet3-info is run (never for the 'init' model).
        existing_model: optional model filename, passed as the first
            argument to nnet3-init.

    Raises:
        Exception: if the 'init' model requires a greater left or right
            context than the 'ref' model.
    """
    contexts = {}
    for file_name in ['init', 'ref']:
        if os.path.exists('{0}/{1}.config'.format(config_dir, file_name)):
            contexts[file_name] = {}
            common_lib.execute_command("nnet3-init {0} {1}/{2}.config "
                                       "{1}/{2}.raw"
                                       "".format(existing_model if
                                                 existing_model is not
                                                 None else '',
                                                 config_dir, file_name))
            model = "{0}/{1}.raw".format(config_dir, file_name)
            if nnet_edits is not None and file_name != 'init':
                model = "nnet3-copy --edits='{0}' {1} - |".format(nnet_edits,
                                                                  model)
            out = common_lib.get_command_stdout('nnet3-info "{0}"'.format(model))
            # out looks like this
            # left-context: 7
            # right-context: 0
            # num-parameters: 90543902
            # modulus: 1
            # ...
            for line in out.split("\n")[:4]: # take 4 initial lines,
                parts = line.split(":")
                if len(parts) != 2:
                    continue
                key = parts[0].strip()
                value = int(parts[1].strip())
                if key in ['left-context', 'right-context']:
                    contexts[file_name][key] = value

    if 'init' in contexts:
        assert('ref' in contexts)
        init_context = contexts['init']
        ref_context = contexts['ref']
        # Compare each direction only when both models reported it.
        for key in ['left-context', 'right-context']:
            if (key in init_context and key in ref_context
                    and init_context[key] > ref_context[key]):
                raise Exception(
                    "Model specified in {0}/init.config requires greater"
                    " context than the model specified in {0}/ref.config."
                    " This might be due to use of label-delay at the output"
                    " in ref.config. Please use delay=$label_delay in the"
                    " initial fixed-affine-layer of the network, to avoid"
                    " this issue.".format(config_dir))


def main():
    """Run every stage of the xconfig-to-configs conversion.

    In order: back up the xconfig file, parse it (using the component info
    of --existing-model, if given), write the expanded xconfig files and the
    config files, check the model contexts and finally write the 'vars' file.
    """
    args = get_args()
    config_dir = args.config_dir
    backup_xconfig_file(args.xconfig_file, config_dir)

    if args.existing_model is None:
        existing_layers = []
    else:
        existing_layers = xparser.get_model_component_info(args.existing_model)
    all_layers = xparser.read_xconfig_file(args.xconfig_file, existing_layers)

    write_expanded_xconfig_files(config_dir, all_layers)
    write_config_files(config_dir, all_layers)

    # both context-related steps take the same arguments.
    for step in (check_model_contexts, add_nnet_context_info):
        step(config_dir, args.nnet_edits, existing_model=args.existing_model)


if __name__ == '__main__':
    main()


# test (note: --xconfig-file and --config-dir are both required options):
# mkdir -p foo; (echo 'input dim=40 name=input'; echo 'output name=output input=Append(-1,0,1)')  >xconfig; ./xconfig_to_configs.py --xconfig-file=xconfig --config-dir=foo
#  mkdir -p foo; (echo 'input dim=40 name=input'; echo 'output-layer name=output dim=1924 input=Append(-1,0,1)')  >xconfig; ./xconfig_to_configs.py --xconfig-file=xconfig --config-dir=foo

# mkdir -p foo; (echo 'input dim=40 name=input'; echo 'relu-renorm-layer name=affine1 dim=1024'; echo 'output-layer name=output dim=1924 input=Append(-1,0,1)')  >xconfig; ./xconfig_to_configs.py --xconfig-file=xconfig --config-dir=foo

# mkdir -p foo; (echo 'input dim=100 name=ivector'; echo 'input dim=40 name=input'; echo 'fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=foo/bar/lda.mat'; echo 'output-layer name=output dim=1924 input=Append(-1,0,1)')  >xconfig; ./xconfig_to_configs.py --xconfig-file=xconfig --config-dir=foo


================================================
FILE: egs/steps/online/decode.sh
================================================
#!/usr/bin/env bash

# Copyright 2014  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0

# This script decodes the data in <data-dir> with online2-wav-gmm-latgen-faster,
# reading the audio from wav.scp, using the graph in <graph-dir> and the
# config in <decode-dir>/../conf/online_decoding.conf.  It writes the lattices
# to <decode-dir>/lat.JOB.gz and, unless --skip-scoring is true, scores them
# with local/score.sh.

# Begin configuration section.
stage=0
nj=4
cmd=run.pl
max_active=7000
beam=13.0
lattice_beam=6.0
acwt=0.083333 # note: only really affects adaptation and pruning (scoring is on
              # lattices).
per_utt=false
do_endpointing=false
do_speex_compressing=false
scoring_opts=
skip_scoring=false
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

if [ $# != 3 ]; then
   echo "Usage: $0 [options] <graph-dir> <data-dir> <decode-dir>"
   echo "... where <decode-dir> is assumed to be a sub-directory of the directory"
   echo " where the models are, as prepared by steps/online/prepare_online_decoding.sh"
   echo "e.g.: $0 exp/tri3b/graph data/test exp/tri3b_online/decode/"
   echo ""
   echo ""
   echo "main options (for others, see top of script file)"
   echo "  --config <config-file>                           # config containing options"
   echo "  --nj <nj>                                        # number of parallel jobs"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   echo "  --acwt <float>                                   # acoustic scale used for lattice generation "
   echo "  --per-utt <true|false>                           # If true, decode per utterance without"
   echo "                                                   # carrying forward adaptation info from previous"
   echo "                                                   # utterances of each speaker."
   echo "  --scoring-opts <string>                          # options to local/score.sh"
   exit 1;
fi


graphdir=$1
data=$2
dir=$3
srcdir=`dirname $dir`; # The model directory is one level up from decoding directory.
sdata=$data/split$nj;

mkdir -p $dir/log
# Split the data into $nj parts, unless a split directory exists already and
# feats.scp is older than it.
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs

# Check that all the required input files exist.
for f in $srcdir/conf/online_decoding.conf $graphdir/HCLG.fst $graphdir/words.txt $data/wav.scp; do
  if [ ! -f $f ]; then
    echo "$0: no such file $f"
    exit 1;
  fi
done

# Choose what is passed as the spk2utt rspecifier: the real spk2utt of each
# split or, with --per-utt true, a map in which every utterance is listed as
# its own speaker (each line of utt2spk.JOB is "<utt> <utt>").
if ! $per_utt; then
  spk2utt_rspecifier="ark:$sdata/JOB/spk2utt"
else
  mkdir -p $dir/per_utt
  for j in $(seq $nj); do
    awk '{print $1, $1}' <$sdata/$j/utt2spk >$dir/per_utt/utt2spk.$j || exit 1;
  done
  spk2utt_rspecifier="ark:$dir/per_utt/utt2spk.JOB"
fi

# Build the wav rspecifier; depending on the options the audio is first piped
# through compress-uncompress-speex and/or extend-wav-with-silence.
if $do_endpointing; then
  if $do_speex_compressing; then
    wav_rspecifier="ark:compress-uncompress-speex scp:$sdata/JOB/wav.scp ark:-|extend-wav-with-silence ark:- ark:-|"
  else
    wav_rspecifier="ark:extend-wav-with-silence scp:$sdata/JOB/wav.scp ark:-|"
  fi
else
  if $do_speex_compressing; then
    wav_rspecifier="ark:compress-uncompress-speex scp:$sdata/JOB/wav.scp ark:-|"
  else
    wav_rspecifier=scp:$sdata/JOB/wav.scp
  fi
fi

# Stage 0: generate the lattices, with one job per data split.
if [ $stage -le 0 ]; then
  $cmd JOB=1:$nj $dir/log/decode.JOB.log \
    online2-wav-gmm-latgen-faster --do-endpointing=$do_endpointing \
     --config=$srcdir/conf/online_decoding.conf \
     --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \
     --acoustic-scale=$acwt --word-symbol-table=$graphdir/words.txt \
     $graphdir/HCLG.fst $spk2utt_rspecifier "$wav_rspecifier" \
      "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
fi

# Scoring; note that a missing or non-executable local/score.sh makes the
# script exit with status 1.
if ! $skip_scoring ; then
  [ ! -x local/score.sh ] && \
    echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
  local/score.sh --cmd "$cmd" $scoring_opts $data $graphdir $dir
fi

exit 0;


================================================
FILE: egs/steps/online/nnet2/align.sh
================================================
#!/usr/bin/env bash
# Copyright      2012  Brno University of Technology (Author: Karel Vesely)
#           2013-2014  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0

# Computes training alignments using DNN.  This takes as input a directory
# prepared as for online-nnet2 decoding (e.g. by
# steps/online/nnet2/prepare_online_decoding.sh), and it computes the features
# directly from the wav.scp instead of relying on features dumped on disk;
# this avoids the hassle of having to dump suitably matched features.
# The alignments are written to <align-dir>/ali.JOB.gz.


# Begin configuration section.  
nj=4
cmd=run.pl
# Begin configuration.
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
beam=10
retry_beam=40
iter=final
use_gpu=no

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh # source the path.
. parse_options.sh || exit 1;

if [ $# != 4 ]; then
   echo "Usage: $0 <data-dir> <lang-dir> <src-dir> <align-dir>"
   echo "e.g.: $0 data/train data/lang exp/nnet4 exp/nnet4_ali"
   echo "main options (for others, see top of script file)"
   echo "  --config <config-file>                           # config containing options"
   echo "  --nj <nj>                                        # number of parallel jobs"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   exit 1;
fi

data=$1
lang=$2
srcdir=$3
dir=$4

# $oov is passed to utils/sym2int.pl --map-oov when the transcripts are
# converted to integers below.
oov=`cat $lang/oov.int` || exit 1;
mkdir -p $dir/log
echo $nj > $dir/num_jobs
sdata=$data/split$nj
# Split the data into $nj parts, unless a split directory exists already and
# feats.scp is older than it.
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;


# Check that all the required input files exist.
for f in $srcdir/tree $srcdir/${iter}.mdl $data/wav.scp $lang/L.fst \
      $srcdir/conf/online_nnet2_decoding.conf; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;
cp $srcdir/{tree,${iter}.mdl} $dir || exit 1;

# The config used for feature extraction is the online decoding config
# without the lines that start with --endpoint.
grep -v '^--endpoint' $srcdir/conf/online_nnet2_decoding.conf >$dir/feature.conf || exit 1;


if [ -f $data/segments ]; then
  # note: in the feature extraction, because the program online2-wav-dump-features is sensitive to the
  # previous utterances within a speaker, we do the filtering after extracting the features.
  echo "$0 [info]: segments file exists: using that."
  feats="ark,s,cs:extract-segments scp:$sdata/JOB/wav.scp $sdata/JOB/segments ark:- | online2-wav-dump-features --config=$dir/feature.conf ark:$sdata/JOB/spk2utt ark,s,cs:- ark:- |"
else
  echo "$0 [info]: no segments file exists, using wav.scp."
  feats="ark,s,cs:online2-wav-dump-features --config=$dir/feature.conf ark:$sdata/JOB/spk2utt scp:$sdata/JOB/wav.scp ark:- |"
fi

echo "$0: aligning data in $data using model from $srcdir, putting alignments in $dir"

# Transcripts with the words (fields 2 onwards) mapped to integers using
# words.txt.
tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|";

# Compile the training graphs and pipe them straight into the aligner; the
# alignments are written gzipped, one archive per job.
$cmd JOB=1:$nj $dir/log/align.JOB.log \
  compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $srcdir/${iter}.mdl  $lang/L.fst "$tra" ark:- \| \
  nnet-align-compiled $scale_opts --use-gpu=$use_gpu --beam=$beam --retry-beam=$retry_beam \
    $srcdir/${iter}.mdl ark:- "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;

echo "$0: done aligning data."


================================================
FILE: egs/steps/online/nnet2/copy_data_dir.sh
================================================
#!/usr/bin/env bash

# Copyright 2013-2014  Johns Hopkins University (author: Daniel Povey)
# Apache 2.0

# Warning, this script is deprecated, please use utils/data/modify_speaker_info.sh

# This script is as utils/copy_data_dir.sh in that it copies a data-dir,
# but it supports the --utts-per-spk-max option.  If nonzero, it modifies
# the utt2spk and spk2utt files by splitting each speaker into multiple
# versions, so that each speaker has no more than --utts-per-spk-max
# utterances.

# begin configuration section
utts_per_spk_max=-1   # -1 is the default and means "no maximum".
# end configuration section

. utils/parse_options.sh

# Exactly two positional arguments are expected; otherwise show the usage.
if [ $# != 2 ]; then
  printf '%s\n' \
    "Usage: " \
    "  $0 [options] <srcdir> <destdir>" \
    "e.g.:" \
    " $0 --utts-per-spk-max 2 data/train data/train-max2" \
    "Options" \
    "   --utts-per-spk-max <n>  # number of utterances per speaker maximum," \
    "                           # default -1 (meaning no maximum).  E.g. 2."
  exit 1
fi

echo "$0: this script is deprecated, please use utils/data/modify_speaker_info.sh."

export LC_ALL=C

srcdir=$1
destdir=$2

# utt2spk is the one file the source directory must contain.
[ ! -f $srcdir/utt2spk ] && { echo "$0: no such file $srcdir/utt2spk"; exit 1; }

# From here on, a failing command (also inside a pipeline) aborts the script.
set -e -o pipefail

mkdir -p $destdir


# Decide whether a per-speaker limit was requested.  Only a value greater than
# zero is a usable limit; -1 (the default) and any other non-positive value
# mean "no maximum".  Without this test a value such as 0 would reach the awk
# program below with a chunk size of zero, and its while-loop would never end.
if awk -v max="$utts_per_spk_max" 'BEGIN { exit !(max > 0) }'; then
  # create spk2utt file with reduced number of utterances per speaker.
  # Each input line is "<spk> <utt1> <utt2> ...".  It is cut into consecutive
  # chunks and every chunk is printed as a new speaker whose name is the
  # original speaker-id followed by "-" and a 6-digit hex counter.
  # A fractional max is supported: the chunk size is int(max), plus one with
  # probability equal to the fractional part.  The chunk size is clamped to at
  # least 1 so that the scan over the fields always advances.
  awk -v max="$utts_per_spk_max" '{ n=2; count=0;
    while(n<=NF) {
      int_max=int(max)+ (rand() < (max-int(max))?1:0);
      if (int_max < 1) int_max = 1;
      nmax=n+int_max; count++; printf("%s-%06x", $1, count);
      for (;n<nmax&&n<=NF; n++) printf(" %s", $n); print "";} }' \
   <$srcdir/spk2utt >$destdir/spk2utt
  utils/spk2utt_to_utt2spk.pl <$destdir/spk2utt >$destdir/utt2spk

  if [ -f $srcdir/cmvn.scp ]; then
    # below, the first apply_map command outputs a cmvn.scp indexed by utt;
    # the second one outputs a cmvn.scp indexed by new speaker-id.
    utils/apply_map.pl -f 2 $srcdir/cmvn.scp <$srcdir/utt2spk | \
      utils/apply_map.pl -f 1 $destdir/utt2spk | sort | uniq > $destdir/cmvn.scp
    echo "$0: mapping cmvn.scp, but you may want to recompute it if it's needed,"
    echo " as it would probably change."
  fi
  if [ -f $srcdir/spk2gender ]; then
    # Same two-step mapping as for cmvn.scp: old speaker -> utterance -> new
    # speaker, with duplicates removed.
    utils/apply_map.pl -f 2 $srcdir/spk2gender <$srcdir/utt2spk | \
      utils/apply_map.pl -f 1 $destdir/utt2spk | sort | uniq >$destdir/spk2gender
  fi
else
  # No limit: the speaker information is copied unchanged.  The optional files
  # are copied with if-statements so that a missing file can never leave a
  # non-zero status behind while "set -e" is active.
  cp $srcdir/spk2utt $srcdir/utt2spk $destdir/
  if [ -f $srcdir/spk2gender ]; then cp $srcdir/spk2gender $destdir/; fi
  if [ -f $srcdir/cmvn.scp ]; then cp $srcdir/cmvn.scp $destdir/; fi
fi


# Copy whichever of the optional data files exist in the source directory.
for name in feats.scp segments wav.scp reco2file_and_channel text stm glm ctm; do
  if [ -f $srcdir/$name ]; then
    cp $srcdir/$name $destdir/
  fi
done

echo "$0: copied data from $srcdir to $destdir, with --utts-per-spk-max $utts_per_spk_max"

# Tell the validation script which files are absent from the source directory.
opts=
if [ ! -f $srcdir/feats.scp ]; then
  opts="--no-feats"
fi
if [ ! -f $srcdir/text ]; then
  opts="$opts --no-text"
fi
if [ ! -f $srcdir/wav.scp ]; then
  opts="$opts --no-wav"
fi

utils/validate_data_dir.sh $opts $destdir


================================================
FILE: egs/steps/online/nnet2/copy_ivector_dir.sh
================================================
#!/usr/bin/env bash

# Copyright 2017  Johns Hopkins University (author: Hossein Hadian)
# Apache 2.0

# This script copies the necessary parts of an online ivector directory
# optionally applying a mapping to the ivector_online.scp file

# Optional mapping file, set with --utt2orig.  When given, its lines are fed to
# utils/apply_map.pl, which maps their second field through the source
# ivector_online.scp (presumably "<new-utt> <orig-utt>" per line; verify
# against the caller).
utt2orig=

. utils/parse_options.sh

if [ $# != 2 ]; then
  echo "Usage: "
  echo "  $0 [options] <srcdir> <destdir>"
  echo "e.g.:"
  echo " $0 exp/nnet3/online_ivector_train exp/nnet3/online_ivector_train_fs"
  echo "Options"
  echo "   --utt2orig=<file>     # utterance id mapping to use"
  exit 1;
fi


srcdir=$1
destdir=$2

if [ ! -f "$srcdir/ivector_period" ]; then
  echo "$0: no such file $srcdir/ivector_period"
  exit 1;
fi

if [ "$destdir" == "$srcdir" ]; then
  echo "$0: this script requires <srcdir> and <destdir> to be different."
  exit 1
fi

# Any failing copy below aborts the script.
set -e;

mkdir -p "$destdir"
cp -r "$srcdir"/{conf,ivector_period} "$destdir"
# The variable is quoted so that the test is a real two-argument "-z" test;
# unquoted, an empty value only worked by accident and a value containing
# whitespace made the test itself fail.
if [ -z "$utt2orig" ]; then
  cp "$srcdir/ivector_online.scp" "$destdir"
else
  utils/apply_map.pl -f 2 "$srcdir/ivector_online.scp" < "$utt2orig" > "$destdir/ivector_online.scp"
fi
cp "$srcdir/final.ie.id" "$destdir"

echo "$0: Copied necessary parts of online ivector directory $srcdir to $destdir"


================================================
FILE: egs/steps/online/nnet2/decode.sh
================================================
#!/usr/bin/env bash

# Copyright 2014  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0

# Begin configuration section.
# All of these can be overridden on the command line via parse_options.sh.
stage=0        # decoding is run only when this is <= 0.
nj=4           # number of parallel jobs.
cmd=run.pl     # how to run jobs.
max_active=7000   # passed to the decoder as --max-active.
threaded=false    # if true, use online2-wav-nnet2-latgen-threaded instead of
                  # online2-wav-nnet2-latgen-faster.
modify_ivector_config=false #  only relevant to threaded decoder.
beam=15.0          # passed to the decoder as --beam.
lattice_beam=6.0   # passed to the decoder as --lattice-beam.
acwt=0.1   # note: only really affects adaptation and pruning (scoring is on
           # lattices).
per_utt=false  # if true, the data is split with --per-utt and each utterance
               # is presented to the decoder as its own speaker.
online=true  # only relevant to non-threaded decoder.
do_endpointing=false        # passed as --do-endpointing; if true the wav
                            # pipeline also runs extend-wav-with-silence.
do_speex_compressing=false  # if true the wav pipeline also runs
                            # compress-uncompress-speex.
scoring_opts=        # extra options handed to local/score.sh.
skip_scoring=false   # if true, local/score.sh is not run.
# NOTE: silence weighting is switched on whenever this differs, as a string,
# from "1.0".
silence_weight=1.0  # set this to a value less than 1 (e.g. 0) to enable silence weighting.
max_state_duration=40 # This only has an effect if you are doing silence
  # weighting.  This default is probably reasonable.  transition-ids repeated
  # more than this many times in an alignment are treated as silence.
iter=final   # the model used is <model-dir>/${iter}.mdl.
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

# Exactly three positional arguments are required; otherwise print the usage
# message and exit with status 1.
if [ $# != 3 ]; then
   echo "Usage: $0 [options] <graph-dir> <data-dir> <decode-dir>"
   echo "... where <decode-dir> is assumed to be a sub-directory of the directory"
   echo " where the models are, as prepared by steps/online/nnet2/prepare_online_decoding.sh"
   echo "e.g.: $0 exp/tri3b/graph data/test exp/tri3b_online/decode/"
   echo ""
   echo ""
   echo "main options (for others, see top of script file)"
   echo "  --config <config-file>                           # config containing options"
   echo "  --nj <nj>                                        # number of parallel jobs"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   echo "  --acwt <float>                                   # acoustic scale used for lattice generation "
   echo "  --per-utt <true|false>                           # If true, decode per utterance without"
   echo "                                                   # carrying forward adaptation info from previous"
   echo "                                                   # utterances of each speaker.  Default: false"
   echo "  --online <true|false>                            # Set this to false if you don't really care about"
   echo "                                                   # simulating online decoding and just want the best"
   echo "                                                   # results.  This will use all the data within each"
   echo "                                                   # utterance (plus any previous utterance, if not in"
   echo "                                                   # per-utterance mode) to estimate the iVectors."
   echo "  --scoring-opts <string>                          # options to local/score.sh"
   echo "  --iter <iter>                                    # Iteration of model to decode; default is final."
   exit 1;
fi


graphdir=$1
data=$2
dir=$3
# The model directory is the parent of the decoding directory.
srcdir=$(dirname $dir)

# Per-utterance mode uses its own split directory name and split option.
utt_suffix=
utt_opt=
if $per_utt; then
  utt_suffix=utt
  utt_opt="--per-utt"
fi
sdata=$data/split${nj}${utt_suffix}

mkdir -p $dir/log
split_data.sh $utt_opt $data $nj || exit 1
echo $nj > $dir/num_jobs

# Every one of these inputs has to exist before decoding can start.
for required in $srcdir/conf/online_nnet2_decoding.conf $srcdir/${iter}.mdl \
    $graphdir/HCLG.fst $graphdir/words.txt $data/wav.scp; do
  [ ! -f $required ] && { echo "$0: no such file $required"; exit 1; }
done

# Speaker-to-utterance map handed to the decoder.  In per-utterance mode each
# utterance is written as its own "speaker" (utterance-id repeated twice).
if $per_utt; then
  mkdir -p $dir/per_utt
  for j in $(seq $nj); do
    awk '{print $1, $1}' <$sdata/$j/utt2spk >$dir/per_utt/utt2spk.$j || exit 1
  done
  spk2utt_rspecifier="ark:$dir/per_utt/utt2spk.JOB"
else
  spk2utt_rspecifier="ark:$sdata/JOB/spk2utt"
fi

# Audio input pipeline: segments are cut out when a segments file exists,
# then the optional speex round-trip and silence extension are appended.
if [ -f $data/segments ]; then
  wav_rspecifier="ark,s,cs:extract-segments scp,p:$sdata/JOB/wav.scp $sdata/JOB/segments ark:- |"
else
  wav_rspecifier="ark,s,cs:wav-copy scp,p:$sdata/JOB/wav.scp ark:- |"
fi
if $do_speex_compressing; then
  wav_rspecifier+=" compress-uncompress-speex ark:- ark:- |"
fi
if $do_endpointing; then
  wav_rspecifier+=" extend-wav-with-silence ark:- ark:- |"
fi

# Silence weighting options are only built when the weight differs from "1.0".
silence_weighting_opts=
if [ "$silence_weight" != "1.0" ]; then
  silphones=$(cat $graphdir/phones/silence.csl) || exit 1
  silence_weighting_opts="--ivector-silence-weighting.max-state-duration=$max_state_duration --ivector-silence-weighting.silence_phones=$silphones --ivector-silence-weighting.silence-weight=$silence_weight"
fi

# Default to the single-threaded decoder; switch everything over if the
# threaded one was requested.
decoder=online2-wav-nnet2-latgen-faster
parallel_opts=
opts="--online=$online"
if $threaded; then
  decoder=online2-wav-nnet2-latgen-threaded
  # note: the decoder actually uses 4 threads, but the average usage will
  # normally be more like 2.
  parallel_opts="--num-threads 2"
  opts="--modify-ivector-config=$modify_ivector_config --verbose=1"
fi

# Stage 0: generate the lattices, one gzipped archive per job.
if [ $stage -le 0 ]; then
  $cmd $parallel_opts JOB=1:$nj $dir/log/decode.JOB.log \
    $decoder $opts $silence_weighting_opts --do-endpointing=$do_endpointing \
     --config=$srcdir/conf/online_nnet2_decoding.conf \
     --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \
     --acoustic-scale=$acwt --word-symbol-table=$graphdir/words.txt \
     $srcdir/${iter}.mdl $graphdir/HCLG.fst $spk2utt_rspecifier "$wav_rspecifier" \
      "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1
fi

# Scoring is optional; when it is requested the scoring script must be runnable.
if $skip_scoring; then
  exit 0
fi

if [ ! -x local/score.sh ]; then
  echo "Not scoring because local/score.sh does not exist or not executable."
  exit 1
fi
local/score.sh --cmd "$cmd" $scoring_opts $data $graphdir $dir

exit 0


================================================
FILE: egs/steps/online/nnet2/dump_nnet_activations.sh
================================================
#!/usr/bin/env bash

# Copyright   2013  Daniel Povey
# Apache 2.0.

# This script was modified from ./extract_ivectors_online2.sh.  It is to be used
# when retraining the top layer of a system that was trained on another,
# out-of-domain dataset, on some in-domain dataset.  It takes as input a
# directory such as nnet_gpu_online as prepared by ./prepare_online_decoding.sh,
# and a data directory, and it processes the wave files to get features and iVectors,
# then puts it through all but the last layer of the neural net in that directory, and dumps
# those final activations in a feats.scp file in the output directory.  These files
# might be quite large.  A typical feature-dimension is 300; it's the p-norm output dim.
# We compress these files (note: the compression is lossy).


# Begin configuration section.
# All of these can be overridden on the command line via parse_options.sh.
nj=30          # number of parallel jobs.
cmd="run.pl"   # how to run jobs.
stage=0        # stages numbered below this value are skipped.
utts_per_spk_max=2 # maximum 2 utterances per "fake-speaker."

# End configuration section.

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;


# Exactly three positional arguments are required.  The usage text states the
# real default of --nj (30) and the real location of the output
# (<output-dir>/data/feats.scp), matching what the rest of the script does.
if [ $# != 3 ]; then
  echo "Usage: $0 [options] <data> <srcdir> <output-dir>"
  echo " e.g.: $0 data/train exp/nnet2_online/nnet_a_online exp/nnet2_online/activations_train"
  echo "Output is in <output-dir>/data/feats.scp"
  echo "main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue-opts>) # how to run jobs."
  echo "  --nj <n|30>                                      # Number of jobs"
  echo "  --stage <stage|0>                                # To control partial reruns"
  echo "  --utts-per-spk-max <int;default=2>    # Controls splitting into 'fake speakers'."
  echo "                                        # Set to 1 if compatibility with utterance-by-utterance"
  echo "                                        # decoding is the only factor, and to larger if you care "
  echo "                                        # also about adaptation over several utterances."
  exit 1;
fi

data=$1
srcdir=$2
dir=$3

# Inputs that must exist before anything is written.
for f in $data/wav.scp $srcdir/conf/online_nnet2_decoding.conf $srcdir/final.mdl; do
  [ ! -f $f ] && echo "No such file $f" && exit 1;
done

# Set various variables.
mkdir -p $dir/log
echo $nj >$dir/num_jobs
sdata=$data/split$nj;
utils/split_data.sh $data $nj || exit 1;


mkdir -p $dir/conf $dir/feats
# The feature-pipeline config is the decoding config minus every line that
# starts with "--endpoint".
grep -v '^--endpoint' $srcdir/conf/online_nnet2_decoding.conf > $dir/conf/online_feature_pipeline.conf

if [ $stage -le 0 ]; then
  ns=$(wc -l <$data/spk2utt)
  if [ "$ns" == 1 ] && [ "$utts_per_spk_max" != 1 ]; then
    echo "$0: you seem to have just one speaker in your database.  This is probably not a good idea."
    echo "  see http://kaldi-asr.org/doc/data_prep.html (search for 'bold') for why"
    echo "  Setting --utts-per-spk-max to 1."
    utts_per_spk_max=1
  fi

  # A value that is not greater than zero would give the awk program below a
  # chunk that never advances, so its while-loop would never finish.  Reject
  # such values up front.
  if ! awk -v max="$utts_per_spk_max" 'BEGIN { exit !(max > 0) }'; then
    echo "$0: --utts-per-spk-max must be greater than 0, got '$utts_per_spk_max'"
    exit 1
  fi

  mkdir -p $dir/spk2utt_fake
  for job in $(seq $nj); do
   # create fake spk2utt files with reduced number of utterances per speaker,
   # so the network is well adapted to using iVectors from small amounts of
   # training data.
   # Each spk2utt line is cut into consecutive chunks of at most $max
   # utterances; each chunk is printed as a speaker named by the original
   # speaker-id, "-" and a 6-digit hex counter.
    awk -v max="$utts_per_spk_max" '{ n=2; count=0; while(n<=NF) {
      nmax=n+max; count++; printf("%s-%06x", $1, count); for (;n<nmax&&n<=NF; n++) printf(" %s", $n); print "";} }' \
        <$sdata/$job/spk2utt >$dir/spk2utt_fake/spk2utt.$job
  done
fi

# Stage 1: write a raw copy of the network with its final layers removed.
if [ $stage -le 1 ]; then
  info=$dir/nnet_info
  nnet-am-info $srcdir/final.mdl >$info
  nc=$(grep num-components $info | awk '{print $2}')

  # Normally two components are dropped (AffineComponent, SoftmaxComponent);
  # if we did mix-up there is a SumGroupComponent as well, so drop three.
  num_removed=2
  if grep SumGroupComponent $info >/dev/null; then
    num_removed=3
  fi
  nc_truncate=$(($nc-$num_removed))

  nnet-to-raw-nnet --truncate=$nc_truncate $srcdir/final.mdl $dir/nnet.raw
fi

# Stage 2: run the feature pipeline and the truncated network over the audio
# and store the compressed result per job in $dir/feats.
if [ $stage -le 2 ]; then
  echo "$0: dumping neural net activations"

  # The next line is a no-op unless $dir/feats/storage/ exists; see utils/create_split_dir.pl.
  for j in $(seq $nj); do  utils/create_data_link.pl $dir/feats/feats.$j.ark; done

  # With a segments file the audio is cut by extract-segments; otherwise the
  # per-job wav.scp is read directly.
  if [ -f $data/segments ]; then
    wav_rspecifier="ark,s,cs:extract-segments scp,p:$sdata/JOB/wav.scp $sdata/JOB/segments ark:- |"
  else
    wav_rspecifier="scp,p:$sdata/JOB/wav.scp"
  fi
  # Pipeline per job: online2-wav-dump-features (using the fake spk2utt made
  # in stage 0) -> nnet-compute with $dir/nnet.raw -> copy-feats with
  # compression, writing both an ark and an scp file.
  $cmd JOB=1:$nj $dir/log/dump_activations.JOB.log \
    online2-wav-dump-features  --config=$dir/conf/online_feature_pipeline.conf \
      ark:$dir/spk2utt_fake/spk2utt.JOB "$wav_rspecifier" ark:- \| \
    nnet-compute $dir/nnet.raw ark:- ark:- \| \
    copy-feats --compress=true ark:- \
      ark,scp:$dir/feats/feats.JOB.ark,$dir/feats/feats.JOB.scp || exit 1;
fi

# Stage 3: build $dir/data as a copy of the input data directory whose
# feats.scp is the concatenation of the per-job scp files from stage 2.
if [ $stage -le 3 ]; then
  echo "$0: combining activations across jobs"
  mkdir -p $dir/data
  cp -r $data/* $dir/data
  for j in $(seq $nj); do cat $dir/feats/feats.$j.scp; done >$dir/data/feats.scp || exit 1;
fi

if [ $stage -le 4 ]; then
  echo "$0: computing [fake] CMVN stats."
  # We shouldn't actually be doing CMVN, but the get_egs.sh script expects it,
  # so create fake CMVN stats.
  steps/compute_cmvn_stats.sh --fake $dir/data $dir/log $dir/feats || exit 1
fi


echo "$0: done.  Output is in $dir/data/feats.scp"


================================================
FILE: egs/steps/online/nnet2/extract_ivectors.sh
================================================
#!/usr/bin/env bash

# Copyright     2013  Daniel Povey
# Apache 2.0.


# This script computes iVectors in the same format as extract_ivectors_online.sh,
# except that they are actually not really computed online, they are first computed
# per speaker and just duplicated many times.
# This is mainly intended for use in decoding, where you want the best possible
# quality of iVectors.
#
# This setup also makes it possible to use a previous decoding or alignment, to
# down-weight silence in the stats (default is --silence-weight 0.0).
#
# This is for when you use the "online-decoding" setup in an offline task, and
# you want the best possible results.


# Begin configuration section.
# All of these can be overridden on the command line via parse_options.sh.
nj=30          # number of parallel jobs.
cmd="run.pl"   # how to run jobs.
stage=0        # stages numbered below this value are skipped.
num_gselect=5 # Gaussian-selection using diagonal model: number of Gaussians to select
min_post=0.025 # Minimum posterior to use (posteriors below this are pruned out)
ivector_period=10
posterior_scale=0.1 # Scale on the acoustic posteriors, intended to account for
                    # inter-frame correlations.  Making this small during iVector
                    # extraction is equivalent to scaling up the prior, and
                    # will tend to produce smaller iVectors where data-counts are
                    # small.  It's not so important that this match the value
                    # used when training the iVector extractor, but more important
                    # that this match the value used when you do real online decoding
                    # with the neural nets trained with these iVectors.
max_count=100       # Interpret this as a number of frames times posterior scale...
                    # this config ensures that once the count exceeds this (i.e.
                    # 1000 frames, or 10 seconds, by default), we start to scale
                    # down the stats, accentuating the prior term.   This seems quite
                    # important for some reason.
sub_speaker_frames=0  # If >0, during iVector estimation we split each speaker
                      # into possibly many 'sub-speakers', each with at least
                      # this many frames of speech (evaluated after applying
                      # silence_weight, so will typically exclude silence).
                      # e.g. set this to 1000, and it will require at least 10 seconds
                      # of speech per sub-speaker.

compress=true       # If true, compress the iVectors stored on disk (it's lossy
                    # compression, as used for feature matrices).
silence_weight=0.0  # weight given to silence frames when frame weights are
                    # derived from an alignment or decode directory.
acwt=0.1  # used if input is a decode dir, to get best path from lattices.
mdl=final  # change this if decode directory did not have ../final.mdl present.
num_threads=1 # Number of threads used by ivector-extract.  It is usually not
              # helpful to set this to > 1.  It is only useful if you have
              # fewer speakers than the number of jobs you want to run.

# End configuration section.

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;


# Four or five positional arguments are accepted (the fourth of five is the
# optional source of frame weights).  The usage text states the real default
# of --nj (30), names gzip as the compression of the weights archive, and no
# longer claims that --num-threads is ignored: it is passed to ivector-extract
# whether or not frame weights are supplied.
if [ $# != 4 ] && [ $# != 5 ]; then
  echo "Usage: $0 [options] <data> <lang> <extractor-dir> [<alignment-dir>|<decode-dir>|<weights-archive>] <ivector-dir>"
  echo " e.g.: $0 data/test data/lang exp/nnet2_online/extractor exp/tri3/decode_test exp/nnet2_online/ivectors_test"
  echo "If <alignment-dir|decode-dir> is provided, it is converted to frame-weights "
  echo "giving silence frames a weight of --silence-weight (default: 0.0). "
  echo "If <weights-archive> is provided, it must be a single archive file compressed "
  echo "(using gzip) containing per-frame weights for each utterance."
  echo "main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --nj <n|30>                                      # Number of jobs (also see num-threads)"
  echo "  --num-threads <n|1>                              # Number of threads for each job"
  echo "  --stage <stage|0>                                # To control partial reruns"
  echo "  --num-gselect <n|5>                              # Number of Gaussians to select using"
  echo "                                                   # diagonal model."
  echo "  --min-post <float;default=0.025>                 # Pruning threshold for posteriors"
  echo "  --ivector-period <int;default=10>                # How often to extract an iVector (frames)"
  echo "  --posterior-scale <float;default=0.1>            # Scale on posteriors in iVector extraction; "
  echo "                                                   # affects strength of prior term."

  exit 1;
fi

# Positional arguments.  The first three are common to both forms; the
# optional source of frame weights is present only in the 5-argument form.
data=$1
lang=$2
srcdir=$3
# Explicitly start from an empty value so that a variable of the same name
# inherited from the environment can never be mistaken for an argument.
ali_or_decode_dir_or_weights=
if [ $# -eq 5 ]; then
  ali_or_decode_dir_or_weights=$4
  dir=$5
else # 4 arguments
  dir=$4
fi

# Inputs that must exist.  silence.csl is included because it is read
# unconditionally just below.
for f in $data/feats.scp $srcdir/final.ie $srcdir/final.dubm $srcdir/global_cmvn.stats $srcdir/splice_opts \
  $lang/phones.txt $lang/phones/silence.csl $srcdir/online_cmvn.conf $srcdir/final.mat; do
  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
done

mkdir -p $dir/log
silphonelist=$(cat $lang/phones/silence.csl) || exit 1;

# When a fourth-of-five argument was given, turn it into a single gzipped
# archive of per-frame weights, $dir/weights.gz.  Three kinds of input are
# recognised: an alignment directory (has ali.1.gz), a decode directory (has
# lat.1.gz), or an already-prepared gzipped weights archive.
if [ ! -z "$ali_or_decode_dir_or_weights" ]; then


  if [ -f $ali_or_decode_dir_or_weights/ali.1.gz ]; then
    # Alignment directory: the model is expected inside the directory itself.
    if [ ! -f $ali_or_decode_dir_or_weights/${mdl}.mdl ]; then
      echo "$0: expected $ali_or_decode_dir_or_weights/${mdl}.mdl to exist."
      exit 1;
    fi
    nj_orig=$(cat $ali_or_decode_dir_or_weights/num_jobs) || exit 1;

    if [ $stage -le 0 ]; then
      rm $dir/weights.*.gz 2>/dev/null

      # Use the model selected by --mdl, i.e. the same file whose existence
      # was checked above (not a hard-coded final.mdl).
      $cmd JOB=1:$nj_orig  $dir/log/ali_to_post.JOB.log \
        gunzip -c $ali_or_decode_dir_or_weights/ali.JOB.gz \| \
        ali-to-post ark:- ark:- \| \
        weight-silence-post $silence_weight $silphonelist $ali_or_decode_dir_or_weights/${mdl}.mdl ark:- ark:- \| \
        post-to-weights ark:- "ark:|gzip -c >$dir/weights.JOB.gz" || exit 1;

      # put all the weights in one archive.
      for j in $(seq $nj_orig); do gunzip -c $dir/weights.$j.gz; done | gzip -c >$dir/weights.gz || exit 1;
      rm $dir/weights.*.gz || exit 1;
    fi

  elif [ -f $ali_or_decode_dir_or_weights/lat.1.gz ]; then
    # Decode directory: the model is expected one level above it.
    nj_orig=$(cat $ali_or_decode_dir_or_weights/num_jobs) || exit 1;
    if [ ! -f $ali_or_decode_dir_or_weights/../${mdl}.mdl ]; then
      echo "$0: expected $ali_or_decode_dir_or_weights/../${mdl}.mdl to exist."
      exit 1;
    fi


    if [ $stage -le 0 ]; then
      rm $dir/weights.*.gz 2>/dev/null

      # The best path through each lattice serves as the alignment.
      $cmd JOB=1:$nj_orig  $dir/log/lat_to_post.JOB.log \
        lattice-best-path --acoustic-scale=$acwt "ark:gunzip -c $ali_or_decode_dir_or_weights/lat.JOB.gz|" ark:/dev/null ark:- \| \
        ali-to-post ark:- ark:- \| \
        weight-silence-post $silence_weight $silphonelist $ali_or_decode_dir_or_weights/../${mdl}.mdl ark:- ark:- \| \
        post-to-weights ark:- "ark:|gzip -c >$dir/weights.JOB.gz" || exit 1;

      # put all the weights in one archive.
      for j in $(seq $nj_orig); do gunzip -c $dir/weights.$j.gz; done | gzip -c >$dir/weights.gz || exit 1;
      rm $dir/weights.*.gz || exit 1;
    fi
  elif [ -f $ali_or_decode_dir_or_weights ] && gunzip -c $ali_or_decode_dir_or_weights >/dev/null; then
    # A regular file that gunzip can read: use it directly as the archive.
    cp $ali_or_decode_dir_or_weights $dir/weights.gz || exit 1;
  else
    echo "$0: expected ali.1.gz or lat.1.gz to exist in $ali_or_decode_dir_or_weights," \
         "or $ali_or_decode_dir_or_weights to be a gzipped weights archive";
    exit 1;
  fi
fi

# Split the data directory into $nj parts; $sdata/JOB is the part for one job.
sdata=$data/split$nj;
utils/split_data.sh $data $nj || exit 1;

# Record the iVector period in the output directory and read the splicing
# options stored with the extractor.
echo $ivector_period > $dir/ivector_period || exit 1;
splice_opts=$(cat $srcdir/splice_opts)

# Two feature pipelines.  $gmm_feats applies apply-cmvn-online before
# splicing and the final.mat transform; $feats only splices and transforms.
gmm_feats="ark,s,cs:apply-cmvn-online --spk2utt=ark:$sdata/JOB/spk2utt --config=$srcdir/online_cmvn.conf $srcdir/global_cmvn.stats scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
feats="ark,s,cs:splice-feats $splice_opts scp:$sdata/JOB/feats.scp ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"

# This adds online-cmvn in $feats, upon request (configuration taken from UBM),
# i.e. when the marker file online_cmvn_iextractor exists in the extractor dir.
[ -f $srcdir/online_cmvn_iextractor ] && feats="$gmm_feats"

# Optionally split each speaker into "sub-speakers" that each hold at least
# $sub_speaker_frames frames.  $this_sdata names the split directory whose
# spk2utt/utt2spk files the later stages use.
if [ $sub_speaker_frames -gt 0 ]; then

  if [ $stage -le 1 ]; then
  # We work out 'fake' spk2utt files that possibly split each speaker into multiple pieces.
    if [ ! -z "$ali_or_decode_dir_or_weights" ]; then
      # Per-utterance count = sum of the frame weights.  In the text form of
      # a vector, field 1 is the utterance-id, field 2 is "[" and the last
      # field is "]", hence the range 3..NF-1.
      gunzip -c $dir/weights.gz | copy-vector ark:- ark,t:- | \
        awk '{ sum=0; for (n=3;n<NF;n++) sum += $n; print $1, sum; }' > $dir/utt_counts || exit 1;
    else
      # Without weights, the count is simply the number of frames.
      feat-to-len scp:$data/feats.scp ark,t:- > $dir/utt_counts || exit 1;
    fi
    if ! [ $(wc -l <$dir/utt_counts) -eq $(wc -l <$data/feats.scp) ]; then
      echo "$0: error getting per-utterance counts."
      # This is an error, so it must be reported with a non-zero status.
      exit 1;
    fi
    # The Python program reads spk2utt on stdin and prints one line per
    # sub-speaker: "<spk>-<6-digit hex index> <utt> <utt> ...".
    cat $data/spk2utt | python -c "
import sys

sub_speaker_frames = $sub_speaker_frames

# Map from utterance-id to its count, read from the utt_counts file.
utt_counts = {}
with open('$dir/utt_counts') as counts_file:
  for count_line in counts_file:
    fields = count_line.split()
    utt_counts[fields[0]] = float(fields[1])

for line in sys.stdin:
  parts = line.split()
  spk = parts[0]
  utts = parts[1:]

  # Fail with a clear message before anything is printed for this speaker.
  for utt in utts:
    if utt not in utt_counts:
      raise Exception('No count found for the utterance {0}.'.format(utt))
  total_count = sum(utt_counts[utt] for utt in utts)

  numeric_id = 0
  current_count = 0
  covered_count = 0
  current_utts = []
  for utt in utts:
    current_count += utt_counts[utt]
    covered_count += utt_counts[utt]
    current_utts.append(utt)
    # Close the current sub-speaker once it is large enough and enough data
    # remains for another one, or when the last utterance has been reached.
    if ((current_count >= sub_speaker_frames) and ((total_count - covered_count) >= sub_speaker_frames)) or (utt == parts[-1]):
      spk_partial = '{0}-{1:06x}'.format(spk, numeric_id)
      numeric_id += 1
      print ('{0} {1}'.format(spk_partial, ' '.join(current_utts)))
      current_utts = []
      current_count = 0
"> $dir/spk2utt || exit 1;
    mkdir -p $dir/split$nj
    # create split versions of our spk2utt file.
    for j in $(seq $nj); do
      mkdir -p $dir/split$nj/$j
      utils/filter_scp.pl -f 2 $sdata/$j/utt2spk <$dir/spk2utt >$dir/split$nj/$j/spk2utt || exit 1;
      utils/spk2utt_to_utt2spk.pl <$dir/split$nj/$j/spk2utt >$dir/split$nj/$j/utt2spk || exit 1;
    done
  fi
  this_sdata=$dir/split$nj
else
  this_sdata=$sdata
fi

# Stage 2: extract one iVector per (sub-)speaker of $this_sdata, written in
# text form to $dir/ivectors_spk.JOB.ark.  The two branches differ only in
# that, when frame weights are available, the posteriors are passed through
# weight-post with $dir/weights.gz before ivector-extract.
if [ $stage -le 2 ]; then
  if [ ! -z "$ali_or_decode_dir_or_weights" ]; then
    $cmd --num-threads $num_threads JOB=1:$nj $dir/log/extract_ivectors.JOB.log \
      gmm-global-get-post --n=$num_gselect --min-post=$min_post $srcdir/final.dubm "$gmm_feats" ark:- \| \
      weight-post ark:- "ark,s,cs:gunzip -c $dir/weights.gz|" ark:- \| \
      ivector-extract --num-threads=$num_threads --acoustic-weight=$posterior_scale --compute-objf-change=true \
        --max-count=$max_count --spk2utt=ark:$this_sdata/JOB/spk2utt \
      $srcdir/final.ie "$feats" ark,s,cs:- ark,t:$dir/ivectors_spk.JOB.ark || exit 1;
  else
    $cmd --num-threads $num_threads JOB=1:$nj $dir/log/extract_ivectors.JOB.log \
      gmm-global-get-post --n=$num_gselect --min-post=$min_post $srcdir/final.dubm "$gmm_feats" ark:- \| \
      ivector-extract --num-threads=$num_threads --acoustic-weight=$posterior_scale --compute-objf-change=true \
        --max-count=$max_count --spk2utt=ark:$this_sdata/JOB/spk2utt \
      $srcdir/final.ie "$feats" ark,s,cs:- ark,t:$dir/ivectors_spk.JOB.ark || exit 1;
  fi
fi

# Stage 3: turn the speaker-level iVectors into utterance-level ones by
# mapping each utterance's speaker to that speaker's iVector.  If $this_sdata
# is $dir/split$nj, the "speakers" are really the sub-speakers made earlier.
if [ $stage -le 3 ]; then
  for j in $(seq $nj); do
    utils/apply_map.pl -f 2 $dir/ivectors_spk.$j.ark \
      <$this_sdata/$j/utt2spk >$dir/ivectors_utt.$j.ark || exit 1
  done
fi

# The first line of the text archive has three words besides the values.
num_words=$(head -n 1 $dir/ivectors_spk.1.ark | wc -w) || exit 1
ivector_dim=$(($num_words - 3))
echo "$0: iVector dim is $ivector_dim"

base_feat_dim=$(feat-to-dim scp:$data/feats.scp -) || exit 1

# Column range occupied by the iVector once it is appended to the features.
start_dim=$base_feat_dim
end_dim=$(($base_feat_dim+$ivector_dim-1))
absdir=$(utils/make_absolute.sh $dir)

# Stage 4: the original features are used only for their number of rows; the
# iVector is appended to every row, the original columns are dropped again
# with select-feats, and the result is subsampled to one row per period.
if [ $stage -le 4 ]; then
  $cmd JOB=1:$nj $dir/log/duplicate_feats.JOB.log \
    append-vector-to-feats scp:$sdata/JOB/feats.scp ark:$dir/ivectors_utt.JOB.ark ark:- \| \
    select-feats "$start_dim-$end_dim" ark:- ark:- \| \
    subsample-feats --n=$ivector_period ark:- ark:- \| \
    copy-feats --compress=$compress ark:- \
    ark,scp:$absdir/ivector_online.JOB.ark,$absdir/ivector_online.JOB.scp || exit 1
fi

# Stage 5: concatenate the per-job scp files into a single one.
if [ $stage -le 5 ]; then
  echo "$0: combining iVectors across jobs"
  for j in $(seq $nj); do
    cat $dir/ivector_online.$j.scp
  done >$dir/ivector_online.scp || exit 1
fi

steps/nnet2/get_ivector_id.sh $srcdir > $dir/final.ie.id || exit 1

echo "$0: done extracting (pseudo-online) iVectors to $dir using the extractor in $srcdir."


================================================
FILE: egs/steps/online/nnet2/extract_ivectors_online.sh
================================================
#!/usr/bin/env bash

# Copyright     2013  Daniel Povey
# Apache 2.0.

set -o pipefail

# This script extracts iVectors for a set of utterances, given
# features and a trained iVector extractor.

# The script is based on ^/egs/sre08/v1/sid/extract_ivectors.sh.  Instead of
# extracting a single iVector per utterance, it extracts one every few frames
# (controlled by the --ivector-period option, e.g. 10, which is to save compute).
# This is used in training (and not-really-online testing) of neural networks
# for online decoding.

# Rather than treating each utterance separately, it carries forward
# information from one utterance to the next, within the speaker.


# Begin configuration section.
# All of these can be overridden on the command line via parse_options.sh.
nj=30          # number of parallel jobs.
cmd="run.pl"   # how to run jobs.
stage=0        # stages numbered below this value are skipped.
num_gselect=5 # Gaussian-selection using diagonal model: number of Gaussians to select
min_post=0.025 # Minimum posterior to use (posteriors below this are pruned out)
ivector_period=10
posterior_scale=0.1 # Scale on the acoustic posteriors, intended to account for
                    # inter-frame correlations.  Making this small during iVector
                    # extraction is equivalent to scaling up the prior, and
                    # will tend to produce smaller iVectors where data-counts are
                    # small.  It's not so important that this match the value
                    # used when training the iVector extractor, but more important
                    # that this match the value used when you do real online decoding
                    # with the neural nets trained with these iVectors.
compress=true       # If true, compress the iVectors stored on disk (it's lossy
                    # compression, as used for feature matrices).
max_count=0         # The use of this option (e.g. --max-count 100) can make
                    # iVectors more consistent for different lengths of
                    # utterance, by scaling up the prior term when the
                    # data-count exceeds this value.  The data-count is after
                    # posterior-scaling, so assuming the posterior-scale is 0.1,
                    # --max-count 100 starts having effect after 1000 frames, or
                    # 10 seconds of data.
use_vad=false       # If true, $data/vad.scp is required and is passed to the
                    # extractor as frame weights.

# End configuration section.

echo "$0 $@"  # Print the command line for logging
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;


# Exactly three positional arguments are required.  The usage text states the
# real default of --nj, which is 30.
if [ $# != 3 ]; then
  echo "Usage: $0 [options] <data> <extractor-dir> <ivector-dir>"
  echo " e.g.: $0 data/train exp/nnet2_online/extractor exp/nnet2_online/ivectors_train"
  echo "main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --nj <n|30>                                      # Number of jobs"
  echo "  --stage <stage|0>                                # To control partial reruns"
  echo "  --num-gselect <n|5>                              # Number of Gaussians to select using"
  echo "                                                   # diagonal model."
  echo "  --min-post <float;default=0.025>                 # Pruning threshold for posteriors"
  echo "  --ivector-period <int;default=10>                # How often to extract an iVector (frames)"
  exit 1;
fi

data=$1
srcdir=$2
dir=$3

# vad.scp is only a required input when VAD-based frame weights are in use.
if $use_vad; then
  extra_files=$data/vad.scp
else
  extra_files=
fi

for required in $data/feats.scp $srcdir/final.ie $srcdir/final.dubm $srcdir/global_cmvn.stats $srcdir/splice_opts \
     $srcdir/online_cmvn.conf $srcdir/final.mat $extra_files; do
  if [ ! -f $required ]; then
    echo "$0: No such file $required"
    exit 1
  fi
done

# Set various variables.
mkdir -p $dir/log $dir/conf

# Re-split the data unless a split directory exists that is newer than
# feats.scp.
sdata=$data/split$nj
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1

echo $ivector_period > $dir/ivector_period || exit 1
splice_opts=$(cat $srcdir/splice_opts)

# the program ivector-extract-online2 does a bunch of stuff in memory and is
# config-driven...  this was easier in this case because the same code is
# involved in online decoding.  We need to create a config file for iVector
# extraction.

ieconf=$dir/conf/ivector_extractor.conf
echo -n >$ieconf   # start from an empty config file
cp $srcdir/online_cmvn.conf $dir/conf/ || exit 1

# splice.conf holds the splicing options, one option per line.
for x in $(echo $splice_opts); do
  echo "$x"
done > $dir/conf/splice.conf

# Append all the options to the extractor config in one go.
{
  echo "--cmvn-config=$dir/conf/online_cmvn.conf"
  echo "--ivector-period=$ivector_period"
  echo "--splice-config=$dir/conf/splice.conf"
  echo "--lda-matrix=$srcdir/final.mat"
  echo "--global-cmvn-stats=$srcdir/global_cmvn.stats"
  echo "--diag-ubm=$srcdir/final.dubm"
  echo "--ivector-extractor=$srcdir/final.ie"
  echo "--num-gselect=$num_gselect"
  echo "--min-post=$min_post"
  echo "--posterior-scale=$posterior_scale"
  echo "--max-remembered-frames=1000" # the default
  echo "--max-count=$max_count"
} >>$ieconf
[ -f $srcdir/online_cmvn_iextractor ] && echo "--online-cmvn-iextractor=true" >>$ieconf


absdir=$(utils/make_absolute.sh $dir)

# This will do nothing unless the directory $dir/storage exists;
# it can be used to distribute the data among multiple machines.
for n in $(seq $nj); do
  utils/create_data_link.pl $dir/ivector_online.$n.ark
done

if [ $stage -le 0 ]; then
  echo "$0: extracting iVectors"
  extra_opts=
  if $use_vad; then
    extra_opts="--frame-weights-rspecifier=scp:$data/vad.scp"
  fi

  $cmd JOB=1:$nj $dir/log/extract_ivectors.JOB.log \
    ivector-extract-online2 --config=$ieconf $extra_opts \
      ark:$sdata/JOB/spk2utt scp:$sdata/JOB/feats.scp ark:- \| \
    copy-feats --compress=$compress ark:- \
      ark,scp:$absdir/ivector_online.JOB.ark,$absdir/ivector_online.JOB.scp || exit 1;
fi

if [ $stage -le 1 ]; then
  echo "$0: combining iVectors across jobs"
  for j in $(seq $nj); do cat $dir/ivector_online.$j.scp; done >$dir/ivector_online.scp || exit 1;
fi

steps/nnet2/get_ivector_id.sh $srcdir > $dir/final.ie.id || exit 1

echo "$0: done extracting (online) iVectors to $dir using the extractor in $srcdir."


================================================
FILE: egs/steps/online/nnet2/get_egs.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.

# This is modified from ../../nnet2/get_egs.sh.
# This script combines the
# nnet-example extraction with the feature extraction directly from wave files;
# it uses the program online2-wav-dump-feature to do all parts of feature
# extraction: MFCC/PLP/fbank, possibly plus pitch, plus iVectors.  This script
# is intended mostly for cross-system training for online decoding, where you
# initialize the nnet from an existing, larger system.


# Begin configuration section.
# Defaults for this script.  The usage message further down shows these being
# overridden from the command line (e.g. --num-jobs-nnet, --stage) once
# parse_options.sh has been sourced.
cmd=run.pl             # wrapper used to launch jobs (see --cmd in the usage message)
num_utts_subset=300    # number of utterances in validation and training
                       # subsets used for shrinkage and diagnostics
num_valid_frames_combine=0 # #valid frames for combination weights at the very end.
num_train_frames_combine=10000 # # train frames for the above.
num_frames_diagnostic=4000 # number of frames for "compute_prob" jobs
samples_per_iter=400000 # each iteration of training, see this many samples
                        # per job.  This is just a guideline; it will pick a number
                        # that divides the number of samples in the entire data.
transform_dir=     # If supplied, overrides alidir
                   # NOTE(review): this variable is not referenced anywhere else
                   # in this script, so setting it currently has no effect.
num_jobs_nnet=16    # Number of neural net jobs to run in parallel
stage=0             # a section guarded by "[ $stage -le N ]" is skipped when stage > N
io_opts="--max-jobs-run 5" # for jobs with a lot of I/O, limits the number running at one time.
random_copy=false   # passed as --random to nnet-copy-egs when splitting egs per iteration
# End configuration section.

echo "$0 $@"  # Print the command line for logging

# Pick up the environment from path.sh when present, then let parse_options.sh
# consume the leading --option arguments.
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;


# Exactly four positional arguments must remain; otherwise print usage and exit.
# The options listed here all have a corresponding variable in the
# configuration section of this script, and the defaults shown are the values
# assigned there.
if [ $# != 4 ]; then
  echo "Usage: $0 [opts] <data> <ali-dir> <online-nnet-dir> <exp-dir>"
  echo " e.g.: $0 data/train exp/tri3_ali exp/nnet2_online/nnet_a_gpu_online/ exp/tri4_nnet"
  echo "In <online-nnet-dir>, it looks for final.mdl (need to compute required left and right context),"
  echo "and a configuration file conf/online_nnet2_decoding.conf which describes the features."
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl;utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --num-jobs-nnet <num-jobs;16>                    # Number of parallel jobs to use for main neural net"
  echo "                                                   # training (will affect results as well as speed; try 8, 16)"
  echo "                                                   # Note: if you increase this, you may want to also increase"
  echo "                                                   # the learning rate."
  echo "  --samples-per-iter <#samples;400000>             # Number of samples of data to process per iteration, per"
  echo "                                                   # process."
  echo "  --num-frames-diagnostic <#frames;4000>           # Number of frames used in computing (train,valid) diagnostics"
  echo "  --num-valid-frames-combine <#frames;0>           # Number of validation frames used in getting combination"
  echo "                                                   # weights at the very end."
  echo "  --num-train-frames-combine <#frames;10000>       # Number of training frames used in getting combination"
  echo "                                                   # weights at the very end."
  echo "  --stage <stage|0>                                # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."

  exit 1;
fi

# Positional arguments.
data=$1              # data directory (wav.scp, utt2spk, optionally segments/utt2uniq)
alidir=$2            # alignment directory (ali.*.gz, final.mdl, tree, num_jobs)
online_nnet_dir=$3   # existing online nnet2 directory (final.mdl, conf/)
dir=$4               # output directory


mdl=$online_nnet_dir/final.mdl # only needed for left and right context.
feature_conf=$online_nnet_dir/conf/online_nnet2_decoding.conf

# Fail early if any required input is missing.  utt2spk is included because the
# utterance subsets below are drawn from it.
for f in $data/wav.scp $data/utt2spk $alidir/ali.1.gz $alidir/final.mdl $alidir/tree $feature_conf $mdl; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

nj=`cat $alidir/num_jobs` || exit 1;  # number of jobs in alignment dir...

# Split the data the same way as the alignments; stop here if that fails
# rather than failing later inside the per-job pipelines.
sdata=$data/split$nj
utils/split_data.sh $data $nj || exit 1;

mkdir -p $dir/log || exit 1;
cp $alidir/tree $dir || exit 1;
# The feature config is the decoding config minus any line starting with --endpoint.
grep -v '^--endpoint' $feature_conf >$dir/feature.conf || exit 1;

# Get list of validation utterances.
mkdir -p $dir/valid $dir/train_subset

# Shuffle the utterance ids (first field of utt2spk) and keep the first
# $num_utts_subset of them as the validation list.
awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_utts_subset \
    > $dir/valid/uttlist || exit 1;

if [ -f $data/utt2uniq ]; then
  echo "File $data/utt2uniq exists, so augmenting valid/uttlist to"
  echo "include all perturbed versions of the same 'real' utterances."
  # Map each chosen utterance through utt2uniq, de-duplicate, then map back
  # through the inverted map (uniq2utt) and emit one utterance id per line.
  mv $dir/valid/uttlist $dir/valid/uttlist.tmp
  utils/utt2spk_to_spk2utt.pl $data/utt2uniq > $dir/uniq2utt
  cat $dir/valid/uttlist.tmp | utils/apply_map.pl $data/utt2uniq | \
    sort | uniq | utils/apply_map.pl $dir/uniq2utt | \
    awk '{for(n=1;n<=NF;n++) print $n;}' | sort  > $dir/valid/uttlist
  rm $dir/uniq2utt $dir/valid/uttlist.tmp
fi

# The training subset is drawn from the utterances that are NOT in the
# validation list, again limited to $num_utts_subset entries.
awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid/uttlist | \
   utils/shuffle_list.pl | head -$num_utts_subset > $dir/train_subset/uttlist || exit 1;


for subdir in valid train_subset; do
  # In order for the iVector extraction to work right, we need to process all
  # utterances of the speakers which have utterances in valid/uttlist, and the
  # same for train_subset/uttlist.  We produce $dir/valid/uttlist_extended which
  # will contain all utterances of all speakers which have utterances in
  # $dir/valid/uttlist, and the same for $dir/train_subset/.

  # spklist: second field of utt2spk for the selected utterances (temporary file).
  utils/filter_scp.pl $dir/$subdir/uttlist <$data/utt2spk | awk '{print $2}' > $dir/$subdir/spklist || exit 1;
  # utt2spk restricted to those speakers (filtering on field 2), then spk2utt from it.
  utils/filter_scp.pl -f 2 $dir/$subdir/spklist <$data/utt2spk >$dir/$subdir/utt2spk || exit 1;
  utils/utt2spk_to_spk2utt.pl <$dir/$subdir/utt2spk >$dir/$subdir/spk2utt || exit 1;
  awk '{print $1}' <$dir/$subdir/utt2spk >$dir/$subdir/uttlist_extended || exit 1;
  rm $dir/$subdir/spklist
done

# Build the three feature-reading pipelines (full training data per JOB, the
# validation subset, and the training subset).  Each dumps features with
# online2-wav-dump-features using $dir/feature.conf and then keeps or drops
# utterances with subset-feats.  The literal "JOB" in $feats is left in place
# here; presumably $cmd substitutes it per job -- confirm against the job wrapper.
if [ -f $data/segments ]; then
  # note: in the feature extraction, because the program online2-wav-dump-features is sensitive to the
  # previous utterances within a speaker, we do the filtering after extracting the features.
  echo "$0 [info]: segments file exists: using that."
  feats="ark,s,cs:extract-segments scp:$sdata/JOB/wav.scp $sdata/JOB/segments ark:- | online2-wav-dump-features --config=$dir/feature.conf ark:$sdata/JOB/spk2utt ark,s,cs:- ark:- | subset-feats --exclude=$dir/valid/uttlist ark:- ark:- |"
  valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid/uttlist_extended $data/segments  | extract-segments scp:$data/wav.scp - ark:- | online2-wav-dump-features --config=$dir/feature.conf ark:$dir/valid/spk2utt ark,s,cs:- ark:- | subset-feats --include=$dir/valid/uttlist ark:- ark:- |"
  train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset/uttlist_extended $data/segments  | extract-segments scp:$data/wav.scp - ark:- | online2-wav-dump-features --config=$dir/feature.conf ark:$dir/train_subset/spk2utt ark,s,cs:- ark:- | subset-feats --include=$dir/train_subset/uttlist ark:- ark:- |"
else
  echo "$0 [info]: no segments file exists, using wav.scp."
  feats="ark,s,cs:online2-wav-dump-features --config=$dir/feature.conf ark:$sdata/JOB/spk2utt scp:$sdata/JOB/wav.scp ark:- | subset-feats --exclude=$dir/valid/uttlist ark:- ark:- |"
  valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid/uttlist_extended $data/wav.scp | online2-wav-dump-features --config=$dir/feature.conf ark:$dir/valid/spk2utt scp:- ark:- | subset-feats --include=$dir/valid/uttlist ark:- ark:- |"
  train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset/uttlist_extended $data/wav.scp | online2-wav-dump-features --config=$dir/feature.conf ark:$dir/train_subset/spk2utt scp:- ark:- | subset-feats --include=$dir/train_subset/uttlist ark:- ark:- |"
fi

# Ask the feature program for the iVector dimension as a sanity check on the
# feature config.
# NOTE(review): $ivector_dim is only checked here and is not used again in this
# script; $ivectors_opt, which is passed to nnet-get-egs further down, is never
# assigned in this script and so expands to nothing unless set in the environment.
ivector_dim=$(online2-wav-dump-features --config=$dir/feature.conf --print-ivector-dim=true) || exit 1;

! [ $ivector_dim -ge 0 ] && echo "$0: error getting iVector dim" && exit 1;


# Determine the total number of training frames.  At stage 0 it is computed and
# cached in $dir/num_frames; on a later-stage rerun the cached value is read back.
if [ $stage -le 0 ]; then
  echo "$0: working out number of frames of training data"
  # Stop if the frame count cannot be obtained, instead of caching an empty
  # value and failing later in the arithmetic below.
  num_frames=$(steps/nnet2/get_num_frames.sh $data) || exit 1;
  [ -z "$num_frames" ] && echo "$0: error getting the number of frames" && exit 1;
  echo $num_frames > $dir/num_frames
else
  num_frames=$(cat $dir/num_frames) || exit 1;
fi

# Working out number of iterations per epoch: num_frames divided by the number
# of samples consumed per iteration by all jobs together, rounded to nearest,
# with a minimum of 1.
iters_per_epoch=$(perl -e "print int($num_frames/($samples_per_iter * $num_jobs_nnet) + 0.5);") || exit 1;
[ $iters_per_epoch -eq 0 ] && iters_per_epoch=1
# The actual samples per iteration per job that results from that rounding.
samples_per_iter_real=$(($num_frames/($num_jobs_nnet*$iters_per_epoch)))
echo "$0: Every epoch, splitting the data up into $iters_per_epoch iterations,"
echo "$0: giving samples-per-iteration of $samples_per_iter_real (you requested $samples_per_iter)."

# Set up soft links into the storage directories for every archive this script
# will write.  This is a no-op unless the subdirectory $dir/egs/storage/
# exists.  See utils/create_split_dir.pl
for x in $(seq 1 $num_jobs_nnet); do
  # final and temporary archives: one per (job, iteration) pair
  for y in $(seq 0 $(($iters_per_epoch-1))); do
    utils/create_data_link.pl $dir/egs/egs.$x.$y.ark
    utils/create_data_link.pl $dir/egs/egs_tmp.$x.$y.ark
  done
  # intermediate archives: one per (job, alignment split) pair
  for y in $(seq 1 $nj); do
    utils/create_data_link.pl $dir/egs/egs_orig.$x.$y.ark
  done
done

# remove <path>...: delete each of the given paths.  When a path is a soft
# link, first delete whatever utils/make_absolute.sh prints for it, then
# delete the path itself.
remove () {
  for x in $*; do
    if [ -L $x ]; then
      rm $(utils/make_absolute.sh $x)
    fi
    rm $x
  done
}

# Read the model's left and right context from the output of nnet-am-info.
# pipefail is switched on only around these two pipelines so that a failure in
# any stage (nnet-am-info, or grep finding no matching line), not just the
# final awk, triggers the "|| exit 1".
set -o pipefail
left_context=$(nnet-am-info $mdl | grep '^left-context' | awk '{print $2}') || exit 1;
right_context=$(nnet-am-info $mdl | grep '^right-context' | awk '{print $2}') || exit 1;
# Options handed to every nnet-get-egs call below.
nnet_context_opts="--left-context=$left_context --right-context=$right_context"
set +o pipefail

# All example archives are written under $dir/egs.
mkdir -p $dir/egs

# Stage 2: build the small validation / training-subset example sets used for
# diagnostics and for the final combination weights.
if [ $stage -le 2 ]; then
  # $dir/.error is the marker the background jobs below touch on failure;
  # clear any stale one first.
  rm $dir/.error 2>/dev/null

  echo "$0: extracting validation and training-subset alignments."
  # Concatenate all alignment archives, convert to text form and keep only the
  # utterances in the two subset lists.  pipefail makes a failure in any stage
  # of the pipeline fatal.
  set -o pipefail;
  for id in $(seq $nj); do gunzip -c $alidir/ali.$id.gz; done | \
    copy-int-vector ark:- ark,t:- | \
    utils/filter_scp.pl <(cat $dir/valid/uttlist $dir/train_subset/uttlist) | \
    gzip -c >$dir/ali_special.gz || exit 1;
  set +o pipefail; # unset the pipefail option.

  echo "Getting validation and training subset examples."
  # The two extraction jobs run in parallel in the background.
  $cmd $dir/log/create_valid_subset.log \
    nnet-get-egs $ivectors_opt $nnet_context_opts "$valid_feats" \
     "ark,s,cs:gunzip -c $dir/ali_special.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \
     "ark:$dir/egs/valid_all.egs" || touch $dir/.error &
  $cmd $dir/log/create_train_subset.log \
    nnet-get-egs $ivectors_opt $nnet_context_opts "$train_subset_feats" \
    "ark,s,cs:gunzip -c $dir/ali_special.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \
     "ark:$dir/egs/train_subset_all.egs" || touch $dir/.error &
  wait;
  # Report the failure before exiting (previously this exited with no message).
  [ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1;
  echo "Getting subsets of validation examples for diagnostics and combination."
  # Four more background jobs: combine and diagnostic subsets for each of the
  # validation and training-subset example sets.
  $cmd $dir/log/create_valid_subset_combine.log \
    nnet-subset-egs --n=$num_valid_frames_combine ark:$dir/egs/valid_all.egs \
        ark:$dir/egs/valid_combine.egs || touch $dir/.error &
  $cmd $dir/log/create_valid_subset_diagnostic.log \
    nnet-subset-egs --n=$num_frames_diagnostic ark:$dir/egs/valid_all.egs \
    ark:$dir/egs/valid_diagnostic.egs || touch $dir/.error &

  $cmd $dir/log/create_train_subset_combine.log \
    nnet-subset-egs --n=$num_train_frames_combine ark:$dir/egs/train_subset_all.egs \
    ark:$dir/egs/train_combine.egs || touch $dir/.error &
  $cmd $dir/log/create_train_subset_diagnostic.log \
    nnet-subset-egs --n=$num_frames_diagnostic ark:$dir/egs/train_subset_all.egs \
    ark:$dir/egs/train_diagnostic.egs || touch $dir/.error &
  wait
  [ -f $dir/.error ] && echo "Error detected while creating egs" && exit 1;
  # combine.egs is the concatenation of the validation and training combine sets.
  cat $dir/egs/valid_combine.egs $dir/egs/train_combine.egs > $dir/egs/combine.egs

  # All three outputs must exist and be non-empty.
  for f in $dir/egs/{combine,train_diagnostic,valid_diagnostic}.egs; do
    [ ! -s $f ] && echo "No examples in file $f" && exit 1;
  done
  rm $dir/egs/valid_all.egs $dir/egs/train_subset_all.egs $dir/egs/{train,valid}_combine.egs $dir/ali_special.gz
fi

# Stage 3: dump the training examples, one job per alignment split.
if [ $stage -le 3 ]; then

  # Other scripts might need to know the following info:
  echo $num_jobs_nnet >$dir/egs/num_jobs_nnet
  echo $iters_per_epoch >$dir/egs/iters_per_epoch
  echo $samples_per_iter_real >$dir/egs/samples_per_iter

  echo "Creating training examples";
  # in $dir/egs, create $num_jobs_nnet separate files with training examples.
  # The order is not randomized at this point.

  # One output archive per nnet job; the literal JOB in each name is the
  # placeholder for the alignment-split index of the job writing it.
  egs_list=
  for n in `seq 1 $num_jobs_nnet`; do
    egs_list="$egs_list ark:$dir/egs/egs_orig.$n.JOB.ark"
  done
  echo "Generating training examples on disk"
  # The examples will go round-robin to egs_list.
  # The escaped "\|" is passed through to the command run by $cmd instead of
  # being interpreted as a pipe by this shell.
  $cmd $io_opts JOB=1:$nj $dir/log/get_egs.JOB.log \
    nnet-get-egs $ivectors_opt $nnet_context_opts "$feats" \
    "ark,s,cs:gunzip -c $alidir/ali.JOB.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" ark:- \| \
    nnet-copy-egs ark:- $egs_list || exit 1;
fi

# Stage 4: regroup the per-split archives into per-iteration temporary archives.
if [ $stage -le 4 ]; then
  echo "$0: rearranging examples into parts for different parallel jobs"
  # combine all the "egs_orig.JOB.*.ark" (over the $nj splits of the data) and
  # then split into multiple parts egs_tmp.JOB.*.ark for different parts of the
  # data, 0 .. $iters_per_epoch-1.

  if [ $iters_per_epoch -eq 1 ]; then
    echo "$0: Since iters-per-epoch == 1, just concatenating the data."
    for n in `seq 1 $num_jobs_nnet`; do
      cat $dir/egs/egs_orig.$n.*.ark > $dir/egs/egs_tmp.$n.0.ark || exit 1;
      # the inputs are deleted only after the concatenation succeeded
      remove $dir/egs/egs_orig.$n.*.ark
    done
  else # We'll have to split it up using nnet-copy-egs.
    # One output per iteration; JOB is the placeholder for the nnet job index.
    egs_list=
    for n in `seq 0 $[$iters_per_epoch-1]`; do
      egs_list="$egs_list ark:$dir/egs/egs_tmp.JOB.$n.ark"
    done
    # NOTE(review): a comment here used to mention a "|| true" workaround for
    # NFS bugs (Debian-7, NFS-v4), but there is no "|| true" in the command
    # below; a failure of the split job makes the script exit.
    $cmd $io_opts JOB=1:$num_jobs_nnet $dir/log/split_egs.JOB.log \
      nnet-copy-egs --random=$random_copy --srand=JOB \
        "ark:cat $dir/egs/egs_orig.JOB.*.ark|" $egs_list || exit 1;
    # error output from the cleanup is discarded
    remove $dir/egs/egs_orig.*.*.ark  2>/dev/null
  fi
fi

# Stage 5: shuffle each temporary archive into its final egs.<job>.<iter>.ark.
if [ $stage -le 5 ]; then
  # Next, shuffle the order of the examples in each of those files.
  # Each one should not be too large, so we can do this in memory.
  echo "Shuffling the order of training examples"
  echo "(in order to avoid stressing the disk, these won't all run at once)."

  for n in `seq 0 $[$iters_per_epoch-1]`; do
    # The escaped \$[...] is evaluated when the job runs, after JOB has been
    # filled in, so every (job, iteration) pair gets its own seed.
    # Exit on failure so the temporary archives are kept for a rerun; without
    # the check they would be deleted even though no shuffled output exists.
    $cmd $io_opts JOB=1:$num_jobs_nnet $dir/log/shuffle.$n.JOB.log \
      nnet-shuffle-egs "--srand=\$[JOB+($num_jobs_nnet*$n)]" \
      ark:$dir/egs/egs_tmp.JOB.$n.ark ark:$dir/egs/egs.JOB.$n.ark || exit 1;
    remove $dir/egs/egs_tmp.*.$n.ark
  done
fi

echo "$0: Finished preparing training examples"


================================================
FILE: egs/steps/online/nnet2/get_egs2.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
#
# This is modified from ../../nnet2/get_egs2.sh.  [note: get_egs2.sh is as get_egs.sh,
# but uses the newer, more compact way of writing egs. where we write multiple
# frames of labels in order to share the context.]
# This script combines the
# nnet-example extraction with the feature extraction directly from wave files;
# it uses the program online2-wav-dump-feature to do all parts of feature
# extraction: MFCC/PLP/fbank, possibly plus pitch, plus iVectors.  This script
# is intended mostly for cross-system training for online decoding, where you
# initialize the nnet from an existing, larger system.
#

# Begin configuration section.
# Defaults for this script.  The usage message further down shows these being
# overridden from the command line (e.g. --frames-per-eg, --stage) once
# parse_options.sh has been sourced.
cmd=run.pl        # wrapper used to launch jobs (see --cmd in the usage message)
frames_per_eg=8   # number of frames of labels per example.  more->less disk space and
                  # less time preparing egs, but more I/O during training.
                  # note: the script may reduce this if reduce_frames_per_eg is true.

reduce_frames_per_eg=true  # If true, this script may reduce the frames_per_eg
                           # if there is only one archive and even with the
                           # reduced frames_per_eg, the number of
                           # samples_per_iter that would result is less than or
                           # equal to the user-specified value.
num_utts_subset=300     # number of utterances in validation and training
                        # subsets used for shrinkage and diagnostics.
num_valid_frames_combine=0 # #valid frames for combination weights at the very end.
num_train_frames_combine=10000 # # train frames for the above.
num_frames_diagnostic=4000 # number of frames for "compute_prob" jobs
samples_per_iter=400000 # each iteration of training, see this many samples
                        # per job.  This is just a guideline; it will pick a number
                        # that divides the number of samples in the entire data.

stage=0             # a section guarded by "[ $stage -le N ]" is skipped when stage > N
io_opts="--max-jobs-run 5" # for jobs with a lot of I/O, limits the number running at one time.
random_copy=false   # NOTE(review): not referenced anywhere else in this script.
# End configuration section.

echo "$0 $@"  # Print the command line for logging

# Pick up the environment from path.sh when present, then let parse_options.sh
# consume the leading --option arguments.
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;


# Exactly four positional arguments must remain; otherwise print usage and exit.
# The options listed here all have a corresponding variable in the
# configuration section of this script, and the defaults shown are the values
# assigned there.
if [ $# != 4 ]; then
  echo "Usage: $0 [opts] <data> <ali-dir> <online-nnet-dir> <egs-dir>"
  echo " e.g.: $0 data/train exp/tri3_ali exp/nnet2_online/nnet_a_gpu_online/ exp/nnet2_online/nnet_b/egs"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl;utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --samples-per-iter <#samples;400000>             # Number of samples of data to process per iteration, per"
  echo "                                                   # process."
  echo "  --frames-per-eg <frames;8>                       # number of frames per eg on disk"
  echo "  --num-frames-diagnostic <#frames;4000>           # Number of frames used in computing (train,valid) diagnostics"
  echo "  --num-valid-frames-combine <#frames;0>           # Number of validation frames used in getting combination"
  echo "                                                   # weights at the very end."
  echo "  --num-train-frames-combine <#frames;10000>       # Number of training frames used in getting combination"
  echo "                                                   # weights at the very end."
  echo "  --stage <stage|0>                                # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."

  exit 1;
fi

# Positional arguments.
data=$1              # data directory (wav.scp, utt2spk, optionally segments/utt2uniq)
alidir=$2            # alignment directory (ali.*.gz, final.mdl, tree, num_jobs)
online_nnet_dir=$3   # existing online nnet2 directory (final.mdl, conf/)
dir=$4               # output egs directory

mdl=$online_nnet_dir/final.mdl # only needed for left and right context.
feature_conf=$online_nnet_dir/conf/online_nnet2_decoding.conf


# Fail early if any required input is missing.  utt2spk is included because the
# utterance subsets below are drawn from it.
for f in $data/wav.scp $data/utt2spk $alidir/ali.1.gz $alidir/final.mdl $alidir/tree $mdl $feature_conf; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done


nj=`cat $alidir/num_jobs` || exit 1;  # number of jobs in alignment dir...

# Split the data the same way as the alignments; stop here if that fails
# rather than failing later inside the per-job pipelines.
sdata=$data/split$nj
utils/split_data.sh $data $nj || exit 1;

mkdir -p $dir/log $dir/info || exit 1;
# A tree mismatch is only a warning, not an error.
! cmp $alidir/tree $online_nnet_dir/tree && \
   echo "$0: warning, tree from alignment dir does not match tree from online-nnet dir (OK if for multilingual)"
cp $alidir/tree $dir || exit 1;
# The feature config is the decoding config minus any line starting with --endpoint.
grep -v '^--endpoint' $feature_conf >$dir/feature.conf || exit 1;
mkdir -p $dir/valid $dir/train_subset || exit 1;

# Get list of validation utterances.
# Shuffle the utterance ids (first field of utt2spk) and keep the first
# $num_utts_subset of them as the validation list.
awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_utts_subset \
    > $dir/valid/uttlist || exit 1;

if [ -f $data/utt2uniq ]; then
  echo "File $data/utt2uniq exists, so augmenting valid/uttlist to"
  echo "include all perturbed versions of the same 'real' utterances."
  # Map each chosen utterance through utt2uniq, de-duplicate, then map back
  # through the inverted map (uniq2utt) and emit one utterance id per line.
  mv $dir/valid/uttlist $dir/valid/uttlist.tmp
  utils/utt2spk_to_spk2utt.pl $data/utt2uniq > $dir/uniq2utt
  cat $dir/valid/uttlist.tmp | utils/apply_map.pl $data/utt2uniq | \
    sort | uniq | utils/apply_map.pl $dir/uniq2utt | \
    awk '{for(n=1;n<=NF;n++) print $n;}' | sort  > $dir/valid/uttlist
  rm $dir/uniq2utt $dir/valid/uttlist.tmp
fi

# The training subset is drawn from the utterances that are NOT in the
# validation list, again limited to $num_utts_subset entries.
awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid/uttlist | \
  utils/shuffle_list.pl | head -$num_utts_subset > $dir/train_subset/uttlist || exit 1;


for subdir in valid train_subset; do
  # In order for the iVector extraction to work right, we need to process all
  # utterances of the speakers which have utterances in valid/uttlist, and the
  # same for train_subset/uttlist.  We produce $dir/valid/uttlist_extended which
  # will contain all utterances of all speakers which have utterances in
  # $dir/valid/uttlist, and the same for $dir/train_subset/.

  # spklist: second field of utt2spk for the selected utterances (temporary file).
  utils/filter_scp.pl $dir/$subdir/uttlist <$data/utt2spk | awk '{print $2}' > $dir/$subdir/spklist || exit 1;
  # utt2spk restricted to those speakers (filtering on field 2), then spk2utt from it.
  utils/filter_scp.pl -f 2 $dir/$subdir/spklist <$data/utt2spk >$dir/$subdir/utt2spk || exit 1;
  utils/utt2spk_to_spk2utt.pl <$dir/$subdir/utt2spk >$dir/$subdir/spk2utt || exit 1;
  awk '{print $1}' <$dir/$subdir/utt2spk >$dir/$subdir/uttlist_extended || exit 1;
  rm $dir/$subdir/spklist
done


# Build the three feature-reading pipelines (full training data per JOB, the
# validation subset, and the training subset).  Each dumps features with
# online2-wav-dump-features using $dir/feature.conf and then keeps or drops
# utterances with subset-feats.  The literal "JOB" in $feats is left in place
# here; presumably $cmd substitutes it per job -- confirm against the job wrapper.
if [ -f $data/segments ]; then
  # note: in the feature extraction, because the program online2-wav-dump-features is sensitive to the
  # previous utterances within a speaker, we do the filtering after extracting the features.
  echo "$0 [info]: segments file exists: using that."
  feats="ark,s,cs:extract-segments scp:$sdata/JOB/wav.scp $sdata/JOB/segments ark:- | online2-wav-dump-features --config=$dir/feature.conf ark:$sdata/JOB/spk2utt ark,s,cs:- ark:- | subset-feats --exclude=$dir/valid/uttlist ark:- ark:- |"
  valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid/uttlist_extended $data/segments  | extract-segments scp:$data/wav.scp - ark:- | online2-wav-dump-features --config=$dir/feature.conf ark:$dir/valid/spk2utt ark,s,cs:- ark:- | subset-feats --include=$dir/valid/uttlist ark:- ark:- |"
  train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset/uttlist_extended $data/segments  | extract-segments scp:$data/wav.scp - ark:- | online2-wav-dump-features --config=$dir/feature.conf ark:$dir/train_subset/spk2utt ark,s,cs:- ark:- | subset-feats --include=$dir/train_subset/uttlist ark:- ark:- |"
else
  echo "$0 [info]: no segments file exists, using wav.scp."
  feats="ark,s,cs:online2-wav-dump-features --config=$dir/feature.conf ark:$sdata/JOB/spk2utt scp:$sdata/JOB/wav.scp ark:- | subset-feats --exclude=$dir/valid/uttlist ark:- ark:- |"
  valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid/uttlist_extended $data/wav.scp | online2-wav-dump-features --config=$dir/feature.conf ark:$dir/valid/spk2utt scp:- ark:- | subset-feats --include=$dir/valid/uttlist ark:- ark:- |"
  train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset/uttlist_extended $data/wav.scp | online2-wav-dump-features --config=$dir/feature.conf ark:$dir/train_subset/spk2utt scp:- ark:- | subset-feats --include=$dir/train_subset/uttlist ark:- ark:- |"
fi

# Ask the feature program for the iVector dimension as a sanity check on the
# feature config.
# NOTE(review): $ivector_dim is only checked here and is not used again in this
# script; $ivectors_opt, which is passed to nnet-get-egs further down, is never
# assigned in this script and so expands to nothing unless set in the environment.
ivector_dim=$(online2-wav-dump-features --config=$dir/feature.conf --print-ivector-dim=true) || exit 1;

! [ $ivector_dim -ge 0 ] && echo "$0: error getting iVector dim" && exit 1;


# Read the model's left and right context from the output of nnet-am-info.
# pipefail is switched on only around these two pipelines so that a failure in
# any stage (nnet-am-info, or grep finding no matching line), not just the
# final awk, triggers the "|| exit 1".
set -o pipefail
left_context=$(nnet-am-info $mdl | grep '^left-context' | awk '{print $2}') || exit 1;
right_context=$(nnet-am-info $mdl | grep '^right-context' | awk '{print $2}') || exit 1;
set +o pipefail


# Determine the total number of training frames.  At stage 0 it is computed and
# cached in $dir/info/num_frames; on a later-stage rerun the cached value is
# read back.
if [ $stage -le 0 ]; then
  echo "$0: working out number of frames of training data"
  # Stop if the frame count cannot be obtained, instead of caching an empty
  # value and failing later in the arithmetic below.
  num_frames=$(steps/nnet2/get_num_frames.sh $data) || exit 1;
  [ -z "$num_frames" ] && echo "$0: error getting the number of frames" && exit 1;
  echo $num_frames > $dir/info/num_frames
else
  num_frames=$(cat $dir/info/num_frames) || exit 1;
fi

# the + 1 is to round up, not down... we assume it doesn't divide exactly.
num_archives=$(($num_frames/($frames_per_eg*$samples_per_iter)+1))
# (for small data)- while reduce_frames_per_eg == true and the number of
# archives is 1 and would still be 1 if we reduced frames_per_eg by 1, reduce it
# by 1.
reduced=false
while $reduce_frames_per_eg && [ $frames_per_eg -gt 1 ] && \
  [ $(($num_frames/(($frames_per_eg-1)*$samples_per_iter))) -eq 0 ]; do
  frames_per_eg=$(($frames_per_eg-1))
  num_archives=1
  reduced=true
done
$reduced && echo "$0: reduced frames_per_eg to $frames_per_eg because amount of data is small."

# Record the final values for later use.
echo $num_archives >$dir/info/num_archives
echo $frames_per_eg >$dir/info/frames_per_eg

# Working out number of egs per archive
egs_per_archive=$(($num_frames/($frames_per_eg*$num_archives)))
# Sanity check: by construction this should never exceed the requested value.
! [ $egs_per_archive -le $samples_per_iter ] && \
  echo "$0: script error: egs_per_archive=$egs_per_archive not <= samples_per_iter=$samples_per_iter" \
  && exit 1;

echo $egs_per_archive > $dir/info/egs_per_archive

echo "$0: creating $num_archives archives, each with $egs_per_archive egs, with"
echo "$0:   $frames_per_eg labels per example, and (left,right) context = ($left_context,$right_context)"

# Options handed to every nnet-get-egs call below.
nnet_context_opts="--left-context=$left_context --right-context=$right_context"

# Set up soft links into the storage directories for every archive this script
# will write.  This is a no-op unless the subdirectory $dir/storage/ exists.
# See utils/create_split_dir.pl
for x in $(seq $num_archives); do
  # final archive for this index
  utils/create_data_link.pl $dir/egs.$x.ark
  # intermediate archives: one per alignment split
  for y in $(seq $nj); do
    utils/create_data_link.pl $dir/egs_orig.$x.$y.ark
  done
done

# Stage 2: build the small validation / training-subset example sets used for
# diagnostics and for the final combination weights.
if [ $stage -le 2 ]; then
  echo "$0: Getting validation and training subset examples."
  # $dir/.error is the marker the background jobs below touch on failure;
  # clear any stale one first.
  rm $dir/.error 2>/dev/null
  echo "$0: ... extracting validation and training-subset alignments."
  # Concatenate all alignment archives, convert to text form and keep only the
  # utterances in the two subset lists.  pipefail makes a failure in any stage
  # of the pipeline fatal.
  set -o pipefail;
  for id in $(seq $nj); do gunzip -c $alidir/ali.$id.gz; done | \
    copy-int-vector ark:- ark,t:- | \
    utils/filter_scp.pl <(cat $dir/valid/uttlist $dir/train_subset/uttlist) | \
    gzip -c >$dir/ali_special.gz || exit 1;
  set +o pipefail; # unset the pipefail option.

  # The two extraction jobs run in parallel in the background.
  $cmd $dir/log/create_valid_subset.log \
    nnet-get-egs $ivectors_opt $nnet_context_opts "$valid_feats" \
    "ark,s,cs:gunzip -c $dir/ali_special.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \
     "ark:$dir/valid_all.egs" || touch $dir/.error &
  $cmd $dir/log/create_train_subset.log \
    nnet-get-egs $ivectors_opt $nnet_context_opts "$train_subset_feats" \
     "ark,s,cs:gunzip -c $dir/ali_special.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \
     "ark:$dir/train_subset_all.egs" || touch $dir/.error &
  wait;
  [ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1;
  echo "... Getting subsets of validation examples for diagnostics and combination."
  # Four more background jobs: combine and diagnostic subsets for each of the
  # validation and training-subset example sets.
  $cmd $dir/log/create_valid_subset_combine.log \
    nnet-subset-egs --n=$num_valid_frames_combine ark:$dir/valid_all.egs \
        ark:$dir/valid_combine.egs || touch $dir/.error &
  $cmd $dir/log/create_valid_subset_diagnostic.log \
    nnet-subset-egs --n=$num_frames_diagnostic ark:$dir/valid_all.egs \
    ark:$dir/valid_diagnostic.egs || touch $dir/.error &

  $cmd $dir/log/create_train_subset_combine.log \
    nnet-subset-egs --n=$num_train_frames_combine ark:$dir/train_subset_all.egs \
    ark:$dir/train_combine.egs || touch $dir/.error &
  $cmd $dir/log/create_train_subset_diagnostic.log \
    nnet-subset-egs --n=$num_frames_diagnostic ark:$dir/train_subset_all.egs \
    ark:$dir/train_diagnostic.egs || touch $dir/.error &
  wait
  sleep 5  # wait for file system to sync.
  # The four jobs above touch .error on failure; check it here so a failed job
  # stops the script instead of its (possibly partial) output being used.
  [ -f $dir/.error ] && echo "Error detected while creating egs" && exit 1;
  # combine.egs is the concatenation of the validation and training combine sets.
  cat $dir/valid_combine.egs $dir/train_combine.egs > $dir/combine.egs

  # All three outputs must exist and be non-empty.
  for f in $dir/{combine,train_diagnostic,valid_diagnostic}.egs; do
    [ ! -s $f ] && echo "No examples in file $f" && exit 1;
  done
  rm $dir/valid_all.egs $dir/train_subset_all.egs $dir/{train,valid}_combine.egs $dir/ali_special.gz
fi

# Stage 3: dump the training examples, one job per alignment split.
if [ $stage -le 3 ]; then
  # create egs_orig.*.*.ark; the first index goes to $num_archives,
  # the second to $nj (which is the number of jobs in the original alignment
  # dir)

  # One output per archive; the literal JOB in each name is the placeholder
  # for the alignment-split index of the job writing it.
  egs_list=
  for n in $(seq $num_archives); do
    egs_list="$egs_list ark:$dir/egs_orig.$n.JOB.ark"
  done
  echo "$0: Generating training examples on disk"

  # The examples will go round-robin to egs_list.
  # The escaped "\|" is passed through to the command run by $cmd instead of
  # being interpreted as a pipe by this shell.
  $cmd $io_opts JOB=1:$nj $dir/log/get_egs.JOB.log \
    nnet-get-egs $ivectors_opt $nnet_context_opts --num-frames=$frames_per_eg "$feats" \
    "ark,s,cs:gunzip -c $alidir/ali.JOB.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" ark:- \| \
    nnet-copy-egs ark:- $egs_list || exit 1;
fi

# Stage 4: merge the per-split archives of each archive index and shuffle them.
if [ $stage -le 4 ]; then
  echo "$0: recombining and shuffling order of archives on disk"
  # combine all the "egs_orig.JOB.*.ark" (over the $nj splits of the data) and
  # shuffle the order, writing to the egs.JOB.ark

  # Here JOB is the archive index and $n runs over the alignment splits, so
  # egs_list names every split's contribution to one archive.
  egs_list=
  for n in $(seq $nj); do
    egs_list="$egs_list $dir/egs_orig.JOB.$n.ark"
  done

  # NOTE(review): $extra_opts is never assigned in this script, so it expands
  # to nothing unless it is set in the environment.
  $cmd $io_opts $extra_opts JOB=1:$num_archives $dir/log/shuffle.JOB.log \
    nnet-shuffle-egs --srand=JOB "ark:cat $egs_list|" ark:$dir/egs.JOB.ark  || exit 1;
fi

# Stage 5: delete the intermediate egs_orig.<archive>.<split>.ark files.
if [ $stage -le 5 ]; then
  echo "$0: removing temporary archives"
  for x in $(seq $num_archives); do
    for y in $(seq $nj); do
      file=$dir/egs_orig.$x.$y.ark
      # For a soft link, first delete whatever utils/make_absolute.sh prints
      # for it, then delete the path itself.
      if [ -L $file ]; then
        rm $(utils/make_absolute.sh $file)
      fi
      rm $file
    done
  done
fi

echo "$0: Finished preparing training examples"


================================================
FILE: egs/steps/online/nnet2/get_egs_discriminative2.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.

# This script dumps examples MPE or MMI or state-level minimum bayes risk (sMBR)
# training of neural nets.  Note: for "criterion", smbr > mpe > mmi in terms of
# compatibility of the dumped egs, meaning you can use the egs dumped with
# --criterion smbr for MPE or MMI, and egs dumped with --criterion mpe for MMI
# training.  The discriminative training program itself doesn't enforce this and
# it would let you mix and match them arbitrarily; we are speaking in terms of
# the correctness of the algorithm that splits the lattices into pieces.

# Begin configuration section.
# Each variable below can be overridden from the command line as
# --<variable-name-with-dashes> <value>; this is handled by parse_options.sh,
# which is sourced further down.
cmd=run.pl
criterion=smbr
drop_frames=false #  option relevant for MMI, affects how we dump examples.
samples_per_iter=400000 # measured in frames, not in "examples"
max_temp_archives=128 # maximum number of temp archives per input job, only
                      # affects the process of generating archives, not the
                      # final result.

stage=0
iter=final
cleanup=true
# End configuration section.


echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;


# The usage message only lists options that are backed by a variable in the
# configuration section above, with the defaults that are actually in effect.
if [ $# != 6 ]; then
  echo "Usage: $0 [opts] <data> <lang> <ali-dir> <denlat-dir> <src-online-nnet2-dir> <degs-dir>"
  echo " e.g.: $0 data/train data/lang exp/nnet2_online/nnet_a_online{_ali,_denlats,_degs}"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs (probably would be good to add --max-jobs-run 5 or so if using"
  echo "                                                   # GridEngine (to avoid excessive NFS traffic)."
  echo "  --samples-per-iter <#samples|400000>             # Number of samples of data to process per iteration, per"
  echo "                                                   # process."
  echo "  --max-temp-archives <n|128>                      # Maximum number of temporary archives per input job; only"
  echo "                                                   # affects how the archives are generated, not the final result."
  echo "  --stage <stage|0>                                # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."
  echo "  --criterion <criterion|smbr>                     # Training criterion: may be smbr, mmi or mpfe"
  echo "  --drop-frames <true|false>                       # Option relevant for MMI; affects how examples are dumped"
  echo "                                                   # (default: false)"
  echo "  --iter <model-iteration|final>                   # Iteration of the model in <src-online-nnet2-dir> to use."
  echo "  --cleanup <true|false>                           # If true, remove the temporary archives at the end"
  echo "                                                   # (default: true)"
  exit 1;
fi

# Positional arguments (see the usage message).
data=$1
lang=$2
alidir=$3
denlatdir=$4
srcdir=$5
dir=$6


# Check some files.
for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/num_jobs $alidir/tree \
         $denlatdir/lat.1.gz $denlatdir/num_jobs $srcdir/$iter.mdl $srcdir/conf/online_nnet2_decoding.conf; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

mkdir -p $dir/log $dir/info || exit 1;

utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

nj=$(cat $denlatdir/num_jobs) || exit 1; # $nj is the number of
                                         # splits of the denlats and alignments.


nj_ali=$(cat $alidir/num_jobs) || exit 1;

# The data is split to match the denominator lattices; stop here if the split
# fails rather than letting the later jobs fail on missing $sdata/JOB files.
sdata=$data/split$nj
utils/split_data.sh $data $nj || exit 1;


if [ $nj_ali -eq $nj ]; then
  # Alignments and lattices use the same job split: read ali.JOB.gz directly.
  ali_rspecifier="ark,s,cs:gunzip -c $alidir/ali.JOB.gz |"
else
  # Different split: dump all alignments into one indexed archive so that each
  # job can look up its utterances through the scp file.
  ali_rspecifier="scp:$dir/ali.scp"
  if [ $stage -le 1 ]; then
    echo "$0: number of jobs in den-lats versus alignments differ: dumping them as single archive and index."
    alis=$(for n in $(seq $nj_ali); do echo -n "$alidir/ali.$n.gz "; done)
    copy-int-vector --print-args=false \
      "ark:gunzip -c $alis|" ark,scp:$dir/ali.ark,$dir/ali.scp || exit 1;
  fi
fi


# Note: $silphonelist is not used further down in this script; the assignment
# still makes the script stop early if silence.csl cannot be read.
silphonelist=`cat $lang/phones/silence.csl` || exit 1;

cp $alidir/tree $dir || exit 1;
cp $lang/phones/silence.csl $dir/info || exit 1;
cp $srcdir/$iter.mdl $dir/final.mdl || exit 1;

# The feature config is the online decoding config minus the endpointing options.
grep -v '^--endpoint' $srcdir/conf/online_nnet2_decoding.conf >$dir/feature.conf || exit 1;

ivector_dim=$(online2-wav-dump-features --config=$dir/feature.conf --print-ivector-dim=true) || exit 1;

# Validate the value before recording it, so that a bad value is never left
# behind in $dir/info.
! [ "$ivector_dim" -ge 0 ] && echo "$0: error getting iVector dim" && exit 1;

echo $ivector_dim > $dir/info/ivector_dim

# Build the per-job feature rspecifier ("JOB" is substituted by $cmd).  Both
# variants run online2-wav-dump-features with the same config and spk2utt; they
# differ only in where the wave data comes from.
dump_cmd="online2-wav-dump-features --config=$dir/feature.conf ark:$sdata/JOB/spk2utt"
if [ ! -f $data/segments ]; then
  echo "$0 [info]: no segments file exists, using wav.scp."
  feats="ark,s,cs:$dump_cmd scp:$sdata/JOB/wav.scp ark:- |"
else
  # note: in the feature extraction, because the program online2-wav-dump-features is sensitive to the
  # previous utterances within a speaker, we do the filtering after extracting the features.
  echo "$0 [info]: segments file exists: using that."
  feats="ark,s,cs:extract-segments scp:$sdata/JOB/wav.scp $sdata/JOB/segments ark:- | $dump_cmd ark,s,cs:- ark:- |"
fi


# Work out (or, when resuming at a later stage, read back) how many final and
# temporary archives are used and how many frames go into each.
if ! [ $stage -le 2 ]; then
  # Resuming: reuse the values recorded by an earlier run.
  num_archives=$(cat $dir/info/num_archives) || exit 1;
  num_archives_temp=$(cat $dir/info/num_archives_temp) || exit 1;
  frames_per_archive=$(cat $dir/info/frames_per_archive) || exit 1;
else
  echo "$0: working out number of frames of training data"
  num_frames=$(steps/nnet2/get_num_frames.sh $data)

  echo $num_frames > $dir/info/num_frames

  # Total number of archives: integer division plus one, i.e. rounded up on
  # the assumption that num-frames does not divide exactly.
  num_archives=$(($num_frames/$samples_per_iter + 1))

  # Each input job may temporarily be split into fewer than $num_archives
  # pieces, so that a job does not hold an excessive number of filehandles.
  archive_ratio=$(($num_archives/$max_temp_archives+1))
  num_archives_temp=$(($num_archives/$archive_ratio))
  # Adjust $num_archives slightly so that it is an exact multiple of
  # $archive_ratio.
  num_archives=$(($num_archives_temp*$archive_ratio))

  echo $num_archives >$dir/info/num_archives || exit 1
  echo $num_archives_temp >$dir/info/num_archives_temp || exit 1

  frames_per_archive=$(($num_frames/$num_archives))

  # This is the number of frames per archive prior to discarding frames.
  echo $frames_per_archive > $dir/info/frames_per_archive
fi

echo "$0: Splitting the data up into $num_archives archives (using $num_archives_temp temporary pieces per input job)"
echo "$0: giving samples-per-iteration of $frames_per_archive (you requested $samples_per_iter)."

# The data links are (re)created whatever the stage is, because a link that
# was deleted earlier may need to exist again for this run.

if [ -d $dir/storage ]; then
  echo "$0: creating data links for distributed storage of degs"
  # See utils/create_split_dir.pl for how this 'storage' directory is created.

  # Links for the per-job temporary pieces.
  for job in $(seq $nj); do
    for piece in $(seq $num_archives_temp); do
      utils/create_data_link.pl $dir/degs_orig.$job.$piece.ark
    done
  done

  # Links for the final archives.
  for archive in $(seq $num_archives); do
    utils/create_data_link.pl $dir/degs.$archive.ark
  done

  # Links for the intermediate archives, which only exist when the data is
  # re-split.
  if [ $num_archives_temp -ne $num_archives ]; then
    for archive in $(seq $num_archives); do
      utils/create_data_link.pl $dir/degs_temp.$archive.ark
    done
  fi
fi

# Stage 3: for each of the $nj input jobs, create discriminative training
# examples from the features, alignments and denominator lattices, and
# distribute them over $num_archives_temp temporary archives
# degs_orig.JOB.<n>.ark.
if [ $stage -le 3 ]; then
  echo "$0: getting initial training examples by splitting lattices"

  # One wspecifier per temporary archive; "JOB" is left in literally here and
  # is substituted when $cmd launches each job.
  degs_list=$(for n in $(seq $num_archives_temp); do echo -n "ark:$dir/degs_orig.JOB.$n.ark "; done)

  # The escaped pipe (\|) is passed through to $cmd, so the pipeline is formed
  # inside each job rather than in this script.
  # NOTE(review): $const_dim_opt is not assigned anywhere in this script, so
  # it expands to nothing unless it is inherited from the environment.
  $cmd JOB=1:$nj $dir/log/get_egs.JOB.log \
    nnet-get-egs-discriminative --criterion=$criterion --drop-frames=$drop_frames \
      "$srcdir/$iter.mdl" "$feats" "$ali_rspecifier" "ark,s,cs:gunzip -c $denlatdir/lat.JOB.gz|" ark:- \| \
    nnet-copy-egs-discriminative $const_dim_opt ark:- $degs_list || exit 1;
  sleep 5;  # wait a bit so NFS has time to write files.
fi

# Stage 4: combine, for each temporary archive index, the pieces written by
# all $nj input jobs.  If no re-splitting is needed the result is shuffled
# straight into the final archives; otherwise it is re-split into the
# (un-shuffled) degs_temp.*.ark archives, which stage 5 then shuffles.
if [ $stage -le 4 ]; then

  # The pieces of archive JOB from every input job; here JOB indexes the
  # temporary archive, not the input job.
  degs_list=$(for n in $(seq $nj); do echo -n "$dir/degs_orig.$n.JOB.ark "; done)

  if [ $num_archives -eq $num_archives_temp ]; then
    echo "$0: combining data into final archives and shuffling it"

    $cmd JOB=1:$num_archives $dir/log/shuffle.JOB.log \
      cat $degs_list \| nnet-shuffle-egs-discriminative --srand=JOB ark:- \
       ark:$dir/degs.JOB.ark || exit 1;
  else
    echo "$0: combining and re-splitting data into un-shuffled versions of final archives."

    # Number of final archives produced from each temporary archive.
    archive_ratio=$[$num_archives/$num_archives_temp]
    ! [ $archive_ratio -gt 1 ] && echo "$0: Bad archive_ratio $archive_ratio" && exit 1;

    # note: the \$[ .. ] won't be evaluated until the job gets executed.  The
    # aim is to write to the archives with the final numbering, 1
    # ... num_archives, which is more than num_archives_temp.  The list with
    # \$[... ] expressions in it computes the set of final indexes for each
    # temporary index.
    degs_list_out=$(for n in $(seq $archive_ratio); do echo -n "ark:$dir/degs_temp.\$[((JOB-1)*$archive_ratio)+$n].ark "; done)
    # e.g. if dir=foo and archive_ratio=2, we'd have
    # degs_list_out='ark:foo/degs_temp.$[((JOB-1)*2)+1].ark ark:foo/degs_temp.$[((JOB-1)*2)+2].ark '

    $cmd JOB=1:$num_archives_temp $dir/log/resplit.JOB.log \
      cat $degs_list \| nnet-copy-egs-discriminative --srand=JOB ark:- \
      $degs_list_out || exit 1;
  fi
fi

# Stage 5: only needed when the data was re-split in stage 4; shuffles each
# degs_temp.JOB.ark into the corresponding final degs.JOB.ark.
if [ $stage -le 5 ]; then
  if [ $num_archives -ne $num_archives_temp ]; then
    echo "$0: shuffling final archives."

    $cmd JOB=1:$num_archives $dir/log/shuffle.JOB.log \
      nnet-shuffle-egs-discriminative --srand=JOB ark:$dir/degs_temp.JOB.ark \
        ark:$dir/degs.JOB.ark || exit 1

  fi
fi

# Remove one temporary archive.  If the archive is a symbolic link, the path
# printed by utils/make_absolute.sh for it (presumably the link's target --
# confirm against that script) is removed first, then the archive path itself.
remove_temp_archive() {
  [ -L $1 ] && rm $(utils/make_absolute.sh $1); rm $1
}

if $cleanup; then
  echo "$0: removing temporary archives."
  # Per-job pieces written in stage 3.
  for job in $(seq $nj); do
    for piece in $(seq $num_archives_temp); do
      remove_temp_archive $dir/degs_orig.$job.$piece.ark
    done
  done
  # Intermediate archives, which only exist when the data was re-split.
  if [ $num_archives_temp -ne $num_archives ]; then
    for archive in $(seq $num_archives); do
      remove_temp_archive $dir/degs_temp.$archive.ark
    done
  fi
fi

echo "$0: Done."


================================================
FILE: egs/steps/online/nnet2/get_pca_transform.sh
================================================
#!/usr/bin/env bash

# Copyright 2016  David Snyder
#
# This script computes a PCA transform on top of spliced features processed with
# apply-cmvn-online.
#
#
# Apache 2.0.

# Begin configuration.
# Each variable below can be overridden from the command line as
# --<variable-name-with-dashes> <value> (handled by parse_options.sh).
cmd=run.pl
config=
stage=0
dim=40 # The dim after applying PCA
normalize_variance=true # If the PCA transform normalizes the variance
normalize_mean=true # If the PCA transform centers
splice_opts=
online_cmvn_opts=
max_utts=5000 # maximum number of files to use
subsample=5 # subsample features with this periodicity
# End configuration.

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh
. parse_options.sh || exit 1;

if [ $# != 2 ]; then
  echo "Usage: $0 [options] <data> <dir>"
  echo " e.g.: $0 data/train_si84 exp/tri2b"
  echo "Main options (for others, see top of script file)"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --config <config-file>                           # config containing options"
  echo "  --stage <stage>                                  # stage to do partial re-run from."
  exit 1;
fi

data=$1
dir=$2

# Both files are read below: feats.scp for the PCA input features and cmvn.scp
# for the global CMVN stats.  Checking cmvn.scp here gives a clear message
# instead of the generic matrix-sum failure further down (whose stderr is
# discarded).
for f in $data/feats.scp $data/cmvn.scp; do
  [ ! -f "$f" ] && echo "$0: expecting file $f to exist" && exit 1
done

mkdir -p $dir/log || exit 1;

echo "$splice_opts" >$dir/splice_opts # keep track of frame-splicing options
           # so that later stages of system building can know what they were.
echo $online_cmvn_opts > $dir/online_cmvn.conf # keep track of options to CMVN.

# create global_cmvn.stats by summing the per-speaker stats listed in cmvn.scp.
if ! matrix-sum --binary=false scp:$data/cmvn.scp - >$dir/global_cmvn.stats 2>/dev/null; then
  echo "$0: Error summing cmvn stats"
  exit 1
fi

# Feature pipeline for PCA estimation: at most $max_utts utterances, online
# CMVN using the global stats, frame splicing, then keeping every
# $subsample'th frame.
feats="ark,s,cs:utils/subset_scp.pl --quiet $max_utts $data/feats.scp | apply-cmvn-online $online_cmvn_opts $dir/global_cmvn.stats scp:- ark:- | splice-feats $splice_opts ark:- ark:- | subsample-feats --n=$subsample ark:- ark:- |"

if [ $stage -le 0 ]; then
  $cmd $dir/log/pca_est.log \
    est-pca --dim=$dim --normalize-variance=$normalize_variance \
    --normalize-mean=$normalize_mean "$feats" $dir/final.mat || exit 1;
fi

echo "Done estimating PCA transform in $dir"

exit 0


================================================
FILE: egs/steps/online/nnet2/make_denlats.sh
================================================
#!/usr/bin/env bash
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.

# Create denominator lattices for MMI/MPE training.
# This version uses the online-nnet2 features.
#
# Creates its output in $dir/lat.*.gz

# Begin configuration section.
# Each variable below can be overridden from the command line as
# --<variable-name-with-dashes> <value> (handled by parse_options.sh).
stage=0
nj=4
cmd=run.pl
sub_split=1
beam=13.0
lattice_beam=7.0
acwt=0.1
max_active=5000
max_mem=20000000 # This will stop the processes getting too large.
# This is in bytes, but not "real" bytes-- you have to multiply
# by something like 5 or 10 to get real bytes (not sure why so large)
num_threads=1
parallel_opts=  # ignored now.
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

# The usage message describes this (online-nnet2) version of the script: the
# features are computed from the wave files with the configuration found in
# <src-dir>, and there is no --transform-dir option.
if [ $# != 4 ]; then
  echo "Usage: $0 [options] <data-dir> <lang-dir> <src-dir> <exp-dir>"
  echo "  e.g.: $0 data/train data/lang exp/nnet2_online/nnet_a_online exp/nnet2_online/nnet_a_denlats"
  echo "Features are extracted from the wave files using the configuration in"
  echo " <src-dir>/conf/online_nnet2_decoding.conf."
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config containing options"
  echo "  --nj <nj>                                        # number of parallel jobs"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --sub-split <n-split>                            # e.g. 40; use this for "
  echo "                           # large databases so your jobs will be smaller and"
  echo "                           # will (individually) finish reasonably soon."
  echo "  --num-threads  <n>                # number of threads per decoding job"
  exit 1;
fi

# Positional arguments (see the usage message).
data=$1; lang=$2; srcdir=$3; dir=$4

# Input files that must exist before anything is written.
for required in $data/wav.scp $lang/L.fst $srcdir/final.mdl $srcdir/conf/online_nnet2_decoding.conf; do
  [ ! -f $required ] && echo "$0: expected file $required to exist" && exit 1;
done

sdata=$data/split$nj

# Suffix and option appended to the decoder name when more than one thread is
# requested (giving "nnet-latgen-faster-parallel --num-threads=N").
if [ $num_threads -gt 1 ]; then
  thread_string="-parallel --num-threads=$num_threads"
else
  thread_string=
fi

mkdir -p $dir/log
split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs

utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1;

oov=$(cat $lang/oov.int) || exit 1;


# Location of the copy of the lang directory inside $dir; the unigram grammar
# FST for the denominator graph is written there.
new_lang=$dir/$(basename "$lang")


# The feature config is the online decoding config minus the endpointing options.
grep -v '^--endpoint' $srcdir/conf/online_nnet2_decoding.conf >$dir/feature.conf || exit 1;

# Stage 0: build the denominator decoding graph from a unigram grammar
# estimated on the training transcripts.
if [ $stage -le 0 ]; then
  # mkgraph.sh expects a whole directory "lang", so put everything in one directory...
  # it gets L_disambig.fst and G.fst (among other things) from $dir/lang, and
  # final.mdl from $srcdir; the output HCLG.fst goes in $dir/graph.

  cp -rH $lang $dir/

  hclg=$dir/dengraph/HCLG.fst
  echo "Compiling decoding graph in $dir/dengraph"
  if ! { [ -s $hclg ] && [ $hclg -nt $srcdir/final.mdl ]; }; then
    # No graph yet, or it is not newer than the model: (re)build it.
    echo "Making unigram grammar FST in $new_lang"
    # Map the words of each transcript to integers (OOVs to $oov), print only
    # the word fields, and compile the resulting unigram grammar.
    cat $data/text | utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt | \
      awk '{for(n=2;n<=NF;n++){ printf("%s ", $n); } printf("\n"); }' | \
      utils/make_unigram_grammar.pl | fstcompile | fstarcsort --sort_type=ilabel > $new_lang/G.fst \
      || exit 1;
    utils/mkgraph.sh $new_lang $srcdir $dir/dengraph || exit 1;
  else
    echo "Graph $dir/dengraph/HCLG.fst already exists: skipping graph creation."
  fi
fi


# Build the per-job feature rspecifier ("JOB" is substituted by $cmd).  Both
# variants run online2-wav-dump-features with the same config and spk2utt; they
# differ only in where the wave data comes from.
dump_cmd="online2-wav-dump-features --config=$dir/feature.conf ark:$sdata/JOB/spk2utt"
if [ ! -f $data/segments ]; then
  echo "$0 [info]: no segments file exists, using wav.scp."
  feats="ark,s,cs:$dump_cmd scp:$sdata/JOB/wav.scp ark:- |"
else
  # note: in the feature extraction, because the program online2-wav-dump-features is sensitive to the
  # previous utterances within a speaker, we do the filtering after extracting the features.
  echo "$0 [info]: segments file exists: using that."
  feats="ark,s,cs:extract-segments scp:$sdata/JOB/wav.scp $sdata/JOB/segments ark:- | $dump_cmd ark,s,cs:- ark:- |"
fi


# if this job is interrupted by the user, we want any background jobs to be
# killed too.
# cleanup: send the default kill signal to the process IDs reported by
# 'jobs -pr'; does nothing when that list is empty.
cleanup() {
  local pids=$(jobs -pr)
  [ -n "$pids" ] && kill $pids
}
# The handler is installed for INT, QUIT and TERM as well as for EXIT.  Note
# that it does not itself call 'exit'.
trap "cleanup" INT QUIT TERM EXIT


# Generate the denominator lattices, writing $dir/lat.<job>.gz for each of the
# $nj data splits.  With --sub-split 1 each split is decoded by one job;
# otherwise each split is decoded in $sub_split pieces which are then merged.
if [ $sub_split -eq 1 ]; then
  $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode_den.JOB.log \
   nnet-latgen-faster$thread_string --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \
    --max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $srcdir/final.mdl  \
     $dir/dengraph/HCLG.fst "$feats" "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1;
else
  # each job from 1 to $nj is split into multiple pieces (sub-split), and we aim
  # to have at most two jobs running at each time.  The idea is that if we have stragglers
  # from one job, we can be processing another one at the same time.
  rm $dir/.error 2>/dev/null

  prev_pid=
  # The loop runs one extra iteration (n = nj+1) that starts no new job; it
  # only waits for, and merges the output of, the last real subset.
  for n in `seq $[nj+1]`; do
    if [ $n -gt $nj ]; then
      this_pid=
    elif [ -f $dir/.done.$n ] && [ $dir/.done.$n -nt $srcdir/final.mdl ]; then
      echo "Not processing subset $n as already done (delete $dir/.done.$n if not)";
      this_pid=
    else
      # NOTE(review): sdata2 is assigned but not referenced again in this script.
      sdata2=$data/split$nj/$n/split${sub_split}utt;
      split_data.sh --per-utt $sdata/$n $sub_split || exit 1;
      mkdir -p $dir/log/$n
      mkdir -p $dir/part
      # Point the feature pipeline at the per-utterance sub-splits of subset $n:
      # every "JOB/" becomes "$n/split${sub_split}utt/JOB/".  The first sed
      # rewrites "trans.JOB", a string that does not occur in the $feats
      # values built by this script.
      feats_subset=`echo $feats | sed "s/trans.JOB/trans.$n/g" | sed s:JOB/:$n/split${sub_split}utt/JOB/:g`

      # Run in the background; a failure is recorded by creating $dir/.error.
      $cmd --num-threads $num_threads JOB=1:$sub_split $dir/log/$n/decode_den.JOB.log \
        nnet-latgen-faster$thread_string --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \
        --max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $srcdir/final.mdl  \
          $dir/dengraph/HCLG.fst "$feats_subset" "ark:|gzip -c >$dir/lat.$n.JOB.gz" || touch $dir/.error &
      this_pid=$!
    fi
    if [ ! -z "$prev_pid" ]; then  # Wait for the previous job; merge the previous set of lattices.
      wait $prev_pid
      [ -f $dir/.error ] && echo "$0: error generating denominator lattices" && exit 1;
      rm $dir/.merge_error 2>/dev/null
      echo Merging archives for data subset $prev_n
      # Concatenate the uncompressed sub-split lattices, in order, into a
      # single compressed archive for the subset.
      for k in `seq $sub_split`; do
        gunzip -c $dir/lat.$prev_n.$k.gz || touch $dir/.merge_error;
      done | gzip -c > $dir/lat.$prev_n.gz || touch $dir/.merge_error;
      [ -f $dir/.merge_error ] && echo "$0: Merging lattices for subset $prev_n failed (or maybe some other error)" && exit 1;
      rm $dir/lat.$prev_n.*.gz
      # Marker checked at the top of the loop so that a re-run can skip this subset.
      touch $dir/.done.$prev_n
    fi
    prev_n=$n
    prev_pid=$this_pid
  done
fi


echo "$0: done generating denominator lattices."


================================================
FILE: egs/steps/online/nnet2/prepare_online_decoding.sh
================================================
#!/usr/bin/env bash

# Copyright 2014  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0

# Begin configuration.
# Each variable below can be overridden from the command line as
# --<variable-name-with-dashes> <value> (handled by parse_options.sh).
stage=0 # This allows restarting after partway, when something went wrong.
feature_type=mfcc
add_pitch=false
mfcc_config=conf/mfcc.conf # you can override any of these you need to override.
plp_config=conf/plp.conf
fbank_config=conf/fbank.conf
# online_pitch_config is the config file for both pitch extraction and
# post-processing; we combine them into one because during training this
# is given to the program compute-and-process-kaldi-pitch-feats.
online_pitch_config=conf/online_pitch.conf

# Below are some options that affect the iVectors, and should probably
# match those used in extract_ivectors_online.sh.
num_gselect=5 # Gaussian-selection using diagonal model: number of Gaussians to select
posterior_scale=0.1 # Scale on the acoustic posteriors, intended to account for
                    # inter-frame correlations.
min_post=0.025 # Minimum posterior to use (posteriors below this are pruned out)
               # caution: you should use the same value in the online-estimation
               # code.
max_count=100   # This max-count of 100 can make iVectors more consistent for
                # different lengths of utterance, by scaling up the prior term
                # when the data-count exceeds this value.  The data-count is
                # after posterior-scaling, so assuming the posterior-scale is
                # 0.1, --max-count 100 starts having effect after 1000 frames,
                # or 10 seconds of data.
iter=final
# End configuration.

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh;
. parse_options.sh || exit 1;

# The <ivector-extractor-dir> argument is optional, hence 3 or 4 arguments.
# The usage message only lists options that are backed by a variable in the
# configuration section above.
if [ $# -ne 4 ] && [ $# -ne 3 ]; then
   echo "Usage: $0 [options] <lang-dir> [<ivector-extractor-dir>] <nnet-dir> <output-dir>"
   echo "e.g.: $0 data/lang exp/nnet2_online/extractor exp/nnet2_online/nnet exp/nnet2_online/nnet_online"
   echo "main options (for others, see top of script file)"
   echo "  --feature-type <mfcc|plp|fbank>                  # Type of the base features; "
   echo "                                                   # important to generate the correct"
   echo "                                                   # configs in <output-dir>/conf/"
   echo "  --add-pitch <true|false>                         # Append pitch features to cmvn"
   echo "                                                   # (default: false)"
   echo "  --config <config-file>                           # config containing options"
   echo "  --iter <model-iteration|final>                   # iteration of model to take."
   echo "  --stage <stage>                                  # stage to do partial re-run from."
   exit 1;
fi


# Positional arguments; $iedir stays empty when no iVector extractor
# directory was given.
case $# in
  4)
    lang=$1
    iedir=$2
    srcdir=$3
    dir=$4
    ;;
  3)
    lang=$1
    iedir=
    srcdir=$2
    dir=$3
    ;;
  *)
    exit 1;
    ;;
esac

# Required inputs from the lang and nnet directories.
for required in $lang/phones.txt $srcdir/${iter}.mdl $srcdir/tree; do
  [ ! -f $required ] && echo "$0: no such file $required" && exit 1;
done
# Required inputs from the iVector extractor directory, if one is used.
if [ -n "$iedir" ]; then
  for name in final.mat final.ie final.dubm splice_opts global_cmvn.stats online_cmvn.conf; do
    [ ! -f $iedir/$name ] && echo "$0: no such file $iedir/$name" && exit 1;
  done
fi

utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1;
mkdir -p $dir
cp $lang/phones.txt $dir || exit 1;

# From here on $dir is an absolute pathname, so that the configuration files
# written below contain absolute pathnames.
dir=$(utils/make_absolute.sh $dir)
mkdir -p $dir/conf


# The chosen model iteration is always stored as final.mdl in the output dir.
cp $srcdir/${iter}.mdl $dir/final.mdl || exit 1;
cp $srcdir/tree $dir/ || exit 1;
if [ -n "$iedir" ]; then
  mkdir -p $dir/ivector_extractor/
  cp $iedir/final.mat $iedir/final.ie $iedir/final.dubm $iedir/global_cmvn.stats $dir/ivector_extractor/ || exit 1;

  # splice_opts and online_cmvn.conf are not needed directly by the online
  # decoding, but having them allows prepare_online_decoding.sh to be run
  # again with $dir/ivector_extractor/ as the input directory (useful in
  # certain cross-system training scenarios).
  cp $iedir/splice_opts $iedir/online_cmvn.conf $dir/ivector_extractor/ || exit 1;
fi


# Reject an unsupported --feature-type before touching any existing
# configuration: otherwise the old online_nnet2_decoding.conf would be moved
# aside and replaced by one that names no feature config at all.
case "$feature_type" in
  mfcc|plp|fbank) ;;
  *)
    echo "Unknown feature type $feature_type"
    exit 1;;
esac

mkdir -p $dir/conf
# NOTE(review): this removes {plp,mfcc,fbank}.conf from $dir itself, whereas
# the configs below are written to $dir/conf -- confirm which was intended.
rm $dir/{plp,mfcc,fbank}.conf 2>/dev/null
echo "$0: preparing configuration files in $dir/conf"

# Keep a backup of a config left by a previous run.
if [ -f $dir/conf/online_nnet2_decoding.conf ]; then
  echo "$0: moving $dir/conf/online_nnet2_decoding.conf to $dir/conf/online_nnet2_decoding.conf.bak"
  mv $dir/conf/online_nnet2_decoding.conf $dir/conf/online_nnet2_decoding.conf.bak
fi

# Start from an empty config file and append one option per line.
conf=$dir/conf/online_nnet2_decoding.conf
echo -n >$conf

echo "--feature-type=$feature_type" >>$conf

# Copy the base-feature config into $dir/conf and reference the copy.
case "$feature_type" in
  mfcc)
    echo "--mfcc-config=$dir/conf/mfcc.conf" >>$conf
    cp $mfcc_config $dir/conf/mfcc.conf || exit 1;;
  plp)
    echo "--plp-config=$dir/conf/plp.conf" >>$conf
    cp $plp_config $dir/conf/plp.conf || exit 1;;
  fbank)
    echo "--fbank-config=$dir/conf/fbank.conf" >>$conf
    cp $fbank_config $dir/conf/fbank.conf || exit 1;;
  *)
    # Not reachable because of the check above; kept as a safeguard.
    echo "Unknown feature type $feature_type"
    exit 1;;
esac


# When an iVector extractor directory was given, write the iVector extraction
# config and reference it from the main decoding config.
if [ -n "$iedir" ]; then
  ieconf=$dir/conf/ivector_extractor.conf
  : >$ieconf   # create the file, or empty it if it already exists
  echo "--ivector-extraction-config=$ieconf" >>$conf
  cp $iedir/online_cmvn.conf $dir/conf/online_cmvn.conf || exit 1;
  # Put each option from splice_opts on its own line in the splice config.
  for opt in $(cat $iedir/splice_opts); do echo "$opt"; done > $dir/conf/splice.conf
  {
    echo "--splice-config=$dir/conf/splice.conf"
    echo "--cmvn-config=$dir/conf/online_cmvn.conf"
    echo "--lda-matrix=$dir/ivector_extractor/final.mat"
    echo "--global-cmvn-stats=$dir/ivector_extractor/global_cmvn.stats"
    echo "--diag-ubm=$dir/ivector_extractor/final.dubm"
    echo "--ivector-extractor=$dir/ivector_extractor/final.ie"
    echo "--num-gselect=$num_gselect"
    echo "--min-post=$min_post"
    echo "--posterior-scale=$posterior_scale" # this is currently the default in the scripts.
    echo "--max-remembered-frames=1000" # the default
    echo "--max-count=$max_count"
  } >>$ieconf
fi

# Optional pitch features: copy the online pitch config into $dir/conf and
# reference the copy from the decoding config.
if $add_pitch; then
  echo "$0: enabling pitch features"
  echo "--add-pitch=true" >>$conf
  echo "$0: creating $dir/conf/online_pitch.conf"
  [ -f $online_pitch_config ] || {
    echo "$0: expected file '$online_pitch_config' to exist.";
    exit 1;
  }
  cp $online_pitch_config $dir/conf/online_pitch.conf || exit 1;
  echo "--online-pitch-config=$dir/conf/online_pitch.conf" >>$conf
fi

# The silence phones are passed to the endpointing options.
silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
echo "--endpoint.silence-phones=$silphonelist" >>$conf
echo "$0: created config file $conf"


================================================
FILE: egs/steps/online/nnet2/prepare_online_decoding_retrain.sh
================================================
#!/usr/bin/env bash

# Copyright 2014  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0

# This is as prepare_online_decoding.sh, but it's for a special case, where we
# already have a directory that's been prepared in that way, but for another
# corpus, and we have used the script
# steps/online/nnet2/dump_nnet_activations.sh to dump activations of the last
# hidden layer of that network on our data, and then steps/nnet2/retrain_fast.sh
# to train a neural net on top of those activations.  The job of this script is
# to take the original neural net, and the net that was trained on top of
# its last hidden layer, combine them, and create an online-decoding directory
# in the same format as is created by prepare_online_decoding.sh.
# All the options for the feature extraction and the iVector extractor
# are taken from the original directory from the other corpus.


# Begin configuration.
# Each variable below can be overridden from the command line as
# --<variable-name-with-dashes> <value> (handled by parse_options.sh).
stage=0 # This allows restarting after partway, when something went wrong.
cleanup=true  # if true, the intermediate first_nnet.raw is removed at the end.
cmd=run.pl
# End configuration.

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh;
. parse_options.sh || exit 1;

# The <new-lang-dir> argument is optional, hence 3 or 4 arguments.
if [ $# -ne 3 ] && [ $# -ne 4 ]; then    
  echo "Usage: $0 [options] <orig-nnet-online-dir> [<new-lang-dir>] <new-nnet-dir> <new-nnet-online-dir>"
  echo "e.g.: $0 exp_other/nnet2_online/nnet_a_online data/lang exp/nnet2_online/nnet_a exp/nnet2_online/nnet_a_online"
  echo "main options (for others, see top of script file)"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --config <config-file>                           # config containing options"
  echo "  --stage <stage>                                  # stage to do partial re-run from."
  exit 1;
fi

# Extra files to check; only non-empty when a <new-lang-dir> was given.  It is
# initialised explicitly so that a value inherited from the environment can
# never end up in the file check below.
extra_files=

if [ $# -eq 3 ]; then
  echo "$0: warning: it's better if you add the new <lang> directory as the 2nd argument."

  online_src=$1
  lang=
  nnet_src=$2
  dir=$3
else
  online_src=$1
  lang=$2
  nnet_src=$3
  dir=$4

  # silence.csl is read at the end of the script; check for it up front so
  # that a missing file is reported before any work is done.
  extra_files="$lang/words.txt $lang/phones/silence.csl"
fi


# $online_src/final.mdl is included because the first part of the combined
# network is extracted from it later in the script.
for f in $online_src/conf/online_nnet2_decoding.conf $online_src/final.mdl \
         $nnet_src/final.mdl $nnet_src/tree $extra_files; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done


# Remember the directory name as typed (for the final message); from here on
# $dir is an absolute pathname, so that the configuration files written below
# contain absolute pathnames.
dir_as_given=$dir
dir=$(utils/make_absolute.sh $dir)
mkdir -p $dir/conf $dir/log


cp $nnet_src/tree $dir/ || exit 1;


# The output directory is meant to be self-contained, so a number of files
# are copied over from $online_src.
mkdir -p $dir/ivector_extractor
cp -r $online_src/ivector_extractor/* $dir/ivector_extractor

if [ ! -d $online_src/conf ]; then
  echo "Expected directory $online_src/conf to exist" && exit 1;
fi

for src_conf in $online_src/conf/*conf; do
  # In each "option=path" line, everything between the "=" and a trailing
  # /ivector_extractor/ or /conf/ component is replaced by $dir, so that the
  # copied configs point into $dir/ivector_extractor/ or $dir/conf/.
  cat $src_conf | perl -ape "s:=(.+)/(ivector_extractor|conf)/:=$dir/\$2/:;" > $dir/conf/$(basename $src_conf)
done

# Work out how many components of the original network to keep: everything
# except its final layers, which are replaced by the retrained network.
info=$dir/nnet_info
nnet-am-info $online_src/final.mdl >$info || exit 1;
nc=$(grep num-components $info | awk '{print $2}');
# Stop with a clear message if the component count could not be parsed,
# instead of failing inside the arithmetic below.
if ! [ "$nc" -gt 0 ] 2>/dev/null; then
  echo "$0: could not get the number of components from $info"
  exit 1;
fi
if grep SumGroupComponent $info >/dev/null; then
  nc_truncate=$(($nc-3))  # we did mix-up: remove AffineComponent,
                          # SumGroupComponent, SoftmaxComponent
else
  nc_truncate=$(($nc-2))  # remove AffineComponent, SoftmaxComponent
fi
$cmd $dir/log/get_raw_nnet.log \
 nnet-to-raw-nnet --truncate=$nc_truncate $online_src/final.mdl $dir/first_nnet.raw || exit 1;

# Now create the final.mdl, by inserting $dir/first_nnet.raw at the beginning
# of the model in $nnet_src/final.mdl

$cmd $dir/log/append_nnet.log \
  nnet-insert --randomize-next-component=false --insert-at=0 \
  $nnet_src/final.mdl $dir/first_nnet.raw $dir/final.mdl || exit 1;

# The truncated raw network is only an intermediate file.
$cleanup && rm $dir/first_nnet.raw

if [ -n "$lang" ]; then
  # A lang directory was provided: rewrite the silence-phones value in the
  # decoding config from it.  It is only used by the endpointing code, but it
  # should still be right.
  decoding_conf=$dir/conf/online_nnet2_decoding.conf
  cp $decoding_conf $decoding_conf.tmp
  silphones=$(cat $lang/phones/silence.csl) || exit 1;
  cat $decoding_conf.tmp | \
    sed s/silence-phones=.\\+/silence-phones=$silphones/ > $decoding_conf
  rm $decoding_conf.tmp
fi

echo "$0: formatted neural net for online decoding in $dir_as_given"


================================================
FILE: egs/steps/online/nnet2/prepare_online_decoding_transfer.sh
================================================
#!/usr/bin/env bash

# Copyright 2014  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0

# This is as prepare_online_decoding.sh, but for transfer learning-- the case where
# you have an existing online-decoding directory where you have all the feature
# stuff, that you don't want to change, but you want to combine it with a
# different neural net (and the silence phones of a new lang directory).

# Begin configuration.
# Each variable below can be overridden from the command line as
# --<variable-name-with-dashes> <value> (handled by parse_options.sh).
stage=0 # This allows restarting after partway, when something went wrong.
cmd=run.pl
iter=final
# End configuration.

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh;
. parse_options.sh || exit 1;

if [ $# -ne 4 ]; then
  echo "Usage: $0 [options] <orig-nnet-online-dir> <new-lang-dir> <new-nnet-dir> <new-nnet-online-dir>"
  echo "e.g.: $0 exp_other/nnet2_online/nnet_a_online data/lang exp/nnet2_online/nnet_a exp/nnet2_online/nnet_a_online"
  echo "main options (for others, see top of script file)"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --config <config-file>                           # config containing options"
  echo "  --iter <model-iteration|final>                   # iteration of model to take."
  echo "  --stage <stage>                                  # stage to do partial re-run from."
  exit 1;
fi

online_src=$1
lang=$2
nnet_src=$3
dir=$4

# Check the model that is actually copied below ($iter.mdl, which is final.mdl
# only by default), and the silence phone list that is read at the end of the
# script.
for f in $online_src/conf/online_nnet2_decoding.conf $nnet_src/$iter.mdl $nnet_src/tree \
         $lang/words.txt $lang/phones/silence.csl; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done


dir_as_given=$dir
dir=$(utils/make_absolute.sh $dir) # Convert $dir to an absolute pathname, so that the
                        # configuration files we write will contain absolute
                        # pathnames.
mkdir -p $dir/conf $dir/log

utils/lang/check_phones_compatible.sh $lang/phones.txt $nnet_src/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

cp $nnet_src/tree $dir/ || exit 1;

# The model keeps its name ($iter.mdl) in the output directory.
cp $nnet_src/$iter.mdl $dir/ || exit 1;


# There are a bunch of files that we will need to copy from $online_src, because
# we're aiming to have one self-contained directory that has everything in it.
mkdir -p $dir/ivector_extractor
cp -r $online_src/ivector_extractor/* $dir/ivector_extractor

[ ! -d $online_src/conf ] && \
  echo "Expected directory $online_src/conf to exist" && exit 1;

for x in $online_src/conf/*conf; do
  # Copy each config file, rewriting every "=<anything>/ivector_extractor/" or
  # "=<anything>/conf/" prefix so that it points at $dir/ivector_extractor/ or
  # $dir/conf/ instead.
  # $dir is handed to perl through the environment rather than being pasted
  # into the program text, so a path containing ':', '@' or '$' can neither
  # break the s:::-expression nor be interpolated by perl.
  dir="$dir" perl -pe 's:=(.+)/(ivector_extractor|conf)/:=$ENV{dir}/$2/:;' \
    < "$x" > "$dir/conf/$(basename "$x")"
done


# Replace the silence-phones value in the decoding config by the silence phone
# list of the new lang directory; these are only used for the endpointing
# code.  The edit goes through a temporary copy that is removed afterwards.
decoding_conf=$dir/conf/online_nnet2_decoding.conf
cp $decoding_conf $decoding_conf.tmp
silphones=$(cat $lang/phones/silence.csl) || exit 1
sed s/silence-phones=.\\+/silence-phones=$silphones/ \
  < $decoding_conf.tmp > $decoding_conf
rm $decoding_conf.tmp

echo "$0: formatted neural net for online decoding in $dir_as_given"


================================================
FILE: egs/steps/online/nnet2/train_diag_ubm.sh
================================================
#!/usr/bin/env bash

# Copyright   2012  Johns Hopkins University (Author: Daniel Povey)
#             2013  Daniel Povey
# Apache 2.0.

# This script trains a diagonal UBM that we'll use in online iVector estimation,
# where the online-estimated iVector will be used as a secondary input to a deep
# neural net for single-pass DNN-based decoding.

# This script was modified from ../../sre08/v1/sid/train_diag_ubm.sh.  It trains
# a diagonal UBM on top of features processed with apply-cmvn-online and then
# transformed with an LDA+MLLT or PCA matrix (obtained from the source
# directory).  This script does not use the trained model from the source
# directory to initialize the diagonal GMM; instead, we initialize the GMM using
# gmm-global-init-from-feats, which sets the means to random data points and
# then does some iterations of E-M in memory.  After the in-memory
# initialization we train for a few iterations in parallel.  Note that if an
# LDA+MLLT transform matrix is used, there will be a slight mismatch in that the
# source LDA+MLLT matrix (final.mat) will have been estimated using standard
# CMVN, and we're using online CMVN.  We don't think this will have much effect.


# Begin configuration section.
# Job control.
cmd=run.pl
nj=4
stage=-2
cleanup=true      # remove intermediate models, accumulators and gselect files
num_threads=16    # threads used by the in-memory initialization
parallel_opts=    # ignored now.

# In-memory initialization (gmm-global-init-from-feats).
num_frames=500000             # number of frames to keep in memory for initialization
num_iters_init=20
initial_gauss_proportion=0.5  # Start with half the target number of Gaussians
min_gaussian_weight=0.0001    # also passed to gmm-global-est in the main phase

# Main, parallel E-M phase.
num_iters=4
num_gselect=30  # Number of Gaussian-selection indices to use while training
                # the model.
subsample=2     # subsample all features with this periodicity, in the main E-M phase.
remove_low_count_gaussians=true # set this to false if you need #gauss to stay fixed.

# Features.
online_cmvn_config=conf/online_cmvn.conf
# End configuration section.

echo "$0 $@"  # Print the command line for logging

# Source the path, then let parse_options.sh process the command-line options.
if [ -f ./path.sh ]; then
  . ./path.sh
fi
. parse_options.sh || exit 1


# Print the usage message and quit unless exactly four positional arguments
# are left.  The defaults quoted after '|' mirror the values set in the
# configuration section of this script (num-iters 4, subsample 2,
# num-threads 16); parallel-opts is no longer mentioned because it is ignored.
if [ $# != 4 ]; then
  echo "Usage: $0  <data> <num-gauss> <srcdir> <output-dir>"
  echo " e.g.: $0 data/train 1024 exp/tri3b/ exp/diag_ubm"
  echo "(in srcdir we find splice_opts and final.mat)"
  echo "Options: "
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --nj <num-jobs|4>                                # number of parallel jobs to run."
  echo "  --num-iters <niter|4>                            # number of iterations of parallel "
  echo "                                                   # training (default: $num_iters)"
  echo "  --stage <stage|-2>                               # stage to do partial re-run from."
  echo "  --num-gselect <n|30>                             # Number of Gaussians per frame to"
  echo "                                                   # limit computation to, for speed"
  echo " --subsample <n|2>                                 # In main E-M phase, use every n"
  echo "                                                   # frames (a speedup)"
  echo "  --num-frames <n|500000>                          # Maximum num-frames to keep in memory"
  echo "                                                   # for model initialization"
  echo "  --num-iters-init <n|20>                          # Number of E-M iterations for model"
  echo "                                                   # initialization"
  echo " --initial-gauss-proportion <proportion|0.5>       # Proportion of Gaussians to start with"
  echo "                                                   # in initialization phase (then split)"
  echo " --num-threads <n|16>                              # number of threads to use in initialization"
  echo "                                                   # phase"
  echo " --min-gaussian-weight <weight|0.0001>             # min Gaussian weight allowed in GMM"
  echo "                                                   # initialization (this relatively high"
  echo "                                                   # value keeps counts fairly even)"
  exit 1;
fi

# Positional arguments.
data=$1
num_gauss=$2
srcdir=$3
dir=$4

# The test fails for zero, negative and non-numeric values alike.
! [ $num_gauss -gt 0 ] && echo "Bad num-gauss $num_gauss" && exit 1;

sdata=$data/split$nj
utils/split_data.sh $data $nj || exit 1;

for f in $data/feats.scp "$online_cmvn_config" $srcdir/splice_opts $srcdir/final.mat; do
   [ ! -f "$f" ] && echo "$0: expecting file $f to exist" && exit 1
done

# Move the results of a previous run, if there are any, into a fresh backup
# directory.  This has to happen before $dir/log is created: otherwise $dir
# would always exist at this point and even a first run would produce a
# backup directory that contains nothing but an empty log/.
if [ -d "$dir" ]; then
  bak_dir=$(mktemp -d ${dir}/backup.XXX);
  echo "$0: Directory $dir already exists. Backing up diagonal UBM in ${bak_dir}";
  for f in $dir/final.mat $dir/final.dubm $dir/online_cmvn.conf $dir/global_cmvn.stats; do
    [ -f "$f" ] && mv $f ${bak_dir}/
  done
  [ -d "$dir/log" ] && mv $dir/log ${bak_dir}/
fi
mkdir -p $dir/log

# Take the splicing options, the transform and the online-CMVN config along
# into the output directory.
for src in $srcdir/splice_opts $srcdir/final.mat; do
  cp $src $dir/ || exit 1
done
cp $online_cmvn_config $dir/online_cmvn.conf || exit 1
splice_opts=$(cat $srcdir/splice_opts)

# create global_cmvn.stats by summing the CMVN stats listed in cmvn.scp
matrix-sum --binary=false scp:$data/cmvn.scp - >$dir/global_cmvn.stats 2>/dev/null || {
  echo "$0: Error summing cmvn stats"
  exit 1
}

# Both feature pipelines start with apply-cmvn-online and share the
# splice + transform tail; they differ in the data they read (whole data
# directory versus the per-job split) and in the trailing subsampling.
# Note: there is no point subsampling all_feats, because gmm-global-init-from-feats
# effectively does subsampling itself (it keeps a random subset of the features).
cmvn_cmd="apply-cmvn-online --config=$online_cmvn_config"
lda_tail="splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
all_feats="ark,s,cs:$cmvn_cmd --spk2utt=ark:$data/spk2utt $dir/global_cmvn.stats scp:$data/feats.scp ark:- | $lda_tail"
feats="ark,s,cs:$cmvn_cmd --spk2utt=ark:$sdata/JOB/spk2utt $dir/global_cmvn.stats scp:$sdata/JOB/feats.scp ark:- | $lda_tail subsample-feats --n=$subsample ark:- ark:- |"

# Number of Gaussians the initialization starts from (rounded down).
num_gauss_init=$(perl -e "print int($initial_gauss_proportion * $num_gauss); ");
if ! [ $num_gauss_init -gt 0 ]; then
  echo "Invalid num-gauss-init $num_gauss_init"
  exit 1
fi

# Stage -2: build the initial model 0.dubm with a single multi-threaded job
# that reads the un-split feature pipeline.
if [ $stage -le -2 ]; then
  echo "$0: initializing model from E-M in memory, "
  echo "$0: starting from $num_gauss_init Gaussians, reaching $num_gauss;"
  echo "$0: for $num_iters_init iterations, using at most $num_frames frames of data"

  $cmd --num-threads $num_threads $dir/log/gmm_init.log \
    gmm-global-init-from-feats \
      --num-threads=$num_threads --num-frames=$num_frames \
      --min-gaussian-weight=$min_gaussian_weight \
      --num-gauss=$num_gauss --num-gauss-init=$num_gauss_init \
      --num-iters=$num_iters_init \
      "$all_feats" $dir/0.dubm || exit 1
fi

# Stage -1: store Gaussian selection indices on disk (one gzipped archive per
# job) -- this speeds up the training passes.
if [ $stage -le -1 ]; then
  echo "Getting Gaussian-selection info"
  $cmd JOB=1:$nj $dir/log/gselect.JOB.log \
    gmm-gselect --n=$num_gselect $dir/0.dubm "$feats" \
    "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1
fi

echo "$0: will train for $num_iters iterations, in parallel over"
echo "$0: $nj machines, parallelized with '$cmd'"

# Pass N reads $dir/N.dubm and writes $dir/(N+1).dubm.  Passes below $stage
# are announced but not executed.
last_pass=$((num_iters - 1))
for pass in $(seq 0 $last_pass); do
  echo "$0: Training pass $pass"
  [ $stage -le $pass ] || continue

  # Accumulate stats, one accumulator file per job.
  $cmd JOB=1:$nj $dir/log/acc.$pass.JOB.log \
    gmm-global-acc-stats "--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" \
    $dir/$pass.dubm "$feats" $dir/$pass.JOB.acc || exit 1

  # Don't remove low-count Gaussians till the last pass, or the gselect info
  # won't be valid any more.
  opt="--remove-low-count-gaussians=$remove_low_count_gaussians"
  if [ $pass -lt $last_pass ]; then
    opt="--remove-low-count-gaussians=false"
  fi

  # Sum the accumulators of all jobs and re-estimate the model.
  $cmd $dir/log/update.$pass.log \
    gmm-global-est $opt --min-gaussian-weight=$min_gaussian_weight $dir/$pass.dubm "gmm-global-sum-accs - $dir/$pass.*.acc|" \
    $dir/$((pass + 1)).dubm || exit 1

  if $cleanup; then
    rm $dir/$pass.*.acc $dir/$pass.dubm
  fi
done

# The Gaussian-selection archives are not needed any longer.
if $cleanup; then
  rm $dir/gselect.*.gz
fi

# The model written by the last pass becomes the final diagonal UBM.
mv $dir/$num_iters.dubm $dir/final.dubm || exit 1
exit 0


================================================
FILE: egs/steps/online/nnet2/train_ivector_extractor.sh
================================================
#!/usr/bin/env bash

# Copyright   2013  Daniel Povey
# Apache 2.0.

# This script is modified from ^/egs/sre08/v1/sid/train_ivector_extractor.sh.
# It trains an iVector extractor for use in DNN training.  In this version, the
# features used to obtain the Gaussian posteriors are based on sliding-window
# CMN, but the actual iVector extractor sees the original features without CMN.
# The idea is that the appropriate offset should be learned by the iVector
# extractor itself, so the neural net can take as input the non-CMN features
# together with the iVector.  [note: in future, we may just compute the
# posteriors on top of non-CMN input, we'll have to see what works better.]

# This script trains the i-vector extractor.  Note: there are 3 separate levels
# of parallelization: num_threads, num_processes, and num_jobs.  This may seem a
# bit excessive.  It has to do with minimizing memory usage and disk I/O,
# subject to various constraints.  The "num_threads" is how many threads a
# program uses; the "num_processes" is the number of separate processes a single
# job spawns, and then sums the accumulators in memory.  Our recommendation:
#  - Set num_threads to the minimum of (4, or how many virtual cores your machine has).
#    (because of needing to lock various global quantities, the program can't
#    use many more than 4 threads with good CPU utilization).
#  - Set num_processes to the number of virtual cores on each machine you have, divided by
#    num_threads.  E.g. 4, if you have 16 virtual cores.   If you're on a shared queue
#    that's busy with other people's jobs, it may be wise to set it to rather less
#    than this maximum though, or your jobs won't get scheduled.  And if memory is
#    tight you need to be careful; in our normal setup, each process uses about 5G.
#  - Set num_jobs to as many of the jobs (each using $num_threads * $num_processes CPUs)
#    your queue will let you run at one time, but don't go much more than 10 or 20, or
#    summing the accumulators will possibly get slow.  If you have a lot of data, you
#    may want more jobs, though.

# Begin configuration section.
# Job control.
cmd="run.pl"
stage=-4
cleanup=true
parallel_opts=   # ignored now.

# Parallelization.  nj is the number of separate queue jobs we run, but each
# one contains num_processes sub-jobs: the real number of threads we run is
# nj * num_processes * num_threads, and the number of separate pieces of data
# is nj * num_processes.
nj=10
num_processes=4  # each job runs this many processes, each with --num-threads threads
num_threads=4

# Extractor and training set-up.
ivector_dim=100  # dimension of the extracted i-vector
num_iters=10
online_cmvn_iextractor=false # apply online-cmvn on i-vector input features, uses the configuration from UBM,
subsample=2      # This speeds up the training: training on every 2nd feature
                 # (configurable) Since the features are highly correlated across
                 # frames, we don't expect to lose too much from this.

# Posterior computation.
num_gselect=5        # Gaussian-selection using diagonal model: number of Gaussians to select
posterior_scale=0.1  # Scale on the acoustic posteriors, intended to account for
                     # inter-frame correlations.
min_post=0.025       # Minimum posterior to use (posteriors below this are pruned out)
                     # caution: you should use the same value in the online-estimation
                     # code.
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh
. parse_options.sh || exit 1


# Three positional arguments are required; otherwise print the usage and quit.
if [ $# != 3 ]; then
  cat <<EOF
Usage: $0 <data> <diagonal-ubm-dir> <extractor-dir>
 e.g.: $0 data/train exp/nnet2_online/diag_ubm/ exp/nnet2_online/extractor
main options (for others, see top of script file)
  --config <config-file>                           # config containing options
  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs.
  --num-iters <#iters|10>                          # Number of iterations of E-M
  --nj <n|10>                                      # Number of jobs (also see num-processes and num-threads)
  --num-processes <n|4>                            # Number of processes for each queue job (relates
                                                   # to summing accs in memory)
  --num-threads <n|4>                              # Number of threads for each process (can't be usefully
                                                   # increased much above 4)
  --stage <stage|-4>                               # To control partial reruns
  --num-gselect <n|5>                              # Number of Gaussians to select using
                                                   # diagonal model.
EOF
  exit 1
fi

# Positional arguments.
data=$1
srcdir=$2   # diagonal-UBM directory
dir=$3      # output (extractor) directory

# Everything read from the UBM directory and the data directory must exist.
for f in $srcdir/final.dubm $srcdir/final.mat $srcdir/global_cmvn.stats $srcdir/splice_opts \
      $srcdir/online_cmvn.conf  $data/feats.scp; do
  if [ ! -f $f ]; then
    echo "No such file $f"
    exit 1
  fi
done


# If the output directory is already there, move the extractor files and the
# log directory of the earlier run into a freshly created backup directory.
if [ -d "$dir" ]; then
  bak_dir=$(mktemp -d ${dir}/backup.XXX)
  echo "$0: Directory $dir already exists. Backing up iVector extractor in ${bak_dir}"
  for f in $dir/final.ie $dir/*.ie $dir/final.mat $dir/final.dubm \
        $dir/online_cmvn.conf $dir/global_cmvn.stats; do
    if [ -f "$f" ]; then
      mv $f ${bak_dir}/
    fi
  done
  if [ -d "$dir/log" ]; then
    mv $dir/log ${bak_dir}/
  fi
fi

# Set various variables.  The data is split into one piece per process, i.e.
# nj * num_processes pieces.
mkdir -p $dir/log
nj_full=$((nj * num_processes))
sdata=$data/split$nj_full
utils/split_data.sh $data $nj_full || exit 1

# Copy what we need from the UBM directory into the extractor directory.
cp $srcdir/final.dubm $srcdir/final.mat $srcdir/global_cmvn.stats $srcdir/splice_opts \
      $srcdir/online_cmvn.conf $dir || exit 1

splice_opts=$(cat $srcdir/splice_opts)

## Set up features.  $gmm_feats is the version of the features with online CMVN, that we use
## to get the Gaussian posteriors, $feats is the version of the features with no CMN.
## Both end in the same transform + subsample tail.
feats_tail="transform-feats $dir/final.mat ark:- ark:- | subsample-feats --n=$subsample ark:- ark:- |"
gmm_feats="ark,s,cs:apply-cmvn-online --config=$dir/online_cmvn.conf --spk2utt=ark:$sdata/JOB/spk2utt $dir/global_cmvn.stats scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | $feats_tail"
feats="ark,s,cs:splice-feats $splice_opts scp:$sdata/JOB/feats.scp ark:- | $feats_tail"

## On request, the extractor input gets online-cmvn as well (configuration
## taken from the UBM).  The flag file 'online_cmvn_iextractor' in $dir records
## that choice; a flag left over from an earlier run is always removed first.
rm -f $dir/online_cmvn_iextractor 2>/dev/null
if $online_cmvn_iextractor; then
  feats="$gmm_feats"
  touch $dir/online_cmvn_iextractor
fi

# Stage -2: initialize the i-vector extractor from the input GMM, which is
# converted to full because that's what the i-vector extractor expects.
# --use-weights=false disables regression of the log weights on the ivector,
# because that would make the online estimation of the ivector difficult
# (and online/real-time ivector estimation is the whole point of this script).
if [ $stage -le -2 ]; then
  $cmd $dir/log/init.log \
    ivector-extractor-init --ivector-dim=$ivector_dim --use-weights=false \
    "gmm-global-to-fgmm $dir/final.dubm -|" $dir/0.ie || exit 1
fi

# Gaussian selection and posterior extraction.

# If we subsample frames, the posterior scale is multiplied by the subsampling
# factor; this is likely to make the original posterior-scale (before
# subsampling) suitable.
modified_posterior_scale=$(perl -e "print $posterior_scale * $subsample;");

if [ $stage -le -1 ]; then
  # Record the number of pieces so that a later partial re-run can verify it.
  echo $nj_full > $dir/num_jobs
  echo "$0: doing Gaussian selection and posterior computation"
  $cmd JOB=1:$nj_full $dir/log/post.JOB.log \
    gmm-global-get-post --n=$num_gselect --min-post=$min_post $dir/final.dubm "$gmm_feats" ark:- \| \
    scale-post ark:- $modified_posterior_scale "ark:|gzip -c >$dir/post.JOB.gz" || exit 1
else
  # make sure we at least have the right number of post.*.gz files.
  prev_nj=$(cat $dir/num_jobs)
  if ! [ $nj_full -eq $prev_nj ]; then
    echo "Num-jobs mismatch $nj_full versus $prev_nj"
    exit 1
  fi
fi

# Main E-M loop: pass x reads $dir/x.ie and writes $dir/(x+1).ie.  Passes
# below $stage are skipped.  After the loop, x equals the number of passes.
for (( x = 0; x < num_iters; x++ )); do
  [ $stage -le $x ] || continue
  rm $dir/.error 2>/dev/null

  # Bash array of training commands, one per piece of data (indices
  # 1..nj_full); each command writes its accumulators to stdout.
  acc_template="ivector-extractor-acc-stats --num-threads=$num_threads $dir/$x.ie '$feats' 'ark,s,cs:gunzip -c $dir/post.JOB.gz|' -|"
  Args=()
  for j in $(seq $nj_full); do
    Args[$j]=${acc_template//JOB/$j}
  done

  # Every queue job runs num_processes of those commands and therefore asks
  # for num_threads * num_processes threads; the model update asks for the
  # same number, since we can be sure the queue will support this many.
  nt=$((num_threads * num_processes))

  echo "Accumulating stats (pass $x)"
  for g in $(seq $nj); do
    start=$((num_processes * (g - 1) + 1))
    $cmd --num-threads $nt $dir/log/acc.$x.$g.log \
      ivector-extractor-sum-accs --parallel=true "${Args[@]:$start:$num_processes}" \
        $dir/acc.$x.$g || touch $dir/.error &
  done
  wait
  if [ -f $dir/.error ]; then
    echo "Error accumulating stats on iteration $x"
    exit 1
  fi

  # Sum the per-job accumulators into one file.
  accs=""
  for j in $(seq $nj); do
    accs+="$dir/acc.$x.$j "
  done
  echo "Summing accs (pass $x)"
  $cmd $dir/log/sum_acc.$x.log \
    ivector-extractor-sum-accs $accs $dir/acc.$x || exit 1

  echo "Updating model (pass $x)"
  $cmd --num-threads $nt $dir/log/update.$x.log \
    ivector-extractor-est --num-threads=$nt $dir/$x.ie $dir/acc.$x $dir/$((x + 1)).ie || exit 1

  # The per-job accumulators always go; the summed accumulator and the input
  # model of this pass only when cleaning up.
  rm $dir/acc.$x.*
  if $cleanup; then
    rm $dir/acc.$x $dir/$x.ie
  fi
done

if $cleanup; then
  rm $dir/post.*.gz
fi

# final.ie is a (relative) symbolic link to the model of the last pass.
rm $dir/final.ie 2>/dev/null
ln -s $x.ie $dir/final.ie

# assign a unique id to this extractor
# we are not interested in the id itself, just pre-caching ...
steps/nnet2/get_ivector_id.sh $dir > /dev/null || exit 1


================================================
FILE: egs/steps/online/nnet3/decode.sh
================================================
#!/usr/bin/env bash

# Copyright 2014  Johns Hopkins University (Author: Daniel Povey)
#           2016  Api.ai (Author: Ilya Platonov)
# Apache 2.0

# Begin configuration section.
# Job control.
stage=0
nj=4
cmd=run.pl
iter=final
online_config=   # if left empty, <model-dir>/conf/online.conf is used

# Decoder options.
frames_per_chunk=20
extra_left_context_initial=0
min_active=200
max_active=7000
beam=15.0
lattice_beam=6.0
acwt=0.1              # note: only really affects adaptation and pruning
                      # (scoring is on lattices).
post_decode_acwt=1.0  # can be used in 'chain' systems to scale acoustics by 10 so the
                      # regular scoring script works.

# Online / adaptation behaviour.
per_utt=false
online=true  # only relevant to non-threaded decoder.
do_endpointing=false
do_speex_compressing=false
silence_weight=1.0     # set this to a value less than 1 (e.g. 0) to enable silence weighting.
max_state_duration=40  # This only has an effect if you are doing silence
  # weighting.  This default is probably reasonable.  transition-ids repeated
  # more than this many times in an alignment are treated as silence.

# Scoring.
scoring_opts=
skip_scoring=false
# End configuration section.

echo "$0 $@"  # Print the command line for logging

# Source the path, then process the command-line options.
if [ -f ./path.sh ]; then
  . ./path.sh
fi
. parse_options.sh || exit 1

# Three positional arguments are required; otherwise print the usage and quit.
if [ $# != 3 ]; then
  printf '%s\n' \
    "Usage: $0 [options] <graph-dir> <data-dir> <decode-dir>" \
    "... where <decode-dir> is assumed to be a sub-directory of the directory" \
    " where the models are, as prepared by steps/online/nnet3/prepare_online_decoding.sh" \
    "e.g.: $0 exp/chain/tdnn/graph data/test exp/chain/tdnn_online/decode/" \
    "" \
    "" \
    "main options (for others, see top of script file)" \
    "  --config <config-file>                           # config containing options" \
    "  --online-config <config-file>                    # online decoder options" \
    "  --nj <nj>                                        # number of parallel jobs" \
    "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs." \
    "  --acwt <float>                                   # acoustic scale used for lattice generation " \
    "  --per-utt <true|false>                           # If true, decode per utterance without" \
    "                                                   # carrying forward adaptation info from previous" \
    "                                                   # utterances of each speaker.  Default: false" \
    "  --online <true|false>                            # Set this to false if you don't really care about" \
    "                                                   # simulating online decoding and just want the best" \
    "                                                   # results.  This will use all the data within each" \
    "                                                   # utterance (plus any previous utterance, if not in" \
    "                                                   # per-utterance mode) to estimate the iVectors." \
    "  --scoring-opts <string>                          # options to local/score.sh" \
    "  --iter <iter>                                    # Iteration of model to decode; default is final."
  exit 1
fi


# Positional arguments.
graphdir=$1
data=$2
dir=$3
srcdir=$(dirname $dir)   # The model directory is one level up from decoding directory.
sdata=$data/split$nj

# Without --online-config, fall back to the config inside the model directory.
online_config=${online_config:-$srcdir/conf/online.conf}

mkdir -p $dir/log
# Split the data unless the split directory exists and feats.scp is older
# than it.
if ! [[ -d $sdata && $data/feats.scp -ot $sdata ]]; then
  split_data.sh $data $nj || exit 1
fi
echo $nj > $dir/num_jobs

# All of these are read by the decoding command.
for f in $online_config $srcdir/${iter}.mdl \
    $graphdir/HCLG.fst $graphdir/words.txt $data/wav.scp; do
  [ -f $f ] && continue
  echo "$0: no such file $f"
  exit 1
done

# Speaker-to-utterance map handed to the decoder.  In per-utterance mode every
# utterance is made its own "speaker" by writing "<utt> <utt>" lines taken
# from the first column of utt2spk; otherwise the real spk2utt of each split
# is used.
if $per_utt; then
  mkdir -p $dir/per_utt
  for j in $(seq $nj); do
    awk '{print $1, $1}' <$sdata/$j/utt2spk >$dir/per_utt/utt2spk.$j || exit 1
  done
  spk2utt_rspecifier="ark:$dir/per_utt/utt2spk.JOB"
else
  spk2utt_rspecifier="ark:$sdata/JOB/spk2utt"
fi

# Audio input: cut segments out of the recordings when the data directory has
# a segments file, else read the wav entries as they are.
wav_rspecifier="ark,s,cs:wav-copy scp,p:$sdata/JOB/wav.scp ark:- |"
if [ -f $data/segments ]; then
  wav_rspecifier="ark,s,cs:extract-segments scp,p:$sdata/JOB/wav.scp $sdata/JOB/segments ark:- |"
fi
# Optional extra stages appended to the audio pipeline.
if $do_speex_compressing; then
  wav_rspecifier="$wav_rspecifier compress-uncompress-speex ark:- ark:- |"
fi
if $do_endpointing; then
  wav_rspecifier="$wav_rspecifier extend-wav-with-silence ark:- ark:- |"
fi

# Silence weighting for the iVector estimation is switched on whenever
# --silence-weight differs from the default string "1.0"; it needs the
# silence phone list of the graph directory.
if [ "$silence_weight" != "1.0" ]; then
  silphones=$(cat $graphdir/phones/silence.csl) || exit 1
  silence_weighting_opts="--ivector-silence-weighting.max-state-duration=$max_state_duration --ivector-silence-weighting.silence_phones=$silphones --ivector-silence-weighting.silence-weight=$silence_weight"
else
  silence_weighting_opts=
fi


# Lattices are gzipped per job; with a post-decode acoustic scale other than
# "1.0" they are passed through lattice-scale first.
if [ "$post_decode_acwt" == 1.0 ]; then
  lat_wspecifier="ark:|gzip -c >$dir/lat.JOB.gz"
else
  lat_wspecifier="ark:|lattice-scale --acoustic-scale=$post_decode_acwt ark:- ark:- | gzip -c >$dir/lat.JOB.gz"
fi


# Start from an empty value so that a variable of the same name inherited
# from the environment cannot end up on the decoder command line when the
# model directory has no frame_subsampling_factor file.
frame_subsampling_opt=
if [ -f $srcdir/frame_subsampling_factor ]; then
  # e.g. for 'chain' systems
  frame_subsampling_opt="--frame-subsampling-factor=$(cat $srcdir/frame_subsampling_factor)"
fi

# Stage 0: lattice generation, one job per data split.
if [ $stage -le 0 ]; then
  $cmd JOB=1:$nj $dir/log/decode.JOB.log \
    online2-wav-nnet3-latgen-faster $silence_weighting_opts \
      --do-endpointing=$do_endpointing \
      --frames-per-chunk=$frames_per_chunk \
      --extra-left-context-initial=$extra_left_context_initial \
      --online=$online \
      $frame_subsampling_opt \
      --config=$online_config \
      --min-active=$min_active --max-active=$max_active \
      --beam=$beam --lattice-beam=$lattice_beam \
      --acoustic-scale=$acwt --word-symbol-table=$graphdir/words.txt \
      $srcdir/${iter}.mdl $graphdir/HCLG.fst $spk2utt_rspecifier "$wav_rspecifier" \
      "$lat_wspecifier" || exit 1
fi

# Scoring needs an executable local/score.sh; its absence is an error unless
# scoring was switched off.
if ! $skip_scoring; then
  if [ ! -x local/score.sh ]; then
    echo "Not scoring because local/score.sh does not exist or not executable."
    exit 1
  fi
  local/score.sh --cmd "$cmd" $scoring_opts $data $graphdir $dir
fi

exit 0


================================================
FILE: egs/steps/online/nnet3/decode_wake_word.sh
================================================
#!/bin/bash

# Copyright 2014  Johns Hopkins University (Author: Daniel Povey)
#           2016  Api.ai (Author: Ilya Platonov)
#      2019-2020  Yiming Wang
# Apache 2.0

# This script is modified from steps/online/nnet3/decode.sh for wake word detection decoding

# Begin configuration section.
# Job control.
stage=0
nj=4
cmd=run.pl
iter=final
online_config=   # if left empty, <model-dir>/conf/online.conf is used

# Decoder options.
acwt=0.1  # Just a default value, used for adaptation and beam-pruning..
frames_per_chunk=20
extra_left_context_initial=0
min_active=200
max_active=7000
beam=15.0

# Online / adaptation behaviour.
per_utt=false
online=true  # only relevant to non-threaded decoder.
do_speex_compressing=false

# Scoring.
scoring_opts=
skip_scoring=false

# The word that is looked up in <graph-dir>/words.txt to obtain the
# wake-word id given to the decoder.
wake_word="嗨小问"
# End configuration section.

echo "$0 $@"  # Print the command line for logging

# Source the path, then process the command-line options.
if [ -f ./path.sh ]; then
  . ./path.sh
fi
. parse_options.sh || exit 1

# Three positional arguments are required; otherwise print the usage and quit.
if [ $# != 3 ]; then
  printf '%s\n' \
    "Usage: $0 [options] <graph-dir> <data-dir> <decode-dir>" \
    "... where <decode-dir> is assumed to be a sub-directory of the directory" \
    " where the models are, as prepared by steps/online/nnet3/prepare_online_decoding.sh" \
    "e.g.: $0 exp/chain/tdnn/graph data/test exp/chain/tdnn_online/decode/" \
    "" \
    "" \
    "main options (for others, see top of script file)" \
    "  --config <config-file>                           # config containing options" \
    "  --online-config <config-file>                    # online decoder options" \
    "  --nj <nj>                                        # number of parallel jobs" \
    "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs." \
    "  --per-utt <true|false>                           # If true, decode per utterance without" \
    "                                                   # carrying forward adaptation info from previous" \
    "                                                   # utterances of each speaker.  Default: false" \
    "  --online <true|false>                            # Set this to false if you don't really care about" \
    "                                                   # simulating online decoding and just want the best" \
    "                                                   # results.  This will use all the data within each" \
    "                                                   # utterance (plus any previous utterance, if not in" \
    "                                                   # per-utterance mode) to estimate the iVectors." \
    "  --scoring-opts <string>                          # options to local/score.sh" \
    "  --iter <iter>                                    # Iteration of model to decode; default is final."
  exit 1
fi


# Positional arguments.
graphdir=$1
data=$2
dir=$3
srcdir=$(dirname $dir)   # The model directory is one level up from decoding directory.
sdata=$data/split$nj

# Without --online-config, fall back to the config inside the model directory.
if [ -z "$online_config" ]; then
  online_config=$srcdir/conf/online.conf
fi

mkdir -p $dir/log
# Split the data unless the split directory exists and feats.scp is older
# than it.
if ! [[ -d $sdata && $data/feats.scp -ot $sdata ]]; then
  split_data.sh $data $nj || exit 1
fi
echo $nj > $dir/num_jobs

# All of these are read further down.
for f in $online_config $srcdir/${iter}.mdl \
    $graphdir/HCLG.fst $graphdir/words.txt $data/wav.scp; do
  [ -f $f ] && continue
  echo "$0: no such file $f"
  exit 1
done

# Speaker-to-utterance map handed to the decoder.  In per-utterance mode every
# utterance is made its own "speaker" by writing "<utt> <utt>" lines taken
# from the first column of utt2spk.
if ! $per_utt; then
  spk2utt_rspecifier="ark:$sdata/JOB/spk2utt"
else
  mkdir -p $dir/per_utt
  for j in $(seq $nj); do
    awk '{print $1, $1}' <$sdata/$j/utt2spk >$dir/per_utt/utt2spk.$j || exit 1;
  done
  spk2utt_rspecifier="ark:$dir/per_utt/utt2spk.JOB"
fi

# Audio input: cut segments out of the recordings when the data directory has
# a segments file, else read the wav entries as they are.
if [ -f $data/segments ]; then
  wav_rspecifier="ark,s,cs:extract-segments scp,p:$sdata/JOB/wav.scp $sdata/JOB/segments ark:- |"
else
  wav_rspecifier="ark,s,cs:wav-copy scp,p:$sdata/JOB/wav.scp ark:- |"
fi
if $do_speex_compressing; then
  wav_rspecifier="$wav_rspecifier compress-uncompress-speex ark:- ark:- |"
fi

# Look up the integer id of the wake word.  The first column of words.txt is
# compared for equality with the wake word; a plain grep would treat the word
# as a regular expression and would also pick up every other entry that
# merely contains it, which can yield several ids.
wake_word_id=$(awk -v w="$wake_word" '$1 == w {print $2}' $graphdir/words.txt)
# The id is only consumed by the decoding stage, so a missing word is an
# error only if that stage is going to run.
if [ -z "$wake_word_id" ] && [ $stage -le 0 ]; then
  echo "$0: wake word '$wake_word' not found in $graphdir/words.txt"
  exit 1
fi

# Start from an empty value so that a variable of the same name inherited
# from the environment cannot end up on the decoder command line when the
# model directory has no frame_subsampling_factor file.
frame_subsampling_opt=
if [ -f $srcdir/frame_subsampling_factor ]; then
  # e.g. for 'chain' systems
  frame_subsampling_opt="--frame-subsampling-factor=$(cat $srcdir/frame_subsampling_factor)"
fi

# Stage 0: decoding, one job per data split.  Every job writes a text
# transcript archive and a text alignment archive.
if [ $stage -le 0 ]; then
  $cmd JOB=1:$nj $dir/log/decode.JOB.log \
    online2-wav-nnet3-wake-word-decoder-faster \
      --frames-per-chunk=$frames_per_chunk \
      --extra-left-context-initial=$extra_left_context_initial \
      --online=$online \
      $frame_subsampling_opt \
      --config=$online_config \
      --min-active=$min_active --max-active=$max_active --beam=$beam \
      --acoustic-scale=$acwt --wake-word-id=$wake_word_id \
      $srcdir/${iter}.mdl $graphdir/HCLG.fst $spk2utt_rspecifier "$wav_rspecifier" \
      $graphdir/words.txt ark,t:$dir/trans.JOB.txt \
      ark,t:$dir/ali.JOB.txt || exit 1
fi

# Stage 1: concatenate the per-job outputs, in job order, into trans.txt and
# ali.txt, and delete the per-job files.
if [ $stage -le 1 ]; then
  for kind in trans ali; do
    for n in $(seq $nj); do
      cat $dir/$kind.$n.txt
    done > $dir/$kind.txt
    rm -f $dir/$kind.*.txt
  done
fi

# Stage 2: scoring with local/score_online.sh, unless it was switched off.
# The error message names the script that is actually tested.
if [ $stage -le 2 ] && ! $skip_scoring ; then
  [ ! -x local/score_online.sh ] && \
    echo "Not scoring because local/score_online.sh does not exist or not executable." && exit 1;
  local/score_online.sh $scoring_opts --wake-word "$wake_word" $data $graphdir $dir
fi

exit 0;


================================================
FILE: egs/steps/online/nnet3/prepare_online_decoding.sh
================================================
#!/usr/bin/env bash

# Copyright 2014  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0

# Begin configuration.
stage=0 # This allows restarting partway through, when something went wrong.
iter=final

# Feature extraction; you can override any of these you need to override.
feature_type=mfcc
add_pitch=false
mfcc_config=conf/mfcc.conf
plp_config=conf/plp.conf
fbank_config=conf/fbank.conf

# online_pitch_config is the config file for both pitch extraction and
# post-processing; we combine them into one because during training this
# is given to the program compute-and-process-kaldi-pitch-feats.
online_pitch_config=conf/online_pitch.conf

# online_cmvn_config can be used both for nn-features and i-vector features.
# The 'flag' file $dir/online_cmvn is created when training with online-cmvn:
#  - if it exists, the config is used for both feature streams;
#  - if it does not, the config is used only for normalizing the input of the
#    ubm in the i-vector extractor, and the rest of the system is without
#    online-cmvn.
online_cmvn_config=conf/online_cmvn.conf

# Below are some options that affect the iVectors, and should probably
# match those used in extract_ivectors_online.sh.
num_gselect=5        # Gaussian-selection using diagonal model: number of Gaussians to select
posterior_scale=0.1  # Scale on the acoustic posteriors, intended to account for
                     # inter-frame correlations.
min_post=0.025       # Minimum posterior to use (posteriors below this are pruned out)
                     # caution: you should use the same value in the online-estimation
                     # code.
max_count=100        # This max-count of 100 can make iVectors more consistent for
                     # different lengths of utterance, by scaling up the prior term
                     # when the data-count exceeds this value.  The data-count is
                     # after posterior-scaling, so assuming the posterior-scale is
                     # 0.1, --max-count 100 starts having effect after 1000 frames,
                     # or 10 seconds of data.
ivector_period=10    # Number of frames for which the i-vector stays the same
                     # (use same value as from local/nnet3/run_ivector_common.sh).
# End configuration.

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then
  . ./path.sh
fi
. parse_options.sh || exit 1

# Three positional arguments (no i-vector extractor) or four (with one) are
# accepted; anything else prints the usage message and aborts.
if [ $# -ne 4 ] && [ $# -ne 3 ]; then
   echo "Usage: $0 [options] <lang-dir> [<ivector-extractor-dir>] <nnet-dir> <output-dir>"
   echo "e.g.: $0 data/lang exp/nnet2_online/extractor exp/nnet2_online/nnet exp/nnet2_online/nnet_online"
   echo "main options (for others, see top of script file)"
   echo "  --feature-type <mfcc|plp|fbank>                  # Type of the base features; "
   echo "                                                   # important to generate the correct"
   echo "                                                   # configs in <output-dir>/conf/"
   echo "  --add-pitch <true|false>                         # Append pitch features to cmvn"
   echo "                                                   # (default: false)"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   echo "  --config <config-file>                           # config containing options"
   echo "  --iter <model-iteration|final>                   # iteration of model to take."
   echo "  --stage <stage>                                  # stage to do partial re-run from."
   exit 1;
fi


if [ $# -eq 4 ]; then
  lang=$1
  iedir=$2
  srcdir=$3
  dir=$4
else
  [ $# -eq 3 ] || exit 1;
  lang=$1
  iedir=
  srcdir=$2
  dir=$3
fi

for f in $lang/phones/silence.csl $srcdir/${iter}.mdl $srcdir/tree; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
if [ ! -z "$iedir" ]; then
  for f in final.{mat,ie,dubm} splice_opts global_cmvn.stats online_cmvn.conf; do
    [ ! -f $iedir/$f ] && echo "$0: no such file $iedir/$f" && exit 1;
  done
  if $add_pitch; then
    iedim=`matrix-dim $iedir/final.mat | awk '{print $1}'`
    amdim=`nnet3-am-info $srcdir/${iter}.mdl | grep "input-dim:" | awk '{print $2}'`
    [ $(($amdim-$iedim)) -eq 0 ] && echo "$0: remove pitch from the input of ivector extractor" && exit 1;
  fi
fi


dir=$(utils/make_absolute.sh $dir) # Convert $dir to an absolute pathname, so that the
                        # configuration files we write will contain absolute
                        # pathnames.
mkdir -p $dir/conf

utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

cp $srcdir/${iter}.mdl $dir/final.mdl || exit 1;
cp $srcdir/tree $dir/ || exit 1;
if [ -f $srcdir/frame_subsampling_factor ]; then
  cp $srcdir/frame_subsampling_factor $dir/
fi


if [ ! -z "$iedir" ]; then
  mkdir -p $dir/ivector_extractor/
  cp $iedir/final.{mat,ie,dubm} $iedir/global_cmvn.stats $dir/ivector_extractor/ || exit 1;

  # The following things won't be needed directly by the online decoding, but
  # will allow us to run prepare_online_decoding.sh again with
  # $dir/ivector_extractor/ as the input directory (useful in certain
  # cross-system training scenarios).
  cp $iedir/splice_opts $iedir/online_cmvn.conf $dir/ivector_extractor/ || exit 1;
fi


mkdir -p $dir/conf
rm $dir/{plp,mfcc,fbank}.conf 2>/dev/null
echo "$0: preparing configuration files in $dir/conf"

if [ -f $dir/conf/online.conf ]; then
  echo "$0: moving $dir/conf/online.conf to $dir/conf/online.conf.bak"
  mv $dir/conf/online.conf $dir/conf/online.conf.bak
fi

conf=$dir/conf/online.conf
echo -n >$conf

echo "--feature-type=$feature_type" >>$conf

# Record the config option for the selected base feature type in online.conf
# and copy the corresponding feature config into $dir/conf.
case "$feature_type" in
  mfcc)
    echo "--mfcc-config=$dir/conf/mfcc.conf" >>$conf
    cp $mfcc_config $dir/conf/mfcc.conf || exit 1;;
  plp)
    echo "--plp-config=$dir/conf/plp.conf" >>$conf
    cp $plp_config $dir/conf/plp.conf || exit 1;;
  fbank)
    echo "--fbank-config=$dir/conf/fbank.conf" >>$conf
    cp $fbank_config $dir/conf/fbank.conf || exit 1;;
  *)
    # No feature config can be written for an unrecognized type, so stop here
    # instead of producing an incomplete online.conf.
    echo "Unknown feature type $feature_type"
    exit 1;;
esac

cp $online_cmvn_config $dir/conf/online_cmvn.conf || exit 1;

# When an i-vector extractor directory was supplied, write
# $dir/conf/ivector_extractor.conf and reference it from online.conf.
if [ ! -z "$iedir" ]; then
  ieconf=$dir/conf/ivector_extractor.conf
  echo -n >$ieconf
  echo "--ivector-extraction-config=$ieconf" >>$conf

  # make sure that the online_cmvn config for i-extractor is same
  # as the one passed in with '--online_cmvn_config'
  ivec_cmvn_config=$iedir/online_cmvn.conf
  # Test cmp's exit status directly (no command substitution needed).
  if ! cmp --silent "$online_cmvn_config" "$ivec_cmvn_config"; then
    echo "Error, configs must be the same:
      \$online_cmvn_config=$online_cmvn_config
      \$ivec_cmvn_config=$ivec_cmvn_config"
    exit 1;
  fi

  # the next line puts each option from splice_opts on its own line in the config.
  for x in $(cat $iedir/splice_opts); do echo "$x"; done > $dir/conf/splice.conf
  echo "--splice-config=$dir/conf/splice.conf" >>$ieconf
  echo "--cmvn-config=$dir/conf/online_cmvn.conf" >>$ieconf
  echo "--lda-matrix=$dir/ivector_extractor/final.mat" >>$ieconf
  echo "--global-cmvn-stats=$dir/ivector_extractor/global_cmvn.stats" >>$ieconf
  echo "--diag-ubm=$dir/ivector_extractor/final.dubm" >>$ieconf
  echo "--ivector-extractor=$dir/ivector_extractor/final.ie" >>$ieconf
  echo "--num-gselect=$num_gselect"  >>$ieconf
  echo "--min-post=$min_post" >>$ieconf
  echo "--posterior-scale=$posterior_scale" >>$ieconf # this is currently the default in the scripts.
  echo "--max-remembered-frames=1000" >>$ieconf # the default
  echo "--max-count=$max_count" >>$ieconf
  echo "--ivector-period=$ivector_period" >>$ieconf
  # activate online-cmvn for the i-extractor, not only the ubm,
  # NOTE(review): the test is on $srcdir/online_cmvn while the file copied is
  # $iedir/online_cmvn_iextractor; confirm that this pairing is intended.
  if [ -f $srcdir/online_cmvn ]; then
    cp $iedir/online_cmvn_iextractor $dir/ivector_extractor/ || exit 1
    echo "--online-cmvn-iextractor=true" >>$ieconf
  fi
fi

if $add_pitch; then
  echo "$0: enabling pitch features"
  echo "--add-pitch=true" >>$conf
  echo "$0: creating $dir/conf/online_pitch.conf"
  if [ ! -f $online_pitch_config ]; then
    echo "$0: expected file '$online_pitch_config' to exist.";
    exit 1;
  fi
  cp $online_pitch_config $dir/conf/online_pitch.conf || exit 1;
  echo "--online-pitch-config=$dir/conf/online_pitch.conf" >>$conf
fi

silphonelist=`cat $lang/phones/silence.csl` || exit 1;
echo "--endpoint.silence-phones=$silphonelist" >>$conf

# activate the online-cmvn in nnet input features,
if [ -f $srcdir/online_cmvn ]; then
  cp $srcdir/online_cmvn $dir/
  cp $srcdir/global_cmvn.stats $dir/
  echo "--cmvn-config=$dir/conf/online_cmvn.conf" >>$conf
  echo "--global-cmvn-stats=$dir/global_cmvn.stats" >>$conf
fi

echo "$0: created config file $conf"


================================================
FILE: egs/steps/online/prepare_online_decoding.sh
================================================
#!/usr/bin/env bash

# Copyright 2014  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0

# Begin configuration.
stage=0 # This allows restarting partway through, when something went wrong.
feature_type=mfcc
online_cmvn_config=conf/online_cmvn.conf
add_pitch=false
pitch_config=conf/pitch.conf
pitch_process_config=conf/pitch_process.conf
per_utt_basis=true # If true, then treat each utterance as a separate speaker
                   # for purposes of basis training... this is recommended if
                   # the number of actual speakers in your training set is less
                   # than (feature-dim) * (feature-dim+1).
per_utt_cmvn=false # If true, apply online CMVN normalization per utterance
                   # rather than per speaker.
silence_weight=0.01
cmd=run.pl
cleanup=true
# End configuration.

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh;
. parse_options.sh || exit 1;

if [ $# -ne 4 -a $# -ne 5 ]; then
   echo "Usage: $0 [options] <data-dir> <lang-dir> <sat-model-dir> [<MMI-model>] <output-dir>"
   echo "e.g.: $0 data/train data/lang exp/tri3b exp/tri3b_mmi/final.mdl exp/tri3b_online"
   echo "main options (for others, see top of script file)"
   echo "  --feature-type <mfcc|plp>                        # Type of the base features; "
   echo "                                                   # important to generate the correct"
   echo "                                                   # configs in <output-dir>/conf/"
   echo "  --online-cmvn-config <config>                    # config for online cmvn,"
   echo "                                                   # default conf/online_cmvn.conf"
   echo "  --add-pitch <true|false>                         # Append pitch features to cmvn"
   echo "                                                   # (default: false)"
   echo "  --per-utt-cmvn <true|false>                      # Apply online CMVN per utt, not"
   echo "                                                   # per speaker (default: false)"
   echo "  --per-utt-basis <true|false>                     # Do basis computation per utterance"
   echo "                                                   # (default: true)"
   echo "  --silence-weight <weight>                        # Weight on silence for basis fMLLR;"
   echo "                                                   # default 0.01."
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   echo "  --config <config-file>                           # config containing options"
   echo "  --stage <stage>                                  # stage to do partial re-run from."
   exit 1;
fi


if [ $# -eq 5 ]; then
  data=$1
  lang=$2
  srcdir=$3
  mmi_model=$4
  dir=$5
else
  data=$1
  lang=$2
  srcdir=$3
  mmi_model=$srcdir/final.mdl
  dir=$4
fi


for f in $srcdir/final.mdl $srcdir/ali.1.gz $data/feats.scp $lang/phones.txt \
    $mmi_model $online_cmvn_config; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

nj=`cat $srcdir/num_jobs` || exit 1;
sdata=$data/split$nj;
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;

mkdir -p $dir/log
echo $nj >$dir/num_jobs || exit 1;

utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
silphonelist=`cat $lang/phones/silence.csl` || exit 1;
cp $srcdir/splice_opts $srcdir/cmvn_opts $srcdir/final.mat $srcdir/final.mdl $dir/ 2>/dev/null

cp $mmi_model $dir/final.rescore_mdl

# Set up the unadapted features "$sifeats".
if [ -f $dir/final.mat ]; then feat_type=lda; else feat_type=delta; fi

# --per-utt-cmvn=true means "normalize per utterance", so the speaker map must
# NOT be handed to apply-cmvn-online in that case; it is only passed when
# per-speaker normalization is wanted (the default).
if $per_utt_cmvn; then
  online_cmvn_spk2utt_opt=
else
  online_cmvn_spk2utt_opt="--spk2utt=ark:$sdata/JOB/spk2utt"
fi


# create global_cmvn.stats
if ! matrix-sum --binary=false scp:$data/cmvn.scp - >$dir/global_cmvn.stats 2>/dev/null; then
  echo "$0: Error summing cmvn stats"
  exit 1
fi

# Start from an empty value so nothing inherited from the environment leaks
# into the apply-cmvn-online command lines below.
skip_opt=
if $add_pitch; then
  skip_opt="--skip-dims=13:14:15" # should make this more general.
fi

echo "$0: feature type is $feat_type";
# In both branches every "--option" given to apply-cmvn-online precedes the
# positional arguments (global stats, features in, features out).
case $feat_type in
  delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |"
        online_sifeats="ark,s,cs:apply-cmvn-online $skip_opt --config=$online_cmvn_config $online_cmvn_spk2utt_opt $dir/global_cmvn.stats scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
  lda) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
       online_sifeats="ark,s,cs:apply-cmvn-online $skip_opt --config=$online_cmvn_config $online_cmvn_spk2utt_opt $dir/global_cmvn.stats scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |";;
  *) echo "Invalid feature type $feat_type" && exit 1;
esac

# Set up the adapted features "$feats" for training set.
if [ -f $srcdir/trans.1 ]; then
  feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$srcdir/trans.JOB ark:- ark:- |";
else
  feats="$sifeats";
fi


# Choose whether basis-fMLLR statistics are accumulated per utterance (no
# speaker map passed) or per speaker (speaker map passed via --spk2utt).
if $per_utt_basis; then
  spk2utt_opt=  # treat each utterance as separate speaker when computing basis.
  echo "Doing per-utterance adaptation for purposes of computing the basis."
else
  echo "Doing per-speaker adaptation for purposes of computing the basis."
  # Count speakers from the un-split data directory: every other use of $sdata
  # in this script goes through the per-job $sdata/JOB/ subdirectories.
  [ `cat $data/spk2utt | wc -l` -lt $((41*40)) ] && \
    echo "Warning: number of speakers is small, might be better to use --per-utt-basis=true."
  spk2utt_opt="--spk2utt=ark:$sdata/JOB/spk2utt"
fi

if [ $stage -le 0 ]; then
  echo "$0: Accumulating statistics for basis-fMLLR computation"
# Note: we get Gaussian level alignments with the "final.mdl" and the
# speaker adapted features.
  $cmd JOB=1:$nj $dir/log/basis_acc.JOB.log \
    ali-to-post "ark:gunzip -c $srcdir/ali.JOB.gz|" ark:- \| \
    weight-silence-post $silence_weight $silphonelist $dir/final.mdl ark:- ark:- \| \
    gmm-post-to-gpost $dir/final.mdl "$feats" ark:- ark:- \| \
    gmm-basis-fmllr-accs-gpost $spk2utt_opt \
    $dir/final.mdl "$sifeats" ark,s,cs:- $dir/basis.acc.JOB || exit 1;
fi

if [ $stage -le 1 ]; then
  echo "$0: computing the basis matrices."
  $cmd $dir/log/basis_training.log \
    gmm-basis-fmllr-training $dir/final.mdl $dir/fmllr.basis $dir/basis.acc.* || exit 1;
  if $cleanup; then
    rm $dir/basis.acc.* 2>/dev/null
  fi
fi

if [ $stage -le 2 ]; then
  echo "$0: accumulating stats for online alignment model."

  # Accumulate stats for "online alignment model"-- this model is computed with
  # the speaker-independent features and online CMVN, but matches
  # Gaussian-for-Gaussian with the final speaker-adapted model.

  $cmd JOB=1:$nj $dir/log/acc_alimdl.JOB.log \
    ali-to-post "ark:gunzip -c $srcdir/ali.JOB.gz|" ark:-  \| \
    gmm-acc-stats-twofeats $dir/final.mdl "$feats" "$online_sifeats" \
    ark,s,cs:- $dir/final.JOB.acc || exit 1;
  [ `ls $dir/final.*.acc | wc -w` -ne "$nj" ] && echo "$0: Wrong #accs" && exit 1;
  # Update model.
  $cmd $dir/log/est_online_alimdl.log \
    gmm-est --remove-low-count-gaussians=false $dir/final.mdl \
    "gmm-sum-accs - $dir/final.*.acc|" $dir/final.oalimdl  || exit 1;
  if $cleanup; then
    rm $dir/final.*.acc
  fi
fi

if [ $stage -le 3 ]; then
  mkdir -p $dir/conf
  rm $dir/{plp,mfcc}.conf 2>/dev/null
  echo "$0: preparing configuration files in $dir/conf"
  if [ -f $dir/conf/online_decoding.conf ]; then
    echo "$0: moving $dir/conf/online_decoding.conf to $dir/conf/online_decoding.conf.bak"
    mv $dir/conf/online_decoding.conf $dir/conf/online_decoding.conf.bak
  fi
  conf=$dir/conf/online_decoding.conf
  echo -n >$conf
  # Add the options for the selected base feature type to the decoding config
  # and copy the recipe's feature config into $dir/conf.
  case "$feature_type" in
    mfcc)
      echo "$0: creating $dir/conf/mfcc.conf"
      echo "--mfcc-config=$dir/conf/mfcc.conf" >>$conf
      cp conf/mfcc.conf $dir/conf/ ;;
    plp)
      echo "$0: enabling plp features"
      echo "--feature-type=plp" >>$conf
      echo "$0: creating $dir/conf/plp.conf"
      echo "--plp-config=$dir/conf/plp.conf" >>$conf
      cp conf/plp.conf $dir/conf/ ;;
    *)
      # Without a known feature type the config would lack its feature
      # options, so abort instead of writing an unusable file.
      echo "Unknown feature type $feature_type"
      exit 1;;
  esac
  if ! cp $online_cmvn_config $dir/conf/online_cmvn.conf; then
    echo "$0: error copying online cmvn config to $dir/conf/"
    exit 1;
  fi
  echo "--cmvn-config=$dir/conf/online_cmvn.conf" >>$conf
  if [ -f $dir/final.mat ]; then
    echo "$0: enabling feature splicing"
    echo "--splice-feats" >>$conf
    echo "$0: creating $dir/conf/splice.conf"
    for x in $(cat $dir/splice_opts); do echo $x; done > $dir/conf/splice.conf
    echo "--splice-config=$dir/conf/splice.conf" >>$conf
    echo "$0: enabling LDA"
    echo "--lda-matrix=$dir/final.mat" >>$conf
  else
    echo "$0: enabling deltas"
    echo "--add-deltas" >>$conf
  fi
  if $add_pitch; then
    echo "$0: enabling pitch features"
    echo "--add-pitch" >>$conf
    echo "$0: creating $dir/conf/pitch.conf"
    echo "--pitch-config=$dir/conf/pitch.conf" >>$conf
    if ! cp $pitch_config $dir/conf/pitch.conf; then
      echo "$0: error copying pitch config to $dir/conf/"
      exit 1;
    fi;
    echo "$0: creating $dir/conf/pitch_process.conf"
    echo "--pitch-process-config=$dir/conf/pitch_process.conf" >>$conf
    if ! cp $pitch_process_config $dir/conf/pitch_process.conf; then
      echo "$0: error copying pitch process config to $dir/conf/"
      exit 1;
    fi;
    nfields=$(sed -n '2,2p' $dir/global_cmvn.stats | \
      perl -e '$_ = <>; s/^\s+|\s+$//g; print scalar(split);');
    if [ $nfields != 17 ]; then
      echo "$0: $dir/global_cmvn.stats has $nfields entries per row (expected 17)."
      echo "$0: Did you append pitch features?"
      exit 1;
    fi
    #offset=$(sed -n '2,2p' $dir/global_cmvn.stats | \
    #  perl -e '$_ = <>; s/^\s+|\s+$//g; ($t, $c) = (split)[13, 16]; print -$t/$c;');
    #echo "--pov-offset=$offset" >>$dir/conf/pitch_process.conf
  fi

  echo "--fmllr-basis=$dir/fmllr.basis" >>$conf
  echo "--online-alignment-model=$dir/final.oalimdl" >>$conf
  echo "--model=$dir/final.mdl" >>$conf
  if ! cmp --quiet $dir/final.mdl $dir/final.rescore_mdl; then
    echo "--rescore-model=$dir/final.rescore_mdl" >>$conf
  fi
  echo "--silence-phones=$silphonelist" >>$conf
  echo "--endpoint.silence-phones=$silphonelist" >>$conf
  echo "--global-cmvn-stats=$dir/global_cmvn.stats" >>$conf
  echo "$0: created config file $conf"
fi


================================================
FILE: egs/steps/oracle_wer.sh
================================================
#!/usr/bin/env bash

# Copyright Johns Hopkins University (Author: Daniel Povey)  2013
# Apache 2.0.

# Begin configuration section.
wildcard_symbols=
cmd=run.pl
acwt=0.08333
beam=
stage=0
cleanup=true
# End configuration section.

. utils/parse_options.sh

echo "$0 $@"  # Print the command line for logging

if [ $# != 3 ]; then
   echo "Compute lattice oracle WER and depth, optionally pruning and minimizing the lattice"
   echo "beforehand.  To produce oracle WER, requires there to be a file 'text' in data dir"
   echo "(not usable if only stm is present)"
   echo ""
   echo "Usage: $0 [options] <data-dir> <lang-dir> <decode-dir>"
   echo "e.g.: $0 --wildcard-symbols=1:3:4 data/test data/lang exp/tri5/test_tg"
   echo "Options:"
   echo "  --wildcard-symbols <colon-separated-integer-list>  # Allows you to specify words"
   echo "                                                     # to be removed from both reference"
   echo "                                                     # and hypothesis before computing oracle."
   echo "  --cmd <cmd>                                        # How to run the jobs (default: run.pl)"
   echo "  --acwt <acwt>                                      # Acoustic scale, default $acwt: only"
   echo "                                                     # has an effect if --prune option used."
   echo "  --beam <prune-beam, e.g. 6.0>                      # Lattice pruning beam (optional; can"
   echo "                                                     # be used to compute oracle and depth at"
   echo "                                                     # various beams."
   echo "  --stage <stage>                                    # Used to control partial re-runs"
   echo "  --cleanup <true|false>                             # If true, remove pruned lattices."
   exit 1;
fi

. ./path.sh || exit 1;

data=$1
lang=$2
dir=$3


for f in $data/text $lang/words.txt $dir/lat.1.gz; do
  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1;
done

# Number of jobs comes from the decode directory; the OOV symbol id is needed
# later for "sym2int.pl --map-oov", so a failed read must stop the script.
nj=`cat $dir/num_jobs` || exit 1;
oov_sym=`cat $lang/oov.int` || exit 1;
sdata=$data/split$nj;
split_data.sh $data $nj || exit 1;

nl=$(grep -v IGNORE_TIME_SEGMENT_IN_SCORING $data/text | wc -l)
if [ $nl -eq 0 ]; then
  echo "$0: error: $data/text only contains IGNORE_TIME_SEGMENT_IN_SCORING, or is empty."
  exit 1;
fi

if [ ! -z "$beam" ]; then
  prunedir=${dir}/lats_beam${beam}
  mkdir -p $prunedir/log
  
  if [ $stage -le 0 ]; then
    echo "$0: creating pruned lattices"
    $cmd JOB=1:$nj $prunedir/log/prune.JOB.log \
      lattice-prune --acoustic-scale=$acwt --beam=$beam  \
        "ark:gunzip -c $dir/lat.JOB.gz|" "ark:|gzip -c >$prunedir/lat.JOB.gz" || exit 1;
  fi
else
  prunedir=$dir
fi

mkdir -p $prunedir/log


if [ $stage -le 1 ]; then
  echo "$0: measuring lattice depth"
  $cmd JOB=1:$nj $prunedir/log/lattice_depth.JOB.log \
    lattice-depth "ark:gunzip -c $prunedir/lat.JOB.gz|" ark:/dev/null || exit 1;

  # look for lines like: LOG (blah:blah.cc:95) Overall density is 153.3 over 164361 frames
  grep -w Overall $prunedir/log/lattice_depth.*.log | \
    awk -v nj=$nj '{num+=$6*$8; den+=$8; nl++} END{ 
      if (nl != nj) { print "Error: expected " nj " lines, got " nl | "cat 1>&2"; }
      printf("%.2f ( %d / %d )\n", num/den, num, den); }' > $prunedir/depth || exit 1;
  echo -n "Depth is: "
  cat $prunedir/depth
fi


if [ $stage -le 2 ]; then
  echo "$0: measuring lattice oracle WER"
  $cmd JOB=1:$nj $prunedir/log/lattice_oracle.JOB.log \
    lattice-oracle --wildcard-symbols=$wildcard_symbols  \
    "ark:gunzip -c $prunedir/lat.JOB.gz|" \
   "ark:sym2int.pl --map-oov $oov_sym -f 2- $lang/words.txt $sdata/JOB/text | grep -v IGNORE_TIME_SEGMENT_IN_SCORING |"  \
   ark:/dev/null || exit 1;

  # look for lines like: LOG (blah:blah.cc:95) Overall %WER 25.6 [ 1243 / 6331, ... ]  
  grep -w Overall $prunedir/log/lattice_oracle.*.log | \
    awk -v nj=$nj '{num+=$7; den+=$9; ins+=$10; del+=$12; sb+=$14; nl++} END{ 
      if (nl != nj) { print "Error: expected " nj " lines, got " nl | "cat 1>&2"; }
      printf("%.2f%% [ %d / %d, %d insertions, %d deletions, %d substitutions ]\n", (100.0 * num/den), num, den, ins, del, sb); }' > \
      $prunedir/oracle_wer || exit 1;
  echo -n "Oracle WER is: "
  cat $prunedir/oracle_wer
fi

# Pruned lattices only exist in a separate directory when --beam was given;
# without it $prunedir is the decode directory itself and nothing is removed.
if $cleanup && [ ! -z "$beam" ]; then
  echo "$0: removing pruned lattices in $prunedir"
  rm $prunedir/lat.*.gz
fi

exit 0;


================================================
FILE: egs/steps/overlap/get_overlap_segments.py
================================================
#! /usr/bin/env python3
# Copyright   2020   Desh Raj
# Apache 2.0.
"""This script takes an input RTTM and transforms it in a 
particular way: all overlapping segments are re-labeled 
as "overlap". This is useful for 2 cases: 
1. By retaining just the overlap segments (grep overlap),
the resulting RTTM can be used to train an overlap
detector.
2. By retaining just the non-overlap segments (grep -v overlap),
the resulting file can be used to obtain (fairly) clean 
speaker embeddings from the single-speaker regions of the
recording.
The output is written to stdout.
"""

import argparse, os
import itertools
from collections import defaultdict

def get_args():
    """Parse the command line.

    Returns the argparse namespace holding ``label`` (name given to overlap
    segments, default "overlap") and ``input_rttm`` (path of the RTTM to read).
    """
    arg_parser = argparse.ArgumentParser(
        description="""This script filters an RTTM in several ways.""",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    arg_parser.add_argument(
        "--label", type=str, default="overlap",
        help="Label for the overlap segments")
    arg_parser.add_argument(
        "input_rttm", type=str,
        help="path of input rttm file")
    return arg_parser.parse_args()

class Segment:
    """A time span inside one recording, tagged with a speaker/label id.

    The span is given by ``start_time`` plus either ``dur`` or ``end_time``;
    whichever of the two is missing is derived from the other. When ``dur``
    is supplied it takes precedence over ``end_time``.
    """

    def __init__(self, reco_id, start_time, dur = None, end_time = None, spk_id = None):
        self.reco_id = reco_id
        self.start_time = start_time
        if dur is not None:
            # Duration known: derive the end point.
            self.dur = dur
            self.end_time = start_time + dur
        else:
            # Only the end point known: derive the duration.
            self.end_time = end_time
            self.dur = end_time - start_time
        self.spk_id = spk_id

def groupby(iterable, keyfunc):
    """Yield ``(key, group)`` pairs like ``itertools.groupby``, sorting first.

    Sorting by ``keyfunc`` beforehand guarantees that each key appears in
    exactly one group.
    """
    yield from itertools.groupby(sorted(iterable, key=keyfunc), keyfunc)

def find_overlapping_segments(segs, label):
    """Return the regions of one recording where two or more segments overlap.

    Every segment contributes a "BEG" and an "END" event; the events are
    walked in time order while counting how many segments are open. A region
    starts when the count reaches 2 and is emitted when it drops back to 1.

    Args:
        segs: non-empty list of Segment objects from a single recording.
        label: value stored as ``spk_id`` on each returned Segment.

    Returns:
        List of Segment objects (zero-length ones may be included).
    """
    reco_id = segs[0].reco_id
    events = []
    for seg in segs:
        events.extend([("BEG", seg.start_time), ("END", seg.end_time)])
    # Stable sort on time only: events with equal times keep insertion order.
    events.sort(key=lambda ev: ev[1])

    overlaps = []
    active = 0
    region_start = 0
    for kind, time in events:
        if kind == "BEG":
            active += 1
            if active == 2:
                region_start = time
            continue
        active -= 1
        if active == 1:
            overlaps.append(Segment(reco_id, region_start, end_time=time, spk_id=label))

    return overlaps

def find_single_speaker_segments(segs):
    """Return the regions of one recording where exactly one speaker is active.

    Every segment contributes a "BEG" and an "END" event carrying its speaker
    id; the events are walked in time order while tracking the set of active
    speakers. A region opens when the set size becomes 1 and is emitted when
    the size becomes 2 (someone joins) or 0 (the speaker stops).

    Args:
        segs: non-empty list of Segment objects from a single recording.

    Returns:
        List of Segment objects whose ``spk_id`` is the lone active speaker
        (zero-length ones may be included).
    """
    # Function-scope import: only needed to report malformed input.
    import warnings

    reco_id = segs[0].reco_id
    tokens = []
    for seg in segs:
        tokens.append(("BEG", seg.start_time, seg.spk_id))
        tokens.append(("END", seg.end_time, seg.spk_id))
    # Stable sort on time only: events with equal times keep insertion order.
    sorted_tokens = sorted(tokens, key=lambda x: x[1])

    single_speaker_segs = []
    running_spkrs = set()
    for kind, time, spkr in sorted_tokens:
        if kind == "BEG":
            running_spkrs.add(spkr)
            if len(running_spkrs) == 1:
                seg_begin = time
                cur_spkr = spkr
            elif len(running_spkrs) == 2:
                single_speaker_segs.append(Segment(reco_id, seg_begin, end_time=time, spk_id=cur_spkr))
        elif kind == "END":
            try:
                running_spkrs.remove(spkr)
            except KeyError:
                # Happens when the same speaker has overlapping segments: the
                # first END already removed it. Report it and carry on.
                warnings.warn("Speaker not found: {}".format(spkr))
            if len(running_spkrs) == 1:
                seg_begin = time
                cur_spkr = list(running_spkrs)[0]
            elif len(running_spkrs) == 0:
                single_speaker_segs.append(Segment(reco_id, seg_begin, end_time=time, spk_id=cur_spkr))

    return single_speaker_segs

def main():
    """Read an RTTM, relabel overlapped regions, and print the result.

    The input RTTM is split per recording into overlap regions (labelled with
    ``--label``) and single-speaker regions; the combined list is sorted by
    recording and start time and written to stdout as RTTM "SPEAKER" lines.
    Regions with non-positive duration are not printed.
    """
    args = get_args()

    # First we read all segments and store as a list of objects
    segments = []
    with open(args.input_rttm, 'r') as f:
        for line in f:
            parts = line.strip().split()
            if not parts:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            segments.append(Segment(parts[1], float(parts[3]), dur=float(parts[4]), spk_id=parts[7]))

    # We group the segment list into a dictionary indexed by reco_id
    reco2segs = defaultdict(list,
        {reco_id : list(g) for reco_id, g in groupby(segments, lambda x: x.reco_id)})

    overlap_segs = []
    single_speaker_segs = []
    for segs in reco2segs.values():
        overlap_segs.extend(find_overlapping_segments(segs, args.label))
        single_speaker_segs.extend(find_single_speaker_segments(segs))
    final_segs = sorted(overlap_segs + single_speaker_segs, key = lambda x: (x.reco_id, x.start_time))

    rttm_str = "SPEAKER {0} 1 {1:7.3f} {2:7.3f} <NA> <NA> {3} <NA> <NA>"
    for seg in final_segs:
        if (seg.dur > 0):
            print(rttm_str.format(seg.reco_id, seg.start_time, seg.dur, seg.spk_id))


if __name__ == '__main__':
    main()


================================================
FILE: egs/steps/overlap/get_overlap_targets.py
================================================
#!/usr/bin/env python

# Copyright 2020  Desh Raj (Johns Hopkins University)
# Apache 2.0

# This script prepares targets for whole recordings for training
# an overlap detector system. It just takes as input a data dir
# where it is assumed that MFCC features have already been
# extracted. It also takes an overlap RTTM file containing
# "single" and "overlap" segments, ideally generated using the
# get_overlap_segments.py script. It uses these segments to 
# obtain per-frame targets for the recordings in the format:
# [ silence single overlap ]

from __future__ import division

import argparse
import logging
import numpy as np
import subprocess
import sys
import itertools
from collections import defaultdict

sys.path.insert(0, 'steps')
import libs.common as common_lib


def get_args():
    """Parse and validate the command line.

    Returns:
        The argparse namespace with ``frame_shift``, ``label_smoothing``,
        ``reco2num_frames``, ``overlap_rttm`` and ``out_targets_ark``.

    Raises:
        ValueError: if ``--frame-shift`` is outside [0.0001, 1] or
            ``--label-smoothing`` is outside [0, 1].
    """
    parser = argparse.ArgumentParser(
        description="""This script prepares targets for whole recordings for training
            an overlap detector system. It just takes as input a data dir
            where it is assumed that MFCC features have already been
            extracted. It also takes an overlap RTTM file containing
            "single" and "overlap" segments, ideally generated using the
            get_overlap_segments.py script. It uses these segments to 
            obtain per-frame targets for the recordings in the format:
            [ silence single overlap ]
        """)

    parser.add_argument("--frame-shift", type=float, default=0.01,
                        help="Frame shift value in seconds")
    # The trailing space keeps the two adjacent literals from fusing into
    # "applyto" in the rendered help text.
    parser.add_argument("--label-smoothing", type=float, default=0.0,
                        help="Value between 0 and 1. Amount of label smoothing to apply "
                        "to get soft labels instead of one-hot labels")
    parser.add_argument("reco2num_frames", type=str,
                        help="""The number of frames per reco
                        is used to determine the num-rows of the output matrix
                        """)
    parser.add_argument("overlap_rttm", type=str,
                        help="Input RTTM file containing single and overlap segments")
    parser.add_argument("out_targets_ark", type=str,
                        help="""Output archive to which the
                        recording-level matrix will be written in text
                        format""")

    args = parser.parse_args()

    if args.frame_shift < 0.0001 or args.frame_shift > 1:
        raise ValueError("--frame-shift should be in [0.0001, 1]; got {0}"
                         "".format(args.frame_shift))
    # Enforce the range the help text documents for --label-smoothing.
    if not 0.0 <= args.label_smoothing <= 1.0:
        raise ValueError("--label-smoothing should be in [0, 1]; got {0}"
                         "".format(args.label_smoothing))
    return args

class Segment:
    """A labelled time span inside one recording.

    The span is given by ``start_time`` plus either ``dur`` or ``end_time``;
    whichever of the two is missing is derived from the other. When ``dur``
    is supplied it takes precedence over ``end_time``.
    """
    # Class-level defaults; the constructor overrides all of these except
    # spk_id, and additionally sets the instance attribute `label`.
    reco_id = ''
    spk_id = ''
    start_time = 0
    dur = 0
    end_time = 0

    def __init__(self, reco_id, start_time, dur = None, end_time = None, label = None):
        self.reco_id = reco_id
        self.start_time = start_time
        if dur is not None:
            # Duration known: derive the end point.
            self.dur = dur
            self.end_time = start_time + dur
        else:
            # Only the end point known: derive the duration.
            self.end_time = end_time
            self.dur = end_time - start_time
        self.label = label

def groupby(iterable, keyfunc):
    """Yield ``(key, group)`` pairs like ``itertools.groupby``, sorting first.

    Sorting by ``keyfunc`` beforehand guarantees that each key appears in
    exactly one group.
    """
    ordered = sorted(iterable, key=keyfunc)
    for pair in itertools.groupby(ordered, keyfunc):
        yield pair

def run(args):
    """Build per-frame [silence, single, overlap] targets for every recording.

    Reads the number of frames per recording from `args.reco2num_frames` and
    the labelled segments from the RTTM file `args.overlap_rttm`, fills one
    (num_frames x 3) matrix per recording and writes all matrices, sorted by
    recording id, to `args.out_targets_ark` in text format.

    Frames not covered by any segment keep the silence target. A segment
    labelled "overlap" gets the overlap target; every other label is treated
    as single-speaker. Label smoothing moves `args.label_smoothing` of the
    probability mass from the target class to the two other classes.

    Raises:
        ValueError: if a line of either input file cannot be parsed.
    """
    # Get all reco to num_frames, which will be used to decide the number of
    # rows of matrix
    reco2num_frames = {}
    with common_lib.smart_open(args.reco2num_frames) as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) != 2:
                raise ValueError("Could not parse line {0}".format(line))
            reco2num_frames[parts[0]] = int(parts[1])

    # Group the RTTM segments by recording id. Fields used from each line:
    # [1] recording id, [3] start time, [4] duration, [7] label.
    reco2segs = defaultdict(list)
    with common_lib.smart_open(args.overlap_rttm) as f:
        for line in f:
            parts = line.strip().split()
            if not parts:
                continue
            if len(parts) < 8:
                raise ValueError("Could not parse RTTM line {0}".format(line))
            reco2segs[parts[1]].append(
                Segment(parts[1], float(parts[3]), dur=float(parts[4]),
                        label=parts[7]))

    # The three target rows do not depend on the recording, so build them
    # once. The builtin `float` is used as dtype because the `np.float`
    # alias no longer exists in recent NumPy releases.
    target_val = 1 - args.label_smoothing
    other_val = args.label_smoothing / 2
    silence_vec = np.array([target_val, other_val, other_val], dtype=float)
    single_vec = np.array([other_val, target_val, other_val], dtype=float)
    overlap_vec = np.array([other_val, other_val, target_val], dtype=float)

    # Now, for each reco, create a matrix of shape num_frames x 3 and fill in
    # using the segments information for that reco
    reco2targets = {}
    for reco_id, total_frames in reco2num_frames.items():
        segs = sorted(reco2segs[reco_id], key=lambda x: x.start_time)

        # The default target (if not single or overlap) is silence
        targets_mat = np.tile(silence_vec, (total_frames, 1))

        # Segments are applied in order of start time, so a later segment
        # overwrites an earlier one wherever they intersect.
        for seg in segs:
            start_frame = int(seg.start_time / args.frame_shift)
            # Clip the segment to the length of the recording.
            end_frame = min(int(seg.end_time / args.frame_shift), total_frames)
            if end_frame <= start_frame:
                continue
            if seg.label == "overlap":
                targets_mat[start_frame:end_frame] = overlap_vec
            else:
                targets_mat[start_frame:end_frame] = single_vec

        reco2targets[reco_id] = targets_mat

    with common_lib.smart_open(args.out_targets_ark, 'w') as f:
        for reco_id in sorted(reco2targets.keys()):
            common_lib.write_matrix_ascii(f, reco2targets[reco_id].tolist(), key=reco_id)

def main():
    """Script entry point: parse the command line, then build the targets.

    Any exception raised while running propagates unchanged to the caller.
    """
    run(get_args())

if __name__ == "__main__":
    main()

================================================
FILE: egs/steps/overlap/output_to_rttm.py
================================================
#!/usr/bin/env python

# Copyright 2017  Vimal Manohar
#           2018  Capital One (Author: Zhiyuan Guan)
# Apache 2.0

"""
This script converts frame-level overlap detector marks (in kaldi
integer vector text archive format) into an RTTM file.
The input integer vectors are expected to contain '1' for silence frames,
'2' for speech frames of single speaker, and '3' for overlap frames.
"""

from __future__ import print_function
import argparse
import logging
import sys

sys.path.insert(0, 'steps')
import libs.common as common_lib

# Module-level logger. Records of level INFO and above are emitted through a
# StreamHandler (which writes to sys.stderr when constructed without a
# stream); get_args() lowers both levels to DEBUG when --verbose >= 3.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setLevel(logging.INFO)
# Each record carries a timestamp, the source location and the level.
formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - "
                              "%(funcName)s - %(levelname)s ] %(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)

# Verbosity shared by the whole module; overwritten by get_args() with the
# value of --verbose and read by Segmentation.write().
global_verbose = 0


def get_args():
    """Parse the command line and configure the logging verbosity.

    Side effects: stores --verbose in the module-level `global_verbose`, logs
    the chosen verbosity, and switches `logger` and `handler` to DEBUG when
    --verbose is 3.

    Returns:
        The argparse namespace with the parsed options and the two positional
        arguments `in_ovl` and `out_rttm`.
    """
    parser = argparse.ArgumentParser(
        description="""
This script converts frame-level overlap detector marks (in kaldi
integer vector text archive format) into an RTTM file.
The input integer vectors are expected to contain 1 for silence frames,
2 for single-speaker speech frames and 3 for overlap frames.
""",
        formatter_class=argparse.RawTextHelpFormatter)

    parser.add_argument("--verbose", type=int, choices=[0, 1, 2, 3],
                        default=0, help="Higher verbosity for more logging")

    parser.add_argument("--utt2dur", type=str,
                        help="File containing durations of utterances.")

    parser.add_argument("--frame-shift", type=float, default=0.01,
                        help="Frame shift to convert frame indexes to time")

    parser.add_argument("--segment-padding", type=float, default=0.2,
                        help="Additional padding on speech segments. But we "
                             "ensure that the padding does not go beyond the "
                             "adjacent segment.")

    parser.add_argument("--min-segment-dur", type=float, default=0,
                        help="Minimum duration (in seconds) required for a segment "
                             "to be included. This is before any padding. Segments "
                             "shorter than this duration will be removed.")

    # float() accepts the string 'inf', which is how an unlimited merged
    # duration is requested.
    parser.add_argument("--merge-consecutive-max-dur", type=float, default=0,
                        help="Merge consecutive segments as long as the merged "
                             "segment is no longer than this many seconds. The segments "
                             "are only merged if their boundaries are touching. "
                             "This is after padding by --segment-padding seconds. "
                             "0 means do not merge. Use 'inf' to not limit the duration.")

    # Restrict the value to the region names Segmentation knows about, so a
    # typo is reported as a usage error instead of a KeyError later on.
    parser.add_argument("--region-type", type=str, default="overlap",
                        choices=["silence", "single", "overlap"],
                        help="Specify if overlap or single-speaker or silence region "
                        "to output in the rttm")

    parser.add_argument("in_ovl", type=str,
                        help="Input file containing alignments in "
                             "text archive format")

    parser.add_argument("out_rttm", type=str,
                        help="Output RTTM file")

    args = parser.parse_args()

    global global_verbose
    global_verbose = args.verbose

    logger.info("Setting verbosity to {0}".format(global_verbose))

    if args.verbose >= 3:
        logger.setLevel(logging.DEBUG)
        handler.setLevel(logging.DEBUG)
    return args


def to_str(segment):
    """Render a 3-element segment as ``[start, end, label]`` with the first
    two elements printed to three decimal places."""
    assert len(segment) == 3
    begin, finish, tag = segment[0], segment[1], segment[2]
    return "[{0:.3f}, {1:.3f}, {2}]".format(begin, finish, tag)


class SegmenterStats(object):
    """Counters and accumulated durations describing the post-processing
    stages applied to a segmentation."""

    # Integer counters, initialised to 0.
    _COUNT_FIELDS = ("num_segments_initial", "num_short_segments_filtered",
                     "num_merges", "num_segments_final")
    # Float accumulators, initialised to 0.0.
    _DURATION_FIELDS = ("initial_duration", "padding_duration",
                        "filter_short_duration", "final_duration")
    # Order in which the fields appear in the string representation.
    _REPORT_ORDER = ("num_segments_initial", "num_short_segments_filtered",
                     "num_merges", "num_segments_final",
                     "initial_duration", "filter_short_duration",
                     "padding_duration", "final_duration")

    def __init__(self):
        for field in self._COUNT_FIELDS:
            setattr(self, field, 0)
        for field in self._DURATION_FIELDS:
            setattr(self, field, 0.0)

    def add(self, other):
        """Accumulate every statistic of `other` into this object."""
        for field in self._COUNT_FIELDS + self._DURATION_FIELDS:
            setattr(self, field, getattr(self, field) + getattr(other, field))

    def __str__(self):
        # Each field is shown as "name=value" with dashes instead of
        # underscores in the name, separated by ", ".
        return ", ".join(
            "{0}={1}".format(field.replace("_", "-"), getattr(self, field))
            for field in self._REPORT_ORDER)


def process_label(text_label):
    """Processes an input integer label and returns a 1, 2 or 3,
    where 1 is for silence, 2 is for single-speaker region, and
    3 is for overlap.

    Arguments:
        text_label -- input label (must be convertible with int())

    Raises:
        ValueError -- if the label is not an integer or is not 1, 2 or 3.
    """
    label = int(text_label)
    if label not in (1, 2, 3):
        # The message lists the values that are actually accepted.
        raise ValueError("Expecting label to be 1 (silence), 2 (single speaker) or 3 (overlap); "
                         "got {}".format(label))

    return label


class Segmentation(object):
    """Stores the segments of one region type for a single utterance.

    Each segment is a mutable list ``[start, end, label]``: start and end are
    frame indexes multiplied by the frame shift, and label is the integer
    code of the region type selected at construction. The methods are meant
    to be called in the order used by run(): initialize_segments,
    filter_short_segments, pad_segments, merge_consecutive_segments, write.
    """

    # Maps a region-type name to the integer frame label that selects it.
    region_to_label = {'silence':1, 'single':2, 'overlap':3}

    def __init__(self, region_type):
        """Select which region type to extract.

        Raises KeyError if `region_type` is not a key of `region_to_label`.
        """
        self.segments = None
        self.label = self.region_to_label[region_type]
        self.region_type = region_type
        self.stats = SegmenterStats()

    def initialize_segments(self, alignment, frame_shift=0.01):
        """Initializes segments from input alignment.
        The alignment is frame-level overlap detection marks,
        each of which must be 1, 2, or 3.

        Every maximal run of frames whose label equals self.label becomes one
        segment; runs of the other labels are skipped. The initial and final
        statistics are set from the resulting segments."""
        self.segments = []

        assert len(alignment) > 0

        # Label and length (in frames) of the run currently being scanned.
        prev_label = None
        prev_length = 0
        for i, text_label in enumerate(alignment):
            if prev_label is not None and int(text_label) != prev_label:
                # The label changed at frame i: close the run that ended at
                # frame i - 1, keeping it only if it is of the wanted type.
                if prev_label == self.label:
                    self.segments.append(
                        [float(i - prev_length) * frame_shift,
                         float(i) * frame_shift, prev_label])
                    self.stats.initial_duration += (prev_length * frame_shift)
                prev_label = process_label(text_label)
                prev_length = 0
            elif prev_label is None:
                prev_label = process_label(text_label)

            prev_length += 1

        # Close the run that extends to the end of the alignment.
        if prev_length > 0 and prev_label == self.label:
            self.segments.append(
                [float(len(alignment) - prev_length) * frame_shift,
                 float(len(alignment)) * frame_shift, prev_label])
            self.stats.initial_duration += (prev_length * frame_shift)

        self.stats.num_segments_initial = len(self.segments)
        self.stats.num_segments_final = len(self.segments)
        self.stats.final_duration = self.stats.initial_duration

    def filter_short_segments(self, min_dur):
        """Filters out segments with durations shorter than 'min_dur'.

        Does nothing when 'min_dur' is not positive. The removed duration is
        subtracted from the final duration in the stats."""
        if min_dur <= 0:
            return

        segments_kept = []
        for segment in self.segments:
            assert segment[2] == self.label, segment
            dur = segment[1] - segment[0]
            if dur < min_dur:
                self.stats.filter_short_duration += dur
                self.stats.num_short_segments_filtered += 1
            else:
                segments_kept.append(segment)
        self.segments = segments_kept
        self.stats.num_segments_final = len(self.segments)
        self.stats.final_duration -= self.stats.filter_short_duration

    def pad_segments(self, segment_padding, max_duration=float("inf")):
        """Pads segments by duration 'segment_padding' on either sides, but
        ensures that the segments don't go beyond the neighboring segments
        or the duration of the utterance 'max_duration'.

        'max_duration' may be None, which means no upper limit. Segments are
        modified in place, and the padding actually applied is added to the
        stats."""
        if max_duration is None:
            max_duration = float("inf")
        for i, segment in enumerate(self.segments):
            assert segment[2] == self.label, segment
            segment[0] -= segment_padding  # try adding padding on the left side
            self.stats.padding_duration += segment_padding
            if segment[0] < 0.0:
                # Padding takes the segment start to before the beginning of the utterance.
                # Reduce padding.
                self.stats.padding_duration += segment[0]
                segment[0] = 0.0
            if i >= 1 and self.segments[i - 1][1] > segment[0]:
                # Padding takes the segment start to before the end the previous segment.
                # Reduce padding.
                self.stats.padding_duration -= (
                        self.segments[i - 1][1] - segment[0])
                segment[0] = self.segments[i - 1][1]

            segment[1] += segment_padding
            self.stats.padding_duration += segment_padding
            if segment[1] >= max_duration:
                # Padding takes the segment end beyond the max duration of the utterance.
                # Reduce padding.
                self.stats.padding_duration -= (segment[1] - max_duration)
                segment[1] = max_duration
            # The next segment has not been padded yet at this point, so its
            # start is still the unpadded one.
            if (i + 1 < len(self.segments)
                    and segment[1] > self.segments[i + 1][0]):
                # Padding takes the segment end beyond the start of the next segment.
                # Reduce padding.
                self.stats.padding_duration -= (
                        segment[1] - self.segments[i + 1][0])
                segment[1] = self.segments[i + 1][0]
        self.stats.final_duration += self.stats.padding_duration

    def merge_consecutive_segments(self, max_dur):
        """Merge consecutive segments (happens after padding), provided that
        the merged segment is no longer than 'max_dur'.

        Only segments whose boundaries touch exactly are merged. Does nothing
        when 'max_dur' is not positive or there are no segments."""
        if max_dur <= 0 or not self.segments:
            return

        merged_segments = [self.segments[0]]
        for segment in self.segments[1:]:
            assert segment[2] == self.label, segment
            if segment[0] == merged_segments[-1][1] and \
                    segment[1] - merged_segments[-1][0] <= max_dur:
                # The segment starts at the same time the last segment ends,
                # and the merged segment is shorter than 'max_dur'.
                # Extend the previous segment.
                merged_segments[-1][1] = segment[1]
                self.stats.num_merges += 1
            else:
                merged_segments.append(segment)

        self.segments = merged_segments
        self.stats.num_segments_final = len(self.segments)

    def write(self, key, file_handle):
        """Write segments to RTTM file.

        One SPEAKER line is printed per segment with 'key' as the file id,
        the segment start and duration, and the region type in the field
        following the two <NA> placeholders. The stats are logged first when
        the global verbosity is at least 2."""
        if global_verbose >= 2:
            logger.info("For key {key}, got stats {stats}".format(
                key=key, stats=self.stats))
        rttm_str = "SPEAKER {0} 1 {1:7.3f} {2:7.3f} <NA> <NA> {3} <NA> <NA>"
        for segment in self.segments:
            print(rttm_str.format(key, segment[0], segment[1] - segment[0], self.region_type),
                file=file_handle)


def run(args):
    """The main function that does everything.

    Optionally loads utterance durations from `args.utt2dur`, then reads the
    alignments in `args.in_ovl` line by line (utterance id followed by the
    frame labels), post-processes each utterance (filter short segments, pad,
    merge) and writes the resulting segments to `args.out_rttm`. The stats
    accumulated over all utterances are logged at the end.

    Raises:
        RuntimeError: if a line of either input file cannot be parsed.
        KeyError: if --utt2dur is given but lacks an utterance of the input.
    """
    utt2dur = {}
    if args.utt2dur is not None:
        with common_lib.smart_open(args.utt2dur) as utt2dur_fh:
            for line in utt2dur_fh:
                parts = line.strip().split()
                if len(parts) != 2:
                    raise RuntimeError("Unable to parse line '{0}' in {1}"
                                       "".format(line.strip(), args.utt2dur))
                utt2dur[parts[0]] = float(parts[1])

    global_stats = SegmenterStats()
    with common_lib.smart_open(args.in_ovl) as in_ovl_fh, \
            common_lib.smart_open(args.out_rttm, 'w') as out_rttm_fh:
        for line in in_ovl_fh:
            parts = line.strip().split()

            # Validate before indexing, so that an empty line is reported as
            # a parse error (naming the input path) and not an IndexError.
            if len(parts) < 2:
                raise RuntimeError("Unable to parse line '{0}' in {1}"
                                   "".format(line.strip(), args.in_ovl))
            utt_id = parts[0]

            segmentation = Segmentation(args.region_type)
            segmentation.initialize_segments(
                parts[1:], args.frame_shift)
            segmentation.filter_short_segments(args.min_segment_dur)
            # Without --utt2dur the padding is not limited by the utterance
            # duration.
            segmentation.pad_segments(args.segment_padding,
                                      None if args.utt2dur is None
                                      else utt2dur[utt_id])
            segmentation.merge_consecutive_segments(args.merge_consecutive_max_dur)
            segmentation.write(utt_id, out_rttm_fh)
            global_stats.add(segmentation.stats)
    logger.info(global_stats)


def main():
    """Entry point: read the command-line options and hand them to run().

    Exceptions raised while running are propagated unchanged.
    """
    run(get_args())


if __name__ == '__main__':
    main()


================================================
FILE: egs/steps/overlap/post_process_output.sh
================================================
#!/usr/bin/env bash

# Copyright 2015-17  Vimal Manohar
#           2020     Desh Raj
# Apache 2.0.

# This script post-processes the output of the overlap neural network,
# which is in the form of frame-level alignments, into an RTTM file.
# The alignments must be 1/2/3 denoting silence/single/overlap (the labels
# accepted by steps/overlap/output_to_rttm.py). Based on this, this script
# can also be used to get single speaker regions.

set -e -o pipefail -u
. ./path.sh

cmd=run.pl
stage=-10
nj=18   # overridden below by the value stored in <output-dir>/num_jobs

region_type=overlap # change this to "single" to get only single-speaker regions

# The values below are in seconds
frame_shift=0.01
segment_padding=0.2
min_segment_dur=0
merge_consecutive_max_dur=inf

. utils/parse_options.sh

if [ $# -ne 3 ]; then
  echo "This script post-processes the output of the overlap detector, "
  echo "which is in the form of frame-level alignments, into an RTTM file. "
  echo "The alignments must be overlap detection marks i.e. 1 for silence, "
  echo "2 for single-speaker speech and 3 for overlapped speech."
  echo "Usage: $0 <data-dir> <output-dir> <rttm-dir>"
  echo " e.g.: $0 data/dev exp/overlap/dev exp/overlap/dev/rttm"
  exit 1
fi

data_dir=$1
output_dir=$2    # Alignment directory containing frame-level overlap labels
dir=$3           # Directory where the RTTM files are written

mkdir -p $dir

for f in $output_dir/ali.1.gz $output_dir/num_jobs; do
  if [ ! -f $f ]; then
    echo "$0: Could not find file $f" && exit 1
  fi
done

# Use as many jobs as there are alignment archives.
nj=`cat $output_dir/num_jobs` || exit 1
utils/split_data.sh $data_dir $nj

utils/data/get_utt2dur.sh $data_dir

if [ $stage -le 0 ]; then
  # Convert each alignment archive to text and turn it into a per-job RTTM.
  $cmd JOB=1:$nj $dir/log/segmentation.JOB.log \
    copy-int-vector "ark:gunzip -c $output_dir/ali.JOB.gz |" ark,t:- \| \
    steps/overlap/output_to_rttm.py \
      --region-type=$region_type \
      --frame-shift=$frame_shift --segment-padding=$segment_padding \
      --min-segment-dur=$min_segment_dur --merge-consecutive-max-dur=$merge_consecutive_max_dur \
      --utt2dur=$data_dir/utt2dur - $dir/rttm_${region_type}.JOB
fi

echo $nj > $dir/num_jobs

# Concatenate the per-job RTTM files into a single one.
for n in $(seq $nj); do
  cat $dir/rttm_${region_type}.$n
done > $dir/rttm_${region_type}


================================================
FILE: egs/steps/overlap/prepare_overlap_graph.py
================================================
#!/usr/bin/env python

# Copyright 2016  Vimal Manohar
# Apache 2.0

"""Prepares a graph with a simple HMM topology for segmentation
with minimum and maximum speech duration constraints and minimum silence
duration constraint. The graph is written to the 'output_graph', which
can be file or "-" for stdout.
"""

from __future__ import print_function
import argparse
import logging
import math
import os
import sys
import traceback

sys.path.insert(0, 'steps')
import libs.common as common_lib


def get_args():
    """Parse the command line and derive the per-class state counts.

    Besides the parsed options, the returned namespace carries
    min_states_silence, min_states_speech, max_states_speech,
    min_states_overlap and max_states_overlap: each duration option divided
    by --frame-shift and rounded to the nearest integer.
    """
    arg_parser = argparse.ArgumentParser(
        description="""This script prepares a graph with a simple HMM topology
        for overlap detection with minimum and maximum speech duration constraints
        and minimum silence duration constraint. Additionally, we enforce the 
        constraint that there cannot be a direct transition between silence and
        overlap states. The graph is written to the 'output_graph', which can be 
        file or "-" for stdout.""",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Scales.
    arg_parser.add_argument("--transition-scale", type=float, default=1.0,
                            help="""Scale on transition probabilities relative to
                        LM weights""")
    arg_parser.add_argument("--loopscale", type=float, default=0.1,
                            help="""Scale on self-loop log-probabilities relative
                        to LM weights""")

    # Duration constraints and the frame shift used to convert them.
    arg_parser.add_argument("--min-silence-duration", type=float, default=0.01,
                            help="""Minimum duration for silence""")
    arg_parser.add_argument("--min-speech-duration", type=float, default=0.3,
                            help="""Minimum duration for speech""")
    arg_parser.add_argument("--max-speech-duration", type=float, default=10.0,
                            help="""Maximum duration for speech""")
    arg_parser.add_argument("--min-overlap-duration", type=float, default=0.1,
                            help="""Minimum duration for overlap""")
    arg_parser.add_argument("--max-overlap-duration", type=float, default=5.0,
                            help="""Maximum duration for overlap""")
    arg_parser.add_argument("--frame-shift", type=float, default=0.03,
                            help="""Frame shift in seconds""")

    # Probabilities.
    arg_parser.add_argument("--edge-silence-probability", type=float,
                            default=0.5,
                            help="Probability of silence at the edges.")
    arg_parser.add_argument("--transition-probability", type=float, default=0.1,
                            help="Transition probability for silence to speech "
                            "or vice-versa")

    arg_parser.add_argument("output_graph", type=str,
                            help="Output graph")
    args = arg_parser.parse_args()

    def to_num_states(duration):
        # Adding 0.5 before truncation rounds to the nearest frame count.
        return int(duration / args.frame_shift + 0.5)

    args.min_states_silence = to_num_states(args.min_silence_duration)
    args.min_states_speech = to_num_states(args.min_speech_duration)
    args.max_states_speech = to_num_states(args.max_speech_duration)
    args.min_states_overlap = to_num_states(args.min_overlap_duration)
    args.max_states_overlap = to_num_states(args.max_overlap_duration)

    return args


def print_states(args, file_handle):
    """Write the overlap-detection graph to `file_handle` in text form.

    Arcs are printed as "<src> <dst> <label> <label> <cost>" and final states
    as "<state> <cost>". State 0 is the start state; it is followed by the
    silence states, then the single-speaker states, then the overlap states.
    The number of states per class comes from the min_states_* / max_states_*
    attributes of `args`. Silence and overlap are never connected directly.
    """
    def arc(src, dst, label, cost):
        print("{0} {1} {2} {2} {3}".format(src, dst, label, cost),
              file=file_handle)

    def final(state, cost):
        print("{0} {1}".format(state, cost), file=file_handle)

    # ---- silence ----
    sil_first = 1
    sil_edge_cost = -math.log(args.edge_silence_probability)
    # Start state -> first silence state.
    arc(0, sil_first, "silence", sil_edge_cost)

    # Chain enforcing the minimum silence duration, then a self-loop on the
    # last silence state.
    sil_last = sil_first + args.min_states_silence - 1
    for src in range(sil_first, sil_last):
        arc(src, src + 1, "silence", 0.0)
    arc(sil_last, sil_last, "silence", 0.0)

    # ---- single speaker ----
    sp_first = sil_last + 1
    sp_edge_cost = -math.log(1.0 - args.edge_silence_probability)
    # Start state -> first speech state.
    arc(0, sp_first, "single", sp_edge_cost)

    # Cost of every optional change of class.
    switch_cost = -math.log(args.transition_probability)
    arc(sil_last, sp_first, "single", switch_cost)

    # Chain enforcing the minimum speech duration: no way out yet.
    sp_min_end = sp_first + args.min_states_speech - 1
    for src in range(sp_first, sp_min_end):
        arc(src, src + 1, "single", 0.0)

    sp_last = sp_first + args.max_states_speech - 1
    ov_first = sp_last + 1
    # Between the minimum and the maximum duration, speech may continue or
    # switch to silence or overlap.
    for src in range(sp_min_end, sp_last):
        arc(src, src + 1, "single", 0.0)
        arc(src, sil_first, "silence", switch_cost)
        arc(src, ov_first, "overlap", switch_cost)

    # At the maximum speech duration leaving is free.
    arc(sp_last, sil_first, "silence", 0.0)
    arc(sp_last, ov_first, "overlap", 0)

    # ---- overlap ----
    ov_min_end = ov_first + args.min_states_overlap - 1
    for src in range(ov_first, ov_min_end):
        arc(src, src + 1, "overlap", 0.0)

    ov_last = ov_first + args.max_states_overlap - 1
    # Between the minimum and the maximum duration, overlap may continue or
    # go back to single-speaker speech.
    for src in range(ov_min_end, ov_last):
        arc(src, src + 1, "overlap", 0.0)
        arc(src, sp_first, "single", switch_cost)

    # At the maximum overlap duration going back to speech is free.
    arc(ov_last, sp_first, "single", 0.0)

    # ---- final costs ----
    for state in range(1, sp_first):
        final(state, sil_edge_cost)
    for state in range(sp_first, sp_last + 1):
        final(state, sp_edge_cost)
    for state in range(ov_first, ov_last + 1):
        final(state, 0)


def main():
    """Entry point: parse the options and write the graph to the requested
    output. Exceptions propagate unchanged."""
    args = get_args()
    with common_lib.smart_open(args.output_graph, 'w') as graph_fh:
        print_states(args, graph_fh)


if __name__ == '__main__':
    main()


================================================
FILE: egs/steps/paste_feats.sh
================================================
#!/usr/bin/env bash

# Copyright 2014  Brno University of Technology (Author: Karel Vesely)
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# This script appends the features in two or more data directories.
# The pasted features are written as archives into the storage directory and
# a feats.scp pointing to them is created in the destination data directory.

# To be run from .. (one directory up from here)
# see ../run.sh for example

# Begin configuration section.
cmd=run.pl
nj=4
length_tolerance=10 # length tolerance in frames (trim to shortest)
compress=true
# End configuration section.

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# At least two source directories plus the three trailing arguments are needed.
if [ $# -lt 5 ]; then
   echo "usage: $0 [options] <src-data-dir1> <src-data-dir2> [<src-data-dirN>] <dest-data-dir> <log-dir> <path-to-storage-dir>";
   echo "e.g.: $0 data/train_mfcc data/train_bottleneck data/train_combined exp/append_mfcc_plp mfcc"
   echo "options: "
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   exit 1;
fi

# Everything except the last three arguments is a source data directory.
data_src_arr=(${@:1:$(($#-3))}) #array of source data-dirs
data=${@: -3: 1}
logdir=${@: -2: 1}
ark_dir=${@: -1: 1} #last arg.

data_src_first=${data_src_arr[0]} # get 1st src dir

# make $ark_dir an absolute pathname.
ark_dir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $ark_dir ${PWD}`

# Every source directory is split into the same number of jobs.
for data_src in ${data_src_arr[@]}; do
  utils/split_data.sh $data_src $nj || exit 1;
done

mkdir -p $ark_dir $logdir

mkdir -p $data
cp $data_src_first/* $data/ 2>/dev/null # so we get the other files, such as utt2spk.
# The copied cmvn.scp and feats.scp describe the first source only, so drop them.
rm $data/cmvn.scp 2>/dev/null
rm $data/feats.scp 2>/dev/null

# use "name" as part of name of the archive.
name=`basename $data`

# get list of source scp's for pasting
# (JOB is left as a literal placeholder here; it is used with "$cmd JOB=1:$nj" below.)
data_src_args=
for data_src in ${data_src_arr[@]}; do
  data_src_args="$data_src_args scp:$data_src/split$nj/JOB/feats.scp"
done

for n in $(seq $nj); do
  # the next command does nothing unless $ark_dir/storage/ exists, see
  # utils/create_data_link.pl for more info.
  utils/create_data_link.pl $ark_dir/pasted_$name.$n.ark
done

# Paste the features of all sources per job and store them as ark + scp.
$cmd JOB=1:$nj $logdir/append.JOB.log \
   paste-feats --length-tolerance=$length_tolerance $data_src_args ark:- \| \
   copy-feats --compress=$compress ark:- \
    ark,scp:$ark_dir/pasted_$name.JOB.ark,$ark_dir/pasted_$name.JOB.scp || exit 1;

# concatenate the .scp files together.
# NOTE: the redirection on "done" truncates feats.scp when the loop starts;
# the content itself is added by the ">>" append of each cat.
for ((n=1; n<=nj; n++)); do
  cat $ark_dir/pasted_$name.$n.scp >> $data/feats.scp || exit 1;
done > $data/feats.scp || exit 1;


# Sanity check: warn (without failing) when the number of feature entries
# differs from the number of utterances.
nf=`cat $data/feats.scp | wc -l`
nu=`cat $data/utt2spk | wc -l`
if [ $nf -ne $nu ]; then
  echo "It seems not all of the feature files were successfully processed ($nf != $nu);"
  echo "consider using utils/fix_data_dir.sh $data"
fi

echo "Succeeded pasting features for $name into $data"


================================================
FILE: egs/steps/pytorchnn/check_py.py
================================================
import numpy as np
import torch


================================================
FILE: egs/steps/pytorchnn/compute_sentence_scores.py
================================================
# Copyright 2020    Ke Li

""" This script computes sentence scores with a PyTorch trained neural LM.
    It is called by steps/pytorchnn/lmrescore_nbest_pytorchnn.sh
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import argparse
from collections import defaultdict

import torch
import torch.nn as nn


def load_nbest(path):
    r"""Load n-best hypotheses from a text file and group them by utterance.

    Every line is expected to hold a hypothesis key followed by the
    hypothesis text, where the key is the utterance id with a trailing
    ``-<n>`` rank suffix, e.g.:
        en_4156-A_030185-030248-1 oh yeah
        en_4156-A_030470-030672-1 well i'm going to have mine and two more classes
        en_4156-A_030470-030672-2 well i'm gonna have mine and two more classes
        ...

    Args:
        path (str): Path of the n-best list file in the format above.

    Returns:
        A dictionary mapping each utterance id (the key with its last
        ``-<n>`` component removed) to the list of its hypothesis strings, in
        file order. A line that carries only a key contributes a hypothesis
        consisting of a single space.
    """

    nbest = defaultdict()
    with open(path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            fields = raw_line.strip().split(' ', 1)
            if len(fields) == 2:
                hyp_key, hyp = fields
            else:
                # No text after the key: keep a placeholder hypothesis so the
                # entry still receives a score.
                hyp_key, hyp = fields[0], ' '
            utt_id = hyp_key.rsplit('-', 1)[0]
            nbest.setdefault(utt_id, []).append(hyp)
    return nbest


def read_vocab(path):
    r"""Build a word-to-index mapping from a vocabulary file.

    Args:
        path (str): A file with exactly two whitespace-separated fields per
                    line, the first of which is the word.

    Returns:
        A dictionary from word to int. Indices are assigned in order of first
        appearance in the file, starting from 0; the second field of each
        line is only checked for presence, its value is not used.
    """

    word2idx = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            assert len(fields) == 2
            # The next free index equals the number of distinct words so far;
            # repeated words keep the index they were first given.
            word2idx.setdefault(fields[0], len(word2idx))
    return word2idx


def get_input_and_target(hyp, vocab):
    r"""Integerize a sentence into a language-model input and target sequence.

    Args:
        hyp (str):  Sentence, with words separated by spaces, e.g. 'hello there'
        vocab:      A dictionary from string to int, e.g. {'<s>':0, 'hello':1,
                    'there':2, 'apple':3, ...}

    Returns:
        A pair of lists of ints. The input sequence is the sentence with
        '<s>' prepended and the target sequence is the sentence with '<s>'
        appended, so for the example above the result is
        ([0, 1, 2], [1, 2, 0]).
        Words are lower-cased before lookup. A word that is missing from the
        vocabulary is mapped to the index of '<unk>', which therefore has to
        be present whenever out-of-vocabulary words occur.
    """

    def to_id(token):
        token = token.lower()
        return vocab[token] if token in vocab else vocab['<unk>']

    tokens = hyp.split()
    input_ids = [to_id(tok) for tok in ['<s>'] + tokens]
    output_ids = [to_id(tok) for tok in tokens + ['<s>']]
    return input_ids, output_ids


def compute_sentence_score(model, criterion, ntokens, data, target,
                           model_type='LSTM', hidden=None):
    r"""Score one sentence with a neural language model.

    Args:
        model:      A neural language model. It is called as ``model(data)``
                    when ``model_type`` is 'Transformer' and as
                    ``model(data, hidden)`` otherwise.
        criterion:  Loss applied to the flattened model output and the
                    target, e.g. cross entropy.
        ntokens:    Vocabulary size (last dimension of the model output).
        data:       Integerized input sentence (list of ints).
        target:     Integerized target sentence (list of ints).
        model_type: Model type, e.g. LSTM or Transformer or others.
        hidden:     Initial hidden state for a recurrent-typed model
                    (optional, unused for 'Transformer').

    Returns:
        The sentence score, computed as the criterion value multiplied by the
        number of input tokens (a negative log-likelihood when the criterion
        is a mean cross entropy). For non-Transformer models a pair
        ``(score, hidden)`` is returned, where ``hidden`` is the state the
        model returned after consuming the sentence.
    """

    is_transformer = model_type == 'Transformer'
    num_tokens = len(data)
    # Shape the input as a column (sequence length x batch of one).
    inputs = torch.LongTensor(data).view(-1, 1).contiguous()
    labels = torch.LongTensor(target).view(-1).contiguous()
    with torch.no_grad():
        if is_transformer:
            logits = model(inputs)
        else:
            logits, hidden = model(inputs, hidden)
        mean_loss = criterion(logits.view(-1, ntokens), labels)
    sent_score = num_tokens * mean_loss.item()
    if is_transformer:
        return sent_score
    return sent_score, hidden


def compute_scores(nbest, model, criterion, ntokens, vocab, model_type='LSTM'):
    r"""Compute sentence scores of nbest lists from a neural language model.

    Args:
        nbest:      The nbest lists represented by a dictionary from string
                    to a list of strings.
        model:      A neural language model.
        criterion:  Training criterion of a neural language model, e.g.
                    cross entropy.
        ntokens:    Vocabulary size.
        vocab:      A dictionary from word to integer index, used to
                    integerize each hypothesis.
        model_type: Model type, e.g. LSTM or Transformer or others.

    Returns:
        The nbest lists and their scores represented by a dictionary from
        utterance id to a list of (hypothesis, score) pairs, in the same
        order as the hypotheses in ``nbest``.
    """

    # Turn on evaluation mode which disables dropout.
    model.eval()
    is_recurrent = model_type != 'Transformer'
    nbest_and_scores = defaultdict(list)
    hidden = model.init_hidden(1) if is_recurrent else None
    for key, hyps in nbest.items():
        # Hidden state produced by the first hypothesis of this utterance.
        first_hidden = None
        for hyp in hyps:
            x, target = get_input_and_target(hyp, vocab)
            if is_recurrent:
                # All hypotheses of one utterance start from the same state.
                score, new_hidden = compute_sentence_score(model, criterion,
                                                           ntokens, x, target,
                                                           model_type, hidden)
                if first_hidden is None:
                    first_hidden = new_hidden
            else:
                score = compute_sentence_score(model, criterion, ntokens, x,
                                               target, model_type)
            nbest_and_scores[key].append((hyp, score))
        # For RNN based LMs, initialize the current initial hidden states with
        # those from hypotheses of the preceding utterance.
        # This achieves modest WER reductions compared with zero initialization
        # as it provides context from previous utterances. We observe that
        # which hypothesis of the previous utterance supplies the hidden
        # states almost doesn't make a difference. So to make the code
        # more general, the hidden states from the first hypothesis of the
        # previous utterance are used for initialization (only that state is
        # kept; the others are discarded). You can also use those from the
        # one best hypothesis or just average hidden states from all
        # hypotheses of the previous utterance.
        # An utterance without hypotheses leaves the carried state untouched.
        if first_hidden is not None:
            hidden = first_hidden
    return nbest_and_scores


def write_scores(nbest_and_scores, path):
    r"""Write sentence scores of nbest lists, one hypothesis per line.

    Each line holds the hypothesis key (utterance id, a dash and the 1-based
    rank of the hypothesis within its utterance) followed by the score
    printed with four decimals, e.g.:
        en_4156-A_030185-030248-1 7.9867
        en_4156-A_030470-030672-1 46.5938
        en_4156-A_030470-030672-2 46.9522
        ...

    Args:
        nbest_and_scores: The nbest lists and their scores represented by a
                          dictionary from utterance id to a list of
                          (hypothesis, score) pairs.
        path (str):       Output file for the scores in the above format.
    """

    with open(path, 'w', encoding='utf-8') as f:
        for key, scored_hyps in nbest_and_scores.items():
            rank = 0
            for _, score in scored_hyps:
                rank += 1
                f.write('%s-%d %.4f\n' % (key, rank, score))
    print("Write to %s" % path)


def main():
    """Parse the command line, load the neural LM and score an n-best list.

    The vocabulary, the model weights and the n-best list are read from the
    paths given on the command line; the resulting per-hypothesis scores are
    written to ``--outfile``.
    """
    parser = argparse.ArgumentParser(description="Compute sentence scores of "
                                     "nbest lists with a PyTorch trained "
                                     "neural language model.")
    parser.add_argument('--nbest-list', type=str, required=True,
                        help="N-best hypotheses for rescoring")
    parser.add_argument('--outfile', type=str, required=True,
                        help="Output file with language model scores associated "
                        "with each hypothesis")
    parser.add_argument('--vocabulary', type=str, required=True,
                        help="Vocabulary used for training")
    parser.add_argument('--model-path', type=str, required=True,
                        help="Path to a pretrained neural model.")
    parser.add_argument('--model', type=str, default='LSTM',
                        help='Network type. can be RNN, LSTM or Transformer.')
    parser.add_argument('--emsize', type=int, default=200,
                        help='size of word embeddings')
    parser.add_argument('--nhid', type=int, default=200,
                        help='number of hidden units per layer')
    parser.add_argument('--nlayers', type=int, default=2,
                        help='number of layers')
    parser.add_argument('--nhead', type=int, default=2,
                        help='the number of heads in the encoder/decoder of the '
                        'transformer model')
    args = parser.parse_args()

    # All three input files have to exist before any work is started.
    required_inputs = (
        (args.nbest_list, "Nbest list path does not exists."),
        (args.vocabulary, "Vocabulary path does not exists."),
        (args.model_path, "Model path does not exists."),
    )
    for input_path, message in required_inputs:
        assert os.path.exists(input_path), message

    print("Load vocabulary")
    vocab = read_vocab(args.vocabulary)
    ntokens = len(vocab)

    print("Load model and criterion")
    # Imported here rather than at file level; the module is expected to be
    # importable as `model` (the rescoring script puts its directory on
    # PYTHONPATH).
    import model as model_defs
    if args.model == 'Transformer':
        net = model_defs.TransformerModel(ntokens, args.emsize, args.nhead,
                                          args.nhid, args.nlayers,
                                          activation="gelu", tie_weights=True)
    else:
        net = model_defs.RNNModel(args.model, ntokens, args.emsize, args.nhid,
                                  args.nlayers, tie_weights=True)
    with open(args.model_path, 'rb') as f:
        # The map_location callback keeps every tensor on the CPU.
        state = torch.load(f, map_location=lambda storage, loc: storage)
        net.load_state_dict(state)
    if args.model in ['RNN_TANH', 'RNN_RELU', 'LSTM', 'GRU']:
        net.rnn.flatten_parameters()
    criterion = nn.CrossEntropyLoss()

    print("Load nbest list")
    nbest = load_nbest(args.nbest_list)
    print("Compute sentence scores with a ", args.model, " model")
    nbest_and_scores = compute_scores(nbest, net, criterion, ntokens, vocab,
                                      model_type=args.model)
    print("Write sentence scores out")
    write_scores(nbest_and_scores, args.outfile)


if __name__ == '__main__':
    main()


================================================
FILE: egs/steps/pytorchnn/data.py
================================================
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import torch


class Dictionary(object):
    """Word/index lookup tables populated from a vocabulary file."""

    def __init__(self):
        # word -> index
        self.word2idx = {}
        # index -> word (position in the list is the index)
        self.idx2word = []

    def read_vocab(self, path):
        """Add the words listed in ``path`` to the dictionary.

        Every line must hold exactly two whitespace-separated fields; the
        first one is the word. New words receive the next free index in
        order of appearance, words already known are left unchanged.
        """
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                fields = line.split()
                assert (len(fields) == 2)
                token = fields[0]
                if token in self.word2idx:
                    continue
                self.word2idx[token] = len(self.idx2word)
                self.idx2word.append(token)

    def __len__(self):
        """Return the number of distinct words."""
        return len(self.idx2word)


class Corpus(object):
    """Vocabulary plus integerized train/valid/test text of one data directory.

    The directory ``path`` is expected to contain ``words.txt`` (vocabulary),
    ``train.txt``, ``valid.txt`` and ``test.txt``.
    """

    def __init__(self, path):
        self.dictionary = Dictionary()
        self.dictionary.read_vocab(os.path.join(path, 'words.txt'))
        self.train = self.tokenize(os.path.join(path, 'train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
        self.test = self.tokenize(os.path.join(path, 'test.txt'))

    def tokenize(self, path):
        """Convert a text file into one flat int64 tensor of word indices.

        '<s>' is appended to every line as a sentence boundary, and words
        missing from the dictionary are replaced by the index of '<unk>'.
        """
        assert os.path.exists(path)
        word2idx = self.dictionary.word2idx
        with open(path, 'r', encoding='utf-8') as f:
            per_line_ids = []
            for line in f:
                ids = [word2idx[word] if word in word2idx
                       else word2idx['<unk>']
                       for word in line.split() + ['<s>']]
                per_line_ids.append(torch.tensor(ids).type(torch.int64))
            # Concatenate all sentences into a single token stream.
            flat = torch.cat(per_line_ids)

        return flat


================================================
FILE: egs/steps/pytorchnn/lmrescore_nbest_pytorchnn.sh
================================================
#!/usr/bin/env bash

# This script is very similar to rnnlm/lmrescore_nbest.sh, and it performs N-best
# LM rescoring with Pytorch trained neural LMs.

# Begin configuration section.
# Every variable below can be overridden on the command line through
# utils/parse_options.sh (e.g. --model-type Transformer).
# Number of hypotheses kept per utterance.
N=10
model_type=LSTM # LSTM, GRU or Transformer
# The next four values are passed to compute_sentence_scores.py as
# --emsize, --nhid, --nlayers and --nhead.
embedding_dim=650
hidden_dim=650
nlayers=2
nhead=6
# Inverse acoustic weight; the N-best lists are generated with
# acoustic scale 1/inv_acwt.
inv_acwt=10
cmd=run.pl
use_phi=false  # This is kind of an obscure option.  If true, we'll remove the old
  # LM weights (times 1-RNN_scale) using a phi (failure) matcher, which is
  # appropriate if the old LM weights were added in this way, e.g. by
  # lmrescore.sh.  Otherwise we'll use normal composition, which is appropriate
  # if the lattices came directly from decoding.  This won't actually make much
  # difference (if any) to WER, it's more so we know we are doing the right thing.
test=false # Activate a testing option.
stage=1 # Stage of this script, for partial reruns.
# If true, local/score.sh is not run at the end.
skip_scoring=false
# If true, alignments are kept in the N-best lists; if false they are
# stripped with lattice-rmali.
keep_ali=true
# End configuration section.

echo "$0 $*"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh
. utils/parse_options.sh

# Exactly seven positional arguments are required; otherwise print the usage
# message and stop.
if [ $# != 7 ]; then
   echo "Do language model rescoring of lattices (partially remove old LM, add new LM)"
   echo "This version applies a neural LM and mixes it with the n-gram LM scores"
   echo "previously in the lattices, controlled by the first parameter (nn-weight)"
   echo ""
   echo "Usage: $0 [options] <nn-weight> <old-lang-dir> <nn-model> <vocab> <data-dir> <input-decode-dir> <output-decode-dir>"
   echo "Main options:"
   echo "  --inv-acwt <inv-acwt>          # default 10.  e.g. --inv-acwt 17.  Equivalent to LM scale to use."
   echo "                                 # for N-best list generation... note, we'll score at different acwt's"
   echo "  --cmd <run.pl|queue.pl [opts]> # how to run jobs."
   echo "  --use-phi (true|false)         # Should be set to true if the source lattices were created"
   echo "                                 # by lmrescore.sh, false if they came from decoding."
   echo "  --N <N>                        # Value of N in N-best rescoring (default: 10)"
   exit 1;
fi

# Positional arguments.
nnweight=$1 # weight of a neural network LM
oldlang=$2
nn_model=$3
vocabulary=$4
data=$5
indir=$6
dir=$7

# Acoustic scale used when extracting the N-best lists.
acwt=$(perl -e "print (1.0/$inv_acwt);")

# Figures out if the old LM is G.fst or G.carpa
# (G.carpa takes precedence when both exist).
oldlm=$oldlang/G.fst
if [ -f $oldlang/G.carpa ]; then
  oldlm=$oldlang/G.carpa
elif [ ! -f $oldlm ]; then
  echo "$0: expecting either $oldlang/G.fst or $oldlang/G.carpa to exist" &&\
    exit 1;
fi

# The model, the vocabulary and at least the first input lattice must be
# regular files.
for f in $nn_model $vocabulary $indir/lat.1.gz; do
  [ ! -f $f ] && echo "$0: expected file $f to exist." && exit 1;
done

# Reuse the job count of the input decoding directory.
nj=$(cat $indir/num_jobs) || exit 1;
mkdir -p $dir;
cp $indir/num_jobs $dir/num_jobs

# Prefix of the per-job archive directories ($adir.1, $adir.2, ...).
adir=$dir/archives

# Integer id of the '#0' symbol in words.txt, used as the phi label when
# --use-phi is true.
phi=$(grep -w '#0' $oldlang/words.txt | awk '{print $2}')

rm $dir/.error 2>/dev/null
mkdir -p $dir/log

# First convert lattice to N-best.  Be careful because this
# will be quite sensitive to the acoustic scale; this should be close
# to the one we'll finally get the best WERs with.
# Note: the lattice-rmali part here is just because we don't
# need the alignments for what we're doing.
# (lattice-rmali is only applied when keep_ali is false; with keep_ali=true
# the N-best lists are written as produced by lattice-to-nbest.)
if [ $stage -le 1 ]; then
  echo "$0: converting lattices to N-best lists."
  if $keep_ali; then
    $cmd JOB=1:$nj $dir/log/lat2nbest.JOB.log \
      lattice-to-nbest --acoustic-scale=$acwt --n=$N \
      "ark:gunzip -c $indir/lat.JOB.gz|" \
      "ark:|gzip -c >$dir/nbest1.JOB.gz" || exit 1;
  else
    $cmd JOB=1:$nj $dir/log/lat2nbest.JOB.log \
      lattice-to-nbest --acoustic-scale=$acwt --n=$N \
      "ark:gunzip -c $indir/lat.JOB.gz|" ark:- \|  \
      lattice-rmali ark:- "ark:|gzip -c >$dir/nbest1.JOB.gz" || exit 1;
  fi
fi

# next remove part of the old LM probs.
# Three variants, all reading nbest1.JOB.gz and writing nbest2.JOB.gz:
#   G.fst   + use_phi=true  : phi-matcher composition with the old LM
#   G.fst   + use_phi=false : ordinary composition followed by lattice-1best
#   G.carpa                 : lattice-lmrescore-const-arpa with LM scale -1
if [ "$oldlm" == "$oldlang/G.fst" ]; then
  if $use_phi; then
    if [ $stage -le 2 ]; then
      echo "$0: removing old LM scores."
      # Use the phi-matcher style of composition.. this is appropriate
      # if the old LM scores were added e.g. by lmrescore.sh, using
      # phi-matcher composition.
      $cmd JOB=1:$nj $dir/log/remove_old.JOB.log \
        lattice-scale --acoustic-scale=-1 --lm-scale=-1 "ark:gunzip -c $dir/nbest1.JOB.gz|" ark:- \| \
        lattice-compose --phi-label=$phi ark:- $oldlm ark:- \| \
        lattice-scale --acoustic-scale=-1 --lm-scale=-1 ark:- "ark:|gzip -c >$dir/nbest2.JOB.gz" \
        || exit 1;
    fi
  else
    if [ $stage -le 2 ]; then
      echo "$0: removing old LM scores."
      # this approach chooses the best path through the old LM FST, while
      # subtracting the old scores.  If the lattices came straight from decoding,
      # this is what we want.  Note here: each FST in "nbest1.JOB.gz" is a linear FST,
      # it has no alternatives (the N-best format works by having multiple keys
      # for each utterance).  When we do "lattice-1best" we are selecting the best
      # path through the LM, there are no alternatives to consider within the
      # original lattice.
      $cmd JOB=1:$nj $dir/log/remove_old.JOB.log \
        lattice-scale --acoustic-scale=-1 --lm-scale=-1 "ark:gunzip -c $dir/nbest1.JOB.gz|" ark:- \| \
        lattice-compose ark:- "fstproject --project_output=true $oldlm |" ark:- \| \
        lattice-1best ark:- ark:- \| \
        lattice-scale --acoustic-scale=-1 --lm-scale=-1 ark:- "ark:|gzip -c >$dir/nbest2.JOB.gz" \
        || exit 1;
    fi
  fi
else
  if [ $stage -le 2 ]; then
    echo "$0: removing old LM scores."
    $cmd JOB=1:$nj $dir/log/remove_old.JOB.log \
      lattice-lmrescore-const-arpa --lm-scale=-1.0 \
      "ark:gunzip -c $dir/nbest1.JOB.gz|" $oldlm \
      "ark:|gzip -c >$dir/nbest2.JOB.gz"  || exit 1;
  fi
fi

if [ $stage -le 3 ]; then
# Decompose the n-best lists into 4 archives.
# (ali, words, lmwt.nolm and acwt, all in text form under $adir.JOB/)
  echo "$0: creating separate-archive form of N-best lists."
  $cmd JOB=1:$nj $dir/log/make_new_archives.JOB.log \
    mkdir -p $adir.JOB '&&' \
    nbest-to-linear "ark:gunzip -c $dir/nbest2.JOB.gz|" \
    "ark,t:$adir.JOB/ali" "ark,t:$adir.JOB/words" \
    "ark,t:$adir.JOB/lmwt.nolm" "ark,t:$adir.JOB/acwt" || exit 1;
fi

if [ $stage -le 4 ]; then
  echo "$0: doing the same with old LM scores."
# Create an archive with the LM scores before we
# removed the LM probs (will help us do interpolation).
# Only the LM-weight output is kept (lmwt.withlm); the other three outputs
# go to /dev/null.
$cmd JOB=1:$nj $dir/log/make_old_archives.JOB.log \
  nbest-to-linear "ark:gunzip -c $dir/nbest1.JOB.gz|" "ark:/dev/null" \
  "ark:/dev/null" "ark,t:$adir.JOB/lmwt.withlm" "ark:/dev/null" || exit 1;
fi

if $test; then # This branch is a sanity check that at the acwt where we generated
  # the N-best list, we get the same WER.
  # The script exits right after rebuilding the lattices; the neural LM is
  # never applied in this branch.
  echo "$0 [testing branch]: generating lattices without changing scores."
  $cmd JOB=1:$nj $dir/log/test.JOB.log \
    linear-to-nbest "ark:$adir.JOB/ali" "ark:$adir.JOB/words" "ark:$adir.JOB/lmwt.withlm" \
     "ark:$adir.JOB/acwt" ark:- \| \
    nbest-to-lattice ark:- "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1;
  exit 0;
fi

if [ $stage -le 5 ]; then
  echo "$0: Creating archives with text-form of words, and LM scores without graph scores."
    # Do some small tasks; for these we don't use the queue, it will only slow us down.
  for n in $(seq $nj); do
    # Map word ids back to words (fields 2 onwards; field 1 is the key).
    utils/int2sym.pl -f 2- $oldlang/words.txt < $adir.$n/words > $adir.$n/words_text || exit 1;
    mkdir -p $adir.$n/temp
    # After paste, column 2 is the score without the old LM and column 4 the
    # score with it; their difference is the old LM score alone.
    paste $adir.$n/lmwt.nolm $adir.$n/lmwt.withlm | awk '{print $1, ($4-$2);}' > \
      $adir.$n/lmwt.lmonly || exit 1;
  done
fi

# Score every hypothesis with the neural LM.  The job reads the text-form
# hypotheses ($adir.JOB/words_text) and writes one score per hypothesis to
# $adir.JOB/lmwt.nn.  Abort on failure so that the later stages never run on
# missing or stale lmwt.nn files.
if [ $stage -le 6 ]; then
  echo "$0: invoking steps/pytorchnn/compute_sentence_scores.py which computes sentence scores with a PyTorch trained neural LM."
  $cmd JOB=1:$nj $dir/log/compute_sentence_scores_pytorchnn.JOB.log \
    PYTHONPATH=steps/pytorchnn python steps/pytorchnn/compute_sentence_scores.py \
        --nbest-list $adir.JOB/words_text \
        --outfile $adir.JOB/lmwt.nn \
        --vocabulary $vocabulary \
        --model-path $nn_model \
        --model $model_type \
        --emsize $embedding_dim \
        --nhid $hidden_dim \
        --nlayers $nlayers \
        --nhead $nhead || exit 1;
fi

if [ $stage -le 7 ]; then
  echo "$0: reconstructing total LM+graph scores including interpolation of neural LM and old LM scores."
  for n in $(seq $nj); do
    # Each pasted file contributes a (key, value) pair, so the values sit in
    # columns 2 (graph score without LM), 4 (old LM score) and 6 (neural LM
    # score).  The two LM scores are linearly interpolated with weight
    # nnweight and added back to the graph score.
    paste $adir.$n/lmwt.nolm $adir.$n/lmwt.lmonly $adir.$n/lmwt.nn | awk -v nnweight=$nnweight \
      '{ key=$1; graphscore=$2; lmscore=$4; nnscore=$6;
     score = graphscore+(nnweight*nnscore)+((1-nnweight)*lmscore);
     print $1,score; } ' > $adir.$n/lmwt.interp.$nnweight || exit 1;
  done
fi

if [ $stage -le 8 ]; then
  echo "$0: reconstructing archives back into lattices."
  # Reassemble alignments, words, interpolated LM scores and acoustic scores
  # into N-best entries and merge them into one lattice per utterance.
  $cmd JOB=1:$nj $dir/log/reconstruct_lattice.JOB.log \
    linear-to-nbest "ark:$adir.JOB/ali" "ark:$adir.JOB/words" \
    "ark:$adir.JOB/lmwt.interp.$nnweight" "ark:$adir.JOB/acwt" ark:- \| \
    nbest-to-lattice ark:- "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1;
fi

# Scoring is delegated to the recipe's local/score.sh; a missing or
# non-executable scoring script is treated as an error.
if ! $skip_scoring ; then
  echo "scoring..."
  [ ! -x local/score.sh ] && \
    echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
  local/score.sh --cmd "$cmd" $data $oldlang $dir ||
    { echo "$0: Scoring failed. (ignore by '--skip-scoring true')"; exit 1; }
fi

exit 0;


================================================
FILE: egs/steps/pytorchnn/model.py
================================================
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math
import torch
import torch.nn as nn


class RNNModel(nn.Module):
    """Recurrent language model: embedding, recurrent stack, output projection.

    Args:
        rnn_type:    One of 'LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU'.
        ntoken:      Vocabulary size.
        ninp:        Embedding dimension.
        nhid:        Hidden size of the recurrent layers.
        nlayers:     Number of recurrent layers.
        dropout:     Dropout probability used on the embeddings, between the
                     recurrent layers and on the recurrent output.
        tie_weights: If True, the output projection shares its weight matrix
                     with the embedding (requires ``nhid == ninp``).

    Raises:
        ValueError: for an unknown ``rnn_type`` or when ``tie_weights`` is
                    requested with ``nhid != ninp``.
    """
    def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5,
                 tie_weights=False):
        super(RNNModel, self).__init__()

        self.rnn_type = rnn_type
        self.nhid = nhid
        self.nlayers = nlayers
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        if rnn_type == 'LSTM':
            self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout)
        elif rnn_type == 'GRU':
            self.rnn = nn.GRU(ninp, nhid, nlayers, dropout=dropout)
        else:
            activations = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}
            try:
                nonlinearity = activations[rnn_type]
            except KeyError:
                raise ValueError("""An invalid option for `--model` was supplied,
                      options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""")
            self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity,
                              dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)

        if tie_weights:
            # Sharing the matrix only works when both have the same shape.
            if nhid != ninp:
                raise ValueError('When using the tied flag, nhid must be equal '
                                 'to emsize.')
            self.decoder.weight = self.encoder.weight

        self.init_weights()

    def init_weights(self):
        """Initialize embedding and projection weights uniformly in
        [-0.1, 0.1] and zero the projection bias."""
        bound = 0.1
        nn.init.uniform_(self.encoder.weight, -bound, bound)
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -bound, bound)

    def forward(self, x, hidden):
        """Return the output scores for ``x`` and the updated hidden state."""
        embedded = self.drop(self.encoder(x))
        rnn_out, hidden = self.rnn(embedded, hidden)
        return self.decoder(self.drop(rnn_out)), hidden

    def init_hidden(self, bsz):
        """Return an all-zero initial hidden state for a batch of size ``bsz``.

        For 'LSTM' this is a pair of tensors, otherwise a single tensor; each
        tensor has shape (nlayers, bsz, nhid) and is created from the model's
        first parameter via ``new_zeros``.
        """
        reference = next(self.parameters())
        state_shape = (self.nlayers, bsz, self.nhid)
        if self.rnn_type != 'LSTM':
            return reference.new_zeros(*state_shape)
        return (reference.new_zeros(*state_shape),
                reference.new_zeros(*state_shape))


class PositionalEncoding(nn.Module):
    r"""Inject some information about the relative or absolute position of the
        tokens in the sequence. The positional encodings have the same dimension
        as the embeddings, so that the two can be summed. Here, we use sine and
        cosine functions of different frequencies.
    .. math::
        \text{PosEncoder}(pos, 2i) = \sin(pos / 10000^{2i / d_{model}})
        \text{PosEncoder}(pos, 2i+1) = \cos(pos / 10000^{2i / d_{model}})
    where ``pos`` is the word position and ``i`` is the embed idx.
    Args:
        d_model: the embed dim (required). Both even and odd values are
            supported; for an odd value the last column is a sine column.
        dropout: the dropout value (default=0.1).
        max_len: the max. length of the incoming sequence (default=5000).
    Examples:
        >>> pos_encoder = PositionalEncoding(d_model)
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one cosine column fewer than sine columns,
        # so only the first d_model // 2 frequencies are used here. For even
        # d_model the slice covers every column.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        # Stored as [max_len, 1, d_model] so it broadcasts over the batch axis.
        pe = pe.unsqueeze(0).transpose(0, 1)
        # A buffer, not a parameter: kept in the state dict, never trained.
        self.register_buffer('pe', pe)

    def forward(self, x):
        r"""Inputs of forward function
        Args:
            x: the sequence fed to the positional encoder model (required).
        Shape:
            x: [sequence length, batch size, embed dim]
            output: [sequence length, batch size, embed dim]
        Examples:
            >>> output = pos_encoder(x)
        """

        # Add the encodings of the first x.size(0) positions, then dropout.
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)


class TransformerModel(nn.Module):
    """Transformer language model: embedding, positional encoding, a stack of
    transformer encoder layers and an output projection.

    Args:
        ntoken:      Vocabulary size.
        ninp:        Embedding / model dimension.
        nhead:       Number of attention heads.
        nhid:        Feed-forward dimension of each encoder layer.
        nlayers:     Number of encoder layers.
        dropout:     Dropout probability.
        activation:  Activation passed to the encoder layers.
        tie_weights: If True, the output projection shares its weight matrix
                     with the embedding; this constructor then requires
                     ``nhid == ninp``.
    """

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5,
                 activation="relu", tie_weights=False):
        super(TransformerModel, self).__init__()
        try:
            from torch.nn import TransformerEncoder, TransformerEncoderLayer
        except ImportError:
            raise ImportError('TransformerEncoder module does not exist in '
                              'PyTorch 1.1 or lower.')
        self.model_type = 'Transformer'
        # Causal mask, built on first use and reused while the sequence
        # length stays the same.
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        layer = TransformerEncoderLayer(ninp, nhead, nhid, dropout,
                                        activation)
        self.transformer_encoder = TransformerEncoder(layer, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)
        if tie_weights:
            if nhid != ninp:
                raise ValueError('When using the tied flag, nhid must be equal '
                                 'to emsize.')
            self.decoder.weight = self.encoder.weight
        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        """Return an (sz, sz) float mask with 0.0 on and below the diagonal
        and -inf above it."""
        return torch.triu(torch.full((sz, sz), float('-inf')), diagonal=1)

    def init_weights(self):
        """Initialize embedding and projection weights uniformly in
        [-0.1, 0.1] and zero the projection bias."""
        bound = 0.1
        nn.init.uniform_(self.encoder.weight, -bound, bound)
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -bound, bound)

    def forward(self, src, has_mask=True):
        """Return the output scores for the token indices ``src``.

        With ``has_mask`` a causal mask is applied in the encoder; without it
        the cached mask is dropped and no mask is used.
        """
        if not has_mask:
            self.src_mask = None
        elif self.src_mask is None or self.src_mask.size(0) != len(src):
            # (Re)build the mask only when the sequence length changed.
            self.src_mask = self._generate_square_subsequent_mask(
                len(src)).to(src.device)
        embedded = self.pos_encoder(self.encoder(src) * math.sqrt(self.ninp))
        encoded = self.transformer_encoder(embedded, self.src_mask)
        return self.decoder(encoded)


================================================
FILE: egs/steps/pytorchnn/train.py
================================================
""" This script is modified based on the word language model example in PyTorch:
    https://github.com/pytorch/examples/tree/master/word_language_model
    An example of model training and N-best rescoring can be found here:
    egs/swbd/s5c/local/pytorchnn/run_nnlm.sh
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import time
import math
import random
import torch
import torch.nn as nn
import torch.optim as optim

import data
import model

# Command-line interface of the training script. Parsing happens at import
# time, so this module is meant to be run as a script.
parser = argparse.ArgumentParser(description="Train and evaluate a neural "
                                 "language model with PyTorch.")
# Model options
parser.add_argument('--data', type=str, default='./data/pytorchnn',
                    help='location of the data corpus')
parser.add_argument('--model', type=str, default='LSTM',
                    help='type of model architecture. can be RNN_TANH, '
                    'RNN_RELU, LSTM, GRU or Transformer.')
parser.add_argument('--emsize', type=int, default=200,
                    help='size of word embeddings')
parser.add_argument('--nhid', type=int, default=200,
                    help='number of hidden units per layer')
parser.add_argument('--nlayers', type=int, default=2,
                    help='number of layers')
parser.add_argument('--nhead', type=int, default=2,
                    help='the number of heads in the encoder/decoder of the '
                    'transformer model')

# Training options
parser.add_argument('--lr', type=float, default=0.1,
                    help='initial learning rate')
parser.add_argument('--batch-size', type=int, default=20, metavar='N',
                    help='batch size')
parser.add_argument('--epochs', type=int, default=20,
                    help='upper epoch limit')
# NOTE: this option is spelled with an underscore, unlike the hyphenated
# options around it.
parser.add_argument('--seq_len', type=int, default=35,
                    help='sequence length limit')
parser.add_argument('--clip', type=float, default=0.25,
                    help='gradient clipping')
parser.add_argument('--dropout', type=float, default=0.2,
                    help='dropout applied to layers')
parser.add_argument('--tied', action='store_true',
                    help='tie the word embedding and softmax weights')
parser.add_argument('--optimizer', type=str, default='SGD',
                    help='type of optimizer')
parser.add_argument('--log-interval', type=int, default=200, metavar='N',
                    help='report interval')

# Device options
parser.add_argument('--cuda', action='store_true', help='use CUDA')
parser.add_argument('--save', type=str, default='model.pt',
                    help='path to save the final model')
parser.add_argument('--seed', type=int, default=1111,
                    help='random seed')

args = parser.parse_args()
# Dictionary view of the parsed options, used below to print them.
params = vars(args)

# Set the random seed for reproducibility
random.seed(args.seed)
torch.manual_seed(args.seed)
if torch.cuda.is_available():
    if not args.cuda:
        print('WARNING: You have a CUDA device, so you should probably run '
              'with --cuda')
    else:
        # The CUDA generators are only seeded when --cuda was requested.
        torch.cuda.manual_seed_all(args.seed)

print('Configurations')
for arg, p in params.items():
    print(arg, p)

# The device is chosen by the --cuda flag alone.
device = torch.device("cuda" if args.cuda else "cpu")

#############################
# Load data
#############################
corpus = data.Corpus(args.data)


def batchify(data, bsz, random_start_idx=False):
    """Arrange a token tensor into ``bsz`` parallel streams.

    The input is trimmed to ``nbatch * bsz`` elements (``nbatch`` being
    ``data.size(0) // bsz``), viewed as ``(bsz, nbatch)``, transposed to
    ``(nbatch, bsz)`` and moved to the module-level ``device``.

    Args:
        data: tensor to split along dimension 0 (presumably a 1-D tensor of
            token ids -- confirm against ``data.Corpus``).
        bsz: number of parallel streams (columns of the result).
        random_start_idx: if true, start the trimmed window at a random
            offset inside the leftover tokens instead of at index 0.

    Returns:
        A contiguous tensor on ``device`` whose second dimension is ``bsz``.
    """
    # Work out how cleanly we can divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    if random_start_idx:
        # Number of tokens that do not fit into nbatch full rows; the
        # window may be shifted by less than this amount.
        remainder = data.size(0) % bsz
        # With no leftover tokens the original random.randint(0, -1) call
        # raised ValueError; fall back to offset 0 in that case.
        if remainder > 0:
            start_idx = random.randint(0, remainder - 1)
        else:
            start_idx = 0
    else:
        start_idx = 0
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, start_idx, nbatch * bsz)
    # Evenly divide the data across the bsz batches
    data = data.view(bsz, -1).t().contiguous()
    return data.to(device)


# Validation and test data always use this fixed batch size; only the
# training split honours --batch-size.
eval_batch_size = 20
train_data = batchify(corpus.train, args.batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

#############################
# Build the model
#############################
# Vocabulary size; used as the output dimension when flattening logits.
ntokens = len(corpus.dictionary)
# NOTE: the assignments below rebind the name `model` (presumably the
# imported model-definition module -- confirm at the top of the file) to the
# constructed network, so `model.TransformerModel` / `model.RNNModel` are no
# longer reachable through that name afterwards.
if args.model == 'Transformer':
    # The activation function can be 'relu' (default) or 'gelu'
    model = model.TransformerModel(ntokens, args.emsize, args.nhead, args.nhid,
                      args.nlayers, args.dropout, "gelu", args.tied).to(device)
else:
    # Any other --model value is passed to RNNModel as its first argument.
    model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid,
                           args.nlayers, args.dropout, args.tied).to(device)

# Total number of scalar parameters in the network, for logging only.
total_params = sum(x.data.nelement() for x in model.parameters())
print('Args: {}'.format(args))
print('Model total parameters: {}'.format(total_params))

# Loss applied to the flattened (N, ntokens) outputs in train()/evaluate().
criterion = nn.CrossEntropyLoss()

#############################
# Training part
#############################


def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    return tuple(repackage_hidden(v) for v in h)


# Divide the source data into chunks of length args.seq_len.
def get_batch(source, i):
    """Cut one input/target pair out of ``source`` starting at row ``i``.

    The chunk holds at most ``args.seq_len`` rows and is shortened near the
    end so that a target row exists for every input row.  Targets are the
    rows shifted by one position, flattened to one dimension.
    """
    span = min(args.seq_len, len(source) - 1 - i)
    stop = i + span
    inputs = source[i:stop]
    labels = source[i + 1:stop + 1].view(-1)
    return inputs, labels


def train():
    """Run one pass over ``train_data`` and update the model in place.

    Works entirely on module-level state: ``model``, ``optimizer``,
    ``criterion``, ``train_data``, ``ntokens`` and ``args``.  The progress
    line additionally reads the module-level ``epoch`` and ``lr`` set by the
    training loop.  Returns nothing.
    """
    # Turn on training model which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    # Only the recurrent models carry a hidden state between chunks.
    if args.model != 'Transformer':
        hidden = model.init_hidden(args.batch_size)
    # `batch` counts chunks; `i` is the starting row of the chunk.
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.seq_len)):
        data, targets = get_batch(train_data, i)
        optimizer.zero_grad()
        if args.model == 'Transformer':
            output = model(data)
        else:
            # Starting each batch, the hidden state is detached from how it was
            # previously produced. Otherwise, the model would try
            # backpropagating all the way to start of the dataset.
            hidden = repackage_hidden(hidden)
            output, hidden = model(data, hidden)

        # Flatten the outputs to (N, ntokens) to match the flattened targets.
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # 'clip_grad_norm' helps prevent the exploding gradient problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()

        total_loss += loss.item()
        # Report the running average every args.log_interval batches, then
        # reset the accumulator and the timer.
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.3f} | '
                  'ms/batch {:5.2f} | loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, batch, len(train_data) // args.seq_len, lr,
                      elapsed * 1000 / args.log_interval, cur_loss,
                      math.exp(cur_loss)))
            total_loss = 0.
            start_time = time.time()


def evaluate(source):
    """Return the average per-row loss of the model over ``source``.

    ``source`` is a batchified tensor whose batch size is
    ``eval_batch_size``.  The model is put into evaluation mode and no
    gradients are recorded.  Each chunk's loss is weighted by its number of
    rows, and the sum is divided by ``len(source) - 1``.
    """
    # Evaluation mode disables dropout.
    model.eval()
    is_transformer = args.model == 'Transformer'
    # Recurrent models thread a hidden state through consecutive chunks.
    hidden = None if is_transformer else model.init_hidden(eval_batch_size)
    loss_sum = 0.
    with torch.no_grad():
        for start in range(0, source.size(0) - 1, args.seq_len):
            inputs, targets = get_batch(source, start)
            if is_transformer:
                output = model(inputs)
            else:
                output, hidden = model(inputs, hidden)
                hidden = repackage_hidden(hidden)
            chunk_loss = criterion(output.view(-1, ntokens), targets)
            loss_sum += len(inputs) * chunk_loss.item()
    return loss_sum / (len(source) - 1)


#############################
# Train the model
#############################
# Current learning rate; halved whenever validation loss fails to improve.
lr = args.lr
best_val_loss = None
optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9,
                      weight_decay=1e-5)
# Number of epochs (so far) in which validation loss did not improve.
counter = 0
print("Start training")
try:
    for epoch in range(1, args.epochs + 1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
              'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                         val_loss, math.exp(val_loss)))
        print('-' * 89)

        # Save the model if validation loss is the best we've seen so far.
        # Saving state_dict is preferable.
        # NOTE(review): `not best_val_loss` is also true for a best loss of
        # exactly 0.0, not only for the initial None.
        if not best_val_loss or val_loss < best_val_loss:
            with open(args.save, 'wb') as f:
                torch.save(model.state_dict(), f)
            best_val_loss = val_loss
        else:
            # No improvement: halve the learning rate.  A fresh SGD optimizer
            # is constructed, so the previous optimizer object (and whatever
            # state it held) is discarded.
            lr /= 2.
            optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9,
                                  weight_decay=1e-5)
            counter += 1

        # Early stopping
        # `counter` is never reset, so this fires after 8 non-improving
        # epochs in total, not 8 in a row.
        if counter == 8:
            break
except KeyboardInterrupt:
    # Ctrl-C ends training but still falls through to the test evaluation.
    print('-' * 89)
    print('Exiting from training early')

# Load the best saved model.
# NOTE(review): if training was interrupted before the first save, this opens
# whatever already exists at args.save, or fails if nothing does.
with open(args.save, 'rb') as f:
    # map_location keeps the loaded storages where they were deserialized.
    model.load_state_dict(torch.load(f, map_location=lambda storage, loc: storage))
    if args.model in ['RNN_TANH', 'RNN_RELU', 'LSTM', 'GRU']:
        model.rnn.flatten_parameters()

# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
      test_loss, math.exp(test_loss)))
print('=' * 89)


================================================
FILE: egs/steps/resegment_data.sh
================================================
#!/usr/bin/env bash

# Copyright Johns Hopkins University (Author: Daniel Povey) 2013.  Apache 2.0.

# This script segments speech data based on some kind of decoding of
# whole recordings (e.g. whole conversation sides).  See
# egs/swbd/s5b/local/run_resegment.sh for an example of usage.
# You'll probably want to use the script resegment_text.sh as well.

# begin configuration section.
stage=0      # parts of the script guarded by "$stage -le N" are skipped when stage > N.
cmd=run.pl   # job launcher used for the per-job segmentation pipeline.
cleanup=true # if true, the per-job $dir/segments.*.gz files are removed at the end of stage 1.
segmentation_opts=  # E.g. set this as --segmentation-opts "--silence-proportion 0.2 --max-segment-length 10"

#end configuration section.

# path.sh is optional; parse_options.sh overrides the variables above from
# --name value command-line options.
[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1;

# Exactly five positional arguments are required; otherwise print the usage
# text and exit with status 1.
if [ $# -ne 5 ]; then
  echo "Usage: $0 [options] <in-data-dir> <lang> <decode-dir|ali-dir> <out-data-dir> <temp/log-dir>"
  echo " Options:"
  echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
  echo "    --stage (0|1)                   # start the script from part-way through."
  echo "    --cleanup (true|false)          # remove the per-job segments.*.gz files when done."
  echo "    --segmentation-opts '--opt1 opt1val --opt2 opt2val' # options for segmentation.pl"
  echo "e.g.:"
  # The example must list all five arguments, including the lang directory.
  echo "$0 data/train_unseg data/lang exp/tri3b/decode_train_unseg data/train_seg exp/tri3b_resegment"
  exit 1;
fi

# Positional arguments.
data=$1
lang=$2
alidir=$3 # may actually be decode-dir.
data_out=$4
dir=$5

mkdir -p $data_out || exit 1;
rm $data_out/* 2>/dev/null # Old stuff that's partial can cause problems later if
                           # we call fix_data_dir.sh; it will cause things to be 
                           # thrown out.
mkdir -p $dir/log || exit 1;

# The phone table of the lang directory must be compatible with the one the
# alignments were produced with; a copy is kept in the work directory.
utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $alidir/phones.txt $dir || exit 1;

# Fail early if any required input file is missing.
for f in $data/feats.scp $lang/phones.txt $alidir/ali.1.gz $alidir/num_jobs; do
  if [ ! -f $f ]; then 
    echo "$0: no such file $f"
    exit 1;
    fi
done

# Locate the model: prefer $alidir/final.mdl, else fall back to the parent
# directory (the case where $alidir is a decode directory).
if [ -f $alidir/final.mdl ]; then
  model=$alidir/final.mdl
else
  if [ ! -f $alidir/../final.mdl ]; then
    echo "$0: found no model in $alidir/final.mdl or $alidir/../final.mdl"
    exit 1;
  fi
  model=$alidir/../final.mdl
fi

# get lists of sil,noise,nonsil phones
# convert *.ali.gz to *.ali.gz with 0,1,2.
# run perl script..
# output segments?


# The phone map below singles out one silence phone, so exactly one entry
# is required in optional_silence.txt.
if ! [ `cat $lang/phones/optional_silence.txt | wc -w` -eq 1 ]; then
  echo "Error: this script only works if $lang/phones/optional_silence.txt contains exactly one entry.";
  echo "You'd have to modify the script to handle other cases."
  exit 1;
fi

silphone=`cat $lang/phones/optional_silence.txt` 
# silphone will typically be "sil" or "SIL". 

# 3 sets of phones: 0 is silence, 1 is noise, 2 is speech.
# Every silence phone other than $silphone is put in the noise class.
(
 echo "$silphone 0"
 grep -v -w $silphone $lang/phones/silence.txt | awk '{print $1, 1;}'
 cat $lang/phones/nonsilence.txt | awk '{print $1, 2;}'
) > $dir/phone_map.txt


# Number of parallel jobs the alignments/lattices were split into.
nj=`cat $alidir/num_jobs` || exit 1;

# Stage 0: for each job, turn the alignments into per-frame phone sequences,
# convert phone ids to symbols, map the symbols to the classes 0/1/2 from
# phone_map.txt, run segmentation.pl on the result and gzip the output to
# $dir/segments.JOB.gz.
# NOTE(review): the exit status of this $cmd invocation is not checked.
if [ $stage -le 0 ]; then
  $cmd JOB=1:$nj $dir/log/resegment.JOB.log \
    ali-to-phones --per-frame=true "$model" "ark:gunzip -c $alidir/ali.JOB.gz|" ark,t:- \| \
    utils/int2sym.pl -f 2- $lang/phones.txt \| \
    utils/apply_map.pl -f 2- $dir/phone_map.txt \| \
    utils/segmentation.pl $segmentation_opts \| \
    gzip -c '>' $dir/segments.JOB.gz
fi

# Stage 1: assemble the output data directory.
if [ $stage -le 1 ]; then
  # reco2file_and_channel, glm and stm are optional and copied if present;
  # wav.scp is mandatory.
  if [ -f $data/reco2file_and_channel ]; then
    cp $data/reco2file_and_channel $data_out/reco2file_and_channel
  fi
  if [ -f $data/wav.scp ]; then
    cp $data/wav.scp $data_out/wav.scp
  else
    echo "Expected file $data/wav.scp to exist" # or there is really nothing to copy.
    exit 1
  fi
  for f in glm stm; do 
    if [ -f $data/$f ]; then
      cp $data/$f $data_out/$f
    fi
  done

  # Concatenate the per-job segment lists into one sorted segments file.
  for n in `seq $nj`; do gunzip -c $dir/segments.$n.gz; done | \
    sort > $data_out/segments || exit 1;

  [ ! -s $data_out/segments ] && echo "No data produced" && exit 1;

  # We'll make the speaker-ids be the same as the recording-ids (e.g. conversation
  # sides).  This will normally be OK for telephone data.
  # (Fields 1 and 2 of the segments file are written out as utt2spk.)
  cat $data_out/segments | awk '{print $1, $2}' > $data_out/utt2spk || exit 1
  utils/utt2spk_to_spk2utt.pl $data_out/utt2spk > $data_out/spk2utt || exit 1

  if $cleanup; then
    rm $dir/segments.*.gz
  fi
fi

# Summary, printed regardless of $stage: total of (field 4 - field 3) over
# all segments, reported in hours.
cat $data_out/segments | awk '{num_secs += $4 - $3;} END{print "Number of hours of data is " (num_secs/3600);}'


================================================
FILE: egs/steps/resegment_text.sh
================================================
#!/usr/bin/env bash

# Copyright Johns Hopkins University (Author: Daniel Povey) 2013.  Apache 2.0.

# This script takes two data directories that represent different
# segmentations of the same data (both must have "segments" files and
# the recording-ids must match), and it converts the text in one directory
# to correspond to the segmentation in the other.  Its output is the
# "text" file in the second directory.  To get the alignments, it
# must be provided an "alignment" directory where the training data
# from the first directory has been aligned.

# begin configuration section.
stage=0    # parts of the script guarded by "$stage -le N" are skipped when stage > N.
cmd=run.pl # job launcher passed on to get_train_ctm.sh.

#end configuration section.

# path.sh is optional; parse_options.sh overrides the variables above from
# --name value command-line options.
[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1;

# Exactly five positional arguments are required.
if [ $# -ne 5 ]; then
  echo "Usage: $0 [options] <in-data-dir> <lang> <ali-dir|model-dir> <out-data-dir> <temp/log-dir>"
  echo " Options:"
  echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
  echo "    --stage (0|1|2)                 # start scoring script from part-way through."
  echo "e.g.:"
  echo "$0 data/train data/lang exp/tri3b_ali_all data/train_reseg exp/tri3b_resegment"
  exit 1;
fi

data=$1
lang=$2
alidir=$3
data_out=$4
dir=$5


mkdir -p $dir/log || exit 1;

# Fail early if any required input is missing.  Note that the *output* data
# directory must already contain reco2file_and_channel and segments.
for f in $data/feats.scp $lang/phones.txt $alidir/ali.1.gz $alidir/num_jobs \
   $alidir/final.mdl $data_out/reco2file_and_channel $data_out/segments; do
  if [ ! -f $f ]; then 
    echo "$0: no such file $f"
    exit 1;
  fi
done


# Stage 0: produce a ctm from the alignments (stage 1 reads it from
# $alidir/ctm).
if [ $stage -le 0 ]; then
  echo "$0: calling get_train_ctm.sh to produce ctms of the alignments."
  # Caution: this will produce logs in $alidir/log/get_ctm.log
  steps/get_train_ctm.sh --cmd "$cmd" $data $lang $alidir || exit 1;  
fi


# Stage 1: rewrite the ctm so that each line is keyed by recording-id.
# The awk program first loads $data_out/reco2file_and_channel (3 fields per
# line: recording-id, file, channel) into a map keyed by "file&channel", then
# replaces the first two fields of each 5-field ctm line by the recording-id.
# It exits with status 1/2/3 on a malformed map line, a malformed ctm line,
# or an unknown file/channel pair respectively.
# NOTE(review): the exit status of this pipeline is not checked.
if [ $stage -le 1 ]; then
  if [ ! -s $alidir/ctm ]; then
    # Report the file that was actually tested ($alidir/ctm, not $data/ctm).
    echo "$0: file $alidir/ctm does not exist or is empty."
    exit 1;
  fi
  echo "$0: converting ctm to a format where we have the recording-id ..."
  echo "$0: ... in place of the side and channel, e.g. sw02008-B instead of sw02008 B"

  cat $alidir/ctm | awk -v r=$data_out/reco2file_and_channel  \
   'BEGIN{while((getline < r) > 0) { if(NF!=3) {exit(1);} map[ $2 "&" $3 ] = $1;}}
    {if (NF!=5) {print "bad line " $0; exit(2);} reco = map[$1 "&" $2];
     if (length(reco) == 0) { print "Bad key " $1 "&" $2; exit(3); } 
     print reco, $3, $4, $5; } ' > $dir/ctm_per_reco
fi

# Stage 2: build the new "text" file.
# The Perl program reads $dir/ctm_per_reco (recording-id, start, length,
# word) and buckets the words by (recording-id, chunk), where
# chunk = int(start / $chunk_size).  Note that $chunk_size is 3, although
# the inline comment in the program speaks of 5-second chunks.
# It then reads the new segments from stdin (utt, recording-id, start, end)
# and, for each segment, collects the words from the chunks it overlaps whose
# start and end times both lie inside [start, end].  Segments with at least
# one word are printed as "utt word word ..."; segments without words are
# only counted, so they are absent from the output text file.
if [ $stage -le 2 ]; then
  cat $data_out/segments | perl -e '
     @ARGV == 1 || die;
     $ctm_per_reco = shift @ARGV;
     $chunk_size = 3;
     open(C, "<$ctm_per_reco") || die "opening ctm file $ctm_per_reco";
     # we build up an associative array indexed by a pair of ids: $reco,$n
     # where $n is a 5-second chunk of time.
     sub to_chunk { my $t = shift @_; return int($t / $chunk_size); }
     while (<C>) {
       @A = split;  @A == 4 || die "Bad line $_ in $ctm_per_reco";
       ($reco, $start, $length, $word) = @A;
       $chunk = to_chunk($start);
       if (! defined $reco2list{$reco,$chunk} ){ $reco2list{$reco,$chunk} = [ ]; } # new anonymous array
       $arrayref = $reco2list{$reco,$chunk};
       push @$arrayref, [ $start, $length, $word ]; # another level of anonymous array..
     }
     $num_utts = 0; $num_empty = 0;
     while(<STDIN>) {
       @A = split;  @A == 4 || die "Bad line $_ in stdin";
       ($utt, $reco, $start, $end) = @A;
       @text = ();
       for ($chunk = to_chunk($start); $chunk <= to_chunk($end); $chunk++) {
         $arrayref = $reco2list{$reco,$chunk};
         if (defined $arrayref) {
           foreach $entry ( @$arrayref ) { # note, $entry is itself an arrayref
                                           # to an array containing $start $end $word.
             $word_start = $$entry[0];
             if ($word_start >= $start && $word_start <= $end) {
               $word_end = $$entry[1] + $word_start;
               if ($word_end >= $start && $word_end <= $end) {
                 $word = $$entry[2]; defined $word || die;
                 push @text, $word;
               }
             }
           }
         }
       }
       $num_utts++;
       if (@text > 0) { $t = join(" ", @text); print "$utt $t\n";; }
       else { $num_empty++; }
     }
     print STDERR "Processed $num_utts utterances, of which $num_empty had no text.\n"; ' \
       $dir/ctm_per_reco | sort > $data_out/text || exit 1;

  # Word counts before and after: "wc" prints lines, words, chars, so
  # (words - lines) is the number of words excluding the utterance-ids.
  nw_old=`cat $data/text | wc | awk '{print $2 - $1}'`
  nw_new=`cat $data_out/text | wc | awk '{print $2 - $1}'`
  echo "Number of words of training text changed from $nw_old to $nw_new";

  if [ ! -s $data_out/text ]; then
    echo "$0: produced empty output.  Something went wrong."
    exit 1;
  fi
fi


================================================
FILE: egs/steps/rnnlmrescore.sh
================================================
#!/usr/bin/env bash

# please see lmrescore_rnnlm_lat.sh which is a newer script using lattices.

# Begin configuration section.
N=10        # size of the N-best lists extracted from the lattices.
inv_acwt=12 # inverse acoustic weight used for N-best generation (acwt = 1/inv_acwt).
cmd=run.pl  # job launcher for the parallel steps.
use_phi=false  # This is kind of an obscure option.  If true, we'll remove the old
  # LM weights (times 1-RNN_scale) using a phi (failure) matcher, which is
  # appropriate if the old LM weights were added in this way, e.g. by
  # lmrescore.sh.  Otherwise we'll use normal composition, which is appropriate
  # if the lattices came directly from decoding.  This won't actually make much
  # difference (if any) to WER, it's more so we know we are doing the right thing.
test=false # Activate a testing option.
stage=1 # Stage of this script, for partial reruns.
rnnlm_ver=rnnlm-0.3e   # passed to utils/rnnlm_compute_scores.sh as --rnnlm_ver.
skip_scoring=false     # if true, local/score.sh is not run at the end.
keep_ali=true          # if false, alignments are stripped from the N-best lists with lattice-rmali.
# End configuration section.


echo "$0 $@"  # Print the command line for logging

# path.sh is optional; parse_options.sh overrides the variables above from
# --name value command-line options.
[ -f ./path.sh ] && . ./path.sh
. utils/parse_options.sh


# Exactly six positional arguments are required; otherwise print the usage
# text and exit with status 1.  Option names listed here must match the
# configuration variables above (dashes stand for underscores), which is why
# the phi-matcher switch is spelled --use-phi.
if [ $# != 6 ]; then
   echo "Do language model rescoring of lattices (partially remove old LM, add new LM)"
   echo "This version applies an RNNLM and mixes it with the LM scores"
   echo "previously in the lattices., controlled by the first parameter (rnnlm-weight)"
   echo ""
   echo "Usage: $0 [options] <rnn-weight> <old-lang-dir> <rnn-dir> <data-dir> <input-decode-dir> <output-decode-dir>"
   echo "Main options:"
   echo "  --inv-acwt <inv-acwt>          # default 12.  e.g. --inv-acwt 17.  Equivalent to LM scale to use."
   echo "                                 # for N-best list generation... note, we'll score at different acwt's"
   echo "  --cmd <run.pl|queue.pl [opts]> # how to run jobs."
   echo "  --use-phi (true|false)         # Should be set to true if the source lattices were created"
   echo "                                 # by lmrescore.sh, false if they came from decoding."
   echo "  --N <N>                        # Value of N in N-best rescoring (default: 10)"
   echo "  --stage <stage>                # Stage to start from, for partial reruns (default: 1)"
   echo "  --skip-scoring (true|false)    # If true, do not run local/score.sh at the end."
   exit 1;
fi


# Positional arguments.
rnnweight=$1
oldlang=$2
rnndir=$3
data=$4
indir=$5
dir=$6


acwt=`perl -e "print (1.0/$inv_acwt);"` # Note: we'll actually produce lattices
 # that will be scored at a range of acoustic weights.  This acwt should be close
 # to the final one we'll pick, though, for best performance (it controls the
 # N-best list generation).

# Figures out if the old LM is G.fst or G.carpa
# (G.carpa takes precedence when both exist.)
oldlm=$oldlang/G.fst
if [ -f $oldlang/G.carpa ]; then
  oldlm=$oldlang/G.carpa
elif [ ! -f $oldlm ]; then
  echo "$0: expecting either $oldlang/G.fst or $oldlang/G.carpa to exist" &&\
    exit 1;
fi

# Fail early if any required input file is missing.
for f in $rnndir/rnnlm $data/feats.scp $indir/lat.1.gz; do
  [ ! -f $f ] && echo "$0: expected file $f to exist." && exit 1;
done

# The output directory uses the same job split as the input decode directory.
nj=`cat $indir/num_jobs` || exit 1;
mkdir -p $dir;
cp $indir/num_jobs $dir/num_jobs

# Prefix of the per-job archive directories ($adir.1, $adir.2, ...).
adir=$dir/archives

# Integer id of the '#0' symbol in words.txt; used as the phi label when
# use_phi is true.
phi=`grep -w '#0' $oldlang/words.txt | awk '{print $2}'`

rm $dir/.error 2>/dev/null
mkdir -p $dir/log

# First convert lattice to N-best.  Be careful because this
# will be quite sensitive to the acoustic scale; this should be close
# to the one we'll finally get the best WERs with.
# Note: the lattice-rmali part here is just because we don't
# need the alignments for what we're doing.
# (lattice-rmali is only run when keep_ali is false.)
if [ $stage -le 1 ]; then
  echo "$0: converting lattices to N-best."
  if $keep_ali; then
    $cmd JOB=1:$nj $dir/log/lat2nbest.JOB.log \
      lattice-to-nbest --acoustic-scale=$acwt --n=$N \
      "ark:gunzip -c $indir/lat.JOB.gz|" \
      "ark:|gzip -c >$dir/nbest1.JOB.gz" || exit 1;
  else
    $cmd JOB=1:$nj $dir/log/lat2nbest.JOB.log \
      lattice-to-nbest --acoustic-scale=$acwt --n=$N \
      "ark:gunzip -c $indir/lat.JOB.gz|" ark:- \|  \
      lattice-rmali ark:- "ark:|gzip -c >$dir/nbest1.JOB.gz" || exit 1;
  fi
fi

# next remove part of the old LM probs.
# Three variants, all reading nbest1.JOB.gz and writing nbest2.JOB.gz:
#   - old LM is G.fst and use_phi is true:  phi-matcher composition;
#   - old LM is G.fst and use_phi is false: normal composition + 1-best;
#   - old LM is G.carpa: lattice-lmrescore-const-arpa with --lm-scale=-1.0.
# In the two G.fst variants the scores are negated before composing with the
# old LM and negated again afterwards.
if [ "$oldlm" == "$oldlang/G.fst" ]; then
  if $use_phi; then
    if [ $stage -le 2 ]; then
      echo "$0: removing old LM scores."
      # Use the phi-matcher style of composition.. this is appropriate
      # if the old LM scores were added e.g. by lmrescore.sh, using
      # phi-matcher composition.
      $cmd JOB=1:$nj $dir/log/remove_old.JOB.log \
        lattice-scale --acoustic-scale=-1 --lm-scale=-1 "ark:gunzip -c $dir/nbest1.JOB.gz|" ark:- \| \
        lattice-compose --phi-label=$phi ark:- $oldlm ark:- \| \
        lattice-scale --acoustic-scale=-1 --lm-scale=-1 ark:- "ark:|gzip -c >$dir/nbest2.JOB.gz" \
        || exit 1;
    fi
  else
    if [ $stage -le 2 ]; then
      echo "$0: removing old LM scores."
      # this approach chooses the best path through the old LM FST, while
      # subtracting the old scores.  If the lattices came straight from decoding,
      # this is what we want.  Note here: each FST in "nbest1.JOB.gz" is a linear FST,
      # it has no alternatives (the N-best format works by having multiple keys
      # for each utterance).  When we do "lattice-1best" we are selecting the best
      # path through the LM, there are no alternatives to consider within the
      # original lattice.
      $cmd JOB=1:$nj $dir/log/remove_old.JOB.log \
        lattice-scale --acoustic-scale=-1 --lm-scale=-1 "ark:gunzip -c $dir/nbest1.JOB.gz|" ark:- \| \
        lattice-compose ark:- "fstproject --project_output=true $oldlm |" ark:- \| \
        lattice-1best ark:- ark:- \| \
        lattice-scale --acoustic-scale=-1 --lm-scale=-1 ark:- "ark:|gzip -c >$dir/nbest2.JOB.gz" \
        || exit 1;
    fi
  fi
else
  if [ $stage -le 2 ]; then
    echo "$0: removing old LM scores."
    $cmd JOB=1:$nj $dir/log/remove_old.JOB.log \
      lattice-lmrescore-const-arpa --lm-scale=-1.0 \
      "ark:gunzip -c $dir/nbest1.JOB.gz|" $oldlm \
      "ark:|gzip -c >$dir/nbest2.JOB.gz"  || exit 1;
  fi
fi

# Stage 3: split nbest2 (old LM removed) into four text archives per job:
# ali, words, lmwt.nolm and acwt, stored in $adir.JOB/.
if [ $stage -le 3 ]; then
# Decompose the n-best lists into 4 archives.
  echo "$0: creating separate-archive form of N-best lists."
  $cmd JOB=1:$nj $dir/log/make_new_archives.JOB.log \
    mkdir -p $adir.JOB '&&' \
    nbest-to-linear "ark:gunzip -c $dir/nbest2.JOB.gz|" \
    "ark,t:$adir.JOB/ali" "ark,t:$adir.JOB/words" \
    "ark,t:$adir.JOB/lmwt.nolm" "ark,t:$adir.JOB/acwt" || exit 1;
fi

# Stage 4: from nbest1 (old LM still included) keep only the LM/graph score
# archive, written as lmwt.withlm; the other outputs go to /dev/null.
if [ $stage -le 4 ]; then
  echo "$0: doing the same with old LM scores."
# Create an archive with the LM scores before we
# removed the LM probs (will help us do interpolation).
$cmd JOB=1:$nj $dir/log/make_old_archives.JOB.log \
  nbest-to-linear "ark:gunzip -c $dir/nbest1.JOB.gz|" "ark:/dev/null" \
  "ark:/dev/null" "ark,t:$adir.JOB/lmwt.withlm" "ark:/dev/null" || exit 1;
fi

# With test=true the script rebuilds lattices from the unmodified scores and
# exits here with status 0; none of the later stages (nor scoring) is run.
if $test; then # This branch is a sanity check that at the acwt where we generated
  # the N-best list, we get the same WER.
  echo "$0 [testing branch]: generating lattices without changing scores."
  $cmd JOB=1:$nj $dir/log/test.JOB.log \
    linear-to-nbest "ark:$adir.JOB/ali" "ark:$adir.JOB/words" "ark:$adir.JOB/lmwt.withlm" \
     "ark:$adir.JOB/acwt" ark:- \| \
    nbest-to-lattice ark:- "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1;
  exit 0;
fi

# Stage 5: per job, write the word sequences in text form (words_text) and
# compute the old-LM-only score as (lmwt.withlm - lmwt.nolm).  After "paste",
# field 2 is the no-LM score and field 4 the with-LM score.
if [ $stage -le 5 ]; then
  echo "$0: Creating archives with text-form of words, and LM scores without graph scores."
    # Do some small tasks; for these we don't use the queue, it will only slow us down.
  for n in `seq $nj`; do
    utils/int2sym.pl -f 2- $oldlang/words.txt < $adir.$n/words > $adir.$n/words_text || exit 1;
    mkdir -p $adir.$n/temp
    paste $adir.$n/lmwt.nolm $adir.$n/lmwt.withlm | awk '{print $1, ($4-$2);}' > \
      $adir.$n/lmwt.lmonly || exit 1;
  done
fi
# Stage 6: compute RNNLM scores for the text-form word sequences; the result
# is written to $adir.JOB/lmwt.rnn.
if [ $stage -le 6 ]; then
  echo "$0: invoking utils/rnnlm_compute_scores.sh which calls rnnlm, to get RNN LM scores."
  $cmd JOB=1:$nj $dir/log/rnnlm_compute_scores.JOB.log \
    utils/rnnlm_compute_scores.sh --rnnlm_ver $rnnlm_ver $rnndir $adir.JOB/temp $adir.JOB/words_text $adir.JOB/lmwt.rnn \
    || exit 1;
fi
# Stage 7: interpolate.  After "paste", field 2 is the graph score without
# LM, field 4 the old LM score and field 6 the RNNLM score; the new score is
# graph + rnnweight * rnn + (1 - rnnweight) * lm.
if [ $stage -le 7 ]; then
  echo "$0: reconstructing total LM+graph scores including interpolation of RNNLM and old LM scores."
  for n in `seq $nj`; do
    paste $adir.$n/lmwt.nolm $adir.$n/lmwt.lmonly $adir.$n/lmwt.rnn | awk -v rnnweight=$rnnweight \
      '{ key=$1; graphscore=$2; lmscore=$4; rnnscore=$6;
     score = graphscore+(rnnweight*rnnscore)+((1-rnnweight)*lmscore);
     print $1,score; } ' > $adir.$n/lmwt.interp.$rnnweight || exit 1;
  done
fi

# Stage 8: put alignments, words, interpolated LM scores and acoustic scores
# back together and write them as lattices to $dir/lat.JOB.gz.
if [ $stage -le 8 ]; then
  echo "$0: reconstructing archives back into lattices."
  $cmd JOB=1:$nj $dir/log/reconstruct_lattice.JOB.log \
    linear-to-nbest "ark:$adir.JOB/ali" "ark:$adir.JOB/words" \
    "ark:$adir.JOB/lmwt.interp.$rnnweight" "ark:$adir.JOB/acwt" ark:- \| \
    nbest-to-lattice ark:- "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1;
fi

# Scoring.  Note that a missing or non-executable local/score.sh is treated
# as an error (exit 1) unless skip_scoring is true.
if ! $skip_scoring ; then
  [ ! -x local/score.sh ] && \
    echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
  local/score.sh --cmd "$cmd" $data $oldlang $dir ||
    { echo "$0: Scoring failed. (ignore by '--skip-scoring true')"; exit 1; }
fi

exit 0;


================================================
FILE: egs/steps/scoring/score_kaldi_cer.sh
================================================
#!/usr/bin/env bash
# Copyright 2012-2014  Johns Hopkins University (Author: Daniel Povey, Yenda Trmal)
# Apache 2.0

# This script computes the CER (Character Error Rate) as opposed to the script
# local/score_kaldi.sh (which computes WER i.e. Word Error Rate).
# If you need to compute both the WER and CER, you can use the stage parameters
# i.e. write your own local/score.sh that will contain
# 
# steps/scoring/score_kaldi_wer.sh "$@"
# steps/scoring/score_kaldi_cer.sh --stage 2 "$@"
#
# NOTE: it would work without the --stage 2, but this way it's more efficient
# as the lattice decoding won't be run twice.


[ -f ./path.sh ] && . ./path.sh

# begin configuration section.
cmd=run.pl                   # job launcher for the parallel steps.
decode_mbr=false             # if true, use lattice-mbr-decode instead of lattice-best-path.
stats=true                   # if true, write detailed statistics to scoring_kaldi/cer_details.
beam=6                       # lattice pruning beam (only used when decode_mbr is true).
stage=0                      # parts of the script guarded by "$stage -le N" are skipped when stage > N.
word_ins_penalty=0.0,0.5,1.0 # comma-separated list of word insertion penalties to try.
min_lmwt=7                   # LM weights from min_lmwt to max_lmwt (inclusive) are tried.
max_lmwt=17
iter=final
#end configuration section.

echo "$0 $@"  # Print the command line for logging
# path.sh is sourced a second time here (it was already sourced above).
[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1;

# Exactly three positional arguments are required; otherwise print the usage
# text and exit with status 1.
if [ $# -ne 3 ]; then
  echo "Usage: $0 [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>"
  echo " Options:"
  echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
  echo "    --stage (0|1|2|3)               # start scoring script from part-way through."
  echo "    --decode_mbr (true/false)       # minimum Bayes risk decoding (confusion network)."
  echo "    --min_lmwt <int>                # minimum LM-weight for lattice rescoring "
  echo "    --max_lmwt <int>                # maximum LM-weight for lattice rescoring "
  exit 1;
fi

# Positional arguments.
data=$1
lang_or_graph=$2
dir=$3

symtab=$lang_or_graph/words.txt

# Fail early if any required input file is missing.
for f in $symtab $dir/lat.1.gz $data/text; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done


# Text filters.  The default is "cat" (no filtering).  An executable
# local/wer_output_filter is used for both reference and hypothesis; the more
# specific local/wer_ref_filter / local/wer_hyp_filter take precedence over
# it because they are tested last.
ref_filtering_cmd="cat"
[ -x local/wer_output_filter ] && ref_filtering_cmd="local/wer_output_filter"
[ -x local/wer_ref_filter ] && ref_filtering_cmd="local/wer_ref_filter"
hyp_filtering_cmd="cat"
[ -x local/wer_output_filter ] && hyp_filtering_cmd="local/wer_output_filter"
[ -x local/wer_hyp_filter ] && hyp_filtering_cmd="local/wer_hyp_filter"


if $decode_mbr ; then
  echo "$0: scoring with MBR, word insertion penalty=$word_ins_penalty"
else
  echo "$0: scoring with word insertion penalty=$word_ins_penalty"
fi


# The filtered reference is (re)written on every run, whatever $stage is.
mkdir -p $dir/scoring_kaldi
cat $data/text | $ref_filtering_cmd > $dir/scoring_kaldi/test_filt.txt || exit 1;
# Stage 0: for every word insertion penalty and every LM weight in
# [min_lmwt, max_lmwt], decode the lattices to word-level hypotheses
# ($dir/scoring_kaldi/penalty_<wip>/<LMWT>.txt) and score them against the
# filtered reference, writing the result to $dir/wer_<LMWT>_<wip>.
if [ $stage -le 0 ]; then

  for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
    mkdir -p $dir/scoring_kaldi/penalty_$wip/log

    if $decode_mbr ; then
      # MBR branch: scale, add penalty, prune with $beam, then
      # lattice-mbr-decode.  The "acwt=..." assignment at the start of the
      # job command is not referenced by the commands that follow it.
      $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \
        acwt=\`perl -e \"print 1.0/LMWT\"\`\; \
        lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
        lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \
        lattice-prune --beam=$beam ark:- ark:- \| \
        lattice-mbr-decode  --word-symbol-table=$symtab \
        ark:- ark,t:- \| \
        utils/int2sym.pl -f 2- $symtab \| \
        $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1;

    else
      # Best-path branch: scale, add penalty, then lattice-best-path.
      $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \
        lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
        lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \
        lattice-best-path --word-symbol-table=$symtab ark:- ark,t:- \| \
        utils/int2sym.pl -f 2- $symtab \| \
        $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1;
    fi

    # Word-level scoring of the hypotheses just produced.
    $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/score.LMWT.log \
      cat $dir/scoring_kaldi/penalty_$wip/LMWT.txt \| \
      compute-wer --text --mode=present \
      ark:$dir/scoring_kaldi/test_filt.txt  ark,p:- ">&" $dir/wer_LMWT_$wip || exit 1;

  done
fi


# The stage number 2 is intentional, to allow nice coexistence with
# score_kaldi.sh in cases where the user combines calls to these two scripts
# as shown in the example at the top of the file. Otherwise the caller would
# have to filter the script parameters instead of simply forwarding them.
#
# Stage 2: convert the reference and every hypothesis file to character
# level (<name>.chars.txt next to each <name>.txt) and score the characters,
# writing the results to $dir/cer_<LMWT>_<wip>.
if [ $stage -le 2 ] ; then
  # Files to tokenize: the filtered reference plus one hypothesis file per
  # (penalty, LM weight) pair.
  files=($dir/scoring_kaldi/test_filt.txt)
  for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
    for lmwt in $(seq $min_lmwt $max_lmwt); do
      files+=($dir/scoring_kaldi/penalty_${wip}/${lmwt}.txt)
    done
  done

  # An executable local/character_tokenizer overrides the built-in Perl
  # tokenizer.  The built-in one keeps the first field (the utterance-id) and
  # any token matching [..], <..> or !SIL intact, and splits every other
  # token into its individual characters.
  for f in "${files[@]}" ; do
    fout=${f%.txt}.chars.txt
    if [ -x local/character_tokenizer ]; then
      cat $f |  local/character_tokenizer > $fout
    else
      cat $f |  perl -CSDA -ane '
        {
          print $F[0];
          foreach $s (@F[1..$#F]) {
            if (($s =~ /\[.*\]/) || ($s =~ /\<.*\>/) || ($s =~ "!SIL")) {
              print " $s";
            } else {
              @chars = split "", $s;
              foreach $c (@chars) {
                print " $c";
              }
            }
          }
          print "\n";
        }' > $fout
    fi
  done

  # Character-level scoring with compute-wer on the tokenized files.
  for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
    $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/score.cer.LMWT.log \
      cat $dir/scoring_kaldi/penalty_$wip/LMWT.chars.txt \| \
      compute-wer --text --mode=present \
      ark:$dir/scoring_kaldi/test_filt.chars.txt  ark,p:- ">&" $dir/cer_LMWT_$wip || exit 1;
  done
fi

# Stage 3: pick the best CER over all (LM weight, penalty) pairs and,
# if $stats is true, produce detailed statistics for that configuration.
if [ $stage -le 3 ] ; then
  for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
    for lmwt in $(seq $min_lmwt $max_lmwt); do
      # adding /dev/null to the command list below forces grep to output the filename
      grep WER $dir/cer_${lmwt}_${wip} /dev/null
    done
  done | utils/best_wer.sh  >& $dir/scoring_kaldi/best_cer || exit 1

  # The last field of best_cer is the winning file name, cer_<lmwt>_<wip>;
  # split it on "_" to recover the penalty (last part) and the LM weight
  # (second-to-last part).
  best_cer_file=$(awk '{print $NF}' $dir/scoring_kaldi/best_cer)
  best_wip=$(echo $best_cer_file | awk -F_ '{print $NF}')
  best_lmwt=$(echo $best_cer_file | awk -F_ '{N=NF-1; print $N}')

  if [ -z "$best_lmwt" ]; then
    echo "$0: we could not get the details of the best CER from the file $dir/cer_*.  Probably something went wrong."
    exit 1;
  fi

  if $stats; then
    mkdir -p $dir/scoring_kaldi/cer_details
    echo $best_lmwt > $dir/scoring_kaldi/cer_details/lmwt # record best language model weight
    echo $best_wip > $dir/scoring_kaldi/cer_details/wip # record best word insertion penalty

    # Per-utterance alignment details (per_utt) and per-speaker details
    # (per_spk) for the best configuration.
    $cmd $dir/scoring_kaldi/log/stats1.cer.log \
      cat $dir/scoring_kaldi/penalty_$best_wip/${best_lmwt}.chars.txt \| \
      align-text --special-symbol="'***'" ark:$dir/scoring_kaldi/test_filt.chars.txt ark:- ark,t:- \|  \
      utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \| tee $dir/scoring_kaldi/cer_details/per_utt \|\
       utils/scoring/wer_per_spk_details.pl $data/utt2spk \> $dir/scoring_kaldi/cer_details/per_spk || exit 1;

    # Operation details (ops), derived from per_utt and sorted.
    $cmd $dir/scoring_kaldi/log/stats2.cer.log \
      cat $dir/scoring_kaldi/cer_details/per_utt \| \
      utils/scoring/wer_ops_details.pl --special-symbol "'***'" \| \
      sort -b -i -k 1,1 -k 4,4rn -k 2,2 -k 3,3 \> $dir/scoring_kaldi/cer_details/ops || exit 1;

    # Output of compute-wer-bootci for the best configuration.
    $cmd $dir/scoring_kaldi/log/cer_bootci.cer.log \
      compute-wer-bootci --mode=present \
        ark:$dir/scoring_kaldi/test_filt.chars.txt ark:$dir/scoring_kaldi/penalty_$best_wip/${best_lmwt}.chars.txt \
        '>' $dir/scoring_kaldi/cer_details/cer_bootci || exit 1;

  fi
fi

# If we got here, the scoring was successful.
# As a small aid to prevent confusion, we remove all wer_{?,??} files;
# these originate from the previous version of the scoring files.
# Both statements are kept here because dropping one could lead to confusion
# about the capabilities of the script (we don't do cer in the script).
rm $dir/wer_{?,??} 2>/dev/null
rm $dir/cer_{?,??} 2>/dev/null

exit 0;


================================================
FILE: egs/steps/scoring/score_kaldi_compare.sh
================================================
#!/usr/bin/env bash
# Copyright 2016 Nicolas Serrano
# Apache 2.0

# Compares the best-scoring transcripts of two scoring directories by running
# compute-wer-bootci on both against the reference of the first directory.
# The result is written to <score-compare-dir>/wer_bootci_comparison.

[ -f ./path.sh ] && . ./path.sh

# begin configuration section.
cmd=run.pl         # job launcher.
replications=10000 # passed to compute-wer-bootci as --replications.
#end configuration section.

echo "$0 $@"  # Print the command line for logging
# path.sh is sourced a second time here (it was already sourced above).
[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1;

if [ $# -ne 3 ]; then
  echo "Usage: $0 [--cmd (run.pl|queue.pl...)] <score-dir1> <score-dir2> <score-compare-dir>"
  echo " Options:"
  echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
  echo "    --replications <int>            # number of bootstrap evaluation to compute confidence."
  exit 1;
fi

dir1=$1
dir2=$2
dir_compare=$3

mkdir -p $dir_compare/log

# Both scoring directories must contain the filtered reference and the
# best_wer summary.
for d in $dir1 $dir2; do
  for f in test_filt.txt best_wer; do
    [ ! -f $d/$f ] && echo "$0: no such file $d/$f" && exit 1;
  done
done


# The last field of best_wer is the path of the winning wer_<lmwt>_<wip>
# file.  Strip everything up to "wer_", split the rest on "_" and build the
# path of the matching transcript: <dir>/penalty_<wip>/<lmwt>.txt.
best_wer_file1=$(awk '{print $NF}' $dir1/best_wer)
best_transcript_file1=$(echo $best_wer_file1 | sed -e 's=.*/wer_==' | \
        awk -v FS='_' -v dir=$dir1 '{print dir"/penalty_"$2"/"$1".txt"}')

best_wer_file2=$(awk '{print $NF}' $dir2/best_wer)
best_transcript_file2=$(echo $best_wer_file2 | sed -e 's=.*/wer_==' | \
        awk -v FS='_' -v dir=$dir2 '{print dir"/penalty_"$2"/"$1".txt"}')

# Only the reference of the first directory is used.
# NOTE(review): presumably both directories score the same test set, so the
# two references agree -- this is not checked here.
$cmd $dir_compare/log/score_compare.log \
  compute-wer-bootci --replications=$replications \
    ark:$dir1/test_filt.txt ark:$best_transcript_file1 ark:$best_transcript_file2 \
    '>' $dir_compare/wer_bootci_comparison || exit 1;

exit 0;


================================================
FILE: egs/steps/scoring/score_kaldi_wer.sh
================================================
#!/usr/bin/env bash
# Copyright 2012-2014  Johns Hopkins University (Author: Daniel Povey, Yenda Trmal)
# Apache 2.0

# Setup part of the WER scoring script: option defaults, command-line parsing,
# input checks and selection of the reference/hypothesis text filters.
# See the script steps/scoring/score_kaldi_cer.sh in case you need to evaluate CER

if [ -f ./path.sh ]; then . ./path.sh; fi

# begin configuration section.
cmd=run.pl
stage=0
decode_mbr=false
stats=true
beam=6
word_ins_penalty=0.0,0.5,1.0
min_lmwt=7
max_lmwt=17
iter=final
#end configuration section.

echo "$0 $@"  # Print the command line for logging
if [ -f ./path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

if [ $# != 3 ]; then
  echo "Usage: $0 [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>"
  echo " Options:"
  echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
  echo "    --stage (0|1|2)                 # start scoring script from part-way through."
  echo "    --decode_mbr (true/false)       # maximum bayes risk decoding (confusion network)."
  echo "    --min_lmwt <int>                # minumum LM-weight for lattice rescoring "
  echo "    --max_lmwt <int>                # maximum LM-weight for lattice rescoring "
  exit 1;
fi

data=$1
lang_or_graph=$2
dir=$3

symtab=$lang_or_graph/words.txt

# The word symbol table, the first lattice archive and the reference text
# all have to be present.
for required in $symtab $dir/lat.1.gz $data/text; do
  if [ ! -f $required ]; then
    echo "score.sh: no such file $required" && exit 1;
  fi
done


# Reference filter: the dedicated local/wer_ref_filter wins over the shared
# local/wer_output_filter; without either one the text is passed through "cat".
if [ -x local/wer_ref_filter ]; then
  ref_filtering_cmd="local/wer_ref_filter"
elif [ -x local/wer_output_filter ]; then
  ref_filtering_cmd="local/wer_output_filter"
else
  ref_filtering_cmd="cat"
fi

# Hypothesis filter: same precedence, with local/wer_hyp_filter as the
# dedicated variant.
if [ -x local/wer_hyp_filter ]; then
  hyp_filtering_cmd="local/wer_hyp_filter"
elif [ -x local/wer_output_filter ]; then
  hyp_filtering_cmd="local/wer_output_filter"
else
  hyp_filtering_cmd="cat"
fi


# Report the scoring mode; "MBR, " is only mentioned when MBR decoding is on.
mbr_note=""
if $decode_mbr ; then
  mbr_note="MBR, "
fi
echo "$0: scoring with ${mbr_note}word insertion penalty=$word_ins_penalty"


# Write the filtered reference text; this is done regardless of --stage.
mkdir -p $dir/scoring_kaldi
cat $data/text | $ref_filtering_cmd > $dir/scoring_kaldi/test_filt.txt || exit 1;
# Stage 0: for every word insertion penalty and every LM weight, produce a
# hypothesis transcript from the lattices and score it with compute-wer.
if [ $stage -le 0 ]; then

  # --word_ins_penalty is a comma-separated list; iterate over its entries.
  for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
    mkdir -p $dir/scoring_kaldi/penalty_$wip/log

    # Both branches write the filtered hypothesis for one LM weight to
    # penalty_$wip/LMWT.txt. The pipes, the redirection and the quotes are
    # escaped so that they reach $cmd as arguments instead of being
    # interpreted by this shell.
    # NOTE(review): presumably $cmd (run.pl/queue.pl) replaces LMWT with each
    # value in min_lmwt..max_lmwt - confirm against the job runner.
    if $decode_mbr ; then
      # NOTE(review): acwt is assigned here but not referenced by the visible
      # pipeline below.
      $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \
        acwt=\`perl -e \"print 1.0/LMWT\"\`\; \
        lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
        lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \
        lattice-prune --beam=$beam ark:- ark:- \| \
        lattice-mbr-decode  --word-symbol-table=$symtab \
        ark:- ark,t:- \| \
        utils/int2sym.pl -f 2- $symtab \| \
        $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1;

    else
      $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \
        lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
        lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \
        lattice-best-path --word-symbol-table=$symtab ark:- ark,t:- \| \
        utils/int2sym.pl -f 2- $symtab \| \
        $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1;
    fi

    # Score each hypothesis against the filtered reference; the compute-wer
    # output (stdout and stderr) ends up in $dir/wer_<LMWT>_<wip>.
    $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/score.LMWT.log \
      cat $dir/scoring_kaldi/penalty_$wip/LMWT.txt \| \
      compute-wer --text --mode=present \
      ark:$dir/scoring_kaldi/test_filt.txt  ark,p:- ">&" $dir/wer_LMWT_$wip || exit 1;

  done
fi


# Stage 1: pick the best (LM weight, word insertion penalty) combination and,
# if --stats is true, write detailed statistics for it.
if [ $stage -le 1 ]; then

  # Collect the WER line of every wer_<lmwt>_<wip> file and let
  # utils/best_wer.sh select from them; its output is stored in best_wer.
  for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
    for lmwt in $(seq $min_lmwt $max_lmwt); do
      # adding /dev/null to the command list below forces grep to output the filename
      grep WER $dir/wer_${lmwt}_${wip} /dev/null
    done
  done | utils/best_wer.sh  >& $dir/scoring_kaldi/best_wer || exit 1

  # The last field of best_wer is used as the winning file name, which ends in
  # _<lmwt>_<wip>: the last "_"-separated field is the penalty, the one before
  # it the LM weight.
  best_wer_file=$(awk '{print $NF}' $dir/scoring_kaldi/best_wer)
  best_wip=$(echo $best_wer_file | awk -F_ '{print $NF}')
  best_lmwt=$(echo $best_wer_file | awk -F_ '{N=NF-1; print $N}')

  if [ -z "$best_lmwt" ]; then
    echo "$0: we could not get the details of the best WER from the file $dir/wer_*.  Probably something went wrong."
    exit 1;
  fi

  if $stats; then
    mkdir -p $dir/scoring_kaldi/wer_details
    echo $best_lmwt > $dir/scoring_kaldi/wer_details/lmwt # record best language model weight
    echo $best_wip > $dir/scoring_kaldi/wer_details/wip # record best word insertion penalty

    # Align the best hypothesis with the reference and write the
    # per-utterance (per_utt) and per-speaker (per_spk) details.
    $cmd $dir/scoring_kaldi/log/stats1.log \
      cat $dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \| \
      align-text --special-symbol="'***'" ark:$dir/scoring_kaldi/test_filt.txt ark:- ark,t:- \|  \
      utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \| tee $dir/scoring_kaldi/wer_details/per_utt \|\
       utils/scoring/wer_per_spk_details.pl $data/utt2spk \> $dir/scoring_kaldi/wer_details/per_spk || exit 1;

    # Derive the sorted operation details (ops) from the per-utterance file.
    $cmd $dir/scoring_kaldi/log/stats2.log \
      cat $dir/scoring_kaldi/wer_details/per_utt \| \
      utils/scoring/wer_ops_details.pl --special-symbol "'***'" \| \
      sort -b -i -k 1,1 -k 4,4rn -k 2,2 -k 3,3 \> $dir/scoring_kaldi/wer_details/ops || exit 1;

    # Run compute-wer-bootci for the best hypothesis and store its output.
    $cmd $dir/scoring_kaldi/log/wer_bootci.log \
      compute-wer-bootci --mode=present \
        ark:$dir/scoring_kaldi/test_filt.txt ark:$dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \
        '>' $dir/scoring_kaldi/wer_details/wer_bootci || exit 1;

  fi
fi

# If we got here, the scoring was successful.
# As a small aid to prevent confusion, we remove all wer_{?,??} files;
# these originate from the previous version of the scoring files.
# Both statements are kept here although this script does not compute CER
# itself; cer_{?,??} files are removed as well.
# Errors from rm (e.g. no matching file) are discarded.
rm $dir/wer_{?,??} 2>/dev/null
rm $dir/cer_{?,??} 2>/dev/null

exit 0;


================================================
FILE: egs/steps/search_index.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Guoguo Chen)
# Apache 2.0

# Runs kws-search for the keywords in <kws-data-dir>/keywords.fsts[.gz] over
# the indices index.JOB.gz and writes result.JOB.gz and stats.JOB.gz to
# <kws-dir>. The number of jobs is read from <indices-dir>/num_jobs.

# Begin configuration section.
cmd=run.pl
nbest=-1    # NOTE(review): accepted as an option but not used further down.
strict=true
indices_dir=
frame_subsampling_factor=1
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

if [ $# != 2 ]; then
   echo "Usage: steps/search_index.sh [options] <kws-data-dir> <kws-dir>"
   echo " e.g.: steps/search_index.sh data/kws exp/sgmm2_5a_mmi/decode/kws/"
   echo ""
   echo "main options (for others, see top of script file)"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   echo "  --nbest <int>                                    # return n best results. (-1 means all)"
   echo "  --indices-dir <path>                             # where the indices should be stored, by default it will be in <kws-dir>"
   exit 1;
fi


kwsdatadir=$1;
kwsdir=$2;

# The indices live in <kws-dir> unless --indices-dir says otherwise.
# The variable is quoted so that the test is well-formed for any value.
if [ -z "$indices_dir" ] ; then
  indices_dir=$kwsdir
fi

mkdir -p $kwsdir/log;
nj=`cat $indices_dir/num_jobs` || exit 1;

# Prefer the gzipped keyword FSTs; in that case the rspecifier is a quoted
# "gunzip -c ...|" pipe, with the quotes kept so that they survive until $cmd.
if [ -f $kwsdatadir/keywords.fsts.gz ]; then
  keywords="\"gunzip -c $kwsdatadir/keywords.fsts.gz|\""
elif [ -f $kwsdatadir/keywords.fsts ]; then
  keywords=$kwsdatadir/keywords.fsts;
else
  echo "$0: no such file $kwsdatadir/keywords.fsts[.gz]" && exit 1;
fi

# Only the first index is checked for existence. The message reports this
# script's own name (it used to name make_index.sh).
if [ ! -f $indices_dir/index.1.gz ]; then
  echo "$0: no such file $indices_dir/index.1.gz"
  exit 1;
fi

$cmd JOB=1:$nj $kwsdir/log/search.JOB.log \
  kws-search --strict=$strict --negative-tolerance=-1 \
  --frame-subsampling-factor=${frame_subsampling_factor} \
  "ark:gzip -cdf $indices_dir/index.JOB.gz|" ark:$keywords \
  "ark,t:|gzip -c > $kwsdir/result.JOB.gz" \
  "ark,t:|gzip -c > $kwsdir/stats.JOB.gz" || exit 1;

exit 0;


================================================
FILE: egs/steps/segmentation/ali_to_targets.sh
================================================
#!/usr/bin/env bash

# Copyright 2017  Vimal Manohar
# Apache 2.0

# This script converts alignments into targets for training neural network
# for speech activity detection. The mapping from phones to speech / silence / garbage
# is defined by the options --silence-phones and --garbage-phones.
# This is similar to the script steps/segmentation/lats_to_targets.sh which
# converts lattices to targets. See that script for details about the
# targets matrix.

set -o pipefail

silence_phones=         # file with silence phones; default: derived from <lang>
garbage_phones=         # file with garbage phones; default: the OOV phone of <lang>
max_phone_duration=0.5  # divided by the frame shift to get --max-phone-length

cmd=run.pl

[ -f ./path.sh ] && . ./path.sh
. utils/parse_options.sh

if [ $# -ne 4 ]; then
  cat <<EOF
  This script converts alignments into targets for training neural network
  for speech activity detection. The mapping from phones to speech / silence / garbage
  is defined by the options --silence-phones and --garbage-phones.

  This is similar to the script steps/segmentation/lats_to_targets.sh which 
  converts lattices to targets. See that script for details about the 
  targets matrix.

  Usage: steps/segmentation/ali_to_targets.sh <data-dir> <lang> <ali-dir> <targets-dir>"
  e.g.: steps/segmentation/ali_to_targets.sh \
  --silence-phones data/lang/phones/optional_silence.txt \
  --garbage-phones data/lang/phones/silence.txt \
  --max-phone-duration 0.5 \
  data/train_split10s data/lang \
  exp/segmentation1a/tri3b_train_split10s_ali \
  exp/segmentation1a/tri3b_train_split10s_targets
EOF
  exit 1
fi

data=$1
lang=$2
ali_dir=$3
dir=$4

# The model is taken from the alignment directory if it has a final.mdl,
# otherwise from its parent directory.
if [ -f $ali_dir/final.mdl ]; then
  srcdir=$ali_dir
else
  srcdir=$ali_dir/..
fi

for f in $data/utt2spk $ali_dir/ali.1.gz $srcdir/final.mdl; do
  if [ ! -f $f ]; then
    echo "$0: Could not find file $f"
    exit 1
  fi
done

mkdir -p $dir

# Garbage phones: the user-supplied list, or the OOV phone of <lang>.
if [ -z "$garbage_phones" ]; then
  oov_phone=$(steps/segmentation/internal/get_oov_phone.py $lang) || exit 1
  echo $oov_phone | utils/int2sym.pl $lang/phones.txt > $dir/garbage_phones.txt || exit 1
else
  cp $garbage_phones $dir/garbage_phones.txt || exit 1
fi

# Silence phones: the user-supplied list, or the silence phones of <lang>
# without those already listed as garbage.
if [ -z "$silence_phones" ]; then
  cat $lang/silence_phones.txt | \
    utils/filter_scp.pl --exclude $dir/garbage_phones.txt > \
    $dir/silence_phones.txt
else
  cp $silence_phones $dir/silence_phones.txt
fi

nj=$(cat $ali_dir/num_jobs) || exit 1

# Convert the alignments to CTM-style phone entries, map the phone ids in
# field 5 to symbols and rewrite each line as
# "<field1> <int(field3)> <int(field4)> 1.0 <phone>".
$cmd JOB=1:$nj $dir/log/get_arc_info.JOB.log \
  ali-to-phones --ctm-output --frame-shift=1 \
    $srcdir/final.mdl "ark:gunzip -c $ali_dir/ali.JOB.gz |" - \| \
  utils/int2sym.pl -f 5 $lang/phones.txt \| \
  awk '{print $1" "int($3)" "int($4)" 1.0 "$5}' \> \
  $dir/arc_info_sym.JOB.txt || exit 1

# make $dir an absolute pathname.
dir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $dir ${PWD}`

# Propagate the frame subsampling factor of the model directory to the
# targets directory. The file that is read must be the same file that is
# tested for; reading "frames_subsampling_factor" (as before) always failed
# and stored an empty value.
frame_subsampling_factor=1
if [ -f $srcdir/frame_subsampling_factor ]; then
  frame_subsampling_factor=$(cat $srcdir/frame_subsampling_factor) || exit 1
  echo $frame_subsampling_factor > $dir/frame_subsampling_factor
fi

frame_shift=$(utils/data/get_frame_shift.sh $data) || exit 1
max_phone_len=$(perl -e "print int($max_phone_duration / $frame_shift)")

$cmd JOB=1:$nj $dir/log/get_targets.JOB.log \
  steps/segmentation/internal/arc_info_to_targets.py \
    --silence-phones=$dir/silence_phones.txt \
    --garbage-phones=$dir/garbage_phones.txt \
    --max-phone-length=$max_phone_len \
    $dir/arc_info_sym.JOB.txt - \| \
  copy-feats ark,t:- \
    ark,scp:$dir/targets.JOB.ark,$dir/targets.JOB.scp || exit 1

# Concatenate the per-job scp files in job order.
for n in $(seq $nj); do
  cat $dir/targets.$n.scp
done > $dir/targets.scp

steps/segmentation/validate_targets_dir.sh $dir $data || exit 1

echo "$0: Done creating targets in $dir/targets.scp"


================================================
FILE: egs/steps/segmentation/combine_targets_dirs.sh
================================================
#!/usr/bin/env bash

# Copyright 2017 Nagendra Kumar Goel
#           2018 Vimal Manohar
# Apache 2.0.

# This script combines targets directory into a new targets directory
# containing targets from all the input targets directories.
# All source directories must agree on the frame-subsampling-factor
# (a missing frame_subsampling_factor file counts as 1).

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

if [ $# -lt 3 ]; then
  echo "Usage: $0 [options] <data> <dest-targets-dir> <src-targets-dir1> <src-targets-dir2> ..."
  echo "e.g.: $0 data/train exp/targets_combined exp/targets_1 exp/targets_2"
  exit 1;
fi

export LC_ALL=C

data=$1;
shift;
dest=$1;
shift;
first_src=$1;
# From here on the positional parameters are exactly the source directories.

mkdir -p $dest;
rm -f $dest/{targets.*.ark,frame_subsampling_factor} 2>/dev/null

# The destination inherits the factor of the first source directory.
frame_subsampling_factor=1
if [ -f $first_src/frame_subsampling_factor ]; then
  cp $first_src/frame_subsampling_factor $dest
  frame_subsampling_factor=$(cat $dest/frame_subsampling_factor)
fi

# Validate all sources before combining anything. This loop is deliberately
# NOT part of the pipeline below: an "exit 1" inside a pipeline only leaves
# the subshell, so the failure used to go unnoticed and the script carried on.
for d in "$@"; do
  if [ ! -f "$d/targets.scp" ]; then
    echo "$0: no such file $d/targets.scp" 1>&2
    exit 1
  fi

  this_frame_subsampling_factor=1
  if [ -f "$d/frame_subsampling_factor" ]; then
    this_frame_subsampling_factor=$(cat "$d/frame_subsampling_factor")
  fi

  if [ "$this_frame_subsampling_factor" != "$frame_subsampling_factor" ]; then
    echo "$0: Cannot combine targets directories with different frame-subsampling-factors" 1>&2
    exit 1
  fi
done

# Concatenate all targets.scp files and sort them on the first field.
for d in "$@"; do
  cat "$d/targets.scp"
done | sort -k1,1 > $dest/targets.scp || exit 1

steps/segmentation/validate_targets_dir.sh $dest $data || exit 1

echo "Combined targets and stored in $dest"
exit 0


================================================
FILE: egs/steps/segmentation/convert_targets_dir_to_whole_recording.sh
================================================
#!/usr/bin/env bash

# Copyright 2017  Vimal Manohar
# Apache 2.0

# This script converts targets corresponding to 'data' at segments level
# in 'targets_dir' to whole-recording level corresponding to the
# whole-recording data directory 'whole_data'.

# The targets for the whole-recording are created by simply copying the targets
# for the in-segment region, while setting the out-of-segment region targets
# to the target values contained in the file specified
# (in kaldi vector text format) by --default-targets option.
# By default, the 'default_targets' would be [ 0 0 0 ].
# Note that the script steps/segmentation/get_targets_for_out_of_segments.sh
# can be used to get targets only for the out-of-segment regions. It is
# better to use that when you need specific target values like all silence
# ([ 1 0 0 ]) or all garbage ([ 0 0 1 ]) for the out-of-segment regions.
# That way you can control how the out-of-segment target values are
# combined using the weights in steps/segmentation/merge_targets_dirs.sh

nj=4
cmd=run.pl
default_targets=   # vector of default targets in text format

set -o pipefail -u

[ -f ./path.sh ] && . ./path.sh
. utils/parse_options.sh

if [ $# -ne 4 ]; then
  cat <<EOF
  This script converts targets corresponding to 'data' at segments level 
  in 'targets_dir' to whole-recording level corresponding to the 
  whole-recording data directory 'whole_data'.
  See top of the script for more details.

  Usage: $0 <data-dir> <whole-data-dir> <targets-dir> <whole-targets-dir>
   e.g.: $0 \
    data/train_split10s data/train_whole \
    exp/segmentation1a/tri3b_train_split10s_targets \
    exp/segmentation1a/tri3b_train_whole_targets
EOF
  exit 1
fi

data=$1
whole_data=$2
targets_dir=$3
dir=$4

# Without a segments file the targets are copied over unchanged, after
# checking that 'data' covers at least 95% of the recordings in 'whole_data'.
if [ ! -f $data/segments ]; then
  # The output directory may not exist yet; the files below are written
  # into it.
  mkdir -p $dir

  awk '{print $1}' $whole_data/wav.scp > $dir/recos
  utils/filter_scp.pl $data/utt2spk $dir/recos > $dir/recos.data

  # nr: recordings in whole_data, nu: those of them also present in data.
  # (This used to count the non-existent file $dir/reco, which made nr 0 and
  # silently disabled the check below.)
  nr=$(cat $dir/recos | wc -l)
  nu=$(cat $dir/recos.data | wc -l)

  if [ $nu -lt $(($nr - ($nr/20))) ]; then
    echo "Found less that 95% the recordings of $whole_data in $data."
    exit 1;
  fi

  cp $targets_dir/targets.scp $dir
  cp $targets_dir/frame_subsampling_factor $dir || true

  exit 0
fi

for f in $data/segments $targets_dir/targets.scp \
  $whole_data/wav.scp; do
  if [ ! -f $f ]; then
    echo "$0: Could not find file $f"
    exit 1
  fi
done

# The frame shift of the targets is the feature frame shift times the frame
# subsampling factor of the targets directory. The file that is read must be
# the one that is tested for (it used to read "frames_subsampling_factor",
# which made the script exit whenever the factor file existed).
frame_shift=$(utils/data/get_frame_shift.sh $data) || exit 1
frame_subsampling_factor=1
if [ -f $targets_dir/frame_subsampling_factor ]; then
  frame_subsampling_factor=$(cat $targets_dir/frame_subsampling_factor) || exit 1
fi
frame_shift=`perl -e "print ($frame_shift * $frame_subsampling_factor);"`

# Split the recordings into $nj parts and restrict reco2utt, segments and
# targets.scp to each part.
mkdir -p $dir/split${nj}reco
split_scps=
for n in $(seq $nj); do
  split_scps="$split_scps $dir/split${nj}reco/wav.$n.scp"
done
utils/split_scp.pl $whole_data/wav.scp $split_scps

utils/data/get_reco2utt_for_data.sh $data > $dir/reco2utt

utils/filter_scps.pl JOB=1:$nj $dir/split${nj}reco/wav.JOB.scp $dir/reco2utt \
  $dir/split${nj}reco/reco2utt.JOB || exit 1
utils/filter_scps.pl -f 2 JOB=1:$nj $dir/split${nj}reco/wav.JOB.scp $data/segments \
    $dir/split${nj}reco/segments.JOB || exit 1
utils/filter_scps.pl JOB=1:$nj $dir/split${nj}reco/segments.JOB $targets_dir/targets.scp \
    $dir/split${nj}reco/targets.JOB.scp || exit 1

# make $dir an absolute pathname.
dir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $dir ${PWD}`

# The utt2num_frames of the whole-recording data directory serves as the
# number of frames per recording.
utils/data/get_utt2num_frames.sh --cmd "$cmd" --nj $nj $whole_data
cp $whole_data/utt2num_frames $dir/reco2num_frames

$cmd JOB=1:$nj $dir/log/merge_targets_to_reco.JOB.log \
  steps/segmentation/internal/merge_segment_targets_to_recording.py \
    --reco2num-frames=$dir/reco2num_frames --frame-shift=$frame_shift \
    --default-targets="$default_targets" \
    $dir/split${nj}reco/reco2utt.JOB $dir/split${nj}reco/segments.JOB \
    $dir/split${nj}reco/targets.JOB.scp - \| \
  copy-feats ark,t:- ark,scp:$dir/targets.JOB.ark,$dir/targets.JOB.scp || exit 1

# Merge the per-job scp files, sorted on the first field.
for n in $(seq $nj); do
  cat $dir/targets.$n.scp
done | sort -k1,1 > $dir/targets.scp

steps/segmentation/validate_targets_dir.sh $dir $whole_data || exit 1

echo "$0: Converted targets to whole recordings in $dir"
exit 0


================================================
FILE: egs/steps/segmentation/convert_utt2spk_and_segments_to_rttm.py
================================================
#!/usr/bin/env python

# Copyright 2016  Vimal Manohar
# Apache 2.0.

"""This script converts kaldi-style utt2spk and segments to a NIST RTTM
file.

The RTTM format is
<type> <file-id> <channel-id> <begin-time> \
        <duration> <ortho> <stype> <name> <conf>

<type> = SPEAKER for each segment.
<file-id> - the File-ID of the recording
<channel-id> - the Channel-ID, usually 1
<begin-time> - start time of segment
<duration> - duration of segment
<ortho> - <NA> (this is ignored)
<stype> - <NA> (this is ignored)
<name> - speaker name or id
<conf> - <NA> (this is ignored)

"""

from __future__ import print_function
import argparse
import sys

sys.path.insert(0, 'steps')
import libs.common as common_lib


def get_args():
    """Build the command-line parser and return the parsed arguments.

    The result has the attributes ``reco2file_and_channel`` (optional,
    handled by ``common_lib.NullstrToNoneAction``), ``utt2spk``,
    ``segments`` and ``rttm_file``.
    """
    arg_parser = argparse.ArgumentParser(
        description="""This script converts kaldi-style utt2spk and
        segments to a NIST RTTM file""")

    arg_parser.add_argument(
        "--reco2file-and-channel", type=str,
        action=common_lib.NullstrToNoneAction,
        help="""Input reco2file_and_channel.
                        The format is <recording-id> <file-id> <channel-id>.
                        If not provided, then <recording-id> is taken as the
                        <file-id> with <channel-id> = 1.""")

    # The three positional arguments only differ in name and help text.
    positionals = (
        ("utt2spk", "Input utt2spk file"),
        ("segments", "Input segments file"),
        ("rttm_file", "Output RTTM file"),
    )
    for arg_name, help_text in positionals:
        arg_parser.add_argument(arg_name, type=str, help=help_text)

    return arg_parser.parse_args()


def main():
    """Write one RTTM ``SPEAKER`` line for every line of the segments file.

    Each segments line is read as ``<utt> <reco> <start> <end>``.  The
    speaker comes from utt2spk, the duration is ``end - start``.  File id
    and channel default to the recording id and 1, unless
    ``--reco2file-and-channel`` is given, in which case they are looked up
    there.

    Raises:
        RuntimeError: if a recording is missing from the
            reco2file_and_channel mapping.
    """
    args = get_args()
    have_channel_map = args.reco2file_and_channel is not None

    if have_channel_map:
        reco2file_and_channel = {}
        with common_lib.smart_open(args.reco2file_and_channel) as map_reader:
            for entry in map_reader:
                fields = entry.strip().split()
                reco2file_and_channel[fields[0]] = (fields[1], fields[2])

    with common_lib.smart_open(args.utt2spk) as utt2spk_reader:
        split_entries = (entry.strip().split() for entry in utt2spk_reader)
        utt2spk = {fields[0]: fields[1] for fields in split_entries}

    rttm_template = ("SPEAKER {0} {1} {2:7.2f} {3:7.2f} "
                     "<NA> <NA> {4} <NA>")

    with common_lib.smart_open(args.segments) as segments_reader, \
            common_lib.smart_open(args.rttm_file, 'w') as rttm_writer:
        for entry in segments_reader:
            fields = entry.strip().split()

            speaker = utt2spk[fields[0]]

            # Defaults: the recording id doubles as file id, on channel 1.
            recording = fields[1]
            file_id, channel = recording, 1

            if have_channel_map:
                try:
                    file_id, channel = reco2file_and_channel[recording]
                except KeyError:
                    raise RuntimeError(
                        "Could not find recording {0} in {1}".format(
                            recording, args.reco2file_and_channel))

            begin = float(fields[2])
            length = float(fields[3]) - begin

            print(rttm_template.format(file_id, channel, begin, length,
                                       speaker),
                  file=rttm_writer)


# Run the conversion only when this file is executed as a script.
if __name__ == '__main__':
    main()


================================================
FILE: egs/steps/segmentation/copy_targets_dir.sh
================================================
#!/usr/bin/env bash

# Copyright    2017  Nagendra Kumar Goel
#              2014  Johns Hopkins University (author: Nagendra K Goel)
# Apache 2.0

# Copies a targets directory by rewriting its targets.scp, optionally
# putting a prefix and/or a suffix around every utterance id.  The
# frame_subsampling_factor file is carried over when the source has one.

# begin configuration section
utt_prefix=
utt_suffix=
# end configuration section

[ -f ./path.sh ] && . ./path.sh
. ./utils/parse_options.sh

if [ $# -ne 2 ]; then
  echo "Usage: "
  echo "  $0 [options] <srcdir> <destdir>"
  echo "e.g.:"
  echo " $0  --utt-prefix=1- exp/segmentation_1a/train_whole_combined_targets_sub3 exp/segmentation_1a/train_whole_combined_targets_sub3_rev1"
  echo "Options"
  echo "   --utt-prefix=<prefix>     # Prefix for utterance ids, default empty"
  echo "   --utt-suffix=<suffix>     # Suffix for utterance ids, default empty"
  exit 1;
fi

export LC_ALL=C

srcdir=$1
destdir=$2

src_scp=$srcdir/targets.scp
utt_map=$destdir/utt_map

mkdir -p $destdir

[ -f $srcdir/frame_subsampling_factor ] && \
  cp $srcdir/frame_subsampling_factor $destdir

# utt_map lines read "<old-utt-id> <prefix><old-utt-id><suffix>".
cat $src_scp | awk -v p=$utt_prefix -v s=$utt_suffix \
  '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $utt_map

# Rename the first field of every scp entry through the map and re-sort.
cat $src_scp | utils/apply_map.pl -f 1 $utt_map | \
  sort -k1,1 > $destdir/targets.scp

echo "$0: copied targets from $srcdir to $destdir"


================================================
FILE: egs/steps/segmentation/decode_sad.sh
================================================
#!/usr/bin/env bash

# Copyright 2016  Vimal Manohar
# Apache 2.0.

# This script does Viterbi decoding using a matrix of frame log-likelihoods 
# with the columns corresponding to the pdfs.
# It is a wrapper around the binary decode-faster.
# Input:  <graph-dir>/HCLG.fst and <nnet_output_dir>/output.scp
# Output: <decode-dir>/ali.JOB.gz and <decode-dir>/num_jobs

set -e
set -o pipefail

cmd=run.pl
nj=4
acwt=0.1          # passed to decode-faster as --acoustic-scale
beam=8            # passed to decode-faster as --beam
max_active=1000   # passed to decode-faster as --max-active
transform=   # Transformation matrix to apply on the input archives read from output.scp

. ./path.sh

. utils/parse_options.sh

if [ $# -ne 3 ]; then
  echo "Usage: $0 <graph-dir> <nnet_output_dir> <decode-dir>"
  echo " e.g.: $0 "
  exit 1 
fi

graph_dir=$1
nnet_output_dir=$2
dir=$3

mkdir -p $dir/log

echo $nj > $dir/num_jobs

# NOTE(review): extra_files is not set anywhere in this script; unless it
# comes from the environment it expands to nothing here.
for f in $graph_dir/HCLG.fst $nnet_output_dir/output.scp $extra_files; do
  if [ ! -f $f ]; then
    echo "$0: Could not find file $f"
    exit 1
  fi
done

# Input pipeline for job JOB: take that job's part of output.scp and read the
# matrices. "\$[JOB-1]" is escaped so that it is not expanded here.
# NOTE(review): presumably $cmd substitutes JOB and the arithmetic is then
# evaluated by the shell that runs the pipe - confirm.
rspecifier="ark:utils/split_scp.pl -j $nj \$[JOB-1] $nnet_output_dir/output.scp | copy-feats scp:- ark:- |"

# Apply a transformation on the input matrix to combine 
# probs from different columns to pseudo-likelihoods
if [ ! -z "$transform" ]; then
  rspecifier="$rspecifier transform-feats $transform ark:- ark:- |"
fi

# Convert pseudo-likelihoods to pseudo log-likelihood
rspecifier="$rspecifier copy-matrix --apply-log ark:- ark:- |"

# decoder_opts is not initialised in this script; the options are appended to
# whatever value it already has (normally none).
decoder_opts+=(--acoustic-scale=$acwt --beam=$beam --max-active=$max_active)

# The first output (ark:/dev/null) is discarded; the second output is gzipped
# into ali.JOB.gz.
$cmd JOB=1:$nj $dir/log/decode.JOB.log \
  decode-faster ${decoder_opts[@]} \
  $graph_dir/HCLG.fst "$rspecifier" \
  ark:/dev/null "ark:| gzip -c > $dir/ali.JOB.gz"


================================================
FILE: egs/steps/segmentation/detect_speech_activity.sh
================================================
#!/usr/bin/env bash

# Copyright 2016-17  Vimal Manohar
#              2017  Nagendra Kumar Goel
# Apache 2.0.

# This script does nnet3-based speech activity detection given an input 
# kaldi data directory and outputs a segmented kaldi data directory.
# This script can also do music detection and other similar segmentation
# using appropriate options such as --output-name output-music.

set -e 
set -o pipefail
set -u

if [ -f ./path.sh ]; then . ./path.sh; fi

affix=  # Affix for the segmentation
nj=32
cmd=run.pl
stage=-1

# Feature options (Must match training)
mfcc_config=conf/mfcc_hires.conf
feat_affix=   # Affix for the type of feature used

convert_data_dir_to_whole=true    # If true, the input data directory is 
                                  # first converted to whole data directory (i.e. whole recordings)
                                  # and segmentation is done on that.
                                  # If false, then the original segments are 
                                  # retained and they are split into sub-segments.

output_name=output   # The output node in the network
sad_name=sad    # Base name for the directory storing the computed loglikes
                # Can be music for music detection
segmentation_name=segmentation  # Base name for the directory doing segmentation
                                # Can be segmentation_music for music detection

# SAD network config
iter=final  # Model iteration to use

# Contexts must ideally match training for LSTM models, but
# may not necessarily for stats components
extra_left_context=0  # Set to some large value, typically 40 for LSTM (must match training)
extra_right_context=0  
extra_left_context_initial=-1
extra_right_context_final=-1
frames_per_chunk=150

# Decoding options
graph_opts="--min-silence-duration=0.03 --min-speech-duration=0.3 --max-speech-duration=10.0"
acwt=0.3

# These <from>_in_<to>_weight represent the fraction of <from> probability 
# to transfer to <to> class.
# e.g. --speech-in-sil-weight=0.0 --garbage-in-sil-weight=0.0 --sil-in-speech-weight=0.0 --garbage-in-speech-weight=0.3
transform_probs_opts=""

# Postprocessing options
segment_padding=0.2   # Duration (in seconds) of padding added to segments 
min_segment_dur=0   # Minimum duration (in seconds) required for a segment to be included
                    # This is before any padding. Segments shorter than this duration will be removed.
                    # This is an alternative to --min-speech-duration above.
merge_consecutive_max_dur=0   # Merge consecutive segments as long as the merged segment is no longer than this many
                              # seconds. The segments are only merged if their boundaries are touching.
                              # This is after padding by --segment-padding seconds.
                              # 0 means do not merge. Use 'inf' to not limit the duration.

echo $* 

. utils/parse_options.sh

if [ $# -ne 5 ]; then
  echo "This script does nnet3-based speech activity detection given an input kaldi "
  echo "data directory and outputs an output kaldi data directory."
  echo "See script for details of the options to be supplied."
  echo "Usage: $0 <src-data-dir> <sad-nnet-dir> <mfcc-dir> <work-dir> <out-data-dir>"
  echo " e.g.: $0 ~/workspace/egs/ami/s5b/data/sdm1/dev exp/nnet3_sad_snr/nnet_tdnn_j_n4 \\"
  echo "    mfcc_hires exp/segmentation_sad_snr/nnet_tdnn_j_n4 data/ami_sdm1_dev"
  echo ""
  echo "Options: "
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --nj <num-job>                                 # number of parallel jobs to run."
  echo "  --stage <stage>                                # stage to do partial re-run from."
  echo "  --convert-data-dir-to-whole <true|false>    # If true, the input data directory is "
  echo "                                              # first converted to whole data directory (i.e. whole recordings) "
  echo "                                              # and segmentation is done on that."
  echo "                                              # If false, then the original segments are "
  echo "                                              # retained and they are split into sub-segments."
  echo "  --output-name <name>    # The output node in the network"
  echo "  --extra-left-context  <context|0>   # Set to some large value, typically 40 for LSTM (must match training)"
  echo "  --extra-right-context  <context|0>   # For BLSTM or statistics pooling"
  exit 1
fi

src_data_dir=$1   # The input data directory that needs to be segmented.
                  # If convert_data_dir_to_whole is true, any segments in that will be ignored.
sad_nnet_dir=$2   # The SAD neural network
mfcc_dir=$3       # The directory to store the features
dir=$4            # Work directory
data_dir=$5       # The output data directory will be ${data_dir}_seg

affix=${affix:+_$affix}
feat_affix=${feat_affix:+_$feat_affix}

data_id=`basename $data_dir`
sad_dir=${dir}/${sad_name}${affix}_${data_id}_whole${feat_affix}
seg_dir=${dir}/${segmentation_name}${affix}_${data_id}_whole${feat_affix}

if $convert_data_dir_to_whole; then
  test_data_dir=data/${data_id}_whole${feat_affix}_hires
  if [ $stage -le 0 ]; then
    rm -r ${test_data_dir} || true
    utils/data/convert_data_dir_to_whole.sh $src_data_dir ${test_data_dir}
  fi
else
  test_data_dir=data/${data_id}${feat_affix}_hires
  if [ $stage -le 0 ]; then
    rm -r ${test_data_dir} || true
    utils/copy_data_dir.sh $src_data_dir $test_data_dir
  fi
fi

###############################################################################
## Extract input features 
###############################################################################

if [ $stage -le 1 ]; then
  utils/fix_data_dir.sh $test_data_dir
  steps/make_mfcc.sh --mfcc-config $mfcc_config --nj $nj --cmd "$cmd" --write-utt2num-frames true \
    ${test_data_dir} exp/make_hires$feat_affix/${data_id} $mfcc_dir
  steps/compute_cmvn_stats.sh ${test_data_dir} exp/make_hires$feat_affix/${data_id} $mfcc_dir
  utils/fix_data_dir.sh ${test_data_dir}
fi

###############################################################################
## Forward pass through the network and dump the log-likelihoods.
###############################################################################

# Use the frame subsampling factor stored with the network, if any;
# otherwise default to 1.
frame_subsampling_factor=1
if [ -f $sad_nnet_dir/frame_subsampling_factor ]; then
  frame_subsampling_factor=$(cat $sad_nnet_dir/frame_subsampling_factor)
fi

mkdir -p $dir
# Stage 4: place the network in $dir and run the forward pass, writing the
# outputs to $sad_dir.
if [ $stage -le 4 ]; then
  # Copy cmvn_opts only when the network directory and the work directory
  # resolve to different paths.
  if [ "$(readlink -f $sad_nnet_dir)" != "$(readlink -f $dir)" ]; then
    cp $sad_nnet_dir/cmvn_opts $dir || exit 1
  fi

  ########################################################################
  ## Initialize neural network for decoding using the output $output_name
  ########################################################################

  if [ ! -z "$output_name" ] && [ "$output_name" != output ]; then
    # Rename the requested output node to "output" in a copy of the network
    # and use that copy (iter is updated to its name) from here on.
    $cmd $dir/log/get_nnet_${output_name}.log \
      nnet3-copy --edits="rename-node old-name=$output_name new-name=output" \
      $sad_nnet_dir/$iter.raw $dir/${iter}_${output_name}.raw || exit 1
    iter=${iter}_${output_name}
  else 
    # Copy the network into $dir unless an identical copy is already there.
    if ! diff $sad_nnet_dir/$iter.raw $dir/$iter.raw; then
      cp $sad_nnet_dir/$iter.raw $dir/
    fi
  fi

  steps/nnet3/compute_output.sh --nj $nj --cmd "$cmd" \
    --iter ${iter} \
    --extra-left-context $extra_left_context \
    --extra-right-context $extra_right_context \
    --extra-left-context-initial $extra_left_context_initial \
    --extra-right-context-final $extra_right_context_final \
    --frames-per-chunk $frames_per_chunk --apply-exp true \
    --frame-subsampling-factor $frame_subsampling_factor \
    ${test_data_dir} $dir $sad_dir || exit 1
fi

###############################################################################
## Prepare FST we search to make speech/silence decisions.
###############################################################################

# These two commands run regardless of $stage: frame_shift is needed by the
# later stages below.
utils/data/get_utt2dur.sh --nj $nj --cmd "$cmd" $test_data_dir || exit 1
frame_shift=$(utils/data/get_frame_shift.sh $test_data_dir) || exit 1

graph_dir=${dir}/graph_${output_name}
# Stage 5: write the symbol table and compile the decoding graph HCLG.fst.
if [ $stage -le 5 ]; then
  mkdir -p $graph_dir

  # 1 for silence and 2 for speech
  cat <<EOF > $graph_dir/words.txt
<eps> 0
silence 1
speech 2
EOF

  # The frame shift passed to the graph script is the feature frame shift
  # multiplied by the frame subsampling factor. The pipe and redirection are
  # escaped/quoted so that they are passed on to $cmd as arguments.
  $cmd $graph_dir/log/make_graph.log \
    steps/segmentation/internal/prepare_sad_graph.py $graph_opts \
      --frame-shift=$(perl -e "print $frame_shift * $frame_subsampling_factor") - \| \
    fstcompile --isymbols=$graph_dir/words.txt --osymbols=$graph_dir/words.txt '>' \
      $graph_dir/HCLG.fst
fi

###############################################################################
## Do Viterbi decoding to create per-frame alignments.
###############################################################################

# Locate the priors file for this output: prefer post_<output_name>.vec,
# fall back to post_<output_name>.txt, and fail if neither exists.
post_vec=$sad_nnet_dir/post_${output_name}.vec
if [ ! -f $sad_nnet_dir/post_${output_name}.vec ]; then
  if [ ! -f $sad_nnet_dir/post_${output_name}.txt ]; then
    echo "$0: Could not find $sad_nnet_dir/post_${output_name}.vec. "
    echo "Re-run the corresponding stage in the training script possibly "
    echo "with --compute-average-posteriors=true or compute the priors "
    echo "from the training labels"
    exit 1
  else
    post_vec=$sad_nnet_dir/post_${output_name}.txt
  fi
fi

mkdir -p $seg_dir
# Stage 6: write the transform matrix built from the priors, then decode
# the network outputs in $sad_dir with the graph, writing to $seg_dir.
if [ $stage -le 6 ]; then
  steps/segmentation/internal/get_transform_probs_mat.py \
    --priors="$post_vec" $transform_probs_opts > $seg_dir/transform_probs.mat

  steps/segmentation/decode_sad.sh --acwt $acwt --cmd "$cmd" \
    --nj $nj \
    --transform "$seg_dir/transform_probs.mat" \
    $graph_dir $sad_dir $seg_dir
fi

###############################################################################
## Post-process segmentation to create kaldi data directory.
###############################################################################

# Stage 7: post-process the decoding output in $seg_dir; the frame shift
# passed is again scaled by the frame subsampling factor.
if [ $stage -le 7 ]; then
  steps/segmentation/post_process_sad_to_segments.sh \
    --segment-padding $segment_padding --min-segment-dur $min_segment_dur \
    --merge-consecutive-max-dur $merge_consecutive_max_dur \
    --cmd "$cmd" --frame-shift $(perl -e "print $frame_subsampling_factor * $frame_shift") \
    ${test_data_dir} ${seg_dir} ${seg_dir}
fi

# Stage 8: build the output data directory ${data_dir}_seg from the new
# segments file, then copy wav.scp and (when present) the scoring files
# from the source data directory.
if [ $stage -le 8 ]; then
  utils/data/subsegment_data_dir.sh ${test_data_dir} ${seg_dir}/segments \
    ${data_dir}_seg
  cp $src_data_dir/wav.scp ${data_dir}_seg
  # stm, reco2file_and_channel and glm are optional, so a failing cp is tolerated.
  cp $src_data_dir/{stm,reco2file_and_channel,glm} ${data_dir}_seg/ || true
  utils/fix_data_dir.sh ${data_dir}_seg
fi

echo "$0: Created output segmented kaldi data directory in ${data_dir}_seg"
exit 0


================================================
FILE: egs/steps/segmentation/evaluate_segmentation.pl
================================================
#!/usr/bin/env perl

# Copyright 2014  Johns Hopkins University (Author: Sanjeev Khudanpur), Vimal Manohar 
# Apache 2.0

################################################################################
#
# This script was written to check the goodness of automatic segmentation tools
# It assumes input in the form of two Kaldi segments files, i.e. a file each of
# whose lines contain four space-separated values:
#
#    UtteranceID  FileID  StartTime EndTime
#
# It computes # missed frames, # false positives and # overlapping frames.
#
################################################################################

# Exactly two arguments are expected: the reference segments file and the
# hypothesized segments file. Anything else prints the usage text to STDERR
# and stops (note that the exit status in that case is 0).
unless (scalar(@ARGV) == 2) {
    printf STDERR "This program compares the reference segmenation with the proposted segmentation\n";
    printf STDERR "Usage: $0 reference_segments_filename proposed_segments_filename\n";
    printf STDERR "e.g. $0 data/dev10h/segments data/dev10h.seg/segments\n";
    exit (0);
}
($ReferenceSegmentation, $HypothesizedSegmentation) = @ARGV;
printf STDERR ("Comparing reference segmentation\n\t%s\nwith proposed segmentation\n\t%s\n",
               $ReferenceSegmentation,
               $HypothesizedSegmentation);

################################################################################
# First read the reference segmentation, and
# store the start- and end-times of all segments in each file.
################################################################################

# Read the reference segments, sorted by file ID and then numerically by
# start and end time, so that all segments of one file are contiguous and in
# time order. For every file we record the index range of its segments
# (firstSeg..lastSeg), its total speech duration, and its earliest start and
# latest end time.
open (SEGMENTS, "cat $ReferenceSegmentation | sort -k2,2 -k3n,3 -k4n,4 |")
    || die "Unable to open $ReferenceSegmentation";
$numLines = 0;
while ($line=<SEGMENTS>) {
    chomp $line;
    @field = split("[ \t]+", $line);
    unless ($#field == 3) {
        # A malformed line is fatal. Report it before exiting so the failure
        # is not silent (the message used to sit after the exit and was
        # never printed).
        printf STDERR ("Unparseable line in file %s\n\t%s\n",
                       $ReferenceSegmentation, $line);
        exit (1);
    }
    $fileID = $field[1];
    unless (exists $firstSeg{$fileID}) {
        # First segment seen for this file: initialise its accumulators.
        $firstSeg{$fileID} = $numLines;
        $actualSpeech{$fileID} = 0.0;
        $hypothesizedSpeech{$fileID} = 0.0;
        $foundSpeech{$fileID} = 0.0;
        $falseAlarm{$fileID} = 0.0;
        # Seed the minimum with the first start time; seeding it with 0.0
        # meant the minimum could never rise above zero.
        $minStartTime{$fileID} = $field[2];
        $maxEndTime{$fileID} = 0.0;
    }
    $refSegName[$numLines] = $field[0];
    $refSegStart[$numLines] = $field[2];
    $refSegEnd[$numLines] = $field[3];
    $actualSpeech{$fileID} += ($field[3]-$field[2]);
    $minStartTime{$fileID} = $field[2] if ($minStartTime{$fileID}>$field[2]);
    $maxEndTime{$fileID} = $field[3] if ($maxEndTime{$fileID}<$field[3]);
    $lastSeg{$fileID} = $numLines;
    ++$numLines;
}
close(SEGMENTS);
print STDERR "Read $numLines segments from $ReferenceSegmentation\n";

################################################################################
# Process hypothesized segments sequentially, and gather speech/nonspeech stats
################################################################################

# Kaldi segments files are sorted by UtteranceID, but we re-sort them here
# by file ID (and then utterance ID) so that all segments of a file are read
# together. Each hypothesized segment is scored independently against the
# reference segments of its file.
open (SEGMENTS, "cat $HypothesizedSegmentation | sort -k2,2 -k1,1 |")
    || die "Unable to open $HypothesizedSegmentation";
$numLines = 0;
$totalHypSpeech = 0.0;
$totalFoundSpeech = 0.0;
$totalFalseAlarm = 0.0;
$numShortSegs = 0;
$numLongSegs = 0;
while ($line=<SEGMENTS>) {
    chomp $line;
    @field = split("[ \t]+", $line);
    unless ($#field == 3) {
        # A malformed line is fatal. Report it before exiting so the failure
        # is not silent.
        printf STDERR ("Unparseable line in file %s\n\t%s\n",
                       $HypothesizedSegmentation, $line);
        exit (1);
    }
    $fileID = $field[1];
    $segStart = $field[2];
    $segEnd = $field[3];
    if (exists $firstSeg{$fileID}) {
        # This FileID exists in the reference segmentation
        # So gather statistics for this UtteranceID
        $hypothesizedSpeech{$fileID} += ($segEnd-$segStart);
        $totalHypSpeech += ($segEnd-$segStart);
        if (($segStart>=$maxEndTime{$fileID}) || ($segEnd<=$minStartTime{$fileID})) {
            # This entire segment is a false alarm
            $falseAlarm{$fileID} += ($segEnd-$segStart);
            $totalFalseAlarm += ($segEnd-$segStart);
        } else {
            # This segment may overlap one or more reference segments
            $p = $firstSeg{$fileID};
            while ($refSegEnd[$p]<=$segStart) {
                ++$p;
            }
            # The overlap, if any, begins at the reference segment p
            $q = $lastSeg{$fileID};
            while ($refSegStart[$q]>=$segEnd) {
                --$q;
            }
            # The overlap, if any, ends at the reference segment q
            if ($q<$p) {
                # This segment sits entirely in the nonspeech region
                # between the two reference speech segments q and p
                $falseAlarm{$fileID} += ($segEnd-$segStart);
                $totalFalseAlarm += ($segEnd-$segStart);
            } else {
                if (($segEnd-$segStart)<0.20) {
                    # For diagnosing Pascal's VAD segmentation
                    print STDOUT "Found short speech region $line\n";
                    ++$numShortSegs;
                } elsif (($segEnd-$segStart)>60.0) {
                    ++$numLongSegs;
                    # For diagnosing Pascal's VAD segmentation
                    print STDOUT "Found long speech region $line\n";
                }
                # There is some overlap with segments p through q.
                # $segStart is advanced as we walk through them, so it always
                # marks the start of the not-yet-accounted part of the segment.
                for ($s=$p; $s<=$q; ++$s) {
                    if ($segStart<$refSegStart[$s]) {
                        # There is a leading false alarm portion before s
                        $falseAlarm{$fileID} += ($refSegStart[$s]-$segStart);
                        $totalFalseAlarm += ($refSegStart[$s]-$segStart);
                        $segStart=$refSegStart[$s];
                    }
                    $speechPortion = ($refSegEnd[$s]<$segEnd) ?
                        ($refSegEnd[$s]-$segStart) : ($segEnd-$segStart);
                    $foundSpeech{$fileID} += $speechPortion;
                    $totalFoundSpeech += $speechPortion;
                    $segStart=$refSegEnd[$s];
                }
                if ($segEnd>$segStart) {
                    # There is a trailing false alarm portion after q
                    $falseAlarm{$fileID} += ($segEnd-$segStart);
                    $totalFalseAlarm += ($segEnd-$segStart);
                }
            }
        }
    } else {
        # This FileID does not exist in the reference segmentation.
        # This is fatal: report the offending file ID (with a trailing
        # newline) before exiting, so the failure is not silent.
        printf STDERR ("Unexpected fileID in hypothesized segments: %s\n", $fileID);
        exit (1);
    }
    ++$numLines;
}
close(SEGMENTS);
print STDERR "Read $numLines segments from $HypothesizedSegmentation\n";

################################################################################
# Now that all hypothesized segments have been processed, compute needed stats
################################################################################

# Accumulate the totals over all reference files and print one summary line
# per file to STDOUT.
$totalActualSpeech = 0.0;
$totalNonSpeechEst = 0.0; # This is just a crude estimate of total nonspeech.
foreach $fileID (sort keys %actualSpeech) {
    $totalActualSpeech += $actualSpeech{$fileID};
    # Nonspeech is estimated as (latest reference end time - actual speech).
    $totalNonSpeechEst += $maxEndTime{$fileID} - $actualSpeech{$fileID};
    #######################################################################
    # Print file-wise statistics to STDOUT; can pipe to /dev/null if needed
    #######################################################################
    # A literal percent sign in a printf format is written "%%"; the small
    # 0.01 added to each denominator avoids division by zero.
    printf STDOUT ("%s: %.2f min actual speech, %.2f min hypothesized: %.2f min overlap (%d%%), %.2f min false alarm (~%d%%)\n",
                   $fileID,
                   ($actualSpeech{$fileID}/60.0),
                   ($hypothesizedSpeech{$fileID}/60.0),
                   ($foundSpeech{$fileID}/60.0),
                   ($foundSpeech{$fileID}*100/($actualSpeech{$fileID}+0.01)),
                   ($falseAlarm{$fileID}/60.0),
                   ($falseAlarm{$fileID}*100/($maxEndTime{$fileID}-$actualSpeech{$fileID}+0.01)));
}

################################################################################
# Finally, we have everything needed to report the segmentation statistics.
################################################################################

# Print the overall summary to STDERR. A literal percent sign in a printf
# format is written "%%"; the tiny constant added to each denominator avoids
# division by zero.
printf STDERR ("------------------------------------------------------------------------\n");
printf STDERR ("TOTAL: %.2f hrs actual speech, %.2f hrs hypothesized: %.2f hrs overlap (%d%%), %.2f hrs false alarm (~%d%%)\n",
               ($totalActualSpeech/3600.0),
               ($totalHypSpeech/3600.0),
               ($totalFoundSpeech/3600.0),
               ($totalFoundSpeech*100/($totalActualSpeech+0.000001)),
               ($totalFalseAlarm/3600.0),
               ($totalFalseAlarm*100/($totalNonSpeechEst+0.000001)));
printf STDERR ("\t$numShortSegs segments < 0.2 sec and $numLongSegs segments > 60.0 sec\n");
printf STDERR ("------------------------------------------------------------------------\n");


================================================
FILE: egs/steps/segmentation/get_targets_for_out_of_segments.sh
================================================
#!/usr/bin/env bash

# Copyright 2017  Vimal Manohar
# Apache 2.0

# This script prepares targets for whole recordings for training 
# speech activity detection system on the out-of-segment regions. 
# See the script steps/segmentation/lats_to_targets.sh for details about the 
# targets matrix.
# The out-of-segment regions are assigned the target values in the 
# file specified (in kaldi vector text format) by --default-targets option. 
# The in-segment regions are all assigned [ 0 0 0 ], 
# which means they don't contribute to the training. We will later be 
# combining these targets with other targets obtained from 
# supervision-constrained lattices and decoded lattices using the 
# script steps/segmentation/merge_targets.sh.
# By default, the 'default_targets' would be [ 1 0 0 ], which means all
# the out-of-segment regions are assumed to be silence. But depending on
# the application and data, this could be [ 0 0 0 ] or [ 0 0 1 ] or
# something with fractional weights.

# Default option values; parse_options.sh below lets the command line
# override them.
nj=4
cmd=run.pl
default_targets=   # vector of default targets in text format
frame_subsampling_factor=1

# Fail a pipeline if any command in it fails, and treat unset variables as
# errors.
set -o pipefail -u

[ -f ./path.sh ] && . ./path.sh
. utils/parse_options.sh

# Exactly three positional arguments are required; otherwise print usage.
if [ $# -ne 3 ]; then
  cat <<EOF
  This script prepares targets for whole recordings for training 
  speech activity detection system on the out-of-segment regions. 
  See the top of the script for details.
  Usage: steps/segmentation/get_targets_for_out_of_segments.sh <data-dir> <whole-data-dir> <targets-dir>
   e.g.: steps/segmentation/get_targets_for_out_of_segments.sh \
    data/train_split10s data/train_whole \
    exp/segmentation1a/out_of_train_split10s_train_whole_default_targets
EOF
  exit 1
fi

data=$1          # data directory that has a segments file
whole_data=$2    # data directory whose wav.scp lists the whole recordings
dir=$3           # output targets directory

# Both input files must exist before any work is done.
for f in $data/segments $whole_data/wav.scp; do
  if [ ! -f $f ]; then 
    echo "$0: Could not find file $f" 
    exit 1
  fi
done

frame_shift=$(utils/data/get_frame_shift.sh $data) || exit 1

# Split the whole-recording wav.scp into $nj pieces, one per parallel job.
mkdir -p $dir/split${nj}reco
split_scps=
for n in $(seq $nj); do
  split_scps="$split_scps $dir/split${nj}reco/wav.$n.scp"
done
utils/split_scp.pl $whole_data/wav.scp $split_scps

utils/data/get_reco2utt_for_data.sh $data > $dir/reco2utt

# For each job, keep only the reco2utt and segments entries whose recording
# is in that job's wav.JOB.scp (-f 2 selects the second field of the segments
# file for the match).
mkdir -p $dir/split${nj}reco
utils/filter_scps.pl JOB=1:$nj $dir/split${nj}reco/wav.JOB.scp $dir/reco2utt \
  $dir/split${nj}reco/reco2utt.JOB || exit 1
utils/filter_scps.pl -f 2 JOB=1:$nj $dir/split${nj}reco/wav.JOB.scp $data/segments \
    $dir/split${nj}reco/segments.JOB || exit 1

# make $dir an absolute pathname.
dir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $dir ${PWD}`

# The utt2num_frames of the whole-recording data directory is used as the
# per-recording frame count.
utils/data/get_utt2num_frames.sh $whole_data
cp $whole_data/utt2num_frames $dir/reco2num_frames

# Compute the default targets for every recording in parallel, subsample them
# by $frame_subsampling_factor and store them as an archive plus scp per job.
# The frame shift measured from the data directory is passed explicitly, so
# that segment times are converted to frame indices with the same frame shift
# as the features rather than with the Python script's built-in default.
# The pipes are escaped so that they are passed on to $cmd as arguments.
$cmd JOB=1:$nj $dir/log/get_default_targets.JOB.log \
  steps/segmentation/internal/get_default_targets_for_out_of_segments.py \
    --frame-shift=$frame_shift \
    --reco2num-frames=$dir/reco2num_frames \
    --default-targets="$default_targets" \
    $dir/split${nj}reco/reco2utt.JOB $dir/split${nj}reco/segments.JOB - \| \
  subsample-feats --n=$frame_subsampling_factor ark,t:- ark:- \| \
  copy-feats ark:- ark,scp:$dir/targets.JOB.ark,$dir/targets.JOB.scp || exit 1

# Record the subsampling factor in the targets directory only when it is
# not 1.
if [ $frame_subsampling_factor -ne 1 ]; then
  echo $frame_subsampling_factor > $dir/frame_subsampling_factor
fi

# Merge the per-job scp files into a single targets.scp sorted on the key.
for n in $(seq $nj); do
  cat $dir/targets.$n.scp
done | sort -k1,1 > $dir/targets.scp

steps/segmentation/validate_targets_dir.sh $dir $whole_data || exit 1

echo "$0: Got default targets for out-of-segments regions in $whole_data corresponding to segments in $data"

exit 0


================================================
FILE: egs/steps/segmentation/internal/arc_info_to_targets.py
================================================
#!/usr/bin/env python

# Copyright 2017  Vimal Manohar
# Apache 2.0

"""
This script converts arc-info into targets for training
speech activity detection network. The output is a matrix archive
with each matrix having 3 columns -- silence, speech and garbage.
The posterior probabilities of the phones of each of the classes are
summed up to get the target matrix values.
"""

import argparse
import logging
import sys

sys.path.insert(0, 'steps')
import libs.common as common_lib

# Module-level logging: records of level INFO and above go through a
# StreamHandler, formatted with timestamp, source location, function name
# and level.
formatter = logging.Formatter(
    "%(asctime)s [%(pathname)s:%(lineno)s - "
    "%(funcName)s - %(levelname)s ] %(message)s")

handler = logging.StreamHandler()
handler.setFormatter(formatter)
handler.setLevel(logging.INFO)

logger = logging.getLogger(__name__)
logger.addHandler(handler)
logger.setLevel(logging.INFO)


def get_args():
    """Build the command-line parser and return the parsed arguments.

    Options: --silence-phones and --garbage-phones (both required) and
    --max-phone-length (int, default 50). Positionals: arc_info and
    targets_file.
    """
    arg_parser = argparse.ArgumentParser(
        description="""This script converts arc-info into targets for training
        speech activity detection network. The output is a matrix archive
        with each matrix having 3 columns -- silence, speech and garbage.
        The posterior probabilities of the phones of each of the classes are
        summed up to get the target matrix values.
        """)

    # The two phone-list options differ only in the class they describe.
    for flag, phone_class in (("--silence-phones", "silence"),
                              ("--garbage-phones", "garbage class")):
        arg_parser.add_argument(
            flag, type=str, required=True,
            help="File containing a list of phones that will be "
                 "treated as " + phone_class)

    arg_parser.add_argument(
        "--max-phone-length", type=int, default=50,
        help="""Maximum number of frames allowed for a speech
                        phone above which the arc is treated as garbage.""")

    arg_parser.add_argument(
        "arc_info", type=str,
        help="Arc info file (output of lattice-arc-post). "
             "See the help for lattice-arc-post for information "
             "about the format of this input.")
    arg_parser.add_argument(
        "targets_file", type=str,
        help="File to write targets matrix archive in text "
             "format")

    return arg_parser.parse_args()


def run(args):
    """Convert arc-info lines into per-utterance 3-column target matrices.

    Each arc-info line is read as
    ``<utt> <start-frame> <num-frames> <posterior> <phone> ...``. For every
    frame covered by an arc, the posterior is added to column 0 (silence
    phone), column 2 (garbage phone, or any non-silence arc longer than
    ``args.max_phone_length`` frames) or column 1 (everything else).
    Lines of the same utterance are expected to be consecutive; the matrix
    of an utterance is written as soon as the next utterance starts.

    Args:
        args: parsed arguments with ``silence_phones``, ``garbage_phones``,
            ``max_phone_length``, ``arc_info`` and ``targets_file``.

    Raises:
        RuntimeError: if a phone list is empty, a phone is in both lists,
            no utterance was written, or too many utterances had no targets.
    """
    silence_phones = {}
    with common_lib.smart_open(args.silence_phones) as silence_phones_fh:
        for line in silence_phones_fh:
            fields = line.split()
            if fields:  # tolerate blank lines
                silence_phones[fields[0]] = 1

    if len(silence_phones) == 0:
        raise RuntimeError("Could not find any phones in {silence}"
                           "".format(silence=args.silence_phones))

    garbage_phones = {}
    with common_lib.smart_open(args.garbage_phones) as garbage_phones_fh:
        for line in garbage_phones_fh:
            fields = line.split()
            if not fields:  # tolerate blank lines
                continue
            word = fields[0]
            if word in silence_phones:
                raise RuntimeError("Word '{word}' is in both {silence} "
                                   "and {garbage}".format(
                                       word=word,
                                       silence=args.silence_phones,
                                       garbage=args.garbage_phones))
            garbage_phones[word] = 1

    if len(garbage_phones) == 0:
        raise RuntimeError("Could not find any phones in {garbage}"
                           "".format(garbage=args.garbage_phones))

    num_utts = 0
    num_err = 0
    targets = []
    prev_utt = ""

    with common_lib.smart_open(args.arc_info) as arc_info_reader, \
            common_lib.smart_open(args.targets_file, 'w') as targets_writer:
        for line in arc_info_reader:
            try:
                parts = line.strip().split()
                utt = parts[0]

                if utt != prev_utt:
                    # A new utterance starts: flush the previous one.
                    if prev_utt != "":
                        if len(targets) > 0:
                            num_utts += 1
                            common_lib.write_matrix_ascii(
                                targets_writer, targets, key=prev_utt)
                        else:
                            num_err += 1
                    prev_utt = utt
                    targets = []

                start_frame = int(parts[1])
                num_frames = int(parts[2])
                post = float(parts[3])
                phone = parts[4]

                # Grow the matrix with zero rows up to the end of this arc.
                end_frame = start_frame + num_frames
                if end_frame > len(targets):
                    targets.extend([0, 0, 0]
                                   for _ in range(len(targets), end_frame))

                for t in range(start_frame, end_frame):
                    if phone in silence_phones:
                        targets[t][0] += post
                    elif num_frames > args.max_phone_length:
                        targets[t][2] += post
                    elif phone in garbage_phones:
                        targets[t][2] += post
                    else:
                        targets[t][1] += post
            except Exception:
                logger.error("Failed to process line {line} in {f}"
                             "".format(line=line.strip(), f=args.arc_info))
                logger.error("len(targets) = {l}".format(l=len(targets)))
                raise

        # Flush the last utterance while the output is still open, and write
        # it through the same open writer as all the others. Passing the file
        # name here would bypass that writer (and presumably reopen the
        # output file -- the helper is not visible here).
        if prev_utt != "":
            if len(targets) > 0:
                num_utts += 1
                common_lib.write_matrix_ascii(targets_writer, targets,
                                              key=prev_utt)
            else:
                num_err += 1

    logger.info("Wrote {num_utts} targets; failed with {num_err}"
                "".format(num_utts=num_utts, num_err=num_err))
    # Only complain about the failure ratio when something actually failed;
    # otherwise a single successfully written utterance (1 // 2 == 0) would
    # be reported as an error.
    if num_utts == 0 or (num_err > 0 and num_err >= num_utts // 2):
        raise RuntimeError(
            "Too few targets written: wrote {num_utts}, failed with "
            "{num_err}".format(num_utts=num_utts, num_err=num_err))


def main():
    """Parse the command line and run the conversion.

    Any exception raised while running propagates to the caller unchanged.
    """
    run(get_args())


if __name__ == "__main__":
    main()


================================================
FILE: egs/steps/segmentation/internal/find_oov_phone.py
================================================
#!/usr/bin/env python

# Copyright 2017  Vimal Manohar
# Apache 2.0

"""This script finds the OOV phone by reading the OOV word from
oov.int in the input <lang> directory and the lexicon
<lang>/phones/align_lexicon.int.
It prints the OOV phone to stdout, if it can find a single phone
mapping for the OOV word."""
from __future__ import print_function

import sys


def main():
    """Print the single phone that the OOV word maps to and exit.

    Reads the OOV word id from ``<lang>/oov.int`` and scans
    ``<lang>/phones/align_lexicon.int`` for an entry of that word with
    exactly one phone. Each lexicon line is read as
    ``<word-id> <word-id> <phone-id> [<phone-id> ...]``.

    Raises:
        SystemExit: with code 0, right after printing the phone id.
        RuntimeError: on wrong usage, an unparseable lexicon line, or when
            no single-phone entry for the OOV word is found.
    """
    if len(sys.argv) != 2:
        raise RuntimeError("Usage: {0} <lang>".format(sys.argv[0]))

    lang = sys.argv[1]

    # Both paths must be formatted with the <lang> directory; without the
    # format() call the literal "{0}/..." would be opened.
    oov_path = "{0}/oov.int".format(lang)
    lexicon_path = "{0}/phones/align_lexicon.int".format(lang)

    with open(oov_path) as oov_fh:
        oov_int = int(oov_fh.readline())
    assert oov_int > 0

    oov_mapped_to_multiple_phones = False
    with open(lexicon_path) as lexicon_fh:
        for line in lexicon_fh:
            parts = line.strip().split()

            if len(parts) < 3:
                raise RuntimeError("Could not parse line {0} in "
                                   "{1}/phones/align_lexicon.int"
                                   "".format(line, lang))

            w = int(parts[0])
            if w != oov_int:
                continue

            if len(parts[2:]) > 1:
                # Try to find a single phone mapping for OOV
                oov_mapped_to_multiple_phones = True
                continue

            p = int(parts[2])
            print("{0}".format(p))

            raise SystemExit(0)

    if oov_mapped_to_multiple_phones:
        raise RuntimeError("OOV word found, but is mapped to multiples phones. "
                           "This is an unusual case.")

    raise RuntimeError("Could not find OOV word in "
                       "{0}/phones/align_lexicon.int".format(lang))


# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()


================================================
FILE: egs/steps/segmentation/internal/get_default_targets_for_out_of_segments.py
================================================
#!/usr/bin/env python

# Copyright 2017  Vimal Manohar
# Apache 2.0

"""
This script gets targets for the whole recording
by adding 'default_targets' vector read from file specified by
--default-targets option for the out-of-segments regions and
zeros for all other frames. See steps/segmentation/lats_to_targets.sh
for details about the targets matrix.
By default, the 'default_targets' would be [ 1 0 0 ], which means all
the out-of-segment regions are assumed as silence. But depending, on
the application and data, this could be [ 0 0 0 ] or [ 0 0 1 ] or
something with fractional weights.
"""
from __future__ import division

import argparse
import logging
import numpy as np
import subprocess
import sys

sys.path.insert(0, 'steps')
import libs.common as common_lib

# Module-level logging: records of level INFO and above go through a
# StreamHandler, formatted with timestamp, source location, function name
# and level.
formatter = logging.Formatter(
    "%(asctime)s [%(pathname)s:%(lineno)s - "
    "%(funcName)s - %(levelname)s ] %(message)s")

handler = logging.StreamHandler()
handler.setFormatter(formatter)
handler.setLevel(logging.INFO)

logger = logging.getLogger(__name__)
logger.addHandler(handler)
logger.setLevel(logging.INFO)


def get_args():
    """Build the command-line parser, validate and return the arguments.

    Raises ValueError when --frame-shift is below 0.0001 or above 1.
    With --verbose 2 the module logger and handler are switched to DEBUG.
    """
    arg_parser = argparse.ArgumentParser(
        description="""This script gets targets for the whole recording
        by adding 'default_targets' vector read from file specified by
        --default-targets option for the out-of-segments regions and
        zeros for all other frames. See steps/segmentation/lats_to_targets.sh
        for details about the targets matrix.
        By default, the 'default_targets' would be [ 1 0 0 ], which means all
        the out-of-segment regions are assumed as silence. But depending, on
        the application and data, this could be [ 0 0 0 ] or [ 0 0 1 ] or
        something with fractional weights.
        """)

    # Optional arguments.
    arg_parser.add_argument(
        "--frame-shift", type=float, default=0.01,
        help="Frame shift value in seconds")
    arg_parser.add_argument(
        "--default-targets", type=str, default=None,
        action=common_lib.NullstrToNoneAction,
        help="Vector of default targets for out-of-segments "
             "region")
    arg_parser.add_argument(
        "--length-tolerance", type=int, default=2,
        help="Tolerate length mismatches of this many frames")
    arg_parser.add_argument(
        "--verbose", type=int, default=0, choices=[0,1,2],
        help="Verbose level")

    # Required option and positional arguments.
    arg_parser.add_argument(
        "--reco2num-frames", type=str, required=True,
        action=common_lib.NullstrToNoneAction,
        help="""The number of frames per reco
                        is used to determine the num-rows of the output matrix
                        """)
    arg_parser.add_argument(
        "reco2utt", type=str,
        help="""reco2utt file.
                        The format is <reco> <utt-1> <utt-2> ... <utt-N>""")
    arg_parser.add_argument(
        "segments", type=str,
        help="Input kaldi segments file")
    arg_parser.add_argument(
        "out_targets_ark", type=str,
        help="""Output archive to which the
                        recording-level matrix will be written in text
                        format""")

    parsed = arg_parser.parse_args()

    if parsed.frame_shift > 1 or parsed.frame_shift < 0.0001:
        raise ValueError("--frame-shift should be in [0.0001, 1]; got {0}"
                         "".format(parsed.frame_shift))

    if parsed.verbose >= 2:
        for sink in (logger, handler):
            sink.setLevel(logging.DEBUG)

    return parsed


def run(args):
    """Write one targets matrix per recording to ``args.out_targets_ark``.

    Every recording listed in the reco2utt file gets a matrix with one row
    per frame (row count taken from the reco2num-frames file). All rows start
    as the default-targets vector; the rows covered by a segment of the
    recording are then set to zero.

    Args:
        args: parsed arguments with ``reco2utt``, ``reco2num_frames``,
            ``segments``, ``default_targets``, ``frame_shift`` and
            ``out_targets_ark``.

    Raises:
        ValueError: if an input line cannot be parsed.
        RuntimeError: if nothing was written or too many utterances could
            not be accounted for.
    """
    reco2utt = {}
    with common_lib.smart_open(args.reco2utt) as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) < 2:
                raise ValueError("Could not parse line {0}".format(line))
            reco2utt[parts[0]] = parts[1:]

    reco2num_frames = {}
    with common_lib.smart_open(args.reco2num_frames) as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) != 2:
                raise ValueError("Could not parse line {0}".format(line))
            if parts[0] not in reco2utt:
                continue
            reco2num_frames[parts[0]] = int(parts[1])

    # utt -> [reco, start_time, end_time], only for the recordings we process.
    segments = {}
    with common_lib.smart_open(args.segments) as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) not in [4, 5]:
                raise ValueError("Could not parse line {0}".format(line))
            utt = parts[0]
            reco = parts[1]
            if reco not in reco2utt:
                continue
            start_time = float(parts[2])
            end_time = float(parts[3])
            segments[utt] = [reco, start_time, end_time]

    num_utt_err = 0
    num_utt = 0
    num_reco = 0

    if args.default_targets is not None:
        default_targets = np.matrix(common_lib.read_matrix_ascii(args.default_targets))
    else:
        default_targets = np.matrix([[1, 0, 0]])
    assert (np.shape(default_targets)[0] == 1
            and np.shape(default_targets)[1] == 3)

    with common_lib.smart_open(args.out_targets_ark, 'w') as f:
        for reco, utts in reco2utt.items():
            reco_num_frames = reco2num_frames[reco]
            reco_mat = np.repeat(default_targets, reco_num_frames, axis=0)

            # Utterances without a segments entry are counted as errors.
            # They must be removed *before* sorting, because the sort key
            # looks each utterance up in 'segments'.
            known_utts = [utt for utt in utts if utt in segments]
            num_utt_err += len(utts) - len(known_utts)
            known_utts.sort(key=lambda x: segments[x][1])  # sort on start time

            for utt in known_utts:
                segment = segments[utt]

                start_frame = int(segment[1] / args.frame_shift)
                # Clip the segment end to the length of the recording.
                end_frame = min(int(segment[2] / args.frame_shift),
                                reco_num_frames)

                if start_frame > end_frame:
                    # The segment starts beyond the end of the recording (or
                    # ends before it starts): there are no rows to zero.
                    num_utt_err += 1
                    continue

                # In-segment frames do not contribute: zero their targets.
                reco_mat[start_frame:end_frame] = 0
                num_utt += 1

            if reco_mat.shape[0] > 0:
                common_lib.write_matrix_ascii(f, reco_mat.tolist(),
                                              key=reco)
                num_reco += 1

    logger.info("Got default out-of-segment targets for {num_reco} recordings "
                "containing {num_utt} in-segment regions; "
                "failed to account {num_utt_err} utterances"
                "".format(num_reco=num_reco, num_utt=num_utt,
                          num_utt_err=num_utt_err))

    if num_utt == 0 or num_utt_err > num_utt // 2 or num_reco == 0:
        raise RuntimeError(
            "Too few targets: {num_reco} recordings, {num_utt} in-segment "
            "regions, {num_utt_err} utterances failed"
            "".format(num_reco=num_reco, num_utt=num_utt,
                      num_utt_err=num_utt_err))


def main():
    """Parse the command line and compute the targets.

    Any exception raised while running propagates to the caller unchanged.
    """
    run(get_args())


if __name__ == "__main__":
    main()


================================================
FILE: egs/steps/segmentation/internal/get_transform_probs_mat.py
================================================
#!/usr/bin/env python

# Copyright 2017  Vimal Manohar
# Apache 2.0

import argparse
import sys
sys.path.insert(0, 'steps')

import libs.common as common_lib

def get_args():
    """Define the command-line options and return the parsed arguments."""
    arg_parser = argparse.ArgumentParser(
        description="""This script writes to stdout a transformation matrix
    to convert a 3x1 probability vector to a
    2x1 pseudo-likelihood vector by first dividing by 3x1 priors vector.""")

    arg_parser.add_argument(
        "--priors", type=str, default=None,
        action=common_lib.NullstrToNoneAction,
        help="Priors vector used to remove the priors from the neural "
             "network output posteriors to convert them to likelihoods")

    # The four cross-class mixing weights share type and default; only the
    # option name and help text differ.  Order is kept as in --help output.
    mixing_options = (
        ("--sil-in-speech-weight",
         "The fraction of silence probability to add to speech"),
        ("--speech-in-sil-weight",
         "The fraction of speech probability to add to silence"),
        ("--garbage-in-speech-weight",
         "The fraction of garbage probability to add to speech"),
        ("--garbage-in-sil-weight",
         "The fraction of garbage probability to add to silence"),
    )
    for flag, help_text in mixing_options:
        arg_parser.add_argument(flag, type=float, default=0.0,
                                help=help_text)

    arg_parser.add_argument("--sil-scale", type=float,
                        default=1.0, help="""Scale on the silence probability
                        (make this more than one to encourage
                        decoding silence).""")

    return arg_parser.parse_args()


def run(args):
    """Write the 2x3 posterior-to-pseudo-likelihood transform to stdout.

    Row 0 of the matrix produces the silence score and row 1 the speech
    score; each column is divided by the (normalized) prior of the
    corresponding input class (silence, speech, garbage).

    Args:
        args: parsed arguments as returned by get_args().

    Raises:
        RuntimeError: if --priors is given but the matrix read from it is
            empty or its first row does not have exactly 3 entries.
    """
    priors = [[1.0, 1.0, 1.0]]
    if args.priors is not None:
        priors = common_lib.read_matrix_ascii(args.priors)
        # An empty matrix must be rejected here too; otherwise priors[0]
        # below fails with an unhelpful IndexError.
        if len(priors) == 0 or len(priors[0]) != 3:
            raise RuntimeError("Invalid dimension for priors {0}"
                               "".format(priors))

    # Normalize the priors so that they sum to one.
    priors_sum = sum(priors[0])
    sil_prior = priors[0][0] / priors_sum
    speech_prior = priors[0][1] / priors_sum
    garbage_prior = priors[0][2] / priors_sum

    transform_mat = [[args.sil_scale / sil_prior,
                      args.speech_in_sil_weight / speech_prior,
                      args.garbage_in_sil_weight / garbage_prior],
                     [args.sil_in_speech_weight / sil_prior,
                      1.0 / speech_prior,
                      args.garbage_in_speech_weight / garbage_prior]]

    common_lib.write_matrix_ascii(sys.stdout, transform_mat)


def main():
    """Script entry point: parse the command line and write the matrix."""
    run(get_args())


if __name__ == "__main__":
    main()


================================================
FILE: egs/steps/segmentation/internal/merge_segment_targets_to_recording.py
================================================
#!/usr/bin/env python

# Copyright 2017  Vimal Manohar
# Apache 2.0

"""
This script merges targets matrices corresponding to
segments into targets matrix for whole recording. The frames that are not
in any of the segments are assigned the default targets vector, specified by
the option --default-targets or [ 0 0 0 ] if unspecified.
"""
from __future__ import division

import argparse
import logging
import numpy as np
import subprocess
import sys

sys.path.insert(0, 'steps')
import libs.common as common_lib

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - "
                              "%(funcName)s - %(levelname)s ] %(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)


def get_args():
    """Parse and validate the command-line arguments.

    Raises ValueError when --frame-shift lies outside [0.0001, 1].  With
    --verbose >= 2 the module logger and its handler are switched to DEBUG.
    """
    cli = argparse.ArgumentParser(
        description="""This script merges targets matrices corresponding to
        segments into targets matrix for whole recording.""")

    # Optional arguments.
    cli.add_argument("--frame-shift", type=float, default=0.01,
                     help="Frame shift value in seconds")
    cli.add_argument("--default-targets", type=str, default=None,
                     action=common_lib.NullstrToNoneAction,
                     help="Vector of default targets for out-of-segments "
                          "region")
    cli.add_argument("--length-tolerance", type=int, default=4,
                     help="Tolerate length mismatches of this many frames")
    cli.add_argument("--verbose", type=int, default=0, choices=[0, 1, 2],
                     help="Verbose level")

    cli.add_argument("--reco2num-frames", type=str, required=True,
                     action=common_lib.NullstrToNoneAction,
                     help="""The number of frames per reco
                        is used to determine the num-rows of the output matrix
                        """)

    # Positional arguments.
    cli.add_argument("reco2utt", type=str,
                     help="""reco2utt file.
                        The format is <reco> <utt-1> <utt-2> ... <utt-N>""")
    cli.add_argument("segments", type=str,
                     help="Input kaldi segments file")
    cli.add_argument("targets_scp", type=str,
                     help="""SCP of input targets matrices.
                        The matrices are indexed by the utterance-id.""")
    cli.add_argument("out_targets_ark", type=str,
                     help="""Output archive to which the
                        recording-level matrix will be written in text
                        format""")

    parsed = cli.parse_args()

    shift = parsed.frame_shift
    if shift < 0.0001 or shift > 1:
        raise ValueError("--frame-shift should be in [0.0001, 1]; got {0}"
                         "".format(shift))

    if parsed.verbose >= 2:
        logger.setLevel(logging.DEBUG)
        handler.setLevel(logging.DEBUG)

    return parsed


def read_reco2utt_file(reco2utt_file):
    """Return a dict mapping each recording-id to its list of utterance-ids.

    Every line must hold a recording-id followed by at least one
    utterance-id; otherwise ValueError is raised.
    """
    mapping = {}
    with common_lib.smart_open(reco2utt_file) as reader:
        for line in reader:
            fields = line.strip().split()
            if len(fields) < 2:
                raise ValueError("Could not parse line {0} in reco2utt "
                                 "file {1}".format(line, reco2utt_file))
            reco_id, utt_ids = fields[0], fields[1:]
            mapping[reco_id] = utt_ids
    return mapping


def read_reco2num_frames_file(reco2num_frames_file):
    """Return a dict mapping each recording-id to its integer frame count.

    Every line must consist of exactly two fields, <reco> <num-frames>;
    otherwise ValueError is raised.
    """
    frame_counts = {}
    with common_lib.smart_open(reco2num_frames_file) as reader:
        for line in reader:
            fields = line.strip().split()
            if len(fields) != 2:
                raise ValueError("Could not parse line {0} in "
                                 "reco2num-frames file {1}".format(
                                     line, reco2num_frames_file))
            reco_id, count = fields
            frame_counts[reco_id] = int(count)
    return frame_counts


def read_segments_file(segments_file, reco2utt):
    """Read a segments file, keeping only recordings present in reco2utt.

    Each line must have 4 or 5 whitespace-separated fields, of which the
    first four are used: <utt> <reco> <start-time> <end-time>.

    Args:
        segments_file: path (or anything accepted by common_lib.smart_open)
            of the segments file.
        reco2utt: container of recording-ids; segments whose recording is
            not in it are skipped.

    Returns:
        dict mapping utterance-id to [reco, start_time, end_time], with the
        times converted to float.

    Raises:
        ValueError: if a line does not have 4 or 5 fields.
    """
    segments = {}
    with common_lib.smart_open(segments_file) as fh:
        for line in fh:
            parts = line.strip().split()
            if len(parts) not in [4, 5]:
                # Report the file name, not the partially built dictionary.
                raise ValueError("Could not parse line {0} in "
                                 "segments file {1}".format(line,
                                                            segments_file))
            utt = parts[0]
            reco = parts[1]
            if reco not in reco2utt:
                continue
            start_time = float(parts[2])
            end_time = float(parts[3])
            segments[utt] = [reco, start_time, end_time]
    return segments


def read_targets_scp(targets_scp, segments):
    """Read the targets SCP file, keeping only utterances in segments.

    Each line must have exactly two whitespace-separated fields:
    <utt> <location-of-targets-matrix>.

    Args:
        targets_scp: path (or anything accepted by common_lib.smart_open)
            of the SCP file.
        segments: container of utterance-ids; entries for other utterances
            are skipped.

    Returns:
        dict mapping utterance-id to the second field of its SCP line.

    Raises:
        ValueError: if a line does not have exactly two fields.
    """
    targets = {}
    with common_lib.smart_open(targets_scp) as fh:
        for line in fh:
            parts = line.strip().split()
            if len(parts) != 2:
                # Include the file name in the message; the placeholder for
                # it was previously missing from the format string.
                raise ValueError("Could not parse line {0} in "
                                 "targets scp file {1}".format(line,
                                                               targets_scp))
            utt = parts[0]
            if utt not in segments:
                continue
            targets[utt] = parts[1]
    return targets


def run(args):
    """Merge per-segment targets into one targets matrix per recording.

    For every recording in the reco2utt file, a matrix with
    reco2num_frames[reco] rows is initialized with the default targets row.
    The targets of each utterance are read by running
    "copy-feats --binary=false" and copied into the rows covered by the
    utterance's segment.  Where a segment overlaps the previously merged
    one, the overlapping rows are linearly interpolated between the two.
    The resulting matrices are written in text format to
    args.out_targets_ark.

    Args:
        args: parsed arguments as returned by get_args().

    Raises:
        ValueError: if a segment spans zero or fewer frames.
        RuntimeError: if a copy-feats command exits with non-zero status,
            if nothing was merged or written, or if the number of failed
            utterances exceeds half the number of merged ones.
    """
    reco2utt = read_reco2utt_file(args.reco2utt)
    reco2num_frames = read_reco2num_frames_file(args.reco2num_frames)
    segments = read_segments_file(args.segments, reco2utt)
    targets = read_targets_scp(args.targets_scp, segments)

    if args.default_targets is not None:
        # Read the vector of default targets for out-of-segment regions
        default_targets = np.matrix(
            common_lib.read_matrix_ascii(args.default_targets))
    else:
        default_targets = np.zeros([1, 3])
    assert (np.shape(default_targets)[0] == 1
            and np.shape(default_targets)[1] == 3)

    num_utt_err = 0
    num_utt = 0
    num_reco = 0

    with common_lib.smart_open(args.out_targets_ark, 'w') as fh:
        for reco, utts in reco2utt.items():
            # Read a recording and the list of its utterances from the
            # reco2utt dictionary
            reco_mat = np.repeat(default_targets, reco2num_frames[reco],
                                 axis=0)

            # Sort on start time.  Utterances that are missing from the
            # segments file are placed last instead of raising KeyError
            # here; the loop below counts them as errors.
            utts.sort(key=lambda x: (segments[x][1] if x in segments
                                     else float("inf")))

            # End frame of the most recently merged segment.
            end_frame_accounted = 0

            for utt in utts:
                if utt not in segments or utt not in targets:
                    num_utt_err += 1
                    continue
                segment = segments[utt]

                # Read the targets corresponding to the segments
                cmd = ("copy-feats --binary=false {mat_fn} -"
                       "".format(mat_fn=targets[utt]))
                p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE)

                try:
                    mat = np.matrix(common_lib.read_matrix_ascii(p.stdout),
                                    dtype='float32')
                except Exception:
                    logger.error("Command '{cmd}' failed".format(cmd=cmd))
                    raise
                finally:
                    # Always reap the child process and check its status.
                    _, stderr = p.communicate()
                    if p.returncode is not None and p.returncode != 0:
                        raise RuntimeError(
                            'Command "{cmd}" failed with status {status}; '
                            'stderr = {stderr}'.format(cmd=cmd, status=-p.returncode,
                                                       stderr=stderr))

                # Round the segment boundaries to the nearest frame.
                start_frame = int(segment[1] / args.frame_shift + 0.5)
                end_frame = int(segment[2] / args.frame_shift + 0.5)
                num_frames = end_frame - start_frame

                if num_frames <= 0:
                    raise ValueError("Invalid line in segments file {0}"
                                     "".format(segment))

                if abs(mat.shape[0] - num_frames) > args.length_tolerance:
                    logger.warning("For utterance {utt}, mismatch in segment "
                                   "length and targets matrix size; "
                                   "{s_len} vs {t_len}".format(
                                       utt=utt, s_len=num_frames,
                                       t_len=mat.shape[0]))
                    num_utt_err += 1
                    continue

                # Fix end_frame and num_frames if the segment goes beyond
                # the length of the recording.
                if end_frame > reco2num_frames[reco]:
                    end_frame = reco2num_frames[reco]
                    num_frames = end_frame - start_frame

                # Fix "num_frames" and "end_frame" if "num_frames" is lower
                # than the size of the targets matrix "mat"
                num_frames = min(num_frames, mat.shape[0])
                end_frame = start_frame + num_frames

                if num_frames <= 0:
                    logger.warning("For utterance {utt}, start-frame {start} "
                                   "is outside the recording"
                                   "".format(utt=utt, start=start_frame))
                    num_utt_err += 1
                    continue

                if end_frame < end_frame_accounted:
                    logger.warning("For utterance {utt}, end-frame {end} "
                                   "is before the end of a previous segment. "
                                   "i.e. this segment is completely within "
                                   "another segment. Ignoring this segment."
                                   "".format(utt=utt, end=end_frame))
                    num_utt_err += 1
                    continue

                if start_frame < end_frame_accounted:
                    # Segment overlaps with a previous utterance
                    # Combine targets using a weighted interpolation using a
                    # triangular window with a weight of 1 at the start/end of
                    # overlap and 0 at the end/start of the segment
                    overlap = end_frame_accounted - start_frame
                    for n in range(0, overlap):
                        w = float(n) / float(overlap)
                        reco_mat[n + start_frame, :] = (
                            reco_mat[n + start_frame, :] * (1.0 - w)
                            + mat[n, :] * w)

                    # Rows after the overlap come from this segment only.
                    if end_frame > end_frame_accounted:
                        reco_mat[end_frame_accounted:end_frame, :] = (
                            mat[(end_frame_accounted-start_frame):
                                (end_frame-start_frame), :])
                else:
                    # No overlap with the previous utterances.
                    # So just add it to the output.
                    reco_mat[start_frame:end_frame, :] = (
                        mat[0:num_frames, :])
                logger.debug("reco_mat shape = %s, mat shape = %s, "
                             "start_frame = %d, end_frame = %d", reco_mat.shape,
                             mat.shape, start_frame, end_frame)

                end_frame_accounted = end_frame
                num_utt += 1

            if reco_mat.shape[0] > 0:
                # NOTE(review): reco_mat is passed as a numpy object here;
                # confirm that write_matrix_ascii handles it as intended
                # when --default-targets makes it an np.matrix.
                common_lib.write_matrix_ascii(fh, reco_mat,
                                              key=reco)
                num_reco += 1

    logger.info("Merged {num_utt} segment targets from {num_reco} recordings; "
                "failed with {num_utt_err} utterances"
                "".format(num_utt=num_utt, num_reco=num_reco,
                          num_utt_err=num_utt_err))

    if num_utt == 0 or num_utt_err > num_utt // 2 or num_reco == 0:
        raise RuntimeError(
            "Too many failures: merged {num_utt} utterances from {num_reco} "
            "recordings; failed with {num_utt_err} utterances"
            "".format(num_utt=num_utt, num_reco=num_reco,
                      num_utt_err=num_utt_err))


def main():
    """Script entry point: parse the command line and merge the targets.

    Any exception raised while running propagates unchanged to the caller.
    """
    run(get_args())


if __name__ == "__main__":
    main()


================================================
FILE: egs/steps/segmentation/internal/merge_targets.py
================================================
#!/usr/bin/env python3

# Copyright 2017  Vimal Manohar
# Apache 2.0

"""
This script merges targets created from multiple sources (systems) into
single targets matrices.

Usage: merge_targets.py [options] <pasted-targets> <out-targets>
 e.g.: paste-feats scp:targets1.scp scp:targets2.scp ark,t:- | merge_targets.py --dim=3 - - | copy-feats ark,t:- ark:-

<pasted-targets> is matrix archive with matrices corresponding to
targets from multiple sources appended together using paste-feats.
The column dimension is num-sources * dim, where dim is specified by the
--dim option.
"""

import argparse
import logging
import numpy as np
import sys

sys.path.insert(0, 'steps')
import libs.common as common_lib

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - "
                              "%(funcName)s - %(levelname)s ] %(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)


def get_args():
    """Parse the command line and post-process --weights.

    After parsing, args.weights is either None (option not given) or a list
    of floats scaled so that they sum to one.
    """
    cli = argparse.ArgumentParser(
        description="""
    This script merges targets created from multiple sources (systems) into
    single targets matrices.
    Usage: merge_targets.py [options] <pasted-targets> <out-targets>
     e.g.: paste-feats scp:targets1.scp scp:targets2.scp ark,t:- | merge_targets.py --dim=3 - - | copy-feats ark,t:- ark:-
    """,
        formatter_class=argparse.RawTextHelpFormatter)

    cli.add_argument("--weights", type=str, default="",
                     help="A comma-separated list of weights corresponding "
                     "to each targets source being combined. "
                     "Weights will be normalized internally to sum-to-one.")
    cli.add_argument("--dim", type=int, default=3,
                     help="Number of columns corresponding to each "
                     "target matrix")
    cli.add_argument("--remove-mismatch-frames", type=str, default=False,
                     choices=["true", "false"],
                     action=common_lib.StrToBoolAction,
                     help="If true, the mismatch frames are removed by "
                     "setting targets to 0 in the following cases:\n"
                     "a) If none of the sources have a column with value "
                     "> 0.5\n"
                     "b) If two sources have columns with value > 0.5, but "
                     "they occur at different indexes e.g. silence prob is "
                     "> 0.5 for the targets from alignment, and speech prob "
                     "> 0.5 for the targets from decoding.")

    cli.add_argument("pasted_targets", type=str,
                     help="Input target matrices with columns appended "
                     "together using paste-feats. Its column dimension is "
                     "num-sources * dim, which dim is specified by --dim "
                     "option.")
    cli.add_argument("out_targets", type=str,
                     help="Output target matrices")

    parsed = cli.parse_args()

    if parsed.weights == "":
        parsed.weights = None
    else:
        # Convert the comma-separated string to floats and normalize them.
        raw_weights = [float(w) for w in parsed.weights.split(",")]
        total = sum(raw_weights)
        parsed.weights = [w / total for w in raw_weights]

    return parsed


def should_remove_frame(row, dim):
    """Returns True if the frame needs to be removed.

    Input:
        row -- a list of values (of length num-sources * dim) corresponding
               to the targets for one of the frames
        dim -- Usually 3. The number of sources can be computed as the
               len(row) / dim.

    The frame is determined to be removed in the following cases:
        1) None of the values > 0.5.
        2) More than one source has best value > 0.5, but at different
           indexes in the source.
    e.g. [ 1 0 0 0.6 0 0.4 0 0 0 ]   # kept because 1 and 0.6 are both > 0.5
                                     # at the same class namely 0
                                     # source[0] = [ 1 0 0 ]
                                     # source[1] = [ 0.6 0 0.4 ]
                                     # source[2] = [ 0 0 0 ]
    e.g. [ 0 0 0 0.4 0 0.6 1 0 0 ]   # removed because source[1] has best value
                                     # 0.6 > 0.5 at class 2 and source[2] has
                                     # best value 1 > 0.5 at class 0.
                                     # source[0] = [ 0 0 0 ]
                                     # source[1] = [ 0.4 0 0.6 ]
                                     # source[2] = [ 1 0 0 ]
    """
    assert len(row) % dim == 0
    num_sources = len(row) // dim

    max_idx = np.argmax(row)
    max_val = row[max_idx]

    if max_val <= 0.5:
        # None of the values > 0.5. So we are not confident of any sources.
        # Remove frame.
        # A maximum of exactly 0.5 is included here: no source counts as
        # confident (that requires > 0.5), so the assertion on the best
        # source below would otherwise fail.
        return True

    best_source = max_idx // dim
    best_class = max_idx % dim

    confident_in_source = []  # List of length num_sources
                              # Element 'i' is True,
                              # if the best value for the source 'i' is > 0.5
    best_values_for_source = []  # Element 'i' is a pair (value, class),
                                 # where 'class' is argmax over the scores
                                 # corresponding to the source 'i' and
                                 # 'value' is the corresponding score.
    for source_idx in range(num_sources):
        idx = np.argmax(row[(source_idx * dim):
                            ((source_idx+1) * dim)])
        val = row[source_idx * dim + idx]
        confident_in_source.append(bool(val > 0.5))
        best_values_for_source.append((val, idx))

    if sum(confident_in_source) == 1:
        # We are confident in only one source. Keep frame.
        return False

    for source_idx in range(num_sources):
        if source_idx == best_source:
            assert confident_in_source[source_idx]
            continue
        if not confident_in_source[source_idx]:
            continue
        # We are confident in a source other than the 'best_source'.
        # If its index is different from the 'best_class', then it is
        # a mismatch and the frame must be removed.
        val, idx = best_values_for_source[source_idx]
        assert val > 0.5
        if idx != best_class:
            return True
    return False


def run(args):
    """Merge the pasted targets of each utterance into one targets matrix.

    Each input matrix holds the targets of several sources side by side
    (num-sources * args.dim columns).  The output matrix for an utterance
    is the sum over sources of their column blocks, each multiplied by the
    corresponding entry of args.weights (or by 1.0 when args.weights is
    None).  With args.remove_mismatch_frames, rows for which
    should_remove_frame() returns True are left as all zeros.

    Args:
        args: parsed arguments as returned by get_args().

    Raises:
        RuntimeError: if the number of columns of an input matrix is not a
            multiple of args.dim, or if no matrix was processed.
    """
    num_done = 0

    with common_lib.smart_open(args.pasted_targets) as targets_reader, \
            common_lib.smart_open(args.out_targets, 'w') as targets_writer:
        for key, mat in common_lib.read_mat_ark(targets_reader):
            mat = np.matrix(mat)
            if mat.shape[1] % args.dim != 0:
                # args.pasted_targets is a plain string (see get_args), so
                # it is formatted directly rather than through '.name'.
                raise RuntimeError(
                    "For utterance {utt} in {f}, num-columns {nc} "
                    "is not a multiple of dim {dim}"
                    "".format(utt=key, f=args.pasted_targets,
                              nc=mat.shape[1], dim=args.dim))
            num_sources = mat.shape[1] // args.dim

            out_mat = np.matrix(np.zeros([mat.shape[0], args.dim]))

            if args.remove_mismatch_frames:
                for n in range(mat.shape[0]):
                    if should_remove_frame(mat[n, :].getA()[0], args.dim):
                        out_mat[n, :] = np.zeros([1, args.dim])
                    else:
                        for i in range(num_sources):
                            out_mat[n, :] += (
                                mat[n, (i * args.dim) : ((i+1) * args.dim)]
                                * (1.0 if args.weights is None
                                   else args.weights[i]))
            else:
                # Just interpolate the targets
                for i in range(num_sources):
                    out_mat += (
                        mat[:, (i * args.dim) : ((i+1) * args.dim)]
                        * (1.0 if args.weights is None else args.weights[i]))

            common_lib.write_matrix_ascii(targets_writer, out_mat.tolist(),
                                          key=key)
            num_done += 1

    logger.info("Merged {num_done} target matrices"
                "".format(num_done=num_done))

    if num_done == 0:
        raise RuntimeError("No target matrices were merged")


def main():
    """Script entry point: parse the command line and merge the targets.

    Any exception raised while running propagates unchanged to the caller.
    """
    run(get_args())


if __name__ == '__main__':
    main()


================================================
FILE: egs/steps/segmentation/internal/prepare_sad_graph.py
================================================
#!/usr/bin/env python

# Copyright 2016  Vimal Manohar
# Apache 2.0

"""Prepares a graph with a simple HMM topology for segmentation
with minimum and maximum speech duration constraints and minimum silence
duration constraint. The graph is written to the 'output_graph', which
can be file or "-" for stdout.
"""

from __future__ import print_function
import argparse
import logging
import math
import os
import sys
import traceback

sys.path.insert(0, 'steps')
import libs.common as common_lib


logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s [%(filename)s:%(lineno)s - "
                              "%(funcName)s - %(levelname)s ] %(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)


def get_args():
    """Parse and validate the command line; derive the state counts.

    Besides the parsed options, the returned namespace carries
    min_states_silence, min_states_speech and max_states_speech: the
    corresponding durations divided by the frame shift, rounded to the
    nearest integer.

    Raises:
        ValueError: if --frame-shift is not positive, if
            --edge-silence-probability is not strictly between 0 and 1, or
            if --transition-probability is not positive (their logarithms
            are taken when the graph is written).
    """
    parser = argparse.ArgumentParser(
        description="""This script prepares a graph with a simple HMM topology
        for segmentation with minimum and maximum speech duration constraints
        and minimum silence duration constraint. The graph is written to the
        'output_graph', which can be a file or "-" for stdout.""",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument("--transition-scale", type=float, default=1.0,
                        help="""Scale on transition probabilities relative to
                        LM weights""")
    parser.add_argument("--loopscale", type=float, default=0.1,
                        help="""Scale on self-loop log-probabilities relative
                        to LM weights""")

    parser.add_argument("--min-silence-duration", type=float, default=0.03,
                        help="""Minimum duration for silence""")
    parser.add_argument("--min-speech-duration", type=float, default=0.3,
                        help="""Minimum duration for speech""")
    parser.add_argument("--max-speech-duration", type=float, default=10.0,
                        help="""Maximum duration for speech""")
    parser.add_argument("--frame-shift", type=float, default=0.03,
                        help="""Frame shift in seconds""")

    parser.add_argument("--edge-silence-probability", type=float,
                        default=0.5,
                        help="Probability of silence at the edges.")
    parser.add_argument("--transition-probability", type=float, default=0.1,
                        help="Transition probability for silence to speech "
                        "or vice-versa")

    parser.add_argument("output_graph", type=str,
                        help="Output graph")
    args = parser.parse_args()

    # Fail early with a clear message instead of a ZeroDivisionError below
    # or a "math domain error" while the graph is being written.
    if args.frame_shift <= 0:
        raise ValueError("--frame-shift should be positive; got {0}"
                         "".format(args.frame_shift))
    if not 0.0 < args.edge_silence_probability < 1.0:
        raise ValueError("--edge-silence-probability should be in (0, 1); "
                         "got {0}".format(args.edge_silence_probability))
    if args.transition_probability <= 0.0:
        raise ValueError("--transition-probability should be positive; "
                         "got {0}".format(args.transition_probability))

    # Convert durations in seconds to a number of states, rounding to the
    # nearest integer.
    args.min_states_silence = int(args.min_silence_duration / args.frame_shift
                                  + 0.5)
    args.min_states_speech = int(args.min_speech_duration / args.frame_shift
                                 + 0.5)
    args.max_states_speech = int(args.max_speech_duration / args.frame_shift
                                 + 0.5)

    return args


def print_states(args, file_handle):
    """Write the arcs and final states of the segmentation graph.

    Arc lines have the form "<from> <to> <label> <label> <cost>" and
    final-state lines the form "<state> <cost>".  State 0 is the start
    state; the silence states are numbered from 1 and the speech states
    follow directly after them.  The numbers of states are taken from
    args.min_states_silence, args.min_states_speech and
    args.max_states_speech.
    """
    def write_arc(src, dst, label, cost):
        # The label is used for both label fields of the arc line.
        print ("{0} {1} {2} {2} {3}".format(src, dst, label, cost),
               file=file_handle)

    def write_final(state, cost):
        print ("{0} {1}".format(state, cost), file=file_handle)

    # Start state -> first silence state.
    sil_first = 1
    edge_sil_cost = -math.log(args.edge_silence_probability)
    write_arc(0, sil_first, "silence", edge_sil_cost)

    # Chain of silence states enforcing the minimum silence duration,
    # closed by a self-loop on the last one.
    sil_last = sil_first + args.min_states_silence - 1
    for s in range(sil_first, sil_last):
        write_arc(s, s + 1, "silence", 0.0)
    write_arc(sil_last, sil_last, "silence", 0.0)

    # Start state -> first speech state.
    sp_first = sil_last + 1
    edge_sp_cost = -math.log(1.0 - args.edge_silence_probability)
    write_arc(0, sp_first, "speech", edge_sp_cost)

    # Last silence state -> first speech state.
    switch_cost = -math.log(args.transition_probability)
    write_arc(sil_last, sp_first, "speech", switch_cost)

    # Speech states up to the minimum duration: forward arcs only.
    sp_min_last = sp_first + args.min_states_speech - 1
    for s in range(sp_first, sp_min_last):
        write_arc(s, s + 1, "speech", 0.0)

    # Between minimum and maximum duration: continue with speech, or go
    # back to the first silence state.
    sp_last = sp_first + args.max_states_speech - 1
    for s in range(sp_min_last, sp_last):
        write_arc(s, s + 1, "speech", 0.0)
        write_arc(s, sil_first, "silence", switch_cost)

    # After the last speech state the only arc leads to silence.
    write_arc(sp_last, sil_first, "silence", 0.0)

    # Final costs: first for all silence states, then all speech states.
    for s in range(sil_first, sp_first):
        write_final(s, edge_sil_cost)
    for s in range(sp_first, sp_last + 1):
        write_final(s, edge_sp_cost)


def main():
    """Script entry point: parse the command line and write the graph.

    Any exception raised along the way propagates unchanged to the caller.
    """
    args = get_args()
    with common_lib.smart_open(args.output_graph, 'w') as graph_fh:
        print_states(args, graph_fh)


if __name__ == '__main__':
    main()


================================================
FILE: egs/steps/segmentation/internal/resample_targets.py
================================================
#!/usr/bin/env python

# Copyright 2017  Vimal Manohar
# Apache 2.0

"""
This script reads a Kaldi text archive of matrices from 'targets_in_ark' (e.g.
'-' for standard input), modifies them by subsampling them, and writes the
modified archive to 'targets_out_ark'.
This form of 'subsampling' is similar to taking every n'th frame (specifically:
every n'th row), except that we average over blocks of size 'n' instead of
taking every n'th element.
Thus, this script is similar to the binary 'subsample-feats' except that
it subsamples by averaging.
"""

import argparse
import logging
import numpy as np
import sys

sys.path.insert(0, 'steps')
import libs.common as common_lib

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - "
                              "%(funcName)s - %(levelname)s ] %(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)


def get_args():
    """Parse and validate the command-line arguments.

    Returns:
        The argparse namespace holding 'subsampling_factor', 'verbose' and
        the two positional archives 'targets_in_ark' / 'targets_out_ark',
        which argparse has already opened in text mode.

    Raises:
        ValueError: if --subsampling-factor is smaller than 1.
    """
    usage_text = """
This script reads a Kaldi text archive of matrices from 'targets_in_ark' (e.g.
'-' for standard input), modifies them by subsampling them, and writes the
modified archive to 'targets_out_ark'.
This form of 'subsampling' is similar to taking every n'th frame (specifically:
every n'th row), except that we average over blocks of size 'n' instead of
taking every n'th element.
Thus, this script is similar to the binary 'subsample-feats' except that
it subsamples by averaging."""
    parser = argparse.ArgumentParser(description=usage_text)

    # Options.
    parser.add_argument(
        "--subsampling-factor", type=int, default=1,
        help="The sampling rate is scaled by this factor")
    parser.add_argument(
        "--verbose", type=int, default=0, choices=[0,1,2],
        help="Verbose level")

    # Positional archives.
    parser.add_argument(
        "targets_in_ark", type=argparse.FileType('r'),
        help="Input targets archive")
    parser.add_argument(
        "targets_out_ark", type=argparse.FileType('w'),
        help="Output targets archive")

    parsed = parser.parse_args()

    factor = parsed.subsampling_factor
    if factor < 1:
        raise ValueError(
            "Invalid --subsampling-factor value {0}".format(factor))

    # The highest verbosity turns on debug output on the logger and handler.
    if parsed.verbose >= 2:
        for sink in (logger, handler):
            sink.setLevel(logging.DEBUG)

    return parsed


def run(args):
    """Subsample every matrix of the input archive by block averaging.

    For each (key, matrix) pair read from args.targets_in_ark, an output
    matrix with ceil(num_rows / subsampling_factor) rows is allocated.  Each
    output row is the mean of a window of input rows roughly
    'subsampling_factor' wide, clipped to the matrix boundaries.  The result
    is written in text form to args.targets_out_ark under the same key.
    Both archives are closed before returning.

    Arguments:
        args -- namespace from get_args(); 'subsampling_factor' must be a
                positive integer (get_args() enforces this).

    Raises:
        IndexError: re-raised after logging if a window cannot be stored.
    """
    factor = args.subsampling_factor
    num_utts = 0
    for key, mat in common_lib.read_mat_ark(args.targets_in_ark):
        mat = np.matrix(mat)
        num_frames = mat.shape[0]

        # Ceiling division.  '//' keeps the row count an integer on both
        # Python 2 and 3; with '/' Python 3 produces a float, which np.zeros
        # rejects as a dimension.
        num_indexes = (num_frames + factor - 1) // factor

        # Rows that the loop below never reaches keep their initial zeros.
        out_mat = np.zeros([num_indexes, mat.shape[1]])
        i = 0
        # k walks over the window centres: factor/2, factor/2 + factor, ...
        for k in range(int(factor / 2.0), num_frames, factor):
            st = int(k - float(factor) / 2.0)
            end = int(k + float(factor) / 2.0)

            # Clip the window to the valid row range.
            if st < 0:
                st = 0
            if end > num_frames:
                end = num_frames

            try:
                out_mat[i, :] = np.sum(mat[st:end, :], axis=0) / float(end - st)
            except IndexError:
                logger.error("mat.shape = {0}, st = {1}, end = {2}"
                             "".format(mat.shape, st, end))
                raise
            # Floor division: the i-th centre lies inside the i-th block.
            # (True division would compare against a non-integer value.)
            assert i == k // factor
            i += 1

        common_lib.write_matrix_ascii(args.targets_out_ark, out_mat, key=key)
        num_utts += 1
    args.targets_in_ark.close()
    args.targets_out_ark.close()

    logger.info("Sub-sampled {num_utts} target matrices"
                "".format(num_utts=num_utts))


def main():
    """Entry point: parse the arguments, resample, and always close the files.

    Any exception raised by run() is logged together with its traceback and
    turned into exit status 1.  The two archive handles are closed whether or
    not run() succeeded.
    """
    args = get_args()
    try:
        run(args)
    except Exception:
        logger.error("Script failed; traceback = ", exc_info=True)
        raise SystemExit(1)
    finally:
        archive_handles = (args.targets_in_ark, args.targets_out_ark)
        for ark_fh in archive_handles:
            if ark_fh is None:
                continue
            ark_fh.close()


if __name__ == "__main__":
    main()


================================================
FILE: egs/steps/segmentation/internal/sad_to_segments.py
================================================
#!/usr/bin/env python

# Copyright 2017  Vimal Manohar
#           2018  Capital One (Author: Zhiyuan Guan)
# Apache 2.0

"""
This script converts frame-level speech activity detection marks (in kaldi
integer vector text archive format) into kaldi segments and utt2spk.
The input integer vectors are expected to contain '1' for silence frames
and '2' for speech frames.
"""

from __future__ import print_function
import argparse
import logging
import sys

sys.path.insert(0, 'steps')
import libs.common as common_lib

# Module-level logger.  Both the logger and its stream handler start at INFO;
# get_args() switches both to DEBUG when --verbose is 3.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setLevel(logging.INFO)
# Each record shows the time, source path, line number, function name and
# level in front of the message.
formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - "
                              "%(funcName)s - %(levelname)s ] %(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)

# Verbosity requested on the command line; set by get_args() and read by
# Segmentation.write() to decide whether per-utterance stats are logged.
global_verbose = 0


def get_args():
    """Parse the command-line arguments of this script.

    Besides parsing, this stores the requested verbosity in the module-level
    'global_verbose', logs it, and switches the logger and its handler to
    DEBUG when the verbosity is 3.

    Returns:
        The populated argparse namespace.
    """
    global global_verbose

    description = """
This script converts frame-level speech activity detection marks (in kaldi
integer vector text archive format) into kaldi segments and utt2spk.
The input integer vectors are expected to contain 1 for silence frames
and 2 for speech frames.
"""
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawTextHelpFormatter)
    add = parser.add_argument

    # Options.
    add("--verbose", type=int, choices=[0, 1, 2, 3],
        default=0, help="Higher verbosity for more logging")
    add("--utt2dur", type=str,
        help="File containing durations of utterances.")
    add("--frame-shift", type=float, default=0.01,
        help="Frame shift to convert frame indexes to time")
    add("--segment-padding", type=float, default=0.2,
        help="Additional padding on speech segments. But we "
             "ensure that the padding does not go beyond the "
             "adjacent segment.")
    add("--min-segment-dur", type=float, default=0,
        help="Minimum duration (in seconds) required for a segment "
             "to be included. This is before any padding. Segments "
             "shorter than this duration will be removed.")
    add("--merge-consecutive-max-dur", type=float, default=0,
        help="Merge consecutive segments as long as the merged "
             "segment is no longer than this many seconds. The segments "
             "are only merged if their boundaries are touching. "
             "This is after padding by --segment-padding seconds."
             "0 means do not merge. Use 'inf' to not limit the duration.")

    # Positional arguments.
    add("in_sad", type=str,
        help="Input file containing alignments in "
             "text archive format")
    add("out_segments", type=str,
        help="Output kaldi segments file")

    args = parser.parse_args()

    global_verbose = args.verbose
    logger.info("Setting verbosity to {0}".format(global_verbose))

    if args.verbose >= 3:
        for sink in (logger, handler):
            sink.setLevel(logging.DEBUG)
    return args


def to_str(segment):
    """Return '[start, end, label]' for a 3-element segment, with the first
    two elements printed with three decimals."""
    assert len(segment) == 3
    start, end, label = segment[0], segment[1], segment[2]
    template = "[{0:.3f}, {1:.3f}, {2}]"
    return template.format(start, end, label)


class SegmenterStats(object):
    """Counters and durations collected by the post-processing stages.

    Every attribute starts at zero; add() accumulates another instance into
    this one and str() renders all attributes as 'name=value' pairs.
    """

    # Attribute names, in the order in which __str__ reports them.
    _REPORT_ORDER = (
        "num_segments_initial",
        "num_short_segments_filtered",
        "num_merges",
        "num_segments_final",
        "initial_duration",
        "filter_short_duration",
        "padding_duration",
        "final_duration",
    )

    def __init__(self):
        # Segment counts.
        self.num_segments_initial = 0
        self.num_short_segments_filtered = 0
        self.num_merges = 0
        self.num_segments_final = 0
        # Durations.
        self.initial_duration = 0.0
        self.padding_duration = 0.0
        self.filter_short_duration = 0.0
        self.final_duration = 0.0

    def add(self, other):
        """Accumulate every statistic of 'other' into this object."""
        for name in self._REPORT_ORDER:
            setattr(self, name, getattr(self, name) + getattr(other, name))

    def __str__(self):
        # The reported names are the attribute names with '-' for '_'.
        return ", ".join(
            "{0}={1}".format(name.replace("_", "-"), getattr(self, name))
            for name in self._REPORT_ORDER)


def process_label(text_label):
    """Convert a textual label to the integer 1 (silence) or 2 (speech).

    Arguments:
        text_label -- input label (must be convertible to an integer)

    Raises:
        ValueError: if the label is not an integer or is neither 1 nor 2.
    """
    label = int(text_label)
    if label == 1 or label == 2:
        return label

    raise ValueError("Expecting label to 1 (non-speech) or 2 (speech); "
                     "got {}".format(label))


class Segmentation(object):
    """Stores the segmentation of one utterance.

    'segments' is a list of [start, end, label] lists with times in seconds;
    only speech segments (label 2) are stored.  'stats' is a SegmenterStats
    object updated by each post-processing stage.
    """

    def __init__(self):
        self.segments = None
        self.stats = SegmenterStats()

    def initialize_segments(self, alignment, frame_shift=0.01):
        """Initializes segments from input alignment.
        The alignment is frame-level speech-activity detection marks,
        each of which must be 1 or 2.

        Arguments:
            alignment -- non-empty sequence of labels (strings or integers)
            frame_shift -- duration of one frame in seconds
        """
        self.segments = []

        assert len(alignment) > 0

        prev_label = None   # label of the run of frames currently open
        prev_length = 0     # number of frames in that run so far
        for i, text_label in enumerate(alignment):
            if prev_label is not None and int(text_label) != prev_label:
                # The label changed at frame i: close the previous run and
                # keep it only if it was speech.
                if prev_label == 2:
                    self.segments.append(
                        [float(i - prev_length) * frame_shift,
                         float(i) * frame_shift, prev_label])
                    self.stats.initial_duration += (prev_length * frame_shift)
                prev_label = process_label(text_label)
                prev_length = 0
            elif prev_label is None:
                prev_label = process_label(text_label)

            prev_length += 1

        # Close the run that extends to the end of the alignment.
        if prev_length > 0 and prev_label == 2:
            self.segments.append(
                [float(len(alignment) - prev_length) * frame_shift,
                 float(len(alignment)) * frame_shift, prev_label])
            self.stats.initial_duration += (prev_length * frame_shift)

        self.stats.num_segments_initial = len(self.segments)
        self.stats.num_segments_final = len(self.segments)
        self.stats.final_duration = self.stats.initial_duration

    def filter_short_segments(self, min_dur):
        """Filters out segments with durations shorter than 'min_dur'.
        Does nothing when 'min_dur' is not positive."""
        if min_dur <= 0:
            return

        segments_kept = []
        for segment in self.segments:
            assert segment[2] == 2, segment
            dur = segment[1] - segment[0]
            if dur < min_dur:
                self.stats.filter_short_duration += dur
                self.stats.num_short_segments_filtered += 1
            else:
                segments_kept.append(segment)
        self.segments = segments_kept
        self.stats.num_segments_final = len(self.segments)
        self.stats.final_duration -= self.stats.filter_short_duration

    def pad_speech_segments(self, segment_padding, max_duration=float("inf")):
        """Pads segments by duration 'segment_padding' on either sides, but
        ensures that the segments don't go beyond the neighboring segments
        or the duration of the utterance 'max_duration'.

        Arguments:
            segment_padding -- padding in seconds added on each side
            max_duration -- utterance duration in seconds; None means
                            unlimited
        """
        if max_duration is None:
            max_duration = float("inf")
        for i, segment in enumerate(self.segments):
            assert segment[2] == 2, segment
            segment[0] -= segment_padding  # try adding padding on the left side
            self.stats.padding_duration += segment_padding
            if segment[0] < 0.0:
                # Padding takes the segment start to before the beginning of the utterance.
                # Reduce padding (segment[0] is negative here).
                self.stats.padding_duration += segment[0]
                segment[0] = 0.0
            if i >= 1 and self.segments[i - 1][1] > segment[0]:
                # Padding takes the segment start to before the end the previous segment.
                # Reduce padding.
                self.stats.padding_duration -= (
                        self.segments[i - 1][1] - segment[0])
                segment[0] = self.segments[i - 1][1]

            segment[1] += segment_padding
            self.stats.padding_duration += segment_padding
            if segment[1] >= max_duration:
                # Padding takes the segment end beyond the max duration of the utterance.
                # Reduce padding.
                self.stats.padding_duration -= (segment[1] - max_duration)
                segment[1] = max_duration
            if (i + 1 < len(self.segments)
                    and segment[1] > self.segments[i + 1][0]):
                # Padding takes the segment end beyond the start of the next segment.
                # Reduce padding.
                self.stats.padding_duration -= (
                        segment[1] - self.segments[i + 1][0])
                segment[1] = self.segments[i + 1][0]
        self.stats.final_duration += self.stats.padding_duration

    def merge_consecutive_segments(self, max_dur):
        """Merge consecutive segments (happens after padding), provided that
        the merged segment is no longer than 'max_dur'.
        Does nothing when 'max_dur' is not positive or there are no
        segments."""
        if max_dur <= 0 or not self.segments:
            return

        merged_segments = [self.segments[0]]
        for segment in self.segments[1:]:
            assert segment[2] == 2, segment
            if segment[0] == merged_segments[-1][1] and \
                    segment[1] - merged_segments[-1][0] <= max_dur:
                # The segment starts at the same time the last segment ends,
                # and the merged segment is shorter than 'max_dur'.
                # Extend the previous segment.
                merged_segments[-1][1] = segment[1]
                self.stats.num_merges += 1
            else:
                merged_segments.append(segment)

        self.segments = merged_segments
        self.stats.num_segments_final = len(self.segments)

    def write(self, key, file_handle):
        """Write segments to file, one '<seg-id> <key> <start> <end>' line
        per segment.  The segment id embeds the start and end times in
        hundredths of a second."""
        if global_verbose >= 2:
            logger.info("For key {key}, got stats {stats}".format(
                key=key, stats=self.stats))
        for segment in self.segments:
            # Round rather than truncate: a time such as 0.09 may be stored
            # as 0.0899999..., and truncation would put 8 in the id while
            # the same line prints the time as 0.09.
            seg_id = "{key}-{st:07d}-{end:07d}".format(
                key=key, st=int(round(segment[0] * 100)),
                end=int(round(segment[1] * 100)))
            print("{seg_id} {key} {st:.2f} {end:.2f}".format(
                seg_id=seg_id, key=key, st=segment[0], end=segment[1]),
                file=file_handle)


def run(args):
    """The main function that does everything.

    Reads the optional utt2dur file, then for each line of the input
    alignment archive ('<utt-id> <label> <label> ...') builds a Segmentation,
    filters short segments, pads, merges, and writes the resulting segments.
    The statistics accumulated over all utterances are logged at the end.

    Raises:
        RuntimeError: if a line of the utt2dur file does not have exactly two
            fields, or a line of the alignment archive has fewer than two.
        KeyError: if --utt2dur is given but lacks an utterance of the
            alignment archive.
    """
    utt2dur = {}
    if args.utt2dur is not None:
        with common_lib.smart_open(args.utt2dur) as utt2dur_fh:
            for line in utt2dur_fh:
                parts = line.strip().split()
                if len(parts) != 2:
                    raise RuntimeError("Unable to parse line '{0}' in {1}"
                                       "".format(line.strip(), args.utt2dur))
                utt2dur[parts[0]] = float(parts[1])

    global_stats = SegmenterStats()
    with common_lib.smart_open(args.in_sad) as in_sad_fh, \
            common_lib.smart_open(args.out_segments, 'w') as out_segments_fh:
        for line in in_sad_fh:
            parts = line.strip().split()

            # Validate before indexing, so that a blank line is reported as
            # a parse error instead of failing with IndexError.  The message
            # names the input path rather than the file object.
            if len(parts) < 2:
                raise RuntimeError("Unable to parse line '{0}' in {1}"
                                   "".format(line.strip(), args.in_sad))
            utt_id = parts[0]

            segmentation = Segmentation()
            segmentation.initialize_segments(
                parts[1:], args.frame_shift)
            segmentation.filter_short_segments(args.min_segment_dur)
            # Without utt2dur the padding is not limited by the utterance
            # duration.
            segmentation.pad_speech_segments(args.segment_padding,
                                             None if args.utt2dur is None
                                             else utt2dur[utt_id])
            segmentation.merge_consecutive_segments(args.merge_consecutive_max_dur)
            segmentation.write(utt_id, out_segments_fh)
            global_stats.add(segmentation.stats)
    logger.info(global_stats)


def main():
    """Entry point: parse the command line, then hand the result to run().

    Exceptions raised by run() propagate to the caller unchanged.
    """
    run(get_args())


if __name__ == '__main__':
    main()


================================================
FILE: egs/steps/segmentation/internal/verify_phones_list.py
================================================
#!/usr/bin/env python

# Copyright 2017  Vimal Manohar
# Apache 2.0

"""This script verifies the list of phones read from stdin are valid
phones present in lang/phones.txt."""

import argparse
import sys

def get_args():
    """Parse the command line; the only argument is the phones file path.

    Returns:
        The argparse namespace, with the path available as 'phones'.
    """
    summary = """
    This script verifies the list of phones read from stdin are valid
    phones present in lang/phones.txt."""
    parser = argparse.ArgumentParser(description=summary)

    phones_help = ("File containing the list of all phones as the "
                   "first column")
    parser.add_argument("phones", type=str, help=phones_help)

    return parser.parse_args()


def main():
    """Check that every phone read from stdin is listed in the phones file.

    The first whitespace-separated field of each non-blank line of the file
    named by the 'phones' argument is taken as a valid phone.  Each line of
    standard input, stripped of surrounding whitespace, must then be one of
    those phones.

    Raises:
        SystemExit: with status 1, after reporting the phone on stderr, at
            the first phone that is not found.
    """
    args = get_args()
    phones = set()
    # 'with' closes the file deterministically; blank lines are skipped
    # instead of failing on an empty field list.
    with open(args.phones) as phones_fh:
        for line in phones_fh:
            fields = line.split()
            if fields:
                phones.add(fields[0])

    for line in sys.stdin:
        p = line.strip()

        if p not in phones:
            sys.stderr.write("Could not find phone {p} in {f}"
                             "\n".format(p=p, f=args.phones))
            raise SystemExit(1)


if __name__ == "__main__":
    main()


================================================
FILE: egs/steps/segmentation/lats_to_targets.sh
================================================
#!/usr/bin/env bash

# Copyright 2017  Vimal Manohar
# Apache 2.0

# This script converts lattices into targets for training neural network
# for speech activity detection. The targets is a matrix of size
# (num-frames-subsampled x 3)
# with each row representing probabilities for speech, silence and
# garbage classes for the corresponding frame (after subsampling). The
# probability values are lattice posteriors for the 3 classes and are
# obtained by summing up phone arc posteriors for the phones
# corresponding to each class.
# The mapping from phones to speech / silence / garbage classes
# is defined by the options --silence-phones and --garbage-phones.
# Also "speech" phones longer than --max-phone-duration seconds are
# treated as "garbage".

set -o pipefail

# Options (can be overridden on the command line via parse_options.sh).
silence_phones=          # file listing the silence phones, one per line
garbage_phones=          # file listing the garbage phones, one per line
max_phone_duration=0.5   # in seconds
acwt=0.1                 # passed as --acoustic-scale to lattice-arc-post

cmd=run.pl

[ -f ./path.sh ] && . ./path.sh
. utils/parse_options.sh

if [ $# -ne 4 ]; then
  cat <<EOF
  This script converts lattices into targets for training neural network
  for speech activity detection. The targets is a matrix of size
  (num-frames-subsampled x 3)
  with each row representing probabilities for speech, silence and
  garbage classes for the corresponding frame (after subsampling). The
  probability values are lattice posteriors for the 3 classes and are
  obtained by summing up phone arc posteriors for the phones
  corresponding to each class.
  The mapping from phones to speech / silence / garbage classes
  is defined by the options --silence-phones and --garbage-phones.
  Also "speech" phones longer than --max-phone-duration seconds are
  treated as "garbage".

  Usage: steps/segmentation/lats_to_targets.sh <data-dir> <lang> <lattice-dir> <targets-dir>
  e.g.: steps/segmentation/lats_to_targets.sh \
  --silence-phones exp/segmentation1a/silence_phones.txt \
  --garbage-phones exp/segmentation1a/garbage_phones.txt \
  --max-phone-duration 0.5 \
  data/train_split10s data/lang \
  exp/segmentation1a/tri3b_train_split10s_lats \
  exp/segmentation1a/tri3b_train_split10s_targets

  note:
  silence_phones.txt and garbage_phones.txt must list phones, one per line.
  garbage_phones.txt can contain phones corresponding to ambiguous items like
  OOV, laugh and spoken noise that you want to map to "garbage class".
  silence_phones.txt might just contain the phones from
  data/lang/phones/silence_phones.txt other than the garbage phones. These
  are mapped to the "silence" class.
EOF
  exit 1
fi

data=$1
lang=$2
lats_dir=$3
dir=$4

# The model is looked up in the lattice directory itself, or else in its
# parent directory.
if [ -f $lats_dir/final.mdl ]; then
  srcdir=$lats_dir
else
  srcdir=$lats_dir/..
fi

for f in $data/utt2spk $lats_dir/lat.1.gz $srcdir/final.mdl; do
  if [ ! -f $f ]; then
    echo "$0: Could not find file $f"
    exit 1
  fi
done

mkdir -p $dir

# Without --garbage-phones, only the OOV phone is treated as garbage.
if [ -z "$garbage_phones" ]; then
  oov_phone=$(steps/segmentation/internal/get_oov_phone.py $lang) || exit 1
  echo $oov_phone | utils/int2sym.pl $lang/phones.txt > $dir/garbage_phones.txt || exit 1
else
  cp $garbage_phones $dir/garbage_phones.txt || exit 1
fi

# Without --silence-phones, use the silence phones of the lang directory
# minus the garbage phones.
# NOTE(review): the usage text above refers to
# data/lang/phones/silence_phones.txt, whereas this reads
# $lang/silence_phones.txt -- confirm which path is intended.
if [ -z "$silence_phones" ]; then
  cat $lang/silence_phones.txt | \
    utils/filter_scp.pl --exclude $dir/garbage_phones.txt > \
    $dir/silence_phones.txt
else
  cp $silence_phones $dir/silence_phones.txt
fi

nj=$(cat $lats_dir/num_jobs) || exit 1

# Get the arc posteriors of the phone-aligned lattices, with the phone
# (field 5) converted to its symbol.
$cmd JOB=1:$nj $dir/log/get_arc_info.JOB.log \
  lattice-push "ark:gunzip -c $lats_dir/lat.JOB.gz |" ark:- \| \
  lattice-align-phones --replace-output-symbols=true $srcdir/final.mdl ark:- ark:- \| \
  lattice-arc-post --acoustic-scale=$acwt $srcdir/final.mdl ark:- - \| \
  utils/int2sym.pl -f 5 $lang/phones.txt '>' \
  $dir/arc_info_sym.JOB.txt || exit 1

# make $dir an absolute pathname.
dir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $dir ${PWD}`

# Propagate the frame subsampling factor of the model, if there is one.
# Read the same file that the test checks for: the name
# 'frames_subsampling_factor' (with an extra 's') does not exist.
frame_subsampling_factor=1
if [ -f $srcdir/frame_subsampling_factor ]; then
  frame_subsampling_factor=$(cat $srcdir/frame_subsampling_factor) || exit 1
  echo $frame_subsampling_factor > $dir/frame_subsampling_factor
fi

# Convert --max-phone-duration from seconds to a number of frames.
frame_shift=$(utils/data/get_frame_shift.sh $data) || exit 1
max_phone_len=$(perl -e "print int($max_phone_duration / $frame_shift)")

$cmd JOB=1:$nj $dir/log/get_targets.JOB.log \
  steps/segmentation/internal/arc_info_to_targets.py \
    --silence-phones=$dir/silence_phones.txt \
    --garbage-phones=$dir/garbage_phones.txt \
    --max-phone-length=$max_phone_len \
    $dir/arc_info_sym.JOB.txt - \| \
  copy-feats ark,t:- \
    ark,scp:$dir/targets.JOB.ark,$dir/targets.JOB.scp || exit 1

for n in $(seq $nj); do
  cat $dir/targets.$n.scp
done > $dir/targets.scp

steps/segmentation/validate_targets_dir.sh $dir $data || exit 1

echo "$0: Done creating targets in $dir/targets.scp"


================================================
FILE: egs/steps/segmentation/merge_targets_dirs.sh
================================================
#!/usr/bin/env bash

# Copyright 2017  Vimal Manohar
# Apache 2.0

# This script merges targets dirs created from multiple sources (systems) into
# single targets matrices. See steps/segmentation/lats_to_targets.sh for
# details about the format of the targets.

# This script merges targets from multiple sources using weights supplied
# by --weights option. Also the option --remove-mismatch-frames can be
# used to remove frames where different sources have mismatched labels.
# e.g. We can check if the labels from supervision-constrained lattices
# and those from decoding match.

cmd=run.pl
nj=4
weights=        # A comma-separated list of weights corresponding to each
                # target source being combined. Must match the number of
                # source target directories.
remove_mismatch_frames=true     # If true, the mismatch frames are removed by
                                # setting targets to 0 in the following cases:
                                # a) If none of the sources have a column with value > 0.5
                                # b) If two sources have columns with value > 0.5, but
                                # they occur at different indexes e.g. silence prob is > 0.5 for the
                                # targets from alignment, and speech prob > 0.5 for the targets from
                                # decoding

[ -f ./path.sh ] && . ./path.sh
. utils/parse_options.sh

if [ $# -lt 3 ]; then
  cat <<EOF
  This script merges targets dirs created from multiple sources (systems) into
  single targets matrices.
  See top of the script for more details.

  Usage: steps/segmentation/merge_targets_dirs.sh <data> <targets-1> <targets-2> ... <merged-targets>
  e.g.: steps/segmentation/merge_targets_dirs.sh --weights 1.0,0.5 \
      data/train_whole \
      exp/segmentation1a/tri3b_train_whole_sup_targets_sub3 \
      exp/segmentation1a/tri3b_train_whole_targets_sub3 \
      exp/segmentation1a/tri3b_train_whole_combined_targets_sub3
EOF
  exit 1
fi

data=$1
dir=${@: -1}  # last argument to the script
shift;

targets_dirs=( "$@" )  # read the remaining arguments into an array
unset "targets_dirs[${#targets_dirs[@]}-1]"  # 'pop' the last argument which is odir
num_sources=${#targets_dirs[@]}  # number of targets to combine

utils/data/split_data.sh --per-utt $data $nj || exit 1
sdata=${data}/split${nj}utt

# The first source defines the frame subsampling factor that all the other
# sources must agree with.
frame_subsampling_factor=1
if [ -f ${targets_dirs[0]}/frame_subsampling_factor ]; then
  frame_subsampling_factor=$(cat ${targets_dirs[0]}/frame_subsampling_factor) || exit 1
fi

mkdir -p $dir/split${nj}

# Split the targets.scp of every source per job, and collect one rspecifier
# per source for paste-feats.
targets_rspecifiers=()
target_id=1
for t in "${targets_dirs[@]}"; do
  this_frame_subsampling_factor=1
  if [ -f $t/frame_subsampling_factor ]; then
    this_frame_subsampling_factor=$(cat $t/frame_subsampling_factor) || exit 1
  fi
  if [ $this_frame_subsampling_factor -ne $frame_subsampling_factor ]; then
    echo "$0: Mismatch in frame_subsampling_factor in $t and ${targets_dirs[0]}; $this_frame_subsampling_factor vs $frame_subsampling_factor"
    exit 1
  fi

  utils/filter_scps.pl JOB=1:$nj $sdata/JOB/utt2spk \
    $t/targets.scp $dir/split${nj}/in_targets.$target_id.JOB.scp || exit 1

  targets_rspecifiers+=("scp:$dir/split${nj}/in_targets.$target_id.JOB.scp")
  target_id=$((target_id+1))
done

# convert $dir to an absolute pathname.
fdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $dir ${PWD}`

# Paste the targets of all the sources side by side and merge them.
$cmd JOB=1:$nj $dir/log/merge_targets.JOB.log \
  paste-feats "${targets_rspecifiers[@]}" ark,t:- \| \
  steps/segmentation/internal/merge_targets.py --weights="$weights" \
    --remove-mismatch-frames=$remove_mismatch_frames - - \| \
  copy-feats ark,t:- ark,scp:$fdir/targets.JOB.ark,$fdir/targets.JOB.scp || exit 1

for n in `seq $nj`; do
  cat $dir/targets.$n.scp
done > $dir/targets.scp

rm $dir/targets.*.scp   # cleanup

if [ $frame_subsampling_factor -ne 1 ]; then
  echo $frame_subsampling_factor > $dir/frame_subsampling_factor
fi

steps/segmentation/validate_targets_dir.sh $dir $data || exit 1

echo "$0: Merged target directories to $dir"

exit 0


================================================
FILE: egs/steps/segmentation/post_process_sad_to_segments.sh
================================================
#!/usr/bin/env bash

# Copyright 2015-17  Vimal Manohar
# Apache 2.0.

# This script post-processes the output of steps/segmentation/decode_sad.sh,
# which is in the form of frame-level alignments, into a 'segments' file.
# The alignments must be speech activity detection marks i.e. 1 for silence
# and 2 for speech.

set -e -o pipefail -u
. ./path.sh

cmd=run.pl
stage=-10
nj=18    # replaced below by the value in <vad-dir>/num_jobs

# The values below are in seconds
frame_shift=0.01
segment_padding=0.2
min_segment_dur=0
merge_consecutive_max_dur=0

. utils/parse_options.sh

if [ $# -ne 3 ]; then
  echo "This script post-processes the output of steps/segmentation/decode_sad.sh, "
  echo "which is in the form of frame-level alignments, into kaldi segments. "
  echo "The alignments must be speech activity detection marks i.e. 1 for silence "
  echo "and 2 for speech."
  echo "Usage: $0 <data-dir> <vad-dir> <segmentation-dir>"
  echo " e.g.: $0 data/dev_aspire_whole exp/vad_dev_aspire exp/segmentation_dev_aspire"
  exit 1
fi

data_dir=$1
vad_dir=$2    # Alignment directory containing frame-level SAD labels
dir=$3        # Output directory for the segments files

mkdir -p $dir

for f in $vad_dir/ali.1.gz $vad_dir/num_jobs; do
  if [ ! -f $f ]; then
    echo "$0: Could not find file $f" && exit 1
  fi
done

# Use as many jobs as there are alignment archives.
nj=`cat $vad_dir/num_jobs` || exit 1
utils/split_data.sh $data_dir $nj

# Creates $data_dir/utt2dur, which is passed to sad_to_segments.py below.
utils/data/get_utt2dur.sh $data_dir

if [ $stage -le 0 ]; then
  $cmd JOB=1:$nj $dir/log/segmentation.JOB.log \
    copy-int-vector "ark:gunzip -c $vad_dir/ali.JOB.gz |" ark,t:- \| \
    steps/segmentation/internal/sad_to_segments.py \
      --frame-shift=$frame_shift --segment-padding=$segment_padding \
      --min-segment-dur=$min_segment_dur --merge-consecutive-max-dur=$merge_consecutive_max_dur \
      --utt2dur=$data_dir/utt2dur - $dir/segments.JOB
fi

echo $nj > $dir/num_jobs

# Concatenate the per-job segments into a single file.
for n in $(seq $nj); do
  cat $dir/segments.$n
done > $dir/segments


================================================
FILE: egs/steps/segmentation/prepare_targets_gmm.sh
================================================
#! /bin/bash

# Copyright 2017  Vimal Manohar
# Apache 2.0
  
# This script prepares targets for training neural network for 
# speech activity detection.
# See steps/segmentation/lats_to_targets.sh for details about the 
# format of the targets.

# The targets are obtained from a combination
# of supervision-constrained lattices and lattices obtained by decoding. 
# Also, we assume that the out-of-segment regions are all silence (target 
# values of [ 1 0 0 ]). We merge the targets from the multiple sources
# by a weighted average using weights specified by --weights. Also, 
# the frames where the labels from multiple sources do not match are 
# removed in the script steps/segmentation/merge_targets_dirs.sh.

# In this script, we use GMMs trained for ASR on in-domain data 
# to generate the lattices required for creating the targets. To generate
# supervision-constrained lattices, we use speaker-adapted GMM models. To 
# generate lattices without supervision, we use speaker-independent GMM models
# from the LDA+MLLT stage, but apply per-recording cepstral mean subtraction.
# The phones in the lattices are mapped deterministically to 
# 0, 1, and 2 representing respectively silence, speech and garbage classes.
# The mapping is defined by --garbage-phones-list and --silence-phones-list
# options. But when these are unspecified, the silence phones other than
# oov are mapped to silence class and the oov is mapped to garbage class.

stage=-1
train_cmd=run.pl
decode_cmd=run.pl
nj=4
reco_nj=4

lang_test=    # If different from $lang
graph_dir=    # If not provided, a new one will be created using $lang_test

garbage_phones_list=
silence_phones_list=

# Uniform segmentation options for decoding whole recordings. All values are in
# seconds.
max_segment_duration=10
overlap_duration=2.5
max_remaining_duration=5  # If the last remaining piece when splitting uniformly
                          # is smaller than this duration, then the last piece 
                          # is  merged with the previous.
remove_mismatch_frames=true

# List of weights on labels obtained from alignment, 
# labels obtained from decoding and default labels in out-of-segment regions
merge_weights=1.0,0.1,0.5

[ -f ./path.sh ] && . ./path.sh 

set -e -u -o pipefail
. utils/parse_options.sh 

if [ $# -ne 6 ]; then
  cat <<EOF
  This script prepares targets for training neural network for 
  speech activity detction. The targets are obtained from a combination
  of supervision-constrained lattices and lattices obtained by decoding. 
  See comments in the script for more details.

  Usage: $0 <lang> <data> <whole-recording-data> <ali-model-dir> <model-dir> <dir>
   e.g.: $0 data/lang data/train data/train_whole exp/tri5 exp/tri4 exp/segmentation_1a
  
  Note: <whole-recording-data> is expected to have feats.scp and <data> 
  expected to have segments file. We will get the features for <data> by 
  using row ranges of <whole-recording-data>/feats.scp. This script will 
  work on a copy of <data> created to have the recording-id as the speaker-id.
EOF
  exit 1
fi

lang=$1   # Must match the one used to train the models
in_data_dir=$2
in_whole_data_dir=$3
ali_model_dir=$4  # Model directory used to align the $data_dir to get target 
                  # labels for training SAD. This should typically be a
                  # speaker-adapted system.
model_dir=$5      # Model direcotry used to decode the whole-recording version
                  # of the $data_dir to get target labels for training SAD. This
                  # should typically be a speaker-independent system like
                  # LDA+MLLT system.
dir=$6

mkdir -p $dir

if [ -z "$lang_test" ]; then
  lang_test=$lang
fi

extra_files=
if [ -z "$graph_dir" ]; then
  extra_files="$extra_files $lang_test/G.fst $lang_test/phones.txt"
else
  extra_files="$extra_files $graph_dir/HCLG.fst $graph_dir/phones.txt"
fi

for f in $in_whole_data_dir/feats.scp $in_data_dir/segments \
  $lang/phones.txt $garbage_phones_list $silence_phones_list \
  $ali_model_dir/final.mdl $model_dir/final.mdl $extra_files; do
  if [ ! -f $f ]; then
    echo "$0: Could not find file $f"
    exit 1
  fi
done

utils/validate_data_dir.sh --no-feats $in_data_dir || exit 1
utils/validate_data_dir.sh --no-text $in_whole_data_dir || exit 1

# Verify that every phone named in the (optional) garbage / silence phone lists
# is present in $lang/phones.txt.
# /dev/null is given as the first operand so that 'cat' always has at least one
# file to read: when both lists are left unspecified, a bare 'cat' would read
# this script's standard input instead and could block on a terminal.
if ! cat /dev/null $garbage_phones_list $silence_phones_list | \
  steps/segmentation/internal/verify_phones_list.py $lang/phones.txt; then
  echo "$0: Invalid $garbage_phones_list $silence_phones_list"
  exit 1
fi

data_id=$(basename $in_data_dir)
whole_data_id=$(basename $in_whole_data_dir)

if [ $stage -le 0 ]; then
  rm -r $dir/$data_id 2>/dev/null || true
  mkdir -p $dir/$data_id

  utils/data/modify_speaker_info_to_recording.sh \
    $in_data_dir $dir/$data_id || exit 1
  utils/validate_data_dir.sh --no-feats $dir/$data_id || exit 1
fi 

# Work with a temporary data directory with recording-id as the speaker labels.
data_dir=$dir/${data_id}

###############################################################################
# Get feats for the manual segments
###############################################################################
if [ $stage -le 1 ]; then
  utils/data/subsegment_data_dir.sh $in_whole_data_dir ${data_dir}/segments ${data_dir}/tmp
  cp $data_dir/tmp/feats.scp $data_dir

  steps/compute_cmvn_stats.sh $data_dir || exit 1
fi

if [ $stage -le 2 ]; then
  utils/copy_data_dir.sh $in_whole_data_dir $dir/$whole_data_id

  utils/fix_data_dir.sh $dir/$whole_data_id

  # Copy the CMVN stats to the whole directory
  cp $data_dir/cmvn.scp $dir/$whole_data_id
fi

# Work with a temporary data directory with CMVN stats computed using 
# only the segments from the original data directory.
whole_data_dir=$dir/$whole_data_id

###############################################################################
# Obtain supervision-constrained lattices
###############################################################################
sup_lats_dir=$dir/`basename ${ali_model_dir}`_sup_lats_${data_id}
if [ $stage -le 3 ]; then
  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" \
    ${data_dir} ${lang} ${ali_model_dir} $sup_lats_dir || exit 1
fi

###############################################################################
# Uniformly segment whole data directory for decoding
###############################################################################
uniform_seg_data_dir=$dir/${whole_data_id}_uniformseg_${max_segment_duration}sec
uniform_seg_data_id=`basename $uniform_seg_data_dir`

if [ $stage -le 4 ]; then
  utils/data/get_segments_for_data.sh ${whole_data_dir} > \
    ${whole_data_dir}/segments

  mkdir -p $uniform_seg_data_dir

  utils/data/get_uniform_subsegments.py \
    --max-segment-duration $max_segment_duration \
    --overlap-duration $overlap_duration \
    --max-remaining-duration $max_remaining_duration \
    ${whole_data_dir}/segments > $uniform_seg_data_dir/sub_segments

  utils/data/subsegment_data_dir.sh $whole_data_dir \
    $uniform_seg_data_dir/sub_segments $uniform_seg_data_dir
  cp $whole_data_dir/cmvn.scp $uniform_seg_data_dir/
fi

model_id=$(basename $model_dir)
###############################################################################
# Create graph dir for decoding
###############################################################################
if [ -z "$graph_dir" ]; then
  graph_dir=$dir/$model_id/graph
  if [ $stage -le 5 ]; then
    if [ ! -f $graph_dir/HCLG.fst ]; then
      rm -r $dir/lang_test 2>/dev/null || true
      cp -r $lang_test/ $dir/lang_test
      utils/mkgraph.sh $dir/lang_test $model_dir $graph_dir || exit 1
    fi
  fi
fi

###############################################################################
# Decode uniformly segmented data directory
###############################################################################
model_id=$(basename $model_dir)
decode_dir=$dir/${model_id}/decode_${uniform_seg_data_id}
if [ $stage -le 6 ]; then 
  mkdir -p $decode_dir
  
  cp $model_dir/{final.mdl,final.mat,*_opts,tree} $dir/${model_id}
  cp $model_dir/phones.txt $dir/$model_id

  # We use a small beam and max-active since we are only interested in 
  # the speech / silence decisions, not the exact word sequences.
  steps/decode.sh --cmd "$decode_cmd --mem 2G" --nj $nj \
    --max-active 1000 --beam 10.0 \
    --decode-extra-opts "--word-determinize=false" --skip-scoring true \
    $graph_dir $uniform_seg_data_dir $decode_dir
fi

ali_model_id=`basename $ali_model_dir`
###############################################################################
# Get frame-level targets from lattices for nnet training
# Targets are matrices of 3 columns -- silence, speech and garbage
# The target values are obtained by summing up posterior probabilities of 
# arcs from lattice-arc-post over silence, speech and garbage phones.
###############################################################################
if [ $stage -le 7 ]; then
  steps/segmentation/lats_to_targets.sh --cmd "$train_cmd" \
    --silence-phones "$silence_phones_list" \
    --garbage-phones "$garbage_phones_list" \
    --max-phone-duration 0.5 \
    $data_dir $lang $sup_lats_dir \
    $dir/${ali_model_id}_${data_id}_sup_targets
fi

if [ $stage -le 8 ]; then
  steps/segmentation/lats_to_targets.sh --cmd "$train_cmd" \
    --silence-phones "$silence_phones_list" \
    --garbage-phones "$garbage_phones_list" \
    --max-phone-duration 0.5 \
    $uniform_seg_data_dir $lang $decode_dir \
    $dir/${model_id}_${uniform_seg_data_id}_targets
fi

###############################################################################
# Convert targets to be w.r.t. whole data directory and subsample the 
# targets by a factor of 3.
# Since the targets from transcript-constrained lattices have only values 
# for the manual segments, these are converted to whole recording-levels 
# by inserting [ 0 0 0 ] for the out-of-manual segment regions.
###############################################################################
if [ $stage -le 9 ]; then
  steps/segmentation/convert_targets_dir_to_whole_recording.sh --cmd "$train_cmd" --nj $reco_nj \
    $data_dir $whole_data_dir \
    $dir/${ali_model_id}_${data_id}_sup_targets \
    $dir/${ali_model_id}_${whole_data_id}_sup_targets
  
  steps/segmentation/resample_targets_dir.sh --cmd "$train_cmd" --nj $reco_nj 3 \
    $whole_data_dir \
    $dir/${ali_model_id}_${whole_data_id}_sup_targets \
    $dir/${ali_model_id}_${whole_data_id}_sup_targets_sub3
fi

###############################################################################
# Convert the targets from decoding to whole recording. 
###############################################################################
if [ $stage -le 10 ]; then
  steps/segmentation/convert_targets_dir_to_whole_recording.sh --cmd "$train_cmd" --nj $reco_nj \
    $dir/${uniform_seg_data_id} $whole_data_dir \
    $dir/${model_id}_${uniform_seg_data_id}_targets \
    $dir/${model_id}_${whole_data_id}_targets

  steps/segmentation/resample_targets_dir.sh --cmd "$train_cmd" --nj $reco_nj 3 \
    $whole_data_dir \
    $dir/${model_id}_${whole_data_id}_targets \
    $dir/${model_id}_${whole_data_id}_targets_sub3
fi

###############################################################################
# "default targets" values for the out-of-manual-segment regions.
# We assume in this setup that this is silence i.e. [ 1 0 0 ].
###############################################################################

if [ $stage -le 11 ]; then
  echo " [ 1 0 0 ]" > $dir/default_targets.vec
  steps/segmentation/get_targets_for_out_of_segments.sh --cmd "$train_cmd" \
    --nj $reco_nj --frame-subsampling-factor 3 \
    --default-targets $dir/default_targets.vec \
    $data_dir $whole_data_dir $dir/out_of_seg_${whole_data_id}_default_targets_sub3
fi

###############################################################################
# Merge targets for the same data from multiple sources (systems)
# --weights is used to give the targets from alignment a higher weight than
# the targets from decoding. 
# If --remove-mismatch-frames is true, then if alignment and decoding 
# disagree (more than 0.5 probability on different classes), then those frames
# are removed by setting targets to [ 0 0 0 ]. 
###############################################################################
if [ $stage -le 12 ]; then
  steps/segmentation/merge_targets_dirs.sh --cmd "$train_cmd" --nj $reco_nj \
    --weights $merge_weights --remove-mismatch-frames $remove_mismatch_frames \
    $whole_data_dir \
    $dir/${ali_model_id}_${whole_data_id}_sup_targets_sub3 \
    $dir/${model_id}_${whole_data_id}_targets_sub3 \
    $dir/out_of_seg_${whole_data_id}_default_targets_sub3 \
    $dir/${whole_data_id}_combined_targets_sub3
fi

cp $dir/${whole_data_id}_combined_targets_sub3/targets.scp $dir/

echo "$0: Prepared targets in $dir/targets.scp"


================================================
FILE: egs/steps/segmentation/resample_targets_dir.sh
================================================
#!/usr/bin/env bash

# Copyright 2017  Vimal Manohar
# Apache 2.0

# This script resamples the targets matrix by the specified <subsampling-factor>.
# If <subsampling-factor> is negative, then the targets will be upsampled 
# by -<subsampling-factor>.
# This script is a wrapper around steps/segmentation/internal/resample_targets.py,
# which works very similarly to the binary subsample-feats. See that script
# for details about how the resampling is done.

# See the script steps/segmentation/lats_to_targets.sh for details about 
# the format of the targets.

# Begin configuration section.
nj=4         # Number of parallel jobs the targets are split into.
cmd=run.pl   # Job dispatcher, e.g. run.pl or queue.pl.
# End configuration section.

set -o pipefail -u

[ -f ./path.sh ] && . ./path.sh
. utils/parse_options.sh

if [ $# -ne 4 ]; then
  cat <<EOF
  This script resamples the targets matrix by the specified subsampling factor.
  If <subsampling-factor> is negative, then the targets will be upsampled 
  by -<subsampling-factor>.
  See top of the script for more details.

  Usage: $0 <subsampling-factor> <data-dir> <targets-dir> <resampled-targets-dir>
   e.g.: $0 3 \
    data/train_whole \
    exp/segmentation1a/tri3b_train_whole_targets \
    exp/segmentation1a/tri3b_train_whole_targets_sub3
EOF
  exit 1
fi

subsampling_factor=$1   # >1: subsample; <0: upsample by its absolute value
data=$2                 # data directory the targets correspond to
targets_dir=$3          # input targets directory (must contain targets.scp)
dir=$4                  # output directory for the resampled targets

# The input targets may already be subsampled relative to the features; in
# that case the factor is recorded in $targets_dir/frame_subsampling_factor.
# (The file name read here must be the same one that is tested for existence.)
frame_subsampling_factor=1
if [ -f $targets_dir/frame_subsampling_factor ]; then
  frame_subsampling_factor=$(cat $targets_dir/frame_subsampling_factor) || exit 1
fi

# Both the targets index and the feature index of the data directory are
# required inputs.
for required in $targets_dir/targets.scp $data/feats.scp; do
  [ -f $required ] && continue
  echo "$0: Could not find file $required"
  exit 1
done

# The input targets directory must be consistent with the data directory.
steps/segmentation/validate_targets_dir.sh $targets_dir $data || exit 1

mkdir -p $dir

# Split the targets index into $nj pieces (stored next to the input targets),
# one piece per parallel job.
mkdir -p $targets_dir/split$nj
split_scps=
for job in $(seq $nj); do
  split_scps="$split_scps $targets_dir/split${nj}/targets.$job.scp"
done
utils/split_scp.pl $targets_dir/targets.scp $split_scps

# Turn $dir into an absolute pathname (relative paths are resolved against
# the current working directory).
dir=$(perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $dir ${PWD})

# A factor of zero neither sub-samples nor up-samples; refuse it up front
# instead of handing it to subsample-feats.
if [ $subsampling_factor -eq 0 ]; then
  echo "$0: <subsampling-factor> must be a non-zero integer"
  exit 1
fi

if [ $subsampling_factor -eq 1 ]; then
  # Nothing to resample: the output directory simply reuses the original
  # index. No per-job targets.N.scp files are produced in this case, so
  # targets.scp must NOT be rebuilt from them afterwards.
  cp $targets_dir/targets.scp $dir || exit 1
  # frame_subsampling_factor is optional in the input directory.
  cp $targets_dir/frame_subsampling_factor $dir || true
else
  if [ $subsampling_factor -gt 1 ]; then
    # Sub-sampling: done by the python helper working on text archives.
    $cmd JOB=1:$nj $dir/log/resample_targets.JOB.log \
      copy-feats scp:$targets_dir/split${nj}/targets.JOB.scp ark,t:- \| \
      steps/segmentation/internal/resample_targets.py \
        --subsampling-factor=$subsampling_factor \
        - - \| \
      copy-feats ark,t:- ark,scp:$dir/targets.JOB.ark,$dir/targets.JOB.scp || exit 1

    # Each output frame now spans $subsampling_factor times more input frames.
    perl -e "print $frame_subsampling_factor * $subsampling_factor" > \
      $dir/frame_subsampling_factor || exit 1
  else
    # Up-sampling: subsample-feats repeats each frame -n times for negative n.
    $cmd JOB=1:$nj $dir/log/resample_targets.JOB.log \
      subsample-feats --n=$subsampling_factor \
        scp:$targets_dir/split${nj}/targets.JOB.scp \
        ark,scp:$dir/targets.JOB.ark,$dir/targets.JOB.scp || exit 1

    # The targets became -$subsampling_factor times longer, so the number of
    # feature frames per target frame shrinks by that amount (division, not
    # multiplication).
    perl -e "print $frame_subsampling_factor / (-$subsampling_factor)" > \
      $dir/frame_subsampling_factor || exit 1
  fi

  # Merge the per-job indexes into a single targets.scp.
  for n in $(seq $nj); do
    cat $dir/targets.$n.scp
  done > $dir/targets.scp || exit 1
fi

# Validate the directory that was just created; the input directory was
# already validated before resampling.
steps/segmentation/validate_targets_dir.sh $dir $data || exit 1

echo "$0: Resampled targets in $dir"
exit 0


================================================
FILE: egs/steps/segmentation/validate_targets_dir.sh
================================================
#!/usr/bin/env bash

# Copyright 2017  Vimal Manohar
# Apache 2.0

# This script validates a 'targets_dir' as created by lats_to_targets.sh.
# See that script for details about the format of the targets.

[ -f ./path.sh ] && . ./path.sh

if [ $# -ne 2 ]; then
  cat <<EOF
  This script validates a 'targets_dir' as created by lats_to_targets.sh.
  See that script for details about the format of the targets.

  Usage: steps/segmentation/validate_targets_dir.sh <targets-dir> <data-dir>
  e.g.: steps/segmentation/validate_targets_dir.sh \
    exp/segmentation1a/tri3b_train_split10s_targets \
    data/train_split10s
EOF
  exit 1
fi

targets_dir=$1   # directory containing targets.scp
data=$2          # data directory the targets were generated for

# Scratch directory for the length tables compared below. Stop right away if
# it cannot be created, rather than continuing with an empty $tmpdir and
# writing to paths under "/".
tmpdir=$(mktemp -d /tmp/kaldi.XXXX) || \
  { echo "$0: Could not create a temporary directory"; exit 1; }
# Remove the scratch directory however the script terminates.
trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM

# Byte-wise collation, so that 'sort' below is locale independent.
export LC_ALL=C

# Checks that the first field (the key) of every line of file $1 is in sorted
# order and that no key occurs twice. This is done by comparing the key column
# with its own 'sort | uniq' output; sorting follows the current locale.
# On failure a message is printed and the script exits with status 1;
# on success the function returns 0.
function check_sorted_and_uniq {
  if ! awk '{print $1}' "$1" | sort | uniq | cmp -s - <(awk '{print $1}' "$1"); then
    echo "$0: file $1 is not in sorted order or has duplicates"
    exit 1
  fi
  return 0
}

# Required inputs.
for f in $targets_dir/targets.scp $data/utt2spk; do 
  if [ ! -f $f ]; then
    echo "$0: Could not find $f"
    exit 1
  fi
done

utils/data/validate_data_dir.sh --no-text --no-wav --no-spk-sort \
  $data || exit 1

check_sorted_and_uniq $targets_dir/targets.scp

# Compare the number of utterances with the number of targets. A mismatch is
# only a warning, unless more than 5% of the targets are missing.
nu=`cat $data/utt2spk | wc -l` || exit 1
nt=`cat $targets_dir/targets.scp | wc -l` || exit 1
if [ $nt -ne $nu ]; then
  echo "WARNING: It seems not all of the targets files were successfully created in "
  echo "$targets_dir/targets.scp for $data ($nt != $nu)."
fi

if [ $nt -lt $((nu - (nu / 20))) ]; then
  echo "Less than 95% the targets were successfully generated.  Probably a serious error."
  exit 1
fi

# Spot-check the first 100 targets: get their number of frames and the number
# of frames of the corresponding features.
head -n 100 $targets_dir/targets.scp | sort -k1,1 | feat-to-len scp:- ark,t:$tmpdir/len.targets || exit 1
utils/filter_scp.pl $tmpdir/len.targets $data/feats.scp | sort -k1,1 | feat-to-len scp:- ark,t:$tmpdir/len.feats || exit 1

# Targets may be at a lower frame rate than the features.
frame_subsampling_factor=1
if [ -f $targets_dir/frame_subsampling_factor ]; then
  frame_subsampling_factor=$(cat $targets_dir/frame_subsampling_factor) || exit 1
fi

# Each line fed to python is "<utt> <len-targets> <utt> <len-feats>". After
# accounting for the frame subsampling factor, the two lengths may differ by
# at most 3 frames.
utils/filter_scp.pl $tmpdir/len.feats $tmpdir/len.targets | \
  paste -d ' ' - $tmpdir/len.feats | python -c "
import sys
for line in sys.stdin:
  parts = line.strip().split()
  if parts[0] != parts[2]:
    continue
  len_target = int(parts[1])
  len_feats = int(float(parts[3]) / $frame_subsampling_factor)
  diff = abs(len_target - len_feats)
  if diff > 3:
    sys.stderr.write('Mismatch in length for utterance {utt} between '
                     'targets and feats: {0} vs {1}; diff={2}\n'.format(
                      len_target, len_feats, diff, utt=parts[0]))
    sys.exit(1)" || exit 1

echo "$0: Successfully validated data-directory $data"


================================================
FILE: egs/steps/select_feats.sh
================================================
#!/usr/bin/env bash

# Copyright 2014  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0

# This script is deprecated. Use utils/data/limit_feature_dim.sh.

# This script selects some specified dimensions of the features in the
# input data directory.

# To be run from .. (one directory up from here)
# see ../run.sh for example

# Begin configuration section.
cmd=run.pl      # how to run jobs
nj=4            # number of parallel jobs
compress=true   # passed to copy-feats as --compress
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh
. parse_options.sh || exit 1;

# Between 3 and 5 positional arguments are accepted.
case $# in
  3|4|5) ;;
  *)
    echo "usage: $0 [options] <selector> <src-data-dir>  <dest-data-dir> [<log-dir> [<path-to-storage-dir>] ]";
    echo "e.g.: $0 0-12 data/train_mfcc_pitch data/train_mfcconly exp/select_pitch_train mfcc"
    echo "Note: <log-dir> defaults to <data-dir>/log, and <path-to-storage-dir> defaults to <data-dir>/data"
    echo "options: "
    echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
    exit 1;
    ;;
esac

selector="$1"
data_in=$2
data=$3

# The two optional arguments default to locations inside the destination
# data directory.
logdir=$data/log
ark_dir=$data/data
if [ $# -gt 3 ]; then logdir=$4; fi
if [ $# -gt 4 ]; then ark_dir=$5; fi

# Turn $ark_dir into an absolute pathname.
ark_dir=$(perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $ark_dir ${PWD})


utils/split_data.sh $data_in $nj || exit 1;

mkdir -p $ark_dir $logdir
mkdir -p $data

cp $data_in/* $data/ 2>/dev/null # so we get the other files, such as utt2spk.
# The copied cmvn.scp and feats.scp belong to the source features; feats.scp
# is regenerated below.
rm $data/cmvn.scp 2>/dev/null
rm $data/feats.scp 2>/dev/null

# use "name" as part of name of the archive.
name=`basename $data`

for j in $(seq $nj); do
  # the next command does nothing unless $ark_dir/storage/ exists, see
  # utils/create_data_link.pl for more info.
  utils/create_data_link.pl $ark_dir/selected_$name.$j.ark
done

# Select the requested dimensions and write them as (optionally compressed)
# archives, one archive + scp per job.
$cmd JOB=1:$nj $logdir/append.JOB.log \
   select-feats "$selector" scp:$data_in/split$nj/JOB/feats.scp ark:- \| \
   copy-feats --compress=$compress ark:- \
    ark,scp:$ark_dir/selected_$name.JOB.ark,$ark_dir/selected_$name.JOB.scp || exit 1;

# concatenate the .scp files together. The loop's stdout is redirected once;
# the individual 'cat' commands must not also append to the same file.
for ((n=1; n<=nj; n++)); do
  cat $ark_dir/selected_$name.$n.scp || exit 1;
done > $data/feats.scp || exit 1;


# Every utterance must have received features.
nf=`cat $data/feats.scp | wc -l`
nu=`cat $data/utt2spk | wc -l`
if [ $nf -ne $nu ]; then
  echo "It seems not all of the feature files were successfully processed ($nf != $nu);"
  exit 1;
fi

echo "Succeeded selecting features for $name into $data"


================================================
FILE: egs/steps/shift_feats.sh
================================================
#!/usr/bin/env bash

# Copyright 2016    Vimal Manohar
# Apache 2.0

# This script is deprecated. The newer script utils/data/shift_feats.sh
# should be used instead.

# This script shifts the feats in the input data directory and creates a
# new directory <input-data>_fs<num-frames-shift> with shifted feats.
# If the shift is negative, the initial frames get truncated and the
# last frame repeated; if positive, vice versa.
# Used to prepare data for sequence training of models with
# frame_subsampling_factor != 1 (e.g. chain models).

# To be run from .. (one directory up from here)
# see ../run.sh for example

# Begin configuration section.
cmd=run.pl      # how to run jobs
nj=4            # number of parallel jobs
compress=true   # passed to copy-feats as --compress
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh
. parse_options.sh || exit 1;

if [ $# -ne 4 ]; then
   printf '%s\n' \
     "This script is deprecated. The newer script utils/data/shift_feats.sh" \
     "should be used instead." \
     "usage: $0 [options] <frame-shift> <src-data-dir> <log-dir> <path-to-storage-dir>" \
     "e.g.: $0 -1 data/train exp/shift-1_train mfcc" \
     "options: " \
     "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   exit 1;
fi

num_frames_shift=$1
data_in=$2
logdir=$3
featdir=$4

# Utterance and speaker ids of the shifted copy both get the prefix
# "fs<num-frames-shift>-".
shift_tag="fs${num_frames_shift}-"
utt_prefix=$shift_tag
spk_prefix=$shift_tag

# Turn $featdir into an absolute pathname.
featdir=$(perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $featdir ${PWD})

utils/split_data.sh $data_in $nj || exit 1;

# The output data directory sits next to the input one.
data=${data_in}_fs$num_frames_shift

mkdir -p $featdir $logdir
mkdir -p $data

utils/copy_data_dir.sh --utt-prefix $utt_prefix --spk-prefix $spk_prefix \
  $data_in $data

# feats.scp is regenerated below from the shifted features.
rm $data/feats.scp 2>/dev/null

# "name" becomes part of the archive file names.
name=$(basename $data)

for job in $(seq $nj); do
  # the next command does nothing unless $featdir/storage/ exists, see
  # utils/create_data_link.pl for more info.
  utils/create_data_link.pl $featdir/raw_feats_$name.$job.ark
done

# Shift the features of each split and store them as archives.
$cmd JOB=1:$nj $logdir/shift.JOB.log \
  shift-feats --shift=$num_frames_shift \
  scp:$data_in/split$nj/JOB/feats.scp ark:- \| \
  copy-feats --compress=$compress ark:- \
  ark,scp:$featdir/raw_feats_$name.JOB.ark,$featdir/raw_feats_$name.JOB.scp || exit 1;

# Concatenate the per-job .scp files, prepending the utterance prefix to each
# line so that the keys match the ids written by copy_data_dir.sh.
for n in $(seq $nj); do
  cat $featdir/raw_feats_$name.$n.scp
done | awk -v prefix=$utt_prefix '{print prefix $0}' > $data/feats.scp || exit 1;

# Every utterance must have received features.
nf=$(cat $data/feats.scp | wc -l)
nu=$(cat $data/utt2spk | wc -l)
if [ $nf -ne $nu ]; then
  echo "It seems not all of the feature files were successfully processed ($nf != $nu);"
  exit 1;
fi

echo "Succeeded shifting features for $name into $data"


================================================
FILE: egs/steps/subset_ali_dir.sh
================================================
#!/usr/bin/env bash

# Copyright 2017  Vimal Manohar
# Apache 2.0.

cmd=run.pl   # how to run jobs

[ -f ./path.sh ] && . ./path.sh

. ./utils/parse_options.sh

if [ $# -ne 4 ]; then
  cat <<EOF
  This script creates an alignment directory containing a subset of 
  utterances contained in <subset-data-dir> from the 
  original alignment directory containing alignments for utterances in
  <full-data-dir>.

  The number of split jobs in the output alignment directory is 
  equal to the number of jobs in the original alignment directory, 
  unless the subset data directory has too few speakers.

  Usage: $0 [options] <full-data-dir> <subset-data-dir> <ali-dir> <subset-ali-dir>
   e.g.: $0 data/train_sp data/train exp/tri3_ali_sp exp/tri3_ali

  Options: 
      --cmd (utils/run.pl|utils/queue.pl <queue opts>)  # how to run jobs.
EOF
  exit 1
fi

data=$1
subset_data=$2
ali_dir=$3
dir=$4

# Start with the number of jobs of the source alignment directory.
nj=$(cat $ali_dir/num_jobs) || exit 1
utils/split_data.sh $data $nj

# Copy the model files; any of them that does not exist is skipped.
mkdir -p $dir
cp $ali_dir/{final.mdl,*.mat,*_opts,tree} $dir/ || true
cp -r $ali_dir/phones $dir 2>/dev/null || true

# Unpack all source alignments into temporary archives with an scp index, so
# that they can be looked up by utterance id.
$cmd JOB=1:$nj $dir/log/copy_alignments.JOB.log \
  copy-int-vector "ark:gunzip -c $ali_dir/ali.JOB.gz |" \
  ark,scp:$dir/ali_tmp.JOB.ark,$dir/ali_tmp.JOB.scp || exit 1

for job in $(seq $nj); do
  cat $dir/ali_tmp.$job.scp 
done > $dir/ali_tmp.scp

# Use fewer jobs if the subset has fewer speakers than jobs.
num_spk=$(cat $subset_data/spk2utt | wc -l)
[ $num_spk -lt $nj ] && nj=$num_spk

# Keep only the alignments of utterances in each split of the subset and
# write them out in the usual gzipped form.
utils/split_data.sh $subset_data $nj
$cmd JOB=1:$nj $dir/log/filter_alignments.JOB.log \
  copy-int-vector \
  "scp:utils/filter_scp.pl $subset_data/split${nj}/JOB/utt2spk $dir/ali_tmp.scp |" \
  "ark:| gzip -c > $dir/ali.JOB.gz" || exit 1

echo $nj > $dir/num_jobs

# Remove the temporary archives and indexes.
rm $dir/ali_tmp.*.{ark,scp} $dir/ali_tmp.scp

exit 0


================================================
FILE: egs/steps/tandem/align_fmllr.sh
================================================
#!/usr/bin/env bash
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
#                 Korbinian Riedhammer
# Apache 2.0

# Computes training alignments; assumes features are (LDA+MLLT or delta+delta-delta)
# + fMLLR (probably with SAT models).
# It first computes an alignment with the final.alimdl (or the final.mdl if final.alimdl
# is not present), then does 2 iterations of fMLLR estimation.

# If you supply the --use-graphs option, it will use the training
# graphs from the source directory (where the model is).  In this
# case the number of jobs must match the source directory.


# Begin configuration section.
# All of these can be overridden from the command line via parse_options.sh.
stage=0   # the steps below are guarded by "[ $stage -le N ]"; a larger value
          # skips the earlier steps.
nj=4      # number of parallel jobs
cmd=run.pl  # how to run jobs
use_graphs=false  # if true, reuse the training graphs found in <src-dir>
# Alignment options.
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
beam=10
retry_beam=40
boost_silence=1.0 # factor by which to boost silence during alignment.
fmllr_update_type=full
# End configuration options.

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh # source the path.
. parse_options.sh || exit 1;

if [ $# != 5 ]; then
   echo "usage: steps/tandem/align_fmllr.sh <data1-dir> <data2-dir> <lang-dir> <src-dir> <align-dir>"
   echo "e.g.:  steps/tandem/align_fmllr.sh {mfcc,bottleneck}/data/train data/lang exp/tri1 exp/tri1_ali"
   echo "main options (for others, see top of script file)"
   echo "  --config <config-file>                           # config containing options"
   echo "  --nj <nj>                                        # number of parallel jobs"
   echo "  --use-graphs true                                # use graphs in src-dir"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   echo "  --fmllr-update-type (full|diag|offset|none)      # default full."
   exit 1;
fi

data1=$1   # data directory of feature stream 1
data2=$2   # data directory of feature stream 2
lang=$3    # lang directory
srcdir=$4  # directory of the model used for alignment
dir=$5     # output alignment directory

# Integer id used for out-of-vocabulary words, and the colon-separated list
# of silence phones.
oov=`cat $lang/oov.int` || exit 1;
silphonelist=`cat $lang/phones/silence.csl` || exit 1;

mkdir -p $dir/log
echo $nj > $dir/num_jobs

# The phone set of the lang directory must be compatible with the model's.
utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

# Split both data directories into $nj parts, unless an existing split
# directory is newer than the corresponding feats.scp.
sdata1=$data1/split$nj
sdata2=$data2/split$nj
[[ -d $sdata1 && $data1/feats.scp -ot $sdata1 ]] || split_data.sh $data1 $nj || exit 1;
[[ -d $sdata2 && $data2/feats.scp -ot $sdata2 ]] || split_data.sh $data2 $nj || exit 1;

# Copy the tree and model into the output directory; a failure to copy
# final.occs is not treated as fatal.
cp $srcdir/{tree,final.mdl} $dir || exit 1;
cp $srcdir/final.occs $dir;

# Set up features.
# Both option files are optional; the variables are empty if they are missing.
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
normft2=`cat $srcdir/normft2 2>/dev/null`

# The presence of final.mat in the source directory selects the "lda" setup.
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi

case $feat_type in
  delta)
    echo "$0: feature type is $feat_type"
    ;;
  lda)
    echo "$0: feature type is $feat_type"
    # Both transforms are applied below from the copies in $dir.
    cp $srcdir/{lda,final}.mat $dir/ || exit 1;
    ;;
  *) echo "$0: invalid feature type $feat_type" && exit 1;
esac

# set up feature stream 1;  these are usually spectral features, so we will add
# deltas or splice them
# (JOB in the strings below is replaced by the job number by $cmd.)
feats1="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata1/JOB/utt2spk scp:$sdata1/JOB/cmvn.scp scp:$sdata1/JOB/feats.scp ark:- |"

if [ "$feat_type" == "delta" ]; then
  feats1="$feats1 add-deltas ark:- ark:- |"
elif [ "$feat_type" == "lda" ]; then
  feats1="$feats1 splice-feats $splice_opts ark:- ark:- | transform-feats $dir/lda.mat ark:- ark:- |"
fi

# set up feature stream 2;  these are usually bottleneck or posterior features,
# which may be normalized if desired
feats2="scp:$sdata2/JOB/feats.scp"

# Mean normalization of stream 2 is applied only if <src-dir>/normft2 contains
# "true".
if [ "$normft2" == "true" ]; then
  feats2="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata2/JOB/utt2spk scp:$sdata2/JOB/cmvn.scp $feats2 ark:- |"
fi

# assemble tandem features: the two streams are pasted together; these are
# the speaker-independent ("si") features.
sifeats="ark,s,cs:paste-feats '$feats1' '$feats2' ark:- |"

# add transformation, if applicable
if [ "$feat_type" == "lda" ]; then
  sifeats="$sifeats transform-feats $dir/final.mat ark:- ark:- |"
fi

# splicing/normalization options: copy whichever of these files exist.
cp $srcdir/{splice_opts,normft2,tandem} $dir 2>/dev/null


## Set up model and alignment model.
# final.alimdl is used for the first alignment pass if it exists; otherwise
# final.mdl is used for both passes.
mdl=$srcdir/final.mdl
if [ -f $srcdir/final.alimdl ]; then
  alimdl=$srcdir/final.alimdl
else
  alimdl=$srcdir/final.mdl
fi
[ ! -f $mdl ] && echo "$0: no such model $mdl" && exit 1;
# Both models are read through gmm-boost-silence, applied to the optional
# silence phones with the factor $boost_silence.
alimdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $alimdl - |"
mdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $mdl - |"


## Work out where we're getting the graphs from.
if $use_graphs; then
  # Reusing graphs requires the same number of jobs as in the source directory.
  [ "$nj" != "`cat $srcdir/num_jobs`" ] && \
    echo "$0: you specified --use-graphs true, but #jobs mismatch." && exit 1;
  [ ! -f $srcdir/fsts.1.gz ] && echo "No graphs in $srcdir" && exit 1;
  graphdir=$srcdir
else
  graphdir=$dir
  if [ $stage -le 0 ]; then
    echo "$0: compiling training graphs"
    # Transcripts of stream 1's data directory mapped to integers; words that
    # are not in words.txt are mapped to $oov.
    tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata1/JOB/text|";
    $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log  \
      compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/final.mdl  $lang/L.fst "$tra" \
        "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
  fi
fi


# Stage 1: first-pass alignment with the alignment model on the
# speaker-independent features.
if [ $stage -le 1 ]; then
  echo "$0: aligning data in $data1 ($data2) using $alimdl and speaker-independent features."
  $cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \
    gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$alimdl_cmd" \
    "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$sifeats" "ark:|gzip -c >$dir/pre_ali.JOB.gz" || exit 1;
fi

# Stage 2: estimate per-speaker fMLLR transforms from the first-pass
# alignments; silence posteriors are given weight 0.0.
if [ $stage -le 2 ]; then
  echo "$0: computing fMLLR transforms"
  if [ "$alimdl" != "$mdl" ]; then
    # A separate alignment model exists: convert the posteriors to Gaussian
    # level posteriors with $alimdl and estimate the transforms for $mdl.
    $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \
      ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \
      weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \
      gmm-post-to-gpost $alimdl "$sifeats" ark:- ark:- \| \
      gmm-est-fmllr-gpost --fmllr-update-type=$fmllr_update_type \
      --spk2utt=ark:$sdata1/JOB/spk2utt $mdl "$sifeats" \
      ark,s,cs:- ark:$dir/trans.JOB || exit 1;
  else
    # The same model is used for alignment and estimation.
    $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \
      ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \
      weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \
      gmm-est-fmllr --fmllr-update-type=$fmllr_update_type \
      --spk2utt=ark:$sdata1/JOB/spk2utt $mdl "$sifeats" \
      ark,s,cs:- ark:$dir/trans.JOB || exit 1;
  fi
fi

# Speaker-adapted features: the tandem features followed by the per-speaker
# transforms written to $dir/trans.JOB.
feats="$sifeats transform-feats --utt2spk=ark:$sdata1/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |"

# Stage 3: final alignment with the main model on the adapted features.
if [ $stage -le 3 ]; then
  echo "$0: doing final alignment."
  $cmd JOB=1:$nj $dir/log/align_pass2.JOB.log \
    gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$mdl_cmd" \
    "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi

# The first-pass alignments are no longer needed.
rm $dir/pre_ali.*.gz

echo "$0: done aligning data."

utils/summarize_warnings.pl $dir/log

exit 0;


================================================
FILE: egs/steps/tandem/align_sgmm2.sh
================================================
#!/usr/bin/env bash
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
#                 Korbinian Riedhammer
# Apache 2.0

# Computes training alignments and (if needed) speaker-vectors, given an
# SGMM system.  If the system is built on top of SAT, you should supply
# transforms with the --transform-dir option.

# If you supply the --use-graphs option, it will use the training
# graphs from the source directory.

# Begin configuration section.
# Defaults for the command-line options; the usage text below lists the main
# ones (--nj, --use-graphs, --transform-dir, --cmd, ...).
cmd=run.pl
nj=4
stage=0
beam=10
retry_beam=40
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
gselect=15          # Number of Gaussian-selection indices for SGMMs.
use_graphs=false    # If true, use the training graphs from srcdir.
use_gselect=false   # If true, use the gselect info from srcdir.  Gaussian-
                    # selection info is used either way; when this is false
                    # it gets computed by this script.
transform_dir=      # Directory to find fMLLR transforms in.
# End configuration options.

echo "$0 $@"  # Print the command line for logging

# Source path.sh when the current directory has one, then let
# parse_options.sh process the command-line options.
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1

# Exactly five positional arguments must remain after option parsing.
if [ $# -ne 5 ]; then
  echo "usage: steps/tandem/align_sgmm2.sh <data-dir1> <data-dir2> <lang-dir> <src-dir> <align-dir>"
  echo "e.g.:  steps/tandem/align_sgmm2.sh --transform-dir exp/tri3b {mfcc,bottleneck}/data/train data/lang \\"
  echo "           exp/sgmm4a exp/sgmm5a_ali"
  echo "main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config containing options"
  echo "  --nj <nj>                                        # number of parallel jobs"
  echo "  --use-graphs true                                # use graphs in src-dir"
  echo "  --transform-dir <transform-dir>                  # directory to find fMLLR transforms"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  exit 1
fi

# Positional arguments: the two feature streams' data directories, the lang
# directory, the directory of the source model, and the output directory.
data1=$1
data2=$2
lang=$3
srcdir=$4
dir=$5

# Contents of the lang directory's oov.int and silence-phone list; both files
# are required.
oov=$(cat $lang/oov.int) || exit 1
silphonelist=$(cat $lang/phones/silence.csl) || exit 1
# Create the output/log directory and record the number of jobs used here.
mkdir -p $dir/log
echo $nj > $dir/num_jobs

# The phones.txt of the lang dir and of the source model dir must pass the
# compatibility check; the lang dir's copy is then stored with the alignments.
utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

# Per-job split directories of both data dirs.  split_data.sh is run unless
# the split directory already exists and feats.scp is older than it.
sdata1=$data1/split$nj
sdata2=$data2/split$nj
[[ -d $sdata1 && $data1/feats.scp -ot $sdata1 ]] || split_data.sh $data1 $nj || exit 1;
[[ -d $sdata2 && $data2/feats.scp -ot $sdata2 ]] || split_data.sh $data2 $nj || exit 1;

# Copy the tree and model(s) from the source dir.  tree and final.mdl are
# required, final.alimdl is optional, and the final.occs copy is not checked.
cp $srcdir/{tree,final.mdl} $dir || exit 1;
[ -f $srcdir/final.alimdl ] && cp $srcdir/final.alimdl $dir
cp $srcdir/final.occs $dir;

## Set up features.

# Both files are optional: a missing file just leaves the variable empty.
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
normft2=`cat $srcdir/normft2 2>/dev/null`

# The presence of final.mat in the source dir selects the "lda" feature type.
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi

case $feat_type in
  delta)
    echo "$0: feature type is $feat_type"
    ;;
  lda)
    echo "$0: feature type is $feat_type"
    # both lda.mat (applied to stream 1) and final.mat (applied to the pasted
    # features) are needed below.
    cp $srcdir/{lda,final}.mat $dir/ || exit 1;
    ;;
  *) echo "$0: invalid feature type $feat_type" && exit 1;
esac

# set up feature stream 1;  these are usually spectral features, so we will add
# deltas or splice them.  (JOB in the paths below is a placeholder; presumably
# $cmd substitutes the job index for it.)
feats1="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata1/JOB/utt2spk scp:$sdata1/JOB/cmvn.scp scp:$sdata1/JOB/feats.scp ark:- |"

if [ "$feat_type" == "delta" ]; then
  feats1="$feats1 add-deltas ark:- ark:- |"
elif [ "$feat_type" == "lda" ]; then
  feats1="$feats1 splice-feats $splice_opts ark:- ark:- | transform-feats $dir/lda.mat ark:- ark:- |"
fi

# set up feature stream 2;  these are usually bottleneck or posterior features,
# which may be normalized if desired (i.e. if the normft2 file contains "true")
feats2="scp:$sdata2/JOB/feats.scp"

if [ "$normft2" == "true" ]; then
  feats2="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata2/JOB/utt2spk scp:$sdata2/JOB/cmvn.scp $feats2 ark:- |"
fi

# assemble tandem features: the two streams are pasted together
feats="ark,s,cs:paste-feats '$feats1' '$feats2' ark:- |"

# add transformation, if applicable
if [ "$feat_type" == "lda" ]; then
  feats="$feats transform-feats $dir/final.mat ark:- ark:- |"
fi

# splicing/normalization options; these files are optional, so errors from
# cp are discarded.
cp $srcdir/{splice_opts,normft2,tandem} $dir 2>/dev/null

# Append the per-speaker transforms from --transform-dir, which must contain
# trans.1 and have been produced with the same number of jobs.  Without
# --transform-dir, only warn if the source model's accumulation log suggests
# it was trained on transformed features.
if [ ! -z "$transform_dir" ]; then
  echo "$0: using transforms from $transform_dir"
  [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1;
  [ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \
    && echo "$0: #jobs mismatch with transform-dir." && exit 1;
  feats="$feats transform-feats --utt2spk=ark:$sdata1/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |"
elif grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then
  echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms,"
  echo "  but you are not providing the --transform-dir option during alignment."
fi
##

## Set up model and alignment model.
# $alimdl is final.alimdl when the source dir provides one; otherwise it is
# the same file as $mdl, which selects the single-pass branch further down.
mdl=$srcdir/final.mdl
if [ -f $srcdir/final.alimdl ]; then
  alimdl=$srcdir/final.alimdl
else
  alimdl=$srcdir/final.mdl
fi
[ ! -f $mdl ] && echo "$0: no such model $mdl" && exit 1;

## Work out where we're getting the graphs from.
# $graphdir is the directory the alignment stages read fsts.JOB.gz from.
if $use_graphs; then
  # Reusing the source graphs only works with an identical job split.
  [ "$nj" != "`cat $srcdir/num_jobs`" ] && \
    echo "$0: you specified --use-graphs true, but #jobs mismatch." && exit 1;
  [ ! -f $srcdir/fsts.1.gz ] && echo "No graphs in $srcdir" && exit 1;
  graphdir=$srcdir
  ln.pl $srcdir/fsts.*.gz $dir
else
  graphdir=$dir
  if [ $stage -le 0 ]; then
    echo "$0: compiling training graphs"
    # Transcripts mapped to integer word ids, with OOV words mapped to $oov.
    tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata1/JOB/text|";
    $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log  \
      compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/final.mdl  $lang/L.fst "$tra" \
        "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
  fi
fi

## Work out where we're getting the Gaussian-selection info from
# This section only decides $gselect_opt.  It must leave $graphdir alone:
# overwriting it here would make "--use-gselect true" without "--use-graphs
# true" read the graphs from $srcdir instead of the ones compiled above.
if $use_gselect; then
  [ "$nj" != "`cat $srcdir/num_jobs`" ] && \
    echo "$0: you specified --use-gselect true, but #jobs mismatch." && exit 1;
  [ ! -f $srcdir/gselect.1.gz ] && echo "No gselect info in $srcdir" && exit 1;
  gselect_opt="--gselect=ark:gunzip -c $srcdir/gselect.JOB.gz|"
  ln.pl $srcdir/gselect.*.gz $dir
else
  if [ $stage -le 1 ]; then
    echo "$0: computing Gaussian-selection info"
    # Note: doesn't matter whether we use $alimdl or $mdl, they will
    # have the same gselect info.
    $cmd JOB=1:$nj $dir/log/gselect.JOB.log \
      sgmm2-gselect --full-gmm-nbest=$gselect $alimdl \
      "$feats" "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1;
  fi
  gselect_opt="--gselect=ark:gunzip -c $dir/gselect.JOB.gz|"
fi


# No separate alignment model: align once with $mdl and stop.  This is only
# valid when the model has an empty speaker-vector space.
if [ "$alimdl" == "$mdl" ]; then
  # Speaker-independent decoding-- just one pass.  Not normal.
  T=`sgmm2-info $mdl | grep 'speaker vector space' | awk '{print $NF}'` || exit 1;
  [ "$T" -ne 0 ] && echo "No alignment model, yet speaker vector space nonempty" && exit 1;

  if [ $stage -le 2 ]; then
    # The script takes two data directories, so both are named in the log
    # message (there is no single $data variable).
    echo "$0: aligning data in $data1 and $data2 using model $mdl (no speaker-vectors)"
    $cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \
      sgmm2-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam $alimdl \
      "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
  fi
  echo "$0: done aligning data."
  exit 0;
fi

# Continue with system with speaker vectors.
# Stage 2: first-pass alignment with the alignment model, written to
# pre_ali.JOB.gz (removed again at the end of the script).
if [ $stage -le 2 ]; then
  echo "$0: aligning data in $data1 and $data2 using model $alimdl"
  $cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \
    sgmm2-align-compiled $scale_opts "$gselect_opt" --beam=$beam --retry-beam=$retry_beam $alimdl \
    "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/pre_ali.JOB.gz" || exit 1;
fi

# Stage 3: first estimate of the speaker vectors from the first-pass
# alignments, with silence posteriors weighted by 0.0; written to pre_vecs.JOB.
if [ $stage -le 3 ]; then
  echo "$0: computing speaker vectors (1st pass)"
  $cmd JOB=1:$nj $dir/log/spk_vecs1.JOB.log \
    ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \
    weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \
    sgmm2-post-to-gpost "$gselect_opt" $alimdl "$feats" ark:- ark:- \| \
    sgmm2-est-spkvecs-gpost --spk2utt=ark:$sdata1/JOB/spk2utt \
     $mdl "$feats" ark,s,cs:- ark:$dir/pre_vecs.JOB || exit 1;
fi

# Stage 4: second estimate, starting from pre_vecs.JOB and writing the final
# vecs.JOB; the intermediate vectors are deleted afterwards.
if [ $stage -le 4 ]; then
  echo "$0: computing speaker vectors (2nd pass)"
  $cmd JOB=1:$nj $dir/log/spk_vecs2.JOB.log \
    ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \
    weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \
    sgmm2-est-spkvecs --spk2utt=ark:$sdata1/JOB/spk2utt "$gselect_opt" \
     --spk-vecs=ark:$dir/pre_vecs.JOB $mdl "$feats" ark,s,cs:- ark:$dir/vecs.JOB || exit 1;
  rm $dir/pre_vecs.*
fi

# Stage 5: final alignment with the full model and the speaker vectors.
if [ $stage -le 5 ]; then
  echo "$0: doing final alignment."
  $cmd JOB=1:$nj $dir/log/align_pass2.JOB.log \
    sgmm2-align-compiled $scale_opts "$gselect_opt" --beam=$beam --retry-beam=$retry_beam \
     --utt2spk=ark:$sdata1/JOB/utt2spk --spk-vecs=ark:$dir/vecs.JOB \
     $mdl "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi

# The first-pass alignments are no longer needed.
rm $dir/pre_ali.*.gz

echo "$0: done aligning data."

utils/summarize_warnings.pl $dir/log

exit 0;


================================================
FILE: egs/steps/tandem/align_si.sh
================================================
#!/usr/bin/env bash
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
#                 Korbinian Riedhammer
# Apache 2.0

# Computes training alignments using a model with delta or
# LDA+MLLT features.

# If you supply the "--use-graphs true" option, it will use the training
# graphs from the source directory (where the model is).  In this
# case the number of jobs must match with the source directory.


# Begin configuration section.
# Defaults for the command-line options (the usage text below lists the main
# ones).
cmd=run.pl
nj=4
use_graphs=false
beam=10
retry_beam=40
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
boost_silence=1.0 # Factor by which to boost silence during alignment.
# End configuration options.

echo "$0 $@"  # Print the command line for logging

# Source path.sh when the current directory has one, then let
# parse_options.sh process the command-line options.
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1

# Exactly five positional arguments must remain after option parsing.
if [ $# -ne 5 ]; then
  echo "usage: steps/tandem/align_si.sh <data1-dir> <data2-dir> <lang-dir> <src-dir> <align-dir>"
  echo "e.g.:  steps/tandem/align_si.sh {mfcc,bottleneck}/data/train data/lang exp/tri1 exp/tri1_ali"
  echo "main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config containing options"
  echo "  --nj <nj>                                        # number of parallel jobs"
  echo "  --use-graphs true                                # use graphs in src-dir"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  exit 1
fi

# Positional arguments: the two feature streams' data directories, the lang
# directory, the directory of the source model, and the output directory.
data1=$1
data2=$2
lang=$3
srcdir=$4
dir=$5

# Contents of the lang directory's oov.int (required), then the output/log
# directory and the record of the job count.
oov=$(cat $lang/oov.int) || exit 1
mkdir -p $dir/log
echo $nj > $dir/num_jobs

# The phones.txt of the lang dir and of the source dir must pass the
# compatibility check; the lang dir's copy is stored with the alignments.
if ! utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt; then
  exit 1
fi
cp $lang/phones.txt $dir || exit 1
# Set up the features

# Per-job split directories of both data dirs.  split_data.sh is run unless
# the split directory already exists and feats.scp is older than it.
sdata1=$data1/split$nj
sdata2=$data2/split$nj
[[ -d $sdata1 && $data1/feats.scp -ot $sdata1 ]] || split_data.sh $data1 $nj || exit 1;
[[ -d $sdata2 && $data2/feats.scp -ot $sdata2 ]] || split_data.sh $data2 $nj || exit 1;

# Copy the tree and model from the source dir (required); the final.occs copy
# is not checked.
cp $srcdir/{tree,final.mdl} $dir || exit 1;
cp $srcdir/final.occs $dir;

# Get some info on the feature types
# splice_opts is optional (empty when the file is missing).  normft2 is
# required by this script; since cat's own error output is discarded, say
# explicitly why we stop instead of exiting without any message.
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
normft2=`cat $srcdir/normft2 2>/dev/null` || \
  { echo "$0: could not read $srcdir/normft2"; exit 1; }

# The presence of final.mat in the source dir selects the "lda" feature type.
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi

# for lda-type features, we need to copy both the lda (for baseft) and mllt
# transformation (for the pasted features)
case $feat_type in
  delta)
    echo "$0: feature type is $feat_type"
    ;;
  lda)
    echo "$0: feature type is $feat_type"
    cp $srcdir/{lda,final}.mat $dir/ || exit 1;
   ;;
  *) echo "$0: invalid feature type $feat_type" && exit 1;
esac

# set up feature stream 1;  these are usually spectral features, so we will add
# deltas or splice them.  (JOB in the paths below is a placeholder; presumably
# $cmd substitutes the job index for it.)
feats1="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata1/JOB/utt2spk scp:$sdata1/JOB/cmvn.scp scp:$sdata1/JOB/feats.scp ark:- |"

if [ "$feat_type" == "delta" ]; then
  feats1="$feats1 add-deltas ark:- ark:- |"
elif [ "$feat_type" == "lda" ]; then
  feats1="$feats1 splice-feats $splice_opts ark:- ark:- | transform-feats $dir/lda.mat ark:- ark:- |"
fi

# set up feature stream 2;  these are usually bottleneck or posterior features,
# which may be normalized if desired (i.e. if the normft2 file contains "true")
feats2="scp:$sdata2/JOB/feats.scp"

if [ "$normft2" == "true" ]; then
  feats2="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata2/JOB/utt2spk scp:$sdata2/JOB/cmvn.scp $feats2 ark:- |"
fi

# assemble tandem features: the two streams are pasted together
feats="ark,s,cs:paste-feats '$feats1' '$feats2' ark:- |"

# add transformation, if applicable
if [ "$feat_type" == "lda" ]; then
  feats="$feats transform-feats $dir/final.mat ark:- ark:- |"
fi

# splicing/normalization options; missing files are tolerated here, so errors
# from cp are discarded.
cp $srcdir/{tandem,splice_opts,normft2} $dir 2>/dev/null

# The script takes two data directories, so both are named in the log message
# (there is no single $data variable).
echo "$0: aligning data in $data1 and $data2 using model from $srcdir, putting alignments in $dir"

# The model is read through gmm-boost-silence, which applies $boost_silence to
# the phones listed in the lang directory's optional_silence.csl.
mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/final.mdl - |"

if $use_graphs; then
  # Reusing the source directory's graphs requires an identical job split.
  [ $nj != "`cat $srcdir/num_jobs`" ] && echo "$0: mismatch in num-jobs" && exit 1;
  [ ! -f $srcdir/fsts.1.gz ] && echo "$0: no such file $srcdir/fsts.1.gz" && exit 1;

  $cmd JOB=1:$nj $dir/log/align.JOB.log \
    gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$mdl" \
      "ark:gunzip -c $srcdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
else
  # Transcripts mapped to integer word ids, with OOV words mapped to $oov.
  tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata1/JOB/text|";
  # We could just use gmm-align in the next line, but it's less efficient as it compiles the
  # training graphs one by one.
  # Note: this branch writes the alignments with "ark,t:", whereas the
  # --use-graphs branch above writes them with plain "ark:".
  $cmd JOB=1:$nj $dir/log/align.JOB.log \
    compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/final.mdl  $lang/L.fst "$tra" ark:- \| \
    gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$mdl" ark:- \
      "$feats" "ark,t:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi

echo "$0: done aligning data."


================================================
FILE: egs/steps/tandem/decode.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0

# Begin configuration section.
# Defaults for the command-line options (the usage text below lists the main
# ones).
transform_dir= # dir to find fMLLR transforms in (empty: none are applied)
iter= # if set (and --model is not), decode with $srcdir/$iter.mdl
model= # You can specify the model to use (e.g. if you want to use the .alimdl)
nj=4
cmd=run.pl
max_active=7000
beam=13.0
lattice_beam=6.0
acwt=0.083333 # note: only really affects pruning (scoring is on lattices).
min_lmwt=9 # passed on to local/score.sh
max_lmwt=20 # passed on to local/score.sh
skip_scoring=false # if true, local/score.sh is not run
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

# Exactly four positional arguments must remain after option parsing.
if [ $# != 4 ]; then
   echo "Usage: steps/tandem/decode.sh [options] <graph-dir> <data1-dir> <data2-dir> <decode-dir>"
   echo "... where <decode-dir> is assumed to be a sub-directory of the directory"
   echo " where the model is."
   echo "e.g.: steps/tandem/decode.sh exp/mono/graph {mfcc,bottleneck}/data/test_dev93 exp/mono/decode_dev93"
   echo ""
   echo "This script works on CMN + (delta+delta-delta | LDA+MLLT) features; it works out"
   echo "what type of features you used (assuming it's one of these two)"
   echo ""
   echo "main options (for others, see top of script file)"
   echo "  --config <config-file>                           # config containing options"
   echo "  --nj <nj>                                        # number of parallel jobs"
   echo "  --iter <iter>                                    # Iteration of model to test."
   echo "  --model <model>                                  # which model to use (e.g. to"
   echo "                                                   # specify the final.alimdl)"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   echo "  --transform-dir <trans-dir>                      # dir to find fMLLR transforms "
   echo "  --acwt <float>                                   # acoustic scale used for lattice generation "
   echo "  --min-lmwt <int>                                 # minumum LM-weight for lattice rescoring "
   echo "  --max-lmwt <int>                                 # maximum LM-weight for lattice rescoring "
   echo "                                                   # speaker-adapted decoding"
   exit 1;
fi


# Positional arguments: the graph directory, the two feature streams' data
# directories, and the decoding output directory.
graphdir=$1
data1=$2
data2=$3
dir=$4
srcdir=`dirname $dir`; # The model directory is one level up from decoding directory.

mkdir -p $dir/log

# Per-job split directories of both data dirs.  split_data.sh is run unless
# the split directory already exists and feats.scp is older than it.
sdata1=$data1/split$nj;
sdata2=$data2/split$nj;
[[ -d $sdata1 && $data1/feats.scp -ot $sdata1 ]] || split_data.sh $data1 $nj || exit 1;
[[ -d $sdata2 && $data2/feats.scp -ot $sdata2 ]] || split_data.sh $data2 $nj || exit 1;

echo $nj > $dir/num_jobs

# Model choice: --model if given, else $srcdir/$iter.mdl if --iter was given,
# else $srcdir/final.mdl.
if [ -z "$model" ]; then # if --model <mdl> was not specified on the command line...
  if [ -z $iter ]; then model=$srcdir/final.mdl;
  else model=$srcdir/$iter.mdl; fi
fi

# All of these inputs must exist before decoding starts (only the first
# split of each data dir is checked).
for f in $sdata1/1/feats.scp $sdata1/1/cmvn.scp $sdata2/1/feats.scp $model $graphdir/HCLG.fst; do
  [ ! -f $f ] && echo "decode.sh: no such file $f" && exit 1;
done

# Set up features.

# Get some info on the feature types
# Both files are optional: a missing file just leaves the variable empty.
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
normft2=`cat $srcdir/normft2 2>/dev/null`


# The presence of final.mat in the model dir selects the "lda" feature type.
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "decode.sh: feature type is $feat_type";

# Note: the feature type is logged a second time here; the case statement
# otherwise only rejects unexpected values.
case $feat_type in
  delta)
    echo "$0: feature type is $feat_type"
    ;;
  lda)
    echo "$0: feature type is $feat_type"
    ;;
  *) echo "$0: invalid feature type $feat_type" && exit 1;
esac

# set up feature stream 1;  these are usually spectral features, so we will add
# deltas or splice them.  (JOB in the paths below is a placeholder; presumably
# $cmd substitutes the job index for it.)
feats1="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata1/JOB/utt2spk scp:$sdata1/JOB/cmvn.scp scp:$sdata1/JOB/feats.scp ark:- |"

if [ "$feat_type" == "delta" ]; then
  feats1="$feats1 add-deltas ark:- ark:- |"
elif [ "$feat_type" == "lda" ]; then
  # With final.mat but no lda.mat in the model dir, stream 1 falls back to
  # deltas; final.mat is still applied to the pasted features below.
  if [ -e $srcdir/lda.mat ]; then
    feats1="$feats1 splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/lda.mat ark:- ark:- |"
  else
    feats1="$feats1 add-deltas ark:- ark:- |"
  fi
fi

# set up feature stream 2;  these are usually bottleneck or posterior features,
# which may be normalized if desired (i.e. if the normft2 file contains "true")
feats2="scp:$sdata2/JOB/feats.scp"

if [ "$normft2" == "true" ]; then
  echo "Using cmvn for feats2"
  feats2="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata2/JOB/utt2spk scp:$sdata2/JOB/cmvn.scp $feats2 ark:- |"
fi

# assemble tandem features: the two streams are pasted together
feats="ark,s,cs:paste-feats '$feats1' '$feats2' ark:- |"

# add transformation, if applicable
if [ "$feat_type" == "lda" ]; then
  feats="$feats transform-feats $srcdir/final.mat ark:- ark:- |"
fi

# speaker dependent transformations as requested
# The transform directory must contain trans.1 and have been produced with
# the same number of jobs.  Either problem is fatal: decoding with missing or
# mismatched per-job transforms would otherwise fail or mis-adapt later on.
if [ ! -z "$transform_dir" ]; then # add transforms to features...
  echo "Using fMLLR transforms from $transform_dir"
  [ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist." && exit 1;
  [ "`cat $transform_dir/num_jobs`" -ne $nj ] && \
     echo "Mismatch in number of jobs with $transform_dir" && exit 1;
  feats="$feats transform-feats --utt2spk=ark:$sdata1/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
fi

# Lattice generation; one gzipped lattice archive is written per job.
$cmd JOB=1:$nj $dir/log/decode.JOB.log \
 gmm-latgen-faster --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \
   --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \
  $model $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;

# Scoring is delegated to the recipe's local/score.sh unless --skip-scoring
# true was given; a missing or non-executable score script is an error.
if ! $skip_scoring ; then
  [ ! -x local/score.sh ] && \
    echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
  local/score.sh --cmd "$cmd" --min_lmwt $min_lmwt --max_lmwt $max_lmwt $data1 $graphdir $dir ||
    { echo "$0: Scoring failed. (ignore by '--skip-scoring true')"; exit 1; }
fi

exit 0;


================================================
FILE: egs/steps/tandem/decode_fmllr.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
#                 Korbinian Riedhammer

# Decoding script that does fMLLR.  This can be on top of delta+delta-delta, or
# LDA+MLLT features.

# There are 3 models involved potentially in this script,
# and for a standard, speaker-independent system they will all be the same.
# The "alignment model" is for the 1st-pass decoding and to get the
# Gaussian-level alignments for the "adaptation model" the first time we
# do fMLLR.  The "adaptation model" is used to estimate fMLLR transforms
# and to generate state-level lattices.  The lattices are then rescored
# with the "final model".
#
# The following table explains where we get these 3 models from.
# Note: $srcdir is one level up from the decoding directory.
#
#   Model              Default source:
#
#  "alignment model"   $srcdir/final.alimdl              --alignment-model <model>
#                     (or $srcdir/final.mdl if alimdl absent)
#  "adaptation model"  $srcdir/final.mdl                 --adapt-model <model>
#  "final model"       $srcdir/final.mdl                 --final-model <model>


# Begin configuration section
# Defaults for the command-line options (the usage text below lists the main
# ones).
first_beam=10.0 # Beam used in initial, speaker-indep. pass
first_max_active=2000 # max-active used in initial pass.
alignment_model= # default: $srcdir/final.alimdl, or final.mdl if that is absent
adapt_model=     # default: $srcdir/final.mdl
final_model=     # default: $srcdir/final.mdl
stage=0
acwt=0.083333 # Acoustic weight used in getting fMLLR transforms, and also in
              # lattice generation.
max_active=7000
beam=13.0
lattice_beam=6.0
nj=4
silence_weight=0.01 # weight given to weight-silence-post when estimating fMLLR
cmd=run.pl
si_dir= # existing speaker-independent decoding dir; if empty, one is created
fmllr_update_type=full # passed to --fmllr-update-type of the fMLLR estimators
skip_scoring=false # if true, local/score.sh is not run
# End configuration section

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

# Exactly four positional arguments must remain after option parsing.  The
# usage text names this (tandem) script: steps/decode_fmllr.sh is a different
# script that does not take two data directories.
if [ $# != 4 ]; then
   echo "Usage: steps/tandem/decode_fmllr.sh [options] <graph-dir> <data1-dir> <data2-dir> <decode-dir>"
   echo " e.g.: steps/tandem/decode_fmllr.sh exp/tri2b/graph {mfcc,bottleneck}/data/test_dev93 exp/tri2b/decode_dev93"
   echo "main options (for others, see top of script file)"
   echo "  --config <config-file>                   # config containing options"
   echo "  --nj <nj>                                # number of parallel jobs"
   echo "  --cmd <cmd>                              # Command to run in parallel with"
   echo "  --adapt-model <adapt-mdl>                # Model to compute transforms with"
   echo "  --alignment-model <ali-mdl>              # Model to get Gaussian-level alignments for"
   echo "                                           # 1st pass of transform computation."
   echo "  --final-model <finald-mdl>               # Model to finally decode with"
   echo "  --si-dir <speaker-indep-decoding-dir>    # use this to skip 1st pass of decoding"
   echo "                                           # Caution-- must be with same tree"
   echo "  --acwt <acoustic-weight>                 # default 0.08333 ... used to get posteriors"

   exit 1;
fi


# Positional arguments; any trailing slash is removed from the decode
# directory before its parent and the ".si" sibling are derived from it.
graphdir=$1
data1=$2
data2=$3
dir=$(echo $4 | sed 's:/$::g')

# Assume model directory one level up from decoding directory.
srcdir=$(dirname $dir)

mkdir -p $dir/log

# Per-job split directories of both data dirs.  split_data.sh is run unless
# the split directory already exists and feats.scp is older than it.
sdata1=$data1/split$nj
sdata2=$data2/split$nj
if ! [[ -d $sdata1 && $data1/feats.scp -ot $sdata1 ]]; then
  split_data.sh $data1 $nj || exit 1
fi
if ! [[ -d $sdata2 && $data2/feats.scp -ot $sdata2 ]]; then
  split_data.sh $data2 $nj || exit 1
fi

echo $nj > $dir/num_jobs

silphonelist=$(cat $graphdir/phones/silence.csl) || exit 1

# Some checks.  Note: we don't need $srcdir/tree but we expect
# it should exist, given the current structure of the scripts.
for f in $graphdir/HCLG.fst $data1/feats.scp $data2/feats.scp $srcdir/tree; do
  if [ ! -f $f ]; then
    echo "$0: no such file $f"
    exit 1
  fi
done

## Work out name of alignment model. ##
# Unless given on the command line: final.alimdl if the model dir has one,
# final.mdl otherwise.
if [ -z "$alignment_model" ]; then
  alignment_model=$srcdir/final.mdl
  if [ -f "$srcdir/final.alimdl" ]; then
    alignment_model=$srcdir/final.alimdl
  fi
fi
if [ ! -f "$alignment_model" ]; then
  echo "$0: no alignment model $alignment_model "
  exit 1
fi
##

## Do the speaker-independent decoding, if --si-dir option not present. ##
if [ -z "$si_dir" ]; then
  # Name it as our decoding dir, but with suffix ".si".
  si_dir=${dir}.si
  if [ $stage -le 0 ]; then
    steps/tandem/decode_si.sh --acwt $acwt --nj $nj --cmd "$cmd" \
      --beam $first_beam --model $alignment_model \
      --max-active $first_max_active $graphdir $data1 $data2 $si_dir || exit 1
  fi
fi
##

## Some checks, and setting of defaults for variables.
# The speaker-independent lattices must come from the same job split.
if [ "$nj" -ne "$(cat $si_dir/num_jobs)" ]; then
  echo "Mismatch in #jobs with si-dir"
  exit 1
fi
if [ ! -f "$si_dir/lat.1.gz" ]; then
  echo "No such file $si_dir/lat.1.gz"
  exit 1
fi
# Both models default to the model dir's final.mdl.
adapt_model=${adapt_model:-$srcdir/final.mdl}
final_model=${final_model:-$srcdir/final.mdl}
for f in $adapt_model $final_model; do
  if [ ! -f $f ]; then
    echo "$0: no such file $f"
    exit 1
  fi
done
##


# Set up features.

# Both files are optional: a missing file just leaves the variable empty.
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
normft2=`cat $srcdir/normft2 2>/dev/null`

# The presence of final.mat in the model dir selects the "lda" feature type.
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi

case $feat_type in
  delta)
    echo "$0: feature type is $feat_type"
    ;;
  lda)
    echo "$0: feature type is $feat_type"
    ;;
  *) echo "$0: invalid feature type $feat_type" && exit 1;
esac

# set up feature stream 1;  these are usually spectral features, so we will add
# deltas or splice them.  (JOB in the paths below is a placeholder; presumably
# $cmd substitutes the job index for it.)
feats1="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata1/JOB/utt2spk scp:$sdata1/JOB/cmvn.scp scp:$sdata1/JOB/feats.scp ark:- |"

if [ "$feat_type" == "delta" ]; then
  feats1="$feats1 add-deltas ark:- ark:- |"
elif [ "$feat_type" == "lda" ]; then
  # NOTE(review): $srcdir/lda.mat is used unconditionally here, whereas
  # steps/tandem/decode.sh falls back to add-deltas when lda.mat is absent --
  # confirm whether that case needs to be supported here too.
  feats1="$feats1 splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/lda.mat ark:- ark:- |"
fi

# set up feature stream 2;  these are usually bottleneck or posterior features,
# which may be normalized if desired (i.e. if the normft2 file contains "true")
feats2="scp:$sdata2/JOB/feats.scp"

if [ "$normft2" == "true" ]; then
  echo "Using cmvn for feats2"
  feats2="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata2/JOB/utt2spk scp:$sdata2/JOB/cmvn.scp $feats2 ark:- |"
fi

# assemble tandem features: the two streams are pasted together.  These are
# the speaker-independent features, i.e. before any fMLLR transform.
sifeats="ark,s,cs:paste-feats '$feats1' '$feats2' ark:- |"

# add transformation, if applicable
if [ "$feat_type" == "lda" ]; then
  sifeats="$sifeats transform-feats $srcdir/final.mat ark:- ark:- |"
fi


## Now get the first-pass fMLLR transforms.
# Posteriors come from the speaker-independent lattices in $si_dir, with
# silence weighted by $silence_weight; the transforms are written to
# pre_trans.JOB.
if [ $stage -le 1 ]; then
  echo "$0: getting first-pass fMLLR transforms."
  $cmd JOB=1:$nj $dir/log/fmllr_pass1.JOB.log \
    gunzip -c $si_dir/lat.JOB.gz \| \
    lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
    weight-silence-post $silence_weight $silphonelist $alignment_model ark:- ark:- \| \
    gmm-post-to-gpost $alignment_model "$sifeats" ark:- ark:- \| \
    gmm-est-fmllr-gpost --fmllr-update-type=$fmllr_update_type \
    --spk2utt=ark:$sdata1/JOB/spk2utt $adapt_model "$sifeats" ark,s,cs:- \
    ark:$dir/pre_trans.JOB || exit 1;
fi
##

# Speaker-independent features with the first-pass transforms applied.
pass1feats="$sifeats transform-feats --utt2spk=ark:$sdata1/JOB/utt2spk ark:$dir/pre_trans.JOB ark:- ark:- |"

## Do the main lattice generation pass.  Note: we don't determinize the lattices at
## this stage, as we're going to use them in acoustic rescoring with the larger
## model, and it's more correct to store the full state-level lattice for this purpose.
if [ $stage -le 2 ]; then
  echo "$0: doing main lattice generation phase"
  $cmd JOB=1:$nj $dir/log/decode.JOB.log \
    gmm-latgen-faster --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \
    --acoustic-scale=$acwt  \
    --determinize-lattice=false --allow-partial=true --word-symbol-table=$graphdir/words.txt \
    $adapt_model $graphdir/HCLG.fst "$pass1feats" "ark:|gzip -c > $dir/lat.tmp.JOB.gz" \
    || exit 1;
fi
##

## Do a second pass of estimating the transform-- this time with the lattices
## from the main lattice generation pass above (generated with the adaptation
## model on the first-pass features).  Compose the transforms to get
## $dir/trans.1, etc.
if [ $stage -le 3 ]; then
  echo "$0: estimating fMLLR transforms a second time."
  $cmd JOB=1:$nj $dir/log/fmllr_pass2.JOB.log \
    lattice-determinize-pruned --acoustic-scale=$acwt --beam=4.0 \
    "ark:gunzip -c $dir/lat.tmp.JOB.gz|" ark:- \| \
    lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
    weight-silence-post $silence_weight $silphonelist $adapt_model ark:- ark:- \| \
    gmm-est-fmllr --fmllr-update-type=$fmllr_update_type \
    --spk2utt=ark:$sdata1/JOB/spk2utt $adapt_model "$pass1feats" \
    ark,s,cs:- ark:$dir/trans_tmp.JOB '&&' \
    compose-transforms --b-is-affine=true ark:$dir/trans_tmp.JOB ark:$dir/pre_trans.JOB \
    ark:$dir/trans.JOB  || exit 1;
fi
##

# Speaker-independent features with the final (composed) transforms applied.
feats="$sifeats transform-feats --utt2spk=ark:$sdata1/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |"

# Rescore the state-level lattices with the final adapted features, and the final model
# (which by default is $srcdir/final.mdl, but which may be specified on the command line,
# useful in case of discriminatively trained systems).
# At this point we prune and determinize the lattices and write them out, ready for
# language model rescoring.

# The temporary state-level lattice of a job is removed once that job's
# rescoring and determinization succeeded.
if [ $stage -le 4 ]; then
  echo "$0: doing a final pass of acoustic rescoring."
  $cmd JOB=1:$nj $dir/log/acoustic_rescore.JOB.log \
    gmm-rescore-lattice $final_model "ark:gunzip -c $dir/lat.tmp.JOB.gz|" "$feats" ark:- \| \
    lattice-determinize-pruned --acoustic-scale=$acwt --beam=$lattice_beam ark:- \
    "ark:|gzip -c > $dir/lat.JOB.gz" '&&' rm $dir/lat.tmp.JOB.gz || exit 1;
fi

# Scoring is delegated to the recipe's local/score.sh unless --skip-scoring
# true was given; a missing or non-executable score script is an error.
if ! $skip_scoring ; then
  [ ! -x local/score.sh ] && \
    echo "$0: not scoring because local/score.sh does not exist or not executable." && exit 1;
  local/score.sh --cmd "$cmd" $data1 $graphdir $dir ||
    { echo "$0: Scoring failed. (ignore by '--skip-scoring true')"; exit 1; }
fi

# Remove the intermediate transforms; only the composed trans.* files are kept.
rm $dir/{trans_tmp,pre_trans}.*

exit 0;


================================================
FILE: egs/steps/tandem/decode_sgmm2.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
#                 Korbinian Riedhammer

# This script does decoding with an SGMM system, with speaker vectors.
# If the SGMM system was
# built on top of fMLLR transforms from a conventional system, you should
# provide the --transform-dir option.

# Begin configuration section.
# The variables below are the script's options; the usage message and error
# texts refer to them as --<name> with dashes in place of underscores
# (e.g. --transform-dir, --skip-scoring).
# Numbered stages below "stage" are skipped (each is guarded by "$stage -le N").
stage=1
transform_dir=    # dir to find fMLLR transforms.
nj=4 # number of decoding jobs.
acwt=0.1  # Just a default value, used for adaptation and beam-pruning.
# Job launcher; it is invoked as "$cmd JOB=1:$nj <log-file> <command...>".
cmd=run.pl
# Decoding beam of the first pass (passed as --beam to sgmm2-latgen-faster).
beam=13.0
gselect=15  # Number of Gaussian-selection indices for SGMMs.  [Note:
            # the first_pass_gselect variable is used for the 1st pass of
            # decoding and can be tighter.]
first_pass_gselect=3 # Use a smaller number of Gaussian-selection indices in
            # the 1st pass of decoding (lattice generation).
# Passed as --max-active to sgmm2-latgen-faster in the first pass.
max_active=7000

#WARNING: This option is renamed lattice_beam (it was renamed to follow the naming
#         in the other scripts).
lattice_beam=6.0 # Beam we use in lattice generation.
vecs_beam=4.0 # Beam we use to prune lattices while getting posteriors for
    # speaker-vector computation.  Can be quite tight (actually we could
    # probably just do best-path).
# If true, fMLLR transforms are estimated (stage 5) and applied to the
# features before the final rescoring.
use_fmllr=false
# Passed to sgmm2-est-fmllr as --fmllr-iters and --fmllr-min-count.
fmllr_iters=10
fmllr_min_count=1000
# If true, local/score.sh is not run at the end.
skip_scoring=false
# End configuration section.

# Log the exact invocation so that it ends up in the job output.
echo "$0 $@"

# Pick up the environment when a path.sh is present, then let
# parse_options.sh process the command-line options.
if [ -f ./path.sh ]; then
  . ./path.sh
fi
. parse_options.sh || exit 1

# Exactly four positional arguments must be left; otherwise print the usage
# text and give up.
if [ $# -ne 4 ]; then
  cat <<EOF
Usage: steps/tandem/decode_sgmm2.sh [options] <graph-dir> <data-dir1> <data-dir2> <decode-dir>
 e.g.: steps/tandem/decode_sgmm2.sh --transform-dir exp/tri3b/decode_dev93_tgpr \\
      exp/sgmm3a/graph_tgpr {mfcc,bottleneck}/data/test_dev93 exp/sgmm3a/decode_dev93_tgpr
main options (for others, see top of script file)
  --transform-dir <decoding-dir>           # directory of previous decoding
                                           # where we can find transforms for SAT systems.
  --config <config-file>                   # config containing options
  --nj <nj>                                # number of parallel jobs
  --cmd <cmd>                              # Command to run in parallel with
  --beam <beam>                            # Decoding beam; default 13.0
EOF
  exit 1
fi

# Positional arguments: decoding graph, the two feature data directories and
# the decoding (output) directory.
graphdir=$1
data1=$2
data2=$3
dir=$4
# The model directory is assumed to be one level above the decoding directory.
srcdir=$(dirname $dir)

# Refuse to start when any of the required input files is missing.
for f in $graphdir/HCLG.fst $data1/feats.scp $data2/feats.scp $srcdir/final.mdl; do
  if [ ! -f $f ]; then
    echo "$0: no such file $f"
    exit 1
  fi
done

# Silence phone list; handed to weight-silence-post in the later stages.
silphonelist=$(cat $graphdir/phones/silence.csl) || exit 1

# Gaussian-selection options: the full selection written by stage 1, and a
# narrowed-down version (copy-gselect --n) for the first decoding pass.
gselect_opt="--gselect=ark:gunzip -c $dir/gselect.JOB.gz|"
gselect_opt_1stpass="${gselect_opt} copy-gselect --n=$first_pass_gselect ark:- ark:- |"

mkdir -p $dir/log
echo $nj > $dir/num_jobs

# Split both data directories into $nj parts unless an up-to-date split
# (newer than feats.scp) already exists.
sdata1=$data1/split$nj
sdata2=$data2/split$nj
if ! [[ -d $sdata1 && $data1/feats.scp -ot $sdata1 ]]; then
  split_data.sh $data1 $nj || exit 1
fi
if ! [[ -d $sdata2 && $data2/feats.scp -ot $sdata2 ]]; then
  split_data.sh $data2 $nj || exit 1
fi

## Set up features.
#
# The tandem features are assembled from two streams: stream 1 (from $data1)
# gets CMVN followed by either deltas or splicing + LDA, stream 2 (from
# $data2) is used as is or with CMVN, and paste-feats joins the two.  "JOB" in
# the rspecifiers is a placeholder that is filled in per job when the command
# is run through "$cmd JOB=1:$nj ...".

splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
normft2=`cat $srcdir/normft2 2>/dev/null` # "true" enables CMVN on stream 2.

# The presence of final.mat in the model directory selects the LDA front-end.
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi

case $feat_type in
  delta)
    echo "$0: feature type is $feat_type"
    ;;
  lda)
    echo "$0: feature type is $feat_type"
    # The feature pipelines below read both matrices from $dir, so stop right
    # here if they cannot be copied instead of failing later inside the jobs.
    cp $srcdir/{lda,final}.mat $dir/ || exit 1;
    ;;
  *) echo "$0: invalid feature type $feat_type" && exit 1;
esac

# set up feature stream 1;  these are usually spectral features, so we will add
# deltas or splice them
feats1="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata1/JOB/utt2spk scp:$sdata1/JOB/cmvn.scp scp:$sdata1/JOB/feats.scp ark:- |"

if [ "$feat_type" == "delta" ]; then
  feats1="$feats1 add-deltas ark:- ark:- |"
elif [ "$feat_type" == "lda" ]; then
  feats1="$feats1 splice-feats $splice_opts ark:- ark:- | transform-feats $dir/lda.mat ark:- ark:- |"
fi

# set up feature stream 2;  these are usually bottleneck or posterior features,
# which may be normalized if desired
feats2="scp:$sdata2/JOB/feats.scp"

if [ "$normft2" == "true" ]; then
  echo "Using cmvn for feats2"
  feats2="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata2/JOB/utt2spk scp:$sdata2/JOB/cmvn.scp $feats2 ark:- |"
fi

# assemble tandem features
feats="ark,s,cs:paste-feats '$feats1' '$feats2' ark:- |"

# add transformation, if applicable
if [ "$feat_type" == "lda" ]; then
  feats="$feats transform-feats $dir/final.mat ark:- ark:- |"
fi

# splicing/normalization options; any of these may be absent in $srcdir, which
# is why errors from cp are discarded.
cp $srcdir/{splice_opts,normft2,tandem} $dir 2>/dev/null

if [ ! -z "$transform_dir" ]; then
  echo "$0: using transforms from $transform_dir"
  [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1;
  # trans.JOB is read per job, so the job counts have to agree.
  [ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \
    && echo "$0: #jobs mismatch with transform-dir." && exit 1;
  feats="$feats transform-feats --utt2spk=ark:$sdata1/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |"
elif grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then
  # The training log shows per-speaker transforms but none were supplied here.
  echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms,"
  echo "  but you are not providing the --transform-dir option in test time."
fi
##
##

## Save Gaussian-selection info to disk.
# Note: we can use final.mdl regardless of whether there is an alignment model--
# they use the same UBM.
# Stage 1 writes $dir/gselect.JOB.gz, which $gselect_opt (and its first-pass
# variant) read back in all later stages.

if [ $stage -le 1 ]; then
  $cmd JOB=1:$nj $dir/log/gselect.JOB.log \
    sgmm2-gselect --full-gmm-nbest=$gselect $srcdir/final.mdl \
    "$feats" "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1;
fi

# Generate state-level lattice which we can rescore.  This is done with the alignment
# model and no speaker-vectors.
# Output: $dir/pre_lat.JOB.gz (not determinized; --determinize-lattice=false).
if [ $stage -le 2 ]; then
  $cmd JOB=1:$nj $dir/log/decode_pass1.JOB.log \
    sgmm2-latgen-faster --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \
    --acoustic-scale=$acwt --determinize-lattice=false --allow-partial=true \
    --word-symbol-table=$graphdir/words.txt "$gselect_opt_1stpass" $srcdir/final.alimdl \
    $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/pre_lat.JOB.gz" || exit 1;
fi

# Estimate speaker vectors (1st pass).  Prune before determinizing
# because determinization can take a while on un-pruned lattices.
# Note: the sgmm2-post-to-gpost stage is necessary because we have
# a separate alignment-model and final model, otherwise we'd skip it
# and use sgmm2-est-spkvecs.
# Output: $dir/pre_vecs.JOB.
if [ $stage -le 3 ]; then
  $cmd JOB=1:$nj $dir/log/vecs_pass1.JOB.log \
    gunzip -c $dir/pre_lat.JOB.gz \| \
    lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
    lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
    lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
    weight-silence-post 0.0 $silphonelist $srcdir/final.alimdl ark:- ark:- \| \
    sgmm2-post-to-gpost "$gselect_opt" $srcdir/final.alimdl "$feats" ark:- ark:- \| \
    sgmm2-est-spkvecs-gpost --spk2utt=ark:$sdata1/JOB/spk2utt \
     $srcdir/final.mdl "$feats" ark,s,cs:- "ark:$dir/pre_vecs.JOB" || exit 1;
fi

# Estimate speaker vectors (2nd pass).  Since we already have spk vectors,
# at this point we need to rescore the lattice to get the correct posteriors.
# Output: $dir/vecs.JOB, estimated starting from the first-pass vectors.
if [ $stage -le 4 ]; then
  $cmd JOB=1:$nj $dir/log/vecs_pass2.JOB.log \
    gunzip -c $dir/pre_lat.JOB.gz \| \
    sgmm2-rescore-lattice --spk-vecs=ark:$dir/pre_vecs.JOB --utt2spk=ark:$sdata1/JOB/utt2spk \
      "$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \
    lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
    lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
    lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
    weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \
    sgmm2-est-spkvecs --spk2utt=ark:$sdata1/JOB/spk2utt "$gselect_opt" --spk-vecs=ark:$dir/pre_vecs.JOB \
     $srcdir/final.mdl "$feats" ark,s,cs:- "ark:$dir/vecs.JOB" || exit 1;
fi
# The first-pass vectors are not read after stage 4.
# NOTE(review): this runs even when stages 3-4 were skipped via --stage, in
# which case rm reports an error if the files are already gone.
rm $dir/pre_vecs.*

# Optional fMLLR pass: estimates per-speaker transforms into $dir/trans.JOB and
# appends them to the feature pipeline used by the final rescoring.
if $use_fmllr; then
  # Estimate fMLLR transforms (note: these may be on top of any
  # fMLLR transforms estimated with the baseline GMM system.
  if [ $stage -le 5 ]; then # compute fMLLR transforms.
    echo "$0: computing fMLLR transforms."
    # (Re)build final.fmllr_mdl when it is missing or older than final.mdl.
    if [ ! -f $srcdir/final.fmllr_mdl ] || [ $srcdir/final.fmllr_mdl -ot $srcdir/final.mdl ]; then
      echo "$0: computing pre-transform for fMLLR computation."
      sgmm2-comp-prexform $srcdir/final.mdl $srcdir/final.occs $srcdir/final.fmllr_mdl || exit 1;
    fi
    $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \
      gunzip -c $dir/pre_lat.JOB.gz \| \
      sgmm2-rescore-lattice --spk-vecs=ark:$dir/vecs.JOB --utt2spk=ark:$sdata1/JOB/utt2spk \
      "$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \
      lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
      lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
      lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
      weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \
      sgmm2-est-fmllr --spk2utt=ark:$sdata1/JOB/spk2utt "$gselect_opt" --spk-vecs=ark:$dir/vecs.JOB \
       --fmllr-iters=$fmllr_iters --fmllr-min-count=$fmllr_min_count \
      $srcdir/final.fmllr_mdl "$feats" ark,s,cs:- "ark:$dir/trans.JOB" || exit 1;
  fi
  # This is outside the stage guard: the transforms are applied even when
  # stage 5 itself was skipped.
  feats="$feats transform-feats --utt2spk=ark:$sdata1/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |"
fi

# Now rescore the state-level lattices with the adapted features and the
# corresponding model.  Prune and determinize the lattices to limit
# their size.
if [ $stage -le 6 ]; then
  $cmd JOB=1:$nj $dir/log/rescore.JOB.log \
    sgmm2-rescore-lattice "$gselect_opt" --utt2spk=ark:$sdata1/JOB/utt2spk --spk-vecs=ark:$dir/vecs.JOB \
    $srcdir/final.mdl "ark:gunzip -c $dir/pre_lat.JOB.gz|" "$feats" ark:- \| \
    lattice-determinize-pruned --acoustic-scale=$acwt --beam=$lattice_beam ark:- \
    "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
fi
# The first-pass lattices are removed once the final ones are written.
# NOTE(review): unconditional, so it also runs (and complains about missing
# files) when the script is restarted at a later stage.
rm $dir/pre_lat.*.gz

# The output of this script is the files "lat.*.gz"-- we'll rescore this at different
# acoustic scales to get the final output.

# Scoring is delegated to local/score.sh; a missing or non-executable scoring
# script is treated as an error unless --skip-scoring true was given.
if ! $skip_scoring ; then
  if [ $stage -le 7 ]; then
    [ ! -x local/score.sh ] && \
      echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
    local/score.sh --cmd "$cmd" $data1 $graphdir $dir ||
      { echo "$0: Scoring failed. (ignore by '--skip-scoring true')"; exit 1; }
  fi
fi

exit 0;


================================================
FILE: egs/steps/tandem/make_denlats.sh
================================================
#!/usr/bin/env bash
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
#                 Korbinian Riedhammer

# Create denominator lattices for MMI/MPE training.
# Creates its output in $dir/lat.*.gz

# Begin configuration section.
nj=4             # number of parallel jobs.
cmd=run.pl       # job launcher, invoked as "$cmd JOB=1:$nj <log> <command...>".
sub_split=1      # if >1, each of the $nj data splits is further split into
                 # this many per-utterance pieces that are decoded separately.
beam=13.0        # decoding beam (--beam).
lattice_beam=7.0 # lattice beam (--lattice-beam).
acwt=0.1         # acoustic scale (--acoustic-scale).
max_active=5000  # passed as --max-active.
transform_dir=   # directory with fMLLR transforms (trans.JOB), if any.
max_mem=20000000 # This will stop the processes getting too large.
# This is in bytes, but not "real" bytes-- you have to multiply
# by something like 5 or 10 to get real bytes (not sure why so large)
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

# Five positional arguments are required.  The usage text names this script by
# its actual location (steps/tandem/make_denlats.sh).
if [ $# != 5 ]; then
   echo "Usage: steps/tandem/make_denlats.sh [options] <data1-dir> <data2-dir> <lang-dir> <src-dir> <exp-dir>"
   echo "  e.g.: steps/tandem/make_denlats.sh {mfcc,bottleneck}/data/train data/lang exp/tri1 exp/tri1_denlats"
   echo "Works for (delta|lda) features, and (with --transform-dir option) such features"
   echo " plus transforms."
   echo ""
   echo "Main options (for others, see top of script file)"
   echo "  --config <config-file>                           # config containing options"
   echo "  --nj <nj>                                        # number of parallel jobs"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   echo "  --sub-split <n-split>                            # e.g. 40; use this for "
   echo "                           # large databases so your jobs will be smaller and"
   echo "                           # will (individually) finish reasonably soon."
   echo "  --transform-dir <transform-dir>   # directory to find fMLLR transforms."
   exit 1;
fi

# Positional arguments: the two feature data directories, the lang directory,
# the model directory and the output directory.
data1=$1
data2=$2
lang=$3
srcdir=$4
dir=$5

mkdir -p $dir/log
echo $nj > $dir/num_jobs

utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1;

# Split both data directories into $nj parts unless an up-to-date split exists.
sdata1=$data1/split$nj
sdata2=$data2/split$nj
[[ -d $sdata1 && $data1/feats.scp -ot $sdata1 ]] || split_data.sh $data1 $nj || exit 1;
[[ -d $sdata2 && $data2/feats.scp -ot $sdata2 ]] || split_data.sh $data2 $nj || exit 1;

oov=`cat $lang/oov.int` || exit 1;

# Work on a private copy of the lang directory.  cp -r creates it under the
# source directory's own name, which is not necessarily "lang" (e.g.
# data/lang_test), so derive the path of the copy instead of hard-coding it.
cp -r $lang $dir/ || exit 1;
new_lang=$dir/$(basename $lang)

# Compute grammar FST which corresponds to unigram decoding graph.

cat $data1/text | utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt | \
  awk '{for(n=2;n<=NF;n++){ printf("%s ", $n); } printf("\n"); }' | \
  utils/make_unigram_grammar.pl | fstcompile > $new_lang/G.fst \
   || exit 1;

# mkgraph.sh expects a whole directory "lang", so put everything in one directory...
# it gets L_disambig.fst and G.fst (among other things) from $new_lang, and
# final.mdl from $srcdir; the output HCLG.fst goes in $dir/dengraph.


if [ -s $dir/dengraph/HCLG.fst ]; then
   echo "Graph $dir/dengraph/HCLG.fst already exists: skipping graph creation."
else
  utils/mkgraph.sh $new_lang $srcdir $dir/dengraph || exit 1;
fi


## Set up features.
#
# The tandem features are assembled from two streams: stream 1 (from $data1)
# gets CMVN followed by either deltas or splicing + LDA, stream 2 (from
# $data2) is used as is or with CMVN, and paste-feats joins the two.  "JOB" in
# the rspecifiers is a placeholder for the job index.
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
normft2=`cat $srcdir/normft2 2>/dev/null` # "true" enables CMVN on stream 2.

# The presence of final.mat in the model directory selects the LDA front-end.
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi

case $feat_type in
  delta)
    echo "$0: feature type is $feat_type"
    ;;
  lda)
    echo "$0: feature type is $feat_type"
    # Both matrices are read from $dir by the pipelines below; fail now rather
    # than inside the decoding jobs if they cannot be copied.
    cp $srcdir/{lda,final}.mat $dir/ || exit 1;
    ;;
  *) echo "$0: invalid feature type $feat_type" && exit 1;
esac

# set up feature stream 1;  these are usually spectral features, so we will add
# deltas or splice them
feats1="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata1/JOB/utt2spk scp:$sdata1/JOB/cmvn.scp scp:$sdata1/JOB/feats.scp ark:- |"

if [ "$feat_type" == "delta" ]; then
  feats1="$feats1 add-deltas ark:- ark:- |"
elif [ "$feat_type" == "lda" ]; then
  feats1="$feats1 splice-feats $splice_opts ark:- ark:- | transform-feats $dir/lda.mat ark:- ark:- |"
fi

# set up feature stream 2;  these are usually bottleneck or posterior features,
# which may be normalized if desired
feats2="scp:$sdata2/JOB/feats.scp"

if [ "$normft2" == "true" ]; then
  feats2="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata2/JOB/utt2spk scp:$sdata2/JOB/cmvn.scp $feats2 ark:- |"
fi

# assemble tandem features
feats="ark,s,cs:paste-feats '$feats1' '$feats2' ark:- |"

# add transformation, if applicable
if [ "$feat_type" == "lda" ]; then
  feats="$feats transform-feats $dir/final.mat ark:- ark:- |"
fi

# splicing/normalization options; any of these may be absent in $srcdir, which
# is why errors from cp are discarded.
cp $srcdir/{splice_opts,normft2,tandem} $dir 2>/dev/null


if [ ! -z "$transform_dir" ]; then # add transforms to features...
  echo "$0: using fMLLR transforms from $transform_dir"
  # Without trans.1 the feature pipeline below cannot work, so stop here
  # instead of only printing the message and carrying on.
  [ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist." && exit 1;
  # trans.JOB is read per job, so the job counts have to agree.
  [ "`cat $transform_dir/num_jobs`" -ne "$nj" ] \
    && echo "$0: mismatch in number of jobs with $transform_dir" && exit 1;
  # Differing LDA matrices are only reported, not treated as fatal.
  [ -f $srcdir/final.mat ] && ! cmp $transform_dir/final.mat $srcdir/final.mat && \
     echo "$0: LDA transforms differ between $srcdir and $transform_dir"
  feats="$feats transform-feats --utt2spk=ark:$sdata1/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
else
  # An alignment model in $srcdir indicates a speaker-adapted system, which
  # needs transforms.
  if [ -f $srcdir/final.alimdl ]; then
    echo "$0: you seem to have a SAT system but you did not supply the --transform-dir option.";
    exit 1;
  fi
fi


# If this job is interrupted by the user, we want any background jobs to be
# killed too.
cleanup() {
  # PIDs of the background jobs of this shell that are still running.
  local pids=$(jobs -pr)
  [ -n "$pids" ] && kill $pids
}
trap "cleanup" INT QUIT TERM EXIT


if [ $sub_split -eq 1 ]; then
  # Simple case: one decoding job per data split, writing $dir/lat.JOB.gz.
  $cmd JOB=1:$nj $dir/log/decode_den.JOB.log \
   gmm-latgen-faster --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \
    --max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $srcdir/final.mdl  \
     $dir/dengraph/HCLG.fst "$feats" "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1;
else
  # each job from 1 to $nj is split into multiple pieces (sub-split), and we aim
  # to have at most two jobs running at each time.  The idea is that if we have stragglers
  # from one job, we can be processing another one at the same time.
  rm $dir/.error 2>/dev/null

  prev_pid=
  # One extra iteration (n = nj+1) starts nothing new; it only waits for and
  # merges the output of the last real subset.
  for n in `seq $((nj+1))`; do
    if [ $n -gt $nj ]; then
      this_pid=
    elif [ -f $dir/.done.$n ] && [ $dir/.done.$n -nt $srcdir/final.mdl ]; then
      echo "Not processing subset $n as already done (delete $dir/.done.$n if not)";
      this_pid=
    else
      ssdata1=$data1/split$nj/$n/split${sub_split}utt;
      split_data.sh --per-utt $sdata1/$n $sub_split || exit 1;
      ssdata2=$data2/split$nj/$n/split${sub_split}utt;
      split_data.sh --per-utt $sdata2/$n $sub_split || exit 1;
      mkdir -p $dir/log/$n
      mkdir -p $dir/part
      # Transforms stay per data split (trans.$n); every other JOB-indexed
      # path is redirected into the per-utterance sub-split of split $n.
      feats_subset=`echo $feats | sed "s/trans.JOB/trans.$n/g" | sed s:JOB/:$n/split${sub_split}utt/JOB/:g`

      # Runs in the background.  A failure is recorded in $dir/.error, which
      # is the file tested after the wait below (the marker must live in $dir,
      # not in the current directory, or the failure would go unnoticed).
      # $parallel_opts is not set in this script's configuration section; it
      # expands to nothing unless inherited from the environment.
      $cmd $parallel_opts JOB=1:$sub_split $dir/log/$n/decode_den.JOB.log \
        gmm-latgen-faster --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \
        --max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $srcdir/final.mdl  \
          $dir/dengraph/HCLG.fst "$feats_subset" "ark:|gzip -c >$dir/lat.$n.JOB.gz" || touch $dir/.error &
      this_pid=$!
    fi
    if [ ! -z "$prev_pid" ]; then  # Wait for the previous job; merge the previous set of lattices.
      wait $prev_pid
      [ -f $dir/.error ] && echo "$0: error generating denominator lattices" && exit 1;
      rm $dir/.merge_error 2>/dev/null
      echo Merging archives for data subset $prev_n
      # Concatenate the sub-split archives into one lat.$prev_n.gz.
      for k in `seq $sub_split`; do
        gunzip -c $dir/lat.$prev_n.$k.gz || touch $dir/.merge_error;
      done | gzip -c > $dir/lat.$prev_n.gz || touch $dir/.merge_error;
      [ -f $dir/.merge_error ] && echo "$0: Merging lattices for subset $prev_n failed (or maybe some other error)" && exit 1;
      rm $dir/lat.$prev_n.*.gz
      # Marker that lets a re-run skip this subset.
      touch $dir/.done.$prev_n
    fi
    prev_n=$n
    prev_pid=$this_pid
  done
fi


echo "$0: done generating denominator lattices."


================================================
FILE: egs/steps/tandem/make_denlats_sgmm2.sh
================================================
#!/usr/bin/env bash
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
#                 Korbinian Riedhammer

# Create denominator lattices for MMI/MPE training, with SGMM models.  If the
# features have fMLLR transforms you have to supply the --transform-dir option.
# It gets any speaker vectors from the "alignment dir" ($srcdir).  Note: this is
# possibly a slight mismatch because the speaker vectors come from supervised
# adaptation.

# Begin configuration section.
# The usage text below refers to these variables as --<name> options, with
# dashes in place of underscores (e.g. --sub-split, --transform-dir).
nj=4
# Job launcher; it is invoked as "$cmd JOB=1:$nj <log-file> <command...>".
cmd=run.pl
# When different from 1, every data split is decoded in this many
# per-utterance pieces which are merged afterwards.
sub_split=1
# beam, lattice_beam, acwt and max_active are passed to sgmm2-latgen-faster as
# --beam, --lattice-beam, --acoustic-scale and --max-active.
beam=13.0
lattice_beam=7.0
acwt=0.1
max_active=5000
# Directory holding fMLLR transforms (trans.JOB); empty means no transforms.
transform_dir=
max_mem=20000000 # This will stop the processes getting too large.
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

# Five positional arguments are required; otherwise print the usage and exit.
if [ $# != 5 ]; then
   echo "Usage: steps/tandem/make_denlats_sgmm2.sh [options] <data1-dir> <data2-dir> <lang-dir> <src-dir|srcdir> <exp-dir>"
   echo "  e.g.: steps/tandem/make_denlats_sgmm2.sh {mfcc,bottleneck}/data1/train data1/lang exp/sgmm4a_ali exp/sgmm4a_denlats"
   echo "Works for (delta|lda) features, and (with --transform-dir option) such features"
   echo " plus transforms."
   echo ""
   echo "Main options (for others, see top of script file)"
   echo "  --config <config-file>                           # config containing options"
   echo "  --nj <nj>                                        # number of parallel jobs"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   echo "  --sub-split <n-split>                            # e.g. 40; use this for "
   echo "                           # large databases so your jobs will be smaller and"
   echo "                           # will (individually) finish reasonably soon."
   echo "  --transform-dir <transform-dir>   # directory to find fMLLR transforms."
   exit 1;
fi

# Positional arguments: the two feature data directories, the lang directory,
# the source directory and the output directory.
data1=$1
data2=$2
lang=$3
srcdir=$4 # normally an alignment directory (it has to provide gselect.*.gz and,
          # for systems with speaker vectors, vecs.*); see the checks below.
dir=$5

mkdir -p $dir/log
echo $nj > $dir/num_jobs

utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1;

# Split both data directories into $nj parts unless an up-to-date split exists.
sdata1=$data1/split$nj
sdata2=$data2/split$nj
[[ -d $sdata1 && $data1/feats.scp -ot $sdata1 ]] || split_data.sh $data1 $nj || exit 1;
[[ -d $sdata2 && $data2/feats.scp -ot $sdata2 ]] || split_data.sh $data2 $nj || exit 1;

oov=`cat $lang/oov.int` || exit 1;

# Work on a private copy of the lang directory.  cp -r creates it under the
# source directory's own name, which is not necessarily "lang" (e.g.
# data/lang_test), so derive the path of the copy instead of hard-coding it.
cp -r $lang $dir/ || exit 1;
new_lang=$dir/$(basename $lang)

# Compute grammar FST which corresponds to unigram decoding graph.

cat $data1/text | utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt | \
  awk '{for(n=2;n<=NF;n++){ printf("%s ", $n); } printf("\n"); }' | \
  utils/make_unigram_grammar.pl | fstcompile > $new_lang/G.fst \
   || exit 1;

# mkgraph.sh expects a whole directory "lang", so put everything in one directory...
# it gets L_disambig.fst and G.fst (among other things) from $new_lang, and
# final.mdl from $srcdir; the output HCLG.fst goes in $dir/dengraph.

if [ -s $dir/dengraph/HCLG.fst ]; then
   echo "Graph $dir/dengraph/HCLG.fst already exists: skipping graph creation."
else
  utils/mkgraph.sh $new_lang $srcdir $dir/dengraph || exit 1;
fi


## Set up features.
#
# The tandem features are assembled from two streams: stream 1 (from $data1)
# gets CMVN followed by either deltas or splicing + LDA, stream 2 (from
# $data2) is used as is or with CMVN, and paste-feats joins the two.  "JOB" in
# the rspecifiers is a placeholder for the job index.
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
normft2=`cat $srcdir/normft2 2>/dev/null` # "true" enables CMVN on stream 2.

# The presence of final.mat in the source directory selects the LDA front-end.
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi

case $feat_type in
  delta)
    echo "$0: feature type is $feat_type"
    ;;
  lda)
    echo "$0: feature type is $feat_type"
    cp $srcdir/{lda,final}.mat $dir/ || exit 1;
    ;;
  *) echo "$0: invalid feature type $feat_type" && exit 1;
esac

# set up feature stream 1;  these are usually spectral features, so we will add
# deltas or splice them
feats1="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata1/JOB/utt2spk scp:$sdata1/JOB/cmvn.scp scp:$sdata1/JOB/feats.scp ark:- |"

if [ "$feat_type" == "delta" ]; then
  feats1="$feats1 add-deltas ark:- ark:- |"
elif [ "$feat_type" == "lda" ]; then
  feats1="$feats1 splice-feats $splice_opts ark:- ark:- | transform-feats $dir/lda.mat ark:- ark:- |"
fi

# set up feature stream 2;  these are usually bottleneck or posterior features,
# which may be normalized if desired
feats2="scp:$sdata2/JOB/feats.scp"

if [ "$normft2" == "true" ]; then
  feats2="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata2/JOB/utt2spk scp:$sdata2/JOB/cmvn.scp $feats2 ark:- |"
fi

# assemble tandem features
feats="ark,s,cs:paste-feats '$feats1' '$feats2' ark:- |"

# add transformation, if applicable
if [ "$feat_type" == "lda" ]; then
  feats="$feats transform-feats $dir/final.mat ark:- ark:- |"
fi

# splicing/normalization options; any of these may be absent in $srcdir, which
# is why errors from cp are discarded.
cp $srcdir/{splice_opts,normft2,tandem} $dir 2>/dev/null

if [ ! -z "$transform_dir" ]; then # add transforms to features...
  echo "$0: using fMLLR transforms from $transform_dir"
  # Without trans.1 the feature pipeline below cannot work, so stop here
  # instead of only printing the message and carrying on.
  [ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist." && exit 1;
  # trans.JOB is read per job, so the job counts have to agree.
  [ "`cat $transform_dir/num_jobs`" -ne "$nj" ] \
    && echo "$0: mismatch in number of jobs with $transform_dir" && exit 1;
  # Differing LDA matrices are only reported, not treated as fatal.
  [ -f $srcdir/final.mat ] && ! cmp $transform_dir/final.mat $srcdir/final.mat && \
     echo "$0: LDA transforms differ between $srcdir and $transform_dir"
  feats="$feats transform-feats --utt2spk=ark:$sdata1/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
else
  echo "Assuming you don't have a SAT system, since no --transform-dir option supplied "
fi

# Gaussian-selection info is mandatory and is taken from the source directory.
if [ -f $srcdir/gselect.1.gz ]; then
  gselect_opt="--gselect=ark:gunzip -c $srcdir/gselect.JOB.gz|"
else
  echo "$0: no such file $srcdir/gselect.1.gz" && exit 1;
fi

# Speaker vectors are used when present.  Their absence is an error only if an
# alignment model exists; otherwise $spkvecs_opt simply stays empty.
if [ -f $srcdir/vecs.1 ]; then
  spkvecs_opt="--spk-vecs=ark:$srcdir/vecs.JOB --utt2spk=ark:$sdata1/JOB/utt2spk"
else
  if [ -f $srcdir/final.alimdl ]; then
    echo "$0: You seem to have an SGMM system with speaker vectors,"
    echo "yet we can't find speaker vectors.  Perhaps you supplied"
    echo "the model director instead of the alignment directory?"
    exit 1;
  fi
fi

# Generate the denominator lattices.
if [ $sub_split -eq 1 ]; then
  # Simple case: one decoding job per data split, writing $dir/lat.JOB.gz.
  $cmd JOB=1:$nj $dir/log/decode_den.JOB.log \
   sgmm2-latgen-faster $spkvecs_opt "$gselect_opt" --beam=$beam \
     --lattice-beam=$lattice_beam --acoustic-scale=$acwt \
     --max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $srcdir/final.mdl  \
     $dir/dengraph/HCLG.fst "$feats" "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1;
else
  # Sub-split case: the data splits are processed one after another; each is
  # cut into $sub_split per-utterance pieces that are decoded by one $cmd call
  # and then merged into a single lat.$n.gz.
  for n in `seq $nj`; do
    # A .done marker newer than the model means this subset can be skipped.
    if [ -f $dir/.done.$n ] && [ $dir/.done.$n -nt $srcdir/final.mdl ]; then
      echo "Not processing subset $n as already done (delete $dir/.done.$n if not)";
    else
      # NOTE(review): ssdata1/ssdata2 and $dir/part are set up here but not
      # referenced anywhere else in this script.
      ssdata1=$data1/split$nj/$n/split${sub_split}utt;
      split_data.sh --per-utt $sdata1/$n $sub_split || exit 1;
      ssdata2=$data2/split$nj/$n/split${sub_split}utt;
      split_data.sh --per-utt $sdata2/$n $sub_split || exit 1;
      mkdir -p $dir/log/$n
      mkdir -p $dir/part
      # Transforms stay per data split (trans.$n); every other JOB-indexed
      # feature path is redirected into the per-utterance sub-split of split $n.
      feats_subset=`echo $feats | sed "s/trans.JOB/trans.$n/g" | sed s:JOB/:$n/split${sub_split}utt/JOB/:g`
      # Speaker vectors and Gaussian selection are read from the files of the
      # enclosing data split $n, so JOB is replaced by $n throughout.
      spkvecs_opt_subset=`echo $spkvecs_opt | sed "s/JOB/$n/g"`
      gselect_opt_subset=`echo $gselect_opt | sed "s/JOB/$n/g"`
      $cmd JOB=1:$sub_split $dir/log/$n/decode_den.JOB.log \
        sgmm2-latgen-faster $spkvecs_opt_subset "$gselect_opt_subset" \
          --beam=$beam --lattice-beam=$lattice_beam \
          --acoustic-scale=$acwt --max-mem=$max_mem --max-active=$max_active \
          --word-symbol-table=$lang/words.txt $srcdir/final.mdl  \
          $dir/dengraph/HCLG.fst "$feats_subset" "ark:|gzip -c >$dir/lat.$n.JOB.gz" || exit 1;
      echo Merging archives for data subset $n
      # $dir/.error is used as a failure flag for the merge, because a failure
      # inside the piped loop cannot stop the script directly.
      rm $dir/.error 2>/dev/null;
      for k in `seq $sub_split`; do
        gunzip -c $dir/lat.$n.$k.gz || touch $dir/.error;
      done | gzip -c > $dir/lat.$n.gz || touch $dir/.error;
      [ -f $dir/.error ] && echo Merging lattices for subset $n failed && exit 1;
      rm $dir/lat.$n.*.gz
      # Marker that lets a re-run skip this subset.
      touch $dir/.done.$n
    fi
  done
fi


echo "$0: done generating denominator lattices with SGMMs."


================================================
FILE: egs/steps/tandem/mk_aslf_lda_mllt.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
#                 Korbinian Riedhammer

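# Rescores the existing decoding lattices ($dir/lat.*.gz) with the final model
# and converts them to SLF files in $dir/slf.  This can be on top of
# delta+delta-delta, or LDA+MLLT features; fMLLR transforms (from the decoding
# directory or from --transform-dir) are applied if they are found.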

# There are 3 models involved potentially in this script,
# and for a standard, speaker-independent system they will all be the same.
# The "alignment model" is for the 1st-pass decoding and to get the
# Gaussian-level alignments for the "adaptation model" the first time we
# do fMLLR.  The "adaptation model" is used to estimate fMLLR transforms
# and to generate state-level lattices.  The lattices are then rescored
# with the "final model".
#
# The following table explains where we get these 3 models from.
# Note: $srcdir is one level up from the decoding directory.
#
#   Model              Default source:
#
#  "alignment model"   $srcdir/final.alimdl              --alignment-model <model>
#                     (or $srcdir/final.mdl if alimdl absent)
#  "adaptation model"  $srcdir/final.mdl                 --adapt-model <model>
#  "final model"       $srcdir/final.mdl                 --final-model <model>


# Begin configuration section
# NOTE(review): of the options below, this script only reads final_model,
# adapt_model (existence check only), transform_dir, nj and cmd; the others
# are accepted but not referenced further down.
alignment_model=
adapt_model=
# Model used for word alignment and rescoring; defaults to $srcdir/final.mdl.
final_model=
# Directory with fMLLR transforms (trans.JOB) to apply to the features.
transform_dir=
stage=0
acwt=0.083333 # Acoustic weight used in getting fMLLR transforms, and also in
              # lattice generation.
max_active=7000
beam=13.0
lattice_beam=6.0
# Number of jobs; lattices are read as $dir/lat.JOB.gz for JOB=1..$nj.
nj=4
silence_weight=0.01
# Job launcher; it is invoked as "$cmd JOB=1:$nj <log-file> <command...>".
cmd=run.pl
si_dir=
fmllr_update_type=full
# End configuration section

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

# Four positional arguments are required; otherwise print the usage and exit.
if [ $# != 4 ]; then
   echo "Usage: steps/tandem/mk_aslf_lda_mllt.sh [options] <graph-dir> <data1-dir> <data2-dir> <decode-dir>"
   echo " e.g.: steps/tandem/mk_aslf_lda_mllt.sh exp/tri2b/graph {mfcc,bottleneck}/data/test_dev93 exp/tri2b/decode_dev93"
   echo "main options (for others, see top of script file)"
   echo "  --config <config-file>                   # config containing options"
   echo "  --nj <nj>                                # number of parallel jobs"
   echo "  --cmd <cmd>                              # Command to run in parallel with"
   echo "  --adapt-model <adapt-mdl>                # Model to compute transforms with"
   echo "  --alignment-model <ali-mdl>              # Model to get Gaussian-level alignments for"
   echo "                                           # 1st pass of transform computation."
   echo "  --final-model <finald-mdl>               # Model to finally decode with"
   echo "  --si-dir <speaker-indep-decoding-dir>    # use this to skip 1st pass of decoding"
   echo "                                           # Caution-- must be with same tree"
   echo "  --acwt <acoustic-weight>                 # default 0.08333 ... used to get posteriors"

   exit 1;
fi


# Positional arguments: decoding graph, the two feature data directories and
# the decoding directory that already holds the lattices (lat.*.gz).
graphdir=$1
data1=$2
data2=$3
dir=`echo $4 | sed 's:/$::g'` # remove any trailing slash.

srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory.

mkdir -p $dir/log

# The lattices in $dir were written by an earlier decoding run with a certain
# number of jobs.  Compare against that count BEFORE num_jobs is rewritten
# below; checking afterwards would compare $nj with itself.
if [ -f $dir/num_jobs ] && [ "$nj" -ne "`cat $dir/num_jobs`" ]; then
  echo "$0: --nj $nj does not match the number of jobs in $dir/num_jobs"
  exit 1;
fi

# Split both data directories into $nj parts unless an up-to-date split exists.
sdata1=$data1/split$nj;
sdata2=$data2/split$nj;
[[ -d $sdata1 && $data1/feats.scp -ot $sdata1 ]] || split_data.sh $data1 $nj || exit 1;
[[ -d $sdata2 && $data2/feats.scp -ot $sdata2 ]] || split_data.sh $data2 $nj || exit 1;

echo $nj > $dir/num_jobs

# Some checks.  Note: we don't need $srcdir/tree but we expect
# it should exist, given the current structure of the scripts.
for f in $graphdir/HCLG.fst $data1/feats.scp $data2/feats.scp $srcdir/tree; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done


## Some checks, and setting of defaults for variables.
# Both models default to the final model of the source directory.
[ -z "$adapt_model" ] && adapt_model=$srcdir/final.mdl
[ -z "$final_model" ] && final_model=$srcdir/final.mdl
for f in $adapt_model $final_model; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
##
##


# Set up features.
#
# The tandem features are assembled from two streams: stream 1 (from $data1)
# gets CMVN followed by either deltas or splicing + LDA, stream 2 (from
# $data2) is used as is or with CMVN, and paste-feats joins the two.  "JOB" in
# the rspecifiers is a placeholder for the job index.  Unlike the decoding
# scripts, the LDA matrices are read directly from $srcdir.

splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
normft2=`cat $srcdir/normft2 2>/dev/null` # "true" enables CMVN on stream 2.

# The presence of final.mat in the model directory selects the LDA front-end.
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi

case $feat_type in
  delta)
    echo "$0: feature type is $feat_type"
    ;;
  lda)
    echo "$0: feature type is $feat_type"
    ;;
  *) echo "$0: invalid feature type $feat_type" && exit 1;
esac

# set up feature stream 1;  these are usually spectral features, so we will add
# deltas or splice them
feats1="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata1/JOB/utt2spk scp:$sdata1/JOB/cmvn.scp scp:$sdata1/JOB/feats.scp ark:- |"

if [ "$feat_type" == "delta" ]; then
  feats1="$feats1 add-deltas ark:- ark:- |"
elif [ "$feat_type" == "lda" ]; then
  feats1="$feats1 splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/lda.mat ark:- ark:- |"
fi

# set up feature stream 2;  these are usually bottleneck or posterior features,
# which may be normalized if desired
feats2="scp:$sdata2/JOB/feats.scp"

if [ "$normft2" == "true" ]; then
  echo "Using cmvn for feats2"
  feats2="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata2/JOB/utt2spk scp:$sdata2/JOB/cmvn.scp $feats2 ark:- |"
fi

# assemble tandem features
sifeats="ark,s,cs:paste-feats '$feats1' '$feats2' ark:- |"

# add transformation, if applicable
if [ "$feat_type" == "lda" ]; then
  sifeats="$sifeats transform-feats $srcdir/final.mat ark:- ark:- |"
fi

# Start from the speaker-independent features so that $feats is always
# defined, then add fMLLR transforms when some are available: first choice
# are transforms in the decoding directory itself, second choice the ones
# in --transform-dir.
feats="$sifeats"
if [ -e $dir/trans.1 ]; then
  echo "Using fMLLR transforms in $dir"
  feats="$sifeats transform-feats --utt2spk=ark:$sdata1/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |"
elif [ ! -z "$transform_dir" ]; then
  echo "$0: using transforms from $transform_dir"
  [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1;
  # trans.JOB is read per job, so the job counts have to agree.
  [ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \
    && echo "$0: #jobs mismatch with transform-dir." && exit 1;
  feats="$sifeats transform-feats --utt2spk=ark:$sdata1/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |"
elif grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then
  # The training log shows per-speaker transforms but none were found here.
  echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms,"
  echo "  but you are not providing the --transform-dir option in test time."
fi


# Word-align the lattices and rescore them with the final adapted features and the
# final model (which by default is $srcdir/final.mdl, but which may be specified on
# the command line, useful in case of discriminatively trained systems).
# The rescored lattices are written out in text form, their word ids are mapped to
# word symbols, and the result is converted to SLF files in $dir/slf.

echo "Rescoring lattices, converting to slf"
mkdir -p $dir/slf
$cmd JOB=1:$nj $dir/log/rescore.slf.JOB.log \
  lattice-align-words $graphdir/phones/word_boundary.int $final_model "ark:gunzip -c $dir/lat.JOB.gz |" ark:- \| \
  gmm-rescore-lattice $final_model ark:- "$feats" ark,t:- \| \
  utils/int2sym.pl -f 3 $graphdir/words.txt \| \
  utils/convert_slf.pl - $dir/slf

exit 0;


================================================
FILE: egs/steps/tandem/mk_aslf_sgmm2.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
#                 Korbinian Riedhammer

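# Rescores the lattices of an existing decoding directory ($dir/lat.JOB.gz) with
# an SGMM2 model on tandem features and converts them to SLF files in $dir/slf.
# This can be on top of delta+delta-delta, or LDA+MLLT features.  No decoding and
# no transform estimation is done here; previously estimated fMLLR transforms are
# only applied.  NOTE: of the models described below, only the "final model" is
# actually used for the rescoring in this script.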

# There are 3 models involved potentially in this script,
# and for a standard, speaker-independent system they will all be the same.
# The "alignment model" is for the 1st-pass decoding and to get the
# Gaussian-level alignments for the "adaptation model" the first time we
# do fMLLR.  The "adaptation model" is used to estimate fMLLR transforms
# and to generate state-level lattices.  The lattices are then rescored
# with the "final model".
#
# The following table explains where we get these 3 models from.
# Note: $srcdir is one level up from the decoding directory.
#
#   Model              Default source:
#
#  "alignment model"   $srcdir/final.alimdl              --alignment-model <model>
#                     (or $srcdir/final.mdl if alimdl absent)
#  "adaptation model"  $srcdir/final.mdl                 --adapt-model <model>
#  "final model"       $srcdir/final.mdl                 --final-model <model>


# Begin configuration section
# Every variable below can be overridden on the command line through
# parse_options.sh (e.g. --nj 8, --final-model <mdl>).

# Job control.
nj=4
cmd=run.pl
stage=0

# Models, transforms and directories; left empty here.
alignment_model=
adapt_model=
final_model=
transform_dir=
si_dir=

# Search and adaptation settings.
acwt=0.083333 # Acoustic weight used in getting fMLLR transforms, and also in
              # lattice generation.
max_active=7000
beam=13.0
lattice_beam=6.0
silence_weight=0.01
fmllr_update_type=full
# End configuration section

# Log the complete command line.
echo "$0 $@"

# Pull in the environment when a path.sh is present, then let
# parse_options.sh consume the --option arguments.
if [ -f ./path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1

# Exactly four positional arguments must remain; otherwise print the usage
# text and give up.
if [ $# -ne 4 ]; then
  cat <<'EOF'
Usage: steps/tandem/mk_aslf_sgmm2.sh [options] <graph-dir> <data1-dir> <data2-dir> <decode-dir>
 e.g.: steps/tandem/mk_aslf_sgmm2.sh exp/tri2b/graph {mfcc,bottleneck}/data/test_dev93 exp/tri2b/decode_dev93
main options (for others, see top of script file)
  --config <config-file>                   # config containing options
  --nj <nj>                                # number of parallel jobs
  --cmd <cmd>                              # Command to run in parallel with
  --adapt-model <adapt-mdl>                # Model to compute transforms with
  --alignment-model <ali-mdl>              # Model to get Gaussian-level alignments for
                                           # 1st pass of transform computation.
  --final-model <finald-mdl>               # Model to finally decode with
  --si-dir <speaker-indep-decoding-dir>    # use this to skip 1st pass of decoding
                                           # Caution-- must be with same tree
  --acwt <acoustic-weight>                 # default 0.08333 ... used to get posteriors
EOF
  exit 1
fi


# Positional arguments.
graphdir=$1   # graph directory (must contain HCLG.fst)
data1=$2      # data directory of feature stream 1
data2=$3      # data directory of feature stream 2
dir=`echo $4 | sed 's:/$::g'` # decoding directory; remove any trailing slash.

srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory.

mkdir -p $dir/log

# Some checks.  Note: we don't need $srcdir/tree but we expect
# it should exist, given the current structure of the scripts.
for f in $graphdir/HCLG.fst $data1/feats.scp $data2/feats.scp $srcdir/tree; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

# The lattices read later ($dir/lat.JOB.gz) are indexed by job number, so $nj
# has to agree with the number of jobs already recorded in the decoding
# directory.  The comparison must happen BEFORE num_jobs is (re)written:
# writing first would make the check compare $nj with itself and silently
# destroy the recorded value.
if [ -f $dir/num_jobs ]; then
  dir_nj=`cat $dir/num_jobs` || exit 1;
  if [ "$nj" -ne "$dir_nj" ]; then
    echo "$0: mismatch in #jobs: --nj is $nj but $dir/num_jobs says $dir_nj"
    exit 1;
  fi
else
  echo $nj > $dir/num_jobs
fi

# Split both data directories into $nj parts unless an up-to-date split
# (newer than feats.scp) already exists.
sdata1=$data1/split$nj;
sdata2=$data2/split$nj;
[[ -d $sdata1 && $data1/feats.scp -ot $sdata1 ]] || split_data.sh $data1 $nj || exit 1;
[[ -d $sdata2 && $data2/feats.scp -ot $sdata2 ]] || split_data.sh $data2 $nj || exit 1;

## Defaults for the model options, and a check that the models exist.
[ -z "$adapt_model" ] && adapt_model=$srcdir/final.mdl
[ -z "$final_model" ] && final_model=$srcdir/final.mdl
for f in $adapt_model $final_model; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
##


# Set up features.

# Options stored next to the model; both files are optional, in which case
# the variables stay empty.
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
normft2=`cat $srcdir/normft2 2>/dev/null`

# The presence of $srcdir/final.mat selects the LDA-type pipeline, otherwise
# the delta-type pipeline is used; no other feature type can occur here.
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"

# set up feature stream 1;  these are usually spectral features, so we will add
# deltas or splice them
feats1="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata1/JOB/utt2spk scp:$sdata1/JOB/cmvn.scp scp:$sdata1/JOB/feats.scp ark:- |"

if [ "$feat_type" == "delta" ]; then
  feats1="$feats1 add-deltas ark:- ark:- |"
elif [ "$feat_type" == "lda" ]; then
  feats1="$feats1 splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/lda.mat ark:- ark:- |"
fi

# set up feature stream 2;  these are usually bottleneck or posterior features,
# which may be normalized if desired
feats2="scp:$sdata2/JOB/feats.scp"

if [ "$normft2" == "true" ]; then
  echo "Using cmvn for feats2"
  feats2="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata2/JOB/utt2spk scp:$sdata2/JOB/cmvn.scp $feats2 ark:- |"
fi

# assemble tandem features
sifeats="ark,s,cs:paste-feats '$feats1' '$feats2' ark:- |"

# add transformation, if applicable
if [ "$feat_type" == "lda" ]; then
  sifeats="$sifeats transform-feats $srcdir/final.mat ark:- ark:- |"
fi

# Without any speaker transforms the rescoring uses the speaker-independent
# features; without this default $feats would be empty in that case.
feats="$sifeats"

# Append per-speaker transforms: first choice are transforms in the decoding
# directory itself, second choice the ones from --transform-dir.
# NOTE(review): only one of the two sets is applied; confirm this is intended
# for systems that were decoded with both kinds of transforms.
if [ -e $dir/trans.1 ]; then
  echo "Using fMLLR transforms in $dir"
  feats="$sifeats transform-feats --utt2spk=ark:$sdata1/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |"
elif [ ! -z "$transform_dir" ]; then
  echo "$0: using transforms from $transform_dir"
  [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1;
  [ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \
    && echo "$0: #jobs mismatch with transform-dir." && exit 1;
  feats="$sifeats transform-feats --utt2spk=ark:$sdata1/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |"
elif grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then
  # The training log shows per-speaker transforms were used, but none are
  # available now: warn, and carry on with the untransformed features.
  echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms,"
  echo "  but you are not providing the --transform-dir option in test time."
fi


# Word-align the lattices in $dir/lat.JOB.gz and rescore them with the final
# adapted features and the final model (which by default is $srcdir/final.mdl,
# but which may be specified on the command line, useful in case of
# discriminatively trained systems).  The rescoring reads the speaker vectors
# ($dir/vecs.JOB) and Gaussian selection ($dir/gselect.JOB.gz) from the decoding
# directory.  The rescored lattices are written in text form, word ids are
# mapped to word symbols and the result is converted to SLF files in $dir/slf.

echo "Rescoring lattices, converting to slf"
mkdir -p $dir/slf
# Fail the whole script if any of the parallel jobs fails, instead of
# unconditionally reporting success below.
$cmd JOB=1:$nj $dir/log/rescore.slf.JOB.log \
  lattice-align-words $graphdir/phones/word_boundary.int $final_model "ark:gunzip -c $dir/lat.JOB.gz |" ark:- \| \
  sgmm2-rescore-lattice --spk-vecs=ark:$dir/vecs.JOB --utt2spk=ark:$sdata1/JOB/utt2spk \
    "--gselect=ark:gunzip -c $dir/gselect.JOB.gz |" $final_model ark:- "$feats" ark,t:- \| \
  utils/int2sym.pl -f 3 $graphdir/words.txt \| \
  utils/convert_slf.pl - $dir/slf || exit 1;

exit 0;


================================================
FILE: egs/steps/tandem/train_deltas.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
#                 Korbinian Riedhammer
# Apache 2.0

# Begin configuration.
# All of these can be overridden on the command line via parse_options.sh.

# Job control.
cmd=run.pl
config=
stage=-4 #  Allows restarting part-way through, when something went wrong.

# Training schedule.
num_iters=35    # The training loop runs for iterations 1 .. num_iters-1
max_iter_inc=25 # Last iter to increase #Gauss on.
realign_iters="10 20 30";
power=0.2 # Exponent for number of gaussians according to occurrence counts
cluster_thresh=-1  # for build-tree control final bottom-up clustering of leaves

# Alignment.
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
beam=10
retry_beam=40
boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment

# Features.
normft2=true  # apply CMVN to the second feature stream (see --normft2)
# End configuration.

# Log the complete command line.
echo "$0 $@"

# Pull in the environment when a path.sh is present, then let
# parse_options.sh consume the --option arguments.
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1

# Exactly seven positional arguments must remain; otherwise print the usage
# text and give up.
if [ $# -ne 7 ]; then
  cat <<'EOF'
Usage: steps/tandem/train_deltas.sh <num-leaves> <tot-gauss> <data1-dir> <data2-dir> <lang-dir> <alignment-dir> <exp-dir>
 e.g.: steps/tandem/train_deltas.sh 2000 10000 {mfcc,bottleneck}/data/train_si84_half data/lang exp/mono_ali exp/tri1
main options (for others, see top of script file)
  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs.
  --config <config-file>                           # config containing options
  --stage <stage>                                  # stage to do partial re-run from.
  --normft2 (true|false)                           # apply CMVN to second features?
EOF
  exit 1
fi

# Positional arguments.
numleaves=$1  # maximum number of tree leaves
totgauss=$2   # target total number of Gaussians
data1=$3      # data directory of feature stream 1
data2=$4      # data directory of feature stream 2
lang=$5       # lang directory
alidir=$6     # directory with the input alignments and model
dir=$7        # output (experiment) directory

# Make sure the required inputs exist; report the failure under this script's
# own name.
for f in $alidir/final.mdl $alidir/ali.1.gz $data1/feats.scp $data2/feats.scp $lang/phones.txt; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

# Start with one Gaussian per leaf and grow towards $totgauss in equal steps
# over the first $max_iter_inc iterations.
numgauss=$numleaves
incgauss=$(( ($totgauss-$numgauss)/$max_iter_inc )) # per-iter increment for #Gauss
oov=`cat $lang/oov.int` || exit 1;
ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1;
# Reuse the number of jobs of the alignment directory, since the alignments
# are read per job (ali.JOB.gz).
nj=`cat $alidir/num_jobs` || exit 1;
mkdir -p $dir/log
echo $nj > $dir/num_jobs

# The phone set of the lang directory has to match the one used for the
# alignments; keep a copy of it with the new model.
utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

# Feature pipelines.

# Per-job split directories; (re)create a split when it is missing or not
# newer than the corresponding feats.scp.
sdata1=$data1/split$nj
sdata2=$data2/split$nj
if ! [[ -d $sdata1 && $data1/feats.scp -ot $sdata1 ]]; then
  split_data.sh $data1 $nj || exit 1
fi
if ! [[ -d $sdata2 && $data2/feats.scp -ot $sdata2 ]]; then
  split_data.sh $data2 $nj || exit 1
fi

# Stream 1 (usually spectral features): CMVN followed by deltas.
feats1="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata1/JOB/utt2spk scp:$sdata1/JOB/cmvn.scp scp:$sdata1/JOB/feats.scp ark:- |"
feats1="$feats1 add-deltas ark:- ark:- |"

# Stream 2 (usually bottleneck/posteriors): read as-is, or through CMVN when
# --normft2 is true.
if [ "$normft2" == "true" ]; then
  feats2="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata2/JOB/utt2spk scp:$sdata2/JOB/cmvn.scp scp:$sdata2/JOB/feats.scp ark:- |"
else
  feats2="scp:$sdata2/JOB/feats.scp"
fi

# The tandem features are the two streams pasted together.
feats="ark,s,cs:paste-feats '$feats1' '$feats2' ark:- |"

# Record the feature pipeline and the normalization setting with the model.
echo $feats > $dir/tandem
echo $normft2 > $dir/normft2

# Remove a stale error marker, if there is one.
rm $dir/.error 2>/dev/null

# Stage -3: accumulate tree statistics per job from the input alignments
# ($alidir/ali.JOB.gz) and the tandem features, then sum the per-job files
# into $dir/treeacc and remove them.
if [ $stage -le -3 ]; then
  echo "$0: accumulating tree stats"
  $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \
    acc-tree-stats  --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \
     "ark:gunzip -c $alidir/ali.JOB.gz|" $dir/JOB.treeacc || exit 1;
  sum-tree-stats $dir/treeacc $dir/*.treeacc 2>$dir/log/sum_tree_acc.log || exit 1;
  rm $dir/*.treeacc
fi

# Stage -2: derive the questions, build the tree with at most $numleaves
# leaves and initialize the first model ($dir/1.mdl, $dir/1.occs) from it.
if [ $stage -le -2 ]; then
  echo "$0: getting questions for tree-building, via clustering"
  # preparing questions, roots file...
  cluster-phones $dir/treeacc $lang/phones/sets.int $dir/questions.int 2> $dir/log/questions.log || exit 1;
  # The extra questions of the lang directory are appended to the clustered ones.
  cat $lang/phones/extra_questions.int >> $dir/questions.int
  compile-questions $lang/topo $dir/questions.int $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1;

  echo "$0: building the tree"
  $cmd $dir/log/build_tree.log \
    build-tree --verbose=1 --max-leaves=$numleaves \
    --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \
    $dir/questions.qst $lang/topo $dir/tree || exit 1;

  gmm-init-model  --write-occs=$dir/1.occs  \
    $dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/log/init_model.log || exit 1;
  # A 'no stats' message in the initialization log is only reported, the
  # script carries on regardless.
  grep 'no stats' $dir/log/init_model.log && echo "This is a bad warning.";

  # Mix up the initial model in place to $numgauss Gaussians; the tree
  # statistics are not needed afterwards.
  gmm-mixup --mix-up=$numgauss $dir/1.mdl $dir/1.occs $dir/1.mdl 2>$dir/log/mixup.log || exit 1;
  rm $dir/treeacc
fi

# Stage -1: convert the input alignments to the new tree/model and store
# them per job in $dir/ali.JOB.gz.
if [ $stage -le -1 ]; then
  # Convert the alignments.
  echo "$0: converting alignments from $alidir to use current tree"
  $cmd JOB=1:$nj $dir/log/convert.JOB.log \
    convert-ali $alidir/final.mdl $dir/1.mdl $dir/tree \
     "ark:gunzip -c $alidir/ali.JOB.gz|" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi

# Stage 0: compile the training graphs of the transcripts (read from the
# split of data1, with out-of-vocabulary words mapped to $oov) into
# $dir/fsts.JOB.gz.
if [ $stage -le 0 ]; then
  echo "$0: compiling graphs of transcripts"
  $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
    compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/1.mdl  $lang/L.fst  \
     "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $data1/split$nj/JOB/text |" \
      "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
fi

# Main training loop: iteration $x reads $dir/$x.mdl and writes the model of
# iteration $x+1.  Iterations below $stage are skipped (restart support), but
# the Gaussian target is still advanced so that a restart continues with the
# same schedule.
x=1
while [ $x -lt $num_iters ]; do
  echo "$0: training pass $x"
  if [ $stage -le $x ]; then
    # Re-align the data on the iterations listed in $realign_iters, using a
    # copy of the current model with boosted silence likelihoods.
    if echo $realign_iters | grep -w $x >/dev/null; then
      echo "$0: aligning data"
      mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/$x.mdl - |"
      $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \
        gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$mdl" \
         "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \
         "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
    fi
    # Accumulate statistics per job, then update the model from their sum.
    $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
      gmm-acc-stats-ali  $dir/$x.mdl "$feats" \
       "ark,s,cs:gunzip -c $dir/ali.JOB.gz|" $dir/$x.JOB.acc || exit 1;
    $cmd $dir/log/update.$x.log \
      gmm-est --mix-up=$numgauss --power=$power \
        --write-occs=$dir/$(($x+1)).occs $dir/$x.mdl \
       "gmm-sum-accs - $dir/$x.*.acc |" $dir/$(($x+1)).mdl || exit 1;
    # The files of this iteration are no longer needed.
    rm $dir/$x.mdl $dir/$x.*.acc
    rm $dir/$x.occs
  fi
  [ $x -le $max_iter_inc ] && numgauss=$(($numgauss+$incgauss));
  x=$(($x+1));
done

# Point final.mdl / final.occs at the last model.  Both old links are removed
# first, so that re-running the script does not fail on an existing final.occs.
rm $dir/final.mdl $dir/final.occs 2>/dev/null
ln -s $x.mdl $dir/final.mdl
ln -s $x.occs $dir/final.occs

# Summarize warning messages...
utils/summarize_warnings.pl  $dir/log

echo "$0: Done training tandem system in $dir"


================================================
FILE: egs/steps/tandem/train_lda_mllt.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
#                 Korbinian Riedhammer
# Apache 2.0.

# Begin configuration.
# All of these can be overridden on the command line via parse_options.sh.

# Job control.
cmd=run.pl
config=
stage=-5

# Training schedule.
num_iters=35    # Number of iterations of training
max_iter_inc=25  # Last iter to increase #Gauss on.
realign_iters="10 20 30";
mllt_iters="2 4 6 12";
power=0.2 # Exponent for number of gaussians according to occurrence counts
cluster_thresh=-1  # for build-tree control final bottom-up clustering of leaves
randprune=4.0 # This is approximately the ratio by which we will speed up the
              # LDA and MLLT calculations via randomized pruning.

# Alignment.
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
beam=10
retry_beam=40
boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment

# Features.
splice_opts=     # options handed to splice-feats for the first stream
dim1=30  # dimension first stream (spectral features)
dim2=40  # dimension second stream (pasted features, usually bn/posteriors)
normft2=true     # apply CMVN to the second feature stream
extra_lda=false  # do an extra LDA after pasting the features?

# End configuration.

# Log the complete command line.
echo "$0 $@"

# Pull in the environment when a path.sh is present, then let
# parse_options.sh consume the --option arguments.
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1

# Exactly seven positional arguments must remain; otherwise print the usage
# text and give up.
if [ $# -ne 7 ]; then
  cat <<'EOF'
Usage: steps/tandem/train_lda_mllt.sh [options] <#leaves> <#gauss> <data1> <data2> <lang> <alignments> <dir>
 e.g.: steps/tandem/train_lda_mllt.sh 2500 15000 {mfcc,bottleneck}/data/train_si84 data/lang exp/tri1_ali_si84 exp/tri2b
Main options (for others, see top of script file)
  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs.
  --config <config-file>                           # config containing options
  --stage <stage>                                  # stage to do partial re-run from.
  --normft2 (true|false)                           # apply CMVN to second data set (true)
  --extra-lda (true|false)                         # apply extra LDA after feature paste (false)
  --dim1 <n>                                       # dimension of the first feature stream by HLDA
  --dim2 <m>                                       # dimension of of the pasted features after 2nd HLDA
EOF
  exit 1
fi

# Positional arguments.
numleaves=$1  # maximum number of tree leaves
totgauss=$2   # target total number of Gaussians
data1=$3      # data directory of feature stream 1
data2=$4      # data directory of feature stream 2
lang=$5       # lang directory
alidir=$6     # directory with the input alignments and model
dir=$7        # output (experiment) directory

# Make sure the required inputs exist; report the failure under this script's
# own name.
for f in $alidir/final.mdl $alidir/ali.1.gz $data1/feats.scp $data2/feats.scp $lang/phones.txt; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

# Start with one Gaussian per leaf and grow towards $totgauss in equal steps
# over the first $max_iter_inc iterations.
numgauss=$numleaves
incgauss=$(( ($totgauss-$numgauss)/$max_iter_inc )) # per-iter #gauss increment
oov=`cat $lang/oov.int` || exit 1;
# Reuse the number of jobs of the alignment directory, since the alignments
# are read per job (ali.JOB.gz).
nj=`cat $alidir/num_jobs` || exit 1;
silphonelist=`cat $lang/phones/silence.csl` || exit 1;
ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1;

mkdir -p $dir/log
echo $nj >$dir/num_jobs

# The phone set of the lang directory has to match the one used for the
# alignments; keep a copy of it with the new model.
utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;


# Feature pipelines.

# Per-job split directories; (re)create a split when it is missing or not
# newer than the corresponding feats.scp.
sdata1=$data1/split$nj
sdata2=$data2/split$nj
if ! [[ -d $sdata1 && $data1/feats.scp -ot $sdata1 ]]; then
  split_data.sh $data1 $nj || exit 1
fi
if ! [[ -d $sdata2 && $data2/feats.scp -ot $sdata2 ]]; then
  split_data.sh $data2 $nj || exit 1
fi

# Stream 1 is assumed to hold spectral features: CMVN, then frame splicing
# (rather than deltas).
feats1="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata1/JOB/utt2spk scp:$sdata1/JOB/cmvn.scp scp:$sdata1/JOB/feats.scp ark:- |"
feats1="$feats1 splice-feats $splice_opts ark:- ark:- |"

# Stage -5: estimate an LDA on the spliced stream-1 features only (the tandem
# features are assumed to be discriminatively trained already).  Silence
# frames get weight 0.0 in the statistics; the result is $dir/lda.mat with
# $dim1 output dimensions.
if [ $stage -le -5 ]; then
  echo "Accumulating LDA statistics (this only applies to the base feature part)."
  $cmd JOB=1:$nj $dir/log/lda_acc.JOB.log \
    ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
    weight-silence-post 0.0 $silphonelist $alidir/final.mdl ark:- ark:- \| \
    acc-lda --rand-prune=$randprune $alidir/final.mdl "$feats1" ark,s,cs:- \
    $dir/lda.JOB.acc || exit 1
  est-lda --write-full-matrix=$dir/full.mat --dim=$dim1 $dir/lda.mat $dir/lda.*.acc \
    2>$dir/log/lda_est.log || exit 1
  rm $dir/lda.*.acc
fi

# From here on stream 1 includes the LDA transform.
feats1="$feats1 transform-feats $dir/lda.mat ark:- ark:- |"

# Stream 2 (usually bottleneck or posterior features): read as-is, or through
# CMVN when --normft2 is true.
if [ "$normft2" == "true" ]; then
  feats2="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata2/JOB/utt2spk scp:$sdata2/JOB/cmvn.scp scp:$sdata2/JOB/feats.scp ark:- |"
else
  feats2="scp:$sdata2/JOB/feats.scp"
fi

# Paste both streams into the tandem features.  $feats starts out as a plain
# copy and is replaced further down once transform matrices are available.
tandemfeats="ark,s,cs:paste-feats '$feats1' '$feats2' ark:- |"
feats="$tandemfeats"

# Record the splicing and normalization options with the model.
echo $splice_opts > $dir/splice_opts
echo $normft2 > $dir/normft2


# Begin training;  initially, we have no MLLT matrix
cur_mllt_iter=0

# Optional extra LDA on the pasted (tandem) features, reducing them to $dim2
# dimensions; the matrix is stored as $dir/0.mat.
if [ "$extra_lda" == true ]; then
  if [ $stage -le -4 ]; then
    echo "Accumulating LDA statistics (for tandem features this time)."
    $cmd JOB=1:$nj $dir/log/lda_acc.JOB.log \
      ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
      weight-silence-post 0.0 $silphonelist $alidir/final.mdl ark:- ark:- \| \
      acc-lda --rand-prune=$randprune $alidir/final.mdl "$tandemfeats" ark,s,cs:- \
      $dir/lda.JOB.acc || exit 1;
    est-lda --write-full-matrix=$dir/full.mat --dim=$dim2 $dir/0.mat $dir/lda.*.acc \
      2>$dir/log/lda_est.log || exit 1;
    rm $dir/lda.*.acc
  fi

  # The transform has to be part of the features whenever the extra LDA is
  # enabled, also when the script is restarted at a later stage and the
  # estimation above is skipped; otherwise all following stages would
  # silently run on the untransformed features.
  feats="$tandemfeats transform-feats $dir/0.mat ark:- ark:- |"
fi

# keep track of the features
echo $feats > $dir/tandem

# Stage -3: accumulate tree statistics per job from the input alignments
# ($alidir/ali.JOB.gz) and the current features, verify that one statistics
# file per job was written, then sum them into $dir/treeacc.
if [ $stage -le -3 ]; then
  echo "Accumulating tree stats"
  $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \
   acc-tree-stats  --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \
     "ark:gunzip -c $alidir/ali.JOB.gz|" $dir/JOB.treeacc || exit 1;
  [ `ls $dir/*.treeacc | wc -w` -ne "$nj" ] && echo "Wrong #tree-accs" && exit 1;
  $cmd $dir/log/sum_tree_acc.log \
    sum-tree-stats $dir/treeacc $dir/*.treeacc || exit 1;
  rm $dir/*.treeacc
fi


# Stage -2: derive the questions, build the tree with at most $numleaves
# leaves and initialize the first model ($dir/1.mdl, $dir/1.occs) from it.
if [ $stage -le -2 ]; then
  echo "Getting questions for tree clustering."
  # preparing questions, roots file...
  cluster-phones $dir/treeacc $lang/phones/sets.int $dir/questions.int 2> $dir/log/questions.log || exit 1;
  # The extra questions of the lang directory are appended to the clustered ones.
  cat $lang/phones/extra_questions.int >> $dir/questions.int
  compile-questions $lang/topo $dir/questions.int $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1;

  echo "Building the tree"
  $cmd $dir/log/build_tree.log \
    build-tree --verbose=1 --max-leaves=$numleaves \
    --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \
    $dir/questions.qst $lang/topo $dir/tree || exit 1;

  gmm-init-model  --write-occs=$dir/1.occs  \
    $dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/log/init_model.log || exit 1;
  # A 'no stats' message in the initialization log is only reported, the
  # script carries on regardless.
  grep 'no stats' $dir/log/init_model.log && echo "This is a bad warning.";

  # could mix up if we wanted:
  # gmm-mixup --mix-up=$numgauss $dir/1.mdl $dir/1.occs $dir/1.mdl 2>$dir/log/mixup.log || exit 1;
  rm $dir/treeacc
fi


# Stage -1: convert the input alignments to the new tree/model and store
# them per job in $dir/ali.JOB.gz.
if [ $stage -le -1 ]; then
  # Convert the alignments.
  echo "Converting alignments from $alidir to use current tree"
  $cmd JOB=1:$nj $dir/log/convert.JOB.log \
    convert-ali $alidir/final.mdl $dir/1.mdl $dir/tree \
     "ark:gunzip -c $alidir/ali.JOB.gz|" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi

# Stage 0: compile the training graphs of the transcripts (read from the
# split of data1, with out-of-vocabulary words mapped to $oov) into
# $dir/fsts.JOB.gz.
if [ $stage -le 0 ]; then
  echo "Compiling graphs of transcripts"
  $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
    compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/1.mdl  $lang/L.fst  \
     "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $data1/split$nj/JOB/text |" \
      "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
fi


# Main training loop: iteration $x reads $dir/$x.mdl and writes the model of
# iteration $x+1.  Work of iterations below $stage is skipped (restart
# support), but the feature pipeline and the Gaussian target are still
# advanced so that a restart continues in the same state.
x=1
while [ $x -lt $num_iters ]; do
  echo Training pass $x
  # Re-align the data on the iterations listed in $realign_iters, using a
  # copy of the current model with boosted silence likelihoods.
  if echo $realign_iters | grep -w $x >/dev/null && [ $stage -le $x ]; then
    echo Aligning data
    mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/$x.mdl - |"
    $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \
      gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$mdl" \
      "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \
      "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
  fi
  # MLLT update on the iterations listed in $mllt_iters.
  if echo $mllt_iters | grep -w $x >/dev/null; then
    if [ $stage -le $x ]; then
      echo "Estimating MLLT"
      # Statistics are accumulated with silence frames weighted 0.0.
      $cmd JOB=1:$nj $dir/log/macc.$x.JOB.log \
        ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \
        weight-silence-post 0.0 $silphonelist $dir/$x.mdl ark:- ark:- \| \
        gmm-acc-mllt --rand-prune=$randprune  $dir/$x.mdl "$feats" ark:- $dir/$x.JOB.macc \
        || exit 1;
      est-mllt $dir/$x.mat.new $dir/$x.*.macc 2> $dir/log/mupdate.$x.log || exit 1;
      # The model means are transformed in place with the new matrix.
      gmm-transform-means  $dir/$x.mat.new $dir/$x.mdl $dir/$x.mdl \
        2> $dir/log/transform_means.$x.log || exit 1;

      # see if this is the first MLLT iteration and there is no lda;  otherwise compose transforms
      if [ $cur_mllt_iter == 0 ] && [ "$extra_lda" == false ]; then
        mv $dir/$x.mat.new $dir/$x.mat || exit 1;
      else
        compose-transforms --print-args=false $dir/$x.mat.new $dir/$cur_mllt_iter.mat $dir/$x.mat || exit 1;
      fi

      rm $dir/$x.*.macc
    fi

    # update features (also on skipped iterations, so that a restart picks up
    # the matrix of the most recent MLLT iteration)
    feats="$tandemfeats transform-feats $dir/$x.mat ark:- ark:- |"
    cur_mllt_iter=$x
  fi

  if [ $stage -le $x ]; then
    # Accumulate statistics per job, then update the model from their sum.
    $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
      gmm-acc-stats-ali  $dir/$x.mdl "$feats" \
      "ark,s,cs:gunzip -c $dir/ali.JOB.gz|" $dir/$x.JOB.acc || exit 1;
    $cmd $dir/log/update.$x.log \
      gmm-est --write-occs=$dir/$(($x+1)).occs --mix-up=$numgauss --power=$power \
        $dir/$x.mdl "gmm-sum-accs - $dir/$x.*.acc |" $dir/$(($x+1)).mdl || exit 1;
    # The files of this iteration are no longer needed.
    rm $dir/$x.mdl $dir/$x.*.acc $dir/$x.occs
  fi
  [ $x -le $max_iter_inc ] && numgauss=$(($numgauss+$incgauss));
  x=$(($x+1));
done

# Point the final.* links at the last model, its occupancies and the matrix
# of the last MLLT iteration; old links are removed first.
rm $dir/final.{mdl,mat,occs} 2>/dev/null
ln -s $x.mdl $dir/final.mdl
ln -s $x.occs $dir/final.occs
ln -s $cur_mllt_iter.mat $dir/final.mat

# Summarize warning messages...

utils/summarize_warnings.pl $dir/log

echo Done training system with LDA+MLLT tandem features in $dir


================================================
FILE: egs/steps/tandem/train_mllt.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
#                 Korbinian Riedhammer
# Apache 2.0.

# This is a vanilla tandem system where the first stream is just extended with
# delta+delta-deltas, in contrast to the train_lda_mllt.sh script, where the
# temporal context of the first stream is modeled via HLDA

# Begin configuration.
cmd=run.pl
config=
stage=-5
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
realign_iters="10 20 30";
mllt_iters="2 4 6 12";
num_iters=35    # Number of iterations of training
max_iter_inc=25  # Last iter to increase #Gauss on.
beam=10
retry_beam=40
boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment
power=0.2 # Exponent for number of gaussians according to occurrence counts
randprune=4.0 # This is approximately the ratio by which we will speed up the
              # LDA and MLLT calculations via randomized pruning.
cluster_thresh=-1  # for build-tree control final bottom-up clustering of leaves

# apply CMVN to the second feature stream?
normft2=true

# Do additional LDA after pasting the features
dim2=40
extra_lda=false

# End configuration.

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh
. parse_options.sh || exit 1;

if [ $# != 7 ]; then
  echo "Usage: steps/tandem/train_mllt.sh [options] <#leaves> <#gauss> <data1> <data2> <lang> <alignments> <dir>"
  echo " e.g.: steps/tandem/train_mllt.sh 2500 15000 {mfcc,bottleneck}/data/train_si84 data/lang exp/tri1_ali_si84 exp/tri2b"
  echo "Main options (for others, see top of script file)"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --config <config-file>                           # config containing options"
  echo "  --stage <stage>                                  # stage to do partial re-run from."
  echo "  --normft2 (true|false)                           # apply CMVN to second data set (true)"
  echo "  --extra-lda (true|false)                         # apply extra LDA after feature paste (false)"
  echo "  --dim2 <n>                                       # dimension of the pasted features after 2nd HLDA"
  exit 1;
fi

numleaves=$1
totgauss=$2
data1=$3
data2=$4
lang=$5
alidir=$6
dir=$7

for f in $alidir/final.mdl $alidir/ali.1.gz $data1/feats.scp $data2/feats.scp $lang/phones.txt; do
  [ ! -f $f ] && echo "train_tandem_lda_mllt.sh: no such file $f" && exit 1;
done

numgauss=$numleaves
incgauss=$[($totgauss-$numgauss)/$max_iter_inc] # per-iter #gauss increment
oov=`cat $lang/oov.int` || exit 1;
nj=`cat $alidir/num_jobs` || exit 1;
silphonelist=`cat $lang/phones/silence.csl` || exit 1;
ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1;

mkdir -p $dir/log
echo $nj >$dir/num_jobs

utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;


# Set up features.

sdata1=$data1/split$nj;
sdata2=$data2/split$nj;
[[ -d $sdata1 && $data1/feats.scp -ot $sdata1 ]] || split_data.sh $data1 $nj || exit 1;
[[ -d $sdata2 && $data2/feats.scp -ot $sdata2 ]] || split_data.sh $data2 $nj || exit 1;

# set up feature stream 1;  here we assume spectral features, which we extend
# with deltas (no splicing, in contrast to train_lda_mllt.sh)
feats1="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata1/JOB/utt2spk scp:$sdata1/JOB/cmvn.scp scp:$sdata1/JOB/feats.scp ark:- | add-deltas ark:- ark:- |"

# set up feature stream 2;  this are usually bottleneck or posterior features, 
# which may be normalized if desired
feats2="scp:$sdata2/JOB/feats.scp"

if [ "$normft2" == "true" ]; then
  feats2="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata2/JOB/utt2spk scp:$sdata2/JOB/cmvn.scp $feats2 ark:- |"
fi

# assemble tandem features;  note: $feats gets overwritten later in the script
# once we have MLLT matrices
tandemfeats="ark,s,cs:paste-feats '$feats1' '$feats2' ark:- |"
feats="$tandemfeats"

# keep track of splicing/normalization options
echo $feats > $dir/tandem
echo $normft2 > $dir/normft2


# Begin training;  initially, we have no MLLT matrix
cur_mllt_iter=0

if [ $stage -le -4 -a $extra_lda == true ]; then
  echo "Accumulating LDA statistics (for tandem features this time)."
  $cmd JOB=1:$nj $dir/log/lda_acc.JOB.log \
    ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
    weight-silence-post 0.0 $silphonelist $alidir/final.mdl ark:- ark:- \| \
    acc-lda --rand-prune=$randprune $alidir/final.mdl "$tandemfeats" ark,s,cs:- \
    $dir/lda.JOB.acc || exit 1;
  est-lda --write-full-matrix=$dir/full.mat --dim=$dim2 $dir/0.mat $dir/lda.*.acc \
    2>$dir/log/lda_est.log || exit 1;
  rm $dir/lda.*.acc
  
  feats="$tandemfeats transform-feats $dir/0.mat ark:- ark:- |"
fi

if [ $stage -le -3 ]; then
  echo "Accumulating tree stats"
  $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \
   acc-tree-stats  --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \
     "ark:gunzip -c $alidir/ali.JOB.gz|" $dir/JOB.treeacc || exit 1;
  [ `ls $dir/*.treeacc | wc -w` -ne "$nj" ] && echo "Wrong #tree-accs" && exit 1;
  $cmd $dir/log/sum_tree_acc.log \
    sum-tree-stats $dir/treeacc $dir/*.treeacc || exit 1;
  rm $dir/*.treeacc
fi


if [ $stage -le -2 ]; then
  echo "Getting questions for tree clustering."
  # preparing questions, roots file...
  cluster-phones $dir/treeacc $lang/phones/sets.int $dir/questions.int 2> $dir/log/questions.log || exit 1;
  cat $lang/phones/extra_questions.int >> $dir/questions.int
  compile-questions $lang/topo $dir/questions.int $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1;

  echo "Building the tree"
  $cmd $dir/log/build_tree.log \
    build-tree --verbose=1 --max-leaves=$numleaves \
    --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \
    $dir/questions.qst $lang/topo $dir/tree || exit 1;

  gmm-init-model  --write-occs=$dir/1.occs  \
    $dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/log/init_model.log || exit 1;
  grep 'no stats' $dir/log/init_model.log && echo "This is a bad warning.";

  # could mix up if we wanted:
  # gmm-mixup --mix-up=$numgauss $dir/1.mdl $dir/1.occs $dir/1.mdl 2>$dir/log/mixup.log || exit 1;
  rm $dir/treeacc
fi


if [ $stage -le -1 ]; then
  # Convert the alignments.
  echo "Converting alignments from $alidir to use current tree"
  $cmd JOB=1:$nj $dir/log/convert.JOB.log \
    convert-ali $alidir/final.mdl $dir/1.mdl $dir/tree \
     "ark:gunzip -c $alidir/ali.JOB.gz|" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi

if [ $stage -le 0 ]; then
  echo "Compiling graphs of transcripts"
  $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
    compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/1.mdl  $lang/L.fst  \
     "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $data1/split$nj/JOB/text |" \
      "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
fi


x=1
while [ $x -lt $num_iters ]; do
  echo Training pass $x
  if echo $realign_iters | grep -w $x >/dev/null && [ $stage -le $x ]; then
    echo Aligning data
    mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/$x.mdl - |"
    $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \
      gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$mdl" \
      "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \
      "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
  fi
  if echo $mllt_iters | grep -w $x >/dev/null; then
    if [ $stage -le $x ]; then
      echo "Estimating MLLT"
      $cmd JOB=1:$nj $dir/log/macc.$x.JOB.log \
        ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \
        weight-silence-post 0.0 $silphonelist $dir/$x.mdl ark:- ark:- \| \
        gmm-acc-mllt --rand-prune=$randprune  $dir/$x.mdl "$feats" ark:- $dir/$x.JOB.macc \
        || exit 1;
      est-mllt $dir/$x.mat.new $dir/$x.*.macc 2> $dir/log/mupdate.$x.log || exit 1;
      gmm-transform-means  $dir/$x.mat.new $dir/$x.mdl $dir/$x.mdl \
        2> $dir/log/transform_means.$x.log || exit 1;
      
      # see if this is the first MLLT iteration and there is no lda;  otherwise compose transforms
      if [ $cur_mllt_iter == 0 -a $extra_lda == false ]; then
        mv $dir/$x.mat.new $dir/$x.mat || exit 1;
      else
        compose-transforms --print-args=false $dir/$x.mat.new $dir/$cur_mllt_iter.mat $dir/$x.mat || exit 1;
      fi

      rm $dir/$x.*.macc
    fi

    # update features
    feats="$tandemfeats transform-feats $dir/$x.mat ark:- ark:- |"
    cur_mllt_iter=$x
  fi

  if [ $stage -le $x ]; then
    $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
      gmm-acc-stats-ali  $dir/$x.mdl "$feats" \
      "ark,s,cs:gunzip -c $dir/ali.JOB.gz|" $dir/$x.JOB.acc || exit 1;
    $cmd $dir/log/update.$x.log \
      gmm-est --write-occs=$dir/$[$x+1].occs --mix-up=$numgauss --power=$power \
        $dir/$x.mdl "gmm-sum-accs - $dir/$x.*.acc |" $dir/$[$x+1].mdl || exit 1;
    rm $dir/$x.mdl $dir/$x.*.acc $dir/$x.occs 
  fi
  [ $x -le $max_iter_inc ] && numgauss=$[$numgauss+$incgauss];
  x=$[$x+1];
done

rm $dir/final.{mdl,mat,occs} 2>/dev/null
ln -s $x.mdl $dir/final.mdl
ln -s $x.occs $dir/final.occs
ln -s $cur_mllt_iter.mat $dir/final.mat

# Summarize warning messages...

utils/summarize_warnings.pl $dir/log

echo Done training system with LDA+MLLT tandem features in $dir


================================================
FILE: egs/steps/tandem/train_mmi.sh
================================================
#!/usr/bin/env bash
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
#                 Korbinian Riedhammer

# MMI training (or optionally boosted MMI, if you give the --boost option).
# 4 iterations (by default) of Extended Baum-Welch update.
#
# For the numerator we have a fixed alignment rather than a lattice--
# this actually follows from the way lattices are defined in Kaldi, which
# is to have a single path for each word (output-symbol) sequence.

# Begin configuration section.
cmd=run.pl
num_iters=4
boost=0.0
cancel=true # if true, cancel num and den counts on each frame.
tau=400
weight_tau=10
acwt=0.1
stage=0
# End configuration section

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

# Exactly six positional arguments are required; otherwise print the usage
# text and abort.  The script path and the documented defaults below are kept
# in sync with this file's location (steps/tandem/train_mmi.sh) and with the
# configuration section at the top of the script (tau=400).
if [ $# -ne 6 ]; then
  echo "Usage: steps/tandem/train_mmi.sh <data1> <data2> <lang> <ali> <denlats> <exp>"
  echo " e.g.: steps/tandem/train_mmi.sh {mfcc,bottleneck}/data/train_si84 data/lang exp/tri2b_ali_si84 exp/tri2b_denlats_si84 exp/tri2b_mmi"
  echo "Main options (for others, see top of script file)"
  echo "  --boost <boost-weight>                           # (e.g. 0.1), for boosted MMI.  (default 0)"
  echo "  --cancel (true|false)                            # cancel stats (true by default)"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --config <config-file>                           # config containing options"
  echo "  --stage <stage>                                  # stage to do partial re-run from."
  echo "  --tau <tau>                                      # tau for i-smooth to last iter (default 400)"

  exit 1;
fi

# Positional arguments: two feature data directories (tandem streams), the
# lang directory, the alignment directory, the denominator-lattice directory
# and the output experiment directory.
data1=$1
data2=$2
lang=$3
alidir=$4
denlatdir=$5
dir=$6

mkdir -p $dir/log

# The phone table of the lang directory must be compatible with the one the
# alignments were produced with; keep a copy next to the new model.
utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

# Fail early if any required input is missing (only job 1 of the per-job
# alignment / lattice archives is checked).
for f in $data1/feats.scp $data2/feats.scp $alidir/{tree,final.mdl,ali.1.gz} $denlatdir/lat.1.gz; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
# The number of parallel jobs is inherited from the alignment directory and
# must match the number of jobs used to create the denominator lattices.
nj=`cat $alidir/num_jobs` || exit 1;
[ "$nj" -ne "`cat $denlatdir/num_jobs`" ] && \
  echo "$alidir and $denlatdir have different num-jobs" && exit 1;

# NOTE(review): this mkdir repeats the one above and is a no-op.
mkdir -p $dir/log
echo $nj > $dir/num_jobs
# Start from the model and tree of the alignment directory.
cp $alidir/{final.mdl,tree} $dir
silphonelist=`cat $lang/phones/silence.csl` || exit 1;


# Set up features
#
# Builds the per-job tandem feature pipeline string $feats.  "JOB" is left as
# a literal placeholder in the string; it appears in the $cmd JOB=1:$nj
# invocations further down.

sdata1=$data1/split$nj
sdata2=$data2/split$nj
# (Re-)split a data directory unless the split directory exists and feats.scp
# is older than it.
[[ -d $sdata1 && $data1/feats.scp -ot $sdata1 ]] || split_data.sh $data1 $nj || exit 1;
[[ -d $sdata2 && $data2/feats.scp -ot $sdata2 ]] || split_data.sh $data2 $nj || exit 1;

# Both settings are inherited from the alignment directory; a missing file
# silently yields an empty value.
splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options.
normft2=`cat $alidir/normft2 2>/dev/null`

# The presence of final.mat in the alignment directory selects the LDA setup.
if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi

case $feat_type in
  delta)
    echo "$0: feature type is $feat_type"
    ;;
  lda)
    echo "$0: feature type is $feat_type"
    # Both lda.mat and final.mat are needed by the pipeline built below.
    cp $alidir/{lda,final}.mat $dir/ || exit 1;
    ;;
  # Defensive default: feat_type is only ever set to "lda" or "delta" above.
  *) echo "$0: invalid feature type $feat_type" && exit 1;
esac

# Set up feature stream 1; these are usually spectral features, so we will add
# deltas or splice them.  CMVN (means only) is applied per speaker first.
feats1="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata1/JOB/utt2spk scp:$sdata1/JOB/cmvn.scp scp:$sdata1/JOB/feats.scp ark:- |"

if [ "$feat_type" == "delta" ]; then
  feats1="$feats1 add-deltas ark:- ark:- |"
elif [ "$feat_type" == "lda" ]; then
  feats1="$feats1 splice-feats $splice_opts ark:- ark:- | transform-feats $dir/lda.mat ark:- ark:- |"
fi

# Set up feature stream 2; these are usually bottleneck or posterior features,
# which may be normalized if desired.
feats2="scp:$sdata2/JOB/feats.scp"

# CMVN on the second stream only if the alignment directory recorded
# normft2=true.
if [ "$normft2" == "true" ]; then
  feats2="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata2/JOB/utt2spk scp:$sdata2/JOB/cmvn.scp $feats2 ark:- |"
fi

# Assemble tandem features: both streams are pasted frame by frame.
feats="ark,s,cs:paste-feats '$feats1' '$feats2' ark:- |"

# Add the transformation on the pasted features, if applicable.
if [ "$feat_type" == "lda" ]; then
  feats="$feats transform-feats $dir/final.mat ark:- ark:- |"
fi

# Splicing/normalization options; any of these files may be absent, hence the
# suppressed error output and the unchecked exit status.
cp $alidir/{splice_opts,normft2,tandem} $dir 2>/dev/null

# Apply the per-speaker transforms stored in the alignment directory, if any.
if [ -f $alidir/trans.1 ]; then
  echo "$0: using transforms from $alidir"
  feats="$feats transform-feats --utt2spk=ark:$sdata1/JOB/utt2spk ark,s,cs:$alidir/trans.JOB ark:- ark:- |"
fi
##

# Denominator lattices, one archive per job.  For boosted MMI (non-zero
# --boost) the lattices are additionally piped through lattice-boost-ali
# together with the numerator alignments.
lats="ark:gunzip -c $denlatdir/lat.JOB.gz|"
if [[ "$boost" != "0.0" && "$boost" != 0 ]]; then
  lats="$lats lattice-boost-ali --b=$boost --silence-phones=$silphonelist $alidir/final.mdl ark:- 'ark,s,cs:gunzip -c $alidir/ali.JOB.gz|' ark:- |"
fi


# Main training loop: iteration x reads $cur_mdl and writes $dir/(x+1).mdl.
# Iteration 0 starts from the model in the alignment directory.
cur_mdl=$alidir/final.mdl
x=0
while [ $x -lt $num_iters ]; do
  echo "Iteration $x of MMI training"
  # Note: the num and den states are accumulated at the same time, so we
  # can cancel them per frame.
  if [ $stage -le $x ]; then
    # Per job: rescore the lattices with the current model, convert them to
    # posteriors, combine them (scaled by -1) with the posteriors of the
    # numerator alignment, and accumulate separate num/den statistics.
    $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
      gmm-rescore-lattice $cur_mdl "$lats" "$feats" ark:- \| \
      lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
      sum-post --merge=$cancel --scale1=-1 \
      ark:- "ark,s,cs:gunzip -c $alidir/ali.JOB.gz | ali-to-post ark:- ark:- |" ark:- \| \
      gmm-acc-stats2 $cur_mdl "$feats" ark,s,cs:- \
      $dir/num_acc.$x.JOB.acc $dir/den_acc.$x.JOB.acc || exit 1;

    # Sanity check: one num and one den accumulator per job.
    n=`echo $dir/{num,den}_acc.$x.*.acc | wc -w`;
    [ "$n" -ne $[$nj*2] ] && \
      echo "Wrong number of MMI accumulators $n versus 2*$nj" && exit 1;
    # Sum the per-job accumulators and delete the per-job files.
    $cmd $dir/log/den_acc_sum.$x.log \
      gmm-sum-accs $dir/den_acc.$x.acc $dir/den_acc.$x.*.acc || exit 1;
    rm $dir/den_acc.$x.*.acc
    $cmd $dir/log/num_acc_sum.$x.log \
      gmm-sum-accs $dir/num_acc.$x.acc $dir/num_acc.$x.*.acc || exit 1;
    rm $dir/num_acc.$x.*.acc

    # note: this tau value is for smoothing towards model parameters, not
    # as in the Boosted MMI paper, not towards the ML stats as in the earlier
    # work on discriminative training (e.g. my thesis).
    # You could use gmm-ismooth-stats to smooth to the ML stats, if you had
    # them available [here they're not available if cancel=true].

    # Update the Gaussians first, then the mixture weights, and write the
    # model for the next iteration.
    $cmd $dir/log/update.$x.log \
      gmm-est-gaussians-ebw --tau=$tau $cur_mdl $dir/num_acc.$x.acc $dir/den_acc.$x.acc - \| \
      gmm-est-weights-ebw --weight-tau=$weight_tau - $dir/num_acc.$x.acc $dir/den_acc.$x.acc $dir/$[$x+1].mdl || exit 1;
    rm $dir/{den,num}_acc.$x.acc
  fi
  cur_mdl=$dir/$[$x+1].mdl

  # Some diagnostics: the objective function progress and auxiliary-function
  # improvement.
  #
  # The perl one-liner scans the tail of the accumulation logs for the summary
  # lines of gmm-acc-stats2 and lattice-to-post, forms frame-weighted totals,
  # warns on stderr if the two frame counts differ by more than 1% of their
  # sum, and prints "<per-frame objf> <frame count>" into $dir/tmpf.
  # These diagnostics run even when the iteration itself was skipped via
  # --stage, in which case they read whatever logs are already in $dir/log.

  tail -n 50 $dir/log/acc.$x.*.log | perl -e '$acwt=shift @ARGV; while(<STDIN>) { if(m/gmm-acc-stats2.+Overall weighted acoustic likelihood per frame was (\S+) over (\S+) frames/) { $tot_aclike += $1*$2; $tot_frames1 += $2; } if(m|lattice-to-post.+Overall average log-like/frame is (\S+) over (\S+) frames.  Average acoustic like/frame is (\S+)|) { $tot_den_lat_like += $1*$2; $tot_frames2 += $2; $tot_den_aclike += $3*$2; } } if (abs($tot_frames1 - $tot_frames2) > 0.01*($tot_frames1 + $tot_frames2)) { print STDERR "Frame-counts disagree $tot_frames1 versus $tot_frames2\n"; } $tot_den_lat_like /= $tot_frames2; $tot_den_aclike /= $tot_frames2; $tot_aclike *= ($acwt / $tot_frames1);  $num_like = $tot_aclike + $tot_den_aclike; $per_frame_objf = $num_like - $tot_den_lat_like; print "$per_frame_objf $tot_frames1\n"; ' $acwt > $dir/tmpf
  objf=`cat $dir/tmpf | awk '{print $1}'`;
  nf=`cat $dir/tmpf | awk '{print $2}'`;
  rm $dir/tmpf
  # Sum (field 10 * field 12) over the "Overall" lines of the update log.
  # NOTE(review): presumably these fields are the per-frame auxf change and
  # the frame count -- confirm against the log format of the EBW tools.
  impr=`grep -w Overall $dir/log/update.$x.log | awk '{x += $10*$12;} END{print x;}'`
  impr=`perl -e "print ($impr*$acwt/$nf);"` # We multiply by acwt, and divide by $nf which is the "real" number of frames.
  echo "Iteration $x: objf was $objf, MMI auxf change was $impr" | tee $dir/objf.$x.log
  x=$[$x+1]
done

echo "MMI training finished"

# After the loop x equals $num_iters, so final.mdl points at the model
# written by the last iteration.
rm $dir/final.mdl 2>/dev/null
ln -s $x.mdl $dir/final.mdl

exit 0;


================================================
FILE: egs/steps/tandem/train_mmi_sgmm2.sh
================================================
#!/usr/bin/env bash
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
#                 Korbinian Riedhammer

# MMI training (or optionally boosted MMI, if you give the --boost option),
# for SGMMs.  4 iterations (by default) of Extended Baum-Welch update.
#
# Begin configuration section.
cmd=run.pl
num_iters=4
boost=0.0
cancel=true # if true, cancel num and den counts on each frame.
acwt=0.1
stage=0
update_opts=
transform_dir=
# End configuration section

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

# Exactly six positional arguments are required; otherwise print the usage
# text and abort.  The example paths follow the same layout as the example in
# the sibling script train_mmi.sh (data/..., not data1/...).
if [ $# -ne 6 ]; then
  echo "Usage: steps/tandem/train_mmi_sgmm2.sh <data1> <data2> <lang> <ali> <denlats> <exp>"
  echo " e.g.: steps/tandem/train_mmi_sgmm2.sh {mfcc,bottleneck}/data/train_si84 data/lang exp/tri2b_ali_si84 exp/tri2b_denlats_si84 exp/tri2b_mmi"
  echo "Main options (for others, see top of script file)"
  echo "  --boost <boost-weight>                           # (e.g. 0.1), for boosted MMI.  (default 0)"
  echo "  --cancel (true|false)                            # cancel stats (true by default)"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --config <config-file>                           # config containing options"
  echo "  --stage <stage>                                  # stage to do partial re-run from."
  echo "  --transform-dir <transform-dir>                  # directory to find fMLLR transforms."
  exit 1;
fi

# Positional arguments: two feature data directories (tandem streams), the
# lang directory, the alignment directory, the denominator-lattice directory
# and the output experiment directory.
data1=$1
data2=$2
lang=$3
alidir=$4
denlatdir=$5
dir=$6
mkdir -p $dir/log

# The phone table of the lang directory must be compatible with the one the
# alignments were produced with; keep a copy next to the new model.
utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

# Fail early if any required input is missing.  $data2/feats.scp is checked
# as well because the second feature stream is read from it below (this
# matches the check in train_mmi.sh).
for f in $data1/feats.scp $data2/feats.scp $alidir/{tree,final.mdl,ali.1.gz} $denlatdir/lat.1.gz; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
# The number of parallel jobs is inherited from the alignment directory and
# must match the number of jobs used to create the denominator lattices.
nj=`cat $alidir/num_jobs` || exit 1;
[ "$nj" -ne "`cat $denlatdir/num_jobs`" ] && \
  echo "$alidir and $denlatdir have different num-jobs" && exit 1;

echo $nj > $dir/num_jobs

# Start from the model and tree of the alignment directory.
cp $alidir/{final.mdl,tree} $dir
silphonelist=`cat $lang/phones/silence.csl` || exit 1;


# Set up features
#
# Builds the per-job tandem feature pipeline string $feats plus the optional
# speaker-vector and Gaussian-selection options.  "JOB" is left as a literal
# placeholder in the strings; it appears in the $cmd JOB=1:$nj invocations
# further down.

sdata1=$data1/split$nj
sdata2=$data2/split$nj
# (Re-)split a data directory unless the split directory exists and feats.scp
# is older than it.
[[ -d $sdata1 && $data1/feats.scp -ot $sdata1 ]] || split_data.sh $data1 $nj || exit 1;
[[ -d $sdata2 && $data2/feats.scp -ot $sdata2 ]] || split_data.sh $data2 $nj || exit 1;

# Both settings are inherited from the alignment directory; a missing file
# silently yields an empty value.
splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options.
normft2=`cat $alidir/normft2 2>/dev/null`

# The presence of final.mat in the alignment directory selects the LDA setup.
if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi

case $feat_type in
  delta)
    echo "$0: feature type is $feat_type"
    ;;
  lda)
    echo "$0: feature type is $feat_type"
    # Both lda.mat and final.mat are needed by the pipeline built below.
    cp $alidir/{lda,final}.mat $dir/ || exit 1;
    ;;
  # Defensive default: feat_type is only ever set to "lda" or "delta" above.
  *) echo "$0: invalid feature type $feat_type" && exit 1;
esac

# Set up feature stream 1; these are usually spectral features, so we will add
# deltas or splice them.  CMVN (means only) is applied per speaker first.
feats1="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata1/JOB/utt2spk scp:$sdata1/JOB/cmvn.scp scp:$sdata1/JOB/feats.scp ark:- |"

if [ "$feat_type" == "delta" ]; then
  feats1="$feats1 add-deltas ark:- ark:- |"
elif [ "$feat_type" == "lda" ]; then
  feats1="$feats1 splice-feats $splice_opts ark:- ark:- | transform-feats $dir/lda.mat ark:- ark:- |"
fi

# Set up feature stream 2; these are usually bottleneck or posterior features,
# which may be normalized if desired.
feats2="scp:$sdata2/JOB/feats.scp"

# CMVN on the second stream only if the alignment directory recorded
# normft2=true.
if [ "$normft2" == "true" ]; then
  feats2="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata2/JOB/utt2spk scp:$sdata2/JOB/cmvn.scp $feats2 ark:- |"
fi

# Assemble tandem features: both streams are pasted frame by frame.
feats="ark,s,cs:paste-feats '$feats1' '$feats2' ark:- |"

# Add the transformation on the pasted features, if applicable.
if [ "$feat_type" == "lda" ]; then
  feats="$feats transform-feats $dir/final.mat ark:- ark:- |"
fi

# Splicing/normalization options; any of these files may be absent, hence the
# suppressed error output and the unchecked exit status.
cp $alidir/{splice_opts,normft2,tandem} $dir 2>/dev/null

# Unlike train_mmi.sh, transforms are only used when --transform-dir is given
# explicitly; they are not picked up from the alignment directory.
if [ ! -z "$transform_dir" ]; then
  echo "$0: using transforms from $transform_dir"
  [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" \
    && exit 1;
  feats="$feats transform-feats --utt2spk=ark:$sdata1/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |"
else
  echo "$0: no fMLLR transforms."
fi

# Speaker vectors are optional and taken from the alignment directory.
if [ -f $alidir/vecs.1 ]; then
  echo "$0: using speaker vectors from $alidir"
  spkvecs_opt="--spk-vecs=ark:$alidir/vecs.JOB --utt2spk=ark:$sdata1/JOB/utt2spk"
else
  echo "$0: no speaker vectors."
  spkvecs_opt=
fi

# Gaussian-selection info is mandatory; the script aborts without it.
if [ -f $alidir/gselect.1.gz ]; then
  echo "$0: using Gaussian-selection info from $alidir"
  gselect_opt="--gselect=ark:gunzip -c $alidir/gselect.JOB.gz|"
else
  echo "$0: error: no Gaussian-selection info found" && exit 1;
fi

# Denominator lattices, one archive per job.  For boosted MMI (non-zero
# --boost) the lattices are additionally piped through lattice-boost-ali
# together with the numerator alignments.
lats="ark:gunzip -c $denlatdir/lat.JOB.gz|"
if [[ "$boost" != "0.0" && "$boost" != 0 ]]; then
  lats="$lats lattice-boost-ali --b=$boost --silence-phones=$silphonelist $alidir/final.mdl ark:- 'ark,s,cs:gunzip -c $alidir/ali.JOB.gz|' ark:- |"
fi


# Main training loop: iteration x reads $cur_mdl and writes $dir/(x+1).mdl.
# Iteration 0 starts from the model in the alignment directory.
cur_mdl=$alidir/final.mdl
x=0
while [ $x -lt $num_iters ]; do
  echo "Iteration $x of MMI training"
  # Note: the num and den states are accumulated at the same time, so we
  # can cancel them per frame.
  if [ $stage -le $x ]; then
    # Per job: rescore the lattices with the current model, convert them to
    # posteriors, combine them (scaled by -1) with the posteriors of the
    # numerator alignment, and accumulate separate num/den statistics.
    # "$gselect_opt" is quoted because its value contains spaces.
    $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
      sgmm2-rescore-lattice "$gselect_opt" $spkvecs_opt $cur_mdl "$lats" "$feats" ark:- \| \
      lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
      sum-post --merge=$cancel --scale1=-1 \
      ark:- "ark,s,cs:gunzip -c $alidir/ali.JOB.gz | ali-to-post ark:- ark:- |" ark:- \| \
      sgmm2-acc-stats2 "$gselect_opt" $spkvecs_opt $cur_mdl "$feats" ark,s,cs:- \
        $dir/num_acc.$x.JOB.acc $dir/den_acc.$x.JOB.acc || exit 1;

    # Sanity check: one num and one den accumulator per job.
    n=`echo $dir/{num,den}_acc.$x.*.acc | wc -w`;
    [ "$n" -ne $[$nj*2] ] && \
      echo "Wrong number of MMI accumulators $n versus 2*$nj" && exit 1;
    # Sum the per-job accumulators and delete the per-job files.
    $cmd $dir/log/den_acc_sum.$x.log \
      sgmm2-sum-accs $dir/den_acc.$x.acc $dir/den_acc.$x.*.acc || exit 1;
    rm $dir/den_acc.$x.*.acc
    $cmd $dir/log/num_acc_sum.$x.log \
      sgmm2-sum-accs $dir/num_acc.$x.acc $dir/num_acc.$x.*.acc || exit 1;
    rm $dir/num_acc.$x.*.acc

    # Write the model for the next iteration.  The summed accumulators
    # num_acc.$x.acc / den_acc.$x.acc are not removed here and stay in $dir.
    $cmd $dir/log/update.$x.log \
     sgmm2-est-ebw $update_opts $cur_mdl $dir/num_acc.$x.acc $dir/den_acc.$x.acc $dir/$[$x+1].mdl || exit 1;
  fi
  cur_mdl=$dir/$[$x+1].mdl


  # Some diagnostics: the objective function progress and auxiliary-function
  # improvement.  Note: this code is same as in train_mmi.sh
  #
  # The perl one-liner scans the tail of the accumulation logs for the summary
  # lines of sgmm2-acc-stats2 and lattice-to-post, forms frame-weighted totals,
  # warns on stderr if the two frame counts differ by more than 1% of their
  # sum, and prints "<per-frame objf> <frame count>" into $dir/tmpf.
  # These diagnostics run even when the iteration itself was skipped via
  # --stage, in which case they read whatever logs are already in $dir/log.
  tail -n 50 $dir/log/acc.$x.*.log | perl -e '$acwt=shift @ARGV; while(<STDIN>) { if(m/sgmm2-acc-stats2.+Overall weighted acoustic likelihood per frame was (\S+) over (\S+) frames/) { $tot_aclike += $1*$2; $tot_frames1 += $2; } if(m|lattice-to-post.+Overall average log-like/frame is (\S+) over (\S+) frames.  Average acoustic like/frame is (\S+)|) { $tot_den_lat_like += $1*$2; $tot_frames2 += $2; $tot_den_aclike += $3*$2; } } if (abs($tot_frames1 - $tot_frames2) > 0.01*($tot_frames1 + $tot_frames2)) { print STDERR "Frame-counts disagree $tot_frames1 versus $tot_frames2\n"; } $tot_den_lat_like /= $tot_frames2; $tot_den_aclike /= $tot_frames2; $tot_aclike *= ($acwt / $tot_frames1);  $num_like = $tot_aclike + $tot_den_aclike; $per_frame_objf = $num_like - $tot_den_lat_like; print "$per_frame_objf $tot_frames1\n"; ' $acwt > $dir/tmpf
  objf=`cat $dir/tmpf | awk '{print $1}'`;
  nf=`cat $dir/tmpf | awk '{print $2}'`;
  rm $dir/tmpf
  # Sum (field 10 * field 12) over the "Overall" lines of the update log.
  # NOTE(review): presumably these fields are the per-frame auxf change and
  # the frame count -- confirm against the log format of sgmm2-est-ebw.
  impr=`grep -w Overall $dir/log/update.$x.log | awk '{x += $10*$12;} END{print x;}'`
  impr=`perl -e "print ($impr*$acwt/$nf);"` # We multiply by acwt, and divide by $nf which is the "real" number of frames.
  echo "Iteration $x: objf was $objf, MMI auxf change was $impr" | tee $dir/objf.$x.log
  x=$[$x+1]
done

echo "MMI training finished"

# After the loop x equals $num_iters, so final.mdl points at the model
# written by the last iteration.
rm $dir/final.mdl 2>/dev/null
ln -s $x.mdl $dir/final.mdl

exit 0;


================================================
FILE: egs/steps/tandem/train_mono.sh
================================================
#!/usr/bin/env bash
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
#                 Korbinian Riedhammer
# Apache 2.0


# To be run from ..
# Flat start and monophone training, with delta-delta features.
# This script applies cepstral mean normalization (per speaker).

# Begin configuration section.
nj=4
cmd=run.pl
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
num_iters=40    # Number of iterations of training
max_iter_inc=30 # Last iter to increase #Gauss on.
totgauss=1000 # Target #Gaussians.  
boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment
realign_iters="1 2 3 4 5 6 7 8 9 10 12 14 16 18 20 23 26 29 32 35 38";
config= # name of config file.
stage=-4
power=0.2 # exponent to determine number of gaussians from occurrence counts
normft2=true # typically, the tandem features will already be normalized due to pca
# End configuration section.

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# Exactly four positional arguments are required; otherwise print the usage
# text (verbatim, via a quoted here-document) and abort.
if [ $# -ne 4 ]; then
  cat <<'EOF'
Usage: steps/tandem/train_mono.sh [options] <data1-dir> <data2-dir> <lang-dir> <exp-dir>
 e.g.: steps/tandem/train_mono.sh {mfcc,bottleneck}/data/train.1k data/lang exp/mono
main options (for others, see top of script file)
  --config <config-file>                           # config containing options
  --nj <nj>                                        # number of parallel jobs
  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs.
  --normft2 (true|false)                           # apply CMVN to second features?
EOF
  exit 1
fi

# Positional arguments: the two feature data directories (tandem streams),
# the lang directory and the output experiment directory.
data1=$1; data2=$2
lang=$3;  dir=$4

# Integer id of the OOV word; abort if it cannot be read.
if ! oov_sym=$(cat $lang/oov.int); then
  exit 1
fi

# Create the output/log directories and record the number of jobs.
mkdir -p $dir/log
echo $nj >$dir/num_jobs

# Keep a copy of the phone table next to the model.
if ! cp $lang/phones.txt $dir; then
  exit 1
fi

# Set up features.
#
# Builds the per-job tandem feature pipeline string $feats.  "JOB" is left as
# a literal placeholder; it appears in the $cmd JOB=1:$nj invocations further
# down.

sdata1=$data1/split$nj;
sdata2=$data2/split$nj;
# (Re-)split a data directory unless the split directory exists and feats.scp
# is older than it.
[[ -d $sdata1 && $data1/feats.scp -ot $sdata1 ]] || split_data.sh $data1 $nj || exit 1;
[[ -d $sdata2 && $data2/feats.scp -ot $sdata2 ]] || split_data.sh $data2 $nj || exit 1;

# Use deltas on the first stream (most likely this will be MFCCs or alike),
# after per-speaker mean normalization.
feats1="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata1/JOB/utt2spk scp:$sdata1/JOB/cmvn.scp scp:$sdata1/JOB/feats.scp ark:- | add-deltas ark:- ark:- |"

# Second stream will most likely be bottleneck or posteriors, so normalize
# if desired
feats2="scp:$sdata2/JOB/feats.scp"
if [ "$normft2" == "true" ]; then
  feats2="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata2/JOB/utt2spk scp:$sdata2/JOB/cmvn.scp $feats2 ark:- |"
fi

# paste features
feats="ark,s,cs:paste-feats '$feats1' '$feats2' ark:- |"
# Instance of the pipeline for job 1.  $feats has to be expanded here, so it
# must not be single-quoted (that would yield the literal text '$feats').
example_feats=$(echo "$feats" | sed s/JOB/1/g)

# get dimension
# Replacing JOB by ".." turns every $sdataN/JOB/<file> into
# $sdataN/../<file>, i.e. the corresponding file one level above the per-job
# split directories, so this pipeline covers the whole data set.
allfeats=$(echo $feats | sed s:JOB:..:g)
feat_dim=$(feat-to-dim --print-args=false "$allfeats" - 2> $dir/log/feat_dim)

# save stats
echo $feats > $dir/tandem
echo $normft2 > $dir/normft2

echo "$0: Initializing monophone system."

# The phone sets file is mandatory; say why we stop instead of exiting
# silently.
[ ! -f $lang/phones/sets.int ] && echo "$0: no such file $lang/phones/sets.int" && exit 1;
shared_phones_opt="--shared-phones=$lang/phones/sets.int"

if [ $stage -le -3 ]; then
  # Note: this runs as a single job.  $allfeats has JOB replaced by "..", so
  # the training features come from the whole data set rather than from one
  # split.
  $cmd JOB=1 $dir/log/init.log \
    gmm-init-mono $shared_phones_opt "--train-feats=$allfeats" $lang/topo $feat_dim \
    $dir/0.mdl $dir/tree || exit 1;
fi

# Current number of Gaussians, read from the initial model, and the
# per-iteration increment needed to reach $totgauss by iteration
# $max_iter_inc (integer division).
numgauss=`gmm-info --print-args=false $dir/0.mdl | grep gaussians | awk '{print $NF}'`
incgauss=$[($totgauss-$numgauss)/$max_iter_inc] # per-iter increment for #Gauss

# Stage -2: compile one training graph per utterance from the transcripts.
# Words are mapped to integer ids with sym2int.pl (OOVs become $oov_sym);
# the graphs are written gzip-compressed, one archive per job.
if [ $stage -le -2 ]; then
  echo "$0: Compiling training graphs"
  $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
    compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/0.mdl  $lang/L.fst \
    "ark:sym2int.pl --map-oov $oov_sym -f 2- $lang/words.txt < $sdata1/JOB/text|" \
    "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
fi

# Stage -1: pass 0 uses an equal alignment (align-equal-compiled) instead of
# a model-based one and accumulates statistics for the initial model directly
# from it.
if [ $stage -le -1 ]; then
  echo "$0: Aligning data equally (pass 0)"
  $cmd JOB=1:$nj $dir/log/align.0.JOB.log \
    align-equal-compiled "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" ark,t:-  \| \
    gmm-acc-stats-ali --binary=true $dir/0.mdl "$feats" ark:- \
    $dir/0.JOB.acc || exit 1;
fi

# In the following steps, the --min-gaussian-occupancy=3 option is important, otherwise
# we fail to est "rare" phones and later on, they never align properly.

# Stage 0: first model update from the summed pass-0 statistics; produces
# 1.mdl and deletes the per-job accumulators.
if [ $stage -le 0 ]; then
  gmm-est --min-gaussian-occupancy=3  --mix-up=$numgauss --power=$power \
    $dir/0.mdl "gmm-sum-accs - $dir/0.*.acc|" $dir/1.mdl 2> $dir/log/update.0.log || exit 1;
  rm $dir/0.*.acc
fi


# Main training loop: iteration x reads $dir/$x.mdl and writes
# $dir/(x+1).mdl.  Iterations below --stage are skipped, but the
# Gaussian-count schedule and the beam are still advanced for them.
beam=6 # will change to 10 below after 1st pass
# note: using slightly wider beams for WSJ vs. RM.
x=1
while [ $x -lt $num_iters ]; do
  echo "$0: Pass $x"
  if [ $stage -le $x ]; then
    # Re-align only on the iterations listed in $realign_iters (whole-word
    # match); otherwise the alignments of the last re-alignment are reused.
    if echo $realign_iters | grep -w $x >/dev/null; then
      echo "$0: Aligning data"
      # The model is piped through gmm-boost-silence for alignment only; the
      # model file on disk is not modified.
      mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/$x.mdl - |"
      $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \
        gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$[$beam*4] "$mdl" \
        "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" "ark,t:|gzip -c >$dir/ali.JOB.gz" \
        || exit 1;
    fi
    # Accumulate statistics from the current alignments ...
    $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
      gmm-acc-stats-ali  $dir/$x.mdl "$feats" "ark:gunzip -c $dir/ali.JOB.gz|" \
      $dir/$x.JOB.acc || exit 1;

    # ... and re-estimate the model, mixing up to $numgauss Gaussians.
    $cmd $dir/log/update.$x.log \
      gmm-est --write-occs=$dir/$[$x+1].occs --mix-up=$numgauss --power=$power $dir/$x.mdl \
      "gmm-sum-accs - $dir/$x.*.acc|" $dir/$[$x+1].mdl || exit 1;
    # Remove the inputs of this iteration; errors are ignored because some of
    # these files may not exist (no 1.occs is written by the stage-0 update).
    rm $dir/$x.mdl $dir/$x.*.acc $dir/$x.occs 2>/dev/null
  fi
  # Grow the target number of Gaussians up to and including iteration
  # $max_iter_inc.
  if [ $x -le $max_iter_inc ]; then
     numgauss=$[$numgauss+$incgauss];
  fi
  beam=10
  x=$[$x+1]
done

# After the loop x equals $num_iters, so the final.* links point at the
# model and occupancies written by the last iteration.
( cd $dir; rm final.{mdl,occs} 2>/dev/null; ln -s $x.mdl final.mdl; ln -s $x.occs final.occs )

utils/summarize_warnings.pl $dir/log

echo "Done training tandem mono-phone system in $dir"

# example of showing the alignments:
# show-alignments data/lang/phones.txt $dir/30.mdl "ark:gunzip -c $dir/ali.0.gz|" | head -4


================================================
FILE: egs/steps/tandem/train_sat.sh
================================================
#!/usr/bin/env bash
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
#                 Korbinian Riedhammer

# This does Speaker Adapted Training (SAT), i.e. train on
# fMLLR-adapted features.  It can be done on top of either LDA+MLLT, or
# delta and delta-delta features.  If there are no transforms supplied
# in the alignment directory, it will estimate transforms itself before
# building the tree (and in any case, it estimates transforms a number
# of times during training).


# Begin configuration section.
stage=-5
fmllr_update_type=full
cmd=run.pl
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
beam=10
retry_beam=40
boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment
realign_iters="10 20 30";
fmllr_iters="2 4 6 12";
silence_weight=0.0 # Weight on silence in fMLLR estimation.
num_iters=35   # Number of iterations of training
max_iter_inc=25 # Last iter to increase #Gauss on.
power=0.2 # Exponent for number of gaussians according to occurrence counts
cluster_thresh=-1  # for build-tree control final bottom-up clustering of leaves
normft2=true
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh
. parse_options.sh || exit 1;

if [ $# != 7 ]; then
  echo "Usage: steps/tandem/train_sat.sh <#leaves> <#gauss> <data1> <data2> <lang> <ali-dir> <exp-dir>"
  echo " e.g.: steps/tandem/train_sat.sh 2500 15000 {mfcc,bottleneck}/data/train_si84 data/lang exp/tri2b_ali_si84 exp/tri3b"
  echo "Main options (for others, see top of script file)"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --config <config-file>                           # config containing options"
  echo "  --stage <stage>                                  # stage to do partial re-run from."
  exit 1;
fi

numleaves=$1
totgauss=$2
data1=$3
data2=$4
lang=$5
alidir=$6
dir=$7

# Fail early if any required input is missing.  The message is prefixed with
# $0, like the other messages of this script, instead of a hard-coded script
# name that does not match this file.
for f in $data1/feats.scp $data2/feats.scp $lang/phones.txt $alidir/final.mdl $alidir/ali.1.gz; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

numgauss=$numleaves
incgauss=$[($totgauss-$numgauss)/$max_iter_inc]  # per-iter #gauss increment
oov=`cat $lang/oov.int`
nj=`cat $alidir/num_jobs` || exit 1;
silphonelist=`cat $lang/phones/silence.csl`
ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1;


mkdir -p $dir/log

utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

echo $nj >$dir/num_jobs

sdata1=$data1/split$nj;
sdata2=$data2/split$nj;
[[ -d $sdata1 && $data1/feats.scp -ot $sdata1 ]] || split_data.sh $data1 $nj || exit 1;
[[ -d $sdata2 && $data2/feats.scp -ot $sdata2 ]] || split_data.sh $data2 $nj || exit 1;

# Set up features.

# We will use the same settings as with the alidir
splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options.
normft2=`cat $alidir/normft2 2>/dev/null`

if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi

case $feat_type in
  delta)
    echo "$0: feature type is $feat_type"
    ;;
  lda)
    echo "$0: feature type is $feat_type"
    cp $alidir/{lda,final}.mat $dir/ || exit 1;
    ;;
  *) echo "$0: invalid feature type $feat_type" && exit 1;
esac

# set up feature stream 1;  this are usually spectral features, so we will add
# deltas or splice them
feats1="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata1/JOB/utt2spk scp:$sdata1/JOB/cmvn.scp scp:$sdata1/JOB/feats.scp ark:- |"

if [ "$feat_type" == "delta" ]; then
  feats1="$feats1 add-deltas ark:- ark:- |"
elif [ "$feat_type" == "lda" ]; then
  feats1="$feats1 splice-feats $splice_opts ark:- ark:- | transform-feats $dir/lda.mat ark:- ark:- |"
fi

# set up feature stream 2;  this are usually bottleneck or posterior features,
# which may be normalized if desired
feats2="scp:$sdata2/JOB/feats.scp"

if [ "$normft2" == "true" ]; then
  feats2="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata2/JOB/utt2spk scp:$sdata2/JOB/cmvn.scp $feats2 ark:- |"
fi

# assemble tandem features
sifeats="ark,s,cs:paste-feats '$feats1' '$feats2' ark:- |"

# add transformation, if applicable
if [ "$feat_type" == "lda" ]; then
  sifeats="$sifeats transform-feats $dir/final.mat ark:- ark:- |"
fi

# splicing/normalization options
cp $alidir/{splice_opts,tandem,normft2} $dir 2>/dev/null


## Get initial fMLLR transforms (possibly from alignment dir)
if [ -f $alidir/trans.1 ]; then
  echo "$0: Using transforms from $alidir"
  feats="$sifeats transform-feats --utt2spk=ark:$sdata1/JOB/utt2spk ark,s,cs:$alidir/trans.JOB ark:- ark:- |"
  cur_trans_dir=$alidir
else
  if [ $stage -le -4 ]; then
    echo "$0: obtaining initial fMLLR transforms since not present in $alidir"
    $cmd JOB=1:$nj $dir/log/fmllr.0.JOB.log \
      ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
      weight-silence-post $silence_weight $silphonelist $alidir/final.mdl ark:- ark:- \| \
      gmm-est-fmllr --fmllr-update-type=$fmllr_update_type \
      --spk2utt=ark:$sdata1/JOB/spk2utt $alidir/final.mdl "$sifeats" \
      ark:- ark:$dir/trans.JOB || exit 1;
  fi
  feats="$sifeats transform-feats --utt2spk=ark:$sdata1/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |"
  cur_trans_dir=$dir
fi

if [ $stage -le -3 ]; then
  # Get tree stats.
  echo "$0: Accumulating tree stats"
  $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \
    acc-tree-stats  --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \
    "ark:gunzip -c $alidir/ali.JOB.gz|" $dir/JOB.treeacc || exit 1;
  [ "`ls $dir/*.treeacc | wc -w`" -ne "$nj" ] && echo "$0: Wrong #tree-accs" && exit 1;
  $cmd $dir/log/sum_tree_acc.log \
    sum-tree-stats $dir/treeacc $dir/*.treeacc || exit 1;
  rm $dir/*.treeacc
fi

if [ $stage -le -2 ]; then
  echo "$0: Getting questions for tree clustering."
  # preparing questions, roots file...
  cluster-phones $dir/treeacc $lang/phones/sets.int $dir/questions.int 2> $dir/log/questions.log || exit 1;
  cat $lang/phones/extra_questions.int >> $dir/questions.int
  compile-questions $lang/topo $dir/questions.int $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1;

  echo "$0: Building the tree"
  $cmd $dir/log/build_tree.log \
    build-tree --verbose=1 --max-leaves=$numleaves \
    --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \
    $dir/questions.qst $lang/topo $dir/tree || exit 1;

  gmm-init-model  --write-occs=$dir/1.occs  \
    $dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/log/init_model.log || exit 1;
  grep 'no stats' $dir/log/init_model.log && echo "$0: This is a bad warning.";

  rm $dir/treeacc
fi


if [ $stage -le -1 ]; then
  # Convert the alignments.
  echo "$0: Converting alignments from $alidir to use current tree"
  $cmd JOB=1:$nj $dir/log/convert.JOB.log \
    convert-ali $alidir/final.mdl $dir/1.mdl $dir/tree \
     "ark:gunzip -c $alidir/ali.JOB.gz|" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi

if [ $stage -le 0 ]; then
  echo "$0: Compiling graphs of transcripts"
  $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
    compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/1.mdl  $lang/L.fst  \
     "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $sdata1/JOB/text |" \
      "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
fi

x=1
while [ $x -lt $num_iters ]; do
   echo Pass $x
  if echo $realign_iters | grep -w $x >/dev/null && [ $stage -le $x ]; then
    echo Aligning data
    mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/$x.mdl - |"
    $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \
      gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$mdl" \
      "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \
      "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
  fi

  if echo $fmllr_iters | grep -w $x >/dev/null; then
    if [ $stage -le $x ]; then
      echo Estimating fMLLR transforms
      # We estimate a transform that's additional to the previous transform;
      # we'll compose them.
      $cmd JOB=1:$nj $dir/log/fmllr.$x.JOB.log \
        ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:-  \| \
        weight-silence-post $silence_weight $silphonelist $dir/$x.mdl ark:- ark:- \| \
        gmm-est-fmllr --fmllr-update-type=$fmllr_update_type \
        --spk2utt=ark:$sdata1/JOB/spk2utt $dir/$x.mdl \
        "$feats" ark:- ark:$dir/tmp_trans.JOB || exit 1;
      for n in `seq $nj`; do
        ! ( compose-transforms --b-is-affine=true \
          ark:$dir/tmp_trans.$n ark:$cur_trans_dir/trans.$n ark:$dir/composed_trans.$n \
          && mv $dir/composed_trans.$n $dir/trans.$n && \
          rm $dir/tmp_trans.$n ) 2>$dir/log/compose_transforms.$x.log \
          && echo "$0: Error composing transforms" && exit 1;
      done
    fi
    feats="$sifeats transform-feats --utt2spk=ark:$sdata1/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |"
    cur_trans_dir=$dir
  fi

  if [ $stage -le $x ]; then
    $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
      gmm-acc-stats-ali $dir/$x.mdl "$feats" \
      "ark,s,cs:gunzip -c $dir/ali.JOB.gz|" $dir/$x.JOB.acc || exit 1;
    [ `ls $dir/$x.*.acc | wc -w` -ne "$nj" ] && echo "$0: Wrong #accs" && exit 1;
    $cmd $dir/log/update.$x.log \
      gmm-est --power=$power --write-occs=$dir/$[$x+1].occs --mix-up=$numgauss $dir/$x.mdl \
      "gmm-sum-accs - $dir/$x.*.acc |" $dir/$[$x+1].mdl || exit 1;
    rm $dir/$x.mdl $dir/$x.*.acc
    rm $dir/$x.occs
  fi
  [ $x -le $max_iter_inc ] && numgauss=$[$numgauss+$incgauss];
  x=$[$x+1];
done


if [ $stage -le $x ]; then
  # Accumulate stats for "alignment model"-- this model is
  # computed with the speaker-independent features, but matches Gaussian-for-Gaussian
  # with the final speaker-adapted model.
  $cmd JOB=1:$nj $dir/log/acc_alimdl.JOB.log \
    ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:-  \| \
    gmm-acc-stats-twofeats $dir/$x.mdl "$feats" "$sifeats" \
    ark,s,cs:- $dir/$x.JOB.acc || exit 1;
  [ `ls $dir/$x.*.acc | wc -w` -ne "$nj" ] && echo "$0: Wrong #accs" && exit 1;
  # Update model.
  $cmd $dir/log/est_alimdl.log \
    gmm-est --power=$power --remove-low-count-gaussians=false $dir/$x.mdl \
    "gmm-sum-accs - $dir/$x.*.acc|" $dir/$x.alimdl  || exit 1;
  rm $dir/$x.*.acc
fi

rm $dir/final.{mdl,alimdl,occs} 2>/dev/null
ln -s $x.mdl $dir/final.mdl
ln -s $x.occs $dir/final.occs
ln -s $x.alimdl $dir/final.alimdl


utils/summarize_warnings.pl $dir/log
(
  echo "$0: Likelihood evolution:"
  for x in `seq $[$num_iters-1]`; do
    tail -n 30 $dir/log/acc.$x.*.log | awk '/Overall avg like/{l += $(NF-3)*$(NF-1); t += $(NF-1); }
        /Overall average logdet/{d += $(NF-3)*$(NF-1); t2 += $(NF-1);}
        END{ d /= t2; l /= t; printf("%s ", d+l); } '
  done
  echo
) | tee $dir/log/summary.log

echo Done


================================================
FILE: egs/steps/tandem/train_sgmm2.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
#                 Korbinian Riedhammer

# SGMM training, with speaker vectors.  This script would normally be called on
# top of fMLLR features obtained from a conventional system, but it also works
# on top of any type of speaker-independent features (based on
# deltas+delta-deltas or LDA+MLLT).  For more info on SGMMs, see the paper "The
# subspace Gaussian mixture model--A structured model for speech recognition".
# (Computer Speech and Language, 2011).

# Begin configuration section.
nj=4
cmd=run.pl
stage=-6 # use this to resume partially finished training
context_opts= # e.g. set it to "--context-width=5 --central-position=2"  for a
# quinphone system.
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
num_iters=25   # Total number of iterations of training
num_iters_alimdl=3 # Number of iterations for estimating alignment model.
max_iter_inc=15 # Last iter to increase #substates on.
realign_iters="5 10 15"; # Iters to realign on.
spkvec_iters="5 8 12 17" # Iters to estimate speaker vectors on.
increase_iters="6 10 14"; # Iters on which to increase phn dim and/or spk dim;
    # rarely necessary, and if it is, only the 1st will normally be necessary.
rand_prune=0.1 # Randomized-pruning parameter for posteriors, to speed up training.
               # Bigger -> more pruning; zero = no pruning.
phn_dim=  # You can use this to set the phonetic subspace dim. [default: feat-dim+1]
spk_dim=  # You can use this to set the speaker subspace dim. [default: feat-dim]
power=0.2 # Exponent for number of gaussians according to occurrence counts
beam=8
self_weight=0.9
retry_beam=40
leaves_per_group=5 # Relates to the SCTM (state-clustered tied-mixture) aspect:
                   # average number of pdfs in a "group" of pdfs.
update_m_iter=4
spk_dep_weights=true # [Symmetric SGMM] set this to false if you don't want "u" (i.e. to turn off
                      # symmetric SGMM).
normft2=true
# End configuration section.

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;


if [ $# != 8 ]; then
  # Wrong number of positional arguments: print the usage message and quit.
  # The options listed below all correspond to variables defined in the
  # configuration section at the top of this script (the silence weight used
  # for speaker-vector estimation is hard-coded, so it is not an option).
  echo "Usage: steps/tandem/train_sgmm2.sh <num-leaves> <num-substates> <data1> <data2> <lang> <ali-dir> <ubm> <exp-dir>"
  echo " e.g.: steps/tandem/train_sgmm2.sh 5000 8000 {mfcc,bottleneck}/data/train_si84 data/lang \\"
  echo "                      exp/tri3b_ali_si84 exp/ubm4a/final.ubm exp/sgmm4a"
  echo "main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --num-iters <#iters>                             # Number of iterations of E-M"
  echo "  --leaves-per-group <#leaves>                     # Average #leaves shared in one group"
  exit 1;
fi

num_pdfs=$1  # final #leaves, at 2nd level of tree.
totsubstates=$2
data1=$3
data2=$4
lang=$5
alidir=$6
ubm=$7
dir=$8

num_groups=$[$num_pdfs/$leaves_per_group]
first_spkvec_iter=`echo $spkvec_iters | awk '{print $1}'` || exit 1;

# Check some files.
for f in $data1/feats.scp $data2/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $ubm; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done


# Set some variables.
oov=`cat $lang/oov.int`
silphonelist=`cat $lang/phones/silence.csl`
if [ "$self_weight" == "1.0" ]; then
  numsubstates=$num_groups # Initial #-substates.
else
  numsubstates=$num_pdfs # Initial #-substates.
fi
incsubstates=$[($totsubstates-$numsubstates)/$max_iter_inc] # per-iter increment for #substates
feat_dim=`gmm-info $alidir/final.mdl 2>/dev/null | awk '/feature dimension/{print $NF}'` || exit 1;
[ $feat_dim -eq $feat_dim ] || exit 1; # make sure it's numeric.
[ -z $phn_dim ] && phn_dim=$[$feat_dim+1]
[ -z $spk_dim ] && spk_dim=$feat_dim
nj=`cat $alidir/num_jobs` || exit 1;

mkdir -p $dir/log
echo $nj > $dir/num_jobs

utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

sdata1=$data1/split$nj;
sdata2=$data2/split$nj;
[[ -d $sdata1 && $data1/feats.scp -ot $sdata1 ]] || split_data.sh $data1 $nj || exit 1;
[[ -d $sdata2 && $data2/feats.scp -ot $sdata2 ]] || split_data.sh $data2 $nj || exit 1;

spkvecs_opt=  # Empty option for now, until we estimate the speaker vectors.
gselect_opt="--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|"

## Set up features.


# We will use the same settings as with the alidir
splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options.
normft2=`cat $alidir/normft2 2>/dev/null`

if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi

case $feat_type in
  delta)
    echo "$0: feature type is $feat_type"
    ;;
  lda)
    echo "$0: feature type is $feat_type"
    cp $alidir/{lda,final}.mat $dir/ || exit 1;
    ;;
  *) echo "$0: invalid feature type $feat_type" && exit 1;
esac

# set up feature stream 1;  these are usually spectral features, so we will add
# deltas or splice them
feats1="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata1/JOB/utt2spk scp:$sdata1/JOB/cmvn.scp scp:$sdata1/JOB/feats.scp ark:- |"

if [ "$feat_type" == "delta" ]; then
  feats1="$feats1 add-deltas ark:- ark:- |"
elif [ "$feat_type" == "lda" ]; then
  feats1="$feats1 splice-feats $splice_opts ark:- ark:- | transform-feats $dir/lda.mat ark:- ark:- |"
fi

# set up feature stream 2;  these are usually bottleneck or posterior features,
# which may be normalized if desired
feats2="scp:$sdata2/JOB/feats.scp"

if [ "$normft2" == "true" ]; then
  feats2="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata2/JOB/utt2spk scp:$sdata2/JOB/cmvn.scp $feats2 ark:- |"
fi

# assemble tandem features
feats="ark,s,cs:paste-feats '$feats1' '$feats2' ark:- |"

# add transformation, if applicable
if [ "$feat_type" == "lda" ]; then
  feats="$feats transform-feats $dir/final.mat ark:- ark:- |"
fi

# splicing/normalization options
cp $alidir/{splice_opts,tandem,normft2} $dir 2>/dev/null

if [ -f $alidir/trans.1 ]; then
  echo "$0: using transforms from $alidir"
  feats="$feats transform-feats --utt2spk=ark:$sdata1/JOB/utt2spk ark,s,cs:$alidir/trans.JOB ark:- ark:- |"
fi
##


# Stage -6: accumulate the statistics used for decision-tree building.
# One acc-tree-stats job runs per data split, reading the alignments from
# $alidir; the per-job accumulators are then summed into $dir/treeacc and
# the partial files are removed.
if [ $stage -le -6 ]; then
  echo "$0: accumulating tree stats"
  # List of context-independent phones, handed to acc-tree-stats through
  # --ci-phones.  It is read here because nothing earlier in this script
  # sets the variable.
  ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1;
  $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \
    acc-tree-stats  --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \
    "ark:gunzip -c $alidir/ali.JOB.gz|" $dir/JOB.treeacc || exit 1;
  # Every job must have produced exactly one accumulator file.
  [ "`ls $dir/*.treeacc | wc -w`" -ne "$nj" ] && echo "$0: Wrong #tree-stats" && exit 1;
  sum-tree-stats $dir/treeacc $dir/*.treeacc 2>$dir/log/sum_tree_acc.log || exit 1;
  rm $dir/*.treeacc
fi

if [ $stage -le -5 ]; then
  echo "$0: Getting questions for tree clustering."
  # preparing questions, roots file...
  cluster-phones $dir/treeacc $lang/phones/sets.int $dir/questions.int 2> $dir/log/questions.log || exit 1;
  cat $lang/phones/extra_questions.int >> $dir/questions.int
  compile-questions $lang/topo $dir/questions.int $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1;

  echo "$0: Building the tree"
  $cmd $dir/log/build_tree.log \
    build-tree-two-level --binary=false --verbose=1 --max-leaves-first=$num_groups \
     --max-leaves-second=$num_pdfs $dir/treeacc $lang/phones/roots.int \
     $dir/questions.qst $lang/topo $dir/tree $dir/pdf2group.map || exit 1;
fi

if [ $stage -le -4 ]; then
  echo "$0: Initializing the model"
  # Note: if phn_dim > feat_dim+1 or spk_dim > feat_dim, these dims
  # will be truncated on initialization.
  $cmd $dir/log/init_sgmm.log \
    sgmm2-init --spk-dep-weights=$spk_dep_weights --self-weight=$self_weight \
       --pdf-map=$dir/pdf2group.map --phn-space-dim=$phn_dim \
       --spk-space-dim=$spk_dim $lang/topo $dir/tree $ubm $dir/0.mdl || exit 1;
fi

if [ $stage -le -3 ]; then
  echo "$0: doing Gaussian selection"
  $cmd JOB=1:$nj $dir/log/gselect.JOB.log \
    sgmm2-gselect $dir/0.mdl "$feats" \
    "ark,t:|gzip -c >$dir/gselect.JOB.gz" || exit 1;
fi

if [ $stage -le -2 ]; then
  echo "$0: compiling training graphs"
  text="ark:sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $sdata1/JOB/text|"
  $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
    compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/0.mdl  $lang/L.fst  \
    "$text" "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
fi

if [ $stage -le -1 ]; then
  echo "$0: converting alignments"
  $cmd JOB=1:$nj $dir/log/convert_ali.JOB.log \
    convert-ali $alidir/final.mdl $dir/0.mdl $dir/tree "ark:gunzip -c $alidir/ali.JOB.gz|" \
    "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi


x=0
while [ $x -lt $num_iters ]; do
   echo "$0: training pass $x ... "
   if echo $realign_iters | grep -w $x >/dev/null && [ $stage -le $x ]; then
     echo "$0: re-aligning data"
     $cmd JOB=1:$nj $dir/log/align.$x.JOB.log  \
       sgmm2-align-compiled $spkvecs_opt $scale_opts "$gselect_opt" \
       --utt2spk=ark:$sdata1/JOB/utt2spk --beam=$beam --retry-beam=$retry_beam \
       $dir/$x.mdl "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \
       "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
   fi
   if [ $spk_dim -gt 0 ] && echo $spkvec_iters | grep -w $x >/dev/null; then
     if [ $stage -le $x ]; then
       $cmd JOB=1:$nj $dir/log/spkvecs.$x.JOB.log \
         ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \
         weight-silence-post 0.01 $silphonelist $dir/$x.mdl ark:- ark:- \| \
         sgmm2-est-spkvecs --rand-prune=$rand_prune --spk2utt=ark:$sdata1/JOB/spk2utt \
         $spkvecs_opt "$gselect_opt" $dir/$x.mdl "$feats" ark,s,cs:- \
         ark:$dir/tmp_vecs.JOB '&&' mv $dir/tmp_vecs.JOB $dir/vecs.JOB || exit 1;
     fi
     spkvecs_opt="--spk-vecs=ark:$dir/vecs.JOB"
   fi
   if [ $x -eq 0 ]; then
     flags=vwcSt # on the first iteration, don't update projections M or N
   elif [ $spk_dim -gt 0 -a $[$x%2] -eq 1 -a $x -ge $first_spkvec_iter ]; then
     # Update N if we have speaker-vector space and x is odd,
     # and we've already updated the speaker vectors...
     flags=vNwSct
   else
     if [ $x -ge $update_m_iter ]; then
       flags=vMwSct # udpate M.
     else
       flags=vwSct # no M on early iters, if --update-m-iter option given.
     fi
   fi
   $spk_dep_weights && [ $x -ge $first_spkvec_iter ] && flags=${flags}u; # update
   # spk-weight projections "u".

   if [ $stage -le $x ]; then
     $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
       sgmm2-acc-stats $spkvecs_opt --utt2spk=ark:$sdata1/JOB/utt2spk \
       --update-flags=$flags "$gselect_opt" --rand-prune=$rand_prune \
       $dir/$x.mdl "$feats" "ark,s,cs:gunzip -c $dir/ali.JOB.gz | ali-to-post ark:- ark:-|" \
       $dir/$x.JOB.acc || exit 1;
   fi

   # The next option is needed if the user specifies a phone or speaker sub-space
   # dimension that's higher than the "normal" one.
   # The iterations on which this happens are given by the increase_iters
   # variable of the configuration section.
   increase_dim_opts=
   if echo $increase_iters | grep -w $x >/dev/null; then
     increase_dim_opts="--increase-phn-dim=$phn_dim --increase-spk-dim=$spk_dim"
     # Note: the command below might have a null effect on some iterations.
     # Resize the already-estimated speaker vectors to the new speaker-space
     # dimension.  This is skipped while no vectors have been written yet.
     if [ $spk_dim -gt $feat_dim ] && [ -f $dir/vecs.1 ]; then
       # "JOB" is left literal on purpose: $cmd substitutes the job index.
       $cmd JOB=1:$nj $dir/log/copy_vecs.$x.JOB.log \
         copy-vector --print-args=false --change-dim=$spk_dim \
         ark:$dir/vecs.JOB ark:$dir/vecs_tmp.JOB '&&' \
         mv $dir/vecs_tmp.JOB $dir/vecs.JOB || exit 1;
     fi
   fi

   if [ $stage -le $x ]; then
     $cmd $dir/log/update.$x.log \
       sgmm2-est --update-flags=$flags --split-substates=$numsubstates \
       $increase_dim_opts --power=$power --write-occs=$dir/$[$x+1].occs \
       $dir/$x.mdl "sgmm2-sum-accs - $dir/$x.*.acc|" $dir/$[$x+1].mdl || exit 1;
     rm $dir/$x.mdl $dir/$x.*.acc $dir/$x.occs 2>/dev/null
   fi
   if [ $x -lt $max_iter_inc ]; then
     numsubstates=$[$numsubstates+$incsubstates]
   fi
   x=$[$x+1];
done

rm $dir/final.mdl $dir/final.occs 2>/dev/null
ln -s $x.mdl $dir/final.mdl
ln -s $x.occs $dir/final.occs

if [ $spk_dim -gt 0 ]; then
  # We need to create an "alignment model" that's been trained
  # without the speaker vectors, to do the first-pass decoding with.
  # in test time.

  # We do this for a few iters, in this recipe.
  final_mdl=$dir/$x.mdl
  cur_alimdl=$dir/$x.mdl
  while [ $x -lt $[$num_iters+$num_iters_alimdl] ]; do
    echo "$0: building alignment model (pass $x)"
    if [ $x -eq $num_iters ]; then # 1st pass of building alimdl.
      flags=MwcS # don't update v the first time.  Note-- we never update transitions.
      # they wouldn't change anyway as we use the same alignment as previously.
    else
      flags=vMwcS
    fi
    if [ $stage -le $x ]; then
      $cmd JOB=1:$nj $dir/log/acc_ali.$x.JOB.log \
        ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \
        sgmm2-post-to-gpost $spkvecs_opt "$gselect_opt" \
         --utt2spk=ark:$sdata1/JOB/utt2spk $final_mdl "$feats" ark,s,cs:- ark:- \| \
        sgmm2-acc-stats-gpost --rand-prune=$rand_prune --update-flags=$flags \
          $cur_alimdl "$feats" ark,s,cs:- $dir/$x.JOB.aliacc || exit 1;
      $cmd $dir/log/update_ali.$x.log \
        sgmm2-est --update-flags=$flags --remove-speaker-space=true --power=$power \
        $cur_alimdl "sgmm2-sum-accs - $dir/$x.*.aliacc|" $dir/$[$x+1].alimdl || exit 1;
      rm $dir/$x.*.aliacc || exit 1;
      [ $x -gt $num_iters ]  && rm $dir/$x.alimdl
    fi
    cur_alimdl=$dir/$[$x+1].alimdl
    x=$[$x+1]
  done
  rm $dir/final.alimdl 2>/dev/null
  ln -s $x.alimdl $dir/final.alimdl
fi

utils/summarize_warnings.pl $dir/log

echo Done


================================================
FILE: egs/steps/tandem/train_ubm.sh
================================================
#!/usr/bin/env bash
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.

# This trains a UBM (i.e. a mixture of Gaussians), by clustering
# the Gaussians from a trained HMM/GMM system and then doing a few
# iterations of UBM training.
# We mostly use this for SGMM systems.

# Begin configuration section.
nj=4
cmd=run.pl
silence_weight=  # You can set it to e.g. 0.0, to weight down silence in training.
stage=-2
num_gselect1=50 # first stage of Gaussian-selection
num_gselect2=25 # second stage.
intermediate_num_gauss=2000
num_iters=3
# End configuration section.

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;


if [ $# != 6 ]; then
  # Wrong number of positional arguments: print the usage message and quit.
  echo "Usage: steps/tandem/train_ubm.sh <num-gauss> <data1> <data2> <lang> <ali-dir> <exp>"
  echo " e.g.: steps/tandem/train_ubm.sh 400 {mfcc,bottleneck}/data/train_si84 data/lang exp/tri2b_ali_si84 exp/ubm3c"
  echo "main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --silence-weight <sil-weight>                    # weight for silence (e.g. 0.5 or 0.0)"
  echo "  --num-iters <#iters>                             # Number of iterations of E-M"
  exit 1;
fi

num_gauss=$1
data1=$2
data2=$3
lang=$4
alidir=$5
dir=$6

for f in $data1/feats.scp $data2/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl; do
  [ ! -f $f ] && echo "No such file $f" && exit 1;
done

if [ $[$num_gauss*2] -gt $intermediate_num_gauss ]; then
  echo "intermediate_num_gauss was too small $intermediate_num_gauss"
  intermediate_num_gauss=$[$num_gauss*2];
  echo "setting it to $intermediate_num_gauss"
fi


# Set various variables.
silphonelist=`cat $lang/phones/silence.csl` || exit 1;
nj=`cat $alidir/num_jobs` || exit 1;

mkdir -p $dir/log
echo $nj > $dir/num_jobs

utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

sdata1=$data1/split$nj;
sdata2=$data2/split$nj;

[[ -d $sdata1 && $data1/feats.scp -ot $sdata1 ]] || split_data.sh $data1 $nj || exit 1;
[[ -d $sdata2 && $data2/feats.scp -ot $sdata2 ]] || split_data.sh $data2 $nj || exit 1;

splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options.
normft2=`cat $alidir/normft2 2>/dev/null`

## Set up features.

if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi

case $feat_type in
  delta)
    echo "$0: feature type is $feat_type"
    ;;
  lda)
    echo "$0: feature type is $feat_type"
    cp $alidir/{lda,final}.mat $dir/ || exit 1;
    ;;
  *) echo "$0: invalid feature type $feat_type" && exit 1;
esac

# set up feature stream 1;  these are usually spectral features, so we will add
# deltas or splice them
feats1="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata1/JOB/utt2spk scp:$sdata1/JOB/cmvn.scp scp:$sdata1/JOB/feats.scp ark:- |"

if [ "$feat_type" == "delta" ]; then
  feats1="$feats1 add-deltas ark:- ark:- |"
elif [ "$feat_type" == "lda" ]; then
  feats1="$feats1 splice-feats $splice_opts ark:- ark:- | transform-feats $dir/lda.mat ark:- ark:- |"
fi

# set up feature stream 2;  these are usually bottleneck or posterior features,
# which may be normalized if desired
feats2="scp:$sdata2/JOB/feats.scp"

if [ "$normft2" == "true" ]; then
  feats2="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata2/JOB/utt2spk scp:$sdata2/JOB/cmvn.scp $feats2 ark:- |"
fi

# assemble tandem features
feats="ark,s,cs:paste-feats '$feats1' '$feats2' ark:- |"

# add transformation, if applicable
if [ "$feat_type" == "lda" ]; then
  feats="$feats transform-feats $dir/final.mat ark:- ark:- |"
fi

# splicing/normalization options
cp $alidir/{splice_opts,tandem,normft2} $dir 2>/dev/null

if [ -f $alidir/trans.1 ]; then
  echo "$0: using transforms from $alidir"
  feats="$feats transform-feats --utt2spk=ark:$sdata1/JOB/utt2spk ark,s,cs:$alidir/trans.JOB ark:- ark:- |"
fi
##

if [ ! -z "$silence_weight" ]; then
  weights_opt="--weights='ark,s,cs:gunzip -c $alidir/ali.JOB.gz | ali-to-post ark:- ark:- | weight-silence-post $silence_weight $silphonelist $alidir/final.mdl ark:- ark:- | post-to-weights ark:- ark:- |'"
else
  weights_opt=
fi

if [ $stage -le -2 ]; then
  echo "$0: clustering model $alidir/final.mdl to get initial UBM"
  $cmd $dir/log/cluster.log \
    init-ubm --intermediate-num-gauss=$intermediate_num_gauss --ubm-num-gauss=$num_gauss \
    --verbose=2 --fullcov-ubm=true $alidir/final.mdl $alidir/final.occs \
    $dir/0.ubm   || exit 1;
fi

# Do initial phase of Gaussian selection and save it to disk -- later on we'll
# do more Gaussian selection to further prune, as the model changes.


if [ $stage -le -1 ]; then
  echo "$0: doing Gaussian selection"
  $cmd JOB=1:$nj $dir/log/gselect.JOB.log \
    gmm-gselect --n=$num_gselect1 "fgmm-global-to-gmm $dir/0.ubm - |" "$feats" \
    "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1;
fi


# Main UBM training loop: $num_iters passes, each reading $dir/$x.ubm and
# writing $dir/$[$x+1].ubm.  Passes below $stage are skipped, so that a run
# restarted with --stage N picks up from $dir/N.ubm instead of looking for
# the already-deleted 0.ubm.
x=0
while [ $x -lt $num_iters ]; do
  echo "Pass $x"
  if [ $stage -le $x ]; then
    # Second-stage Gaussian selection (pruning the saved first-stage
    # selection down to $num_gselect2), piped into stats accumulation.
    $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
      gmm-gselect --n=$num_gselect2 "--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" \
      "fgmm-global-to-gmm $dir/$x.ubm - |" "$feats" ark:- \| \
      fgmm-global-acc-stats $weights_opt --gselect=ark,s,cs:- $dir/$x.ubm "$feats" \
      $dir/$x.JOB.acc || exit 1;
    lowcount_opt="--remove-low-count-gaussians=false"
    [ $[$x+1] -eq $num_iters ] && lowcount_opt=   # Only remove low-count Gaussians
    # on last iter-- we can't do it earlier, or the Gaussian-selection info would
    # be mismatched.
    $cmd $dir/log/update.$x.log \
      fgmm-global-est $lowcount_opt --verbose=2 $dir/$x.ubm "fgmm-global-sum-accs - $dir/$x.*.acc |" \
        $dir/$[$x+1].ubm || exit 1;
    # The inputs of this pass are no longer needed once the update succeeded.
    rm $dir/$x.*.acc $dir/$x.ubm
  fi
  x=$[$x+1]
done

rm $dir/gselect.*.gz
rm $dir/final.ubm 2>/dev/null
mv $dir/$x.ubm $dir/final.ubm || exit 1;


================================================
FILE: egs/steps/tfrnnlm/check_py.py
================================================
import numpy as np
import tensorflow as tf


================================================
FILE: egs/steps/tfrnnlm/check_tensorflow_installed.sh
================================================
#!/usr/bin/env bash

# This script checks whether the TF-related Kaldi binaries can be found on
# the PATH and whether steps/tfrnnlm/check_py.py runs under python.
# It exits with status 1 as soon as one of the two checks fails.
. ./path.sh

# Only the exit status of `which` matters.  stdout is redirected first and
# stderr second, so that both streams end up in /dev/null.
if which lattice-lmrescore-tf-rnnlm >/dev/null 2>&1; then
  echo TensorFlow relate binaries found. This is good.
else
  echo TF related binaries not compiled.
  echo You need to go to tools/ and run extras/install_tensorflow_cc.sh first
  echo and then do \"make\" under both src/tfrnnlm and src/tfrnnlmbin
  exit 1
fi

echo

# check_py.py only imports numpy and tensorflow, so a zero exit status means
# both modules are importable.
if python steps/tfrnnlm/check_py.py 2>/dev/null; then
  echo TensorFlow ready to use on the python side. This is good.
else
  echo TensorFlow not found on the python side.
  echo Please go to tools/ and run extras/install_tensorflow_py.sh to install it
  echo If you already have TensorFlow installed somewhere else, you would need
  echo to add it to your PATH
  exit 1
fi


================================================
FILE: egs/steps/tfrnnlm/lmrescore_rnnlm_lat.sh
================================================
#!/usr/bin/env bash

# Copyright 2015  Guoguo Chen
#           2017  Hainan Xu
# Apache 2.0

# This script rescores lattices with RNNLM trained with TensorFlow.
# A faster and more accurate version of the algorithm is at
# steps/tfrnnlm/lmrescore_rnnlm_lat_pruned.sh, which is preferred.
# One example recipe of this script is at egs/ami/s5/local/tfrnnlm/run_lstm_fast.sh

# Begin configuration section.
cmd=run.pl
skip_scoring=false
max_ngram_order=4 # Approximate the lattice-rescoring by limiting the max-ngram-order
                  # if it's set, it merges histories in the lattice if they share
                  # the same ngram history and this prevents the lattice from 
                  # exploding exponentially. Details of the n-gram approximation
                  # method are described in section 2.3 of the paper
                  # http://www.danielpovey.com/files/2018_icassp_lattice_pruning.pdf
weight=0.5  # Interpolation weight for RNNLM.
# End configuration section.

echo "$0 $@"  # Print the command line for logging

. ./utils/parse_options.sh

if [ $# != 5 ]; then
   echo "Does language model rescoring of lattices (remove old LM, add new LM)"
   echo "with TensorFlow RNNLM."
   echo ""
   echo "Usage: $0 [options] <old-lang-dir> <rnnlm-dir> \\"
   echo "                   <data-dir> <input-decode-dir> <output-decode-dir>"
   echo " e.g.: $0 data/lang_tg data/tensorflow_lstm data/test \\"
   echo "                   exp/tri3/test_tg exp/tri3/test_tfrnnlm"
   echo "options: [--cmd (run.pl|queue.pl [queue opts])]"
   exit 1;
fi

[ -f path.sh ] && . ./path.sh;

oldlang=$1
rnnlm_dir=$2
data=$3
indir=$4
outdir=$5

oldlm=$oldlang/G.fst
if [ -f $oldlang/G.carpa ]; then
  oldlm=$oldlang/G.carpa
elif [ ! -f $oldlm ]; then
  echo "$0: expecting either $oldlang/G.fst or $oldlang/G.carpa to exist" &&\
    exit 1;
fi

echo "$0: using $oldlm as old LM"

# Check up front that every input handed to the rescoring binaries exists,
# so that a missing file is reported here and not from inside the
# parallel jobs.
[ ! -d $rnnlm_dir/rnnlm ] && echo "$0: Missing tf model folder $rnnlm_dir/rnnlm" && exit 1;

# wordlist.rnn.final is included because it is passed to
# lattice-lmrescore-tf-rnnlm further down, just like unk.probs.
for f in $rnnlm_dir/unk.probs $rnnlm_dir/wordlist.rnn.final \
         $oldlang/words.txt $indir/lat.1.gz; do
  [ ! -f $f ] && echo "$0: Missing file $f" && exit 1
done

awk -v n=$0 -v w=$weight 'BEGIN {if (w < 0 || w > 1) {
  print n": Interpolation weight should be in the range of [0, 1]"; exit 1;}}' \
  || exit 1;

oldlm_command="fstproject --project_output=true $oldlm |"

mkdir -p $outdir/log
nj=`cat $indir/num_jobs` || exit 1;
cp $indir/num_jobs $outdir

oldlm_weight=`perl -e "print -1.0 * $weight;"`
if [ "$oldlm" == "$oldlang/G.fst" ]; then
  $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \
    lattice-lmrescore --lm-scale=$oldlm_weight \
    "ark:gunzip -c $indir/lat.JOB.gz|" "$oldlm_command" ark:-  \| \
    lattice-lmrescore-tf-rnnlm --lm-scale=$weight \
    --max-ngram-order=$max_ngram_order \
    $rnnlm_dir/unk.probs $rnnlm_dir/wordlist.rnn.final $oldlang/words.txt ark:- "$rnnlm_dir/rnnlm" \
    "ark,t:|gzip -c>$outdir/lat.JOB.gz" || exit 1;
else
  $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \
    lattice-lmrescore-const-arpa --lm-scale=$oldlm_weight \
    "ark:gunzip -c $indir/lat.JOB.gz|" "$oldlm" ark:-  \| \
    lattice-lmrescore-tf-rnnlm --lm-scale=$weight \
    --max-ngram-order=$max_ngram_order \
    $rnnlm_dir/unk.probs $rnnlm_dir/wordlist.rnn.final $oldlang/words.txt ark:- "$rnnlm_dir/rnnlm" \
    "ark,t:|gzip -c>$outdir/lat.JOB.gz" || exit 1;
fi
if ! $skip_scoring ; then
  err_msg="$0: Not scoring because local/score.sh does not exist or not executable."
  [ ! -x local/score.sh ] && echo $err_msg && exit 1;
  local/score.sh --cmd "$cmd" $data $oldlang $outdir
else
  echo "$0: Not scoring because --skip-scoring was specified."
fi

exit 0;


================================================
FILE: egs/steps/tfrnnlm/lmrescore_rnnlm_lat_pruned.sh
================================================
#!/usr/bin/env bash

# Copyright 2015  Guoguo Chen
#           2017  Hainan Xu
# Apache 2.0

# This script rescores lattices with RNNLM trained with TensorFlow.
# It uses a pruned algorithm to speed up the runtime and improve the accuracy;
# it is an improved version of steps/tfrnnlm/lmrescore_rnnlm_lat.sh
# and has exactly the same interface.
# The details of the pruning algorithm is described in
# http://www.danielpovey.com/files/2018_icassp_lattice_pruning.pdf
# One example recipe of this script is at egs/ami/s5/local/tfrnnlm/run_lstm_fast.sh

# Begin configuration section.
cmd=run.pl
skip_scoring=false
max_ngram_order=4 # Approximate the lattice-rescoring by limiting the max-ngram-order
                  # if it's set, it merges histories in the lattice if they share
                  # the same ngram history and this prevents the lattice from 
                  # exploding exponentially. Details of the n-gram approximation
                  # method are described in section 2.3 of the paper
                  # http://www.danielpovey.com/files/2018_icassp_lattice_pruning.pdf
acwt=0.1
weight=0.5  # Interpolation weight for RNNLM.
# End configuration section.

echo "$0 $@"  # Print the command line for logging

. ./utils/parse_options.sh

if [ $# != 5 ]; then
   echo "Does language model rescoring of lattices (remove old LM, add new LM)"
   echo "with RNNLM."
   echo ""
   echo "Usage: $0 [options] <old-lang-dir> <rnnlm-dir> \\"
   echo "                   <data-dir> <input-decode-dir> <output-decode-dir>"
   echo " e.g.: $0 data/lang_tg data/tensorflow_lstm data/test \\"
   echo "                   exp/tri3/test_tg exp/tri3/test_tfrnnlm"
   echo "options: [--cmd (run.pl|queue.pl [queue opts])]"
   exit 1;
fi

[ -f path.sh ] && . ./path.sh;

oldlang=$1
rnnlm_dir=$2
data=$3
indir=$4
outdir=$5

oldlm=$oldlang/G.fst
carpa_option=

if [ -f $oldlang/G.carpa ]; then
  oldlm=$oldlang/G.carpa
  carpa_option="--use-const-arpa=true"
elif [ ! -f $oldlm ]; then
  echo "$0: expecting either $oldlang/G.fst or $oldlang/G.carpa to exist" &&\
    exit 1;
fi

echo "$0: using $oldlm as old LM"

# Check up front that every input handed to the rescoring binary exists,
# so that a missing file is reported here and not from inside the
# parallel jobs.
[ ! -d $rnnlm_dir/rnnlm ] && echo "$0: Missing tf model folder $rnnlm_dir/rnnlm" && exit 1;

# wordlist.rnn.final is included because it is passed to
# lattice-lmrescore-tf-rnnlm-pruned further down, just like unk.probs.
for f in $rnnlm_dir/unk.probs $rnnlm_dir/wordlist.rnn.final \
         $oldlang/words.txt $indir/lat.1.gz; do
  [ ! -f $f ] && echo "$0: Missing file $f" && exit 1
done

awk -v n=$0 -v w=$weight 'BEGIN {if (w < 0 || w > 1) {
  print n": Interpolation weight should be in the range of [0, 1]"; exit 1;}}' \
  || exit 1;

mkdir -p $outdir/log
nj=`cat $indir/num_jobs` || exit 1;
cp $indir/num_jobs $outdir

$cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \
  lattice-lmrescore-tf-rnnlm-pruned --lm-scale=$weight \
  --acoustic-scale=$acwt --max-ngram-order=$max_ngram_order \
  $carpa_option $oldlm $oldlang/words.txt \
  $rnnlm_dir/unk.probs $rnnlm_dir/wordlist.rnn.final "$rnnlm_dir/rnnlm" \
  "ark:gunzip -c $indir/lat.JOB.gz|" "ark,t:|gzip -c>$outdir/lat.JOB.gz" || exit 1;

if ! $skip_scoring ; then
  err_msg="$0: Not scoring because local/score.sh does not exist or not executable."
  [ ! -x local/score.sh ] && echo $err_msg && exit 1;
  local/score.sh --cmd "$cmd" $data $oldlang $outdir
else
  echo "$0: Not scoring because --skip-scoring was specified."
fi

exit 0;


================================================
FILE: egs/steps/tfrnnlm/lstm.py
================================================
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
# Copyright (C) 2017 Intellisist, Inc. (Author: Hainan Xu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# this script trains a vanilla RNNLM with TensorFlow. 
# to call the script, do
# python steps/tfrnnlm/lstm.py --data_path=$datadir \
#        --save_path=$savepath --vocab_path=$rnn.wordlist [--hidden_size=$size]
#
# One example recipe is at egs/ami/s5/local/tfrnnlm/run_lstm.sh

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import absl
import absl.flags as flags
import tensorflow as tf

import reader

flags.DEFINE_integer("hidden_size", 200, "hidden dim of RNN")

flags.DEFINE_string("data_path", None,
                    "Where the training/test data is stored.")
flags.DEFINE_string("vocab_path", None,
                    "Where the wordlist file is stored.")
flags.DEFINE_string("save_path", "export",
                    "Model output directory.")
flags.DEFINE_bool("use_fp16", False,
                  "Train using 16-bit floats instead of 32bit floats")

FLAGS = flags.FLAGS


class Config(object):
  """Default hyper-parameters of the LSTM RNNLM and of its training loop.

  main() overrides hidden_size from the command line and attaches a
  vocab_size attribute before the model is built.
  """

  # --- model ---
  num_layers = 2        # number of stacked LSTM cells
  hidden_size = 200     # embedding and LSTM width
  keep_prob = 1.0       # dropout keep probability; dropout is used only if < 1
  init_scale = 0.1      # not read by the code in this script

  # --- batching ---
  batch_size = 64       # samples per batch
  num_steps = 20        # words per sample

  # --- optimisation ---
  learning_rate = 1.0   # learning rate before any decay
  lr_decay = 0.5        # per-epoch decay factor once max_epoch is reached
  max_epoch = 4         # number of epochs trained with the undecayed rate
  max_max_epoch = 13    # total number of training epochs
  max_grad_norm = 5     # global-norm gradient clipping threshold


def data_type():
  """Return tf.float16 when --use_fp16 is set, tf.float32 otherwise."""
  if FLAGS.use_fp16:
    return tf.float16
  return tf.float32


class RNNLMModel(tf.Module):
  """The RNN model itself.

  An embedding layer feeds a stack of config.num_layers LSTM cells whose
  outputs are projected to vocabulary-sized logits by a dense layer.
  get_initial_state() and single_step() are wrapped in tf.function; main()
  exports them as the signatures of the saved model.
  """

  def __init__(self, config, logits_bias_initializer=None):
    """Create the layers of the model.

    Args:
      config: object providing hidden_size, vocab_size, num_layers and
        keep_prob.
      logits_bias_initializer: bias initializer of the output layer; 'zeros'
        is used when None.
    """
    super().__init__()
    self._config = config

    size = config.hidden_size
    vocab_size = config.vocab_size
    dt = data_type()

    def lstm_cell():
      return tf.keras.layers.LSTMCell(size, dtype=dt, unit_forget_bias=False)

    def add_dropout(cell):
      # Wrap the cell with output dropout only when dropout is enabled;
      # otherwise the cell is returned untouched.
      if config.keep_prob < 1:
        cell = tf.nn.RNNCellDropoutWrapper(cell=cell, output_keep_prob=config.keep_prob)
      return cell

    self.embedding = tf.keras.layers.Embedding(vocab_size, size, dtype=dt)
    self.cells = [lstm_cell() for _ in range(config.num_layers)]
    self.rnn = tf.keras.layers.RNN(self.cells, return_sequences=True)

    if logits_bias_initializer is None:
      logits_bias_initializer = 'zeros'
    self.fc = tf.keras.layers.Dense(vocab_size, bias_initializer=logits_bias_initializer)

    # only used in training
    # The training cells are built from the very same cell objects as
    # self.cells, optionally wrapped with dropout.
    self.training_cells = [add_dropout(cell) for cell in self.cells]
    self.training_rnn = tf.keras.layers.RNN(self.training_cells, return_sequences=True)

  def get_logits(self, word_ids, is_training=False):
    """Return the output-layer logits for every position of word_ids.

    When is_training is true the dropout-enabled RNN is used and, if
    keep_prob < 1, dropout is also applied to the embedded inputs.
    """
    rnn = self.training_rnn if is_training else self.rnn
    inputs = self.embedding(word_ids)
    if is_training and self._config.keep_prob < 1:
      # the second argument is passed as 1 - keep_prob, i.e. the drop rate
      inputs = tf.nn.dropout(inputs, 1 - self._config.keep_prob)
    rnn_out = rnn(inputs)
    logits = self.fc(rnn_out)
    return logits

  def get_loss(self, word_ids, labels, is_training=False):
    """Return the sparse categorical cross-entropy of labels given word_ids."""
    logits = self.get_logits(word_ids, is_training)
    loss_obj = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
    return loss_obj(labels, logits)

  def get_score(self, logits):
    """Take logits as input, output a score."""
    return tf.nn.log_softmax(logits)

  @tf.function
  def get_initial_state(self):
    """Exported function which emits zeroed RNN context vector."""
    # This seems a bug in TensorFlow, but passing tf.int32 makes the state tensor also int32.
    fake_input = tf.constant(0, dtype=tf.float32, shape=[1, 1])
    # the per-layer initial states are stacked into a single tensor
    initial_state = tf.stack(self.rnn.get_initial_state(fake_input))
    return {"initial_state": initial_state}

  @tf.function
  def single_step(self, context, word_id):
    """Exported function which perform one step of the RNN model.

    Args:
      context: stacked RNN state; main() exports this function with a
        context of shape [num_layers, 2, 1, hidden_size].
      word_id: int32 tensor; main() exports this function with shape [1, 1].

    Returns:
      A dict with "log_prob" (the entry of the score vector selected by
      word_id), "rnn_states" (the stacked new states) and "rnn_out" (the RNN
      output).
    """
    # a fresh RNN layer is built around the existing cells so that the final
    # states are returned together with the output
    rnn = tf.keras.layers.RNN(self.cells, return_state=True)
    # undo the stacking: first per layer, then per state tensor of the layer
    context = tf.unstack(context)
    context = [tf.unstack(c) for c in context]

    inputs = self.embedding(word_id)
    rnn_out_and_states = rnn(inputs, initial_state=context)

    # element 0 is the output, the remaining elements are the new states
    rnn_out = rnn_out_and_states[0]
    rnn_states = tf.stack(rnn_out_and_states[1:])

    logits = self.fc(rnn_out)
    output = self.get_score(logits)
    log_prob = output[0, word_id[0, 0]]
    return {"log_prob": log_prob, "rnn_states": rnn_states, "rnn_out": rnn_out}


class RNNLMModelTrainer(tf.Module):
  """This class contains training code.

  It owns the SGD optimizer, the learning-rate variable the optimizer reads
  and a running mean used to average the validation loss.
  """

  def __init__(self, model: RNNLMModel, config):
    """Create the optimizer state.

    Args:
      model: the model to train.
      config: object providing max_grad_norm (global-norm clipping threshold).
    """
    super().__init__()
    self.model = model
    # The learning rate is a variable so that it can be changed between
    # epochs; the initial 1e-3 is overwritten by train_one_epoch().
    self.learning_rate = tf.Variable(1e-3, dtype=tf.float32, trainable=False)
    self.optimizer = tf.optimizers.SGD(learning_rate=self.learning_rate)
    self.max_grad_norm = config.max_grad_norm

    self.eval_mean_loss = tf.metrics.Mean()

  def train_one_epoch(self, data_producer, learning_rate, verbose=True):
    """Run one pass over data_producer with the given learning rate.

    Args:
      data_producer: object providing iterate() (yielding (inputs, labels)
        batches) and epoch_size.
      learning_rate: learning rate used for the whole epoch.
      verbose: if True, print the loss about ten times per epoch.
    """
    print("start epoch with learning rate {}".format(learning_rate))
    self.learning_rate.assign(learning_rate)

    # Report about ten times per epoch.  epoch_size // 10 is 0 when the epoch
    # has fewer than 10 batches, which would make the modulo below raise
    # ZeroDivisionError, so the interval is clamped to at least 1.  The
    # offset stays 1 (as before) whenever the interval allows it.
    log_interval = max(data_producer.epoch_size // 10, 1)
    log_offset = min(1, log_interval - 1)

    for i, (inputs, labels) in enumerate(data_producer.iterate()):
      loss = self._train_step(inputs, labels)
      if verbose and i % log_interval == log_offset:
        print("{}/{}: loss={}".format(i, data_producer.epoch_size, loss))

  @tf.function
  def evaluate(self, data_producer):
    """Return the mean loss of the model over all batches of data_producer."""
    self.eval_mean_loss.reset_states()
    for i, (inputs, labels) in enumerate(data_producer.iterate()):
      loss = self.model.get_loss(inputs, labels)
      self.eval_mean_loss.update_state(loss)

    return self.eval_mean_loss.result()

  @tf.function
  def _train_step(self, inputs, labels):
    """Apply one clipped SGD update and return the loss of the batch."""
    with tf.GradientTape() as tape:
      loss = self.model.get_loss(inputs, labels, is_training=True)

    tvars = self.model.trainable_variables
    grads = tape.gradient(loss, tvars)
    # clip by the global norm of all gradients before applying them
    clipped_grads, _ = tf.clip_by_global_norm(grads, self.max_grad_norm)
    self.optimizer.apply_gradients(zip(clipped_grads, tvars))
    return loss


def get_config():
  """Return a new Config instance carrying the default hyper-parameters."""
  config = Config()
  return config


def main(_):
  """Train the LSTM RNNLM and export it as a SavedModel.

  The training and validation data are loaded from --data_path with the
  vocabulary in --vocab_path, the model is trained for config.max_max_epoch
  epochs with a checkpoint after each one, and the result is written to
  --save_path.

  Raises:
    ValueError: if --data_path is not set (outside of testing mode).
  """
  # Turn this on to try the model code with this source file itself!
  __TESTING = False

  if __TESTING:
    # this file becomes the training text and reader.py the validation text
    (train_data, valid_data), word_map = reader.rnnlm_gen_data(__file__, reader.__file__)
  else:
    if not FLAGS.data_path:
      raise ValueError("Must set --data_path to RNNLM data directory")

    raw_data = reader.rnnlm_raw_data(FLAGS.data_path, FLAGS.vocab_path)
    train_data, valid_data, _, word_map = raw_data

  config = get_config()
  config.hidden_size = FLAGS.hidden_size
  config.vocab_size = len(word_map)

  if __TESTING:
    # use a much smaller scale on our tiny test data
    config.num_steps = 8
    config.batch_size = 4

  model = RNNLMModel(config)
  train_producer = reader.RNNLMProducer(train_data, config.batch_size, config.num_steps)
  trainer = RNNLMModelTrainer(model, config)

  valid_producer = reader.RNNLMProducer(valid_data, config.batch_size, config.num_steps)

  # Save variables to disk if you want to prevent crash...
  # Data producer can also be saved to preserve feeding progress.
  # Checkpoints go to the relative directory "checkpoints/"; the third
  # argument (5) is the number of checkpoints to keep.
  checkpoint = tf.train.Checkpoint(trainer=trainer, data_feeder=train_producer)
  manager = tf.train.CheckpointManager(checkpoint, "checkpoints/", 5)

  for i in range(config.max_max_epoch):
    # The exponent is 0 for the first max_epoch epochs, so the learning rate
    # stays at config.learning_rate; afterwards it is multiplied by
    # config.lr_decay once more for every further epoch.
    lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0)
    lr = config.learning_rate * lr_decay
    trainer.train_one_epoch(train_producer, lr)
    manager.save()

    eval_loss = trainer.evaluate(valid_producer)
    print("validating: loss={}".format(eval_loss))

  # Export
  # single_step is traced for one word at a time: a context of shape
  # [num_layers, 2, 1, hidden_size] and a word id of shape [1, 1].
  print("Saving model to %s." % FLAGS.save_path)
  spec = [tf.TensorSpec(shape=[config.num_layers, 2, 1, config.hidden_size], dtype=data_type(), name="context"),
          tf.TensorSpec(shape=[1, 1], dtype=tf.int32, name="word_id")]
  cfunc = model.single_step.get_concrete_function(*spec)
  cfunc2 = model.get_initial_state.get_concrete_function()
  tf.saved_model.save(model, FLAGS.save_path, signatures={"single_step": cfunc, "get_initial_state": cfunc2})


if __name__ == "__main__":
  absl.app.run(main)


================================================
FILE: egs/steps/tfrnnlm/lstm_fast.py
================================================
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
# Copyright (C) 2017 Intellisist, Inc. (Author: Hainan Xu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# This script trains a self-normalized LSTM-based RNNLM with TensorFlow.
# To call the script, do
# python steps/tfrnnlm/lstm_fast.py --data_path=$datadir \
#        --save_path=$savepath --vocab_path=$rnn.wordlist [--hidden_size=$size]
#
# One example recipe is at egs/ami/s5/local/tfrnnlm/run_vanilla_rnnlm.sh

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import absl
import absl.flags as flags
import tensorflow as tf
from tensorflow.python.keras.losses import LossFunctionWrapper

import reader
from lstm import RNNLMModel, RNNLMModelTrainer

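# The command-line flags below are already registered by lstm.py, which is
# imported above; they are kept here, commented out, for reference only.
# flags.DEFINE_integer("hidden_size", 200, "hidden dim of RNN")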
#
# flags.DEFINE_string("data_path", None,
#                     "Where the training/test data is stored.")
# flags.DEFINE_string("vocab_path", None,
#                     "Where the wordlist file is stored.")
# flags.DEFINE_string("save_path", "export",
#                     "Model output directory.")
# flags.DEFINE_bool("use_fp16", False,
#                   "Train using 16-bit floats instead of 32bit floats")

FLAGS = flags.FLAGS


class Config(object):
  """Small config.

  main() overrides hidden_size from the command line and attaches a
  vocab_size attribute before the model is built.
  """

  # --- model ---
  num_layers = 2        # number of stacked LSTM cells
  hidden_size = 200     # embedding and LSTM width
  keep_prob = 1.0       # dropout keep probability; dropout is used only if < 1
  init_scale = 0.1      # not read by the code in this script

  # --- batching ---
  batch_size = 64       # samples per batch
  num_steps = 20        # words per sample

  # --- optimisation ---
  learning_rate = 1     # learning rate before any decay
  lr_decay = 0.8        # per-epoch decay factor once max_epoch is reached
  max_epoch = 4         # number of epochs trained with the undecayed rate
  max_max_epoch = 13    # total number of training epochs
  max_grad_norm = 5     # global-norm gradient clipping threshold


def data_type():
  """Return tf.float16 when --use_fp16 is set, tf.float32 otherwise."""
  if FLAGS.use_fp16:
    return tf.float16
  return tf.float32


# this new "softmax" function we show can train a "self-normalized" RNNLM where
# the sum of the output is automatically (close to) 1.0
# which saves a lot of computation for lattice-rescoring
def new_softmax(labels, logits):
  """Return, for every sample, -logit[label] + sum(exp(logits)) - 1.

  labels is flattened to one id per sample and logits is reshaped to one row
  of scores per sample before the loss is computed.
  """
  targets = tf.reshape(labels, [-1])
  num_samples = tf.shape(targets)[0]
  scores = tf.reshape(logits, shape=[num_samples, -1])

  # sum of exp(score) over each row: this is the negative part of the objf
  exp_row_sums = tf.reduce_sum(tf.exp(scores), -1)

  # pair every row index with its target id to pick out the target's score
  row_ids = tf.expand_dims(tf.range(num_samples), 1)
  target_ids = tf.expand_dims(targets, 1)
  target_scores = tf.gather_nd(scores, tf.concat([row_ids, target_ids], 1))

  return -target_scores + exp_row_sums - 1


class MyFastLossFunction(LossFunctionWrapper):
  """Loss object that wraps new_softmax."""

  def __init__(self):
    super(MyFastLossFunction, self).__init__(new_softmax)


class FastRNNLMModel(RNNLMModel):
  """RNNLMModel trained with MyFastLossFunction instead of cross-entropy.

  The bias of the output layer is initialised to the constant -9.
  """

  def __init__(self, config):
    bias_initializer = tf.constant_initializer(-9)
    super().__init__(config, bias_initializer)

  def get_loss(self, word_ids, labels, is_training=False):
    """Return the MyFastLossFunction loss of labels given word_ids."""
    criterion = MyFastLossFunction()
    return criterion(labels, self.get_logits(word_ids, is_training))

  def get_score(self, logits):
    """Return logits unchanged."""
    # In this implementation, logits can be used as dist output
    return logits


def get_config():
  """Return a new Config instance carrying the default hyper-parameters."""
  config = Config()
  return config


def main(_):
  """Train the FastRNNLMModel and export it as a SavedModel.

  The training and validation data are loaded from --data_path with the
  vocabulary in --vocab_path, the model is trained for config.max_max_epoch
  epochs with a checkpoint after each one, and the result is written to
  --save_path.

  Raises:
    ValueError: if --data_path is not set (outside of testing mode).
  """
  # Turn this on to try the model code with this source file itself!
  __TESTING = False

  if __TESTING:
    # this file becomes the training text and reader.py the validation text
    (train_data, valid_data), word_map = reader.rnnlm_gen_data(__file__, reader.__file__)
  else:
    if not FLAGS.data_path:
      raise ValueError("Must set --data_path to RNNLM data directory")

    raw_data = reader.rnnlm_raw_data(FLAGS.data_path, FLAGS.vocab_path)
    train_data, valid_data, _, word_map = raw_data

  config = get_config()
  config.hidden_size = FLAGS.hidden_size
  config.vocab_size = len(word_map)

  if __TESTING:
    # use a much smaller scale on our tiny test data
    config.num_steps = 8
    config.batch_size = 4

  model = FastRNNLMModel(config)
  train_producer = reader.RNNLMProducer(train_data, config.batch_size, config.num_steps)
  trainer = RNNLMModelTrainer(model, config)

  valid_producer = reader.RNNLMProducer(valid_data, config.batch_size, config.num_steps)

  # Save variables to disk if you want to prevent crash...
  # Data producer can also be saved to preserve feeding progress.
  # Checkpoints go to the relative directory "checkpoints/"; the third
  # argument (5) is the number of checkpoints to keep.
  checkpoint = tf.train.Checkpoint(trainer=trainer, data_feeder=train_producer)
  manager = tf.train.CheckpointManager(checkpoint, "checkpoints/", 5)

  for i in range(config.max_max_epoch):
    # The exponent is 0 for the first max_epoch epochs, so the learning rate
    # stays at config.learning_rate; afterwards it is multiplied by
    # config.lr_decay once more for every further epoch.
    lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0)
    lr = config.learning_rate * lr_decay
    trainer.train_one_epoch(train_producer, lr)
    manager.save()

    eval_loss = trainer.evaluate(valid_producer)
    print("validating: loss={}".format(eval_loss))

  # Export
  # single_step is traced for one word at a time: a context of shape
  # [num_layers, 2, 1, hidden_size] and a word id of shape [1, 1].
  print("Saving model to %s." % FLAGS.save_path)
  spec = [tf.TensorSpec(shape=[config.num_layers, 2, 1, config.hidden_size], dtype=data_type(), name="context"),
          tf.TensorSpec(shape=[1, 1], dtype=tf.int32, name="word_id")]
  cfunc = model.single_step.get_concrete_function(*spec)
  cfunc2 = model.get_initial_state.get_concrete_function()
  tf.saved_model.save(model, FLAGS.save_path, signatures={"single_step": cfunc, "get_initial_state": cfunc2})


if __name__ == "__main__":
  absl.app.run(main)


================================================
FILE: egs/steps/tfrnnlm/reader.py
================================================
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
# Copyright (C) 2017 Intellisist, Inc. (Author: Hainan Xu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================


"""Utilities for parsing RNNLM text files."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import os

import tensorflow as tf

def _read_words(filename):
  """Return the whitespace-separated tokens of the file `filename`.

  The file is opened through tf.io.gfile.GFile; the older tf.gfile alias is
  not part of the TensorFlow 2 API that the rest of this module relies on.
  """
  with tf.io.gfile.GFile(filename, "r") as f:
    text = f.read()
  # In text mode read() already yields str on Python 3, which has no
  # decode(); only decode when raw bytes are returned.
  if isinstance(text, bytes):
    text = text.decode("utf-8")
  return text.split()

def _build_vocab(filename):
  """Map every word of the word-list file to its position in that file."""
  return {word: index for index, word in enumerate(_read_words(filename))}


def _file_to_word_ids(filename, word_to_id):
  """Convert the words of `filename` to ids, dropping words not in word_to_id."""
  word_ids = []
  for token in _read_words(filename):
    if token in word_to_id:
      word_ids.append(word_to_id[token])
  return word_ids


def rnnlm_raw_data(data_path, vocab_path):
  """Load RNNLM raw data from data directory "data_path".

  Args:
    data_path: string path to the directory holding the "train" and "valid"
      text files.
    vocab_path: string path to the word-list file that defines the word ids.

  Returns:
    tuple (train_data, valid_data, vocabulary, word_to_id) where train_data
    and valid_data are lists of word ids, vocabulary is the number of words
    in the word list and word_to_id is the word -> id dictionary.
  """
  word_to_id = _build_vocab(vocab_path)

  train_data, valid_data = [
      _file_to_word_ids(os.path.join(data_path, split), word_to_id)
      for split in ("train", "valid")
  ]

  return train_data, valid_data, len(word_to_id), word_to_id


def rnnlm_gen_data(*files):
  """Generates data and vocab from files.

  This function is used solely for testing.

  Every file is split into maximal runs of ASCII letters.  Word ids are
  assigned over all files together, by decreasing word frequency.

  Returns:
    tuple (all_word_ids, word_to_id): one list of word ids per input file,
    and the word -> id dictionary.
  """
  import collections
  import re

  non_letter = re.compile("[^A-Za-z]")
  word_counts = collections.Counter()
  tokens_per_file = []
  for path in files:
    with open(path, mode="r") as handle:
      content = handle.read()

    # splitting on single non-letter characters leaves empty pieces behind
    tokens = [piece for piece in non_letter.split(content) if piece]
    word_counts.update(tokens)
    tokens_per_file.append(tokens)

  word_to_id = {}
  for rank, (word, _) in enumerate(word_counts.most_common()):
    word_to_id[word] = rank

  all_word_ids = [[word_to_id[token] for token in tokens]
                  for tokens in tokens_per_file]
  return all_word_ids, word_to_id


class RNNLMProducer(tf.Module):
  """This is the data feeder.

  It turns a flat sequence of word ids into batches of (input, target)
  samples, where the target sequence is the input sequence shifted by one
  word.
  """

  def __init__(self, raw_data, batch_size, num_steps, name=None):
    """Build the batched dataset.

    Args:
      raw_data: flat sequence of word ids.
      batch_size: number of samples per batch.
      num_steps: number of words per sample.
      name: optional module name.
    """
    super().__init__(name)
    self.batch_size = batch_size
    self.num_steps = num_steps
    self.epoch_size = (len(raw_data) - 1) // num_steps // batch_size

    # load data into a variable so that it will be separated from graph
    self._raw_data = tf.Variable(raw_data, dtype=tf.int32, trainable=False)

    words = tf.data.Dataset.from_tensor_slices(self._raw_data)
    # pair every word with its successor
    word_pairs = tf.data.Dataset.zip((words, words.skip(1)))
    # first group words into samples, then samples into batches; incomplete
    # groups are dropped at both levels
    self._ds = (word_pairs
                .batch(num_steps, drop_remainder=True)
                .batch(batch_size, drop_remainder=True))

  def iterate(self):
    """Return the dataset of (inputs, targets) batches."""
    return self._ds


if __name__ == "__main__":
  samples = list(range(100))
  ds = RNNLMProducer(samples, 4, 8)
  print(ds.epoch_size)
  for data in ds.iterate():
    print(data)


================================================
FILE: egs/steps/tfrnnlm/vanilla_rnnlm.py
================================================
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
# Copyright (C) 2017 Intellisist, Inc. (Author: Hainan Xu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# This script trains a vanilla RNNLM with TensorFlow.
# To call the script, do
# python steps/tfrnnlm/vanilla_rnnlm.py --data_path=$datadir \
#        --save_path=$savepath --vocab_path=$rnn.wordlist [--hidden_size=$size]
#
# One example recipe is at egs/ami/s5/local/tfrnnlm/run_vanilla_rnnlm.sh

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys

import inspect
import time

import numpy as np
import tensorflow as tf

import reader

flags = tf.flags
logging = tf.logging

flags.DEFINE_integer("hidden_size", 200, "hidden dim of RNN")

flags.DEFINE_string("data_path", None,
                    "Where the training/test data is stored.")
flags.DEFINE_string("vocab_path", None,
                    "Where the wordlist file is stored.")
flags.DEFINE_string("save_path", None,
                    "Model output directory.")
flags.DEFINE_bool("use_fp16", False,
                  "Train using 16-bit floats instead of 32bit floats")

FLAGS = flags.FLAGS

class Config(object):
  """Small config.

  main() overrides hidden_size from the command line and attaches a
  vocab_size attribute before the model is built.
  """

  # --- model ---
  num_layers = 1        # number of stacked RNN cells
  hidden_size = 200     # embedding and RNN width
  keep_prob = 1         # dropout keep probability; dropout is used only if < 1
  init_scale = 0.1      # half-width of the uniform initializer built in main()

  # --- batching ---
  batch_size = 64       # sequences per batch
  num_steps = 20        # unrolled time steps per batch

  # --- optimisation ---
  learning_rate = 0.2   # learning rate before any decay
  lr_decay = 0.95       # per-epoch decay factor once max_epoch is reached
  max_epoch = 4         # number of epochs trained with the undecayed rate
  max_max_epoch = 20    # total number of training epochs
  max_grad_norm = 1     # global-norm gradient clipping threshold

def data_type():
  """Return tf.float16 when --use_fp16 is set, tf.float32 otherwise."""
  if FLAGS.use_fp16:
    return tf.float16
  return tf.float32


class RnnlmInput(object):
  """The input data.

  Holds the batching parameters taken from the config, the number of batches
  per epoch and the input/target tensors returned by reader.rnnlm_producer.
  """

  def __init__(self, config, data, name=None):
    batch_size = config.batch_size
    num_steps = config.num_steps

    self.batch_size = batch_size
    self.num_steps = num_steps

    # words available to each of the batch_size parallel sequences
    words_per_sequence = len(data) // batch_size
    self.epoch_size = (words_per_sequence - 1) // num_steps

    producer_output = reader.rnnlm_producer(
        data, batch_size, num_steps, name=name)
    self.input_data, self.targets = producer_output

class RnnlmModel(object):
  """The RNNLM model.

  The constructor builds two things in the current graph: the unrolled
  network used for training/validation (cost, final state and, when
  is_training is true, the training op), and a single-step "test" sub-graph
  whose tensors carry fixed names (test_initial_state, test_word_in,
  test_state_in, test_state_out, test_cell_out, test_word_out, test_cell_in,
  test_out).
  """

  def __init__(self, is_training, config, input_):
    """Construct the graph.

    Args:
      is_training: if True, dropout (when config.keep_prob < 1) and the
        optimizer ops are added.
      config: object providing hidden_size, vocab_size, num_layers, keep_prob
        and max_grad_norm.
      input_: object providing batch_size, num_steps, input_data and targets.
    """
    self._input = input_

    batch_size = input_.batch_size
    num_steps = input_.num_steps
    size = config.hidden_size
    vocab_size = config.vocab_size

    def rnn_cell():
      # With the latest TensorFlow source code (as of Mar 27, 2017),
      # the cell constructor needs a reuse parameter which is unfortunately
      # not defined in TensorFlow 1.0. To maintain backwards compatibility,
      # we add an argument check here:
      if 'reuse' in inspect.getargspec(
          tf.contrib.rnn.BasicRNNCell.__init__).args:
        return tf.contrib.rnn.BasicRNNCell(size,
                                           reuse=tf.get_variable_scope().reuse)
      else:
        return tf.contrib.rnn.BasicRNNCell(size)
    attn_cell = rnn_cell

    # during training with dropout enabled, every cell is wrapped so that
    # dropout is applied to its output
    if is_training and config.keep_prob < 1:
      def attn_cell():
        return tf.contrib.rnn.DropoutWrapper(
            rnn_cell(), output_keep_prob=config.keep_prob)

    self.cell = tf.contrib.rnn.MultiRNNCell(
        [attn_cell() for _ in range(config.num_layers)], state_is_tuple=True)

    # zero states for the training batch and for a single sequence
    self._initial_state = self.cell.zero_state(batch_size, data_type())
    self._initial_state_single = self.cell.zero_state(1, data_type())

    self.initial = tf.reshape(tf.stack(axis=0, values=self._initial_state_single), [config.num_layers, 1, size], name="test_initial_state")

    # first implement the less efficient version
    test_word_in = tf.placeholder(tf.int32, [1, 1], name="test_word_in")

    state_placeholder = tf.placeholder(tf.float32, [config.num_layers, 1, size], name="test_state_in")
    # unpacking the input state context into one state tensor per layer
    l = tf.unstack(state_placeholder, axis=0)
    test_input_state = tuple(
               [l[idx] for idx in range(config.num_layers)]
    )

    with tf.device("/cpu:0"):
      self.embedding = tf.get_variable(
          "embedding", [vocab_size, size], dtype=data_type())

      inputs = tf.nn.embedding_lookup(self.embedding, input_.input_data)
      test_inputs = tf.nn.embedding_lookup(self.embedding, test_word_in)

    # test time
    with tf.variable_scope("RNN"):
      (test_cell_output, test_output_state) = self.cell(test_inputs[:, 0, :], test_input_state)

    test_state_out = tf.reshape(tf.stack(axis=0, values=test_output_state), [config.num_layers, 1, size], name="test_state_out")
    test_cell_out = tf.reshape(test_cell_output, [1, size], name="test_cell_out")
    # above is the first part of the graph for test
    # test-word-in
    #               > ---- > test-state-out
    # test-state-in        > test-cell-out


    # below is the 2nd part of the graph for test
    # test-word-out
    #               > prob(word | test-word-out)
    # test-cell-in

    test_word_out = tf.placeholder(tf.int32, [1, 1], name="test_word_out")
    cellout_placeholder = tf.placeholder(tf.float32, [1, size], name="test_cell_in")

    softmax_w = tf.get_variable(
        "softmax_w", [size, vocab_size], dtype=data_type())
    softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=data_type())

    test_logits = tf.matmul(cellout_placeholder, softmax_w) + softmax_b
    test_softmaxed = tf.nn.log_softmax(test_logits)

    # log-probability of the word fed through test_word_out
    p_word = test_softmaxed[0, test_word_out[0,0]]
    test_out = tf.identity(p_word, name="test_out")

    if is_training and config.keep_prob < 1:
      inputs = tf.nn.dropout(inputs, config.keep_prob)

    # Simplified version of models/tutorials/rnn/rnn.py's rnn().
    # This builds an unrolled RNN for tutorial purposes only.
    # In general, use the rnn() or state_saving_rnn() from rnn.py.
    #
    # The alternative version of the code below is:
    #
    # inputs = tf.unstack(inputs, num=num_steps, axis=1)
    # outputs, state = tf.contrib.rnn.static_rnn(
    #     cell, inputs, initial_state=self._initial_state)
    outputs = []
    state = self._initial_state
    with tf.variable_scope("RNN"):
      for time_step in range(num_steps):
        # The condition is always true: the cell has already been called in
        # the "RNN" scope by the test-time code above, so every unrolled
        # step, including the first one, asks for variable reuse.
        if time_step > -1: tf.get_variable_scope().reuse_variables()
        (cell_output, state) = self.cell(inputs[:, time_step, :], state)
        outputs.append(cell_output)

    output = tf.reshape(tf.stack(axis=1, values=outputs), [-1, size])
    logits = tf.matmul(output, softmax_w) + softmax_b
    # every position gets weight 1 in the per-example sequence loss
    loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
        [logits],
        [tf.reshape(input_.targets, [-1])],
        [tf.ones([batch_size * num_steps], dtype=data_type())])
    self._cost = cost = tf.reduce_sum(loss) / batch_size
    self._final_state = state

    # the remaining ops are only needed for training
    if not is_training:
      return

    self._lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                      config.max_grad_norm)
    optimizer = tf.train.MomentumOptimizer(self._lr, 0.9)
    self._train_op = optimizer.apply_gradients(
        list(zip(grads, tvars)),
        global_step=tf.contrib.framework.get_or_create_global_step())

    # the learning rate is changed by feeding a value into this placeholder
    # and running the assign op (see assign_lr)
    self._new_lr = tf.placeholder(
        tf.float32, shape=[], name="new_learning_rate")
    self._lr_update = tf.assign(self._lr, self._new_lr)

  def assign_lr(self, session, lr_value):
    """Set the learning rate variable to lr_value using the given session."""
    session.run(self._lr_update, feed_dict={self._new_lr: lr_value})

  @property
  def input(self):
    """The input object the model was built with."""
    return self._input

  @property
  def initial_state(self):
    """The zero state for a batch of input_.batch_size sequences."""
    return self._initial_state

  @property
  def cost(self):
    """The summed loss divided by the batch size."""
    return self._cost

  @property
  def final_state(self):
    """The RNN state after the last unrolled time step."""
    return self._final_state

  @property
  def lr(self):
    """The learning rate variable (only set when is_training is true)."""
    return self._lr

  @property
  def train_op(self):
    """The op applying the clipped gradients (only set when is_training is true)."""
    return self._train_op

def run_epoch(session, model, eval_op=None, verbose=False):
  """Runs the model on the given data.

  Args:
    session: session used to evaluate the tensors of the model.
    model: model exposing initial_state, cost, final_state and input (which
      in turn provides epoch_size, num_steps and batch_size).
    eval_op: optional extra op fetched at every step (e.g. the training op).
    verbose: if True, periodically print progress, perplexity and speed.

  Returns:
    The perplexity over the epoch: exp(summed cost / number of time steps).
  """
  start_time = time.time()
  costs = 0.0
  iters = 0
  state = session.run(model.initial_state)

  fetches = {
      "cost": model.cost,
      "final_state": model.final_state,
  }
  if eval_op is not None:
    fetches["eval_op"] = eval_op

  epoch_size = model.input.epoch_size
  # Report about ten times per epoch.  epoch_size // 10 is 0 for epochs of
  # fewer than 10 steps, which would make the modulo below raise
  # ZeroDivisionError, so the interval is clamped to at least 1.  The offset
  # stays 10 (as before) whenever the interval is larger than that; for
  # shorter intervals, which previously never produced any output, the last
  # step of each interval is reported.
  log_interval = max(epoch_size // 10, 1)
  log_offset = min(10, log_interval - 1)

  for step in range(epoch_size):
    # carry the final state of the previous step over as the initial state
    feed_dict = {h: state[i] for i, h in enumerate(model.initial_state)}

    vals = session.run(fetches, feed_dict)
    cost = vals["cost"]
    state = vals["final_state"]

    costs += cost
    iters += model.input.num_steps

    if verbose and step % log_interval == log_offset:
      # guard against a zero elapsed time on coarse clocks
      elapsed = max(time.time() - start_time, 1e-9)
      print("%.3f perplexity: %.3f speed: %.0f wps" %
            (step * 1.0 / epoch_size, np.exp(costs / iters),
             iters * model.input.batch_size / elapsed))

  return np.exp(costs / iters)


def get_config():
  """Return a new Config instance carrying the default hyper-parameters."""
  config = Config()
  return config

def main(_):
  """Train the RNNLM, printing train/validation perplexity after each epoch.

  Reads the data named by FLAGS.data_path / FLAGS.vocab_path, builds a
  training model and a validation model under the same "Model" variable
  scope (the second with reuse=True), runs `config.max_max_epoch` epochs and
  finally saves the model to FLAGS.save_path when that flag is set.

  Raises:
    ValueError: if --data_path is not set.
  """
  if not FLAGS.data_path:
    raise ValueError("Must set --data_path to RNNLM data directory")

  raw_data = reader.rnnlm_raw_data(FLAGS.data_path, FLAGS.vocab_path)
  # The third element of raw_data is not used in this function.
  train_data, valid_data, _, word_map = raw_data

  config = get_config()
  config.hidden_size = FLAGS.hidden_size
  config.vocab_size = len(word_map)
  # NOTE(review): eval_config is configured here but never referenced below;
  # both the training and the validation model are built from `config`.
  eval_config = get_config()
  eval_config.batch_size = 1
  eval_config.num_steps = 1

  with tf.Graph().as_default():
    initializer = tf.random_uniform_initializer(-config.init_scale,
                                                config.init_scale)

    with tf.name_scope("Train"):
      train_input = RnnlmInput(config=config, data=train_data, name="TrainInput")
      with tf.variable_scope("Model", reuse=None, initializer=initializer):
        m = RnnlmModel(is_training=True, config=config, input_=train_input)
      tf.summary.scalar("Training Loss", m.cost)
      tf.summary.scalar("Learning Rate", m.lr)

    with tf.name_scope("Valid"):
      valid_input = RnnlmInput(config=config, data=valid_data, name="ValidInput")
      with tf.variable_scope("Model", reuse=True, initializer=initializer):
        mvalid = RnnlmModel(is_training=False, config=config, input_=valid_input)
      tf.summary.scalar("Validation Loss", mvalid.cost)

    sv = tf.train.Supervisor(logdir=FLAGS.save_path)
    with sv.managed_session() as session:
      for i in range(config.max_max_epoch):
        # The decay factor is 1 for the first config.max_epoch epochs and is
        # multiplied by config.lr_decay once for every epoch after that.
        lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0)

        m.assign_lr(session, config.learning_rate * lr_decay)

        print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))
        # Training pass: fetching m.train_op applies the gradient update.
        train_perplexity = run_epoch(session, m, eval_op=m.train_op,
                                     verbose=True)

        print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity))
        # Validation pass: no eval_op, so only cost and state are fetched.
        valid_perplexity = run_epoch(session, mvalid)
        print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity))

      if FLAGS.save_path:
        print("Saving model to %s." % FLAGS.save_path)
        sv.saver.save(session, FLAGS.save_path)

# Script entry point: hand control to tf.app.run(), which is expected to
# parse the command-line flags and then call main(_).
if __name__ == "__main__":
  tf.app.run()


================================================
FILE: egs/steps/train_deltas.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0

# Begin configuration.
stage=-4 #  This allows restarting partway through, when something went wrong.
config=
cmd=run.pl
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
realign_iters="10 20 30"; # Iterations on which the data is re-aligned.
num_iters=35    # Number of iterations of training
max_iter_inc=25 # Last iter to increase #Gauss on.
beam=10
careful=false
retry_beam=40
boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment
power=0.25 # Exponent for number of gaussians according to occurrence counts
cluster_thresh=-1  # for build-tree control final bottom-up clustering of leaves
norm_vars=false # deprecated.  Prefer --cmvn-opts "--norm-vars=true"
                # use the option --cmvn-opts "--norm-means=false"
cmvn_opts=      # Options passed to apply-cmvn.
delta_opts=     # Options passed to add-deltas.
context_opts=   # use "--context-width=5 --central-position=2" for quinphone
# End configuration.

echo "$0 $@"  # Print the command line for logging

# Source path.sh if present, then let parse_options.sh process the options.
[ -f path.sh ] && . ./path.sh;
. parse_options.sh || exit 1;

# Exactly six positional arguments are required; otherwise print usage.
if [ $# != 6 ]; then
   echo "Usage: steps/train_deltas.sh <num-leaves> <tot-gauss> <data-dir> <lang-dir> <alignment-dir> <exp-dir>"
   echo "e.g.: steps/train_deltas.sh 2000 10000 data/train_si84_half data/lang exp/mono_ali exp/tri1"
   echo "main options (for others, see top of script file)"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   echo "  --config <config-file>                           # config containing options"
   echo "  --stage <stage>                                  # stage to do partial re-run from."
   exit 1;
fi

numleaves=$1
totgauss=$2
data=$3
lang=$4
alidir=$5
dir=$6

# Fail early if any required input file is missing.
for f in $alidir/final.mdl $alidir/ali.1.gz $data/feats.scp $lang/phones.txt; do
  [ ! -f $f ] && echo "train_deltas.sh: no such file $f" && exit 1;
done

# Start with one Gaussian per leaf and grow linearly towards $totgauss over
# the first $max_iter_inc iterations.
numgauss=$numleaves
incgauss=$[($totgauss-$numgauss)/$max_iter_inc] # per-iter increment for #Gauss
oov=`cat $lang/oov.int` || exit 1;
ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1;
# Use the same number of parallel jobs as the alignment directory.
nj=`cat $alidir/num_jobs` || exit 1;
mkdir -p $dir/log
echo $nj > $dir/num_jobs

utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

sdata=$data/split$nj;
split_data.sh $data $nj || exit 1;


# Warn when the alignment directory recorded CMVN options but none were
# given on the command line: the recorded ones are not inherited.
[ $(cat $alidir/cmvn_opts 2>/dev/null | wc -c) -gt 1 ] && [ -z "$cmvn_opts" ] && \
  echo "$0: warning: ignoring CMVN options from source directory $alidir"
$norm_vars && cmvn_opts="--norm-vars=true $cmvn_opts"
echo $cmvn_opts  > $dir/cmvn_opts # keep track of options to CMVN.
# $delta_opts must be quoted here: it usually holds several words, and an
# unquoted expansion makes `[` fail with "too many arguments", in which case
# $dir/delta_opts would silently not be written.
[ ! -z "$delta_opts" ] && echo $delta_opts > $dir/delta_opts

# Feature pipeline per job: CMVN followed by delta features.  JOB is
# substituted by $cmd.
feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |"

rm $dir/.error 2>/dev/null

# Stage -3: accumulate tree statistics per job from the source alignments,
# then sum them into $dir/treeacc and remove the per-job files.
if [ $stage -le -3 ]; then
  echo "$0: accumulating tree stats"
  $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \
    acc-tree-stats $context_opts \
    --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \
    "ark:gunzip -c $alidir/ali.JOB.gz|" $dir/JOB.treeacc || exit 1;
  sum-tree-stats $dir/treeacc $dir/*.treeacc 2>$dir/log/sum_tree_acc.log || exit 1;
  rm $dir/*.treeacc
fi

# Stage -2: derive questions, build the tree, initialize the model and mix it
# up to $numgauss Gaussians.
if [ $stage -le -2 ]; then
  echo "$0: getting questions for tree-building, via clustering"
  # preparing questions, roots file...
  cluster-phones $context_opts $dir/treeacc $lang/phones/sets.int \
    $dir/questions.int 2> $dir/log/questions.log || exit 1;
  # Append the extra questions from the lang directory to the clustered ones.
  cat $lang/phones/extra_questions.int >> $dir/questions.int
  compile-questions $context_opts $lang/topo $dir/questions.int \
    $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1;

  echo "$0: building the tree"
  $cmd $dir/log/build_tree.log \
    build-tree $context_opts --verbose=1 --max-leaves=$numleaves \
    --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \
    $dir/questions.qst $lang/topo $dir/tree || exit 1;

  $cmd $dir/log/init_model.log \
    gmm-init-model  --write-occs=$dir/1.occs  \
      $dir/tree $dir/treeacc $lang/topo $dir/1.mdl || exit 1;
  if grep 'no stats' $dir/log/init_model.log; then
     echo "** The warnings above about 'no stats' generally mean you have phones **"
     echo "** (or groups of phones) in your phone set that had no corresponding data. **"
     echo "** You should probably figure out whether something went wrong, **"
     echo "** or whether your data just doesn't happen to have examples of those **"
     echo "** phones. **"
  fi

  # 1.mdl is overwritten in place with the mixed-up model.
  gmm-mixup --mix-up=$numgauss $dir/1.mdl $dir/1.occs $dir/1.mdl 2>$dir/log/mixup.log || exit 1;
  rm $dir/treeacc
fi

# Stage -1: re-express the source alignments in terms of the new tree/model.
if [ $stage -le -1 ]; then
  # Convert the alignments.
  echo "$0: converting alignments from $alidir to use current tree"
  $cmd JOB=1:$nj $dir/log/convert.JOB.log \
    convert-ali $alidir/final.mdl $dir/1.mdl $dir/tree \
     "ark:gunzip -c $alidir/ali.JOB.gz|" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi

# Stage 0: compile one training graph per utterance; words missing from
# words.txt are mapped to the $oov symbol by sym2int.pl.
if [ $stage -le 0 ]; then
  echo "$0: compiling graphs of transcripts"
  $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
    compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/1.mdl  $lang/L.fst  \
     "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $sdata/JOB/text |" \
      "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
fi

# Main training loop: iteration x reads $x.mdl and writes $[x+1].mdl, so the
# loop finishes with $num_iters.mdl as the newest model.
x=1
while [ $x -lt $num_iters ]; do
  echo "$0: training pass $x"
  if [ $stage -le $x ]; then
    # On the iterations listed in $realign_iters, re-align the data with the
    # current model (silence boosted by $boost_silence).
    if echo $realign_iters | grep -w $x >/dev/null; then
      echo "$0: aligning data"
      mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/$x.mdl - |"
      $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \
        gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam --careful=$careful "$mdl" \
         "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \
         "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
    fi
    # Accumulate statistics per job, then sum them and re-estimate the model.
    $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
      gmm-acc-stats-ali  $dir/$x.mdl "$feats" \
       "ark,s,cs:gunzip -c $dir/ali.JOB.gz|" $dir/$x.JOB.acc || exit 1;
    $cmd $dir/log/update.$x.log \
      gmm-est --mix-up=$numgauss --power=$power \
        --write-occs=$dir/$[$x+1].occs $dir/$x.mdl \
       "gmm-sum-accs - $dir/$x.*.acc |" $dir/$[$x+1].mdl || exit 1;
    # The previous iteration's model, accumulators and occupancies are no
    # longer needed.
    rm $dir/$x.mdl $dir/$x.*.acc
    rm $dir/$x.occs
  fi
  # The Gaussian target grows even when the iteration is skipped via $stage,
  # so a restarted run uses the same schedule.
  [ $x -le $max_iter_inc ] && numgauss=$[$numgauss+$incgauss];
  x=$[$x+1];
done

# Point final.mdl / final.occs at the model written by the last iteration
# ($x equals $num_iters once the loop above has finished).
rm $dir/final.mdl $dir/final.occs 2>/dev/null
ln -s $x.mdl $dir/final.mdl
ln -s $x.occs $dir/final.occs

steps/diagnostic/analyze_alignments.sh --cmd "$cmd" $lang $dir

# Summarize warning messages...
utils/summarize_warnings.pl  $dir/log

steps/info/gmm_dir_info.pl $dir

echo "$0: Done training system with delta+delta-delta features in $dir"

exit 0


================================================
FILE: egs/steps/train_diag_ubm.sh
================================================
#!/usr/bin/env bash

# Copyright Johns Hopkins University (Author: Daniel Povey),  2012.
# Apache 2.0.

# Train a diagonal mixture of Gaussians.  This is trained without
# reference to class labels-- except that, optionally, you can down-weight
# silence phones, and alignments are needed for that.
#
# The current use for this is in fMMI training.

# Begin configuration section.
nj=4
cmd=run.pl
num_iters=3
silence_weight=  # Empty means no silence weighting is applied.
stage=-2
# The value "intermediate" is a number of Gaussians we first obtain by clustering
# the Gaussians within each state of the model, before clustering down to
# $num_gauss.  This is for efficiency.  It's not a very important parameter,
# as far as I know.
intermediate=2000
num_gselect=50 # Number of Gaussian-selection indices to use while training
               # the model.
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;


# Exactly five positional arguments are required; otherwise print usage.
if [ $# != 5 ]; then
  echo "Usage: steps/train_diag_ubm.sh <num-gauss> <data> <lang> <alignment-dir|src-dir> <dir>"
  echo " e.g.: steps/train_diag_ubm.sh 400 data/train_si84 data/lang exp/tri2b_ali_si84 exp/ubm3c"
  echo "Options: "
  echo "  --silence-weight <sil-weight>                  # default 1.0.  Use to down-weight silence."
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --nj <num-job>                                 # number of parallel jobs to run."
  echo "  --num-iters <niter>                            # number of iterations of training (default: $num_iters)"
  echo "  --stage <stage>                                # stage to do partial re-run from."
  exit 1;
fi

num_gauss=$1
data=$2
lang=$3
alidir=$4
dir=$5

silphonelist=`cat $lang/phones/silence.csl` || exit 1;

sdata=$data/split$nj
# Feature options are inherited from the source directory; a missing file
# simply yields an empty option string.
splice_opts=`cat $alidir/splice_opts 2>/dev/null`
cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null`
delta_opts=`cat $alidir/delta_opts 2>/dev/null`
mkdir -p $dir/log
# Re-split the data unless the split directory exists and is newer than
# feats.scp.
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs

utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

# The presence of final.mat in the source directory selects LDA features;
# otherwise delta features are used.
if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"

case $feat_type in
  delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";;
  lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
    cp $alidir/final.mat $dir
    ;;
  *) echo "Invalid feature type $feat_type" && exit 1;
esac

# If the source directory has per-job transforms, apply them on top of the
# features; this needs the same number of jobs as the source directory.
if [ -f $alidir/trans.1 ]; then
  echo Using transforms from $alidir;
  [ "$nj" -ne "`cat $alidir/num_jobs`" ] && \
    echo "The number of jobs differs from alignment directory $alidir." && exit 1;
  feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$alidir/trans.JOB ark:- ark:- |"
fi

# Optional per-frame weights that scale silence frames by $silence_weight;
# they are derived from the alignments in $alidir.
if [ ! -z "$silence_weight" ]; then
  [ ! -f $alidir/ali.1.gz ] && \
    echo "You specified weighting for silence but $alidir/ali.1.gz does not exist." && exit 1;
  [ "$nj" -ne "`cat $alidir/num_jobs`" ] && \
    echo "You specified silence weight but $alidir has different #jobs." && exit 1;
  weights="--weights='ark,s,cs:gunzip -c $alidir/ali.JOB.gz | ali-to-post ark:- ark:- | weight-silence-post $silence_weight $silphonelist $alidir/final.mdl ark:- ark:- | post-to-weights ark:- ark:- |'"
else
  weights=
fi

# $intermediate should be more than $num_gauss..
# It is raised to at least twice $num_gauss.
[ $[$num_gauss*2] -gt $intermediate ] && intermediate=$[$num_gauss*2] \
  && echo "Setting intermediate=$intermediate (it was too small)";

# Stage -2: initialize the diagonal UBM ($dir/0.dubm) by clustering the
# Gaussians of the source model.
if [ $stage -le -2 ]; then
 echo "Clustering Gaussians in $alidir/final.mdl"
 $cmd $dir/log/cluster.log \
  init-ubm --fullcov-ubm=false --intermediate-num-gauss=$intermediate \
    --ubm-num-gauss=$num_gauss $alidir/final.mdl $alidir/final.occs $dir/0.dubm   || exit 1;
fi

# Store Gaussian selection indices on disk-- this speeds up the training passes.
if [ $stage -le -1 ]; then
  echo Getting Gaussian-selection info
  $cmd JOB=1:$nj $dir/log/gselect.JOB.log \
    gmm-gselect --n=$num_gselect $dir/0.dubm "$feats" \
      "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1;
fi

# Training passes: iteration x reads $x.dubm and writes $[x+1].dubm, so after
# the loop the newest model is $num_iters.dubm.
for x in `seq 0 $[$num_iters-1]`; do
  echo "Training pass $x"
  if [ $stage -le $x ]; then
  # Accumulate stats.
    $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
      gmm-global-acc-stats $weights "--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" \
      $dir/$x.dubm "$feats" $dir/$x.JOB.acc || exit 1;
    if [ $x -lt $[$num_iters-1] ]; then # Don't remove low-count Gaussians till last iter,
      opt="--remove-low-count-gaussians=false" # or gselect info won't be valid any more.
    else
      # Last iteration: clear the override.  Without this reset the value set
      # on an earlier pass (or inherited from the environment) was still
      # passed, so low-count Gaussians were never removed.
      opt=
    fi
    $cmd $dir/log/update.$x.log \
      gmm-global-est $opt $dir/$x.dubm "gmm-global-sum-accs - $dir/$x.*.acc|" \
      $dir/$[$x+1].dubm || exit 1;
    rm $dir/$x.*.acc $dir/$x.dubm
  fi
done

# The Gaussian-selection files are not needed once training is done.
rm $dir/gselect.*.gz
mv $dir/$num_iters.dubm $dir/final.dubm || exit 1;
exit 0;


================================================
FILE: egs/steps/train_lda_mllt.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
#
# LDA+MLLT refers to the way we transform the features after computing
# the MFCCs: we splice across several frames, reduce the dimension (to 40
# by default) using Linear Discriminant Analysis), and then later estimate,
# over multiple iterations, a diagonalizing transform known as MLLT or STC.
# See http://kaldi-asr.org/doc/transform.html for more explanation.
#
# Apache 2.0.

# Begin configuration.
cmd=run.pl
config=
stage=-5
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
realign_iters="10 20 30"; # Iterations on which the data is re-aligned.
mllt_iters="2 4 6 12"; # Iterations on which the MLLT transform is re-estimated.
num_iters=35    # Number of iterations of training
max_iter_inc=25  # Last iter to increase #Gauss on.
dim=40 # Feature dimension after LDA (passed to est-lda as --dim).
beam=10
retry_beam=40
careful=false
boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment
power=0.25 # Exponent for number of gaussians according to occurrence counts
randprune=4.0 # This is approximately the ratio by which we will speed up the
              # LDA and MLLT calculations via randomized pruning.
splice_opts=
cluster_thresh=-1  # for build-tree control final bottom-up clustering of leaves
norm_vars=false # deprecated.  Prefer --cmvn-opts "--norm-vars=true"
cmvn_opts=
context_opts=   # use "--context-width=5 --central-position=2" for quinphone.
# End configuration.
train_tree=true  # if false, don't actually train the tree.
use_lda_mat=  # If supplied, use this LDA[+MLLT] matrix.

echo "$0 $@"  # Print the command line for logging

# Source path.sh if present, then let parse_options.sh process the options.
[ -f path.sh ] && . ./path.sh
. parse_options.sh || exit 1;

# Exactly six positional arguments are required; otherwise print usage.
if [ $# != 6 ]; then
  echo "Usage: steps/train_lda_mllt.sh [options] <#leaves> <#gauss> <data> <lang> <alignments> <dir>"
  echo " e.g.: steps/train_lda_mllt.sh 2500 15000 data/train_si84 data/lang exp/tri1_ali_si84 exp/tri2b"
  echo "Main options (for others, see top of script file)"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --config <config-file>                           # config containing options"
  echo "  --stage <stage>                                  # stage to do partial re-run from."
  exit 1;
fi

numleaves=$1
totgauss=$2
data=$3
lang=$4
alidir=$5
dir=$6

# Fail early if any required input file is missing.
for f in $alidir/final.mdl $alidir/ali.1.gz $data/feats.scp $lang/phones.txt; do
  [ ! -f $f ] && echo "train_lda_mllt.sh: no such file $f" && exit 1;
done

# Start with one Gaussian per leaf and grow linearly towards $totgauss over
# the first $max_iter_inc iterations.
numgauss=$numleaves
incgauss=$[($totgauss-$numgauss)/$max_iter_inc] # per-iter #gauss increment
oov=`cat $lang/oov.int` || exit 1;
# Use the same number of parallel jobs as the alignment directory.
nj=`cat $alidir/num_jobs` || exit 1;
silphonelist=`cat $lang/phones/silence.csl` || exit 1;
ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1;

mkdir -p $dir/log

utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

echo $nj >$dir/num_jobs
echo "$splice_opts" >$dir/splice_opts # keep track of frame-splicing options
           # so that later stages of system building can know what they were.


# Warn when the alignment directory recorded CMVN options but none were
# given on the command line: the recorded ones are not inherited.
[ $(cat $alidir/cmvn_opts 2>/dev/null | wc -c) -gt 1 ] && [ -z "$cmvn_opts" ] && \
  echo "$0: warning: ignoring CMVN options from source directory $alidir"
$norm_vars && cmvn_opts="--norm-vars=true $cmvn_opts"
echo $cmvn_opts > $dir/cmvn_opts # keep track of options to CMVN.

sdata=$data/split$nj;
split_data.sh $data $nj || exit 1;

# CMVN + frame splicing, before any dimension-reducing transform.
splicedfeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- |"
# Note: $feats gets overwritten later in the script.
feats="$splicedfeats transform-feats $dir/0.mat ark:- ark:- |"


# Stage -5: obtain the initial transform $dir/0.mat, either by estimating LDA
# from the source alignments (silence weighted to zero) or by copying the
# matrix supplied via --use-lda-mat.
if [ $stage -le -5 ]; then
  if [ -z "$use_lda_mat" ]; then
    echo "$0: Accumulating LDA statistics."
    rm $dir/lda.*.acc 2>/dev/null
    $cmd JOB=1:$nj $dir/log/lda_acc.JOB.log \
    ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
      weight-silence-post 0.0 $silphonelist $alidir/final.mdl ark:- ark:- \| \
      acc-lda --rand-prune=$randprune $alidir/final.mdl "$splicedfeats" ark,s,cs:- \
      $dir/lda.JOB.acc || exit 1;
    est-lda --write-full-matrix=$dir/full.mat --dim=$dim $dir/0.mat $dir/lda.*.acc \
      2>$dir/log/lda_est.log || exit 1;
    rm $dir/lda.*.acc
  else
    echo "$0: Using supplied LDA matrix $use_lda_mat"
    cp $use_lda_mat $dir/0.mat || exit 1;
    [ ! -z "$mllt_iters" ] && \
      echo "$0: Warning: using supplied LDA matrix $use_lda_mat but we will do MLLT," && \
      echo "     which you might not want; to disable MLLT, specify --mllt-iters ''" && \
      sleep 5
  fi
fi

# Iteration number of the most recent transform ($dir/$cur_lda_iter.mat).
cur_lda_iter=0

# Stage -4 (only when training a tree): accumulate tree statistics per job,
# check that every job produced a file, and sum them into $dir/treeacc.
if [ $stage -le -4 ] && $train_tree; then
  echo "$0: Accumulating tree stats"
  $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \
    acc-tree-stats $context_opts \
    --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \
    "ark:gunzip -c $alidir/ali.JOB.gz|" $dir/JOB.treeacc || exit 1;
  [ `ls $dir/*.treeacc | wc -w` -ne "$nj" ] && echo "$0: Wrong #tree-accs" && exit 1;
  $cmd $dir/log/sum_tree_acc.log \
    sum-tree-stats $dir/treeacc $dir/*.treeacc || exit 1;
  rm $dir/*.treeacc
fi


# Stage -3 (only when training a tree): derive questions and build the tree.
if [ $stage -le -3 ] && $train_tree; then
  echo "$0: Getting questions for tree clustering."
  # preparing questions, roots file...
  cluster-phones $context_opts $dir/treeacc $lang/phones/sets.int \
    $dir/questions.int 2> $dir/log/questions.log || exit 1;
  # Append the extra questions from the lang directory to the clustered ones.
  cat $lang/phones/extra_questions.int >> $dir/questions.int
  compile-questions $context_opts $lang/topo $dir/questions.int \
    $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1;

  echo "$0: Building the tree"
  $cmd $dir/log/build_tree.log \
    build-tree $context_opts --verbose=1 --max-leaves=$numleaves \
    --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \
    $dir/questions.qst $lang/topo $dir/tree || exit 1;
fi

# Stage -2: create the initial model $dir/1.mdl, either from the tree
# statistics (new tree) or as a flat start on the tree copied from $alidir.
if [ $stage -le -2 ]; then
  echo "$0: Initializing the model"
  if $train_tree; then
    gmm-init-model  --write-occs=$dir/1.occs  \
      $dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/log/init_model.log || exit 1;
    grep 'no stats' $dir/log/init_model.log && echo "This is a bad warning.";
    rm $dir/treeacc
  else
    cp $alidir/tree $dir/ || exit 1;
    $cmd JOB=1 $dir/log/init_model.log \
      gmm-init-model-flat $dir/tree $lang/topo $dir/1.mdl \
        "$feats subset-feats ark:- ark:-|" || exit 1;
  fi
fi


# Stage -1: re-express the source alignments in terms of the current
# tree/model.
if [ $stage -le -1 ]; then
  # Convert the alignments.
  echo "$0: Converting alignments from $alidir to use current tree"
  $cmd JOB=1:$nj $dir/log/convert.JOB.log \
    convert-ali $alidir/final.mdl $dir/1.mdl $dir/tree \
     "ark:gunzip -c $alidir/ali.JOB.gz|" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi

# Stage 0: training graphs are only compiled when some realignment is
# configured ($realign_iters non-empty).
if [ $stage -le 0 ] && [ "$realign_iters" != "" ]; then
  echo "$0: Compiling graphs of transcripts"
  $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
    compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/1.mdl  $lang/L.fst  \
     "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $data/split$nj/JOB/text |" \
      "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
fi


# Main training loop: iteration x reads $x.mdl and writes $[x+1].mdl, so the
# loop finishes with $num_iters.mdl as the newest model.
x=1
while [ $x -lt $num_iters ]; do
  echo Training pass $x
  # On the iterations listed in $realign_iters, re-align the data with the
  # current model (silence boosted by $boost_silence).
  if echo $realign_iters | grep -w $x >/dev/null && [ $stage -le $x ]; then
    echo Aligning data
    mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/$x.mdl - |"
    $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \
      gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam --careful=$careful "$mdl" \
      "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \
      "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
  fi
  # On the iterations listed in $mllt_iters, estimate an MLLT update, apply it
  # to the model means and compose it with the current transform.
  if echo $mllt_iters | grep -w $x >/dev/null; then
    if [ $stage -le $x ]; then
      echo "$0: Estimating MLLT"
      $cmd JOB=1:$nj $dir/log/macc.$x.JOB.log \
        ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \
        weight-silence-post 0.0 $silphonelist $dir/$x.mdl ark:- ark:- \| \
        gmm-acc-mllt --rand-prune=$randprune  $dir/$x.mdl "$feats" ark:- $dir/$x.JOB.macc \
        || exit 1;
      est-mllt $dir/$x.mat.new $dir/$x.*.macc 2> $dir/log/mupdate.$x.log || exit 1;
      gmm-transform-means  $dir/$x.mat.new $dir/$x.mdl $dir/$x.mdl \
        2> $dir/log/transform_means.$x.log || exit 1;
      compose-transforms --print-args=false $dir/$x.mat.new $dir/$cur_lda_iter.mat $dir/$x.mat || exit 1;
      rm $dir/$x.*.macc
    fi
    # These two updates happen even when the iteration is skipped via $stage,
    # so a restarted run uses the transform written by the earlier run.
    feats="$splicedfeats transform-feats $dir/$x.mat ark:- ark:- |"
    cur_lda_iter=$x
  fi

  # Accumulate statistics per job, then sum them and re-estimate the model.
  if [ $stage -le $x ]; then
    $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
      gmm-acc-stats-ali  $dir/$x.mdl "$feats" \
      "ark,s,cs:gunzip -c $dir/ali.JOB.gz|" $dir/$x.JOB.acc || exit 1;
    $cmd $dir/log/update.$x.log \
      gmm-est --write-occs=$dir/$[$x+1].occs --mix-up=$numgauss --power=$power \
        $dir/$x.mdl "gmm-sum-accs - $dir/$x.*.acc |" $dir/$[$x+1].mdl || exit 1;
    rm $dir/$x.mdl $dir/$x.*.acc $dir/$x.occs
  fi
  # The Gaussian target grows even when the iteration is skipped via $stage.
  [ $x -le $max_iter_inc ] && numgauss=$[$numgauss+$incgauss];
  x=$[$x+1];
done

# Point final.mdl / final.occs at the model written by the last iteration and
# final.mat at the most recently estimated transform.
rm $dir/final.{mdl,mat,occs} 2>/dev/null
ln -s $x.mdl $dir/final.mdl
ln -s $x.occs $dir/final.occs
ln -s $cur_lda_iter.mat $dir/final.mat

steps/diagnostic/analyze_alignments.sh --cmd "$cmd" $lang $dir

# Summarize warning messages...
utils/summarize_warnings.pl $dir/log

steps/info/gmm_dir_info.pl $dir

echo "$0: Done training system with LDA+MLLT features in $dir"

exit 0


================================================
FILE: egs/steps/train_lvtln.sh
================================================
#!/usr/bin/env bash

# Copyright 2012-2014  Johns Hopkins University (Author: Daniel Povey)
# Copyright 2014       Vimal Manohar
# This training script trains linear-VTLN models starting from an existing
# system based on either LDA+MLLT or delta+delta-delta features.
# Works with either mfcc or plp features, but you need to set the 
# --base-feat-type option.
# The resulting system can be used with align_lvtln.sh and/or decode_lvtln.sh
# to get VTLN warping factors for data, for warped data extraction, or (for
# the training data) you can use the warping factors this script outputs
# in $dir/final.warp
#
# Apache 2.0

# Begin configuration.
stage=-6 #  This allows restarting partway through, when something went wrong.
config=
cmd=run.pl
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
realign_iters="10 20 30";
num_iters=35    # Number of iterations of training
max_iter_inc=25 # Last iter to increase #Gauss on.
beam=10
retry_beam=40
boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment
power=0.25 # Exponent for number of gaussians according to occurrence counts
cluster_thresh=-1  # for build-tree control final bottom-up clustering of leaves
cmvn_opts=  # you can supply e.g. --cmvn-opts "--norm-vars=true" to turn on variance
            # normalization, but only if base system is the delta type, not LDA.
lvtln_iters="2 4 6 8 10 12 14 16 20"; # iters on which to recompute LVTLN transform
num_utt_lvtln_init=200; # number of utterances (subset) to initialize
                        # LVTLN transform.  Not too critical.
min_warp=0.85   # smallest warp factor on the grid
max_warp=1.25   # largest warp factor on the grid
warp_step=0.01  # spacing between consecutive warp factors
base_feat_type=mfcc # or could be PLP.
logdet_scale=0.0

# End configuration.

echo "$0 $@"  # Print the command line for logging

# Source path.sh if present, then let parse_options.sh process the options.
[ -f path.sh ] && . ./path.sh;
. parse_options.sh || exit 1;

# num_classes: number of warp factors on the grid min_warp, min_warp+warp_step,
# ..., max_warp (the 1.5 rounds to nearest and counts both end points).
# default_class: index on that grid of the warp factor 1.0 (no warping).
num_classes=$(perl -e "print int(1.5 + ($max_warp - $min_warp) / $warp_step);") || exit 1;
default_class=$(perl -e "print int(0.5 + (1.0 - $min_warp) / $warp_step);") || exit 1;

# Exactly six positional arguments are required; otherwise print usage.
if [ $# != 6 ]; then
   echo "Usage: $0 <num-leaves> <tot-gauss> <data-dir> <lang-dir> <alignment-dir> <exp-dir>"
   echo "e.g.: $0 2000 10000 data/train_si84_half data/lang exp/mono_ali exp/tri1"
   echo "main options (for others, see top of script file)"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   echo "  --config <config-file>                           # config containing options"
   echo "  --stage <stage>                                  # stage to do partial re-run from."
   exit 1;
fi

numleaves=$1
totgauss=$2
data=$3
lang=$4
alidir=$5
dir=$6

# Fail early if any required input file is missing (wav.scp is needed because
# warped features are recomputed from audio).
for f in $alidir/final.mdl $alidir/ali.1.gz $data/feats.scp $lang/phones.txt $data/wav.scp; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

# Start with one Gaussian per leaf and grow linearly towards $totgauss over
# the first $max_iter_inc iterations.
numgauss=$numleaves
incgauss=$[($totgauss-$numgauss)/$max_iter_inc] # per-iter increment for #Gauss
oov=`cat $lang/oov.int` || exit 1;
silphonelist=`cat $lang/phones/silence.csl` || exit 1;
ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1;
# Use the same number of parallel jobs as the alignment directory.
nj=`cat $alidir/num_jobs` || exit 1;
# A missing splice_opts file simply yields an empty option string.
splice_opts=`cat $alidir/splice_opts 2>/dev/null`
mkdir -p $dir/log
echo $nj > $dir/num_jobs

utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

sdata=$data/split$nj;
split_data.sh $data $nj || exit 1;


cp $alidir/splice_opts $dir 2>/dev/null


# Choose the feature pipeline from the base system: delta+delta-delta when
# $alidir has no final.mat, LDA otherwise.  In both cases:
#   sifeats          - features without the per-speaker transform
#   feats            - sifeats followed by the transforms in $dir/trans.JOB
#   featsub_warped   - subset features for warp class CLASS
#   featsub_unwarped - subset features for the default (unwarped) class
if [ ! -f $alidir/final.mat ]; then
  [ $(cat $alidir/cmvn_opts 2>/dev/null | wc -c) -gt 1 ] && [ -z "$cmvn_opts" ] && \
    echo "$0: warning: ignoring CMVN options from $alidir.";
  echo $cmvn_opts > $dir/cmvn_opts

  echo "$0: Using delta+delta-delta features since $alidir/final.mat does not exist"
  sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |"
  feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |"
  # for the subsets of features that we use to estimate the linear transforms, we don't
  # bother with CMVN.  This will give us wrong offsets on the transforms, but it will end
  # up not mattering because we allow an arbitrary offset (bias) term when we apply
  # these transforms.
  featsub_warped="ark:add-deltas ark:$dir/feats.CLASS.ark ark:- |" # you need to define CLASS when invoking $cmd.
  featsub_unwarped="ark:add-deltas ark:$dir/feats.$default_class.ark ark:- |"
else
  echo "$0: Using LDA features"
  # NOTE(review): this only prints a message; the script continues and the
  # supplied value is replaced below by the one copied from $alidir.
  [ ! -z "$cmvn_opts" ] && echo  "$0: you cannot supply --cmvn-opts if base system is LDA."
  cp $alidir/final.mat $alidir/full.mat $alidir/splice_opts $alidir/cmvn_opts $dir 2>/dev/null 
  cmvn_opts=`cat $dir/cmvn_opts 2>/dev/null`
  sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
  feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |"
  featsub_warped="ark:splice-feats $splice_opts ark:$dir/feats.CLASS.ark ark:- | transform-feats $dir/final.mat ark:- ark:- |" # you need to define CLASS when invoking $cmd.
  featsub_unwarped="ark:splice-feats $splice_opts ark:$dir/feats.$default_class.ark ark:- | transform-feats $dir/final.mat ark:- ark:- |"  
fi

# Refuse to run on data that already carries warp factors.
if [ -f $data/utt2warp ]; then
  echo "$0: source data directory $data appears to already have VTLN.";
  exit 1;
fi

# create a small subset of utterances for purposes of initializing the LVTLN transform
# utils/shuffle_list.pl is deterministic, unlike sort -R.
cat $data/utt2spk | awk '{print $1}' | utils/shuffle_list.pl | \
  head -n $num_utt_lvtln_init > $dir/utt_subset

# Stage -6: for every warp class, recompute the base features of the subset
# from audio with the corresponding --vtln-warp and store them in
# $dir/feats.<class>.ark.
if [ $stage -le -6 ]; then
  echo "$0: computing warped subset of features"
  if [ -f $data/segments ]; then
    echo "$0 [info]: segments file exists: using that."
    subset_feats="utils/filter_scp.pl $dir/utt_subset $data/segments | extract-segments scp:$data/wav.scp - ark:- "
  else
    echo "$0 [info]: no segments file exists: using wav.scp directly."
    subset_feats="utils/filter_scp.pl $dir/utt_subset $data/wav.scp | wav-copy scp:- ark:- "
  fi
  rm $dir/.error 2>/dev/null
  # One background job per warp class; a failure is recorded by creating
  # $dir/.error, which is checked once all jobs have finished.
  for c in $(seq 0 $[$num_classes-1]); do
    this_warp=$(perl -e "print ($min_warp + ($c*$warp_step));")
    $cmd $dir/log/compute_warped_feats.$c.log \
      $subset_feats \| compute-${base_feat_type}-feats --verbose=2 \
      --config=conf/${base_feat_type}.conf --vtln-warp=$this_warp ark:- ark:- \| \
      copy-feats --compress=true ark:- ark:$dir/feats.$c.ark || touch $dir/.error &
  done
  wait;
  if [ -f $dir/.error ]; then
    echo "$0: Computing warped features failed: check $dir/log/compute_warped_feats.*.log"
    exit 1;
  fi
fi

# Sanity check: the features recomputed for the default (unwarped) class must
# match the features already stored in $data/feats.scp for the same subset.
if ! utils/filter_scp.pl $dir/utt_subset $data/feats.scp | \
  compare-feats --threshold=0.98 scp:-  ark:$dir/feats.$default_class.ark >&/dev/null; then
  echo "$0: features stored on disk differ from those computed with no warping."
  echo "    Possibly your feature type is wrong (--base-feat-type option)"
  exit 1;
fi
  
# Per-job audio source: segment extraction when a segments file exists,
# otherwise the wav.scp entries directly.
if [ -f $data/segments ]; then
  subset_utts="ark:extract-segments scp:$sdata/JOB/wav.scp $sdata/JOB/segments ark:- |"
else
  echo "$0 [info]: no segments file exists: using wav.scp directly."
  subset_utts="ark:wav-copy scp:$sdata/JOB/wav.scp ark:- |"
fi

# Stage -5: create $dir/0.lvtln and train one linear transform per warp class
# from the unwarped/warped subset feature pairs.
if [ $stage -le -5 ]; then
  echo "$0: initializing base LVTLN transforms in $dir/0.lvtln (ignore warnings below)"
  dim=$(feat-to-dim "$featsub_unwarped" - ) || exit 1;

  $cmd $dir/log/init_lvtln.log \
    gmm-init-lvtln --dim=$dim --num-classes=$num_classes --default-class=$default_class \
      $dir/0.lvtln || exit 1;

  # Per-frame weights with silence set to zero, derived from the source
  # alignments.
  $cmd JOB=1:$nj $dir/log/get_weights.JOB.log \
    ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz |" ark:- \| \
    weight-silence-post 0.0 "$silphonelist" $alidir/final.mdl ark:- ark:- \| \
    post-to-weights ark:- "ark,scp:$dir/weights.JOB.ark,$dir/weights.JOB.scp" || exit 1

  # Merge the per-job weight indexes into a single scp file.
  for n in `seq 1 $nj`; do 
    cat $dir/weights.$n.scp
  done > $dir/weights.scp

  # 0.lvtln is read and rewritten in place once per warp class.
  for c in $(seq 0 $[$num_classes-1]); do
    this_warp=$(perl -e "print ($min_warp + ($c*$warp_step));")
    orig_feats=ark:$dir/feats.$default_class.ark
    warped_feats=ark:$dir/feats.$c.ark
    logfile=$dir/log/train_special.$c.log
    this_featsub_warped="$(echo $featsub_warped | sed s/CLASS/$c/)"
    if ! gmm-train-lvtln-special --warp=$this_warp --normalize-var=true \
      --weights-in="scp:$dir/weights.scp" \
      $c $dir/0.lvtln $dir/0.lvtln \
      "$featsub_unwarped" "$this_featsub_warped" 2>$logfile; then
      echo "$0: Error training LVTLN transform, see $logfile";
      exit 1;
    fi
  done  
  rm $dir/final.lvtln 2>/dev/null
  ln -s 0.lvtln $dir/final.lvtln
fi

if [ $stage -le -4 ]; then
  echo "$0: computing initial LVTLN transforms for speakers"

  if [ -f $alidir/final.alimdl ]; then
    # if the base system was trained with SAT, it's probably better
    # to use the .alimdl, trained speaker-independent, to get the
    # LVTLN transforms (LVTLN may be closer to an unadapted system).
    echo "$0: to get initial LVTLN transforms, using $alidir/final.alimdl"
    srcmodel=$alidir/final.alimdl
  else
    srcmodel=$alidir/final.mdl
  fi

  $cmd JOB=1:$nj $dir/log/lvtln.0.JOB.log \
    ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
    weight-silence-post 0.0 "$silphonelist" $alidir/final.mdl ark:- ark:- \| \
    gmm-post-to-gpost $srcmodel "$sifeats" ark:- ark:- \| \
    gmm-est-lvtln-trans --logdet-scale=$logdet_scale --verbose=1 \
      --spk2utt=ark:$sdata/JOB/spk2utt $srcmodel \
      $dir/0.lvtln "$sifeats" ark:- ark:$dir/trans.JOB ark,t:$dir/warp.0.JOB || exit 1
  
  # consolidate the warps into one file.
  for j in $(seq $nj); do cat $dir/warp.0.$j; done > $dir/warp.0
  rm $dir/warp.0.*
fi

if [ $stage -le -3 ]; then
  echo "$0: accumulating tree stats"
  $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \
    acc-tree-stats  --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \
     "ark:gunzip -c $alidir/ali.JOB.gz|" $dir/JOB.treeacc || exit 1;
  sum-tree-stats $dir/treeacc $dir/*.treeacc 2>$dir/log/sum_tree_acc.log || exit 1;
  rm $dir/*.treeacc
fi

if [ $stage -le -2 ]; then
  echo "$0: getting questions for tree-building, via clustering"
  # preparing questions, roots file...
  cluster-phones $dir/treeacc $lang/phones/sets.int $dir/questions.int 2> $dir/log/questions.log || exit 1;
  cat $lang/phones/extra_questions.int >> $dir/questions.int
  compile-questions $lang/topo $dir/questions.int $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1;

  echo "$0: building the tree"
  $cmd $dir/log/build_tree.log \
    build-tree --verbose=1 --max-leaves=$numleaves \
    --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \
    $dir/questions.qst $lang/topo $dir/tree || exit 1;

  gmm-init-model  --write-occs=$dir/1.occs  \
    $dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/log/init_model.log || exit 1;
  grep 'no stats' $dir/log/init_model.log && echo "This is a bad warning.";

  gmm-mixup --mix-up=$numgauss $dir/1.mdl $dir/1.occs $dir/1.mdl 2>$dir/log/mixup.log || exit 1;
  rm $dir/treeacc
fi

if [ $stage -le -1 ]; then
  # Convert the alignments.
  echo "$0: converting alignments from $alidir to use current tree"
  $cmd JOB=1:$nj $dir/log/convert.JOB.log \
    convert-ali $alidir/final.mdl $dir/1.mdl $dir/tree \
     "ark:gunzip -c $alidir/ali.JOB.gz|" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi

if [ $stage -le 0 ]; then
  echo "$0: compiling graphs of transcripts"
  $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
    compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/1.mdl  $lang/L.fst  \
     "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $data/split$nj/JOB/text |" \
      "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
fi

# Main training loop, for x = 1 .. num_iters-1.  On each pass we optionally
# realign the data and/or re-estimate the per-speaker LVTLN transforms, then
# accumulate stats with $x.mdl and estimate $[x+1].mdl.  The work of a pass is
# skipped when --stage is greater than $x (so a run can be resumed part-way).
# NOTE(review): $feats and $sifeats are defined earlier in the script (outside
# this excerpt); presumably $feats includes the speaker transforms and $sifeats
# does not -- confirm against the top of the file.
x=1
while [ $x -lt $num_iters ]; do
  echo "$0: training pass $x"
  # Realign only on the iterations listed in $realign_iters.
  if echo $realign_iters | grep -w $x >/dev/null; then
    if [ $stage -le $x ]; then
      echo "$0: aligning data"
      # The model is read through gmm-boost-silence, applied to the optional
      # silence phones, rather than directly from disk.
      mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/$x.mdl - |"
      $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \
        gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$mdl" \
         "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \
         "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
    fi
  fi
  # Re-estimate the LVTLN transforms only on the iterations listed in
  # $lvtln_iters.  Silence frames get zero weight (weight-silence-post 0.0).
  # The new transforms are written to new_trans.JOB and only moved over
  # trans.JOB after the $cmd call has returned successfully.
  if echo $lvtln_iters | grep -w $x >/dev/null; then
    if [ $stage -le $x ]; then
      echo "Re-estimating LVTLN transforms"
      $cmd JOB=1:$nj $dir/log/lvtln.$x.JOB.log \
        ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:-  \| \
        weight-silence-post 0.0 $silphonelist $dir/$x.mdl ark:- ark:- \| \
        gmm-post-to-gpost $dir/$x.mdl "$feats" ark:- ark:- \| \
        gmm-est-lvtln-trans --logdet-scale=$logdet_scale --verbose=1 \
          --spk2utt=ark:$sdata/JOB/spk2utt $dir/$x.mdl \
          $dir/0.lvtln "$sifeats" ark:- ark:$dir/new_trans.JOB ark,t:$dir/warp.$x.JOB || exit 1
      # Install the new transforms, then consolidate the per-job warps into
      # one file ($dir/warp.$x) and remove the per-job pieces.
      for j in $(seq $nj); do mv $dir/new_trans.$j $dir/trans.$j; done
      for j in $(seq $nj); do cat $dir/warp.$x.$j; done > $dir/warp.$x
      rm $dir/warp.$x.*
    fi
  fi

  # Accumulate stats from the current alignments and update the model.
  # The old model, its occupancies and the per-job accumulators are removed
  # once $[x+1].mdl has been written.
  if [ $stage -le $x ]; then
    $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
      gmm-acc-stats-ali  $dir/$x.mdl "$feats" \
      "ark,s,cs:gunzip -c $dir/ali.JOB.gz|" $dir/$x.JOB.acc || exit 1;
    $cmd $dir/log/update.$x.log \
      gmm-est --mix-up=$numgauss --power=$power \
      --write-occs=$dir/$[$x+1].occs $dir/$x.mdl \
      "gmm-sum-accs - $dir/$x.*.acc |" $dir/$[$x+1].mdl || exit 1;
    rm $dir/$x.mdl $dir/$x.*.acc
    rm $dir/$x.occs
  fi
  # Raise the Gaussian target by $incgauss per pass while x <= $max_iter_inc.
  [ $x -le $max_iter_inc ] && numgauss=$[$numgauss+$incgauss];
  x=$[$x+1];
done


if [ $stage -le $x ]; then
  # Accumulate stats for "alignment model"-- this model is computed with the
  # speaker-independent features, but matches Gaussian-for-Gaussian with the
  # final speaker-adapted model.
  $cmd JOB=1:$nj $dir/log/acc_alimdl.JOB.log \
    ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:-  \| \
    gmm-acc-stats-twofeats $dir/$x.mdl "$feats" "$sifeats" \
    ark,s,cs:- $dir/$x.JOB.acc || exit 1;
  [ `ls $dir/$x.*.acc | wc -w` -ne "$nj" ] && echo "$0: Wrong #accs" && exit 1;
  # Update model.
  $cmd $dir/log/est_alimdl.log \
    gmm-est --power=$power --remove-low-count-gaussians=false $dir/$x.mdl \
    "gmm-sum-accs - $dir/$x.*.acc|" $dir/$x.alimdl  || exit 1;
  rm $dir/$x.*.acc
fi

if true; then # Diagnostics
  last_iter=$(echo 0 $lvtln_iters  | awk '{print $NF;}')
  ln -sf warp.$last_iter $dir/final.warp
  if [ -f $data/spk2gender ]; then 
    # To make it easier to eyeball the male and female speakers' warps
    # separately, separate them out.
    for g in m f; do # means: for gender in male female
      cat $dir/final.warp | \
        utils/filter_scp.pl <(grep -w $g $data/spk2gender | awk '{print $1}') > $dir/final.warp.$g
      echo -n "The last few warp factors for gender $g are: "
      tail -n 10 $dir/final.warp.$g | awk '{printf("%s ", $2);}'; 
      echo
    done
  fi
fi

ln -sf $x.mdl $dir/final.mdl
ln -sf $x.occs $dir/final.occs
ln -sf $x.alimdl $dir/final.alimdl

# Summarize warning messages...
utils/summarize_warnings.pl  $dir/log

echo "$0: Done training LVTLN system in $dir"


================================================
FILE: egs/steps/train_map.sh
================================================
#!/usr/bin/env bash
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.


# Train a model on top of existing features (no feature-space learning of any
# kind is done).  This script does not re-train the tree, it just does one iteration
# of MAP adaptation to the model in the input alignment-directory.  It's useful for
# adapting a system to a specific gender, or new acoustic conditions.

# Note: what we implement here is not the MAP from the paper by Gauvain and Lee,
# it's the simpler (and, I believe, more widely used) so-called "relevance MAP",
# implemented in HTK, where we add a fixed count "tau" of fake Gaussian stats
# generated from the old model, to the new 'in-domain' stats from the features
# and alignments provided;  and we only update the mean.  So if the new count
# is zero it just gives you the Gaussian parameters from the old model, but as
# you get more than about tau counts, it approaches the in-domain stats.
# We use 'gmm-ismooth-stats' in the command line because the equations for this
# are the same as the equations for i-smoothing in discriminative training
# (for which, see my [Dan Povey's] PhD thesis).

# Begin configuration..
tau=20 # MAP smoothing constant: the number of "fake counts" contributed by the
       # old model.  A larger tau gives less aggressive re-estimation (more
       # smoothing); 10 or 15 may also be worth trying.
stage=0
cmd=run.pl
# End configuration section.

# Print the command line for logging.
echo "$0 $@"

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# Exactly four positional arguments are required; otherwise print the usage
# message on stdout and fail.
if [ $# -ne 4 ]; then
  cat <<'EOF'
Usage: steps/train_map.sh <data> <lang> <ali-dir> <exp-dir>
 e.g.: steps/train_map.sh data/train_si84_female data/lang exp/tri3c_ali_si84_female exp/tri4b_female
Main options (for others, see top of script file)
  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs.
  --config <config-file>                           # config containing options
  --stage <stage>                                  # stage to do partial re-run from.
EOF
  exit 1
fi

# Positional arguments.
data=$1
lang=$2
alidir=$3
dir=$4

# Every one of these inputs must already exist.
for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl; do
  if [ ! -f $f ]; then
    echo "$0: no such file $f" && exit 1
  fi
done

# The job count and the feature options are read from the alignment directory;
# the option files are optional (missing ones yield empty strings).
nj=$(cat $alidir/num_jobs) || exit 1
sdata=$data/split$nj
splice_opts=$(cat $alidir/splice_opts 2>/dev/null) # frame-splicing options.
cmvn_opts=$(cat $alidir/cmvn_opts 2>/dev/null)
delta_opts=$(cat $alidir/delta_opts 2>/dev/null)

mkdir -p $dir/log

utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1
cp $lang/phones.txt $dir || exit 1

cp $alidir/tree $dir
# Link ali.*.gz from $alidir into the destination directory.
utils/ln.pl $alidir/ali.*.gz $dir

echo $nj >$dir/num_jobs
# Copy whichever option files exist (cmvn_opts is the cmn/cmvn option);
# errors for absent files are discarded.
for optfile in splice_opts cmvn_opts delta_opts; do
  cp $alidir/$optfile $dir 2>/dev/null
done
# Split the data unless an existing split directory is newer than feats.scp.
if ! [[ -d $sdata && $data/feats.scp -ot $sdata ]]; then
  split_data.sh $data $nj || exit 1
fi

## Set up features.
# The feature type is inferred from the alignment directory: if it contains an
# LDA(-like) matrix final.mat we splice and transform, otherwise we add deltas.
if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"

# $sifeats is the feature pipeline without any per-speaker transforms.
case $feat_type in
  delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";;
  lda) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
    cp $alidir/final.mat $dir
    cp $alidir/full.mat $dir 2>/dev/null
    ;;
  *) echo "Invalid feature type $feat_type" && exit 1;
esac
# If the alignment directory has per-speaker transforms, link them into $dir
# and apply them on top of $sifeats; otherwise $feats is just $sifeats.
if [ -f $alidir/trans.1 ]; then
  echo "$0: using transforms from $alidir"
  # Call utils/ln.pl explicitly (as is done for the alignments) instead of
  # relying on ln.pl being on PATH, and stop if linking fails: $feats below
  # reads the transforms from $dir/trans.JOB, so they must be present there.
  utils/ln.pl $alidir/trans.* $dir || exit 1; # Link them to dest dir.
  feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |"
else
  feats="$sifeats"
fi
##

# Stage 0: accumulate statistics with the input model and alignments, one
# accumulator per job, then sum them into $dir/0.acc.
if [ $stage -le 0 ]; then
  $cmd JOB=1:$nj $dir/log/acc.JOB.log \
    gmm-acc-stats-ali $alidir/final.mdl "$feats" \
      "ark,s,cs:gunzip -c $alidir/ali.JOB.gz|" $dir/0.JOB.acc || exit 1

  # There must be exactly one accumulator per job.
  num_accs=$(ls $dir/0.*.acc | wc -w)
  if [ "$num_accs" -ne "$nj" ]; then
    echo "$0: wrong #accs" && exit 1
  fi

  $cmd $dir/log/sum_accs.log \
    gmm-sum-accs $dir/0.acc $dir/0.*.acc || exit 1

  rm $dir/0.*.acc
fi

# Stage 1: smooth the stats towards the input model with weight tau, then
# update only the model means (--update-flags=m).  This is traditional in MAP
# estimation.
if [ $stage -le 1 ]; then
  $cmd $dir/log/update.log \
    gmm-ismooth-stats --smooth-from-model --tau=$tau \
      $alidir/final.mdl $dir/0.acc - \| \
    gmm-est --update-flags=m --write-occs=$dir/final.occs \
      --remove-low-count-gaussians=false \
      $alidir/final.mdl - $dir/final.mdl || exit 1
fi

echo Done


================================================
FILE: egs/steps/train_mmi.sh
================================================
#!/usr/bin/env bash
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.

# MMI training (or optionally boosted MMI, if you give the --boost option).
# 4 iterations (by default) of Extended Baum-Welch update.
#
# For the numerator we have a fixed alignment rather than a lattice--
# this actually follows from the way lattices are defined in Kaldi, which
# is to have a single path for each word (output-symbol) sequence.

# Begin configuration section.
# Each variable below can be overridden on the command line as --<name> <value>
# (with '_' written as '-'); see parse_options.sh.
cmd=run.pl
num_iters=4
boost=0.0
cancel=true # if true, cancel num and den counts on each frame.
drop_frames=false # if true, ignore stats from frames where num + den
                       # have no overlap. 
tau=400        # passed to gmm-est-gaussians-ebw as --tau
weight_tau=10  # passed to gmm-est-weights-ebw as --weight-tau
acwt=0.1       # acoustic scale passed to lattice-to-post
stage=0
# End configuration section

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

# Exactly five positional arguments are required; otherwise print usage.
# The defaults quoted in the usage text must match the configuration section
# above (the --tau default is 400, not 200).
if [ $# -ne 5 ]; then
  echo "Usage: steps/train_mmi.sh <data> <lang> <ali> <denlats> <exp>"
  echo " e.g.: steps/train_mmi.sh data/train_si84 data/lang exp/tri2b_ali_si84 exp/tri2b_denlats_si84 exp/tri2b_mmi"
  echo "Main options (for others, see top of script file)"
  echo "  --boost <boost-weight>                           # (e.g. 0.1), for boosted MMI.  (default 0)"
  echo "  --cancel (true|false)                            # cancel stats (true by default)"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --config <config-file>                           # config containing options"
  echo "  --stage <stage>                                  # stage to do partial re-run from."
  echo "  --tau                                            # tau for i-smooth to last iter (default 400)"
  
  exit 1;
fi

# Positional arguments.
data=$1
lang=$2
alidir=$3
denlatdir=$4
dir=$5
mkdir -p $dir/log

utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1
cp $lang/phones.txt $dir || exit 1

# Every one of these inputs must already exist.
for f in $data/feats.scp $alidir/tree $alidir/final.mdl $alidir/ali.1.gz \
    $denlatdir/lat.1.gz; do
  if [ ! -f $f ]; then
    echo "$0: no such file $f" && exit 1
  fi
done

# The alignments and the denominator lattices must use the same job split.
nj=$(cat $alidir/num_jobs) || exit 1
den_nj=$(cat $denlatdir/num_jobs)
if [ "$nj" -ne "$den_nj" ]; then
  echo "$alidir and $denlatdir have different num-jobs" && exit 1
fi

sdata=$data/split$nj
# Feature options are read from the alignment directory (empty if absent).
splice_opts=$(cat $alidir/splice_opts 2>/dev/null)
cmvn_opts=$(cat $alidir/cmvn_opts 2>/dev/null)
delta_opts=$(cat $alidir/delta_opts 2>/dev/null)
mkdir -p $dir/log
# Copy whichever option files exist (cmvn_opts is the cmn/cmvn option).
for optfile in splice_opts cmvn_opts delta_opts; do
  cp $alidir/$optfile $dir 2>/dev/null
done
# Split the data unless an existing split directory is newer than feats.scp.
if ! [[ -d $sdata && $data/feats.scp -ot $sdata ]]; then
  split_data.sh $data $nj || exit 1
fi
echo $nj > $dir/num_jobs

# The input model becomes iteration 0 of this directory.
cp $alidir/tree $dir
cp $alidir/final.mdl $dir/0.mdl

silphonelist=$(cat $lang/phones/silence.csl) || exit 1

# Set up features

# The feature type follows the alignment directory: "lda" when it contains
# final.mat, "delta" otherwise.
feat_type=delta
if [ -f $alidir/final.mat ]; then feat_type=lda; fi
echo "$0: feature type is $feat_type"

# Both feature types start from the same CMVN-normalized features.
cmvn_feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"
case $feat_type in
  delta)
    feats="$cmvn_feats add-deltas $delta_opts ark:- ark:- |"
    ;;
  lda)
    feats="$cmvn_feats splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
    cp $alidir/final.mat $dir
    ;;
  *)
    echo "Invalid feature type $feat_type" && exit 1
    ;;
esac

# Apply the per-speaker transforms from the alignment directory, if any.
if [ -f $alidir/trans.1 ]; then
  echo Using transforms from $alidir && \
    feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$alidir/trans.JOB ark:- ark:- |"
fi

# Denominator lattices; for a non-zero boost they are piped through
# lattice-boost-ali.
lats="ark:gunzip -c $denlatdir/lat.JOB.gz|"
case "$boost" in
  0.0|0)
    ;;
  *)
    lats="$lats lattice-boost-ali --b=$boost --silence-phones=$silphonelist $alidir/final.mdl ark:- 'ark,s,cs:gunzip -c $alidir/ali.JOB.gz|' ark:- |"
    ;;
esac


# Extended Baum-Welch iterations.  Iteration x reads $dir/$x.mdl and writes
# $dir/$[x+1].mdl.  The accumulation/update work of an iteration is skipped
# when --stage is greater than x; the diagnostics after it still run and read
# whatever logs exist for that iteration.
x=0
while [ $x -lt $num_iters ]; do
  echo "Iteration $x of MMI training"
  # Note: the num and den stats are accumulated at the same time, so we
  # can cancel them per frame.
  if [ $stage -le $x ]; then
    # Rescore the (possibly boosted) lattices with the current model, turn
    # them into posteriors, combine with the alignment posteriors (lattice
    # posteriors scaled by -1), and accumulate num and den stats per job.
    $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
      gmm-rescore-lattice $dir/$x.mdl "$lats" "$feats" ark:- \| \
      lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
      sum-post --drop-frames=$drop_frames --merge=$cancel --scale1=-1 \
      ark:- "ark,s,cs:gunzip -c $alidir/ali.JOB.gz | ali-to-post ark:- ark:- |" ark:- \| \
      gmm-acc-stats2 $dir/$x.mdl "$feats" ark,s,cs:- \
      $dir/num_acc.$x.JOB.acc $dir/den_acc.$x.JOB.acc || exit 1;

    # Expect one num and one den accumulator per job, then sum each set
    # and delete the per-job files.
    n=`echo $dir/{num,den}_acc.$x.*.acc | wc -w`;
    [ "$n" -ne $[$nj*2] ] && \
      echo "Wrong number of MMI accumulators $n versus 2*$nj" && exit 1;
    $cmd $dir/log/den_acc_sum.$x.log \
      gmm-sum-accs $dir/den_acc.$x.acc $dir/den_acc.$x.*.acc || exit 1;
    rm $dir/den_acc.$x.*.acc
    $cmd $dir/log/num_acc_sum.$x.log \
      gmm-sum-accs $dir/num_acc.$x.acc $dir/num_acc.$x.*.acc || exit 1;
    rm $dir/num_acc.$x.*.acc

  # note: this tau value is for smoothing towards model parameters, not
  # as in the Boosted MMI paper, not towards the ML stats as in the earlier
  # work on discriminative training (e.g. my thesis).  
  # You could use gmm-ismooth-stats to smooth to the ML stats, if you had
  # them available [here they're not available if cancel=true].

    # Update the Gaussians, then the mixture weights, from the summed stats.
    $cmd $dir/log/update.$x.log \
      gmm-est-gaussians-ebw --tau=$tau $dir/$x.mdl $dir/num_acc.$x.acc $dir/den_acc.$x.acc - \| \
      gmm-est-weights-ebw --weight-tau=$weight_tau - $dir/num_acc.$x.acc $dir/den_acc.$x.acc $dir/$[$x+1].mdl || exit 1;
    rm $dir/{den,num}_acc.$x.acc
  fi

  # Some diagnostics: the objective function progress and auxiliary-function
  # improvement.
  # The Perl one-liner scans the last 50 lines of each acc log for the summary
  # lines printed by gmm-acc-stats2 and lattice-to-post, warns on stderr if the
  # two frame counts differ by more than 1% of their sum, and prints
  # "<per-frame objf> <frame count>" to $dir/tmpf.

  tail -n 50 $dir/log/acc.$x.*.log | perl -e '$acwt=shift @ARGV; while(<STDIN>) { if(m/gmm-acc-stats2.+Overall weighted acoustic likelihood per frame was (\S+) over (\S+) frames/) { $tot_aclike += $1*$2; $tot_frames1 += $2; } if(m|lattice-to-post.+Overall average log-like/frame is (\S+) over (\S+) frames.  Average acoustic like/frame is (\S+)|) { $tot_den_lat_like += $1*$2; $tot_frames2 += $2; $tot_den_aclike += $3*$2; } } if (abs($tot_frames1 - $tot_frames2) > 0.01*($tot_frames1 + $tot_frames2)) { print STDERR "Frame-counts disagree $tot_frames1 versus $tot_frames2\n"; } $tot_den_lat_like /= $tot_frames2; $tot_den_aclike /= $tot_frames2; $tot_aclike *= ($acwt / $tot_frames1);  $num_like = $tot_aclike + $tot_den_aclike; $per_frame_objf = $num_like - $tot_den_lat_like; print "$per_frame_objf $tot_frames1\n"; ' $acwt > $dir/tmpf
  objf=`cat $dir/tmpf | awk '{print $1}'`;
  nf=`cat $dir/tmpf | awk '{print $2}'`;
  rm $dir/tmpf
  # Sum field 10 times field 12 over the "Overall" lines of the update log.
  impr=`grep -w Overall $dir/log/update.$x.log | awk '{x += $10*$12;} END{print x;}'`
  impr=`perl -e "print ($impr*$acwt/$nf);"` # We multiply by acwt, and divide by $nf which is the "real" number of frames.
  echo "Iteration $x: objf was $objf, MMI auxf change was $impr" | tee $dir/objf.$x.log
  x=$[$x+1]
done

echo "MMI training finished"

# After the loop x equals num_iters, so final.mdl points at the last model.
rm $dir/final.mdl 2>/dev/null
ln -s $x.mdl $dir/final.mdl

exit 0;


================================================
FILE: egs/steps/train_mmi_fmmi.sh
================================================
#!/usr/bin/env bash
# by Johns Hopkins University (Author: Daniel Povey), 2012.  Apache 2.0.

# This script does MMI discriminative training, including
# feature-space (like fMPE) and model-space components. 
# If you give the --boost option it does "boosted MMI" (BMMI).
# The order of the iterations is given by the --schedule option.  By
# default we do 8 iterations in total: 4 of f(B)MMI (feature-space)
# followed by 4 of (B)MMI (model-space).


# Begin configuration section.
# Each variable below can be overridden on the command line as --<name> <value>
# (with '_' written as '-'); see parse_options.sh.
cmd=run.pl
schedule="fmmi fmmi fmmi fmmi mmi mmi mmi mmi" # one word per iteration: fmmi or mmi.
boost=0.0
learning_rate=0.01
tau=400 # For model.  Note: we're doing smoothing "to the previous iteration",
    # so --smooth-from-model so 400 seems like a more sensible default
    # than 100.  We smooth to the previous iteration because now
    # we are discriminatively training the features (and not using
    # the indirect differential), so it seems like it wouldn't make 
    # sense to use any element of ML.
weight_tau=10 # for model weights.
cancel=true # if true, cancel num and den counts as described in 
     # the boosted MMI paper. 
drop_frames=false # if true, ignore stats from frames where num + den
                       # have no overlap. 
indirect=true # if true, use indirect derivative.
acwt=0.1
stage=-1
ngselect=2; # Just the 2 top Gaussians.  Beyond that, adding more Gaussians
            # wouldn't make much difference since the posteriors would be very small.
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh;
. parse_options.sh || exit 1;


# Exactly six positional arguments are required; otherwise print usage.
# The defaults quoted in the usage text must match the configuration section
# above (tau=400 and the 4x fmmi + 4x mmi schedule).
if [ $# != 6 ]; then
  echo "Usage: steps/train_mmi_fmmi.sh <data> <lang> <ali-dir> <diag-ubm-dir> <denlat-dir> <exp-dir>"
  echo " e.g.: steps/train_mmi_fmmi.sh data/train_si84 data/lang exp/tri2b_ali_si84 exp/ubm2d exp/tri2b_denlats_si84 exp/tri2b_fmmi"
  echo "Main options (for others, see top of script file)"
  echo "  --boost <boost-weight>                           # (e.g. 0.1) ... boosted MMI."
  echo "  --cancel (true|false)                            # cancel stats (true by default)"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --config <config-file>                           # config containing options"
  echo "  --stage <stage>                                  # stage to do partial re-run from."
  echo "  --tau                                            # tau for i-smooth to last iter (default 400)"
  echo "  --learning-rate                                  # learning rate for fMMI, default 0.01"
  echo "  --schedule                                       # learning schedule: by default,"
  echo "                                                   # \"fmmi fmmi fmmi fmmi mmi mmi mmi mmi\""
  exit 1;
fi


# Positional arguments.
data=$1
lang=$2
alidir=$3
dubmdir=$4  # where diagonal UBM is.
denlatdir=$5
dir=$6

silphonelist=$(cat $lang/phones/silence.csl)
mkdir -p $dir/log

utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1
cp $lang/phones.txt $dir || exit 1

# Every one of these inputs must already exist.
for f in $data/feats.scp $lang/phones.txt $dubmdir/final.dubm \
    $alidir/final.mdl $alidir/ali.1.gz $denlatdir/lat.1.gz; do
  if [ ! -f $f ]; then
    echo "Expected file $f to exist" && exit 1
  fi
done
cp $alidir/final.mdl $alidir/tree $dir || exit 1

# The alignments and the denominator lattices must use the same job split.
nj=$(cat $alidir/num_jobs) || exit 1
den_nj=$(cat $denlatdir/num_jobs)
if [ "$nj" -ne "$den_nj" ]; then
  echo "$alidir and $denlatdir have different num-jobs" && exit 1
fi
sdata=$data/split$nj
# Feature options are read from the alignment directory (empty if absent).
splice_opts=$(cat $alidir/splice_opts 2>/dev/null) # frame-splicing options.
cmvn_opts=$(cat $alidir/cmvn_opts 2>/dev/null)
delta_opts=$(cat $alidir/delta_opts 2>/dev/null)
mkdir -p $dir/log
# Copy whichever option files exist (frame-splicing, cmn/cmvn and delta
# options); errors for absent files are discarded.
for optfile in splice_opts cmvn_opts delta_opts; do
  cp $alidir/$optfile $dir 2>/dev/null
done
# Split the data unless an existing split directory is newer than feats.scp.
if ! [[ -d $sdata && $data/feats.scp -ot $sdata ]]; then
  split_data.sh $data $nj || exit 1
fi


# The feature type follows the alignment directory: "lda" when it contains
# final.mat, "delta" otherwise.
feat_type=delta
if [ -f $alidir/final.mat ]; then feat_type=lda; fi
echo "$0: feature type is $feat_type"

# Note: $feats is the features before fMPE.  Both feature types start from
# the same CMVN-normalized features.
cmvn_feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"
case $feat_type in
  delta)
    feats="$cmvn_feats add-deltas $delta_opts ark:- ark:- |"
    ;;
  lda)
    feats="$cmvn_feats splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
    cp $alidir/final.mat $dir
    ;;
  *)
    echo "Invalid feature type $feat_type" && exit 1
    ;;
esac

# Apply the per-speaker transforms from the alignment directory, if any.
if [ -f $alidir/trans.1 ]; then
  echo Using transforms from $alidir && \
    feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$alidir/trans.JOB ark:- ark:- |"
fi

# Denominator lattices; for a non-zero boost they are piped through
# lattice-boost-ali.
lats="ark:gunzip -c $denlatdir/lat.JOB.gz|"
case "$boost" in
  0.0|0)
    ;;
  *)
    lats="$lats lattice-boost-ali --b=$boost --silence-phones=$silphonelist $alidir/final.mdl ark:- 'ark,s,cs:gunzip -c $alidir/ali.JOB.gz|' ark:- |"
    ;;
esac

# At first, the features "after fMPE" are the same as the base features.
fmpefeats="$feats"

# Initialize the fMPE object.  Note: we call it .fmpe because
# that's what it was called in the original paper, but since
# we're using the MMI objective function, it's really fMMI.
fmpe-init $dubmdir/final.dubm $dir/0.fmpe 2>$dir/log/fmpe_init.log || exit 1

if [ $stage -le -1 ]; then
  # Get the gselect (Gaussian selection) info for fMPE.
  # Note: fMPE object starts with GMM object, so can be read
  # as one.
  $cmd JOB=1:$nj $dir/log/gselect.JOB.log \
    gmm-gselect --n=$ngselect $dir/0.fmpe "$feats" \
      "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1
fi

# The input model becomes iteration 0 of this directory.
cp $alidir/final.mdl $dir/0.mdl

# Main loop: one iteration per word of $schedule.  Iteration x reads
# $dir/$x.mdl and $dir/$x.fmpe and leaves $dir/$[x+1].mdl and
# $dir/$[x+1].fmpe behind; whichever of the two is not trained on this
# iteration is simply a symlink to the previous one.
x=0
num_iters=`echo $schedule | wc -w`

# Reject a malformed --schedule before doing any work.  Otherwise a typo such
# as "fmi" would only print a message mid-run and leave the chain of
# $x.mdl/$x.fmpe files (and hence final.mdl/final.fmpe) broken.
for iter_type in $schedule; do
  case $iter_type in
    fmmi|mmi) ;;
    *) echo "Invalid --schedule option: expected only mmi or fmmi."; exit 1;;
  esac
done

while [ $x -lt $num_iters ]; do
  iter_type=`echo $schedule | cut -d ' ' -f $[$x+1]`
  case $iter_type in 
    fmmi)
    echo "Iteration $x: doing fMMI"
    if [ $stage -le $x ]; then
      numpost="ark,s,cs:gunzip -c $alidir/ali.JOB.gz| ali-to-post ark:- ark:-|"
        # Note: the command gmm-fmpe-acc-stats below requires the pre-fMPE features.
      $cmd JOB=1:$nj $dir/log/acc_fmmi.$x.JOB.log \
        gmm-rescore-lattice $dir/$x.mdl "$lats" "$fmpefeats" ark:- \| \
        lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
        sum-post --drop-frames=$drop_frames --scale1=-1 ark:- "$numpost" ark:- \| \
        gmm-fmpe-acc-stats $dir/$x.mdl $dir/$x.fmpe "$feats" \
        "ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" ark,s,cs:- \
        $dir/$x.JOB.fmpe_acc || exit 1;
      
      # Sum the per-job stats, remove them, and estimate the next fMPE object.
      ( fmpe-sum-accs $dir/$x.fmpe_acc $dir/$x.*.fmpe_acc && \
        rm $dir/$x.*.fmpe_acc && \
        fmpe-est --learning-rate=$learning_rate $dir/$x.fmpe $dir/$x.fmpe_acc $dir/$[$x+1].fmpe ) \
        2>$dir/log/est_fmpe.$x.log || exit 1;
    fi
    # We need to set the features to use the correct fMPE object.
    fmpefeats="$feats fmpe-apply-transform $dir/$[$x+1].fmpe ark:- 'ark,s,cs:gunzip -c $dir/gselect.JOB.gz|' ark:- |" 
    rm $dir/$[x+1].mdl 2>/dev/null; ln -s $x.mdl $dir/$[$x+1].mdl # link previous model.
    # Now, diagnostics.
    objf_nf=`grep Overall $dir/log/acc_fmmi.$x.*.log | grep gmm-fmpe-acc-stats | awk '{ p+=$10*$12; nf+=$12; } END{print p/nf, nf;}'`
    objf=`echo $objf_nf | awk '{print $1}'`;
    nf=`echo $objf_nf | awk '{print $2}'`;
    impr=`grep Objf $dir/log/est_fmpe.$x.log | awk '{print $NF}'`
    impr=`perl -e "print ($impr/$nf);"` # normalize by #frames.
    echo On iter $x, objf was $objf, auxf improvement from fMMI was $impr | tee $dir/objf.$x.log
    ;;
    mmi) # MMI iteration.
    echo "Iteration $x: doing MMI (getting stats)..."
    # Get denominator stats...  For simplicity we rescore the lattice
    # on all iterations, even though it shouldn't be necessary on the zeroth
    # (but we want this script to work even if $alidir doesn't contain the
    # model used to generate the lattice).
    if [ $stage -le $x ]; then
      $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
        gmm-rescore-lattice $dir/$x.mdl "$lats" "$fmpefeats" ark:- \| \
        lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
        sum-post --drop-frames=$drop_frames --merge=$cancel --scale1=-1 \
        ark:- "ark,s,cs:gunzip -c $alidir/ali.JOB.gz | ali-to-post ark:- ark:- |" ark:- \| \
        gmm-acc-stats2 $dir/$x.mdl "$fmpefeats" ark,s,cs:- \
        $dir/num_acc.$x.JOB.acc $dir/den_acc.$x.JOB.acc || exit 1;

      # Expect one num and one den accumulator per job, then sum each set
      # and delete the per-job files.
      n=`echo $dir/{num,den}_acc.$x.*.acc | wc -w`;
      [ "$n" -ne $[$nj*2] ] && \
        echo "Wrong number of MMI accumulators $n versus 2*$nj" && exit 1;
      $cmd $dir/log/den_acc_sum.$x.log \
        gmm-sum-accs $dir/den_acc.$x.acc $dir/den_acc.$x.*.acc || exit 1;
      rm $dir/den_acc.$x.*.acc
      $cmd $dir/log/num_acc_sum.$x.log \
        gmm-sum-accs $dir/num_acc.$x.acc $dir/num_acc.$x.*.acc || exit 1;
      rm $dir/num_acc.$x.*.acc

      # note: this tau value is for smoothing to model parameters;
      # you need to use gmm-ismooth-stats to smooth to the ML stats,
      # but anyway this script does canceling of num and den stats on
      # each frame (as suggested in the Boosted MMI paper) which would
      # make smoothing to ML impossible without accumulating extra stats.
      $cmd $dir/log/update.$x.log \
        gmm-est-gaussians-ebw --tau=$tau $dir/$x.mdl $dir/num_acc.$x.acc $dir/den_acc.$x.acc - \| \
        gmm-est-weights-ebw --weight-tau=$weight_tau - $dir/num_acc.$x.acc $dir/den_acc.$x.acc $dir/$[$x+1].mdl || exit 1;
    else 
      echo "not doing this iteration because --stage=$stage"
    fi
  
    # Some diagnostics.. note, this objf is somewhat comparable to the
    # MMI objective function divided by the acoustic weight, and differences in it
    # are comparable to the auxf improvement printed by the update program.
    objf_nf=`grep Overall $dir/log/acc.$x.*.log | grep gmm-acc-stats2 | awk '{ p+=$10*$12; nf+=$12; } END{print p/nf, nf;}'`
    objf=`echo $objf_nf | awk '{print $1}'`;
    nf=`echo $objf_nf | awk '{print $2}'`;
    impr=`grep -w Overall $dir/log/update.$x.log | awk '{x += $10*$12;} END{print x;}'`
    impr=`perl -e "print ($impr/$nf);"` # renormalize by "real" #frames, to correct
    # for the canceling of stats.
    echo On iter $x, objf was $objf, auxf improvement was $impr | tee $dir/objf.$x.log
    rm $dir/$[x+1].fmpe 2>/dev/null; ln -s $x.fmpe $dir/$[$x+1].fmpe # link previous fMPE transform
    ;;
    # Unreachable after the validation above; kept as a safety net, and now
    # fatal instead of silently moving on to the next iteration.
    *) echo "Invalid --schedule option: expected only mmi or fmmi."; exit 1;;
  esac
  x=$[$x+1]
done

echo "Succeeded with $num_iters iterations of MMI+fMMI training (boosting factor = $boost)"

# Point final.mdl / final.fmpe at the outputs of the last iteration.
rm $dir/final.mdl 2>/dev/null; ln -s $num_iters.mdl $dir/final.mdl
rm $dir/final.fmpe 2>/dev/null; ln -s $num_iters.fmpe $dir/final.fmpe 

# Now do some cleanup.
rm $dir/gselect.*.gz $dir/*.acc $dir/*.fmpe_acc
exit 0;


================================================
FILE: egs/steps/train_mmi_fmmi_indirect.sh
================================================
#!/usr/bin/env bash
# by Johns Hopkins University (Author: Daniel Povey), 2012.  Apache 2.0.

# This script does MMI discriminative training, including
# feature-space (like fMPE) and model-space components. 
# If you give the --boost option it does "boosted MMI" (BMMI).
# On the iterations of training it alternates feature-space
# and model-space training.  We do 8 iterations in total--
# 4 of each type ((B)MMI, f(B)MMI)


# Begin configuration section.
# Each variable below can be overridden on the command line as --<name> <value>
# (with '_' written as '-'); see parse_options.sh.
cmd=run.pl
schedule="fmmi mmi fmmi mmi fmmi mmi fmmi mmi" # one word per iteration: fmmi or mmi.
boost=0.0
learning_rate=0.02
tau=200 # For model.  Note: we're doing smoothing "to the previous iteration",
    # so --smooth-from-model so 200 seems like a more sensible default
    # than 100.  We smooth to the previous iteration because now
    # we are discriminatively training the features (and not using
    # the indirect differential), so it seems like it wouldn't make 
    # sense to use any element of ML.
cancel=true # if true, cancel num and den counts as described in 
     # the boosted MMI paper. 
drop_frames=false # if true, ignore stats from frames where num + den
                       # have no overlap. 
acwt=0.1
stage=-1
ngselect=2; # Just the 2 top Gaussians.  Beyond that, adding more Gaussians
            # wouldn't make much difference since the posteriors would be very small.
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh;
. parse_options.sh || exit 1;


# Exactly six positional arguments are required; otherwise print usage.
# The usage text names this script (not train_mmi_fmmi.sh) and quotes the
# defaults from the configuration section above (learning_rate=0.02).
if [ $# != 6 ]; then
  echo "Usage: steps/train_mmi_fmmi_indirect.sh <data> <lang> <ali-dir> <diag-ubm-dir> <denlat-dir> <exp-dir>"
  echo " e.g.: steps/train_mmi_fmmi_indirect.sh data/train_si84 data/lang exp/tri2b_ali_si84 exp/ubm2d exp/tri2b_denlats_si84 exp/tri2b_fmmi"
  echo "Main options (for others, see top of script file)"
  echo "  --boost <boost-weight>                           # (e.g. 0.1) ... boosted MMI."
  echo "  --cancel (true|false)                            # cancel stats (true by default)"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --config <config-file>                           # config containing options"
  echo "  --stage <stage>                                  # stage to do partial re-run from."
  echo "  --tau                                            # tau for i-smooth to last iter (default 200)"
  echo "  --learning-rate                                  # learning rate for fMMI, default 0.02"
  echo "  --schedule                                       # learning schedule: by default,"
  echo "                                                   # \"fmmi mmi fmmi mmi fmmi mmi fmmi mmi\""
  exit 1;
fi


# Positional arguments (six are required by the usage check earlier in the script).
data=$1
lang=$2
alidir=$3
dubmdir=$4  # where diagonal UBM is.
denlatdir=$5
dir=$6

# Colon-separated silence phone list; only used when boosting lattices below.
silphonelist=`cat $lang/phones/silence.csl`
mkdir -p $dir/log

# The phone tables of the lang dir, the alignment dir and the UBM dir must agree.
utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
utils/lang/check_phones_compatible.sh $lang/phones.txt $dubmdir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

# Fail early if any required input is missing (only job 1 of the alignments
# and lattices is checked).
for f in $data/feats.scp $lang/phones.txt $dubmdir/final.dubm $alidir/final.mdl \
  $alidir/ali.1.gz $denlatdir/lat.1.gz; do
  [ ! -f $f ] && echo "Expected file $f to exist" && exit 1;
done
cp $alidir/final.mdl $alidir/tree $dir || exit 1;
# The alignments and the denominator lattices must have been produced with the
# same number of jobs, because both are indexed by JOB below.
nj=`cat $alidir/num_jobs` || exit 1;
[ "$nj" -ne "`cat $denlatdir/num_jobs`" ] && \
  echo "$alidir and $denlatdir have different num-jobs" && exit 1;
sdata=$data/split$nj
# Feature options are inherited from the alignment directory; each file is
# optional (errors are discarded), in which case the variable is empty.
splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options.
cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null`
delta_opts=`cat $alidir/delta_opts 2>/dev/null`
mkdir -p $dir/log
cp $alidir/splice_opts $dir 2>/dev/null # frame-splicing options.
cp $alidir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option.
cp $alidir/delta_opts $dir 2>/dev/null
# Split the data unless a split directory newer than feats.scp already exists.
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;


# The presence of final.mat in the alignment dir selects spliced+transformed
# ("lda") features; otherwise delta features are used.
if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"

# Note: $feats is the features before fMPE.
# The literal string JOB is left in place; it is substituted per job when the
# command is launched through $cmd JOB=1:$nj.
case $feat_type in
  delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";;
  lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
    cp $alidir/final.mat $dir    
    ;;
  *) echo "Invalid feature type $feat_type" && exit 1;
esac

# If per-job transforms (trans.JOB) exist in the alignment dir, append them to
# the feature pipeline.
[ -f $alidir/trans.1 ] && echo Using transforms from $alidir && \
  feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$alidir/trans.JOB ark:- ark:- |"

# Denominator lattices; when a non-zero boost is requested they are piped
# through lattice-boost-ali together with the numerator alignments.
lats="ark:gunzip -c $denlatdir/lat.JOB.gz|"
if [[ "$boost" != "0.0" && "$boost" != 0 ]]; then
  lats="$lats lattice-boost-ali --b=$boost --silence-phones=$silphonelist $alidir/final.mdl ark:- 'ark,s,cs:gunzip -c $alidir/ali.JOB.gz|' ark:- |"
fi


fmpefeats="$feats" # At first, the features "after fMPE" are the same as the 
                   # base features.


# Initialize the fMPE object.  Note: we call it .fmpe because
# that's what it was called in the original paper, but since
# we're using the MMI objective function, it's really fMMI.
# NOTE(review): this runs regardless of --stage, so 0.fmpe is rewritten on
# every invocation, including partial re-runs.

fmpe-init $dubmdir/final.dubm $dir/0.fmpe 2>$dir/log/fmpe_init.log || exit 1;


if [ $stage -le -1 ]; then
  # Get the gselect (Gaussian selection) info for fMPE.
  # Note: fMPE object starts with GMM object, so can be read
  # as one.
  $cmd JOB=1:$nj $dir/log/gselect.JOB.log \
    gmm-gselect --n=$ngselect $dir/0.fmpe "$feats" \
    "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1;
fi

# Iteration 0 starts from the model of the alignment directory.
cp $alidir/final.mdl $dir/0.mdl

# Main training loop.  Iteration x reads $dir/$x.mdl and $dir/$x.fmpe and
# writes $dir/$[x+1].mdl and $dir/$[x+1].fmpe; the type of each iteration is
# the (x+1)-th word of $schedule.
x=0
num_iters=`echo $schedule | wc -w`

while [ $x -lt $num_iters ]; do
  iter_type=`echo $schedule | cut -d ' ' -f $[$x+1]`
  # Stats canceling is only applied on model-space (mmi) iterations; see the
  # note in the fMMI branch below for why it is disabled for fmmi.
  case $iter_type in 
    fmmi) fmmi_iter=true; local_cancel=false;;
    mmi) fmmi_iter=false; local_cancel=$cancel;;
    *) echo "Bad iteration type $iter_type"; exit 1;;
  esac

  echo "Getting MMI stats (needed for fMMI and MMI iterations).";
  if [ $stage -le $x ]; then
    # Rescore the denominator lattices with the current model and features,
    # convert them to posteriors, combine them (negated, --scale1=-1) with the
    # numerator posteriors from the alignments, and accumulate num/den stats.
    $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
      gmm-rescore-lattice $dir/$x.mdl "$lats" "$fmpefeats" ark:- \| \
      lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
      sum-post --merge=$local_cancel --scale1=-1 --drop-frames=$drop_frames \
      ark:- "ark,s,cs:gunzip -c $alidir/ali.JOB.gz | ali-to-post ark:- ark:- |" ark:- \| \
      gmm-acc-stats2 $dir/$x.mdl "$fmpefeats" ark,s,cs:- \
      $dir/num_acc.$x.JOB.acc $dir/den_acc.$x.JOB.acc || exit 1;
    # Sanity check: one num and one den accumulator per job.
    n=`echo $dir/{num,den}_acc.$x.*.acc | wc -w`;
    [ "$n" -ne $[$nj*2] ] && \
      echo "Wrong number of MMI accumulators $n versus 2*$nj" && exit 1;
    # The two sums run in the background in parallel; a failure is signalled
    # through the marker file $dir/.error, which is checked after "wait".
    rm $dir/.error 2>/dev/null
    $cmd $dir/log/den_acc_sum.$x.log \
      gmm-sum-accs $dir/den_acc.$x.acc $dir/den_acc.$x.*.acc || touch $dir/.error &
    $cmd $dir/log/num_acc_sum.$x.log \
      gmm-sum-accs $dir/num_acc.$x.acc $dir/num_acc.$x.*.acc || touch $dir/.error &
    wait
    [ -f $dir/.error ] && echo "Error summing accs" && exit 1;
    rm $dir/den_acc.$x.*.acc
    rm $dir/num_acc.$x.*.acc
  fi

  if $fmmi_iter; then
    echo "Iteration $x: doing fMMI"
    if [ $stage -le $x ]; then
      # Get model derivative.  Note: the "ml accumulator" is the same as the "numerator"
      # since this is MMI.  We avoided doing the "canceling of stats" on this iteration
      # so that this would be true (this canceling wouldn't affect the derivative anyway,
      # so can have no benefit for fMMI, unlike MMI).
      # Abort on failure: the derivative file is a required input of the
      # gmm-fmpe-acc-stats jobs that follow.
      $cmd $dir/log/get_stats_deriv.$x.log \
        gmm-get-stats-deriv $dir/$x.mdl $dir/num_acc.$x.acc $dir/den_acc.$x.acc \
        $dir/num_acc.$x.acc $dir/model_deriv.$x.gmmacc || exit 1;
      numpost="ark,s,cs:gunzip -c $alidir/ali.JOB.gz| ali-to-post ark:- ark:-|"
        # Note: the command gmm-fmpe-acc-stats below requires the pre-fMPE features.
      $cmd JOB=1:$nj $dir/log/acc_fmmi.$x.JOB.log \
        gmm-rescore-lattice $dir/$x.mdl "$lats" "$fmpefeats" ark:- \| \
        lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
        sum-post --drop-frames=$drop_frames --merge=false --scale1=-1 \
          ark:- "$numpost" ark:- \| \
        gmm-fmpe-acc-stats --model-derivative=$dir/model_deriv.$x.gmmacc \
          $dir/$x.mdl $dir/$x.fmpe "$feats" \
         "ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" ark,s,cs:-  \
         $dir/$x.JOB.fmpe_acc || exit 1;
      
      # Sum the per-job fMPE stats, drop the per-job files, and estimate the
      # next fMPE transform; all three share one log file.
      ( fmpe-sum-accs $dir/$x.fmpe_acc $dir/$x.*.fmpe_acc && \
        rm $dir/$x.*.fmpe_acc && \
        fmpe-est --learning-rate=$learning_rate $dir/$x.fmpe $dir/$x.fmpe_acc $dir/$[$x+1].fmpe ) \
        2>$dir/log/est_fmpe.$x.log || exit 1;

      fmpefeats="$feats fmpe-apply-transform $dir/$[$x+1].fmpe ark:- 'ark,s,cs:gunzip -c $dir/gselect.JOB.gz|' ark:- |" 
      # OK, now we do one iteration of the "rescaling update" where we use the
      # old and new ML accs, and we shift and rescale the model to match the new
      # features.
      $cmd JOB=1:$nj $dir/log/acc_ml.$x.JOB.log \
        gmm-acc-stats-ali $dir/$x.mdl "$fmpefeats" "ark:gunzip -c $alidir/ali.JOB.gz|" \
          $dir/new_ml_acc.$x.JOB.acc || exit 1;
      $cmd $dir/log/new_ml_acc_sum.$x.log \
        gmm-sum-accs $dir/new_ml_acc.$x.acc $dir/new_ml_acc.$x.*.acc || exit 1;
      $cmd $dir/log/update_rescale.$x.log \
        gmm-est-rescale $dir/$x.mdl $dir/num_acc.$x.acc $dir/new_ml_acc.$x.acc \
        $dir/$[$x+1].mdl || exit 1;
    fi
    # We need to set the features to use the correct fMPE object.
    # This is a repeat of a command above-- in case we didn't do this stage.
    fmpefeats="$feats fmpe-apply-transform $dir/$[$x+1].fmpe ark:- 'ark,s,cs:gunzip -c $dir/gselect.JOB.gz|' ark:- |" 
    # Now, diagnostics.  These are parsed from the logs, so they are also
    # reported when the iteration itself was skipped because of --stage.
    objf_nf=`grep Overall $dir/log/acc_fmmi.$x.*.log | grep gmm-fmpe-acc-stats | awk '{ p+=$10*$12; nf+=$12; } END{print p/nf, nf;}'`
    objf=`echo $objf_nf | awk '{print $1}'`;
    nf=`echo $objf_nf | awk '{print $2}'`;
    impr=`grep Objf $dir/log/est_fmpe.$x.log | awk '{print $NF}'`
    impr=`perl -e "print ($impr/$nf);"` # normalize by #frames.
    echo On iter $x, objf was $objf, auxf improvement from fMMI was $impr | tee $dir/objf.$x.log
  else # MMI iteration-- on this iteration do model-space update.
    echo "Iteration $x: doing MMI update"
      # note: this tau value is for smoothing to model parameters;
      # you need to use gmm-ismooth-stats to smooth to the ML stats,
      # but anyway this script does canceling of num and den stats on
      # each frame (as suggested in the Boosted MMI paper) which would
      # make smoothing to ML impossible without accumulating extra stats.
    if [ $stage -le $x ]; then
      $cmd $dir/log/update.$x.log \
        gmm-est-gaussians-ebw --tau=$tau $dir/$x.mdl $dir/num_acc.$x.acc $dir/den_acc.$x.acc - \| \
        gmm-est-weights-ebw - $dir/num_acc.$x.acc $dir/den_acc.$x.acc $dir/$[$x+1].mdl || exit 1;
    else 
      echo "not doing this iteration because --stage=$stage"
    fi
    
    # Some diagnostics.. note, this objf is somewhat comparable to the
    # MMI objective function divided by the acoustic weight, and differences in it
    # are comparable to the auxf improvement printed by the update program.
    objf_nf=`grep Overall $dir/log/acc.$x.*.log | grep gmm-acc-stats2 | awk '{ p+=$10*$12; nf+=$12; } END{print p/nf, nf;}'`
    objf=`echo $objf_nf | awk '{print $1}'`;
    nf=`echo $objf_nf | awk '{print $2}'`;
    impr=`grep Overall $dir/log/update.$x.log | head -1 | awk '{print $10*$12;}'`
    impr=`perl -e "print ($impr/$nf);"` # renormalize by "real" #frames, to correct
    # for the canceling of stats.
    echo On iter $x, objf was $objf, auxf improvement was $impr | tee $dir/objf.$x.log
    # The feature transform is unchanged on an MMI iteration, so the next
    # iteration's fMPE object is a symlink to the current one.
    rm $dir/$[$x+1].fmpe 2>/dev/null; ln -s $x.fmpe $dir/$[$x+1].fmpe # link previous fMPE transform
  fi
  x=$[$x+1]
done

echo "Succeeded with $num_iters iterations of MMI+fMMI training (boosting factor = $boost)"

# Publish the last model and transform under their conventional names.
rm $dir/final.mdl 2>/dev/null; ln -s $num_iters.mdl $dir/final.mdl
rm $dir/final.fmpe 2>/dev/null; ln -s $num_iters.fmpe $dir/final.fmpe 

# Now do some cleanup.
rm $dir/gselect.*.gz $dir/*.acc $dir/*.fmpe_acc
exit 0;


================================================
FILE: egs/steps/train_mmi_sgmm2.sh
================================================
#!/usr/bin/env bash
# Copyright 2012-2013  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.

# MMI training (or optionally boosted MMI, if you give the --boost option),
# for SGMMs.  4 iterations (by default) of Extended Baum-Welch update.
#
# Begin configuration section.
# Defaults for the command-line options; parse_options.sh overrides them.
cmd=run.pl
num_iters=4
boost=0.0
cancel=true        # when true, num and den counts are canceled per frame.
drop_frames=false  # frame dropping, as in Karel's ICASSP2013 paper.
acwt=0.1
stage=0
update_opts=
transform_dir=
# End configuration section

echo "$0 $@"  # Print the command line for logging

# Pull in the environment (if a path.sh is present), then the option parser.
if [ -f ./path.sh ]; then
  . ./path.sh
fi
. parse_options.sh || exit 1

# Five positional arguments are mandatory; anything else prints the usage.
if [ $# -ne 5 ]; then
  cat <<'EOF'
Usage: steps/train_mmi_sgmm2.sh <data> <lang> <ali> <denlats> <exp>
 e.g.: steps/train_mmi_sgmm2.sh data/train_si84 data/lang exp/tri2b_ali_si84 exp/tri2b_denlats_si84 exp/tri2b_mmi
Main options (for others, see top of script file)
  --boost <boost-weight>                           # (e.g. 0.1), for boosted MMI.  (default 0)
  --cancel (true|false)                            # cancel stats (true by default)
  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs.
  --config <config-file>                           # config containing options
  --stage <stage>                                  # stage to do partial re-run from.
  --transform-dir <transform-dir>                  # directory to find fMLLR transforms.
EOF
  exit 1
fi

# Positional arguments.
data=$1
lang=$2
alidir=$3
denlatdir=$4
dir=$5
mkdir -p $dir/log

utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1
cp $lang/phones.txt $dir || exit 1

# Refuse to start unless every required input file is present.
for required in $data/feats.scp $alidir/tree $alidir/final.mdl $alidir/ali.1.gz $denlatdir/lat.1.gz; do
  if [ ! -f $required ]; then
    echo "$0: no such file $required"
    exit 1
  fi
done

# Alignments and lattices are both indexed by JOB, so the job counts must match.
nj=$(cat $alidir/num_jobs) || exit 1
if [ "$nj" -ne "$(cat $denlatdir/num_jobs)" ]; then
  echo "$alidir and $denlatdir have different num-jobs"
  exit 1
fi

sdata=$data/split$nj
# Optional feature settings inherited from the alignment directory.
splice_opts=$(cat $alidir/splice_opts 2>/dev/null)
cmvn_opts=$(cat $alidir/cmvn_opts 2>/dev/null)
mkdir -p $dir/log
cp $alidir/splice_opts $dir 2>/dev/null
cp $alidir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option.
# Re-split the data unless an up-to-date split directory is already there.
if ! [[ -d $sdata && $data/feats.scp -ot $sdata ]]; then
  split_data.sh $data $nj || exit 1
fi
echo $nj > $dir/num_jobs

cp $alidir/tree $dir
cp $alidir/final.mdl $dir/0.mdl
cp $alidir/final.alimdl $dir

silphonelist=$(cat $lang/phones/silence.csl) || exit 1

# Set up features: "lda" when the alignment dir holds final.mat, else "delta".
feat_type=delta
if [ -f $alidir/final.mat ]; then
  feat_type=lda
fi
echo "$0: feature type is $feat_type"

# Both feature types share the same CMVN front end (JOB is filled in per job).
cmvn_pipe="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"
case $feat_type in
  delta)
    feats="$cmvn_pipe add-deltas ark:- ark:- |"
    ;;
  lda)
    feats="$cmvn_pipe splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
    cp $alidir/final.mat $dir
    ;;
  *)
    echo "Invalid feature type $feat_type" && exit 1
    ;;
esac

# Optional per-job transforms taken from --transform-dir.
if [ -n "$transform_dir" ]; then
  echo "$0: using transforms from $transform_dir"
  if [ ! -f $transform_dir/trans.1 ]; then
    echo "$0: no such file $transform_dir/trans.1"
    exit 1
  fi
  feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |"
else
  echo "$0: no fMLLR transforms."
fi

# Speaker vectors are optional.
if [ ! -f $alidir/vecs.1 ]; then
  echo "$0: no speaker vectors."
  spkvecs_opt=
else
  echo "$0: using speaker vectors from $alidir"
  spkvecs_opt="--spk-vecs=ark:$alidir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk"
fi

# Gaussian-selection info is mandatory.
if [ ! -f $alidir/gselect.1.gz ]; then
  echo "$0: error: no Gaussian-selection info found"
  exit 1
fi
echo "$0: using Gaussian-selection info from $alidir"
gselect_opt="--gselect=ark,s,cs:gunzip -c $alidir/gselect.JOB.gz|"

# Denominator lattices, boosted through lattice-boost-ali unless boost is 0 / 0.0.
lats="ark:gunzip -c $denlatdir/lat.JOB.gz|"
if ! [[ "$boost" == "0.0" || "$boost" == 0 ]]; then
  lats="$lats lattice-boost-ali --b=$boost --silence-phones=$silphonelist $alidir/final.mdl ark:- 'ark,s,cs:gunzip -c $alidir/ali.JOB.gz|' ark:- |"
fi

# Main loop: iteration x reads $dir/$x.mdl and writes $dir/$[x+1].mdl.
x=0
while [ $x -lt $num_iters ]; do
  echo "Iteration $x of MMI training"
  # Note: the num and den stats are accumulated at the same time: 
  # can cancel them per frame.
  if [ $stage -le $x ]; then
    # Each job is skipped (via "test -s ... ||") when both of its compressed
    # accumulators already exist and are non-empty.  Otherwise the lattices
    # are rescored, converted to posteriors, combined with the negated
    # (--scale1=-1) numerator posteriors, and accumulated into gzipped
    # num/den stats.
    $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
      test -s $dir/den_acc.$x.JOB.gz -a -s $dir/num_acc.$x.JOB.gz '||' \
      sgmm2-rescore-lattice --speedup=true "$gselect_opt" $spkvecs_opt $dir/$x.mdl "$lats" "$feats" ark:- \| \
      lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
      sum-post --drop-frames=$drop_frames --merge=$cancel --scale1=-1 \
      ark:- "ark,s,cs:gunzip -c $alidir/ali.JOB.gz | ali-to-post ark:- ark:- |" ark:- \| \
      sgmm2-acc-stats2 "$gselect_opt" $spkvecs_opt $dir/$x.mdl "$feats" ark,s,cs:- \
      "|gzip -c >$dir/num_acc.$x.JOB.gz" "|gzip -c >$dir/den_acc.$x.JOB.gz" || exit 1;

    # Sanity check: one num and one den accumulator per job.
    n=`echo $dir/{num,den}_acc.$x.*.gz | wc -w`;
    [ "$n" -ne $[$nj*2] ] && \
      echo "Wrong number of MMI accumulators $n versus 2*$nj" && exit 1;
    # Build the two summing commands as piped inputs, so the summed stats are
    # streamed into the update rather than written to disk.
    num_acc_sum="sgmm2-sum-accs - ";
    den_acc_sum="sgmm2-sum-accs - ";
    for j in `seq $nj`; do 
      num_acc_sum="$num_acc_sum 'gunzip -c $dir/num_acc.$x.$j.gz|'"; 
      den_acc_sum="$den_acc_sum 'gunzip -c $dir/den_acc.$x.$j.gz|'"; 
    done
    $cmd $dir/log/update.$x.log \
     sgmm2-est-ebw $update_opts $dir/$x.mdl "$num_acc_sum |" "$den_acc_sum |" \
      $dir/$[$x+1].mdl || exit 1;
    rm $dir/*_acc.$x.*.gz 
  fi

  # Some diagnostics: the objective function progress and auxiliary-function
  # improvement.  Note: this code is same as in train_mmi.sh
  # The perl one-liner scans the tail of each accumulation log for the summary
  # lines of sgmm2-acc-stats2 and lattice-to-post, and prints
  # "<per-frame objf> <frame count>" into a temporary file.
  tail -n 50 $dir/log/acc.$x.*.log | perl -e '$acwt=shift @ARGV; while(<STDIN>) { if(m/sgmm2-acc-stats2.+Overall weighted acoustic likelihood per frame was (\S+) over (\S+) frames/) { $tot_aclike += $1*$2; $tot_frames1 += $2; } if(m|lattice-to-post.+Overall average log-like/frame is (\S+) over (\S+) frames.  Average acoustic like/frame is (\S+)|) { $tot_den_lat_like += $1*$2; $tot_frames2 += $2; $tot_den_aclike += $3*$2; } } if (abs($tot_frames1 - $tot_frames2) > 0.01*($tot_frames1 + $tot_frames2)) { print STDERR "Frame-counts disagree $tot_frames1 versus $tot_frames2\n"; } $tot_den_lat_like /= $tot_frames2; $tot_den_aclike /= $tot_frames2; $tot_aclike *= ($acwt / $tot_frames1);  $num_like = $tot_aclike + $tot_den_aclike; $per_frame_objf = $num_like - $tot_den_lat_like; print "$per_frame_objf $tot_frames1\n"; ' $acwt > $dir/tmpf
  objf=`cat $dir/tmpf | awk '{print $1}'`;
  nf=`cat $dir/tmpf | awk '{print $2}'`;
  rm $dir/tmpf
  impr=`grep -w Overall $dir/log/update.$x.log | awk '{x += $10*$12;} END{print x;}'`
  impr=`perl -e "print ($impr*$acwt/$nf);"` # We multiply by acwt, and divide by $nf which is the "real" number of frames.
  echo "Iteration $x: objf was $objf, MMI auxf change was $impr" | tee $dir/objf.$x.log
  x=$[$x+1]
done

echo "MMI training finished"

# After the loop $x equals $num_iters, so final.mdl links to the last model.
rm $dir/final.mdl 2>/dev/null
rm $dir/*.acc 2>/dev/null
ln -s $x.mdl $dir/final.mdl

exit 0;


================================================
FILE: egs/steps/train_mono.sh
================================================
#!/usr/bin/env bash
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
#           2019  Xiaohui Zhang
# Apache 2.0


# To be run from ..
# Flat start and monophone training, with delta-delta features.
# This script applies cepstral mean normalization (per speaker).

# Begin configuration section.
# Defaults for the command-line options; parse_options.sh overrides them.
nj=4
cmd=run.pl
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
num_iters=40      # total number of training iterations
max_iter_inc=30   # last iteration on which the number of Gaussians grows
initial_beam=6    # beam for the first iteration (smaller speeds up initialization)
regular_beam=10   # beam for every later iteration
retry_beam=40
totgauss=1000     # target number of Gaussians
careful=false
boost_silence=1.0 # factor by which silence likelihoods are boosted in alignment
realign_iters="1 2 3 4 5 6 7 8 9 10 12 14 16 18 20 23 26 29 32 35 38"
config=           # name of config file.
stage=-4
power=0.25        # exponent mapping occurrence counts to Gaussian counts
norm_vars=false   # deprecated, prefer --cmvn-opts "--norm-vars=false"
cmvn_opts=        # extra options for cmvn
delta_opts=       # extra options for add-deltas
# End configuration section.

echo "$0 $@"  # Print the command line for logging

# Pull in the environment (if a path.sh is present), then the option parser.
[ -f path.sh ] && . ./path.sh
. parse_options.sh || exit 1

# Three positional arguments are mandatory; anything else prints the usage.
if [ $# -ne 3 ]; then
  cat <<'EOF'
Usage: steps/train_mono.sh [options] <data-dir> <lang-dir> <exp-dir>
 e.g.: steps/train_mono.sh data/train.1k data/lang exp/mono
main options (for others, see top of script file)
  --config <config-file>                           # config containing options
  --nj <nj>                                        # number of parallel jobs
  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs.
EOF
  exit 1
fi

# Positional arguments.
data=$1
lang=$2
dir=$3

# Integer id of the out-of-vocabulary word; text is mapped to it below.
oov_sym=`cat $lang/oov.int` || exit 1;

mkdir -p $dir/log
echo $nj > $dir/num_jobs
sdata=$data/split$nj;
# Split the data unless a split directory newer than feats.scp already exists.
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;

cp $lang/phones.txt $dir || exit 1;

# --norm-vars is the deprecated way of asking for variance normalization.
$norm_vars && cmvn_opts="--norm-vars=true $cmvn_opts"
echo $cmvn_opts  > $dir/cmvn_opts # keep track of options to CMVN.
# $delta_opts may hold several words, so it must be quoted inside the test;
# unquoted, "[" receives too many arguments and the file is never written.
[ ! -z "$delta_opts" ] && echo $delta_opts > $dir/delta_opts # keep track of options to delta

# JOB is substituted per job by $cmd; example_feats pins it to job 1.
feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |"
example_feats="`echo $feats | sed s/JOB/1/g`";

echo "$0: Initializing monophone system."

# sets.int is required for --shared-phones; say so instead of exiting silently.
if [ ! -f $lang/phones/sets.int ]; then
  echo "$0: expected file $lang/phones/sets.int to exist"
  exit 1;
fi
shared_phones_opt="--shared-phones=$lang/phones/sets.int"

if [ $stage -le -3 ]; then
  # Note: JOB=1 just uses the 1st part of the features-- we only need a subset anyway.
  # On failure, feat-to-dim is re-run with stderr visible so the cause is shown.
  if ! feat_dim=`feat-to-dim "$example_feats" - 2>/dev/null` || [ -z "$feat_dim" ]; then
    feat-to-dim "$example_feats" -
    echo "error getting feature dimension"
    exit 1;
  fi
  $cmd JOB=1 $dir/log/init.log \
    gmm-init-mono $shared_phones_opt "--train-feats=$feats subset-feats --n=10 ark:- ark:-|" $lang/topo $feat_dim \
    $dir/0.mdl $dir/tree || exit 1;
fi

# Starting number of Gaussians is read from the initial model; it then grows
# by a fixed increment per iteration towards $totgauss.
numgauss=`gmm-info --print-args=false $dir/0.mdl | grep gaussians | awk '{print $NF}'`
incgauss=$[($totgauss-$numgauss)/$max_iter_inc] # per-iter increment for #Gauss

if [ $stage -le -2 ]; then
  echo "$0: Compiling training graphs"
  $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
    compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/0.mdl  $lang/L.fst \
    "ark:sym2int.pl --map-oov $oov_sym -f 2- $lang/words.txt < $sdata/JOB/text|" \
    "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
fi

if [ $stage -le -1 ]; then
  echo "$0: Aligning data equally (pass 0)"
  $cmd JOB=1:$nj $dir/log/align.0.JOB.log \
    align-equal-compiled "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" ark,t:-  \| \
    gmm-acc-stats-ali --binary=true $dir/0.mdl "$feats" ark:- \
    $dir/0.JOB.acc || exit 1;
fi

# In the following steps, the --min-gaussian-occupancy=3 option is important, otherwise
# we fail to est "rare" phones and later on, they never align properly.

if [ $stage -le 0 ]; then
  gmm-est --min-gaussian-occupancy=3  --mix-up=$numgauss --power=$power \
    $dir/0.mdl "gmm-sum-accs - $dir/0.*.acc|" $dir/1.mdl 2> $dir/log/update.0.log || exit 1;
  rm $dir/0.*.acc
fi

# The first pass uses the narrow initial beam; every later pass uses regular_beam.
# note: using slightly wider beams for WSJ vs. RM.
beam=$initial_beam
x=1
while [ $x -lt $num_iters ]; do
  echo "$0: Pass $x"
  if [ $stage -le $x ]; then
    # Realign only on the iterations listed in realign_iters.
    if echo $realign_iters | grep -q -w $x; then
      echo "$0: Aligning data"
      mdl="gmm-boost-silence --boost=$boost_silence $(cat $lang/phones/optional_silence.csl) $dir/$x.mdl - |"
      $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \
        gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam --careful=$careful "$mdl" \
        "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" "ark,t:|gzip -c >$dir/ali.JOB.gz" \
        || exit 1
    fi

    # Accumulate stats from the current alignments ...
    $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
      gmm-acc-stats-ali  $dir/$x.mdl "$feats" "ark:gunzip -c $dir/ali.JOB.gz|" \
      $dir/$x.JOB.acc || exit 1

    # ... and re-estimate, writing the model and occupancies of iteration x+1.
    $cmd $dir/log/update.$x.log \
      gmm-est --write-occs=$dir/$((x+1)).occs --mix-up=$numgauss --power=$power $dir/$x.mdl \
      "gmm-sum-accs - $dir/$x.*.acc|" $dir/$((x+1)).mdl || exit 1

    # Iteration x's model, accumulators and occupancies are no longer needed.
    rm $dir/$x.mdl $dir/$x.*.acc $dir/$x.occs 2>/dev/null
  fi

  # Grow the Gaussian budget until max_iter_inc has been passed.
  if [ $x -le $max_iter_inc ]; then
    numgauss=$((numgauss+incgauss))
  fi
  beam=$regular_beam
  x=$((x+1))
done

# Point final.mdl / final.occs at the outputs of the last iteration.
(
  cd $dir
  rm final.mdl final.occs 2>/dev/null
  ln -s $x.mdl final.mdl
  ln -s $x.occs final.occs
)

# Post-training reports.
steps/diagnostic/analyze_alignments.sh --cmd "$cmd" $lang $dir
utils/summarize_warnings.pl $dir/log

steps/info/gmm_dir_info.pl $dir

echo "$0: Done training monophone system in $dir"

exit 0

# example of showing the alignments:
# show-alignments data/lang/phones.txt $dir/30.mdl "ark:gunzip -c $dir/ali.0.gz|" | head -4


================================================
FILE: egs/steps/train_mpe.sh
================================================
#!/usr/bin/env bash
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.

# MPE training (the denominator lattices are boosted if you give the --boost option).
# 4 iterations (by default) of Extended Baum-Welch update.
#
# For the numerator we have a fixed alignment rather than a lattice--
# this actually follows from the way lattices are defined in Kaldi, which
# is to have a single path for each word (output-symbol) sequence.

# Begin configuration section.
# Every variable below can be overridden on the command line as --name value
# (underscores become dashes); parse_options.sh performs the substitution.
cmd=run.pl
num_iters=4
boost=0.0
cancel=true # if true, cancel num and den counts on each frame.
tau=400 # smoothing constant given to gmm-ismooth-stats.
weight_tau=10
acwt=0.1
stage=0
smooth_to_model=true # if true, smooth towards the current model; if false,
                     # accumulate ML stats and smooth towards those instead.
smooth_to_mode= # deprecated misspelling of smooth_to_model, kept so that old
                # command lines using --smooth-to-mode are still accepted.
# End configuration section

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

# Honour the old misspelled option name when it was given explicitly.
if [ -n "$smooth_to_mode" ]; then
  smooth_to_model=$smooth_to_mode
fi

# Exactly five positional arguments are required; otherwise print the usage.
if [ $# -ne 5 ]; then
  echo "Usage: steps/train_mpe.sh <data> <lang> <ali> <denlats> <exp>"
  echo " e.g.: steps/train_mpe.sh data/train_si84 data/lang exp/tri2b_ali_si84 exp/tri2b_denlats_si84 exp/tri2b_mpe"
  echo "Main options (for others, see top of script file)"
  echo "  --boost <boost-weight>                           # (e.g. 0.1), for boosted MMI.  (default 0)"
  echo "  --cancel (true|false)                            # cancel stats (true by default)"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --config <config-file>                           # config containing options"
  echo "  --stage <stage>                                  # stage to do partial re-run from."
  echo "  --tau                                            # tau for i-smooth to last iter (default 400)"
  echo "  --smooth-to-model (true|false)                   # smooth to model rather than ML stats (true by default)"
  
  exit 1;
fi

# Positional arguments.
data=$1
lang=$2
alidir=$3
denlatdir=$4
dir=$5
mkdir -p $dir/log

utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1
cp $lang/phones.txt $dir || exit 1

# Refuse to start unless every required input file is present.
for required in $data/feats.scp $alidir/tree $alidir/final.mdl $alidir/ali.1.gz $denlatdir/lat.1.gz; do
  if [ ! -f $required ]; then
    echo "$0: no such file $required"
    exit 1
  fi
done

# Alignments and lattices are both indexed by JOB, so the job counts must match.
nj=$(cat $alidir/num_jobs) || exit 1
if [ "$nj" -ne "$(cat $denlatdir/num_jobs)" ]; then
  echo "$alidir and $denlatdir have different num-jobs"
  exit 1
fi

sdata=$data/split$nj
# Optional feature settings inherited from the alignment directory.
splice_opts=$(cat $alidir/splice_opts 2>/dev/null)
cmvn_opts=$(cat $alidir/cmvn_opts 2>/dev/null)
mkdir -p $dir/log
cp $alidir/splice_opts $dir 2>/dev/null
cp $alidir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option.
# Re-split the data unless an up-to-date split directory is already there.
if ! [[ -d $sdata && $data/feats.scp -ot $sdata ]]; then
  split_data.sh $data $nj || exit 1
fi
echo $nj > $dir/num_jobs

cp $alidir/final.mdl $alidir/tree $dir

silphonelist=$(cat $lang/phones/silence.csl) || exit 1

# Set up features: "lda" when the alignment dir holds final.mat, else "delta".
feat_type=delta
if [ -f $alidir/final.mat ]; then
  feat_type=lda
fi
echo "$0: feature type is $feat_type"

# Both feature types share the same CMVN front end (JOB is filled in per job).
cmvn_pipe="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"
case $feat_type in
  delta)
    feats="$cmvn_pipe add-deltas ark:- ark:- |"
    ;;
  lda)
    feats="$cmvn_pipe splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
    cp $alidir/final.mat $dir
    ;;
  *)
    echo "Invalid feature type $feat_type" && exit 1
    ;;
esac

# Append per-job transforms when the alignment directory provides them.
if [ -f $alidir/trans.1 ]; then
  echo Using transforms from $alidir
  feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$alidir/trans.JOB ark:- ark:- |"
fi

# Denominator lattices, boosted through lattice-boost-ali unless boost is 0 / 0.0.
lats="ark:gunzip -c $denlatdir/lat.JOB.gz|"
if ! [[ "$boost" == "0.0" || "$boost" == 0 ]]; then
  lats="$lats lattice-boost-ali --b=$boost --silence-phones=$silphonelist $alidir/final.mdl ark:- 'ark,s,cs:gunzip -c $alidir/ali.JOB.gz|' ark:- |"
fi


# Main loop.  $cur_mdl is the model an iteration starts from: the alignment
# directory's final.mdl for iteration 0, then $dir/<x>.mdl for iteration x.
cur_mdl=$alidir/final.mdl
x=0
while [ $x -lt $num_iters ]; do
  echo "Iteration $x of MPE training"
  # Note: the num and den stats are accumulated at the same time, so we
  # can cancel them per frame.
  if [ $stage -le $x ]; then
    # Rescore the denominator lattices with the current model, turn them into
    # MPE posteriors using the reference alignments, and accumulate num/den
    # stats per job.
    $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
      gmm-rescore-lattice $cur_mdl "$lats" "$feats" ark:- \| \
      lattice-to-mpe-post --acoustic-scale=$acwt $cur_mdl \
        "ark,s,cs:gunzip -c $alidir/ali.JOB.gz |" ark:- ark:- \| \
      gmm-acc-stats2 $cur_mdl "$feats" ark,s,cs:- \
        $dir/num_acc.$x.JOB.acc $dir/den_acc.$x.JOB.acc || exit 1;

    # Sanity check: one num and one den accumulator per job.
    n=`echo $dir/{num,den}_acc.$x.*.acc | wc -w`;
    [ "$n" -ne $[$nj*2] ] && \
      echo "Wrong number of MMI accumulators $n versus 2*$nj" && exit 1;
    $cmd $dir/log/den_acc_sum.$x.log \
      gmm-sum-accs $dir/den_acc.$x.acc $dir/den_acc.$x.*.acc || exit 1;
    rm $dir/den_acc.$x.*.acc
    $cmd $dir/log/num_acc_sum.$x.log \
      gmm-sum-accs $dir/num_acc.$x.acc $dir/num_acc.$x.*.acc || exit 1;
    rm $dir/num_acc.$x.*.acc

    # note: this tau value is for smoothing towards model parameters, not
    # as in the Boosted MMI paper, not towards the ML stats as in the earlier
    # work on discriminative training (e.g. my thesis).  
    # You could use gmm-ismooth-stats to smooth to the ML stats, if you had
    # them available [here they're not available if cancel=true].
    # NOTE(review): this tests $smooth_to_model; confirm the configuration
    # section defines a variable with exactly that name.  If it is unset,
    # "! $smooth_to_model" is false and the else branch (smooth from model)
    # always runs.
    if ! $smooth_to_model; then
      echo "Iteration $x of MPE: computing ml (smoothing) stats"
      $cmd JOB=1:$nj $dir/log/acc_ml.$x.JOB.log \
        gmm-acc-stats $cur_mdl "$feats" \
          "ark,s,cs:gunzip -c $alidir/ali.JOB.gz | ali-to-post ark:- ark:- |" \
          $dir/ml.$x.JOB.acc || exit 1;
      $cmd $dir/log/acc_ml_sum.$x.log \
        gmm-sum-accs $dir/ml.$x.acc $dir/ml.$x.*.acc || exit 1;
      rm $dir/ml.$x.*.acc
      num_stats="gmm-ismooth-stats --tau=$tau $dir/ml.$x.acc $dir/num_acc.$x.acc -|"
    else 
      num_stats="gmm-ismooth-stats --smooth-from-model=true --tau=$tau $cur_mdl $dir/num_acc.$x.acc -|"
    fi  
    
    # EBW update: Gaussians from the smoothed numerator stats (piped in via
    # $num_stats), then weights from the unsmoothed stats.
    $cmd $dir/log/update.$x.log \
      gmm-est-gaussians-ebw $cur_mdl "$num_stats" $dir/den_acc.$x.acc - \| \
      gmm-est-weights-ebw - $dir/num_acc.$x.acc $dir/den_acc.$x.acc $dir/$[$x+1].mdl || exit 1;
    rm $dir/{den,num}_acc.$x.acc
  fi
  # Advance to the next model even when this iteration was skipped by --stage.
  cur_mdl=$dir/$[$x+1].mdl

  # Some diagnostics: the objective function progress and auxiliary-function
  # improvement.
  # The perl one-liner averages the frame accuracy reported by
  # lattice-to-mpe-post over all job logs and prints "<objf> <frame count>".

 tail -n 50 $dir/log/acc.$x.*.log | perl -e 'while(<STDIN>) { if(m/lattice-to-mpe-post.+Overall average frame-accuracy is (\S+) over (\S+) frames/) { $tot_objf += $1*$2; $tot_frames += $2; }} $tot_objf /= $tot_frames; print "$tot_objf $tot_frames\n"; ' > $dir/tmpf
  objf=`cat $dir/tmpf | awk '{print $1}'`;
  nf=`cat $dir/tmpf | awk '{print $2}'`;
  rm $dir/tmpf
  impr=`grep -w Overall $dir/log/update.$x.log | awk '{x += $10*$12;} END{print x;}'`
  impr=`perl -e "print ($impr*$acwt/$nf);"` # We multiply by acwt, and divide by $nf which is the "real" number of frames.
  # This gives us a projected objective function improvement.
  echo "Iteration $x: objf was $objf, MPE auxf change was $impr" | tee $dir/objf.$x.log
  x=$[$x+1]
done

echo "MPE training finished"

# After the loop $x equals $num_iters, so final.mdl links to the last model.
rm $dir/final.mdl 2>/dev/null
ln -s $x.mdl $dir/final.mdl

exit 0;


================================================
FILE: egs/steps/train_quick.sh
================================================
#!/usr/bin/env bash
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.


# Train a model on top of existing features (no feature-space learning of any
# kind is done).  This script initializes the model (i.e., the GMMs) from the
# previous system's model.  That is: for each state in the current model (after
# tree building), it chooses the closes state in the old model, judging the
# similarities based on overlap of counts in the tree stats.

# Begin configuration..
# Defaults for the options handled by parse_options.sh below.
cmd=run.pl                 # job dispatcher (see --cmd in the usage message)
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
realign_iters="10 15"      # Only realign twice.
num_iters=20               # Number of iterations of training
maxiterinc=15              # Last iter to increase #Gauss on.
batch_size=750             # batch size to use while compiling graphs... memory/speed tradeoff.
beam=10                    # alignment beam.
retry_beam=40              # passed as --retry-beam to gmm-align-compiled
stage=-5                   # first stage to run (see --stage in the usage message)
cluster_thresh=-1          # for build-tree control final bottom-up clustering of leaves
# End configuration section.

# Log the exact invocation.
echo "$0 $@"

if [ -f path.sh ]; then
  . ./path.sh
fi
. parse_options.sh || exit 1;

# Exactly six positional arguments are required once options are consumed.
if [ $# -ne 6 ]; then
  echo "Usage: steps/train_quick.sh <num-leaves> <num-gauss> <data> <lang> <ali-dir> <exp-dir>"
  echo " e.g.: steps/train_quick.sh 2500 15000 data/train_si284 data/lang exp/tri3c_ali_si284 exp/tri4b"
  echo "Main options (for others, see top of script file)"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --config <config-file>                           # config containing options"
  echo "  --stage <stage>                                  # stage to do partial re-run from."
  exit 1
fi

numleaves=$1   # --max-leaves for build-tree
totgauss=$2    # final target number of Gaussians
data=$3        # data directory (feats.scp, split<nj>/...)
lang=$4        # lang directory (L.fst, topo, phones/...)
alidir=$5      # source alignments and model
dir=$6         # output experiment directory

# Refuse to start when any of the essential inputs is missing.
for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl; do
  if [ ! -f $f ]; then
    echo "$0: no such file $f"
    exit 1
  fi
done

# Values derived from the lang and alignment directories.
oov=$(cat $lang/oov.int)
silphonelist=$(cat $lang/phones/silence.csl)
ciphonelist=$(cat $lang/phones/context_indep.csl)

# Begin with half of the requested Gaussians (but never fewer than the number
# of leaves): little mixing-up should be needed because the pdfs are
# initialized from the old, already mixed-up model.
numgauss=$((totgauss / 2))
if [ $numgauss -lt $numleaves ]; then
  numgauss=$numleaves
fi
# Amount by which the #Gauss target grows on each iteration up to $maxiterinc.
incgauss=$(( (totgauss - numgauss) / maxiterinc ))

nj=$(cat $alidir/num_jobs) || exit 1;
sdata=$data/split$nj
# Feature options are inherited from the alignment dir; each may be absent.
splice_opts=$(cat $alidir/splice_opts 2>/dev/null) # frame-splicing options.
cmvn_opts=$(cat $alidir/cmvn_opts 2>/dev/null)
delta_opts=$(cat $alidir/delta_opts 2>/dev/null)

mkdir -p $dir/log
echo $nj >$dir/num_jobs
# Carry the feature option files over to the new directory when they exist.
for optfile in splice_opts cmvn_opts delta_opts; do
  cp $alidir/$optfile $dir 2>/dev/null
done
# Re-split the data unless an existing split is newer than feats.scp.
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;

# The phone set of the lang dir has to agree with the one the alignments used.
utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

## Set up features.
# A final.mat in the alignment dir selects the "lda" pipeline (splice +
# transform); otherwise delta features are used.
if [ -f $alidir/final.mat ]; then
  feat_type=lda
else
  feat_type=delta
fi
echo "$0: feature type is $feat_type"

# $sifeats is the per-job feature rspecifier without speaker transforms; the
# literal JOB is left in the string for $cmd to handle.
case $feat_type in
  delta)
    sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |"
    ;;
  lda)
    sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
    cp $alidir/final.mat $dir
    cp $alidir/full.mat $dir 2>/dev/null
    ;;
  *)
    echo "Invalid feature type $feat_type" && exit 1;
    ;;
esac

# With per-speaker transforms in the alignment dir, link them into $dir and
# apply them on top of $sifeats; without them, train on $sifeats directly.
if [ ! -f $alidir/trans.1 ]; then
  feats="$sifeats"
else
  echo "$0: using transforms from $alidir"
  ln.pl $alidir/trans.* $dir # Link them to dest dir.
  feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |"
fi
##


# Stage -5: accumulate tree statistics per job from the old model and
# alignments, check that one stats file per job was produced, sum them into
# $dir/treeacc and delete the per-job files.
if [ $stage -le -5 ]; then
  echo "$0: accumulating tree stats"
  $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \
    acc-tree-stats  --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \
    "ark:gunzip -c $alidir/ali.JOB.gz|" $dir/JOB.treeacc || exit 1;
  [ "`ls $dir/*.treeacc | wc -w`" -ne "$nj" ] && echo "$0: Wrong #tree-stats" && exit 1;
  sum-tree-stats $dir/treeacc $dir/*.treeacc 2>$dir/log/sum_tree_acc.log || exit 1;
  rm $dir/*.treeacc
fi

# Stage -4: derive clustering questions from the summed stats, append the
# extra questions from the lang dir, compile them, and build the tree with at
# most $numleaves leaves.
if [ $stage -le -4 ]; then
  echo "$0: Getting questions for tree clustering."
  # preparing questions, roots file...
  cluster-phones $dir/treeacc $lang/phones/sets.int $dir/questions.int 2> $dir/log/questions.log || exit 1;
  cat $lang/phones/extra_questions.int >> $dir/questions.int
  compile-questions $lang/topo $dir/questions.int $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1;

  echo "$0: Building the tree"
  $cmd $dir/log/build_tree.log \
    build-tree --verbose=1 --max-leaves=$numleaves \
    --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \
    $dir/questions.qst $lang/topo $dir/tree || exit 1;
fi

# Stage -3: create $dir/tmp.mdl (and $dir/1.occs) for the new tree, passing
# the old tree and old model as extra arguments.  $dir/treeacc is removed
# afterwards.
if [ $stage -le -3 ]; then
  echo "$0: Initializing the model"

  # The gmm-init-model command (with more than the normal # of command-line args)
  # will initialize the p.d.f.'s to the p.d.f.'s in the alignment model.

  gmm-init-model  --write-occs=$dir/1.occs  \
    $dir/tree $dir/treeacc $lang/topo $dir/tmp.mdl $alidir/tree $alidir/final.mdl  \
    2>$dir/log/init_model.log || exit 1;

  # A 'no stats' line in the log is echoed (by grep) and flagged, but it does
  # not stop the script.
  grep 'no stats' $dir/log/init_model.log && echo "$0: This is a bad warning.";
  rm $dir/treeacc
fi

# Stage -2: bring the initial model to $numgauss Gaussians, writing
# $dir/1.mdl, then drop the temporary model.
if [ $stage -le -2 ]; then
  echo "$0: mixing up old model."
  # We do both mixing-down and mixing-up to get the target #Gauss in each state,
  # since the initial model may have either more or fewer Gaussians than we want.
  gmm-mixup --mix-down=$numgauss --mix-up=$numgauss $dir/tmp.mdl $dir/1.occs $dir/1.mdl \
    2> $dir/log/mixup.log || exit 1;
  rm $dir/tmp.mdl  # temporary model is no longer needed once 1.mdl exists
fi

# Convert alignments to the new tree.
# Stage -1: reads $alidir/ali.JOB.gz and writes $dir/ali.JOB.gz.
if [ $stage -le -1 ]; then
  echo "$0: converting old alignments"
  $cmd JOB=1:$nj $dir/log/convert.JOB.log \
    convert-ali $alidir/final.mdl $dir/1.mdl $dir/tree \
    "ark:gunzip -c $alidir/ali.JOB.gz|" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi

# Stage 0: compile one training graph per utterance from the transcripts
# (words mapped to integers, OOVs mapped to $oov) into $dir/fsts.JOB.gz.
if [ $stage -le 0 ]; then
  echo "$0: compiling training graphs"
  $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
    compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int --batch-size=$batch_size $dir/tree $dir/1.mdl $lang/L.fst  \
    "ark:sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $sdata/JOB/text |" \
    "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
fi

# Main training loop: passes 1 .. num_iters-1.  Pass x reads $dir/$x.mdl and
# writes the model for pass x+1; passes below $stage are skipped, but the
# #Gauss schedule is still advanced for them.
x=1
while [ $x -lt $num_iters ]; do
  echo "$0: pass $x"
  next_x=$((x + 1))
  if [ $stage -le $x ]; then
    # Re-align only on the iterations listed in $realign_iters.
    if echo $realign_iters | grep -w $x >/dev/null; then
      echo "$0: aligning data"
      $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \
        gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam $dir/$x.mdl \
        "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" \
        || exit 1;
    fi

    # Accumulate stats on the current alignments, one accumulator per job.
    $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
      gmm-acc-stats-ali  $dir/$x.mdl "$feats" \
      "ark,s,cs:gunzip -c $dir/ali.JOB.gz|"  $dir/$x.JOB.acc || exit 1;
    num_accs=$(ls $dir/$x.*.acc | wc -w)
    if [ "$num_accs" -ne "$nj" ]; then
      echo "$0: wrong #accs"
      exit 1
    fi

    # Re-estimate, mixing up to the current Gaussian target, then discard the
    # inputs of this pass.
    $cmd $dir/log/update.$x.log \
      gmm-est --write-occs=$dir/$next_x.occs --mix-up=$numgauss $dir/$x.mdl \
      "gmm-sum-accs - $dir/$x.*.acc |" $dir/$next_x.mdl || exit 1;
    rm $dir/$x.mdl $dir/$x.*.acc $dir/$x.occs
  fi
  if [[ $x -le $maxiterinc ]]; then
    numgauss=$((numgauss + incgauss))
  fi
  x=$next_x
done

# When speaker transforms are in use, also estimate an "alignment model":
# stats are accumulated with gmm-acc-stats-twofeats, which is given both the
# transformed features ($feats) and the untransformed ones ($sifeats), and the
# result is written to $dir/$x.alimdl (occupancies go to $dir/final.occs).
if [ -f $alidir/trans.1 ]; then
  echo "$0: estimating alignment model"
  $cmd JOB=1:$nj $dir/log/acc_alimdl.JOB.log \
    ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:-  \| \
    gmm-acc-stats-twofeats $dir/$x.mdl "$feats" "$sifeats" \
    ark,s,cs:- $dir/$x.JOB.acc || exit 1;
  # Expect exactly one accumulator file per job.
  [ "`ls $dir/$x.*.acc | wc -w`" -ne "$nj" ] && echo "$0: wrong #accs" && exit 1;

  $cmd $dir/log/est_alimdl.log \
    gmm-est --write-occs=$dir/final.occs --remove-low-count-gaussians=false $dir/$x.mdl \
    "gmm-sum-accs - $dir/$x.*.acc|" $dir/$x.alimdl || exit 1;
  rm $dir/$x.*.acc
  rm $dir/final.alimdl 2>/dev/null  # replace any link left by a previous run
  ln -s $x.alimdl $dir/final.alimdl
fi

# Point final.mdl at the last model written by the loop (x equals num_iters here).
rm $dir/final.mdl 2>/dev/null
ln -s $x.mdl $dir/final.mdl

echo Done


================================================
FILE: egs/steps/train_raw_sat.sh
================================================
#!/usr/bin/env bash
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.


# This does Speaker Adapted Training (SAT).  We train on fMLLR-adapted features,
# but in this "raw" script, these transforms are at the level of the raw
# cepstra.  The model must be built on top of LDA+MLLT features, and the
# transforms are estimated using the model, in a rather clever way.  If there
# are no raw transforms supplied in the alignment directory, it will estimate
# transforms itself before building the tree (and in any case, it estimates
# transforms a number of times during training).
# You need to decode the models it builds with decode_raw_fmllr.sh

# Begin configuration section.
# Defaults for the options handled by parse_options.sh below.
stage=-6
cmd=run.pl
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
beam=10
retry_beam=40
boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment
context_opts=  # e.g. set this to "--context-width 5 --central-position 2" for quinphone.
realign_iters="10 20 30";
fmllr_iters="2 4 6 12";
mllt_iters="3 5 7 10"
dim=40  # passed as --dim to est-lda
randprune=4.0 # This is approximately the ratio by which we will speed up the
              # LDA and MLLT calculations via randomized pruning.
silence_weight=0.0 # Weight on silence in fMLLR estimation.
num_iters=35   # Number of iterations of training
max_iter_inc=25 # Last iter to increase #Gauss on.
power=0.2 # Exponent for number of gaussians according to occurrence counts
cluster_thresh=-1  # for build-tree control final bottom-up clustering of leaves
train_tree=true  # executed as a command ("true"/"false") to gate the tree stages
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh
. parse_options.sh || exit 1;

# Exactly six positional arguments are required.  The usage text names this
# script (it previously advertised steps/train_sat.sh, which is a different
# recipe).
if [ $# != 6 ]; then
  echo "Usage: steps/train_raw_sat.sh <#leaves> <#gauss> <data> <lang> <ali-dir> <exp-dir>"
  echo " e.g.: steps/train_raw_sat.sh 2500 15000 data/train_si84 data/lang exp/tri2b_ali_si84 exp/tri3b"
  echo "Main options (for others, see top of script file)"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --config <config-file>                           # config containing options"
  echo "  --stage <stage>                                  # stage to do partial re-run from."
  exit 1;
fi

numleaves=$1   # --max-leaves for build-tree; also the initial #Gauss target
totgauss=$2    # final target number of Gaussians
data=$3        # data directory (feats.scp, split<nj>/...)
lang=$4        # lang directory (phones.txt, topo, L.fst, ...)
alidir=$5      # source alignments, model and LDA matrices
dir=$6         # output experiment directory

# Refuse to start when an essential input is missing; report under the real
# script name ($0) rather than a hard-coded one.
for f in $data/feats.scp $lang/phones.txt $alidir/final.mdl $alidir/ali.1.gz; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

# Gaussian schedule: start from one Gaussian per leaf and grow by a fixed
# increment on each iteration up to $max_iter_inc.
numgauss=$numleaves
incgauss=$(( (totgauss - numgauss) / max_iter_inc ))  # per-iter #gauss increment

# Values read from the lang and alignment directories.
oov=$(cat $lang/oov.int)
nj=$(cat $alidir/num_jobs) || exit 1;
silphonelist=$(cat $lang/phones/silence.csl)
ciphonelist=$(cat $lang/phones/context_indep.csl) || exit 1;
sdata=$data/split$nj
splice_opts=$(cat $alidir/splice_opts 2>/dev/null) # frame-splicing options.
cmvn_opts=$(cat $alidir/cmvn_opts 2>/dev/null)

# Dimension of the raw features, needed later by gmm-est-fmllr-raw.
raw_dim=$(feat-to-dim scp:$data/feats.scp -) || exit 1;
if ! [ "$raw_dim" -gt 0 ]; then
  echo "raw feature dim not set"
  exit 1
fi

mkdir -p $dir/log
# Carry the feature option files over when they exist.
for optfile in splice_opts cmvn_opts; do
  cp $alidir/$optfile $dir 2>/dev/null
done

# The phone set of the lang dir has to agree with the one the alignments used.
utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

echo $nj >$dir/num_jobs
# Re-split the data unless an existing split is newer than feats.scp.
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;

# Set up features.

# Both LDA matrices of the previous system are mandatory for this recipe.
if ! [[ -f $alidir/final.mat && -f $alidir/full.mat ]]; then
  echo "$0: expected to find  $alidir/final.mat and $alidir/full.mat"
  exit 1
fi

# Speaker-independent pipelines: CMVN + splicing, and the same followed by
# the previous system's final.mat.
sisplicedfeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- |"
sifeats="$sisplicedfeats transform-feats $alidir/final.mat ark:- ark:- |"


## Get initial fMLLR transforms (possibly from alignment dir)
# $cur_trans_dir records which directory currently holds raw_trans.JOB.
if [ -f $alidir/raw_trans.1 ]; then
  echo "$0: Using transforms from $alidir"
  cur_trans_dir=$alidir
else  # no raw transforms supplied: estimate them here (stage -6)
  if [ $stage -le -6 ]; then
    echo "$0: obtaining initial fMLLR transforms since not present in $alidir"
    # The next line is necessary because of $silphonelist otherwise being incorrect; would require
    # old $lang dir which would require another option.  Not needed anyway.
    full_lda_mat="get-full-lda-mat --print-args=false $alidir/final.mat $alidir/full.mat -|"
    # Posteriors from the old alignments, with silence weighted by
    # $silence_weight, drive gmm-est-fmllr-raw on the SI spliced features;
    # the per-speaker transforms are written to $dir/raw_trans.JOB.
    $cmd JOB=1:$nj $dir/log/fmllr.0.JOB.log \
      ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
      weight-silence-post $silence_weight $silphonelist $alidir/final.mdl ark:- ark:- \| \
      gmm-est-fmllr-raw --raw-feat-dim=$raw_dim --spk2utt=ark:$sdata/JOB/spk2utt $alidir/final.mdl \
        "$full_lda_mat" "$sisplicedfeats" ark:- ark:$dir/raw_trans.JOB || exit 1;
  fi
  cur_trans_dir=$dir
fi

# Speaker-adapted spliced features: the raw transform is applied after CMVN
# and before splicing.
splicedfeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$cur_trans_dir/raw_trans.JOB ark:- ark:- | splice-feats $splice_opts ark:- ark:- |"


# Stage -5: accumulate LDA statistics on the adapted spliced features (silence
# posteriors weighted by 0.0) and estimate $dir/0.mat plus $dir/full.mat.
if [ $stage -le -5 ]; then
  echo "Accumulating LDA statistics."
  $cmd JOB=1:$nj $dir/log/lda_acc.JOB.log \
    ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
      weight-silence-post 0.0 $silphonelist $alidir/final.mdl ark:- ark:- \| \
      acc-lda --rand-prune=$randprune $alidir/final.mdl "$splicedfeats" ark,s,cs:- \
       $dir/lda.JOB.acc || exit 1;
  est-lda --write-full-matrix=$dir/full.mat --dim=$dim $dir/0.mat $dir/lda.*.acc \
      2>$dir/log/lda_est.log || exit 1;  # writes $dir/0.mat and $dir/full.mat
  rm $dir/lda.*.acc
fi

# $cur_lda_iter names the most recent <iter>.mat in $dir; training features
# are the adapted spliced features projected with that matrix.
cur_lda_iter=0
feats="$splicedfeats transform-feats $dir/$cur_lda_iter.mat ark:- ark:- |"

# To build the tree, we use the previous directory's LDA transform, which
# is better as it has MLLT also.  It leads to higher auxiliary function
# improvements in tree building, which is generally a good thing.
tree_feats="$splicedfeats transform-feats $alidir/final.mat ark:- ark:- |"


# Stage -4 (only when $train_tree is true): accumulate tree statistics per job
# on $tree_feats, check that one stats file per job exists, sum them into
# $dir/treeacc and delete the per-job files.
if [ $stage -le -4 ] && $train_tree; then
  # Get tree stats.
  echo "$0: Accumulating tree stats"
  $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \
    acc-tree-stats $context_opts --ci-phones=$ciphonelist $alidir/final.mdl "$tree_feats" \
    "ark:gunzip -c $alidir/ali.JOB.gz|" $dir/JOB.treeacc || exit 1;
  [ "`ls $dir/*.treeacc | wc -w`" -ne "$nj" ] && echo "$0: Wrong #tree-accs" && exit 1;
  $cmd $dir/log/sum_tree_acc.log \
    sum-tree-stats $dir/treeacc $dir/*.treeacc || exit 1;
  rm $dir/*.treeacc
fi

# Stage -3 (only when $train_tree is true): derive and compile the clustering
# questions, then build $dir/tree with at most $numleaves leaves.
if [ $stage -le -3 ] && $train_tree; then
  echo "$0: Getting questions for tree clustering."
  # preparing questions, roots file...
  cluster-phones $context_opts $dir/treeacc $lang/phones/sets.int $dir/questions.int 2> $dir/log/questions.log || exit 1;
  cat $lang/phones/extra_questions.int >> $dir/questions.int
  compile-questions $context_opts $lang/topo $dir/questions.int $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1;

  echo "$0: Building the tree"
  $cmd $dir/log/build_tree.log \
    build-tree $context_opts --verbose=1 --max-leaves=$numleaves \
    --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \
    $dir/questions.qst $lang/topo $dir/tree || exit 1;
fi

# Stage -2: create the initial model $dir/1.mdl for the tree in $dir/tree.
if [ $stage -le -2 ]; then
  echo "$0: Initializing the model"
  # Since we trained the tree on different feats, we don't use gmm-init-model, which
  # would initialize the tree with invalid features.  This doesn't really matter anyway,
  # the first iteration of training will set suitable initial parameters.
  if ! $train_tree; then
    # No tree was built in the earlier stages, so reuse the one from the
    # alignment directory.  The copy must not happen when $train_tree is true:
    # it would overwrite the tree that build-tree just wrote to $dir/tree and
    # silently discard the requested number of leaves.
    cp $alidir/tree $dir/ || exit 1;
  fi
  $cmd JOB=1 $dir/log/init_model.log \
    gmm-init-model-flat $dir/tree $lang/topo $dir/1.mdl \
    "$tree_feats subset-feats ark:- ark:-|" || exit 1;
fi

# Stage -1: map the old alignments onto the new model/tree, reading
# $alidir/ali.JOB.gz and writing $dir/ali.JOB.gz.
if [ $stage -le -1 ]; then
  # Convert the alignments.
  echo "$0: Converting alignments from $alidir to use current tree"
  $cmd JOB=1:$nj $dir/log/convert.JOB.log \
    convert-ali $alidir/final.mdl $dir/1.mdl $dir/tree \
     "ark:gunzip -c $alidir/ali.JOB.gz|" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi

# Stage 0: compile training graphs from the transcripts into
# $dir/fsts.JOB.gz; skipped when $realign_iters is empty.
if [ $stage -le 0 ] && [ "$realign_iters" != "" ]; then
  echo "$0: Compiling graphs of transcripts"
  $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
    compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/1.mdl  $lang/L.fst  \
     "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $sdata/JOB/text |" \
      "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
fi

# Main training loop: passes 1 .. num_iters-1.  Each pass may (depending on
# the iteration lists) re-align, re-estimate the raw fMLLR transforms and/or
# re-estimate MLLT, and then always accumulates stats and updates the model.
# The heavy commands are skipped for passes below $stage, but the bookkeeping
# variables ($cur_trans_dir, $splicedfeats, $cur_lda_iter, $feats, $numgauss)
# are updated regardless, so a partial re-run sees the same state.
x=1
while [ $x -lt $num_iters ]; do
   echo Pass $x
  # Re-align with a silence-boosted copy of the current model.
  if echo $realign_iters | grep -w $x >/dev/null && [ $stage -le $x ]; then
    echo Aligning data
    mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/$x.mdl - |"
    $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \
      gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$mdl" \
      "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \
      "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
  fi

  if echo $fmllr_iters | grep -w $x >/dev/null; then
    if [ $stage -le $x ]; then
      echo Estimating fMLLR transforms
      # We estimate a transform that's additional to the previous transform;
      # we'll compose them.

      full_lda_mat="get-full-lda-mat --print-args=false $dir/$cur_lda_iter.mat $dir/full.mat - |"
      $cmd JOB=1:$nj $dir/log/fmllr.$x.JOB.log \
        ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:-  \| \
        weight-silence-post $silence_weight $silphonelist $dir/$x.mdl ark:- ark:- \| \
        gmm-est-fmllr-raw --raw-feat-dim=$raw_dim --spk2utt=ark:$sdata/JOB/spk2utt $dir/$x.mdl "$full_lda_mat" \
          "$splicedfeats" ark:- ark:$dir/tmp_trans.JOB || exit 1;
      # Per job: compose the new transform with the current one, move the
      # result to $dir/raw_trans.$n and delete the temporary file.  The
      # leading "!" turns a failure anywhere in the subshell into the error
      # branch.
      for n in `seq $nj`; do
        ! ( compose-transforms --b-is-affine=true \
          ark:$dir/tmp_trans.$n ark:$cur_trans_dir/raw_trans.$n ark:$dir/composed_trans.$n \
          && mv $dir/composed_trans.$n $dir/raw_trans.$n && \
          rm $dir/tmp_trans.$n ) 2>$dir/log/compose_transforms.$x.log \
          && echo "$0: Error composing transforms" && exit 1;
      done
    fi
    # From here on the transforms live in $dir; rebuild the feature strings.
    cur_trans_dir=$dir
    splicedfeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$cur_trans_dir/raw_trans.JOB ark:- ark:- | splice-feats $splice_opts ark:- ark:- |"
    feats="$splicedfeats transform-feats $dir/$cur_lda_iter.mat ark:- ark:- |"
  fi

  if echo $mllt_iters | grep -w $x >/dev/null; then
    if [ $stage -le $x ]; then
      echo "Estimating MLLT"
      $cmd JOB=1:$nj $dir/log/macc.$x.JOB.log \
        ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \
        weight-silence-post 0.0 $silphonelist $dir/$x.mdl ark:- ark:- \| \
        gmm-acc-mllt --rand-prune=$randprune  $dir/$x.mdl "$feats" ark:- $dir/$x.JOB.macc \
        || exit 1;
      # Estimate the new matrix, apply it to the model means in place, and
      # compose it with the current LDA matrix to obtain $dir/$x.mat.
      est-mllt $dir/$x.mat.new $dir/$x.*.macc 2> $dir/log/mupdate.$x.log || exit 1;
      gmm-transform-means  $dir/$x.mat.new $dir/$x.mdl $dir/$x.mdl \
        2> $dir/log/transform_means.$x.log || exit 1;
      compose-transforms --print-args=false $dir/$x.mat.new $dir/$cur_lda_iter.mat $dir/$x.mat || exit 1;
      rm $dir/$x.*.macc
    fi
    cur_lda_iter=$x
    feats="$splicedfeats transform-feats $dir/$cur_lda_iter.mat ark:- ark:- |"
  fi

  # Accumulate stats, check for one accumulator per job, and write the model
  # (and occupancies) for pass x+1; the inputs of this pass are then removed.
  if [ $stage -le $x ]; then
    $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
      gmm-acc-stats-ali $dir/$x.mdl "$feats" \
      "ark,s,cs:gunzip -c $dir/ali.JOB.gz|" $dir/$x.JOB.acc || exit 1;
    [ `ls $dir/$x.*.acc | wc -w` -ne "$nj" ] && echo "$0: Wrong #accs" && exit 1;
    $cmd $dir/log/update.$x.log \
      gmm-est --power=$power --write-occs=$dir/$[$x+1].occs --mix-up=$numgauss $dir/$x.mdl \
      "gmm-sum-accs - $dir/$x.*.acc |" $dir/$[$x+1].mdl || exit 1;
    rm $dir/$x.mdl $dir/$x.*.acc
    rm $dir/$x.occs 2>/dev/null
  fi
  # Raise the Gaussian target until iteration $max_iter_inc.
  [ $x -le $max_iter_inc ] && numgauss=$[$numgauss+$incgauss];
  x=$[$x+1];
done


# After the loop x equals num_iters, so $dir/$x.mdl is the last model written.
if [ $stage -le $x ]; then
  # Accumulate stats for "alignment model"-- this model is
  # computed with the speaker-independent features, but matches Gaussian-for-Gaussian
  # with the final speaker-adapted model.
  # $sifeats is rebuilt here with the latest LDA+MLLT matrix from $dir.
  sifeats="$sisplicedfeats transform-feats $dir/$cur_lda_iter.mat ark:- ark:- |"
  $cmd JOB=1:$nj $dir/log/acc_alimdl.JOB.log \
    ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:-  \| \
    gmm-acc-stats-twofeats $dir/$x.mdl "$feats" "$sifeats" \
    ark,s,cs:- $dir/$x.JOB.acc || exit 1;
  [ `ls $dir/$x.*.acc | wc -w` -ne "$nj" ] && echo "$0: Wrong #accs" && exit 1;
  # Update model.
  $cmd $dir/log/est_alimdl.log \
    gmm-est --power=$power --remove-low-count-gaussians=false $dir/$x.mdl \
    "gmm-sum-accs - $dir/$x.*.acc|" $dir/$x.alimdl  || exit 1;
  rm $dir/$x.*.acc
fi

# Replace any final.* links from a previous run with links to the outputs of
# the last iteration.
rm $dir/final.{mdl,alimdl,mat,occs} 2>/dev/null
ln -s $x.mdl $dir/final.mdl
ln -s $x.occs $dir/final.occs
ln -s $x.alimdl $dir/final.alimdl
ln -s $cur_lda_iter.mat $dir/final.mat


utils/summarize_warnings.pl $dir/log
# For each pass, scan the tail of the acc logs for "Overall avg like" and
# "Overall average logdet" lines, form a weighted average of field NF-3
# (weights taken from field NF-1) for each kind, and print the sum of the two
# averages.  NOTE(review): the field meanings depend on the log format of the
# Kaldi tools -- confirm.  x is reused as the loop variable here, which is
# safe because the final links have already been created.
(
  echo "$0: Likelihood evolution (not sure if this is totally correct):"
  for x in `seq $[$num_iters-1]`; do
    tail -n 30 $dir/log/acc.$x.*.log | awk '/Overall avg like/{l += $(NF-3)*$(NF-1); t += $(NF-1); }
        /Overall average logdet/{d += $(NF-3)*$(NF-1); t2 += $(NF-1);} 
        END{ d /= t2; l /= t; printf("%s ", d+l); } '
  done
  echo
) | tee $dir/log/summary.log

echo Done


================================================
FILE: egs/steps/train_sat.sh
================================================
#!/usr/bin/env bash
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.


# This does Speaker Adapted Training (SAT), i.e. train on
# fMLLR-adapted features.  It can be done on top of either LDA+MLLT, or
# delta and delta-delta features.  If there are no transforms supplied
# in the alignment directory, it will estimate transforms itself before
# building the tree (and in any case, it estimates transforms a number
# of times during training).


# Begin configuration section.
# Defaults for the options handled by parse_options.sh below.
stage=-5
exit_stage=-100          # you can use this to require it to exit at the
                         # beginning of a specific stage.  Not all values are
                         # supported.
fmllr_update_type=full   # passed as --fmllr-update-type to gmm-est-fmllr
cmd=run.pl
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
beam=10
retry_beam=40
careful=false            # passed as --careful to gmm-align-compiled
boost_silence=1.0        # Factor by which to boost silence likelihoods in alignment
context_opts=            # e.g. set this to "--context-width 5 --central-position 2" for quinphone.
realign_iters="10 20 30"
fmllr_iters="2 4 6 12"
silence_weight=0.0       # Weight on silence in fMLLR estimation.
num_iters=35             # Number of iterations of training
max_iter_inc=25          # Last iter to increase #Gauss on.
power=0.2                # Exponent for number of gaussians according to occurrence counts
cluster_thresh=-1        # for build-tree control final bottom-up clustering of leaves
phone_map=               # when set, forwarded as --phone-map to acc-tree-stats and convert-ali
train_tree=true          # executed as a command ("true"/"false") to gate the tree stages
tree_stats_opts=         # extra options for acc-tree-stats
cluster_phones_opts=     # extra options for cluster-phones
compile_questions_opts=  # extra options for compile-questions
# End configuration section.

# Log the exact invocation.
echo "$0 $@"

if [ -f path.sh ]; then
  . ./path.sh
fi
. parse_options.sh || exit 1;

# Exactly six positional arguments are required once options are consumed.
if [ $# -ne 6 ]; then
  echo "Usage: steps/train_sat.sh <#leaves> <#gauss> <data> <lang> <ali-dir> <exp-dir>"
  echo " e.g.: steps/train_sat.sh 2500 15000 data/train_si84 data/lang exp/tri2b_ali_si84 exp/tri3b"
  echo "Main options (for others, see top of script file)"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --config <config-file>                           # config containing options"
  echo "  --stage <stage>                                  # stage to do partial re-run from."
  exit 1
fi

numleaves=$1   # --max-leaves for build-tree; also the initial #Gauss target
totgauss=$2    # final target number of Gaussians
data=$3        # data directory (feats.scp, split<nj>/...)
lang=$4        # lang directory (phones.txt, topo, L.fst, ...)
alidir=$5      # source alignments and model
dir=$6         # output experiment directory

# Refuse to start when any of the essential inputs is missing.
for f in $data/feats.scp $lang/phones.txt $alidir/final.mdl $alidir/ali.1.gz; do
  if [ ! -f $f ]; then
    echo "train_sat.sh: no such file $f"
    exit 1
  fi
done

# Gaussian schedule: start from one Gaussian per leaf and grow by a fixed
# increment on each iteration up to $max_iter_inc.
numgauss=$numleaves
incgauss=$(( (totgauss - numgauss) / max_iter_inc ))  # per-iter #gauss increment

# Values read from the lang and alignment directories.
oov=$(cat $lang/oov.int)
nj=$(cat $alidir/num_jobs) || exit 1;
silphonelist=$(cat $lang/phones/silence.csl)
ciphonelist=$(cat $lang/phones/context_indep.csl) || exit 1;
sdata=$data/split$nj
# Feature options are inherited from the alignment dir; each may be absent.
splice_opts=$(cat $alidir/splice_opts 2>/dev/null) # frame-splicing options.
cmvn_opts=$(cat $alidir/cmvn_opts 2>/dev/null)
delta_opts=$(cat $alidir/delta_opts 2>/dev/null)

# Only build the --phone-map option when a map was actually given.
phone_map_opt=
if [ -n "$phone_map" ]; then
  phone_map_opt="--phone-map='$phone_map'"
fi

mkdir -p $dir/log
# Carry the feature option files over to the new directory when they exist.
for optfile in splice_opts cmvn_opts delta_opts; do
  cp $alidir/$optfile $dir 2>/dev/null
done

# The phone set of the lang dir has to agree with the one the alignments used.
utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

echo $nj >$dir/num_jobs
# Re-split the data unless an existing split is newer than feats.scp.
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;

# Set up features.

# A final.mat in the alignment dir selects the "lda" pipeline (splice +
# transform); otherwise delta features are used.
if [ -f $alidir/final.mat ]; then
  feat_type=lda
else
  feat_type=delta
fi
echo "$0: feature type is $feat_type"

## Set up speaker-independent features.
case $feat_type in
  delta)
    sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |"
    ;;
  lda)
    sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
    cp $alidir/final.mat $dir
    cp $alidir/full.mat $dir 2>/dev/null
    ;;
  *)
    echo "$0: invalid feature type $feat_type" && exit 1;
    ;;
esac

## Get initial fMLLR transforms (possibly from alignment dir)
# In both branches $feats becomes $sifeats followed by the per-speaker
# transform, and $cur_trans_dir records where trans.JOB currently lives.
if [ -f $alidir/trans.1 ]; then
  echo "$0: Using transforms from $alidir"
  feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$alidir/trans.JOB ark:- ark:- |"
  cur_trans_dir=$alidir
else
  if [ $stage -le -5 ]; then
    echo "$0: obtaining initial fMLLR transforms since not present in $alidir"
    # The next line is necessary because of $silphonelist otherwise being incorrect; would require
    # old $lang dir which would require another option.  Not needed anyway.
    [ ! -z "$phone_map" ] && \
       echo "$0: error: you must provide transforms if you use the --phone-map option." && exit 1;
    # Posteriors from the old alignments, with silence weighted by
    # $silence_weight, drive gmm-est-fmllr on the SI features; the
    # per-speaker transforms are written to $dir/trans.JOB.
    $cmd JOB=1:$nj $dir/log/fmllr.0.JOB.log \
      ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
      weight-silence-post $silence_weight $silphonelist $alidir/final.mdl ark:- ark:- \| \
      gmm-est-fmllr --fmllr-update-type=$fmllr_update_type \
      --spk2utt=ark:$sdata/JOB/spk2utt $alidir/final.mdl "$sifeats" \
      ark:- ark:$dir/trans.JOB || exit 1;
  fi
  feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |"
  cur_trans_dir=$dir
fi

# Stage -4 (only when $train_tree is true): accumulate tree statistics per job
# on the adapted features, check that one stats file per job exists, sum them
# into $dir/treeacc and delete the per-job files.
if [ $stage -le -4 ] && $train_tree; then
  # Get tree stats.
  echo "$0: Accumulating tree stats"
  $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \
    acc-tree-stats $context_opts $tree_stats_opts $phone_map_opt --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \
    "ark:gunzip -c $alidir/ali.JOB.gz|" $dir/JOB.treeacc || exit 1;
  [ "`ls $dir/*.treeacc | wc -w`" -ne "$nj" ] && echo "$0: Wrong #tree-accs" && exit 1;
  $cmd $dir/log/sum_tree_acc.log \
    sum-tree-stats $dir/treeacc $dir/*.treeacc || exit 1;
  rm $dir/*.treeacc
fi

# Stage -3 (only when $train_tree is true): derive and compile the clustering
# questions, then build $dir/tree with at most $numleaves leaves.
if [ $stage -le -3 ] && $train_tree; then
  echo "$0: Getting questions for tree clustering."
  # preparing questions, roots file...
  cluster-phones $cluster_phones_opts $context_opts $dir/treeacc $lang/phones/sets.int $dir/questions.int 2>$dir/log/questions.log || exit 1;
  cat $lang/phones/extra_questions.int >> $dir/questions.int
  compile-questions $context_opts $compile_questions_opts $lang/topo $dir/questions.int $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1;

  echo "$0: Building the tree"
  $cmd $dir/log/build_tree.log \
    build-tree $context_opts --verbose=1 --max-leaves=$numleaves \
    --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \
    $dir/questions.qst $lang/topo $dir/tree || exit 1;
fi

# Stage -2: create the initial model $dir/1.mdl.  With a freshly built tree it
# is initialized from the tree stats (which also writes $dir/1.occs);
# otherwise the tree is copied from the alignment dir and gmm-init-model-flat
# is used, in which case no 1.occs is requested.
if [ $stage -le -2 ]; then
  echo "$0: Initializing the model"
  if $train_tree; then
    gmm-init-model  --write-occs=$dir/1.occs  \
      $dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/log/init_model.log || exit 1;
    # A 'no stats' line in the log is echoed (by grep) and flagged, but it
    # does not stop the script.
    grep 'no stats' $dir/log/init_model.log && echo "This is a bad warning.";
    rm $dir/treeacc
  else
    cp $alidir/tree $dir/ || exit 1;
    $cmd JOB=1 $dir/log/init_model.log \
      gmm-init-model-flat $dir/tree $lang/topo $dir/1.mdl \
        "$feats subset-feats ark:- ark:-|" || exit 1;
  fi
fi

# Stage -1: map the old alignments onto the new model/tree, reading
# $alidir/ali.JOB.gz and writing $dir/ali.JOB.gz.
if [ $stage -le -1 ]; then
  # Convert the alignments.
  echo "$0: Converting alignments from $alidir to use current tree"
  $cmd JOB=1:$nj $dir/log/convert.JOB.log \
    convert-ali $phone_map_opt $alidir/final.mdl $dir/1.mdl $dir/tree \
     "ark:gunzip -c $alidir/ali.JOB.gz|" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi

# Optional early exit: this is the only place $exit_stage is checked in this
# part of the script, and only the value 0 triggers it.
[ "$exit_stage" -eq 0 ] && echo "$0: Exiting early: --exit-stage $exit_stage" && exit 0;

# Stage 0: compile training graphs from the transcripts into
# $dir/fsts.JOB.gz; skipped when $realign_iters is empty.
if [ $stage -le 0 ] && [ "$realign_iters" != "" ]; then
  echo "$0: Compiling graphs of transcripts"
  $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
    compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/1.mdl  $lang/L.fst  \
     "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $sdata/JOB/text |" \
      "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
fi

# Main training loop: passes 1 .. num_iters-1.  Each pass may (depending on
# the iteration lists) re-align and/or re-estimate the fMLLR transforms, and
# then always accumulates stats and updates the model.  The heavy commands are
# skipped for passes below $stage, but $feats, $cur_trans_dir and $numgauss
# are updated regardless, so a partial re-run sees the same state.
x=1
while [ $x -lt $num_iters ]; do
   echo Pass $x
  # Re-align with a silence-boosted copy of the current model.
  if echo $realign_iters | grep -w $x >/dev/null && [ $stage -le $x ]; then
    echo Aligning data
    mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/$x.mdl - |"
    $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \
      gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam --careful=$careful "$mdl" \
      "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \
      "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
  fi

  if echo $fmllr_iters | grep -w $x >/dev/null; then
    if [ $stage -le $x ]; then
      echo Estimating fMLLR transforms
      # We estimate a transform that's additional to the previous transform;
      # we'll compose them.
      $cmd JOB=1:$nj $dir/log/fmllr.$x.JOB.log \
        ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:-  \| \
        weight-silence-post $silence_weight $silphonelist $dir/$x.mdl ark:- ark:- \| \
        gmm-est-fmllr --fmllr-update-type=$fmllr_update_type \
        --spk2utt=ark:$sdata/JOB/spk2utt $dir/$x.mdl \
        "$feats" ark:- ark:$dir/tmp_trans.JOB || exit 1;
      # Per job: compose the new transform with the current one, move the
      # result to $dir/trans.$n and delete the temporary file.  The leading
      # "!" turns a failure anywhere in the subshell into the error branch.
      for n in `seq $nj`; do
        ! ( compose-transforms --b-is-affine=true \
          ark:$dir/tmp_trans.$n ark:$cur_trans_dir/trans.$n ark:$dir/composed_trans.$n \
          && mv $dir/composed_trans.$n $dir/trans.$n && \
          rm $dir/tmp_trans.$n ) 2>$dir/log/compose_transforms.$x.log \
          && echo "$0: Error composing transforms" && exit 1;
      done
    fi
    # From here on the transforms live in $dir.
    feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |"
    cur_trans_dir=$dir
  fi

  # Accumulate stats, check for one accumulator per job, and write the model
  # (and occupancies) for pass x+1; the inputs of this pass are then removed.
  if [ $stage -le $x ]; then
    $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
      gmm-acc-stats-ali $dir/$x.mdl "$feats" \
      "ark,s,cs:gunzip -c $dir/ali.JOB.gz|" $dir/$x.JOB.acc || exit 1;
    [ `ls $dir/$x.*.acc | wc -w` -ne "$nj" ] && echo "$0: Wrong #accs" && exit 1;
    $cmd $dir/log/update.$x.log \
      gmm-est --power=$power --write-occs=$dir/$[$x+1].occs --mix-up=$numgauss $dir/$x.mdl \
      "gmm-sum-accs - $dir/$x.*.acc |" $dir/$[$x+1].mdl || exit 1;
    rm $dir/$x.mdl $dir/$x.*.acc
    # 1.occs is not expected when train_tree=false, because the flat model
    # initialisation is run without --write-occs; keep that removal quiet
    # instead of printing a spurious "No such file" error.
    rm $dir/$x.occs 2>/dev/null
  fi
  # Raise the Gaussian target until iteration $max_iter_inc.
  [ $x -le $max_iter_inc ] && numgauss=$[$numgauss+$incgauss];
  x=$[$x+1];
done


# After the loop x equals num_iters, so $dir/$x.mdl is the last model written.
if [ $stage -le $x ]; then
  # Accumulate stats for "alignment model"-- this model is
  # computed with the speaker-independent features, but matches Gaussian-for-Gaussian
  # with the final speaker-adapted model.
  $cmd JOB=1:$nj $dir/log/acc_alimdl.JOB.log \
    ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:-  \| \
    gmm-acc-stats-twofeats $dir/$x.mdl "$feats" "$sifeats" \
    ark,s,cs:- $dir/$x.JOB.acc || exit 1;
  [ `ls $dir/$x.*.acc | wc -w` -ne "$nj" ] && echo "$0: Wrong #accs" && exit 1;
  # Update model.
  $cmd $dir/log/est_alimdl.log \
    gmm-est --power=$power --remove-low-count-gaussians=false $dir/$x.mdl \
    "gmm-sum-accs - $dir/$x.*.acc|" $dir/$x.alimdl  || exit 1;
  rm $dir/$x.*.acc
fi

# Replace any final.* links from a previous run with links to the outputs of
# the last iteration.
rm $dir/final.{mdl,alimdl,occs} 2>/dev/null
ln -s $x.mdl $dir/final.mdl
ln -s $x.occs $dir/final.occs
ln -s $x.alimdl $dir/final.alimdl


# Post-training diagnostics; their exit codes are not checked.
steps/diagnostic/analyze_alignments.sh --cmd "$cmd" $lang $dir

utils/summarize_warnings.pl $dir/log
# For each pass, scan the tail of the acc logs for "Overall avg like" and
# "Overall average logdet" lines, form a weighted average of field NF-3
# (weights taken from field NF-1) for each kind, and print the sum of the two
# averages.  NOTE(review): the field meanings depend on the log format of the
# Kaldi tools -- confirm.  x is reused as the loop variable here, which is
# safe because the final links have already been created.
(
  echo "$0: Likelihood evolution:"
  for x in `seq $[$num_iters-1]`; do
    tail -n 30 $dir/log/acc.$x.*.log | awk '/Overall avg like/{l += $(NF-3)*$(NF-1); t += $(NF-1); }
        /Overall average logdet/{d += $(NF-3)*$(NF-1); t2 += $(NF-1);}
        END{ d /= t2; l /= t; printf("%s ", d+l); } '
  done
  echo
) | tee $dir/log/summary.log


steps/info/gmm_dir_info.pl $dir

echo "$0: done training SAT system in $dir"

exit 0


================================================
FILE: egs/steps/train_sat_basis.sh
================================================
#!/usr/bin/env bash
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
# Copyright 2013  GoVivace Inc. (Author: Nagendra Goel), Apache 2.0

# This does Speaker Adapted Training (SAT), i.e. train on
# fMLLR-adapted features.  It can be done on top of either LDA+MLLT, or
# delta and delta-delta features.  If there are no transforms supplied
# in the alignment directory, it will estimate transforms itself before
# building the tree (and in any case, it estimates transforms a number
# of times during training).


# Begin configuration section.
# Defaults for the options of this script; they are processed by
# parse_options.sh below (see the usage text for --cmd, --config and --stage).
stage=-5
cmd=run.pl
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
beam=10
retry_beam=40
boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment
basis_fmllr_opts="--fmllr-min-count=22  --num-iters=10 --size-scale=0.2 --step-size-iters=3"
context_opts=  # e.g. set this to "--context-width 5 --central-position 2" for quinphone.
realign_iters="10 20 30";
fmllr_iters="2 4 6 12";
silence_weight=0.0 # Weight on silence in fMLLR estimation.
num_iters=35   # Number of iterations of training
max_iter_inc=25 # Last iter to increase #Gauss on.
power=0.2 # Exponent for number of gaussians according to occurrence counts
cluster_thresh=-1  # for build-tree control final bottom-up clustering of leaves
train_tree=true
# End configuration section.

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh
. parse_options.sh || exit 1;

# Exactly six positional arguments are required.  The usage text names this
# script (it previously pointed at steps/train_sat.sh).
if [ $# != 6 ]; then
  echo "Usage: steps/train_sat_basis.sh <#leaves> <#gauss> <data> <lang> <ali-dir> <exp-dir>"
  echo " e.g.: steps/train_sat_basis.sh 2500 15000 data/train_si84 data/lang exp/tri2b_ali_si84 exp/tri3b"
  echo "Main options (for others, see top of script file)"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --config <config-file>                           # config containing options"
  echo "  --stage <stage>                                  # stage to do partial re-run from."
  exit 1;
fi

# Positional arguments.
numleaves=$1   # passed to build-tree as --max-leaves; also the initial #Gauss
totgauss=$2    # target total number of Gaussians
data=$3
lang=$4
alidir=$5
dir=$6

# Fail early if a required input is missing.  The message is prefixed with
# $0 so that it names this script (it used to say "train_sat.sh").
for f in $data/feats.scp $lang/phones.txt $alidir/final.mdl $alidir/ali.1.gz; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

numgauss=$numleaves
incgauss=$[($totgauss-$numgauss)/$max_iter_inc]  # per-iter #gauss increment
oov=`cat $lang/oov.int`
nj=`cat $alidir/num_jobs` || exit 1;
silphonelist=`cat $lang/phones/silence.csl`
ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1;
sdata=$data/split$nj;
# The three option files below are optional: a missing file just yields an
# empty option string.
splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options.
cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null`
delta_opts=`cat $alidir/delta_opts 2>/dev/null`

mkdir -p $dir/log
# Carry the feature options over to the new experiment directory.
cp $alidir/splice_opts $dir 2>/dev/null # frame-splicing options.
cp $alidir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option.
cp $alidir/delta_opts $dir 2>/dev/null

utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

echo $nj >$dir/num_jobs
# Split the data into $nj parts unless an up-to-date split already exists.
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;

# Set up features.

# The presence of final.mat in the alignment directory selects the "lda"
# pipeline (splice + transform); otherwise deltas are appended.
if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"

## Set up speaker-independent features.
# $sifeats is a per-job feature pipeline string containing the literal JOB
# placeholder; it ends in "|" so further stages can be appended to it.
case $feat_type in
  delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";;
  lda) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
    # Keep a copy of the transform matrix in the new experiment directory.
    cp $alidir/final.mat $dir
    ;;
  *) echo "$0: invalid feature type $feat_type" && exit 1;
esac

## Get initial fMLLR transforms (possibly from alignment dir)
# $feats is the speaker-adapted feature pipeline: $sifeats followed by a
# per-speaker transform-feats stage reading trans.JOB.
if [ -f $alidir/trans.1 ]; then
  echo "$0: Using transforms from $alidir"
  feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$alidir/trans.JOB ark:- ark:- |"
  cur_trans_dir=$alidir
else
  if [ $stage -le -5 ]; then
    echo "$0: obtaining initial basis fMLLR transforms since not present in $alidir"
    # The next line is necessary because of $silphonelist otherwise being incorrect; would require
    # old $lang dir which would require another option.  Not needed anyway.
    # Accumulate basis statistics.  This job logs to basis_acc.0.JOB.log; it
    # used to share fmllr.0.JOB.log with the estimation job below, whose log
    # then overwrote this one.
    $cmd JOB=1:$nj $dir/log/basis_acc.0.JOB.log \
      ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:-  \| \
      weight-silence-post $silence_weight $silphonelist $alidir/final.mdl ark:- ark:- \| \
      gmm-post-to-gpost $alidir/final.mdl "$sifeats" ark:- ark:- \| \
      gmm-basis-fmllr-accs-gpost \
      $alidir/final.mdl "$sifeats" ark,s,cs:- $dir/basis.acc.JOB || exit 1;

    # Compute the basis matrices.
    # Note: the basis and the transforms below are written into $alidir,
    # not $dir.
    $cmd $dir/log/basis_training.log \
          gmm-basis-fmllr-training $alidir/final.mdl $alidir/fmllr.basis $dir/basis.acc.* || exit 1;
    # Estimate the per-speaker transforms using the basis.
    $cmd JOB=1:$nj $dir/log/fmllr.0.JOB.log \
      ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:-  \| \
      weight-silence-post $silence_weight $silphonelist $alidir/final.mdl ark:- ark:- \| \
      gmm-post-to-gpost $alidir/final.mdl "$sifeats" ark:- ark:- \| \
      gmm-est-basis-fmllr-gpost $basis_fmllr_opts --spk2utt=ark:$sdata/JOB/spk2utt  \
      $alidir/final.mdl $alidir/fmllr.basis "$sifeats"  ark,s,cs:- \
      ark:$alidir/trans.JOB || exit 1;
  fi
  # Set outside the stage guard so that $feats is always defined, also when
  # this stage is skipped on a partial re-run.
  feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$alidir/trans.JOB ark:- ark:- |"
  cur_trans_dir=$alidir
fi

# Stage -4: accumulate decision-tree statistics from the adapted features and
# the alignments in $alidir (skipped when train_tree is false).
if [ $stage -le -4 ] && $train_tree; then
  # Get tree stats.
  echo "$0: Accumulating tree stats"
  $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \
    acc-tree-stats $context_opts --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \
    "ark:gunzip -c $alidir/ali.JOB.gz|" $dir/JOB.treeacc || exit 1;
  # Exactly one .treeacc file per job is expected.
  [ "`ls $dir/*.treeacc | wc -w`" -ne "$nj" ] && echo "$0: Wrong #tree-accs" && exit 1;
  $cmd $dir/log/sum_tree_acc.log \
    sum-tree-stats $dir/treeacc $dir/*.treeacc || exit 1;
  rm $dir/*.treeacc
fi

# Stage -3: derive the tree questions and build the tree with at most
# $numleaves leaves (skipped when train_tree is false).
if [ $stage -le -3 ] && $train_tree; then
  echo "$0: Getting questions for tree clustering."
  # preparing questions, roots file...
  cluster-phones $context_opts $dir/treeacc $lang/phones/sets.int $dir/questions.int 2> $dir/log/questions.log || exit 1;
  # Append the extra questions supplied by the lang directory.
  cat $lang/phones/extra_questions.int >> $dir/questions.int
  compile-questions $context_opts $lang/topo $dir/questions.int $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1;

  echo "$0: Building the tree"
  $cmd $dir/log/build_tree.log \
    build-tree $context_opts --verbose=1 --max-leaves=$numleaves \
    --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \
    $dir/questions.qst $lang/topo $dir/tree || exit 1;
fi

# Stage -2: create the initial model $dir/1.mdl, either from the new tree and
# its statistics, or (train_tree=false) flat-initialized on the tree copied
# from $alidir.
if [ $stage -le -2 ]; then
  echo "$0: Initializing the model"
  if $train_tree; then
    gmm-init-model  --write-occs=$dir/1.occs  \
      $dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/log/init_model.log || exit 1;
    grep 'no stats' $dir/log/init_model.log && echo "This is a bad warning.";
    rm $dir/treeacc
  else
    cp $alidir/tree $dir/ || exit 1;
    # Note: this branch writes no 1.occs file.
    $cmd JOB=1 $dir/log/init_model.log \
      gmm-init-model-flat $dir/tree $lang/topo $dir/1.mdl \
        "$feats subset-feats ark:- ark:-|" || exit 1;
  fi
fi

# Stage -1: map the alignments from $alidir onto the new model and tree.
if [ $stage -le -1 ]; then
  # Convert the alignments.
  echo "$0: Converting alignments from $alidir to use current tree"
  $cmd JOB=1:$nj $dir/log/convert.JOB.log \
    convert-ali $alidir/final.mdl $dir/1.mdl $dir/tree \
     "ark:gunzip -c $alidir/ali.JOB.gz|" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi

# Stage 0: compile the training graphs; only needed when realignment is
# requested (realign_iters non-empty).
if [ $stage -le 0 ] && [ "$realign_iters" != "" ]; then
  echo "$0: Compiling graphs of transcripts"
  $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
    compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/1.mdl  $lang/L.fst  \
     "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $sdata/JOB/text |" \
      "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
fi

# Main training loop over iterations 1 .. num_iters-1.  Iteration x reads
# $dir/$x.mdl and writes $dir/<x+1>.mdl; iterations below --stage only update
# the bookkeeping variables (feats, numgauss, x).
x=1
while [ $x -lt $num_iters ]; do
  echo Pass $x

  # Realign on the iterations listed in realign_iters.
  if [ $stage -le $x ] && echo $realign_iters | grep -w $x >/dev/null; then
    echo Aligning data
    mdl="gmm-boost-silence --boost=$boost_silence $(cat $lang/phones/optional_silence.csl) $dir/$x.mdl - |"
    $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \
      gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$mdl" \
        "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \
        "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
  fi

  # Re-estimate the basis and the per-speaker transforms on the iterations
  # listed in fmllr_iters.
  if echo $fmllr_iters | grep -w $x >/dev/null; then
    if [ $stage -le $x ]; then
      # Note: it's not really necessary to re-estimate the basis each time
      # but this is the way the script does it right now.
      echo Estimating basis and fMLLR transforms
      $cmd JOB=1:$nj $dir/log/fmllr_est.$x.JOB.log \
        ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:-  \| \
        weight-silence-post $silence_weight $silphonelist $dir/$x.mdl ark:- ark:- \| \
        gmm-post-to-gpost $dir/$x.mdl "$feats" ark:- ark:- \| \
        gmm-basis-fmllr-accs-gpost \
          $dir/$x.mdl "$sifeats" ark,s,cs:- $dir/basis.acc.JOB || exit 1;

      # Compute the basis matrices.
      $cmd $dir/log/basis_training.log \
        gmm-basis-fmllr-training $dir/$x.mdl $dir/fmllr.basis $dir/basis.acc.* || exit 1;

      $cmd JOB=1:$nj $dir/log/fmllr_app.$x.JOB.log \
        ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:-  \| \
        weight-silence-post $silence_weight $silphonelist $dir/$x.mdl ark:- ark:- \| \
        gmm-post-to-gpost $dir/$x.mdl "$sifeats" ark:- ark:- \| \
        gmm-est-basis-fmllr-gpost $basis_fmllr_opts --spk2utt=ark:$sdata/JOB/spk2utt \
          $dir/$x.mdl $dir/fmllr.basis "$sifeats"  ark,s,cs:- \
          ark:$dir/trans.JOB || exit 1;

    fi
    # From here on the adapted features use the transforms stored in $dir
    # (done even when the estimation itself was skipped via --stage).
    feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |"
    cur_trans_dir=$dir
  fi

  # Accumulate statistics and update the model.
  if [ $stage -le $x ]; then
    next_x=$(($x+1))
    $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
      gmm-acc-stats-ali $dir/$x.mdl "$feats" \
      "ark,s,cs:gunzip -c $dir/ali.JOB.gz|" $dir/$x.JOB.acc || exit 1;
    # One accumulator file per job is expected.
    num_accs=$(ls $dir/$x.*.acc | wc -w)
    if [ $num_accs -ne "$nj" ]; then
      echo "$0: Wrong #accs" && exit 1;
    fi
    $cmd $dir/log/update.$x.log \
      gmm-est --power=$power --write-occs=$dir/$next_x.occs --mix-up=$numgauss $dir/$x.mdl \
      "gmm-sum-accs - $dir/$x.*.acc |" $dir/$next_x.mdl || exit 1;
    # The previous model, its accumulators and its occupancies are no longer
    # needed.
    rm $dir/$x.mdl $dir/$x.*.acc
    rm $dir/$x.occs
  fi

  # Grow the Gaussian target up to and including iteration max_iter_inc.
  if [ $x -le $max_iter_inc ]; then
    numgauss=$(($numgauss+$incgauss))
  fi
  x=$(($x+1))
done


# Final stage: build the "alignment model" $dir/$x.alimdl from the final
# model $dir/$x.mdl (x equals num_iters after the training loop).
if [ $stage -le $x ]; then
  # Accumulate stats for "alignment model"-- this model is
  # computed with the speaker-independent features, but matches Gaussian-for-Gaussian
  # with the final speaker-adapted model.
  $cmd JOB=1:$nj $dir/log/acc_alimdl.JOB.log \
    ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:-  \| \
    gmm-acc-stats-twofeats $dir/$x.mdl "$feats" "$sifeats" \
    ark,s,cs:- $dir/$x.JOB.acc || exit 1;
  # One accumulator file per job is expected.
  [ `ls $dir/$x.*.acc | wc -w` -ne "$nj" ] && echo "$0: Wrong #accs" && exit 1;
  # Update model.
  $cmd $dir/log/est_alimdl.log \
    gmm-est --power=$power --remove-low-count-gaussians=false $dir/$x.mdl \
      "gmm-sum-accs - $dir/$x.*.acc|" $dir/$x.alimdl  || exit 1;
  rm $dir/$x.*.acc
fi

# Replace any final.* links left by an earlier run with relative symlinks to
# the last iteration's files.
rm $dir/final.{mdl,alimdl,occs} 2>/dev/null
ln -s $x.mdl $dir/final.mdl
ln -s $x.occs $dir/final.occs
ln -s $x.alimdl $dir/final.alimdl


# Summarize the warnings found under $dir/log, then print (and save to
# $dir/log/summary.log) one number per training iteration.
utils/summarize_warnings.pl $dir/log
(
  echo "$0: Likelihood evolution:"
  for x in `seq $[$num_iters-1]`; do
    # For every "Overall avg like" / "Overall average logdet" line in the last
    # 30 lines of the iteration's acc logs, field NF-3 is averaged using field
    # NF-1 as the weight (presumably the frame count -- confirm against the
    # tools' log format); the printed number is the sum of both averages.
    # The divisions are guarded: when no matching line is found (e.g. the logs
    # of an iteration skipped via --stage are absent) "n/a" is printed instead
    # of dividing by zero, which is a fatal error in gawk.
    tail -n 30 $dir/log/acc.$x.*.log | awk '/Overall avg like/{l += $(NF-3)*$(NF-1); t += $(NF-1); }
        /Overall average logdet/{d += $(NF-3)*$(NF-1); t2 += $(NF-1);}
        END{ if (t > 0) { if (t2 > 0) { d /= t2; } l /= t; printf("%s ", d+l); }
             else { printf("n/a "); } } '
  done
  echo
) | tee $dir/log/summary.log

echo Done


================================================
FILE: egs/steps/train_segmenter.sh
================================================
#!/usr/bin/env bash

# Copyright 2013  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0

# Begin configuration.
# Defaults for the options of this script; they are processed by
# parse_options.sh below (see the usage text for --cmd, --config and --stage).
stage=-4 # For restarting a process that went part way.
config=
cmd=run.pl

scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
realign_iters="10 20 30";
num_iters=35    # Number of iterations of training
max_iter_inc=25 # Last iter to increase #Gauss on.
beam=10
retry_beam=40
boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment
power=0.25 # Exponent for number of gaussians according to occurrence counts
cluster_thresh=-1  # for build-tree control final bottom-up clustering of leaves
# End configuration.

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh;
. parse_options.sh || exit 1;

# Exactly six positional arguments are required.  The usage text names this
# script (it previously read "train_segmenter.sh deltas.sh" and gave a
# train_deltas.sh example).
if [ $# != 6 ]; then
   echo "Usage: steps/train_segmenter.sh <num-leaves> <tot-gauss> <data-dir> <lang-dir> <alignment-dir> <exp-dir>"
   echo "e.g.: steps/train_segmenter.sh 2000 10000 data/train_si84_half data/lang exp/mono_ali exp/tri1"
   echo "main options (for others, see top of script file)"
   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   echo "  --config <config-file>                           # config containing options"
   echo "  --stage <stage>                                  # stage to do partial re-run from."
   exit 1;
fi

# Positional arguments.
numleaves=$1   # passed to build-tree as --max-leaves; also the initial #Gauss
totgauss=$2    # target total number of Gaussians
data=$3
lang=$4
alidir=$5
dir=$6

# Fail early if a required input is missing.  The message is prefixed with
# $0 so that it names this script (it used to say "train_deltas.sh").
for f in $alidir/final.mdl $alidir/ali.1.gz $data/feats.scp $lang/phones.txt; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

numgauss=$numleaves
incgauss=$[($totgauss-$numgauss)/$max_iter_inc] # per-iter increment for #Gauss
oov=`cat $lang/oov.int` || exit 1;
ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1;
# cmvn_opts is optional: a missing file just yields an empty option string.
cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null`
nj=`cat $alidir/num_jobs` || exit 1;
mkdir -p $dir/log
echo $nj > $dir/num_jobs

utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

sdata=$data/split$nj;
# Split the data into $nj parts unless an up-to-date split already exists.
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;

# Per-job feature pipeline (contains the literal JOB placeholder): CMVN
# followed by add-deltas.
feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |"

rm $dir/.error 2>/dev/null

# Stage -3: accumulate decision-tree statistics from the features and the
# alignments in $alidir, then sum the per-job statistics into $dir/treeacc.
if [ $stage -le -3 ]; then
  echo "$0: accumulating tree stats"
  $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \
    acc-tree-stats  --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \
     "ark:gunzip -c $alidir/ali.JOB.gz|" $dir/JOB.treeacc || exit 1;
  sum-tree-stats $dir/treeacc $dir/*.treeacc 2>$dir/log/sum_tree_acc.log || exit 1;
  rm $dir/*.treeacc
fi

# Stage -2: derive the tree questions, build the tree, and create the initial
# model $dir/1.mdl, mixed up to $numgauss Gaussians.
if [ $stage -le -2 ]; then
  echo "$0: getting questions for tree-building, via clustering"
  # preparing questions, roots file...
  cluster-phones $dir/treeacc $lang/phones/sets.int $dir/questions.int 2> $dir/log/questions.log || exit 1;
  # Append the extra questions supplied by the lang directory.
  cat $lang/phones/extra_questions.int >> $dir/questions.int
  compile-questions $lang/topo $dir/questions.int $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1;

  echo "$0: building the tree"
  $cmd $dir/log/build_tree.log \
    build-tree --verbose=1 --max-leaves=$numleaves \
    --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \
    $dir/questions.qst $lang/topo $dir/tree || exit 1;

  gmm-init-model  --write-occs=$dir/1.occs  \
    $dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/log/init_model.log || exit 1;
  grep 'no stats' $dir/log/init_model.log && echo "This is a bad warning.";

  # 1.mdl is rewritten in place by the mix-up step.
  gmm-mixup --mix-up=$numgauss $dir/1.mdl $dir/1.occs $dir/1.mdl 2>$dir/log/mixup.log || exit 1;
  rm $dir/treeacc
fi

# Stage -1: map the alignments from $alidir onto the new model and tree.
if [ $stage -le -1 ]; then
  # Convert the alignments.
  echo "$0: converting alignments from $alidir to use current tree"
  $cmd JOB=1:$nj $dir/log/convert.JOB.log \
    convert-ali $alidir/final.mdl $dir/1.mdl $dir/tree \
     "ark:gunzip -c $alidir/ali.JOB.gz|" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi

# Stage 0: compile one training graph per transcript; words missing from
# words.txt are mapped to $oov.
if [ $stage -le 0 ]; then
  echo "$0: compiling graphs of transcripts"
  $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
    compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/1.mdl  $lang/L.fst  \
     "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $data/split$nj/JOB/text |" \
      "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
fi

# Main training loop over iterations 1 .. num_iters-1.  Iteration x reads
# $dir/$x.mdl and writes $dir/<x+1>.mdl; iterations below --stage only update
# numgauss and x.
x=1
while [ $x -lt $num_iters ]; do
  echo "$0: training pass $x"
  if [ $stage -le $x ]; then
    next_x=$(($x+1))

    # Realign on the iterations listed in realign_iters.
    if echo $realign_iters | grep -w $x >/dev/null; then
      echo "$0: aligning data"
      mdl="gmm-boost-silence --boost=$boost_silence $(cat $lang/phones/optional_silence.csl) $dir/$x.mdl - |"
      $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \
        gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$mdl" \
         "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \
         "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
    fi

    # Accumulate statistics per job, then sum them and update the model.
    $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
      gmm-acc-stats-ali  $dir/$x.mdl "$feats" \
       "ark,s,cs:gunzip -c $dir/ali.JOB.gz|" $dir/$x.JOB.acc || exit 1;
    $cmd $dir/log/update.$x.log \
      gmm-est --mix-up=$numgauss --power=$power \
        --write-occs=$dir/$next_x.occs $dir/$x.mdl \
       "gmm-sum-accs - $dir/$x.*.acc |" $dir/$next_x.mdl || exit 1;

    # The previous model, its accumulators and its occupancies are no longer
    # needed.
    rm $dir/$x.mdl $dir/$x.*.acc
    rm $dir/$x.occs
  fi

  # Grow the Gaussian target up to and including iteration max_iter_inc.
  if [ $x -le $max_iter_inc ]; then
    numgauss=$(($numgauss+$incgauss))
  fi
  x=$(($x+1))
done

# Point final.mdl / final.occs at the last iteration's files.  Both links
# are removed first: previously only final.mdl was, so on a re-run
# "ln -s ... final.occs" failed because the old link still existed.
rm $dir/final.mdl $dir/final.occs 2>/dev/null
ln -s $x.mdl $dir/final.mdl
ln -s $x.occs $dir/final.occs

# Summarize warning messages...
utils/summarize_warnings.pl  $dir/log

echo "$0: Done training system with delta+delta-delta features in $dir"


================================================
FILE: egs/steps/train_sgmm2.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.

# SGMM training, with speaker vectors.  This script would normally be called on
# top of fMLLR features obtained from a conventional system, but it also works
# on top of any type of speaker-independent features (based on
# deltas+delta-deltas or LDA+MLLT).  For more info on SGMMs, see the paper "The
# subspace Gaussian mixture model--A structured model for speech recognition".
# (Computer Speech and Language, 2011).

# Begin configuration section.
# Defaults for the options of this script; they are processed by
# parse_options.sh below.
cmd=run.pl
stage=-6 # use this to resume partially finished training
context_opts= # e.g. set it to "--context-width=5 --central-position=2"  for a
# quinphone system.
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
num_iters=25   # Total number of iterations of training
num_iters_alimdl=3 # Number of iterations for estimating alignment model.
max_iter_inc=15 # Last iter to increase #substates on.
realign_iters="5 10 15"; # Iters to realign on.
spkvec_iters="5 8 12 17" # Iters to estimate speaker vectors on.
increase_iters="6 10 14"; # Iters on which to increase phn dim and/or spk dim;
    # rarely necessary, and if it is, only the 1st will normally be necessary.
rand_prune=0.1 # Randomized-pruning parameter for posteriors, to speed up training.
               # Bigger -> more pruning; zero = no pruning.
phn_dim=  # You can use this to set the phonetic subspace dim. [default: feat-dim+1]
spk_dim=  # You can use this to set the speaker subspace dim. [default: feat-dim]
power=0.25 # Exponent for number of gaussians according to occurrence counts
beam=8
self_weight=0.9
retry_beam=40
leaves_per_group=5 # Relates to the SCTM (state-clustered tied-mixture) aspect:
                   # average number of pdfs in a "group" of pdfs.
update_m_iter=4 # First iteration on which the projections M are updated.
spk_dep_weights=true # [Symmetric SGMM] set this to false if you don't want "u" (i.e. to turn off
                      # the symmetric SGMM).
# End configuration section.

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;


# Exactly seven positional arguments are required.
if [ $# != 7 ]; then
  echo "Usage: steps/train_sgmm2.sh <num-leaves> <num-substates> <data> <lang> <ali-dir> <ubm> <exp-dir>"
  echo " e.g.: steps/train_sgmm2.sh 5000 8000 data/train_si84 data/lang \\"
  echo "                      exp/tri3b_ali_si84 exp/ubm4a/final.ubm exp/sgmm4a"
  echo "main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --silence-weight <sil-weight>                    # weight for silence (e.g. 0.5 or 0.0)"
  echo "  --num-iters <#iters>                             # Number of iterations of E-M"
  echo "  --leaves-per-group <#leaves>                     # Average #leaves shared in one group"
  exit 1;
fi

# Positional arguments.
num_pdfs=$1  # final #leaves, at 2nd level of tree.
totsubstates=$2
data=$3
lang=$4
alidir=$5
ubm=$6
dir=$7

# Number of first-level tree leaves (pdf groups).
num_groups=$(($num_pdfs/$leaves_per_group))
# First entry of spkvec_iters.
first_spkvec_iter=$(echo $spkvec_iters | awk '{print $1}') || exit 1;
ciphonelist=$(cat $lang/phones/context_indep.csl) || exit 1;

# Check some files.
for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $ubm $alidir/num_jobs; do
  if [ ! -f $f ]; then
    echo "$0: no such file $f" && exit 1;
  fi
done


# Set some variables.
oov=$(cat $lang/oov.int)
silphonelist=$(cat $lang/phones/silence.csl)
# Initial #-substates: the number of groups when self_weight is exactly
# "1.0", otherwise the number of pdfs.
case "$self_weight" in
  1.0) numsubstates=$num_groups ;;
  *)   numsubstates=$num_pdfs ;;
esac
# per-iter increment for #substates
incsubstates=$((($totsubstates-$numsubstates)/$max_iter_inc))
feat_dim=$(gmm-info $alidir/final.mdl 2>/dev/null | awk '/feature dimension/{print $NF}') || exit 1;
# make sure it's numeric.
if ! [ $feat_dim -eq $feat_dim ]; then
  exit 1;
fi
# Default subspace dimensions when not given on the command line.
if [ -z $phn_dim ]; then
  phn_dim=$(($feat_dim+1))
fi
if [ -z $spk_dim ]; then
  spk_dim=$feat_dim
fi
nj=$(cat $alidir/num_jobs) || exit 1;
# Optional files: a missing one just yields an empty option string.
splice_opts=$(cat $alidir/splice_opts 2>/dev/null) # frame-splicing options.
cmvn_opts=$(cat $alidir/cmvn_opts 2>/dev/null)

mkdir -p $dir/log
# Carry the frame-splicing and cmn/cmvn options over to the new directory.
for opts_file in splice_opts cmvn_opts; do
  cp $alidir/$opts_file $dir 2>/dev/null
done
echo $nj > $dir/num_jobs
sdata=$data/split$nj;
# Split the data into $nj parts unless an up-to-date split already exists.
if ! [[ -d $sdata && $data/feats.scp -ot $sdata ]]; then
  split_data.sh $data $nj || exit 1;
fi

utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

spkvecs_opt=  # Empty option for now, until we estimate the speaker vectors.
# Per-job Gaussian-selection option (contains the literal JOB placeholder).
gselect_opt="--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|"

## Set up features.
# The presence of final.mat in the alignment directory selects the "lda"
# pipeline (splice + transform); otherwise deltas are appended.
if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"

case $feat_type in
  delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
  lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
    # Keep a copy of the transform matrix in the new experiment directory.
    cp $alidir/final.mat $dir
    ;;
  *) echo "$0: invalid feature type $feat_type" && exit 1;
esac
# If the alignment directory holds per-speaker transforms, apply them:
# trans.* are appended after the pipeline above, while raw_trans.* replace
# the pipeline with one that applies the transform before splicing and
# final.mat.
if [ -f $alidir/trans.1 ]; then
  echo "$0: using transforms from $alidir"
  feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$alidir/trans.JOB ark:- ark:- |"
elif [ -f $alidir/raw_trans.1 ]; then
  echo "$0: using raw-fMLLR transforms from $alidir"
  feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$alidir/raw_trans.JOB ark:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
fi
##
##


# Stage -6: accumulate decision-tree statistics from the features and the
# alignments in $alidir, then sum them into $dir/treeacc.
if [ $stage -le -6 ]; then
  echo "$0: accumulating tree stats"
  $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \
    acc-tree-stats $context_opts --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \
    "ark:gunzip -c $alidir/ali.JOB.gz|" $dir/JOB.treeacc || exit 1;
  # Exactly one .treeacc file per job is expected.
  [ "`ls $dir/*.treeacc | wc -w`" -ne "$nj" ] && echo "$0: Wrong #tree-stats" && exit 1;
  sum-tree-stats $dir/treeacc $dir/*.treeacc 2>$dir/log/sum_tree_acc.log || exit 1;
  rm $dir/*.treeacc
fi

# Stage -5: derive the tree questions and build the two-level tree
# ($num_groups first-level leaves, $num_pdfs second-level leaves); also
# writes $dir/pdf2group.map.
if [ $stage -le -5 ]; then
  echo "$0: Getting questions for tree clustering."
  # preparing questions, roots file...
  cluster-phones $context_opts $dir/treeacc $lang/phones/sets.int $dir/questions.int 2> $dir/log/questions.log || exit 1;
  # Append the extra questions supplied by the lang directory.
  cat $lang/phones/extra_questions.int >> $dir/questions.int
  compile-questions $context_opts $lang/topo $dir/questions.int $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1;

  echo "$0: Building the tree"
  $cmd $dir/log/build_tree.log \
    build-tree-two-level $context_opts --binary=false --verbose=1 --max-leaves-first=$num_groups \
     --max-leaves-second=$num_pdfs $dir/treeacc $lang/phones/roots.int \
     $dir/questions.qst $lang/topo $dir/tree $dir/pdf2group.map || exit 1;
fi

# Stage -4: create the initial SGMM $dir/0.mdl from the tree and the UBM.
if [ $stage -le -4 ]; then
  echo "$0: Initializing the model"
  # Note: if phn_dim > feat_dim+1 or spk_dim > feat_dim, these dims
  # will be truncated on initialization.
  $cmd $dir/log/init_sgmm.log \
    sgmm2-init --spk-dep-weights=$spk_dep_weights --self-weight=$self_weight \
       --pdf-map=$dir/pdf2group.map --phn-space-dim=$phn_dim \
       --spk-space-dim=$spk_dim $lang/topo $dir/tree $ubm $dir/0.mdl || exit 1;
fi

# Stage -3: per-job Gaussian selection, stored as gselect.JOB.gz (read back
# through $gselect_opt).
if [ $stage -le -3 ]; then
  echo "$0: doing Gaussian selection"
  $cmd JOB=1:$nj $dir/log/gselect.JOB.log \
    sgmm2-gselect $dir/0.mdl "$feats" \
    "ark,t:|gzip -c >$dir/gselect.JOB.gz" || exit 1;
fi

# Stage -2: compile one training graph per transcript; words missing from
# words.txt are mapped to $oov.  Note that sym2int.pl is invoked without a
# utils/ prefix here, so it must be found via PATH.
if [ $stage -le -2 ]; then
  echo "$0: compiling training graphs"
  text="ark:sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $sdata/JOB/text|"
  $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
    compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/0.mdl  $lang/L.fst  \
    "$text" "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
fi

# Stage -1: map the alignments from $alidir onto the new model and tree.
if [ $stage -le -1 ]; then
  echo "$0: converting alignments"
  $cmd JOB=1:$nj $dir/log/convert_ali.JOB.log \
    convert-ali $alidir/final.mdl $dir/0.mdl $dir/tree "ark:gunzip -c $alidir/ali.JOB.gz|" \
    "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi


# Main training loop over iterations 0 .. num_iters-1.  Iteration x reads
# $dir/$x.mdl and writes $dir/<x+1>.mdl; iterations below --stage only update
# the bookkeeping variables (spkvecs_opt, flags, numsubstates, x).
x=0
while [ $x -lt $num_iters ]; do
   echo "$0: training pass $x ... "
   # Realign on the iterations listed in realign_iters.
   if echo $realign_iters | grep -w $x >/dev/null && [ $stage -le $x ]; then
     echo "$0: re-aligning data"
     $cmd JOB=1:$nj $dir/log/align.$x.JOB.log  \
       sgmm2-align-compiled $spkvecs_opt $scale_opts "$gselect_opt" \
       --utt2spk=ark:$sdata/JOB/utt2spk --beam=$beam --retry-beam=$retry_beam \
       $dir/$x.mdl "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \
       "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
   fi
   # (Re-)estimate the speaker vectors on the iterations listed in
   # spkvec_iters.  They are written to a temporary file and moved into place
   # only if the estimation succeeded.
   if [ $spk_dim -gt 0 ] && echo $spkvec_iters | grep -w $x >/dev/null; then
     if [ $stage -le $x ]; then
       $cmd JOB=1:$nj $dir/log/spkvecs.$x.JOB.log \
         ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \
         weight-silence-post 0.01 $silphonelist $dir/$x.mdl ark:- ark:- \| \
         sgmm2-est-spkvecs --rand-prune=$rand_prune --spk2utt=ark:$sdata/JOB/spk2utt \
         $spkvecs_opt "$gselect_opt" $dir/$x.mdl "$feats" ark,s,cs:- \
         ark:$dir/tmp_vecs.JOB '&&' mv $dir/tmp_vecs.JOB $dir/vecs.JOB || exit 1;
     fi
     # From now on every tool is given the speaker vectors.
     spkvecs_opt="--spk-vecs=ark:$dir/vecs.JOB"
   fi
   # Choose which parameter types to update on this iteration.
   if [ $x -eq 0 ]; then
     flags=vwcSt # on the first iteration, don't update projections M or N
   elif [ $spk_dim -gt 0 -a $[$x%2] -eq 1 -a $x -ge $first_spkvec_iter ]; then
     # Update N if we have speaker-vector space and x is odd,
     # and we've already updated the speaker vectors...
     flags=vNwSct
   else
     if [ $x -ge $update_m_iter ]; then
       flags=vMwSct # update M.
     else
       flags=vwSct # no M on early iters, if --update-m-iter option given.
     fi
   fi
   $spk_dep_weights && [ $x -ge $first_spkvec_iter ] && flags=${flags}u; # update
   # spk-weight projections "u".

   if [ $stage -le $x ]; then
     $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
       sgmm2-acc-stats $spkvecs_opt --utt2spk=ark:$sdata/JOB/utt2spk \
       --update-flags=$flags "$gselect_opt" --rand-prune=$rand_prune \
       $dir/$x.mdl "$feats" "ark,s,cs:gunzip -c $dir/ali.JOB.gz | ali-to-post ark:- ark:-|" \
       $dir/$x.JOB.acc || exit 1;
   fi

   # The next option is needed if the user specifies a phone or speaker sub-space
   # dimension that's higher than the "normal" one.
   increase_dim_opts=
   # The iterations come from the increase_iters config variable.  This test
   # used to read $increase_dim_iters, which is never set in this script, so
   # the dimension increase could never trigger.
   if echo $increase_iters | grep -w $x >/dev/null; then
     increase_dim_opts="--increase-phn-dim=$phn_dim --increase-spk-dim=$spk_dim"
     # Note: the command below might have a null effect on some iterations.
     if [ $spk_dim -gt $feat_dim ]; then
       # Resize the existing speaker vectors to the new dimension.  Fixed
       # here: the job launcher is "$cmd" (was the bare word "cmd"), and the
       # temporary file uses the literal JOB placeholder (was the unset shell
       # variable "$JOB", so the following mv could not find the file).
       $cmd JOB=1:$nj $dir/log/copy_vecs.$x.JOB.log \
         copy-vector --print-args=false --change-dim=$spk_dim \
         ark:$dir/vecs.JOB ark:$dir/vecs_tmp.JOB '&&' \
         mv $dir/vecs_tmp.JOB $dir/vecs.JOB || exit 1;
     fi
   fi

   # Sum the accumulators and update the model.
   if [ $stage -le $x ]; then
     $cmd $dir/log/update.$x.log \
       sgmm2-est --update-flags=$flags --split-substates=$numsubstates \
       $increase_dim_opts --power=$power --write-occs=$dir/$[$x+1].occs \
       $dir/$x.mdl "sgmm2-sum-accs - $dir/$x.*.acc|" $dir/$[$x+1].mdl || exit 1;
     rm $dir/$x.mdl $dir/$x.*.acc $dir/$x.occs 2>/dev/null
   fi
   # Grow the substate target on iterations before max_iter_inc.
   if [ $x -lt $max_iter_inc ]; then
     numsubstates=$[$numsubstates+$incsubstates]
   fi
   x=$[$x+1];
done

# Replace any final.* links left by an earlier run with relative symlinks to
# the last iteration's files.
rm $dir/final.mdl $dir/final.occs 2>/dev/null
ln -s $x.mdl $dir/final.mdl
ln -s $x.occs $dir/final.occs

# Build the alignment model (only when a speaker subspace is in use).
# x continues counting from num_iters; pass x reads $cur_alimdl and writes
# $dir/<x+1>.alimdl, for num_iters_alimdl passes in total.
if [ $spk_dim -gt 0 ]; then
  # We need to create an "alignment model" that's been trained
  # without the speaker vectors, to do the first-pass decoding with
  # at test time.

  # We do this for a few iters, in this recipe.
  final_mdl=$dir/$x.mdl
  cur_alimdl=$dir/$x.mdl
  while [ $x -lt $[$num_iters+$num_iters_alimdl] ]; do
    echo "$0: building alignment model (pass $x)"
    if [ $x -eq $num_iters ]; then # 1st pass of building alimdl.
      flags=MwcS # don't update v the first time.  Note-- we never update transitions.
      # they wouldn't change anyway as we use the same alignment as previously.
    else
      flags=vMwcS
    fi
    if [ $stage -le $x ]; then
      # The Gaussian-level posteriors come from the final model (with speaker
      # vectors); the statistics are accumulated for the current alignment
      # model.
      $cmd JOB=1:$nj $dir/log/acc_ali.$x.JOB.log \
        ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \
        sgmm2-post-to-gpost $spkvecs_opt "$gselect_opt" \
         --utt2spk=ark:$sdata/JOB/utt2spk $final_mdl "$feats" ark,s,cs:- ark:- \| \
        sgmm2-acc-stats-gpost --rand-prune=$rand_prune --update-flags=$flags \
          $cur_alimdl "$feats" ark,s,cs:- $dir/$x.JOB.aliacc || exit 1;
      $cmd $dir/log/update_ali.$x.log \
        sgmm2-est --update-flags=$flags --remove-speaker-space=true --power=$power \
        $cur_alimdl "sgmm2-sum-accs - $dir/$x.*.aliacc|" $dir/$[$x+1].alimdl || exit 1;
      rm $dir/$x.*.aliacc || exit 1;
      # On the first pass the input is the final .mdl (kept); on later passes
      # the input .alimdl is an intermediate file and is removed.
      [ $x -gt $num_iters ]  && rm $dir/$x.alimdl
    fi
    cur_alimdl=$dir/$[$x+1].alimdl
    x=$[$x+1]
  done
  # Relative symlink to the last alignment model.
  rm $dir/final.alimdl 2>/dev/null
  ln -s $x.alimdl $dir/final.alimdl
fi

utils/summarize_warnings.pl $dir/log

echo Done


================================================
FILE: egs/steps/train_sgmm2_group.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.

# This version of the train_sgmm2 script has several jobs on each machine, and adds the
# accumulators up in memory.

# SGMM training, with speaker vectors.  This script would normally be called on
# top of fMLLR features obtained from a conventional system, but it also works
# on top of any type of speaker-independent features (based on
# deltas+delta-deltas or LDA+MLLT).  For more info on SGMMs, see the paper "The
# subspace Gaussian mixture model--A structured model for speech recognition".
# (Computer Speech and Language, 2011).

# Begin configuration section.
# Defaults for the options of this script; parse_options.sh is sourced at the
# end of this section (presumably to let --<option> arguments override them).
cmd=run.pl
stage=-6 # use this to resume partially finished training
context_opts= # e.g. set it to "--context-width=5 --central-position=2"  for a
# quinphone system.
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
num_iters=25   # Total number of iterations of training
num_iters_alimdl=3 # Number of iterations for estimating alignment model.
max_iter_inc=15 # Last iter to increase #substates on.
realign_iters="5 10 15"; # Iters to realign on.
spkvec_iters="5 8 12 17" # Iters to estimate speaker vectors on.
increase_iters="6 10 14"; # Iters on which to increase phn dim and/or spk dim;
    # rarely necessary, and if it is, only the 1st will normally be necessary.
rand_prune=0.1 # Randomized-pruning parameter for posteriors, to speed up training.
               # Bigger -> more pruning; zero = no pruning.
phn_dim=  # You can use this to set the phonetic subspace dim. [default: feat-dim+1]
spk_dim=  # You can use this to set the speaker subspace dim. [default: feat-dim]
power=0.25 # Exponent for number of gaussians according to occurrence counts
beam=8 # Passed as --beam to sgmm2-align-compiled when re-aligning.
self_weight=0.9 # Passed as --self-weight to sgmm2-init.  If it is exactly "1.0",
                # the initial #substates is the number of groups rather than
                # the number of pdfs.
retry_beam=40 # Passed as --retry-beam to sgmm2-align-compiled when re-aligning.
leaves_per_group=5 # Relates to the SCTM (state-clustered tied-mixture) aspect:
                   # average number of pdfs in a "group" of pdfs.
update_m_iter=4 # First iteration on which the projections M are updated.
spk_dep_weights=true # [Symmetric SGMM] set this to false if you don't want "u"
                      # (i.e. to turn off symmetric SGMM).
group=3 # Number of jobs to group together on a single machine, and add the
        # stats locally.  Don't confuse this with leaves_per_group and so on,
        # they are totally unrelated.
parallel_opts=  # this option is now ignored.

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;


# Exactly seven positional arguments are required; otherwise print the usage
# text and stop.  The usage names this script (not train_sgmm2.sh) and only
# lists options that exist in the configuration section above.
if [ $# != 7 ]; then
  echo "Usage: steps/train_sgmm2_group.sh <num-leaves> <num-substates> <data> <lang> <ali-dir> <ubm> <exp-dir>"
  echo " e.g.: steps/train_sgmm2_group.sh 5000 8000 data/train_si84 data/lang \\"
  echo "                      exp/tri3b_ali_si84 exp/ubm4a/final.ubm exp/sgmm4a"
  echo "main options (for others, see top of script file)"
  echo "  --group <n>                                      # number of jobs on one machine, default 3."
  echo "  --config <config-file>                           # config containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --stage <stage>                                  # stage to resume partially finished training from"
  echo "  --num-iters <#iters>                             # Number of iterations of E-M"
  exit 1;
fi

# Positional arguments.
num_pdfs=$1  # final #leaves, at 2nd level of tree.
totsubstates=$2
data=$3
lang=$4
alidir=$5
ubm=$6
dir=$7

# Number of first-level tree leaves ("groups") for the requested #pdfs.
num_groups=$(($num_pdfs/$leaves_per_group))
# The first entry of spkvec_iters.
first_spkvec_iter=$(echo $spkvec_iters | awk '{print $1}') || exit 1;
ciphonelist=$(cat $lang/phones/context_indep.csl) || exit 1;

# Refuse to start if any required input file is missing.
for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $ubm $alidir/num_jobs; do
  if [ ! -f $f ]; then
    echo "$0: no such file $f"
    exit 1;
  fi
done


# Derived settings.
oov=$(cat $lang/oov.int)
silphonelist=$(cat $lang/phones/silence.csl)
# Initial #-substates: one per pdf, unless self_weight is exactly "1.0", in
# which case one per group.
numsubstates=$num_pdfs
if [ "$self_weight" == "1.0" ]; then
  numsubstates=$num_groups
fi
# Per-iteration increment for #substates.
incsubstates=$(( ($totsubstates-$numsubstates)/$max_iter_inc ))
feat_dim=$(gmm-info $alidir/final.mdl 2>/dev/null | awk '/feature dimension/{print $NF}') || exit 1;
# The comparison below fails (and we exit) unless feat_dim is numeric.
if ! [ $feat_dim -eq $feat_dim ]; then
  exit 1;
fi
# Subspace dimensions default to feat-dim+1 (phonetic) and feat-dim (speaker).
if [ -z "$phn_dim" ]; then
  phn_dim=$(($feat_dim+1))
fi
if [ -z "$spk_dim" ]; then
  spk_dim=$feat_dim
fi
nj=$(cat $alidir/num_jobs) || exit 1;
splice_opts=$(cat $alidir/splice_opts 2>/dev/null) # frame-splicing options.
cmvn_opts=$(cat $alidir/cmvn_opts 2>/dev/null)

# Prepare the output directory and carry the feature options over to it.
mkdir -p $dir/log
cp $alidir/splice_opts $dir 2>/dev/null # frame-splicing options.
cp $alidir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option.
echo $nj > $dir/num_jobs
sdata=$data/split$nj;
# Split the data unless the split directory exists and is newer than feats.scp.
if ! [[ -d $sdata && $data/feats.scp -ot $sdata ]]; then
  split_data.sh $data $nj || exit 1;
fi

utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

spkvecs_opt=  # Empty option for now, until we estimate the speaker vectors.
gselect_opt="--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|"

## Set up features.
# An LDA matrix in the alignment directory selects the spliced+LDA pipeline;
# without one we use deltas.
if [ -f $alidir/final.mat ]; then
  feat_type=lda
else
  feat_type=delta
fi
echo "$0: feature type is $feat_type"

# "JOB" inside the pipeline strings is a literal placeholder.
if [ "$feat_type" = delta ]; then
  feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |"
elif [ "$feat_type" = lda ]; then
  feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
  cp $alidir/final.mat $dir
else
  echo "$0: invalid feature type $feat_type" && exit 1;
fi

# Speaker transforms from the alignment directory, if present: trans.* are
# appended to the pipeline, raw_trans.* replace the pipeline so that they are
# applied before splicing.
if [ -f $alidir/trans.1 ]; then
  echo "$0: using transforms from $alidir"
  feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$alidir/trans.JOB ark:- ark:- |"
else
  if [ -f $alidir/raw_trans.1 ]; then
    echo "$0: using raw-fMLLR transforms from $alidir"
    feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$alidir/raw_trans.JOB ark:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
  fi
fi
##


# Stage -6: accumulate tree statistics from the existing alignments (one
# accumulator per job) and sum them into $dir/treeacc.
if [ $stage -le -6 ]; then
  echo "$0: accumulating tree stats"
  $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \
    acc-tree-stats $context_opts --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \
    "ark:gunzip -c $alidir/ali.JOB.gz|" $dir/JOB.treeacc || exit 1;
  # Sanity check: exactly one .treeacc file per job must have been written.
  [ "`ls $dir/*.treeacc | wc -w`" -ne "$nj" ] && echo "$0: Wrong #tree-stats" && exit 1;
  sum-tree-stats $dir/treeacc $dir/*.treeacc 2>$dir/log/sum_tree_acc.log || exit 1;
  rm $dir/*.treeacc
fi

# Stage -5: derive the clustering questions and build the two-level tree
# (at most $num_groups leaves at the first level, $num_pdfs at the second).
if [ $stage -le -5 ]; then
  echo "$0: Getting questions for tree clustering."
  # preparing questions, roots file...
  cluster-phones $context_opts $dir/treeacc $lang/phones/sets.int $dir/questions.int 2> $dir/log/questions.log || exit 1;
  # Append the language directory's extra questions to the clustered ones.
  cat $lang/phones/extra_questions.int >> $dir/questions.int
  compile-questions $context_opts $lang/topo $dir/questions.int $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1;

  echo "$0: Building the tree"
  $cmd $dir/log/build_tree.log \
    build-tree-two-level $context_opts --binary=false --verbose=1 --max-leaves-first=$num_groups \
     --max-leaves-second=$num_pdfs $dir/treeacc $lang/phones/roots.int \
     $dir/questions.qst $lang/topo $dir/tree $dir/pdf2group.map || exit 1;
fi

# Stage -4: initialize the SGMM ($dir/0.mdl) from the UBM, the tree and the
# pdf-to-group map.
if [ $stage -le -4 ]; then
  echo "$0: Initializing the model"
  # Note: if phn_dim > feat_dim+1 or spk_dim > feat_dim, these dims
  # will be truncated on initialization.
  $cmd $dir/log/init_sgmm.log \
    sgmm2-init --spk-dep-weights=$spk_dep_weights --self-weight=$self_weight \
       --pdf-map=$dir/pdf2group.map --phn-space-dim=$phn_dim \
       --spk-space-dim=$spk_dim $lang/topo $dir/tree $ubm $dir/0.mdl || exit 1;
fi

# Stage -3: per-job Gaussian selection with the initial model, written gzipped
# to $dir/gselect.JOB.gz (the file that $gselect_opt reads back later).
if [ $stage -le -3 ]; then
  echo "$0: doing Gaussian selection"
  $cmd JOB=1:$nj $dir/log/gselect.JOB.log \
    sgmm2-gselect $dir/0.mdl "$feats" \
    "ark,t:|gzip -c >$dir/gselect.JOB.gz" || exit 1;
fi

# Stage -2: compile the training graphs from the integerized transcripts
# (words not in words.txt are mapped to $oov).
if [ $stage -le -2 ]; then
  echo "$0: compiling training graphs"
  text="ark:sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $sdata/JOB/text|"
  $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
    compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/0.mdl  $lang/L.fst  \
    "$text" "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
fi

# Stage -1: convert the input alignments from the old model/tree to the new
# model and tree, writing them to $dir/ali.JOB.gz.
if [ $stage -le -1 ]; then
  echo "$0: converting alignments"
  $cmd JOB=1:$nj $dir/log/convert_ali.JOB.log \
    convert-ali $alidir/final.mdl $dir/0.mdl $dir/tree "ark:gunzip -c $alidir/ali.JOB.gz|" \
    "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi


# Main training loop: iterations 0 .. num_iters-1.  Each pass optionally
# re-aligns, optionally re-estimates speaker vectors, accumulates statistics
# in groups of $group jobs, and then re-estimates the model as $dir/(x+1).mdl.
x=0
while [ $x -lt $num_iters ]; do
   echo "$0: training pass $x ... "
   if echo $realign_iters | grep -w $x >/dev/null && [ $stage -le $x ]; then
     echo "$0: re-aligning data"
     $cmd JOB=1:$nj $dir/log/align.$x.JOB.log  \
       sgmm2-align-compiled $spkvecs_opt $scale_opts "$gselect_opt" \
       --utt2spk=ark:$sdata/JOB/utt2spk --beam=$beam --retry-beam=$retry_beam \
       $dir/$x.mdl "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \
       "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
   fi
   if [ $spk_dim -gt 0 ] && echo $spkvec_iters | grep -w $x >/dev/null; then
     if [ $stage -le $x ]; then
       # Written to a temporary file first and then moved into place, since
       # $spkvecs_opt may be reading the previous vecs.JOB in the same command.
       $cmd JOB=1:$nj $dir/log/spkvecs.$x.JOB.log \
         ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \
         weight-silence-post 0.01 $silphonelist $dir/$x.mdl ark:- ark:- \| \
         sgmm2-est-spkvecs --rand-prune=$rand_prune --spk2utt=ark:$sdata/JOB/spk2utt \
         $spkvecs_opt "$gselect_opt" $dir/$x.mdl "$feats" ark,s,cs:- \
         ark:$dir/tmp_vecs.JOB '&&' mv $dir/tmp_vecs.JOB $dir/vecs.JOB || exit 1;
     fi
     # Set even when the stage is skipped, so that a resumed run still uses
     # the speaker vectors on later iterations.
     spkvecs_opt="--spk-vecs=ark:$dir/vecs.JOB"
   fi
   if [ $x -eq 0 ]; then
     flags=vwcSt # on the first iteration, don't update projections M or N
   elif [ $spk_dim -gt 0 -a $[$x%2] -eq 1 -a $x -ge $first_spkvec_iter ]; then
     # Update N if we have speaker-vector space and x is odd,
     # and we've already updated the speaker vectors...
     flags=vNwSct
   else
     if [ $x -ge $update_m_iter ]; then
       flags=vMwSct # update M.
     else
       flags=vwSct # no M on early iters, if --update-m-iter option given.
     fi
   fi
   $spk_dep_weights && [ $x -ge $first_spkvec_iter ] && flags=${flags}u; # update
   # spk-weight projections "u".

   # Submit separate jobs for small groups (of size $group) of accumulators.
   Args=() # bash array of training commands for 1:nj, that put accs to stdout.
   for n in `seq $nj`; do
     Args[$n]=`echo "sgmm2-acc-stats $spkvecs_opt --utt2spk=ark:$sdata/JOB/utt2spk \
             --update-flags=$flags '$gselect_opt' --rand-prune=$rand_prune \
             $dir/$x.mdl '$feats' 'ark,s,cs:gunzip -c $dir/ali.JOB.gz | ali-to-post ark:- ark:-|' - |" | sed s/JOB/$n/g`
   done

   g=0
   rm $dir/.error 2>/dev/null
   if [ $stage -le $x ]; then
     while [ $[$g*$group] -lt $nj ]; do
       if [ -s $dir/acc.$x.$g.gz ]; then
         echo "Skipping creation of acc $dir/acc.$x.$g.gz as it already exists."
       else
         start=$[$g*$group + 1]; # start-position in array Args.
         # see http://www.thegeekstuff.com/2010/06/bash-array-tutorial/, this uses Bash arrays."
         # The syntax "${Args[@]:$start:$group}" is equivalent to, say,
         # "${Args[3]}" "${Args[4]}" if start=3 and group=2.  Except it's smart about the end
         # of the array, it won't give you empty quoted strings if the length "group" takes you off
         # the end of the array.
         # Each group runs in the background; a failure is recorded in
         # $dir/.error and checked after the "wait" below.
         $cmd --num-threads "$group" $dir/log/acc.$x.$g.log \
           sgmm2-sum-accs --parallel=true "|gzip -c >$dir/acc.$x.$g.gz" "${Args[@]:$start:$group}"  || touch $dir/.error &
       fi
       g=$[$g+1];
     done
     wait
     if [ -f $dir/.error ]; then
       echo "Something went wrong during accumulation on pass $x"
       exit 1;
     fi
   fi

   # The next option is needed if the user specifies a phone or speaker sub-space
   # dimension that's higher than the "normal" one.
   increase_dim_opts=
   # The configuration option is called "increase_iters"; this test previously
   # read an undefined variable, so the dimensions were never increased.
   if echo $increase_iters | grep -w $x >/dev/null; then
     increase_dim_opts="--increase-phn-dim=$phn_dim --increase-spk-dim=$spk_dim"
     # Note: the command below might have a null effect on some iterations.
     if [ $spk_dim -gt $feat_dim ]; then
       # "JOB" must stay a literal placeholder here (it is not a shell variable).
       $cmd JOB=1:$nj $dir/log/copy_vecs.$x.JOB.log \
         copy-vector --print-args=false --change-dim=$spk_dim \
         ark:$dir/vecs.JOB ark:$dir/vecs_tmp.JOB '&&' \
         mv $dir/vecs_tmp.JOB $dir/vecs.JOB || exit 1;
     fi
   fi

   if [ $stage -le $x ]; then
     # Sum the per-group accumulators and re-estimate the model.
     acc_sum="sgmm2-sum-accs - ";
     for j in `seq 0 $[$g-1]`; do acc_sum="$acc_sum 'gunzip -c $dir/acc.$x.$j.gz|'"; done
     $cmd $dir/log/update.$x.log \
       sgmm2-est --update-flags=$flags --split-substates=$numsubstates \
       $increase_dim_opts --power=$power --write-occs=$dir/$[$x+1].occs \
       $dir/$x.mdl "$acc_sum|" $dir/$[$x+1].mdl || exit 1;
     rm $dir/$x.mdl $dir/acc.$x.*.gz $dir/$x.occs 2>/dev/null
   fi
   if [ $x -lt $max_iter_inc ]; then
     numsubstates=$[$numsubstates+$incsubstates]
   fi
   x=$[$x+1];
done

# Point final.mdl / final.occs at the output of the last iteration.
rm $dir/final.mdl $dir/final.occs 2>/dev/null
ln -s $x.mdl $dir/final.mdl
ln -s $x.occs $dir/final.occs

# Alignment-model training; only done when there is a speaker subspace.
if [ $spk_dim -gt 0 ]; then
  # We need to create an "alignment model" that's been trained
  # without the speaker vectors, to do the first-pass decoding with.
  # in test time.

  # We do this for a few iters, in this recipe.
  # x continues counting from num_iters, so these passes are numbered
  # num_iters .. num_iters+num_iters_alimdl-1.
  final_mdl=$dir/$x.mdl
  cur_alimdl=$dir/$x.mdl
  while [ $x -lt $[$num_iters+$num_iters_alimdl] ]; do
    echo "$0: building alignment model (pass $x)"
    if [ $x -eq $num_iters ]; then # 1st pass of building alimdl.
      flags=MwcS # don't update v the first time.  Note-- we never update transitions.
      # they wouldn't change anyway as we use the same alignment as previously.
    else
      flags=vMwcS
    fi
    if [ $stage -le $x ]; then
      # Posteriors are computed with the final speaker-adapted model
      # ($final_mdl, with $spkvecs_opt) and the stats are accumulated for the
      # current alignment model ($cur_alimdl).
      Args=() # bash array of training commands for 1:nj, that put accs to stdout.
      for n in `seq $nj`; do
        Args[$n]=`echo "ali-to-post 'ark:gunzip -c $dir/ali.JOB.gz|' ark:- | \
          sgmm2-post-to-gpost $spkvecs_opt '$gselect_opt' \
          --utt2spk=ark:$sdata/JOB/utt2spk $final_mdl '$feats' ark,s,cs:- ark:- | \
                  sgmm2-acc-stats-gpost --rand-prune=$rand_prune --update-flags=$flags \
          $cur_alimdl '$feats' ark,s,cs:- - |" | sed s/JOB/$n/g`
      done
      g=0
      rm $dir/.error 2>/dev/null
      # Launch one background job per group of $group accumulation commands.
      while [ $[$g*$group] -lt $nj ]; do
        if [ -s $dir/acc.$x.$g.gz ]; then
          echo "Skipping creation of acc $dir/acc.$x.$g.gz as it already exists."
        else
          start=$[$g*$group + 1]; # start-position in array Args.
          $cmd --num-threads "$group" $dir/log/acc.$x.$g.log \
            sgmm2-sum-accs --parallel=true "|gzip -c >$dir/acc.$x.$g.gz" "${Args[@]:$start:$group}"  || touch $dir/.error &
        fi
        g=$[$g+1];
      done
      wait
      # Any failed background job will have created $dir/.error.
      if [ -f $dir/.error ]; then
        echo "Something went wrong during accumulation on pass $x"
        exit 1;
      fi
      # Sum the per-group accumulators and estimate the next alignment model.
      acc_sum="sgmm2-sum-accs - ";
      for j in `seq 0 $[$g-1]`; do acc_sum="$acc_sum 'gunzip -c $dir/acc.$x.$j.gz|'"; done
      $cmd $dir/log/update_ali.$x.log \
        sgmm2-est --update-flags=$flags --remove-speaker-space=true --power=$power \
        $cur_alimdl "$acc_sum|" $dir/$[$x+1].alimdl || exit 1;
      rm $dir/acc.$x.*.gz || exit 1;
      # On the first pass the "current alimdl" is the final .mdl itself, so
      # there is no $x.alimdl to delete.
      [ $x -gt $num_iters ]  && rm $dir/$x.alimdl
    fi
    cur_alimdl=$dir/$[$x+1].alimdl
    x=$[$x+1]
  done
  # Point final.alimdl at the last alignment model.
  rm $dir/final.alimdl 2>/dev/null
  ln -s $x.alimdl $dir/final.alimdl
fi

utils/summarize_warnings.pl $dir/log

echo Done


================================================
FILE: egs/steps/train_smbr.sh
================================================
#!/usr/bin/env bash
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.

# sMBR training 
# 4 iterations (by default) of Extended Baum-Welch update.
#
# For the numerator we have a fixed alignment rather than a lattice--
# this actually follows from the way lattices are defined in Kaldi, which
# is to have a single path for each word (output-symbol) sequence.

# Begin configuration section.
cmd=run.pl
num_iters=4
cancel=true # if true, cancel num and den counts on each frame.
tau=400
weight_tau=10
acwt=0.1
stage=0
smooth_to_model=true # if true, I-smooth the numerator stats towards the current
                     # model; if false, accumulate ML stats and smooth to those.
smooth_to_mode=      # old, misspelled name of smooth_to_model; still accepted
                     # so that existing command lines keep working.
one_silence_class=false
# End configuration section

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

# The training loop tests $smooth_to_model, while this section used to define
# only "smooth_to_mode", so the option could never be switched off.  Honour the
# old spelling if the caller supplied it.
if [ -n "$smooth_to_mode" ]; then
  smooth_to_model=$smooth_to_mode
fi

if [ $# -ne 5 ]; then
  echo "Usage: steps/train_smbr.sh <data> <lang> <ali> <denlats> <exp>"
  echo " e.g.: steps/train_smbr.sh data/train_si84 data/lang exp/tri2b_ali_si84 exp/tri2b_denlats_si84 exp/tri2b_smbr"
  echo "Main options (for others, see top of script file)"
  echo "  --cancel (true|false)                            # cancel stats (true by default)"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --config <config-file>                           # config containing options"
  echo "  --stage <stage>                                  # stage to do partial re-run from."
  echo "  --tau                                            # tau for i-smooth to last iter (default 400)"
  echo "  --smooth-to-model (true|false)                   # smooth to the current model rather than"
  echo "                                                   # to ML stats (default: true)"
  echo "  --one-silence-class <true|false>                 # If true, newer approach which will tend"
  echo "                                                   # to reduce insertions (default: false)"
  exit 1;
fi

# Positional arguments.
data=$1
lang=$2
alidir=$3
denlatdir=$4
dir=$5
mkdir -p $dir/log

utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

# Every one of these inputs has to exist before we start.
for f in $data/feats.scp $alidir/tree $alidir/final.mdl $alidir/ali.1.gz $denlatdir/lat.1.gz; do
  if [ ! -f $f ]; then
    echo "$0: no such file $f"
    exit 1;
  fi
done

# The alignments and the denominator lattices must use the same job split.
nj=$(cat $alidir/num_jobs) || exit 1;
denlat_nj=$(cat $denlatdir/num_jobs)
if [ "$nj" -ne "$denlat_nj" ]; then
  echo "$alidir and $denlatdir have different num-jobs"
  exit 1;
fi

sdata=$data/split$nj
# Feature options are inherited from the alignment directory and copied on.
splice_opts=$(cat $alidir/splice_opts 2>/dev/null)
cmvn_opts=$(cat $alidir/cmvn_opts 2>/dev/null)
delta_opts=$(cat $alidir/delta_opts 2>/dev/null)
cp $alidir/splice_opts $dir 2>/dev/null
cp $alidir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option.
cp $alidir/delta_opts $dir 2>/dev/null
# Split the data unless the split directory exists and is newer than feats.scp.
if ! [[ -d $sdata && $data/feats.scp -ot $sdata ]]; then
  split_data.sh $data $nj || exit 1;
fi
echo $nj > $dir/num_jobs

cp $alidir/final.mdl $alidir/tree $dir

silphonelist=$(cat $lang/phones/silence.csl) || exit 1;

# Set up features

# An LDA matrix in the alignment directory selects the spliced+LDA pipeline.
if [ -f $alidir/final.mat ]; then
  feat_type=lda
else
  feat_type=delta
fi
echo "$0: feature type is $feat_type"

if [ "$feat_type" = delta ]; then
  feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |"
elif [ "$feat_type" = lda ]; then
  feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
  cp $alidir/final.mat $dir
else
  echo "Invalid feature type $feat_type" && exit 1;
fi

# Append the speaker transforms from the alignment directory, if there are any.
if [ -f $alidir/trans.1 ]; then
  echo Using transforms from $alidir && \
    feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$alidir/trans.JOB ark:- ark:- |"
fi

lats="ark:gunzip -c $denlatdir/lat.JOB.gz|"

# Extended Baum-Welch iterations 0 .. num_iters-1, starting from the model in
# the alignment directory; iteration x writes $dir/(x+1).mdl.
cur_mdl=$alidir/final.mdl
x=0
while [ $x -lt $num_iters ]; do
  echo "Iteration $x of sMBR training"
  # Note: the num and den states are accumulated at the same time, so we
  # can cancel them per frame.
  if [ $stage -le $x ]; then
    # Rescore the denominator lattices with the current model, turn them into
    # sMBR posteriors against the reference alignment, and accumulate separate
    # numerator and denominator statistics per job.
    $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
      gmm-rescore-lattice $cur_mdl "$lats" "$feats" ark:- \| \
      lattice-to-smbr-post --acoustic-scale=$acwt \
        --one-silence-class=$one_silence_class \
        --silence-phones=$silphonelist  $cur_mdl \
        "ark,s,cs:gunzip -c $alidir/ali.JOB.gz |" ark:- ark:- \| \
      gmm-acc-stats2 $cur_mdl "$feats" ark,s,cs:- \
        $dir/num_acc.$x.JOB.acc $dir/den_acc.$x.JOB.acc || exit 1;

    # Expect one num and one den accumulator per job, then sum each kind.
    n=`echo $dir/{num,den}_acc.$x.*.acc | wc -w`;
    [ "$n" -ne $[$nj*2] ] && \
      echo "Wrong number of sMBR accumulators $n versus 2*$nj" && exit 1;
    $cmd $dir/log/den_acc_sum.$x.log \
      gmm-sum-accs $dir/den_acc.$x.acc $dir/den_acc.$x.*.acc || exit 1;
    rm $dir/den_acc.$x.*.acc
    $cmd $dir/log/num_acc_sum.$x.log \
      gmm-sum-accs $dir/num_acc.$x.acc $dir/num_acc.$x.*.acc || exit 1;
    rm $dir/num_acc.$x.*.acc

    # note: this tau value is for smoothing towards model parameters, not
    # as in the Boosted MMI paper, not towards the ML stats as in the earlier
    # work on discriminative training (e.g. my thesis).
    # You could use gmm-ismooth-stats to smooth to the ML stats, if you had
    # them available [here they're not available if cancel=true].
    # If smooth_to_model is false, ML stats are accumulated and the numerator
    # stats are smoothed towards them; otherwise they are smoothed towards the
    # current model.  Either way $num_stats is a pipe that is read by the
    # update command below.
    if ! $smooth_to_model; then
      echo "Iteration $x of sMBR: computing ml (smoothing) stats"
      $cmd JOB=1:$nj $dir/log/acc_ml.$x.JOB.log \
        gmm-acc-stats $cur_mdl "$feats" \
          "ark,s,cs:gunzip -c $alidir/ali.JOB.gz | ali-to-post ark:- ark:- |" \
          $dir/ml.$x.JOB.acc || exit 1;
      $cmd $dir/log/acc_ml_sum.$x.log \
        gmm-sum-accs $dir/ml.$x.acc $dir/ml.$x.*.acc || exit 1;
      rm $dir/ml.$x.*.acc
      num_stats="gmm-ismooth-stats --tau=$tau $dir/ml.$x.acc $dir/num_acc.$x.acc -|"
    else 
      num_stats="gmm-ismooth-stats --smooth-from-model=true --tau=$tau $cur_mdl $dir/num_acc.$x.acc -|"
    fi  
    
    # EBW update of the Gaussians, piped into the EBW update of the weights.
    $cmd $dir/log/update.$x.log \
      gmm-est-gaussians-ebw $cur_mdl "$num_stats" $dir/den_acc.$x.acc - \| \
      gmm-est-weights-ebw - $dir/num_acc.$x.acc $dir/den_acc.$x.acc $dir/$[$x+1].mdl || exit 1;
    rm $dir/{den,num}_acc.$x.acc
  fi
  cur_mdl=$dir/$[$x+1].mdl

  # Some diagnostics: the objective function progress and auxiliary-function
  # improvement.

  # Frame-weighted average of the per-job "Overall average frame-accuracy"
  # figures in the accumulation logs; $dir/tmpf holds "<objf> <#frames>".
 tail -n 50 $dir/log/acc.$x.*.log | perl -e 'while(<STDIN>) { if(m/lattice-to-smbr-post.+Overall average frame-accuracy is (\S+) over (\S+) frames/) { $tot_objf += $1*$2; $tot_frames += $2; }} $tot_objf /= $tot_frames; print "$tot_objf $tot_frames\n"; ' > $dir/tmpf
  objf=`cat $dir/tmpf | awk '{print $1}'`;
  nf=`cat $dir/tmpf | awk '{print $2}'`;
  rm $dir/tmpf
  # Sum of (field 10 * field 12) over the "Overall" lines of the update log.
  impr=`grep -w Overall $dir/log/update.$x.log | awk '{x += $10*$12;} END{print x;}'`
  impr=`perl -e "print ($impr*$acwt/$nf);"` # We multiply by acwt, and divide by $nf which is the "real" number of frames.
  # This gives us a projected objective function improvement.
  echo "Iteration $x: objf was $objf, sMBR auxf change was $impr" | tee $dir/objf.$x.log
  x=$[$x+1]
done

echo "sMBR training finished"

# Point final.mdl at the model written by the last iteration.
rm $dir/final.mdl 2>/dev/null
ln -s $x.mdl $dir/final.mdl

exit 0;


================================================
FILE: egs/steps/train_ubm.sh
================================================
#!/usr/bin/env bash
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.

# This trains a UBM (i.e. a mixture of Gaussians), by clustering
# the Gaussians from a trained HMM/GMM system and then doing a few
# iterations of UBM training.
# We mostly use this for SGMM systems.

# Begin configuration section.
cmd=run.pl
silence_weight=  # You can set it to e.g. 0.0, to weight down silence in training.
stage=-2
num_gselect1=50 # first stage of Gaussian-selection
num_gselect2=25 # second stage.
intermediate_num_gauss=2000 # passed to init-ubm as --intermediate-num-gauss;
                            # raised below to 2*<num-gauss> if it is smaller.
num_iters=3
no_fmllr=false # if true, ignore trans.* speaker transforms in <ali-dir>.
# End configuration section.

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;


# Exactly five positional arguments are required; otherwise print the usage.
if [ $# != 5 ]; then
  echo "Usage: steps/train_ubm.sh <num-gauss> <data> <lang> <ali-dir> <exp>"
  echo " e.g.: steps/train_ubm.sh 400 data/train_si84 data/lang exp/tri2b_ali_si84 exp/ubm3c"
  echo "main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --silence-weight <sil-weight>                    # weight for silence (e.g. 0.5 or 0.0)"
  # (A stray line-continuation backslash used to follow the next line, which
  # turned the --no-fmllr "echo" into extra arguments of this one.)
  echo "  --num-iters <#iters>                             # Number of iterations of E-M"
  echo "  --no-fmllr (true|false)                          # ignore speaker matrices even if present"
  exit 1;
fi

# Positional arguments.
num_gauss=$1
data=$2
lang=$3
alidir=$4
dir=$5

# Refuse to start if any required input file is missing.
for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/num_jobs; do
  if [ ! -f $f ]; then
    echo "No such file $f"
    exit 1;
  fi
done

# The intermediate #Gaussians must be at least twice the final number.
if [ $(($num_gauss*2)) -gt $intermediate_num_gauss ]; then
  echo "intermediate_num_gauss was too small $intermediate_num_gauss"
  intermediate_num_gauss=$(($num_gauss*2));
  echo "setting it to $intermediate_num_gauss"
fi


# Set various variables.
silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
nj=$(cat $alidir/num_jobs) || exit 1;

mkdir -p $dir/log
echo $nj > $dir/num_jobs
sdata=$data/split$nj;
# Split the data unless the split directory exists and is newer than feats.scp.
if ! [[ -d $sdata && $data/feats.scp -ot $sdata ]]; then
  split_data.sh $data $nj || exit 1;
fi
splice_opts=$(cat $alidir/splice_opts 2>/dev/null) # frame-splicing options.
cmvn_opts=$(cat $alidir/cmvn_opts 2>/dev/null)
delta_opts=$(cat $alidir/delta_opts 2>/dev/null)

utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

## Set up features.
# An LDA matrix in the alignment directory selects the spliced+LDA pipeline.
if [ -f $alidir/final.mat ]; then
  feat_type=lda
else
  feat_type=delta
fi
echo "$0: feature type is $feat_type"

if [ "$feat_type" = delta ]; then
  feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |"
elif [ "$feat_type" = lda ]; then
  feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
  cp $alidir/final.mat $dir
else
  echo "$0: invalid feature type $feat_type" && exit 1;
fi


# Speaker transforms: trans.* are appended to the pipeline unless --no-fmllr
# is true; raw_trans.* replace the pipeline so they are applied before splicing.
if [ -f $alidir/trans.1 ]; then
  if ! $no_fmllr; then
    echo "$0: using transforms from $alidir"
    feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$alidir/trans.JOB ark:- ark:- |"
  else
    echo "$0: deliberately ignoring speaker transforms from $alidir"
  fi
elif [ -f $alidir/raw_trans.1 ]; then
  echo "$0: using raw-FMLLR transforms from $alidir"
  feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$alidir/raw_trans.JOB ark:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
fi
##

# Per-frame weights that scale silence by $silence_weight; left empty (no
# weighting) when the option was not given.
weights_opt=
if [ -n "$silence_weight" ]; then
  weights_opt="--weights='ark,s,cs:gunzip -c $alidir/ali.JOB.gz | ali-to-post ark:- ark:- | weight-silence-post $silence_weight $silphonelist $alidir/final.mdl ark:- ark:- | post-to-weights ark:- ark:- |'"
fi

# Stage -2: cluster the Gaussians of the input model into the initial
# full-covariance UBM, $dir/0.ubm.
if [ $stage -le -2 ]; then
  echo "$0: clustering model $alidir/final.mdl to get initial UBM"
  $cmd $dir/log/cluster.log \
    init-ubm --intermediate-num-gauss=$intermediate_num_gauss --ubm-num-gauss=$num_gauss \
    --verbose=2 --fullcov-ubm=true $alidir/final.mdl $alidir/final.occs \
    $dir/0.ubm   || exit 1;
fi

# Stage -1: first, coarser phase of Gaussian selection, saved to disk.  Each
# training pass prunes it further with the then-current model.


if [ $stage -le -1 ]; then
  echo "$0: doing Gaussian selection"
  $cmd JOB=1:$nj $dir/log/gselect.JOB.log \
    gmm-gselect --n=$num_gselect1 "fgmm-global-to-gmm $dir/0.ubm - |" "$feats" \
    "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1;
fi


# EM passes 0 .. num_iters-1; pass x reads $dir/x.ubm and writes $dir/(x+1).ubm.
x=0
while [ $x -lt $num_iters ]; do
  next_x=$(($x+1))
  echo "Pass $x"
  $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
    gmm-gselect --n=$num_gselect2 "--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" \
    "fgmm-global-to-gmm $dir/$x.ubm - |" "$feats" ark:- \| \
    fgmm-global-acc-stats $weights_opt --gselect=ark,s,cs:- $dir/$x.ubm "$feats" \
    $dir/$x.JOB.acc || exit 1;
  # Low-count Gaussians may only be removed on the last pass; doing it earlier
  # would leave the saved Gaussian-selection info mismatched.
  if [ $next_x -eq $num_iters ]; then
    lowcount_opt=
  else
    lowcount_opt="--remove-low-count-gaussians=false"
  fi
  $cmd $dir/log/update.$x.log \
    fgmm-global-est $lowcount_opt --verbose=2 $dir/$x.ubm "fgmm-global-sum-accs - $dir/$x.*.acc |" \
      $dir/$next_x.ubm || exit 1;
  rm $dir/$x.*.acc $dir/$x.ubm
  x=$next_x
done

# Clean up and give the last UBM its final name.
rm $dir/gselect.*.gz
rm $dir/final.ubm 2>/dev/null
mv $dir/$x.ubm $dir/final.ubm || exit 1;


================================================
FILE: egs/steps/word_align_lattices.sh
================================================
#!/usr/bin/env bash

# Copyright Johns Hopkins University (Author: Daniel Povey)  2012
# Apache 2.0.

# Begin configuration section.
silence_label=0
cmd=run.pl
# End configuration section.

echo "$0 $@"  # Print the command line for logging

# Hand-rolled option parsing: two rounds, each of which may consume a leading
# "--silence-label <n>" and then a leading "--cmd <cmd>".
for x in 1 2; do
  if [ "$1" == "--silence-label" ]; then
    silence_label=$2
    shift 2
  fi
  if [ "$1" == "--cmd" ]; then
    cmd="$2"
    shift 2
  fi
done

# Exactly three positional arguments must remain.
if [ $# != 3 ]; then
   cat <<EOF
Word-align lattices (make the arcs sync up with words)

Usage: $0 [options] <lang-dir> <decode-dir-in> <decode-dir-out>
options: [--cmd (run.pl|queue.pl [queue opts])] [--silence-label <integer-id-of-silence-word>]
EOF
   exit 1;
fi

. ./path.sh || exit 1;

lang=$1
indir=$2
outdir=$3

# The model is taken from the parent of the input decode directory.
mdl=$(dirname $indir)/final.mdl
wbfile=$lang/phones/word_boundary.int

for f in $mdl $wbfile $indir/num_jobs; do
  if [ ! -f $f ]; then
    echo "word_align_lattices.sh: no such file $f"
    exit 1;
  fi
done

mkdir -p $outdir/log


# The output directory uses the same job split as the input.
cp $indir/num_jobs $outdir;
nj=$(cat $indir/num_jobs)

# One job per input lattice archive; the aligned lattices are written gzipped
# in text form.
$cmd JOB=1:$nj $outdir/log/align.JOB.log \
  lattice-align-words --silence-label=$silence_label --test=true \
   $wbfile $mdl "ark:gunzip -c $indir/lat.JOB.gz|" "ark,t:|gzip -c >$outdir/lat.JOB.gz" || exit 1;


================================================
FILE: egs/utils/add_disambig.pl
================================================
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# Adds some specified number of disambig symbols to a symbol table.
# Adds these as #1, #2, etc.
# If the --include-zero option is specified, includes an extra one
# #0.

# An optional leading --include-zero asks for an extra "#0" symbol.
$include_zero = ($ARGV[0] eq "--include-zero") ? 1 : 0;
shift @ARGV if $include_zero;

@ARGV == 2
    || die "Usage: add_disambig.pl [--include-zero] symtab.txt num_extra > symtab_out.txt ";


($input, $nsyms) = @ARGV;

open(F, "<$input") || die "Opening file $input";

# Copy the symbol table through unchanged, remembering the id found on the
# most recently read line.
while (defined($line = <F>)) {
    @fields = split(" ", $line);
    die "Bad line $line" unless @fields == 2;
    $lastsym = $fields[1];
    print $line;
}

# No line was read at all.
die "Empty symbol file?" unless defined($lastsym);

# "#0", if requested, takes the next id after the last one in the table.
if ($include_zero) {
    $lastsym++;
    print "#0  $lastsym\n";
}

# "#1" .. "#<num_extra>" take the ids that follow.
for ($n = 1; $n <= $nsyms; $n++) {
    print "#$n  " . ($n + $lastsym) . "\n";
}


================================================
FILE: egs/utils/add_lex_disambig.pl
================================================
#!/usr/bin/env perl
#  Copyright 2010-2011  Microsoft Corporation
#            2013-2016  Johns Hopkins University (author: Daniel Povey)
#                 2015  Hainan Xu
#                 2015  Guoguo Chen

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# Adds disambiguation symbols to a lexicon.
# Outputs still in the normal lexicon format.
# Disambig syms are numbered #1, #2, #3, etc. (#0
# reserved for symbol in grammar).
# Outputs the number of disambig syms to the standard output.
# With the --pron-probs option, expects the second field
# of each lexicon line to be a pron-prob.
# With the --sil-probs option, expects three additional
# fields after the pron-prob, representing various components
# of the silence probability model.

# Option defaults.
$pron_probs = 0;              # set by --pron-probs
$sil_probs = 0;               # set by --sil-probs
$first_allowed_disambig = 1;  # set by --first-allowed-disambig <n>

# Up to three rounds of option parsing; each round can consume any of the
# three options if it is at the front of @ARGV.
for ($n = 1; $n <= 3 && @ARGV > 0; $n++) {
  if ($ARGV[0] eq "--pron-probs") {
    $pron_probs = 1;
    shift @ARGV;
  }
  if ($ARGV[0] eq "--sil-probs") {
    $sil_probs = 1;
    shift @ARGV;
  }
  if ($ARGV[0] eq "--first-allowed-disambig") {
    $first_allowed_disambig = 0 + $ARGV[1];
    if ($first_allowed_disambig < 1) {
      die "add_lex_disambig.pl: invalid --first-allowed-disambig option: $first_allowed_disambig\n";
    }
    shift @ARGV;
    shift @ARGV;
  }
}

# Exactly two positional arguments (input and output lexicon) must remain.
# (The usage text previously ran "largest-numbered" and "disambiguation"
# together because the line break between them was missing.)
if (@ARGV != 2) {
  die "Usage: add_lex_disambig.pl [opts] <lexicon-in> <lexicon-out>\n" .
    "This script adds disambiguation symbols to a lexicon in order to\n" .
    "make decoding graphs determinizable; it adds pseudo-phone\n" .
    "disambiguation symbols #1, #2 and so on at the ends of phones\n" .
    "to ensure that all pronunciations are different, and that none\n" .
    "is a prefix of another.\n" .
    "It prints to the standard output the number of the largest-numbered\n" .
    "disambiguation symbol that was used.\n" .
    "\n" .
    "Options:   --pron-probs       Expect pronunciation probabilities in the 2nd field\n" .
    "           --sil-probs        [should be with --pron-probs option]\n" .
    "                              Expect 3 extra fields after the pron-probs, for aspects of\n" .
    "                              the silence probability model\n" .
    "           --first-allowed-disambig <n>  The number of the first disambiguation symbol\n" .
    "                              that this script is allowed to add.  By default this is\n" .
    "                              #1, but you can set this to a larger value using this option.\n" .
    "e.g.:\n" .
    " add_lex_disambig.pl lexicon.txt lexicon_disambig.txt\n" .
    " add_lex_disambig.pl --pron-probs lexiconp.txt lexiconp_disambig.txt\n" .
    " add_lex_disambig.pl --pron-probs --sil-probs lexiconp_silprob.txt lexiconp_silprob_disambig.txt\n";
}


# The two remaining arguments are the input and output lexicon paths.
$lexfn = shift(@ARGV);
$lexoutfn = shift(@ARGV);

open(L, "<$lexfn") || die "Error opening lexicon $lexfn";

# (1)  Read in the lexicon, storing each line with its fields re-joined by
# single spaces.
@L = ( );
while (defined($line = <L>)) {
    @A = split(" ", $line);
    push(@L, join(" ", @A));
}

# (2) Work out the count of each phone-sequence in the
# lexicon.  While doing so, validate the optional numeric fields that
# precede the phones (pron-prob, and the three silence-model fields).

foreach $l (@L) {
    @A = split(" ", $l);
    shift @A; # Remove word.
    if ($pron_probs) {
      $p = shift @A;
      # The pron-prob must lie in (0, 1].
      if (!($p > 0.0 && $p <= 1.0)) { die "Bad lexicon line $l (expecting pron-prob as second field)"; }
    }
    if ($sil_probs) {
      # The silence probability must lie in (0, 1]; the two correction
      # terms that follow must be strictly positive.
      $silp = shift @A;
      if (!($silp > 0.0 && $silp <= 1.0)) { die "Bad lexicon line $l for silprobs"; }
      $correction = shift @A;
      if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; }
      $correction = shift @A;
      if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; }
    }
    if (!(@A)) {
      # Report the offending lexicon line ($l); the regex capture variable
      # $1 is not set here and would print nothing useful.
      die "Bad lexicon line $l, no phone in phone list";
    }
    # Whatever is left is the phone sequence; count how often it occurs.
    $count{join(" ",@A)}++;
}

# (3) Record every proper prefix of every phone sequence (including the
# empty prefix) in %issubseq, so that step (4) can tell whether a
# pronunciation is a prefix of some longer pronunciation.

foreach $l (@L) {
    @A = split(" ", $l);
    shift @A;                      # drop the word
    shift @A if $pron_probs;       # drop the pron-prob
    if ($sil_probs) {
      # drop the three silence-model numbers
      shift @A for (1 .. 3);
    }
    # Strip phones from the right one at a time; each remainder is a prefix.
    while (@A) {
        pop @A;
        $issubseq{join(" ", @A)} = 1;
    }
}

# (4) For each entry in the lexicon:
#  if the phone sequence is unique and is not a
#  prefix of another word, no disambig symbol.
#  Else output #1, or #2, #3, ... if the same phone-seq
#  has already been assigned a disambig symbol.


open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n";

# max_disambig will always be the highest-numbered disambiguation symbol that
# has been used so far.
$max_disambig = $first_allowed_disambig - 1;

foreach $l (@L) {
  # Split the line back into word, optional probability fields, and phones.
  @A = split(" ", $l);
  $word = shift @A;
  if ($pron_probs) {
    $pron_prob = shift @A;
  }
  if ($sil_probs) {
    $sil_word_prob = shift @A;
    $word_sil_correction = shift @A;
    $prev_nonsil_correction = shift @A;
  }
  $phnseq = join(" ", @A);
  if (!defined $issubseq{$phnseq}
      && $count{$phnseq} == 1) {
    ;                           # Do nothing.
  } else {
    if ($phnseq eq "") {        # need disambig symbols for the empty string
      # that are not used anywhere else.
      $max_disambig++;
      $reserved_for_the_empty_string{$max_disambig} = 1;
      $phnseq = "#$max_disambig";
    } else {
      $cur_disambig = $last_used_disambig_symbol_of{$phnseq};
      if (!defined $cur_disambig) {
        $cur_disambig = $first_allowed_disambig;
      } else {
        $cur_disambig++;           # Get a number that has not been used yet for
                                   # this phone sequence.
      }
      # Skip over numbers that were handed out to empty pronunciations.
      while (defined $reserved_for_the_empty_string{$cur_disambig}) {
        $cur_disambig++;
      }
      if ($cur_disambig > $max_disambig) {
        $max_disambig = $cur_disambig;
      }
      $last_used_disambig_symbol_of{$phnseq} = $cur_disambig;
      $phnseq = $phnseq . " #" . $cur_disambig;
    }
  }
  # Emit the line with the same leading fields that were read in.
  if ($pron_probs) {
    if ($sil_probs) {
      print O "$word\t$pron_prob\t$sil_word_prob\t$word_sil_correction\t$prev_nonsil_correction\t$phnseq\n";
    } else {
      print O "$word\t$pron_prob\t$phnseq\n";
    }
  } else {
    print O "$word\t$phnseq\n";
  }
}

# Close explicitly so that buffered write errors (e.g. a full disk) are
# reported instead of silently producing a truncated lexicon.
close(O) || die "add_lex_disambig.pl: error closing lexicon file $lexoutfn: $!\n";

# The caller reads the highest disambiguation symbol number from stdout.
print $max_disambig . "\n";


================================================
FILE: egs/utils/analyze_segments.pl
================================================
#!/usr/bin/perl
# Copyright 2015 GoVivace Inc. (Author: Nagendra Kumar Goel)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# Analyze a segments file and print important stats on it.

# Reads a segments file (from the files named on the command line, or
# stdin) and prints the total duration in hours, the average segment
# length, and the ids of the longest and shortest segments.  Each line is
# split on whitespace; field 0 is used as the segment id and the duration
# is computed as field 3 minus field 2.
$dur = $total = 0;
$maxDur = 0;
$minDur = 9999999999;
$n = 0;
while(<>){
    chomp;
    @t = split(/\s+/);
    $dur = $t[3] - $t[2];
    $total += $dur;
    if ($dur > $maxDur) {
        $maxSegId = $t[0];
        $maxDur = $dur;
    }
    if ($dur < $minDur) {
        $minSegId = $t[0];
        $minDur = $dur;
    }
    $n++;
}
# Without this guard an empty input dies with "Illegal division by zero".
if ($n == 0) {
    die "analyze_segments.pl: no segments found in input\n";
}
$avg=$total/$n;
$hrs = $total/3600;
print "Total $hrs hours of data\n";
print "Average segment length $avg seconds\n";
print "Segment $maxSegId has length of $maxDur seconds\n";
print "Segment $minSegId has length of $minDur seconds\n";


================================================
FILE: egs/utils/apply_map.pl
================================================
#!/usr/bin/env perl
use warnings; #sed replacement for -w perl parameter
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.

# This program is a bit like ./sym2int.pl in that it applies a map
# to things in a file, but it's a bit more general in that it doesn't
# assume the things being mapped to are single tokens, they could
# be sequences of tokens.  See the usage message.


# Option parsing.  Recognised options are "-f <field-range>" and
# "--permissive"; the loop runs a few times so they may be given in
# either order.  After parsing, exactly one argument (the map file) must
# remain.
$permissive = 0;

for ($x = 0; $x <= 2; $x++) {

  if (@ARGV > 0 && $ARGV[0] eq "-f") {
    shift @ARGV;
    $field_spec = shift @ARGV;
    # A single number selects exactly that (1-based) field.
    if ($field_spec =~ m/^\d+$/) {
      $field_begin = $field_spec - 1; $field_end = $field_spec - 1;
    }
    # A range may omit either end, which leaves that side unbounded.
    if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesy (properly, 1-10)
      if ($1 ne "") {
        $field_begin = $1 - 1;  # Change to zero-based indexing.
      }
      if ($2 ne "") {
        $field_end = $2 - 1;    # Change to zero-based indexing.
      }
    }
    if (!defined $field_begin && !defined $field_end) {
      die "Bad argument to -f option: $field_spec";
    }
  }

  if (@ARGV > 0 && $ARGV[0] eq '--permissive') {
    shift @ARGV;
    # Mapping is optional (missing key is printed to output)
    $permissive = 1;
  }
}

if(@ARGV != 1) {
  print STDERR "Invalid usage: " . join(" ", @ARGV) . "\n";
  print STDERR <<'EOF';
Usage: apply_map.pl [options] map <input >output
 options: [-f <field-range> ] [--permissive]
   This applies a map to some specified fields of some input text:
   For each line in the map file: the first field is the thing we
   map from, and the remaining fields are the sequence we map it to.
   The -f (field-range) option says which fields of the input file the
   map should apply to.
   If the --permissive option is supplied, fields which are not present
   in the map will be left as they were.
 Applies the map 'map' to all input text, where each line of the map
 is interpreted as a map from the first field to the list of the other fields
 Note: <field-range> can look like 4-5, or 4-, or 5-, or 1, it means the field
 range in the input to apply the map to.
 e.g.: echo A B | apply_map.pl a.txt
 where a.txt is:
 A a1 a2
 B b
 will produce:
 a1 a2 b
EOF
  exit(1);
}

# Load the map file into %map: the first token of each line is the key,
# the remaining tokens (joined by single spaces, possibly empty) are the
# replacement text.
$map_file = $ARGV[0];
open(M, "<$map_file") || die "Error opening map file $map_file: $!";

while (<M>) {
  ($i, @A) = split(" ", $_);
  defined $i || die "apply_map.pl: empty line.";
  $map{$i} = join(" ", @A);
}

# Rewrite every input line: each token whose (zero-based) position lies in
# the selected field range is replaced by its entry in %map.  A token with
# no entry is fatal unless --permissive was given, in which case it is
# left unchanged and a warning is printed to stderr.
while(<STDIN>) {
  @A = split(" ", $_);
  foreach $pos (0 .. $#A) {
    # Skip positions outside the requested range (an unset bound is open).
    next if defined $field_begin && $pos < $field_begin;
    next if defined $field_end && $pos > $field_end;
    $tok = $A[$pos];
    if (defined $map{$tok}) {
      $A[$pos] = $map{$tok};
    } elsif ($permissive) {
      print STDERR "apply_map.pl: warning! missing key $tok in $map_file\n";
    } else {
      die "apply_map.pl: undefined key $tok in $map_file\n";
    }
  }
  print join(" ", @A) . "\n";
}


================================================
FILE: egs/utils/best_wer.sh
================================================
#!/usr/bin/env bash
#
# Copyright 2010-2011 Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# To be run from one directory above this script.

# Reads scoring lines on stdin and prints the single line with the lowest
# error rate, normalised so that it starts with "%WER".
#
# Stage 1 (perl): puts a space between any '|' and an adjacent digit, then
#   keeps the line with the smallest value, where the value is either the
#   token following "WER "/"SER " (kaldi compute-wer output) or a column
#   of an sclite "Mean" / "Sum/Avg" table row.
# Stage 2 (awk, FS="%WER"): if the line splits into exactly two parts,
#   prints "%WER" plus the second part first and the first part last.
# Stage 3 (awk, FS="Sum/Avg"): likewise swaps the two parts around
#   "Sum/Avg" when the line splits into exactly two parts.
# Stage 4 (awk): if the first field does not contain %WER, prefixes the
#   line with "%WER " followed by field 9.
# Stage 5 (sed): collapses runs of whitespace and strips a trailing ':'
#   or trailing ': |'.
perl -e 'while(<>){ 
    s/\|(\d)/\| $1/g; s/(\d)\|/$1 \|/g;
    if (m/[WS]ER (\S+)/ && (!defined $bestwer || $bestwer > $1)){ $bestwer = $1; $bestline=$_; } # kaldi "compute-wer" tool.
    elsif (m: (Mean|Sum/Avg|)\s*\|\s*\S+\s+\S+\s+\|\s+\S+\s+\S+\s+\S+\s+\S+\s+(\S+)\s+\S+\s+\|:
        && (!defined $bestwer || $bestwer > $2)){ $bestwer = $2; $bestline=$_; } }  # sclite.
   if (defined $bestline){ print $bestline; } ' | \
  awk 'BEGIN{ FS="%WER"; } { if(NF == 2) { print FS$2" "$1; } else { print $0; }}' | \
  awk 'BEGIN{ FS="Sum/Avg"; } { if(NF == 2) { print $2" "$1; } else { print $0; }}' | \
  awk '{ if($1!~/%WER/) { print "%WER "$9" "$0; } else { print $0; }}' | \
  sed -e 's|\s\s*| |g' -e 's|\:$||' -e 's|\:\s*\|\s*$||'


================================================
FILE: egs/utils/build_const_arpa_lm.sh
================================================
#!/usr/bin/env bash

# Copyright 2014  Guoguo Chen
# Apache 2.0

# This script reads in an Arpa format language model, and converts it into the
# ConstArpaLm format language model.

# begin configuration section
# end configuration section

# Set up the environment and parse any leading --options.
[ -f path.sh ] && . ./path.sh;

. utils/parse_options.sh

if [ $# != 3 ]; then
  echo "Usage: "
  echo "  $0 [options] <arpa-lm-path> <old-lang-dir> <new-lang-dir>"
  echo "e.g.:"
  echo "  $0 data/local/lm/3-gram.full.arpa.gz data/lang/ data/lang_test_tgmed"
  echo "Options"
  exit 1;
fi

export LC_ALL=C

arpa_lm=$1
old_lang=$2
new_lang=$3

# The new lang directory starts out as a copy of the old one.
mkdir -p "$new_lang"
cp -r "$old_lang"/* "$new_lang"

# Look up the integer ids of the OOV, <s> and </s> symbols; all three are
# passed to arpa-to-const-arpa below.
unk=$(cat "$old_lang/oov.int")
bos=$(grep "^<s>\s" "$old_lang/words.txt" | awk '{print $2}')
eos=$(grep "^</s>\s" "$old_lang/words.txt" | awk '{print $2}')
if [[ -z $bos || -z $eos ]]; then
  echo "$0: <s> and </s> symbols are not in $old_lang/words.txt"
  exit 1
fi
if [[ -z $unk ]]; then
  echo "$0: can't find oov symbol id in $old_lang/oov.int"
  exit 1
fi


# The gzipped ARPA file is decompressed and piped through
# utils/map_arpa_lm.pl (given the new words.txt) before being written out
# as G.carpa.
arpa-to-const-arpa --bos-symbol=$bos \
  --eos-symbol=$eos --unk-symbol=$unk \
  "gunzip -c $arpa_lm | utils/map_arpa_lm.pl $new_lang/words.txt|"  "$new_lang/G.carpa"  || exit 1;

exit 0;


================================================
FILE: egs/utils/combine_data.sh
================================================
#!/usr/bin/env bash
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
#           2014  David Snyder

# This script combines the data from multiple source directories into
# a single destination directory.

# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data for information
# about what these directories contain.

# Begin configuration section.
extra_files= # specify additional files in 'src-data-dir' to merge, ex. "file1 file2 ..."
skip_fix=false # skip the fix_data_dir.sh in the end
# End configuration section.

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# Need a destination plus at least one source directory.
if [ $# -lt 2 ]; then
  echo "Usage: combine_data.sh [--extra-files 'file1 file2'] <dest-data-dir> <src-data-dir1> <src-data-dir2> ..."
  echo "Note, files that don't appear in all source dirs will not be combined,"
  echo "with the exception of utt2uniq and segments, which are created where necessary."
  exit 1
fi

# After this shift, the positional parameters ($*) are the source dirs only.
dest=$1;
shift;

first_src=$1;

# NOTE: any existing destination directory is deleted and recreated.
rm -r $dest 2>/dev/null
mkdir -p $dest;

export LC_ALL=C

# Every source directory must contain utt2spk.
for dir in $*; do
  if [ ! -f $dir/utt2spk ]; then
    echo "$0: no such file $dir/utt2spk"
    exit 1;
  fi
done

# Check that frame_shift are compatible, where present together with features.
# Each directory that has both feats.scp and frame_shift is compared against
# the previous such directory; the last one found is remembered so that its
# frame_shift can be copied to the destination at the end.
dir_with_frame_shift=
for dir in $*; do
  if [[ -f $dir/feats.scp && -f $dir/frame_shift ]]; then
    if [[ $dir_with_frame_shift ]] &&
       ! cmp -s $dir_with_frame_shift/frame_shift $dir/frame_shift; then
      echo "$0:error: different frame_shift in directories $dir and " \
           "$dir_with_frame_shift. Cannot combine features."
      exit 1;
    fi
    dir_with_frame_shift=$dir
  fi
done

# W.r.t. utt2uniq file the script has different behavior compared to other files
# it is not compulsary for it to exist in src directories, but if it exists in
# even one it should exist in all. We will create the files where necessary
has_utt2uniq=false
for in_dir in $*; do
  if [ -f $in_dir/utt2uniq ]; then
    has_utt2uniq=true
    break
  fi
done

if $has_utt2uniq; then
  # we are going to create an utt2uniq file in the destdir
  for in_dir in $*; do
    if [ ! -f $in_dir/utt2uniq ]; then
      # we assume that utt2uniq is a one to one mapping
      cat $in_dir/utt2spk | awk '{printf("%s %s\n", $1, $1);}'
    else
      cat $in_dir/utt2uniq
    fi
  done | sort -k1 > $dest/utt2uniq
  echo "$0: combined utt2uniq"
else
  echo "$0 [info]: not combining utt2uniq as it does not exist"
fi
# some of the old scripts might provide utt2uniq as an extrafile, so just remove it
extra_files=$(echo "$extra_files"|sed -e "s/utt2uniq//g")

# segments are treated similarly to utt2uniq. If it exists in some, but not all
# src directories, then we generate segments where necessary.
has_segments=false
for in_dir in $*; do
  if [ -f $in_dir/segments ]; then
    has_segments=true
    break
  fi
done

if $has_segments; then
  for in_dir in $*; do
    if [ ! -f $in_dir/segments ]; then
      # The info message goes to stderr because stdout of this loop is the
      # data being piped into sort below.
      echo "$0 [info]: will generate missing segments for $in_dir" 1>&2
      utils/data/get_segments_for_data.sh $in_dir
    else
      cat $in_dir/segments
    fi
  done | sort -k1 > $dest/segments
  echo "$0: combined segments"
else
  echo "$0 [info]: not combining segments as it does not exist"
fi

# All remaining files (plus any --extra-files) are concatenated and sorted
# only when they are present in every source directory.
for file in utt2spk utt2lang utt2dur utt2num_frames reco2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do
  exists_somewhere=false
  absent_somewhere=false
  for d in $*; do
    if [ -f $d/$file ]; then
      exists_somewhere=true
    else
      absent_somewhere=true
      fi
  done

  if ! $absent_somewhere; then
    # pipefail is enabled just for this pipeline so that a failing cat is
    # caught by the "|| exit 1".
    set -o pipefail
    ( for f in $*; do cat $f/$file; done ) | sort -k1 > $dest/$file || exit 1;
    set +o pipefail
    echo "$0: combined $file"
  else
    if ! $exists_somewhere; then
      echo "$0 [info]: not combining $file as it does not exist"
    else
      echo "$0 [info]: **not combining $file as it does not exist everywhere**"
    fi
  fi
done

# spk2utt is regenerated from the combined utt2spk rather than merged.
utils/utt2spk_to_spk2utt.pl <$dest/utt2spk >$dest/spk2utt

if [[ $dir_with_frame_shift ]]; then
  cp $dir_with_frame_shift/frame_shift $dest
fi

# Unless --skip-fix true was given, run fix_data_dir.sh on the result.
if ! $skip_fix ; then
  utils/fix_data_dir.sh $dest || exit 1;
fi

exit 0


================================================
FILE: egs/utils/convert_slf.pl
================================================
#!/usr/bin/env perl

# Copyright 2014  Brno University of Technology (author Karel Vesely)
# Copyright 2013  Korbinian Riedhammer

# Convert a kaldi-lattice to HTK SLF format;  if given an output
# directory, each lattice will be put in an individual gzipped file.

# Internal representation of nodes, links:
# node hash:
# { W=>[word], t=>[time], n_out_arcs=>[number_of_outgoing_arcs] };
# (Time internally represented as integer number of frames.)
# link hash:
# { S=>[start_node], E=>[end_node], W=>[word], v=>[0], a=>[acoustic_score], l=>[graph_score] }
# 
# The HTK output supports:
# - words on links [default],
#   - simpler, same as in kaldi lattices, node-ids in output correspond to kaldi lattices
# - words on nodes,
#   - apart from original nodes, there are extra nodes containing the words.
#   - each original ark is replaced by word-node and two links, connecting it with original nodes.


use utf8;
use List::Util qw(max);

binmode(STDIN, ":encoding(utf8)");
binmode(STDOUT, ":encoding(utf8)");

# defaults
$framerate=0.01;
$wordtonode=0;

$usage="Convert kaldi lattices to HTK SLF (v1.1) format.\n".
       "Usage: convert_slf.pl [options] lat-file.txt [out-dir]\n".
       "  e.g. lattice-align-words lang/phones/word_boundary.int final.mdl 'ark:gunzip -c lat.gz |' ark,t:- | utils/int2sym.pl -f 3 lang/words.txt | $0 - slf/\n".
       "\n".
       "Options regarding the SLF output:\n".
       "  --frame-rate x  Frame rate to compute timing information (default: $framerate)\n".
       "  --word-to-node  Print the word symbols on nodes (adds extra nodes+links; default: words at links)\n".
       "\n";

# parse options: consume leading arguments that start with "--".
# (The argument count is compared numerically; a lone "-" meaning stdin
# does not match the pattern and is left as a positional argument.)
while (@ARGV > 0 and $ARGV[0] =~ m/^--/) {
  $param = shift @ARGV;
  if ($param eq "--frame-rate") { $framerate = shift @ARGV; }
  elsif ($param eq "--word-to-node") { $wordtonode = 1;}
  else {
    print STDERR "Unknown option $param\n";
    print STDERR "\n";
    print STDERR $usage;
    exit 1;
  }
}

# check positional arg count
if (@ARGV < 1 || @ARGV > 2) {
  print STDERR $usage;
  exit 1;
}

# store gzipped lattices individually to outdir:
$outdir = "";
if (@ARGV == 2) {
  $outdir = pop @ARGV;
  # List form of system() so the directory name is not re-parsed by a shell.
  unless (-d $outdir) { system("mkdir", "-p", $outdir); }
  unless (-d $outdir) {
    print STDERR "Could not create directory $outdir\n";
    exit 1;
  }
}
# or we'll print lattices to stdout:
if ($outdir eq "") {
  open(FH, ">-") or die "Could not write to stdout (???)\n";
}


### parse kaldi lattices:
#
# The input is read line by line and dispatched on the number of
# whitespace-separated columns:
#   1 column, no current utterance -> utterance id (start of a lattice)
#   1 column                       -> accepting state without weight
#   2 columns                      -> accepting state with weight
#   3 or 4 columns                 -> arc: start end word [weight-info]
#   0 columns (empty line)         -> end of lattice: finalize and print
# Per-lattice state is kept in the globals below and reset after printing.

$utt = "";
$arc = 0;
$latest_time = 0.0;
@links = ();
%nodes = ();
%nodes_extra = ();
%accepting_states = ();

# Two-argument open is intentional: a file name of "-" reads from stdin,
# as in the usage example.
open (FI, $ARGV[0]) or die "Could not read from file\n";
binmode(FI, ":encoding(utf8)");

while(<FI>) {
  chomp;

  @A = split /\s+/;

  if (@A == 1 and $utt eq "") {
    # new lattice
    $utt = $A[0];
    $nodes{0} = { W=>"!NULL", t=>0.0, n_out_arcs=>0 }; #initial node

  } elsif (@A == 1) {
    # accepting node without FST weight, store data for link to terminal super-state
    $accepting_states{$A[0]} = { W=>"!NULL", v=>0, a=>0, l=>0 };

  } elsif (@A == 2) {
    # accepting state with FST weight on it, again store data for the link
    ($s, $info) = @A;
    ($gs, $as, $ss) = split(/,/, $info);

    # kaldi saves -log, but HTK does it the other way round
    $gs *= -1;
    $as *= -1;

    # the state sequence is something like 1_2_4_56_45, get number of tokens after splitting by '_':
    $ss = scalar split(/_/, $ss);

    # update the end time
    die "Node $s not yet visited, is lattice sorted topologically? $utt" unless exists $nodes{$s}{t};
    $time_end = $nodes{$s}{t} + $ss;
    if ($latest_time < $time_end) { $latest_time = $time_end; }

    # add the link data
    $accepting_states{$A[0]} = { W=>"!NULL", v=>0, a=>$as, l=>$gs };

  } elsif (@A == 4 or @A == 3) {
    # FSA arc
    ($s, $e, $w, $info) = @A;
    if (defined $info and $info ne "") {
      ($gs, $as, $ss) = split(/,/, $info);
    } else {
      # 3-column arc: no weight info, so zero scores and zero duration.
      $gs = 0; $as = 0; $ss = "";
    }

    # rename epsilons to null
    $w = "!NULL" if $w eq "<eps>";

    # kaldi saves -log, but HTK does it the other way round
    $gs *= -1;
    $as *= -1;

    # the state sequence is something like 1_2_4_56_45, get number of tokens after splitting by '_':
    $ss = scalar split(/_/, $ss);

    # The start node must already be known.  This check has to come before
    # anything touches $nodes{$s}{...}, because that would autovivify the
    # entry and the check could then never fail.
    die "Node $s not yet visited, is lattice sorted topologically? $utt" unless exists $nodes{$s};

    # keep track of the number of outgoing arcs for each node
    # (later, we will connect sinks to the terminal state)
    $nodes{$s}{n_out_arcs} += 1;

    # keep track of timing
    $time_end = $nodes{$s}{t} + $ss;
    if ($latest_time < $time_end) { $latest_time = $time_end; }

    # sanity check on already existing node
    if (exists $nodes{$e}) {
      die "Node $e previously stored with different time ".$nodes{$e}{t}." now $time_end, $utt.\n"
       if $time_end ne $nodes{$e}{t};
    }

    # store internal representation of the arc
    if (not $wordtonode) {
      # The words on links, the lattice keeps its original structure,
      # add node; do not overwrite
      $nodes{$e} = { t=>$time_end, n_out_arcs=>0 } unless defined $nodes{$e};
      # add the link data
      push @links, { S=>$s, E=>$e, W=>$w, v=>0, a=>$as, l=>$gs };

    } else {
      # The problem here was that, if we have a node with several incoming links,
      # the links can have different words on it, so we cannot simply put word from
      # link into the node.
      #
      # The simple solution is:
      # each FST arc gets replaced by extra node with word and two links,
      # connecting it with original nodes.
      #
      # The lattice gets larger, and it is good to minimize the lattice during importing.
      #
      # During reading the FST, we don't know how many nodes there are in total,
      # so the extra nodes are stored separately, indexed by arc number,
      # and links have flags describing which type of node are they connected to.

      # add 'extra node' containing the word:
      $nodes_extra{$arc} = { W=>$w, t=>$time_end };
      # add 'original node'; do not overwrite
      $nodes{$e} = { W=>"!NULL", t=>$time_end, n_out_arcs=>0 } unless defined $nodes{$e};

      # add the link from 'original node' to 'extra node'
      push @links, { S=>$s, E=>$arc, W=>$w, v=>0, a=>$as, l=>$gs, to_extra_node=>1 };
      # add the link from 'extra node' to 'original node'
      push @links, { S=>$arc, E=>$e, W=>$w, v=>0, a=>0, l=>0, from_extra_node=>1 };

      # increase arc counter
      $arc++;
    }

  } elsif (@A == 0) { # end of lattice reading, we'll add terminal super-state, and print it soon...
    # find sinks
    %sinks = ();
    for $n (keys %nodes) {
      $sinks{$n} = 1 if ($nodes{$n}{n_out_arcs} == 0);
    }

    # sanity check: lattices need at least one sink!
    if (scalar keys %sinks == 0) {
      print STDERR "Error: $utt does not have at least one sink node-- cyclic lattice??\n";
    }

    # add terminal super-state,
    $last_node = max(keys(%nodes)) + 1;
    $nodes{$last_node} = { W=>"!NULL", t=>$latest_time };

    # connect all accepting states with terminal super-state,
    for $accept (sort { $a <=> $b } keys %accepting_states) {
      %a = %{$accepting_states{$accept}};
      push @links, { S=>$accept, E=>$last_node, W=>$a{W}, v=>$a{v}, a=>$a{a}, l=>$a{l} };
    }

    # connect also all sinks that are not accepting states,
    for $sink (sort { $a <=> $b } keys %sinks) {
      unless(exists($accepting_states{$sink})) {
        print STDERR "WARNING: detected sink node which is not accepting state in lattice $utt, incomplete lattice?\n";
        # The link must start at the sink itself; the loop variable of the
        # accepting-state loop above is no longer meaningful here.
        push @links, { S=>$sink, E=>$last_node, W=>"!NULL", v=>0, a=>0, l=>0 };
      }
    }

    # print out the lattice;  open file handle first
    unless ($outdir eq "") {
      open(FH, "|-", "gzip -c > $outdir/$utt.lat.gz") or die "Could not write to $outdir/$utt.lat.gz\n";
      binmode(FH, ":encoding(utf8)");
    }

    if (not $wordtonode) {
      # print lattice with words on links:

      # header
      print FH "VERSION=1.1\n";
      print FH "UTTERANCE=$utt\n";
      print FH "N=".(keys %nodes)."\tL=".(@links)."\n";

      # nodes (times are stored in frames and converted with $framerate)
      for $n (sort { $a <=> $b } keys %nodes) {
        printf FH "I=%d\tt=%.2f\n", $n, $nodes{$n}{t}*$framerate;
      }

      # links/arks
      for $i (0 .. $#links) {
        %l = %{$links[$i]}; # get hash representing the link...
        printf FH "J=$i\tS=%d\tE=%d\tW=%s\tv=%f\ta=%f\tl=%f\n", $l{S}, $l{E}, $l{W}, $l{v}, $l{a}, $l{l};
      }

    } else {
      # print lattice with words in the nodes:

      # header
      print FH "VERSION=1.1\n";
      print FH "UTTERANCE=$utt\n";
      print FH "N=".(scalar(keys(%nodes))+scalar(keys(%nodes_extra)))."\tL=".(@links)."\n";

      # number of original nodes, offset of extra_nodes
      $node_id_offset = scalar keys %nodes;

      # nodes
      for $n (sort { $a <=> $b } keys %nodes) {
        printf FH "I=%d\tW=%s\tt=%.2f\n", $n, $nodes{$n}{W}, $nodes{$n}{t}*$framerate;
      }
      # extra nodes
      for $n (sort { $a <=> $b } keys %nodes_extra) {
        printf FH "I=%d\tW=%s\tt=%.2f\n", $n+$node_id_offset, $nodes_extra{$n}{W}, $nodes_extra{$n}{t}*$framerate;
      }

      # links/arks; ends that refer to an extra node are shifted by the offset
      for $i (0 .. $#links) {
        %l = %{$links[$i]}; # get hash representing the link...
        if ($l{from_extra_node}) { $l{S} += $node_id_offset; }
        if ($l{to_extra_node}) { $l{E} += $node_id_offset; }
        printf FH "J=$i\tS=%d\tE=%d\tv=%f\ta=%f\tl=%f\n", $l{S}, $l{E}, $l{v}, $l{a}, $l{l};
      }
    }

    print FH "\n";

    # close handle if it was a file
    close(FH) unless ($outdir eq "");

    # clear data
    $utt = "";
    $arc = 0;
    $latest_time = 0.0;
    @links = ();
    %nodes = ();
    %nodes_extra = ();
    %accepting_states = ();
  } else {
    die "Unexpected column number of input line\n$_";
  }
}

# $utt is reset to "" each time a lattice has been printed, so a non-empty
# value here means the input ended in the middle of a lattice.  This must
# be a string comparison: utterance ids are generally not numeric, and a
# numeric comparison would treat most of them as equal to "".
if ($utt ne "") {
  print STDERR "Last lattice was not printed as it might be incomplete?  Missing empty line?\n";
}


================================================
FILE: egs/utils/convert_slf_parallel.sh
================================================
#!/usr/bin/env bash
# Copyright Brno University of Technology (Author: Karel Vesely) 2014.  Apache 2.0.

# This script converts lattices to HTK format compatible with other toolkits.
# We can choose to put words to nodes or arcs, as both is valid in the SLF format.

# begin configuration section.
cmd=run.pl
dirname=lats-in-htk-slf
parallel_opts="--max-jobs-run 50" # We should limit disk stress
word_to_node=false # Words in arcs or nodes? [default:arcs]
#end configuration section.

echo "$0 $@"

[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1;

if [ $# -ne 3 ]; then
  echo "Usage: $0 [options] <data-dir> <lang-dir|graph-dir> <decode-dir>"
  echo " Options:"
  echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
  echo "    --word-to-node (true|false)     # put word symbols on nodes instead of links (def.: false)."
  echo "    --parallel-opts STR             # parallelization options (def.: '--max-jobs-run 50')."
  echo "e.g.:"
  echo "$0 data/dev data/lang exp/tri4a/decode_dev"
  exit 1;
fi

data=$1
lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied.
dir=$3

model=$(dirname $dir)/final.mdl # assume model one level up from decoding dir.

for f in $lang/words.txt $lang/phones/align_lexicon.int $model $dir/lat.1.gz; do
  [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1;
done

# Create the output directory together with its log sub-directory.
mkdir -p $dir/$dirname/log || exit 1

echo "$0: Converting lattices into '$dir/$dirname'"

# Words in arcs or nodes? [default:arcs]
# Start from an empty value so nothing inherited from the environment can
# leak into the convert_slf.pl command line.
word_to_node_arg=
$word_to_node && word_to_node_arg="--word-to-node"

nj=$(cat $dir/num_jobs)

# convert the lattices (individually, gzipped)
$cmd $parallel_opts JOB=1:$nj $dir/$dirname/log/lat_convert.JOB.log \
  mkdir -p $dir/$dirname/JOB/ '&&' \
  lattice-align-words-lexicon --output-error-lats=true --output-if-empty=true \
    $lang/phones/align_lexicon.int $model "ark:gunzip -c $dir/lat.JOB.gz |" ark,t:- \| \
  utils/int2sym.pl -f 3 $lang/words.txt \| \
  utils/convert_slf.pl $word_to_node_arg - $dir/$dirname/JOB/ || exit 1

# make list of lattices
# (the pattern is quoted so the shell cannot expand it against files in
# the current directory before find sees it)
find -L $PWD/$dir/$dirname -name '*.lat.gz' > $dir/$dirname/lat_htk.scp || exit 1

# check number of lattices:
nseg=$(cat $data/segments | wc -l)
nlat_out=$(cat $dir/$dirname/lat_htk.scp | wc -l)
echo "segments $nseg, saved-lattices $nlat_out"
#
[ $nseg -ne $nlat_out ] && echo "WARNING: missing $((nseg-nlat_out)) lattices for some segments!" \
  && exit 1

echo "success, converted lats to HTK : $PWD/$dir/$dirname/lat_htk.scp"
exit 0


================================================
FILE: egs/utils/copy_data_dir.sh
================================================
#!/usr/bin/env bash

# Copyright 2013  Johns Hopkins University (author: Daniel Povey)
# Apache 2.0

# This script operates on a directory, such as in data/train/,
# that contains some subset of the following files:
#  feats.scp
#  wav.scp
#  vad.scp
#  spk2utt
#  utt2spk
#  text
#
# It copies to another directory, possibly adding a specified prefix or a suffix
# to the utterance and/or speaker names.  Note, the recording-ids stay the same.
#


# begin configuration section
spk_prefix=
utt_prefix=
spk_suffix=
utt_suffix=
validate_opts=   # should rarely be needed.
# end configuration section

. utils/parse_options.sh

if [ $# != 2 ]; then
  echo "Usage: "
  echo "  $0 [options] <srcdir> <destdir>"
  echo "e.g.:"
  echo " $0 --spk-prefix=1- --utt-prefix=1- data/train data/train_1"
  echo "Options"
  echo "   --spk-prefix=<prefix>     # Prefix for speaker ids, default empty"
  echo "   --utt-prefix=<prefix>     # Prefix for utterance ids, default empty"
  echo "   --spk-suffix=<suffix>     # Suffix for speaker ids, default empty"
  echo "   --utt-suffix=<suffix>     # Suffix for utterance ids, default empty"
  exit 1;
fi


export LC_ALL=C

srcdir=$1
destdir=$2

# utt2spk is the one file the source directory must have.
if [ ! -f $srcdir/utt2spk ]; then
  echo "copy_data_dir.sh: no such file $srcdir/utt2spk"
  exit 1;
fi

if [ "$destdir" == "$srcdir" ]; then
  echo "$0: this script requires <srcdir> and <destdir> to be different."
  exit 1
fi

# From here on, any failing command aborts the script.
set -e;

mkdir -p $destdir

# Temporary maps "old-id new-id" for utterances and speakers, where the new
# id is the old id with the configured prefix and suffix attached.  They are
# fed to utils/apply_map.pl below.
cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/utt_map
cat $srcdir/spk2utt | awk -v p=$spk_prefix -v s=$spk_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/spk_map

# utt2uniq: if the source has none, one is created only when utterance ids
# are actually being renamed (new-id -> original id); if the source has
# one, its first column is renamed and its second column kept.
if [ ! -f $srcdir/utt2uniq ]; then
  if [[ ! -z $utt_prefix  ||  ! -z $utt_suffix ]]; then
    cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $1);}' > $destdir/utt2uniq
  fi
else
  cat $srcdir/utt2uniq | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $2);}' > $destdir/utt2uniq
fi

# utt2spk: rename column 1 with the utterance map and column 2 with the
# speaker map; spk2utt is then regenerated from the result.
cat $srcdir/utt2spk | utils/apply_map.pl -f 1 $destdir/utt_map  | \
  utils/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk

utils/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt

# Copy each optional file that exists in the source.  Files keyed by
# utterance id have their first column renamed through utt_map, files
# keyed by speaker id through spk_map, and files keyed by recording id
# (or not keyed at all) are copied unchanged.

if [ -f $srcdir/feats.scp ]; then
  utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/feats.scp >$destdir/feats.scp
fi

if [ -f $srcdir/vad.scp ]; then
  utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/vad.scp >$destdir/vad.scp
fi

if [ -f $srcdir/segments ]; then
  # With a segments file, wav.scp is copied as-is and only the segment
  # (utterance) ids are renamed.
  utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments >$destdir/segments
  cp $srcdir/wav.scp $destdir
else # no segments->wav indexed by utt.
  if [ -f $srcdir/wav.scp ]; then
    utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/wav.scp >$destdir/wav.scp
  fi
fi

if [ -f $srcdir/reco2file_and_channel ]; then
  cp $srcdir/reco2file_and_channel $destdir/
fi

if [ -f $srcdir/text ]; then
  utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text
fi
# utt2lang is keyed by utterance id like text/utt2dur, so it is renamed
# the same way rather than being dropped from the copy.
if [ -f $srcdir/utt2lang ]; then
  utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2lang >$destdir/utt2lang
fi
if [ -f $srcdir/utt2dur ]; then
  utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2dur >$destdir/utt2dur
fi
if [ -f $srcdir/utt2num_frames ]; then
  utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2num_frames >$destdir/utt2num_frames
fi
if [ -f $srcdir/reco2dur ]; then
  # Copied unchanged when a segments file exists; otherwise its first
  # column is renamed through utt_map.
  if [ -f $srcdir/segments ]; then
    cp $srcdir/reco2dur $destdir/reco2dur
  else
    utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/reco2dur >$destdir/reco2dur
  fi
fi
if [ -f $srcdir/spk2gender ]; then
  utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender
fi
if [ -f $srcdir/cmvn.scp ]; then
  utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/cmvn.scp >$destdir/cmvn.scp
fi
for f in frame_shift stm glm ctm; do
  if [ -f $srcdir/$f ]; then
    cp $srcdir/$f $destdir
  fi
done

# The id maps were only needed while copying.
rm $destdir/spk_map $destdir/utt_map

echo "$0: copied data from $srcdir to $destdir"

# Files found in the destination that have no counterpart in the source
# are moved aside into $destdir/.backup.
for f in feats.scp cmvn.scp vad.scp utt2lang utt2uniq utt2dur utt2num_frames text wav.scp reco2file_and_channel frame_shift stm glm ctm; do
  if [ -f $destdir/$f ] && [ ! -f $srcdir/$f ]; then
    echo "$0: file $f exists in dest $destdir but not in src $srcdir.  Moving it to"
    echo " ... $destdir/.backup/$f"
    mkdir -p $destdir/.backup
    mv $destdir/$f $destdir/.backup/
  fi
done


# Tell the validator not to expect feats.scp / text if the source had none.
[ ! -f $srcdir/feats.scp ] && validate_opts="$validate_opts --no-feats"
[ ! -f $srcdir/text ] && validate_opts="$validate_opts --no-text"

utils/validate_data_dir.sh $validate_opts $destdir


================================================
FILE: egs/utils/create_data_link.pl
================================================
#!/usr/bin/env perl

# Copyright 2013  Guoguo Chen
#           2014  Johns Hopkins University (author: Daniel Povey)
# Apache 2.0.
#
# This script distributes data onto different file systems by making symbolic
# links. It is supposed to use together with utils/create_split_dir.pl, which
# creates a "storage" directory that links to different file systems.
#
# If a sub-directory egs/storage does not exist, it does nothing. If it exists,
# then it selects pseudo-randomly a number from those available in egs/storage/*
# creates a link such as
#
#   egs/egs.3.4.ark -> storage/4/egs.3.4.ark
#
use strict;
use warnings;
use File::Basename;
use File::Spec;
use Getopt::Long;

# Returns the greatest common divisor of two non-negative integers, using the
# remainder-based Euclidean algorithm.  Unlike a subtraction-based loop this
# terminates when one of the arguments is zero (GetGCD(0, n) == n) and needs
# only a logarithmic number of steps.
sub GetGCD {
  my ($x, $y) = @_;
  while ($y != 0) {
    ($x, $y) = ($y, $x % $y);
  }
  return $x;
}

my $Usage = <<EOU;
create_data_link.pl:
This script distributes data onto different file systems by making symbolic
links. It is supposed to use together with utils/create_split_dir.pl, which
creates a "storage" directory that links to different file systems.

If a sub-directory foo/storage does not exist, it does nothing. If it exists,
then it selects pseudo-randomly a number from those available in foo/storage/*
creates a link such as

  foo/egs.3.4.ark -> storage/4/egs.3.4.ark

Usage: utils/create_data_link.pl <data-archive1> [<data-archive2> ... ]
 e.g.: utils/create_data_link.pl foo/bar/egs.3.4.ark foo/bar/egs.3.5.ark
 (note: the dirname, e.g. foo/bar/, must be the same in all cases).

See also utils/remove_data_links.sh
EOU

GetOptions();

if (@ARGV == 0) {
  die $Usage;
}

# All arguments must share one directory; the first argument defines it.
my $example_fullpath = $ARGV[0];

# Check if the storage has been created. If not, do nothing.
my $dirname = dirname($example_fullpath);
if (! -d "$dirname/storage") {
  exit(0);
}

# Storage exists, create symbolic links in the next few steps.

# First, get a list of the available storage directories, and check if they are
# properly created (they must be numbered 1..N without gaps).
opendir(my $dh, "$dirname/storage/") || die "$0: Fail to open $dirname/storage/\n";
my @storage_dirs = grep(/^[0-9]+$/, readdir($dh));
closedir($dh);
my $num_storage = scalar(@storage_dirs);
if ($num_storage == 0) {
  # Without this check the modulus below would die with the unhelpful
  # message "Illegal modulus zero".
  die "$0: $dirname/storage/ contains no numbered storage directories\n";
}
for (my $x = 1; $x <= $num_storage; $x++) {
  (-d "$dirname/storage/$x") || die "$0: $dirname/storage/$x does not exist\n";
}

# Second, get the coprime list: the numbers in 1..N that are coprime to N.
# They are used as multipliers when hashing the numbers in the file name.
my @coprimes;
for (my $n = 1; $n <= $num_storage; $n++) {
  if (GetGCD($n, $num_storage) == 1) {
    push(@coprimes, $n);
  }
}

my $ret = 0;

foreach my $fullpath (@ARGV) {
  if ($dirname ne dirname($fullpath)) {
    die "Mismatch in directory names of arguments: $example_fullpath versus $fullpath";
  }

  # Finally, work out the directory index where we should put the data to.
  # Every run of digits in the base name is multiplied by a coprime (cycling
  # through the list) and the sum is reduced modulo the number of storage
  # directories.
  my $basename = basename($fullpath);
  my $filename_numbers = $basename;
  $filename_numbers =~ s/[^0-9]+/ /g;
  my @filename_numbers = split(" ", $filename_numbers);
  my $total = 0;
  my $index = 0;
  foreach my $x (@filename_numbers) {
    if ($index >= scalar(@coprimes)) {
      $index = 0;
    }
    $total += $x * $coprimes[$index];
    $index++;
  }
  my $dir_index = $total % $num_storage + 1;

  # Make the symbolic link.  The -l test is needed in addition to -e because
  # -e follows symbolic links: a link whose target has not been written yet
  # (the usual situation right after this script has run) is "dangling", -e
  # reports it as non-existent, and symlink() would then fail.
  if (-e $fullpath || -l $fullpath) {
    unlink($fullpath);
  }
  if (symlink("storage/$dir_index/$basename", $fullpath) != 1) { # failure
    $ret = 1;  # will exit with error status.
  }
}

exit($ret);

## testing:
# rm -rf foo bar
# mkdir -p bar/{1,2,3,4}
# mkdir -p foo/storage
# for x in 1 2 3 4; do ln -s ../../bar/$x foo/storage/$x; done
# utils/create_data_link.pl foo/1.3.ark  foo/2.3.ark
# ls -l foo
# total 0
# lrwxrwxrwx 1 dpovey fax 17 Sep  2 17:41 1.3.ark -> storage/3/1.3.ark
# lrwxrwxrwx 1 dpovey fax 17 Sep  2 17:41 2.3.ark -> storage/4/2.3.ark
# drwxr-xr-x 2 dpovey fax 38 Sep  2 17:40 storage


================================================
FILE: egs/utils/create_split_dir.pl
================================================
#!/usr/bin/env perl

# Copyright 2013  Guoguo Chen
# Apache 2.0.
#
# This script creates storage directories on different file systems, and creates
# symbolic links to those directories. For example, a command
#
#   utils/create_split_dir.pl /export/gpu-0{3,4,5}/egs/storage egs/storage
#
# will mkdir -p all of those directories, and will create links
#
#   egs/storage/1 -> /export/gpu-03/egs/storage
#   egs/storage/2 -> /export/gpu-04/egs/storage
#   ...
#
use strict;
use warnings;
use File::Spec;
use Getopt::Long;

my $Usage = <<EOU;
create_split_dir.pl:
This script creates storage directories on different file systems, and creates
symbolic links to those directories.

Usage: utils/create_split_dir.pl <actual_storage_dirs> <pseudo_storage_dir>
 e.g.: utils/create_split_dir.pl /export/gpu-0{3,4,5}/egs/storage egs/storage

Allowed options:
  --suffix    : Common suffix to <actual_storage_dirs>    (string, default = "")

See also create_data_link.pl, which is intended to work with the resulting
directory structure, and remove_data_links.sh
EOU

my $suffix="";
GetOptions('suffix=s' => \$suffix);

if (@ARGV < 2) {
  die $Usage;
}

# $ans stays 1 as long as every symlink() call succeeds; it determines the
# exit status.
my $ans = 1;

# The last argument is the pseudo storage directory that will hold the links;
# all preceding arguments are the real storage locations.
my $dir = pop(@ARGV);
system("mkdir -p $dir 2>/dev/null");

my @all_actual_storage = ();
foreach my $file (@ARGV) {
  push @all_actual_storage, File::Spec->rel2abs($file . "/" . $suffix);
}

# Links are numbered from 1 in the order the storage directories were given.
my $index = 1;
foreach my $actual_storage (@all_actual_storage) {
  my $pseudo_storage = "$dir/$index";

  # If the symbolic link already exists, leave it untouched and move on.
  if (-l $pseudo_storage) {
    print STDERR "$0: link $pseudo_storage already exists, not overwriting.\n";
    $index++;
    next;
  }

  # Create the destination directory and make the link.
  system("mkdir -p $actual_storage 2>/dev/null");
  if ($? != 0) {
    print STDERR "$0: error creating directory $actual_storage\n";
    exit(1);
  }
  { # create a README file for easier deletion.
    # The README is only a convenience, so failing to write it is reported
    # but does not stop the link from being created.
    if (open(my $readme, ">", "$actual_storage/README.txt")) {
      my $storage_dir = File::Spec->rel2abs($dir);
      print $readme "# This directory is linked from $storage_dir, as part of Kaldi striped data\n";
      print $readme "# The full list of directories where this data resides is:\n";
      foreach my $d (@all_actual_storage) {
        print $readme "$d\n";
      }
      close($readme);
    } else {
      print STDERR "$0: warning: could not write $actual_storage/README.txt: $!\n";
    }
  }
  my $ret = symlink($actual_storage, $pseudo_storage);

  # Process the returned values
  $ans = $ans && $ret;
  if (! $ret) {
    print STDERR "Error linking $actual_storage to $pseudo_storage\n";
  }

  $index++;
}

exit($ans == 1 ? 0 : 1);


================================================
FILE: egs/utils/ctm/convert_ctm.pl
================================================
#!/usr/bin/env perl

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.

# This takes as standard input a ctm file that's "relative to the utterance",
# i.e. times are measured relative to the beginning of the segments, and it
# uses a "segments" file (format:
#   utterance-id recording-id start-time end-time
# ) and a "reco2file_and_channel" file (format:
#   recording-id basename-of-file channel
# ) to write to standard output a ctm whose begin times are relative to the
# start of the recording and whose first two fields are the file basename and
# the channel.
#
# With --skip-unknown, ctm lines whose utterance-id is not in the segments
# file are dropped instead of being treated as a fatal error.
# The exit status is 1 if no ctm line was converted, 0 otherwise.

$skip_unknown=undef;
# Guard on @ARGV so that running with no arguments reaches the usage message
# without comparing an undefined value.
if ( @ARGV > 0 && $ARGV[0] eq "--skip-unknown" ) {
  $skip_unknown=1;
  shift @ARGV;
}

if (@ARGV < 2 || @ARGV > 3) {
  print STDERR "Usage: convert_ctm.pl <segments-file> <reco2file_and_channel-file> [<utterance-ctm>] > real-ctm\n";
  exit(1);
}

$segments = shift @ARGV;
$reco2file_and_channel = shift @ARGV;

# Load the segments: utterance -> recording, begin time, end time.
open(S, "<$segments") || die "opening segments file $segments";
while(<S>) {
  @A = split(" ", $_);
  @A == 4 || die "Bad line in segments file: $_";
  ($utt, $recording_id, $begin_time, $end_time) = @A;
  $utt2reco{$utt} = $recording_id;
  $begin{$utt} = $begin_time;
  $end{$utt} = $end_time;
}
close(S);
# Load the recording -> (file basename, channel) map.
open(R, "<$reco2file_and_channel") || die "open reco2file_and_channel file $reco2file_and_channel";
while(<R>) {
  @A = split(" ", $_);
  @A == 3 || die "Bad line in reco2file_and_channel file: $_";
  ($recording_id, $file, $channel) = @A;
  $reco2file{$recording_id} = $file;
  $reco2channel{$recording_id} = $channel;
}
close(R);


# Now process the ctm file, which is either the standard input or the third
# command-line argument (whatever is left in @ARGV is read by <>).
$num_done = 0;
while(<>) {
  @A= split(" ", $_);
  ( @A == 5 || @A == 6 ) || die "Unexpected ctm format: $_";
  # lines look like:
  # <utterance-id> 1 <begin-time> <length> <word> [ confidence ]
  ($utt, $one, $wbegin, $wlen, $w, $conf) = @A;
  $reco = $utt2reco{$utt};
  if (!defined $reco) {
      next if defined $skip_unknown;
      die "Utterance-id $utt not defined in segments file $segments";
  }
  $file = $reco2file{$reco};
  $channel = $reco2channel{$reco};
  if (!defined $file || !defined $channel) {
    die "Recording-id $reco not defined in reco2file_and_channel file $reco2file_and_channel";
  }
  $b = $begin{$utt};
  $e = $end{$utt};
  $wbegin_r = $wbegin + $b; # Make it relative to beginning of the recording.
  $wbegin_r = sprintf("%.2f", $wbegin_r);
  $wlen = sprintf("%.2f", $wlen);
  if (defined $conf) {
    $line = "$file $channel $wbegin_r $wlen $w $conf\n";
  } else {
    $line = "$file $channel $wbegin_r $wlen $w\n";
  }
  # $e is the end time of the utterance's segment; 0.01 allows for rounding.
  if ($wbegin_r + $wlen > $e + 0.01) {
    print STDERR "Warning: word appears to be past end of recording; line is $line";
  }
  print $line; # goes to stdout.
  $num_done++;
}

if ($num_done == 0) { exit 1; } else { exit 0; }

__END__

# Test example [also test it without the 0.5's]
echo utt reco 10.0 20.0 > segments
echo reco file A > reco2file_and_channel
echo utt 1 8.0 1.0 word 0.5 > ctm_in
echo file A 18.00 1.00 word 0.5 > ctm_out
utils/convert_ctm.pl segments reco2file_and_channel ctm_in | cmp - ctm_out || echo error
rm segments reco2file_and_channel ctm_in ctm_out


================================================
FILE: egs/utils/ctm/fix_ctm.sh
================================================
#! /bin/bash

# Usage: fix_ctm.sh <stm-file> <ctm-file>
#
# Compares the set of first-column ids in the STM with the set in the CTM.
# If the STM has more ids than the CTM, a placeholder line
#   <id> 1 0 0 EMPTY_RECOGNIZED_PHRASE
# is appended to the CTM for every id that is in the STM but not in the CTM.
# Exits 1 if the CTM has more ids than the STM, 0 otherwise.

if [ $# -ne 2 ]; then
  echo "Usage: $0 <stm-file> <ctm-file>" 1>&2
  exit 1
fi

stmfile=$1
ctmfile=$2

# Prints the number of lines in the string $1.  An empty string counts as 0
# lines; piping it through "echo | wc -l" would report 1, which made an empty
# CTM look like it already contained one id.
count_lines() {
  if [ -z "$1" ]; then
    echo 0
  else
    echo "$1" | wc -l
  fi
}

segments_stm=`cat "$stmfile" | cut -f 1 -d ' ' | sort -u`
segments_ctm=`cat "$ctmfile" | cut -f 1 -d ' ' | sort -u`

segments_stm_count=`count_lines "$segments_stm"`
segments_ctm_count=`count_lines "$segments_ctm"`

#echo $segments_stm_count
#echo $segments_ctm_count

if [ "$segments_stm_count" -gt "$segments_ctm_count"  ] ; then
  # Lines prefixed with "<" in the diff are ids present only in the STM.
  pp=$( diff <(echo "$segments_stm") <(echo "$segments_ctm" ) | grep "^<" | sed "s/^< *//g")
  (
    for elem in $pp ; do
      echo "$elem 1 0 0 EMPTY_RECOGNIZED_PHRASE"
    done
  ) >> "$ctmfile"
  echo "FIXED CTM FILE"
  exit 0
elif [ "$segments_stm_count" -lt "$segments_ctm_count"  ] ; then
  echo "Segment STM count: $segments_stm_count"
  echo "Segment CTM count: $segments_ctm_count"
  echo "FAILURE FIXING CTM FILE"
  exit 1
else
  exit 0
fi


================================================
FILE: egs/utils/ctm/resolve_ctm_overlaps.py
================================================
#! /usr/bin/env python

# Copyright 2014  Johns Hopkins University (Authors: Daniel Povey)
#           2014  Vijayaditya Peddinti
#           2016  Vimal Manohar
# Apache 2.0.

"""
Script to combine ctms with overlapping segments.
The current approach is very simple. It ignores the words,
which are hypothesized in the half of the overlapped region
that is closer to the utterance boundary.
So if there are two segments
in the region 0s to 30s and 25s to 55s, with overlap of 5s,
the last 2.5s of the first utterance i.e. from 27.5s to 30s is truncated
and the first 2.5s of the second utterance i.e. from 25s to 27.5s is truncated.
"""

from __future__ import print_function
from __future__ import division
import argparse
import collections
import logging

from collections import defaultdict

# Module-wide logging: a single stream handler at INFO level whose records
# carry the timestamp, source location, function name and level.
handler = logging.StreamHandler()
handler.setLevel(logging.INFO)
formatter = logging.Formatter(
    '%(asctime)s [%(pathname)s:%(lineno)s - %(funcName)s - '
    '%(levelname)s ] %(message)s')
handler.setFormatter(formatter)

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.addHandler(handler)


def get_args():
    """Parse and return the command-line arguments.

    The positional arguments 'segments', 'ctm_in' and 'ctm_out' are opened
    by argparse and returned as file objects.  A --verbose value above 2
    switches the module logger and its handler to DEBUG level.
    """

    description = """ Python script to resolve overlaps in ctms.  May be used with
                utils/data/subsegment_data_dir.sh. """
    # The text must be passed by keyword: the first positional parameter of
    # ArgumentParser is 'prog', so passing it positionally replaced the
    # program name in the usage line with this whole paragraph.
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('segments', type=argparse.FileType('r'),
                        help='use segments to resolve overlaps')
    parser.add_argument('ctm_in', type=argparse.FileType('r'),
                        help='input_ctm_file')
    parser.add_argument('ctm_out', type=argparse.FileType('w'),
                        help='output_ctm_file')
    parser.add_argument('--verbose', type=int, default=0,
                        help="Higher value for more verbose logging.")
    args = parser.parse_args()

    if args.verbose > 2:
        logger.setLevel(logging.DEBUG)
        handler.setLevel(logging.DEBUG)

    return args


def read_segments(segments_file):
    """Read a segments file and close it.

    Each line must have 4 or 5 whitespace-separated fields, of which the
    first four are: utterance-id recording-id start-time end-time.

    Returns two dictionaries:
        {utterance-id: (recording_id, start_time, end_time)}
        {recording_id: list-of-utterances}

    Raises:
        ValueError: if a line does not have 4 or 5 fields, or a time is not
            a number.
    """
    segments = {}
    reco2utt = defaultdict(list)

    num_lines = 0
    for line in segments_file:
        num_lines += 1
        parts = line.strip().split()
        # Explicit check rather than an assert, which is removed when Python
        # runs with -O and would let malformed lines through.
        if len(parts) not in (4, 5):
            raise ValueError(
                "Bad line {0} in segments file {1}: expected 4 or 5 fields "
                "but got {2}: {3!r}".format(
                    num_lines, segments_file.name, len(parts), line))
        segments[parts[0]] = (parts[1], float(parts[2]), float(parts[3]))
        reco2utt[parts[1]].append(parts[0])

    logger.info("Read %d lines from segments file %s",
                num_lines, segments_file.name)
    segments_file.close()

    return segments, reco2utt


def read_ctm(ctm_file, segments):
    """Load a CTM file, grouping its lines per utterance, and close it.

    Arguments:
        ctm_file - open file whose lines look like
            <utterance-id> <channel> <start-time> <duration> <word> [<conf>]
        segments - dictionary returned by read_segments(); used to look up
            the recording that each utterance belongs to.

    Returns a dictionary {(recording, utterance): ctm_lines} where ctm_lines
    is the list of CTM lines of that utterance in file order, each stored as
        [utterance, channel, start_time, duration, hyp_word, ...]
    with start_time and duration converted to float and all remaining
    fields kept as strings.
    """
    ctms = {}

    line_count = 0
    for raw_line in ctm_file:
        line_count += 1
        fields = raw_line.split()

        utt_id = fields[0]
        key = (segments[utt_id][0], utt_id)

        entry = [fields[0], fields[1], float(fields[2]), float(fields[3])]
        entry.extend(fields[4:])
        ctms.setdefault(key, []).append(entry)

    logger.info("Read %d lines from CTM %s", line_count, ctm_file.name)

    ctm_file.close()
    return ctms


def resolve_overlaps(ctms, segments):
    """Resolve overlaps within segments of the same recording.

    For each pair of consecutive utterances, the words of the current
    utterance whose midpoint lies beyond the middle of the overlap region
    are dropped, and so are the words of the next utterance whose midpoint
    lies before it.  If the next utterance is entirely contained in the
    current one, the current utterance is kept whole and the next one is
    dropped.

    Returns new lines of CTM for the recording, as a flat list.
    The list passed in as 'ctms' is not modified.

    Arguments:
        ctms - The CTM lines for a single recording, one list per utterance,
            ordered by the start time of the utterances.
            The format is the following:
            [[(utteranceA, channelA, start_time1, duration1, hyp_word1, conf1),
              (utteranceA, channelA, start_time2, duration2, hyp_word2, conf2),
              ...
              (utteranceA, channelA, start_timeN, durationN, hyp_wordN, confN)
             ],
             [(utteranceB, channelB, start_time1, duration1, hyp_word1, conf1),
              (utteranceB, channelB, start_time2, duration2, hyp_word2, conf2),
              ...],
             ...
             [...
              (utteranceZ, channelZ, start_timeN, durationN, hyp_wordN, confN)]
            ]
        segments - Dictionary containing the output of read_segments()
            { utterance_id: (recording_id, start_time, end_time) }

    Raises:
        RuntimeError: if 'ctms' is empty.
        ValueError: if the utterances are not in a consistent order or not
            sorted by start time.
    """
    if len(ctms) == 0:
        raise RuntimeError('CTMs for recording is empty. '
                           'Something wrong with the input ctms')

    # Entries of the list are replaced while iterating (trimmed or emptied
    # utterances), so work on a shallow copy to leave the caller's list alone.
    ctms = list(ctms)
    total_ctm = []

    # First column of first line in CTM for first utterance
    next_utt = ctms[0][0][0]
    for utt_index, ctm_for_cur_utt in enumerate(ctms):
        if utt_index == len(ctms) - 1:
            break

        if len(ctm_for_cur_utt) == 0:
            # This utterance was emptied by the previous iteration.
            next_utt = ctms[utt_index + 1][0][0]
            continue

        cur_utt = ctm_for_cur_utt[0][0]
        if cur_utt != next_utt:
            logger.error(
                "Current utterance %s is not the same as the next "
                "utterance %s in previous iteration.\n"
                "CTM is not sorted by utterance-id?",
                cur_utt, next_utt)
            raise ValueError

        # Assumption here is that the segments are written in
        # consecutive order?
        ctm_for_next_utt = ctms[utt_index + 1]
        next_utt = ctm_for_next_utt[0][0]
        if segments[next_utt][1] < segments[cur_utt][1]:
            logger.error(
                "Next utterance %s <= Current utterance %s. "
                "CTM is not sorted by start-time of utterance-id.",
                next_utt, cur_utt)
            raise ValueError

        try:
            # length of this utterance
            window_length = segments[cur_utt][2] - segments[cur_utt][1]

            # overlap of this segment with the next segment
            # i.e. current_utterance_end_time - next_utterance_start_time
            # Note: It is possible for this to be negative when there is
            # actually no overlap between consecutive segments.
            try:
                overlap = segments[cur_utt][2] - segments[next_utt][1]
            except KeyError:
                logger.error("Could not find utterance %s in segments",
                             next_utt)
                raise

            if overlap > 0 and segments[next_utt][2] <= segments[cur_utt][2]:
                # Next utterance is entirely within this utterance.
                # So we leave this ctm as is and make the next one empty.
                total_ctm.extend(ctm_for_cur_utt)
                ctms[utt_index + 1] = []
                continue

            # find a break point (a line in the CTM) for the current utterance
            # i.e. the first line that has more than half of it outside
            # the first half of the overlap region.
            # Note: This line will not be included in the output CTM, which is
            # only upto the line before this.
            try:
                index = next(
                    (i for i, line in enumerate(ctm_for_cur_utt)
                     if (line[2] + line[3] / 2.0
                         > window_length - overlap / 2.0)))
            except StopIteration:
                # It is possible for such a word to not exist, e.g the last
                # word in the CTM is longer than overlap length and starts
                # before the beginning of the overlap.
                # or the last word ends before the middle of the overlap.
                index = len(ctm_for_cur_utt)

            # Ignore the hypotheses beyond this midpoint. They will be
            # considered as part of the next segment.
            total_ctm.extend(ctm_for_cur_utt[:index])

            # Find a break point (a line in the CTM) for the next utterance
            # i.e. the first line that has more than half of it outside
            # the first half of the overlap region.
            try:
                index = next(
                    (i for i, line in enumerate(ctm_for_next_utt)
                     if line[2] + line[3] / 2.0 > overlap / 2.0))
            except StopIteration:
                # This can happen if there is no word hypothesized after
                # half the overlap region.
                ctms[utt_index + 1] = []
                continue

            if index > 0:
                # Update the ctm_for_next_utt to include only the lines
                # starting from index.
                ctms[utt_index + 1] = ctm_for_next_utt[index:]
            # else leave the ctm as is.
        except Exception:
            logger.error("Could not resolve overlaps between CTMs for "
                         "%s and %s", cur_utt, next_utt)
            logger.error("Current CTM:")
            for line in ctm_for_cur_utt:
                logger.error(ctm_line_to_string(line))
            logger.error("Next CTM:")
            for line in ctm_for_next_utt:
                logger.error(ctm_line_to_string(line))
            raise

    # merge the last ctm entirely
    total_ctm.extend(ctms[-1])

    return total_ctm


def ctm_line_to_string(line):
    """Render one CTM entry as a space-separated line of text.

    The first four elements are written as they are; everything from the
    fifth element on (hypothesized word, optional confidence, ...) must be
    strings and is joined with single spaces.
    """
    trailing_fields = " ".join(line[4:])
    return "%s %s %s %s %s" % (line[0], line[1], line[2], line[3],
                               trailing_fields)


def write_ctm(ctm_lines, out_file):
    """Print every CTM entry of 'ctm_lines' to 'out_file', one per line."""
    rendered_lines = (ctm_line_to_string(entry) for entry in ctm_lines)
    for text in rendered_lines:
        print(text, file=out_file)


def run(args):
    """Read the segments and the CTM, resolve overlaps, write the new CTM.

    For every recording in the segments file, the CTMs of its utterances
    are collected in order of utterance start time, overlaps between
    consecutive utterances are resolved and the result is written to
    args.ctm_out.  Recordings without any CTM lines are skipped.

    Arguments:
        args - the namespace returned by get_args(); uses the open files
            args.segments, args.ctm_in and args.ctm_out.
    """
    segments, reco2utt = read_segments(args.segments)
    ctms = read_ctm(args.ctm_in, segments)

    # 'ctms' is keyed by (recording, utterance), so its length is a number
    # of utterances; count the recordings actually written separately.
    num_recos_written = 0
    for reco, utts in reco2utt.items():
        ctms_for_reco = []
        for utt in sorted(utts, key=lambda x: segments[x][1]):
            if (reco, utt) in ctms:
                ctms_for_reco.append(ctms[(reco, utt)])
        if len(ctms_for_reco) == 0:
            logger.info("CTM for recording %s was empty", reco)
            continue
        try:
            # Process CTMs in the recordings
            ctms_for_reco = resolve_overlaps(ctms_for_reco, segments)
            write_ctm(ctms_for_reco, args.ctm_out)
        except Exception:
            logger.error("Failed to process CTM for recording %s",
                         reco)
            raise
        num_recos_written += 1
    args.ctm_out.close()
    logger.info("Wrote CTM for %d recordings.", num_recos_written)


def main():
    """Parse the arguments and call run().

    Any error raised by run() is logged with its traceback and turned into
    exit status 1.  The three files opened by the argument parser are
    closed in all cases.
    """
    args = get_args()
    try:
        run(args)
    except Exception:
        # Exception rather than a bare except, so that KeyboardInterrupt
        # and SystemExit are not reported as processing failures.
        logger.error("Failed to resolve overlaps", exc_info=True)
        raise SystemExit(1)
    finally:
        # 'args' is always bound at this point because get_args() is called
        # before the try block.
        try:
            for f in [args.segments, args.ctm_in, args.ctm_out]:
                if f is not None:
                    f.close()
        except IOError:
            logger.error("Could not close some files. "
                         "Disk error or broken pipes?")
            raise


if __name__ == "__main__":
    main()


================================================
FILE: egs/utils/data/combine_short_segments.sh
================================================
#!/usr/bin/env bash

# Copyright 2013  Johns Hopkins University (author: Daniel Povey)
# Apache 2.0

# This script copies and modifies a data directory while combining
# segments whose duration is lower than a specified minimum segment
# length.
#
# Note: this does not work for the wav.scp, since there is no natural way to
# concatenate segments; you have to operate on directories that already have
# features extracted.

#


# begin configuration section
cleanup=true
speaker_only=false  # If true, utterances are only combined from the same speaker.
                    # It may be useful for the speaker recognition task.
                    # If false, utterances are preferentially combined from the same speaker,
                    # and then combined across different speakers.
# end configuration section


. utils/parse_options.sh

if [ $# != 3 ]; then
  echo "Usage: "
  echo "  $0 [options] <srcdir> <min-segment-length-in-seconds> <dir>"
  echo "e.g.:"
  echo " $0 data/train 1.55 data/train_comb"
  echo " Options:"
  echo "  --speaker-only <true|false>  # options to internal/choose_utts_to_combine.py, default false."
  exit 1;
fi


export LC_ALL=C

srcdir=$1
min_seg_len=$2
dir=$3

if [ "$dir" == "$srcdir" ]; then
  echo "$0: this script requires <srcdir> and <dir> to be different."
  exit 1
fi

for f in $srcdir/utt2spk $srcdir/feats.scp; do
  [ ! -s $f ] && echo "$0: expected file $f to exist and be nonempty" && exit 1
done

# Every feats.scp entry must be exactly "<utt-id> <rxfilename>".  Entries with
# more fields (e.g. piped commands) cannot be wrapped in concat-feats below,
# so this is a fatal error.
if ! awk '{if (NF != 2) exit(1);}' <$srcdir/feats.scp; then
  echo "$0: could not combine short segments because $srcdir/feats.scp has "
  echo " entries with too many fields"
  exit 1
fi

if ! mkdir -p $dir; then
  echo "$0: could not create directory $dir"
  exit 1;
fi

if ! utils/validate_data_dir.sh --no-text $srcdir; then
  echo "$0: failed to validate input directory $srcdir.  If needed, run   utils/fix_data_dir.sh $srcdir"
  exit 1
fi

# The minimum length must parse as a number strictly between 0 and 100.
# python3 is used here because the rest of this script already requires it.
if ! python3 -c "x=float('$min_seg_len'); assert(x>0.0 and x<100.0);" 2>/dev/null; then
  echo "$0: bad <min-segment-length-in-seconds>: got '$min_seg_len'"
  exit 1
fi

set -e
set -o pipefail

# make sure $srcdir/utt2dur exists.
utils/data/get_utt2dur.sh $srcdir

# Decide which utterances to combine; this writes $dir/utt2utts (each line is
# a new utterance-id followed by the original utterance-ids it is made of),
# $dir/utt2spk and $dir/utt2dur.
utils/data/internal/choose_utts_to_combine.py --min-duration=$min_seg_len \
  --merge-within-speakers-only=$speaker_only \
  $srcdir/spk2utt $srcdir/utt2dur $dir/utt2utts $dir/utt2spk $dir/utt2dur

utils/utt2spk_to_spk2utt.pl < $dir/utt2spk > $dir/spk2utt

# create the feats.scp.
# if a line of utt2utts is like 'utt2-comb2 utt2 utt3', then
# the utils/apply_map.pl will create a line that looks like
# 'utt2-comb2 foo.ark:4315 foo.ark:431423'
# and the awk command creates suitable command lines like:
# 'utt2-comb2 concat-feats --print-args=false foo.ark:4315 foo.ark:431423 - |'
# (lines with a single original utterance are passed through unchanged).
utils/apply_map.pl -f 2- $srcdir/feats.scp <$dir/utt2utts | \
  awk '{if (NF<=2){print;} else { $1 = $1 " concat-feats --print-args=false"; $NF = $NF " - |"; print; }}' > $dir/feats.scp

# create $dir/text by concatenating the source 'text' entries for the original
# utts.
if [ -f $srcdir/text ]; then
  utils/apply_map.pl -f 2- $srcdir/text <$dir/utt2utts > $dir/text
fi

if [ -f $srcdir/utt2uniq ]; then
  # the utt2uniq file is such that if 2 utts were derived from the same original
  # utt (e.g. by speed perturbing) they map to the same 'uniq' value.  This is
  # so that we can properly hold out validation data for neural net training and
  # know that we're not training on perturbed versions of that utterance.  We
  # need to obtain the utt2uniq file so that if any 2 'new' utts contain any of
  # the same 'old' utts, their 'uniq' values are the same [but otherwise as far
  # as possible, the 'uniq' values are different.]
  #
  # we'll do this by arranging the old 'uniq' values into groups as necessary to
  # capture this property.

  # The following command creates 'uniq_sets', each line of which contains
  # a set of original 'uniq' values, and effectively we assert that they must
  # be grouped together to the same 'uniq' value.
  # the first awk command prints a group of the original utterance-ids that
  # are combined together into a single new utterance, and the apply_map
  # command converts those into a list of original 'uniq' values.
  awk '{$1 = ""; print;}' < $dir/utt2utts | \
    utils/apply_map.pl $srcdir/utt2uniq > $dir/uniq_sets

  # The next command creates $dir/uniq_to_orig_uniq, which is a map from the
  # original 'uniq' values to the 'merged' uniq values.
  # for example, if $dir/uniq_sets were to contain
  # a b
  # b c
  # d
  # then we'd obtain a uniq_to_orig_uniq file that looks like:
  # a a
  # b a
  # c a
  # d d
  # ... because a and b appear together, and b and c appear together,
  # they have to be merged into the same set, and we name that set 'a'
  # (in general, we take the lowest string in lexicographical order).
  # The inline python script repeatedly propagates the smallest label across
  # every pair that must be equal, until nothing changes any more.

  cat $dir/uniq_sets | LC_ALL=C python3 -c '
import sys;
from collections import defaultdict
uniq2orig_uniq = dict()
equal_pairs = set()  # set of 2-tuples (a,b) which should have equal orig_uniq
while True:
    line = sys.stdin.readline()
    if line == "": break
    split_line = line.split() # list of uniq strings that should map in same set
    # initialize uniq2orig_uniq to the identity mapping
    for uniq in split_line: uniq2orig_uniq[uniq] = uniq
    for a in split_line[1:]: equal_pairs.add((split_line[0], a))

changed = True
while changed:
    changed = False
    for a,b in equal_pairs:
         min_orig_uniq = min(uniq2orig_uniq[a], uniq2orig_uniq[b])
         for x in [a,b]:
             if uniq2orig_uniq[x] != min_orig_uniq:
                 uniq2orig_uniq[x] = min_orig_uniq
                 changed = True

for uniq in sorted(uniq2orig_uniq.keys()):
    print(uniq, uniq2orig_uniq[uniq])
' > $dir/uniq_to_orig_uniq
  rm $dir/uniq_sets


  # In the following command, suppose we have a line like:
  # utt1-comb2 utt1 utt2
  # .. the first awk command retains only the first original utt, to give
  # utt1-comb2 utt1
  # [we can pick one arbitrarily since we know any of them would map to the same
  # orig_uniq value.]
  # the first apply_map.pl command maps the 'utt1' to the 'uniq' value it mapped to
  # in $srcdir, and the second apply_map.pl command maps it to the grouped 'uniq'
  # value obtained by the inline python script above.
  awk '{print $1, $2}' < $dir/utt2utts | utils/apply_map.pl -f 2 $srcdir/utt2uniq | \
    utils/apply_map.pl -f 2 $dir/uniq_to_orig_uniq > $dir/utt2uniq
  rm $dir/uniq_to_orig_uniq
fi

# note: the user will have to recompute the cmvn, as the speakers may have changed.
rm $dir/cmvn.scp 2>/dev/null || true

utils/validate_data_dir.sh --no-text --no-wav $dir

# utt2utts is only an intermediate file; keep it with --cleanup false.
if $cleanup; then
  rm $dir/utt2utts
fi


================================================
FILE: egs/utils/data/convert_data_dir_to_whole.sh
================================================
#! /bin/bash

# Copyright 2016-2018  Vimal Manohar
# Apache 2.0

# This scripts converts a data directory into a "whole" data directory
# by removing the segments and using the recordings themselves as 
# utterances

set -o pipefail

. ./path.sh

. utils/parse_options.sh

if [ $# -ne 2 ]; then
  echo "Usage: convert_data_dir_to_whole.sh <in-data> <out-data>"
  echo " e.g.: convert_data_dir_to_whole.sh data/dev data/dev_whole"
  exit 1
fi

data=$1
dir=$2

# A data directory without a segments file is simply copied.
if [ ! -f $data/segments ]; then
  echo "$0: Data directory already does not contain segments. So just copying it."
  utils/copy_data_dir.sh $data $dir
  exit 0
fi

# The recording-level files carry over unchanged.
mkdir -p $dir
cp $data/wav.scp $dir
if [ -f $data/reco2file_and_channel ]; then
  cp $data/reco2file_and_channel $dir
fi

# Files already present in the output directory that are about to be
# regenerated (or would no longer match) are moved to .backup.
mkdir -p $dir/.backup
for stale_file in feats.scp cmvn.scp utt2spk; do
  if [ -f $dir/$stale_file ]; then
    mv $dir/$stale_file $dir/.backup
  fi
done

for scoring_file in stm glm; do
  if [ -f $data/$scoring_file ]; then
    cp $data/$scoring_file $dir
  fi
done

# Writes the new utt2spk, plus a temporary map from each recording to its
# utterances, which is used below to build the text of each recording.
utils/data/internal/combine_segments_to_recording.py \
  --write-reco2utt=$dir/reco2sorted_utts $data/segments $dir/utt2spk || exit 1

if [ -f $data/text ]; then
  utils/apply_map.pl -f 2- $data/text < $dir/reco2sorted_utts > $dir/text || exit 1
fi

rm $dir/reco2sorted_utts

utils/fix_data_dir.sh $dir || exit 1

exit 0


================================================
FILE: egs/utils/data/extend_segment_times.py
================================================
#!/usr/bin/env python

from __future__ import print_function
import sys
import argparse
from collections import defaultdict


# Command-line interface: three float paddings (all defaulting to 0.1 s)
# plus a 'true'/'false' switch controlling overlap repair.
parser = argparse.ArgumentParser(description="""
 Usage: extend_segment_times.py [options] <input-segments >output-segments
 This program pads the times in a 'segments' file (e.g. data/train/segments)
 with specified left and right context (for cases where there was no
 silence padding in the original segments file)""")

for _flag, _help in (
        ("--start-padding",
         "Amount of padding, in seconds, for the start time of "
         "each segment (start times <0 will be set to zero)."),
        ("--end-padding",
         "Amount of padding, in seconds, for the end time of "
         "each segment."),
        ("--last-segment-end-padding",
         "Amount of padding, in seconds, for the end time of "
         "the last segment of each file (maximum allowed).")):
    parser.add_argument(_flag, type=float, default=0.1, help=_help)

parser.add_argument("--fix-overlapping-segments",
                    type=str,
                    choices=['true', 'false'],
                    default='true',
                    help="If true, prevent segments from overlapping as a result "
                    "of the padding (or that were already overlapping)")

args = parser.parse_args()


# the input file will be a sequence of lines which are each of the form:
# <utterance-id> <recording-id> <start-time> <end-time>
# e.g.
# utt-1 recording-1 0.62 5.40
# The output will be in the same format and in the same
# order, except with modified times.

# This variable maps from a recording-id to a list of the utterance
# indexes (as integer indexes into 'entries')
# that are part of that recording.
recording_to_utt_indexes = defaultdict(list)

# This is an array of the entries in the segments file, in the format:
# [utterance-id as a string, recording-id as a string,
#  start-time as float, end-time as float]
# The entries are lists (not tuples) because the times are modified in place.
entries = []


# Read the segments file from stdin; any line that does not consist of
# exactly four fields with numeric times aborts the program.
for line in sys.stdin:
    try:
        utt_id, recording_id, start_time, end_time = line.split()
        start_time = float(start_time)
        end_time = float(end_time)
    except ValueError:
        # Raised both for a wrong number of fields and for non-numeric times.
        # (A bare 'except' would also swallow KeyboardInterrupt.)
        sys.exit("extend_segment_times.py: could not interpret line: " + line)
    if not end_time > start_time:
        # Only a warning: the entry is still recorded so that the output keeps
        # the input order; it is dropped when printing if it is still invalid
        # after padding.
        print("extend_segment_times.py: bad segment (ignoring): " + line,
              file=sys.stderr)
    recording_to_utt_indexes[recording_id].append(len(entries))
    entries.append([utt_id, recording_id, start_time, end_time])

# Number of adjacent segment pairs whose boundary was moved to remove overlap.
num_times_fixed = 0

for utt_indexes in recording_to_utt_indexes.values():
    # The segments of this recording, ordered by mid-time.  These are the very
    # same list objects that live in 'entries', so editing them here edits the
    # entries that get printed later.
    by_midpoint = sorted((entries[i] for i in utt_indexes),
                         key=lambda seg: 0.5 * (seg[2] + seg[3]))

    # No padded end time may exceed the latest original end time plus the
    # padding allowed for the last segment.
    latest_end = (max(seg[3] for seg in by_midpoint)
                  + args.last_segment_end_padding)

    # Step 1: pad both ends, clamping the start at zero.
    for seg in by_midpoint:
        seg[2] = max(0, seg[2] - args.start_padding)
        seg[3] = min(latest_end, seg[3] + args.end_padding)

    # Step 2: where neighbours now overlap, meet in the middle.
    for cur, nxt in zip(by_midpoint, by_midpoint[1:]):
        if cur[3] > nxt[2] and args.fix_overlapping_segments == 'true':
            boundary = 0.5 * (cur[3] + nxt[2])
            cur[3] = boundary
            nxt[2] = boundary
            num_times_fixed += 1


# this prints a number with a certain number of digits after
# the point, while removing trailing zeros.
def FloatToString(f):
    """Format f with about 6 digits after the decimal point, no trailing zeros.

    '%g' counts significant digits, so one extra digit of precision is added
    for every factor of ten by which abs(f) exceeds 1.
    """
    precision = 6  # digits wanted after the decimal point
    scaled = f
    while abs(scaled) > 1.0:
        scaled *= 0.1
        precision += 1
    return '%.*g' % (precision, f)

# Print the modified segments in the original input order, skipping any
# segment that is still empty or inverted after padding.
for entry in entries:
    utt_id, recording_id, start_time, end_time = entry
    if not start_time < end_time:
        # The times are floats, so they must be converted before joining;
        # joining the raw entry would raise TypeError.
        print("extend_segment_times.py: bad segment after processing (ignoring): " +
              ' '.join(str(field) for field in entry), file=sys.stderr)
        continue
    print(utt_id, recording_id, FloatToString(start_time), FloatToString(end_time))


print("extend_segment_times.py: extended {0} segments; fixed {1} "
      "overlapping segments".format(len(entries), num_times_fixed),
      file=sys.stderr)

## test:
#  (echo utt1 reco1 0.2 6.2; echo utt2 reco1 6.3 9.8 )| extend_segment_times.py
# and also try the above with the options --last-segment-end-padding=0.0 --fix-overlapping-segments=false


================================================
FILE: egs/utils/data/extract_wav_segments_data_dir.sh
================================================
#!/usr/bin/env bash

# Copyright    2017  Hossein Hadian
# Apache 2.0

# This script copies a data directory (which has a 'segments' file), extracting
# wav segments (according to the 'segments' file)
# so that the resulting data directory does not have a 'segments' file anymore.

# Defaults; both can be overridden on the command line via parse_options.sh.
nj=4
cmd=run.pl

. ./utils/parse_options.sh
. ./path.sh

if [ $# != 2 ]; then
  {
    echo "Usage: $0 <srcdir> <destdir>"
    echo " This script copies data directory <srcdir> to <destdir> and removes"
    echo " the 'segments' file by extracting the wav segments."
    echo "Options: "
    echo "  --nj <nj>                                        # number of parallel jobs"
    echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  }
  exit 1;
fi


export LC_ALL=C

srcdir=$1
dir=$2
logdir=$dir/log

mkdir -p $dir/data || {
  echo "$0: failed to create directory $dir/data"
  exit 1
}
mkdir -p $logdir

# From here on any failing command aborts the script.
set -eu -o pipefail
utils/copy_data_dir.sh $srcdir $dir

# One segments sub-file per parallel job.
split_segments=$(for n in $(seq $nj); do echo $logdir/segments.$n; done)

utils/split_scp.pl $srcdir/segments $split_segments

# Each job cuts its share of the segments out of the source recordings.
$cmd JOB=1:$nj $logdir/extract_wav_segments.JOB.log \
     extract-segments scp,p:$srcdir/wav.scp $logdir/segments.JOB \
     ark,scp:$dir/data/wav_segments.JOB.ark,$dir/data/wav_segments.JOB.scp

# concatenate the .scp files together.
for n in $(seq $nj); do
  cat $dir/data/wav_segments.$n.scp
done > $dir/data/wav_segments.scp

# The new wav.scp reads each extracted segment back through wav-copy.
awk '{ print $1 " wav-copy " $2 " - |" }' $dir/data/wav_segments.scp >$dir/wav.scp

# The copied directory is now indexed by utterance, so these no longer apply.
rm $dir/segments $dir/reco2file_and_channel 2>/dev/null || true


================================================
FILE: egs/utils/data/fix_subsegment_feats.pl
================================================
#!/usr/bin/env perl

# Copyright 2016  Vimal Manohar
# Apache 2.0.

use warnings;

# This script reads from stdin a feats.scp file that contains frame ranges and
# ensures that they don't exceed the maximum number of frames supplied in the
# <utt2max-frames> file. 
# <utt2max-frames> is usually computed using get_utt2num_frames.sh on the 
# original directory which will be segmented using 
# utils/data/subsegment_data_dir.sh.
# 
# e.g. feats.scp
# utt_foo-1 foo-bar.ark:514231[721:892]
# 
# utt2max-frames
# utt_foo-1 891
# 
# fixed_feats.scp
# utt_foo-1 foo-bar.ark:514231[721:890]
# 
# Note: Here 891 is the number of frames in the archive foo-bar.ark
# The frame end for utt_foo-1, i.e. 892 (0-indexed) exceeds the archive size
# (891) by two frames. This script fixes that line by truncating the range 
# to 890.

if (scalar @ARGV != 1) {
  my $usage = <<END;
This script reads from stdin a feats.scp file that contains frame ranges and
ensures that they don't exceed the maximum number of frames supplied in the
<utt2max-frames> file. 

Usage: $0 <utt2max-frames> < feats.scp > fixed_feats.scp
END
  die "$usage";
}

my $utt2max_frames_file = $ARGV[0];

open MAX_FRAMES, $utt2max_frames_file or die "$0: Could not open file $utt2max_frames_file";

# Maps utterance-id -> number of frames available for that utterance.
my %utt2max_frames;

while (<MAX_FRAMES>) {
  chomp;
  my @F = split;
  
  (scalar @F == 2) or die "$0: Invalid line $_ in $utt2max_frames_file";

  $utt2max_frames{$F[0]} = $F[1];
}
close(MAX_FRAMES);

while (<STDIN>) {
  my $line = $_;
  
  #if (m/\[([^][]*)\]\[([^][]*)\]\s*$/) {
  #  print STDERR ("fix_subsegment_feats.pl: this script only supports single indices");
  #  exit(1);
  #}
  
  my $before_range = "";
  my $range = "";

  # Split the line into the part before the final [...] and the range inside
  # it.  Lines without a range are passed through unchanged.
  if (m/^(.*)\[([^][]*)\]\s*$/) {
    $before_range = $1;
    $range = $2;
  } else {
    print;
    next;
  }

  my @F = split(/ /, $before_range);
  my $utt = shift @F;
  defined $utt2max_frames{$utt} or die "fix_subsegment_feats.pl: Could not find key $utt in $utt2max_frames_file.\nError with line $line";

  # The range is <row-start>:<row-end>, optionally followed by ",<col-range>".
  if ($range !~ m/^(\d*):(\d*)([,]?.*)$/) {
    print STDERR "fix_subsegment_feats.pl: could not make sense of input line $_";
    exit(1);
  }
    
  my $row_start = $1;
  my $row_end = $2;
  my $col_range = $3;
  # The third capture still carries the separating comma.  Strip it here,
  # because the comma is added back when the range is rebuilt below;
  # otherwise the output would contain "[a:b,,c:d]".
  $col_range =~ s/^,//;
  
  # A segment starting at or beyond the end of the data cannot be fixed.
  if ($row_start >= $utt2max_frames{$utt}) {
    print STDERR "Removing $utt because row_start $row_start >= file max length $utt2max_frames{$utt}\n";
    next;
  }  
  # Row indexes are 0-based, so the last valid row is max_frames - 1.
  if ($row_end >= $utt2max_frames{$utt}) {
    print STDERR "Fixed row_end for $utt from $row_end to $utt2max_frames{$utt}-1\n";
    $row_end = $utt2max_frames{$utt} - 1;
  } 
   
  if ($row_start ne "") {
    $range = "$row_start:$row_end";
  } else {
    $range = "";
  }

  if ($col_range ne "") {
    $range .= ",$col_range";
  }
  print ("$utt " . join(" ", @F) . "[" . $range . "]\n");
}


================================================
FILE: egs/utils/data/get_allowed_durations.py
================================================
#!/usr/bin/env python3

# Copyright     2017  Hossein Hadian
#               2019  Facebook Inc. (Author: Vimal Manohar)
# Apache 2.0


""" This script generates a set of allowed lengths of utterances
    spaced by a factor (like 10%). This is useful for generating
    fixed-length chunks for chain training.
"""

import argparse
import os
import sys
import copy
import math
import logging

sys.path.insert(0, 'steps')
import libs.common as common_lib

# Logging for this script goes through the 'libs' logger, with one stream
# handler that emits INFO and above in a verbose, location-tagged format.
formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - "
                              "%(funcName)s - %(levelname)s ] %(message)s")

handler = logging.StreamHandler()
handler.setFormatter(formatter)
handler.setLevel(logging.INFO)

logger = logging.getLogger('libs')
logger.addHandler(handler)
logger.setLevel(logging.INFO)

def get_args():
    """Parse the command line and return the resulting argparse namespace.

    Positional arguments are 'factor', 'data_dir' and 'dir'; the options are
    --coverage-factor, --frame-shift, --frame-length and
    --frame-subsampling-factor.
    """
    description = """
    This script creates a list of allowed durations of utterances for flatstart
    LF-MMI training corresponding to input data directory 'data_dir' and writes
    it in two files in output directory 'dir':
    1) allowed_durs.txt -- durations are in seconds
    2) allowed_lengths.txt -- lengths are in number of frames

    Both the allowed_durs.txt and allowed_lengths.txt are formatted to
    have one entry on each line. Examples are as follows:

    $ echo data/train/allowed_lengths.txt
    414
    435
    468

    $ echo data/train/allowed_durs.txt
    4.16
    4.37
    4.70

    These files can then be used by a downstream script to perturb the
    utterances to these lengths.
    A perturbed data directory (created by a downstream script
    similar to utils/data/perturb_speed_to_allowed_lengths.py)
    that only contains utterances of these allowed durations,
    along with the corresponding allowed_lengths.txt are
    consumed by the e2e chain egs preparation script.
    See steps/nnet3/chain/e2e/get_egs_e2e.sh for how these are used.

    See also:
    * egs/cifar/v1/image/get_allowed_lengths.py -- a similar script for OCR datasets
    * utils/data/perturb_speed_to_allowed_lengths.py --
        creates the allowed_lengths.txt AND perturbs the data directory
    """
    parser = argparse.ArgumentParser(description=description)

    # Positional arguments.
    parser.add_argument(
        'factor', type=float, default=12,
        help='Spacing (in percentage) between allowed lengths. '
        'Can be 0, which means all seen lengths that are a multiple of '
        'frame_subsampling_factor will be allowed.')
    parser.add_argument(
        'data_dir', type=str,
        help='path to data dir. Assumes that '
        'it contains the utt2dur file.')
    parser.add_argument(
        'dir', type=str,
        help='We write the output files '
        'allowed_lengths.txt and allowed_durs.txt to this directory.')

    # Options.
    parser.add_argument(
        '--coverage-factor', type=float, default=0.05,
        help="""Percentage of durations not covered from each
                             side of duration histogram.""")
    parser.add_argument(
        '--frame-shift', type=int, default=10,
        help="""Frame shift in milliseconds.""")
    parser.add_argument(
        '--frame-length', type=int, default=25,
        help="""Frame length in milliseconds.""")
    parser.add_argument(
        '--frame-subsampling-factor', type=int, default=3,
        help="""Chain frame subsampling factor.
                             See steps/nnet3/chain/train.py""")

    return parser.parse_args()


def read_kaldi_mapfile(path):
    """Read any Kaldi mapping file - like text, .scp files, etc.

    Each non-blank line is split at its first space into a key and a value
    (the remainder of the line).  A line that contains only a key maps to the
    empty string.  Blank lines are skipped.

    Returns
    -------
    dict mapping key (str) -> value (str)
    """
    m = {}
    with open(path, 'r', encoding='latin-1') as f:
        for line in f:
            line = line.strip(" \t\r\n")
            if not line:
                continue
            # partition() yields (line, '', '') when there is no space, so a
            # key-only line keeps its full key instead of losing its last
            # character as happened with find() returning -1.
            key, _, val = line.partition(' ')
            m[key] = val
    return m


def _first_dur_past_coverage(durs, tot_dur, coverage_factor):
    """Return the first duration at which the running sum of 'durs' exceeds
    coverage_factor percent of tot_dur, or None if that never happens."""
    to_ignore_dur = 0
    for d in durs:
        to_ignore_dur += d
        if to_ignore_dur * 100.0 / tot_dur > coverage_factor:
            return d
    return None


def find_duration_range(utt2dur, coverage_factor):
    """Given a map of utterance durations, find the start and end duration to cover

     If we try to cover
     all durations which occur in the training set, the number of
     allowed lengths could become very large.  So the shortest and the
     longest utterances, each accounting for coverage_factor percent of the
     total duration, are left out.

     Returns
     -------
     start_dur: float
     end_dur: float

     Raises
     ------
     ValueError: if utt2dur is empty, the total duration is not positive, or
         coverage_factor is so large that no duration remains.
    """
    durs = sorted(float(val) for val in utt2dur.values())
    if not durs:
        raise ValueError("utt2dur is empty: cannot determine a duration range")
    tot_dur = sum(durs)
    if tot_dur <= 0:
        raise ValueError("total duration in utt2dur must be positive, "
                         "got {}".format(tot_dur))

    start_dur = _first_dur_past_coverage(durs, tot_dur, coverage_factor)
    end_dur = _first_dur_past_coverage(reversed(durs), tot_dur,
                                       coverage_factor)
    if start_dur is None or end_dur is None:
        raise ValueError("coverage_factor {} leaves no durations to "
                         "cover".format(coverage_factor))

    if start_dur < 0.3:
        start_dur = 0.3  # a hard limit to avoid too many allowed lengths --not critical
    return start_dur, end_dur


def get_allowed_durations(start_dur, end_dur, args):
    """Generate allowed durations from start_dur up to (not including)
       end_dur, each one args.factor times the previous, and write them to
       disk.

       Two files are written in args.dir: 'allowed_durs.txt' (seconds) and
       'allowed_lengths.txt' (frames), one entry per line.  A length that is
       not a multiple of args.frame_subsampling_factor is rounded down to one,
       and its duration is recomputed from the rounded length.

     Returns
     -------
     allowed_durations: list of allowed durations (in seconds)
    """
    subsampling = args.frame_subsampling_factor
    durs_path = os.path.join(args.dir, 'allowed_durs.txt')
    lengths_path = os.path.join(args.dir, 'allowed_lengths.txt')

    allowed_durations = []
    cur_dur = start_dur
    with open(durs_path, 'w', encoding='latin-1') as durs_fp, \
            open(lengths_path, 'w', encoding='latin-1') as lengths_fp:
        while cur_dur < end_dur:
            num_frames = (int(cur_dur * 1000 - args.frame_length)
                          / args.frame_shift + 1)
            if num_frames % subsampling != 0:
                num_frames = subsampling * (num_frames // subsampling)
                # Duration that corresponds to the rounded frame count; the
                # extra half frame shift is part of the original formula.
                cur_dur = (args.frame_shift * (num_frames - 1.0)
                           + args.frame_length
                           + args.frame_shift / 2) / 1000.0
            allowed_durations.append(cur_dur)
            durs_fp.write(str(cur_dur) + "\n")
            lengths_fp.write(str(int(num_frames)) + "\n")
            cur_dur *= args.factor
    return allowed_durations


def get_trivial_allowed_durations(utt2dur, args):
    """Allow every length seen in utt2dur, rounded down to a multiple of
       args.frame_subsampling_factor, and write the result to disk.

       Two files are written in args.dir: 'allowed_durs.txt' (seconds) and
       'allowed_lengths.txt' (frames), one entry per line, sorted and without
       duplicates.

     Returns
     -------
     allowed_durations: list of allowed durations (in seconds)
    """
    subsampling = args.frame_subsampling_factor

    # Round to a multiple of the subsampling factor *before* removing
    # duplicates; several raw lengths can map to the same allowed length.
    lengths = set()
    for dur in utt2dur.values():
        num_frames = (int(float(dur) * 1000 - args.frame_length)
                      // args.frame_shift + 1)
        lengths.add(subsampling * (num_frames // subsampling))
    lengths = sorted(lengths)

    allowed_durations = []
    with open(os.path.join(args.dir, 'allowed_durs.txt'), 'w', encoding='latin-1') as durs_fp, \
           open(os.path.join(args.dir, 'allowed_lengths.txt'), 'w', encoding='latin-1') as lengths_fp:
        for length in lengths:
            # Always derive the duration from the length; computing it only
            # for lengths that needed rounding would leave it undefined (or
            # stale from the previous iteration) for all others.
            d = (args.frame_shift * (length - 1.0)
                 + args.frame_length + args.frame_shift / 2) / 1000.0
            allowed_durations.append(d)
            durs_fp.write("{}\n".format(d))
            lengths_fp.write("{}\n".format(int(length)))

    assert len(allowed_durations) > 0
    start_dur = allowed_durations[0]
    end_dur = allowed_durations[-1]

    logger.info("Durations in the range [{},{}] will be covered."
                "".format(start_dur, end_dur))
    logger.info("There will be {} unique allowed lengths "
                "for the utterances.".format(len(allowed_durations)))

    return allowed_durations


def main():
    """Read <data_dir>/utt2dur and write the allowed durations and lengths.

    A factor of 0 selects the trivial mode handled by
    get_trivial_allowed_durations(); otherwise the percentage factor is
    converted to a multiplier and the durations are spaced geometrically.
    """
    args = get_args()
    utt2dur_path = os.path.join(args.data_dir, 'utt2dur')
    utt2dur = read_kaldi_mapfile(utt2dur_path)

    if args.factor == 0.0:
        get_trivial_allowed_durations(utt2dur, args)
        return

    # From here on args.factor is a multiplier, e.g. 12 (percent) -> 1.12.
    args.factor = 1.0 + args.factor / 100.0

    start_dur, end_dur = find_duration_range(utt2dur, args.coverage_factor)

    coverage_rate = 100.0 - args.coverage_factor * 2
    logger.info("Durations in the range [{},{}] will be covered. "
                "Coverage rate: {}%".format(start_dur, end_dur, coverage_rate))

    num_lengths = int(math.log(end_dur / start_dur) / math.log(args.factor))
    logger.info("There will be {} unique allowed lengths "
                "for the utterances.".format(num_lengths))

    get_allowed_durations(start_dur, end_dur, args)


if __name__ == '__main__':
    main()


================================================
FILE: egs/utils/data/get_frame_shift.sh
================================================
#!/usr/bin/env bash

# Copyright 2016  Johns Hopkins University (author: Daniel Povey)
# Apache 2.0

# This script takes as input a data directory, such as data/train/, preferably
# with utt2dur file already existing (or the utt2dur file will be created if
# not), and it attempts to work out the approximate frame shift by comparing the
# utt2dur with the output of feat-to-len on the feats.scp.  It prints it out.
# if the shift is very close to, but above, 0.01 (the normal frame shift) it
# rounds it down.

. utils/parse_options.sh
. ./path.sh

if [ $# != 1 ]; then
  cat >&2 <<EOF
Usage: frame_shift=\$($0 <datadir>)
e.g.:  frame_shift=\$($0 data/train)

This script prints the frame-shift in seconds (e.g. 0.01) to the standard out.
Its output is intended to be captured in a shell variable.

If <datadir> does not contain the file utt2dur, this script may invoke
utils/data/get_utt2dur.sh, which will require write permission to <datadir>.
EOF
  exit 1
fi

export LC_ALL=C

dir=$1

# A previously computed value is reused as-is.
if [[ -s $dir/frame_shift ]]; then
  cat $dir/frame_shift
  exit
fi

if [ ! -f $dir/feats.scp ]; then
  echo "$0: $dir/feats.scp does not exist" 1>&2
  exit 1
fi

if [ ! -s $dir/utt2dur ]; then
  if [ ! -e $dir/wav.scp ] && [ ! -s $dir/segments ]; then
    echo "$0: neither $dir/wav.scp nor $dir/segments exist; assuming a frame shift of 0.01." 1>&2
    echo 0.01
    exit 0
  fi
  echo "$0: $dir/utt2dur does not exist: creating it" 1>&2
  utils/data/get_utt2dur.sh 1>&2 $dir || exit 1
fi

temp=$(mktemp /tmp/tmp.XXXX) || exit 1
# Remove the temporary file however the script exits, including the error
# exits below.
trap 'rm -f "$temp"' EXIT

# Frame counts of the first 10 utterances.
feat-to-len --print-args=false "scp:head -n 10 $dir/feats.scp|" ark,t:- > $temp

if [[ ! -s $temp ]]; then
  echo "$0: error running feat-to-len" 1>&2
  exit 1
fi

# Pair each duration with its frame count (this relies on utt2dur and
# feats.scp listing utterances in the same order) and divide the totals.
# A shift just above 0.01 is rounded down to 0.01.
frame_shift=$(head -n 10 $dir/utt2dur | paste - $temp | awk '
      { dur += $2; frames += $4; }
  END { if (frames <= 0) {
          print "get_frame_shift.sh: no frames found" > "/dev/stderr";
          exit 1;
        }
        shift = dur / frames;
        if (shift > 0.01 && shift < 0.0102) shift = 0.01;
        print shift; }') || exit 1;

echo $frame_shift > $dir/frame_shift
echo $frame_shift
exit 0


================================================
FILE: egs/utils/data/get_num_frames.sh
================================================
#!/usr/bin/env bash

# This script works out the approximate number of frames in a training directory.
# This is sometimes needed by higher-level scripts


if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

if [ $# -ne 1 ]; then
  (
    echo "Usage: $0 <data-dir>"
    echo "Prints the number of frames of data in the data-dir"
  ) 1>&2
  # Stop here: continuing would run the commands below with an empty $data.
  exit 1
fi

data=$1

if [ ! -f $data/utt2dur ]; then
  utils/data/get_utt2dur.sh $data 1>&2 || exit 1
fi

frame_shift=$(utils/data/get_frame_shift.sh $data) || exit 1

# Total duration divided by the frame shift, rounded to the nearest integer.
awk -v s=$frame_shift '{n += $2} END{printf("%.0f\n", (n / s))}' <$data/utt2dur


================================================
FILE: egs/utils/data/get_reco2dur.sh
================================================
#!/usr/bin/env bash

# Copyright 2016  Johns Hopkins University (author: Daniel Povey)
#           2018  Andrea Carmantini
# Apache 2.0

# This script operates on a data directory, such as in data/train/, and adds the
# reco2dur file if it does not already exist.  The file 'reco2dur' maps from
# recording to the duration of the recording in seconds.  This script works it
# out from the 'wav.scp' file, or, if utterance-ids are the same as recording-ids, from the
# utt2dur file (it first tries interrogating the headers, and if this fails, it reads the wave
# files in entirely.)
# We could use durations from segments file, but that's not the duration of the recordings
# but the sum of utterance lengths (silence in between could be excluded from segments)
# For sum of utterance lengths:
# awk 'FNR==NR{uttdur[$1]=$2;next}
# { for(i=2;i<=NF;i++){dur+=uttdur[$i];}
#   print $1 FS dur; dur=0  }'  $data/utt2dur $data/reco2utt


frame_shift=0.01
cmd=run.pl
nj=48

. utils/parse_options.sh
. ./path.sh

if [ $# != 1 ]; then
  echo "Usage: $0 [options] <datadir>"
  echo "e.g.:"
  echo " $0 data/train"
  echo " Options:"
  echo " --frame-shift      # frame shift in seconds. Only relevant when we are"
  echo "                    # getting duration from feats.scp (default: 0.01). "
  exit 1
fi

export LC_ALL=C

data=$1


# Nothing to do if reco2dur already has one line per recording.
if [ -s $data/reco2dur ] && \
  [ $(wc -l < $data/wav.scp) -eq $(wc -l < $data/reco2dur) ]; then
  echo "$0: $data/reco2dur already exists with the expected length.  We won't recompute it."
  exit 0;
fi

if [ -s $data/utt2dur ] && \
   [ $(wc -l < $data/utt2spk) -eq $(wc -l < $data/utt2dur) ] && \
   [ ! -s $data/segments ]; then

  # Without a segments file, utterances and recordings coincide.
  echo "$0: $data/wav.scp indexed by utt-id; copying utt2dur to reco2dur"
  cp $data/utt2dur $data/reco2dur && exit 0;

elif [ -f $data/wav.scp ]; then
  echo "$0: obtaining durations from recordings"

  # if the wav.scp contains only lines of the form
  # utt1  /foo/bar/sph2pipe -f wav /baz/foo.sph |
  # then the durations are read directly from the sphere-file headers.
  if cat $data/wav.scp | perl -e '
     while (<>) { s/\|\s*$/ |/;  # make sure final | is preceded by space.
             @A = split; if (!($#A == 5 && $A[1] =~ m/sph2pipe$/ &&
                               $A[2] eq "-f" && $A[3] eq "wav" && $A[5] eq "|")) { exit(1); }
             $reco = $A[0]; $sphere_file = $A[4];

             if (!open(F, "<$sphere_file")) { die "Error opening sphere file $sphere_file"; }
             $sample_rate = -1;  $sample_count = -1;
             for ($n = 0; $n <= 30; $n++) {
                $line = <F>;
                if ($line =~ m/sample_rate -i (\d+)/) { $sample_rate = $1; }
                if ($line =~ m/sample_count -i (\d+)/) { $sample_count = $1; }
                # "last" is the Perl statement that leaves the loop.
                if ($line =~ m/end_head/) { last; }
             }
             close(F);
             if ($sample_rate == -1 || $sample_count == -1) {
               die "could not parse sphere header from $sphere_file";
             }
             $duration = $sample_count * 1.0 / $sample_rate;
             print "$reco $duration\n";
     } ' > $data/reco2dur; then
    echo "$0: successfully obtained recording lengths from sphere-file headers"
  else
    echo "$0: could not get recording lengths from sphere-file headers, using wav-to-duration"
    if ! command -v wav-to-duration >/dev/null; then
      echo  "$0: wav-to-duration is not on your path"
      exit 1;
    fi

    read_entire_file=false
    if grep -q 'sox.*speed' $data/wav.scp; then
      read_entire_file=true
      echo "$0: reading from the entire wav file to fix the problem caused by sox commands with speed perturbation. It is going to be slow."
      echo "... It is much faster if you call get_reco2dur.sh *before* doing the speed perturbation via e.g. perturb_data_dir_speed.sh or "
      echo "... perturb_data_dir_speed_3way.sh."
    fi

    # Never use more jobs than there are recordings.
    num_recos=$(wc -l <$data/wav.scp)
    if [ $nj -gt $num_recos ]; then
      nj=$num_recos
    fi

    temp_data_dir=$data/wav${nj}split
    wavscps=$(for n in `seq $nj`; do echo $temp_data_dir/$n/wav.scp; done)
    subdirs=$(for n in `seq $nj`; do echo $temp_data_dir/$n; done)

    # Fall back to creating the directories one at a time if creating them
    # all in a single call fails.
    if ! mkdir -p $subdirs >&/dev/null; then
      for n in `seq $nj`; do
        mkdir -p $temp_data_dir/$n
      done
    fi

    utils/split_scp.pl $data/wav.scp $wavscps


    $cmd JOB=1:$nj $data/log/get_reco_durations.JOB.log \
      wav-to-duration --read-entire-file=$read_entire_file \
      scp:$temp_data_dir/JOB/wav.scp ark,t:$temp_data_dir/JOB/reco2dur || \
        { echo "$0: there was a problem getting the durations"; exit 1; }

    for n in `seq $nj`; do
      cat $temp_data_dir/$n/reco2dur
    done > $data/reco2dur

    # The temporary split directory only exists on this code path, so it is
    # removed here rather than after the enclosing if/else.
    rm -r $temp_data_dir
  fi
else
  echo "$0: Expected $data/wav.scp to exist"
  exit 1
fi

# Tolerate a few missing durations, but fail if fewer than half of the
# recordings got one.
len1=$(wc -l < $data/wav.scp)
len2=$(wc -l < $data/reco2dur)
if [ "$len1" != "$len2" ]; then
  echo "$0: warning: length of reco2dur does not equal that of wav.scp, $len2 != $len1"
  if [ $len1 -gt $[$len2*2] ]; then
    echo "$0: less than half of recordings got a duration: failing."
    exit 1
  fi
fi

echo "$0: computed $data/reco2dur"

exit 0


================================================
FILE: egs/utils/data/get_reco2utt_for_data.sh
================================================
#! /bin/bash

# Copyright 2016  Vimal Manohar
# Apache 2.0

if [ $# -ne 1 ]; then
  echo "This script outputs a mapping from recording to a list of utterances "
  echo "corresponding to the recording. It is analogous to the content of "
  echo "a spk2utt file, but is indexed by recording instead of speaker."
  echo "Usage: get_reco2utt.sh <data>"
  echo " e.g.: get_reco2utt.sh data/train"
  exit 1
fi

data=$1

# Create a default segments file (one segment per recording) if none exists.
# On failure, remove the partial file so a later run does not see an empty
# segments file, and report the error instead of printing an empty mapping.
if [ ! -s $data/segments ]; then
  if ! utils/data/get_segments_for_data.sh $data > $data/segments; then
    echo "$0: failed to create $data/segments" 1>&2
    rm -f $data/segments
    exit 1
  fi
fi

# Columns 1,2 of segments are <utterance-id> <recording-id>, which has the
# same shape as utt2spk, so the utt2spk -> spk2utt converter can invert it.
cut -d ' ' -f 1,2 $data/segments | utils/utt2spk_to_spk2utt.pl


================================================
FILE: egs/utils/data/get_segments_for_data.sh
================================================
#!/usr/bin/env bash

# This script operates on a data directory, such as in data/train/,
# and writes new segments to stdout. The file 'segments' maps from
# utterance to time offsets into a recording, with the format:
#   <utterance-id> <recording-id> <segment-begin> <segment-end>
# This script assumes utterance and recording ids are the same (i.e., that
# wav.scp is indexed by utterance), and uses durations from 'utt2dur', 
# created if necessary by get_utt2dur.sh.

. ./path.sh

if [ $# != 1 ]; then
  cat <<EOF
Usage: $0 [options] <datadir>
e.g.:
 $0 data/train > data/train/segments
EOF
  exit 1
fi

data=$1

# Durations are required; compute them if missing or empty.  Messages from
# the helper go to stderr so that stdout carries only the segments.
[ -s $data/utt2dur ] || { utils/data/get_utt2dur.sh $data 1>&2 || exit 1; }

# Each utterance becomes one segment spanning its whole recording:
# <utt-id> <utt-id> 0 <utt-dur>
awk '{ printf("%s %s 0 %s\n", $1, $1, $2) }' $data/utt2dur

exit 0


================================================
FILE: egs/utils/data/get_uniform_subsegments.py
================================================
#! /usr/bin/env python

# Copyright 2017  Vimal Manohar
#           2017  Matthew Maciejewski
# Apache 2.0.

from __future__ import print_function
import argparse
import logging
import sys
import textwrap

def _str_to_bool(value):
    """Convert a command-line string such as 'true' or 'False' to a bool.

    argparse's type=bool treats every non-empty string, including 'false',
    as True; this parser maps the usual spellings explicitly and rejects
    anything else.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ('true', 't', 'yes', 'y', '1'):
        return True
    if lowered in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value (true/false), got '{0}'".format(value))


def get_args():
    """Parse the command line and return the resulting argparse namespace.

    The namespace holds max_segment_duration, overlap_duration,
    max_remaining_duration (floats, in seconds), constant_duration (bool) and
    segments_file (an open, readable file object).
    """
    parser = argparse.ArgumentParser(
        description=textwrap.dedent("""
        Creates a subsegments file from an input segments file.

        The output format is

        <subsegment-id> <utterance-id> <start-time> <end-time>,

        where the timings are relative to the start-time of the
         <utterance-id> in the input segments file.
        Reminder: the format of the input segments file is:

         <utterance-id> <recording-id> <start-time> <end-time>

        where the recording-id corresponds to a wav file (or a channel of
        a wav file) from wav.scp.  Note: you can use
        utils/data/get_segments_for_data.sh to generate a 'default'
        segments file for your data if one doesn't already exist.

        e.g.: get_uniform_subsegments.py data/dev/segments > \\
                data/dev_uniform_segments/sub_segments

        utils/data/subsegment_data_dir.sh data/dev \\
            data/dev_uniform_segments/sub_segments data/dev_uniform_segments

        The output is written to stdout. The resulting file can be
        passed to utils/data/subsegment_data_dir.sh to sub-segment
        the data directory."""),
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("--max-segment-duration", type=float,
                        default=30, help="""Maximum duration of the
                        subsegments (in seconds)""")
    parser.add_argument("--overlap-duration", type=float,
                        default=5, help="""Overlap between
                        adjacent segments (in seconds)""")
    parser.add_argument("--max-remaining-duration", type=float,
                        default=10, help="""Segment is not split
                        if the left-over duration is more than this
                        many seconds""")
    # A real boolean parser is used so that "--constant-duration false"
    # yields False.
    parser.add_argument("--constant-duration", type=_str_to_bool,
                        default=False, help="""Final segment is given
                        a start time max-segment-duration before the
                        end to force a constant segment duration. This
                        overrides the max-remaining-duration parameter""")
    parser.add_argument("segments_file", type=argparse.FileType('r'),
                        help="""Input kaldi segments file""")

    args = parser.parse_args()
    return args


def run(args):
    """Print uniform subsegments for every line of args.segments_file.

    Each input segment longer than the threshold is cut into windows of
    args.max_segment_duration seconds whose starts are spaced
    (max_segment_duration - overlap_duration) apart; a final subsegment
    covers what is left.  Output lines have the form
    "<subsegment-id> <utterance-id> <start> <end>" with times relative to the
    start of the input segment.
    """
    window = args.max_segment_duration
    hop = window - args.overlap_duration
    dur_threshold = (window if args.constant_duration
                     else window + args.max_remaining_duration)

    def emit(utt_id, start_cs, end_cs, rel_start, rel_end):
        # The id embeds start and end in hundredths of a second.
        new_utt = "%s-%08d-%08d" % (utt_id, start_cs, end_cs)
        print("%s %s %.3f %.3f" % (new_utt, utt_id, rel_start, rel_end))

    for line in args.segments_file:
        fields = line.strip().split()
        utt_id = fields[0]
        start_time = float(fields[2])
        end_time = float(fields[3])

        remaining = end_time - start_time
        start = start_time

        # Full-length windows, for as long as enough audio remains.
        while remaining > dur_threshold:
            end = start + window
            rel_start = start - start_time
            rel_end = end - start_time
            emit(utt_id, int(100 * rel_start), int(100 * rel_end),
                 rel_start, rel_start + window)
            start += hop
            remaining -= hop

        # Final subsegment.
        if args.constant_duration:
            if remaining < 0:
                continue
            if remaining < args.max_remaining_duration:
                # Pull the start back so the last window keeps full length
                # where the input segment is long enough.
                start = max(end_time - window, start_time)
            end = min(start + window, end_time)
        else:
            end = end_time
        emit(utt_id,
             int(round(100 * (start - start_time))),
             int(round(100 * (end - start_time))),
             start - start_time, end - start_time)


def main():
    """Parse the arguments, run the splitter and always close the input.

    Any exception raised by run() is logged with its traceback and turned
    into exit status 1.
    """
    args = get_args()
    try:
        try:
            run(args)
        except Exception:
            # logging.exception logs at ERROR level with the traceback.
            logging.exception("Failed creating subsegments")
            raise SystemExit(1)
    finally:
        args.segments_file.close()


if __name__ == '__main__':
    main()


================================================
FILE: egs/utils/data/get_utt2dur.sh
================================================
#!/usr/bin/env bash

# Copyright 2016  Johns Hopkins University (author: Daniel Povey)
# Apache 2.0

# This script operates on a data directory, such as in data/train/, and adds the
# utt2dur file if it does not already exist.  The file 'utt2dur' maps from
# utterance to the duration of the utterance in seconds.  This script works it
# out from the 'segments' file, or, if not present, from the wav.scp file (it
# first tries interrogating the headers, and if this fails, it reads the wave
# files in entirely.)

# Defaults; all three can be overridden on the command line through
# utils/parse_options.sh (--frame-shift, --cmd, --nj).
frame_shift=0.01
cmd=run.pl
nj=48

. utils/parse_options.sh
. ./path.sh

if [ $# != 1 ]; then
  echo "Usage: $0 [options] <datadir>"
  echo "e.g.:"
  echo " $0 data/train"
  echo " Options:"
  echo " --frame-shift      # frame shift in seconds. Only relevant when we are"
  echo "                    # getting duration from feats.scp, and only if the "
  echo "                    # file frame_shift does not exist (default: 0.01). "
  exit 1
fi

export LC_ALL=C

data=$1

# Nothing to do if utt2dur is already there with one line per utterance.
if [ -s $data/utt2dur ] && \
  [ $(wc -l < $data/utt2spk) -eq $(wc -l < $data/utt2dur) ]; then
  echo "$0: $data/utt2dur already exists with the expected length.  We won't recompute it."
  exit 0;
fi

# Sources are tried in this order: segments, frame_shift + utt2num_frames,
# wav.scp, feats.scp.
if [ -s $data/segments ]; then
  echo "$0: working out $data/utt2dur from $data/segments"
  awk '{len=$4-$3; print $1, len;}' < $data/segments  > $data/utt2dur
elif [[ -s $data/frame_shift && -f $data/utt2num_frames ]]; then
  echo "$0: computing $data/utt2dur from $data/{frame_shift,utt2num_frames}."
  frame_shift=$(cat $data/frame_shift) || exit 1
  # The 1.5 correction is the typical value of (frame_length-frame_shift)/frame_shift.
  awk -v fs=$frame_shift '{ $2=($2+1.5)*fs; print }' <$data/utt2num_frames  >$data/utt2dur
elif [ -f $data/wav.scp ]; then
  echo "$0: segments file does not exist so getting durations from wave files"

  # if the wav.scp contains only lines of the form
  # utt1  /foo/bar/sph2pipe -f wav /baz/foo.sph |
  # then the duration is read straight from the sphere header
  # (sample_count / sample_rate); the perl script exits non-zero on the first
  # line of any other form, which sends us to the wav-to-duration fallback.
  if perl <$data/wav.scp -e '
     while (<>) { s/\|\s*$/ |/;  # make sure final | is preceded by space.
             @A = split; if (!($#A == 5 && $A[1] =~ m/sph2pipe$/ &&
                               $A[2] eq "-f" && $A[3] eq "wav" && $A[5] eq "|")) { exit(1); }
             $utt = $A[0]; $sphere_file = $A[4];

             if (!open(F, "<$sphere_file")) { die "Error opening sphere file $sphere_file"; }
             $sample_rate = -1;  $sample_count = -1;
             for ($n = 0; $n <= 30; $n++) {
                $line = <F>;
                if ($line =~ m/sample_rate -i (\d+)/) { $sample_rate = $1; }
                if ($line =~ m/sample_count -i (\d+)/) { $sample_count = $1; }
                # stop scanning once the header terminator has been seen.
                if ($line =~ m/end_head/) { last; }
             }
             close(F);
             if ($sample_rate == -1 || $sample_count == -1) {
               die "could not parse sphere header from $sphere_file";
             }
             $duration = $sample_count * 1.0 / $sample_rate;
             print "$utt $duration\n";
     } ' > $data/utt2dur; then
    echo "$0: successfully obtained utterance lengths from sphere-file headers"
  else
    echo "$0: could not get utterance lengths from sphere-file headers, using wav-to-duration"
    if ! command -v wav-to-duration >/dev/null; then
      echo  "$0: wav-to-duration is not on your path"
      exit 1;
    fi

    # NOTE(review): read_entire_file is already true here, so the sox/speed
    # check below only adds the explanatory messages.  Confirm whether the
    # default was meant to be false (header-only reading).
    read_entire_file=true
    if grep -q 'sox.*speed' $data/wav.scp; then
      read_entire_file=true
      echo "$0: reading from the entire wav file to fix the problem caused by sox commands with speed perturbation. It is going to be slow."
      echo "... It is much faster if you call get_utt2dur.sh *before* doing the speed perturbation via e.g. perturb_data_dir_speed.sh or "
      echo "... perturb_data_dir_speed_3way.sh."
    fi


    # Never use more jobs than there are utterances.
    num_utts=$(wc -l <$data/utt2spk)
    if [ $nj -gt $num_utts ]; then
      nj=$num_utts
    fi

    utils/data/split_data.sh --per-utt $data $nj
    sdata=$data/split${nj}utt

    $cmd JOB=1:$nj $data/log/get_durations.JOB.log \
      wav-to-duration --read-entire-file=$read_entire_file \
      scp:$sdata/JOB/wav.scp ark,t:$sdata/JOB/utt2dur || \
        { echo "$0: there was a problem getting the durations"; exit 1; }

    # Concatenate the per-job results in job order.
    for n in `seq $nj`; do
      cat $sdata/$n/utt2dur
    done > $data/utt2dur
  fi
elif [ -f $data/feats.scp ]; then
  echo "$0: wave file does not exist so getting durations from feats files"
  if [[ -s $data/frame_shift ]]; then
    frame_shift=$(cat $data/frame_shift) || exit 1
    echo "$0: using frame_shift=$frame_shift from file $data/frame_shift"
  fi
  # The 1.5 correction is the typical value of (frame_length-frame_shift)/frame_shift.
  feat-to-len scp:$data/feats.scp ark,t:- |
    awk -v frame_shift=$frame_shift '{print $1, ($2+1.5)*frame_shift}' >$data/utt2dur
else
  echo "$0: Expected $data/wav.scp, $data/segments or $data/feats.scp to exist"
  exit 1
fi

# Sanity check: warn on any length mismatch with utt2spk, and fail when fewer
# than half of the utterances got a duration.
len1=$(wc -l < $data/utt2spk)
len2=$(wc -l < $data/utt2dur)
if [ "$len1" != "$len2" ]; then
  echo "$0: warning: length of utt2dur does not equal that of utt2spk, $len2 != $len1"
  if [ $len1 -gt $((len2*2)) ]; then
    echo "$0: less than half of utterances got a duration: failing."
    exit 1
  fi
fi

echo "$0: computed $data/utt2dur"

exit 0


================================================
FILE: egs/utils/data/get_utt2num_frames.sh
================================================
#! /bin/bash

# Copyright 2016  Vimal Manohar
# Apache 2.0.

# Defaults; overridable on the command line through utils/parse_options.sh.
cmd=run.pl
nj=4

frame_shift=0.01
frame_overlap=0.015

. utils/parse_options.sh
. ./path.sh

if [ $# -ne 1 ]; then
  echo "This script writes a file utt2num_frames with the "
  echo "number of frames in each utterance as measured based on the "
  echo "duration of the utterances (in utt2dur) and the specified "
  echo "frame_shift and frame_overlap."
  echo "Usage: $0 <data>"
  exit 1
fi

data=$1

if [ -s $data/utt2num_frames ]; then
  echo "$0: $data/utt2num_frames already present!"
  exit 0;
fi

# Without feats.scp the frame counts are estimated from the durations:
# num_frames = int((duration - frame_overlap) / frame_shift).
# Stop with an error if the durations cannot be obtained or converted,
# rather than reporting success with a missing or partial utt2num_frames.
if [ ! -f $data/feats.scp ]; then
  utils/data/get_utt2dur.sh --nj ${nj} --cmd "$cmd" $data || exit 1
  awk -v fs=$frame_shift -v fovlp=$frame_overlap \
    '{print $1" "int( ($2 - fovlp) / fs)}' $data/utt2dur > $data/utt2num_frames || exit 1
  exit 0
fi

# With feats.scp present, count the frames of the features directly,
# split over $nj parallel jobs.
utils/split_data.sh --per-utt $data $nj || exit 1
$cmd JOB=1:$nj $data/log/get_utt2num_frames.JOB.log \
  feat-to-len scp:$data/split${nj}utt/JOB/feats.scp ark,t:$data/split${nj}utt/JOB/utt2num_frames || exit 1

# Concatenate the per-job results in job order.
for n in `seq $nj`; do
  cat $data/split${nj}utt/$n/utt2num_frames
done > $data/utt2num_frames

echo "$0: Computed and wrote $data/utt2num_frames"


================================================
FILE: egs/utils/data/internal/choose_utts_to_combine.py
================================================
#!/usr/bin/env python

# Copyright 2016  Vijayaditya Peddinti
#           2016  Johns Hopkins University (author: Daniel Povey)
# Apache 2.0

from __future__ import print_function
import argparse
from random import randint
import sys
import os
from collections import defaultdict


# Command-line interface: two options followed by five positional filenames
# (two inputs, three outputs).
parser = argparse.ArgumentParser(description="""
This script, called from data/utils/combine_short_segments.sh, chooses consecutive
utterances to concatenate that will satisfy the minimum segment length.  It uses the
--spk2utt file to ensure that utterances from the same speaker are preferentially
combined (as far as possible while respecting the minimum segment length).
If it has to combine utterances across different speakers in order to satisfy the
duration constraint, it will assign the combined utterances to the speaker which
contributed the most to the duration of the combined utterances.


The utt2utts output of this program is a map from new
utterance-id to a list of old utterance-ids, so for example if the inputs were
utt1, utt2 and utt3, and utterances 2 and 3 were combined, the output might look
like:
utt1 utt1
utt2-combine2 utt2 utt3
The utt2spk output of this program assigns utterances to the speakers of the input;
in the (hopefully rare) case where utterances were combined across speakers, it
will assign the utterance to whichever of the original speakers contributed the most
to the grouped utterance.
""")


parser.add_argument("--min-duration", type = float, default = 1.55,
                    help="Minimum utterance duration")
# Kept as a 'true'/'false' string (not a bool); the rest of the script
# compares it against the string 'true'.
parser.add_argument("--merge-within-speakers-only", type = str, default = 'false',
                    choices = ['true', 'false'],
                    help="If true, utterances are only combined from the same speaker. "
                    "It may be useful for the speaker recognition task. "
                    "If false, utterances are preferentially combined from the same speaker, "
                    "and then combined across different speakers.")
parser.add_argument("spk2utt_in", type = str, metavar = "<spk2utt-in>",
                    help="Filename of [input] speaker to utterance map needed "
                    "because this script tries to merge utterances from the "
                    "same speaker as much as possible, and also needs to produce "
                    "an output utt2spk map.")
parser.add_argument("utt2dur_in", type = str, metavar = "<utt2dur-in>",
                    help="Filename of [input] utterance-to-duration map, with lines like 'utt1 1.23'.")
parser.add_argument("utt2utts_out", type = str, metavar = "<utt2utts-out>",
                    help="Filename of [output] new-utterance-to-old-utterances map, with lines "
                    "like 'utt1 utt1' or 'utt2-comb2 utt2 utt3'")
parser.add_argument("utt2spk_out", type = str, metavar = "<utt2spk-out>",
                    help="Filename of [output] utt2spk map, which maps new utterances to original "
                    "speakers.  If utterances were combined across speakers, we map the new "
                    "utterance to the speaker that contributed the most to them.")
parser.add_argument("utt2dur_out", type = str, metavar = "<utt2dur-out>",
                    help="Filename of [output] utt2dur map, which is just the summations of "
                    "the durations of the source utterances.")


# Parse the command line once, at module level; GetUtteranceGroups and the
# script code further down read the result through the global name 'args'.
args = parser.parse_args()


# Roundoff-tolerant "strictly less than": returns True only when x lies more
# than 1.0e-05 below y, so two values that are almost identical (differing only
# by roundoff effects) never compare as less-than.  It is meant for numbers
# that are otherwise always separated by a distance >> 1.0e-05.
def LessThan(x, y):
    tolerance = 1.0e-5
    threshold = y - tolerance
    return x < threshold


# This function implements the core of the utterance-combination code.
# The input 'durations' is a list of durations, which must all be
# > 0.0.  This function tries to combine consecutive indexes
# into groups such that for each group, the total duration is at
# least 'min_duration'.  It returns a list of (start,end) indexes,
# where 'end' is the last index plus one.
# For example, CombineList(0.1, [5.0,6.0,7.0]) would return
# [ (0,1), (1,2), (2,3) ] because no combination is necessary; each
# returned pair represents a singleton group.
# Or CombineList(1.0, [0.5, 0.6, 0.7]) would return
# [ (0,3) ].
# Or CombineList(1.0, [0.5, 0.6, 1.7]) would return
# [ (0,2), (2,3) ].
# Note: if sum(durations) < min_duration, this function will
# return everything in one group but of course the sum of durations
# will be less than min_duration.
# An empty 'durations' list yields an empty list of groups.
def CombineList(min_duration, durations):
    assert min_duration >= 0.0
    if len(durations) == 0:
        # nothing to group; without this check min() below would raise
        # ValueError on the empty list.
        return []
    assert min(durations) > 0.0

    num_utts = len(durations)

    # for each utterance-index i, group_start[i] gives us the
    # start-index of the group of utterances of which it's currently
    # a member.
    group_start = list(range(num_utts))
    # if utterance-index i currently corresponds to the start of a group
    # of utterances, then group_durations[i] is the total duration of
    # that utterance-group, otherwise undefined.
    group_durations = list(durations)
    # if utterance-index i currently corresponds to the start of a group
    # of utterances, then group_end[i] is the end-index (i.e. last index plus one)
    # of that utterance-group, otherwise undefined.
    group_end = [ x + 1 for x in range(num_utts) ]

    # the queue holds the start-indexes of the groups that are (or were, when
    # they were queued) below the minimum duration.  It is popped from the end.
    queue = [ i for i in range(num_utts) if LessThan(group_durations[i], min_duration) ]

    while len(queue) > 0:
        i = queue.pop()
        if group_start[i] != i or not LessThan(group_durations[i], min_duration):
            # this group no longer exists or already has at least the minimum duration.
            continue
        this_dur = group_durations[i]
        # left_dur is the duration of the group to the left of this group,
        # or 0.0 if there is no such group.  (0.0 is safe as a marker because
        # all durations are > 0.0.)
        left_dur = group_durations[group_start[i-1]] if i > 0 else 0.0
        # right_dur is the duration of the group to the right of this group,
        # or 0.0 if there is no such group.
        right_dur = group_durations[group_end[i]] if group_end[i] < num_utts else 0.0


        if left_dur == 0.0 and right_dur == 0.0:
            # there is only one group.  Nothing more to merge; break
            assert group_start[i] == 0 and group_end[i] == num_utts
            break
        # work out whether to combine left or right,
        # by means of the combine_left variable [ True or False ]
        if left_dur == 0.0:
            combine_left = False
        elif right_dur == 0.0:
            combine_left = True
        elif LessThan(left_dur + this_dur, min_duration):
            # combining left would still be below the minimum duration->
            # combine right... if it's above the min duration then good;
            # otherwise it still doesn't really matter so we might as well
            # pick one.
            combine_left = False
        elif LessThan(right_dur + this_dur, min_duration):
            # combining right would still be below the minimum duration,
            # and combining left would be >= the min duration (else we wouldn't
            # have reached this line) -> combine left.
            combine_left = True
        elif LessThan(left_dur, right_dur):
            # if we reached here then combining either way would take us >= the
            # minimum duration; but if left_dur < right_dur then we combine left
            # because that would give us more evenly sized segments.
            combine_left = True
        else:
            # if we reached here then combining either way would take us >= the
            # minimum duration; but  left_dur >= right_dur, so we combine right
            # because that would give us more evenly sized segments.
            combine_left = False

        if combine_left:
            assert left_dur != 0.0
            new_group_start = group_start[i-1]
            group_end[new_group_start] = group_end[i]
            for j in range(group_start[i], group_end[i]):
                group_start[j] = new_group_start
                group_durations[new_group_start] += durations[j]
            # note: there is no need to add group_durations[new_group_start] to
            # the queue even if it is still below the minimum length, because it
            # would have previously had to have been below the minimum length,
            # therefore it would already be in the queue.
        else:
            assert right_dur != 0.0
            # group start doesn't change, group end changes.
            old_group_end = group_end[i]
            new_group_end = group_end[old_group_end]
            group_end[i] = new_group_end
            for j in range(old_group_end, new_group_end):
                group_durations[i] += durations[j]
                group_start[j] = i
            if LessThan(group_durations[i], min_duration):
                # the group starting at i is still below the minimum length, so
                # we need to put it back on the queue.
                queue.append(i)

    # walk the groups from left to right, hopping from each group's start to
    # its end, to produce the list of (start, end) pairs.
    ans = []
    cur_group_start = 0
    while cur_group_start < num_utts:
        ans.append( (cur_group_start, group_end[cur_group_start]) )
        cur_group_start = group_end[cur_group_start]
    return ans

def SelfTest():
    """Sanity-check CombineList on fixed examples and on random inputs."""
    # (min_duration, durations, expected groups)
    fixed_cases = [
        (0.1, [5.0, 6.0, 7.0], [ (0,1), (1,2), (2,3) ]),
        (0.5, [0.1, 6.0, 7.0], [ (0,2), (2,3) ]),
        (0.5, [6.0, 7.0, 0.1], [ (0,1), (1,3) ]),
        # in the two examples below, it combines with the shorter one if both
        # would be above min-dur.
        (0.5, [6.0, 0.1, 7.0], [ (0,2), (2,3) ]),
        (0.5, [7.0, 0.1, 6.0], [ (0,1), (1,3) ]),
        # in the example below, it combines with whichever one would
        # take it above the min-dur, if there is only one such.
        # note, it tests the 0.1 first as the queue is popped from the end.
        (1.0, [1.0, 0.5, 0.1, 6.0], [ (0,2), (2,4) ]),
    ]
    for case_min_dur, case_durations, expected in fixed_cases:
        assert CombineList(case_min_dur, case_durations) == expected

    min_duration = 0.05
    for _ in range(100):
        num_utts = randint(1, 15)
        durations = [ 0.01 * randint(1, 10) for _ in range(num_utts) ]
        ranges = CombineList(min_duration, durations)
        if len(ranges) > 1:
            # with more than one group, every group must reach min_duration.
            for (start, end) in ranges:
                this_dur = sum(durations[start:end])
                assert not LessThan(this_dur, min_duration)

        # the grouping must not be affected by very tiny differences in the
        # inputs.
        durations2 = [ d + 1.0e-07 * randint(-5, 5) for d in durations ]
        assert CombineList(min_duration, durations2) == ranges

# This function figures out the grouping of utterances.
# The input is:
# 'min_duration' which is the minimum utterance length in seconds.
# 'merge_within_speakers_only' which is a ['true', 'false'] choice.
# If true, then utterances are only combined if they belong to the same speaker.
# 'spk2utt' which is a list of pairs (speaker-id, [list-of-utterances])
# 'utt2dur' which is a dict from utterance-id to duration (as a float)
# It returns a list of lists of utterances; each list corresponds to
# a group, e.g.
# [ ['utt1'], ['utt2', 'utt3'] ]
# The program exits with an error message if an utterance in 'spk2utt' has no
# entry in 'utt2dur'.
def GetUtteranceGroups(min_duration, merge_within_speakers_only, spk2utt, utt2dur):
    # utt_groups will be a list of lists of utterance-ids formed from the
    # first pass of combination.
    utt_groups = []
    # group_durations will be the durations of the corresponding elements of
    # 'utt_groups'.
    group_durations = []

    # This block calls CombineList for the utterances of each speaker
    # separately, in the 'first pass' of combination.
    for (spk, utts) in spk2utt:
        durations = [] # durations for this group of utts.
        for utt in utts:
            try:
                durations.append(utt2dur[utt])
            except KeyError:
                # only a missing utterance is expected here; anything else
                # should surface as a normal traceback.
                sys.exit("choose_utts_to_combine.py: no duration available "
                         "in utt2dur file {0} for utterance {1}".format(
                        args.utt2dur_in, utt))
        # each element returned by CombineList is a 2-tuple (start, end)
        for start, end in CombineList(min_duration, durations):
            utt_groups.append(utts[start:end])
            group_durations.append(sum(durations[start:end]))

    # Sanity check only: warn (on stderr) if the grouped durations do not add
    # up to the total of the utt2dur map.
    old_dur_sum = sum(utt2dur.values())
    new_dur_sum = sum(group_durations)
    if abs(old_dur_sum - new_dur_sum) > 0.0001 * old_dur_sum:
        print("choose_utts_to_combine.py: large difference in total "
              "durations: {0} vs {1} ".format(old_dur_sum, new_dur_sum),
              file = sys.stderr)

    if merge_within_speakers_only == 'true':
        return utt_groups

    # Now we combine the groups obtained above, in case we had situations where
    # the combination of all the utterances of one speaker were still below
    # the minimum duration.
    new_utt_groups = []
    for start, end in CombineList(min_duration, group_durations):
        # build a fresh list so that the first-pass groups are left intact.
        merged_group = []
        for group in utt_groups[start:end]:
            merged_group.extend(group)
        new_utt_groups.append(merged_group)
    print("choose_utts_to_combine.py: combined {0} utterances to {1} utterances "
          "while respecting speaker boundaries, and then to {2} utterances "
          "with merging across speaker boundaries.".format(
            len(utt2dur), len(utt_groups), len(new_utt_groups)),
          file = sys.stderr)
    return new_utt_groups


# Run the built-in sanity checks of CombineList before doing any real work.
SelfTest()

# A negative minimum duration is rejected by CombineList's own assertion, so
# stop here with a clear message (sys.exit writes it to stderr and exits with
# a non-zero status) instead of printing and carrying on.
if args.min_duration < 0.0:
    sys.exit("choose_utts_to_combine.py: bad minimum duration {0}".format(
            args.min_duration))

# spk2utt is a list of 2-tuples (speaker-id, [list-of-utterances])
spk2utt = []
# utt2spk is a dict from utterance-id to speaker-id.
utt2spk = dict()
try:
    f = open(args.spk2utt_in)
except (IOError, OSError):
    sys.exit("choose_utts_to_combine.py: error opening --spk2utt={0}".format(args.spk2utt_in))
# 'with' closes the file on every path, including the sys.exit() calls below.
with f:
    for line in f:
        a = line.split()
        # each line needs a speaker-id followed by at least one utterance-id.
        if len(a) < 2:
            sys.exit("choose_utts_to_combine.py: bad line in spk2utt file: " + line)
        spk = a[0]
        utts = a[1:]
        spk2utt.append((spk, utts))
        for utt in utts:
            if utt in utt2spk:
                sys.exit("choose_utts_to_combine.py: utterance {0} is listed more than once "
                         "in the spk2utt file {1}".format(utt, args.spk2utt_in))
            utt2spk[utt] = spk

# utt2dur is a dict from utterance-id (as a string) to duration in seconds (as a float)
utt2dur = dict()
try:
    f = open(args.utt2dur_in)
except (IOError, OSError):
    sys.exit("choose_utts_to_combine.py: error opening utt2dur file {0}".format(args.utt2dur_in))
# 'with' makes sure the file is closed, also when we exit on a bad line.
with f:
    for line in f:
        try:
            # ValueError covers both a wrong number of fields and a duration
            # that is not a valid float.
            [ utt, dur ] = line.split()
            dur = float(dur)
            utt2dur[utt] = dur
        except ValueError:
            sys.exit("choose_utts_to_combine.py: bad line in utt2dur file {0}: {1}".format(
                    args.utt2dur_in, line))


utt_groups = GetUtteranceGroups(args.min_duration, args.merge_within_speakers_only, spk2utt, utt2dur)

# Name every group after its first utterance; groups of more than one
# utterance get a '-comb<N>' suffix, giving an array like
# [ 'utt1', 'utt2-comb2', 'utt4', ... ]
utt_group_names = []
for group in utt_groups:
    if len(group) == 1:
        utt_group_names.append(group[0])
    else:
        utt_group_names.append("{0}-comb{1}".format(group[0], len(group)))


# write the utt2utts file: one line per group, the new name followed by the
# original utterance-ids it is made of.
try:
    with open(args.utt2utts_out, 'w') as f:
        for group_name, members in zip(utt_group_names, utt_groups):
            print(group_name, ' '.join(members), file = f)
except Exception as e:
    sys.exit("choose_utts_to_combine.py: exception writing to "
             "<utt2utts-out>={0}: {1}".format(args.utt2utts_out, str(e)))

# write the utt2spk file: one line per group, mapping the new utterance name
# to a speaker of the input.
try:
    with open(args.utt2spk_out, 'w') as f:
        for new_utt, utt_group in zip(utt_group_names, utt_groups):
            spk_list = [ utt2spk[utt] for utt in utt_group ]
            first_spk = spk_list[0]
            if all(s == first_spk for s in spk_list):
                # every utterance of the group has the same speaker.
                spk = first_spk
            else:
                # spk2dur maps each speaker-id to the duration it contributes
                # to this group.
                spk2dur = dict()
                for utt in utt_group:
                    owner = utt2spk[utt]
                    spk2dur[owner] = spk2dur.get(owner, 0.0) + utt2dur[utt]
                # pick the speaker that contributed the most; going through
                # the speakers in sorted order and requiring a clearly longer
                # duration makes ties resolve deterministically to the
                # earlier speaker.
                longest_spk_dur = -1.0
                spk = None
                for this_spk in sorted(spk2dur):
                    this_dur = spk2dur[this_spk]
                    if LessThan(longest_spk_dur, this_dur):
                        longest_spk_dur, spk = this_dur, this_spk
                assert spk is not None
            print(new_utt, spk, file = f)
except Exception as e:
    sys.exit("choose_utts_to_combine.py: exception writing to "
             "<utt2spk-out>={0}: {1}".format(args.utt2spk_out, str(e)))

# write the utt2dur file: the duration of a group is the sum of the durations
# of the utterances it is made of.
try:
    with open(args.utt2dur_out, 'w') as f:
        for utt_name, members in zip(utt_group_names, utt_groups):
            duration = sum(utt2dur[utt] for utt in members)
            print(utt_name, duration, file = f)
except Exception as e:
    sys.exit("choose_utts_to_combine.py: exception writing to "
             "<utt2dur-out>={0}: {1}".format(args.utt2dur_out, str(e)))


================================================
FILE: egs/utils/data/internal/combine_segments_to_recording.py
================================================
#!/usr/bin/env python3

# Copyright 2018  Vimal Manohar
# Apache 2.0

from __future__ import print_function
import argparse
import sys
import collections
from collections import defaultdict

def get_args():
    """Build the command-line parser and return the parsed arguments.

    Two positional arguments (input segments file, output utt2spk file) and
    one optional --write-reco2utt filename.
    """
    summary = """
        This script combines segments into utterances at
        recording-level and write out new utt2spk file with reco-id as the
        speakers. If --write-reco2utt is provided, it writes a mapping from
        recording-id to the list of utterances sorted by start and end times.
        This map can be used to combine text corresponding to the segments to
        recording-level."""
    cli = argparse.ArgumentParser(description=summary)

    cli.add_argument(
        "--write-reco2utt",
        help="If provided, writes a mapping from recording-id to list of "
             "utterances sorted by start and end times.")
    cli.add_argument("segments_in", help="Input segments file")
    cli.add_argument("utt2spk_out", help="Output utt2spk file")

    return cli.parse_args()


def main():
    """Write a recording-level utt2spk file (and optionally a reco2utt map).

    Reads the segments file (fields: utterance-id, recording-id, start, end),
    groups the utterances by recording, and writes one "<reco> <reco>" line
    per recording to the utt2spk output.  If --write-reco2utt was given, it
    also writes, for each recording, its utterances ordered by numeric start
    time and then end time.

    Raises:
        TypeError: if a line of the segments file has fewer than 4 fields.
        ValueError: if a start or end time is not a valid number.
    """
    args = get_args()

    segments_for_reco = defaultdict(list)
    with open(args.segments_in) as segments_reader:
        for line in segments_reader:
            parts = line.strip().split()

            if len(parts) < 4:
                raise TypeError("bad line in segments file {}".format(line))

            utt = parts[0]
            reco = parts[1]
            # Times are stored as floats so that the sort below is numeric;
            # sorting the raw strings would put "10.0" before "9.0".
            start_time = float(parts[2])
            end_time = float(parts[3])

            segments_for_reco[reco].append((utt, start_time, end_time))

    if args.write_reco2utt is not None:
        with open(args.write_reco2utt, 'w') as reco2utt_writer, \
                open(args.utt2spk_out, 'w') as utt2spk_writer:
            for reco, segments_in_reco in segments_for_reco.items():
                ordered = sorted(segments_in_reco,
                                 key=lambda seg: (seg[1], seg[2]))
                utts = ' '.join(seg[0] for seg in ordered)
                print("{0} {1}".format(reco, utts), file=reco2utt_writer)
                print("{0} {0}".format(reco), file=utt2spk_writer)
    else:
        with open(args.utt2spk_out, 'w') as utt2spk_writer:
            for reco in segments_for_reco:
                print("{0} {0}".format(reco), file=utt2spk_writer)


if __name__ == "__main__":
    main()


================================================
FILE: egs/utils/data/internal/modify_speaker_info.py
================================================
#!/usr/bin/env python

from __future__ import print_function
import argparse, sys,os
from collections import defaultdict
# Command-line interface.  The two *-max options are required; the rest of the
# script treats a limit as active only when its value is > 0.
parser = argparse.ArgumentParser(description="""
Combine consecutive utterances into fake speaker ids for a kind of
poor man's segmentation.  Reads old utt2spk from standard input,
outputs new utt2spk to standard output.""")
parser.add_argument("--utts-per-spk-max", type = int, required = True,
                    help="Maximum number of utterances allowed per speaker")
parser.add_argument("--seconds-per-spk-max", type = float, required = True,
                    help="""Maximum duration in seconds allowed per speaker.
                         If this option is >0, --utt2dur option must be provided.""")
parser.add_argument("--utt2dur", type = str,
                    help="""Filename of input 'utt2dur' file (needed only if
                    --seconds-per-spk-max is >0)""")
# Kept as a 'true'/'false' string (not a bool); the script compares it against
# the string 'true'.
parser.add_argument("--respect-speaker-info", type = str, default = 'true',
                    choices = ['true', 'false'],
                    help="If true, the output speakers will be split from "
                    "existing speakers.")

args = parser.parse_args()

# utt2spk: utterance-id -> speaker-id, as read from standard input.
utt2spk = dict()
# spk2utt: speaker-id -> list of its utterances in input order; a speaker
# that has not been seen yet starts out with an empty list.
spk2utt = defaultdict(list)

# readline() returns '' only at end of input, which ends the iteration.
for line in iter(sys.stdin.readline, ''):
    a = line.split()
    if len(a) != 2:
        sys.exit("modify_speaker_info.py: bad utt2spk line from standard input (expected two fields): " +
                 line)
    utt, spk = a
    utt2spk[utt] = spk
    spk2utt[spk].append(utt)

# The durations are only needed (and only read) when the per-speaker duration
# limit is active.
if args.seconds_per_spk_max > 0:
    # utt2dur: utterance-id -> duration in seconds (as a float).
    utt2dur = dict()
    try:
        # 'with' closes the file on every path, including the sys.exit() below.
        with open(args.utt2dur) as f:
            for line in f:
                a = line.split()
                if len(a) != 2:
                    sys.exit("modify_speaker_info.py: bad line in utt2dur file {0} "
                             "(expected two fields): ".format(args.utt2dur) + line)
                [ utt, dur ] = a
                utt2dur[utt] = float(dur)
        # every utterance read from standard input must have a duration.
        for utt in utt2spk:
            if not utt in utt2dur:
                sys.exit("modify_speaker_info.py: utterance {0} not in utt2dur file {1}".format(
                        utt, args.utt2dur))
    except Exception as e:
        # sys.exit() raises SystemExit, which is not an Exception subclass, so
        # the exits above pass straight through this handler.
        sys.exit("modify_speaker_info.py: problem reading utt2dur info: " + str(e))

# Splits a list of utts into a list of lists, subject to the limits given on
# the command line (each limit is only applied when it is > 0): a group never
# holds more than --utts-per-spk-max utterances, and a non-empty group is
# closed before an utterance that would take its total duration above
# --seconds-per-spk-max.  Note: the last list will tend to be shorter than the
# others, we make no attempt to fix this.
def SplitIntoGroups(uttlist):
    max_utts = args.utts_per_spk_max
    max_seconds = args.seconds_per_spk_max

    groups = []  # list of lists.
    current = []
    current_dur = 0.0
    for utt in uttlist:
        group_is_full = max_utts > 0 and len(current) == max_utts
        if not group_is_full:
            # only look at durations when the count limit did not already
            # close the group.
            group_is_full = (max_seconds > 0 and len(current) > 0 and
                             current_dur + utt2dur[utt] > max_seconds)
        if group_is_full:
            groups.append(current)
            current = []
            current_dur = 0.0
        current.append(utt)
        if max_seconds > 0:
            current_dur += utt2dur[utt]
    if len(current) > 0:
        groups.append(current)
    return groups


# Returns a zero-padded printf-style integer format that is just wide enough
# for d: '%01d' if d < 10, '%02d' if d < 100, and so on.  Numbers printed with
# it therefore sort correctly as strings.
def GetFormatString(d):
    num_digits = 1
    remaining = d
    while remaining >= 10:
        remaining = remaining // 10  # integer division
        num_digits = num_digits + 1
    # e.g. we might return the string '%01d' or '%02d'
    return '%0{0}d'.format(num_digits)


# Print the new utt2spk map to standard output.  Group numbers start at 1 and
# are zero-padded to the width of the number of groups.
if args.respect_speaker_info == 'true':
    # split each existing speaker separately; the new ids are
    # '<old-speaker>-<group-number>'.
    for spk in sorted(spk2utt.keys()):
        uttlists = SplitIntoGroups(spk2utt[spk])
        format_string = '%s-' + GetFormatString(len(uttlists))
        for group_number, group in enumerate(uttlists, 1):
            # e.g. '%s-%02d' % ('john_smith', 10) gives 'john_smith-10'.
            this_spk = format_string % (spk, group_number)
            for utt in group:
                print(utt, this_spk)
else:
    # ignore the existing speakers: split the sorted list of all utterances;
    # the new ids are 'speaker-<group-number>'.
    uttlists = SplitIntoGroups(sorted(utt2spk.keys()))
    format_string = 'speaker-' + GetFormatString(len(uttlists))
    for group_number, group in enumerate(uttlists, 1):
        # e.g. 'speaker-%04d' % 106 gives 'speaker-0106'.
        this_spk = format_string % group_number
        for utt in group:
            print(utt, this_spk)


================================================
FILE: egs/utils/data/internal/perturb_volume.py
================================================
#!/usr/bin/env python

# Copyright 2017  Vimal Manohar
# Apache 2.0

"""
This script reads a wav.scp file from the input and perturbs the
volume of the recordings and writes to stdout the contents of
a new wav.scp file.
"""
from __future__ import print_function

import argparse
import re
import random
import sys

def get_args():
    """Parse and return the command-line arguments of this script.

    Empty-string values of --reco2vol and --write-reco2vol are normalized to
    None, so that an empty value behaves like an omitted option.
    """
    parser = argparse.ArgumentParser(description="""
        This script reads a wav.scp file from the input and perturbs the
        volume of the recordings and writes to stdout the contents of
        a new wav.scp file.
        If --reco2vol is provided, then for each recording, the volume factor
        specified in that file is applied.
        Otherwise, a volume factor is chosen randomly from a uniform
        distribution between --scale-low and --scale-high.
        """)

    parser.add_argument("--scale-low", type=float, default=0.125,
                        help="Minimum volume scale to be applied.")
    parser.add_argument("--scale-high", type=float, default=2,
                        help="Maximum volume scale to be applied.")
    parser.add_argument("--reco2vol", type=str, default=None,
                        help="If supplied, it must be a file of the format "
                        "<reco-id> <volume-scale>, which specifies the "
                        "volume scale to be applied for each recording.")
    parser.add_argument("--write-reco2vol", type=str, default=None,
                        help="If provided, the volume scale used for each "
                        "recording will be written to this file")
    args = parser.parse_args()

    # Treat an explicitly empty file name as "option not given".
    if args.reco2vol == "":
        args.reco2vol = None
    if args.write_reco2vol == "":
        args.write_reco2vol = None

    return args


def read_reco2vol(volumes_file):
    """Load the per-recording volume scales from volumes_file.

    Each non-blank line must hold exactly two whitespace-separated fields,
    <reco-id> <volume-scale>.
    Returns a dictionary { reco-id : volume-scale } with float values.
    Raises RuntimeError for a line that does not have exactly two fields.
    """
    reco2scale = {}
    with open(volumes_file) as volume_reader:
        for raw_line in volume_reader:
            stripped = raw_line.strip()
            if not stripped:
                # Blank lines are tolerated and ignored.
                continue

            fields = stripped.split()
            if len(fields) != 2:
                raise RuntimeError("Unable to parse the line {0} in file {1}."
                                   "".format(stripped, volumes_file))
            reco_id, scale = fields
            reco2scale[reco_id] = float(scale)
    return reco2scale


def run(args):
    """Read wav.scp lines from stdin and print volume-perturbed lines.

    For every non-blank input line a 'sox --vol <scale>' stage is added to
    the recording's rxfilename and the new line is printed to stdout.  The
    scale comes from the --reco2vol file when one was given, otherwise it is
    drawn uniformly from [args.scale_low, args.scale_high] using a fixed
    random seed.  If args.write_reco2vol is set, the scale used for each
    recording is also written to that file as '<reco-id> <scale>'.

    Raises RuntimeError if --reco2vol was given but lacks a recording.
    """
    random.seed(0)  # fixed seed: repeated runs pick the same volumes

    volumes = None
    if args.reco2vol is not None:
        volumes = read_reco2vol(args.reco2vol)

    volume_writer = None
    if args.write_reco2vol is not None:
        volume_writer = open(args.write_reco2vol, 'w')

    try:
        for line in sys.stdin:
            if len(line.strip()) == 0:
                continue
            parts = line.strip().split()
            reco_id = parts[0]

            # Always draw a value so the random sequence does not depend on
            # whether explicit volumes are supplied.
            vol = random.uniform(args.scale_low, args.scale_high)
            if volumes is not None:
                if reco_id not in volumes:
                    raise RuntimeError('Could not find volume for id {0} in '
                                       '{1}'.format(reco_id, args.reco2vol))
                vol = volumes[reco_id]

            # Handle three cases of rxfilenames appropriately;
            # 'input piped command', 'file offset' and 'filename'
            if line.strip()[-1] == '|':
                print ('{0} sox --vol {1} -t wav - -t wav - |'.format(
                    line.strip(), vol))
            elif re.search(':[0-9]+$', line.strip()) is not None:
                print ('{id} wav-copy {wav} - | '
                       'sox --vol {vol} -t wav - -t wav - |'.format(
                           id=parts[0], wav=' '.join(parts[1:]), vol=vol))
            else:
                print ('{id} sox --vol {vol} -t wav {wav} -t wav - |'.format(
                    id=parts[0], wav=' '.join(parts[1:]), vol=vol))

            if volume_writer is not None:
                volume_writer.write('{id} {vol}\n'.format(id=parts[0],
                                                          vol=vol))
    finally:
        # Close (and thereby flush) the reco2vol file even if a line fails.
        if volume_writer is not None:
            volume_writer.close()


def main():
    """Script entry point: parse the command line and run the perturbation."""
    run(get_args())


if __name__ == "__main__":
    main()


================================================
FILE: egs/utils/data/limit_feature_dim.sh
================================================
#!/usr/bin/env bash

# Copyright 2016  Alibaba Robotics Corp. (author: Xingyu Na)
# Apache 2.0

# The script creates a new data directory by selecting a specified
# dimension range of the features in the source directory.

. utils/parse_options.sh

if [ $# != 3 ]; then
  echo "Usage: "
  echo "  $0 <feat-dim-range> <srcdir> <destdir>"
  echo "The script creates a new data directory by selecting a specified"
  echo "dimension range of the features in the source directory."
  echo "e.g.:"
  echo " $0 0:39 data/train_hires_pitch data/train_hires"
  exit 1;
fi

feat_dim_range=$1
srcdir=$2
destdir=$3

# The destination feats.scp is rewritten below, so it must not be the source.
if [ "$destdir" == "$srcdir" ]; then
  echo "$0: this script requires <srcdir> and <destdir> to be different."
  exit 1
fi

if [ ! -f $srcdir/feats.scp ]; then
  echo "$0: no such file $srcdir/feats.scp"
  exit 1;
fi

mkdir -p $destdir
utils/copy_data_dir.sh $srcdir $destdir

# The copied CMVN stats refer to the full-dimensional features, so drop them.
if [ -f $destdir/cmvn.scp ]; then
  rm $destdir/cmvn.scp
  echo "$0: warning: removing $destdir/cmvn.scp, you will have to regenerate it from the features."
fi

# Append the column range '[:,<feat-dim-range>]' to every feats.scp entry and
# let normalize_data_range.pl merge it with any range already present.
rm $destdir/feats.scp
sed 's/$/\[:,'${feat_dim_range}'\]/' $srcdir/feats.scp | \
  utils/data/normalize_data_range.pl > $destdir/feats.scp

# Only ask the validator to check 'text' if the source directory had one.
[ ! -f $srcdir/text ] && validate_opts="$validate_opts --no-text"
utils/validate_data_dir.sh $validate_opts $destdir


================================================
FILE: egs/utils/data/modify_speaker_info.sh
================================================
#!/usr/bin/env bash

# Copyright 2013-2016  Johns Hopkins University (author: Daniel Povey)
# Apache 2.0

# This script copies a data directory (like utils/copy_data_dir.sh) while
# modifying (splitting or merging) the speaker information in that data directory.
#
# This is done without looking at the data at all; we use only duration
# constraints and maximum-num-utts-per-speaker to assign contiguous
# sets of utterances to speakers.
#
# This has two general uses:
# (1) when dumping iVectors for training purposes, it's helpful to have
#   a good variety of iVectors, and this can be accomplished by splitting
#   speakers up into multiple copies of those speakers.  We typically
#   use the --utts-per-spk-max 2 option for this.
# (2) when dealing with data that is not diarized, and given that we
#   haven't checked any diarization scripts into Kaldi yet, this
#   script can do a "dumb" diarization that just groups consecutive
#   utterances into groups based on length constraints.
#   There are two cases here:

#       a) With --respect-speaker-info true (the default),
#         it only splits within existing speakers.
#         This is suitable when you have existing speaker
#         info that's meaningful in some way, e.g. represents
#         individual recordings.
#      b) With --respect-speaker-info false,
#        it completely ignores the existing speaker information
#        and constructs new speaker identities based on
#        utterance names.  This is suitable in scenarios when
#        you have a one-to-one map between speakers and
#        utterances.

# begin configuration section
utts_per_spk_max=-1
seconds_per_spk_max=-1
respect_speaker_info=true
# end configuration section

. utils/parse_options.sh

if [ $# != 2 ]; then
  echo "Usage: "
  echo "  $0 [options] <srcdir> <destdir>"
  echo "e.g.:"
  echo " $0 --utts-per-spk-max 2 data/train data/train-max2"
  echo "Options"
  echo "   --utts-per-spk-max <n>  # number of utterances per speaker maximum,"
  echo "                           # default -1 (meaning no maximum).  E.g. 2."
  echo "   --seconds-per-spk-max <n> # number of seconds per speaker maximum,"
  echo "                             # default -1 (meaning no maximum).  E.g. 60."
  echo "   --respect-speaker-info <true|false>  # If true, respect the"
  echo "                                        # existing speaker map (i.e. do not"
  echo "                                        # assign utterances from different"
  echo "                                        # speakers to the same generated speaker)."
  echo "                                        # Default: true."
  echo "Note: one or both of the --utts-per-spk-max or --seconds-per-spk-max"
  echo "options is required."
  exit 1;
fi

export LC_ALL=C

srcdir=$1
destdir=$2

if [ "$destdir"  == "$srcdir" ]; then
  echo "$0: <srcdir> must be different from <destdir>."
  exit 1
fi

# At least one of the two limits is required (see the usage message), so
# treat a missing limit as a usage error rather than silently continuing.
if [ "$seconds_per_spk_max" == "-1" ] && ! [ "$utts_per_spk_max" -gt 0 ]; then
  echo "$0: one or both of the --utts-per-spk-max or --seconds-per-spk-max options must be provided."
  exit 1
fi

if [ ! -f $srcdir/utt2spk ]; then
  echo "$0: no such file $srcdir/utt2spk"
  exit 1;
fi

# From here on, any failing command (also inside a pipeline) aborts the script.
set -e;
set -o pipefail

mkdir -p $destdir

if [ "$seconds_per_spk_max" != -1 ]; then
  # we need the utt2dur file.
  utils/data/get_utt2dur.sh $srcdir
  utt2dur_opt="--utt2dur=$srcdir/utt2dur"
else
  utt2dur_opt=
fi

# The python script reads the old utt2spk on stdin and writes the new one.
utils/data/internal/modify_speaker_info.py \
   $utt2dur_opt --respect-speaker-info=$respect_speaker_info \
  --utts-per-spk-max=$utts_per_spk_max --seconds-per-spk-max=$seconds_per_spk_max \
  <$srcdir/utt2spk >$destdir/utt2spk

utils/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt

# This script won't create the new cmvn.scp, it should be recomputed.
if [ -f $destdir/cmvn.scp ]; then
  mkdir -p $destdir/.backup
  mv $destdir/cmvn.scp $destdir/.backup
  echo "$0: moving $destdir/cmvn.scp to $destdir/.backup/cmvn.scp"
fi

# these things won't be affected by the change of speaker mapping.
for f in feats.scp segments wav.scp reco2file_and_channel text stm glm ctm; do
  [ -f $srcdir/$f ] && cp $srcdir/$f $destdir/
done


orig_num_spk=$(wc -l <$srcdir/spk2utt)
new_num_spk=$(wc -l <$destdir/spk2utt)

echo "$0: copied data from $srcdir to $destdir, number of speakers changed from $orig_num_spk to $new_num_spk"
# Tell the validator which optional files are legitimately absent.
opts=
[ ! -f $srcdir/feats.scp ] && opts="--no-feats"
[ ! -f $srcdir/text ] && opts="$opts --no-text"
[ ! -f $srcdir/wav.scp ] && opts="$opts --no-wav"

utils/validate_data_dir.sh $opts $destdir


================================================
FILE: egs/utils/data/modify_speaker_info_to_recording.sh
================================================
#!/usr/bin/env bash

# Copyright 2017  Vimal Manohar
# Apache 2.0.

# Copy the data directory, but modify it to use the recording-id as the 
# speaker. This is useful to get matching speaker information in the 
# whole recording data directory.
# Note that this also appends the recording-id as a prefix to the 
# utterance-id.

if [ $# -ne 2 ]; then
  echo "Usage: $0 <in-data> <out-data>"
  echo " e.g.: $0 data/train data/train_recospk"
  exit 1
fi

in_data=$1
out_data=$2

mkdir -p $out_data

# These three files must exist in the input directory.
for f in wav.scp segments utt2spk; do 
  if [ ! -f $in_data/$f ]; then
    echo "$0: Could not find file $in_data/$f" 
    exit 1
  fi
done

cp $in_data/wav.scp $out_data/ || exit 1
# reco2file_and_channel is optional: a failed copy is silently ignored.
cp $in_data/reco2file_and_channel $out_data/ 2> /dev/null || true
# Map each old utterance-id to '<recording-id>-<old utterance-id>', taking both
# ids from the first two fields of the segments file.
awk '{print $1" "$2"-"$1}' $in_data/segments > \
  $out_data/old2new.uttmap || exit 1
# Rename the utterances in the segments file (first field only).
utils/apply_map.pl -f 1 $out_data/old2new.uttmap < $in_data/segments > \
  $out_data/segments || exit 1
# The new utt2spk is '<new utterance-id> <recording-id>', i.e. the recording
# acts as the speaker.
awk '{print $1" "$2}' $out_data/segments > $out_data/utt2spk || exit 1
utils/utt2spk_to_spk2utt.pl $out_data/utt2spk > $out_data/spk2utt || exit 1

# text and feats.scp are optional; when present, rename their utterance-ids.
if [ -f $in_data/text ]; then
  utils/apply_map.pl -f 1 $out_data/old2new.uttmap < $in_data/text > \
    $out_data/text || exit 1
fi

if [ -f $in_data/feats.scp ]; then
  utils/apply_map.pl -f 1 $out_data/old2new.uttmap < $in_data/feats.scp > \
    $out_data/feats.scp || exit 1
fi

utils/fix_data_dir.sh $out_data || exit 1
utils/validate_data_dir.sh --no-text --no-feats $out_data || exit 1


================================================
FILE: egs/utils/data/normalize_data_range.pl
================================================
#!/usr/bin/env perl

# This script is intended to read and write scp files possibly containing indexes for
# sub-ranges of features, like
# foo-123  bar.ark:431423[78:89]
# meaning rows 78 through 89 of the matrix located at bar.ark:431423.
#
# Its purpose is to normalize lines which have ranges on top of ranges, like
#
# foo-123  bar.ark:431423[78:89][3:4]
#
# This program interprets the later [] expression as a sub-range of the matrix returned by the first []
# expression; in this case, we'd get
#
# foo-123  bar.ark:431423[81:82]
#
# Note that these ranges are based on zero-indexing, and have a 'first:last'
# interpretation, so the range [0:0] is a matrix with one row.  And also note
# that column ranges are permitted, after row ranges, and the row range may be
# empty, e.g.

# foo-123  bar.ark:431423[81:82,0:13]
# or
# foo-123  bar.ark:431423[,0:13]
#

# This program reads from the standard input (or command-line file or files),
# and writes to the standard output.


# This function combines ranges, either row or column ranges.  start1 and end1
# are the first range, and start2 and end2 are interpreted as a sub-range of the
# first range.  It is acceptable for either start1 and end1, or start2 and end2, to
# be empty.
# This function returns the start and end of the range, as an array.
sub combine_ranges {
  # Combine a range with a sub-range of it (used for row and column ranges).
  #   $row_or_column : "row" or "column"; only used in the diagnostic message.
  #   $start1, $end1 : the first range; both may be empty.
  #   $start2, $end2 : the second range, relative to the first; both may be
  #                    empty.
  # Returns the list (start, end) of the combined range.  The global $line
  # (the current input line) is used for diagnostics only.
  my ($row_or_column, $start1, $end1, $start2, $end2) = @_;

  if ($start1 eq "" && $end1 eq "") {
    return ($start2, $end2);
  } elsif ($start2 eq "" && $end2 eq "") {
    return ($start1, $end1);
  } else {
    # For now this script doesn't support the case of ranges like [20:], even
    # though they are supported at the C++ level.
    # All four tests must be string comparisons: a numeric '==' against ""
    # would treat a legitimate end index of 0 as a missing value.
    if ($start1 eq "" || $start2 eq "" || $end1 eq "" || $end2 eq "") {
      chomp $line;
      print STDERR ("normalize_data_range.pl: could not make sense of line $line\n");
      exit(1)
    }
    if ($start1 + $end2 > $end1) {
      # The second range reaches past the end of the first one: report it
      # and clamp the end to the end of the first range instead of exiting.
      chomp $line;
      print STDERR ("normalize_data_range.pl: could not make sense of line $line " .
            "[second $row_or_column range too large vs first range, $start1 + $end2 > $end1]\n");
      return ($start2+$start1, $end1);
    }
    return ($start2+$start1, $end2+$start1);
  }
}


# Main loop: copy each input line to stdout, collapsing a trailing pair of
# ranges '[first][second]' into a single equivalent range.
while (<>) {
  $line = $_;
  # we only need to do something if we detect two of these ranges.
  # The following regexp matches strings of the form ...[foo][bar]
  # where foo and bar have no square brackets in them.
  if (m/\[([^][]*)\]\[([^][]*)\]\s*$/) {
    $before_range = $`;  # everything on the line before the first '['.
    $first_range = $1;   # e.g. '0:500,20:21', or '0:500', or ',0:13'.
    $second_range = $2;  # has same general format as first_range.
    if ($_ =~ m/concat-feats /) {
      # sometimes in scp files, we use the command concat-feats to splice together
      # two feature matrices.  Handling this correctly is complicated and we don't
      # anticipate needing it, so we just refuse to process this type of data.
      print STDERR ("normalize_data_range.pl: this script cannot [yet] normalize the data ranges " .
        "if concat-feats was in the input data\n");
      exit(1);
    }
    # print STDERR "matched: $before_range $first_range $second_range\n";
    # Parse 'rowstart:rowend,colstart:colend', where either part may be absent.
    # On a successful match the capture groups hold the four numbers.
    if ($first_range !~ m/^((\d*):(\d*)|)(,(\d*):(\d*)|)$/) {
      print STDERR "normalize_data_range.pl: could not make sense of input line $_";
      exit(1);
    }
    $row_start1 = $2;
    $row_end1 = $3;
    $col_start1 = $5;
    $col_end1 = $6;

    # Same parse for the second range; the captures are overwritten, which is
    # why the first range's values were saved above.
    if ($second_range !~ m/^((\d*):(\d*)|)(,(\d*):(\d*)|)$/) {
      print STDERR "normalize_data_range.pl: could not make sense of input line $_";
      exit(1);
    }
    $row_start2 = $2;
    $row_end2 = $3;
    $col_start2 = $5;
    $col_end2 = $6;

    ($row_start, $row_end) = combine_ranges("row", $row_start1, $row_end1, $row_start2, $row_end2);
    ($col_start, $col_end) = combine_ranges("column", $col_start1, $col_end1, $col_start2, $col_end2);


    # Re-assemble the single range; an absent row range leaves just ',c1:c2'.
    if ($row_start ne "") {
      $range = "$row_start:$row_end";
    } else {
      $range = "";
    }
    if ($col_start ne "") {
      $range .= ",$col_start:$col_end";
    }
    print $before_range . "[" . $range . "]\n";
  } else {
    # Lines without a double range are passed through unchanged.
    print;
  }
}

__END__

# Testing
# echo foo |  utils/data/normalize_data_range.pl -> foo
# echo 'foo[bar:baz]' |  utils/data/normalize_data_range.pl -> foo[bar:baz]
# echo 'foo[bar:baz][bin:bang]' |  utils/data/normalize_data_range.pl -> normalize_data_range.pl: could not make sense of input line foo[bar:baz][bin:bang]
# echo 'foo[10:20][0:5]' |  utils/data/normalize_data_range.pl -> foo[10:15]
# echo 'foo[,10:20][,0:5]' |  utils/data/normalize_data_range.pl -> foo[,10:15]
# echo 'foo[,0:100][1:15]' |  utils/data/normalize_data_range.pl -> foo[1:15,0:100]
# echo 'foo[1:15][,0:100]' |  utils/data/normalize_data_range.pl -> foo[1:15,0:100]
# echo 'foo[10:20][0:11]' |  utils/data/normalize_data_range.pl -> normalize_data_range.pl: could not make sense of line foo[10:20][0:11] [second row range too large vs first range, 10 + 11 > 20]
# echo 'foo[,10:20][,0:11]' |  utils/data/normalize_data_range.pl -> normalize_data_range.pl: could not make sense of line foo[,10:20][,0:11] [second column range too large vs first range, 10 + 11 > 20]


================================================
FILE: egs/utils/data/perturb_data_dir_speed_3way.sh
================================================
#!/usr/bin/env bash

# Copyright 2016-2018  Johns Hopkins University (author: Daniel Povey)
#                2018  Hossein Hadian

# Apache 2.0

# This script does the standard 3-way speed perturbing of
# a data directory (it operates on the wav.scp).

# If you add the option "--always-include-prefix true", it will include the
# prefix "sp1.0-" for the original un-perturbed data.  This can help resolve
# problems with sorting.
# We don't make '--always-include-prefix true' the default  behavior because
# it can break some older scripts that relied on the original utterance-ids
# being a subset of the perturbed data's utterance-ids.

# Option (overridable on the command line through parse_options.sh).
always_include_prefix=false

. utils/parse_options.sh

if [ $# != 2 ]; then
  echo "Usage: perturb_data_dir_speed_3way.sh <srcdir> <destdir>"
  echo "Applies standard 3-way speed perturbation using factors of 0.9, 1.0 and 1.1."
  echo "e.g.:"
  echo " $0 [options] data/train data/train_sp"
  echo "Note: if <destdir>/feats.scp already exists, this will refuse to run."
  echo "Options:"
  echo "    --always-include-prefix [true|false]   # default: false.  If set to true,"
  echo "                                           # it will add the prefix 'sp1.0-' to"
  echo "                                           # utterance and speaker-ids for data at"
  echo "                                           # the original speed.  Can resolve"
  echo "                                           # issues RE data sorting."
  exit 1
fi

srcdir=$1
destdir=$2

if [ ! -f $srcdir/wav.scp ]; then
  echo "$0: expected $srcdir/wav.scp to exist"
  exit 1
fi

# Refuse to run when the destination already contains features.
if [ -f $destdir/feats.scp ]; then
  echo "$0: $destdir/feats.scp already exists: refusing to run this (please delete $destdir/feats.scp if you want this to run)"
  exit 1
fi

echo "$0: making sure the utt2dur and the reco2dur files are present"
echo "... in ${srcdir}, because obtaining it after speed-perturbing"
echo "... would be very slow, and you might need them."
utils/data/get_utt2dur.sh ${srcdir}
utils/data/get_reco2dur.sh ${srcdir}

# Create the 0.9x and 1.1x copies in temporary sibling directories.
utils/data/perturb_data_dir_speed.sh 0.9 ${srcdir} ${destdir}_speed0.9 || exit 1
utils/data/perturb_data_dir_speed.sh 1.1 ${srcdir} ${destdir}_speed1.1 || exit 1

if $always_include_prefix; then
  # Make a prefixed ('sp1.0-') copy of the original-speed data so that all
  # three parts carry a speed prefix.
  utils/copy_data_dir.sh --spk-prefix sp1.0- --utt-prefix sp1.0- ${srcdir} ${destdir}_speed1.0
  # Write utt2uniq for the prefixed copy: map each prefixed utterance-id to
  # the source's utt2uniq value if there is one, else to the unprefixed id.
  if [ ! -f $srcdir/utt2uniq ]; then
    cat $srcdir/utt2spk | awk  '{printf("sp1.0-%s %s\n", $1, $1);}' > ${destdir}_speed1.0/utt2uniq
  else
    cat $srcdir/utt2uniq | awk '{printf("sp1.0-%s %s\n", $1, $2);}' > ${destdir}_speed1.0/utt2uniq
  fi
  utils/data/combine_data.sh $destdir ${destdir}_speed1.0 ${destdir}_speed0.9 ${destdir}_speed1.1 || exit 1

  # The temporary per-speed directories are no longer needed.
  rm -r ${destdir}_speed0.9 ${destdir}_speed1.1 ${destdir}_speed1.0
else
  # Use the source directory itself as the original-speed part.
  utils/data/combine_data.sh $destdir ${srcdir} ${destdir}_speed0.9 ${destdir}_speed1.1 || exit 1
  rm -r ${destdir}_speed0.9 ${destdir}_speed1.1
fi

echo "$0: generated 3-way speed-perturbed version of data in $srcdir, in $destdir"
if ! utils/validate_data_dir.sh --no-feats --no-text $destdir; then
  echo "$0: Validation failed.  If it is a sorting issue, try the option '--always-include-prefix true'."
  exit 1
fi

exit 0


================================================
FILE: egs/utils/data/perturb_data_dir_volume.sh
================================================
#!/usr/bin/env bash

# Copyright 2016  Johns Hopkins University (author: Daniel Povey)
# Apache 2.0

# This script operates on a data directory, such as in data/train/, and modifies
# the wav.scp to perturb the volume (typically useful for training data when
# using systems that don't have cepstral mean normalization).

reco2vol=   # A file with the format <reco-id> <volume> that specifies the
            # factor by which the volume of the recording must be scaled.
            # If not provided, then the volume will be chosen randomly to
            # be between --scale-low and --scale-high.
write_reco2vol=     # File to write volume-scales applied to the recordings.
                    # Can be passed to --reco2vol to use the same volumes for
                    # another data directory.
                    # e.g. the unperturbed data directory.
scale_low=0.125
scale_high=2

. utils/parse_options.sh

if [ $# != 1 ]; then
  echo "Usage: $0 <datadir>"
  echo "e.g.:"
  echo " $0 data/train"
  exit 1
fi

export LC_ALL=C

data=$1

if [ ! -f $data/wav.scp ]; then
  echo "$0: Expected $data/wav.scp to exist"
  exit 1
fi

# Check if volume perturbation is already done. We assume that the volume
# perturbation is done if it has a line 'sox --vol' applied on the whole
# recording.
# e.g.
# foo-1 cat foo.wav | sox --vol 1.6 -t wav - -t wav - |    # volume perturbation done
# bar-1 sox --vol 1.2 bar.wav -t wav - |                   # volume perturbation done
# foo-2 wav-reverberate --additive-signals="sox --vol=0.1 noise1.wav -t wav -|" foo.wav |   # volume perturbation not done
# Only the first 100 lines of wav.scp are inspected; the inline python prints
# 'true' or 'false', which is then executed as a shell command below.
volume_perturb_done=`head -n100 $data/wav.scp | python -c "
import sys, re
for line in sys.stdin.readlines():
  if len(line.strip()) == 0:
    continue
  # Handle three cases of rxfilenames appropriately; 'input piped command', 'file offset' and 'filename'
  parts = line.strip().split()
  if line.strip()[-1] == '|':
    if re.search('sox --vol', ' '.join(parts[-11:])):
      print('true')
      sys.exit(0)
  elif re.search(':[0-9]+$', line.strip()) is not None:
    continue
  else:
    if ' '.join(parts[1:3]) == 'sox --vol':
      print('true')
      sys.exit(0)
print('false')
"` || exit 1

if $volume_perturb_done; then
  echo "$0: It looks like the data was already volume perturbed.  Not doing anything."
  exit 0
fi

# Write the perturbed wav.scp next to the original first; it only replaces
# the original after the line-count check below.  --write-reco2vol is passed
# only when write_reco2vol is non-empty.
cat $data/wav.scp | utils/data/internal/perturb_volume.py \
  --reco2vol=$reco2vol ${write_reco2vol:+--write-reco2vol=$write_reco2vol} \
  --scale-low=$scale_low --scale-high=$scale_high > \
  $data/wav.scp_scaled || exit 1;

len1=$(cat $data/wav.scp | wc -l)
len2=$(cat $data/wav.scp_scaled | wc -l)
if [ "$len1" != "$len2" ]; then
  echo "$0: error detected: number of lines changed $len1 vs $len2";
  exit 1
fi

mv $data/wav.scp_scaled $data/wav.scp

if [ -f $data/feats.scp ]; then
  echo "$0: $data/feats.scp exists; moving it to $data/.backup/ as it wouldn't be valid any more."
  mkdir -p $data/.backup/
  mv $data/feats.scp $data/.backup/
fi

echo "$0: added volume perturbation to the data in $data"
exit 0


================================================
FILE: egs/utils/data/perturb_speed_to_allowed_lengths.py
================================================
#!/usr/bin/env python3

# Copyright     2017  Hossein Hadian
# Apache 2.0


""" This script perturbs speeds of utterances to force their lengths to some
    allowed lengths spaced by a factor (like 10%)
"""

import argparse
import os
import sys
import copy
import math
import logging

sys.path.insert(0, 'steps')
import libs.common as common_lib

# Configure the 'libs' logger: INFO and above go to a stream handler, and each
# record shows the time, source path, line number, function name and level.
logger = logging.getLogger('libs')
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - "
                              "%(funcName)s - %(levelname)s ] %(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)

def get_args():
    """Parse the command line and return the argument namespace.

    The textual --speed-perturb option ('true'/'false') is converted to a
    bool before the namespace is returned.
    """
    parser = argparse.ArgumentParser(description="""This script copies the 'srcdir'
                                   data directory to output data directory 'dir'
                                   while modifying the utterances so that there are
                                   3 copies of each utterance: one with the same
                                   speed, one with a higher speed (not more than
                                   factor% faster) and one with a lower speed
                                   (not more than factor% slower)""")

    # Positional arguments.
    parser.add_argument(
        'factor', type=float, default=12,
        help='Spacing (in percentage) between allowed lengths.')
    parser.add_argument(
        'srcdir', type=str,
        help='path to source data dir')
    parser.add_argument('dir', type=str, help='output dir')

    # Optional arguments.
    parser.add_argument(
        '--coverage-factor', type=float, default=0.05,
                        help="""Percentage of durations not covered from each
                             side of duration histogram.""")
    parser.add_argument(
        '--frame-shift', type=int, default=10,
                        help="""Frame shift in milliseconds.""")
    parser.add_argument(
        '--frame-length', type=int, default=25,
                        help="""Frame length in milliseconds.""")
    parser.add_argument(
        '--frame-subsampling-factor', type=int, default=3,
                        help="""Chain frame subsampling factor.
                             See steps/nnet3/chain/train.py""")
    parser.add_argument(
        '--speed-perturb', type=str, choices=['true','false'],
        default='true',
                        help="""If false, no speed perturbation will occur, i.e.
                             only 1 copy of each utterance will be
                             saved, which is modified to have an allowed length
                             by using extend-wav-with-silence.""")

    parsed = parser.parse_args()
    parsed.speed_perturb = (parsed.speed_perturb == 'true')
    return parsed

class Utterance(object):
    """A single utterance of a Kaldi data directory such as data/train.

    Holds the utterance id, the wav.scp entry (always stored as a piped
    command), the speaker id, the transcription and the duration as a float.
    """

    def __init__(self, uid, wavefile, speaker, transcription, dur):
        self.id = uid
        self.speaker = speaker
        self.transcription = transcription
        self.dur = float(dur)
        # An entry that already ends in '|' is a piped command and is kept
        # as is; a plain file name is wrapped in 'cat ... |'.
        if wavefile.rstrip(" \t\r\n").endswith('|'):
            self.wavefile = wavefile
        else:
            self.wavefile = 'cat {} |'.format(wavefile)

    def to_kaldi_utt_str(self):
        """Return the 'text' line: '<id> <transcription>'."""
        return " ".join((self.id, self.transcription))

    def to_kaldi_wave_str(self):
        """Return the 'wav.scp' line: '<id> <wave command>'."""
        return " ".join((self.id, self.wavefile))

    def to_kaldi_dur_str(self):
        """Return the 'utt2dur' line, with the duration to 3 decimals."""
        return "{} {:0.3f}".format(self.id, self.dur)


def read_kaldi_datadir(dir):
    """ Read a data directory like
        data/train as a list of utterances

        Reads wav.scp, text, utt2dur and utt2spk from 'dir' and builds one
        Utterance per wav.scp entry that is present in all four files.
        Exits the program (status 1) if the directory uses a 'segments'
        file, if wav.scp has no entries, or if fewer than half of the
        wav.scp entries are usable.
    """

    # check to make sure that no segments file exists as this script won't work
    # with data directories which use a segments file.
    if os.path.isfile(os.path.join(dir, 'segments')):
        logger.info("The data directory '{}' seems to use a 'segments' file. "
                    "This script does not yet support a 'segments' file. You'll need "
                    "to use utils/data/extract_wav_segments_data_dir.sh "
                    "to convert the data dir so it does not use a 'segments' file. "
                    "Exiting...".format(dir))
        sys.exit(1)

    logger.info("Loading the data from {}...".format(dir))
    utterances = []
    wav_scp = read_kaldi_mapfile(os.path.join(dir, 'wav.scp'))
    text = read_kaldi_mapfile(os.path.join(dir, 'text'))
    utt2dur = read_kaldi_mapfile(os.path.join(dir, 'utt2dur'))
    utt2spk = read_kaldi_mapfile(os.path.join(dir, 'utt2spk'))

    # An empty wav.scp would otherwise cause a division by zero below.
    if not wav_scp:
        logger.error("No utterances found in '{}'. "
                     "Exiting...".format(os.path.join(dir, 'wav.scp')))
        sys.exit(1)

    num_fail = 0
    for utt in wav_scp:
        if utt in text and utt in utt2dur and utt in utt2spk:
            utterances.append(Utterance(utt, wav_scp[utt], utt2spk[utt],
                                  text[utt], utt2dur[utt]))
        else:
            num_fail += 1

    if float(len(utterances)) / len(wav_scp) < 0.5:
        logger.info("More than half your data is problematic. Try "
                    "fixing using fix_data_dir.sh.")
        sys.exit(1)

    logger.info("Successfully read {} utterances. Failed for {} "
                "utterances.".format(len(utterances), num_fail))
    return utterances


def read_kaldi_mapfile(path):
    """ Read any Kaldi mapping file - like text, .scp files, etc.

        Each non-blank line is split at its first space into a key and a
        value; the result is returned as a dict { key : value }.  A line
        without a space yields the whole line as key and an empty value.
        Blank lines are skipped.
    """

    m = {}
    with open(path, 'r', encoding='latin-1') as f:
        for line in f:
            line = line.strip(" \t\r\n")
            if not line:
                continue
            # partition() keeps the key intact when there is no space
            # (str.find() would return -1 and cut off the last character).
            key, _, val = line.partition(' ')
            m[key] = val
    return m

def generate_kaldi_data_files(utterances, outdir):
    """ Export a list of utterances as the Kaldi data files text, wav.scp,
        utt2dur, utt2spk and spk2utt inside the directory 'outdir'.
    """

    logger.info("Exporting to {}...".format(outdir))

    def _open_out(name):
        # All output files are written with the latin-1 encoding.
        return open(os.path.join(outdir, name), 'w', encoding='latin-1')

    with _open_out('text') as text_fp:
        for utt in utterances:
            text_fp.write(utt.to_kaldi_utt_str() + "\n")

    with _open_out('wav.scp') as wav_fp:
        for utt in utterances:
            wav_fp.write(utt.to_kaldi_wave_str() + "\n")

    with _open_out('utt2dur') as dur_fp:
        for utt in utterances:
            dur_fp.write(utt.to_kaldi_dur_str() + "\n")

    # While writing utt2spk, collect the utterance ids of every speaker in
    # order of first appearance for the spk2utt file.
    spk2utts = {}
    with _open_out('utt2spk') as utt2spk_fp:
        for utt in utterances:
            utt2spk_fp.write(utt.id + " " + utt.speaker + "\n")
            spk2utts.setdefault(utt.speaker, []).append(utt.id)

    with _open_out('spk2utt') as spk2utt_fp:
        for spk, utt_ids in spk2utts.items():
            spk2utt_fp.write(spk + " ")
            for utt_id in utt_ids:
                spk2utt_fp.write(utt_id + " ")
            spk2utt_fp.write('\n')

    logger.info("Successfully wrote {} utterances to data "
                "directory '{}'".format(len(utterances), outdir))

def find_duration_range(utterances, coverage_factor):
    """Given a list of utterances, find the start and end duration to cover

     If we try to cover
     all durations which occur in the training set, the number of
     allowed lengths could become very large.  So the shortest and the
     longest utterances are left out: from each end of the sorted durations,
     utterances are skipped until their accumulated duration exceeds
     coverage_factor percent of the total duration.

     Returns
     -------
     start_dur: float (never below 0.3)
     end_dur: float

     Raises
     ------
     ValueError: if the total duration is not positive (e.g. no utterances)
                 or coverage_factor is too large to ever be exceeded.
    """
    durs = sorted(u.dur for u in utterances)
    tot_dur = sum(durs)
    if tot_dur <= 0:
        raise ValueError("Cannot find a duration range: the total duration "
                         "of the utterances is not positive.")

    def _first_dur_past_coverage(ordered_durs):
        # Return the first duration at which the accumulated duration
        # exceeds coverage_factor percent of the total.
        to_ignore_dur = 0
        for d in ordered_durs:
            to_ignore_dur += d
            if to_ignore_dur * 100.0 / tot_dur > coverage_factor:
                return d
        raise ValueError("coverage_factor {} is too large for the "
                         "data.".format(coverage_factor))

    start_dur = _first_dur_past_coverage(durs)
    end_dur = _first_dur_past_coverage(reversed(durs))
    if start_dur < 0.3:
        start_dur = 0.3  # a hard limit to avoid too many allowed lengths --not critical
    return start_dur, end_dur


def find_allowed_durations(start_dur, end_dur, args):
    """Given the start and end duration, find a set of
       allowed durations spaced by args.factor%. Also write
       out the list of allowed durations and the corresponding
       allowed lengths (in frames) on disk.

       The files 'allowed_durs.txt' and 'allowed_lengths.txt' are written
       into args.dir, one value per line.  Each length is forced to be a
       multiple of args.frame_subsampling_factor; when a length is rounded
       down, the duration is recomputed from the rounded length.

       NOTE(review): d is multiplied by args.factor directly, so this
       presumably expects the caller to have converted the percentage into
       a multiplicative ratio (> 1) beforehand -- confirm against the caller.

     Returns
     -------
     allowed_durations: list of allowed durations (in seconds)
    """

    allowed_durations = []
    d = start_dur
    with open(os.path.join(args.dir, 'allowed_durs.txt'), 'w', encoding='latin-1') as durs_fp, \
           open(os.path.join(args.dir, 'allowed_lengths.txt'), 'w', encoding='latin-1') as lengths_fp:
        while d < end_dur:
            # Number of frames for a duration of d seconds (d * 1000 is in
            # milliseconds, like frame_length and frame_shift).
            # NOTE(review): under Python 3 '/' is true division, so 'length'
            # can be fractional here; it is only truncated when written out.
            # Confirm whether integer division was intended.
            length = int(d * 1000 - args.frame_length) / args.frame_shift + 1
            if length % args.frame_subsampling_factor != 0:
                # Round down to a multiple of the subsampling factor ...
                length = (args.frame_subsampling_factor *
                              (length // args.frame_subsampling_factor))
                # ... and recompute the duration for that length, adding
                # half a frame shift.
                d = (args.frame_shift * (length - 1.0)
                     + args.frame_length + args.frame_shift / 2) / 1000.0
            allowed_durations.append(d)
            durs_fp.write("{}\n".format(d))
            lengths_fp.write("{}\n".format(int(length)))
            d *= args.factor
    return allowed_durations


def perturb_utterances(utterances, allowed_durations, args):
    """Build perturbed copies of the utterances so that every copy has one
       of the allowed durations.

       For each utterance the two neighbouring allowed durations are
       looked up.  With args.speed_perturb set, a 'pv1-' copy is
       speed-changed (via sox) to the shorter neighbour and a 'pv2-' copy
       to the longer one; a 'pv3-' copy is padded with silence up to the
       longer neighbour.  A neighbour is only used when the implied speed
       ratio stays within args.factor; as soon as one neighbour is
       rejected the utterance is not processed any further.

     Returns
     -------
     perturbed_utterances: list of pertubed utterances
    """
    speed_cmd = '{} sox -t wav - -t wav - speed {} | '
    pad_cmd = '{} extend-wav-with-silence --extra-silence-length={} - - | '

    def make_variant(source, tag, wav_command, new_dur):
        # Deep-copy so that the input utterance is left untouched.
        variant = copy.deepcopy(source)
        variant.id = tag + source.id
        variant.speaker = tag + source.speaker
        variant.wavefile = wav_command
        variant.dur = new_dur
        return variant

    num_allowed = len(allowed_durations)
    result = []
    for utt in utterances:
        # idx is chosen so that
        #   allowed_durations[idx - 1] <= utt.dur <= allowed_durations[idx];
        # idx == 0 means there is no shorter allowed duration and
        # idx == num_allowed means there is no longer one.
        if utt.dur < allowed_durations[0]:
            idx = 0
        elif utt.dur > allowed_durations[-1]:
            idx = num_allowed
        else:
            idx = next(
                (k for k in range(1, num_allowed)
                 if allowed_durations[k - 1] <= utt.dur <= allowed_durations[k]),
                num_allowed)

        if idx > 0 and args.speed_perturb:
            target = allowed_durations[idx - 1]
            ratio = utt.dur / target
            # very short/long utterances may need too large a speed change
            if max(ratio, 1.0 / ratio) > args.factor:
                continue
            result.append(make_variant(
                utt, 'pv1-', speed_cmd.format(utt.wavefile, ratio), target))

        if idx >= num_allowed:
            continue

        target = allowed_durations[idx]
        ratio = utt.dur / target
        if max(ratio, 1.0 / ratio) > args.factor:
            continue

        # Two versions are produced for the longer allowed duration:
        # one by changing the speed with sox, one by appending silence.
        if args.speed_perturb:
            result.append(make_variant(
                utt, 'pv2-', speed_cmd.format(utt.wavefile, ratio), target))

        extra_silence = target - utt.dur
        if extra_silence <= 1e-4:
            continue
        result.append(make_variant(
            utt, 'pv3-', pad_cmd.format(utt.wavefile, extra_silence), target))
    return result


def main():
    """Script entry point: read the source data directory, choose the
       allowed durations, perturb the utterances to match them and write
       the resulting data directory to args.dir.
    """
    args = get_args()
    # The factor option is treated as a percentage; turn it into a ratio.
    args.factor = 1.0 + args.factor / 100.0

    if not os.path.exists(args.dir):
        os.makedirs(args.dir)

    utterances = read_kaldi_datadir(args.srcdir)

    start_dur, end_dur = find_duration_range(utterances, args.coverage_factor)
    # coverage_factor is ignored at both ends of the duration range.
    coverage_rate = 100.0 - args.coverage_factor * 2
    logger.info("Durations in the range [{},{}] will be covered. "
                "Coverage rate: {}%".format(start_dur, end_dur, coverage_rate))
    num_lengths = int(math.log(end_dur / start_dur) / math.log(args.factor))
    logger.info("There will be {} unique allowed lengths "
                "for the utterances.".format(num_lengths))

    durations = find_allowed_durations(start_dur, end_dur, args)
    perturbed = perturb_utterances(utterances, durations, args)
    generate_kaldi_data_files(perturbed, args.dir)


if __name__ == '__main__':
    main()


================================================
FILE: egs/utils/data/remove_dup_utts.sh
================================================
#!/usr/bin/env bash

# Remove excess utterances once they appear more than a specified
# number of times with the same transcription, in a data set.
# E.g. useful for removing excess "uh-huh" from training.
#
# Arguments: <max-count> <src-data-dir> <dest-data-dir>
# The source directory is copied to the destination, its 'text' file is
# replaced by the filtered version, and utils/fix_data_dir.sh is then used
# to make the remaining files consistent with it.

if [ $# != 3 ]; then
  echo "Usage: remove_dup_utts.sh max-count <src-data-dir> <dest-data-dir>"
  echo "e.g.: remove_dup_utts.sh 10 data/train data/train_nodup"
  echo "This script is used to filter out utterances that have from over-represented"
  echo "transcriptions (such as 'uh-huh'), by limiting the number of repetitions of"
  echo "any given word-sequence to a specified value.  It's often used to get"
  echo "subsets for early stages of training."
  exit 1;
fi

maxcount=$1
srcdir=$2
destdir=$3

# Validate all inputs before touching the destination, so that a bad
# invocation does not leave an empty output directory behind.
[ ! -f "$srcdir/text" ] && echo "$0: Invalid input directory $srcdir" && exit 1;

! [ "$maxcount" -gt 1 ] && echo "$0: invalid max-count '$maxcount'" && exit 1;

! mkdir -p "$destdir" && echo "$0: could not create directory $destdir" && exit 1;

cp "$srcdir"/* "$destdir"

# The perl program reads the transcripts twice (conceptually): the first
# loop counts how often each word sequence occurs, the second one prints
# a line when its transcription occurs fewer than max-count times, or
# otherwise with probability max-count/n.  The pseudo-random generator is
# hand-written so that the selection is reproducible between runs.
perl -e '
  $maxcount = shift @ARGV;
  @all = ();
   $p1 = 103349; $p2 = 71147; $k = 0;
   sub random { # our own random number generator: predictable.
     $k = ($k + $p1) % $p2;
     return ($k / $p2);
  }
  while(<>) {
    push @all, $_;
    @A = split(" ", $_);
    shift @A;
    $text = join(" ", @A);
    $count{$text} ++;
  }
  foreach $line (@all) {
    @A = split(" ", $line);
    shift @A;
    $text = join(" ", @A);
    $n = $count{$text};
    if ($n < $maxcount || random() < ($maxcount / $n)) {
      print $line;
    }
  }'  "$maxcount" <"$srcdir/text" >"$destdir/text"

echo "Reduced number of utterances from $(wc -l <"$srcdir/text") to $(wc -l <"$destdir/text")"

echo "Using fix_data_dir.sh to reconcile the other files."
utils/fix_data_dir.sh "$destdir"
rm -r "$destdir/.backup"

exit 0


================================================
FILE: egs/utils/data/resample_data_dir.sh
================================================
#! /bin/bash

# Copyright 2016  Vimal Manohar
#           2018  Xiaohui Zhang
# Apache 2.0.

# Appends a sox resampling command to every entry of <data-dir>/wav.scp.
# Entries that are already pipes (ending in '|') get the sox command
# appended; plain file entries are turned into 'cat <file> | sox ... |'.

if [ $# -ne 2 ]; then
  echo "This script adds a sox line in wav.scp to resample the audio at a "
  echo "different sampling-rate"
  echo "Usage: $0 <frequency> <data-dir>"
  echo " e.g.: $0 8000 data/dev"
  exit 1
fi

freq=$1
dir=$2

sox=$(which sox) || { echo "Could not find sox in PATH"; exit 1; }

# Without this check a missing wav.scp would silently produce an empty one.
if [ ! -f "$dir/wav.scp" ]; then
  echo "$0: expected $dir/wav.scp to exist"
  exit 1
fi

if [ -f $dir/feats.scp ]; then
  mkdir -p $dir/.backup
  mv $dir/feats.scp $dir/.backup/
  if [ -f $dir/cmvn.scp ]; then
    mv $dir/cmvn.scp $dir/.backup/
  fi
  echo "$0: feats.scp already exists. Moving it to $dir/.backup"
fi

# After resampling we cannot compute utt2dur from wav.scp any more,
# so we create utt2dur now, in case it's needed later
if [ ! -s $dir/utt2dur ]; then
  utils/data/get_utt2dur.sh $dir 1>&2 || exit 1;
fi

mv "$dir/wav.scp" "$dir/wav.scp.tmp" || exit 1

# $sox and $freq are expanded by the shell inside the python program text.
# If the rewrite fails for any reason, put the original wav.scp back rather
# than leaving a truncated file and deleting the only good copy.
if ! python -c "import sys
for line in sys.stdin:
  splits = line.strip().split()
  if splits[-1] == '|':
    out_line = line.strip() + ' $sox -t wav - -c 1 -b 16 -t wav - rate $freq |'
  else:
    out_line = '{0} cat {1} | $sox -t wav - -c 1 -b 16 -t wav - rate $freq |'.format(splits[0], ' '.join(splits[1:]))
  print (out_line)" <"$dir/wav.scp.tmp" >"${dir}/wav.scp"; then
  echo "$0: failed to rewrite $dir/wav.scp; restoring the original" 1>&2
  mv "$dir/wav.scp.tmp" "$dir/wav.scp"
  exit 1
fi
rm "$dir/wav.scp.tmp"


================================================
FILE: egs/utils/data/shift_and_combine_feats.sh
================================================
#!/usr/bin/env bash

# Copyright 2017  Hossein Hadian

# Apache 2.0

# Creates frame-shifted copies of a data directory (one per non-zero shift
# in a window of <frame-subsampling-factor> consecutive shifts centred on
# zero) and combines them with the original into <destdir>.

# Optional output file.  If provided, this script will write a mapping of
# shifted utterance ids to the original ones into the file specified by
# this option.
write_utt2orig=

echo "$0 $@"  # Print the command line for logging
if [ -f path.sh ]; then . ./path.sh; fi
. utils/parse_options.sh

if [ $# != 3 ]; then
  echo "Usage: $0 <frame-subsampling-factor> <srcdir> <destdir>"
  echo "e.g.: $0 3 data/train data/train_fs3"
  echo "For use in perturbing data for discriminative training and alignment of"
  echo "frame-subsampled systems, this script uses utils/data/shift_feats.sh"
  echo "and utils/data/combine_data.sh to shift the features"
  echo "<frame-subsampling-factor> different ways and combine them."
  echo "E.g. if <frame-subsampling-factor> is 3, this script will combine"
  echo "the data frame-shifted by -1, 0 and 1 (c.f. shift-feats)."
  exit 1
fi

frame_subsampling_factor=$1
srcdir=$2
destdir=$3

if [ ! -f $srcdir/feats.scp ]; then
  echo "$0: expected $srcdir/feats.scp to exist"
  exit 1
fi

if [ -f $destdir/feats.scp ]; then
  echo "$0: $destdir/feats.scp already exists: refusing to run this (please delete $destdir/feats.scp if you want this to run)"
  exit 1
fi

# The un-shifted utterances map to themselves.
if [ ! -z $write_utt2orig ]; then
  awk '{print $1 " " $1}' $srcdir/feats.scp >$write_utt2orig
fi

# Shifts run from -(factor/2) upwards, <factor> values in total; the zero
# shift is skipped because the source directory itself is combined below.
first_shift=$(( -(frame_subsampling_factor/2) ))
last_shift=$(( first_shift + frame_subsampling_factor - 1 ))

tmp_shift_destdirs=()
for frame_shift in $(seq $first_shift $last_shift); do
  [ "$frame_shift" == 0 ] && continue
  utils/data/shift_feats.sh $frame_shift $srcdir ${destdir}_fs$frame_shift || exit 1
  tmp_shift_destdirs+=("${destdir}_fs$frame_shift")
  if [ ! -z $write_utt2orig ]; then
    awk -v prefix="fs$frame_shift-" '{printf("%s%s %s\n", prefix, $1, $1);}' $srcdir/feats.scp >>$write_utt2orig
  fi
done
utils/data/combine_data.sh $destdir $srcdir ${tmp_shift_destdirs[@]} || exit 1
rm -r ${tmp_shift_destdirs[@]}

utils/validate_data_dir.sh $destdir

# Sanity check: the combined directory must hold <factor> times as many
# feature lines as the source.
src_nf=$(cat $srcdir/feats.scp | wc -l)
dest_nf=$(cat $destdir/feats.scp | wc -l)
expected_nf=$((src_nf*frame_subsampling_factor))
if [ $expected_nf -ne $dest_nf ]; then
  echo "There was a problem. Expected number of feature lines in destination dir to be $expected_nf;"
  exit 1;
fi

echo "$0: Successfully generated $frame_subsampling_factor-way shifted version of data in $srcdir, in $destdir"


================================================
FILE: egs/utils/data/shift_feats.sh
================================================
#!/usr/bin/env bash

# Copyright 2016    Vimal Manohar
#           2017    Hossein Hadian
# Apache 2.0

# Creates <destdir> as a copy of <srcdir> in which every feats.scp entry is
# piped through 'shift-feats --shift=<frame-shift>', and in which utterance
# and speaker ids carry the prefix 'fs<frame-shift>-'.

echo "$0 $@"  # Print the command line for logging
if [ -f path.sh ]; then . ./path.sh; fi
# Sourced by relative path, like the other utils/... calls below, so that
# the script does not depend on utils/ being on PATH.
. utils/parse_options.sh || exit 1;

if [ $# != 3 ]; then
  echo " Usage: $0 <frame-shift> <srcdir> <destdir>"
  echo "e.g.: $0 -1 data/train data/train_fs-1"
  echo "The script creates a new data directory with the features modified"
  echo "using the program shift-feats with the specified frame-shift."
  echo "This program automatically adds the prefix 'fs<frame-shift>-' to the"
  echo "utterance and speaker names. See also utils/data/shift_and_combine_feats.sh"
  exit 1
fi

frame_shift=$1
srcdir=$2
destdir=$3


if [ "$destdir" == "$srcdir" ]; then
  echo "$0: this script requires <srcdir> and <destdir> to be different."
  exit 1
fi

if [ ! -f $srcdir/feats.scp ]; then
  echo "$0: no such file $srcdir/feats.scp"
  exit 1;
fi

# Piped entries are wrapped in single quotes below, so the input must not
# already contain any.  Checked before anything is written to <destdir>.
if grep --quiet "'" $srcdir/feats.scp; then
  echo "$0: the input features already use single quotes. Can't proceed."
  exit 1;
fi

utt_prefix="fs$frame_shift-"
spk_prefix="fs$frame_shift-"

mkdir -p $destdir
utils/copy_data_dir.sh --utt-prefix $utt_prefix --spk-prefix $spk_prefix \
  $srcdir $destdir || exit 1

# Two-field lines hold a plain feature location and are passed as they are;
# lines with more fields hold a command containing spaces, which is wrapped
# in single quotes (\x27) so that it reaches shift-feats as one argument.
awk -v shift=$frame_shift 'NF == 2 {uttid=$1; feat=$2; qt="";} \
NF > 2 {idx=index($0, " "); uttid=$1; feat=substr($0, idx + 1); qt="\x27";} \
NF {print uttid " shift-feats --print-args=false --shift=" shift, qt feat qt " - |";}' \
  $destdir/feats.scp >$destdir/feats_shifted.scp
mv -f $destdir/feats_shifted.scp $destdir/feats.scp

echo "$0: Done"


================================================
FILE: egs/utils/data/subsegment_data_dir.sh
================================================
#!/usr/bin/env bash

# Copyright 2013  Johns Hopkins University (author: Daniel Povey)
# Apache 2.0


# This script allows you to specify a 'segments' file with segments
# relative to existing utterances, with lines like
#  utterance_foo-1 utterance_foo 7.5 8.2
#  utterance_foo-2 utterance_foo 8.9 10.1
# and a 'text' file with sub-segmented text like
#  utterance_foo-1 hello there
#  utterance_foo-2 how are you
# and combine this with an existing data-dir that was all relative
# to the original utterance-ids like 'utterance_foo', producing
# a new subsegmented output directory.
#
# It does the right thing for you on the various files that the
# data directory contained (except you have to recreate
# the CMVN stats).


# Defaults for the command-line options handled by utils/parse_options.sh.
segment_end_padding=0.0
cmd=run.pl
nj=1

. utils/parse_options.sh

if [ $# != 4 ] && [ $# != 3 ]; then
  echo "Usage: "
  echo "  $0 [options] <srcdir> <subsegments-file> [<text-file>] <destdir>"
  echo "This script sub-segments a data directory.  <subsegments-file> is to"
  echo "have lines of the form <new-utt> <old-utt> <start-time-within-old-utt> <end-time-within-old-utt>"
  echo "and <text-file> is of the form <new-utt> <word1> <word2> ... <wordN>."
  echo "This script appropriately combines the <subsegments-file> with the original"
  echo "segments file, if necessary, and if not, creates a segments file."
  echo "e.g.:"
  echo " $0 [options] data/train exp/tri3b_resegment/segments exp/tri3b_resegment/text data/train_resegmented"
  echo " Options:"
  echo "  --segment-end-padding <padding-time>       # e.g. 0.02.  Default 0.0.  If provided,"
  echo "                                             # we will add this value to the end times of <destdir>/segments"
  echo "                                             # when creating it.  This can be useful to account for"
  echo "                                             # end effects in feature generation.  The reason this is"
  echo "                                             # not just applied to the input segments file, is that"
  echo "                                             # for purposes of computing the num-frames of the parts of"
  echo "                                             # matrices in feats.scp, the padding should not be done."
  echo "  See also: resolve_ctm_overlaps.py"
  exit 1;
fi


export LC_ALL=C

srcdir=$1
subsegments=$2

# With four arguments the third one is the text file for the new
# utterances; with three arguments no text file is written.
add_subsegment_text=false
if [ $# -eq 4 ]; then
  new_text=$3
  dir=$4
  add_subsegment_text=true

  if [ ! -f "$new_text" ]; then
    echo "$0: no such file $new_text"
    exit 1
  fi

else
  dir=$3
fi

for f in "$subsegments" "$srcdir/utt2spk"; do
  if [ ! -f "$f" ]; then
    echo "$0: no such file $f"
    exit 1;
  fi
done

# Nothing below can work without the output directory, so stop here if it
# cannot be created.
if ! mkdir -p $dir; then
  echo "$0: failed to create directory $dir"
  exit 1
fi

# The new utterance ids must be listed identically, and in the same order,
# in the subsegments file and in the text file.
if $add_subsegment_text; then
  if ! cmp <(awk '{print $1}' <$subsegments)  <(awk '{print $1}' <$new_text); then
    echo "$0: expected the first fields of the files $subsegments and $new_text to be identical"
    exit 1
  fi
fi

# Validate the subsegments file before creating anything in $dir: every line
# must have exactly four fields, with the end time greater than the start time.
if ! awk '{if (NF != 4 || !($4 > $3)) { print("Bad line: " $0); exit(1) } }' <$subsegments; then
  echo "$0: failed checking subsegments file $subsegments"
  exit 1
fi

# From here on, any failing command (including one inside a pipeline)
# aborts the script.
set -e
set -o pipefail

# Create a mapping from the new to old utterances.  This file will be deleted later.
awk '{print $1, $2}' < $subsegments > $dir/new2old_utt

# Create the new utt2spk file [just map from the second field, i.e. replace
# the old utterance-id with its speaker].
utils/apply_map.pl -f 2 $srcdir/utt2spk < $dir/new2old_utt >$dir/utt2spk
# .. and the new spk2utt file.
utils/utt2spk_to_spk2utt.pl  <$dir/utt2spk >$dir/spk2utt

if $add_subsegment_text; then
  # the new text file is just what the user provides.
  cp $new_text $dir/text
fi

# copy the source wav.scp
cp $srcdir/wav.scp $dir
if [ -f $srcdir/reco2file_and_channel ]; then
  cp $srcdir/reco2file_and_channel $dir
fi

# copy the source reco2dur
if [ -f $srcdir/reco2dur ]; then
  cp $srcdir/reco2dur $dir
fi

if [ -f $srcdir/segments ]; then
  # we have to map the segments file.
  # What's going on below is a little subtle.
  # $srcdir/segments has lines like: <old-utt-id> <recording-id> <start-time> <end-time>
  # and $subsegments has lines like: <new-utt-id> <old-utt-id> <start-time> <end-time>
  # The apply-map command replaces <old-utt-id> [the 2nd field of $subsegments]
  # with <recording-id> <start-time> <end-time>.
  # so after that first command we have lines like
  # <new-utt-id> <recording-id> <start-time-of-old-utt-within-recording> <end-time-old-utt-within-recording> \
  #   <start-time-of-new-utt-within-old-utt> <end-time-of-new-utt-within-old-utt>
  # which the awk command turns into:
  # <new-utt-id> <recording-id> <start-time-of-new-utt-within-recording> <end-time-of-new-utt-within-recording>
  # (the optional end padding is added to the end time only).
  utils/apply_map.pl -f 2 $srcdir/segments <$subsegments | \
    awk -v pad=$segment_end_padding '{ print $1, $2, $5+$3, $6+$3+pad; }' >$dir/segments
else
  # the subsegments file just becomes the segments file (with the optional
  # padding added to the end time in the 4th field).
  awk -v pad=$segment_end_padding '{$4 += pad; print}' <$subsegments >$dir/segments
fi

# Each new utterance takes over the utt2uniq entry of the utterance it was
# cut from.
if [ -f $srcdir/utt2uniq ]; then
  utils/apply_map.pl -f 2 $srcdir/utt2uniq <$dir/new2old_utt >$dir/utt2uniq
fi

if [ -f $srcdir/feats.scp ]; then
  # We want to avoid recomputing the features.   We'll use sub-matrices of the
  # original feature matrices, using the [] notation that is available for
  # matrices in Kaldi.
  if [ ! -s $srcdir/frame_shift ]; then
    frame_shift=$(utils/data/get_frame_shift.sh $srcdir) || exit 1
  else
    frame_shift=$(cat $srcdir/frame_shift)
  fi
  echo "$0: note: frame shift is $frame_shift [affects feats.scp]"

  # The subsegments format is <new-utt-id> <old-utt-id> <start-time> <end-time>.
  # e.g. 'utt_foo-1 utt_foo 7.21 8.93'
  # The first awk command replaces this with the format:
  # <new-utt-id> <old-utt-id> <first-frame> <last-frame>
  # e.g. 'utt_foo-1 utt_foo 721 893'
  # and the apply_map.pl command replaces 'utt_foo' (the 2nd field) with its corresponding entry
  # from the original feats.scp, so we get a line like:
  # e.g. 'utt_foo-1 foo-bar.ark:514231 721 892'
  # Note: the reason we subtract one from the last time is that it's going to
  # represent the 'last' frame, not the 'end' frame [i.e. not one past the last],
  # in the matlab-like, but zero-indexed [first:last] notion.  For instance, a segment with 1 frame
  # would have start-time 0.00 and end-time 0.01, which would become the frame range
  # [0:0]
  # The second awk command turns this into something like
  # utt_foo-1 foo-bar.ark:514231[721:892]
  # It has to be a bit careful because the format actually allows for more general things
  # like pipes that might contain spaces, so it has to be able to produce output like the
  # following:
  # utt_foo-1 some command|[721:892]
  # The 'end' frame is ensured to not exceed the feature archive size of
  # <old-utt-id>. This is done using the script fix_subsegment_feats.pl.
  # e.g if the number of frames in foo-bar.ark is 891, then the features are
  # truncated to that many frames.
  # utt_foo-1 foo-bar.ark:514231[721:890]
  # Lastly, utils/data/normalize_data_range.pl will only do something nontrivial if
  # the original data-dir already had data-ranges in square brackets.

  # Here, we compute the maximum 'end' frame allowed for each <new-utt-id>.
  # This is equal to the number of frames in the feature archive for <old-utt-id>.
  if [ ! -f $srcdir/utt2num_frames ]; then
    echo "$0: WARNING: Could not find $srcdir/utt2num_frames. It might take a long time to run get_utt2num_frames.sh."
    echo "Increase the number of jobs or write this file while extracting features by passing --write-utt2num-frames true to steps/make_mfcc.sh etc."
  fi
  utils/data/get_utt2num_frames.sh --cmd "$cmd" --nj $nj $srcdir
  awk '{print $1" "$2}' $subsegments | \
    utils/apply_map.pl -f 2 $srcdir/utt2num_frames > \
    $dir/utt2max_frames

  # On failure, exit with a non-zero status: a bare 'exit' would return the
  # status of the preceding echo, i.e. report success.
  awk -v s=$frame_shift '{print $1, $2, int(($3/s)+0.5), int(($4/s)-0.5);}' <$subsegments| \
    utils/apply_map.pl -f 2 $srcdir/feats.scp | \
    awk '{for (n=1;n<NF-2;n++) printf("%s ", $n); k=NF-2; l=NF-1; printf("%s[%d:%d]\n", $k, $l, $NF)}' | \
    utils/data/fix_subsegment_feats.pl $dir/utt2max_frames | \
    utils/data/normalize_data_range.pl >$dir/feats.scp || { echo "Failed to create $dir/feats.scp"; exit 1; }

  # Parse the frame ranges from feats.scp, which is in the form of [first-frame:last-frame]
  # and write the number-of-frames = last-frame - first-frame + 1 for the utterance.
  cat $dir/feats.scp | perl -ne 'm/^(\S+) .+\[(\d+):(\d+)\]$/; print "$1 " . ($3-$2+1) . "\n"' > \
    $dir/utt2num_frames

  # Here we add frame ranges to the elements of vad.scp, as we did for rows of feats.scp above.
  if [ -f $srcdir/vad.scp ]; then
    cat $subsegments | awk -v s=$frame_shift '{print $1, $2, int(($3/s)+0.5), int(($4/s)-0.5);}' | \
      utils/apply_map.pl -f 2 $srcdir/vad.scp | \
      awk '{for (n=1;n<NF-2;n++) printf("%s ", $n); k=NF-2; l=NF-1; printf("%s[%d:%d]\n", $k, $l, $NF)}' | \
      utils/data/fix_subsegment_feats.pl $dir/utt2max_frames | \
      utils/data/normalize_data_range.pl >$dir/vad.scp
  fi
fi


# Any CMVN stats already present in the output directory do not belong to
# the files written above, so they are removed.
if [ -f $dir/cmvn.scp ]; then
  rm $dir/cmvn.scp
  echo "$0: warning: removing $dir/cmvn.scp, you will have to regenerate it from the features."
fi

# remove the utt2dur file in case it's now invalid -- it can be regenerated from the segments file.
# ('|| true' keeps 'set -e' from aborting when the file does not exist.)
rm $dir/utt2dur 2>/dev/null || true

# These files are copied unchanged when the source directory has them.
if [ -f $srcdir/spk2gender ]; then
  cp $srcdir/spk2gender $dir
fi
if [ -f $srcdir/glm ]; then
  cp $srcdir/glm $dir
fi
if [ -f $srcdir/stm ]; then
  cp $srcdir/stm $dir
fi

# Files that would need real sub-segmenting are only reported, not copied.
for f in ctm; do
  if [ -f $srcdir/$f ]; then
    echo "$0: not copying $srcdir/$f to $dir because sub-segmenting it is "
    echo " ... not implemented yet (and probably it's not needed.)"
  fi
done

# The new-to-old utterance map was only a temporary helper file.
rm $dir/new2old_utt

echo "$0: subsegmented data from $srcdir to $dir"


================================================
FILE: egs/utils/dict_dir_add_pronprobs.sh
================================================
#!/usr/bin/env bash

# Apache 2.0.
# Copyright  2014  Johns Hopkins University (author: Daniel Povey)
#            2014  Guoguo Chen
#            2015  Hainan Xu


# The thing that this script implements is described in the paper:
# "PRONUNCIATION AND SILENCE PROBABILITY MODELING FOR ASR"
# by Guoguo Chen et al, see
# http://www.danielpovey.com/files/2015_interspeech_silprob.pdf

. ./path.sh || exit 1;

# begin configuration
max_normalize=true
# end configuration

. utils/parse_options.sh || exit 1;

# Abort on the first failing command from here on.
set -e

if [[ $# -ne 3 && $# -ne 5 ]]; then
  echo "Usage: $0 [options] <input-dict-dir> <input-pron-counts> \\"
  echo "          [input-sil-counts] [input-bigram-counts] <output-dict-dir>"
  echo " e.g.: $0 data/local/dict \\"
  echo "          exp/tri3/pron_counts_nowb.txt exp/tri3/sil_counts_nowb.txt \\"
  echo "          exp/tri3/pron_bigram_counts_nowb.txt data/local/dict_prons"
  echo " e.g.: $0 data/local/dict \\"
  echo "          exp/tri3/pron_counts_nowb.txt data/local/dict_prons"
  echo ""
  echo "This script takes pronunciation counts, e.g. generated by aligning your training"
  echo "data and getting the prons using steps/get_prons.sh, and creates a modified"
  echo "dictionary directory with pronunciation probabilities. If the [input-sil-counts]"
  echo "parameter is provided, it will also include silprobs in the generated lexicon."
  echo "Options:"
  echo "   --max-normalize   (true|false)             # default true.  If true,"
  echo "                                              # divide each pron-prob by the"
  echo "                                              # most likely pron-prob per word."
  exit 1;
fi

# Three arguments: pronunciation probabilities only.  Five arguments: the
# silence counts and bigram counts are given as well; $sil_counts is only
# set in that case, which is what enables the silprob section further down.
if [ $# -eq 3 ]; then
  srcdir=$1
  pron_counts=$2
  dir=$3
elif [ $# -eq 5 ]; then
  srcdir=$1
  pron_counts=$2
  sil_counts=$3
  bigram_counts=$4
  dir=$5
fi

if [ ! -s $pron_counts ]; then
  echo "$0: expected file $pron_counts to exist";
  exit 1;
fi

mkdir -p $dir || exit 1;
utils/validate_dict_dir.pl $srcdir;

# Build $dir/lexicon.txt (no probabilities, single spaces, sorted, unique)
# from whichever lexicon the source directory provides.
if [ -f $srcdir/lexicon.txt ]; then
  src_lex=$srcdir/lexicon.txt
  perl -ane 'print join(" ", split(" ", $_)) . "\n";' < $src_lex |\
    sort -u > $dir/lexicon.txt
elif [ -f $srcdir/lexiconp.txt ]; then
  echo "$0: removing the pron-probs from $srcdir/lexiconp.txt to create $dir/lexicon.txt"
  # the Perl command below normalizes the spaces (avoid double space).
  src_lex=$srcdir/lexiconp.txt
  awk '{$2 = ""; print $0;}' <$srcdir/lexiconp.txt |\
    perl -ane 'print join(" ", split(" " ,$_)) . "\n";' |\
    sort -u > $dir/lexicon.txt || exit 1;
fi


# the cat and awk commands below are implementing add-one smoothing.
# Every lexicon entry is fed in with a count of 1 ahead of the observed
# counts.  The first awk sums the counts per word and per (word, pron) and
# prints "<prob> <word> <pron>"; the second awk swaps the first two fields
# to give "<word> <prob> <pron>"; entries for the word <eps> are dropped.
cat <(awk '{print 1, $0;}' <$dir/lexicon.txt) $pron_counts | \
  awk '{ count = $1; $1 = ""; word_count[$2] += count; pron_count[$0] += count; pron2word[$0] = $2; }
       END{ for (p in pron_count) { word = pron2word[p]; num = pron_count[p]; den = word_count[word];
          print num / den, p } } ' | \
    awk '{ word = $2; $2 = $1; $1 = word; print; }' | grep -v '^<eps>' |\
    sort -k1,1 -k2g,2 -k3 > $dir/lexiconp.txt


# Because every lexicon entry was seeded with a count of 1, the two files are
# expected to have the same number of lines; a mismatch is treated as an error.
n_old=$(wc -l <$dir/lexicon.txt)
n_new=$(wc -l <$dir/lexiconp.txt)

if [ "$n_old" != "$n_new" ]; then
  echo "$0: number of lines differs from $dir/lexicon.txt $n_old vs $dir/lexiconp.txt $n_new"
  echo "Probably something went wrong (e.g. input prons were generated from a different lexicon"
  echo "than $srcdir, or you used pron_counts.txt when you should have used pron_counts_nowb.txt"
  echo "or something else.  Make sure the prons in $src_lex $pron_counts look"
  echo "the same."
  exit 1;
fi

if $max_normalize; then
  echo "$0: normalizing pronprobs so maximum is 1 for each word."
  # maxp.txt holds, for each word, the largest probability among its prons.
  cat $dir/lexiconp.txt | awk '{if ($2 > max[$1]) { max[$1] = $2; }} END{for (w in max) { print w, max[w]; }}' > $dir/maxp.txt

  # Divide every pron-prob by the maximum of its word.
  awk -v maxf=$dir/maxp.txt  'BEGIN{ while (getline <maxf) { max[$1] = $2; }} { $2 = $2 / max[$1]; print }' <$dir/lexiconp.txt > $dir/lexicon_tmp.txt || exit 1;

  if ! [ $(wc -l  <$dir/lexicon_tmp.txt)  -eq $(wc -l  <$dir/lexiconp.txt) ]; then
    echo "$0: error max-normalizing pron-probs"
    exit 1;
  fi
  mv $dir/lexicon_tmp.txt $dir/lexiconp.txt
  rm $dir/maxp.txt
fi

# Create $dir/lexiconp_silprob.txt and $dir/silprob.txt if silence counts file
# exists. The format of $dir/lexiconp_silprob.txt is:
# word pron-prob P(s_r | w)  F(s_l | w) F(n_l | w) pron
#  where:  P(s_r | w) is the probability of silence to the right of the word
#          F(s_l | w) is a factor which is greater than one if silence to the
#                  left of the word is more than averagely probable.
#          F(n_l | w) is a factor which is greater than one if nonsilence to the
#                  left of the word is more than averagely probable.
#
# The perl program below reads the silence counts from its standard input and
# takes four file names as arguments: the lexiconp.txt and bigram-counts files
# to read, then the (unsorted) lexiconp_silprob and silprob.txt files to write.
# Inside the program text, '$0' briefly leaves the single-quoted string so that
# the shell substitutes this script's name into the error messages.
# NOTE(review): the per-word values are rounded to two decimals with sprintf,
# but the two </s> factors written to silprob.txt are not; confirm that
# consumers of silprob.txt accept unrounded values.
if [ -n "$sil_counts" ]; then
  if [ ! -s "$sil_counts" ]; then
    echo "$0: expected file $sil_counts to exist and not empty" && exit 1;
  fi
  cat $sil_counts | perl -e '
    # Load silence counts
    %sil_wpron = (); %nonsil_wpron = (); %wpron_sil = (); %wpron_nonsil = ();
    $sil_count = 0; $nonsil_count = 0;
    while (<STDIN>) {
      chomp; @col = split; @col >= 5 || die "'$0': bad line \"$_\"\n";
      $wpron = join(" ", @col[4..scalar(@col)-1]);
      ($sil_wpron{$wpron}, $nonsil_wpron{$wpron},
       $wpron_sil{$wpron}, $wpron_nonsil{$wpron}) = @col[0..3];
      $sil_count += $sil_wpron{$wpron}; $nonsil_count += $nonsil_wpron{$wpron};
    }

    # Open files.
    ($lexiconp, $bigram_counts, $lexiconp_silprob, $silprob) = @ARGV;
    open(LP, "<$lexiconp") || die "'$0': fail to open $lexiconp\n";
    open(WPC, "<$bigram_counts") || die "'$0': fail to open $bigram_counts\n";
    open(SP, ">$silprob") || die "'$0': fail to open $silprob\n";
    open(LPSP, ">$lexiconp_silprob") ||
      die "'$0': fail to open $lexiconp_silprob\n";

    # Computes P(s_r | w) in the paper.
    $lambda2 = 2;             # Smoothing term, \lambda_2 in the paper.
    %P_w_sr = ();
    %all_wprons = ();
    $sil_prob = sprintf("%.2f", $sil_count / ($sil_count + $nonsil_count));
    while (<LP>) {
      chomp; @col = split; @col >= 3 || die "'$0': bad line \"$_\"\n";
      $word = shift @col; $pron_prob = shift @col; $pron = join(" ", @col);
      unshift(@col, $word); $wpron = join(" ", @col);

      $wpron_sil_count = $wpron_sil{$wpron} + $sil_prob * $lambda2;
      $wpron_nonsil_count = $wpron_nonsil{$wpron} + (1 - $sil_prob) * $lambda2;
      $sil_after_prob = sprintf("%.2f",
        $wpron_sil_count / ($wpron_sil_count + $wpron_nonsil_count));
      if ($sil_after_prob == "0.00") { $sil_after_prob = "0.01"; }
      if ($sil_after_prob == "1.00") { $sil_after_prob = "0.99"; }
      $P_w_sr{$wpron} = $sil_after_prob;

      $all_wprons{$wpron} = $pron_prob;
    }

    # Reads C(v ? w) in the paper.
    %wpron_pair_count = ();
    while (<WPC>) {
      chomp; @col = split("\t"); @col == 3 || die "'$0': bad line \"$_\"\n";
      $count = shift @col; $wpron1 = shift @col; $wpron2 = shift @col;
      $key = "${wpron1}\t${wpron2}";
      $wpron_pair_count{$key} = $count;
    }

    # Computes \bar{C}(s w) and \bar{C}(n w) in the paper.
    %bar_C_s_w = ();
    %bar_C_n_w = ();
    foreach my $key (keys %wpron_pair_count) {
      $count = $wpron_pair_count{$key};
      ($wpron1, $wpron2) = split("\t", $key);
      $bar_C_s_w{$wpron2} += $count * $P_w_sr{$wpron1};
      $bar_C_n_w{$wpron2} += $count * (1 - $P_w_sr{$wpron1});
    }

    # Computes F(s_l | w) and F(n_l | w) in the paper.
    $lambda3 = 2;             # Smoothing term, \lambda_3 in the paper.
    foreach my $wpron (keys %all_wprons) {
      @col = split(" ", $wpron);
      $word = shift @col;
      $pron = join(" ", @col);
      $pron_prob = $all_wprons{$wpron};

      $F_sl_w = ($sil_wpron{$wpron} + $lambda3) / ($bar_C_s_w{$wpron} + $lambda3);
      $F_nl_w = ($nonsil_wpron{$wpron} + $lambda3) / ($bar_C_n_w{$wpron} + $lambda3);
      $F_sl_w = sprintf("%.2f", $F_sl_w);
      $F_nl_w = sprintf("%.2f", $F_nl_w);
      if ($F_sl_w == "0.00") { $F_sl_w = "0.01"; }
      if ($F_nl_w == "0.00") { $F_nl_w = "0.01"; }

      print LPSP "$word $pron_prob $P_w_sr{$wpron} $F_sl_w $F_nl_w $pron\n";
    }

    # Create silprob.txt
    $BOS_sil_count = $wpron_sil{"<s>"} + $sil_prob * $lambda2;
    $BOS_nonsil_count = $wpron_nonsil{"<s>"} + (1 - $sil_prob) * $lambda2;
    $P_BOS_sr = sprintf("%.2f", $BOS_sil_count / ($BOS_sil_count + $BOS_nonsil_count));
    $F_sl_EOS = ($sil_wpron{"</s>"} + $lambda3) / ($bar_C_s_w{"</s>"} + $lambda3);
    $F_nl_EOS = ($nonsil_wpron{"</s>"} + $lambda3) / ($bar_C_n_w{"</s>"} + $lambda3);
    if ($P_BOS_sr == "1.00") { $P_BOS_sr = "0.99"; }
    if ($P_BOS_sr == "0.00") { $P_BOS_sr = "0.01"; }
    if ($F_sl_EOS == "0.00") { $F_sl_EOS = "0.01"; }
    if ($F_nl_EOS == "0.00") { $F_nl_EOS = "0.01"; }
    print SP "<s> $P_BOS_sr\n</s>_s $F_sl_EOS\n</s>_n $F_nl_EOS\noverall $sil_prob\n";
    ' $dir/lexiconp.txt $bigram_counts $dir/lexiconp_silprob_unsorted.txt $dir/silprob.txt
    # Sort by word, then pron-prob, then pronunciation (the 6th field onward).
    sort -k1,1 -k2g,2 -k6 $dir/lexiconp_silprob_unsorted.txt > $dir/lexiconp_silprob.txt
fi

# now regenerate lexicon.txt from lexiconp.txt, to make sure the lines are
# in the same order.  (Blanking the 2nd field leaves a double space, which
# the sed command collapses.)
cat $dir/lexiconp.txt | awk '{$2 = ""; print;}' | sed 's/  / /g' >$dir/lexicon.txt


# add mandatory files.
for f in silence_phones.txt nonsilence_phones.txt; do
  if [ ! -f $srcdir/$f ]; then
    echo "$0: expected $srcdir/$f to exist."
    exit 1;
  fi
  cp $srcdir/$f $dir/ || exit 1;
done


# add optional files (at least, I think these are optional; would have to check the docs).
for f in optional_silence.txt extra_questions.txt; do
  if [ -f $srcdir/$f ]; then
    cp $srcdir/$f $dir || exit 1;
  fi
done


echo "$0: produced dictionary directory with probabilities in $dir/"
echo "$0: validating $dir .."
sleep 1
utils/validate_dict_dir.pl $dir || exit 1;


# Show the eight entries with the lowest pronunciation probability, printing
# the command first so that the user can re-run it.
echo "Some low-probability prons include: "
echo "# sort -k2,2 -n $dir/lexiconp.txt  | head -n 8"

sort -k2,2 -n $dir/lexiconp.txt  | head -n 8

exit 0


================================================
FILE: egs/utils/eps2disambig.pl
================================================
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation
#                2015 Guoguo Chen

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# Filter for the text form of the G.fst acceptor: on every arc line the
# input-side <eps> is replaced with the disambiguation symbol #0.  The
# output side and all other lines are passed through unchanged.

while (my $line = <>) {
  # "#0" is reserved, so it must not already be used as a word in the LM.
  if ($line =~ m/\s+#0\s+/) {
    print STDERR "$0: ERROR: LM has word #0, " .
                 "which is reserved as disambiguation symbol\n";
    exit 1;
  }
  # Arc lines start with "<src-state> <dest-state> <input-symbol> ...".
  $line =~ s/^(\d+\s+\d+\s+)<eps>(\s+)/$1#0$2/;
  print $line;
}


================================================
FILE: egs/utils/filt.py
================================================
#!/usr/bin/env python

# Apache 2.0

from __future__ import print_function
import sys

# Load the vocabulary: one entry per line of the file named by argv[1],
# with surrounding whitespace removed.
with open(sys.argv[1]) as vocabfile:
    vocab = {entry.strip() for entry in vocabfile}

# Echo the text file named by argv[2] line by line, replacing every token
# that is not in the vocabulary by '<UNK>'.  Tokens are re-joined with single
# spaces, so runs of whitespace in the input are collapsed.
with open(sys.argv[2]) as textfile:
    for sentence in textfile:
        tokens = sentence.strip().split()
        print(" ".join(tok if tok in vocab else '<UNK>' for tok in tokens))


================================================
FILE: egs/utils/filter_scp.pl
================================================
#!/usr/bin/env perl
# Copyright 2010-2012 Microsoft Corporation
#                     Johns Hopkins University (author: Daniel Povey)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# This script takes a list of utterance-ids or any file whose first field
# of each line is an utterance-id, and filters an scp
# file (or any file whose "n-th" field is an utterance id), printing
# out only those lines whose "n-th" field is in id_list. The index of
# the "n-th" field is 1, by default, but can be changed by using
# the -f <n> switch

# Option defaults: keep (rather than drop) the listed ids, and match on the
# first whitespace-separated field of each input line.
$exclude = 0;
$field = 1;
$shifted = 0;

# Consume leading options in any order: keep looping for as long as the
# previous pass recognised (and shifted away) at least one option.
do {
  $shifted=0;
  if ($ARGV[0] eq "--exclude") {
    $exclude = 1;
    shift @ARGV;
    $shifted=1;
  }
  if ($ARGV[0] eq "-f") {
    $field = $ARGV[1];
    shift @ARGV; shift @ARGV;
    $shifted=1
  }
} while ($shifted);

# One or two positional arguments must remain: the id list and, optionally,
# the file to filter (standard input is read when it is omitted).
if(@ARGV < 1 || @ARGV > 2) {
  die "Usage: filter_scp.pl [--exclude] [-f <field-to-filter-on>] id_list [in.scp] > out.scp \n" .
      "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" .
      "Note: only the first field of each line in id_list matters.  With --exclude, prints\n" .
      "only the lines that were *not* in id_list.\n" .
      "Caution: previously, the -f option was interpreted as a zero-based field index.\n" .
      "If your older scripts (written before Oct 2014) stopped working and you used the\n" .
      "-f option, add 1 to the argument.\n" .
      "See also: utils/filter_scps.pl .\n";
}


# Load the id list into %seen; only the first field of each line is used.
$idlist = shift @ARGV;
open(F, "<$idlist") || die "Could not open id-list file $idlist";
while(<F>) {
  @A = split;
  @A>=1 || die "Invalid id-list file line $_";
  $seen{$A[0]} = 1;
}
close(F);

if ($field == 1) { # Treat this as special case, since it is common.
  while(<>) {
    # Grab the first field with a regex instead of splitting the whole line.
    $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field.";
    # $1 is what we filter on.
    if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) {
      print $_;
    }
  }
} else {
  while(<>) {
    @A = split;
    @A > 0 || die "Invalid scp file line $_";
    @A >= $field || die "Invalid scp file line $_";
    # $field is one-based on the command line, hence the -1.
    if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) {
      print $_;
    }
  }
}

# tests:
# the following should print "foo 1"
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo)
# the following should print "bar 2".
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2)


================================================
FILE: egs/utils/filter_scps.pl
================================================
#!/usr/bin/env perl
# Copyright 2010-2012   Microsoft Corporation
#           2012-2016   Johns Hopkins University (author: Daniel Povey)
#                2015   Xiaohui Zhang

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# This script takes multiple lists of utterance-ids or any file whose first field
# of each line is an utterance-id, as filters, and filters an scp
# file (or any file whose "n-th" field is an utterance id), printing
# out only those lines whose "n-th" field is in filter. The index of
# the "n-th" field is 1, by default, but can be changed by using
# the -f <n> switch


# Defaults: filter on the first field and print warnings.
$field = 1;
$shifted = 0;
$print_warnings = 1;
# Consume leading options in any order: keep looping for as long as the
# previous pass recognised (and shifted away) at least one option.
do {
  $shifted=0;
  if ($ARGV[0] eq "-f") {
    $field = $ARGV[1];
    shift @ARGV; shift @ARGV;
    $shifted = 1;
  }
  if ($ARGV[0] eq "--no-warn") {
    $print_warnings = 0;
    shift @ARGV;
    $shifted = 1;
  }
} while ($shifted);


if(@ARGV != 4) {
  die "Usage: utils/filter_scps.pl [-f <field-to-filter-on>] <job-range-specifier> <filter-pattern> <input-scp> <output-scp-pattern>\n" .
       "e.g.:  utils/filter_scps.pl  JOB=1:10 data/train/split10/JOB/spk2utt data/train/feats.scp data/train/split10/JOB/feats.scp\n" .
       "similar to utils/filter_scp.pl, but it uses multiple filters and output multiple filtered files.\n".
       "The -f option specifies the field in <input-scp> that we filter on (default: 1).\n" .
       "See also: utils/filter_scp.pl\n";
}

if ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+):(\d+)$/) { # e.g. JOB=1:10
  $jobname = $1;
  $jobstart = $2;
  $jobend = $3;
  # Keep the specifier for the error message: once it has been shifted off,
  # $ARGV[0] is already the next argument (the filter pattern).
  $jobspec = shift @ARGV;
  if ($jobstart > $jobend) {
    die "filter_scps.pl: invalid job range $jobspec";
  }
} else {
  die "filter_scps.pl: bad job-range specifier $ARGV[0]: expected e.g. JOB=1:10";
}

# Filter pattern: every occurrence of the job name in it is replaced by the
# job number to obtain the per-job id-list file.
$idlist = shift @ARGV;

if ($idlist !~ m/$jobname/ &&
    $jobend > $jobstart) {
  print STDERR "filter_scps.pl: you are trying to use multiple filter files as filter patterns but "
    . "you are providing just one filter file ($idlist)\n";
  exit(1);
}


$infile = shift @ARGV;

# Output pattern, expanded per job in the same way as the filter pattern.
$outfile = shift @ARGV;

if ($outfile !~ m/$jobname/ &&  $jobend > $jobstart) {
  print STDERR "filter_scps.pl: you are trying to create multiple filtered files but "
    . "you are providing just one output file ($outfile)\n";
  exit(1);
}

# This hashes from the id (e.g. utterance-id) to an array of the relevant
# job-ids (which are integers).  In any normal use-case, this array will contain
# exactly one job-id for any given id, but we want to be agnostic about this.
%id2jobs = ( );

# Some variables that we set to produce a warning.
$warn_uncovered = 0;
$warn_multiply_covered = 0;

# Pass 1: read every per-job id list and record which job(s) want each id.
for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) {
  $idlist_n = $idlist;
  $idlist_n =~ s/$jobname/$jobid/g;

  open(F, "<$idlist_n") || die "Could not open id-list file $idlist_n";

  while(<F>) {
    @A = split;
    @A >= 1 || die "Invalid line $_ in id-list file $idlist_n";
    $id = $A[0];
    if (! defined $id2jobs{$id}) {
      $id2jobs{$id} = [ ];  # new anonymous array.
    }
    push @{$id2jobs{$id}}, $jobid;
  }
  close(F);
}

# job2output hashes from the job-id, to an anonymous array containing
# a sequence of output lines.
%job2output = ( );
for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) {
  $job2output{$jobid} = [ ];  # new anonymous array.
}

# Pass 2: read the input once and queue each line for every job that
# listed its id.
open (F, "< $infile") or die "Can't open $infile for read: $!";
while (<F>) {
  if ($field == 1) {           # Treat this as special case, since it is common.
    $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field.";
    # $1 is what we filter on.
    $id = $1;
  } else {
    @A = split;
    @A > 0 || die "Invalid scp file line $_";
    @A >= $field || die "Invalid scp file line $_";
    $id = $A[$field-1];
  }
  if ( ! defined $id2jobs{$id}) {
    $warn_uncovered = 1;
  } else {
    @jobs = @{$id2jobs{$id}};   # this dereferences the array reference.
    if (@jobs > 1) {
      $warn_multiply_covered = 1;
    }
    foreach $job_id (@jobs) {
      if (!defined $job2output{$job_id}) {
        die "Likely code error";
      }
      push @{$job2output{$job_id}}, $_;
    }
  }
}
close(F);

# Pass 3: write one output file per job.
for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) {
  $outfile_n = $outfile;
  $outfile_n =~ s/$jobname/$jobid/g;
  open(FW, ">$outfile_n") || die "Could not open output file $outfile_n";
  $printed = 0;
  foreach $line (@{$job2output{$jobid}}) {
    print FW $line;
    $printed = 1;
  }
  # This used to test the bareword `printed`, which is always true, so the
  # warning could never fire.  It is gated on --no-warn like the other
  # warnings so that callers who asked for silence still get it.
  if (!$printed && $print_warnings) {
    print STDERR "filter_scps.pl: warning: output to $outfile_n is empty\n";
  }
  close(FW);
}

if ($warn_uncovered && $print_warnings) {
  print STDERR "filter_scps.pl: warning: some input lines did not get output\n";
}
if ($warn_multiply_covered && $print_warnings) {
  print STDERR "filter_scps.pl: warning: some input lines were output to multiple files [OK if splitting per utt] " .
    join(" ", @ARGV) . "\n";
}


================================================
FILE: egs/utils/find_arpa_oovs.pl
================================================
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# Expect the symbol table and, optionally, the ARPA file (standard input is
# read when it is omitted).  The two conditions must be OR-ed: with "&&" the
# test could never be true and the usage message was unreachable.
if (@ARGV < 1 || @ARGV > 2) {
    die "Usage: find_arpa_oovs.pl words.txt [lm.arpa]\n";
    # This program finds words in the arpa file that are not symbols
    # in the OpenFst-format symbol table words.txt.  It prints them
    # on the standard output, one per line.
}

# Load the symbol table ("<symbol> <integer>" per line); only the symbol
# names are needed.
$symtab = shift @ARGV;
open(S, "<$symtab") || die "Failed opening symbol table file $symtab\n";
while(<S>){
    @A = split(" ", $_);
    @A == 2 || die "Bad line in symbol table file: $_";
    $seen{$A[0]} = 1;
}

$found_data=0;
$curgram=0;
while(<>) { # Find the \data\ marker.
    if(m:^\\data\\\s*$:) { $found_data=1; last; }
}

if ($found_data==0) {
  print STDERR "find_arpa_oovs.pl: found no \\data\\ marker in the ARPA input.\n";
  exit(1);
}

# Collect every word of the n-gram sections we visit.  Lines before the first
# "\N-grams:" header (the "ngram N=..." counts) are skipped because $curgram
# is still 0 there.
while(<>) {
    if(m/^\\(\d+)\-grams:\s*$/) {
        $curgram = $1;
        if($curgram > 1) {
            last; # This is an optimization as we can get the vocab from the 1-grams
        }
    } elsif($curgram > 0) {
        @A = split(" ", $_);
        if(@A > 1) {
            shift @A;  # drop the leading log-probability.
            for($n=0;$n<$curgram;$n++) {
                $word = $A[$n];
                if(!defined $word) { print STDERR "Unusual line $_ (line $.) in arpa file.\n"; }
                $in_arpa{$word} = 1;
            }
        } else {
            # A single-token line is only expected to be the \end\ marker.
            if(@A > 0 && $A[0] !~ m:\\end\\:) {
                print STDERR "Unusual line $_ (line $.) in arpa file\n";
            }
        }
    }
}

# Report the words missing from the symbol table; the sentence-boundary
# symbols are never reported.
foreach $w (keys %in_arpa) {
    if(!defined $seen{$w} && $w ne "<s>" && $w ne "</s>") {
        print "$w\n";
    }
}


================================================
FILE: egs/utils/fix_data_dir.sh
================================================
#!/usr/bin/env bash

# This script makes sure that only the segments present in
# all of "feats.scp", "wav.scp" [if present], segments [if present]
# text, and utt2spk are present in any of them.
# It puts the original contents of data-dir into
# data-dir/.backup

# Remember the original command line so that it can be handed on unchanged
# to image/fix_data_dir.sh below.
cmd="$@"

# Extra per-utterance / per-speaker files to filter along with the standard
# ones; initialised empty here, before utils/parse_options.sh is sourced.
utt_extra_files=
spk_extra_files=

. utils/parse_options.sh

if [ $# != 1 ]; then
  echo "Usage: utils/data/fix_data_dir.sh <data-dir>"
  echo "e.g.: utils/data/fix_data_dir.sh data/train"
  echo "This script helps ensure that the various files in a data directory"
  echo "are correctly sorted and filtered, for example removing utterances"
  echo "that have no features (if feats.scp is present)"
  exit 1
fi

data=$1

# Data directories containing images.scp are delegated to the image variant
# of this script.
if [ -f $data/images.scp ]; then
  image/fix_data_dir.sh $cmd
  exit $?
fi

# Validate the input before creating anything: with "mkdir -p" run first, the
# directory test could never fail and a mistyped path left a stray
# <data>/.backup directory behind.
[ ! -d $data ] && echo "$0: no such directory $data" && exit 1;

[ ! -f $data/utt2spk ] && echo "$0: no such file $data/utt2spk" && exit 1;

mkdir -p $data/.backup

# Strict mode is only switched on here, after the "[ test ] && ... && exit"
# style checks above.
set -e -o pipefail -u

# Scratch directory, removed again on exit or on the listed signals.
tmpdir=$(mktemp -d /tmp/kaldi.XXXX);
trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM

# Byte-wise collation, so that sort and cmp below behave the same in every
# locale.
export LC_ALL=C

# check_sorted <file>
# Ensure <file> is sorted on its first field with one line per key; if it is
# not, replace it in place by the sorted, de-duplicated version.
function check_sorted {
  local target=$1
  local candidate=$target.tmp

  sort -k1,1 -u <$target >$candidate
  if cmp -s $target $candidate; then
    # Already sorted and unique: discard the scratch copy.
    rm $candidate
  else
    echo "$0: file $1 is not in sorted order or not unique, sorting it"
    mv $candidate $target
  fi
}

# Copy every standard file that is present into .backup, then make sure the
# live copy is sorted on its first field without duplicate keys.
for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp \
    reco2file_and_channel spk2gender utt2lang utt2uniq utt2dur reco2dur utt2num_frames; do
  if [ ! -f $data/$x ]; then
    continue
  fi
  cp $data/$x $data/.backup/$x
  check_sorted $data/$x
done


# filter_file <id-list> <file>
# Rewrite <file> in place so that it keeps only the lines whose first field
# appears in <id-list>, and report when the number of lines changed.
function filter_file {
  local id_list=$1
  local target=$2
  local snapshot=${target}.tmp
  local n_before n_after

  # Filter from a snapshot, since the output overwrites the file itself.
  cp $target $snapshot
  utils/filter_scp.pl $id_list $snapshot > $target

  if ! cmp $snapshot $target >&/dev/null; then
    n_before=$(wc -l < $snapshot)
    n_after=$(wc -l < $target)
    if [ $n_before -ne $n_after ]; then
      echo "$0: filtered $target from $n_before to $n_after lines based on filter $id_list."
    fi
  fi
  rm $snapshot
}

# filter_recordings
# When $data/segments exists, restrict segments, wav.scp and (if present)
# reco2file_and_channel and reco2dur to a common set of recording-ids.
# Does nothing when there is no segments file.  Uses the globals $data and
# $tmpdir.
function filter_recordings {
  # We call this once before the stage when we filter on utterance-id, and once
  # after.

  if [ -f $data/segments ]; then
  # We have a segments file -> we need to filter this and the file wav.scp, and
  # reco2file_and_channel, if it exists, to make sure they have the same list of
  # recording-ids.

    if [ ! -f $data/wav.scp ]; then
      echo "$0: $data/segments exists but not $data/wav.scp"
      exit 1;
    fi
    # Recording-ids are the second column of segments.
    awk '{print $2}' < $data/segments | sort | uniq > $tmpdir/recordings
    # NOTE(review): n1 is not used again in this function.
    n1=$(cat $tmpdir/recordings | wc -l)
    [ ! -s $tmpdir/recordings ] && \
      echo "Empty list of recordings (bad file $data/segments)?" && exit 1;
    # Keep only the recordings that also have an entry in wav.scp.
    utils/filter_scp.pl $data/wav.scp $tmpdir/recordings > $tmpdir/recordings.tmp
    mv $tmpdir/recordings.tmp $tmpdir/recordings


    # filter_file matches on the first field, so swap the first two columns
    # of segments (recording-id first), filter, and then swap them back.
    cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments
    filter_file $tmpdir/recordings $data/segments
    cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments
    rm $data/segments.tmp

    filter_file $tmpdir/recordings $data/wav.scp
    [ -f $data/reco2file_and_channel ] && filter_file $tmpdir/recordings $data/reco2file_and_channel
    [ -f $data/reco2dur ] && filter_file $tmpdir/recordings $data/reco2dur
    # Make the function return success even when the optional file tested on
    # the previous line is absent (the script runs under "set -e").
    true
  fi
}

# filter_speakers
# Reduce the data directory to the speakers that are present in utt2spk and
# in every per-speaker file that exists, then filter those per-speaker files
# (plus $spk_extra_files) down to the same set.
function filter_speakers {
  # throughout this program, we regard utt2spk as primary and spk2utt as derived, so...
  utils/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt

  # Start from every speaker in spk2utt ...
  awk '{print $1}' < $data/spk2utt > $tmpdir/speakers
  # ... and drop the ones missing from cmvn.scp / spk2gender.  Note the
  # argument order: here the speaker list itself is the file being filtered.
  for s in cmvn.scp spk2gender; do
    if [ ! -f $data/$s ]; then
      continue
    fi
    filter_file $data/$s $tmpdir/speakers
  done

  # Apply the surviving speaker list to spk2utt and regenerate utt2spk.
  filter_file $tmpdir/speakers $data/spk2utt
  utils/spk2utt_to_utt2spk.pl $data/spk2utt > $data/utt2spk

  # Finally bring the per-speaker files in line with the speaker list.
  for s in cmvn.scp spk2gender $spk_extra_files; do
    if [ ! -f $data/$s ]; then
      continue
    fi
    filter_file $tmpdir/speakers $data/$s
  done
}

# filter_utts
# Compute the set of utterance-ids present in utt2spk and in every
# per-utterance file that exists, then filter all per-utterance files
# (plus $utt_extra_files) down to that set.  Exits with status 1 if the
# sortedness checks fail or if no utterance remains.  Uses the globals
# $data and $tmpdir.
function filter_utts {
  # Initial candidate list: every utterance-id in utt2spk.
  cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts

  # Sanity checks: each "! ... | cmp" succeeds (and so triggers the message
  # and the exit) only when the sorted output differs from the file.
  ! cat $data/utt2spk | sort | cmp - $data/utt2spk && \
    echo "utt2spk is not in sorted order (fix this yourself)" && exit 1;

  ! cat $data/utt2spk | sort -k2 | cmp - $data/utt2spk && \
    echo "utt2spk is not in sorted order when sorted first on speaker-id " && \
    echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1;

  ! cat $data/spk2utt | sort | cmp - $data/spk2utt && \
    echo "spk2utt is not in sorted order (fix this yourself)" && exit 1;

  if [ -f $data/utt2uniq ]; then
    ! cat $data/utt2uniq | sort | cmp - $data/utt2uniq && \
      echo "utt2uniq is not in sorted order (fix this yourself)" && exit 1;
  fi

  maybe_wav=
  maybe_reco2dur=
  [ ! -f $data/segments ] && maybe_wav=wav.scp # wav indexed by utts only if segments does not exist.
  [ -s $data/reco2dur ] && [ ! -f $data/segments ] && maybe_reco2dur=reco2dur # reco2dur indexed by utts

  # Only well-formed utt2dur lines (exactly two fields, positive second
  # field) take part in the intersection; they go to a temporary ".ok" file.
  maybe_utt2dur=
  if [ -f $data/utt2dur ]; then
    cat $data/utt2dur | \
      awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2dur.ok || exit 1
    maybe_utt2dur=utt2dur.ok
  fi

  # Same treatment for utt2num_frames.
  maybe_utt2num_frames=
  if [ -f $data/utt2num_frames ]; then
    cat $data/utt2num_frames | \
      awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2num_frames.ok || exit 1
    maybe_utt2num_frames=utt2num_frames.ok
  fi

  # Intersect the candidate list with the ids of each file that exists.
  for x in feats.scp text segments utt2lang $maybe_wav $maybe_utt2dur $maybe_utt2num_frames; do
    if [ -f $data/$x ]; then
      utils/filter_scp.pl $data/$x $tmpdir/utts > $tmpdir/utts.tmp
      mv $tmpdir/utts.tmp $tmpdir/utts
    fi
  done
  # The ".ok" files were only needed for the intersection above.
  rm $data/utt2dur.ok 2>/dev/null || true
  rm $data/utt2num_frames.ok 2>/dev/null || true

  [ ! -s $tmpdir/utts ] && echo "fix_data_dir.sh: no utterances remained: not proceeding further." && \
    rm $tmpdir/utts && exit 1;


  # Report how many utterances survive.
  if [ -f $data/utt2spk ]; then
    new_nutts=$(cat $tmpdir/utts | wc -l)
    old_nutts=$(cat $data/utt2spk | wc -l)
    if [ $new_nutts -ne $old_nutts ]; then
      echo "fix_data_dir.sh: kept $new_nutts utterances out of $old_nutts"
    else
      echo "fix_data_dir.sh: kept all $old_nutts utterances."
    fi
  fi

  # Apply the final list to every per-utterance file.  Each file is backed up
  # first and only rewritten when filtering would actually change it.
  for x in utt2spk utt2uniq feats.scp vad.scp text segments utt2lang utt2dur utt2num_frames $maybe_wav $maybe_reco2dur $utt_extra_files; do
    if [ -f $data/$x ]; then
      cp $data/$x $data/.backup/$x
      if ! cmp -s $data/$x <( utils/filter_scp.pl $tmpdir/utts $data/$x ) ; then
        utils/filter_scp.pl $tmpdir/utts $data/.backup/$x > $data/$x
      fi
    fi
  done

}

# Main sequence.  The recording and speaker filters are run both before and
# after the utterance filter, so that entries dropped by filter_utts are
# removed from the recording-level and speaker-level files as well.
filter_recordings
filter_speakers
filter_utts
filter_speakers
filter_recordings

# utt2spk is treated as primary, so regenerate spk2utt from its final state.
utils/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt

echo "fix_data_dir.sh: old files are kept in $data/.backup"


================================================
FILE: egs/utils/format_lm.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Arnab Ghoshal
#           2010-2011  Microsoft Corporation
#           2016-2018  Johns Hopkins University (author: Daniel Povey)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# Abort on the first failing command (temporarily relaxed around
# fstisstochastic below).
set -e

if [ $# -ne 4 ]; then
  echo "Usage: $0 <lang_dir> <arpa-LM> <lexicon> <out_dir>"
  echo "E.g.: $0 data/lang data/local/lm/foo.kn.gz data/local/dict/lexicon.txt data/lang_test"
  echo "Convert ARPA-format language models to FSTs.";
  exit 1;
fi

lang_dir=$1
lm=$2
lexicon=$3
out_dir=$4
mkdir -p $out_dir

[ -f ./path.sh ] && . ./path.sh

echo "Converting '$lm' to FST"

# The -ef test checks whether the source and target directories are the same
# directory in the filesystem.  If they are, the section guarded by the test
# would actually be harmful (it would delete the phones/ subdirectory that is
# about to be copied).
if [ -e $out_dir ] && [ ! $lang_dir -ef $out_dir ] ; then
  if [ -e $out_dir/phones ] ; then
    rm -r $out_dir/phones
  fi

  # Copy the parts of the lang directory listed here into the output directory.
  for f in phones.txt words.txt topo L.fst L_disambig.fst phones oov.int oov.txt; do
     cp -r $lang_dir/$f $out_dir
  done
fi

# NOTE(review): lm_base is not used anywhere else in this script.
lm_base=$(basename $lm '.gz')
# The LM is always piped through gunzip, i.e. it is expected to be gzipped.
gunzip -c $lm \
  | arpa2fst --disambig-symbol=#0 \
             --read-symbol-table=$out_dir/words.txt - $out_dir/G.fst
# fstisstochastic is run for its printed diagnostic only: errexit is switched
# off around it so that its exit status cannot abort the script.
set +e
fstisstochastic $out_dir/G.fst
set -e
# The output is like:
# 9.14233e-05 -0.259833
# we do expect the first of these 2 numbers to be close to zero (the second is
# nonzero because the backoff weights make the states sum to >1).

# Everything below is only for diagnostic.
# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
# this might cause determinization failure of CLG.
# #0 is treated as an empty word.
mkdir -p $out_dir/tmpdir.g
# Build a one-state acceptor with a self-loop for every lexicon line that
# consists of a single field, plus a self-loop for #0.
awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }}
     END{print "0 0 #0 #0"; print "0";}' \
     < "$lexicon" > $out_dir/tmpdir.g/select_empty.fst.txt

# Composing that acceptor with G keeps only the paths of G made of those
# symbols.
fstcompile --isymbols=$out_dir/words.txt --osymbols=$out_dir/words.txt \
  $out_dir/tmpdir.g/select_empty.fst.txt \
  | fstarcsort --sort_type=olabel \
  | fstcompose - $out_dir/G.fst > $out_dir/tmpdir.g/empty_words.fst

# Fail if fstinfo reports the composed FST as cyclic.
fstinfo $out_dir/tmpdir.g/empty_words.fst | grep cyclic | grep -w 'y' \
  && echo "Language model has cycles with empty words" && exit 1

rm -r $out_dir/tmpdir.g


echo "Succeeded in formatting LM: '$lm'"


================================================
FILE: egs/utils/format_lm_sri.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Arnab Ghoshal
# Copyright 2010-2011  Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# Begin configuration section.
# Options handed to SRILM's change-lm-vocab; can be overridden on the command
# line with --srilm-opts.
srilm_opts="-subset -prune-lowprobs -unk -tolower"
# end configuration sections


. utils/parse_options.sh

# Three or four positional arguments are accepted (the lexicon is optional).
if [ $# -lt 3 ] || [ $# -gt 4 ]; then
  echo "Usage: $0 [options] <lang-dir> <arpa-LM> [<lexicon>] <out-dir>"
  echo "The <lexicon> argument is no longer needed but is supported for back compatibility"
  echo "E.g.: utils/format_lm_sri.sh data/lang data/local/lm/foo.kn.gz data/local/dict/lexicon.txt data/lang_test"
  echo "Converts ARPA-format language models to FSTs. Change the LM vocabulary using SRILM."
  echo "Note: if you want to just convert ARPA LMs to FSTs, there is a simpler way to do this"
  echo "that doesn't require SRILM: see utils/format_lm.sh"
  echo "options:"
  echo " --help                 # print this message and exit"
  echo " --srilm-opts STRING      # options to pass to SRILM tools (default: '$srilm_opts')"
  exit 1;
fi


# The first two arguments are the same in both forms; the output directory is
# always the last one.
lang_dir=$1
lm=$2
if [ $# -eq 4 ] ; then
  lexicon=$3
  out_dir=$4
else
  out_dir=$3
fi

# Both the LM and the word symbol table must exist before we start.
for f in $lm $lang_dir/words.txt; do
  [ -f $f ] && continue
  echo "$0: expected input file $f to exist."
  exit 1;
done

if [ -f ./path.sh ]; then
  . ./path.sh
fi

# SRILM's change-lm-vocab must be on the PATH.
loc=$(which change-lm-vocab)
if [ -z $loc ]; then
  echo You appear to not have SRILM tools installed.
  echo cd to $KALDI_ROOT/tools and run extras/install_srilm.sh.
  exit 1
fi

echo "Converting '$lm' to FST"
# Scratch directory for the vocabulary file; removed when the script exits.
tmpdir=$(mktemp -d /tmp/kaldi.XXXX);
trap 'rm -rf "$tmpdir"' EXIT

# The output directory starts out as a copy of the lang directory.
mkdir -p $out_dir
cp -r $lang_dir/* $out_dir || exit 1;

# The vocabulary is the first column of words.txt.
if ! awk '{print $1}' $out_dir/words.txt > $tmpdir/voc; then
  exit 1;
fi

# Change the LM vocabulary to be the intersection of the current LM vocabulary
# and the set of words in the pronunciation lexicon. This also renormalizes the
# LM by recomputing the backoff weights, and remove those ngrams whose
# probabilities are lower than the backed-off estimates.
change-lm-vocab -vocab $tmpdir/voc -lm $lm -write-lm - $srilm_opts | \
  arpa2fst --disambig-symbol=#0 \
           --read-symbol-table=$out_dir/words.txt - $out_dir/G.fst || exit 1

# Printed for diagnostics only; the exit status is not checked.
fstisstochastic $out_dir/G.fst

# The output is like:
# 9.14233e-05 -0.259833
# we do expect the first of these 2 numbers to be close to zero (the second is
# nonzero because the backoff weights make the states sum to >1).

echo "Succeeded in formatting LM '$lm' -> '$out_dir/G.fst'"


================================================
FILE: egs/utils/gen_topo.pl
================================================
#!/usr/bin/env perl

# Copyright 2012  Johns Hopkins University (author: Daniel Povey)

# Generate a topology file.  This allows control of the number of states in the
# non-silence HMMs, and in the silence HMMs.

# Exactly four arguments: the two state counts and the two colon-separated
# lists of integer phone ids.
if (scalar(@ARGV) != 4) {
  print STDERR "Usage: utils/gen_topo.pl <num-nonsilence-states> <num-silence-states> <colon-separated-nonsilence-phones> <colon-separated-silence-phones>\n";
  print STDERR "e.g.:  utils/gen_topo.pl 3 5 4:5:6:7:8:9:10 1:2:3\n";
  exit (1);
}

my ($n_nonsil, $n_sil, $nonsil_list, $sil_list) = @ARGV;

# Non-silence models need between 1 and 100 states; silence models need
# either exactly 1 state or between 3 and 100.
unless ($n_nonsil >= 1 && $n_nonsil <= 100) {
  die "Unexpected number of nonsilence-model states $n_nonsil\n";
}
unless (($n_sil == 1 || $n_sil >= 3) && $n_sil <= 100) {
  die "Unexpected number of silence-model states $n_sil\n";
}

# The topology file wants the phone ids separated by spaces, not colons.
$nonsil_list =~ tr/:/ /;
$sil_list =~ tr/:/ /;
die "$0: bad arguments @ARGV\n" unless $nonsil_list =~ m/^\d[ \d]*$/;
die "$0: bad arguments @ARGV\n" unless $sil_list =~ m/^\d[ \d]*$/;

# Non-silence phones: a left-to-right chain in which every emitting state has
# a self-loop (0.75) and a transition to the next state (0.25).
print "<Topology>\n";
print "<TopologyEntry>\n<ForPhones>\n$nonsil_list\n</ForPhones>\n";
for (my $cur = 0; $cur < $n_nonsil; $cur++) {
  my $succ = $cur + 1;
  print "<State> $cur <PdfClass> $cur <Transition> $cur 0.75 <Transition> $succ 0.25 </State>\n";
}
print "<State> $n_nonsil </State>\n"; # non-emitting final state.
print "</TopologyEntry>\n";

# Now silence phones.  They have a different topology-- apart from the first and
# last states, it's fully connected, as long as you have >= 3 states.
print "<TopologyEntry>\n<ForPhones>\n$sil_list\n</ForPhones>\n";
print "<State> 0 <PdfClass> 0 ";
if ($n_sil > 1) {
  # Every transition out of the first and the central states gets the same
  # probability.
  my $prob = 1.0 / ($n_sil - 1);

  # The first state reaches every emitting state except the last one.
  for (my $dst = 0; $dst < $n_sil - 1; $dst++) {
    print "<Transition> $dst $prob ";
  }
  print "</State>\n";

  # The central states all share the same set of arcs: to every emitting
  # state except the first (themselves and the last one included).
  my $central_arcs = "";
  for (my $dst = 1; $dst < $n_sil; $dst++) {
    $central_arcs .= "<Transition> $dst $prob ";
  }
  for (my $cur = 1; $cur < $n_sil - 1; $cur++) {
    print "<State> $cur <PdfClass> $cur ", $central_arcs, "</State>\n";
  }

  # Final emitting state (non-skippable).
  my $last = $n_sil - 1;
  print "<State> $last <PdfClass> $last <Transition> $last 0.75 <Transition> $n_sil 0.25 </State>\n";
} else {
  # Single-state silence model: self-loop plus exit, as for non-silence.
  print "<Transition> 0 0.75 <Transition> 1 0.25 </State>\n";
}
# Final nonemitting state, common to both silence layouts.
print "<State> $n_sil </State>\n";
print "</TopologyEntry>\n";

print "</Topology>\n";


================================================
FILE: egs/utils/int2sym.pl
================================================
#!/usr/bin/env perl
# Copyright 2010-2012 Microsoft Corporation  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.

# Zero-based bounds of the fields to translate.  An undefined bound means
# "unbounded on that side"; with both undefined, every field is translated.
undef $field_begin;
undef $field_end;


# Optional "-f <field>" or "-f <first>-<last>" (either end may be omitted;
# ":" is accepted in place of "-").  Field numbers are one-based.
if ($ARGV[0] eq "-f") {
  shift @ARGV;
  $field_spec = shift @ARGV;
  if ($field_spec =~ m/^\d+$/) {
    $field_begin = $field_end = $field_spec - 1;
  }
  if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesy (properly, 1-10)
    my ($first, $last) = ($1, $2);
    $field_begin = $first - 1 if $first ne ""; # Change to zero-based indexing.
    $field_end = $last - 1 if $last ne "";     # Change to zero-based indexing.
  }
  unless (defined $field_begin || defined $field_end) {
    die "Bad argument to -f option: $field_spec";
  }
}
$symtab = shift @ARGV;
unless (defined $symtab) {
    print STDERR "Usage: int2sym.pl [options] symtab [input] > output\n" .
      "options: [-f (<field>|<field_start>-<field-end>)]\n" .
      "e.g.: -f 2, or -f 3-4\n";
    exit(1);
}

# Load the symbol table ("<symbol> <integer>" per line), keyed by the integer.
open(F, "<$symtab") || die "Error opening symbol table file $symtab";
while (my $entry = <F>) {
    my @cols = split(" ", $entry);
    die "bad line in symbol table file: $entry" unless @cols == 2;
    my ($sym, $id) = @cols;
    $int2sym{$id} = $sym;
}

# int2sym(<token>, <zero-based position>)
# Return the symbol for an integer token; dies when the token is not made up
# of digits only or is missing from the symbol table.
sub int2sym {
    my ($token, $index) = @_;
    unless ($token =~ m/^\d+$/) { # not all digits..
      my $one_based = $index + 1; # make it one-based.
      die "int2sym.pl: found noninteger token $token [in position $one_based]\n";
    }
    my $symbol = $int2sym{$token};
    defined($symbol) || die "int2sym.pl: integer $token not in symbol table $symtab.";
    return $symbol;
}

# Translate the selected fields of every input line.  Each token is printed
# followed by a single space, so output lines end in " \n".
while (my $line = <>) {
  my @tokens = split(" ", $line);
  for (my $i = 0; $i <= $#tokens; $i++) {
    my $out = $tokens[$i];
    my $selected = (!defined $field_begin || $i >= $field_begin)
                && (!defined $field_end || $i <= $field_end);
    $out = int2sym($out, $i) if $selected;
    print $out . " ";
  }
  print "\n";
}


================================================
FILE: egs/utils/kwslist_post_process.pl
================================================
#!/usr/bin/env perl

# Copyright 2012  Johns Hopkins University (Author: Guoguo Chen)
# Apache 2.0.
#

use strict;
use warnings;
use Getopt::Long;

# ReadKwslist(<file|->)
# Parse a kwslist XML file ("-" means standard input) and return a reference
# to a two-element array:
#   [0] ref to (kwlist_filename, language, system_id) from the <kwslist> tag
#       (an empty array when no such tag was seen);
#   [1] ref to a list of entries
#       [kwid, file, channel, tbeg, dur, score, ""] -- one per <kw> line, the
#       last slot being left empty here.
sub ReadKwslist {
  my ($kwslist_in) = @_;

  my $from_stdin = ($kwslist_in eq "-");
  my $fh;
  if ($from_stdin) {
    $fh = \*STDIN;
  } else {
    open($fh, "<$kwslist_in") || die "$0: Fail to open kwslist $kwslist_in\n";
  }

  # Read in the kwslist and parse it. Note that this is a naive, line-based
  # parse -- it simply assumes that the kwslist is "properly" generated.
  my (@KWS, @info);
  my $kwid;  # carried over from the most recent <detected_kwlist> line.
  my ($kwlist_filename, $language, $system_id) = ("", "", "");
  while (my $line = <$fh>) {
    chomp $line;

    if ($line =~ /<kwslist/) {
      # Header attributes: each one is only overwritten when it is present.
      if ($line =~ /language="(\S+)"/)        { $language = $1; }
      if ($line =~ /system_id="(\S+)"/)       { $system_id = $1; }
      if ($line =~ /kwlist_filename="(\S+)"/) { $kwlist_filename = $1; }
      @info = ($kwlist_filename, $language, $system_id);
      next;
    }

    if ($line =~ /<detected_kwlist/) {
      ($kwid) = $line =~ /kwid="(\S+)"/;
      next;
    }

    if ($line =~ /<kw/) {
      # An attribute that is missing from the line ends up undefined.
      my ($dur)     = $line =~ /dur="(\S+)"/;
      my ($file)    = $line =~ /file="(\S+)"/;
      my ($tbeg)    = $line =~ /tbeg="(\S+)"/;
      my ($score)   = $line =~ /score="(\S+)"/;
      my ($channel) = $line =~ /channel="(\S+)"/;
      push(@KWS, [$kwid, $file, $channel, $tbeg, $dur, $score, ""]);
    }
  }

  close($fh) unless $from_stdin;

  return [\@info, \@KWS];
}

# PrintKwslist(<info>, <KWS>)
# Render a keyword-search result list as kwslist XML and return it as a string.
#   <info>  ref to (kwlist_filename, language, system_id).
#   <KWS>   ref to a list of entries
#           [kwid, file, channel, tbeg, dur, score, decision, threshold?, raw_score?];
#           the last two slots are optional and only printed when defined.
# Consecutive entries with the same kwid are grouped into one
# <detected_kwlist> element, so the list is expected to be sorted by kwid.
sub PrintKwslist {
  my ($info, $KWS) = @_;

  my $kwslist = "";

  # Start printing
  $kwslist .= "<kwslist kwlist_filename=\"$info->[0]\" language=\"$info->[1]\" system_id=\"$info->[2]\">\n";
  my $prev_kw = "";
  # Tracks whether a <detected_kwlist> element is currently open, so that a
  # closing tag is only ever written for an element that was opened.  The
  # previous code closed one unconditionally at the end, which produced a
  # stray </detected_kwlist> for an empty list.
  my $group_open = 0;
  foreach my $kwentry (@{$KWS}) {
    if (!$group_open || $prev_kw ne $kwentry->[0]) {
      if ($group_open) {$kwslist .= "  </detected_kwlist>\n";}
      $kwslist .= "  <detected_kwlist search_time=\"1\" kwid=\"$kwentry->[0]\" oov_count=\"0\">\n";
      $prev_kw = $kwentry->[0];
      $group_open = 1;
    }
    $kwslist .= "    <kw file=\"$kwentry->[1]\" channel=\"$kwentry->[2]\" tbeg=\"$kwentry->[3]\" dur=\"$kwentry->[4]\" score=\"$kwentry->[5]\" decision=\"$kwentry->[6]\"";
    if (defined($kwentry->[7])) {$kwslist .= " threshold=\"$kwentry->[7]\"";}
    if (defined($kwentry->[8])) {$kwslist .= " raw_score=\"$kwentry->[8]\"";}
    $kwslist .= "/>\n";
  }
  if ($group_open) {$kwslist .= "  </detected_kwlist>\n";}
  $kwslist .= "</kwslist>\n";

  return $kwslist;
}

# Sort comparator (reads the package variables $a and $b set by sort).
# Orders entries by keyword id -- on the trailing number when both ids end in
# digits, as strings otherwise -- then by descending score (slot 5), then by
# file name (slot 1).
sub KwslistOutputSort {
  my ($id_a, $id_b) = ($a->[0], $b->[0]);

  if ($id_a ne $id_b) {
    if ($id_a =~ m/[0-9]+$/ and $id_b =~ m/[0-9]+$/) {
      my ($num_a) = $id_a =~ /([0-9]*)$/;
      my ($num_b) = $id_b =~ /([0-9]*)$/;
      return $num_a <=> $num_b;
    }
    return $id_a cmp $id_b;
  }

  # Same keyword: higher score first.
  return $b->[5] <=> $a->[5] if $a->[5] ne $b->[5];

  return $a->[1] cmp $b->[1];
}
# KwslistDupSort(<x>, <y>, <duptime>)
# Comparator used before duplicate removal.  Entries are ordered by keyword
# id, file and channel (slots 0-2, as strings).  Within the same
# keyword/file/channel, entries whose start times (slot 3) are at least
# <duptime> apart are ordered by start time; closer ones are ordered by
# descending score (slot 5) and then by descending duration (slot 4).
sub KwslistDupSort {
  my ($x, $y, $duptime) = @_;

  # Keyword id, file and channel are compared the same way, in that order.
  foreach my $slot (0, 1, 2) {
    return $x->[$slot] cmp $y->[$slot] if $x->[$slot] ne $y->[$slot];
  }

  return $x->[3] <=> $y->[3] if abs($x->[3] - $y->[3]) >= $duptime;
  return $y->[5] <=> $x->[5] if $x->[5] ne $y->[5];
  return $y->[4] <=> $x->[4];
}

# Usage text, printed when the number of positional arguments is wrong.
my $Usage = <<EOU;
This script reads a kwslist.xml file, does the post processing such as making decisions,
normalizing score, removing duplicates, etc. It writes the results to another kwslist.xml
file.

Usage: kwslist_post_process.pl [options] <kwslist_in|-> <kwslist_out|->
 e.g.: kwslist_post_process.pl kwslist.in.xml kwslist.out.xml

Allowed options:
  --beta        : Beta value when computing ATWV                (float,   default = 999.9)
  --digits      : How many digits should the score use          (int,     default = "infinite")
  --duptime     : Tolerance for duplicates                      (float,   default = 0.5)
  --duration    : Duration of the audio (Actural length/2)      (float,   default = 3600)
  --normalize   : Normalize scores or not                       (boolean, default = false)
  --Ntrue-scale : Keyword independent scale factor for Ntrue    (float,   default = 1.0)
  --remove-dup  : Remove duplicates                             (boolean, default = false)
  --remove-NO   : Remove the "NO" decision instances            (boolean, default = false)
  --verbose     : Verbose level (higher --> more kws section)   (integer, default 0)
  --YES-cutoff  : Only keep "\$YES-cutoff" yeses for each kw     (int,     default = -1)

EOU

# Option defaults.  The boolean options are kept as the strings "true" and
# "false" and are validated below.
my $beta = 999.9;
my $duration = 3600;
my $normalize = "false";
my $verbose = 0;
my $Ntrue_scale = 1.0;
my $remove_dup = "false";
my $duptime = 0.5;
my $remove_NO = "false";
my $digits = 0;    # 0 selects the "%g" score format further down.
my $YES_cutoff = -1;
GetOptions('beta=f'     => \$beta,
  'duration=f'          => \$duration,
  'normalize=s'         => \$normalize,
  'verbose=i'           => \$verbose,
  'Ntrue-scale=f'       => \$Ntrue_scale,
  'remove-dup=s'        => \$remove_dup,
  'duptime=f'           => \$duptime,
  'remove-NO=s'         => \$remove_NO,
  'digits=i'            => \$digits,
  'YES-cutoff=i'        => \$YES_cutoff);

# The boolean options are declared as strings ("=s"), so reject anything
# other than the two accepted spellings.
($normalize eq "true" || $normalize eq "false") || die "$0: Bad value for option --normalize\n";
($remove_dup eq "true" || $remove_dup eq "false") || die "$0: Bad value for option --remove-dup\n";
($remove_NO eq "true" || $remove_NO eq "false") || die "$0: Bad value for option --remove-NO\n";

@ARGV == 2 || die $Usage;

# Workout the input/output source
my $kwslist_in = shift @ARGV;
my $kwslist_out = shift @ARGV;

# $info holds the <kwslist> header attributes; $KWS is the list of entries
# [kwid, file, channel, tbeg, dur, score, decision].
my ($info, $KWS) = @{ReadKwslist($kwslist_in)};

# Work out the Ntrue: for every keyword, the sum of the scores (slot 5) of
# all of its entries.
my %Ntrue;
foreach my $kwentry (@{$KWS}) {
  if (!defined($Ntrue{$kwentry->[0]})) {
    $Ntrue{$kwentry->[0]} = 0.0;
  }
  $Ntrue{$kwentry->[0]} += $kwentry->[5];
}

# Scale the Ntrue and work out the expected count based threshold:
#   threshold = Ntrue / (duration/beta + (beta-1)/beta * Ntrue)
my %threshold;
foreach my $key (keys %Ntrue) {
  $Ntrue{$key} *= $Ntrue_scale;
  $threshold{$key} = $Ntrue{$key}/($duration/$beta+($beta-1)/$beta*$Ntrue{$key});
}

# Removing duplicates
if ($remove_dup eq "true") {
  my @tmp = sort {KwslistDupSort($a, $b, $duptime)} @{$KWS};
  my @KWS = ();
  push(@KWS, $tmp[0]);
  for (my $i = 1; $i < scalar(@tmp); $i ++) {
    my $prev = $KWS[-1];
    my $curr = $tmp[$i];
    if ((abs($prev->[3]-$curr->[3]) < $duptime ) &&
        ($prev->[2] eq $curr->[2]) &&
        ($prev->[1] eq $curr->[1]) &&
        ($prev->[0] eq $curr->[0])) {
      next;
    } else {
      push(@KWS, $curr);
    }
  }
  $KWS = \@KWS;
}

# Output format for scores: "%g" by default, or fixed-point with $digits
# decimal places when a positive --digits value was given.
my $format_string = "%g";
# $digits is declared as an integer option, so test it numerically; the
# string operator "gt" only gives the right answer by coincidence.
if ($digits > 0) {
  $format_string = "%." . $digits ."f";
}

# Making decisions...
my %YES_count;
foreach my $kwentry (@{$KWS}) {
  my $threshold = $threshold{$kwentry->[0]};
  if ($kwentry->[5] > $threshold) {
    $kwentry->[6] = "YES";
    if (defined($YES_count{$kwentry->[0]})) {
      $YES_count{$kwentry->[0]} ++;
    } else {
      $YES_count{$kwentry->[0]} = 1;
    }
  } else {
    $kwentry->[6] = "NO";
    if (!defined($YES_count{$kwentry->[0]})) {
      $YES_count{$kwentry->[0]} = 0;
    }
  }
  if ($verbose > 0) {
    push(@{$kwentry}, sprintf("%g", $threshold));
  }
  if ($normalize eq "true") {
    if ($verbose > 0) {
      push(@{$kwentry}, $kwentry->[5]);
    }
    my $numerator = (1-$threshold)*$kwentry->[5];
    my $denominator = (1-$threshold)*$kwentry->[5]+(1-$kwentry->[5])*$threshold;
    if ($denominator != 0) {
      $kwentry->[5] = sprintf($format_string, $numerator/$denominator);
    } else {
      $kwentry->[5] = sprintf($format_string, $kwentry->[5]);
    }
  } else {
    $kwentry->[5] = sprintf($format_string, $kwentry->[5]);
  }
}

# Sorting and printing
my @tmp = sort KwslistOutputSort @{$KWS};

# Process the YES-cutoff. Note that you don't need this for the normal cases where
# hits and false alarms are balanced
if ($YES_cutoff != -1) {
  my $count = 1;
  for (my $i = 1; $i < scalar(@tmp); $i ++) {
    if ($tmp[$i]->[0] ne $tmp[$i-1]->[0]) {
      $count = 1;
      next;
    }
    if ($YES_count{$tmp[$i]->[0]} > $YES_cutoff*2) {
      $tmp[$i]->[6] = "NO";
      $tmp[$i]->[5] = 0;
      next;
    }
    if (($count == $YES_cutoff) && ($tmp[$i]->[6] eq "YES")) {
      $tmp[$i]->[6] = "NO";
      $tmp[$i]->[5] = 0;
      next;
    }
    if ($tmp[$i]->[6] eq "YES") {
      $count ++;
    }
  }
}

# Process the remove-NO decision
if ($remove_NO eq "true") {
  my @KWS = @tmp;
  @tmp = ();
  for (my $i = 0; $i < scalar(@KWS); $i ++) {
    if ($KWS[$i]->[6] eq "YES") {
      push(@tmp, $KWS[$i]);
    }
  }
}

# Printing
my $kwslist = PrintKwslist($info, \@tmp);

if ($kwslist_out eq "-") {
  print $kwslist;
} else {
  open(O, ">$kwslist_out") || die "$0: Fail to open output file $kwslist_out\n";
  print O $kwslist;
  close(O);
}


================================================
FILE: egs/utils/lang/add_unigrams_arpa.pl
================================================
#!/usr/bin/env perl

# Copyright 2018  Xiaohui Zhang
# Apache 2.0.
#
use strict;
use warnings;
use Getopt::Long;

my $Usage = <<EOU;
# This is a simple script to add unigrams to an ARPA lm file.
Usage: utils/lang/add_unigrams_arpa.pl [options] <oov-prob-file> <scale> <input-arpa >output-arpa
<oov-prob-file> contains a list of words and their probabilities, e.g. "jack 0.2". All probs will be
scaled by a positive scalar <scale> and then be used as the unigram prob. of the added word.
The scale should approximiately relect the OOV rate of the language in concern.
EOU

my @OOVS;

if (@ARGV != 2) {
  die $Usage;
}

# Gets parameters. Only two positional arguments are accepted; the ARPA LM
# itself is read from STDIN and the result is written to STDOUT.
my $oov_prob_file = shift @ARGV;
my $scale = shift @ARGV;

# Reads the whole OOV list ("<word> <prob>" per line) into memory, then
# releases the handle because it is not needed again.
open(F, "<$oov_prob_file") || die "$0: Fail to open $oov_prob_file\n";
while (<F>) { push @OOVS, $_; }
close(F);

$scale > 0.0 || die "Bad scale";
print STDERR "$0: Creating LM file with additional unigrams, using $oov_prob_file\n";

my %vocab;         # words already present in (or added to) the unigram list
my $unigram = 0;   # 1 while we are inside the \1-grams: section
my $num_unigrams = 0;
my @lines;         # buffered head + unigram section, printed once the count is final

# Parse and record the head and unigrams in the ARPA LM.
while(<STDIN>) {
  if (m/^ngram 1=(\d+)/) { $num_unigrams = $1; }

  # Buffering stops at the 2-gram section header; that header is consumed
  # here and re-emitted below, and the remainder of STDIN is streamed through.
  if (m/^\\2-grams:$/) { last; }
  if (m/^\\1-grams:$/) { $unigram = 1; push(@lines, $_); next; }

  my @col = split(" ", $_);
  if ( $unigram == 1 ) {
    # Record in-vocab words into a map.
    if ( @col > 0 ) {
      my $word = $col[1];
      $vocab{$word} = 1;
      push(@lines, $_);
    } else {
      # A blank line inside the unigram section is where the new entries go.
      # Insert out-of-vocab words and their probs into the unigram list.
      foreach my $l (@OOVS) {
        my @A = split(" ", $l);
        # Report the offending OOV line itself ($_ is the blank ARPA line here).
        @A == 2 || die "bad line in oov2prob: $l;";
        my $word = $A[0];
        my $prob = $A[1];
        if (exists($vocab{$word})) { next; }
        $num_unigrams ++;
        my $log10prob = (log($prob * $scale) / log(10.0));
        $vocab{$word} = 1;
        my $line = sprintf("%.6f\t$word\n", $log10prob);
        push(@lines, $line);
      }
    }
  } else { push(@lines, $_); }
}

# Print the head and unigrams, with the updated # unigrams in the head.
foreach my $l (@lines) {
  if ($l =~ m/ngram 1=/) {
    print "ngram 1=$num_unigrams\n";
  } else {
    print $l;
  }
}

# Print the left fields.
print "\n\\2-grams:\n";
while(<STDIN>) {
  print;
}

exit 0


================================================
FILE: egs/utils/lang/adjust_unk_arpa.pl
================================================
#!/usr/bin/env perl

# Copyright 2018  Xiaohui Zhang
# Apache 2.0.
#
use strict;
use warnings;
use Getopt::Long;

my $Usage = <<EOU;
# This is a simple script to set/scale the prob of n-grams where the OOV dict entry is the predicted word, in an ARPA lm file.
Usage: utils/lang/adjust_unk_arpa.pl [options] <oov-dict-entry> <unk-scale> <input-arpa >output-arpa

Allowed options:
  --fixed-value (true|false)   : If true, interpret the unk-scale as a fixed value we'll set to
                                 the unigram prob of the OOV dict entry, rather than using it to
                                 scale the probs. In this case higher order n-grams containing
                                 the OOV dict entry remain untouched. This is useful when the OOV
                                 dict entry doesn't appear in n-grams (n>1) as the predicted word.
EOU

my $fixed_value = "false";
GetOptions('fixed-value=s' => \$fixed_value);

($fixed_value eq "true" || $fixed_value eq "false") ||
  die "$0: Bad value for option --fixed-value\n";

if (@ARGV != 2) {
  die $Usage;
}

# Gets parameters. Only two positional arguments are accepted; the ARPA LM
# itself is read from STDIN and the result is written to STDOUT.
my $unk_word = shift @ARGV;
my $unk_scale = shift @ARGV;

$unk_scale > 0.0 || die "Bad unk_scale"; # this must be positive
if ( $fixed_value eq "true" ) {
  print STDERR "$0: Setting the unigram prob of $unk_word in LM file as $unk_scale.\n";
} else {
  print STDERR "$0: Scaling the probs of ngrams where $unk_word is the predicted word in LM file by $unk_scale.\n";
}

my $ngram = 0; # the order of ngram we are visiting (0 = still in the header)

# ARPA files store log10 probabilities, so the scale is applied as an offset.
my $log10_scale = log($unk_scale) / log(10.0);

# Change the prob of n-grams whose predicted (last) word is the unk-word.
while(<STDIN>) {
  # Section headers look like "\N-grams:"; track N for any n-gram order
  # rather than a fixed list of orders.
  if (m/^\\(\d+)-grams:$/) { $ngram = $1; }
  my @col = split(" ", $_);
  # Column 0 is the log-prob and columns 1..N are the words, so the predicted
  # word of an N-gram sits at index N. Require that the column exists before
  # comparing it.
  if ( $ngram > 0 && @col > $ngram && $col[$ngram] eq $unk_word ) {
    if ( $fixed_value eq "true" && $ngram == 1 ) {
      $col[0] = $log10_scale;
    } elsif ($fixed_value eq "false" ) {
      $col[0] += $log10_scale;
    }
    my $line = join("\t", @col);
    print "$line\n";
  } else {
    print;
  }
}

exit 0


================================================
FILE: egs/utils/lang/adjust_unk_graph.sh
================================================
#!/usr/bin/env bash
# Copyright 2018 Xiaohui Zhang
# Apache 2.0

# This script copies a fully expanded decoding graph (HCLG.fst) and scales the scores
# of all arcs whose output symbol is a user-specified OOV symbol (or any other word).
# This achieves an equivalent effect of utils/lang/adjust_unk_arpa.pl, which scales
# the LM prob of all ngrams predicting an OOV symbol, while avoiding re-creating the graph.

set -o pipefail

if [ $# != 4 ]; then
   echo "Usage: utils/lang/adjust_unk_graph.sh <oov-dict-entry> <scale> <in-graph-dir> <out-graph-dir>"
   echo "e.g.: utils/lang/adjust_unk_graph.sh \"<unk>\" 0.1 exp/tri1/graph exp/tri1/graph_unk_scale_0.1"
   exit 1;
fi

if [ -f path.sh ]; then . ./path.sh; fi

oov_word=$1
unk_scale=$2
graphdir_in=$3
graphdir_out=$4

mkdir -p "$graphdir_out"

# Every listed entry must exist in the input graph directory. All of them
# except HCLG.fst are copied unchanged; HCLG.fst is regenerated below, so
# copying it first would only be overwritten.
required="HCLG.fst words.txt disambig_tid.int num_pdfs phones phones.txt"
for f in $required; do
  if [ ! -e "$graphdir_in/$f" ]; then
    echo "adjust_unk_graph.sh: expected $graphdir_in/$f to exist"
    exit 1;
  fi
  if [ "$f" != "HCLG.fst" ]; then
    cp -r "$graphdir_in/$f" "$graphdir_out"
  fi
done

# Map the OOV symbol to its integer id; an empty result means the symbol is
# not in words.txt.
oov_id=$(echo "$oov_word" | utils/sym2int.pl "$graphdir_in/words.txt")
if [ -z "$oov_id" ]; then
  echo "adjust_unk_graph.sh: the specified oov symbol $oov_word is out of the vocabulary."
  exit 1;
fi

# In the text form of the graph, field 4 is compared against the OOV id and
# field 5 is shifted by -log(unk_scale) on matching lines.
fstprint "$graphdir_in/HCLG.fst" | \
  awk -v oov="$oov_id" -v unk_scale="$unk_scale" '{if($4==oov) $5=$5-log(unk_scale);print $0}' | \
  fstcompile | fstconvert --fst_type=const > "$graphdir_out/HCLG.fst" || exit 1;


================================================
FILE: egs/utils/lang/bpe/add_final_optional_silence.sh
================================================
#!/usr/bin/env bash
# Appends an optional final-silence FST to L.fst and L_disambig.fst in a lang
# directory. The originals are kept as *.orig and the probability used is
# recorded in <lang>/phones/final_sil_prob.
. ./path.sh

# Default probability; can be overridden with --final-sil-prob (handled by
# parse_options.sh below).
final_sil_prob=0.5

echo "$0 $@"  # Print the command line for logging

. ./utils/parse_options.sh

if [ $# -ne 1 ]; then
  echo "Usage: $0  <lang>"
  echo " Add final optional silence to lexicon FSTs (L.fst and L_disambig.fst) in"
  echo " lang/ directory <lang>."
  echo " This can be useful in systems with byte-pair encoded (BPE) lexicons, in which"
  echo " the word-initial silence is part of the lexicon, so we turn off the standard"
  echo " optional silence in the lexicon"
  echo "options:"
  echo "   --final-sil-prob <final silence probability>      # default 0.5"
  exit 1;
fi

lang=$1

# Refuse to run if the marker file is newer than nonsilence.txt; presumably
# this guards against concatenating the silence FST twice.
if [ $lang/phones/final_sil_prob -nt $lang/phones/nonsilence.txt ]; then
  echo "$0 $lang/phones/final_sil_prob exists. Exiting..."
  exit 1;
fi

# Integer id of the optional-silence phone, used as the arc's input label.
silphone=$(cat $lang/phones/optional_silence.int)

# Floating-point comparisons are delegated to perl; each variable ends up
# holding the word "true" or "false", which is then run as a shell command
# in the conditionals below.
sil_eq_zero=$(echo $(perl -e "if ( $final_sil_prob == 0.0) {print 'true';} else {print 'false';}"))
sil_eq_one=$(echo $(perl -e "if ( $final_sil_prob == 1.0) {print 'true';} else {print 'false';}"))
sil_lt_zero=$(echo $(perl -e "if ( $final_sil_prob < 0.0) {print 'true';} else {print 'false';}"))
sil_gt_one=$(echo $(perl -e "if ( $final_sil_prob > 1.0) {print 'true';} else {print 'false';}"))

if  $sil_lt_zero || $sil_gt_one; then
  echo "$0 final-sil-prob should be between 0.0 and 1.0. Final silence was not added."
  exit 1;
else
  if $sil_eq_zero; then
    echo "$0 final-sil-prob = 0 => Final silence was not added."
    exit 0;
  elif $sil_eq_one; then
    # Probability 1: a single unweighted silence arc 0 -> 1, only state 1 final.
    ( echo "0 1 $silphone 0";
      echo "1" ) | fstcompile > $lang/final_sil.fst
  else
    # NOTE(review): perl's log is the natural log, so this value is negative
    # for probabilities below 1, and the same value is used both for taking
    # the silence arc and for stopping at state 0. Confirm this matches the
    # weight convention fstcompile expects (costs are commonly -log(p)).
    log_silprob=$(echo $(perl -e "print log $final_sil_prob"))
    ( echo "0 1 $silphone 0 $log_silprob";
      echo "0 $log_silprob";
      echo "1" ) | fstcompile > $lang/final_sil.fst
  fi
  # Keep the original lexicon FSTs, then rebuild each as <orig> + final_sil.
  mv $lang/L.fst $lang/L.fst.orig
  mv $lang/L_disambig.fst $lang/L_disambig.fst.orig
  fstconcat $lang/L.fst.orig $lang/final_sil.fst | fstarcsort --sort_type=olabel > $lang/L.fst
  fstconcat $lang/L_disambig.fst.orig $lang/final_sil.fst | fstarcsort --sort_type=olabel > $lang/L_disambig.fst
  # Record the probability; this file is also what the guard above checks.
  echo "$final_sil_prob" > $lang/phones/final_sil_prob
fi


================================================
FILE: egs/utils/lang/bpe/apply_bpe.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Rico Sennrich
# Released under the MIT License.

"""Use operations learned with learn_bpe.py to encode a new text.
The text will not be smaller, but use only a fixed vocabulary, with rare words
encoded as variable-length sequences of subword units.

Reference:
Rico Sennrich, Barry Haddow and Alexandra Birch (2015). Neural Machine Translation of Rare Words with Subword Units.
Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany.
"""

from __future__ import unicode_literals, division

import sys
import codecs
import io
import argparse
import re

# hack for python2/3 compatibility
from io import open
argparse.open = open

class BPE(object):
    """Applies a learned sequence of BPE merge operations to text.

    Args:
        codes: Seekable text file object with one merge operation
            ("left right") per line, optionally preceded by a
            "#version: X.Y" header line.
        merges: Number of merge operations to use; -1 uses all of them.
        separator: Marker appended to every non-final subword unit.
        vocab: Optional collection of allowed subword units, handed to
            encode() for every word.
        glossaries: Optional list of strings that are kept intact.
    """

    def __init__(self, codes, merges=-1, separator='@@', vocab=None, glossaries=None):

        codes.seek(0)

        # check version information; without a header the file is treated
        # as version 0.1 and the first line is a regular merge operation.
        firstline = codes.readline()
        if firstline.startswith('#version:'):
            version_text = re.sub(r'(\.0+)*$', '', firstline.split()[-1])
            self.version = tuple([int(x) for x in version_text.split(".")])
        else:
            self.version = (0, 1)
            codes.seek(0)

        merge_ops = [tuple(item.strip().split(' '))
                     for (n, item) in enumerate(codes)
                     if (n < merges or merges == -1)]

        for item in merge_ops:
            if len(item) != 2:
                sys.stderr.write('Error: invalid line in BPE codes file: {0}\n'.format(' '.join(item)))
                sys.stderr.write('The line should exist of exactly two subword units, separated by whitespace\n')
                sys.exit(1)

        # Map each merge pair to its priority (line index). Walking the list
        # backwards means that, for duplicated pairs, the value written last
        # is the index of the first occurrence.
        self.bpe_codes = {}
        for rank in reversed(range(len(merge_ops))):
            self.bpe_codes[merge_ops[rank]] = rank

        # Map each merged symbol back to the pair it was built from.
        self.bpe_codes_reverse = {}
        for pair in self.bpe_codes:
            self.bpe_codes_reverse[pair[0] + pair[1]] = pair

        self.separator = separator

        self.vocab = vocab

        self.glossaries = glossaries if glossaries else []

        # word -> segmentation, filled in by encode()
        self.cache = {}

    def process_line(self, line):
        """Segment line, preserving its leading and trailing whitespace."""
        leading_whitespace = len(line) - len(line.lstrip())
        trailing_whitespace = len(line) - len(line.rstrip())

        out = line[:leading_whitespace] + self.segment(line)

        # For a whitespace-only line the leading and trailing runs are the
        # same characters; emit them once so that e.g. "\n" stays "\n"
        # instead of becoming "\n\n".
        if trailing_whitespace and trailing_whitespace != len(line):
            out += line[-trailing_whitespace:]

        return out

    def segment(self, sentence):
        """Segment single sentence (space-tokenized string) with BPE encoding.

        Returns the subword units joined by single spaces, with the
        separator appended to every unit that is not the last of its word.
        """
        output = []
        for word in sentence.strip().split(' '):
            # eliminate double spaces
            if not word:
                continue
            new_word = [out for segment in self._isolate_glossaries(word)
                        for out in encode(segment,
                                          self.bpe_codes,
                                          self.bpe_codes_reverse,
                                          self.vocab,
                                          self.separator,
                                          self.version,
                                          self.cache,
                                          self.glossaries)]

            for item in new_word[:-1]:
                output.append(item + self.separator)
            output.append(new_word[-1])

        return ' '.join(output)

    def _isolate_glossaries(self, word):
        """Split word around every glossary entry it contains.

        Each glossary is applied in turn to the segments produced so far.
        """
        word_segments = [word]
        for gloss in self.glossaries:
            word_segments = [out_segments for segment in word_segments
                                 for out_segments in isolate_glossary(segment, gloss)]
        return word_segments

def create_parser():
    """Build the command-line parser for applying BPE codes to text.

    Returns:
        argparse.ArgumentParser with the options --input, --codes (required),
        --merges, --output, --separator, --vocabulary,
        --vocabulary-threshold and --glossaries.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="apply BPE-based word segmentation")

    parser.add_argument(
        '--input', '-i', type=argparse.FileType('r'), default=sys.stdin,
        metavar='PATH',
        help="Input file (default: standard input).")
    parser.add_argument(
        '--codes', '-c', type=argparse.FileType('r'), metavar='PATH',
        required=True,
        help="File with BPE codes (created by learn_bpe.py).")
    parser.add_argument(
        '--merges', '-m', type=int, default=-1,
        metavar='INT',
        help="Use this many BPE operations (<= number of learned symbols); " +
             "default: Apply all the learned merge operations")
    parser.add_argument(
        '--output', '-o', type=argparse.FileType('w'), default=sys.stdout,
        metavar='PATH',
        help="Output file (default: standard output)")
    parser.add_argument(
        '--separator', '-s', type=str, default='@@', metavar='STR',
        help="Separator between non-final subword units (default: '%(default)s')")
    parser.add_argument(
        '--vocabulary', type=argparse.FileType('r'), default=None,
        metavar="PATH",
        help="Vocabulary file (built with get_vocab.py). If provided, this script reverts any merge operations that produce an OOV.")
    parser.add_argument(
        '--vocabulary-threshold', type=int, default=None,
        metavar="INT",
        help="Vocabulary threshold. If vocabulary is provided, any word with frequency < threshold will be treated as OOV")
    parser.add_argument(
        '--glossaries', type=str, nargs='+', default=None,
        metavar="STR",
        help="Glossaries. The strings provided in glossaries will not be affected " +
             "by the BPE (i.e. they will neither be broken into subwords, nor concatenated with other subwords)")

    return parser

def get_pairs(word):
    """Return set of symbol pairs in a word.

    word is represented as tuple of symbols (symbols being variable-length strings)
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs

def encode(orig, bpe_codes, bpe_codes_reverse, vocab, separator, version, cache, glossaries=None):
    """Encode word based on list of BPE merge operations, which are applied consecutively

    Args:
        orig: The word to segment.
        bpe_codes: Mapping from symbol pair to merge priority (lower first).
        bpe_codes_reverse: Mapping from merged symbol to the pair it came
            from; only used when vocab is given.
        vocab: Optional collection of allowed units; when truthy the result
            is post-processed by check_vocab_and_split().
        separator: Marker for non-final units, used for the vocab lookup.
        version: (0, 1) or (0, 2); selects how the end-of-word marker
            '</w>' is attached.
        cache: Dict used to memoise results per word; updated in place.
        glossaries: Optional collection of words returned unsegmented.

    Returns:
        The segmentation as a sequence of subword units. A word that yields
        no symbol pairs at all is returned unchanged as a string.

    Raises:
        NotImplementedError: If version is neither (0, 1) nor (0, 2).
    """

    if orig in cache:
        return cache[orig]

    # glossaries defaults to None, so guard before the membership test.
    if glossaries and orig in glossaries:
        cache[orig] = (orig,)
        return (orig,)

    if version == (0, 1):
        word = tuple(orig) + ('</w>',)
    elif version == (0, 2): # more consistent handling of word-final segments
        word = tuple(orig[:-1]) + ( orig[-1] + '</w>',)
    else:
        raise NotImplementedError

    pairs = get_pairs(word)

    if not pairs:
        return orig

    while True:
        # Pick the pair with the best (lowest) merge priority; pairs without
        # a learned merge rank as infinity, which ends the loop.
        bigram = min(pairs, key = lambda pair: bpe_codes.get(pair, float('inf')))
        if bigram not in bpe_codes:
            break
        first, second = bigram
        new_word = []
        i = 0
        while i < len(word):
            try:
                j = word.index(first, i)
            except ValueError:
                # no further occurrence of `first`: copy the rest verbatim
                new_word.extend(word[i:])
                break
            new_word.extend(word[i:j])
            i = j

            if i < len(word)-1 and word[i+1] == second:
                new_word.append(first+second)
                i += 2
            else:
                new_word.append(word[i])
                i += 1
        word = tuple(new_word)
        if len(word) == 1:
            break
        pairs = get_pairs(word)

    # don't print end-of-word symbols
    if word[-1] == '</w>':
        word = word[:-1]
    elif word[-1].endswith('</w>'):
        word = word[:-1] + (word[-1].replace('</w>',''),)

    if vocab:
        word = check_vocab_and_split(word, bpe_codes_reverse, vocab, separator)

    cache[orig] = word
    return word

def recursive_split(segment, bpe_codes, vocab, separator, final=False):
    """Recursively split segment into smaller units (by reversing BPE merges)
    until all units are either in-vocabulary, or cannot be split further.

    Args:
        segment: The subword unit to split.
        bpe_codes: Mapping from merged symbol to the (left, right) pair it
            was built from; word-final symbols are keyed with '</w>'.
        vocab: Collection of allowed units; non-final units are looked up
            with the separator appended.
        separator: Marker appended to non-final units for the lookup.
        final: Whether segment is the last unit of its word.

    Yields:
        The resulting units, left to right.
    """

    try:
        if final:
            left, right = bpe_codes[segment + '</w>']
            right = right[:-4]  # drop the trailing '</w>'
        else:
            left, right = bpe_codes[segment]
    except KeyError:
        # segment is not the product of a known merge: cannot split further
        yield segment
        return

    if left + separator in vocab:
        yield left
    else:
        for item in recursive_split(left, bpe_codes, vocab, separator, False):
            yield item

    if (final and right in vocab) or (not final and right + separator in vocab):
        yield right
    else:
        for item in recursive_split(right, bpe_codes, vocab, separator, final):
            yield item

def check_vocab_and_split(orig, bpe_codes, vocab, separator):
    """Replace out-of-vocabulary units of a segmented word by smaller ones.

    Every unit but the last is looked up in vocab with the separator
    appended; the last unit is looked up as is. Units that are not found
    are broken up with recursive_split(), which undoes BPE merges.

    Returns:
        List of units, in their original left-to-right order.
    """
    units = []

    # non-final units carry the separator in the vocabulary
    for inner in orig[:-1]:
        if inner + separator not in vocab:
            units.extend(recursive_split(inner, bpe_codes, vocab, separator, False))
        else:
            units.append(inner)

    # the word-final unit is stored without separator
    last = orig[-1]
    if last not in vocab:
        units.extend(recursive_split(last, bpe_codes, vocab, separator, True))
    else:
        units.append(last)

    return units


def read_vocabulary(vocab_file, threshold):
    """Read vocabulary file produced by get_vocab.py, and filter according to frequency threshold.

    Args:
        vocab_file: Iterable of lines of the form "<word> <frequency>".
        threshold: Minimum frequency a word needs in order to be kept, or
            None to keep every word.

    Returns:
        Set of the words that passed the filter.
    """

    vocabulary = set()

    for line in vocab_file:
        entry = line.strip()
        if not entry:
            # tolerate blank lines (e.g. a trailing empty line)
            continue
        word, freq = entry.split(' ')
        freq = int(freq)
        if threshold is None or freq >= threshold:
            vocabulary.add(word)

    return vocabulary

def isolate_glossary(word, glossary):
    """
    Cut a word into pieces so that each occurrence of glossary stands alone.

    Returns the list of pieces in order; empty pieces are dropped and the
    remaining ones are stripped of surrounding whitespace. A word that equals
    the glossary, or does not contain it, is returned as a one-element list.

    For example, if 'USA' is the glossary and '1934USABUSA' the word, the return value is:
        ['1934', 'USA', 'B', 'USA']
    """
    if word == glossary or glossary not in word:
        return [word]

    chunks = word.split(glossary)
    remainder = chunks[-1]

    isolated = []
    for chunk in chunks[:-1]:
        # every chunk except the last was followed by one glossary occurrence
        if chunk != '':
            isolated.append(chunk.strip())
        isolated.append(glossary.strip())

    if remainder != '':
        isolated.append(remainder.strip())
    return isolated

# Script entry point: parse the command line, load the BPE codes (and the
# optional vocabulary), then segment the input line by line.
if __name__ == '__main__':

    # python 2/3 compatibility
    # Wrap the standard streams so that they read/write UTF-8 text.
    if sys.version_info < (3, 0):
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
    else:
        sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
        sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', write_through=True, line_buffering=True)

    parser = create_parser()
    args = parser.parse_args()

    # read/write files as UTF-8
    # Files given on the command line are reopened by name with an explicit
    # encoding; the stdin/stdout defaults are recognised by their .name and
    # left alone.
    args.codes = codecs.open(args.codes.name, encoding='utf-8')
    if args.input.name != '<stdin>':
        args.input = codecs.open(args.input.name, encoding='utf-8')
    if args.output.name != '<stdout>':
        args.output = codecs.open(args.output.name, 'w', encoding='utf-8')
    if args.vocabulary:
        args.vocabulary = codecs.open(args.vocabulary.name, encoding='utf-8')

    # The vocabulary is optional; None disables the vocabulary filter.
    if args.vocabulary:
        vocabulary = read_vocabulary(args.vocabulary, args.vocabulary_threshold)
    else:
        vocabulary = None

    bpe = BPE(args.codes, args.merges, args.separator, vocabulary, args.glossaries)

    # Each input line is segmented and written out as it is read.
    for line in args.input:
        args.output.write(bpe.process_line(line))


================================================
FILE: egs/utils/lang/bpe/bidi.py
================================================
#!/usr/bin/env python3
# Copyright   2018 Chun-Chieh Chang

# This script is largely written by Stephen Rawls
# and uses the python package https://pypi.org/project/PyICU_BiDi/
# The code leaves right to left text alone and reverses left to right text.

import icu_bidi
import io
import sys
import unicodedata
# Characters whose Unicode bidirectional class is strongly right-to-left:
# R = strong right-to-left, AL = strong Arabic right-to-left.
rtl_set = {
    chr(code_point)
    for code_point in range(sys.maxunicode)
    if unicodedata.bidirectional(chr(code_point)) in ['R', 'AL']
}


def determine_text_direction(text):
    """Return the icu_bidi level for text: RTL if any character of text is
    in rtl_set, otherwise LTR."""
    if any(char in rtl_set for char in text):
        return icu_bidi.UBiDiLevel.UBIDI_RTL
    # no strongly right-to-left character was found
    return icu_bidi.UBiDiLevel.UBIDI_LTR

def utf8_visual_to_logical(text):
    """Reorder text with an icu_bidi.Bidi object running in inverse mode.

    The paragraph direction is taken from determine_text_direction(). Going
    by the function name, the input is in visual order and the result in
    logical order (the actual reordering is done by icu_bidi).
    """
    paragraph_direction = determine_text_direction(text)

    converter = icu_bidi.Bidi()
    converter.inverse = True
    converter.reordering_mode = icu_bidi.UBiDiReorderingMode.UBIDI_REORDER_INVERSE_LIKE_DIRECT
    # UBIDI_OPTION_INSERT_MARKS is the alternative that is not enabled here
    converter.reordering_options = icu_bidi.UBiDiReorderingOption.UBIDI_OPTION_DEFAULT
    converter.set_para(text, paragraph_direction, None)

    write_flags = (0
                   | icu_bidi.UBidiWriteReorderedOpt.UBIDI_DO_MIRRORING
                   | icu_bidi.UBidiWriteReorderedOpt.UBIDI_KEEP_BASE_COMBINING)
    return converter.get_reordered(write_flags)

def utf8_logical_to_visual(text):
    """Reorder text with an icu_bidi.Bidi object in its default mode.

    The paragraph direction is taken from determine_text_direction(). Going
    by the function name, the input is in logical order and the result in
    visual order (the actual reordering is done by icu_bidi).
    """
    paragraph_direction = determine_text_direction(text)

    converter = icu_bidi.Bidi()
    converter.reordering_mode = icu_bidi.UBiDiReorderingMode.UBIDI_REORDER_DEFAULT
    # UBIDI_OPTION_INSERT_MARKS is the alternative that is not enabled here
    converter.reordering_options = icu_bidi.UBiDiReorderingOption.UBIDI_OPTION_DEFAULT
    converter.set_para(text, paragraph_direction, None)

    write_flags = (0
                   | icu_bidi.UBidiWriteReorderedOpt.UBIDI_DO_MIRRORING
                   | icu_bidi.UBidiWriteReorderedOpt.UBIDI_KEEP_BASE_COMBINING)
    return converter.get_reordered(write_flags)


##main##
# Filter: read UTF-8 lines from stdin, reorder each one and write it to stdout.
sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding="utf8")
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf8")
for line in sys.stdin:
    line = line.strip()
    # The reordered string is additionally reversed character by character.
    line = utf8_logical_to_visual(line)[::-1]
    sys.stdout.write(line + '\n')


================================================
FILE: egs/utils/lang/bpe/learn_bpe.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Rico Sennrich
# Released under the MIT License.

"""Use byte pair encoding (BPE) to learn a variable-length encoding of the vocabulary in a text.
Unlike the original BPE, it does not compress the plain text, but can be used to reduce the vocabulary
of a text to a configurable number of symbols, with only a small increase in the number of tokens.

Reference:
Rico Sennrich, Barry Haddow and Alexandra Birch (2016). Neural Machine Translation of Rare Words with Subword Units.
Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany.
"""

from __future__ import unicode_literals
from __future__ import division
from __future__ import print_function

import sys
import codecs
import re
import copy
import argparse
from collections import defaultdict, Counter

# hack for python2/3 compatibility
from io import open
argparse.open = open

def create_parser():
    """Build the command-line parser of the BPE learning script.

    Returns:
        argparse.ArgumentParser with the options --input, --output,
        --symbols, --min-frequency, --dict-input and --verbose.
    """
    cli = argparse.ArgumentParser(
        description="learn BPE-based word segmentation",
        formatter_class=argparse.RawDescriptionHelpFormatter)
    add_option = cli.add_argument

    # input / output streams
    add_option(
        '--input', '-i',
        metavar='PATH',
        type=argparse.FileType('r'),
        default=sys.stdin,
        help="Input text (default: standard input).")
    add_option(
        '--output', '-o',
        metavar='PATH',
        type=argparse.FileType('w'),
        default=sys.stdout,
        help="Output file for BPE codes (default: standard output)")

    # learning parameters
    add_option(
        '--symbols', '-s',
        type=int,
        default=10000,
        help="Create this many new symbols (each representing a character n-gram) (default: %(default)s))")
    add_option(
        '--min-frequency',
        metavar='FREQ',
        type=int,
        default=2,
        help='Stop if no symbol pair has frequency >= FREQ (default: %(default)s))')

    # flags
    add_option(
        '--dict-input',
        action="store_true",
        help="If set, input file is interpreted as a dictionary where each line contains a word-count pair")
    add_option(
        '--verbose', '-v',
        action="store_true",
        help="verbose mode.")

    return cli

def get_vocabulary(fobj, is_dict=False):
    """Read text and return dictionary that encodes vocabulary

    Args:
        fobj: Iterable of text lines.
        is_dict: If true, every line must be a "<word> <count>" pair;
            otherwise lines are plain text split on single spaces and each
            non-empty token counts once.

    Returns:
        collections.Counter mapping word to frequency.

    Raises:
        SystemExit: With code 1 if, in dictionary mode, a line does not
            consist of exactly two space-separated fields.
    """
    vocab = Counter()
    for i, line in enumerate(fobj):
        if is_dict:
            try:
                word, count = line.strip().split(' ')
            except ValueError:
                # wrong number of fields on this line
                print('Failed reading vocabulary file at line {0}: {1}'.format(i, line))
                sys.exit(1)
            vocab[word] += int(count)
        else:
            for word in line.strip().split(' '):
                if word:
                    vocab[word] += 1
    return vocab

def update_pair_statistics(pair, changed, stats, indices):
    """Minimally update the indices and frequency of symbol pairs

    if we merge a pair of symbols, only pairs that overlap with occurrences
    of this pair are affected, and need to be updated.

    Args:
        pair: The (first, second) symbol pair that has just been merged.
        changed: Iterable of (j, word, old_word, freq) tuples, where j is the
            word's index, word its symbols after the merge, old_word its
            symbols before the merge and freq its frequency.
        stats: Mapping from symbol pair to frequency; updated in place.
        indices: Mapping from symbol pair to a per-word-index counter;
            updated in place.
    """
    # The merged pair itself no longer occurs anywhere.
    stats[pair] = 0
    indices[pair] = defaultdict(int)
    first, second = pair
    new_pair = first+second
    for j, word, old_word, freq in changed:

        # Pass 1 (over the old word): remove the counts of the pairs that
        # bordered each merged occurrence.
        # find all instances of pair, and update frequency/indices around it
        i = 0
        while True:
            # find first symbol
            try:
                i = old_word.index(first, i)
            except ValueError:
                break
            # if first symbol is followed by second symbol, we've found an occurrence of pair (old_word[i:i+2])
            if i < len(old_word)-1 and old_word[i+1] == second:
                # assuming a symbol sequence "A B C", if "B C" is merged, reduce the frequency of "A B"
                if i:
                    prev = old_word[i-1:i+1]
                    stats[prev] -= freq
                    indices[prev][j] -= 1
                if i < len(old_word)-2:
                    # assuming a symbol sequence "A B C B", if "B C" is merged, reduce the frequency of "C B".
                    # however, skip this if the sequence is A B C B C, because the frequency of "C B" will be reduced by the previous code block
                    if old_word[i+2] != first or i >= len(old_word)-3 or old_word[i+3] != second:
                        nex = old_word[i+1:i+3]
                        stats[nex] -= freq
                        indices[nex][j] -= 1
                # skip past both symbols of the occurrence just handled
                i += 2
            else:
                i += 1

        # Pass 2 (over the new word): add the counts of the pairs that now
        # border each occurrence of the merged symbol.
        i = 0
        while True:
            try:
                # find new pair
                i = word.index(new_pair, i)
            except ValueError:
                break
            # assuming a symbol sequence "A BC D", if "B C" is merged, increase the frequency of "A BC"
            if i:
                prev = word[i-1:i+1]
                stats[prev] += freq
                indices[prev][j] += 1
            # assuming a symbol sequence "A BC B", if "B C" is merged, increase the frequency of "BC B"
            # however, if the sequence is A BC BC, skip this step because the count of "BC BC" will be incremented by the previous code block
            if i < len(word)-1 and word[i+1] != new_pair:
                nex = word[i:i+2]
                stats[nex] += freq
                indices[nex][j] += 1
            i += 1


def get_pair_statistics(vocab):
    """Tally adjacent symbol pairs over the vocabulary and index their locations.

    `vocab` is enumerated as (word, freq) entries, each word being a
    non-empty sequence of symbols.

    Returns a tuple (stats, indices):
      stats   -- maps a pair (left, right) to the summed frequency of the
                 words it occurs in, counted once per occurrence
      indices -- maps a pair to {position of the word in vocab: number of
                 occurrences of the pair in that word}
    """
    pair_counts = defaultdict(int)
    pair_locations = defaultdict(lambda: defaultdict(int))

    for word_id, entry in enumerate(vocab):
        symbols, weight = entry
        left = symbols[0]
        for right in symbols[1:]:
            bigram = (left, right)
            pair_counts[bigram] += weight
            pair_locations[bigram][word_id] += 1
            left = right

    return pair_counts, pair_locations


def replace_pair(pair, vocab, indices):
    """Merge every occurrence of the symbol pair ('A', 'B') into the symbol 'AB'.

    Only the words listed in indices[pair] with a count of at least 1 are
    rewritten. `vocab` is updated in place, and a list of
    (word position, new word, old word, frequency) tuples is returned, one
    per rewritten word.
    """
    first, second = pair
    # The merged symbol serves as a regex replacement template, so literal
    # backslashes have to be doubled.
    merged = ''.join(pair).replace('\\','\\\\')
    # Match "first second" only when both are complete space-delimited symbols.
    pattern = re.compile(r'(?<!\S)' + re.escape(first + ' ' + second) + r'(?!\S)')

    if sys.version_info < (3, 0):
        occurrences = indices[pair].iteritems()
    else:
        occurrences = indices[pair].items()

    changes = []
    for j, count in occurrences:
        if count < 1:
            continue
        old_word, freq = vocab[j]
        rewritten = pattern.sub(merged, ' '.join(old_word))
        new_word = tuple(rewritten.split(' '))
        vocab[j] = (new_word, freq)
        changes.append((j, new_word, old_word, freq))

    return changes

def prune_stats(stats, big_stats, threshold):
    """Drop low-frequency pairs from `stats` so that max() has less to scan.

    Every pair whose frequency is below `threshold` is removed from `stats`
    and recorded in `big_stats`, which keeps the full statistics for when
    pruned items are needed again. A negative frequency is added to the
    value already held in `big_stats`; any other frequency overwrites it.
    """
    for pair, count in list(stats.items()):
        if not count < threshold:
            continue
        del stats[pair]
        if count < 0:
            big_stats[pair] += count
        else:
            big_stats[pair] = count


def main(infile, outfile, num_symbols, min_frequency=2, verbose=False, is_dict=False):
    """Learn num_symbols BPE operations from vocabulary, and write to outfile.

    Args:
        infile: input handle, forwarded together with `is_dict` to
            get_vocabulary(); the result is used as a word -> frequency mapping.
        outfile: writable handle; receives a '#version: 0.2' header followed
            by one 'left right' line per learned merge.
        num_symbols: maximum number of merge operations to learn.
        min_frequency: learning stops as soon as the best remaining pair is
            less frequent than this.
        verbose: if true, each merge is also reported on stderr.
        is_dict: forwarded unchanged to get_vocabulary().
    """

    # version 0.2 changes the handling of the end-of-word token ('</w>');
    # version numbering allows backward compatibility
    outfile.write('#version: 0.2\n')

    vocab = get_vocabulary(infile, is_dict)
    # turn every word into a tuple of symbols whose last symbol carries the '</w>' marker
    vocab = dict([(tuple(x[:-1])+(x[-1]+'</w>',) ,y) for (x,y) in vocab.items()])
    # most frequent words first; positions in this list are the word ids used by `indices`
    sorted_vocab = sorted(vocab.items(), key=lambda x: x[1], reverse=True)

    stats, indices = get_pair_statistics(sorted_vocab)
    # big_stats keeps the unpruned statistics; stats is the pruned working copy
    big_stats = copy.deepcopy(stats)
    # threshold is inspired by Zipfian assumption, but should only affect speed
    # NOTE(review): max() raises ValueError here when no symbol pair exists at all
    threshold = max(stats.values()) / 10
    for i in range(num_symbols):
        if stats:
            # ties on frequency are broken by comparing the pairs themselves
            most_frequent = max(stats, key=lambda x: (stats[x], x))

        # we probably missed the best pair because of pruning; go back to full statistics
        if not stats or (i and stats[most_frequent] < threshold):
            prune_stats(stats, big_stats, threshold)
            stats = copy.deepcopy(big_stats)
            most_frequent = max(stats, key=lambda x: (stats[x], x))
            # threshold is inspired by Zipfian assumption, but should only affect speed
            threshold = stats[most_frequent] * i/(i+10000.0)
            prune_stats(stats, big_stats, threshold)

        if stats[most_frequent] < min_frequency:
            sys.stderr.write('no pair has frequency >= {0}. Stopping\n'.format(min_frequency))
            break

        if verbose:
            sys.stderr.write('pair {0}: {1} {2} -> {1}{2} (frequency {3})\n'.format(i, most_frequent[0], most_frequent[1], stats[most_frequent]))
        outfile.write('{0} {1}\n'.format(*most_frequent))
        # apply the merge to the vocabulary, then update the pair counts for the changed words
        changes = replace_pair(most_frequent, sorted_vocab, indices)
        update_pair_statistics(most_frequent, changes, stats, indices)
        # the merged pair must not be selected again
        stats[most_frequent] = 0
        # prune on iterations 0, 100, 200, ...
        if not i % 100:
            prune_stats(stats, big_stats, threshold)


if __name__ == '__main__':

    # python 2/3 compatibility: the standard streams are re-wrapped as UTF-8;
    # on Python 3 the wrappers sit on top of the underlying byte buffers
    if sys.version_info < (3, 0):
        raw_err, raw_out, raw_in = sys.stderr, sys.stdout, sys.stdin
    else:
        raw_err, raw_out, raw_in = sys.stderr.buffer, sys.stdout.buffer, sys.stdin.buffer
    sys.stderr = codecs.getwriter('UTF-8')(raw_err)
    sys.stdout = codecs.getwriter('UTF-8')(raw_out)
    sys.stdin = codecs.getreader('UTF-8')(raw_in)

    args = create_parser().parse_args()

    # read/write files as UTF-8: named files are reopened through codecs,
    # the standard streams are used as they are
    reading_stdin = args.input.name == '<stdin>'
    if not reading_stdin:
        args.input = codecs.open(args.input.name, encoding='utf-8')
    writing_stdout = args.output.name == '<stdout>'
    if not writing_stdout:
        args.output = codecs.open(args.output.name, 'w', encoding='utf-8')

    main(
        args.input,
        args.output,
        args.symbols,
        args.min_frequency,
        args.verbose,
        is_dict=args.dict_input,
    )


================================================
FILE: egs/utils/lang/bpe/prepend_words.py
================================================
#!/usr/bin/env python3

# This script prepends '|' to every word in the transcript to mark
# the beginning of each word, so that the initial space of every word
# can be found after decoding.

import sys
import io
import re

# Words are separated by runs of spaces and/or tabs.
whitespace = re.compile("[ \t]+")
# stdin/stdout are re-wrapped as latin-1 text streams.
infile = io.TextIOWrapper(sys.stdin.buffer, encoding='latin-1')
output = io.TextIOWrapper(sys.stdout.buffer, encoding='latin-1')
for line in infile:
    stripped = line.strip(" \t\r\n")
    marked = ["|" + word for word in whitespace.split(stripped)]
    output.write(' '.join(marked) + '\n')


================================================
FILE: egs/utils/lang/bpe/reverse.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# This script reverses all sequences of Latin letters and digits
# (including words like MP3) to put them in the right order in the images.

import re, os, sys, io

in_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
out_stream = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
# A run that starts and ends with an ASCII letter or digit and may contain
# whitespace, '.' and ':' in between.
latin_run = re.compile(r'[a-zA-Z0-9][a-zA-Z0-9\s\.\:]*[a-zA-Z0-9]')
for line in in_stream:
    reversed_line = latin_run.sub(lambda match: match.group(0)[::-1], line)
    out_stream.write(reversed_line)


================================================
FILE: egs/utils/lang/check_g_properties.pl
================================================
#!/usr/bin/env perl

use IPC::Open2;

if (@ARGV != 1) {
  print "Usage: $0 [options] <lang_directory>\n";
  print "e.g.:  $0 data/lang\n";
  exit(1);
}

$lang = shift @ARGV;

# This script checks that G.fst in the lang.fst directory is OK with respect
# to certain expected properties, and returns nonzero exit status if a problem was
# detected.  It is called from validate_lang.pl.
# This only checks the properties of G that relate to disambiguation symbols,
# epsilons and forbidden symbols <s> and </s>.

if (! -e "$lang/G.fst") {
  print "$0: error: $lang/G.fst does not exist\n";
  exit(1);
}

# Read words.txt ("<symbol> <integer>" per line) and remember the integer ids
# of the forbidden symbols, of "#0", and of the "#nonterm..." symbols.
open(W, "<$lang/words.txt") || die "opening $lang/words.txt";
$hash_zero = -1;  # stays -1 if "#0" does not occur in words.txt
while (<W>) {
  @A = split(" ", $_);
  ($sym, $int) = @A;
  if ($sym eq "<s>" || $sym eq "</s>") { $is_forbidden{$int} = 1; }
  if ($sym eq "#0") { $hash_zero = $int; }
  if ($sym =~ m/^#nonterm/) { $is_nonterminal{$int} = 1; }
}

# The word-level disambiguation symbols come from phones/wdisambig_words.int
# if that file exists; otherwise "#0" is the only one.
if (-e "$lang/phones/wdisambig_words.int") {
  open(F, "<$lang/phones/wdisambig_words.int") || die "opening $lang/phones/wdisambig_words.int";
  while (<F>) {
    chop;
    $is_disambig{$_} = 1;
  }
} else {
  $is_disambig{$hash_zero} = 1;
}

# G is read as text from fstprint.  Selected arcs are written to I, the input
# of an 'fstcompile | fstinfo' pipeline whose output is read back from O.
$input_cmd = ". ./path.sh; fstprint $lang/G.fst|";
open(G, $input_cmd) || die "running command $input_cmd";

$info_cmd = ". ./path.sh; fstcompile | fstinfo ";
open2(O, I, "$info_cmd") || die "running command $info_cmd";

$has_epsilons = 0;

# Lines with at least 4 fields are checked; fields 2 and 3 (0-based) are
# treated as the input and output labels of the arc.
while (<G>) {
  @A = split(" ", $_);
  if (@A >= 4) {
    if ($is_forbidden{$A[2]} || $is_forbidden{$A[3]}) {
      chop;
      print "$0: validating $lang: error: line $_ in G.fst contains forbidden symbol <s> or </s>\n";
      exit(1);
    } elsif ($is_disambig{$A[2]}) {
      # forward the arc to the pipeline so that cycles of such arcs can be detected
      print I $_;
      if ($A[3] != 0) {
        chop;
        print "$0: validating $lang: error: line $_ in G.fst has disambig on input but no epsilon on output\n";
        exit(1);
      }
    } elsif ($A[2] == 0) {
      # epsilon-input arcs are forwarded too, and reported as a warning at the end
      print I $_;
      $has_epsilons = 1;
    } elsif ($A[2] != $A[3] && !$is_nonterminal{$A[2]} ) {
      chop;
      print "$0: validating $lang: error: line $_ in G.fst has inputs and outputs different but input is not disambig symbol or nonterminal.\n";
      exit(1);
    }
  }
}

close(I);  # tell 'fstcompile | fstinfo' pipeline that its input is done.
# The forwarded arcs form an FST of only disambig/epsilon-input arcs; if
# fstinfo reports it as cyclic, G is rejected.
while (<O>) {
  if (m/cyclic\s+y/) {
    print "$0: validating $lang: error: G.fst has cycles containing only disambig symbols and epsilons.  Would cause determinization failure\n";
    exit(1);
  }
}

if ($has_epsilons) {
  print "$0: warning: validating $lang: G.fst has epsilon-input arcs.  We don't expect these in most setups.\n";
}

print "--> $0 successfully validated $lang/G.fst\n";
exit(0);


================================================
FILE: egs/utils/lang/check_phones_compatible.sh
================================================
#!/usr/bin/env bash
# Copyright 2016 Hang Lyu

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This script exits with status zero if the phone symbols tables are the same
# except for possible differences in disambiguation symbols (meaning that all
# symbols except those beginning with a # are mapped to the same values).
# Otherwise it prints a warning and exits with status 1.
# For the sake of compatibility with other scripts that did not write the
# phones.txt to model directories, this script exits silently with status 0
# if one of the phone symbol tables does not exist.

. utils/parse_options.sh || exit 1;

if [ $# -ne 2 ]; then
  echo "Usage: utils/lang/check_phones_compatible.sh <phones-symbol-table1> <phones-symbol-table2>"
  echo "e.g.: utils/lang/check_phones_compatible.sh data/lang/phones.txt exp/tri3/phones.txt"
  exit 1;
fi

table_first=$1
table_second=$2

# Check whether the files exist.  The paths are quoted so that names containing
# whitespace or glob characters (or an empty argument) are tested as given.
if [ ! -f "$table_first" ]; then
  if [ ! -f "$table_second" ]; then
    echo "$0: Error! Both of the two phones-symbol tables are absent."
    echo "Please check your command"
    exit 1;
  else
    # The phones-symbol-table1 is absent. The model directory may have been created by an old script.
    # For back compatibility, this script exits silently with status 0.
    exit 0;
  fi
elif [ ! -f "$table_second" ]; then
  # The phones-symbol-table2 is absent. The model directory may have been created by an old script.
  # For back compatibility, this script exits silently with status 0.
  exit 0;
fi

# Check if the two tables are the same (except for possible difference in disambiguation symbols):
# lines starting with '#' are dropped from both tables before comparing.
if ! cmp -s <(grep -v "^#" "$table_first") <(grep -v "^#" "$table_second"); then
  echo "$0: phone symbol tables $table_first and $table_second are not compatible."
  exit 1;
fi

exit 0;


================================================
FILE: egs/utils/lang/compute_sentence_probs_arpa.py
================================================
#!/usr/bin/env python

# Dongji Gao

# We're using python 3.x style but want it to work in python 2.x

from __future__ import print_function
import argparse
import sys
import math

# Command-line interface.  The arguments are parsed here at module level, so
# `args` is a module global; output_result() reads args.log_base from it.
parser = argparse.ArgumentParser(description="This script evaluates the log probabilty (default log base is e) of each sentence "
                                             "from data (in text form), given a language model in arpa form "
                                             "and a specific ngram order.",
                                 epilog="e.g. ./compute_sentence_probs_arpa.py ARPA_LM NGRAM_ORDER TEXT_IN PROB_FILE --log-base=LOG_BASE",
                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("arpa_lm", type=str,
                    help="Input language model in arpa form.")
parser.add_argument("ngram_order", type=int,
                    help="Order of ngram")
# "-" selects stdin for text_in and stdout for prob_file (see check_args)
parser.add_argument("text_in", type=str,
                    help="Filename of input text file (each line will be interpreted as a sentence).")
parser.add_argument("prob_file", type=str,
                    help="Filename of output probability file.")
parser.add_argument("--log-base", type=float, default=math.exp(1),
                    help="Log base for log porbability")
args = parser.parse_args()

def check_args(args):
    """Validate the log base and open the input and output streams.

    On success, sets `args.text_in_handle` and `args.prob_file_handle`;
    a filename of "-" selects sys.stdin / sys.stdout respectively.

    Exits via sys.exit() if `args.log_base` is not a usable logarithm base.
    """
    # Validate before opening anything: opening prob_file in "w" mode would
    # truncate an existing file even though the run is about to be rejected.
    if args.log_base <= 0:
        sys.exit("compute_sentence_probs_arpa.py: Invalid log base (must be greater than 0)")
    if args.log_base == 1:
        # math.log(10, 1) divides by log(1) == 0 and raises ZeroDivisionError
        sys.exit("compute_sentence_probs_arpa.py: Invalid log base (must not be 1)")
    args.text_in_handle = sys.stdin if args.text_in == "-" else open(args.text_in, "r")
    args.prob_file_handle = sys.stdout if args.prob_file == "-" else open(args.prob_file, "w")

def is_logprob(input):
    """Return True if `input` is a "-" followed by text that float() accepts.

    Strings that do not start with "-" (including the empty string) give False.
    """
    if not input.startswith("-"):
        return False
    try:
        float(input[1:])
    except ValueError:
        # float() rejects the remainder, e.g. "-abc" or a lone "-"
        return False
    return True

def check_number(model_file, tot_num):
    """Compare the n-gram counts declared in the model header with `tot_num`.

    The first line of `model_file` is skipped; the following lines are read
    as "<text> N=COUNT" declarations up to the first line without "=" (or
    the end of the file).

    Returns a tuple (counts_match, max_ngram_order): whether the declared
    counts sum to `tot_num`, and the N of the last declaration read (0 if
    there was none).
    """
    cur_num = 0
    max_ngram_order = 0
    with open(model_file) as model:
        next(model, None)  # skip the first line
        for line in model:
            if "=" not in line:
                break
            declaration, _, count = line.rpartition("=")
            cur_num += int(count)
            max_ngram_order = int(declaration.split("=")[0].split()[-1])
    # Reached both on the first non-declaration line and at end of file, so
    # the caller can always unpack a tuple.
    return (cur_num == tot_num), max_ngram_order

# This function loads a language model in arpa form and saves it in a dictionary
# for computing the sentence probability of the input text file.
def load_model(model_file):
    """Parse the ARPA file `model_file` into an n-gram dictionary.

    Only lines starting with "-" are read as n-gram entries.  The first
    field is kept as the log-probability string.  If the last field passes
    is_logprob() it is kept as the backoff-weight string and the fields in
    between form the key; otherwise all remaining fields form the key.

    Returns a tuple (ngram_dict, number of entries), where ngram_dict maps
    "w1 ... wN" to (logprob,) or (logprob, backoff).

    Exits via sys.exit() if the first line is not the "\\data\\" header or if
    an n-gram occurs twice.
    """
    ngram_dict = {}
    with open(model_file) as model:
        lines = model.readlines()

    # check arpa form; an empty file has no header line at all, and only the
    # line terminator is stripped so that a header without one still matches
    if not lines or lines[0].rstrip("\r\n") != "\\data\\":
        sys.exit("compute_sentence_probs_arpa.py: Please make sure that language model is in arpa form.")

    for line in lines:
        if not line.startswith("-"):
            continue
        line_split = line.split()
        # NOTE(review): is_logprob() only accepts tokens starting with "-", so
        # a trailing backoff weight without a minus sign ends up in the key.
        if is_logprob(line_split[-1]):
            ngram_key = " ".join(line_split[1:-1])
            entry = (line_split[0], line_split[-1])
        else:
            ngram_key = " ".join(line_split[1:])
            entry = (line_split[0],)
        if ngram_key in ngram_dict:
            sys.exit("compute_sentence_probs_arpa.py: Duplicated ngram in arpa language model: {}.".format(ngram_key))
        ngram_dict[ngram_key] = entry

    return ngram_dict, len(ngram_dict)

def compute_sublist_prob(sub_list):
    """Return the log-probability of the last word of `sub_list` given the rest.

    Looks up the whole n-gram in the module-level `ngram_dict`; if it is
    missing, backs off to the n-gram without its first word and adds the
    backoff weight of the context (0.0 when the context has none).

    Exits via sys.exit() when even the single last word is not in the model.
    """
    if len(sub_list) == 0:
        sys.exit("compute_sentence_probs_arpa.py: Ngram substring not found in arpa language model, please check.")

    sub_string = " ".join(sub_list)
    if sub_string in ngram_dict:
        # values are stored as "-x" strings: drop the sign, parse, negate
        return -float(ngram_dict[sub_string][0][1:])

    backoff_substring = " ".join(sub_list[:-1])
    if backoff_substring not in ngram_dict or len(ngram_dict[backoff_substring]) < 2:
        backoff_weight = 0.0
    else:
        backoff_weight = -float(ngram_dict[backoff_substring][1][1:])
    return compute_sublist_prob(sub_list[1:]) + backoff_weight


def compute_begin_prob(sub_list):
    """Sum the log-probabilities of the proper prefixes of `sub_list`.

    Covers the prefixes of length 2 .. len(sub_list) - 1; the full list
    itself is left to the caller.
    """
    logprob = 0
    for i in range(1, len(sub_list) - 1):
        logprob += compute_sublist_prob(sub_list[:i + 1])
    return logprob


# The probability is computed in this way:
# p(word_N | word_N-1 ... word_1) = ngram_dict[word_1 ... word_N][0].
# Here ngram_dict is a dictionary that stores a tuple for each ngram.
# The first element of the tuple is the probability and the second is the backoff weight (if it exists).
# If the particular ngram (word_1 ... word_N) is not in the dictionary, then
# p(word_N | word_N-1 ... word_1) = p(word_N | word_(N-1) ... word_2) * backoff_weight(word_(N-1) | word_(N-2) ... word_1)
# If the sequence (word_(N-1) ... word_1) is not in the dictionary, then the backoff weight is replaced with 0.0 (log 1)
# More details can be found in https://cmusphinx.github.io/wiki/arpaformat/
def compute_sentence_prob(sentence, ngram_order):
    """Return the log-probability of the whitespace-separated `sentence`.

    Words that are not keys of `ngram_dict` are replaced by "<unk>".  Each
    word is conditioned on at most `ngram_order` - 1 preceding words.
    """
    sentence_split = sentence.split()
    for i in range(len(sentence_split)):
        if sentence_split[i] not in ngram_dict:
            sentence_split[i] = "<unk>"
    sen_length = len(sentence_split)

    if sen_length < ngram_order:
        logprob = compute_begin_prob(sentence_split)
        # compute_begin_prob() stops one prefix short, so the last word
        # conditioned on the whole sentence still has to be added.
        if sen_length >= 2:
            logprob += compute_sublist_prob(sentence_split)
        return logprob

    logprob = 0
    begin_sublist = sentence_split[:ngram_order]
    logprob += compute_begin_prob(begin_sublist)

    # slide a full-order window over the sentence
    for i in range(sen_length - ngram_order + 1):
        cur_sublist = sentence_split[i : i + ngram_order]
        logprob += compute_sublist_prob(cur_sublist)

    return logprob


def output_result(text_in_handle, output_file_handle, ngram_order):
    """Write one log-probability per input line to `output_file_handle`.

    Each line of `text_in_handle` is wrapped in "<s> ... </s>", scored with
    compute_sentence_prob() and converted from base 10 to the base given by
    the module-level `args.log_base`.  Both handles are closed at the end.
    """
    logbase_modifier = math.log(10, args.log_base)
    for line in text_in_handle:
        # Strip only the line terminator: slicing off the last character
        # would eat a real character on a final line without a newline.
        new_line = "<s> " + line.rstrip("\r\n") + " </s>"
        logprob = compute_sentence_prob(new_line, ngram_order)
        new_logprob = logprob * logbase_modifier
        output_file_handle.write("{}\n".format(new_logprob))
    text_in_handle.close()
    output_file_handle.close()


# Script entry point.  `ngram_dict` is bound at module level here because
# compute_sublist_prob() and compute_sentence_prob() read it as a global.
if __name__ == "__main__":
    check_args(args)
    ngram_dict, tot_num = load_model(args.arpa_lm)

    # the number of loaded n-grams must match the counts declared in the header
    num_valid, max_ngram_order = check_number(args.arpa_lm, tot_num)
    if not num_valid:
        sys.exit("compute_sentence_probs_arpa.py: Wrong loading model.")
    # the requested order must lie between 1 and the highest order in the model
    if args.ngram_order <= 0 or args.ngram_order > max_ngram_order:
        sys.exit("compute_sentence_probs_arpa.py: " +
            "Invalid ngram_order (either negative or greater than maximum ngram number ({}) allowed)".format(max_ngram_order))

    output_result(args.text_in_handle, args.prob_file_handle, args.ngram_order)


================================================
FILE: egs/utils/lang/extend_lang.sh
================================================
#!/usr/bin/env bash
# Copyright     2018  Johns Hopkins University (Author: Daniel Povey);
#               2019  Dongji Gao

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# derived files, that go in data/lang/.

# Begin configuration section.
# These defaults can be overridden on the command line as --sil-prob and
# --silprob-file (see the usage message below).
sil_prob=0.5
silprob_file=
# end configuration section

echo "$0 $@"  # Print the command line for logging

. utils/parse_options.sh

if [ $# -ne 3 ]; then
  echo "Usage: utils/extend_lang.sh <old-lang-dir> <lexicon> <new-lang-dir>"
  echo "e.g.: utils/extend_lang.sh data/lang data/local/dict_new_words/lexiconp.txt data/lang_new_words"
  echo ""
  echo "This script creates a lang/ directory <new-lang-dir> with L.fst and L_disambig.fst"
  echo "derived from the provided lexicon, but all other information being the same as the old"
  echo "lang/ directory, including the phones.txt and words.txt being compatible (however,"
  echo "words.txt may have new words, and phones.txt may have extra disambiguation symbols"
  echo "if needed).  We do not allow new phones."
  echo ""
  echo "CAUTION: the lexicon generated will only cover the words in the provided lexicon,"
  echo "which might not include all the words in words.txt.  You should make sure your"
  echo "lexicon is a superset of the original lexicon used to generate <old-lang-dir>,"
  echo "if this would be a problem for your scenario."
  echo ""
  echo "The basename of <lexicon> must be either lexicon.txt, lexiconp.txt or lexiconp_silprob.txt."
  echo ""
  echo "Options"
  echo "     --sil-prob <probability of silence>             # default: 0.5 [must have 0 <= silprob < 1]"
  echo "     --silprob-file <file contains silence probability>    # must be provided if lexicon is lexiconp_silprob.txt"
  exit 1;
fi

srcdir=$1
lexicon=$2
dir=$3

[ -f path.sh ] && . ./path.sh


# Both the old phone table and the new lexicon must exist.
for f in $srcdir/phones.txt $lexicon; do
  if [ ! -f $f ]; then
    echo "$0: expected file $f to exist"
    exit 1
  fi
done

# Reject a lexicon that is empty or has a line with fewer than 2 fields.
if ! awk '{if(NF < 2) exit(1)} END{if(NR==0) exit(1)}' <$lexicon; then
  echo "$0: it looks like there words without pronunciations or.."
  echo "  ...blank lines in $lexicon, or it is empty."
  exit 1
fi

mkdir -p $dir

# Start from a fresh copy of the old phones/ directory.
if [ -d $dir/phones ]; then rm -r $dir/phones; fi

cp -r $srcdir/phones $dir/

for f in oov.int oov.txt phones.txt topo words.txt; do
  cp $srcdir/$f $dir/
done

# Scratch directory for intermediate lexicon files; recreated on every run.
tmpdir=$dir/temp
rm -r $tmpdir 2>/dev/null
mkdir -p $tmpdir

# $silprob is later run as a command (`if "$silprob"`), i.e. `true` or `false`.
silprob=false

if [ $(basename $lexicon) == "lexiconp_silprob.txt" ]; then
  silprob=true
  if [ -z $silprob_file ] ; then
    echo "silprob_file not provided, checking $srcdir"
    if [ -f $srcdir/silprob.txt ]; then
        silprob_file=$srcdir/silprob.txt
        echo "silprob_file found in $srcdir"
    else
        echo "silprob_file not found in $srcdir" && exit 1;
    fi
  else
    if [ ! -f $silprob_file ]; then
      echo "$silprob_file does not exist" && exit 1;
    fi
  fi
elif [ $(basename $lexicon) != lexiconp.txt ]; then
  # NOTE(review): this branch only prints a message; the script carries on
  # and treats the file as if it were in lexiconp.txt format.
  echo "$0: currently this script only supports the lexiconp.txt or lexiconp_silprob.txt format;"
  echo " ... your lexicon has to have that filename."
fi

# Get the list of extra words.
awk -v w=$srcdir/words.txt 'BEGIN{while(getline <w) seen[$1] = $1} { if (!($1 in seen)) oov[$1] = 1}
                     END{ for(k in oov) print k;}' <$lexicon >$tmpdir/extra_words.txt

# Add entries to words.txt for all the words that were not previously in the
# lexicon.
# New ids continue from the id on the last line of the old words.txt.
highest_number=$(tail -n 1 $srcdir/words.txt | awk '{print $2}')
awk -v start=$highest_number '{print $1, NR+start}' <$tmpdir/extra_words.txt >>$dir/words.txt
echo "$0: added $(wc -l <$tmpdir/extra_words.txt) extra words to words.txt"

if [ -f $dir/phones/nonterminals.txt ]; then
  # extra grammar-decoding-related options for getting the lexicon.
  grammar_opts="--left-context-phones=$dir/phones/left_context_phones.txt --nonterminals=$srcdir/phones/nonterminals.txt"
else
  grammar_opts=""
fi

# Write the working copy of the lexicon into $tmpdir, as lexiconp_silprob.txt
# or lexiconp.txt depending on $silprob.
if [ -f $dir/phones/word_boundary.txt ]; then
  # was `if $position_dependent_phones; then..` in prepare_lang.sh
  # Phones get a word-position suffix: _S for a single-phone word, otherwise
  # _B for the first phone, _I for the inner ones and _E for the last.
  if "$silprob"; then
    perl -ane '@A=split(" ",$_); $w = shift @A; $p = shift @A; $silword_p = shift @A;
              $wordsil_f = shift @A; $wordnonsil_f = shift @A; @A>0||die;
         if(@A==1) { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_S\n"; }
         else { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_B ";
         for($n=1;$n<@A-1;$n++) { print "$A[$n]_I "; } print "$A[$n]_E\n"; } ' \
         < $lexicon > $tmpdir/lexiconp_silprob.txt || exit 1;
  else
    perl -ane '@A=split(" ",$_); $w = shift @A; $p = shift @A; @A>0||die;
           if(@A==1) { print "$w $p $A[0]_S\n"; } else { print "$w $p $A[0]_B ";
           for($n=1;$n<@A-1;$n++) { print "$A[$n]_I "; } print "$A[$n]_E\n"; } ' \
         < $lexicon > $tmpdir/lexiconp.txt || exit 1;
  fi
else
  # No word-position-dependent phones: the lexicon is used unchanged.
  if "$silprob"; then
    # The destination must be $tmpdir; an undefined variable here would
    # expand to nothing and send the copy to /lexiconp_silprob.txt.
    cp $lexicon $tmpdir/lexiconp_silprob.txt
  else
    cp $lexicon $tmpdir/lexiconp.txt
  fi
fi

# Check that there are no unseen phones in the lexicon.
# (the phones start at field 6 of lexiconp_silprob.txt and at field 3 of lexiconp.txt)
if "$silprob"; then
  if ! utils/sym2int.pl -f 6- $srcdir/phones.txt $tmpdir/lexiconp_silprob.txt >/dev/null; then
    echo "$0: it looks like there are unseen phones in your lexicon $lexicon"
    exit 1
  fi
else 
  if ! utils/sym2int.pl -f 3- $srcdir/phones.txt $tmpdir/lexiconp.txt >/dev/null; then
    echo "$0: it looks like there are unseen phones in your lexicon $lexicon"
    exit 1
  fi
fi

# Write the lexicon with disambiguation symbols; the output of
# add_lex_disambig.pl is captured as the number of symbols it used.
if "$silprob"; then
  ndisambig=$(utils/add_lex_disambig.pl --pron-probs --sil-probs $tmpdir/lexiconp_silprob.txt $tmpdir/lexiconp_silprob_disambig.txt)
else
  ndisambig=$(utils/add_lex_disambig.pl --pron-probs $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt)
fi

ndisambig=$[ndisambig+1]  # Add one to disambiguate silence.

# we'll need to figure out whether any of these disambiguation symbols are
# absent from our current disambiguation phones.. if they are, then we need to
# add them as new disambiguation symbols to phones.txt.
for n in $(seq 0 $ndisambig); do
  sym='#'$n; if ! grep -w -q "$sym" $dir/phones/disambig.txt; then echo "$sym"; fi
done > $tmpdir/extra_disambig.txt
# New phone ids continue from the id on the last line of the old phones.txt.
highest_number=$(tail -n 1 $srcdir/phones.txt | awk '{print $2}')
awk -v start=$highest_number '{print $1, NR+start}' <$tmpdir/extra_disambig.txt >>$dir/phones.txt
echo "$0: added $(wc -l <$tmpdir/extra_disambig.txt) extra disambiguation symbols to phones.txt"

# add extra_disambig symbols into disambig.txt
# and regenerate the integer (.int) and colon-separated (.csl) forms of the list
cat $tmpdir/extra_disambig.txt >> $dir/phones/disambig.txt
utils/sym2int.pl $dir/phones.txt <$dir/phones/disambig.txt >$dir/phones/disambig.int
utils/sym2int.pl $dir/phones.txt <$dir/phones/disambig.txt | \
  awk '{printf(":%d", $1);} END{printf "\n"}' | sed s/:// > $dir/phones/disambig.csl

silphone=`cat $srcdir/phones/optional_silence.txt` || exit 1;
[ -z "$silphone" ] && \
  ( echo "You have no optional-silence phone; it is required in the current scripts"
    echo "but you may use the option --sil-prob 0.0 to stop it being used." ) && \
   exit 1;

if "$silprob"; then
  # remove the silprob
  # (fields 3-5 are dropped, leaving the lexiconp.txt layout)
  cat $tmpdir/lexiconp_silprob.txt |\
    awk '{
      for(i=1; i<=NF; i++) {
        if(i!=3 && i!=4 && i!=5) printf("%s\t", $i); if(i==NF) print "";
      }
    }' > $tmpdir/lexiconp.txt
fi

# First remove pron-probs from the lexicon.
perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' <$tmpdir/lexiconp.txt >$tmpdir/align_lexicon.txt

# Note: here, $silphone will have no suffix e.g. _S because it occurs as optional-silence,
# and is not part of a word.
[ ! -z "$silphone" ] && echo "<eps> $silphone" >> $tmpdir/align_lexicon.txt

# align_lexicon.txt repeats the word in the first two fields, followed by its phones.
cat $tmpdir/align_lexicon.txt | \
  perl -ane '@A = split; print $A[0], " ", join(" ", @A), "\n";' | sort | uniq > $dir/phones/align_lexicon.txt

if [ -f $dir/phones/nonterminals.txt ]; then
  for w in "#nonterm_begin" "#nonterm_end" $(cat $dir/phones/nonterminals.txt); do
    echo $w $w  # These are words without pronunciations, so leave those prons
                # empty.
    done >> $dir/phones/align_lexicon.txt
fi

# create phones/align_lexicon.int from phones/align_lexicon.txt
cat $dir/phones/align_lexicon.txt | utils/sym2int.pl -f 3- $dir/phones.txt | \
  utils/sym2int.pl -f 1-2 $dir/words.txt > $dir/phones/align_lexicon.int

# Create the basic L.fst without disambiguation symbols, for use
# in training.
if "$silprob"; then
  utils/lang/make_lexicon_fst_silprob.py $grammar_opts --sil-phone=$silphone \
         $tmpdir/lexiconp_silprob.txt $silprob_file | \
      fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
        --keep_isymbols=false --keep_osymbols=false |   \
      fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;
else
  utils/lang/make_lexicon_fst.py $grammar_opts --sil-prob=$sil_prob --sil-phone=$silphone \
           $tmpdir/lexiconp.txt | \
      fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
        --keep_isymbols=false --keep_osymbols=false | \
      fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;
fi


# and create the version that has disambiguation symbols.
# The highest-numbered symbol, '#'$ndisambig, is passed as the silence disambiguation symbol.
if "$silprob"; then
  utils/lang/make_lexicon_fst_silprob.py $grammar_opts \
    --sil-phone=$silphone --sil-disambig='#'$ndisambig \
    $tmpdir/lexiconp_silprob_disambig.txt $silprob_file | \
    fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
      --keep_isymbols=false --keep_osymbols=false |   \
    fstaddselfloops  $dir/phones/wdisambig_phones.int $dir/phones/wdisambig_words.int | \
    fstarcsort --sort_type=olabel > $dir/L_disambig.fst || exit 1;
else
  utils/lang/make_lexicon_fst.py $grammar_opts \
    --sil-prob=$sil_prob --sil-phone=$silphone --sil-disambig='#'$ndisambig \
       $tmpdir/lexiconp_disambig.txt | \
     fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
                --keep_isymbols=false --keep_osymbols=false | \
     fstaddselfloops $dir/phones/wdisambig_phones.int $dir/phones/wdisambig_words.int | \
     fstarcsort --sort_type=olabel > $dir/L_disambig.fst || exit 1;
fi


echo "$(basename $0): validating output directory"
# the --skip-generate-words-check option is needed because L.fst may not actually
# contain all the words in words.txt.
! utils/validate_lang.pl --skip-generate-words-check $dir && echo "$(basename $0): error validating output" &&  exit 1;

exit 0;


================================================
FILE: egs/utils/lang/get_word_position_phone_map.pl
================================================
#!/usr/bin/env perl

# Copyright 2018  Johns Hopkins University (author: Daniel Povey)
# Apache 2.0.
#
use strict;
use warnings;

my $Usage = <<EOU;
# This script is for creating a mapping from word-position-dependent phones
# (with _I, _B, _E, _S suffixes) to word-position-independent phones,
# along with a word-position-independent version of phones.txt.
# It should only be required in very unusual situations.

Usage: utils/lang/get_word_position_phone_map.pl <lang-dir> <output-dir>

<lang-dir> is a conventional lang dir as validated by validate_lang.pl.
It is an error if <lang-dir> does not have word-position-dependent phones.

To <output-dir> will be written the following files:
  phones.txt is a conventional symbol table, similar in format to the one
   in <lang-dir>, but without word-position-dependency or disambiguation
   symbols.
  phone_map.int is a mapping from the input <lang-dir>'s phones to
   the phones in <output-dir>/phones.txt, containing integers, i.e.
   <word-position-dependent-phone> <word-position-independent-phone>.
  phone_map.txt is the text form of the mapping in phone_map.int, mostly
   provided for reference.
EOU


if (@ARGV != 2) {
  die $Usage;
}

my $lang_dir = shift @ARGV;
my $output_dir = shift @ARGV;

foreach my $filename ( ("phones.txt", "phones/disambig.int") ) {
  if (! -f "$lang_dir/$filename") {
    die "$0: expected file $lang_dir/$filename to exist";
  }
}

if (! -d $output_dir) {
  die "$0: expected directory $output_dir to exist";
}


# %is_disambig is a hash indexed by integer phone index in the input $lang_dir,
# which will contain 1 for each (integer) disambiguation symbol.
my %is_disambig;

open(D, "<$lang_dir/phones/disambig.int") || die "opening $lang_dir/phones/disambig.int";
while (<D>) {
  my $disambig_sym = int($_);
  $is_disambig{$disambig_sym} = 1;
}
close(D);

## @orig_phone_list will be an array indexed by integer index, containing
## the written form of the original, word-position-dependent phones
## (but excluding disambiguation symbols like #0, #1 and so on).
## E.g. @orig_phone_list = ( "<eps>", "SIL", "SIL_B", "SIL_E", "SIL_I", "SIL_S", ... )
my @orig_phone_list = ();


## @mapped_phones will be an array of the same size as @orig_phone_list, but
## containing the same phone mapped to word-position-independent form,
## e.g. ( "<eps>", "SIL", "SIL", "SIL", "SIL", "SIL",... )
my @mapped_phones = ();


## @mapped_phone_list will contain the distinct mapped phones in order,
## e.g. ( "<eps>", "SIL", "AA", ... )
my @mapped_phone_list = ();

## mapped_phone_to_int will be a mapping from the strings in @mapped_phones,
## such as "<eps>" and "SIL", to an integer like 0, 1, ....
my %mapped_phone_to_int;

# $cur_mapped_int keeps track of the symbols we've used in the output
# phones.txt.
my $cur_mapped_int = 0;

# $cur_line is the current line index in input phones.txt
my $cur_line = 0;

open(F, "<$lang_dir/phones.txt") || die "$0: failed to open $lang_dir/phones.txt for reading";

# Read the input phones.txt line by line.  Each line must be "<phone> <int>"
# and the integers must be consecutive starting from 0.  For every phone that
# is not a disambiguation symbol we record its original name, its name with
# any word-position suffix removed, and (the first time a stripped name is
# seen) allocate the next integer for the output symbol table.
while (<F>) {
  # chomp (not chop) so that only a trailing newline is removed: chop would
  # delete the last digit of the phone id if the final line of the file has
  # no newline, causing a spurious "unexpected line" error below.
  chomp;
  my @A = split;  # split $_ on whitespace.
  if (@A != 2) {  # if the array @A does not have length 2...
    die "$0: bad line $_ in file $lang_dir/phones.txt";
  }
  my $phone_name = $A[0];  # e.g. "<eps>" or "SIL" or "SIL_B" ...
  my $phone_int = int($A[1]);
  # The symbol ids are required to equal the zero-based line index.
  if ($phone_int != $cur_line) {
    die ("$0: unexpected line $_ in $lang_dir/phones.txt, expected integer to be $cur_line");
  }
  if (! $is_disambig{$phone_int}) {
    # if it's not a disambiguation symbol...
    my $mapped_phone_name = $phone_name;
    # strip a trailing _B, _E, _S or _I word-position marker, if present.
    $mapped_phone_name =~ s/_[BESI]$//;

    push @orig_phone_list, $phone_name;
    push @mapped_phones, $mapped_phone_name;

    # first occurrence of this stripped name: give it the next output id.
    if (!defined $mapped_phone_to_int{$mapped_phone_name}) {
      $mapped_phone_to_int{$mapped_phone_name} = $cur_mapped_int++;
      push @mapped_phone_list, $mapped_phone_name;
    }
  }
  $cur_line++;
}
close(F);

if ($cur_line == 0) {
  die "$0: empty $lang_dir/phones.txt";
}

if ($cur_mapped_int == @orig_phone_list) {
  # if the number of distinct mapped phones is the same as the
  # number of input phones (including epsilon), it means the mapping
  # was a no-op.  This is an error, because it doesn't make sense to
  # run this script on input that was not word-position-dependent.
  die "input lang dir $lang_dir was not word-position-dependent.";
}

open(P, ">$output_dir/phones.txt") || die "failed to open $output_dir/phones.txt for writing.";
open(I, ">$output_dir/phone_map.int") || die "failed to open $output_dir/phone_map.int for writing.";
open(T, ">$output_dir/phone_map.txt") || die "failed to open $output_dir/phone_map.txt for writing.";

for (my $x = 0; $x <= $#mapped_phone_list; $x++) {
  print P "$mapped_phone_list[$x] $x\n";
}


# Write one line per non-disambiguation input phone to phone_map.int
# (integer form) and phone_map.txt (text form).
# NOTE(review): $x is the position in @orig_phone_list, which skips
# disambiguation symbols; it equals the phone's integer id in the input
# phones.txt only if no disambiguation symbol precedes a real phone there
# (presumably disambiguation symbols always come last -- confirm).
for (my $x = 0; $x <= $#orig_phone_list; $x++) {
  my $orig_phone_name = $orig_phone_list[$x];
  my $mapped_phone_name = $mapped_phones[$x];
  my $y = $mapped_phone_to_int{$mapped_phone_name};
  # every mapped name was given an id while reading phones.txt.
  defined $y || die "code error";

  print I "$x $y\n";
  print T "$orig_phone_name $mapped_phone_name\n";
}


(close(I) && close(T) && close(P)) || die "failed to close file (disk full?)";


exit(0);


================================================
FILE: egs/utils/lang/grammar/augment_phones_txt.py
================================================
#!/usr/bin/env python3


import argparse
import re
import os
import sys

def get_args():
    """Parse the command line and return the resulting argparse namespace.

    The three positional arguments are the input phones.txt, the file listing
    the user-defined nonterminal symbols, and the output phones.txt.
    """
    description = """This script augments a phones.txt
       file (a phone-level symbol table) by adding certain special symbols
       relating to grammar support.  See ../add_nonterminals.sh for context."""
    positionals = (
        ('input_phones_txt',
         'Filename of input phones.txt file, to be augmented'),
        ('nonterminal_symbols_list',
         'Filename of a file containing a list of nonterminal '
         'symbols, one per line.  E.g. #nonterm:contact_list'),
        ('output_phones_txt',
         'Filename of output '
         'phones.txt file.  May be the same as input-phones-txt.'),
    )
    parser = argparse.ArgumentParser(description=description)
    for arg_name, help_text in positionals:
        parser.add_argument(arg_name, type=str, help=help_text)
    return parser.parse_args()


def read_phones_txt(filename):
    """Read the phones.txt symbol table in 'filename'.

    Returns a 2-tuple (lines, highest_symbol): 'lines' is every line of the
    file as a list of strings with surrounding whitespace removed, and
    'highest_symbol' is the largest integer found in the second field.

    Raises RuntimeError if the file is empty, if a line has no integer second
    field, or if the table already contains the symbol '#nonterm_bos'.
    """

    # The use of latin-1 encoding does not preclude reading utf-8.  latin-1
    # encoding means "treat words as sequences of bytes", and it is compatible
    # with utf-8 encoding as well as other encodings such as gbk, as long as the
    # fields are separated by ASCII spaces or tabs.  It is basically how we
    # emulate the behavior of python before python3.
    whitespace = re.compile("[ \t]+")
    with open(filename, 'r', encoding='latin-1') as f:
        lines = [line.strip(" \t\r\n") for line in f]

    # The contract above says an empty table is an error; enforce it rather
    # than silently returning ([], 0).
    if not lines:
        raise RuntimeError("The symbol table {0} is empty.".format(filename))

    highest_numbered_symbol = 0
    for line in lines:
        s = whitespace.split(line)
        try:
            i = int(s[1])
        except (IndexError, ValueError):
            # IndexError: fewer than two fields; ValueError: non-integer id.
            raise RuntimeError("Could not interpret line '{0}' in file '{1}'".format(
                line, filename))
        if i > highest_numbered_symbol:
            highest_numbered_symbol = i
        if s[0] == '#nonterm_bos':
            raise RuntimeError("It looks like the symbol table {0} already has nonterminals "
                               "in it.".format(filename))
    return lines, highest_numbered_symbol


def read_nonterminals(filename):
    """Read the user-defined nonterminal symbols listed in 'filename'.

    The file has one symbol per line.  Returns the symbols as a list of
    strings with surrounding whitespace removed, e.g.
    ['#nonterm:contact_list', '#nonterm:phone_number', ... ].

    Raises RuntimeError if the file is empty, if any line does not start
    with '#nonterm:', or if a symbol is listed more than once.
    """
    # Use a context manager so the file is closed promptly instead of being
    # left for the garbage collector.
    with open(filename, 'r', encoding='latin-1') as f:
        ans = [line.strip(" \t\r\n") for line in f]
    if not ans:
        raise RuntimeError("The file {0} contains no nonterminal symbols.".format(filename))
    for nonterm in ans:
        if not nonterm.startswith('#nonterm:'):
            raise RuntimeError("In file '{0}', expected nonterminal symbols to start with '#nonterm:', found '{1}'"
                               .format(filename, nonterm))
    if len(set(ans)) != len(ans):
        raise RuntimeError("Duplicate nonterminal symbols are present in file {0}".format(filename))
    return ans

def write_phones_txt(orig_lines, highest_numbered_symbol, nonterminals, filename):
    """Write the augmented phones.txt to 'filename'.

    'orig_lines' holds the lines of the original phones.txt (no newlines) and
    is copied through unchanged.  After it, the four built-in grammar symbols
    followed by the user-defined 'nonterminals' (strings like '#nonterm:foo')
    are appended, numbered consecutively from highest_numbered_symbol + 1.
    """
    first_new_id = highest_numbered_symbol + 1
    with open(filename, 'w', encoding='latin-1') as out:
        for original_line in orig_lines:
            print(original_line, file=out)
        builtin_symbols = ['#nonterm_bos', '#nonterm_begin', '#nonterm_end', '#nonterm_reenter' ]
        for symbol_id, symbol in enumerate(builtin_symbols + nonterminals, first_new_id):
            print("{0} {1}".format(symbol, symbol_id), file=out)


def main():
    """Augment the phones.txt named on the command line with grammar symbols."""
    cli = get_args()
    # Both inputs are read completely before the output is opened, because the
    # output file may be the same file as the input symbol table.
    orig_lines, max_symbol = read_phones_txt(cli.input_phones_txt)
    extra_symbols = read_nonterminals(cli.nonterminal_symbols_list)
    write_phones_txt(orig_lines, max_symbol, extra_symbols, cli.output_phones_txt)


if __name__ == '__main__':
    main()


================================================
FILE: egs/utils/lang/grammar/augment_words_txt.py
================================================
#!/usr/bin/env python3


import argparse
import os
import sys
import re

def get_args():
    """Parse the command line and return the resulting argparse namespace.

    The three positional arguments are the input words.txt, the file listing
    the user-defined nonterminal symbols, and the output words.txt.
    """
    description = """This script augments a words.txt
       file (a word-level symbol table) by adding certain special symbols
       relating to grammar support.  See ../add_nonterminals.sh for context,
       and augment_phones_txt.py."""
    positionals = (
        ('input_words_txt',
         'Filename of input words.txt file, to be augmented'),
        ('nonterminal_symbols_list',
         'Filename of a file containing a list of nonterminal '
         'symbols, one per line.  E.g. #nonterm:contact_list'),
        ('output_words_txt',
         'Filename of output '
         'words.txt file.  May be the same as input-words-txt.'),
    )
    parser = argparse.ArgumentParser(description=description)
    for arg_name, help_text in positionals:
        parser.add_argument(arg_name, type=str, help=help_text)
    return parser.parse_args()


def read_words_txt(filename):
    """Read the words.txt symbol table in 'filename'.

    Returns a 2-tuple (lines, highest_symbol): 'lines' is every line of the
    file as a list of strings with surrounding whitespace removed, and
    'highest_symbol' is the largest integer found in the second field.

    Raises RuntimeError if the file is empty, if a line has no integer second
    field, or if the table already contains '#nonterm_begin' or
    '#nonterm_end'.
    """

    # The use of latin-1 encoding does not preclude reading utf-8.  latin-1
    # encoding means "treat words as sequences of bytes", and it is compatible
    # with utf-8 encoding as well as other encodings such as gbk, as long as the
    # fields are separated by ASCII spaces or tabs.  It is basically how we
    # emulate the behavior of python before python3.
    whitespace = re.compile("[ \t]+")
    with open(filename, 'r', encoding='latin-1') as f:
        lines = [line.strip(" \t\r\n") for line in f]

    # The contract above says an empty table is an error; enforce it rather
    # than silently returning ([], 0).
    if not lines:
        raise RuntimeError("The symbol table {0} is empty.".format(filename))

    highest_numbered_symbol = 0
    for line in lines:
        s = whitespace.split(line)
        try:
            i = int(s[1])
        except (IndexError, ValueError):
            # IndexError: fewer than two fields; ValueError: non-integer id.
            raise RuntimeError("Could not interpret line '{0}' in file '{1}'".format(
                line, filename))
        if i > highest_numbered_symbol:
            highest_numbered_symbol = i
        if s[0] in [ '#nonterm_begin', '#nonterm_end' ]:
            raise RuntimeError("It looks like the symbol table {0} already has nonterminals "
                               "in it.".format(filename))
    return lines, highest_numbered_symbol


def read_nonterminals(filename):
    """Read the user-defined nonterminal symbols listed in 'filename'.

    The file has one symbol per line.  Returns the symbols as a list of
    strings with surrounding whitespace removed, e.g.
    ['#nonterm:contact_list', '#nonterm:phone_number', ... ].

    Raises RuntimeError if the file is empty, if any line does not start
    with '#nonterm:', or if a symbol is listed more than once.
    """
    # Use a context manager so the file is closed promptly instead of being
    # left for the garbage collector.
    with open(filename, 'r', encoding='latin-1') as f:
        ans = [line.strip(" \t\r\n") for line in f]
    if not ans:
        raise RuntimeError("The file {0} contains no nonterminal symbols.".format(filename))
    for nonterm in ans:
        if not nonterm.startswith('#nonterm:'):
            raise RuntimeError("In file '{0}', expected nonterminal symbols to start with '#nonterm:', found '{1}'"
                               .format(filename, nonterm))
    if len(set(ans)) != len(ans):
        raise RuntimeError("Duplicate nonterminal symbols are present in file {0}".format(filename))
    return ans

def write_words_txt(orig_lines, highest_numbered_symbol, nonterminals, filename):
    """Write the augmented words.txt to 'filename'.

    'orig_lines' holds the lines of the original words.txt (no newlines) and
    is copied through unchanged.  After it, '#nonterm_begin', '#nonterm_end'
    and then the user-defined 'nonterminals' (strings like '#nonterm:foo')
    are appended, numbered consecutively from highest_numbered_symbol + 1.
    """
    first_new_id = highest_numbered_symbol + 1
    with open(filename, 'w', encoding='latin-1') as out:
        for original_line in orig_lines:
            print(original_line, file=out)
        builtin_symbols = [ '#nonterm_begin', '#nonterm_end' ]
        for symbol_id, symbol in enumerate(builtin_symbols + nonterminals, first_new_id):
            print("{0} {1}".format(symbol, symbol_id), file=out)


def main():
    """Augment the words.txt named on the command line with grammar symbols."""
    cli = get_args()
    # Both inputs are read completely before the output is opened, because the
    # output file may be the same file as the input symbol table.
    orig_lines, max_symbol = read_words_txt(cli.input_words_txt)
    extra_symbols = read_nonterminals(cli.nonterminal_symbols_list)
    write_words_txt(orig_lines, max_symbol, extra_symbols, cli.output_words_txt)


if __name__ == '__main__':
    main()


================================================
FILE: egs/utils/lang/internal/apply_unk_lm.sh
================================================
#!/usr/bin/env bash

# Copyright      2016 Johns Hopkins University (Author: Daniel Povey);

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# Begin configuration section.

# end configuration sections

echo "$0 $@"  # Print the command line for logging
[ -f path.sh ] && . ./path.sh


. utils/parse_options.sh

if [ $# -ne 2 ]; then
  echo "Usage: $0 [options] <input-unk-lm-fst> <lang-dir>"
  echo "e.g.: $0 exp/make_unk/unk_fst.txt data/lang_unk"
  echo ""
  echo "This script, which is called from the end of prepare_lang.sh,"
  echo "inserts the unknown-word LM FST into the lexicon FSTs"
  echo "<lang-dir>/L.fst and <lang-dir>/L_disambig.fst in place of"
  echo "the special disambiguation symbol #2 (which was inserted by"
  echo "add_lex_disambig.pl as a placeholder for this FST)."
  echo ""
  echo "  <input-unk-lm-fst>:  A text-form FST, typically with the name"
  echo "                unk_fst.txt.  We will remove all symbols from the"
  echo "                output before applying it."
  echo "  <lang-dir>:  A partially built lang/ directory.  We modify"
  echo "               L.fst and L_disambig.fst, and read only words.txt."
  exit 1;
fi


unk_lm_fst=$1
lang=$2

set -e

# Check up front that every input we need exists.  $lang/phones.txt is
# included because it is read immediately below to work out the phone labels;
# without this check a missing phones.txt only produces a raw shell
# redirection error.
for f in "$unk_lm_fst" $lang/L.fst $lang/L_disambig.fst $lang/words.txt $lang/phones.txt $lang/oov.int; do
  [ ! -f "$f" ] && echo "$0: expected file $f to exist" && exit 1;
done

unused_phone_label=$(tail -n 1 $lang/phones.txt | awk '{print $2 + 1}')
label_to_replace=$(awk '{if ($1 == "#2") {print $2;}}' <$lang/phones.txt)
! [ "$unused_phone_label" -eq "$unused_phone_label" -a "$label_to_replace" -eq "$label_to_replace" ] && \
   echo "$0: error getting unused phone label or label for #2" && exit 1


# OK, now fstreplace works based on olabels, but we actually want to deal with ilabels,
# so we need to invert all the FSTs before and after doing fstreplace.
awk '{if(NF>=4) $4 = "<eps>"; print }' <$unk_lm_fst | \
  fstcompile --isymbols=$lang/phones.txt --osymbols=$lang/words.txt | \
  fstinvert > $lang/unk_temp.fst

num_states_unk=$(fstinfo $lang/unk_temp.fst | grep '# of states' | awk '{print $NF}')

# fstreplace usage is:
# Usage: fstreplace root.fst rootlabel [rule1.fst label1 ...] [out.fst]
# ... the rootlabel should just be an otherwise unused symbol.
# all the labels are olabels (word labels).. that is hardcoded in fstreplace.

# For each lexicon FST, splice the unknown-word LM in place of the arcs
# labelled "$label_to_replace", report how many states that added, and then
# overwrite the original FST with the result.
for f in L.fst L_disambig.fst; do

  # with OpenFst tools, to refer to the standard input/output you need to use
  # the empty string '' and not '-'.
  fstinvert $lang/$f | fstreplace '' "$unused_phone_label" $lang/unk_temp.fst "$label_to_replace" | fstinvert > $lang/${f}.temp

  num_states_old=$(fstinfo $lang/$f | grep '# of states' | awk '{print $NF}')
  num_states_new=$(fstinfo $lang/${f}.temp | grep '# of states' | awk '{print $NF}')
  # Standard $((...)) arithmetic expansion; the old $[...] form is deprecated in bash.
  num_states_added=$((num_states_new - num_states_old))
  echo "$0: in $f, substituting in the unknown-word LM (which had $num_states_unk states) added $num_states_added new FST states."
  mv -f $lang/${f}.temp $lang/$f
done

rm $lang/unk_temp.fst

exit 0;


================================================
FILE: egs/utils/lang/internal/arpa2fst_constrained.py
================================================
#!/usr/bin/env python

# Copyright 2016  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.

from __future__ import print_function
from __future__ import division
import sys
import argparse
import math
from collections import defaultdict

# note, this was originally based

parser = argparse.ArgumentParser(description="""
This script converts an ARPA-format language model to FST format
(like the C++ program arpa2fst), but does so while applying bigram
constraints supplied in a separate file.  The resulting language
model will have no unigram state, and there will be no backoff from
the bigram level.
This is useful for phone-level language models in order to keep
graphs small and impose things like linguistic constraints on
allowable phone sequences.
This script writes its output to the stdout.  It is a text-form FST,
suitable for compilation by fstcompile.
""")


parser.add_argument('--disambig-symbol', type = str, default = "#0",
                    help = 'Disambiguation symbol (e.g. #0), '
                    'that is printed on the input side only of backoff '
                    'arcs (output side would be epsilon)')
parser.add_argument('arpa_in', type = str,
                    help = 'The input ARPA file (must not be gzipped)')
parser.add_argument('allowed_bigrams_in', type = str,
                    help = "A file containing the list of allowed bigram pairs.  "
                    "Must include pairs like '<s> foo' and 'foo </s>', as well as "
                    "pairs like 'foo bar'.")
parser.add_argument('--verbose', type = int, default = 0,
                    choices=[0,1,2,3,4,5], help = 'Verbose level')

# Parse the command line once, at module level.  'args' is a global that the
# ArpaModel methods below also read (for args.verbose).
args = parser.parse_args()

# At verbosity >= 1, echo the full command line to stderr for logging.
if args.verbose >= 1:
    print(' '.join(sys.argv), file = sys.stderr)


class HistoryState(object):
    """Everything the language model stores about one n-gram history.

    Both attributes hold plain probabilities, not log-probabilities.
    """

    def __init__(self):
        # Maps word (string) -> probability (float) of that word following
        # this history.  Each value is the word's actual probability,
        # including any probability mass from backoff (the two get added
        # together when the ARPA file is written, and these values are read
        # in from the ARPA file).
        self.word_to_prob = {}
        # Backoff weight of this history; 1.0 until one is read.
        self.backoff_prob = 1.0


class ArpaModel(object):
    """Back-off n-gram language model read from a text-form ARPA file.

    Typical use is Read() followed by PrintAsFst().  All probabilities are
    stored as plain probabilities, not logs.  Read() and PrintAsFst() consult
    the module-level 'args' object for the verbosity level (args.verbose).
    """

    def __init__(self):
        # self.orders is indexed by history-length [i.e. 0 for unigram,
        # 1 for bigram and so on], and is then a dict indexed
        # by tuples of history-words.  E.g. for trigrams, we'd index
        # it as self.orders[2][('a', 'b')].
        # The value-type of the dict is HistoryState.  E.g. to set the
        # probability of the trigram a b -> c to 0.2, we'd do
        # self.orders[2][('a', 'b')].word_to_prob['c'] = 0.2
        self.orders = []

    def Read(self, arpa_in):
        """Read the ARPA file 'arpa_in' into self.orders.

        'arpa_in' is a filename; "" or "-" means the standard input.  Must be
        called on a freshly constructed object.  Calls sys.exit() with a
        message if the file cannot be opened or parsed, or if the model turns
        out to be only a unigram.
        """
        assert len(self.orders) == 0
        if arpa_in == "" or arpa_in == "-":
            arpa_in = "/dev/stdin"
        try:
            f = open(arpa_in, "r")
        except (IOError, OSError):
            sys.exit("{0}: error opening ARPA file {1}".format(
                     sys.argv[0], arpa_in))
        # try/finally so the file is closed on every path, including the
        # SystemExit raised by sys.exit() on a parse error.
        try:
            max_order = self._ReadHeader(f, arpa_in)
            for n in range(max_order):
                # self.orders[n], indexed by history-length (length of the
                # history-vector, == order-1), is a map from history as a tuple
                # of strings, to class HistoryState.
                self.orders.append(defaultdict(lambda: HistoryState()))
            cur_order = self._ReadNgramSections(f, arpa_in)
        finally:
            f.close()

        if args.verbose >= 2:
            print("{0}: read {1}-gram model from {2}".format(
                sys.argv[0], cur_order, arpa_in), file = sys.stderr)
        if cur_order < 2:
            # we'd have to have some if-statements in the code to make this work,
            # and I don't want to have to test it.
            sys.exit("{0}: this script does not work when the ARPA language model "
                     "is unigram.".format(sys.argv[0]))

    def _ReadHeader(self, f, arpa_in):
        """Consume the ARPA header from open file 'f'; return the highest n-gram order.

        Skips everything up to the \\data\\ marker, then reads the
        'ngram N=count' lines up to the blank line that ends the header.
        'arpa_in' is used only in error messages.
        """
        # first read till the \data\ marker.
        while True:
            line = f.readline()
            if line == '':
                sys.exit("{0}: reading {1}, got EOF looking for \\data\\ marker.".format(
                    sys.argv[0], arpa_in))
            if line[0:6] == '\\data\\':
                break
        # Starting from 0 (instead of leaving the name unbound) lets us give a
        # proper error message when the header has no 'ngram' lines at all.
        max_order = 0
        while True:
            # read the lines like 'ngram 1=1264'; only the order is used.
            line = f.readline()
            if line == '\n' or line == '\r\n':
                break
            if line[0:5] != 'ngram':
                sys.exit("{0}: reading {1}, read something unexpected in header: {2}".format(
                    sys.argv[0], arpa_in, line[:-1]))
            rest = line[5:]
            a = rest.split('=')  # e.g. a = [ ' 1', '1264\n' ]
            if len(a) != 2:
                sys.exit("{0}: reading {1}, read something unexpected in header: {2}".format(
                    sys.argv[0], arpa_in, line[:-1]))
            try:
                max_order = max(max_order, int(a[0]))
            except ValueError:
                sys.exit("{0}: reading {1}, read something unexpected in header: {2}".format(
                    sys.argv[0], arpa_in, line[:-1]))
        if max_order == 0:
            sys.exit("{0}: reading {1}, found no 'ngram N=count' lines in the header.".format(
                sys.argv[0], arpa_in))
        return max_order

    def _ReadNgramSections(self, f, arpa_in):
        """Read the \\N-grams: sections from 'f' into self.orders, up to \\end\\.

        Returns the order of the last section read.  'arpa_in' is used only
        in error messages.
        """
        log10 = math.log(10.0)  # ARPA files store base-10 logs.
        cur_order = 0
        while True:
            line = f.readline()
            if line == '':
                sys.exit("{0}: reading {1}, found EOF while looking for \\end\\ marker.".format(
                    sys.argv[0], arpa_in))
            elif line[0:5] == '\\end\\':
                if len(self.orders) == 0:
                    sys.exit("{0}: reading {1}, read no n-grams.".format(sys.argv[0], arpa_in))
                break
            else:
                cur_order += 1
                expected_line = '\\{0}-grams:'.format(cur_order)
                if not expected_line in line:  # e.g. allow trailing whitespace and newline
                    sys.exit("{0}: reading {1}, expected line {2}, got {3}".format(
                        sys.argv[0], arpa_in, expected_line, line[:-1]))
                if args.verbose >= 2:
                    print("{0}: reading {1}-grams".format(
                        sys.argv[0], cur_order), file = sys.stderr)

                # now read all the n-grams from this order.
                while True:
                    line = f.readline()
                    # the section of n-grams is terminated by a blank line.
                    if line == '\n' or line == '\r\n':
                        break
                    a = line.split()
                    l = len(a)
                    # a line is: log-prob, cur_order words, optional backoff log-prob.
                    if l != cur_order + 1 and l != cur_order + 2:
                        sys.exit("{0}: reading {1}: in {2}-grams section, got bad line: {3}".format(
                            sys.argv[0], arpa_in, cur_order, line[:-1]))
                    try:
                        prob = math.exp(float(a[0]) * log10)
                        hist = tuple(a[1:cur_order])  # tuple of strings
                        word = a[cur_order]  # a string
                        backoff_prob = math.exp(float(a[cur_order+1]) * log10) if l == cur_order + 2 else None
                    except Exception as e:
                        sys.exit("{0}: reading {1}: in {2}-grams section, got bad "
                                 "line (exception is: {3}): {4}".format(
                                     sys.argv[0], arpa_in, cur_order,
                                     str(type(e)) + ',' + str(e), line[:-1]))
                    self.orders[cur_order-1][hist].word_to_prob[word] = prob
                    if backoff_prob is not None:
                        # the backoff weight belongs to the history formed by
                        # the whole n-gram, which lives one order up.
                        self.orders[cur_order][hist + (word,)].backoff_prob = backoff_prob
        return cur_order

    def GetProb(self, hist, word):
        """Return the probability of 'word' given history 'hist' (a tuple of strings).

        If the history-state does not exist, or does not list the word, this
        backs off to the history with its oldest word dropped, multiplying in
        the state's backoff weight where the state exists.  Exits with an
        error if the word has no unigram probability (i.e. is not in the
        vocabulary).
        """
        assert len(hist) < len(self.orders)
        if len(hist) == 0:
            word_to_prob = self.orders[0][()].word_to_prob
            if not word in word_to_prob:
                sys.exit("{0}: no probability in unigram for word {1}".format(
                    sys.argv[0], word))
            return word_to_prob[word]
        else:
            if hist in self.orders[len(hist)]:
                hist_state = self.orders[len(hist)][hist]
                if word in hist_state.word_to_prob:
                    return hist_state.word_to_prob[word]
                else:
                    return hist_state.backoff_prob * self.GetProb(hist[1:], word)
            else:
                return self.GetProb(hist[1:], word)

    def GetStateForHist(self, hist_to_state, hist):
        """Return the FST state for 'hist' in the dict 'hist_to_state'.

        If there is no state for 'hist', the oldest word is dropped and the
        lookup is retried.  Exits with an error if even a history of length
        one (or less) has no state.
        """
        if hist in hist_to_state:
            return hist_to_state[hist]
        else:
            if len(hist) <= 1:
                # this would likely be a code error, but possibly an error
                # in the ARPA file
                sys.exit("{0}: error processing histories: history-state {1} "
                         "does not exist.".format(sys.argv[0], hist))
            return self.GetStateForHist(hist_to_state, hist[1:])


    def GetHistToStateMap(self):
        """Return (hist_to_state, state_to_hist) for the output FST.

        hist_to_state is a dict from history (tuple of strings) to integer
        FST state; state_to_hist is the list giving the inverse mapping.
        Called from PrintAsFst.
        """

        hist_to_state = dict()
        state_to_hist = []

        # Make sure the initial bigram state comes first (and that
        # we have such a state even if it was completely pruned
        # away in the bigram LM.. which is unlikely of course)
        hist = ('<s>',)
        hist_to_state[hist] = len(state_to_hist)
        state_to_hist.append(hist)

        # create a bigram state for each of the 'real' words...  even if the LM
        # didn't naturally have such bigram states, we'll create them so that we
        # can enforce the bigram constraints supplied in 'bigrams_file' by the
        # user.
        for word in self.orders[0][()].word_to_prob:
            if word != '<s>' and word != '</s>':
                hist = (word,)
                hist_to_state[hist] = len(state_to_hist)
                state_to_hist.append(hist)

        # note: we do not allocate an FST state for the unigram state, because
        # we don't have a unigram state in the output FST, only bigram states; and
        # we don't iterate over bigram histories because we covered them all above;
        # that's why we start 'n' from 2 below instead of from 0.
        for n in range(2, len(self.orders)):
            for hist in self.orders[n].keys():
                # note: hist is a tuple of strings.
                assert not hist in hist_to_state
                hist_to_state[hist] = len(state_to_hist)
                state_to_hist.append(hist)

        return (hist_to_state, state_to_hist)

    def PrintAsFst(self, disambig_symbol, bigram_map):
        """Print the language model to stdout as a text-form FST.

        disambig_symbol is something like '#0' (a symbol introduced to make
        the result determinizable); it is printed on the input side of
        backoff arcs.  bigram_map represents the allowed bigrams
        (left-word, right-word): it is a map from left-word to the set of
        right-words (both strings).  Diagnostics go to stderr when
        args.verbose >= 1.
        """
        # hist_to_state maps from history (as a tuple) to integer FST-state.
        (hist_to_state, state_to_hist) = self.GetHistToStateMap()


        # The following 3 things are just for diagnostics.
        # normalization_stats[hist_len] = [number of states, total prob mass].
        normalization_stats = [ [0, 0.0] for x in range(len(self.orders)) ]
        num_ngrams_allowed = 0
        num_ngrams_disallowed = 0

        for state, hist in enumerate(state_to_hist):
            hist_len = len(hist)
            assert hist_len > 0
            if hist_len == 1:  # it's a bigram state...
                context_word = hist[0]
                if not context_word in bigram_map:
                    print("{0}: warning: word {1} appears in ARPA but is not listed "
                          "as a left context in the bigram map".format(
                              sys.argv[0], context_word), file = sys.stderr)
                    continue
                # word list is a list of words that can follow this word.  It must be nonempty.
                word_list = list(bigram_map[context_word])

                normalization_stats[hist_len][0] += 1

                # Every allowed successor gets an explicit arc (using the
                # backed-off probability if needed), so bigram states have no
                # backoff arc.
                for word in word_list:
                    prob = self.GetProb((context_word,), word)
                    assert prob != 0
                    normalization_stats[hist_len][1] += prob
                    cost = -math.log(prob)
                    if abs(cost) < 0.01 and args.verbose >= 3:
                        print("{0}: warning: very small cost {1} for {2}->{3}".format(
                            sys.argv[0], cost, context_word, word), file=sys.stderr)
                    if word == '</s>':
                        # print the final-prob of this state.
                        print("%d %.3f" % (state, cost))
                    else:
                        next_state = self.GetStateForHist(hist_to_state,
                                                          (context_word, word))
                        print("%d %d %s %s %.3f" %
                              (state, next_state, word, word, cost))
            else:  # it's a higher-order than bigram state.
                assert hist in self.orders[hist_len]
                hist_state = self.orders[hist_len][hist]
                most_recent_word = hist[-1]

                normalization_stats[hist_len][0] += 1
                normalization_stats[hist_len][1] += \
                  sum([ self.GetProb(hist, word) for word in bigram_map[most_recent_word]])

                for word, prob in hist_state.word_to_prob.items():
                    cost = -math.log(prob)
                    # n-grams whose last two words are not an allowed bigram
                    # are counted and dropped.
                    if word in bigram_map[most_recent_word]:
                        num_ngrams_allowed += 1
                    else:
                        num_ngrams_disallowed += 1
                        continue
                    if word == '</s>':
                        # print the final-prob of this state.
                        print("%d %.3f" % (state, cost))
                    else:
                        next_state = self.GetStateForHist(hist_to_state,
                                                          (hist) + (word,))
                        print("%d %d %s %s %.3f" %
                              (state, next_state, word, word, cost))
                # Now deal with the backoff probability of this state (back off
                # to the lower-order state).
                assert hist in self.orders[hist_len]
                backoff_prob = self.orders[hist_len][hist].backoff_prob
                assert backoff_prob != 0.0
                cost = -math.log(backoff_prob)
                backoff_hist = hist[1:]
                backoff_state = self.GetStateForHist(hist_to_state, backoff_hist)
                # note: we only print the disambig symbol on the input side.
                if args.verbose >= 3 and abs(cost) < 0.001:
                    print("{0}: very low backoff cost {1} for history {2}, state = {3}".format(
                        sys.argv[0], cost, str(hist), state), file = sys.stderr)

                # For hist-states that completely back off (they have no words coming out of them),
                # there is no need to disambiguate, we can print an epsilon that will later be removed.
                this_disambig_symbol = disambig_symbol if len(hist_state.word_to_prob) != 0 else '<eps>'
                print("%d %d %s <eps> %.3f" %
                      (state, backoff_state, this_disambig_symbol, cost))
        if args.verbose >= 1:
            for hist_len in range(1, len(self.orders)):
                num_states = normalization_stats[hist_len][0]
                avg_prob_sum = normalization_stats[hist_len][1] / num_states if num_states > 0 else 0.0
                print("{0}: for {1}-gram states, over {2} states the average sum of "
                      "probs was {3} (would be 1.0 if properly normalized).".format(
                          sys.argv[0], hist_len + 1, num_states, avg_prob_sum),
                      file = sys.stderr)
            if num_ngrams_disallowed != 0:
                print("{0}: for explicit n-grams higher than bigram from the ARPA model, {1} "
                      "were allowed by the bigram constraints and {2} were disallowed (we "
                      "normally expect all or almost all of them to be allowed).".format(
                          sys.argv[0], num_ngrams_allowed, num_ngrams_disallowed),
                      file = sys.stderr)


# returns a map which is a dict [indexed by left-hand word] of sets [containing
# the right-hand word].
def ReadBigramMap(bigrams_file):
    """Read the file of allowed bigrams.

    Each line of 'bigrams_file' must contain exactly two whitespace-separated
    words.  Returns a dict (a defaultdict of sets), indexed by the left-hand
    word, of sets containing the right-hand words.

    Exits the program (via sys.exit) if the file cannot be opened, if a line
    does not have exactly two fields, if a pair is repeated, if '<s>' appears
    on the right or '</s>' on the left, or if the file does not contain at
    least one pair starting with '<s>', one ending in '</s>' and one pair
    that does neither.
    """
    ans = defaultdict(set)

    have_one_bos = False
    have_one_eos = False
    have_one_regular = False

    try:
        f = open(bigrams_file, "r")
    except (IOError, OSError):
        sys.exit("utils/lang/internal/arpa2fst_constrained.py: error opening "
                 "bigrams file " + bigrams_file)
    # 'with' makes sure the file is closed on every exit path.
    with f:
        for line in f:
            a = line.split()
            if len(a) != 2:
                # rstrip (rather than line[:-1]) so that a final line without
                # a trailing newline is not clipped in the message.
                sys.exit("utils/lang/internal/arpa2fst_constrained.py: bad line in "
                         "bigrams file {0} (expect 2 fields): {1}".format(
                             bigrams_file, line.rstrip("\n")))
            word1, word2 = a
            if word1 in ans and word2 in ans[word1]:
                # note: sys.exit() takes no 'file' argument; it already
                # prints a string argument to stderr.
                sys.exit("{0}: bigrams file contained duplicate entry: {1} {2}".format(
                    sys.argv[0], word1, word2))
            if word2 == '<s>' or word1 == '</s>':
                sys.exit("{0}: bad sequence of BOS/EOS symbols: {1} {2}".format(
                    sys.argv[0], word1, word2))
            if word1 == '<s>':
                have_one_bos = True
            elif word2 == '</s>':
                have_one_eos = True
            else:
                have_one_regular = True
            ans[word1].add(word2)
    # check that we saw some data, including pairs with BOS and EOS.
    if len(ans) == 0:
        sys.exit("{0}: no data found in bigrams file {1}".format(
            sys.argv[0], bigrams_file))
    elif not (have_one_bos and have_one_eos and have_one_regular):
        sys.exit("{0}: the bigrams file {1} does not look right "
                 "(make sure BOS and EOS symbols are there)".format(
            sys.argv[0], bigrams_file))
    return ans

# Validate the cheap command-line option first, so that an invalid
# --disambig-symbol is reported before any input file is read and parsed.
if len(args.disambig_symbol.split()) != 1:
    sys.exit("{0}: invalid option --disambig-symbol={1}".format(
        sys.argv[0], args.disambig_symbol))

# Read the ARPA language model and the list of allowed bigrams, then print
# the constrained FST.
arpa_model = ArpaModel()
arpa_model.Read(args.arpa_in)
bigrams_map = ReadBigramMap(args.allowed_bigrams_in)
arpa_model.PrintAsFst(args.disambig_symbol, bigrams_map)


================================================
FILE: egs/utils/lang/internal/modify_unk_pron.py
================================================
#!/usr/bin/env python

# Copyright 2016  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.

from __future__ import print_function
import sys
import os
import argparse
from collections import defaultdict

# note, this was originally based

# Command-line interface: the lexicon file (rewritten in place) and the
# printed form of the unknown word.
parser = argparse.ArgumentParser(
    description="""
This script replaces the existing pronunciation of the
unknown word in the provided lexicon, with a pronunciation
consisting of three disambiguation symbols: #1 followed by #2
followed by #3.
The #2 will later be replaced by a phone-level LM by
apply_unk_lm.sh (called later on by prepare_lang.sh).
Caution: this script is sensitive to the basename of the
lexicon: it should be called either lexiconp.txt, in which
case the format is 'word pron-prob p1 p2 p3 ...'
or lexiconp_silprob.txt, in which case the format is
'word pron-prob sil-prob1 sil-prob2 sil-prob3 p1 p2 p3....'.
It is an error if there is not exactly one pronunciation of
the unknown word in the lexicon.""",
    epilog="""E.g.: modify_unk_pron.py data/local/lang/lexiconp.txt '<unk>'.
This script is called from prepare_lang.sh.""")

parser.add_argument(
    'lexicon_file', type=str,
    help='Filename of the lexicon file to operate on (this is '
         'both an input and output of this script).')
parser.add_argument(
    'unk_word', type=str,
    help="The printed form of the unknown/OOV word, normally '<unk>'.")

args = parser.parse_args()

# The unknown word has to be a single whitespace-free token.
if len(args.unk_word.split()) != 1:
    sys.exit("{0}: invalid unknown-word '{1}'".format(
        sys.argv[0], args.unk_word))

# The file format is deduced from the basename of the lexicon.
basename = os.path.basename(args.lexicon_file)
if basename not in ('lexiconp.txt', 'lexiconp_silprob.txt'):
    sys.exit("{0}: expected the basename of the lexicon file to be either "
             "'lexiconp.txt' or 'lexiconp_silprob.txt', got: {1}".format(
                 sys.argv[0], args.lexicon_file))
# lexiconp.txt lines are: word pron-prob p1 p2 p3...
# lexiconp_silprob.txt lines have 3 extra real-valued fields after the
# pron-prob, i.e. 5 fields before the pronunciation.
num_fields_before_pron = 5 if basename == 'lexiconp_silprob.txt' else 2

# Echo the command line to stderr.
print(' '.join(sys.argv), file=sys.stderr)

try:
    lexicon_in = open(args.lexicon_file, 'r')
except (IOError, OSError):
    sys.exit("{0}: failed to open lexicon file {1}".format(
        sys.argv[0], args.lexicon_file))

# split_lines holds every lexicon line as a list of fields; unk_index is the
# position in split_lines of the (single) entry for the unknown word, or -1
# if it has not been seen yet.
split_lines = []
unk_index = -1
# 'with' closes the input on every exit path, including the sys.exit() calls.
with lexicon_in:
    for line in lexicon_in:
        this_split_line = line.split()
        # Check the field count before looking at field 0, so that blank or
        # truncated lines give a clean error instead of an IndexError.
        if len(this_split_line) <= num_fields_before_pron:
            sys.exit("{0}: input file {1} had a bad line (too few fields): {2}".format(
                sys.argv[0], args.lexicon_file, line.rstrip('\n')))
        if this_split_line[0] == args.unk_word:
            if unk_index != -1:
                # argument order matches the message: {1} is the word and
                # {2} is the file.
                sys.exit("{0}: expected there to be exactly one pronunciation of the "
                         "unknown word {1} in {2}, but there are more than one.".format(
                             sys.argv[0], args.unk_word, args.lexicon_file))
            unk_index = len(split_lines)
        split_lines.append(this_split_line)

if len(split_lines) == 0:
    sys.exit("{0}: read no data from lexicon file {1}.".format(
        sys.argv[0], args.lexicon_file))


if unk_index == -1:
    sys.exit("{0}: expected there to be exactly one pronunciation of the "
             "unknown word {1} in {2}, but there are none.".format(
                 sys.argv[0], args.unk_word, args.lexicon_file))

# now modify the pron: keep the fields that precede the pronunciation and
# replace the pronunciation itself with the three disambiguation symbols.
split_lines[unk_index] = split_lines[unk_index][0:num_fields_before_pron] + ['#1', '#2', '#3']


try:
    # write to the same file.
    lexicon_out = open(args.lexicon_file, 'w')
except (IOError, OSError):
    sys.exit("{0}: failed to open lexicon file {1} for writing (permissions probleM?)".format(
        sys.argv[0], args.lexicon_file))

# Buffered data may only hit the disk during a later write or at close(), so
# both are guarded: an I/O failure in either place is reported the same way.
try:
    for split_line in split_lines:
        print(' '.join(split_line), file=lexicon_out)
    lexicon_out.close()
except (IOError, OSError):
    sys.exit("{0}: failed to close lexicon file {1} after writing (disk full?)".format(
        sys.argv[0], args.lexicon_file))


================================================
FILE: egs/utils/lang/limit_arpa_unk_history.py
================================================
#!/usr/bin/env python3

# Copyright 2018    Armin Oliya
# Apache 2.0.

'''
This script takes an existing ARPA language model and limits the <unk> history
to make it suitable for downstream <unk> modeling.
This is for the case when you don't have access
to the original text corpus that is used for creating the LM.
If you do, you can use pocolm with the option --limit-unk-history=true.
This keeps the graph compact after adding the unk model.
'''

import argparse
import io
import re
import sys
from collections import defaultdict


# Command-line interface: a single positional argument giving the printed
# form of the OOV word.  The LM itself is read from stdin and written to
# stdout (see the usage string).
parser = argparse.ArgumentParser(
    usage='''utils/lang/limit_arpa_unk_history.py
    <oov-dict-entry> <input-arpa >output-arpa''',
    description='''This script takes an existing ARPA lanugage model
    and limits the <unk> history to make it suitable
    for downstream <unk> modeling.
    It supports up to 5-grams.''',
    epilog='''E.g.: gunzip -c src.arpa.gz |
    utils/lang/limit_arpa_unk_history.py "<unk>" | gzip -c >dest.arpa.gz''')

parser.add_argument('oov_dict_entry', type=str,
                    help='oov identifier, for example "<unk>"')
args = parser.parse_args()


def get_ngram_stats(old_lm_lines):
    """Parse the 'ngram N=count' header lines of an ARPA LM.

    Only the first 10 lines of 'old_lm_lines' are inspected, and N must be a
    single digit.

    Returns a tuple (max_ngrams, skip_rows, ngram_counts) where max_ngrams is
    the highest n-gram order listed in the header, skip_rows is the number of
    1-grams listed in the header, and ngram_counts is a dict (defaultdict of
    int) from n-gram order to the count given in the header.

    Exits the program if no header line is found or if the order exceeds 5.
    """
    ngram_counts = defaultdict(int)

    # Slicing (rather than indexing with range(10)) copes with inputs that
    # have fewer than 10 lines.
    for line in old_lm_lines[:10]:
        g = re.search(r"ngram (\d)=(\d+)", line)
        if g:
            ngram_counts[int(g.group(1))] = int(g.group(2))

    if len(ngram_counts) == 0:
        sys.exit("""Couldn't get counts per ngram section.
            The input doesn't seem to be a valid ARPA language model.""")

    # Take the largest order explicitly instead of relying on the order in
    # which the header lines happened to be inserted into the dict.
    max_ngrams = max(ngram_counts)
    skip_rows = ngram_counts[1]

    if max_ngrams > 5:
        sys.exit("This script supports up to 5-gram language models.")

    return max_ngrams, skip_rows, ngram_counts


def find_and_replace_unks(old_lm_lines, max_ngrams, skip_rows,
                          oov_dict_entry=None):
    """Limit the history of the OOV word in the lines of an ARPA LM.

    The first 'skip_rows' lines are copied unchanged.  For the remaining
    lines:
      * in the sections of order 3 and above, lines where the OOV word is the
        second-to-last word are removed;
      * in every section except the highest-order one, the backoff weight is
        removed from lines whose n-gram ends with the OOV word.

    Args:
        old_lm_lines: list of the lines of the input LM.
        max_ngrams: highest n-gram order of the LM.
        skip_rows: number of leading lines to copy without inspection.
        oov_dict_entry: printed form of the OOV word; defaults to the
            command-line argument (args.oov_dict_entry).

    Returns:
        (new_lm_lines, ngram_diffs): the new list of lines, and a dict
        (defaultdict of int) from n-gram order to the (negative) change in
        the number of n-grams of that order.
    """
    if oov_dict_entry is None:
        oov_dict_entry = args.oov_dict_entry
    # The OOV word is user input: escape it so it is matched literally.
    oov = re.escape(oov_dict_entry)

    ngram_diffs = defaultdict(int)
    whitespace_pattern = re.compile("[ \t]+")
    # log-prob, then 1 to 3 words, then the OOV word.
    ngram_prefix = r"[0-9.-]+(?:\s\S+){1,3}\s" + oov
    # ... followed by something that is not a backoff weight, i.e. another
    # word: the OOV word is second-to-last.
    unk_pattern = re.compile(ngram_prefix + r"\s(?!-[0-9]+\.[0-9]+).*")
    # ... followed by a backoff weight: the OOV word is the last word.  This
    # uses the same OOV word as unk_pattern rather than a literal '<unk>'.
    backoff_pattern = re.compile(ngram_prefix + r"\s-[0-9]+\.[0-9]+")
    section_headers = [(n, "\\{}-grams:".format(n))
                       for n in range(1, max_ngrams + 1)]
    passed_2grams, last_ngram = False, False
    ngram = None  # order of the section currently being read
    unk_row_count, backoff_row_count = 0, 0

    print("Upadting the language model .. ", file=sys.stderr)
    new_lm_lines = old_lm_lines[:skip_rows]

    for raw_line in old_lm_lines[skip_rows:]:
        line = raw_line.strip(" \t\r\n")

        if "\\3-grams:" in line:
            passed_2grams = True
        if "\\{}-grams:".format(max_ngrams) in line:
            last_ngram = True

        for n, header in section_headers:
            if header in line:
                ngram = n

        # remove any n-gram states of the form: foo <unk> -> X
        # that is, any n-grams of order > 2 where <unk>
        # is the second-to-last word
        # here we skip 1-gram and 2-gram sections of arpa
        if passed_2grams and unk_pattern.search(line):
            ngram_diffs[ngram] = ngram_diffs[ngram] - 1
            unk_row_count += 1
            continue

        # remove backoff probability from the lines that end with <unk>
        # for example, the -0.64 in -4.09 every <unk> -0.64
        # here we skip the last n-gram section because it
        # doesn't include backoff probabilities
        if not last_ngram:
            g_backoff = backoff_pattern.search(line)
            if g_backoff:
                # drop the last field (the backoff weight).
                fields = whitespace_pattern.split(g_backoff.group(0))[:-1]
                new_lm_lines.append(
                    fields[0] + "\t" + " ".join(fields[1:]) + "\n")
                backoff_row_count += 1
                continue

        new_lm_lines.append(line + "\n")

    print("Removed {} lines including {} as second-to-last term.".format(
        unk_row_count, oov_dict_entry), file=sys.stderr)
    print("Removed backoff probabilties from {} lines.".format(
        backoff_row_count), file=sys.stderr)

    return new_lm_lines, ngram_diffs


def read_old_lm():
    """Read the whole standard input, decoded as latin-1.

    Returns the input as a list of lines.  The wrapper around stdin is
    closed before returning.
    """
    print("Reading ARPA LM frome input stream .. ", file=sys.stderr)

    input_stream = io.TextIOWrapper(sys.stdin.buffer, encoding="latin-1")
    with input_stream:
        return input_stream.readlines()


def write_new_lm(new_lm_lines, ngram_counts, ngram_diffs):
    """Update the n-gram counts in the ARPA header and write the LM to stdout.

    Args:
        new_lm_lines: list of the lines of the new LM; header lines among the
            first 10 lines are updated in place.
        ngram_counts: dict from n-gram order to the count in the old header.
        ngram_diffs: dict from n-gram order to the change in the number of
            n-grams of that order (the values are negative).

    The lines are written to standard output encoded as latin-1; the wrapper
    around stdout is closed afterwards.
    """
    header_pattern = re.compile(r"ngram (\d)=(\d+)")

    # Slicing copes with outputs that have fewer than 10 lines.
    for i, line in enumerate(new_lm_lines[:10]):
        g = header_pattern.search(line)
        if g:
            n = int(g.group(1))
            if n in ngram_diffs:
                # ngram_diffs contains negative values
                new_num_ngrams = ngram_counts[n] + ngram_diffs[n]
                new_lm_lines[i] = "ngram {}={}\n".format(
                    n, new_num_ngrams)

    with io.TextIOWrapper(
            sys.stdout.buffer,
            encoding="latin-1") as output_stream:
        output_stream.writelines(new_lm_lines)


def main():
    """Read the LM from stdin, limit the OOV history and write it to stdout."""
    lm_lines = read_old_lm()
    max_order, header_rows, header_counts = get_ngram_stats(lm_lines)
    updated_lines, count_diffs = find_and_replace_unks(
        lm_lines, max_order, header_rows)
    write_new_lm(updated_lines, header_counts, count_diffs)


if __name__ == "__main__":
    main()


================================================
FILE: egs/utils/lang/make_kn_lm.py
================================================
#!/usr/bin/env python3

# Copyright 2016  Johns Hopkins University (Author: Daniel Povey)
#           2018  Ruizhe Huang
# Apache 2.0.

# This is an implementation of computing Kneser-Ney smoothed language model
# in the same way as srilm. This is a back-off, unmodified version of
# Kneser-Ney smoothing, which produces the same results as the following
# command (as an example) of srilm:
#
# $ ngram-count -order 4 -kn-modify-counts-at-end -ukndiscount -gt1min 0 -gt2min 0 -gt3min 0 -gt4min 0 \
# -text corpus.txt -lm lm.arpa
#
# The data structure is based on: kaldi/egs/wsj/s5/utils/lang/make_phone_lm.py
# The smoothing algorithm is based on: http://www.speech.sri.com/projects/srilm/manpages/ngram-discount.7.html

import sys
import os
import re
import io
import math
import argparse
from collections import Counter, defaultdict


# Command-line interface.  Note that the options use a single leading dash.
parser = argparse.ArgumentParser(
    description="""
    Generate kneser-ney language model as arpa format. By default,
    it will read the corpus from standard input, and output to standard output.
    """)
parser.add_argument(
    "-ngram-order", type=int, default=4, choices=list(range(2, 8)),
    help="Order of n-gram")
parser.add_argument(
    "-text", type=str, default=None,
    help="Path to the corpus file")
parser.add_argument(
    "-lm", type=str, default=None,
    help="Path to output arpa file for language models")
parser.add_argument(
    "-verbose", type=int, default=0, choices=list(range(6)),
    help="Verbose level")
args = parser.parse_args()

# For encoding-agnostic scripts, we assume byte stream as input.
# Need to be very careful about the use of strip() and split()
# in this case, because there is a latin-1 whitespace character
# (nbsp) which is part of the unicode encoding range.
# Ref: kaldi/egs/wsj/s5/utils/lang/bpe/prepend_words.py @ 69cd717
default_encoding = "latin-1"
# Characters removed from both ends of each input line, and the separator
# between the words of a line.
strip_chars = " \t\r\n"
whitespace = re.compile("[ \t]+")


class CountsForHistory:
    # This class (which is more like a struct) holds everything that is known
    # about the words seen after one particular history: their counts, the
    # distinct context words recorded for them, and the discounted
    # probabilities and back-off weights that are filled in later.  It also
    # keeps track of the total count.  It is used inside class NgramCounts.
    def __init__(self):
        # sum of all the counts added so far
        self.total_count = 0
        # word -> count of that word in this history
        self.word_to_count = defaultdict(int)
        # word -> set of context words (a set, so that each distinct context
        # is only counted once)
        self.word_to_context = defaultdict(set)
        # word -> discounted probability
        self.word_to_f = {}
        # word -> back-off weight
        self.word_to_bow = {}

    def words(self):
        # The words that have been given a count in this history.
        return self.word_to_count.keys()

    def __str__(self):
        # e.g. returns ' total=12: 3 -> 4, 4 -> 6, -1 -> 2'
        pairs = ('{0} -> {1}'.format(word, count)
                 for word, count in self.word_to_count.items())
        return ' total={0}: {1}'.format(str(self.total_count), ', '.join(pairs))

    def add_count(self, predicted_word, context_word, count):
        # Adds 'count' to the count of 'predicted_word'; 'context_word', if
        # it is not None, is recorded as one of the contexts of that word.
        assert count >= 0

        self.word_to_count[predicted_word] += count
        self.total_count += count
        if context_word is not None:
            self.word_to_context[predicted_word].add(context_word)


class NgramCounts:
    # A note on data-structure.  Words are stored as strings, and a history
    # is a tuple of words.  We store n-gram counts as an array, indexed by
    # (history-length == n-gram order minus one) (note: python calls arrays
    # "lists") of dicts from histories to counts, where each "counts" entry
    # is a CountsForHistory object.  For instance, when accumulating the
    # 4-gram count for the 'd' in the sequence 'a b c d', we'd do as follows:
    # self.counts[3][('a', 'b', 'c')].add_count('d', context_word, 1) where
    # the [3] indexes an array and the [('a', 'b', 'c')] indexes a dict.
    def __init__(self, ngram_order, bos_symbol='<s>', eos_symbol='</s>'):
        assert ngram_order >= 2

        self.ngram_order = ngram_order
        self.bos_symbol = bos_symbol
        self.eos_symbol = eos_symbol

        self.counts = []
        for n in range(ngram_order):
            self.counts.append(defaultdict(lambda: CountsForHistory()))

        self.d = []  # list of discounting factor for each order of ngram

    # adds a raw count (called while processing input data).
    # Suppose we see the sequence '6 7 8 9' and ngram_order=4, 'history'
    # would be (6,7,8) and 'predicted_word' would be 9; 'count' would be
    # 1.
    def add_count(self, history, predicted_word, context_word, count):
        self.counts[len(history)][history].add_count(predicted_word, context_word, count)

    # 'line' is a string containing a sequence of integer word-ids.
    # This function adds the un-smoothed counts from this line of text.
    def add_raw_counts_from_line(self, line):
        # Splits 'line' into words, surrounds them with the begin- and
        # end-of-sentence symbols, and adds a count of 1 for every n-gram of
        # order 1 .. ngram_order found in the resulting sequence.
        tokens = [self.bos_symbol]
        tokens.extend(whitespace.split(line))
        tokens.append(self.eos_symbol)
        num_tokens = len(tokens)

        for start in range(num_tokens):
            # n-grams starting at 'start' cannot run past the end of the line.
            longest = min(self.ngram_order, num_tokens - start)
            for n in range(1, longest + 1):
                ngram = tokens[start:start + n]
                # The word preceding the n-gram is its context; there is none
                # at the start of the line, and none is recorded for n-grams
                # of the highest order.
                if start == 0 or n == self.ngram_order:
                    context_word = None
                else:
                    context_word = tokens[start - 1]

                self.add_count(tuple(ngram[:-1]), ngram[-1], context_word, 1)

    def add_raw_counts_from_standard_input(self):
        # Reads the corpus from the standard input (treated as a byte stream,
        # see default_encoding) and adds the raw counts of every non-blank
        # line.  Prints the number of lines processed to stderr if nothing
        # was processed or if the verbose level is above 0.
        lines_processed = 0
        infile = io.TextIOWrapper(sys.stdin.buffer, encoding=default_encoding)  # byte stream as input
        for line in infile:
            line = line.strip(strip_chars)
            if line == '':
                # A blank line is skipped; it must not end the loop, or the
                # rest of the corpus would be silently ignored.
                continue
            self.add_raw_counts_from_line(line)
            lines_processed += 1
        if lines_processed == 0 or args.verbose > 0:
            print("make_kn_lm.py: processed {0} lines of input".format(lines_processed), file=sys.stderr)

    def add_raw_counts_from_file(self, filename):
        # Reads the corpus from 'filename' (opened with default_encoding) and
        # adds the raw counts of every non-blank line.  Prints the number of
        # lines processed to stderr if nothing was processed or if the
        # verbose level is above 0.
        lines_processed = 0
        with open(filename, encoding=default_encoding) as fp:
            for line in fp:
                line = line.strip(strip_chars)
                if line == '':
                    # A blank line is skipped; it must not end the loop, or
                    # the rest of the corpus would be silently ignored.
                    continue
                self.add_raw_counts_from_line(line)
                lines_processed += 1
        if lines_processed == 0 or args.verbose > 0:
            print("make_kn_lm.py: processed {0} lines of input".format(lines_processed), file=sys.stderr)

    def cal_discounting_constants(self):
        # For each order N of N-grams, we calculate discounting constant D_N = n1_N / (n1_N + 2 * n2_N),
        # where n1_N is the number of unique N-grams with count = 1 (counts-of-counts).
        # This constant is used similarly to absolute discounting.
        # Return value: d is a list of floats, where d[N+1] = D_N

        self.d = [0]  # for the lowest order, i.e., 1-gram, we do not need to discount, thus the constant is 0
                      # This is a special case: as we currently assumed having seen all vocabularies in the dictionary,
                      # but perhaps this is not the case for some other scenarios.
        for n in range(1, self.ngram_order):
            this_order_counts = self.counts[n]
            n1 = 0
            n2 = 0
            for hist, counts_for_hist in this_order_counts.items():
                stat = Counter(counts_for_hist.word_to_count.values())
                n1 += stat[1]
                n2 += stat[2]
            assert n1 + 2 * n2 > 0
            self.d.append(n1 * 1.0 / (n1 + 2 * n2))

    def cal_f(self):
        # f(a_z) is a probability distribution of word sequence a_z.
        # Typically f(a_z) is discounted to be less than the ML estimate so we have
        # some leftover probability for the z words unseen in the context (a_).
        #
        # f(a_z) = (c(a_z) - D0) / c(a_)    ;; for highest order N-grams
        # f(_z)  = (n(*_z) - D1) / n(*_*)	;; for lower order N-grams

        # highest order N-grams
        n = self.ngram_order - 1
        this_order_counts = self.counts[n]
        for hist, counts_for_hist in this_order_counts.items():
            for w, c in counts_for_hist.word_to_count.items():
                counts_for_hist.word_to_f[w] = max((c - self.d[n]), 0) * 1.0 / counts_for_hist.total_count

        # lower order N-grams
        for n in range(0, self.ngram_order - 1):
            this_order_counts = self.counts[n]
            for hist, counts_for_hist in this_order_counts.items():

                n_star_star = 0
                for w in counts_for_hist.word_to_count.keys():
                    n_star_star += len(counts_for_hist.word_to_context[w])

                if n_star_star != 0:
                    for w in counts_for_hist.word_to_count.keys():
                        n_star_z = len(counts_for_hist.word_to_context[w])
                        counts_for_hist.word_to_f[w] = max((n_star_z - self.d[n]), 0) * 1.0 / n_star_star
                else:  # patterns begin with <s>, they do not have "modified count", so use raw count instead
                    for w in counts_for_hist.word_to_count.keys():
                        n_star_z = counts_for_hist.word_to_count[w]
                        counts_for_hist.word_to_f[w] = max((n_star_z - self.d[n]), 0) * 1.0 / counts_for_hist.total_count

    def cal_bow(self):
        # Backoff weights are only necessary for ngrams which form a prefix of a longer ngram.
        # Thus, two sorts of ngrams do not have a bow:
        # 1) highest order ngram
        # 2) ngrams ending in </s>
        #
        # bow(a_) = (1 - Sum_Z1 f(a_z)) / (1 - Sum_Z1 f(_z))
        # Note that Z1 is the set of all words with c(a_z) > 0

        # highest order N-grams
        n = self.ngram_order - 1
        this_order_counts = self.counts[n]
        for hist, counts_for_hist in this_order_counts.items():
            for w in counts_for_hist.word_to_count.keys():
                counts_for_hist.word_to_bow[w] = None

        # lower order N-grams
        for n in range(0, self.ngram_order - 1):
            this_order_counts = self.counts[n]
            for hist, counts_for_hist in this_order_counts.items():
                for w in counts_for_hist.word_to_count.keys():
                    if w == self.eos_symbol:
                        counts_for_hist.word_to_bow[w] = None
                    else:
                        a_ = hist + (w,)

                        assert len(a_) < self.ngram_order
                        assert a_ in self.counts[len(a_)].keys()

                        a_counts_for_hist = self.counts[len(a_)][a_]

                        sum_z1_f_a_z = 0
                        for u in a_counts_for_hist.word_to_count.keys():
                            sum_z1_f_a_z += a_counts_for_hist.word_to_f[u]

                        sum_z1_f_z = 0
                        _ = a_[1:]
                        _counts_for_hist = self.counts[len(_)][_]
                        for u in a_counts_for_hist.word_to_count.keys():  # Should be careful here: what is Z1
                            sum_z1_f_z += _counts_for_hist.word_to_f[u]

                        counts_for_hist.word_to_bow[w] = (1.0 - sum_z1_f_a_z) / (1.0 - sum_z1_f_z)

    def print_raw_counts(self, info_string):
        # Debugging aid: prints 'info_string', then one line per n-gram with
        # its raw count, sorted in reverse order.
        print(info_string)
        rows = []
        for order_counts in self.counts:
            for hist, counts_for_hist in order_counts.items():
                for w, raw_count in counts_for_hist.word_to_count.items():
                    ngram = (" ".join(hist) + " " + w).strip(strip_chars)
                    rows.append("{0}\t{1}".format(ngram, raw_count))
        for row in sorted(rows, reverse=True):
            print(row)

    def print_modified_counts(self, info_string):
        # Debugging aid: prints 'info_string', then one line per n-gram with
        # its modified count (the number of distinct contexts recorded for
        # it), or with its raw count when it has no recorded context.  The
        # lines are sorted in reverse order.
        print(info_string)
        rows = []
        for order_counts in self.counts:
            for hist, counts_for_hist in order_counts.items():
                for w, raw_count in counts_for_hist.word_to_count.items():
                    ngram = (" ".join(hist) + " " + w).strip(strip_chars)

                    modified_count = len(counts_for_hist.word_to_context[w])
                    shown = raw_count if modified_count == 0 else modified_count
                    rows.append("{0}\t{1}".format(ngram, shown))
        for row in sorted(rows, reverse=True):
            print(row)

    def print_f(self, info_string):
        # Debugging aid: prints 'info_string', then one line per n-gram with
        # the base-10 log of its discounted probability, sorted in reverse
        # order.
        print(info_string)
        rows = []
        for order_counts in self.counts:
            for hist, counts_for_hist in order_counts.items():
                for w in counts_for_hist.word_to_count.keys():
                    ngram = (" ".join(hist) + " " + w).strip(strip_chars)

                    prob = counts_for_hist.word_to_f[w]
                    if prob == 0:  # f(<s>) is always 0
                        # a tiny value stands in for zero, whose log is undefined
                        prob = 1e-99

                    rows.append("{0}\t{1}".format(ngram, math.log(prob, 10)))
        for row in sorted(rows, reverse=True):
            print(row)

    def print_f_and_bow(self, info_string):
        # Debugging aid: prints 'info_string', then one line per n-gram with
        # the base-10 log of its discounted probability, the n-gram, and (if
        # it has one) the base-10 log of its back-off weight.  The lines are
        # sorted in reverse order.
        print(info_string)
        rows = []
        for order_counts in self.counts:
            for hist, counts_for_hist in order_counts.items():
                for w in counts_for_hist.word_to_count.keys():
                    ngram = (" ".join(hist) + " " + w).strip(strip_chars)

                    prob = counts_for_hist.word_to_f[w]
                    if prob == 0:  # f(<s>) is always 0
                        # a tiny value stands in for zero, whose log is undefined
                        prob = 1e-99

                    bow = counts_for_hist.word_to_bow[w]
                    if bow is not None:
                        row = "{1}\t{0}\t{2}".format(ngram, math.log(prob, 10), math.log(bow, 10))
                    else:
                        row = "{1}\t{0}".format(ngram, math.log(prob, 10))
                    rows.append(row)
        for row in sorted(rows, reverse=True):
            print(row)

    def print_as_arpa(self, fout=io.TextIOWrapper(sys.stdout.buffer, encoding='latin-1')):
        # print as ARPA format.

        print('\\data\\', file=fout)
        for hist_len in range(self.ngram_order):
            # print the number of n-grams.
            print('ngram {0}={1}'.format(
                hist_len + 1,
                sum([len(counts_for_hist.word_to_f) for counts_for_hist in self.counts[hist_len].values()])),
                file=fout
            )

        print('', file=fout)

        for hist_len in range(self.ngram_order):
            print('\\{0}-grams:'.format(hist_len + 1), file=fout)

            this_order_counts = self.counts[hist_len]
            for hist, counts_for_hist in this_order_counts.items():
                for word in counts_for_hist.word_to_count.keys():
                    ngram = hist + (word,)
                    prob = counts_for_hist.word_to_f[word]
                    bow = counts_for_hist.word_to_bow[word]

                    if prob == 0:  # f(<s>) is always 0
                        prob = 1e-99

                    line = '{0}\t{1}'.format('%.7f' % math.log10(prob), ' '.join(ngram))
                    if bow is not None:
                        line += '\t{0}'.format('%.7f' % math.log10(bow))
                    print(line, file=fout)
            print('', file=fout)
        print('\\end\\', file=fout)


if __name__ == "__main__":

    ngram_counts = NgramCounts(args.ngram_order)

    # Accumulate the raw counts, from the standard input unless a corpus
    # file was given.
    if args.text is None:
        ngram_counts.add_raw_counts_from_standard_input()
    else:
        # An explicit check rather than an assert: asserts are removed when
        # python runs with -O, and give the user no explanation.
        if not os.path.isfile(args.text):
            sys.exit("{0}: corpus file {1} does not exist or is not a file".format(
                sys.argv[0], args.text))
        ngram_counts.add_raw_counts_from_file(args.text)

    # Smoothing: discounting constants, then discounted probabilities, then
    # back-off weights (each step uses the results of the previous ones).
    ngram_counts.cal_discounting_constants()
    ngram_counts.cal_f()
    ngram_counts.cal_bow()

    # Write the model, to the standard output unless an output file was given.
    if args.lm is None:
        ngram_counts.print_as_arpa()
    else:
        with open(args.lm, 'w', encoding=default_encoding) as f:
            ngram_counts.print_as_arpa(fout=f)


================================================
FILE: egs/utils/lang/make_lexicon_fst.py
================================================
#!/usr/bin/env python3

# Copyright   2018  Johns Hopkins University (author: Daniel Povey)
# Apache 2.0.

# see get_args() below for usage message.
import argparse
import os
import sys
import math
import re

# The use of latin-1 encoding does not preclude reading utf-8.  latin-1
# encoding means "treat words as sequences of bytes", and it is compatible
# with utf-8 encoding as well as other encodings such as gbk, as long as the
# spaces are also spaces in ascii (which we check).  It is basically how we
# emulate the behavior of python before python3.
#
# The two statements below rebind sys.stdout and sys.stderr to new text
# streams opened on file descriptors 1 and 2 with latin-1 encoding.  Because
# of closefd=False, closing these streams leaves the underlying file
# descriptors open.
sys.stdout = open(1, 'w', encoding='latin-1', closefd=False)
sys.stderr = open(2, 'w', encoding='latin-1', closefd=False)

def get_args():
    """Parse the command line (sys.argv) and return the argparse namespace.

    The namespace has the attributes sil_phone, sil_prob, sil_disambig,
    left_context_phones, nonterminals and lexiconp.
    """
    parser = argparse.ArgumentParser(description="""This script creates the
       text form of a lexicon FST, to be compiled by fstcompile using the
       appropriate symbol tables (phones.txt and words.txt) .  It will mostly
       be invoked indirectly via utils/prepare_lang.sh.  The output goes to
       the stdout.""")

    # note: the help texts of --sil-phone and --sil-prob refer to each other
    # by the option names as they are actually spelled on the command line.
    parser.add_argument('--sil-phone', dest='sil_phone', type=str,
                        help="""Text form of optional-silence phone, e.g. 'SIL'.  See also
                        the --sil-prob option.""")
    parser.add_argument('--sil-prob', dest='sil_prob', type=float, default=0.0,
                        help="""Probability of silence between words (including at the
                        beginning and end of word sequences).  Must be in the range [0.0, 1.0].
                        This refers to the optional silence inserted by the lexicon; see
                        the --sil-phone option.""")
    parser.add_argument('--sil-disambig', dest='sil_disambig', type=str,
                        help="""Disambiguation symbol to disambiguate silence, e.g. #5.
                        Will only be supplied if you are creating the version of L.fst
                        with disambiguation symbols, intended for use with cyclic G.fst.
                        This symbol was introduced to fix a rather obscure source of
                        nondeterminism of CLG.fst, that has to do with reordering of
                        disambiguation symbols and phone symbols.""")
    parser.add_argument('--left-context-phones', dest='left_context_phones', type=str,
                        help="""Only relevant if --nonterminals is also supplied; this relates
                        to grammar decoding (see http://kaldi-asr.org/doc/grammar.html or
                        src/doc/grammar.dox).  Format is a list of left-context phones,
                        in text form, one per line.  E.g. data/lang/phones/left_context_phones.txt""")
    parser.add_argument('--nonterminals', type=str,
                        help="""If supplied, --left-context-phones must also be supplied.
                        List of user-defined nonterminal symbols such as #nonterm:contact_list,
                        one per line.  E.g. data/local/dict/nonterminals.txt.""")
    parser.add_argument('lexiconp', type=str,
                        help="""Filename of lexicon with pronunciation probabilities
                        (normally lexiconp.txt), with lines of the form 'word prob p1 p2...',
                        e.g. 'a   1.0    ay'""")
    args = parser.parse_args()
    return args


def read_lexiconp(filename):
    """Reads the lexiconp.txt file in 'filename', with lines like 'word pron-prob p1 p2 ...'.

    Returns a list of tuples (word, pron_prob, pron), where 'word' is a string,
    'pron_prob', a float, is the pronunciation probability (which must be >0.0
    and would normally be <=1.0), and 'pron' is a list of strings representing
    phones.  An element in the returned list might be
    ('hello', 1.0, ['h', 'eh', 'l', 'ow']).

    Prints a message to stderr and exits with status 1 if a line has fewer than
    two fields, if the word is '<eps>', if the 2nd field is not a number or is
    not positive, or if the file contains no pronunciations at all.  Empty
    pronunciations and pron-probs above 1.0 only produce a warning on stderr.
    """

    ans = []
    found_empty_prons = False
    found_large_pronprobs = False
    whitespace = re.compile("[ \t]+")
    # See the comment near the top of this file, RE why we use latin-1.
    with open(filename, 'r', encoding='latin-1') as f:
        for line in f:
            stripped = line.strip(" \t\r\n")
            a = whitespace.split(stripped)
            if len(a) < 2:
                print("{0}: error: found bad line '{1}' in lexicon file {2} ".format(
                    sys.argv[0], stripped, filename), file=sys.stderr)
                sys.exit(1)
            word = a[0]
            if word == "<eps>":
                # This would clash with the epsilon symbol normally used in OpenFst.
                print("{0}: error: found <eps> as a word in lexicon file "
                      "{1}".format(sys.argv[0], filename), file=sys.stderr)
                sys.exit(1)
            try:
                pron_prob = float(a[1])
            except ValueError:
                # float() on a string only fails with ValueError.
                print("{0}: error: found bad line '{1}' in lexicon file {2}, 2nd field "
                      "should be pron-prob".format(sys.argv[0], stripped, filename),
                      file=sys.stderr)
                sys.exit(1)
            prons = a[2:]
            if pron_prob <= 0.0:
                print("{0}: error: invalid pron-prob in line '{1}' of lexicon file {2} ".format(
                    sys.argv[0], stripped, filename), file=sys.stderr)
                sys.exit(1)
            if len(prons) == 0:
                found_empty_prons = True
            ans.append((word, pron_prob, prons))
            if pron_prob > 1.0:
                found_large_pronprobs = True
    if found_empty_prons:
        print("{0}: warning: found at least one word with an empty pronunciation "
              "in lexicon file {1}.".format(sys.argv[0], filename),
              file=sys.stderr)
    if found_large_pronprobs:
        print("{0}: warning: found at least one word with pron-prob >1.0 "
              "in {1}".format(sys.argv[0], filename), file=sys.stderr)

    if len(ans) == 0:
        print("{0}: error: found no pronunciations in lexicon file {1}".format(
            sys.argv[0], filename), file=sys.stderr)
        sys.exit(1)
    return ans


def write_nonterminal_arcs(start_state, loop_state, next_state,
                           nonterminals, left_context_phones):
    """Writes to stdout the extra L.fst arcs needed for grammar decoding
    (see kaldi-asr.org/doc/grammar.html#grammar_special_l).  Called from
    write_fst_no_silence and write_fst_with_silence.

       start_state: the start-state of L.fst.
       loop_state:  the state of high out-degree in L.fst where words leave
                  and enter.
       next_state: the first state number this function may allocate; two
                  states are allocated and the updated value is returned.
       nonterminals: the user-defined nonterminal symbols as a list of
          strings, e.g. ['#nonterm:contact_list', ... ].
       left_context_phones: a list of phones that may appear as left-context,
          e.g. ['a', 'ah', ... '#nonterm_bos'].
    """
    arc_format = "{src}\t{dest}\t{phone}\t{word}\t{cost}"

    def emit_arc(src, dest, phone, word, cost):
        print(arc_format.format(src=src, dest=dest, phone=phone,
                                word=word, cost=cost))

    shared_state = next_state
    final_state = next_state + 1

    # Entry into a nonterminal: start-state --#nonterm_begin--> shared state.
    emit_arc(start_state, shared_state, '#nonterm_begin', '#nonterm_begin', 0.0)

    # Invocation of a user-defined nonterminal from the word loop.
    for symbol in nonterminals:
        emit_arc(loop_state, shared_state, symbol, symbol, 0.0)

    # Numerically this is log(len(left_context_phones)); written as the
    # negated log of a uniform probability because that is what it means.
    # The cost keeps the FST stochastic (sum-to-one, like an HMM) so weight
    # pushing behaves; the grammar-FST code cancels it out when splicing
    # (see the function CombineArcs()).
    per_phone_cost = -math.log(1.0 / len(left_context_phones))
    for phone in left_context_phones:
        emit_arc(shared_state, loop_state, phone, '<eps>', per_phone_cost)

    # Exit: loop-state --#nonterm_end--> dedicated final state.
    emit_arc(loop_state, final_state, '#nonterm_end', '#nonterm_end', 0.0)
    print("{state}\t{final_cost}".format(state=final_state, final_cost=0.0))
    return final_state + 1


def write_fst_no_silence(lexicon, nonterminals=None, left_context_phones=None):
    """Writes the text format of L.fst to the standard output, for the case
    --sil-prob=0.0 (no optional silence).

      'lexicon' is a list of 3-tuples (word, pron-prob, prons) as returned by
        read_lexiconp().
      'nonterminals', which relates to grammar decoding
        (see kaldi-asr.org/doc/grammar.html), is either None, or the
        user-defined nonterminal symbols as a list of strings,
        e.g. ['#nonterm:contact_list', ... ].
      'left_context_phones', which must be supplied if 'nonterminals' is, is
        either None or a list of phones that may appear as left-context,
        e.g. ['a', 'ah', ... '#nonterm_bos'].

    State 0 is both the start state and the single loop state; it is also the
    final state.
    """
    arc_format = "{src}\t{dest}\t{phone}\t{word}\t{cost}"
    loop_state = 0
    next_state = 1  # first state number that has not been handed out yet.

    for word, pronprob, pron in lexicon:
        pron_cost = -math.log(pronprob)
        # An empty pronunciation is written as a single epsilon-input arc.
        phones = pron if len(pron) > 0 else ['<eps>']
        last_index = len(phones) - 1
        src = loop_state
        for index, phone in enumerate(phones):
            if index == last_index:
                dest = loop_state  # the final phone returns to the loop state
            else:
                dest = next_state
                next_state += 1
            # The word label and the pronunciation cost sit on the first arc.
            on_first_arc = (index == 0)
            print(arc_format.format(
                src=src,
                dest=dest,
                phone=phone,
                word=(word if on_first_arc else '<eps>'),
                cost=(pron_cost if on_first_arc else 0.0)))
            src = dest

    if nonterminals is not None:
        next_state = write_nonterminal_arcs(
            loop_state, loop_state, next_state,
            nonterminals, left_context_phones)

    print("{state}\t{final_cost}".format(
        state=loop_state,
        final_cost=0.0))


def write_fst_with_silence(lexicon, sil_prob, sil_phone, sil_disambig,
                           nonterminals=None, left_context_phones=None):
    """Writes the text format of L.fst to the standard output, for the case
       --sil-prob != 0.0 (optional silence between words).

     'lexicon' is a list of 3-tuples (word, pron-prob, prons)
         as returned by read_lexiconp().
     'sil_prob', which must be strictly between 0.0 and 1.0, is the
         probability of silence.
     'sil_phone' is the silence phone, e.g. "SIL".
     'sil_disambig' is either None, or the silence disambiguation symbol, e.g. "#5".
     'nonterminals', which relates to grammar decoding
        (see kaldi-asr.org/doc/grammar.html), is either None, or the
        user-defined nonterminal symbols as a list of strings,
        e.g. ['#nonterm:contact_list', ... ].
     'left_context_phones', which must be supplied if 'nonterminals' is, is
        either None or a list of phones that may appear as left-context,
        e.g. ['a', 'ah', ... '#nonterm_bos'].
    """
    assert 0.0 < sil_prob < 1.0
    sil_cost = -math.log(sil_prob)
    no_sil_cost = -math.log(1.0 - sil_prob)

    arc_format = '{src}\t{dest}\t{phone}\t{word}\t{cost}'

    def emit_arc(src, dest, phone, word, cost):
        print(arc_format.format(src=src, dest=dest, phone=phone,
                                word=word, cost=cost))

    start_state = 0
    loop_state = 1  # words enter and leave from here
    sil_state = 2   # words end here when followed by silence; a silence
                    # transition leads from here back to loop_state.
    next_state = 3  # first state number that has not been handed out yet.

    # From the start state, either skip or take the initial optional silence.
    emit_arc(start_state, loop_state, '<eps>', '<eps>', no_sil_cost)
    emit_arc(start_state, sil_state, '<eps>', '<eps>', sil_cost)
    if sil_disambig is not None:
        # Silence phone first, then the disambiguation symbol on its own arc.
        sil_disambig_state = next_state
        next_state += 1
        emit_arc(sil_state, sil_disambig_state, sil_phone, '<eps>', 0.0)
        emit_arc(sil_disambig_state, loop_state, sil_disambig, '<eps>', 0.0)
    else:
        emit_arc(sil_state, loop_state, sil_phone, '<eps>', 0.0)

    for word, pronprob, pron in lexicon:
        pron_cost = -math.log(pronprob)
        # An empty pronunciation is written as a single epsilon-input arc.
        phones = pron if len(pron) > 0 else ['<eps>']
        src = loop_state
        for index, phone in enumerate(phones[:-1]):
            emit_arc(src, next_state, phone,
                     word if index == 0 else '<eps>',
                     pron_cost if index == 0 else 0.0)
            src = next_state
            next_state += 1

        # The last phone forks: back to the loop state (no silence follows)
        # or to the silence state.  If it is also the first arc of the word it
        # carries the word label and the pronunciation cost as well.
        single_arc = (len(phones) == 1)
        out_word = word if single_arc else '<eps>'
        word_cost = pron_cost if single_arc else 0.0
        emit_arc(src, loop_state, phones[-1], out_word, no_sil_cost + word_cost)
        emit_arc(src, sil_state, phones[-1], out_word, sil_cost + word_cost)

    if nonterminals is not None:
        next_state = write_nonterminal_arcs(
            start_state, loop_state, next_state,
            nonterminals, left_context_phones)

    print("{state}\t{final_cost}".format(
        state=loop_state,
        final_cost=0.0))


def write_words_txt(orig_lines, highest_numbered_symbol, nonterminals, filename):
    """Writes an updated words.txt to 'filename'.

       'orig_lines' holds the lines of the original words.txt as a list of
       strings (without the newlines); they are copied through unchanged.
       'highest_numbered_symbol' is the highest symbol number in the original
       words.txt.  '#nonterm_begin', '#nonterm_end' and then each entry of
       'nonterminals' (a list of strings like '#nonterm:foo') are appended
       with consecutive numbers starting one above it."""
    with open(filename, 'w', encoding='latin-1') as out:
        for text in orig_lines:
            print(text, file=out)
        symbol_id = highest_numbered_symbol
        for name in ['#nonterm_begin', '#nonterm_end'] + nonterminals:
            symbol_id = symbol_id + 1
            print("{0} {1}".format(name, symbol_id), file=out)


def read_nonterminals(filename):
    """Reads the user-defined nonterminal symbols in 'filename', checks that
       it has the expected format and has no duplicates, and returns the nonterminal
       symbols as a list of strings, e.g.
       ['#nonterm:contact_list', '#nonterm:phone_number', ... ].

       Raises RuntimeError if the file has no lines, if any line does not
       start with '#nonterm:', or if a symbol appears more than once."""
    # Use a context manager so the file handle is closed deterministically.
    with open(filename, 'r', encoding='latin-1') as f:
        ans = [line.strip(" \t\r\n") for line in f]
    if len(ans) == 0:
        raise RuntimeError("The file {0} contains no nonterminals symbols.".format(filename))
    for nonterm in ans:
        if not nonterm.startswith('#nonterm:'):
            raise RuntimeError("In file '{0}', expected nonterminal symbols to start with '#nonterm:', found '{1}'"
                               .format(filename, nonterm))
    if len(set(ans)) != len(ans):
        raise RuntimeError("Duplicate nonterminal symbols are present in file {0}".format(filename))
    return ans

def read_left_context_phones(filename):
    """Reads, checks, and returns a list of left-context phones, in text form, one
       per line.  Returns a list of strings, e.g. ['a', 'ah', ..., '#nonterm_bos' ]

       Raises RuntimeError if the file has no lines, if a line contains more
       than one space/tab-separated field, or if a phone appears more than once."""
    # Use a context manager so the file handle is closed deterministically.
    with open(filename, 'r', encoding='latin-1') as f:
        ans = [line.strip(" \t\r\n") for line in f]
    if len(ans) == 0:
        raise RuntimeError("The file {0} contains no left-context phones.".format(filename))
    whitespace = re.compile("[ \t]+")
    for s in ans:
        if len(whitespace.split(s)) != 1:
            raise RuntimeError("The file {0} contains an invalid line '{1}'".format(filename, s))

    if len(set(ans)) != len(ans):
        raise RuntimeError("Duplicate left-context phones are present in file {0}".format(filename))
    return ans


def is_token(s):
    """Returns True if s is a str containing no space, tab, carriage return
    or newline; anything that is not a str gives False.  The empty string
    counts as space-free."""
    return isinstance(s, str) and re.search("[ \t\r\n]+", s) is None


def main():
    """Parses the command line, reads the lexicon and writes the text form of
    L.fst to the standard output.

    With --sil-prob equal to 0.0 the no-silence version is written; otherwise
    --sil-prob, --sil-phone and --sil-disambig are checked and the version
    with optional silence is written.  Invalid option combinations are
    reported on stderr and the script exits with status 1.
    """
    args = get_args()

    lexicon = read_lexiconp(args.lexiconp)

    if args.nonterminals is None:
        nonterminals, left_context_phones = None, None
    else:
        if args.left_context_phones is None:
            # stdout carries the FST itself, so diagnostics must go to stderr.
            print("{0}: if --nonterminals is specified, --left-context-phones must also "
                  "be specified".format(sys.argv[0]), file=sys.stderr)
            sys.exit(1)
        nonterminals = read_nonterminals(args.nonterminals)
        left_context_phones = read_left_context_phones(args.left_context_phones)

    if args.sil_prob == 0.0:
        write_fst_no_silence(lexicon,
                             nonterminals=nonterminals,
                             left_context_phones=left_context_phones)
    else:
        # Do some checking that the options make sense.
        if args.sil_prob < 0.0 or args.sil_prob >= 1.0:
            print("{0}: invalid value specified --sil-prob={1}".format(
                sys.argv[0], args.sil_prob), file=sys.stderr)
            sys.exit(1)

        if not is_token(args.sil_phone):
            print("{0}: you specified --sil-prob={1} but --sil-phone is set "
                  "to '{2}'".format(sys.argv[0], args.sil_prob, args.sil_phone),
                  file=sys.stderr)
            sys.exit(1)
        if args.sil_disambig is not None and not is_token(args.sil_disambig):
            print("{0}: invalid value --sil-disambig='{1}' was specified."
                  "".format(sys.argv[0], args.sil_disambig), file=sys.stderr)
            sys.exit(1)
        write_fst_with_silence(lexicon, args.sil_prob, args.sil_phone,
                               args.sil_disambig,
                               nonterminals=nonterminals,
                               left_context_phones=left_context_phones)


#    (lines, highest_symbol) = read_words_txt(args.input_words_txt)
#    nonterminals = read_nonterminals(args.nonterminal_symbols_list)
#    write_words_txt(lines, highest_symbol, nonterminals, args.output_words_txt)


# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()


================================================
FILE: egs/utils/lang/make_lexicon_fst_silprob.py
================================================
#!/usr/bin/env python3
# Copyright   2018  Johns Hopkins University (author: Daniel Povey)
#             2018  Jiedan Zhu
# Apache 2.0.
# see get_args() below for usage message.

import argparse
import os
import sys
import math
import re

# The use of latin-1 encoding does not preclude reading utf-8.  latin-1
# encoding means "treat words as sequences of bytes", and it is compatible
# with utf-8 encoding as well as other encodings such as gbk, as long as the
# spaces are also spaces in ascii (which we check).  It is basically how we
# emulate the behavior of python before python3.

# Re-wrap file descriptors 1 (stdout) and 2 (stderr) as latin-1 text streams,
# the same encoding used below when reading the input files.  closefd=False
# leaves the underlying descriptors open when these wrapper objects are closed.
sys.stdout = open(1, 'w', encoding='latin-1', closefd=False)
sys.stderr = open(2, 'w', encoding='latin-1', closefd=False)


def get_args():
    """Builds the command-line parser for this script and parses sys.argv.

    Returns the argparse namespace, with attributes sil_phone, sil_disambig
    (default '<eps>'), lexiconp, silprobs, left_context_phones and
    nonterminals.
    """
    description = """This script creates the
       text form of a lexicon FST, to be compiled by fstcompile using the
       appropriate symbol tables (phones.txt and words.txt) .  It will mostly
       be invoked indirectly via utils/prepare_lang.sh.  The output goes to
       the stdout.

       This version is for a lexicon with word-specific silence probabilities,
       see http://www.danielpovey.com/files/2015_interspeech_silprob.pdf
       for an explanation"""
    arg_parser = argparse.ArgumentParser(description=description)

    # For the long options argparse derives the attribute name from the option
    # string ('--sil-phone' -> 'sil_phone'), so no explicit dest is needed.
    # The two positionals must stay in this order: lexiconp, then silprobs.
    arg_parser.add_argument('--sil-phone', type=str,
                        help="Text form of optional-silence phone, e.g. 'SIL'.")
    arg_parser.add_argument('--sil-disambig', type=str, default="<eps>",
                        help="""Disambiguation symbol to disambiguate silence, e.g. #5.
                        Will only be supplied if you are creating the version of L.fst
                        with disambiguation symbols, intended for use with cyclic G.fst.
                        This symbol was introduced to fix a rather obscure source of
                        nondeterminism of CLG.fst, that has to do with reordering of
                        disambiguation symbols and phone symbols.""")
    arg_parser.add_argument('lexiconp', type=str,
                        help="""Filename of lexicon with pronunciation probabilities
                        (normally lexiconp.txt), with lines of the form
                        'word pron-prob prob-of-sil correction-term-for-sil correction-term-for-no-sil p1 p2...',
                        e.g. 'a   1.0  0.8 1.2  0.6  ay'""")
    arg_parser.add_argument('silprobs', type=str,
                        help="""Filename with silence probabilities, with lines of the form
                        '<s> p(sil-after|<s>) //
                        </s>_s correction-term-for-sil-for-</s> //
                        </s>_n correction-term-for-no-sil-for-</s> //
                        overall p(overall-sil), where // represents line break.
                        See also utils/dict_dir_add_pronprobs.sh,
                        which creates this file as silprob.txt.""")
    arg_parser.add_argument('--left-context-phones', type=str,
                        help="""Only relevant if --nonterminals is also supplied; this relates
                        to grammar decoding (see http://kaldi-asr.org/doc/grammar.html or
                        src/doc/grammar.dox).  Format is a list of left-context phones,
                        in text form, one per line.  E.g. data/lang/phones/left_context_phones.txt""")
    arg_parser.add_argument('--nonterminals', type=str,
                        help="""If supplied, --left-context-phones must also be supplied.
                        List of user-defined nonterminal symbols such as #nonterm:contact_list,
                        one per line.  E.g. data/local/dict/nonterminals.txt.""")

    return arg_parser.parse_args()


def read_silprobs(filename):
    """ Reads the silprobs file (e.g. silprobs.txt) which will have a format like this:
     <s> 0.99
     </s>_s 2.50607106867326
     </s>_n 0.00653829808100956
     overall 0.20
    and returns it as a 4-tuple, e.g. in this example (0.99, 2.50, 0.006, 0.20),
    in the order (<s>, </s>_s, </s>_n, overall).

    Prints a message to stderr and exits with status 1 if a line does not
    have exactly two fields, has an unknown label or a non-numeric value, or
    if, after reading, the '<s>' or 'overall' value is not in (0.0, 1.0] or
    either correction term is not positive (this also catches missing labels).
    """
    # -1 marks "not seen yet"; it fails the range check after the loop, so a
    # missing label is reported as an error.
    values = {"<s>": -1, "</s>_s": -1, "</s>_n": -1, "overall": -1}
    whitespace = re.compile("[ \t]+")
    with open(filename, 'r', encoding='latin-1') as f:
        for line in f:
            stripped = line.strip(" \t\r\n")
            a = whitespace.split(stripped)
            if len(a) != 2:
                print("{0}: error: found bad line '{1}' in silprobs file {2} ".format(
                    sys.argv[0], stripped, filename), file=sys.stderr)
                sys.exit(1)
            label = a[0]
            try:
                value = float(a[1])
            except ValueError:
                value = None
            if value is None or label not in values:
                print("{0}: error: found bad line '{1}' in silprobs file {2}"
                      .format(sys.argv[0], stripped, filename),
                      file=sys.stderr)
                sys.exit(1)
            values[label] = value

    silbeginprob = values["<s>"]
    silendcorrection = values["</s>_s"]
    nonsilendcorrection = values["</s>_n"]
    # The overall probability is validated and returned for completeness.
    siloverallprob = values["overall"]
    if (silbeginprob <= 0.0 or silbeginprob > 1.0 or
            silendcorrection <= 0.0 or nonsilendcorrection <= 0.0 or
            siloverallprob <= 0.0 or siloverallprob > 1.0):
        print("{0}: error: prob is not correct in silprobs file {1}."
              .format(sys.argv[0], filename), file=sys.stderr)
        sys.exit(1)
    return (silbeginprob, silendcorrection, nonsilendcorrection, siloverallprob)


def read_lexiconp(filename):
    """Reads the lexiconp.txt file in 'filename', with lines like
    'word p(pronunciation|word) p(sil-after|word) correction-term-for-sil
    correction-term-for-no-sil p1 p2 ...'.
    Returns a list of tuples (word, pron_prob, word_sil_prob,
    sil_word_correction, non_sil_word_correction, prons), where 'word' is a string,
    'pron_prob', a float, is the pronunciation probability (which must be >0.0
    and would normally be <=1.0), 'word_sil_prob' is a float,
    'sil_word_correction' is a float, 'non_sil_word_correction' is a float,
    and 'pron' is a list of strings representing phones.
    An element in the returned list might be
    ('hello', 1.0, 0.5, 0.3, 0.6, ['h', 'eh', 'l', 'ow']).

    Prints a message to stderr and exits with status 1 if a line has fewer
    than the five mandatory fields, if the word is '<eps>', if fields 2-5 are
    not numbers, if the pron-prob is not positive, or if the file contains no
    pronunciations.  Empty pronunciations and pron-probs above 1.0 only
    produce a warning on stderr.
    """
    ans = []
    found_empty_prons = False
    found_large_pronprobs = False
    # See the comment near the top of this file, RE why we use latin-1.
    whitespace = re.compile("[ \t]+")
    with open(filename, 'r', encoding='latin-1') as f:
        for line in f:
            stripped = line.strip(" \t\r\n")
            a = whitespace.split(stripped)
            # The word plus four numeric fields are mandatory; only the phone
            # sequence after them may be empty.
            if len(a) < 5:
                print("{0}: error: found bad line '{1}' in lexicon file {2} ".format(
                    sys.argv[0], stripped, filename), file=sys.stderr)
                sys.exit(1)
            word = a[0]
            if word == "<eps>":
                # This would clash with the epsilon symbol normally used in OpenFst.
                print("{0}: error: found <eps> as a word in lexicon file "
                      "{1}".format(sys.argv[0], filename), file=sys.stderr)
                sys.exit(1)
            try:
                pron_prob = float(a[1])
                word_sil_prob = float(a[2])
                sil_word_correction = float(a[3])
                non_sil_word_correction = float(a[4])
            except ValueError:
                print("{0}: error: found bad line '{1}' in lexicon file {2}, 2nd field "
                      "through 5th field should be numbers".format(sys.argv[0],
                                                                   stripped, filename),
                      file=sys.stderr)
                sys.exit(1)
            prons = a[5:]
            if pron_prob <= 0.0:
                print("{0}: error: invalid pron-prob in line '{1}' of lexicon file {2} ".format(
                    sys.argv[0], stripped, filename), file=sys.stderr)
                sys.exit(1)
            if len(prons) == 0:
                found_empty_prons = True
            ans.append((
                word, pron_prob, word_sil_prob,
                sil_word_correction, non_sil_word_correction, prons))
            if pron_prob > 1.0:
                found_large_pronprobs = True
    if found_empty_prons:
        print("{0}: warning: found at least one word with an empty pronunciation "
              "in lexicon file {1}.".format(sys.argv[0], filename),
              file=sys.stderr)
    if found_large_pronprobs:
        print("{0}: warning: found at least one word with pron-prob >1.0 "
              "in {1}".format(sys.argv[0], filename), file=sys.stderr)
    if len(ans) == 0:
        print("{0}: error: found no pronunciations in lexicon file {1}".format(
            sys.argv[0], filename), file=sys.stderr)
        sys.exit(1)
    return ans


def write_nonterminal_arcs(start_state, sil_state, non_sil_state,
                           next_state, sil_phone,
                           nonterminals, left_context_phones):
    """Writes to stdout the extra L.fst arcs needed for grammar decoding
    (see kaldi-asr.org/doc/grammar.html#grammar_special_l).  Called from
    write_fst.

       start_state: the start-state of L.fst.
       sil_state:  the state of high out-degree in L.fst where words leave
                   when preceded by optional silence
       non_sil_state:   the state of high out-degree in L.fst where words leave
                   when not preceded by optional silence
       next_state: the first state number this function may allocate; two
                  states are allocated and the updated value is returned.
       sil_phone:  the optional-silence phone (a string, e.g 'sil')
       nonterminals: the user-defined nonterminal symbols as a list of
          strings, e.g. ['#nonterm:contact_list', ... ].
       left_context_phones: a list of phones that may appear as left-context,
          e.g. ['a', 'ah', ... '#nonterm_bos'].
    """
    arc_format = "{src}\t{dest}\t{phone}\t{word}\t{cost}"

    def emit_arc(src, dest, phone, word, cost):
        print(arc_format.format(src=src, dest=dest, phone=phone,
                                word=word, cost=cost))

    shared_state = next_state
    final_state = next_state + 1
    word_states = [sil_state, non_sil_state]

    emit_arc(start_state, shared_state, '#nonterm_begin', '#nonterm_begin', 0.0)

    # All nonterminal arcs lead to the one 'shared_state' rather than to
    # separate states for silence / nonsilence / unknown left-context.  That
    # is a simplification: separate states would model the interaction with
    # sil-probs better in the awkward case where word-position-dependent
    # phones are not used and some words end in the optional-silence phone.
    for symbol in nonterminals:
        for origin in word_states:
            emit_arc(origin, shared_state, symbol, symbol, 0.0)

    # Numerically this is log(len(left_context_phones)); written as the
    # negated log of a uniform probability because that is what it means.
    # The cost keeps the FST stochastic (sum-to-one, like an HMM) so weight
    # pushing behaves; the grammar-FST code cancels it out when splicing
    # (see the function CombineArcs()).
    per_phone_cost = -math.log(1.0 / len(left_context_phones))

    for phone in left_context_phones:
        # Interaction with the silence probabilities: a silence left-context
        # continues from sil_state, anything else from non_sil_state.  This is
        # not always right for systems without word-position-dependent phones
        # (--position-dependent-phones false to prepare_lang.sh) that have
        # words ending in the optional-silence phone.
        if phone == sil_phone:
            target = sil_state
        else:
            target = non_sil_state
        emit_arc(shared_state, target, phone, '<eps>', per_phone_cost)

    # #nonterm_end arcs cost nothing: taking one does not really end the
    # sequence, it just hands over to sil_state or non_sil_state of the FST
    # one level up, which is selected by the left-context arcs above.
    for origin in word_states:
        emit_arc(origin, final_state, '#nonterm_end', '#nonterm_end', 0.0)
    print("{state}\t{final_cost}".format(
        state=final_state, final_cost=0.0))
    return final_state + 1

def write_fst(lexicon, silprobs, sil_phone, sil_disambig,
              nonterminals = None, left_context_phones = None):
    """Writes the text format of L.fst (or L_disambig.fst) to the standard output.

     'lexicon' is a list of 6-tuples
     (word, pronprob, wordsilprob, silwordcorrection, nonsilwordcorrection, pron)
         as returned by read_lexiconp().
     'silprobs' is a 4-tuple of probabilities as returned by read_silprobs();
         its last element (the overall silence probability) is not used here.
     'sil_phone' is the silence phone, e.g. "SIL".
     'sil_disambig' is either '<eps>', or the silence disambiguation symbol, e.g. "#5".
     'nonterminals', which relates to grammar decoding
        (see kaldi-asr.org/doc/grammar.html), is either None, or the
        user-defined nonterminal symbols as a list of strings,
        e.g. ['#nonterm:contact_list', ... ].
     'left_context_phones', which must be supplied if 'nonterminals' is, is
        either None or a list of phones that may appear as left-context,
        e.g. ['a', 'ah', ... '#nonterm_bos'].
    """
    silbeginprob, silendcorrection, nonsilendcorrection, _ = silprobs
    initial_sil_cost = -math.log(silbeginprob)
    initial_non_sil_cost = -math.log(1.0 - silbeginprob)
    sil_end_correction_cost = -math.log(silendcorrection)
    non_sil_end_correction_cost = -math.log(nonsilendcorrection)

    start_state = 0
    non_sil_state = 1  # words leave from here when not preceded by silence
    sil_state = 2      # words leave from here when preceded by silence
    next_state = 3     # first state number that has not been handed out yet.

    def emit_arc(src, dest, phone, word, cost):
        print('{src}\t{dest}\t{phone}\t{word}\t{cost}'.format(
            src=src, dest=dest, phone=phone, word=word, cost=cost))

    # Start state to the two word-entry states.  The silence disambiguation
    # symbol always rides on the *non*-silence arc, which avoids having to
    # introduce extra arcs for it.
    emit_arc(start_state, non_sil_state, sil_disambig, '<eps>', initial_non_sil_cost)
    emit_arc(start_state, sil_state, sil_phone, '<eps>', initial_sil_cost)

    for entry in lexicon:
        (word, pronprob, wordsilprob,
         silwordcorrection, nonsilwordcorrection, pron) = entry
        pron_cost = -math.log(pronprob)
        word_to_sil_cost = -math.log(wordsilprob)
        word_to_non_sil_cost = -math.log(1.0 - wordsilprob)
        sil_to_word_cost = -math.log(silwordcorrection)
        non_sil_to_word_cost = -math.log(nonsilwordcorrection)

        # An empty pronunciation is not really expected; handle it gracefully
        # by treating it as a single epsilon phone.
        phones = pron if len(pron) != 0 else ['<eps>']

        # Both entry states reach the word's first state on the first phone,
        # carrying the word label.
        first_state = next_state
        next_state += 1
        emit_arc(non_sil_state, first_state, phones[0], word,
                 pron_cost + non_sil_to_word_cost)
        emit_arc(sil_state, first_state, phones[0], word,
                 pron_cost + sil_to_word_cost)

        # Remaining phones form a chain of weightless arcs (no cost field).
        tail_state = first_state
        for phone in phones[1:]:
            print("{src}\t{dest}\t{phone}\t<eps>".format(
                src=tail_state, dest=next_state, phone=phone))
            tail_state = next_state
            next_state += 1

        # Return to the nonsilence state (carrying the silence-disambig
        # symbol, which saves inserting an epsilon) or to the silence state.
        emit_arc(tail_state, non_sil_state, sil_disambig, '<eps>',
                 word_to_non_sil_cost)
        emit_arc(tail_state, sil_state, sil_phone, '<eps>',
                 word_to_sil_cost)

    if nonterminals is not None:
        next_state = write_nonterminal_arcs(
            start_state, sil_state, non_sil_state,
            next_state, sil_phone,
            nonterminals, left_context_phones)

    # Both word-entry states are final, each with its end-of-sentence correction.
    print('{src}\t{cost}'.format(src=sil_state, cost=sil_end_correction_cost))
    print('{src}\t{cost}'.format(src=non_sil_state, cost=non_sil_end_correction_cost))

def read_nonterminals(filename):
    """Reads the user-defined nonterminal symbols in 'filename', checks that
       it has the expected format and has no duplicates, and returns the nonterminal
       symbols as a list of strings, e.g.
       ['#nonterm:contact_list', '#nonterm:phone_number', ... ].

       Raises RuntimeError if the file is empty, if any line does not start
       with '#nonterm:', or if the same symbol appears more than once."""
    # Use a context manager so that the file handle is closed as soon as we
    # have read the lines, rather than whenever the garbage collector runs.
    with open(filename, 'r', encoding='latin-1') as f:
        ans = [line.strip(" \t\r\n") for line in f]
    if len(ans) == 0:
        raise RuntimeError("The file {0} contains no nonterminals symbols.".format(filename))
    for nonterm in ans:
        if not nonterm.startswith('#nonterm:'):
            raise RuntimeError("In file '{0}', expected nonterminal symbols to start with '#nonterm:', found '{1}'"
                               .format(filename, nonterm))
    # A set drops repeated entries, so a size mismatch means duplicates.
    if len(set(ans)) != len(ans):
        raise RuntimeError("Duplicate nonterminal symbols are present in file {0}".format(filename))
    return ans

def read_left_context_phones(filename):
    """Reads, checks, and returns a list of left-context phones, in text form, one
       per line.  Returns a list of strings, e.g. ['a', 'ah', ..., '#nonterm_bos' ]

       Raises RuntimeError if the file is empty, if any line does not consist
       of exactly one whitespace-free token, or if a phone is listed twice."""
    # Use a context manager so that the file handle is closed as soon as we
    # have read the lines, rather than whenever the garbage collector runs.
    with open(filename, 'r', encoding='latin-1') as f:
        ans = [line.strip(" \t\r\n") for line in f]
    if len(ans) == 0:
        raise RuntimeError("The file {0} contains no left-context phones.".format(filename))
    for s in ans:
        # Each line must hold exactly one token (this also rejects blank lines).
        if len(s.split()) != 1:
            raise RuntimeError("The file {0} contains an invalid line '{1}'".format(filename, s))

    # A set drops repeated entries, so a size mismatch means duplicates.
    if len(set(ans)) != len(ans):
        raise RuntimeError("Duplicate left-context phones are present in file {0}".format(filename))
    return ans


def main():
    """Script entry point: parses the command line, loads the inputs via
       read_silprobs() and read_lexiconp() (plus the optional nonterminal and
       left-context-phone lists), and calls write_fst() to emit the FST.

       Exits with status 1 if --nonterminals is given without
       --left-context-phones."""
    args = get_args()
    silprobs = read_silprobs(args.silprobs)
    lexicon = read_lexiconp(args.lexiconp)

    if args.nonterminals is None:
        nonterminals, left_context_phones = None, None
    else:
        if args.left_context_phones is None:
            # The FST text is written with print() to stdout, so the
            # diagnostic must not go there; sys.exit() with a string writes
            # it to stderr and exits with status 1.
            sys.exit("{0}: if --nonterminals is specified, --left-context-phones must also "
                     "be specified".format(sys.argv[0]))
        nonterminals = read_nonterminals(args.nonterminals)
        left_context_phones = read_left_context_phones(args.left_context_phones)

    write_fst(lexicon, silprobs, args.sil_phone, args.sil_disambig,
              nonterminals, left_context_phones)


# Run main() only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == '__main__':
      main()


================================================
FILE: egs/utils/lang/make_phone_bigram_lang.sh
================================================
#!/usr/bin/env bash

# Apache 2.0.  Copyright 2012, Johns Hopkins University (author: Daniel Povey)

# This script creates a "lang" directory of the "testing" type (including G.fst)
# given an existing "alignment" directory and an existing "lang" directory.
# The directory contains only single-phone words, and a bigram language model that
# is built without smoothing, on top of single phones.  The point of no smoothing
# is to limit the number of transitions, so we can decode reasonably fast, and the
# graph won't blow up.  This is probably going to be most useful for things like
# language-id.
#
#  See also steps/make_phone_graph.sh


echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;


# Exactly three positional arguments are required.
if [ $# != 3 ]; then
  echo "Usage: $0: [options] <lang-dir> <ali-dir> <output-lang-dir>"
  echo "e.g.: $0: data/lang exp/tri3b_ali data/lang_phone_bg"
  exit 1;
fi

lang=$1
alidir=$2
lang_out=$3

# Check that the required input files exist before creating any output.
for f in $lang/phones.txt $alidir/ali.1.gz; do
  [ ! -f $f ] && echo "Expected file $f to exist" && exit 1;
done

mkdir -p $lang_out || exit 1;

grep -v '#' $lang/phones.txt >  $lang_out/phones.txt # no disambig symbols
      # needed; G and L . G will be deterministic.
# Copy the topology and the phones/ directory from the input lang directory,
# replacing any phones/ directory already present in the output.
cp $lang/topo $lang_out
rm -r $lang_out/phones 2>/dev/null
cp -r $lang/phones/ $lang_out/
rm $lang_out/phones/word_boundary.* 2>/dev/null # these would
  # no longer be valid.
rm $lang_out/phones/wdisambig* 2>/dev/null  # ditto this.

# List of disambig symbols will be empty: not needed, since G.fst and L.fst * G.fst
# are determinizable without any.
echo -n > $lang_out/phones/disambig.txt
echo -n > $lang_out/phones/disambig.int
echo -n > $lang_out/phones/disambig.csl
echo -n > $lang_out/phones/wdisambig.txt
echo -n > $lang_out/phones/wdisambig_phones.int
echo -n > $lang_out/phones/wdisambig_words.int

# Let OOV symbol be the first phone.  This is arbitrary, it's just
# so that validate_lang.pl succeeds.  We should never actually use
# this.
# (The 'tail -n +2 | head -n 1' picks the second line of phones.txt, i.e. it
# skips the first entry of the symbol table.)
oov_sym=$(tail -n +2 $lang_out/phones.txt | head -n 1 | awk '{print $1}')
oov_int=$(tail -n +2 $lang_out/phones.txt | head -n 1 | awk '{print $2}')
echo $oov_sym > $lang_out/oov.txt
echo $oov_int > $lang_out/oov.int


# Get phone-level transcripts of training data and create a
# language model.
# The perl code below accumulates bigram counts over the phone sequences (with
# <s> and </s> added), then writes a text-form acceptor to its stdout: one
# state per history symbol (state 0 is <s>), arcs as "src dest label cost" and
# final-probs as "src cost", where cost = -log(count / history-count).  Only
# bigrams that were actually seen get an arc (no smoothing).
ali-to-phones $alidir/final.mdl "ark:gunzip -c $alidir/ali.*.gz|" ark,t:- | \
  perl -e 'while(<>) {
    @A = split(" ", $_);
    shift @A; # Remove the utterance-id.
    foreach $p ( @A ) { $phones{$p} = 1; } # assoc. array of phones.
    unshift @A, "<s>";
    push @A, "</s>";
    for ($n = 0; $n+1 < @A; $n++) {
      $p = $A[$n]; $q = $A[$n+1];
      $count{$p,$q}++;
      $histcount{$p}++;
    }
  }
  @phones = keys %phones;
  unshift @phones, "<s>";
  # @phones is now all real phones, plus <s>.
  for ($n = 0; $n < @phones; $n++) {
    $phn2state{$phones[$n]} = $n;
  }
  foreach $p (@phones) {
    $src = $phn2state{$p};
    $hist = $histcount{$p};
    $hist > 0 || die;
    foreach $q (@phones) {
      $c = $count{$p,$q};
      if (defined $c) {
        $cost = -log($c / $hist); # cost on FST arc.
        $dest = $phn2state{$q};
        print "$src $dest $q $cost\n";  # Note: q is actually numeric.
      }
    }
    $c = $count{$p,"</s>"};
    if (defined $c) {
      $cost = -log($c / $hist); # cost on FST arc.
      print "$src $cost\n"; # final-prob.
    }
  } ' | fstcompile --acceptor=true | \
    fstarcsort --sort_type=ilabel > $lang_out/G.fst

# symbols for phones and words are the same.
# Neither has disambig symbols.
cp $lang_out/phones.txt $lang_out/words.txt

# L.fst has a single state (state 0, which is also final) with one self-loop
# per phone, mapping the phone's integer id to the same integer id.
grep -v '<eps>' $lang_out/phones.txt | awk '{printf("0 0 %s %s\n", $2, $2);} END{print("0 0.0");}' | \
   fstcompile  > $lang_out/L.fst

# note: first two fields of align_lexicon.txt are interpreted as the word; the remaining
# fields are the phones that are in the pron of the word.  These are all the same, for us.
for p in $(grep -v '<eps>' $lang_out/phones.txt | awk '{print $1}'); do echo $p $p $p; done > $lang_out/phones/align_lexicon.txt

# just use one sym2int.pl command, since phones.txt and words.txt are identical.
utils/sym2int.pl $lang_out/phones.txt <$lang_out/phones/align_lexicon.txt >$lang_out/phones/align_lexicon.int

# L and L_disambig are the same.
cp $lang_out/L.fst $lang_out/L_disambig.fst

# Final sanity check of the generated directory; fail the script if it is invalid.
utils/validate_lang.pl --skip-disambig-check $lang_out || exit 1;


================================================
FILE: egs/utils/lang/make_phone_lm.py
================================================
#!/usr/bin/env python

# Copyright 2016  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.

from __future__ import print_function
from __future__ import division
import sys
import argparse
import math
from collections import defaultdict

# note, this was originally based

# Command-line interface.  Note that the arguments are parsed at module level,
# and the resulting 'args' object is read as a global by the classes below.
parser = argparse.ArgumentParser(description="""
This script creates a language model that's intended to be used in modeling
phone sequences (either of sentences or of dictionary entries), although of
course it will work for any type of data.  The easiest way
to describe it is as a Kneser-Ney language model (unmodified, with addition)
with a fixed discounting constant equal to 1, except with no smoothing of the
bigrams (and hence no unigram state).  This is (a) because we want to keep the
graph after context expansion small, (b) because languages tend to have
constraints on which phones can follow each other, and (c) in order to get valid
sequences of word-position-dependent phones so that lattice-align-words can
work.  It also includes a special entropy-based pruning technique that
backs off the statistics of pruned n-grams to lower-order states.

This script reads lines from its standard input, each
consisting of a sequence of integer symbol-ids (which should be > 0),
representing the phone sequences of a sentence or dictionary entry.
This script outputs a backoff language model in FST format.""",
                                 epilog="See also utils/lang/make_phone_bigram_lang.sh")


parser.add_argument("--phone-disambig-symbol", type = int, required = False,
                    help = "Integer corresponding to an otherwise-unused "
                    "phone-level disambiguation symbol (e.g. #5).  This is "
                    "inserted at the beginning of the phone sequence and "
                    "whenever we back off.")
# The help text refers to --num-extra-ngrams, which is the option defined
# below that controls pruning.
parser.add_argument("--ngram-order", type = int, default = 4,
                    choices = [2,3,4,5,6,7],
                    help = "Order of n-gram to use (but see also --num-extra-ngrams; "
                    "the effective order after pruning may be less).")
parser.add_argument("--num-extra-ngrams", type = int, default = 20000,
                    help = "Target number of n-grams in addition to the n-grams in "
                    "the bigram LM states which can't be pruned away.  n-grams "
                    "will be pruned to reach this target.")
parser.add_argument("--no-backoff-ngram-order", type = int, default = 2,
                    choices = [1,2,3,4,5],
                    help = "This specifies the n-gram order at which (and below which) "
                    "no backoff or pruning should be done.  This is expected to normally "
                    "be bigram, but for testing purposes you may want to set it to "
                    "1.")
parser.add_argument("--print-as-arpa", type = str, default = "false",
                    choices = ["true", "false"],
                    help = "If true, print LM in ARPA format (default is to print "
                    "as FST).  You must also set --no-backoff-ngram-order=1 or "
                    "this is not allowed.")
parser.add_argument("--verbose", type = int, default = 0,
                    choices=[0,1,2,3,4,5], help = "Verbose level")

args = parser.parse_args()

# Echo the command line to stderr for logging when running verbosely.
if args.verbose >= 1:
    print(' '.join(sys.argv), file = sys.stderr)


class CountsForHistory(object):
    ## Struct-like container holding the counts observed in one particular
    ## history-state; it is used inside class NgramCounts.  Conceptually it
    ## is a dict from word (int) to count, which additionally maintains the
    ## sum of all counts in 'total_count'.
    def __init__(self):
        # Words that have no entry in word_to_count implicitly have count
        # zero (defaultdict(int) supplies 0 for a missing key).
        self.total_count = 0
        self.word_to_count = defaultdict(int)

    def Words(self):
        # Returns a snapshot (list) of the words that currently have an entry,
        # so callers may modify the counts while looping over the result.
        return list(self.word_to_count)

    def __str__(self):
        # e.g. returns ' total=12 3 -> 4 4 -> 6 -1 -> 2'
        pairs = []
        for word, count in self.word_to_count.items():
            pairs.append('{0} -> {1}'.format(word, count))
        return ' total={0} {1}'.format(str(self.total_count), ' '.join(pairs))


    ## Adds 'count' (an integer, possibly negative) to the count of
    ## 'predicted_word' and to the running total.  An entry whose count ends
    ## up at exactly zero is removed from word_to_count.
    ## [Elsewhere, zero-count entries are sometimes written directly into
    ## word_to_count where the presence of an n-gram is structurally required
    ## by the arpa: if a higher-order history state has a count for a word,
    ## the states it backs off to need an entry for that word too.]
    def AddCount(self, predicted_word, count):
        self.total_count += count
        assert self.total_count >= 0
        previous = self.word_to_count[predicted_word]
        updated = previous + count
        if updated < 0:
            # Print some diagnostics before the assertion below fires.
            print("predicted-word={0}, old-count={1}, count={2}".format(
                    predicted_word, previous, count))
        assert updated >= 0
        if updated != 0:
            self.word_to_count[predicted_word] = updated
        else:
            del self.word_to_count[predicted_word]

class NgramCounts(object):
    ## A note on data-structure.  Firstly, all words are represented as
    ## integers.  We store n-gram counts as an array, indexed by (history-length
    ## == n-gram order minus one) (note: python calls arrays "lists") of dicts
    ## from histories to counts, where histories are arrays of integers and
    ## "counts" are dicts from integer to float.  For instance, when
    ## accumulating the 4-gram count for the '8' in the sequence '5 6 7 8', we'd
    ## do as follows: self.counts[3][[5,6,7]][8] += 1.0 where the [3] indexes an
    ## array, the [[5,6,7]] indexes a dict, and the [8] indexes a dict.
    def __init__(self, ngram_order):
        assert ngram_order >= 2
        # Integerized counts will never contain negative numbers, so
        # inside this program, we use -3 and -2 for the BOS and EOS symbols
        # respectively.
        # Note: it's actually important that the bos-symbol is the most negative;
        # it helps ensure that we print the state with left-context <s> first
        # when we print the FST, and this means that the start-state will have
        # the correct value.
        self.bos_symbol = -3
        self.eos_symbol = -2
        # backoff_symbol is kind of a pseudo-word, it's used in keeping track of
        # the backoff counts in each state.
        self.backoff_symbol = -1
        self.total_num_words = 0  # count includes EOS but not BOS.
        self.counts = []
        for n in range(ngram_order):
            self.counts.append(defaultdict(lambda: CountsForHistory()))

    # adds a raw count (called while processing input data).
    # Suppose we see the sequence '6 7 8 9' and ngram_order=4, 'history'
    # would be (6,7,8) and 'predicted_word' would be 9; 'count' would be
    # 1.
    def AddCount(self, history, predicted_word, count):
        self.counts[len(history)][history].AddCount(predicted_word, count)


    # 'line' is a string containing a sequence of integer word-ids.
    # This function adds the un-smoothed counts from this line of text.
    def AddRawCountsFromLine(self, line):
        try:
            words = [self.bos_symbol] + [ int(x) for x in line.split() ] + [self.eos_symbol]
        except:
            sys.exit("make_phone_lm.py: bad input line {0} (expected a sequence "
                     "of integers)".format(line))

        for n in range(1, len(words)):
            predicted_word = words[n]
            history_start = max(0, n + 1 - args.ngram_order)
            history = tuple(words[history_start:n])
            self.AddCount(history, predicted_word, 1)
            self.total_num_words += 1

    def AddRawCountsFromStandardInput(self):
        # Reads standard input line by line until EOF (readline() returning
        # the empty string) and accumulates the raw counts from each line.
        num_lines = 0
        for line in iter(sys.stdin.readline, ''):
            self.AddRawCountsFromLine(line)
            num_lines += 1
        # Always report when there was no input at all; otherwise only report
        # when running verbosely.
        if num_lines == 0 or args.verbose > 0:
            print("make_phone_lm.py: processed {0} lines of input".format(
                    num_lines), file = sys.stderr)


    # This backs off the counts by subtracting 1 and assigning the subtracted
    # count to the backoff state.  It's like a special case of Kneser-Ney with D
    # = 1.  The optimal D would likely be something like 0.9, but we plan to
    # later do entropy-pruning, and the remaining small counts of 0.1 would
    # essentially all get pruned away anyway, so we don't lose much by doing it
    # like this.
    def ApplyBackoff(self):
        # Discounts every n-gram in the history-states that are subject to
        # backoff by 1, giving the discounted mass to the backoff symbol of
        # the same state and crediting 1 to the same word in the next
        # lower-order state.
        #
        # Note: in the normal case where args.no_backoff_ngram_order == 2,
        # history-length 1 (i.e. bigrams) is not backed off; this is a special
        # kind of LM that never backs off to unigram.
        if args.verbose >= 1:
            initial_num_ngrams = self.GetNumNgrams()
        # Go from the longest history length down to no_backoff_ngram_order.
        for hist_len in range(args.ngram_order - 1,
                              args.no_backoff_ngram_order - 1, -1):
            lower_order_counts = self.counts[hist_len - 1]
            for hist, state in self.counts[hist_len].items():
                backoff_state = lower_order_counts[hist[1:]]
                # Words() returns a copy, which matters because AddCount()
                # removes entries whose count reaches zero.
                words = state.Words()
                for word in words:
                    state.AddCount(word, -1)
                    # This can be seen as incrementing the count-of-counts for
                    # the next-lower order; but since counts of n-grams that
                    # are removed later also get added to the lower-order
                    # state, the results are not strictly counts-of-counts.
                    backoff_state.AddCount(word, 1)
                # One unit was discounted per word.
                state.AddCount(self.backoff_symbol, len(words))

        if args.verbose >= 1:
            # Note: because D == 1, we completely back off singletons.
            print("make_phone_lm.py: ApplyBackoff() reduced the num-ngrams from "
                  "{0} to {1}".format(initial_num_ngrams, self.GetNumNgrams()),
                  file = sys.stderr)


    # This function prints out to stderr the n-gram counts stored in this
    # object; it's used for debugging.
    def Print(self, info_string):
        print(info_string, file=sys.stderr)
        # these are useful for debug.
        total = 0.0
        total_excluding_backoff = 0.0
        for this_order_counts in self.counts:
            for hist, counts_for_hist in this_order_counts.items():
                print(str(hist) + str(counts_for_hist), file = sys.stderr)
                total += counts_for_hist.total_count
                total_excluding_backoff += counts_for_hist.total_count
                if self.backoff_symbol in counts_for_hist.word_to_count:
                    total_excluding_backoff -= counts_for_hist.word_to_count[self.backoff_symbol]
        print('total count = {0}, excluding backoff = {1}'.format(
                total, total_excluding_backoff), file = sys.stderr)

    def GetHistToStateMap(self):
        # Called from PrintAsFst.  Returns a dict from history (tuple) to
        # integer FST-state; states are numbered consecutively from 0, going
        # through the history lengths in increasing order.
        all_histories = []
        for n in range(0, args.ngram_order):
            all_histories.extend(self.counts[n].keys())
        return dict((hist, state) for state, hist in enumerate(all_histories))

    # Returns the probability of word 'word' in history-state 'hist'.
    # If 'word' is self.backoff_symbol, returns the backoff prob
    # of this history-state.
    # Returns None if there is no such word in this history-state, or this
    # history-state does not exist.
    def GetProb(self, hist, word):
        # Returns the probability of 'word' in history-state 'hist', including
        # the part obtained via backoff to lower-order states.  If 'word' is
        # self.backoff_symbol, returns the backoff prob of the state.  Returns
        # None if the history-state does not exist or has no entry for 'word'.
        # Exits the program if the state has a backoff count but no
        # lower-order state can supply a probability for 'word'.
        if len(hist) >= args.ngram_order or hist not in self.counts[len(hist)]:
            return None
        counts_for_hist = self.counts[len(hist)][hist]
        total_count = float(counts_for_hist.total_count)
        if word not in counts_for_hist.word_to_count:
            print("make_phone_lm.py: no prob for {0} -> {1} "
                  "[no such count]".format(hist, word),
                  file = sys.stderr)
            return None
        # Direct (non-backoff) part of the probability.
        prob = float(counts_for_hist.word_to_count[word]) / total_count
        if len(hist) > 0 and word != self.backoff_symbol and \
          self.backoff_symbol in counts_for_hist.word_to_count:
            prob_in_backoff = self.GetProb(hist[1:], word)
            if prob_in_backoff is None:
                # Test for the failure explicitly rather than wrapping the
                # arithmetic below in a bare 'except:', which would also
                # swallow KeyboardInterrupt and unrelated errors.
                sys.exit("problem, hist is {0}, word is {1}".format(hist, word))
            backoff_prob = float(counts_for_hist.word_to_count[self.backoff_symbol]) / total_count
            prob += backoff_prob * prob_in_backoff
        return prob

    def PruneEmptyStates(self):
        # Removes history-states that have no counts other than the backoff
        # count.

        # It's possible in principle for history-states to have no counts and
        # yet they cannot be pruned away because a higher-order version of the
        # state exists with nonzero counts, so we have to keep track of this.
        protected_histories = set()

        states_removed_per_hist_len = [ 0 ] * args.ngram_order

        for n in reversed(list(range(args.no_backoff_ngram_order,
                                args.ngram_order))):
            num_states_removed = 0
            # Iterate over a snapshot of the items: entries are deleted from
            # self.counts[n] inside the loop, and on Python 3 changing a dict's
            # size while iterating over its live view raises RuntimeError.
            for hist, counts_for_hist in list(self.counts[n].items()):
                num_words = len(counts_for_hist.word_to_count)
                assert num_words > 0 and self.backoff_symbol in counts_for_hist.word_to_count
                if num_words == 1 and hist not in protected_histories:  # only the backoff symbol has a count.
                    del self.counts[n][hist]
                    num_states_removed += 1
                else:
                    # if this state was not pruned away, then the state that
                    # it backs off to may not be pruned away either.
                    backoff_hist = hist[1:]
                    protected_histories.add(backoff_hist)
            states_removed_per_hist_len[n] = num_states_removed
        if args.verbose >= 1:
            print("make_phone_lm.py: in PruneEmptyStates(), num states removed for "
                  "each history-length was: " + str(states_removed_per_hist_len),
                  file = sys.stderr)

    def EnsureStructurallyNeededNgramsExist(self):
        # Makes sure that if an n-gram like (6, 7, 8) -> 9 exists, then
        # entries also exist for the n-grams it structurally depends on, by
        # adding zero counts where entries were absent.  Only history lengths
        # >= args.no_backoff_ngram_order are touched.
        # [note: () -> 9 is guaranteed anyway by the backoff method, if
        # we have a unigram state].
        if args.verbose >= 1:
            num_ngrams_initial = self.GetNumNgrams()
        min_hist_len = args.no_backoff_ngram_order
        for n in range(args.ngram_order - 1, min_hist_len - 1, -1):
            for hist, counts_for_hist in self.counts[n].items():
                # Backed-off versions: for (6, 7, 8) -> 9, make sure that, say,
                # (7, 8) -> 9 and (8) -> 9 exist.  Each step shifts one element
                # off the front of the history.
                shorter_hist = hist
                for m in range(n - 1, min_hist_len - 1, -1):
                    shorter_hist = shorter_hist[1:]
                    lower_word_to_count = self.counts[m][shorter_hist].word_to_count
                    for word in counts_for_hist.word_to_count.keys():
                        # '+= 0' creates the entry if it is missing and
                        # leaves an existing count unchanged.
                        lower_word_to_count[word] += 0
                # Prefixes: for (6, 7, 8) -> 9, make sure that, say,
                # (6, 7) -> 8 and (6) -> 7 exist.  This is needed for FST
                # representations of the ARPA LM.  Each step pops one element
                # off the end of the history.
                shorter_hist = hist
                for m in range(n - 1, min_hist_len - 1, -1):
                    last_word = shorter_hist[-1]
                    shorter_hist = shorter_hist[:-1]
                    self.counts[m][shorter_hist].word_to_count[last_word] += 0
        if args.verbose >= 1:
            print("make_phone_lm.py: in EnsureStructurallyNeededNgramsExist(), "
                  "added {0} n-grams".format(self.GetNumNgrams() - num_ngrams_initial),
                  file = sys.stderr)


    # This function prints the estimated language model as an FST.
    def PrintAsFst(self, word_disambig_symbol):
        # Prints the LM to stdout as an FST in text form: word arcs as
        # 'src dest word word cost', final-probs as 'src cost', and backoff
        # arcs as 'src dest word_disambig_symbol 0 cost'.
        #
        # n below is the history-length (== n-gram order minus one).  The
        # history-lengths are visited in the order 1, 0, 2, 3, ..., and for
        # history-length 1 the histories are visited in sorted order.  Since
        # self.bos_symbol is the most negative symbol, this ensures that the
        # bigram state with <s> as the left context is printed first.

        # Maps each history (a tuple) to its integer FST-state.
        hist_to_state = self.GetHistToStateMap()

        hist_lengths = [ 1, 0 ] + list(range(2, args.ngram_order))
        for n in hist_lengths:
            counts_of_order = self.counts[n]
            if n == 1:
                histories = sorted(counts_of_order.keys())
            else:
                histories = counts_of_order.keys()
            for hist in histories:
                src_state = hist_to_state[hist]
                for word in counts_of_order[hist].word_to_count.keys():
                    # Costs in OpenFst are negative logs.
                    cost = -math.log(self.GetProb(hist, word))

                    if word > 0: # a real word.
                        # The destination is the longest suffix of
                        # (hist + word) that exists as a state.
                        dest_hist = hist + (word,)
                        while dest_hist not in hist_to_state:
                            dest_hist = dest_hist[1:]
                        print(src_state, hist_to_state[dest_hist], word, word,
                              cost)
                        continue
                    if word == self.eos_symbol:
                        # final-prob for this state.
                        print(src_state, cost)
                        continue
                    assert word == self.backoff_symbol
                    # Backoff arc to the state of the history with its first
                    # element removed.
                    print(src_state, hist_to_state[hist[1:]],
                          word_disambig_symbol, 0, cost)

    # This function returns a set of n-grams that cannot currently be pruned
    # away, either because a higher-order form of the same n-gram already exists,
    # or because the n-gram leads to an n-gram state that exists.
    # [Note: as we prune, we remove any states that can be removed; see that
    # PruneToIntermediateTarget() calls PruneEmptyStates().

    def GetProtectedNgrams(self):
        # Returns the set of n-grams (as tuples of history + word) that are
        # protected because of the history-states of length greater than
        # args.no_backoff_ngram_order that currently exist.
        protected = set()
        min_hist_len = args.no_backoff_ngram_order
        for n in range(min_hist_len + 1, args.ngram_order):
            # Number of elements that can be removed from a history of length
            # n before reaching length min_hist_len.
            num_reductions = n - min_hist_len
            for hist, counts_for_hist in self.counts[n].items():
                # For an n-gram (6, 7, 8) -> 9, add the backed-off n-grams
                # such as (7, 8) -> 9 and (8) -> 9; each step shifts one
                # element off the front of the history.
                suffix = hist
                for _ in range(num_reductions):
                    suffix = suffix[1:]
                    for word in counts_for_hist.word_to_count.keys():
                        if word == self.backoff_symbol:
                            continue
                        protected.add(suffix + (word,))
                # For a history-state (6, 7, 8), protect the n-grams
                # (6, 7, 8) and (6, 7), so that the FST states stay
                # accessible; each step pops one element off the end.
                prefix = hist
                for _ in range(num_reductions):
                    protected.add(prefix)
                    prefix = prefix[:-1]
        return protected

    def PruneNgram(self, hist, word):
        counts_for_hist = self.counts[len(hist)][hist]
        assert word != self.backoff_symbol and word in counts_for_hist.word_to_count
        count = counts_for_hist.word_to_count[word]
        del counts_for_hist.word_to_count[word]
        counts_for_hist.word_to_count[self.backoff_symbol] += count
        # the next call adds the count to the symbol 'word' in the backoff
        # history-state, and also updates its 'total_count'.
        self.counts[len(hist) - 1][hist[1:]].AddCount(word, count)

    # The function PruningLogprobChange is the same as the same-named
    # function in float-counts-prune.cc in pocolm.  Note, it doesn't access
    # any class members.

    # This function computes the log-likelihood change (<= 0) from backing off
    # a particular symbol to the lower-order state.
    # The value it returns can be interpreted as a lower bound on the actual log-likelihood
    # change.  By "the actual log-likelihood change" we mean of data generated by
    # the model itself before making the change, then modeled with the changed model
    # [and comparing the log-like with the log-like before changing the model].  That is,
    # it's a K-L divergence, but with the caveat that we don't normalize by the
    # overall count of the data, so it's a K-L divergence multiplied by the training-data
    # count.

    #  'count' is the count of the word (call it 'a') in this state.  It's an integer.
    #  'discount' is the discount-count in this state (represented as the count
    #         for the symbol self.backoff_symbol).  It's an integer.
    #  [note: we don't care about the total-count in this state, it cancels out.]
    #  'backoff_count' is the count of word 'a' in the lower-order state.
    #                 [actually it is the augmented count, treating any
    #                  extra probability from even-lower-order states as
    #                  if it were a count].  It's a float.
    #  'backoff_total' is the total count in the lower-order state.  It's a float.
    def PruningLogprobChange(self, count, discount, backoff_count, backoff_total):
        if count == 0:
            return 0.0

        assert discount > 0 and backoff_total >= backoff_count and backoff_total >= 0.99 * discount


        # augmented_count is like 'count', but with the extra count for symbol
        # 'a' due to backoff included.
        augmented_count = count + discount * backoff_count / backoff_total

        # We imagine a phantom symbol 'b' that represents all symbols other than
        # 'a' appearing in this history-state that are accessed via backoff.  We
        # treat these as being distinct symbols from the same symbol if accessed
        # not-via-backoff.  (Treating same symbols as distinct gives an upper bound
        # on the divergence).  We also treat them as distinct from the same symbols
        # that are being accessed via backoff from other states.  b_count is the
        # observed count of symbol 'b' in this state (the backed-off count is
        # zero).  b_count is also the count of symbol 'b' in the backoff state.
        # Note: b_count will not be negative because backoff_total >= backoff_count.
        b_count = discount * ((backoff_total - backoff_count) / backoff_total)
        assert b_count >= -0.001 * backoff_total

        # We imagine a phantom symbol 'c' that represents all symbols other than
        # 'a' and 'b' appearing in the backoff state, which got there from
        # backing off other states (other than 'this' state).  Again, we imagine
        # the symbols are distinct even though they may not be (i.e. that c and
        # b represent disjoint sets of symbol, even though they might not really
        # be disjoint), and this gives us an upper bound on the divergence.
        c_count = backoff_total - backoff_count - b_count
        assert c_count >= -0.001 * backoff_total

        # a_other is the count of 'a' in the backoff state that comes from
        # 'other sources', i.e. it was backed off from history-states other than
        # the current history state.
        a_other_count = backoff_count - discount * backoff_count / backoff_total
        assert a_other_count >= -0.001 * backoff_count

        # the following sub-expressions are the 'new' versions of certain
        # quantities after we assign the total count 'count' to backoff.  it
        # increases the backoff count in 'this' state, and also the total count
        # in the backoff state, and the count of symbol 'a' in the backoff
        # state.
        new_backoff_count = backoff_count + count  # new count of symbol 'a' in
                                                    # backoff state
        new_backoff_total = backoff_total + count  # new total count in
                                                    # backoff state.
        new_discount = discount + count  # new discount-count in 'this' state.


        # all the loglike changes below are of the form
        # count-of-symbol * log(new prob / old prob)
        # which can be more conveniently written (by canceling the denominators),
        # count-of-symbol * log(new count / old count).

        # this_a_change is the log-like change of symbol 'a' coming from 'this'
        # state.  bear in mind that
        # augmented_count = count + discount * backoff_count / backoff_total,
        # and the 'count' term is zero in the numerator part of the log expression,
        # because symbol 'a' is completely backed off in 'this' state.
        this_a_change = augmented_count * \
            math.log((new_discount * new_backoff_count / new_backoff_total)/ \
                         augmented_count)

        # other_a_change is the log-like change of symbol 'a' coming from all
        # other states than 'this'.  For speed reasons we don't examine the
        # direct (non-backoff) counts of symbol 'a' in all other states than
        # 'this' that back off to the backoff state-- it would be slower.
        # Instead we just treat the direct part of the prob for symbol 'a' as a
        # distinct symbol when it comes from those other states... as usual,
        # doing so gives us an upper bound on the divergence.
        other_a_change = \
            a_other_count * math.log((new_backoff_count / new_backoff_total) / \
                                         (backoff_count / backoff_total)) 

        # b_change is the log-like change of phantom symbol 'b' coming from
        # 'this' state (and note: it only comes from this state, that's how we
        # defined it).
        # note: the expression below could be more directly written as a
        # ratio of pseudo-counts as follows, by converting the backoff probabilities
        # into pseudo-counts in 'this' state:
        #  b_count * logf((new_discount * b_count / new_backoff_total) /
        #                 (discount * b_count / backoff_total),
        # but we cancel b_count to give us the expression below.
        b_change = b_count * math.log((new_discount / new_backoff_total) / \
                                          (discount / backoff_total))

        # c_change is the log-like change of phantom symbol 'c' coming from
        # all other states that back off to the backoff sate (and all prob. mass of
        # 'c' comes from those other states).  The expression below could be more
        # directly written as a ratio of counts, as c_count * logf((c_count /
        # new_backoff_total) / (c_count / backoff_total)), but we simplified it to
        # the expression below.
        c_change = c_count * math.log(backoff_total / new_backoff_total)

        ans = this_a_change + other_a_change + b_change + c_change
        # the answer should not be positive.
        assert ans <= 0.0001 * (count + discount + backoff_count + backoff_total)
        if args.verbose >= 4:
            print("pruning-logprob-change for {0},{1},{2},{3} is {4}".format(
                    count, discount, backoff_count, backoff_total, ans),
                  file = sys.stderr)
        return ans


    def GetLikeChangeFromPruningNgram(self, hist, word):
        """Return the log-likelihood change from pruning the n-gram hist -> word.

        Gathers the four quantities that PruningLogprobChange() needs and
        returns its result:
          count          the count of 'word' in history-state 'hist';
          discount       the count stored for the backoff symbol in 'hist';
          backoff_count  a pseudo-count of 'word' in the backoff history-state
                         hist[1:], computed as GetProb(hist[1:], word) times
                         that state's total count so that further levels of
                         backoff are accounted for;
          backoff_total  the total count of the backoff history-state.

        'word' must not be the backoff symbol and must have a count in 'hist'.
        If computing the backoff pseudo-count fails, a message is written to
        stderr and the program exits with status 1.
        """
        counts_for_hist = self.counts[len(hist)][hist]
        counts_for_backoff_hist = self.counts[len(hist) - 1][hist[1:]]
        assert word != self.backoff_symbol and word in counts_for_hist.word_to_count
        count = counts_for_hist.word_to_count[word]
        discount = counts_for_hist.word_to_count[self.backoff_symbol]
        backoff_total = counts_for_backoff_hist.total_count
        # backoff_count is a pseudo-count: it's like the count of 'word' in the
        # backoff history-state, but adding something to account for further
        # levels of backoff.
        try:
            backoff_count = self.GetProb(hist[1:], word) * backoff_total
        except Exception:
            # Only ordinary errors are reported here; KeyboardInterrupt and
            # SystemExit are deliberately left to propagate unchanged.
            print("problem getting backoff count: hist = {0}, word = {1}".format(hist, word),
                  file = sys.stderr)
            sys.exit(1)

        return self.PruningLogprobChange(float(count), float(discount),
                                         backoff_count, float(backoff_total))

    # note: returns loglike change per word.
    def PruneToIntermediateTarget(self, num_extra_ngrams):
        """Prune n-grams until 'num_extra_ngrams' extra n-grams remain.

        Every non-backoff n-gram whose history length lies in
        range(args.no_backoff_ngram_order, args.ngram_order) and which is not
        in self.GetProtectedNgrams() is a pruning candidate.  Candidates are
        scored with GetLikeChangeFromPruningNgram() and removed with
        PruneNgram(), least-negative likelihood change first; afterwards
        PruneEmptyStates() is called.

        num_extra_ngrams must be smaller than self.GetNumExtraNgrams().  If
        fewer candidates exist than would need to be pruned, all candidates
        are pruned and a message is written to stderr.

        Returns the summed log-likelihood change of the pruned n-grams divided
        by self.total_num_words (i.e. the change per word).
        """
        protected_ngrams = self.GetProtectedNgrams()
        initial_num_extra_ngrams = self.GetNumExtraNgrams()
        num_ngrams_to_prune = initial_num_extra_ngrams - num_extra_ngrams
        assert num_ngrams_to_prune > 0

        num_candidates_per_order = [ 0 ] * args.ngram_order
        num_pruned_per_order = [ 0 ] * args.ngram_order


        # like_change_and_ngrams this will be a list of tuples consisting
        # of the likelihood change as a float and then the words of the n-gram
        # that we're considering pruning,
        # e.g. (-0.164, 7, 8, 9)
        # meaning that pruning the n-gram (7, 8) -> 9 leads to
        # a likelihood change of -0.164.  We'll later sort this list
        # so we can prune the n-grams that made the least-negative
        # likelihood change.
        like_change_and_ngrams = []
        for n in range(args.no_backoff_ngram_order, args.ngram_order):
            for hist, counts_for_hist in self.counts[n].items():
                for word in counts_for_hist.word_to_count:
                    if word == self.backoff_symbol:
                        continue
                    if hist + (word,) not in protected_ngrams:
                        like_change = self.GetLikeChangeFromPruningNgram(hist, word)
                        like_change_and_ngrams.append((like_change,) + hist + (word,))
                        num_candidates_per_order[len(hist)] += 1

        like_change_and_ngrams.sort(reverse = True)

        if num_ngrams_to_prune > len(like_change_and_ngrams):
            print('make_phone_lm.py: aimed to prune {0} n-grams but could only '
                  'prune {1}'.format(num_ngrams_to_prune, len(like_change_and_ngrams)),
                  file = sys.stderr)
            num_ngrams_to_prune = len(like_change_and_ngrams)

        total_loglike_change = 0.0

        for i in range(num_ngrams_to_prune):
            total_loglike_change += like_change_and_ngrams[i][0]
            hist = like_change_and_ngrams[i][1:-1]  # all but 1st and last elements
            word = like_change_and_ngrams[i][-1]  # last element
            num_pruned_per_order[len(hist)] += 1
            self.PruneNgram(hist, word)

        like_change_per_word = total_loglike_change / self.total_num_words

        if args.verbose >= 1:
            # The threshold is the likelihood change of the last n-gram pruned.
            # When nothing could be pruned there is no such n-gram (the
            # candidate list is empty), so report 0.0 rather than indexing [-1].
            effective_threshold = (like_change_and_ngrams[num_ngrams_to_prune - 1][0]
                                   if num_ngrams_to_prune > 0 else 0.0)
            print("Pruned from {0} ngrams to {1}, with threshold {2}.  Candidates per order were {3}, "
                  "num-ngrams pruned per order were {4}.  Like-change per word was {5}".format(
                    initial_num_extra_ngrams,
                    initial_num_extra_ngrams - num_ngrams_to_prune,
                    '%.4f' % effective_threshold,
                    num_candidates_per_order,
                    num_pruned_per_order,
                    like_change_per_word), file = sys.stderr)

        if args.verbose >= 3:
            print("Pruning: like_change_and_ngrams is:\n" +
                  '\n'.join([str(x) for x in like_change_and_ngrams[:num_ngrams_to_prune]]) +
                  "\n-------- stop pruning here: ----------\n" +
                  '\n'.join([str(x) for x in like_change_and_ngrams[num_ngrams_to_prune:]]),
                  file = sys.stderr)
            self.Print("Counts after pruning to num-extra-ngrams={0}".format(
                    initial_num_extra_ngrams - num_ngrams_to_prune))

        self.PruneEmptyStates()
        if args.verbose >= 3:
            # Print this object's own counts rather than relying on a
            # module-level variable that happens to reference the same object.
            self.Print("Counts after removing empty states [inside pruning algorithm]:")
        return like_change_per_word


    def PruneToFinalTarget(self, num_extra_ngrams):
        """Prune down to 'num_extra_ngrams' extra n-grams in several passes.

        'Extra' n-grams are the ones counted by GetNumExtraNgrams().  Instead
        of pruning straight to the target, a decreasing sequence of
        intermediate targets is built and PruneToIntermediateTarget() is run
        for each.  Pruning gradually matters because some n-grams only become
        prunable after others are gone (they lead to a state that must be
        kept, or another n-gram backs off to them).

        Does nothing except print a message to stderr if the target is not
        below the current number of extra n-grams.
        """
        current_num_extra_ngrams = self.GetNumExtraNgrams()

        if num_extra_ngrams >= current_num_extra_ngrams:
            print('make_phone_lm.py: not pruning since target num-extra-ngrams={0} is >= '
                  'current num-extra-ngrams={1}'.format(num_extra_ngrams, current_num_extra_ngrams),
                  file=sys.stderr)
            return

        # Build the targets from the final one upwards: up to two steps of
        # factor 1.1, then up to two steps of factor 1.2 ...
        targets = [num_extra_ngrams]
        for growth in (1.1, 1.1, 1.2, 1.2):
            candidate = int((targets[-1] + 1) * growth)
            if candidate < current_num_extra_ngrams:
                targets.append(candidate)
        # ... then steps of factor 1.3 until the current size is reached.
        candidate = int((targets[-1] + 1) * 1.3)
        while candidate < current_num_extra_ngrams:
            targets.append(candidate)
            candidate = int((targets[-1] + 1) * 1.3)

        # Unique targets only, largest first, so the final target comes last.
        target_sequence = sorted(set(targets), reverse = True)

        print('make_phone_lm.py: current num-extra-ngrams={0}, pruning with '
              'following sequence of targets: {1}'.format(current_num_extra_ngrams,
                                                          target_sequence),
              file = sys.stderr)

        total_like_change_per_word = 0.0
        for intermediate_target in target_sequence:
            total_like_change_per_word += self.PruneToIntermediateTarget(intermediate_target)

        if args.verbose >= 1:
            print('make_phone_lm.py: K-L divergence from pruning (upper bound) is '
                  '%.4f' % total_like_change_per_word, file = sys.stderr)


    # returns the number of n-grams on top of those that can't be pruned away
    # because their order is <= args.no_backoff_ngram_order.
    def GetNumExtraNgrams(self):
        """Return the number of n-grams with a prunable history length.

        Sums GetNumNgrams(hist_len) over hist_len in
        range(args.no_backoff_ngram_order, args.ngram_order); the n-gram order
        that corresponds to a history length is hist_len + 1.
        """
        prunable_hist_lens = range(args.no_backoff_ngram_order, args.ngram_order)
        return sum(self.GetNumNgrams(hist_len) for hist_len in prunable_hist_lens)


    def GetNumNgrams(self, hist_len = None):
        """Return the number of n-grams, for one history length or for all.

        With hist_len given, counts the entries of every history-state of that
        length, leaving out the backoff symbol because it does not produce an
        n-gram line of its own.  With hist_len omitted, returns the sum over
        all history lengths in range(args.ngram_order).  The n-gram order for a
        history length is hist_len + 1.
        """
        if hist_len is None:
            return sum(self.GetNumNgrams(n) for n in range(args.ngram_order))

        total = 0
        for state in self.counts[hist_len].values():
            num_entries = len(state.word_to_count)
            if self.backoff_symbol in state.word_to_count:
                num_entries -= 1  # the backoff symbol is not an n-gram.
            total += num_entries
        return total


    # this function, used in PrintAsArpa, converts an integer to
    # a string by either printing it as a string, or for self.bos_symbol
    # and self.eos_symbol, printing them as "<s>" and "</s>" respectively.
    def IntToString(self, i):
        """Return the ARPA text form of symbol 'i'.

        self.bos_symbol maps to "<s>" and self.eos_symbol to "</s>"; any other
        symbol is converted with str().  The backoff symbol has no text form
        and must not be passed in.
        """
        if i == self.bos_symbol:
            return '<s>'
        if i == self.eos_symbol:
            return '</s>'
        assert i != self.backoff_symbol
        return str(i)


    def PrintAsArpa(self):
        """Print the language model to standard output in ARPA format.

        Requires args.no_backoff_ngram_order == 1, since without unigrams the
        model could not be written as ARPA.  The output is the \\data\\ header
        with one 'ngram N=count' line per order, then one section per order,
        then \\end\\.  Each n-gram line holds the log10 probability, the words,
        and, when GetProb() returns a backoff probability for that n-gram as a
        history, the log10 backoff probability.
        """
        assert args.no_backoff_ngram_order == 1  # without unigrams we couldn't
                                                 # print as ARPA format.
        hist_lens = range(args.ngram_order)

        print('\\data\\')
        for hist_len in hist_lens:
            # The 1-gram section gets one extra entry for <s>, which is printed
            # with prob -99 so that there is a place to put its backoff prob.
            num_ngrams = self.GetNumNgrams(hist_len)
            if hist_len == 0:
                num_ngrams += 1
            print('ngram {0}={1}'.format(hist_len + 1, num_ngrams))

        print('')

        for hist_len in hist_lens:
            print('\\{0}-grams:'.format(hist_len + 1))

            if hist_len == 0:
                # fake n-gram for <s>, only there to carry its backoff prob.
                bos_backoff_prob = self.GetProb((self.bos_symbol,), self.backoff_symbol)
                if bos_backoff_prob is not None:
                    print('-99\t<s>\t{0}'.format('%.5f' % math.log10(bos_backoff_prob)))

            for hist, state in self.counts[hist_len].items():
                for word in state.word_to_count:
                    if word == self.backoff_symbol:
                        continue
                    ngram = hist + (word,)
                    prob = self.GetProb(hist, word)
                    assert prob is not None and prob > 0
                    backoff_prob = self.GetProb(ngram, self.backoff_symbol)
                    line = '{0}\t{1}'.format('%.5f' % math.log10(prob),
                                             ' '.join(self.IntToString(x) for x in ngram))
                    if backoff_prob is not None:
                        line += '\t{0}'.format('%.5f' % math.log10(backoff_prob))
                    print(line)
            print('')
        print('\\end\\')


# Main program: read the raw n-gram counts from standard input, apply backoff
# (the discounting step), prune to the requested number of extra n-grams, and
# write the result as ARPA or as an FST.  With --verbose >= 3 the counts are
# printed after every stage.
ngram_counts = NgramCounts(args.ngram_order)
ngram_counts.AddRawCountsFromStandardInput()

if args.verbose >= 3:
    ngram_counts.Print("Raw counts:")
ngram_counts.ApplyBackoff()
if args.verbose >= 3:
    ngram_counts.Print("Counts after applying Kneser-Ney discounting:")
ngram_counts.EnsureStructurallyNeededNgramsExist()
if args.verbose >= 3:
    ngram_counts.Print("Counts after adding structurally-needed n-grams (1st time):")
ngram_counts.PruneEmptyStates()
if args.verbose >= 3:
    ngram_counts.Print("Counts after removing empty states:")
ngram_counts.PruneToFinalTarget(args.num_extra_ngrams)

# Structurally-needed n-grams are ensured a second time, after pruning.
ngram_counts.EnsureStructurallyNeededNgramsExist()
if args.verbose >= 3:
    ngram_counts.Print("Counts after adding structurally-needed n-grams (2nd time):")


if args.print_as_arpa == "true":
    ngram_counts.PrintAsArpa()
else:
    # The FST output needs the disambiguation symbol; ARPA output does not.
    if args.phone_disambig_symbol is None:
        sys.exit("make_phone_lm.py: --phone-disambig-symbol must be provided (unless "
                 "you are writing as ARPA)")
    ngram_counts.PrintAsFst(args.phone_disambig_symbol)


## Below are some little test commands that can be used to look at the detailed stats
## for a kind of sanity check.
# test command:
# (echo 6 7 8 4; echo 7 8 9; echo 7 8; echo 7 4; echo 8 4 ) | utils/lang/make_phone_lm.py --phone-disambig-symbol=400  --verbose=3
#  (echo 6 7 8 4; echo 7 8 9; echo 7 8; echo 7 4; echo 8 4 ) | utils/lang/make_phone_lm.py --phone-disambig-symbol=400  --verbose=3 --num-extra-ngrams=0
# (echo 6 7 8 4; echo 6 7 ) | utils/lang/make_phone_lm.py --print-as-arpa=true --no-backoff-ngram-order=1  --verbose=3


## The following shows how we created some data suitable to do comparisons with
## other language modeling toolkits.  Note: we're running in a configuration
## where --no-backoff-ngram-order=1 (i.e. we have a unigram LM state) because
## it's the only way to get perplexity calculations and to write an ARPA file.
##
# cd egs/tedlium/s5_r2
# . ./path.sh
# mkdir -p lm_test
# ali-to-phones exp/tri3/final.mdl "ark:gunzip -c exp/tri3/ali.*.gz|" ark,t:-  | awk '{$1 = ""; print}' > lm_test/phone_seqs
# wc lm_test/phone_seqs
# 92464  8409563 27953288 lm_test/phone_seqs
# head -n 20000 lm_test/phone_seqs > lm_test/train.txt
# tail -n 1000 lm_test/phone_seqs > lm_test/test.txt

## This shows make_phone_lm.py with the default number of extra-lm-states (20k)
## You have to have SRILM on your path to get perplexities [note: it should be on the
## path if you installed it and you sourced the tedlium s5b path.sh, as above.]
# utils/lang/make_phone_lm.py --print-as-arpa=true --no-backoff-ngram-order=1 --verbose=1 < lm_test/train.txt > lm_test/arpa_pr20k
# ngram -order 4 -unk -lm lm_test/arpa_pr20k -ppl lm_test/test.txt
# file lm_test/test.txt: 1000 sentences, 86489 words, 3 OOVs
# 0 zeroprobs, logprob= -80130.1 ppl=*8.23985* ppl1= 8.44325
# on training data: 0 zeroprobs, logprob= -1.6264e+06 ppl= 7.46947 ppl1= 7.63431

## This shows make_phone_lm.py without any pruning (make --num-extra-ngrams very large).
# utils/lang/make_phone_lm.py --print-as-arpa=true --num-extra-ngrams=1000000 --no-backoff-ngram-order=1 --verbose=1 < lm_test/train.txt > lm_test/arpa
# ngram -order 4 -unk -lm lm_test/arpa -ppl lm_test/test.txt
# file lm_test/test.txt: 1000 sentences, 86489 words, 3 OOVs
# 0 zeroprobs, logprob= -74976 ppl=*7.19459* ppl1= 7.36064
# on training data: 0 zeroprobs, logprob= -1.44198e+06 ppl= 5.94659 ppl1= 6.06279

## This is SRILM without pruning (c.f. the 7.19 above, it's slightly better).
# ngram-count -text lm_test/train.txt -order 4 -kndiscount2 -kndiscount3 -kndiscount4 -interpolate -lm lm_test/arpa_srilm
# ngram -order 4 -unk -lm lm_test/arpa_srilm -ppl lm_test/test.txt
# file lm_test/test.txt: 1000 sentences, 86489 words, 3 OOVs
# 0 zeroprobs, logprob= -74742.2 ppl= *7.15044* ppl1= 7.31494


## This is SRILM with a pruning beam tuned to get 20k n-grams above unigram
##  (c.f. the 8.23 above, it's a lot worse).
# ngram-count -text lm_test/train.txt -order 4 -kndiscount2 -kndiscount3 -kndiscount4 -interpolate -prune 1.65e-05 -lm lm_test/arpa_srilm.pr1.65e-5
# the model has 20249 n-grams above unigram [c.f. our 20k]
# ngram -order 4 -unk -lm lm_test/arpa_srilm.pr1.65e-5 -ppl lm_test/test.txt
# file lm_test/test.txt: 1000 sentences, 86489 words, 3 OOVs
# 0 zeroprobs, logprob= -86803.7 ppl=*9.82202* ppl1= 10.0849


## This is pocolm..
## Note: we have to hold out some of the training data as dev to
## estimate the hyperparameters, but we'll fold it back in before
## making the final LM. [--fold-dev-into=train]
# mkdir -p lm_test/data/text
# head -n 1000 lm_test/train.txt > lm_test/data/text/dev.txt
# tail -n +1001 lm_test/train.txt > lm_test/data/text/train.txt
## give it a 'large' num-words so it picks them all.
# export PATH=$PATH:../../../tools/pocolm/scripts
# train_lm.py --num-word=100000 --fold-dev-into=train lm_test/data/text 4 lm_test/data/lm_unpruned
# get_data_prob.py lm_test/test.txt lm_test/data/lm_unpruned/100000_4.pocolm
## compute-probs: average log-prob per word was -1.95956 (perplexity = *7.0962*) over 87489 words.
## Note: we can compare this perplexity with 7.15 with SRILM and 7.19 with make_phone_lm.py.

#   pruned_lm_dir=${lm_dir}/${num_word}_${order}_prune${threshold}.pocolm
# prune_lm_dir.py --target-num-ngrams=20100 lm_test/data/lm_unpruned/100000_4.pocolm lm_test/data/lm_unpruned/100000_4_pr20k.pocolm
# get_data_prob.py lm_test/test.txt lm_test/data/lm_unpruned/100000_4_pr20k.pocolm
## compute-probs: average log-prob per word was -2.0409 (perplexity = 7.69757) over 87489 words.
## note: the 7.69 can be compared with 9.82 from SRILM and 8.23 from make_phone_lm.py.
## format_arpa_lm.py lm_test/data/lm_unpruned/100000_4_pr20k.pocolm | head
## .. it has 20488 n-grams above unigram.  More than 20k but not enough to explain the difference
## .. in perplexity.

## OK... if I reran after modifying prune_lm_dir.py to comment out the line
## 'steps += 'EM EM'.split()' which adds the two EM stages per step, and got the
## perplexity again, I got the following:
## compute-probs: average log-prob per word was -2.09722 (perplexity = 8.14353) over 87489 words.
## .. so it turns out the E-M is actually important.


================================================
FILE: egs/utils/lang/make_position_dependent_subword_lexicon.py
================================================
#!/usr/bin/env python3

# 2019 Dongji Gao

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

from make_lexicon_fst import read_lexiconp
import argparse
import math

def get_args():
    """Parse the command line and return the resulting argparse namespace.

    The namespace has 'separator' (default "@@") and the positional 'lexiconp'.
    """
    parser = argparse.ArgumentParser(
        description="""This script creates a
        position-dependent subword lexicon from a position-independent subword lexicon
        by adding suffixes ("_B", "_I", "_E", "_S") to the related phones.
        It assumes that the input lexicon does not contain disambiguation symbols.""",
    )
    parser.add_argument(
        "--separator",
        type=str,
        default="@@",
        help="""Separator
        indicates the position of a subword in a word. 
        Subword ends with separator can only appear at the beginning or middle of a word. 
        Subword without separator can only appear at the end of a word or is a word itself.
        E.g. "international -> inter@@ nation@@ al";
             "nation        -> nation"
        The separator should match the separator used in the input lexicon.""",
    )
    parser.add_argument(
        "lexiconp",
        type=str,
        help="""Filename of subword position-independent 
        lexicon with pronunciation probabilities, with lines of the form 'subword prob p1 p2 ...'""",
    )
    return parser.parse_args()

def is_end(subword, separator):
    """Tell whether the subword may appear at the end of a word.

    A subword that ends with the separator cannot end a word, so False is
    returned for it; every other subword gives True."""
    if subword.endswith(separator):
        return False
    return True

def write_position_dependent_lexicon(lexiconp, separator):
    """Print the position-dependent entries for every subword of 'lexiconp'.

    'lexiconp' yields (subword, pron-prob, phones) tuples and 'separator' is
    the marker that a non-final subword ends with.  Each input entry produces
    two output lines, which differ only in the position suffixes appended to
    the phones.

    A subword without the separator is printed as:
      an end subword: last phone "_E", the others "_I":
        nation   1.0 n_I ey_I sh_I ih_I n_E
        n        1.0 n_E
      a singleton subword (a whole word): first phone "_B", last phone "_E",
      the others "_I"; a single phone gets "_S":
        nation   1.0 n_B ey_I sh_I ih_I n_E
        n        1.0 n_S

    A subword with the separator is printed as:
      a middle subword: every phone "_I":
        nation@@ 1.0 n_I ey_I sh_I ih_I n_I
      a beginning subword: first phone "_B", the others "_I":
        nation@@ 1.0 n_B ey_I sh_I ih_I n_I
        n@@      1.0 n_B

    Since most suffixes are "_I", the suffix list starts out as all "_I" and
    only its first and last items are overwritten as each case requires.
    """
    def emit(subword, pron_prob, phone_seq, suffixes):
        # Print one lexicon line with each phone joined to its suffix.
        tagged = [p + s for (p, s) in zip(phone_seq, suffixes)]
        print("{} {} {}".format(subword, pron_prob, ' '.join(tagged)))

    for (word, prob, phones) in lexiconp:
        num_phones = len(phones)
        suffixes = ["_I"] * num_phones

        if not is_end(word, separator):
            # middle subword: all "_I".
            emit(word, prob, phones, suffixes)
            # beginning subword: only the first suffix changes.
            suffixes[0] = "_B"
            emit(word, prob, phones, suffixes)
            continue

        # end subword: the last suffix becomes "_E".
        suffixes[-1] = "_E"
        emit(word, prob, phones, suffixes)

        # singleton subword: a lone phone is "_S"; otherwise the first phone
        # becomes "_B" and the "_E" set above stays on the last phone.
        suffixes[0] = "_S" if num_phones == 1 else "_B"
        emit(word, prob, phones, suffixes)

def main():
    """Read the lexicon named on the command line and print its position-dependent form."""
    cli = get_args()
    entries = read_lexiconp(cli.lexiconp)
    write_position_dependent_lexicon(entries, cli.separator)


if __name__ == "__main__":
    main()


================================================
FILE: egs/utils/lang/make_subword_lexicon_fst.py
================================================
#!/usr/bin/env python3

# 2019 Dongji Gao
# Apache 2.0.

from make_lexicon_fst import read_lexiconp
import argparse
import math
import sys

# see get_args() below for usage message
def get_args():
    """Parse the command line and return the resulting argparse namespace.

    The namespace has 'sil_phone', 'sil_prob' (default 0.0), 'sil_disambig',
    'position_dependent' (a flag), 'separator' (default "@@") and the
    positional 'lexiconp'.
    """
    parser = argparse.ArgumentParser(
        description="""This script creates the
        text form of a subword lexicon FST to be compiled by fstcompile using
        the appropriate symbol tables (phones.txt and words.txt). It will mostly
        be invoked indirectly via utils/prepare_lang_subword.sh. The output
        goes to the stdout. This script is the subword version of make_lexicon_fst.py.
        It only allows optional silence to appear after end-subword or singleton-subword,
        (i.e., subwords without separator). In this version we do not support
        pronunciation probability. (i.e., pron-prob = 1.0)""",
    )

    parser.add_argument(
        '--sil-phone',
        type=str,
        help="""Text form of
        optional-silence phone, e.g. 'SIL'. See also the --sil-prob option.""",
    )
    parser.add_argument(
        '--sil-prob',
        type=float,
        default=0.0,
        help="""Probability
        of silence between words (including the beginning and end of word sequence).
        Must be in range [0.0, 1.0). This refer to the optional silence inserted by
        the lexicon; see the --sil-phone option.""",
    )
    parser.add_argument(
        '--sil-disambig',
        type=str,
        help="""Disambiguation symbol
        to disambiguate silence, e.g. #5. Will only be supplied if you are creating 
        the version of L.fst with disambiguation symbols, intended for use with cyclic 
        G.fst. This symbol was introduced to fix a rather obscure source of nondeterminism 
        of CLG.fst, that has to do with reordering of disambiguation symbols and phone symbols.""",
    )
    parser.add_argument(
        '--position-dependent',
        action="store_true",
        help="""Whether 
        the input lexicon is position-dependent.""",
    )
    parser.add_argument(
        "--separator",
        type=str,
        default="@@",
        help="""Separator
        indicates the position of a subword in a word.
        Subword followed by separator can only appear at the beginning or middle of a word.
        Subword without separator can only appear at the end of a word or is a word itself.
        E.g. "international -> inter@@ nation@@ al";
             "nation        -> nation"
    The separator should match the separator used in the input lexicon.""",
    )
    parser.add_argument(
        'lexiconp',
        type=str,
        help="""Filename of lexicon with
        pronunciation probabilities (normally lexiconp.txt), with lines of the
        form 'subword prob p1 p2...', e.g. 'a, 1.0 ay'""",
    )
    return parser.parse_args()

def contain_disambig_symbol(phones):
    """Return True if the phone sequence contains a disambiguation symbol.

    A disambiguation symbol has the form #1, #2... and sits at the end of
    'phones'; there is at most one per phone sequence, so only the last phone
    is examined.  An empty sequence contains no disambiguation symbol, so
    False is returned for it."""
    return bool(phones) and phones[-1].startswith("#")

def print_arc(src, dest, phone, word, cost):
    """Print one arc as a tab-separated line: src, dest, phone, word, cost."""
    fields = (src, dest, phone, word, cost)
    print('{}\t{}\t{}\t{}\t{}'.format(*fields))

def is_end(word, separator):
    """Tell whether the subword may appear at the end of a word.

    A subword that ends with the separator cannot end a word, so False is
    returned for it; every other subword gives True."""
    if word.endswith(separator):
        return False
    return True

def get_suffix(phone):
    """Return the position suffix of a phone, i.e. its last two characters
    ('_B', '_I'...).  A phone shorter than three characters cannot carry a
    suffix; in that case an error is written to stderr and the program exits
    with status 1."""
    if len(phone) >= 3:
        return phone[-2:]
    print("{}: invalid phone {} (please check if the phone is position-dependent)".format(
          sys.argv[0], phone), file=sys.stderr)
    sys.exit(1)

def write_fst_no_silence(lexicon, position_dependent, separator):
    """Writes the text format of L.fst to the standard output.  This version is for
    when --sil-prob=0.0, meaning there is no optional silence allowed.
    loop_state here is the start and final state of the fst. It goes to word_start_state
    via epsilon transition.

    In position-independent case, there is no difference between beginning word and
    middle word. So all subwords with separator would leave from and enter word_start_state.
    All subwords without separator would leave from word_start_state and enter loop_state,
    so loop_state (the word boundary) is only reached after a word-end subword.

    In position-dependent case, there are 4 types of position-dependent subword:
    1) Beginning subword. The first phone suffix should be "_B" and other suffixes should be "_I"s:
        nation@@ 1.0 n_B ey_I sh_I ih_I n_I
        n@@      1.0 n_B
    2) Middle subword. All phone suffixes should be "_I"s:
        nation@@ 1.0 n_I ey_I sh_I ih_I n_I
    3) End subword. The last phone suffix should be "_E" and other suffixes should be "_I"s:
        nation   1.0 n_I ey_I sh_I ih_I n_E
        n        1.0 n_E
    4) Singleton subword (i.e., the subword is a word itself).
       The first phone suffix should be "_B" and the last suffix should be "_E".
       All other suffixes should be "_I"s. If there is only one phone, its suffix should be "_S":
        nation   1.0 n_B ey_I sh_I ih_I n_E
        n        1.0 n_S

    So we need an extra word_internal_state. The beginning subword
    would leave from word_start_state and enter word_internal_state and the middle subword
    would leave from and enter word_internal_state. The rest is the same.

      'lexicon' is a list of 3-tuples (subword, pron-prob, prons) as returned by
         read_lexiconp().  The pron-prob is ignored (every pronunciation gets cost 0.0).
      'position_dependent', which is True if the lexicon is position-dependent.
      'separator' is a symbol which indicates the position of a subword in word.

    If, in the position-dependent case, the first/last phone suffixes of an entry
    do not determine both a source and a destination state, an error is printed
    to stderr and the program exits with status 1.
    """
    # regular setting
    loop_state = 0
    word_start_state = 1
    next_state = 2

    print_arc(loop_state, word_start_state, "<eps>", "<eps>", 0.0)

    # optional setting for word_internal_state
    if position_dependent:
        word_internal_state = next_state
        next_state += 1

    for (word, pron_prob, phones) in lexicon:
        pron_cost = 0.0                # do not support pron_prob
        phones_len = len(phones)

        # Reset for every entry: otherwise an entry with unexpected suffixes
        # would silently reuse the states chosen for the previous entry.
        current_state = None
        end_state = None

        # set start and end state for different cases
        if position_dependent:
            first_phone_suffix = get_suffix(phones[0])
            # the suffix that matters is that of the last real phone, so skip
            # a trailing disambiguation symbol if there is one
            last_phone = phones[-2] if contain_disambig_symbol(phones) else phones[-1]
            last_phone_suffix = get_suffix(last_phone)

            # singleton word
            if first_phone_suffix == "_S":
                current_state = word_start_state
                end_state = loop_state
            # set the current_state
            elif first_phone_suffix == "_B":
                current_state = word_start_state
            elif first_phone_suffix == "_I" or first_phone_suffix == "_E":
                current_state = word_internal_state
            # then set the end_state
            if last_phone_suffix == "_B" or last_phone_suffix == "_I":
                end_state = word_internal_state
            elif last_phone_suffix == "_E":
                end_state = loop_state

            if current_state is None or end_state is None:
                print("{}: invalid position-dependent pronunciation for {}: {} "
                      "(expected phone suffixes _B, _I, _E or _S)".format(
                      sys.argv[0], word, " ".join(phones)), file=sys.stderr)
                sys.exit(1)
        else:
            current_state = word_start_state
            end_state = loop_state if is_end(word, separator) else word_start_state

        # print arcs (except the last one) for the subword; only the first arc
        # carries the subword as output label (and the pronunciation cost)
        for i in range(phones_len - 1):
            olabel = word if i == 0 else "<eps>"
            cost = pron_cost if i == 0 else 0.0
            print_arc(current_state, next_state, phones[i], olabel, cost)
            current_state = next_state
            next_state += 1

        # print the last arc; if it is also the first arc (zero or one phone)
        # it carries the subword and the pronunciation cost itself
        i = phones_len - 1
        phone = phones[i] if i >= 0 else "<eps>"
        olabel = word if i <= 0 else "<eps>"
        cost = pron_cost if i <= 0 else 0.0
        print_arc(current_state, end_state, phone, olabel, cost)

    # set the final state
    print("{state}\t{final_cost}".format(state=loop_state, final_cost=0.0))

def write_fst_with_silence(lexicon, sil_phone, sil_prob, sil_disambig, position_dependent, separator):
    """Writes the text format of L.fst to the standard output.  This version is for
    when --sil-prob != 0.0, meaning optional silence is allowed.
    start_state goes to loop_state (no silence) or to sil_state (silence) via
    epsilon transitions; loop_state is the final state and goes to word_start_state
    via epsilon transition.

    In position-independent case, there is no difference between beginning word and
    middle word. So all subwords with separator would leave from and enter word_start_state.
    All subwords without separator would leave from word_start_state and enter either
    loop_state (no silence follows) or sil_state (silence follows).
    This guarantees that optional silence can only follow a word-end subword and such subwords
    must appear at the end of the whole subword sequence.

    In position-dependent case, there are 4 types of position-dependent subword:
    1) Beginning subword. The first phone suffix should be "_B" and other suffixes should be "_I"s:
        nation@@ 1.0 n_B ey_I sh_I ih_I n_I
        n@@      1.0 n_B
    2) Middle subword. All phone suffixes should be "_I"s:
        nation@@ 1.0 n_I ey_I sh_I ih_I n_I
    3) End subword. The last phone suffix should be "_E" and other suffixes should be "_I"s:
        nation   1.0 n_I ey_I sh_I ih_I n_E
        n        1.0 n_E
    4) Singleton subword (i.e., the subword is a word itself).
       The first phone suffix should be "_B" and the last suffix should be "_E".
       All other suffixes should be "_I"s. If there is only one phone, its suffix should be "_S":
        nation   1.0 n_B ey_I sh_I ih_I n_E
        n        1.0 n_S

    So we need an extra word_internal_state. The beginning subword
    would leave from word_start_state and enter word_internal_state and the middle subword
    would leave from and enter word_internal_state. The rest is the same.

      'lexicon' is a list of 3-tuples (subword, pron-prob, prons)
         as returned by read_lexiconp().  The pron-prob is ignored (every
         pronunciation gets cost 0.0).
      'sil_prob', which is expected to be strictly between 0.0 and 1.0, is the
         probability of silence
      'sil_phone' is the silence phone, e.g. "SIL".
      'sil_disambig' is either None, or the silence disambiguation symbol, e.g. "#5".
      'position_dependent', which is True if the lexicon is position-dependent.
      'separator' is the symbol we use to indicate the position of a subword in word.

    If, in the position-dependent case, the first/last phone suffixes of an entry
    do not determine both a source state and the destination state(s), an error is
    printed to stderr and the program exits with status 1.
    """

    sil_cost = -math.log(sil_prob)
    no_sil_cost = -math.log(1 - sil_prob)

    # regular setting
    start_state = 0
    loop_state = 1         # also the final state
    sil_state = 2          # words terminate here when followed by silence; this state
                           # has a silence transition to loop_state
    word_start_state = 3   # subword leave from here
    next_state = 4         # the next un-allocated state, will be incremented as we go

    print_arc(start_state, loop_state, "<eps>", "<eps>", no_sil_cost)
    print_arc(start_state, sil_state, "<eps>", "<eps>", sil_cost)
    print_arc(loop_state, word_start_state, "<eps>", "<eps>", 0.0)

    # optional setting for disambig_state
    if sil_disambig is None:
        print_arc(sil_state, loop_state, sil_phone, "<eps>", 0.0)
    else:
        # emit the silence phone first, then the disambiguation symbol
        disambig_state = next_state
        next_state += 1
        print_arc(sil_state, disambig_state, sil_phone, "<eps>", 0.0)
        print_arc(disambig_state, loop_state, sil_disambig, "<eps>", 0.0)

    # optional setting for word_internal_state
    if position_dependent:
        word_internal_state = next_state
        next_state += 1

    for (word, pron_prob, phones) in lexicon:
        pron_cost = 0.0           # do not support pron_prob
        phones_len = len(phones)

        # Reset for every entry: otherwise an entry with unexpected suffixes
        # would silently reuse the states chosen for the previous entry.
        current_state = None
        end_state_list = None
        end_cost_list = None

        # set start and end state for different cases
        if position_dependent:
            first_phone_suffix = get_suffix(phones[0])
            # the suffix that matters is that of the last real phone, so skip
            # a trailing disambiguation symbol if there is one
            last_phone = phones[-2] if contain_disambig_symbol(phones) else phones[-1]
            last_phone_suffix = get_suffix(last_phone)

            # singleton subword
            if first_phone_suffix == "_S":
                current_state = word_start_state
                end_state_list = [loop_state, sil_state]
                end_cost_list = [no_sil_cost, sil_cost]
            # first set the current_state
            elif first_phone_suffix == "_B":
                current_state = word_start_state
            elif first_phone_suffix == "_I" or first_phone_suffix == "_E":
                current_state = word_internal_state
            # then set the end_state (end_state_list)
            if last_phone_suffix == "_B" or last_phone_suffix == "_I":
                end_state_list = [word_internal_state]
                end_cost_list = [0.0]
            elif last_phone_suffix == "_E":
                end_state_list = [loop_state, sil_state]
                end_cost_list = [no_sil_cost, sil_cost]

            if current_state is None or end_state_list is None:
                print("{}: invalid position-dependent pronunciation for {}: {} "
                      "(expected phone suffixes _B, _I, _E or _S)".format(
                      sys.argv[0], word, " ".join(phones)), file=sys.stderr)
                sys.exit(1)
        else:
            current_state = word_start_state
            if is_end(word, separator):
                end_state_list = [loop_state, sil_state]
                end_cost_list = [no_sil_cost, sil_cost]
            else:
                end_state_list = [word_start_state]
                end_cost_list = [0.0]

        # print arcs (except the last one) for the subword; only the first arc
        # carries the subword as output label (and the pronunciation cost)
        for i in range(phones_len - 1):
            olabel = word if i == 0 else "<eps>"
            cost = pron_cost if i == 0 else 0.0
            print_arc(current_state, next_state, phones[i], olabel, cost)
            current_state = next_state
            next_state += 1

        # print the last arc, once per possible destination; the silence /
        # no-silence cost is added on this arc
        i = phones_len - 1
        phone = phones[i] if i >= 0 else "<eps>"
        olabel = word if i <= 0 else "<eps>"
        cost = pron_cost if i <= 0 else 0.0
        for (end_state, end_cost) in zip(end_state_list, end_cost_list):
            print_arc(current_state, end_state, phone, olabel, cost + end_cost)

    # set the final state
    print("{state}\t{final_cost}".format(state=loop_state, final_cost=0.0))

def main():
    """Parse the command line, read the lexicon and print L.fst in text form.

    --sil-prob must lie in [0.0, 1.0); otherwise an error is written to stderr
    and the program exits with status 1.  A value of exactly 0.0 selects the
    writer without optional silence, any other valid value the writer with it.
    """
    args = get_args()
    sil_prob = args.sil_prob
    out_of_range = sil_prob < 0.0 or sil_prob >= 1.0
    if out_of_range:
        print("{}: invalid value specified --sil-prob={}".format(
              sys.argv[0], sil_prob), file=sys.stderr)
        sys.exit(1)

    lexicon = read_lexiconp(args.lexiconp)
    if sil_prob != 0.0:
        write_fst_with_silence(lexicon, args.sil_phone, sil_prob,
            args.sil_disambig, args.position_dependent, args.separator)
    else:
        write_fst_no_silence(lexicon, args.position_dependent, args.separator)

if __name__ == "__main__":
    main()


================================================
FILE: egs/utils/lang/make_unk_lm.sh
================================================
#!/usr/bin/env bash

# Copyright      2016 Johns Hopkins University (Author: Daniel Povey);

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# Begin configuration section.
# Each variable below is a default that corresponds to one of the --options
# listed in the usage message.
# cmd: wrapper used to launch the LM training / conversion commands; its first
# argument is the log file.
cmd=run.pl
# ngram_order: order of the phone-level LM; validated to be in [2,7].
ngram_order=4
# num_extra_ngrams: n-gram budget, in addition to the unigrams, used for pruning.
num_extra_ngrams=10000
# position_dependent_phones: if true, the output FST uses _B/_I/_E/_S phones.
position_dependent_phones=true
# use_pocolm: if true estimate the LM with pocolm, else with make_phone_lm.py.
use_pocolm=true
# min_word_length: minimum number of phones in a word; validated to be 1 or 2.
min_word_length=2
# stage: steps guarded by '[ $stage -le N ]' are skipped when stage is above N.
stage=0
# phone_disambig_symbol: disambiguation symbol appended to the symbol table and
# passed to the ARPA-to-FST conversion.
phone_disambig_symbol="#1"

# end configuration sections

# Source path.sh only if it exists in the current directory.
[ -f path.sh ] && . ./path.sh
# NOTE(review): parse_options.sh is expected to override the variables above
# from matching --option-name command-line arguments -- confirm against that script.
. utils/parse_options.sh

# Exactly two positional arguments are required; otherwise print the usage
# message to stdout and exit with status 1.
if [ $# -ne 2 ]; then
  cat <<EOF
Usage: $0 [options] <input-dict-dir> <work-dir>
e.g.: $0 data/local/dict exp/make_unk

This script creates, as an FST, a phone language model suitable for modeling
the unknown word.  It first trains a language model on the phone sequences of the
provided dictionary entries (which should be without any word-position-dependency
tags); it then creates an FST from it, while, for compactness after context-dependency
limiting the transitions to seen bigram pairs of phones.  Then, by composing with
a separate FST it converts it into word-position-dependent phones if applicable,
while imposing a minimum-number-of-phones constraint.

  <input-dict-dir>:  A dictionary directory (as validated by validate_dict_dir.pl);
             the dictionary from this location (lexicon.txt, lexiconp.txt, or
             lexiconp_silprob.txt) will be used to train the language model on
             phones.  The files silence_phones.txt and nonsilence_phones.txt will
             be used to construct a symbol table used internally, and to
             exclude lexicon entries containing silences.
 <work-dir>:    A place to put logs and the output of this script.  The output of
                this script will be written to <work-dir>/unk_fst.txt (we write in
                text form so that it's independent of the phones.txt).
Options:
    --ngram-order <n>                 # (default: 4)  N-gram order of the phone-level language
                                      # model.  Must be in range [2, 7]
    --num-extra-ngrams <n>            # (default: 10000).  The maximum the number of n-grams
                                      # that may be present in the language model in addition
                                      # to the unigrams.  The LM will be pruned to achieve this.
    --use-pocolm <true|false>         # (default: true).  If true, use pocolm to estimate the
                                      # language model; you will be prompted to install it if
                                      # needed.  (If false, we use the script make_phone_lm.py,
                                      # which is simpler but the perplexity is not as good).
    --position-dependent-phones <true|false>  # (default: true).  If true, assume position-dependent
                                      # phones (although in any case the lexicon should use position-
                                      # independent phones).  If position-dependent phones are used,
                                      # after creating the LM we compose with an FST that converts
                                      # into position-dependent phones while enforcing the natural
                                      # constraints that they form a single word.
    --min-word-length <1|2>           # (default: 2).  May only be 1 or 2.  The minimum word length
                                      # (in number of phones) that is allowed
    --phone-disambig-symbol <symbol>  # default: '#1'.  This is the symbol that will be put on the
                                      # input side of backoff arcs.  You won't normally have to change
                                      # this because prepare_lang.sh expects '#1' there.
EOF
  exit 1;
fi


# Positional arguments: input dictionary directory and working/output directory.
dict_dir=$1
dir=$2

# From here on, stop at the first command that fails.
set -e

mkdir -p $dir/log

# Stage 0: validate the dictionary directory; the validator's output goes to a
# log file, which is shown only if validation fails.
if [ $stage -le 0 ]; then
  if ! utils/validate_dict_dir.pl $dict_dir >&$dir/log/validate_dict_dir.log; then
    cat $dir/log/validate_dict_dir.log
    echo "$0: failed to validate input dict-dir $dict_dir"
    exit 1
  fi
fi

# Range checks on the numeric options.
if ! [ $ngram_order -ge 2 ] || ! [ $ngram_order -le 7 ]; then
  echo "$0: invalid --ngram-order $ngram_order (must be in [2,7])"
  exit 1
fi

if ! [ $min_word_length -ge 1 ] || ! [ $min_word_length -le 2 ]; then
  echo "$0: invalid --min-word-length $min_word_length (must be in [1,2])"
  exit 1
fi

# The next command creates a symbol table that will cover all the symbols we might
# possibly need in this script.  The word-position-dependent suffixes (_B and so on)
# won't be needed if --position-dependent-phones is false, but it won't hurt.
# Every phone is listed bare and with each of the four suffixes; the
# disambiguation symbol is appended last, and symbols are numbered from 1
# (0 is reserved for <eps>).
cat $dict_dir/silence_phones.txt $dict_dir/nonsilence_phones.txt | \
  awk '{for(n=1;n<=NF;n++) print $n; }' | \
  awk '{print $1; print $1 "_B"; print $1 "_I"; print $1 "_S"; print $1 "_E";}' | \
      cat - <(echo "$phone_disambig_symbol") | \
  awk 'BEGIN{print "<eps> 0";} {print $1, NR;}' > $dir/phones.txt

# The disambiguation symbol is the last line of phones.txt; take its integer id.
phone_disambig_int=$(tail -n 1 <$dir/phones.txt | awk '{print $2}')
# '-eq' only succeeds when both operands are integers, so this rejects an empty
# or non-numeric value (a plain string comparison of the variable with itself
# could never fail).
if ! [ "$phone_disambig_int" -eq "$phone_disambig_int" ] 2>/dev/null; then
  echo "$0: problem working out integer form of phone-disambig symbol."
  exit 1;
fi

# Pick the source lexicon, in order of preference, and record the index of the
# first field that holds a phone (fields before it are the word and, depending
# on the file, probability columns).
if [ -e $dict_dir/lexicon.txt ]; then
  src_dict=$dict_dir/lexicon.txt
  first_phone_field=2
elif [ -e $dict_dir/lexiconp.txt ]; then
  src_dict=$dict_dir/lexiconp.txt
  first_phone_field=3
else
  [ ! -e $dict_dir/lexiconp_silprob.txt ] && \
    echo "$0: expected file $dict_dir/lexiconp_silprob.txt to exist" && exit 1
  # The file that was just checked for existence is the one to read.
  src_dict=$dict_dir/lexiconp_silprob.txt
  first_phone_field=6
fi

# Flatten silence_phones.txt to one phone per line; the awk program below reads
# this file to decide which prons to exclude.
cat $dict_dir/silence_phones.txt | awk '{for(n=1;n<=NF;n++) print $n; }' > $dir/silence_phones.txt

# prepare the cleaned up version of the dictionary (to train our phone LM), with
# the first field (the word) removed, with prons that have silence phones in
# them removed, and with empty prons (which should not be allowed anyway, but
# just in case..) removed.
# Excluded lines are reported on stderr.
awk -v dir=$dir -v ff=$first_phone_field \
   'BEGIN{ while ((getline <(dir"/silence_phones.txt")) > 0) sil[$1]=1;  }
         { ok=1; for (n=ff; n<=NF; n++) { if ($n in sil) ok=0; }
           if (ok && NF>=ff) { for (n=ff;n<=NF;n++) printf("%s ",$n); print ""; } else {
            print("make_unk_lm.sh: info: not including dict line: ", $0) >"/dev/stderr" }}' <$src_dict >$dir/training.txt
# The set of distinct phones that occur in the training text.
cat $dir/training.txt | awk '{for(n=1;n<=NF;n++) seen[$n]=1; } END{for (k in seen) print k;}' > $dir/all_nonsil_phones

num_dict_lines=$(wc -l <$src_dict)
num_train_lines=$(wc -l < $dir/training.txt)
if ! [ $num_train_lines -gt 0 ]; then
  echo "$0: something went wrong getting text to train phone-level LM."
  exit 1
fi
echo "$0: training on $num_train_lines words out of $num_dict_lines in the "
echo "     ... original dictionary (excluding words with silence phones)."


# With fewer than 2000 training lines, fall back from pocolm to make_phone_lm.py.
if [ $num_train_lines -lt 2000 ] && $use_pocolm; then
  echo "$0: the number of lines of training data is very small [$num_train_lines]."
  echo "    Setting --use-pocolm to false since it probably won't work well"
  echo "    on so little data (e.g. hard to estimate the discounting parameters)"
  echo "    Using make_phone_lm.py instead."
  use_pocolm=false
fi

# Estimate the phone-level LM and write it, in text FST form, to
# $dir/unk_fst_orig.txt -- either with pocolm (stages 1-3) or with
# make_phone_lm.py (stage 1 only).
if $use_pocolm; then
  if [ ! -e $KALDI_ROOT/tools/pocolm ]; then
    echo "$0: $KALDI_ROOT/tools/pocolm does not exist:"
    echo " ... please do:  cd $KALDI_ROOT/tools; extras/install_pocolm.sh"
    echo " ... and then rerun this script."
    exit 1
  fi

  # Make the pocolm scripts (train_lm.py etc.) callable by name.
  PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH

  if [ $stage -le 1 ]; then
    echo "$0: training $ngram_order-gram LM with pocolm"

    mkdir -p $dir/pocolm/text
    heldout_ratio=5  # hold out one fifth of the data as validation to estimate
    # metaparameters; we'll fold it back in before estimating the
    # final LM.
    # Every heldout_ratio-th line goes to dev.txt, all other lines to train.txt.
    cat $dir/training.txt | awk -v h=$heldout_ratio '{if(NR%h == 0) print; }' > $dir/pocolm/text/dev.txt
    cat $dir/training.txt | awk -v h=$heldout_ratio '{if(NR%h != 0) print; }' > $dir/pocolm/text/train.txt


    # the following options are because we expect the amount of data to be small,
    # all the data subsampling isn't really needed and will increase the chance of
    # something going wrong.

    small_data_opts="--num-splits 4 --warm-start-ratio 1"
    $cmd $dir/log/train_lm.log \
         train_lm.py --wordlist $dir/all_nonsil_phones $small_data_opts \
         --fold-dev-into=train $dir/pocolm/text $ngram_order $dir/pocolm
  fi

  if [ $stage -le 2 ]; then
    echo "$0: pruning LM with pocolm"
    # Target size: the extra n-grams allowed plus one per distinct phone.
    num_words=$(wc -l <$dir/all_nonsil_phones)
    num_ngrams=$[$num_extra_ngrams+$num_words]


    # NOTE(review): the output directory is spelled 'poclm' (not 'pocolm') both
    # here and in the format_arpa_lm.py call, so the two agree; the spelling
    # looks unintentional -- confirm before changing it.
    $cmd $dir/log/prune_lm_dir.log \
         prune_lm_dir.py --target-num-ngrams=$num_ngrams \
         $dir/pocolm/all_nonsil_phones_${ngram_order}.pocolm $dir/poclm/lm_pruned

    # format as arpa.
    format_arpa_lm.py $dir/poclm/lm_pruned > $dir/pocolm.arpa
  fi

  if [ $stage -le 3 ]; then
    echo "$0: applying bigram constraints and converting from ARPA to FST"
    # now get bigram constraints: we want to get an FST that only allows phone
    # bigrams that we've seen (this may enforce certain linguistic constraints,
    # and also stops the graph from blowing up too much once we introduce
    # phonetic context.
    # The NF > 0 is just a double-check that there are no empty prons, which
    # would be bad as it would allow an empty pronunciation of the unknown word.
    cat $dir/training.txt | awk '{ if (NF > 0) printf("<s> %s </s>\n", $0); }' | \
      awk '{for(n=1;n<NF;n++) { m=n+1; seen[ $n " " $m ] = 1; }} END{for(k in seen) print k;}' \
          > $dir/allowed_bigrams

    # The quoted '>' is passed to $cmd as an argument rather than being
    # interpreted by this shell.
    $cmd $dir/log/arpa2fst.log \
         utils/lang/internal/arpa2fst_constrained.py --verbose=3 \
           --disambig-symbol="$phone_disambig_symbol" \
         $dir/pocolm.arpa $dir/allowed_bigrams '>' $dir/unk_fst_orig.txt
  fi
else

  if [ $stage -le 1 ]; then
    echo "$0: using make_phone_lm.py to create $ngram_order-gram language-model FST"
    # The training text is converted to integers, make_phone_lm.py is run on
    # it, and fields 3-4 of its output are mapped back to phone symbols.  The
    # quoted '|' and '>' are passed to $cmd as arguments.
    $cmd $dir/log/make_phone_lm.log \
         utils/sym2int.pl $dir/phones.txt $dir/training.txt '|' \
         utils/lang/make_phone_lm.py --verbose=2 \
         --phone-disambig-symbol=$phone_disambig_int \
         --num-extra-ngrams=$num_extra_ngrams \
         --ngram-order=$ngram_order '|' \
         utils/int2sym.pl -f 3-4 $dir/phones.txt '>'$dir/unk_fst_orig.txt
  fi
fi


# Symbol-table options shared by the fstcompile / fstprint calls; phones.txt is
# used for both the input and the output side.
sym_opts="--isymbols=$dir/phones.txt --osymbols=$dir/phones.txt"

# Write $dir/constraint_fst.txt, the FST that enforces the minimum word length
# and, if requested, converts phones to their position-dependent forms.  In the
# one case where no constraint is needed, the script finishes here.
if ! $position_dependent_phones; then
  if  [ $min_word_length == 1 ]; then
    echo "$0: no word-length constraint or word-position-dependency, so exiting."
    # There is no need to compose unk_fst_orig.txt with a separate FST: because of
    # the bigram constraints and because we ensure that there were no empty prons
    # in the dictionary (no empty lines in training.txt), the FST wouldn't allow
    # length-zero words anyway.
    cp $dir/unk_fst_orig.txt $dir/unk_fst.txt
    fstcompile $sym_opts <$dir/unk_fst.txt >$dir/unk.fst
    exit 0;
  else
    echo "$0: creating constraint_fst.txt for min-word-length=2 constraint."
    # min-word-length is 2; we need to apply that constraint.  A note on the FST
    # states: 0 is start state, 1 is "seen one phone", 2 is "seen two or more
    # phones".
    # We don't need to take into account the disambig symbol because we compose on
    # the right with this FST, and it doesn't appear on the output side.
    # Only state 2 is final.
    cat $dir/all_nonsil_phones | \
      awk '{ph[$1]=1} END{ for (p in ph) { print 0,1,p,p; print 1,2,p,p; print 2,2,p,p; }
                 print 2,0.0; }' > $dir/constraint_fst.txt
  fi
else
  echo "$0: creating constraint_fst.txt for min-word-length=$min_word_length constraint, plus word-position-dependency conversion."

  # Add constraints and convert phones without tags into phones with the _B, _E, _I and _S
  # tags (begin, end, internal, singleton).

  # States:
  # 0 is start state,
  # 1 is "seen initial phone (and maybe internal phones) of multi-phone word",
  # 2 is "seen final phone of multi-phone word".
  # 3 is "seen phone of single-phone word"; note, if --min-word-length is 2,
  #      then state 3 will not exist.
  # States 0, 1 and 2 each get a self-loop on the disambiguation symbol;
  # state 2 is final, and so is state 3 when it exists.

  cat $dir/all_nonsil_phones | \
    awk -v mwl=$min_word_length -v "disambig=$phone_disambig_symbol" \
 '{ph[$1]=1} END{ for (n=0; n<3; n++) print n,n,disambig,disambig;
                  for (p in ph) { printf("0 1 %s %s_B\n", p, p); printf("1 1 %s %s_I\n", p, p);
                                  printf("1 2 %s %s_E\n", p, p); if (mwl==1) printf("0 3 %s %s_S\n", p, p);  }
                 print 2,0.0; if (mwl==1) print 3,0.0; }' >$dir/constraint_fst.txt
fi


echo "$0: creating final FST via composition, etc."

# Compile both text FSTs to binary form; the constraint FST is also arc-sorted.
fstcompile $sym_opts <$dir/constraint_fst.txt | fstarcsort > $dir/constraint.fst
fstcompile $sym_opts <$dir/unk_fst_orig.txt >$dir/unk_orig.fst

# The first 'fstproject' below projects on the input; it makes sure the
# disambiguation symbol appears on the output side also.
# The fstcompose actually applies the constraints and does the conversion, but
# after this the "correct" phones appear only on the output side.
# The second 'fstproject' copies the word-position-dependent phones to
# the input side.
# The 'fstpushspecial' pushes the weights, as the composition with the
#  constraint FST makes the FST quite non-stochastic [weights per state do not
#  sum up to one].
# The 'fstrmsymbols' command makes sure the disambiguation symbol appears only
# on the input side.
# 'fstminimizeencoded' combines states that are the same as far as their output
# arcs are concerned; in the case where --min-word-length is 1, this combines
# a lot of final-states that have no transitions out of them.
# Note that in the pipeline itself fstminimizeencoded runs before fstrmsymbols.
fstproject $dir/unk_orig.fst | \
  fstcompose - $dir/constraint.fst | \
  fstproject --project_output=true | \
  fstpushspecial | \
  fstminimizeencoded | \
  fstrmsymbols --remove-from-output=true <(echo $phone_disambig_int) >$dir/unk.fst

# Also write the result in text form, using the symbols from phones.txt.
fstprint $sym_opts <$dir/unk.fst >$dir/unk_fst.txt


exit 0;


================================================
FILE: egs/utils/lang/validate_disambig_sym_file.pl
================================================
#!/usr/bin/env perl

# Copyright 2016 FAU Erlangen (Author: Axel Horndasch)
# Apache 2.0.
#
# Concept: Dan Povey

use strict;
use warnings;
use Getopt::Long;

# Usage text; it is the message passed to die() when option parsing fails or
# the number of positional arguments is not exactly one.
my $Usage = <<EOU;
Usage:  validate_disambig_sym_file.pl [options] disambig_syms.txt

This scripts checks if the entries of a file containing disambiguation symbols
(word or phone level) are all valid. To be valid the symbols
- must start with the hash mark '#',
- must not contain any whitespace,
- must not be equal to '#-1' (disallowed because it is used internally in some
  FST stuff).

In case the option '--allow-numeric' is used with 'false', the symbols must
also be non-numeric (to avoid overlap with the automatically generated symbols).

Allowed options:
  --allow-numeric (true|false) : Default true. If false, disallow numeric
                                 disambiguation symbols like #0, #1 and so on.

EOU

# Command line options
# The value is kept as a string and compared against "false" further down.
my $allow_numeric = "true";

# Get the optional command line options
GetOptions(
    "allow-numeric=s" => \$allow_numeric,
    ) or die ($Usage);

# Exactly one positional argument (the symbol file) is expected.
if (@ARGV != 1) {
  die($Usage);
}

my $disambig_sym_file = shift @ARGV;

print "$0: Checking validity of file \"$disambig_sym_file\" ...\n";
# NOTE(review): -z tests for a zero-size file; a nonexistent path is presumably
# rejected by the open() check that follows rather than here -- confirm.
if (-z $disambig_sym_file) {
  print "$0: The file \"$disambig_sym_file\" is empty or does not exist, exiting ...\n"; exit 1;
}

# Open with the three-argument form so that no character of the file name can
# be interpreted as part of the open mode.
if (not open(SYMS, "<", $disambig_sym_file)) {
  print "$0: Could not open file \"$disambig_sym_file\", exiting ...\n"; exit 1;
}

# Go through the file containing disambiguation symbols line by line.  Each
# line is treated as one symbol; the first invalid symbol ends the script with
# status 1.
while (<SYMS>) {
  chomp;
  my $symbol = $_;

  if ($symbol =~ /^#(.*)$/) {
    # $sympart is everything after the leading '#'.
    my $sympart = $1;
    if ($sympart eq "") {
      print "$0: Only \"$symbol\" is not allowed as a disambiguation symbol, exiting ...\n"; exit 1;
    }
    if ($sympart =~/\s+/) {
      print "$0: The disambiguation symbol \"$symbol\" contains whitespace, exiting ...\n"; exit 1;
    }
    if ($sympart eq "-1") {
      print "$0: The disambiguation symbol \"$symbol\" is not allowed, exiting ...\n"; exit 1;
    }
    # Purely numeric symbols are rejected only when --allow-numeric is "false".
    if ($allow_numeric eq "false" &&
        $sympart =~/^[0-9]+$/) {
      print "$0: Since \"$symbol\" is supposed to be an extra disambiguation symbol, it must not be numeric, exiting ...\n"; exit 1;
    }
  } else {
    print "$0: The disambiguation symbol \"$symbol\" does not start with a '#', exiting ...\n"; exit 1;
  }
}
close(SYMS);

print "--> SUCCESS [validating disambiguation symbol file \"$disambig_sym_file\"]\n";
exit 0;


================================================
FILE: egs/utils/ln.pl
================================================
#!/usr/bin/env perl
use File::Spec;

# At least one input and the destination directory are required.
if ( @ARGV < 2 ) {
  print STDERR "usage: ln.pl input1 input2 dest-dir\n" .
    "This script does a soft link of input1, input2, etc." .
    "to dest-dir, using relative links where possible\n" .
    "Note: input-n and dest-dir may both be absolute pathnames,\n" .
    "or relative pathnames, relative to the current directlory.\n";
  exit(1);
}

# The last argument is the destination directory; the remaining arguments are
# the files to link.
$dir = pop @ARGV;
if ( ! -d $dir ) {
  print STDERR "ln.pl: last argument must be a directory ($dir is not a directory)\n";
  exit(1);
}

# $ans stays true only while every symlink() call has succeeded; it determines
# the exit status at the end.
$ans = 1; # true.

$absdir = File::Spec->rel2abs($dir); # Get $dir as abs path.
defined $absdir || die "No such directory $dir";
foreach $file (@ARGV) {
  $absfile =  File::Spec->rel2abs($file); # Get $file as abs path.
  defined $absfile || die "No such file or directory: $file";
  @absdir_split = split("/", $absdir);
  @absfile_split = split("/", $absfile);

  $newfile = $absdir . "/" . $absfile_split[$#absfile_split]; # we'll use this
  # as the destination in the link command.
  # Strip the leading path components that the directory and the file share;
  # what is left of @absdir_split is the part to climb out of with "../".
  $num_removed = 0;
  while (@absdir_split > 0 && $absdir_split[0] eq $absfile_split[0]) {
    shift @absdir_split;
    shift @absfile_split;
    $num_removed++;
  }
  if (-l $newfile) { # newfile is already a link -> safe to delete it.
    unlink($newfile); # "unlink" just means delete.
  }
  if ($num_removed == 0) { # will use absolute pathnames.
    $oldfile = "/" . join("/", @absfile_split);
    $ret = symlink($oldfile, $newfile);
  } else {
    # Relative link target: one "../" per remaining directory component,
    # followed by the remaining components of the file path.
    $num_dots = @absdir_split;
    $oldfile = join("/", @absfile_split);
    for ($n = 0; $n < $num_dots; $n++) {
      $oldfile = "../" . $oldfile;
    }
    $ret = symlink($oldfile, $newfile);
  }
  $ans = $ans && $ret;
  if (! $ret) {
    print STDERR "Error linking $oldfile to $newfile\n";
  }
}

# Exit status 0 only if all links were created.
exit ($ans == 1 ? 0 : 1);


================================================
FILE: egs/utils/make_absolute.sh
================================================
#!/usr/bin/env bash

# This script replaces the command readlink -f (which is not portable).
# It turns a pathname into an absolute pathname, including following soft links.
# Usage: make_absolute.sh <path>; the result is printed on stdout.
# All expansions are quoted so that paths containing whitespace or glob
# characters are handled as a single word.
target_file=$1

cd "$(dirname "$target_file")"
target_file=$(basename "$target_file")

# Iterate down a (possible) chain of symlinks: move into the directory of each
# link target so that relative targets are resolved from the right place.
while [ -L "$target_file" ]; do
    target_file=$(readlink "$target_file")
    cd "$(dirname "$target_file")"
    target_file=$(basename "$target_file")
done

# Compute the canonicalized name by finding the physical path
# for the directory we're in and appending the target file.
phys_dir=$(pwd -P)
result=$phys_dir/$target_file
echo "$result"


================================================
FILE: egs/utils/make_lexicon_fst.pl
================================================
#!/usr/bin/env perl

# THIS SCRIPT IS DEPRECATED AND WILL BE REMOVED.  See
# utils/lang/make_lexicon_fst.py which is the python-based replacement.


use warnings; #sed replacement for -w perl parameter
# Copyright 2010-2011  Microsoft Corporation
#                2013  Johns Hopkins University (author: Daniel Povey)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# makes lexicon FST, in text form, from lexicon (pronunciation probabilities optional).

# Command-line handling.  An optional leading --pron-probs flag is consumed
# first; what remains must be the lexicon filename alone, or the filename
# followed by silprob and silphone, optionally followed by the silence
# disambiguation symbol.
$pron_probs = (@ARGV > 0 && $ARGV[0] eq "--pron-probs") ? 1 : 0;
shift @ARGV if $pron_probs;

unless (@ARGV == 1 || @ARGV == 3 || @ARGV == 4) {
  print STDERR
    "Usage: make_lexicon_fst.pl [--pron-probs] lexicon.txt [silprob silphone [sil_disambig_sym]] >lexiconfst.txt\n\n",
    "Creates a lexicon FST that transduces phones to words, and may allow optional silence.\n\n",
    "Note: ordinarily, each line of lexicon.txt is:\n",
    "  word phone1 phone2 ... phoneN;\n",
    "if the --pron-probs option is used, each line is:\n",
    "  word pronunciation-probability phone1 phone2 ... phoneN.\n\n",
    "The probability 'prob' will typically be between zero and one, and note that\n",
    "it's generally helpful to normalize so the largest one for each word is 1.0, but\n",
    "this is your responsibility.\n\n",
    "The silence disambiguation symbol, e.g. something like #5, is used only\n",
    "when creating a lexicon with disambiguation symbols, e.g. L_disambig.fst,\n",
    "and was introduced to fix a particular case of non-determinism of decoding graphs.\n\n";
  exit(1);
}

$lexfn = shift @ARGV;
if (@ARGV == 0) {
  # Lexicon only: no optional silence.
  $silprob = 0.0;
} else {
  # Two or three values remain; $sildisambig stays undefined when only two
  # were supplied.
  ($silprob, $silphone, $sildisambig) = @ARGV;
}

unless ($silprob == 0.0) {
  die "Sil prob cannot be >= 1.0" unless $silprob < 1.0;
  # Costs are negated natural-log probabilities.
  $silcost = -log($silprob);
  $nosilcost = -log(1.0 - $silprob);
}


open(L, "<$lexfn") or die "Error opening lexicon $lexfn";


# Emit the lexicon FST in text form on stdout.  Arc lines are
#   <src-state> TAB <dest-state> TAB <phone> TAB <word-or-<eps>> [TAB <cost>]
# and final-state lines are  <state> TAB <final-cost>.
if ( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero.
  $loopstate = 0;
  $nextstate = 1;               # next unallocated state.
  while (<L>) {
    @A = split(" ", $_);
    @A == 0 && die "Empty lexicon line.";
    foreach $a (@A) {
      if ($a eq "<eps>") {
        die "Bad lexicon line $_ (<eps> is forbidden)";
      }
    }
    $w = shift @A;
    if (! $pron_probs) {
      $pron_cost = 0.0;
    } else {
      # With --pron-probs the second field is the pronunciation probability,
      # which must lie in (0, 1].
      $pron_prob = shift @A;
      if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) {
        die "Bad pronunciation probability in line $_";
      }
      $pron_cost = -log($pron_prob);
    }
    # A zero cost is omitted from the arc line altogether.
    if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; }

    # One arc per phone: start at the loop state, pass through freshly
    # allocated states, and return to the loop state on the last phone.
    $s = $loopstate;
    $word_or_eps = $w;
    while (@A > 0) {
      $p = shift @A;
      if (@A > 0) {
        $ns = $nextstate++;
      } else {
        $ns = $loopstate;
      }
      print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n";
      $word_or_eps = "<eps>";
      $pron_cost_string = ""; # so we only print it on the first arc of the word.
      $s = $ns;
    }
  }
  print "$loopstate\t0\n";      # final-cost.
} else {                        # have silence probs.
  $startstate = 0;
  $loopstate = 1;
  $silstate = 2;   # state from where we go to loopstate after emitting silence.
  print "$startstate\t$loopstate\t<eps>\t<eps>\t$nosilcost\n"; # no silence.
  if (!defined $sildisambig) {
    print "$startstate\t$loopstate\t$silphone\t<eps>\t$silcost\n"; # silence.
    print "$silstate\t$loopstate\t$silphone\t<eps>\n";             # no cost.
    $nextstate = 3;
  } else {
    # With a disambiguation symbol, every silence arc goes to an extra state
    # (3) that reaches the loop state by emitting that symbol.
    $disambigstate = 3;
    $nextstate = 4;
    print "$startstate\t$disambigstate\t$silphone\t<eps>\t$silcost\n"; # silence.
    print "$silstate\t$disambigstate\t$silphone\t<eps>\n"; # no cost.
    print "$disambigstate\t$loopstate\t$sildisambig\t<eps>\n"; # silence disambiguation symbol.
  }
  # NOTE(review): unlike the no-silence branch above, this loop does not
  # reject empty lines or "<eps>" entries.
  while (<L>) {
    @A = split(" ", $_);
    $w = shift @A;
    if (! $pron_probs) {
      $pron_cost = 0.0;
    } else {
      $pron_prob = shift @A;
      if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) {
        die "Bad pronunciation probability in line $_";
      }
      $pron_cost = -log($pron_prob);
    }
    if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; }
    $s = $loopstate;
    $word_or_eps = $w;
    while (@A > 0) {
      $p = shift @A;
      if (@A > 0) {
        # Not the last phone: plain arc to a newly allocated state.
        $ns = $nextstate++;
        print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n";
        $word_or_eps = "<eps>";
        $pron_cost_string = ""; $pron_cost = 0.0; # so we only print it the 1st time.
        $s = $ns;
      } elsif (!defined($silphone) || $p ne $silphone) {
        # This is non-deterministic but relatively compact,
        # and avoids epsilons.
        # Last phone: one arc to the loop state (no silence follows) and one
        # to the silence state (silence follows).  $pron_cost is non-zero
        # here only when this is also the first phone of the word.
        $local_nosilcost = $nosilcost + $pron_cost;
        $local_silcost = $silcost + $pron_cost;
        print "$s\t$loopstate\t$p\t$word_or_eps\t$local_nosilcost\n";
        print "$s\t$silstate\t$p\t$word_or_eps\t$local_silcost\n";
      } else {
        # no point putting opt-sil after silence word.
        print "$s\t$loopstate\t$p\t$word_or_eps$pron_cost_string\n";
      }
    }
  }
  print "$loopstate\t0\n";      # final-cost.
}


================================================
FILE: egs/utils/make_lexicon_fst_silprob.pl
================================================
#!/usr/bin/env perl

# THIS SCRIPT IS DEPRECATED AND WILL BE REMOVED.  See
# utils/lang/make_lexicon_fst_silprob.py which is the python-based replacement.

use warnings; #sed replacement for -w perl parameter
# Copyright 2010-2011  Microsoft Corporation
#                2013  Johns Hopkins University (author: Daniel Povey)
#                2015  Hainan Xu
#                2015  Guoguo Chen

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# Makes a lexicon FST, in text form, from a lexicon which contains (optional)
# probabilities of pronunciations, and (mandatory) probabilities of silence
# before and after the pronunciation. This script is almost the same as
# the make_lexicon_fst.pl script except for the word-dependent silprobs part.

# Exactly four positional arguments are required; otherwise print the usage
# text to stderr and exit with status 1.
unless (@ARGV == 4) {
  print STDERR
    "Usage: $0 lexiconp_silprob_disambig.txt \\\n",
    "       silprob.txt silphone_string sil_disambig_sym > lexiconfst.txt \n",
    "\n",
    "This script is almost the same as the utils/make_lexicon_fst.pl\n",
    "except here we include word-dependent silence probabilities\n",
    "when making the lexicon FSTs. ",
    "For details, see paper \nhttp://danielpovey.com/files/2015_interspeech_silprob.pdf\n\n",
    "The lexiconp_silprob_disambig.txt file should have each line like \n\n",
    "word p(pronunciation|word) p(sil-after|word) correction-term-for-sil ",
    "correction-term-for-no-sil phone-1 phone-2 ... phone-N\n\n",
    "The pronunciation would have to include disambiguation symbols;\n",
    "the 2 correction terms above are computed to reflect how much a \n",
    "word affects the probability of a [non-]silence before it. \n",
    "Please see the paper (link given above) for detailed descriptions\n",
    "for how the 2 terms are computed.\n\n",
    "The silprob.txt file contains 4 lines, \n\n",
    "<s> p(sil-after|<s>)\n",
    "</s>_s correction-term-for-sil-for-</s>\n",
    "</s>_n correction-term-for-no-sil-for-</s>\n",
    "overall p(overall-sil)\n\n",
    "Other files are the same as utils/make_lexicon_fst.pl\n";

  exit(1);
}

# Positional arguments: lexicon file, silprob file, silence phone and the
# silence disambiguation symbol.
$lexfn = shift @ARGV;
$silprobfile = shift @ARGV;

($silphone,$sildisambig) = @ARGV;

open(L, "<$lexfn") || die "Error opening lexicon $lexfn";
# The message names the file that failed to open (it previously interpolated
# the unset variable $SP).
open(SP, "<$silprobfile") || die "Error opening word-sil-probs $silprobfile";

# -1 marks "not seen in the silprob file".
$silbeginprob = -1;
$silendcorrection = -1;
$nonsilendcorrection = -1;
$siloverallprob = -1;

# Each line of the silprob file is "<key> <value>"; unknown keys are ignored.
while (<SP>) {
  @A = split(" ", $_);
  next unless @A;   # skip blank lines instead of comparing an undefined key.
  $w = shift @A;
  if ($w eq "<s>") {
    $silbeginprob = shift @A;
  }
  if ($w eq "</s>_s") {
    $silendcorrection = shift @A;
  }
  if ($w eq "</s>_n") {
    $nonsilendcorrection = shift @A;
  }
  if ($w eq "overall") {
    $siloverallprob = shift @A;
  }
}

# The three values below are passed to log() when the FST is written, so a
# missing or non-positive value would abort there, possibly after part of the
# FST has already been printed.  Fail here with a message naming the entry.
foreach $entry (["<s>", $silbeginprob],
                ["</s>_s", $silendcorrection],
                ["</s>_n", $nonsilendcorrection]) {
  ($name, $value) = @$entry;
  (defined $value && $value > 0) ||
    die "Missing or non-positive value for $name in $silprobfile";
}

# Emit the FST in text form.  State 0 is the start state; states 1 and 2 are
# the hubs reached without / with a preceding silence.  Every cost is a
# negated natural log.
$startstate = 0;
$nonsilstart = 1;
$silstart = 2;
$nextstate = 3;

# Initial arcs: silence phone into $silstart, disambiguation symbol into
# $nonsilstart.
$cost = -log($silbeginprob);
print "$startstate\t$silstart\t$silphone\t<eps>\t$cost\n"; # will change these
$cost = -log(1 - $silbeginprob);
print "$startstate\t$nonsilstart\t$sildisambig\t<eps>\t$cost\n";

# Each lexicon line is:
#   word pron-prob sil-after-prob sil-correction nonsil-correction phone...
while (<L>) {
  @A = split(" ", $_);
  $w = shift @A;
  $pron_prob = shift @A;
  if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) {
    die "Bad pronunciation probability in line $_";
  }

  $wordsilprob = shift @A;
  $silwordcorrection = shift @A;
  $nonsilwordcorrection = shift @A;

  # NOTE(review): these three fields are not range-checked before log().
  $pron_cost = -log($pron_prob);
  $wordsilcost = -log($wordsilprob);
  $wordnonsilcost = -log(1.0 - $wordsilprob);
  $silwordcost = -log($silwordcorrection);
  $nonsilwordcost = -log($nonsilwordcorrection);

  $first = 1;  # used as a bool, to handle the first phone (adding sils)
  while (@A > 0) {
    $p = shift @A;

    if ($first == 1) {
      # The first phone carries the word label and is reachable from both
      # hubs, each with its own correction cost plus the pronunciation cost.
      $newstate = $nextstate++;

      # for nonsil before w
      $cost = $nonsilwordcost + $pron_cost;
      print "$nonsilstart\t$newstate\t$p\t$w\t$cost\n";

      # for sil before w
      $cost = $silwordcost + $pron_cost;
      print "$silstart\t$newstate\t$p\t$w\t$cost\n";
      $first = 0;
    }
    else {
      # Later phones extend the chain from the most recently allocated state.
      $oldstate = $nextstate - 1;
      print "$oldstate\t$nextstate\t$p\t<eps>\n";
      $nextstate++;
    }
    if (@A == 0) {
      # After the last phone, return to one of the two hubs.
      $oldstate = $nextstate - 1;
      # for no sil after w
      $cost = $wordnonsilcost;
      print "$oldstate\t$nonsilstart\t$sildisambig\t<eps>\t$cost\n";

      # for sil after w
      $cost = $wordsilcost;
      print "$oldstate\t$silstart\t$silphone\t<eps>\t$cost\n";
    }
  }
}
# Both hubs are final states, weighted by the end-of-sentence corrections.
$cost = -log($silendcorrection);
print "$silstart\t$cost\n";
$cost = -log($nonsilendcorrection);
print "$nonsilstart\t$cost\n";


================================================
FILE: egs/utils/make_unigram_grammar.pl
================================================
#!/usr/bin/env perl
# Copyright 2010-2012 Microsoft Corporation  Johns Hopkins University (Author: Daniel Povey)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This script is used in discriminative training.
# This script makes a simple unigram-loop version of G.fst
# using a unigram grammar estimated from some training transcripts.
# This is for MMI training.
# We don't have any silences in G.fst; these are supplied by the
# optional silences in the lexicon.

# Note: the symbols in the transcripts become the input and output
# symbols of G.txt; these can be numeric or not.

if (@ARGV != 0) {
    die "Usage: make_unigram_grammar.pl < text-transcripts > G.txt"
}

# Count every symbol in the transcripts read from stdin.
$totcount = 0;
$nl = 0;
while (<>) {
  @A = split(" ", $_);
  foreach $sym (@A) {
    $count{$sym}++;
    $totcount++;
  }
  $nl++;
  $totcount++; # Treat end-of-sentence as a symbol for purposes of
  # $totcount, so the grammar is properly stochastic.  This doesn't
  # become </s>, it just becomes the final-prob.
}

# With no input lines $totcount is zero and the divisions below would fail
# with "Illegal division by zero"; report the actual problem instead.
$totcount > 0 || die "make_unigram_grammar.pl: no input transcripts were read";

# One self-loop arc on state 0 per symbol.  Symbols are sorted so that the
# output does not depend on Perl's hash ordering.
foreach $sym (sort keys %count) {
  $cost = -log($count{$sym} / $totcount);   # Negated natural-log probs.
  print "0\t0\t$sym\t$sym\t$cost\n";
}
# State 0 is final; its cost is the negated log of the end-of-sentence
# probability (number of lines / total count).
$final_cost = -log($nl / $totcount);
print "0\t$final_cost\n";


================================================
FILE: egs/utils/map_arpa_lm.pl
================================================
#!/usr/bin/env perl

# Copyright 2014  Guoguo Chen
#           2014  Johns Hopkins University (author: Daniel Povey)
# Apache 2.0.
#
use strict;
use warnings;
use Getopt::Long;

# Help text; the script dies with this message when it is not given exactly
# one positional argument.
my $Usage = <<EOU;
This script reads the Arpa format language model, and maps the words into
integers or vice versa. It ignores the words that are not in the symbol table,
and updates the head information.

It will be used joinly with lmbin/arpa-to-const-arpa to build ConstArpaLm format
language model. We first map the words in an Arpa format language model to
integers, and then use lmbin/arpa-to-const-arpa to build a ConstArpaLm format
language model.

Usage: utils/map_arpa_lm.pl [options] <vocab-file> < input-arpa >output-arpa
 e.g.: utils/map_arpa_lm.pl words.txt <arpa_lm.txt >arpa_lm.int

Allowed options:
  --sym2int   : If true, maps words to integers, other wise maps integers to
                words. (boolean, default = true)

EOU

# --sym2int selects the mapping direction: "true" maps symbols to integers,
# "false" maps integers back to symbols.
my $sym2int = "true";
GetOptions('sym2int=s' => \$sym2int);

($sym2int eq "true" || $sym2int eq "false") ||
  die "$0: Bad value for option --sym2int\n";

if (@ARGV != 1) {
  die $Usage;
}

# The only positional parameter is the symbol table; the ARPA model itself is
# read from stdin and written to stdout.
my $symtab = shift @ARGV;

# Three-argument open, so that characters in the filename are never
# interpreted as an open mode.
open(M, "<", $symtab) || die "$0: Fail to open $symtab\n";

# Reads in the mapper.  Each line of the symbol table is "<symbol> <integer>".
my %mapper;
while (<M>) {
  chomp;
  my @col = split(/[\s]+/, $_);
  @col == 2 || die "$0: Bad line in mapper file \"$_\"\n";
  # Key on the symbol when mapping to integers, on the integer otherwise.
  my ($from, $to) = ($sym2int eq "true") ? @col : reverse(@col);
  if (defined($mapper{$from})) {
    die "$0: Duplicate entry \"$from\"\n";
  }
  $mapper{$from} = $to;
}

my $num_oov_lines = 0;
my $max_oov_warn = 20;

# Parses Arpa n-gram language model from stdin and writes the mapped model to
# stdout.  $current_order is -1 before "\data\", 0 inside the header, and the
# n-gram order inside each "\N-grams:" section.
# NOTE(review): header lines are echoed unchanged, so the n-gram counts in the
# header are not reduced when lines containing OOVs are dropped.
my $current_order = -1;
while (<STDIN>) {
  chomp;
  my @col = split(" ", $_);

  # Skip everything that precedes the "\data\" marker.
  if ($current_order == -1 and ! m/^\\data\\$/) {
    next;
  }

  if (m/^\\data\\$/) {
    print STDERR "$0: Processing \"\\data\\\"\n";
    print "$_\n";
    $current_order = 0;
  } elsif (m/^\\([0-9]+)-grams:$/) {
    # At least one digit is required, so the order is always numeric.
    $current_order = $1;
    print "$_\n";
    print STDERR "$0: Processing \"\\${current_order}-grams:\"\n";
  } elsif (m/^\\end\\/) {
    print "$_\n";
  } elsif ($_ eq "") {
    # Blank lines are kept only inside the n-gram sections.
    if ($current_order >= 1) {
      print "\n";
    }
  } else {
    if ($current_order == 0) {
      # echo head section.
      print "$_\n";
    } else {
      # Parses n-gram section: a probability, $current_order words and an
      # optional back-off weight.
      if (@col > 2 + $current_order || @col < 1 + $current_order) {
        die "$0: Bad line in arpa lm \"$_\"\n";
      }
      my $prob = shift @col;
      my $is_oov = 0;
      for (my $i = 0; $i < $current_order; $i++) {
        my $temp = $mapper{$col[$i]};
        if (!defined($temp)) {
          $is_oov = 1;
          $num_oov_lines++;
          last;
        } else {
          $col[$i] = $temp;
        }
      }
      if (!$is_oov) {
        my $rest_of_line = join(" ", @col);
        print "$prob\t$rest_of_line\n";
      } else {
        # Lines with a word missing from the symbol table are dropped; only
        # the first few are reported.
        if ($num_oov_lines < $max_oov_warn) {
          print STDERR "$0: Warning: OOV line $_\n";
        }
      }
    }
  }
}

if ($num_oov_lines > 0) {
  print STDERR "$0: $num_oov_lines lines of the Arpa file contained OOVs and ";
  print STDERR "were not printed.\n";
}

close(M);


================================================
FILE: egs/utils/mkgraph.sh
================================================
#!/usr/bin/env bash
# Copyright 2010-2012 Microsoft Corporation
#           2012-2013 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0

# This script creates a fully expanded decoding graph (HCLG) that represents
# all the language-model, pronunciation dictionary (lexicon), context-dependency,
# and HMM structure in our model.  The output is a Finite State Transducer
# that has word-ids on the output, and transition-ids on the input (these are indexes
# that resolve to pdf-ids).
# See
#  http://kaldi-asr.org/doc/graph_recipe_test.html
# (this is compiled from this repository using Doxygen,
# the source for this part is in src/doc/graph_recipe_test.dox)

# A pipeline fails if any command in it fails, not only the last one.
set -o pipefail

# Defaults for --transition-scale and --self-loop-scale.
tscale=1.0
loopscale=0.1

remove_oov=false

# Option parsing: up to four passes over the leading arguments.  In each pass
# the tests run in the order written, and each recognised option is shifted
# off the argument list.
for x in `seq 4`; do
  [ "$1" == "--mono" -o "$1" == "--left-biphone" -o "$1" == "--quinphone" ] && shift && \
    echo "WARNING: the --mono, --left-biphone and --quinphone options are now deprecated and ignored."
  [ "$1" == "--remove-oov" ] && remove_oov=true && shift;
  [ "$1" == "--transition-scale" ] && tscale=$2 && shift 2;
  [ "$1" == "--self-loop-scale" ] && loopscale=$2 && shift 2;
done

# Exactly three positional arguments must remain after the options.
if [ $# != 3 ]; then
   echo "Usage: utils/mkgraph.sh [options] <lang-dir> <model-dir> <graphdir>"
   echo "e.g.: utils/mkgraph.sh data/lang_test exp/tri1/ exp/tri1/graph"
   echo " Options:"
   echo " --remove-oov       #  If true, any paths containing the OOV symbol (obtained from oov.int"
   echo "                    #  in the lang directory) are removed from the G.fst during compilation."
   echo " --transition-scale #  Scaling factor on transition probabilities."
   echo " --self-loop-scale  #  Please see: http://kaldi-asr.org/doc/hmm.html#hmm_scale."
   echo "Note: the --mono, --left-biphone and --quinphone options are now deprecated"
   echo "and will be ignored."
   exit 1;
fi

# Source path.sh from the current directory when present.
if [ -f path.sh ]; then . ./path.sh; fi

lang=$1
tree=$2/tree
model=$2/final.mdl
dir=$3

mkdir -p $dir

# Check that every required input file exists before doing any work.

required="$lang/L_disambig.fst $lang/G.fst $lang/phones.txt $lang/words.txt $lang/phones/silence.csl $lang/phones/disambig.int $model $tree"
for f in $required; do
  [ ! -f $f ] && echo "mkgraph.sh: expected $f to exist" && exit 1;
done

if [ -f $dir/HCLG.fst ]; then
  # detect when the result already exists, and avoid overwriting it.
  # (-nt means "newer than": rebuild if any input is newer than HCLG.fst.)
  must_rebuild=false
  for f in $required; do
    [ $f -nt $dir/HCLG.fst ] && must_rebuild=true
  done
  if ! $must_rebuild; then
    echo "$0: $dir/HCLG.fst is up to date."
    exit 0
  fi
fi


# Context width (N) and central position (P), taken from the second field of
# the matching lines of the tree-info output.
N=$(tree-info $tree | grep "context-width" | cut -d' ' -f2) || { echo "Error when getting context-width"; exit 1; }
P=$(tree-info $tree | grep "central-position" | cut -d' ' -f2) || { echo "Error when getting central-position"; exit 1; }

# Warn when the model directory has a frame_subsampling_factor file but the
# self-loop scale is still the default 0.1.
[[ -f $2/frame_subsampling_factor && "$loopscale" == "0.1" ]] && \
  echo "$0: WARNING: chain models need '--self-loop-scale 1.0'";

# Grammar decoding is enabled by the presence of nonterm_phones_offset.int.
# It sets the extra option given to fstcomposecontext and make-h-transducer,
# and the command applied to the graph after add-self-loops ("cat" otherwise).
if [ -f $lang/phones/nonterm_phones_offset.int ]; then
  if [[ $N != 2  || $P != 1 ]]; then
    echo "$0: when doing grammar decoding, you can only build graphs for left-biphone trees."
    exit 1
  fi
  nonterm_phones_offset=$(cat $lang/phones/nonterm_phones_offset.int)
  nonterm_opt="--nonterm-phones-offset=$nonterm_phones_offset"
  prepare_grammar_command="make-grammar-fst --nonterm-phones-offset=$nonterm_phones_offset - -"
else
  prepare_grammar_command="cat"
  nonterm_opt=
fi

# Each stage below writes its output to a temporary file suffixed with the
# shell PID ($$) and renames it into place on success.  The trap installed at
# the start of a stage removes that stage's temporary file on exit or signal.

# Stage 1: LG = L_disambig composed with G, rebuilt when missing/empty or
# older than either source.
mkdir -p $lang/tmp
trap "rm -f $lang/tmp/LG.fst.$$" EXIT HUP INT PIPE TERM
# Note: [[ ]] is like [ ] but enables certain extra constructs, e.g. || in
# place of -o
if [[ ! -s $lang/tmp/LG.fst || $lang/tmp/LG.fst -ot $lang/G.fst || \
      $lang/tmp/LG.fst -ot $lang/L_disambig.fst ]]; then
  fsttablecompose $lang/L_disambig.fst $lang/G.fst | fstdeterminizestar --use-log=true | \
    fstminimizeencoded | fstpushspecial > $lang/tmp/LG.fst.$$ || exit 1;
  mv $lang/tmp/LG.fst.$$ $lang/tmp/LG.fst
  fstisstochastic $lang/tmp/LG.fst || echo "[info]: LG not stochastic."
fi

# Stage 2: CLG (context-dependent LG) and the matching ilabels file.
clg=$lang/tmp/CLG_${N}_${P}.fst
clg_tmp=$clg.$$
ilabels=$lang/tmp/ilabels_${N}_${P}
ilabels_tmp=$ilabels.$$
trap "rm -f $clg_tmp $ilabels_tmp" EXIT HUP INT PIPE TERM
if [[ ! -s $clg || $clg -ot $lang/tmp/LG.fst \
    || ! -s $ilabels || $ilabels -ot $lang/tmp/LG.fst ]]; then
  # Abort on failure, as the other stages do, so that a partial CLG or
  # ilabels file is never moved into place.
  fstcomposecontext $nonterm_opt --context-size=$N --central-position=$P \
   --read-disambig-syms=$lang/phones/disambig.int \
   --write-disambig-syms=$lang/tmp/disambig_ilabels_${N}_${P}.int \
    $ilabels_tmp $lang/tmp/LG.fst |\
    fstarcsort --sort_type=ilabel > $clg_tmp || exit 1;
  mv $clg_tmp $clg
  mv $ilabels_tmp $ilabels
  fstisstochastic $clg || echo "[info]: CLG not stochastic."
fi

# Stage 3: Ha, the H transducer without self-loops.
trap "rm -f $dir/Ha.fst.$$" EXIT HUP INT PIPE TERM
if [[ ! -s $dir/Ha.fst || $dir/Ha.fst -ot $model  \
    || $dir/Ha.fst -ot $lang/tmp/ilabels_${N}_${P} ]]; then
  make-h-transducer $nonterm_opt --disambig-syms-out=$dir/disambig_tid.int \
    --transition-scale=$tscale $lang/tmp/ilabels_${N}_${P} $tree $model \
     > $dir/Ha.fst.$$  || exit 1;
  mv $dir/Ha.fst.$$ $dir/Ha.fst
fi

# Stage 4: HCLGa = Ha composed with CLG, with disambiguation symbols removed.
trap "rm -f $dir/HCLGa.fst.$$" EXIT HUP INT PIPE TERM
if [[ ! -s $dir/HCLGa.fst || $dir/HCLGa.fst -ot $dir/Ha.fst || \
      $dir/HCLGa.fst -ot $clg ]]; then
  if $remove_oov; then
    [ ! -f $lang/oov.int ] && \
      echo "$0: --remove-oov option: no file $lang/oov.int" && exit 1;
    # With --remove-oov, $clg becomes a command ending in "|" rather than a
    # filename, so CLG is filtered through fstrmsymbols when it is read.
    clg="fstrmsymbols --remove-arcs=true --apply-to-output=true $lang/oov.int $clg|"
  fi
  fsttablecompose $dir/Ha.fst "$clg" | fstdeterminizestar --use-log=true \
    | fstrmsymbols $dir/disambig_tid.int | fstrmepslocal | \
     fstminimizeencoded > $dir/HCLGa.fst.$$ || exit 1;
  mv $dir/HCLGa.fst.$$ $dir/HCLGa.fst
  fstisstochastic $dir/HCLGa.fst || echo "HCLGa is not stochastic"
fi

# Stage 5: add self-loops and convert to a const FST to obtain HCLG.
trap "rm -f $dir/HCLG.fst.$$" EXIT HUP INT PIPE TERM
if [[ ! -s $dir/HCLG.fst || $dir/HCLG.fst -ot $dir/HCLGa.fst ]]; then
  add-self-loops --self-loop-scale=$loopscale --reorder=true $model $dir/HCLGa.fst | \
    $prepare_grammar_command | \
    fstconvert --fst_type=const > $dir/HCLG.fst.$$ || exit 1;
  mv $dir/HCLG.fst.$$ $dir/HCLG.fst
  if [ $tscale == 1.0 -a $loopscale == 1.0 ]; then
    # No point doing this test if transition-scale not 1, as it is bound to fail.
    fstisstochastic $dir/HCLG.fst || echo "[info]: final HCLG is not stochastic."
  fi
fi

# note: the empty FST has 66 bytes.  this check is for whether the final FST
# is the empty file or is the empty FST.
# (head -c 67 yields 67 bytes only when the file is longer than 66 bytes.)
if ! [ $(head -c 67 $dir/HCLG.fst | wc -c) -eq 67 ]; then
  echo "$0: it looks like the result in $dir/HCLG.fst is empty"
  exit 1
fi

# save space.
rm $dir/HCLGa.fst $dir/Ha.fst 2>/dev/null || true

# keep a copy of the lexicon and a list of silence phones with HCLG...
# this means we can decode without reference to the $lang directory.


# words.txt and silence.csl are mandatory (the script exits if the copy
# fails); errors from the other copies are discarded.
cp $lang/words.txt $dir/ || exit 1;
mkdir -p $dir/phones
cp $lang/phones/word_boundary.* $dir/phones/ 2>/dev/null # might be needed for ctm scoring,
cp $lang/phones/align_lexicon.* $dir/phones/ 2>/dev/null # might be needed for ctm scoring,
cp $lang/phones/optional_silence.* $dir/phones/ 2>/dev/null # might be needed for analyzing alignments.
    # but ignore the error if it's not there.


cp $lang/phones/disambig.{txt,int} $dir/phones/ 2> /dev/null
cp $lang/phones/silence.csl $dir/phones/ || exit 1;
cp $lang/phones.txt $dir/ 2> /dev/null # ignore the error if it's not there.

# Store the last field of the "pdfs" line of the am-info output.
am-info --print-args=false $model | grep pdfs | awk '{print $NF}' > $dir/num_pdfs


================================================
FILE: egs/utils/mkgraph_lookahead.sh
================================================
#!/bin/bash
# Copyright 2019 Alpha Cephei Inc.
# Copyright 2018 Joan Puigcerver
# Copyright 2010-2012 Microsoft Corporation
#           2012-2013 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0

# This script creates setup for decoding with lookahead online composition. The 
# graph HCLr.fst represents pronunciation dictionary (lexicon), context-dependency,
# and HMM structure in our model. The graph Gr.fst represents the language model.
# If arpa model is provided it compiles ngram model into compact LOUDS-encoded
# structure with opengrm. Both HCLr.fst and Gr.fst are optionally combined into
# single graph HCLG for testing with default decoders.
#
# See
#  http://kaldi-asr.org/doc/graph_recipe_test.html
# (this is compiled from this repository using Doxygen,
# the source for this part is in src/doc/graph_recipe_test.dox)
#
# Note that most of the fsts here are not stochastic, so many kaldi operations like
# fstpushspecial or fstdeterminizestar in log domain do not really work for them. 
# Instead most operations are in tropical domain.
# A pipeline fails if any command in it fails, not only the last one.
set -o pipefail

# Defaults for the command-line options.
tscale=1.0
loopscale=0.1
compose_graph=false
remove_oov=false

# Option parsing: up to four passes over the leading arguments; each
# recognised option is shifted off the argument list.
for x in `seq 4`; do
  [ "$1" == "--remove-oov" ] && remove_oov=true && shift;
  [ "$1" == "--compose-graph" ] && compose_graph=true && shift;
  [ "$1" == "--transition-scale" ] && tscale=$2 && shift 2;
  [ "$1" == "--self-loop-scale" ] && loopscale=$2 && shift 2;
done

# Note: [[ ]] is like [ ] but enables certain extra constructs, e.g. || in
# place of -o
# Three positional arguments (grammar taken from <lang-dir>/G.fst) or four
# (grammar compiled from <arpa_file>) are accepted.
if [[ $# != 3 && $# != 4 ]]; then
   echo "Usage: $0 [options] <lang-dir> <model-dir> [<arpa_file>] <graphdir>"
   echo "e.g.: $0 data/lang exp/tri1 db/trigram.lm.gz exp/tri1/lgraph"
   echo " Options:"
   echo " --remove-oov       #  If true, any paths containing the OOV symbol (obtained from oov.int"
   echo "                    #  in the lang directory) are removed from the G.fst during compilation."
   echo " --transition-scale #  Scaling factor on transition probabilities."
   echo " --self-loop-scale  #  Please see: http://kaldi-asr.org/doc/hmm.html#hmm_scale."
   echo " --compose-graph    #  Compile composed graph for testing with other decoders (default: false)"
   exit 1;
fi

# Source path.sh from the current directory when present.
if [ -f path.sh ]; then . ./path.sh; fi

lang=$1
tree=$2/tree
model=$2/final.mdl

# $grammar is the file the language model is built from: $lang/G.fst in the
# three-argument form, the ARPA file in the four-argument form.
if [ $# == 3 ]; then
  echo "$0 : compiling grammar $1/G.fst"
  arpa=
  grammar=$lang/G.fst
  dir=$3
else
  echo "$0 : compiling grammar $3"
  arpa=$3
  grammar=$arpa
  dir=$4
  # ngramread is needed to compile the ARPA model.
  loc=`which ngramread`
  if [ -z "$loc" ]; then
    echo You appear to not have OpenGRM tools installed.
    echo cd to $KALDI_ROOT/tools and run extras/install_opengrm.sh.
    exit 1
  fi
fi

mkdir -p $dir

# The grammar source is listed exactly once.  Without an ARPA file this is
# G.fst, so a missing or updated G.fst is now noticed by the checks below.
required="$lang/L_disambig.fst $grammar $lang/phones.txt $lang/words.txt $lang/phones/silence.csl $lang/phones/disambig.int $model $tree"
for f in $required; do
  [ ! -f $f ] && echo "$0 : expected $f to exist" && exit 1;
done

if [ -f $dir/HCLG.fst ]; then
  # detect when the result already exists, and avoid overwriting it.
  # (-nt means "newer than": rebuild if any input is newer than HCLG.fst.)
  must_rebuild=false
  for f in $required; do
    [ $f -nt $dir/HCLG.fst ] && must_rebuild=true
  done
  if ! $must_rebuild; then
    echo "$0: $dir/HCLG.fst is up to date."
    exit 0
  fi
fi


# Context width (N) and central position (P), taken from the second field of
# the matching lines of the tree-info output.
N=$(tree-info $tree | grep "context-width" | cut -d' ' -f2) || { echo "Error when getting context-width"; exit 1; }
P=$(tree-info $tree | grep "central-position" | cut -d' ' -f2) || { echo "Error when getting central-position"; exit 1; }

[[ -f $2/frame_subsampling_factor && "$loopscale" == "0.1" ]] && \
  echo "$0: WARNING: chain models need '--self-loop-scale 1.0'";

# Each stage below writes its output to a temporary file suffixed with the
# shell PID ($$) and renames it into place on success.  The trap installed at
# the start of a stage removes that stage's temporary file on exit or signal.

# Stage 1: determinized, ilabel-sorted lexicon.  The staleness test must name
# the real output file (with the .fst suffix): "-ot" is also true when the
# left-hand file does not exist, so the misspelt name forced a rebuild on
# every run.
trap "rm -f $dir/L_disambig_det.fst.$$" EXIT HUP INT PIPE TERM
if [[ ! -s $dir/L_disambig_det.fst || $dir/L_disambig_det.fst -ot $lang/L_disambig.fst ]]; then
  fstdeterminizestar $lang/L_disambig.fst | fstarcsort --sort_type=ilabel > $dir/L_disambig_det.fst.$$ || exit 1;
  mv $dir/L_disambig_det.fst.$$ $dir/L_disambig_det.fst
fi

# Stage 2: CL (context-dependent lexicon) and the matching ilabels file.
# NOTE(review): nonterm_opt is never assigned in this script, so it expands
# to nothing unless it is inherited from the environment.
cl=$dir/CL_${N}_${P}.fst
cl_tmp=$cl.$$
ilabels=$dir/ilabels_${N}_${P}
ilabels_tmp=$ilabels.$$
trap "rm -f $cl_tmp $ilabels_tmp" EXIT HUP INT PIPE TERM
if [[ ! -s $cl || $cl -ot $dir/L_disambig_det.fst \
    || ! -s $ilabels || $ilabels -ot $dir/L_disambig_det.fst ]]; then
  # Abort on failure, as the other stages do, so that a partial CL or
  # ilabels file is never moved into place.
  fstcomposecontext $nonterm_opt --context-size=$N --central-position=$P \
   --read-disambig-syms=$lang/phones/disambig.int \
   --write-disambig-syms=$dir/disambig_ilabels_${N}_${P}.int \
    $ilabels_tmp $dir/L_disambig_det.fst | \
    fstarcsort --sort_type=ilabel > $cl_tmp || exit 1;
  mv $cl_tmp $cl
  mv $ilabels_tmp $ilabels
fi

# Stage 3: Ha, the H transducer without self-loops, sorted on output labels.
trap "rm -f $dir/Ha.fst.$$" EXIT HUP INT PIPE TERM
if [[ ! -s $dir/Ha.fst || $dir/Ha.fst -ot $model  \
    || $dir/Ha.fst -ot $dir/ilabels_${N}_${P} ]]; then
  make-h-transducer $nonterm_opt --disambig-syms-out=$dir/disambig_tid.int \
    --transition-scale=$tscale $dir/ilabels_${N}_${P} $tree $model | \
  fstarcsort --sort_type=olabel \
     > $dir/Ha.fst.$$  || exit 1;
  mv $dir/Ha.fst.$$ $dir/Ha.fst
fi

# Stage 4: HCLr, the olabel-lookahead FST.  fstconvert also writes the
# relabelling pairs to $dir/relabel, which the grammar stage reads.
trap "rm -f $dir/HCLr.fst.$$" EXIT HUP INT PIPE TERM
if [[ ! -s $dir/HCLr.fst || $dir/HCLr.fst -ot $dir/Ha.fst || \
      $dir/HCLr.fst -ot $cl ]]; then
  fstcompose $dir/Ha.fst "$cl" | fstdeterminizestar | \
     add-self-loops --disambig-syms=$dir/disambig_tid.int --self-loop-scale=$loopscale --reorder=true $model | \
     fstarcsort --sort_type=olabel | \
     fstconvert --fst_type=olabel_lookahead --save_relabel_opairs=${dir}/relabel \
      > $dir/HCLr.fst.$$ || exit 1;
  mv $dir/HCLr.fst.$$ $dir/HCLr.fst
fi

# Build Gr.fst, the grammar relabelled with the pairs in $dir/relabel.  Every
# pipeline aborts the script on failure so that a truncated temporary file is
# never renamed to Gr.fst; the trap removes the temporary file.
trap "rm -f $dir/Gr.fst.$$" EXIT HUP INT PIPE TERM
if [[ -z $arpa ]]; then
  # No ARPA file: relabel the existing $lang/G.fst.
  if [[ ! -s $dir/Gr.fst || $dir/Gr.fst -ot $lang/G.fst ]]; then
    gr=${lang}/G.fst
    if $remove_oov; then
      [ ! -f $lang/oov.int ] && \
        echo "$0: --remove-oov option: no file $lang/oov.int" && exit 1;
      fstrmsymbols --remove-arcs=true --apply-to-output=true $lang/oov.int $gr | \
        fstrelabel --relabel_ipairs=${dir}/relabel | \
        fstarcsort --sort_type=ilabel | \
        fstconvert --fst_type=const > ${dir}/Gr.fst.$$ || exit 1;
    else
      fstrelabel --relabel_ipairs=${dir}/relabel "$gr" | \
        fstarcsort --sort_type=ilabel | \
        fstconvert --fst_type=const > ${dir}/Gr.fst.$$ || exit 1;
    fi
    mv $dir/Gr.fst.$$ $dir/Gr.fst
    cp $lang/words.txt $dir/ || exit 1;
  fi
else
  # ARPA file given: compile it with ngramread against a relabelled copy of
  # the word symbol table.
  if [[ ! -s $dir/Gr.fst || $dir/Gr.fst -ot $arpa ]]; then
    # Opengrm builds acceptors, so we need to reorder words in symboltable
    utils/apply_map.pl --permissive -f 2 ${dir}/relabel < ${lang}/words.txt > ${dir}/words.txt || exit 1;
    gunzip -c $arpa | ngramread --OOV_symbol=`cat ${lang}/oov.txt` --symbols=${dir}/words.txt --ARPA | \
    fstarcsort --sort_type=ilabel | \
      fstconvert --fst_type=ngram > ${dir}/Gr.fst.$$ || exit 1;
    mv $dir/Gr.fst.$$ $dir/Gr.fst
  fi
fi

# With --compose-graph, also build a statically composed HCLG.fst from
# HCLr.fst and Gr.fst.
if $compose_graph; then
  trap "rm -f $dir/HCLG.fst.$$" EXIT HUP INT PIPE TERM
  if [[ ! -s $dir/HCLG.fst || $dir/HCLG.fst -ot $dir/HCLr.fst \
        || $dir/HCLG.fst -ot $dir/Gr.fst ]]; then
    fstcompose ${dir}/HCLr.fst ${dir}/Gr.fst | \
    fstrmsymbols $dir/disambig_tid.int  | \
    fstconvert --fst_type=const > $dir/HCLG.fst.$$ || exit 1;
    mv $dir/HCLG.fst.$$ $dir/HCLG.fst
    if [ $tscale == 1.0 -a $loopscale == 1.0 ]; then
      # No point doing this test if transition-scale not 1, as it is bound to fail.
      fstisstochastic $dir/HCLG.fst || echo "[info]: final HCLG is not stochastic."
    fi
  fi

  # note: the empty FST has 66 bytes.  this check is for whether the final FST
  # is the empty file or is the empty FST.
  # (head -c 67 yields 67 bytes only when the file is longer than 66 bytes.)
  if ! [ $(head -c 67 $dir/HCLG.fst | wc -c) -eq 67 ]; then
    echo "$0: it looks like the result in $dir/HCLG.fst is empty"
    exit 1
  fi
fi

# keep a copy of the lexicon and a list of silence phones with HCLG...
# this means we can decode without reference to the $lang directory.

# silence.csl is mandatory (the script exits if the copy fails); errors from
# the other copies are discarded.
mkdir -p $dir/phones
cp $lang/phones/word_boundary.* $dir/phones/ 2>/dev/null # might be needed for ctm scoring,
cp $lang/phones/align_lexicon.* $dir/phones/ 2>/dev/null # might be needed for ctm scoring,
cp $lang/phones/optional_silence.* $dir/phones/ 2>/dev/null # might be needed for analyzing alignments.
    # but ignore the error if it's not there.


cp $lang/phones/disambig.{txt,int} $dir/phones/ 2> /dev/null
cp $lang/phones/silence.csl $dir/phones/ || exit 1;
cp $lang/phones.txt $dir/ 2> /dev/null # ignore the error if it's not there.

# Store the last field of the "pdfs" line of the am-info output.
am-info --print-args=false $model | grep pdfs | awk '{print $NF}' > $dir/num_pdfs


================================================
FILE: egs/utils/nnet/gen_dct_mat.py
================================================
#!/usr/bin/env python

# Copyright 2012  Brno University of Technology (author: Karel Vesely)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# ./gen_dct_mat.py
# script generates matrix with DCT transform, which is sparse
# and takes into account that data-layout is along frequency axis,
# while DCT is done along temporal axis.

from __future__ import division
from __future__ import print_function
from math import *
import sys


from optparse import OptionParser

def print_on_same_line(text):
    """Write *text* to stdout followed by one space and no newline."""
    print('{0} '.format(text), end='')

# Command-line interface; all three options are required.
parser = OptionParser()
parser.add_option('--fea-dim', dest='dim', help='feature dimension')
parser.add_option('--splice', dest='splice', help='applied splice value')
parser.add_option('--dct-basis', dest='dct_basis', help='number of DCT basis')
(options, args) = parser.parse_args()

# Every option is converted with int() below; int(None) would die with an
# unhelpful TypeError, so show the help text unless all of them were given.
if options.dim is None or options.splice is None or options.dct_basis is None:
    parser.print_help()
    sys.exit(1)

dim = int(options.dim)
splice = int(options.splice)
dct_basis = int(options.dct_basis)

# Number of spliced frames: the central frame plus 'splice' frames on each side.
timeContext = 2*splice+1


#generate the DCT matrix
M_PI = 3.1415926535897932384626433832795
M_SQRT2 = 1.4142135623730950488016887

# Normalisation factor shared by all coefficients (loop invariant).
scale = sqrt(2.0/timeContext)

#generate sparse DCT matrix
# One output row per (DCT basis k, feature index m).  Within a row the
# coefficient for frame n is preceded by m zeros (n == 0) or dim-1 zeros
# (n > 0), i.e. the non-zero entries are 'dim' columns apart, and the row is
# padded with dim-m-1 zeros after the last frame.
print('[')
for k in range(dct_basis):
    for m in range(dim):
        for n in range(timeContext):
            if n == 0:
                print_on_same_line(m*'0 ')
            else:
                print_on_same_line((dim-1)*'0 ')
            print_on_same_line(str(scale*cos(M_PI/timeContext*k*(n+0.5))))
            if n == timeContext-1:
                print_on_same_line((dim-m-1)*'0 ')
        print()
    print()

print(']')


================================================
FILE: egs/utils/nnet/gen_hamm_mat.py
================================================
#!/usr/bin/env python

# Copyright 2012  Brno University of Technology (author: Karel Vesely)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# ./gen_hamm_mat.py
# script generates diagonal matrix with hamming window values

from __future__ import division
from __future__ import print_function
from math import *
import sys


from optparse import OptionParser

def print_on_same_line(text):
    """Emit *text* on stdout, terminated by a single space rather than a newline."""
    print(str(text) + ' ', end='')

# Command-line interface; both options are required.
parser = OptionParser()
parser.add_option('--fea-dim', dest='dim', help='feature dimension')
parser.add_option('--splice', dest='splice', help='applied splice value')
(options, args) = parser.parse_args()

# Both options are converted with int() below; int(None) would die with an
# unhelpful TypeError, so show the help text unless both were given.
if options.dim is None or options.splice is None:
    parser.print_help()
    sys.exit(1)

dim = int(options.dim)
splice = int(options.splice)


#generate the diagonal matrix with hammings
M_2PI = 6.283185307179586476925286766559005

dim_mat = (2*splice+1)*dim
timeContext = 2*splice+1

# One Hamming weight per spliced frame, computed once instead of once per
# diagonal element.  A one-frame window (--splice 0) would divide by
# timeContext-1 == 0, so it gets the weight 1.0 (same convention as
# numpy.hamming(1)), which yields an identity matrix.
if timeContext > 1:
    window = [0.54 - 0.46*cos((M_2PI * i) / (timeContext-1))
              for i in range(timeContext)]
else:
    window = [1.0]

print('[')
for row in range(dim_mat):
    # All 'dim' features of the same frame share that frame's weight.
    weight = str(window[row // dim])
    for col in range(dim_mat):
        print_on_same_line(weight if col == row else '0')
    print()

print(']')


================================================
FILE: egs/utils/nnet/gen_splice.py
================================================
#!/usr/bin/env python

# Copyright 2012  Brno University of Technology (author: Karel Vesely)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# ./gen_splice.py
# generates <splice> Component

from __future__ import print_function
from math import *
import sys


from optparse import OptionParser

def print_on_same_line(text):
    """Print *text* and a trailing space, leaving the cursor on the same line."""
    print('%s ' % (text,), end='')

# Command-line interface; --fea-dim and --splice are required, --splice-step
# defaults to 1.
parser = OptionParser()
parser.add_option('--fea-dim', dest='dim_in', help='feature dimension')
parser.add_option('--splice', dest='splice', help='number of frames to concatenate with the central frame')
parser.add_option('--splice-step', dest='splice_step', help='splicing step (frames dont need to be consecutive, --splice 3 --splice-step 2 will select offsets: -6 -4 -2 0 2 4 6)', default='1' )
(options, args) = parser.parse_args()

# Both values are converted with int() below; int(None) would die with an
# unhelpful TypeError, so show the help text unless both were given.
if options.dim_in is None or options.splice is None:
    parser.print_help()
    sys.exit(1)

dim_in = int(options.dim_in)
splice = int(options.splice)
splice_step = int(options.splice_step)

# The output concatenates the central frame with 'splice' frames on each side.
dim_out = (2*splice+1)*dim_in

# Header line "<splice> <dim_out> <dim_in>", then the bracketed frame offsets.
print('<splice> {0} {1}'.format(dim_out, dim_in))
print_on_same_line('[')

# Frame offsets relative to the central frame, 'splice_step' apart.
splice_vec = list(range(-splice*splice_step, splice*splice_step+1, splice_step))
for offset in splice_vec:
    print_on_same_line(offset)

print(']')


================================================
FILE: egs/utils/nnet/make_blstm_proto.py
================================================
#!/usr/bin/env python

# Copyright 2015-2016  Brno University of Technology (author: Karel Vesely)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# Generated Nnet prototype, to be initialized by 'nnet-initialize'.

from __future__ import print_function
import sys

###
### Parse options
###
from optparse import OptionParser
usage="%prog [options] <feat-dim> <num-leaves> >nnet-proto-file"
parser = OptionParser(usage)
# Required,
parser.add_option('--cell-dim', dest='cell_dim', type='int', default=320,
                   help='Number of cells for one direction in BLSTM [default: %default]')
parser.add_option('--proj-dim', dest='proj_dim', type='int', default=200,
                   help='Dim reduction for one direction in BLSTM [default: %default]')
parser.add_option('--proj-dim-last', dest='proj_dim_last', type='int', default=320,
                   help='Dim reduction for one direction in BLSTM (last BLSTM component) [default: %default]')
parser.add_option('--num-layers', dest='num_layers', type='int', default=2,
                   help='Number of BLSTM layers [default: %default]')
# Optional (default == 'None'),
parser.add_option('--lstm-param-range', dest='lstm_param_range', type='float',
                   help='Range of initial BLSTM parameters [default: %default]')
parser.add_option('--param-stddev', dest='param_stddev', type='float',
                   help='Standard deviation for initial weights of Softmax layer [default: %default]')
parser.add_option('--cell-clip', dest='cell_clip', type='float',
                   help='Clipping cell values during propagation (per-frame) [default: %default]')
parser.add_option('--diff-clip', dest='diff_clip', type='float',
                   help='Clipping partial-derivatives during BPTT (per-frame) [default: %default]')
parser.add_option('--cell-diff-clip', dest='cell_diff_clip', type='float',
                   help='Clipping partial-derivatives of "cells" during BPTT (per-frame, those accumulated by CEC) [default: %default]')
parser.add_option('--grad-clip', dest='grad_clip', type='float',
                   help='Clipping the accumulated gradients (per-updates) [default: %default]')
#

# Exactly two positional arguments are expected: <feat-dim> and <num-leaves>.
(o,args) = parser.parse_args()
if len(args) != 2:
  parser.print_help()
  sys.exit(1)

(feat_dim, num_leaves) = [int(i) for i in args]

# Original prototype from Jiayu,
#<NnetProto>
#<Transmit> <InputDim> 40 <OutputDim> 40
#<LstmProjectedStreams> <InputDim> 40 <OutputDim> 512 <CellDim> 800 <ParamScale> 0.01 <NumStream> 4
#<AffineTransform> <InputDim> 512 <OutputDim> 8000 <BiasMean> 0.000000 <BiasRange> 0.000000 <ParamStddev> 0.04
#<Softmax> <InputDim> 8000 <OutputDim> 8000
#</NnetProto>

# Optional per-layer tokens; only options given on the command line are emitted.
lstm_extra_opts=""
if o.lstm_param_range is not None: lstm_extra_opts += "<ParamRange> %f "   % o.lstm_param_range
if o.cell_clip is not None:        lstm_extra_opts += "<CellClip> %f "     % o.cell_clip
if o.diff_clip is not None:        lstm_extra_opts += "<DiffClip> %f "     % o.diff_clip
if o.cell_diff_clip is not None:   lstm_extra_opts += "<CellDiffClip> %f " % o.cell_diff_clip
if o.grad_clip is not None:        lstm_extra_opts += "<GradClip> %f "     % o.grad_clip

softmax_affine_opts=""
if o.param_stddev is not None:     softmax_affine_opts += "<ParamStddev> %f " % o.param_stddev

# The option strings are appended directly after the last value of each
# component line, so a non-empty string needs a leading space; otherwise the
# output reads e.g. "<CellDim> 320<ParamRange> ...".  Empty strings stay empty,
# keeping the default output unchanged.
if lstm_extra_opts:     lstm_extra_opts = " " + lstm_extra_opts
if softmax_affine_opts: softmax_affine_opts = " " + softmax_affine_opts

# The BLSTM layers (the output dim is 2x the projection dim: two directions),
blstm_fmt = "<BlstmProjected> <InputDim> %d <OutputDim> %d <CellDim> %s"
if o.num_layers == 1:
  # Single BLSTM,
  print(blstm_fmt % (feat_dim, 2*o.proj_dim_last, o.cell_dim) + lstm_extra_opts)
else:
  # >1 BLSTM: first layer, (num_layers - 2) inner layers, last layer,
  print(blstm_fmt % (feat_dim, 2*o.proj_dim, o.cell_dim) + lstm_extra_opts)
  for l in range(o.num_layers - 2):
    print(blstm_fmt % (2*o.proj_dim, 2*o.proj_dim, o.cell_dim) + lstm_extra_opts)
  print(blstm_fmt % (2*o.proj_dim, 2*o.proj_dim_last, o.cell_dim) + lstm_extra_opts)

# Adding <Tanh> for more stability,
print("<Tanh> <InputDim> %d <OutputDim> %d" % (2*o.proj_dim_last, 2*o.proj_dim_last))

# Softmax layer,
print("<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> 0.0 <BiasRange> 0.0" % (2*o.proj_dim_last, num_leaves) + softmax_affine_opts)
print("<Softmax> <InputDim> %d <OutputDim> %d" % (num_leaves, num_leaves))


================================================
FILE: egs/utils/nnet/make_cnn_proto.py
================================================
#!/usr/bin/env python

# Copyright 2014  Brno University of Technology (author: Katerina Zmolikova, Karel Vesely)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# Generated Nnet prototype, to be initialized by 'nnet-initialize'.

from __future__ import division
from __future__ import print_function
import math, random, sys
from optparse import OptionParser

###
### Parse options
###
# The script takes exactly one positional argument (see the len(args) check
# below), so the usage text advertises only <feat-dim>.
usage="%prog [options] <feat-dim> >nnet-proto-file"
parser = OptionParser(usage)

parser.add_option('--activation-type', dest='activation_type',
                   help='Select type of activation function : (<Sigmoid>|<Tanh>) [default: %default]',
                   default='<Sigmoid>', type='string')
parser.add_option('--num-filters1', dest='num_filters1',
                   help='Number of filters in first convolutional layer [default: %default]',
                   default=128, type='int')
parser.add_option('--num-filters2', dest='num_filters2',
                   help='Number of filters in second convolutional layer [default: %default]',
                   default=256, type='int')
parser.add_option('--pool-size', dest='pool_size',
                   help='Size of pooling [default: %default]',
                   default=3, type='int')
parser.add_option('--pool-step', dest='pool_step',
                   help='Step of pooling [default: %default]',
                   default=3, type='int')
parser.add_option('--pool-type', dest='pool_type',
                   help='Type of pooling (Max || Average) [default: %default]',
                   default='Max', type='string')
parser.add_option('--pitch-dim', dest='pitch_dim',
                   help='Number of features representing pitch [default: %default]',
                   default=0, type='int')
parser.add_option('--delta-order', dest='delta_order',
                   help='Order of delta features [default: %default]',
                   default=2, type='int')
parser.add_option('--splice', dest='splice',
                   help='Length of splice [default: %default]',
                   default=5, type='int')
parser.add_option('--patch-step1', dest='patch_step1',
                   help='Patch step of first convolutional layer [default: %default]',
                   default=1, type='int')
parser.add_option('--patch-dim1', dest='patch_dim1',
                   help='Dim of convolutional kernel in 1st layer (freq. axis) [default: %default]',
                   default=8, type='int')
parser.add_option('--patch-dim2', dest='patch_dim2',
                   help='Dim of convolutional kernel in 2nd layer (freq. axis) [default: %default]',
                   default=4, type='int')
parser.add_option('--dir', dest='protodir',
                   help='Directory, where network prototypes will be saved [default: %default]',
                   default='.', type='string')
parser.add_option('--num-pitch-neurons', dest='num_pitch_neurons',
                   help='Number of neurons in layers processing pitch features [default: %default]',
                   default=200, type='int')

(o,args) = parser.parse_args()
if len(args) != 1:
  parser.print_help()
  sys.exit(1)

feat_dim = int(args[0])
### End parse options

# All dimensions below are element counts, so they must stay integers.  The
# file enables true division, hence '//' is used throughout: with '/' the
# values become floats, which gives wrong (non-floored) sizes when the
# dimensions are not exact multiples and makes range() fail in the pitch branch.
feat_raw_dim = feat_dim // (o.delta_order+1) // (o.splice*2+1) - o.pitch_dim # we need number of feats without deltas and splice and pitch

# Check
assert(feat_dim > 0)
assert(o.pool_type == 'Max' or o.pool_type == 'Average')

###
### Print prototype of the network
###

# Begin the prototype
print("<NnetProto>")

# Convolutional part of network
num_patch1 = 1 + (feat_raw_dim - o.patch_dim1) // o.patch_step1
num_pool = 1 + (num_patch1 - o.pool_size) // o.pool_step
patch_dim2 = o.patch_dim2
patch_step2 = o.patch_step1
patch_stride2 = num_pool # same as layer1 outputs
num_patch2 = 1 + (num_pool - patch_dim2) // patch_step2

inputdim_of_cnn = feat_dim
outputdim_of_cnn = o.num_filters2*num_patch2

# Prototype text of the convolutional stack:
# conv1 -> pooling -> rescale/shift -> activation -> conv2 -> rescale/shift -> activation.
convolution_proto = ''

convolution_proto += "<ConvolutionalComponent> <InputDim> %d <OutputDim> %d <PatchDim> %d <PatchStep> %d <PatchStride> %d <BiasMean> %f <BiasRange> %f <ParamStddev> %f <MaxNorm> %f\n" % \
    (feat_raw_dim * (o.delta_order+1) * (o.splice*2+1), o.num_filters1 * num_patch1, o.patch_dim1, o.patch_step1, feat_raw_dim, -1.0, 2.0, 0.02, 30) #~8x11x3 = 264 inputs
convolution_proto += "<%sPoolingComponent> <InputDim> %d <OutputDim> %d <PoolSize> %d <PoolStep> %d <PoolStride> %d\n" % \
    (o.pool_type, o.num_filters1*num_patch1, o.num_filters1*num_pool, o.pool_size, o.pool_step, o.num_filters1)
convolution_proto += "<Rescale> <InputDim> %d <OutputDim> %d <InitParam> %f\n" % \
    (o.num_filters1*num_pool, o.num_filters1*num_pool, 1)
convolution_proto += "<AddShift> <InputDim> %d <OutputDim> %d <InitParam> %f\n" % \
    (o.num_filters1*num_pool, o.num_filters1*num_pool, 0)
convolution_proto += "%s <InputDim> %d <OutputDim> %d\n" % \
    (o.activation_type, o.num_filters1*num_pool, o.num_filters1*num_pool)
convolution_proto += "<ConvolutionalComponent> <InputDim> %d <OutputDim> %d <PatchDim> %d <PatchStep> %d <PatchStride> %d <BiasMean> %f <BiasRange> %f <ParamStddev> %f <MaxNorm> %f\n" % \
    (o.num_filters1*num_pool, outputdim_of_cnn, patch_dim2, patch_step2, patch_stride2, -2.0, 4.0, 0.1, 50) #~4x128 = 512 inputs
convolution_proto += "<Rescale> <InputDim> %d <OutputDim> %d <InitParam> %f\n" % \
    (outputdim_of_cnn, outputdim_of_cnn, 1)
convolution_proto += "<AddShift> <InputDim> %d <OutputDim> %d <InitParam> %f\n" % \
    (outputdim_of_cnn, outputdim_of_cnn, 0)
convolution_proto += "%s <InputDim> %d <OutputDim> %d\n" % \
    (o.activation_type, outputdim_of_cnn, outputdim_of_cnn)

if (o.pitch_dim > 0):
  # convolutional part, written to its own nested prototype file
  with open('%s/nnet.proto.convolution' % o.protodir, 'w') as f_conv:
    f_conv.write('<NnetProto>\n')
    f_conv.write(convolution_proto)
    f_conv.write('</NnetProto>\n')

  # pitch part: two affine + activation layers, written to its own nested prototype file
  with open('%s/nnet.proto.pitch' % o.protodir, 'w') as f_pitch:
    f_pitch.write('<NnetProto>\n')
    f_pitch.write('<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> %f <BiasRange> %f <ParamStddev> %f\n' % \
        ((o.pitch_dim * (o.delta_order+1) * (o.splice*2+1)), o.num_pitch_neurons, -2, 4, 0.02))
    f_pitch.write('%s <InputDim> %d <OutputDim> %d\n' % \
        (o.activation_type, o.num_pitch_neurons, o.num_pitch_neurons))
    f_pitch.write('<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> %f <BiasRange> %f <ParamStddev> %f\n' % \
        (o.num_pitch_neurons, o.num_pitch_neurons, -2, 4, 0.1))
    f_pitch.write('%s <InputDim> %d <OutputDim> %d\n' % \
        (o.activation_type, o.num_pitch_neurons, o.num_pitch_neurons))
    f_pitch.write('</NnetProto>\n')

  # parallel part: the <Copy> reorders the input so that all non-pitch ranges
  # come first, followed by all pitch ranges (1-based "start:step:end" ranges).
  vector = ''
  for i in range(1, inputdim_of_cnn, feat_raw_dim + o.pitch_dim):
    vector += '%d:1:%d ' % (i, i + feat_raw_dim - 1)
  for i in range(feat_raw_dim+1, inputdim_of_cnn + 1, feat_raw_dim + o.pitch_dim):
    vector += '%d:1:%d ' % (i, i + o.pitch_dim - 1)
  print('<Copy> <InputDim> %d <OutputDim> %d <BuildVector> %s </BuildVector>' % \
      (inputdim_of_cnn, inputdim_of_cnn, vector))
  print('<ParallelComponent> <InputDim> %d <OutputDim> %d <NestedNnetProto> %s %s </NestedNnetProto>' % \
      (inputdim_of_cnn, o.num_pitch_neurons + outputdim_of_cnn, '%s/nnet.proto.convolution' % o.protodir, '%s/nnet.proto.pitch' % o.protodir))

else: # no pitch
  print(convolution_proto)

# We are done!
sys.exit(0)


================================================
FILE: egs/utils/nnet/make_lstm_proto.py
================================================
#!/usr/bin/env python

# Copyright 2015-2016  Brno University of Technology (author: Karel Vesely)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# Generated Nnet prototype, to be initialized by 'nnet-initialize'.

from __future__ import print_function
import sys

###
### Parse options
###
from optparse import OptionParser
usage="%prog [options] <feat-dim> <num-leaves> >nnet-proto-file"
parser = OptionParser(usage)
# Required,
parser.add_option('--cell-dim', dest='cell_dim', type='int', default=320,
                   help='Number of cells for one direction in LSTM [default: %default]')
parser.add_option('--proj-dim', dest='proj_dim', type='int', default=400,
                   help='Number of LSTM recurrent units [default: %default]')
parser.add_option('--num-layers', dest='num_layers', type='int', default=2,
                   help='Number of LSTM layers [default: %default]')
# Optional (default == 'None'),
parser.add_option('--lstm-param-range', dest='lstm_param_range', type='float',
                   help='Range of initial LSTM parameters [default: %default]')
parser.add_option('--param-stddev', dest='param_stddev', type='float',
                   help='Standard deviation for initial weights of Softmax layer [default: %default]')
parser.add_option('--cell-clip', dest='cell_clip', type='float',
                   help='Clipping cell values during propagation (per-frame) [default: %default]')
parser.add_option('--diff-clip', dest='diff_clip', type='float',
                   help='Clipping partial-derivatives during BPTT (per-frame) [default: %default]')
parser.add_option('--cell-diff-clip', dest='cell_diff_clip', type='float',
                   help='Clipping partial-derivatives of "cells" during BPTT (per-frame, those accumulated by CEC) [default: %default]')
parser.add_option('--grad-clip', dest='grad_clip', type='float',
                   help='Clipping the accumulated gradients (per-updates) [default: %default]')
#

# Exactly two positional arguments are expected: <feat-dim> and <num-leaves>.
(o,args) = parser.parse_args()
if len(args) != 2:
  parser.print_help()
  sys.exit(1)

(feat_dim, num_leaves) = [int(i) for i in args]

# Original prototype from Jiayu,
#<NnetProto>
#<Transmit> <InputDim> 40 <OutputDim> 40
#<LstmProjectedStreams> <InputDim> 40 <OutputDim> 512 <CellDim> 800 <ParamScale> 0.01 <NumStream> 4
#<AffineTransform> <InputDim> 512 <OutputDim> 8000 <BiasMean> 0.000000 <BiasRange> 0.000000 <ParamStddev> 0.04
#<Softmax> <InputDim> 8000 <OutputDim> 8000
#</NnetProto>

# Optional per-layer tokens; only options given on the command line are emitted.
lstm_extra_opts=""
if o.lstm_param_range is not None: lstm_extra_opts += "<ParamRange> %f "   % o.lstm_param_range
if o.cell_clip is not None:        lstm_extra_opts += "<CellClip> %f "     % o.cell_clip
if o.diff_clip is not None:        lstm_extra_opts += "<DiffClip> %f "     % o.diff_clip
if o.cell_diff_clip is not None:   lstm_extra_opts += "<CellDiffClip> %f " % o.cell_diff_clip
if o.grad_clip is not None:        lstm_extra_opts += "<GradClip> %f "     % o.grad_clip

softmax_affine_opts=""
if o.param_stddev is not None:     softmax_affine_opts += "<ParamStddev> %f " % o.param_stddev

# The option strings are appended directly after the last value of each
# component line, so a non-empty string needs a leading space; otherwise the
# output reads e.g. "<CellDim> 320<ParamRange> ...".  Empty strings stay empty,
# keeping the default output unchanged.
if lstm_extra_opts:     lstm_extra_opts = " " + lstm_extra_opts
if softmax_affine_opts: softmax_affine_opts = " " + softmax_affine_opts

# The LSTM layers: the first one reads the features, the remaining
# (num_layers - 1) layers read the previous projection,
lstm_fmt = "<LstmProjected> <InputDim> %d <OutputDim> %d <CellDim> %s"
print(lstm_fmt % (feat_dim, o.proj_dim, o.cell_dim) + lstm_extra_opts)
for l in range(o.num_layers - 1):
  print(lstm_fmt % (o.proj_dim, o.proj_dim, o.cell_dim) + lstm_extra_opts)

# Adding <Tanh> for more stability,
print("<Tanh> <InputDim> %d <OutputDim> %d" % (o.proj_dim, o.proj_dim))

# Softmax layer,
print("<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> 0.0 <BiasRange> 0.0" % (o.proj_dim, num_leaves) + softmax_affine_opts)
print("<Softmax> <InputDim> %d <OutputDim> %d" % (num_leaves, num_leaves))


================================================
FILE: egs/utils/nnet/make_nnet_proto.py
================================================
#!/usr/bin/env python

# Copyright 2014-2016  Brno University of Technology (author: Karel Vesely)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# Generated Nnet prototype, to be initialized by 'nnet-initialize'.

from __future__ import division
from __future__ import print_function
import math, random, sys, re

###
### Parse options
###
from optparse import OptionParser
usage = "%prog [options] <feat-dim> <num-leaves> <num-hid-layers> <num-hid-neurons> >nnet-proto-file"
parser = OptionParser(usage)

# --- Output layer (softmax) ---
parser.add_option(
    '--no-softmax', dest='with_softmax', action='store_false', default=True,
    help='Do not put <SoftMax> in the prototype [default: %default]')
parser.add_option(
    '--block-softmax-dims', dest='block_softmax_dims', type='string', default="",
    help='Generate <BlockSoftmax> with dims D1:D2:D3 [default: %default]')

# --- Hidden-layer activation ---
parser.add_option(
    '--activation-type', dest='activation_type', type='string', default='<Sigmoid>',
    help='Select type of activation function : (<Sigmoid>|<Tanh>|<ParametricRelu>) [default: %default]')
parser.add_option(
    '--activation-opts', dest='activation_opts', type='string', default='',
    help='Additional options for protoype of activation function [default: %default]')

# --- Affine transforms (initialisation and constraints) ---
parser.add_option(
    '--hid-bias-mean', dest='hid_bias_mean', type='float', default=-2.0,
    help='Set bias for hidden activations [default: %default]')
parser.add_option(
    '--hid-bias-range', dest='hid_bias_range', type='float', default=4.0,
    help='Set bias range for hidden activations (+/- 1/2 range around mean) [default: %default]')
parser.add_option(
    '--param-stddev-factor', dest='param_stddev_factor', type='float', default=0.1,
    help='Factor to rescale Normal distriburtion for initalizing weight matrices [default: %default]')
parser.add_option(
    '--no-glorot-scaled-stddev', dest='with_glorot', action='store_false', default=True,
    help='Generate normalized weights according to X.Glorot paper, but mapping U->N with same variance (factor sqrt(x/(dim_in+dim_out)))')
parser.add_option(
    '--no-smaller-input-weights', dest='smaller_input_weights', action='store_false', default=True,
    help='Disable 1/12 reduction of stddef in input layer [default: %default]')
parser.add_option(
    '--no-bottleneck-trick', dest='bottleneck_trick', action='store_false', default=True,
    help='Disable smaller initial weights and learning rate around bottleneck')
parser.add_option(
    '--max-norm', dest='max_norm', type='float', default=0.0,
    help='Max radius of neuron-weights in L2 space (if longer weights get shrinked, not applied to last layer, 0.0 = disable) [default: %default]')
parser.add_option(
    '--affine-opts', dest='affine_opts', type='string', default='',
    help='Additional options for protoype of affine tranform [default: %default]')

# --- Network topology ---
parser.add_option(
    '--bottleneck-dim', dest='bottleneck_dim', type='int', default=0,
    help='Make bottleneck network with desired bn-dim (0 = no bottleneck) [default: %default]')
parser.add_option(
    '--with-dropout', dest='with_dropout', action='store_true', default=False,
    help='Add <Dropout> after the non-linearity of hidden layer.')
parser.add_option(
    '--dropout-opts', dest='dropout_opts', type='string', default='',
    help='Extra options for dropout [default: %default]')


# Four positional arguments are mandatory; anything else prints the help.
o, args = parser.parse_args()
if len(args) != 4:
  parser.print_help()
  sys.exit(1)

# Multi-word option values are passed with '_' between the words;
# turn every underscore back into a space.
o.activation_opts = " ".join(o.activation_opts.split("_"))
o.affine_opts = " ".join(o.affine_opts.split("_"))
o.dropout_opts = " ".join(o.dropout_opts.split("_"))

feat_dim, num_leaves, num_hid_layers, num_hid_neurons = map(int, args)
### End parse options


# Sanity checks on the positional arguments
assert feat_dim > 0
assert num_leaves > 0
assert num_hid_layers >= 0
assert num_hid_neurons > 0
if o.block_softmax_dims:
  # The block dims (separated by ',' or ':') have to add up to the number of leaves.
  assert num_leaves == sum(int(d) for d in re.split("[,:]", o.block_softmax_dims))

# Optional scaling of the initial parameter stddev,
def Glorot(dim1, dim2):
  """Return the stddev scale for a layer with dimensions dim1 and dim2.

  The scale is 1.0 unless the 'with_glorot' option is set, in which case it
  is 35.0 * sqrt(2 / (dim1 + dim2)).
  """
  if not o.with_glorot:
    return 1.0
  # 35.0 = magic number, gives ~1.0 in inner layers for hid-dim 1024dim,
  return 35.0 * math.sqrt(2.0 / (dim1 + dim2))


###
### Print prototype of the network
###

# NO HIDDEN LAYER, ADDING BOTTLENECK!
# No hidden layer while adding bottleneck means:
# - add bottleneck layer + hidden layer + output layer
# The prototype is printed to stdout, one component per line, and the script
# exits right after (the code further below is not reached on this path).
if num_hid_layers == 0 and o.bottleneck_dim != 0:
  assert(o.bottleneck_dim > 0)
  assert(num_hid_layers == 0)
  if o.bottleneck_trick:
    # Bottleneck projection: feat_dim -> bottleneck_dim,
    # 25% smaller stddev -> small bottleneck range, 10x smaller learning rate
    print("<LinearTransform> <InputDim> %d <OutputDim> %d <ParamStddev> %f <LearnRateCoef> %f" % \
     (feat_dim, o.bottleneck_dim, \
      (o.param_stddev_factor * Glorot(feat_dim, o.bottleneck_dim) * 0.75 ), 0.1))
    # Expansion: bottleneck_dim -> num_hid_neurons,
    # 25% smaller stddev -> smaller gradient in prev. layer, 10x smaller learning rate for weights & biases
    print("<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> %f <BiasRange> %f <ParamStddev> %f <LearnRateCoef> %f <BiasLearnRateCoef> %f <MaxNorm> %f" % \
     (o.bottleneck_dim, num_hid_neurons, o.hid_bias_mean, o.hid_bias_range, \
      (o.param_stddev_factor * Glorot(o.bottleneck_dim, num_hid_neurons) * 0.75 ), 0.1, 0.1, o.max_norm))
  else:
    # Same two layers, without the reduced stddev and learning-rate coefficients,
    print("<LinearTransform> <InputDim> %d <OutputDim> %d <ParamStddev> %f" % \
     (feat_dim, o.bottleneck_dim, \
      (o.param_stddev_factor * Glorot(feat_dim, o.bottleneck_dim))))
    print("<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> %f <BiasRange> %f <ParamStddev> %f <MaxNorm> %f" % \
     (o.bottleneck_dim, num_hid_neurons, o.hid_bias_mean, o.hid_bias_range, \
      (o.param_stddev_factor * Glorot(o.bottleneck_dim, num_hid_neurons)), o.max_norm))
  # NOTE(review): on this path 'o.affine_opts' is not appended to the <AffineTransform>
  # lines and no <Dropout> is printed, unlike the usual DNN path further below - confirm this is intended.
  print("%s <InputDim> %d <OutputDim> %d %s" % (o.activation_type, num_hid_neurons, num_hid_neurons, o.activation_opts)) # Non-linearity
  # Last AffineTransform (10x smaller learning rate on bias)
  print("<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> %f <BiasRange> %f <ParamStddev> %f <LearnRateCoef> %f <BiasLearnRateCoef> %f" % \
   (num_hid_neurons, num_leaves, 0.0, 0.0, \
    (o.param_stddev_factor * Glorot(num_hid_neurons, num_leaves)), 1.0, 0.1))
  # Optionally append softmax (a single <Softmax>, or <BlockSoftmax> when block dims were given)
  if o.with_softmax:
    if o.block_softmax_dims == "":
      print("<Softmax> <InputDim> %d <OutputDim> %d" % (num_leaves, num_leaves))
    else:
      print("<BlockSoftmax> <InputDim> %d <OutputDim> %d <BlockDims> %s" % (num_leaves, num_leaves, o.block_softmax_dims))
  print("</NnetProto>")
  # We are done!
  sys.exit(0)

# NO HIDDEN LAYERS!
# Add only last layer (logistic regression)
# Reached only when no bottleneck was requested (the bottleneck case exits above).
if num_hid_layers == 0:
  # Single affine layer feat_dim -> num_leaves, biases initialized with mean 0.0 and range 0.0,
  print("<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> %f <BiasRange> %f <ParamStddev> %f" % \
        (feat_dim, num_leaves, 0.0, 0.0, (o.param_stddev_factor * Glorot(feat_dim, num_leaves))))
  # Optionally append softmax (a single <Softmax>, or <BlockSoftmax> when block dims were given)
  if o.with_softmax:
    if o.block_softmax_dims == "":
      print("<Softmax> <InputDim> %d <OutputDim> %d" % (num_leaves, num_leaves))
    else:
      print("<BlockSoftmax> <InputDim> %d <OutputDim> %d <BlockDims> %s" % (num_leaves, num_leaves, o.block_softmax_dims))
  print("</NnetProto>")
  # We are done!
  sys.exit(0)


# THE USUAL DNN PROTOTYPE STARTS HERE!
# Assuming we have >0 hidden layers,
# (both zero-hidden-layer cases have already exited above).
assert(num_hid_layers > 0)

# Begin the prototype,
# First AffineTransform: feat_dim -> num_hid_neurons,
print("<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> %f <BiasRange> %f <ParamStddev> %f <MaxNorm> %f %s" % \
      (feat_dim, num_hid_neurons, o.hid_bias_mean, o.hid_bias_range, \
       (o.param_stddev_factor * Glorot(feat_dim, num_hid_neurons) * \
        (math.sqrt(1.0/12.0) if o.smaller_input_weights else 1.0)), o.max_norm, o.affine_opts))
      # Note.: compensating dynamic range mismatch between input features and Sigmoid-hidden layers,
      # i.e. mapping the std-dev of N(0,1) (input features) to std-dev of U[0,1] (sigmoid-outputs).
      # This is done by multiplying with stddev(U[0,1]) = sqrt(1/12).
      # The stddev of weights is consequently reduced with scale 0.29,
# Non-linearity of the first hidden layer, optionally followed by dropout,
print("%s <InputDim> %d <OutputDim> %d %s" % (o.activation_type, num_hid_neurons, num_hid_neurons, o.activation_opts))
if o.with_dropout:
  print("<Dropout> <InputDim> %d <OutputDim> %d %s" % (num_hid_neurons, num_hid_neurons, o.dropout_opts))


# Internal AffineTransforms,
# (num_hid_layers-1 further hidden layers, each num_hid_neurons -> num_hid_neurons)
for i in range(num_hid_layers-1):
  print("<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> %f <BiasRange> %f <ParamStddev> %f <MaxNorm> %f %s" % \
        (num_hid_neurons, num_hid_neurons, o.hid_bias_mean, o.hid_bias_range, \
         (o.param_stddev_factor * Glorot(num_hid_neurons, num_hid_neurons)), o.max_norm, o.affine_opts))
  print("%s <InputDim> %d <OutputDim> %d %s" % (o.activation_type, num_hid_neurons, num_hid_neurons, o.activation_opts))
  if o.with_dropout:
    print("<Dropout> <InputDim> %d <OutputDim> %d %s" % (num_hid_neurons, num_hid_neurons, o.dropout_opts))

# Optionally add bottleneck,
# (a linear projection to bottleneck_dim, followed by one more hidden layer of num_hid_neurons)
if o.bottleneck_dim != 0:
  assert(o.bottleneck_dim > 0)
  if o.bottleneck_trick:
    # 25% smaller stddev -> small bottleneck range, 10x smaller learning rate
    print("<LinearTransform> <InputDim> %d <OutputDim> %d <ParamStddev> %f <LearnRateCoef> %f" % \
     (num_hid_neurons, o.bottleneck_dim, \
      (o.param_stddev_factor * Glorot(num_hid_neurons, o.bottleneck_dim) * 0.75 ), 0.1))
    # 25% smaller stddev -> smaller gradient in prev. layer, 10x smaller learning rate for weights & biases
    print("<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> %f <BiasRange> %f <ParamStddev> %f <LearnRateCoef> %f <BiasLearnRateCoef> %f <MaxNorm> %f %s" % \
     (o.bottleneck_dim, num_hid_neurons, o.hid_bias_mean, o.hid_bias_range, \
      (o.param_stddev_factor * Glorot(o.bottleneck_dim, num_hid_neurons) * 0.75 ), 0.1, 0.1, o.max_norm, o.affine_opts))
  else:
    # Same learning-rate and stddev-formula everywhere,
    print("<LinearTransform> <InputDim> %d <OutputDim> %d <ParamStddev> %f" % \
     (num_hid_neurons, o.bottleneck_dim, \
      (o.param_stddev_factor * Glorot(num_hid_neurons, o.bottleneck_dim))))
    print("<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> %f <BiasRange> %f <ParamStddev> %f <MaxNorm> %f %s" % \
     (o.bottleneck_dim, num_hid_neurons, o.hid_bias_mean, o.hid_bias_range, \
      (o.param_stddev_factor * Glorot(o.bottleneck_dim, num_hid_neurons)), o.max_norm, o.affine_opts))
  # Non-linearity after the bottleneck expansion, optionally followed by dropout,
  print("%s <InputDim> %d <OutputDim> %d %s" % (o.activation_type, num_hid_neurons, num_hid_neurons, o.activation_opts))
  if o.with_dropout:
    print("<Dropout> <InputDim> %d <OutputDim> %d %s" % (num_hid_neurons, num_hid_neurons, o.dropout_opts))

# Last AffineTransform (10x smaller learning rate on bias)
print("<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> %f <BiasRange> %f <ParamStddev> %f <LearnRateCoef> %f <BiasLearnRateCoef> %f" % \
      (num_hid_neurons, num_leaves, 0.0, 0.0, \
       (o.param_stddev_factor * Glorot(num_hid_neurons, num_leaves)), 1.0, 0.1))

# Optionally append softmax (a single <Softmax>, or <BlockSoftmax> when block dims were given)
if o.with_softmax:
  if o.block_softmax_dims == "":
    print("<Softmax> <InputDim> %d <OutputDim> %d" % (num_leaves, num_leaves))
  else:
    print("<BlockSoftmax> <InputDim> %d <OutputDim> %d <BlockDims> %s" % (num_leaves, num_leaves, o.block_softmax_dims))

# NOTE(review): unlike the two zero-hidden-layer paths above, this path does not
# print a closing '</NnetProto>' line - confirm which form the consumer expects.
# We are done!
sys.exit(0)


================================================
FILE: egs/utils/nnet/subset_data_tr_cv.sh
================================================
#!/usr/bin/env bash
#
# Copyright 2017  Brno University of Technology (Author: Karel Vesely);
# Apache 2.0

# This script splits the 'data' directory into two parts:
# - training set with 90% of speakers
# - held-out set with 10% of speakers (cv)
# (to be used in frame cross-entropy training of 'nnet1' models),

# The script also accepts a list of held-out set speakers by '--cv-spk-list'
# (with perturbed data, we pass the list of speakers externally).
# The remaining set of speakers is the training set.

# Defaults, can be overridden from the command line via 'utils/parse_options.sh',
cv_spk_percent=10
cv_spk_list= # To be used with perturbed data,
seed=777
cv_utt_percent= # ignored (compatibility),
. utils/parse_options.sh

if [ $# != 3 ]; then
  echo "Usage: $0 [opts] <src-data> <train-data> <cv-data>"
  echo "  --cv-spk-percent N (default 10)"
  echo "  --cv-spk-list <file> (a pre-defined list with cv speakers)"
  exit 1;
fi

set -euo pipefail

src_data=$1
trn_data=$2
cv_data=$3

if [ ! -r "$src_data/spk2utt" ]; then
  echo "Missing '$src_data/spk2utt'. Error!"
  exit 1
fi

# Scratch directory for the speaker lists; it is removed when the script exits
# (both on success and on error). '${USER:-user}' keeps 'set -u' from aborting
# the script in environments where USER is not set.
tmp=$(mktemp -d "/tmp/${USER:-user}_XXXXX")
trap 'rm -rf "$tmp"' EXIT

if [ -z "$cv_spk_list" ]; then
  # Select 'cv_spk_percent' speakers randomly,
  # (the shuffled speaker list is split into a cv head and a training tail).
  cat "$src_data/spk2utt" | awk '{ print $1; }' | utils/shuffle_list.pl --srand "$seed" >"$tmp/speakers"
  n_spk=$(wc -l <"$tmp/speakers")
  n_spk_cv=$(perl -e "print int($cv_spk_percent * $n_spk / 100); ")
  #
  head -n "$n_spk_cv" "$tmp/speakers" >"$tmp/speakers_cv"
  tail -n+$((n_spk_cv+1)) "$tmp/speakers" >"$tmp/speakers_trn"
else
  # Use pre-defined list of speakers,
  # (the training speakers are those from 'spk2utt' that are not in the cv list).
  cp "$cv_spk_list" "$tmp/speakers_cv"
  join -v2 <(sort "$cv_spk_list") <(awk '{ print $1; }' <"$src_data/spk2utt" | sort) >"$tmp/speakers_trn"
fi

# Sanity checks,
# ('wc -l' reads from stdin everywhere, so that only the count is printed).
n_spk=$(wc -l <"$src_data/spk2utt")
echo "Speakers, src=$n_spk, trn=$(wc -l <"$tmp/speakers_trn"), cv=$(wc -l <"$tmp/speakers_cv")"
# The overlap is computed once and stored, so that printing its first lines
# cannot fail the script through a broken pipe under 'set -o pipefail'.
join <(sort "$tmp/speakers_trn") <(sort "$tmp/speakers_cv") >"$tmp/speakers_overlap"
if [ -s "$tmp/speakers_overlap" ]; then
  echo "WARNING, speaker overlap detected!"
  head "$tmp/speakers_overlap"
  echo '...'
fi

# Create new data dirs,
utils/data/subset_data_dir.sh --spk-list "$tmp/speakers_trn" "$src_data" "$trn_data"
utils/data/subset_data_dir.sh --spk-list "$tmp/speakers_cv" "$src_data" "$cv_data"


================================================
FILE: egs/utils/nnet-cpu/make_nnet_config.pl
================================================
#!/usr/bin/env perl
use warnings; #sed replacement for -w perl parameter
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# These options can be useful if we want to splice the input
# features across time.
$input_left_context = 0;
$input_right_context = 0;
$param_stddev_factor = 1.0;  # can be used to adjust initial variance
  # of parameters.
$initial_num_hidden_layers = -1; # if >= 0, the number of hidden layers
  # the model should start with, which may be less than the final number
  # (the final number is used to calculate the #neurons).
$single_layer_config = "";
$bias_stddev = 2.0;
$learning_rate = 0.001;
$nobias = "";  # set to "Nobias" by --nobias; appended to the "AffineComponent" name.

# Consume the leading options, in any order.  The loop stops at the first
# argument that is not a known option (i.e. where the positional arguments
# start) or when no arguments are left, so $ARGV[0] is never read while
# @ARGV is empty.
while (@ARGV > 0) {
  if ($ARGV[0] eq "--input-left-context") {
    $input_left_context = $ARGV[1];
    shift; shift;
  } elsif ($ARGV[0] eq "--input-right-context") {
    $input_right_context = $ARGV[1];
    shift; shift;
  } elsif ($ARGV[0] eq "--param-stddev-factor") {
    $param_stddev_factor = $ARGV[1];
    shift; shift;
  } elsif ($ARGV[0] eq "--bias-stddev") {
    $bias_stddev = $ARGV[1];
    shift; shift;
  } elsif ($ARGV[0] eq "--nobias") {
    $nobias = "Nobias";
    shift;
  } elsif ($ARGV[0] eq "--learning-rate") {
    $learning_rate = $ARGV[1];
    shift; shift;
  } elsif ($ARGV[0] eq "--initial-num-hidden-layers") {
    # This option takes two values: the layer count and a config filename.
    $initial_num_hidden_layers = $ARGV[1];
    $single_layer_config = $ARGV[2];
    shift; shift; shift;
  } else {
    last;
  }
}


# Exactly four positional arguments have to remain after option parsing.
if (@ARGV != 4) {
  print STDERR "Usage: make_nnet_config.pl  [options] <feat-dim> <num-leaves> <num-hidden-layers> <num-parameters>  >config-file
Options:
   --input-left-context <n>        #  #frames of left context for input features; default 0.
   --input-right-context <n>       #  #frames of right context for input features; default 0.
   --param-stddev-factor <f>       #  Factor which can be used to modify the standard deviation of
                                   #  randomly initialized features (default, 1.  Gets multiplied by
                                   #  1/sqrt of number of inputs).
   --bias-stddev <f>               #  Value of bias-stddev written for the hidden layers; default 2.0.
   --nobias                        #  Write AffineComponentNobias instead of AffineComponent.
   --initial-num-hidden-layers <n> <config-file>   #  If >0, number of hidden layers to initialize the network with.
                                   #  In this case, the positional parameter <num-hidden-layers> is only
                                   #  used to work out the number of units per hidden layer (based on
                                   #  parameter count), and we write to <config-file> the config corresponding
                                   #  to a single hidden layer.
   --learning-rate <f>             # Initial learning rate, default 0.001\n";
     exit(1);
}

($feat_dim, $num_leaves, $num_hidden_layers, $num_params) = @ARGV;
($input_left_context < 0) &&  die "Invalid input left context $input_left_context";
($input_right_context < 0) &&  die "Invalid input right context $input_right_context";
($feat_dim <= 0) &&  die "Invalid feature dimension $feat_dim";
($num_leaves <= 0) && die "Invalid number of leaves $num_leaves";
($num_hidden_layers <= 0) && die "Invalid number of hidden layers $num_hidden_layers";
# By default the network starts with the final number of hidden layers.
if ($initial_num_hidden_layers < 0) {
  $initial_num_hidden_layers = $num_hidden_layers;
}
if ($initial_num_hidden_layers > $num_hidden_layers) {
  print STDERR "Initial number of hidden layers is more than #hidden layers.\n" .
    "This does not really make sense but continuing anyway.\n";
}

$context_size = 1 + $input_left_context + $input_right_context;
# Lower bound on the parameter count; it guarantees that the hidden layer size
# computed below is at least 1.
($num_params < ($num_leaves + ($feat_dim * $context_size) + $num_hidden_layers + 1))
  && die "Invalid number of params $num_params";

## num_params = hidden_layer_size^2 * (num_hidden_layers-1)
##            + hidden_layer_size * (num_leaves + feat_dim * context_size)
## solve for hidden_layer_size = x.
## a x^2 + b x + c = 0, with
## a = num_hidden_layers - 1
## b = num_leaves + feat_dim * context_size
## c = -num_params

$a = $num_hidden_layers - 1;
$b = $num_leaves + $feat_dim * $context_size;
$c = -$num_params;

if ($a > 0) {
  # Positive root of the quadratic, rounded down.
  $hidden_layer_size =  int((-$b + sqrt($b*$b - 4*$a*$c)) / (2*$a));
} else {
  # Single hidden layer: the equation is linear.
  $hidden_layer_size = int(-$c/$b);
}


# Parameter count actually obtained with the rounded-down layer size; warn if
# it is more than 10% away from the requested count.
$actual_num_params = $hidden_layer_size * $hidden_layer_size * ($num_hidden_layers - 1)
                   + $hidden_layer_size * ($num_leaves + $feat_dim * $context_size);

if (abs($actual_num_params - $num_params) > 0.1 * $num_params) {
  print STDERR "Warning: make_nnet_config.pl: possible failure $actual_num_params != $num_params\n";
}

# Write the network config to stdout, one component per line.
if ($input_left_context + $input_right_context != 0) {
  # First component has to be splicing component...
  # Note: we might be interested in decorrelating this e.g. with
  # DCT layer at some point, but for now, splicing isn't seeming to be
  # that useful.
  print "SpliceComponent input-dim=$feat_dim left-context=$input_left_context right-context=$input_right_context\n";
}
$cur_input_dim = $feat_dim * (1 + $input_left_context + $input_right_context);

# Hidden layers: an affine component followed by a tanh, with the parameter
# stddev set to param_stddev_factor / sqrt(input dim of the layer).
for ($hidden_layer = 0; $hidden_layer < $initial_num_hidden_layers; $hidden_layer++) {
  $param_stddev = $param_stddev_factor * 1.0 / sqrt($cur_input_dim);
  print "AffineComponent$nobias input-dim=$cur_input_dim output-dim=$hidden_layer_size " .
    "learning-rate=$learning_rate param-stddev=$param_stddev bias-stddev=$bias_stddev\n";
  $cur_input_dim = $hidden_layer_size;
  print "TanhComponent dim=$cur_input_dim\n";
}

if ($single_layer_config ne "") {
  # Create a config file we'll use to add new hidden layers.
  # Three-argument open: the filename is used literally, whatever characters
  # it starts or ends with.
  open(my $single_layer_fh, ">", $single_layer_config)
    || die "Error opening $single_layer_config for output: $!";
  $param_stddev = $param_stddev_factor * 1.0 / sqrt($hidden_layer_size);
  print $single_layer_fh "AffineComponent$nobias input-dim=$hidden_layer_size output-dim=$hidden_layer_size " .
    "learning-rate=$learning_rate param-stddev=$param_stddev bias-stddev=$bias_stddev\n";
  print $single_layer_fh "TanhComponent dim=$hidden_layer_size\n";
  close ($single_layer_fh) || die "Closing config file";
}

## Now the output layer.
print "AffineComponent$nobias input-dim=$cur_input_dim output-dim=$num_leaves " .
  "learning-rate=$learning_rate param-stddev=0 bias-stddev=0\n"; # we just set the parameters to zero for this layer.
## the softmax nonlinearity.
print "SoftmaxComponent dim=$num_leaves\n";

##


================================================
FILE: egs/utils/nnet-cpu/make_nnet_config_block.pl
================================================
#!/usr/bin/env perl
use warnings; #sed replacement for -w perl parameter
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# These options can be useful if we want to splice the input
# features across time.
$input_left_context = 0;
$input_right_context = 0;
$param_stddev_factor = 1.0;  # can be used to adjust initial variance
  # of parameters.
$initial_num_hidden_layers = -1; # if >= 0, the number of hidden layers
  # the model should start with, which may be less than the final number
  # (the final number is used to calculate the #neurons).
$single_layer_config = "";

# Consume the leading options, in any order.  The loop stops at the first
# argument that is not a known option (i.e. where the positional arguments
# start) or when no arguments are left, so $ARGV[0] is never read while
# @ARGV is empty.
while (@ARGV > 0) {
  if ($ARGV[0] eq "--input-left-context") {
    $input_left_context = $ARGV[1];
    shift; shift;
  } elsif ($ARGV[0] eq "--input-right-context") {
    $input_right_context = $ARGV[1];
    shift; shift;
  } elsif ($ARGV[0] eq "--param-stddev-factor") {
    $param_stddev_factor = $ARGV[1];
    shift; shift;
  } elsif ($ARGV[0] eq "--initial-num-hidden-layers") {
    # This option takes two values: the layer count and a config filename.
    $initial_num_hidden_layers = $ARGV[1];
    $single_layer_config = $ARGV[2];
    shift; shift; shift;
  } else {
    last;
  }
}


# Exactly five positional arguments have to remain after option parsing.
if (@ARGV != 5) {
  print STDERR "Usage: make_nnet_config_block.pl  [options] <feat-dim> <num-leaves> <num-hidden-layers> <num-blocks> <num-parameters>  >config-file
Options:
   --input-left-context <n>        #  #frames of left context for input features; default 0.
   --input-right-context <n>       #  #frames of right context for input features; default 0.
   --param-stddev-factor <f>       #  Factor which can be used to modify the standard deviation of
                                   #  randomly initialized features (default, 1.  Gets multiplied by
                                   #  1/sqrt of number of inputs).
   --initial-num-hidden-layers <n> <config-file>   #  If >0, number of hidden layers to initialize the network with.
                                   #  In this case, the positional parameter <num-hidden-layers> is only
                                   #  used to work out the number of units per hidden layer (based on
                                   #  parameter count), and we write to <config-file> the config corresponding
                                   #  to a single hidden layer.\n";
     exit(1);
}

($feat_dim, $num_leaves, $num_hidden_layers, $num_blocks, $num_params) = @ARGV;

($input_left_context < 0) &&  die "Invalid input left context $input_left_context";
($input_right_context < 0) &&  die "Invalid input right context $input_right_context";
($feat_dim <= 0) &&  die "Invalid feature dimension $feat_dim";
($num_leaves <= 0) && die "Invalid number of leaves $num_leaves";
($num_blocks <= 0) && die "Invalid number of blocks $num_blocks";
($num_blocks > 20) && die "Implausibly high number of blocks $num_blocks";
($num_hidden_layers <= 0) && die "Invalid number of hidden layers $num_hidden_layers";
# By default the network starts with the final number of hidden layers.
if ($initial_num_hidden_layers < 0) {
  $initial_num_hidden_layers = $num_hidden_layers;
}
if ($initial_num_hidden_layers > $num_hidden_layers) {
  print STDERR "Initial number of hidden layers is more than #hidden layers.\n" .
    "This does not really make sense but continuing anyway.\n";
}

$context_size = 1 + $input_left_context + $input_right_context;
($num_params < ($num_leaves + ($feat_dim * $context_size) + $num_hidden_layers + 1))
  && die "Invalid number of params $num_params";

## num_params = hidden_layer_size^2/num_blocks * (num_hidden_layers-1)
##            + hidden_layer_size * (num_leaves + feat_dim * context_size)
## solve for hidden_layer_size = x.
## a x^2 + b x + c = 0, with
## a = (num_hidden_layers - 1) / num_blocks
## b = num_leaves + feat_dim * context_size
## c = -num_params

$a = ($num_hidden_layers - 1) / ($num_blocks * 1.0); # * 1.0 to make sure it's float.
$b = $num_leaves + $feat_dim * $context_size;
$c = -$num_params;

if ($a > 0) {
  # Positive root of the quadratic, rounded down.
  $hidden_layer_size =  int((-$b + sqrt($b*$b - 4*$a*$c)) / (2*$a));
} else {
  # Single hidden layer: the equation is linear.
  $hidden_layer_size = int(-$c/$b);
}
##  make sure num_blocks divides hidden_layer_size.
$hidden_layer_size -= $hidden_layer_size % $num_blocks;

# Parameter count actually obtained with the rounded-down layer size; warn if
# it is more than 10% away from the requested count.
$actual_num_params = ($hidden_layer_size * $hidden_layer_size)/$num_blocks * ($num_hidden_layers - 1)
                   + $hidden_layer_size * ($num_leaves + $feat_dim * $context_size);

if (abs($actual_num_params - $num_params) > 0.1 * $num_params) {
  print STDERR "Warning: $0: possible failure $actual_num_params != $num_params\n";
}

# Write the network config to stdout, one component per line.
if ($input_left_context + $input_right_context != 0) {
  # First component has to be splicing component...
  # Note: we might be interested in decorrelating this e.g. with
  # DCT layer at some point, but for now, splicing isn't seeming to be
  # that useful.
  print "SpliceComponent input-dim=$feat_dim left-context=$input_left_context right-context=$input_right_context\n";
}
$cur_input_dim = $feat_dim * (1 + $input_left_context + $input_right_context);

# The first hidden layer is a full affine layer; the following ones are a
# permutation followed by a block-affine layer.  Each is followed by a tanh.
for ($hidden_layer = 0; $hidden_layer < $initial_num_hidden_layers; $hidden_layer++) {
  if ($hidden_layer == 0) {
    $param_stddev = $param_stddev_factor * 1.0 / sqrt($cur_input_dim);
    print "AffineComponent input-dim=$cur_input_dim output-dim=$hidden_layer_size " .
      "param-stddev=$param_stddev\n";
    print "TanhComponent dim=$hidden_layer_size\n";
  } else {
    # The stddev is based on the per-block input dimension here.
    $param_stddev = $param_stddev_factor * 1.0 / sqrt($cur_input_dim / $num_blocks);
    print "PermuteComponent dim=$cur_input_dim\n";
    print "BlockAffineComponent num-blocks=$num_blocks input-dim=$cur_input_dim output-dim=$hidden_layer_size " .
      "param-stddev=$param_stddev\n";
    print "TanhComponent dim=$hidden_layer_size\n";
  }
  $cur_input_dim = $hidden_layer_size;
}

if ($single_layer_config ne "") {
  # Create a config file we'll use to add new hidden layers.
  # Three-argument open: the filename is used literally, whatever characters
  # it starts or ends with.
  open(my $single_layer_fh, ">", $single_layer_config)
    || die "Error opening $single_layer_config for output: $!";
  # NOTE(review): unlike the BlockAffineComponent lines in the loop above, the
  # stddev here is not divided by num_blocks -- confirm whether that is intended.
  $param_stddev = $param_stddev_factor * 1.0 / sqrt($hidden_layer_size);
  print $single_layer_fh "PermuteComponent dim=$hidden_layer_size\n";
  print $single_layer_fh "BlockAffineComponent num-blocks=$num_blocks input-dim=$hidden_layer_size output-dim=$hidden_layer_size " .
    "param-stddev=$param_stddev\n";
  print $single_layer_fh "TanhComponent dim=$hidden_layer_size\n";
  close ($single_layer_fh) || die "Closing config file";
}

## Now the output layer.
print "AffineComponent input-dim=$cur_input_dim output-dim=$num_leaves " .
  "param-stddev=0\n"; # we just set the parameters to zero for this layer.
## the softmax nonlinearity.
print "SoftmaxComponent dim=$num_leaves\n";

##


================================================
FILE: egs/utils/nnet-cpu/make_nnet_config_preconditioned.pl
================================================
#!/usr/bin/env perl
use warnings; #sed replacement for -w perl parameter
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# These options can be useful if we want to splice the input
# features across time.
$input_left_context = 0;
$input_right_context = 0;
$param_stddev_factor = 1.0;  # can be used to adjust initial variance
  # of parameters.
$initial_num_hidden_layers = -1; # if >= 0, the number of hidden layers
  # the model should start with, which may be less than the final number
  # (the final number is used to calculate the #neurons).
$single_layer_config = ""; # a file to which we'll output a config corresponding
       # to a single layer; we'll later use this to add layers to the neural
       # network.
$bias_stddev = 2.0;  # Standard deviation for random initialization of the
                     # bias terms (mean is zero).
$splice_max_context = 0; # Relates to SpliceMaxComponent (experimental feature)
$learning_rate = 0.001;
$max_change = 0.0;
$nonlinear_component_type = "Tanh";

$alpha = 4.0;
$l2_penalty_opt = ""; # Option for AffineComponentPreconditioned layer.
$tree_map = ""; # If supplied, a text file that maps from l2 to l1 tree nodes (output
   # by build-tree-two-level).  Used for initializing mixture-prob component.

$splice_context = 0;
$dropout_scale = -1.0; # if not -1.0, scale for "lower" part of
                       # dropout scale, typically 0 <= dropout_scale < 1.
$additive_noise_stddev = 0.0; # I didn't find this helpful either.
$lda_dim = 0;
$expand_power = 1;
$expand_scale = 1.0;
$lda_mat = "";

# Consume the leading options, in any order and in any number.  The loop stops
# at the first argument that is not a known option (i.e. where the positional
# arguments start) or when no arguments are left, so $ARGV[0] is never read
# while @ARGV is empty.
while (@ARGV > 0) {
  if ($ARGV[0] eq "--input-left-context") {
    $input_left_context = $ARGV[1];
    shift; shift;
  } elsif ($ARGV[0] eq "--l2-penalty") {
    my $l2_penalty = $ARGV[1];
    $l2_penalty_opt = "l2-penalty=$l2_penalty";
    shift; shift;
  } elsif ($ARGV[0] eq "--dropout-scale") {
    $dropout_scale = $ARGV[1];
    shift; shift;
  } elsif ($ARGV[0] eq "--expand-power") {
    $expand_power = $ARGV[1];
    shift; shift;
  } elsif ($ARGV[0] eq "--expand-scale") {
    $expand_scale = $ARGV[1];
    shift; shift;
  } elsif ($ARGV[0] eq "--max-change") {
    $max_change = $ARGV[1];
    shift; shift;
  } elsif ($ARGV[0] eq "--additive-noise-stddev") {
    $additive_noise_stddev = $ARGV[1];
    shift; shift;
  } elsif ($ARGV[0] eq "--nonlinear-component-type") {
    $nonlinear_component_type = $ARGV[1];
    shift; shift;
  } elsif ($ARGV[0] eq "--lda-mat") {
    # This option takes three values: splice width, LDA dimension, matrix filename.
    $splice_context = $ARGV[1];
    $lda_dim = $ARGV[2];
    $lda_mat = $ARGV[3];
    shift; shift; shift; shift;
  } elsif ($ARGV[0] eq "--input-right-context") {
    $input_right_context = $ARGV[1];
    shift; shift;
  } elsif ($ARGV[0] eq "--param-stddev-factor") {
    $param_stddev_factor = $ARGV[1];
    shift; shift;
  } elsif ($ARGV[0] eq "--bias-stddev") {
    $bias_stddev = $ARGV[1];
    shift; shift;
  } elsif ($ARGV[0] eq "--alpha") {
    $alpha = $ARGV[1];
    shift; shift;
  } elsif ($ARGV[0] eq "--splice-max-context") {
    $splice_max_context = $ARGV[1];
    shift; shift;
  } elsif ($ARGV[0] eq "--learning-rate") {
    $learning_rate = $ARGV[1];
    shift; shift;
  } elsif ($ARGV[0] eq "--initial-num-hidden-layers") {
    # This option takes two values: the layer count and a config filename.
    $initial_num_hidden_layers = $ARGV[1];
    $single_layer_config = $ARGV[2];
    shift; shift; shift;
  } elsif ($ARGV[0] eq "--tree-map") { # Note: this was for an idea that
    # didn't end up working for me; it relates to SCTM-like systems.
    $tree_map = $ARGV[1];
    shift; shift;
  } else {
    last;
  }
}


# Exactly four positional arguments have to remain after option parsing.
# NOTE(review): the usage text below says "0 < alpha <= 1", which does not fit
# the built-in default of 4.0 set earlier in this script -- confirm the valid range.
if (@ARGV != 4) {
  print STDERR "Usage: make_nnet_config_preconditioned.pl  [options] <feat-dim> <num-leaves> <num-hidden-layers> <num-parameters>  >config-file
Options:
   --input-left-context <n>        #  #frames of left context for input features; default 0 (this separate from pre-LDA splicing).
   --input-right-context <n>       #  #frames of right context for input features; default 0  (this separate from pre-LDA splicing).
   --param-stddev-factor <f>       #  Factor which can be used to modify the standard deviation of
                                   #  randomly initialized features (default, 1.  Gets multiplied by
                                   #  1/sqrt of number of inputs).
   --initial-num-hidden-layers <n> <config-file>   #  If >0, number of hidden layers to initialize the network with.
                                   #  In this case, the positional parameter <num-hidden-layers> is only
                                   #  used to work out the number of units per hidden layer (based on
                                   #  parameter count), and we write to <config-file> the config corresponding
                                   #  to a single hidden layer.
   --alpha <f>                     #  Factor (default 4.0) which affects the preconditioning.  0 < alpha <= 1;
                                   #  smaller means more aggressive preconditioning / less smoothing of the Fisher
                                   #  matrix.
   --learning-rate <f>             # Initial learning rate, default 0.001
   --lda-mat <splice-width> <lda-dimension> <lda-matrix-filename>  # Allows the user to specify splice-and-lda
                                   # with a given transformation, as a fixed component in the network.  E.g.
                                   # splice-width of 4 represents context of +- 4 frames.  Here, lda-dimension is
                                   # the output dimension of LDA, which must be the same as in the file.\n";
  exit(1);
}

($feat_dim, $num_leaves, $num_hidden_layers, $num_params) = @ARGV;
($input_left_context < 0) &&  die "Invalid input left context $input_left_context";
($input_right_context < 0) &&  die "Invalid input right context $input_right_context";
($feat_dim <= 0) &&  die "Invalid feature dimension $feat_dim";
($num_leaves <= 0) && die "Invalid number of leaves $num_leaves";
($num_hidden_layers <= 0) && die "Invalid number of hidden layers $num_hidden_layers";
# By default the network starts with the final number of hidden layers.
if ($initial_num_hidden_layers < 0) {
  $initial_num_hidden_layers = $num_hidden_layers;
}
if ($initial_num_hidden_layers > $num_hidden_layers) {
  print STDERR "Initial number of hidden layers is more than #hidden layers.\n" .
    "This does not really make sense but continuing anyway.\n";
}

$context_size = 1 + $input_left_context + $input_right_context;
($num_params < ($num_leaves + ($feat_dim * $context_size) + $num_hidden_layers + 1))
  && die "Invalid number of params $num_params";

## num_params = hidden_layer_size^2 * (num_hidden_layers-1)
##            + hidden_layer_size * (num_leaves + feat_dim * context_size * expand_power)
## solve for hidden_layer_size = x.
## a x^2 + b x + c = 0, with
## a = num_hidden_layers - 1
## b = num_leaves + feat_dim * context_size * expand_power
## c = -num_params
## NOTE(review): feat_dim here is the value given on the command line, also
## when --lda-mat is used -- confirm that this is the intended dimension.

$a = $num_hidden_layers - 1;
$b = $num_leaves + $feat_dim * $context_size * $expand_power;
$c = -$num_params;

if ($a > 0) {
  # Positive root of the quadratic, rounded down.
  $hidden_layer_size =  int((-$b + sqrt($b*$b - 4*$a*$c)) / (2*$a));
} else {
  # Single hidden layer: the equation is linear.
  $hidden_layer_size = int(-$c/$b);
}


# Parameter count actually obtained with the rounded-down layer size; warn if
# it is more than 10% away from the requested count.
$actual_num_params = $hidden_layer_size * $hidden_layer_size * ($num_hidden_layers - 1)
                   + $hidden_layer_size * ($num_leaves + $feat_dim * $context_size * $expand_power);

if (abs($actual_num_params - $num_params) > 0.1 * $num_params) {
  print STDERR "Warning: $0: possible failure $actual_num_params != $num_params\n";
}

# Write the network config to stdout, one component per line.
if ($splice_context > 0) { # --lda-mat <splice-context> <lda-matrix> was specified...
  print "SpliceComponent input-dim=$feat_dim left-context=$splice_context right-context=$splice_context\n";
  print "FixedLinearComponent matrix=$lda_mat\n"; # specify the filename.
  $feat_dim = $lda_dim; # This is now the input dimension.
}

if ($splice_max_context > 0) {
  print "SpliceMaxComponent dim=$feat_dim left-context=$splice_max_context right-context=$splice_max_context\n";
}


if ($input_left_context + $input_right_context != 0) {
  # First component has to be splicing component...
  # Note: we might be interested in decorrelating this e.g. with
  # DCT layer at some point, but for now, splicing isn't seeming to be
  # that useful.
  print "SpliceComponent input-dim=$feat_dim left-context=$input_left_context right-context=$input_right_context\n";
}
$cur_input_dim = $feat_dim * (1 + $input_left_context + $input_right_context);

if ($expand_power > 1) {
  print "PowerExpandComponent input-dim=$cur_input_dim max-power=$expand_power higher-power-scale=$expand_scale\n";
  $cur_input_dim *= $expand_power;
}

# Hidden layers: a preconditioned affine component followed by the chosen
# nonlinearity, and optionally by dropout and additive-noise components.
for ($hidden_layer = 0; $hidden_layer < $initial_num_hidden_layers; $hidden_layer++) {
  $param_stddev = $param_stddev_factor * 1.0 / sqrt($cur_input_dim);
  print "AffineComponentPreconditioned input-dim=$cur_input_dim output-dim=$hidden_layer_size alpha=$alpha max-change=$max_change " .
    "$l2_penalty_opt learning-rate=$learning_rate param-stddev=$param_stddev bias-stddev=$bias_stddev\n";
  $cur_input_dim = $hidden_layer_size;
  print "${nonlinear_component_type}Component dim=$cur_input_dim\n";
  if ($dropout_scale != -1.0) {
    print "DropoutComponent dim=$cur_input_dim dropout-scale=$dropout_scale\n";
  }
  if ($additive_noise_stddev != 0.0) {
    print "AdditiveNoiseComponent dim=$cur_input_dim stddev=$additive_noise_stddev\n";
  }
}

if ($single_layer_config ne "") {
  # Create a config file we'll use to add new hidden layers.
  # Three-argument open: the filename is used literally, whatever characters
  # it starts or ends with.
  open(my $single_layer_fh, ">", $single_layer_config)
    || die "Error opening $single_layer_config for output: $!";
  $param_stddev = $param_stddev_factor * 1.0 / sqrt($hidden_layer_size);
  print $single_layer_fh "AffineComponentPreconditioned input-dim=$hidden_layer_size output-dim=$hidden_layer_size alpha=$alpha max-change=$max_change " .
    "$l2_penalty_opt learning-rate=$learning_rate param-stddev=$param_stddev bias-stddev=$bias_stddev\n";
  print $single_layer_fh "${nonlinear_component_type}Component dim=$hidden_layer_size\n";
  # The components following the nonlinearity have the dimension of the hidden
  # layer.  $cur_input_dim is not used here: it only equals the hidden layer
  # size if at least one hidden layer was written by the loop above.
  if ($dropout_scale != -1.0) {
    print $single_layer_fh "DropoutComponent dim=$hidden_layer_size dropout-scale=$dropout_scale\n";
  }
  if ($additive_noise_stddev != 0.0) {
    print $single_layer_fh "AdditiveNoiseComponent dim=$hidden_layer_size stddev=$additive_noise_stddev\n";
  }
  close ($single_layer_fh) || die "Closing config file";
}

## Now the output layer.
print "AffineComponentPreconditioned input-dim=$cur_input_dim output-dim=$num_leaves alpha=$alpha max-change=$max_change " .
  "$l2_penalty_opt learning-rate=$learning_rate param-stddev=0 bias-stddev=0\n"; # we just set the parameters to zero for this layer.
## the softmax nonlinearity.
print "SoftmaxComponent dim=$num_leaves\n";

if ($tree_map ne "") {
  # Create a MixtureProbComponent at the end, that shares "Gaussians"
  # among leaves that share the same level-1 tree index.
  open(my $tree_map_fh, "<", $tree_map) || die "opening tree map file $tree_map: $!";
  # Only the first line of the file is read; it is expected to hold the map
  # as a list of numbers enclosed in "[" and "]".
  $map = <$tree_map_fh>;
  close($tree_map_fh);
  $map =~ s/\s*\[\s*// || die "Unexpected data in tree map file $tree_map";
  $map =~ s/\s*\]\s*// || die "Unexpected data in tree map file $tree_map";
  @map = split(" ", $map);
  @dims = ();
  # Collapse each run of equal consecutive indexes into its length.
  while (@map > 0) {
    $index = shift @map;
    $n = 1;
    while (@map > 0 && $map[0] == $index) { shift @map; $n++; }
    push @dims, $n;
  }
  $dims = join(":", @dims);
  print "MixtureProbComponent learning-rate=$learning_rate diag-element=0.9 dims=$dims\n";
}

##


================================================
FILE: egs/utils/nnet-cpu/update_learning_rates.pl
================================================
#!/usr/bin/env perl
use warnings; #sed replacement for -w perl parameter
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# This script takes three command-line arguments.
# The first is a log-file such as exp/tri4b_nnet/log/combine.10.log,
# which is the output of nnet-combine.  The second is a file such
# as exp/tri4b_nnet/11.tmp.mdl, i.e. a model file, for which we will
# update the learning rates; the third is the output nnet file e.g.
# exp/tri4b_nnet/11.mdl

# This script assumes that the "combine" script is called as:
# nnet-combine <old-model> <new-model-1> <new-model-2> ... <new-model-n> <validation-examples> <output-model>.
# It gets from the logging output a line like this:
# LOG (nnet-combine:CombineNnets():combine-nnet.cc:184) Combining nnets, validation objf per frame changed from -1.43424 to -1.42067, scale factors are  [ 0.727508 0.79889 0.299533 0.137696 -0.0479123 0.210445 0.0195638 0.123843 0.167453 0.0193894 -0.0128672 0.178384 0.0516549 0.0958205 0.125495 ]
# [in this case the 1st 3 numbers correspond to the <old-model> ] and for each
# updatable layer, it works out the total weight on the new models.
# It interprets this as being (for each layer) a step length along
# the path old-model -> new-model.
# Basically, we change the learning rate by a factor equal to this step length,
# subject to limits on the change  [by default limit to halving/doubling].
# It's fairly obvious why we would want to do this.

# Tunable limits; the last three can be overridden from the command line.
$sources_to_exclude = 1; # may make this configurable later.
$min_learning_rate_factor = 0.5;
$max_learning_rate_factor = 2.0;
$min_learning_rate = 0.0001; # Put a floor because if too small,
  # the changes become zero due to roundoff.

# Consume leading "--option value" pairs.  We only look at $ARGV[0] while at
# least two arguments remain, so an option given without a value (or options
# given without any positional arguments) falls through to the usage message
# below instead of comparing against an undefined value.
while (@ARGV >= 2) {
  if ($ARGV[0] eq "--min-learning-rate-factor") {
    $min_learning_rate_factor = $ARGV[1];
  } elsif ($ARGV[0] eq "--max-learning-rate-factor") {
    $max_learning_rate_factor = $ARGV[1];
  } elsif ($ARGV[0] eq "--min-learning-rate") {
    $min_learning_rate = $ARGV[1];
  } else {
    last; # first positional argument: stop option parsing.
  }
  shift @ARGV; shift @ARGV;
}


if (@ARGV != 3) {
  print STDERR "Usage: update_learning_rates.pl [options] <log-file-for-nnet-combine> <nnet-in> <nnet-out>
Options:
   --min-learning-rate-factor       #  minimum factor to change learning rate by (default: 0.5)
   --max-learning-rate-factor       #  maximum factor to change learning rate by (default: 2.0)
   --min-learning-rate              #  floor applied to the updated learning rates (default: 0.0001)\n";
   exit(1);
}

($combine_log, $nnet_in, $nnet_out) = @ARGV;

# Scan the nnet-combine log for the number of source nets and for the
# per-source, per-layer scale factors.  If a pattern occurs on several lines,
# the last occurrence wins.
open(L, "<$combine_log") || die "Opening log file \"$combine_log\"";

@scale_factors = ();
while(<L>) {
  if (m/Objective functions for the source neural nets are\s+\[(.+)\]/) {
    ## a line like:
    ##  LOG (nnet-combine:GetInitialScaleParams():combine-nnet.cc:66) Objective functions for the source neural nets are  [ -1.37002 -1.52115 -1.52103 -1.50189 -1.51912 ]
    @A = split(" ", $1);
    $num_sources = @A; # number of source neural nets (dimension of @A); 5 in this case.
  }
  ## a line like:
  ## LOG (nnet-combine:CombineNnets():combine-nnet.cc:184) Combining nnets, validation objf per frame changed from -1.37002 to -1.36574, scale factors are  [ 0.819379 0.696122 0.458798 0.040513 -0.0448875 0.171431 0.0274615 0.139143 0.133846 0.0372585 0.114193 0.17944 0.0491838 0.0668778 0.0328936 ]
  if (m/Combining nnets.+scale factors are\s+\[(.+)\]/) {
    @scale_factors = split(" ", $1);
  }
}
close(L);

# A missing line leaves $num_sources undefined; an empty "[ ]" list gives 0,
# which would otherwise cause a modulus-by-zero below.
if (!$num_sources) {
  die "Log file $combine_log did not have expected format: no line with \"Objective functions\"\n";
}
# Note: "defined @array" is a fatal error in Perl >= 5.22, so test the array
# for emptiness instead.
if (!@scale_factors) {
  die "Log file $combine_log did not have expected format: no line with \"Combining nnets\"\n";
}


$num_scales = @scale_factors; # length of the array.
if ($num_scales % $num_sources != 0) {
  die "Error interpreting log file $combine_log: $num_sources does not divide $num_scales\n";
}

# Collect the current learning rates, in order of appearance, from every line
# of the nnet-am-info output that contains "learning rate = <value>,".
open(P, "nnet-am-info $nnet_in |") || die "Opening pipe from nnet-am-info";
@learning_rates = map { m/learning rate = ([^,]+),/ ? ($1) : () } <P>;
close(P);

# The scale factors are laid out as one group of per-layer values per source.
$num_layers = $num_scales / $num_sources;

$num_info_learning_rates = @learning_rates;
die "From log file we expect there to be $num_layers updatable components, but from the output of nnet-am-info we saw $num_info_learning_rates"
  if ($num_layers != $num_info_learning_rates);

# For each layer: sum the combination weights of the non-excluded sources,
# clamp that sum to [min, max] learning-rate factor, scale the layer's
# learning rate by it and apply the absolute floor.
foreach my $layer (0 .. $num_layers - 1) {
  $sum = 0.0;
  foreach my $source ($sources_to_exclude .. $num_sources - 1) {
    $sum += $scale_factors[$source * $num_layers + $layer];
  }
  my $factor = $sum;
  $factor = $max_learning_rate_factor if ($factor > $max_learning_rate_factor);
  $factor = $min_learning_rate_factor if ($factor < $min_learning_rate_factor);
  my $updated_rate = $learning_rates[$layer] * $factor;
  $updated_rate = $min_learning_rate if ($updated_rate < $min_learning_rate);
  print STDERR "For layer $layer, sum of weights of non-excluded sources is $sum, learning-rate factor is $factor\n";
  $learning_rates[$layer] = $updated_rate;
}

# Write the model out with the new colon-separated learning rates; the exit
# status is 1 if nnet-am-copy failed and 0 otherwise.
$lrates_string = join(":", @learning_rates);

$ret = system("nnet-am-copy --learning-rates=$lrates_string $nnet_in $nnet_out");

exit($ret == 0 ? 0 : 1);


================================================
FILE: egs/utils/nnet3/convert_config_tdnn_to_affine.py
================================================
#!/usr/bin/env python3

# Copyright 2020  Yiming Wang
#
# Apache 2.0.

import argparse
import re
import sys


def get_parser():
    """Return the command-line parser: a single positional ``input`` path."""
    description = """
        Convert a config file with Tdnn components to their equivalent
        Affine/Linear components. Useful when we are using MACE (a deep learning
        inference framework using Kaldi's trained models) that doesn't
        support Tdnn components.
        Usage:
            convert_config_tdnn_to_affine.py exp/chain/tdnn_1a/configs/final.config > \\
              exp/chain/tdnn_1a/configs/converted.config
        """
    cli = argparse.ArgumentParser(description=description)
    cli.add_argument('input', type=str)
    return cli


def _convert_component_node(line, offsets_dict):
    """Rewrite a ``component-node`` line, splicing its input if needed.

    When the referenced component was a TdnnComponent with time offsets
    (recorded in *offsets_dict*), the ``input=X`` field is replaced by the
    equivalent ``Append(Offset(X, o), ...)`` descriptor; an offset of ``0``
    maps to plain ``X``. All other fields are copied unchanged.
    """
    component = re.findall(r'component=(\S+)', line)[-1]
    offsets = offsets_dict.get(component)
    new_split_line = []
    for col in line.strip().split():
        if col.startswith('input=') and offsets is not None:
            inp = col.split('=', 1)[1]
            # Compare by value: identity ("is") on str literals is unreliable.
            offsets_str = [
                'Offset({}, {})'.format(inp, o) if o != '0' else inp
                for o in offsets
            ]
            if len(offsets_str) > 1:
                new_split_line.append(
                    'input=Append({})'.format(', '.join(offsets_str)))
            else:
                new_split_line.append('input={}'.format(offsets_str[0]))
        else:
            new_split_line.append(col)
    return ' '.join(new_split_line)


def _convert_tdnn_component(line, offsets_dict):
    """Rewrite a TdnnComponent line as an Affine/Linear component line.

    ``use-bias=false`` selects LinearComponent, anything else
    NaturalGradientAffineComponent. ``input-dim`` is multiplied by the number
    of time offsets, and the ``time-offsets``/``use-bias`` fields are dropped.
    The offsets are stored in *offsets_dict* under the component's name so
    that the matching component-node can be rewritten later.
    """
    assert 'type=TdnnComponent' in line, line
    bias_flags = re.findall(r'use-bias=(\w+)', line)
    use_bias = not (bias_flags and bias_flags[-1] == 'false')
    found = re.findall(r'time-offsets=(\S+)', line)
    # -1 in case there are multiple "time-offsets" fields
    offsets = found[-1].split(',') if found else None

    name = None
    has_offsets_field = False
    new_split_line = []
    for col in line.strip().split():
        if col.startswith('name='):  # keep the name of the component
            name = col.split('=', 1)[1]
            assert name not in offsets_dict
            new_split_line.append(col)
        elif col == 'type=TdnnComponent':  # convert the component type
            new_split_line.append('type={}'.format(
                'NaturalGradientAffineComponent' if use_bias
                else 'LinearComponent'))
        elif col.startswith('input-dim='):  # widen input-dim by the splice
            input_dim = int(col.split('=', 1)[1])
            if offsets is not None:
                input_dim *= len(offsets)
            new_split_line.append('input-dim={}'.format(input_dim))
        elif col.startswith('time-offsets='):
            has_offsets_field = True
        elif not col.startswith('use-bias='):  # everything else: copy over
            new_split_line.append(col)

    # Register the offsets only after the whole line has been scanned, so the
    # result does not depend on whether "name=" precedes "time-offsets=".
    if has_offsets_field:
        if name is None:
            raise ValueError(
                'TdnnComponent line has time-offsets but no name: ' + line)
        offsets_dict[name] = offsets
    return ' '.join(new_split_line)


def main(args):
    """Convert the nnet3 config at ``args.input`` and print it to stdout.

    Non-Tdnn ``component`` lines, ``input-node``/``output-node`` lines,
    comments and blank lines are printed stripped but otherwise unchanged.
    TdnnComponent lines become Affine/Linear component lines, and the
    ``component-node`` lines that use them get an explicit spliced input.
    Any other line trips the assertion in the Tdnn conversion.

    NOTE(review): fields are split on whitespace, so an ``input=`` descriptor
    that itself contains spaces is presumably not supported -- confirm.
    """
    passthrough_prefixes = ('input-node', 'output-node', '#')
    offsets_dict = {}  # mapping from each TdnnComponent's name to its offsets
    with open(args.input, 'r', encoding='utf-8') as f:
        for line in f:
            stripped = line.strip()
            if (
                (line.startswith('component ')
                 and 'type=TdnnComponent' not in line)
                or line.startswith(passthrough_prefixes)
                or stripped == ''
            ):
                print(stripped)
            elif line.startswith('component-node'):
                print(_convert_component_node(line, offsets_dict))
            else:  # Tdnn component line
                print(_convert_tdnn_component(line, offsets_dict))


if __name__ == '__main__':
    # Script entry point: parse the command line and run the conversion.
    main(get_parser().parse_args())


================================================
FILE: egs/utils/parallel/limit_num_gpus.sh
================================================
#!/usr/bin/env bash

# This script functions as a wrapper of a bash command that uses GPUs.
#
# It sets the CUDA_VISIBLE_DEVICES variable so that it limits the number of GPUs
# used by the program. This is necessary when running a job on the grid if the
# job would otherwise automatically grab all resources available on the system,
# e.g. a TensorFlow program.

num_gpus=1 # this variable indicates how many GPUs we will allow the command
           # passed to this script to run on. We achieve this by setting the
           # CUDA_VISIBLE_DEVICES variable
set -e

# Optional leading "--num-gpus <n>" overrides the default of 1.
if [ "$1" == "--num-gpus" ]; then
  num_gpus=$2
  shift
  shift
fi

# printf "%d" fails for a non-integer; negative values are rejected explicitly.
if ! printf "%d" "$num_gpus" >/dev/null || [ "$num_gpus" -le -1 ]; then
  echo "$0: Must pass a positive integer or 0 after --num-gpus"
  echo "e.g. $0 --num-gpus 2 local/tfrnnlm/run_lstm.sh"
  exit 1
fi

if [ $# -eq 0 ]; then
  echo "Usage:  $0 [--num-gpus <num-gpus>] <command> [<arg1>...]"
  echo "Runs <command> with args after setting CUDA_VISIBLE_DEVICES to "
  echo "make sure exactly <num-gpus> GPUs are visible (default: 1)."
  exit 1
fi

CUDA_VISIBLE_DEVICES=
num_total_gpus=$(nvidia-smi -L | wc -l)
num_gpus_assigned=0

if [ "$num_gpus" -eq 0 ] ; then
    echo "$0: Running the job on CPU. Disabling submitting to gpu"
    export CUDA_VISIBLE_DEVICES=""
else
    for i in $(seq 0 $((num_total_gpus - 1))); do
      # go over all GPUs, check whether each one is idle, and add it to the
      # comma-separated list if so
      if nvidia-smi -i "$i" | grep "No running processes found" >/dev/null; then
        CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}$i,
        num_gpus_assigned=$((num_gpus_assigned + 1))
      fi
      # once we have enough GPUs, break out of the loop
      if [ "$num_gpus_assigned" -eq "$num_gpus" ]; then
        break
      fi
    done

    if [ "$num_gpus_assigned" -ne "$num_gpus" ]; then
      echo "Could not find enough idle GPUs"
      exit 1
    fi

    # drop the trailing comma left by the loop above
    export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES%,}

    echo "$0: Running the job on GPU(s) $CUDA_VISIBLE_DEVICES"
fi

"$@"


================================================
FILE: egs/utils/parallel/pbs.pl
================================================
#!/usr/bin/env perl
use strict;
use warnings;

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).
#           2014  Johns Hopkins University (Author: Vimal Manohar)
#           2015  Queensland University of Technology (Author: Ahilan Kanagasundaram <a.kanagasundaram@qut.edu.au>)
# Apache 2.0.

use File::Basename;
use Cwd;
use Getopt::Long;

# This is a version of the queue.pl modified so that it works under PBS
# PBS is one of the several "almost compatible" queueing systems. The
# command switches and environment variables are different, so we are adding
# this script. An optimal solution might probably be to make the variable
# names and the commands configurable, as similar problems can be expected
# with Torque, Univa... and who knows what else
#
# pbs.pl has the same functionality as run.pl, except that
# it runs the job in question on the queue (PBS).
# This version of pbs.pl uses the task array functionality
# of PBS.  
# The script now supports configuring the queue system using a config file
# (default: conf/pbs.conf; a different one can be specified with the --config
# option) and a set of command line options.
# The current script handles:
# 1) Normal configuration arguments
# For e.g. a command line option of "--gpu 1" could be converted into the option
# "-q g.q -l gpu=1" to qsub. How the CLI option is handled is determined by a
# line in the config file like
# gpu=* -q g.q -l gpu=$0
# $0 here in the line is replaced with the argument read from the CLI and the
# resulting string is passed to qsub.
# 2) Special arguments to options such as
# gpu=0
# If --gpu 0 is given in the command line, then no special "-q" is given.
# 3) Default argument
# default gpu=0
# If --gpu option is not passed in the command line, then the script behaves as
# if --gpu 0 was passed since 0 is specified as the default argument for that
# option
# 4) Arbitrary options and arguments.
# Any command line option starting with '--' and its argument would be handled
# as long as its defined in the config file.
# 5) Default behavior
# If the config file that is passed using is not readable, then the script
# behaves as if the queue has the following config file:
# $ cat conf/pbs.conf
# # Default configuration
# command qsub -v PATH -S /bin/bash -l arch=*64*
# option mem=* -l mem_free=$0,ram_free=$0
# option mem=0          # Do not add anything to qsub_opts
# option num_threads=* -pe smp $0
# option num_threads=1  # Do not add anything to qsub_opts
# option max_jobs_run=* -tc $0
# default gpu=0
# option gpu=0 -q all.q
# option gpu=* -l gpu=$0 -q g.q

# State filled in while parsing the command line.
my ($qsub_opts, $sync) = ("", 0);  # options passed through to qsub; 1 once "-sync y" is seen.
my ($num_threads, $gpu) = (1, 0);  # defaults shown in the usage message.

my $config = "conf/pbs.conf";      # default config file location.

my %cli_options = ();              # "--name value" options, keyed by name with '-' -> '_'.

# Set from a "JOB=1:n"-style argument, if one is given.
my ($jobname, $jobstart, $jobend);

my $array_job = 0;                 # becomes 1 when a JOB=... argument is present.

# Print the usage message (including the current defaults) to stderr and exit
# with status 1.
sub print_usage() {
  my @usage_lines = (
    "Usage: pbs.pl [options] [JOB=1:n] log-file command-line arguments...",
    "e.g.: pbs.pl foo.log echo baz",
    " (which will echo \"baz\", with stdout and stderr directed to foo.log)",
    "or: pbs.pl -q all.q\@xyz foo.log echo bar \| sed s/bar/baz/ ",
    " (which is an example of using a pipe; you can provide other escaped bash constructs)",
    "or: pbs.pl -q all.q\@qyz JOB=1:10 foo.JOB.log echo JOB ",
    " (which illustrates the mechanism to submit parallel jobs; note, you can use ",
    "  another string other than JOB)",
    "Note: if you pass the \"-sync y\" option to qsub, this script will take note",
    "and change its behavior.  Otherwise it uses qstat to work out when the job finished",
    "Options:",
    "  --config <config-file> (default: $config)",
    "  --mem <mem-requirement> (e.g. --mem 2G, --mem 500M, ",
    "                           also support K and numbers mean bytes)",
    "  --num-threads <num-threads> (default: $num_threads)",
    "  --max-jobs-run <num-jobs>",
    "  --gpu <0|1> (default: $gpu)",
  );
  print STDERR join("\n", @usage_lines) . "\n";
  exit 1;
}

if (@ARGV < 2) {
  print_usage();
}

# Parse the leading options and the optional JOB=1:n specifier.  Two passes
# are made so that the JOB=1:n option may be interleaved with the options to
# qsub.
for (my $x = 1; $x <= 2; $x++) {
  while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) {
    my $switch = shift @ARGV;

    if ($switch eq "-V") {
      $qsub_opts .= "-V ";
    } else {
      my $argument = shift @ARGV;
      if ($argument =~ m/^--/) {
        print STDERR "pbs.pl: Warning: suspicious argument '$argument' to $switch; starts with '-'\n";
      }
      if ($switch eq "-sync" && $argument =~ m/^[yY]/) {
        $sync = 1;
        $qsub_opts .= "$switch $argument ";
      } elsif ($switch eq "-pe") { # e.g. -pe smp 5
        my $argument2 = shift @ARGV;
        $qsub_opts .= "$switch $argument $argument2 ";
        $num_threads = $argument2;
      } elsif ($switch =~ m/^--/) { # Config options
        # Convert CLI option to variable name
        # by removing '--' from the switch and replacing any
        # '-' with a '_'
        $switch =~ s/^--//;
        $switch =~ s/-/_/g;
        $cli_options{$switch} = $argument;
      } else {  # Other qsub options - passed as is
        $qsub_opts .= "$switch $argument ";
      }
    }
  }
  # The option loop may have consumed everything; the usage check below then
  # reports the problem, so do not look at an undefined $ARGV[0].
  last if (@ARGV == 0);
  if ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+):(\d+)$/) { # e.g. JOB=1:20
    $array_job = 1;
    $jobname = $1;
    $jobstart = $2;
    $jobend = $3;
    # Keep the specifier itself so the error messages show it (and not
    # whatever argument follows it).
    my $jobspec = shift @ARGV;
    if ($jobstart > $jobend) {
      die "pbs.pl: invalid job range $jobspec";
    }
    if ($jobstart <= 0) {
      die "pbs.pl: invalid job range $jobspec, start must be strictly positive (this is a GridEngine limitation).";
    }
  } elsif ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+)$/) { # e.g. JOB=1.
    $array_job = 1;
    $jobname = $1;
    $jobstart = $2;
    $jobend = $2;
    shift @ARGV;
  } elsif ($ARGV[0] =~ m/.+\=.*\:.*$/) {
    print STDERR "pbs.pl: Warning: suspicious first argument to pbs.pl: $ARGV[0]\n";
  }
}

if (@ARGV < 2) {
  print_usage();
}

# An explicit --config overrides the default config file location.
if (exists $cli_options{"config"}) {
  $config = $cli_options{"config"};
}

# Built-in configuration, used only when no --config was given and the
# default config file could not be opened (see below).
my $default_config_file = <<'EOF';
# Default configuration
command qsub -V -v PATH -S /bin/bash -l mem=4G
option mem=* -l mem=$0
option mem=0          # Do not add anything to qsub_opts
option num_threads=* -l ncpus=$0
option num_threads=1  # Do not add anything to qsub_opts
default gpu=0
option gpu=0
option gpu=* -l ncpus=$0
EOF

# Here the configuration options specified by the user on the command line
# (e.g. --mem 2G) are converted to options to the qsub system as defined in
# the config file. (e.g. if the config file has the line
# "option mem=* -l ram_free=$0,mem_free=$0"
# and the user has specified '--mem 2G' on the command line, the options
# passed to queue system would be "-l ram_free=2G,mem_free=2G").
# A more detailed description of the ways the options would be handled is at
# the top of this file.

my $opened_config_file = 1;

# "or" has lower precedence than the open, so the flag is cleared only when
# the open fails.
open CONFIG, "<$config" or $opened_config_file = 0;

# option name -> qsub arguments, from "option name=* ..." lines.
my %cli_config_options = ();
# (option name, value) -> qsub arguments, from "option name=value ..." lines.
my %cli_default_options = ();

if ($opened_config_file == 0 && exists($cli_options{"config"})) {
  # A config file that was explicitly requested must be readable.
  print STDERR "Could not open config file $config\n";
  exit(1);
} elsif ($opened_config_file == 0 && !exists($cli_options{"config"})) {
  # Open the default config file instead
  open (CONFIG, "echo '$default_config_file' |") or die "Unable to open pipe\n";
  $config = "Default config";
}

my $qsub_cmd = "";
my $read_command = 0;

# Note: lines are handled in file order, and "option" lines are only recorded
# for options already present in %cli_options at that point (either given on
# the command line or set by an earlier "default" line).
while(<CONFIG>) {
  chomp;
  my $line = $_;
  $_ =~ s/\s*#.*//g;  # strip comments; $line keeps the original for messages.
  if ($_ eq "") { next; }
  if ($_ =~ /^command (.+)/) {
    $read_command = 1;
    $qsub_cmd = $1 . " ";
  } elsif ($_ =~ m/^option ([^=]+)=\* (.+)$/) {
    # Config option that needs replacement with parameter value read from CLI
    # e.g.: option mem=* -l mem_free=$0,ram_free=$0
    my $option = $1;     # mem
    my $arg= $2;         # -l mem_free=$0,ram_free=$0
    if ($arg !~ m:\$0:) {
      die "Unable to parse line '$line' in config file ($config)\n";
    }
    if (exists $cli_options{$option}) {
      # Replace $0 with the argument read from command line.
      # e.g. "-l mem_free=$0,ram_free=$0" -> "-l mem_free=2G,ram_free=2G"
      $arg =~ s/\$0/$cli_options{$option}/g;
      $cli_config_options{$option} = $arg;
    }
  } elsif ($_ =~ m/^option ([^=]+)=(\S+)\s?(.*)$/) {
    # Config option that does not need replacement
    # e.g. option gpu=0 -q all.q
    my $option = $1;      # gpu
    my $value = $2;       # 0
    my $arg = $3;         # -q all.q
    if (exists $cli_options{$option}) {
      $cli_default_options{($option,$value)} = $arg;
    }
  } elsif ($_ =~ m/^default (\S+)=(\S+)/) {
    # Default options. Used for setting default values to options i.e. when
    # the user does not specify the option on the command line
    # e.g. default gpu=0
    my $option = $1;  # gpu
    my $value = $2;   # 0
    if (!exists $cli_options{$option}) {
      # Only set the default when the user has not specified this option on
      # the command line; otherwise we don't have to do anything.
      $cli_options{$option} = $value;
    }
  } else {
    print STDERR "pbs.pl: unable to parse line '$line' in config file ($config)\n";
    exit(1);
  }
}

close(CONFIG);

# A "command ..." line is mandatory: it provides the base qsub command.
if ($read_command != 1) {
  print STDERR "pbs.pl: config file ($config) does not contain the line \"command .*\"\n";
  exit(1);
}

# Turn every recorded command-line option into qsub options.  An exact
# "option name=value" entry takes precedence over the "option name=*"
# template; an option matching neither is a fatal error.
foreach my $opt_name (keys %cli_options) {
  next if ($opt_name eq "config");
  # max_jobs_run is ignored unless this is an array job.
  next if ($opt_name eq "max_jobs_run" && $array_job != 1);
  my $opt_value = $cli_options{$opt_name};

  if (exists $cli_default_options{($opt_name,$opt_value)}) {
    $qsub_opts .= $cli_default_options{($opt_name,$opt_value)} . " ";
    next;
  }
  if (exists $cli_config_options{$opt_name}) {
    $qsub_opts .= $cli_config_options{$opt_name} . " ";
    next;
  }
  $config = "default config file" if ($opened_config_file == 0);
  die "pbs.pl: Command line option $opt_name not described in $config (or value '$opt_value' not allowed)\n";
}

my $cwd = getcwd();
my $logfile = shift @ARGV;

# A multi-task array job must have the job name in its log-file name,
# otherwise all tasks would write to the same file.
if ($array_job == 1 && $jobend > $jobstart && $logfile !~ m/$jobname/) {
  print STDERR "pbs.pl: you are trying to run a parallel job but "
    . "you are putting the output into just one log file ($logfile)\n";
  exit(1);
}

#
# Work out the command; quote escaping is done here.
# Note: the rules for escaping stuff are worked out pretty
# arbitrarily, based on what we want it to do.  Some things that
# we pass as arguments to pbs.pl, such as "|", we want to be
# interpreted by bash, so we don't escape them.  Other things,
# such as archive specifiers like 'ark:gunzip -c foo.gz|', we want
# to be passed, in quotes, to the Kaldi program.  Our heuristic
# is that stuff with spaces in should be quoted.  This doesn't
# always work.
#
my $cmd = "";

for my $arg (@ARGV) {
  my $quoted;
  if ($arg =~ m/^\S+$/) {
    $quoted = $arg;         # no whitespace at all: take as-is.
  } elsif ($arg =~ m:\":) {
    $quoted = "'$arg'";     # contains double quotes: wrap in single quotes.
  } else {
    $quoted = "\"$arg\"";   # otherwise wrap in double quotes.
  }
  $cmd .= "$quoted ";
}

#
# Work out the location of the script file, and open it for writing.
#
my ($dir, $base) = (dirname($logfile), basename($logfile));
# The queue's own files live in a "q" directory next to the logs.
# If qdir ends in .../log/q, make it just .../q.
(my $qdir = "$dir/q") =~ s:/(log|LOG)/*q:/q:;
my $queue_logfile = "$qdir/$base";

unless (-d $dir) { system "mkdir -p $dir 2>/dev/null"; } # another job may be doing this...
die "Cannot make the directory $dir\n" unless (-d $dir);
# make a directory called "q",
# where we will put the log created by qsub... normally this doesn't contain
# anything interesting, everything goes to $logfile.
unless (-d "$qdir") {
  system "mkdir $qdir 2>/dev/null";
  sleep(5); ## This is to fix an issue we encountered in denominator lattice creation,
  ## where if e.g. the exp/tri2b_denlats/log/15/q directory had just been
  ## created and the job immediately ran, it would die with an error because nfs
  ## had not yet synced.  I'm also decreasing the acdirmin and acdirmax in our
  ## NFS settings to something like 5 seconds.
}

my $queue_array_opt = ($array_job == 1) ? "-J $jobstart-$jobend" : "";
if ($array_job == 1) { # It's an array job.
  # In the log-file name and in the command, the job name is replaced by the
  # PBS_ARRAY_INDEX shell variable, so it expands to the task index when the
  # generated script runs.
  $logfile =~ s/$jobname/\$PBS_ARRAY_INDEX/g;
  $cmd =~ s/$jobname/\$\{PBS_ARRAY_INDEX\}/g;
  # The log file in the q/ subdirectory is for the queue to put its log, and
  # this doesn't need the task array subscript, so we remove it.
  $queue_logfile =~ s/\.?$jobname//;
}

# queue_scriptfile is as $queue_logfile [e.g. dir/q/foo.log] but
# with the suffix .sh.
my $queue_scriptfile = $queue_logfile;
unless ($queue_scriptfile =~ s/\.[a-zA-Z]{1,5}$/.sh/) {
  $queue_scriptfile .= ".sh";
}
# make the path absolute, just in case.
$queue_scriptfile = $cwd . "/" . $queue_scriptfile unless ($queue_scriptfile =~ m:^/:);

# We write the job that we want executed into the script file
# $queue_scriptfile (via the file-handle Q); the path of that file is then
# passed to the qsub command assembled at the end of this section.
# The generated script changes to the current directory and sources ./path.sh
# before running the command.

# Name of the file the job touches when it is done; $$ (our process id) makes
# it specific to this invocation.
my $syncfile = "$qdir/done.$$";

# Remove any left-over queue log / sync file.
system("rm $queue_logfile $syncfile 2>/dev/null");
#
# Write to the script file, and then close it.
#
open(Q, ">$queue_scriptfile") || die "Failed to write to $queue_scriptfile";

print Q "#!/bin/bash\n";
print Q "cd $cwd\n";
print Q ". ./path.sh\n";
print Q "( echo '#' Running on \`hostname\`\n";
print Q "  echo '#' Started at \`date\`\n";
print Q "  echo -n '# '; cat <<EOF\n";
print Q "$cmd\n"; # this is a way of echoing the command into a comment in the log file,
print Q "EOF\n"; # without having to escape things like "|" and quote characters.
print Q ") >$logfile\n";
print Q "time1=\`date +\"%s\"\`\n";
print Q " ( $cmd ) 2>>$logfile >>$logfile\n";
print Q "ret=\$?\n";
print Q "time2=\`date +\"%s\"\`\n";
print Q "echo '#' Accounting: time=\$((\$time2-\$time1)) threads=$num_threads >>$logfile\n";
print Q "echo '#' Finished at \`date\` with status \$ret >>$logfile\n";
print Q "[ \$ret -eq 137 ] && exit 100;\n"; # If process was killed (e.g. oom) it will exit with status 137;
  # let the script return with status 100 which will put it to E state; more easily rerunnable.
if ($array_job == 0) { # not an array job
  print Q "touch $syncfile\n"; # so we know it's done.
} else {
  print Q "touch $syncfile.\$PBS_ARRAY_INDEX\n"; # touch a bunch of sync-files.
}
# Map any non-zero status to 1, to avoid status 100 which grid-engine treats
# specially.
print Q "exit \$[\$ret ? 1 : 0]\n";
print Q "## submitted with:\n";
# Complete the qsub command line; its own output goes to the queue log file.
$qsub_cmd .= "-o $queue_logfile $qsub_opts $queue_array_opt $queue_scriptfile >>$queue_logfile 2>&1";
print Q "# $qsub_cmd\n";
if (!close(Q)) { # close was not successful.
  die "Failed to close the script file (full disk?)";
}

# Submit the job.  With "-sync y" a status of 256 means the job itself ran and
# failed; any other non-zero status is reported as a submission error.
my $ret = system ($qsub_cmd);
if ($ret != 0) {
  my $job_itself_failed = ($sync && $ret == 256);
  if (!$job_itself_failed) {
    print STDERR "pbs.pl: error submitting jobs to queue (return status was $ret)\n";
    print STDERR "queue log file is $queue_logfile, command was $qsub_cmd\n";
    print STDERR `tail $queue_logfile`;
  } else {
    # For array jobs, show the log-file name as a glob pattern.
    if (defined $jobname) { $logfile =~ s/\$PBS_ARRAY_INDEX/*/g; }
    print STDERR "pbs.pl: job writing to $logfile failed\n";
  }
  exit(1);
}

my $pbs_job_id;
if (! $sync) { # We're not submitting with -sync y, so we
  # need to wait for the jobs to finish.  We wait for the
  # sync-files we "touched" in the script to exist.
  my @syncfiles = ();
  if (!defined $jobname) { # not an array job.
    push @syncfiles, $syncfile;
  } else {
    for (my $jobid = $jobstart; $jobid <= $jobend; $jobid++) {
      push @syncfiles, "$syncfile.$jobid";
    }
  }
  # We will need the pbs_job_id, to check that job still exists
  { # Get the PBS job-id from the log file in q/
    # Note: "or" (not "||") is required here; "||" would bind to the filename
    # argument and a failed open would go unnoticed.
    open(my $L, '<', $queue_logfile) or die "Error opening log file $queue_logfile";
    undef $pbs_job_id;
    while (<$L>) {
      if (/(\d+.+\.pbsserver)/) {
        if (defined $pbs_job_id) {
          die "Error: your job was submitted more than once (see $queue_logfile)";
        } else {
          $pbs_job_id = $1;
        }
      }
    }
    close $L;
    if (!defined $pbs_job_id) {
      die "Error: log file $queue_logfile does not specify the PBS job-id.";
    }
  }
  my $check_pbs_job_ctr=1;
  #
  my $wait = 0.1;
  my $counter = 0;
  foreach my $f (@syncfiles) {
    # wait for them to finish one by one.
    while (! -f $f) {
      sleep($wait);
      $wait *= 1.2;
      if ($wait > 3.0) {
        $wait = 3.0; # never wait more than 3 seconds.
        # the following (.kick) commands are basically workarounds for NFS bugs.
        if (rand() < 0.25) { # don't do this every time...
          if (rand() > 0.5) {
            system("touch $qdir/.kick");
          } else {
            system("rm $qdir/.kick 2>/dev/null");
          }
        }
        if ($counter++ % 10 == 0) {
          # This seems to kick NFS in the teeth to cause it to refresh the
          # directory.  I've seen cases where it would indefinitely fail to get
          # updated, even though the file exists on the server.
          # Only do this every 10 waits (every 30 seconds) though, or if there
          # are many jobs waiting they can overwhelm the file server.
          system("ls $qdir >/dev/null");
        }
      }

      # Check that the job exists in PBS. Job can be killed if duration
      # exceeds some hard limit, or in case of a machine shutdown.
      if (($check_pbs_job_ctr++ % 10) == 0) { # Don't run qstat too often, avoid stress on PBS.
        if ( -f $f ) { next; }; #syncfile appeared: OK.
        $ret = system("qstat -t $pbs_job_id >/dev/null 2>/dev/null");
        # system(...) : To get the actual exit value, shift $ret right by eight bits.
        if ($ret>>8 == 1) {     # Job does not seem to exist
          # Don't consider immediately missing job as error, first wait some
          # time to make sure it is not just delayed creation of the syncfile.

          sleep(3);
          # Sometimes NFS gets confused and thinks it's transmitted the directory
          # but it hasn't, due to timestamp issues.  Changing something in the
          # directory will usually fix that.
          system("touch $qdir/.kick");
          system("rm $qdir/.kick 2>/dev/null");
          if ( -f $f ) { next; }   #syncfile appeared, ok
          sleep(7);
          system("touch $qdir/.kick");
          sleep(1);
          system("rm $qdir/.kick 2>/dev/null");
          if ( -f $f ) {  next; }   #syncfile appeared, ok
          sleep(60);
          system("touch $qdir/.kick");
          sleep(1);
          system("rm $qdir/.kick 2>/dev/null");
          if ( -f $f ) { next; }  #syncfile appeared, ok
          $f =~ m/\.(\d+)$/ || die "Bad sync-file name $f";
          my $job_id = $1;
          # Work on a copy: $logfile must keep its \$PBS_ARRAY_INDEX
          # placeholder, because it is expanded once per job further down.
          my $job_logfile = $logfile;
          if (defined $jobname) {
            $job_logfile =~ s/\$PBS_ARRAY_INDEX/$job_id/g;
          }
          my $last_line = `tail -n 1 $job_logfile`;
          if ($last_line =~ m/status 0$/ && (-M $job_logfile) < 0) {
            # if the last line of the log ended with "status 0" and
            # the log is newer than this program [(-M file) gives the
            # time elapsed between file modification and the start of this
            # program], then we assume the program really finished OK,
            # and maybe something is up with the file system.
            print STDERR "**pbs.pl: syncfile $f was not created but job seems\n" .
              "**to have finished OK.  Probably your file-system has problems.\n" .
              "**This is just a warning.\n";
            last;
          } else {
            chop $last_line;
            print STDERR "pbs.pl: Error, unfinished job no " .
              "longer exists, log is in $job_logfile, last line is '$last_line', " .
              "syncfile is $f, return status of qstat was $ret\n" .
              "Possible reasons: a) Exceeded time limit? -> Use more jobs!" .
              " b) Shutdown/Frozen machine? -> Run again!\n";
            exit(1);
          }
        } elsif ($ret != 0) {
          print STDERR "pbs.pl: Warning: qstat command returned status $ret (qstat -t $pbs_job_id,$!)\n";
        }
      }
    }
  }
  # All jobs are accounted for; clean up the sync files.
  my $all_syncfiles = join(" ", @syncfiles);
  system("rm $all_syncfiles 2>/dev/null");
}

# OK, at this point we are synced; we know the job is done.
# But we don't know about its exit status.  We'll look at $logfile for this.
# First work out an array @logfiles of file-locations we need to
# read (just one, unless it's an array job).
my @logfiles = ();
if (defined $jobname) {
  for (my $jobid = $jobstart; $jobid <= $jobend; $jobid++) {
    (my $l = $logfile) =~ s/\$PBS_ARRAY_INDEX/$jobid/g;
    push @logfiles, $l;
  }
} else { # not an array job.
  push @logfiles, $logfile;
}

my $num_failed = 0;
my $status = 1;
foreach my $l (@logfiles) {
  my @wait_times = (0.1, 0.2, 0.2, 0.3, 0.5, 0.5, 1.0, 2.0, 5.0, 5.0, 5.0, 10.0, 25.0);
  # One attempt per wait time, plus a final attempt (marked by undef) after
  # which we give up.
  foreach my $pause (@wait_times, undef) {
    my $tail = `tail -10 $l 2>/dev/null`; # Note: although the status line should be the last
    # line of the file, I've seen cases where it was not quite the last line because
    # of delayed output by the process that was running, or processes it had called.
    # so tail -10 gives it a little leeway.
    if ($tail =~ m/with status (\d+)/) {
      $status = $1;
      last;
    }
    if (defined $pause) {
      sleep($pause);
      next;
    }
    # Out of retries.
    if (-f $l) {
      print STDERR "The last line of log-file $l does not seem to indicate the "
        . "return status as expected\n";
    } else {
      print STDERR "Log-file $l does not exist.\n";
    }
    exit(1);                # Something went wrong with the queue, or the
    # machine it was running on, probably.
  }
  # OK, now we have $status, which is the return-status of
  # the command in the job.
  $num_failed++ if ($status != 0);
}

exit(0) if ($num_failed == 0);

# we failed.
if (@logfiles != 1) {
  if (defined $jobname) { $logfile =~ s/\$PBS_ARRAY_INDEX/*/g; }
  my $numjobs = 1 + $jobend - $jobstart;
  print STDERR "pbs.pl: $num_failed / $numjobs failed, log is in $logfile\n";
} else {
  if (defined $jobname) { $logfile =~ s/\$PBS_ARRAY_INDEX/$jobstart/g; }
  print STDERR "pbs.pl: job failed with status $status, log is in $logfile\n";
  if ($logfile =~ m/JOB/) {
    print STDERR "pbs.pl: probably you forgot to put JOB=1:\$nj in your script.\n";
  }
}
exit(1);


================================================
FILE: egs/utils/parallel/queue.pl
================================================
#!/usr/bin/env perl
use strict;
use warnings;

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).
#           2014  Vimal Manohar (Johns Hopkins University)
# Apache 2.0.

use File::Basename;
use Cwd;
use Getopt::Long;

# queue.pl has the same functionality as run.pl, except that
# it runs the job in question on the queue (Sun GridEngine).
# This version of queue.pl uses the task array functionality
# of the grid engine.  Note: it's different from the queue.pl
# in the s4 and earlier scripts.

# The script now supports configuring the queue system using a config file
# (default: conf/queue.conf; a different file can be specified with the
# --config option) and a set of command line options.
# The current script handles:
# 1) Normal configuration arguments
# For e.g. a command line option of "--gpu 1" could be converted into the option
# "-q g.q -l gpu=1" to qsub. How the CLI option is handled is determined by a
# line in the config file like
# gpu=* -q g.q -l gpu=$0
# $0 here in the line is replaced with the argument read from the CLI and the
# resulting string is passed to qsub.
# 2) Special arguments to options such as
# gpu=0
# If --gpu 0 is given in the command line, then no special "-q" is given.
# 3) Default argument
# default gpu=0
# If --gpu option is not passed in the command line, then the script behaves as
# if --gpu 0 was passed since 0 is specified as the default argument for that
# option
# 4) Arbitrary options and arguments.
# Any command line option starting with '--' and its argument would be handled
# as long as its defined in the config file.
# 5) Default behavior
# If no --config option is given and the default config file is not readable,
# then the script behaves as if the queue had the following config file:
# $ cat conf/queue.conf
# # Default configuration
# command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
# option mem=* -l mem_free=$0,ram_free=$0
# option mem=0          # Do not add anything to qsub_opts
# option num_threads=* -pe smp $0
# option num_threads=1  # Do not add anything to qsub_opts
# option max_jobs_run=* -tc $0
# default gpu=0
# option gpu=0
# option gpu=* -l gpu=$0 -q '*.q'

# Options that are appended to the qsub command line; filled in while the
# command line and the config file are processed.
my $qsub_opts = "";
# Set to 1 when "-sync y" is passed, in which case qsub itself waits for the job.
my $sync = 0;
# Thread count written to the "Accounting" line of the log; overridden by "-pe".
my $num_threads = 1;
# Default shown for --gpu in the usage message.
my $gpu = 0;

# Config file used unless --config is given on the command line.
my $config = "conf/queue.conf";

# Values of the "--option value" switches, keyed by option name with the
# leading "--" removed and "-" replaced by "_".
my %cli_options = ();

# Array-job description parsed from an argument such as JOB=1:20.
my $jobname;        # the placeholder name, e.g. "JOB"
my $jobstart;       # first task index
my $jobend;         # last task index
my $array_job = 0;  # 1 once a JOB=... argument has been seen
# Numeric job id reported by qsub; also used by the signal handler to qdel the job.
my $sge_job_id;

# Print the command-line help for queue.pl on stderr and terminate the
# program with exit status 1.  The defaults shown are the current values of
# $config, $num_threads and $gpu.
sub print_usage() {
  my @usage_lines = (
    "Usage: queue.pl [options] [JOB=1:n] log-file command-line arguments...\n",
    "e.g.: queue.pl foo.log echo baz\n",
    " (which will echo \"baz\", with stdout and stderr directed to foo.log)\n",
    "or: queue.pl -q all.q\@xyz foo.log echo bar \| sed s/bar/baz/ \n",
    " (which is an example of using a pipe; you can provide other escaped bash constructs)\n",
    "or: queue.pl -q all.q\@qyz JOB=1:10 foo.JOB.log echo JOB \n",
    " (which illustrates the mechanism to submit parallel jobs; note, you can use \n",
    "  another string other than JOB)\n",
    "Note: if you pass the \"-sync y\" option to qsub, this script will take note\n",
    "and change its behavior.  Otherwise it uses qstat to work out when the job finished\n",
    "Options:\n",
    "  --config <config-file> (default: $config)\n",
    "  --mem <mem-requirement> (e.g. --mem 2G, --mem 500M, \n",
    "                           also support K and numbers mean bytes)\n",
    "  --num-threads <num-threads> (default: $num_threads)\n",
    "  --max-jobs-run <num-jobs>\n",
    "  --gpu <0|1> (default: $gpu)\n",
  );
  # Emit the whole text in a single write.
  print STDERR join("", @usage_lines);
  exit 1;
}

# Signal handler (installed for INT and TERM): if a job has already been
# submitted, delete it from the queue with qdel and exit with status 2.
# If no job id is known yet, the handler returns without doing anything.
#
# Perl passes the name of the signal (e.g. "INT") as the first argument of a
# %SIG handler; $! is the last system error and says nothing about the signal.
sub caught_signal {
  if ( defined $sge_job_id ) { # Signal trapped after submitting jobs
    my $signal = defined $_[0] ? $_[0] : "unknown";
    system ("qdel $sge_job_id");
    print STDERR "Caught a signal: $signal , deleting SGE task: $sge_job_id and exiting\n";
    exit(2);
  }
}

# At least a log file and a command are required.
if (@ARGV < 2) {
  print_usage();
}

# Consume the leading options and the optional JOB=1:n specifier.
#  - "-V" takes no argument and is forwarded to qsub.
#  - "-sync y" is forwarded and also switches this script to sync mode.
#  - "-pe <env> <n>" takes two arguments; <n> becomes $num_threads.
#  - "--name value" options are stored in %cli_options (name with "-" -> "_")
#    and translated later with the help of the config file.
#  - any other "-x value" pair is forwarded to qsub unchanged.
for (my $x = 1; $x <= 2; $x++) { # This for-loop is to
  # allow the JOB=1:n option to be interleaved with the
  # options to qsub.
  while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) {
    my $switch = shift @ARGV;

    if ($switch eq "-V") {
      $qsub_opts .= "-V ";
    } else {
      my $argument = shift @ARGV;
      if ($argument =~ m/^--/) {
        print STDERR "WARNING: suspicious argument '$argument' to $switch; starts with '-'\n";
      }
      if ($switch eq "-sync" && $argument =~ m/^[yY]/) {
        $sync = 1;
        $qsub_opts .= "$switch $argument ";
      } elsif ($switch eq "-pe") { # e.g. -pe smp 5
        my $argument2 = shift @ARGV;
        $qsub_opts .= "$switch $argument $argument2 ";
        $num_threads = $argument2;
      } elsif ($switch =~ m/^--/) { # Config options
        # Convert CLI option to variable name
        # by removing '--' from the switch and replacing any
        # '-' with a '_'
        $switch =~ s/^--//;
        $switch =~ s/-/_/g;
        $cli_options{$switch} = $argument;
      } else {  # Other qsub options - passed as is
        $qsub_opts .= "$switch $argument ";
      }
    }
  }
  if ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+):(\d+)$/) { # e.g. JOB=1:20
    $array_job = 1;
    $jobname = $1;
    $jobstart = $2;
    $jobend = $3;
    # Remember the specifier itself: once it is shifted off, $ARGV[0] is the
    # next argument, which is not what the error messages should show.
    my $job_spec = shift @ARGV;
    if ($jobstart > $jobend) {
      die "queue.pl: invalid job range $job_spec";
    }
    if ($jobstart <= 0) {
      die "queue.pl: invalid job range $job_spec, start must be strictly positive (this is a GridEngine limitation).";
    }
  } elsif ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+)$/) { # e.g. JOB=1.
    $array_job = 1;
    $jobname = $1;
    $jobstart = $2;
    $jobend = $2;
    shift;
  } elsif ($ARGV[0] =~ m/.+\=.*\:.*$/) {
    print STDERR "queue.pl: Warning: suspicious first argument to queue.pl: $ARGV[0]\n";
  }
}

# After the options there must still be a log file and a command.
if (@ARGV < 2) {
  print_usage();
}

# --config overrides the default location of the config file.
if (exists $cli_options{"config"}) {
  $config = $cli_options{"config"};
}

# Built-in configuration, used when no --config option was given and the
# default config file cannot be opened.
my $default_config_file = <<'EOF';
# Default configuration
command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
option mem=* -l mem_free=$0,ram_free=$0
option mem=0          # Do not add anything to qsub_opts
option num_threads=* -pe smp $0
option num_threads=1  # Do not add anything to qsub_opts
option max_jobs_run=* -tc $0
default gpu=0
option gpu=0
option gpu=* -l gpu=$0 -q '*.q'
EOF

# Here the configuration options specified by the user on the command line
# (e.g. --mem 2G) are converted to options to the qsub system as defined in
# the config file. (e.g. if the config file has the line
# "option mem=* -l ram_free=$0,mem_free=$0"
# and the user has specified '--mem 2G' on the command line, the options
# passed to queue system would be "-l ram_free=2G,mem_free=2G").
# A more detailed description of the ways the options would be handled is at
# the top of this file.

# From here on, INT/TERM delete an already-submitted job (see caught_signal).
$SIG{INT} = \&caught_signal;
$SIG{TERM} = \&caught_signal;

my $opened_config_file = 1;

open(CONFIG, "<", $config) or $opened_config_file = 0;

# Filled in while the config file is parsed:
#   %cli_config_options  - "option name=* ..." lines, with $0 already replaced
#   %cli_default_options - "option name=value ..." lines, keyed by (name, value)
my %cli_config_options = ();
my %cli_default_options = ();

if ($opened_config_file == 0 && exists($cli_options{"config"})) {
  # A config file that was explicitly asked for must be readable.
  print STDERR "Could not open config file $config\n";
  exit(1);
} elsif ($opened_config_file == 0 && !exists($cli_options{"config"})) {
  # Read the built-in default configuration instead.  It is read through an
  # in-memory filehandle rather than through "echo '...' |": the text itself
  # contains single quotes ('*.q'), which would terminate the shell quoting
  # and expose *.q to filename expansion in the current directory.
  open(CONFIG, "<", \$default_config_file)
    or die "Unable to read the built-in default configuration\n";
  $config = "Default config";
}

my $qsub_cmd = "";
my $read_command = 0;

# Go through the configuration one line at a time.  Four kinds of lines are
# understood ("command", "option x=*", "option x=value" and "default");
# anything else is a fatal error.
while (my $raw_line = <CONFIG>) {
  chomp $raw_line;
  # Strip comments from a working copy; the untouched line is kept for
  # error messages.
  (my $entry = $raw_line) =~ s/\s*#.*//g;
  next if $entry eq "";

  if ($entry =~ /^command (.+)/) {
    # The submission command itself, e.g. "command qsub -cwd ...".
    $qsub_cmd = $1 . " ";
    $read_command = 1;
  } elsif ($entry =~ m/^option ([^=]+)=\* (.+)$/) {
    # Template option, e.g.: option mem=* -l mem_free=$0,ram_free=$0
    # where $0 stands for the value given on the command line.
    my ($opt_name, $template) = ($1, $2);
    die "Unable to parse line '$raw_line' in config file ($config)\n"
      if $template !~ m:\$0:;
    if (exists $cli_options{$opt_name}) {
      # e.g. "-l mem_free=$0,ram_free=$0" -> "-l mem_free=2G,ram_free=2G"
      $template =~ s/\$0/$cli_options{$opt_name}/g;
      $cli_config_options{$opt_name} = $template;
    }
  } elsif ($entry =~ m/^option ([^=]+)=(\S+)\s?(.*)$/) {
    # Fixed-value option, e.g.: option gpu=0 -q all.q
    # (the qsub arguments after the value may be empty).
    my ($opt_name, $opt_value, $opt_args) = ($1, $2, $3);
    if (exists $cli_options{$opt_name}) {
      $cli_default_options{($opt_name,$opt_value)} = $opt_args;
    }
  } elsif ($entry =~ m/^default (\S+)=(\S+)/) {
    # Default value, e.g.: default gpu=0
    # Only takes effect if the option is not already set.
    my ($opt_name, $opt_value) = ($1, $2);
    $cli_options{$opt_name} = $opt_value unless exists $cli_options{$opt_name};
  } else {
    print STDERR "queue.pl: unable to parse line '$raw_line' in config file ($config)\n";
    exit(1);
  }
}

close(CONFIG);

# Without a "command" line we would not know what to run.
unless ($read_command == 1) {
  print STDERR "queue.pl: config file ($config) does not contain the line \"command .*\"\n";
  exit(1);
}

# Translate every "--option value" pair into qsub arguments.  An exact
# (option, value) entry from the config file wins over the "option=*"
# template; an option with neither is a fatal error.
foreach my $opt_name (keys %cli_options) {
  next if $opt_name eq "config";
  # max_jobs_run is skipped unless this is an array job.
  next if $opt_name eq "max_jobs_run" && $array_job != 1;

  my $opt_value = $cli_options{$opt_name};

  if (exists $cli_default_options{($opt_name,$opt_value)}) {
    $qsub_opts .= $cli_default_options{($opt_name,$opt_value)} . " ";
  } elsif (exists $cli_config_options{$opt_name}) {
    $qsub_opts .= $cli_config_options{$opt_name} . " ";
  } else {
    $config = "default config file" if $opened_config_file == 0;
    die "queue.pl: Command line option $opt_name not described in $config (or value '$opt_value' not allowed)\n";
  }
}

# The job script will "cd" back to the directory queue.pl was started in.
my $cwd = getcwd();
# First remaining argument is the log file; everything after it is the command.
my $logfile = shift @ARGV;

# An array job with more than one task needs the job name (e.g. JOB) in the
# log-file name, otherwise all tasks would write to the same file.
if ($array_job == 1 && $logfile !~ m/$jobname/
    && $jobend > $jobstart) {
  print STDERR "queue.pl: you are trying to run a parallel job but "
    . "you are putting the output into just one log file ($logfile)\n";
  exit(1);
}

#
# Build the command string from the remaining arguments, quoting as we go.
# The quoting rules are a heuristic: arguments such as "|" are meant to be
# interpreted by bash and are left alone, whereas arguments containing
# spaces (e.g. archive specifiers like 'ark:gunzip -c foo.gz|') are meant to
# reach the program as a single quoted word.  This doesn't always work.
#
my $cmd = "";

for my $arg (@ARGV) {
  my $piece;
  if ($arg =~ m/^\S+$/) {
    # No whitespace at all: use the argument as it is.
    $piece = $arg;
  } elsif ($arg =~ m:\":) {
    # Contains a double quote: wrap it in single quotes.
    $piece = "'$arg'";
  } else {
    # Otherwise wrap it in double quotes.
    $piece = "\"$arg\"";
  }
  $cmd .= $piece . " ";
}

#
# Work out where the queue's own files go: the "q" directory next to the
# log file holds the generated script, the log written by qsub, and the
# sync files.
#
my $dir = dirname($logfile);
my $base = basename($logfile);
my $qdir = "$dir/q";
$qdir =~ s:/(log|LOG)/*q:/q:; # If qdir ends in .../log/q, make it just .../q.
my $queue_logfile = "$qdir/$base";

# Make sure the directory of the log file exists.
if (!-d $dir) { system "mkdir -p $dir 2>/dev/null"; } # another job may be doing this...
if (!-d $dir) { die "Cannot make the directory $dir\n"; }
# make a directory called "q",
# where we will put the log created by qsub... normally this doesn't contain
# anything interesting, everything goes to $logfile.
# in $qdir/sync we'll put the done.* files... we try to keep this
# directory small because it's transmitted over NFS many times.
if (! -d "$qdir/sync") {
  system "mkdir -p $qdir/sync 2>/dev/null";
  sleep(5); ## This is to fix an issue we encountered in denominator lattice creation,
  ## where if e.g. the exp/tri2b_denlats/log/15/q directory had just been
  ## created and the job immediately ran, it would die with an error because nfs
  ## had not yet synced.  I'm also decreasing the acdirmin and acdirmax in our
  ## NFS settings to something like 5 seconds.
}

# For array jobs, add the "-t start:end" option and replace the job name in
# the log-file name and in the command by the task-id variable.
my $queue_array_opt = "";
if ($array_job == 1) { # It's an array job.
  $queue_array_opt = "-t $jobstart:$jobend";
  $logfile =~ s/$jobname/\$SGE_TASK_ID/g; # This variable will get
  # replaced by qsub, in each job, with the job-id.
  $cmd =~ s/$jobname/\$\{SGE_TASK_ID\}/g; # same for the command...
  $queue_logfile =~ s/\.?$jobname//; # the log file in the q/ subdirectory
  # is for the queue to put its log, and this doesn't need the task array subscript
  # so we remove it.
}

# queue_scriptfile is as $queue_logfile [e.g. dir/q/foo.log] but
# with the suffix .sh.
my $queue_scriptfile = $queue_logfile;
# Replace a short alphabetic extension by ".sh"; if there is none, append ".sh".
($queue_scriptfile =~ s/\.[a-zA-Z]{1,5}$/.sh/) || ($queue_scriptfile .= ".sh");
# qsub is given an absolute path to the script.
if ($queue_scriptfile !~ m:^/:) {
  $queue_scriptfile = $cwd . "/" . $queue_scriptfile; # just in case.
}

# We write the job that we want the queue to execute into a bash script
# ($queue_scriptfile, through the file-handle Q); that script is what gets
# submitted with qsub below.
# The script changes back to the current directory and sources ./path.sh.

# File the job touches when it is done; the process id of this script
# ($$) is part of the name.
my $syncfile = "$qdir/sync/done.$$";

# Remove leftovers of an earlier run.
unlink($queue_logfile, $syncfile);
#
# Write to the script file, and then close it.
#
open(Q, ">$queue_scriptfile") || die "Failed to write to $queue_scriptfile";

print Q "#!/bin/bash\n";
print Q "cd $cwd\n";
print Q ". ./path.sh\n";
# Log header: host, start time and the command itself (as a comment).
print Q "( echo '#' Running on \`hostname\`\n";
print Q "  echo '#' Started at \`date\`\n";
print Q "  echo -n '# '; cat <<EOF\n";
print Q "$cmd\n"; # this is a way of echoing the command into a comment in the log file,
print Q "EOF\n"; # without having to escape things like "|" and quote characters.
print Q ") >$logfile\n";
# Run the command with stdout and stderr appended to the log, and time it.
print Q "time1=\`date +\"%s\"\`\n";
print Q " ( $cmd ) 2>>$logfile >>$logfile\n";
print Q "ret=\$?\n";
print Q "time2=\`date +\"%s\"\`\n";
print Q "echo '#' Accounting: time=\$((\$time2-\$time1)) threads=$num_threads >>$logfile\n";
# This "with status N" line is what queue.pl later parses to get the exit status.
print Q "echo '#' Finished at \`date\` with status \$ret >>$logfile\n";
print Q "[ \$ret -eq 137 ] && exit 100;\n"; # If process was killed (e.g. oom) it will exit with status 137;
  # let the script return with status 100 which will put it to E state; more easily rerunnable.
if ($array_job == 0) { # not an array job
  print Q "touch $syncfile\n"; # so we know it's done.
} else {
  print Q "touch $syncfile.\$SGE_TASK_ID\n"; # touch a bunch of sync-files.
}
print Q "exit \$[\$ret ? 1 : 0]\n"; # avoid status 100 which grid-engine
print Q "## submitted with:\n";       # treats specially.
# Complete the qsub command line; its output is appended to the queue log,
# and the command is recorded as a comment at the end of the script.
$qsub_cmd .= "-o $queue_logfile $qsub_opts $queue_array_opt $queue_scriptfile >>$queue_logfile 2>&1";
print Q "# $qsub_cmd\n";
if (!close(Q)) { # close was not successful
  die "Failed to close the script file (full disk?)";
}
chmod 0755, $queue_scriptfile;

# This block submits the job to the queue.  Submission is attempted up to
# four times; a retry only happens when the qsub output looks like a problem
# reaching the queue master.  Any other failure ends the script immediately.
my $qsub_succeeded = 0;
for (my $try = 1; $try < 5; $try++) {
  my $ret = system ($qsub_cmd);
  if ($ret != 0) {
    if ($sync && $ret == 256) { # this is the exit status when a job failed (bad exit status)
      if (defined $jobname) {
        $logfile =~ s/\$SGE_TASK_ID/*/g;
      }
      print STDERR "queue.pl: job writing to $logfile failed\n";
      exit(1);
    } else {
      print STDERR "queue.pl: Error submitting jobs to queue (return status was $ret)\n";
      print STDERR "queue log file is $queue_logfile, command was $qsub_cmd\n";
      my $err = `tail $queue_logfile`;
      print STDERR "Output of qsub was: $err\n";
      if ($err =~ m/gdi request/ || $err =~ m/qmaster/) {
        # When we get queue connectivity problems we usually see a message like:
        # Unable to run job: failed receiving gdi request response for mid=1 (got
        # syncron message receive timeout error)..
        my $waitfor = 20;
        print STDERR "queue.pl: It looks like the queue master may be inaccessible. " .
          " Trying again after $waitfor seconts\n";
        sleep($waitfor);
        # ... and continue through the loop.
      } else {
        exit(1);
      }
    }
  } else {
    $qsub_succeeded = 1;
    last;  # break from the loop.
  }
}
# If every attempt failed, stop here instead of carrying on as though the
# job had been submitted (the code below would wait for, or try to read,
# files that will never be written).
if (!$qsub_succeeded) {
  print STDERR "queue.pl: giving up, could not submit the job to the queue " .
    "(queue log file is $queue_logfile)\n";
  exit(1);
}

if (! $sync) { # We're not submitting with -sync y, so we
  # need to wait for the jobs to finish.  We wait for the
  # sync-files we "touched" in the script to exist.
  my @syncfiles = ();
  if (!defined $jobname) { # not an array job.
    push @syncfiles, $syncfile;
  } else {
    for (my $jobid = $jobstart; $jobid <= $jobend; $jobid++) {
      push @syncfiles, "$syncfile.$jobid";
    }
  }
  # We will need the sge_job_id, to check that job still exists
  { # This block extracts the numeric SGE job-id from the log file in q/.
    # It may be used later to query 'qstat' about the job.
    open(L, "<$queue_logfile") || die "Error opening log file $queue_logfile";
    undef $sge_job_id;
    while (<L>) {
      if (m/Your job\S* (\d+)[. ].+ has been submitted/) {
        if (defined $sge_job_id) {
          die "Error: your job was submitted more than once (see $queue_logfile)";
        } else {
          $sge_job_id = $1;
        }
      }
    }
    close(L);
    if (!defined $sge_job_id) {
      die "Error: log file $queue_logfile does not specify the SGE job-id.";
    }
  }
  my $check_sge_job_ctr=1;

  # The wait between checks grows by 20% each time up to a cap of 3 seconds.
  # NOTE(review): Perl's built-in sleep is documented as taking a whole number
  # of seconds, so the initial sub-second waits are probably not real pauses
  # -- confirm before relying on the timing.
  my $wait = 0.1;
  my $counter = 0;
  foreach my $f (@syncfiles) {
    # wait for the jobs to finish one by one.
    while (! -f $f) {
      sleep($wait);
      $wait *= 1.2;
      if ($wait > 3.0) {
        $wait = 3.0; # never wait more than 3 seconds.
        # the following (.kick) commands are basically workarounds for NFS bugs.
        if (rand() < 0.25) { # don't do this every time...
          if (rand() > 0.5) {
            system("touch $qdir/sync/.kick");
          } else {
            unlink("$qdir/sync/.kick");
          }
        }
        if ($counter++ % 10 == 0) {
          # This seems to kick NFS in the teeth to cause it to refresh the
          # directory.  I've seen cases where it would indefinitely fail to get
          # updated, even though the file exists on the server.
          # Only do this every 10 waits (every 30 seconds) though, or if there
          # are many jobs waiting they can overwhelm the file server.
          system("ls $qdir/sync >/dev/null");
        }
      }

      # The purpose of the next block is so that queue.pl can exit if the job
      # was killed without terminating.  It's a bit complicated because (a) we
      # don't want to overload the qmaster by querying it too frequently), and
      # (b) sometimes the qmaster is unreachable or temporarily down, and we
      # don't want this to necessarily kill the job.
      if (($check_sge_job_ctr < 100 && ($check_sge_job_ctr++ % 10) == 0) ||
          ($check_sge_job_ctr >= 100 && ($check_sge_job_ctr++ % 50) == 0)) {
        # Don't run qstat too often, avoid stress on SGE; the if-condition above
        # is designed to check every 10 waits at first, and eventually every 50
        # waits.
        if ( -f $f ) { next; }  #syncfile appeared: OK.
        my $output = `qstat -j $sge_job_id 2>&1`;
        my $ret = $?;
        if ($ret >> 8 == 1 && $output !~ m/qmaster/ &&
            $output !~ m/gdi request/) {
          # Don't consider immediately missing job as error, first wait some
          # time to make sure it is not just delayed creation of the syncfile.

          sleep(3);
          # Sometimes NFS gets confused and thinks it's transmitted the directory
          # but it hasn't, due to timestamp issues.  Changing something in the
          # directory will usually fix that.
          system("touch $qdir/sync/.kick");
          unlink("$qdir/sync/.kick");
          if ( -f $f ) { next; }   #syncfile appeared, ok
          sleep(7);
          system("touch $qdir/sync/.kick");
          sleep(1);
          # (the path needs the $qdir variable; a literal "qdir/..." would
          # never remove the .kick file that was just touched.)
          unlink("$qdir/sync/.kick");
          if ( -f $f ) {  next; }   #syncfile appeared, ok
          sleep(60);
          system("touch $qdir/sync/.kick");
          sleep(1);
          unlink("$qdir/sync/.kick");
          if ( -f $f ) { next; }  #syncfile appeared, ok
          $f =~ m/\.(\d+)$/ || die "Bad sync-file name $f";
          my $job_id = $1;
          # Work on a copy of the log-file name: $logfile must keep its
          # \$SGE_TASK_ID placeholder, because the per-task log names of all
          # tasks are derived from it after this loop.
          my $job_logfile = $logfile;
          if (defined $jobname) {
            $job_logfile =~ s/\$SGE_TASK_ID/$job_id/g;
          }
          my $last_line = `tail -n 1 $job_logfile`;
          if ($last_line =~ m/status 0$/ && (-M $job_logfile) < 0) {
            # if the last line of the log ended with "status 0" and
            # the log is newer than this program [(-M file) gives the
            # time elapsed between file modification and the start of this
            # program], then we assume the program really finished OK,
            # and maybe something is up with the file system.
            print STDERR "**queue.pl: syncfile $f was not created but job seems\n" .
              "**to have finished OK.  Probably your file-system has problems.\n" .
              "**This is just a warning.\n";
            last;
          } else {
            chop $last_line;
            print STDERR "queue.pl: Error, unfinished job no " .
              "longer exists, log is in $job_logfile, last line is '$last_line', " .
              "syncfile is $f, return status of qstat was $ret\n" .
              "Possible reasons: a) Exceeded time limit? -> Use more jobs!" .
              " b) Shutdown/Frozen machine? -> Run again!  Qmaster output " .
              "was: $output\n";
            exit(1);
          }
        } elsif ($ret != 0) {
          print STDERR "queue.pl: Warning: qstat command returned status $ret (qstat -j $sge_job_id,$!)\n";
          print STDERR "queue.pl: output was: $output";
        }
      }
    }
  }
  # All jobs are accounted for; the sync files are no longer needed.
  unlink(@syncfiles);
}

# The job has finished at this point, but its exit status is still unknown;
# it is read back from the log file(s).  Start by listing the log files to
# inspect: a single one, or one per task for an array job.
my @logfiles = ();
if (defined $jobname) {
  for (my $task = $jobstart; $task <= $jobend; $task++) {
    (my $task_log = $logfile) =~ s/\$SGE_TASK_ID/$task/g;
    push @logfiles, $task_log;
  }
} else { # not an array job.
  push @logfiles, $logfile;
}

my $num_failed = 0;
my $status = 1;
foreach my $log (@logfiles) {
  my @wait_times = (0.1, 0.2, 0.2, 0.3, 0.5, 0.5, 1.0, 2.0, 5.0, 5.0, 5.0, 10.0, 25.0);
  # One attempt per wait time, plus a last attempt once all waits are used up.
  for (my $attempt = 0; $attempt <= @wait_times; $attempt++) {
    # The status line should be the very last line, but output of the job
    # (or of processes it started) can arrive late and land after it, so a
    # few more lines are examined.
    my $tail = `tail -10 $log 2>/dev/null`;
    if ($tail =~ m/with status (\d+)/) {
      $status = $1;
      last;
    }
    if ($attempt < @wait_times) {
      sleep($wait_times[$attempt]);
      next;
    }
    # Out of attempts: something probably went wrong with the queue or with
    # the machine the job was running on.
    if (-f $log) {
      print STDERR "The last line of log-file $log does not seem to indicate the "
        . "return status as expected\n";
    } else {
      print STDERR "Log-file $log does not exist.\n";
    }
    exit(1);
  }
  # $status now holds the return status of the command run by this job.
  $num_failed++ if $status != 0;
}

exit(0) if $num_failed == 0;

# At least one job failed; say where to look.
if (@logfiles == 1) {
  $logfile =~ s/\$SGE_TASK_ID/$jobstart/g if defined $jobname;
  print STDERR "queue.pl: job failed with status $status, log is in $logfile\n";
  if ($logfile =~ m/JOB/) {
    print STDERR "queue.pl: probably you forgot to put JOB=1:\$nj in your script.\n";
  }
} else {
  $logfile =~ s/\$SGE_TASK_ID/*/g if defined $jobname;
  my $numjobs = 1 + $jobend - $jobstart;
  print STDERR "queue.pl: $num_failed / $numjobs failed, log is in $logfile\n";
}
exit(1);


================================================
FILE: egs/utils/parallel/retry.pl
================================================
#!/usr/bin/env perl
use strict;
use warnings;

# Copyright 2018  Johns Hopkins University (Author: Daniel Povey).
# Apache 2.0.

use File::Basename;
use Cwd;
use Getopt::Long;


# retry.pl is a wrapper for queue.pl.  It can be used to retry jobs that failed,
# e.g. if your command line was "queue.pl [args]", you can replace that
# with "retry.pl queue.pl [args]" and it will retry jobs that failed.


# Total number of times the wrapped command is run before giving up
# (can be changed with --num-tries).
my $num_tries = 2;

# Print the command-line help for retry.pl on stderr and exit with status 1.
sub print_usage() {
  print STDERR
    "Usage: retry.pl  <some-other-wrapper-script> <rest-of-command>\n" .
    "  e.g.:  retry.pl [options] queue.pl foo.log do_something\n" .
    "This will retry jobs that failed (only once)\n" .
    "Options:\n" .
    "      --num-tries <n>        # default: 2\n";
  exit 1;
}

# Handle the optional leading "--num-tries <n>".  @ARGV is checked before it
# is indexed so that running the script without arguments, or with
# --num-tries as the last argument, gives a clear message instead of
# "uninitialized value" warnings.
if (@ARGV > 0 && $ARGV[0] eq "--num-tries") {
  shift;
  if (@ARGV == 0) {
    die "$0: option --num-tries requires an argument";
  }
  $num_tries =  $ARGV[0] + 0;
  if ($num_tries < 1) {
    die "$0: invalid option --num-tries $ARGV[0]";
  }
  shift;
}

# What remains must be at least: wrapper script, log file, command.
if (@ARGV < 3) {
  print_usage();
}


# Work out which element of @ARGV is the log file of the wrapped command.
# $ARGV[0] is the wrapper script itself (e.g. queue.pl), so the search starts
# at index 1.  Returns the log-file name; if none can be identified, prints
# an error on stderr and exits with status 1.
sub get_log_file {
  my $n;
  # First just look for the first command-line arg that ends in ".log".  If that
  # exists, it's almost certainly the log file.
  for ($n = 1; $n < @ARGV; $n++) {
    if ($ARGV[$n] =~ m/\.log$/) {
      return $ARGV[$n];
    }
  }
  for ($n = 1; $n < @ARGV; $n++) {
    # If this arg isn't of the form "-some-option', and isn't of the form
    # "JOB=1:10", and the previous arg wasn't of the form "-some-option", and this
    # isn't just a number (note: the 'not-a-number' things is mostly to exclude
    # things like the 5 in "-pe smp 5" which is an older but still-supported
    # option to queue.pl)... then assume it's a log file.
    # (The first test must be m/^-/: with m/^-=/ an option such as "--mem"
    # directly after the wrapper script would be taken for the log file.)
    if ($ARGV[$n] !~ m/^-/ &&  $ARGV[$n] !~ m/=/ && $ARGV[$n] !~ m/^\d+$/ &&
        $ARGV[$n-1] !~ m/^-/) {
      return $ARGV[$n];
    }
  }
  print STDERR "$0: failed to parse log-file name from args:" . join(" ", @ARGV);
  exit(1);
}


my $log_file = get_log_file();
my $return_status;

# Run the wrapped command up to $num_tries times.
foreach my $attempt (1 .. $num_tries) {
  system(@ARGV);
  $return_status = $?;

  # Success: nothing more to do.
  exit(0) if $return_status == 0;

  # queue.pl and similar scripts exit(1) on a normal error, which shows up as
  # 256 in perl's $? variable (see
  # http://perldoc.perl.org/perlvar.html#%24CHILD_ERROR).  Anything else means
  # the command did not "die normally" -- e.g. the user pressed ctrl-c or
  # KILLed the script, which 'caught_signal' in queue.pl turns into exit
  # status 2 -- and in that case we stop immediately.
  exit(1) if $return_status != 256;

  # Nothing to prepare after the last attempt.
  next unless $attempt < $num_tries;

  if (-f $log_file) {
    # Keep the log of the failed attempt out of the way of the next one.
    rename($log_file, $log_file . ".bak");
    print STDERR "$0: job failed; renaming log file to ${log_file}.bak and rerunning\n";
  } else {
    # $log_file doesn't exist as a file.  Maybe it was an array job, which
    # this script doesn't specifically support: it cannot tell which tasks
    # failed, so it only reports the problem here.
    print STDERR "$0: job failed and log file $log_file does not exist (array job?).\n";
  }
}

print STDERR "$0: job failed $num_tries times; log is in $log_file\n";
exit(1);


================================================
FILE: egs/utils/parallel/run.pl
================================================
#!/usr/bin/env perl
use warnings; #sed replacement for -w perl parameter
# In general, doing
#  run.pl some.log a b c is like running the command a b c in
# the bash shell, and putting the standard error and output into some.log.
# To run parallel jobs (backgrounded on the host machine), you can do (e.g.)
#  run.pl JOB=1:4 some.JOB.log a b c JOB is like running the command a b c JOB
# and putting it in some.JOB.log, for each one. [Note: JOB can be any identifier].
# If any of the jobs fails, this script will fail.

# A typical example is:
#  run.pl some.log my-prog "--opt=foo bar" foo \|  other-prog baz
# and run.pl will run something like:
# ( my-prog '--opt=foo bar' foo |  other-prog baz ) >& some.log
#
# Basically it takes the command-line arguments, quotes them
# as necessary to preserve spaces, and evaluates them with bash.
# In addition it puts the command line at the top of the log, and
# the start and end times of the command at the beginning and end.
# The reason why this is useful is so that we can create a different
# version of this program that uses a queueing system instead.

#use Data::Dumper;

# At least a log file and a command are required.
@ARGV < 2 && die "usage: run.pl log-file command-line arguments...";

#print STDERR "COMMAND-LINE: " .  Dumper(\@ARGV) . "\n";
# Defaults for the settings that the option parsing below may change.
$job_pick = 'all';    # value of --pick: 'all', 'failed' or 'incomplete'
$max_jobs_run = -1;   # value of --max-jobs-run / -tc; -1 means none was given
$jobstart = 1;        # presumably the first and last index of the JOB=a:b
$jobend = 1;          # range -- the code that sets them is not shown here
$ignored_opts = ""; # These will be ignored.

# First parse an option like JOB=1:4, and any
# options that would normally be given to
# queue.pl, which we will just discard.

for (my $x = 1; $x <= 2; $x++) { # This for-loop is to
  # allow the JOB=1:n option to be interleaved with the
  # options to qsub.
  while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) {
    # parse any options that would normally go to qsub, but which will be ignored here.
    my $switch = shift @ARGV;
    if ($switch eq "-V") {
      $ignored_opts .= "-V ";
    } elsif ($switch eq "--max-jobs-run" || $switch eq "-tc") {
      # we do support the option --max-jobs-run n, and its GridEngine form -tc n.
      # if the command appears multiple times uses the smallest option.
      if ( $max_jobs_run <= 0 ) {
          $max_jobs_run =  shift @ARGV;
      } else {
        my $new_constraint = shift @ARGV;
        if ( ($new_constraint < $max_jobs_run) ) {
          $max_jobs_run = $new_constraint;
        }
      }
      
      if (! ($max_jobs_run > 0)) {
        die "run.pl: invalid option --max-jobs-run $max_jobs_run";
      }
    } else {
      my $argument = shift @ARGV;
      if ($argument =~ m/^--/) {
        print STDERR "run.pl: WARNING: suspicious argument '$argument' to $switch; starts with '-'\n";
      }
      if ($switch eq "-sync" && $argument =~ m/^[yY]/) {
        $ignored_opts .= "-sync "; # Note: in the
        # corresponding code in queue.pl it says instead, just "$sync = 1;".
      } elsif ($switch eq "-pe") { # e.g. -pe smp 5
        my $argument2 = shift @ARGV;
        $ignored_opts .= "$switch $argument $argument2 ";
      } elsif ($switch eq "--gpu") {
        $using_gpu = $argument;
      } elsif ($switch eq "--pick") {
        if($argument =~ m/^(all|failed|incomplete)$/) {
          $job_pick = $argument;
        } else {
          print STDERR "run.pl: ERROR: --pick argument must be one of 'all', 'failed' or 'incomplete'"
        }
      } else {
        # Ignore option.
        $ignored_opts .= "$switch $argument ";
      }
    }
  }
  if ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+):(\d+)$/) { # e.g. JOB=1:20
    $jobname = $1;
    $jobstart = $2;
    $jobend = $3;
    if ($jobstart > $jobend) {
      die "run.pl: invalid job range $ARGV[0]";
    }
    if ($jobstart <= 0) {
      die "run.pl: invalid job range $ARGV[0], start must be strictly positive (this is required for GridEngine compatibility).";
    }
    shift;
  } elsif ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+)$/) { # e.g. JOB=1.
    $jobname = $1;
    $jobstart = $2;
    $jobend = $2;
    shift;
  } elsif ($ARGV[0] =~ m/.+\=.*\:.*$/) {
    print STDERR "run.pl: Warning: suspicious first argument to run.pl: $ARGV[0]\n";
  }
}

# Users found this message confusing so we are removing it.
# if ($ignored_opts ne "") {
#   print STDERR "run.pl: Warning: ignoring options \"$ignored_opts\"\n";
# }

# When --max-jobs-run was not given, derive the concurrency limit from the
# machine: the number of GPUs if --gpu was requested, otherwise the number
# of processors.
if ($max_jobs_run == -1) {
  $max_jobs_run = 0;
  if ($using_gpu) {
    # Each line printed by "nvidia-smi -L" is counted as one GPU.
    if (open(P, "nvidia-smi -L |")) {
      while (<P>) { $max_jobs_run++; }
      close(P);
    }
    unless ($max_jobs_run != 0) {
      $max_jobs_run = 1;
      print STDERR "run.pl: Warning: failed to detect number of GPUs from nvidia-smi, using ${max_jobs_run}\n";
    }
  } elsif (open(P, "</proc/cpuinfo")) {  # Linux
    # One "processor" line per logical CPU.
    while (<P>) {
      $max_jobs_run++ if m/^processor/;
    }
    close(P);
    unless ($max_jobs_run != 0) {
      print STDERR "run.pl: Warning: failed to detect any processors from /proc/cpuinfo\n";
      $max_jobs_run = 10;  # reasonable default.
    }
  } elsif (open(P, "sysctl -a |")) {  # BSD/Darwin
    # Take the first line that looks like "hw.ncpu = 4" or "hw.ncpu: 4".
    while (<P>) {
      next unless m/hw\.ncpu\s*[:=]\s*(\d+)/;
      $max_jobs_run = $1;
      last;
    }
    close(P);
    unless ($max_jobs_run != 0) {
      print STDERR "run.pl: Warning: failed to detect any processors from sysctl -a\n";
      $max_jobs_run = 10;  # reasonable default.
    }
  } else {
    # Non-UNIX systems: allow at most 32 jobs at once; edit this value if
    # a different default is needed.
    $max_jobs_run = 32;
  }
  # What we have now is (our best guess of) the processor count.  If the
  # number of jobs is only slightly larger than that, run them all at once
  # rather than leaving a small batch of leftovers for a second round.
  # GPU runs are never widened this way.
  $num_jobs = $jobend - $jobstart + 1;
  my $slightly_over = ($num_jobs > $max_jobs_run && $num_jobs < 1.4 * $max_jobs_run);
  if (!$using_gpu && $slightly_over) {
    $max_jobs_run = $num_jobs;
  }
}

sub pick_or_exit {
  # pick_or_exit ( $logfile )
  #
  # Called in the child process right before a job would be started.  It
  # looks at the log left behind by a previous attempt (if any) and either
  # returns, meaning "run this job", or calls exit, meaning "skip it".
  #
  # The decision depends on the global $job_pick (set by --pick):
  #   'all'         always run, the old log is not even opened.
  #   'failed'      run jobs whose log is missing, unfinished or reports a
  #                 non-zero code; skip (exit 0) jobs that ended with code 0.
  #   'incomplete'  run jobs whose log is missing or has no "# Ended" line;
  #                 skip finished jobs, exiting 0 if they succeeded and 1 if
  #                 they failed, so the parent still sees the failure.
  #
  # Only the last "# Ended (code ...)" line found in the log is considered.
  #
  # Original logic by Alexandre Felipe (o.alexandre.felipe@gmail.com),
  # 14th of August of 2020.
  my ($previous_log) = @_;

  return if $job_pick eq 'all';  # no need to bother with the previous log

  open(my $log_fh, "<", $previous_log) or return;  # job not executed yet

  my $ended_line;
  while (my $line = <$log_fh>) {
    $ended_line = $line if $line =~ m/# Ended \(code .*/;
  }
  close $log_fh;

  return unless defined($ended_line);  # incomplete

  if ($ended_line =~ m/# Ended \(code 0\).*/) {
    exit(0);  # complete
  } elsif ($ended_line =~ m/# Ended \(code \d+(; signal \d+)?\).*/) {
    if ($job_pick =~ m/^(failed|all)$/) {
      return;  # failed, and failed jobs were asked for
    } else {
      exit(1);  # failed but not going to run
    }
  } elsif ($ended_line =~ m/.*\S.*/) {
    return;  # incomplete jobs are always run
  }
}


# The first remaining argument is the log file; the rest is the command.
$logfile = shift @ARGV;

# A real range of jobs needs the job name inside the log file name,
# otherwise all jobs would write to the same file.
if ($jobend > $jobstart && defined $jobname && $logfile !~ m/$jobname/) {
  print STDERR "run.pl: you are trying to run a parallel job but "
    . "you are putting the output into just one log file ($logfile)\n";
  exit(1);
}

# Rebuild the command line as a single string for bash.  Arguments without
# whitespace are passed through untouched; arguments containing a double
# quote are wrapped in single quotes; everything else in double quotes.
$cmd = "";

for my $arg (@ARGV) {
  my $piece = $arg =~ m/^\S+$/ ? $arg
            : $arg =~ m:\":    ? "'$arg'"
            :                    "\"$arg\"";
  $cmd .= $piece . " ";
}

# Main loop: fork one child per job id in [$jobstart, $jobend], keeping at
# most $max_jobs_run children alive at any time.
#   %active_pids  maps child pid -> job id for children not yet reaped
#   @pid          maps job id -> child pid
#   @fail         maps job id -> wait status ($?) of the finished child
#   $numfail      number of children reaped so far with a non-zero status
#$Data::Dumper::Indent=0;
$ret = 0;
$numfail = 0;
%active_pids=();

use POSIX ":sys_wait_h";
for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) {
  if (scalar(keys %active_pids) >= $max_jobs_run) {
    # We are at the concurrency limit: block until one child exits.

    # Lets wait for a change in any child's status
    # Then we have to work out which child finished
    $r = waitpid(-1, 0);
    $code = $?;
    if ($r < 0 ) { die "run.pl: Error waiting for child process"; } # should never happen.
    if ( defined $active_pids{$r} ) {
        $jid=$active_pids{$r};
        $fail[$jid]=$code;
        if ($code !=0) { $numfail++;}
        delete $active_pids{$r};
        # print STDERR "Finished: $r/$jid " .  Dumper(\%active_pids) . "\n";
    } else {
        die "run.pl: Cannot find the PID of the child process that just finished.";
    }

    # In theory we could do a non-blocking waitpid over all jobs running just
    # to find out if only one or more jobs finished during the previous waitpid()
    # However, we just omit this and will reap the next one in the next pass
    # through the for(;;) cycle
  }
  $childpid = fork();
  if (!defined $childpid) { die "run.pl: Error forking in run.pl (writing to $logfile)"; }
  if ($childpid == 0) { # We're in the child... this branch
    # executes the job and returns (possibly with an error status).
    if (defined $jobname) {
      # Substitute the numeric job id for the job name, both in the command
      # and in the log file name.
      $cmd =~ s/$jobname/$jobid/g;
      $logfile =~ s/$jobname/$jobid/g;
    }
    # exit if the job does not need to be executed
    pick_or_exit( $logfile );

    # Start a fresh log: the command line and the start time go first.
    system("mkdir -p `dirname $logfile` 2>/dev/null");
    open(F, ">$logfile") || die "run.pl: Error opening log file $logfile";
    print F "# " . $cmd . "\n";
    print F "# Started at " . `date`;
    $starttime = `date +'%s'`;
    print F "#\n";
    close(F);

    # Pipe into bash.. make sure we're not using any other shell.
    # Both stdout and stderr of the command are appended to the log.
    open(B, "|bash") || die "run.pl: Error opening shell command";
    print B "( " . $cmd . ") 2>>$logfile >> $logfile";
    close(B);                   # If there was an error, exit status is in $?
    $ret = $?;

    # Split the wait status into the exit code (high byte) and the
    # signal number (low 7 bits) for the "# Ended" line.
    $lowbits = $ret & 127;
    $highbits = $ret >> 8;
    if ($lowbits != 0) { $return_str = "code $highbits; signal $lowbits" }
    else { $return_str = "code $highbits"; }

    # Append the accounting footer; the "# Ended (code ...)" line has the
    # format that pick_or_exit looks for when --pick is used.
    $endtime = `date +'%s'`;
    open(F, ">>$logfile") || die "run.pl: Error opening log file $logfile (again)";
    $enddate = `date`;
    chop $enddate;
    print F "# Accounting: time=" . ($endtime - $starttime) . " threads=1\n";
    print F "# Ended ($return_str) at " . $enddate . ", elapsed time " . ($endtime-$starttime) . " seconds\n";
    close(F);
    # The child reports only success (0) or failure (1) to the parent.
    exit($ret == 0 ? 0 : 1);
  } else {
    # Parent: remember which job this child is running.
    $pid[$jobid] = $childpid;
    $active_pids{$childpid} = $jobid;
    # print STDERR "Queued: " .  Dumper(\%active_pids) . "\n";
  }
}

# Now we have submitted all the jobs, lets wait until all the jobs finish
foreach $child (keys %active_pids) {
    $jobid=$active_pids{$child};
    $r = waitpid($pid[$jobid], 0);
    $code = $?;
    if ($r == -1) { die "run.pl: Error waiting for child process"; } # should never happen.
    # The child was reaped: record its wait status, and count it as a
    # failure if that status is non-zero.
    if ($r != 0) { $fail[$jobid]=$code; $numfail++ if $code!=0; }
}

# Some sanity checks:
# The $fail array should not contain undefined codes
# The number of non-zeros in that array  should be equal to $numfail
# We cannot do foreach() here, as the JOB ids do not start at zero
$failed_jids=0;
for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) {
  $job_return = $fail[$jobid];
  if (not defined $job_return ) {
    # print Dumper(\@fail);

    die "run.pl: Sanity check failed: we have indication that some jobs are running " .
      "even after we waited for all jobs to finish" ;
  }
  if ($job_return != 0 ){ $failed_jids++;}
}
if ($failed_jids != $numfail) {
  die "run.pl: Sanity check failed: cannot find out how many jobs failed ($failed_jids x $numfail)."
}
if ($numfail > 0) { $ret = 1; }

# Tell the user where to look when something failed.
if ($ret != 0) {
  $njobs = $jobend - $jobstart + 1;
  if ($njobs == 1) {
    if (defined $jobname) {
      $logfile =~ s/$jobname/$jobstart/; # only one numbered job, so replace name with
                                         # that job.
    }
    print STDERR "run.pl: job failed, log is in $logfile\n";
    if ($logfile =~ m/JOB/) {
      # Terminate the hint with a newline so that whatever is printed next
      # on stderr does not get glued to it.
      print STDERR "run.pl: probably you forgot to put JOB=1:\$nj in your script.\n";
    }
  }
  else {
    # Several jobs: show the log file pattern with '*' in place of the job name.
    $logfile =~ s/$jobname/*/g;
    print STDERR "run.pl: $numfail / $njobs failed, log is in $logfile\n";
  }
}


exit ($ret);


================================================
FILE: egs/utils/parallel/slurm.pl
================================================
#!/usr/bin/env perl
use strict;
use warnings;

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).
#           2014  Vimal Manohar (Johns Hopkins University)
#           2015  Johns Hopkins University (Yenda Trmal <jtrmal@gmail.com>>)
# Apache 2.0.

use File::Basename;
use Cwd;
use Getopt::Long;

# slurm.pl was created from the queue.pl
# queue.pl has the same functionality as run.pl, except that
# it runs the job in question on the queue (Sun GridEngine).
# This version of queue.pl uses the task array functionality
# of the grid engine.  Note: it's different from the queue.pl
# in the s4 and earlier scripts.

# The script now supports configuring the queue system using a config file
# (default: conf/slurm.conf; a different file can be specified with the
# --config option) and a set of command line options.
# The current script handles:
# 1) Normal configuration arguments
# For e.g. a command line option of "--gpu 1" could be converted into the option
# "-q g.q -l gpu=1" to qsub. How the CLI option is handled is determined by a
# line in the config file like
# gpu=* -q g.q -l gpu=$0
# $0 here in the line is replaced with the argument read from the CLI and the
# resulting string is passed to qsub.
# 2) Special arguments to options such as
# gpu=0
# If --gpu 0 is given in the command line, then no special "-q" is given.
# 3) Default argument
# default gpu=0
# If --gpu option is not passed in the command line, then the script behaves as
# if --gpu 0 was passed since 0 is specified as the default argument for that
# option
# 4) Arbitrary options and arguments.
# Any command line option starting with '--' and its argument would be handled
# as long as its defined in the config file.
# 5) Default behavior
# If no --config option is given and the default config file is not readable,
# then the script behaves as if the queue has the following config file:
# $ cat conf/slurm.conf
# # Default configuration
# command sbatch --export=PATH  --ntasks-per-node=1
# option time=* --time $0
# option mem=* --mem-per-cpu $0
# option mem=0          # Do not add anything to qsub_opts
# option num_threads=* --cpus-per-task $0 --ntasks-per-node=1
# option num_threads=1 --cpus-per-task 1  --ntasks-per-node=1
# default gpu=0
# option gpu=0 -p shared
# option gpu=* -p gpu --gres=gpu:$0 --time 4:0:0  # this has to be figured out
# (The --max-jobs-run option is handled by the script itself and does not
# need a line in the config file.)

#print STDERR "$0 " . join(" ", @ARGV) . "\n";

# Script-wide state filled in while parsing the command line and config file.
my $qsub_opts = "";    # extra options accumulated for the submission command
my $sync = 0;          # set to 1 when "-sync y" is passed
my $num_threads = 1;   # overwritten by the slot count of "-pe <env> <n>"
my $max_jobs_run;      # from --max-jobs-run; stays undef if not given
my $gpu = 0;           # only used as the default shown in the usage message

my $config = "conf/slurm.conf";   # default config file; --config overrides it

my %cli_options = ();  # "--foo-bar value" options, keyed as "foo_bar"

# Set when a JOB=<start>:<end> (or JOB=<n>) argument is given.
my $jobname;
my $jobstart;
my $jobend;

my $array_job = 0;     # 1 when a JOB=... argument was given

sub print_usage() {
  # Print the usage message to stderr and terminate with exit status 1.
  # The message shows the current values of $config, $num_threads and $gpu
  # as the defaults.
  my @usage_lines = (
    "Usage: $0 [options] [JOB=1:n] log-file command-line arguments...\n",
    "e.g.: $0 foo.log echo baz\n",
    " (which will echo \"baz\", with stdout and stderr directed to foo.log)\n",
    "or: $0 -q all.q\@xyz foo.log echo bar \| sed s/bar/baz/ \n",
    " (which is an example of using a pipe; you can provide other escaped bash constructs)\n",
    "or: $0 -q all.q\@qyz JOB=1:10 foo.JOB.log echo JOB \n",
    " (which illustrates the mechanism to submit parallel jobs; note, you can use \n",
    "  another string other than JOB)\n",
    "Note: if you pass the \"-sync y\" option to qsub, this script will take note\n",
    "and change its behavior.  Otherwise it uses squeue to work out when the job finished\n",
    "Options:\n",
    "  --config <config-file> (default: $config)\n",
    "  --mem <mem-requirement> (e.g. --mem 2G, --mem 500M, \n",
    "                           also support K and numbers mean bytes)\n",
    "  --num-threads <num-threads> (default: $num_threads)\n",
    "  --max-jobs-run <num-jobs>\n",
    "  --gpu <0|1> (default: $gpu)\n",
  );
  print STDERR join("", @usage_lines);
  exit 1;
}

sub exec_command {
  # Run a shell command and return the list (output, exit code).
  #
  # The arguments are joined with single spaces into one command line, which
  # is executed through the shell with stderr merged into stdout.
  #   output     everything the command printed, as one string
  #   exit code  the command's exit value (the wait status shifted right by
  #              eight bits)
  #
  # The output is kept in a lexical variable so that the caller's $_ is left
  # untouched.
  my $command = join ' ', @_;
  my $output = `$command 2>&1`;
  # To get the actual exit value, shift right by eight bits.
  my $exit_code = $? >> 8;
  return ($output, $exit_code);
}

# Require at least a log file and a command.
if (@ARGV < 2) {
  print_usage();
}

for (my $x = 1; $x <= 3; $x++) { # This for-loop is to
  # allow the JOB=1:n option to be interleaved with the
  # options to qsub.
  while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) {
    my $switch = shift @ARGV;

    if ($switch eq "-V") {
      $qsub_opts .= "-V ";
    } else {
      # Every other switch is assumed to take one argument (two for -pe).
      my $argument = shift @ARGV;
      if ($argument =~ m/^--/) {
        print STDERR "WARNING: suspicious argument '$argument' to $switch; starts with '-'\n";
      }
      if ($switch eq "-sync" && $argument =~ m/^[yY]/) {
        $sync = 1;
        $qsub_opts .= "$switch $argument ";
      } elsif ($switch eq "-pe") { # e.g. -pe smp 5
        my $argument2 = shift @ARGV;
        $qsub_opts .= "$switch $argument $argument2 ";
        $num_threads = $argument2;
      } elsif ($switch =~ m/^--/) { # Config options
        # Convert CLI option to variable name
        # by removing '--' from the switch and replacing any
        # '-' with a '_'
        $switch =~ s/^--//;
        $switch =~ s/-/_/g;
        $cli_options{$switch} = $argument;
      } else {  # Other qsub options - passed as is
        $qsub_opts .= "$switch $argument ";
      }
    }
  }
  if ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+):(\d+)$/) { # e.g. JOB=1:20
    $array_job = 1;
    $jobname = $1;
    $jobstart = $2;
    $jobend = $3;
    # Validate the range before shifting it off @ARGV, so that the error
    # messages show the offending JOB=... argument and not the one after it.
    if ($jobstart > $jobend) {
      die "$0: invalid job range $ARGV[0]";
    }
    if ($jobstart <= 0) {
      die "$0: invalid job range $ARGV[0], start must be strictly positive (this is a GridEngine limitation).";
    }
    shift;
  } elsif ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+)$/) { # e.g. JOB=1.
    $array_job = 1;
    $jobname = $1;
    $jobstart = $2;
    $jobend = $2;
    shift;
  } elsif ($ARGV[0] =~ m/.+\=.*\:.*$/) {
    print STDERR "Warning: suspicious first argument to $0: $ARGV[0]\n";
  }
}

# After the options there must still be a log file and a command.
if (@ARGV < 2) {
  print_usage();
}

# A --config option on the command line overrides the default config path.
if (exists $cli_options{"config"}) {
  $config = $cli_options{"config"};
}

# Built-in configuration, used only when no --config was given and the
# default config file cannot be opened (see below).
my $default_config_file = <<'EOF';
# Default configuration
command sbatch --export=PATH  --ntasks-per-node=1
option time=* --time $0
option mem=* --mem-per-cpu $0
option mem=0          # Do not add anything to qsub_opts
option num_threads=* --cpus-per-task $0 --ntasks-per-node=1
option num_threads=1 --cpus-per-task 1  --ntasks-per-node=1 # Do not add anything to qsub_opts
default gpu=0
option gpu=0 -p shared
option gpu=* -p gpu --gres=gpu:$0 --time 4:0:0  # this has to be figured out
EOF

# note: the --max-jobs-run option is supported as a special case
# by slurm.pl and you don't have to handle it in the config file.

# Here the configuration options specified by the user on the command line
# (e.g. --mem 2G) are converted to options to the queue system as defined in
# the config file. (e.g. if the config file has the line
# "option mem=* -l ram_free=$0,mem_free=$0"
# and the user has specified '--mem 2G' on the command line, the options
# passed to queue system would be "-l ram_free=2G,mem_free=2G").
# A more detailed description of the ways the options would be handled is at
# the top of this file.

my $opened_config_file = 1;

# Try the config file; remember whether that worked instead of dying here.
open CONFIG, "<$config" or $opened_config_file = 0;

my %cli_config_options = ();   # option -> arguments from "option name=* ..." lines
my %cli_default_options = ();  # (option, value) -> arguments from "option name=value ..." lines

if ($opened_config_file == 0 && exists($cli_options{"config"})) {
  # A config file that was explicitly requested must be readable.
  print STDERR "Could not open config file $config\n";
  exit(1);
} elsif ($opened_config_file == 0 && !exists($cli_options{"config"})) {
  # Open the default config file instead
  # (the built-in text is fed through a pipe so that it can be read from
  # the same CONFIG handle as a real file).
  open (CONFIG, "echo '$default_config_file' |") or die "Unable to open pipe\n";
  $config = "Default config";
}

# Parse the configuration read from the CONFIG handle.  Recognised lines:
#   command <cmd>                  the submission command (required)
#   option <name>=* <args>         args used for --<name> <value>; "$0" in
#                                  <args> is replaced by <value>
#   option <name>=<value> [<args>] args used when --<name> is exactly <value>
#   default <name>=<value>         value assumed when --<name> is not given
# Anything from '#' to the end of a line is a comment.  Any other line is
# a fatal error.
my $qsub_cmd = "";
my $read_command = 0;

while(<CONFIG>) {
  chomp;
  my $line = $_;   # keep the unmodified line for error messages
  $_ =~ s/\s*#.*//g;
  if ($_ eq "") { next; }
  if ($_ =~ /^command (.+)/) {
    $read_command = 1;
    $qsub_cmd = $1 . " ";
  } elsif ($_ =~ m/^option ([^=]+)=\* (.+)$/) {
    # Config option that needs replacement with parameter value read from CLI
    # e.g.: option mem=* -l mem_free=$0,ram_free=$0
    my $option = $1;     # mem
    my $arg= $2;         # -l mem_free=$0,ram_free=$0
    if ($arg !~ m:\$0:) {
      print STDERR "Warning: the line '$line' in config file ($config) does not contain the substitution variable \$0\n";
    }
    if (exists $cli_options{$option}) {
      # Replace $0 with the argument read from command line.
      # e.g. "-l mem_free=$0,ram_free=$0" -> "-l mem_free=2G,ram_free=2G"
      $arg =~ s/\$0/$cli_options{$option}/g;
      $cli_config_options{$option} = $arg;
    }
  } elsif ($_ =~ m/^option ([^=]+)=(\S+)\s?(.*)$/) {
    # Config option that does not need replacement
    # e.g. option gpu=0 -q all.q
    my $option = $1;      # gpu
    my $value = $2;       # 0
    my $arg = $3;         # -q all.q
    # Only recorded for options that are set at this point (on the command
    # line, or by a "default" line read earlier).
    if (exists $cli_options{$option}) {
      $cli_default_options{($option,$value)} = $arg;
    }
  } elsif ($_ =~ m/^default (\S+)=(\S+)/) {
    # Default options. Used for setting default values to options i.e. when
    # the user does not specify the option on the command line
    # e.g. default gpu=0
    my $option = $1;  # gpu
    my $value = $2;   # 0
    if (!exists $cli_options{$option}) {
      # If the user has specified this option on the command line, then we
      # don't have to do anything
      $cli_options{$option} = $value;
    }
  } else {
    print STDERR "$0: unable to parse line '$line' in config file ($config)\n";
    exit(1);
  }
}

close(CONFIG);

if ($read_command != 1) {
  print STDERR "$0: config file ($config) does not contain the line \"command .*\"\n";
  exit(1);
}

# Translate every "--option value" given on the command line (or supplied by
# a "default" line of the config) into submission options.  Lookup order:
# an exact "option name=value" line, then the "option name=* ..." template,
# then a bare "option name=*" line.  An option that matches none of them is
# a fatal error.
for my $option (keys %cli_options) {
  if ($option eq "config") { next; }

  my $value = $cli_options{$option};

  if ($option eq "max_jobs_run") {
    # Handled by this script itself (see the --array option further down),
    # not through the config file.
    if ($array_job != 1) {
      print STDERR "Ignoring $option since this is not an array task.\n";
    } else {
      $max_jobs_run = $value;
    }
  } elsif (exists $cli_default_options{($option,$value)}) {
    $qsub_opts .= "$cli_default_options{($option,$value)} ";
  } elsif (exists $cli_config_options{$option}) {
    $qsub_opts .= "$cli_config_options{$option} ";
  } elsif (exists $cli_default_options{($option,"*")}) {
    $qsub_opts .= $cli_default_options{($option,"*")} . " ";
  } else {
    if ($opened_config_file == 0) {
      $config = "default config file";
    }
    die "$0: Command line option $option not described in $config (or value '$value' not allowed)\n";
  }
}

my $cwd = getcwd();
my $logfile = shift @ARGV;   # first remaining argument; the rest is the command

# A real range of jobs needs the job name inside the log file name,
# otherwise every task would write to the same file.
if ($array_job == 1 && $jobend > $jobstart && $logfile !~ m/$jobname/) {
  print STDERR "$0: you are trying to run a parallel job but "
    . "you are putting the output into just one log file ($logfile)\n";
  exit(1);
}

#
# Build the command string; quoting is decided here.
# The rules are a heuristic driven by what we need in practice.  Things such
# as "|" are meant to be interpreted by bash, so they must stay unquoted.
# Things such as the archive specifier 'ark:gunzip -c foo.gz|' are meant to
# reach the Kaldi program as one quoted argument.  So: anything containing
# whitespace gets quoted, everything else is taken as-is.  This doesn't
# always work.
#
my $cmd = "";

for my $arg (@ARGV) {
  my $piece = $arg =~ m/^\S+$/ ? $arg        # no whitespace: take as-is
            : $arg =~ m:\":    ? "'$arg'"    # contains a dbl-quote: use single quotes
            :                    "\"$arg\""; # otherwise: use double quotes
  $cmd .= $piece . " ";
}

#
# Derive the directories involved: $dir holds the job log, and $qdir (a "q"
# directory next to it) holds the submission script and the queue's own log.
#
my $dir = dirname($logfile);
my $base = basename($logfile);
my $qdir = "$dir/q";
$qdir =~ s:/(log|LOG)/*q:/q:; # If qdir ends in .../log/q, make it just .../q.
my $queue_logfile = "$qdir/$base";

# Create the log directory if needed (another job may be doing the same at
# this very moment, hence the second test instead of checking mkdir's status).
system("mkdir -p $dir 2>/dev/null") unless -d $dir;
die "Cannot make the directory $dir\n" unless -d $dir;

# The "q" directory receives the log written by the queue system; normally
# there is nothing interesting in it, everything goes to $logfile.
unless (-d "$qdir") {
  system("mkdir $qdir 2>/dev/null");
  # Give NFS time to catch up.  This fixes an issue seen in denominator
  # lattice creation: when e.g. exp/tri2b_denlats/log/15/q had only just
  # been created and the job started immediately, the job died because nfs
  # had not yet synced.  (Lowering acdirmin and acdirmax in the NFS settings
  # to something like 5 seconds was the other half of that fix.)
  sleep(5);
}

# For array jobs, build the --array option and replace the job name by the
# SLURM task-id variable wherever it appears.
my $queue_array_opt = "";
if ($array_job == 1) { # It's an array job.
  # "%<n>" limits the number of simultaneously running tasks; it is only
  # added when --max-jobs-run was given.
  $queue_array_opt = $max_jobs_run
    ? "--array ${jobstart}-${jobend}%${max_jobs_run}"
    : "--array ${jobstart}-${jobend}";
  # In the generated script the shell expands this variable to the task id
  # of each array task, for the log file name ...
  $logfile =~ s/$jobname/\$SLURM_ARRAY_TASK_ID/g;
  # ... and for the command itself.
  $cmd =~ s/$jobname/\$\{SLURM_ARRAY_TASK_ID\}/g;
  # The log in the q/ subdirectory belongs to the queue and needs no task
  # subscript, so the job name (and a preceding dot) is dropped from it.
  $queue_logfile =~ s/\.?$jobname//;
}

# queue_scriptfile is $queue_logfile [e.g. dir/q/foo.log] with its suffix
# replaced by .sh, or with .sh appended when there is no short alphabetic
# suffix to replace.
my $queue_scriptfile = $queue_logfile;
unless ($queue_scriptfile =~ s/\.[a-zA-Z]{1,5}$/.sh/) {
  $queue_scriptfile .= ".sh";
}
# Make the script path absolute, just in case.
$queue_scriptfile = $cwd . "/" . $queue_scriptfile unless $queue_scriptfile =~ m:^/:;

# Generate the bash script (file-handle Q, written to $queue_scriptfile)
# that the queue will execute, and finish building the submission command.
# The script changes to the current directory and sources ./path.sh before
# running the command.

# Marker file that the job touches when it is done; the name contains this
# process's pid ($$).  For array jobs the task id is appended.
my $syncfile = "$qdir/done.$$";

# Remove leftovers with the same names.
system("rm $queue_logfile $syncfile 2>/dev/null");
#
# Write to the script file, and then close it.
#
open(Q, ">$queue_scriptfile") || die "Failed to write to $queue_scriptfile";

print Q "#!/bin/bash\n";
print Q "cd $cwd\n";
print Q ". ./path.sh\n";
# Log header: host, start time, the SLURM_* variables and the command
# itself, each as a '#' comment line.  This (re)creates $logfile.
print Q "( echo '#' Running on \`hostname\`\n";
print Q "  echo '#' Started at \`date\`\n";
print Q "  set | grep SLURM | while read line; do echo \"# \$line\"; done\n";
print Q "  echo -n '# '; cat <<EOF\n";
print Q "$cmd\n"; # this is a way of echoing the command into a comment in the log file,
print Q "EOF\n"; # without having to escape things like "|" and quote characters.
print Q ") >$logfile\n";
# A CUDA_VISIBLE_DEVICES value of "NoDevFiles" is unset before running.
print Q "if [ \"\$CUDA_VISIBLE_DEVICES\" == \"NoDevFiles\" ]; then\n";
print Q "  ( echo CUDA_VISIBLE_DEVICES set to NoDevFiles, unsetting it... \n";
print Q "  )>>$logfile\n";
print Q "  unset CUDA_VISIBLE_DEVICES\n";
print Q "fi\n";
# Run the command, appending stdout and stderr to the log, and time it.
print Q "time1=\`date +\"%s\"\`\n";
print Q " ( $cmd ) &>>$logfile\n";
print Q "ret=\$?\n";
print Q "sync || true\n";
print Q "time2=\`date +\"%s\"\`\n";
# Log footer; the "with status N" line is what the waiting code inspects
# when a sync-file is missing.
print Q "echo '#' Accounting: begin_time=\$time1 >>$logfile\n";
print Q "echo '#' Accounting: end_time=\$time2 >>$logfile\n";
print Q "echo '#' Accounting: time=\$((\$time2-\$time1)) threads=$num_threads >>$logfile\n";
print Q "echo '#' Finished at \`date\` with status \$ret >>$logfile\n";
print Q "[ \$ret -eq 137 ] && exit 100;\n"; # If process was killed (e.g. oom) it will exit with status 137;
  # let the script return with status 100 which will put it to E state; more easily rerunnable.
  # Note that in this case the script exits before the sync-file is touched.
if ($array_job == 0) { # not an array job
  print Q "touch $syncfile\n"; # so we know it's done.
} else {
  print Q "touch $syncfile.\$SLURM_ARRAY_TASK_ID\n"; # touch a bunch of sync-files.
}
print Q "exit \$[\$ret ? 1 : 0]\n"; # avoid status 100 which grid-engine
print Q "## submitted with:\n";       # treats specially.
# Complete the submission command: the queue's own stdout/stderr and the
# output of the submission command itself all go to $queue_logfile.
$qsub_cmd .= " $qsub_opts --open-mode=append -e ${queue_logfile} -o ${queue_logfile} $queue_array_opt $queue_scriptfile >>$queue_logfile 2>&1";
# Record the submission command as a comment at the end of the script.
print Q "# $qsub_cmd\n";
if (!close(Q)) { # close was not successful
  die "Failed to close the script file (full disk?)";
}

# Submit the script.  Any non-zero status is fatal; the only question is
# which message to print.
my $ret = system($qsub_cmd);
unless ($ret == 0) {
  # With -sync, a status of 256 is the exit status when a job failed
  # (bad exit status), as opposed to a failure to submit.
  my $job_itself_failed = ($sync && $ret == 256);
  if ($job_itself_failed) {
    $logfile =~ s/\$SLURM_ARRAY_TASK_ID/*/g if defined $jobname;
    print STDERR "$0: job writing to $logfile failed\n";
  } else {
    print STDERR "$0: error submitting jobs to queue (return status was $ret)\n";
    print STDERR "queue log file is $queue_logfile, command was $qsub_cmd\n";
    print STDERR `tail $queue_logfile`;
  }
  exit(1);
}

# Unless the job was submitted with -sync y, wait here until every task has
# touched its sync-file.  While waiting, periodically ask squeue whether the
# job still exists, so that a job that disappeared (time limit, machine
# shutdown) is reported instead of being waited for indefinitely.
my $sge_job_id;   # the SLURM job-id (the name is a leftover from queue.pl)
if (! $sync) { # We're not submitting with -sync y, so we
  # need to wait for the jobs to finish.  We wait for the
  # sync-files we "touched" in the script to exist.
  my @syncfiles = ();
  if (!defined $jobname) { # not an array job.
    push @syncfiles, $syncfile;
  } else {
    for (my $jobid = $jobstart; $jobid <= $jobend; $jobid++) {
      push @syncfiles, "$syncfile.$jobid";
    }
  }
  # We will need the SLURM job-id, to check that the job still exists
  { # Get the SLURM job-id from the log file in q/
    open(L, "<$queue_logfile") || die "Error opening log file $queue_logfile";
    undef $sge_job_id;
    while (<L>) {
      if (m/Submitted batch job (\d+)/) {
        if (defined $sge_job_id) {
          die "Error: your job was submitted more than once (see $queue_logfile)";
        } else {
          $sge_job_id = $1;
        }
      }
    }
    close(L);
    if (!defined $sge_job_id) {
      die "Error: log file $queue_logfile does not specify the SLURM job-id.";
    }
  }
  my $check_sge_job_ctr=1;
  #
  my $wait = 0.1;   # polling interval in seconds; grows by 20% per poll up to 3
  my $counter = 0;
  foreach my $f (@syncfiles) {
    # wait for them to finish one by one.
    while (! -f $f) {
      # The built-in sleep() only sleeps for whole seconds, so a fractional
      # interval would not wait at all; four-argument select() honours
      # fractions of a second.
      select(undef, undef, undef, $wait);
      $wait *= 1.2;
      if ($wait > 3.0) {
        $wait = 3.0; # never wait more than 3 seconds.
        # the following (.kick) commands are basically workarounds for NFS bugs.
        if (rand() < 0.25) { # don't do this every time...
          if (rand() > 0.5) {
            system("touch $qdir/.kick 2>/dev/null");
          } else {
            system("rm $qdir/.kick 2>/dev/null");
          }
        }
        if ($counter++ % 10 == 0) {
          # This seems to kick NFS in the teeth to cause it to refresh the
          # directory.  I've seen cases where it would indefinitely fail to get
          # updated, even though the file exists on the server.
          # Only do this every 10 waits (every 30 seconds) though, or if there
          # are many jobs waiting they can overwhelm the file server.
          system("ls $qdir >/dev/null");
        }
      }

      # Check that the job exists in SLURM. Job can be killed if duration
      # exceeds some hard limit, or in case of a machine shutdown.
      if (($check_sge_job_ctr++ % 10) == 0) { # Don't run squeue too often, avoid stress on the scheduler.
        if ( -f $f ) { next; }; #syncfile appeared: OK.
        my ($squeue_output, $squeue_status) = exec_command("squeue -j $sge_job_id");
        if ($squeue_status == 1) {
          # Don't consider an immediately missing job as an error; wait a
          # little and ask squeue once more.
          sleep(4);
          ($squeue_output, $squeue_status) = exec_command("squeue -j $sge_job_id");
        }
        if ($squeue_status == 1) {
          # squeue still fails.  Before giving up, wait some more time to
          # make sure it is not just delayed creation of the syncfile.
          sleep(4);
          # Sometimes NFS gets confused and thinks it's transmitted the directory
          # but it hasn't, due to timestamp issues.  Changing something in the
          # directory will usually fix that.
          system("touch $qdir/.kick");
          system("rm $qdir/.kick 2>/dev/null");
          if ( -f $f ) { next; }   #syncfile appeared, ok
          sleep(7);
          system("touch $qdir/.kick");
          sleep(1);
          system("rm $qdir/.kick 2>/dev/null");
          if ( -f $f ) {  next; }   #syncfile appeared, ok
          sleep(60);
          system("touch $qdir/.kick");
          sleep(1);
          system("rm $qdir/.kick 2>/dev/null");
          if ( -f $f ) { next; }  #syncfile appeared, ok
          $f =~ m/\.(\d+)$/ || die "Bad sync-file name $f";
          my $job_id = $1;
          # Work on a copy: $logfile must keep its \$SLURM_ARRAY_TASK_ID
          # placeholder, because it is expanded again for every other task
          # later on (this branch can continue via "last" below).
          my $job_logfile = $logfile;
          if (defined $jobname) {
            $job_logfile =~ s/\$SLURM_ARRAY_TASK_ID/$job_id/g;
          }
          my $last_line = `tail -n 1 $job_logfile`;
          if ($last_line =~ m/status 0$/ && (-M $job_logfile) < 0) {
            # if the last line of the log ended with "status 0" and
            # the log is newer than this program [(-M file) gives the
            # time elapsed between file modification and the start of this
            # program], then we assume the program really finished OK,
            # and maybe something is up with the file system.
            print STDERR "**$0: syncfile $f was not created but job seems\n" .
              "**to have finished OK.  Probably your file-system has problems.\n" .
              "**This is just a warning.\n";
            last;
          } else {
            chop $last_line;
            print STDERR "$0: Error: Job $sge_job_id seems to no longer exists:\n" .
              "'squeue -j $sge_job_id' returned error code $squeue_status and said:\n" .
              "  $squeue_output\n" .
              "Syncfile $f does not exist, meaning that the job did not finish.\n" .
              "Log is in $job_logfile. Last line '$last_line' does not end in 'status 0'.\n" .
              "Possible reasons:\n" .
              "  a) Exceeded time limit? -> Use more jobs!\n" .
              "  b) Shutdown/Frozen machine? -> Run again! squeue:\n";
            system("squeue -j $sge_job_id");
            exit(1);
          }
        } elsif ($squeue_status != 0) {
          # Test the status of squeue itself; $ret holds the status of the
          # submission command, which is always 0 at this point.
          print STDERR "$0: Warning: squeue command returned status $squeue_status (squeue -j $sge_job_id,$!)\n";
        }
      }
    }
  }
  # All tasks are done; remove the sync-files.
  my $all_syncfiles = join(" ", @syncfiles);
  system("rm $all_syncfiles 2>/dev/null");
}

# The job has been synced, so it is known to have finished, but its exit
# status is still unknown; that is recovered from the log file(s) below.
# Collect in @logfiles every log location that has to be read: a single file
# for a plain job, or one file per task index for an array job.
my @logfiles = ();
if (defined $jobname) {
  # Array job: expand the literal $SLURM_ARRAY_TASK_ID placeholder in the log
  # name once for every task index from $jobstart to $jobend (inclusive).
  for (my $task = $jobstart; $task <= $jobend; $task++) {
    (my $task_log = $logfile) =~ s/\$SLURM_ARRAY_TASK_ID/$task/g;
    push @logfiles, $task_log;
  }
} else {
  # Not an array job: there is exactly one log.
  push @logfiles, $logfile;
}

# Read the exit status recorded in each log file, count how many tasks
# failed, and exit 0 only if none did.
my $num_failed = 0;
my $status = 1;  # last status parsed; reported below when there is one log.
foreach my $l (@logfiles) {
  # Delays between successive attempts to find the status line in the log;
  # one extra, final iteration (index == scalar(@wait_times)) gives up.
  my @wait_times = (0.1, 0.2, 0.2, 0.3, 0.5, 0.5, 1.0, 2.0, 5.0, 5.0, 5.0, 10.0, 25.0);
  for (my $iter = 0; $iter <= @wait_times; $iter++) {
    my $line = `tail -10 $l 2>/dev/null`; # Note: although this line should be the last
    # line of the file, I've seen cases where it was not quite the last line because
    # of delayed output by the process that was running, or processes it had called.
    # so tail -10 gives it a little leeway.
    if ($line =~ m/with status (\d+)/) {
      $status = $1;
      last;
    } else {
      if ($iter < @wait_times) {
        sleep($wait_times[$iter]);
      } else {
        # Out of retries: explain why no status could be determined.
        if (! -f $l) {
          print STDERR "Log-file $l does not exist.\n";
        } else {
          print STDERR "The last line of log-file $l does not seem to indicate the "
            . "return status as expected\n";
        }
        exit(1);                # Something went wrong with the queue, or the
        # machine it was running on, probably.
      }
    }
  }
  # OK, now we have $status, which is the return-status of
  # the command in the job.
  if ($status != 0) { $num_failed++; }
}
if ($num_failed == 0) { exit(0); }
else { # we failed.
  if (@logfiles == 1) {
    # Array job with a single task: expand the task-id placeholder so that the
    # message names the real log file.  The placeholder is
    # $SLURM_ARRAY_TASK_ID, the same name substituted everywhere else in this
    # script (this line used to look for the transposed $SLURM_TASK_ARRAY_ID,
    # which never matched).
    if (defined $jobname) { $logfile =~ s/\$SLURM_ARRAY_TASK_ID/$jobstart/g; }
    print STDERR "$0: job failed with status $status, log is in $logfile\n";
    if ($logfile =~ m/JOB/) {
      print STDERR "$0: probably you forgot to put JOB=1:\$nj in your script.\n";
    }
  } else {
    # Several logs: show a glob-style name covering all of them.
    if (defined $jobname) { $logfile =~ s/\$SLURM_ARRAY_TASK_ID/*/g; }
    my $numjobs = 1 + $jobend - $jobstart;
    print STDERR "$0: $num_failed / $numjobs failed, log is in $logfile\n";
  }
  exit(1);
}


================================================
FILE: egs/utils/parse_options.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey);
#                 Arnab Ghoshal, Karel Vesely

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# Parse command-line options.
# To be sourced by another script (as in ". parse_options.sh").
# Option format is: --option-name arg
# and shell variable "option_name" gets set to value "arg."
# The exception is --help, which takes no arguments, but prints the
# $help_message variable (if defined).


###
### The --config file options have lower priority to command line
### options, so we need to import them first...
###

# Now import all the configs specified by command-line, in left-to-right order.
# The scan stops one short of the last argument because "--config" is only
# meaningful when it is followed by a file name.
for ((argpos=1; argpos<$#; argpos++)); do
  if [ "${!argpos}" == "--config" ]; then
    argpos_plus1=$((argpos+1))
    config=${!argpos_plus1}
    # $config is quoted so that paths containing spaces work and so that an
    # empty value is reported as a missing config instead of slipping through
    # the test and failing later inside ".".
    [ ! -r "$config" ] && echo "$0: missing config '$config'" && exit 1
    . "$config"  # source the config file.
  fi
done


###
### Now we process the command line options
###
# Consumes leading "--name value" pairs from the positional parameters of the
# sourcing script, assigning each value to the (already defined) shell variable
# whose name is the option name with dashes turned into underscores.  Stops at
# the first argument that is empty or does not start with "--".
while true; do
  [ -z "${1:-}" ] && break;  # break if there are no arguments
  case "$1" in
    # If the enclosing script is called with --help option, print the help
    # message and exit.  Scripts should put help messages in $help_message
    --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
      else printf "$help_message\n" 1>&2 ; fi;
      exit 0 ;;
    --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
      exit 1 ;;
    # If the first command-line argument begins with "--" (e.g. --foo-bar),
    # then work out the variable name as $name, which will equal "foo_bar".
    --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
      # Next we test whether the variable in question is undefined-- if so it's
      # an invalid option and we die.  Note: $0 evaluates to the name of the
      # enclosing script.
      # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
      # is undefined.  We then have to wrap this test inside "eval" because
      # foo_bar is itself inside a variable ($name).
      eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;

      # Every option takes exactly one value.  Without this check a trailing
      # "--name" made the "shift 2" below fail without shifting anything, so
      # the same argument was processed again and again.
      if [ $# -lt 2 ]; then
        echo "$0: option $1 requires an argument" 1>&2
        exit 1;
      fi

      oldval="`eval echo \\$$name`";
      # Work out whether we seem to be expecting a Boolean argument.
      if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
        was_bool=true;
      else
        was_bool=false;
      fi

      # Set the variable to the right value-- the escaped quotes make it work if
      # the option had spaces, like --cmd "queue.pl -sync y"
      eval $name=\"$2\";

      # Check that Boolean-valued arguments are really Boolean.
      if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
        exit 1;
      fi
      shift 2;
      ;;
  *) break;
  esac
done


# Guard against --cmd having been given an empty value (i.e. $cmd is set but
# blank), which is easily produced by a scripting error in the caller.
if [ -n "${cmd+xxx}" ] && [ -z "$cmd" ]; then
  echo "$0: empty argument to --cmd option" 1>&2
  exit 1
fi


:  # no-op, so that this script returns exit code 0.


================================================
FILE: egs/utils/perturb_data_dir_speed.sh
================================================
#!/usr/bin/env bash

# Copyright 2013  Johns Hopkins University (author: Daniel Povey)
#           2014  Tom Ko
#           2018  Emotech LTD (author: Pawel Swietojanski)
# Apache 2.0

# This script operates on a directory, such as in data/train/,
# that contains some subset of the following files:
#  wav.scp
#  spk2utt
#  utt2spk
#  text
#  utt2dur
#  reco2dur
#
# It generates the files which are used for perturbing the speed of the original data.

. utils/parse_options.sh

# Exactly three positional arguments are required.
if [ $# -ne 3 ]; then
  echo "Usage: perturb_data_dir_speed.sh <warping-factor> <srcdir> <destdir>"
  echo "e.g.:"
  echo " $0 0.9 data/train_si284 data/train_si284p"
  exit 1
fi

export LC_ALL=C

factor=$1
srcdir=$2
destdir=$3
label="sp"
# Speaker and utterance ids both receive the same "sp<factor>-" prefix.
spk_prefix="${label}${factor}-"
utt_prefix="${label}${factor}-"

# sox has to be available on the PATH.
if ! which sox &>/dev/null; then
  echo "sox: command not found"
  exit 1
fi

if [ ! -f $srcdir/utt2spk ]; then
  echo "$0: no such file $srcdir/utt2spk"
  exit 1;
fi

if [ "$destdir" == "$srcdir" ]; then
  echo "$0: this script requires <srcdir> and <destdir> to be different."
  exit 1
fi

# From here on, any failing command or pipeline stage aborts the script.
set -e
set -o pipefail

# Build temporary old-id -> new-id maps in $destdir and use them to derive the
# perturbed utt2uniq, utt2spk and spk2utt files.
mkdir -p $destdir

# Each map line is "<old-id> <prefix><old-id>"; the ids are taken from the
# first column of utt2spk (utterances), spk2utt (speakers) and wav.scp
# (recordings) respectively.
cat $srcdir/utt2spk | awk -v p=$utt_prefix '{printf("%s %s%s\n", $1, p, $1);}' > $destdir/utt_map
cat $srcdir/spk2utt | awk -v p=$spk_prefix '{printf("%s %s%s\n", $1, p, $1);}' > $destdir/spk_map
cat $srcdir/wav.scp | awk -v p=$spk_prefix '{printf("%s %s%s\n", $1, p, $1);}' > $destdir/reco_map
# utt2uniq lines are "<prefix><utt-id> <second-field>": the second field is the
# unprefixed utterance id when the source has no utt2uniq, otherwise it is the
# second column of the source utt2uniq.
if [ ! -f $srcdir/utt2uniq ]; then
  cat $srcdir/utt2spk | awk -v p=$utt_prefix '{printf("%s%s %s\n", p, $1, $1);}' > $destdir/utt2uniq
else
  cat $srcdir/utt2uniq | awk -v p=$utt_prefix '{printf("%s%s %s\n", p, $1, $2);}' > $destdir/utt2uniq
fi


# utt2spk: field 1 is rewritten through utt_map and field 2 through spk_map.
cat $srcdir/utt2spk | utils/apply_map.pl -f 1 $destdir/utt_map  | \
  utils/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk

# spk2utt is regenerated from the new utt2spk.
utils/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt

# Rewrite segments (if present) and wav.scp so that the audio is passed through
# "sox ... speed <factor>".  With a segments file wav.scp is keyed by recording
# (reco_map); without one it is keyed by utterance (utt_map).
if [ -f $srcdir/segments ]; then

  # segments: rename the utterance (field 1) and recording (field 2) ids, and
  # divide the start/end times by the factor.  A segment is only written when
  # its scaled end is more than 0.01 after its scaled start.
  utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments | \
    utils/apply_map.pl -f 2 $destdir/reco_map | \
      awk -v factor=$factor \
        '{s=$3/factor; e=$4/factor; if (e > s + 0.01) { printf("%s %s %.2f %.2f\n", $1, $2, $3/factor, $4/factor);} }' >$destdir/segments

  # wav.scp: the sed normalises a trailing "|" so that it is preceded by a
  # space and is therefore the last awk field ($NF) of a piped entry.
  # In the awk program "$_" is awk, not Perl: "_" is an unset variable that
  # evaluates to 0, so "$_" is "$0", i.e. the record rebuilt after $1="" (it
  # therefore starts with a space).
  utils/apply_map.pl -f 1 $destdir/reco_map <$srcdir/wav.scp | sed 's/| *$/ |/' | \
    # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" 
    awk -v factor=$factor \
        '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"}
          else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } 
          else  {print wid " sox -t wav" $_ " -t wav - speed " factor " |"}}' > $destdir/wav.scp
  # reco2file_and_channel is keyed by recording id, so only field 1 is renamed.
  if [ -f $srcdir/reco2file_and_channel ]; then
    utils/apply_map.pl -f 1 $destdir/reco_map <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel
  fi

else # no segments->wav indexed by utterance.
  if [ -f $srcdir/wav.scp ]; then
    # Same rewrite as in the branch above, but the key is mapped through
    # utt_map instead of reco_map.
    utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/wav.scp | sed 's/| *$/ |/' | \
     # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" 
     awk -v factor=$factor \
       '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"}
         else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } 
         else {print wid " sox -t wav" $_ " -t wav - speed " factor " |"}}' > $destdir/wav.scp
  fi
fi

# Optional files whose first column is an utterance or speaker id: copy each
# one that exists, renaming the key through the matching map.  Every entry is
# "<file>:<map prefix>"; the files are handled in the order listed.
for spec in text:utt spk2gender:spk utt2lang:utt; do
  f=${spec%%:*}
  m=${spec##*:}
  if [ -f $srcdir/$f ]; then
    utils/apply_map.pl -f 1 $destdir/${m}_map <$srcdir/$f >$destdir/$f
  fi
done

# Speed-perturbed utt2dur: create the source utt2dur first if it is missing,
# then rename the keys and divide every duration by the factor.
[ -f $srcdir/utt2dur ] || utils/data/get_utt2dur.sh $srcdir
utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2dur | \
  awk -v factor=$factor '{print $1, $2/factor;}' >$destdir/utt2dur

# Speed-perturbed reco2dur, produced the same way but keyed by recording.
[ -f $srcdir/reco2dur ] || utils/data/get_reco2dur.sh $srcdir
utils/apply_map.pl -f 1 $destdir/reco_map <$srcdir/reco2dur | \
  awk -v factor=$factor '{print $1, $2/factor;}' >$destdir/reco2dur

# The id maps were only intermediate files.
rm $destdir/spk_map $destdir/utt_map $destdir/reco_map 2>/dev/null
echo "$0: generated speed-perturbed version of data in $srcdir, in $destdir"

utils/validate_data_dir.sh --no-feats --no-text $destdir


================================================
FILE: egs/utils/pinyin_map.pl
================================================
#!/usr/bin/env perl
use warnings; #sed replacement for -w perl parameter

# Exactly one argument is expected: the pinyin-to-phone map file.
$num_args = $#ARGV + 1;
if ($num_args != 1) {
  print "\nUsage: pinyin2phone.pl pinyin2phone\n";
  exit;
}

# Load the map into %py2ph: the first field of each line is the key and the
# remaining fields are stored as an array reference.
# The three-argument open with an explicit read mode makes sure the file name
# is never interpreted as a mode or as a piped command, and the lexical
# handle is closed once the map has been read.
open(my $maps_fh, '<', $ARGV[0]) or die("Could not open pinyin map file.");
my %py2ph;
while (my $line = <$maps_fh>) {
  my @fields = split(" ", $line);
  next unless @fields;  # a blank line would otherwise create an undef key
  my $py = shift(@fields);
  $py2ph{$py} = [@fields];
}
close($maps_fh);

#foreach $word ( keys %py2ph ) {
     #foreach $i ( 0 .. $#{ $py2ph{$word} } ) {
     #    print " $word = $py2ph{$word}[$i]";
     #}
     #print " $#{ $py2ph{$word} }";
     #print "\n";
#}

my @entry;

# Convert every line of STDIN: the first field is copied through unchanged
# and each following field is replaced by entries looked up in %py2ph.
while (my $record = <STDIN>) {
  my ($word, @syllables) = split(" ", $record);
  @entry = ($word);
  foreach my $syl (@syllables) {
    # Split off a leading consonant.  The two-letter ones (CH, SH, ZH) are
    # tried first; a field made only of such a pair (e.g. "CH") backtracks to
    # the single-letter alternative.  A, E, I, O, U and V are not in the list.
    if ($syl =~ /^(CH|SH|ZH|[BCDFGHJKLMNPQRSTWXYZ])([A-Z0-9]+)$/) {
      my ($initial, $final) = ($1, $2);
      # Separate the digit that follows the letters of the final.
      my $tone = $final;
      $final =~ s:([A-Z]+)[0-9]:$1:;
      $tone =~ s:[A-Z]+([0-9]):$1:;
      unless (exists $py2ph{$initial} and exists $py2ph{$final}) {
        die "$0: no entry find for ", $syl, " ", $initial, " ", $final;
      }
      # Entries for the initial are copied as they are; those for the final
      # get the tone appended.
      push(@entry, @{$py2ph{$initial}}, map { $_ . $tone } @{$py2ph{$final}});
    }
    else {
      # No initial: the whole field is looked up once its digit is removed.
      my $tone = $syl;
      (my $base = $syl) =~ s:([A-Z]+)[0-9]:$1:;
      $tone =~ s:[A-Z]+([0-9]):$1:;
      die "$0: no entry find for ", $base unless exists $py2ph{$base};
      push(@entry, map { $_ . $tone } @{$py2ph{$base}});
    }
  }
  print "@entry\n";
}


================================================
FILE: egs/utils/prepare_extended_lang.sh
================================================
#!/usr/bin/env bash
# Copyright 2018  Xiaohui Zhang

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This script extends the lexicon of an existing dictionary directory with
# extra lexical entries, and then builds a lang directory from the extended
# dictionary by calling utils/prepare_lang.sh.

# Begin configuration section.
# These defaults can be overridden on the command line (--prep-lang-opts,
# --stage, --word-list) via utils/parse_options.sh, which is sourced below.
prep_lang_opts=
stage=0
word_list= # if a word list (mapping words from the srcdict to IDs) is provided,
# we'll make sure the IDs of these words are kept as before.
# end configuration sections

echo "$0: warning: This sript is is now deprecated. You may want to use utils/lang/extend_lang.sh"
echo "$0 $@"  # Print the command line for logging

. utils/parse_options.sh

# Exactly seven positional arguments are required; otherwise print the usage
# text and exit with status 1.
if [ $# -ne 7 ]; then
  echo "usage: utils/prepare_extended_lang.sh <dict-src-dir> <oov-dict-entry> <extra-lexicon> "
  echo "<phone-symbol-table> <extended-dict-dir> <tmp-dir> <extended-lang-dir>"
  echo "e.g.: utils/prepare_extended_lang.sh data/local/dict '<SPOKEN_NOISE>' lexicon_extra.txt"
  echo "data/lang/phones.txt data/local/dict_ext data/local/lang_ext data/lang_ext"
  echo "The goal is to extend the lexicon from <dict-src-dir> with extra lexical entries from "
  echo "<extra-lexicon>, putting the extended lexicon into <extended-dict-dir>, and then build"
  echo "a valid lang dir <extended-lang-dir>. This is useful when we want to extend the vocab"
  echo "in test time."
  echo "<dict-src-dir> must be a valid dictionary dir and <oov-dict-entry> is the oov word "
  echo "(see utils/prepare_lang.sh for details). A phone symbol table from a previsouly built "
  echo "lang dir is required, for validating provided lexical entries."
  echo "options: "
  echo "     --prep-lang-opts STRING              # options to pass to utils/prepare_lang.sh"
  echo "     --word-list <filename>               # default: \"\"; if not empty, re-order the "
  echo "                                          # words in the generated words.txt so that the"
  echo "                                          # words from the provided list have their ids"
  echo "                                          # kept unchanged."
  exit 1;
fi

# Positional arguments, in the order given in the usage message above.
srcdict=$1
oov_word=$2
extra_lexicon=$3
phone_symbol_table=$4
extdict=$5 # extended dict dir
tmpdir=$6
extlang=$7 # extended lang dir

mkdir -p $extlang $tmpdir 

# Source the local path.sh when one exists in the current directory.
[ -f path.sh ] && . ./path.sh

# Check the source dictionary directory before doing anything else.
! utils/validate_dict_dir.pl $srcdict && \
  echo "*Error validating directory $srcdict*" && exit 1;

# Create whichever of lexicon.txt / lexiconp.txt is missing from the other one.
if [[ ! -f $srcdict/lexicon.txt ]]; then
  # The files live in $srcdict; this message used to print the variable $dir,
  # which is never set in this script.
  echo "**Creating $srcdict/lexicon.txt from $srcdict/lexiconp.txt"
  # Drop the second column of lexiconp.txt.
  perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' < $srcdict/lexiconp.txt \
    > $srcdict/lexicon.txt || exit 1;
fi

if [[ ! -f $srcdict/lexiconp.txt ]]; then
  echo "**Creating $srcdict/lexiconp.txt from $srcdict/lexicon.txt"
  # Insert "1.0" as the second column.
  perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdict/lexicon.txt > $srcdict/lexiconp.txt || exit 1;
fi

# Checks if the phone sets match.
# The awk BEGIN block loads the phone symbol table, stripping a trailing
# _B/_E/_I/_S from each symbol; every field after the word must then be one of
# the loaded symbols.
echo "$(basename $0): Validating the source lexicon"
cat $srcdict/lexicon.txt | awk -v f=$phone_symbol_table '
BEGIN { while ((getline < f) > 0) { sub(/_[BEIS]$/, "", $1); phones[$1] = 1; }}
{ for (x = 2; x <= NF; ++x) {
  if (!($x in phones)) {
    print "The source lexicon contains a phone not in the phones.txt: "$x;
    print "You must provide a phones.txt from the lang built with the source lexicon.";
    exit 1;
  }
}}' || exit 1;

# Same check for the extra lexicon.
echo "$(basename $0): Validating the extra lexicon"
cat $extra_lexicon | awk -v f=$phone_symbol_table '
BEGIN { while ((getline < f) > 0) { sub(/_[BEIS]$/, "", $1); phones[$1] = 1; }}
{ for (x = 2; x <= NF; ++x) { if (!($x in phones)) {
    print "The extra lexicon contains a phone not in the phone symbol table: "$x; exit 1; }
  }
}' || exit 1;

# Stage 0: build the extended dictionary directory $extdict from $srcdict plus
# the entries of $extra_lexicon that the source lexicon does not contain yet.
if [ $stage -le 0 ]; then
  # Generate the extended dict dir
  echo "$(basename $0): Creating the extended lexicon $extdict/lexicon.txt"
  # Start from a fresh copy of the source dictionary directory.
  [ -d $extdict ] && rm -r $extdict 2>/dev/null
  cp -R $srcdict $extdict 2>/dev/null
  
  # Reformat the source lexicon
  # (drop the second column of lexiconp.txt and turn tabs into spaces, so that
  # whole lines can be compared with the extra lexicon below).
  perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' <$srcdict/lexiconp.txt | awk '{ gsub(/\t/, " "); print }' \
   >$tmpdir/lexicon.txt || exit 1;
  
  # Filter lexical entries which are already in the source lexicon
  # (the second awk first loads every line of $tmpdir/lexicon.txt, then prints
  # only those lines of its standard input that were not among them).
  awk '{ gsub(/\t/, " "); print }' $extra_lexicon | sort -u | \
    awk 'NR==FNR{a[$0]=1;next} {if (!($0 in a)) print $0 }' $tmpdir/lexicon.txt - \
    > $extdict/lexicon_extra.txt || exit 1;
  
  echo "$(basename $0): Creating $extdict/lexiconp.txt from $srcdict/lexiconp.txt and $extdict/lexicon_extra.txt"
  # The extra entries get "1" inserted as their second column and are appended
  # to the source lexiconp.txt; the result is sorted with duplicates removed.
  perl -ape 's/(\S+\s+)(.+)/${1}1 $2/;' < $extdict/lexicon_extra.txt | \
    cat $srcdict/lexiconp.txt - | awk '{ gsub(/\t/, " "); print }' | \
    sort -u -k1,1 -k2g,2 -k3 > $extdict/lexiconp.txt || exit 1;
  
  # lexicon.txt is lexiconp.txt without its second column.
  perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' <$extdict/lexiconp.txt  >$extdict/lexicon.txt || exit 1;
  
  # Create lexicon_silprobs.txt
  # (only when the source dictionary has a lexiconp_silprob.txt).
  silprob=false
  [ -f $srcdict/lexiconp_silprob.txt ] && silprob=true
  if "$silprob"; then
    echo "$(basename $0): Creating $extdict/lexiconp_silprob.txt from $srcdict/lexiconp_silprob.txt"
    # Here we assume no acoustic evidence for the extra word-pron pairs.
    # So we assign silprob1 = overall_silprob, silprob2 = silprob3 = 1.00
    # The overall value is the second field of the "overall" line of
    # $srcdict/silprob.txt; note that it is printed with one decimal (%.1f).
    overall_silprob=`awk '{if ($1=="overall") print $2}' $srcdict/silprob.txt`
    awk -v overall=$overall_silprob '{
      printf("%s %d %.1f %.2f %.2f",$1, 1, overall, 1.00, 1.00); 
      for(n=2;n<=NF;n++) printf " "$n; printf("\n");
      }' $extdict/lexicon_extra.txt | cat $srcdict/lexiconp_silprob.txt - | \
      sort -k1,1 -k2g,2 -k6 \
      > $extdict/lexiconp_silprob.txt || exit 1;
  fi
  
  # Validate quietly first; on failure run the validator again so that its
  # output is shown, then stop.
  if ! utils/validate_dict_dir.pl $extdict >&/dev/null; then
    utils/validate_dict_dir.pl $extdict  # show the output.
    echo "$(basename $0): Validation failed on the extended dict"
    exit 1;
  fi
fi

# Stage 1: build the extended lang directory from the extended dictionary and,
# when a word list was given, keep the word ids of that list unchanged.
if [ $stage -le 1 ]; then
  echo "$(basename $0): Preparing the extended lang dir."
  [ -d $extlang ] && rm -r $extlang 2>/dev/null
  utils/prepare_lang.sh $prep_lang_opts $extdict \
    $oov_word $tmpdir $extlang || exit 1;

  # If a word list is provided, make sure the word-ids of these words are kept unchanged
  # in the extended word list.
  # The explicit -n test and the quoting matter: with the default, empty
  # $word_list an unquoted "[ -f $word_list ]" collapses to "[ -f ]", which is
  # true, so this branch used to run with awk reading the word list from stdin.
  if [ -n "$word_list" ] && [ -f "$word_list" ]; then
    # First, make sure there's no OOV in the provided word-list.
    if [ `awk -v s=$extlang/words.txt 'BEGIN{ while((getline < s) > 0) { vocab[$1] = 1;}} \
        {if (!($1 in vocab)) print $0}' "$word_list" | wc -l` -gt 0 ]; then
      echo "$(basename $0): The provided word list contains words out of the extended vocab."
      exit 1;
    fi
    # Print the provided list unchanged, then append every other word of the
    # new words.txt, numbering them onwards from the number of lines in the
    # provided list.  (The awk program only uses the variable s, so the
    # previously passed oov/boost/prob variables, two of which came from shell
    # variables never set in this script, are no longer passed.)
    awk -v s="$word_list" \
      'BEGIN{ while((getline < s) > 0) { vocab[$1] = 1; n+=1; print $0}} \
       { if (!($1 in vocab)) {print $1" "n; n+=1;}}' $extlang/words.txt > $extlang/words.txt.$$
    mv $extlang/words.txt.$$ $extlang/words.txt
  fi
fi

exit 0;


================================================
FILE: egs/utils/prepare_lang.sh
================================================
#!/usr/bin/env bash
# Copyright 2012-2013  Johns Hopkins University (Author: Daniel Povey);
#                      Arnab Ghoshal
#                2014  Guoguo Chen
#                2015  Hainan Xu
#                2016  FAU Erlangen (Author: Axel Horndasch)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This script prepares a directory such as data/lang/, in the standard format,
# given a source directory containing a dictionary lexicon.txt in a form like:
# word phone1 phone2 ... phoneN
# per line (alternate prons would be separate lines), or a dictionary with probabilities
# called lexiconp.txt in a form:
# word pron-prob phone1 phone2 ... phoneN
# (with 0.0 < pron-prob <= 1.0); note: if lexiconp.txt exists, we use it even if
# lexicon.txt exists.
# and also files silence_phones.txt, nonsilence_phones.txt, optional_silence.txt
# and extra_questions.txt
# Here, silence_phones.txt and nonsilence_phones.txt are lists of silence and
# non-silence phones respectively (where silence includes various kinds of
# noise, laugh, cough, filled pauses etc., and nonsilence phones includes the
# "real" phones.)
# In each line of those files is a list of phones, and the phones on each line
# are assumed to correspond to the same "base phone", i.e. they will be
# different stress or tone variations of the same basic phone.
# The file "optional_silence.txt" contains just a single phone (typically SIL)
# which is used for optional silence in the lexicon.
# extra_questions.txt might be empty; typically will consist of lists of phones,
# all members of each list with the same stress or tone; and also possibly a
# list for the silence phones.  This will augment the automatically generated
# questions (note: the automatically generated ones will treat all the
# stress/tone versions of a phone the same, so will not "get to ask" about
# stress or tone).
#

# This script adds word-position-dependent phones and constructs a host of other
# derived files, that go in data/lang/.

# Begin configuration section.
# Each variable below is a default that can be overridden from the command
# line through utils/parse_options.sh (sourced at the end of this section),
# e.g. --num-sil-states 3.
num_sil_states=5      # number of states in silence models.
num_nonsil_states=3   # number of states in non-silence models.
position_dependent_phones=true
# position_dependent_phones is false also when position dependent phones and word_boundary.txt
# have been generated by another source
share_silence_phones=false  # if true, then share pdfs of different silence
                            # phones together.
sil_prob=0.5
unk_fst=        # if you want to model the unknown-word (<oov-dict-entry>)
                # with a phone-level LM as created by make_unk_lm.sh,
                # provide the text-form FST via this flag, e.g. <work-dir>/unk_fst.txt
                # where <work-dir> was the 2nd argument of make_unk_lm.sh.
phone_symbol_table=              # if set, use a specified phones.txt file.
extra_word_disambig_syms=        # if set, add disambiguation symbols from this file (one per line)
                                 # to phones/disambig.txt, phones/wdisambig.txt and words.txt
num_extra_phone_disambig_syms=1 # By default one phone disambiguation symbol is used, for optional silence.
                                # Increasing this number does no harm, but is only useful if you later
                                # want to introduce these labels into L_disambig.fst


# end configuration sections

echo "$0 $@"  # Print the command line for logging

. utils/parse_options.sh

if [ $# -ne 4 ]; then
  echo "Usage: utils/prepare_lang.sh <dict-src-dir> <oov-dict-entry> <tmp-dir> <lang-dir>"
  echo "e.g.: utils/prepare_lang.sh data/local/dict <SPOKEN_NOISE> data/local/lang data/lang"
  echo "<dict-src-dir> should contain the following files:"
  echo " extra_questions.txt  lexicon.txt nonsilence_phones.txt  optional_silence.txt  silence_phones.txt"
  echo "See http://kaldi-asr.org/doc/data_prep.html#data_prep_lang_creating for more info."
  echo "options: "
  echo "<dict-src-dir> may also, for the grammar-decoding case (see http://kaldi-asr.org/doc/grammar.html)"
  echo "contain a file nonterminals.txt containing symbols like #nonterm:contact_list, one per line."
  echo "     --num-sil-states <number of states>             # default: 5, #states in silence models."
  echo "     --num-nonsil-states <number of states>          # default: 3, #states in non-silence models."
  echo "     --position-dependent-phones (true|false)        # default: true; if true, use _B, _E, _S & _I"
  echo "                                                     # markers on phones to indicate word-internal positions. "
  echo "     --share-silence-phones (true|false)             # default: false; if true, share pdfs of "
  echo "                                                     # all silence phones. "
  echo "     --sil-prob <probability of silence>             # default: 0.5 [must have 0 <= silprob < 1]"
  echo "     --phone-symbol-table <filename>                 # default: \"\"; if not empty, use the provided "
  echo "                                                     # phones.txt as phone symbol table. This is useful "
  echo "                                                     # if you use a new dictionary for the existing setup."
  echo "     --unk-fst <text-fst>                            # default: none.  e.g. exp/make_unk_lm/unk_fst.txt."
  echo "                                                     # This is for if you want to model the unknown word"
  echo "                                                     # via a phone-level LM rather than a special phone"
  echo "                                                     # (this should be more useful for test-time than train-time)."
  echo "     --extra-word-disambig-syms <filename>           # default: \"\"; if not empty, add disambiguation symbols"
  echo "                                                     # from this file (one per line) to phones/disambig.txt,"
  echo "                                                     # phones/wdisambig.txt and words.txt"
  exit 1;
fi

# Positional arguments, in the order given in the usage message.
srcdir=$1
oov_word=$2
tmpdir=$3
dir=$4


# Always start with an empty $dir/phones.
if [ -d $dir/phones ]; then
  rm -r $dir/phones
fi
mkdir -p $dir $tmpdir $dir/phones

# silprob records whether the dictionary provides lexiconp_silprob.txt.
if [ -f $srcdir/lexiconp_silprob.txt ]; then
  silprob=true
else
  silprob=false
fi

if [ -f path.sh ]; then
  . ./path.sh
fi

# First validation of the dictionary directory (output shown).
if ! utils/validate_dict_dir.pl $srcdir; then
  echo "*Error validating directory $srcdir*"
  exit 1
fi

# Create whichever of lexicon.txt / lexiconp.txt is missing from the other.
if [[ ! -f $srcdir/lexicon.txt ]]; then
  echo "**Creating $srcdir/lexicon.txt from $srcdir/lexiconp.txt"
  perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' < $srcdir/lexiconp.txt > $srcdir/lexicon.txt || exit 1
fi
if [[ ! -f $srcdir/lexiconp.txt ]]; then
  echo "**Creating $srcdir/lexiconp.txt from $srcdir/lexicon.txt"
  perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $srcdir/lexiconp.txt || exit 1
fi

# When --unk-fst is given it has to name an existing file.
if [ -n "$unk_fst" ] && [ ! -f "$unk_fst" ]; then
  echo "$0: expected --unk-fst $unk_fst to exist as a file"
  exit 1
fi

# Validate again now that both lexicon files exist: quietly first, and only
# on failure once more with the output shown.
if ! utils/validate_dict_dir.pl $srcdir >/dev/null 2>&1; then
  utils/validate_dict_dir.pl $srcdir
  echo "Validation failed (second time)"
  exit 1
fi

# Sanity checks that only apply when a phones.txt was supplied through
# --phone-symbol-table.
if [[ -n $phone_symbol_table ]]; then
  # n1 counts the distinct symbols (ignoring "#<number>" entries) and n2
  # counts them again after a trailing _B/_I/_E/_S has been removed, so the
  # two differ exactly when such suffixed symbols are present.
  n1=$(cat $phone_symbol_table | grep -v -E "^#[0-9]+$" | cut -d' ' -f1 | sort -u | wc -l)
  n2=$(cat $phone_symbol_table | grep -v -E "^#[0-9]+$" | cut -d' ' -f1 | sed 's/_[BIES]$//g' | sort -u | wc -l)
  if $position_dependent_phones && [ $n1 -eq $n2 ]; then
    echo "$0: Position dependent phones requested, but not in provided phone symbols"
    exit 1
  fi
  if ! $position_dependent_phones && [ $n1 -ne $n2 ]; then
    echo "$0: Position dependent phones not requested, but appear in the provided phones.txt"
    exit 1
  fi

  # Every phone listed in silence_phones.txt and nonsilence_phones.txt has to
  # be present in the provided table (compared without the position suffix).
  cat $srcdir/{,non}silence_phones.txt | awk -v f=$phone_symbol_table '
  BEGIN { while ((getline < f) > 0) { sub(/_[BEIS]$/, "", $1); phones[$1] = 1; }}
  { for (x = 1; x <= NF; ++x) { if (!($x in phones)) {
      print "Phone appears in the lexicon but not in the provided phones.txt: "$x; exit 1; }}}' || exit 1
fi

# In case there are extra word-level disambiguation symbols we need
# to make sure that all symbols in the provided file are valid.
if [ ! -z "$extra_word_disambig_syms" ]; then
  if ! utils/lang/validate_disambig_sym_file.pl --allow-numeric "false" $extra_word_disambig_syms; then
    echo "$0: Validation of disambiguation file \"$extra_word_disambig_syms\" failed."
    exit 1;
  fi
fi

if $position_dependent_phones; then
  # Create $tmpdir/lexiconp.txt from $srcdir/lexiconp.txt (or
  # $tmpdir/lexiconp_silprob.txt from $srcdir/lexiconp_silprob.txt) by
  # adding the markers _B, _E, _S, _I depending on word position.
  # In this recipe, these markers apply to silence also.
  # Do this starting from lexiconp.txt only.
  if "$silprob"; then
    perl -ane '@A=split(" ",$_); $w = shift @A; $p = shift @A; $silword_p = shift @A;
              $wordsil_f = shift @A; $wordnonsil_f = shift @A; @A>0||die;
         if(@A==1) { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_S\n"; }
         else { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_B ";
         for($n=1;$n<@A-1;$n++) { print "$A[$n]_I "; } print "$A[$n]_E\n"; } ' \
                < $srcdir/lexiconp_silprob.txt > $tmpdir/lexiconp_silprob.txt
  else
    perl -ane '@A=split(" ",$_); $w = shift @A; $p = shift @A; @A>0||die;
         if(@A==1) { print "$w $p $A[0]_S\n"; } else { print "$w $p $A[0]_B ";
         for($n=1;$n<@A-1;$n++) { print "$A[$n]_I "; } print "$A[$n]_E\n"; } ' \
         < $srcdir/lexiconp.txt > $tmpdir/lexiconp.txt || exit 1;
  fi

  # create $tmpdir/phone_map.txt
  # this has the format (on each line)
  # <original phone> <version 1 of original phone> <version 2> ...
  # where the versions depend on the position of the phone within a word.
  # For instance, we'd have:
  # AA AA_B AA_E AA_I AA_S
  # for (B)egin, (E)nd, (I)nternal and (S)ingleton
  # and in the case of silence
  # SIL SIL SIL_B SIL_E SIL_I SIL_S
  # [because SIL on its own is one of the variants; this is for when it doesn't
  #  occur inside a word but as an option in the lexicon.]

  # This phone map expands the phone lists into all the word-position-dependent
  # versions of the phone lists.
  cat <(set -f; for x in `cat $srcdir/silence_phones.txt`; do for y in "" "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \
    <(set -f; for x in `cat $srcdir/nonsilence_phones.txt`; do for y in "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \
    > $tmpdir/phone_map.txt
else
  if "$silprob"; then
    cp $srcdir/lexiconp_silprob.txt $tmpdir/lexiconp_silprob.txt
  else
    cp $srcdir/lexiconp.txt $tmpdir/lexiconp.txt
  fi

  cat $srcdir/silence_phones.txt $srcdir/nonsilence_phones.txt | \
    awk '{for(n=1;n<=NF;n++) print $n; }' > $tmpdir/phones
  paste -d' ' $tmpdir/phones $tmpdir/phones > $tmpdir/phone_map.txt
fi


# Sets of phones for use in clustering, and making monophone systems.

if $share_silence_phones; then
  # build a roots file that will force all the silence phones to share the
  # same pdf's. [three distinct states, only the transitions will differ.]
  # 'shared'/'not-shared' means, do we share the 3 states of the HMM
  # in the same tree-root?
  # Sharing across models(phones) is achieved by writing several phones
  # into one line of roots.txt (shared/not-shared doesn't affect this).
  # 'not-shared not-split' means we have separate tree roots for the 3 states,
  # but we never split the tree so they remain stumps,
  # so all phones in the line correspond to the same model.

  cat $srcdir/silence_phones.txt | awk '{printf("%s ", $0); } END{printf("\n");}' | cat - $srcdir/nonsilence_phones.txt | \
    utils/apply_map.pl $tmpdir/phone_map.txt > $dir/phones/sets.txt
  cat $dir/phones/sets.txt | \
    awk '{if(NR==1) print "not-shared", "not-split", $0; else print "shared", "split", $0;}' > $dir/phones/roots.txt
else
  # different silence phones will have different GMMs.  [note: here, all "shared split" means
  # is that we may have one GMM for all the states, or we can split on states.  because they're
  # context-independent phones, they don't see the context.]
  cat $srcdir/{,non}silence_phones.txt | utils/apply_map.pl $tmpdir/phone_map.txt > $dir/phones/sets.txt
  cat $dir/phones/sets.txt | awk '{print "shared", "split", $0;}' > $dir/phones/roots.txt
fi

cat $srcdir/silence_phones.txt | utils/apply_map.pl $tmpdir/phone_map.txt | \
  awk '{for(n=1;n<=NF;n++) print $n;}' > $dir/phones/silence.txt
cat $srcdir/nonsilence_phones.txt | utils/apply_map.pl $tmpdir/phone_map.txt | \
  awk '{for(n=1;n<=NF;n++) print $n;}' > $dir/phones/nonsilence.txt
cp $srcdir/optional_silence.txt $dir/phones/optional_silence.txt
cp $dir/phones/silence.txt $dir/phones/context_indep.txt

# if extra_questions.txt is empty, it's OK.
cat $srcdir/extra_questions.txt 2>/dev/null | utils/apply_map.pl $tmpdir/phone_map.txt \
  >$dir/phones/extra_questions.txt

# Want extra questions about the word-start/word-end stuff. Make it separate for
# silence and non-silence. Probably doesn't matter, as silence will rarely
# be inside a word.
if $position_dependent_phones; then
  for suffix in _B _E _I _S; do
    (set -f; for x in `cat $srcdir/nonsilence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt
  done
  for suffix in "" _B _E _I _S; do
    (set -f; for x in `cat $srcdir/silence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt
  done
fi

# add_lex_disambig.pl is responsible for adding disambiguation symbols to
# the lexicon, for telling us how many disambiguation symbols it used,
# and also for modifying the unknown-word's pronunciation (if the
# --unk-fst was provided) to the sequence "#1 #2 #3", and reserving those
# disambig symbols for that purpose.
# The #2 will later be replaced with the actual unk model.  The reason
# for the #1 and the #3 is for disambiguation and also to keep the
# FST compact.  If we didn't have the #1, we might have a different copy of
# the unk-model FST, or at least some of its arcs, for each start-state from
# which an <unk> transition comes (instead of per end-state, which is more compact);
# and adding the #3 prevents us from potentially having 2 copies of the unk-model
# FST due to the optional-silence [the last phone of any word gets 2 arcs].
if [ ! -z "$unk_fst" ]; then  # if the --unk-fst option was provided...
  if "$silprob"; then
    utils/lang/internal/modify_unk_pron.py $tmpdir/lexiconp_silprob.txt "$oov_word" || exit 1
  else
    utils/lang/internal/modify_unk_pron.py $tmpdir/lexiconp.txt "$oov_word" || exit 1
  fi
  unk_opt="--first-allowed-disambig 4"
else
  unk_opt=
fi

if "$silprob"; then
  ndisambig=$(utils/add_lex_disambig.pl $unk_opt --pron-probs --sil-probs $tmpdir/lexiconp_silprob.txt $tmpdir/lexiconp_silprob_disambig.txt)
else
  ndisambig=$(utils/add_lex_disambig.pl $unk_opt --pron-probs $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt)
fi
ndisambig=$[$ndisambig+$num_extra_phone_disambig_syms]; # add (at least) one disambig symbol for silence in lexicon FST.
echo $ndisambig > $tmpdir/lex_ndisambig

# Format of lexiconp_disambig.txt:
# !SIL	1.0   SIL_S
# <SPOKEN_NOISE>	1.0   SPN_S #1
# <UNK>	1.0  SPN_S #2
# <NOISE>	1.0  NSN_S
# !EXCLAMATION-POINT	1.0  EH2_B K_I S_I K_I L_I AH0_I M_I EY1_I SH_I AH0_I N_I P_I OY2_I N_I T_E

( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) >$dir/phones/disambig.txt

# In case there are extra word-level disambiguation symbols they also
# need to be added to the list of phone-level disambiguation symbols.
if [ ! -z "$extra_word_disambig_syms" ]; then
  # We expect a file containing valid word-level disambiguation symbols.
  cat $extra_word_disambig_syms | awk '{ print $1 }' >> $dir/phones/disambig.txt
fi

# Create phone symbol table.
if [[ ! -z $phone_symbol_table ]]; then
  start_symbol=`grep \#0 $phone_symbol_table | awk '{print $2}'`
  echo "<eps>" | cat - $dir/phones/{silence,nonsilence}.txt | awk -v f=$phone_symbol_table '
  BEGIN { while ((getline < f) > 0) { phones[$1] = $2; }} { print $1" "phones[$1]; }' | sort -k2 -g |\
    cat - <(cat $dir/phones/disambig.txt | awk -v x=$start_symbol '{n=x+NR-1; print $1, n;}') > $dir/phones.txt
else
  echo "<eps>" | cat - $dir/phones/{silence,nonsilence,disambig}.txt | \
    awk '{n=NR-1; print $1, n;}' > $dir/phones.txt
fi

# Create a file that describes the word-boundary information for
# each phone.  5 categories.
if $position_dependent_phones; then
  cat $dir/phones/{silence,nonsilence}.txt | \
    awk '/_I$/{print $1, "internal"; next;} /_B$/{print $1, "begin"; next; }
         /_S$/{print $1, "singleton"; next;} /_E$/{print $1, "end"; next; }
         {print $1, "nonword";} ' > $dir/phones/word_boundary.txt
else
  # word_boundary.txt might have been generated by another source
  [ -f $srcdir/word_boundary.txt ] && cp $srcdir/word_boundary.txt $dir/phones/word_boundary.txt
fi

# Create word symbol table.
# <s> and </s> are only needed due to the need to rescore lattices with
# ConstArpaLm format language model. They do not normally appear in G.fst or
# L.fst.

if "$silprob"; then
  # remove the silprob
  cat $tmpdir/lexiconp_silprob.txt |\
    awk '{
      for(i=1; i<=NF; i++) {
        if(i!=3 && i!=4 && i!=5) printf("%s\t", $i); if(i==NF) print "";
      }
    }' > $tmpdir/lexiconp.txt
fi

cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq  | awk '
  BEGIN {
    print "<eps> 0";
  }
  {
    if ($1 == "<s>") {
      print "<s> is in the vocabulary!" | "cat 1>&2"
      exit 1;
    }
    if ($1 == "</s>") {
      print "</s> is in the vocabulary!" | "cat 1>&2"
      exit 1;
    }
    printf("%s %d\n", $1, NR);
  }
  END {
    printf("#0 %d\n", NR+1);
    printf("<s> %d\n", NR+2);
    printf("</s> %d\n", NR+3);
  }' > $dir/words.txt || exit 1;

# In case there are extra word-level disambiguation symbols they also
# need to be added to words.txt
if [ ! -z "$extra_word_disambig_syms" ]; then
  # Since words.txt already exists, we need to extract the current word count.
  word_count=`tail -n 1 $dir/words.txt | awk '{ print $2 }'`

  # We expect a file containing valid word-level disambiguation symbols.
  # The list of symbols is attached to the current words.txt (including
  # a numeric identifier for each symbol).
  cat $extra_word_disambig_syms | \
    awk -v WC=$word_count '{ printf("%s %d\n", $1, ++WC); }' >> $dir/words.txt || exit 1;
fi

# format of $dir/words.txt:
#<eps> 0
#a 1
#aa 2
#aardvark 3
#...

silphone=`cat $srcdir/optional_silence.txt` || exit 1;
[ -z "$silphone" ] && \
  ( echo "You have no optional-silence phone; it is required in the current scripts"
    echo "but you may use the option --sil-prob 0.0 to stop it being used." ) && \
   exit 1;

# create $dir/phones/align_lexicon.{txt,int}.
# This is the method we use for lattice word alignment if we are not
# using word-position-dependent phones.

# First remove pron-probs from the lexicon.
perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' <$tmpdir/lexiconp.txt >$tmpdir/align_lexicon.txt

# Note: here, $silphone will have no suffix e.g. _S because it occurs as optional-silence,
# and is not part of a word.
[ ! -z "$silphone" ] && echo "<eps> $silphone" >> $tmpdir/align_lexicon.txt

cat $tmpdir/align_lexicon.txt | \
  perl -ane '@A = split; print $A[0], " ", join(" ", @A), "\n";' | sort | uniq > $dir/phones/align_lexicon.txt

if [ -f $srcdir/nonterminals.txt ]; then
  utils/lang/grammar/augment_phones_txt.py $dir/phones.txt $srcdir/nonterminals.txt $dir/phones.txt
  utils/lang/grammar/augment_words_txt.py $dir/words.txt $srcdir/nonterminals.txt $dir/words.txt
  cp $srcdir/nonterminals.txt $dir/phones/nonterminals.txt
  utils/sym2int.pl $dir/phones.txt <$dir/phones/nonterminals.txt >$dir/phones/nonterminals.int

  for w in "#nonterm_begin" "#nonterm_end" $(cat $srcdir/nonterminals.txt); do
    echo $w $w  # These are words without pronunciations, so leave those prons
                # empty.
  done >> $dir/phones/align_lexicon.txt
  nonterm_phones_offset=$(grep '#nonterm_bos' <$dir/phones.txt | awk '{print $2}')
  echo $nonterm_phones_offset > $dir/phones/nonterm_phones_offset.int
  echo '#nonterm_bos' > $dir/phones/nonterm_phones_offset.txt  # temporary.

  if [ -f $dir/phones/word_boundary.txt ]; then
    # word-position-dependent system.  Only include the optional-silence phone,
    # and phones that can end a word, plus the special symbol #nonterm_bos, in the
    # left-context phones.
    awk '{if ($2 == "end" || $2 == "singleton") print $1; }' <$dir/phones/word_boundary.txt | \
        cat - $dir/phones/optional_silence.txt $dir/phones/nonterm_phones_offset.txt > $dir/phones/left_context_phones.txt
  else
    cat $dir/phones/{silence,nonsilence}.txt $dir/phones/nonterm_phones_offset.txt > $dir/phones/left_context_phones.txt
  fi
  utils/sym2int.pl $dir/phones.txt <$dir/phones/left_context_phones.txt >$dir/phones/left_context_phones.int

  # we need to write utils/lang/make_lexicon_fst_silprob.py before this can work.
  grammar_opts="--left-context-phones=$dir/phones/left_context_phones.txt --nonterminals=$srcdir/nonterminals.txt"
else
  grammar_opts=
fi

# create phones/align_lexicon.int from phones/align_lexicon.txt
cat $dir/phones/align_lexicon.txt | utils/sym2int.pl -f 3- $dir/phones.txt | \
  utils/sym2int.pl -f 1-2 $dir/words.txt > $dir/phones/align_lexicon.int

# Create the basic L.fst without disambiguation symbols, for use
# in training.

if $silprob; then
  # Add silence probabilities (models the prob. of silence before and after each
  # word).  On some setups this helps a bit.  See utils/dict_dir_add_pronprobs.sh
  # and where it's called in the example scripts (run.sh).
  utils/lang/make_lexicon_fst_silprob.py $grammar_opts --sil-phone=$silphone \
         $tmpdir/lexiconp_silprob.txt $srcdir/silprob.txt | \
     fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
       --keep_isymbols=false --keep_osymbols=false |   \
     fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;
else
  utils/lang/make_lexicon_fst.py $grammar_opts --sil-prob=$sil_prob --sil-phone=$silphone \
            $tmpdir/lexiconp.txt | \
    fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
      --keep_isymbols=false --keep_osymbols=false | \
    fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;
fi

# The file oov.txt contains a word that we will map any OOVs to during
# training.
echo "$oov_word" > $dir/oov.txt || exit 1;
cat $dir/oov.txt | utils/sym2int.pl $dir/words.txt >$dir/oov.int || exit 1;
# integer version of oov symbol, used in some scripts.


# the file wdisambig.txt contains a (line-by-line) list of the text-form of the
# disambiguation symbols that are used in the grammar and passed through by the
# lexicon.  At this stage it's hardcoded as '#0', but we're laying the groundwork
# for more generality (which probably would be added by another script).
# wdisambig_words.int contains the corresponding list interpreted by the
# symbol table words.txt, and wdisambig_phones.int contains the corresponding
# list interpreted by the symbol table phones.txt.
echo '#0' >$dir/phones/wdisambig.txt

# In case there are extra word-level disambiguation symbols they need
# to be added to the existing word-level disambiguation symbols file.
if [ ! -z "$extra_word_disambig_syms" ]; then
  # We expect a file containing valid word-level disambiguation symbols.
  # awk keeps only the first whitespace-separated field of each line.
  cat $extra_word_disambig_syms | awk '{ print $1 }' >> $dir/phones/wdisambig.txt
fi

utils/sym2int.pl $dir/phones.txt <$dir/phones/wdisambig.txt >$dir/phones/wdisambig_phones.int
utils/sym2int.pl $dir/words.txt <$dir/phones/wdisambig.txt >$dir/phones/wdisambig_words.int

# Create these lists of phones in colon-separated integer list form too,
# for purposes of being given to programs as command-line options.
for f in silence nonsilence optional_silence disambig context_indep; do
  utils/sym2int.pl $dir/phones.txt <$dir/phones/$f.txt >$dir/phones/$f.int
  utils/sym2int.pl $dir/phones.txt <$dir/phones/$f.txt | \
   awk '{printf(":%d", $1);} END{printf "\n"}' | sed s/:// > $dir/phones/$f.csl || exit 1;
done

for x in sets extra_questions; do
  utils/sym2int.pl $dir/phones.txt <$dir/phones/$x.txt > $dir/phones/$x.int || exit 1;
done

utils/sym2int.pl -f 3- $dir/phones.txt <$dir/phones/roots.txt \
   > $dir/phones/roots.int || exit 1;

if [ -f $dir/phones/word_boundary.txt ]; then
  utils/sym2int.pl -f 1 $dir/phones.txt <$dir/phones/word_boundary.txt \
    > $dir/phones/word_boundary.int || exit 1;
fi

silphonelist=`cat $dir/phones/silence.csl`
nonsilphonelist=`cat $dir/phones/nonsilence.csl`

# Note: it's OK, after generating the 'lang' directory, to overwrite the topo file
# with another one of your choice if the 'topo' file you want can't be generated by
# utils/gen_topo.pl.  We do this in the 'chain' recipes.  Of course, the 'topo' file
# should cover all the phones.  Try running utils/validate_lang.pl to check that
# everything is OK after modifying the topo file.
utils/gen_topo.pl $num_nonsil_states $num_sil_states $nonsilphonelist $silphonelist >$dir/topo


# Create the lexicon FST with disambiguation symbols, and put it in lang_test.
# There is an extra step where we create a loop to "pass through" the
# disambiguation symbols from G.fst.

if $silprob; then
  utils/lang/make_lexicon_fst_silprob.py $grammar_opts \
     --sil-phone=$silphone --sil-disambig='#'$ndisambig \
     $tmpdir/lexiconp_silprob_disambig.txt $srcdir/silprob.txt | \
     fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
       --keep_isymbols=false --keep_osymbols=false |   \
     fstaddselfloops  $dir/phones/wdisambig_phones.int $dir/phones/wdisambig_words.int | \
     fstarcsort --sort_type=olabel > $dir/L_disambig.fst || exit 1;
else
  utils/lang/make_lexicon_fst.py $grammar_opts \
       --sil-prob=$sil_prob --sil-phone=$silphone --sil-disambig='#'$ndisambig \
         $tmpdir/lexiconp_disambig.txt | \
     fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
       --keep_isymbols=false --keep_osymbols=false |   \
     fstaddselfloops  $dir/phones/wdisambig_phones.int $dir/phones/wdisambig_words.int | \
     fstarcsort --sort_type=olabel > $dir/L_disambig.fst || exit 1;
fi


if [ ! -z "$unk_fst" ]; then
  utils/lang/internal/apply_unk_lm.sh $unk_fst $dir || exit 1

  if ! $position_dependent_phones; then
    echo "$0: warning: you are using the --unk-lm option and setting --position-dependent-phones false."
    echo " ... this will make it impossible to properly work out the word boundaries after"
    echo " ... decoding; quite a few scripts will not work as a result, and many scoring scripts"
    echo " ... will die."
    sleep 4
  fi
fi

echo "$(basename $0): validating output directory"
! utils/validate_lang.pl $dir && echo "$(basename $0): error validating output" &&  exit 1;

exit 0;


================================================
FILE: egs/utils/prepare_online_nnet_dist_build.sh
================================================
#!/usr/bin/env bash

# Copyright 2015  Johns Hopkins University (Author: Vijayaditya Peddinti)
#                 Guoguo Chen
# Apache 2.0
# Script to prepare the distribution from the online-nnet build

# Options (may be overridden on the command line via parse_options.sh).
other_files= #other files to be included in the build
other_dirs=
# Config files under <model-dir>/conf that get packaged.
conf_files="ivector_extractor.conf mfcc.conf online_cmvn.conf online_nnet2_decoding.conf splice.conf"
# Files under <model-dir>/ivector_extractor that get packaged.
ivec_extractor_files="final.dubm final.ie final.mat global_cmvn.stats online_cmvn.conf splice_opts"

echo "$0 $@"  # Print the command line for logging
[ -f path.sh ] && . ./path.sh;
. parse_options.sh || exit 1;

if [ $# -ne 3 ]; then
   echo "Usage: $0 <lang-dir> <model-dir> <output-tgz>"
   echo "e.g.: $0 data/lang exp/nnet2_online/nnet_ms_a_online tedlium.tgz"
   exit 1;
fi

lang=$1
modeldir=$2
tgzfile=$3

# ---- Validate all inputs before modifying anything on disk. ----
for f in $lang/phones.txt $other_files; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

build_files=
for d in $modeldir/conf $modeldir/ivector_extractor; do
  [ ! -d $d ] && echo "$0: no such directory $d" && exit 1;
done

for f in $ivec_extractor_files; do
  f=$modeldir/ivector_extractor/$f
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
  build_files="$build_files $f"
done

# Check the config files up front: failing after some of them have already been
# rewritten would leave $modeldir/conf half-converted.
for f in $conf_files; do
  [ ! -f $modeldir/conf/$f ] && \
    echo "$0: no such file $modeldir/conf/$f" && exit 1;
done

# Makes a copy of the original config files, as we will change the absolute path
# to relative.
rm -rf $modeldir/conf_abs_path
mkdir -p $modeldir/conf_abs_path || exit 1
if ! cp -r $modeldir/conf/* $modeldir/conf_abs_path; then
  # Without a complete backup the restore step below would destroy the configs.
  echo "$0: failed to back up $modeldir/conf to $modeldir/conf_abs_path"
  exit 1
fi

# Put the original (absolute-path) configs back.  Installed as an EXIT trap so
# that it also runs when one of the steps below bails out early.
restore_conf() {
  rm -rf "$modeldir/conf"
  mv "$modeldir/conf_abs_path" "$modeldir/conf"
}
trap restore_conf EXIT

for f in $conf_files; do
  # Changes absolute path to relative path. The path entries in the config file
  # are generated by scripts and it is safe to assume that they have structure:
  # variable=path
  # Only values that name an existing file are rewritten; every other line is
  # copied through unchanged.
  cat $modeldir/conf_abs_path/$f | perl -e '
    use File::Spec;
    while(<STDIN>) {
      chomp;
      @col = split("=", $_);
      if (@col == 2 && (-f $col[1])) {
        $col[1] = File::Spec->abs2rel($col[1]);
        print "$col[0]=$col[1]\n";
      } else {
        print "$_\n";
      }
    }
  ' > $modeldir/conf/$f || exit 1
  build_files="$build_files $modeldir/conf/$f"
done

# -h: archive the files that symlinks point to rather than the links themselves.
tar -hczvf $tgzfile $lang $build_files $other_files $other_dirs \
  $modeldir/final.mdl $modeldir/tree >/dev/null
ret=$?
[ $ret -ne 0 ] && echo "$0: error creating $tgzfile"

# Changes back to absolute path (done by the restore_conf EXIT trap), and
# report tar's status instead of masking it.
exit $ret


================================================
FILE: egs/utils/remove_data_links.sh
================================================
#!/usr/bin/env bash

# This program searches within a directory for soft links that
# appear to be created by 'create_data_link.pl' to a 'storage/' subdirectory,
# and it removes both the soft links and the things they point to.
# for instance, if you have a soft link 
#   foo/egs/1.1.egs -> storage/2/1.1.egs
# it will remove both foo/egs/storage/2/1.1.egs, and foo/egs/1.1.egs.

# Exit status: becomes 1 if any command-line argument is not a directory.
ret=0

# With --dry-run the 'rm' commands are only printed, not executed.
dry_run=false

if [ "$1" == "--dry-run" ]; then
  dry_run=true
  shift
fi

# Note: after printing the usage message the script falls through; with no
# directories the loop below does nothing and the script exits with status 0.
if [ $# == 0 ]; then
  echo "Usage:  $0 [--dry-run] <list-of-directories>"
  echo "e.g.: $0 exp/nnet4a/egs/"
  echo " Removes from any subdirectories of the command-line arguments, soft links that "
  echo " appear to have been created by utils/create_data_link.pl, as well as the things"
  echo " that those soft links point to.  Will typically be called on a directory prior"
  echo " to 'rm -r' on that directory, to ensure that data that was distributed on other"
  echo " volumes also gets deleted."
  echo " With --dry-run, just prints what it would do."
fi

# "$@" plus quoting throughout keeps paths containing whitespace or glob
# characters intact.
for dir in "$@"; do
  if [ ! -d "$dir" ]; then
    echo "$0: not a directory: $dir"
    ret=1
  else
    # NUL-delimited find output so that unusual directory names survive.
    while IFS= read -r -d '' subdir; do
      # Only directories that contain a 'storage' subdirectory are of interest.
      [ -d "$subdir/storage" ] || continue
      for f in "$subdir"/*; do
        # Skip anything that is not a soft link (this also skips the literal
        # pattern left behind when the directory is empty).
        [ -L "$f" ] || continue
        link=$(readlink "$f")
        # Only links of the form 'storage/...' are treated as data links.
        if [[ $link == storage/* ]]; then
          target=$subdir/$link
          if $dry_run; then
            echo rm "$f" "$target"
          else
            rm "$f" "$target"
          fi
        fi
      done
    done < <(find "$dir" -type d -print0)
  fi
done

exit $ret


================================================
FILE: egs/utils/remove_oovs.pl
================================================
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This script removes lines that contain these OOVs on either the
# third or fourth fields  of the line.  It is intended to remove arcs
# with OOVs on, from FSTs (probably compiled from ARPAs with OOVs in).

# Expect the unknown-symbol list, optionally followed by one printed-FST file;
# when the FST file is omitted the FST text is read from standard input.
# (The test must use '||': no argument count is both < 1 and > 2.)
if (@ARGV < 1 || @ARGV > 2) {
    die "Usage: remove_oovs.pl unk_list.txt [ printed-fst ]\n";
}

# Load the unknown symbols (exactly one per line) into the lookup table %unk.
$unklist = shift @ARGV;
open(S, "<$unklist") || die "Failed opening unknown-symbol list $unklist\n";
while(<S>){ 
    @A = split(" ", $_);
    @A == 1 || die "Bad line in unknown-symbol list: $_";
    $unk{$A[0]} = 1;
}
close(S);

# Copy the input through, dropping every line whose third or fourth field is
# one of the unknown symbols.
$num_removed = 0;
while(<>){ 
    @A = split(" ", $_);
    if(defined $unk{$A[2]} || defined $unk{$A[3]}) {
        $num_removed++;
    } else {
        print;
    }
}
# Report the number of dropped lines on stderr so stdout stays clean.
print STDERR "remove_oovs.pl: removed $num_removed lines.\n";


================================================
FILE: egs/utils/reverse_arpa.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 2012 Mirko Hannemann BUT, mirko.hannemann@gmail.com

from __future__ import print_function
import sys
import codecs # for UTF-8/unicode

if len(sys.argv) != 2:
    print('usage: reverse_arpa arpa.in')
    sys.exit()
arpaname = sys.argv[1]

#\data\
#ngram 1=4
#ngram 2=2
#ngram 3=2
#
#\1-grams:
#-5.234679	a -3.3
#-3.456783	b
#0.0000000	<s> -2.5
#-4.333333	</s>
#
#\2-grams:
#-1.45678	a b -3.23
#-1.30490	<s> a -4.2
#
#\3-grams:
#-0.34958	<s> a b
#-0.23940	a b </s>
#\end\

# read language model in ARPA format
try:
  file = codecs.open(arpaname, "r", "utf-8")
except IOError:
  print('file not found: ' + arpaname)
  sys.exit()

text=file.readline()
while (text and text[:6] != "\\data\\"): text=file.readline()
if not text:
  print("invalid ARPA file")
  sys.exit()
#print text,
while (text and text[:5] != "ngram"): text=file.readline()

# get ngram counts
cngrams=[]
n=0
while (text and text[:5] == "ngram"):
  ind = text.split("=")
  counts = int(ind[1].strip())
  r = ind[0].split()
  read_n = int(r[1].strip())
  if read_n != n+1:
    print("invalid ARPA file: {}".format(text))
    sys.exit()
  n = read_n
  cngrams.append(counts)
  #print text,
  text=file.readline()

# read all n-grams order by order
sentprob = 0.0 # sentence begin unigram
ngrams=[]
inf=float("inf")
for n in range(1,len(cngrams)+1): # unigrams, bigrams, trigrams
  while (text and "-grams:" not in text): text=file.readline()
  if n != int(text[1]):
    print("invalid ARPA file:{}".format(text))
    sys.exit()
  #print text,cngrams[n-1]
  this_ngrams={} # stores all read ngrams
  for ng in range(cngrams[n-1]):
    while (text and len(text.split())<2):
      text=file.readline()
      if (not text) or ((len(text.split())==1) and (("-grams:" in text) or (text[:5] == "\\end\\"))): break
    if (not text) or ((len(text.split())==1) and (("-grams:" in text) or (text[:5] == "\\end\\"))):
      break # to deal with incorrect ARPA files
    entry = text.split()
    prob = float(entry[0])
    if len(entry)>n+1:
      back = float(entry[-1])
      words = entry[1:n+1]
    else:
      back = 0.0
      words = entry[1:]
    ngram = " ".join(words)
    if (n==1) and words[0]=="<s>":
      sentprob = prob
      prob = 0.0
    this_ngrams[ngram] = (prob,back)
    #print prob,ngram.encode("utf-8"),back

    for x in range(n-1,0,-1):
      # add all missing backoff ngrams for reversed lm
      l_ngram = " ".join(words[:x]) # shortened ngram
      r_ngram = " ".join(words[1:1+x]) # shortened ngram with offset one
      if l_ngram not in ngrams[x-1]: # create missing ngram
        ngrams[x-1][l_ngram] = (0.0,inf)
        #print ngram, "create 0.0", l_ngram, "inf"
      if r_ngram not in ngrams[x-1]: # create missing ngram
        ngrams[x-1][r_ngram] = (0.0,inf)
        #print ngram, "create 0.0", r_ngram, "inf",x,n,h_ngram

      # add all missing backoff ngrams for forward lm
      h_ngram = " ".join(words[n-x:]) # shortened history
      if h_ngram not in ngrams[x-1]: # create missing ngram
        ngrams[x-1][h_ngram] = (0.0,inf)
        #print "create inf", h_ngram, "0.0"
    text=file.readline()
    if (not text) or ((len(text.split())==1) and (("-grams:" in text) or (text[:5] == "\\end\\"))): break
  ngrams.append(this_ngrams)

while (text and text[:5] != "\\end\\"): text=file.readline()
if not text:
  print("invalid ARPA file")
  sys.exit()
file.close()
#print text,

#fourgram "maxent" model (b(ABCD)=0):
#p(A)+b(A) A 0
#p(AB)+b(AB)-b(A)-p(B) AB 0
#p(ABC)+b(ABC)-b(AB)-p(BC) ABC 0
#p(ABCD)+b(ABCD)-b(ABC)-p(BCD) ABCD 0

#fourgram reverse ARPA model (b(ABCD)=0):
#p(A)+b(A) A 0
#p(AB)+b(AB)-p(B)+p(A) BA 0
#p(ABC)+b(ABC)-p(BC)+p(AB)-p(B)+p(A) CBA 0
#p(ABCD)+b(ABCD)-p(BCD)+p(ABC)-p(BC)+p(AB)-p(B)+p(A) DCBA 0

# compute new reversed ARPA model
# The result is printed to stdout in ARPA layout: the \data\ header with one
# count per order, one "\N-grams:" section per order, and a closing \end\.
# Each stored entry is a (prob, backoff) tuple; a backoff equal to `inf` marks
# an n-gram that was created above only to fill a gap.
print("\\data\\")
for n in range(1,len(cngrams)+1): # unigrams, bigrams, trigrams
  print("ngram {0} = {1}".format(n, len(ngrams[n-1].keys())))
offset = 0.0 # reversed weight of the <s> unigram; set in the n == 1 pass below
for n in range(1,len(cngrams)+1): # unigrams, bigrams, trigrams
  print("\\{}-grams:".format(n))
  keys = sorted(ngrams[n-1].keys())
  for ngram in keys:
    prob = ngrams[n-1][ngram]
    # reverse word order
    words = ngram.split()
    rstr = " ".join(reversed(words))
    # swap <s> and </s>
    rev_ngram = rstr.replace("<s>","<temp>").replace("</s>","<s>").replace("<temp>","</s>")

    # start from p(ngram) and add its backoff weight b(ngram) when it has one
    revprob = prob[0]
    if (prob[1] != inf): # only backoff weights from not newly created ngrams
      revprob = revprob + prob[1]
    #print prob[0],prob[1]
    # sum all missing terms in decreasing ngram order
    for x in range(n-1,0,-1): # x = length of the shortened n-grams
      l_ngram = " ".join(words[:x]) # shortened ngram
      if l_ngram not in ngrams[x-1]:
        # Only reports the problem; the lookup on the next line is still
        # attempted (presumably raising KeyError if ngrams[x-1] is a plain dict).
        sys.stderr.write(rev_ngram+": not found "+l_ngram+"\n")
      p_l = ngrams[x-1][l_ngram][0]
      #print p_l,l_ngram
      revprob = revprob + p_l

      r_ngram = " ".join(words[1:1+x]) # shortened ngram with offset one
      if r_ngram not in ngrams[x-1]:
        sys.stderr.write(rev_ngram+": not found "+r_ngram+"\n")
      p_r = ngrams[x-1][r_ngram][0]
      #print -p_r,r_ngram
      revprob = revprob - p_r

    if n != len(cngrams): #not highest order
      back = 0.0
      if rev_ngram[:3] == "<s>": # special handling since arpa2fst ignores <s> weight
        if n == 1:
          offset = revprob # remember <s> weight
          revprob = sentprob # apply <s> weight from forward model
          back = offset
        elif n == 2:
          revprob = revprob + offset # add <s> weight to bigrams starting with <s>
      # NOTE(review): rev_ngram.encode("utf-8") yields a bytes object on
      # Python 3, which print() would render as b'...'; confirm which
      # interpreter this script is expected to run under.
      if (prob[1] != inf): # only backoff weights from not newly created ngrams
        print(revprob,rev_ngram.encode("utf-8"),back)
      else:
        # gap-filling n-grams get a fixed backoff of -100000.0
        print(revprob,rev_ngram.encode("utf-8"),"-100000.0")
    else: # highest order - no backoff weights
      if (n==2) and (rev_ngram[:3] == "<s>"): revprob = revprob + offset
      print(revprob,rev_ngram.encode("utf-8"))
print("\\end\\")


================================================
FILE: egs/utils/rnnlm_compute_scores.sh
================================================
#!/usr/bin/env bash

# Compute scores from RNNLM.  This script takes a directory
# $dir (e.g. dir=local/rnnlm/rnnlm.voc30.hl30 ),
# where it expects the files:
#  rnnlm  wordlist.rnn  unk.probs,
# and also an input file location where it can get the sentences to score, and
# an output file location to put the scores (negated logprobs) for each
# sentence.  This script uses the Kaldi-style "archive" format, so the input and
# output files will have a first field that corresponds to some kind of
# utterance-id or, in practice, utterance-id-1, utterance-id-2, etc., for the
# N-best list.
#
# Here, "wordlist.rnn" is the set of words, like a vocabulary,
# that the RNN was trained on (note, it won't include <s> or </s>),
# plus <RNN_UNK> which is a kind of class where we put low-frequency
# words; unk.probs gives the probs for words given this class, and it
# has, on each line, "word prob".

# Defaults; both can be overridden on the command line through
# utils/parse_options.sh (sourced below).
rnnlm_ver=rnnlm-0.3e
ensure_normalized_probs=false  # if true then we add the necessary options to
                               # normalize the probabilities of RNNLM
                               # e.g. when using faster-rnnlm in the nce mode

. ./path.sh || exit 1;
. utils/parse_options.sh

# The rnnlm binary is looked up under $KALDI_ROOT/tools/<rnnlm_ver>/.
rnnlm=$KALDI_ROOT/tools/$rnnlm_ver/rnnlm

[ ! -f $rnnlm ] && echo No such program $rnnlm && exit 1;

if [ $# != 4 ]; then
  echo "Usage: rnnlm_compute_scores.sh <rnn-dir> <temp-dir> <input-text> <output-scores>"
  exit 1;
fi

dir=$1
tempdir=$2
text_in=$3
scores_out=$4

# All three model files must be present in <rnn-dir>.
for x in rnnlm wordlist.rnn unk.probs; do
  if [ ! -f $dir/$x ]; then 
    echo "rnnlm_compute_scores.sh: expected file $dir/$x to exist."
    exit 1;
  fi
done

mkdir -p $tempdir
# Split the input into the word sequences (fields 2..NF) and the ids (field 1).
cat $text_in | awk '{for (x=2;x<=NF;x++) {printf("%s ", $x)} printf("\n");}' >$tempdir/text
cat $text_in | awk '{print $1}' > $tempdir/ids # e.g. utterance ids.
# Replace every word that is not in wordlist.rnn by <RNN_UNK>.  For each
# sentence, the sum of log(unk.probs[word]) over the replaced words is written
# to loglikes.oov; words without an entry in unk.probs (and empty sentences)
# contribute log(1.0e-07) and trigger a warning on stderr.
cat $tempdir/text | awk -v voc=$dir/wordlist.rnn -v unk=$dir/unk.probs \
  -v logprobs=$tempdir/loglikes.oov \
 'BEGIN{ while((getline<voc)>0) { invoc[$1]=1; } while ((getline<unk)>0){ unkprob[$1]=$2;} }
  { logprob=0;
    if (NF==0) { printf "<RNN_UNK>"; logprob = log(1.0e-07);
      print "Warning: empty sequence." | "cat 1>&2"; }
    for (x=1;x<=NF;x++) { w=$x;  
    if (invoc[w]) { printf("%s ",w); } else {
      printf("<RNN_UNK> ");
      if (unkprob[w] != 0) { logprob += log(unkprob[w]); }
      else { print "Warning: unknown word ", w | "cat 1>&2"; logprob += log(1.0e-07); }}}
    printf("\n"); print logprob > logprobs } ' > $tempdir/text.nounk

# OK, now we compute the scores on the text with OOVs replaced
# with <RNN_UNK>
# The first output field of the rnnlm binary is multiplied by log(10),
# presumably to convert base-10 log-probabilities to natural logs.

if [ $rnnlm_ver == "faster-rnnlm" ]; then
  extra_options=
  if [ "$ensure_normalized_probs" = true ]; then
    extra_options="--nce-accurate-test 1"
  fi
  $rnnlm $extra_options -independent -rnnlm $dir/rnnlm -test $tempdir/text.nounk -nbest -debug 0 | \
     awk '{print $1*log(10);}' > $tempdir/loglikes.rnn
else
  # add the utterance_id as required by Mikolov's rnnlm
  paste $tempdir/ids $tempdir/text.nounk > $tempdir/id_text.nounk

  $rnnlm -independent -rnnlm $dir/rnnlm -test $tempdir/id_text.nounk -nbest -debug 0 | \
     awk '{print $1*log(10);}' > $tempdir/loglikes.rnn
fi

# Sanity check: one RNN score per sentence, otherwise the paste below would
# misalign the two score files.
[ `cat $tempdir/loglikes.rnn | wc -l` -ne `cat $tempdir/loglikes.oov | wc -l` ] && \
  echo "rnnlm rescoring failed" && exit 1;

# Final score per sentence = -(RNN log-likelihood + OOV log-probability).
paste $tempdir/loglikes.rnn $tempdir/loglikes.oov | awk '{print -($1+$2);}' >$tempdir/scores

# scores out, with utterance-ids.
paste $tempdir/ids $tempdir/scores  > $scores_out


================================================
FILE: egs/utils/s2eps.pl
================================================
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This script replaces <s> and </s> with <eps> (on both input and output sides),
# for the G.fst acceptor.

# Filter: every input line is split on whitespace and re-emitted with its
# fields joined by tabs.  On lines with at least four fields, a "<s>" or
# "</s>" in the third or fourth field is replaced by "<eps>".
while (defined(my $arc_line = <>)) {
    my @fields = split(" ", $arc_line);
    if (scalar(@fields) >= 4) {
        # Fields 2 and 3 (0-based) are the input-side and output-side symbols.
        foreach my $col (2, 3) {
            if ($fields[$col] eq "<s>" || $fields[$col] eq "</s>") {
                $fields[$col] = "<eps>";
            }
        }
    }
    print join("\t", @fields) . "\n";
}


================================================
FILE: egs/utils/scoring/wer_ops_details.pl
================================================
#!/usr/bin/env perl
# Copyright 2015 Johns Hopkins University (Author: Yenda Trmal <jtrmal@gmail.com>)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# These scripts are (or can be) used by scoring scripts to generate
# additional information (such as per-spk wer, per-sentence alignments and so on)
# during the scoring. See the wsj/local/score.sh script for example how
# the scripts are used
# For help and instructions about usage, see the bottom of this file,
# or call it with the parameter --help

use strict;
use warnings;
use Getopt::Long;
use Pod::Usage;


my $help;
my $special_symbol= "<eps>";
my $separator=";";
my $extra_size=4;
my $max_size=16;

# Read all lines from an already opened filehandle and decide how the text
# should be handled.
#
# Parameter: the filehandle to read (e.g. \*STDIN); it is read until EOF.
# Returns:   a list whose first element is a flag and whose remaining
#            elements are the lines (line terminators are kept):
#              (1, lines decoded as UTF-8)  if every line is valid UTF-8,
#              (0, raw, undecoded lines)    otherwise.  The input is then
#                assumed to use a 1-byte encoding such as ISO-8859-x or a
#                Windows code page.
# A note telling which mode was chosen is printed to STDERR.
#
# We do not really care about the actual encoding; we only need the length
# of the (decoded) strings to be correct so that the output columns line up.
sub get_utf8_or_bytestream {
  use Encode qw(decode encode);
  my $file = shift;
  my $is_utf_compatible = 1;
  my @unicode_lines;
  my @raw_lines;

  # Test definedness explicitly so that a final line consisting of "0"
  # without a newline (a false value in Perl) is not silently dropped.
  while (defined(my $raw_text = <$file>)) {
    if ($is_utf_compatible) {
      # decode() croaks on malformed input; eval turns that into undef.
      # LEAVE_SRC is required: with a CHECK value that lacks it, decode()
      # overwrites its source argument, which would leave empty strings in
      # @raw_lines for every line decoded before the first invalid one.
      my $decoded_text = eval {
        decode("UTF-8", $raw_text, Encode::FB_CROAK | Encode::LEAVE_SRC)
      };
      if (defined($decoded_text)) {
        push @unicode_lines, $decoded_text;
      } else {
        $is_utf_compatible = 0;
      }
    }
    push @raw_lines, $raw_text;
  }

  if (!$is_utf_compatible) {
    print STDERR "$0: Note: handling as byte stream\n";
    return (0, @raw_lines);
  }
  print STDERR "$0: Note: handling as utf-8 text\n";
  return (1, @unicode_lines);
}
# Stub: copies its four positional arguments into locals and does nothing
# else with them.  It is not called anywhere in this script.
sub print_line {
  my $op = $_[0];
  my $rewf = $_[1];
  my $hypw = $_[2];
  my $nofop = $_[3];

}

# Reduce the argument list to its numerically largest element by repeatedly
# discarding the smaller of the first and the last argument.
# Returns the remaining argument list (a single element for non-empty input).
sub max {
  while (scalar(@_) > 1) {
    if ($_[0] < $_[-1]) {
      shift @_;   # the first element is smaller: drop it
    } else {
      pop @_;     # the last element is smaller or equal: drop it
    }
  }
  return @_;
}


# Parse the command-line switches.  The input is read from STDIN, so any
# argument left in @ARGV after option parsing is reported as a usage error.
GetOptions(
  "special-symbol=s" => \$special_symbol,
  "separator=s"      => \$separator,
  "help|?"           => \$help,
) or pod2usage(2);

if ($help) {
  pod2usage(1);
}
if (scalar(@ARGV) != 0) {
  pod2usage("$0: Too many files given.\n");
}

# %UTT maps utterance-id -> { "ref" => [tokens], "hyp" => [tokens] };
# %EDIT_OPS is filled afterwards with counts per (ref word, hyp word) pair.
my %EDIT_OPS;
my %UTT;
(my $is_utf8, my @text) = get_utf8_or_bytestream(\*STDIN);
if ($is_utf8) {
  binmode(STDOUT, ":utf8");
}

# Collect the "ref" and "hyp" rows of every utterance; all other rows
# (e.g. "op" or "#csid") are ignored.
while (@text) {
  my $line = shift @text;
  chomp $line;
  my @entries = split(" ", $line);
  next if  @entries < 2;
  next if  ($entries[1] ne "hyp") and ($entries[1] ne "ref") ;
  if (scalar @entries <= 2 ) {
    # Report the offending line itself: $_ is never set inside this loop, so
    # interpolating it printed an empty string plus an "uninitialized" warning.
    print STDERR "$0: Warning: skipping entry \"$line\", either an  empty phrase or incompatible format\n" ;
    next;
  }

  die "The input stream contains duplicate entry $entries[0] $entries[1]\n"
    if exists $UTT{$entries[0]}->{$entries[1]};
  push @{$UTT{$entries[0]}->{$entries[1]}}, @entries[2..$#entries];
}

# Walk over the utterances in sorted order and count every aligned
# (reference word, hypothesis word) pair in %EDIT_OPS.  Each utterance must
# provide both rows, and both rows must have the same number of tokens.
foreach my $utterance (sort keys %UTT) {
  my $rows = $UTT{$utterance};

  if (!exists $rows->{"hyp"}) {
    die "The input stream does not contain entry \"hyp\" for utterance $utterance\n";
  }
  if (!exists $rows->{"ref"}) {
    die "The input stream does not contain entry \"ref\" for utterance $utterance\n";
  }

  my ($hyp, $ref) = ($rows->{"hyp"}, $rows->{"ref"});

  if (scalar(@{$hyp}) != scalar(@{$ref})) {
    die "The \"ref\" an \"hyp\" entries do not have the same number of fields";
  }

  foreach my $pos (0 .. $#{$hyp}) {
    $EDIT_OPS{ $ref->[$pos] }->{ $hyp->[$pos] } += 1;
  }
}

# Work out the column widths for the report:
#   $word_len -- length of the longest word, capped at $max_size
#   $ops_len  -- number of digits of the largest count
my $word_len = 0;
my $ops_len = 0;
foreach my $refw (sort keys %EDIT_OPS) {
  my $row = $EDIT_OPS{$refw};
  foreach my $hypw (sort keys %{$row}) {
    my ($ref_chars, $hyp_chars) = (length($refw), length($hypw));
    my $longer = $ref_chars > $hyp_chars ? $ref_chars : $hyp_chars;
    $word_len = $longer if $longer > $word_len;

    my $digits = length(sprintf("%d", $row->{$hypw}));
    $ops_len = $digits if $digits > $ops_len;
  }
}

# Over-long words only affect whitespace, so the width is capped silently.
$word_len = $max_size if $word_len > $max_size;


# Print one row per (reference word, hypothesis word) pair, classified as
# correct / insertion / deletion / substitution.  The checks are made in that
# order, so identical words count as "correct" even when both are the special
# symbol.
foreach my $refw (sort keys %EDIT_OPS) {
  foreach my $hypw (sort keys %{ $EDIT_OPS{$refw} }) {
    my $count = $EDIT_OPS{$refw}->{$hypw};
    my $row_format;
    if ($refw eq $hypw) {
      $row_format = "correct       %${word_len}s    %${word_len}s    %${ops_len}d\n";
    } elsif ($refw eq $special_symbol) {
      $row_format = "insertion     %${word_len}s    %${word_len}s    %${ops_len}d\n";
    } elsif ($hypw eq $special_symbol) {
      $row_format = "deletion      %${word_len}s    %${word_len}s    %${ops_len}d\n";
    } else {
      $row_format = "substitution  %${word_len}s    %${word_len}s    %${ops_len}d\n";
    }
    print sprintf($row_format, $refw, $hypw, $count);
  }
}
exit 0;
__END__
=head1 NAME
  wer_ops_details.pl -- generate aggregated ops statistics

=head1 SYNOPSIS

  wer_ops_details.pl

  Options:
    --special-symbol        special symbol used in align-text to denote empty word
                            in case insertion or deletion ("<eps>" by default)
    --help                  Print this help

=head1 DESCRIPTION
  The program generates global statistic on how many time was each word
  recognized correctly, confused as another word, incorrectly deleted or inserted.
  The output will contain similar info as the sclite dtl file, the format is,
  however, completely different.


=head1 EXAMPLE INPUT AND OUTPUT
  Input:
    UTT-A ref  word-A   <eps>  word-B  word-C  word-D  word-E
    UTT-A hyp  word-A  word-A  word-B   <eps>  word-D  word-X

  Output:
    correct       word-A  word-A  1
    correct       word-B  word-B  1
    correct       word-D  word-D  1
    deletion      word-C  <eps>   1
    insertion     <eps>   word-A  1
    substitution  word-E  word-X  1


  Note:
    The input can contain other lines as well -- those will be ignored during
    reading the input. I.E. this is a completely legal input:

      UTT-A ref  word-A   <eps>  word-B  word-C  word-D  word-E
      UTT-A hyp  word-A  word-A  word-B   <eps>  word-D  word-X
      UTT-A op      C       I       C       D       C       S
      UTT-A #csid 3 1 1 1
=cut


================================================
FILE: egs/utils/scoring/wer_per_spk_details.pl
================================================
#!/usr/bin/env perl
# Copyright 2015 Johns Hopkins University (Author: Yenda Trmal <jtrmal@gmail.com>)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# These scripts are (or can be) used by scoring scripts to generate 
# additional information (such as per-spk wer, per-sentence alignments and so on) 
# during the scoring. See the wsj/local/score.sh script for example how 
# the scripts are used
# For help and instructions about usage, see the bottom of this file, 
# or call it with the parameter --help
 
use strict;
use warnings;
use List::Util qw[max];
use Getopt::Long;
use Pod::Usage;


#use Data::Dumper;

# Default column widths; both can be overridden on the command line.
my $WIDTH=10;        # width of every numeric column
my $SPK_WIDTH=15;    # width of the speaker-id column
my $help;

# Both width options take an integer value.  Without the "=i" specifier
# Getopt::Long treats them as boolean flags: the variable is set to 1 and the
# number that follows stays in @ARGV, where it is then rejected as an extra
# file argument.
GetOptions("spk-field-width=i" => \$SPK_WIDTH,
           "field-width=i" => \$WIDTH,
           "help|?" => \$help
           ) or pod2usage(2);
pod2usage(1) if $help;
# Exactly one positional argument (the utt2spk file) is required.
pod2usage("$0: Too many files given.\n")  if (@ARGV != 1);

my %UTTMAP;
my %PERSPK_STATS;

# Read all lines from an already opened filehandle and decide how the text
# should be handled.
#
# Parameter: the filehandle to read (e.g. \*STDIN); it is read until EOF.
# Returns:   a list whose first element is a flag and whose remaining
#            elements are the lines (line terminators are kept):
#              (1, lines decoded as UTF-8)  if every line is valid UTF-8,
#              (0, raw, undecoded lines)    otherwise.  The input is then
#                assumed to use a 1-byte encoding such as ISO-8859-x or a
#                Windows code page.
# A note telling which mode was chosen is printed to STDERR.
#
# We do not really care about the actual encoding; we only need the length
# of the (decoded) strings to be correct so that the output columns line up.
sub get_utf8_or_bytestream {
  use Encode qw(decode encode);
  my $file = shift;
  my $is_utf_compatible = 1;
  my @unicode_lines;
  my @raw_lines;

  # Test definedness explicitly so that a final line consisting of "0"
  # without a newline (a false value in Perl) is not silently dropped.
  while (defined(my $raw_text = <$file>)) {
    if ($is_utf_compatible) {
      # decode() croaks on malformed input; eval turns that into undef.
      # LEAVE_SRC is required: with a CHECK value that lacks it, decode()
      # overwrites its source argument, which would leave empty strings in
      # @raw_lines for every line decoded before the first invalid one.
      my $decoded_text = eval {
        decode("UTF-8", $raw_text, Encode::FB_CROAK | Encode::LEAVE_SRC)
      };
      if (defined($decoded_text)) {
        push @unicode_lines, $decoded_text;
      } else {
        $is_utf_compatible = 0;
      }
    }
    push @raw_lines, $raw_text;
  }

  if (!$is_utf_compatible) {
    print STDERR "$0: Note: handling as byte stream\n";
    return (0, @raw_lines);
  }
  print STDERR "$0: Note: handling as utf-8 text\n";
  return (1, @unicode_lines);
}

# Build (and return, not print) the header row of the report.  The speaker
# column is left-aligned in $SPK_WIDTH characters; each of the eight titles
# is right-aligned in $WIDTH characters.
sub print_header {
  my @titles = ("#SENT", "#WORD", "Corr", "Sub", "Ins", "Del", "Err", "S.Err");
  my $cell = "%${WIDTH}s";
  my $layout = "%-${SPK_WIDTH}s id  " . join(" ", ($cell) x scalar(@titles)) . "\n";
  return sprintf($layout, "SPEAKER", @titles);
}
# Format one "raw" row: the speaker label followed by eight integer columns.
# Arguments (in order): speaker, #sentences, #words, correct, substitutions,
# insertions, deletions, errors, sentence errors.
# Returns the formatted line, newline included.
sub format_raw {
  my ($spk, $sent, $word, $c, $s, $i, $d, $err, $serr) = @_;

  my $int_cell = "%${WIDTH}d";
  return sprintf("%-${SPK_WIDTH}s raw " . join(" ", ($int_cell) x 8) . "\n",
                 $spk, $sent, $word, $c, $s, $i, $d, $err, $serr);
}
# Format one "sys" row: the speaker label, two integer columns (#sentences,
# #words) and six columns printed with two decimals.
# Arguments (in order): speaker, #sentences, #words, correct, substitutions,
# insertions, deletions, errors, sentence errors.
# Returns the formatted line, newline included.
sub format_sys {
  my ($spk, $sent, $word, $c, $s, $i, $d, $err, $serr) = @_;

  my $int_cell = "%${WIDTH}d";
  my $float_cell = "%${WIDTH}.2f";
  my @cells = (($int_cell) x 2, ($float_cell) x 6);
  return sprintf("%-${SPK_WIDTH}s sys " . join(" ", @cells) . "\n",
                 $spk, $sent, $word, $c, $s, $i, $d, $err, $serr);
}

# Load the utterance -> speaker map from the utt2spk file given as the only
# positional argument.  Every line must have exactly two fields.
open(UTT2SPK,$ARGV[0]) or die "Could not open the utt2spk file $ARGV[0]";

(my $utt_is_utf8, my @utt_lines) = get_utf8_or_bytestream(\*UTT2SPK);
die "Cannot read file" unless @utt_lines;

while (@utt_lines) {
  my $line = shift @utt_lines;
  chomp $line;
  my @F=split(" ", $line);
  # Report the offending line itself: $_ is never set inside this loop.
  die "Incompatible format of the utt2spk file: $line" if @F != 2;
  $UTTMAP{$F[0]} = $F[1];
  # Set width of speaker column by its longest label,
  if($SPK_WIDTH < length($F[1])) { $SPK_WIDTH = length($F[1]) }
}
close(UTT2SPK);

# Read the per-utterance details from STDIN and accumulate the "#csid"
# counts (correct, substitutions, insertions, deletions) per speaker.
# Rows of any other kind are ignored.
(my $is_utf8, my @text) = get_utf8_or_bytestream(\*STDIN);
if ($is_utf8) {
  binmode(STDOUT, ":utf8");
}

while (@text) {
  my $line = shift @text;
  chomp $line;
  my @entries = split(" ", $line);
  next if  @entries < 2;
  next if  $entries[1] ne "#csid" ; 
  # Report the offending line itself: $_ is never set inside this loop.
  die "Incompatible entry $line " if @entries != 6;

  my $c=$entries[2]; 
  my $s=$entries[3]; 
  my $i=$entries[4]; 
  my $d=$entries[5]; 
  
  my $UTT=$entries[0];
  my $SPK=$UTTMAP{$UTT};
  $PERSPK_STATS{$SPK}->{"C"} += $c;
  $PERSPK_STATS{$SPK}->{"S"} += $s;
  $PERSPK_STATS{$SPK}->{"I"} += $i;
  $PERSPK_STATS{$SPK}->{"D"} += $d;
  $PERSPK_STATS{$SPK}->{"SENT"} += 1;
  # An utterance counts as a sentence error if it contains any edit.
  $PERSPK_STATS{$SPK}->{"SERR"} += 1 if ($s + $i + $d != 0);
}

# Grand totals over all speakers.
my ($C, $S, $I, $D) = (0, 0, 0, 0);
my ($SENT, $WORD, $ERR, $SERR) = (0, 0, 0, 0);

print print_header();

# One "raw" row (absolute counts) per speaker, followed by a "sys" row
# (percentages) unless the speaker has no reference words.
foreach my $SPK (sort keys %PERSPK_STATS) {
  my $stats = $PERSPK_STATS{$SPK};
  my ($c, $s, $i, $d) = @{$stats}{"C", "S", "I", "D"};
  my $sent = $stats->{"SENT"};
  my $serr = $stats->{"SERR"} // 0;   # never set if the speaker had no errors
  my $word = $c + $s + $d;            # reference words
  my $err = $s + $d + $i;
  my $spk = "$SPK";

  $C += $c;
  $S += $s;
  $I += $i;
  $D += $d;
  $SENT += $sent;
  $SERR += $serr;

  print format_raw($spk, $sent, $word, $c, $s, $i, $d, $err, $serr);
  my $w = 1.0 * $word;
  if ($w != 0) {
    print format_sys($spk, $sent, $word,
                     100 * $c/$w, 100 * $s/$w, 100 * $i/$w, 100 * $d/$w,
                     100 * $err/$w, 100.0 * $serr/$sent);
  }
}

# Summary rows over all speakers.
$WORD = $C + $S + $D;
$ERR = $S + $D + $I;
my $W = 1.0 * $WORD;

print format_raw("SUM", $SENT, $WORD, $C, $S, $I, $D, $ERR, $SERR);
if ($W != 0) {
  print format_sys("SUM", $SENT, $WORD,
                   100* $C/$W, 100*$S/$W, 100*$I/$W, 100*$D/$W,
                   100*$ERR/$W, 100.0 * $SERR/$SENT);
}


 __END__

=head1 NAME
  wer_per_spk_details.pl -- generate aggregated per-speaker details

=head1 SYNOPSIS

  wer_per_spk_details.pl  data/dev/utt2spk

  Options:
    --spk-field-width         Width of the first field (spk ID field)
    --field-width             Width of the fields (with exception of the SPK ID 
                              field)

=head1 DESCRIPTION
  This program aggregates the per-utterance output from utils/wer_per_utt_details.pl
  It cares only about the "#csid" field (counts of Corr, Sub, Ins and Del);

  It expects one parameter -- file in the format of the kaldi utt2spk.
  In case the SPK ID is longer than 15 characters, the parameter spk-field-width
  can be used; the same for all other fields and field-width parameter.
  The field-width parameter should not be necessary under normal circumstances.

=head1 EXAMPLE INPUT AND OUTPUT
  Input:
    UTT-A #csid 3 1 1 1

  Output:
    SPEAKER         id       #SENT      #WORD       Corr        Sub        Ins        Del        Err      S.Err
    A               raw          1          5          3          1          1          1          3          1
    A               sys          1          5      60.00      20.00      20.00      20.00      60.00     100.00
    SUM             raw          1          5          3          1          1          1          3          1
    SUM             sys          1          5      60.00      20.00      20.00      20.00      60.00     100.00
    
    The input can contain other lines as well -- those will be ignored during
    reading the input. I.E. this is a completely legal input:
      
      UTT-A ref  word-A   <eps>  word-B  word-C  word-D  word-E
      UTT-A hyp  word-A  word-A  word-B   <eps>  word-D  word-X
      UTT-A op      C       I       C       D       C       S
      UTT-A #csid 3 1 1 1

=cut


================================================
FILE: egs/utils/scoring/wer_per_utt_details.pl
================================================
#!/usr/bin/env perl
# Copyright 2015 Johns Hopkins University (Author: Yenda Trmal <jtrmal@gmail.com>)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


#These scripts are (or can be) used by scoring scripts to generate 
#additional information (such as per-spk wer, per-sentence alignments and so on) 
#during the scoring. See the wsj/local/score.sh script for example how 
#the scripts are used
#For help and instructions about usage, see the bottom of this file, 
#or call it with the parameter --help
#
use strict;
use warnings;
use List::Util qw[max];
use Getopt::Long;
use Pod::Usage;


#use Data::Dumper;

# Defaults for the command-line options.
my $special_symbol = "<eps>";   # token that marks an empty word
my $separator = ";";            # token expected between word pairs
# All four output rows are enabled unless switched off with --no<option>.
my ($output_hyp, $output_ref, $output_ops, $output_csid) = (1, 1, 1, 1);
my $help;

GetOptions(
  "special-symbol=s" => \$special_symbol,
  "separator=s"      => \$separator,
  "output-hyp!"      => \$output_hyp,
  "output-ref!"      => \$output_ref,
  "output-ops!"      => \$output_ops,
  "output-csid!"     => \$output_csid,
  "help|?"           => \$help,
) or pod2usage(2);

if ($help) {
  pod2usage(1);
}
# The alignments are read from STDIN, so positional arguments are an error.
if (scalar(@ARGV) != 0) {
  pod2usage("$0: Too many parameters.\n");
}

# rjustify(STRING, WIDTH): pad STRING with spaces on the right up to WIDTH
# characters.  NOTE: the "%-" flag left-aligns the text, i.e. the padding
# (not the text) ends up on the right.
sub rjustify {
  my ($str, $maxlen) = @_;
  my $padded = sprintf("%-${maxlen}s", $str);
  return $padded;
}
# ljustify(STRING, WIDTH): pad STRING with spaces on the left up to WIDTH
# characters.  NOTE: this right-aligns the text, i.e. the padding (not the
# text) ends up on the left.
sub ljustify {
  my ($str, $maxlen) = @_;
  my $padded = sprintf("%${maxlen}s", $str);
  return $padded;
}
# cjustify(STRING, WIDTH): centre STRING in a field of WIDTH characters.
# When the padding cannot be split evenly the extra space goes to the left.
# Strings longer than WIDTH are returned unchanged.
sub cjustify {
  my ($str, $maxlen) = @_;
  my $slack = $maxlen - length($str);
  my $pad_right = int($slack / 2);
  my $pad_left = $slack - $pad_right;
  # A negative repeat count yields the empty string.
  return (" " x $pad_left) . $str . (" " x $pad_right);
}

# Read all lines from an already opened filehandle and decide how the text
# should be handled.
#
# Parameter: the filehandle to read (e.g. \*STDIN); it is read until EOF.
# Returns:   a list whose first element is a flag and whose remaining
#            elements are the lines (line terminators are kept):
#              (1, lines decoded as UTF-8)  if every line is valid UTF-8,
#              (0, raw, undecoded lines)    otherwise.  The input is then
#                assumed to use a 1-byte encoding such as ISO-8859-x or a
#                Windows code page.
# A note telling which mode was chosen is printed to STDERR.
#
# We do not really care about the actual encoding; we only need the length
# of the (decoded) strings to be correct so that the output columns line up.
sub get_utf8_or_bytestream {
  use Encode qw(decode encode);
  my $file = shift;
  my $is_utf_compatible = 1;
  my @unicode_lines;
  my @raw_lines;

  # Test definedness explicitly so that a final line consisting of "0"
  # without a newline (a false value in Perl) is not silently dropped.
  while (defined(my $raw_text = <$file>)) {
    if ($is_utf_compatible) {
      # decode() croaks on malformed input; eval turns that into undef.
      # LEAVE_SRC is required: with a CHECK value that lacks it, decode()
      # overwrites its source argument, which would leave empty strings in
      # @raw_lines for every line decoded before the first invalid one.
      my $decoded_text = eval {
        decode("UTF-8", $raw_text, Encode::FB_CROAK | Encode::LEAVE_SRC)
      };
      if (defined($decoded_text)) {
        push @unicode_lines, $decoded_text;
      } else {
        $is_utf_compatible = 0;
      }
    }
    push @raw_lines, $raw_text;
  }

  if (!$is_utf_compatible) {
    print STDERR "$0: Note: handling as byte stream\n";
    return (0, @raw_lines);
  }
  print STDERR "$0: Note: handling as utf-8 text\n";
  return (1, @unicode_lines);
}

# Main filter loop: every input line is "<utt-id> <ref> <hyp> <sep> <ref>
# <hyp> <sep> ...", where <sep> is a whitespace-delimited token of its own.
# For each utterance the enabled ref / hyp / op / #csid rows are printed.
(my $is_utf8, my @text) = get_utf8_or_bytestream(\*STDIN);
if ($is_utf8) {
  binmode(STDOUT, ":utf8");
}

while (@text) {
  my $line = shift @text;
  chomp $line;
  (my $utt_id, my $alignment) = split (" ", $line, 2);
  # A line that carries only the utterance id (empty alignment) leaves
  # $alignment undefined; treat it as an empty alignment instead of passing
  # undef to split.
  $alignment = "" unless defined $alignment;
  my @alignment_pairs = split(" ", $alignment); #splits on spaces, does not create empty fields
 
  my @HYP;
  my @REF;
  my @OP;
  my %OPCOUNTS= (
    "I" => 0,
    "D" => 0,
    "S" => 0,
    "C" => 0
  );

  while(@alignment_pairs) {
    my $ref = shift @alignment_pairs;
    my $hyp = shift @alignment_pairs;
    # A reference token without a matching hypothesis token means the line is
    # truncated or malformed; fail like the separator check below does rather
    # than emitting rows built from undef.
    die "Incomplete ref/hyp pair in the alignment of utterance $utt_id\n" unless defined $hyp;
    if (@alignment_pairs) {
      my $sep = shift @alignment_pairs;
      die "Detected incorrect separator $sep (expected $separator).\n" unless ($sep eq $separator);
    }

    push @HYP, $hyp;
    push @REF, $ref;

    # Classification order matters: an empty hypothesis is a deletion, an
    # empty reference an insertion, then substitution vs. correct.
    if ( $hyp eq $special_symbol ) {
      push @OP, "D";
      $OPCOUNTS{"D"} +=1;
    } elsif ( $ref eq $special_symbol ) {
      push @OP, "I";
      $OPCOUNTS{"I"} +=1;
    } elsif ($ref ne $hyp ) {
      push @OP, "S";
      $OPCOUNTS{"S"} +=1;
    } else {
      push @OP, "C";
      $OPCOUNTS{"C"} +=1;
    }
  }

  die "Number of edit ops is not equal to the length of the text for utterance $utt_id\n" if scalar(@OP) != scalar(@HYP);
   
  # Centre the three entries of every column in the width of the widest one
  # so that ref, hyp and op line up vertically.
  my @hyp_str;
  my @ref_str;
  my @op_str;
  for (my $i=0; $i <= $#OP; $i+=1) {
    my $maxlen=max(length($REF[$i]), length($HYP[$i]), length($OP[$i]));

    push @ref_str, cjustify($REF[$i], $maxlen);
    push @hyp_str, cjustify($HYP[$i], $maxlen);
    push @op_str, cjustify($OP[$i], ${maxlen});
  }
  print $utt_id . " ref  " . join("  ", @ref_str) . "\n" if $output_ref;
  print $utt_id . " hyp  " . join("  ", @hyp_str) . "\n" if $output_hyp;
  print $utt_id . " op   " . join("  ", @op_str) . "\n" if $output_ops;
  print $utt_id . " #csid" . " " .$OPCOUNTS{"C"} . " " . $OPCOUNTS{"S"} . " " . $OPCOUNTS{"I"} . " " . $OPCOUNTS{"D"} . "\n" if $output_csid;
}


 __END__

=head1 NAME
  wer_per_utt_details.pl -- generate detailed stats

=head1 SYNOPSIS

  Example:
    align-text ark:text.filt ark:10.txt ark,t:-  | wer_per_utt_details.pl

  Options:
    --special-symbol        special symbol used in align-text to denote empty word 
                            in case insertion or deletion ("<eps>" by default)
    --separator             special symbol used to separate individual word-pairs
                            in the align-text output (";" by default)

    --[no]output-hyp        disable/enable printing of the hyp (hypothesis) entry
    --[no]output-ref        disable/enable printing of the ref (reference) entry
    --[no]output-ops        disable/enable printing of the ops (edit operations) entry
    --[no]output-csid       disable/enable printing of the #csid entry (counts
                            of the individual edit operations)

=head1 DESCRIPTION
    The program works as a filter -- reads the output from align-text program,
    parses it and outputs the requested entries on the output. The format of
    the entries was chosen so that it allows for easy parsing while being human
    readable.

    By default, all entries (hyp, ref, ops, #csid) are printed. 

    The filter can be used (for example) to generate detailed statistics
    from scoring (similar to the dtl/prf output of the sctk sclite output)

=head1 EXAMPLE INPUT AND OUTPUT
  Input:
    UTT-A word-A word-A ; <eps> word-A ; word-B word-B ; word-C <eps> ; word-D word-D ; word-E word-X ;

  Output:
    UTT-A ref  word-A   <eps>  word-B  word-C  word-D  word-E
    UTT-A hyp  word-A  word-A  word-B   <eps>  word-D  word-X
    UTT-A op      C       I       C       D       C       S
    UTT-A #csid 3 1 1 1

=cut


================================================
FILE: egs/utils/scoring/wer_report.pl
================================================
#!/usr/bin/env perl
# Copyright 2015 Johns Hopkins University (author: Jan Trmal <jtrmal@gmail.com>)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# This script reads per-utt table generated for example during scoring
# and outputs the WER similar to the format the compute-wer utility 
# or the utils/best_wer.pl produces
# i.e. from table containing lines in this format
# SUM raw 23344 243230 176178 46771 9975 20281 77027 16463
# produces something output like this
# %WER 31.67 [ 77027 / 243230, 9975 ins, 20281 del, 46771 sub ] 
# NB: if the STDIN stream will contain more of the SUM raw entries,
#     the best one will be found and printed 
#
# If the script is called with parameters, it uses them to provide 
# a description of the output
# i.e.
# cat per-spk-report | utils/scoring/wer_report.pl Full set
# the following output will be produced
# %WER 31.67 [ 77027 / 243230, 9975 ins, 20281 del, 46771 sub ] Full set


# Scan STDIN for "SUM raw" rows and remember the one with the smallest error
# count (field 8, 0-based); the variables stay undefined until a row is seen.
my ($wer, $words, $corr, $sub, $ins, $del);

while (defined(my $row = <STDIN>)) {
  next unless $row =~ m:SUM\s+raw:;
  my @F = split(" ", $row);
  # Keep the earlier row unless this one has strictly fewer errors.
  next if defined($wer) && !($wer > $F[8]);
  ($words, $corr, $sub, $ins, $del, $wer) = @F[3 .. 8];
}

# Nothing is printed when the input contained no "SUM raw" row.
if (defined $wer) {
  my $wer_str = sprintf("%.2f", (100.0 * $wer) / $words);
  my $report = "%WER $wer_str [ $wer / $words,  $ins ins, $del del, $sub sub ]";
  # Optional description taken from the command-line arguments.
  $report .= " " . join(" ", @ARGV) if @ARGV > 0;
  print $report . "\n";
}


================================================
FILE: egs/utils/segmentation.pl
================================================
#!/usr/bin/env perl
use warnings; #sed replacement for -w perl parameter
# Copyright 2013  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.

# This program is for segmentation of data, e.g. long telephone conversations,
# into short chunks.  The input (stdin) should be a sequence of lines like
# sw0-20348-A  0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 2 2 2 2 2 2 2 2 2 2 ...  2 2 0 0 0
# where there is a number for each frame and the numbers mean 0 for silence, 1
# for noise, laughter and other nonspeech events, and 2 for speech.  This will
# typically be derived from some kind of fast recognition (see
# ../steps/resegment_data.sh), followed by ali-to-phones --per-frame=true and
# then mapping phones to these classes 0, 1 and 2.
#
# The algorithm is as follows:
#  (1) Find contiguous sequences of classes 1 or 2 (i.e. speech and/or noise), with e.g.
#      "1 1 1 2 2" counted as a single contiguous sequence.  Each such sequence is an
#      initial segment.
#  (2) While the proportion of silence in the segments is less than $silence_proportion,
#      add a single silence frame to the left and right of each segment, as long
#      as this does not take us past the ends of the file or into another segment.  
#      At this point, do not merge segments.
#  (3) Merging segments:
#      Get a list of all boundaries between segments that ended up touching each other
#      during phase 2.  Sort them according to the number of silence frames at the boundary,
#      with those with the least silence to be processed first.  Go through the boundaries
#      in order, merging each pair of segments, as long as doing so does not create
#      a segment larger than $max_segment_length.
#  (4) Splitting excessively long segments:
#      For all segments that are longer than $hard_max_segment_length, split them equally
#      into the smallest number of pieces such that the pieces will be no longer than
#      $hard_max_segment_length.  Print a warning.
#  (5) Removing any segments that contain no speech (i.e. removing segments
#      that have only silence and noise).
#
#  By default, the utterance-ids will be of the form <RECORDING-ID>-<START-TIME>-<END-TIME>,
#  where <START-TIME> and <END-TIME> are measured 0.01 seconds, using fixed-width
#  integers with enough digits to print out all the segments (the number of digits being
#  decided per line of the input).  For instance, if the input recording-id was
#  sw0-20348-A, an example line of the "segments-file" output would be:
#   sw0-20348-A-00124-00298 sw0-20348-A 1.24 2.98
#  (interpreted as <UTTERANCE-ID> <RECORDING-ID> <START-TIME> <END-TIME>)
#  and the number of digits has to be that large because the same recording has
#  a segment something like
#   sw0-20348-A-13491-13606 sw0-20348-A 134.91 136.06
#  The "_" and "-" in the output are separately configurable by means of the
#  --first-separator and --second-separator options.  However, generally speaking,
#  it is safer to use "-" than, say, "_", because "-" appears very early in the
#  ASCII table, and using it as the separator will tend to ensure that when
#  you sort the utterances and the recording-ids they will sort the same way.
#  This matters because recording-ids will often equal speaker-ids, and Kaldi scripts
#  require that the utterance-ids and speaker-ids sort in the "same order".


use Getopt::Long;

# ---- Tunable parameters (defaults; each may be overridden on the command line) ----

# Silence is added at the sides of segments until roughly this proportion
# of the segment frames is silence.
$silence_proportion = 0.2;

# Frame shift in seconds.  Used to interpret the segment-length options and
# to convert frame indices to the seconds written to the "segments" file.
$frame_shift = 0.01;

# While merging, never create a segment longer than this many seconds.
$max_segment_length = 15.0;

# Hard limit in seconds: longer segments are broken up (with a warning)
# even if there is no silence at the break points.
$hard_max_segment_length = 30.0;

# Separators used inside the utterance-id: between recording-id and
# start-time, and between start-time and end-time respectively.
$first_separator = "-";
$second_separator = "-";

# Boolean given as the string "true" or "false"; if "true", segments that
# contain no speech are removed.
$remove_noise_only_segments = "true";


GetOptions(
  'silence-proportion:f'         => \$silence_proportion,
  'frame-shift:f'                => \$frame_shift,
  'max-segment-length:f'         => \$max_segment_length,
  'hard-max-segment-length:f'    => \$hard_max_segment_length,
  'first-separator:s'            => \$first_separator,
  'second-separator:s'           => \$second_separator,
  'remove-noise-only-segments:s' => \$remove_noise_only_segments,
);

# The program takes no positional arguments; anything left over is an error.
unless (@ARGV == 0) {
  print STDERR
    "$0:\n",
    "Usage: segmentation.pl [options] < per-frame-archive > segments-file\n",
    "This program is called from steps/resegment_data.sh.  Please see\n",
    "the extensive comment in the source.  Options:\n",
    "--silence-proportion <float> (default: $silence_proportion)\n",
    "--frame-shift <float> (default: $frame_shift, in seconds)\n",
    "--max-segment-length <float> (default: $max_segment_length, in seconds)\n",
    "--hard-max-segment-length (default: $hard_max_segment_length, in seconds)\n",
    "--first-separator <string> (default: $first_separator), affects utterance-ids\n",
    "--second-separator <string> (default: $second_separator), affects utterance-ids\n",
    "--remove-noise-only-segments <true|false> (default: true)\n";
  exit 1;
}

# ---- Sanity checks on the option values ----
die "Invalid silence-proportion value '$silence_proportion'"
  unless ($silence_proportion > 0.01 && $silence_proportion < 0.99);
die "Very strange frame-shift value '$frame_shift'"
  unless ($frame_shift > 0.0001 && $frame_shift <= 1.0);
die "Very strange max-segment-length value '$max_segment_length'"
  unless ($max_segment_length > 1.0 && $max_segment_length < 100.0);
die "Very strange hard-max-segment-length value '$hard_max_segment_length'"
  unless ($hard_max_segment_length > 4.0 && $hard_max_segment_length < 500.0);
die "hard-max-segment-length may not be less than max-segment-length"
  unless ($hard_max_segment_length >= $max_segment_length);
die "Option --remove-noise-only-segments takes args true or false"
  unless ($remove_noise_only_segments eq 'false' ||
          $remove_noise_only_segments eq 'true');


sub get_initial_segments {
  # Stage (1): mark every maximal run of non-silence frames as a segment.
  # Reads the global per-frame class array @A (length $N) and writes the
  # global marker arrays: $S[$n] = 1 where a segment starts at frame $n, and
  # $E[$n] = 1 where a segment ends, the end index being one past the last
  # frame of the segment.

  my $prev_nonsil = 0;  # was the previous frame non-silence?  (false before frame 0)
  foreach my $n (0 .. $N - 1) {
    my $cur_nonsil = ($A[$n] != 0) ? 1 : 0;
    if ($cur_nonsil && !$prev_nonsil) {
      $S[$n] = 1;   # silence -> non-silence transition: a segment starts here.
    } elsif (!$cur_nonsil && $prev_nonsil) {
      $E[$n] = 1;   # non-silence -> silence transition: the segment ended.
    }
    $prev_nonsil = $cur_nonsil;
  }
  # Special case: the file ends inside a segment (last frame is speech or
  # noise), so the end marker goes one past the last frame, at index $N.
  $E[$N] = 1 if $A[$N-1] != 0;
}


# Stage (2): pad segments with silence.  Operates on the global arrays @A, @S
# and @E (and the globals $N, $silence_proportion, $recording_id).  It first
# counts the frames currently inside segments, then repeatedly moves segment
# end markers one frame right and start markers one frame left until the
# total number of segment frames reaches the target implied by
# $silence_proportion, or no marker can move any further.  Segments may end
# up touching each other, but are not merged here.
sub set_silence_proportion {
  $num_nonsil_frames = 0;
  # Get number of frames that are inside segments.  Initially, this will
  # all be non-silence.
  $in_segment = 0;

  my @active_frames = (); # active_frames are segment start/end frames.
  # Note: the loop runs up to and including $N because an end marker may
  # sit at index $N (segment reaching the end of the file).
  for (my $n = 0; $n <= $N; $n++) {
    if ($n < $N && $S[$n] == 1) {
      $in_segment == 0 || die; 
      $in_segment = 1; 
      push @active_frames, $n;
    }
    if ($E[$n] == 1) { 
      $in_segment == 1 || die; 
      $in_segment = 0; 
      push @active_frames, $n;
    }
    if ($n < $N) {
      ($in_segment == ($A[$n] != 0 ? 1 : 0)) || die; # Just a check.
      if ($in_segment) { $num_nonsil_frames++; }
    }
  }
  $in_segment == 0 || die; # should not be still in a segment after file-end.
  if ($num_nonsil_frames == 0) {
    print STDERR "$0: warning: no segments found for recording $recording_id\n";
    return;
  }
  #(target-segment-frames - num-nonsil-frames) / target-segment-frames =  sil-proportion
  # -> target-segment-frames = (num-nonsil-frames) / (1 - sil-proportion).
  my $target_segment_frames = int($num_nonsil_frames / (1.0 - $silence_proportion));
  my $num_segment_frames = $num_nonsil_frames;
  # Each pass over @active_frames tries to grow every segment by one frame on
  # each side; @active_frames is updated in place to track the moved markers.
  while ($num_segment_frames < $target_segment_frames) {
    $changed = 0;
    for (my $i = 0; $i < @active_frames; $i++) {
      my $n = $active_frames[$i];
      # An end marker can move right only if it is not already at the end of
      # the file and no segment starts at this same index (i.e. it is not
      # already touching the next segment).
      if ($E[$n] == 1 && $n < $N && $S[$n] != 1) {
        # shift the end of this segment one frame to the right.
        $E[$n] = 0;
        $E[$n+1] = 1;
        $active_frames[$i] = $n + 1;
        $num_segment_frames++;
        $changed = 1;
      }
      # A start marker can move left only if it is not at frame 0 and no
      # segment ends at this same index (not touching the previous segment).
      if ($n < $N && $S[$n] == 1 && $n > 0 && $E[$n] != 1) {
        # shift the start of this segment one frame to the left
        $S[$n] = 0;
        $S[$n-1] = 1;
        $active_frames[$i] = $n - 1;
        $num_segment_frames++;
        $changed = 1;
      }
      if ($num_segment_frames == $target_segment_frames) {
        last;
      }
    }
    if ($changed == 0) { last; } # avoid an infinite loop.
  }
  # If the markers ran out of room before the target was reached, report the
  # silence proportion that was actually achieved.
  if ($num_segment_frames < $target_segment_frames) {
    my $proportion = 
      ($num_segment_frames - $num_nonsil_frames) / $num_segment_frames;
    print STDERR "$0: warning: for recording $recording_id, only got a proportion " .
      "$proportion of silence frames, versus target $silence_proportion\n";
  }
}

# Stage (3): merge touching segments.  Operates on the global arrays @A, @S
# and @E.  A "boundary" is an index that is both a segment end and a segment
# start.  Boundaries are visited in order of increasing silence around them,
# and the two segments at a boundary are joined (by clearing both markers)
# whenever the joined segment would be no longer than $max_segment_length
# seconds.
sub merge_segments() {
  my @boundaries = ();
  my @num_silence_phones = (); # for any index into @S where there
                               # is a boundary between contiguous segments
                               # (i.e. an index which is both a segment-start
                               # and segment-end index), the number of silence
                               # frames at that boundary (i.e. at the end of the
                               # previous segment and the beginning of the next
                               # one).
  # Note: $n here is the global $n (it is not declared with "my").
  for ($n = 0; $n < $N; $n++) {
    if ($S[$n] == 1 && $E[$n] == 1) {
      push @boundaries, $n;
      my $num_sil = 0;
      my $p;
      # note: here we can count the silence phones without regard to the
      # segment boundaries, since we'll hit nonsilence before we get to
      # the end/beginning of these segments.
      for ($p = $n; $p < $N; $p++) {
        if ($A[$p] == 0) { $num_sil++; }
        else { last; }
      }
      for ($p = $n - 1; $p >= 0; $p--) {
        if ($A[$p] == 0) { $num_sil++; }
        else { last; }
      }
      
      $num_silence_phones[$n] = $num_sil; # should be the num of silence
    }
  }

  # Sort on increasing number of silence-phones, so we join the segments with
  # the smallest amount of silence at the boundary first.
  my @sorted_boundaries = 
    sort { $num_silence_phones[$a] <=> $num_silence_phones[$b] } @boundaries;

  foreach $n (@sorted_boundaries) {
    # Join the segments only if the length of the resulting segment would
    # be no more than $max_segment_length.
    ($S[$n] == 1 && $E[$n] == 1) || die;
    my $num_frames = 2; # total number of frames in the two segments we'll be merging..
                        # start the count from 2 because the loops below do not
                        # count the 1st frame of the segment to the right and
                        # the last frame of the segment to the left.
    my $p;
    # Walk right to the end marker of the right-hand segment.  Because
    # earlier merges clear markers, this measures the current (possibly
    # already merged) segment.
    for ($p = $n + 1; $p <= @A && $E[$p] == 0; $p++) {
      $num_frames++;
    }
    $E[$p] == 1 || die;
    # Walk left to the start marker of the left-hand segment.
    for ($p = $n - 1; $p >= 0 && $S[$p] == 0; $p--) {
      $num_frames++;
    }
    $S[$p] == 1 || die;
    if ($num_frames * $frame_shift <= $max_segment_length) {
      # Join this pair of segments.
      $S[$n] = 0;
      $E[$n] = 0;
    }
  }
}

# Stage (4): split excessively long segments.  Operates on the global arrays
# @A, @S and @E.  Every segment longer than $hard_max_segment_length seconds
# is cut into the smallest number of roughly equal pieces such that a piece
# is about $hard_max_segment_length or shorter; a warning is printed for each
# such segment.  A cut is represented by setting both the start and the end
# marker at the cut frame.
sub split_long_segments {
  # Maximum allowed segment length, in frames (loop-invariant).
  my $max_frames = int($hard_max_segment_length / $frame_shift);
  for (my $n = 0; $n < @A; $n++) {
    if ($S[$n] == 1) { # segment starts here...
      # Find the end marker of this segment.
      my $p;
      for ($p = $n + 1; $p <= @A; $p++) {
        if ($E[$p] == 1) { last; }
      }
      ($E[$p] == 1) || die;
      my $segment_length = $p - $n;
      if ($segment_length > $max_frames) {
        # The segment is too long, we need to split it.  First work out
        # how many pieces to split it into: ceil(segment_length / max_frames).
        # Both operands are integers, so use exact integer ceiling rather
        # than adding 0.99999, which under-counts when max_frames > 100000
        # (possible with a small --frame-shift) and would leave a too-long
        # segment unsplit.
        my $num_pieces = int(($segment_length + $max_frames - 1) / $max_frames);
        my $segment_length_in_seconds = $segment_length * $frame_shift;
        print STDERR "$0: warning: for recording $recording_id, splitting segment of " .
          "length $segment_length_in_seconds seconds into $num_pieces pieces " .
          "(--hard-max-segment-length $hard_max_segment_length)\n";
        # Integer piece length; the last piece absorbs any remainder.
        my $frames_per_piece = int($segment_length / $num_pieces);
        my $i;
        for ($i = 1; $i < $num_pieces; $i++) {
          my $q = $n + $i * $frames_per_piece;
          # Insert a segment boundary at frame $q.
          $S[$q] = 1;
          $E[$q] = 1;
        }
      }
      if ($p - 1 > $n) {
        $n = $p - 1; # avoids some redundant work.
      }
    }
  }
}

# Stage (5): drop segments that contain no speech frame (class 2).  Operates
# on the global arrays @A, @S and @E; a segment is removed by clearing its
# start marker and its end marker.
sub remove_noise_only_segments {
  my $start = 0;
  while ($start < $N) {
    # Advance to the next segment start.
    if ($S[$start] != 1) {
      $start++;
      next;
    }
    # Scan the segment up to (not including) its end marker, looking for
    # at least one speech frame.
    my $has_speech = 0;
    my $end = $start;
    while ($end <= $N) {
      last if ($end != $start && $E[$end] == 1);
      $has_speech = 1 if $A[$end] == 2;
      $end++;
    }
    die unless $E[$end] == 1;
    unless ($has_speech) {
      # Only silence/noise in this segment: remove it.
      $S[$start] = 0;
      $E[$end] = 0;
    }
    # Continue from the end index (a new segment may start exactly there);
    # otherwise just step to the next frame.
    $start = ($end - 1 > $start) ? $end : $start + 1;
  }
}

# Final stage: validate the marker arrays @S and @E and write one line per
# segment to STDOUT in the format
#   <utterance-id> <recording-id> <start-seconds> <end-seconds>
# where <utterance-id> is
#   <recording-id><first-separator><start><second-separator><end>
# with <start> and <end> given as zero-padded hundredths of a second.
# Dies if the markers are inconsistent.
sub print_segments {
  # We also do some sanity checking here.
  my @segments = (); # each element will be a string start-time:end-time, in frames.

  $N == @S || die; # check array size.
  ($N+1) == @E || die; # check array size.

  my $max_end_time = 0;

  for (my $n = 0; $n < $N; $n++) {
    if ($E[$n] == 1 && $S[$n] != 1) {
      die "Ending segment before starting it: n=$n.\n";
    }
    if ($S[$n]) {
      my $p;
      for ($p = $n + 1; $p < $N && $E[$p] != 1; $p++) {
        $S[$p] && die; # should not start a segment again, before ending it.
      }
      $E[$p] == 1 || die;
      push @segments, "$n:$p"; # push the start/end times.
      $max_end_time = $p;
      # If another segment starts exactly where this one ends, revisit that
      # index; otherwise skip over the end marker.
      if ($p < $N && $S[$p] == 1) { $n = $p - 1; }
      else { $n = $p; }
      # note: we increment $n again before the next loop instance.
    }
  }

  if (@segments == 0) {
    print STDERR "$0: warning: no segments for recording $recording_id\n";
  }

  # we'll be printing the times out in hundredths of a second (regardless of the
  # value of $frame_shift), and first need to know how many digits we need (we'll be
  # printing with "%05d" or similar, for zero-padding).
  $max_end_time_hundredths_second = int(100.0 * $frame_shift * $max_end_time);
  $num_digits = 1;
  my $i = 1;
  while ($i < $max_end_time_hundredths_second) {
    $i *= 10;
    $num_digits++;
  }
  $format_str = "%0${num_digits}d"; # e.g. "%05d"

  foreach $s (@segments) {
    my ($start,$end) = split(":", $s);
    ($end > $start) || die;
    my $start_seconds = sprintf("%.2f", $frame_shift * $start);
    my $end_seconds = sprintf("%.2f", $frame_shift * $end);
    # Convert to hundredths of a second, rounding to the nearest integer.
    # "%d" alone truncates, and products such as 0.29 * 100 evaluate to
    # 28.999999999999996 in floating point, which would put "028" in the
    # utterance-id while the time printed on the same line is 0.29.
    my $start_str = sprintf($format_str, int($start_seconds * 100 + 0.5));
    my $end_str = sprintf($format_str, int($end_seconds * 100 + 0.5));
    my $utterance_id = "${recording_id}${first_separator}${start_str}${second_separator}${end_str}";
    print "$utterance_id $recording_id $start_seconds $end_seconds\n"; # <-- Here is where the output happens.
  }
}


# Main loop: each input line is "<recording-id> <class> <class> ...", with one
# class (0, 1 or 2) per frame.  The line is loaded into the global arrays
# used by the stage subroutines, which are then run in order.
while (<STDIN>) {
  @A = split; # split line on whitespace.
  unless (@A > 1) {
    # Need a recording-id plus at least one frame.
    print STDERR "$0: warning: invalid input line $_";
    next;
  }
  $recording_id = shift @A;  # e.g. sw0-12430
  # Validate every frame label and convert it to a number in place
  # ($a is aliased to the array element by foreach).
  foreach $a (@A) {
    unless ($a == 0 || $a == 1 || $a == 2) {
      die "Invalid value $a: expecting 0, 1 or 2.  Line is: $_";
    }
    $a = 0 + $a; # cast to integer, might be a bit faster.
  }
  $N = @A; # number of frames.  Used globally.
  # @S holds 1 where a segment starts and 0 elsewhere; @E holds 1 where a
  # segment ends and 0 elsewhere.  @E has one extra element because the end
  # marker of a segment that includes the last frame sits at index $N.
  @S = map { 0 } @A;
  @E = (@S, 0);

  get_initial_segments();       # stage (1) in the comment above.
  set_silence_proportion();     # stage (2)
  merge_segments();             # stage (3)
  split_long_segments();        # stage (4)
  remove_noise_only_segments()  # stage (5)
    if $remove_noise_only_segments eq 'true';
  print_segments();
}


================================================
FILE: egs/utils/show_lattice.sh
================================================
#!/usr/bin/env bash

# Renders the lattice of one utterance as a graph (via lattice-to-fst,
# fstdraw and dot) and optionally opens the result in a viewer.
# Output files are <outdir>/<utt-id>.fst and <outdir>/<utt-id>.<format>.

format=pdf # pdf svg
mode=save # display save
lm_scale=0.0
acoustic_scale=0.0
outdir=     # directory for the generated files; defaults to the current directory
#end of config

. utils/parse_options.sh

if [ $# != 3 ]; then
   echo "usage: $0 [--mode display|save] [--format pdf|svg] <utt-id> <lattice-ark> <word-list>"
   echo "e.g.:  $0 utt-0001 \"test/lat.*.gz\" tri1/graph/words.txt"
   exit 1;
fi

. ./path.sh

uttid=$1
lat=$2
words=$3

# Fall back to the current directory when --outdir is not given; with an
# empty value the paths below would resolve to "/<utt-id>.fst" in the
# filesystem root.  Make sure the directory exists before writing into it.
tmpdir=${outdir:-.}
mkdir -p "$tmpdir" || exit 1;

# NOTE: $lat is intentionally left unquoted so that a glob pattern passed as
# a single argument (e.g. "test/lat.*.gz") is expanded here.
gunzip -c $lat | lattice-to-fst --lm-scale=$lm_scale --acoustic-scale=$acoustic_scale ark:- "scp,p:echo $uttid $tmpdir/$uttid.fst|" || exit 1;
! [ -s "$tmpdir/$uttid.fst" ] && \
  echo "Failed to extract lattice for utterance $uttid (not present?)" && exit 1;
fstdraw --portrait=true --osymbols="$words" "$tmpdir/$uttid.fst" | dot -T${format} > "$tmpdir/$uttid.${format}"

# Pick the command used to open the rendered file; if the platform is not
# recognised, fall back to just keeping the file on disk.
if [ "$(uname)" == "Darwin" ]; then
    doc_open=open
elif [ "$(expr substr $(uname -s) 1 5)" == "Linux" ]; then
    doc_open=xdg-open
elif [ "$mode" == "display" ] ; then
        echo "Can not automaticaly open file on your operating system"
        mode=save
fi

if [ "$mode" == "display" ]; then
  if ! $doc_open "$tmpdir/$uttid.${format}"; then
    echo "Failed to open ${format} format."
    mode=save
  fi
fi

exit 0


================================================
FILE: egs/utils/shuffle_list.pl
================================================
#!/usr/bin/env perl

# Copyright 2013  Johns Hopkins University (author: Daniel Povey)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# Seed the random number generator: either from "--srand N" (which is then
# removed from @ARGV) or with the fixed seed 0.
if ($ARGV[0] eq "--srand") {
  $n = $ARGV[1];
  die "Bad argument to --srand option: \"$n\"" unless $n =~ m/\d+/;
  srand($n);
  splice(@ARGV, 0, 2);
} else {
  srand(0); # Gives inconsistent behavior if we don't seed.
}

# At most one positional argument (the input file) is allowed, and it must
# not look like an option.
if (@ARGV > 1 || $ARGV[0] =~ m/^-.+/) {
  print "Usage: shuffle_list.pl [--srand N] [input file]  > output\n",
        "randomizes the order of lines of input.\n";
  exit(1);
}

# Tag every input line with a random key, in input order ...
while (my $line = <>) {
  push @lines, [ rand(), $line ];
}

# ... then emit the lines ordered by their keys (compared as strings).
print map { $_->[1] } sort { $a->[0] cmp $b->[0] } @lines;


================================================
FILE: egs/utils/spk2utt_to_utt2spk.pl
================================================
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# Each input line is "<speaker> <utt1> <utt2> ...".  For every utterance,
# print one "<utt> <speaker>" line.  A line without any utterance is fatal.
while (<>) {
    ($s, @A) = split(" ", $_);
    die "Invalid line in spk2utt file: $_" unless @A >= 1;
    foreach my $u (@A) {
        print "$u $s\n";
    }
}


================================================
FILE: egs/utils/split_data.sh
================================================
#!/usr/bin/env bash
# Copyright 2010-2013 Microsoft Corporation
#                     Johns Hopkins University (Author: Daniel Povey)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# Splits a data directory into <num-to-split> pieces, written to
# <data-dir>/split<N>/{1..N} (per speaker) or <data-dir>/split<N>utt/{1..N}
# (with --per-utt).  Does nothing if an existing split is already up to date.

split_per_spk=true
if [ "$1" == "--per-utt" ]; then
  split_per_spk=false
  shift
fi

if [ $# != 2 ]; then
  echo "Usage: $0 [--per-utt] <data-dir> <num-to-split>"
  echo "E.g.: $0 data/train 50"
  echo "It creates its output in e.g. data/train/split50/{1,2,3,...50}, or if the "
  echo "--per-utt option was given, in e.g. data/train/split50utt/{1,2,3,...50}."
  echo ""
  echo "This script will not split the data-dir if it detects that the output is newer than the input."
  echo "By default it splits per speaker (so each speaker is in only one split dir),"
  echo "but with the --per-utt option it will ignore the speaker information while splitting."
  exit 1
fi

data=$1
numsplit=$2

if ! [ "$numsplit" -gt 0 ]; then
  echo "Invalid num-split argument $numsplit";
  exit 1;
fi

if $split_per_spk; then
  warning_opt=
else
  # suppress warnings from filter_scps.pl about 'some input lines were output
  # to multiple files'.
  warning_opt="--no-warn"
fi

n=0;
feats=""
wavs=""
utt2spks=""
texts=""

# Warn (but carry on) if the per-utterance files disagree on the number of
# utterances.
nu=`cat $data/utt2spk | wc -l`
nf=`cat $data/feats.scp 2>/dev/null | wc -l`
nt=`cat $data/text 2>/dev/null | wc -l` # take it as zero if no such file
if [ -f $data/feats.scp ] && [ $nu -ne $nf ]; then
  echo "** split_data.sh: warning, #lines is (utt2spk,feats.scp) is ($nu,$nf); you can "
  echo "**  use utils/fix_data_dir.sh $data to fix this."
fi
if [ -f $data/text ] && [ $nu -ne $nt ]; then
  echo "** split_data.sh: warning, #lines is (utt2spk,text) is ($nu,$nt); you can "
  echo "** use utils/fix_data_dir.sh to fix this."
fi


if $split_per_spk; then
  utt2spk_opt="--utt2spk=$data/utt2spk"
  utt=""
else
  utt2spk_opt=
  utt="utt"
fi

# Decide whether a (re-)split is needed: it is if the first split directory
# is missing, or if any input file that this script splits is absent from it
# or newer than its split copy.  The list includes utt2dur and
# utt2num_frames, which are split below as well; without them, adding those
# files to the data directory would never refresh an existing split.
s1=$data/split${numsplit}${utt}/1
if [ ! -d $s1 ]; then
  need_to_split=true
else
  need_to_split=false
  for f in utt2spk spk2utt spk2warp feats.scp text wav.scp cmvn.scp spk2gender \
    vad.scp segments reco2file_and_channel utt2lang utt2dur utt2num_frames; do
    if [[ -f $data/$f && ( ! -f $s1/$f || $s1/$f -ot $data/$f ) ]]; then
      need_to_split=true
    fi
  done
fi

if ! $need_to_split; then
  exit 0;
fi

utt2spks=$(for n in `seq $numsplit`; do echo $data/split${numsplit}${utt}/$n/utt2spk; done)

directories=$(for n in `seq $numsplit`; do echo $data/split${numsplit}${utt}/$n; done)

# if this mkdir fails due to argument-list being too long, iterate.
if ! mkdir -p $directories >&/dev/null; then
  for n in `seq $numsplit`; do
    mkdir -p $data/split${numsplit}${utt}/$n
  done
fi

# If lockfile is not installed, just don't lock it.  It's not a big deal.
which lockfile >&/dev/null && lockfile -l 60 $data/.split_lock
# Remove the lock however the script ends.
trap 'rm -f $data/.split_lock' EXIT HUP INT PIPE TERM

# Split utt2spk first; every other file is filtered against these pieces.
utils/split_scp.pl $utt2spk_opt $data/utt2spk $utt2spks || exit 1

for n in `seq $numsplit`; do
  dsn=$data/split${numsplit}${utt}/$n
  utils/utt2spk_to_spk2utt.pl $dsn/utt2spk > $dsn/spk2utt || exit 1;
done

maybe_wav_scp=
if [ ! -f $data/segments ]; then
  maybe_wav_scp=wav.scp  # If there is no segments file, then wav file is
                         # indexed per utt.
fi

# split some things that are indexed by utterance.
for f in feats.scp text vad.scp utt2lang $maybe_wav_scp utt2dur utt2num_frames; do
  if [ -f $data/$f ]; then
    utils/filter_scps.pl JOB=1:$numsplit \
      $data/split${numsplit}${utt}/JOB/utt2spk $data/$f $data/split${numsplit}${utt}/JOB/$f || exit 1;
  fi
done

# split some things that are indexed by speaker
for f in spk2gender spk2warp cmvn.scp; do
  if [ -f $data/$f ]; then
    utils/filter_scps.pl $warning_opt JOB=1:$numsplit \
      $data/split${numsplit}${utt}/JOB/spk2utt $data/$f $data/split${numsplit}${utt}/JOB/$f || exit 1;
  fi
done

# With a segments file, wav.scp and reco2file_and_channel are indexed by
# recording-id, so they are filtered by the recording-ids (second field of
# segments) that occur in each piece.
if [ -f $data/segments ]; then
  utils/filter_scps.pl JOB=1:$numsplit \
     $data/split${numsplit}${utt}/JOB/utt2spk $data/segments $data/split${numsplit}${utt}/JOB/segments || exit 1
  for n in `seq $numsplit`; do
    dsn=$data/split${numsplit}${utt}/$n
    awk '{print $2;}' $dsn/segments | sort | uniq > $dsn/tmp.reco # recording-ids.
  done
  if [ -f $data/reco2file_and_channel ]; then
    utils/filter_scps.pl $warning_opt JOB=1:$numsplit \
      $data/split${numsplit}${utt}/JOB/tmp.reco $data/reco2file_and_channel \
      $data/split${numsplit}${utt}/JOB/reco2file_and_channel || exit 1
  fi
  if [ -f $data/wav.scp ]; then
    utils/filter_scps.pl $warning_opt JOB=1:$numsplit \
      $data/split${numsplit}${utt}/JOB/tmp.reco $data/wav.scp \
      $data/split${numsplit}${utt}/JOB/wav.scp || exit 1
  fi
  for f in $data/split${numsplit}${utt}/*/tmp.reco; do rm $f; done
fi

exit 0


================================================
FILE: egs/utils/split_scp.pl
================================================
#!/usr/bin/env perl

# Copyright 2010-2011 Microsoft Corporation

# See ../../COPYING for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# This program splits up any kind of .scp or archive-type file.
# If there is no utt2spk option it will work on any text file and
# will split it up with an approximately equal number of lines in
# each output file.
# With the --utt2spk option it will work on anything that has the
# utterance-id as the first entry on each line; the utt2spk file is
# of the form "utterance speaker" (on each line).
# It splits it into equal size chunks as far as it can.  If you use the utt2spk
# option it will make sure these chunks coincide with speaker boundaries.  In
# this case, if there are more chunks than speakers (and in some other
# circumstances), some of the resulting chunks will be empty and it will print
# an error message and exit with nonzero status.
# You will normally call this like:
# split_scp.pl scp scp.1 scp.2 scp.3 ...
# or
# split_scp.pl --utt2spk=utt2spk scp scp.1 scp.2 scp.3 ...
# Note that you can use this script to split the utt2spk file itself,
# e.g. split_scp.pl --utt2spk=utt2spk utt2spk utt2spk.1 utt2spk.2 ...

# You can also call the scripts like:
# split_scp.pl -j 3 0 scp scp.0
# [note: with this option, it assumes zero-based indexing of the split parts,
# i.e. the second number must be 0 <= n < num-jobs.]

use warnings;

# ---- Command-line parsing ----
# $num_jobs / $job_id : set by "-j num-jobs job-id"; $num_jobs == 0 means the
#                       -j form is not in use.
# $utt2spk_file       : set by "--utt2spk=<file>"; "" means not given.
# $one_based          : 1 if "--one-based" was given (job-id counts from 1).
$num_jobs = 0;
$job_id = 0;
$utt2spk_file = "";
$one_based = 0;

# The three options may come in any order, hence up to three passes.  Each
# test after the first re-checks that @ARGV is non-empty, because an earlier
# branch in the same pass may have consumed the last argument (reading
# $ARGV[0] then would trigger "uninitialized value" warnings).
for ($x = 1; $x <= 3 && @ARGV > 0; $x++) {
    if ($ARGV[0] eq "-j") {
        shift @ARGV;
        $num_jobs = shift @ARGV;
        $job_id = shift @ARGV;
    }
    if (@ARGV > 0 && $ARGV[0] =~ /--utt2spk=(.+)/) {
        $utt2spk_file=$1;
        shift;
    }
    if (@ARGV > 0 && $ARGV[0] eq '--one-based') {
        $one_based = 1;
        shift @ARGV;
    }
}

# With -j, the (zero-based) job index must lie in [0, num-jobs).
if ($num_jobs != 0 && ($num_jobs < 0 || $job_id - $one_based < 0 ||
                       $job_id - $one_based >= $num_jobs)) {
  die "$0: Invalid job number/index values for '-j $num_jobs $job_id" .
      ($one_based ? " --one-based" : "") . "'\n"
}

# From here on $job_id is always zero-based.
$one_based
    and $job_id--;

if(($num_jobs == 0 && @ARGV < 2) || ($num_jobs > 0 && (@ARGV < 1 || @ARGV > 2))) {
    die
"Usage: split_scp.pl [--utt2spk=<utt2spk_file>] in.scp out1.scp out2.scp ...
   or: split_scp.pl -j num-jobs job-id [--one-based] [--utt2spk=<utt2spk_file>] in.scp [out.scp]
 ... where 0 <= job-id < num-jobs, or 1 <= job-id <= num-jobs if --one-based.\n";
}

$error = 0;
$inscp = shift @ARGV;
# Build the list of output files, one per split.  With -j only the requested
# job's piece is kept (written to the given file, or to stdout if none was
# given); all other pieces are sent to /dev/null.
if ($num_jobs == 0) { # without -j option
    @OUTPUTS = @ARGV;
} else {
    for ($j = 0; $j < $num_jobs; $j++) {
        if ($j == $job_id) {
            if (@ARGV > 0) { push @OUTPUTS, $ARGV[0]; }
            else { push @OUTPUTS, "-"; }
        } else {
            push @OUTPUTS, "/dev/null";
        }
    }
}

# ---- Splitting ----
# Two modes: with --utt2spk the input is split on speaker boundaries and the
# pieces are then balanced by utterance count; without it the input lines
# are simply dealt out in contiguous, nearly equal-sized chunks.
if ($utt2spk_file ne "") {  # We have the --utt2spk option...
    # Read the utterance -> speaker map.
    open($u_fh, '<', $utt2spk_file) || die "$0: Error opening utt2spk file $utt2spk_file: $!\n";
    while(<$u_fh>) {
        @A = split;
        @A == 2 || die "$0: Bad line $_ in utt2spk file $utt2spk_file\n";
        ($u,$s) = @A;
        $utt2spk{$u} = $s;
    }
    close $u_fh;
    # Read the input, grouping its lines by speaker.  @spkrs keeps speakers
    # in order of first appearance; %spk_data holds each speaker's lines and
    # %spk_count the number of those lines.
    open($i_fh, '<', $inscp) || die "$0: Error opening input scp file $inscp: $!\n";
    @spkrs = ();
    while(<$i_fh>) {
        @A = split;
        if(@A == 0) { die "$0: Empty or space-only line in scp file $inscp\n"; }
        $u = $A[0];
        $s = $utt2spk{$u};
        defined $s || die "$0: No utterance $u in utt2spk file $utt2spk_file\n";
        if(!defined $spk_count{$s}) {
            push @spkrs, $s;
            $spk_count{$s} = 0;
            $spk_data{$s} = [];  # ref to new empty array.
        }
        $spk_count{$s}++;
        push @{$spk_data{$s}}, $_;
    }
    # Now split as equally as possible ..
    # First allocate spks to files by allocating an approximately
    # equal number of speakers.
    $numspks = @spkrs;  # number of speakers.
    $numscps = @OUTPUTS; # number of output files.
    if ($numspks < $numscps) {
      die "$0: Refusing to split data because number of speakers $numspks " .
          "is less than the number of output .scp files $numscps\n";
    }
    for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
        $scparray[$scpidx] = []; # [] is array reference.
    }
    # @scparray[i] is the ordered list of speakers assigned to output i and
    # @scpcount[i] the total number of lines those speakers contribute.
    for ($spkidx = 0; $spkidx < $numspks; $spkidx++) {
        $scpidx = int(($spkidx*$numscps) / $numspks);
        $spk = $spkrs[$spkidx];
        push @{$scparray[$scpidx]}, $spk;
        $scpcount[$scpidx] += $spk_count{$spk};
    }

    # Now will try to reassign beginning + ending speakers
    # to different scp's and see if it gets more balanced.
    # Suppose objf we're minimizing is sum_i (num utts in scp[i] - average)^2.
    # We can show that if considering changing just 2 scp's, we minimize
    # this by minimizing the squared difference in sizes.  This is
    # equivalent to minimizing the absolute difference in sizes.  This
    # shows this method is bound to converge.

    $changed = 1;
    while($changed) {
        $changed = 0;
        for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
            # First try to reassign ending spk of this scp.
            if($scpidx < $numscps-1) {
                $sz = @{$scparray[$scpidx]};
                if($sz > 0) {
                    $spk = $scparray[$scpidx]->[$sz-1];
                    $count = $spk_count{$spk};
                    $nutt1 = $scpcount[$scpidx];
                    $nutt2 = $scpcount[$scpidx+1];
                    if( abs( ($nutt2+$count) - ($nutt1-$count))
                        < abs($nutt2 - $nutt1))  { # Would decrease
                        # size-diff by reassigning spk...
                        $scpcount[$scpidx+1] += $count;
                        $scpcount[$scpidx] -= $count;
                        pop @{$scparray[$scpidx]};
                        unshift @{$scparray[$scpidx+1]}, $spk;
                        $changed = 1;
                    }
                }
            }
            # Then try to move the first speaker of this scp to the end of
            # the previous one.
            if($scpidx > 0 && @{$scparray[$scpidx]} > 0) {
                $spk = $scparray[$scpidx]->[0];
                $count = $spk_count{$spk};
                $nutt1 = $scpcount[$scpidx-1];
                $nutt2 = $scpcount[$scpidx];
                if( abs( ($nutt2-$count) - ($nutt1+$count))
                    < abs($nutt2 - $nutt1))  { # Would decrease
                    # size-diff by reassigning spk...
                    $scpcount[$scpidx-1] += $count;
                    $scpcount[$scpidx] -= $count;
                    shift @{$scparray[$scpidx]};
                    push @{$scparray[$scpidx-1]}, $spk;
                    $changed = 1;
                }
            }
        }
    }
    # Now print out the files...
    for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
        $scpfile = $OUTPUTS[$scpidx];
        # "-" means standard output.
        ($scpfile ne '-' ? open($f_fh, '>', $scpfile)
                         : open($f_fh, '>&', \*STDOUT)) ||
            die "$0: Could not open scp file $scpfile for writing: $!\n";
        $count = 0;
        # An empty piece is reported and makes the script exit with status 1,
        # but the remaining pieces are still written.
        if(@{$scparray[$scpidx]} == 0) {
            print STDERR "$0: eError: split_scp.pl producing empty .scp file " .
                         "$scpfile (too many splits and too few speakers?)\n";
            $error = 1;
        } else {
            foreach $spk ( @{$scparray[$scpidx]} ) {
                print $f_fh @{$spk_data{$spk}};
                $count += $spk_count{$spk};
            }
            $count == $scpcount[$scpidx] || die "Count mismatch [code error]";
        }
        close($f_fh);
    }
} else {
   # This block is the "normal" case where there is no --utt2spk
   # option and we just break into equal size chunks.

    open($i_fh, '<', $inscp) || die "$0: Error opening input scp file $inscp: $!\n";

    $numscps = @OUTPUTS;  # size of array.
    @F = ();
    while(<$i_fh>) {
        push @F, $_;
    }
    $numlines = @F;
    if($numlines == 0) {
        print STDERR "$0: error: empty input scp file $inscp\n";
        $error = 1;
    }
    # Every output gets $linesperscp lines; the first $remainder outputs get
    # one extra line each.
    $linesperscp = int( $numlines / $numscps); # the "whole part"..
    $linesperscp >= 1 || die "$0: You are splitting into too many pieces! [reduce \$nj ($numscps) to be smaller than the number of lines ($numlines) in $inscp]\n";
    $remainder = $numlines - ($linesperscp * $numscps);
    ($remainder >= 0 && $remainder < $numlines) || die "bad remainder $remainder";
    # [just doing int() rounds down].
    $n = 0; # index of the next input line to write.
    for($scpidx = 0; $scpidx < @OUTPUTS; $scpidx++) {
        $scpfile = $OUTPUTS[$scpidx];
        # "-" means standard output.
        ($scpfile ne '-' ? open($o_fh, '>', $scpfile)
                         : open($o_fh, '>&', \*STDOUT)) ||
            die "$0: Could not open scp file $scpfile for writing: $!\n";
        for($k = 0; $k < $linesperscp + ($scpidx < $remainder ? 1 : 0); $k++) {
            print $o_fh $F[$n++];
        }
        close($o_fh) || die "$0: Eror closing scp file $scpfile: $!\n";
    }
    $n == $numlines || die "$n != $numlines [code error]";
}

# Exit status is 1 if a non-fatal error (e.g. an empty output piece) was
# reported above, 0 otherwise.
exit ($error);


================================================
FILE: egs/utils/ssh.pl
================================================
#!/usr/bin/env perl
use warnings; #sed replacement for -w perl parameter

use Cwd;
use File::Basename;

# This program is like run.pl except rather than just running on a local
# machine, it can be configured to run on remote machines via ssh.
# It requires that you have set up passwordless access to those machines,
# and that Kaldi is running from a location that is accessible via the
# same path on those machines (presumably via an NFS mount).
#
# It looks for a file .queue/machines that should have, on each line, the name
# of a machine that you can ssh to (which may include this machine).  It doesn't
# have to be a fully qualified name.
#
# Later we may extend this so that on each line of .queue/machines you
# can specify various resources that each machine has, such as how
# many slots and how much memory, and make it wait if machines are 
# busy.  But for now it simply ssh's to a machine from those in the list.

# The command-line interface of this program is the same as run.pl;
# see run.pl for more information about the usage.


@ARGV < 2 && die "usage: ssh.pl log-file command-line arguments...";

$jobstart = 1;
$jobend = 1;
$qsub_opts=""; # These will be ignored.

# First parse an option like JOB=1:4, and any options that would normally be
# given to queue.pl (and passed on to qsub), which we will just discard.
#
# On exit from this section:
#   $jobname            is set only if a JOB=... style argument was given,
#   $jobstart/$jobend   hold the inclusive job index range (default 1..1),
#   $qsub_opts          holds the discarded options (only used for a warning),
#   @ARGV               holds the log file followed by the command.

if (@ARGV > 0) {
  while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) { # parse any options
    # that would normally go to qsub, but which will be ignored here.
    $switch = shift @ARGV;
    if ($switch eq "-V") {
      $qsub_opts .= "-V ";
    } else {
      $option = shift @ARGV;
      if ($switch eq "-sync" && $option =~ m/^[yY]/) {
        $qsub_opts .= "-sync "; # Note: in the
        # corresponding code in queue.pl it says instead, just "$sync = 1;".
      }
      $qsub_opts .= "$switch $option ";
      if ($switch eq "-pe") { # e.g. -pe smp 5
        $option2 = shift @ARGV;
        $qsub_opts .= "$option2 ";
      }
    }
  }
  # Remember the candidate job specification before it is shifted off @ARGV, so
  # that the error messages below report the argument that was actually wrong
  # (after the shift, $ARGV[0] is already the log file).
  $jobspec = (@ARGV > 0) ? $ARGV[0] : "";
  if ($jobspec =~ m/^([\w_][\w\d_]*)+=(\d+):(\d+)$/) { # e.g. JOB=1:10
    $jobname = $1;
    $jobstart = $2;
    $jobend = $3;
    shift;
    if ($jobstart > $jobend) {
      die "ssh.pl: invalid job range $jobspec";
    }
    if ($jobstart <= 0) {
      die "ssh.pl: invalid job range $jobspec, start must be strictly positive (this is required for GridEngine compatibility)";
    }
  } elsif ($jobspec =~ m/^([\w_][\w\d_]*)+=(\d+)$/) { # e.g. JOB=1.
    $jobname = $1;
    $jobstart = $2;
    $jobend = $2;
    shift;
  } elsif ($jobspec =~ m/.+\=.*\:.*$/) {
    print STDERR "Warning: suspicious first argument to ssh.pl: $jobspec\n";
  }
}

# Options and the job specification have been consumed; a log file and at
# least one word of command must remain.
@ARGV < 2 && die "usage: ssh.pl log-file command-line arguments...";

if ($qsub_opts ne "") {
  print STDERR "Warning: ssh.pl ignoring options \"$qsub_opts\"\n";
}

{ # Read .queue/machines: one machine name per line; empty lines are skipped.
  # The names end up in @machines, in file order.
  if (!open(Q, "<.queue/machines")) {
    print STDERR "ssh.pl: expected the file .queue/machines to exist.\n";
    exit(1);
  }
  @machines = ();
  while (<Q>) {
    # chomp, not chop: chop removes the last character unconditionally, which
    # truncates the machine name on a final line that has no trailing newline.
    chomp;
    if ($_ ne "") {
      @A = split;
      if (@A != 1) {
        die "ssh.pl: bad line '$_' in .queue/machines.";
      }
      # NOTE(review): this pattern is only anchored at the start, so it merely
      # checks the first character of the name; left as-is to avoid rejecting
      # names that are accepted today.
      if ($A[0] !~ m/^[a-z0-9\.\-]+/) {
        die "ssh.pl: invalid machine name '$A[0]'";
      }
      push @machines, $A[0];
    }
  }
  close(Q);
  if (@machines == 0) {   die "ssh.pl: no machines listed in .queue/machines";  }
}

# The first remaining argument is the log file; everything after it is the
# command to run.
$logfile = shift @ARGV;

# A parallel job (JOB=a:b with b > a) needs the job variable in the log-file
# name, otherwise all jobs would write to the same file.
if (defined $jobname && $logfile !~ m/$jobname/ &&
    $jobend > $jobstart) {
  print STDERR "ssh.pl: you are trying to run a parallel job but "
    . "you are putting the output into just one log file ($logfile)\n";
  exit(1);
}

{
  $offset = 0;  # $offset will be an offset added to any index from the job-id
                # specified if the user does JOB=1:10.  The main point of this is
                # that there are instances where a script will manually submit a
                # number of jobs to the queue, e.g. with log files foo.1.log,
                # foo.2.log and so on, and we don't want all of these to go
                # to the first machine.
  # The separator has to be the regex /\./.  A plain "." is interpreted by
  # split as the pattern /./, which matches every character, so the result is
  # always an empty list and $offset would never change.
  @A = split(/\./, basename($logfile));
  # if $logfile looks like foo.9.log, add 9 to $offset.
  foreach $a (@A) {  if ($a =~ m/^\d+$/) { $offset += $a; } }
}

# Assemble the remaining arguments into one command string, each followed by a
# space.  An argument with no whitespace is used verbatim; one that contains a
# double quote is wrapped in single quotes; anything else (including the empty
# string) is wrapped in double quotes.
$cmd = "";

for my $arg (@ARGV) {
    my $piece;
    if ($arg =~ m/^\S+$/) {
        $piece = $arg;
    } elsif ($arg =~ m:\":) {
        $piece = "'$arg'";
    } else {
        $piece = "\"$arg\"";
    }
    $cmd .= $piece . " ";
}


# Fork one child per job index in $jobstart..$jobend.  Each child pipes a
# small bash script into "ssh <machine> bash" and then exits with status 0 or
# 1; the parent does not wait here, it just goes on to fork the next child.
for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) {
  $childpid = fork();
  if (!defined $childpid) { die "Error forking in ssh.pl (writing to $logfile)"; }
  if ($childpid == 0) {
    # We're in the child... this branch executes the job and returns (possibly
    # with an error status).
    if (defined $jobname) {
      # Replace every occurrence of the job variable name (e.g. JOB) with this
      # child's job index, in both the command and the log-file name.
      $cmd =~ s/$jobname/$jobid/g;
      $logfile =~ s/$jobname/$jobid/g;
    }
    { # work out the machine to ssh to.
      $local_offset = $offset + $jobid - 1;  # subtract 1 since jobs never start
                                             # from 0; we'd like the first job
                                             # to normally run on the first
                                             # machine.
      $num_machines = scalar @machines;
      # in the next line, the "+ $num_machines" is in case $local_offset is
      # negative, to ensure the modulus is calculated in the mathematical way, not
      # in the C way where (negative number % positive number) is negative.
      $machines_index = ($local_offset + $num_machines) % $num_machines;
      $machine = $machines[$machines_index];
    }
    # Open a pipe to a bash process on the chosen machine; everything printed
    # to S below is fed to that bash as its script.
    if (!open(S, "|ssh $machine bash")) {
      print STDERR "ssh.pl failed to ssh to $machine";
      exit(1);  # exits from the forked process within ssh.pl.
    }
    $cwd = getcwd();
    $logdir = dirname($logfile);
    # Below, we're printing into ssh which has opened a bash session; these are
    # bash commands.
    print S "set -e\n";  # if any of the later commands fails, we want it to exit.
    print S "cd $cwd\n";
    print S ". ./path.sh\n";
    print S "mkdir -p $logdir\n";
    print S "time1=\`date +\"%s\"\`\n";
    # Log header: host name, start time, and the command itself (passed through
    # a here-document).  This group truncates the log file (">").
    print S "( echo '#' Running on \`hostname\`\n";
    print S "  echo '#' Started at \`date\`\n";
    print S "  echo -n '# '; cat <<EOF\n";
    print S "$cmd\n";
    print S "EOF\n";
    print S ") >$logfile\n";
    print S "set +e\n";  # we don't want bash to exit if the next line fails.
    # Run the command in a subshell, appending both stdout and stderr to the
    # log file; its exit status is saved in the remote variable "ret" below.
    print S " ( $cmd ) 2>>$logfile >>$logfile\n"; 
    print S "ret=\$?\n";
    print S "set -e\n"; # back into mode where it will exit on error.
    print S "time2=\`date +\"%s\"\`\n";
    print S "echo '#' Accounting: time=\$((\$time2-\$time1)) threads=1 >>$logfile\n";
    print S "echo '#' Finished at \`date\` with status \$ret >>$logfile\n";
    print S "exit \$ret";  # return with the status the command exited with.
    # Closing the pipe waits for ssh to finish and sets $? to its wait status.
    $ret = close(S);
    $ssh_return_status = $?;
    # see http://perldoc.perl.org/functions/close.html for explanation of return
    # status of close() and the variables it sets.
    if (! $ret && $! != 0) { die "ssh.pl: unexpected problem ssh'ing to machine $machine"; }
    if ($ssh_return_status != 0) { exit(1); } # exit with error status from this forked process.
    else { exit(0); } # else exit with non-error status.
  }
}

# Reap one child per job that was forked.  $ret becomes 1 if any child exited
# with a non-zero status and $numfail counts how many did.
$ret = 0;
$numfail = 0;
for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) {
  $r = wait();
  if ($r == -1) { die "Error waiting for child process"; } # should never happen.
  if ($? != 0) { $numfail++; $ret = 1; } # The child process failed.
}

if ($ret != 0) {
  $njobs = $jobend - $jobstart + 1;
  if ($njobs == 1) { 
    if (defined $jobname) {
      $logfile =~ s/$jobname/$jobstart/; # only one numbered job, so replace name with
                                         # that job.
    }
    print STDERR "ssh.pl: job failed, log is in $logfile\n";
    if ($logfile =~ m/JOB/) {
      # The literal string JOB is still in the log name, so it was never
      # substituted.  This message names ssh.pl (it used to say run.pl) and ends
      # with a newline so it does not run into whatever is printed next.
      print STDERR "ssh.pl: probably you forgot to put JOB=1:\$nj in your script.\n";
    }
  }
  else {
    # Several jobs: show the log-file pattern with the job variable replaced
    # by "*".
    $logfile =~ s/$jobname/*/g;
    print STDERR "ssh.pl: $numfail / $njobs failed, log is in $logfile\n";
  }
}


exit ($ret);


================================================
FILE: egs/utils/subset_data_dir.sh
================================================
#!/usr/bin/env bash
# Copyright 2010-2011  Microsoft Corporation
#           2012-2013  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0


# This script operates on a data directory, such as in data/train/.
# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data
# for what these directories contain.

# This script creates a subset of that data, consisting of some specified
# number of utterances.  (The selected utterances are distributed evenly
# throughout the file, by the program ./subset_scp.pl).

# There are seven options, none compatible with any other.

# If you give the --per-spk option, it will attempt to select the supplied
# number of utterances for each speaker (typically you would supply a much
# smaller number in this case).

# If you give the --speakers option, it selects a subset of n randomly
# selected speakers.

# If you give the --shortest option, it will give you the n shortest utterances.

# If you give the --first option, it will just give you the n first utterances.

# If you give the --last option, it will just give you the n last utterances.

# If you give the --spk-list or --utt-list option, it reads the
# speakers/utterances to keep from <speaker-list-file>/<utt-list-file> (note,
# in this case there is no <num-utt> positional parameter; see usage message.)


# Defaults: no selection mode chosen, and three positional arguments expected
# (<srcdir> <num-utt> <destdir>).
expect_args=3
first_opt=
spk_list=
utt_list=
perspk=false
shortest=false
speakers=false

# At most one leading option is looked at.  The two list options consume an
# extra argument (the list file) and drop the <num-utt> positional argument.
case "$1" in
  --spk-list)
    shift
    spk_list=$1
    shift
    expect_args=2
    ;;
  --utt-list)
    shift
    utt_list=$1
    shift
    expect_args=2
    ;;
  --speakers)
    speakers=true
    shift
    ;;
  --shortest)
    shortest=true
    shift
    ;;
  --per-spk)
    perspk=true
    shift
    ;;
  --first|--last)
    first_opt=$1
    shift
    ;;
  --*)
    echo "$0: invalid option '$1'"
    exit 1
    ;;
esac

# Print the usage text and exit with status 1 when the number of remaining
# positional arguments is not the one expected for the chosen option.
if [ $# -ne $expect_args ]; then
  cat <<'EOF'
Usage:
  subset_data_dir.sh [--speakers|--shortest|--first|--last|--per-spk] <srcdir> <num-utt> <destdir>
  subset_data_dir.sh [--spk-list <speaker-list-file>] <srcdir> <destdir>
  subset_data_dir.sh [--utt-list <utt-list-file>] <srcdir> <destdir>
By default, randomly selects <num-utt> utterances from the data directory.
With --speakers, randomly selects enough speakers that we have <num-utt> utterances
With --per-spk, selects <num-utt> utterances per speaker, if available.
With --first, selects the first <num-utt> utterances
With --last, selects the last <num-utt> utterances
With --shortest, selects the shortest <num-utt> utterances.
With --spk-list, reads the speakers to keep from <speaker-list-file>
With --utt-list, reads the utterances to keep from <utt-list-file>
EOF
  exit 1;
fi

# Positional arguments.  With --spk-list/--utt-list there is no <num-utt>, so
# the destination directory is the second argument instead of the third.
srcdir=$1
if [[ -z $spk_list && -z $utt_list ]]; then
  numutt=$2
  destdir=$3
else
  numutt=
  destdir=$2
fi

export LC_ALL=C

# The source directory must at least contain utt2spk.
if [ ! -f $srcdir/utt2spk ]; then
  echo "$0: no such file $srcdir/utt2spk"
  exit 1
fi

# When a count was given, it may not exceed the number of lines in utt2spk.
if [[ -n $numutt ]]; then
  if [[ $numutt -gt $(wc -l <$srcdir/utt2spk) ]]; then
    echo "$0: cannot subset to more utterances than you originally had."
    exit 1
  fi
fi

# --shortest needs feats.scp.
if $shortest; then
  if [ ! -f $srcdir/feats.scp ]; then
    echo "$0: you selected --shortest but no feats.scp exist."
    exit 1
  fi
fi

mkdir -p $destdir || exit 1

# Build $destdir/utt2spk and $destdir/spk2utt according to the selected mode.
# Exactly one branch runs; each branch writes one of the two files directly
# and derives the other with a utils/*2*.pl conversion script.
if [[ $spk_list ]]; then
  # --spk-list: filter spk2utt by the given speaker list.
  utils/filter_scp.pl "$spk_list" $srcdir/spk2utt > $destdir/spk2utt || exit 1;
  utils/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk || exit 1;
elif [[ $utt_list ]]; then
  # --utt-list: filter utt2spk by the given utterance list.
  utils/filter_scp.pl "$utt_list" $srcdir/utt2spk > $destdir/utt2spk || exit 1;
  utils/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt || exit 1;
elif $speakers; then
  # --speakers: shuffle spk2utt and keep printing whole speakers while the
  # number of utterances printed so far ("tot") is below $numutt.  A speaker
  # line has NF-1 utterances, so the total can overshoot $numutt.
  utils/shuffle_list.pl < $srcdir/spk2utt |
    awk -v numutt=$numutt '{ if (tot < numutt){ print; } tot += (NF-1); }' |
    sort > $destdir/spk2utt
  utils/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk
elif $perspk; then
  # --per-spk: for each speaker line, choose the largest stride "skip" with
  # n*skip <= (number of utterances), then print every skip-th utterance
  # starting from the first one, stopping at the end of the line or once field
  # index n*skip+1 has been passed.
  awk '{ n='$numutt'; printf("%s ",$1);
         skip=1; while(n*(skip+1) <= NF-1) { skip++; }
         for(x=2; x<=NF && x <= (n*skip+1); x += skip) { printf("%s ", $x); }
         printf("\n"); }' <$srcdir/spk2utt >$destdir/spk2utt
  utils/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk
else
  if $shortest; then
    # Select $numutt shortest utterances.
    # feat-to-len is an external program expected to be on the PATH set up by
    # path.sh; its output is sorted numerically on the second column.
    . ./path.sh
    feat-to-len scp:$srcdir/feats.scp ark,t:$destdir/tmp.len || exit 1;
    sort -n -k2 $destdir/tmp.len |
      awk '{print $1}' |
      head -$numutt >$destdir/tmp.uttlist
    utils/filter_scp.pl $destdir/tmp.uttlist $srcdir/utt2spk >$destdir/utt2spk
    rm $destdir/tmp.uttlist $destdir/tmp.len
  else
    # No mode option, or --first/--last: delegate to subset_scp.pl, passing
    # $first_opt through (it is empty when neither --first nor --last was given).
    utils/subset_scp.pl $first_opt $numutt $srcdir/utt2spk > $destdir/utt2spk || exit 1;
  fi
  utils/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt
fi

# Perform filtering. utt2spk and spk2utt files already exist by this point.
# Every optional file that is present in the source directory is restricted
# to the ids kept in the destination directory.

# Files keyed by utterance-id are filtered with the new utt2spk.
for f in feats.scp vad.scp utt2lang utt2dur utt2num_frames utt2uniq wav.scp utt2warp text; do
  if [ -f $srcdir/$f ]; then
    utils/filter_scp.pl $destdir/utt2spk <$srcdir/$f >$destdir/$f
  fi
done

# Files keyed by speaker-id are filtered with the new spk2utt.
for f in spk2warp spk2gender cmvn.scp; do
  if [ -f $srcdir/$f ]; then
    utils/filter_scp.pl $destdir/spk2utt <$srcdir/$f >$destdir/$f
  fi
done

# Filter by recording-id.  The list of kept recording-ids is written to the
# temporary file $destdir/reco, used for the files below, and then removed.
if [ -f $srcdir/segments ]; then
  utils/filter_scp.pl $destdir/utt2spk <$srcdir/segments >$destdir/segments
  # Recording-ids are in segments.
  awk '{print $2}' $destdir/segments | sort | uniq >$destdir/reco
  # The next line overrides the command above for wav.scp, which would be incorrect.
  [ -f $srcdir/wav.scp ] &&
    utils/filter_scp.pl $destdir/reco <$srcdir/wav.scp >$destdir/wav.scp
elif [ -f $srcdir/wav.scp ]; then
  # No segments; recording-ids are in wav.scp.
  awk '{print $1}' $destdir/wav.scp | sort | uniq >$destdir/reco
else
  # Neither segments nor wav.scp (e.g. a features-only directory).  Previously
  # awk was run on the nonexistent $destdir/wav.scp, which printed an error and
  # left the recording list empty.  Without segments the wav.scp keys are the
  # utterance-ids, so take the recording-ids from utt2spk instead.
  awk '{print $1}' $destdir/utt2spk | sort | uniq >$destdir/reco
fi

[ -f $srcdir/reco2file_and_channel ] &&
  utils/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel
[ -f $srcdir/reco2dur ] &&
  utils/filter_scp.pl $destdir/reco <$srcdir/reco2dur >$destdir/reco2dur

# Filter the STM file for proper sclite scoring.
# Copy over the comments from STM file.
[ -f $srcdir/stm ] &&
  (grep "^;;" $srcdir/stm
   utils/filter_scp.pl $destdir/reco $srcdir/stm) >$destdir/stm

rm $destdir/reco

# Carry the frame_shift file over unchanged when the source has one.
if [ -f $srcdir/frame_shift ]; then
  cp $srcdir/frame_shift $destdir
fi

# Report the utterance counts (line counts of utt2spk) before and after.
echo "$0: reducing #utt from $(wc -l <$srcdir/utt2spk) to $(wc -l <$destdir/utt2spk)"
exit 0


================================================
FILE: egs/utils/subset_scp.pl
================================================
#!/usr/bin/env perl
use warnings; #sed replacement for -w perl parameter
# Copyright 2010-2011 Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This program selects a subset of N elements in the scp.

# By default, it selects them evenly from throughout the scp, in order to avoid
# selecting too many from the same speaker.  It prints them on the standard
# output.
# With the option --first, it just selects the N first utterances.
# With the option --last, it just selects the N last utterances.

# Last modified by JHU & HKUST @2013


# Flags:
#   $quiet  if N exceeds the number of lines, clamp N instead of dying;
#   $first  print the first N lines;
#   $last   print the last N lines.
$quiet = 0;
$first = 0;
$last = 0;

# Consume leading flags.  They used to be recognised only in the fixed order
# --quiet, --first, --last; now any order is accepted (the old order still
# works).  If both --first and --last are given, --first wins, as before.
while (@ARGV > 0 && $ARGV[0] =~ m/^--(quiet|first|last)$/) {
  my $flag = $1;
  shift @ARGV;
  if ($flag eq "quiet") {
    $quiet = 1;
  } elsif ($flag eq "first") {
    $first = 1;
  } else {
    $last = 1;
  }
}

if(@ARGV < 2 ) {
    # The --quiet line used to say "N < num lines", which is the opposite of
    # the condition the script actually dies on (N > num lines).
    die "Usage: subset_scp.pl [--quiet][--first|--last] N in.scp\n" .
        " --quiet  causes it to not die if N > num lines in scp.\n" .
        " --first and --last make it equivalent to head or tail.\n" .
        "See also: filter_scp.pl\n";
}

# N is the number of lines to select.  A value that is numerically zero
# (including a non-numeric string) is rejected.
$N = shift @ARGV;
if($N == 0) {
    die "First command-line parameter to subset_scp.pl must be an integer, got \"$N\"";
}
$inscp = shift @ARGV;

# Open the input with the three-argument form of open so that characters in
# the file name are never interpreted as part of the open mode, and report the
# system error on failure.  "-" keeps its old meaning of standard input (the
# former two-argument open of "<-" read STDIN).
my $in_fh;
if ($inscp eq "-") {
    open($in_fh, "<&", \*STDIN) || die "subset_scp.pl: cannot read standard input: $!\n";
} else {
    open($in_fh, "<", $inscp) || die "subset_scp.pl: error opening input scp file $inscp: $!\n";
}

# Slurp all lines (with their line terminators) into @F.
@F = <$in_fh>;
close($in_fh);

$numlines = @F;
if($N > $numlines) {
  if ($quiet) {
    $N = $numlines;
  } else {
    die "You requested from subset_scp.pl more elements than available: $N > $numlines";
  }
}

# select_n($lo, $hi, $want): print $want of the lines $F[$lo] .. $F[$hi-1].
# The half-open range is split in two at its midpoint, the first half is asked
# for int($want/2) lines and the second half for the rest, recursively, until
# single-line ranges are reached.  Dies if more lines are wanted than the range
# contains.
sub select_n {
  my ($lo, $hi, $want) = @_;
  my $span = $hi - $lo;
  die "select_n: code error" if $want > $span;

  if ($span == 1) {
    # Single line: print it only if it is wanted.
    print $F[$lo] if $want > 0;
    return;
  }

  my $mid = $lo + int($span / 2);
  my $want_left = int($want / 2);
  select_n($lo, $mid, $want_left);
  select_n($mid, $hi, $want - $want_left);
}

# Emit the selection on standard output.  --first takes precedence over
# --last; with neither flag the lines are picked evenly by select_n.
if ($first) {
  # --first option: same as head.
  my $idx = 0;
  while ($idx < $N) {
    print $F[$idx];
    $idx++;
  }
} elsif ($last) {
  # --last option: same as tail.
  for (my $idx = @F - $N; $idx < @F; $idx++) {
    print $F[$idx];
  }
} elsif ($N > 0) {
  select_n(0, $numlines, $N);
}


================================================
FILE: egs/utils/subword/prepare_lang_subword.sh
================================================
#!/usr/bin/env bash
# Copyright 2012-2013  Johns Hopkins University (Author: Daniel Povey);
#                      Arnab Ghoshal
#                2014  Guoguo Chen
#                2015  Hainan Xu
#                2016  FAU Erlangen (Author: Axel Horndasch)
#                2019  Dongji Gao

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This script prepares a directory (for subword) such as data/lang_subword/, in the standard format,
# given a source directory containing a subword dictionary lexicon.txt in a form like:
# subword phone1 phone2 ... phoneN
# per line (alternate prons would be separate lines), or a dictionary with probabilities
# called lexiconp.txt in a form:
# subword pron-prob phone1 phone2 ... phoneN
# (with 0.0 < pron-prob <= 1.0); note: if lexiconp.txt exists, we use it even if
# lexicon.txt exists.
# and also files silence_phones.txt, nonsilence_phones.txt, optional_silence.txt
# and extra_questions.txt
# Here, silence_phones.txt and nonsilence_phones.txt are lists of silence and
# non-silence phones respectively (where silence includes various kinds of
# noise, laugh, cough, filled pauses etc., and nonsilence phones includes the
# "real" phones.)
# In each line of those files is a list of phones, and the phones on each line
# are assumed to correspond to the same "base phone", i.e. they will be
# different stress or tone variations of the same basic phone.
# The file "optional_silence.txt" contains just a single phone (typically SIL)
# which is used for optional silence in the lexicon.
# extra_questions.txt might be empty; typically will consist of lists of phones,
# all members of each list with the same stress or tone; and also possibly a
# list for the silence phones.  This will augment the automatically generated
# questions (note: the automatically generated ones will treat all the
# stress/tone versions of a phone the same, so will not "get to ask" about
# stress or tone).
#

# This script adds word-position-dependent phones and constructs a host of other
# derived files, that go in data/lang_subword/.

# Currently only the most basic functionality is supported.
# Begin configuration section.
# Each variable below can be overridden on the command line through
# utils/parse_options.sh (sourced at the end of this section).

# HMM topology sizes.
num_sil_states=5
num_nonsil_states=3

# Probability of optional silence.
sil_prob=0.5

# If true, then share pdfs of different silence phones together.
share_silence_phones=false

# Whether to add word-position markers to the phones.
# position_dependent_phones is false also when position dependent phones and
# word_boundary.txt have been generated by another source.
position_dependent_phones=true

# If set, use a specified phones.txt file.
phone_symbol_table=

# Standard one phone disambiguation symbol is used for optional silence.
# Increasing this number does not harm, but is only useful if you later
# want to introduce these labels to L_disambig.fst
num_extra_phone_disambig_syms=1

# Separator is a suffix or prefix of subword indicating the position of this
# subword in word.  By default, subword which is not at the end of word would
# have separator as suffix.
# For example: international -> inter@@ nation@@ al
separator="@@"

# end configuration sections

echo "$0 $@"  # Print the command line for logging

. utils/parse_options.sh

# Exactly four positional arguments are required; otherwise print the usage
# text and exit with status 1.  The text used to name utils/prepare_lang.sh
# and to advertise nonterminals.txt, which this script rejects; it now uses $0
# and lists every option from the configuration section.
if [ $# -ne 4 ]; then
  echo "Usage: $0 [options] <dict-src-dir> <oov-dict-entry> <tmp-dir> <lang-dir>"
  echo "e.g.: $0 data/local/dict <SPOKEN_NOISE> data/local/lang_subword data/lang_subword"
  echo "<dict-src-dir> should contain the following files:"
  echo " extra_questions.txt  lexicon.txt nonsilence_phones.txt  optional_silence.txt  silence_phones.txt"
  echo "See http://kaldi-asr.org/doc/data_prep.html#data_prep_lang_creating for more info."
  echo "Note: nonterminals.txt (grammar decoding) and lexiconp_silprob.txt (word-dependent"
  echo "silence probabilities) are not supported by this script."
  echo "options: "
  echo "     --num-sil-states <number of states>             # default: 5, #states in silence models."
  echo "     --num-nonsil-states <number of states>          # default: 3, #states in non-silence models."
  echo "     --position-dependent-phones (true|false)        # default: true; if true, use _B, _E, _S & _I"
  echo "                                                     # markers on phones to indicate word-internal positions. "
  echo "     --share-silence-phones (true|false)             # default: false; if true, share pdfs of "
  echo "                                                     # all silence phones. "
  echo "     --sil-prob <probability of silence>             # default: 0.5 [must have 0 <= silprob < 1]"
  echo "     --phone-symbol-table <filename>                 # default: \"\"; if set, use the given phones.txt"
  echo "     --num-extra-phone-disambig-syms <number>        # default: 1; number of extra phone disambig symbols"
  echo "     --separator <separator>                         # default: @@"
  exit 1;
fi

# Positional arguments: dictionary source directory, OOV dictionary entry,
# temporary directory and output lang directory.
srcdir=$1
oov_word=$2
tmpdir=$3
dir=$4
mkdir -p $dir $tmpdir $dir/phones

# Word-dependent silence probabilities are not supported: silprob stays false
# and a lexiconp_silprob.txt in the source directory is treated as an error.
silprob=false
if [ -f $srcdir/lexiconp_silprob.txt ]; then
  echo "$0: Currently we do not support word-dependent silence probability." && exit 1;
fi

# Nonterminal symbols are not supported either.
if [ ! -f $srcdir/nonterminals.txt ]; then
  grammar_opts=
else
  echo "$0: Currently we do not support nonterminals" && exit 1;
fi

[ -f path.sh ] && . ./path.sh

# Validate dict directory
! utils/validate_dict_dir.pl $srcdir && \
  echo "*Error validating directory $srcdir*" && exit 1;

# Whichever of lexicon.txt / lexiconp.txt is missing is derived from the other:
# dropping the second column (the pron-prob), or inserting a pron-prob of 1.0.
if [[ ! -f $srcdir/lexicon.txt ]]; then
  echo "**Creating $srcdir/lexicon.txt from $srcdir/lexiconp.txt"
  perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' < $srcdir/lexiconp.txt > $srcdir/lexicon.txt || exit 1;
fi
if [[ ! -f $srcdir/lexiconp.txt ]]; then
  echo "**Creating $srcdir/lexiconp.txt from $srcdir/lexicon.txt"
  perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $srcdir/lexiconp.txt || exit 1;
fi

# Currently The lexicon in dict directory have to be a subword lexicon.
# If the lexicon is for word and is not phonemic, we can not get a subword lexicon without knowing the alignment.
# The separator is matched as a fixed string (-F) and passed after "--", so a
# user-supplied separator containing regex or glob characters, or starting
# with "-", is searched for literally instead of being interpreted by grep or
# expanded by the shell.  This is fatal, so the message says "Error".
! grep -qF -- "$separator" $srcdir/lexiconp.txt && \
echo "$0: Error, this lexicon contains no separator \"$separator\" and may not be a subword lexicon." && exit 1;

# Write the separator into file for future use.
# printf with a quoted argument writes the separator verbatim, even if it
# contains glob characters or looks like an echo option.
printf '%s\n' "$separator" > $dir/subword_separator.txt

# Validate again now that both lexicon files exist; on failure re-run the
# validator without discarding its output so the user sees the reason.
if ! utils/validate_dict_dir.pl $srcdir >&/dev/null; then
  utils/validate_dict_dir.pl $srcdir  # show the output.
  echo "Validation failed (second time)"
  exit 1;
fi

# If a phones.txt was supplied, sanity-check it against the requested setup.
if [ ! -z $phone_symbol_table ]; then
  # Count the distinct symbols other than disambiguation symbols (#0, #1, ...),
  # once as they are (n1) and once with a trailing _B/_I/_E/_S removed (n2).
  # The counts differ exactly when position markers are present.
  n1=$(grep -v -E "^#[0-9]+$" $phone_symbol_table | cut -d' ' -f1 | sort -u | wc -l)
  n2=$(grep -v -E "^#[0-9]+$" $phone_symbol_table | cut -d' ' -f1 | sed 's/_[BIES]$//g' | sort -u | wc -l)
  if $position_dependent_phones && [ $n1 -eq $n2 ]; then
    echo "$0: Position dependent phones requested, but not in provided phone symbols" && exit 1;
  fi
  if ! $position_dependent_phones && [ $n1 -ne $n2 ]; then
    echo "$0: Position dependent phones not requested, but appear in the provided phones.txt" && exit 1;
  fi

  # Every phone listed in silence_phones.txt / nonsilence_phones.txt must occur
  # in the supplied table (compared after stripping position markers).
  cat $srcdir/{,non}silence_phones.txt | awk -v f=$phone_symbol_table '
  BEGIN { while ((getline < f) > 0) { sub(/_[BEIS]$/, "", $1); phones[$1] = 1; }}
  { for (x = 1; x <= NF; ++x) { if (!($x in phones)) {
      print "Phone appears in the lexicon but not in the provided phones.txt: "$x; exit 1; }}}' || exit 1;
fi

# Produce $tmpdir/lexiconp.txt and $tmpdir/phone_map.txt.  With
# position-dependent phones the lexicon is rewritten by a helper script and the
# map lists every positional variant of each phone; otherwise the lexicon is
# copied and every phone maps to itself.
if $position_dependent_phones; then
  # Create $tmpdir/lexiconp.txt from $srcdir/lexiconp.txt (or
  # $tmpdir/lexiconp_silprob.txt from $srcdir/lexiconp_silprob.txt) by
  # adding the markers _B, _E, _S, _I depending on word position.
  # In this recipe, these markers apply to silence also.
  # Do this starting from lexiconp.txt only.
  if "$silprob"; then
    echo "$0: Currently we do not support word-dependent silence probability" && exit 1;
  else
    utils/lang/make_position_dependent_subword_lexicon.py $srcdir/lexiconp.txt > $tmpdir/lexiconp.txt || exit 1;
  fi

  # create $tmpdir/phone_map.txt
  # this has the format (on each line)
  # <original phone> <version 1 of original phone> <version 2> ...
  # where the versions depend on the position of the phone within a word.
  # For instance, we'd have:
  # AA AA_B AA_E AA_I AA_S
  # for (B)egin, (E)nd, (I)nternal and (S)ingleton
  # and in the case of silence
  # SIL SIL SIL_B SIL_E SIL_I SIL_S
  # [because SIL on its own is one of the variants; this is for when it doesn't
  #  occur inside a word but as an option in the lexicon.]

  # This phone map expands the phone lists into all the word-position-dependent
  # versions of the phone lists.
  # "set -f" turns off pathname expansion inside each subshell, so phone names
  # read via `cat` are not glob-expanded.  The empty suffix appears twice for
  # silence phones (map key plus the bare variant) and once for the others.
  cat <(set -f; for x in `cat $srcdir/silence_phones.txt`; do for y in "" "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \
    <(set -f; for x in `cat $srcdir/nonsilence_phones.txt`; do for y in "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \
    > $tmpdir/phone_map.txt
else
  if "$silprob"; then
    echo "$0: Currently we do not support word-dependent silence probability" && exit 1;
  else
    cp $srcdir/lexiconp.txt $tmpdir/lexiconp.txt
  fi

  # Identity map: one phone per line, pasted next to itself.
  cat $srcdir/silence_phones.txt $srcdir/nonsilence_phones.txt | \
    awk '{for(n=1;n<=NF;n++) print $n; }' > $tmpdir/phones
  paste -d' ' $tmpdir/phones $tmpdir/phones > $tmpdir/phone_map.txt
fi

mkdir -p $dir/phones  # various sets of phones...

# Sets of phones for use in clustering, and making monophone systems.
# sets.txt is the phone lists pushed through the phone map; roots.txt is
# sets.txt with two sharing keywords prepended to every line.

if $share_silence_phones; then
  # build a roots file that will force all the silence phones to share the
  # same pdf's. [three distinct states, only the transitions will differ.]
  # 'shared'/'not-shared' means, do we share the 3 states of the HMM
  # in the same tree-root?
  # Sharing across models(phones) is achieved by writing several phones
  # into one line of roots.txt (shared/not-shared doesn't affect this).
  # 'not-shared not-split' means we have separate tree roots for the 3 states,
  # but we never split the tree so they remain stumps,
  # so all phones in the line correspond to the same model.

  # All silence phones are joined onto one line (the first), followed by the
  # non-silence lines unchanged.
  cat $srcdir/silence_phones.txt | \
    awk '{printf("%s ", $0); } END{printf("\n");}' | \
    cat - $srcdir/nonsilence_phones.txt | \
    utils/apply_map.pl $tmpdir/phone_map.txt > $dir/phones/sets.txt
  awk '{if(NR==1) print "not-shared", "not-split", $0; else print "shared", "split", $0;}' \
    $dir/phones/sets.txt > $dir/phones/roots.txt
else
  # different silence phones will have different GMMs.  [note: here, all "shared split" means
  # is that we may have one GMM for all the states, or we can split on states.  because they're
  # context-independent phones, they don't see the context.]
  cat $srcdir/{,non}silence_phones.txt | \
    utils/apply_map.pl $tmpdir/phone_map.txt > $dir/phones/sets.txt
  awk '{print "shared", "split", $0;}' $dir/phones/sets.txt > $dir/phones/roots.txt
fi

# Flat lists, one mapped phone per line.
cat $srcdir/silence_phones.txt | \
  utils/apply_map.pl $tmpdir/phone_map.txt | \
  awk '{for(n=1;n<=NF;n++) print $n;}' > $dir/phones/silence.txt
cat $srcdir/nonsilence_phones.txt | \
  utils/apply_map.pl $tmpdir/phone_map.txt | \
  awk '{for(n=1;n<=NF;n++) print $n;}' > $dir/phones/nonsilence.txt

# The optional-silence phone is copied as is; the context-independent phones
# are the silence phones.
cp $srcdir/optional_silence.txt $dir/phones/optional_silence.txt
cp $dir/phones/silence.txt $dir/phones/context_indep.txt

# if extra_questions.txt is empty, it's OK.
cat $srcdir/extra_questions.txt 2>/dev/null | \
  utils/apply_map.pl $tmpdir/phone_map.txt >$dir/phones/extra_questions.txt

# Add extra questions about word position: for each position suffix, one line
# listing every phone carrying that suffix.  Silence and non-silence phones
# get separate lines (silence additionally gets a line with no suffix).
# Probably doesn't matter, as silence will rarely be inside a word.
if $position_dependent_phones; then
  for suffix in _B _E _I _S; do
    awk -v sfx="$suffix" \
      '{ for (i = 1; i <= NF; i++) printf("%s%s ", $i, sfx); } END { printf("\n"); }' \
      $srcdir/nonsilence_phones.txt >>$dir/phones/extra_questions.txt
  done
  for suffix in "" _B _E _I _S; do
    awk -v sfx="$suffix" \
      '{ for (i = 1; i <= NF; i++) printf("%s%s ", $i, sfx); } END { printf("\n"); }' \
      $srcdir/silence_phones.txt >>$dir/phones/extra_questions.txt
  done
fi

# add_lex_disambig.pl is responsible for adding disambiguation symbols to
# the lexicon, for telling us how many disambiguation symbols it used,
# and also for modifying the unknown-word's pronunciation (if the
# --unk-fst was provided) to the sequence "#1 #2 #3", and reserving those
# disambig symbols for that purpose.
# The #2 will later be replaced with the actual unk model.  The reason
# for the #1 and the #3 is for disambiguation and also to keep the
# FST compact.  If we didn't have the #1, we might have a different copy of
# the unk-model FST, or at least some of its arcs, for each start-state from
# which an <unk> transition comes (instead of per end-state, which is more compact);
# and adding the #3 prevents us from potentially having 2 copies of the unk-model
# FST due to the optional-silence [the last phone of any word gets 2 arcs].
#
# NOTE(review): $unk_opt is not assigned anywhere in the part of this script
# above this point, so it normally expands to nothing; it is kept only so that
# a value inherited from the environment still has its previous effect.

if "$silprob"; then
  echo "$0: Currently we do not support word-dependent silence probability" && exit 1;
else
  # Stop if the helper fails.  Previously a failure left ndisambig empty and the
  # arithmetic below silently turned it into a plausible-looking number.
  ndisambig=$(utils/add_lex_disambig.pl $unk_opt --pron-probs $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt) || exit 1
fi
# add (at least) one disambig symbol for silence in lexicon FST.
# $(( )) is the standard arithmetic expansion; the former $[ ] form is
# deprecated in bash.
ndisambig=$(($ndisambig+$num_extra_phone_disambig_syms))
echo $ndisambig > $tmpdir/lex_ndisambig

# Format of lexiconp_disambig.txt:
# !SIL	1.0   SIL_S
# <SPOKEN_NOISE>	1.0   SPN_S #1
# <UNK>	1.0  SPN_S #2
# <NOISE>	1.0  NSN_S
# !EXCLAMATION-POINT	1.0  EH2_B K_I S_I K_I L_I AH0_I M_I EY1_I SH_I AH0_I N_I P_I OY2_I N_I T_E

# One disambiguation symbol per line: #0, #1, ..., #$ndisambig.
( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) >$dir/phones/disambig.txt

# Create phone symbol table.
if [ ! -z $phone_symbol_table ]; then
  start_symbol=`grep \#0 $phone_symbol_table | awk '{print $2}'`
  echo "<eps>" | cat - $dir/phones/{silence,nonsilence}.txt | awk -v f=$phone_symbol_table '
  BEGIN { while ((getline < f) > 0) { phones[$1] = $2; }} { print $1" "phones[$1]; }' | sort -k2 -g |\
    cat - <(cat $dir/phones/disambig.txt | awk -v x=$start_symbol '{n=x+NR-1; print $1, n;}') > $dir/phones.txt
else
  echo "<eps>" | cat - $dir/phones/{silence,nonsilence,disambig}.txt | \
     awk '{n=NR-1; print $1, n;}' > $dir/phones.txt
fi

# Create a file that describes the word-boundary information for
# each phone.  5 categories.
if $position_dependent_phones; then
  cat $dir/phones/{silence,nonsilence}.txt | \
    awk '/_I$/{print $1, "internal"; next;} /_B$/{print $1, "begin"; next; }
         /_S$/{print $1, "singleton"; next;} /_E$/{print $1, "end"; next; }
         {print $1, "nonword";} ' > $dir/phones/word_boundary_moved.txt
else
  # word_boundary.txt might have been generated by another source
  [ -f $srcdir/word_boundary.txt ] && cp $srcdir/word_boundary.txt $dir/phones/word_boundary_moved.txt
fi

# Create word symbol table.
# <s> and </s> are only needed due to the need to rescore lattices with
# ConstArpaLm format language model. They do not normally appear in G.fst or
# L.fst.

if "$silprob"; then
  echo "$0: Currently we do not support word-dependent silence probability" && exit 1;
fi

cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq  | awk '
  BEGIN {
    print "<eps> 0";
  }
  {
    if ($1 == "<s>") {
      print "<s> is in the vocabulary!" | "cat 1>&2"
      exit 1;
    }
    if ($1 == "</s>") {
      print "</s> is in the vocabulary!" | "cat 1>&2"
      exit 1;
    }
    printf("%s %d\n", $1, NR);
  }
  END {
    printf("#0 %d\n", NR+1);
    printf("<s> %d\n", NR+2);
    printf("</s> %d\n", NR+3);
  }' > $dir/words.txt || exit 1;

# In case there are extra word-level disambiguation symbols they also
# need to be added to words.txt

# format of $dir/words.txt:
# <eps> 0
# a 1
# aa 2
# aarvark 3
# ...

silphone=`cat $srcdir/optional_silence.txt` || exit 1;
[ -z "$silphone" ] && \
  ( echo "You have no optional-silence phone; it is required in the current scripts"
    echo "but you may use the option --sil-prob 0.0 to stop it being used." ) && \
   exit 1;

# create $dir/phones/align_lexicon.{txt,int}.
# This is the method we use for lattice word alignment if we are not
# using word-position-dependent phones.

# First remove pron-probs from the lexicon.
perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' <$tmpdir/lexiconp.txt >$tmpdir/align_lexicon.txt

# Note: here, $silphone will have no suffix e.g. _S because it occurs as optional-silence,
# and is not part of a word.
[ ! -z "$silphone" ] && echo "<eps> $silphone" >> $tmpdir/align_lexicon.txt

cat $tmpdir/align_lexicon.txt | \
  perl -ane '@A = split; print $A[0], " ", join(" ", @A), "\n";' | sort | uniq > $dir/phones/align_lexicon.txt

# create phones/align_lexicon.int from phones/align_lexicon.txt
cat $dir/phones/align_lexicon.txt | utils/sym2int.pl -f 3- $dir/phones.txt | \
  utils/sym2int.pl -f 1-2 $dir/words.txt > $dir/phones/align_lexicon.int

# Create the basic L.fst without disambiguation symbols, for use
# in training.

if $silprob; then
#  # Add silence probabilities (models the prob. of silence before and after each
#  # word).  On some setups this helps a bit.  See utils/dict_dir_add_pronprobs.sh
#  # and where it's called in the example scripts (run.sh).
  echo "$0: Currently we do not support word-dependnet silence probability" && exit 1;
else
  utils/lang/make_subword_lexicon_fst.py $grammar_opts --sil-prob=$sil_prob --sil-phone=$silphone --position-dependent\
            --separator=$separator $tmpdir/lexiconp.txt | \
    fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
      --keep_isymbols=false --keep_osymbols=false | \
    fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;
fi

# The file oov.txt contains a word that we will map any OOVs to during
# training.
echo "$oov_word" > $dir/oov.txt || exit 1;
cat $dir/oov.txt | utils/sym2int.pl $dir/words.txt >$dir/oov.int || exit 1;
# integer version of oov symbol, used in some scripts.

# the file wdisambig.txt contains a (line-by-line) list of the text-form of the
# disambiguation symbols that are used in the grammar and passed through by the
# lexicon.  At this stage it's hardcoded as '#0', but we're laying the groundwork
# for more generality (which probably would be added by another script).
# wdisambig_words.int contains the corresponding list interpreted by the
# symbol table words.txt, and wdisambig_phones.int contains the corresponding
# list interpreted by the symbol table phones.txt.
echo '#0' >$dir/phones/wdisambig.txt

utils/sym2int.pl $dir/phones.txt <$dir/phones/wdisambig.txt >$dir/phones/wdisambig_phones.int
utils/sym2int.pl $dir/words.txt <$dir/phones/wdisambig.txt >$dir/phones/wdisambig_words.int

# Create these lists of phones in colon-separated integer list form too,
# for purposes of being given to programs as command-line options.
for f in silence nonsilence optional_silence disambig context_indep; do
  utils/sym2int.pl $dir/phones.txt <$dir/phones/$f.txt >$dir/phones/$f.int
  utils/sym2int.pl $dir/phones.txt <$dir/phones/$f.txt | \
   awk '{printf(":%d", $1);} END{printf "\n"}' | sed s/:// > $dir/phones/$f.csl || exit 1;
done

for x in sets extra_questions; do
  utils/sym2int.pl $dir/phones.txt <$dir/phones/$x.txt > $dir/phones/$x.int || exit 1;
done

utils/sym2int.pl -f 3- $dir/phones.txt <$dir/phones/roots.txt \
   > $dir/phones/roots.int || exit 1;

if [ -f $dir/phones/word_boundary_moved.txt ]; then
  utils/sym2int.pl -f 1 $dir/phones.txt <$dir/phones/word_boundary_moved.txt \
    > $dir/phones/word_boundary_moved.int || exit 1;
fi

silphonelist=`cat $dir/phones/silence.csl`
nonsilphonelist=`cat $dir/phones/nonsilence.csl`

# Note: it's OK, after generating the 'lang' directory, to overwrite the topo file
# with another one of your choice if the 'topo' file you want can't be generated by
# utils/gen_topo.pl.  We do this in the 'chain' recipes.  Of course, the 'topo' file
# should cover all the phones.  Try running utils/validate_lang.pl to check that
# everything is OK after modifying the topo file.
utils/gen_topo.pl $num_nonsil_states $num_sil_states $nonsilphonelist $silphonelist >$dir/topo

# Create the lexicon FST with disambiguation symbols, and put it in lang_test.
# There is an extra step where we create a loop to "pass through" the
# disambiguation symbols from G.fst.

if $silprob; then
  echo "$0: Currently we do not support word-dependnet silence probability" && exit 1;
else
  utils/lang/make_subword_lexicon_fst.py $grammar_opts \
       --sil-prob=$sil_prob --sil-phone=$silphone --sil-disambig='#'$ndisambig --position-dependent \
       --separator=$separator $tmpdir/lexiconp_disambig.txt | \
     fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
       --keep_isymbols=false --keep_osymbols=false |   \
     fstaddselfloops  $dir/phones/wdisambig_phones.int $dir/phones/wdisambig_words.int | \
     fstarcsort --sort_type=olabel > $dir/L_disambig.fst || exit 1;
fi

echo "$(basename $0): validating output directory"
! utils/validate_lang.pl $dir && echo "$(basename $0): error validating output" &&  exit 1;

exit 0;


================================================
FILE: egs/utils/subword/prepare_subword_text.sh
================================================
#!/usr/bin/env bash

# 2019 Dongji Gao

# This script generates subword text from word text.
# For example, <noise> international -> <noise> inter@@ nation@@ al
# @@ here is the separator that indicates the position of a subword in a word.
# A subword directly followed by the separator can only appear at the beginning
# or in the middle of a word.
# "<noise>" here can be reserved if added to the option "--glossaries"

# Begin configuration section
separator="@@"
glossaries=
# End configuration section

. utils/parse_options.sh

echo "$0 $@"

if [ $# -ne 3 ]; then
  echo "Usage: utils/prepare_subword_text.sh <word-text> <pair_code> <subword-text>"
  echo "e.g.: utils/prepare_subword_text.sh data/train/text data/local/pair_code.txt data/train/text_subword"
  echo "    --separator <separator>         # default: @@"
  echo "    --glossaries <reserved-words>   # glossaries are words reserved"
  exit 1;
fi

word_text=$1
pair_code=$2
subword_text=$3

[ ! -f "$word_text" ] && echo "Word text $word_text does not exist." && exit 1;

# Match the separator as a fixed string (-F), not as a regular expression, and
# use "--" so that a separator starting with "-" is not taken for an option.
grep -q -F -- "$separator" "$word_text" && echo "$0: Error, word text file contains separator $separator. This might be a subword text file or you need to choose a different separator" && exit 1;

# Only pass --glossaries on when the user actually supplied some.  It is left
# unquoted below on purpose so that several glossary words split into
# separate arguments.
glossaries_opt=
[ -n "$glossaries" ] && glossaries_opt="--glossaries $glossaries"

# Field 1 of every line is the utterance-id; only the words (field 2 onwards)
# are segmented, and the id column is pasted back in front afterwards.
cut -d ' ' -f2- "$word_text" | \
  utils/lang/bpe/apply_bpe.py -c "$pair_code" --separator "$separator" $glossaries_opt > "${word_text}.sub"

if [ "$word_text" == "$subword_text" ]; then
  # In-place conversion: keep the original word text as <word-text>.old.
  mv "$word_text" "${word_text}.old"
  cut -d ' ' -f1 "${word_text}.old" | paste -d ' ' - "${word_text}.sub" > "$subword_text"
else
  cut -d ' ' -f1 "$word_text" | paste -d ' ' - "${word_text}.sub" > "$subword_text"
fi

rm "${word_text}.sub"
echo "Subword text created."


================================================
FILE: egs/utils/summarize_logs.pl
================================================
#!/usr/bin/env perl

# Copyright 2012 Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.

#scalar(@ARGV) >= 1 && print STDERR "Usage: summarize_warnings.pl <log-dir>\n" && exit 1;

# Split a whitespace-separated list of filenames into groups of at most 100.
#   $names: string holding the filenames, separated by whitespace.
# Returns a list of strings; each holds up to 100 filenames, every one of
# them followed by a single space.  Names are popped from the end of the
# list, so they appear in reverse order of the input.
sub split_hundreds {
  my $names = shift @_;
  my @A = split(" ", $names);
  my @ans = ();
  while (@A > 0) {
    my $group = "";
    # The counter and the filename are lexical so that calling this sub does
    # not overwrite package globals of the same name.
    for (my $x = 0; $x < 100 && @A > 0; $x++) {
      my $fname = pop @A;
      $group .= "$fname ";
    }
    push @ans, $group;
  }
  return @ans;
}

# Parse one accounting entry of the form "time=<int> threads=<int>".
#   argument: the entry text (whitespace-separated key=value elements).
# Returns the list ($time, $threads).  Dies if an element is neither a time=
# nor a threads= item, or if one of the two is missing.
# NOTE: $entry, @elems, $time and $threads are package globals and are
# overwritten on every call.
sub parse_accounting_entry {
  $entry = shift @_;
  @elems = split " ", $entry;

  ($time, $threads) = (undef, undef);
  foreach $elem (@elems) {
    if ($elem =~ m/time=(\d+)/) {
      # Strip the "time=" tag, keeping whatever else is in the element.
      $elem =~ s/time=(\d+)/$1/;
      $time = $elem;
      next;
    }
    if ($elem =~ m/threads=(\d+)/) {
      $elem =~ s/threads=(\d+)/$1/g;
      $threads = $elem;
      next;
    }
    die "Unknown entry \"$elem\" when parsing \"$entry\" \n";
  }

  die "The accounting entry \"$entry\" did not contain all necessary attributes"
    unless defined($time) and defined($threads);
  return ($time, $threads);
}

# Walk every log directory named on the command line and bucket its *.log
# files by category in the global %fmap (category pattern => " file file ...").
foreach $dir (@ARGV) {

  # Report which directory is being scanned.  The original print statement was
  # missing its terminating semicolon, which fused it with the directory test
  # that follows.
  print "$dir\n";

  if (! -d $dir) {
    print STDERR "summarize_logs.pl: no such directory $dir\n";
  }

  $dir =~ s:/$::; # Remove trailing slash.


  # Group the files into categories where all have the same base-name.
  foreach $f (glob ("$dir/*.log")) {
    $f_category = $f;
    # Replace a numeric job index such as ".12." by the pattern ".*." so that
    # e.g. foo.1.log and foo.2.log share the category "foo.*.log".  The
    # negative lookahead skips a ".N." that is immediately followed by digits.
    $f_category =~ s:\.\d+\.(?!\d+):.*.:;
    $fmap{$f_category} .= " $f";
  }
}

# For every log category (in sorted order) count the lines that contain the
# word WARNING, and print the count if it is non-zero.  The files are handed
# to grep in groups of at most 100 names.
for $c (sort keys %fmap) {
  $n = 0;
  for $fgroup (split_hundreds($fmap{$c})) {
    $n += `grep -w WARNING $fgroup | wc -l`;
  }
  next if $n == 0;
  print "$n warnings in $c\n";
}
# Same again for lines containing the word ERROR.
for $c (sort keys %fmap) {
  $n = 0;
  for $fgroup (split_hundreds($fmap{$c})) {
    $n += `grep -w ERROR $fgroup | wc -l`;
  }
  next if $n == 0;
  print "$n errors in $c\n";
}

# Grand totals over all log categories.
$supertotal_cpu_time=0.0;
$supertotal_clock_time=0.0;
$supertotal_threads=0.0;

# For each category, read the "# Accounting: ..." lines of its log files and
# accumulate: cpu time (time * threads summed over entries), clock time (the
# largest single time seen) and the total number of threads.
foreach $c (sort (keys %fmap)) {
  $n = 0;

  $total_cpu_time=0.0;
  $total_clock_time=0.0;
  $total_threads=0.0;
  foreach $fgroup (split_hundreds($fmap{$c})) {
    # Keep only the text following "# Accounting: " on each matching line.
    $lines=`grep -a "# Accounting: " $fgroup |sed 's/.* Accounting: *//g'`;

    #print $lines ."\n";

    @entries = split "\n", $lines;

    foreach $line (@entries) {
      # The parentheses make this a list assignment.  Without them only
      # $threads was assigned, and $time was correct merely because
      # parse_accounting_entry happens to set the same global variable.
      ($time, $threads) = parse_accounting_entry($line);

      $total_cpu_time += $time * $threads;
      $total_threads += $threads;
      if ( $time > $total_clock_time ) {
        $total_clock_time = $time;
      }
    }
  }
  print "total_cpu_time=$total_cpu_time clock_time=$total_clock_time total_threads=$total_threads group=$c\n";

  $supertotal_cpu_time += $total_cpu_time;
  $supertotal_clock_time += $total_clock_time;
  $supertotal_threads += $total_threads;
}
print "total_cpu_time=$supertotal_cpu_time clock_time=$supertotal_clock_time total_threads=$supertotal_threads group=all\n";


================================================
FILE: egs/utils/summarize_warnings.pl
================================================
#!/usr/bin/env perl

# Copyright 2012 Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.

 # Expect exactly one argument: the directory that holds the *.log files.
# Explicit if-blocks are used because "print STDERR "..." && exit 1" parses as
# print STDERR ("..." && exit 1): the exit runs while the argument list is
# being evaluated, so the message would never be printed.
if (@ARGV != 1) {
  print STDERR "Usage: summarize_warnings.pl <log-dir>\n";
  exit 1;
}

$dir = $ARGV[0];

if (! -d $dir) {
  print STDERR "summarize_warnings.pl: no such directory $dir\n";
  exit 1;
}

$dir =~ s:/$::; # Remove trailing slash.


# Group the files into categories where all have the same base-name.
# %fmap maps a category pattern to a string " file file ...".
foreach $f (glob ("$dir/*.log")) {
  $f_category = $f;
  # do next expression twice; s///g doesn't work as they overlap.
  $f_category =~ s:\.\d+\.:.*.:;
  $f_category =~ s:\.\d+\.:.*.:;
  $fmap{$f_category} .= " $f";
}

# Split a whitespace-separated list of filenames into groups of at most 100.
#   $names: string holding the filenames, separated by whitespace.
# Returns a list of strings; each holds up to 100 filenames, every one of
# them followed by a single space.  Names are popped from the end of the
# list, so they appear in reverse order of the input.
sub split_hundreds {
  my $names = shift @_;
  my @A = split(" ", $names);
  my @ans = ();
  while (@A > 0) {
    my $group = "";
    # The counter and the filename are lexical so that calling this sub does
    # not overwrite package globals of the same name.
    for (my $x = 0; $x < 100 && @A > 0; $x++) {
      my $fname = pop @A;
      $group .= "$fname ";
    }
    push @ans, $group;
  }
  return @ans;
}

# For every log category count the lines that contain the word WARNING and
# print the count if it is non-zero.  Categories are visited in sorted order
# so that the report is the same from run to run (hash key order is not).
foreach $c (sort (keys %fmap)) {
  $n = 0;
  # grep is run on groups of at most 100 files at a time.
  foreach $fgroup (split_hundreds($fmap{$c})) {
    $n += `grep -w WARNING $fgroup | wc -l`;
  }
  if ($n != 0) {
    print "$n warnings in $c\n"
  }
}


================================================
FILE: egs/utils/sym2int.pl
================================================
#!/usr/bin/env perl
# Copyright 2010-2012 Microsoft Corporation  Johns Hopkins University (Author: Daniel Povey)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# Command-line option parsing.  Sets $map_oov (from --map-oov) and the
# zero-based, inclusive field range $field_begin / $field_end (from -f).
# Either bound stays undefined when the range is open on that side.

# NOTE(review): $ignore_oov is assigned here but not read in the option
# parsing or the main loop of this script.
$ignore_oov = 0;

# Two passes, so that --map-oov and -f are accepted in either order.
for($x = 0; $x < 2; $x++) {
  if ($ARGV[0] eq "--map-oov") {
    shift @ARGV;
    $map_oov = shift @ARGV;
    if ($map_oov eq "-f" || $map_oov =~ m/words\.txt$/ || $map_oov eq "") {
      # disallow '-f', the empty string and anything ending in words.txt as the
      # OOV symbol because these are likely command-line errors.
      die "the --map-oov option requires an argument";
    }
  }
  if ($ARGV[0] eq "-f") {
    shift @ARGV;
    $field_spec = shift @ARGV;
    # A single number N selects exactly field N.
    if ($field_spec =~ m/^\d+$/) {
      $field_begin = $field_spec - 1; $field_end = $field_spec - 1;
    }
    # A range "A-B"; either side may be left empty for an open-ended range.
    if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesy (properly, 1-10)
      if ($1 ne "") {
        $field_begin = $1 - 1;  # Change to zero-based indexing.
      }
      if ($2 ne "") {
        $field_end = $2 - 1;    # Change to zero-based indexing.
      }
    }
    # Neither form matched (or the range had no bound at all).
    if (!defined $field_begin && !defined $field_end) {
      die "Bad argument to -f option: $field_spec";
    }
  }
}

# The first remaining argument is the symbol table; any further arguments are
# the input files read by the main loop (stdin if there are none).
$symtab = shift @ARGV;
if (!defined $symtab) {
  print STDERR "Usage: sym2int.pl [options] symtab [input transcriptions] > output transcriptions\n" .
    "options: [--map-oov <oov-symbol> ]  [-f <field-range> ]\n" .
      "note: <field-range> can look like 4-5, or 4-, or 5-, or 1.\n";
  # Stop here; previously the script carried on and died with a confusing
  # "Error opening symbol table file" message for an empty file name.
  exit(1);
}

# Read the table into %sym2int; every line must be "<symbol> <integer>".
open(F, "<$symtab") || die "Error opening symbol table file $symtab";
while(<F>) {
    @A = split(" ", $_);
    @A == 2 || die "bad line in symbol table file: $_";
    $sym2int{$A[0]} = $A[1] + 0;  # "+ 0" stores the id as a number.
}

# --map-oov may be given as a symbol rather than an integer; resolve it.
if (defined $map_oov && $map_oov !~ m/^\d+$/) { # not numeric-> look it up
  if (!defined $sym2int{$map_oov}) { die "OOV symbol $map_oov not defined."; }
  $map_oov = $sym2int{$map_oov};
}

# Counters that cap the number of "replacing ..." messages on stderr.
$num_warning = 0;
$max_warning = 20;

# Convert every input line: fields inside the selected range are replaced by
# their integer id from %sym2int, all other fields are copied unchanged.
while (<>) {
  @A = split(" ", $_);
  @B = ();
  for ($n = 0; $n < @A; $n++) {
    $a = $A[$n];

    # Field lies outside the -f range: pass it through as it is.
    if ((defined $field_begin && $n < $field_begin)
        || (defined $field_end && $n > $field_end)) {
      push @B, $a;
      next;
    }

    $i = $sym2int{$a};
    if (!defined ($i)) {
      # Unknown symbol: fatal unless --map-oov supplied a replacement.
      if (!defined $map_oov) {
        $pos = $n+1;
        die "sym2int.pl: undefined symbol $a (in position $pos)\n";
      }
      # Only the first $max_warning replacements are reported.
      if ($num_warning++ < $max_warning) {
        print STDERR "sym2int.pl: replacing $a with $map_oov\n";
        if ($num_warning == $max_warning) {
          print STDERR "sym2int.pl: not warning for OOVs any more times\n";
        }
      }
      $i = $map_oov;
    }
    $a = $i;
    push @B, $a;
  }
  print join(" ", @B), "\n";
}

print STDERR "** Replaced $num_warning instances of OOVs with $map_oov\n"
  if $num_warning > 0;

exit(0);


================================================
FILE: egs/utils/utt2spk_to_spk2utt.pl
================================================
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# converts an utt2spk file to a spk2utt file.
# Takes input from the stdin or from a file argument;
# output goes to the standard out.

# At most one (optional) input file is accepted; otherwise stdin is read.
die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt" if @ARGV > 1;

# Read "<utterance> <speaker>" pairs.  @spklist records the speakers in the
# order they are first seen; %spk_hash collects each speaker's utterances.
while (<>) {
    my @fields = split(" ", $_);
    die "Invalid line in utt2spk file: $_" unless @fields == 2;
    my ($utt, $spk) = @fields;
    push @spklist, $spk unless exists $spk_hash{$spk};
    push @{$spk_hash{$spk}}, "$utt";
}

# Print one "<speaker> <utt1> <utt2> ..." line per speaker.
foreach my $spk (@spklist) {
    print "$spk " . join(' ', @{$spk_hash{$spk}}) . "\n";
}


================================================
FILE: egs/utils/validate_data_dir.sh
================================================
#!/usr/bin/env bash

# Keep the complete original argument list; it is passed on to
# image/validate_data_dir.sh below when the directory contains images.scp.
cmd="$@"

# Defaults for the command-line flags parsed below.  The values are the
# command names "true"/"false" because they are executed directly in tests
# such as `if ! $no_text; then`.
no_feats=false
no_wav=false
no_text=false
no_spk_sort=false
non_print=false


# Print the usage summary of this script to stdout, one line per argument.
function show_help {
  printf '%s\n' \
    "Usage: $0 [--no-feats] [--no-text] [--non-print] [--no-wav] [--no-spk-sort] <data-dir>" \
    "The --no-xxx options mean that the script does not require " \
    "xxx.scp to be present, but it will check it if it is present." \
    "--no-spk-sort means that the script does not require the utt2spk to be " \
    "sorted by the speaker-id in addition to being sorted by utterance-id." \
    "--non-print ignore the presence of non-printable characters." \
    "By default, utt2spk is expected to be sorted by both, which can be " \
    "achieved by making the speaker-id prefixes of the utterance-ids" \
    "e.g.: $0 data/train"
}

# Parse the command line: the known flags switch the matching variable to
# "true"; the single remaining argument is stored in $data.
while [ $# -gt 0 ]; do
  case "$1" in
    --no-feats)    no_feats=true ;;
    --no-text)     no_text=true ;;
    --non-print)   non_print=true ;;
    --no-wav)      no_wav=true ;;
    --no-spk-sort) no_spk_sort=true ;;
    *)
      # Anything else is the data directory; a second one is a usage error.
      if [ -n "$data" ]; then
        show_help
        exit 1
      fi
      data=$1
      ;;
  esac
  shift
done


# "$data" is quoted so that a missing <data-dir> argument (empty string) is
# reported here; unquoted, the test collapses to `[ ! -d ]`, which is false.
if [ ! -d "$data" ]; then
  echo "$0: no such directory $data"
  exit 1;
fi

# Image data directories are validated by a separate script, which is given
# the original arguments of this one.
if [ -f $data/images.scp ]; then
  cmd=${cmd/--no-wav/}  # remove --no-wav if supplied
  image/validate_data_dir.sh $cmd
  exit $?
fi

# spk2utt and utt2spk are mandatory and must not be empty.
for f in spk2utt utt2spk; do
  if [ ! -f $data/$f ]; then
    echo "$0: no such file $f"
    exit 1;
  fi
  if [ ! -s $data/$f ]; then
    echo "$0: empty file $f"
    exit 1;
  fi
done

# Every utt2spk line must have exactly two fields.  Exit with status 1: a
# bare `exit` here would return the status of the preceding echo, i.e. 0,
# and a failed validation would look like a success to the caller.
! cat $data/utt2spk | awk '{if (NF != 2) exit(1); }' && \
  echo "$0: $data/utt2spk has wrong format." && exit 1;

ns=$(wc -l < $data/spk2utt)
if [ "$ns" == 1 ]; then
  echo "$0: WARNING: you have only one speaker.  This probably a bad idea."
  echo "   Search for the word 'bold' in http://kaldi-asr.org/doc/data_prep.html"
  echo "   for more information."
fi


# Scratch directory for the id lists compared below; removed again when the
# script exits or is interrupted.
tmpdir=$(mktemp -d /tmp/kaldi.XXXX) || exit 1;
trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM

# All sort-order checks below use the C locale.
export LC_ALL=C

function check_sorted_and_uniq {
  ! perl -ne '((substr $_,-1) eq "\n") or die "file $ARGV has invalid newline";' $1 && exit 1;
  ! awk '{print $1}' < $1 | sort -uC && echo "$0: file $1 is not sorted or has duplicates" && exit 1;
}

# Print a shortened unified diff of files $1 and $2 (its first six and last
# six lines with "..." in between), followed by the line count of both files.
# The counts are left in the global variables n1 and n2.
function partial_diff {
  diff -U1 $1 $2 | { head -n 6; echo "..."; tail -n 6; }
  n1=$(wc -l < $1)
  n2=$(wc -l < $2)
  echo "[Lengths are $1=$n1 versus $2=$n2]"
}

# utt2spk must be sorted and duplicate-free on its first column.
check_sorted_and_uniq $data/utt2spk

# Unless --no-spk-sort was given, it must also be in order on the speaker-id.
if ! $no_spk_sort && ! sort -k2 -C $data/utt2spk; then
  echo "$0: utt2spk is not in sorted order when sorted first on speaker-id "
  echo "(fix this by making speaker-ids prefixes of utt-ids)"
  exit 1
fi

check_sorted_and_uniq $data/spk2utt

# The output of utils/spk2utt_to_utt2spk.pl on spk2utt has to be identical to
# the first two columns of utt2spk.
if ! cmp -s <(awk '{print $1, $2;}' < $data/utt2spk) \
            <(utils/spk2utt_to_utt2spk.pl $data/spk2utt); then
  echo "$0: spk2utt and utt2spk do not seem to match"
  exit 1
fi

# List of utterance-ids that the later checks compare against.
awk '{print $1;}' < $data/utt2spk > $tmpdir/utts

# The text file is mandatory unless --no-text was given.
if ! $no_text && [ ! -f $data/text ]; then
  echo "$0: no such file $data/text (if this is by design, specify --no-text)"
  exit 1
fi

num_utts=$(wc -l < $tmpdir/utts)
if ! $no_text; then
  if ! $non_print; then
    # grep -c returns success only if at least one line matched, so the
    # branch is taken exactly when non-printable characters are present.
    if n_non_print=$(LC_ALL="C.UTF-8" grep -c '[^[:print:][:space:]]' $data/text); then
      echo "$0: text contains $n_non_print lines with non-printable characters"
      exit 1
    fi
  fi
  utils/validate_text.pl $data/text || exit 1;
  check_sorted_and_uniq $data/text
  text_len=$(wc -l < $data/text)

  # These symbols must not occur as words in the transcripts.
  illegal_sym_list="<s> </s> #0"
  for x in $illegal_sym_list; do
    if grep -w "$x" $data/text > /dev/null; then
      echo "$0: Error: in $data, text contains illegal symbol $x"
      exit 1
    fi
  done

  # The utterance-ids of text must be the same list as those of utt2spk.
  awk '{print $1}' < $data/text > $tmpdir/utts.txt
  if ! cmp -s $tmpdir/utts $tmpdir/utts.txt; then
    echo "$0: Error: in $data, utterance lists extracted from utt2spk and text"
    echo "$0: differ, partial diff is:"
    partial_diff $tmpdir/utts $tmpdir/utts.txt
    exit 1
  fi
fi

if [ -f $data/segments ] && [ ! -f $data/wav.scp ]; then
  echo "$0: in directory $data, segments file exists but no wav.scp"
  exit 1;
fi


if [ ! -f $data/wav.scp ] && ! $no_wav; then
  echo "$0: no such file $data/wav.scp (if this is by design, specify --no-wav)"
  exit 1;
fi

# Validate $data/reco2file_and_channel (this file is needed only for ctm
# scoring).
#   $1: file with the reference id list, one id per line
#   $2: what the ids are, used in the error message (e.g. "recording-ids")
#   $3: the file the reference ids were extracted from, for the error message
# Checks the sort order, the line format "<id> <file> <A|B>" (a channel of
# "1" is tolerated with a warning) and that the ids are identical to the
# reference list.  Exits the script with status 1 if any check fails.
function check_reco2file_and_channel {
  local ref_ids=$1 id_kind=$2 ref_source=$3
  check_sorted_and_uniq $data/reco2file_and_channel
  ! cat $data/reco2file_and_channel | \
    awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) {
            if ( NF == 3 && $3 == "1" ) {
              warning_issued = 1;
            } else {
              print "Bad line ", $0; exit 1;
            }
          }
        }
        END {
          if (warning_issued == 1) {
            print "The channel should be marked as A or B, not 1! You should change it ASAP! "
          }
        }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1;
  cat $data/reco2file_and_channel | awk '{print $1}' > $ref_ids.r2fc
  if ! cmp -s $ref_ids $ref_ids.r2fc; then
    echo "$0: Error: in $data, $id_kind extracted from $ref_source and reco2file_and_channel"
    echo "$0: differ, partial diff is:"
    partial_diff $ref_ids $ref_ids.r2fc
    exit 1;
  fi
}

if [ -f $data/wav.scp ]; then
  check_sorted_and_uniq $data/wav.scp

  if grep -E -q '^\S+\s+~' $data/wav.scp; then
    # note: it's not a good idea to have any kind of tilde in wav.scp, even if
    # part of a command, as it would cause compatibility problems if run by
    # other users, but this used to be not checked for so we let it slide unless
    # it's something of the form "foo ~/foo.wav" (i.e. a plain file name) which
    # would definitely cause problems as the fopen system call does not do
    # tilde expansion.
    echo "$0: Please do not use tilde (~) in your wav.scp."
    exit 1;
  fi

  if [ -f $data/segments ]; then

    check_sorted_and_uniq $data/segments
    # We have a segments file -> interpret wav file as "recording-ids" not utterance-ids.
    # Each line needs 4 fields, with field 4 (end) greater than field 3 (start).
    ! cat $data/segments | \
      awk '{if (NF != 4 || $4 <= $3) { print "Bad line in segments file", $0; exit(1); }}' && \
      echo "$0: badly formatted segments file" && exit 1;

    segments_len=`cat $data/segments | wc -l`
    # NOTE(review): this utt2spk-vs-segments comparison only runs when a text
    # file exists; it does not read text itself -- confirm this is intended.
    if [ -f $data/text ]; then
      ! cmp -s $tmpdir/utts <(awk '{print $1}' <$data/segments) && \
        echo "$0: Utterance list differs between $data/utt2spk and $data/segments " && \
        echo "$0: Lengths are $segments_len vs $num_utts" && \
        exit 1
    fi

    # The recording-ids used in segments must be exactly those of wav.scp.
    cat $data/segments | awk '{print $2}' | sort | uniq > $tmpdir/recordings
    awk '{print $1}' $data/wav.scp > $tmpdir/recordings.wav
    if ! cmp -s $tmpdir/recordings{,.wav}; then
      echo "$0: Error: in $data, recording-ids extracted from segments and wav.scp"
      echo "$0: differ, partial diff is:"
      partial_diff $tmpdir/recordings{,.wav}
      exit 1;
    fi
    if [ -f $data/reco2file_and_channel ]; then
      # Here the file is indexed by recording-id.
      check_reco2file_and_channel $tmpdir/recordings recording-ids segments
    fi
  else
    # No segments file -> assume wav.scp indexed by utterance.
    cat $data/wav.scp | awk '{print $1}' > $tmpdir/utts.wav
    if ! cmp -s $tmpdir/utts{,.wav}; then
      echo "$0: Error: in $data, utterance lists extracted from utt2spk and wav.scp"
      echo "$0: differ, partial diff is:"
      partial_diff $tmpdir/utts{,.wav}
      exit 1;
    fi

    if [ -f $data/reco2file_and_channel ]; then
      # Without segments the ids are compared with the utterance list, which
      # comes from utt2spk (the message used to say "segments" here).
      check_reco2file_and_channel $tmpdir/utts utterance-ids utt2spk
    fi
  fi
fi

# feats.scp is mandatory unless --no-feats was given.
if ! $no_feats && [ ! -f $data/feats.scp ]; then
  echo "$0: no such file $data/feats.scp (if this is by design, specify --no-feats)"
  exit 1
fi

# If present, feats.scp must list exactly the utterance-ids of utt2spk.
if [ -f $data/feats.scp ]; then
  check_sorted_and_uniq $data/feats.scp
  awk '{print $1}' < $data/feats.scp > $tmpdir/utts.feats
  if ! cmp -s $tmpdir/utts $tmpdir/utts.feats; then
    echo "$0: Error: in $data, utterance-ids extracted from utt2spk and features"
    echo "$0: differ, partial diff is:"
    partial_diff $tmpdir/utts $tmpdir/utts.feats
    exit 1
  fi
fi


# If present, cmvn.scp must list exactly the speaker-ids of spk2utt.
if [ -f $data/cmvn.scp ]; then
  check_sorted_and_uniq $data/cmvn.scp
  awk '{print $1}' < $data/cmvn.scp > $tmpdir/speakers.cmvn
  awk '{print $1}' < $data/spk2utt > $tmpdir/speakers
  if ! cmp -s $tmpdir/speakers $tmpdir/speakers.cmvn; then
    echo "$0: Error: in $data, speaker lists extracted from spk2utt and cmvn"
    echo "$0: differ, partial diff is:"
    partial_diff $tmpdir/speakers $tmpdir/speakers.cmvn
    exit 1
  fi
fi

# spk2gender (optional): "<speaker> m|f" lines covering exactly the speakers
# of spk2utt.
if [ -f $data/spk2gender ]; then
  check_sorted_and_uniq $data/spk2gender
  if ! awk '{if (!((NF == 2 && ($2 == "m" || $2 == "f")))) exit 1; }' < $data/spk2gender; then
    echo "$0: Mal-formed spk2gender file"
    exit 1
  fi
  awk '{print $1}' < $data/spk2gender > $tmpdir/speakers.spk2gender
  awk '{print $1}' < $data/spk2utt > $tmpdir/speakers
  if ! cmp -s $tmpdir/speakers $tmpdir/speakers.spk2gender; then
    echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2gender"
    echo "$0: differ, partial diff is:"
    partial_diff $tmpdir/speakers $tmpdir/speakers.spk2gender
    exit 1
  fi
fi

# spk2warp (optional): "<speaker> <factor>" lines with 0.5 < factor < 1.5,
# covering exactly the speakers of spk2utt.  awk prints the offending line.
if [ -f $data/spk2warp ]; then
  check_sorted_and_uniq $data/spk2warp
  if ! awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' < $data/spk2warp; then
    echo "$0: Mal-formed spk2warp file"
    exit 1
  fi
  awk '{print $1}' < $data/spk2warp > $tmpdir/speakers.spk2warp
  awk '{print $1}' < $data/spk2utt > $tmpdir/speakers
  if ! cmp -s $tmpdir/speakers $tmpdir/speakers.spk2warp; then
    echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2warp"
    echo "$0: differ, partial diff is:"
    partial_diff $tmpdir/speakers $tmpdir/speakers.spk2warp
    exit 1
  fi
fi

# utt2warp (optional): same format as spk2warp, but keyed by utterance-id and
# compared with the utterance list of utt2spk.
if [ -f $data/utt2warp ]; then
  check_sorted_and_uniq $data/utt2warp
  if ! awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' < $data/utt2warp; then
    echo "$0: Mal-formed utt2warp file"
    exit 1
  fi
  awk '{print $1}' < $data/utt2warp > $tmpdir/utts.utt2warp
  awk '{print $1}' < $data/utt2spk > $tmpdir/utts
  if ! cmp -s $tmpdir/utts $tmpdir/utts.utt2warp; then
    echo "$0: Error: in $data, utterance lists extracted from utt2spk and utt2warp"
    echo "$0: differ, partial diff is:"
    partial_diff $tmpdir/utts $tmpdir/utts.utt2warp
    exit 1
  fi
fi

# Files that may be absent, but if present must be sorted and list exactly
# the utterance-ids of utt2spk.
for f in vad.scp utt2lang utt2uniq; do
  [ -f $data/$f ] || continue
  check_sorted_and_uniq $data/$f
  if ! cmp -s <( awk '{print $1}' $data/utt2spk ) <( awk '{print $1}' $data/$f ); then
    echo "$0: error: in $data, $f and utt2spk do not have identical utterance-id list"
    exit 1
  fi
done


# utt2dur (optional): "<utterance> <duration>" with a duration above zero.
if [ -f $data/utt2dur ]; then
  check_sorted_and_uniq $data/utt2dur
  awk '{print $1}' < $data/utt2dur > $tmpdir/utts.utt2dur
  if ! cmp -s $tmpdir/utts $tmpdir/utts.utt2dur; then
    echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2dur file"
    echo "$0: differ, partial diff is:"
    partial_diff $tmpdir/utts $tmpdir/utts.utt2dur
    exit 1
  fi
  awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line utt2dur:" NR ":" $0; exit(1) }}' \
    < $data/utt2dur || exit 1
fi

# utt2num_frames (optional): "<utterance> <count>" where the count is an
# integer above zero.
if [ -f $data/utt2num_frames ]; then
  check_sorted_and_uniq $data/utt2num_frames
  awk '{print $1}' < $data/utt2num_frames > $tmpdir/utts.utt2num_frames
  if ! cmp -s $tmpdir/utts $tmpdir/utts.utt2num_frames; then
    echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2num_frames file"
    echo "$0: differ, partial diff is:"
    partial_diff $tmpdir/utts $tmpdir/utts.utt2num_frames
    exit 1
  fi
  awk <$data/utt2num_frames '{
    if (NF != 2 || !($2 > 0) || $2 != int($2)) {
      print "Bad line utt2num_frames:" NR ":" $0
      exit 1 } }' || exit 1
fi

# reco2dur is optional. Its recording-ids are compared against
# $tmpdir/recordings when that file exists, otherwise against $tmpdir/utts.
# Every line must be "<recording> <duration>" with a positive duration.
if [ -f $data/reco2dur ]; then
  check_sorted_and_uniq $data/reco2dur
  awk '{print $1}' < $data/reco2dur > $tmpdir/recordings.reco2dur
  if [ -f $tmpdir/recordings ] && \
     ! cmp -s $tmpdir/recordings $tmpdir/recordings.reco2dur; then
    echo "$0: Error: in $data, recording-ids extracted from segments and reco2dur file"
    echo "$0: differ, partial diff is:"
    partial_diff $tmpdir/recordings $tmpdir/recordings.reco2dur
    exit 1;
  elif [ ! -f $tmpdir/recordings ] && \
       ! cmp -s $tmpdir/utts $tmpdir/recordings.reco2dur; then
    echo "$0: Error: in $data, recording-ids extracted from wav.scp and reco2dur file"
    echo "$0: differ, partial diff is:"
    partial_diff $tmpdir/utts $tmpdir/recordings.reco2dur
    exit 1;
  fi
  awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' \
    < $data/reco2dur || exit 1
fi


# Reached only when none of the checks above called exit; report success.
echo "$0: Successfully validated data-directory $data"


================================================
FILE: egs/utils/validate_dict_dir.pl
================================================
#!/usr/bin/env perl

# Apache 2.0.
# Copyright  2012 Guoguo Chen
#            2015 Daniel Povey
#            2017 Johns Hopkins University (Jan "Yenda" Trmal <jtrmal@gmail.com>)
#
# Validation script for 'dict' directories (e.g. data/local/dict)

# this function reads the opened file (supplied as a first
# parameter) into an array of lines. For each
# line, it tests whether it's a valid utf-8 compatible
# line. If all lines are valid utf-8, it returns the lines
# decoded as utf-8, otherwise it assumes the file's encoding
# is one of those 1-byte encodings, such as ISO-8859-x
# or Windows CP-X.
# Please recall that we do not really care about
# the actual encoding; we just need to
# make sure the length of the (decoded) string
# is correct (so that the output formatting looks right).
# Reads every remaining line of the filehandle given as the first argument.
# Returns (1, @decoded_lines) when all lines decode as strict UTF-8, and
# (0, @raw_lines) otherwise.
sub get_utf8_or_bytestream {
  use Encode qw(decode encode);
  my $is_utf_compatible = 1;
  my @unicode_lines;
  my @raw_lines;
  my $raw_text;
  my $file = shift;

  while (<$file>) {
    $raw_text = $_;
    last unless $raw_text;
    if ($is_utf_compatible) {
      # LEAVE_SRC is required: with a CHECK value that lacks it, decode()
      # consumes $raw_text in place, and the raw line pushed below would
      # end up empty.
      my $decoded_text = eval {
        decode("UTF-8", $raw_text, Encode::FB_CROAK | Encode::LEAVE_SRC)
      };
      $is_utf_compatible = $is_utf_compatible && defined($decoded_text);
      push @unicode_lines, $decoded_text;
    }
    push @raw_lines, $raw_text;
  }

  if (!$is_utf_compatible) {
    return (0, @raw_lines);
  } else {
    return (1, @unicode_lines);
  }
}

# Scans the decoded lines (passed as an array reference) for whitespace
# problems.  Returns 1 and writes a message to STDERR at the first line that
# does not end in LF, contains a CR, or contains any whitespace character
# other than TAB, LF and SPACE; returns 0 when every line is clean.
sub validate_utf8_whitespaces {
  my $unicode_lines = shift;
  use feature 'unicode_strings';
  foreach my $i (0 .. $#{$unicode_lines}) {
    my $text = $unicode_lines->[$i];
    unless ($text =~ /\n\z/) {
      print STDERR "$0: The current line (nr. $i) has invalid newline\n";
      return 1;
    }
    # The first whitespace-separated field is used as the id in messages.
    my ($utt_id) = split(" ", $text);
    if (index($text, "\r") >= 0) {
      print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n";
      return 1;
    }
    # Mask the allowed whitespace (TAB, LF, SPACE) on a copy; anything that
    # still matches \s afterwards is a disallowed whitespace character.
    (my $masked = $text) =~ tr/\t\n /./;
    if ($masked =~ /\s/) {
      print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n";
      return 1;
    }
  }
  return 0;
}

# checks if the text in the file (supplied as the argument) is utf-8 compatible
# if yes, checks if it contains only allowed whitespaces. If no, then does not
# do anything. The function seeks to the original position in the file after
# reading the text.
# Takes an open filehandle, reads it to the end via get_utf8_or_bytestream
# and seeks back to the position it had on entry.  If the text is UTF-8
# compatible, it is passed to validate_utf8_whitespaces.  Returns 0 when
# disallowed whitespace was found, 1 otherwise (including the non-UTF-8 case,
# where no check is performed).
sub check_allowed_whitespace {
  # Import SEEK_SET explicitly; without it the bareword only worked because
  # the string "SEEK_SET" happens to numify to 0.
  use Fcntl qw(SEEK_SET);
  my $file = shift;
  my $pos = tell($file);
  my ($is_utf, @lines) = get_utf8_or_bytestream($file);
  seek($file, $pos, SEEK_SET);
  if ($is_utf) {
    my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines);
    print "--> text seems to be UTF-8 or ASCII, checking whitespaces\n";
    if ($has_invalid_whitespaces) {
      print "--> ERROR: the text containes disallowed UTF-8 whitespace character(s)\n";
      return 0;
    } else {
      print "--> text contains only allowed whitespaces\n";
    }
  } else {
    print "--> text doesn't seem to be UTF-8 or ASCII, won't check whitespaces\n";
  }
  return 1;
}


# Exactly one command-line argument is expected: the dict directory.
if(@ARGV != 1) {
  die "Usage: validate_dict_dir.pl <dict-dir>\n" .
      "e.g.: validate_dict_dir.pl data/local/dict\n";
}

$dict = shift @ARGV;
$dict =~ s:/$::;  # strip one trailing slash, if any.

$exit = 0;     # becomes 1 as soon as any check fails; decides the exit status.
$success = 1;  # this is re-set each time we read a file.

# Records a failure: marks both the whole run and the current file as failed.
sub set_to_fail { $exit = 1; $success = 0; }

# Checking silence_phones.txt -------------------------------
# Reads $dict/silence_phones.txt and fills %silence with every phone seen.
# Each line may hold several whitespace-separated phones.
print "Checking $dict/silence_phones.txt ...\n";
if(-z "$dict/silence_phones.txt") {print "--> ERROR: $dict/silence_phones.txt is empty or not exists\n"; exit 1;}
if(!open(S, "<$dict/silence_phones.txt")) {print "--> ERROR: fail to open $dict/silence_phones.txt\n"; exit 1;}
$idx = 1;       # current line number, used in error messages.
%silence = ();  # set of silence phones.
$crlf = 1;      # 1 until the carriage-return error has been reported once.

print "--> reading $dict/silence_phones.txt\n";
check_allowed_whitespace(\*S) || set_to_fail();
while(<S>) {
  # Strip the newline; a line without one can only be an unterminated last line.
  if (! s/\n$//) {
    print "--> ERROR: last line '$_' of $dict/silence_phones.txt does not end in newline.\n";
    set_to_fail();
  }
  if ($crlf == 1 && m/\r/) {
    print "--> ERROR: $dict/silence_phones.txt contains Carriage Return (^M) characters.\n";
    set_to_fail();
    $crlf = 0;
  }
  my @col = split(" ", $_);
  if (@col == 0) {
    set_to_fail();
    print "--> ERROR: empty line in $dict/silence_phones.txt (line $idx)\n";
  }
  foreach(0 .. @col-1) {
    my $p = $col[$_];
    if($silence{$p}) {
      set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n";
    } else {
      $silence{$p} = 1;
    }
    # phones that start with the pound sign/hash may be mistaken for
    # disambiguation symbols; phones ending in _B, _E, _S or _I will cause
    # problems with word-position-dependent systems, and <eps> is obviously
    # confusable with epsilon.
    if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq "<eps>"){
      set_to_fail();
      print "--> ERROR: phone \"$p\" has disallowed written form\n";
    }
  }
  $idx ++;
}
close(S);
$success == 0 || print "--> $dict/silence_phones.txt is OK\n";
print "\n";

# Checking optional_silence.txt -------------------------------
# The file must consist of a single line holding a single phone, and that
# phone must already be present in %silence (filled from silence_phones.txt).
print "Checking $dict/optional_silence.txt ...\n";
if(-z "$dict/optional_silence.txt") {print "--> ERROR: $dict/optional_silence.txt is empty or not exists\n"; exit 1;}
if(!open(OS, "<$dict/optional_silence.txt")) {print "--> ERROR: fail to open $dict/optional_silence.txt\n"; exit 1;}
$idx = 1;      # current line number.
$success = 1;  # per-file success flag, cleared by set_to_fail().
$crlf = 1;     # 1 until the carriage-return error has been reported once.
print "--> reading $dict/optional_silence.txt\n";
# Unlike the other files, a whitespace problem here aborts immediately.
check_allowed_whitespace(\*OS) or exit 1;
while(<OS>) {
  chomp;
  my @col = split(" ", $_);
  # Any second line, or a second column on the first line, is an error.
  if ($idx > 1 or @col > 1) {
    set_to_fail(); print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n";
  } elsif (!$silence{$col[0]}) {
    set_to_fail(); print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n";
  }
  if ($crlf == 1 && m/\r/) {
    print "--> ERROR: $dict/optional_silence.txt contains Carriage Return (^M) characters.\n";
    set_to_fail();
    $crlf = 0;
  }
  $idx ++;
}
close(OS);
$success == 0 || print "--> $dict/optional_silence.txt is OK\n";
print "\n";

# Checking nonsilence_phones.txt -------------------------------
# Reads $dict/nonsilence_phones.txt and fills %nonsilence with every phone
# seen.  Each line may hold several whitespace-separated phones.
print "Checking $dict/nonsilence_phones.txt ...\n";
if(-z "$dict/nonsilence_phones.txt") {print "--> ERROR: $dict/nonsilence_phones.txt is empty or not exists\n"; exit 1;}
if(!open(NS, "<$dict/nonsilence_phones.txt")) {print "--> ERROR: fail to open $dict/nonsilence_phones.txt\n"; exit 1;}
$idx = 1;          # current line number, used in error messages.
%nonsilence = ();  # set of non-silence phones.
$success = 1;      # per-file success flag, cleared by set_to_fail().
$crlf = 1;         # 1 until the carriage-return error has been reported once.
print "--> reading $dict/nonsilence_phones.txt\n";
check_allowed_whitespace(\*NS) or set_to_fail();
while(<NS>) {
  if ($crlf == 1 && m/\r/) {
    print "--> ERROR: $dict/nonsilence_phones.txt contains Carriage Return (^M) characters.\n";
    set_to_fail();
    $crlf = 0;
  }
  # Strip the newline; a line without one can only be an unterminated last line.
  if (! s/\n$//) {
    print "--> ERROR: last line '$_' of $dict/nonsilence_phones.txt does not end in newline.\n";
    set_to_fail();
  }
  my @col = split(" ", $_);
  if (@col == 0) {
    set_to_fail();
    print "--> ERROR: empty line in $dict/nonsilence_phones.txt (line $idx)\n";
  }
  foreach(0 .. @col-1) {
    my $p = $col[$_];
    if($nonsilence{$p}) {
      set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n";
    } else {
      $nonsilence{$p} = 1;
    }
    # phones that start with the pound sign/hash may be mistaken for
    # disambiguation symbols; phones ending in _B, _E, _S or _I will cause
    # problems with word-position-dependent systems, and <eps> is obviously
    # confusable with epsilon.
    if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq "<eps>"){
      set_to_fail();
      print "--> ERROR: phone \"$p\" has disallowed written form\n";
    }
  }
  $idx ++;
}
close(NS);
$success == 0 || print "--> $dict/nonsilence_phones.txt is OK\n";
print "\n";

# Checking disjoint -------------------------------
# Returns the keys of the first hash (given by reference) that also exist in
# the second one.  The result is also left in the globals @itset / %itset.
sub intersect {
  my ($first, $second) = @_;
  @itset = ();
  %itset = ();
  for my $key (keys %$first) {
    next unless exists $second->{$key};
    next if $itset{$key};
    push @itset, $key;
    $itset{$key} = 1;
  }
  return @itset;
}

# %silence and %nonsilence must not share any phone.
print "Checking disjoint: silence_phones.txt, nonsilence_phones.txt\n";
@itset = intersect(\%silence, \%nonsilence);
if (@itset == 0) {
  print "--> disjoint property is OK.\n";
} else {
  set_to_fail();
  print "--> ERROR: silence_phones.txt and nonsilence_phones.txt has overlap: ";
  print "$_ " foreach @itset;
  print "\n";
}
print "\n";


# Validates one lexicon file.
#   $lex              - path of the lexicon file.
#   $num_prob_cols    - number of probability columns following the word;
#                       each must lie in (0, 1].
#   $num_skipped_cols - number of further columns that are skipped unchecked.
# The remaining columns are the pronunciation; every phone must be present in
# %silence or %nonsilence.  Failures are recorded through set_to_fail().
sub check_lexicon {
  my ($lex, $num_prob_cols, $num_skipped_cols) = @_;
  print "Checking $lex\n";
  # Note: "!open(...) && print MSG && set_to_fail()" parses as
  # print(MSG && set_to_fail()) and prints "0", so use an explicit block; and
  # stop here, otherwise $success is reset below and "is OK" gets printed.
  if (!open(L, "<$lex")) {
    print "--> ERROR: fail to open $lex\n";
    set_to_fail();
    print "\n";
    return;
  }
  my %seen_line = ();  # lines seen so far, to detect exact repetitions.
  $idx = 1; $success = 1; $crlf = 1;
  print "--> reading $lex\n";
  check_allowed_whitespace(\*L) or set_to_fail();
  while (<L>) {
    if ($crlf == 1 && m/\r/) {
      print "--> ERROR: $lex contains Carriage Return (^M) characters.\n";
      set_to_fail();
      $crlf = 0;
    }
    if (defined $seen_line{$_}) {
      print "--> ERROR: line '$_' of $lex is repeated\n";
      set_to_fail();
    }
    $seen_line{$_} = 1;
    if (! s/\n$//) {
      print "--> ERROR: last line '$_' of $lex does not end in newline.\n";
      set_to_fail();
    }
    my @col = split(" ", $_);
    $word = shift @col;
    if (!defined $word) {
      print "--> ERROR: empty lexicon line in $lex\n"; set_to_fail();
    }
    if ($word eq "<s>" || $word eq "</s>" || $word eq "<eps>" || $word eq "#0") {
      print "--> ERROR: lexicon.txt contains forbidden word $word\n";
      set_to_fail();
    }
    for ($n = 0; $n < $num_prob_cols; $n++) {
      $prob = shift @col;
      if (!($prob > 0.0 && $prob <= 1.0)) {
        print "--> ERROR: bad pron-prob in lexicon-line '$_', in $lex\n";
        set_to_fail();
      }
    }
    for ($n = 0; $n < $num_skipped_cols; $n++) { shift @col; }
    if (@col == 0) {
      print "--> ERROR: lexicon.txt contains word $word with empty ";
      print "pronunciation.\n";
      set_to_fail();
    }
    foreach (0 .. @col-1) {
      if (!$silence{$col[$_]} and !$nonsilence{$col[$_]}) {
        print "--> ERROR: phone \"$col[$_]\" is not in {, non}silence.txt ";
        print "(line $idx)\n";
        set_to_fail();
      }
    }
    $idx ++;
  }
  close(L);
  $success == 0 || print "--> $lex is OK\n";
  print "\n";
}

# Validate each lexicon variant that is present.  The numeric arguments are
# the number of probability columns and the number of skipped columns.
if (-f "$dict/lexicon.txt") { check_lexicon("$dict/lexicon.txt", 0, 0); }
if (-f "$dict/lexiconp.txt") { check_lexicon("$dict/lexiconp.txt", 1, 0); }
if (-f "$dict/lexiconp_silprob.txt") {
  # If $dict/lexiconp_silprob.txt exists, we expect $dict/silprob.txt to also
  # exist.
  check_lexicon("$dict/lexiconp_silprob.txt", 2, 2);
  if (-f "$dict/silprob.txt") {
    # Note: "print MSG && set_to_fail()" would print "0" instead of MSG
    # (print is a list operator), hence the explicit blocks below.
    if (!open(SP, "<$dict/silprob.txt")) {
      print "--> ERROR: fail to open $dict/silprob.txt\n";
      set_to_fail();
    } else {
      $crlf = 1;
      while (<SP>) {
        if ($crlf == 1 && m/\r/) {
          print "--> ERROR: $dict/silprob.txt contains Carriage Return (^M) characters.\n";
          set_to_fail();
          $crlf = 0;
        }
        chomp; my @col = split;
        if (@col != 2) {
          set_to_fail();
          die "--> ERROR: bad line \"$_\"\n";
        }
        if ($col[0] eq "<s>" || $col[0] eq "overall") {
          # These two entries are probabilities and must lie in (0, 1].
          if (!($col[1] > 0.0 && $col[1] <= 1.0)) {
            set_to_fail();
            print "--> ERROR: bad probability in $dict/silprob.txt \"$_\"\n";
          }
        } elsif ($col[0] eq "</s>_s" || $col[0] eq "</s>_n") {
          # These two entries are correction terms and must be positive.
          if ($col[1] <= 0.0) {
            set_to_fail();
            print "--> ERROR: bad correction term in $dict/silprob.txt \"$_\"\n";
          }
        } else {
          print "--> ERROR: unexpected line in $dict/silprob.txt \"$_\"\n";
          set_to_fail();
        }
      }
      close(SP);
    }
  } else {
    set_to_fail();
    print "--> ERROR: expecting $dict/silprob.txt to exist\n";
  }
}

if (!(-f "$dict/lexicon.txt" || -f "$dict/lexiconp.txt")) {
  print "--> ERROR: neither lexicon.txt or lexiconp.txt exist in directory $dict\n";
  set_to_fail();
}

# Checks that two lexicon files list the same words with the same
# pronunciations, line by line.  For each file the caller passes its path,
# its number of probability columns and its number of skipped columns; those
# columns are dropped before the pronunciations are compared.
# Failures are recorded through set_to_fail().
sub check_lexicon_pair {
  my ($lex1, $num_prob_cols1, $num_skipped_cols1,
      $lex2, $num_prob_cols2, $num_skipped_cols2) = @_;
  # We have checked individual lexicons already.
  print "Checking lexicon pair $lex1 and $lex2\n";
  # Start from a clean per-check flag so that a failure in an earlier file
  # does not suppress the "match" message of this pair.
  $success = 1;
  if (!open(L1, "<$lex1")) {
    print "--> ERROR: fail to open $lex1\n"; set_to_fail(); return;
  }
  if (!open(L2, "<$lex2")) {
    print "--> ERROR: fail to open $lex2\n"; set_to_fail(); close(L1); return;
  }
  my $line_num = 0;
  my $stopped_early = 0;  # set when the loop below already reported an error.
  while(<L1>) {
    $line_num++;
    @A = split;
    $line_B = <L2>;
    if (!defined $line_B) {
      print "--> ERROR: $lex1 and $lex2 have different number of lines.\n";
      set_to_fail(); $stopped_early = 1; last;
    }
    @B = split(" ", $line_B);
    # Check if the word matches.
    if ($A[0] ne $B[0]) {
      print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n";
      set_to_fail(); $stopped_early = 1; last;
    }
    shift @A; shift @B;
    for ($n = 0; $n < $num_prob_cols1 + $num_skipped_cols1; $n ++) { shift @A; }
    for ($n = 0; $n < $num_prob_cols2 + $num_skipped_cols2; $n ++) { shift @B; }
    # Check if the pronunciation matches
    if (join(" ", @A) ne join(" ", @B)) {
      print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n";
      set_to_fail(); $stopped_early = 1; last;
    }
  }
  # If $lex1 was read completely, $lex2 must be exhausted as well.  Use the
  # local flag rather than the global $exit, which may already be 1 because
  # of unrelated earlier errors.
  $line_B = <L2>;
  if (defined $line_B && !$stopped_early) {
    print "--> ERROR: $lex1 and $lex2 have different number of lines.\n";
    set_to_fail();
  }
  close(L1);
  close(L2);
  $success == 0 || print "--> lexicon pair $lex1 and $lex2 match\n\n";
}

# If more than one lexicon exist, we have to check if they correspond to each
# other. It could be that the user overwrote one and we need to regenerate the
# other, but we do not know which is which.
# The arguments of check_lexicon_pair are, for each of the two files: the
# path, the number of probability columns and the number of skipped columns.
if ( -f "$dict/lexicon.txt" && -f "$dict/lexiconp.txt") {
  check_lexicon_pair("$dict/lexicon.txt", 0, 0, "$dict/lexiconp.txt", 1, 0);
}
if ( -f "$dict/lexiconp.txt" && -f "$dict/lexiconp_silprob.txt") {
  check_lexicon_pair("$dict/lexiconp.txt", 1, 0,
                     "$dict/lexiconp_silprob.txt", 2, 2);
}

# Checking extra_questions.txt -------------------------------
%distinguished = (); # Keep track of all phone-pairs including nonsilence that
                     # are distinguished (split apart) by extra_questions.txt,
                     # as $distinguished{$p1,$p2} = 1.  This will be used to
                     # make sure that we don't have pairs of phones on the same
                     # line in nonsilence_phones.txt that can never be
                     # distinguished from each other by questions.  (If any two
                     # phones appear on the same line in nonsilence_phones.txt,
                     # they share a tree root, and since the automatic
                     # question-building treats all phones that appear on the
                     # same line of nonsilence_phones.txt as being in the same
                     # group, we can never distinguish them without resorting to
                     # questions in extra_questions.txt.)
print "Checking $dict/extra_questions.txt ...\n";
if (-s "$dict/extra_questions.txt") {
  if (!open(EX, "<$dict/extra_questions.txt")) {
    # Do not fall through: $success would be reset and "is OK" printed.
    set_to_fail(); print "--> ERROR: fail to open $dict/extra_questions.txt\n";
  } else {
    $idx = 1;      # current line number, used in error messages.
    $success = 1;
    $crlf = 1;
    print "--> reading $dict/extra_questions.txt\n";
    check_allowed_whitespace(\*EX) or set_to_fail();
    while(<EX>) {
      if ($crlf == 1 && m/\r/) {
        print "--> ERROR: $dict/extra_questions.txt contains Carriage Return (^M) characters.\n";
        set_to_fail();
        $crlf = 0;
      }
      if (! s/\n$//) {
        print "--> ERROR: last line '$_' of $dict/extra_questions.txt does not end in newline.\n";
        set_to_fail();
      }
      my @col = split(" ", $_);
      if (@col == 0) {
        set_to_fail();  print "--> ERROR: empty line in $dict/extra_questions.txt\n";
      }
      foreach (0 .. @col-1) {
        if(!$silence{$col[$_]} and !$nonsilence{$col[$_]}) {
          set_to_fail();  print "--> ERROR: phone \"$col[$_]\" is not in {, non}silence_phones.txt (line $idx, block ", $_+1, ")\n";
        }
      }
      # Advance the line counter once per line (not once per column), so
      # that the line numbers reported above are correct.
      $idx ++;
      %col_hash = ();
      foreach $p (@col) { $col_hash{$p} = 1; }
      foreach $p1 (@col) {
        # Update %distinguished hash.
        foreach $p2 (keys %nonsilence) {
          if (!defined $col_hash{$p2}) { # for each p1 in this question and p2 not
                                         # in this question (and in nonsilence
                                         # phones)... mark p1,p2 as being split apart
            $distinguished{$p1,$p2} = 1;
            $distinguished{$p2,$p1} = 1;
          }
        }
      }
    }
    close(EX);
    $success == 0 || print "--> $dict/extra_questions.txt is OK\n";
  }
} else { print "--> $dict/extra_questions.txt is empty (this is OK)\n";}

# nonterminals.txt is optional.  Every line must consist of exactly one
# symbol starting with "#nonterm:", and no symbol may occur twice.
if (-f "$dict/nonterminals.txt") {
  open(NT, "<$dict/nonterminals.txt") || die "opening $dict/nonterminals.txt";
  my %nonterminals = ();
  my $line_number = 1;
  while (<NT>) {
    # chomp, not chop: chop would eat the last character of the symbol when
    # the final line has no trailing newline.
    chomp;
    my @line = split(" ", $_);
    if (@line != 1 || ! m/^#nonterm:/ || defined $nonterminals{$line[0]}) {
      print "--> ERROR: bad (or duplicate) line $line_number: '$_' in $dict/nonterminals.txt\n"; exit 1;
    }
    $nonterminals{$line[0]} = 1;
    $line_number++;
  }
  close(NT);
  print "--> $dict/nonterminals.txt is OK\n";
}


# check nonsilence_phones.txt again for phone-pairs that are never
# distinguishable.  (note: this situation is normal and expected for silence
# phones, so we don't check it.)
if(!open(NS, "<$dict/nonsilence_phones.txt")) {
  print "--> ERROR: fail to open $dict/nonsilence_phones.txt the second time\n"; exit 1;
}

$num_warn_nosplit = 0;         # number of offending pairs found so far.
$num_warn_nosplit_limit = 10;  # stop printing per-pair messages beyond this.
while(<NS>) {
  my @col = split(" ", $_);
  # Every ordered pair of distinct phones on the same line must have been
  # marked in %distinguished while reading extra_questions.txt.
  foreach $p1 (@col) {
    foreach $p2 (@col) {
      if ($p1 ne $p2 && ! $distinguished{$p1,$p2}) {
        set_to_fail();
        # Because of "<=", messages are printed for counts 0..limit.
        if ($num_warn_nosplit <= $num_warn_nosplit_limit) {
          print "--> ERROR: phones $p1 and $p2 share a tree root but can never be distinguished by extra_questions.txt.\n";
        }
        if ($num_warn_nosplit == $num_warn_nosplit_limit) {
          print "... Not warning any more times about this issue.\n";
        }
        # The explanatory note is printed only for the first offending pair.
        if ($num_warn_nosplit == 0) {
          print "    (note: we started checking for this only recently.  You can still build a system but\n";
          print "     phones $p1 and $p2 will be acoustically indistinguishable).\n";
        }
        $num_warn_nosplit++;
      }
    }
  }
}


# Final verdict: $exit was set to 1 by set_to_fail() if any check failed.
if ($exit == 1) {
  print "--> ERROR validating dictionary directory $dict (see detailed error ";
  print "messages above)\n\n";
  exit 1;
} else {
  print "--> SUCCESS [validating dictionary directory $dict]\n\n";
}

exit 0;


================================================
FILE: egs/utils/validate_lang.pl
================================================
#!/usr/bin/env perl

# Apache 2.0.
# Copyright  2012   Guoguo Chen
#            2014   Neil Nelson
#            2017   Johns Hopkins University (Jan "Yenda" Trmal <jtrmal@gmail.com>)
#            2019   Dongji Gao
#
# Validation script for data/lang

# this function reads the opened file (supplied as a first
# parameter) into an array of lines. For each
# line, it tests whether it's a valid utf-8 compatible
# line. If all lines are valid utf-8, it returns the lines
# decoded as utf-8, otherwise it assumes the file's encoding
# is one of those 1-byte encodings, such as ISO-8859-x
# or Windows CP-X.
# Please recall that we do not really care about
# the actual encoding; we just need to
# make sure the length of the (decoded) string
# is correct (so that the output formatting looks right).
# Reads every remaining line of the filehandle given as the first argument.
# Returns (1, @decoded_lines) when all lines decode as strict UTF-8, and
# (0, @raw_lines) otherwise.
sub get_utf8_or_bytestream {
  use Encode qw(decode encode);
  my $is_utf_compatible = 1;
  my @unicode_lines;
  my @raw_lines;
  my $raw_text;
  my $file = shift;

  while (<$file>) {
    $raw_text = $_;
    last unless $raw_text;
    if ($is_utf_compatible) {
      # LEAVE_SRC is required: with a CHECK value that lacks it, decode()
      # consumes $raw_text in place, and the raw line pushed below would
      # end up empty.
      my $decoded_text = eval {
        decode("UTF-8", $raw_text, Encode::FB_CROAK | Encode::LEAVE_SRC)
      };
      $is_utf_compatible = $is_utf_compatible && defined($decoded_text);
      push @unicode_lines, $decoded_text;
    }
    push @raw_lines, $raw_text;
  }

  if (!$is_utf_compatible) {
    return (0, @raw_lines);
  } else {
    return (1, @unicode_lines);
  }
}

# Scans the decoded lines (passed as an array reference) for whitespace
# problems.  Returns 1 at the first line that does not end in LF, contains a
# CR, or contains any whitespace character other than TAB, LF and SPACE;
# returns 0 when every line is clean.  The first two cases also write a
# message to STDERR.
sub validate_utf8_whitespaces {
  my $unicode_lines = shift;
  use feature 'unicode_strings';
  foreach my $i (0 .. $#{$unicode_lines}) {
    my $text = $unicode_lines->[$i];
    unless ($text =~ /\n\z/) {
      print STDERR "$0: The current line (nr. $i) has invalid newline\n";
      return 1;
    }
    if (index($text, "\r") >= 0) {
      print STDERR "$0: The current line (nr. $i) contains CR (0x0D) character\n";
      return 1;
    }
    # Mask the allowed whitespace (TAB, LF, SPACE) on a copy; anything that
    # still matches \s afterwards is a disallowed whitespace character.
    (my $masked = $text) =~ tr/\t\n /./;
    return 1 if $masked =~ /\s/;
  }
  return 0;
}

# checks if the text in the file (supplied as the argument) is utf-8 compatible
# if yes, checks if it contains only allowed whitespaces. If no, then does not
# do anything. The function seeks to the original position in the file after
# reading the text.
# Takes an open filehandle, reads it to the end via get_utf8_or_bytestream
# and seeks back to the position it had on entry.  If the text is UTF-8
# compatible, it is passed to validate_utf8_whitespaces.  Returns 0 when
# disallowed whitespace was found, 1 otherwise (including the non-UTF-8 case,
# where no check is performed).
sub check_allowed_whitespace {
  # Import SEEK_SET explicitly; without it the bareword only worked because
  # the string "SEEK_SET" happens to numify to 0.
  use Fcntl qw(SEEK_SET);
  my $file = shift;
  my $pos = tell($file);
  my ($is_utf, @lines) = get_utf8_or_bytestream($file);
  seek($file, $pos, SEEK_SET);
  if ($is_utf) {
    my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines);
    print "--> text seems to be UTF-8 or ASCII, checking whitespaces\n";
    if ($has_invalid_whitespaces) {
      print "--> ERROR: the text containes disallowed UTF-8 whitespace character(s)\n";
      return 0;
    } else {
      print "--> text contains only allowed whitespaces\n";
    }
  } else {
    print "--> text doesn't seem to be UTF-8 or ASCII, won't check whitespaces\n";
  }
  return 1;
}

# Option defaults; the three skip flags are switched on by leading
# command-line options, $subword_check by the separator-file check.
($skip_det_check, $skip_disambig_check,
 $skip_generate_words_check, $subword_check) = (0, 0, 0, 0);

{
  # Recognized options, in the order in which each pass tests them.
  my @recognized = (
    ["--skip-determinization-check", \$skip_det_check],
    ["--skip-disambig-check",        \$skip_disambig_check],
    ["--skip-generate-words-check",  \$skip_generate_words_check],
  );
  # Four passes over the table, so the options may come in any order.
  for ($x = 0; $x <= 3; $x++) {
    foreach my $entry (@recognized) {
      my ($flag, $target) = @$entry;
      next unless @ARGV > 0 && $ARGV[0] eq $flag;
      $$target = 1;
      shift @ARGV;
    }
  }
}

# After option parsing exactly one argument must remain: the lang directory.
if (@ARGV != 1) {
  print "Usage: $0 [options] <lang_directory>\n";
  print "e.g.:  $0 data/lang\n";
  print "Options:\n";
  print " --skip-generate-words-check              (this flag causes it to skip a check of generated word sequences).\n";
  print " --skip-determinization-check             (this flag causes it to skip a time consuming check).\n";
  print " --skip-disambig-check                    (this flag causes it to skip a disambig check in phone bigram models).\n";
  exit(1);
}

# Echo the invocation (script name and remaining arguments).
print "$0 " . join(" ", @ARGV) . "\n";

$lang = shift @ARGV;
$exit = 0;     # set to 1 by any failed check.
$warning = 0;  # set to 1 by any check that only warns.

# Checking existence of separator file ------------------
# If $lang/subword_separator.txt exists it must hold exactly one line with
# exactly one token; that token becomes $separator and $subword_check is
# switched on.  Otherwise the directory is treated as a word-level one.
print "Checking existence of separator file\n";
if (!-e "$lang/subword_separator.txt") {
  print "separator file $lang/subword_separator.txt is empty or does not exist, deal in word case.\n";
} else {
  if (!open(S, "<$lang/subword_separator.txt")) {
    print "--> ERROR: fail to open $lang/subword_separator.txt\n"; exit 1;
  } else {
    # Line count as reported by wc -l, i.e. the number of newline characters.
    $line_num = `wc -l <$lang/subword_separator.txt`;
    if ($line_num != 1) {
      print "--> ERROR, $lang/subword_separator.txt should only contain one line.\n"; exit 1;
    } else {
      while (<S>) {
        chomp;
        my @col = split(" ", $_);
        if (@col != 1) {
          print "--> ERROR, invalid separator.\n"; exit 1;
        } else {
         $separator = shift @col;
         $separator_length = length $separator;
         $subword_check = 1;
        }
      }
    }
  }
}

# Base name of the word-boundary file that is checked further below.
if (!$subword_check) {
  $word_boundary = "word_boundary";
} else {
  $word_boundary = "word_boundary_moved";
}

# Checking phones.txt -------------------------------
# Reads the phone symbol table into %psymtab (symbol -> id) and builds the
# reverse map %pint2sym (id -> symbol), failing on any duplicated id.
print "Checking $lang/phones.txt ...\n";
if (-z "$lang/phones.txt") {
  print "--> ERROR: $lang/phones.txt is empty or does not exist\n"; exit 1;
}
if (!open(P, "<$lang/phones.txt")) {
  print "--> ERROR: fail to open $lang/phones.txt\n"; exit 1;
}
$idx = 1;  # current line number, used in error messages.
%psymtab = ();
check_allowed_whitespace(\*P) or exit 1;
while (<P>) {
  chomp;
  my @col = split(" ", $_);
  if (@col != 2) {
    print "--> ERROR: expect 2 columns in $lang/phones.txt (break at line $idx)\n"; exit 1;
  }
  my $phone = shift @col;
  my $id = shift @col;
  $psymtab{$phone} = $id;
  $idx ++;
}
close(P);
%pint2sym = ();
foreach (keys %psymtab) {
  # Test with exists, not truthiness: the stored value is the symbol itself,
  # and a symbol spelled "0" is false and would hide a duplicated id.
  if (exists $pint2sym{$psymtab{$_}}) {
    print "--> ERROR: ID \"$psymtab{$_}\" duplicates\n"; exit 1;
  } else {
    $pint2sym{$psymtab{$_}} = $_;
  }
}
print "--> $lang/phones.txt is OK\n";
print "\n";

# Check words.txt -------------------------------
# Reads the word symbol table into %wsymtab (word -> id) and builds the
# reverse map %wint2sym (id -> word), failing on any duplicated id.
print "Checking words.txt: #0 ...\n";
if (-z "$lang/words.txt") {
  print "--> ERROR: $lang/words.txt is empty or does not exist\n"; exit 1;
}
if (!open(W, "<$lang/words.txt")) {
  print "--> ERROR: fail to open $lang/words.txt\n"; exit 1;
}
$idx = 1;  # current line number, used in error messages.
%wsymtab = ();
check_allowed_whitespace(\*W) or exit 1;
while (<W>) {
  chomp;
  my @col = split(" ", $_);
  if (@col != 2) {
    print "--> ERROR: expect 2 columns in $lang/words.txt (line $idx)\n"; exit 1;
  }
  $word = shift @col;
  $id = shift @col;
  $wsymtab{$word} = $id;
  $idx ++;
}
close(W);
%wint2sym = ();
foreach (keys %wsymtab) {
  # Test with exists, not truthiness: the stored value is the word itself,
  # and a word spelled "0" is false and would hide a duplicated id.
  if (exists $wint2sym{$wsymtab{$_}}) {
    print "--> ERROR: ID \"$wsymtab{$_}\" duplicates\n"; exit 1;
  } else {
    $wint2sym{$wsymtab{$_}} = $_;
  }
}
print "--> $lang/words.txt is OK\n";
print "\n";

# Checking phones/* -------------------------------
# Checks that $cat.txt, $cat.int and $cat.csl describe the same symbol list.
#   $cat    - path prefix of the three files (without extension).
#   $symtab - reference to a hash mapping symbol -> integer id.
# .txt holds one symbol per line, .int the matching id per line, and .csl
# a single line with the same ids separated by ':'.
# On failure sets $exit = 1 and returns right after printing the message.
# NOTE(review): @entry, $idx1, $idx2 and $num_lines are package globals that
# are shared with check_txt_int; @entry is not cleared between calls.
sub check_txt_int_csl {
  my ($cat, $symtab) = @_;
  print "Checking $cat.\{txt, int, csl\} ...\n";
  if (!open(TXT, "<$cat.txt")) {
    $exit = 1; return print "--> ERROR: fail to open $cat.txt\n";
  }
  if (!open(INT, "<$cat.int")) {
    $exit = 1; return print "--> ERROR: fail to open $cat.int\n";
  }
  if (!open(CSL, "<$cat.csl")) {
    $exit = 1; return print "--> ERROR: fail to open $cat.csl\n";
  }
  if (-z "$cat.txt") {
    $warning = 1; print "--> WARNING: $cat.txt is empty\n";
  }
  if (-z "$cat.int") {
    $warning = 1; print "--> WARNING: $cat.int is empty\n";
  }
  if (-z "$cat.csl") {
    $warning = 1; print "--> WARNING: $cat.csl is empty\n";
  }

  # Read the symbols; $entry[k] is the symbol on line k (1-based).
  $idx1 = 1;
  check_allowed_whitespace(\*TXT) or $exit = 1;
  while (<TXT>) {
    chomp;
    my @col = split(" ", $_);
    if (@col != 1) {
      $exit = 1; return print "--> ERROR: expect 1 column in $cat.txt (break at line $idx1)\n";
    }
    $entry[$idx1] = shift @col;
    $idx1 ++;
  }
  close(TXT); $idx1 --;
  print "--> $idx1 entry/entries in $cat.txt\n";

  # Line k of .int must be the id of the symbol on line k of .txt.
  $idx2 = 1;
  while (<INT>) {
    chomp;
    my @col = split(" ", $_);
    if (@col != 1) {
      $exit = 1; return print "--> ERROR: expect 1 column in $cat.int (break at line $idx2)\n";
    }
    if ($symtab->{$entry[$idx2]} ne shift @col) {
      $exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line $idx2)\n";
    }
    $idx2 ++;
  }
  close(INT); $idx2 --;
  if ($idx1 != $idx2) {
    $exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line ", $idx2+1, ")\n";
  }
  print "--> $cat.int corresponds to $cat.txt\n";

  # Each .csl line must list the ids of all .txt symbols, in order.
  $num_lines = 0;
  while (<CSL>) {
    chomp;
    my @col = split(":", $_);
    $num_lines++;
    if (@col != $idx1) {
      $exit = 1; return print "--> ERROR: expect $idx1 block/blocks in $cat.csl (break at line $num_lines)\n";
    }
    foreach (1 .. $idx1) {
      if ($symtab->{$entry[$_]} ne $col[$_-1]) {
        $exit = 1; return print "--> ERROR: $cat.csl doesn't correspond to $cat.txt (break at line $num_lines, block $_)\n";
      }
    }
  }
  close(CSL);
  if ($idx1 != 0) {             # nonempty .txt,.int files
    if ($num_lines != 1) {
      $exit = 1;
      return print "--> ERROR: expect 1 line in $cat.csl\n";
    }
  } else {
    if ($num_lines != 1 && $num_lines != 0) {
      $exit = 1;
      return print "--> ERROR: expect 0 or 1 line in $cat.csl, since empty .txt,int\n";
    }
  }
  print "--> $cat.csl corresponds to $cat.txt\n";

  return print "--> $cat.\{txt, int, csl\} are OK\n";
}

# Checks that $cat.txt and $cat.int correspond line by line.
#   $cat       - path prefix of the two files (without extension).
#   $symtab    - reference to a hash mapping symbol -> integer id.
#   $sym_check - when true, additionally require that no symbol occurs twice
#                and that every key of %silence and %nonsilence occurs.
# On failure sets $exit = 1 and returns right after printing the message.
sub check_txt_int {
  my ($cat, $symtab, $sym_check) = @_;
  print "Checking $cat.\{txt, int\} ...\n";
  if (-z "$cat.txt") {
    $exit = 1; return print "--> ERROR: $cat.txt is empty or does not exist\n";
  }
  if (-z "$cat.int") {
    $exit = 1; return print "--> ERROR: $cat.int is empty or does not exist\n";
  }
  if (!open(TXT, "<$cat.txt")) {
    $exit = 1; return print "--> ERROR: fail to open $cat.txt\n";
  }
  if (!open(INT, "<$cat.int")) {
    $exit = 1; return print "--> ERROR: fail to open $cat.int\n";
  }

  # Read the .txt lines into $entry[1..], after stripping the leading
  # "shared/not-shared split/not-split" prefix and the trailing
  # nonword/begin/end/internal/singleton marker, so that only the symbols
  # themselves remain.
  $idx1 = 1;
  check_allowed_whitespace(\*TXT) or $exit = 1;
  while (<TXT>) {
    chomp;
    s/^(shared|not-shared) (split|not-split) //g;
    s/ nonword$//g;
    s/ begin$//g;
    s/ end$//g;
    s/ internal$//g;
    s/ singleton$//g;
    $entry[$idx1] = $_;
    $idx1 ++;
  }
  close(TXT); $idx1 --;
  print "--> $idx1 entry/entries in $cat.txt\n";

  # Compare each .int line (after the same stripping) with the matching .txt
  # line: same number of fields, and each id equal to the id of the symbol.
  my %used_syms = ();
  $idx2 = 1;
  while (<INT>) {
    chomp;
    s/^(shared|not-shared) (split|not-split) //g;
    s/ nonword$//g;
    s/ begin$//g;
    s/ end$//g;
    s/ internal$//g;
    s/ singleton$//g;
    my @col = split(" ", $_);
    @set = split(" ", $entry[$idx2]);
    if (@set != @col) {
      $exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line $idx2)\n";
    }
    foreach (0 .. @set-1) {
      if ($symtab->{@set[$_]} ne @col[$_]) {
        $exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line $idx2, block " ,$_+1, ")\n";
      }
      if ($sym_check && defined $used_syms{@set[$_]}) {
        $exit = 1; return print "--> ERROR: $cat.txt and $cat.int contain duplicate symbols (break at line $idx2, block " ,$_+1, ")\n";
      }
      $used_syms{@set[$_]} = 1;
    }
    $idx2 ++;
  }
  close(INT); $idx2 --;
  if ($idx1 != $idx2) {
    $exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line ", $idx2+1, ")\n";
  }
  print "--> $cat.int corresponds to $cat.txt\n";

  # With $sym_check, every silence and non-silence phone must have been seen.
  if ($sym_check) {
    while ( my ($key, $value) = each(%silence) ) {
      if (!defined $used_syms{$key}) {
        $exit = 1; return print "--> ERROR: $cat.txt and $cat.int do not contain all silence phones\n";
      }
    }
    while ( my ($key, $value) = each(%nonsilence) ) {
      if (!defined $used_syms{$key}) {
        $exit = 1; return print "--> ERROR: $cat.txt and $cat.int do not contain all non-silence phones\n";
      }
    }
  }

  return print "--> $cat.\{txt, int\} are OK\n";
}

# Check disjoint and summation -------------------------------
# Returns the keys of the first hash (given by reference) that also exist in
# the second one.  The result is also left in the globals @itset / %itset.
sub intersect {
  my ($first, $second) = @_;
  @itset = ();
  %itset = ();
  for my $key (keys %$first) {
    next unless exists $second->{$key};
    next if $itset{$key};
    push @itset, $key;
    $itset{$key} = 1;
  }
  return @itset;
}

# Loads phones/silence.txt, phones/nonsilence.txt and (unless
# $skip_disambig_check is set) phones/disambig.txt into the globals
# %silence/@silence, %nonsilence/@nonsilence and %disambig, reporting
# duplicated phones, and then checks that the three sets are pairwise
# disjoint.  Failures set $exit = 1.
sub check_disjoint {
  print "Checking disjoint: silence.txt, nonsilence.txt, disambig.txt ...\n";
  if (!open(S, "<$lang/phones/silence.txt")) {
    $exit = 1; return print "--> ERROR: fail to open $lang/phones/silence.txt\n";
  }
  if (!open(N, "<$lang/phones/nonsilence.txt")) {
    $exit = 1; return print "--> ERROR: fail to open $lang/phones/nonsilence.txt\n";
  }
  if (!$skip_disambig_check && !open(D, "<$lang/phones/disambig.txt")) {
    $exit = 1; return print "--> ERROR: fail to open $lang/phones/disambig.txt\n";
  }

  # Only the first field of each line is used as the phone.
  $idx = 1;
  while (<S>) {
    chomp;
    my @col = split(" ", $_);
    $phone = shift @col;
    if ($silence{$phone}) {
      $exit = 1; print "--> ERROR: phone \"$phone\" duplicates in $lang/phones/silence.txt (line $idx)\n";
    }
    $silence{$phone} = 1;
    push(@silence, $phone);
    $idx ++;
  }
  close(S);

  $idx = 1;
  while (<N>) {
    chomp;
    my @col = split(" ", $_);
    $phone = shift @col;
    if ($nonsilence{$phone}) {
      $exit = 1; print "--> ERROR: phone \"$phone\" duplicates in $lang/phones/nonsilence.txt (line $idx)\n";
    }
    $nonsilence{$phone} = 1;
    push(@nonsilence, $phone);
    $idx ++;
  }
  close(N);

  # D is only opened when the disambig check is enabled; do not read from an
  # unopened handle otherwise (%disambig then simply stays empty).
  if (!$skip_disambig_check) {
    $idx = 1;
    while (<D>) {
      chomp;
      my @col = split(" ", $_);
      $phone = shift @col;
      if ($disambig{$phone}) {
        $exit = 1; print "--> ERROR: phone \"$phone\" duplicates in $lang/phones/disambig.txt (line $idx)\n";
      }
      $disambig{$phone} = 1;
      $idx ++;
    }
    close(D);
  }

  my @itsect1 = intersect(\%silence, \%nonsilence);
  my @itsect2 = intersect(\%silence, \%disambig);
  my @itsect3 = intersect(\%disambig, \%nonsilence);

  $success = 1;
  if (@itsect1 != 0) {
    $success = 0;
    $exit = 1; print "--> ERROR: silence.txt and nonsilence.txt have intersection -- ";
    foreach (@itsect1) {
      print $_, " ";
    }
    print "\n";
  } else {
    print "--> silence.txt and nonsilence.txt are disjoint\n";
  }

  if (@itsect2 != 0) {
    $success = 0;
    $exit = 1; print "--> ERROR: silence.txt and disambig.txt have intersection -- ";
    foreach (@itsect2) {
      print $_, " ";
    }
    print "\n";
  } else {
    print "--> silence.txt and disambig.txt are disjoint\n";
  }

  if (@itsect3 != 0) {
    $success = 0;
    $exit = 1; print "--> ERROR: disambig.txt and nonsilence.txt have intersection -- ";
    # List the disambig/nonsilence overlap itself (@itsect3, not @itsect1).
    foreach (@itsect3) {
      print $_, " ";
    }
    print "\n";
  } else {
    print "--> disambig.txt and nonsilence.txt are disjoint\n";
  }

  $success == 0 || print "--> disjoint property is OK\n";
  return;
}

# Verifies that every phone in %psymtab is accounted for: it must be a key of
# %silence, %nonsilence or %disambig, be "<eps>", or start with "#nonterm".
# Empty %silence / %nonsilence are errors; an empty %disambig only raises a
# warning (and only when the disambig check is enabled).
sub check_summation {
  print "Checking sumation: silence.txt, nonsilence.txt, disambig.txt ...\n";
  unless (%silence) {
    $exit = 1;
    return print "--> ERROR: $lang/phones/silence.txt is empty or does not exist\n";
  }
  unless (%nonsilence) {
    $exit = 1;
    return print "--> ERROR: $lang/phones/nonsilence.txt is empty or does not exist\n";
  }
  if (!$skip_disambig_check && !%disambig) {
    $warning = 1;
    print "--> WARNING: $lang/phones/disambig.txt is empty or does not exist\n";
  }

  # Union of all explained symbols.
  %sum = (%silence, %nonsilence, %disambig, "<eps>" => 1);

  my $num_unexplained = 0;
  foreach my $phone (keys %psymtab) {
    next if defined $sum{$phone};
    next if $phone =~ m/^#nonterm/;
    $exit = 1;
    $num_unexplained++;
    print("--> ERROR: phone $phone is not in silence.txt, nonsilence.txt or disambig.txt...\n");
  }

  print "--> found no unexplainable phones in phones.txt\n" if $num_unexplained == 0;
  return;
}

# Phone sets filled by check_disjoint and used by the checks that follow.
%silence = ();
@silence = ();
%nonsilence = ();
@nonsilence = ();
%disambig = ();
check_disjoint; print "\n";
check_summation; print "\n";

# @list1: categories checked as .txt/.int/.csl triples;
# @list2: categories checked as .txt/.int pairs with the symbol check enabled.
@list1 = ("context_indep", "nonsilence", "silence", "optional_silence");
@list2 = ("roots", "sets");
if (!$skip_disambig_check) {
    push(@list1, "disambig");
}
foreach (@list1) {
  check_txt_int_csl("$lang/phones/$_", \%psymtab); print "\n";
}
foreach (@list2) {
  check_txt_int("$lang/phones/$_", \%psymtab, 1); print "\n";
}
# extra_questions.{txt,int} may be empty, but both files have to be present.
if ((-s "$lang/phones/extra_questions.txt") || (-s "$lang/phones/extra_questions.int")) {
  check_txt_int("$lang/phones/extra_questions", \%psymtab, 0); print "\n";
} else {
  print "Checking $lang/phones/extra_questions.\{txt, int\} ...\n";
  if (!((-f "$lang/phones/extra_questions.txt") && (-f "$lang/phones/extra_questions.int"))) {
    print "--> ERROR: $lang/phones/extra_questions.\{txt, int\} do not exist (they may be empty, but should be present)\n\n";
    $exit = 1;
  }
}
# The word-boundary file is optional; its base name was chosen above
# according to $subword_check.
if (-e "$lang/phones/$word_boundary.txt") {
  check_txt_int("$lang/phones/$word_boundary", \%psymtab, 0); print "\n";
}

# Checking optional_silence.txt -------------------------------
# The file must hold exactly one line with exactly one phone, and that phone
# must be a key of %silence.
print "Checking optional_silence.txt ...\n";
$success = 1;
$idx = 1;
if (-z "$lang/phones/optional_silence.txt") {
  print "--> ERROR: $lang/phones/optional_silence.txt is empty or does not exist\n";
  ($exit, $success) = (1, 0);
}
unless (open(OS, "<$lang/phones/optional_silence.txt")) {
  print "--> ERROR: fail to open $lang/phones/optional_silence.txt\n";
  ($exit, $success) = (1, 0);
}
print "--> reading $lang/phones/optional_silence.txt\n";
while (<OS>) {
  chomp;
  my @col = split(" ", $_);
  if ($idx > 1 || @col > 1) {
    # Either a second line or more than one token on this line.
    print "--> ERROR: only 1 phone expected in $lang/phones/optional_silence.txt\n";
    ($exit, $success) = (1, 0);
  } elsif (!$silence{$col[0]}) {
    print "--> ERROR: phone $col[0] not found in $lang/phones/silence_phones.txt\n";
    ($exit, $success) = (1, 0);
  }
  $idx++;
}
close(OS);
print "--> $lang/phones/optional_silence.txt is OK\n" unless $success == 0;
print "\n";

unless ($skip_disambig_check) {
  # Check disambiguation symbols -------------------------------
  # Missing "#0"/"#1" (or an empty table) only raises $warning, never $exit.
  print "Checking disambiguation symbols: #0 and #1\n";
  if (!scalar(keys %disambig)) {
    print "--> WARNING: $lang/phones/disambig.txt is empty or does not exist\n";
    $warning = 1;
  }
  my $has_both = (exists $disambig{"#0"} and exists $disambig{"#1"});
  if (!$has_both) {
    print "--> WARNING: $lang/phones/disambig.txt doesn't have \"#0\" or \"#1\";\n";
    print "-->          this would not be OK with a conventional ARPA-type language\n";
    print "-->          model or a conventional lexicon (L.fst)\n";
    $warning = 1;
  } else {
    print "--> $lang/phones/disambig.txt has \"#0\" and \"#1\"\n";
    print "--> $lang/phones/disambig.txt is OK\n\n";
  }
}


# Check topo -------------------------------
# Every integer phone id listed in $lang/topo must appear exactly once, must be
# known to phones.txt (%pint2sym), and the set of phones covered by topo must
# equal the union of @silence and @nonsilence.
print "Checking topo ...\n";
if (-z "$lang/topo") {
  $exit = 1; print "--> ERROR: $lang/topo is empty or does not exist\n";
}
if (!open(T, "<$lang/topo")) {
  $exit = 1; print "--> ERROR: fail to open $lang/topo\n";
} else {
  $topo_ok = 1;
  $idx = 1;
  %phones_in_topo_int_hash = ( );   # integer phone id -> 1
  %phones_in_topo_hash = ( );       # phone symbol     -> 1
  while (<T>) {
    chomp;
    # Skip lines that consist of a single <...> tag; every token on the
    # remaining lines is treated as an integer phone id.
    next if (m/^<.*>[ ]*$/);
    foreach $i (split(" ", $_)) {
      if (defined $phones_in_topo_int_hash{$i}) {
        $topo_ok = 0;
        $exit = 1; print "--> ERROR: $lang/topo has phone $i twice\n";
      }
      if (!defined $pint2sym{$i}) {
        $topo_ok = 0;
        $exit = 1; print "--> ERROR: $lang/topo has phone $i which is not in phones.txt\n";
      }
      $phones_in_topo_int_hash{$i} = 1;
      $phones_in_topo_hash{$pint2sym{$i}} = 1;
    }
  }
  close(T);
  # Clear the hash itself (the original assigned a hashref to an unrelated scalar).
  %phones_that_should_be_in_topo_hash = ( );
  foreach $p (@silence, @nonsilence) { $phones_that_should_be_in_topo_hash{$p} = 1; }
  foreach $p (keys %phones_that_should_be_in_topo_hash) {
    if ( ! defined $phones_in_topo_hash{$p}) {
      $topo_ok = 0;
      # $p is a symbol, so its integer label comes from the symbol->int table.
      $i = $psymtab{$p};
      $exit = 1; print "--> ERROR: $lang/topo does not cover phone $p (label = $i)\n";
    }
  }
  foreach $i (keys %phones_in_topo_int_hash) {
    $p = $pint2sym{$i};
    if ( ! defined $phones_that_should_be_in_topo_hash{$p}) {
      $topo_ok = 0;
      $exit = 1; print "--> ERROR: $lang/topo covers phone $p (label = $i) which is not a real phone\n";
    }
  }
  if ($topo_ok) {
    # The original had the bare string here, so the message was never printed.
    print "--> $lang/topo is OK\n";
  }
  print "\n";
}

# Check word_boundary -------------------------------
# Each of these accumulates a space-separated list of the phones that carry
# the corresponding tag in $word_boundary.txt.
$nonword   = "";
$begin     = "";
$end       = "";
$internal  = "";
$singleton = "";
if (-s "$lang/phones/$word_boundary.txt") {
  print "Checking $word_boundary.txt: silence.txt, nonsilence.txt, disambig.txt ...\n";
  if (!open (W, "<$lang/phones/$word_boundary.txt")) {
    $exit = 1; print "--> ERROR: fail to open $lang/phones/$word_boundary.txt\n";
  }
  $idx = 1;
  %wb = ();   # phone -> 1 for every phone listed in the word-boundary file
  while (<W>) {
    chomp;
    my @col;
    # For each recognised trailing tag: strip the tag from $_ in place, split
    # what is left, and record the phone when exactly one column remains.
    # NOTE(review): unlike the other four, the "nonword" substitution is not
    # anchored with `$`.
    if (m/^.*nonword$/  ) {
      s/ nonword//g;    @col = split(" ", $_); if (@col == 1) {$nonword   .= "$col[0] ";}
    }
    if (m/^.*begin$/    ) {
      s/ begin$//g;     @col = split(" ", $_); if (@col == 1) {$begin     .= "$col[0] ";}
    }
    if (m/^.*end$/      ) {
      s/ end$//g;       @col = split(" ", $_); if (@col == 1) {$end       .= "$col[0] ";}
    }
    if (m/^.*internal$/ ) {
      s/ internal$//g;  @col = split(" ", $_); if (@col == 1) {$internal  .= "$col[0] ";}
    }
    if (m/^.*singleton$/) {
      s/ singleton$//g; @col = split(" ", $_); if (@col == 1) {$singleton .= "$col[0] ";}
    }
    # @col stays empty when no tag matched, which is reported here as well.
    if (@col != 1) {
      $exit = 1; print "--> ERROR: expect 1 column in $lang/phones/$word_boundary.txt (line $idx)\n";
    }
    $wb{shift @col} = 1;
    $idx ++;
  }
  close(W);

  # Property 1: no disambiguation symbol may be listed in the word-boundary file.
  @itset = intersect(\%disambig, \%wb);
  $success1 = 1;
  if (@itset != 0) {
    $success1 = 0;
    $exit = 1; print "--> ERROR: $lang/phones/$word_boundary.txt has disambiguation symbols -- ";
    foreach (@itset) {
      print "$_ ";
    }
    print "\n";
  }
  $success1 == 0 || print "--> $lang/phones/$word_boundary.txt doesn't include disambiguation symbols\n";

  # Property 2: the listed phones must be exactly silence + nonsilence.  The
  # intersection is compared in size against each side to detect phones
  # missing from either set.
  %sum = (%silence, %nonsilence);
  @itset = intersect(\%sum, \%wb);
  %itset = (); foreach(@itset) {$itset{$_} = 1;}
  $success2 = 1;
  if (@itset < scalar(keys %sum)) {
    $success2 = 0;
    $exit = 1; print "--> ERROR: phones in nonsilence.txt and silence.txt but not in $word_boundary.txt -- ";
    foreach (keys %sum) {
      if (!$itset{$_}) {
        print "$_ ";
      }
    }
    print "\n";
  }
  if (@itset < scalar(keys %wb)) {
    $success2 = 0;
    $exit = 1; print "--> ERROR: phones in $word_boundary.txt but not in nonsilence.txt or silence.txt -- ";
    foreach (keys %wb) {
      if (!$itset{$_}) {
        print "$_ ";
      }
    }
    print "\n";
  }
  $success2 == 0 || print "--> $lang/phones/$word_boundary.txt is the union of nonsilence.txt and silence.txt\n";
  # `||` binds tighter than `or`, so this parses as
  # ($success1 != 1) or (($success2 != 1) || print ...): the OK line is printed
  # only when both flags are 1.
  $success1 != 1 or $success2 != 1 || print "--> $lang/phones/$word_boundary.txt is OK\n";
  print "\n";
}


# The block is labelled so that the error paths can abandon this check with
# `last`.  The original used `return`, which is fatal in mainline code
# ("Can't return outside a subroutine") and aborted the script before the
# remaining checks and the final summary could run.
WDISAMBIG_CHECK: {
  print "Checking word-level disambiguation symbols...\n";
  # This block checks that one of the two following conditions holds:
  # (1) for lang directories prepared by older versions of prepare_lang.sh
  #     (no phones/wdisambig.txt): the symbol '#0' is expected in words.txt
  #     (its absence only produces a warning);
  # or (2): the files wdisambig.txt, wdisambig_words.int and wdisambig_phones.int
  #     exist, have the same number of lines, and agree line by line with
  #     words.txt (%wsymtab) and phones.txt (%psymtab) respectively.

  # note, %wdisambig_words_hash hashes from the integer word-id of word-level
  # disambiguation symbols, to 1 if the word is a disambig symbol.

  if (! -e "$lang/phones/wdisambig.txt") {
    print "--> no $lang/phones/wdisambig.txt (older prepare_lang.sh)\n";
    if (exists $wsymtab{"#0"}) {
      print "--> $lang/words.txt has \"#0\"\n";
      $wdisambig_words_hash{$wsymtab{"#0"}} = 1;
    } else {
      print "--> WARNING: $lang/words.txt doesn't have \"#0\"\n";
      print "-->          (if you are using ARPA-type language models, you will normally\n";
      print "-->           need the disambiguation symbol \"#0\" to ensure determinizability)\n";
    }
  } else {
    print "--> $lang/phones/wdisambig.txt exists (newer prepare_lang.sh)\n";
    if (!open(T, "<$lang/phones/wdisambig.txt")) {
      print "--> ERROR: fail to open $lang/phones/wdisambig.txt\n"; $exit = 1; last WDISAMBIG_CHECK;
    }
    chomp(my @wdisambig = <T>);
    close(T);
    if (!open(W, "<$lang/phones/wdisambig_words.int")) {
      print "--> ERROR: fail to open $lang/phones/wdisambig_words.int\n"; $exit = 1; last WDISAMBIG_CHECK;
    }
    chomp(my @wdisambig_words = <W>);
    close(W);
    if (!open(P, "<$lang/phones/wdisambig_phones.int")) {
      print "--> ERROR: fail to open $lang/phones/wdisambig_phones.int\n"; $exit = 1; last WDISAMBIG_CHECK;
    }
    chomp(my @wdisambig_phones = <P>);
    close(P);

    # All three files must have the same number of lines.
    my $len = scalar(@wdisambig);
    if (scalar(@wdisambig_words) != $len) {
      print "--> ERROR: files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_words.int have different lengths\n";
      $exit = 1; last WDISAMBIG_CHECK;
    }
    if (scalar(@wdisambig_phones) != $len) {
      print "--> ERROR: files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_phones.int have different lengths\n";
      $exit = 1; last WDISAMBIG_CHECK;
    }
    # Line i of the .int files must be the integer id of symbol i in the
    # word / phone symbol table.
    for (my $i = 0; $i < $len; $i++) {
      if ($wsymtab{$wdisambig[$i]} ne $wdisambig_words[$i]) {
        my $ii = $i + 1;
        print "--> ERROR: line $ii of files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_words.int mismatch\n";
        $exit = 1; last WDISAMBIG_CHECK;
      }
    }
    for (my $i = 0; $i < $len; $i++) {
      if ($psymtab{$wdisambig[$i]} ne $wdisambig_phones[$i]) {
        my $ii = $i + 1;
        print "--> ERROR: line $ii of files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_phones.int mismatch\n";
        $exit = 1; last WDISAMBIG_CHECK;
      }
    }
    foreach my $i ( @wdisambig_words ) {
      $wdisambig_words_hash{$i} = 1;
    }
  }
}

# Check validity of L.fst, L_disambig.fst, and word_boundary.int.
# First we generate a random word/subword sequence. We then compile it into fst and compose it with L.fst/L_disambig.fst.
# For subword case the last subword of the sequence must be a end-subword
# (i.e. the subword can only be at the end of word or is a single word itself)
# to guarantee the composition would not fail.
# We then get the corresponding phones sequence and apply a transition matrix on it to get the number of valid boundaries.
# In word case, the number of valid boundaries should be equal to the number of words.
# In subword case, the number of valid boundaries should be equal to the number of end-subwords.
if (-s "$lang/phones/$word_boundary.int") {
  print "Checking $word_boundary.int and disambig.int\n";
  # %wbtype: integer phone id -> word-boundary type (second column of the file).
  if (!open (W, "<$lang/phones/$word_boundary.int")) {
    $exit = 1; print "--> ERROR: fail to open $lang/phones/$word_boundary.int\n";
  }
  while (<W>) {
    @A = split;
    if (@A != 2) {
      $exit = 1; print "--> ERROR: bad line $_ in $lang/phones/$word_boundary.int\n";
    }
    $wbtype{$A[0]} = $A[1];
  }
  close(W);
  # %is_disambig: integer phone id -> 1 for every entry of disambig.int.
  if (!open (D, "<$lang/phones/disambig.int")) {
    $exit = 1; print "--> ERROR: fail to open $lang/phones/disambig.int\n";
  }
  while (<D>) {
    @A = split;
    if (@A != 1) {
      $exit = 1; print "--> ERROR: bad line $_ in $lang/phones/disambig.int\n";
    }
    $is_disambig{$A[0]} = 1;
  }
  close(D);   # the original never closed this handle

  # Anything printed while sourcing path.sh would be mixed into the phone
  # sequence captured from the pipeline below, so refuse to continue.
  $text = `. ./path.sh`;
  if ($text ne "") {
    print "*** This script cannot continue because your path.sh or bash profile prints something: $text" .
      "*** Please fix that and try again.\n";
    exit(1);
  }

  foreach $fst ("L.fst", "L_disambig.fst") {
    if ($skip_generate_words_check) {
      next;
    }
    $wlen = int(rand(100)) + 1;
    $end_subword = 0;
    print "--> generating a $wlen word/subword sequence\n";
    $wordseq = "";        # linear acceptor in OpenFst text format
    $sid = 0;             # current state id of that acceptor
    $wordseq_syms = "";   # the same sequence as symbols, for error messages
    # exclude disambiguation symbols, BOS and EOS, epsilon, and
    # grammar-related symbols from the word sequence.
    while ($sid < ($wlen - 1)) {
      $id = int(rand(scalar(keys %wint2sym)));
      while (defined $wdisambig_words_hash{$id} or
             $wint2sym{$id} eq "<s>" or $wint2sym{$id} eq "</s>" or
             $wint2sym{$id} =~ m/^#nonterm/ or $id == 0) {
        $id = int(rand(scalar(keys %wint2sym)));
      }
      $wordseq_syms = $wordseq_syms . $wint2sym{$id} . " ";
      $wordseq = $wordseq . "$sid ". ($sid + 1) . " $id $id 0\n";
      $sid ++;

      if ($subword_check) {
        # A subword that does not end with the separator closes a word.
        $subword = $wint2sym{$id};
        $suffix = substr($subword, -$separator_length, $separator_length);
        if ($suffix ne $separator) {
          $end_subword ++;
        }
      }
    }

    # generate the last word (subword)
    $id = int(rand(scalar(keys %wint2sym)));
    if ($subword_check) {
      $subword = $wint2sym{$id};
      $suffix = substr($subword, -$separator_length, $separator_length);
      # the last subword can not be followed by the separator
      while (defined $wdisambig_words_hash{$id} or
             $wint2sym{$id} eq "<s>" or $wint2sym{$id} eq "</s>" or
             $wint2sym{$id} =~ m/^#nonterm/ or $id == 0 or $suffix eq $separator) {
        $id = int(rand(scalar(keys %wint2sym)));
        $subword = $wint2sym{$id};
        $suffix = substr($subword, -$separator_length, $separator_length);
      }
      $end_subword ++;
    } else {
      while (defined $wdisambig_words_hash{$id} or
             $wint2sym{$id} eq "<s>" or $wint2sym{$id} eq "</s>" or
             $wint2sym{$id} =~ m/^#nonterm/ or $id == 0) {
        $id = int(rand(scalar(keys %wint2sym)));
      }
    }
    $wordseq_syms = $wordseq_syms . $wint2sym{$id} . " ";
    $wordseq = $wordseq . "$sid ". ($sid + 1) . " $id $id 0\n";
    $sid ++;

    # Final-state line, then push the acceptor through the lexicon and read
    # back the third column of fstprint for every arc line.
    $wordseq = $wordseq . "$sid 0";
    $phoneseq = `. ./path.sh; echo \"$wordseq" | fstcompile | fstcompose $lang/$fst - | fstproject | fstrandgen | fstrmepsilon | fsttopsort | fstprint | awk '{if (NF > 2) {print \$3}}';`;
    # Empty assoc. array of allowed transitions between phone types.  1 means we count a word,
    # 0 means transition is allowed.  bos and eos are added as extra symbols here.
    # (The original assigned a hashref to the scalar $transition, which left
    # %transition untouched.)
    %transition = ( );
    foreach $x ("bos", "nonword", "end", "singleton") {
      $transition{$x, "nonword"} = 0;
      $transition{$x, "begin"} = 1;
      $transition{$x, "singleton"} = 1;
      $transition{$x, "eos"} = 0;
    }
    $transition{"begin", "end"} = 0;
    $transition{"begin", "internal"} = 0;
    $transition{"internal", "internal"} = 0;
    $transition{"internal", "end"} = 0;

    $cur_state = "bos";
    $num_words = 0;
    foreach $phone (split (" ", "$phoneseq <<eos>>")) {
      # Note: now that we support unk-LMs (see the --unk-fst option to
      # prepare_lang.sh), the regular L.fst may contain some disambiguation
      # symbols.
      if (! defined $is_disambig{$phone}) {
        if ($phone eq "<<eos>>") {
          $state = "eos";
        } elsif ($phone == 0) {
          $exit = 1; print "--> ERROR: unexpected phone sequence=$phoneseq, wordseq=$wordseq\n"; last;
        } else {
          $state = $wbtype{$phone};
        }
        if (!defined $state) {
          $exit = 1; print "--> ERROR: phone $phone is not specified in $lang/phones/$word_boundary.int\n";
          last;
        } elsif (!defined $transition{$cur_state, $state}) {
          $exit = 1; print "--> ERROR: transition from state $cur_state to $state indicates error in $word_boundary.int or L.fst\n";
          last;
        } else {
          $num_words += $transition{$cur_state, $state};
          $cur_state = $state;
        }
      }
    }
    if (!$exit) {
      if ($subword_check) {
        # In the subword case the expected count is the number of end-subwords.
        $wlen = $end_subword;
      }
      if ($num_words != $wlen) {
        $phoneseq_syms = "";
        foreach my $id (split(" ", $phoneseq)) { $phoneseq_syms = $phoneseq_syms . " " . $pint2sym{$id}; }
        $exit = 1; print "--> ERROR: number of reconstructed words $num_words does not match real number of words $wlen; indicates problem in $fst or $word_boundary.int.  phoneseq = $phoneseq_syms, wordseq = $wordseq_syms\n";
      } else {
        print "--> resulting phone sequence from $fst corresponds to the word sequence\n";
        print "--> $fst is OK\n";
      }
    }
  }
  print "\n";
}

# Check oov -------------------------------
check_txt_int("$lang/oov", \%wsymtab, 0); print "\n";

# Check if L.fst is olabel sorted.
# The grep exit status (0 = property line found) decides the outcome.
if (-e "$lang/L.fst") {
  $cmd = "fstinfo $lang/L.fst | grep -E 'output label sorted.*y' > /dev/null";
  $res = system(". ./path.sh; $cmd");
  if ($res != 0) {
    $exit = 1;
    print "--> ERROR: $lang/L.fst is not olabel sorted\n";
  } else {
    print "--> $lang/L.fst is olabel sorted\n";
  }
}

# Check if L_disambig.fst is olabel sorted.
if (-e "$lang/L_disambig.fst") {
  $cmd = "fstinfo $lang/L_disambig.fst | grep -E 'output label sorted.*y' > /dev/null";
  $res = system(". ./path.sh; $cmd");
  if ($res != 0) {
    $exit = 1;
    print "--> ERROR: $lang/L_disambig.fst is not olabel sorted\n";
  } else {
    print "--> $lang/L_disambig.fst is olabel sorted\n";
  }
}

if (-e "$lang/G.fst") {
  # Check that G.fst is ilabel sorted and nonempty, based on the fstinfo report.
  $text = `. ./path.sh; fstinfo $lang/G.fst`;
  if ($? != 0) {
    $exit = 1;
    print "--> ERROR: fstinfo failed on $lang/G.fst\n";
  }
  if ($text !~ m/input label sorted\s+y/) {
    $exit = 1;
    print "--> ERROR: $lang/G.fst is not ilabel sorted\n";
  } else {
    print "--> $lang/G.fst is ilabel sorted\n";
  }
  if ($text =~ m/# of states\s+(\d+)/) {
    $num_states = $1;
    if ($num_states != 0) {
      print "--> $lang/G.fst has $num_states states\n";
    } else {
      $exit = 1;
      print "--> ERROR: $lang/G.fst is empty\n";
    }
  }

  # Check that G.fst is determinizable.
  unless ($skip_det_check) {
    # fstdeterminizestar is much faster, and a more relevant test as it's what
    # we do in the actual graph creation recipe.
    if (-e "$lang/G.fst") {
      $cmd = "fstdeterminizestar $lang/G.fst /dev/null";
      $res = system(". ./path.sh; $cmd");
      if ($res != 0) {
        $exit = 1;
        print "--> ERROR: fail to determinize $lang/G.fst\n";
      } else {
        print "--> $lang/G.fst is determinizable\n";
      }
    }
  }

  # Check that G.fst does not have cycles with only disambiguation symbols or
  # epsilons on the input, or the forbidden symbols <s> and </s> (and a few
  # related checks); this is delegated to an external script.
  if (-e "$lang/G.fst") {
    system("utils/lang/check_g_properties.pl $lang");
    if ($? == 0) {
      print("--> utils/lang/check_g_properties.pl succeeded.\n");
    } else {
      $exit = 1;
      print "--> ERROR: failure running check_g_properties.pl\n";
    }
  }
}


# Compose L_disambig with G, determinize, and accept the result when fstinfo
# reports a state count starting with a non-zero digit.
if (!$skip_det_check && -e "$lang/G.fst" && -e "$lang/L_disambig.fst") {
  print "--> Testing determinizability of L_disambig . G\n";
  $output = `. ./path.sh; fsttablecompose $lang/L_disambig.fst $lang/G.fst | fstdeterminizestar | fstinfo 2>&1 `;
  if ($output !~ m/# of states\s*[1-9]/) {
    $exit = 1;
    print "--> ERROR: fail to determinize L_disambig . G.  Output is:\n";
    print "$output\n";
  } else {
    print "--> L_disambig . G is determinizable\n";
  }
}

# Final verdict: any recorded error gives exit status 1; warnings alone still
# exit with status 0.
if ($exit == 1) {
  print "--> ERROR (see error messages above)\n";
  exit 1;
}
if ($warning == 1) {
  print "--> WARNING (check output above for warnings)\n";
} else {
  print "--> SUCCESS [validating lang directory $lang]\n";
}
exit 0;


================================================
FILE: egs/utils/validate_text.pl
================================================
#!/usr/bin/env perl
#
#===============================================================================
# Copyright 2017  Johns Hopkins University (author: Yenda Trmal <jtrmal@gmail.com>)
#                 Johns Hopkins University (author: Daniel Povey)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
#===============================================================================

# validation script for data/<dataset>/text
# to be called (preferably) from utils/validate_data_dir.sh
use strict;
use warnings;
use utf8;
use Fcntl qw< SEEK_SET >;

# Read every line from the already-opened file handle given as the first
# argument and test whether each line is valid UTF-8.
#
# Returns a list whose first element is a flag:
#   (1, @decoded_lines)  when every line decoded as UTF-8;
#   (0, @raw_lines)      otherwise -- the file is then assumed to use some
#                        1-byte encoding (ISO-8859-x, Windows CP-x, ...) and
#                        the untouched byte strings are returned.
# We do not really care about the actual encoding; we just need the length of
# the (decoded) strings to be right so that output formatting looks correct.
sub get_utf8_or_bytestream {
  use Encode qw(decode encode);
  my $file = shift;
  my $is_utf_compatible = 1;
  my @unicode_lines;
  my @raw_lines;

  # A lexical loop variable keeps the caller's $_ intact, and the implicit
  # defined() test lets a final line consisting of just "0" through.
  while (my $raw_text = <$file>) {
    # Store the bytes before decoding so the raw copy can never be altered.
    push @raw_lines, $raw_text;
    next unless $is_utf_compatible;   # once one line fails, stop decoding

    # LEAVE_SRC: without it, a non-zero CHECK lets decode() overwrite its
    # input in place.
    my $decoded_text = eval {
      decode("UTF-8", $raw_text, Encode::FB_CROAK() | Encode::LEAVE_SRC())
    };
    if (defined($decoded_text)) {
      push @unicode_lines, $decoded_text;
    } else {
      $is_utf_compatible = 0;
    }
  }

  if (!$is_utf_compatible) {
    return (0, @raw_lines);
  } else {
    return (1, @unicode_lines);
  }
}

# Scan the decoded lines (array reference) for whitespace problems.
# Returns 1 -- after printing a diagnostic to STDERR -- at the first line that
# does not end in LF, contains CR, or contains any whitespace character other
# than TAB, LF and SPACE; returns 0 when every line is clean.
sub validate_utf8_whitespaces {
  my ($unicode_lines) = @_;
  use feature 'unicode_strings';
  for my $i (0 .. $#{$unicode_lines}) {
    my $current_line = $unicode_lines->[$i];
    unless (substr($current_line, -1) eq "\n") {
      print STDERR "$0: The current line (nr. $i) has invalid newline\n";
      return 1;
    }
    # The first whitespace-separated token identifies the utterance.
    my ($utt_id) = split(" ", $current_line);
    if ($current_line =~ /\x{000d}/) {
      print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n";
      return 1;
    }
    # Mask the permitted whitespace (TAB, LF, SPACE); anything that still
    # matches \s afterwards is disallowed.
    $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g;
    next unless $current_line =~/\s/;
    print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n";
    return 1;
  }
  return 0;
}

# Check the text behind the file handle $file (named $filename in messages).
# When the content is UTF-8 compatible it must contain only allowed
# whitespace; content that is not UTF-8 is accepted without further checks.
# The handle is rewound to the position it had on entry.
# Returns 1 when the file is acceptable, 0 otherwise.
sub check_allowed_whitespace {
  my ($file, $filename) = @_;
  my $pos = tell($file);
  my ($is_utf, @lines) = get_utf8_or_bytestream($file);
  seek($file, $pos, SEEK_SET);

  return 1 unless $is_utf;
  return 1 unless validate_utf8_whitespaces(\@lines);

  print STDERR "$0: ERROR: text file '$filename' contains disallowed UTF-8 whitespace character(s)\n";
  return 0;
}

# Entry point: expects exactly one argument, the text file to validate.
die "Usage: validate_text.pl <text-file>\n" .
    "e.g.: validate_text.pl data/train/text\n" unless @ARGV == 1;

my $text = shift @ARGV;

if (-z "$text") {
  print STDERR "$0: ERROR: file '$text' is empty or does not exist\n";
  exit 1;
}

unless (open(FILE, "<$text")) {
  print STDERR "$0: ERROR: failed to open $text\n";
  exit 1;
}

exit 1 unless check_allowed_whitespace(\*FILE, $text);
close(FILE);


================================================
FILE: egs/utils/write_kwslist.pl
================================================
#!/usr/bin/env perl

# Copyright 2012  Johns Hopkins University (Author: Guoguo Chen)
# Apache 2.0.
#
use strict;
use warnings;
use Getopt::Long;

my $Usage = <<EOU;
This script reads the raw keyword search results [result.*] and writes them as the kwslist.xml file.
It can also do things like score normalization, decision making, duplicates removal, etc.

Usage: utils/write_kwslist.pl [options] <raw_result_in|-> <kwslist_out|->
 e.g.: utils/write_kwslist.pl --flen=0.01 --duration=1000 --segments=data/eval/segments
                              --normalize=true --map-utter=data/kws/utter_map raw_results kwslist.xml

Allowed options:
  --beta                      : Beta value when computing ATWV              (float,   default = 999.9)
  --digits                    : How many digits should the score use        (int,     default = "infinite")
  --duptime                   : Tolerance for duplicates                    (float,   default = 0.5)
  --duration                  : Duration of all audio, you must set this    (float,   default = 999.9)
  --ecf-filename              : ECF file name                               (string,  default = "") 
  --flen                      : Frame length                                (float,   default = 0.01)
  --index-size                : Size of index                               (float,   default = 0)
  --kwlist-filename           : Kwlist.xml file name                        (string,  default = "") 
  --language                  : Language type                               (string,  default = "cantonese")
  --map-utter                 : Map utterance for evaluation                (string,  default = "")
  --normalize                 : Normalize scores or not                     (boolean, default = false)
  --Ntrue-scale               : Keyword independent scale factor for Ntrue  (float,   default = 1.0)
  --remove-dup                : Remove duplicates                           (boolean, default = false)
  --remove-NO                 : Remove the "NO" decision instances          (boolean, default = false)
  --segments                  : Segments file from Kaldi                    (string,  default = "")
  --system-id                 : System ID                                   (string,  default = "")
  --verbose                   : Verbose level (higher --> more kws section) (integer, default = 0)
  --YES-cutoff                : Only keep "\$YES-cutoff" yeses for each kw   (int,    default = -1)
  --nbest                     | Output upto nbest hits into the kwlist      (int,     default = -1)

EOU

# Option defaults, grouped by kind; GetOptions below overrides them.
# File names and identifiers.
my $segment         = "";
my $map_utter       = "";
my $ecf_filename    = "";
my $kwlist_filename = "";
my $system_id       = "";
my $language        = "cantonese";
# Numeric settings.
my $flen        = 0.01;
my $beta        = 999.9;
my $duration    = 999.9;
my $index_size  = 0;
my $Ntrue_scale = 1.0;
my $digits      = 0;
my $duptime     = 0.5;
my $verbose     = 0;
my $YES_cutoff  = -1;
my $nbest_max   = -1;
# "Boolean" switches are kept as the strings "true" / "false".
my $normalize  = "false";
my $remove_dup = "false";
my $remove_NO  = "false";

GetOptions(
  'beta=f'            => \$beta,
  'digits=i'          => \$digits,
  'duptime=f'         => \$duptime,
  'duration=f'        => \$duration,
  'ecf-filename=s'    => \$ecf_filename,
  'flen=f'            => \$flen,
  'index-size=f'      => \$index_size,
  'kwlist-filename=s' => \$kwlist_filename,
  'language=s'        => \$language,
  'map-utter=s'       => \$map_utter,
  'normalize=s'       => \$normalize,
  'Ntrue-scale=f'     => \$Ntrue_scale,
  'remove-dup=s'      => \$remove_dup,
  'remove-NO=s'       => \$remove_NO,
  'segments=s'        => \$segment,
  'system-id=s'       => \$system_id,
  'verbose=i'         => \$verbose,
  'YES-cutoff=i'      => \$YES_cutoff,
  'nbest=i'           => \$nbest_max,
) or die "Cannot continue\n";

# The string-valued switches accept nothing but "true" or "false".
die "$0: Bad value for option --normalize\n"
  unless ($normalize eq "true" || $normalize eq "false");
die "$0: Bad value for option --remove-dup\n"
  unless ($remove_dup eq "true" || $remove_dup eq "false");
die "$0: Bad value for option --remove-NO\n"
  unless ($remove_NO eq "true" || $remove_NO eq "false");

# Open the optional side tables first, then validate the positional arguments.
if ($segment) {
  open(SEG, "<$segment") or die "$0: Fail to open segment file $segment\n";
}
if ($map_utter) {
  open(UTT, "<$map_utter") or die "$0: Fail to open utterance table $map_utter\n";
}

die $Usage unless @ARGV == 2;

# Get parameters
my ($filein, $fileout) = splice(@ARGV, 0, 2);

# Get input source: "-" selects standard input, anything else is opened as I.
# $source holds the NAME of the handle to read from.
my $source = "STDIN";
if ($filein ne "-") {
  open(I, "<$filein") or die "$0: Fail to open input file $filein\n";
  $source = "I";
}

# Get symbol table and start time: first column -> third column of each
# 4-column segments line.
my %tbeg;
if ($segment) {
  while (<SEG>) {
    chomp;
    my @col = split(" ", $_);
    die "$0: Bad number of columns in $segment \"$_\"\n" unless @col == 4;
    my ($seg_id, $seg_beg) = @col[0, 2];
    $tbeg{$seg_id} = $seg_beg;
  }
}

# Get utterance mapper: first column -> second column of each 2-column line.
my %utter_mapper;
if ($map_utter) {
  while (<UTT>) {
    chomp;
    my @col = split(" ", $_);
    die "$0: Bad number of columns in $map_utter \"$_\"\n" unless @col == 2;
    my ($from, $to) = @col;
    $utter_mapper{$from} = $to;
  }
}

# Function for printing Kwslist.xml
#
# Arguments:
#   $info - array ref: [kwlist_filename, language, system_id]
#   $KWS  - array ref of hit entries, each an array ref laid out as
#           [kwid, file, channel, tbeg, dur, score, decision]
#           with optional trailing [7] threshold and [8] raw_score.
#           Hits of the same keyword are expected to be adjacent, since a new
#           <detected_kwlist> element is opened whenever the kwid changes.
# Returns the complete XML document as one string.
# When the file-level $nbest_max is positive, at most that many hits are
# written per keyword.
sub PrintKwslist {
  my ($info, $KWS) = @_;

  my $kwslist = "";

  # Start printing
  $kwslist .= "<kwslist kwlist_filename=\"$info->[0]\" language=\"$info->[1]\" system_id=\"$info->[2]\">\n";
  my $prev_kw = "";
  my $nbest = $nbest_max;   # hits still allowed for the current keyword
  foreach my $kwentry (@{$KWS}) {
    # These are numbers, so compare them numerically (the original used the
    # string operators `le` / `gt`).
    if (($prev_kw eq $kwentry->[0]) && ($nbest <= 0) && ($nbest_max > 0)) {
      next;
    }
    if ($prev_kw ne $kwentry->[0]) {
      # New keyword: close the previous element, open a new one and reset
      # the per-keyword budget.
      if ($prev_kw ne "") {$kwslist .= "  </detected_kwlist>\n";}
      $kwslist .= "  <detected_kwlist kwid=\"$kwentry->[0]\" search_time=\"1\" oov_count=\"0\">\n";
      $prev_kw = $kwentry->[0];
      $nbest = $nbest_max;
    }
    $nbest -= 1 if $nbest_max > 0;
    my $score = sprintf("%g", $kwentry->[5]);
    $kwslist .= "    <kw file=\"$kwentry->[1]\" channel=\"$kwentry->[2]\" tbeg=\"$kwentry->[3]\" dur=\"$kwentry->[4]\" score=\"$score\" decision=\"$kwentry->[6]\"";
    if (defined($kwentry->[7])) {$kwslist .= " threshold=\"$kwentry->[7]\"";}
    if (defined($kwentry->[8])) {$kwslist .= " raw_score=\"$kwentry->[8]\"";}
    $kwslist .= "/>\n";
  }
  if ($prev_kw ne "") {$kwslist .= "  </detected_kwlist>\n";}
  $kwslist .= "</kwslist>\n";

  return $kwslist;
}

# Function for sorting
# Comparator for `sort` (operates on the package variables $a and $b).
# Order: keyword id first -- by trailing number when both ids end in digits,
# otherwise as strings -- then score ([5]) descending, then file ([1]).
sub KwslistOutputSort {
  my ($x, $y) = ($a, $b);

  if ($x->[0] eq $y->[0]) {
    return $y->[5] <=> $x->[5] if $x->[5] ne $y->[5];
    return $x->[1] cmp $y->[1];
  }

  if ($x->[0] =~ m/[0-9]+$/ && $y->[0] =~ m/[0-9]+$/) {
    my ($x_num) = $x->[0] =~ /([0-9]*)$/;
    my ($y_num) = $y->[0] =~ /([0-9]*)$/;
    return $x_num <=> $y_num;
  }
  return $x->[0] cmp $y->[0];
}
# Three-way comparison of two hit entries for duplicate removal.
# Order: kwid [0], file [1], channel [2] as strings; then start time [3], but
# only when the two starts are at least $duptime apart; otherwise the higher
# score [5] comes first and, on equal scores, the larger duration [4].
sub KwslistDupSort {
  my ($lhs, $rhs, $duptime) = @_;

  foreach my $field (0, 1, 2) {
    return $lhs->[$field] cmp $rhs->[$field]
      if $lhs->[$field] ne $rhs->[$field];
  }
  return $lhs->[3] <=> $rhs->[3] if abs($lhs->[3]-$rhs->[3]) >= $duptime;
  return $rhs->[5] <=> $lhs->[5] if $lhs->[5] ne $rhs->[5];
  return $rhs->[4] <=> $lhs->[4];
}

# Processing
# Every raw result line has five columns; as used below they are: keyword id,
# utterance, start frame, end frame, and a value whose negated exponential is
# the score.  Each hit is stored as
# [kwid, utterance, channel (always 1), start, duration, score, decision ("")].
my @KWS;
while (<$source>) {
  chomp;
  my @col = split(" ", $_);
  die "$0: Bad number of columns in raw results \"$_\"\n" unless @col == 5;
  my ($kwid, $utter, $beg_frame, $end_frame, $neg_log_score) = @col;

  my $start = sprintf("%.2f", $beg_frame*$flen);
  # The duration is taken relative to the start BEFORE any segment offset.
  my $dur = sprintf("%.2f", $end_frame*$flen-$start);
  my $score = exp(-$neg_log_score);

  # Shift the start by the begin time recorded for this utterance.
  $start = sprintf("%.2f", $start+$tbeg{$utter}) if $segment;

  if ($map_utter) {
    my $utter_x = $utter_mapper{$utter};
    die "Unmapped utterance $utter\n" unless $utter_x;
    $utter = $utter_x;
  }

  push(@KWS, [$kwid, $utter, 1, $start, $dur, $score, ""]);
}

# Sum of the hit scores per keyword.
my %Ntrue = ();
foreach my $hit (@KWS) {
  my ($kwid, $score) = @{$hit}[0, 5];
  $Ntrue{$kwid} = 0.0 unless defined($Ntrue{$kwid});
  $Ntrue{$kwid} += $score;
}

# Scale the Ntrue, then derive the per-keyword decision threshold from it.
my %threshold;
foreach my $kwid (keys %Ntrue) {
  my $scaled = $Ntrue{$kwid} * $Ntrue_scale;
  $Ntrue{$kwid} = $scaled;
  $threshold{$kwid} = $scaled/($duration/$beta+($beta-1)/$beta*$scaled);
}

# Removing duplicates
# After sorting with KwslistDupSort, a hit is dropped when it has the same
# kwid, file and channel as the last kept hit and starts less than $duptime
# away from it.
if ($remove_dup eq "true") {
  my @sorted = sort {KwslistDupSort($a, $b, $duptime)} @KWS;
  @KWS = ();
  foreach my $curr (@sorted) {
    # The first hit is always kept.
    if (!@KWS) {
      push(@KWS, $curr);
      next;
    }
    my $prev = $KWS[-1];
    my $is_dup = (abs($prev->[3]-$curr->[3]) < $duptime)
              && ($prev->[2] eq $curr->[2])
              && ($prev->[1] eq $curr->[1])
              && ($prev->[0] eq $curr->[0]);
    push(@KWS, $curr) unless $is_dup;
  }
}

# Output format of the score: "%g" by default, or fixed-point with $digits
# decimals when $digits compares greater than "0".
# NOTE(review): `gt` is a string comparison, not a numeric one.
my $format_string = "%g";
if ($digits gt 0 ) {
  $format_string = "%." . $digits ."f";
}

my @info = ($kwlist_filename, $language, $system_id);
# Number of YES decisions per keyword (0 for keywords that only have NO hits).
my %YES_count;
foreach my $kwentry (@KWS) {
  # A hit is a YES when its score exceeds the per-keyword threshold.
  my $threshold = $threshold{$kwentry->[0]};
  if ($kwentry->[5] > $threshold) {
    $kwentry->[6] = "YES";
    if (defined($YES_count{$kwentry->[0]})) {
      $YES_count{$kwentry->[0]} ++;
    } else {
      $YES_count{$kwentry->[0]} = 1;
    }
  } else {
    $kwentry->[6] = "NO";
    if (!defined($YES_count{$kwentry->[0]})) {
      $YES_count{$kwentry->[0]} = 0;
    }
  }
  if ($verbose > 0) {
    # Verbose output: append the threshold as an extra column.
    push(@{$kwentry}, sprintf("%g", $threshold));
  }
  if ($normalize eq "true") {
    if ($verbose > 0) {
      # Verbose output: also keep the score as it was before normalisation.
      push(@{$kwentry}, $kwentry->[5]);
    }
    # Normalised score = (1-t)*s / ((1-t)*s + (1-s)*t), with s the score and
    # t the threshold. The score is left unchanged when the denominator is 0.
    my $numerator = (1-$threshold)*$kwentry->[5];
    my $denominator = (1-$threshold)*$kwentry->[5]+(1-$kwentry->[5])*$threshold;
    if ($denominator != 0) {
      $kwentry->[5] = sprintf($format_string, $numerator/$denominator);
    } else {
      $kwentry->[5] = sprintf($format_string, $kwentry->[5]);
    }
  } else {
    $kwentry->[5] = sprintf($format_string, $kwentry->[5]);
  }
}

# Output sorting
my @tmp = sort KwslistOutputSort @KWS;

# Process the YES-cutoff. Note that you don't need this for the normal cases where
# hits and false alarms are balanced
# The scan starts at the second entry and $count starts at 1, so the first
# entry of every keyword is never modified and is counted regardless of its
# decision. For the remaining entries of a keyword:
#   - if the keyword has more than 2*$YES_cutoff YES decisions, the entry is
#     turned into a NO with score 0;
#   - otherwise, once $count has reached $YES_cutoff, further YES entries are
#     turned into a NO with score 0;
#   - otherwise a YES entry increments $count.
if ($YES_cutoff != -1) {
  my $count = 1;
  for (my $i = 1; $i < scalar(@tmp); $i ++) { 
    if ($tmp[$i]->[0] ne $tmp[$i-1]->[0]) {
      # First entry of a new keyword: restart the counter.
      $count = 1;
      next;
    }
    if ($YES_count{$tmp[$i]->[0]} > $YES_cutoff*2) {
      $tmp[$i]->[6] = "NO";
      $tmp[$i]->[5] = 0;
      next;
    }
    if (($count == $YES_cutoff) && ($tmp[$i]->[6] eq "YES")) {
      $tmp[$i]->[6] = "NO";
      $tmp[$i]->[5] = 0;
      next;
    }
    if ($tmp[$i]->[6] eq "YES") {
      $count ++;
    }
  }
}

# Process the remove-NO decision
# When enabled, only the entries decided as YES are kept in @tmp.
if ($remove_NO eq "true") {
  # This lexical @KWS is a copy of @tmp that shadows the outer @KWS inside
  # this block only.
  my @KWS = @tmp;
  @tmp = ();
  for (my $i = 0; $i < scalar(@KWS); $i ++) {
    if ($KWS[$i]->[6] eq "YES") {
      push(@tmp, $KWS[$i]);
    }
  }
}

# Printing
my $kwslist = PrintKwslist(\@info, \@tmp);

# Close the input handles that were opened, then write the result either to
# standard output (when $fileout is "-") or to the file $fileout.
if ($segment) {close(SEG);}
if ($map_utter) {close(UTT);}
if ($filein  ne "-") {close(I);}
if ($fileout eq "-") {
    print $kwslist;
} else {
  open(O, ">$fileout") || die "$0: Fail to open output file $fileout\n";
  print O $kwslist;
  close(O);
}


================================================
FILE: env/build_env.sh
================================================
# Author: Jinchuan Tian; tianjinchuan@stu.pku.edu.cn
# Build the environment for the ASR repository https://github.com/jctian98/e2e_lfmmi

# This is only an example; you may need to revise this script to suit your machine.
# It is unlikely to run unattended, so you may need to run it line by line.

# Our system:
# Centos 7; GCC 7.3.1
# Python; Pytorch 1.7.1, 

# Directory under which Kaldi and ESPnet are checked out and built.
rootdir=/home/tian/tools/opensource
# First stage to run; every stage numbered >= this value is executed.
# Abort with a usage message when it is missing: with an empty value every
# "[ ... -le N ]" test below fails with an error and all stages are skipped.
stage=${1:?"usage: $0 <stage>  (stages 1-5; every stage >= <stage> is run)"}
# Number of parallel make jobs.
nj=48

# Stop right away if the install root does not exist; otherwise the stages
# below would clone and build in whatever directory the script was started from.
cd "$rootdir" || exit 1
if [ "${stage}" -le 1 ]; then
  echo "Install GCC 7.3.1 and system dependency. You need root account for this"
  yum install -y cmake sox libsndfile ffmpeg flac

  yum install -y centos-release-scl
  yum install -y devtoolset-7
  # NOTE: this starts a new bash with devtoolset-7 enabled; the remaining
  # commands of this script only run after that shell exits.
  scl enable devtoolset-7 bash
  # After this, run 'gcc -v' to ensure the GCC version is correct
fi

if [ "${stage}" -le 2 ]; then
  echo "Install Kaldi and its auxiliary tools"
  git clone https://github.com/kaldi-asr/kaldi.git
  cd kaldi/tools/
  bash extras/check_dependencies.sh # make sure it's ok

  make -j "$nj"
  cd ../src/
  ./configure --shared
  make depend -j "$nj"
  make -j "$nj"

  # Additionally, you need kaldi_lm to train word-level N-gram LM
  cd ../tools
  bash extras/install_kaldi_lm.sh
fi

if [ "${stage}" -le 3 ]; then
  echo "Install Espnet environment"
  # git clone https://github.com/espnet/espnet
  cd "$rootdir/espnet/tools"
  ln -s "$rootdir/kaldi" .

  # Build espnet environment. You may choose other versions
  CONDA_TOOLS_DIR=$(dirname "${CONDA_EXE}")/..
  ./setup_anaconda.sh "${CONDA_TOOLS_DIR}" lfmmi 3.8
  make TH_VERSION=1.7.1 CUDA_VERSION=10.1

  # The NT wrapper (warp-transducer) is required if you will run NT examples.
  # Installing this wrapper is difficult. This issue might be
  # helpful: https://github.com/HawkAaron/warp-transducer/pull/90
  # Also, in our installation the pytorch test did not pass because
  # the gradients mismatch the desired value
  installers/install_warp-transducer.sh

  # to use our code rather than standard espnet code
  # (-y: do not stop at pip's interactive confirmation prompt)
  pip3 uninstall -y espnet
fi

if [ "${stage}" -le 4 ]; then
  echo "Install k2 library"
  conda install -c k2-fsa -c pytorch -c conda-forge k2 python=3.8 cudatoolkit=10.1 pytorch=1.7.1

fi

if [ "${stage}" -le 5 ]; then
  echo "Install other python libraries"
  pip3 install kaldilm chainer==6.0.0 kaldialign graphviz lhotse numpy==1.20
fi


================================================
FILE: kaldi
================================================
../kaldi/

================================================
FILE: lm/__init__.py
================================================
"""Initialize sub package."""


================================================
FILE: lm/chainer_backend/__init__.py
================================================
"""Initialize sub package."""


================================================
FILE: lm/chainer_backend/extlm.py
================================================
#!/usr/bin/env python3

# Copyright 2018 Mitsubishi Electric Research Laboratories (Takaaki Hori)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)


import math

import chainer
import chainer.functions as F
from espnet.lm.lm_utils import make_lexical_tree


# Definition of a multi-level (subword/word) language model
class MultiLevelLM(chainer.Chain):
    """Multi-level language model combining a word LM and a subword LM.

    Subword labels are scored by ``subwordlm`` (scaled by ``subwordlm_weight``).
    The scores of the ``<space>`` and ``<eos>`` labels are replaced by
    word-level scores from ``wordlm`` for the word spelled since the last
    ``<space>``, found by walking the lexical tree built by
    ``make_lexical_tree``.

    The decoding state is always the 6-tuple
    ``(clm_state, wlm_state, wlm_logprobs, node, log_y, clm_logprob)``;
    ``node`` is ``None`` once the spelled prefix has left the lexical tree.
    """

    logzero = -10000000000.0
    zero = 1.0e-10

    def __init__(
        self,
        wordlm,
        subwordlm,
        word_dict,
        subword_dict,
        subwordlm_weight=0.8,
        oov_penalty=1.0,
        open_vocab=True,
    ):
        """Store both LMs and build the lexical tree.

        :param wordlm: word-level LM, called as ``wordlm(state, x)`` and
            returning ``(state, logits)``
        :param subwordlm: subword-level LM with the same calling convention
        :param dict word_dict: word to id map; must contain ``<eos>`` and ``<unk>``
        :param dict subword_dict: subword to id map; must contain ``<space>``
            and ``<eos>``
        :param float subwordlm_weight: scale applied to subword log-probabilities
        :param float oov_penalty: penalty whose logarithm is added to the
            ``<unk>`` word log-probability
        :param bool open_vocab: if False, prefixes that leave the lexical tree
            get ``logzero`` scores
        """
        super(MultiLevelLM, self).__init__()
        self.wordlm = wordlm
        self.subwordlm = subwordlm
        self.word_eos = word_dict["<eos>"]
        self.word_unk = word_dict["<unk>"]
        self.xp_word_eos = self.xp.full(1, self.word_eos, "i")
        self.xp_word_unk = self.xp.full(1, self.word_unk, "i")
        self.space = subword_dict["<space>"]
        self.eos = subword_dict["<eos>"]
        self.lexroot = make_lexical_tree(word_dict, subword_dict, self.word_unk)
        self.log_oov_penalty = math.log(oov_penalty)
        self.open_vocab = open_vocab
        self.subword_dict_size = len(subword_dict)
        self.subwordlm_weight = subwordlm_weight
        # tells wrappers that __call__ already returns log-probabilities
        self.normalized = True

    def __call__(self, state, x):
        """Advance the LM by one subword label.

        :param state: the 6-tuple state of the previous step, or None to start
        :param x: the input subword label
        :return: a tuple (new state, log-probability vector over subwords)
        """
        # update state with input label x
        if state is None:  # make initial states and log-prob vectors
            wlm_state, z_wlm = self.wordlm(None, self.xp_word_eos)
            wlm_logprobs = F.log_softmax(z_wlm).data
            clm_state, z_clm = self.subwordlm(None, x)
            log_y = F.log_softmax(z_clm).data * self.subwordlm_weight
            new_node = self.lexroot
            clm_logprob = 0.0
            xi = self.space
        else:
            clm_state, wlm_state, wlm_logprobs, node, log_y, clm_logprob = state
            xi = int(x)
            if xi == self.space:  # inter-word transition
                if node is not None and node[1] >= 0:  # check if the node is word end
                    w = self.xp.full(1, node[1], "i")
                else:  # this node is not a word end, which means <unk>
                    w = self.xp_word_unk
                # update wordlm state and log-prob vector
                wlm_state, z_wlm = self.wordlm(wlm_state, w)
                wlm_logprobs = F.log_softmax(z_wlm).data
                new_node = self.lexroot  # move to the tree root
                clm_logprob = 0.0
            elif node is not None and xi in node[0]:  # intra-word transition
                new_node = node[0][xi]
                clm_logprob += log_y[0, xi]
            elif self.open_vocab:  # if no path in the tree, enter open-vocabulary mode
                new_node = None
                clm_logprob += log_y[0, xi]
            else:  # if open_vocab flag is disabled, return 0 probabilities
                log_y = self.xp.full((1, self.subword_dict_size), self.logzero, "f")
                # Keep the 6-field state layout (with node=None) so that a
                # following __call__ or final can still unpack the state.
                return (clm_state, wlm_state, wlm_logprobs, None, log_y, 0.0), log_y

            clm_state, z_clm = self.subwordlm(clm_state, x)
            log_y = F.log_softmax(z_clm).data * self.subwordlm_weight

        # apply word-level probabilities for <space> and <eos> labels
        if xi != self.space:
            if new_node is not None and new_node[1] >= 0:  # if new node is word end
                # the subword score accumulated for this word is subtracted
                # from the word-level score
                wlm_logprob = wlm_logprobs[:, new_node[1]] - clm_logprob
            else:
                wlm_logprob = wlm_logprobs[:, self.word_unk] + self.log_oov_penalty
            log_y[:, self.space] = wlm_logprob
            log_y[:, self.eos] = wlm_logprob
        else:
            # right after a <space> (or at the start) neither <space> nor
            # <eos> may follow
            log_y[:, self.space] = self.logzero
            log_y[:, self.eos] = self.logzero

        return (clm_state, wlm_state, wlm_logprobs, new_node, log_y, clm_logprob), log_y

    def final(self, state):
        """Return the word-level log-probability of ``<eos>`` for a finished state.

        The word LM is first advanced by the pending word (or ``<unk>`` if the
        current node is not a word end).

        :param state: the 6-tuple state returned by ``__call__``
        :return: log-probability of the word-level ``<eos>``
        """
        clm_state, wlm_state, wlm_logprobs, node, log_y, clm_logprob = state
        if node is not None and node[1] >= 0:  # check if the node is word end
            w = self.xp.full(1, node[1], "i")
        else:  # this node is not a word end, which means <unk>
            w = self.xp_word_unk
        wlm_state, z_wlm = self.wordlm(wlm_state, w)
        return F.log_softmax(z_wlm).data[:, self.word_eos]


# Definition of a look-ahead word language model
class LookAheadWordLM(chainer.Chain):
    """Word language model scored at subword level with look-ahead.

    Subword scores are derived from the word LM alone: the probability of
    extending the current prefix by a subword is the probability mass of the
    child node divided by the mass of the current node, both read from the
    cumulative sum of the word distribution.

    The decoding state is the 3-tuple ``(wlm_state, cumsum_probs, node)``;
    ``node`` is ``None`` once the spelled prefix has left the lexical tree.
    """

    logzero = -10000000000.0
    zero = 1.0e-10

    def __init__(
        self, wordlm, word_dict, subword_dict, oov_penalty=0.0001, open_vocab=True
    ):
        """Store the word LM and build the lexical tree.

        :param wordlm: word-level LM, called as ``wordlm(state, x)`` and
            returning ``(state, logits)``
        :param dict word_dict: word to id map; must contain ``<eos>`` and ``<unk>``
        :param dict subword_dict: subword to id map; must contain ``<space>``
            and ``<eos>``
        :param float oov_penalty: factor multiplied with the ``<unk>``
            probability to get the default subword probability
        :param bool open_vocab: if False, prefixes that leave the lexical tree
            get ``logzero`` scores
        """
        super(LookAheadWordLM, self).__init__()
        self.wordlm = wordlm
        self.word_eos = word_dict["<eos>"]
        self.word_unk = word_dict["<unk>"]
        self.xp_word_eos = self.xp.full(1, self.word_eos, "i")
        self.xp_word_unk = self.xp.full(1, self.word_unk, "i")
        self.space = subword_dict["<space>"]
        self.eos = subword_dict["<eos>"]
        self.lexroot = make_lexical_tree(word_dict, subword_dict, self.word_unk)
        self.oov_penalty = oov_penalty
        self.open_vocab = open_vocab
        self.subword_dict_size = len(subword_dict)
        # tells wrappers that __call__ already returns log-probabilities
        self.normalized = True

    def __call__(self, state, x):
        """Advance the LM by one subword label.

        :param state: the 3-tuple state of the previous step, or None to start
        :param x: the input subword label
        :return: a tuple (new state, log-probability vector over subwords)
        """
        # update state with input label x
        if state is None:  # make initial states and cumulative probability vector
            wlm_state, z_wlm = self.wordlm(None, self.xp_word_eos)
            cumsum_probs = self.xp.cumsum(F.softmax(z_wlm).data, axis=1)
            new_node = self.lexroot
            xi = self.space
        else:
            wlm_state, cumsum_probs, node = state
            xi = int(x)
            if xi == self.space:  # inter-word transition
                if node is not None and node[1] >= 0:  # check if the node is word end
                    w = self.xp.full(1, node[1], "i")
                else:  # this node is not a word end, which means <unk>
                    w = self.xp_word_unk
                # update wordlm state and cumulative probability vector
                wlm_state, z_wlm = self.wordlm(wlm_state, w)
                cumsum_probs = self.xp.cumsum(F.softmax(z_wlm).data, axis=1)
                new_node = self.lexroot  # move to the tree root
            elif node is not None and xi in node[0]:  # intra-word transition
                new_node = node[0][xi]
            elif self.open_vocab:  # if no path in the tree, enter open-vocabulary mode
                new_node = None
            else:  # if open_vocab flag is disabled, return 0 probabilities
                log_y = self.xp.full((1, self.subword_dict_size), self.logzero, "f")
                return (wlm_state, None, None), log_y

        if new_node is not None:
            # a tree node is (successors, word id, wids); wids is indexed as a
            # pair into cumsum_probs -- presumably the word-id range covered
            # by the node's subtree; confirm against make_lexical_tree
            succ, wid, wids = new_node
            # compute parent node probability
            sum_prob = (
                (cumsum_probs[:, wids[1]] - cumsum_probs[:, wids[0]])
                if wids is not None
                else 1.0
            )
            if sum_prob < self.zero:
                # the current prefix has (almost) no probability mass left
                log_y = self.xp.full((1, self.subword_dict_size), self.logzero, "f")
                return (wlm_state, cumsum_probs, new_node), log_y
            # set <unk> probability as a default value
            unk_prob = (
                cumsum_probs[:, self.word_unk] - cumsum_probs[:, self.word_unk - 1]
            )
            y = self.xp.full(
                (1, self.subword_dict_size), unk_prob * self.oov_penalty, "f"
            )
            # compute transition probabilities to child nodes
            for cid, nd in succ.items():
                y[:, cid] = (
                    cumsum_probs[:, nd[2][1]] - cumsum_probs[:, nd[2][0]]
                ) / sum_prob
            # apply word-level probabilities for <space> and <eos> labels
            if wid >= 0:
                wlm_prob = (cumsum_probs[:, wid] - cumsum_probs[:, wid - 1]) / sum_prob
                y[:, self.space] = wlm_prob
                y[:, self.eos] = wlm_prob
            elif xi == self.space:
                # right after a <space> (or at the start) neither <space> nor
                # <eos> may follow
                y[:, self.space] = self.zero
                y[:, self.eos] = self.zero
            log_y = self.xp.log(
                self.xp.clip(y, self.zero, None)
            )  # clip to avoid log(0)
        else:  # if no path in the tree, transition probability is one
            log_y = self.xp.zeros((1, self.subword_dict_size), "f")
        return (wlm_state, cumsum_probs, new_node), log_y

    def final(self, state):
        """Return the word-level log-probability of ``<eos>`` for a finished state.

        The word LM is first advanced by the pending word (or ``<unk>`` if the
        current node is not a word end).

        :param state: the 3-tuple state returned by ``__call__``
        :return: log-probability of the word-level ``<eos>``
        """
        wlm_state, cumsum_probs, node = state
        if node is not None and node[1] >= 0:  # check if the node is word end
            w = self.xp.full(1, node[1], "i")
        else:  # this node is not a word end, which means <unk>
            w = self.xp_word_unk
        wlm_state, z_wlm = self.wordlm(wlm_state, w)
        return F.log_softmax(z_wlm).data[:, self.word_eos]


================================================
FILE: lm/chainer_backend/lm.py
================================================
#!/usr/bin/env python3

# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

# This code is ported from the following implementation written in Torch.
# https://github.com/chainer/chainer/blob/master/examples/ptb/train_ptb_custom_loop.py


import copy
import json
import logging
import numpy as np
import six

import chainer
from chainer.dataset import convert
import chainer.functions as F
import chainer.links as L

# for classifier link
from chainer.functions.loss import softmax_cross_entropy
from chainer import link
from chainer import reporter
from chainer import training
from chainer.training import extensions

from espnet.lm.lm_utils import compute_perplexity
from espnet.lm.lm_utils import count_tokens
from espnet.lm.lm_utils import MakeSymlinkToBestModel
from espnet.lm.lm_utils import ParallelSentenceIterator
from espnet.lm.lm_utils import read_tokens

import espnet.nets.chainer_backend.deterministic_embed_id as DL
from espnet.nets.lm_interface import LMInterface
from espnet.optimizer.factory import dynamic_import_optimizer
from espnet.scheduler.chainer import ChainerScheduler
from espnet.scheduler.scheduler import dynamic_import_scheduler

from espnet.utils.training.tensorboard_logger import TensorboardLogger
from tensorboardX import SummaryWriter

from espnet.utils.deterministic_utils import set_deterministic_chainer
from espnet.utils.training.evaluator import BaseEvaluator
from espnet.utils.training.iterators import ShufflingEnabler
from espnet.utils.training.train_utils import check_early_stop
from espnet.utils.training.train_utils import set_early_stop


# TODO(karita): reimplement RNNLM with new interface
class DefaultRNNLM(LMInterface, link.Chain):
    """Default RNNLM wrapper to compute reduce framewise loss values.

    Args:
        n_vocab (int): The size of the vocabulary
        args (argparse.Namespace): configurations. see `add_arguments`
    """

    @staticmethod
    def add_arguments(parser):
        """Register the RNNLM options on ``parser`` and return it.

        Adds ``--type`` (``lstm`` or ``gru``), ``--layer``/``-l`` and
        ``--unit``/``-u``.
        """
        parser.add_argument(
            "--type",
            type=str,
            default="lstm",
            nargs="?",
            choices=["lstm", "gru"],
            help="Which type of RNN to use",
        )
        # The two size options only differ in their flags, default and help.
        integer_options = (
            (("--layer", "-l"), 2, "Number of hidden layers"),
            (("--unit", "-u"), 650, "Number of hidden units"),
        )
        for flags, default_value, description in integer_options:
            parser.add_argument(
                *flags, type=int, default=default_value, help=description
            )
        return parser


class ClassifierWithState(link.Chain):
    """Loss-computing wrapper around a stateful chainer RNNLM.

    :param link.Chain predictor : The RNNLM
    :param function lossfun: The loss function to use
    :param int/str label_key: position (int) or keyword name (str) of the
        ground-truth labels among the call arguments
    """

    def __init__(
        self,
        predictor,
        lossfun=softmax_cross_entropy.softmax_cross_entropy,
        label_key=-1,
    ):
        if not isinstance(label_key, (int, str)):
            raise TypeError("label_key must be int or str, but is %s" % type(label_key))

        super(ClassifierWithState, self).__init__()
        self.lossfun = lossfun
        self.y = None
        self.loss = None
        self.label_key = label_key

        # register the predictor as a child link
        with self.init_scope():
            self.predictor = predictor

    def __call__(self, state, *args, **kwargs):
        """Compute the loss for one input/label pair.

            The ground-truth labels are picked out of the arguments according
            to ``label_key``: an ``int`` selects the corresponding element of
            ``args``, a ``str`` selects the entry of ``kwargs``. Everything
            else is passed to the predictor as features, and its output is
            compared with the labels using ``lossfun``. The prediction and
            the loss are also stored in ``self.y`` and ``self.loss``.

        :param state : The LM state
        :param list[chainer.Variable] args : Input minibatch
        :param dict[chainer.Variable] kwargs : Input minibatch
        :return a tuple (new state, loss value)
        :rtype chainer.Variable
        """
        key = self.label_key
        if isinstance(key, int):
            n_args = len(args)
            if key < -n_args or key >= n_args:
                raise ValueError("Label key %d is out of bounds" % key)
            t = args[key]
            # key == -1 needs its own slice: args[key + 1:] would be args[0:]
            args = args[:-1] if key == -1 else args[:key] + args[key + 1 :]
        elif isinstance(key, str):
            if key not in kwargs:
                raise ValueError('Label key "%s" is not found' % key)
            t = kwargs.pop(key)

        # reset the cached results before running the predictor
        self.y = None
        self.loss = None
        state, self.y = self.predictor(state, *args, **kwargs)
        self.loss = self.lossfun(self.y, t)
        return state, self.loss

    def predict(self, state, x):
        """Return log probabilities for the given state and input x.

        A predictor with a true ``normalized`` attribute is expected to return
        log probabilities itself; otherwise its output is passed through
        ``log_softmax``.

        :param state : the state
        :param x : the input
        :return a tuple (state, log prob vector)
        :rtype cupy/numpy array
        """
        if getattr(self.predictor, "normalized", False):
            return self.predictor(state, x)
        state, z = self.predictor(state, x)
        return state, F.log_softmax(z).data

    def final(self, state):
        """Return the final log probabilities for the given state.

        Delegates to ``predictor.final`` when the predictor has one and
        returns 0.0 otherwise.

        :param state : the state
        :return log probability vector
        :rtype cupy/numpy array

        """
        if not hasattr(self.predictor, "final"):
            return 0.0
        return self.predictor.final(state)


# Definition of a recurrent net for language modeling
class RNNLM(chainer.Chain):
    """A chainer RNNLM: embedding, stacked stateless RNN cells, output layer.

    :param int n_vocab: The size of the vocabulary
    :param int n_layers: The number of layers to create
    :param int n_units: The number of units per layer
    :param str typ: The RNN type ("lstm"; any other value selects GRU)
    """

    def __init__(self, n_vocab, n_layers, n_units, typ="lstm"):
        super(RNNLM, self).__init__()
        cell_class = L.StatelessLSTM if typ == "lstm" else L.StatelessGRU
        with self.init_scope():
            self.embed = DL.EmbedID(n_vocab, n_units)
            self.rnn = chainer.ChainList(
                *[cell_class(n_units, n_units) for _ in range(n_layers)]
            )
            self.lo = L.Linear(n_units, n_vocab)

        # overwrite every parameter with uniform noise in [-0.1, 0.1)
        for param in self.params():
            param.data[...] = np.random.uniform(-0.1, 0.1, param.data.shape)
        self.n_layers = n_layers
        self.n_units = n_units
        self.typ = typ

    def __call__(self, state, x):
        """Run one time step.

        :param state: dict of per-layer states ("c" and "h" for LSTM, "h" for
            GRU), or None to start from empty states
        :param x: the input token ids
        :return: a tuple (new state dict, output of the last linear layer)
        """
        is_lstm = self.typ == "lstm"
        if state is None:
            if is_lstm:
                state = {"c": [None] * self.n_layers, "h": [None] * self.n_layers}
            else:
                state = {"h": [None] * self.n_layers}

        hidden = [None] * self.n_layers
        # input of the current layer: the embedding first, then the hidden
        # output of the layer below
        layer_input = self.embed(x)
        if is_lstm:
            cell = [None] * self.n_layers
            for n in six.moves.range(self.n_layers):
                cell[n], hidden[n] = self.rnn[n](
                    state["c"][n], state["h"][n], F.dropout(layer_input)
                )
                layer_input = hidden[n]
            state = {"c": cell, "h": hidden}
        else:
            for n in six.moves.range(self.n_layers):
                if state["h"][n] is None:
                    # a missing GRU state is replaced by explicit zeros shaped
                    # after the layer input
                    xp = self.xp
                    with chainer.backends.cuda.get_device_from_id(self._device_id):
                        state["h"][n] = chainer.Variable(
                            xp.zeros(
                                (layer_input.shape[0], self.n_units),
                                dtype=layer_input.dtype,
                            )
                        )
                hidden[n] = self.rnn[n](state["h"][n], F.dropout(layer_input))
                layer_input = hidden[n]
            state = {"h": hidden}
        y = self.lo(F.dropout(hidden[-1]))
        return state, y


class BPTTUpdater(training.updaters.StandardUpdater):
    """An updater for a chainer LM with gradient accumulation.

    :param chainer.dataset.Iterator train_iter : The train iterator
    :param optimizer: the optimizer whose target is the model to train
    :param schedulers: schedulers handed to ``ChainerScheduler``
    :param int device : The device id
    :param int accum_grad : number of minibatches accumulated per update
    """

    def __init__(self, train_iter, optimizer, schedulers, device, accum_grad):
        super(BPTTUpdater, self).__init__(train_iter, optimizer, device=device)
        self.scheduler = ChainerScheduler(schedulers, optimizer)
        self.accum_grad = accum_grad

    # The core part of the update routine can be customized by overriding.
    def update_core(self):
        """Accumulate gradients over ``accum_grad`` minibatches, then update."""
        # A single iterator/optimizer passed to StandardUpdater.__init__ is
        # registered under the name 'main'.
        train_iter = self.get_iterator("main")
        optimizer = self.get_optimizer("main")
        model = optimizer.target

        n_tokens = 0
        total_loss = 0
        model.cleargrads()  # start from zero gradients
        for _ in range(self.accum_grad):
            # Fetch the next minibatch of sentences and turn it into padded
            # matrices on the target device (inputs padded with 0, targets
            # with -1).
            batch = next(train_iter)
            x, t = convert.concat_examples(batch, device=self.device, padding=(0, -1))
            xp = chainer.backends.cuda.get_array_module(x)
            batch_size, sequence_length = x.shape
            state = None
            loss = 0
            for step in six.moves.range(sequence_length):
                inputs = x[:, step]
                # Loss of this time step, weighted by the number of non-zero
                # input ids
                state, step_loss = model(
                    state, chainer.Variable(inputs), chainer.Variable(t[:, step])
                )
                n_valid = xp.count_nonzero(inputs)
                loss += step_loss * n_valid
                n_tokens += int(n_valid)
            # normalise by the batch size and the number of accumulation steps
            loss /= batch_size * self.accum_grad
            total_loss += float(loss.data)
            loss.backward()  # accumulate gradients
            loss.unchain_backward()  # drop the computational graph

        reporter.report({"loss": total_loss}, model)
        reporter.report({"count": n_tokens}, model)
        # apply the accumulated gradients and advance the schedulers
        optimizer.update()
        self.scheduler.step(self.iteration)


class LMEvaluator(BaseEvaluator):
    """A custom evaluator for a chainer LM

    :param chainer.dataset.Iterator val_iter : The validation iterator
    :param eval_model : The model to evaluate
    :param int device : The device id to use
    """

    def __init__(self, val_iter, eval_model, device):
        super(LMEvaluator, self).__init__(val_iter, eval_model, device=device)

    def evaluate(self):
        """Return an observation holding the mean loss per non-zero input id."""
        val_iter = self.get_iterator("main")
        target = self.get_target("main")
        total_loss = 0
        n_tokens = 0
        # iterate over a shallow copy so the original iterator is not consumed
        for batch in copy.copy(val_iter):
            x, t = convert.concat_examples(batch, device=self.device, padding=(0, -1))
            xp = chainer.backends.cuda.get_array_module(x)
            state = None
            for step in six.moves.range(len(x[0])):
                state, step_loss = target(state, x[:, step], t[:, step])
                n_valid = xp.count_nonzero(x[:, step])
                total_loss += step_loss.data * n_valid
                n_tokens += int(n_valid)
        # report validation loss
        mean_loss = float(total_loss / n_tokens)
        observation = {}
        with reporter.report_scope(observation):
            reporter.report({"loss": mean_loss}, target)
        return observation


def train(args):
    """Train a chainer RNNLM with the given args

    Reads the training and validation token files, builds the model,
    optimizer and chainer trainer, runs the training and, if
    ``args.test_label`` is set, reports the perplexity of the best model on
    the test set.

    :param Namespace args: The program arguments
    :raises NotImplementedError: if ``args.model_module`` is not "default"
    """
    # TODO(karita): support this
    if args.model_module != "default":
        raise NotImplementedError("chainer backend does not support --model-module")

    # display chainer version
    logging.info("chainer version = " + chainer.__version__)

    set_deterministic_chainer(args)

    # check cuda and cudnn availability
    if not chainer.cuda.available:
        logging.warning("cuda is not available")
    if not chainer.cuda.cudnn_enabled:
        logging.warning("cudnn is not available")

    # get special label ids
    unk = args.char_list_dict["<unk>"]
    eos = args.char_list_dict["<eos>"]
    # read tokens as a sequence of sentences
    train = read_tokens(args.train_label, args.char_list_dict)
    val = read_tokens(args.valid_label, args.char_list_dict)
    # count tokens
    n_train_tokens, n_train_oovs = count_tokens(train, unk)
    n_val_tokens, n_val_oovs = count_tokens(val, unk)
    logging.info("#vocab = " + str(args.n_vocab))
    logging.info("#sentences in the training data = " + str(len(train)))
    logging.info("#tokens in the training data = " + str(n_train_tokens))
    logging.info(
        "oov rate in the training data = %.2f %%"
        % (n_train_oovs / n_train_tokens * 100)
    )
    logging.info("#sentences in the validation data = " + str(len(val)))
    logging.info("#tokens in the validation data = " + str(n_val_tokens))
    logging.info(
        "oov rate in the validation data = %.2f %%" % (n_val_oovs / n_val_tokens * 100)
    )

    # sortagrad is enabled for -1 or any positive value of args.sortagrad
    use_sortagrad = args.sortagrad == -1 or args.sortagrad > 0

    # Create the dataset iterators
    # (the <eos> id is used for both sos and eos; the training data is not
    # shuffled while sortagrad is in effect)
    train_iter = ParallelSentenceIterator(
        train,
        args.batchsize,
        max_length=args.maxlen,
        sos=eos,
        eos=eos,
        shuffle=not use_sortagrad,
    )
    val_iter = ParallelSentenceIterator(
        val, args.batchsize, max_length=args.maxlen, sos=eos, eos=eos, repeat=False
    )
    # one trainer iteration consumes accum_grad minibatches
    epoch_iters = int(len(train_iter.batch_indices) / args.accum_grad)
    logging.info("#iterations per epoch = %d" % epoch_iters)
    logging.info("#total iterations = " + str(args.epoch * epoch_iters))
    # Prepare an RNNLM model
    rnn = RNNLM(args.n_vocab, args.layer, args.unit, args.type)
    model = ClassifierWithState(rnn)
    if args.ngpu > 1:
        logging.warning("currently, multi-gpu is not supported. use single gpu.")
    if args.ngpu > 0:
        # Make the specified GPU current
        gpu_id = 0
        chainer.cuda.get_device_from_id(gpu_id).use()
        model.to_gpu()
    else:
        # device id -1 is passed on to the updater and evaluators when no GPU
        # is requested
        gpu_id = -1

    # Save model conf to json
    model_conf = args.outdir + "/model.json"
    with open(model_conf, "wb") as f:
        logging.info("writing a model config file to " + model_conf)
        f.write(
            json.dumps(vars(args), indent=4, ensure_ascii=False, sort_keys=True).encode(
                "utf_8"
            )
        )

    # Set up an optimizer
    opt_class = dynamic_import_optimizer(args.opt, args.backend)
    optimizer = opt_class.from_args(model, args)
    if args.schedulers is None:
        schedulers = []
    else:
        # args.schedulers is iterated as (key, scheduler name) pairs
        schedulers = [dynamic_import_scheduler(v)(k, args) for k, v in args.schedulers]

    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(args.gradclip))

    updater = BPTTUpdater(train_iter, optimizer, schedulers, gpu_id, args.accum_grad)
    trainer = training.Trainer(updater, (args.epoch, "epoch"), out=args.outdir)
    trainer.extend(LMEvaluator(val_iter, model, device=gpu_id))
    # logging / progress reporting every report_interval_iters iterations
    trainer.extend(
        extensions.LogReport(
            postprocess=compute_perplexity,
            trigger=(args.report_interval_iters, "iteration"),
        )
    )
    trainer.extend(
        extensions.PrintReport(
            ["epoch", "iteration", "perplexity", "val_perplexity", "elapsed_time"]
        ),
        trigger=(args.report_interval_iters, "iteration"),
    )
    trainer.extend(extensions.ProgressBar(update_interval=args.report_interval_iters))
    # snapshots of the whole trainer and of the model, named by epoch
    trainer.extend(extensions.snapshot(filename="snapshot.ep.{.updater.epoch}"))
    trainer.extend(extensions.snapshot_object(model, "rnnlm.model.{.updater.epoch}"))
    # MEMO(Hori): wants to use MinValueTrigger, but it seems to fail in resuming
    trainer.extend(MakeSymlinkToBestModel("validation/main/loss", "rnnlm.model"))

    if use_sortagrad:
        # switch shuffling on after args.sortagrad epochs
        # (after args.epoch epochs when sortagrad is -1)
        trainer.extend(
            ShufflingEnabler([train_iter]),
            trigger=(args.sortagrad if args.sortagrad != -1 else args.epoch, "epoch"),
        )

    if args.resume:
        logging.info("resumed from %s" % args.resume)
        chainer.serializers.load_npz(args.resume, trainer)

    set_early_stop(trainer, args, is_lm=True)
    if args.tensorboard_dir is not None and args.tensorboard_dir != "":
        writer = SummaryWriter(args.tensorboard_dir)
        trainer.extend(
            TensorboardLogger(writer), trigger=(args.report_interval_iters, "iteration")
        )

    trainer.run()
    check_early_stop(trainer, args.epoch)

    # compute perplexity for test set
    if args.test_label:
        logging.info("test the best model")
        chainer.serializers.load_npz(args.outdir + "/rnnlm.model.best", model)
        test = read_tokens(args.test_label, args.char_list_dict)
        n_test_tokens, n_test_oovs = count_tokens(test, unk)
        logging.info("#sentences in the test data = " + str(len(test)))
        logging.info("#tokens in the test data = " + str(n_test_tokens))
        logging.info(
            "oov rate in the test data = %.2f %%" % (n_test_oovs / n_test_tokens * 100)
        )
        test_iter = ParallelSentenceIterator(
            test, args.batchsize, max_length=args.maxlen, sos=eos, eos=eos, repeat=False
        )
        evaluator = LMEvaluator(test_iter, model, device=gpu_id)
        # evaluate with chainer's train flag switched off
        with chainer.using_config("train", False):
            result = evaluator()
        logging.info("test perplexity: " + str(np.exp(float(result["main/loss"]))))


================================================
FILE: lm/lm_utils.py
================================================
#!/usr/bin/env python3

# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

# This code is ported from the following implementation written in Torch.
# https://github.com/chainer/chainer/blob/master/examples/ptb/train_ptb_custom_loop.py

import chainer
import h5py
import logging
import numpy as np
import os
import random
import six
from tqdm import tqdm

from chainer.training import extension


def load_dataset(path, label_dict, outdir=None):
    """Load an LM text dataset, using an HDF5 cache when ``outdir`` is given.

    If ``outdir`` is set and ``<outdir>/<basename(path)>.h5`` already exists,
    the cached data and statistics are read from it. Otherwise the text file
    is tokenized with `read_tokens`, counted with `count_tokens`, and (when
    ``outdir`` is set) written to that HDF5 file for later runs.

    Args:
        path (str): The path of an input text dataset file
        label_dict (dict[str, int]):
            dictionary that maps token label string to its ID number;
            must contain the key ``"<unk>"``
        outdir (str): The path of an output dir (created if missing);
            ``None`` disables the HDF5 cache

    Returns:
        tuple[list[np.ndarray], int, int]: Tuple of
            token IDs in np.int32 converted by `read_tokens`
            the number of tokens by `count_tokens`,
            and the number of OOVs by `count_tokens`
            (on a cache hit the values are whatever h5py returns for the
            stored datasets rather than the freshly computed objects)
    """
    if outdir is not None:
        os.makedirs(outdir, exist_ok=True)
        filename = outdir + "/" + os.path.basename(path) + ".h5"
        if os.path.exists(filename):
            logging.info(f"loading binary dataset: {filename}")
            # The datasets are read fully into memory inside the context
            # manager, so the file handle is released before returning.
            with h5py.File(filename, "r") as f:
                return f["data"][:], f["n_tokens"][()], f["n_oovs"][()]
    else:
        logging.info("skip dump/load HDF5 because the output dir is not specified")
    logging.info(f"reading text dataset: {path}")
    ret = read_tokens(path, label_dict)
    n_tokens, n_oovs = count_tokens(ret, label_dict["<unk>"])
    if outdir is not None:
        logging.info(f"saving binary dataset: {filename}")
        with h5py.File(filename, "w") as f:
            # Sentences have different lengths, so store them as
            # variable-length int32 rows.
            # http://docs.h5py.org/en/stable/special.html#arbitrary-vlen-data
            data = f.create_dataset(
                "data", (len(ret),), dtype=h5py.special_dtype(vlen=np.int32)
            )
            data[:] = ret
            f["n_tokens"] = n_tokens
            f["n_oovs"] = n_oovs
    return ret, n_tokens, n_oovs


def read_tokens(filename, label_dict):
    """Read tokens as a sequence of sentences

    Each line of the file is one sentence; it is split on whitespace and every
    token is mapped to its ID, falling back to the ``"<unk>"`` ID for tokens
    missing from ``label_dict``.

    :param str filename : The name of the input file (read as UTF-8)
    :param dict label_dict : dictionary that maps token label string to its ID number
    :return list of ID sequences (one np.int32 array per line)
    :rtype list
    """

    data = []
    unk = label_dict["<unk>"]
    # Use a context manager so the file handle is closed even if tokenizing
    # a line raises.
    with open(filename, "r", encoding="utf-8") as f:
        for ln in tqdm(f):
            data.append(
                np.array(
                    [label_dict.get(label, unk) for label in ln.split()],
                    dtype=np.int32,
                )
            )
    return data


def count_tokens(data, unk_id=None):
    """Tally the total number of tokens and of unknown tokens.

    Args:
        data (list[np.ndarray]): list of token ID sequences
        unk_id (int): ID of unknown token; when ``None`` no OOVs are counted

    Returns:
        tuple: ``(n_tokens, n_oovs)`` -- the summed sequence lengths and the
            number of entries equal to ``unk_id``

    """

    total = 0
    oov_total = 0
    track_oov = unk_id is not None
    for seq in data:
        total += len(seq)
        if track_oov:
            # element-wise comparison, then count the matching positions
            oov_total += np.count_nonzero(seq == unk_id)
    return total, oov_total


def compute_perplexity(result):
    """Add perplexity entries to a LogReport result dictionary in place.

    ``"perplexity"`` is ``exp(main/loss / main/count)``. When a validation
    loss is present, ``"val_perplexity"`` is ``exp(validation/main/loss)``.

    :param dict result: The current observations
    """
    train_loss_per_count = result["main/loss"] / result["main/count"]
    result["perplexity"] = np.exp(train_loss_per_count)
    if "validation/main/loss" not in result:
        return
    result["val_perplexity"] = np.exp(result["validation/main/loss"])


class ParallelSentenceIterator(chainer.dataset.Iterator):
    """Dataset iterator to create a batch of sentences.

    This iterator returns a pair of sentences, where one token is shifted
    between the sentences like '<sos> w1 w2 w3' and 'w1 w2 w3 <eos>'
    Sentence batches are made in order of longer sentences, and then
    randomly shuffled.

    Args:
        dataset: indexable collection of token ID sequences
        batch_size (int): maximum number of sentences per mini-batch
        max_length (int): if positive, batches that start with a sentence
            longer than this are shrunk proportionally
        sos (int): ID prepended to build the input sequence
        eos (int): ID appended to build the target sequence
        repeat (bool): if False, iteration stops after one pass
        shuffle (bool): if True, the order of the batches is shuffled once
            at construction (only when ``batch_size > 1``)
    """

    def __init__(
        self, dataset, batch_size, max_length=0, sos=0, eos=0, repeat=True, shuffle=True
    ):
        self.dataset = dataset
        self.batch_size = batch_size  # batch size
        # Number of completed sweeps over the dataset. In this case, it is
        # incremented if every word is visited at least once after the last
        # increment.
        self.epoch = 0
        # True if the epoch is incremented at the last iteration.
        self.is_new_epoch = False
        self.repeat = repeat
        length = len(dataset)
        self.batch_indices = []
        # make mini-batches
        if batch_size > 1:
            # longest sentences first, so each batch holds similar lengths
            indices = sorted(range(len(dataset)), key=lambda i: -len(dataset[i]))
            bs = 0
            while bs < length:
                be = min(bs + batch_size, length)
                # batch size is automatically reduced if the sentence length
                # is larger than max_length
                if max_length > 0:
                    sent_length = len(dataset[indices[bs]])
                    be = min(
                        be, bs + max(batch_size // (sent_length // max_length + 1), 1)
                    )
                self.batch_indices.append(np.array(indices[bs:be]))
                bs = be
            if shuffle:
                # shuffle batches
                random.shuffle(self.batch_indices)
        else:
            self.batch_indices = [np.array([i]) for i in range(length)]

        # NOTE: this is not a count of parameter updates. It is just a count of
        # calls of ``__next__``.
        self.iteration = 0
        self.sos = sos
        self.eos = eos
        # use -1 instead of None internally
        self._previous_epoch_detail = -1.0

    def __next__(self):
        """Return the next mini-batch as a list of (input, target) pairs."""
        # This iterator returns a list representing a mini-batch. Each item
        # indicates a sentence pair like '<sos> w1 w2 w3' and 'w1 w2 w3 <eos>'
        # represented by token IDs.
        n_batches = len(self.batch_indices)
        if not self.repeat and self.iteration >= n_batches:
            # If not self.repeat, this iterator stops at the end of the first
            # epoch (i.e., when all words are visited once).
            raise StopIteration

        batch = []
        for idx in self.batch_indices[self.iteration % n_batches]:
            batch.append(
                (
                    np.append([self.sos], self.dataset[idx]),
                    np.append(self.dataset[idx], [self.eos]),
                )
            )

        self._previous_epoch_detail = self.epoch_detail
        self.iteration += 1

        epoch = self.iteration // n_batches
        self.is_new_epoch = self.epoch < epoch
        if self.is_new_epoch:
            self.epoch = epoch

        return batch

    def start_shuffle(self):
        """Shuffle the order of the already-built mini-batches in place."""
        random.shuffle(self.batch_indices)

    @property
    def epoch_detail(self):
        # Floating point version of epoch.
        return self.iteration / len(self.batch_indices)

    @property
    def previous_epoch_detail(self):
        # ``None`` until ``__next__`` has been called at least once.
        if self._previous_epoch_detail < 0:
            return None
        return self._previous_epoch_detail

    def serialize(self, serializer):
        """Save or restore the iteration state through ``serializer``."""
        # It is important to serialize the state to be recovered on resume.
        self.iteration = serializer("iteration", self.iteration)
        self.epoch = serializer("epoch", self.epoch)
        try:
            self._previous_epoch_detail = serializer(
                "previous_epoch_detail", self._previous_epoch_detail
            )
        except KeyError:
            # guess previous_epoch_detail for older version.
            # ``__next__`` records ``epoch_detail`` just before incrementing
            # ``iteration``, so the previous value is one batch behind the
            # current one. (This class has no ``current_position`` attribute,
            # so the estimate is derived from ``iteration`` directly.)
            self._previous_epoch_detail = (self.iteration - 1) / len(
                self.batch_indices
            )
            if self.epoch_detail > 0:
                self._previous_epoch_detail = max(self._previous_epoch_detail, 0.0)
            else:
                self._previous_epoch_detail = -1.0


class MakeSymlinkToBestModel(extension.Extension):
    """Trainer extension that keeps a symlink pointing at the best model.

    Whenever the observed value under ``key`` is lower than any seen so far,
    ``<trainer.out>/<prefix>.<suffix>`` is (re)created as a symbolic link to
    ``<prefix>.<epoch>``.

    :param str key: Key of value
    :param str prefix: Prefix of model files and link target
    :param str suffix: Suffix of link target
    """

    def __init__(self, key, prefix="model", suffix="best"):
        super().__init__()
        self.key = key
        self.prefix = prefix
        self.suffix = suffix
        # -1 marks "no best model recorded yet"
        self.best_model = -1
        self.min_loss = 0.0

    def __call__(self, trainer):
        observed = trainer.observation
        if self.key not in observed:
            return
        loss = observed[self.key]
        # keep the current link unless this is the first value or a new minimum
        if self.best_model != -1 and not (loss < self.min_loss):
            return

        self.min_loss = loss
        self.best_model = trainer.updater.epoch
        link_target = "%s.%d" % (self.prefix, self.best_model)
        link_path = os.path.join(trainer.out, "%s.%s" % (self.prefix, self.suffix))
        # lexists also catches a dangling symlink left from an earlier run
        if os.path.lexists(link_path):
            os.remove(link_path)
        os.symlink(link_target, link_path)
        logging.info("best model is " + link_target)

    def serialize(self, serializer):
        if not isinstance(serializer, chainer.serializer.Serializer):
            # loading: take values from the snapshot, with defaults
            self.best_model = serializer("_best_model", -1)
            self.min_loss = serializer("_min_loss", 0.0)
            self.key = serializer("_key", "")
            self.prefix = serializer("_prefix", "model")
            self.suffix = serializer("_suffix", "best")
            return
        # saving: write the current state
        serializer("_best_model", self.best_model)
        serializer("_min_loss", self.min_loss)
        serializer("_key", self.key)
        serializer("_prefix", self.prefix)
        serializer("_suffix", self.suffix)


# TODO(Hori): currently it only works with character-word level LM.
#             need to consider any types of subwords-to-word mapping.
def make_lexical_tree(word_dict, subword_dict, word_unk):
    """Build a prefix tree over words, keyed by subword IDs.

    Every node is a list ``[children, word_id, id_range]`` where ``children``
    maps a subword ID to the next node, ``word_id`` is the ID of the word
    ending at that node (``-1`` if none) and ``id_range`` is the pair
    ``(min_word_id - 1, max_word_id)`` over the words passing through the
    node. The root is ``[children, -1, None]``.

    Words whose ID is not positive or equals ``word_unk`` are skipped, as are
    words containing a character missing from ``subword_dict``; both cases
    print a message.
    """
    root = [{}, -1, None]
    for w, wid in word_dict.items():
        if not (wid > 0 and wid != word_unk):  # <blank> and <unk>
            print(f"word {wid} is skipped")
            continue
        if any(c not in subword_dict for c in w):  # unknown subword
            print(f"{w} is skipped due to invalid subword")
            continue

        children = root[0]
        last_pos = len(w) - 1
        for pos, c in enumerate(w):
            cid = subword_dict[c]
            if cid in children:
                # widen the ID range covered by this existing prefix
                node = children[cid]
                low, high = node[2]
                node[2] = (min(low, wid - 1), max(high, wid))
            else:
                node = [{}, -1, (wid - 1, wid)]
                children[cid] = node
            if pos == last_pos:  # the word ends at this node
                node[1] = wid
            children = node[0]  # descend
    return root


================================================
FILE: lm/pytorch_backend/__init__.py
================================================
"""Initialize sub package."""


================================================
FILE: lm/pytorch_backend/extlm.py
================================================
#!/usr/bin/env python3

# Copyright 2018 Mitsubishi Electric Research Laboratories (Takaaki Hori)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)


import math

import torch
import torch.nn as nn
import torch.nn.functional as F

from espnet.lm.lm_utils import make_lexical_tree
from espnet.nets.pytorch_backend.nets_utils import to_device


# Definition of a multi-level (subword/word) language model
class MultiLevelLM(nn.Module):
    """Combine a subword-level LM with a word-level LM.

    Inside a word the score comes from the subword LM (scaled by
    ``subwordlm_weight``). At a word boundary (``<space>`` or ``<eos>``) the
    accumulated subword score is replaced by the word LM score, looked up
    through a lexical tree built by `make_lexical_tree`.

    Both ``wordlm`` and ``subwordlm`` are called as ``lm(state, x)`` and are
    expected to return ``(new_state, logits)``.

    Args:
        wordlm: word-level language model
        subwordlm: subword-level language model
        word_dict (dict): word -> ID map containing ``"<eos>"`` and ``"<unk>"``
        subword_dict (dict): subword -> ID map containing ``"<space>"`` and
            ``"<eos>"``
        subwordlm_weight (float): multiplier on the subword LM log-probs
        oov_penalty (float): its log is added to the ``<unk>`` word score
        open_vocab (bool): if False, paths leaving the lexical tree get
            ``logzero`` for every label
    """

    logzero = -10000000000.0
    zero = 1.0e-10

    def __init__(
        self,
        wordlm,
        subwordlm,
        word_dict,
        subword_dict,
        subwordlm_weight=0.8,
        oov_penalty=1.0,
        open_vocab=True,
    ):
        super(MultiLevelLM, self).__init__()
        self.wordlm = wordlm
        self.subwordlm = subwordlm
        self.word_eos = word_dict["<eos>"]
        self.word_unk = word_dict["<unk>"]
        self.var_word_eos = torch.LongTensor([self.word_eos])
        self.var_word_unk = torch.LongTensor([self.word_unk])
        self.space = subword_dict["<space>"]
        self.eos = subword_dict["<eos>"]
        # lexroot: [dict(subword->node), word_id,
        #           range of word_id with this prefix (start-1, end)]
        self.lexroot = make_lexical_tree(word_dict, subword_dict, self.word_unk)
        self.log_oov_penalty = math.log(oov_penalty)
        self.open_vocab = open_vocab
        self.subword_dict_size = len(subword_dict)
        self.subwordlm_weight = subwordlm_weight
        self.normalized = True

    def forward(self, state, x):
        """Advance the LM by one subword label.

        Args:
            state: ``None`` for the first call, otherwise the state tuple
                returned by the previous call
            x: tensor holding the current subword ID (converted with
                ``int(x)`` once a state exists)

        Returns:
            tuple: ``(new_state, log_y)`` where ``new_state`` is
                ``(clm_state, wlm_state, wlm_logprobs, node, log_y,
                clm_logprob)`` and ``log_y`` holds the scores over subwords
        """
        # update state with input label x
        if state is None:  # make initial states and log-prob vectors
            # NOTE(review): to_device presumably moves the tensor to the
            # device of x -- confirm against nets_utils.
            self.var_word_eos = to_device(x, self.var_word_eos)
            # must carry the <unk> ID; it is fed to the word LM for OOV words
            self.var_word_unk = to_device(x, self.var_word_unk)
            wlm_state, z_wlm = self.wordlm(None, self.var_word_eos)
            wlm_logprobs = F.log_softmax(z_wlm, dim=1)
            clm_state, z_clm = self.subwordlm(None, x)
            log_y = F.log_softmax(z_clm, dim=1) * self.subwordlm_weight
            new_node = self.lexroot
            clm_logprob = 0.0
            xi = self.space
        else:
            clm_state, wlm_state, wlm_logprobs, node, log_y, clm_logprob = state
            xi = int(x)
            if xi == self.space:  # inter-word transition
                if node is not None and node[1] >= 0:  # check if the node is word end
                    w = to_device(x, torch.LongTensor([node[1]]))
                else:  # this node is not a word end, which means <unk>
                    w = self.var_word_unk
                # update wordlm state and log-prob vector
                wlm_state, z_wlm = self.wordlm(wlm_state, w)
                wlm_logprobs = F.log_softmax(z_wlm, dim=1)
                new_node = self.lexroot  # move to the tree root
                clm_logprob = 0.0
            elif node is not None and xi in node[0]:  # intra-word transition
                new_node = node[0][xi]
                clm_logprob += log_y[0, xi]
            elif self.open_vocab:  # if no path in the tree, enter open-vocabulary mode
                new_node = None
                clm_logprob += log_y[0, xi]
            else:  # if open_vocab flag is disabled, return 0 probabilities
                log_y = to_device(
                    x, torch.full((1, self.subword_dict_size), self.logzero)
                )
                return (clm_state, wlm_state, wlm_logprobs, None, log_y, 0.0), log_y

            clm_state, z_clm = self.subwordlm(clm_state, x)
            log_y = F.log_softmax(z_clm, dim=1) * self.subwordlm_weight

        # apply word-level probabilities for <space> and <eos> labels
        if xi != self.space:
            if new_node is not None and new_node[1] >= 0:  # if new node is word end
                # swap the accumulated subword score for the word LM score
                wlm_logprob = wlm_logprobs[:, new_node[1]] - clm_logprob
            else:
                wlm_logprob = wlm_logprobs[:, self.word_unk] + self.log_oov_penalty
            log_y[:, self.space] = wlm_logprob
            log_y[:, self.eos] = wlm_logprob
        else:
            # directly after a word boundary, forbid another boundary label
            log_y[:, self.space] = self.logzero
            log_y[:, self.eos] = self.logzero

        return (
            (clm_state, wlm_state, wlm_logprobs, new_node, log_y, float(clm_logprob)),
            log_y,
        )

    def final(self, state):
        """Return the word LM log-probability of ``<eos>`` after the last word.

        Args:
            state: state tuple returned by the last `forward` call

        Returns:
            float: log-probability of the word-level ``<eos>``
        """
        clm_state, wlm_state, wlm_logprobs, node, log_y, clm_logprob = state
        if node is not None and node[1] >= 0:  # check if the node is word end
            w = to_device(wlm_logprobs, torch.LongTensor([node[1]]))
        else:  # this node is not a word end, which means <unk>
            w = self.var_word_unk
        wlm_state, z_wlm = self.wordlm(wlm_state, w)
        return float(F.log_softmax(z_wlm, dim=1)[:, self.word_eos])


# Definition of a look-ahead word language model
class LookAheadWordLM(nn.Module):
    """Word LM that scores subword labels by look-ahead over a lexical tree.

    The word LM distribution is turned into a cumulative sum, so the
    probability mass of all words sharing a prefix is the difference of two
    cumulative values taken from the ID range stored in the tree node.

    ``wordlm`` is called as ``wordlm(state, x)`` and is expected to return
    ``(new_state, logits)``.

    Args:
        wordlm: word-level language model
        word_dict (dict): word -> ID map containing ``"<eos>"`` and ``"<unk>"``
        subword_dict (dict): subword -> ID map containing ``"<space>"`` and
            ``"<eos>"``
        oov_penalty (float): multiplier on the ``<unk>`` probability that is
            used as the default score of every subword
        open_vocab (bool): if False, paths leaving the lexical tree get
            ``logzero`` for every label
    """

    logzero = -10000000000.0
    zero = 1.0e-10

    def __init__(
        self, wordlm, word_dict, subword_dict, oov_penalty=0.0001, open_vocab=True
    ):
        super(LookAheadWordLM, self).__init__()
        self.wordlm = wordlm
        self.word_eos = word_dict["<eos>"]
        self.word_unk = word_dict["<unk>"]
        self.var_word_eos = torch.LongTensor([self.word_eos])
        self.var_word_unk = torch.LongTensor([self.word_unk])
        self.space = subword_dict["<space>"]
        self.eos = subword_dict["<eos>"]
        # any node including lexroot:
        #   [dict(subword_id -> node), word_id,
        #    range of word ids prefixed with this node (start-1, end)]
        self.lexroot = make_lexical_tree(word_dict, subword_dict, self.word_unk)
        self.oov_penalty = oov_penalty
        self.open_vocab = open_vocab
        self.subword_dict_size = len(subword_dict)
        self.zero_tensor = torch.FloatTensor([self.zero])
        self.normalized = True

    def forward(self, state, x):
        """Advance the LM by one subword label.

        Args:
            state: ``None`` for the first call, otherwise the
                ``(wlm_state, cumsum_probs, node)`` tuple from the previous call
            x: tensor holding the current subword ID (converted with
                ``int(x)`` once a state exists)

        Returns:
            tuple: ``(new_state, log_y)`` where ``log_y`` holds the
                log-scores over subwords
        """
        # update state with input label x
        if state is None:  # make initial states and cumulative probability vector
            # NOTE(review): to_device presumably moves the tensor to the
            # device of x -- confirm against nets_utils.
            self.var_word_eos = to_device(x, self.var_word_eos)
            # must carry the <unk> ID; it is fed to the word LM for OOV words
            self.var_word_unk = to_device(x, self.var_word_unk)
            self.zero_tensor = to_device(x, self.zero_tensor)
            wlm_state, z_wlm = self.wordlm(None, self.var_word_eos)
            cumsum_probs = torch.cumsum(F.softmax(z_wlm, dim=1), dim=1)
            new_node = self.lexroot
            xi = self.space
        else:
            wlm_state, cumsum_probs, node = state
            xi = int(x)
            if xi == self.space:  # inter-word transition
                if node is not None and node[1] >= 0:  # check if the node is word end
                    w = to_device(x, torch.LongTensor([node[1]]))
                else:  # this node is not a word end, which means <unk>
                    w = self.var_word_unk
                # update wordlm state and cumulative probability vector
                wlm_state, z_wlm = self.wordlm(wlm_state, w)
                cumsum_probs = torch.cumsum(F.softmax(z_wlm, dim=1), dim=1)
                new_node = self.lexroot  # move to the tree root
            elif node is not None and xi in node[0]:  # intra-word transition
                new_node = node[0][xi]
            elif self.open_vocab:  # if no path in the tree, enter open-vocabulary mode
                new_node = None
            else:  # if open_vocab flag is disabled, return 0 probabilities
                log_y = to_device(
                    x, torch.full((1, self.subword_dict_size), self.logzero)
                )
                return (wlm_state, None, None), log_y

        if new_node is not None:
            succ, wid, wids = new_node
            # compute parent node probability
            # (the root has no ID range, so its mass is taken as 1.0)
            sum_prob = (
                (cumsum_probs[:, wids[1]] - cumsum_probs[:, wids[0]])
                if wids is not None
                else 1.0
            )
            if sum_prob < self.zero:
                # the prefix has (numerically) no mass: block every label
                log_y = to_device(
                    x, torch.full((1, self.subword_dict_size), self.logzero)
                )
                return (wlm_state, cumsum_probs, new_node), log_y
            # set <unk> probability as a default value
            unk_prob = (
                cumsum_probs[:, self.word_unk] - cumsum_probs[:, self.word_unk - 1]
            )
            y = to_device(
                x,
                torch.full(
                    (1, self.subword_dict_size), float(unk_prob) * self.oov_penalty
                ),
            )
            # compute transition probabilities to child nodes
            for cid, nd in succ.items():
                y[:, cid] = (
                    cumsum_probs[:, nd[2][1]] - cumsum_probs[:, nd[2][0]]
                ) / sum_prob
            # apply word-level probabilities for <space> and <eos> labels
            if wid >= 0:
                wlm_prob = (cumsum_probs[:, wid] - cumsum_probs[:, wid - 1]) / sum_prob
                y[:, self.space] = wlm_prob
                y[:, self.eos] = wlm_prob
            elif xi == self.space:
                # directly after a word boundary, suppress another boundary
                y[:, self.space] = self.zero
                y[:, self.eos] = self.zero
            log_y = torch.log(torch.max(y, self.zero_tensor))  # clip to avoid log(0)
        else:  # if no path in the tree, transition probability is one
            log_y = to_device(x, torch.zeros(1, self.subword_dict_size))
        return (wlm_state, cumsum_probs, new_node), log_y

    def final(self, state):
        """Return the word LM log-probability of ``<eos>`` after the last word.

        Args:
            state: ``(wlm_state, cumsum_probs, node)`` from the last
                `forward` call

        Returns:
            float: log-probability of the word-level ``<eos>``
        """
        wlm_state, cumsum_probs, node = state
        if node is not None and node[1] >= 0:  # check if the node is word end
            w = to_device(cumsum_probs, torch.LongTensor([node[1]]))
        else:  # this node is not a word end, which means <unk>
            w = self.var_word_unk
        wlm_state, z_wlm = self.wordlm(wlm_state, w)
        return float(F.log_softmax(z_wlm, dim=1)[:, self.word_eos])


================================================
FILE: lm/pytorch_backend/lm.py
================================================
#!/usr/bin/env python3
# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
# This code is ported from the following implementation written in Torch.
# https://github.com/chainer/chainer/blob/master/examples/ptb/train_ptb_custom_loop.py

"""LM training in pytorch."""

import copy
import json
import logging
import numpy as np

import torch
import torch.nn as nn
from torch.nn.parallel import data_parallel

from chainer import Chain
from chainer.dataset import convert
from chainer import reporter
from chainer import training
from chainer.training import extensions

from espnet.lm.lm_utils import count_tokens
from espnet.lm.lm_utils import load_dataset
from espnet.lm.lm_utils import MakeSymlinkToBestModel
from espnet.lm.lm_utils import ParallelSentenceIterator
from espnet.lm.lm_utils import read_tokens
from espnet.nets.lm_interface import dynamic_import_lm
from espnet.nets.lm_interface import LMInterface
from espnet.optimizer.factory import dynamic_import_optimizer
from espnet.scheduler.pytorch import PyTorchScheduler
from espnet.scheduler.scheduler import dynamic_import_scheduler

from espnet.asr.asr_utils import snapshot_object
from espnet.asr.asr_utils import torch_load
from espnet.asr.asr_utils import torch_resume
from espnet.asr.asr_utils import torch_snapshot

from espnet.utils.training.tensorboard_logger import TensorboardLogger
from tensorboardX import SummaryWriter

from espnet.utils.deterministic_utils import set_deterministic_pytorch
from espnet.utils.training.evaluator import BaseEvaluator
from espnet.utils.training.iterators import ShufflingEnabler
from espnet.utils.training.train_utils import check_early_stop
from espnet.utils.training.train_utils import set_early_stop


def compute_perplexity(result):
    """Add perplexity entries to a LogReport result dictionary in place.

    ``"perplexity"`` is ``exp(main/nll / main/count)``; when validation
    statistics are present, ``"val_perplexity"`` is computed the same way
    from ``validation/main/nll`` and ``validation/main/count``.

    :param dict result: The current observations
    """
    train_nll_per_token = result["main/nll"] / result["main/count"]
    result["perplexity"] = np.exp(train_nll_per_token)
    if "validation/main/nll" not in result:
        return
    valid_nll_per_token = (
        result["validation/main/nll"] / result["validation/main/count"]
    )
    result["val_perplexity"] = np.exp(valid_nll_per_token)


class Reporter(Chain):
    """Placeholder chain that lets a pytorch model work with chainer's trainer."""

    def report(self, loss):
        """Accept a loss value and intentionally do nothing with it."""
        return None


def concat_examples(batch, device=None, padding=None):
    """Stack a minibatch into input/target tensors.

    The batch is concatenated with chainer's ``convert.concat_examples`` and
    both resulting arrays are wrapped as torch tensors. They are moved to
    CUDA only when ``device`` is a non-negative id.

    :param np.ndarray batch: The batch to concatenate
    :param int device: The device to send to
    :param Tuple[int,int] padding: The padding to use
    :return: (inputs, targets)
    :rtype (torch.Tensor, torch.Tensor)
    """
    inputs_np, targets_np = convert.concat_examples(batch, padding=padding)
    inputs = torch.from_numpy(inputs_np)
    targets = torch.from_numpy(targets_np)
    if device is None or device < 0:
        # CPU: hand the tensors back as they are
        return inputs, targets
    return inputs.cuda(device), targets.cuda(device)


class BPTTUpdater(training.StandardUpdater):
    """Updater that runs one optimization step of a pytorch LM."""

    def __init__(
        self,
        train_iter,
        model,
        optimizer,
        schedulers,
        device,
        gradclip=None,
        use_apex=False,
        accum_grad=1,
    ):
        """Set up the updater.

        Args:
            train_iter (chainer.dataset.Iterator): The train iterator
            model (LMInterface) : The model to update
            optimizer (torch.optim.Optimizer): The optimizer for training
            schedulers (espnet.scheduler.scheduler.SchedulerInterface):
                The schedulers of `optimizer`
            device (int): The device id
            gradclip (float): The gradient clipping value to use
            use_apex (bool): The flag to use Apex in backprop.
            accum_grad (int): The number of gradient accumulation.

        """
        super().__init__(train_iter, optimizer)
        self.model = model
        self.device = device
        self.gradclip = gradclip
        self.use_apex = use_apex
        self.scheduler = PyTorchScheduler(schedulers, optimizer)
        self.accum_grad = accum_grad

    # The core part of the update routine can be customized by overriding.
    def update_core(self):
        """Run ``accum_grad`` forward/backward passes, then one optimizer step."""
        # StandardUpdater registers a single iterator/optimizer under 'main'.
        iterator = self.get_iterator("main")
        opt = self.get_optimizer("main")
        self.model.zero_grad()  # start from clean gradients

        primary_device = self.device[0]
        loss_total = 0.0
        nll_total = 0.0
        count_total = 0
        for _ in range(self.accum_grad):
            minibatch = next(iterator)
            # Pad inputs with 0 and targets with -100, then move to the device
            x, t = concat_examples(
                minibatch, device=primary_device, padding=(0, -100)
            )
            if primary_device == -1:
                loss, nll, count = self.model(x, t)
            else:
                # apex does not support torch.nn.DataParallel
                loss, nll, count = data_parallel(self.model, (x, t), self.device)

            # scale so the accumulated gradient matches a single large batch
            loss = loss.mean() / self.accum_grad
            if not self.use_apex:
                loss.backward()
            else:
                from apex import amp

                with amp.scale_loss(loss, opt) as scaled_loss:
                    scaled_loss.backward()

            # running statistics for the report
            loss_total += float(loss)
            nll_total += float(nll.sum())
            count_total += int(count.sum())

        stats = (("loss", loss_total), ("nll", nll_total), ("count", count_total))
        for name, value in stats:
            reporter.report({name: value}, opt.target)
        if self.gradclip is not None:
            nn.utils.clip_grad_norm_(self.model.parameters(), self.gradclip)
        opt.step()  # apply the accumulated gradients
        self.scheduler.step(n_iter=self.iteration)


class LMEvaluator(BaseEvaluator):
    """Evaluator that sums loss statistics of a pytorch LM over a dataset."""

    def __init__(self, val_iter, eval_model, reporter, device):
        """Set up the evaluator.

        :param chainer.dataset.Iterator val_iter : The validation iterator
        :param LMInterface eval_model : The model to evaluate
        :param chainer.Reporter reporter : The observations reporter
        :param int device : The device id to use

        """
        super().__init__(val_iter, reporter, device=-1)
        self.model = eval_model
        # assigned after the base initializer, which was given device=-1
        self.device = device

    def evaluate(self):
        """Run the model over the validation iterator and report the totals."""
        iterator = self.get_iterator("main")
        primary_device = self.device[0]
        loss_total = 0
        nll_total = 0
        count_total = 0

        self.model.eval()
        with torch.no_grad():
            # iterate over a copy so the stored iterator is left untouched
            for minibatch in copy.copy(iterator):
                x, t = concat_examples(
                    minibatch, device=primary_device, padding=(0, -100)
                )
                if primary_device == -1:
                    batch_loss, batch_nll, batch_count = self.model(x, t)
                else:
                    # apex does not support torch.nn.DataParallel
                    batch_loss, batch_nll, batch_count = data_parallel(
                        self.model, (x, t), self.device
                    )
                loss_total += float(batch_loss.sum())
                nll_total += float(batch_nll.sum())
                count_total += int(batch_count.sum())
        self.model.train()

        # report validation loss
        observation = {}
        stats = (("loss", loss_total), ("nll", nll_total), ("count", count_total))
        with reporter.report_scope(observation):
            for name, value in stats:
                reporter.report({name: value}, self.model.reporter)
        return observation


def train(args):
    """Train a language model with the given args.

    The model class is not passed in; it is resolved from ``args.model_module``
    and ``args.backend`` and must be a subclass of ``LMInterface``.
    After training, if ``args.test_label`` is set, the best model is reloaded
    and its perplexity on the test data is logged.

    :param Namespace args: The program arguments
    """
    model_class = dynamic_import_lm(args.model_module, args.backend)
    assert issubclass(model_class, LMInterface), "model should implement LMInterface"
    # display torch version
    logging.info("torch version = " + torch.__version__)

    set_deterministic_pytorch(args)

    # check cuda and cudnn availability
    if not torch.cuda.is_available():
        logging.warning("cuda is not available")

    # get special label ids
    unk = args.char_list_dict["<unk>"]
    eos = args.char_list_dict["<eos>"]
    # read tokens as a sequence of sentences
    val, n_val_tokens, n_val_oovs = load_dataset(
        args.valid_label, args.char_list_dict, args.dump_hdf5_path
    )
    train, n_train_tokens, n_train_oovs = load_dataset(
        args.train_label, args.char_list_dict, args.dump_hdf5_path
    )
    logging.info("#vocab = " + str(args.n_vocab))
    logging.info("#sentences in the training data = " + str(len(train)))
    logging.info("#tokens in the training data = " + str(n_train_tokens))
    logging.info(
        "oov rate in the training data = %.2f %%"
        % (n_train_oovs / n_train_tokens * 100)
    )
    logging.info("#sentences in the validation data = " + str(len(val)))
    logging.info("#tokens in the validation data = " + str(n_val_tokens))
    logging.info(
        "oov rate in the validation data = %.2f %%" % (n_val_oovs / n_val_tokens * 100)
    )

    # sortagrad is active when args.sortagrad is -1 or positive; while active the
    # training iterator is created without shuffling (see ShufflingEnabler below)
    use_sortagrad = args.sortagrad == -1 or args.sortagrad > 0
    # Create the dataset iterators
    # the per-iterator batch size is scaled by the number of GPUs (at least 1)
    batch_size = args.batchsize * max(args.ngpu, 1)
    if batch_size * args.accum_grad > args.batchsize:
        logging.info(
            f"batch size is automatically increased "
            f"({args.batchsize} -> {batch_size * args.accum_grad})"
        )
    # NOTE: the <eos> id is passed for both sos and eos
    train_iter = ParallelSentenceIterator(
        train,
        batch_size,
        max_length=args.maxlen,
        sos=eos,
        eos=eos,
        shuffle=not use_sortagrad,
    )
    val_iter = ParallelSentenceIterator(
        val, batch_size, max_length=args.maxlen, sos=eos, eos=eos, repeat=False
    )
    # one reported iteration covers accum_grad batches
    epoch_iters = int(len(train_iter.batch_indices) / args.accum_grad)
    logging.info("#iterations per epoch = %d" % epoch_iters)
    logging.info("#total iterations = " + str(args.epoch * epoch_iters))
    # Prepare the LM; plain float dtypes are applied directly, any other
    # train_dtype value (e.g. apex opt levels) falls back to float32 here
    if args.train_dtype in ("float16", "float32", "float64"):
        dtype = getattr(torch, args.train_dtype)
    else:
        dtype = torch.float32
    model = model_class(args.n_vocab, args).to(dtype=dtype)
    if args.ngpu > 0:
        model.to("cuda")
        gpu_id = list(range(args.ngpu))
    else:
        # -1 is the marker LMEvaluator checks to call the model directly
        gpu_id = [-1]

    # Save model conf to json
    model_conf = args.outdir + "/model.json"
    with open(model_conf, "wb") as f:
        logging.info("writing a model config file to " + model_conf)
        f.write(
            json.dumps(vars(args), indent=4, ensure_ascii=False, sort_keys=True).encode(
                "utf_8"
            )
        )

    # log total and trainable parameter counts
    logging.warning(
        "num. model params: {:,} (num. trained: {:,} ({:.1f}%))".format(
            sum(p.numel() for p in model.parameters()),
            sum(p.numel() for p in model.parameters() if p.requires_grad),
            sum(p.numel() for p in model.parameters() if p.requires_grad)
            * 100.0
            / sum(p.numel() for p in model.parameters()),
        )
    )

    # Set up an optimizer
    opt_class = dynamic_import_optimizer(args.opt, args.backend)
    optimizer = opt_class.from_args(model.parameters(), args)
    # args.schedulers, when given, is iterated as (key, scheduler name) pairs
    if args.schedulers is None:
        schedulers = []
    else:
        schedulers = [dynamic_import_scheduler(v)(k, args) for k, v in args.schedulers]

    # setup apex.amp
    if args.train_dtype in ("O0", "O1", "O2", "O3"):
        try:
            from apex import amp
        except ImportError as e:
            logging.error(
                f"You need to install apex for --train-dtype {args.train_dtype}. "
                "See https://github.com/NVIDIA/apex#linux"
            )
            raise e
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.train_dtype)
        use_apex = True
    else:
        use_apex = False

    # FIXME: TOO DIRTY HACK
    # attach a Reporter to the model and expose it as the optimizer's
    # "target"/"serialize" attributes, which the updater and snapshot code use
    reporter = Reporter()
    setattr(model, "reporter", reporter)
    setattr(optimizer, "target", reporter)
    setattr(optimizer, "serialize", lambda s: reporter.serialize(s))

    updater = BPTTUpdater(
        train_iter,
        model,
        optimizer,
        schedulers,
        gpu_id,
        gradclip=args.gradclip,
        use_apex=use_apex,
        accum_grad=args.accum_grad,
    )
    trainer = training.Trainer(updater, (args.epoch, "epoch"), out=args.outdir)
    trainer.extend(LMEvaluator(val_iter, model, reporter, device=gpu_id))
    # compute_perplexity post-processes each log entry before it is written
    trainer.extend(
        extensions.LogReport(
            postprocess=compute_perplexity,
            trigger=(args.report_interval_iters, "iteration"),
        )
    )
    trainer.extend(
        extensions.PrintReport(
            [
                "epoch",
                "iteration",
                "main/loss",
                "perplexity",
                "val_perplexity",
                "elapsed_time",
            ]
        ),
        trigger=(args.report_interval_iters, "iteration"),
    )
    trainer.extend(extensions.ProgressBar(update_interval=args.report_interval_iters))
    # Save a trainer snapshot and the model, both named after the epoch number
    trainer.extend(torch_snapshot(filename="snapshot.ep.{.updater.epoch}"))
    trainer.extend(snapshot_object(model, "rnnlm.model.{.updater.epoch}"))
    # T.Hori: MinValueTrigger should be used, but it fails when resuming
    trainer.extend(MakeSymlinkToBestModel("validation/main/loss", "rnnlm.model"))

    if use_sortagrad:
        # sortagrad == -1 means the enabler is triggered only at the last epoch
        trainer.extend(
            ShufflingEnabler([train_iter]),
            trigger=(args.sortagrad if args.sortagrad != -1 else args.epoch, "epoch"),
        )
    if args.resume:
        logging.info("resumed from %s" % args.resume)
        torch_resume(args.resume, trainer)

    set_early_stop(trainer, args, is_lm=True)
    if args.tensorboard_dir is not None and args.tensorboard_dir != "":
        writer = SummaryWriter(args.tensorboard_dir)
        trainer.extend(
            TensorboardLogger(writer), trigger=(args.report_interval_iters, "iteration")
        )

    trainer.run()
    check_early_stop(trainer, args.epoch)

    # compute perplexity for test set
    if args.test_label:
        logging.info("test the best model")
        # "rnnlm.model.best" is the name this function expects for the best
        # model; presumably created by MakeSymlinkToBestModel above - TODO confirm
        torch_load(args.outdir + "/rnnlm.model.best", model)
        test = read_tokens(args.test_label, args.char_list_dict)
        n_test_tokens, n_test_oovs = count_tokens(test, unk)
        logging.info("#sentences in the test data = " + str(len(test)))
        logging.info("#tokens in the test data = " + str(n_test_tokens))
        logging.info(
            "oov rate in the test data = %.2f %%" % (n_test_oovs / n_test_tokens * 100)
        )
        test_iter = ParallelSentenceIterator(
            test, batch_size, max_length=args.maxlen, sos=eos, eos=eos, repeat=False
        )
        # reuse the evaluator outside the trainer; compute_perplexity then adds
        # the "perplexity" entry to the returned observation
        evaluator = LMEvaluator(test_iter, model, reporter, device=gpu_id)
        result = evaluator()
        compute_perplexity(result)
        logging.info(f"test perplexity: {result['perplexity']}")


================================================
FILE: mt/__init__.py
================================================
"""Initialize sub package."""


================================================
FILE: mt/mt_utils.py
================================================
#!/usr/bin/env python3
# encoding: utf-8

# Copyright 2019 Kyoto University (Hirofumi Inaguma)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Utility funcitons for the text translation task."""

import logging


# * ------------------ recognition related ------------------ *
def parse_hypothesis(hyp, char_list):
    """Turn one hypothesis into its text, token and token-id strings plus score.

    The first element of ``hyp["yseq"]`` (the sos symbol) is dropped before
    the ids are mapped through ``char_list``.

    :param dict hyp: hypothesis with "yseq" (id sequence) and "score" entries
    :param list char_list: list of characters indexed by token id
    :return: text string (tokens concatenated, "<space>" replaced by " ")
    :return: token string (tokens joined by spaces)
    :return: tokenid string (ids joined by spaces)
    :return: hypothesis score as float
    """
    # skip the leading sos id and look every remaining id up in the vocabulary
    ids = [int(raw_id) for raw_id in hyp["yseq"][1:]]
    tokens = [char_list[token_id] for token_id in ids]
    score = float(hyp["score"])

    # build the three string views of the hypothesis
    tokenid = " ".join(map(str, ids))
    token = " ".join(tokens)
    joined = "".join(tokens)
    text = joined.replace("<space>", " ")

    return text, token, tokenid, score


def add_results_to_json(js, nbest_hyps, char_list):
    """Add N-best results to json.

    Each hypothesis becomes one entry of the returned ``"output"`` list. The
    entry starts as a copy of the ground-truth entry ``js["output"][0]`` (or
    ``{"name": ""}`` when there is none), gets ``"[n]"`` appended to its name
    and is extended with the recognition results. When a second output entry
    exists, its text/token/tokenid are copied as the source reference;
    utterances without it are handled instead of raising ``IndexError``.

    :param dict js: groundtruth utterance dict
    :param list nbest_hyps: list of hypothesis
    :param list char_list: list of characters
    :return: N-best results added utterance dict
    """
    # copy old json info
    new_js = dict()
    if "utt2spk" in js.keys():
        new_js["utt2spk"] = js["utt2spk"]
    new_js["output"] = []

    outputs = js["output"]
    # the source-side reference lives in the second output entry, if present
    has_source = len(outputs) > 1

    for n, hyp in enumerate(nbest_hyps, 1):
        # parse hypothesis
        rec_text, rec_token, rec_tokenid, score = parse_hypothesis(hyp, char_list)

        # copy ground-truth
        if len(outputs) > 0:
            out_dic = dict(outputs[0].items())
        else:
            out_dic = {"name": ""}

        # update name
        out_dic["name"] += "[%d]" % n

        # add recognition results
        out_dic["rec_text"] = rec_text
        out_dic["rec_token"] = rec_token
        out_dic["rec_tokenid"] = rec_tokenid
        out_dic["score"] = score

        # add source reference (only when the utterance actually carries one,
        # otherwise the ground-truth fallback above could never be reached)
        if has_source:
            source = outputs[1]
            out_dic["text_src"] = source["text"]
            out_dic["token_src"] = source["token"]
            out_dic["tokenid_src"] = source["tokenid"]

        # add to list of N-best result dicts
        new_js["output"].append(out_dic)

        # show 1-best result
        if n == 1:
            if "text" in out_dic.keys():
                logging.info("groundtruth: %s" % out_dic["text"])
            logging.info("prediction : %s" % out_dic["rec_text"])
            if has_source:
                logging.info("source : %s" % out_dic["token_src"])

    return new_js


================================================
FILE: mt/pytorch_backend/__init__.py
================================================
"""Initialize sub package."""


================================================
FILE: mt/pytorch_backend/mt.py
================================================
#!/usr/bin/env python3
# encoding: utf-8

# Copyright 2019 Kyoto University (Hirofumi Inaguma)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Training/decoding definition for the text translation task."""

import json
import logging
import os
import sys

from chainer import training
from chainer.training import extensions
import numpy as np
from tensorboardX import SummaryWriter
import torch

from espnet.asr.asr_utils import adadelta_eps_decay
from espnet.asr.asr_utils import adam_lr_decay
from espnet.asr.asr_utils import add_results_to_json
from espnet.asr.asr_utils import CompareValueTrigger
from espnet.asr.asr_utils import restore_snapshot
from espnet.asr.asr_utils import snapshot_object
from espnet.asr.asr_utils import torch_load
from espnet.asr.asr_utils import torch_resume
from espnet.asr.asr_utils import torch_snapshot
from espnet.nets.mt_interface import MTInterface
from espnet.nets.pytorch_backend.e2e_asr import pad_list
from espnet.utils.dataset import ChainerDataLoader
from espnet.utils.dataset import TransformDataset
from espnet.utils.deterministic_utils import set_deterministic_pytorch
from espnet.utils.dynamic_import import dynamic_import
from espnet.utils.io_utils import LoadInputsAndTargets
from espnet.utils.training.batchfy import make_batchset
from espnet.utils.training.iterators import ShufflingEnabler
from espnet.utils.training.tensorboard_logger import TensorboardLogger
from espnet.utils.training.train_utils import check_early_stop
from espnet.utils.training.train_utils import set_early_stop

from espnet.asr.pytorch_backend.asr import CustomEvaluator
from espnet.asr.pytorch_backend.asr import CustomUpdater
from espnet.asr.pytorch_backend.asr import load_trained_model

import matplotlib

# Select the "Agg" matplotlib backend as soon as this module is imported.
matplotlib.use("Agg")

# Python 2 exposes this helper as izip_longest; alias it so the rest of the
# module can use the name zip_longest on either major version.
if sys.version_info[0] == 2:
    from itertools import izip_longest as zip_longest
else:
    from itertools import zip_longest as zip_longest


class CustomConverter(object):
    """Batch converter that pads source/target id sequences into tensors."""

    def __init__(self):
        """Set the padding values used for sources and targets."""
        # NOTE: index 0 is reserved for <pad> here although ASR reserves it for
        # the blank class. Blank labels are not used in NMT, so index 0 is
        # reused for padding instead of adding one more class to the vocabulary.
        self.pad = 0
        self.ignore_id = -1

    def __call__(self, batch, device=torch.device("cpu")):
        """Pad one wrapped mini-batch and move the tensors to a device.

        Args:
            batch (list): Single-element list holding the ``(xs, ys)`` pair.
            device (torch.device): The device to send the tensors to.

        Returns:
            tuple(torch.Tensor, torch.Tensor, torch.Tensor): Padded sources
                (filled with ``self.pad``), source lengths and padded targets
                (filled with ``self.ignore_id``).

        """
        # the real batch is wrapped in a list of length one
        assert len(batch) == 1
        sources, targets = batch[0]

        def to_padded(seqs, fill_value):
            # numpy arrays -> long tensors -> one padded tensor on the device
            as_tensors = [torch.from_numpy(seq).long() for seq in seqs]
            return pad_list(as_tensors, fill_value).to(device)

        # first-axis length of every source sequence
        src_lengths = np.array([src.shape[0] for src in sources])

        xs_pad = to_padded(sources, self.pad)
        ilens = torch.from_numpy(src_lengths).to(device)
        ys_pad = to_padded(targets, self.ignore_id)

        return xs_pad, ilens, ys_pad


def train(args):
    """Train with the given args.

    Builds the model named by ``args.model_module``, writes its configuration
    to ``<outdir>/model.json``, prepares the optimizer, the training and
    validation iterators and a chainer ``Trainer`` with evaluation, plotting,
    snapshot and reporting extensions, then runs the training loop.

    Args:
        args (namespace): The program arguments.

    """
    set_deterministic_pytorch(args)

    # check cuda availability
    if not torch.cuda.is_available():
        logging.warning("cuda is not available")

    # get input and output dimension info
    # (taken from the first validation utterance: output[1] gives the input
    # dimension and output[0] the output dimension)
    with open(args.valid_json, "rb") as f:
        valid_json = json.load(f)["utts"]
    utts = list(valid_json.keys())
    idim = int(valid_json[utts[0]]["output"][1]["shape"][1])
    odim = int(valid_json[utts[0]]["output"][0]["shape"][1])
    logging.info("#input dims : " + str(idim))
    logging.info("#output dims: " + str(odim))

    # specify model architecture
    model_class = dynamic_import(args.model_module)
    model = model_class(idim, odim, args)
    assert isinstance(model, MTInterface)

    # write model config
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)
    model_conf = args.outdir + "/model.json"
    with open(model_conf, "wb") as f:
        logging.info("writing a model config file to " + model_conf)
        f.write(
            json.dumps(
                (idim, odim, vars(args)), indent=4, ensure_ascii=False, sort_keys=True
            ).encode("utf_8")
        )
    for key in sorted(vars(args).keys()):
        logging.info("ARGS: " + key + ": " + str(vars(args)[key]))

    reporter = model.reporter

    # check the use of multi-gpu
    # (a non-zero batch size is multiplied by the number of GPUs)
    if args.ngpu > 1:
        if args.batch_size != 0:
            logging.warning(
                "batch size is automatically increased (%d -> %d)"
                % (args.batch_size, args.batch_size * args.ngpu)
            )
            args.batch_size *= args.ngpu

    # set torch device
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    # plain float dtypes are applied directly; any other train_dtype value
    # (e.g. apex opt levels) falls back to float32 here
    if args.train_dtype in ("float16", "float32", "float64"):
        dtype = getattr(torch, args.train_dtype)
    else:
        dtype = torch.float32
    model = model.to(device=device, dtype=dtype)

    # log total and trainable parameter counts
    logging.warning(
        "num. model params: {:,} (num. trained: {:,} ({:.1f}%))".format(
            sum(p.numel() for p in model.parameters()),
            sum(p.numel() for p in model.parameters() if p.requires_grad),
            sum(p.numel() for p in model.parameters() if p.requires_grad)
            * 100.0
            / sum(p.numel() for p in model.parameters()),
        )
    )

    # Setup an optimizer
    if args.opt == "adadelta":
        optimizer = torch.optim.Adadelta(
            model.parameters(), rho=0.95, eps=args.eps, weight_decay=args.weight_decay
        )
    elif args.opt == "adam":
        optimizer = torch.optim.Adam(
            model.parameters(), lr=args.lr, weight_decay=args.weight_decay
        )
    elif args.opt == "noam":
        from espnet.nets.pytorch_backend.transformer.optimizer import get_std_opt

        optimizer = get_std_opt(
            model.parameters(),
            args.adim,
            args.transformer_warmup_steps,
            args.transformer_lr,
        )
    else:
        raise NotImplementedError("unknown optimizer: " + args.opt)

    # setup apex.amp
    if args.train_dtype in ("O0", "O1", "O2", "O3"):
        try:
            from apex import amp
        except ImportError as e:
            logging.error(
                f"You need to install apex for --train-dtype {args.train_dtype}. "
                "See https://github.com/NVIDIA/apex#linux"
            )
            raise e
        # for "noam" the wrapped inner optimizer is the one handed to amp
        if args.opt == "noam":
            model, optimizer.optimizer = amp.initialize(
                model, optimizer.optimizer, opt_level=args.train_dtype
            )
        else:
            model, optimizer = amp.initialize(
                model, optimizer, opt_level=args.train_dtype
            )
        use_apex = True
    else:
        use_apex = False

    # FIXME: TOO DIRTY HACK
    # expose the model's reporter through the optimizer's "target"/"serialize"
    # attributes
    setattr(optimizer, "target", reporter)
    setattr(optimizer, "serialize", lambda s: reporter.serialize(s))

    # Setup a converter
    converter = CustomConverter()

    # read json data
    # NOTE(review): valid_json was already loaded from the same file above;
    # this second read looks redundant
    with open(args.train_json, "rb") as f:
        train_json = json.load(f)["utts"]
    with open(args.valid_json, "rb") as f:
        valid_json = json.load(f)["utts"]

    # sortagrad is active when args.sortagrad is -1 or positive
    use_sortagrad = args.sortagrad == -1 or args.sortagrad > 0
    # make minibatch list (variable length)
    train = make_batchset(
        train_json,
        args.batch_size,
        args.maxlen_in,
        args.maxlen_out,
        args.minibatches,
        min_batch_size=args.ngpu if args.ngpu > 1 else 1,
        shortest_first=use_sortagrad,
        count=args.batch_count,
        batch_bins=args.batch_bins,
        batch_frames_in=args.batch_frames_in,
        batch_frames_out=args.batch_frames_out,
        batch_frames_inout=args.batch_frames_inout,
        mt=True,
        iaxis=1,
        oaxis=0,
    )
    valid = make_batchset(
        valid_json,
        args.batch_size,
        args.maxlen_in,
        args.maxlen_out,
        args.minibatches,
        min_batch_size=args.ngpu if args.ngpu > 1 else 1,
        count=args.batch_count,
        batch_bins=args.batch_bins,
        batch_frames_in=args.batch_frames_in,
        batch_frames_out=args.batch_frames_out,
        batch_frames_inout=args.batch_frames_inout,
        mt=True,
        iaxis=1,
        oaxis=0,
    )

    load_tr = LoadInputsAndTargets(mode="mt", load_output=True)
    load_cv = LoadInputsAndTargets(mode="mt", load_output=True)
    # hack to make the batch size argument 1:
    # the actual batch is contained in a list, so each dataset item is
    # already a full minibatch.
    # The default collate function converts numpy arrays to pytorch tensors;
    # a trivial collate function returning the single item is used instead.
    train_iter = ChainerDataLoader(
        dataset=TransformDataset(train, lambda data: converter([load_tr(data)])),
        batch_size=1,
        num_workers=args.n_iter_processes,
        shuffle=not use_sortagrad,
        collate_fn=lambda x: x[0],
    )
    valid_iter = ChainerDataLoader(
        dataset=TransformDataset(valid, lambda data: converter([load_cv(data)])),
        batch_size=1,
        shuffle=False,
        collate_fn=lambda x: x[0],
        num_workers=args.n_iter_processes,
    )

    # Set up a trainer
    updater = CustomUpdater(
        model,
        args.grad_clip,
        {"main": train_iter},
        optimizer,
        device,
        args.ngpu,
        False,
        args.accum_grad,
        use_apex=use_apex,
    )
    trainer = training.Trainer(updater, (args.epochs, "epoch"), out=args.outdir)

    if use_sortagrad:
        # sortagrad == -1 means the enabler is triggered only at the last epoch
        trainer.extend(
            ShufflingEnabler([train_iter]),
            trigger=(args.sortagrad if args.sortagrad != -1 else args.epochs, "epoch"),
        )

    # Resume from a snapshot
    if args.resume:
        logging.info("resumed from %s" % args.resume)
        torch_resume(args.resume, trainer)

    # Evaluate the model on the validation data: every save_interval_iters
    # iterations when that is positive, otherwise with the extension's
    # default trigger
    if args.save_interval_iters > 0:
        trainer.extend(
            CustomEvaluator(model, {"main": valid_iter}, reporter, device, args.ngpu),
            trigger=(args.save_interval_iters, "iteration"),
        )
    else:
        trainer.extend(
            CustomEvaluator(model, {"main": valid_iter}, reporter, device, args.ngpu)
        )

    # Save attention weight each epoch
    if args.num_save_attention > 0:
        # NOTE: sort it by output lengths
        # (the first num_save_attention validation utterances, longest first)
        data = sorted(
            list(valid_json.items())[: args.num_save_attention],
            key=lambda x: int(x[1]["output"][0]["shape"][0]),
            reverse=True,
        )
        # use the wrapped module's methods when the model has a "module" attribute
        if hasattr(model, "module"):
            att_vis_fn = model.module.calculate_all_attentions
            plot_class = model.module.attention_plot_class
        else:
            att_vis_fn = model.calculate_all_attentions
            plot_class = model.attention_plot_class
        att_reporter = plot_class(
            att_vis_fn,
            data,
            args.outdir + "/att_ws",
            converter=converter,
            transform=load_cv,
            device=device,
            ikey="output",
            iaxis=1,
        )
        trainer.extend(att_reporter, trigger=(1, "epoch"))
    else:
        att_reporter = None

    # Make a plot for training and validation values
    trainer.extend(
        extensions.PlotReport(
            ["main/loss", "validation/main/loss"], "epoch", file_name="loss.png"
        )
    )
    trainer.extend(
        extensions.PlotReport(
            ["main/acc", "validation/main/acc"], "epoch", file_name="acc.png"
        )
    )
    trainer.extend(
        extensions.PlotReport(
            ["main/ppl", "validation/main/ppl"], "epoch", file_name="ppl.png"
        )
    )
    trainer.extend(
        extensions.PlotReport(
            ["main/bleu", "validation/main/bleu"], "epoch", file_name="bleu.png"
        )
    )

    # Save best models
    # (lowest validation loss and highest validation accuracy respectively)
    trainer.extend(
        snapshot_object(model, "model.loss.best"),
        trigger=training.triggers.MinValueTrigger("validation/main/loss"),
    )
    trainer.extend(
        snapshot_object(model, "model.acc.best"),
        trigger=training.triggers.MaxValueTrigger("validation/main/acc"),
    )

    # save snapshot which contains model and optimizer states
    if args.save_interval_iters > 0:
        trainer.extend(
            torch_snapshot(filename="snapshot.iter.{.updater.iteration}"),
            trigger=(args.save_interval_iters, "iteration"),
        )
    else:
        trainer.extend(torch_snapshot(), trigger=(1, "epoch"))

    # epsilon decay in the optimizer
    # For adadelta/adam: when the monitored validation value gets worse than
    # the best one (lower accuracy or higher loss, depending on
    # args.criterion), the best model is restored and eps / lr is decayed.
    if args.opt == "adadelta":
        if args.criterion == "acc":
            trainer.extend(
                restore_snapshot(
                    model, args.outdir + "/model.acc.best", load_fn=torch_load
                ),
                trigger=CompareValueTrigger(
                    "validation/main/acc",
                    lambda best_value, current_value: best_value > current_value,
                ),
            )
            trainer.extend(
                adadelta_eps_decay(args.eps_decay),
                trigger=CompareValueTrigger(
                    "validation/main/acc",
                    lambda best_value, current_value: best_value > current_value,
                ),
            )
        elif args.criterion == "loss":
            trainer.extend(
                restore_snapshot(
                    model, args.outdir + "/model.loss.best", load_fn=torch_load
                ),
                trigger=CompareValueTrigger(
                    "validation/main/loss",
                    lambda best_value, current_value: best_value < current_value,
                ),
            )
            trainer.extend(
                adadelta_eps_decay(args.eps_decay),
                trigger=CompareValueTrigger(
                    "validation/main/loss",
                    lambda best_value, current_value: best_value < current_value,
                ),
            )
    elif args.opt == "adam":
        if args.criterion == "acc":
            trainer.extend(
                restore_snapshot(
                    model, args.outdir + "/model.acc.best", load_fn=torch_load
                ),
                trigger=CompareValueTrigger(
                    "validation/main/acc",
                    lambda best_value, current_value: best_value > current_value,
                ),
            )
            trainer.extend(
                adam_lr_decay(args.lr_decay),
                trigger=CompareValueTrigger(
                    "validation/main/acc",
                    lambda best_value, current_value: best_value > current_value,
                ),
            )
        elif args.criterion == "loss":
            trainer.extend(
                restore_snapshot(
                    model, args.outdir + "/model.loss.best", load_fn=torch_load
                ),
                trigger=CompareValueTrigger(
                    "validation/main/loss",
                    lambda best_value, current_value: best_value < current_value,
                ),
            )
            trainer.extend(
                adam_lr_decay(args.lr_decay),
                trigger=CompareValueTrigger(
                    "validation/main/loss",
                    lambda best_value, current_value: best_value < current_value,
                ),
            )

    # Write a log of evaluation statistics every report_interval_iters iterations
    trainer.extend(
        extensions.LogReport(trigger=(args.report_interval_iters, "iteration"))
    )
    report_keys = [
        "epoch",
        "iteration",
        "main/loss",
        "validation/main/loss",
        "main/acc",
        "validation/main/acc",
        "main/ppl",
        "validation/main/ppl",
        "elapsed_time",
    ]
    # additionally observe the optimizer hyper-parameter that changes during
    # training: eps for adadelta, lr for adam/noam
    if args.opt == "adadelta":
        trainer.extend(
            extensions.observe_value(
                "eps",
                lambda trainer: trainer.updater.get_optimizer("main").param_groups[0][
                    "eps"
                ],
            ),
            trigger=(args.report_interval_iters, "iteration"),
        )
        report_keys.append("eps")
    elif args.opt in ["adam", "noam"]:
        trainer.extend(
            extensions.observe_value(
                "lr",
                lambda trainer: trainer.updater.get_optimizer("main").param_groups[0][
                    "lr"
                ],
            ),
            trigger=(args.report_interval_iters, "iteration"),
        )
        report_keys.append("lr")
    if args.report_bleu:
        report_keys.append("main/bleu")
        report_keys.append("validation/main/bleu")
    trainer.extend(
        extensions.PrintReport(report_keys),
        trigger=(args.report_interval_iters, "iteration"),
    )

    trainer.extend(extensions.ProgressBar(update_interval=args.report_interval_iters))
    set_early_stop(trainer, args)

    # optional tensorboard logging (att_reporter is None when attention
    # plotting is disabled)
    if args.tensorboard_dir is not None and args.tensorboard_dir != "":
        trainer.extend(
            TensorboardLogger(SummaryWriter(args.tensorboard_dir), att_reporter),
            trigger=(args.report_interval_iters, "iteration"),
        )
    # Run the training
    trainer.run()
    check_early_stop(trainer, args.epochs)


def trans(args):
    """Decode with the given args.

    Loads the trained model named by ``args.model``, translates every
    non-empty utterance of ``args.trans_json`` (one by one when
    ``args.batchsize`` is 0, otherwise in length-sorted batches) and writes
    the N-best results as JSON to ``args.result_label``.

    Args:
        args (namespace): The program arguments.

    """
    set_deterministic_pytorch(args)
    model, train_args = load_trained_model(args.model)
    assert isinstance(model, MTInterface)
    model.trans_args = args

    # gpu
    if args.ngpu == 1:
        gpu_id = list(range(args.ngpu))
        logging.info("gpu id: " + str(gpu_id))
        model.cuda()

    # read json data
    with open(args.trans_json, "rb") as f:
        js = json.load(f)["utts"]
    new_js = {}

    # remove empty utterances: both output entries must be longer than the
    # threshold, which is 1 for multilingual models and 0 otherwise
    min_len = 1 if train_args.multilingual else 0
    js = {
        k: v
        for k, v in js.items()
        if v["output"][0]["shape"][0] > min_len
        and v["output"][1]["shape"][0] > min_len
    }

    if args.batchsize == 0:
        with torch.no_grad():
            for idx, name in enumerate(js.keys(), 1):
                # pass the utterance id as a logging argument instead of
                # splicing it into the format string, so ids containing "%"
                # cannot corrupt the message formatting
                logging.info("(%d/%d) decoding %s", idx, len(js), name)
                feat = [js[name]["output"][1]["tokenid"].split()]
                nbest_hyps = model.translate(feat, args, train_args.char_list)
                new_js[name] = add_results_to_json(
                    js[name], nbest_hyps, train_args.char_list
                )

    else:

        def grouper(n, iterable, fillvalue=None):
            # yield tuples of n items; the last one is padded with fillvalue
            kargs = [iter(iterable)] * n
            return zip_longest(*kargs, fillvalue=fillvalue)

        # sort data by source length, longest first (stable for equal lengths)
        keys = sorted(js.keys(), key=lambda key: -js[key]["output"][1]["shape"][0])

        with torch.no_grad():
            for names in grouper(args.batchsize, keys, None):
                # drop the padding entries of the last group
                names = [name for name in names if name]
                feats = [
                    np.fromiter(
                        map(int, js[name]["output"][1]["tokenid"].split()),
                        dtype=np.int64,
                    )
                    for name in names
                ]
                nbest_hyps = model.translate_batch(
                    feats,
                    args,
                    train_args.char_list,
                )

                for name, nbest_hyp in zip(names, nbest_hyps):
                    new_js[name] = add_results_to_json(
                        js[name], nbest_hyp, train_args.char_list
                    )

    with open(args.result_label, "wb") as f:
        f.write(
            json.dumps(
                {"utts": new_js}, indent=4, ensure_ascii=False, sort_keys=True
            ).encode("utf_8")
        )


================================================
FILE: nets/__init__.py
================================================
"""Initialize sub package."""


================================================
FILE: nets/asr_interface.py
================================================
"""ASR Interface module."""
import argparse

from espnet.bin.asr_train import get_parser
from espnet.utils.dynamic_import import dynamic_import
from espnet.utils.fill_missing_args import fill_missing_args


class ASRInterface:
    """ASR Interface for ESPnet model implementation.

    Model classes derive from this interface and override the methods they
    support. The model-specific methods defined here (``forward``,
    ``recognize``, ``encode``, ...) raise ``NotImplementedError``.
    """

    @staticmethod
    def add_arguments(parser):
        """Add model-specific arguments to parser.

        The base implementation registers nothing and returns the parser
        unchanged.
        """
        return parser

    @classmethod
    def build(cls, idim: int, odim: int, **kwargs):
        """Initialize this class with python-level args.

        Keyword arguments are wrapped in a namespace; options that are missing
        are filled first from the ASR training parser and then from
        ``cls.add_arguments``.

        Args:
            idim (int): The number of an input feature dim.
            odim (int): The number of output vocab.
            **kwargs: Model options, used as namespace attributes.

        Returns:
            ASRinterface: A new instance of ASRInterface.

        """

        def wrap(parser):
            # required=False so that options not given in kwargs fall back to
            # their parser defaults instead of raising
            return get_parser(parser, required=False)

        args = argparse.Namespace(**kwargs)
        args = fill_missing_args(args, wrap)
        args = fill_missing_args(args, cls.add_arguments)
        return cls(idim, odim, args)

    def forward(self, xs, ilens, ys):
        """Compute loss for training.

        :param xs:
            For pytorch, batch of padded source sequences torch.Tensor (B, Tmax, idim)
            For chainer, list of source sequences chainer.Variable
        :param ilens: batch of lengths of source sequences (B)
            For pytorch, torch.Tensor
            For chainer, list of int
        :param ys:
            For pytorch, batch of padded target sequences torch.Tensor (B, Lmax)
            For chainer, list of target sequences chainer.Variable
        :return: loss value
        :rtype: torch.Tensor for pytorch, chainer.Variable for chainer
        """
        raise NotImplementedError("forward method is not implemented")

    def recognize(self, x, recog_args, char_list=None, rnnlm=None):
        """Recognize x for evaluation.

        :param ndarray x: input acoustic feature (B, T, D) or (T, D)
        :param namespace recog_args: argument namespace containing options
        :param list char_list: list of characters
        :param torch.nn.Module rnnlm: language model module
        :return: N-best decoding results
        :rtype: list
        """
        raise NotImplementedError("recognize method is not implemented")

    def recognize_batch(self, x, recog_args, char_list=None, rnnlm=None):
        """Beam search implementation for batch.

        :param torch.Tensor x: encoder hidden state sequences (B, Tmax, Henc)
        :param namespace recog_args: argument namespace containing options
        :param list char_list: list of characters
        :param torch.nn.Module rnnlm: language model module
        :return: N-best decoding results
        :rtype: list
        """
        raise NotImplementedError("Batch decoding is not supported yet.")

    def calculate_all_attentions(self, xs, ilens, ys):
        """Calculate attention.

        :param list xs: list of padded input sequences [(T1, idim), (T2, idim), ...]
        :param ndarray ilens: batch of lengths of input sequences (B)
        :param list ys: list of character id sequence tensor [(L1), (L2), (L3), ...]
        :return: attention weights (B, Lmax, Tmax)
        :rtype: float ndarray
        """
        raise NotImplementedError("calculate_all_attentions method is not implemented")

    def calculate_all_ctc_probs(self, xs, ilens, ys):
        """Calculate CTC probability.

        :param list xs: list of padded input sequences [(T1, idim), (T2, idim), ...]
        :param ndarray ilens: batch of lengths of input sequences (B)
        :param list ys: list of character id sequence tensor [(L1), (L2), (L3), ...]
        :return: CTC probabilities (B, Tmax, vocab)
        :rtype: float ndarray
        """
        raise NotImplementedError("calculate_all_ctc_probs method is not implemented")

    @property
    def attention_plot_class(self):
        """Get attention plot class."""
        # imported lazily, only when the property is actually read
        from espnet.asr.asr_utils import PlotAttentionReport

        return PlotAttentionReport

    @property
    def ctc_plot_class(self):
        """Get CTC plot class."""
        # imported lazily, only when the property is actually read
        from espnet.asr.asr_utils import PlotCTCReport

        return PlotCTCReport

    def get_total_subsampling_factor(self):
        """Get total subsampling factor."""
        raise NotImplementedError(
            "get_total_subsampling_factor method is not implemented"
        )

    def encode(self, feat):
        """Encode feature in `beam_search` (optional).

        Args:
            feat (numpy.ndarray): input feature (T, D)
        Returns:
            torch.Tensor for pytorch, chainer.Variable for chainer:
                encoded feature (T, D)

        """
        raise NotImplementedError("encode method is not implemented")

    def scorers(self):
        """Get scorers for `beam_search` (optional).

        Returns:
            dict[str, ScorerInterface]: dict of `ScorerInterface` objects

        """
        # the message names this method so the failing call is easy to locate
        raise NotImplementedError("scorers method is not implemented")


# Short model aliases accepted by `dynamic_import_asr`, grouped by backend name.
# Each value is an import path in "module_name:class_name" form.
predefined_asr = {
    "pytorch": {
        "rnn": "espnet.nets.pytorch_backend.e2e_asr:E2E",
        "transducer": "espnet.nets.pytorch_backend.e2e_asr_transducer:E2E",
        "transformer": "espnet.nets.pytorch_backend.e2e_asr_transformer:E2E",
        "conformer": "espnet.nets.pytorch_backend.e2e_asr_conformer:E2E",
    },
    "chainer": {
        "rnn": "espnet.nets.chainer_backend.e2e_asr:E2E",
        "transformer": "espnet.nets.chainer_backend.e2e_asr_transformer:E2E",
    },
}


def dynamic_import_asr(module, backend):
    """Resolve an ASR model class from an import path or a backend alias.

    Args:
        module (str): module_name:class_name or alias in `predefined_asr`
        backend (str): NN backend. e.g., pytorch, chainer
            An unknown backend simply has no aliases.

    Returns:
        type: ASR class, checked to be a subclass of `ASRInterface`

    """
    backend_aliases = predefined_asr.get(backend, dict())
    model_class = dynamic_import(module, backend_aliases)
    implements_interface = issubclass(model_class, ASRInterface)
    assert implements_interface, f"{module} does not implement ASRInterface"
    return model_class


================================================
FILE: nets/batch_beam_search.py
================================================
"""Parallel beam search module."""

import logging
from typing import Any
from typing import Dict
from typing import List
from typing import NamedTuple
from typing import Tuple

import torch
from torch.nn.utils.rnn import pad_sequence

from espnet.nets.beam_search import BeamSearch
from espnet.nets.beam_search import Hypothesis


class BatchHypothesis(NamedTuple):
    """Batchfied/Vectorized hypothesis data type.

    Every field has an empty default, so `BatchHypothesis()` is an empty
    batch whose `len()` is 0.
    """

    # NOTE: the default tensors/dicts below are created once, when the class is
    # defined, and are shared by every instance built without arguments;
    # do not mutate them in place.
    yseq: torch.Tensor = torch.tensor([])  # (batch, maxlen)
    score: torch.Tensor = torch.tensor([])  # (batch,)
    length: torch.Tensor = torch.tensor([])  # (batch,)
    scores: Dict[str, torch.Tensor] = dict()  # values: (batch,)
    # scorer states keyed by scorer name
    states: Dict[str, Dict] = dict()

    def __len__(self) -> int:
        """Return a batch size."""
        # the batch size is taken from the first dimension of `length`
        return len(self.length)


class BatchBeamSearch(BeamSearch):
    """Batch beam search implementation."""

    def batchfy(self, hyps: List[Hypothesis]) -> BatchHypothesis:
        """Stack a list of hypotheses into one `BatchHypothesis`.

        Token sequences are right-padded with `self.eos`; the true lengths are
        kept in `length`. An empty list gives an empty `BatchHypothesis`.
        """
        if not hyps:
            return BatchHypothesis()

        padded_yseq = pad_sequence(
            [hyp.yseq for hyp in hyps], batch_first=True, padding_value=self.eos
        )
        lengths = torch.tensor([len(hyp.yseq) for hyp in hyps], dtype=torch.int64)
        total_scores = torch.tensor([hyp.score for hyp in hyps])

        # collect per-scorer scores and states for the registered scorers only
        stacked_scores = {}
        for name in self.scorers:
            stacked_scores[name] = torch.tensor([hyp.scores[name] for hyp in hyps])
        listed_states = {}
        for name in self.scorers:
            listed_states[name] = [hyp.states[name] for hyp in hyps]

        return BatchHypothesis(
            yseq=padded_yseq,
            score=total_scores,
            length=lengths,
            scores=stacked_scores,
            states=listed_states,
        )

    def _batch_select(self, hyps: BatchHypothesis, ids: List[int]) -> BatchHypothesis:
        """Return the sub-batch of `hyps` made of the rows listed in `ids`."""
        picked_yseq = hyps.yseq[ids]
        picked_score = hyps.score[ids]
        picked_length = hyps.length[ids]
        picked_scores = {name: values[ids] for name, values in hyps.scores.items()}

        # states are opaque to the search: each scorer extracts its own rows
        picked_states = {}
        for name, batch_state in hyps.states.items():
            picked_states[name] = [
                self.scorers[name].select_state(batch_state, idx) for idx in ids
            ]

        return BatchHypothesis(
            yseq=picked_yseq,
            score=picked_score,
            length=picked_length,
            scores=picked_scores,
            states=picked_states,
        )

    def _select(self, hyps: BatchHypothesis, i: int) -> Hypothesis:
        """Extract row `i` of `hyps` as a single, unpadded `Hypothesis`."""
        n_tokens = hyps.length[i]
        row_scores = {name: values[i] for name, values in hyps.scores.items()}
        row_states = {}
        for name, batch_state in hyps.states.items():
            row_states[name] = self.scorers[name].select_state(batch_state, i)
        return Hypothesis(
            # drop the padding that follows the real tokens
            yseq=hyps.yseq[i, :n_tokens],
            score=hyps.score[i],
            scores=row_scores,
            states=row_states,
        )

    def unbatchfy(self, batch_hyps: BatchHypothesis) -> List[Hypothesis]:
        """Split a `BatchHypothesis` back into a list of `Hypothesis`.

        Only the scorers registered in `self.scorers` are carried over, and
        each token sequence is cut back to its recorded length.
        """
        hyp_list = []
        for row in range(len(batch_hyps.length)):
            row_scores = {name: batch_hyps.scores[name][row] for name in self.scorers}
            row_states = {
                name: scorer.select_state(batch_hyps.states[name], row)
                for name, scorer in self.scorers.items()
            }
            hyp_list.append(
                Hypothesis(
                    yseq=batch_hyps.yseq[row][: batch_hyps.length[row]],
                    score=batch_hyps.score[row],
                    scores=row_scores,
                    states=row_states,
                )
            )
        return hyp_list

    def batch_beam(
        self, weighted_scores: torch.Tensor, ids: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """Batch-compute topk full token ids and partial token ids.

        Args:
            weighted_scores (torch.Tensor): The weighted sum scores for each tokens.
                Its shape is `(n_beam, self.n_vocab)`.
            ids (torch.Tensor): The partial token ids to compute topk.
                Its shape is `(n_beam, self.pre_beam_size)`.
                It is not used by this implementation.

        Returns:
            Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
                The topk full (prev_hyp, new_token) ids
                and partial (prev_hyp, new_token) ids.
                Their shapes are all `(self.beam_size,)`.
                The partial pair is the same pair of tensors as the full pair.

        """
        # Flattening makes every candidate index equal to hyp * V + token,
        # where V is `self.n_vocab`, so one topk ranks all (hyp, token) pairs.
        flat_scores = weighted_scores.view(-1)
        _, top_ids = flat_scores.topk(self.beam_size)
        n_vocab = self.n_vocab
        prev_hyp_ids = top_ids // n_vocab
        new_token_ids = top_ids % n_vocab
        return prev_hyp_ids, new_token_ids, prev_hyp_ids, new_token_ids

    def init_hyp(self, x: torch.Tensor) -> BatchHypothesis:
        """Build the initial batch holding a single `<sos>` hypothesis.

        Args:
            x (torch.Tensor): The encoder output feature

        Returns:
            BatchHypothesis: A batch of size one with zero scores and the
                batch-initialized state of every scorer.

        """
        init_states = {}
        init_scores = {}
        for name, scorer in self.scorers.items():
            init_states[name] = scorer.batch_init_state(x)
            init_scores[name] = 0.0

        start_hyp = Hypothesis(
            score=0.0,
            scores=init_scores,
            states=init_states,
            yseq=torch.tensor([self.sos], device=x.device),
        )
        return self.batchfy([start_hyp])

    def score_full(
        self, hyp: BatchHypothesis, x: torch.Tensor
    ) -> Tuple[Dict[str, torch.Tensor], Dict[str, Any]]:
        """Score new hypothesis by `self.full_scorers`.

        Args:
            hyp (BatchHypothesis): Batch of hypotheses with prefix tokens to score
            x (torch.Tensor): Corresponding input feature

        Returns:
            Tuple[Dict[str, torch.Tensor], Dict[str, Any]]: Tuple of
                score dict and state dict, both keyed by the names in
                `self.full_scorers`; the values are whatever each scorer's
                `batch_score` returns.

        """
        scores, states = {}, {}
        for name, scorer in self.full_scorers.items():
            prev_state = hyp.states[name]
            scores[name], states[name] = scorer.batch_score(hyp.yseq, prev_state, x)
        return scores, states

    def score_partial(
        self, hyp: BatchHypothesis, ids: torch.Tensor, x: torch.Tensor
    ) -> Tuple[Dict[str, torch.Tensor], Dict[str, Any]]:
        """Score new hypothesis by `self.part_scorers`.

        Args:
            hyp (BatchHypothesis): Batch of hypotheses with prefix tokens to score
            ids (torch.Tensor): 2D tensor of new partial tokens to score
            x (torch.Tensor): Corresponding input feature

        Returns:
            Tuple[Dict[str, torch.Tensor], Dict[str, Any]]: Tuple of
                score dict and state dict, both keyed by the names in
                `self.part_scorers`; the values are whatever each scorer's
                `batch_score_partial` returns.

        """
        scores, states = {}, {}
        for name, scorer in self.part_scorers.items():
            prev_state = hyp.states[name]
            scores[name], states[name] = scorer.batch_score_partial(
                hyp.yseq, ids, prev_state, x
            )
        return scores, states

    def merge_states(self, states: Any, part_states: Any, part_idx: int) -> Any:
        """Merge states for new hypothesis.

        Args:
            states: states of `self.full_scorers`
            part_states: states of `self.part_scorers`
            part_idx (int): The new token id for `part_scores`
                (not used by this implementation)

        Returns:
            Dict[str, torch.Tensor]: The new state dict.
                Its keys are names of `self.full_scorers` and `self.part_scorers`.
                Its values are states of the scorers.
                On a key clash the partial scorer's state wins.

        """
        merged = {}
        merged.update(states.items())
        merged.update(part_states.items())
        return merged

    def search(self, running_hyps: BatchHypothesis, x: torch.Tensor) -> BatchHypothesis:
        """Search new tokens for running hypotheses and encoded speech x.

        All running hypotheses are scored together. The weighted scores of the
        full scorers and the partial scorers are summed with the accumulated
        score of each hypothesis, and the (hypothesis, token) pairs chosen by
        `self.batch_beam` become the new hypotheses.

        Args:
            running_hyps (BatchHypothesis): Running hypotheses on beam
            x (torch.Tensor): Encoded speech feature (T, D)

        Returns:
            BatchHypothesis: Best sorted hypotheses

        """
        n_batch = len(running_hyps)
        part_ids = None  # no pre-beam
        # batch scoring
        weighted_scores = torch.zeros(
            n_batch, self.n_vocab, dtype=x.dtype, device=x.device
        )
        # x gets a leading batch dimension so every hypothesis sees the same input
        scores, states = self.score_full(running_hyps, x.expand(n_batch, *x.shape))
        for k in self.full_scorers:
            weighted_scores += self.weights[k] * scores[k]
        # partial scoring
        if self.do_pre_beam:
            # candidates for the partial scorers come either from the summed
            # full-scorer scores or from one named scorer
            pre_beam_scores = (
                weighted_scores
                if self.pre_beam_score_key == "full"
                else scores[self.pre_beam_score_key]
            )
            part_ids = torch.topk(pre_beam_scores, self.pre_beam_size, dim=-1)[1]
        # NOTE(takaaki-hori): Unlike BeamSearch, we assume that score_partial returns
        # full-size score matrices, which has non-zero scores for part_ids and zeros
        # for others.
        part_scores, part_states = self.score_partial(running_hyps, part_ids, x)
        for k in self.part_scorers:
            weighted_scores += self.weights[k] * part_scores[k]
        # add previous hyp scores
        weighted_scores += running_hyps.score.to(
            dtype=x.dtype, device=x.device
        ).unsqueeze(1)

        # TODO(karita): do not use list. use batch instead
        # see also https://github.com/espnet/espnet/pull/1402#discussion_r354561029
        # update hyps
        best_hyps = []
        prev_hyps = self.unbatchfy(running_hyps)
        # batch_beam returns four parallel id tensors; zip walks them one
        # selected candidate at a time
        for (
            full_prev_hyp_id,
            full_new_token_id,
            part_prev_hyp_id,
            part_new_token_id,
        ) in zip(*self.batch_beam(weighted_scores, part_ids)):
            prev_hyp = prev_hyps[full_prev_hyp_id]
            best_hyps.append(
                Hypothesis(
                    score=weighted_scores[full_prev_hyp_id, full_new_token_id],
                    yseq=self.append_token(prev_hyp.yseq, full_new_token_id),
                    scores=self.merge_scores(
                        prev_hyp.scores,
                        {k: v[full_prev_hyp_id] for k, v in scores.items()},
                        full_new_token_id,
                        {k: v[part_prev_hyp_id] for k, v in part_scores.items()},
                        part_new_token_id,
                    ),
                    states=self.merge_states(
                        {
                            k: self.full_scorers[k].select_state(v, full_prev_hyp_id)
                            for k, v in states.items()
                        },
                        {
                            k: self.part_scorers[k].select_state(
                                v, part_prev_hyp_id, part_new_token_id
                            )
                            for k, v in part_states.items()
                        },
                        part_new_token_id,
                    ),
                )
            )
        return self.batchfy(best_hyps)

    def post_process(
        self,
        i: int,
        maxlen: int,
        maxlenratio: float,
        running_hyps: BatchHypothesis,
        ended_hyps: List[Hypothesis],
    ) -> BatchHypothesis:
        """Perform post-processing of beam search iterations.

        Hypotheses whose last token is `<eos>` are appended to `ended_hyps`
        (the list is modified in place) and removed from the returned batch.

        Args:
            i (int): The length of hypothesis tokens.
            maxlen (int): The maximum length of tokens in beam search.
            maxlenratio (float): The maximum length ratio in beam search.
                It is not read by this implementation.
            running_hyps (BatchHypothesis): The running hypotheses in beam search.
            ended_hyps (List[Hypothesis]): The ended hypotheses in beam search.

        Returns:
            BatchHypothesis: The new running hypotheses.

        """
        n_batch = running_hyps.yseq.shape[0]
        logging.debug(f"the number of running hypothes: {n_batch}")
        if self.token_list is not None:
            # log the first hypothesis without its leading <sos> token
            logging.debug(
                "best hypo: "
                + "".join(
                    [
                        self.token_list[x]
                        for x in running_hyps.yseq[0, 1 : running_hyps.length[0]]
                    ]
                )
            )
        # add eos in the final loop to avoid that there are no ended hyps
        if i == maxlen - 1:
            logging.info("adding <eos> in the last position in the loop")
            yseq_eos = torch.cat(
                (
                    running_hyps.yseq,
                    torch.full(
                        (n_batch, 1),
                        self.eos,
                        device=running_hyps.yseq.device,
                        dtype=torch.int64,
                    ),
                ),
                1,
            )
            # the tensors are updated in place rather than rebinding the fields
            # (presumably because NamedTuple fields cannot be reassigned)
            running_hyps.yseq.resize_as_(yseq_eos)
            running_hyps.yseq[:] = yseq_eos
            running_hyps.length[:] = yseq_eos.shape[1]

        # add ended hypotheses to a final list, and removed them from current hypotheses
        # (this will be a problem, number of hyps < beam)
        is_eos = (
            running_hyps.yseq[torch.arange(n_batch), running_hyps.length - 1]
            == self.eos
        )
        for b in torch.nonzero(is_eos).view(-1):
            hyp = self._select(running_hyps, b)
            ended_hyps.append(hyp)
        # keep only the hypotheses that have not ended yet
        remained_ids = torch.nonzero(is_eos == 0).view(-1)
        return self._batch_select(running_hyps, remained_ids)


================================================
FILE: nets/batch_beam_search_online_sim.py
================================================
"""Parallel beam search module for online simulation."""

import logging
from pathlib import Path
from typing import List

import yaml

import torch

from espnet.nets.batch_beam_search import BatchBeamSearch
from espnet.nets.beam_search import Hypothesis
from espnet.nets.e2e_asr_common import end_detect


class BatchBeamSearchOnlineSim(BatchBeamSearch):
    """Online beam search implementation.

    This simulates streaming decoding.
    It requires encoded features of entire utterance and
    extracts block by block from it as it should be done
    in streaming processing.
    This is based on Tsunoo et al, "STREAMING TRANSFORMER ASR
    WITH BLOCKWISE SYNCHRONOUS BEAM SEARCH"
    (https://arxiv.org/abs/2006.14941).
    """

    def set_streaming_config(self, asr_config: str):
        """Set config file for streaming decoding.

        `block_size`, `hop_size` and `look_ahead` are reset to None and then
        read from the `encoder_conf` section of `asr_config`. When that file
        has no `encoder_conf` but names another file under `config`, the
        values are read from that file's `encoder_conf` instead. Values that
        cannot be found stay None.

        Args:
            asr_config (str): The config file for asr training

        """
        streaming_keys = ("block_size", "hop_size", "look_ahead")
        for key in streaming_keys:
            setattr(self, key, None)

        def load_yaml(path):
            with Path(path).open("r", encoding="utf-8") as f:
                # an empty YAML document loads as None; treat it as no options
                return yaml.safe_load(f) or {}

        def apply_encoder_conf(enc_args):
            # a missing or empty section leaves the current values untouched
            if not enc_args:
                return
            for key in streaming_keys:
                if key in enc_args:
                    setattr(self, key, enc_args[key])

        config = None
        args = load_yaml(asr_config)
        if "encoder_conf" in args:
            apply_encoder_conf(args["encoder_conf"])
        elif "config" in args:
            config = args["config"]
            if config is None:
                logging.info(
                    "Cannot find config file for streaming decoding: "
                    + "apply batch beam search instead."
                )
                return

        incomplete = any(getattr(self, key) is None for key in streaming_keys)
        if incomplete and config is not None:
            # .get() keeps a config without `encoder_conf` from failing on an
            # unbound variable; the streaming values then simply stay None
            apply_encoder_conf(load_yaml(config).get("encoder_conf"))

    def set_block_size(self, block_size: int) -> None:
        """Set block size for streaming decoding.

        The value is stored on `self.block_size`; `forward` uses it together
        with `self.look_ahead` to place the end of the first block.

        Args:
            block_size (int): The block size of encoder
        """
        self.block_size = block_size

    def set_hop_size(self, hop_size: int) -> None:
        """Set hop size for streaming decoding.

        The value is stored on `self.hop_size`; `forward` advances the end of
        the visible encoder output by this amount when it moves to the next
        block.

        Args:
            hop_size (int): The hop size of encoder
        """
        self.hop_size = hop_size

    def set_look_ahead(self, look_ahead: int) -> None:
        """Set look ahead size for streaming decoding.

        The value is stored on `self.look_ahead`; `forward` subtracts it from
        the block size to place the end of the first block.

        Args:
            look_ahead (int): The look ahead size of encoder
        """
        self.look_ahead = look_ahead

    def forward(
        self, x: torch.Tensor, maxlenratio: float = 0.0, minlenratio: float = 0.0
    ) -> List[Hypothesis]:
        """Perform beam search.

        The encoder output is revealed to the search block by block: decoding
        starts on a prefix of `x` and the prefix is extended whenever a
        hypothesis ends or repeats a token before the whole input is visible.
        When block size, hop size or look ahead is unset (or zero) the whole
        of `x` is visible from the start.

        Args:
            x (torch.Tensor): Encoded speech feature (T, D)
            maxlenratio (float): Input length ratio to obtain max output length.
                If maxlenratio=0.0 (default), it uses an end-detect function
                to automatically find maximum hypothesis lengths
            minlenratio (float): Input length ratio to obtain min output length.

        Returns:
            list[Hypothesis]: N-best decoding results

        """
        self.conservative = True  # always true

        # end (exclusive) of the encoder frames visible to the first block
        if self.block_size and self.hop_size and self.look_ahead:
            cur_end_frame = int(self.block_size - self.look_ahead)
        else:
            cur_end_frame = x.shape[0]
        process_idx = 0
        if cur_end_frame < x.shape[0]:
            h = x.narrow(0, 0, cur_end_frame)
        else:
            h = x

        # set length bounds
        if maxlenratio == 0:
            maxlen = x.shape[0]
        else:
            maxlen = max(1, int(maxlenratio * x.size(0)))
        # NOTE: minlen is only logged below; it is not used anywhere else
        # in this method
        minlen = int(minlenratio * x.size(0))
        logging.info("decoder input length: " + str(x.shape[0]))
        logging.info("max output length: " + str(maxlen))
        logging.info("min output length: " + str(minlen))

        # main loop of prefix search
        running_hyps = self.init_hyp(h)
        prev_hyps = []
        ended_hyps = []
        prev_repeat = False

        continue_decode = True

        # outer loop: one iteration per visible block of encoder output
        while continue_decode:
            move_to_next_block = False
            if cur_end_frame < x.shape[0]:
                h = x.narrow(0, 0, cur_end_frame)
            else:
                h = x

            # extend states for ctc
            self.extend(h, running_hyps)

            # inner loop: one iteration per output token position
            while process_idx < maxlen:
                logging.debug("position " + str(process_idx))
                best = self.search(running_hyps, h)

                if process_idx == maxlen - 1:
                    # end decoding
                    running_hyps = self.post_process(
                        process_idx, maxlen, maxlenratio, best, ended_hyps
                    )
                n_batch = best.yseq.shape[0]
                local_ended_hyps = []
                is_local_eos = (
                    best.yseq[torch.arange(n_batch), best.length - 1] == self.eos
                )
                for i in range(is_local_eos.shape[0]):
                    if is_local_eos[i]:
                        hyp = self._select(best, i)
                        local_ended_hyps.append(hyp)
                    # NOTE(tsunoo): check repetitions here
                    # This is a implicit implementation of
                    # Eq (11) in https://arxiv.org/abs/2006.14941
                    # A flag prev_repeat is used instead of using set
                    elif (
                        not prev_repeat
                        and best.yseq[i, -1] in best.yseq[i, :-1]
                        and cur_end_frame < x.shape[0]
                    ):
                        move_to_next_block = True
                        prev_repeat = True
                if maxlenratio == 0.0 and end_detect(
                    [lh.asdict() for lh in local_ended_hyps], process_idx
                ):
                    logging.info(f"end detected at {process_idx}")
                    continue_decode = False
                    break
                # a hypothesis ended before the whole input was visible:
                # reveal more input instead of accepting it
                if len(local_ended_hyps) > 0 and cur_end_frame < x.shape[0]:
                    move_to_next_block = True

                if move_to_next_block:
                    if (
                        self.hop_size
                        and cur_end_frame + int(self.hop_size) + int(self.look_ahead)
                        < x.shape[0]
                    ):
                        cur_end_frame += int(self.hop_size)
                    else:
                        cur_end_frame = x.shape[0]
                    logging.debug("Going to next block: %d", cur_end_frame)
                    # conservative mode: step back one token so it is decoded
                    # again with the larger block
                    if process_idx > 1 and len(prev_hyps) > 0 and self.conservative:
                        running_hyps = prev_hyps
                        process_idx -= 1
                        prev_hyps = []
                    break

                prev_repeat = False
                prev_hyps = running_hyps
                running_hyps = self.post_process(
                    process_idx, maxlen, maxlenratio, best, ended_hyps
                )

                if cur_end_frame >= x.shape[0]:
                    for hyp in local_ended_hyps:
                        ended_hyps.append(hyp)

                if len(running_hyps) == 0:
                    logging.info("no hypothesis. Finish decoding.")
                    continue_decode = False
                    break
                else:
                    logging.debug(f"remained hypotheses: {len(running_hyps)}")
                # increment number
                process_idx += 1

        nbest_hyps = sorted(ended_hyps, key=lambda x: x.score, reverse=True)
        # check the number of hypotheses reaching to eos
        if len(nbest_hyps) == 0:
            logging.warning(
                "there is no N-best results, perform recognition "
                "again with smaller minlenratio."
            )
            # retry with minlenratio lowered by 0.1 until it drops below 0.1,
            # then give up with an empty result
            return (
                []
                if minlenratio < 0.1
                else self.forward(x, maxlenratio, max(0.0, minlenratio - 0.1))
            )

        # report the best result
        best = nbest_hyps[0]
        for k, v in best.scores.items():
            logging.info(
                f"{v:6.2f} * {self.weights[k]:3} = {v * self.weights[k]:6.2f} for {k}"
            )
        logging.info(f"total log probability: {best.score:.2f}")
        logging.info(f"normalized log probability: {best.score / len(best.yseq):.2f}")
        logging.info(f"total number of ended hypotheses: {len(nbest_hyps)}")
        if self.token_list is not None:
            # the first and last tokens are skipped in the log
            # (presumably <sos> and <eos>)
            logging.info(
                "best hypo: "
                + "".join([self.token_list[x] for x in best.yseq[1:-1]])
                + "\n"
            )
        return nbest_hyps

    def extend(self, x: torch.Tensor, hyps: Hypothesis) -> None:
        """Extend probabilities and states with more encoded chunks.

        Scorers that define `extend_prob` are given the extended encoder
        output, and scorers that define `extend_state` have their entry in
        `hyps.states` replaced. `hyps` is updated in place; nothing is
        returned.

        Args:
            x (torch.Tensor): The extended encoder output feature
            hyps (Hypothesis): Current hypotheses; `forward` passes its
                batched running hypotheses here

        """
        for k, d in self.scorers.items():
            # both hooks are optional, so scorers without them are skipped
            if hasattr(d, "extend_prob"):
                d.extend_prob(x)
            if hasattr(d, "extend_state"):
                # item assignment on the states dict; the tuple itself is not rebound
                hyps.states[k] = d.extend_state(hyps.states[k])


================================================
FILE: nets/beam_search.py
================================================
"""Beam search module."""

from itertools import chain
import logging
from typing import Any
from typing import Dict
from typing import List
from typing import NamedTuple
from typing import Tuple
from typing import Union

import torch

from espnet.nets.e2e_asr_common import end_detect
from espnet.nets.scorer_interface import PartialScorerInterface
from espnet.nets.scorer_interface import ScorerInterface
from snowfall.warpper.mmi_utils import parse_step

class Hypothesis(NamedTuple):
    """Single beam-search hypothesis: tokens, total score, per-scorer data."""

    yseq: torch.Tensor
    score: Union[float, torch.Tensor] = 0
    scores: Dict[str, Union[float, torch.Tensor]] = dict()
    states: Dict[str, Any] = dict()

    def asdict(self) -> dict:
        """Return the fields as a dict of plain Python values.

        The token tensor becomes a list and every score becomes a float;
        `states` is passed through unchanged.
        """
        plain = self._replace(
            yseq=self.yseq.tolist(),
            score=float(self.score),
            scores={name: float(value) for name, value in self.scores.items()},
        )
        return plain._asdict()

    def __str__(self):
        """Format the total score followed by one cell per scorer."""
        header = f"Total Scores: {self.score}\n"
        cells = [
            "| {}: {:<7.2f} |".format(name, value)
            for name, value in self.scores.items()
        ]
        return header + "Scores -> " + "".join(cells)


class BeamSearch(object):
    """Beam search implementation.

    The weighted scores of several scorers (split into "full" scorers, which
    score the whole vocabulary, and "partial" scorers, which score a subset
    of token ids) are summed and the ``beam_size`` best prefixes are kept at
    every output position.

    This class is a plain ``object``: the search is started by calling the
    instance.  ``forward`` is provided as an alias of ``__call__``.
    """

    def __init__(
        self,
        scorers: Dict[str, ScorerInterface],
        weights: Dict[str, float],
        beam_size: int,
        vocab_size: int,
        sos: int,
        eos: int,
        token_list: List[str] = None,
        pre_beam_ratio: float = 1.5,
        pre_beam_score_key: str = None,
        mmi_rescorer=None,
    ):
        """Initialize beam search.

        Args:
            scorers (dict[str, ScorerInterface]): Dict of decoder modules
                e.g., Decoder, CTCPrefixScorer, LM
                The scorer will be ignored if it is `None`
            weights (dict[str, float]): Dict of weights for each scorers
                The scorer will be ignored if its weight is 0
            beam_size (int): The number of hypotheses kept during search
            vocab_size (int): The number of vocabulary
            sos (int): Start of sequence id
            eos (int): End of sequence id
            token_list (list[str]): List of tokens for debug log
            pre_beam_score_key (str): key of scores to perform pre-beam search
            pre_beam_ratio (float): beam size in the pre-beam search
                will be `int(pre_beam_ratio * beam_size)`
            mmi_rescorer: Optional object; when truthy, its
                ``score(x, nbest_hyps, v2=True)`` is applied to the final
                n-best list at the end of ``__call__``

        Raises:
            KeyError: If ``pre_beam_score_key`` is neither ``None`` nor
                ``"full"`` and does not name one of the full scorers.

        """
        super().__init__()
        # set scorers
        self.weights = weights
        self.scorers = dict()
        self.full_scorers = dict()
        self.part_scorers = dict()
        # this module dict is required for recursive cast
        # `self.to(device, dtype)` in `recog.py`
        self.nn_dict = torch.nn.ModuleDict()
        for k, v in scorers.items():
            w = weights.get(k, 0)
            # scorers without a weight (or with weight 0) are dropped entirely
            if w == 0 or v is None:
                continue
            assert isinstance(
                v, ScorerInterface
            ), f"{k} ({type(v)}) does not implement ScorerInterface"
            self.scorers[k] = v
            if isinstance(v, PartialScorerInterface):
                self.part_scorers[k] = v
                print(f"Using part scorer: {k} with weight: {w}", flush=True)
            else:
                self.full_scorers[k] = v
                print(f"Using full scorer: {k} with weight: {w}", flush=True)
            if isinstance(v, torch.nn.Module):
                self.nn_dict[k] = v

        # set configurations
        self.sos = sos
        self.eos = eos
        self.token_list = token_list
        self.pre_beam_size = int(pre_beam_ratio * beam_size)
        self.beam_size = beam_size
        self.n_vocab = vocab_size
        if (
            pre_beam_score_key is not None
            and pre_beam_score_key != "full"
            and pre_beam_score_key not in self.full_scorers
        ):
            raise KeyError(f"{pre_beam_score_key} is not found in {self.full_scorers}")
        self.pre_beam_score_key = pre_beam_score_key
        # pre-beam only pays off when it actually narrows the vocabulary and
        # there is at least one partial scorer to benefit from it
        self.do_pre_beam = (
            self.pre_beam_score_key is not None
            and self.pre_beam_size < self.n_vocab
            and len(self.part_scorers) > 0
        )
        print(f"Do pre-beam: {self.do_pre_beam}")

        self.mmi_rescorer = mmi_rescorer
        # score below this would be deleted even it is in beam
        self.min_score = -1000

    def init_hyp(self, x: torch.Tensor) -> List[Hypothesis]:
        """Get an initial hypothesis data.

        Args:
            x (torch.Tensor): The encoder output feature

        Returns:
            Hypothesis: The initial hypothesis.

        """
        init_states = dict()
        init_scores = dict()
        for k, d in self.scorers.items():
            init_states[k] = d.init_state(x)
            init_scores[k] = 0.0
        return [
            Hypothesis(
                score=0.0,
                scores=init_scores,
                states=init_states,
                yseq=torch.tensor([self.sos], device=x.device),
            )
        ]

    @staticmethod
    def append_token(xs: torch.Tensor, x: int) -> torch.Tensor:
        """Append new token to prefix tokens.

        Args:
            xs (torch.Tensor): The prefix token
            x (int): The new token to append

        Returns:
            torch.Tensor: New tensor contains: xs + [x] with xs.dtype and xs.device

        """
        x = torch.tensor([x], dtype=xs.dtype, device=xs.device)
        return torch.cat((xs, x))

    def score_full(
        self, hyp: Hypothesis, x: torch.Tensor
    ) -> Tuple[Dict[str, torch.Tensor], Dict[str, Any]]:
        """Score new hypothesis by `self.full_scorers`.

        Args:
            hyp (Hypothesis): Hypothesis with prefix tokens to score
            x (torch.Tensor): Corresponding input feature

        Returns:
            Tuple[Dict[str, torch.Tensor], Dict[str, Any]]: Tuple of
                score dict of `hyp` that has string keys of `self.full_scorers`
                and tensor score values of shape: `(self.n_vocab,)`,
                and state dict that has string keys
                and state values of `self.full_scorers`

        """
        scores = dict()
        states = dict()
        for k, d in self.full_scorers.items():
            scores[k], states[k] = d.score(hyp.yseq, hyp.states[k], x)
        return scores, states

    def score_partial(
        self, hyp: Hypothesis, ids: torch.Tensor, x: torch.Tensor
    ) -> Tuple[Dict[str, torch.Tensor], Dict[str, Any]]:
        """Score new hypothesis by `self.part_scorers`.

        Args:
            hyp (Hypothesis): Hypothesis with prefix tokens to score
            ids (torch.Tensor): 1D tensor of new partial tokens to score
            x (torch.Tensor): Corresponding input feature

        Returns:
            Tuple[Dict[str, torch.Tensor], Dict[str, Any]]: Tuple of
                score dict of `hyp` that has string keys of `self.part_scorers`
                and tensor score values of shape: `(len(ids),)`,
                and state dict that has string keys
                and state values of `self.part_scorers`

        """
        scores = dict()
        states = dict()
        for k, d in self.part_scorers.items():
            scores[k], states[k] = d.score_partial(hyp.yseq, ids, hyp.states[k], x)
        return scores, states

    def beam(
        self, weighted_scores: torch.Tensor, ids: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Compute topk full token ids and partial token ids.

        Note that when a pre-beam was performed, ``weighted_scores`` is
        modified in place: every entry not listed in ``ids`` is set to
        ``-inf``.

        Args:
            weighted_scores (torch.Tensor): The weighted sum scores for each tokens.
            Its shape is `(self.n_vocab,)`.
            ids (torch.Tensor): The partial token ids to compute topk

        Returns:
            Tuple[torch.Tensor, torch.Tensor]:
                The topk full token ids and partial token ids.
                Their shapes are `(self.beam_size,)`

        """
        # no pre beam performed
        if weighted_scores.size(0) == ids.size(0):
            top_ids = weighted_scores.topk(self.beam_size)[1]
            return top_ids, top_ids

        # mask pruned in pre-beam not to select in topk
        tmp = weighted_scores[ids]
        weighted_scores[:] = -float("inf")
        weighted_scores[ids] = tmp
        top_ids = weighted_scores.topk(self.beam_size)[1]
        # positions of the same winners inside `ids` (index into partial scores)
        local_ids = weighted_scores[ids].topk(self.beam_size)[1]
        return top_ids, local_ids

    @staticmethod
    def merge_scores(
        prev_scores: Dict[str, float],
        next_full_scores: Dict[str, torch.Tensor],
        full_idx: int,
        next_part_scores: Dict[str, torch.Tensor],
        part_idx: int,
    ) -> Dict[str, torch.Tensor]:
        """Merge scores for new hypothesis.

        Args:
            prev_scores (Dict[str, float]):
                The previous hypothesis scores by `self.scorers`
            next_full_scores (Dict[str, torch.Tensor]): scores by `self.full_scorers`
            full_idx (int): The next token id for `next_full_scores`
            next_part_scores (Dict[str, torch.Tensor]):
                scores of partial tokens by `self.part_scorers`
            part_idx (int): The new token id for `next_part_scores`

        Returns:
            Dict[str, torch.Tensor]: The new score dict.
                Its keys are names of `self.full_scorers` and `self.part_scorers`.
                Its values are scalar tensors by the scorers.

        """
        new_scores = dict()
        for k, v in next_full_scores.items():
            new_scores[k] = prev_scores[k] + v[full_idx]
        for k, v in next_part_scores.items():
            new_scores[k] = prev_scores[k] + v[part_idx]
        return new_scores

    def merge_states(self, states: Any, part_states: Any, part_idx: int) -> Any:
        """Merge states for new hypothesis.

        Args:
            states: states of `self.full_scorers`
            part_states: states of `self.part_scorers`
            part_idx (int): The new token id for `part_scores`

        Returns:
            Dict[str, torch.Tensor]: The new score dict.
                Its keys are names of `self.full_scorers` and `self.part_scorers`.
                Its values are states of the scorers.

        """
        new_states = dict()
        for k, v in states.items():
            new_states[k] = v
        # partial scorers return one state per candidate; pick the chosen one
        for k, d in self.part_scorers.items():
            new_states[k] = d.select_state(part_states[k], part_idx)
        return new_states

    def search(
        self, running_hyps: List[Hypothesis], x: torch.Tensor
    ) -> List[Hypothesis]:
        """Search new tokens for running hypotheses and encoded speech x.

        Args:
            running_hyps (List[Hypothesis]): Running hypotheses on beam
            x (torch.Tensor): Encoded speech feature (T, D)

        Returns:
            List[Hypotheses]: Best sorted hypotheses

        """
        best_hyps = []
        part_ids = torch.arange(self.n_vocab, device=x.device)  # no pre-beam
        for hyp in running_hyps:

            # scoring
            weighted_scores = torch.zeros(self.n_vocab, dtype=x.dtype, device=x.device)
            scores, states = self.score_full(hyp, x)
            for k in self.full_scorers:
                weighted_scores += self.weights[k] * scores[k]
            # partial scoring
            if self.do_pre_beam:
                pre_beam_scores = (
                    weighted_scores
                    if self.pre_beam_score_key == "full"
                    else scores[self.pre_beam_score_key]
                )
                part_ids = torch.topk(pre_beam_scores, self.pre_beam_size)[1]
            part_scores, part_states = self.score_partial(hyp, part_ids, x)
            for k in self.part_scorers:
                weighted_scores[part_ids] += self.weights[k] * part_scores[k]
            # Show the scores step by step
            # parse_step(hyp, self.token_list, part_ids,
            #            self.weights, scores,
            #            part_scores, weighted_scores)
            weighted_scores += hyp.score

            # update hyps
            for j, part_j in zip(*self.beam(weighted_scores, part_ids)):
                # will be (2 x beam at most)
                this_hyp = Hypothesis(
                    score=weighted_scores[j],
                    yseq=self.append_token(hyp.yseq, j),
                    scores=self.merge_scores(
                        hyp.scores, scores, j, part_scores, part_j
                    ),
                    states=self.merge_states(states, part_states, part_j),
                )
                best_hyps.append(this_hyp)

            # sort and prune 2 x beam -> beam
            best_hyps = sorted(best_hyps, key=lambda x: x.score, reverse=True)[
                : min(len(best_hyps), self.beam_size)
            ]
        return best_hyps

    def __call__(
        self, x: torch.Tensor, maxlenratio: float = 0.0, minlenratio: float = 0.0
    ) -> List[Hypothesis]:
        """Perform beam search.

        Args:
            x (torch.Tensor): Encoded speech feature (T, D)
            maxlenratio (float): Input length ratio to obtain max output length.
                If maxlenratio=0.0 (default), it uses a end-detect function
                to automatically find maximum hypothesis lengths
            minlenratio (float): Input length ratio to obtain min output length.

        Returns:
            list[Hypothesis]: N-best decoding results

        """
        # set length bounds
        if maxlenratio == 0:
            maxlen = x.shape[0]
        else:
            maxlen = max(1, int(maxlenratio * x.size(0)))
        minlen = int(minlenratio * x.size(0))
        logging.info("decoder input length: " + str(x.shape[0]))
        logging.info("max output length: " + str(maxlen))
        logging.info("min output length: " + str(minlen))

        # main loop of prefix search
        running_hyps = self.init_hyp(x)
        ended_hyps = []
        for i in range(maxlen):
            logging.debug("position " + str(i))
            best = self.search(running_hyps, x)

            # post process of one iteration
            running_hyps = self.post_process(i, maxlen, maxlenratio, best, ended_hyps)

            # delete hypothesis that below min_score. this means need to be killed by mmi
            running_hyps = [h for h in running_hyps if h.score > self.min_score]

            # end detection
            if maxlenratio == 0.0 and end_detect([h.asdict() for h in ended_hyps], i):
                logging.info(f"end detected at {i}")
                break
            if len(running_hyps) == 0:
                logging.info("no hypothesis. Finish decoding.")
                break
            else:
                logging.debug(f"remained hypotheses: {len(running_hyps)}")

        nbest_hyps = sorted(ended_hyps, key=lambda x: x.score, reverse=True)
        # check the number of hypotheses reaching to eos
        if len(nbest_hyps) == 0:
            logging.warning(
                "there is no N-best results, perform recognition "
                "again with smaller minlenratio."
            )
            # Retry through the instance itself.  The previous code called
            # `self.forward`, which this class never defined, so the retry
            # path raised AttributeError instead of decoding again.
            return (
                []
                if minlenratio < 0.1
                else self(x, maxlenratio, max(0.0, minlenratio - 0.1))
            )

        # report the best result
        best = nbest_hyps[0]
        for k, v in best.scores.items():
            logging.info(
                f"{v:6.2f} * {self.weights[k]:3} = {v * self.weights[k]:6.2f} for {k}"
            )
        logging.info(f"total log probability: {best.score:.2f}")
        logging.info(f"normalized log probability: {best.score / len(best.yseq):.2f}")
        logging.info(f"total number of ended hypotheses: {len(nbest_hyps)}")
        if self.token_list is not None:
            logging.info(
                "best hypo: "
                + "".join([self.token_list[x] for x in best.yseq[1:-1]])
                + "\n"
            )
        # optional second pass over the finished n-best list
        if self.mmi_rescorer:
            nbest_hyps = self.mmi_rescorer.score(x, nbest_hyps, v2=True)

        return nbest_hyps

    def forward(
        self, x: torch.Tensor, maxlenratio: float = 0.0, minlenratio: float = 0.0
    ) -> List[Hypothesis]:
        """Alias of ``__call__`` for callers that use the ``forward`` name.

        Args:
            x (torch.Tensor): Encoded speech feature (T, D)
            maxlenratio (float): See ``__call__``.
            minlenratio (float): See ``__call__``.

        Returns:
            list[Hypothesis]: N-best decoding results

        """
        return self(x, maxlenratio, minlenratio)

    def post_process(
        self,
        i: int,
        maxlen: int,
        maxlenratio: float,
        running_hyps: List[Hypothesis],
        ended_hyps: List[Hypothesis],
    ) -> List[Hypothesis]:
        """Perform post-processing of beam search iterations.

        Hypotheses ending in ``<eos>`` receive the scorers' final scores and
        are appended to ``ended_hyps`` (which is modified in place); all
        others are returned so the search can continue with them.

        Args:
            i (int): The length of hypothesis tokens.
            maxlen (int): The maximum length of tokens in beam search.
            maxlenratio (int): The maximum length ratio in beam search.
            running_hyps (List[Hypothesis]): The running hypotheses in beam search.
            ended_hyps (List[Hypothesis]): The ended hypotheses in beam search.

        Returns:
            List[Hypothesis]: The new running hypotheses.

        """
        logging.debug(f"the number of running hypotheses: {len(running_hyps)}")
        if self.token_list is not None:
            logging.debug(
                "best hypo: "
                + "".join([self.token_list[x] for x in running_hyps[0].yseq[1:]])
            )
        # add eos in the final loop to avoid that there are no ended hyps
        if i == maxlen - 1:
            logging.info("adding <eos> in the last position in the loop")
            running_hyps = [
                h._replace(yseq=self.append_token(h.yseq, self.eos))
                for h in running_hyps
            ]

        # add ended hypotheses to a final list, and removed them from current hypotheses
        # (this will be a problem, number of hyps < beam)
        remained_hyps = []
        for hyp in running_hyps:
            if hyp.yseq[-1] == self.eos:
                # Work on a copy: the score dict may be shared with another
                # hypothesis tuple (e.g. after `_replace`), and a NamedTuple
                # field should not be mutated behind its owner's back.
                final_scores = dict(hyp.scores)
                total = hyp.score
                # e.g., Word LM needs to add final <eos> score
                for k, d in chain(self.full_scorers.items(), self.part_scorers.items()):
                    s = d.final_score(hyp.states[k])
                    final_scores[k] += s
                    total = total + self.weights[k] * s
                ended_hyps.append(hyp._replace(score=total, scores=final_scores))
            else:
                remained_hyps.append(hyp)
        return remained_hyps


def beam_search(
    x: torch.Tensor,
    sos: int,
    eos: int,
    beam_size: int,
    vocab_size: int,
    scorers: Dict[str, ScorerInterface],
    weights: Dict[str, float],
    token_list: List[str] = None,
    maxlenratio: float = 0.0,
    minlenratio: float = 0.0,
    pre_beam_ratio: float = 1.5,
    pre_beam_score_key: str = "full",
) -> list:
    """Perform beam search with scorers.

    Convenience wrapper that builds a ``BeamSearch`` object, runs it once on
    ``x`` and converts every resulting hypothesis with ``asdict()``.

    Args:
        x (torch.Tensor): Encoded speech feature (T, D)
        sos (int): Start of sequence id
        eos (int): End of sequence id
        beam_size (int): The number of hypotheses kept during search
        vocab_size (int): The number of vocabulary
        scorers (dict[str, ScorerInterface]): Dict of decoder modules
            e.g., Decoder, CTCPrefixScorer, LM
            The scorer will be ignored if it is `None`
        weights (dict[str, float]): Dict of weights for each scorers
            The scorer will be ignored if its weight is 0
        token_list (list[str]): List of tokens for debug log
        maxlenratio (float): Input length ratio to obtain max output length.
            If maxlenratio=0.0 (default), it uses a end-detect function
            to automatically find maximum hypothesis lengths
        minlenratio (float): Input length ratio to obtain min output length.
        pre_beam_score_key (str): key of scores to perform pre-beam search
        pre_beam_ratio (float): beam size in the pre-beam search
            will be `int(pre_beam_ratio * beam_size)`

    Returns:
        list: N-best decoding results

    """
    searcher = BeamSearch(
        scorers,
        weights,
        beam_size=beam_size,
        vocab_size=vocab_size,
        pre_beam_ratio=pre_beam_ratio,
        pre_beam_score_key=pre_beam_score_key,
        sos=sos,
        eos=eos,
        token_list=token_list,
    )
    # Run the search through the instance's `__call__`, which is the entry
    # point `BeamSearch` implements; going through `.forward` is not safe here.
    ret = searcher(x=x, maxlenratio=maxlenratio, minlenratio=minlenratio)
    return [h.asdict() for h in ret]


================================================
FILE: nets/beam_search_transducer.py
================================================
"""Search algorithms for transducer models."""
from typing import List
from typing import Union
from collections import Counter, defaultdict
import numpy as np
import torch
import time
import math
from itertools import groupby
from espnet.nets.pytorch_backend.transducer.utils import create_lm_batch_state
from espnet.nets.pytorch_backend.transducer.utils import init_lm_state
from espnet.nets.pytorch_backend.transducer.utils import is_prefix
from espnet.nets.pytorch_backend.transducer.utils import recombine_hyps
from espnet.nets.pytorch_backend.transducer.utils import select_lm_state
from espnet.nets.pytorch_backend.transducer.utils import substract
from espnet.nets.transducer_decoder_interface import Hypothesis
from espnet.nets.transducer_decoder_interface import NSCHypothesis
from espnet.nets.transducer_decoder_interface import TransducerDecoderInterface
from espnet.nets.scorers.mmi_rescorer import MMIRescorer
# from espnet.nets.scorers.mmi_rnnt_scorer import MMIRNNTScorer
from espnet.nets.scorers.mmi_alignment_score import MMIRNNTScorer
from espnet.nets.scorers.ctc_rnnt_scorer import CTCRNNTScorer
from espnet.nets.scorers.mmi_rnnt_lookahead_scorer import MMIRNNTLookaheadScorer

class BeamSearchTransducer:
    """Beam search implementation for transducer."""

    def __init__(
        self,
        decoder: Union[TransducerDecoderInterface, torch.nn.Module],
        joint_network: torch.nn.Module,
        beam_size: int,
        lm: torch.nn.Module = None,
        lm_weight: float = 0.1,
        search_type: str = "default",
        char_list = None,
        max_sym_exp: int = 2,
        u_max: int = 50,
        nstep: int = 1,
        prefix_alpha: int = 1,
        score_norm: bool = True,
        nbest: int = 1,
        mmi_scorer=None,
        mmi_weight=0.0,
        ctc_module=None,
        ctc_weight=0.0,
        ngram_scorer=None,
        ngram_weight=0.0,
        word_ngram_scorer=None,
        word_ngram_weight=0.0,
        tlg_scorer=None,
        tlg_weight=0.0,
        forbid_eng=False,
        eng_vocab=None,
    ):
        """Initialize transducer beam search.

        Args:
            decoder: Decoder class to use
            joint_network: Joint Network class
            beam_size: Number of hypotheses kept during search
            lm: LM class to use
            lm_weight: lm weight for soft fusion
            search_type: type of algorithm to use for search
            max_sym_exp: number of maximum symbol expansions at each time step ("tsd")
            u_max: maximum output sequence length ("alsd")
            nstep: number of maximum expansion steps at each time step ("nsc")
            prefix_alpha: maximum prefix length in prefix search ("nsc")
            score_norm: normalize final scores by length ("default")
            nbest: number of returned final hypothesis
        """
        self.decoder = decoder
        self.joint_network = joint_network

        self.beam_size = beam_size
        self.hidden_size = decoder.dunits
        self.vocab_size = decoder.odim
        self.blank = decoder.blank

        # MMI alignment scorer
        self.mmi_scorer = mmi_scorer
        self.mmi_weight = mmi_weight
        print(f"MMI scorer: {mmi_scorer} | MMI weight: {mmi_weight}")

        # deprecated. CTC scorer
        self.ctc_module = ctc_module
        self.ctc_weight = ctc_weight
        print(f"CTC scorer: {ctc_module} | CTC weight: {ctc_weight}")

        # character-level Ngram scorer implemented by kenlm
        self.ngram_scorer = ngram_scorer
        self.ngram_weight = ngram_weight
        print(f"ngram scorer: {ngram_scorer} | ngram weight: {ngram_weight}")

        # word-level Ngram scorer implemented by k2
        self.word_ngram_scorer = word_ngram_scorer
        self.word_ngram_weight = word_ngram_weight
        print(f"word ngram scorer: {word_ngram_scorer} | word ngram weight: {word_ngram_weight}")

        # word-level Ngram scorer implemented by pykaldi (cweng)
        self.tlg_scorer = tlg_scorer
        self.tlg_weight = tlg_weight
        print(f"tlg scorer: {tlg_scorer} | tlg weight: {tlg_weight}")

        if search_type == "ctc_greedy":
            self.search_algorithm = self.ctc_greedy_search
            assert self.ctc_module is not None
        elif self.beam_size <= 1:
            self.search_algorithm = self.greedy_search
        elif search_type == "ctc_beam":
            self.search_algorithm = self.ctc_beam_search
            assert self.ctc_module is not None
        elif search_type == "default":
            self.search_algorithm = self.default_beam_search
        elif search_type == "tsd":
            self.search_algorithm = self.time_sync_decoding
        elif search_type == "alsd":
            self.search_algorithm = self.align_length_sync_decoding
        elif search_type == "nsc":
            self.search_algorithm = self.nsc_beam_search
        else:
            raise NotImplementedError

        self.lm = lm
        self.lm_weight = lm_weight
        print(f"Using LM {lm} with weight {lm_weight}")

        if lm is not None and lm_weight > 0.0:
            self.use_lm = True
            self.is_wordlm = True if hasattr(lm, "predictor") and \
                             hasattr(lm.predictor, "wordlm") else False
            if hasattr(lm, "predictor"):
                self.lm_predictor = lm.predictor.wordlm if self.is_wordlm else lm.predictor
                self.lm_layers = len(self.lm_predictor.rnn)
            else:
                self.is_transformer_lm = True
        else:
            self.use_lm = False

        self.max_sym_exp = max_sym_exp
        self.u_max = u_max
        self.nstep = nstep
        self.prefix_alpha = prefix_alpha
        self.score_norm = score_norm

        self.nbest = nbest
        self.char_list = char_list

        self.forbid_lst = []
        if forbid_eng:
            self.forbid_lst = [self.char_list.index(x) \
                               for x in self.char_list \
                               if (x >= '\u0041' and x <= '\u005a') \
                               or (x >= '\u0061' and x <= '\u007a')]
        print("Forbid chars: ", self.forbid_lst, flush=True)

        self.eng_vocab = eng_vocab
        if self.eng_vocab is not None:
            self.eng_token_list = [x if not is_all_chinese(x) else "" \
                                   for x in self.char_list]

    def __call__(self, h: torch.Tensor) -> Union[List[Hypothesis], List[NSCHypothesis]]:
        """Run the configured search algorithm on one utterance.

        Args:
            h: Encoded speech features (T_max, D_enc); a 3-D tensor has its
                first axis squeezed before decoding

        Returns:
            nbest_hyps: N-best decoding results, passed through the MMI
                rescorer when ``self.mmi_scorer`` is an ``MMIRescorer``

        """
        decoder = self.decoder
        decoder.set_device(h.device)

        if h.dim() == 3:
            h = h.squeeze(0)

        # decoders exposing a `decoders` attribute are not given a data type
        has_sub_decoders = hasattr(decoder, "decoders")
        if not has_sub_decoders:
            decoder.set_data_type(h.dtype)

        results = self.search_algorithm(h)

        if not isinstance(self.mmi_scorer, MMIRescorer):
            return results
        return self.mmi_scorer.score(h, results, v2=True)

    def sort_nbest(
        self, hyps: Union[List[Hypothesis], List[NSCHypothesis]]
    ) -> Union[List[Hypothesis], List[NSCHypothesis]]:
        """Sort hypotheses by score or score given sequence length.

        Args:
            hyps: list of hypotheses

        Return:
            hyps: sorted list of hypotheses

        """
        if self.score_norm:
            hyps.sort(key=lambda x: x.score / len(x.yseq), reverse=True)
        else:
            hyps.sort(key=lambda x: x.score, reverse=True)

        return hyps[: self.nbest]

    def vocab_regularization(self, hyps):
        """Drop hypotheses containing a finished word outside ``self.eng_vocab``.

        Each hypothesis is rendered with ``self.eng_token_list`` (skipping
        the first token) and split on the U+2581 separator.  Every non-empty
        piece except the last one must be a member of ``self.eng_vocab``.

        Args:
            hyps: hypotheses; either ``Hypothesis`` objects or sequences
                whose first element is the token sequence

        Returns:
            list: the hypotheses that passed the check, in input order

        """
        word_boundary = u'\u2581'

        def only_known_words(hyp):
            # rnnt or ctc hypothesis
            tokens = hyp.yseq if isinstance(hyp, Hypothesis) else hyp[0]
            rendered = "".join([self.eng_token_list[tok] for tok in tokens[1:]])
            # the last piece may be a word that is not finished yet
            finished = rendered.split(word_boundary)[:-1]
            for word in finished:
                if word != "" and word not in self.eng_vocab:
                    return False
            return True

        return [hyp for hyp in hyps if only_known_words(hyp)]

    def greedy_search(self, h: torch.Tensor) -> List[Hypothesis]:
        """Decode by taking the arg-max token at every encoder frame.

        At most one token is emitted per frame; the decoder is only
        re-scored after a non-blank token has been appended.

        Args:
            h: Encoded speech features (T_max, D_enc)

        Returns:
            hyp: 1-best decoding results

        """
        hyp = Hypothesis(
            score=0.0, yseq=[self.blank], dec_state=self.decoder.init_state(1)
        )
        cache = {}

        dec_out, next_state, _ = self.decoder.score(hyp, cache)

        for enc_frame in h:
            joint_out = self.joint_network(enc_frame, dec_out)
            log_probs = torch.log_softmax(joint_out, dim=-1)
            best_logp, best_token = torch.max(log_probs, dim=-1)

            # blank: nothing is emitted and the decoder output is reused
            if best_token == self.blank:
                continue

            hyp.yseq.append(int(best_token))
            hyp.score += float(best_logp)
            hyp.dec_state = next_state

            dec_out, next_state, _ = self.decoder.score(hyp, cache)

        return [hyp]

    def ctc_greedy_search(self, h: torch.Tensor) -> List[Hypothesis]:
        """Decode with the CTC module's arg-max path.

        Consecutive repeats in the arg-max path are collapsed and blanks are
        removed; the result is returned as a single hypothesis whose
        sequence starts with the blank id.

        Args:
            h: Encoded speech features; a 2-D tensor gets a leading axis added

        Returns:
            list with one hypothesis (score 0.0, no decoder state)

        """
        if h.dim() == 2:
            h = h.unsqueeze(0)

        best_path = self.ctc_module.argmax(h)[0]
        # groupby merges runs of equal symbols; blanks are dropped afterwards
        tokens = [sym for sym, _ in groupby(best_path) if sym != self.blank]
        return [Hypothesis(score=0.0, yseq=[self.blank] + tokens, dec_state=None)]

    # mainly derived from wenet
    def ctc_beam_search(self, h: torch.Tensor) -> List[Hypothesis]:
        """CTC prefix beam search with optional on-the-fly LM scoring.

        Hypotheses are tuples of token ids that start with ``self.blank``.
        For each prefix two scores are tracked: one for alignments ending in
        blank and one for alignments ending in a non-blank symbol.  At every
        frame only the ``self.beam_size`` best tokens are expanded.

        At most one language model is used, chosen by the first positive
        weight among ``word_ngram_weight``, ``ngram_weight`` and
        ``lm_weight``.  Its weighted score is cached per prefix and added to
        the CTC score only when ranking prefixes for pruning.

        Args:
            h: Encoded speech features; a 2-D tensor gets a leading axis added

        Returns:
            list of hypotheses for the surviving prefixes.  ``score`` is the
            combined CTC score of the prefix, ``mmi_tot_score`` carries the
            cached weighted LM score (0.0 without an LM) and ``dec_state``
            is ``None``.

        """
        if len(h.size()) == 2:
            h = h.unsqueeze(0)

        # per-frame log-probabilities of the first (only) batch element
        ctc_prob = self.ctc_module.log_softmax(h)[0]
        maxlen = ctc_prob.size(0)

        # Pick the LM by priority: word n-gram, then character n-gram, then
        # `self.lm`.  Only `self.lm` is scored over all next tokens at once.
        use_full_score = False
        if self.word_ngram_weight > 0.0:
            lm, lm_weight = self.word_ngram_scorer, self.word_ngram_weight
        elif self.ngram_weight > 0.0:
            lm, lm_weight = self.ngram_scorer, self.ngram_weight
        elif self.lm_weight > 0.0:
            lm, lm_weight = self.lm, self.lm_weight
            use_full_score = True
        else:
            lm, lm_weight = None, 0.0
        
        if lm is not None:
            # yseq: (lm_score, lm_state)
            lm_cache = {(self.blank,): (0.0, lm.init_state(None))}
            # rank by combined CTC score plus the cached weighted LM score
            # NOTE(review): log_add is defined outside this block; presumably
            # a log-sum-exp over its arguments - confirm
            sort_fn = lambda x: log_add(list(x[1])) + lm_cache[x[0]][0]
        else:
            lm_cache = None
            sort_fn = lambda x: log_add(list(x[1])) 

        # non-blank sequence; (blank_ending_score, non_blank_ending_score)
        cur_hyps = [((self.blank,), (0.0, -float('inf')))]
        for t in range(0, maxlen):
            logp = ctc_prob[t]
            # prefixes not seen yet start with both scores at -inf
            next_hyps = defaultdict(lambda: (-float('inf'), -float('inf')))
            # only the beam_size most likely symbols of this frame are expanded
            top_k_logp, top_k_index = logp.topk(self.beam_size)
     
            for s in top_k_index:
                s = s.item()
                ps = logp[s].item()

                for prefix, (pb, pnb) in cur_hyps:
                    last = prefix[-1] if len(prefix) > 0 else None
                    if s == self.blank: # blank
                        # blank keeps the prefix and feeds its blank-ending score
                        n_pb, n_pnb = next_hyps[prefix]
                        n_pb = log_add([n_pb, pb + ps, pnb + ps])
                        next_hyps[prefix] = (n_pb, n_pnb)
                    elif s == last:
                        #  Update *ss -> *s;
                        n_pb, n_pnb = next_hyps[prefix]
                        n_pnb = log_add([n_pnb, pnb + ps])
                        next_hyps[prefix] = (n_pb, n_pnb)
                        #  Update *s-s -> *ss, - is for blank
                        n_prefix = prefix + (s, )
                        n_pb, n_pnb = next_hyps[n_prefix]
                        n_pnb = log_add([n_pnb, pb + ps])
                        next_hyps[n_prefix] = (n_pb, n_pnb)
                    else:
                        # a different symbol always extends the prefix
                        n_prefix = prefix + (s, )
                        n_pb, n_pnb = next_hyps[n_prefix]
                        n_pnb = log_add([n_pnb, pb + ps, pnb + ps])
                        next_hyps[n_prefix] = (n_pb, n_pnb)

            # LM on-the-fly rescore for unseen prefix
            if lm is not None:
                for prefix, (_, _) in next_hyps.items():
                    if not prefix in lm_cache.keys():
                        # parent prefix; its score and state are read from the cache
                        y = prefix[:-1]
                        # update all children hypotheses: NNLM 
                        if use_full_score:
                            scores, state = lm.score(torch.Tensor(y).long(), 
                                                     lm_cache[y][1], h)
                            for k in range(len(scores)):
                                lm_cache[y + (k,)] = (lm_cache[y][0] \
                                  + scores[k].item() * lm_weight, 
                                  lm.select_state(state, k)
                                )
                        # update only this hypothesis: N-gram LM                                
                        else:
                            next_token = prefix[-1:]
                            score, state = lm.score_partial(
                                             torch.Tensor(y).long(), 
                                             torch.Tensor(next_token).long(),
                                             lm_cache[y][1], h
                                           )
                            lm_cache[prefix] = (lm_cache[y][0] + score[0].item() * lm_weight, 
                                                lm.select_state(state, 0)
                                               )
             
            # rank all candidate prefixes, optionally filter them against the
            # English vocabulary, then prune to the beam size
            next_hyps = sorted(next_hyps.items(), key=sort_fn, reverse=True)
            if self.eng_vocab:
                next_hyps = self.vocab_regularization(next_hyps)
            cur_hyps = next_hyps[:self.beam_size]

        # wrap the surviving prefixes; the LM score travels in `mmi_tot_score`
        hyps = [Hypothesis(score=log_add([hyp[1][0], hyp[1][1]]),
                           yseq=list(hyp[0]),
                           dec_state=None,
                           mmi_tot_score=lm_cache[hyp[0]][0] \
                             if lm_cache is not None else 0.0
                           )
                           for hyp in cur_hyps
               ]

        return hyps 

    def default_beam_search(self, h: torch.Tensor) -> List[Hypothesis]:
        """Beam search implementation.

        For every encoder frame, the best-scoring pending hypothesis is
        repeatedly popped and expanded with a blank (moved to the kept set)
        and with the ``beam_k`` best non-blank tokens (pushed back to the
        pending set). The frame is finished once at least ``beam`` kept
        hypotheses score higher than the best pending one.

        Args:
            h: Encoded speech features (T_max, D_enc)

        Returns:
            nbest_hyps: N-best decoding results, as returned by
                ``self.sort_nbest`` on the surviving hypotheses

        """
        beam = min(self.beam_size, self.vocab_size)
        # Non-blank candidates are taken from ytu[1:], which has vocab_size - 1 entries.
        beam_k = min(beam, (self.vocab_size - 1))

        dec_state = self.decoder.init_state(1)

        # Decoding starts from a single hypothesis holding only the blank token.
        kept_hyps = [Hypothesis(score=0.0, yseq=[self.blank], dec_state=dec_state)]
        # Passed to decoder.score on every call; presumably memoizes decoder
        # outputs per label sequence -- confirm against the decoder.
        cache = {}

        for hi in h:
            # Hypotheses kept at the previous frame become the pending set here.
            hyps = kept_hyps
            kept_hyps = []

            while True:
                # Pop the best pending hypothesis.
                max_hyp = max(hyps, key=lambda x: x.score)
                hyps.remove(max_hyp)

                y, state, lm_tokens = self.decoder.score(max_hyp, cache)

                ytu = torch.log_softmax(self.joint_network(hi, y), dim=-1)
                # Index 0 is handled separately as blank, so rank only ytu[1:].
                top_k = ytu[1:].topk(beam_k, dim=-1)

                # add a blank only
                kept_hyps.append(
                    Hypothesis(
                        score=(max_hyp.score + float(ytu[0:1])),
                        yseq=max_hyp.yseq[:],
                        dec_state=max_hyp.dec_state,
                        lm_state=max_hyp.lm_state,
                    )
                )

                if self.use_lm:
                    lm_state, lm_scores = self.lm.predict(max_hyp.lm_state, lm_tokens)
                else:
                    lm_state = max_hyp.lm_state

                for logp, k in zip(*top_k):
                    score = max_hyp.score + float(logp)

                    # k indexes ytu[1:], hence the +1 to get the vocabulary id.
                    if self.use_lm:
                        score += self.lm_weight * lm_scores[0][k + 1]

                    hyps.append(
                        Hypothesis(
                            score=score,
                            yseq=max_hyp.yseq[:] + [int(k + 1)],
                            dec_state=state,
                            lm_state=lm_state,
                        )
                    )

                # Stop expanding this frame once `beam` kept hypotheses beat
                # the best hypothesis that is still pending.
                hyps_max = float(max(hyps, key=lambda x: x.score).score)
                kept_most_prob = sorted(
                    [hyp for hyp in kept_hyps if hyp.score > hyps_max],
                    key=lambda x: x.score,
                )
                if len(kept_most_prob) >= beam:
                    kept_hyps = kept_most_prob
                    break

        return self.sort_nbest(kept_hyps)

    def time_sync_decoding(self, h: torch.Tensor) -> List[Hypothesis]:
        """Time synchronous beam search implementation.

        Based on https://ieeexplore.ieee.org/document/9053040

        Working sets used below:
            B: hypotheses carried from one encoder frame to the next
            A: blank-terminated hypotheses collected for the current frame
            C: hypotheses expanded at the current symbol step
            D: non-blank extensions of C produced at the current symbol step

        Args:
            h: Encoded speech features (T_max, D_enc)

        Returns:
            nbest_hyps: N-best decoding results, as returned by
                ``self.sort_nbest`` on the final B

        """
        beam = min(self.beam_size, self.vocab_size)

        beam_state = self.decoder.init_state(beam)

        # Decoding starts from a single hypothesis holding only the blank token.
        B = [
            Hypothesis(
                yseq=[self.blank],
                score=0.0,
                dec_state=self.decoder.select_state(beam_state, 0),
            )
        ]
        # Passed to decoder.batch_score on every call; presumably memoizes
        # decoder outputs per label sequence -- confirm against the decoder.
        cache = {}

        if self.use_lm and not self.is_wordlm:
            B[0].lm_state = init_lm_state(self.lm_predictor)

        for hi in h:
            A = []
            C = B

            h_enc = hi.unsqueeze(0)

            # At most max_sym_exp symbol steps are taken on each frame.
            for v in range(self.max_sym_exp):
                D = []

                beam_y, beam_state, beam_lm_tokens = self.decoder.batch_score(
                    C,
                    beam_state,
                    cache,
                    self.use_lm,
                )

                beam_logp = torch.log_softmax(self.joint_network(h_enc, beam_y), dim=-1)
                # Column 0 is used as the blank score below, so rank only columns 1:.
                beam_topk = beam_logp[:, 1:].topk(beam, dim=-1)

                # Snapshot of the label sequences in A taken before this step's
                # blank extensions are appended.
                seq_A = [h.yseq for h in A]

                for i, hyp in enumerate(C):
                    if hyp.yseq not in seq_A:
                        A.append(
                            Hypothesis(
                                score=(hyp.score + float(beam_logp[i, 0])),
                                yseq=hyp.yseq[:],
                                dec_state=hyp.dec_state,
                                lm_state=hyp.lm_state,
                            )
                        )
                    else:
                        # Same label sequence already in A: merge the two
                        # scores in the log domain.
                        dict_pos = seq_A.index(hyp.yseq)

                        A[dict_pos].score = np.logaddexp(
                            A[dict_pos].score, (hyp.score + float(beam_logp[i, 0]))
                        )

                # The last symbol step only emits blanks; no further expansion.
                if v < (self.max_sym_exp - 1):
                    if self.use_lm:
                        beam_lm_states = create_lm_batch_state(
                            [c.lm_state for c in C], self.lm_layers, self.is_wordlm
                        )

                        beam_lm_states, beam_lm_scores = self.lm.buff_predict(
                            beam_lm_states, beam_lm_tokens, len(C)
                        )

                    for i, hyp in enumerate(C):
                        # +1 converts indices into beam_logp[:, 1:] back to vocabulary ids.
                        for logp, k in zip(beam_topk[0][i], beam_topk[1][i] + 1):
                            new_hyp = Hypothesis(
                                score=(hyp.score + float(logp)),
                                yseq=(hyp.yseq + [int(k)]),
                                dec_state=self.decoder.select_state(beam_state, i),
                                lm_state=hyp.lm_state,
                            )

                            if self.use_lm:
                                new_hyp.score += self.lm_weight * beam_lm_scores[i, k]

                                new_hyp.lm_state = select_lm_state(
                                    beam_lm_states, i, self.lm_layers, self.is_wordlm
                                )

                            D.append(new_hyp)

                # Keep the `beam` best extensions for the next symbol step.
                C = sorted(D, key=lambda x: x.score, reverse=True)[:beam]

            # Keep the `beam` best blank-terminated hypotheses for the next frame.
            B = sorted(A, key=lambda x: x.score, reverse=True)[:beam]

        return self.sort_nbest(B)

    def align_length_sync_decoding(self, h: torch.Tensor) -> List[Hypothesis]:
        """Alignment-length synchronous beam search implementation.

        Based on https://ieeexplore.ieee.org/document/9053040

        All hypotheses in the beam share the same alignment length
        (frame index + number of emitted tokens). On top of the joint-network
        score, the following optional scores are added when the corresponding
        attribute is enabled: CTC, neural LM, word-level N-gram, TLG, N-gram
        and MMI.

        Args:
            h: Encoded speech features (T_max, D_enc)

        Returns:
            nbest_hyps: N-best decoding results. If no hypothesis reached the
                last frame, a warning is printed and the current beam ``B``
                is returned instead.

        """

        # Keep a handle on the encoder output: the name `h` is rebound by the
        # TLG final-score loop near the end of this method.
        hidden = h
        beam = min(self.beam_size, self.vocab_size)

        h_length = int(h.size(0))
        u_max = min(self.u_max, (h_length - 1))

        beam_state = self.decoder.init_state(beam)

        # Decoding starts from a single hypothesis holding only the blank token.
        B = [
            Hypothesis(
                yseq=[self.blank],
                score=0.0,
                dec_state=self.decoder.select_state(beam_state, 0),
                mmi_tot_score=0.0,
                word_ngram_score=0.0,
                tlg_state=self.tlg_scorer.init_state() if self.tlg_scorer else None
            )
        ]
        # final hypothesis set to return
        final = []
        # For hypothesis with same yseq, its decoder output could be cached
        # yseq -> decoder_out, decoder_state
        cache = {}

        # lm initialization
        if self.use_lm and not self.is_wordlm:
            if hasattr(self, "lm_predictor"):
                B[0].lm_state = init_lm_state(self.lm_predictor)
            else:
                B[0].lm_state = self.lm.init_state(h)

        # Values computed once per utterance and passed to the on-the-fly MMI
        # scorer at every step.
        if self.mmi_scorer is not None and self.mmi_weight > 0.0:
            mmi_nnet_output, mmi_den_scores = self.mmi_scorer.den_scores(h)

        # Frame-level CTC log-probabilities for the whole utterance.
        if self.ctc_module:
            ctc_pred = self.ctc_module.log_softmax(h.unsqueeze(0))[0]

        for tu_sum in range(h_length + u_max):
            A = [] # collection for next step
            B_ = [] # collection for search in this step. state of pred. net is kept
            h_states = [] # collection of encoder_out frame for each hypothesis
            p_ctc = [] # collection of ctc distribution
            for j, hyp in enumerate(B): # skip hypotheses whose frame index is past the last frame
                u = len(hyp.yseq) - 1
                # NOTE(review): with tu_sum == 0 and u == 0 this gives t == 1,
                # so h[0] does not appear to be consumed by this loop -- confirm
                # this offset is intended.
                t = tu_sum - u + 1

                if t > (h_length - 1):
                    continue

                B_.append(hyp)
                h_states.append((t, h[t]))

                if self.ctc_module:
                    p_ctc.append(ctc_pred[t])

            if B_:
                beam_y, beam_state, beam_lm_tokens = self.decoder.batch_score(
                    B_,
                    beam_state,
                    cache,
                    self.use_lm,
                )

                h_enc = torch.stack([h[1] for h in h_states])

                # [beam, h_dim], [beam, h_dim]
                beam_logp = torch.log_softmax(self.joint_network(h_enc, beam_y), dim=-1) # [beam, vocab]
                # Push forbidden tokens to a very low score, then renormalize.
                if self.forbid_lst:
                    beam_logp[:, self.forbid_lst] = -1e20
                    beam_logp = torch.log_softmax(beam_logp, dim=-1)

                # Interpolate with the CTC log-probabilities of the same frames.
                if self.ctc_module:
                    p_ctc = torch.stack([p for p in p_ctc])
                    beam_logp += self.ctc_weight * p_ctc

                # warning: like in LASCTC, the LM score would not be considered in top-k process
                beam_topk = beam_logp[:, 1:].topk(beam, dim=-1) # values and indices: [beam, beam]. blank excluded

                # Non-transformer LMs are scored for the whole batch at once.
                if self.use_lm and not self.is_transformer_lm:
                    beam_lm_states = create_lm_batch_state(
                        [b.lm_state for b in B_], self.lm_layers, self.is_wordlm
                    )

                    beam_lm_states, beam_lm_scores = self.lm.buff_predict(
                        beam_lm_states, beam_lm_tokens, len(B_)
                    )

                for i, hyp in enumerate(B_):
                    # Blank extension: same labels, move on to the next frame.
                    new_hyp = Hypothesis(
                        score=(hyp.score + float(beam_logp[i, 0])),
                        yseq=hyp.yseq[:],
                        dec_state=hyp.dec_state,
                        lm_state=hyp.lm_state,
                        mmi_tot_score=hyp.mmi_tot_score,
                        word_ngram_score=hyp.word_ngram_score,
                        tlg_state=hyp.tlg_state,
                    )

                    # A blank emitted on the last frame finishes the hypothesis;
                    # the same object is also appended to A below.
                    if h_states[i][0] == (h_length - 1):
                        final.append(new_hyp)

                    A.append(new_hyp)

                    # Only search a part of candidate tokens
                    if self.word_ngram_scorer and self.word_ngram_weight > 0.0:
                        # +1 converts indices into beam_logp[:, 1:] back to vocabulary ids.
                        next_tokens = beam_topk[1][i] + 1
                        word_ngram_scores, word_ngram_states = self.word_ngram_scorer.score_partial(
                                                                 hyp.yseq[1:], next_tokens,
                                                                 hyp.word_ngram_score, None)
                    else:
                        # Placeholders so the indexing by j below still works.
                        word_ngram_scores = [0.0] * len(beam_topk[1][i])
                        word_ngram_states = [0.0] * len(beam_topk[1][i])

                    if self.tlg_scorer and self.tlg_weight > 0.0:
                        next_tokens = beam_topk[1][i] + 1
                        tlg_scores, tlg_states = self.tlg_scorer.score_partial(
                                                     None, next_tokens,
                                                     hyp.tlg_state, None)
                    else:
                        # Placeholders so the indexing by j below still works.
                        tlg_scores = [0.0] * len(beam_topk[1][i])
                        tlg_states = [None] * len(beam_topk[1][i])

                    # Transformer LMs are scored once per hypothesis.
                    if self.use_lm and self.is_transformer_lm:
                        lm_score, lm_state = self.lm.score(torch.Tensor(hyp.yseq).long(),
                                                           hyp.lm_state,
                                                           None)

                    # Non-blank extensions: one new hypothesis per top-k token.
                    for j, (logp, k) in enumerate(zip(beam_topk[0][i], beam_topk[1][i] + 1)):

                        new_hyp = Hypothesis(
                            score=(hyp.score + float(logp)),
                            yseq=(hyp.yseq[:] + [int(k)]),
                            dec_state=self.decoder.select_state(beam_state, i),
                            lm_state=hyp.lm_state,
                            mmi_tot_score=hyp.mmi_tot_score,
                            word_ngram_score=word_ngram_states[j],
                            tlg_state=tlg_states[j]
                        )

                        # add LM scores. possibly 5 styles
                        if self.use_lm and not self.is_transformer_lm:
                            new_hyp.score += self.lm_weight * beam_lm_scores[i, k]

                            new_hyp.lm_state = select_lm_state(
                                beam_lm_states, i, self.lm_layers, self.is_wordlm
                            )

                        if self.use_lm and self.is_transformer_lm:
                            new_hyp.score += self.lm_weight * lm_score[k]

                            new_hyp.lm_state = lm_state

                        # Word-level N-gram LM
                        if self.word_ngram_scorer and self.word_ngram_weight > 0.0:
                            new_hyp.score += self.word_ngram_weight * word_ngram_scores[j]

                        # TLG.fst
                        if self.tlg_scorer and self.tlg_weight > 0.0:
                            new_hyp.score += self.tlg_weight * tlg_scores[j]

                        # N-gram LM
                        if self.ngram_scorer and self.ngram_weight > 0.0:
                            ngram_score, _ = self.ngram_scorer.score_partial(
                                             torch.Tensor(hyp.yseq[:]).int(),
                                             torch.Tensor([int(k)]).int(),
                                             None, h)
                            new_hyp.score += self.ngram_weight * ngram_score.item()

                        A.append(new_hyp)

            if self.eng_vocab is not None:
                A = self.vocab_regularization(A)

            # On-the-fly MMI scoring of all candidates at this alignment length.
            if self.mmi_scorer is not None and self.mmi_weight > 0.0:
                A = self.mmi_scorer.batch_score(A, mmi_nnet_output, mmi_den_scores, tu_sum+1, self.mmi_weight)

            # unlike the original implementation, we combine the hypotheses before pruning
            # this allow the hypothesis different and possibly make the rescore more effective
            B = recombine_hyps(A, self.mmi_weight)
            B = sorted(B, key=lambda x: x.score, reverse=True)[:beam]

        # Add the TLG final scores to the finished hypotheses.
        if self.tlg_scorer and self.tlg_weight > 0.0 and final:
            tlg_final_states = [h.tlg_state for h in final]
            tlg_final_scores = self.tlg_scorer.final_score(tlg_final_states)
            # This loop rebinds `h`; the encoder output remains available as `hidden`.
            for i, h in enumerate(final):
                h.score += self.tlg_weight * tlg_final_scores[i]

        # With an MMI scorer but zero on-the-fly weight, rescore the finished
        # hypotheses once at the end instead.
        if self.mmi_scorer is not None and self.mmi_weight == 0.0:
            final = self.mmi_scorer.batch_rescore(final, hidden)

        if final:
            return self.sort_nbest(final)
        else:
            print("Warning: No finished hypothesis found. return the partial hypothesis", flush=True)
            return B

    def nsc_beam_search(self, h: torch.Tensor) -> List[NSCHypothesis]:
        """N-step constrained beam search implementation.

        Based and modified from https://arxiv.org/pdf/2002.03577.pdf.
        Please reference ESPnet (b-flo, PR #2444) for any usage outside ESPnet
        until further modifications.

        Note: the algorithm is not in its "complete" form but works almost as
        intended.

        Each frame is processed in two phases: a prefix search that adds, to
        longer hypotheses, the score of reaching them from shorter kept
        hypotheses, followed by up to ``self.nstep`` symbol expansions.

        Args:
            h: Encoded speech features (T_max, D_enc)

        Returns:
            nbest_hyps: N-best decoding results, as returned by
                ``self.sort_nbest`` on the surviving hypotheses

        """
        beam = min(self.beam_size, self.vocab_size)
        # Non-blank candidates come from columns 1:, which has vocab_size - 1 entries.
        beam_k = min(beam, (self.vocab_size - 1))

        beam_state = self.decoder.init_state(beam)

        # Temporary blank-only hypothesis used to obtain the first decoder output.
        init_tokens = [
            NSCHypothesis(
                yseq=[self.blank],
                score=0.0,
                dec_state=self.decoder.select_state(beam_state, 0),
            )
        ]

        # Passed to decoder.batch_score on every call; presumably memoizes
        # decoder outputs per label sequence -- confirm against the decoder.
        cache = {}

        beam_y, beam_state, beam_lm_tokens = self.decoder.batch_score(
            init_tokens,
            beam_state,
            cache,
            self.use_lm,
        )

        state = self.decoder.select_state(beam_state, 0)

        if self.use_lm:
            beam_lm_states, beam_lm_scores = self.lm.buff_predict(
                None, beam_lm_tokens, 1
            )
            lm_state = select_lm_state(
                beam_lm_states, 0, self.lm_layers, self.is_wordlm
            )
            lm_scores = beam_lm_scores[0]
        else:
            lm_state = None
            lm_scores = None

        # Initial hypothesis; `y` accumulates the decoder outputs of the hypothesis.
        kept_hyps = [
            NSCHypothesis(
                yseq=[self.blank],
                score=0.0,
                dec_state=state,
                y=[beam_y[0]],
                lm_state=lm_state,
                lm_scores=lm_scores,
            )
        ]

        for hi in h:
            # Longest label sequences first, so hyp_j below is never shorter than hyp_i.
            hyps = sorted(kept_hyps, key=lambda x: len(x.yseq), reverse=True)
            kept_hyps = []

            h_enc = hi.unsqueeze(0)

            # Prefix search over every (hyp_j, hyp_i) pair with hyp_i later in the list.
            for j, hyp_j in enumerate(hyps[:-1]):
                for hyp_i in hyps[(j + 1) :]:
                    curr_id = len(hyp_j.yseq)
                    next_id = len(hyp_i.yseq)

                    # is_prefix is defined elsewhere; presumably true when
                    # hyp_i.yseq is a prefix of hyp_j.yseq -- verify.
                    if (
                        is_prefix(hyp_j.yseq, hyp_i.yseq)
                        and (curr_id - next_id) <= self.prefix_alpha
                    ):
                        ytu = torch.log_softmax(
                            self.joint_network(hi, hyp_i.y[-1]), dim=-1
                        )

                        # Score of emitting hyp_j's first extra token from hyp_i ...
                        curr_score = hyp_i.score + float(ytu[hyp_j.yseq[next_id]])

                        # ... followed by hyp_j's remaining extra tokens, all on this frame.
                        for k in range(next_id, (curr_id - 1)):
                            ytu = torch.log_softmax(
                                self.joint_network(hi, hyp_j.y[k]), dim=-1
                            )

                            curr_score += float(ytu[hyp_j.yseq[k + 1]])

                        # Merge that path into hyp_j's score in the log domain.
                        hyp_j.score = np.logaddexp(hyp_j.score, curr_score)

            # S: blank extensions gathered over all steps of this frame.
            S = []
            # V: non-blank extensions; pruned to `beam` entries at every step.
            V = []
            for n in range(self.nstep):
                beam_y = torch.stack([hyp.y[-1] for hyp in hyps])

                beam_logp = torch.log_softmax(self.joint_network(h_enc, beam_y), dim=-1)
                beam_topk = beam_logp[:, 1:].topk(beam_k, dim=-1)

                for i, hyp in enumerate(hyps):
                    S.append(
                        NSCHypothesis(
                            yseq=hyp.yseq[:],
                            score=hyp.score + float(beam_logp[i, 0:1]),
                            y=hyp.y[:],
                            dec_state=hyp.dec_state,
                            lm_state=hyp.lm_state,
                            lm_scores=hyp.lm_scores,
                        )
                    )

                    # +1 converts indices into beam_logp[:, 1:] back to vocabulary ids.
                    for logp, k in zip(beam_topk[0][i], beam_topk[1][i] + 1):
                        score = hyp.score + float(logp)

                        if self.use_lm:
                            score += self.lm_weight * float(hyp.lm_scores[k])

                        V.append(
                            NSCHypothesis(
                                yseq=hyp.yseq[:] + [int(k)],
                                score=score,
                                y=hyp.y[:],
                                dec_state=hyp.dec_state,
                                lm_state=hyp.lm_state,
                                lm_scores=hyp.lm_scores,
                            )
                        )

                V.sort(key=lambda x: x.score, reverse=True)
                # substract is defined elsewhere; presumably drops from V the
                # hypotheses already present in hyps -- verify.
                V = substract(V, hyps)[:beam]

                # Run the decoder on the newly extended hypotheses.
                beam_state = self.decoder.create_batch_states(
                    beam_state,
                    [v.dec_state for v in V],
                    [v.yseq for v in V],
                )
                beam_y, beam_state, beam_lm_tokens = self.decoder.batch_score(
                    V,
                    beam_state,
                    cache,
                    self.use_lm,
                )

                if self.use_lm:
                    beam_lm_states = create_lm_batch_state(
                        [v.lm_state for v in V], self.lm_layers, self.is_wordlm
                    )
                    beam_lm_states, beam_lm_scores = self.lm.buff_predict(
                        beam_lm_states, beam_lm_tokens, len(V)
                    )

                if n < (self.nstep - 1):
                    # Intermediate step: refresh the states and expand V again.
                    for i, v in enumerate(V):
                        v.y.append(beam_y[i])

                        v.dec_state = self.decoder.select_state(beam_state, i)

                        if self.use_lm:
                            v.lm_state = select_lm_state(
                                beam_lm_states, i, self.lm_layers, self.is_wordlm
                            )
                            v.lm_scores = beam_lm_scores[i]

                    hyps = V[:]
                else:
                    # Last step: recompute the joint output with the fresh decoder outputs.
                    beam_logp = torch.log_softmax(
                        self.joint_network(h_enc, beam_y), dim=-1
                    )

                    for i, v in enumerate(V):
                        # The blank log-probability is added only when nstep != 1.
                        if self.nstep != 1:
                            v.score += float(beam_logp[i, 0])

                        v.y.append(beam_y[i])

                        v.dec_state = self.decoder.select_state(beam_state, i)

                        if self.use_lm:
                            v.lm_state = select_lm_state(
                                beam_lm_states, i, self.lm_layers, self.is_wordlm
                            )
                            v.lm_scores = beam_lm_scores[i]

            # Keep the `beam` best of the blank and non-blank extensions for the next frame.
            kept_hyps = sorted((S + V), key=lambda x: x.score, reverse=True)[:beam]

        return self.sort_nbest(kept_hyps)

# wenet log_add implementation used in beam search
def log_add(args: List[float]) -> float:
    """Return ``log(sum(exp(a) for a in args))`` computed stably.

    The largest value is factored out before exponentiating, so large
    negative log-probabilities do not underflow to ``log(0)``.

    Args:
        args: Log-domain values to add. ``-inf`` entries contribute nothing.

    Returns:
        The log of the summed exponentials. ``-inf`` when ``args`` is empty
        or contains only ``-inf``; ``+inf`` when any entry is ``+inf``.
    """
    neg_inf = -float('inf')
    if all(a == neg_inf for a in args):
        return neg_inf
    a_max = max(args)
    # `inf - inf` below would be NaN; the sum is unambiguously +inf here.
    if a_max == float('inf'):
        return a_max
    lsp = math.log(sum(math.exp(a - a_max) for a in args))
    return a_max + lsp

def is_all_chinese(strs):
    """Return True when every element of ``strs`` lies in U+4E00..U+9FA5.

    An empty input yields True, since there is no element outside the range.
    """
    return all('\u4e00' <= ch <= '\u9fa5' for ch in strs)


================================================
FILE: nets/chainer_backend/__init__.py
================================================
"""Initialize sub package."""


================================================
FILE: nets/chainer_backend/asr_interface.py
================================================
"""ASR Interface module."""
import chainer

from espnet.nets.asr_interface import ASRInterface


class ChainerASRInterface(ASRInterface, chainer.Chain):
    """ASR interface for Chainer-based ESPnet models.

    Every hook defined here is a placeholder that raises
    ``NotImplementedError``.
    """

    @staticmethod
    def custom_converter(*args, **kw):
        """Get the custom converter of the model (Chainer only)."""
        message = "custom converter method is not implemented"
        raise NotImplementedError(message)

    @staticmethod
    def custom_updater(*args, **kw):
        """Get the custom updater of the model (Chainer only)."""
        message = "custom updater method is not implemented"
        raise NotImplementedError(message)

    @staticmethod
    def custom_parallel_updater(*args, **kw):
        """Get the custom parallel updater of the model (Chainer only)."""
        message = "custom parallel updater method is not implemented"
        raise NotImplementedError(message)

    def get_total_subsampling_factor(self):
        """Get the total subsampling factor."""
        message = "get_total_subsampling_factor method is not implemented"
        raise NotImplementedError(message)


================================================
FILE: nets/chainer_backend/ctc.py
================================================
import logging

import chainer
from chainer import cuda
import chainer.functions as F
import chainer.links as L
import numpy as np


class CTC(chainer.Chain):
    """CTC layer using Chainer's built-in CTC loss.

    Args:
        odim (int): The output dimension.
        eprojs (int | None): Dimension of input vectors from encoder.
        dropout_rate (float): Dropout rate.

    """

    def __init__(self, odim, eprojs, dropout_rate):
        super(CTC, self).__init__()
        self.loss = None
        self.dropout_rate = dropout_rate

        with self.init_scope():
            # Linear projection from encoder outputs to the odim output units.
            self.ctc_lo = L.Linear(eprojs, odim)

    def __call__(self, hs, ys):
        """Compute the CTC loss and store it in ``self.loss``.

        Args:
            hs (list of chainer.Variable | N-dimension array):
                Input variable from encoder.
            ys (list of chainer.Variable | N-dimension array):
                Input variable of decoder.

        Returns:
            chainer.Variable: A variable holding a scalar value of the CTC loss.

        """
        self.loss = None
        tag = self.__class__.__name__
        ilens = [h.shape[0] for h in hs]
        olens = [y.shape[0] for y in ys]

        # Zero-pad the encoder outputs, apply dropout, then project.
        padded_hs = F.pad_sequence(hs)
        dropped_hs = F.dropout(padded_hs, ratio=self.dropout_rate)
        projected = self.ctc_lo(dropped_hs, n_batch_axes=2)
        y_hat = F.separate(projected, axis=1)  # ilen list of batch x hdim

        # Pad the label sequences with -1.
        y_true = F.pad_sequence(ys, padding=-1)  # batch x olen

        # Wrap the lengths as int32 variables on the same array module.
        input_length = chainer.Variable(self.xp.array(ilens, dtype=np.int32))
        label_length = chainer.Variable(self.xp.array(olens, dtype=np.int32))
        logging.info(tag + " input lengths:  " + str(input_length.data))
        logging.info(tag + " output lengths: " + str(label_length.data))

        # The third argument (0) is the blank symbol index.
        loss = F.connectionist_temporal_classification(
            y_hat, y_true, 0, input_length, label_length
        )
        self.loss = loss
        logging.info("ctc loss:" + str(loss.data))

        return loss

    def log_softmax(self, hs):
        """Return the log-softmax of the projected frame activations.

        Args:
            hs (list of chainer.Variable | N-dimension array):
                Input variable from encoder.

        Returns:
            chainer.Variable: A n-dimension float array.

        """
        y_hat = self.ctc_lo(F.pad_sequence(hs), n_batch_axes=2)
        # Flatten to 2-D for log_softmax, then restore the original shape.
        flat = y_hat.reshape(-1, y_hat.shape[-1])
        return F.log_softmax(flat).reshape(y_hat.shape)


class WarpCTC(chainer.Chain):
    """CTC layer using the warp-ctc loss from ``chainer_ctc``.

    Args:
        odim (int): The output dimension.
        eprojs (int | None): Dimension of input vector from encoder.
        dropout_rate (float): Dropout rate.

    """

    def __init__(self, odim, eprojs, dropout_rate):
        super(WarpCTC, self).__init__()
        self.loss = None
        self.dropout_rate = dropout_rate

        with self.init_scope():
            # Linear projection from encoder outputs to the odim output units.
            self.ctc_lo = L.Linear(eprojs, odim)

    def __call__(self, hs, ys):
        """Compute the warp-ctc loss and store it in ``self.loss``.

        Args:
            hs (iterable of chainer.Variable | N-dimension array):
                Input variable from encoder.
            ys (iterable of chainer.Variable | N-dimension array):
                Input variable of decoder.

        Returns:
           chainer.Variable: A variable holding a scalar value of the CTC loss.

        """
        self.loss = None
        tag = self.__class__.__name__
        ilens = [h.shape[0] for h in hs]
        olens = [y.shape[0] for y in ys]

        # Zero-pad the encoder outputs, apply dropout, then project.
        padded_hs = F.pad_sequence(hs)
        dropped_hs = F.dropout(padded_hs, ratio=self.dropout_rate)
        projected = self.ctc_lo(dropped_hs, n_batch_axes=2)
        y_hat = projected.transpose(1, 0, 2)  # batch x frames x hdim

        logging.info(tag + " input lengths:  " + str(ilens))
        logging.info(tag + " output lengths: " + str(olens))

        # Imported here so the module loads even when chainer_ctc is absent.
        from chainer_ctc.warpctc import ctc as warp_ctc

        # Labels are handed to warp_ctc as CPU arrays.
        cpu_labels = [cuda.to_cpu(y.data) for y in ys]
        loss = warp_ctc(y_hat, ilens, cpu_labels)[0]
        self.loss = loss
        logging.info("ctc loss:" + str(loss.data))

        return loss

    def log_softmax(self, hs):
        """Return the log-softmax of the projected frame activations.

        Args:
            hs (list of chainer.Variable | N-dimension array):
                Input variable from encoder.

        Returns:
            chainer.Variable: A n-dimension float array.

        """
        y_hat = self.ctc_lo(F.pad_sequence(hs), n_batch_axes=2)
        # Flatten to 2-D for log_softmax, then restore the original shape.
        flat = y_hat.reshape(-1, y_hat.shape[-1])
        return F.log_softmax(flat).reshape(y_hat.shape)

    def argmax(self, hs_pad):
        """Return the argmax of the projected frame activations.

        :param chainer variable hs_pad: 3d tensor (B, Tmax, eprojs)
        :return: argmax applied 2d tensor (B, Tmax)
        :rtype: chainer.Variable
        """
        projected = self.ctc_lo(F.pad_sequence(hs_pad), n_batch_axes=2)
        return F.argmax(projected, axis=-1)


def ctc_for(args, odim):
    """Build the CTC layer selected by ``args.ctc_type``.

    Args:
        args (Namespace): The program arguments; ``ctc_type`` is read first,
            then ``eprojs`` and ``dropout_rate`` for the chosen layer.
        odim (int): The output dimension.

    Returns:
        The CTC module.

    Raises:
        ValueError: If ``args.ctc_type`` is neither "builtin" nor "warpctc".

    """
    ctc_type = args.ctc_type

    # Reject unknown types before touching any other argument.
    if ctc_type not in ("builtin", "warpctc"):
        raise ValueError('ctc_type must be "builtin" or "warpctc": {}'.format(ctc_type))

    if ctc_type == "builtin":
        logging.info("Using chainer CTC implementation")
        return CTC(odim, args.eprojs, args.dropout_rate)

    logging.info("Using warpctc CTC implementation")
    return WarpCTC(odim, args.eprojs, args.dropout_rate)


================================================
FILE: nets/chainer_backend/deterministic_embed_id.py
================================================
import numpy
import six

import chainer
from chainer import cuda
from chainer import function_node
from chainer.initializers import normal

# from chainer.functions.connection import embed_id
from chainer import link
from chainer.utils import type_check
from chainer import variable

"""Deterministic EmbedID link and function

   copied from chainer/links/connection/embed_id.py
   and chainer/functions/connection/embed_id.py,
   and modified not to use atomicAdd operation
"""


class EmbedIDFunction(function_node.FunctionNode):
    """Embedding lookup ``W[x]`` whose gradient is computed by ``EmbedIDGrad``.

    Args:
        ignore_label (int or None): If not None, positions of ``x`` equal to
            this value produce all-zero output rows.

    """

    def __init__(self, ignore_label=None):
        self.ignore_label = ignore_label
        # Shape of W seen in the last forward pass; needed to size the gradient.
        self._w_shape = None

    def check_type_forward(self, in_types):
        """Require two inputs: integer IDs (ndim >= 1) and a 2-D float32 ``W``."""
        type_check.expect(in_types.size() == 2)
        x_type, w_type = in_types
        type_check.expect(
            x_type.dtype.kind == "i",
            x_type.ndim >= 1,
        )
        type_check.expect(w_type.dtype == numpy.float32, w_type.ndim == 2)

    def forward(self, inputs):
        """Gather the rows of ``W`` selected by ``x``.

        Args:
            inputs (tuple): ``(x, W)`` arrays.

        Returns:
            tuple: One-element tuple holding the gathered rows.

        Raises:
            ValueError: If ``x`` and ``W`` are not the same array type, or, in
                chainer debug mode, if a non-ignored ID is outside
                ``[0, len(W))``.

        """
        # Only x is needed again in backward; W is represented by its shape.
        self.retain_inputs((0,))
        x, W = inputs
        self._w_shape = W.shape

        if not type_check.same_types(*inputs):
            raise ValueError(
                "numpy and cupy must not be used together\n"
                "type(W): {0}, type(x): {1}".format(type(W), type(x))
            )

        xp = cuda.get_array_module(*inputs)
        if chainer.is_debug():
            valid_x = xp.logical_and(0 <= x, x < len(W))
            if self.ignore_label is not None:
                valid_x = xp.logical_or(valid_x, x == self.ignore_label)
            if not valid_x.all():
                # The two message halves were previously concatenated without
                # a separating space.
                raise ValueError(
                    "Each not ignored `x` value need to satisfy "
                    "`0 <= x < len(W)`"
                )

        if self.ignore_label is not None:
            mask = x == self.ignore_label
            # Look up row 0 for ignored positions (always a safe index), then
            # overwrite those rows with zeros.
            return (xp.where(mask[..., None], 0, W[xp.where(mask, 0, x)]),)

        return (W[x],)

    def backward(self, indexes, grad_outputs):
        """Return ``(None, gW)``; there is no gradient w.r.t. the IDs."""
        inputs = self.get_retained_inputs()
        grad_fn = EmbedIDGrad(self._w_shape, self.ignore_label)
        gW = grad_fn.apply(inputs + grad_outputs)[0]
        return None, gW


class EmbedIDGrad(function_node.FunctionNode):
    """Gradient of the embedding lookup with respect to ``W``.

    On non-numpy arrays the gradient is computed as a one-hot matrix product
    instead of the atomicAdd-based scatter kernel this code was derived from.

    Args:
        w_shape (tuple): Shape of the embedding matrix ``W``.
        ignore_label (int or None): ID whose positions contribute no gradient.

    """

    def __init__(self, w_shape, ignore_label=None):
        self.w_shape = w_shape
        self.ignore_label = ignore_label
        self._gy_shape = None

    def forward(self, inputs):
        """Accumulate the upstream gradient rows into ``gW``.

        Args:
            inputs (tuple): ``(x, gy)`` -- the IDs and the gradient of the
                looked-up embeddings.

        Returns:
            tuple: One-element tuple holding ``gW`` with shape ``w_shape``.

        """
        self.retain_inputs((0,))
        xp = cuda.get_array_module(*inputs)
        x, gy = inputs
        self._gy_shape = gy.shape
        gW = xp.zeros(self.w_shape, dtype=gy.dtype)

        if xp is numpy:
            # It is equivalent to `numpy.add.at(gW, x, gy)` but ufunc.at is
            # too slow.
            for ix, igy in six.moves.zip(x.ravel(), gy.reshape(x.size, -1)):
                if ix == self.ignore_label:
                    continue
                gW[ix] += igy
        else:
            # EmbedID gradient alternative without atomicAdd, which simply
            # creates a one-hot vector and applies dot product.
            n_vocab = len(gW)
            ids = x.ravel()
            ignored = None
            if self.ignore_label is not None:
                ignored = ids == self.ignore_label
                # Redirect ignored positions to column 0 so the flat index
                # below stays inside the row even when ignore_label is
                # negative or >= n_vocab.
                ids = xp.where(ignored, 0, ids)
            xi = xp.zeros((x.size, n_vocab), dtype=numpy.float32)
            idx = xp.arange(x.size, dtype=numpy.int32) * n_vocab + ids
            xi.ravel()[idx] = 1.0
            if ignored is not None:
                # Clear the *rows* of ignored positions. Clearing the column
                # `xi[:, ignore_label]` (as was done before) wipes the genuine
                # gradient of the last vocabulary ID when ignore_label == -1.
                xi *= xp.logical_not(ignored).astype(xi.dtype)[:, None]
            gW = xi.T.dot(gy.reshape(x.size, -1)).astype(gW.dtype, copy=False)

        return (gW,)

    def backward(self, indexes, grads):
        """Double-backward: gather rows of ``ggW`` for every ID.

        Returns:
            tuple: ``(None, ggy)`` where rows at ignored positions are zero.

        """
        xp = cuda.get_array_module(*grads)
        x = self.get_retained_inputs()[0].data
        ggW = grads[0]

        if self.ignore_label is not None:
            mask = x == self.ignore_label
            # To prevent index out of bounds, we need to check if ignore_label
            # is a valid *row* of W (axis 0); x indexes rows of ggW.
            if not (0 <= self.ignore_label < self.w_shape[0]):
                x = xp.where(mask, 0, x)

        ggy = ggW[x]

        if self.ignore_label is not None:
            mask, zero, _ = xp.broadcast_arrays(
                mask[..., None], xp.zeros((), "f"), ggy.data
            )
            ggy = chainer.functions.where(mask, zero, ggy)
        return None, ggy


def embed_id(x, W, ignore_label=None):
    r"""Look up embedding vectors for integer IDs.

    This is the functional form of a *word embedding* layer: given IDs ``x``
    and an embedding matrix ``W`` (:math:`V \times d`, float32), the result
    holds ``W[x[i]]`` at position ``i``. Only ``W`` receives a gradient.

    Args:
        x (chainer.Variable | np.ndarray): Batch vectors of IDs. Each
            element must be signed integer.
        W (chainer.Variable | np.ndarray): Distributed representation
            of each ID (a.k.a. word embeddings).
        ignore_label (int): If given, positions of ``x`` equal to this value
            are filled with 0 in the result.

    Returns:
        chainer.Variable: Embedded variable.

    Examples:

        >>> x = np.array([2, 1]).astype('i')
        >>> W = np.array([[0, 0, 0],
        ...               [1, 1, 1],
        ...               [2, 2, 2]]).astype('f')
        >>> embed_id(x, W).data
        array([[ 2.,  2.,  2.],
               [ 1.,  1.,  1.]], dtype=float32)
        >>> embed_id(x, W, ignore_label=1).data
        array([[ 2.,  2.,  2.],
               [ 0.,  0.,  0.]], dtype=float32)

    """
    lookup = EmbedIDFunction(ignore_label=ignore_label)
    outputs = lookup.apply((x, W))
    return outputs[0]


class EmbedID(link.Link):
    """Link that owns an embedding matrix and applies :func:`embed_id` to it.

    Args:
        in_size (int): Number of different identifiers (a.k.a. vocabulary size).
        out_size (int): Output dimension.
        initialW (Initializer): Initializer for the weight; when None,
            ``normal.Normal(1.0)`` is used.
        ignore_label (int): If given, positions equal to this ID are filled
            with 0 in the output.

    Attributes:
        W (~chainer.Variable): Embedding parameter matrix of shape
            ``(in_size, out_size)``.

    Examples:

        >>> W = np.array([[0, 0, 0],
        ...               [1, 1, 1],
        ...               [2, 2, 2]]).astype('f')
        >>> l = EmbedID(W.shape[0], W.shape[1], initialW=W)
        >>> x = np.array([2, 1]).astype('i')
        >>> y = l(x)
        >>> y.data
        array([[ 2.,  2.,  2.],
               [ 1.,  1.,  1.]], dtype=float32)

    """

    # Class-level default; overwritten per instance in __init__.
    ignore_label = None

    def __init__(self, in_size, out_size, initialW=None, ignore_label=None):
        super(EmbedID, self).__init__()
        self.ignore_label = ignore_label

        weight_init = normal.Normal(1.0) if initialW is None else initialW
        with self.init_scope():
            # Created inside init_scope so that W is registered as a parameter.
            self.W = variable.Parameter(weight_init, (in_size, out_size))

    def __call__(self, x):
        """Return the embeddings of the given IDs.

        Args:
            x (chainer.Variable): Batch vectors of IDs.

        Returns:
            chainer.Variable: Batch of corresponding embeddings.

        """
        embedded = embed_id(x, self.W, ignore_label=self.ignore_label)
        return embedded


================================================
FILE: nets/chainer_backend/e2e_asr.py
================================================
# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""RNN sequence-to-sequence speech recognition model (chainer)."""

import logging
import math

import chainer
from chainer import reporter
import numpy as np

from espnet.nets.chainer_backend.asr_interface import ChainerASRInterface
from espnet.nets.chainer_backend.ctc import ctc_for
from espnet.nets.chainer_backend.rnn.attentions import att_for
from espnet.nets.chainer_backend.rnn.decoders import decoder_for
from espnet.nets.chainer_backend.rnn.encoders import encoder_for
from espnet.nets.e2e_asr_common import label_smoothing_dist
from espnet.nets.pytorch_backend.e2e_asr import E2E as E2E_pytorch
from espnet.nets.pytorch_backend.nets_utils import get_subsample

CTC_LOSS_THRESHOLD = 10000


class E2E(ChainerASRInterface):
    """Hybrid CTC/attention RNN model for the chainer backend.

    Args:
        idim (int): Dimension of the inputs.
        odim (int): Dimension of the outputs.
        args (parser.args): Training config.
        flag_return (bool): If True, ``forward`` returns
            ``(loss, loss_ctc, loss_att, acc)``; otherwise only the loss.

    """

    @staticmethod
    def add_arguments(parser):
        """Add arguments by delegating to the pytorch E2E implementation."""
        result = E2E_pytorch.add_arguments(parser)
        return result

    def get_total_subsampling_factor(self):
        """Return the encoder conv factor times the product of ``subsample``."""
        conv_factor = self.enc.conv_subsampling_factor
        layer_factor = int(np.prod(self.subsample))
        return conv_factor * layer_factor

    def __init__(self, idim, odim, args, flag_return=True):
        """Construct an E2E object.

        :param int idim: dimension of inputs
        :param int odim: dimension of outputs
        :param Namespace args: argument Namespace containing options
        :param bool flag_return: whether forward() returns the loss details
        """
        chainer.Chain.__init__(self)
        self.mtlalpha = args.mtlalpha
        assert 0 <= self.mtlalpha <= 1, "mtlalpha must be [0,1]"
        self.etype = args.etype
        self.verbose = args.verbose
        self.char_list = args.char_list
        self.outdir = args.outdir

        # sos and eos share one ID: the last index of the output vocabulary.
        last_id = odim - 1
        self.sos = last_id
        self.eos = last_id

        self.subsample = get_subsample(args, mode="asr", arch="rnn")

        # Optional label-smoothing distribution handed to the decoder.
        labeldist = None
        if args.lsm_type:
            logging.info("Use label smoothing with " + args.lsm_type)
            labeldist = label_smoothing_dist(
                odim, args.lsm_type, transcript=args.train_json
            )

        with self.init_scope():
            self.enc = encoder_for(args, idim, self.subsample)
            self.ctc = ctc_for(args, odim)
            self.att = att_for(args)
            self.dec = decoder_for(args, odim, self.sos, self.eos, self.att, labeldist)

        self.acc = None
        self.loss = None
        self.flag_return = flag_return

    def forward(self, xs, ilens, ys):
        """Compute the multi-task (CTC + attention) training loss.

        Args:
            xs: Batch of inputs handed to the encoder.
            ilens: Lengths matching ``xs``, handed to the encoder.
            ys: Batch of targets handed to the CTC module and the decoder.

        Returns:
            The combined loss; when ``flag_return`` is set, the tuple
            ``(loss, loss_ctc, loss_att, acc)`` where unused parts are None.

        """
        hs, ilens = self.enc(xs, ilens)
        alpha = self.mtlalpha

        # CTC branch is skipped when its weight is zero.
        loss_ctc = None if alpha == 0 else self.ctc(hs, ys)

        # Attention branch is skipped when CTC has the full weight.
        if alpha == 1:
            loss_att, acc = None, None
        else:
            loss_att, acc = self.dec(hs, ys)
        self.acc = acc

        if alpha == 0:
            self.loss = loss_att
        elif alpha == 1:
            self.loss = loss_ctc
        else:
            self.loss = alpha * loss_ctc + (1 - alpha) * loss_att

        loss_value = self.loss.data
        reportable = loss_value < CTC_LOSS_THRESHOLD and not math.isnan(loss_value)
        if not reportable:
            logging.warning("loss (=%f) is not correct", self.loss.data)
        else:
            for key, value in (
                ("loss_ctc", loss_ctc),
                ("loss_att", loss_att),
                ("acc", acc),
            ):
                reporter.report({key: value}, self)

            logging.info("mtl loss:" + str(self.loss.data))
            reporter.report({"loss": self.loss}, self)

        if not self.flag_return:
            return self.loss
        return self.loss, loss_ctc, loss_att, acc

    def recognize(self, x, recog_args, char_list, rnnlm=None):
        """Decode one utterance with the decoder's beam search.

        Args:
            x: Input features of a single utterance; indexed as
                ``x[::k, :]``, so a 2-D array is expected.
            recog_args (parser.args): Arguments of config file.
            char_list (List[str]): List of characters.
            rnnlm (Module): Optional language model passed to the decoder.

        Returns:
            The result of ``self.dec.recognize_beam``.

        """
        stride = self.subsample[0]
        frames = x[::stride, :]
        n_frames = self.xp.array(frames.shape[0], dtype=np.int32)
        feats = chainer.Variable(self.xp.array(frames, dtype=np.float32))

        with chainer.no_backprop_mode(), chainer.using_config("train", False):
            # The encoder takes lists, so wrap the single utterance.
            encoded, _ = self.enc([feats], [n_frames])

            # CTC log-probabilities are only needed when CTC scoring is on.
            lpz = None
            if recog_args.ctc_weight > 0.0:
                lpz = self.ctc.log_softmax(encoded).data[0]

            return self.dec.recognize_beam(
                encoded[0], lpz, recog_args, char_list, rnnlm
            )

    def calculate_all_attentions(self, xs, ilens, ys):
        """Run the encoder and return the decoder's attention weights.

        Args:
            xs (List): List of input sequences.
            ilens (np.ndarray): Batch of lengths of input sequences.
            ys (List): List of target id sequences.

        Returns:
            The value of ``self.dec.calculate_all_attentions``.

        """
        hs, ilens = self.enc(xs, ilens)
        return self.dec.calculate_all_attentions(hs, ys)

    @staticmethod
    def custom_converter(subsampling_factor=0):
        """Get customconverter of the model."""
        from espnet.nets.chainer_backend.rnn.training import CustomConverter

        converter = CustomConverter(subsampling_factor=subsampling_factor)
        return converter

    @staticmethod
    def custom_updater(iters, optimizer, converter, device=-1, accum_grad=1):
        """Get custom_updater of the model."""
        from espnet.nets.chainer_backend.rnn.training import CustomUpdater

        options = dict(converter=converter, device=device, accum_grad=accum_grad)
        return CustomUpdater(iters, optimizer, **options)

    @staticmethod
    def custom_parallel_updater(iters, optimizer, converter, devices, accum_grad=1):
        """Get custom_parallel_updater of the model."""
        from espnet.nets.chainer_backend.rnn.training import CustomParallelUpdater

        options = dict(converter=converter, devices=devices, accum_grad=accum_grad)
        return CustomParallelUpdater(iters, optimizer, **options)


================================================
FILE: nets/chainer_backend/e2e_asr_transformer.py
================================================
# encoding: utf-8
"""Transformer-based model for End-to-end ASR."""

from argparse import Namespace
from distutils.util import strtobool
import logging
import math

import chainer
import chainer.functions as F
from chainer import reporter
import numpy as np
import six

from espnet.nets.chainer_backend.asr_interface import ChainerASRInterface
from espnet.nets.chainer_backend.transformer.attention import MultiHeadAttention
from espnet.nets.chainer_backend.transformer import ctc
from espnet.nets.chainer_backend.transformer.decoder import Decoder
from espnet.nets.chainer_backend.transformer.encoder import Encoder
from espnet.nets.chainer_backend.transformer.label_smoothing_loss import (
    LabelSmoothingLoss,  # noqa: H301
)
from espnet.nets.chainer_backend.transformer.training import CustomConverter
from espnet.nets.chainer_backend.transformer.training import CustomUpdater
from espnet.nets.chainer_backend.transformer.training import (
    CustomParallelUpdater,  # noqa: H301
)
from espnet.nets.ctc_prefix_score import CTCPrefixScore
from espnet.nets.e2e_asr_common import end_detect
from espnet.nets.e2e_asr_common import ErrorCalculator
from espnet.nets.pytorch_backend.nets_utils import get_subsample
from espnet.nets.pytorch_backend.transformer.plot import PlotAttentionReport


CTC_SCORING_RATIO = 1.5
MAX_DECODER_OUTPUT = 5


class E2E(ChainerASRInterface):
    """E2E module.

    Args:
        idim (int): Input dimmensions.
        odim (int): Output dimmensions.
        args (Namespace): Training config.
        ignore_id (int, optional): Id for ignoring a character.
        flag_return (bool, optional): If true, return a list with (loss,
        loss_ctc, loss_att, acc) in forward. Otherwise, return loss.

    """

    @staticmethod
    def add_arguments(parser):
        """Register the transformer-specific command line options.

        Args:
            parser: Object providing ``add_argument_group`` (e.g. an
                ``argparse.ArgumentParser``).

        Returns:
            The same ``parser`` that was passed in.

        """
        group = parser.add_argument_group("transformer model setting")

        # (flags, keyword options) pairs, registered in this order.
        option_table = [
            (
                ("--transformer-init",),
                dict(
                    type=str,
                    default="pytorch",
                    help="how to initialize transformer parameters",
                ),
            ),
            (
                ("--transformer-input-layer",),
                dict(
                    type=str,
                    default="conv2d",
                    choices=["conv2d", "linear", "embed"],
                    help="transformer input layer type",
                ),
            ),
            (
                ("--transformer-attn-dropout-rate",),
                dict(
                    default=None,
                    type=float,
                    help="dropout in transformer attention. "
                    "use --dropout-rate if None is set",
                ),
            ),
            (
                ("--transformer-lr",),
                dict(default=10.0, type=float, help="Initial value of learning rate"),
            ),
            (
                ("--transformer-warmup-steps",),
                dict(default=25000, type=int, help="optimizer warmup steps"),
            ),
            (
                ("--transformer-length-normalized-loss",),
                dict(default=True, type=strtobool, help="normalize loss by length"),
            ),
            (
                ("--dropout-rate",),
                dict(default=0.0, type=float, help="Dropout rate for the encoder"),
            ),
            # Encoder
            (
                ("--elayers",),
                dict(
                    default=4,
                    type=int,
                    help="Number of encoder layers (for shared recognition part "
                    "in multi-speaker asr mode)",
                ),
            ),
            (
                ("--eunits", "-u"),
                dict(default=300, type=int, help="Number of encoder hidden units"),
            ),
            # Attention
            (
                ("--adim",),
                dict(
                    default=320,
                    type=int,
                    help="Number of attention transformation dimensions",
                ),
            ),
            (
                ("--aheads",),
                dict(
                    default=4,
                    type=int,
                    help="Number of heads for multi head attention",
                ),
            ),
            # Decoder
            (
                ("--dlayers",),
                dict(default=1, type=int, help="Number of decoder layers"),
            ),
            (
                ("--dunits",),
                dict(default=320, type=int, help="Number of decoder hidden units"),
            ),
        ]
        for flags, options in option_table:
            group.add_argument(*flags, **options)
        return parser

    def get_total_subsampling_factor(self):
        """Return the encoder conv factor times the product of ``subsample``."""
        conv_factor = self.encoder.conv_subsampling_factor
        layer_factor = int(np.prod(self.subsample))
        return conv_factor * layer_factor

    def __init__(self, idim, odim, args, ignore_id=-1, flag_return=True):
        """Initialize the transformer.

        Args:
            idim (int): Input dimension, passed to the encoder.
            odim (int): Output dimension; ``odim - 1`` is used as sos/eos ID.
            args (Namespace): Training config. ``transformer_attn_dropout_rate``
                is overwritten in place with ``dropout_rate`` when it is None.
            ignore_id (int, optional): Stored as ``self.ignore_id``.
            flag_return (bool, optional): Stored as ``self.flag_return``;
                selects what ``forward`` returns.

        """
        chainer.Chain.__init__(self)
        self.mtlalpha = args.mtlalpha
        assert 0 <= self.mtlalpha <= 1, "mtlalpha must be [0,1]"
        # Attention dropout falls back to the general dropout rate.
        if args.transformer_attn_dropout_rate is None:
            args.transformer_attn_dropout_rate = args.dropout_rate
        self.use_label_smoothing = False
        self.char_list = args.char_list
        self.space = args.sym_space
        self.blank = args.sym_blank
        self.scale_emb = args.adim ** 0.5
        # sos and eos share one ID: the last index of the output vocabulary.
        self.sos = odim - 1
        self.eos = odim - 1
        self.subsample = get_subsample(args, mode="asr", arch="transformer")
        self.ignore_id = ignore_id
        # Must run before init_scope: it sets self.initialW / self.initialB,
        # which the encoder, decoder and nothing else below rely on.
        self.reset_parameters(args)
        with self.init_scope():
            self.encoder = Encoder(
                idim=idim,
                attention_dim=args.adim,
                attention_heads=args.aheads,
                linear_units=args.eunits,
                input_layer=args.transformer_input_layer,
                dropout_rate=args.dropout_rate,
                positional_dropout_rate=args.dropout_rate,
                attention_dropout_rate=args.transformer_attn_dropout_rate,
                initialW=self.initialW,
                initial_bias=self.initialB,
            )
            self.decoder = Decoder(
                odim, args, initialW=self.initialW, initial_bias=self.initialB
            )
            # NOTE(review): the class count comes from len(args.char_list),
            # not from odim -- presumably they are equal; confirm.
            self.criterion = LabelSmoothingLoss(
                args.lsm_weight,
                len(args.char_list),
                args.transformer_length_normalized_loss,
            )
            # A CTC module is only built when CTC contributes to the loss.
            if args.mtlalpha > 0.0:
                if args.ctc_type == "builtin":
                    logging.info("Using chainer CTC implementation")
                    self.ctc = ctc.CTC(odim, args.adim, args.dropout_rate)
                elif args.ctc_type == "warpctc":
                    logging.info("Using warpctc CTC implementation")
                    self.ctc = ctc.WarpCTC(odim, args.adim, args.dropout_rate)
                else:
                    raise ValueError(
                        'ctc_type must be "builtin" or "warpctc": {}'.format(
                            args.ctc_type
                        )
                    )
            else:
                self.ctc = None
        self.dims = args.adim
        self.odim = odim
        self.flag_return = flag_return
        # CER/WER are only computed when one of the report flags is set.
        if args.report_cer or args.report_wer:
            self.error_calculator = ErrorCalculator(
                args.char_list,
                args.sym_space,
                args.sym_blank,
                args.report_cer,
                args.report_wer,
            )
        else:
            self.error_calculator = None
        # Namespace-like args are probed with `in`; any other args object is
        # expected to expose a (possibly None) `verbose` attribute.
        if "Namespace" in str(type(args)):
            self.verbose = 0 if "verbose" not in args else args.verbose
        else:
            self.verbose = 0 if args.verbose is None else args.verbose

    def reset_parameters(self, args):
        """Select the weight/bias initializer classes for the model.

        Sets ``self.initialW`` from ``args.transformer_init`` and always sets
        ``self.initialB`` to ``chainer.initializers.Uniform``. The initializer
        classes themselves (not instances) are stored.

        The Glorot options are accepted both with their correct spelling
        (``"glorot_uniform"`` / ``"glorot_normal"``) and with the historical
        misspelling (``"gorot_uniform"`` / ``"gorot_normal"``). Previously the
        correct spelling silently fell through to the default ``Uniform``.

        Args:
            args (Namespace): Transformer config; ``transformer_init`` is read.

        """
        type_init = args.transformer_init
        if type_init == "lecun_uniform":
            logging.info("Using LeCunUniform as Parameter initializer")
            self.initialW = chainer.initializers.LeCunUniform
        elif type_init == "lecun_normal":
            logging.info("Using LeCunNormal as Parameter initializer")
            self.initialW = chainer.initializers.LeCunNormal
        elif type_init in ("glorot_uniform", "gorot_uniform"):
            logging.info("Using GlorotUniform as Parameter initializer")
            self.initialW = chainer.initializers.GlorotUniform
        elif type_init in ("glorot_normal", "gorot_normal"):
            logging.info("Using GlorotNormal as Parameter initializer")
            self.initialW = chainer.initializers.GlorotNormal
        elif type_init == "he_uniform":
            logging.info("Using HeUniform as Parameter initializer")
            self.initialW = chainer.initializers.HeUniform
        elif type_init == "he_normal":
            logging.info("Using HeNormal as Parameter initializer")
            self.initialW = chainer.initializers.HeNormal
        elif type_init == "pytorch":
            logging.info("Using Pytorch initializer")
            self.initialW = chainer.initializers.Uniform
        else:
            # Unknown names are not an error; they get the same Uniform class.
            logging.info("Using Chainer default as Parameter initializer")
            self.initialW = chainer.initializers.Uniform
        self.initialB = chainer.initializers.Uniform

    def forward(self, xs, ilens, ys_pad, calculate_attentions=False):
        """E2E forward propagation.

        Args:
            xs: Batch of inputs handed to the encoder.
            ilens: Lengths matching ``xs``, handed to the encoder.
            ys_pad: Iterable of target id sequences (one array per utterance).
            calculate_attentions (bool): If true, ``self.calculate_attentions``
                is called (an additional decoder pass) before the regular
                decoder pass; its result is not returned.

        Returns:
            When ``self.flag_return`` is false, only the training loss.
            Otherwise the tuple ``(loss, loss_ctc, loss_att, acc)``; note that
            ``loss_ctc`` in this tuple is always None (see the note at the end
            of the body), and ``loss_att`` / ``acc`` are None when
            ``mtlalpha == 1``.

        """
        alpha = self.mtlalpha

        # 1. Encoder
        xs, x_mask, ilens = self.encoder(xs, ilens)

        # 2. CTC loss (skipped when the CTC weight is zero)
        cer_ctc = None
        if alpha == 0.0:
            loss_ctc = None
        else:
            _ys = [y.astype(np.int32) for y in ys_pad]
            loss_ctc = self.ctc(xs, _ys)
            if self.error_calculator is not None:
                # Greedy CTC output is only used for the CER metric, so no
                # graph is built for it.
                with chainer.no_backprop_mode():
                    ys_hat = chainer.backends.cuda.to_cpu(self.ctc.argmax(xs).data)
                cer_ctc = self.error_calculator(ys_hat, ys_pad, is_ctc=True)

        # 3. Decoder
        # NOTE(review): the decoder runs even when alpha == 1, where its
        # output is unused, and runs twice when calculate_attentions is set.
        if calculate_attentions:
            self.calculate_attentions(xs, x_mask, ys_pad)
        ys = self.decoder(ys_pad, xs, x_mask)

        # 4. Attention Loss
        cer, wer = None, None
        if alpha == 1:
            loss_att = None
            acc = None
        else:
            # Make target: append eos to every sequence and pad with -1,
            # which is the label ignored by the accuracy below.
            eos = np.array([self.eos], "i")
            with chainer.no_backprop_mode():
                ys_pad_out = [np.concatenate([y, eos], axis=0) for y in ys_pad]
                ys_pad_out = F.pad_sequence(ys_pad_out, padding=-1).data
                ys_pad_out = self.xp.array(ys_pad_out)

            loss_att = self.criterion(ys, ys_pad_out)
            acc = F.accuracy(
                ys.reshape(-1, self.odim), ys_pad_out.reshape(-1), ignore_label=-1
            )
            # CER/WER from the decoder output are only computed outside
            # training mode.
            if (not chainer.config.train) and (self.error_calculator is not None):
                cer, wer = self.error_calculator(ys, ys_pad)

        # Combine the two losses according to the multi-task weight.
        if alpha == 0.0:
            self.loss = loss_att
            loss_att_data = loss_att.data
            loss_ctc_data = None
        elif alpha == 1.0:
            self.loss = loss_ctc
            loss_att_data = None
            loss_ctc_data = loss_ctc.data
        else:
            self.loss = alpha * loss_ctc + (1 - alpha) * loss_att
            loss_att_data = loss_att.data
            loss_ctc_data = loss_ctc.data
        loss_data = self.loss.data

        # Metrics are reported only for a non-NaN loss.
        if not math.isnan(loss_data):
            reporter.report({"loss_ctc": loss_ctc_data}, self)
            reporter.report({"loss_att": loss_att_data}, self)
            reporter.report({"acc": acc}, self)

            reporter.report({"cer_ctc": cer_ctc}, self)
            reporter.report({"cer": cer}, self)
            reporter.report({"wer": wer}, self)

            logging.info("mtl loss:" + str(loss_data))
            reporter.report({"loss": loss_data}, self)
        else:
            logging.warning("loss (=%f) is not correct", loss_data)

        if self.flag_return:
            # NOTE(review): loss_ctc is discarded here, so callers never see
            # the CTC loss -- confirm whether this is intentional.
            loss_ctc = None
            return self.loss, loss_ctc, loss_att, acc
        else:
            return self.loss

    def calculate_attentions(self, xs, x_mask, ys_pad):
        """Run the decoder once on the given batch; nothing is returned."""
        decoder = self.decoder
        decoder(ys_pad, xs, x_mask)

    def recognize(self, x_block, recog_args, char_list=None, rnnlm=None):
        """Encode one utterance and decode it with ``recognize_beam``.

        Args:
            x_block (ndarray): Input acoustic feature of a single utterance;
                a batch axis is prepended via ``x_block[None, :, :]``.
            recog_args (Namespace): Argument namespace containing options;
                ``ctc_weight`` and ``lm_weight`` are read here.
            char_list (List[str]): List of characters.
            rnnlm (chainer.Chain): Language model module; ignored when
                ``recog_args.lm_weight`` is 0.

        Returns:
            List: N-best decoding results from ``recognize_beam``.

        """
        with chainer.no_backprop_mode(), chainer.using_config("train", False):
            # 1. encoder (single utterance, so the batch has one length entry)
            ilens = [x_block.shape[0]]
            n_batch = len(ilens)
            encoded, _, _ = self.encoder(x_block[None, :, :], ilens)

            # log P(z_t|X) is only needed when CTC scores are used
            lpz = None
            if recog_args.ctc_weight > 0.0:
                ctc_input = encoded.reshape(n_batch, -1, self.dims)
                lpz = self.ctc.log_softmax(ctc_input).data[0]

            # 2. decoder; drop the language model when it has zero weight
            lm = None if recog_args.lm_weight == 0.0 else rnnlm
            nbest = self.recognize_beam(encoded, lpz, recog_args, char_list, lm)

        return nbest

    def recognize_beam(self, h, lpz, recog_args, char_list=None, rnnlm=None):
        """E2E beam search.

        Args:
            h (ndarray): Encoder ouput features (B, T, D) or (T, D).
            lpz (ndarray): Log probabilities from CTC.
            recog_args (Namespace): Argment namespace contraining options.
            char_list (List[str]): List of characters.
            rnnlm (chainer.Chain): Language model module defined at
            `espnet.lm.chainer_backend.lm`.

        Returns:
            List: N-best decoding results.

        """
        logging.info("input lengths: " + str(h.shape[1]))

        # initialization
        n_len = h.shape[1]
        xp = self.xp
        h_mask = xp.ones((1, n_len))

        # search parms
        beam = recog_args.beam_size
        penalty = recog_args.penalty
        ctc_weight = recog_args.ctc_weight

        # prepare sos
        y = self.sos
        if recog_args.maxlenratio == 0:
            maxlen = n_len
        else:
            maxlen = max(1, int(recog_args.maxlenratio * n_len))
        minlen = int(recog_args.minlenratio * n_len)
        logging.info("max output length: " + str(maxlen))
        logging.info("min output length: " + str(minlen))

        # initialize hypothesis
        if rnnlm:
            hyp = {"score": 0.0, "yseq": [y], "rnnlm_prev": None}
        else:
            hyp = {"score": 0.0, "yseq": [y]}

        if lpz is not None:
            ctc_prefix_score = CTCPrefixScore(lpz, 0, self.eos, self.xp)
            hyp["ctc_state_prev"] = ctc_prefix_score.initial_state()
            hyp["ctc_score_prev"] = 0.0
            if ctc_weight != 1.0:
                # pre-pruning based on attention scores
                ctc_beam = min(lpz.shape[-1], int(beam * CTC_SCORING_RATIO))
            else:
                ctc_beam = lpz.shape[-1]

        hyps = [hyp]
        ended_hyps = []

        for i in six.moves.range(maxlen):
            logging.debug("position " + str(i))

            hyps_best_kept = []
            for hyp in hyps:
                ys = F.expand_dims(xp.array(hyp["yseq"]), axis=0).data
                out = self.decoder(ys, h, h_mask)

                # get nbest local scores and their ids
                local_att_scores = F.log_softmax(out[:, -1], axis=-1).data
                if rnnlm:
                    rnnlm_state, local_lm_scores = rnnlm.predict(
                        hyp["rnnlm_prev"], hyp["yseq"][i]
                    )
                    local_scores = (
                        local_att_scores + recog_args.lm_weight * local_lm_scores
                    )
                else:
                    local_scores = local_att_scores

                if lpz is not None:
                    local_best_ids = xp.argsort(local_scores, axis=1)[0, ::-1][
                        :ctc_beam
                    ]
                    ctc_scores, ctc_states = ctc_prefix_score(
                        hyp["yseq"], local_best_ids, hyp["ctc_state_prev"]
                    )
                    local_scores = (1.0 - ctc_weight) * local_att_scores[
                        :, local_best_ids
                    ] + ctc_weight * (ctc_scores - hyp["ctc_score_prev"])
                    if rnnlm:
                        local_scores += (
                            recog_args.lm_weight * local_lm_scores[:, local_best_ids]
                        )
                    joint_best_ids = xp.argsort(local_scores, axis=1)[0, ::-1][:beam]
                    local_best_scores = local_scores[:, joint_best_ids]
                    local_best_ids = local_best_ids[joint_best_ids]
                else:
                    local_best_ids = self.xp.argsort(local_scores, axis=1)[0, ::-1][
                        :beam
                    ]
                    local_best_scores = local_scores[:, local_best_ids]

                for j in six.moves.range(beam):
                    new_hyp = {}
                    new_hyp["score"] = hyp["score"] + float(local_best_scores[0, j])
                    new_hyp["yseq"] = [0] * (1 + len(hyp["yseq"]))
                    new_hyp["yseq"][: len(hyp["yseq"])] = hyp["yseq"]
                    new_hyp["yseq"][len(hyp["yseq"])] = int(local_best_ids[j])
                    if rnnlm:
                        new_hyp["rnnlm_prev"] = rnnlm_state
                    if lpz is not None:
                        new_hyp["ctc_state_prev"] = ctc_states[joint_best_ids[j]]
                        new_hyp["ctc_score_prev"] = ctc_scores[joint_best_ids[j]]
                    hyps_best_kept.append(new_hyp)

                hyps_best_kept = sorted(
                    hyps_best_kept, key=lambda x: x["score"], reverse=True
                )[:beam]

            # sort and get nbest
            hyps = hyps_best_kept
            logging.debug("number of pruned hypothesis: " + str(len(hyps)))
            if char_list is not None:
                logging.debug(
                    "best hypo: "
                    + "".join([char_list[int(x)] for x in hyps[0]["yseq"][1:]])
                    + " score: "
                    + str(hyps[0]["score"])
                )

            # add eos in the final loop to avoid that there are no ended hyps
            if i == maxlen - 1:
                logging.info("adding <eos> in the last postion in the loop")
                for hyp in hyps:
                    hyp["yseq"].append(self.eos)

            # add ended hypothes to a final list, and removed them from current hypothes
            # (this will be a probmlem, number of hyps < beam)
            remained_hyps = []
            for hyp in hyps:
                if hyp["yseq"][-1] == self.eos:
                    # only store the sequence that has more than minlen outputs
                    # also add penalty
                    if len(hyp["yseq"]) > minlen:
                        hyp["score"] += (i + 1) * penalty
                        if rnnlm:  # Word LM needs to add final <eos> score
                            hyp["score"] += recog_args.lm_weight * rnnlm.final(
                                hyp["rnnlm_prev"]
                            )
                        ended_hyps.append(hyp)
                else:
                    remained_hyps.append(hyp)

            # end detection
            if end_detect(ended_hyps, i) and recog_args.maxlenratio == 0.0:
                logging.info("end detected at %d", i)
                break

            hyps = remained_hyps
            if len(hyps) > 0:
                logging.debug("remained hypothes: " + str(len(hyps)))
            else:
                logging.info("no hypothesis. Finish decoding.")
                break
            if char_list is not None:
                for hyp in hyps:
                    logging.debug(
                        "hypo: " + "".join([char_list[int(x)] for x in hyp["yseq"][1:]])
                    )

            logging.debug("number of ended hypothes: " + str(len(ended_hyps)))

        nbest_hyps = sorted(
            ended_hyps, key=lambda x: x["score"], reverse=True
        )  # [:min(len(ended_hyps), recog_args.nbest)]

        logging.debug(nbest_hyps)
        # check number of hypotheis
        if len(nbest_hyps) == 0:
            logging.warn(
                "there is no N-best results, perform recognition "
                "again with smaller minlenratio."
            )
            # should copy becasuse Namespace will be overwritten globally
            recog_args = Namespace(**vars(recog_args))
            recog_args.minlenratio = max(0.0, recog_args.minlenratio - 0.1)
            return self.recognize_beam(h, lpz, recog_args, char_list, rnnlm)

        logging.info("total log probability: " + str(nbest_hyps[0]["score"]))
        logging.info(
            "normalized log probability: "
            + str(nbest_hyps[0]["score"] / len(nbest_hyps[0]["yseq"]))
        )
        # remove sos
        return nbest_hyps

    def calculate_all_attentions(self, xs, ilens, ys):
        """Collect the attention weights of every multi-head attention link.

        The model is run once on the given batch with back-propagation
        disabled and ``calculate_attentions=True``; afterwards the ``attn``
        attribute of each ``MultiHeadAttention`` link is moved to CPU and
        gathered.

        Args:
            xs (List[tuple()]): List of padded input sequences.
                [(T1, idim), (T2, idim), ...]
            ilens (ndarray): Batch of lengths of input sequences. (B)
            ys (List): List of character id sequence tensor. [(L1), (L2), (L3), ...]

        Returns:
            dict: Attention weight arrays keyed by link name. The key is the
                link path with its first character dropped and every ``/``
                replaced by ``_``. Each value is presumably shaped
                (B, Lmax, Tmax) or similar -- TODO confirm against
                ``MultiHeadAttention``.

        """
        # The forward pass is executed only for its side effect of filling
        # in the attention weights on each attention link.
        with chainer.no_backprop_mode():
            self(xs, ilens, ys, calculate_attentions=True)

        attentions = {}
        for link_path, link in self.namedlinks():
            if not isinstance(link, MultiHeadAttention):
                continue
            weights = link.attn
            weights.to_cpu()
            attentions[link_path[1:].replace("/", "_")] = weights.data
        return attentions

    @property
    def attention_plot_class(self):
        """Attention plot function.

        Redirects to PlotAttentionReport: the ``PlotAttentionReport`` object
        itself is returned without being called, so the caller is
        responsible for instantiating it.

        Returns:
            PlotAttentionReport

        """
        return PlotAttentionReport

    @staticmethod
    def custom_converter(subsampling_factor=0):
        """Get customconverter of the model.

        Args:
            subsampling_factor (int): Accepted but not used by this
                implementation; the converter is built without arguments.

        Returns:
            CustomConverter: A newly constructed converter.

        """
        return CustomConverter()

    @staticmethod
    def custom_updater(iters, optimizer, converter, device=-1, accum_grad=1):
        """Get custom_updater of the model.

        All arguments are forwarded unchanged to ``CustomUpdater``.

        Args:
            iters: Passed positionally as the first argument.
            optimizer: Passed positionally as the second argument.
            converter: Forwarded as the ``converter`` keyword.
            device: Forwarded as the ``device`` keyword (default ``-1``).
            accum_grad: Forwarded as the ``accum_grad`` keyword (default ``1``).

        Returns:
            CustomUpdater: A newly constructed updater.

        """
        return CustomUpdater(
            iters, optimizer, converter=converter, device=device, accum_grad=accum_grad
        )

    @staticmethod
    def custom_parallel_updater(iters, optimizer, converter, devices, accum_grad=1):
        """Get custom_parallel_updater of the model.

        All arguments are forwarded unchanged to ``CustomParallelUpdater``.

        Args:
            iters: Passed positionally as the first argument.
            optimizer: Passed positionally as the second argument.
            converter: Forwarded as the ``converter`` keyword.
            devices: Forwarded as the ``devices`` keyword.
            accum_grad: Forwarded as the ``accum_grad`` keyword (default ``1``).

        Returns:
            CustomParallelUpdater: A newly constructed parallel updater.

        """
        return CustomParallelUpdater(
            iters,
            optimizer,
            converter=converter,
            devices=devices,
            accum_grad=accum_grad,
        )


================================================
FILE: nets/chainer_backend/nets_utils.py
================================================
import chainer.functions as F


def _subsamplex(x, n):
    """Keep every ``n``-th row of each sequence in ``x``.

    Args:
        x: Iterable of two-axis sequences; each one is indexed with
            ``[::n, :]`` through ``F.get_item``.
        n (int): Step used along the first axis.

    Returns:
        tuple: ``(subsampled, ilens)`` where ``subsampled`` is the list of
            strided sequences and ``ilens`` the list of their first-axis
            lengths.

    """
    # Same index for every sequence: stride the first axis, keep the second.
    every_nth_row = (slice(None, None, n), slice(None))
    subsampled = []
    lengths = []
    for seq in x:
        strided = F.get_item(seq, every_nth_row)
        subsampled.append(strided)
        lengths.append(strided.shape[0])
    return subsampled, lengths


================================================
FILE: nets/chainer_backend/rnn/__init__.py
================================================
"""Initialize sub package."""


================================================
FILE: nets/chainer_backend/rnn/attentions.py
================================================
import chainer
import chainer.functions as F
import chainer.links as L

import numpy as np


# dot product based attention
class AttDot(chainer.Chain):
    """Dot-product attention between projected encoder and decoder states.

    The padded encoder batch and its projection are cached on the first
    call and reused until :meth:`reset` is invoked.

    Args:
        eprojs (int | None): Dimension of input vectors from encoder.
        dunits (int | None): Dimension of input vectors for decoder.
        att_dim (int): Dimension of input vectors for attention.

    """

    def __init__(self, eprojs, dunits, att_dim):
        super(AttDot, self).__init__()
        with self.init_scope():
            self.mlp_enc = L.Linear(eprojs, att_dim)
            self.mlp_dec = L.Linear(dunits, att_dim)

        self.eprojs = eprojs
        self.dunits = dunits
        self.att_dim = att_dim
        # Per-utterance caches, filled lazily by __call__.
        self.h_length = self.enc_h = self.pre_compute_enc_h = None

    def reset(self):
        """Drop the cached encoder tensors so the next call recomputes them."""
        self.h_length = self.enc_h = self.pre_compute_enc_h = None

    def __call__(self, enc_hs, dec_z, att_prev, scaling=2.0):
        """Compute AttDot forward layer.

        Args:
            enc_hs (chainer.Variable | N-dimensional array):
                Input variable from encoder.
            dec_z (chainer.Variable | N-dimensional array): Input variable of
                decoder. ``None`` is replaced by an all-zero state.
            att_prev: Not used by this attention type; accepted so that all
                attention layers share one call signature.
            scaling (float): Scaling weight to make attention sharp.

        Returns:
            chainer.Variable: Weighted sum over frames.
            chainer.Variable: Attention weight.

        """
        n_utt = len(enc_hs)

        # Encoder-side tensors are computed once, outside the decoder loop.
        if self.pre_compute_enc_h is None:
            self.enc_h = F.pad_sequence(enc_hs)  # utt x frame x hdim
            self.h_length = self.enc_h.shape[1]
            projected = self.mlp_enc(self.enc_h, n_batch_axes=2)
            self.pre_compute_enc_h = F.tanh(projected)  # utt x frame x att_dim

        if dec_z is not None:
            dec_z = dec_z.reshape(n_utt, self.dunits)
        else:
            dec_z = chainer.Variable(
                self.xp.zeros((n_utt, self.dunits), dtype=np.float32)
            )

        # <phi (h_t), psi (s)> for all t
        dec_proj = F.expand_dims(F.tanh(self.mlp_dec(dec_z)), 1)
        dec_proj = F.broadcast_to(dec_proj, self.pre_compute_enc_h.shape)
        energy = F.sum(self.pre_compute_enc_h * dec_proj, axis=2)  # utt x frame

        # Padded frames are deliberately left unmasked: per the original
        # author's note, masking them with a large negative value degraded
        # performance. Only the sharpening factor is applied.
        weights = F.softmax(scaling * energy)

        # Weighted sum over frames -> utt x hdim
        tiled_weights = F.broadcast_to(F.expand_dims(weights, 2), self.enc_h.shape)
        context = F.sum(self.enc_h * tiled_weights, axis=1)

        return context, weights


# location based attention
class AttLoc(chainer.Chain):
    """Location-aware attention.

    The energy of each frame combines the projected encoder state, the
    projected decoder state and convolutional features of the previous
    attention weights. The padded encoder batch and its projection are
    cached on the first call and reused until :meth:`reset` is invoked.

    Args:
        eprojs (int | None): Dimension of input vectors from encoder.
        dunits (int | None): Dimension of input vectors for decoder.
        att_dim (int): Dimension of input vectors for attention.
        aconv_chans (int): Number of channels of output arrays from convolutional layer.
        aconv_filts (int): Size of filters of convolutional layer.

    """

    def __init__(self, eprojs, dunits, att_dim, aconv_chans, aconv_filts):
        super(AttLoc, self).__init__()
        # Kernel spans aconv_filts frames on each side of the centre frame;
        # the matching padding keeps the frame axis length unchanged.
        kernel_width = 2 * aconv_filts + 1
        with self.init_scope():
            self.mlp_enc = L.Linear(eprojs, att_dim)
            self.mlp_dec = L.Linear(dunits, att_dim, nobias=True)
            self.mlp_att = L.Linear(aconv_chans, att_dim, nobias=True)
            self.loc_conv = L.Convolution2D(
                1, aconv_chans, ksize=(1, kernel_width), pad=(0, aconv_filts)
            )
            self.gvec = L.Linear(att_dim, 1)

        self.eprojs = eprojs
        self.dunits = dunits
        self.att_dim = att_dim
        self.aconv_chans = aconv_chans
        # Per-utterance caches, filled lazily by __call__.
        self.h_length = self.enc_h = self.pre_compute_enc_h = None

    def reset(self):
        """Drop the cached encoder tensors so the next call recomputes them."""
        self.h_length = self.enc_h = self.pre_compute_enc_h = None

    def __call__(self, enc_hs, dec_z, att_prev, scaling=2.0):
        """Compute AttLoc forward layer.

        Args:
            enc_hs (chainer.Variable | N-dimensional array):
                Input variable from encoders.
            dec_z (chainer.Variable | N-dimensional array): Input variable of
                decoder. ``None`` is replaced by an all-zero state.
            att_prev (chainer.Variable | None): Attention weight of the
                previous step. ``None`` is replaced by a uniform distribution
                over the frames of each utterance.
            scaling (float): Scaling weight to make attention sharp.

        Returns:
            chainer.Variable: Weighted sum over frames.
            chainer.Variable: Attention weight.

        """
        n_utt = len(enc_hs)

        # Encoder-side tensors are computed once, outside the decoder loop.
        if self.pre_compute_enc_h is None:
            self.enc_h = F.pad_sequence(enc_hs)  # utt x frame x hdim
            self.h_length = self.enc_h.shape[1]
            # utt x frame x att_dim
            self.pre_compute_enc_h = self.mlp_enc(self.enc_h, n_batch_axes=2)

        if dec_z is not None:
            dec_z = dec_z.reshape(n_utt, self.dunits)
        else:
            dec_z = chainer.Variable(
                self.xp.zeros((n_utt, self.dunits), dtype=np.float32)
            )

        # First step: start from a uniform distribution per utterance.
        if att_prev is None:
            uniform = [
                chainer.Variable(
                    self.xp.full(hh.shape[0], 1.0 / hh.shape[0], dtype=np.float32)
                )
                for hh in enc_hs
            ]
            att_prev = F.pad_sequence(uniform)

        # utt x frame -> utt x 1 x 1 x frame -> utt x att_conv_chans x 1 x frame
        loc_feat = self.loc_conv(att_prev.reshape(n_utt, 1, 1, self.h_length))
        # -> utt x frame x att_conv_chans
        loc_feat = F.swapaxes(F.squeeze(loc_feat, axis=2), 1, 2)
        # -> utt x frame x att_dim
        loc_feat = self.mlp_att(loc_feat, n_batch_axes=2)

        # Decoder projection tiled over frames: utt x frame x att_dim
        dec_proj = F.expand_dims(self.mlp_dec(dec_z), 1)
        dec_proj = F.broadcast_to(dec_proj, self.pre_compute_enc_h.shape)

        # dot with gvec: utt x frame x att_dim -> utt x frame
        # TODO(watanabe) use batch_matmul
        hidden = F.tanh(loc_feat + self.pre_compute_enc_h + dec_proj)
        energy = F.squeeze(self.gvec(hidden, n_batch_axes=2), axis=2)

        # Padded frames are deliberately left unmasked: per the original
        # author's note, masking them with a large negative value degraded
        # performance. Only the sharpening factor is applied.
        weights = F.softmax(scaling * energy)

        # Weighted sum over frames -> utt x hdim
        tiled_weights = F.broadcast_to(F.expand_dims(weights, 2), self.enc_h.shape)
        context = F.sum(self.enc_h * tiled_weights, axis=1)

        return context, weights


class NoAtt(chainer.Chain):
    """Compute non-attention layer.

    This layer is a dummy attention layer to be compatible with other
    attention-based models. It has no parameters: the context vector is the
    uniform average of the encoder frames, computed once per utterance batch
    and cached until :meth:`reset` is called.

    """

    def __init__(self):
        super(NoAtt, self).__init__()
        self.h_length = None
        self.enc_h = None
        # Never assigned by this class; kept so that all attention layers
        # expose the same attributes.
        self.pre_compute_enc_h = None
        self.c = None

    def reset(self):
        """Reset states."""
        self.h_length = None
        self.enc_h = None
        self.pre_compute_enc_h = None
        self.c = None

    def __call__(self, enc_hs, dec_z, att_prev):
        """Compute NoAtt forward layer.

        Args:
            enc_hs (chainer.Variable | N-dimensional array):
                Input variable from encoders.
            dec_z: Dummy.
            att_prev (chainer.Variable | None): Attention weight. When
                ``None``, uniform weights are created and the cached context
                vector is (re)computed from them.

        Returns:
            chainer.Variable: Sum over frames (the cached context vector;
                ``None`` if no call with ``att_prev=None`` has happened
                since the last reset).
            chainer.Variable: Attention weight.

        """
        # pre-compute all h outside the decoder loop.
        # The guard is on ``enc_h`` because ``pre_compute_enc_h`` is never set
        # in this class; testing it would re-pad the encoder batch on every
        # decoding step instead of once per reset.
        if self.enc_h is None:
            self.enc_h = F.pad_sequence(enc_hs)  # utt x frame x hdim
            self.h_length = self.enc_h.shape[1]

        # initialize attention weight with uniform dist.
        if att_prev is None:
            att_prev = [
                self.xp.full(hh.shape[0], 1.0 / hh.shape[0], dtype=np.float32)
                for hh in enc_hs
            ]
            att_prev = [chainer.Variable(att) for att in att_prev]
            att_prev = F.pad_sequence(att_prev)
            # utt x hdim: uniform average over the frames of each utterance
            self.c = F.sum(
                self.enc_h
                * F.broadcast_to(F.expand_dims(att_prev, 2), self.enc_h.shape),
                axis=1,
            )

        return self.c, att_prev


def att_for(args):
    """Build the attention layer selected by ``args.atype``.

    Args:
        args (Namespace): The arguments. ``atype`` selects the layer;
            ``eprojs``, ``dunits`` and ``adim`` are read for ``"dot"`` and
            ``"location"``, plus ``aconv_chans`` and ``aconv_filts`` for
            ``"location"``.

    Returns:
        chainer.Chain: The corresponding attention module.

    Raises:
        NotImplementedError: If ``args.atype`` is not one of ``"dot"``,
            ``"location"`` or ``"noatt"``.

    """
    kind = args.atype
    if kind == "dot":
        return AttDot(args.eprojs, args.dunits, args.adim)
    if kind == "location":
        return AttLoc(
            args.eprojs, args.dunits, args.adim, args.aconv_chans, args.aconv_filts
        )
    if kind == "noatt":
        return NoAtt()
    raise NotImplementedError(
        "chainer supports only noatt, dot, and location attention."
    )


================================================
FILE: nets/chainer_backend/rnn/decoders.py
================================================
import logging
import random
import six

import chainer
import chainer.functions as F
import chainer.links as L
import numpy as np

import espnet.nets.chainer_backend.deterministic_embed_id as DL

from argparse import Namespace

from espnet.nets.ctc_prefix_score import CTCPrefixScore
from espnet.nets.e2e_asr_common import end_detect

CTC_SCORING_RATIO = 1.5
MAX_DECODER_OUTPUT = 5


class Decoder(chainer.Chain):
    """Decoder layer.

    Args:
        eprojs (int): Dimension of input variables from encoder.
        odim (int): The output dimension.
        dtype (str): Decoder type.
        dlayers (int): Number of layers for decoder.
        dunits (int): Dimension of input vector of decoder.
        sos (int): Number to indicate the start of sequences.
        eos (int): Number to indicate the end of sequences.
        att (Module): Attention module defined at
            `espnet.nets.chainer_backend.rnn.attentions`.
        verbose (int): Verbosity level.
        char_list (List[str]): List of all characters.
        labeldist (numpy.array): Distributed array of counted transcript length.
        lsm_weight (float): Weight to use when calculating the training loss.
        sampling_probability (float): Threshold for scheduled sampling.

    """

    def __init__(
        self,
        eprojs,
        odim,
        dtype,
        dlayers,
        dunits,
        sos,
        eos,
        att,
        verbose=0,
        char_list=None,
        labeldist=None,
        lsm_weight=0.0,
        sampling_probability=0.0,
    ):
        """Register the decoder links and store the configuration.

        The links are registered in the order ``embed``, ``rnn0`` ..
        ``rnn<dlayers-1>``, ``output``. Any ``dtype`` other than ``"lstm"``
        selects GRU cells. See the class docstring for the arguments.

        """
        super(Decoder, self).__init__()
        # One cell type is shared by every recurrent layer.
        cell = L.StatelessLSTM if dtype == "lstm" else L.StatelessGRU
        with self.init_scope():
            self.embed = DL.EmbedID(odim, dunits)
            # The first layer consumes the label embedding concatenated with
            # the attention context, hence the wider input.
            self.rnn0 = cell(dunits + eprojs, dunits)
            for layer in six.moves.range(1, dlayers):
                setattr(self, "rnn%d" % layer, cell(dunits, dunits))
            self.output = L.Linear(dunits, odim)

        # architecture
        self.dtype = dtype
        self.dlayers = dlayers
        self.dunits = dunits
        self.att = att

        # special symbols
        self.sos = sos
        self.eos = eos

        # logging / debugging
        self.verbose = verbose
        self.char_list = char_list

        # training state and options
        self.loss = None
        self.sampling_probability = sampling_probability
        # for label smoothing
        self.labeldist = labeldist
        self.vlabeldist = None
        self.lsm_weight = lsm_weight

    def rnn_forward(self, ey, z_list, c_list, z_prev, c_prev):
        """Advance every recurrent layer by one step.

        Args:
            ey: Input to the first layer.
            z_list (list): Receives the new hidden state of each layer
                (written in place).
            c_list (list): Receives the new cell state of each layer
                (written in place, LSTM only).
            z_prev (list): Previous hidden state of each layer. For GRU
                layers a ``None`` entry is replaced in place by a zero state.
            c_prev (list): Previous cell state of each layer (LSTM only).

        Returns:
            tuple: ``(z_list, c_list)``, the same list objects passed in.

        """
        if self.dtype == "lstm":
            c_list[0], z_list[0] = self.rnn0(c_prev[0], z_prev[0], ey)
            for layer in six.moves.range(1, self.dlayers):
                c_list[layer], z_list[layer] = self["rnn%d" % layer](
                    c_prev[layer], z_prev[layer], z_list[layer - 1]
                )
            return z_list, c_list

        def _zero_state(like):
            # Zero hidden state with as many rows as `like` and its dtype,
            # allocated under the device context of this link.
            xp = self.xp
            with chainer.backends.cuda.get_device_from_id(self._device_id):
                return chainer.Variable(
                    xp.zeros((like.shape[0], self.dunits), dtype=like.dtype)
                )

        # GRU path: missing previous states are created explicitly.
        if z_prev[0] is None:
            z_prev[0] = _zero_state(ey)
        z_list[0] = self.rnn0(z_prev[0], ey)
        for layer in six.moves.range(1, self.dlayers):
            below = z_list[layer - 1]
            if z_prev[layer] is None:
                z_prev[layer] = _zero_state(below)
            z_list[layer] = self["rnn%d" % layer](z_prev[layer], below)
        return z_list, c_list

    def __call__(self, hs, ys):
        """Core function of Decoder layer.

        Runs the decoder over the whole target batch (teacher forcing, with
        optional scheduled sampling) and computes the cross-entropy loss and
        the token accuracy. The loss is also stored in ``self.loss``.

        Args:
            hs (list of chainer.Variable | N-dimension array):
                Input variable from encoder.
            ys (list of chainer.Variable | N-dimension array):
                Input variable of decoder.

        Returns:
            chainer.Variable: A variable holding a scalar array of the training loss.
            chainer.Variable: A variable holding a scalar array of the accuracy.

        """
        self.loss = None
        # prepare input and output word sequences with sos/eos IDs:
        # decoder inputs are <sos> + y, targets are y + <eos>
        eos = self.xp.array([self.eos], "i")
        sos = self.xp.array([self.sos], "i")
        ys_in = [F.concat([sos, y], axis=0) for y in ys]
        ys_out = [F.concat([y, eos], axis=0) for y in ys]

        # padding: inputs are padded with the eos ID, targets with -1
        # pys: utt x olen
        pad_ys_in = F.pad_sequence(ys_in, padding=self.eos)
        pad_ys_out = F.pad_sequence(ys_out, padding=-1)

        # get dim, length info
        batch = pad_ys_out.shape[0]
        olength = pad_ys_out.shape[1]
        logging.info(
            self.__class__.__name__
            + " input lengths:  "
            + str(self.xp.array([h.shape[0] for h in hs]))
        )
        logging.info(
            self.__class__.__name__
            + " output lengths: "
            + str(self.xp.array([y.shape[0] for y in ys_out]))
        )

        # initialization: one (initially empty) state slot per decoder layer
        c_list = [None]  # list of cell state of each layer
        z_list = [None]  # list of hidden state of each layer
        for _ in six.moves.range(1, self.dlayers):
            c_list.append(None)
            z_list.append(None)
        att_w = None
        z_all = []
        self.att.reset()  # reset pre-computation of h

        # pre-computation of embedding
        eys = self.embed(pad_ys_in)  # utt x olen x zdim
        eys = F.separate(eys, axis=1)  # one utt x zdim slice per output step

        # loop for an output sequence
        for i in six.moves.range(olength):
            # attention is conditioned on the first layer's hidden state and
            # on the attention weights of the previous step
            att_c, att_w = self.att(hs, z_list[0], att_w)
            # scheduled sampling: from the second step on, with probability
            # sampling_probability, feed the embedding of the model's own
            # previous prediction instead of the ground-truth label
            if i > 0 and random.random() < self.sampling_probability:
                logging.info(" scheduled sampling ")
                z_out = self.output(z_all[-1])
                z_out = F.argmax(F.log_softmax(z_out), axis=1)
                z_out = self.embed(z_out)
                ey = F.hstack((z_out, att_c))  # utt x (zdim + hdim)
            else:
                ey = F.hstack((eys[i], att_c))  # utt x (zdim + hdim)
            # the same lists serve as previous and new state, so they are
            # updated in place by rnn_forward
            z_list, c_list = self.rnn_forward(ey, z_list, c_list, z_list, c_list)
            z_all.append(z_list[-1])  # keep the top layer's output

        # flatten all steps of all utterances into rows for the output layer
        z_all = F.stack(z_all, axis=1).reshape(batch * olength, self.dunits)
        # compute loss
        y_all = self.output(z_all)
        # NOTE(review): the -1 padded targets are presumably skipped through
        # the default ignore label of F.softmax_cross_entropy -- confirm
        self.loss = F.softmax_cross_entropy(y_all, F.flatten(pad_ys_out))
        # scale by the mean label-sequence length; len(ys_in) counts the
        # prepended sos, hence the -1
        self.loss *= np.mean([len(x) for x in ys_in]) - 1
        acc = F.accuracy(y_all, F.flatten(pad_ys_out), ignore_label=-1)
        logging.info("att loss:" + str(self.loss.data))

        # show predicted character sequence for debug
        # (at most MAX_DECODER_OUTPUT utterances are printed)
        if self.verbose > 0 and self.char_list is not None:
            y_hat = y_all.reshape(batch, olength, -1)
            y_true = pad_ys_out
            for (i, y_hat_), y_true_ in zip(enumerate(y_hat.data), y_true.data):
                if i == MAX_DECODER_OUTPUT:
                    break
                # drop padded positions before mapping IDs to characters
                idx_hat = self.xp.argmax(y_hat_[y_true_ != -1], axis=1)
                idx_true = y_true_[y_true_ != -1]
                seq_hat = [self.char_list[int(idx)] for idx in idx_hat]
                seq_true = [self.char_list[int(idx)] for idx in idx_true]
                seq_hat = "".join(seq_hat).replace("<space>", " ")
                seq_true = "".join(seq_true).replace("<space>", " ")
                logging.info("groundtruth[%d]: " % i + seq_true)
                logging.info("prediction [%d]: " % i + seq_hat)

        # label smoothing: interpolate the loss with a regularisation term
        # built from the log-probabilities weighted by labeldist
        if self.labeldist is not None:
            if self.vlabeldist is None:
                # converted once and cached for subsequent calls
                self.vlabeldist = chainer.Variable(self.xp.asarray(self.labeldist))
            loss_reg = -F.sum(
                F.scale(F.log_softmax(y_all), self.vlabeldist, axis=1)
            ) / len(ys_in)
            self.loss = (1.0 - self.lsm_weight) * self.loss + self.lsm_weight * loss_reg

        return self.loss, acc

    def recognize_beam(self, h, lpz, recog_args, char_list, rnnlm=None):
        """Beam search implementation.

        Args:
            h (chainer.Variable): One of the output from the encoder.
            lpz (chainer.Variable | None): Result of net propagation.
            recog_args (Namespace): The argument.
            char_list (List[str]): List of all charactors.
            rnnlm (Module): RNNLM module. Defined at `espnet.lm.chainer_backend.lm`

        Returns:
            List[Dict[str,Any]]: Result of recognition.

        """
        logging.info("input lengths: " + str(h.shape[0]))
        # initialization
        c_list = [None]  # list of cell state of each layer
        z_list = [None]  # list of hidden state of each layer
        for _ in six.moves.range(1, self.dlayers):
            c_list.append(None)
            z_list.append(None)
        a = None
        self.att.reset()  # reset pre-computation of h

        # search parms
        beam = recog_args.beam_size
        penalty = recog_args.penalty
        ctc_weight = recog_args.ctc_weight

        # preprate sos
        y = self.xp.full(1, self.sos, "i")
        if recog_args.maxlenratio == 0:
            maxlen = h.shape[0]
        else:
            # maxlen >= 1
            maxlen = max(1, int(recog_args.maxlenratio * h.shape[0]))
        minlen = int(recog_args.minlenratio * h.shape[0])
        logging.info("max output length: " + str(maxlen))
        logging.info("min output length: " + str(minlen))

        # initialize hypothesis
        if rnnlm:
            hyp = {
                "score": 0.0,
                "yseq": [y],
                "c_prev": c_list,
                "z_prev": z_list,
                "a_prev": a,
                "rnnlm_prev": None,
            }
        else:
            hyp = {
                "score": 0.0,
                "yseq": [y],
                "c_prev": c_list,
                "z_prev": z_list,
                "a_prev": a,
            }
        if lpz is not None:
            ctc_prefix_score = CTCPrefixScore(lpz, 0, self.eos, self.xp)
            hyp["ctc_state_prev"] = ctc_prefix_score.initial_state()
            hyp["ctc_score_prev"] = 0.0
            if ctc_weight != 1.0:
                # pre-pruning based on attention scores
                ctc_beam = min(lpz.shape[-1], int(beam * CTC_SCORING_RATIO))
            else:
                ctc_beam = lpz.shape[-1]
        hyps = [hyp]
        ended_hyps = []

        for i in six.moves.range(maxlen):
            logging.debug("position " + str(i))

            hyps_best_kept = []
            for hyp in hyps:
                ey = self.embed(hyp["yseq"][i])  # utt list (1) x zdim
                att_c, att_w = self.att([h], hyp["z_prev"][0], hyp["a_prev"])
                ey = F.hstack((ey, att_c))  # utt(1) x (zdim + hdim)

                z_list, c_list = self.rnn_forward(
                    ey, z_list, c_list, hyp["z_prev"], hyp["c_prev"]
                )

                # get nbest local scores and their ids
                local_att_scores = F.log_softmax(self.output(z_list[-1])).data
                if rnnlm:
                    rnnlm_state, local_lm_scores = rnnlm.predict(
                        hyp["rnnlm_prev"], hyp["yseq"][i]
                    )
                    local_scores = (
                        local_att_scores + recog_args.lm_weight * local_lm_scores
                    )
                else:
                    local_scores = local_att_scores

                if lpz is not None:
                    local_best_ids = self.xp.argsort(local_scores, axis=1)[0, ::-1][
                        :ctc_beam
                    ]
                    ctc_scores, ctc_states = ctc_prefix_score(
                        hyp["yseq"], local_best_ids, hyp["ctc_state_prev"]
                    )
                    local_scores = (1.0 - ctc_weight) * local_att_scores[
                        :, local_best_ids
                    ] + ctc_weight * (ctc_scores - hyp["ctc_score_prev"])
                    if rnnlm:
                        local_scores += (
                            recog_args.lm_weight * local_lm_scores[:, local_best_ids]
                        )
                    joint_best_ids = self.xp.argsort(local_scores, axis=1)[0, ::-1][
                        :beam
                    ]
                    local_best_scores = local_scores[:, joint_best_ids]
                    local_best_ids = local_best_ids[joint_best_ids]
                else:
                    local_best_ids = self.xp.argsort(local_scores, axis=1)[0, ::-1][
                        :beam
                    ]
                    local_best_scores = local_scores[:, local_best_ids]

                for j in six.moves.range(beam):
                    new_hyp = {}
                    # do not copy {z,c}_list directly
                    new_hyp["z_prev"] = z_list[:]
                    new_hyp["c_prev"] = c_list[:]
                    new_hyp["a_prev"] = att_w
                    new_hyp["score"] = hyp["score"] + local_best_scores[0, j]
                    new_hyp["yseq"] = [0] * (1 + len(hyp["yseq"]))
                    new_hyp["yseq"][: len(hyp["yseq"])] = hyp["yseq"]
                    new_hyp["yseq"][len(hyp["yseq"])] = self.xp.full(
                        1, local_best_ids[j], "i"
                    )
                    if rnnlm:
                        new_hyp["rnnlm_prev"] = rnnlm_state
                    if lpz is not None:
                        new_hyp["ctc_state_prev"] = ctc_states[joint_best_ids[j]]
                        new_hyp["ctc_score_prev"] = ctc_scores[joint_best_ids[j]]
                    # will be (2 x beam) hyps at most
                    hyps_best_kept.append(new_hyp)

                hyps_best_kept = sorted(
                    hyps_best_kept, key=lambda x: x["score"], reverse=True
                )[:beam]

            # sort and get nbest
            hyps = hyps_best_kept
            logging.debug("number of pruned hypotheses: " + str(len(hyps)))
            logging.debug(
                "best hypo: "
                + "".join([char_list[int(x)] for x in hyps[0]["yseq"][1:]]).replace(
                    "<space>", " "
                )
            )

            # add eos in the final loop to avoid that there are no ended hyps
            if i == maxlen - 1:
                logging.info("adding <eos> in the last position in the loop")
                for hyp in hyps:
                    hyp["yseq"].append(self.xp.full(1, self.eos, "i"))

            # add ended hypotheses to a final list,
            # and removed them from current hypotheses
            # (this will be a problem, number of hyps < beam)
            remained_hyps = []
            for hyp in hyps:
                if hyp["yseq"][-1] == self.eos:
                    # only store the sequence that has more than minlen outputs
                    # also add penalty
                    if len(hyp["yseq"]) > minlen:
                        hyp["score"] += (i + 1) * penalty
                        if rnnlm:  # Word LM needs to add final <eos> score
                            hyp["score"] += recog_args.lm_weight * rnnlm.final(
                                hyp["rnnlm_prev"]
                            )
                        ended_hyps.append(hyp)
                else:
                    remained_hyps.append(hyp)

            # end detection
            if end_detect(ended_hyps, i) and recog_args.maxlenratio == 0.0:
                logging.info("end detected at %d", i)
                break

            hyps = remained_hyps
            if len(hyps) > 0:
                logging.debug("remaining hypotheses: " + str(len(hyps)))
            else:
                logging.info("no hypothesis. Finish decoding.")
                break

            for hyp in hyps:
                logging.debug(
                    "hypo: "
                    + "".join([char_list[int(x)] for x in hyp["yseq"][1:]]).replace(
                        "<space>", " "
                    )
                )

            logging.debug("number of ended hypotheses: " + str(len(ended_hyps)))

        nbest_hyps = sorted(ended_hyps, key=lambda x: x["score"], reverse=True)[
            : min(len(ended_hyps), recog_args.nbest)
        ]

        # check number of hypotheses
        if len(nbest_hyps) == 0:
            logging.warning(
                "there is no N-best results, "
                "perform recognition again with smaller minlenratio."
            )
            # should copy because Namespace will be overwritten globally
            recog_args = Namespace(**vars(recog_args))
            recog_args.minlenratio = max(0.0, recog_args.minlenratio - 0.1)
            return self.recognize_beam(h, lpz, recog_args, char_list, rnnlm)

        logging.info("total log probability: " + str(nbest_hyps[0]["score"]))
        logging.info(
            "normalized log probability: "
            + str(nbest_hyps[0]["score"] / len(nbest_hyps[0]["yseq"]))
        )

        return nbest_hyps

    def calculate_all_attentions(self, hs, ys):
        """Collect the attention weights produced at every decoding step.

        The decoder is run with the reference labels as inputs (teacher
        forcing) and the attention weights of each step are recorded.

        Args:
            hs (list of chainer.Variable | N-dimensional array):
                Encoder outputs the attention module attends over.
            ys (list of chainer.Variable | N-dimensional array):
                Reference label sequences, one per utterance.

        Returns:
            N-dimensional array: The per-step attention weights stacked along
                axis 1, taken from the stacked variable after moving it to CPU.

        """
        # wrap the special symbol IDs so they can be concatenated to the labels
        sos = self.xp.array([self.sos], "i")
        eos = self.xp.array([self.eos], "i")

        # decoder inputs start with <sos>, reference outputs end with <eos>
        ys_in = [F.concat([sos, y], axis=0) for y in ys]
        ys_out = [F.concat([y, eos], axis=0) for y in ys]

        # pad to rectangular batches (utt x olen):
        # inputs are padded with <eos>, outputs with -1
        pad_ys_in = F.pad_sequence(ys_in, padding=self.eos)
        pad_ys_out = F.pad_sequence(ys_out, padding=-1)

        # the padded outputs are only needed for the number of decoding steps
        n_steps = pad_ys_out.shape[1]

        # one (initially empty) cell / hidden state slot per decoder layer
        c_list = [None] + [None for _ in six.moves.range(1, self.dlayers)]
        z_list = [None] + [None for _ in six.moves.range(1, self.dlayers)]
        att_w = None
        collected = []
        self.att.reset()  # reset pre-computation of h

        # embed all inputs at once, then split into one slice per step
        eys = F.separate(self.embed(pad_ys_in), axis=1)  # olen x (utt x zdim)

        for step in six.moves.range(n_steps):
            att_c, att_w = self.att(hs, z_list[0], att_w)
            ey = F.hstack((eys[step], att_c))  # utt x (zdim + hdim)
            z_list, c_list = self.rnn_forward(ey, z_list, c_list, z_list, c_list)
            collected.append(att_w)

        stacked = F.stack(collected, axis=1)
        stacked.to_cpu()

        return stacked.data


def decoder_for(args, odim, sos, eos, att, labeldist):
    """Build the decoder described by the program arguments.

    Args:
        args (Namespace): The program arguments. The attributes ``eprojs``,
            ``dtype``, ``dlayers``, ``dunits``, ``verbose``, ``char_list``,
            ``lsm_weight`` and ``sampling_probability`` are read.
        odim (int): The output dimension.
        sos (int): Number to indicate the start of sequences.
        eos (int): Number to indicate the end of sequences.
        att (Module):
            Attention module defined at `espnet.nets.chainer_backend.attentions`.
        labeldist (numpy.array): Distributed array of length of transcript.

    Returns:
        chainer.Chain: The decoder module.

    """
    # positional order must match the Decoder constructor
    decoder_args = (
        args.eprojs,
        odim,
        args.dtype,
        args.dlayers,
        args.dunits,
        sos,
        eos,
        att,
        args.verbose,
        args.char_list,
        labeldist,
        args.lsm_weight,
        args.sampling_probability,
    )
    return Decoder(*decoder_args)


================================================
FILE: nets/chainer_backend/rnn/encoders.py
================================================
import logging
import six

import chainer
import chainer.functions as F
import chainer.links as L
import numpy as np

from chainer import cuda

from espnet.nets.chainer_backend.nets_utils import _subsamplex
from espnet.nets.e2e_asr_common import get_vgg2l_odim


# TODO(watanabe) explanation of BLSTMP
class RNNP(chainer.Chain):
    """RNN with a projection layer after every recurrent layer.

    Each of the ``elayers`` layers is a single-layer NStep RNN link followed
    by a linear "bottleneck" projection to ``hdim`` units.

    Args:
        idim (int): Dimension of inputs.
        elayers (int): Number of encoder layers.
        cdim (int): Number of rnn units. (resulted in cdim * 2 if bidirectional)
        hdim (int): Number of projection units.
        subsample (np.ndarray): Subsampling factors; entry ``layer + 1`` is
            applied to the output of layer ``layer``.
        dropout (float): Dropout rate.
        typ (str): The RNN type. A leading "b" selects the bidirectional
            variant and the substring "lstm" selects LSTM over GRU.

    """

    def __init__(self, idim, elayers, cdim, hdim, subsample, dropout, typ="blstm"):
        super(RNNP, self).__init__()
        bidir = typ[0] == "b"
        use_lstm = "lstm" in typ
        if bidir:
            rnn_cls = L.NStepBiLSTM if use_lstm else L.NStepBiGRU
            rnn_label = "birnn"
        else:
            rnn_cls = L.NStepLSTM if use_lstm else L.NStepGRU
            rnn_label = "rnn"
        # a bidirectional layer emits both directions, doubling its output size
        proj_idim = 2 * cdim if bidir else cdim
        with self.init_scope():
            for i in six.moves.range(elayers):
                # only the first layer sees the raw input dimension
                layer_idim = idim if i == 0 else hdim
                setattr(
                    self,
                    "{}{:d}".format(rnn_label, i),
                    rnn_cls(1, layer_idim, cdim, dropout),
                )
                # bottleneck layer to merge
                setattr(self, "bt%d" % i, L.Linear(proj_idim, hdim))

        self.elayers = elayers
        self.rnn_label = rnn_label
        self.cdim = cdim
        self.subsample = subsample
        self.typ = typ
        self.bidir = bidir

    def __call__(self, xs, ilens):
        """RNNP forward.

        Args:
            xs (list of chainer.Variable): Per-utterance input sequences fed
                to the NStep RNN links.
            ilens (N-dimensional array): Length of each input sequence. (B,)

        Returns:
            tuple | list of chainer.Variable: Subsampled, projected and
                tanh-activated sequences, one per utterance.
            Subsampled lengths as returned by `_subsamplex`.

        """
        logging.info(self.__class__.__name__ + " input lengths: " + str(ilens))

        use_lstm = "lstm" in self.typ
        for layer in six.moves.range(self.elayers):
            rnn = self[self.rnn_label + str(layer)]
            if use_lstm:
                _, _, ys = rnn(None, None, xs)
            else:
                _, ys = rnn(None, xs)
            # ys: utt list of frame x cdim x 2 (2: means bidirectional)
            # TODO(watanabe) replace subsample and FC layer with CNN
            ys, ilens = _subsamplex(ys, self.subsample[layer + 1])
            # project all frames of all utterances at once:
            # (sum _utt frame_utt) x dim
            projected = self["bt" + str(layer)](F.vstack(ys))
            xs = F.split_axis(projected, np.cumsum(ilens[:-1]), axis=0)

        # final tanh operation on the flattened frames, then split per utterance
        activated = F.tanh(F.vstack(xs))
        xs = F.split_axis(activated, np.cumsum(ilens[:-1]), axis=0)

        # 1 utterance case, it becomes an array, so need to make a utt tuple
        if not isinstance(xs, tuple):
            xs = [xs]

        return xs, ilens  # x: utt list of frame x dim


class RNN(chainer.Chain):
    """Stacked RNN followed by a single output projection.

    Args:
        idim (int): Dimension of the input.
        elayers (int): Number of encoder layers.
        cdim (int): Number of rnn units.
        hdim (int): Number of projection units.
        dropout (float): Dropout rate.
        typ (str): RNN type. A leading "b" selects the bidirectional variant
            and the substring "lstm" selects LSTM over GRU.

    """

    def __init__(self, idim, elayers, cdim, hdim, dropout, typ="lstm"):
        super(RNN, self).__init__()
        bidir = typ[0] == "b"
        use_lstm = "lstm" in typ
        if bidir:
            rnn_cls = L.NStepBiLSTM if use_lstm else L.NStepBiGRU
        else:
            rnn_cls = L.NStepLSTM if use_lstm else L.NStepGRU
        # a bidirectional RNN emits both directions, doubling its output size
        proj_idim = 2 * cdim if bidir else cdim
        with self.init_scope():
            self.nbrnn = rnn_cls(elayers, idim, cdim, dropout)
            self.l_last = L.Linear(proj_idim, hdim)
        self.typ = typ
        self.bidir = bidir

    def __call__(self, xs, ilens):
        """RNN forward propagation.

        Args:
            xs (list of chainer.Variable): Per-utterance input sequences fed
                to the NStep RNN link.
            ilens (N-dimensional array): Length of each input sequence. (B,)

        Returns:
            tuple | list of chainer.Variable: Projected, tanh-activated
                sequences, one per utterance.
            N-dimensional array: `ilens` moved to CPU.

        """
        logging.info(self.__class__.__name__ + " input lengths: " + str(ilens))
        # need to move ilens to cpu
        ilens = cuda.to_cpu(ilens)

        if "lstm" in self.typ:
            _, _, ys = self.nbrnn(None, None, xs)
        else:
            _, ys = self.nbrnn(None, xs)

        # project all frames of all utterances at once:
        # (sum _utt frame_utt) x dim
        projected = self.l_last(F.vstack(ys))
        xs = F.split_axis(projected, np.cumsum(ilens[:-1]), axis=0)

        # final tanh operation on the flattened frames, then split per utterance
        activated = F.tanh(F.vstack(xs))
        xs = F.split_axis(activated, np.cumsum(ilens[:-1]), axis=0)

        # 1 utterance case, it becomes an array, so need to make a utt tuple
        if not isinstance(xs, tuple):
            xs = [xs]

        return xs, ilens  # x: utt list of frame x dim


# TODO(watanabe) explanation of VGG2L, VGG2B (Block) might be better
class VGG2L(chainer.Chain):
    """VGG-motivated CNN layers.

    Two blocks of two 3x3 convolutions, each block followed by a 2x2 max
    pooling with stride 2.

    Args:
        in_channel (int): Number of channels.

    """

    def __init__(self, in_channel=1):
        super(VGG2L, self).__init__()
        with self.init_scope():
            # CNN layer (VGG motivated)
            self.conv1_1 = L.Convolution2D(in_channel, 64, 3, stride=1, pad=1)
            self.conv1_2 = L.Convolution2D(64, 64, 3, stride=1, pad=1)
            self.conv2_1 = L.Convolution2D(64, 128, 3, stride=1, pad=1)
            self.conv2_2 = L.Convolution2D(128, 128, 3, stride=1, pad=1)

        self.in_channel = in_channel

    def __call__(self, xs, ilens):
        """VGG2L forward propagation.

        Args:
            xs (list of chainer.Variable): Per-utterance feature sequences;
                they are padded into a single (utt x frame x dim) batch.
            ilens (N-dimensional array): Length of each feature sequence. (B,)

        Returns:
            list of chainer.Variable: Per-utterance outputs, trimmed to the
                subsampled lengths.
            N-dimensional array: Subsampled vector of ilens.

        """
        logging.info(self.__class__.__name__ + " input lengths: " + str(ilens))

        # x: utt x frame x dim
        xs = F.pad_sequence(xs)

        # split the feature axis into channels and move channels forward
        # x: utt x 1 (input channel num) x frame x dim
        n_utt, n_frame, n_feat = xs.shape[0], xs.shape[1], xs.shape[2]
        xs = xs.reshape(n_utt, n_frame, self.in_channel, n_feat // self.in_channel)
        xs = F.swapaxes(xs, 1, 2)

        conv_blocks = (
            (self.conv1_1, self.conv1_2),
            (self.conv2_1, self.conv2_2),
        )
        for first_conv, second_conv in conv_blocks:
            xs = F.relu(first_conv(xs))
            xs = F.relu(second_conv(xs))
            xs = F.max_pooling_2d(xs, 2, stride=2)

        # change ilens accordingly: one ceil-halving per pooling layer
        for _ in range(len(conv_blocks)):
            ilens = self.xp.array(
                self.xp.ceil(self.xp.array(ilens, dtype=np.float32) / 2),
                dtype=np.int32,
            )

        # x: utt_list of frame (remove zeropaded frames) x (input channel num x dim)
        xs = F.swapaxes(xs, 1, 2)
        xs = xs.reshape(xs.shape[0], xs.shape[1], xs.shape[2] * xs.shape[3])
        trimmed = [xs[i, : ilens[i], :] for i in range(len(ilens))]

        return trimmed, ilens


class Encoder(chainer.Chain):
    """Encoder network class.

    Args:
        etype (str): Type of encoder network. An optional "vgg" prefix adds
            the VGG2L front-end, an optional trailing "p" selects the
            per-layer projection variant (RNNP), and the remainder names the
            RNN type ("lstm", "gru", "blstm" or "bgru").
        idim (int): Number of dimensions of encoder network.
        elayers (int): Number of layers of encoder network.
        eunits (int): Number of lstm units of encoder network.
        eprojs (int): Number of projection units of encoder network.
        subsample (np.array): Subsampling number. e.g. 1_2_2_2_1
        dropout (float): Dropout rate.
        in_channel (int): Number of input channels for the VGG2L front-end.

    """

    def __init__(
        self, etype, idim, elayers, eunits, eprojs, subsample, dropout, in_channel=1
    ):
        super(Encoder, self).__init__()
        use_vgg = etype.startswith("vgg")
        # Remove the literal "vgg" prefix. str.lstrip("vgg") would strip any
        # leading 'v'/'g' characters and turn "gru" into "ru".
        typ = etype[len("vgg"):] if use_vgg else etype
        typ = typ.rstrip("p")
        if typ not in ["lstm", "gru", "blstm", "bgru"]:
            # best-effort: report the problem but keep building the encoder
            logging.error("Error: need to specify an appropriate encoder architecture")
        with self.init_scope():
            if use_vgg:
                if etype[-1] == "p":
                    self.enc = chainer.Sequential(
                        VGG2L(in_channel),
                        RNNP(
                            get_vgg2l_odim(idim, in_channel=in_channel),
                            elayers,
                            eunits,
                            eprojs,
                            subsample,
                            dropout,
                            typ=typ,
                        ),
                    )
                    logging.info("Use CNN-VGG + " + typ.upper() + "P for encoder")
                else:
                    self.enc = chainer.Sequential(
                        VGG2L(in_channel),
                        RNN(
                            get_vgg2l_odim(idim, in_channel=in_channel),
                            elayers,
                            eunits,
                            eprojs,
                            dropout,
                            typ=typ,
                        ),
                    )
                    logging.info("Use CNN-VGG + " + typ.upper() + " for encoder")
                # the two 2x2 poolings of VGG2L shorten the time axis by 4
                self.conv_subsampling_factor = 4
            else:
                if etype[-1] == "p":
                    self.enc = chainer.Sequential(
                        RNNP(idim, elayers, eunits, eprojs, subsample, dropout, typ=typ)
                    )
                    logging.info(
                        typ.upper() + " with every-layer projection for encoder"
                    )
                else:
                    self.enc = chainer.Sequential(
                        RNN(idim, elayers, eunits, eprojs, dropout, typ=typ)
                    )
                    logging.info(typ.upper() + " without projection for encoder")
                self.conv_subsampling_factor = 1

    def __call__(self, xs, ilens):
        """Encoder forward.

        Args:
            xs (list of chainer.Variable): Per-utterance input sequences,
                forwarded unchanged to the wrapped encoder stack.
            ilens (N-dimensional array): Length of each input sequence. (B,)

        Returns:
            Output sequences of the wrapped encoder stack.
            (Subsampled) vector of ilens.

        """
        xs, ilens = self.enc(xs, ilens)

        return xs, ilens


def encoder_for(args, idim, subsample):
    """Build the Encoder described by the program arguments.

    Args:
        args (Namespace): The program arguments. The attributes ``etype``,
            ``elayers``, ``eunits``, ``eprojs`` and ``dropout_rate`` are read.
        idim (int): Dimension of input array.
        subsample (numpy.array): Subsample number. e.g. 1_2_2_2_1

    Returns:
        chainer.Chain: Encoder module.

    """
    # positional order must match the Encoder constructor
    encoder_args = (
        args.etype,
        idim,
        args.elayers,
        args.eunits,
        args.eprojs,
        subsample,
        args.dropout_rate,
    )
    return Encoder(*encoder_args)


================================================
FILE: nets/chainer_backend/rnn/training.py
================================================
# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)


import collections
import logging
import math
import six

# chainer related
from chainer import cuda
from chainer import training
from chainer import Variable

from chainer.training.updaters.multiprocess_parallel_updater import gather_grads
from chainer.training.updaters.multiprocess_parallel_updater import gather_params
from chainer.training.updaters.multiprocess_parallel_updater import scatter_grads

import numpy as np


# copied from https://github.com/chainer/chainer/blob/master/chainer/optimizer.py
def sum_sqnorm(arr):
    """Sum the squared L2 norms of the given arrays.

    Each array is flattened and its dot product with itself is accumulated
    per device; the per-device totals are then added up on the host.

    Args:
        arr (iterable of N-dimensional array | None): Arrays to accumulate.
            ``None`` entries contribute nothing.

    Returns:
        Float: Sum of the squared norms of all non-``None`` arrays.

    """
    per_device = collections.defaultdict(float)
    for grad in arr:
        with cuda.get_device_from_array(grad) as dev:
            if grad is None:
                continue
            flat = grad.ravel()
            per_device[int(dev)] += flat.dot(flat)
    return sum(float(total) for total in six.itervalues(per_device))


class CustomUpdater(training.StandardUpdater):
    """Custom updater for chainer with gradient accumulation.

    Args:
        train_iter (iterator | dict[str, iterator]): Dataset iterator for the
            training dataset. It can also be a dictionary that maps strings to
            iterators. If this is just an iterator, then the iterator is
            registered by the name ``'main'``.
        optimizer (optimizer | dict[str, optimizer]): Optimizer to update
            parameters. It can also be a dictionary that maps strings to
            optimizers. If this is just an optimizer, then the optimizer is
            registered by the name ``'main'``.
        converter (espnet.asr.chainer_backend.asr.CustomConverter): Converter
            function to build input arrays. Each batch extracted by the main
            iterator and the ``device`` option are passed to this function.
            :func:`chainer.dataset.concat_examples` is used by default.
        device (int or dict): The destination device info to send variables. In the
            case of cpu or single gpu, `device=-1 or 0`, respectively.
            In the case of multi-gpu, `device={"main":0, "sub_1": 1, ...}`.
        accum_grad (int): The number of gradient accumulation. If set to 2, the
            network parameters will be updated once in twice,
            i.e. actual batchsize will be doubled.

    """

    def __init__(self, train_iter, optimizer, converter, device, accum_grad=1):
        super(CustomUpdater, self).__init__(
            train_iter, optimizer, converter=converter, device=device
        )
        # To solve #1091, it is required to set the variable inside this class.
        self.device = device
        self.accum_grad = accum_grad
        # number of forward/backward passes since the last parameter update
        self.forward_count = 0
        # True until the very first pass has cleared the gradients
        self.start = True

    # The core part of the update routine can be customized by overriding.
    def update_core(self):
        """Run one forward/backward pass; update once per ``accum_grad`` passes."""
        optimizer = self.get_optimizer("main")
        train_iter = self.get_iterator("main")

        # Get batch and convert into variables
        batch = train_iter.next()
        inputs = self.converter(batch, self.device)
        if self.start:
            optimizer.target.cleargrads()
            self.start = False

        # Scale the loss so the accumulated gradient is an average over passes
        loss = optimizer.target(*inputs) / self.accum_grad
        loss.backward()  # Backprop
        loss.unchain_backward()  # Truncate the graph

        # Keep accumulating until accum_grad passes have been done
        self.forward_count += 1
        if self.forward_count != self.accum_grad:
            return
        self.forward_count = 0

        # compute the gradient norm to check if it is normal or not
        grads = [p.grad for p in optimizer.target.params(False)]
        grad_norm = np.sqrt(sum_sqnorm(grads))
        logging.info("grad norm={}".format(grad_norm))
        if not math.isnan(grad_norm):
            optimizer.update()
        else:
            logging.warning("grad norm is nan. Do not update model.")
        optimizer.target.cleargrads()  # Clear the parameter gradients

    def update(self):
        """Call ``update_core`` and count an iteration after each full cycle."""
        self.update_core()
        # forward_count is back at 0 only right after an accumulation cycle ends
        if self.forward_count == 0:
            self.iteration += 1


class CustomParallelUpdater(training.updaters.MultiprocessParallelUpdater):
    """Custom Parallel Updater for chainer.

    Defines the main update routine, adding gradient accumulation and a
    NaN check of the gradient norm before each parameter update.

    Args:
        train_iters: Dataset iterators for the training dataset, forwarded
            unchanged to ``MultiprocessParallelUpdater``.
            NOTE(review): presumably one iterator per device; confirm against
            the chainer documentation.
        optimizer (optimizer | dict[str, optimizer]): Optimizer to update
            parameters. It can also be a dictionary that maps strings to
            optimizers. If this is just an optimizer, then the optimizer is
            registered by the name ``'main'``.
        converter (espnet.asr.chainer_backend.asr.CustomConverter): Converter
            function to build input arrays. Each batch extracted by the main
            iterator and the first device are passed to this function.
        devices: Devices forwarded unchanged to
            ``MultiprocessParallelUpdater``. ``update_core`` runs the master
            computation on ``self._devices[0]``.
            NOTE(review): the accepted format is defined by the parent class
            and is not visible here.
        accum_grad (int): The number of gradient accumulation. If set to 2,
            the network parameters will be updated once in twice,
            i.e. actual batchsize will be doubled.

    """

    def __init__(self, train_iters, optimizer, converter, devices, accum_grad=1):
        """Initialize the updater and keep a handle to the NCCL module."""
        super(CustomParallelUpdater, self).__init__(
            train_iters, optimizer, converter=converter, devices=devices
        )
        # imported here so that cupy is only required when this class is used
        from cupy.cuda import nccl

        self.accum_grad = accum_grad
        # number of forward/backward passes since the last parameter update
        self.forward_count = 0
        self.nccl = nccl

    # The core part of the update routine can be customized by overriding.
    def update_core(self):
        """Main Update routine of the custom parallel updater.

        Runs one forward/backward pass on the first device, reduces the
        gradients through the communicator when one is set, and updates the
        parameters once every ``accum_grad`` calls.
        """
        self.setup_workers()

        # NOTE(review): presumably tells the worker processes to run their own
        # update step; the message handling lives in the parent class.
        self._send_message(("update", None))
        with cuda.Device(self._devices[0]):
            # For reducing memory

            optimizer = self.get_optimizer("main")
            batch = self.get_iterator("main").next()
            x = self.converter(batch, self._devices[0])

            # scale the loss so the accumulated gradient is an average
            loss = self._master(*x) / self.accum_grad
            loss.backward()
            loss.unchain_backward()

            # NCCL: reduce grads
            null_stream = cuda.Stream.null
            if self.comm is not None:
                # gather all gradients into one buffer, sum-reduce it in place
                # (root argument 0), then write the result back to the model
                gg = gather_grads(self._master)
                self.comm.reduce(
                    gg.data.ptr,
                    gg.data.ptr,
                    gg.size,
                    self.nccl.NCCL_FLOAT,
                    self.nccl.NCCL_SUM,
                    0,
                    null_stream.ptr,
                )
                scatter_grads(self._master, gg)
                del gg

            # update parameters
            self.forward_count += 1
            # keep accumulating until accum_grad passes have been done
            if self.forward_count != self.accum_grad:
                return
            self.forward_count = 0
            # check gradient value
            grad_norm = np.sqrt(
                sum_sqnorm([p.grad for p in optimizer.target.params(False)])
            )
            logging.info("grad norm={}".format(grad_norm))

            # update
            if math.isnan(grad_norm):
                logging.warning("grad norm is nan. Do not update model.")
            else:
                optimizer.update()
            self._master.cleargrads()

            if self.comm is not None:
                # broadcast the master parameters (root argument 0)
                gp = gather_params(self._master)
                self.comm.bcast(
                    gp.data.ptr, gp.size, self.nccl.NCCL_FLOAT, 0, null_stream.ptr
                )

    def update(self):
        """Call ``update_core`` and count an iteration after each full cycle."""
        self.update_core()
        # forward_count is back at 0 only right after an accumulation cycle ends
        if self.forward_count == 0:
            self.iteration += 1


class CustomConverter(object):
    """Custom Converter.

    Turns a one-element batch of ``(xs, ys)`` into chainer variables,
    optionally subsampling the input frames.

    Args:
        subsampling_factor (int): The subsampling factor.

    """

    def __init__(self, subsampling_factor=1):
        self.subsampling_factor = subsampling_factor

    def __call__(self, batch, device):
        """Convert a batch and perform subsampling.

        Args:
            batch (list): One-element list holding the ``(xs, ys)`` pair of
                input sequences and label sequences.
            device (device): Target device; ``-1`` selects numpy (CPU), any
                other value selects cupy.

        Returns:
            list of chainer.Variable: float32 inputs, subsampled along axis 0.
            xp.array: int32 array with the length of each (subsampled) input.
            list of chainer.Variable: int32 label sequences.

        """
        # set device
        xp = np if device == -1 else cuda.cupy

        # batch should be located in list
        assert len(batch) == 1
        xs, ys = batch[0]

        # perform subsampling: keep every subsampling_factor-th frame
        step = self.subsampling_factor
        if step > 1:
            xs = [x[::step, :] for x in xs]

        # get batch made of lengths of input sequences
        ilens = xp.array([x.shape[0] for x in xs], dtype=xp.int32)

        # convert to Variable
        xs = [Variable(xp.array(x, dtype=xp.float32)) for x in xs]
        ys = [Variable(xp.array(y, dtype=xp.int32)) for y in ys]

        return xs, ilens, ys


================================================
FILE: nets/chainer_backend/transformer/__init__.py
================================================
"""Initialize sub package."""


================================================
FILE: nets/chainer_backend/transformer/attention.py
================================================
# encoding: utf-8
"""Class Declaration of Transformer's Attention."""

import chainer

import chainer.functions as F
import chainer.links as L

import numpy as np

MIN_VALUE = float(np.finfo(np.float32).min)


class MultiHeadAttention(chainer.Chain):
    """Multi Head Attention Layer.

    Args:
        n_units (int): Number of input (and output) features. Must be
            divisible by ``h``.
        h (int): Number of attention heads.
        dropout (float): Dropout rate applied to the attention weights.
        initialW: Initializer factory for the weights; it is called as
            ``initialW(scale=...)`` once per linear layer. ``None`` keeps the
            default initializer of ``chainer.links.Linear``.
        initial_bias: Initializer factory for the biases; it is called as
            ``initial_bias(scale=...)`` once per linear layer. ``None`` keeps
            the default initializer of ``chainer.links.Linear``.

    """

    def __init__(self, n_units, h=8, dropout=0.1, initialW=None, initial_bias=None):
        """Initialize MultiHeadAttention."""
        super(MultiHeadAttention, self).__init__()
        assert n_units % h == 0
        stvd = 1.0 / np.sqrt(n_units)

        def _linear():
            # Build one n_units -> n_units projection with its own initializers.
            # The factories default to None and must not be called in that case.
            return L.Linear(
                n_units,
                n_units,
                initialW=None if initialW is None else initialW(scale=stvd),
                initial_bias=(
                    None if initial_bias is None else initial_bias(scale=stvd)
                ),
            )

        with self.init_scope():
            self.linear_q = _linear()
            self.linear_k = _linear()
            self.linear_v = _linear()
            self.linear_out = _linear()
        self.d_k = n_units // h
        self.h = h
        self.dropout = dropout
        # attention weights of the most recent forward pass
        self.attn = None

    def forward(self, e_var, s_var=None, mask=None, batch=1):
        """Core function of the Multi-head attention layer.

        Args:
            e_var (chainer.Variable): Variable of input array (query side).
            s_var (chainer.Variable): Variable of source array from encoder.
                When ``None``, keys and values are computed from ``e_var``
                (self-attention).
            mask (N-dimensional array): Attention mask; positions where it is
                false are filled with ``MIN_VALUE`` before the softmax.
            batch (int): Batch size.

        Returns:
            chainer.Variable: Output of multi-head attention layer, with the
                batch and time axes flattened into the first axis.

        """
        xp = self.xp
        # keys/values come from the source when given, else from the input
        kv_var = e_var if s_var is None else s_var
        # each projection is reshaped to (batch, time, head, d_k)
        Q = self.linear_q(e_var).reshape(batch, -1, self.h, self.d_k)
        K = self.linear_k(kv_var).reshape(batch, -1, self.h, self.d_k)
        V = self.linear_v(kv_var).reshape(batch, -1, self.h, self.d_k)
        # (batch, head, time1, d_k) x (batch, head, d_k, time2)
        scores = F.matmul(F.swapaxes(Q, 1, 2), K.transpose(0, 2, 3, 1)) / np.sqrt(
            self.d_k
        )
        if mask is not None:
            # replicate the mask for every head
            mask = xp.stack([mask] * self.h, axis=1)
            scores = F.where(mask, scores, xp.full(scores.shape, MIN_VALUE, "f"))
        self.attn = F.softmax(scores, axis=-1)
        p_attn = F.dropout(self.attn, self.dropout)
        x = F.matmul(p_attn, F.swapaxes(V, 1, 2))
        # merge the heads back: (batch * time1, head * d_k)
        x = F.swapaxes(x, 1, 2).reshape(-1, self.h * self.d_k)
        return self.linear_out(x)


================================================
FILE: nets/chainer_backend/transformer/ctc.py
================================================
# encoding: utf-8
"""Class Declaration of Transformer's CTC."""
import logging

import chainer
import chainer.functions as F
import chainer.links as L
import numpy as np


# TODO(nelson): Merge chainer_backend/transformer/ctc.py in chainer_backend/ctc.py
class CTC(chainer.Chain):
    """CTC layer based on Chainer's built-in CTC loss function.

    Args:
        odim (int): The output dimension.
        eprojs (int | None): Dimension of input vectors from encoder.
        dropout_rate (float): Dropout rate.

    """

    def __init__(self, odim, eprojs, dropout_rate):
        """Initialize CTC."""
        super(CTC, self).__init__()
        self.loss = None
        self.dropout_rate = dropout_rate

        with self.init_scope():
            # Projection from encoder outputs to output symbols.
            self.ctc_lo = L.Linear(eprojs, odim)

    def __call__(self, hs, ys):
        """Compute the CTC loss of a batch.

        Args:
            hs (list of chainer.Variable | N-dimension array):
                Input variable from encoder.
            ys (list of chainer.Variable | N-dimension array):
                Input variable of decoder.

        Returns:
            chainer.Variable: A variable holding a scalar value of the CTC loss.

        """
        self.loss = None
        # Remember the unpadded lengths before padding.
        ilens = [h.shape[0] for h in hs]
        olens = [y.shape[0] for y in ys]

        # Zero-pad hs, apply dropout and project every frame.
        padded_hs = F.dropout(F.pad_sequence(hs), ratio=self.dropout_rate)
        projected = self.ctc_lo(padded_hs, n_batch_axes=2)
        y_hat = F.separate(projected, axis=1)  # ilen list of batch x hdim

        # Pad ys with -1.
        y_true = F.pad_sequence(ys, padding=-1)  # batch x olen

        # Length information.
        input_length = chainer.Variable(self.xp.array(ilens, dtype=np.int32))
        label_length = chainer.Variable(self.xp.array(olens, dtype=np.int32))
        tag = self.__class__.__name__
        logging.info(tag + " input lengths:  " + str(input_length.data))
        logging.info(tag + " output lengths: " + str(label_length.data))

        # CTC loss with label 0 as the blank symbol.
        self.loss = F.connectionist_temporal_classification(
            y_hat, y_true, 0, input_length, label_length
        )
        logging.info("ctc loss:" + str(self.loss.data))

        return self.loss

    def log_softmax(self, hs):
        """Compute the log-softmax of the projected frame activations.

        Args:
            hs (list of chainer.Variable | N-dimension array):
                Input variable from encoder.

        Returns:
            chainer.Variable: A n-dimension float array.

        """
        y_hat = self.ctc_lo(F.pad_sequence(hs), n_batch_axes=2)
        # Softmax is taken over the last axis via a 2-D view.
        flat = y_hat.reshape(-1, y_hat.shape[-1])
        return F.log_softmax(flat).reshape(y_hat.shape)


class WarpCTC(chainer.Chain):
    """CTC layer based on the ``chainer_ctc`` warp-ctc binding.

    Args:
        odim (int): The output dimension.
        eprojs (int | None): Dimension of input vector from encoder.
        dropout_rate (float): Dropout rate.

    """

    def __init__(self, odim, eprojs, dropout_rate):
        """Initialize WarpCTC."""
        super(WarpCTC, self).__init__()
        # The main difference between the ctc for transformer and
        # the rnn is because the target (ys) is already a list of
        # arrays located in the cpu, while in rnn routine the target is
        # a list of variables located in cpu/gpu. If the target of rnn becomes
        # a list of cpu arrays then this file would be no longer required.
        from chainer_ctc.warpctc import ctc as warp_ctc

        self.loss = None
        self.dropout_rate = dropout_rate
        self.ctc = warp_ctc

        with self.init_scope():
            # Projection from encoder outputs to output symbols.
            self.ctc_lo = L.Linear(eprojs, odim)

    def forward(self, hs, ys):
        """Compute the warp-ctc loss of a batch.

        Args:
            hs (iterable of chainer.Variable | N-dimension array):
                Input variable from encoder.
            ys (iterable of N-dimension array): Input variable of decoder.

        Returns:
           chainer.Variable: A variable holding a scalar value of the CTC loss.

        """
        self.loss = None
        n_batch, n_frames = hs.shape[0], hs.shape[1]
        # Every utterance is given the full (padded) number of frames.
        ilens = [n_frames] * n_batch
        olens = [y.shape[0] for y in ys]

        # batch x frames x hdim > frames x batch x hdim
        projected = self.ctc_lo(
            F.dropout(hs, ratio=self.dropout_rate), n_batch_axes=2
        )
        y_hat = projected.transpose(1, 0, 2)

        # Length information.
        tag = self.__class__.__name__
        logging.info(tag + " input lengths:  " + str(ilens))
        logging.info(tag + " output lengths: " + str(olens))

        # Only the first element of the warp-ctc result is used as the loss.
        self.loss = self.ctc(y_hat, ilens, ys)[0]
        logging.info("ctc loss:" + str(self.loss.data))
        return self.loss

    def log_softmax(self, hs):
        """Compute the log-softmax of the projected frame activations.

        Args:
            hs (list of chainer.Variable | N-dimension array):
                Input variable from encoder.

        Returns:
            chainer.Variable: A n-dimension float array.

        """
        y_hat = self.ctc_lo(F.pad_sequence(hs), n_batch_axes=2)
        # Softmax is taken over the last axis via a 2-D view.
        flat = y_hat.reshape(-1, y_hat.shape[-1])
        return F.log_softmax(flat).reshape(y_hat.shape)

    def argmax(self, hs_pad):
        """Compute the argmax of the projected frame activations.

        :param chainer variable hs_pad: 3d tensor (B, Tmax, eprojs)
        :return: argmax applied 2d tensor (B, Tmax)
        :rtype: chainer.Variable.
        """
        projected = self.ctc_lo(F.pad_sequence(hs_pad), n_batch_axes=2)
        return F.argmax(projected, axis=-1)


================================================
FILE: nets/chainer_backend/transformer/decoder.py
================================================
# encoding: utf-8
"""Class Declaration of Transformer's Decoder."""

import chainer

import chainer.functions as F
import chainer.links as L

from espnet.nets.chainer_backend.transformer.decoder_layer import DecoderLayer
from espnet.nets.chainer_backend.transformer.embedding import PositionalEncoding
from espnet.nets.chainer_backend.transformer.layer_norm import LayerNorm
from espnet.nets.chainer_backend.transformer.mask import make_history_mask

import numpy as np


class Decoder(chainer.Chain):
    """Transformer decoder: embedding, stacked decoder blocks and output layer.

    Args:
        odim (int): The output dimension. ``odim - 1`` is used as both the
            sos and the eos label.
        args (Namespace): Configuration providing ``adim``, ``dropout_rate``,
            ``dlayers``, ``dunits`` and ``aheads``.
        initialW (Initializer): Initializer class for the weights
            (``chainer.initializers.Uniform`` when None).
        initial_bias (Initializer): Initializer class for the biases
            (``chainer.initializers.Uniform`` when None).

    """

    def __init__(self, odim, args, initialW=None, initial_bias=None):
        """Initialize Decoder."""
        super(Decoder, self).__init__()
        self.sos = odim - 1
        self.eos = odim - 1
        if initialW is None:
            initialW = chainer.initializers.Uniform
        if initial_bias is None:
            initial_bias = chainer.initializers.Uniform
        with self.init_scope():
            self.output_norm = LayerNorm(args.adim)
            self.pe = PositionalEncoding(args.adim, args.dropout_rate)
            stvd = 1.0 / np.sqrt(args.adim)
            self.output_layer = L.Linear(
                args.adim,
                odim,
                initialW=initialW(scale=stvd),
                initial_bias=initial_bias(scale=stvd),
            )
            self.embed = L.EmbedID(
                odim,
                args.adim,
                ignore_label=-1,
                initialW=chainer.initializers.Normal(scale=1.0),
            )
        # The blocks are registered under the names "decoders.0", "decoders.1", ...
        for index in range(args.dlayers):
            block = DecoderLayer(
                args.adim,
                d_units=args.dunits,
                h=args.aheads,
                dropout=args.dropout_rate,
                initialW=initialW,
                initial_bias=initial_bias,
            )
            self.add_link("decoders.%d" % index, block)
        self.n_layers = args.dlayers

    def make_attention_mask(self, source_block, target_block):
        """Build a mask that is true where both positions are non-negative.

        Args:
            source_block (ndarray): Source block with dimensions: (B x S).
            target_block (ndarray): Target block with dimensions: (B x T).
        Returns:
            ndarray: Mask with dimensions (B, S, T).

        """
        target_valid = target_block[:, None, :] >= 0
        source_valid = source_block[:, :, None] >= 0
        # (batch, source_length, target_length)
        return target_valid * source_valid

    def forward(self, ys_pad, source, x_mask):
        """Forward decoder.

        Each target sequence is prefixed with the sos label and the batch is
        padded with the eos label before embedding.

        :param ys_pad: iterable of target token id arrays, one per utterance
        :param source: encoded memory; it is flattened to (-1, adim)
        :param x_mask: encoder mask; entries >= 0 are treated as valid
        :return: decoded token score before softmax (batch, maxlen_out, token)
        :rtype: chainer.Variable
        """
        xp = self.xp
        sos = np.array([self.sos], np.int32)
        prefixed = [np.concatenate([sos, y], axis=0) for y in ys_pad]
        e = xp.array(F.pad_sequence(prefixed, padding=self.eos).data)

        # Mask preparation: the self-attention mask is additionally causal.
        xy_mask = self.make_attention_mask(e, xp.array(x_mask))
        yy_mask = self.make_attention_mask(e, e)
        yy_mask *= make_history_mask(xp, e)

        e = self.pe(self.embed(e))
        batch, length, dims = e.shape
        # The blocks operate on flattened (batch * time, dims) inputs.
        e = e.reshape(-1, dims)
        source = source.reshape(-1, dims)
        for index in range(self.n_layers):
            block = self["decoders.%d" % index]
            e = block(e, source, xy_mask, yy_mask, batch)
        scores = self.output_layer(self.output_norm(e))
        return scores.reshape(batch, length, -1)

    def recognize(self, e, yy_mask, source):
        """Return the log-softmax of ``forward(e, source, yy_mask)``.

        Note that ``yy_mask`` is handed to ``forward`` as its ``x_mask``.
        """
        scores = self.forward(e, source, yy_mask)
        return F.log_softmax(scores, axis=-1)


================================================
FILE: nets/chainer_backend/transformer/decoder_layer.py
================================================
# encoding: utf-8
"""Class Declaration of Transformer's Decoder Block."""

import chainer

import chainer.functions as F

from espnet.nets.chainer_backend.transformer.attention import MultiHeadAttention
from espnet.nets.chainer_backend.transformer.layer_norm import LayerNorm
from espnet.nets.chainer_backend.transformer.positionwise_feed_forward import (
    PositionwiseFeedForward,  # noqa: H301
)


class DecoderLayer(chainer.Chain):
    """Single decoder block: self attention, source attention, feed forward.

    Args:
        n_units (int): Number of input/output dimension of a FeedForward layer.
        d_units (int): Number of units of hidden layer in a FeedForward layer.
        h (int): Number of attention heads.
        dropout (float): Dropout rate
        initialW (Initializer): Weight initializer forwarded to the sub-layers.
        initial_bias (Initializer): Bias initializer forwarded to the sub-layers.

    """

    def __init__(
        self, n_units, d_units=0, h=8, dropout=0.1, initialW=None, initial_bias=None
    ):
        """Initialize DecoderLayer."""
        super(DecoderLayer, self).__init__()
        # Both attention layers are configured identically.
        attention_kwargs = dict(
            dropout=dropout, initialW=initialW, initial_bias=initial_bias
        )
        with self.init_scope():
            self.self_attn = MultiHeadAttention(n_units, h, **attention_kwargs)
            self.src_attn = MultiHeadAttention(n_units, h, **attention_kwargs)
            self.feed_forward = PositionwiseFeedForward(
                n_units,
                d_units=d_units,
                dropout=dropout,
                initialW=initialW,
                initial_bias=initial_bias,
            )
            self.norm1 = LayerNorm(n_units)
            self.norm2 = LayerNorm(n_units)
            self.norm3 = LayerNorm(n_units)
        self.dropout = dropout

    def forward(self, e, s, xy_mask, yy_mask, batch):
        """Compute one decoder block.

        Every sub-layer is applied to the normalized input and added back to
        the un-normalized input after dropout.

        Args:
            e (chainer.Variable): Decoder input.
            s (chainer.Variable): Encoder output used by the source attention.
            xy_mask (ndarray): Mask of the source attention.
            yy_mask (ndarray): Mask of the self attention.
            batch (int): Batch size.

        Returns:
            chainer.Variable: Computed variable of decoder.

        """
        # Self attention.
        attended = self.self_attn(self.norm1(e), mask=yy_mask, batch=batch)
        e = e + F.dropout(attended, self.dropout)

        # Attention over the encoder output.
        attended = self.src_attn(self.norm2(e), s_var=s, mask=xy_mask, batch=batch)
        e = e + F.dropout(attended, self.dropout)

        # Position-wise feed forward.
        transformed = self.feed_forward(self.norm3(e))
        return e + F.dropout(transformed, self.dropout)


================================================
FILE: nets/chainer_backend/transformer/embedding.py
================================================
# encoding: utf-8
"""Class Declaration of Transformer's Positional Encoding."""

import chainer
import chainer.functions as F

import numpy as np


class PositionalEncoding(chainer.Chain):
    """Sinusoidal positional encoding with input scaling and dropout.

    :param int n_units: embedding dim
    :param float dropout: dropout rate
    :param int length: maximum input length

    """

    def __init__(self, n_units, dropout=0.1, length=5000):
        """Precompute the encoding table of shape (length, n_units)."""
        # Implementation described in the paper
        super(PositionalEncoding, self).__init__()
        self.dropout = dropout
        positions = np.arange(0, length, dtype=np.float32)[:, None]
        decay = -(np.log(10000.0) / n_units)
        frequencies = np.exp(np.arange(0, n_units, 2, dtype=np.float32) * decay)
        angles = positions * frequencies
        # Even columns hold the sine, odd columns the cosine.
        table = np.zeros((length, n_units), dtype=np.float32)
        table[:, ::2] = np.sin(angles)
        table[:, 1::2] = np.cos(angles)
        self.pe = table
        self.scale = np.sqrt(n_units)

    def forward(self, e):
        """Scale the input, add the positional encoding and apply dropout."""
        n_steps = e.shape[1]
        # The table is sliced along the second axis length of the input.
        encoding = self.xp.array(self.pe[:n_steps])
        return F.dropout(e * self.scale + encoding, self.dropout)


================================================
FILE: nets/chainer_backend/transformer/encoder.py
================================================
# encoding: utf-8
"""Class Declaration of Transformer's Encoder."""

import chainer

from chainer import links as L

from espnet.nets.chainer_backend.transformer.embedding import PositionalEncoding
from espnet.nets.chainer_backend.transformer.encoder_layer import EncoderLayer
from espnet.nets.chainer_backend.transformer.layer_norm import LayerNorm
from espnet.nets.chainer_backend.transformer.mask import make_history_mask
from espnet.nets.chainer_backend.transformer.subsampling import Conv2dSubsampling
from espnet.nets.chainer_backend.transformer.subsampling import LinearSampling

import logging
import numpy as np


class Encoder(chainer.Chain):
    """Transformer encoder: input layer, stacked encoder blocks and a final norm.

    Args:
        idim (int): Dimension of inputs.
        attention_dim (int): Number of units of the input layer output, of
            every encoder block and of the final layer normalization.
        attention_heads (int): Number of attention heads of each block.
        linear_units (int): Number of hidden units of each block's
            feed-forward layer.
        num_blocks (int): Number of encoder blocks.
        dropout_rate (float): Dropout rate handed to the `conv2d` input layer.
        positional_dropout_rate (float): Dropout rate of the positional
            encoding used by the `embed` input layer.
        attention_dropout_rate (float): Dropout rate handed to every block.
        input_layer (str): Input layer type; must be `conv2d`, `linear` or
            `embed`.
        pos_enc_class (type): Positional encoding class used by `embed`.
        initialW (Initializer): Initializer class for the weights
            (``chainer.initializers.Uniform`` when None).
        initial_bias (Initializer): Initializer class for the biases
            (``chainer.initializers.Uniform`` when None).

    """

    def __init__(
        self,
        idim,
        attention_dim=256,
        attention_heads=4,
        linear_units=2048,
        num_blocks=6,
        dropout_rate=0.1,
        positional_dropout_rate=0.1,
        attention_dropout_rate=0.0,
        input_layer="conv2d",
        pos_enc_class=PositionalEncoding,
        initialW=None,
        initial_bias=None,
    ):
        """Initialize Encoder.

        See the class docstring for the meaning of the arguments.

        Raises:
            ValueError: If `input_layer` is not a supported type.

        """
        super(Encoder, self).__init__()
        initialW = chainer.initializers.Uniform if initialW is None else initialW
        initial_bias = (
            chainer.initializers.Uniform if initial_bias is None else initial_bias
        )
        # Only the `embed` input layer enables the history mask.
        self.do_history_mask = False
        with self.init_scope():
            self.conv_subsampling_factor = 1
            channels = 64  # Based in paper
            if input_layer == "conv2d":
                # Feature size after two stride-2 convolutions, times channels.
                idim = int(np.ceil(np.ceil(idim / 2) / 2)) * channels
                self.input_layer = Conv2dSubsampling(
                    channels,
                    idim,
                    attention_dim,
                    dropout=dropout_rate,
                    initialW=initialW,
                    initial_bias=initial_bias,
                )
                self.conv_subsampling_factor = 4
            elif input_layer == "linear":
                self.input_layer = LinearSampling(
                    idim, attention_dim, initialW=initialW, initial_bias=initial_bias
                )
            elif input_layer == "embed":
                self.input_layer = chainer.Sequential(
                    L.EmbedID(idim, attention_dim, ignore_label=-1),
                    pos_enc_class(attention_dim, positional_dropout_rate),
                )
                self.do_history_mask = True
            else:
                raise ValueError("unknown input_layer: " + input_layer)
            self.norm = LayerNorm(attention_dim)
        # The blocks are registered under the names "encoders.0", "encoders.1", ...
        for i in range(num_blocks):
            name = "encoders." + str(i)
            layer = EncoderLayer(
                attention_dim,
                d_units=linear_units,
                h=attention_heads,
                dropout=attention_dropout_rate,
                initialW=initialW,
                initial_bias=initial_bias,
            )
            self.add_link(name, layer)
        self.n_layers = num_blocks

    def forward(self, e, ilens):
        """Compute the encoder output.

        Args:
            e (chainer.Variable): Batch of padded inputs.
            ilens (sequence of int): Length of each input in the batch. (B,)

        Returns:
            chainer.Variable: Encoder output reshaped to (batch, length, -1).
            numpy.ndarray: Mask of shape (batch, length) holding 1 for valid
                positions and -1 for positions at or beyond the input length.
            Lengths of the encoder outputs, as returned by the input layer
                (unchanged for the `embed` input layer).

        """
        # Both sampling layers take (xs, ilens) and return (xs, ilens); calling
        # LinearSampling with a single argument would raise a TypeError.
        if isinstance(self.input_layer, (Conv2dSubsampling, LinearSampling)):
            e, ilens = self.input_layer(e, ilens)
        else:
            e = self.input_layer(e)
        batch, length, dims = e.shape
        x_mask = np.ones([batch, length])
        for j in range(batch):
            x_mask[j, ilens[j] :] = -1
        # A pair of positions may attend only if both are valid.
        xx_mask = (x_mask[:, None, :] >= 0) * (x_mask[:, :, None] >= 0)
        xx_mask = self.xp.array(xx_mask)
        if self.do_history_mask:
            history_mask = make_history_mask(self.xp, x_mask)
            xx_mask *= history_mask
        logging.debug("encoders size: " + str(e.shape))
        # The blocks operate on flattened (batch * time, dims) inputs.
        e = e.reshape(-1, dims)
        for i in range(self.n_layers):
            e = self["encoders." + str(i)](e, xx_mask, batch)
        return self.norm(e).reshape(batch, length, -1), x_mask, ilens


================================================
FILE: nets/chainer_backend/transformer/encoder_layer.py
================================================
# encoding: utf-8
"""Class Declaration of Transformer's Encoder Block."""

import chainer

import chainer.functions as F

from espnet.nets.chainer_backend.transformer.attention import MultiHeadAttention
from espnet.nets.chainer_backend.transformer.layer_norm import LayerNorm
from espnet.nets.chainer_backend.transformer.positionwise_feed_forward import (
    PositionwiseFeedForward,  # noqa: H301
)


class EncoderLayer(chainer.Chain):
    """Single encoder block: self attention followed by feed forward.

    Args:
        n_units (int): Number of input/output dimension of a FeedForward layer.
        d_units (int): Number of units of hidden layer in a FeedForward layer.
        h (int): Number of attention heads.
        dropout (float): Dropout rate
        initialW (Initializer): Weight initializer forwarded to the sub-layers.
        initial_bias (Initializer): Bias initializer forwarded to the sub-layers.

    """

    def __init__(
        self, n_units, d_units=0, h=8, dropout=0.1, initialW=None, initial_bias=None
    ):
        """Initialize EncoderLayer."""
        super(EncoderLayer, self).__init__()
        # The same initializers are handed to both sub-layers.
        init_kwargs = dict(initialW=initialW, initial_bias=initial_bias)
        with self.init_scope():
            self.self_attn = MultiHeadAttention(
                n_units, h, dropout=dropout, **init_kwargs
            )
            self.feed_forward = PositionwiseFeedForward(
                n_units, d_units=d_units, dropout=dropout, **init_kwargs
            )
            self.norm1 = LayerNorm(n_units)
            self.norm2 = LayerNorm(n_units)
        self.n_units = n_units
        self.dropout = dropout

    def forward(self, e, xx_mask, batch):
        """Compute one encoder block.

        Every sub-layer is applied to the normalized input and added back to
        the un-normalized input after dropout.

        Args:
            e (chainer.Variable): Encoder input.
            xx_mask (ndarray): Mask of the self attention.
            batch (int): Batch size.

        Returns:
            chainer.Variable: Output of the block.

        """
        # Self attention.
        attended = self.self_attn(self.norm1(e), mask=xx_mask, batch=batch)
        e = e + F.dropout(attended, self.dropout)

        # Position-wise feed forward.
        transformed = self.feed_forward(self.norm2(e))
        return e + F.dropout(transformed, self.dropout)


================================================
FILE: nets/chainer_backend/transformer/label_smoothing_loss.py
================================================
# encoding: utf-8
"""Class Declaration of Transformer's Label Smoothing loss."""

import logging

import chainer

import chainer.functions as F


class LabelSmoothingLoss(chainer.Chain):
    """Cross entropy loss with optional label smoothing.

    Args:
        smoothing (float): smoothing rate (0.0 means the conventional CE).
        n_target_vocab (int): number of classes.
        normalize_length (bool): normalize loss by sequence length if True.
        ignore_id (int): stored on the instance; targets below zero are the
            ones excluded from the loss.

    """

    def __init__(self, smoothing, n_target_vocab, normalize_length=False, ignore_id=-1):
        """Initialize Loss."""
        super(LabelSmoothingLoss, self).__init__()
        self.acc = None
        self.ignore_id = ignore_id
        self.normalize_length = normalize_length
        self.use_label_smoothing = False
        # The smoothing attributes only exist when smoothing is enabled.
        if smoothing > 0.0:
            logging.info("Use label smoothing")
            self.use_label_smoothing = True
            self.smoothing = smoothing
            self.confidence = 1.0 - smoothing
            self.n_target_vocab = n_target_vocab

    def forward(self, ys_block, ys_pad):
        """Forward Loss.

        Args:
            ys_block (chainer.Variable): Predicted scores with three axes
                (batch, length, classes).
            ys_pad (chainer.Variable): Target (true) labels with
                ``batch * length`` entries; negative entries are ignored.

        Returns:
            chainer.Variable: Training loss.

        """
        # Output (all together at once for efficiency)
        batch, length, dims = ys_block.shape
        logits = ys_block.reshape(-1, dims)

        # Target reshape
        targets = ys_pad.reshape((batch * length))
        valid = targets >= 0
        n_token = valid.sum()
        normalizer = n_token if self.normalize_length else batch

        if not self.use_label_smoothing:
            # Rescale the cross entropy from a per-token to the chosen normalizer.
            cross_entropy = F.softmax_cross_entropy(logits, targets)
            return cross_entropy * n_token / normalizer

        log_prob = F.log_softmax(logits)
        valid_per_class = self.xp.broadcast_to(valid[:, None], logits.shape)
        # Log-probability of the target class, zeroed for ignored positions.
        target_log_prob = (
            valid * log_prob[self.xp.arange(batch * length), targets]
        )
        nll = -F.sum(target_log_prob) / normalizer
        # Cross entropy against the uniform distribution over all classes.
        uniform_term = valid_per_class * -1.0 / self.n_target_vocab * log_prob
        uniform_term = F.sum(uniform_term) / normalizer
        return self.confidence * nll + self.smoothing * uniform_term


================================================
FILE: nets/chainer_backend/transformer/layer_norm.py
================================================
# encoding: utf-8
"""Class Declaration of Transformer's Layer Normalization."""

import chainer.links as L


class LayerNorm(L.LayerNormalization):
    """Thin wrapper around ``chainer.links.LayerNormalization``.

    Args:
        dims (int): Passed to the parent as ``size``.
        eps (float): Passed to the parent as ``eps``.

    """

    def __init__(self, dims, eps=1e-12):
        """Initialize LayerNorm."""
        parent = super(LayerNorm, self)
        parent.__init__(size=dims, eps=eps)

    def __call__(self, e):
        """Apply the parent layer normalization to ``e``."""
        normalized = super(LayerNorm, self).__call__(e)
        return normalized


================================================
FILE: nets/chainer_backend/transformer/mask.py
================================================
"""Create mask for subsequent steps."""


def make_history_mask(xp, block):
    """Build a lower-triangular (causal) mask for every batch entry.

    Entry ``[b, i, j]`` of the result is True exactly when ``j <= i``.

    Args:
        xp (module): Array module (e.g. numpy) used to create the mask.
        block (ndarray): Block with dimensions: (B x S). Only its shape is used.
    Returns:
        ndarray: Boolean history mask with dimensions (B, S, S).

    """
    n_batch, n_steps = block.shape
    steps = xp.arange(n_steps)
    # Row index i may see column index j only when j is not later than i.
    causal = steps[:, None] >= steps[None, :]
    return xp.broadcast_to(causal[None], (n_batch, n_steps, n_steps))


================================================
FILE: nets/chainer_backend/transformer/positionwise_feed_forward.py
================================================
# encoding: utf-8
"""Class Declaration of Transformer's Positionwise Feedforward."""

import chainer

import chainer.functions as F
import chainer.links as L

import numpy as np


class PositionwiseFeedForward(chainer.Chain):
    """Positionwise feed forward: linear, ReLU, dropout, linear.

    Args:
        n_units (int): Input and output dimension.
        d_units (int): Number of hidden units; ``n_units * 4`` is used when
            this is not positive.
        dropout (float): Dropout rate applied after the activation.
        initialW (Initializer): Initializer class for the weights
            (``chainer.initializers.Uniform`` when None).
        initial_bias (Initializer): Initializer class for the biases
            (``chainer.initializers.Uniform`` when None).

    """

    def __init__(
        self, n_units, d_units=0, dropout=0.1, initialW=None, initial_bias=None
    ):
        """Initialize PositionwiseFeedForward.

        Args:
            n_units (int): Input dimension.
            d_units (int, optional): Output dimension of hidden layer.
            dropout (float, optional): Dropout ratio.
            initialW (Initializer, optional): Initializer class for the weight.
            initial_bias (Initializer, optional): Initializer class for the bias.

        """
        super(PositionwiseFeedForward, self).__init__()
        # The initializers are called below, so the None defaults must be
        # replaced by a usable initializer class first.
        if initialW is None:
            initialW = chainer.initializers.Uniform
        if initial_bias is None:
            initial_bias = chainer.initializers.Uniform
        n_inner_units = d_units if d_units > 0 else n_units * 4
        with self.init_scope():
            # Each layer is initialized with scale 1 / sqrt(fan-in).
            stvd = 1.0 / np.sqrt(n_units)
            self.w_1 = L.Linear(
                n_units,
                n_inner_units,
                initialW=initialW(scale=stvd),
                initial_bias=initial_bias(scale=stvd),
            )
            stvd = 1.0 / np.sqrt(n_inner_units)
            self.w_2 = L.Linear(
                n_inner_units,
                n_units,
                initialW=initialW(scale=stvd),
                initial_bias=initial_bias(scale=stvd),
            )
            self.act = F.relu
        self.dropout = dropout

    def __call__(self, e):
        """Apply the feed-forward transformation.

        Args:
            e (chainer.Variable): Input variable.

        Return:
            chainer.Variable: Output variable.

        """
        e = F.dropout(self.act(self.w_1(e)), self.dropout)
        return self.w_2(e)


================================================
FILE: nets/chainer_backend/transformer/subsampling.py
================================================
# encoding: utf-8
"""Class Declaration of Transformer's Input layers."""

import chainer

import chainer.functions as F
import chainer.links as L

from espnet.nets.chainer_backend.transformer.embedding import PositionalEncoding

import logging
import numpy as np


class Conv2dSubsampling(chainer.Chain):
    """Convolutional 2D subsampling (to 1/4 length).

    Args:
        channels (int): Number of output channels of both convolutions.
        idim (int): Input size of the output linear layer.
        dims (int): Output dimension.
        dropout (float): Dropout rate of the positional encoding.
        initialW (Initializer): Initializer class for the weights
            (``chainer.initializers.Uniform`` when None).
        initial_bias (Initializer): Initializer class for the biases
            (``chainer.initializers.Uniform`` when None).

    """

    def __init__(
        self, channels, idim, dims, dropout=0.1, initialW=None, initial_bias=None
    ):
        """Initialize Conv2dSubsampling."""
        super(Conv2dSubsampling, self).__init__()
        # The initializers are called below, so the None defaults must be
        # replaced by a usable initializer class first.
        if initialW is None:
            initialW = chainer.initializers.Uniform
        if initial_bias is None:
            initial_bias = chainer.initializers.Uniform
        self.dropout = dropout
        with self.init_scope():
            # Standard deviation for Conv2D with 1 channel and kernel 3 x 3.
            n = 1 * 3 * 3
            stvd = 1.0 / np.sqrt(n)
            self.conv1 = L.Convolution2D(
                1,
                channels,
                3,
                stride=2,
                pad=1,
                initialW=initialW(scale=stvd),
                initial_bias=initial_bias(scale=stvd),
            )
            n = channels * 3 * 3
            stvd = 1.0 / np.sqrt(n)
            self.conv2 = L.Convolution2D(
                channels,
                channels,
                3,
                stride=2,
                pad=1,
                initialW=initialW(scale=stvd),
                initial_bias=initial_bias(scale=stvd),
            )
            stvd = 1.0 / np.sqrt(dims)
            self.out = L.Linear(
                idim,
                dims,
                initialW=initialW(scale=stvd),
                initial_bias=initial_bias(scale=stvd),
            )
            self.pe = PositionalEncoding(dims, dropout)

    def forward(self, xs, ilens):
        """Subsample the input and its lengths.

        Args:
            xs (ndarray): Input batch; a channel axis is inserted at index 1.
            ilens (sequence of int): Length of each input in the batch.

        Returns:
            chainer.Variable: Subsampled input of shape (batch, length, dims).
            numpy.ndarray: Integer lengths after subsampling.

        """
        xs = self.xp.array(xs[:, None])
        xs = F.relu(self.conv1(xs))
        xs = F.relu(self.conv2(xs))
        batch, _, length, _ = xs.shape
        xs = self.out(F.swapaxes(xs, 1, 2).reshape(batch * length, -1))
        xs = self.pe(xs.reshape(batch, length, -1))
        # Change ilens accordingly: each stride-2 convolution halves the
        # length, rounding up. The builtin ``int`` replaces the ``np.int``
        # alias, which no longer exists in NumPy >= 1.24.
        for _ in range(2):
            ilens = np.ceil(np.array(ilens, dtype=np.float32) / 2).astype(int)
        return xs, ilens


class LinearSampling(chainer.Chain):
    """Linear input layer followed by positional encoding.

    Args:
        idim (int): Input dimension.
        dims (int): Output dimension.
        dropout (float): Dropout rate of the positional encoding.
        initialW (Initializer): Initializer class for the weights
            (``chainer.initializers.Uniform`` when None).
        initial_bias (Initializer): Initializer class for the biases
            (``chainer.initializers.Uniform`` when None).

    """

    def __init__(self, idim, dims, dropout=0.1, initialW=None, initial_bias=None):
        """Initialize LinearSampling."""
        super(LinearSampling, self).__init__()
        # The initializers are called below, so the None defaults must be
        # replaced by a usable initializer class first.
        if initialW is None:
            initialW = chainer.initializers.Uniform
        if initial_bias is None:
            initial_bias = chainer.initializers.Uniform
        stvd = 1.0 / np.sqrt(dims)
        self.dropout = dropout
        with self.init_scope():
            self.linear = L.Linear(
                idim,
                dims,
                initialW=initialW(scale=stvd),
                initial_bias=initial_bias(scale=stvd),
            )
            self.pe = PositionalEncoding(dims, dropout)

    def forward(self, xs, ilens):
        """Project the input and add the positional encoding.

        Args:
            xs (chainer.Variable): Input batch; the first two axes are treated
                as batch axes by the linear layer.
            ilens (sequence of int): Length of each input in the batch.

        Returns:
            chainer.Variable: Projected input with positional encoding.
            The lengths ``ilens``, returned unchanged.

        """
        logging.info(xs.shape)
        xs = self.linear(xs, n_batch_axes=2)
        logging.info(xs.shape)
        xs = self.pe(xs)
        return xs, ilens


================================================
FILE: nets/chainer_backend/transformer/training.py
================================================
# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
"""Class Declaration of Transformer's Training Subprocess."""
import collections
import logging
import math
import six

from chainer import cuda
from chainer import functions as F
from chainer import training
from chainer.training import extension
from chainer.training.updaters.multiprocess_parallel_updater import gather_grads
from chainer.training.updaters.multiprocess_parallel_updater import gather_params
from chainer.training.updaters.multiprocess_parallel_updater import scatter_grads
import numpy as np


# copied from https://github.com/chainer/chainer/blob/master/chainer/optimizer.py
def sum_sqnorm(arr):
    """Return the sum of the squared L2 norms of the given arrays.

    The squared norms are accumulated per device first and the per-device
    totals are then added up as Python floats.

    Args:
        arr (iterable of ndarray): Arrays to accumulate. ``None`` entries
            contribute nothing.

    Returns:
        Float: Sum of the squared norms calculated from the given arrays.

    """
    per_device = collections.defaultdict(float)
    for x in arr:
        with cuda.get_device_from_array(x) as dev:
            if x is None:
                continue
            flat = x.ravel()
            # Dot product of the flattened array with itself = squared norm.
            per_device[int(dev)] += flat.dot(flat)
    return sum(float(total) for total in per_device.values())


class CustomUpdater(training.StandardUpdater):
    """Single-device updater with gradient accumulation and a NaN guard.

    Args:
        train_iter (iterator | dict[str, iterator]): Dataset iterator for the
            training dataset. It can also be a dictionary that maps strings to
            iterators. If this is just an iterator, then the iterator is
            registered by the name ``'main'``.
        optimizer (optimizer | dict[str, optimizer]): Optimizer to update
            parameters. It can also be a dictionary that maps strings to
            optimizers. If this is just an optimizer, then the optimizer is
            registered by the name ``'main'``.
        converter (callable): Function that builds the model inputs. It is
            called as ``converter(batch, device)`` for every batch drawn from
            the main iterator.
        device (int or dict): The destination device info to send variables. In the
            case of cpu or single gpu, `device=-1 or 0`, respectively.
            In the case of multi-gpu, `device={"main":0, "sub_1": 1, ...}`.
        accum_grad (int): Number of forward/backward passes whose gradients
            are accumulated before one optimizer step, e.g. with 2 the
            effective batch size is doubled.

    """

    def __init__(self, train_iter, optimizer, converter, device, accum_grad=1):
        """Store the accumulation settings on top of the standard updater."""
        super(CustomUpdater, self).__init__(
            train_iter, optimizer, converter=converter, device=device
        )
        self.accum_grad = accum_grad
        # number of backward passes accumulated since the last optimizer step
        self.forward_count = 0
        # True until the gradients have been cleared for the very first time
        self.start = True
        self.device = device
        logging.debug("using custom converter for transformer")

    def update_core(self):
        """Run one forward/backward pass; step once enough are accumulated."""
        optimizer = self.get_optimizer("main")
        model = optimizer.target

        # Draw a batch and turn it into model inputs
        batch = self.get_iterator("main").next()
        inputs = self.converter(batch, self.device)
        if self.start:
            model.cleargrads()
            self.start = False

        # The loss is scaled so that the accumulated gradient is an average
        loss = model(*inputs) / self.accum_grad
        loss.backward()

        self.forward_count += 1
        if self.forward_count == self.accum_grad:
            self.forward_count = 0
            # inspect the gradient norm before touching the parameters
            grads = [p.grad for p in model.params(False)]
            grad_norm = np.sqrt(sum_sqnorm(grads))
            logging.info("grad norm={}".format(grad_norm))
            if math.isnan(grad_norm):
                logging.warning("grad norm is nan. Do not update model.")
            else:
                optimizer.update()
            # start the next accumulation window from zero gradients
            model.cleargrads()

    def update(self):
        """Advance ``iteration`` only when an accumulation window completed."""
        self.update_core()
        if self.forward_count == 0:
            self.iteration += 1


class CustomParallelUpdater(training.updaters.MultiprocessParallelUpdater):
    """Custom Parallel Updater for chainer.

    Multi-process updater with gradient accumulation and a NaN guard on the
    gradient norm. Gradients are reduced to the master through NCCL after
    every backward pass; parameters are broadcast back after each completed
    accumulation window.

    Args:
        train_iters: Dataset iterators, forwarded unchanged to
            ``MultiprocessParallelUpdater``.
        optimizer (optimizer | dict[str, optimizer]): Optimizer to update
            parameters. It can also be a dictionary that maps strings to
            optimizers. If this is just an optimizer, then the optimizer is
            registered by the name ``'main'``.
        converter (callable): Function that builds the model inputs. It is
            called as ``converter(batch, device)`` with the first device.
        devices: Devices forwarded unchanged to
            ``MultiprocessParallelUpdater``; the first entry is used by the
            master process in ``update_core``.
        accum_grad (int):The number of gradient accumulation. if set to 2, the network
            parameters will be updated once in twice,
            i.e. actual batchsize will be doubled.

    """

    def __init__(self, train_iters, optimizer, converter, devices, accum_grad=1):
        """Initialize custom parallel updater."""
        # Imported lazily so that the module can be loaded without cupy.
        from cupy.cuda import nccl

        super(CustomParallelUpdater, self).__init__(
            train_iters, optimizer, converter=converter, devices=devices
        )
        self.accum_grad = accum_grad
        # number of backward passes accumulated since the last optimizer step
        self.forward_count = 0
        # keep a handle on the nccl module for the constants used in update_core
        self.nccl = nccl
        logging.debug("using custom parallel updater for transformer")

    # The core part of the update routine can be customized by overriding.
    def update_core(self):
        """Process main update routine for Custom Parallel Updater."""
        self.setup_workers()

        # Tell the worker processes to run their own update step.
        self._send_message(("update", None))
        with cuda.Device(self._devices[0]):
            # For reducing memory
            optimizer = self.get_optimizer("main")
            batch = self.get_iterator("main").next()
            x = self.converter(batch, self._devices[0])

            # Scale the loss so that the accumulated gradient is an average.
            loss = self._master(*x) / self.accum_grad
            loss.backward()

            # NCCL: reduce grads
            null_stream = cuda.Stream.null
            if self.comm is not None:
                # Flatten all gradients, sum them in place onto rank 0, and
                # write the result back into the master model.
                gg = gather_grads(self._master)
                self.comm.reduce(
                    gg.data.ptr,
                    gg.data.ptr,
                    gg.size,
                    self.nccl.NCCL_FLOAT,
                    self.nccl.NCCL_SUM,
                    0,
                    null_stream.ptr,
                )
                scatter_grads(self._master, gg)
                del gg

            # update parameters
            self.forward_count += 1
            if self.forward_count != self.accum_grad:
                # Accumulation window not complete yet: no optimizer step and
                # no parameter broadcast for this call.
                return
            self.forward_count = 0
            # check gradient value
            grad_norm = np.sqrt(
                sum_sqnorm([p.grad for p in optimizer.target.params(False)])
            )
            logging.info("grad norm={}".format(grad_norm))

            # update
            if math.isnan(grad_norm):
                logging.warning("grad norm is nan. Do not update model.")
            else:
                optimizer.update()
            self._master.cleargrads()

            if self.comm is not None:
                # Broadcast the master's parameters from rank 0.
                gp = gather_params(self._master)
                self.comm.bcast(
                    gp.data.ptr, gp.size, self.nccl.NCCL_FLOAT, 0, null_stream.ptr
                )

    def update(self):
        """Update step for Custom Parallel Updater."""
        self.update_core()
        # Count an iteration only when an accumulation window has completed.
        if self.forward_count == 0:
            self.iteration += 1


class VaswaniRule(extension.Extension):
    """Trainer extension that sets an optimizer attribute by the Vaswani rule.

    At call number ``t`` (starting from 1) the attribute is set to
    ``scale * d ** -0.5 * min(t ** -0.5, t * warmup_steps ** -1.5)``.

    Args:
        attr (str): Name of the optimizer attribute to set.
        d (int or float): Value whose inverse square root scales the schedule.
        warmup_steps (int): Number of warm-up steps of the schedule.
        init (float): Initial value of the attribute. If it is ``None``, the
            schedule value for ``t = 1`` is used.
        target (float): Stored but not used by this extension.
        optimizer (~chainer.Optimizer): Target optimizer to adjust the
            attribute. If it is ``None``, the main optimizer of the updater is
            used.
        scale (float): Constant multiplier applied to the schedule.

    """

    def __init__(
        self,
        attr,
        d,
        warmup_steps=4000,
        init=None,
        target=None,
        optimizer=None,
        scale=1.0,
    ):
        """Precompute the constant factors of the schedule."""
        self._attr = attr
        self._optimizer = optimizer
        self._init = init
        self._target = target
        # constant factors: scale / sqrt(d) and warmup_steps ** -1.5
        self._d_inv05 = d ** (-0.5) * scale
        self._warmup_steps_inv15 = warmup_steps ** (-1.5)
        # step counter and the most recently applied value (both serialized)
        self._t = 0
        self._last_value = None

    def initialize(self, trainer):
        """Set the attribute before training starts or after a resume."""
        optimizer = self._get_optimizer(trainer)
        # make sure an initial value exists even if none was given
        if self._init is None:
            self._init = self._d_inv05 * (1.0 * self._warmup_steps_inv15)
        # a restored snapshot takes precedence over the initial value
        resumed = self._last_value is not None
        self._update_value(optimizer, self._last_value if resumed else self._init)

    def __call__(self, trainer):
        """Advance the step counter and apply the scheduled value."""
        self._t += 1
        warmup_term = self._t * self._warmup_steps_inv15
        decay_term = self._t ** (-0.5)
        value = self._d_inv05 * min(decay_term, warmup_term)
        self._update_value(self._get_optimizer(trainer), value)

    def serialize(self, serializer):
        """Save or restore the step counter and the last applied value."""
        self._t = serializer("_t", self._t)
        self._last_value = serializer("_last_value", self._last_value)

    def _get_optimizer(self, trainer):
        """Return the configured optimizer, else the updater's main one."""
        return self._optimizer or trainer.updater.get_optimizer("main")

    def _update_value(self, optimizer, value):
        """Write ``value`` to the optimizer attribute and remember it."""
        setattr(optimizer, self._attr, value)
        self._last_value = value


class CustomConverter(object):
    """Converter that pads the input sequences of a single-element batch."""

    def __init__(self):
        """Nothing to configure."""
        pass

    def __call__(self, batch, device):
        """Pad the inputs of the batch and collect their lengths.

        Args:
            batch (list): List holding exactly one ``(xs, ys)`` pair.
            device: Accepted for interface compatibility; not used here.

        Returns:
            array: Inputs padded with ``-1`` by ``F.pad_sequence``.
            numpy.ndarray: int32 array with ``shape[0]`` of every row of the
                padded input.
            ys: The targets of the batch, returned unchanged.

        """
        # For transformer, data is processed in CPU.
        # The batch must be wrapped in a one-element list.
        assert len(batch) == 1
        feats, ys = batch[0]
        padded = F.pad_sequence(feats, padding=-1).data
        # lengths are read from the rows of the padded array
        ilens = np.array([row.shape[0] for row in padded], dtype=np.int32)
        return padded, ilens, ys


================================================
FILE: nets/ctc_prefix_score.py
================================================
#!/usr/bin/env python3

# Copyright 2018 Mitsubishi Electric Research Labs (Takaaki Hori)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

import torch

import numpy as np
import six


class CTCPrefixScoreTH(object):
    """Batch processing of CTCPrefixScore

    which is based on Algorithm 2 in WATANABE et al.
    "HYBRID CTC/ATTENTION ARCHITECTURE FOR END-TO-END SPEECH RECOGNITION,"
    but extended to efficiently compute the label probabilities for multiple
    hypotheses simultaneously
    See also Seki et al. "Vectorized Beam Search for CTC-Attention-Based
    Speech Recognition," In INTERSPEECH (pp. 3825-3829), 2019.
    """

    def __init__(self, x, xlens, blank, eos, margin=0):
        """Construct CTC prefix scorer

        Note that ``x`` is modified in place: frames at or beyond each
        utterance length are overwritten with padding values.

        :param torch.Tensor x: input label posterior sequences (B, T, O)
        :param torch.Tensor xlens: input lengths (B,)
        :param int blank: blank label id
        :param int eos: end-of-sequence id
        :param int margin: margin parameter for windowing (0 means no windowing)
        """
        # In the comment lines,
        # we assume T: input_length, B: batch size, W: beam width, O: output dim.
        self.logzero = -10000000000.0
        self.blank = blank
        self.eos = eos
        self.batch = x.size(0)
        self.input_length = x.size(1)
        self.odim = x.size(2)
        self.dtype = x.dtype
        self.device = (
            torch.device("cuda:%d" % x.get_device())
            if x.is_cuda
            else torch.device("cpu")
        )
        # Pad the rest of posteriors in the batch
        # (padded frames get logzero for every label and 0 for blank)
        # TODO(takaaki-hori): need a better way without for-loops
        for i, l in enumerate(xlens):
            if l < self.input_length:
                x[i, l:, :] = self.logzero
                x[i, l:, blank] = 0
        # Reshape input x
        xn = x.transpose(0, 1)  # (B, T, O) -> (T, B, O)
        # blank scores broadcast over the label axis
        xb = xn[:, :, self.blank].unsqueeze(2).expand(-1, -1, self.odim)
        # index 0: per-label scores, index 1: blank scores
        self.x = torch.stack([xn, xb])  # (2, T, B, O)
        # index of the last valid frame of every utterance
        self.end_frames = torch.as_tensor(xlens) - 1

        # Setup CTC windowing
        self.margin = margin
        if margin > 0:
            self.frame_ids = torch.arange(
                self.input_length, dtype=self.dtype, device=self.device
            )
        # Base indices for index conversion
        self.idx_bh = None
        self.idx_b = torch.arange(self.batch, device=self.device)
        self.idx_bo = (self.idx_b * self.odim).unsqueeze(1)

    def __call__(self, y, state, scoring_ids=None, att_w=None):
        """Compute CTC prefix scores for next labels

        :param list y: prefix label sequences
        :param tuple state: previous CTC state
            ``(r_prev, s_prev, f_min_prev, f_max_prev)``, or None at the
            first step
        :param torch.Tensor scoring_ids: ids of the labels to score for each
            hypothesis; if None, all O labels are scored
        :param torch.Tensor att_w: attention weights to decide CTC window
        :return ctc_local_scores (BW, O), new_state where new_state is
            ``(r, log_psi, f_min, f_max, scoring_idmap)``
        """
        output_length = len(y[0]) - 1  # ignore sos
        last_ids = [yi[-1] for yi in y]  # last output label ids
        n_bh = len(last_ids)  # batch * hyps
        n_hyps = n_bh // self.batch  # assuming each utterance has the same # of hyps
        self.scoring_num = scoring_ids.size(-1) if scoring_ids is not None else 0
        # prepare state info
        if state is None:
            # initial state: only the blank-ending path has probability,
            # given by the cumulative sum of blank scores over time
            r_prev = torch.full(
                (self.input_length, 2, self.batch, n_hyps),
                self.logzero,
                dtype=self.dtype,
                device=self.device,
            )
            r_prev[:, 1] = torch.cumsum(self.x[0, :, :, self.blank], 0).unsqueeze(2)
            r_prev = r_prev.view(-1, 2, n_bh)
            s_prev = 0.0
            f_min_prev = 0
            f_max_prev = 1
        else:
            r_prev, s_prev, f_min_prev, f_max_prev = state

        # select input dimensions for scoring
        if self.scoring_num > 0:
            # scoring_idmap[h, label] is the position of `label` within
            # scoring_ids[h], or -1 when the label is not scored
            scoring_idmap = torch.full(
                (n_bh, self.odim), -1, dtype=torch.long, device=self.device
            )
            snum = self.scoring_num
            if self.idx_bh is None or n_bh > len(self.idx_bh):
                self.idx_bh = torch.arange(n_bh, device=self.device).view(-1, 1)
            scoring_idmap[self.idx_bh[:n_bh], scoring_ids] = torch.arange(
                snum, device=self.device
            )
            # flat indices into the (B * O) axis of self.x
            scoring_idx = (
                scoring_ids + self.idx_bo.repeat(1, n_hyps).view(-1, 1)
            ).view(-1)
            x_ = torch.index_select(
                self.x.view(2, -1, self.batch * self.odim), 2, scoring_idx
            ).view(2, -1, n_bh, snum)
        else:
            # score every label: replicate the posteriors for each hypothesis
            scoring_ids = None
            scoring_idmap = None
            snum = self.odim
            x_ = self.x.unsqueeze(3).repeat(1, 1, 1, n_hyps, 1).view(2, -1, n_bh, snum)

        # new CTC forward probs are prepared as a (T x 2 x BW x S) tensor
        # that corresponds to r_t^n(h) and r_t^b(h) in a batch.
        r = torch.full(
            (self.input_length, 2, n_bh, snum),
            self.logzero,
            dtype=self.dtype,
            device=self.device,
        )
        if output_length == 0:
            r[0, 0] = x_[0, 0]

        # r_sum: log(r^n + r^b) of the prefix for every frame and hypothesis
        r_sum = torch.logsumexp(r_prev, 1)
        log_phi = r_sum.unsqueeze(2).repeat(1, 1, snum)
        # for the label equal to the last label of the prefix, only the
        # blank-ending probability is used
        if scoring_ids is not None:
            for idx in range(n_bh):
                pos = scoring_idmap[idx, last_ids[idx]]
                if pos >= 0:
                    log_phi[:, idx, pos] = r_prev[:, 1, idx]
        else:
            for idx in range(n_bh):
                log_phi[:, idx, last_ids[idx]] = r_prev[:, 1, idx]

        # decide start and end frames based on attention weights
        if att_w is not None and self.margin > 0:
            f_arg = torch.matmul(att_w, self.frame_ids)
            f_min = max(int(f_arg.min().cpu()), f_min_prev)
            f_max = max(int(f_arg.max().cpu()), f_max_prev)
            start = min(f_max_prev, max(f_min - self.margin, output_length, 1))
            end = min(f_max + self.margin, self.input_length)
        else:
            # no windowing: process all frames from max(output_length, 1)
            f_min = f_max = 0
            start = max(output_length, 1)
            end = self.input_length

        # compute forward probabilities log(r_t^n(h)) and log(r_t^b(h))
        for t in range(start, end):
            rp = r[t - 1]
            # row 0 feeds r^n (previous r^n and log_phi),
            # row 1 feeds r^b (previous r^n and r^b)
            rr = torch.stack([rp[0], log_phi[t - 1], rp[0], rp[1]]).view(
                2, 2, n_bh, snum
            )
            r[t] = torch.logsumexp(rr, 1) + x_[:, t]

        # compute log prefix probabilities log(psi)
        # log_phi shifted by one frame (frame 0 duplicated) plus label scores
        log_phi_x = torch.cat((log_phi[0].unsqueeze(0), log_phi[:-1]), dim=0) + x_[0]
        if scoring_ids is not None:
            # labels that are not scored keep logzero
            log_psi = torch.full(
                (n_bh, self.odim), self.logzero, dtype=self.dtype, device=self.device
            )
            log_psi_ = torch.logsumexp(
                torch.cat((log_phi_x[start:end], r[start - 1, 0].unsqueeze(0)), dim=0),
                dim=0,
            )
            for si in range(n_bh):
                log_psi[si, scoring_ids[si]] = log_psi_[si]
        else:
            log_psi = torch.logsumexp(
                torch.cat((log_phi_x[start:end], r[start - 1, 0].unsqueeze(0)), dim=0),
                dim=0,
            )

        # the eos score is r_sum of the prefix at the utterance's last frame
        for si in range(n_bh):
            log_psi[si, self.eos] = r_sum[self.end_frames[si // n_hyps], si]

        # exclude blank probs
        log_psi[:, self.blank] = self.logzero

        return (log_psi - s_prev), (r, log_psi, f_min, f_max, scoring_idmap)

    def index_select_state(self, state, best_ids):
        """Select CTC states according to best ids

        :param state    : CTC state ``(r, s, f_min, f_max, scoring_idmap)``
            as returned by ``__call__``
        :param best_ids : index numbers selected by beam pruning (B, W)
        :return selected_state ``(r_new, s_new, f_min, f_max)``
        """
        r, s, f_min, f_max, scoring_idmap = state
        # convert ids to BHO space
        n_bh = len(s)
        n_hyps = n_bh // self.batch
        vidx = (best_ids + (self.idx_b * (n_hyps * self.odim)).view(-1, 1)).view(-1)
        # select hypothesis scores
        s_new = torch.index_select(s.view(-1), 0, vidx)
        # replicate each selected score along the label axis
        s_new = s_new.view(-1, 1).repeat(1, self.odim).view(n_bh, self.odim)
        # convert ids to BHS space (S: scoring_num)
        if scoring_idmap is not None:
            snum = self.scoring_num
            hyp_idx = (best_ids // self.odim + (self.idx_b * n_hyps).view(-1, 1)).view(
                -1
            )
            label_ids = torch.fmod(best_ids, self.odim).view(-1)
            score_idx = scoring_idmap[hyp_idx, label_ids]
            # labels that were not scored (-1) fall back to position 0
            score_idx[score_idx == -1] = 0
            vidx = score_idx + hyp_idx * snum
        else:
            snum = self.odim
        # select forward probabilities
        r_new = torch.index_select(r.view(-1, 2, n_bh * snum), 2, vidx).view(
            -1, 2, n_bh
        )
        return r_new, s_new, f_min, f_max

    def extend_prob(self, x):
        """Extend CTC prob.

        Replaces the stored posteriors by ``x`` when ``x`` has more frames
        than the stored ones; the frames already stored are kept as they are.
        Nothing happens otherwise.

        :param torch.Tensor x: input label posterior sequences (B, T, O)
        """

        if self.x.shape[1] < x.shape[1]:  # self.x (2,T,B,O); x (B,T,O)
            # Pad the rest of posteriors in the batch
            # TODO(takaaki-hori): need a better way without for-loops
            # NOTE(review): xlens holds a single entry, so only batch index 0
            # is considered here; the comparison uses the previous
            # input_length — confirm whether this loop can ever pad anything.
            xlens = [x.size(1)]
            for i, l in enumerate(xlens):
                if l < self.input_length:
                    x[i, l:, :] = self.logzero
                    x[i, l:, self.blank] = 0
            tmp_x = self.x
            xn = x.transpose(0, 1)  # (B, T, O) -> (T, B, O)
            xb = xn[:, :, self.blank].unsqueeze(2).expand(-1, -1, self.odim)
            self.x = torch.stack([xn, xb])  # (2, T, B, O)
            # keep the previously stored frames unchanged
            self.x[:, : tmp_x.shape[1], :, :] = tmp_x
            self.input_length = x.size(1)
            self.end_frames = torch.as_tensor(xlens) - 1

    def extend_state(self, state):
        """Compute CTC prefix state.

        Extends the forward variables of ``state`` to the current
        ``input_length``: the new frames of the blank component continue the
        running sum of blank scores, the new frames of the non-blank
        component stay at logzero.

        :param state    : CTC state ``(r_prev, s_prev, f_min_prev, f_max_prev)``
        :return ctc_state
        """

        if state is None:
            # nothing to do
            return state
        else:
            r_prev, s_prev, f_min_prev, f_max_prev = state

            # NOTE(review): the new tensor is (input_length, 2) without a
            # hypothesis axis, unlike the states built in __call__ —
            # presumably intended for a single hypothesis; confirm.
            r_prev_new = torch.full(
                (self.input_length, 2),
                self.logzero,
                dtype=self.dtype,
                device=self.device,
            )
            start = max(r_prev.shape[0], 1)
            r_prev_new[0:start] = r_prev
            for t in six.moves.range(start, self.input_length):
                r_prev_new[t, 1] = r_prev_new[t - 1, 1] + self.x[0, t, :, self.blank]

            return (r_prev_new, s_prev, f_min_prev, f_max_prev)


class CTCPrefixScore(object):
    """Compute CTC label sequence scores

    which is based on Algorithm 2 in WATANABE et al.
    "HYBRID CTC/ATTENTION ARCHITECTURE FOR END-TO-END SPEECH RECOGNITION,"
    but extended to efficiently compute the probabilities of multiple labels
    simultaneously.

    The CTC prefix score is the probability of all label sequences that start
    with the given prefix, i.e. the probability of the prefix accumulated
    over all time steps at which it may end.

    :param x: frame-wise log posteriors indexed as ``x[t, label]``
    :param int blank: blank label id
    :param int eos: end-of-sequence label id
    :param xp: array module providing ``full``, ``ndarray``, ``logaddexp``,
        ``where`` and ``rollaxis`` (e.g. numpy)
    """

    def __init__(self, x, blank, eos, xp):
        self.xp = xp
        self.logzero = -10000000000.0
        self.blank = blank
        self.eos = eos
        self.input_length = len(x)
        self.x = x

    def initial_state(self):
        """Obtain an initial CTC state

        :return: CTC state
        """
        # initial CTC state is made of a frame x 2 tensor that corresponds to
        # r_t^n(<sos>) and r_t^b(<sos>), where 0 and 1 of axis=1 represent
        # superscripts n and b (non-blank and blank), respectively.
        r = self.xp.full((self.input_length, 2), self.logzero, dtype=np.float32)
        r[0, 1] = self.x[0, self.blank]
        for i in range(1, self.input_length):
            r[i, 1] = r[i - 1, 1] + self.x[i, self.blank]
        return r

    def __call__(self, y, cs, r_prev):
        """Compute CTC prefix scores for next labels

        :param y     : prefix label sequence
        :param cs    : array of next labels
        :param r_prev: previous CTC state
        :return ctc_scores, ctc_states
        """
        xp = self.xp
        # initialize CTC states
        output_length = len(y) - 1  # ignore sos
        # new CTC states are prepared as a frame x (n or b) x n_labels tensor
        # that corresponds to r_t^n(h) and r_t^b(h).
        # Every entry starts at logzero (as in CTCPrefixScoreTH) so that the
        # frames skipped by the loop below never hold uninitialized memory.
        r = xp.full((self.input_length, 2, len(cs)), self.logzero, dtype=np.float32)
        xs = self.x[:, cs]
        if output_length == 0:
            r[0, 0] = xs[0]

        # prepare forward probabilities for the last label
        r_sum = xp.logaddexp(r_prev[:, 0], r_prev[:, 1])  # log(r_t^n(g) + r_t^b(g))
        last = y[-1]
        if output_length > 0 and last in cs:
            # a label repeating the last prefix label may only follow a blank
            log_phi = xp.ndarray((self.input_length, len(cs)), dtype=np.float32)
            for i in range(len(cs)):
                log_phi[:, i] = r_sum if cs[i] != last else r_prev[:, 1]
        else:
            log_phi = r_sum

        # compute forward probabilities log(r_t^n(h)), log(r_t^b(h)),
        # and log prefix probabilities log(psi)
        start = max(output_length, 1)
        # copy: log_psi is modified below and must not alias the state r
        log_psi = r[start - 1, 0].copy()
        for t in range(start, self.input_length):
            r[t, 0] = xp.logaddexp(r[t - 1, 0], log_phi[t - 1]) + xs[t]
            r[t, 1] = xp.logaddexp(r[t - 1, 0], r[t - 1, 1]) + self.x[t, self.blank]
            log_psi = xp.logaddexp(log_psi, log_phi[t - 1] + xs[t])

        # get P(...eos|X) that ends with the prefix itself
        eos_pos = xp.where(cs == self.eos)[0]
        if len(eos_pos) > 0:
            log_psi[eos_pos] = r_sum[-1]  # log(r_T^n(g) + r_T^b(g))

        # exclude blank probs
        blank_pos = xp.where(cs == self.blank)[0]
        if len(blank_pos) > 0:
            log_psi[blank_pos] = self.logzero

        # return the log prefix probability and CTC states, where the label axis
        # of the CTC states is moved to the first axis to slice it easily
        return log_psi, xp.rollaxis(r, 2)


================================================
FILE: nets/e2e_asr_common.py
================================================
#!/usr/bin/env python3
# encoding: utf-8

# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Common functions for ASR."""

import json
import logging
import sys

import editdistance
from itertools import groupby
import numpy as np
import six


def end_detect(ended_hyps, i, M=3, D_end=np.log(1 * np.exp(-10))):
    """End detection.

    described in Eq. (50) of S. Watanabe et al
    "Hybrid CTC/Attention Architecture for End-to-End Speech Recognition"

    Decoding is considered finished when, for each of the ``M`` lengths
    ``i, i - 1, ..., i - M + 1``, ended hypotheses of that length exist and
    the best of them scores more than ``-D_end`` below the overall best
    ended hypothesis.

    :param list ended_hyps: ended hypotheses, dicts with "score" and "yseq"
    :param int i: current hypothesis length
    :param int M: number of most recent lengths to inspect
    :param float D_end: (negative) threshold on the score difference
    :return: True if decoding can be stopped
    :rtype: bool
    """
    if len(ended_hyps) == 0:
        return False

    def score_of(hyp):
        return hyp["score"]

    # only the best score matters, so a full sort is not needed
    best_score = max(ended_hyps, key=score_of)["score"]
    count = 0
    for m in range(M):
        # get ended_hyps whose length is i - m
        hyp_length = i - m
        hyps_same_length = [x for x in ended_hyps if len(x["yseq"]) == hyp_length]
        if hyps_same_length:
            best_same_length = max(hyps_same_length, key=score_of)["score"]
            if best_same_length - best_score < D_end:
                count += 1

    return count == M


# TODO(takaaki-hori): add different smoothing methods
def label_smoothing_dist(odim, lsm_type, transcript=None, blank=0):
    """Obtain label distribution for loss smoothing.

    For ``lsm_type == "unigram"`` the distribution is the relative frequency
    of every token id in the transcript. The last id (``odim - 1``) is used
    for <eos> and counted once per utterance, ids that never occur are
    floored to a count of 1, and the blank id gets probability 0.

    :param int odim: number of output labels
    :param str lsm_type: smoothing type; only "unigram" is supported, any
        other value logs an error and exits
    :param str transcript: path of a json file whose "utts" entries provide
        the token ids in ``["output"][0]["tokenid"]``
    :param int blank: blank label id
    :return: label distribution of length ``odim``
    :rtype: numpy.ndarray
    """
    if transcript is not None:
        with open(transcript, "rb") as f:
            trans_json = json.load(f)["utts"]

    if lsm_type == "unigram":
        assert transcript is not None, (
            "transcript is required for %s label smoothing" % lsm_type
        )
        labelcount = np.zeros(odim)
        for utt in trans_json.values():
            ids = [int(n) for n in utt["output"][0]["tokenid"].split()]
            # to avoid an error when there is no text in an utterance
            if ids:
                # np.add.at accumulates repeated ids; ``labelcount[ids] += 1``
                # would count a token at most once per utterance
                np.add.at(labelcount, ids, 1)
        # one <eos> per utterance (not the length of the file name)
        labelcount[odim - 1] = len(trans_json)
        labelcount[labelcount == 0] = 1  # flooring
        labelcount[blank] = 0  # remove counts for blank
        labeldist = labelcount.astype(np.float32) / np.sum(labelcount)
    else:
        logging.error("Error: unexpected label smoothing type: %s" % lsm_type)
        sys.exit()

    return labeldist


def get_vgg2l_odim(idim, in_channel=3, out_channel=128):
    """Return the output size of the VGG frontend.

    The per-channel input size is halved (rounding up) once for each of the
    two max-pooling layers and then multiplied by the output channels.

    :param idim: input feature size
    :param in_channel: input channel size
    :param out_channel: output channel size
    :return: output size
    :rtype int
    """
    width = idim / in_channel
    for _ in range(2):  # one halving per max-pooling layer
        width = np.ceil(np.array(width, dtype=np.float32) / 2)
    return int(width) * out_channel  # multiply by the number of channels


class ErrorCalculator(object):
    """Calculate CER and WER for E2E_ASR and CTC models during training.

    :param list char_list: token list; a token id is an index into it
    :param str sym_space: space symbol
    :param str sym_blank: blank symbol (must be contained in ``char_list``)
    :param bool report_cer: compute CER in ``__call__``
    :param bool report_wer: compute WER in ``__call__``
    """

    def __init__(
        self, char_list, sym_space, sym_blank, report_cer=False, report_wer=False
    ):
        """Construct an ErrorCalculator object."""
        super(ErrorCalculator, self).__init__()

        self.report_cer = report_cer
        self.report_wer = report_wer

        self.char_list = char_list
        self.space = sym_space
        self.blank = sym_blank
        self.idx_blank = self.char_list.index(self.blank)
        if self.space in self.char_list:
            self.idx_space = self.char_list.index(self.space)
        else:
            self.idx_space = None

    def __call__(self, ys_hat, ys_pad, is_ctc=False):
        """Calculate sentence-level WER/CER score.

        :param torch.Tensor ys_hat: prediction (batch, seqlen)
        :param torch.Tensor ys_pad: reference (batch, seqlen)
        :param bool is_ctc: calculate CER score for CTC
        :return: the CTC CER alone if ``is_ctc``; otherwise the pair
            ``(cer, wer)`` where a score that is not requested is None
        """
        if is_ctc:
            return self.calculate_cer_ctc(ys_hat, ys_pad)
        if not self.report_cer and not self.report_wer:
            return None, None

        seqs_hat, seqs_true = self.convert_to_char(ys_hat, ys_pad)
        cer = self.calculate_cer(seqs_hat, seqs_true) if self.report_cer else None
        wer = self.calculate_wer(seqs_hat, seqs_true) if self.report_wer else None
        return cer, wer

    def _ctc_chars(self, ids):
        """Map ids to one string, dropping padding (-1), blank and space."""
        skipped = (-1, self.idx_blank, self.idx_space)
        return "".join(self.char_list[i] for i in map(int, ids) if i not in skipped)

    def calculate_cer_ctc(self, ys_hat, ys_pad):
        """Calculate sentence-level CER score for CTC.

        :param torch.Tensor ys_hat: prediction (batch, seqlen)
        :param torch.Tensor ys_pad: reference (batch, seqlen)
        :return: average sentence-level CER score, or None if every
            reference is empty
        :rtype float
        """
        cers, char_ref_lens = [], []
        for i, y in enumerate(ys_hat):
            # collapse consecutive repetitions before removing blanks
            hyp_chars = self._ctc_chars(x[0] for x in groupby(y))
            ref_chars = self._ctc_chars(ys_pad[i])
            if len(ref_chars) > 0:
                cers.append(editdistance.eval(hyp_chars, ref_chars))
                char_ref_lens.append(len(ref_chars))

        cer_ctc = float(sum(cers)) / sum(char_ref_lens) if cers else None
        return cer_ctc

    def convert_to_char(self, ys_hat, ys_pad):
        """Convert index to character.

        :param torch.Tensor seqs_hat: prediction (batch, seqlen)
        :param torch.Tensor seqs_true: reference (batch, seqlen)
        :return: token list of prediction
        :rtype list
        :return: token list of reference
        :rtype list
        """
        seqs_hat, seqs_true = [], []
        for i, y_hat in enumerate(ys_hat):
            y_true = ys_pad[i]
            eos_true = np.where(y_true == -1)[0]
            ymax = eos_true[0] if len(eos_true) > 0 else len(y_true)
            # NOTE: padding index (-1) in y_true is used to pad y_hat
            seq_hat = [self.char_list[int(idx)] for idx in y_hat[:ymax]]
            seq_true = [self.char_list[int(idx)] for idx in y_true if int(idx) != -1]
            seq_hat_text = "".join(seq_hat).replace(self.space, " ")
            seq_hat_text = seq_hat_text.replace(self.blank, "")
            seq_true_text = "".join(seq_true).replace(self.space, " ")
            seqs_hat.append(seq_hat_text)
            seqs_true.append(seq_true_text)
        return seqs_hat, seqs_true

    def calculate_cer(self, seqs_hat, seqs_true):
        """Calculate sentence-level CER score.

        :param list seqs_hat: prediction
        :param list seqs_true: reference
        :return: average sentence-level CER score, or None if the references
            contain no character (as ``calculate_cer_ctc`` does)
        :rtype float
        """
        char_eds, char_ref_lens = [], []
        for i, seq_hat_text in enumerate(seqs_hat):
            seq_true_text = seqs_true[i]
            hyp_chars = seq_hat_text.replace(" ", "")
            ref_chars = seq_true_text.replace(" ", "")
            char_eds.append(editdistance.eval(hyp_chars, ref_chars))
            char_ref_lens.append(len(ref_chars))
        total_ref_len = sum(char_ref_lens)
        if total_ref_len == 0:
            # nothing to normalize by; avoid a ZeroDivisionError
            return None
        return float(sum(char_eds)) / total_ref_len

    def calculate_wer(self, seqs_hat, seqs_true):
        """Calculate sentence-level WER score.

        :param list seqs_hat: prediction
        :param list seqs_true: reference
        :return: average sentence-level WER score, or None if the references
            contain no word (as ``calculate_cer_ctc`` does)
        :rtype float
        """
        word_eds, word_ref_lens = [], []
        for i, seq_hat_text in enumerate(seqs_hat):
            seq_true_text = seqs_true[i]
            hyp_words = seq_hat_text.split()
            ref_words = seq_true_text.split()
            word_eds.append(editdistance.eval(hyp_words, ref_words))
            word_ref_lens.append(len(ref_words))
        total_ref_len = sum(word_ref_lens)
        if total_ref_len == 0:
            # nothing to normalize by; avoid a ZeroDivisionError
            return None
        return float(sum(word_eds)) / total_ref_len


================================================
FILE: nets/e2e_mt_common.py
================================================
#!/usr/bin/env python3
# encoding: utf-8

# Copyright 2019 Kyoto University (Hirofumi Inaguma)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Common functions for ST and MT."""

import nltk
import numpy as np


class ErrorCalculator(object):
    """Compute BLEU for ST and MT models during training.

    :param list char_list: vocabulary list (token id -> token string)
    :param str sym_space: space symbol
    :param str sym_pad: pad symbol
    :param bool report_bleu: report BLEU score if True
    """

    def __init__(self, char_list, sym_space, sym_pad, report_bleu=False):
        """Store the vocabulary, the special symbols and the report switch."""
        super(ErrorCalculator, self).__init__()
        self.report_bleu = report_bleu
        self.char_list = char_list
        self.space = sym_space
        self.pad = sym_pad
        # position of the space symbol in the vocabulary, None when absent
        self.idx_space = (
            self.char_list.index(self.space) if self.space in self.char_list else None
        )

    def __call__(self, ys_hat, ys_pad):
        """Return the mini-batch BLEU score, or None when reporting is off.

        :param torch.Tensor ys_hat: prediction (batch, seqlen)
        :param torch.Tensor ys_pad: reference (batch, seqlen)
        :return: corpus-level BLEU score in a mini-batch
        :rtype float
        """
        if not self.report_bleu:
            return None
        return self.calculate_corpus_bleu(ys_hat, ys_pad)

    def calculate_corpus_bleu(self, ys_hat, ys_pad):
        """Convert id sequences to text and score them with corpus BLEU.

        :param torch.Tensor ys_hat: prediction (batch, seqlen)
        :param torch.Tensor ys_pad: reference (batch, seqlen), padded with -1
        :return: corpus-level BLEU score scaled to the range 0-100
        :rtype float
        """
        hyps, refs = [], []
        for i, y_hat in enumerate(ys_hat):
            y_true = ys_pad[i]
            pad_positions = np.where(y_true == -1)[0]
            # NOTE: the prediction carries no -1 padding of its own, so it is
            # cut at the first padding position found in the reference
            if len(pad_positions) > 0:
                ymax = pad_positions[0]
            else:
                ymax = len(y_true)
            hyp_tokens = [self.char_list[int(idx)] for idx in y_hat[:ymax]]
            ref_tokens = [self.char_list[int(idx)] for idx in y_true if int(idx) != -1]
            hyp_text = "".join(hyp_tokens).replace(self.space, " ")
            hyp_text = hyp_text.replace(self.pad, "")
            ref_text = "".join(ref_tokens).replace(self.space, " ")
            hyps.append(hyp_text)
            refs.append(ref_text)
        # NOTE(review): hypotheses and references are handed over as plain
        # strings rather than token lists - confirm the intended granularity
        score = nltk.bleu_score.corpus_bleu([[ref] for ref in refs], hyps)
        return score * 100


================================================
FILE: nets/lm_interface.py
================================================
"""Language model interface."""

import argparse

from espnet.nets.scorer_interface import ScorerInterface
from espnet.utils.dynamic_import import dynamic_import
from espnet.utils.fill_missing_args import fill_missing_args


class LMInterface(ScorerInterface):
    """LM Interface for ESPnet model implementation."""

    @staticmethod
    def add_arguments(parser):
        """Add model-specific arguments to the parser (none by default)."""
        return parser

    @classmethod
    def build(cls, n_vocab: int, **kwargs):
        """Create an instance from python-level keyword arguments.

        Options missing from ``kwargs`` are filled in first from the
        ``lm_train`` parser and then from ``cls.add_arguments``.

        Args:
            n_vocab (int): The number of vocabulary.
            **kwargs: Option values overriding the parser defaults.

        Returns:
            LMInterface: A new instance of ``cls``.

        """
        # local import to avoid cyclic import in lm_train
        from espnet.bin.lm_train import get_parser

        def optional_lm_parser(parser):
            return get_parser(parser, required=False)

        namespace = argparse.Namespace(**kwargs)
        for add_args in (optional_lm_parser, cls.add_arguments):
            namespace = fill_missing_args(namespace, add_args)
        return cls(n_vocab, namespace)

    def forward(self, x, t):
        """Compute LM loss value from buffer sequences.

        Args:
            x (torch.Tensor): Input ids. (batch, len)
            t (torch.Tensor): Target ids. (batch, len)

        Returns:
            tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Tuple of
                loss to backward (scalar),
                negative log-likelihood of t: -log p(t) (scalar) and
                the number of elements in x (scalar)

        Notes:
            The last two return values are used
            in perplexity: p(t)^{-1/n} = exp(-log p(t) / n)

        """
        raise NotImplementedError("forward method is not implemented")


# Alias table: backend name -> {alias -> "module.path:ClassName"}.
# It is handed to `dynamic_import` by `dynamic_import_lm`.
predefined_lms = {
    "pytorch": {
        "default": "espnet.nets.pytorch_backend.lm.default:DefaultRNNLM",
        "seq_rnn": "espnet.nets.pytorch_backend.lm.seq_rnn:SequentialRNNLM",
        "transformer": "espnet.nets.pytorch_backend.lm.transformer:TransformerLM",
    },
    "chainer": {
        "default": "espnet.lm.chainer_backend.lm:DefaultRNNLM",
    },
}


def dynamic_import_lm(module, backend):
    """Resolve and import a language model class.

    Args:
        module (str): module_name:class_name or alias in `predefined_lms`
        backend (str): NN backend. e.g., pytorch, chainer

    Returns:
        type: LM class (a subclass of `LMInterface`)

    """
    # unknown backends simply get an empty alias table
    aliases = predefined_lms.get(backend, {})
    lm_class = dynamic_import(module, aliases)
    is_lm = issubclass(lm_class, LMInterface)
    assert is_lm, f"{module} does not implement LMInterface"
    return lm_class


================================================
FILE: nets/mt_interface.py
================================================
"""MT Interface module."""
import argparse

from espnet.bin.asr_train import get_parser
from espnet.utils.fill_missing_args import fill_missing_args


class MTInterface:
    """MT Interface for ESPnet model implementation."""

    @staticmethod
    def add_arguments(parser):
        """Add model-specific arguments to the parser (none by default)."""
        return parser

    @classmethod
    def build(cls, idim: int, odim: int, **kwargs):
        """Create an instance from python-level keyword arguments.

        Options missing from ``kwargs`` are filled in first from the
        imported ``get_parser`` and then from ``cls.add_arguments``.

        Args:
            idim (int): The number of an input feature dim.
            odim (int): The number of output vocab.
            **kwargs: Option values overriding the parser defaults.

        Returns:
            MTInterface: A new instance of ``cls``.

        """

        def optional_parser(parser):
            return get_parser(parser, required=False)

        namespace = argparse.Namespace(**kwargs)
        for add_args in (optional_parser, cls.add_arguments):
            namespace = fill_missing_args(namespace, add_args)
        return cls(idim, odim, namespace)

    def forward(self, xs, ilens, ys):
        """Compute loss for training.

        :param xs:
            For pytorch, batch of padded source sequences torch.Tensor (B, Tmax, idim)
            For chainer, list of source sequences chainer.Variable
        :param ilens: batch of lengths of source sequences (B)
            For pytorch, torch.Tensor
            For chainer, list of int
        :param ys: presumably the target side (the original text said "source")
            For pytorch, batch of padded sequences torch.Tensor (B, Lmax)
            For chainer, list of sequences chainer.Variable
        :return: loss value
        :rtype: torch.Tensor for pytorch, chainer.Variable for chainer
        """
        raise NotImplementedError("forward method is not implemented")

    def translate(self, x, trans_args, char_list=None, rnnlm=None):
        """Translate x for evaluation.

        :param ndarray x: input feature (B, T, D) or (T, D)
        :param namespace trans_args: argument namespace containing options
        :param list char_list: list of characters
        :param torch.nn.Module rnnlm: language model module
        :return: N-best decoding results
        :rtype: list
        """
        raise NotImplementedError("translate method is not implemented")

    def translate_batch(self, x, trans_args, char_list=None, rnnlm=None):
        """Beam search implementation for batch.

        :param torch.Tensor x: encoder hidden state sequences (B, Tmax, Henc)
        :param namespace trans_args: argument namespace containing options
        :param list char_list: list of characters
        :param torch.nn.Module rnnlm: language model module
        :return: N-best decoding results
        :rtype: list
        """
        raise NotImplementedError("Batch decoding is not supported yet.")

    def calculate_all_attentions(self, xs, ilens, ys):
        """Calculate attention.

        :param list xs: list of padded input sequences [(T1, idim), (T2, idim), ...]
        :param ndarray ilens: batch of lengths of input sequences (B)
        :param list ys: list of character id sequence tensor [(L1), (L2), (L3), ...]
        :return: attention weights (B, Lmax, Tmax)
        :rtype: float ndarray
        """
        raise NotImplementedError("calculate_all_attentions method is not implemented")

    @property
    def attention_plot_class(self):
        """Return the class used to plot attention weights."""
        # imported lazily, only when the property is read
        from espnet.asr.asr_utils import PlotAttentionReport

        return PlotAttentionReport


================================================
FILE: nets/pytorch_backend/__init__.py
================================================
"""Initialize sub package."""


================================================
FILE: nets/pytorch_backend/conformer/__init__.py
================================================
"""Initialize sub package."""


================================================
FILE: nets/pytorch_backend/conformer/argument.py
================================================
# Copyright 2020 Hirofumi Inaguma
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Conformer common arguments."""


from distutils.util import strtobool
import logging


def add_arguments_conformer_common(group):
    """Add Conformer common arguments.

    Args:
        group: An argparse parser or argument group; only its
            ``add_argument`` method is used.

    Returns:
        The same ``group`` object, with the Conformer options registered.

    """
    group.add_argument(
        "--transformer-encoder-pos-enc-layer-type",
        type=str,
        default="abs_pos",
        choices=["abs_pos", "scaled_abs_pos", "rel_pos"],
        help="Transformer encoder positional encoding layer type",
    )
    group.add_argument(
        "--transformer-encoder-activation-type",
        type=str,
        default="swish",
        choices=["relu", "hardtanh", "selu", "swish"],
        help="Transformer encoder activation function type",
    )
    group.add_argument(
        "--macaron-style",
        default=False,
        type=strtobool,
        help="Whether to use macaron style for positionwise layer",
    )
    # Attention
    group.add_argument(
        "--zero-triu",
        default=False,
        type=strtobool,
        help="If true, zero the uppper triangular part of attention matrix.",
    )
    # Relative positional encoding
    # NOTE: adjacent string literals are joined verbatim, so each sentence
    # except the last must end with a space to keep the help text readable.
    group.add_argument(
        "--rel-pos-type",
        type=str,
        default="legacy",
        choices=["legacy", "latest"],
        help="Whether to use the latest relative positional encoding or the legacy one. "
        "The legacy relative positional encoding will be deprecated in the future. "
        "More Details can be found in https://github.com/espnet/espnet/pull/2816.",
    )
    # CNN module
    group.add_argument(
        "--use-cnn-module",
        default=False,
        type=strtobool,
        help="Use convolution module or not",
    )
    group.add_argument(
        "--cnn-module-kernel",
        default=31,
        type=int,
        help="Kernel size of convolution module.",
    )
    return group


def verify_rel_pos_type(args):
    """Map relative positional encoding options to their legacy variants.

    When ``rel_pos_type`` is missing or set to "legacy", the "rel_pos"
    positional encoding and the "rel_selfattn" attention type are renamed
    in place to "legacy_rel_pos" and "legacy_rel_selfattn", each with a
    warning. Any other ``rel_pos_type`` leaves the arguments untouched.

    Args:
        args (Namespace): original arguments
    Returns:
        args (Namespace): the same namespace object, possibly modified
    """
    rel_pos_type = getattr(args, "rel_pos_type", None)
    if rel_pos_type is not None and rel_pos_type != "legacy":
        # a non-legacy type was requested explicitly: nothing to rewrite
        return args

    if args.transformer_encoder_pos_enc_layer_type == "rel_pos":
        args.transformer_encoder_pos_enc_layer_type = "legacy_rel_pos"
        logging.warning("Using legacy_rel_pos and it will be deprecated in the future.")

    if args.transformer_encoder_selfattn_layer_type == "rel_selfattn":
        args.transformer_encoder_selfattn_layer_type = "legacy_rel_selfattn"
        logging.warning(
            "Using legacy_rel_selfattn and it will be deprecated in the future."
        )

    return args


================================================
FILE: nets/pytorch_backend/conformer/convolution.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright 2020 Johns Hopkins University (Shinji Watanabe)
#                Northwestern Polytechnical University (Pengcheng Guo)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""ConvolutionModule definition."""

from torch import nn


class ConvolutionModule(nn.Module):
    """Convolution module of the Conformer block.

    The module chains a pointwise convolution with GLU gating, a depthwise
    convolution, group normalization, an activation and a second pointwise
    convolution.

    Args:
        channels (int): The number of channels of conv layers.
        kernel_size (int): Kernel size of the depthwise conv layer (odd).
        activation (torch.nn.Module): Activation applied after normalization.
        bias (bool): Whether the conv layers use a bias term.

    """

    def __init__(self, channels, kernel_size, activation=nn.ReLU(), bias=True):
        """Construct a ConvolutionModule object."""
        super().__init__()
        # only an odd kernel size gives symmetric 'SAME' padding
        assert kernel_size % 2 == 1
        same_padding = (kernel_size - 1) // 2

        # doubles the channels so that GLU can halve them again
        self.pointwise_conv1 = nn.Conv1d(
            in_channels=channels,
            out_channels=2 * channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=bias,
        )
        # groups == channels: one filter per channel
        self.depthwise_conv = nn.Conv1d(
            in_channels=channels,
            out_channels=channels,
            kernel_size=kernel_size,
            stride=1,
            padding=same_padding,
            groups=channels,
            bias=bias,
        )
        # GroupNorm replaces the former BatchNorm1d: batch norm would be
        # harmful under DDP because it cannot be updated globally
        self.norm = nn.GroupNorm(2, channels)
        self.pointwise_conv2 = nn.Conv1d(
            in_channels=channels,
            out_channels=channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=bias,
        )
        self.activation = activation

    def forward(self, x):
        """Apply the convolution module.

        Args:
            x (torch.Tensor): Input tensor (#batch, time, channels).

        Returns:
            torch.Tensor: Output tensor (#batch, time, channels).

        """
        # Conv1d expects channels first: (batch, channels, time)
        feats = x.transpose(1, 2)

        # pointwise conv to 2*channels, then GLU gates back to channels
        gated = nn.functional.glu(self.pointwise_conv1(feats), dim=1)

        # depthwise conv along time, then normalization and activation
        normed = self.norm(self.depthwise_conv(gated))
        projected = self.pointwise_conv2(self.activation(normed))

        # restore the (batch, time, channels) layout
        return projected.transpose(1, 2)


================================================
FILE: nets/pytorch_backend/conformer/encoder.py
================================================
# Copyright 2020 Johns Hopkins University (Shinji Watanabe)
#                Northwestern Polytechnical University (Pengcheng Guo)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Encoder definition."""

import logging
import torch

from espnet.nets.pytorch_backend.conformer.convolution import ConvolutionModule
from espnet.nets.pytorch_backend.conformer.encoder_layer import EncoderLayer
from espnet.nets.pytorch_backend.nets_utils import get_activation
from espnet.nets.pytorch_backend.transducer.vgg2l import VGG2L
from espnet.nets.pytorch_backend.transformer.attention import (
    MultiHeadedAttention,  # noqa: H301
    RelPositionMultiHeadedAttention,  # noqa: H301
    LegacyRelPositionMultiHeadedAttention,  # noqa: H301
)
from espnet.nets.pytorch_backend.transformer.embedding import (
    PositionalEncoding,  # noqa: H301
    ScaledPositionalEncoding,  # noqa: H301
    RelPositionalEncoding,  # noqa: H301
    LegacyRelPositionalEncoding,  # noqa: H301
)
from espnet.nets.pytorch_backend.transformer.layer_norm import LayerNorm
from espnet.nets.pytorch_backend.transformer.multi_layer_conv import Conv1dLinear
from espnet.nets.pytorch_backend.transformer.multi_layer_conv import MultiLayeredConv1d
from espnet.nets.pytorch_backend.transformer.positionwise_feed_forward import (
    PositionwiseFeedForward,  # noqa: H301
)
from espnet.nets.pytorch_backend.transformer.repeat import repeat
from espnet.nets.pytorch_backend.transformer.subsampling import Conv2dSubsampling


class Encoder(torch.nn.Module):
    """Conformer encoder module.

    The encoder consists of an input (embedding / subsampling) layer followed
    by ``num_blocks`` `EncoderLayer` blocks and, when ``normalize_before`` is
    True, a final layer normalization.

    Args:
        idim (int): Input dimension.
        attention_dim (int): Dimension of attention.
        attention_heads (int): The number of heads of multi head attention.
        linear_units (int): The number of units of position-wise feed forward.
        num_blocks (int): The number of encoder blocks.
        dropout_rate (float): Dropout rate.
        positional_dropout_rate (float): Dropout rate after adding positional encoding.
        attention_dropout_rate (float): Dropout rate in attention.
        input_layer (Union[str, torch.nn.Module]): Input layer type. One of
            "linear", "conv2d", "vgg2l", "embed", a module instance or None.
        normalize_before (bool): Passed to every encoder block; when True an
            additional layer norm is applied to the final output.
        concat_after (bool): Whether to concat attention layer's input and output.
            if True, additional linear will be applied.
            i.e. x -> x + linear(concat(x, att(x)))
            if False, no additional linear will be applied. i.e. x -> x + att(x)
        positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
        positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer.
        macaron_style (bool): Whether to use macaron style for positionwise layer.
        pos_enc_layer_type (str): Encoder positional encoding layer type. One of
            "abs_pos", "scaled_abs_pos", "rel_pos" or "legacy_rel_pos".
        selfattention_layer_type (str): Encoder attention layer type. One of
            "selfattn", "rel_selfattn" or "legacy_rel_selfattn".
        activation_type (str): Encoder activation function type.
        use_cnn_module (bool): Whether to use convolution module.
        zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
        cnn_module_kernel (int): Kernel size of convolution module.
        padding_idx (int): Padding idx for input_layer=embed.

    Raises:
        ValueError: If ``pos_enc_layer_type``, ``input_layer`` or
            ``selfattention_layer_type`` is not recognized.
        NotImplementedError: If ``positionwise_layer_type`` is not recognized.

    """

    def __init__(
        self,
        idim,
        attention_dim=256,
        attention_heads=4,
        linear_units=2048,
        num_blocks=6,
        dropout_rate=0.1,
        positional_dropout_rate=0.1,
        attention_dropout_rate=0.0,
        input_layer="conv2d",
        normalize_before=True,
        concat_after=False,
        positionwise_layer_type="linear",
        positionwise_conv_kernel_size=1,
        macaron_style=False,
        pos_enc_layer_type="abs_pos",
        selfattention_layer_type="selfattn",
        activation_type="swish",
        use_cnn_module=False,
        zero_triu=False,
        cnn_module_kernel=31,
        padding_idx=-1,
    ):
        """Construct an Encoder object."""
        super(Encoder, self).__init__()

        activation = get_activation(activation_type)
        # positional encoding class; the relative variants must be paired
        # with the matching relative self-attention type
        if pos_enc_layer_type == "abs_pos":
            pos_enc_class = PositionalEncoding
        elif pos_enc_layer_type == "scaled_abs_pos":
            pos_enc_class = ScaledPositionalEncoding
        elif pos_enc_layer_type == "rel_pos":
            assert selfattention_layer_type == "rel_selfattn"
            pos_enc_class = RelPositionalEncoding
        elif pos_enc_layer_type == "legacy_rel_pos":
            pos_enc_class = LegacyRelPositionalEncoding
            assert selfattention_layer_type == "legacy_rel_selfattn"
        else:
            raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type)

        # input layer definition; the recorded subsampling factor stays 1
        # unless a subsampling front-end ("conv2d" or "vgg2l") is selected
        self.conv_subsampling_factor = 1
        if input_layer == "linear":
            self.embed = torch.nn.Sequential(
                torch.nn.Linear(idim, attention_dim),
                torch.nn.LayerNorm(attention_dim),
                torch.nn.Dropout(dropout_rate),
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        elif input_layer == "conv2d":
            self.embed = Conv2dSubsampling(
                idim,
                attention_dim,
                dropout_rate,
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
            self.conv_subsampling_factor = 4
        elif input_layer == "vgg2l":
            # NOTE(review): no positional encoding is passed to VGG2L here -
            # confirm this is intended
            self.embed = VGG2L(idim, attention_dim)
            self.conv_subsampling_factor = 4
        elif input_layer == "embed":
            self.embed = torch.nn.Sequential(
                torch.nn.Embedding(idim, attention_dim, padding_idx=padding_idx),
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        elif isinstance(input_layer, torch.nn.Module):
            # user-supplied front-end, followed by the positional encoding
            self.embed = torch.nn.Sequential(
                input_layer,
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        elif input_layer is None:
            # positional encoding only
            self.embed = torch.nn.Sequential(
                pos_enc_class(attention_dim, positional_dropout_rate)
            )
        else:
            # NOTE(review): the concatenation raises TypeError instead of
            # ValueError when input_layer is not a str
            raise ValueError("unknown input_layer: " + input_layer)
        self.normalize_before = normalize_before

        # self-attention module definition
        if selfattention_layer_type == "selfattn":
            logging.info("encoder self-attention layer type = self-attention")
            encoder_selfattn_layer = MultiHeadedAttention
            encoder_selfattn_layer_args = (
                attention_heads,
                attention_dim,
                attention_dropout_rate,
            )
        elif selfattention_layer_type == "legacy_rel_selfattn":
            assert pos_enc_layer_type == "legacy_rel_pos"
            encoder_selfattn_layer = LegacyRelPositionMultiHeadedAttention
            encoder_selfattn_layer_args = (
                attention_heads,
                attention_dim,
                attention_dropout_rate,
            )
        elif selfattention_layer_type == "rel_selfattn":
            logging.info("encoder self-attention layer type = relative self-attention")
            assert pos_enc_layer_type == "rel_pos"
            encoder_selfattn_layer = RelPositionMultiHeadedAttention
            # only this attention variant receives zero_triu
            encoder_selfattn_layer_args = (
                attention_heads,
                attention_dim,
                attention_dropout_rate,
                zero_triu,
            )
        else:
            raise ValueError("unknown encoder_attn_layer: " + selfattention_layer_type)

        # feed-forward module definition
        if positionwise_layer_type == "linear":
            positionwise_layer = PositionwiseFeedForward
            positionwise_layer_args = (
                attention_dim,
                linear_units,
                dropout_rate,
                activation,
            )
        elif positionwise_layer_type == "conv1d":
            # the conv variants are built without the activation argument
            positionwise_layer = MultiLayeredConv1d
            positionwise_layer_args = (
                attention_dim,
                linear_units,
                positionwise_conv_kernel_size,
                dropout_rate,
            )
        elif positionwise_layer_type == "conv1d-linear":
            positionwise_layer = Conv1dLinear
            positionwise_layer_args = (
                attention_dim,
                linear_units,
                positionwise_conv_kernel_size,
                dropout_rate,
            )
        else:
            raise NotImplementedError("Support only linear or conv1d.")

        # convolution module definition
        convolution_layer = ConvolutionModule
        convolution_layer_args = (attention_dim, cnn_module_kernel, activation)

        # the lambda is called once per block, so each block gets its own
        # attention / feed-forward / convolution module instances
        self.encoders = repeat(
            num_blocks,
            lambda lnum: EncoderLayer(
                attention_dim,
                encoder_selfattn_layer(*encoder_selfattn_layer_args),
                positionwise_layer(*positionwise_layer_args),
                positionwise_layer(*positionwise_layer_args) if macaron_style else None,
                convolution_layer(*convolution_layer_args) if use_cnn_module else None,
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
        if self.normalize_before:
            self.after_norm = LayerNorm(attention_dim)

    def forward(self, xs, masks):
        """Encode input sequence.

        Args:
            xs (torch.Tensor): Input tensor (#batch, time, idim).
            masks (torch.Tensor): Mask tensor (#batch, time).
                NOTE(review): EncoderLayer slices the mask with three
                indices, so it is presumably (#batch, 1, time) - confirm.

        Returns:
            torch.Tensor: Output tensor (#batch, time, attention_dim).
            torch.Tensor: Mask tensor (#batch, time).

        """
        # the subsampling front-ends also take and return the mask
        if isinstance(self.embed, (Conv2dSubsampling, VGG2L)):
            xs, masks = self.embed(xs, masks)
        else:
            xs = self.embed(xs)

        xs, masks = self.encoders(xs, masks)
        # the blocks return (x, pos_emb) when a positional embedding is
        # carried along; keep only the features
        if isinstance(xs, tuple):
            xs = xs[0]

        if self.normalize_before:
            xs = self.after_norm(xs)
        return xs, masks


================================================
FILE: nets/pytorch_backend/conformer/encoder_layer.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright 2020 Johns Hopkins University (Shinji Watanabe)
#                Northwestern Polytechnical University (Pengcheng Guo)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Encoder self-attention layer definition."""

import torch

from torch import nn

from espnet.nets.pytorch_backend.transformer.layer_norm import LayerNorm


class EncoderLayer(nn.Module):
    """Encoder layer module.

    In `forward` the sub-modules are applied in this order, each with a
    residual connection: optional macaron feed-forward, self-attention,
    optional convolution module, feed-forward. When a convolution module is
    present a final layer norm is applied to the block output.

    Args:
        size (int): Input dimension.
        self_attn (torch.nn.Module): Self-attention module instance.
            `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance
            can be used as the argument.
        feed_forward (torch.nn.Module): Feed-forward module instance.
            `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
            can be used as the argument.
        feed_forward_macaron (torch.nn.Module): Additional feed-forward module instance
            or None to disable the macaron-style feed-forward.
            `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
            can be used as the argument.
        conv_module (torch.nn.Module): Convolution module instance or None.
            `ConvolutionModule` instance can be used as the argument.
        dropout_rate (float): Dropout rate.
        normalize_before (bool): If True, layer norm is applied before each
            sub-module; if False, after each residual connection.
        concat_after (bool): Whether to concat attention layer's input and output.
            if True, additional linear will be applied.
            i.e. x -> x + linear(concat(x, att(x)))
            if False, no additional linear will be applied. i.e. x -> x + att(x)

    """

    def __init__(
        self,
        size,
        self_attn,
        feed_forward,
        feed_forward_macaron,
        conv_module,
        dropout_rate,
        normalize_before=True,
        concat_after=False,
    ):
        """Construct an EncoderLayer object."""
        super(EncoderLayer, self).__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.feed_forward_macaron = feed_forward_macaron
        self.conv_module = conv_module
        self.norm_ff = LayerNorm(size)  # for the FNN module
        self.norm_mha = LayerNorm(size)  # for the MHA module
        if feed_forward_macaron is not None:
            self.norm_ff_macaron = LayerNorm(size)
            # with two feed-forward modules each one contributes half
            self.ff_scale = 0.5
        else:
            self.ff_scale = 1.0
        if self.conv_module is not None:
            self.norm_conv = LayerNorm(size)  # for the CNN module
            self.norm_final = LayerNorm(size)  # for the final output of the block
        self.dropout = nn.Dropout(dropout_rate)
        self.size = size
        self.normalize_before = normalize_before
        self.concat_after = concat_after
        if self.concat_after:
            # projects concat(input, attention output) back to `size`
            self.concat_linear = nn.Linear(size + size, size)

    def forward(self, x_input, mask, cache=None):
        """Compute encoded features.

        Args:
            x_input (Union[Tuple, torch.Tensor]): Input tensor w/ or w/o pos emb.
                - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)].
                - w/o pos emb: Tensor (#batch, time, size).
            mask (torch.Tensor): Mask tensor for the input (#batch, time).
                NOTE(review): the cache branch slices it with three indices,
                so it is presumably 3-dimensional - confirm.
            cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size).

        Returns:
            Union[Tuple, torch.Tensor]: Output tensor (#batch, time, size),
                returned as ``(output, pos_emb)`` when a positional embedding
                was given.
            torch.Tensor: Mask tensor (#batch, time).

        """
        # split the positional embedding off the input when it is present
        if isinstance(x_input, tuple):
            x, pos_emb = x_input[0], x_input[1]
        else:
            x, pos_emb = x_input, None

        # whether to use macaron style
        if self.feed_forward_macaron is not None:
            residual = x
            if self.normalize_before:
                x = self.norm_ff_macaron(x)
            x = residual + self.ff_scale * self.dropout(self.feed_forward_macaron(x))
            if not self.normalize_before:
                x = self.norm_ff_macaron(x)

        # multi-headed self-attention module
        residual = x
        if self.normalize_before:
            x = self.norm_mha(x)

        if cache is None:
            x_q = x
        else:
            # with a cache only the last frame is used as query; keys and
            # values still cover the whole sequence
            assert cache.shape == (x.shape[0], x.shape[1] - 1, self.size)
            x_q = x[:, -1:, :]
            residual = residual[:, -1:, :]
            mask = None if mask is None else mask[:, -1:, :]

        # attention modules taking a positional embedding get it as an
        # extra argument before the mask
        if pos_emb is not None:
            x_att = self.self_attn(x_q, x, x, pos_emb, mask)
        else:
            x_att = self.self_attn(x_q, x, x, mask)

        if self.concat_after:
            # NOTE(review): x spans the full sequence while x_att is computed
            # from x_q; with a cache their time lengths presumably differ -
            # confirm concat_after is never combined with cache
            x_concat = torch.cat((x, x_att), dim=-1)
            x = residual + self.concat_linear(x_concat)
        else:
            x = residual + self.dropout(x_att)
        if not self.normalize_before:
            x = self.norm_mha(x)

        # convolution module
        if self.conv_module is not None:
            residual = x
            if self.normalize_before:
                x = self.norm_conv(x)
            x = residual + self.dropout(self.conv_module(x))
            if not self.normalize_before:
                x = self.norm_conv(x)

        # feed forward module
        residual = x
        if self.normalize_before:
            x = self.norm_ff(x)
        x = residual + self.ff_scale * self.dropout(self.feed_forward(x))
        if not self.normalize_before:
            x = self.norm_ff(x)

        # the final norm exists only when a convolution module is configured
        if self.conv_module is not None:
            x = self.norm_final(x)

        # prepend the cached frames to the newly computed last frame
        if cache is not None:
            x = torch.cat([cache, x], dim=1)

        # hand the positional embedding on to the next block
        if pos_emb is not None:
            return (x, pos_emb), mask

        return x, mask


================================================
FILE: nets/pytorch_backend/conformer/swish.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright 2020 Johns Hopkins University (Shinji Watanabe)
#                Northwestern Polytechnical University (Pengcheng Guo)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Swish() activation function for Conformer."""

import torch


class Swish(torch.nn.Module):
    """Swish activation: ``x * sigmoid(x)``, applied element-wise."""

    def forward(self, x):
        """Return the Swish activation of ``x``."""
        gate = torch.sigmoid(x)
        return x * gate


================================================
FILE: nets/pytorch_backend/ctc.py
================================================
from distutils.version import LooseVersion
import logging

import numpy as np
import six
import torch
import torch.nn.functional as F

from espnet.nets.pytorch_backend.nets_utils import to_device


class CTC(torch.nn.Module):
    """CTC module

    :param int odim: dimension of outputs
    :param int eprojs: number of encoder projection units
    :param float dropout_rate: dropout rate (0.0 ~ 1.0)
    :param str ctc_type: builtin or warpctc
    :param bool reduce: reduce the CTC loss into a scalar
    """

    def __init__(self, odim, eprojs, dropout_rate, ctc_type="warpctc", reduce=True):
        super().__init__()
        self.dropout_rate = dropout_rate
        self.loss = None
        self.ctc_lo = torch.nn.Linear(eprojs, odim)
        self.probs = None  # for visualization

        # In case of Pytorch >= 1.7.0, CTC will be always builtin
        self.ctc_type = (
            ctc_type
            if LooseVersion(torch.__version__) < LooseVersion("1.7.0")
            else "builtin"
        )

        # ctc_type = buitin not support Pytorch=1.0.1
        if self.ctc_type == "builtin" and (
            LooseVersion(torch.__version__) < LooseVersion("1.1.0")
        ):
            self.ctc_type = "cudnnctc"

        if ctc_type != self.ctc_type:
            logging.warning(f"CTC was set to {self.ctc_type} due to PyTorch version.")

        if self.ctc_type == "builtin":
            reduction_type = "sum" if reduce else "none"
            self.ctc_loss = torch.nn.CTCLoss(
                reduction=reduction_type, zero_infinity=True
            )
        elif self.ctc_type == "cudnnctc":
            reduction_type = "sum" if reduce else "none"
            self.ctc_loss = torch.nn.CTCLoss(reduction=reduction_type)
        elif self.ctc_type == "warpctc":
            import warpctc_pytorch as warp_ctc

            self.ctc_loss = warp_ctc.CTCLoss(size_average=True, reduce=reduce)
        elif self.ctc_type == "gtnctc":
            from espnet.nets.pytorch_backend.gtn_ctc import GTNCTCLossFunction

            self.ctc_loss = GTNCTCLossFunction.apply
        else:
            raise ValueError(
                'ctc_type must be "builtin" or "warpctc": {}'.format(self.ctc_type)
            )

        self.ignore_id = -1
        self.reduce = reduce

    def loss_fn(self, th_pred, th_target, th_ilen, th_olen):
        if self.ctc_type in ["builtin", "cudnnctc"]:
            th_pred = th_pred.log_softmax(2)
            # Use the deterministic CuDNN implementation of CTC loss to avoid
            #  [issue#17798](https://github.com/pytorch/pytorch/issues/17798)
            with torch.backends.cudnn.flags(deterministic=True):
                loss = self.ctc_loss(th_pred, th_target, th_ilen, th_olen)
            # Batch-size average
            loss = loss / th_pred.size(1)
            return loss
        elif self.ctc_type == "warpctc":
            return self.ctc_loss(th_pred, th_target, th_ilen, th_olen)
        elif self.ctc_type == "gtnctc":
            targets = [t.tolist() for t in th_target]
            log_probs = torch.nn.functional.log_softmax(th_pred, dim=2)
            return self.ctc_loss(log_probs, targets, 0, "none")
        else:
            raise NotImplementedError

    # Add the texts to be compatible with MMI loss
    def forward(self, hs_pad, hlens, ys_pad, texts=None):
        """CTC forward

        :param torch.Tensor hs_pad: batch of padded hidden state sequences (B, Tmax, D)
        :param torch.Tensor hlens: batch of lengths of hidden state sequences (B)
        :param torch.Tensor ys_pad:
            batch of padded character id sequence tensor (B, Lmax)
        :param texts: not used by the CTC loss; accepted only so the call
            signature matches the MMI loss. Optional, so callers that pass
            just ``(hs_pad, hlens, ys_pad)`` keep working.
        :return: ctc loss value
        :rtype: torch.Tensor
        """
        # TODO(kan-bayashi): need to make more smart way
        ys = [y[y != self.ignore_id] for y in ys_pad]  # parse padded ys

        # F.dropout defaults to training=True, which would keep dropout
        # active in eval mode; follow the module's training flag instead.
        ys_hat = self.ctc_lo(
            F.dropout(hs_pad, p=self.dropout_rate, training=self.training)
        )
        if self.ctc_type != "gtnctc":
            # every backend except gtn gets the first two axes swapped
            ys_hat = ys_hat.transpose(0, 1)

        if self.ctc_type == "builtin":
            olens = to_device(ys_hat, torch.LongTensor([len(s) for s in ys]))
            hlens = hlens.long()
            ys_pad = torch.cat(ys)  # without this the code breaks for asr_mix
            self.loss = self.loss_fn(ys_hat, ys_pad, hlens, olens)
        else:
            self.loss = None
            hlens = torch.from_numpy(np.fromiter(hlens, dtype=np.int32))
            olens = torch.from_numpy(
                np.fromiter((x.size(0) for x in ys), dtype=np.int32)
            )
            # targets concatenated into one 1d int tensor on CPU
            ys_true = torch.cat(ys).cpu().int()
            # get ctc loss
            # expected shape of seqLength x batchSize x alphabet_size
            dtype = ys_hat.dtype
            if self.ctc_type == "warpctc" or dtype == torch.float16:
                # warpctc only supports float32
                # torch.ctc does not support float16 (#1751)
                ys_hat = ys_hat.to(dtype=torch.float32)
            if self.ctc_type == "cudnnctc":
                # use GPU when using the cuDNN implementation
                ys_true = to_device(hs_pad, ys_true)
            if self.ctc_type == "gtnctc":
                # keep as list for gtn
                ys_true = ys
            # move the loss next to hs_pad and restore the activation dtype
            self.loss = to_device(
                hs_pad, self.loss_fn(ys_hat, ys_true, hlens, olens)
            ).to(dtype=dtype)

        # get length info
        logging.info(
            self.__class__.__name__
            + " input lengths:  "
            + "".join(str(hlens).split("\n"))
        )
        logging.info(
            self.__class__.__name__
            + " output lengths: "
            + "".join(str(olens).split("\n"))
        )

        if self.reduce:
            # NOTE: sum() is needed to keep consistency
            # since warpctc return as tensor w/ shape (1,)
            # but builtin return as tensor w/o shape (scalar).
            self.loss = self.loss.sum()
            logging.info("ctc loss:" + str(float(self.loss)))

        return self.loss

    def softmax(self, hs_pad):
        """softmax of frame activations

        :param torch.Tensor hs_pad: 3d tensor (B, Tmax, eprojs)
        :return: log softmax applied 3d tensor (B, Tmax, odim)
        :rtype: torch.Tensor
        """
        self.probs = F.softmax(self.ctc_lo(hs_pad), dim=2)
        return self.probs

    def log_softmax(self, hs_pad):
        """log_softmax of frame activations

        :param torch.Tensor hs_pad: 3d tensor (B, Tmax, eprojs)
        :return: log softmax applied 3d tensor (B, Tmax, odim)
        :rtype: torch.Tensor
        """
        return F.log_softmax(self.ctc_lo(hs_pad), dim=2)

    def argmax(self, hs_pad):
        """argmax of frame activations

        :param torch.Tensor hs_pad: 3d tensor (B, Tmax, eprojs)
        :return: argmax applied 2d tensor (B, Tmax)
        :rtype: torch.Tensor
        """
        return torch.argmax(self.ctc_lo(hs_pad), dim=2)

    def forced_align(self, h, y, blank_id=0):
        """forced alignment.

        :param torch.Tensor h: hidden state sequence, 2d tensor (T, D)
        :param torch.Tensor y: id sequence tensor 1d tensor (L)
        :param int y: blank symbol index
        :return: best alignment results
        :rtype: list
        """

        def interpolate_blank(label, blank_id=0):
            """Insert blank token between every two label token."""
            label = np.expand_dims(label, 1)
            blanks = np.zeros((label.shape[0], 1), dtype=np.int64) + blank_id
            label = np.concatenate([blanks, label], axis=1)
            label = label.reshape(-1)
            label = np.append(label, label[0])
            return label

        lpz = self.log_softmax(h)
        lpz = lpz.squeeze(0)

        y_int = interpolate_blank(y, blank_id)

        logdelta = np.zeros((lpz.size(0), len(y_int))) - 100000000000.0  # log of zero
        state_path = (
            np.zeros((lpz.size(0), len(y_int)), dtype=np.int16) - 1
        )  # state path

        logdelta[0, 0] = lpz[0][y_int[0]]
        logdelta[0, 1] = lpz[0][y_int[1]]

        for t in six.moves.range(1, lpz.size(0)):
            for s in six.moves.range(len(y_int)):
                if y_int[s] == blank_id or s < 2 or y_int[s] == y_int[s - 2]:
                    candidates = np.array([logdelta[t - 1, s], logdelta[t - 1, s - 1]])
                    prev_state = [s, s - 1]
                else:
                    candidates = np.array(
                        [
                            logdelta[t - 1, s],
                            logdelta[t - 1, s - 1],
                            logdelta[t - 1, s - 2],
                        ]
                    )
                    prev_state = [s, s - 1, s - 2]
                logdelta[t, s] = np.max(candidates) + lpz[t][y_int[s]]
                state_path[t, s] = prev_state[np.argmax(candidates)]

        state_seq = -1 * np.ones((lpz.size(0), 1), dtype=np.int16)

        candidates = np.array(
            [logdelta[-1, len(y_int) - 1], logdelta[-1, len(y_int) - 2]]
        )
        prev_state = [len(y_int) - 1, len(y_int) - 2]
        state_seq[-1] = prev_state[np.argmax(candidates)]
        for t in six.moves.range(lpz.size(0) - 2, -1, -1):
            state_seq[t] = state_path[t + 1, state_seq[t + 1, 0]]

        output_state_seq = []
        for t in six.moves.range(0, lpz.size(0)):
            output_state_seq.append(y_int[state_seq[t, 0]])

        return output_state_seq


def ctc_for(args, odim, reduce=True):
    """Build the CTC module(s) matching the program arguments.

    :param Namespace args: the program args
    :param int odim : The output dimension
    :param bool reduce : return the CTC loss in a scalar
    :return: a single CTC module when there is one encoder, otherwise a
        ModuleList (one shared module if ``args.share_ctc``, else one per
        encoder)
    :raises ValueError: if the number of encoders is below one
    """
    num_encs = getattr(args, "num_encs", 1)  # use getattr to keep compatibility

    def build(dropout_rate):
        return CTC(
            odim, args.eprojs, dropout_rate, ctc_type=args.ctc_type, reduce=reduce
        )

    if num_encs == 1:
        # compatible with single encoder asr mode
        return build(args.dropout_rate)

    if not num_encs >= 1:
        raise ValueError(
            "Number of encoders needs to be more than one. {}".format(num_encs)
        )

    if args.share_ctc:
        # use dropout_rate of the first encoder
        rates = [args.dropout_rate[0]]
    else:
        rates = [args.dropout_rate[idx] for idx in range(num_encs)]
    return torch.nn.ModuleList([build(rate) for rate in rates])


================================================
FILE: nets/pytorch_backend/e2e_asr.py
================================================
# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""RNN sequence-to-sequence speech recognition model (pytorch)."""

import argparse
from itertools import groupby
import logging
import math
import os

import chainer
from chainer import reporter
import editdistance
import numpy as np
import six
import torch

from espnet.nets.asr_interface import ASRInterface
from espnet.nets.e2e_asr_common import label_smoothing_dist
from espnet.nets.pytorch_backend.ctc import ctc_for
from espnet.nets.pytorch_backend.frontends.frontend import frontend_for
from espnet.nets.pytorch_backend.initialization import lecun_normal_init_parameters
from espnet.nets.pytorch_backend.initialization import set_forget_bias_to_one
from espnet.nets.pytorch_backend.nets_utils import get_subsample
from espnet.nets.pytorch_backend.nets_utils import pad_list
from espnet.nets.pytorch_backend.nets_utils import to_device
from espnet.nets.pytorch_backend.nets_utils import to_torch_tensor
from espnet.nets.pytorch_backend.rnn.argument import (
    add_arguments_rnn_encoder_common,  # noqa: H301
    add_arguments_rnn_decoder_common,  # noqa: H301
    add_arguments_rnn_attention_common,  # noqa: H301
)
from espnet.nets.pytorch_backend.rnn.attentions import att_for
from espnet.nets.pytorch_backend.rnn.decoders import decoder_for
from espnet.nets.pytorch_backend.rnn.encoders import encoder_for
from espnet.nets.scorers.ctc import CTCPrefixScorer
from espnet.utils.fill_missing_args import fill_missing_args

# Upper bound on the total loss accepted for reporting: E2E.forward only
# reports a step whose loss is below this value (and not NaN); otherwise it
# logs a warning instead.
CTC_LOSS_THRESHOLD = 10000


class Reporter(chainer.Chain):
    """A chainer reporter wrapper."""

    def report(self, loss_ctc, loss_att, loss_third, loss_mbr, acc, cer_ctc, cer, wer, mtl_loss):
        """Report the per-step statistics, one observation per value.

        The total loss ``mtl_loss`` is additionally written to the log and
        reported under the key ``"loss"``.
        """
        observations = (
            ("loss_ctc", loss_ctc),
            ("loss_att", loss_att),
            ("loss_third", loss_third),
            ("loss_mbr", loss_mbr),
            ("acc", acc),
            ("cer_ctc", cer_ctc),
            ("cer", cer),
            ("wer", wer),
        )
        for key, value in observations:
            reporter.report({key: value}, self)
        logging.info("mtl loss:" + str(mtl_loss))
        reporter.report({"loss": mtl_loss}, self)


class E2E(ASRInterface, torch.nn.Module):
    """E2E module.

    :param int idim: dimension of inputs
    :param int odim: dimension of outputs
    :param Namespace args: argument Namespace containing options

    """

    @staticmethod
    def add_arguments(parser):
        """Register the encoder, attention and decoder options on ``parser``."""
        for register in (
            E2E.encoder_add_arguments,
            E2E.attention_add_arguments,
            E2E.decoder_add_arguments,
        ):
            register(parser)
        return parser

    @staticmethod
    def encoder_add_arguments(parser):
        """Add the RNN encoder options in their own argument group."""
        encoder_group = parser.add_argument_group("E2E encoder setting")
        add_arguments_rnn_encoder_common(encoder_group)
        return parser

    @staticmethod
    def attention_add_arguments(parser):
        """Add the RNN attention options in their own argument group."""
        attention_group = parser.add_argument_group("E2E attention setting")
        add_arguments_rnn_attention_common(attention_group)
        return parser

    @staticmethod
    def decoder_add_arguments(parser):
        """Add the RNN decoder options in their own argument group."""
        decoder_group = parser.add_argument_group("E2E decoder setting")
        add_arguments_rnn_decoder_common(decoder_group)
        return parser

    def get_total_subsampling_factor(self):
        """Get total subsampling factor.

        It is the encoder's ``conv_subsampling_factor`` times the product of
        ``self.subsample``; for a ModuleList of encoders the first one is used.
        """
        if isinstance(self.enc, torch.nn.ModuleList):
            encoder = self.enc[0]
        else:
            encoder = self.enc
        frame_factor = int(np.prod(self.subsample))
        return encoder.conv_subsampling_factor * frame_factor

    def __init__(self, idim, odim, args):
        """Construct an E2E object.

        Builds encoder, CTC, attention and decoder from ``args``, applies the
        chainer-like initialization and stores the options used to report
        CER/WER during evaluation.

        :param int idim: dimension of inputs
        :param int odim: dimension of outputs
        :param Namespace args: argument Namespace containing options
        """
        super(E2E, self).__init__()
        torch.nn.Module.__init__(self)

        # fill missing arguments for compatibility
        args = fill_missing_args(args, self.add_arguments)

        self.mtlalpha = args.mtlalpha
        assert 0.0 <= self.mtlalpha <= 1.0, "mtlalpha should be [0.0, 1.0]"
        self.etype = args.etype
        self.verbose = args.verbose
        # NOTE: for self.build method
        args.char_list = getattr(args, "char_list", None)
        self.char_list = args.char_list
        self.outdir = args.outdir
        self.space = args.sym_space
        self.blank = args.sym_blank
        self.reporter = Reporter()

        # below means the last number becomes eos/sos ID
        # note that sos/eos IDs are identical
        self.sos = odim - 1
        self.eos = odim - 1

        # subsample info
        self.subsample = get_subsample(args, mode="asr", arch="rnn")

        # label smoothing info
        if args.lsm_type and os.path.isfile(args.train_json):
            logging.info("Use label smoothing with " + args.lsm_type)
            labeldist = label_smoothing_dist(
                odim, args.lsm_type, transcript=args.train_json
            )
        else:
            labeldist = None

        # No frontend is built in this constructor, yet forward/encode/
        # recognize_batch/enhance/calculate_all_* all test
        # ``self.frontend is not None``. Without the attribute they would
        # raise AttributeError, so default it to None while keeping any value
        # an ancestor class may already provide.
        self.frontend = getattr(self, "frontend", None)

        # encoder
        self.enc = encoder_for(args, idim, self.subsample)
        # ctc
        self.ctc = ctc_for(args, odim)
        # attention
        self.att = att_for(args)
        # decoder
        self.dec = decoder_for(args, odim, self.sos, self.eos, self.att, labeldist)

        # weight initialization
        self.init_like_chainer()

        # options for beam search (only needed when CER/WER is reported)
        if args.report_cer or args.report_wer:
            recog_args = {
                "beam_size": args.beam_size,
                "penalty": args.penalty,
                "ctc_weight": args.ctc_weight,
                "maxlenratio": args.maxlenratio,
                "minlenratio": args.minlenratio,
                "lm_weight": args.lm_weight,
                "rnnlm": args.rnnlm,
                "nbest": args.nbest,
                "space": args.sym_space,
                "blank": args.sym_blank,
            }

            self.recog_args = argparse.Namespace(**recog_args)
            self.report_cer = args.report_cer
            self.report_wer = args.report_wer
        else:
            self.report_cer = False
            self.report_wer = False
        self.rnnlm = None

        self.logzero = -10000000000.0
        self.loss = None
        self.acc = None

    def init_like_chainer(self):
        """Initialize weight like chainer.

        chainer basically uses LeCun way: W ~ Normal(0, fan_in ** -0.5), b = 0
        pytorch basically uses W, b ~ Uniform(-fan_in**-0.5, fan_in**-0.5)
        however, there are two exceptions as far as I know.
        - EmbedID.W ~ Normal(0, 1)
        - LSTM.upward.b[forget_gate_range] = 1 (but not used in NStepLSTM)
        """
        lecun_normal_init_parameters(self)

        # exception 1: embed weight ~ Normal(0, 1)
        embedding = self.dec.embed
        embedding.weight.data.normal_(0, 1)

        # exception 2: forget-bias = 1.0
        # https://discuss.pytorch.org/t/set-forget-gate-bias-of-lstm/1745
        decoder_cells = self.dec.decoder
        for layer_idx in range(len(decoder_cells)):
            set_forget_bias_to_one(decoder_cells[layer_idx].bias_ih)

    def forward(self, xs_pad, ilens, ys_pad):
        """E2E forward.

        Runs the frontend (if any) and the encoder, computes the CTC and
        attention losses selected by ``mtlalpha``, optionally computes
        CER/WER, and reports the statistics through ``self.reporter``.

        :param torch.Tensor xs_pad: batch of padded input sequences (B, Tmax, idim)
        :param torch.Tensor ilens: batch of lengths of input sequences (B)
        :param torch.Tensor ys_pad: batch of padded token id sequence tensor (B, Lmax)
        :return: loss value
        :rtype: torch.Tensor
        """
        # 0. Frontend
        if self.frontend is not None:
            hs_pad, hlens, mask = self.frontend(to_torch_tensor(xs_pad), ilens)
            hs_pad, hlens = self.feature_transform(hs_pad, hlens)
        else:
            hs_pad, hlens = xs_pad, ilens

        # 1. Encoder
        hs_pad, hlens, _ = self.enc(hs_pad, hlens)

        # 2. CTC loss
        if self.mtlalpha == 0:
            self.loss_ctc = None
        else:
            # CTC.forward takes a trailing ``texts`` argument that the CTC
            # loss itself does not use; no texts are available here, so
            # None is passed explicitly.
            self.loss_ctc = self.ctc(hs_pad, hlens, ys_pad, None)

        # 3. attention loss
        if self.mtlalpha == 1:
            self.loss_att, acc = None, None
        else:
            self.loss_att, acc, _ = self.dec(hs_pad, hlens, ys_pad)
        self.acc = acc

        # 4. compute cer without beam search
        if self.mtlalpha == 0 or self.char_list is None:
            cer_ctc = None
        else:
            cers = []

            y_hats = self.ctc.argmax(hs_pad).data
            for i, y in enumerate(y_hats):
                # collapse repeated frame labels
                y_hat = [x[0] for x in groupby(y)]
                y_true = ys_pad[i]

                seq_hat = [self.char_list[int(idx)] for idx in y_hat if int(idx) != -1]
                seq_true = [
                    self.char_list[int(idx)] for idx in y_true if int(idx) != -1
                ]
                seq_hat_text = "".join(seq_hat).replace(self.space, " ")
                seq_hat_text = seq_hat_text.replace(self.blank, "")
                seq_true_text = "".join(seq_true).replace(self.space, " ")

                hyp_chars = seq_hat_text.replace(" ", "")
                ref_chars = seq_true_text.replace(" ", "")
                # utterances with an empty reference are skipped
                if len(ref_chars) > 0:
                    cers.append(
                        editdistance.eval(hyp_chars, ref_chars) / len(ref_chars)
                    )

            cer_ctc = sum(cers) / len(cers) if cers else None

        # 5. compute cer/wer
        if self.training or not (self.report_cer or self.report_wer):
            cer, wer = 0.0, 0.0
            # oracle_cer, oracle_wer = 0.0, 0.0
        else:
            if self.recog_args.ctc_weight > 0.0:
                lpz = self.ctc.log_softmax(hs_pad).data
            else:
                lpz = None

            word_eds, word_ref_lens, char_eds, char_ref_lens = [], [], [], []
            nbest_hyps = self.dec.recognize_beam_batch(
                hs_pad,
                torch.tensor(hlens),
                lpz,
                self.recog_args,
                self.char_list,
                self.rnnlm,
            )
            # remove <sos> and <eos>
            y_hats = [nbest_hyp[0]["yseq"][1:-1] for nbest_hyp in nbest_hyps]
            for i, y_hat in enumerate(y_hats):
                y_true = ys_pad[i]

                seq_hat = [self.char_list[int(idx)] for idx in y_hat if int(idx) != -1]
                seq_true = [
                    self.char_list[int(idx)] for idx in y_true if int(idx) != -1
                ]
                seq_hat_text = "".join(seq_hat).replace(self.recog_args.space, " ")
                seq_hat_text = seq_hat_text.replace(self.recog_args.blank, "")
                seq_true_text = "".join(seq_true).replace(self.recog_args.space, " ")

                hyp_words = seq_hat_text.split()
                ref_words = seq_true_text.split()
                word_eds.append(editdistance.eval(hyp_words, ref_words))
                word_ref_lens.append(len(ref_words))
                hyp_chars = seq_hat_text.replace(" ", "")
                ref_chars = seq_true_text.replace(" ", "")
                char_eds.append(editdistance.eval(hyp_chars, ref_chars))
                char_ref_lens.append(len(ref_chars))

            wer = (
                0.0
                if not self.report_wer
                else float(sum(word_eds)) / sum(word_ref_lens)
            )
            cer = (
                0.0
                if not self.report_cer
                else float(sum(char_eds)) / sum(char_ref_lens)
            )

        # combine the two losses according to the multi-task weight
        alpha = self.mtlalpha
        if alpha == 0:
            self.loss = self.loss_att
            loss_att_data = float(self.loss_att)
            loss_ctc_data = None
        elif alpha == 1:
            self.loss = self.loss_ctc
            loss_att_data = None
            loss_ctc_data = float(self.loss_ctc)
        else:
            self.loss = alpha * self.loss_ctc + (1 - alpha) * self.loss_att
            loss_att_data = float(self.loss_att)
            loss_ctc_data = float(self.loss_ctc)

        loss_data = float(self.loss)
        if loss_data < CTC_LOSS_THRESHOLD and not math.isnan(loss_data):
            # Reporter.report expects (loss_ctc, loss_att, loss_third,
            # loss_mbr, acc, cer_ctc, cer, wer, mtl_loss). This model has no
            # third/MBR loss, so those two slots are reported as None.
            self.reporter.report(
                loss_ctc_data,
                loss_att_data,
                None,
                None,
                acc,
                cer_ctc,
                cer,
                wer,
                loss_data,
            )
        else:
            logging.warning("loss (=%f) is not correct", loss_data)
        return self.loss

    def scorers(self):
        """Return the decoder and a CTC prefix scorer keyed by name."""
        ctc_scorer = CTCPrefixScorer(self.ctc, self.eos)
        return {"decoder": self.dec, "ctc": ctc_scorer}

    def encode(self, x):
        """Encode acoustic features.

        Puts the module in eval mode and leaves it there.

        :param ndarray x: input acoustic feature (T, D)
        :return: encoder outputs
        :rtype: torch.Tensor
        """
        self.eval()
        # the length is taken before the frame subsampling below
        hlens = [x.shape[0]]

        # subsample frame and match device/dtype of the model parameters
        subsampled = x[:: self.subsample[0], :]
        ref_param = next(self.parameters())
        feats = torch.as_tensor(
            subsampled, device=ref_param.device, dtype=ref_param.dtype
        )
        # make a utt list (1) to use the same interface for encoder
        hs = feats.contiguous().unsqueeze(0)

        # 0. Frontend
        if self.frontend is not None:
            enhanced, hlens, _ = self.frontend(hs, hlens)
            hs, hlens = self.feature_transform(enhanced, hlens)

        # 1. encoder
        encoded, _, _ = self.enc(hs, hlens)
        return encoded.squeeze(0)

    def recognize(self, x, recog_args, char_list, rnnlm=None):
        """E2E beam search.

        :param ndarray x: input acoustic feature (T, D)
        :param Namespace recog_args: argument Namespace containing options
        :param list char_list: list of characters
        :param torch.nn.Module rnnlm: language model module
        :return: N-best decoding results
        :rtype: list
        """
        encoded = self.encode(x)
        hs = encoded.unsqueeze(0)

        # log P(z_t|X) is only needed when CTC scores take part in decoding
        use_ctc = recog_args.ctc_weight > 0.0
        lpz = self.ctc.log_softmax(hs)[0] if use_ctc else None

        # decode the first (and only) utterance
        return self.dec.recognize_beam(hs[0], lpz, recog_args, char_list, rnnlm)

    def recognize_batch(self, xs, recog_args, char_list, rnnlm=None):
        """E2E batch beam search.

        Runs in eval mode and switches back to train mode afterwards if the
        module was training when called.

        :param list xs: list of input acoustic feature arrays [(T_1, D), (T_2, D), ...]
        :param Namespace recog_args: argument Namespace containing options
        :param list char_list: list of characters
        :param torch.nn.Module rnnlm: language model module
        :return: N-best decoding results
        :rtype: list
        """
        was_training = self.training
        self.eval()
        # lengths are taken before the frame subsampling below
        ilens = np.fromiter((feat.shape[0] for feat in xs), dtype=np.int64)

        # subsample frame, move to the model device and pad
        step = self.subsample[0]
        tensors = [
            to_device(self, to_torch_tensor(feat[::step, :]).float()) for feat in xs
        ]
        xs_pad = pad_list(tensors, 0.0)

        # 0. Frontend
        if self.frontend is None:
            hs_pad, hlens = xs_pad, ilens
        else:
            enhanced, hlens, _ = self.frontend(xs_pad, ilens)
            hs_pad, hlens = self.feature_transform(enhanced, hlens)

        # 1. Encoder
        hs_pad, hlens, _ = self.enc(hs_pad, hlens)

        # calculate log P(z_t|X) for CTC scores; scores are normalized only
        # when CTC is not used
        use_ctc = recog_args.ctc_weight > 0.0
        lpz = self.ctc.log_softmax(hs_pad) if use_ctc else None

        # 2. Decoder
        hlens = torch.tensor([int(n) for n in hlens])  # make sure hlens is tensor
        y = self.dec.recognize_beam_batch(
            hs_pad,
            hlens,
            lpz,
            recog_args,
            char_list,
            rnnlm,
            normalize_score=not use_ctc,
        )

        if was_training:
            self.train()
        return y

    def enhance(self, xs):
        """Forward only in the frontend stage.

        Runs in eval mode and switches back to train mode afterwards if the
        module was training when called.

        :param xs: iterable of input acoustic features; each item is
            subsampled along its first axis
        :return: enhanced features and mask as numpy arrays, plus the input
            lengths measured before subsampling
        :rtype: tuple
        :raises RuntimeError: if the model has no frontend
        """
        if self.frontend is None:
            raise RuntimeError("Frontend does't exist")
        was_training = self.training
        self.eval()
        ilens = np.fromiter((feat.shape[0] for feat in xs), dtype=np.int64)

        # subsample frame, move to the model device and pad
        step = self.subsample[0]
        tensors = [
            to_device(self, to_torch_tensor(feat[::step, :]).float()) for feat in xs
        ]
        xs_pad = pad_list(tensors, 0.0)
        enhanced, _, mask = self.frontend(xs_pad, ilens)
        if was_training:
            self.train()
        return enhanced.cpu().numpy(), mask.cpu().numpy(), ilens

    def calculate_all_attentions(self, xs_pad, ilens, ys_pad):
        """E2E attention calculation.

        Runs in eval mode without gradients and then restores the mode the
        module was in when called.

        :param torch.Tensor xs_pad: batch of padded input sequences (B, Tmax, idim)
        :param torch.Tensor ilens: batch of lengths of input sequences (B)
        :param torch.Tensor ys_pad: batch of padded token id sequence tensor (B, Lmax)
        :return: attention weights with the following shape,
            1) multi-head case => attention weights (B, H, Lmax, Tmax),
            2) other case => attention weights (B, Lmax, Tmax).
        :rtype: float ndarray
        """
        # Remember the caller's mode: switching to train unconditionally
        # would silently re-enable dropout for a model used in eval mode.
        was_training = self.training
        self.eval()
        with torch.no_grad():
            # 0. Frontend
            if self.frontend is not None:
                hs_pad, hlens, mask = self.frontend(to_torch_tensor(xs_pad), ilens)
                hs_pad, hlens = self.feature_transform(hs_pad, hlens)
            else:
                hs_pad, hlens = xs_pad, ilens

            # 1. Encoder
            hpad, hlens, _ = self.enc(hs_pad, hlens)

            # 2. Decoder
            att_ws = self.dec.calculate_all_attentions(hpad, hlens, ys_pad)
        if was_training:
            self.train()
        return att_ws

    def calculate_all_ctc_probs(self, xs_pad, ilens, ys_pad):
        """E2E CTC probability calculation.

        Runs in eval mode without gradients and then restores the mode the
        module was in when called.

        :param torch.Tensor xs_pad: batch of padded input sequences (B, Tmax)
        :param torch.Tensor ilens: batch of lengths of input sequences (B)
        :param torch.Tensor ys_pad: batch of padded token id sequence tensor (B, Lmax)
        :return: CTC probability (B, Tmax, vocab), or None when
            ``mtlalpha == 0`` (CTC is not used)
        :rtype: float ndarray
        """
        if self.mtlalpha == 0:
            return None

        # Remember the caller's mode: switching to train unconditionally
        # would silently re-enable dropout for a model used in eval mode.
        was_training = self.training
        self.eval()
        with torch.no_grad():
            # 0. Frontend
            if self.frontend is not None:
                hs_pad, hlens, mask = self.frontend(to_torch_tensor(xs_pad), ilens)
                hs_pad, hlens = self.feature_transform(hs_pad, hlens)
            else:
                hs_pad, hlens = xs_pad, ilens

            # 1. Encoder
            hpad, hlens, _ = self.enc(hs_pad, hlens)

            # 2. CTC probs
            probs = self.ctc.softmax(hpad).cpu().numpy()
        if was_training:
            self.train()
        return probs

    def subsample_frames(self, x):
        """Subsample speech frames in the encoder.

        :param x: 2d feature array; every ``self.subsample[0]``-th row is kept
        :return: the subsampled features as a contiguous float32 tensor on
            the model device, and a one-element list with the number of kept
            frames
        :rtype: tuple
        """
        # subsample frame
        x = x[:: self.subsample[0], :]
        ilen = [x.shape[0]]
        h = to_device(self, torch.from_numpy(np.array(x, dtype=np.float32)))
        # Tensor.contiguous() returns the contiguous tensor rather than
        # working in place, so its result has to be the one handed back.
        h = h.contiguous()
        return h, ilen


================================================
FILE: nets/pytorch_backend/e2e_asr_conformer.py
================================================
# Copyright 2020 Johns Hopkins University (Shinji Watanabe)
#                Northwestern Polytechnical University (Pengcheng Guo)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""
Conformer speech recognition model (pytorch).

It is a fusion of `e2e_asr_transformer.py`
Refer to: https://arxiv.org/abs/2005.08100

"""

from espnet.nets.pytorch_backend.conformer.encoder import Encoder
from espnet.nets.pytorch_backend.e2e_asr_transformer import E2E as E2ETransformer
from espnet.nets.pytorch_backend.conformer.argument import (
    add_arguments_conformer_common,  # noqa: H301
    verify_rel_pos_type,  # noqa: H301
)


class E2E(E2ETransformer):
    """Conformer E2E module.

    Everything is inherited from the transformer ``E2E``; only the ``encoder``
    attribute is replaced by a conformer ``Encoder``.

    :param int idim: dimension of inputs
    :param int odim: dimension of outputs
    :param Namespace args: argument Namespace containing options

    """

    @staticmethod
    def add_arguments(parser):
        """Register the transformer options, then the conformer ones."""
        for register in (E2ETransformer.add_arguments, E2E.add_conformer_arguments):
            register(parser)
        return parser

    @staticmethod
    def add_conformer_arguments(parser):
        """Register the conformer options in a dedicated argument group."""
        add_arguments_conformer_common(
            parser.add_argument_group("conformer model specific setting")
        )
        return parser

    def __init__(self, idim, odim, args, ignore_id=-1):
        """Construct an E2E object.

        :param int idim: dimension of inputs
        :param int odim: dimension of outputs
        :param Namespace args: argument Namespace containing options
        :param int ignore_id: forwarded unchanged to the parent constructor
        """
        super().__init__(idim, odim, args, ignore_id)

        # An unset attention dropout rate falls back to the general one.
        if args.transformer_attn_dropout_rate is None:
            args.transformer_attn_dropout_rate = args.dropout_rate

        # Check the relative positional encoding type
        args = verify_rel_pos_type(args)

        # Swap the encoder built by the parent for a conformer encoder.
        encoder_options = dict(
            idim=idim,
            attention_dim=args.adim,
            attention_heads=args.aheads,
            linear_units=args.eunits,
            num_blocks=args.elayers,
            input_layer=args.transformer_input_layer,
            dropout_rate=args.dropout_rate,
            positional_dropout_rate=args.dropout_rate,
            attention_dropout_rate=args.transformer_attn_dropout_rate,
            pos_enc_layer_type=args.transformer_encoder_pos_enc_layer_type,
            selfattention_layer_type=args.transformer_encoder_selfattn_layer_type,
            activation_type=args.transformer_encoder_activation_type,
            macaron_style=args.macaron_style,
            use_cnn_module=args.use_cnn_module,
            zero_triu=args.zero_triu,
            cnn_module_kernel=args.cnn_module_kernel,
        )
        self.encoder = Encoder(**encoder_options)

        # Parameters are re-initialised now that the encoder was replaced.
        self.reset_parameters(args)


================================================
FILE: nets/pytorch_backend/e2e_asr_maskctc.py
================================================
# Copyright 2020 Johns Hopkins University (Shinji Watanabe)
#                Waseda University (Yosuke Higuchi)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""
Mask CTC based non-autoregressive speech recognition model (pytorch).

See https://arxiv.org/abs/2005.08700 for the detail.

"""

from itertools import groupby
import logging
import math

from distutils.util import strtobool
import numpy
import torch

from espnet.nets.pytorch_backend.conformer.encoder import Encoder
from espnet.nets.pytorch_backend.conformer.argument import (
    add_arguments_conformer_common,  # noqa: H301
)
from espnet.nets.pytorch_backend.e2e_asr import CTC_LOSS_THRESHOLD
from espnet.nets.pytorch_backend.e2e_asr_transformer import E2E as E2ETransformer
from espnet.nets.pytorch_backend.maskctc.add_mask_token import mask_uniform
from espnet.nets.pytorch_backend.maskctc.mask import square_mask
from espnet.nets.pytorch_backend.nets_utils import make_non_pad_mask
from espnet.nets.pytorch_backend.nets_utils import th_accuracy


class E2E(E2ETransformer):
    """Mask CTC E2E module built on top of the transformer E2E model.

    The output dimension is extended by one entry used as the mask token.
    During recognition the greedy CTC output is taken as a starting point,
    low-confidence tokens are replaced by the mask token and the decoder
    fills them in (see ``recognize``).

    :param int idim: dimension of inputs
    :param int odim: dimension of outputs
    :param Namespace args: argument Namespace containing options

    """

    @staticmethod
    def add_arguments(parser):
        """Add the transformer arguments followed by the Mask CTC ones."""
        E2ETransformer.add_arguments(parser)
        E2E.add_maskctc_arguments(parser)

        return parser

    @staticmethod
    def add_maskctc_arguments(parser):
        """Add arguments for maskctc model.

        Registers ``--maskctc-use-conformer-encoder`` (default ``False``) and
        the common conformer options in a dedicated argument group.
        """
        group = parser.add_argument_group("maskctc specific setting")

        group.add_argument(
            "--maskctc-use-conformer-encoder",
            default=False,
            type=strtobool,
        )
        group = add_arguments_conformer_common(group)

        return parser

    def __init__(self, idim, odim, args, ignore_id=-1):
        """Construct an E2E object.

        :param int idim: dimension of inputs
        :param int odim: dimension of outputs (without the mask token)
        :param Namespace args: argument Namespace containing options
        :param int ignore_id: forwarded unchanged to the parent constructor
        """
        odim += 1  # for the mask token

        super().__init__(idim, odim, args, ignore_id)
        assert 0.0 <= self.mtlalpha < 1.0, "mtlalpha should be [0.0, 1.0)"

        # ``odim`` already includes the mask token here: the mask takes the
        # last id, and sos/eos share the id right before it.
        self.mask_token = odim - 1
        self.sos = odim - 2
        self.eos = odim - 2
        self.odim = odim

        if args.maskctc_use_conformer_encoder:
            # NOTE(review): ``conformer_dropout_rate`` is not defined anywhere
            # in this file; presumably it comes from the argument parser --
            # confirm it exists before relying on this fallback.
            if args.transformer_attn_dropout_rate is None:
                args.transformer_attn_dropout_rate = args.conformer_dropout_rate
            # replace the encoder built by the parent with a conformer encoder
            self.encoder = Encoder(
                idim=idim,
                attention_dim=args.adim,
                attention_heads=args.aheads,
                linear_units=args.eunits,
                num_blocks=args.elayers,
                input_layer=args.transformer_input_layer,
                dropout_rate=args.dropout_rate,
                positional_dropout_rate=args.dropout_rate,
                attention_dropout_rate=args.transformer_attn_dropout_rate,
                pos_enc_layer_type=args.transformer_encoder_pos_enc_layer_type,
                selfattention_layer_type=args.transformer_encoder_selfattn_layer_type,
                activation_type=args.transformer_encoder_activation_type,
                macaron_style=args.macaron_style,
                use_cnn_module=args.use_cnn_module,
                cnn_module_kernel=args.cnn_module_kernel,
            )
        self.reset_parameters(args)

    def forward(self, xs_pad, ilens, ys_pad):
        """E2E forward.

        Computes the masked-token decoder loss and, when ``mtlalpha > 0``, the
        CTC loss, reports the statistics through ``self.reporter`` and returns
        the combined loss.

        :param torch.Tensor xs_pad: batch of padded source sequences (B, Tmax, idim)
        :param torch.Tensor ilens: batch of lengths of source sequences (B)
        :param torch.Tensor ys_pad: batch of padded target sequences (B, Lmax)
        :return: loss value (``alpha * loss_ctc + (1 - alpha) * loss_att``, or
            the attention loss alone when ``mtlalpha == 0``)
        :rtype: torch.Tensor
        """
        # 1. forward encoder
        xs_pad = xs_pad[:, : max(ilens)]  # for data parallel
        src_mask = make_non_pad_mask(ilens.tolist()).to(xs_pad.device).unsqueeze(-2)
        hs_pad, hs_mask = self.encoder(xs_pad, src_mask)
        # kept on the instance so it can be inspected after the call
        self.hs_pad = hs_pad

        # 2. forward decoder
        # build masked decoder inputs and the matching targets from ys_pad
        ys_in_pad, ys_out_pad = mask_uniform(
            ys_pad, self.mask_token, self.eos, self.ignore_id
        )
        ys_mask = square_mask(ys_in_pad, self.eos)
        pred_pad, pred_mask = self.decoder(ys_in_pad, ys_mask, hs_pad, hs_mask)
        self.pred_pad = pred_pad

        # 3. compute attention loss
        loss_att = self.criterion(pred_pad, ys_out_pad)
        self.acc = th_accuracy(
            pred_pad.view(-1, self.odim), ys_out_pad, ignore_label=self.ignore_id
        )

        # 4. compute ctc loss
        loss_ctc, cer_ctc = None, None
        if self.mtlalpha > 0:
            batch_size = xs_pad.size(0)
            # per-utterance sum over the encoder mask, used as encoder length
            hs_len = hs_mask.view(batch_size, -1).sum(1)
            loss_ctc = self.ctc(hs_pad.view(batch_size, -1, self.adim), hs_len, ys_pad)
            if self.error_calculator is not None:
                ys_hat = self.ctc.argmax(hs_pad.view(batch_size, -1, self.adim)).data
                cer_ctc = self.error_calculator(ys_hat.cpu(), ys_pad.cpu(), is_ctc=True)
            # for visualization
            if not self.training:
                self.ctc.softmax(hs_pad)

        # 5. compute cer/wer
        if self.training or self.error_calculator is None or self.decoder is None:
            cer, wer = None, None
        else:
            ys_hat = pred_pad.argmax(dim=-1)
            cer, wer = self.error_calculator(ys_hat.cpu(), ys_pad.cpu())

        # combine the losses; the constructor guarantees mtlalpha < 1
        alpha = self.mtlalpha
        if alpha == 0:
            self.loss = loss_att
            loss_att_data = float(loss_att)
            loss_ctc_data = None
        else:
            self.loss = alpha * loss_ctc + (1 - alpha) * loss_att
            loss_att_data = float(loss_att)
            loss_ctc_data = float(loss_ctc)

        # only report sane loss values; the loss is returned either way
        loss_data = float(self.loss)
        if loss_data < CTC_LOSS_THRESHOLD and not math.isnan(loss_data):
            self.reporter.report(
                loss_ctc_data, loss_att_data, self.acc, cer_ctc, cer, wer, loss_data
            )
        else:
            logging.warning("loss (=%f) is not correct", loss_data)
        return self.loss

    def recognize(self, x, recog_args, char_list=None, rnnlm=None):
        """Recognize input speech.

        Starts from the greedy CTC output, masks the tokens whose CTC
        probability is below ``recog_args.maskctc_probability_threshold`` and
        lets the decoder fill the masks in over up to
        ``recog_args.maskctc_n_iterations`` passes. The model is left in eval
        mode afterwards.

        :param ndarray x: input acoustic feature (B, T, D) or (T, D)
        :param Namespace recog_args: argument Namespace containing options
        :param list char_list: list of characters (indexed while logging)
        :param torch.nn.Module rnnlm: language model module (unused here)
        :return: decoding result, a one-element list holding a dict with the
            keys ``score`` and ``yseq``
        :rtype: list
        """

        def num2str(char_list, mask_token, mask_char="_"):
            # returns a function turning a token id list into a readable string
            def f(yl):
                cl = [char_list[y] if y != mask_token else mask_char for y in yl]
                return "".join(cl).replace("<space>", " ")

            return f

        n2s = num2str(char_list, self.mask_token)

        self.eval()
        h = self.encode(x).unsqueeze(0)

        # greedy ctc outputs
        ctc_probs, ctc_ids = torch.exp(self.ctc.log_softmax(h)).max(dim=-1)
        # collapse runs of identical consecutive ids
        y_hat = torch.stack([x[0] for x in groupby(ctc_ids[0])])
        # positions whose id is not 0 (presumably the CTC blank -- confirm)
        y_idx = torch.nonzero(y_hat != 0).squeeze(-1)

        # calculate token-level ctc probabilities by taking
        # the maximum probability of consecutive frames with
        # the same ctc symbols
        probs_hat = []
        cnt = 0
        for i, y in enumerate(y_hat.tolist()):
            probs_hat.append(-1)
            while cnt < ctc_ids.shape[1] and y == ctc_ids[0][cnt]:
                if probs_hat[i] < ctc_probs[0][cnt]:
                    probs_hat[i] = ctc_probs[0][cnt].item()
                cnt += 1
        probs_hat = torch.from_numpy(numpy.array(probs_hat))

        # mask ctc outputs based on ctc probabilities
        p_thres = recog_args.maskctc_probability_threshold
        mask_idx = torch.nonzero(probs_hat[y_idx] < p_thres).squeeze(-1)
        confident_idx = torch.nonzero(probs_hat[y_idx] >= p_thres).squeeze(-1)
        mask_num = len(mask_idx)

        # start from an all-mask sequence and copy in the confident tokens
        y_in = torch.zeros(1, len(y_idx), dtype=torch.long) + self.mask_token
        y_in[0][confident_idx] = y_hat[y_idx][confident_idx]

        logging.info("ctc:{}".format(n2s(y_in[0].tolist())))

        # iterative decoding
        if not mask_num == 0:
            K = recog_args.maskctc_n_iterations
            # never run more iterations than there are masks
            num_iter = K if mask_num >= K and K > 0 else mask_num

            for t in range(num_iter - 1):
                pred, _ = self.decoder(y_in, None, h, None)
                pred_score, pred_id = pred[0][mask_idx].max(dim=-1)
                # commit the highest-scoring predictions of this pass
                cand = torch.topk(pred_score, mask_num // num_iter, -1)[1]
                y_in[0][mask_idx[cand]] = pred_id[cand]
                # recompute which positions are still masked
                mask_idx = torch.nonzero(y_in[0] == self.mask_token).squeeze(-1)

                logging.info("msk:{}".format(n2s(y_in[0].tolist())))

            # predict leftover masks (|masks| < mask_num // num_iter)
            pred, pred_mask = self.decoder(y_in, None, h, None)
            y_in[0][mask_idx] = pred[0][mask_idx].argmax(dim=-1)

            logging.info("msk:{}".format(n2s(y_in[0].tolist())))

        ret = y_in.tolist()[0]
        # the score is a constant 0.0; the sequence is wrapped in sos/eos
        hyp = {"score": 0.0, "yseq": [self.sos] + ret + [self.eos]}

        return [hyp]


================================================
FILE: nets/pytorch_backend/e2e_asr_mix.py
================================================
#!/usr/bin/env python3

"""
This script is used to construct End-to-End models of multi-speaker ASR.

Copyright 2017 Johns Hopkins University (Shinji Watanabe)
 Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
"""

import argparse
from itertools import groupby
import logging
import math
import os
import sys

import editdistance
import numpy as np
import six
import torch

from espnet.nets.asr_interface import ASRInterface
from espnet.nets.e2e_asr_common import get_vgg2l_odim
from espnet.nets.e2e_asr_common import label_smoothing_dist
from espnet.nets.pytorch_backend.ctc import ctc_for
from espnet.nets.pytorch_backend.e2e_asr import E2E as E2EASR
from espnet.nets.pytorch_backend.e2e_asr import Reporter
from espnet.nets.pytorch_backend.frontends.feature_transform import (
    feature_transform_for,  # noqa: H301
)
from espnet.nets.pytorch_backend.frontends.frontend import frontend_for
from espnet.nets.pytorch_backend.initialization import lecun_normal_init_parameters
from espnet.nets.pytorch_backend.initialization import set_forget_bias_to_one
from espnet.nets.pytorch_backend.nets_utils import get_subsample
from espnet.nets.pytorch_backend.nets_utils import make_pad_mask
from espnet.nets.pytorch_backend.nets_utils import pad_list
from espnet.nets.pytorch_backend.nets_utils import to_device
from espnet.nets.pytorch_backend.nets_utils import to_torch_tensor
from espnet.nets.pytorch_backend.rnn.attentions import att_for
from espnet.nets.pytorch_backend.rnn.decoders import decoder_for
from espnet.nets.pytorch_backend.rnn.encoders import encoder_for as encoder_for_single
from espnet.nets.pytorch_backend.rnn.encoders import RNNP
from espnet.nets.pytorch_backend.rnn.encoders import VGG2L

CTC_LOSS_THRESHOLD = 10000


class PIT(object):
    """Permutation Invariant Training (PIT) module.

    :parameter int num_spkrs: number of speakers for PIT process (2 or 3)
    """

    def __init__(self, num_spkrs):
        """Initialize PIT module."""
        self.num_spkrs = num_spkrs

        # [[0, 1], [1, 0]] or
        # [[0, 1, 2], [0, 2, 1], [1, 0, 2], [1, 2, 0], [2, 1, 0], [2, 0, 1]]
        self.perm_choices = []
        initial_seq = np.linspace(0, num_spkrs - 1, num_spkrs, dtype=np.int64)
        self.permutationDFS(initial_seq, 0)

        # [[0, 3], [1, 2]] or
        # [[0, 4, 8], [0, 5, 7], [1, 3, 8], [1, 5, 6], [2, 4, 6], [2, 3, 7]]
        self.loss_perm_idx = np.linspace(
            0, num_spkrs * (num_spkrs - 1), num_spkrs, dtype=np.int64
        ).reshape(1, num_spkrs)
        self.loss_perm_idx = (self.loss_perm_idx + np.array(self.perm_choices)).tolist()

    def min_pit_sample(self, loss):
        """Compute the PIT loss for each sample.

        :param 1-D torch.Tensor loss: list of losses for one sample,
            including [h1r1, h1r2, h2r1, h2r2] or
            [h1r1, h1r2, h1r3, h2r1, h2r2, h2r3, h3r1, h3r2, h3r3]
        :return minimum loss of best permutation
        :rtype torch.Tensor (1)
        :return the best permutation
        :rtype List: len=2

        """
        score_perms = (
            torch.stack(
                [torch.sum(loss[loss_perm_idx]) for loss_perm_idx in self.loss_perm_idx]
            )
            / self.num_spkrs
        )
        perm_loss, min_idx = torch.min(score_perms, 0)
        permutation = self.perm_choices[min_idx]
        return perm_loss, permutation

    def pit_process(self, losses):
        """Compute the PIT loss for a batch.

        :param torch.Tensor losses: losses (B, 1|4|9)
        :return minimum losses of a batch with best permutation
        :rtype torch.Tensor (B)
        :return the best permutation
        :rtype torch.LongTensor (B, 1|2|3)

        """
        bs = losses.size(0)
        ret = [self.min_pit_sample(losses[i]) for i in range(bs)]

        loss_perm = torch.stack([r[0] for r in ret], dim=0).to(losses.device)  # (B)
        permutation = torch.tensor([r[1] for r in ret]).long().to(losses.device)
        return torch.mean(loss_perm), permutation

    def permutationDFS(self, source, start):
        """Get permutations with DFS.

           The final result is all permutations of the 'source' sequence.
           e.g. [[1, 2], [2, 1]] or
                [[1, 2, 3], [1, 3, 2], [2, 1, 3], [2, 3, 1], [3, 2, 1], [3, 1, 2]]

        :param np.ndarray source: (num_spkrs, 1), e.g. [1, 2, ..., N]
        :param int start: the start point to permute

        """
        if start == len(source) - 1:  # reach final state
            self.perm_choices.append(source.tolist())
        for i in range(start, len(source)):
            # swap values at position start and i
            source[start], source[i] = source[i], source[start]
            self.permutationDFS(source, start + 1)
            # reverse the swap
            source[start], source[i] = source[i], source[start]


class E2E(ASRInterface, torch.nn.Module):
    """E2E module.

    :param int idim: dimension of inputs
    :param int odim: dimension of outputs
    :param Namespace args: argument Namespace containing options
    """

    @staticmethod
    def add_arguments(parser):
        """Register encoder, multi-speaker encoder, attention and decoder options."""
        registrars = (
            E2EASR.encoder_add_arguments,
            E2E.encoder_mix_add_arguments,
            E2EASR.attention_add_arguments,
            E2EASR.decoder_add_arguments,
        )
        for register in registrars:
            register(parser)
        return parser

    @staticmethod
    def encoder_mix_add_arguments(parser):
        """Add arguments for multi-speaker encoder."""
        group = parser.add_argument_group("E2E encoder setting for multi-speaker")
        # asr-mix encoder
        group.add_argument(
            "--spa",
            action="store_true",
            help="Enable speaker parallel attention "
            "for multi-speaker speech recognition task.",
        )
        group.add_argument(
            "--elayers-sd",
            default=4,
            type=int,
            help="Number of speaker differentiate encoder layers"
            "for multi-speaker speech recognition task.",
        )
        return parser

    def get_total_subsampling_factor(self):
        """Return the encoder's conv factor times the product of ``subsample``."""
        conv_factor = self.enc.conv_subsampling_factor
        frame_factor = int(np.prod(self.subsample))
        return conv_factor * frame_factor

    def __init__(self, idim, odim, args):
        """Build the multi-speaker E2E network from the parsed options.

        :param int idim: dimension of inputs
        :param int odim: dimension of outputs
        :param Namespace args: argument Namespace containing options
        """
        super(E2E, self).__init__()
        torch.nn.Module.__init__(self)

        # --- plain configuration values -------------------------------
        self.mtlalpha = args.mtlalpha
        assert 0.0 <= self.mtlalpha <= 1.0, "mtlalpha should be [0.0, 1.0]"
        self.etype = args.etype
        self.verbose = args.verbose
        # NOTE: for self.build method
        args.char_list = getattr(args, "char_list", None)
        self.char_list = args.char_list
        self.outdir = args.outdir
        self.space = args.sym_space
        self.blank = args.sym_blank
        self.reporter = Reporter()
        self.num_spkrs = args.num_spkrs
        self.spa = args.spa
        self.pit = PIT(self.num_spkrs)

        # the last output id doubles as both sos and eos
        self.sos = self.eos = odim - 1

        # subsample info
        self.subsample = get_subsample(args, mode="asr", arch="rnn_mix")

        # label smoothing is only set up when a type is requested and the
        # training json actually exists
        labeldist = None
        if args.lsm_type and os.path.isfile(args.train_json):
            logging.info("Use label smoothing with " + args.lsm_type)
            labeldist = label_smoothing_dist(
                odim, args.lsm_type, transcript=args.train_json
            )

        # --- sub-modules (registration order is significant) ----------
        # getattr keeps compatibility with option sets lacking use_frontend
        if not getattr(args, "use_frontend", False):
            self.frontend = None
        else:
            self.frontend = frontend_for(args, idim)
            self.feature_transform = feature_transform_for(args, (idim - 1) * 2)
            # the encoder then consumes n_mels-dimensional features
            idim = args.n_mels

        self.enc = encoder_for(args, idim, self.subsample)
        self.ctc = ctc_for(args, odim, reduce=False)
        # one attention per speaker with speaker parallel attention, else one
        self.att = att_for(args, self.num_spkrs if args.spa else 1)
        self.dec = decoder_for(args, odim, self.sos, self.eos, self.att, labeldist)

        # weight initialization
        self.init_like_chainer()

        # --- beam-search options used for CER/WER reporting ------------
        self.report_cer = False
        self.report_wer = False
        if "report_cer" in vars(args) and (args.report_cer or args.report_wer):
            self.recog_args = argparse.Namespace(
                beam_size=args.beam_size,
                penalty=args.penalty,
                ctc_weight=args.ctc_weight,
                maxlenratio=args.maxlenratio,
                minlenratio=args.minlenratio,
                lm_weight=args.lm_weight,
                rnnlm=args.rnnlm,
                nbest=args.nbest,
                space=args.sym_space,
                blank=args.sym_blank,
            )
            self.report_cer = args.report_cer
            self.report_wer = args.report_wer
        self.rnnlm = None

        self.logzero = -1e10
        self.loss = None
        self.acc = None

    def init_like_chainer(self):
        """Initialize the weights the way chainer does.

        chainer basically uses LeCun way: W ~ Normal(0, fan_in ** -0.5), b = 0
        pytorch basically uses W, b ~ Uniform(-fan_in**-0.5, fan_in**-0.5)

        Two exceptions are applied on top of the LeCun initialization:
        - EmbedID.W ~ Normal(0, 1)
        - LSTM.upward.b[forget_gate_range] = 1 (but not used in NStepLSTM)
        """
        lecun_normal_init_parameters(self)

        # exception 1: embed weight ~ Normal(0, 1)
        self.dec.embed.weight.data.normal_(0, 1)

        # exception 2: forget-bias = 1.0 for every decoder layer
        # https://discuss.pytorch.org/t/set-forget-gate-bias-of-lstm/1745
        decoder_layers = self.dec.decoder
        for layer_idx in range(len(decoder_layers)):
            set_forget_bias_to_one(decoder_layers[layer_idx].bias_ih)

    def forward(self, xs_pad, ilens, ys_pad):
        """E2E forward.

        Runs the optional frontend and the encoder, computes the CTC loss
        (with permutation invariant training for list-valued encoder output)
        and the attention loss, optionally evaluates CER/WER, reports the
        statistics through ``self.reporter`` and returns the combined loss.

        :param torch.Tensor xs_pad: batch of padded input sequences (B, Tmax, idim)
        :param torch.Tensor ilens: batch of lengths of input sequences (B)
        :param torch.Tensor ys_pad:
            batch of padded character id sequence tensor (B, num_spkrs, Lmax)
        :return: loss value; the attention loss when ``mtlalpha == 0``, the
            CTC loss when ``mtlalpha == 1``, otherwise
            ``mtlalpha * loss_ctc + (1 - mtlalpha) * loss_att``
        :rtype: torch.Tensor
        """
        # 0. Frontend
        if self.frontend is not None:
            hs_pad, hlens, mask = self.frontend(to_torch_tensor(xs_pad), ilens)
            if isinstance(hs_pad, list):
                hlens_n = [None] * self.num_spkrs
                for i in range(self.num_spkrs):
                    hs_pad[i], hlens_n[i] = self.feature_transform(hs_pad[i], hlens)
                hlens = hlens_n
            else:
                hs_pad, hlens = self.feature_transform(hs_pad, hlens)
        else:
            hs_pad, hlens = xs_pad, ilens

        # 1. Encoder
        if not isinstance(
            hs_pad, list
        ):  # single-channel input xs_pad (single- or multi-speaker)
            hs_pad, hlens, _ = self.enc(hs_pad, hlens)
        else:  # multi-channel multi-speaker input xs_pad
            for i in range(self.num_spkrs):
                hs_pad[i], hlens[i], _ = self.enc(hs_pad[i], hlens[i])

        # 2. CTC loss
        if self.mtlalpha == 0:
            loss_ctc, min_perm = None, None
        else:
            if not isinstance(hs_pad, list):  # single-speaker input xs_pad
                loss_ctc = torch.mean(self.ctc(hs_pad, hlens, ys_pad))
            else:  # multi-speaker input xs_pad
                ys_pad = ys_pad.transpose(0, 1)  # (num_spkrs, B, Lmax)
                # one loss per (hypothesis stream, reference) pair, laid out
                # row-major: index i -> stream i // n, reference i % n
                loss_ctc_perm = torch.stack(
                    [
                        self.ctc(
                            hs_pad[i // self.num_spkrs],
                            hlens[i // self.num_spkrs],
                            ys_pad[i % self.num_spkrs],
                        )
                        for i in range(self.num_spkrs ** 2)
                    ],
                    dim=1,
                )  # (B, num_spkrs^2)
                loss_ctc, min_perm = self.pit.pit_process(loss_ctc_perm)
                logging.info("ctc loss:" + str(float(loss_ctc)))

        # 3. attention loss
        if self.mtlalpha == 1:
            loss_att = None
            acc = None
        else:
            if not isinstance(hs_pad, list):  # single-speaker input xs_pad
                loss_att, acc, _ = self.dec(hs_pad, hlens, ys_pad)
            else:
                # NOTE(review): ``min_perm`` (and the transpose of ys_pad) are
                # only produced by the CTC branch above, so this path
                # presumably requires mtlalpha > 0 -- confirm with callers.
                # Reorder the references of every sample in place according to
                # the permutation chosen by PIT.
                for i in range(ys_pad.size(1)):  # B
                    ys_pad[:, i] = ys_pad[min_perm[i], i]
                rslt = [
                    self.dec(hs_pad[i], hlens[i], ys_pad[i], strm_idx=i)
                    for i in range(self.num_spkrs)
                ]
                loss_att = sum([r[0] for r in rslt]) / float(len(rslt))
                acc = sum([r[1] for r in rslt]) / float(len(rslt))
        self.acc = acc

        # 4. compute cer without beam search
        if self.mtlalpha == 0 or self.char_list is None:
            cer_ctc = None
        else:
            cers = []
            for ns in range(self.num_spkrs):
                y_hats = self.ctc.argmax(hs_pad[ns]).data
                for i, y in enumerate(y_hats):
                    # collapse runs of identical consecutive ids
                    y_hat = [x[0] for x in groupby(y)]
                    y_true = ys_pad[ns][i]

                    # id -1 is skipped when mapping ids to characters
                    seq_hat = [
                        self.char_list[int(idx)] for idx in y_hat if int(idx) != -1
                    ]
                    seq_true = [
                        self.char_list[int(idx)] for idx in y_true if int(idx) != -1
                    ]
                    seq_hat_text = "".join(seq_hat).replace(self.space, " ")
                    seq_hat_text = seq_hat_text.replace(self.blank, "")
                    seq_true_text = "".join(seq_true).replace(self.space, " ")

                    hyp_chars = seq_hat_text.replace(" ", "")
                    ref_chars = seq_true_text.replace(" ", "")
                    if len(ref_chars) > 0:
                        cers.append(
                            editdistance.eval(hyp_chars, ref_chars) / len(ref_chars)
                        )

            cer_ctc = sum(cers) / len(cers) if cers else None

        # 5. compute cer/wer
        if (
            self.training
            or not (self.report_cer or self.report_wer)
            or not isinstance(hs_pad, list)
        ):
            cer, wer = 0.0, 0.0
        else:
            if self.recog_args.ctc_weight > 0.0:
                lpz = [
                    self.ctc.log_softmax(hs_pad[i]).data for i in range(self.num_spkrs)
                ]
            else:
                # one placeholder per speaker so ``lpz[i]`` below stays valid
                # when no CTC scores are used (indexing ``None`` would raise)
                lpz = [None] * self.num_spkrs

            word_eds, char_eds, word_ref_lens, char_ref_lens = [], [], [], []
            nbest_hyps = [
                self.dec.recognize_beam_batch(
                    hs_pad[i],
                    torch.tensor(hlens[i]),
                    lpz[i],
                    self.recog_args,
                    self.char_list,
                    self.rnnlm,
                    strm_idx=i,
                )
                for i in range(self.num_spkrs)
            ]
            # remove <sos> and <eos>
            y_hats = [
                [nbest_hyp[0]["yseq"][1:-1] for nbest_hyp in nbest_hyps[i]]
                for i in range(self.num_spkrs)
            ]
            for i in range(len(y_hats[0])):
                hyp_words = []
                hyp_chars = []
                ref_words = []
                ref_chars = []
                for ns in range(self.num_spkrs):
                    y_hat = y_hats[ns][i]
                    y_true = ys_pad[ns][i]

                    seq_hat = [
                        self.char_list[int(idx)] for idx in y_hat if int(idx) != -1
                    ]
                    seq_true = [
                        self.char_list[int(idx)] for idx in y_true if int(idx) != -1
                    ]
                    seq_hat_text = "".join(seq_hat).replace(self.recog_args.space, " ")
                    seq_hat_text = seq_hat_text.replace(self.recog_args.blank, "")
                    seq_true_text = "".join(seq_true).replace(
                        self.recog_args.space, " "
                    )

                    hyp_words.append(seq_hat_text.split())
                    ref_words.append(seq_true_text.split())
                    hyp_chars.append(seq_hat_text.replace(" ", ""))
                    ref_chars.append(seq_true_text.replace(" ", ""))

                # edit distance of every (hypothesis, reference) pairing
                tmp_word_ed = [
                    editdistance.eval(
                        hyp_words[ns // self.num_spkrs], ref_words[ns % self.num_spkrs]
                    )
                    for ns in range(self.num_spkrs ** 2)
                ]  # h1r1,h1r2,h2r1,h2r2
                tmp_char_ed = [
                    editdistance.eval(
                        hyp_chars[ns // self.num_spkrs], ref_chars[ns % self.num_spkrs]
                    )
                    for ns in range(self.num_spkrs ** 2)
                ]  # h1r1,h1r2,h2r1,h2r2

                # keep the score of the best speaker permutation
                word_eds.append(self.pit.min_pit_sample(torch.tensor(tmp_word_ed))[0])
                word_ref_lens.append(len(sum(ref_words, [])))
                char_eds.append(self.pit.min_pit_sample(torch.tensor(tmp_char_ed))[0])
                char_ref_lens.append(len("".join(ref_chars)))

            wer = (
                0.0
                if not self.report_wer
                else float(sum(word_eds)) / sum(word_ref_lens)
            )
            cer = (
                0.0
                if not self.report_cer
                else float(sum(char_eds)) / sum(char_ref_lens)
            )

        # combine the two losses according to mtlalpha
        alpha = self.mtlalpha
        if alpha == 0:
            self.loss = loss_att
            loss_att_data = float(loss_att)
            loss_ctc_data = None
        elif alpha == 1:
            self.loss = loss_ctc
            loss_att_data = None
            loss_ctc_data = float(loss_ctc)
        else:
            self.loss = alpha * loss_ctc + (1 - alpha) * loss_att
            loss_att_data = float(loss_att)
            loss_ctc_data = float(loss_ctc)

        # only report sane loss values; the loss is returned either way
        loss_data = float(self.loss)
        if loss_data < CTC_LOSS_THRESHOLD and not math.isnan(loss_data):
            self.reporter.report(
                loss_ctc_data, loss_att_data, self.acc, cer_ctc, cer, wer, loss_data
            )
        else:
            logging.warning("loss (=%f) is not correct", loss_data)
        return self.loss

    def recognize(self, x, recog_args, char_list, rnnlm=None):
        """E2E beam search.

        Switches the model to eval mode for decoding and restores training
        mode afterwards if it was active on entry.

        :param ndarray x: input acoustic feature (T, D)
        :param Namespace recog_args: argument Namespace containing options
        :param list char_list: list of characters
        :param torch.nn.Module rnnlm: language model module
        :return: N-best decoding results, one entry per speaker
        :rtype: list
        """
        prev = self.training
        self.eval()

        # subsample frame
        x = x[:: self.subsample[0], :]
        # the length must describe the subsampled sequence that is actually
        # fed to the frontend/encoder, so it is measured after subsampling
        ilens = [x.shape[0]]
        h = to_device(self, to_torch_tensor(x).float())
        # make a utt list (1) to use the same interface for encoder
        hs = h.contiguous().unsqueeze(0)

        # 0. Frontend
        if self.frontend is not None:
            hs, hlens, mask = self.frontend(hs, ilens)
            hlens_n = [None] * self.num_spkrs
            for i in range(self.num_spkrs):
                hs[i], hlens_n[i] = self.feature_transform(hs[i], hlens)
            hlens = hlens_n
        else:
            hlens = ilens

        # 1. Encoder
        if not isinstance(hs, list):  # single-channel multi-speaker input x
            hs, hlens, _ = self.enc(hs, hlens)
        else:  # multi-channel multi-speaker input x
            for i in range(self.num_spkrs):
                hs[i], hlens[i], _ = self.enc(hs[i], hlens[i])

        # calculate log P(z_t|X) for CTC scores
        if recog_args.ctc_weight > 0.0:
            lpz = [self.ctc.log_softmax(i)[0] for i in hs]
        else:
            # one placeholder per speaker so ``lpz[i]`` below stays valid when
            # no CTC scores are used (indexing ``None`` would raise)
            lpz = [None] * self.num_spkrs

        # 2. decoder
        # decode the first utterance
        y = [
            self.dec.recognize_beam(
                hs[i][0], lpz[i], recog_args, char_list, rnnlm, strm_idx=i
            )
            for i in range(self.num_spkrs)
        ]

        if prev:
            self.train()
        return y

    def recognize_batch(self, xs, recog_args, char_list, rnnlm=None):
        """E2E batch beam search.

        The module is switched to eval mode for decoding and is put back into
        training mode before returning if it was training on entry.

        :param list xs: input acoustic features, one array per utterance;
            the first axis of each array is taken as its length
        :param Namespace recog_args: argument Namespace containing options
        :param list char_list: list of characters
        :param torch.nn.Module rnnlm: language model module
        :return: N-best decoding results, one entry per speaker
        :rtype: list
        """
        prev = self.training
        self.eval()
        ilens = np.fromiter((xx.shape[0] for xx in xs), dtype=np.int64)

        # subsample frame
        xs = [xx[:: self.subsample[0], :] for xx in xs]
        xs = [to_device(self, to_torch_tensor(xx).float()) for xx in xs]
        xs_pad = pad_list(xs, 0.0)

        # 0. Frontend
        if self.frontend is not None:
            hs_pad, hlens, mask = self.frontend(xs_pad, ilens)
            hlens_n = [None] * self.num_spkrs
            for i in range(self.num_spkrs):
                hs_pad[i], hlens_n[i] = self.feature_transform(hs_pad[i], hlens)
            hlens = hlens_n
        else:
            hs_pad, hlens = xs_pad, ilens

        # 1. Encoder
        if not isinstance(hs_pad, list):  # single-channel multi-speaker input x
            hs_pad, hlens, _ = self.enc(hs_pad, hlens)
        else:  # multi-channel multi-speaker input x
            for i in range(self.num_spkrs):
                hs_pad[i], hlens[i], _ = self.enc(hs_pad[i], hlens[i])

        # calculate log P(z_t|X) for CTC scores
        if recog_args.ctc_weight > 0.0:
            lpz = [self.ctc.log_softmax(hs_pad[i]) for i in range(self.num_spkrs)]
            normalize_score = False
        else:
            # The decoder call below indexes lpz per speaker, so keep one (empty)
            # slot per speaker instead of a bare None that cannot be indexed.
            lpz = [None] * self.num_spkrs
            normalize_score = True

        # 2. decoder
        y = [
            self.dec.recognize_beam_batch(
                hs_pad[i],
                hlens[i],
                lpz[i],
                recog_args,
                char_list,
                rnnlm,
                normalize_score=normalize_score,
                strm_idx=i,
            )
            for i in range(self.num_spkrs)
        ]

        if prev:
            self.train()
        return y

    def enhance(self, xs):
        """Run only the frontend and return its output as numpy arrays.

        :param xs: input acoustic features, one array per utterance
            (documented upstream as (T, C, F) each)
        :return: enhanced features, masks and the input lengths measured
            before frame subsampling; the first two are lists when the
            frontend returns a tuple/list
        :raises RuntimeError: if the model was built without a frontend
        """
        if self.frontend is None:
            raise RuntimeError("Frontend doesn't exist")

        was_training = self.training
        self.eval()

        # lengths are recorded before the frame subsampling applied below
        ilens = np.fromiter((feat.shape[0] for feat in xs), dtype=np.int64)
        tensors = [
            to_device(self, to_torch_tensor(feat[:: self.subsample[0], :]).float())
            for feat in xs
        ]
        enhanced, _, mask = self.frontend(pad_list(tensors, 0.0), ilens)

        if was_training:
            self.train()

        if not isinstance(enhanced, (tuple, list)):
            return enhanced.cpu().numpy(), mask.cpu().numpy(), ilens

        # one entry per stream: convert each tensor in turn
        enhanced = list(enhanced)
        mask = list(mask)
        for k, stream in enumerate(enhanced):
            enhanced[k] = stream.cpu().numpy()
            mask[k] = mask[k].cpu().numpy()
        return enhanced, mask, ilens

    def calculate_all_attentions(self, xs_pad, ilens, ys_pad):
        """E2E attention calculation.

        Runs the frontend (if any) and the encoder without gradients, picks the
        label permutation with the PIT/CTC criterion, reorders the references
        accordingly and asks the decoder for its attention weights.

        :param torch.Tensor xs_pad: batch of padded input sequences (B, Tmax, idim)
        :param torch.Tensor ilens: batch of lengths of input sequences (B)
        :param torch.Tensor ys_pad:
            batch of padded character id sequence tensor (B, num_spkrs, Lmax)
        :return: list with one entry per speaker; each entry is whatever the
            decoder returns, documented upstream as
            1) multi-head case => attention weights (B, H, Lmax, Tmax),
            2) other case => attention weights (B, Lmax, Tmax).
        :rtype: list
        """
        with torch.no_grad():
            # 0. Frontend
            if self.frontend is not None:
                hs_pad, hlens, mask = self.frontend(to_torch_tensor(xs_pad), ilens)
                # the frontend output is indexed per speaker below
                hlens_n = [None] * self.num_spkrs
                for i in range(self.num_spkrs):
                    hs_pad[i], hlens_n[i] = self.feature_transform(hs_pad[i], hlens)
                hlens = hlens_n
            else:
                hs_pad, hlens = xs_pad, ilens

            # 1. Encoder
            if not isinstance(hs_pad, list):  # single-channel multi-speaker input x
                hs_pad, hlens, _ = self.enc(hs_pad, hlens)
            else:  # multi-channel multi-speaker input x
                for i in range(self.num_spkrs):
                    hs_pad[i], hlens[i], _ = self.enc(hs_pad[i], hlens[i])

            # Permutation
            # NOTE(review): transpose presumably returns a view, in which case the
            # in-place reordering below also rewrites the caller's ys_pad - confirm.
            ys_pad = ys_pad.transpose(0, 1)  # (num_spkrs, B, Lmax)
            if self.num_spkrs <= 3:
                # CTC loss of every (encoder stream, reference) pairing:
                # i // num_spkrs selects the stream, i % num_spkrs the reference.
                loss_ctc = torch.stack(
                    [
                        self.ctc(
                            hs_pad[i // self.num_spkrs],
                            hlens[i // self.num_spkrs],
                            ys_pad[i % self.num_spkrs],
                        )
                        for i in range(self.num_spkrs ** 2)
                    ],
                    1,
                )  # (B, num_spkrs^2)
                # only min_perm is used afterwards; the loss value is discarded
                loss_ctc, min_perm = self.pit.pit_process(loss_ctc)
            # NOTE(review): min_perm is only bound inside the branch above, so
            # num_spkrs > 3 reaches this loop with min_perm undefined (NameError).
            for i in range(ys_pad.size(1)):  # B
                ys_pad[:, i] = ys_pad[min_perm[i], i]

            # 2. Decoder
            att_ws = [
                self.dec.calculate_all_attentions(
                    hs_pad[i], hlens[i], ys_pad[i], strm_idx=i
                )
                for i in range(self.num_spkrs)
            ]

        return att_ws


class EncoderMix(torch.nn.Module):
    """Encoder module for the case of multi-speaker mixture speech.

    The encoder has three stages: a VGG block applied to the mixture
    (``enc_mix``), one RNNP stack per speaker (``enc_sd``) and an RNNP stack
    applied to every speaker stream (``enc_rec``). Only ``vgg...p`` encoder
    types are accepted; any other name logs an error and exits.

    :param str etype: type of encoder network
    :param int idim: number of dimensions of encoder network
    :param int elayers_sd:
        number of layers of speaker differentiate part in encoder network
    :param int elayers_rec:
        number of layers of shared recognition part in encoder network
    :param int eunits: number of lstm units of encoder network
    :param int eprojs: number of projection units of encoder network
    :param np.ndarray subsample: list of subsampling numbers
    :param float dropout: dropout rate
    :param int in_channel: number of input channels
    :param int num_spkrs: number of speakers
    """

    def __init__(
        self,
        etype,
        idim,
        elayers_sd,
        elayers_rec,
        eunits,
        eprojs,
        subsample,
        dropout,
        num_spkrs=2,
        in_channel=1,
    ):
        """Initialize the encoder of single-channel multi-speaker ASR."""
        super(EncoderMix, self).__init__()
        # Remove the literal "vgg" prefix. str.lstrip("vgg") strips a *set* of
        # characters, so it also ate the leading "g" of "gru"
        # ("vgggrup" -> "ru") and triggered a bogus error message.
        typ = etype[len("vgg") :] if etype.startswith("vgg") else etype
        typ = typ.rstrip("p")
        if typ not in ["lstm", "gru", "blstm", "bgru"]:
            logging.error("Error: need to specify an appropriate encoder architecture")

        # Only the VGG + projected-RNN family is implemented for mixtures.
        if not (etype.startswith("vgg") and etype[-1] == "p"):
            logging.error(
                f"Error: need to specify an appropriate encoder architecture. "
                f"Illegal name {etype}"
            )
            sys.exit()

        # mixture encoder
        self.enc_mix = torch.nn.ModuleList([VGG2L(in_channel)])
        # one speaker-differentiating stack per speaker
        self.enc_sd = torch.nn.ModuleList(
            [
                torch.nn.ModuleList(
                    [
                        RNNP(
                            get_vgg2l_odim(idim, in_channel=in_channel),
                            elayers_sd,
                            eunits,
                            eprojs,
                            subsample[: elayers_sd + 1],
                            dropout,
                            typ=typ,
                        )
                    ]
                )
                for _ in range(num_spkrs)
            ]
        )
        # recognition stack, reused for every speaker stream in forward()
        self.enc_rec = torch.nn.ModuleList(
            [
                RNNP(
                    eprojs,
                    elayers_rec,
                    eunits,
                    eprojs,
                    subsample[elayers_sd:],
                    dropout,
                    typ=typ,
                )
            ]
        )
        # typ already carries the "b" of bidirectional types, so do not add one
        logging.info("Use CNN-VGG + " + typ.upper() + "P for encoder")

        self.num_spkrs = num_spkrs

    def forward(self, xs_pad, ilens):
        """Encodermix forward.

        :param torch.Tensor xs_pad: batch of padded input sequences (B, Tmax, D)
        :param torch.Tensor ilens: batch of lengths of input sequences (B)
        :return: tuple of
            the per-speaker hidden state sequences with padded frames zeroed
            [num_spkrs x (B, Tmax, eprojs)],
            the per-speaker output lengths [num_spkrs x (B)],
            and None (placeholder third element)
        :rtype: tuple
        """
        # mixture encoder
        for module in self.enc_mix:
            xs_pad, ilens, _ = module(xs_pad, ilens)

        # SD and Rec encoder: every speaker starts from the same mixture encoding
        xs_pad_sd = [xs_pad for _ in range(self.num_spkrs)]
        ilens_sd = [ilens for _ in range(self.num_spkrs)]
        for ns in range(self.num_spkrs):
            # Encoder_SD: speaker differentiate encoder
            for module in self.enc_sd[ns]:
                xs_pad_sd[ns], ilens_sd[ns], _ = module(xs_pad_sd[ns], ilens_sd[ns])
            # Encoder_Rec: recognition encoder
            for module in self.enc_rec:
                xs_pad_sd[ns], ilens_sd[ns], _ = module(xs_pad_sd[ns], ilens_sd[ns])

        # make mask to remove bias value in padded part
        # (the first speaker's lengths are used for every stream)
        mask = to_device(xs_pad, make_pad_mask(ilens_sd[0]).unsqueeze(-1))

        return [x.masked_fill(mask, 0.0) for x in xs_pad_sd], ilens_sd, None


def encoder_for(args, idim, subsample):
    """Build the encoder that matches the training options.

    With a frontend the mixture is already separated into per-speaker streams,
    so the single-speaker encoder factory is used; otherwise an ``EncoderMix``
    is created from the options in ``args``.
    """
    # getattr keeps compatibility with option sets that lack use_frontend
    has_frontend = getattr(args, "use_frontend", False)
    if has_frontend:
        return encoder_for_single(args, idim, subsample)

    return EncoderMix(
        etype=args.etype,
        idim=idim,
        elayers_sd=args.elayers_sd,
        elayers_rec=args.elayers,
        eunits=args.eunits,
        eprojs=args.eprojs,
        subsample=subsample,
        dropout=args.dropout_rate,
        num_spkrs=args.num_spkrs,
    )


================================================
FILE: nets/pytorch_backend/e2e_asr_mix_transformer.py
================================================
#!/usr/bin/env python3
# encoding: utf-8

# Copyright 2020 Johns Hopkins University (Xuankai Chang)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""
Transformer speech recognition model for single-channel multi-speaker mixture speech.

It is a fusion of `e2e_asr_mix.py` and `e2e_asr_transformer.py`. Refer to:
    https://arxiv.org/pdf/2002.03921.pdf
1. The Transformer-based Encoder now consists of three stages:
     (a): Enc_mix: encoding input mixture speech;
     (b): Enc_SD: separating mixed speech representations;
     (c): Enc_rec: transforming each separated speech representation.
2. PIT is used in CTC to determine the permutation with minimum loss.
"""

from argparse import Namespace
import logging
import math

import numpy
import torch

from espnet.nets.asr_interface import ASRInterface
from espnet.nets.ctc_prefix_score import CTCPrefixScore
from espnet.nets.e2e_asr_common import end_detect
from espnet.nets.pytorch_backend.ctc import CTC
from espnet.nets.pytorch_backend.e2e_asr import CTC_LOSS_THRESHOLD
from espnet.nets.pytorch_backend.e2e_asr_mix import E2E as E2EASRMIX
from espnet.nets.pytorch_backend.e2e_asr_mix import PIT
from espnet.nets.pytorch_backend.e2e_asr_transformer import E2E as E2EASR
from espnet.nets.pytorch_backend.nets_utils import make_non_pad_mask
from espnet.nets.pytorch_backend.nets_utils import th_accuracy
from espnet.nets.pytorch_backend.rnn.decoders import CTC_SCORING_RATIO
from espnet.nets.pytorch_backend.transformer.add_sos_eos import add_sos_eos
from espnet.nets.pytorch_backend.transformer.encoder_mix import EncoderMix
from espnet.nets.pytorch_backend.transformer.mask import subsequent_mask
from espnet.nets.pytorch_backend.transformer.mask import target_mask


class E2E(E2EASR, ASRInterface, torch.nn.Module):
    """E2E module.

    Transformer ASR model for mixture speech. After the parent constructor has
    run, the encoder is replaced by an ``EncoderMix`` that yields one stream per
    speaker, and permutation invariant training (PIT) on the CTC loss decides
    which reference transcript is paired with which stream.

    :param int idim: dimension of inputs
    :param int odim: dimension of outputs
    :param Namespace args: argument Namespace containing options
    :param int ignore_id: label id forwarded to the parent constructor
    """

    @staticmethod
    def add_arguments(parser):
        """Add arguments."""
        E2EASR.add_arguments(parser)
        E2EASRMIX.encoder_mix_add_arguments(parser)
        return parser

    def __init__(self, idim, odim, args, ignore_id=-1):
        """Construct an E2E object.

        :param int idim: dimension of inputs
        :param int odim: dimension of outputs
        :param Namespace args: argument Namespace containing options
        :param int ignore_id: label id forwarded to the parent constructor
        """
        # Forward the caller's ignore_id; it used to be replaced by a literal -1.
        super(E2E, self).__init__(idim, odim, args, ignore_id=ignore_id)
        if args.transformer_attn_dropout_rate is None:
            args.transformer_attn_dropout_rate = args.dropout_rate
        self.encoder = EncoderMix(
            idim=idim,
            attention_dim=args.adim,
            attention_heads=args.aheads,
            linear_units=args.eunits,
            num_blocks_sd=args.elayers_sd,
            num_blocks_rec=args.elayers,
            input_layer=args.transformer_input_layer,
            dropout_rate=args.dropout_rate,
            positional_dropout_rate=args.dropout_rate,
            attention_dropout_rate=args.transformer_attn_dropout_rate,
            num_spkrs=args.num_spkrs,
        )

        if args.mtlalpha > 0.0:
            # reduce=False is passed so that forward() can stack per-pairing
            # losses for PIT
            self.ctc = CTC(
                odim, args.adim, args.dropout_rate, ctc_type=args.ctc_type, reduce=False
            )
        else:
            self.ctc = None

        self.num_spkrs = args.num_spkrs
        self.pit = PIT(self.num_spkrs)

    @staticmethod
    def _length_weighted_mean(values, weights):
        """Return the weighted mean of values, or None if any value is None."""
        if any(value is None for value in values):
            return None
        return sum(v * w for v, w in zip(values, weights)) / sum(weights)

    def forward(self, xs_pad, ilens, ys_pad):
        """E2E forward.

        CTC/PIT is mandatory here: the method asserts ``mtlalpha > 0``. The
        statistics (CTC loss, attention loss, accuracy, error rates) are sent
        to ``self.reporter``; only the combined loss is returned.

        :param torch.Tensor xs_pad: batch of padded source sequences (B, Tmax, idim)
        :param torch.Tensor ilens: batch of lengths of source sequences (B)
        :param torch.Tensor ys_pad: batch of padded target sequences
                                    (B, num_spkrs, Lmax)
        :return: loss value (CTC and attention losses combined by mtlalpha)
        :rtype: torch.Tensor
        """
        # 1. forward encoder
        xs_pad = xs_pad[:, : max(ilens)]  # for data parallel
        src_mask = make_non_pad_mask(ilens.tolist()).to(xs_pad.device).unsqueeze(-2)
        hs_pad, hs_mask = self.encoder(xs_pad, src_mask)  # list: speaker differentiate
        self.hs_pad = hs_pad

        # 2. ctc
        # TODO(karita) show predicted text
        # TODO(karita) calculate these stats
        cer_ctc = None
        assert self.mtlalpha > 0.0
        batch_size = xs_pad.size(0)
        ys_pad = ys_pad.transpose(0, 1)  # (num_spkrs, B, Lmax)
        hs_len = [hs_mask[i].view(batch_size, -1).sum(1) for i in range(self.num_spkrs)]
        # CTC loss of every (encoder stream, reference) pairing:
        # i // num_spkrs selects the stream, i % num_spkrs the reference.
        loss_ctc_perm = torch.stack(
            [
                self.ctc(
                    hs_pad[i // self.num_spkrs].view(batch_size, -1, self.adim),
                    hs_len[i // self.num_spkrs],
                    ys_pad[i % self.num_spkrs],
                )
                for i in range(self.num_spkrs ** 2)
            ],
            dim=1,
        )  # (B, num_spkrs^2)
        loss_ctc, min_perm = self.pit.pit_process(loss_ctc_perm)
        logging.info("ctc loss:" + str(float(loss_ctc)))

        # Permute the labels according to loss
        for b in range(batch_size):  # B
            ys_pad[:, b] = ys_pad[min_perm[b], b]  # (num_spkrs, B, Lmax)
        # number of non-ignored labels per speaker, used as averaging weights
        ys_out_len = [
            float(torch.sum(ys_pad[i] != self.ignore_id)) for i in range(self.num_spkrs)
        ]

        # TODO(karita) show predicted text
        # TODO(karita) calculate these stats
        if self.error_calculator is not None:
            cer_ctc = []
            for i in range(self.num_spkrs):
                ys_hat = self.ctc.argmax(hs_pad[i].view(batch_size, -1, self.adim)).data
                cer_ctc.append(
                    self.error_calculator(ys_hat.cpu(), ys_pad[i].cpu(), is_ctc=True)
                )
            cer_ctc = sum(map(lambda x: x[0] * x[1], zip(cer_ctc, ys_out_len))) / sum(
                ys_out_len
            )
        else:
            cer_ctc = None

        # 3. forward decoder
        if self.mtlalpha == 1.0:
            loss_att, self.acc, cer, wer = None, None, None, None
        else:
            pred_pad, pred_mask = [None] * self.num_spkrs, [None] * self.num_spkrs
            loss_att, acc = [None] * self.num_spkrs, [None] * self.num_spkrs
            for i in range(self.num_spkrs):
                (
                    pred_pad[i],
                    pred_mask[i],
                    loss_att[i],
                    acc[i],
                ) = self.decoder_and_attention(
                    hs_pad[i], hs_mask[i], ys_pad[i], batch_size
                )

            # 4. compute attention loss
            # The following is just an approximation
            loss_att = sum(map(lambda x: x[0] * x[1], zip(loss_att, ys_out_len))) / sum(
                ys_out_len
            )
            self.acc = sum(map(lambda x: x[0] * x[1], zip(acc, ys_out_len))) / sum(
                ys_out_len
            )

            # 5. compute cer/wer
            if self.training or self.error_calculator is None:
                cer, wer = None, None
            else:
                # pred_pad is a per-speaker list (it has no argmax of its own), so
                # score every stream against its permuted reference and average
                # with the same length weights as the losses above.
                cers, wers = [], []
                for i in range(self.num_spkrs):
                    ys_hat = pred_pad[i].argmax(dim=-1)
                    cer_i, wer_i = self.error_calculator(
                        ys_hat.cpu(), ys_pad[i].cpu()
                    )
                    cers.append(cer_i)
                    wers.append(wer_i)
                # defensive: a metric that is missing for a speaker stays None
                cer = self._length_weighted_mean(cers, ys_out_len)
                wer = self._length_weighted_mean(wers, ys_out_len)

        # copied from e2e_asr
        # (the alpha == 0 branch cannot be reached past the assert above)
        alpha = self.mtlalpha
        if alpha == 0:
            self.loss = loss_att
            loss_att_data = float(loss_att)
            loss_ctc_data = None
        elif alpha == 1:
            self.loss = loss_ctc
            loss_att_data = None
            loss_ctc_data = float(loss_ctc)
        else:
            self.loss = alpha * loss_ctc + (1 - alpha) * loss_att
            loss_att_data = float(loss_att)
            loss_ctc_data = float(loss_ctc)

        loss_data = float(self.loss)
        if loss_data < CTC_LOSS_THRESHOLD and not math.isnan(loss_data):
            self.reporter.report(
                loss_ctc_data, loss_att_data, self.acc, cer_ctc, cer, wer, loss_data
            )
        else:
            logging.warning("loss (=%f) is not correct", loss_data)
        return self.loss

    def decoder_and_attention(self, hs_pad, hs_mask, ys_pad, batch_size):
        """Forward decoder and attention loss.

        :param hs_pad: encoder output of one speaker stream
        :param hs_mask: mask belonging to hs_pad
        :param torch.Tensor ys_pad: padded target sequences of that speaker
        :param int batch_size: accepted for interface reasons, not used here
        :return: decoder output, its mask, attention loss and accuracy
        :rtype: tuple
        """
        # forward decoder
        ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, self.ignore_id)
        ys_mask = target_mask(ys_in_pad, self.ignore_id)
        pred_pad, pred_mask = self.decoder(ys_in_pad, ys_mask, hs_pad, hs_mask)

        # compute attention loss
        loss_att = self.criterion(pred_pad, ys_out_pad)
        acc = th_accuracy(
            pred_pad.view(-1, self.odim), ys_out_pad, ignore_label=self.ignore_id
        )
        return pred_pad, pred_mask, loss_att, acc

    def encode(self, x):
        """Encode acoustic features.

        Note that the module is left in eval mode afterwards.

        :param ndarray x: source acoustic feature (T, D)
        :return: encoder outputs (``recognize`` iterates over them per speaker)
        """
        self.eval()
        x = torch.as_tensor(x).unsqueeze(0)
        enc_output, _ = self.encoder(x, None)
        return enc_output

    def recog(self, enc_output, recog_args, char_list=None, rnnlm=None, use_jit=False):
        """Recognize input speech of each speaker.

        Beam search over the attention decoder, optionally combined with CTC
        prefix scores and a language model. If no hypothesis survives, the
        search is repeated with ``minlenratio`` lowered by 0.1.

        :param ndarray enc_output: encoder outputs (B, T, D) or (T, D)
        :param Namespace recog_args: argument Namespace containing options
        :param list char_list: list of characters
        :param torch.nn.Module rnnlm: language model module
        :param bool use_jit: trace the decoder step with torch.jit
        :return: N-best decoding results
        :rtype: list
        """
        if recog_args.ctc_weight > 0.0:
            lpz = self.ctc.log_softmax(enc_output)
            lpz = lpz.squeeze(0)
        else:
            lpz = None

        h = enc_output.squeeze(0)

        logging.info("input lengths: " + str(h.size(0)))
        # search params
        beam = recog_args.beam_size
        penalty = recog_args.penalty
        ctc_weight = recog_args.ctc_weight

        # prepare sos
        y = self.sos
        vy = h.new_zeros(1).long()

        if recog_args.maxlenratio == 0:
            maxlen = h.shape[0]
        else:
            # maxlen >= 1
            maxlen = max(1, int(recog_args.maxlenratio * h.size(0)))
        minlen = int(recog_args.minlenratio * h.size(0))
        logging.info("max output length: " + str(maxlen))
        logging.info("min output length: " + str(minlen))

        # initialize hypothesis
        if rnnlm:
            hyp = {"score": 0.0, "yseq": [y], "rnnlm_prev": None}
        else:
            hyp = {"score": 0.0, "yseq": [y]}
        if lpz is not None:
            ctc_prefix_score = CTCPrefixScore(lpz.detach().numpy(), 0, self.eos, numpy)
            hyp["ctc_state_prev"] = ctc_prefix_score.initial_state()
            hyp["ctc_score_prev"] = 0.0
            if ctc_weight != 1.0:
                # pre-pruning based on attention scores
                ctc_beam = min(lpz.shape[-1], int(beam * CTC_SCORING_RATIO))
            else:
                ctc_beam = lpz.shape[-1]
        hyps = [hyp]
        ended_hyps = []

        traced_decoder = None
        for i in range(maxlen):
            logging.debug("position " + str(i))

            hyps_best_kept = []
            for hyp in hyps:
                vy[0] = hyp["yseq"][i]

                # get nbest local scores and their ids
                ys_mask = subsequent_mask(i + 1).unsqueeze(0)
                ys = torch.tensor(hyp["yseq"]).unsqueeze(0)
                # FIXME: jit does not match non-jit result
                if use_jit:
                    if traced_decoder is None:
                        traced_decoder = torch.jit.trace(
                            self.decoder.forward_one_step, (ys, ys_mask, enc_output)
                        )
                    local_att_scores = traced_decoder(ys, ys_mask, enc_output)[0]
                else:
                    local_att_scores = self.decoder.forward_one_step(
                        ys, ys_mask, enc_output
                    )[0]

                if rnnlm:
                    rnnlm_state, local_lm_scores = rnnlm.predict(hyp["rnnlm_prev"], vy)
                    local_scores = (
                        local_att_scores + recog_args.lm_weight * local_lm_scores
                    )
                else:
                    local_scores = local_att_scores

                if lpz is not None:
                    # CTC prefix scores are only computed for the ctc_beam best
                    # attention candidates, then the joint top `beam` is taken
                    local_best_scores, local_best_ids = torch.topk(
                        local_att_scores, ctc_beam, dim=1
                    )
                    ctc_scores, ctc_states = ctc_prefix_score(
                        hyp["yseq"], local_best_ids[0], hyp["ctc_state_prev"]
                    )
                    local_scores = (1.0 - ctc_weight) * local_att_scores[
                        :, local_best_ids[0]
                    ] + ctc_weight * torch.from_numpy(
                        ctc_scores - hyp["ctc_score_prev"]
                    )
                    if rnnlm:
                        local_scores += (
                            recog_args.lm_weight * local_lm_scores[:, local_best_ids[0]]
                        )
                    local_best_scores, joint_best_ids = torch.topk(
                        local_scores, beam, dim=1
                    )
                    local_best_ids = local_best_ids[:, joint_best_ids[0]]
                else:
                    local_best_scores, local_best_ids = torch.topk(
                        local_scores, beam, dim=1
                    )

                for j in range(beam):
                    new_hyp = {}
                    new_hyp["score"] = hyp["score"] + float(local_best_scores[0, j])
                    new_hyp["yseq"] = [0] * (1 + len(hyp["yseq"]))
                    new_hyp["yseq"][: len(hyp["yseq"])] = hyp["yseq"]
                    new_hyp["yseq"][len(hyp["yseq"])] = int(local_best_ids[0, j])
                    if rnnlm:
                        new_hyp["rnnlm_prev"] = rnnlm_state
                    if lpz is not None:
                        new_hyp["ctc_state_prev"] = ctc_states[joint_best_ids[0, j]]
                        new_hyp["ctc_score_prev"] = ctc_scores[joint_best_ids[0, j]]
                    # will be (2 x beam) hyps at most
                    hyps_best_kept.append(new_hyp)

                hyps_best_kept = sorted(
                    hyps_best_kept, key=lambda x: x["score"], reverse=True
                )[:beam]

            # sort and get nbest
            hyps = hyps_best_kept
            logging.debug("number of pruned hypothes: " + str(len(hyps)))
            if char_list is not None:
                logging.debug(
                    "best hypo: "
                    + "".join([char_list[int(x)] for x in hyps[0]["yseq"][1:]])
                )

            # add eos in the final loop to avoid that there are no ended hyps
            if i == maxlen - 1:
                logging.info("adding <eos> in the last postion in the loop")
                for hyp in hyps:
                    hyp["yseq"].append(self.eos)

            # add ended hypotheses to a final list, and remove them from the
            # current hypotheses
            # (this will be a problem, number of hyps < beam)
            remained_hyps = []
            for hyp in hyps:
                if hyp["yseq"][-1] == self.eos:
                    # only store the sequence that has more than minlen outputs
                    # also add penalty
                    if len(hyp["yseq"]) > minlen:
                        hyp["score"] += (i + 1) * penalty
                        if rnnlm:  # Word LM needs to add final <eos> score
                            hyp["score"] += recog_args.lm_weight * rnnlm.final(
                                hyp["rnnlm_prev"]
                            )
                        ended_hyps.append(hyp)
                else:
                    remained_hyps.append(hyp)

            # end detection

            if end_detect(ended_hyps, i) and recog_args.maxlenratio == 0.0:
                logging.info("end detected at %d", i)
                break

            hyps = remained_hyps
            if len(hyps) > 0:
                logging.debug("remeined hypothes: " + str(len(hyps)))
            else:
                logging.info("no hypothesis. Finish decoding.")
                break

            if char_list is not None:
                for hyp in hyps:
                    logging.debug(
                        "hypo: " + "".join([char_list[int(x)] for x in hyp["yseq"][1:]])
                    )

            logging.debug("number of ended hypothes: " + str(len(ended_hyps)))

        nbest_hyps = sorted(ended_hyps, key=lambda x: x["score"], reverse=True)[
            : min(len(ended_hyps), recog_args.nbest)
        ]

        # check number of hypotheses
        if len(nbest_hyps) == 0:
            logging.warning(
                "there is no N-best results, perform recognition "
                "again with smaller minlenratio."
            )
            # should copy because Namespace will be overwritten globally
            recog_args = Namespace(**vars(recog_args))
            recog_args.minlenratio = max(0.0, recog_args.minlenratio - 0.1)
            # keep the caller's use_jit choice for the retry as well
            return self.recog(enc_output, recog_args, char_list, rnnlm, use_jit)

        logging.info("total log probability: " + str(nbest_hyps[0]["score"]))
        logging.info(
            "normalized log probability: "
            + str(nbest_hyps[0]["score"] / len(nbest_hyps[0]["yseq"]))
        )
        return nbest_hyps

    def recognize(self, x, recog_args, char_list=None, rnnlm=None, use_jit=False):
        """Recognize input speech of each speaker.

        :param ndarray x: input acoustic feature (B, T, D) or (T, D)
        :param Namespace recog_args: argument Namespace containing options
        :param list char_list: list of characters
        :param torch.nn.Module rnnlm: language model module
        :param bool use_jit: trace the decoder step with torch.jit
        :return: N-best decoding results, one list per encoder output stream
        :rtype: list
        """
        # Encoder
        enc_output = self.encode(x)

        # Decoder
        nbest_hyps = []
        for enc_out in enc_output:
            nbest_hyps.append(
                self.recog(enc_out, recog_args, char_list, rnnlm, use_jit)
            )
        return nbest_hyps


================================================
FILE: nets/pytorch_backend/e2e_asr_mulenc.py
================================================
# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
# Copyright 2017 Johns Hopkins University (Ruizhi Li)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Define e2e module for multi-encoder network. https://arxiv.org/pdf/1811.04903.pdf."""

import argparse
from itertools import groupby
import logging
import math
import os

import chainer
from chainer import reporter
import editdistance
import numpy as np
import torch

from espnet.nets.asr_interface import ASRInterface
from espnet.nets.e2e_asr_common import label_smoothing_dist
from espnet.nets.pytorch_backend.ctc import ctc_for
from espnet.nets.pytorch_backend.nets_utils import get_subsample
from espnet.nets.pytorch_backend.nets_utils import pad_list
from espnet.nets.pytorch_backend.nets_utils import to_device
from espnet.nets.pytorch_backend.nets_utils import to_torch_tensor
from espnet.nets.pytorch_backend.rnn.attentions import att_for
from espnet.nets.pytorch_backend.rnn.decoders import decoder_for
from espnet.nets.pytorch_backend.rnn.encoders import Encoder
from espnet.nets.pytorch_backend.rnn.encoders import encoder_for
from espnet.nets.scorers.ctc import CTCPrefixScorer
from espnet.utils.cli_utils import strtobool

CTC_LOSS_THRESHOLD = 10000


class Reporter(chainer.Chain):
    """Chainer reporter wrapper for the multi-encoder model."""

    def report(self, loss_ctc_list, loss_att, acc, cer_ctc_list, cer, wer, mtl_loss):
        """Report the training statistics through chainer's reporter.

        Both list arguments carry the weighted value at index 0 followed by one
        value per encoder:
        loss_ctc_list = [weighted CTC, CTC1, CTC2, ... CTCN]
        cer_ctc_list = [weighted cer_ctc, cer_ctc_1, cer_ctc_2, ... cer_ctc_N]
        """
        # the number of encoders is derived from the loss list
        encoder_ids = range(1, len(loss_ctc_list))

        reporter.report({"loss_ctc": loss_ctc_list[0]}, self)
        for enc_id in encoder_ids:
            key = "loss_ctc{}".format(enc_id)
            reporter.report({key: loss_ctc_list[enc_id]}, self)

        reporter.report({"loss_att": loss_att}, self)
        reporter.report({"acc": acc}, self)

        reporter.report({"cer_ctc": cer_ctc_list[0]}, self)
        for enc_id in encoder_ids:
            key = "cer_ctc{}".format(enc_id)
            reporter.report({key: cer_ctc_list[enc_id]}, self)

        reporter.report({"cer": cer}, self)
        reporter.report({"wer": wer}, self)
        logging.info("mtl loss:" + str(mtl_loss))
        reporter.report({"loss": mtl_loss}, self)


class E2E(ASRInterface, torch.nn.Module):
    """E2E module.

    :param List idims: List of dimensions of inputs
    :param int odim: dimension of outputs
    :param Namespace args: argument Namespace containing options

    """

    @staticmethod
    def add_arguments(parser):
        """Register all multi-encoder option groups on ``parser`` and return it."""
        for register in (
            E2E.encoder_add_arguments,
            E2E.attention_add_arguments,
            E2E.decoder_add_arguments,
            E2E.ctc_add_arguments,
        ):
            register(parser)
        return parser

    @staticmethod
    def encoder_add_arguments(parser):
        """Register the encoder options of the multi-encoder setting.

        Options declared with ``action="append"`` can be given several times
        on the command line; each occurrence adds one value to a list.
        """
        encoder_types = [
            "lstm", "blstm", "lstmp", "blstmp",
            "vgglstmp", "vggblstmp", "vgglstm", "vggblstm",
            "gru", "bgru", "grup", "bgrup",
            "vgggrup", "vggbgrup", "vgggru", "vggbgru",
        ]  # fmt: skip

        enc_group = parser.add_argument_group("E2E encoder setting")
        enc_group.add_argument(
            "--etype",
            choices=encoder_types,
            action="append",
            type=str,
            help="Type of encoder network architecture",
        )
        enc_group.add_argument(
            "--elayers",
            action="append",
            type=int,
            help="Number of encoder layers (for shared recognition part "
            "in multi-speaker asr mode)",
        )
        enc_group.add_argument(
            "--eunits",
            "-u",
            action="append",
            type=int,
            help="Number of encoder hidden units",
        )
        # the projection size is a single value, not one per encoder
        enc_group.add_argument(
            "--eprojs",
            type=int,
            default=320,
            help="Number of encoder projection units",
        )
        enc_group.add_argument(
            "--subsample",
            action="append",
            type=str,
            help="Subsample input frames x_y_z means subsample every x frame "
            "at 1st layer, every y frame at 2nd layer etc.",
        )
        return parser

    @staticmethod
    def attention_add_arguments(parser):
        """Register the attention options of the multi-encoder setting.

        The first set of options is repeatable (``action="append"``); the
        ``--han-*`` options configure the hierarchical attention network and
        take a single value with a default.
        """
        group = parser.add_argument_group("E2E attention setting")

        # the per-encoder attention and the HAN accept the same type names
        att_types = (
            "noatt",
            "dot",
            "add",
            "location",
            "coverage",
            "coverage_location",
            "location2d",
            "location_recurrent",
            "multi_head_dot",
            "multi_head_add",
            "multi_head_loc",
            "multi_head_multi_res_loc",
        )
        # These help texts contain a run of 27 spaces before the parenthesised
        # note; it is kept verbatim so the registered strings stay unchanged.
        no_location_note = (
            " " * 27 + "(negative value indicates no location-aware attention)"
        )

        # attention
        group.add_argument(
            "--atype",
            action="append",
            type=str,
            choices=list(att_types),
            help="Type of attention architecture",
        )
        repeatable_int_options = (
            ("--adim", "Number of attention transformation dimensions"),
            ("--awin", "Window size for location2d attention"),
            ("--aheads", "Number of heads for multi head attention"),
            (
                "--aconv-chans",
                "Number of attention convolution channels " + no_location_note,
            ),
            (
                "--aconv-filts",
                "Number of attention convolution filters " + no_location_note,
            ),
        )
        for flag, description in repeatable_int_options:
            group.add_argument(flag, action="append", type=int, help=description)
        group.add_argument(
            "--dropout-rate",
            action="append",
            type=float,
            help="Dropout rate for the encoder",
        )

        # hierarchical attention network (HAN)
        group.add_argument(
            "--han-type",
            type=str,
            default="dot",
            choices=list(att_types),
            help="Type of attention architecture (multi-encoder asr mode only)",
        )
        han_int_options = (
            ("--han-dim", 320, "Number of attention transformation dimensions in HAN"),
            ("--han-win", 5, "Window size for location2d attention in HAN"),
            ("--han-heads", 4, "Number of heads for multi head attention in HAN"),
            (
                "--han-conv-chans",
                -1,
                "Number of attention convolution channels  in HAN "
                + no_location_note,
            ),
            (
                "--han-conv-filts",
                100,
                "Number of attention convolution filters in HAN " + no_location_note,
            ),
        )
        for flag, default_value, description in han_int_options:
            group.add_argument(
                flag, type=int, default=default_value, help=description
            )
        return parser

    @staticmethod
    def decoder_add_arguments(parser):
        """Register the decoder options of the multi-encoder setting."""
        dec_group = parser.add_argument_group("E2E decoder setting")
        dec_group.add_argument(
            "--dtype",
            type=str,
            choices=["lstm", "gru"],
            default="lstm",
            help="Type of decoder network architecture",
        )

        # plain single-valued options: (flag, type, default, help)
        simple_options = (
            ("--dlayers", int, 1, "Number of decoder layers"),
            ("--dunits", int, 320, "Number of decoder hidden units"),
            ("--dropout-rate-decoder", float, 0.0, "Dropout rate for the decoder"),
            (
                "--sampling-probability",
                float,
                0.0,
                "Ratio of predicted labels fed back to decoder",
            ),
        )
        for flag, value_type, default_value, description in simple_options:
            dec_group.add_argument(
                flag, type=value_type, default=default_value, help=description
            )

        # nargs="?" with const="" lets the flag appear without a value
        dec_group.add_argument(
            "--lsm-type",
            nargs="?",
            const="",
            default="",
            type=str,
            choices=["", "unigram"],
            help="Apply label smoothing with a specified distribution type",
        )
        return parser

    @staticmethod
    def ctc_add_arguments(parser):
        """Register the CTC options of the multi-encoder setting."""
        ctc_group = parser.add_argument_group("E2E multi-ctc setting")
        ctc_group.add_argument(
            "--share-ctc",
            default=False,
            type=strtobool,
            help="The flag to switch to share ctc across multiple encoders "
            "(multi-encoder asr mode only).",
        )
        # one repeatable weight option per phase; each occurrence adds a value
        for flag, phase in (
            ("--weights-ctc-train", "training"),
            ("--weights-ctc-dec", "decoding"),
        ):
            ctc_group.add_argument(
                flag,
                action="append",
                type=float,
                help="ctc weight assigned to each encoder during {}.".format(phase),
            )
        return parser

    def get_total_subsampling_factor(self):
        """Return the total subsampling factor, taken from the first encoder.

        The result is the encoder's ``conv_subsampling_factor`` multiplied by
        the product of the first entry of ``self.subsample_list``.
        """
        # self.enc is either a single Encoder or an indexable collection of them
        first_encoder = self.enc if isinstance(self.enc, Encoder) else self.enc[0]
        conv_factor = first_encoder.conv_subsampling_factor
        layer_factor = int(np.prod(self.subsample_list[0]))
        return conv_factor * layer_factor

    def __init__(self, idims, odim, args):
        """Build the multi-encoder model from parsed options.

        Args:
            idims (list): input feature dimension of each encoder stream.
            odim (int): size of the output vocabulary; its last index is used
                as both the <sos> and the <eos> id.
            args (Namespace): model and training options. ``args.char_list``
                is set to ``None`` on this object when it is missing.

        """
        super(E2E, self).__init__()
        torch.nn.Module.__init__(self)

        # weight of the CTC loss in the multi-task objective (see forward)
        self.mtlalpha = args.mtlalpha
        assert 0.0 <= self.mtlalpha <= 1.0, "mtlalpha should be [0.0, 1.0]"
        self.verbose = args.verbose
        # NOTE: for self.build method
        args.char_list = getattr(args, "char_list", None)
        self.char_list = args.char_list
        self.outdir = args.outdir
        self.space, self.blank = args.sym_space, args.sym_blank
        self.reporter = Reporter()
        self.num_encs = args.num_encs
        self.share_ctc = args.share_ctc

        # the last vocabulary index serves as the (identical) sos/eos ID
        self.sos = self.eos = odim - 1

        # subsample info
        self.subsample_list = get_subsample(args, mode="asr", arch="rnn_mulenc")

        # label smoothing info (only when a type is given and the json exists)
        labeldist = None
        if args.lsm_type and os.path.isfile(args.train_json):
            logging.info("Use label smoothing with " + args.lsm_type)
            labeldist = label_smoothing_dist(
                odim, args.lsm_type, transcript=args.train_json
            )

        # speech translation related; getattr keeps compatibility with
        # option sets that do not define replace_sos
        self.replace_sos = getattr(args, "replace_sos", False)

        self.frontend = None

        # sub-modules: encoders, CTC, per-encoder attentions followed by the
        # hierarchical attention network, and finally the decoder
        self.enc = encoder_for(args, idims, self.subsample_list)
        self.ctc = ctc_for(args, odim)
        self.att = att_for(args)
        self.att.append(att_for(args, han_mode=True))
        self.dec = decoder_for(args, odim, self.sos, self.eos, self.att, labeldist)

        if args.mtlalpha > 0 and self.num_encs > 1:
            # weights-ctc,
            # e.g. ctc_loss = w_1*ctc_1_loss + w_2 * ctc_2_loss + w_N * ctc_N_loss
            # both weight sets are normalized to sum to one
            train_total = np.sum(args.weights_ctc_train)
            self.weights_ctc_train = args.weights_ctc_train / train_total
            dec_total = np.sum(args.weights_ctc_dec)
            self.weights_ctc_dec = args.weights_ctc_dec / dec_total
            for phase, weights in (
                ("training", self.weights_ctc_train),
                ("decoding", self.weights_ctc_dec),
            ):
                logging.info(
                    "ctc weights ({} during training): ".format(phase)
                    + " ".join(str(weight) for weight in weights)
                )
        else:
            self.weights_ctc_dec = [1.0]
            self.weights_ctc_train = [1.0]

        # weight initialization
        self.init_like_chainer()

        # options for beam search, needed only when CER/WER are reported
        self.report_cer = False
        self.report_wer = False
        if args.report_cer or args.report_wer:
            self.recog_args = argparse.Namespace(
                beam_size=args.beam_size,
                penalty=args.penalty,
                ctc_weight=args.ctc_weight,
                maxlenratio=args.maxlenratio,
                minlenratio=args.minlenratio,
                lm_weight=args.lm_weight,
                rnnlm=args.rnnlm,
                nbest=args.nbest,
                space=args.sym_space,
                blank=args.sym_blank,
                tgt_lang=False,
                ctc_weights_dec=self.weights_ctc_dec,
            )
            self.report_cer = args.report_cer
            self.report_wer = args.report_wer
        self.rnnlm = None

        self.logzero = -10000000000.0
        self.loss = None
        self.acc = None

    def init_like_chainer(self):
        """Initialize the parameters the way chainer does.

        chainer basically uses the LeCun scheme: W ~ Normal(0, fan_in ** -0.5), b = 0
        pytorch basically uses W, b ~ Uniform(-fan_in**-0.5, fan_in**-0.5)

        Two exceptions are applied afterwards:
        - EmbedID.W ~ Normal(0, 1)
        - LSTM.upward.b[forget_gate_range] = 1 (but not used in NStepLSTM)
        """
        for param in self.parameters():
            tensor = param.data
            rank = tensor.dim()
            if rank == 1:
                # bias
                tensor.zero_()
                continue
            if rank == 2:
                # linear weight
                fan_in = tensor.size(1)
            elif rank in (3, 4):
                # conv weight: input channels times every kernel extent
                fan_in = tensor.size(1)
                for kernel_extent in tensor.size()[2:]:
                    fan_in *= kernel_extent
            else:
                raise NotImplementedError
            tensor.normal_(0, 1.0 / math.sqrt(fan_in))

        # exceptions
        # embed weight ~ Normal(0, 1)
        self.dec.embed.weight.data.normal_(0, 1)
        # forget-bias = 1.0: fill the second quarter of each input-hidden bias
        # https://discuss.pytorch.org/t/set-forget-gate-bias-of-lstm/1745
        for cell in self.dec.decoder:
            bias = cell.bias_ih
            size = bias.size(0)
            bias.data[size // 4 : size // 2].fill_(1.0)

    def forward(self, xs_pad_list, ilens_list, ys_pad):
        """E2E forward.

        Besides returning the loss, this stores ``self.loss``, ``self.acc``,
        ``self.loss_att`` and ``self.loss_ctc_list``, and sends the statistics
        to ``self.reporter`` unless the loss is NaN or not below
        ``CTC_LOSS_THRESHOLD``.

        :param List xs_pad_list: list of batch (torch.Tensor) of padded input sequences
                                [(B, Tmax_1, idim), (B, Tmax_2, idim),..]
        :param List ilens_list:
            list of batch (torch.Tensor) of lengths of input sequences [(B), (B), ..]
        :param torch.Tensor ys_pad:
            batch of padded character id sequence tensor (B, Lmax)
        :return: loss value
        :rtype: torch.Tensor
        """
        if self.replace_sos:
            tgt_lang_ids = ys_pad[:, 0:1]
            ys_pad = ys_pad[:, 1:]  # remove target language ID in the beginning
        else:
            tgt_lang_ids = None

        hs_pad_list, hlens_list, self.loss_ctc_list = [], [], []
        for idx in range(self.num_encs):
            # 1. Encoder
            hs_pad, hlens, _ = self.enc[idx](xs_pad_list[idx], ilens_list[idx])

            # 2. CTC loss
            if self.mtlalpha == 0:
                self.loss_ctc_list.append(None)
            else:
                ctc_idx = 0 if self.share_ctc else idx
                loss_ctc = self.ctc[ctc_idx](hs_pad, hlens, ys_pad)
                self.loss_ctc_list.append(loss_ctc)
            hs_pad_list.append(hs_pad)
            hlens_list.append(hlens)

        # 3. attention loss
        if self.mtlalpha == 1:
            self.loss_att, acc = None, None
        else:
            self.loss_att, acc, _ = self.dec(
                hs_pad_list, hlens_list, ys_pad, lang_ids=tgt_lang_ids
            )
        self.acc = acc

        # 4. compute cer without beam search
        if self.mtlalpha == 0 or self.char_list is None:
            cer_ctc_list = [None] * (self.num_encs + 1)
        else:
            per_enc_cers = []
            for ind in range(self.num_encs):
                cers = []
                ctc_idx = 0 if self.share_ctc else ind
                y_hats = self.ctc[ctc_idx].argmax(hs_pad_list[ind]).data
                for i, y in enumerate(y_hats):
                    # collapse runs of repeated frame labels
                    y_hat = [x[0] for x in groupby(y)]
                    y_true = ys_pad[i]

                    # ids equal to -1 are skipped
                    seq_hat = [
                        self.char_list[int(tok)] for tok in y_hat if int(tok) != -1
                    ]
                    seq_true = [
                        self.char_list[int(tok)] for tok in y_true if int(tok) != -1
                    ]
                    seq_hat_text = "".join(seq_hat).replace(self.space, " ")
                    seq_hat_text = seq_hat_text.replace(self.blank, "")
                    seq_true_text = "".join(seq_true).replace(self.space, " ")

                    hyp_chars = seq_hat_text.replace(" ", "")
                    ref_chars = seq_true_text.replace(" ", "")
                    if len(ref_chars) > 0:
                        cers.append(
                            editdistance.eval(hyp_chars, ref_chars) / len(ref_chars)
                        )

                # None when no reference in the batch has any character
                per_enc_cers.append(sum(cers) / len(cers) if cers else None)

            if any(item is None for item in per_enc_cers):
                # A missing CER cannot be weighted or converted to float;
                # report None (as the mtlalpha == 0 path does) instead of
                # raising TypeError.
                cer_ctc_weighted = None
            else:
                cer_ctc_weighted = float(
                    np.sum(
                        [
                            item * self.weights_ctc_train[i]
                            for i, item in enumerate(per_enc_cers)
                        ]
                    )
                )
            cer_ctc_list = [cer_ctc_weighted] + [
                None if item is None else float(item) for item in per_enc_cers
            ]

        # 5. compute cer/wer
        if self.training or not (self.report_cer or self.report_wer):
            cer, wer = 0.0, 0.0
            # oracle_cer, oracle_wer = 0.0, 0.0
        else:
            if self.recog_args.ctc_weight > 0.0:
                lpz_list = []
                for idx in range(self.num_encs):
                    ctc_idx = 0 if self.share_ctc else idx
                    lpz = self.ctc[ctc_idx].log_softmax(hs_pad_list[idx]).data
                    lpz_list.append(lpz)
            else:
                lpz_list = None

            word_eds, word_ref_lens, char_eds, char_ref_lens = [], [], [], []
            nbest_hyps = self.dec.recognize_beam_batch(
                hs_pad_list,
                hlens_list,
                lpz_list,
                self.recog_args,
                self.char_list,
                self.rnnlm,
                lang_ids=tgt_lang_ids.squeeze(1).tolist() if self.replace_sos else None,
            )
            # remove <sos> and <eos>
            y_hats = [nbest_hyp[0]["yseq"][1:-1] for nbest_hyp in nbest_hyps]
            for i, y_hat in enumerate(y_hats):
                y_true = ys_pad[i]

                seq_hat = [self.char_list[int(tok)] for tok in y_hat if int(tok) != -1]
                seq_true = [
                    self.char_list[int(tok)] for tok in y_true if int(tok) != -1
                ]
                seq_hat_text = "".join(seq_hat).replace(self.recog_args.space, " ")
                seq_hat_text = seq_hat_text.replace(self.recog_args.blank, "")
                seq_true_text = "".join(seq_true).replace(self.recog_args.space, " ")

                hyp_words = seq_hat_text.split()
                ref_words = seq_true_text.split()
                word_eds.append(editdistance.eval(hyp_words, ref_words))
                word_ref_lens.append(len(ref_words))
                hyp_chars = seq_hat_text.replace(" ", "")
                ref_chars = seq_true_text.replace(" ", "")
                char_eds.append(editdistance.eval(hyp_chars, ref_chars))
                char_ref_lens.append(len(ref_chars))

            # Guard against batches whose references are all empty, which
            # would otherwise divide by zero; 0.0 is the same placeholder
            # used when the rates are not computed.
            total_ref_words = sum(word_ref_lens)
            total_ref_chars = sum(char_ref_lens)
            if self.report_wer and total_ref_words > 0:
                wer = float(sum(word_eds)) / total_ref_words
            else:
                wer = 0.0
            if self.report_cer and total_ref_chars > 0:
                cer = float(sum(char_eds)) / total_ref_chars
            else:
                cer = 0.0

        # combine the losses: alpha * CTC + (1 - alpha) * attention
        alpha = self.mtlalpha
        if alpha == 0:
            self.loss = self.loss_att
            loss_att_data = float(self.loss_att)
            loss_ctc_data_list = [None] * (self.num_encs + 1)
        else:
            # weighted sum of the per-encoder CTC losses
            weighted_loss_ctc = torch.sum(
                torch.cat(
                    [
                        (item * self.weights_ctc_train[i]).unsqueeze(0)
                        for i, item in enumerate(self.loss_ctc_list)
                    ]
                )
            )
            if alpha == 1:
                self.loss = weighted_loss_ctc
                loss_att_data = None
            else:
                self.loss_ctc = weighted_loss_ctc
                self.loss = alpha * self.loss_ctc + (1 - alpha) * self.loss_att
                loss_att_data = float(self.loss_att)
            loss_ctc_data_list = [float(weighted_loss_ctc)] + [
                float(item) for item in self.loss_ctc_list
            ]

        loss_data = float(self.loss)
        if loss_data < CTC_LOSS_THRESHOLD and not math.isnan(loss_data):
            self.reporter.report(
                loss_ctc_data_list,
                loss_att_data,
                acc,
                cer_ctc_list,
                cer,
                wer,
                loss_data,
            )
        else:
            logging.warning("loss (=%f) is not correct", loss_data)
        return self.loss

    def scorers(self):
        """Get scorers for `beam_search` (optional).

        Returns:
            dict[str, ScorerInterface]: the decoder under ``"decoder"`` and a
                ``CTCPrefixScorer`` built from ``self.ctc`` and ``self.eos``
                under ``"ctc"``.

        """
        ctc_scorer = CTCPrefixScorer(self.ctc, self.eos)
        return {"decoder": self.dec, "ctc": ctc_scorer}

    def encode(self, x_list):
        """Encode one utterance with every encoder.

        The module is switched to evaluation mode.

        Args:
            x_list (list): input feature [(T1, D), (T2, D), ... ]
        Returns:
            list
                encoded feature [(T1, D), (T2, D), ... ]

        """
        self.eval()
        # device and dtype of the inputs follow the first model parameter
        ref_param = next(self.parameters())

        hs_list = []
        for idx in range(self.num_encs):
            feats = x_list[idx]
            # the length is recorded before the frame subsampling below
            ilens = [feats.shape[0]]

            # subsample frame
            feats = feats[:: self.subsample_list[idx][0], :]
            feats = torch.as_tensor(
                feats, device=ref_param.device, dtype=ref_param.dtype
            )

            # make a utt list (1) to use the same interface for encoder
            batch = feats.contiguous().unsqueeze(0)
            hs, _, _ = self.enc[idx](batch, ilens)
            hs_list.append(hs[0])
        return hs_list

    def recognize(self, x_list, recog_args, char_list, rnnlm=None):
        """E2E beam search for a single utterance.

        :param list of ndarray x: list of input acoustic feature [(T1, D), (T2,D),...]
        :param Namespace recog_args: argument Namespace containing options
        :param list char_list: list of characters
        :param torch.nn.Module rnnlm: language model module
        :return: N-best decoding results
        :rtype: list
        """
        hs_list = self.encode(x_list)

        # calculate log P(z_t|X) for CTC scores (skipped when the weight is 0)
        lpz_list = None
        if recog_args.ctc_weight > 0.0:
            lpz_list = []
            for idx in range(self.num_encs):
                # a shared CTC module always lives at index 0
                ctc_idx = 0 if self.share_ctc else idx
                log_probs = self.ctc[ctc_idx].log_softmax(hs_list[idx].unsqueeze(0))
                lpz_list.append(log_probs[0])

        # 2. Decoder
        # decode the first utterance
        return self.dec.recognize_beam(
            hs_list, lpz_list, recog_args, char_list, rnnlm
        )

    def recognize_batch(self, xs_list, recog_args, char_list, rnnlm=None):
        """E2E batched beam search.

        The module runs in evaluation mode and is put back into training mode
        afterwards if it was training on entry.

        :param list xs_list: list of list of input acoustic feature arrays
                [[(T1_1, D), (T1_2, D), ...],[(T2_1, D), (T2_2, D), ...], ...]
        :param Namespace recog_args: argument Namespace containing options
        :param list char_list: list of characters
        :param torch.nn.Module rnnlm: language model module
        :return: N-best decoding results
        :rtype: list
        """
        was_training = self.training
        self.eval()

        # 1. Encoder, one input stream at a time
        hs_pad_list, hlens_list = [], []
        for idx in range(self.num_encs):
            utts = xs_list[idx]
            # lengths are taken before the frame subsampling below
            ilens = np.fromiter((utt.shape[0] for utt in utts), dtype=np.int64)

            # subsample frame
            stride = self.subsample_list[idx][0]
            tensors = [
                to_device(self, to_torch_tensor(utt[::stride, :]).float())
                for utt in utts
            ]
            xs_pad = pad_list(tensors, 0.0)

            hs_pad, hlens, _ = self.enc[idx](xs_pad, ilens)
            hs_pad_list.append(hs_pad)
            hlens_list.append(hlens)

        # calculate log P(z_t|X) for CTC scores
        use_ctc = recog_args.ctc_weight > 0.0
        if use_ctc:
            lpz_list = []
            for idx in range(self.num_encs):
                ctc_idx = 0 if self.share_ctc else idx
                lpz_list.append(self.ctc[ctc_idx].log_softmax(hs_pad_list[idx]))
        else:
            lpz_list = None
        # scores are normalized only when CTC does not take part in decoding
        normalize_score = not use_ctc

        # 2. Decoder
        # make sure hlens is tensor
        hlens_list = [
            torch.tensor([int(length) for length in hlens]) for hlens in hlens_list
        ]
        y = self.dec.recognize_beam_batch(
            hs_pad_list,
            hlens_list,
            lpz_list,
            recog_args,
            char_list,
            rnnlm,
            normalize_score=normalize_score,
        )

        if was_training:
            self.train()
        return y

    def calculate_all_attentions(self, xs_pad_list, ilens_list, ys_pad):
        """E2E attention calculation.

        Runs in evaluation mode without gradients; the training/evaluation
        mode the module had on entry is restored before returning, also when
        an exception is raised.

        :param List xs_pad_list: list of batch (torch.Tensor) of padded input sequences
                                [(B, Tmax_1, idim), (B, Tmax_2, idim),..]
        :param List ilens_list:
            list of batch (torch.Tensor) of lengths of input sequences [(B), (B), ..]
        :param torch.Tensor ys_pad:
            batch of padded character id sequence tensor (B, Lmax)
        :return: attention weights with the following shape,
            1) multi-head case => attention weights (B, H, Lmax, Tmax),
            2) multi-encoder case
                => [(B, Lmax, Tmax1), (B, Lmax, Tmax2), ..., (B, Lmax, NumEncs)]
            3) other case => attention weights (B, Lmax, Tmax).
        :rtype: float ndarray or list
        """
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                # 1. Encoder
                if self.replace_sos:
                    tgt_lang_ids = ys_pad[:, 0:1]
                    # remove target language ID in the beginning
                    ys_pad = ys_pad[:, 1:]
                else:
                    tgt_lang_ids = None

                hs_pad_list, hlens_list = [], []
                for idx in range(self.num_encs):
                    hs_pad, hlens, _ = self.enc[idx](
                        xs_pad_list[idx], ilens_list[idx]
                    )
                    hs_pad_list.append(hs_pad)
                    hlens_list.append(hlens)

                # 2. Decoder
                att_ws = self.dec.calculate_all_attentions(
                    hs_pad_list, hlens_list, ys_pad, lang_ids=tgt_lang_ids
                )
        finally:
            # Put the module back into the mode it was in, rather than
            # forcing training mode on a model that was being evaluated.
            self.train(was_training)
        return att_ws

    def calculate_all_ctc_probs(self, xs_pad_list, ilens_list, ys_pad):
        """E2E CTC probability calculation.

        Returns ``[None]`` without touching the model when ``mtlalpha`` is 0.
        Otherwise runs in evaluation mode without gradients and restores the
        training/evaluation mode the module had on entry, also when an
        exception is raised. ``ys_pad`` is not used by the computation.

        :param List xs_pad_list: list of batch (torch.Tensor) of padded input sequences
                                [(B, Tmax_1, idim), (B, Tmax_2, idim),..]
        :param List ilens_list:
            list of batch (torch.Tensor) of lengths of input sequences [(B), (B), ..]
        :param torch.Tensor ys_pad:
            batch of padded character id sequence tensor (B, Lmax)
        :return: CTC probability (B, Tmax, vocab), one array per encoder
        :rtype: float ndarray or list
        """
        if self.mtlalpha == 0:
            return [None]

        was_training = self.training
        self.eval()
        probs_list = []
        try:
            with torch.no_grad():
                for idx in range(self.num_encs):
                    # 1. Encoder
                    hs_pad, hlens, _ = self.enc[idx](
                        xs_pad_list[idx], ilens_list[idx]
                    )

                    # 2. CTC probs (a shared CTC module lives at index 0)
                    ctc_idx = 0 if self.share_ctc else idx
                    probs = self.ctc[ctc_idx].softmax(hs_pad).cpu().numpy()
                    probs_list.append(probs)
        finally:
            # Put the module back into the mode it was in, rather than
            # forcing training mode on a model that was being evaluated.
            self.train(was_training)
        return probs_list


================================================
FILE: nets/pytorch_backend/e2e_asr_transducer.py
================================================
"""Transducer speech recognition model (pytorch)."""

from argparse import Namespace
from collections import Counter
from dataclasses import asdict
from functools import partial
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED

import logging
import math
import numpy
import functools
import chainer
import torch
from espnet.nets.asr_interface import ASRInterface
from espnet.nets.pytorch_backend.ctc import ctc_for
from espnet.nets.pytorch_backend.nets_utils import get_subsample
from espnet.nets.pytorch_backend.nets_utils import make_non_pad_mask
from espnet.nets.pytorch_backend.transducer.arguments import (
    add_encoder_general_arguments,  # noqa: H301
    add_rnn_encoder_arguments,  # noqa: H301
    add_custom_encoder_arguments,  # noqa: H301
    add_decoder_general_arguments,  # noqa: H301
    add_rnn_decoder_arguments,  # noqa: H301
    add_custom_decoder_arguments,  # noqa: H301
    add_custom_training_arguments,  # noqa: H301
    add_transducer_arguments,  # noqa: H301
    add_auxiliary_task_arguments,  # noqa: H301
    add_att_scorer_arguments,
)
from espnet.nets.pytorch_backend.transducer.auxiliary_task import AuxiliaryTask
from espnet.nets.pytorch_backend.transducer.custom_decoder import CustomDecoder
from espnet.nets.pytorch_backend.transducer.custom_encoder import CustomEncoder
from espnet.nets.pytorch_backend.transducer.error_calculator import ErrorCalculator
from espnet.nets.pytorch_backend.transducer.initializer import initializer
from espnet.nets.pytorch_backend.transducer.joint_network import JointNetwork
from espnet.nets.pytorch_backend.transducer.loss import TransLoss
from espnet.nets.pytorch_backend.transducer.rnn_decoder import DecoderRNNT
from espnet.nets.pytorch_backend.transducer.rnn_encoder import encoder_for
from espnet.nets.pytorch_backend.transducer.utils import prepare_loss_inputs
from espnet.nets.pytorch_backend.transducer.utils import valid_aux_task_layer_list
from espnet.nets.pytorch_backend.transformer.attention import (
    MultiHeadedAttention,  # noqa: H301
    RelPositionMultiHeadedAttention,  # noqa: H301
)
from espnet.nets.pytorch_backend.transformer.label_smoothing_loss import (
    LabelSmoothingLoss,  # noqa: H301
)
from espnet.nets.pytorch_backend.transformer.mask import target_mask
from espnet.nets.pytorch_backend.transformer.plot import PlotAttentionReport
from espnet.utils.fill_missing_args import fill_missing_args
from espnet.snowfall.warpper.warpper_mmi import K2MMI
from espnet.snowfall.warpper.warpper_ctc import K2CTC
from espnet.nets.beam_search_transducer import BeamSearchTransducer
from espnet.nets.pytorch_backend.transformer.decoder import Decoder
from espnet.nets.pytorch_backend.transformer.label_smoothing_loss import (
    LabelSmoothingLoss,  # noqa: H301
)

import editdistance

class Reporter(chainer.Chain):
    """Chainer chain used to publish the transducer training statistics."""

    def report(
        self,
        loss,
        loss_trans,
        loss_ctc,
        loss_lm,
        loss_aux_trans,
        loss_aux_symm_kl,
        loss_mbr,
        loss_mmi,
        loss_att,
        cer,
        wer,
    ):
        """Report every loss term and error rate under its own key.

        Each value is passed to ``chainer.reporter.report`` with this object
        as the observer, in the order listed below; the total loss is also
        written to the log.
        """
        observations = (
            ("loss", loss),
            ("loss_trans", loss_trans),
            ("loss_ctc", loss_ctc),
            ("loss_lm", loss_lm),
            ("loss_aux_trans", loss_aux_trans),
            ("loss_aux_symm_kl", loss_aux_symm_kl),
            ("loss_mbr", loss_mbr),
            ("loss_mmi", loss_mmi),
            ("loss_att", loss_att),
            ("cer", cer),
            ("wer", wer),
        )
        for key, value in observations:
            chainer.reporter.report({key: value}, self)

        logging.info("loss:" + str(loss))


class E2E(ASRInterface, torch.nn.Module):
    """E2E module for transducer models.

    Args:
        idim (int): dimension of inputs
        odim (int): dimension of outputs
        args (Namespace): argument Namespace containing options
        ignore_id (int): padding symbol id
        blank_id (int): blank symbol id

    """

    @staticmethod
    def add_arguments(parser):
        """Run every argument-group helper of this class on ``parser``.

        The helpers are called in a fixed order (encoder, decoder, training,
        transducer, auxiliary task, attention scorer) and ``parser`` is
        returned.
        """
        group_adders = (
            E2E.encoder_add_general_arguments,
            E2E.encoder_add_rnn_arguments,
            E2E.encoder_add_custom_arguments,
            E2E.decoder_add_general_arguments,
            E2E.decoder_add_rnn_arguments,
            E2E.decoder_add_custom_arguments,
            E2E.training_add_custom_arguments,
            E2E.transducer_add_arguments,
            E2E.auxiliary_task_add_arguments,
            E2E.att_scorer_arguments,
        )
        for add_group in group_adders:
            add_group(parser)

        return parser

    @staticmethod
    def att_scorer_arguments(parser):
        """Create the attention scorer group and hand it to its helper.

        Returns the ``parser`` that was passed in.
        """
        add_att_scorer_arguments(
            parser.add_argument_group("Attention scorer arguments")
        )
        return parser

    @staticmethod
    def encoder_add_general_arguments(parser):
        """Create the general encoder group and hand it to its helper.

        Returns the ``parser`` that was passed in.
        """
        add_encoder_general_arguments(
            parser.add_argument_group("Encoder general arguments")
        )
        return parser

    @staticmethod
    def encoder_add_rnn_arguments(parser):
        """Create the RNN encoder group and hand it to its helper.

        Returns the ``parser`` that was passed in.
        """
        add_rnn_encoder_arguments(
            parser.add_argument_group("RNN encoder arguments")
        )
        return parser

    @staticmethod
    def encoder_add_custom_arguments(parser):
        """Create the custom encoder group and hand it to its helper.

        Returns the ``parser`` that was passed in.
        """
        add_custom_encoder_arguments(
            parser.add_argument_group("Custom encoder arguments")
        )
        return parser

    @staticmethod
    def decoder_add_general_arguments(parser):
        """Create the general decoder group and hand it to its helper.

        Returns the ``parser`` that was passed in.
        """
        add_decoder_general_arguments(
            parser.add_argument_group("Decoder general arguments")
        )
        return parser

    @staticmethod
    def decoder_add_rnn_arguments(parser):
        """Create the RNN decoder group and hand it to its helper.

        Returns the ``parser`` that was passed in.
        """
        add_rnn_decoder_arguments(
            parser.add_argument_group("RNN decoder arguments")
        )
        return parser

    @staticmethod
    def decoder_add_custom_arguments(parser):
        """Create the custom decoder group and hand it to its helper.

        Returns the ``parser`` that was passed in.
        """
        add_custom_decoder_arguments(
            parser.add_argument_group("Custom decoder arguments")
        )
        return parser

    @staticmethod
    def training_add_custom_arguments(parser):
        """Create the custom-architecture training group and hand it to its helper.

        Returns the ``parser`` that was passed in.
        """
        add_custom_training_arguments(
            parser.add_argument_group("Training arguments for custom archictecture")
        )
        return parser

    @staticmethod
    def transducer_add_arguments(parser):
        """Create the transducer model group and hand it to its helper.

        Returns the ``parser`` that was passed in.
        """
        add_transducer_arguments(
            parser.add_argument_group("Transducer model arguments")
        )
        return parser

    @staticmethod
    def auxiliary_task_add_arguments(parser):
        """Create the auxiliary task group and hand it to its helper.

        Returns the ``parser`` that was passed in.
        """
        add_auxiliary_task_arguments(
            parser.add_argument_group("Auxiliary task arguments")
        )
        return parser

    @property
    def attention_plot_class(self):
        """Return the class used to plot attention weights for this model."""
        return PlotAttentionReport

    def get_total_subsampling_factor(self):
        """Get total subsampling factor.

        Returns:
            int: the encoder's ``conv_subsampling_factor`` multiplied by the
                product of ``self.subsample``.

        """
        # Use the same "custom" test as __init__/forward so the attribute that
        # was actually created (self.encoder vs. self.enc) is the one read here.
        encoder = self.encoder if "custom" in self.etype else self.enc

        return encoder.conv_subsampling_factor * int(numpy.prod(self.subsample))

    def __init__(self, idim, odim, args, ignore_id=-1, blank_id=0, training=True):
        """Construct an E2E object for transducer model.

        Args:
            idim (int): dimension of inputs
            odim (int): dimension of outputs
            args (Namespace): argument Namespace containing options
            ignore_id (int): padding symbol id
            blank_id (int): blank symbol id
            training (bool): when False, the error calculator and the
                auxiliary transducer task are not built

        Raises:
            ValueError: if a custom encoder (or decoder) is requested without
                the matching block architecture.

        """
        torch.nn.Module.__init__(self)

        args = fill_missing_args(args, self.add_arguments)

        self.is_rnnt = True
        self.transducer_weight = args.transducer_weight

        self.use_aux_task = bool(args.aux_task_type is not None and training)

        self.use_aux_ctc = args.aux_ctc  # and training
        self.aux_ctc_weight = args.aux_ctc_weight

        self.use_aux_mmi = args.aux_mmi  # and training
        self.aux_mmi_weight = args.aux_mmi_weight

        self.use_aux_cross_entropy = args.aux_cross_entropy  # and training
        self.aux_cross_entropy_weight = args.aux_cross_entropy_weight

        self.use_aux_mbr = args.aux_mbr
        self.aux_mbr_weight = args.aux_mbr_weight
        self.aux_mbr_beam = args.aux_mbr_beam

        self.use_att_scorer = args.att_scorer_weight > 0.0
        self.att_scorer_weight = args.att_scorer_weight

        if self.use_aux_task:
            n_layers = (
                (len(args.enc_block_arch) * args.enc_block_repeat - 1)
                if args.enc_block_arch is not None
                else (args.elayers - 1)
            )

            aux_task_layer_list = valid_aux_task_layer_list(
                args.aux_task_layer_list,
                n_layers,
            )
        else:
            aux_task_layer_list = []

        if "custom" in args.etype:
            if args.enc_block_arch is None:
                raise ValueError(
                    "When specifying custom encoder type, --enc-block-arch "
                    "should also be specified in training config. See "
                    "egs/vivos/asr1/conf/transducer/train_*.yaml for more info."
                )

            self.subsample = get_subsample(args, mode="asr", arch="transformer")

            self.encoder = CustomEncoder(
                idim,
                args.enc_block_arch,
                input_layer=args.custom_enc_input_layer,
                repeat_block=args.enc_block_repeat,
                self_attn_type=args.custom_enc_self_attn_type,
                positional_encoding_type=args.custom_enc_positional_encoding_type,
                positionwise_activation_type=args.custom_enc_pw_activation_type,
                conv_mod_activation_type=args.custom_enc_conv_mod_activation_type,
                aux_task_layer_list=aux_task_layer_list,
            )
            encoder_out = self.encoder.enc_out

            self.most_dom_list = args.enc_block_arch[:]
        else:
            self.subsample = get_subsample(args, mode="asr", arch="rnn-t")

            self.enc = encoder_for(
                args,
                idim,
                self.subsample,
                aux_task_layer_list=aux_task_layer_list,
            )
            encoder_out = args.eprojs

        if "custom" in args.dtype:
            if args.dec_block_arch is None:
                raise ValueError(
                    "When specifying custom decoder type, --dec-block-arch "
                    "should also be specified in training config. See "
                    "egs/vivos/asr1/conf/transducer/train_*.yaml for more info."
                )

            self.decoder = CustomDecoder(
                odim,
                args.dec_block_arch,
                input_layer=args.custom_dec_input_layer,
                repeat_block=args.dec_block_repeat,
                positionwise_activation_type=args.custom_dec_pw_activation_type,
                dropout_rate_embed=args.dropout_rate_embed_decoder,
            )
            decoder_out = self.decoder.dunits

            if "custom" in args.etype:
                self.most_dom_list += args.dec_block_arch[:]
            else:
                self.most_dom_list = args.dec_block_arch[:]
        else:
            self.dec = DecoderRNNT(
                odim,
                args.dtype,
                args.dlayers,
                args.dunits,
                blank_id,
                args.dec_embed_dim,
                args.dropout_rate_decoder,
                args.dropout_rate_embed_decoder,
            )
            decoder_out = args.dunits

        self.joint_network = JointNetwork(
            odim, encoder_out, decoder_out, args.joint_dim, args.joint_activation_type
        )

        # Attention Rescore
        if self.use_att_scorer:
            self.att_scorer = Decoder(
                odim=odim,
                selfattention_layer_type=args.att_decoder_selfattn_layer_type,
                attention_dim=args.att_adim,
                attention_heads=args.att_aheads,
                conv_wshare=args.att_wshare,
                conv_kernel_length=args.att_ldconv_decoder_kernel_length,
                conv_usebias=args.att_ldconv_usebias,
                linear_units=args.att_dunits,
                num_blocks=args.att_dlayers,
                dropout_rate=args.att_dropout_rate,
                positional_dropout_rate=args.att_dropout_rate,
                self_attention_dropout_rate=args.att_attn_dropout_rate,
                src_attention_dropout_rate=args.att_attn_dropout_rate,
            )
            self.att_scorer_criterion = LabelSmoothingLoss(
                odim,
                ignore_id,
                args.lsm_weight,
                args.att_length_normalized_loss,
            )
        else:
            # The scorer lives in ``att_scorer`` when enabled, so that is the
            # attribute that must exist (as None) when it is disabled.
            self.att_scorer = None
            # Kept so code reading the previous attribute name still works.
            self.attention_scorer = None
            self.att_scorer_criterion = None

        if hasattr(self, "most_dom_list"):
            # Sorted by dimension in descending order, so this selects the
            # largest "d_hidden" found in the custom block definitions.
            self.most_dom_dim = sorted(
                Counter(
                    d["d_hidden"] for d in self.most_dom_list if "d_hidden" in d
                ).most_common(),
                key=lambda x: x[0],
                reverse=True,
            )[0][0]

        self.etype = args.etype
        self.dtype = args.dtype

        self.sos = odim - 1
        self.eos = odim - 1
        self.blank_id = blank_id
        self.ignore_id = ignore_id

        self.space = args.sym_space
        self.blank = args.sym_blank

        self.odim = odim

        self.reporter = Reporter()

        self.error_calculator = None

        self.default_parameters(args)

        self.criterion = TransLoss(args.trans_type, self.blank_id)
        if training:

            decoder = self.decoder if self.dtype == "custom" else self.dec

            if args.report_cer or args.report_wer:
                self.error_calculator = ErrorCalculator(
                    decoder,
                    self.joint_network,
                    args.char_list,
                    args.sym_space,
                    args.sym_blank,
                    args.report_cer,
                    args.report_wer,
                )

            if self.use_aux_task:
                self.auxiliary_task = AuxiliaryTask(
                    decoder,
                    self.joint_network,
                    self.criterion,
                    args.aux_task_type,
                    args.aux_task_weight,
                    encoder_out,
                    args.joint_dim,
                )

        if self.use_aux_ctc:
            self.aux_ctc = ctc_for(
                Namespace(
                    num_encs=1,
                    eprojs=encoder_out,
                    dropout_rate=args.aux_ctc_dropout_rate,
                    ctc_type="warpctc",
                ),
                odim,
            )

        if self.use_aux_mmi:
            # assert self.use_aux_ctc # ctc is needed for aishell-1 but not for librispeech
            device = (
                torch.device(f"cuda:{args.local_rank}")
                if torch.cuda.is_available()
                else torch.device("cpu")
            )
            aux_mmi_module = K2MMI if args.aux_mmi_type == "mmi" else K2CTC
            self.aux_mmi = aux_mmi_module(
                idim=encoder_out,
                lang=args.lang,
                char_list=args.char_list,
                device=device,
                dropout=args.aux_mmi_dropout_rate,
                den_scale=args.den_scale,
                eos_id=self.eos,
                use_segment=args.use_segment,
            )

        if self.use_aux_cross_entropy:
            self.aux_decoder_output = torch.nn.Linear(decoder_out, odim)

            self.aux_cross_entropy = LabelSmoothingLoss(
                odim, ignore_id, args.aux_cross_entropy_smoothing
            )

        if self.use_aux_mbr:
            assert args.resume is not None  # need a seed model
            self.beam_search = BeamSearchTransducer(
                decoder=self.decoder if "custom" in self.dtype else self.dec,
                joint_network=self.joint_network,
                beam_size=self.aux_mbr_beam,
                nbest=self.aux_mbr_beam,
                search_type='alsd',
            )
            self.char_list = args.char_list

            # The MBR loss needs per-utterance losses, hence a separate
            # criterion built without reduction.
            self.mbr_trans_type = args.trans_type
            if args.trans_type == "warp-transducer":
                from warprnnt_pytorch import RNNTLoss
                self.mbr_trans_loss = RNNTLoss(blank=self.blank_id, reduction="none")
            elif args.trans_type == "warp-rnnt":
                from warp_rnnt import rnnt_loss
                self.mbr_trans_loss = rnnt_loss
            print("built beam search decoder for MBR")

        self.loss = None
        self.rnnlm = None

    def default_parameters(self, args):
        """Run the transducer ``initializer`` on this model.

        Args:
            args (Namespace): argument Namespace forwarded to ``initializer``

        """
        initializer(self, args)

    def forward(self, xs_pad, ilens, ys_pad, texts, xs_pad_orig):
        """E2E forward.

        Args:
            xs_pad (torch.Tensor): batch of padded source sequences (B, Tmax, idim)
            ilens (torch.Tensor): batch of lengths of input sequences (B)
            ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax)
            texts: passed unchanged to the auxiliary CTC / MMI modules
            xs_pad_orig (torch.Tensor): input batch handed to ``mbr_forward``
                for on-the-fly decoding; only read when the MBR loss is enabled

        Returns:
            loss (torch.Tensor): transducer loss plus every enabled auxiliary loss

        """
        # 1. encoder
        xs_pad = xs_pad[:, : max(ilens)]

        if "custom" in self.etype:
            src_mask = make_non_pad_mask(ilens.tolist()).to(xs_pad.device).unsqueeze(-2)

            _hs_pad, hs_mask = self.encoder(xs_pad, src_mask)
        else:
            _hs_pad, hs_mask, _ = self.enc(xs_pad, ilens)

        if self.use_aux_task:
            hs_pad, aux_hs_pad = _hs_pad[0], _hs_pad[1]
        else:
            hs_pad, aux_hs_pad = _hs_pad, None

        # 1.5. transducer preparation related
        #   ys_in_pad : ys with blank_id in head. For decoder forward
        #   ys_out_pad : ys with ignore_id in tail. For aux task
        #   target: ys with padding only, for RNNT loss computation
        #   pred_len: real length of hs_mask
        #   target_len: real length of target
        ys_in_pad, ys_out_pad, target, pred_len, target_len = prepare_loss_inputs(
            ys_pad, hs_mask
        )

        if self.use_aux_mbr:
            loss_mbr = self.mbr_forward(xs_pad_orig, ilens, ys_pad, hs_pad)
            loss_mbr *= self.aux_mbr_weight
        else:
            loss_mbr = 0.0

        # 2. decoder
        if "custom" in self.dtype:
            ys_mask = target_mask(ys_in_pad, self.blank_id)
            pred_pad, _ = self.decoder(ys_in_pad, ys_mask)
        else:
            pred_pad = self.dec(hs_pad, ys_in_pad)

        z = self.joint_network(hs_pad.unsqueeze(2), pred_pad.unsqueeze(1))

        # 3. loss computation
        loss_trans = self.criterion(z, target, pred_len, target_len)

        if self.use_aux_task and aux_hs_pad is not None:
            loss_aux_trans, loss_aux_symm_kl = self.auxiliary_task(
                aux_hs_pad, pred_pad, z, target, pred_len, target_len
            )
        else:
            loss_aux_trans, loss_aux_symm_kl = 0.0, 0.0

        if self.use_aux_ctc or self.use_aux_mmi:
            if "custom" in self.etype:
                # NOTE(review): every entry is the padded length of the mask,
                # not the number of valid frames per utterance - confirm that
                # this is intended.
                hlen = torch.IntTensor(
                    [h.size(1) for h in hs_mask],
                ).to(hs_mask.device)
            else:
                # Previously left undefined, which made the auxiliary CTC/MMI
                # losses crash for RNN encoders. Assumes the RNN encoder
                # returns its output lengths as the second value (as upstream
                # ESPnet does) - TODO confirm.
                hlen = hs_mask

        if self.use_aux_ctc:
            loss_ctc = self.aux_ctc_weight * self.aux_ctc(hs_pad, hlen, ys_pad, texts)
        else:
            loss_ctc = 0.0

        if self.use_aux_mmi:
            loss_mmi = self.aux_mmi_weight * self.aux_mmi(hs_pad, hlen, ys_pad, texts)
        else:
            loss_mmi = 0.0

        if self.use_aux_cross_entropy:
            loss_lm = self.aux_cross_entropy_weight * self.aux_cross_entropy(
                self.aux_decoder_output(pred_pad), ys_out_pad
            )
        else:
            loss_lm = 0.0

        if self.use_att_scorer:
            # pred_pad is reused for the attention decoder output; the
            # transducer branch no longer needs it at this point.
            ys_mask = target_mask(ys_in_pad, self.ignore_id)
            pred_pad, _ = self.att_scorer(ys_in_pad, ys_mask, hs_pad, hs_mask)
            loss_att = self.att_scorer_criterion(pred_pad, ys_out_pad)
            loss_att *= self.att_scorer_weight
        else:
            loss_att = 0.0

        loss = (
            loss_trans
            + self.transducer_weight * (loss_aux_trans + loss_aux_symm_kl)
            + loss_ctc
            + loss_mmi
            + loss_lm
            + loss_mbr
            + loss_att
        )

        self.loss = loss
        loss_data = float(loss)

        # 4. compute cer/wer
        if self.training or self.error_calculator is None:
            cer, wer = None, None
        else:
            cer, wer = self.error_calculator(hs_pad, ys_pad)

        # Only report finite-or-inf values; a NaN loss is logged instead.
        if not math.isnan(loss_data):
            self.reporter.report(
                loss_data,
                float(loss_trans),
                float(loss_ctc),
                float(loss_lm),
                float(loss_aux_trans),
                float(loss_aux_symm_kl),
                float(loss_mbr),
                float(loss_mmi),
                float(loss_att),
                cer,
                wer,
            )
        else:
            logging.warning("loss (=%f) is not correct", loss_data)

        return self.loss

        
    def mbr_forward(self, xs_pad_orig, ilens, ys_pad, hs_pad):
        """Compute the auxiliary MBR loss from on-the-fly n-best decoding.

        The batch is decoded with ``self.beam_search``, each hypothesis is
        scored against its reference by edit distance, and the loss is the
        edit distance averaged under the (renormalised) transducer
        probabilities of the hypotheses.

        Args:
            xs_pad_orig (torch.Tensor): input batch used for decoding
                (the one without data augmentation)
            ilens (torch.Tensor): batch of lengths of input sequences (B)
            ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax)
            hs_pad (torch.Tensor): encoder output computed in ``forward``;
                used for the differentiable decoder / joint-network pass

        Returns:
            torch.Tensor or float: the MBR loss averaged over the batch, or
                ``0.0`` when decoding did not yield ``aux_mbr_beam``
                hypotheses for every utterance.

        Raises:
            NotImplementedError: if the transducer loss type has no
                per-utterance implementation here.

        """
        batch_size = len(ilens)
        beam = self.aux_mbr_beam

        def _decode(enc_out):
            # torch.no_grad() is thread-local: the block entered by the calling
            # thread does not apply inside the worker threads.
            with torch.no_grad():
                return self.beam_search(enc_out)

        # (1) on-the-fly decoding
        # Remember the current mode so evaluation runs are not flipped into
        # training mode, and restore it even if decoding raises.
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                # decode without data augmentation (a.k.a., xs_pad_orig)
                if "custom" in self.etype:
                    src_mask = (
                        make_non_pad_mask(ilens.tolist())
                        .to(xs_pad_orig.device)
                        .unsqueeze(-2)
                    )
                    specs, hs_mask = self.encoder(xs_pad_orig, src_mask)
                else:
                    specs, hs_mask, _ = self.enc(xs_pad_orig, ilens)

                if self.use_aux_task:
                    # Same unpacking as forward(): with the auxiliary task
                    # enabled the encoder also returns intermediate outputs.
                    specs = specs[0]

                hlens = [int(h[h != 0].size(0)) for h in hs_mask]
                specs = [h[:l] for h, l in zip(specs, hlens)]

                # multi-thread on-the-fly decoding on GPU
                # It is very inefficient to do on-the-fly decoding.
                # We've tried multi-process but failed since the dataloader
                #   cannot work in forked process
                # Multi-thread is used. Remember to use
                #   'export OMP_NUM_THREADS=<ncpu>' to achieve faster decoding
                with ThreadPoolExecutor(max_workers=beam) as executor:
                    futures = [executor.submit(_decode, h) for h in specs]
                    wait(futures, return_when=ALL_COMPLETED)

                hyps = []
                for future in futures:
                    hyps.extend(future.result())
                hyps = [h.yseq[1:] for h in hyps]  # exclude <sos>
        finally:
            self.train(was_training)

        if not len(hyps) == beam * batch_size:
            print("WARNNING: on-the-decoding fail in this iteration.")
            return 0.0

        # (2) compute edit distance
        dist = self.compute_edit_distance(hyps, ys_pad)

        if dist is None:
            print("Warning: An error encountered when editing distance", flush=True)
            return 0.0  # fail in editdistance.

        # (3) RNN-T loss computation
        # prepare many inputs
        hyp_maxlen = max(len(hyp) for hyp in hyps)
        hyps_pad = [hyp + [self.ignore_id] * (hyp_maxlen - len(hyp)) for hyp in hyps]
        hyps_pad = torch.tensor(hyps_pad, dtype=ys_pad.dtype, device=ys_pad.device)

        hyps_in_pad, hyps_out_pad, target, pred_len, target_len = prepare_loss_inputs(
            hyps_pad, hs_mask
        )

        # Repeat every utterance `beam` times so encoder frames line up with
        # the flattened (batch * beam) hypotheses.
        idx = torch.arange(batch_size).repeat_interleave(beam)
        pred_len = pred_len[idx]
        hs_pad = hs_pad[idx]

        # decoder and joint-net forward
        # We are not sure which hs_pad should be used in decoder forward.
        # Currently we are using the hs_pad from xs_pad, since we consider
        # the encoder should also receive the gradient from denominator
        if "custom" in self.dtype:
            hyps_mask = target_mask(hyps_in_pad, self.blank_id)
            # Called with the same two arguments as in forward().
            pred_pad, _ = self.decoder(hyps_in_pad, hyps_mask)
        else:
            pred_pad = self.dec(hs_pad, hyps_in_pad)

        z = self.joint_network(hs_pad.unsqueeze(2), pred_pad.unsqueeze(1))

        # loss computation
        # we need reduction = 'none' for utt-level probability
        # code for warp-rnnt is not tested
        if self.mbr_trans_type == "warp-rnnt":
            log_probs = torch.log_softmax(z, dim=-1)
            loss_trans = self.mbr_trans_loss(
                log_probs,
                target,
                pred_len,
                target_len,
                reduction=None,
                blank=self.blank_id,
                gather=True,
            )
        elif self.mbr_trans_type == "warp-transducer":
            loss_trans = self.mbr_trans_loss(z, target, pred_len, target_len)
        else:
            raise NotImplementedError(
                "MBR loss is not implemented for trans_type="
                + str(self.mbr_trans_type)
            )

        # This is exactly posterior P(W|O)
        loss_trans = (-loss_trans).exp()

        # (4) MBR loss.
        num = (loss_trans * dist).view(batch_size, beam)
        den = loss_trans.view(batch_size, beam)
        loss_mbr = num.sum(dim=-1) / den.sum(dim=-1)
        loss_mbr = loss_mbr.mean()  # RNN-T Loss also works in reduction=mean

        return loss_mbr
 
    def compute_edit_distance(self, hyps, refs):
        """Edit distance between every hypothesis and its reference.

        Args:
            hyps (list): list of token-id lists, ``batch * beam`` entries,
                grouped so that consecutive ``beam`` entries share a reference
            refs (torch.Tensor): 2-D tensor of labels padded with
                ``self.ignore_id``

        Returns:
            torch.IntTensor: one distance per hypothesis, on the device of
                ``refs``

        Raises:
            ValueError: if ``len(hyps)`` is not a multiple of the batch size

        """
        device = refs.device

        # plain python lists with the padding stripped
        ref_seqs = [
            [tok for tok in row if tok != self.ignore_id]
            for row in refs.cpu().tolist()
        ]

        if len(hyps) % len(ref_seqs) != 0:
            raise ValueError("The number of hypotheses is not correct")

        beam = len(hyps) // len(ref_seqs)

        distances = []
        for i, hyp in enumerate(hyps):
            distances.append(editdistance.eval(hyp, ref_seqs[i // beam]))

        return torch.IntTensor(distances).to(device)

    def encode_custom(self, x):
        """Run the custom encoder on a single utterance.

        Args:
            x (ndarray): input acoustic feature (T, D)

        Returns:
            torch.Tensor: encoder output with the batch axis removed

        """
        batch = torch.as_tensor(x).unsqueeze(0)

        # no mask is needed for a single, unpadded utterance
        encoded = self.encoder(batch, None)[0]

        return encoded.squeeze(0)

    def encode_rnn(self, x):
        """Run the RNN encoder on a single utterance.

        Args:
            x (ndarray): input acoustic feature (T, D)

        Returns:
            torch.Tensor: encoder output with the batch axis removed

        """
        # any parameter tells us the device / dtype the model lives on
        ref_param = next(self.parameters())

        # length is taken before the frame subsampling below
        n_frames = x.shape[0]
        subsampled = x[:: self.subsample[0], :]

        feats = torch.as_tensor(
            subsampled, device=ref_param.device, dtype=ref_param.dtype
        )
        batch = feats.contiguous().unsqueeze(0)

        encoded = self.enc(batch, [n_frames])[0]

        return encoded.squeeze(0)

    def recognize(self, x, beam_search, decode_feature="combine"):
        """Decode one utterance with the given beam search.

        Args:
            x (ndarray): input acoustic feature (T, D)
            beam_search (class): beam search class
            decode_feature (str): must be "combine"

        Returns:
            nbest_hyps (list): n-best decoding results, one dict per hypothesis

        """
        assert decode_feature == "combine" # other method only for code-switch

        self.eval()

        encode = self.encode_custom if "custom" in self.etype else self.encode_rnn
        enc_out = encode(x)

        return [asdict(hyp) for hyp in beam_search(enc_out)]

    def calculate_all_attentions(self, xs_pad, ilens, ys_pad, texts, xs_pad_orig):
        """E2E attention calculation.

        Args:
            xs_pad (torch.Tensor): batch of padded input sequences (B, Tmax, idim)
            ilens (torch.Tensor): batch of lengths of input sequences (B)
            ys_pad (torch.Tensor):
                batch of padded character id sequence tensor (B, Lmax)
            texts: forwarded unchanged to ``forward``
            xs_pad_orig (torch.Tensor): forwarded unchanged to ``forward``

        Returns:
            ret (dict or list): mapping from the name of every
                (rel-position) multi-head attention module to its attention
                weights as an ndarray; an empty list when neither the encoder
                nor the decoder is a custom one.

        """
        if "custom" not in self.etype and "custom" not in self.dtype:
            # Nothing to collect: return before touching the train/eval mode,
            # otherwise the model would be left in eval mode.
            return []

        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                self.forward(xs_pad, ilens, ys_pad, texts, xs_pad_orig)

            ret = dict()
            for name, m in self.named_modules():
                if isinstance(
                    m, (MultiHeadedAttention, RelPositionMultiHeadedAttention)
                ):
                    ret[name] = m.attn.cpu().numpy()
        finally:
            # restore the mode the caller had, even if forward raised
            self.train(was_training)

        return ret


================================================
FILE: nets/pytorch_backend/e2e_asr_transducer_cs.py
================================================
# Author: Jinchuan Tian; tianjinchuan@stu.pku.edu.cn ; tyriontian@tencent.com
# Neural Transducer model for code-switch (bilingual problem)

from argparse import Namespace
from collections import Counter, defaultdict
from dataclasses import asdict

import torch
import chainer
import numpy
import math
import logging
from itertools import groupby
from typing import Tuple, List
from espnet.nets.asr_interface import ASRInterface
from espnet.nets.pytorch_backend.ctc import ctc_for
from espnet.nets.pytorch_backend.nets_utils import get_subsample
from espnet.nets.pytorch_backend.nets_utils import make_non_pad_mask
from espnet.nets.pytorch_backend.transducer.arguments import (
    add_encoder_general_arguments,  # noqa: H301
    add_custom_encoder_arguments,  # noqa: H301
    add_decoder_general_arguments,  # noqa: H301
    add_rnn_decoder_arguments,  # noqa: H301
    add_custom_training_arguments,  # noqa: H301
    add_transducer_arguments,  # noqa: H301
    add_auxiliary_task_arguments,  # noqa: H301
    add_att_scorer_arguments,
    add_transducer_code_switch_arguments,
)
from espnet.nets.pytorch_backend.transducer.custom_encoder import CustomEncoder
from espnet.nets.pytorch_backend.transducer.error_calculator import ErrorCalculator
from espnet.nets.pytorch_backend.transducer.initializer import initializer
from espnet.nets.pytorch_backend.transducer.joint_network import JointNetwork
from espnet.nets.pytorch_backend.transducer.loss import TransLoss
from espnet.nets.pytorch_backend.transducer.rnn_decoder import DecoderRNNT
from espnet.nets.pytorch_backend.transformer.attention import (
    MultiHeadedAttention,  # noqa: H301
    RelPositionMultiHeadedAttention,  # noqa: H301
)
from espnet.nets.pytorch_backend.transformer.plot import PlotAttentionReport
from espnet.utils.fill_missing_args import fill_missing_args
from espnet.nets.pytorch_backend.transducer.utils import prepare_loss_inputs
from espnet.nets.pytorch_backend.e2e_asr import pad_list
from espnet.nets.transducer_decoder_interface import Hypothesis


class Reporter(chainer.Chain):
    """Chainer chain used to publish the code-switch transducer statistics."""

    def report(
        self,
        loss,
        loss_trans,
        loss_ctc,
        loss_lm,
        loss_aux_trans,
        loss_aux_symm_kl,
        loss_mbr,
        loss_mmi,
        loss_att,
        loss_lang,
        cer,
        wer,
    ):
        """Report every loss term and error rate under its own key.

        Each value is passed to ``chainer.reporter.report`` with this object
        as the observer, in the order listed below; the total loss is also
        written to the log.
        """
        observations = (
            ("loss", loss),
            ("loss_trans", loss_trans),
            ("loss_ctc", loss_ctc),
            ("loss_lm", loss_lm),
            ("loss_aux_trans", loss_aux_trans),
            ("loss_aux_symm_kl", loss_aux_symm_kl),
            ("loss_mbr", loss_mbr),
            ("loss_mmi", loss_mmi),
            ("loss_att", loss_att),
            ("loss_lang", loss_lang),
            ("cer", cer),
            ("wer", wer),
        )
        for key, value in observations:
            chainer.reporter.report({key: value}, self)

        logging.info("loss:" + str(loss))

class E2E(ASRInterface, torch.nn.Module):
    """E2E module for transducer models.

    Args:
        idim (int): dimension of inputs
        odim (int): dimension of outputs
        args (Namespace): argument Namespace containing options
        ignore_id (int): padding symbol id
        blank_id (int): blank symbol id

    """

    @staticmethod
    def add_arguments(parser):
        """Register every option group of this transducer model on `parser`.

        The groups are added in a fixed order: encoder (general, custom),
        decoder (general, RNN), custom training, transducer, auxiliary task
        and finally the code-switch options. The same parser is returned.
        """
        registrars = (
            E2E.encoder_add_general_arguments,
            E2E.encoder_add_custom_arguments,
            E2E.decoder_add_general_arguments,
            E2E.decoder_add_rnn_arguments,
            E2E.training_add_custom_arguments,
            E2E.transducer_add_arguments,
            E2E.auxiliary_task_add_arguments,
            E2E.transducer_add_code_switch_arguments,
        )
        for register in registrars:
            register(parser)
        return parser

    @staticmethod
    def encoder_add_general_arguments(parser):
        """Create the "Encoder general arguments" group and fill it.

        The options themselves are defined by `add_encoder_general_arguments`;
        the parser passed in is returned.
        """
        add_encoder_general_arguments(
            parser.add_argument_group("Encoder general arguments")
        )
        return parser

    @staticmethod
    def encoder_add_custom_arguments(parser):
        """Create the "Custom encoder arguments" group and fill it.

        The options themselves are defined by `add_custom_encoder_arguments`;
        the parser passed in is returned.
        """
        add_custom_encoder_arguments(
            parser.add_argument_group("Custom encoder arguments")
        )
        return parser

    @staticmethod
    def decoder_add_general_arguments(parser):
        """Create the "Decoder general arguments" group and fill it.

        The options themselves are defined by `add_decoder_general_arguments`;
        the parser passed in is returned.
        """
        add_decoder_general_arguments(
            parser.add_argument_group("Decoder general arguments")
        )
        return parser

    @staticmethod
    def decoder_add_rnn_arguments(parser):
        """Create the "RNN decoder arguments" group and fill it.

        The options themselves are defined by `add_rnn_decoder_arguments`;
        the parser passed in is returned.
        """
        add_rnn_decoder_arguments(
            parser.add_argument_group("RNN decoder arguments")
        )
        return parser

    @staticmethod
    def training_add_custom_arguments(parser):
        """Create the custom-architecture training group and fill it.

        The options themselves are defined by `add_custom_training_arguments`;
        the parser passed in is returned.
        """
        add_custom_training_arguments(
            parser.add_argument_group("Training arguments for custom archictecture")
        )
        return parser

    @staticmethod
    def transducer_add_arguments(parser):
        """Create the "Transducer model arguments" group and fill it.

        The options themselves are defined by `add_transducer_arguments`;
        the parser passed in is returned.
        """
        add_transducer_arguments(
            parser.add_argument_group("Transducer model arguments")
        )
        return parser

    @staticmethod
    def transducer_add_code_switch_arguments(parser):
        """Create the "Transducer code switch arguments" group and fill it.

        The options themselves are defined by
        `add_transducer_code_switch_arguments`; the parser passed in is
        returned.
        """
        add_transducer_code_switch_arguments(
            parser.add_argument_group("Transducer code switch arguments")
        )
        return parser

    @staticmethod
    def auxiliary_task_add_arguments(parser):
        """Create the "Auxiliary task arguments" group and fill it.

        The options themselves are defined by `add_auxiliary_task_arguments`;
        the parser passed in is returned.
        """
        add_auxiliary_task_arguments(
            parser.add_argument_group("Auxiliary task arguments")
        )
        return parser

    def get_total_subsampling_factor(self):
        """Return the overall time-reduction factor of the encoder stack.

        The convolutional factor is read from the shared encoder when one is
        configured and from the Chinese encoder otherwise, then multiplied by
        the product of the per-layer ``self.subsample`` factors.
        """
        front_encoder = (
            self.shared_encoder if self.shared_encoder else self.chn_encoder
        )
        layer_factor = int(numpy.prod(self.subsample))
        return front_encoder.conv_subsampling_factor * layer_factor

    def __init__(self, idim, odim, args, ignore_id=-1, blank_id=0, training=True):
        """Construct an E2E object for the code-switch transducer model.

        Builds an optional shared encoder, two identically configured
        language-specific encoders (``chn_encoder`` / ``eng_encoder``), an RNN
        transducer decoder with its joint network, an optional language
        classifier and the loss criteria.

        Args:
            idim (int): dimension of inputs
            odim (int): dimension of outputs (vocabulary size)
            args (Namespace): argument Namespace containing options; missing
                entries are filled from ``add_arguments`` defaults
            ignore_id (int): padding symbol id
            blank_id (int): blank symbol id
            training (bool): not referenced in this constructor

        Raises:
            NotImplementedError: if ``args.cs_decoder_expert`` is set

        """
        """ By default we only adopt Custom Encoder and RNN Decoder """
        torch.nn.Module.__init__(self)

        args = fill_missing_args(args, self.add_arguments)

        ### Common transducer configs ###
        self.is_rnnt = True # legacy
        self.transducer_weight = args.transducer_weight
        self.etype = "custom" # legacy
        self.dtype = "rnn"

        # <sos> and <eos> share the last vocabulary index.
        self.sos = odim - 1
        self.eos = odim - 1
        self.blank_id = blank_id
        self.ignore_id = ignore_id

        ### code-switch parameters ###
        # Language ids are placed right after the vocabulary. forward() reads
        # one of them from the first label column of every utterance.
        self.chn_id = odim
        self.eng_id = odim + 1
        self.cs_id = odim + 2
        # Vocabulary offsets consumed by monolingual_mask(): Chinese tokens lie
        # in [chn_start, eng_start), English tokens in [eng_start, odim).
        self.chn_start = args.cs_chn_start
        self.eng_start = args.cs_eng_start

        self.use_adversial_examples = args.cs_use_adversial_examples
        self.is_ctc_decoder = args.cs_is_ctc_decoder
        self.is_pretrain = args.cs_is_pretrain
        self.use_decoder_expert = args.cs_decoder_expert

        # Weights of the auxiliary CTC and language-classification terms in
        # the total loss computed by forward().
        self.aux_ctc_weight = args.aux_ctc_weight
        self.lang_weight = args.cs_lang_weight

        self.space = args.sym_space
        self.blank = args.sym_blank
        self.odim = odim
        self.reporter = Reporter()
        self.error_calculator = None

        ### Modules ###
        # Optional front-end shared by both languages.
        if args.cs_share_encoder:
            self.shared_encoder = CustomEncoder(
                idim=idim,
                enc_arch=args.enc_block_arch,
                input_layer=args.custom_enc_input_layer,
                repeat_block=args.cs_share_encoder_layers,
                self_attn_type=args.custom_enc_self_attn_type,
                positional_encoding_type=args.custom_enc_positional_encoding_type,
                positionwise_activation_type=args.custom_enc_pw_activation_type,
                conv_mod_activation_type=args.custom_enc_conv_mod_activation_type,
            )
        else:
            self.shared_encoder = None

        # When a shared encoder is used, the chn/eng encoders consume its
        # output (input size = shared enc_out) and get a "null" input layer,
        # i.e. no CNN front-end of their own.
        enc_params = dict(
            idim=idim if not args.cs_share_encoder else self.shared_encoder.enc_out,
            enc_arch=args.enc_block_arch,
            input_layer=args.custom_enc_input_layer if not args.cs_share_encoder else "null",
            repeat_block=args.enc_block_repeat,
            self_attn_type=args.custom_enc_self_attn_type,
            positional_encoding_type=args.custom_enc_positional_encoding_type,
            positionwise_activation_type=args.custom_enc_pw_activation_type,
            conv_mod_activation_type=args.custom_enc_conv_mod_activation_type,
        )
        # Both language encoders are built from the same parameter dict so
        # their settings are guaranteed to be identical.
        self.chn_encoder = CustomEncoder(**enc_params)
        self.eng_encoder = CustomEncoder(**enc_params)

        self.subsample = get_subsample(args, mode="asr", arch="transformer")
        encoder_out = self.chn_encoder.enc_out

        self.most_dom_list = args.enc_block_arch[:]
        # Sorting the (d_hidden, count) pairs by d_hidden in descending order
        # and taking the first one selects the largest d_hidden value among
        # the encoder blocks.
        self.most_dom_dim = sorted(
            Counter(
                d["d_hidden"] for d in self.most_dom_list if "d_hidden" in d
            ).most_common(),
            key=lambda x: x[0],
            reverse=True,
        )[0][0]


        # Positional arguments of DecoderRNNT.
        dec_param = (
                odim,
                args.dtype,
                args.dlayers,
                args.dunits,
                blank_id,
                args.dec_embed_dim,
                args.dropout_rate_decoder,
                args.dropout_rate_embed_decoder,
        )
        if self.use_decoder_expert:
            # Decoder experts are declared as an option but not available.
            raise NotImplementedError
        else:
            self.dec = DecoderRNNT(*dec_param)

        decoder_out = args.dunits

        self.joint_network = JointNetwork(
            odim, encoder_out, decoder_out, args.joint_dim, args.joint_activation_type
        )

        # The language classifier only exists when its loss is enabled; it
        # maps an encoder-sized vector to 3 classes.
        if self.lang_weight > 0.0:
            self.lang_classifer = torch.nn.Sequential(
                                        torch.nn.Linear(encoder_out, 2 * encoder_out),
                                        torch.nn.ReLU(),
                                        torch.nn.Linear(2 * encoder_out, 3),
                                      )

        # Called before the criteria below are created, so the initializer
        # can only see the modules registered up to this point.
        self.default_parameters(args)

        ### Criterion ###
        self.criterion = TransLoss(args.trans_type, self.blank_id)
        # Auxiliary CTC head applied to each language-specific encoder output;
        # reduce=False is requested so forward() can select utterances.
        self.aux_ctc = ctc_for(
                Namespace(
                    num_encs=1,
                    eprojs=encoder_out,
                    dropout_rate=args.aux_ctc_dropout_rate,
                    ctc_type="warpctc",
                ),
                odim,
                reduce=False,
        )
        # Separate CTC head used instead of the transducer loss when
        # is_ctc_decoder is set; configured exactly like aux_ctc.
        self.decoder_ctc = ctc_for(
                Namespace(
                    num_encs=1,
                    eprojs=encoder_out,
                    dropout_rate=args.aux_ctc_dropout_rate,
                     ctc_type="warpctc",
                ),
                odim,
                reduce=False,
        )
        self.lang_cls_criterion = torch.nn.CrossEntropyLoss()

        self.loss = None
        self.rnnlm = None
        self.lms = {} # ngram LMs. Set externally during decoding

    def default_parameters(self, args):
        """Initialize/reset parameters for transducer.

        Thin wrapper that hands this module and the options to the
        transducer ``initializer``.

        Args:
            args (Namespace): argument Namespace containing options

        """
        initializer(self, args)

    ### Training Implementation  ###
    def forward(self, xs_pad, ilens, ys_pad, texts, xs_pad_orig):
        """E2E forward.

        Args:
            xs_pad (torch.Tensor): batch of padded source sequences (B, Tmax, idim)
            ilens (torch.Tensor): batch of lengths of input sequences (B)
            ys_pad (torch.Tensor): batch of padded target sequences. Column 0
                holds the language id of each utterance (compared against
                ``chn_id`` / ``eng_id`` below); the remaining columns are the
                token labels.
            texts: forwarded unchanged to the CTC criteria
            xs_pad_orig: not used by this model; kept so the signature matches
                the other E2E models

        Returns:
            loss: decoder loss + ``aux_ctc_weight`` * auxiliary CTC loss +
                ``lang_weight`` * language classification loss. Disabled terms
                contribute the float ``0.0``.
        """
        # 0. process labels
        # Column 0 is already a 1-D (B,) tensor. It must not be squeezed:
        # with B == 1 a squeeze would turn it into a 0-d tensor, which makes
        # the per-utterance index selection in step 3 select nothing and
        # removes the batch axis from the classifier target in step 4.
        ys_pad, cls_ids = ys_pad[:, 1:], ys_pad[:, 0]

        # 1. forward encoder
        xs_pad = xs_pad[:, : max(ilens)]
        src_mask = make_non_pad_mask(ilens.tolist()).to(xs_pad.device).unsqueeze(-2)

        if self.shared_encoder:
            hs_pad, hs_mask = self.shared_encoder(
                xs_pad, src_mask, return_as_intermidiate=True
            )
        else:
            hs_pad, hs_mask = xs_pad, src_mask

        chn_hs_pad, chn_hs_mask = self.chn_encoder(hs_pad, hs_mask)
        eng_hs_pad, eng_hs_mask = self.eng_encoder(hs_pad, hs_mask)

        hs_pad, hs_mask = self.combine_fn(
            chn_hs_pad, eng_hs_pad, chn_hs_mask, eng_hs_mask
        )

        # 2. Decoder loss: either RNNT or CTC (skipped entirely in pre-training)
        if not self.is_pretrain:
            if not self.is_ctc_decoder:
                ys_in_pad, ys_out_pad, target, pred_len, target_len = (
                    prepare_loss_inputs(ys_pad, hs_mask)
                )

                pred_pad = self.dec(hs_pad, ys_in_pad)

                z = self.joint_network(hs_pad.unsqueeze(2), pred_pad.unsqueeze(1))
                loss_dec = self.criterion(z, target, pred_len, target_len)
            else:
                # NOTE(review): every entry is the padded mask width, not the
                # number of valid frames of that utterance -- confirm intended.
                hlen = torch.IntTensor([h.size(1) for h in hs_mask]).to(hs_mask.device)
                loss_dec = self.decoder_ctc(hs_pad, hlen, ys_pad, texts).sum()
        else:
            loss_dec = 0.0

        # 3. auxiliary CTC on each language-specific encoder output
        if self.aux_ctc_weight > 0.0:
            chn_ys_pad, eng_ys_pad = self.monolingual_mask(ys_pad)
            hlen = torch.IntTensor([h.size(1) for h in chn_hs_mask]).to(chn_hs_mask.device)
            loss_ctc_chn = self.aux_ctc(chn_hs_pad, hlen, chn_ys_pad, texts)
            loss_ctc_eng = self.aux_ctc(eng_hs_pad, hlen, eng_ys_pad, texts)

            # In fine-tuning we must compute two ctc loss for each utt
            if self.use_adversial_examples:
                loss_ctc = (loss_ctc_chn + loss_ctc_eng).sum() / 2
            else:
                # The Chinese head skips utterances tagged as English and the
                # English head skips utterances tagged as Chinese; any other
                # tag contributes to both.
                chn_indices = torch.nonzero(cls_ids != self.eng_id).squeeze(1)
                eng_indices = torch.nonzero(cls_ids != self.chn_id).squeeze(1)
                loss_ctc = (
                    loss_ctc_chn[chn_indices].sum() + loss_ctc_eng[eng_indices].sum()
                )
        else:
            loss_ctc = 0.0

        # 4. language prediction loss on the time-averaged encoder output;
        # subtracting chn_id maps the language ids to class indices from 0.
        if self.lang_weight > 0.0:
            loss_lang = self.lang_cls_criterion(
                self.lang_classifer(hs_pad.mean(dim=1)),
                cls_ids - self.chn_id,
            )
        else:
            loss_lang = 0.0

        # 5. aggregate loss and report
        loss = (
            loss_dec
            + loss_ctc * self.aux_ctc_weight
            + loss_lang * self.lang_weight
        )

        self.loss = loss
        loss_data = float(loss)

        # Some report keys are not used here and are filled with 0.0
        if not math.isnan(loss_data):
            self.reporter.report(
                loss_data,
                float(loss_dec),
                float(loss_ctc),
                0.0,
                0.0,
                0.0,
                0.0,
                0.0,
                0.0,
                float(loss_lang),
                0.0,
                0.0,
            )
        else:
            logging.warning("loss (=%f) is not correct", loss_data)

        return self.loss

    # You may want to revise this function to combine encoder_output differently
    def combine_fn(self, chn_hs_pad, eng_hs_pad, chn_hs_mask, eng_hs_mask):
        """Fuse the two language-specific encoder outputs by addition.

        Returns the sum of both outputs together with the Chinese-branch
        mask; ``eng_hs_mask`` is accepted but not used.
        """
        fused = chn_hs_pad + eng_hs_pad
        return fused, chn_hs_mask

    def monolingual_mask(self, ys_pad):
        """Build one label tensor per language from mixed-language labels.

        Returns a pair ``(chn_labels, eng_labels)``:
        in ``chn_labels`` every id in ``[eng_start, odim)`` is replaced by 3,
        in ``eng_labels`` every id in ``[chn_start, eng_start)`` is replaced
        by 2. All other entries are copied from ``ys_pad`` unchanged.
        """
        # Ids as given by the original note: <chn> is 2, <eng> is 3.
        in_eng_range = torch.logical_and(
            ys_pad >= self.eng_start, ys_pad < self.odim
        )
        in_chn_range = torch.logical_and(
            ys_pad >= self.chn_start, ys_pad < self.eng_start
        )

        chn_labels = torch.where(in_eng_range, 3, ys_pad)
        eng_labels = torch.where(in_chn_range, 2, ys_pad)
        return chn_labels, eng_labels

    ### Decoding Implementation ###
    def encoder_forward(self, x):
        """Run all encoders on a single utterance for decoding.

        Switches the module to eval mode, converts ``x`` to a tensor on the
        model's device and adds a leading batch axis. Returns the combined
        output followed by the Chinese and English encoder outputs.
        """
        self.eval()
        model_device = next(self.parameters()).device
        feats = torch.Tensor(x).to(model_device).unsqueeze(0)

        # The shared front-end is optional; without it the language encoders
        # consume the features directly.
        if self.shared_encoder:
            shared_out, _ = self.shared_encoder(
                feats, None, return_as_intermidiate=True
            )
        else:
            shared_out = feats

        chn_hs, _ = self.chn_encoder(shared_out, None)
        eng_hs, _ = self.eng_encoder(shared_out, None)
        hs, _ = self.combine_fn(chn_hs, eng_hs, None, None)

        # temporary code: print the predicted language class when the
        # classifier exists
        if hasattr(self, "lang_classifer"):
            lang_logits = self.lang_classifer(hs.mean(dim=1))
            pred = torch.argmax(lang_logits).item()
            print("language classification results: ", pred, flush=True)

        return hs, chn_hs, eng_hs

    def recognize(self, x, beam_search=None, decode_feature="combine"):
        """Decode one utterance with the given beam search.

        ``decode_feature`` selects which encoder output is searched:
        "combine" (fused), "chn" or "eng". Any other value raises
        NotImplementedError after the encoders have run. Each hypothesis
        returned by ``beam_search`` is converted to a dict with ``asdict``.
        """
        hs, chn_hs, eng_hs = self.encoder_forward(x)

        for tag, candidate in (("combine", hs), ("chn", chn_hs), ("eng", eng_hs)):
            if decode_feature == tag:
                feature = candidate
                break
        else:
            raise NotImplementedError

        results = []
        for hyp in beam_search(feature):
            results.append(asdict(hyp))
        return results

    # legacy, not used  
    def calculate_all_attentions(self, xs_pad, ilens, ys_pad, texts, xs_pad_orig):
        """Collect the attention weights produced by one forward pass.

        Args:
            xs_pad (torch.Tensor): batch of padded input sequences (B, Tmax, idim)
            ilens (torch.Tensor): batch of lengths of input sequences (B)
            ys_pad (torch.Tensor): batch of padded label sequences
            texts: forwarded to ``forward``
            xs_pad_orig: forwarded to ``forward``

        Returns:
            An empty list when neither ``etype`` nor ``dtype`` contains
            "custom"; otherwise a dict mapping the name of every
            (relative-position) multi-head attention module to its ``attn``
            weights as a numpy array.

        """
        self.eval()

        # Early exit for non-custom architectures; note the module is left in
        # eval mode on this path.
        has_custom_part = "custom" in self.etype or "custom" in self.dtype
        if not has_custom_part:
            return []

        with torch.no_grad():
            self.forward(xs_pad, ilens, ys_pad, texts, xs_pad_orig)

        ret = {
            name: m.attn.cpu().numpy()
            for name, m in self.named_modules()
            if isinstance(m, (MultiHeadedAttention, RelPositionMultiHeadedAttention))
        }

        self.train()

        return ret


================================================
FILE: nets/pytorch_backend/e2e_asr_transformer.py
================================================
# Copyright 2019 Shigeki Karita
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Transformer speech recognition model (pytorch)."""

from argparse import Namespace
import logging
import math
import copy
import numpy
import torch
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED
import editdistance

from espnet.nets.asr_interface import ASRInterface
from espnet.nets.ctc_prefix_score import CTCPrefixScore
from espnet.nets.e2e_asr_common import end_detect
from espnet.nets.e2e_asr_common import ErrorCalculator
from espnet.nets.pytorch_backend.ctc import CTC
from espnet.nets.pytorch_backend.e2e_asr import CTC_LOSS_THRESHOLD
from espnet.nets.pytorch_backend.e2e_asr import Reporter
from espnet.nets.pytorch_backend.nets_utils import get_subsample
from espnet.nets.pytorch_backend.nets_utils import make_non_pad_mask
from espnet.nets.pytorch_backend.nets_utils import th_accuracy
from espnet.nets.pytorch_backend.rnn.decoders import CTC_SCORING_RATIO
from espnet.nets.pytorch_backend.transformer.add_sos_eos import add_sos_eos
from espnet.nets.pytorch_backend.transformer.argument import (
    add_arguments_transformer_common,  # noqa: H301
)
from espnet.nets.pytorch_backend.transformer.attention import (
    MultiHeadedAttention,  # noqa: H301
    RelPositionMultiHeadedAttention,  # noqa: H301
)
from espnet.nets.pytorch_backend.transformer.decoder import Decoder
from espnet.nets.pytorch_backend.transformer.dynamic_conv import DynamicConvolution
from espnet.nets.pytorch_backend.transformer.dynamic_conv2d import DynamicConvolution2D
from espnet.nets.pytorch_backend.transformer.encoder import Encoder
from espnet.nets.pytorch_backend.transformer.initializer import initialize
from espnet.nets.pytorch_backend.transformer.label_smoothing_loss import (
    LabelSmoothingLoss,  # noqa: H301
)
from espnet.nets.pytorch_backend.transformer.mask import subsequent_mask
from espnet.nets.pytorch_backend.transformer.mask import target_mask
from espnet.nets.pytorch_backend.transformer.plot import PlotAttentionReport
from espnet.nets.scorers.ctc import CTCPrefixScorer
from espnet.utils.fill_missing_args import fill_missing_args

from espnet.snowfall.warpper.warpper_mmi import K2MMI
from espnet.snowfall.warpper.warpper_ctc import K2CTC
from espnet.nets.scorers.mmi import MMIPrefixScores
from espnet.nets.scorers.mmi_lookahead import MMILookaheadScorer
from espnet.nets.scorers.mmi_rescorer import MMIRescorer
from espnet.nets.scorers.mmi_frame_scorer import MMIFrameScorer
from espnet.nets.scorers.mmi_frame_prefix_scorer import MMIFramePrefixScorer
from espnet.nets.scorers.ctc import CTCPrefixScorer
from espnet.nets.beam_search import BeamSearch

from espnet.utils.print import step_print

class E2E(ASRInterface, torch.nn.Module):
    """E2E module.

    :param int idim: dimension of inputs
    :param int odim: dimension of outputs
    :param Namespace args: argument Namespace containing options

    """

    @staticmethod
    def add_arguments(parser):
        """Add the transformer options to `parser` and return the parser.

        The options are placed in a dedicated "transformer model setting"
        group and defined by `add_arguments_transformer_common`.
        """
        add_arguments_transformer_common(
            parser.add_argument_group("transformer model setting")
        )
        return parser

    @property
    def attention_plot_class(self):
        """Return PlotAttentionReport.

        The class itself is returned, not an instance.
        """
        return PlotAttentionReport

    def get_total_subsampling_factor(self):
        """Return the overall time-reduction factor of the encoder.

        This is the encoder's convolutional subsampling factor multiplied by
        the product of the per-layer ``self.subsample`` factors.
        """
        conv_factor = self.encoder.conv_subsampling_factor
        layer_factor = int(numpy.prod(self.subsample))
        return conv_factor * layer_factor

    def __init__(self, idim, odim, args, ignore_id=-1):
        """Construct an E2E object.

        Builds the transformer encoder, the attention decoder (unless the
        model is pure CTC), the encoder-side criterion (CTC, or a k2-based
        MMI/CTC module), and optionally an on-the-fly beam search for MBR
        training and an error calculator.

        :param int idim: dimension of inputs
        :param int odim: dimension of outputs
        :param Namespace args: argument Namespace containing options
        :param int ignore_id: padding symbol id used in the targets
        """
        torch.nn.Module.__init__(self)

        # fill missing arguments for compatibility
        args = fill_missing_args(args, self.add_arguments)

        # The attention dropout falls back to the general dropout rate.
        if args.transformer_attn_dropout_rate is None:
            args.transformer_attn_dropout_rate = args.dropout_rate
        self.encoder = Encoder(
            idim=idim,
            selfattention_layer_type=args.transformer_encoder_selfattn_layer_type,
            attention_dim=args.adim,
            attention_heads=args.aheads,
            conv_wshare=args.wshare,
            conv_kernel_length=args.ldconv_encoder_kernel_length,
            conv_usebias=args.ldconv_usebias,
            linear_units=args.eunits,
            num_blocks=args.elayers,
            input_layer=args.transformer_input_layer,
            dropout_rate=args.dropout_rate,
            positional_dropout_rate=args.dropout_rate,
            attention_dropout_rate=args.transformer_attn_dropout_rate,
        )
        # The attention decoder and its label-smoothing loss are only needed
        # when the attention branch has a non-zero weight (mtlalpha < 1).
        if args.mtlalpha < 1:
            self.decoder = Decoder(
                odim=odim,
                selfattention_layer_type=args.transformer_decoder_selfattn_layer_type,
                attention_dim=args.adim,
                attention_heads=args.aheads,
                conv_wshare=args.wshare,
                conv_kernel_length=args.ldconv_decoder_kernel_length,
                conv_usebias=args.ldconv_usebias,
                linear_units=args.dunits,
                num_blocks=args.dlayers,
                dropout_rate=args.dropout_rate,
                positional_dropout_rate=args.dropout_rate,
                self_attention_dropout_rate=args.transformer_attn_dropout_rate,
                src_attention_dropout_rate=args.transformer_attn_dropout_rate,
            )
            self.criterion = LabelSmoothingLoss(
                odim,
                ignore_id,
                args.lsm_weight,
                args.transformer_length_normalized_loss,
            )
        else:
            self.decoder = None
            self.criterion = None
        self.blank = 0
        # <sos> and <eos> share the last vocabulary index.
        self.sos = odim - 1
        self.eos = odim - 1
        self.odim = odim
        self.ignore_id = ignore_id
        self.subsample = get_subsample(args, mode="asr", arch="transformer")
        self.reporter = Reporter()

        # Runs before the CTC/MMI modules below are created, so it can only
        # see the encoder and decoder registered so far.
        self.reset_parameters(args)
        self.adim = args.adim  # used for CTC (equal to d_model)
        self.mtlalpha = args.mtlalpha
        if args.mtlalpha > 0.0:
            if args.ctc_type == "k2mmi" or args.ctc_type == "k2ctc":
                # k2-based encoder supervision; the module is placed on the
                # GPU given by args.local_rank when CUDA is available.
                device = torch.device(f"cuda:{args.local_rank}") if torch.cuda.is_available() else torch.device("cpu")
                enc_supervise_module = K2MMI if args.ctc_type == 'k2mmi' else K2CTC
                self.ctc=enc_supervise_module(idim=args.adim,
                         lang=args.lang,
                         char_list=args.char_list,
                         device=device,
                         dropout=args.dropout_rate, 
                         den_scale=args.den_scale,
                         eos_id=self.eos,
                         use_segment=args.use_segment)
                # Optional extra CTC loss on the encoder, added to the total
                # loss in forward() with weight third_weight.
                if args.third_weight:
                    print(f"You have used MMI to supervise encoder Training. However, \
                          you still add CTC on encoder with weight {args.third_weight}")
                    self.third_weight = args.third_weight
                    # NOTE(review): ctc_type here is "k2mmi"/"k2ctc" (this
                    # branch's condition) -- confirm CTC accepts these values.
                    self.third_loss = CTC(
                      odim, args.adim, args.dropout_rate, ctc_type=args.ctc_type, reduce=True
                      ) 
            else:
                self.ctc = CTC(
                    odim, args.adim, args.dropout_rate, ctc_type=args.ctc_type, reduce=True
                )
        else:
            self.ctc = None

        # Decoder for on-the-fly decoding. Used in MBR training
        # NOTE(review): self.decoder is None when mtlalpha >= 1 and self.ctc is
        # None when mtlalpha == 0 -- presumably aux_mbr is only enabled with
        # 0 < mtlalpha < 1; confirm against the training configs.
        if args.aux_mbr:
            scorers = {"decoder": self.decoder,
                       "ctc": CTCPrefixScorer(self.ctc, self.eos),
                      }
            weights = {"decoder": 1 - args.mtlalpha, 
                       "ctc": args.mtlalpha,
                      } 
            self.beam_search = BeamSearch(
                beam_size=args.aux_mbr_beam,
                vocab_size=len(args.char_list),
                weights=weights,
                scorers=scorers,
                sos=self.sos,
                eos=self.eos,
                token_list=args.char_list,
                pre_beam_score_key=None if args.mtlalpha == 1.0 else "full",
            )
            self.aux_mbr_beam = args.aux_mbr_beam
            self.aux_mbr_weight = args.aux_mbr_weight
            # Per-token losses (reduction="none") are needed because
            # mbr_forward() sums them per hypothesis.
            self.mbr_criterion = torch.nn.CrossEntropyLoss(
                ignore_index=self.ignore_id,
                reduction="none",
            ) 
        else:
            self.beam_search = None 

        if args.report_cer or args.report_wer:
            self.error_calculator = ErrorCalculator(
                args.char_list,
                args.sym_space,
                args.sym_blank,
                args.report_cer,
                args.report_wer,
            )
        else:
            self.error_calculator = None
        self.rnnlm = None
        self.char_list = args.char_list

    def reset_parameters(self, args):
        """Initialize parameters.

        Delegates to the transformer ``initialize`` helper, passing this
        module and ``args.transformer_init``.
        """
        # initialize parameters
        initialize(self, args.transformer_init)

    def forward(self, xs_pad, ilens, ys_pad, texts, xs_pad_orig):

        """E2E forward.

        Computes the attention loss, the encoder-side (CTC/MMI) loss and the
        optional third and MBR losses, combines them into ``self.loss`` and
        reports the individual values through ``self.reporter``.

        As side effects the encoder output, the decoder output and the
        attention accuracy are stored in ``self.hs_pad``, ``self.pred_pad``
        and ``self.acc``.

        :param torch.Tensor xs_pad: batch of padded source sequences (B, Tmax, idim)
        :param torch.Tensor ilens: batch of lengths of source sequences (B)
        :param torch.Tensor ys_pad: batch of padded target sequences (B, Lmax)
        :param texts: forwarded unchanged to the encoder-side criteria
        :param xs_pad_orig: forwarded to ``mbr_forward`` for on-the-fly decoding
        :return: the combined loss value (``self.loss``)
        :rtype: torch.Tensor
        """
        # 1. forward encoder
        xs_pad = xs_pad[:, : max(ilens)]  # for data parallel
        src_mask = make_non_pad_mask(ilens.tolist()).to(xs_pad.device).unsqueeze(-2)
        hs_pad, hs_mask = self.encoder(xs_pad, src_mask)
        self.hs_pad = hs_pad

        # 2. forward decoder (absent for pure-CTC models)
        if self.decoder is not None:
            ys_in_pad, ys_out_pad = add_sos_eos(
                ys_pad, self.sos, self.eos, self.ignore_id
            )
            ys_mask = target_mask(ys_in_pad, self.ignore_id)
            pred_pad, pred_mask = self.decoder(ys_in_pad, ys_mask, hs_pad, hs_mask)
            self.pred_pad = pred_pad

            # 3. compute attention loss
            loss_att = self.criterion(pred_pad, ys_out_pad)
            self.acc = th_accuracy(
                pred_pad.view(-1, self.odim), ys_out_pad, ignore_label=self.ignore_id
            )
        else:
            loss_att = None
            self.acc = None

        # TODO(karita) show predicted text
        # TODO(karita) calculate these stats
        cer_ctc = None

        # 4. encoder-side loss (CTC or k2 MMI/CTC), skipped when mtlalpha is 0
        if self.mtlalpha == 0.0:
            loss_ctc = None
        else:
            batch_size = xs_pad.size(0)
            # Number of valid encoder frames per utterance.
            hs_len = hs_mask.view(batch_size, -1).sum(1)
            loss_ctc = self.ctc(hs_pad.view(batch_size, -1, self.adim), hs_len, ys_pad, texts)
            # third_weight only exists when the extra CTC loss was configured
            # in __init__.
            if hasattr(self, "third_weight"):
                third_loss = self.third_loss(hs_pad.view(batch_size, -1, self.adim), hs_len, ys_pad, texts)
            if not self.training and self.error_calculator is not None:
                ys_hat = self.ctc.argmax(hs_pad.view(batch_size, -1, self.adim)).data
                cer_ctc = self.error_calculator(ys_hat.cpu(), ys_pad.cpu(), is_ctc=True)
            # for visualization
            if not self.training:
                self.ctc.softmax(hs_pad)

        # beam_search is only set when auxiliary MBR training is enabled.
        if self.beam_search:
            loss_mbr = self.mbr_forward(xs_pad_orig, ilens, ys_pad, hs_pad, hs_mask)

        # 5. compute cer/wer
        if self.training or self.error_calculator is None or self.decoder is None:
            cer, wer = None, None
        else:
            ys_hat = pred_pad.argmax(dim=-1)
            cer, wer = self.error_calculator(ys_hat.cpu(), ys_pad.cpu())

        # copied from e2e_asr
        alpha = self.mtlalpha
        if alpha == 0:
            self.loss = loss_att
            loss_att_data = float(loss_att)
            loss_ctc_data = None
        elif alpha == 1:
            self.loss = loss_ctc
            loss_att_data = None
            loss_ctc_data = float(loss_ctc)
        else:
            self.loss = alpha * loss_ctc + (1 - alpha) * loss_att
            loss_att_data = float(loss_att)
            loss_ctc_data = float(loss_ctc)

        # Add the third loss if it is adopted
        # NOTE(review): `+=` updates the tensor bound to self.loss in place;
        # when alpha is 0 or 1 that is the same object as loss_att / loss_ctc.
        if hasattr(self, "third_weight"):
            self.loss += self.third_weight * third_loss
            third_loss_data = float(third_loss) 
        else:
            third_loss_data = None

        if self.beam_search:
            self.loss += self.aux_mbr_weight * loss_mbr
            loss_mbr_data = float(loss_mbr)
        else:
            loss_mbr_data = None

        # Only finite losses below the threshold are reported; the loss is
        # returned either way.
        loss_data = float(self.loss)
        if loss_data < CTC_LOSS_THRESHOLD and not math.isnan(loss_data):
            self.reporter.report(
                loss_ctc_data, loss_att_data, third_loss_data, loss_mbr_data, 
                self.acc, cer_ctc, cer, wer, loss_data
            )
        else:
            logging.warning("loss (=%f) is not correct", loss_data)
        return self.loss
     
    def mbr_forward(self, xs_pad_orig, ilens, ys_pad, hs_pad, hs_mask):
        """Compute the auxiliary MBR loss from on-the-fly N-best hypotheses.

        The utterances are decoded without gradients using ``self.beam_search``,
        the edit distance of every hypothesis to its reference is computed,
        and the hypotheses are then re-scored by the decoder with gradients.
        The loss of an utterance is the edit distance averaged over its
        hypotheses, weighted by the decoder probabilities renormalized within
        the N-best list.

        :param torch.Tensor xs_pad_orig: padded features used for decoding
        :param torch.Tensor ilens: batch of lengths of source sequences (B)
        :param torch.Tensor ys_pad: batch of padded target sequences (B, Lmax)
        :param torch.Tensor hs_pad: encoder output of the training pass
        :param torch.Tensor hs_mask: mask belonging to ``hs_pad``
        :return: mean MBR loss over the batch, or the float 0.0 when decoding
            did not yield exactly ``aux_mbr_beam`` hypotheses per utterance
        """
        batch_size = len(ilens)
        beam = self.aux_mbr_beam

        # (1) on-the-fly decoding
        # Decode in eval mode, then restore whatever mode the caller was in.
        # Forcing train mode afterwards would silently switch a validation
        # pass to training behaviour.
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                src_mask = (
                    make_non_pad_mask(ilens.tolist())
                    .to(xs_pad_orig.device)
                    .unsqueeze(-2)
                )
                # Separate name: the `hs_mask` argument belongs to `hs_pad`
                # and is still needed for the decoder pass in step (3).
                specs, dec_mask = self.encoder(xs_pad_orig, src_mask)

                # Strip the padded frames of every utterance before searching.
                hlens = [int(m[m != 0].size(0)) for m in dec_mask]
                specs = [h[:l] for h, l in zip(specs, hlens)]

                with ThreadPoolExecutor(max_workers=beam) as executor:
                    futures = [executor.submit(self.beam_search, h) for h in specs]
                    wait(futures, return_when=ALL_COMPLETED)

                hyps = []
                for future in futures:
                    hyps.extend(future.result()[:beam])
                hyps = [h.yseq[1:-1].tolist() for h in hyps]  # exclude <sos>, <eos>
        finally:
            self.train(was_training)

        # problem in decoding
        if len(hyps) != beam * batch_size:
            return 0.0

        # (2) edit-distance
        dist = self.compute_edit_distance(hyps, ys_pad)
        if dist is None:
            return 0.0  # fail in editdistance.

        # (3) decoder forward: prob of each hyp
        hyp_maxlen = max(len(hyp) for hyp in hyps)
        hyps_pad = [hyp + [self.ignore_id] * (hyp_maxlen - len(hyp)) for hyp in hyps]
        hyps_pad = torch.Tensor(hyps_pad).to(ys_pad.device).to(ys_pad.dtype)
        hyps_pad_in, hyps_pad_out = add_sos_eos(
            hyps_pad, self.sos, self.eos, self.ignore_id
        )
        hyps_mask = target_mask(hyps_pad_in, self.ignore_id)

        # Repeat every utterance's encoder output once per hypothesis:
        # [0, 0, ..., 1, 1, ...].
        idx = torch.arange(batch_size).repeat_interleave(beam)
        hs_pad = hs_pad[idx]
        hs_mask = hs_mask[idx]

        pred_pad, _ = self.decoder(hyps_pad_in, hyps_mask, hs_pad, hs_mask)
        loss_att = self.mbr_criterion(pred_pad.permute(0, 2, 1), hyps_pad_out)
        # masked_fill is out-of-place, so its result has to be kept. The
        # criterion already ignores padding; this makes the zeroing explicit.
        pad_positions = torch.eq(hyps_pad_out, self.ignore_id)
        loss_att = loss_att.masked_fill(pad_positions, 0.0)
        # Summed token losses are the negative log-probability of each hyp.
        loss_att = (-loss_att.sum(dim=-1)).exp()

        # (4) MBR loss.
        num = (loss_att * dist).view(batch_size, beam)
        den = loss_att.view(batch_size, beam)
        loss_mbr = num.sum(dim=-1) / (den.sum(dim=-1) + 1e-10)  # smooth
        loss_mbr = loss_mbr.mean()  # other Loss also works in reduction=mean
        return loss_mbr
       
    def compute_edit_distance(self, hyps, refs):
        """Compute the edit distance of every hypothesis to its reference.

        The hypotheses are expected to be grouped per reference: the first
        ``beam`` entries of ``hyps`` belong to ``refs[0]``, the next ``beam``
        entries to ``refs[1]`` and so on, where ``beam`` is
        ``len(hyps) / len(refs)``.

        :param list hyps: list of token-id lists (number of refs * beam entries)
        :param torch.Tensor refs: 2-D label tensor, ``self.ignore_id`` is padding
        :return: one edit distance per hypothesis, on the device of ``refs``
        :rtype: torch.IntTensor
        :raises ValueError: if ``len(hyps)`` is not a multiple of ``len(refs)``
        """
        target_device = refs.device

        # turn the padded tensor into plain lists without the padding symbol
        ref_seqs = []
        for row in refs.cpu().tolist():
            ref_seqs.append([tok for tok in row if tok != self.ignore_id])

        n_refs = len(ref_seqs)
        if len(hyps) % n_refs != 0:
            raise ValueError("The number of hypotheses is not correct")

        # number of hypotheses that share one reference
        per_ref = len(hyps) // n_refs

        distances = []
        for pos, hyp in enumerate(hyps):
            distances.append(editdistance.eval(hyp, ref_seqs[pos // per_ref]))
        return torch.IntTensor(distances).to(target_device)

    def scorers(self):
        """Return the scorer modules of this model, keyed by name."""
        return {"decoder": self.decoder}

    def encode(self, x):
        """Run the encoder on a single utterance.

        The model is switched to evaluation mode, a batch axis is added in
        front of the features, and that axis is removed again from the
        encoder output.

        :param ndarray x: source acoustic feature (T, D)
        :return: encoder outputs
        :rtype: torch.Tensor
        """
        self.eval()
        batched = torch.as_tensor(x).unsqueeze(0)
        # the encoder returns a pair; only the first element is used here
        encoded, _unused = self.encoder(batched, None)
        return encoded.squeeze(0)

    def recognize(self, x, recog_args, char_list=None, rnnlm=None, use_jit=False):
        """Recognize input speech.

        When the CTC decoding weight is 1.0 a greedy CTC decoding is done.
        Otherwise a beam search over the attention decoder is run, whose
        per-step scores may be combined with CTC prefix scores, MMI scores and
        language-model scores.  The individual scores of every step are traced
        in ``hyp["logs"]`` and debugging tables are printed to stdout.

        :param ndarray x: input acoustic feature (B, T, D) or (T, D)
        :param Namespace recog_args: argument Namespace containing options
            (reads ``ctc_weight``, ``beam_size``, ``mmi_weight``, ``mmi_type``,
            ``mmi_rescore``, ``local_rank``, ``use_segment``, ``penalty``,
            ``maxlenratio``, ``minlenratio``, ``lm_weight`` and ``nbest``);
            ``ctc_weight`` is overwritten with 1.0 when ``self.mtlalpha == 1.0``
        :param list char_list: list of characters; when None the textual
            debugging output is skipped
        :param torch.nn.Module rnnlm: language model module
        :param bool use_jit: trace ``decoder.forward_one_step`` with torch.jit
        :return: N-best decoding results
        :rtype: list
        """
        enc_output = self.encode(x).unsqueeze(0)
        if self.mtlalpha == 1.0:
            recog_args.ctc_weight = 1.0
            logging.info("Set to pure CTC decoding mode.")

        if self.mtlalpha > 0 and recog_args.ctc_weight == 1.0:
            from itertools import groupby

            # greedy CTC: collapse repeated frames, then drop the blank symbol
            lpz = self.ctc.argmax(enc_output)
            collapsed_indices = [key for key, _ in groupby(lpz[0])]
            hyp = [tok for tok in collapsed_indices if tok != self.blank]
            nbest_hyps = [{"score": 0.0, "yseq": [self.sos] + hyp}]
            if recog_args.beam_size > 1:
                raise NotImplementedError("Pure CTC beam search is not implemented.")
            # TODO(hirofumi0810): Implement beam search
            return nbest_hyps
        elif self.mtlalpha > 0 and recog_args.ctc_weight > 0.0:
            # Being compatible with LAS+MMI+CTC
            ctc_module = self.ctc if isinstance(self.ctc, CTC) else self.third_loss
            lpz = ctc_module.log_softmax(enc_output)
            lpz = lpz.squeeze(0)
        else:
            lpz = None

        if recog_args.mmi_weight > 0.0:
            assert isinstance(self.ctc, K2MMI)
            self.ctc.dump_weight(recog_args.local_rank)
            if recog_args.mmi_type == "lookahead":
                # NOTE(review): the device is hard-coded to "cuda" here while the
                # other scorers use self.ctc.device - confirm this is intended
                mmi_scorer = MMIFramePrefixScorer(lang=self.ctc.lang,
                                                  device="cuda",
                                                  idim=self.adim,
                                                  sos_id=self.sos,
                                                  rank=recog_args.local_rank,
                                                  use_segment=recog_args.use_segment,
                                                  char_list=char_list
                                                  )
            elif recog_args.mmi_type == "frame":
                mmi_scorer = MMIFrameScorer(lang=self.ctc.lang,
                                            device=self.ctc.device,
                                            idim=self.adim,
                                            sos_id=self.sos,
                                            rank=recog_args.local_rank,
                                            use_segment=recog_args.use_segment,
                                            char_list=char_list
                                            )
            else:
                raise NotImplementedError
        else:
            mmi_scorer = None

        if recog_args.mmi_rescore == True:
            self.ctc.dump_weight(recog_args.local_rank)
            if recog_args.mmi_weight > 0.0:
                raise ValueError("Cannot do rescoring if mmi_weight > 0.0")
            mmi_rescorer = MMIRescorer(lang=self.ctc.lang,
                                       device=self.ctc.device,
                                       idim=self.adim,
                                       sos_id=self.sos,
                                       rank=recog_args.local_rank,
                                       use_segment=recog_args.use_segment,
                                       char_list=char_list)
            print("Will do rescore after decoding")
        else:
            mmi_rescorer = None

        h = enc_output.squeeze(0)

        logging.info("input lengths: " + str(h.size(0)))
        # search params
        beam = recog_args.beam_size
        penalty = recog_args.penalty
        ctc_weight = recog_args.ctc_weight

        # prepare sos
        y = self.sos
        vy = h.new_zeros(1).long()

        if recog_args.maxlenratio == 0:
            maxlen = h.shape[0]
        else:
            # maxlen >= 1
            maxlen = max(1, int(recog_args.maxlenratio * h.size(0)))
        minlen = int(recog_args.minlenratio * h.size(0))
        logging.info("max output length: " + str(maxlen))
        logging.info("min output length: " + str(minlen))

        # initialize hypothesis
        if rnnlm:
            hyp = {"score": 0.0, "yseq": [y], "rnnlm_prev": None}
        else:
            hyp = {"score": 0.0, "yseq": [y]}
        if lpz is not None:
            ctc_prefix_score = CTCPrefixScore(lpz.detach().numpy(), 0, self.eos, numpy)
            hyp["ctc_state_prev"] = ctc_prefix_score.initial_state()
            hyp["ctc_score_prev"] = 0.0

        # CTC beam is independent to lpz.
        ctc_beam = int(beam * CTC_SCORING_RATIO)

        if mmi_scorer:
            hyp["mmi_state"] = mmi_scorer.init_state(enc_output.squeeze(0))

        # Trace each score in each step
        logs = {"att": []}
        if ctc_weight > 0.0:
            logs["ctc"] = []
        if recog_args.mmi_weight > 0.0:
            logs["mmi"] = []
        if recog_args.lm_weight > 0.0:
            logs["lm"] = []
        hyp["logs"] = logs

        hyps = [hyp]
        ended_hyps = []

        traced_decoder = None
        for i in range(maxlen):
            logging.debug("position " + str(i))

            hyps_best_kept = []
            print("#" * 20, f"Iteration {i}", "#" * 20, flush=True)
            for hyp in hyps:
                vy[0] = hyp["yseq"][i]

                # get nbest local scores and their ids
                ys_mask = subsequent_mask(i + 1).unsqueeze(0)
                ys = torch.tensor(hyp["yseq"]).unsqueeze(0)
                # FIXME: jit does not match non-jit result
                if use_jit:
                    if traced_decoder is None:
                        traced_decoder = torch.jit.trace(
                            self.decoder.forward_one_step, (ys, ys_mask, enc_output)
                        )
                    local_att_scores = traced_decoder(ys, ys_mask, enc_output)[0]
                else:
                    local_att_scores = self.decoder.forward_one_step(
                        ys, ys_mask, enc_output
                    )[0]

                if rnnlm:
                    rnnlm_state, local_lm_scores = rnnlm.predict(hyp["rnnlm_prev"], vy)
                    local_scores = (
                        local_att_scores + recog_args.lm_weight * local_lm_scores
                    )
                else:
                    local_scores = local_att_scores

                # Per-candidate score traces of this step. They are reset for
                # every hypothesis so the log update below never reads a value
                # that was not produced for the current hypothesis.
                ctc_scores = ctc_states = None
                mmi_scores = mmi_states = None
                lm_scores = None
                # LM scores only exist when an LM was actually given
                use_lm = bool(rnnlm) and recog_args.lm_weight > 0.0

                if lpz is not None or mmi_scorer:  # allow use either CTC or MMI
                    # Accumulate Attention scores
                    local_best_scores, local_best_ids = torch.topk(
                        local_att_scores, ctc_beam, dim=1
                    )
                    local_scores = (1.0 - ctc_weight) * local_best_scores
                    att_scores = local_best_scores

                    if char_list is not None:
                        # Previous Hypothesis
                        prev_hyp = "".join([char_list[int(t)] for t in hyp["yseq"][1:]]).replace("<space>", " ")
                        print(f"Privious Hypothesis: {prev_hyp} | Prev_Score: {hyp['score']}")

                        # Candidates
                        candidates = [char_list[int(t)] for t in local_best_ids[0]]
                        print("Candidates:     " + "".join(["{:<9}".format(c) for c in candidates]))

                    # Attention Scores
                    print(f"Attention({1-recog_args.ctc_weight}): " + "".join(["{:<9.2f}".format(s) for s in local_best_scores[0]]))

                    # Accumulate CTC scores if provided
                    if lpz is not None:
                        ctc_scores, ctc_states = ctc_prefix_score(
                            hyp["yseq"], local_best_ids[0], hyp["ctc_state_prev"]
                        )
                        local_scores += ctc_weight * torch.from_numpy(
                            ctc_scores - hyp["ctc_score_prev"])
                        print(f"CTC({recog_args.ctc_weight}):       " + "".join(["{:<9.2f}".format(s) for s in ctc_scores]))

                    # Accumulate MMI scores if provided
                    if mmi_scorer:
                        prefix = torch.Tensor(hyp["yseq"]).to(torch.int32).to(mmi_scorer.device)
                        mmi_scores, mmi_states = mmi_scorer.score_partial(
                            prefix, local_best_ids[0], hyp["mmi_state"], None
                        )
                        local_scores += (recog_args.mmi_weight * mmi_scores)
                        print(f"MMI({recog_args.mmi_weight}):       " + "".join(["{:<9.2f}".format(s) for s in mmi_scores]))

                    # Accumulate LM scores if provided
                    if use_lm:
                        lm_scores = local_lm_scores[:, local_best_ids[0]]
                        local_scores += recog_args.lm_weight * lm_scores
                        print(f"LM ({recog_args.lm_weight}):       " + "".join(["{:<9.2f}".format(s) for s in lm_scores[0]]))

                    local_best_scores, joint_best_ids = torch.topk(
                        local_scores, beam, dim=1
                    )
                    local_best_ids = local_best_ids[:, joint_best_ids[0]]
                else:
                    # attention (+ LM) only: the candidates are already final
                    local_best_scores, local_best_ids = torch.topk(
                        local_scores, beam, dim=1
                    )
                    # gather the traces for the kept candidates so the log
                    # update below can index them exactly like the joint case
                    att_scores = local_att_scores[:, local_best_ids[0]]
                    if use_lm:
                        lm_scores = local_lm_scores[:, local_best_ids[0]]
                    joint_best_ids = torch.arange(
                        beam, device=local_best_ids.device
                    ).unsqueeze(0)

                for j in range(beam):
                    # position of the j-th kept candidate inside the traces
                    sel = joint_best_ids[0, j]

                    new_hyp = {}
                    new_hyp["score"] = hyp["score"] + float(local_best_scores[0, j])
                    new_hyp["yseq"] = hyp["yseq"] + [int(local_best_ids[0, j])]
                    if rnnlm:
                        new_hyp["rnnlm_prev"] = rnnlm_state
                    if lpz is not None:
                        new_hyp["ctc_state_prev"] = ctc_states[sel]
                        new_hyp["ctc_score_prev"] = ctc_scores[sel]
                    if mmi_scorer:
                        new_hyp["mmi_state"] = mmi_scorer.select_state(mmi_states, sel)

                    # Update log: only append the traces that were computed
                    old_logs = copy.deepcopy(hyp["logs"])
                    old_logs["att"].append(att_scores[0, sel].item())
                    if lpz is not None:
                        old_logs["ctc"].append(ctc_scores[sel].item())
                    if mmi_scorer:
                        old_logs["mmi"].append(mmi_scores[sel].item())
                    if lm_scores is not None:
                        old_logs["lm"].append(lm_scores[0, sel].item())
                    new_hyp["logs"] = old_logs

                    # will be (2 x beam) hyps at most
                    hyps_best_kept.append(new_hyp)

                hyps_best_kept = sorted(
                    hyps_best_kept, key=lambda h_: h_["score"], reverse=True
                )[:beam]

            # sort and get nbest
            hyps = hyps_best_kept
            logging.debug("number of pruned hypothes: " + str(len(hyps)))
            if char_list is not None:
                logging.debug(
                    "best hypo: "
                    + "".join([char_list[int(t)] for t in hyps[0]["yseq"][1:]])
                )

            # add eos in the final loop to avoid that there are no ended hyps
            if i == maxlen - 1:
                logging.info("adding <eos> in the last postion in the loop")
                for hyp in hyps:
                    hyp["yseq"].append(self.eos)

            # add ended hypotheses to a final list, and remove them from the
            # current hypotheses
            # (this will be a problem, number of hyps < beam)
            remained_hyps = []
            for hyp in hyps:
                if hyp["yseq"][-1] == self.eos:
                    # only store the sequence that has more than minlen outputs
                    # also add penalty
                    if len(hyp["yseq"]) > minlen:
                        hyp["score"] += (i + 1) * penalty
                        if rnnlm:  # Word LM needs to add final <eos> score
                            hyp["score"] += recog_args.lm_weight * rnnlm.final(
                                hyp["rnnlm_prev"]
                            )
                        ended_hyps.append(hyp)
                else:
                    remained_hyps.append(hyp)

            # show all ended hypotheses (needs char_list to render the text)
            if char_list is not None:
                for hyp in ended_hyps:
                    hyp_str = "".join([char_list[int(t)] for t in hyp["yseq"][1:]]).replace("<space>", " ")
                    print(f"{hyp_str} | {hyp['score']}")

            # end detection
            if end_detect(ended_hyps, i) and recog_args.maxlenratio == 0.0:
                logging.info("end detected at %d", i)
                break

            hyps = remained_hyps
            if len(hyps) > 0:
                logging.debug("remeined hypothes: " + str(len(hyps)))
            else:
                logging.info("no hypothesis. Finish decoding.")
                break

            if char_list is not None:
                for hyp in hyps:
                    logging.debug(
                        "hypo: " + "".join([char_list[int(t)] for t in hyp["yseq"][1:]])
                    )

            logging.debug("number of ended hypothes: " + str(len(ended_hyps)))

        nbest_hyps = sorted(ended_hyps, key=lambda h_: h_["score"], reverse=True)[
            : min(len(ended_hyps), recog_args.nbest)
        ]

        # check number of hypotheses
        if len(nbest_hyps) == 0:
            logging.warning(
                "there is no N-best results, perform recognition "
                "again with smaller minlenratio."
            )
            # should copy because Namespace will be overwritten globally
            recog_args = Namespace(**vars(recog_args))
            recog_args.minlenratio = max(0.0, recog_args.minlenratio - 0.1)
            # keep the caller's decoding options (including use_jit) on retry
            return self.recognize(x, recog_args, char_list, rnnlm, use_jit)

        logging.info("total log probability: " + str(nbest_hyps[0]["score"]))
        logging.info(
            "normalized log probability: "
            + str(nbest_hyps[0]["score"] / len(nbest_hyps[0]["yseq"]))
        )
        if mmi_rescorer:
            nbest_hyps = mmi_rescorer.score(enc_output.squeeze(0), nbest_hyps, char_list)
        return nbest_hyps

    def calculate_all_attentions(self, xs_pad, ilens, ys_pad, texts, xs_pad_orig):
        """Collect the attention weights of all attention-like sub-modules.

        A forward pass is run in evaluation mode without gradients; afterwards
        the ``attn`` buffers stored by the sub-modules are read out.  The model
        is put into training mode before returning.

        :param torch.Tensor xs_pad: batch of padded input sequences (B, Tmax, idim)
        :param torch.Tensor ilens: batch of lengths of input sequences (B)
        :param torch.Tensor ys_pad: batch of padded token id sequence tensor (B, Lmax)
        :param texts: forwarded unchanged to ``self.forward``
        :param xs_pad_orig: forwarded unchanged to ``self.forward``
        :return: attention weights (B, H, Lmax, Tmax), keyed by module name
        :rtype: dict of float ndarray
        """
        self.eval()
        with torch.no_grad():
            self.forward(xs_pad, ilens, ys_pad, texts, xs_pad_orig)

        att_ws = {}
        for mod_name, module in self.named_modules():
            if isinstance(
                module,
                (
                    MultiHeadedAttention,
                    DynamicConvolution,
                    RelPositionMultiHeadedAttention,
                ),
            ):
                att_ws[mod_name] = module.attn.cpu().numpy()
            if isinstance(module, DynamicConvolution2D):
                # 2-D dynamic convolution keeps separate time / frequency maps
                att_ws[mod_name + "_time"] = module.attn_t.cpu().numpy()
                att_ws[mod_name + "_freq"] = module.attn_f.cpu().numpy()

        self.train()
        return att_ws

    def calculate_all_ctc_probs(self, xs_pad, ilens, ys_pad, texts, xs_pad_orig):
        """Return the frame-level probabilities stored by the CTC-like module.

        Nothing is computed when ``self.mtlalpha`` is 0.  Otherwise a forward
        pass is run in evaluation mode without gradients and the ``probs`` of
        the last matching sub-module (CTC, K2MMI or K2CTC) that has them set
        is returned.  The model is put into training mode before returning.

        :param torch.Tensor xs_pad: batch of padded input sequences (B, Tmax)
        :param torch.Tensor ilens: batch of lengths of input sequences (B)
        :param torch.Tensor ys_pad: batch of padded token id sequence tensor (B, Lmax)
        :param texts: forwarded unchanged to ``self.forward``
        :param xs_pad_orig: forwarded unchanged to ``self.forward``
        :return: CTC probability (B, Tmax, vocab), or None
        :rtype: float ndarray
        """
        if self.mtlalpha == 0:
            return None

        self.eval()
        with torch.no_grad():
            self.forward(xs_pad, ilens, ys_pad, texts, xs_pad_orig)

        probs = None
        for _mod_name, module in self.named_modules():
            if not isinstance(module, (CTC, K2MMI, K2CTC)):
                continue
            if module.probs is not None:
                probs = module.probs.cpu().numpy()

        self.train()
        return probs


================================================
FILE: nets/pytorch_backend/e2e_mt.py
================================================
# Copyright 2019 Kyoto University (Hirofumi Inaguma)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""RNN sequence-to-sequence text translation model (pytorch)."""

import argparse
import logging
import math
import os

import chainer
from chainer import reporter
import nltk
import numpy as np
import torch

from espnet.nets.e2e_asr_common import label_smoothing_dist
from espnet.nets.mt_interface import MTInterface
from espnet.nets.pytorch_backend.initialization import uniform_init_parameters
from espnet.nets.pytorch_backend.nets_utils import get_subsample
from espnet.nets.pytorch_backend.nets_utils import pad_list
from espnet.nets.pytorch_backend.nets_utils import to_device
from espnet.nets.pytorch_backend.rnn.argument import (
    add_arguments_rnn_encoder_common,  # noqa: H301
    add_arguments_rnn_decoder_common,  # noqa: H301
    add_arguments_rnn_attention_common,  # noqa: H301
)
from espnet.nets.pytorch_backend.rnn.attentions import att_for
from espnet.nets.pytorch_backend.rnn.decoders import decoder_for
from espnet.nets.pytorch_backend.rnn.encoders import encoder_for
from espnet.utils.fill_missing_args import fill_missing_args


class Reporter(chainer.Chain):
    """Chainer chain used as the observer for the training statistics."""

    def report(self, loss, acc, ppl, bleu):
        """Report loss, accuracy, perplexity and BLEU for the current step."""
        observations = (
            ("loss", loss),
            ("acc", acc),
            ("ppl", ppl),
            ("bleu", bleu),
        )
        # one report call per value, in the same order as listed above
        for key, value in observations:
            reporter.report({key: value}, self)


class E2E(MTInterface, torch.nn.Module):
    """RNN encoder-decoder text translation model.

    Source token ids are embedded, encoded by an RNN encoder and decoded by
    an attention-based RNN decoder.

    :param int idim: dimension of inputs (size of the source embedding table)
    :param int odim: dimension of outputs
    :param Namespace args: argument Namespace containing options

    """

    @staticmethod
    def add_arguments(parser):
        """Add encoder, attention and decoder arguments to the parser."""
        E2E.encoder_add_arguments(parser)
        E2E.attention_add_arguments(parser)
        E2E.decoder_add_arguments(parser)
        return parser

    @staticmethod
    def encoder_add_arguments(parser):
        """Add arguments for the encoder."""
        group = parser.add_argument_group("E2E encoder setting")
        group = add_arguments_rnn_encoder_common(group)
        return parser

    @staticmethod
    def attention_add_arguments(parser):
        """Add arguments for the attention."""
        group = parser.add_argument_group("E2E attention setting")
        group = add_arguments_rnn_attention_common(group)
        return parser

    @staticmethod
    def decoder_add_arguments(parser):
        """Add arguments for the decoder."""
        group = parser.add_argument_group("E2E decoder setting")
        group = add_arguments_rnn_decoder_common(group)
        return parser

    def __init__(self, idim, odim, args):
        """Construct an E2E object.

        :param int idim: dimension of inputs
        :param int odim: dimension of outputs
        :param Namespace args: argument Namespace containing options
        :raises ValueError: if an embedding-tying option is combined with
            incompatible dimensions or with ``context_residual``
        """
        super(E2E, self).__init__()
        # NOTE(review): super().__init__() presumably already reaches
        # torch.nn.Module.__init__ through the MRO; the explicit call is kept
        # as in the original code.
        torch.nn.Module.__init__(self)

        # fill missing arguments for compatibility
        args = fill_missing_args(args, self.add_arguments)

        self.etype = args.etype
        self.verbose = args.verbose
        # NOTE: for self.build method
        args.char_list = getattr(args, "char_list", None)
        self.char_list = args.char_list
        self.outdir = args.outdir
        self.space = args.sym_space
        self.blank = args.sym_blank
        self.reporter = Reporter()

        # below means the last number becomes eos/sos ID
        # note that sos/eos IDs are identical
        self.sos = odim - 1
        self.eos = odim - 1
        self.pad = 0
        # NOTE: we reserve index:0 for <pad> although this is reserved for a blank class
        # in ASR. However, blank labels are not used in MT.
        # To keep the vocabulary size,
        # we use index:0 for padding instead of adding one more class.

        # subsample info
        self.subsample = get_subsample(args, mode="mt", arch="rnn")

        # label smoothing info
        if args.lsm_type and os.path.isfile(args.train_json):
            logging.info("Use label smoothing with " + args.lsm_type)
            labeldist = label_smoothing_dist(
                odim, args.lsm_type, transcript=args.train_json
            )
        else:
            labeldist = None

        # multilingual related
        self.multilingual = getattr(args, "multilingual", False)
        self.replace_sos = getattr(args, "replace_sos", False)

        # encoder
        self.embed = torch.nn.Embedding(idim, args.eunits, padding_idx=self.pad)
        self.dropout = torch.nn.Dropout(p=args.dropout_rate)
        self.enc = encoder_for(args, args.eunits, self.subsample)
        # attention
        self.att = att_for(args)
        # decoder
        self.dec = decoder_for(args, odim, self.sos, self.eos, self.att, labeldist)

        # tie source and target embeddings
        if args.tie_src_tgt_embedding:
            if idim != odim:
                raise ValueError(
                    "When using tie_src_tgt_embedding, idim and odim must be equal."
                )
            if args.eunits != args.dunits:
                raise ValueError(
                    "When using tie_src_tgt_embedding, eunits and dunits must be equal."
                )
            self.embed.weight = self.dec.embed.weight

        # tie embeddings and the classifier
        if args.tie_classifier:
            if args.context_residual:
                raise ValueError(
                    "When using tie_classifier, context_residual must be turned off."
                )
            self.dec.output.weight = self.dec.embed.weight

        # weight initialization
        self.init_like_fairseq()

        # options for beam search
        if args.report_bleu:
            trans_args = {
                "beam_size": args.beam_size,
                "penalty": args.penalty,
                "ctc_weight": 0,
                "maxlenratio": args.maxlenratio,
                "minlenratio": args.minlenratio,
                "lm_weight": args.lm_weight,
                "rnnlm": args.rnnlm,
                "nbest": args.nbest,
                "space": args.sym_space,
                "blank": args.sym_blank,
                "tgt_lang": False,
            }

            self.trans_args = argparse.Namespace(**trans_args)
            self.report_bleu = args.report_bleu
        else:
            self.report_bleu = False
        self.rnnlm = None

        self.logzero = -10000000000.0
        self.loss = None
        self.acc = None

    def init_like_fairseq(self):
        """Initialize weight like Fairseq.

        Fairseq basically uses W, b, EmbedID.W ~ Uniform(-0.1, 0.1),
        """
        uniform_init_parameters(self)
        # exceptions
        # embed weight ~ Uniform(-0.1, 0.1), with the padding row set to zero
        torch.nn.init.uniform_(self.embed.weight, -0.1, 0.1)
        torch.nn.init.constant_(self.embed.weight[self.pad], 0)
        torch.nn.init.uniform_(self.dec.embed.weight, -0.1, 0.1)
        torch.nn.init.constant_(self.dec.embed.weight[self.pad], 0)

    def forward(self, xs_pad, ilens, ys_pad):
        """E2E forward.

        Stores loss, accuracy, perplexity and BLEU on the instance and reports
        them through ``self.reporter`` unless the loss is NaN.  BLEU is only
        computed in evaluation mode when ``report_bleu`` is enabled; otherwise
        it is reported as 0.0.

        :param torch.Tensor xs_pad: batch of padded source token id sequences
            (B, Tmax); they are fed to the embedding layer
        :param torch.Tensor ilens: batch of lengths of input sequences (B)
        :param torch.Tensor ys_pad: batch of padded token id sequence tensor (B, Lmax)
        :return: loss value
        :rtype: torch.Tensor
        """
        # 1. Encoder
        xs_pad, ys_pad = self.target_language_biasing(xs_pad, ilens, ys_pad)
        hs_pad, hlens, _ = self.enc(self.dropout(self.embed(xs_pad)), ilens)

        # 2. attention loss
        self.loss, self.acc, self.ppl = self.dec(hs_pad, hlens, ys_pad)

        # 3. compute bleu
        if self.training or not self.report_bleu:
            self.bleu = 0.0
        else:
            lpz = None

            nbest_hyps = self.dec.recognize_beam_batch(
                hs_pad,
                torch.tensor(hlens),
                lpz,
                self.trans_args,
                self.char_list,
                self.rnnlm,
            )
            # remove <sos> and <eos>
            list_of_refs = []
            hyps = []
            y_hats = [nbest_hyp[0]["yseq"][1:-1] for nbest_hyp in nbest_hyps]
            for i, y_hat in enumerate(y_hats):
                y_true = ys_pad[i]

                # ids equal to -1 are skipped when mapping ids to symbols
                seq_hat = [self.char_list[int(idx)] for idx in y_hat if int(idx) != -1]
                seq_true = [
                    self.char_list[int(idx)] for idx in y_true if int(idx) != -1
                ]
                seq_hat_text = "".join(seq_hat).replace(self.trans_args.space, " ")
                seq_hat_text = seq_hat_text.replace(self.trans_args.blank, "")
                seq_true_text = "".join(seq_true).replace(self.trans_args.space, " ")

                hyps += [seq_hat_text.split(" ")]
                list_of_refs += [[seq_true_text.split(" ")]]

            self.bleu = nltk.bleu_score.corpus_bleu(list_of_refs, hyps) * 100

        loss_data = float(self.loss)
        if not math.isnan(loss_data):
            self.reporter.report(loss_data, self.acc, self.ppl, self.bleu)
        else:
            logging.warning("loss (=%f) is not correct", loss_data)
        return self.loss

    def target_language_biasing(self, xs_pad, ilens, ys_pad):
        """Prepend target language IDs to source sentences for multilingual MT.

        These tags are prepended in source/target sentences as pre-processing.
        When ``self.multilingual`` is False both inputs are returned unchanged.

        :param torch.Tensor xs_pad: batch of padded source token id sequences
            (B, Tmax)
        :param torch.Tensor ilens: batch of lengths of input sequences (B);
            not used by this method
        :param torch.Tensor ys_pad: batch of padded token id sequence tensor (B, Lmax)
        :return: source text whose leading language ID is replaced by the
            target language ID, and target text without its leading language ID
        :rtype: tuple(torch.Tensor, torch.Tensor)
        """
        if self.multilingual:
            # remove language ID in the beginning
            tgt_lang_ids = ys_pad[:, 0].unsqueeze(1)
            xs_pad = xs_pad[:, 1:]  # remove source language IDs here
            ys_pad = ys_pad[:, 1:]

            # prepend target language ID to source sentences
            xs_pad = torch.cat([tgt_lang_ids, xs_pad], dim=1)
        return xs_pad, ys_pad

    def translate(self, x, trans_args, char_list, rnnlm=None):
        """E2E beam search.

        The model is switched to evaluation mode for decoding and switched
        back to training mode afterwards if it was training before.

        :param ndarray x: input source text feature; only ``x[0]``, a sequence
            of token ids, is decoded (its first id is dropped in the
            multilingual case)
        :param Namespace trans_args: argument Namespace containing options
        :param list char_list: list of characters
        :param torch.nn.Module rnnlm: language model module
        :return: N-best decoding results
        :rtype: list
        """
        prev = self.training
        self.eval()

        # 1. encoder
        # make a utt list (1) to use the same interface for encoder
        if self.multilingual:
            ilen = [len(x[0][1:])]
            h = to_device(
                self, torch.from_numpy(np.fromiter(map(int, x[0][1:]), dtype=np.int64))
            )
        else:
            ilen = [len(x[0])]
            h = to_device(
                self, torch.from_numpy(np.fromiter(map(int, x[0]), dtype=np.int64))
            )
        hs, _, _ = self.enc(self.dropout(self.embed(h.unsqueeze(0))), ilen)

        # 2. decoder
        # decode the first utterance
        y = self.dec.recognize_beam(hs[0], None, trans_args, char_list, rnnlm)

        if prev:
            self.train()
        return y

    def translate_batch(self, xs, trans_args, char_list, rnnlm=None):
        """E2E batch beam search.

        The model is switched to evaluation mode for decoding and switched
        back to training mode afterwards if it was training before.

        :param list xs:
            list of input source token id arrays (numpy); the first id of
            every array is dropped in the multilingual case
        :param Namespace trans_args: argument Namespace containing options
        :param list char_list: list of characters
        :param torch.nn.Module rnnlm: language model module
        :return: N-best decoding results
        :rtype: list
        """
        prev = self.training
        self.eval()

        # 1. Encoder
        if self.multilingual:
            ilens = np.fromiter((len(xx[1:]) for xx in xs), dtype=np.int64)
            hs = [to_device(self, torch.from_numpy(xx[1:])) for xx in xs]
        else:
            ilens = np.fromiter((len(xx) for xx in xs), dtype=np.int64)
            hs = [to_device(self, torch.from_numpy(xx)) for xx in xs]
        xpad = pad_list(hs, self.pad)
        hs_pad, hlens, _ = self.enc(self.dropout(self.embed(xpad)), ilens)

        # 2. Decoder
        hlens = torch.tensor(list(map(int, hlens)))  # make sure hlens is tensor
        y = self.dec.recognize_beam_batch(
            hs_pad, hlens, None, trans_args, char_list, rnnlm
        )

        if prev:
            self.train()
        return y

    def calculate_all_attentions(self, xs_pad, ilens, ys_pad):
        """E2E attention calculation.

        Runs in evaluation mode without gradients and, like ``translate`` and
        ``translate_batch``, switches back to training mode afterwards only if
        the model was training before the call.

        :param torch.Tensor xs_pad: batch of padded source token id sequences
            (B, Tmax)
        :param torch.Tensor ilens: batch of lengths of input sequences (B)
        :param torch.Tensor ys_pad: batch of padded token id sequence tensor (B, Lmax)
        :return: attention weights with the following shape,
            1) multi-head case => attention weights (B, H, Lmax, Tmax),
            2) other case => attention weights (B, Lmax, Tmax).
        :rtype: float ndarray
        """
        prev = self.training
        self.eval()
        with torch.no_grad():
            # 1. Encoder
            xs_pad, ys_pad = self.target_language_biasing(xs_pad, ilens, ys_pad)
            hpad, hlens, _ = self.enc(self.dropout(self.embed(xs_pad)), ilens)

            # 2. Decoder
            att_ws = self.dec.calculate_all_attentions(hpad, hlens, ys_pad)
        if prev:
            self.train()
        return att_ws


================================================
FILE: nets/pytorch_backend/e2e_mt_transformer.py
================================================
# Copyright 2019 Kyoto University (Hirofumi Inaguma)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Transformer text translation model (pytorch)."""

from argparse import Namespace
import logging
import math

import numpy as np
import torch

from espnet.nets.e2e_asr_common import end_detect
from espnet.nets.e2e_mt_common import ErrorCalculator
from espnet.nets.mt_interface import MTInterface
from espnet.nets.pytorch_backend.e2e_mt import Reporter
from espnet.nets.pytorch_backend.nets_utils import get_subsample
from espnet.nets.pytorch_backend.nets_utils import make_pad_mask
from espnet.nets.pytorch_backend.nets_utils import th_accuracy
from espnet.nets.pytorch_backend.nets_utils import to_device
from espnet.nets.pytorch_backend.transformer.add_sos_eos import add_sos_eos
from espnet.nets.pytorch_backend.transformer.argument import (
    add_arguments_transformer_common,  # noqa: H301
)
from espnet.nets.pytorch_backend.transformer.attention import MultiHeadedAttention
from espnet.nets.pytorch_backend.transformer.decoder import Decoder
from espnet.nets.pytorch_backend.transformer.encoder import Encoder
from espnet.nets.pytorch_backend.transformer.initializer import initialize
from espnet.nets.pytorch_backend.transformer.label_smoothing_loss import (
    LabelSmoothingLoss,  # noqa: H301
)
from espnet.nets.pytorch_backend.transformer.mask import subsequent_mask
from espnet.nets.pytorch_backend.transformer.mask import target_mask
from espnet.nets.pytorch_backend.transformer.plot import PlotAttentionReport
from espnet.utils.fill_missing_args import fill_missing_args


class E2E(MTInterface, torch.nn.Module):
    """E2E module.

    :param int idim: dimension of inputs
    :param int odim: dimension of outputs
    :param Namespace args: argument Namespace containing options

    """

    @staticmethod
    def add_arguments(parser):
        """Register the transformer model options on ``parser`` and return it."""
        transformer_group = parser.add_argument_group("transformer model setting")
        add_arguments_transformer_common(transformer_group)
        return parser

    @property
    def attention_plot_class(self):
        """Return the class used to plot attention (``PlotAttentionReport``)."""
        plot_class = PlotAttentionReport
        return plot_class

    def __init__(self, idim, odim, args, ignore_id=-1):
        """Construct an E2E object.

        Builds the transformer encoder/decoder pair, optionally ties embedding
        and classifier weights, creates the label-smoothing criterion and the
        BLEU error calculator, and initializes all parameters.

        :param int idim: dimension of inputs
        :param int odim: dimension of outputs
        :param Namespace args: argument Namespace containing options
        :param int ignore_id: label value stored as ``self.ignore_id`` and handed
            to the label-smoothing loss
        :raises ValueError: if ``args.tie_src_tgt_embedding`` is set while
            ``idim != odim``
        """
        torch.nn.Module.__init__(self)

        # fill missing arguments for compatibility
        args = fill_missing_args(args, self.add_arguments)

        # fall back to the generic dropout rate when no attention-specific
        # rate was given
        if args.transformer_attn_dropout_rate is None:
            args.transformer_attn_dropout_rate = args.dropout_rate
        self.encoder = Encoder(
            idim=idim,
            selfattention_layer_type=args.transformer_encoder_selfattn_layer_type,
            attention_dim=args.adim,
            attention_heads=args.aheads,
            conv_wshare=args.wshare,
            conv_kernel_length=args.ldconv_encoder_kernel_length,
            conv_usebias=args.ldconv_usebias,
            linear_units=args.eunits,
            num_blocks=args.elayers,
            input_layer="embed",
            dropout_rate=args.dropout_rate,
            positional_dropout_rate=args.dropout_rate,
            attention_dropout_rate=args.transformer_attn_dropout_rate,
        )
        self.decoder = Decoder(
            odim=odim,
            selfattention_layer_type=args.transformer_decoder_selfattn_layer_type,
            attention_dim=args.adim,
            attention_heads=args.aheads,
            conv_wshare=args.wshare,
            conv_kernel_length=args.ldconv_decoder_kernel_length,
            conv_usebias=args.ldconv_usebias,
            linear_units=args.dunits,
            num_blocks=args.dlayers,
            dropout_rate=args.dropout_rate,
            positional_dropout_rate=args.dropout_rate,
            self_attention_dropout_rate=args.transformer_attn_dropout_rate,
            src_attention_dropout_rate=args.transformer_attn_dropout_rate,
        )
        self.pad = 0  # use <blank> for padding
        # <sos> and <eos> share the last output index
        self.sos = odim - 1
        self.eos = odim - 1
        self.odim = odim
        self.ignore_id = ignore_id
        self.subsample = get_subsample(args, mode="mt", arch="transformer")
        self.reporter = Reporter()

        # tie source and target embeddings
        if args.tie_src_tgt_embedding:
            if idim != odim:
                raise ValueError(
                    "When using tie_src_tgt_embedding, idim and odim must be equal."
                )
            self.encoder.embed[0].weight = self.decoder.embed[0].weight

        # tie embeddings and the classifier
        if args.tie_classifier:
            self.decoder.output_layer.weight = self.decoder.embed[0].weight

        self.criterion = LabelSmoothingLoss(
            self.odim,
            self.ignore_id,
            args.lsm_weight,
            args.transformer_length_normalized_loss,
        )
        self.normalize_length = args.transformer_length_normalized_loss  # for PPL
        # NOTE: runs after the tying above; the tied attributes refer to the
        # same Parameter object, so the initialization applies to both names
        self.reset_parameters(args)
        self.adim = args.adim
        self.error_calculator = ErrorCalculator(
            args.char_list, args.sym_space, args.sym_blank, args.report_bleu
        )
        self.rnnlm = None

        # multilingual MT related
        self.multilingual = args.multilingual

    def reset_parameters(self, args):
        """Initialize all parameters, then re-draw both embedding tables.

        After the generic ``initialize`` call, the encoder and decoder token
        embeddings are sampled from Normal(0, adim ** -0.5) and the row of the
        padding index is set to zero.

        :param Namespace args: options providing ``transformer_init`` and ``adim``
        """
        initialize(self, args.transformer_init)
        embed_std = args.adim ** -0.5
        for module in (self.encoder, self.decoder):
            embed_weight = module.embed[0].weight
            torch.nn.init.normal_(embed_weight, mean=0, std=embed_std)
            torch.nn.init.constant_(embed_weight[self.pad], 0)

    def forward(self, xs_pad, ilens, ys_pad):
        """E2E forward.

        Besides returning the loss, this stores ``self.loss``, ``self.acc``,
        ``self.bleu`` and ``self.ppl`` and reports them unless the loss is NaN.

        :param torch.Tensor xs_pad: batch of padded source sequences (B, Tmax)
        :param torch.Tensor ilens: batch of lengths of source sequences (B)
        :param torch.Tensor ys_pad: batch of padded target sequences (B, Lmax)
        :return: attention loss value
        :rtype: torch.Tensor
        """
        # 1. forward encoder
        xs_pad = xs_pad[:, : max(ilens)]  # for data parallel
        pad_mask = make_pad_mask(ilens.tolist())
        src_mask = (~pad_mask).to(xs_pad.device).unsqueeze(-2)
        xs_pad, ys_pad = self.target_forcing(xs_pad, ys_pad)
        hs_pad, hs_mask = self.encoder(xs_pad, src_mask)

        # 2. forward decoder
        ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, self.ignore_id)
        ys_mask = target_mask(ys_in_pad, self.ignore_id)
        pred_pad, pred_mask = self.decoder(ys_in_pad, ys_mask, hs_pad, hs_mask)

        # 3. compute attention loss
        self.loss = self.criterion(pred_pad, ys_out_pad)
        flat_pred = pred_pad.view(-1, self.odim)
        self.acc = th_accuracy(flat_pred, ys_out_pad, ignore_label=self.ignore_id)

        # 4. compute corpus-level bleu in a mini-batch (evaluation only)
        if not self.training:
            ys_hat = pred_pad.argmax(dim=-1)
            self.bleu = self.error_calculator(ys_hat.cpu(), ys_pad.cpu())
        else:
            self.bleu = None

        # 5. perplexity
        loss_data = float(self.loss)
        if self.normalize_length:
            self.ppl = np.exp(loss_data)
        else:
            # rescale the loss by the number of target tokens that are not ignored
            flat_targets = ys_out_pad.view(-1)
            n_ignored = (flat_targets == self.ignore_id).sum().item()  # over (B*T,)
            n_tokens = len(flat_targets) - n_ignored
            self.ppl = np.exp(loss_data * ys_out_pad.size(0) / n_tokens)

        # 6. report
        if math.isnan(loss_data):
            logging.warning("loss (=%f) is not correct", loss_data)
        else:
            self.reporter.report(loss_data, self.acc, self.ppl, self.bleu)
        return self.loss

    def scorers(self):
        """Return the scorers as a dict; the decoder is the only entry."""
        return {"decoder": self.decoder}

    def encode(self, xs):
        """Encode one source sentence.

        Switches the model to evaluation mode, adds a leading batch axis to
        ``xs``, runs the encoder without a mask and strips the batch axis from
        the encoder output again.

        :param xs: source token ids accepted by ``torch.as_tensor``
        :return: encoder output with the batch axis removed
        """
        self.eval()
        batch = torch.as_tensor(xs).unsqueeze(0)
        encoded, _unused_mask = self.encoder(batch, None)
        return encoded.squeeze(0)

    def target_forcing(self, xs_pad, ys_pad=None, tgt_lang=None):
        """Prepend target language IDs to source sentences for multilingual MT.

        Language tags are expected as the first token of the source and target
        sentences (added during pre-processing). In multilingual mode the first
        source token is replaced by the target language ID, taken either from
        the first column of ``ys_pad`` (which is then removed) or from
        ``tgt_lang``. Otherwise both inputs are returned unchanged.

        :param torch.Tensor xs_pad: batch of padded input sequences (B, Tmax)
        :param torch.Tensor ys_pad: batch of padded target sequences (B, Lmax)
        :param int tgt_lang: target language ID used when ``ys_pad`` is None
        :return: source text starting with the target language ID
        :rtype: torch.Tensor
        :return: target text without language IDs (None if ``ys_pad`` is None)
        :rtype: torch.Tensor
        :raises ValueError: in multilingual mode when neither ``ys_pad`` nor
            ``tgt_lang`` is given
        """
        if not self.multilingual:
            return xs_pad, ys_pad

        src_tokens = xs_pad[:, 1:]  # remove source language IDs here
        if ys_pad is not None:
            # the language ID is the first target token; strip it from the target
            lang_ids = ys_pad[:, 0].unsqueeze(1)
            ys_pad = ys_pad[:, 1:]
        elif tgt_lang is not None:
            lang_ids = src_tokens.new_zeros(src_tokens.size(0), 1).fill_(tgt_lang)
        else:
            raise ValueError("Set ys_pad or tgt_lang.")

        # prepend target language ID to source sentences
        return torch.cat([lang_ids, src_tokens], dim=1), ys_pad

    def translate(self, x, trans_args, char_list=None):
        """Translate source text with beam search.

        If no hypothesis survives the minimum-length constraint, the search is
        repeated with ``minlenratio`` lowered by 0.1 (on a copy of
        ``trans_args``).

        :param list x: input source text feature (T,); only ``x[0]`` is read
        :param Namespace trans_args: argument Namespace containing options
        :param list char_list: list of characters
        :return: N-best decoding results
        :rtype: list
        """
        self.eval()  # NOTE: this is important because self.encode() is not used
        assert isinstance(x, list)

        # make a utt list (1) to use the same interface for encoder
        # NOTE: ``x`` must stay bound to the original list. The retry at the end
        # of this method passes it back in, and rebinding it to the tensor made
        # that retry fail on the isinstance assertion above.
        token_ids = x[0][1:] if self.multilingual else x[0]
        src = to_device(
            self, torch.from_numpy(np.fromiter(map(int, token_ids), dtype=np.int64))
        )

        logging.info("input lengths: " + str(src.size(0)))
        xs_pad = src.unsqueeze(0)
        tgt_lang = None
        if trans_args.tgt_lang:
            tgt_lang = char_list.index(trans_args.tgt_lang)
        xs_pad, _ = self.target_forcing(xs_pad, tgt_lang=tgt_lang)
        h, _ = self.encoder(xs_pad, None)
        logging.info("encoder output lengths: " + str(h.size(1)))

        # search params
        beam = trans_args.beam_size
        penalty = trans_args.penalty

        if trans_args.maxlenratio == 0:
            maxlen = h.size(1)
        else:
            # maxlen >= 1
            maxlen = max(1, int(trans_args.maxlenratio * h.size(1)))
        minlen = int(trans_args.minlenratio * h.size(1))
        logging.info("max output length: " + str(maxlen))
        logging.info("min output length: " + str(minlen))

        # initialize hypothesis
        hyp = {"score": 0.0, "yseq": [self.sos]}
        hyps = [hyp]
        ended_hyps = []

        for i in range(maxlen):
            logging.debug("position " + str(i))

            # batchfy
            ys = h.new_zeros((len(hyps), i + 1), dtype=torch.int64)
            for j, hyp in enumerate(hyps):
                ys[j, :] = torch.tensor(hyp["yseq"])
            ys_mask = subsequent_mask(i + 1).unsqueeze(0).to(h.device)

            local_scores = self.decoder.forward_one_step(
                ys, ys_mask, h.repeat([len(hyps), 1, 1])
            )[0]

            hyps_best_kept = []
            for j, hyp in enumerate(hyps):
                local_best_scores, local_best_ids = torch.topk(
                    local_scores[j : j + 1], beam, dim=1
                )

                # expand this hypothesis with each of its ``beam`` best tokens
                for k in range(beam):
                    new_hyp = {}
                    new_hyp["score"] = hyp["score"] + float(local_best_scores[0, k])
                    new_hyp["yseq"] = hyp["yseq"] + [int(local_best_ids[0, k])]
                    # will be (2 x beam) hyps at most
                    hyps_best_kept.append(new_hyp)

                hyps_best_kept = sorted(
                    hyps_best_kept, key=lambda item: item["score"], reverse=True
                )[:beam]

            # sort and get nbest
            hyps = hyps_best_kept
            logging.debug("number of pruned hypothes: " + str(len(hyps)))
            if char_list is not None:
                logging.debug(
                    "best hypo: "
                    + "".join([char_list[int(idx)] for idx in hyps[0]["yseq"][1:]])
                )

            # add eos in the final loop to avoid that there are no ended hyps
            if i == maxlen - 1:
                logging.info("adding <eos> in the last postion in the loop")
                for hyp in hyps:
                    hyp["yseq"].append(self.eos)

            # add ended hypotheses to a final list, and remove them from the
            # current hypotheses
            # (this will be a problem, number of hyps < beam)
            remained_hyps = []
            for hyp in hyps:
                if hyp["yseq"][-1] == self.eos:
                    # only store the sequence that has more than minlen outputs
                    # also add penalty
                    if len(hyp["yseq"]) > minlen:
                        hyp["score"] += (i + 1) * penalty
                        ended_hyps.append(hyp)
                else:
                    remained_hyps.append(hyp)

            # end detection
            if end_detect(ended_hyps, i) and trans_args.maxlenratio == 0.0:
                logging.info("end detected at %d", i)
                break

            hyps = remained_hyps
            if len(hyps) > 0:
                logging.debug("remeined hypothes: " + str(len(hyps)))
            else:
                logging.info("no hypothesis. Finish decoding.")
                break

            if char_list is not None:
                for hyp in hyps:
                    logging.debug(
                        "hypo: "
                        + "".join([char_list[int(idx)] for idx in hyp["yseq"][1:]])
                    )

            logging.debug("number of ended hypothes: " + str(len(ended_hyps)))

        nbest_hyps = sorted(ended_hyps, key=lambda item: item["score"], reverse=True)[
            : min(len(ended_hyps), trans_args.nbest)
        ]

        # check number of hypotheses
        if len(nbest_hyps) == 0:
            logging.warning(
                "there is no N-best results, perform translation "
                "again with smaller minlenratio."
            )
            # should copy because Namespace will be overwritten globally
            trans_args = Namespace(**vars(trans_args))
            trans_args.minlenratio = max(0.0, trans_args.minlenratio - 0.1)
            # ``x`` is still the caller's list here, as the assertion requires
            return self.translate(x, trans_args, char_list)

        logging.info("total log probability: " + str(nbest_hyps[0]["score"]))
        logging.info(
            "normalized log probability: "
            + str(nbest_hyps[0]["score"] / len(nbest_hyps[0]["yseq"]))
        )
        return nbest_hyps

    def calculate_all_attentions(self, xs_pad, ilens, ys_pad):
        """E2E attention calculation.

        Runs a forward pass in evaluation mode without gradients and collects
        the attention weights cached on every ``MultiHeadedAttention``
        sub-module. The train/eval mode the model was in before the call is
        restored afterwards, also when the forward pass raises.

        :param torch.Tensor xs_pad: batch of padded input sequences (B, Tmax)
        :param torch.Tensor ilens: batch of lengths of input sequences (B)
        :param torch.Tensor ys_pad: batch of padded token id sequence tensor (B, Lmax)
        :return: attention weights keyed by sub-module name
        :rtype: dict of float ndarray
        """
        # remember the caller's mode so that an eval-mode model is not silently
        # flipped into training mode by this helper
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                self.forward(xs_pad, ilens, ys_pad)
            ret = dict()
            for name, m in self.named_modules():
                if isinstance(m, MultiHeadedAttention) and m.attn is not None:
                    ret[name] = m.attn.cpu().numpy()
        finally:
            self.train(was_training)
        return ret


================================================
FILE: nets/pytorch_backend/e2e_st.py
================================================
# Copyright 2019 Kyoto University (Hirofumi Inaguma)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""RNN sequence-to-sequence speech translation model (pytorch)."""

import argparse
import copy
import logging
import math
import os

import editdistance
import nltk

import chainer
import numpy as np
import six
import torch

from itertools import groupby

from chainer import reporter

from espnet.nets.e2e_asr_common import label_smoothing_dist
from espnet.nets.pytorch_backend.ctc import CTC
from espnet.nets.pytorch_backend.initialization import lecun_normal_init_parameters
from espnet.nets.pytorch_backend.initialization import set_forget_bias_to_one
from espnet.nets.pytorch_backend.nets_utils import get_subsample
from espnet.nets.pytorch_backend.nets_utils import pad_list
from espnet.nets.pytorch_backend.nets_utils import to_device
from espnet.nets.pytorch_backend.nets_utils import to_torch_tensor
from espnet.nets.pytorch_backend.rnn.argument import (
    add_arguments_rnn_encoder_common,  # noqa: H301
    add_arguments_rnn_decoder_common,  # noqa: H301
    add_arguments_rnn_attention_common,  # noqa: H301
)
from espnet.nets.pytorch_backend.rnn.attentions import att_for
from espnet.nets.pytorch_backend.rnn.decoders import decoder_for
from espnet.nets.pytorch_backend.rnn.encoders import encoder_for
from espnet.nets.st_interface import STInterface
from espnet.utils.fill_missing_args import fill_missing_args

CTC_LOSS_THRESHOLD = 10000


class Reporter(chainer.Chain):
    """A chainer reporter wrapper."""

    def report(
        self,
        loss_asr,
        loss_mt,
        loss_st,
        acc_asr,
        acc_mt,
        acc,
        cer_ctc,
        cer,
        wer,
        bleu,
        mtl_loss,
    ):
        """Report the per-task statistics and the multi-task loss of one step.

        Every value is reported under its own key; the multi-task loss is also
        written to the log and reported under the key ``loss``.
        """
        observations = (
            ("loss_asr", loss_asr),
            ("loss_mt", loss_mt),
            ("loss_st", loss_st),
            ("acc_asr", acc_asr),
            ("acc_mt", acc_mt),
            ("acc", acc),
            ("cer_ctc", cer_ctc),
            ("cer", cer),
            ("wer", wer),
            ("bleu", bleu),
        )
        for key, value in observations:
            reporter.report({key: value}, self)
        logging.info("mtl loss:" + str(mtl_loss))
        reporter.report({"loss": mtl_loss}, self)


class E2E(STInterface, torch.nn.Module):
    """E2E module.

    :param int idim: dimension of inputs
    :param int odim: dimension of outputs
    :param Namespace args: argument Namespace containing options

    """

    @staticmethod
    def add_arguments(parser):
        """Register encoder, attention and decoder options and return ``parser``."""
        for register in (
            E2E.encoder_add_arguments,
            E2E.attention_add_arguments,
            E2E.decoder_add_arguments,
        ):
            register(parser)
        return parser

    @staticmethod
    def encoder_add_arguments(parser):
        """Register the RNN encoder options in their own group; return ``parser``."""
        add_arguments_rnn_encoder_common(
            parser.add_argument_group("E2E encoder setting")
        )
        return parser

    @staticmethod
    def attention_add_arguments(parser):
        """Register the attention options in their own group; return ``parser``."""
        add_arguments_rnn_attention_common(
            parser.add_argument_group("E2E attention setting")
        )
        return parser

    @staticmethod
    def decoder_add_arguments(parser):
        """Register the RNN decoder options in their own group; return ``parser``."""
        add_arguments_rnn_decoder_common(
            parser.add_argument_group("E2E decoder setting")
        )
        return parser

    def get_total_subsampling_factor(self):
        """Return the encoder's conv subsampling factor times prod(subsample)."""
        conv_factor = self.enc.conv_subsampling_factor
        layer_factor = int(np.prod(self.subsample))
        return conv_factor * layer_factor

    def __init__(self, idim, odim, args):
        """Construct an E2E object.

        Builds the shared speech encoder and the ST attention decoder, plus the
        optional auxiliary ASR branch (CTC and/or attention decoder, enabled by
        ``asr_weight > 0``) and the optional auxiliary MT branch (text embedding
        and encoder, enabled by ``mt_weight > 0``). Beam-search options used for
        CER/WER and BLEU reporting during evaluation are prepared as well.

        :param int idim: dimension of inputs
        :param int odim: dimension of outputs
        :param Namespace args: argument Namespace containing options
        """
        super(E2E, self).__init__()
        torch.nn.Module.__init__(self)

        # fill missing arguments for compatibility
        args = fill_missing_args(args, self.add_arguments)

        self.asr_weight = args.asr_weight
        self.mt_weight = args.mt_weight
        self.mtlalpha = args.mtlalpha
        assert 0.0 <= self.asr_weight < 1.0, "asr_weight should be [0.0, 1.0)"
        assert 0.0 <= self.mt_weight < 1.0, "mt_weight should be [0.0, 1.0)"
        assert 0.0 <= self.mtlalpha <= 1.0, "mtlalpha should be [0.0, 1.0]"
        self.etype = args.etype
        self.verbose = args.verbose
        # NOTE: for self.build method
        args.char_list = getattr(args, "char_list", None)
        self.char_list = args.char_list
        self.outdir = args.outdir
        self.space = args.sym_space
        self.blank = args.sym_blank
        self.reporter = Reporter()

        # below means the last number becomes eos/sos ID
        # note that sos/eos IDs are identical
        self.sos = odim - 1
        self.eos = odim - 1
        self.pad = 0
        # NOTE: we reserve index:0 for <pad> although this is reserved for a blank class
        # in ASR. However, blank labels are not used in MT.
        # To keep the vocabulary size,
        # we use index:0 for padding instead of adding one more class.

        # subsample info
        self.subsample = get_subsample(args, mode="st", arch="rnn")

        # label smoothing info
        if args.lsm_type and os.path.isfile(args.train_json):
            logging.info("Use label smoothing with " + args.lsm_type)
            labeldist = label_smoothing_dist(
                odim, args.lsm_type, transcript=args.train_json
            )
        else:
            labeldist = None

        # multilingual related
        self.multilingual = getattr(args, "multilingual", False)
        self.replace_sos = getattr(args, "replace_sos", False)

        # encoder
        self.enc = encoder_for(args, idim, self.subsample)
        # attention (ST)
        self.att = att_for(args)
        # decoder (ST)
        self.dec = decoder_for(args, odim, self.sos, self.eos, self.att, labeldist)

        # submodule for ASR task
        self.ctc = None
        self.att_asr = None
        self.dec_asr = None
        if self.asr_weight > 0:
            if self.mtlalpha > 0.0:
                self.ctc = CTC(
                    odim,
                    args.eprojs,
                    args.dropout_rate,
                    ctc_type=args.ctc_type,
                    reduce=True,
                )
            if self.mtlalpha < 1.0:
                # attention (asr)
                self.att_asr = att_for(args)
                # decoder (asr)
                args_asr = copy.deepcopy(args)
                args_asr.atype = "location"  # TODO(hirofumi0810): make this option
                self.dec_asr = decoder_for(
                    args_asr, odim, self.sos, self.eos, self.att_asr, labeldist
                )

        # submodule for MT task
        if self.mt_weight > 0:
            self.embed_mt = torch.nn.Embedding(odim, args.eunits, padding_idx=self.pad)
            self.dropout_mt = torch.nn.Dropout(p=args.dropout_rate)
            # no subsampling for the text encoder.
            # NOTE: the builtin ``int`` replaces ``np.int``, which was only an
            # alias of it and has been removed from recent NumPy releases.
            self.enc_mt = encoder_for(
                args, args.eunits, subsample=np.ones(args.elayers + 1, dtype=int)
            )

        # weight initialization
        self.init_like_chainer()

        # options for beam search
        # CER/WER can only be computed by the auxiliary ASR branch, so both
        # report flags require asr_weight > 0 (the parentheses matter: ``and``
        # binds tighter than ``or``).
        if self.asr_weight > 0 and (args.report_cer or args.report_wer):
            recog_args = {
                "beam_size": args.beam_size,
                "penalty": args.penalty,
                "ctc_weight": args.ctc_weight,
                "maxlenratio": args.maxlenratio,
                "minlenratio": args.minlenratio,
                "lm_weight": args.lm_weight,
                "rnnlm": args.rnnlm,
                "nbest": args.nbest,
                "space": args.sym_space,
                "blank": args.sym_blank,
                "tgt_lang": False,
            }

            self.recog_args = argparse.Namespace(**recog_args)
            self.report_cer = args.report_cer
            self.report_wer = args.report_wer
        else:
            self.report_cer = False
            self.report_wer = False
        if args.report_bleu:
            # same search options as above, but CTC is never used for translation
            trans_args = {
                "beam_size": args.beam_size,
                "penalty": args.penalty,
                "ctc_weight": 0,
                "maxlenratio": args.maxlenratio,
                "minlenratio": args.minlenratio,
                "lm_weight": args.lm_weight,
                "rnnlm": args.rnnlm,
                "nbest": args.nbest,
                "space": args.sym_space,
                "blank": args.sym_blank,
                "tgt_lang": False,
            }

            self.trans_args = argparse.Namespace(**trans_args)
            self.report_bleu = args.report_bleu
        else:
            self.report_bleu = False
        self.rnnlm = None

        self.logzero = -10000000000.0
        self.loss = None
        self.acc = None

    def init_like_chainer(self):
        """Initialize weight like chainer.

        chainer basically uses LeCun way: W ~ Normal(0, fan_in ** -0.5), b = 0
        pytorch basically uses W, b ~ Uniform(-fan_in**-0.5, fan_in**-0.5)
        however, there are two exceptions as far as I know.
        - EmbedID.W ~ Normal(0, 1)
        - LSTM.upward.b[forget_gate_range] = 1 (but not used in NStepLSTM)
        """
        lecun_normal_init_parameters(self)

        # exception 1: the decoder embedding is drawn from Normal(0, 1)
        self.dec.embed.weight.data.normal_(0, 1)

        # exception 2: forget-gate bias of every decoder layer is set to 1.0
        # https://discuss.pytorch.org/t/set-forget-gate-bias-of-lstm/1745
        for rnn_layer in self.dec.decoder:
            set_forget_bias_to_one(rnn_layer.bias_ih)

    def forward(self, xs_pad, ilens, ys_pad, ys_pad_src):
        """E2E forward.

        Computes the ST attention loss plus the auxiliary ASR and MT losses and
        combines them with the weights ``asr_weight``, ``mt_weight`` and
        ``mtlalpha``. In evaluation mode with BLEU reporting enabled, a batch
        beam search is run and a corpus-level BLEU over the mini-batch is
        computed. The individual statistics are stored on ``self`` and reported
        when the total loss is below ``CTC_LOSS_THRESHOLD`` and not NaN.

        :param torch.Tensor xs_pad: batch of padded input sequences (B, Tmax, idim)
        :param torch.Tensor ilens: batch of lengths of input sequences (B)
        :param torch.Tensor ys_pad: batch of padded token id sequence tensor (B, Lmax)
        :param torch.Tensor ys_pad_src: batch of padded token id sequences passed
            to the auxiliary ASR and MT tasks (presumably source-language
            transcriptions of shape (B, Lmax) -- confirm against the caller)
        :return: loss value
        :rtype: torch.Tensor
        """
        # 0. Extract target language ID
        if self.multilingual:
            tgt_lang_ids = ys_pad[:, 0:1]
            ys_pad = ys_pad[:, 1:]  # remove target language ID in the beginning
        else:
            tgt_lang_ids = None

        # 1. Encoder
        hs_pad, hlens, _ = self.enc(xs_pad, ilens)

        # 2. ST attention loss
        self.loss_st, self.acc, _ = self.dec(
            hs_pad, hlens, ys_pad, lang_ids=tgt_lang_ids
        )

        # 3. ASR loss
        (
            self.loss_asr_att,
            acc_asr,
            self.loss_asr_ctc,
            cer_ctc,
            cer,
            wer,
        ) = self.forward_asr(hs_pad, hlens, ys_pad_src)

        # 4. MT attention loss
        self.loss_mt, acc_mt = self.forward_mt(ys_pad, ys_pad_src)

        # 5. Compute BLEU
        if self.training or not self.report_bleu:
            self.bleu = 0.0
        else:
            # no CTC scores are passed to the ST beam search
            lpz = None

            nbest_hyps = self.dec.recognize_beam_batch(
                hs_pad,
                torch.tensor(hlens),
                lpz,
                self.trans_args,
                self.char_list,
                self.rnnlm,
                lang_ids=tgt_lang_ids.squeeze(1).tolist()
                if self.multilingual
                else None,
            )
            # remove <sos> and <eos>
            list_of_refs = []
            hyps = []
            y_hats = [nbest_hyp[0]["yseq"][1:-1] for nbest_hyp in nbest_hyps]
            for i, y_hat in enumerate(y_hats):
                y_true = ys_pad[i]

                # ids equal to -1 are skipped when mapping ids to symbols
                seq_hat = [self.char_list[int(idx)] for idx in y_hat if int(idx) != -1]
                seq_true = [
                    self.char_list[int(idx)] for idx in y_true if int(idx) != -1
                ]
                seq_hat_text = "".join(seq_hat).replace(self.trans_args.space, " ")
                seq_hat_text = seq_hat_text.replace(self.trans_args.blank, "")
                seq_true_text = "".join(seq_true).replace(self.trans_args.space, " ")

                # one hypothesis and a single reference per utterance
                hyps += [seq_hat_text.split(" ")]
                list_of_refs += [[seq_true_text.split(" ")]]

            # corpus-level BLEU over the mini-batch, scaled by 100
            self.bleu = nltk.bleu_score.corpus_bleu(list_of_refs, hyps) * 100

        # interpolate the two ASR losses with mtlalpha (weight of the CTC loss)
        asr_ctc_weight = self.mtlalpha
        self.loss_asr = (
            asr_ctc_weight * self.loss_asr_ctc
            + (1 - asr_ctc_weight) * self.loss_asr_att
        )
        # total multi-task loss; the ST task gets the remaining weight
        self.loss = (
            (1 - self.asr_weight - self.mt_weight) * self.loss_st
            + self.asr_weight * self.loss_asr
            + self.mt_weight * self.loss_mt
        )
        loss_st_data = float(self.loss_st)
        loss_asr_data = float(self.loss_asr)
        loss_mt_data = float(self.loss_mt)
        loss_data = float(self.loss)
        # report only losses that are below the threshold and not NaN
        if loss_data < CTC_LOSS_THRESHOLD and not math.isnan(loss_data):
            self.reporter.report(
                loss_asr_data,
                loss_mt_data,
                loss_st_data,
                acc_asr,
                acc_mt,
                self.acc,
                cer_ctc,
                cer,
                wer,
                self.bleu,
                loss_data,
            )
        else:
            logging.warning("loss (=%f) is not correct", loss_data)
        return self.loss

    def forward_asr(self, hs_pad, hlens, ys_pad):
        """Forward pass in the auxiliary ASR task.

        :param torch.Tensor hs_pad: batch of padded source sequences (B, Tmax, idim)
        :param torch.Tensor hlens: batch of lengths of input sequences (B)
        :param torch.Tensor ys_pad: batch of padded target sequences (B, Lmax)
        :return: ASR attention loss value
        :rtype: torch.Tensor
        :return: accuracy in ASR attention decoder
        :rtype: float
        :return: ASR CTC loss value
        :rtype: torch.Tensor
        :return: character error rate from CTC prediction
        :rtype: float
        :return: character error rate from attetion decoder prediction
        :rtype: float
        :return: word error rate from attetion decoder prediction
        :rtype: float
        """
        loss_att, loss_ctc = 0.0, 0.0
        acc = None
        cer, wer = None, None
        cer_ctc = None
        if self.asr_weight == 0:
            return loss_att, acc, loss_ctc, cer_ctc, cer, wer

        # attention
        if self.mtlalpha < 1:
            loss_asr, acc_asr, _ = self.dec_asr(hs_pad, hlens, ys_pad)

            # Compute wer and cer
            if not self.training and (self.report_cer or self.report_wer):
                if self.mtlalpha > 0 and self.recog_args.ctc_weight > 0.0:
                    lpz = self.ctc.log_softmax(hs_pad).data
                else:
                    lpz = None

                word_eds, word_ref_lens, char_eds, char_ref_lens = [], [], [], []
                nbest_hyps_asr = self.dec_asr.recognize_beam_batch(
                    hs_pad,
                    torch.tensor(hlens),
                    lpz,
                    self.recog_args,
                    self.char_list,
                    self.rnnlm,
                )
                # remove <sos> and <eos>
                y_hats = [nbest_hyp[0]["yseq"][1:-1] for nbest_hyp in nbest_hyps_asr]
                for i, y_hat in enumerate(y_hats):
                    y_true = ys_pad[i]

                    seq_hat = [
                        self.char_list[int(idx)] for idx in y_hat if int(idx) != -1
                    ]
                    seq_true = [
                        self.char_list[int(idx)] for idx in y_true if int(idx) != -1
                    ]
                    seq_hat_text = "".join(seq_hat).replace(self.recog_args.space, " ")
                    seq_hat_text = seq_hat_text.replace(self.recog_args.blank, "")
                    seq_true_text = "".join(seq_true).replace(
                        self.recog_args.space, " "
                    )

                    hyp_words = seq_hat_text.split()
                    ref_words = seq_true_text.split()
                    word_eds.append(editdistance.eval(hyp_words, ref_words))
                    word_ref_lens.append(len(ref_words))
                    hyp_chars = seq_hat_text.replace(" ", "")
                    ref_chars = seq_true_text.replace(" ", "")
                    char_eds.append(editdistance.eval(hyp_chars, ref_chars))
                    char_ref_lens.append(len(ref_chars))

                wer = (
                    0.0
                    if not self.report_wer
                    else float(sum(word_eds)) / sum(word_ref_lens)
                )
                cer = (
                    0.0
                    if not self.report_cer
                    else float(sum(char_eds)) / sum(char_ref_lens)
                )

        # CTC
        if self.mtlalpha > 0:
            loss_ctc = self.ctc(hs_pad, hlens, ys_pad)

            # Compute cer with CTC prediction
            if self.char_list is not None:
                cers = []
                y_hats = self.ctc.argmax(hs_pad).data
                for i, y in enumerate(y_hats):
                    y_hat = [x[0] for x in groupby(y)]
                    y_true = ys_pad[i]

                    seq_hat = [
                        self.char_list[int(idx)] for idx in y_hat if int(idx) != -1
                    ]
                    seq_true = [
                        self.char_list[int(idx)] for idx in y_true if int(idx) != -1
                    ]
                    seq_hat_text = "".join(seq_hat).replace(self.space, " ")
                    seq_hat_text = seq_hat_text.replace(self.blank, "")
                    seq_true_text = "".join(seq_true).replace(self.space, " ")

                    hyp_chars = seq_hat_text.replace(" ", "")
                    ref_chars = seq_true_text.replace(" ", "")
                    if len(ref_chars) > 0:
                        cers.append(
                            editdistance.eval(hyp_chars, ref_chars) / len(ref_chars)
                        )
                cer_ctc = sum(cers) / len(cers) if cers else None

        return loss_att, acc, loss_ctc, cer_ctc, cer, wer

    def forward_mt(self, xs_pad, ys_pad):
        """Run the auxiliary MT branch (text encoder + shared decoder).

        :param torch.Tensor xs_pad: batch of source sequences padded with -1;
            the padding is stripped along dim 1 (presumably token ids of
            shape (B, Tmax) -- confirm against the caller)
        :param torch.Tensor ys_pad: batch of padded target sequences (B, Lmax)
        :return: MT loss value (0.0 when the MT task is disabled)
        :rtype: torch.Tensor
        :return: accuracy in MT decoder (0.0 when the MT task is disabled)
        :rtype: float
        """
        if self.mt_weight == 0:
            # auxiliary MT task is switched off: nothing to compute
            return 0.0, 0.0

        # NOTE: xs_pad is padded with -1
        src_lens = torch.sum(xs_pad != -1, dim=1).cpu().numpy()
        src_seqs = [seq[seq != -1] for seq in xs_pad]  # drop the -1 padding
        src_pad = pad_list(src_seqs, self.pad)  # re-pad with self.pad
        src_emb = self.dropout_mt(self.embed_mt(src_pad))
        hs_pad, hlens, _ = self.enc_mt(src_emb, src_lens)
        loss, acc, _ = self.dec(hs_pad, hlens, ys_pad)
        return loss, acc

    def scorers(self):
        """Return the scorer modules keyed by name (only the decoder here)."""
        return {"decoder": self.dec}

    def encode(self, x):
        """Encode acoustic features.

        Switches the model to evaluation mode, subsamples the frames by
        ``self.subsample[0]`` and runs the encoder on a batch of one utterance.

        :param ndarray x: input acoustic feature (T, D)
        :return: encoder outputs
        :rtype: torch.Tensor
        """
        self.eval()

        # subsample frame
        x = x[:: self.subsample[0], :]
        # The length must be measured AFTER subsampling so that it matches the
        # time axis of the tensor actually handed to the encoder.
        ilens = [x.shape[0]]

        # place the input on the same device / dtype as the model parameters
        p = next(self.parameters())
        h = torch.as_tensor(x, device=p.device, dtype=p.dtype)
        # make a utt list (1) to use the same interface for encoder
        hs = h.contiguous().unsqueeze(0)

        # 1. encoder
        hs, _, _ = self.enc(hs, ilens)
        return hs.squeeze(0)

    def translate(self, x, trans_args, char_list, rnnlm=None):
        """Translate a single utterance with beam search.

        :param ndarray x: input acoustic feature (T, D)
        :param Namespace trans_args: argument Namespace containing options
        :param list char_list: list of characters
        :param torch.nn.Module rnnlm: language model module
        :return: N-best decoding results
        :rtype: list
        """
        logging.info("input lengths: " + str(x.shape[0]))

        # 1. Encoder (a batch axis of size one is added to the output)
        enc_out = self.encode(x).unsqueeze(0)
        logging.info("encoder output lengths: " + str(enc_out.size(1)))

        # 2. Decoder: beam search over the only utterance in the batch
        return self.dec.recognize_beam(
            enc_out[0], None, trans_args, char_list, rnnlm
        )

    def translate_batch(self, xs, trans_args, char_list, rnnlm=None):
        """E2E batch beam search.

        The model is put into evaluation mode for decoding and the previous
        training/evaluation mode is restored afterwards, even if decoding
        raises.

        :param list xs: list of input acoustic feature arrays [(T_1, D), (T_2, D), ...]
        :param Namespace trans_args: argument Namespace containing options
        :param list char_list: list of characters
        :param torch.nn.Module rnnlm: language model module
        :return: N-best decoding results
        :rtype: list
        """
        prev = self.training
        self.eval()
        try:
            # subsample frame
            xs = [xx[:: self.subsample[0], :] for xx in xs]
            # Lengths are measured AFTER subsampling so that they describe the
            # sequences that are actually padded and fed to the encoder.
            ilens = np.fromiter((xx.shape[0] for xx in xs), dtype=np.int64)
            xs = [to_device(self, to_torch_tensor(xx).float()) for xx in xs]
            xs_pad = pad_list(xs, 0.0)

            # 1. Encoder
            hs_pad, hlens, _ = self.enc(xs_pad, ilens)

            # 2. Decoder
            hlens = torch.tensor(list(map(int, hlens)))  # make sure hlens is tensor
            y = self.dec.recognize_beam_batch(
                hs_pad, hlens, None, trans_args, char_list, rnnlm
            )
        finally:
            if prev:
                self.train()
        return y

    def calculate_all_attentions(self, xs_pad, ilens, ys_pad, ys_pad_src):
        """E2E attention calculation.

        Runs in evaluation mode without gradients. The training/evaluation
        mode the model had on entry is restored on exit, so calling this on a
        model that is already in evaluation mode does not flip it to training.

        :param torch.Tensor xs_pad: batch of padded input sequences (B, Tmax, idim)
        :param torch.Tensor ilens: batch of lengths of input sequences (B)
        :param torch.Tensor ys_pad: batch of padded token id sequence tensor (B, Lmax)
        :param torch.Tensor ys_pad_src:
            batch of padded token id sequence tensor (B, Lmax); not used here
        :return: attention weights with the following shape,
            1) multi-head case => attention weights (B, H, Lmax, Tmax),
            2) other case => attention weights (B, Lmax, Tmax).
        :rtype: float ndarray
        """
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                # 1. Encoder
                if self.multilingual:
                    tgt_lang_ids = ys_pad[:, 0:1]
                    # remove target language ID in the beginning
                    ys_pad = ys_pad[:, 1:]
                else:
                    tgt_lang_ids = None
                hpad, hlens, _ = self.enc(xs_pad, ilens)

                # 2. Decoder
                att_ws = self.dec.calculate_all_attentions(
                    hpad, hlens, ys_pad, lang_ids=tgt_lang_ids
                )
        finally:
            if was_training:
                self.train()
        return att_ws

    def calculate_all_ctc_probs(self, xs_pad, ilens, ys_pad, ys_pad_src):
        """E2E CTC probability calculation.

        Returns ``None`` when the ASR task or its CTC branch is disabled.
        Otherwise runs in evaluation mode without gradients and restores the
        training/evaluation mode the model had on entry.

        :param torch.Tensor xs_pad: batch of padded input sequences (B, Tmax)
        :param torch.Tensor ilens: batch of lengths of input sequences (B)
        :param torch.Tensor ys_pad: batch of padded token id sequence tensor (B, Lmax)
        :param torch.Tensor
            ys_pad_src: batch of padded token id sequence tensor (B, Lmax)
        :return: CTC probability (B, Tmax, vocab)
        :rtype: float ndarray
        """
        if self.asr_weight == 0 or self.mtlalpha == 0:
            return None

        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                # 1. Encoder
                hpad, hlens, _ = self.enc(xs_pad, ilens)

                # 2. CTC probs
                probs = self.ctc.softmax(hpad).cpu().numpy()
        finally:
            if was_training:
                self.train()
        return probs

    def subsample_frames(self, x):
        """Subsample speech frames in the encoder.

        :param ndarray x: input acoustic feature; indexed as (T, D)
        :return: subsampled features as a float32 tensor passed through
            ``to_device(self, ...)``
        :rtype: torch.Tensor
        :return: single-element list holding the number of frames left after
            subsampling
        :rtype: list
        """
        # subsample frame
        x = x[:: self.subsample[0], :]
        ilen = [x.shape[0]]
        h = to_device(self, torch.from_numpy(np.array(x, dtype=np.float32)))
        # Tensor.contiguous() returns the contiguous tensor rather than
        # modifying in place, so its result has to be kept.
        h = h.contiguous()
        return h, ilen


================================================
FILE: nets/pytorch_backend/e2e_st_conformer.py
================================================
# Copyright 2020 Kyoto University (Hirofumi Inaguma)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""
Conformer speech translation model (pytorch).

It reuses the Transformer ST model from `e2e_st_transformer.py` and replaces
its encoder with a Conformer encoder.
Refer to: https://arxiv.org/abs/2005.08100

"""

from espnet.nets.pytorch_backend.conformer.encoder import Encoder
from espnet.nets.pytorch_backend.e2e_st_transformer import E2E as E2ETransformer
from espnet.nets.pytorch_backend.conformer.argument import (
    add_arguments_conformer_common,  # noqa: H301
    verify_rel_pos_type,  # noqa: H301
)


class E2E(E2ETransformer):
    """Speech translation model with a Conformer encoder.

    Everything except the encoder is inherited from the Transformer ST model.

    :param int idim: dimension of inputs
    :param int odim: dimension of outputs
    :param Namespace args: argument Namespace containing options

    """

    @staticmethod
    def add_arguments(parser):
        """Register the Transformer and the Conformer options on the parser."""
        E2ETransformer.add_arguments(parser)
        E2E.add_conformer_arguments(parser)
        return parser

    @staticmethod
    def add_conformer_arguments(parser):
        """Register the Conformer-specific options in their own group."""
        conformer_group = parser.add_argument_group(
            "conformer model specific setting"
        )
        add_arguments_conformer_common(conformer_group)
        return parser

    def __init__(self, idim, odim, args, ignore_id=-1):
        """Build the Transformer ST model, then swap in a Conformer encoder.

        :param int idim: dimension of inputs
        :param int odim: dimension of outputs
        :param Namespace args: argument Namespace containing options
        :param int ignore_id: forwarded unchanged to the parent constructor
        """
        super().__init__(idim, odim, args, ignore_id)

        # attention dropout falls back to the general dropout rate
        if args.transformer_attn_dropout_rate is None:
            args.transformer_attn_dropout_rate = args.dropout_rate

        # Check the relative positional encoding type
        args = verify_rel_pos_type(args)

        encoder_options = dict(
            idim=idim,
            attention_dim=args.adim,
            attention_heads=args.aheads,
            linear_units=args.eunits,
            num_blocks=args.elayers,
            input_layer=args.transformer_input_layer,
            dropout_rate=args.dropout_rate,
            positional_dropout_rate=args.dropout_rate,
            attention_dropout_rate=args.transformer_attn_dropout_rate,
            pos_enc_layer_type=args.transformer_encoder_pos_enc_layer_type,
            selfattention_layer_type=args.transformer_encoder_selfattn_layer_type,
            activation_type=args.transformer_encoder_activation_type,
            macaron_style=args.macaron_style,
            use_cnn_module=args.use_cnn_module,
            cnn_module_kernel=args.cnn_module_kernel,
        )
        # replaces the encoder created by the parent constructor
        self.encoder = Encoder(**encoder_options)

        # re-run the initialization now that the encoder has been replaced
        self.reset_parameters(args)


================================================
FILE: nets/pytorch_backend/e2e_st_transformer.py
================================================
# Copyright 2019 Kyoto University (Hirofumi Inaguma)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Transformer speech translation model (pytorch)."""

from argparse import Namespace
import logging
import math
import numpy

import torch

from espnet.nets.e2e_asr_common import end_detect
from espnet.nets.e2e_asr_common import ErrorCalculator as ASRErrorCalculator
from espnet.nets.e2e_mt_common import ErrorCalculator as MTErrorCalculator
from espnet.nets.pytorch_backend.ctc import CTC
from espnet.nets.pytorch_backend.e2e_asr import CTC_LOSS_THRESHOLD
from espnet.nets.pytorch_backend.e2e_st import Reporter
from espnet.nets.pytorch_backend.nets_utils import get_subsample
from espnet.nets.pytorch_backend.nets_utils import make_non_pad_mask
from espnet.nets.pytorch_backend.nets_utils import pad_list
from espnet.nets.pytorch_backend.nets_utils import th_accuracy
from espnet.nets.pytorch_backend.transformer.add_sos_eos import add_sos_eos
from espnet.nets.pytorch_backend.transformer.argument import (
    add_arguments_transformer_common,  # noqa: H301
)
from espnet.nets.pytorch_backend.transformer.attention import MultiHeadedAttention
from espnet.nets.pytorch_backend.transformer.decoder import Decoder
from espnet.nets.pytorch_backend.transformer.encoder import Encoder
from espnet.nets.pytorch_backend.transformer.initializer import initialize
from espnet.nets.pytorch_backend.transformer.label_smoothing_loss import (
    LabelSmoothingLoss,  # noqa: H301
)
from espnet.nets.pytorch_backend.transformer.mask import subsequent_mask
from espnet.nets.pytorch_backend.transformer.mask import target_mask
from espnet.nets.pytorch_backend.transformer.plot import PlotAttentionReport
from espnet.nets.st_interface import STInterface
from espnet.utils.fill_missing_args import fill_missing_args


class E2E(STInterface, torch.nn.Module):
    """E2E module.

    :param int idim: dimension of inputs
    :param int odim: dimension of outputs
    :param Namespace args: argument Namespace containing options

    """

    @staticmethod
    def add_arguments(parser):
        """Register the common Transformer options on the parser and return it."""
        transformer_group = parser.add_argument_group("transformer model setting")
        add_arguments_transformer_common(transformer_group)
        return parser

    @property
    def attention_plot_class(self):
        """Class used to plot the attention weights of this model."""
        plot_cls = PlotAttentionReport
        return plot_cls

    def get_total_subsampling_factor(self):
        """Return the encoder's conv subsampling factor times prod(subsample)."""
        frame_skip = int(numpy.prod(self.subsample))
        return self.encoder.conv_subsampling_factor * frame_skip

    def __init__(self, idim, odim, args, ignore_id=-1):
        """Construct an E2E object.

        Builds the speech encoder and the translation decoder, plus the
        optional auxiliary ASR decoder, MT encoder and CTC module depending on
        ``args.asr_weight``, ``args.mt_weight`` and ``args.mtlalpha``.

        :param int idim: dimension of inputs
        :param int odim: dimension of outputs
        :param Namespace args: argument Namespace containing options
        :param int ignore_id: label id that is ignored by the loss and the
            accuracy computation (default: -1)
        """
        torch.nn.Module.__init__(self)

        # fill missing arguments for compatibility
        args = fill_missing_args(args, self.add_arguments)

        # attention dropout falls back to the general dropout rate
        if args.transformer_attn_dropout_rate is None:
            args.transformer_attn_dropout_rate = args.dropout_rate
        # encoder for the (speech) input features
        self.encoder = Encoder(
            idim=idim,
            selfattention_layer_type=args.transformer_encoder_selfattn_layer_type,
            attention_dim=args.adim,
            attention_heads=args.aheads,
            conv_wshare=args.wshare,
            conv_kernel_length=args.ldconv_encoder_kernel_length,
            conv_usebias=args.ldconv_usebias,
            linear_units=args.eunits,
            num_blocks=args.elayers,
            input_layer=args.transformer_input_layer,
            dropout_rate=args.dropout_rate,
            positional_dropout_rate=args.dropout_rate,
            attention_dropout_rate=args.transformer_attn_dropout_rate,
        )
        # decoder for the ST task (also reused by the auxiliary MT task)
        self.decoder = Decoder(
            odim=odim,
            selfattention_layer_type=args.transformer_decoder_selfattn_layer_type,
            attention_dim=args.adim,
            attention_heads=args.aheads,
            conv_wshare=args.wshare,
            conv_kernel_length=args.ldconv_decoder_kernel_length,
            conv_usebias=args.ldconv_usebias,
            linear_units=args.dunits,
            num_blocks=args.dlayers,
            dropout_rate=args.dropout_rate,
            positional_dropout_rate=args.dropout_rate,
            self_attention_dropout_rate=args.transformer_attn_dropout_rate,
            src_attention_dropout_rate=args.transformer_attn_dropout_rate,
        )
        self.pad = 0  # use <blank> for padding
        # <sos> and <eos> share the last index of the output vocabulary
        self.sos = odim - 1
        self.eos = odim - 1
        self.odim = odim
        self.ignore_id = ignore_id
        self.subsample = get_subsample(args, mode="st", arch="transformer")
        self.reporter = Reporter()

        # label-smoothed loss shared by the ST, ASR-attention and MT branches
        self.criterion = LabelSmoothingLoss(
            self.odim,
            self.ignore_id,
            args.lsm_weight,
            args.transformer_length_normalized_loss,
        )
        # submodule for ASR task
        self.mtlalpha = args.mtlalpha
        self.asr_weight = args.asr_weight
        # the attention-based ASR decoder is only needed when the ASR task is
        # enabled and is not purely CTC (mtlalpha < 1)
        if self.asr_weight > 0 and args.mtlalpha < 1:
            self.decoder_asr = Decoder(
                odim=odim,
                attention_dim=args.adim,
                attention_heads=args.aheads,
                linear_units=args.dunits,
                num_blocks=args.dlayers,
                dropout_rate=args.dropout_rate,
                positional_dropout_rate=args.dropout_rate,
                self_attention_dropout_rate=args.transformer_attn_dropout_rate,
                src_attention_dropout_rate=args.transformer_attn_dropout_rate,
            )

        # submodule for MT task
        self.mt_weight = args.mt_weight
        if self.mt_weight > 0:
            # text encoder with an embedding input layer over the odim vocabulary
            self.encoder_mt = Encoder(
                idim=odim,
                attention_dim=args.adim,
                attention_heads=args.aheads,
                linear_units=args.dunits,
                num_blocks=args.dlayers,
                input_layer="embed",
                dropout_rate=args.dropout_rate,
                positional_dropout_rate=args.dropout_rate,
                attention_dropout_rate=args.transformer_attn_dropout_rate,
                padding_idx=0,
            )
        self.reset_parameters(args)  # NOTE: place after the submodule initialization
        self.adim = args.adim  # used for CTC (equal to d_model)
        # the CTC module is created after reset_parameters(), so it is not
        # touched by that initialization
        if self.asr_weight > 0 and args.mtlalpha > 0.0:
            self.ctc = CTC(
                odim, args.adim, args.dropout_rate, ctc_type=args.ctc_type, reduce=True
            )
        else:
            self.ctc = None

        # translation error calculator
        self.error_calculator = MTErrorCalculator(
            args.char_list, args.sym_space, args.sym_blank, args.report_bleu
        )

        # recognition error calculator
        self.error_calculator_asr = ASRErrorCalculator(
            args.char_list,
            args.sym_space,
            args.sym_blank,
            args.report_cer,
            args.report_wer,
        )
        self.rnnlm = None

        # multilingual E2E-ST related
        # both options default to False when absent from args
        self.multilingual = getattr(args, "multilingual", False)
        self.replace_sos = getattr(args, "replace_sos", False)

    def reset_parameters(self, args):
        """Initialize the model parameters.

        Applies ``initialize`` with ``args.transformer_init`` to the whole
        module. When the MT task is enabled, the MT encoder's embedding is
        then re-drawn from a normal distribution with std ``adim ** -0.5`` and
        its padding row is zeroed.

        :param Namespace args: argument Namespace containing options
        """
        initialize(self, args.transformer_init)
        if not self.mt_weight > 0:
            return

        embed_weight = self.encoder_mt.embed[0].weight
        torch.nn.init.normal_(embed_weight, mean=0, std=args.adim ** -0.5)
        # keep the padding entry at zero
        torch.nn.init.constant_(embed_weight[self.pad], 0)

    def forward(self, xs_pad, ilens, ys_pad, ys_pad_src):
        """Compute the multi-task training loss (ST + auxiliary ASR / MT).

        :param torch.Tensor xs_pad: batch of padded source sequences (B, Tmax, idim)
        :param torch.Tensor ilens: batch of lengths of source sequences (B)
        :param torch.Tensor ys_pad: batch of padded target sequences (B, Lmax)
        :param torch.Tensor ys_pad_src: batch of padded source-side token
            sequences used by the auxiliary ASR / MT tasks (B, Lmax)
        :return: weighted sum of the ST, ASR and MT losses
        :rtype: torch.Tensor
        """
        # 0. Extract target language ID
        if self.multilingual:
            # the first target token is the language ID; strip it off
            tgt_lang_ids, ys_pad = ys_pad[:, 0:1], ys_pad[:, 1:]
        else:
            tgt_lang_ids = None

        # 1. forward encoder
        xs_pad = xs_pad[:, : max(ilens)]  # for data parallel
        src_mask = make_non_pad_mask(ilens.tolist()).to(xs_pad.device).unsqueeze(-2)
        hs_pad, hs_mask = self.encoder(xs_pad, src_mask)

        # 2. forward decoder
        ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, self.ignore_id)
        if self.replace_sos:
            # replace <sos> with target language ID
            ys_in_pad = torch.cat([tgt_lang_ids, ys_in_pad[:, 1:]], dim=1)
        ys_mask = target_mask(ys_in_pad, self.ignore_id)
        pred_pad, pred_mask = self.decoder(ys_in_pad, ys_mask, hs_pad, hs_mask)

        # 3. compute ST loss
        loss_att = self.criterion(pred_pad, ys_out_pad)
        self.acc = th_accuracy(
            pred_pad.view(-1, self.odim), ys_out_pad, ignore_label=self.ignore_id
        )

        # 4. compute corpus-level bleu in a mini-batch (evaluation only)
        if not self.training:
            ys_hat = pred_pad.argmax(dim=-1)
            self.bleu = self.error_calculator(ys_hat.cpu(), ys_pad.cpu())
        else:
            self.bleu = None

        # 5. compute auxiliary ASR loss
        asr_outputs = self.forward_asr(hs_pad, hs_mask, ys_pad_src)
        loss_asr_att, acc_asr, loss_asr_ctc, cer_ctc, cer, wer = asr_outputs

        # 6. compute auxiliary MT loss
        loss_mt, acc_mt = 0.0, None
        if self.mt_weight > 0:
            loss_mt, acc_mt = self.forward_mt(
                ys_pad_src, ys_in_pad, ys_out_pad, ys_mask
            )

        # 7. combine the task losses
        asr_ctc_weight = self.mtlalpha
        loss_asr = asr_ctc_weight * loss_asr_ctc + (1 - asr_ctc_weight) * loss_asr_att
        self.loss = (
            (1 - self.asr_weight - self.mt_weight) * loss_att
            + self.asr_weight * loss_asr
            + self.mt_weight * loss_mt
        )

        # 8. report plain-float statistics
        loss_asr_data = float(loss_asr)
        loss_mt_data = float(loss_mt) if self.mt_weight != 0 else None
        loss_st_data = float(loss_att)
        loss_data = float(self.loss)

        if math.isnan(loss_data) or not loss_data < CTC_LOSS_THRESHOLD:
            logging.warning("loss (=%f) is not correct", loss_data)
        else:
            self.reporter.report(
                loss_asr_data,
                loss_mt_data,
                loss_st_data,
                acc_asr,
                acc_mt,
                self.acc,
                cer_ctc,
                cer,
                wer,
                self.bleu,
                loss_data,
            )
        return self.loss

    def forward_asr(self, hs_pad, hs_mask, ys_pad):
        """Run the auxiliary ASR branch (attention decoder and/or CTC).

        :param torch.Tensor hs_pad: batch of padded encoder outputs
        :param torch.Tensor hs_mask: mask of the valid encoder frames
        :param torch.Tensor ys_pad: batch of padded target sequences (B, Lmax)
        :return: ASR attention loss value
        :rtype: torch.Tensor
        :return: accuracy in ASR attention decoder
        :rtype: float
        :return: ASR CTC loss value
        :rtype: torch.Tensor
        :return: character error rate from CTC prediction
        :rtype: float
        :return: character error rate from attention decoder prediction
        :rtype: float
        :return: word error rate from attention decoder prediction
        :rtype: float
        """
        loss_att = 0.0
        loss_ctc = 0.0
        acc = cer = wer = cer_ctc = None

        if self.asr_weight == 0:
            # ASR task disabled: return the neutral defaults
            return loss_att, acc, loss_ctc, cer_ctc, cer, wer

        # attention
        if self.mtlalpha < 1:
            asr_in, asr_out = add_sos_eos(ys_pad, self.sos, self.eos, self.ignore_id)
            asr_mask = target_mask(asr_in, self.ignore_id)
            pred_pad, _ = self.decoder_asr(asr_in, asr_mask, hs_pad, hs_mask)
            loss_att = self.criterion(pred_pad, asr_out)
            acc = th_accuracy(
                pred_pad.view(-1, self.odim), asr_out, ignore_label=self.ignore_id
            )
            if not self.training:
                # error rates are only computed during evaluation
                hyp_att = pred_pad.argmax(dim=-1)
                cer, wer = self.error_calculator_asr(hyp_att.cpu(), ys_pad.cpu())

        # CTC
        if self.mtlalpha > 0:
            batch_size = hs_pad.size(0)
            # number of valid frames per utterance, taken from the mask
            hs_len = hs_mask.view(batch_size, -1).sum(1)
            loss_ctc = self.ctc(hs_pad.view(batch_size, -1, self.adim), hs_len, ys_pad)
            if not self.training:
                hyp_ctc = self.ctc.argmax(hs_pad.view(batch_size, -1, self.adim)).data
                cer_ctc = self.error_calculator_asr(
                    hyp_ctc.cpu(), ys_pad.cpu(), is_ctc=True
                )
                # for visualization
                self.ctc.softmax(hs_pad)

        return loss_att, acc, loss_ctc, cer_ctc, cer, wer

    def forward_mt(self, xs_pad, ys_in_pad, ys_out_pad, ys_mask):
        """Run the auxiliary MT branch (text encoder + shared decoder).

        :param torch.Tensor xs_pad: batch of source sequences padded with
            ``self.ignore_id``; the padding is stripped along dim 1
        :param torch.Tensor ys_in_pad: batch of padded decoder inputs (B, Lmax)
        :param torch.Tensor ys_out_pad: batch of padded decoder targets (B, Lmax)
        :param torch.Tensor ys_mask: mask for the decoder inputs
        :return: MT loss value (0.0 when the MT task is disabled)
        :rtype: torch.Tensor
        :return: accuracy in MT decoder (None when the MT task is disabled)
        :rtype: float
        """
        if self.mt_weight == 0:
            return 0.0, None

        # NOTE: xs_pad is padded with ignore_id
        valid = xs_pad != self.ignore_id
        ilens = torch.sum(valid, dim=1).cpu().numpy()
        src_seqs = [seq[seq != self.ignore_id] for seq in xs_pad]  # drop padding
        src_pad = pad_list(src_seqs, self.pad)  # re-pad with self.pad
        src_pad = src_pad[:, : max(ilens)]  # for data parallel
        src_mask = make_non_pad_mask(ilens.tolist()).to(src_pad.device).unsqueeze(-2)

        hs_pad, hs_mask = self.encoder_mt(src_pad, src_mask)
        pred_pad, _ = self.decoder(ys_in_pad, ys_mask, hs_pad, hs_mask)
        loss = self.criterion(pred_pad, ys_out_pad)
        acc = th_accuracy(
            pred_pad.view(-1, self.odim), ys_out_pad, ignore_label=self.ignore_id
        )
        return loss, acc

    def scorers(self):
        """Return the scorer modules keyed by name (only the decoder here)."""
        return {"decoder": self.decoder}

    def encode(self, x):
        """Encode source acoustic features.

        Switches the model to evaluation mode and runs the encoder on a batch
        of one utterance without a mask.

        :param ndarray x: source acoustic feature (T, D)
        :return: encoder outputs
        :rtype: torch.Tensor
        """
        self.eval()
        # Place the input on the same device / dtype as the model parameters so
        # that a CPU array can be fed to a model living on another device.
        p = next(self.parameters())
        x = torch.as_tensor(x, device=p.device, dtype=p.dtype).unsqueeze(0)
        enc_output, _ = self.encoder(x, None)
        return enc_output.squeeze(0)

    def translate(
        self,
        x,
        trans_args,
        char_list=None,
    ):
        """Translate input speech with beam search.

        :param ndarray x: input acoustic feature (presumably (T, D); it is
            passed to ``self.encode`` as a single utterance)
        :param Namespace trans_args: argument Namespace containing options
            (reads beam_size, penalty, maxlenratio, minlenratio, nbest and,
            optionally, tgt_lang)
        :param list char_list: list of characters; needed to look up the
            target language token when it replaces <sos>, otherwise it is
            only used for logging
        :return: N-best decoding results
        :rtype: list
        """
        # prepare sos: start from <sos> and switch to the target language ID
        # only when a target language is given AND the model replaces <sos>
        y = self.sos
        if self.replace_sos and getattr(trans_args, "tgt_lang", False):
            y = char_list.index(trans_args.tgt_lang)
        logging.info("<sos> index: " + str(y))
        if char_list is not None:
            logging.info("<sos> mark: " + char_list[y])
        logging.info("input lengths: " + str(x.shape[0]))

        enc_output = self.encode(x).unsqueeze(0)

        h = enc_output

        logging.info("encoder output lengths: " + str(h.size(1)))
        # search params
        beam = trans_args.beam_size
        penalty = trans_args.penalty

        if trans_args.maxlenratio == 0:
            maxlen = h.size(1)
        else:
            # maxlen >= 1
            maxlen = max(1, int(trans_args.maxlenratio * h.size(1)))
        minlen = int(trans_args.minlenratio * h.size(1))
        logging.info("max output length: " + str(maxlen))
        logging.info("min output length: " + str(minlen))

        # initialize hypothesis
        hyp = {"score": 0.0, "yseq": [y]}
        hyps = [hyp]
        ended_hyps = []

        for i in range(maxlen):
            logging.debug("position " + str(i))

            # batchfy: every running hypothesis has exactly i + 1 tokens
            ys = h.new_zeros((len(hyps), i + 1), dtype=torch.int64)
            for j, hyp in enumerate(hyps):
                ys[j, :] = torch.tensor(hyp["yseq"])
            ys_mask = subsequent_mask(i + 1).unsqueeze(0).to(h.device)

            local_scores = self.decoder.forward_one_step(
                ys, ys_mask, h.repeat([len(hyps), 1, 1])
            )[0]

            hyps_best_kept = []
            for j, hyp in enumerate(hyps):
                local_best_scores, local_best_ids = torch.topk(
                    local_scores[j : j + 1], beam, dim=1
                )

                # a distinct index name keeps the hypothesis index j intact
                for k in range(beam):
                    new_hyp = {
                        "score": hyp["score"] + float(local_best_scores[0, k]),
                        "yseq": hyp["yseq"] + [int(local_best_ids[0, k])],
                    }
                    # will be (2 x beam) hyps at most
                    hyps_best_kept.append(new_hyp)

                hyps_best_kept = sorted(
                    hyps_best_kept, key=lambda x: x["score"], reverse=True
                )[:beam]

            # sort and get nbest
            hyps = hyps_best_kept
            logging.debug("number of pruned hypothes: " + str(len(hyps)))
            if char_list is not None:
                logging.debug(
                    "best hypo: "
                    + "".join([char_list[int(x)] for x in hyps[0]["yseq"][1:]])
                )

            # add eos in the final loop to avoid that there are no ended hyps
            if i == maxlen - 1:
                logging.info("adding <eos> in the last postion in the loop")
                for hyp in hyps:
                    hyp["yseq"].append(self.eos)

            # add ended hypotheses to a final list, and remove them from the
            # current hypotheses
            # (this will be a problem if the number of hyps < beam)
            remained_hyps = []
            for hyp in hyps:
                if hyp["yseq"][-1] == self.eos:
                    # only store the sequence that has more than minlen outputs
                    # also add penalty
                    if len(hyp["yseq"]) > minlen:
                        hyp["score"] += (i + 1) * penalty
                        ended_hyps.append(hyp)
                else:
                    remained_hyps.append(hyp)

            # end detection
            if end_detect(ended_hyps, i) and trans_args.maxlenratio == 0.0:
                logging.info("end detected at %d", i)
                break

            hyps = remained_hyps
            if len(hyps) > 0:
                logging.debug("remeined hypothes: " + str(len(hyps)))
            else:
                logging.info("no hypothesis. Finish decoding.")
                break

            if char_list is not None:
                for hyp in hyps:
                    logging.debug(
                        "hypo: " + "".join([char_list[int(x)] for x in hyp["yseq"][1:]])
                    )

            logging.debug("number of ended hypothes: " + str(len(ended_hyps)))

        nbest_hyps = sorted(ended_hyps, key=lambda x: x["score"], reverse=True)[
            : min(len(ended_hyps), trans_args.nbest)
        ]

        # check number of hypotheses
        if len(nbest_hyps) == 0:
            logging.warning(
                "there is no N-best results, perform translation "
                "again with smaller minlenratio."
            )
            # should copy because Namespace will be overwritten globally
            trans_args = Namespace(**vars(trans_args))
            trans_args.minlenratio = max(0.0, trans_args.minlenratio - 0.1)
            return self.translate(x, trans_args, char_list)

        logging.info("total log probability: " + str(nbest_hyps[0]["score"]))
        logging.info(
            "normalized log probability: "
            + str(nbest_hyps[0]["score"] / len(nbest_hyps[0]["yseq"]))
        )
        return nbest_hyps

    def calculate_all_attentions(self, xs_pad, ilens, ys_pad, ys_pad_src):
        """E2E attention calculation.

        Runs one forward pass in evaluation mode without gradients and
        collects the attention weights stored on every ``MultiHeadedAttention``
        submodule. The training/evaluation mode the model had on entry is
        restored on exit.

        :param torch.Tensor xs_pad: batch of padded input sequences (B, Tmax, idim)
        :param torch.Tensor ilens: batch of lengths of input sequences (B)
        :param torch.Tensor ys_pad: batch of padded token id sequence tensor (B, Lmax)
        :param torch.Tensor ys_pad_src:
            batch of padded token id sequence tensor (B, Lmax)
        :return: attention weights (B, H, Lmax, Tmax), keyed by module name
        :rtype: dict of float ndarray
        """
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                self.forward(xs_pad, ilens, ys_pad, ys_pad_src)
            ret = dict()
            for name, m in self.named_modules():
                # skip attention modules that did not take part in the pass
                if isinstance(m, MultiHeadedAttention) and m.attn is not None:
                    ret[name] = m.attn.cpu().numpy()
        finally:
            if was_training:
                self.train()
        return ret

    def calculate_all_ctc_probs(self, xs_pad, ilens, ys_pad, ys_pad_src):
        """E2E CTC probability calculation.

        Returns ``None`` when the ASR task or its CTC branch is disabled.
        Otherwise runs one forward pass in evaluation mode without gradients,
        reads the probabilities stored on the ``CTC`` submodule, and restores
        the training/evaluation mode the model had on entry.

        :param torch.Tensor xs_pad: batch of padded input sequences (B, Tmax)
        :param torch.Tensor ilens: batch of lengths of input sequences (B)
        :param torch.Tensor ys_pad: batch of padded token id sequence tensor (B, Lmax)
        :param torch.Tensor ys_pad_src:
            batch of padded token id sequence tensor (B, Lmax)
        :return: CTC probability (B, Tmax, vocab)
        :rtype: float ndarray
        """
        if self.asr_weight == 0 or self.mtlalpha == 0:
            return None

        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                self.forward(xs_pad, ilens, ys_pad, ys_pad_src)
            ret = None
            for name, m in self.named_modules():
                if isinstance(m, CTC) and m.probs is not None:
                    ret = m.probs.cpu().numpy()
        finally:
            if was_training:
                self.train()
        return ret


================================================
FILE: nets/pytorch_backend/e2e_tts_fastspeech.py
================================================
# Copyright 2019 Tomoki Hayashi
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""FastSpeech related modules."""

import logging

import torch
import torch.nn.functional as F

from espnet.asr.asr_utils import get_model_conf
from espnet.asr.asr_utils import torch_load
from espnet.nets.pytorch_backend.fastspeech.duration_calculator import (
    DurationCalculator,  # noqa: H301
)
from espnet.nets.pytorch_backend.fastspeech.duration_predictor import DurationPredictor
from espnet.nets.pytorch_backend.fastspeech.duration_predictor import (
    DurationPredictorLoss,  # noqa: H301
)
from espnet.nets.pytorch_backend.fastspeech.length_regulator import LengthRegulator
from espnet.nets.pytorch_backend.nets_utils import make_non_pad_mask
from espnet.nets.pytorch_backend.nets_utils import make_pad_mask
from espnet.nets.pytorch_backend.tacotron2.decoder import Postnet
from espnet.nets.pytorch_backend.transformer.attention import MultiHeadedAttention
from espnet.nets.pytorch_backend.transformer.embedding import PositionalEncoding
from espnet.nets.pytorch_backend.transformer.embedding import ScaledPositionalEncoding
from espnet.nets.pytorch_backend.transformer.encoder import Encoder
from espnet.nets.pytorch_backend.transformer.initializer import initialize
from espnet.nets.tts_interface import TTSInterface
from espnet.utils.cli_utils import strtobool
from espnet.utils.fill_missing_args import fill_missing_args


class FeedForwardTransformerLoss(torch.nn.Module):
    """Loss function module for feed-forward Transformer.

    Combines an L1 loss on the generated features (before and, optionally,
    after the postnet) with a duration predictor loss. Padded positions can
    either be dropped before the loss is computed (``use_masking``) or
    down-weighted per utterance afterwards (``use_weighted_masking``).
    """

    def __init__(self, use_masking=True, use_weighted_masking=False):
        """Initialize feed-forward Transformer loss module.

        Args:
            use_masking (bool):
                Whether to apply masking for padded part in loss calculation.
            use_weighted_masking (bool):
                Whether to apply weighted masking in loss calculation.
                Cannot be enabled together with ``use_masking``.

        """
        super().__init__()
        # the two strategies are mutually exclusive; disabling both is allowed
        assert (use_masking != use_weighted_masking) or not use_masking
        self.use_masking = use_masking
        self.use_weighted_masking = use_weighted_masking

        # weighted masking needs element-wise losses so weights can be applied
        reduction = "mean"
        if self.use_weighted_masking:
            reduction = "none"
        self.l1_criterion = torch.nn.L1Loss(reduction=reduction)
        self.duration_criterion = DurationPredictorLoss(reduction=reduction)

    def forward(self, after_outs, before_outs, d_outs, ys, ds, ilens, olens):
        """Calculate forward propagation.

        Args:
            after_outs (Tensor): Batch of outputs after postnets (B, Lmax, odim).
                May be None, in which case only `before_outs` is scored.
            before_outs (Tensor): Batch of outputs before postnets (B, Lmax, odim).
            d_outs (Tensor): Batch of outputs of duration predictor (B, Tmax).
            ys (Tensor): Batch of target features (B, Lmax, odim).
            ds (Tensor): Batch of durations (B, Tmax).
            ilens (LongTensor): Batch of the lengths of each input (B,).
            olens (LongTensor): Batch of the lengths of each target (B,).

        Returns:
            Tensor: L1 loss value.
            Tensor: Duration predictor loss value.

        """
        # drop padded elements up front so they never reach the criterions
        if self.use_masking:
            dur_keep = make_non_pad_mask(ilens).to(ys.device)
            out_keep = make_non_pad_mask(olens).unsqueeze(-1).to(ys.device)
            d_outs = d_outs.masked_select(dur_keep)
            ds = ds.masked_select(dur_keep)
            before_outs = before_outs.masked_select(out_keep)
            if after_outs is not None:
                after_outs = after_outs.masked_select(out_keep)
            ys = ys.masked_select(out_keep)

        # feature loss (before postnet, plus after postnet when available)
        l1_loss = self.l1_criterion(before_outs, ys)
        if after_outs is not None:
            l1_loss = l1_loss + self.l1_criterion(after_outs, ys)
        duration_loss = self.duration_criterion(d_outs, ds)

        if not self.use_weighted_masking:
            return l1_loss, duration_loss

        # weighted masking: every utterance contributes equally regardless of
        # its length, and padded positions are removed before summation
        out_keep = make_non_pad_mask(olens).unsqueeze(-1).to(ys.device)
        frames_per_utt = out_keep.sum(dim=1, keepdim=True).float()
        out_weights = out_keep.float() / frames_per_utt
        out_weights /= ys.size(0) * ys.size(2)

        dur_keep = make_non_pad_mask(ilens).to(ys.device)
        tokens_per_utt = dur_keep.sum(dim=1, keepdim=True).float()
        dur_weights = dur_keep.float() / tokens_per_utt
        dur_weights /= ds.size(0)

        l1_loss = l1_loss.mul(out_weights).masked_select(out_keep).sum()
        duration_loss = duration_loss.mul(dur_weights).masked_select(dur_keep).sum()

        return l1_loss, duration_loss


class FeedForwardTransformer(TTSInterface, torch.nn.Module):
    """Feed Forward Transformer for TTS a.k.a. FastSpeech.

    This is a module of FastSpeech,
    feed-forward Transformer with duration predictor described in
    `FastSpeech: Fast, Robust and Controllable Text to Speech`_,
    which does not require any auto-regressive
    processing during inference,
    resulting in fast decoding compared with auto-regressive Transformer.

    .. _`FastSpeech: Fast, Robust and Controllable Text to Speech`:
        https://arxiv.org/pdf/1905.09263.pdf

    """

    @staticmethod
    def add_arguments(parser):
        """Add model-specific arguments to the parser.

        All options are registered in a single argument group named
        "feed-forward transformer model setting".

        Args:
            parser: Argument parser to extend; only its `add_argument_group`
                method is used here.

        Returns:
            The same `parser` object that was passed in.

        """
        group = parser.add_argument_group("feed-forward transformer model setting")
        # network structure related
        group.add_argument(
            "--adim",
            default=384,
            type=int,
            help="Number of attention transformation dimensions",
        )
        group.add_argument(
            "--aheads",
            default=4,
            type=int,
            help="Number of heads for multi head attention",
        )
        group.add_argument(
            "--elayers", default=6, type=int, help="Number of encoder layers"
        )
        group.add_argument(
            "--eunits", default=1536, type=int, help="Number of encoder hidden units"
        )
        group.add_argument(
            "--dlayers", default=6, type=int, help="Number of decoder layers"
        )
        group.add_argument(
            "--dunits", default=1536, type=int, help="Number of decoder hidden units"
        )
        group.add_argument(
            "--positionwise-layer-type",
            default="linear",
            type=str,
            choices=["linear", "conv1d", "conv1d-linear"],
            help="Positionwise layer type.",
        )
        group.add_argument(
            "--positionwise-conv-kernel-size",
            default=3,
            type=int,
            help="Kernel size of positionwise conv1d layer",
        )
        group.add_argument(
            "--postnet-layers", default=0, type=int, help="Number of postnet layers"
        )
        group.add_argument(
            "--postnet-chans", default=256, type=int, help="Number of postnet channels"
        )
        group.add_argument(
            "--postnet-filts", default=5, type=int, help="Filter size of postnet"
        )
        group.add_argument(
            "--use-batch-norm",
            default=True,
            type=strtobool,
            help="Whether to use batch normalization",
        )
        group.add_argument(
            "--use-scaled-pos-enc",
            default=True,
            type=strtobool,
            help="Use trainable scaled positional encoding "
            "instead of the fixed scale one",
        )
        group.add_argument(
            "--encoder-normalize-before",
            default=False,
            type=strtobool,
            help="Whether to apply layer norm before encoder block",
        )
        group.add_argument(
            "--decoder-normalize-before",
            default=False,
            type=strtobool,
            help="Whether to apply layer norm before decoder block",
        )
        group.add_argument(
            "--encoder-concat-after",
            default=False,
            type=strtobool,
            help="Whether to concatenate attention layer's input and output in encoder",
        )
        group.add_argument(
            "--decoder-concat-after",
            default=False,
            type=strtobool,
            help="Whether to concatenate attention layer's input and output in decoder",
        )
        group.add_argument(
            "--duration-predictor-layers",
            default=2,
            type=int,
            help="Number of layers in duration predictor",
        )
        group.add_argument(
            "--duration-predictor-chans",
            default=384,
            type=int,
            help="Number of channels in duration predictor",
        )
        group.add_argument(
            "--duration-predictor-kernel-size",
            default=3,
            type=int,
            help="Kernel size in duration predictor",
        )
        group.add_argument(
            "--teacher-model",
            default=None,
            type=str,
            nargs="?",
            help="Teacher model file path",
        )
        group.add_argument(
            "--reduction-factor", default=1, type=int, help="Reduction factor"
        )
        group.add_argument(
            "--spk-embed-dim",
            default=None,
            type=int,
            help="Number of speaker embedding dimensions",
        )
        group.add_argument(
            "--spk-embed-integration-type",
            type=str,
            default="add",
            choices=["add", "concat"],
            help="How to integrate speaker embedding",
        )
        # training related
        group.add_argument(
            "--transformer-init",
            type=str,
            default="pytorch",
            choices=[
                "pytorch",
                "xavier_uniform",
                "xavier_normal",
                "kaiming_uniform",
                "kaiming_normal",
            ],
            help="How to initialize transformer parameters",
        )
        group.add_argument(
            "--initial-encoder-alpha",
            type=float,
            default=1.0,
            help="Initial alpha value in encoder's ScaledPositionalEncoding",
        )
        group.add_argument(
            "--initial-decoder-alpha",
            type=float,
            default=1.0,
            help="Initial alpha value in decoder's ScaledPositionalEncoding",
        )
        group.add_argument(
            "--transformer-lr",
            default=1.0,
            type=float,
            help="Initial value of learning rate",
        )
        group.add_argument(
            "--transformer-warmup-steps",
            default=4000,
            type=int,
            help="Optimizer warmup steps",
        )
        group.add_argument(
            "--transformer-enc-dropout-rate",
            default=0.1,
            type=float,
            help="Dropout rate for transformer encoder except for attention",
        )
        group.add_argument(
            "--transformer-enc-positional-dropout-rate",
            default=0.1,
            type=float,
            help="Dropout rate for transformer encoder positional encoding",
        )
        group.add_argument(
            "--transformer-enc-attn-dropout-rate",
            default=0.1,
            type=float,
            help="Dropout rate for transformer encoder self-attention",
        )
        group.add_argument(
            "--transformer-dec-dropout-rate",
            default=0.1,
            type=float,
            help="Dropout rate for transformer decoder except "
            "for attention and pos encoding",
        )
        group.add_argument(
            "--transformer-dec-positional-dropout-rate",
            default=0.1,
            type=float,
            help="Dropout rate for transformer decoder positional encoding",
        )
        group.add_argument(
            "--transformer-dec-attn-dropout-rate",
            default=0.1,
            type=float,
            help="Dropout rate for transformer decoder self-attention",
        )
        group.add_argument(
            "--transformer-enc-dec-attn-dropout-rate",
            default=0.1,
            type=float,
            help="Dropout rate for transformer encoder-decoder attention",
        )
        group.add_argument(
            "--duration-predictor-dropout-rate",
            default=0.1,
            type=float,
            help="Dropout rate for duration predictor",
        )
        group.add_argument(
            "--postnet-dropout-rate",
            default=0.5,
            type=float,
            help="Dropout rate in postnet",
        )
        group.add_argument(
            "--transfer-encoder-from-teacher",
            default=True,
            type=strtobool,
            help="Whether to transfer teacher's parameters",
        )
        group.add_argument(
            "--transferred-encoder-module",
            default="all",
            type=str,
            choices=["all", "embed"],
            help="Encoder modules to be transferred from teacher",
        )
        # loss related
        group.add_argument(
            "--use-masking",
            default=True,
            type=strtobool,
            help="Whether to use masking in calculation of loss",
        )
        group.add_argument(
            "--use-weighted-masking",
            default=False,
            type=strtobool,
            help="Whether to use weighted masking in calculation of loss",
        )
        return parser

    def __init__(self, idim, odim, args=None):
        """Initialize feed-forward Transformer module.

        Builds, in this order: encoder, optional speaker-embedding projection,
        duration predictor, length regulator, decoder, output projection,
        optional postnet; then initializes parameters, optionally loads a
        teacher model (and transfers its encoder weights), and finally creates
        the loss criterion. Any option missing from `args` is filled with the
        default declared in `add_arguments`.

        Args:
            idim (int): Dimension of the inputs.
            odim (int): Dimension of the outputs.
            args (Namespace, optional):
                - elayers (int): Number of encoder layers.
                - eunits (int): Number of encoder hidden units.
                - adim (int): Number of attention transformation dimensions.
                - aheads (int): Number of heads for multi head attention.
                - dlayers (int): Number of decoder layers.
                - dunits (int): Number of decoder hidden units.
                - positionwise_layer_type (str): Positionwise layer type
                    (shared by encoder and decoder).
                - positionwise_conv_kernel_size (int):
                    Kernel size of positionwise conv1d layer.
                - postnet_layers (int): Number of postnet layers
                    (0 disables the postnet).
                - postnet_chans (int): Number of postnet channels.
                - postnet_filts (int): Filter size of postnet.
                - use_batch_norm (bool): Whether to use batch normalization
                    (passed to the postnet).
                - use_scaled_pos_enc (bool):
                    Whether to use trainable scaled positional encoding.
                - encoder_normalize_before (bool):
                    Whether to perform layer normalization before encoder block.
                - decoder_normalize_before (bool):
                    Whether to perform layer normalization before decoder block.
                - encoder_concat_after (bool): Whether to concatenate attention
                    layer's input and output in encoder.
                - decoder_concat_after (bool): Whether to concatenate attention
                    layer's input and output in decoder.
                - duration_predictor_layers (int): Number of duration predictor layers.
                - duration_predictor_chans (int): Number of duration predictor channels.
                - duration_predictor_kernel_size (int):
                    Kernel size of duration predictor.
                - duration_predictor_dropout_rate (float):
                    Dropout rate in duration predictor.
                - spk_embed_dim (int): Number of speaker embedding dimensions.
                - spk_embed_integration_type: How to integrate speaker embedding.
                - teacher_model (str): Teacher auto-regressive transformer model path.
                - reduction_factor (int): Reduction factor.
                - transformer_init (str): How to initialize transformer parameters.
                - initial_encoder_alpha (float):
                    Initial alpha value in encoder's ScaledPositionalEncoding.
                - initial_decoder_alpha (float):
                    Initial alpha value in decoder's ScaledPositionalEncoding.
                - transformer_lr (float): Initial value of learning rate.
                - transformer_warmup_steps (int): Optimizer warmup steps.
                - transformer_enc_dropout_rate (float):
                    Dropout rate in encoder except attention & positional encoding.
                - transformer_enc_positional_dropout_rate (float):
                    Dropout rate after encoder positional encoding.
                - transformer_enc_attn_dropout_rate (float):
                    Dropout rate in encoder self-attention module.
                - transformer_dec_dropout_rate (float):
                    Dropout rate in decoder except attention & positional encoding.
                - transformer_dec_positional_dropout_rate (float):
                    Dropout rate after decoder positional encoding.
                - transformer_dec_attn_dropout_rate (float):
                    Dropout rate in decoder self-attention module.
                - transformer_enc_dec_attn_dropout_rate (float):
                    Dropout rate in encoder-decoder attention module.
                - postnet_dropout_rate (float): Dropout rate in postnet.
                - use_masking (bool):
                    Whether to apply masking for padded part in loss calculation.
                - use_weighted_masking (bool):
                    Whether to apply weighted masking in loss calculation.
                - transfer_encoder_from_teacher:
                    Whether to transfer encoder using teacher encoder parameters.
                - transferred_encoder_module:
                    Encoder module to be initialized using teacher parameters.

        """
        # initialize base classes
        TTSInterface.__init__(self)
        torch.nn.Module.__init__(self)

        # fill missing arguments with the defaults declared in add_arguments
        args = fill_missing_args(args, self.add_arguments)

        # store hyperparameters
        self.idim = idim
        self.odim = odim
        self.reduction_factor = args.reduction_factor
        self.use_scaled_pos_enc = args.use_scaled_pos_enc
        self.spk_embed_dim = args.spk_embed_dim
        # NOTE: spk_embed_integration_type only exists as an attribute when
        # speaker embeddings are enabled
        if self.spk_embed_dim is not None:
            self.spk_embed_integration_type = args.spk_embed_integration_type

        # use idx 0 as padding idx
        padding_idx = 0

        # get positional encoding class
        pos_enc_class = (
            ScaledPositionalEncoding if self.use_scaled_pos_enc else PositionalEncoding
        )

        # define encoder (token ids -> embedding -> transformer blocks)
        encoder_input_layer = torch.nn.Embedding(
            num_embeddings=idim, embedding_dim=args.adim, padding_idx=padding_idx
        )
        self.encoder = Encoder(
            idim=idim,
            attention_dim=args.adim,
            attention_heads=args.aheads,
            linear_units=args.eunits,
            num_blocks=args.elayers,
            input_layer=encoder_input_layer,
            dropout_rate=args.transformer_enc_dropout_rate,
            positional_dropout_rate=args.transformer_enc_positional_dropout_rate,
            attention_dropout_rate=args.transformer_enc_attn_dropout_rate,
            pos_enc_class=pos_enc_class,
            normalize_before=args.encoder_normalize_before,
            concat_after=args.encoder_concat_after,
            positionwise_layer_type=args.positionwise_layer_type,
            positionwise_conv_kernel_size=args.positionwise_conv_kernel_size,
        )

        # define additional projection for speaker embedding
        # ("add": project the embedding to adim; otherwise: project the
        #  concatenation of hidden state and embedding back to adim)
        if self.spk_embed_dim is not None:
            if self.spk_embed_integration_type == "add":
                self.projection = torch.nn.Linear(self.spk_embed_dim, args.adim)
            else:
                self.projection = torch.nn.Linear(
                    args.adim + self.spk_embed_dim, args.adim
                )

        # define duration predictor
        self.duration_predictor = DurationPredictor(
            idim=args.adim,
            n_layers=args.duration_predictor_layers,
            n_chans=args.duration_predictor_chans,
            kernel_size=args.duration_predictor_kernel_size,
            dropout_rate=args.duration_predictor_dropout_rate,
        )

        # define length regulator
        self.length_regulator = LengthRegulator()

        # define decoder
        # NOTE: we use encoder as decoder
        # because fastspeech's decoder is the same as encoder
        # (no input layer is given: it is fed the length-regulated hidden
        #  states directly, see _forward)
        self.decoder = Encoder(
            idim=0,
            attention_dim=args.adim,
            attention_heads=args.aheads,
            linear_units=args.dunits,
            num_blocks=args.dlayers,
            input_layer=None,
            dropout_rate=args.transformer_dec_dropout_rate,
            positional_dropout_rate=args.transformer_dec_positional_dropout_rate,
            attention_dropout_rate=args.transformer_dec_attn_dropout_rate,
            pos_enc_class=pos_enc_class,
            normalize_before=args.decoder_normalize_before,
            concat_after=args.decoder_concat_after,
            positionwise_layer_type=args.positionwise_layer_type,
            positionwise_conv_kernel_size=args.positionwise_conv_kernel_size,
        )

        # define final projection
        # (emits reduction_factor frames of odim features per decoder step)
        self.feat_out = torch.nn.Linear(args.adim, odim * args.reduction_factor)

        # define postnet (disabled when postnet_layers == 0)
        # NOTE(review): the text-side `idim` is what gets passed as the
        # Postnet's `idim` here; confirm Postnet does not depend on it.
        self.postnet = (
            None
            if args.postnet_layers == 0
            else Postnet(
                idim=idim,
                odim=odim,
                n_layers=args.postnet_layers,
                n_chans=args.postnet_chans,
                n_filts=args.postnet_filts,
                use_batch_norm=args.use_batch_norm,
                dropout_rate=args.postnet_dropout_rate,
            )
        )

        # initialize parameters
        # (done before the teacher is attached, so the teacher's loaded
        #  weights are not touched by this initialization)
        self._reset_parameters(
            init_type=args.transformer_init,
            init_enc_alpha=args.initial_encoder_alpha,
            init_dec_alpha=args.initial_decoder_alpha,
        )

        # define teacher model
        if args.teacher_model is not None:
            self.teacher = self._load_teacher_model(args.teacher_model)
        else:
            self.teacher = None

        # define duration calculator
        # (only available with a teacher; without it, durations must be
        #  supplied to forward() via `extras`)
        if self.teacher is not None:
            self.duration_calculator = DurationCalculator(self.teacher)
        else:
            self.duration_calculator = None

        # transfer teacher parameters
        if self.teacher is not None and args.transfer_encoder_from_teacher:
            self._transfer_from_teacher(args.transferred_encoder_module)

        # define criterions
        self.criterion = FeedForwardTransformerLoss(
            use_masking=args.use_masking, use_weighted_masking=args.use_weighted_masking
        )
    def _forward(
        self,
        xs,
        ilens,
        ys=None,
        olens=None,
        spembs=None,
        ds=None,
        is_inference=False,
        alpha=1.0,
    ):
        """Run encoder, duration predictor, length regulator, decoder and postnet.

        Args:
            xs (Tensor): Batch of padded character ids (B, Tmax).
            ilens (LongTensor): Batch of lengths of each input (B,).
            ys (Tensor, optional): Batch of padded target features
                (B, Lmax, odim). Only forwarded to the duration calculator
                when `ds` is None in training mode.
            olens (LongTensor, optional): Batch of lengths of each target (B,).
                When None, the decoder is run without a mask.
            spembs (Tensor, optional):
                Batch of speaker embedding vectors (B, spk_embed_dim).
            ds (Tensor, optional): Batch of durations (B, Tmax) used by the
                length regulator in training mode. When None, they are
                computed with the teacher-based duration calculator.
            is_inference (bool): If True, the predicted durations drive the
                length regulator instead of `ds`.
            alpha (float): Value passed to the length regulator in inference
                mode.

        Returns:
            tuple: `(before_outs, after_outs, d_outs)` when `is_inference`,
                otherwise `(before_outs, after_outs, ds, d_outs)`.

        Raises:
            ValueError: In training mode, if `ds` is None and no duration
                calculator (i.e. no teacher model) is available.

        """
        # forward encoder
        x_masks = self._source_mask(ilens)
        hs, _ = self.encoder(xs, x_masks)  # (B, Tmax, adim)

        # integrate speaker embedding
        if self.spk_embed_dim is not None:
            hs = self._integrate_with_spk_embed(hs, spembs)

        # forward duration predictor and length regulator
        d_masks = make_pad_mask(ilens).to(xs.device)
        if is_inference:
            d_outs = self.duration_predictor.inference(hs, d_masks)  # (B, Tmax)
            hs = self.length_regulator(hs, d_outs, alpha)  # (B, Lmax, adim)
        else:
            if ds is None:
                # fail with an actionable message instead of
                # "'NoneType' object is not callable"
                if self.duration_calculator is None:
                    raise ValueError(
                        "durations are required for training: provide "
                        "precalculated durations or specify a teacher model."
                    )
                with torch.no_grad():
                    ds = self.duration_calculator(
                        xs, ilens, ys, olens, spembs
                    )  # (B, Tmax)
            d_outs = self.duration_predictor(hs, d_masks)  # (B, Tmax)
            hs = self.length_regulator(hs, ds)  # (B, Lmax, adim)

        # forward decoder
        if olens is not None:
            # the decoder runs on the reduced time axis when reduction_factor > 1
            if self.reduction_factor > 1:
                olens_in = olens.new([olen // self.reduction_factor for olen in olens])
            else:
                olens_in = olens
            h_masks = self._source_mask(olens_in)
        else:
            h_masks = None
        zs, _ = self.decoder(hs, h_masks)  # (B, Lmax, adim)
        # unfold the reduction_factor frames packed into each decoder step
        before_outs = self.feat_out(zs).view(
            zs.size(0), -1, self.odim
        )  # (B, Lmax, odim)

        # postnet -> (B, Lmax//r * r, odim)
        if self.postnet is None:
            after_outs = before_outs
        else:
            # the postnet predicts a residual on a (B, odim, Lmax) layout
            after_outs = before_outs + self.postnet(
                before_outs.transpose(1, 2)
            ).transpose(1, 2)

        if is_inference:
            return before_outs, after_outs, d_outs
        else:
            return before_outs, after_outs, ds, d_outs
    def forward(self, xs, ilens, ys, olens, spembs=None, extras=None, *args, **kwargs):
        """Calculate forward propagation.

        Computes the training loss (L1 feature loss plus duration loss) for a
        batch and reports the individual terms through `self.reporter`.

        Args:
            xs (Tensor): Batch of padded character ids (B, Tmax).
            ilens (LongTensor): Batch of lengths of each input batch (B,).
            ys (Tensor): Batch of padded target features (B, Lmax, odim).
            olens (LongTensor): Batch of the lengths of each target (B,).
            spembs (Tensor, optional):
                Batch of speaker embedding vectors (B, spk_embed_dim).
            extras (Tensor, optional): Batch of precalculated durations (B, Tmax, 1).

        Returns:
            Tensor: Loss value.

        """
        # remove unnecessary padded part (for multi-gpus)
        max_ilen = max(ilens)
        xs = xs[:, :max_ilen]
        ys = ys[:, : max(olens)]
        durations = None if extras is None else extras[:, :max_ilen].squeeze(-1)

        # forward propagation
        before_outs, after_outs, ds, d_outs = self._forward(
            xs, ilens, ys, olens, spembs=spembs, ds=durations, is_inference=False
        )

        # drop the trailing target frames that do not fill a whole reduction group
        r = self.reduction_factor
        if r > 1:
            olens = olens.new([olen - olen % r for olen in olens])
            ys = ys[:, : max(olens)]

        # without a postnet only the pre-postnet output is scored
        scored_after_outs = None if self.postnet is None else after_outs
        l1_loss, duration_loss = self.criterion(
            scored_after_outs, before_outs, d_outs, ys, ds, ilens, olens
        )
        loss = l1_loss + duration_loss

        report_keys = [
            {"l1_loss": l1_loss.item()},
            {"duration_loss": duration_loss.item()},
            {"loss": loss.item()},
        ]

        # report extra information
        if self.use_scaled_pos_enc:
            report_keys.append(
                {"encoder_alpha": self.encoder.embed[-1].alpha.data.item()}
            )
            report_keys.append(
                {"decoder_alpha": self.decoder.embed[-1].alpha.data.item()}
            )
        self.reporter.report(report_keys)

        return loss

    def calculate_all_attentions(
        self, xs, ilens, ys, olens, spembs=None, extras=None, *args, **kwargs
    ):
        """Calculate all of the attention weights.

        Runs a gradient-free training-style forward pass and collects the
        attention weights cached on every `MultiHeadedAttention` sub-module
        (modules that hold no cached weights are skipped). Each weight is
        trimmed per utterance to its unpadded length.

        Args:
            xs (Tensor): Batch of padded character ids (B, Tmax).
            ilens (LongTensor): Batch of lengths of each input batch (B,).
            ys (Tensor): Batch of padded target features (B, Lmax, odim).
            olens (LongTensor): Batch of the lengths of each target (B,).
            spembs (Tensor, optional):
                Batch of speaker embedding vectors (B, spk_embed_dim).
            extras (Tensor, optional): Batch of precalculated durations (B, Tmax, 1).

        Returns:
            dict: Dict of attention weights and outputs. Keys are module names
                plus "predicted_fbank" for the generated features.

        """
        with torch.no_grad():
            # remove unnecessary padded part (for multi-gpus)
            xs = xs[:, : max(ilens)]
            ys = ys[:, : max(olens)]
            if extras is not None:
                extras = extras[:, : max(ilens)].squeeze(-1)

            # forward propagation
            outs = self._forward(
                xs, ilens, ys, olens, spembs=spembs, ds=extras, is_inference=False
            )[1]

        in_lens = ilens.tolist()
        out_lens = olens.tolist()
        # decoder-side attention lives on the reduced time axis (see _forward);
        # identical to out_lens when reduction_factor == 1
        dec_lens = [ol // self.reduction_factor for ol in out_lens]

        att_ws_dict = dict()
        for name, m in self.named_modules():
            if not isinstance(m, MultiHeadedAttention):
                continue
            if m.attn is None:
                # this module was not exercised by the forward pass above
                # (e.g. a teacher that is bypassed by precalculated durations)
                continue
            attn = m.attn.cpu().numpy()

            # Decide which sub-network owns the module from the first path
            # component that is exactly "encoder"/"decoder". A plain substring
            # test would also match nested names that merely contain
            # "encoder", and self.decoder is itself an Encoder instance.
            owner = next(
                (p for p in name.split(".") if p in ("encoder", "decoder")), None
            )
            if owner is None:
                # fall back to the substring heuristic for unusual layouts
                if "encoder" in name:
                    owner = "encoder"
                elif "decoder" in name:
                    owner = "decoder"

            if owner == "encoder":
                attn = [a[:, :l, :l] for a, l in zip(attn, in_lens)]
            elif owner == "decoder":
                if "src" in name:
                    attn = [
                        a[:, :ol, :il] for a, il, ol in zip(attn, in_lens, dec_lens)
                    ]
                elif "self" in name:
                    attn = [a[:, :l, :l] for a, l in zip(attn, dec_lens)]
                else:
                    logging.warning("unknown attention module: " + name)
            else:
                logging.warning("unknown attention module: " + name)
            att_ws_dict[name] = attn
        att_ws_dict["predicted_fbank"] = [
            m[:l].T for m, l in zip(outs.cpu().numpy(), out_lens)
        ]

        return att_ws_dict

    def inference(self, x, inference_args, spemb=None, *args, **kwargs):
        """Generate the sequence of features given the sequences of characters.

        Args:
            x (Tensor): Input sequence of characters (T,).
            inference_args (Namespace): Only its optional `fastspeech_alpha`
                attribute is read (defaults to 1.0 when absent).
            spemb (Tensor, optional): Speaker embedding vector (spk_embed_dim).

        Returns:
            Tensor: Output sequence of features (L, odim).
            None: Dummy for compatibility.
            None: Dummy for compatibility.

        """
        # wrap the single utterance into a batch of size one
        xs = x.unsqueeze(0)
        ilens = torch.tensor([x.shape[0]], dtype=torch.long, device=x.device)
        spembs = None if spemb is None else spemb.unsqueeze(0)

        # get option
        alpha = getattr(inference_args, "fastspeech_alpha", 1.0)

        # inference
        _before_outs, after_outs, _d_outs = self._forward(
            xs,
            ilens,
            spembs=spembs,
            is_inference=True,
            alpha=alpha,
        )  # (1, L, odim)

        # strip the batch axis again
        return after_outs[0], None, None

    def _integrate_with_spk_embed(self, hs, spembs):
        """Integrate speaker embedding with hidden states.

        The speaker embeddings are normalized with `F.normalize` first. In
        "add" mode they are projected and added to every time step; in
        "concat" mode they are tiled along time, concatenated to the hidden
        states and the result is projected.

        Args:
            hs (Tensor): Batch of hidden state sequences (B, Tmax, adim).
            spembs (Tensor): Batch of speaker embeddings (B, spk_embed_dim).

        Returns:
            Tensor: Batch of integrated hidden state sequences (B, Tmax, adim)

        Raises:
            NotImplementedError: If the integration type is neither "add"
                nor "concat".

        """
        mode = self.spk_embed_integration_type
        if mode not in ("add", "concat"):
            raise NotImplementedError("support only add or concat.")

        normalized = F.normalize(spembs)
        if mode == "add":
            # project, then broadcast over the time axis
            return hs + self.projection(normalized).unsqueeze(1)

        # concat: repeat the embedding for every time step before projecting
        tiled = normalized.unsqueeze(1).expand(-1, hs.size(1), -1)
        return self.projection(torch.cat([hs, tiled], dim=-1))

    def _source_mask(self, ilens):
        """Make masks for self-attention.

        The non-padding mask is moved to the device of this module's first
        parameter and gets a singleton axis inserted before its last axis.

        Args:
            ilens (LongTensor or List): Batch of lengths (B,).

        Returns:
            Tensor: Mask tensor for self-attention.
                    dtype=torch.uint8 in PyTorch 1.2-
                    dtype=torch.bool in PyTorch 1.2+ (including 1.2)

        Examples:
            >>> ilens = [5, 3]
            >>> self._source_mask(ilens)
            tensor([[[1, 1, 1, 1, 1]],
                    [[1, 1, 1, 0, 0]]], dtype=torch.uint8)

        """
        non_pad = make_non_pad_mask(ilens)
        device = next(self.parameters()).device
        return non_pad.to(device).unsqueeze(-2)

    def _load_teacher_model(self, model_path):
        """Load a frozen teacher model from `model_path`.

        The teacher's configuration must agree with this model on input
        dimension, output dimension and reduction factor.

        Args:
            model_path (str): Path of the teacher model file.

        Returns:
            The loaded teacher model with `requires_grad` disabled on all of
            its parameters.

        """
        # get teacher model config
        teacher_idim, teacher_odim, teacher_args = get_model_conf(model_path)

        # teacher and student must agree on the basic dimensions
        assert teacher_idim == self.idim
        assert teacher_odim == self.odim
        assert teacher_args.reduction_factor == self.reduction_factor

        # load teacher model
        from espnet.utils.dynamic_import import dynamic_import

        teacher_class = dynamic_import(teacher_args.model_module)
        teacher = teacher_class(teacher_idim, teacher_odim, teacher_args)
        torch_load(model_path, teacher)

        # freeze teacher model parameters
        for param in teacher.parameters():
            param.requires_grad = False

        return teacher

    def _reset_parameters(self, init_type, init_enc_alpha=1.0, init_dec_alpha=1.0):
        """Initialize the parameters and the positional-encoding alphas.

        Args:
            init_type (str): Initialization method passed to `initialize`.
            init_enc_alpha (float): Initial alpha of the encoder's scaled
                positional encoding.
            init_dec_alpha (float): Initial alpha of the decoder's scaled
                positional encoding.

        """
        # initialize parameters
        initialize(self, init_type)

        # alpha only exists with scaled positional encoding
        if not self.use_scaled_pos_enc:
            return
        for net, init_alpha in (
            (self.encoder, init_enc_alpha),
            (self.decoder, init_dec_alpha),
        ):
            net.embed[-1].alpha.data = torch.tensor(init_alpha)

    def _transfer_from_teacher(self, transferred_encoder_module):
        """Copy encoder parameters from the teacher into this model.

        Args:
            transferred_encoder_module (str): "all" copies every encoder
                parameter (pairwise, in `named_parameters` order); "embed"
                copies only the weight of the first embed layer.

        Raises:
            NotImplementedError: For any other value.

        """
        if transferred_encoder_module == "all":
            student_params = self.encoder.named_parameters()
            teacher_params = self.teacher.encoder.named_parameters()
            for (s_name, s_param), (t_name, t_param) in zip(
                student_params, teacher_params
            ):
                assert s_name == t_name, "It seems that encoder structure is different."
                assert (
                    s_param.shape == t_param.shape
                ), "It seems that encoder size is different."
                s_param.data.copy_(t_param.data)
            return

        if transferred_encoder_module == "embed":
            student_weight = self.encoder.embed[0].weight.data
            teacher_weight = self.teacher.encoder.embed[0].weight.data
            assert (
                student_weight.shape == teacher_weight.shape
            ), "It seems that embed dimension is different."
            student_weight.copy_(teacher_weight)
            return

        raise NotImplementedError("Support only all or embed.")

    @property
    def attention_plot_class(self):
        """Return plot class for attention weight plot."""
        # Imported on access rather than at module load
        # to avoid chainer dependency
        from espnet.nets.pytorch_backend.e2e_tts_transformer import (
            TTSPlot as plot_class,
        )

        return plot_class

    @property
    def base_plot_keys(self):
        """Return base key names to plot during training.

        keys should match what `chainer.reporter` reports.
        If you add the key `loss`,
        the reporter will report `main/loss` and `validation/main/loss` values.
        also `loss.png` will be created as a figure visualizing `main/loss`
        and `validation/main/loss` values.

        Returns:
            list: List of strings which are base keys to plot during training.

        """
        # the alpha curves only exist with scaled positional encoding
        alpha_keys = (
            ["encoder_alpha", "decoder_alpha"] if self.use_scaled_pos_enc else []
        )
        return ["loss", "l1_loss", "duration_loss"] + alpha_keys


================================================
FILE: nets/pytorch_backend/e2e_tts_tacotron2.py
================================================
# Copyright 2018 Nagoya University (Tomoki Hayashi)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Tacotron 2 related modules."""

import logging

import numpy as np
import torch
import torch.nn.functional as F

from espnet.nets.pytorch_backend.nets_utils import make_non_pad_mask
from espnet.nets.pytorch_backend.rnn.attentions import AttForward
from espnet.nets.pytorch_backend.rnn.attentions import AttForwardTA
from espnet.nets.pytorch_backend.rnn.attentions import AttLoc
from espnet.nets.pytorch_backend.tacotron2.cbhg import CBHG
from espnet.nets.pytorch_backend.tacotron2.cbhg import CBHGLoss
from espnet.nets.pytorch_backend.tacotron2.decoder import Decoder
from espnet.nets.pytorch_backend.tacotron2.encoder import Encoder
from espnet.nets.tts_interface import TTSInterface
from espnet.utils.cli_utils import strtobool
from espnet.utils.fill_missing_args import fill_missing_args


class GuidedAttentionLoss(torch.nn.Module):
    """Guided attention loss function module.

    This module calculates the guided attention loss described
    in `Efficiently Trainable Text-to-Speech System Based
    on Deep Convolutional Networks with Guided Attention`_,
    which forces the attention to be diagonal.

    .. _`Efficiently Trainable Text-to-Speech System
        Based on Deep Convolutional Networks with Guided Attention`:
        https://arxiv.org/abs/1710.08969

    """

    def __init__(self, sigma=0.4, alpha=1.0, reset_always=True):
        """Initialize guided attention loss module.

        Args:
            sigma (float, optional): Standard deviation to control
                how close attention to a diagonal.
            alpha (float, optional): Scaling coefficient (lambda).
            reset_always (bool, optional): Whether to always reset masks.
                When False, the masks built on the first call of `forward`
                are cached and reused by later calls.

        """
        super(GuidedAttentionLoss, self).__init__()
        self.sigma = sigma
        self.alpha = alpha
        self.reset_always = reset_always
        self.guided_attn_masks = None
        self.masks = None

    def _reset_masks(self):
        """Discard the cached guided attention masks and non-padding masks."""
        self.guided_attn_masks = None
        self.masks = None

    def forward(self, att_ws, ilens, olens):
        """Calculate forward propagation.

        Args:
            att_ws (Tensor): Batch of attention weights (B, T_max_out, T_max_in).
            ilens (LongTensor): Batch of input lengths (B,).
            olens (LongTensor): Batch of output lengths (B,).

        Returns:
            Tensor: Guided attention loss value.

        """
        # Masks are (re)built only when no cached copy is available
        if self.guided_attn_masks is None:
            self.guided_attn_masks = self._make_guided_attention_masks(ilens, olens).to(
                att_ws.device
            )
        if self.masks is None:
            self.masks = self._make_masks(ilens, olens).to(att_ws.device)
        losses = self.guided_attn_masks * att_ws
        # Average over the non-padded positions only
        loss = torch.mean(losses.masked_select(self.masks))
        if self.reset_always:
            self._reset_masks()
        return self.alpha * loss

    def _make_guided_attention_masks(self, ilens, olens):
        """Make the zero-padded batch of guided attention masks.

        Args:
            ilens (LongTensor or List): Batch of input lengths (B,).
            olens (LongTensor or List): Batch of output lengths (B,).

        Returns:
            Tensor: Guided attention masks (B, max(olens), max(ilens)).
                Positions outside each utterance's (olen, ilen) area are zero.

        """
        n_batches = len(ilens)
        max_ilen = max(ilens)
        max_olen = max(olens)
        guided_attn_masks = torch.zeros((n_batches, max_olen, max_ilen))
        for idx, (ilen, olen) in enumerate(zip(ilens, olens)):
            guided_attn_masks[idx, :olen, :ilen] = self._make_guided_attention_mask(
                ilen, olen, self.sigma
            )
        return guided_attn_masks

    @staticmethod
    def _make_guided_attention_mask(ilen, olen, sigma):
        """Make guided attention mask.

        Args:
            ilen (int or Tensor): Input length (plain int or 0-dim tensor).
            olen (int or Tensor): Output length (plain int or 0-dim tensor).
            sigma (float): Standard deviation to control
                how close attention to a diagonal.

        Returns:
            Tensor: Guided attention mask (olen, ilen).

        Examples:
            >>> guided_attn_mask = GuidedAttentionLoss._make_guided_attention_mask(
            ...     5, 5, 0.4
            ... )
            >>> guided_attn_mask.shape
            torch.Size([5, 5])
            >>> guided_attn_mask
            tensor([[0.0000, 0.1175, 0.3935, 0.6753, 0.8647],
                    [0.1175, 0.0000, 0.1175, 0.3935, 0.6753],
                    [0.3935, 0.1175, 0.0000, 0.1175, 0.3935],
                    [0.6753, 0.3935, 0.1175, 0.0000, 0.1175],
                    [0.8647, 0.6753, 0.3935, 0.1175, 0.0000]])
            >>> guided_attn_mask = GuidedAttentionLoss._make_guided_attention_mask(
            ...     3, 6, 0.4
            ... )
            >>> guided_attn_mask.shape
            torch.Size([6, 3])
            >>> guided_attn_mask
            tensor([[0.0000, 0.2934, 0.7506],
                    [0.0831, 0.0831, 0.5422],
                    [0.2934, 0.0000, 0.2934],
                    [0.5422, 0.0831, 0.0831],
                    [0.7506, 0.2934, 0.0000],
                    [0.8858, 0.5422, 0.0831]])

        """
        grid_x, grid_y = torch.meshgrid(torch.arange(olen), torch.arange(ilen))
        grid_x, grid_y = grid_x.float(), grid_y.float()
        # Lengths may be plain ints (as in the examples above) or 0-dim tensors;
        # only tensors carry a device the grids have to be moved to.
        if isinstance(olen, torch.Tensor):
            grid_x = grid_x.to(olen.device)
        if isinstance(ilen, torch.Tensor):
            grid_y = grid_y.to(ilen.device)
        return 1.0 - torch.exp(
            -((grid_y / ilen - grid_x / olen) ** 2) / (2 * (sigma ** 2))
        )

    @staticmethod
    def _make_masks(ilens, olens):
        """Make masks indicating non-padded part.

        Args:
            ilens (LongTensor or List): Batch of lengths (B,).
            olens (LongTensor or List): Batch of lengths (B,).

        Returns:
            Tensor: Mask tensor indicating non-padded part.
                    dtype=torch.uint8 in PyTorch 1.2-
                    dtype=torch.bool in PyTorch 1.2+ (including 1.2)

        Examples:
            >>> ilens, olens = [5, 2], [8, 5]
            >>> GuidedAttentionLoss._make_masks(ilens, olens)
            tensor([[[1, 1, 1, 1, 1],
                     [1, 1, 1, 1, 1],
                     [1, 1, 1, 1, 1],
                     [1, 1, 1, 1, 1],
                     [1, 1, 1, 1, 1],
                     [1, 1, 1, 1, 1],
                     [1, 1, 1, 1, 1],
                     [1, 1, 1, 1, 1]],
                    [[1, 1, 0, 0, 0],
                     [1, 1, 0, 0, 0],
                     [1, 1, 0, 0, 0],
                     [1, 1, 0, 0, 0],
                     [1, 1, 0, 0, 0],
                     [0, 0, 0, 0, 0],
                     [0, 0, 0, 0, 0],
                     [0, 0, 0, 0, 0]]], dtype=torch.uint8)

        """
        in_masks = make_non_pad_mask(ilens)  # (B, T_in)
        out_masks = make_non_pad_mask(olens)  # (B, T_out)
        return out_masks.unsqueeze(-1) & in_masks.unsqueeze(-2)  # (B, T_out, T_in)


class Tacotron2Loss(torch.nn.Module):
    """Loss function module for Tacotron2."""

    def __init__(
        self, use_masking=True, use_weighted_masking=False, bce_pos_weight=20.0
    ):
        """Initialize Tacotron2 loss module.

        Args:
            use_masking (bool): Whether to apply masking
                for padded part in loss calculation.
            use_weighted_masking (bool):
                Whether to apply weighted masking in loss calculation.
            bce_pos_weight (float): Weight of positive sample of stop token.

        """
        super(Tacotron2Loss, self).__init__()
        # the two masking strategies cannot be enabled at the same time
        assert (not use_masking) or (use_masking != use_weighted_masking)
        self.use_masking = use_masking
        self.use_weighted_masking = use_weighted_masking

        # weighted masking needs element-wise losses to apply the weights to
        if self.use_weighted_masking:
            reduction = "none"
        else:
            reduction = "mean"

        # define criterions
        pos_weight = torch.tensor(bce_pos_weight)
        self.l1_criterion = torch.nn.L1Loss(reduction=reduction)
        self.mse_criterion = torch.nn.MSELoss(reduction=reduction)
        self.bce_criterion = torch.nn.BCEWithLogitsLoss(
            pos_weight=pos_weight, reduction=reduction
        )

        # NOTE(kan-bayashi): register pre hook function for the compatibility
        self._register_load_state_dict_pre_hook(self._load_state_dict_pre_hook)

    def forward(self, after_outs, before_outs, logits, ys, labels, olens):
        """Calculate forward propagation.

        Args:
            after_outs (Tensor): Batch of outputs after postnets (B, Lmax, odim).
            before_outs (Tensor): Batch of outputs before postnets (B, Lmax, odim).
            logits (Tensor): Batch of stop logits (B, Lmax).
            ys (Tensor): Batch of padded target features (B, Lmax, odim).
            labels (LongTensor): Batch of the sequences of stop token labels (B, Lmax).
            olens (LongTensor): Batch of the lengths of each target (B,).

        Returns:
            Tensor: L1 loss value.
            Tensor: Mean square error loss value.
            Tensor: Binary cross entropy loss value.

        """
        # plain masking: keep only the non-padded elements of every tensor
        if self.use_masking:
            frame_masks = make_non_pad_mask(olens).unsqueeze(-1).to(ys.device)
            stop_masks = frame_masks[:, :, 0]
            ys = ys.masked_select(frame_masks)
            after_outs = after_outs.masked_select(frame_masks)
            before_outs = before_outs.masked_select(frame_masks)
            labels = labels.masked_select(stop_masks)
            logits = logits.masked_select(stop_masks)

        # calculate loss (outputs before and after the postnet are both scored)
        l1_after = self.l1_criterion(after_outs, ys)
        l1_before = self.l1_criterion(before_outs, ys)
        l1_loss = l1_after + l1_before
        mse_after = self.mse_criterion(after_outs, ys)
        mse_before = self.mse_criterion(before_outs, ys)
        mse_loss = mse_after + mse_before
        bce_loss = self.bce_criterion(logits, labels)

        if not self.use_weighted_masking:
            return l1_loss, mse_loss, bce_loss

        # weighted masking: weight each frame by 1 / (length of its utterance),
        # then normalize by the batch size (and odim for the feature losses)
        frame_masks = make_non_pad_mask(olens).unsqueeze(-1).to(ys.device)
        n_frames = frame_masks.sum(dim=1, keepdim=True).float()
        weights = frame_masks.float() / n_frames
        batch_size = ys.size(0)
        out_weights = weights.div(batch_size * ys.size(2))
        logit_weights = weights.div(batch_size)

        # apply weight
        l1_loss = l1_loss.mul(out_weights).masked_select(frame_masks).sum()
        mse_loss = mse_loss.mul(out_weights).masked_select(frame_masks).sum()
        weighted_bce = bce_loss.mul(logit_weights.squeeze(-1))
        bce_loss = weighted_bce.masked_select(frame_masks.squeeze(-1)).sum()

        return l1_loss, mse_loss, bce_loss

    def _load_state_dict_pre_hook(
        self,
        state_dict,
        prefix,
        local_metadata,
        strict,
        missing_keys,
        unexpected_keys,
        error_msgs,
    ):
        """Apply pre hook function before loading state dict.

        From v.0.6.1 `bce_criterion.pos_weight` param is registered as a parameter
        but old models do not include it, which causes a missing key error when
        loading old model parameters. This hook solves the issue by adding
        the current value to the state dict before `load_state_dict` runs.

        """
        pos_weight_key = prefix + "bce_criterion.pos_weight"
        if pos_weight_key in state_dict:
            return
        state_dict[pos_weight_key] = self.bce_criterion.pos_weight


class Tacotron2(TTSInterface, torch.nn.Module):
    """Tacotron2 module for end-to-end text-to-speech (E2E-TTS).

    This is a module of Spectrogram prediction network in Tacotron2 described
    in `Natural TTS Synthesis
    by Conditioning WaveNet on Mel Spectrogram Predictions`_,
    which converts the sequence of characters
    into the sequence of Mel-filterbanks.

    .. _`Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`:
       https://arxiv.org/abs/1712.05884

    """

    @staticmethod
    def add_arguments(parser):
        """Register the Tacotron 2 model options on the given parser.

        Args:
            parser: Parser object providing ``add_argument_group``
                (an argparse-style parser).

        Returns:
            The same ``parser`` object, with the group
            "tacotron 2 model setting" added.

        """
        group = parser.add_argument_group("tacotron 2 model setting")

        # One row per option, in registration order:
        # (flags, default, type, help, additional add_argument keywords)
        option_table = [
            # encoder
            (("--embed-dim",), 512, int, "Number of dimension of embedding", {}),
            (("--elayers",), 1, int, "Number of encoder layers", {}),
            (("--eunits", "-u"), 512, int, "Number of encoder hidden units", {}),
            (("--econv-layers",), 3, int, "Number of encoder convolution layers", {}),
            (
                ("--econv-chans",),
                512,
                int,
                "Number of encoder convolution channels",
                {},
            ),
            (("--econv-filts",), 5, int, "Filter size of encoder convolution", {}),
            # attention
            (
                ("--atype",),
                "location",
                str,
                "Type of attention mechanism",
                {"choices": ["forward_ta", "forward", "location"]},
            ),
            (
                ("--adim",),
                512,
                int,
                "Number of attention transformation dimensions",
                {},
            ),
            (
                ("--aconv-chans",),
                32,
                int,
                "Number of attention convolution channels",
                {},
            ),
            (("--aconv-filts",), 15, int, "Filter size of attention convolution", {}),
            (
                ("--cumulate-att-w",),
                True,
                strtobool,
                "Whether or not to cumulate attention weights",
                {},
            ),
            # decoder
            (("--dlayers",), 2, int, "Number of decoder layers", {}),
            (("--dunits",), 1024, int, "Number of decoder hidden units", {}),
            (("--prenet-layers",), 2, int, "Number of prenet layers", {}),
            (("--prenet-units",), 256, int, "Number of prenet hidden units", {}),
            (("--postnet-layers",), 5, int, "Number of postnet layers", {}),
            (("--postnet-chans",), 512, int, "Number of postnet channels", {}),
            (("--postnet-filts",), 5, int, "Filter size of postnet", {}),
            (
                ("--output-activation",),
                None,
                str,
                "Output activation function",
                {"nargs": "?"},
            ),
            # cbhg
            (("--use-cbhg",), False, strtobool, "Whether to use CBHG module", {}),
            (
                ("--cbhg-conv-bank-layers",),
                8,
                int,
                "Number of convoluional bank layers in CBHG",
                {},
            ),
            (
                ("--cbhg-conv-bank-chans",),
                128,
                int,
                "Number of convoluional bank channles in CBHG",
                {},
            ),
            (
                ("--cbhg-conv-proj-filts",),
                3,
                int,
                "Filter size of convoluional projection layer in CBHG",
                {},
            ),
            (
                ("--cbhg-conv-proj-chans",),
                256,
                int,
                "Number of convoluional projection channels in CBHG",
                {},
            ),
            (
                ("--cbhg-highway-layers",),
                4,
                int,
                "Number of highway layers in CBHG",
                {},
            ),
            (
                ("--cbhg-highway-units",),
                128,
                int,
                "Number of highway units in CBHG",
                {},
            ),
            (("--cbhg-gru-units",), 256, int, "Number of GRU units in CBHG", {}),
            # model (parameter) related
            (
                ("--use-batch-norm",),
                True,
                strtobool,
                "Whether to use batch normalization",
                {},
            ),
            (
                ("--use-concate",),
                True,
                strtobool,
                "Whether to concatenate encoder embedding with decoder outputs",
                {},
            ),
            (
                ("--use-residual",),
                True,
                strtobool,
                "Whether to use residual connection in conv layer",
                {},
            ),
            (("--dropout-rate",), 0.5, float, "Dropout rate", {}),
            (("--zoneout-rate",), 0.1, float, "Zoneout rate", {}),
            (("--reduction-factor",), 1, int, "Reduction factor", {}),
            (
                ("--spk-embed-dim",),
                None,
                int,
                "Number of speaker embedding dimensions",
                {},
            ),
            (("--spc-dim",), None, int, "Number of spectrogram dimensions", {}),
            (("--pretrained-model",), None, str, "Pretrained model path", {}),
            # loss related
            (
                ("--use-masking",),
                False,
                strtobool,
                "Whether to use masking in calculation of loss",
                {},
            ),
            (
                ("--use-weighted-masking",),
                False,
                strtobool,
                "Whether to use weighted masking in calculation of loss",
                {},
            ),
            (
                ("--bce-pos-weight",),
                20.0,
                float,
                "Positive sample weight in BCE calculation "
                "(only for use-masking=True)",
                {},
            ),
            (
                ("--use-guided-attn-loss",),
                False,
                strtobool,
                "Whether to use guided attention loss",
                {},
            ),
            (
                ("--guided-attn-loss-sigma",),
                0.4,
                float,
                "Sigma in guided attention loss",
                {},
            ),
            (
                ("--guided-attn-loss-lambda",),
                1.0,
                float,
                "Lambda in guided attention loss",
                {},
            ),
        ]
        for flags, default, arg_type, help_text, extra in option_table:
            group.add_argument(
                *flags, default=default, type=arg_type, help=help_text, **extra
            )
        return parser

    def __init__(self, idim, odim, args=None):
        """Initialize Tacotron2 module.

        Builds the encoder, the attention, the decoder and the loss modules
        (plus the optional guided attention loss and CBHG modules) from the
        given hyperparameters. Options missing from `args` are filled in via
        `fill_missing_args` using `add_arguments`.

        Args:
            idim (int): Dimension of the inputs.
            odim (int): Dimension of the outputs.
            args (Namespace, optional):
                - spk_embed_dim (int): Dimension of the speaker embedding.
                - embed_dim (int): Dimension of character embedding.
                - elayers (int): The number of encoder blstm layers.
                - eunits (int): The number of encoder blstm units.
                - econv_layers (int): The number of encoder conv layers.
                - econv_filts (int): The number of encoder conv filter size.
                - econv_chans (int): The number of encoder conv filter channels.
                - dlayers (int): The number of decoder lstm layers.
                - dunits (int): The number of decoder lstm units.
                - prenet_layers (int): The number of prenet layers.
                - prenet_units (int): The number of prenet units.
                - postnet_layers (int): The number of postnet layers.
                - postnet_filts (int): The number of postnet filter size.
                - postnet_chans (int): The number of postnet filter channels.
                - output_activation (str): The name of activation function for
                    outputs (looked up in `torch.nn.functional`).
                - atype (str): Type of attention mechanism
                    ("location", "forward" or "forward_ta").
                - adim (int): The number of dimension of mlp in attention.
                - aconv_chans (int): The number of attention conv filter channels.
                - aconv_filts (int): The number of attention conv filter size.
                - cumulate_att_w (bool): Whether to cumulate previous attention weight
                    (forced to False for "forward" and "forward_ta" attention).
                - use_batch_norm (bool): Whether to use batch normalization.
                - use_concate (bool): Whether to concatenate encoder embedding
                    with decoder lstm outputs.
                - use_residual (bool): Whether to use residual connection
                    in conv layer.
                - dropout_rate (float): Dropout rate.
                - zoneout_rate (float): Zoneout rate.
                - reduction_factor (int): Reduction factor.
                - spc_dim (int): Number of spectrogram embedding dimensions
                    (only for use_cbhg=True).
                - use_cbhg (bool): Whether to use CBHG module.
                - cbhg_conv_bank_layers (int): The number of convolutional banks
                    in CBHG.
                - cbhg_conv_bank_chans (int): The number of channels of
                    convolutional bank in CBHG.
                - cbhg_conv_proj_filts (int):
                    The number of filter size of projection layer in CBHG.
                - cbhg_conv_proj_chans (int):
                    The number of channels of projection layer in CBHG.
                - cbhg_highway_layers (int):
                    The number of layers of highway network in CBHG.
                - cbhg_highway_units (int):
                    The number of units of highway network in CBHG.
                - cbhg_gru_units (int): The number of units of GRU in CBHG.
                - use_masking (bool):
                    Whether to apply masking for padded part in loss calculation.
                - use_weighted_masking (bool):
                    Whether to apply weighted masking in loss calculation.
                - bce_pos_weight (float):
                    Weight of positive sample of stop token (only for use_masking=True).
                - use_guided_attn_loss (bool): Whether to use guided attention loss.
                - guided_attn_loss_sigma (float): Sigma in guided attention loss.
                - guided_attn_loss_lambda (float): Lambda in guided attention loss.
                - pretrained_model (str): Pretrained model path; when given,
                    it is passed to `load_pretrained_model`.

        Raises:
            ValueError: If `output_activation` is not an attribute of
                `torch.nn.functional`.
            NotImplementedError: If `atype` is not one of the supported types.

        """
        # initialize base classes
        TTSInterface.__init__(self)
        torch.nn.Module.__init__(self)

        # fill missing arguments
        args = fill_missing_args(args, self.add_arguments)

        # store hyperparameters
        self.idim = idim
        self.odim = odim
        self.spk_embed_dim = args.spk_embed_dim
        self.cumulate_att_w = args.cumulate_att_w
        self.reduction_factor = args.reduction_factor
        self.use_cbhg = args.use_cbhg
        self.use_guided_attn_loss = args.use_guided_attn_loss

        # define activation function for the final output
        if args.output_activation is None:
            self.output_activation_fn = None
        elif hasattr(F, args.output_activation):
            self.output_activation_fn = getattr(F, args.output_activation)
        else:
            raise ValueError(
                "there is no such an activation function. (%s)" % args.output_activation
            )

        # set padding idx
        padding_idx = 0

        # define network modules
        self.enc = Encoder(
            idim=idim,
            embed_dim=args.embed_dim,
            elayers=args.elayers,
            eunits=args.eunits,
            econv_layers=args.econv_layers,
            econv_chans=args.econv_chans,
            econv_filts=args.econv_filts,
            use_batch_norm=args.use_batch_norm,
            use_residual=args.use_residual,
            dropout_rate=args.dropout_rate,
            padding_idx=padding_idx,
        )
        # decoder input is the encoder output, widened by the speaker embedding
        # when one is used (it is concatenated to the encoder states in forward)
        dec_idim = (
            args.eunits
            if args.spk_embed_dim is None
            else args.eunits + args.spk_embed_dim
        )
        if args.atype == "location":
            att = AttLoc(
                dec_idim, args.dunits, args.adim, args.aconv_chans, args.aconv_filts
            )
        elif args.atype == "forward":
            att = AttForward(
                dec_idim, args.dunits, args.adim, args.aconv_chans, args.aconv_filts
            )
            # forward attention overrides the cumulation flag (with a warning)
            if self.cumulate_att_w:
                logging.warning(
                    "cumulation of attention weights is disabled in forward attention."
                )
                self.cumulate_att_w = False
        elif args.atype == "forward_ta":
            att = AttForwardTA(
                dec_idim,
                args.dunits,
                args.adim,
                args.aconv_chans,
                args.aconv_filts,
                odim,
            )
            # same override as for the plain forward attention
            if self.cumulate_att_w:
                logging.warning(
                    "cumulation of attention weights is disabled in forward attention."
                )
                self.cumulate_att_w = False
        else:
            raise NotImplementedError("Support only location or forward")
        self.dec = Decoder(
            idim=dec_idim,
            odim=odim,
            att=att,
            dlayers=args.dlayers,
            dunits=args.dunits,
            prenet_layers=args.prenet_layers,
            prenet_units=args.prenet_units,
            postnet_layers=args.postnet_layers,
            postnet_chans=args.postnet_chans,
            postnet_filts=args.postnet_filts,
            output_activation_fn=self.output_activation_fn,
            cumulate_att_w=self.cumulate_att_w,
            use_batch_norm=args.use_batch_norm,
            use_concate=args.use_concate,
            dropout_rate=args.dropout_rate,
            zoneout_rate=args.zoneout_rate,
            reduction_factor=args.reduction_factor,
        )
        self.taco2_loss = Tacotron2Loss(
            use_masking=args.use_masking,
            use_weighted_masking=args.use_weighted_masking,
            bce_pos_weight=args.bce_pos_weight,
        )
        # optional modules are only created (and registered) when enabled
        if self.use_guided_attn_loss:
            self.attn_loss = GuidedAttentionLoss(
                sigma=args.guided_attn_loss_sigma,
                alpha=args.guided_attn_loss_lambda,
            )
        if self.use_cbhg:
            self.cbhg = CBHG(
                idim=odim,
                odim=args.spc_dim,
                conv_bank_layers=args.cbhg_conv_bank_layers,
                conv_bank_chans=args.cbhg_conv_bank_chans,
                conv_proj_filts=args.cbhg_conv_proj_filts,
                conv_proj_chans=args.cbhg_conv_proj_chans,
                highway_layers=args.cbhg_highway_layers,
                highway_units=args.cbhg_highway_units,
                gru_units=args.cbhg_gru_units,
            )
            self.cbhg_loss = CBHGLoss(use_masking=args.use_masking)

        # load pretrained model
        # NOTE(review): load_pretrained_model is not defined in this class;
        # presumably it is inherited from TTSInterface -- confirm
        if args.pretrained_model is not None:
            self.load_pretrained_model(args.pretrained_model)

    def forward(
        self, xs, ilens, ys, labels, olens, spembs=None, extras=None, *args, **kwargs
    ):
        """Calculate forward propagation.

        Computes the Tacotron2 losses (L1, MSE and stop-token BCE), adds the
        guided attention loss and the CBHG losses when they are enabled,
        reports every individual value through `self.reporter` and returns
        the total.

        Args:
            xs (Tensor): Batch of padded character ids (B, Tmax).
            ilens (LongTensor): Batch of lengths of each input batch (B,).
            ys (Tensor): Batch of padded target features (B, Lmax, odim).
            labels (Tensor): Batch of stop token label sequences (B, Lmax).
                The given tensor is not modified.
            olens (LongTensor): Batch of the lengths of each target (B,).
            spembs (Tensor, optional):
                Batch of speaker embedding vectors (B, spk_embed_dim).
            extras (Tensor, optional):
                Batch of groundtruth spectrograms (B, Lmax, spc_dim).

        Returns:
            Tensor: Loss value.

        """
        # remove unnecessary padded part (for multi-gpus)
        max_in = max(ilens)
        max_out = max(olens)
        if max_in != xs.shape[1]:
            xs = xs[:, :max_in]
        if max_out != ys.shape[1]:
            ys = ys[:, :max_out]
            labels = labels[:, :max_out]

        # calculate tacotron2 outputs
        hs, hlens = self.enc(xs, ilens)
        if self.spk_embed_dim is not None:
            spembs = F.normalize(spembs).unsqueeze(1).expand(-1, hs.size(1), -1)
            hs = torch.cat([hs, spembs], dim=-1)
        after_outs, before_outs, logits, att_ws = self.dec(hs, hlens, ys)

        # modify mod part of groundtruth
        if self.reduction_factor > 1:
            olens = olens.new([olen - olen % self.reduction_factor for olen in olens])
            max_out = max(olens)
            ys = ys[:, :max_out]
            labels = labels[:, :max_out]
            # Rounding the lengths down can cut off the first positive stop
            # label of an utterance, so mark the last valid frame of EVERY
            # utterance (not only the last column of the batch). scatter is
            # out-of-place, hence the caller's label tensor stays untouched.
            last_frames = (olens - 1).clamp(min=0).unsqueeze(1).to(labels.device)
            labels = torch.scatter(labels, 1, last_frames, 1.0)

        # calculate taco2 loss
        l1_loss, mse_loss, bce_loss = self.taco2_loss(
            after_outs, before_outs, logits, ys, labels, olens
        )
        loss = l1_loss + mse_loss + bce_loss
        report_keys = [
            {"l1_loss": l1_loss.item()},
            {"mse_loss": mse_loss.item()},
            {"bce_loss": bce_loss.item()},
        ]

        # calculate attention loss
        if self.use_guided_attn_loss:
            # NOTE(kan-bayashi):
            # length of output for auto-regressive input will be changed when r > 1
            if self.reduction_factor > 1:
                olens_in = olens.new([olen // self.reduction_factor for olen in olens])
            else:
                olens_in = olens
            attn_loss = self.attn_loss(att_ws, ilens, olens_in)
            loss = loss + attn_loss
            report_keys += [
                {"attn_loss": attn_loss.item()},
            ]

        # calculate cbhg loss
        if self.use_cbhg:
            # remove unnecessary padded part (for multi-gpus)
            if max_out != extras.shape[1]:
                extras = extras[:, :max_out]

            # calculate cbhg outputs & loss and report them
            cbhg_outs, _ = self.cbhg(after_outs, olens)
            cbhg_l1_loss, cbhg_mse_loss = self.cbhg_loss(cbhg_outs, extras, olens)
            loss = loss + cbhg_l1_loss + cbhg_mse_loss
            report_keys += [
                {"cbhg_l1_loss": cbhg_l1_loss.item()},
                {"cbhg_mse_loss": cbhg_mse_loss.item()},
            ]

        report_keys += [{"loss": loss.item()}]
        self.reporter.report(report_keys)

        return loss

    def inference(self, x, inference_args, spemb=None, *args, **kwargs):
        """Generate the sequence of features given the sequences of characters.

        Args:
            x (Tensor): Input sequence of characters (T,).
            inference_args (Namespace):
                - threshold (float): Threshold in inference.
                - minlenratio (float): Minimum length ratio in inference.
                - maxlenratio (float): Maximum length ratio in inference.
                - use_att_constraint (bool, optional): Whether to apply the
                    attention constraint (treated as False when missing).
                - backward_window / forward_window: Window sizes, read only
                    when use_att_constraint is enabled.
            spemb (Tensor, optional): Speaker embedding vector (spk_embed_dim).

        Returns:
            Tensor: Output sequence of features (L, odim).
            Tensor: Output sequence of stop probabilities (L,).
            Tensor: Attention weights (L, T).

        """
        # get options
        threshold = inference_args.threshold
        minlenratio = inference_args.minlenratio
        maxlenratio = inference_args.maxlenratio
        # older argument sets do not carry this option (keep compatibility)
        use_att_constraint = getattr(inference_args, "use_att_constraint", False)
        if use_att_constraint:
            backward_window = inference_args.backward_window
            forward_window = inference_args.forward_window
        else:
            backward_window = 0
            forward_window = 0

        # encode the input and attach the normalized speaker embedding if used
        h = self.enc.inference(x)
        if self.spk_embed_dim is not None:
            spemb = F.normalize(spemb, dim=0)
            spemb = spemb.unsqueeze(0).expand(h.size(0), -1)
            h = torch.cat([h, spemb], dim=-1)

        # decode
        outs, probs, att_ws = self.dec.inference(
            h,
            threshold,
            minlenratio,
            maxlenratio,
            use_att_constraint=use_att_constraint,
            backward_window=backward_window,
            forward_window=forward_window,
        )

        if not self.use_cbhg:
            return outs, probs, att_ws

        # with CBHG, its outputs replace the decoder features
        cbhg_outs = self.cbhg.inference(outs)
        return cbhg_outs, probs, att_ws

    def calculate_all_attentions(
        self, xs, ilens, ys, spembs=None, keep_tensor=False, *args, **kwargs
    ):
        """Calculate all of the attention weights.

        The computation runs in evaluation mode without gradient tracking.
        The training/evaluation mode the module had before the call is
        restored afterwards, even if the computation raises.

        Args:
            xs (Tensor): Batch of padded character ids (B, Tmax).
            ilens (LongTensor): Batch of lengths of each input batch (B,).
            ys (Tensor): Batch of padded target features (B, Lmax, odim).
            spembs (Tensor, optional):
                Batch of speaker embedding vectors (B, spk_embed_dim).
            keep_tensor (bool, optional): Whether to keep original tensor.

        Returns:
            Union[ndarray, Tensor]: Batch of attention weights (B, Lmax, Tmax).

        """
        # check ilens type (should be list of int)
        if isinstance(ilens, (torch.Tensor, np.ndarray)):
            ilens = list(map(int, ilens))

        # remember the current mode so that a module in eval mode is not
        # silently switched to training mode by this call
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                hs, hlens = self.enc(xs, ilens)
                if self.spk_embed_dim is not None:
                    spembs = F.normalize(spembs).unsqueeze(1).expand(-1, hs.size(1), -1)
                    hs = torch.cat([hs, spembs], dim=-1)
                att_ws = self.dec.calculate_all_attentions(hs, hlens, ys)
        finally:
            self.train(was_training)

        if keep_tensor:
            return att_ws
        else:
            return att_ws.cpu().numpy()

    @property
    def base_plot_keys(self):
        """Return the base key names that are plotted during training.

        Each key has to match a value reported through `chainer.reporter`.
        For a key such as `loss`, the reporter reports `main/loss` and
        `validation/main/loss`, and a figure `loss.png` visualizing both
        values is created.

        Returns:
            list: List of strings which are base keys to plot during training.

        """
        # keys of the optional losses, paired with the flag that enables them
        optional_keys = (
            (self.use_guided_attn_loss, ["attn_loss"]),
            (self.use_cbhg, ["cbhg_l1_loss", "cbhg_mse_loss"]),
        )
        plot_keys = ["loss", "l1_loss", "mse_loss", "bce_loss"]
        for enabled, keys in optional_keys:
            if enabled:
                plot_keys.extend(keys)
        return plot_keys


================================================
FILE: nets/pytorch_backend/e2e_tts_transformer.py
================================================
# Copyright 2019 Tomoki Hayashi
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""TTS-Transformer related modules."""

import logging

import torch
import torch.nn.functional as F

from espnet.nets.pytorch_backend.e2e_tts_tacotron2 import GuidedAttentionLoss
from espnet.nets.pytorch_backend.e2e_tts_tacotron2 import (
    Tacotron2Loss as TransformerLoss,  # noqa: H301
)
from espnet.nets.pytorch_backend.nets_utils import make_non_pad_mask
from espnet.nets.pytorch_backend.tacotron2.decoder import Postnet
from espnet.nets.pytorch_backend.tacotron2.decoder import Prenet as DecoderPrenet
from espnet.nets.pytorch_backend.tacotron2.encoder import Encoder as EncoderPrenet
from espnet.nets.pytorch_backend.transformer.attention import MultiHeadedAttention
from espnet.nets.pytorch_backend.transformer.decoder import Decoder
from espnet.nets.pytorch_backend.transformer.embedding import PositionalEncoding
from espnet.nets.pytorch_backend.transformer.embedding import ScaledPositionalEncoding
from espnet.nets.pytorch_backend.transformer.encoder import Encoder
from espnet.nets.pytorch_backend.transformer.initializer import initialize
from espnet.nets.pytorch_backend.transformer.mask import subsequent_mask
from espnet.nets.tts_interface import TTSInterface
from espnet.utils.cli_utils import strtobool
from espnet.utils.fill_missing_args import fill_missing_args


class GuidedMultiHeadAttentionLoss(GuidedAttentionLoss):
    """Guided attention loss for multi head attention weights.

    The masks built by the parent class get an extra head axis so that they
    broadcast over every head of the attention weights.

    Args:
        sigma (float, optional): Standard deviation to control
        how close attention to a diagonal.
        alpha (float, optional): Scaling coefficient (lambda).
        reset_always (bool, optional): Whether to always reset masks.

    """

    def forward(self, att_ws, ilens, olens):
        """Calculate forward propagation.

        Args:
            att_ws (Tensor):
                Batch of multi head attention weights (B, H, T_max_out, T_max_in).
            ilens (LongTensor): Batch of input lengths (B,).
            olens (LongTensor): Batch of output lengths (B,).

        Returns:
            Tensor: Guided attention loss value.

        """
        device = att_ws.device

        # build the cached masks lazily; unsqueeze(1) inserts the head axis
        if self.guided_attn_masks is None:
            guide = self._make_guided_attention_masks(ilens, olens)
            self.guided_attn_masks = guide.to(device).unsqueeze(1)
        if self.masks is None:
            valid = self._make_masks(ilens, olens)
            self.masks = valid.to(device).unsqueeze(1)

        # penalize the attention weights, then average over the selected elements
        weighted = self.guided_attn_masks * att_ws
        loss = torch.mean(weighted.masked_select(self.masks))

        if self.reset_always:
            self._reset_masks()

        return self.alpha * loss


try:
    from espnet.nets.pytorch_backend.transformer.plot import PlotAttentionReport
except (ImportError, TypeError):
    # the plotting base class could not be imported: expose None instead
    TTSPlot = None
else:

    class TTSPlot(PlotAttentionReport):
        """Attention plot module for TTS-Transformer."""

        def plotfn(
            self, data_dict, uttid_list, attn_dict, outdir, suffix="png", savefn=None
        ):
            """Plot multi head attentions.

            Args:
                data_dict (dict): Utts info from json file.
                uttid_list (list): List of utt_id.
                attn_dict (dict): Multi head attention dict.
                    Values should be numpy.ndarray (H, L, T)
                outdir (str): Directory name to save figures.
                suffix (str): Filename suffix including image type (e.g., png).
                savefn (function, optional): Function called as
                    ``savefn(fig, filename)`` to save each figure. When None,
                    the figure's own ``savefig(filename)`` is used.

            """
            import matplotlib.pyplot as plt
            from espnet.nets.pytorch_backend.transformer.plot import (
                _plot_and_save_attention,  # noqa: H301
            )

            for name, att_ws in attn_dict.items():
                for utt_id, att_w in zip(uttid_list, att_ws):
                    filename = "%s/%s.%s.%s" % (outdir, utt_id, name, suffix)
                    if "fbank" in name:
                        # feature entries are drawn as a single image
                        fig = plt.Figure()
                        ax = fig.subplots(1, 1)
                        ax.imshow(att_w, aspect="auto")
                        ax.set_xlabel("frames")
                        ax.set_ylabel("fbank coeff")
                        fig.tight_layout()
                    else:
                        fig = _plot_and_save_attention(att_w, filename)
                    if savefn is None:
                        # the default argument used to be called directly and
                        # raised TypeError; fall back to the figure itself
                        # NOTE(review): assumes _plot_and_save_attention returns
                        # a matplotlib Figure, like the fbank branch - confirm
                        fig.savefig(filename)
                    else:
                        savefn(fig, filename)


class Transformer(TTSInterface, torch.nn.Module):
    """Text-to-Speech Transformer module.

    This is a module of text-to-speech Transformer described
    in `Neural Speech Synthesis with Transformer Network`_,
    which converts a sequence of characters
    or phonemes into a sequence of Mel-filterbanks.

    .. _`Neural Speech Synthesis with Transformer Network`:
        https://arxiv.org/pdf/1809.08895.pdf

    """

    @staticmethod
    def add_arguments(parser):
        """Add model-specific arguments to the parser.

        All options are registered in one argument group named
        "transformer model setting".

        Args:
            parser: Argument parser (or any object providing
                ``add_argument_group``) to register the options on.

        Returns:
            The same ``parser`` object that was passed in.

        """
        group = parser.add_argument_group("transformer model setting")
        # network structure related
        group.add_argument(
            "--embed-dim",
            default=512,
            type=int,
            help="Dimension of character embedding in encoder prenet",
        )
        group.add_argument(
            "--eprenet-conv-layers",
            default=3,
            type=int,
            help="Number of encoder prenet convolution layers",
        )
        group.add_argument(
            "--eprenet-conv-chans",
            default=256,
            type=int,
            help="Number of encoder prenet convolution channels",
        )
        group.add_argument(
            "--eprenet-conv-filts",
            default=5,
            type=int,
            help="Filter size of encoder prenet convolution",
        )
        group.add_argument(
            "--dprenet-layers",
            default=2,
            type=int,
            help="Number of decoder prenet layers",
        )
        group.add_argument(
            "--dprenet-units",
            default=256,
            type=int,
            help="Number of decoder prenet hidden units",
        )
        group.add_argument(
            "--elayers", default=3, type=int, help="Number of encoder layers"
        )
        group.add_argument(
            "--eunits", default=1536, type=int, help="Number of encoder hidden units"
        )
        group.add_argument(
            "--adim",
            default=384,
            type=int,
            help="Number of attention transformation dimensions",
        )
        group.add_argument(
            "--aheads",
            default=4,
            type=int,
            help="Number of heads for multi head attention",
        )
        group.add_argument(
            "--dlayers", default=3, type=int, help="Number of decoder layers"
        )
        group.add_argument(
            "--dunits", default=1536, type=int, help="Number of decoder hidden units"
        )
        group.add_argument(
            "--positionwise-layer-type",
            default="linear",
            type=str,
            choices=["linear", "conv1d", "conv1d-linear"],
            help="Positionwise layer type.",
        )
        group.add_argument(
            "--positionwise-conv-kernel-size",
            default=1,
            type=int,
            help="Kernel size of positionwise conv1d layer",
        )
        group.add_argument(
            "--postnet-layers", default=5, type=int, help="Number of postnet layers"
        )
        group.add_argument(
            "--postnet-chans", default=256, type=int, help="Number of postnet channels"
        )
        group.add_argument(
            "--postnet-filts", default=5, type=int, help="Filter size of postnet"
        )
        group.add_argument(
            "--use-scaled-pos-enc",
            default=True,
            type=strtobool,
            help="Use trainable scaled positional encoding "
            "instead of the fixed scale one.",
        )
        group.add_argument(
            "--use-batch-norm",
            default=True,
            type=strtobool,
            help="Whether to use batch normalization",
        )
        group.add_argument(
            "--encoder-normalize-before",
            default=False,
            type=strtobool,
            help="Whether to apply layer norm before encoder block",
        )
        group.add_argument(
            "--decoder-normalize-before",
            default=False,
            type=strtobool,
            help="Whether to apply layer norm before decoder block",
        )
        group.add_argument(
            "--encoder-concat-after",
            default=False,
            type=strtobool,
            help="Whether to concatenate attention layer's input and output in encoder",
        )
        group.add_argument(
            "--decoder-concat-after",
            default=False,
            type=strtobool,
            help="Whether to concatenate attention layer's input and output in decoder",
        )
        group.add_argument(
            "--reduction-factor", default=1, type=int, help="Reduction factor"
        )
        group.add_argument(
            "--spk-embed-dim",
            default=None,
            type=int,
            help="Number of speaker embedding dimensions",
        )
        group.add_argument(
            "--spk-embed-integration-type",
            type=str,
            default="add",
            choices=["add", "concat"],
            help="How to integrate speaker embedding",
        )
        # training related
        group.add_argument(
            "--transformer-init",
            type=str,
            default="pytorch",
            choices=[
                "pytorch",
                "xavier_uniform",
                "xavier_normal",
                "kaiming_uniform",
                "kaiming_normal",
            ],
            help="How to initialize transformer parameters",
        )
        group.add_argument(
            "--initial-encoder-alpha",
            type=float,
            default=1.0,
            help="Initial alpha value in encoder's ScaledPositionalEncoding",
        )
        group.add_argument(
            "--initial-decoder-alpha",
            type=float,
            default=1.0,
            help="Initial alpha value in decoder's ScaledPositionalEncoding",
        )
        group.add_argument(
            "--transformer-lr",
            default=1.0,
            type=float,
            help="Initial value of learning rate",
        )
        group.add_argument(
            "--transformer-warmup-steps",
            default=4000,
            type=int,
            help="Optimizer warmup steps",
        )
        group.add_argument(
            "--transformer-enc-dropout-rate",
            default=0.1,
            type=float,
            help="Dropout rate for transformer encoder except for attention",
        )
        group.add_argument(
            "--transformer-enc-positional-dropout-rate",
            default=0.1,
            type=float,
            help="Dropout rate for transformer encoder positional encoding",
        )
        group.add_argument(
            "--transformer-enc-attn-dropout-rate",
            default=0.1,
            type=float,
            help="Dropout rate for transformer encoder self-attention",
        )
        group.add_argument(
            "--transformer-dec-dropout-rate",
            default=0.1,
            type=float,
            help="Dropout rate for transformer decoder "
            "except for attention and pos encoding",
        )
        group.add_argument(
            "--transformer-dec-positional-dropout-rate",
            default=0.1,
            type=float,
            help="Dropout rate for transformer decoder positional encoding",
        )
        group.add_argument(
            "--transformer-dec-attn-dropout-rate",
            default=0.1,
            type=float,
            help="Dropout rate for transformer decoder self-attention",
        )
        group.add_argument(
            "--transformer-enc-dec-attn-dropout-rate",
            default=0.1,
            type=float,
            help="Dropout rate for transformer encoder-decoder attention",
        )
        group.add_argument(
            "--eprenet-dropout-rate",
            default=0.5,
            type=float,
            help="Dropout rate in encoder prenet",
        )
        group.add_argument(
            "--dprenet-dropout-rate",
            default=0.5,
            type=float,
            help="Dropout rate in decoder prenet",
        )
        group.add_argument(
            "--postnet-dropout-rate",
            default=0.5,
            type=float,
            help="Dropout rate in postnet",
        )
        group.add_argument(
            "--pretrained-model", default=None, type=str, help="Pretrained model path"
        )
        # loss related
        group.add_argument(
            "--use-masking",
            default=True,
            type=strtobool,
            help="Whether to use masking in calculation of loss",
        )
        group.add_argument(
            "--use-weighted-masking",
            default=False,
            type=strtobool,
            help="Whether to use weighted masking in calculation of loss",
        )
        group.add_argument(
            "--loss-type",
            default="L1",
            choices=["L1", "L2", "L1+L2"],
            help="How to calc loss",
        )
        group.add_argument(
            "--bce-pos-weight",
            default=5.0,
            type=float,
            help="Positive sample weight in BCE calculation "
            "(only for use-masking=True)",
        )
        group.add_argument(
            "--use-guided-attn-loss",
            default=False,
            type=strtobool,
            help="Whether to use guided attention loss",
        )
        group.add_argument(
            "--guided-attn-loss-sigma",
            default=0.4,
            type=float,
            help="Sigma in guided attention loss",
        )
        group.add_argument(
            "--guided-attn-loss-lambda",
            default=1.0,
            type=float,
            help="Lambda in guided attention loss",
        )
        # NOTE: adjacent string literals are joined verbatim, so the first part
        # of the next two help texts must end with a separator
        group.add_argument(
            "--num-heads-applied-guided-attn",
            default=2,
            type=int,
            help="Number of heads in each layer to be applied guided attention loss. "
            "If set -1, all of the heads will be applied.",
        )
        group.add_argument(
            "--num-layers-applied-guided-attn",
            default=2,
            type=int,
            help="Number of layers to be applied guided attention loss. "
            "If set -1, all of the layers will be applied.",
        )
        group.add_argument(
            "--modules-applied-guided-attn",
            type=str,
            nargs="+",
            default=["encoder-decoder"],
            help="Module name list to be applied guided attention loss",
        )
        return parser

    @property
    def attention_plot_class(self):
        """Return plot class for attention weight plot.

        Returns:
            The module-level ``TTSPlot`` object: the plot class, or ``None``
            when the import of its base class failed at module load time
            (``TTSPlot`` is set to ``None`` in that case).

        """
        return TTSPlot

    def __init__(self, idim, odim, args=None):
        """Initialize TTS-Transformer module.

        Args:
            idim (int): Dimension of the inputs.
            odim (int): Dimension of the outputs.
            args (Namespace, optional):
                - embed_dim (int): Dimension of character embedding.
                - eprenet_conv_layers (int):
                    Number of encoder prenet convolution layers.
                - eprenet_conv_chans (int):
                    Number of encoder prenet convolution channels.
                - eprenet_conv_filts (int): Filter size of encoder prenet convolution.
                - dprenet_layers (int): Number of decoder prenet layers.
                - dprenet_units (int): Number of decoder prenet hidden units.
                - elayers (int): Number of encoder layers.
                - eunits (int): Number of encoder hidden units.
                - adim (int): Number of attention transformation dimensions.
                - aheads (int): Number of heads for multi head attention.
                - dlayers (int): Number of decoder layers.
                - dunits (int): Number of decoder hidden units.
                - positionwise_layer_type (str): Positionwise layer type
                    (passed to the encoder).
                - positionwise_conv_kernel_size (int):
                    Kernel size of positionwise conv1d layer (passed to the encoder).
                - postnet_layers (int): Number of postnet layers.
                - postnet_chans (int): Number of postnet channels.
                - postnet_filts (int): Filter size of postnet.
                - use_scaled_pos_enc (bool):
                    Whether to use trainable scaled positional encoding.
                - use_batch_norm (bool):
                    Whether to use batch normalization in encoder prenet.
                - encoder_normalize_before (bool):
                    Whether to perform layer normalization before encoder block.
                - decoder_normalize_before (bool):
                    Whether to perform layer normalization before decoder block.
                - encoder_concat_after (bool): Whether to concatenate attention
                    layer's input and output in encoder.
                - decoder_concat_after (bool): Whether to concatenate attention
                    layer's input and output in decoder.
                - reduction_factor (int): Reduction factor.
                - spk_embed_dim (int): Number of speaker embedding dimensions.
                - spk_embed_integration_type: How to integrate speaker embedding.
                - transformer_init (str): How to initialize transformer parameters.
                - initial_encoder_alpha (float):
                    Initial alpha of the encoder's scaled positional encoding.
                - initial_decoder_alpha (float):
                    Initial alpha of the decoder's scaled positional encoding.
                - transformer_lr (float): Initial value of learning rate.
                - transformer_warmup_steps (int): Optimizer warmup steps.
                - transformer_enc_dropout_rate (float):
                    Dropout rate in encoder except attention & positional encoding.
                - transformer_enc_positional_dropout_rate (float):
                    Dropout rate after encoder positional encoding.
                - transformer_enc_attn_dropout_rate (float):
                    Dropout rate in encoder self-attention module.
                - transformer_dec_dropout_rate (float):
                    Dropout rate in decoder except attention & positional encoding.
                - transformer_dec_positional_dropout_rate (float):
                    Dropout rate after decoder positional encoding.
                - transformer_dec_attn_dropout_rate (float):
                    Dropout rate in decoder self-attention module.
                - transformer_enc_dec_attn_dropout_rate (float):
                    Dropout rate in encoder-decoder attention module.
                - eprenet_dropout_rate (float): Dropout rate in encoder prenet.
                - dprenet_dropout_rate (float): Dropout rate in decoder prenet.
                - postnet_dropout_rate (float): Dropout rate in postnet.
                - pretrained_model (str): Pretrained model path; when not None,
                    it is loaded at the end of the initialization.
                - use_masking (bool):
                    Whether to apply masking for padded part in loss calculation.
                - use_weighted_masking (bool):
                    Whether to apply weighted masking in loss calculation.
                - bce_pos_weight (float): Positive sample weight in bce calculation
                    (only for use_masking=true).
                - loss_type (str): How to calculate loss.
                - use_guided_attn_loss (bool): Whether to use guided attention loss.
                - num_heads_applied_guided_attn (int):
                    Number of heads in each layer to apply guided attention loss
                    (-1 means all heads).
                - num_layers_applied_guided_attn (int):
                    Number of layers to apply guided attention loss
                    (-1 means all layers).
                - modules_applied_guided_attn (list):
                    List of module names to apply guided attention loss.
                - guided_attn_loss_sigma (float): Sigma in guided attention loss.
                - guided_attn_loss_lambda (float): Lambda in guided attention loss.

        """
        # initialize base classes
        TTSInterface.__init__(self)
        torch.nn.Module.__init__(self)

        # fill missing arguments
        args = fill_missing_args(args, self.add_arguments)

        # store hyperparameters
        self.idim = idim
        self.odim = odim
        self.spk_embed_dim = args.spk_embed_dim
        if self.spk_embed_dim is not None:
            self.spk_embed_integration_type = args.spk_embed_integration_type
        self.use_scaled_pos_enc = args.use_scaled_pos_enc
        self.reduction_factor = args.reduction_factor
        self.loss_type = args.loss_type
        self.use_guided_attn_loss = args.use_guided_attn_loss
        if self.use_guided_attn_loss:
            if args.num_layers_applied_guided_attn == -1:
                # -1 means "all layers". forward() walks each module's layers
                # from the last one and stops at this count or when the layers
                # run out, so the larger depth covers the encoder and the decoder
                # (using elayers alone cut the decoder short when dlayers > elayers)
                self.num_layers_applied_guided_attn = max(args.elayers, args.dlayers)
            else:
                self.num_layers_applied_guided_attn = (
                    args.num_layers_applied_guided_attn
                )
            if args.num_heads_applied_guided_attn == -1:
                self.num_heads_applied_guided_attn = args.aheads
            else:
                self.num_heads_applied_guided_attn = args.num_heads_applied_guided_attn
            self.modules_applied_guided_attn = args.modules_applied_guided_attn

        # use idx 0 as padding idx
        padding_idx = 0

        # get positional encoding class
        pos_enc_class = (
            ScaledPositionalEncoding if self.use_scaled_pos_enc else PositionalEncoding
        )

        # define transformer encoder
        if args.eprenet_conv_layers != 0:
            # encoder prenet
            encoder_input_layer = torch.nn.Sequential(
                EncoderPrenet(
                    idim=idim,
                    embed_dim=args.embed_dim,
                    elayers=0,
                    econv_layers=args.eprenet_conv_layers,
                    econv_chans=args.eprenet_conv_chans,
                    econv_filts=args.eprenet_conv_filts,
                    use_batch_norm=args.use_batch_norm,
                    dropout_rate=args.eprenet_dropout_rate,
                    padding_idx=padding_idx,
                ),
                torch.nn.Linear(args.eprenet_conv_chans, args.adim),
            )
        else:
            encoder_input_layer = torch.nn.Embedding(
                num_embeddings=idim, embedding_dim=args.adim, padding_idx=padding_idx
            )
        self.encoder = Encoder(
            idim=idim,
            attention_dim=args.adim,
            attention_heads=args.aheads,
            linear_units=args.eunits,
            num_blocks=args.elayers,
            input_layer=encoder_input_layer,
            dropout_rate=args.transformer_enc_dropout_rate,
            positional_dropout_rate=args.transformer_enc_positional_dropout_rate,
            attention_dropout_rate=args.transformer_enc_attn_dropout_rate,
            pos_enc_class=pos_enc_class,
            normalize_before=args.encoder_normalize_before,
            concat_after=args.encoder_concat_after,
            positionwise_layer_type=args.positionwise_layer_type,
            positionwise_conv_kernel_size=args.positionwise_conv_kernel_size,
        )

        # define projection layer
        if self.spk_embed_dim is not None:
            if self.spk_embed_integration_type == "add":
                self.projection = torch.nn.Linear(self.spk_embed_dim, args.adim)
            else:
                self.projection = torch.nn.Linear(
                    args.adim + self.spk_embed_dim, args.adim
                )

        # define transformer decoder
        if args.dprenet_layers != 0:
            # decoder prenet
            decoder_input_layer = torch.nn.Sequential(
                DecoderPrenet(
                    idim=odim,
                    n_layers=args.dprenet_layers,
                    n_units=args.dprenet_units,
                    dropout_rate=args.dprenet_dropout_rate,
                ),
                torch.nn.Linear(args.dprenet_units, args.adim),
            )
        else:
            decoder_input_layer = "linear"
        self.decoder = Decoder(
            odim=-1,
            attention_dim=args.adim,
            attention_heads=args.aheads,
            linear_units=args.dunits,
            num_blocks=args.dlayers,
            dropout_rate=args.transformer_dec_dropout_rate,
            positional_dropout_rate=args.transformer_dec_positional_dropout_rate,
            self_attention_dropout_rate=args.transformer_dec_attn_dropout_rate,
            src_attention_dropout_rate=args.transformer_enc_dec_attn_dropout_rate,
            input_layer=decoder_input_layer,
            use_output_layer=False,
            pos_enc_class=pos_enc_class,
            normalize_before=args.decoder_normalize_before,
            concat_after=args.decoder_concat_after,
        )

        # define final projection
        # (each decoder step emits reduction_factor frames and stop logits)
        self.feat_out = torch.nn.Linear(args.adim, odim * args.reduction_factor)
        self.prob_out = torch.nn.Linear(args.adim, args.reduction_factor)

        # define postnet
        self.postnet = (
            None
            if args.postnet_layers == 0
            else Postnet(
                idim=idim,
                odim=odim,
                n_layers=args.postnet_layers,
                n_chans=args.postnet_chans,
                n_filts=args.postnet_filts,
                use_batch_norm=args.use_batch_norm,
                dropout_rate=args.postnet_dropout_rate,
            )
        )

        # define loss function
        self.criterion = TransformerLoss(
            use_masking=args.use_masking,
            use_weighted_masking=args.use_weighted_masking,
            bce_pos_weight=args.bce_pos_weight,
        )
        if self.use_guided_attn_loss:
            self.attn_criterion = GuidedMultiHeadAttentionLoss(
                sigma=args.guided_attn_loss_sigma,
                alpha=args.guided_attn_loss_lambda,
            )

        # initialize parameters
        self._reset_parameters(
            init_type=args.transformer_init,
            init_enc_alpha=args.initial_encoder_alpha,
            init_dec_alpha=args.initial_decoder_alpha,
        )

        # load pretrained model
        if args.pretrained_model is not None:
            self.load_pretrained_model(args.pretrained_model)

    def _reset_parameters(self, init_type, init_enc_alpha=1.0, init_dec_alpha=1.0):
        """Initialize the parameters and the positional encoding alphas.

        Args:
            init_type (str): Initialization type forwarded to `initialize`.
            init_enc_alpha (float, optional): Value assigned to the alpha of the
                last module in the encoder's `embed`.
            init_dec_alpha (float, optional): Value assigned to the alpha of the
                last module in the decoder's `embed`.

        """
        initialize(self, init_type)

        # alpha only exists when scaled positional encoding is in use
        if not self.use_scaled_pos_enc:
            return

        # the encoder is assigned first, then the decoder
        alpha_targets = (
            (self.encoder, init_enc_alpha),
            (self.decoder, init_dec_alpha),
        )
        for module, alpha_value in alpha_targets:
            module.embed[-1].alpha.data = torch.tensor(alpha_value)

    def _add_first_frame_and_remove_last_frame(self, ys):
        ys_in = torch.cat(
            [ys.new_zeros((ys.shape[0], 1, ys.shape[2])), ys[:, :-1]], dim=1
        )
        return ys_in

    def forward(self, xs, ilens, ys, labels, olens, spembs=None, *args, **kwargs):
        """Calculate forward propagation.

        Args:
            xs (Tensor): Batch of padded character ids (B, Tmax).
            ilens (LongTensor): Batch of lengths of each input batch (B,).
            ys (Tensor): Batch of padded target features (B, Lmax, odim).
            labels (Tensor): Batch of padded stop labels (B, Lmax); they are
                passed to the loss together with the stop logits.
            olens (LongTensor): Batch of the lengths of each target (B,).
            spembs (Tensor, optional):
                Batch of speaker embedding vectors (B, spk_embed_dim).

        Returns:
            Tensor: Loss value.

        Raises:
            ValueError: If ``self.loss_type`` is not "L1", "L2" or "L1+L2".

        """
        # remove unnecessary padded part (for multi-gpus)
        max_ilen = max(ilens)
        max_olen = max(olens)
        if max_ilen != xs.shape[1]:
            xs = xs[:, :max_ilen]
        if max_olen != ys.shape[1]:
            ys = ys[:, :max_olen]
            labels = labels[:, :max_olen]

        # forward encoder
        x_masks = self._source_mask(ilens)
        hs, h_masks = self.encoder(xs, x_masks)

        # integrate speaker embedding
        if self.spk_embed_dim is not None:
            hs = self._integrate_with_spk_embed(hs, spembs)

        # thin out frames for reduction factor (B, Lmax, odim) ->  (B, Lmax//r, odim)
        if self.reduction_factor > 1:
            ys_in = ys[:, self.reduction_factor - 1 :: self.reduction_factor]
            olens_in = olens.new([olen // self.reduction_factor for olen in olens])
        else:
            ys_in, olens_in = ys, olens

        # add first zero frame and remove last frame for auto-regressive
        ys_in = self._add_first_frame_and_remove_last_frame(ys_in)

        # forward decoder
        y_masks = self._target_mask(olens_in)
        zs, _ = self.decoder(ys_in, y_masks, hs, h_masks)
        # (B, Lmax//r, odim * r) -> (B, Lmax//r * r, odim)
        before_outs = self.feat_out(zs).view(zs.size(0), -1, self.odim)
        # (B, Lmax//r, r) -> (B, Lmax//r * r)
        logits = self.prob_out(zs).view(zs.size(0), -1)

        # postnet -> (B, Lmax//r * r, odim)
        if self.postnet is None:
            after_outs = before_outs
        else:
            after_outs = before_outs + self.postnet(
                before_outs.transpose(1, 2)
            ).transpose(1, 2)

        # modify mod part of groundtruth
        if self.reduction_factor > 1:
            olens = olens.new([olen - olen % self.reduction_factor for olen in olens])
            max_olen = max(olens)
            ys = ys[:, :max_olen]
            labels = labels[:, :max_olen]
            # Mark the last valid frame of EVERY utterance as a stop frame.
            # Setting only ``labels[:, -1]`` touches the last column, which lies
            # outside the trimmed length of the shorter utterances, and it also
            # wrote into the caller's tensor through the slice view; scatter()
            # returns a new tensor instead.
            # clamp keeps the index valid for a length trimmed down to zero
            last_idx = (olens - 1).clamp(min=0).long().unsqueeze(1)
            labels = labels.scatter(1, last_idx.to(labels.device), 1.0)

        # calculate loss values
        l1_loss, l2_loss, bce_loss = self.criterion(
            after_outs, before_outs, logits, ys, labels, olens
        )
        if self.loss_type == "L1":
            loss = l1_loss + bce_loss
        elif self.loss_type == "L2":
            loss = l2_loss + bce_loss
        elif self.loss_type == "L1+L2":
            loss = l1_loss + l2_loss + bce_loss
        else:
            raise ValueError("unknown --loss-type " + self.loss_type)
        report_keys = [
            {"l1_loss": l1_loss.item()},
            {"l2_loss": l2_loss.item()},
            {"bce_loss": bce_loss.item()},
            {"loss": loss.item()},
        ]

        # calculate guided attention loss
        if self.use_guided_attn_loss:
            # calculate for encoder
            if "encoder" in self.modules_applied_guided_attn:
                att_ws = self._gather_guided_attn_ws(
                    self.encoder.encoders, "self_attn"
                )  # (B, H*L, T_in, T_in)
                enc_attn_loss = self.attn_criterion(att_ws, ilens, ilens)
                loss = loss + enc_attn_loss
                report_keys += [{"enc_attn_loss": enc_attn_loss.item()}]
            # calculate for decoder
            if "decoder" in self.modules_applied_guided_attn:
                att_ws = self._gather_guided_attn_ws(
                    self.decoder.decoders, "self_attn"
                )  # (B, H*L, T_out, T_out)
                dec_attn_loss = self.attn_criterion(att_ws, olens_in, olens_in)
                loss = loss + dec_attn_loss
                report_keys += [{"dec_attn_loss": dec_attn_loss.item()}]
            # calculate for encoder-decoder
            if "encoder-decoder" in self.modules_applied_guided_attn:
                att_ws = self._gather_guided_attn_ws(
                    self.decoder.decoders, "src_attn"
                )  # (B, H*L, T_out, T_in)
                enc_dec_attn_loss = self.attn_criterion(att_ws, ilens, olens_in)
                loss = loss + enc_dec_attn_loss
                report_keys += [{"enc_dec_attn_loss": enc_dec_attn_loss.item()}]

        # report extra information
        if self.use_scaled_pos_enc:
            report_keys += [
                {"encoder_alpha": self.encoder.embed[-1].alpha.data.item()},
                {"decoder_alpha": self.decoder.embed[-1].alpha.data.item()},
            ]
        self.reporter.report(report_keys)

        return loss

    def _gather_guided_attn_ws(self, layers, attn_name):
        """Concatenate the attention weights used for the guided attention loss.

        Layers are visited from the last one backwards. Only the first
        ``num_heads_applied_guided_attn`` heads of each visited layer are kept,
        and the walk stops after ``num_layers_applied_guided_attn`` layers or
        when the layers run out.

        Args:
            layers: Indexable, sized container of layers.
            attn_name (str): Attribute name of the attention module inside each
                layer ("self_attn" or "src_attn"); its ``attn`` is collected.

        Returns:
            Tensor: Collected attention weights concatenated along dim 1.

        """
        att_ws = []
        for idx, layer_idx in enumerate(reversed(range(len(layers)))):
            attn = getattr(layers[layer_idx], attn_name).attn
            att_ws += [attn[:, : self.num_heads_applied_guided_attn]]
            if idx + 1 == self.num_layers_applied_guided_attn:
                break
        return torch.cat(att_ws, dim=1)

    def inference(self, x, inference_args, spemb=None, *args, **kwargs):
        """Generate the sequence of features given the sequences of characters.

        The input is encoded once; the decoder is then run one step at a time,
        feeding back the last generated frame, until a stop probability reaches
        the threshold or the maximum length is hit (but never before the
        minimum length has been produced).

        Args:
            x (Tensor): Input sequence of characters (T,).
            inference_args (Namespace):
                - threshold (float): Threshold in inference.
                - minlenratio (float): Minimum length ratio in inference.
                - maxlenratio (float): Maximum length ratio in inference.
            spemb (Tensor, optional): Speaker embedding vector (spk_embed_dim).

        Returns:
            Tensor: Output sequence of features (L, odim).
            Tensor: Output sequence of stop probabilities (L,).
            Tensor: Encoder-decoder (source) attention weights (#layers, #heads, L, T).

        """
        # read decoding options
        threshold = inference_args.threshold
        minlenratio = inference_args.minlenratio
        maxlenratio = inference_args.maxlenratio
        # the option may be absent from older argument namespaces
        if getattr(inference_args, "use_att_constraint", False):
            logging.warning(
                "Attention constraint is not yet supported in Transformer. Not enabled."
            )

        # run the encoder on a batch of size one
        hs, _ = self.encoder(x.unsqueeze(0), None)

        # mix in the speaker embedding when the model was built with one
        if self.spk_embed_dim is not None:
            hs = self._integrate_with_spk_embed(hs, spemb.unsqueeze(0))

        # bounds on the number of decoder steps
        num_enc_frames = hs.size(1)
        maxlen = int(num_enc_frames * maxlenratio / self.reduction_factor)
        minlen = int(num_enc_frames * minlenratio / self.reduction_factor)

        # decoder starts from a single all-zero frame
        ys = hs.new_zeros(1, 1, self.odim)
        outs = []
        probs = []
        att_ws = None
        z_cache = self.decoder.init_state(x)

        step = 0
        while True:
            step += 1

            # decode the step-th position
            y_masks = subsequent_mask(step).unsqueeze(0).to(x.device)
            z, z_cache = self.decoder.forward_one_step(
                ys, y_masks, hs, cache=z_cache
            )  # (B, adim)
            frames = self.feat_out(z).view(self.reduction_factor, self.odim)
            outs.append(frames)  # [(r, odim), ...]
            probs.append(torch.sigmoid(self.prob_out(z))[0])  # [(r), ...]

            # the last frame of this step becomes the next decoder input
            ys = torch.cat(
                (ys, frames[-1].view(1, 1, self.odim)), dim=1
            )  # (1, step + 1, odim)

            # newest row of every source-attention map: [(#heads, 1, T), ...]
            latest = [
                m.attn[0, :, -1].unsqueeze(1)
                for name, m in self.named_modules()
                if isinstance(m, MultiHeadedAttention) and "src" in name
            ]
            if att_ws is None:
                att_ws = latest
            else:
                # grow each map along the output-time axis: [(#heads, l, T), ...]
                att_ws = [
                    torch.cat([past, new], dim=1) for past, new in zip(att_ws, latest)
                ]

            # stop once requested, but only after the minimum length is reached
            stop_requested = int(sum(probs[-1] >= threshold)) > 0 or step >= maxlen
            if stop_requested and step >= minlen:
                break

        # (L, odim) -> (1, L, odim) -> (1, odim, L)
        outs = torch.cat(outs, dim=0).unsqueeze(0).transpose(1, 2)
        if self.postnet is not None:
            outs = outs + self.postnet(outs)  # (1, odim, L)
        outs = outs.transpose(2, 1).squeeze(0)  # (L, odim)
        probs = torch.cat(probs, dim=0)

        # stack per-layer maps -> (#layers, #heads, L, T)
        att_ws = torch.stack(att_ws, dim=0)

        return outs, probs, att_ws

    def calculate_all_attentions(
        self,
        xs,
        ilens,
        ys,
        olens,
        spembs=None,
        skip_output=False,
        keep_tensor=False,
        *args,
        **kwargs
    ):
        """Calculate all of the attention weights.

        The computation runs in evaluation mode without gradients. On exit
        (including when an exception is raised) the module is put back into
        whichever mode, train or eval, it was in when this method was called.

        Args:
            xs (Tensor): Batch of padded character ids (B, Tmax).
            ilens (LongTensor): Batch of lengths of each input batch (B,).
            ys (Tensor): Batch of padded target features (B, Lmax, odim).
            olens (LongTensor): Batch of the lengths of each target (B,).
            spembs (Tensor, optional):
                Batch of speaker embedding vectors (B, spk_embed_dim).
            skip_output (bool, optional): Whether to skip calculate the final output.
            keep_tensor (bool, optional): Whether to keep original tensor.

        Returns:
            dict: Dict of attention weights and outputs. Attention weights are
                keyed by the name of their attention module. Unless
                ``skip_output`` is set, the keys ``"before_postnet_fbank"`` and
                ``"after_postnet_fbank"`` hold the generated features. With
                ``keep_tensor`` the values are the tensors as they are;
                otherwise they are lists of numpy arrays, one per utterance,
                with the padded part cut off.

        """
        # remember the caller's mode: forcing train mode afterwards would
        # silently re-enable dropout for a model that was being used in eval mode
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                # forward encoder
                x_masks = self._source_mask(ilens)
                hs, h_masks = self.encoder(xs, x_masks)

                # integrate speaker embedding
                if self.spk_embed_dim is not None:
                    hs = self._integrate_with_spk_embed(hs, spembs)

                # thin out frames for reduction factor
                # (B, Lmax, odim) ->  (B, Lmax//r, odim)
                if self.reduction_factor > 1:
                    ys_in = ys[:, self.reduction_factor - 1 :: self.reduction_factor]
                    olens_in = olens.new(
                        [olen // self.reduction_factor for olen in olens]
                    )
                else:
                    ys_in, olens_in = ys, olens

                # add first zero frame and remove last frame for auto-regressive
                ys_in = self._add_first_frame_and_remove_last_frame(ys_in)

                # forward decoder
                y_masks = self._target_mask(olens_in)
                zs, _ = self.decoder(ys_in, y_masks, hs, h_masks)

                # calculate final outputs
                if not skip_output:
                    before_outs = self.feat_out(zs).view(zs.size(0), -1, self.odim)
                    if self.postnet is None:
                        after_outs = before_outs
                    else:
                        after_outs = before_outs + self.postnet(
                            before_outs.transpose(1, 2)
                        ).transpose(1, 2)

            # round output lengths down to a multiple of the reduction factor
            if self.reduction_factor > 1:
                olens = olens.new(
                    [olen - olen % self.reduction_factor for olen in olens]
                )

            # store into dict
            att_ws_dict = dict()
            if keep_tensor:
                for name, m in self.named_modules():
                    if isinstance(m, MultiHeadedAttention):
                        att_ws_dict[name] = m.attn
                if not skip_output:
                    att_ws_dict["before_postnet_fbank"] = before_outs
                    att_ws_dict["after_postnet_fbank"] = after_outs
            else:
                for name, m in self.named_modules():
                    if isinstance(m, MultiHeadedAttention):
                        attn = m.attn.cpu().numpy()
                        # cut the padded region off every utterance's maps
                        if "encoder" in name:
                            attn = [
                                a[:, :length, :length]
                                for a, length in zip(attn, ilens.tolist())
                            ]
                        elif "decoder" in name:
                            if "src" in name:
                                attn = [
                                    a[:, :ol, :il]
                                    for a, il, ol in zip(
                                        attn, ilens.tolist(), olens_in.tolist()
                                    )
                                ]
                            elif "self" in name:
                                attn = [
                                    a[:, :length, :length]
                                    for a, length in zip(attn, olens_in.tolist())
                                ]
                            else:
                                logging.warning("unknown attention module: " + name)
                        else:
                            logging.warning("unknown attention module: " + name)
                        att_ws_dict[name] = attn
                if not skip_output:
                    before_outs = before_outs.cpu().numpy()
                    after_outs = after_outs.cpu().numpy()
                    att_ws_dict["before_postnet_fbank"] = [
                        m[:length].T for m, length in zip(before_outs, olens.tolist())
                    ]
                    att_ws_dict["after_postnet_fbank"] = [
                        m[:length].T for m, length in zip(after_outs, olens.tolist())
                    ]
        finally:
            # restore the mode the caller had selected
            self.train(was_training)
        return att_ws_dict

    def _integrate_with_spk_embed(self, hs, spembs):
        """Integrate speaker embedding with hidden states.

        With integration type ``"add"`` the normalized embedding is projected
        and added to every time step. With ``"concat"`` it is repeated along
        time, concatenated to the hidden states and the result is projected.

        Args:
            hs (Tensor): Batch of hidden state sequences (B, Tmax, adim).
            spembs (Tensor): Batch of speaker embeddings (B, spk_embed_dim).

        Returns:
            Tensor: Batch of integrated hidden state sequences (B, Tmax, adim)

        Raises:
            NotImplementedError: If the integration type is neither
                ``"add"`` nor ``"concat"``.

        """
        mode = self.spk_embed_integration_type
        if mode not in ("add", "concat"):
            raise NotImplementedError("support only add or concat.")

        normed = F.normalize(spembs)
        if mode == "add":
            # project first, then broadcast the result over the time axis
            return hs + self.projection(normed).unsqueeze(1)

        # repeat the embedding for every frame, then project the concatenation
        tiled = normed.unsqueeze(1).expand(-1, hs.size(1), -1)
        return self.projection(torch.cat([hs, tiled], dim=-1))

    def _source_mask(self, ilens):
        """Make masks for self-attention.

        The non-padding mask built from ``ilens`` is moved to the device of
        the model parameters and given an extra axis before the last one.

        Args:
            ilens (LongTensor or List): Batch of lengths (B,).

        Returns:
            Tensor: Mask tensor for self-attention.
                    dtype=torch.uint8 in PyTorch 1.2-
                    dtype=torch.bool in PyTorch 1.2+ (including 1.2)

        Examples:
            >>> ilens = [5, 3]
            >>> self._source_mask(ilens)
            tensor([[[1, 1, 1, 1, 1]],
                    [[1, 1, 1, 0, 0]]], dtype=torch.uint8)

        """
        device = next(self.parameters()).device
        non_pad = make_non_pad_mask(ilens)
        return non_pad.to(device).unsqueeze(-2)

    def _target_mask(self, olens):
        """Make masks for masked self-attention.

        The result is the element-wise AND of the non-padding mask built from
        ``olens`` and a subsequent (look-ahead) mask of the same length.

        Args:
            olens (LongTensor or List): Batch of lengths (B,).

        Returns:
            Tensor: Mask tensor for masked self-attention.
                    dtype=torch.uint8 in PyTorch 1.2-
                    dtype=torch.bool in PyTorch 1.2+ (including 1.2)

        Examples:
            >>> olens = [5, 3]
            >>> self._target_mask(olens)
            tensor([[[1, 0, 0, 0, 0],
                     [1, 1, 0, 0, 0],
                     [1, 1, 1, 0, 0],
                     [1, 1, 1, 1, 0],
                     [1, 1, 1, 1, 1]],
                    [[1, 0, 0, 0, 0],
                     [1, 1, 0, 0, 0],
                     [1, 1, 1, 0, 0],
                     [1, 1, 1, 0, 0],
                     [1, 1, 1, 0, 0]]], dtype=torch.uint8)

        """
        device = next(self.parameters()).device
        pad_mask = make_non_pad_mask(olens).to(device)
        max_len = pad_mask.size(-1)
        causal = subsequent_mask(max_len, device=pad_mask.device).unsqueeze(0)
        return pad_mask.unsqueeze(-2) & causal

    @property
    def base_plot_keys(self):
        """Return base key names to plot during training.

        keys should match what `chainer.reporter` reports.
        If you add the key `loss`, the reporter will report `main/loss`
        and `validation/main/loss` values.
        also `loss.png` will be created as a figure visualizing `main/loss`
        and `validation/main/loss` values.

        Returns:
            list: List of strings which are base keys to plot during training.

        """
        keys = ["loss", "l1_loss", "l2_loss", "bce_loss"]
        if self.use_scaled_pos_enc:
            keys.extend(["encoder_alpha", "decoder_alpha"])
        if not self.use_guided_attn_loss:
            return keys

        # one extra curve per module the guided attention loss is applied to
        targets = self.modules_applied_guided_attn
        guided_keys = (
            ("encoder", "enc_attn_loss"),
            ("decoder", "dec_attn_loss"),
            ("encoder-decoder", "enc_dec_attn_loss"),
        )
        for module_name, key in guided_keys:
            if module_name in targets:
                keys.append(key)

        return keys


================================================
FILE: nets/pytorch_backend/e2e_vc_tacotron2.py
================================================
# Copyright 2020 Nagoya University (Wen-Chin Huang)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Tacotron2-VC related modules."""

import logging

from distutils.util import strtobool

import numpy as np
import torch
import torch.nn.functional as F

from espnet.nets.pytorch_backend.rnn.attentions import AttForward
from espnet.nets.pytorch_backend.rnn.attentions import AttForwardTA
from espnet.nets.pytorch_backend.rnn.attentions import AttLoc
from espnet.nets.pytorch_backend.tacotron2.cbhg import CBHG
from espnet.nets.pytorch_backend.tacotron2.cbhg import CBHGLoss
from espnet.nets.pytorch_backend.tacotron2.decoder import Decoder
from espnet.nets.pytorch_backend.tacotron2.encoder import Encoder
from espnet.nets.tts_interface import TTSInterface
from espnet.utils.fill_missing_args import fill_missing_args
from espnet.nets.pytorch_backend.e2e_tts_tacotron2 import (
    GuidedAttentionLoss,  # noqa: H301
    Tacotron2Loss,  # noqa: H301
)


class Tacotron2(TTSInterface, torch.nn.Module):
    """VC Tacotron2 module for VC.

    This is a module of the Tacotron2-based VC model,
    which converts a sequence of acoustic features
    into another sequence of acoustic features.
    """

    @staticmethod
    def add_arguments(parser):
        """Add model-specific arguments to the parser.

        All options are registered in one argument group named
        "tacotron 2 model setting".

        Args:
            parser: Argument parser to extend; it must provide
                ``add_argument_group``.

        Returns:
            The ``parser`` that was passed in.

        """
        group = parser.add_argument_group("tacotron 2 model setting")

        def register(kind, flags, default, help_text, **extra):
            """Register one option; ``flags`` is a flag or a tuple of flags."""
            names = (flags,) if isinstance(flags, str) else flags
            group.add_argument(
                *names, default=default, type=kind, help=help_text, **extra
            )

        # encoder
        register(int, "--elayers", 1, "Number of encoder layers")
        register(int, ("--eunits", "-u"), 512, "Number of encoder hidden units")
        register(int, "--econv-layers", 3, "Number of encoder convolution layers")
        register(int, "--econv-chans", 512, "Number of encoder convolution channels")
        register(int, "--econv-filts", 5, "Filter size of encoder convolution")

        # attention
        register(
            str,
            "--atype",
            "location",
            "Type of attention mechanism",
            choices=["forward_ta", "forward", "location"],
        )
        register(int, "--adim", 512, "Number of attention transformation dimensions")
        register(int, "--aconv-chans", 32, "Number of attention convolution channels")
        register(int, "--aconv-filts", 15, "Filter size of attention convolution")
        register(
            strtobool,
            "--cumulate-att-w",
            True,
            "Whether or not to cumulate attention weights",
        )

        # decoder
        register(int, "--dlayers", 2, "Number of decoder layers")
        register(int, "--dunits", 1024, "Number of decoder hidden units")
        register(int, "--prenet-layers", 2, "Number of prenet layers")
        register(int, "--prenet-units", 256, "Number of prenet hidden units")
        register(int, "--postnet-layers", 5, "Number of postnet layers")
        register(int, "--postnet-chans", 512, "Number of postnet channels")
        register(int, "--postnet-filts", 5, "Filter size of postnet")
        register(
            str, "--output-activation", None, "Output activation function", nargs="?"
        )

        # cbhg
        register(strtobool, "--use-cbhg", False, "Whether to use CBHG module")
        register(
            int,
            "--cbhg-conv-bank-layers",
            8,
            "Number of convoluional bank layers in CBHG",
        )
        register(
            int,
            "--cbhg-conv-bank-chans",
            128,
            "Number of convoluional bank channles in CBHG",
        )
        register(
            int,
            "--cbhg-conv-proj-filts",
            3,
            "Filter size of convoluional projection layer in CBHG",
        )
        register(
            int,
            "--cbhg-conv-proj-chans",
            256,
            "Number of convoluional projection channels in CBHG",
        )
        register(int, "--cbhg-highway-layers", 4, "Number of highway layers in CBHG")
        register(int, "--cbhg-highway-units", 128, "Number of highway units in CBHG")
        register(int, "--cbhg-gru-units", 256, "Number of GRU units in CBHG")

        # model (parameter) related
        register(
            strtobool, "--use-batch-norm", True, "Whether to use batch normalization"
        )
        register(
            strtobool,
            "--use-concate",
            True,
            "Whether to concatenate encoder embedding with decoder outputs",
        )
        register(
            strtobool,
            "--use-residual",
            True,
            "Whether to use residual connection in conv layer",
        )
        register(float, "--dropout-rate", 0.5, "Dropout rate")
        register(float, "--zoneout-rate", 0.1, "Zoneout rate")
        register(int, "--reduction-factor", 1, "Reduction factor (for decoder)")
        register(
            int, "--encoder-reduction-factor", 1, "Reduction factor (for encoder)"
        )
        register(
            int, "--spk-embed-dim", None, "Number of speaker embedding dimensions"
        )
        register(int, "--spc-dim", None, "Number of spectrogram dimensions")
        register(str, "--pretrained-model", None, "Pretrained model path")

        # loss related
        register(
            strtobool,
            "--use-masking",
            False,
            "Whether to use masking in calculation of loss",
        )
        register(
            float,
            "--bce-pos-weight",
            20.0,
            "Positive sample weight in BCE calculation (only for use-masking=True)",
        )
        register(
            strtobool,
            "--use-guided-attn-loss",
            False,
            "Whether to use guided attention loss",
        )
        register(
            float, "--guided-attn-loss-sigma", 0.4, "Sigma in guided attention loss"
        )
        register(
            float, "--guided-attn-loss-lambda", 1.0, "Lambda in guided attention loss"
        )
        register(
            float,
            "--src-reconstruction-loss-lambda",
            1.0,
            "Lambda in source reconstruction loss",
        )
        register(
            float,
            "--trg-reconstruction-loss-lambda",
            1.0,
            "Lambda in target reconstruction loss",
        )
        return parser

    def __init__(self, idim, odim, args=None):
        """Initialize Tacotron2 module.

        Options missing from ``args`` are filled with the defaults declared in
        ``add_arguments``.

        Args:
            idim (int): Dimension of the inputs.
            odim (int): Dimension of the outputs.
            args (Namespace, optional):
                - elayers (int): The number of encoder blstm layers.
                - eunits (int): The number of encoder blstm units.
                - econv_layers (int): The number of encoder conv layers.
                - econv_filts (int): The number of encoder conv filter size.
                - econv_chans (int): The number of encoder conv filter channels.
                - atype (str): Type of attention mechanism
                    ("location", "forward" or "forward_ta").
                - adim (int): The number of dimension of mlp in attention.
                - aconv_chans (int): The number of attention conv filter channels.
                - aconv_filts (int): The number of attention conv filter size.
                - cumulate_att_w (bool): Whether to cumulate previous attention
                    weight (switched off for the forward attention types).
                - dlayers (int): The number of decoder lstm layers.
                - dunits (int): The number of decoder lstm units.
                - prenet_layers (int): The number of prenet layers.
                - prenet_units (int): The number of prenet units.
                - postnet_layers (int): The number of postnet layers.
                - postnet_filts (int): The number of postnet filter size.
                - postnet_chans (int): The number of postnet filter channels.
                - output_activation (str): The name of activation function for
                    outputs; it must be an attribute of ``torch.nn.functional``.
                - use_batch_norm (bool): Whether to use batch normalization.
                - use_concate (bool):
                    Whether to concatenate encoder embedding with decoder lstm outputs.
                - use_residual (bool): Whether to use residual connection in
                    conv layer.
                - dropout_rate (float): Dropout rate.
                - zoneout_rate (float): Zoneout rate.
                - reduction_factor (int): Reduction factor (for decoder).
                - encoder_reduction_factor (int): Reduction factor (for encoder).
                - spk_embed_dim (int): Number of speaker embedding dimensions.
                - spc_dim (int): Number of spectrogram dimensions
                    (only for use_cbhg=True).
                - use_cbhg (bool): Whether to use CBHG module.
                - cbhg_conv_bank_layers (int):
                    The number of convolutional banks in CBHG.
                - cbhg_conv_bank_chans (int):
                    The number of channels of convolutional bank in CBHG.
                - cbhg_conv_proj_filts (int):
                    The number of filter size of projection layer in CBHG.
                - cbhg_conv_proj_chans (int):
                    The number of channels of projection layer in CBHG.
                - cbhg_highway_layers (int):
                    The number of layers of highway network in CBHG.
                - cbhg_highway_units (int):
                    The number of units of highway network in CBHG.
                - cbhg_gru_units (int): The number of units of GRU in CBHG.
                - use_masking (bool): Whether to mask padded part in loss calculation.
                - bce_pos_weight (float): Weight of positive sample of stop token
                    (only for use_masking=True).
                - use_guided_attn_loss (bool): Whether to use guided attention loss.
                - guided_attn_loss_sigma (float): Sigma in guided attention loss.
                - guided_attn_loss_lambda (float): Lambda in guided attention loss.
                - src_reconstruction_loss_lambda (float): Lambda in source
                    reconstruction loss; the source reconstructor is built only
                    when it is positive.
                - trg_reconstruction_loss_lambda (float): Lambda in target
                    reconstruction loss; the target reconstructor is built only
                    when it is positive.
                - pretrained_model (str): Pretrained model path.

        Raises:
            ValueError: If ``output_activation`` names no function in
                ``torch.nn.functional``.
            NotImplementedError: If ``atype`` is not a supported attention type.

        """
        # initialize base classes
        TTSInterface.__init__(self)
        torch.nn.Module.__init__(self)

        # fill missing arguments
        args = fill_missing_args(args, self.add_arguments)

        # store hyperparameters
        self.idim, self.odim = idim, odim
        self.adim = args.adim
        self.spk_embed_dim = args.spk_embed_dim
        self.cumulate_att_w = args.cumulate_att_w
        self.reduction_factor = args.reduction_factor
        self.encoder_reduction_factor = args.encoder_reduction_factor
        self.use_cbhg = args.use_cbhg
        self.use_guided_attn_loss = args.use_guided_attn_loss
        self.src_reconstruction_loss_lambda = args.src_reconstruction_loss_lambda
        self.trg_reconstruction_loss_lambda = args.trg_reconstruction_loss_lambda

        # resolve the activation function for the final output by name
        activation_name = args.output_activation
        if activation_name is None:
            self.output_activation_fn = None
        else:
            if not hasattr(F, activation_name):
                raise ValueError(
                    "there is no such an activation function. (%s)" % activation_name
                )
            self.output_activation_fn = getattr(F, activation_name)

        # settings shared by the encoder and both reconstructors
        conv_blstm_kwargs = dict(
            input_layer="linear",
            elayers=args.elayers,
            eunits=args.eunits,
            econv_layers=args.econv_layers,
            econv_chans=args.econv_chans,
            econv_filts=args.econv_filts,
            use_batch_norm=args.use_batch_norm,
            use_residual=args.use_residual,
            dropout_rate=args.dropout_rate,
        )

        # encoder: input frames are stacked by the encoder reduction factor
        self.enc = Encoder(
            idim=idim * args.encoder_reduction_factor, **conv_blstm_kwargs
        )

        # the decoder sees encoder states, optionally with speaker embedding appended
        dec_idim = args.eunits
        if args.spk_embed_dim is not None:
            dec_idim += args.spk_embed_dim

        # attention
        att_args = (
            dec_idim,
            args.dunits,
            args.adim,
            args.aconv_chans,
            args.aconv_filts,
        )
        if args.atype == "location":
            att = AttLoc(*att_args)
        elif args.atype == "forward":
            att = AttForward(*att_args)
        elif args.atype == "forward_ta":
            att = AttForwardTA(*att_args, odim)
        else:
            raise NotImplementedError("Support only location or forward")
        if args.atype != "location" and self.cumulate_att_w:
            logging.warning(
                "cumulation of attention weights is disabled in forward attention."
            )
            self.cumulate_att_w = False

        # decoder
        self.dec = Decoder(
            idim=dec_idim,
            odim=odim,
            att=att,
            dlayers=args.dlayers,
            dunits=args.dunits,
            prenet_layers=args.prenet_layers,
            prenet_units=args.prenet_units,
            postnet_layers=args.postnet_layers,
            postnet_chans=args.postnet_chans,
            postnet_filts=args.postnet_filts,
            output_activation_fn=self.output_activation_fn,
            cumulate_att_w=self.cumulate_att_w,
            use_batch_norm=args.use_batch_norm,
            use_concate=args.use_concate,
            dropout_rate=args.dropout_rate,
            zoneout_rate=args.zoneout_rate,
            reduction_factor=args.reduction_factor,
        )

        # losses and optional modules
        self.taco2_loss = Tacotron2Loss(
            use_masking=args.use_masking, bce_pos_weight=args.bce_pos_weight
        )
        if self.use_guided_attn_loss:
            self.attn_loss = GuidedAttentionLoss(
                sigma=args.guided_attn_loss_sigma,
                alpha=args.guided_attn_loss_lambda,
            )
        if self.use_cbhg:
            self.cbhg = CBHG(
                idim=odim,
                odim=args.spc_dim,
                conv_bank_layers=args.cbhg_conv_bank_layers,
                conv_bank_chans=args.cbhg_conv_bank_chans,
                conv_proj_filts=args.cbhg_conv_proj_filts,
                conv_proj_chans=args.cbhg_conv_proj_chans,
                highway_layers=args.cbhg_highway_layers,
                highway_units=args.cbhg_highway_units,
                gru_units=args.cbhg_gru_units,
            )
            self.cbhg_loss = CBHGLoss(use_masking=args.use_masking)

        # NOTE(review): both linear layers below take econv_chans input features;
        # presumably that matches the reconstructor output width - confirm against
        # Encoder when eunits != econv_chans.
        if self.src_reconstruction_loss_lambda > 0:
            self.src_reconstructor = Encoder(idim=dec_idim, **conv_blstm_kwargs)
            self.src_reconstructor_linear = torch.nn.Linear(
                args.econv_chans, idim * args.encoder_reduction_factor
            )
            self.src_reconstruction_loss = CBHGLoss(use_masking=args.use_masking)
        if self.trg_reconstruction_loss_lambda > 0:
            self.trg_reconstructor = Encoder(idim=dec_idim, **conv_blstm_kwargs)
            self.trg_reconstructor_linear = torch.nn.Linear(
                args.econv_chans, odim * args.reduction_factor
            )
            self.trg_reconstruction_loss = CBHGLoss(use_masking=args.use_masking)

        # load pretrained model
        if args.pretrained_model is not None:
            self.load_pretrained_model(args.pretrained_model)

    def forward(
        self, xs, ilens, ys, labels, olens, spembs=None, spcs=None, *args, **kwargs
    ):
        """Calculate forward propagation.

        Runs the encoder and decoder, sums the enabled loss terms, reports every
        individual term through ``self.reporter`` and returns the total loss.

        Args:
            xs (Tensor): Batch of padded acoustic features (B, Tmax, idim).
            ilens (LongTensor): Batch of lengths of each input batch (B,).
            ys (Tensor): Batch of padded target features (B, Lmax, odim).
            labels (Tensor): Batch of targets passed to ``self.taco2_loss``
                together with the decoder logits. It is truncated along dim 1
                in step with ``ys`` (presumably shaped (B, Lmax) - confirm
                against the caller).
            olens (LongTensor): Batch of the lengths of each target (B,).
            spembs (Tensor, optional):
                Batch of speaker embedding vectors (B, spk_embed_dim).
            spcs (Tensor, optional):
                Batch of groundtruth spectrograms (B, Lmax, spc_dim).

        Returns:
            Tensor: Loss value.

        Note:
            When ``self.reduction_factor > 1`` the last frame of the (sliced)
            ``labels`` is overwritten in place with 1.0. Slicing a tensor
            yields a view, so this write is also visible in the tensor that
            the caller passed in.

        """
        # remove unnecessary padded part (for multi-gpus)
        max_in = max(ilens)
        max_out = max(olens)
        if max_in != xs.shape[1]:
            xs = xs[:, :max_in]
        if max_out != ys.shape[1]:
            ys = ys[:, :max_out]
            labels = labels[:, :max_out]

        # thin out input frames for reduction factor
        # (B, Lmax, idim) ->  (B, Lmax // r, idim * r)
        if self.encoder_reduction_factor > 1:
            B, Lmax, idim = xs.shape
            # drop the trailing frames that do not fill a whole group of r frames
            if Lmax % self.encoder_reduction_factor != 0:
                xs = xs[:, : -(Lmax % self.encoder_reduction_factor), :]
            # Lmax still holds the length before trimming; int() truncates the
            # quotient, which gives the number of complete groups
            xs_ds = xs.contiguous().view(
                B,
                int(Lmax / self.encoder_reduction_factor),
                idim * self.encoder_reduction_factor,
            )
            ilens_ds = ilens.new(
                [ilen // self.encoder_reduction_factor for ilen in ilens]
            )
        else:
            xs_ds, ilens_ds = xs, ilens

        # calculate tacotron2 outputs
        hs, hlens = self.enc(xs_ds, ilens_ds)
        if self.spk_embed_dim is not None:
            # normalize the speaker embedding, repeat it for every encoder
            # frame and append it to the encoder states along the feature axis
            spembs = F.normalize(spembs).unsqueeze(1).expand(-1, hs.size(1), -1)
            hs = torch.cat([hs, spembs], dim=-1)
        after_outs, before_outs, logits, att_ws = self.dec(hs, hlens, ys)

        # calculate src reconstruction
        if self.src_reconstruction_loss_lambda > 0:
            B, _in_length, _adim = hs.shape
            xt, xtlens = self.src_reconstructor(hs, hlens)
            xt = self.src_reconstructor_linear(xt)
            # undo the frame stacking so that xt has idim features per frame
            if self.encoder_reduction_factor > 1:
                xt = xt.view(B, -1, self.idim)

        # calculate trg reconstruction
        if self.trg_reconstruction_loss_lambda > 0:
            # reduced target lengths, re-sorted in descending order
            # (the sort detaches each length from its batch position)
            olens_trg_cp = olens.new(
                sorted([olen // self.reduction_factor for olen in olens], reverse=True)
            )
            B, _in_length, _adim = hs.shape
            _, _out_length, _ = att_ws.shape
            # attention-weighted sum of the encoder states for every output step
            # att_R should be [B, out_length / r_d, adim]
            att_R = torch.sum(
                hs.view(B, 1, _in_length, _adim)
                * att_ws.view(B, _out_length, _in_length, 1),
                dim=2,
            )
            yt, ytlens = self.trg_reconstructor(
                att_R, olens_trg_cp
            )  # is using olens correct?
            yt = self.trg_reconstructor_linear(yt)
            if self.reduction_factor > 1:
                yt = yt.view(
                    B, -1, self.odim
                )  # now att_R should be [B, out_length, adim]

        # modify mod part of groundtruth
        # (lengths are rounded down to a multiple of the reduction factor and
        #  the padded tensors are cut to the new maximum length)
        if self.reduction_factor > 1:
            olens = olens.new([olen - olen % self.reduction_factor for olen in olens])
            max_out = max(olens)
            ys = ys[:, :max_out]
            labels = labels[:, :max_out]
            # NOTE: in-place write into a view of the caller's tensor
            labels[:, -1] = 1.0  # make sure at least one frame has 1
        if self.encoder_reduction_factor > 1:
            ilens = ilens.new(
                [ilen - ilen % self.encoder_reduction_factor for ilen in ilens]
            )
            max_in = max(ilens)
            xs = xs[:, :max_in]

        # calculate taco2 loss
        l1_loss, mse_loss, bce_loss = self.taco2_loss(
            after_outs, before_outs, logits, ys, labels, olens
        )
        loss = l1_loss + mse_loss + bce_loss
        report_keys = [
            {"l1_loss": l1_loss.item()},
            {"mse_loss": mse_loss.item()},
            {"bce_loss": bce_loss.item()},
        ]

        # calculate context preservation loss
        # NOTE: in this method the *_reconstruction_loss_lambda values only act
        #   as on/off switches; the L1 term is added unweighted and the MSE
        #   term is reported but not added to the total loss.
        if self.src_reconstruction_loss_lambda > 0:
            src_recon_l1_loss, src_recon_mse_loss = self.src_reconstruction_loss(
                xt, xs, ilens
            )
            loss = loss + src_recon_l1_loss
            report_keys += [
                {"src_recon_l1_loss": src_recon_l1_loss.item()},
                {"src_recon_mse_loss": src_recon_mse_loss.item()},
            ]
        if self.trg_reconstruction_loss_lambda > 0:
            trg_recon_l1_loss, trg_recon_mse_loss = self.trg_reconstruction_loss(
                yt, ys, olens
            )
            loss = loss + trg_recon_l1_loss
            report_keys += [
                {"trg_recon_l1_loss": trg_recon_l1_loss.item()},
                {"trg_recon_mse_loss": trg_recon_mse_loss.item()},
            ]

        # calculate attention loss
        if self.use_guided_attn_loss:
            # NOTE(kan-bayashi): length of output for auto-regressive input
            #   will be changed when r > 1
            if self.encoder_reduction_factor > 1:
                ilens_in = ilens.new(
                    [ilen // self.encoder_reduction_factor for ilen in ilens]
                )
            else:
                ilens_in = ilens
            if self.reduction_factor > 1:
                olens_in = olens.new([olen // self.reduction_factor for olen in olens])
            else:
                olens_in = olens
            attn_loss = self.attn_loss(att_ws, ilens_in, olens_in)
            loss = loss + attn_loss
            report_keys += [
                {"attn_loss": attn_loss.item()},
            ]

        # calculate cbhg loss
        if self.use_cbhg:
            # remove unnecessary padded part (for multi-gpus)
            # (max_out is the value updated above when reduction_factor > 1)
            if max_out != spcs.shape[1]:
                spcs = spcs[:, :max_out]

            # calculate cbhg outputs & loss and report them
            cbhg_outs, _ = self.cbhg(after_outs, olens)
            cbhg_l1_loss, cbhg_mse_loss = self.cbhg_loss(cbhg_outs, spcs, olens)
            loss = loss + cbhg_l1_loss + cbhg_mse_loss
            report_keys += [
                {"cbhg_l1_loss": cbhg_l1_loss.item()},
                {"cbhg_mse_loss": cbhg_mse_loss.item()},
            ]

        report_keys += [{"loss": loss.item()}]
        self.reporter.report(report_keys)

        return loss

    def inference(self, x, inference_args, spemb=None, *args, **kwargs):
        """Convert one utterance of acoustic features into output features.

        Args:
            x (Tensor): Input sequence of acoustic features (T, idim).
            inference_args (Namespace): Decoding options, read as attributes:
                - threshold (float): Threshold in inference.
                - minlenratio (float): Minimum length ratio in inference.
                - maxlenratio (float): Maximum length ratio in inference.
            spemb (Tensor, optional): Speaker embedding vector (spk_embed_dim).

        Returns:
            Tensor: Output sequence of features (L, odim). When the CBHG
                module is enabled this is the CBHG output instead.
            Tensor: Output sequence of stop probabilities (L,).
            Tensor: Attention weights (L, T).

        """
        # decoding options
        threshold, minlenratio, maxlenratio = (
            inference_args.threshold,
            inference_args.minlenratio,
            inference_args.maxlenratio,
        )

        # stack every r consecutive input frames into one encoder frame
        # (T, idim) -> (T // r, idim * r)
        r_enc = self.encoder_reduction_factor
        x_ds = x
        if r_enc > 1:
            n_frames, feat_dim = x.shape
            n_leftover = n_frames % r_enc
            if n_leftover != 0:
                # frames that do not fill a complete group are discarded
                x = x[:-n_leftover, :]
            x_ds = x.contiguous().view(n_frames // r_enc, feat_dim * r_enc)

        # encode, optionally attaching the speaker embedding to every frame
        h = self.enc.inference(x_ds)
        if self.spk_embed_dim is not None:
            spemb = F.normalize(spemb, dim=0).unsqueeze(0).expand(h.size(0), -1)
            h = torch.cat([h, spemb], dim=-1)

        # decode
        outs, probs, att_ws = self.dec.inference(h, threshold, minlenratio, maxlenratio)

        if not self.use_cbhg:
            return outs, probs, att_ws
        return self.cbhg.inference(outs), probs, att_ws

    def calculate_all_attentions(self, xs, ilens, ys, spembs=None, *args, **kwargs):
        """Calculate all of the attention weights.

        The computation runs in evaluation mode without gradient tracking.
        The training/evaluation mode that was active on entry is restored on
        exit, including when the forward pass raises.

        Args:
            xs (Tensor): Batch of padded acoustic features (B, Tmax, idim).
            ilens (LongTensor): Batch of lengths of each input batch (B,).
                Tensors and numpy arrays are converted to a list of int.
            ys (Tensor): Batch of padded target features (B, Lmax, odim).
            spembs (Tensor, optional):
                Batch of speaker embedding vectors (B, spk_embed_dim).

        Returns:
            numpy.ndarray: Batch of attention weights (B, Lmax, Tmax).

        """
        # check ilens type (should be list of int)
        if isinstance(ilens, (torch.Tensor, np.ndarray)):
            ilens = list(map(int, ilens))

        # remember the caller's mode so that it can be put back afterwards
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                # thin out input frames for reduction factor
                # (B, Lmax, idim) ->  (B, Lmax // r, idim * r)
                r_enc = self.encoder_reduction_factor
                if r_enc > 1:
                    B, Lmax, idim = xs.shape
                    # drop trailing frames that do not fill a whole group
                    if Lmax % r_enc != 0:
                        xs = xs[:, : -(Lmax % r_enc), :]
                    xs_ds = xs.contiguous().view(B, Lmax // r_enc, idim * r_enc)
                    ilens_ds = [ilen // r_enc for ilen in ilens]
                else:
                    xs_ds, ilens_ds = xs, ilens

                hs, hlens = self.enc(xs_ds, ilens_ds)
                if self.spk_embed_dim is not None:
                    spembs = (
                        F.normalize(spembs).unsqueeze(1).expand(-1, hs.size(1), -1)
                    )
                    hs = torch.cat([hs, spembs], dim=-1)
                att_ws = self.dec.calculate_all_attentions(hs, hlens, ys)
        finally:
            # restore the previous mode instead of unconditionally forcing
            # training mode
            self.train(was_training)

        return att_ws.cpu().numpy()

    @property
    def base_plot_keys(self):
        """List the loss names that should be plotted during training.

        The names have to match the keys handed to the reporter. For a key
        such as `loss`, the values `main/loss` and `validation/main/loss`
        are reported and drawn together into `loss.png`.

        Returns:
            list: List of strings which are base keys to plot during training.

        """
        # (enabled?, keys) pairs, in the order the keys must appear
        optional_keys = (
            (self.use_guided_attn_loss, ["attn_loss"]),
            (self.use_cbhg, ["cbhg_l1_loss", "cbhg_mse_loss"]),
            (
                self.src_reconstruction_loss_lambda > 0,
                ["src_recon_l1_loss", "src_recon_mse_loss"],
            ),
            (
                self.trg_reconstruction_loss_lambda > 0,
                ["trg_recon_l1_loss", "trg_recon_mse_loss"],
            ),
        )

        # always-present tacotron2 losses come first
        keys = ["loss", "l1_loss", "mse_loss", "bce_loss"]
        for enabled, names in optional_keys:
            if enabled:
                keys.extend(names)
        return keys

    def _sort_by_length(self, xs, ilens):
        sort_ilens, sort_idx = ilens.sort(0, descending=True)
        return xs[sort_idx], ilens[sort_idx], sort_idx

    def _revert_sort_by_length(self, xs, ilens, sort_idx):
        _, revert_idx = sort_idx.sort(0)
        return xs[revert_idx], ilens[revert_idx]


================================================
FILE: nets/pytorch_backend/e2e_vc_transformer.py
================================================
# Copyright 2020 Nagoya University (Wen-Chin Huang)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Voice Transformer Network (Transformer-VC) related modules."""

import logging

import torch
import torch.nn.functional as F

from espnet.nets.pytorch_backend.e2e_asr_transformer import subsequent_mask
from espnet.nets.pytorch_backend.e2e_tts_tacotron2 import (
    Tacotron2Loss as TransformerLoss,  # noqa: H301
)
from espnet.nets.pytorch_backend.nets_utils import make_non_pad_mask
from espnet.nets.pytorch_backend.tacotron2.decoder import Postnet
from espnet.nets.pytorch_backend.tacotron2.decoder import Prenet as DecoderPrenet
from espnet.nets.pytorch_backend.tacotron2.encoder import Encoder as EncoderPrenet
from espnet.nets.pytorch_backend.transformer.attention import MultiHeadedAttention
from espnet.nets.pytorch_backend.transformer.decoder import Decoder
from espnet.nets.pytorch_backend.transformer.embedding import PositionalEncoding
from espnet.nets.pytorch_backend.transformer.embedding import ScaledPositionalEncoding
from espnet.nets.pytorch_backend.transformer.encoder import Encoder
from espnet.nets.pytorch_backend.transformer.initializer import initialize
from espnet.nets.tts_interface import TTSInterface
from espnet.utils.cli_utils import strtobool
from espnet.utils.fill_missing_args import fill_missing_args
from espnet.nets.pytorch_backend.e2e_tts_transformer import (
    GuidedMultiHeadAttentionLoss,  # noqa: H301
    TTSPlot,  # noqa: H301
)


class Transformer(TTSInterface, torch.nn.Module):
    """VC Transformer module.

    This is a module of the Voice Transformer Network
    (a.k.a. VTN or Transformer-VC) described in
    `Voice Transformer Network: Sequence-to-Sequence
    Voice Conversion Using Transformer with
    Text-to-Speech Pretraining`_,
    which convert the sequence of acoustic features
    into the sequence of acoustic features.

    .. _`Voice Transformer Network: Sequence-to-Sequence
        Voice Conversion Using Transformer with
        Text-to-Speech Pretraining`:
        https://arxiv.org/pdf/1912.06813.pdf

    """

    @staticmethod
    def add_arguments(parser):
        """Add model-specific arguments to the parser.

        All options are registered in an argument group named
        "transformer model setting".

        Args:
            parser (argparse.ArgumentParser): Parser to extend.

        Returns:
            argparse.ArgumentParser: The same parser that was passed in.

        """
        group = parser.add_argument_group("transformer model setting")
        # network structure related
        group.add_argument(
            "--eprenet-conv-layers",
            default=0,
            type=int,
            help="Number of encoder prenet convolution layers",
        )
        group.add_argument(
            "--eprenet-conv-chans",
            default=0,
            type=int,
            help="Number of encoder prenet convolution channels",
        )
        group.add_argument(
            "--eprenet-conv-filts",
            default=0,
            type=int,
            help="Filter size of encoder prenet convolution",
        )
        group.add_argument(
            "--transformer-input-layer",
            default="linear",
            type=str,
            help="Type of input layer (linear or conv2d)",
        )
        group.add_argument(
            "--dprenet-layers",
            default=2,
            type=int,
            help="Number of decoder prenet layers",
        )
        group.add_argument(
            "--dprenet-units",
            default=256,
            type=int,
            help="Number of decoder prenet hidden units",
        )
        group.add_argument(
            "--elayers", default=3, type=int, help="Number of encoder layers"
        )
        group.add_argument(
            "--eunits", default=1536, type=int, help="Number of encoder hidden units"
        )
        group.add_argument(
            "--adim",
            default=384,
            type=int,
            help="Number of attention transformation dimensions",
        )
        group.add_argument(
            "--aheads",
            default=4,
            type=int,
            help="Number of heads for multi head attention",
        )
        group.add_argument(
            "--dlayers", default=3, type=int, help="Number of decoder layers"
        )
        group.add_argument(
            "--dunits", default=1536, type=int, help="Number of decoder hidden units"
        )
        group.add_argument(
            "--positionwise-layer-type",
            default="linear",
            type=str,
            choices=["linear", "conv1d", "conv1d-linear"],
            help="Positionwise layer type.",
        )
        group.add_argument(
            "--positionwise-conv-kernel-size",
            default=1,
            type=int,
            help="Kernel size of positionwise conv1d layer",
        )
        group.add_argument(
            "--postnet-layers", default=5, type=int, help="Number of postnet layers"
        )
        group.add_argument(
            "--postnet-chans", default=256, type=int, help="Number of postnet channels"
        )
        group.add_argument(
            "--postnet-filts", default=5, type=int, help="Filter size of postnet"
        )
        # NOTE: adjacent string literals are joined without a separator, so the
        #   first fragment of a split help text must end with a space
        group.add_argument(
            "--use-scaled-pos-enc",
            default=True,
            type=strtobool,
            help="Use trainable scaled positional encoding "
            "instead of the fixed scale one.",
        )
        group.add_argument(
            "--use-batch-norm",
            default=True,
            type=strtobool,
            help="Whether to use batch normalization",
        )
        group.add_argument(
            "--encoder-normalize-before",
            default=False,
            type=strtobool,
            help="Whether to apply layer norm before encoder block",
        )
        group.add_argument(
            "--decoder-normalize-before",
            default=False,
            type=strtobool,
            help="Whether to apply layer norm before decoder block",
        )
        group.add_argument(
            "--encoder-concat-after",
            default=False,
            type=strtobool,
            help="Whether to concatenate attention layer's input and output in encoder",
        )
        group.add_argument(
            "--decoder-concat-after",
            default=False,
            type=strtobool,
            help="Whether to concatenate attention layer's input and output in decoder",
        )
        group.add_argument(
            "--reduction-factor",
            default=1,
            type=int,
            help="Reduction factor (for decoder)",
        )
        group.add_argument(
            "--encoder-reduction-factor",
            default=1,
            type=int,
            help="Reduction factor (for encoder)",
        )
        group.add_argument(
            "--spk-embed-dim",
            default=None,
            type=int,
            help="Number of speaker embedding dimensions",
        )
        group.add_argument(
            "--spk-embed-integration-type",
            type=str,
            default="add",
            choices=["add", "concat"],
            help="How to integrate speaker embedding",
        )
        # training related
        group.add_argument(
            "--transformer-init",
            type=str,
            default="pytorch",
            choices=[
                "pytorch",
                "xavier_uniform",
                "xavier_normal",
                "kaiming_uniform",
                "kaiming_normal",
            ],
            help="How to initialize transformer parameters",
        )
        group.add_argument(
            "--initial-encoder-alpha",
            type=float,
            default=1.0,
            help="Initial alpha value in encoder's ScaledPositionalEncoding",
        )
        group.add_argument(
            "--initial-decoder-alpha",
            type=float,
            default=1.0,
            help="Initial alpha value in decoder's ScaledPositionalEncoding",
        )
        group.add_argument(
            "--transformer-lr",
            default=1.0,
            type=float,
            help="Initial value of learning rate",
        )
        group.add_argument(
            "--transformer-warmup-steps",
            default=4000,
            type=int,
            help="Optimizer warmup steps",
        )
        group.add_argument(
            "--transformer-enc-dropout-rate",
            default=0.1,
            type=float,
            help="Dropout rate for transformer encoder except for attention",
        )
        group.add_argument(
            "--transformer-enc-positional-dropout-rate",
            default=0.1,
            type=float,
            help="Dropout rate for transformer encoder positional encoding",
        )
        group.add_argument(
            "--transformer-enc-attn-dropout-rate",
            default=0.1,
            type=float,
            help="Dropout rate for transformer encoder self-attention",
        )
        group.add_argument(
            "--transformer-dec-dropout-rate",
            default=0.1,
            type=float,
            help="Dropout rate for transformer decoder "
            "except for attention and pos encoding",
        )
        group.add_argument(
            "--transformer-dec-positional-dropout-rate",
            default=0.1,
            type=float,
            help="Dropout rate for transformer decoder positional encoding",
        )
        group.add_argument(
            "--transformer-dec-attn-dropout-rate",
            default=0.1,
            type=float,
            help="Dropout rate for transformer decoder self-attention",
        )
        group.add_argument(
            "--transformer-enc-dec-attn-dropout-rate",
            default=0.1,
            type=float,
            help="Dropout rate for transformer encoder-decoder attention",
        )
        group.add_argument(
            "--eprenet-dropout-rate",
            default=0.5,
            type=float,
            help="Dropout rate in encoder prenet",
        )
        group.add_argument(
            "--dprenet-dropout-rate",
            default=0.5,
            type=float,
            help="Dropout rate in decoder prenet",
        )
        group.add_argument(
            "--postnet-dropout-rate",
            default=0.5,
            type=float,
            help="Dropout rate in postnet",
        )
        group.add_argument(
            "--pretrained-model", default=None, type=str, help="Pretrained model path"
        )

        # loss related
        group.add_argument(
            "--use-masking",
            default=True,
            type=strtobool,
            help="Whether to use masking in calculation of loss",
        )
        group.add_argument(
            "--use-weighted-masking",
            default=False,
            type=strtobool,
            help="Whether to use weighted masking in calculation of loss",
        )
        group.add_argument(
            "--loss-type",
            default="L1",
            choices=["L1", "L2", "L1+L2"],
            help="How to calc loss",
        )
        group.add_argument(
            "--bce-pos-weight",
            default=5.0,
            type=float,
            help="Positive sample weight in BCE calculation "
            "(only for use-masking=True)",
        )
        group.add_argument(
            "--use-guided-attn-loss",
            default=False,
            type=strtobool,
            help="Whether to use guided attention loss",
        )
        group.add_argument(
            "--guided-attn-loss-sigma",
            default=0.4,
            type=float,
            help="Sigma in guided attention loss",
        )
        group.add_argument(
            "--guided-attn-loss-lambda",
            default=1.0,
            type=float,
            help="Lambda in guided attention loss",
        )
        group.add_argument(
            "--num-heads-applied-guided-attn",
            default=2,
            type=int,
            help="Number of heads in each layer to be applied guided attention loss "
            "if set -1, all of the heads will be applied.",
        )
        group.add_argument(
            "--num-layers-applied-guided-attn",
            default=2,
            type=int,
            help="Number of layers to be applied guided attention loss "
            "if set -1, all of the layers will be applied.",
        )
        group.add_argument(
            "--modules-applied-guided-attn",
            type=str,
            nargs="+",
            default=["encoder-decoder"],
            help="Module name list to be applied guided attention loss",
        )
        return parser

    @property
    def attention_plot_class(self):
        """Class used to draw the attention weight plots (``TTSPlot``)."""
        plot_class = TTSPlot
        return plot_class

    def __init__(self, idim, odim, args=None):
        """Initialize Transformer-VC module.

        Args:
            idim (int): Dimension of the inputs.
            odim (int): Dimension of the outputs.
            args (Namespace, optional):
                - eprenet_conv_layers (int):
                    Number of encoder prenet convolution layers.
                - eprenet_conv_chans (int):
                    Number of encoder prenet convolution channels.
                - eprenet_conv_filts (int):
                    Filter size of encoder prenet convolution.
                - transformer_input_layer (str): Input layer before the encoder.
                - dprenet_layers (int): Number of decoder prenet layers.
                - dprenet_units (int): Number of decoder prenet hidden units.
                - elayers (int): Number of encoder layers.
                - eunits (int): Number of encoder hidden units.
                - adim (int): Number of attention transformation dimensions.
                - aheads (int): Number of heads for multi head attention.
                - dlayers (int): Number of decoder layers.
                - dunits (int): Number of decoder hidden units.
                - postnet_layers (int): Number of postnet layers.
                - postnet_chans (int): Number of postnet channels.
                - postnet_filts (int): Filter size of postnet.
                - use_scaled_pos_enc (bool):
                    Whether to use trainable scaled positional encoding.
                - use_batch_norm (bool):
                    Whether to use batch normalization in encoder prenet.
                - encoder_normalize_before (bool):
                    Whether to perform layer normalization before encoder block.
                - decoder_normalize_before (bool):
                    Whether to perform layer normalization before decoder block.
                - encoder_concat_after (bool): Whether to concatenate
                    attention layer's input and output in encoder.
                - decoder_concat_after (bool): Whether to concatenate
                    attention layer's input and output in decoder.
                - reduction_factor (int): Reduction factor (for decoder).
                - encoder_reduction_factor (int): Reduction factor (for encoder).
                - spk_embed_dim (int): Number of speaker embedding dimenstions.
                - spk_embed_integration_type: How to integrate speaker embedding.
                - transformer_init (float): How to initialize transformer parameters.
                - transformer_lr (float): Initial value of learning rate.
                - transformer_warmup_steps (int): Optimizer warmup steps.
                - transformer_enc_dropout_rate (float):
                    Dropout rate in encoder except attention & positional encoding.
                - transformer_enc_positional_dropout_rate (float):
                    Dropout rate after encoder positional encoding.
                - transformer_enc_attn_dropout_rate (float):
                    Dropout rate in encoder self-attention module.
                - transformer_dec_dropout_rate (float):
                    Dropout rate in decoder except attention & positional encoding.
                - transformer_dec_positional_dropout_rate (float):
                    Dropout rate after decoder positional encoding.
                - transformer_dec_attn_dropout_rate (float):
                    Dropout rate in deocoder self-attention module.
                - transformer_enc_dec_attn_dropout_rate (float):
                    Dropout rate in encoder-deocoder attention module.
                - eprenet_dropout_rate (float): Dropout rate in encoder prenet.
                - dprenet_dropout_rate (float): Dropout rate in decoder prenet.
                - postnet_dropout_rate (float): Dropout rate in postnet.
                - use_masking (bool):
                    Whether to apply masking for padded part in loss calculation.
                - use_weighted_masking (bool):
                    Whether to apply weighted masking in loss calculation.
                - bce_pos_weight (float): Positive sample weight in bce calculation
                    (only for use_masking=true).
                - loss_type (str): How to calculate loss.
                - use_guided_attn_loss (bool): Whether to use guided attention loss.
                - num_heads_applied_guided_attn (int):
                    Number of heads in each layer to apply guided attention loss.
                - num_layers_applied_guided_attn (int):
                    Number of layers to apply guided attention loss.
                - modules_applied_guided_attn (list):
                    List of module names to apply guided attention loss.
                - guided-attn-loss-sigma (float) Sigma in guided attention loss.
                - guided-attn-loss-lambda (float): Lambda in guided attention loss.

        """
        # initialize base classes
        TTSInterface.__init__(self)
        torch.nn.Module.__init__(self)

        # fill missing arguments
        args = fill_missing_args(args, self.add_arguments)

        # store hyperparameters
        self.idim = idim
        self.odim = odim
        self.spk_embed_dim = args.spk_embed_dim
        if self.spk_embed_dim is not None:
            self.spk_embed_integration_type = args.spk_embed_integration_type
        self.use_scaled_pos_enc = args.use_scaled_pos_enc
        self.reduction_factor = args.reduction_factor
        self.encoder_reduction_factor = args.encoder_reduction_factor
        self.transformer_input_layer = args.transformer_input_layer
        self.loss_type = args.loss_type
        self.use_guided_attn_loss = args.use_guided_attn_loss
        if self.use_guided_attn_loss:
            if args.num_layers_applied_guided_attn == -1:
                self.num_layers_applied_guided_attn = args.elayers
            else:
                self.num_layers_applied_guided_attn = (
                    args.num_layers_applied_guided_attn
                )
            if args.num_heads_applied_guided_attn == -1:
                self.num_heads_applied_guided_attn = args.aheads
            else:
                self.num_heads_applied_guided_attn = args.num_heads_applied_guided_attn
            self.modules_applied_guided_attn = args.modules_applied_guided_attn

        # use idx 0 as padding idx
        padding_idx = 0

        # get positional encoding class
        pos_enc_class = (
            ScaledPositionalEncoding if self.use_scaled_pos_enc else PositionalEncoding
        )

        # define transformer encoder
        if args.eprenet_conv_layers != 0:
            # encoder prenet
            encoder_input_layer = torch.nn.Sequential(
                EncoderPrenet(
                    idim=idim,
                    elayers=0,
                    econv_layers=args.eprenet_conv_layers,
                    econv_chans=args.eprenet_conv_chans,
                    econv_filts=args.eprenet_conv_filts,
                    use_batch_norm=args.use_batch_norm,
                    dropout_rate=args.eprenet_dropout_rate,
                    padding_idx=padding_idx,
                    input_layer=torch.nn.Linear(
                        idim * args.encoder_reduction_factor, idim
                    ),
                ),
                torch.nn.Linear(args.eprenet_conv_chans, args.adim),
            )
        elif args.transformer_input_layer == "linear":
            encoder_input_layer = torch.nn.Linear(
                idim * args.encoder_reduction_factor, args.adim
            )
        else:
            encoder_input_layer = args.transformer_input_layer
        self.encoder = Encoder(
            idim=idim,
            attention_dim=args.adim,
            attention_heads=args.aheads,
            linear_units=args.eunits,
            num_blocks=args.elayers,
            input_layer=encoder_input_layer,
            dropout_rate=args.transformer_enc_dropout_rate,
            positional_dropout_rate=args.transformer_enc_positional_dropout_rate,
            attention_dropout_rate=args.transformer_enc_attn_dropout_rate,
            pos_enc_class=pos_enc_class,
            normalize_before=args.encoder_normalize_before,
            concat_after=args.encoder_concat_after,
            positionwise_layer_type=args.positionwise_layer_type,
            positionwise_conv_kernel_size=args.positionwise_conv_kernel_size,
        )

        # define projection layer
        if self.spk_embed_dim is not None:
            if self.spk_embed_integration_type == "add":
                self.projection = torch.nn.Linear(self.spk_embed_dim, args.adim)
            else:
                self.projection = torch.nn.Linear(
                    args.adim + self.spk_embed_dim, args.adim
                )

        # define transformer decoder
        if args.dprenet_layers != 0:
            # decoder prenet
            decoder_input_layer = torch.nn.Sequential(
                DecoderPrenet(
                    idim=odim,
                    n_layers=args.dprenet_layers,
                    n_units=args.dprenet_units,
                    dropout_rate=args.dprenet_dropout_rate,
                ),
                torch.nn.Linear(args.dprenet_units, args.adim),
            )
        else:
            decoder_input_layer = "linear"
        self.decoder = Decoder(
            odim=-1,
            attention_dim=args.adim,
            attention_heads=args.aheads,
            linear_units=args.dunits,
            num_blocks=args.dlayers,
            dropout_rate=args.transformer_dec_dropout_rate,
            positional_dropout_rate=args.transformer_dec_positional_dropout_rate,
            self_attention_dropout_rate=args.transformer_dec_attn_dropout_rate,
            src_attention_dropout_rate=args.transformer_enc_dec_attn_dropout_rate,
            input_layer=decoder_input_layer,
            use_output_layer=False,
            pos_enc_class=pos_enc_class,
            normalize_before=args.decoder_normalize_before,
            concat_after=args.decoder_concat_after,
        )

        # define final projection
        self.feat_out = torch.nn.Linear(args.adim, odim * args.reduction_factor)
        self.prob_out = torch.nn.Linear(args.adim, args.reduction_factor)

        # define postnet
        self.postnet = (
            None
            if args.postnet_layers == 0
            else Postnet(
                idim=idim,
                odim=odim,
                n_layers=args.postnet_layers,
                n_chans=args.postnet_chans,
                n_filts=args.postnet_filts,
                use_batch_norm=args.use_batch_norm,
                dropout_rate=args.postnet_dropout_rate,
            )
        )

        # define loss function
        self.criterion = TransformerLoss(
            use_masking=args.use_masking,
            use_weighted_masking=args.use_weighted_masking,
            bce_pos_weight=args.bce_pos_weight,
        )
        if self.use_guided_attn_loss:
            self.attn_criterion = GuidedMultiHeadAttentionLoss(
                sigma=args.guided_attn_loss_sigma,
                alpha=args.guided_attn_loss_lambda,
            )

        # initialize parameters
        self._reset_parameters(
            init_type=args.transformer_init,
            init_enc_alpha=args.initial_encoder_alpha,
            init_dec_alpha=args.initial_decoder_alpha,
        )

        # load pretrained model
        if args.pretrained_model is not None:
            self.load_pretrained_model(args.pretrained_model)

    def _reset_parameters(self, init_type, init_enc_alpha=1.0, init_dec_alpha=1.0):
        """Initialize the model parameters.

        Args:
            init_type (str): Initialization method name handed to `initialize`.
            init_enc_alpha (float, optional): Initial value written to the alpha
                of the encoder's scaled positional encoding.
            init_dec_alpha (float, optional): Initial value written to the alpha
                of the decoder's scaled positional encoding.

        """
        initialize(self, init_type)

        # alpha only exists when the scaled positional encoding is in use
        if not self.use_scaled_pos_enc:
            return

        # the positional encoding is the last element of each embed stack
        enc_pos_enc = self.encoder.embed[-1]
        enc_pos_enc.alpha.data = torch.tensor(init_enc_alpha)
        dec_pos_enc = self.decoder.embed[-1]
        dec_pos_enc.alpha.data = torch.tensor(init_dec_alpha)

    def _add_first_frame_and_remove_last_frame(self, ys):
        ys_in = torch.cat(
            [ys.new_zeros((ys.shape[0], 1, ys.shape[2])), ys[:, :-1]], dim=1
        )
        return ys_in

    def forward(self, xs, ilens, ys, labels, olens, spembs=None, *args, **kwargs):
        """Calculate forward propagation.

        Runs the encoder and the teacher-forced decoder, computes the
        L1 / L2 / BCE losses (combined according to `self.loss_type`), optionally
        adds guided attention losses, and reports every value through
        `self.reporter`.

        Args:
            xs (Tensor): Batch of padded acoustic features (B, Tmax, idim).
            ilens (LongTensor): Batch of lengths of each input batch (B,).
            ys (Tensor): Batch of padded target features (B, Lmax, odim).
            labels (Tensor): Batch of stop targets passed to the criterion
                together with the stop logits; sliced along dim 1 like `ys`
                (presumably (B, Lmax) -- TODO confirm against the data loader).
            olens (LongTensor): Batch of the lengths of each target (B,).
            spembs (Tensor, optional): Batch of speaker embedding vectors
                (B, spk_embed_dim).

        Returns:
            Tensor: Loss value.

        Raises:
            ValueError: If `self.loss_type` is not "L1", "L2" or "L1+L2".

        """
        # remove unnecessary padded part (for multi-gpus)
        max_ilen = max(ilens)
        max_olen = max(olens)
        if max_ilen != xs.shape[1]:
            xs = xs[:, :max_ilen]
        if max_olen != ys.shape[1]:
            ys = ys[:, :max_olen]
            labels = labels[:, :max_olen]

        # thin out input frames for reduction factor
        # (B, Lmax, idim) ->  (B, Lmax // r, idim * r)
        if self.encoder_reduction_factor > 1:
            B, Lmax, idim = xs.shape
            # drop the trailing frames that do not fill a whole group of r frames
            if Lmax % self.encoder_reduction_factor != 0:
                xs = xs[:, : -(Lmax % self.encoder_reduction_factor), :]
            xs_ds = xs.contiguous().view(
                B,
                int(Lmax / self.encoder_reduction_factor),
                idim * self.encoder_reduction_factor,
            )
            ilens_ds = ilens.new(
                [ilen // self.encoder_reduction_factor for ilen in ilens]
            )
        else:
            xs_ds, ilens_ds = xs, ilens

        # forward encoder
        x_masks = self._source_mask(ilens_ds)
        hs, hs_masks = self.encoder(xs_ds, x_masks)

        # integrate speaker embedding
        if self.spk_embed_dim is not None:
            hs_int = self._integrate_with_spk_embed(hs, spembs)
        else:
            hs_int = hs

        # thin out frames for reduction factor (B, Lmax, odim) ->  (B, Lmax//r, odim)
        # (keeps the last frame of every group of r frames as decoder input)
        if self.reduction_factor > 1:
            ys_in = ys[:, self.reduction_factor - 1 :: self.reduction_factor]
            olens_in = olens.new([olen // self.reduction_factor for olen in olens])
        else:
            ys_in, olens_in = ys, olens

        # add first zero frame and remove last frame for auto-regressive
        ys_in = self._add_first_frame_and_remove_last_frame(ys_in)

        # if conv2d, compute the input lengths used for the guided attention loss.
        # NOTE(review): the previous comment said "ceiling division", but the
        # expression below uses floor division (`//`) -- confirm which is intended.
        if "conv2d" in self.transformer_input_layer:
            ilens_ds_st = ilens_ds.new(
                [((ilen - 2 + 1) // 2 - 2 + 1) // 2 for ilen in ilens_ds]
            )
        else:
            ilens_ds_st = ilens_ds

        # forward decoder
        y_masks = self._target_mask(olens_in)
        zs, _ = self.decoder(ys_in, y_masks, hs_int, hs_masks)
        # (B, Lmax//r, odim * r) -> (B, Lmax//r * r, odim)
        before_outs = self.feat_out(zs).view(zs.size(0), -1, self.odim)
        # (B, Lmax//r, r) -> (B, Lmax//r * r)
        logits = self.prob_out(zs).view(zs.size(0), -1)

        # postnet -> (B, Lmax//r * r, odim)
        # the postnet output is added to before_outs as a residual
        if self.postnet is None:
            after_outs = before_outs
        else:
            after_outs = before_outs + self.postnet(
                before_outs.transpose(1, 2)
            ).transpose(1, 2)

        # modify mod part of groundtruth
        # (truncate the targets to a multiple of r so they match the outputs)
        if self.reduction_factor > 1:
            olens = olens.new([olen - olen % self.reduction_factor for olen in olens])
            max_olen = max(olens)
            ys = ys[:, :max_olen]
            labels = labels[:, :max_olen]
            # in-place write on the sliced labels
            labels[:, -1] = 1.0  # make sure at least one frame has 1

        # calculate loss values
        l1_loss, l2_loss, bce_loss = self.criterion(
            after_outs, before_outs, logits, ys, labels, olens
        )
        if self.loss_type == "L1":
            loss = l1_loss + bce_loss
        elif self.loss_type == "L2":
            loss = l2_loss + bce_loss
        elif self.loss_type == "L1+L2":
            loss = l1_loss + l2_loss + bce_loss
        else:
            raise ValueError("unknown --loss-type " + self.loss_type)
        report_keys = [
            {"l1_loss": l1_loss.item()},
            {"l2_loss": l2_loss.item()},
            {"bce_loss": bce_loss.item()},
            {"loss": loss.item()},
        ]

        # calculate guided attention loss
        # each section below walks the layers from the last one backwards and
        # keeps the first `num_heads_applied_guided_attn` heads of at most
        # `num_layers_applied_guided_attn` layers
        if self.use_guided_attn_loss:
            # calculate for encoder
            if "encoder" in self.modules_applied_guided_attn:
                att_ws = []
                for idx, layer_idx in enumerate(
                    reversed(range(len(self.encoder.encoders)))
                ):
                    att_ws += [
                        self.encoder.encoders[layer_idx].self_attn.attn[
                            :, : self.num_heads_applied_guided_attn
                        ]
                    ]
                    if idx + 1 == self.num_layers_applied_guided_attn:
                        break
                att_ws = torch.cat(att_ws, dim=1)  # (B, H*L, T_in, T_in)
                enc_attn_loss = self.attn_criterion(
                    att_ws, ilens_ds_st, ilens_ds_st
                )  # TODO(unilight): is changing to ilens_ds_st right?
                loss = loss + enc_attn_loss
                report_keys += [{"enc_attn_loss": enc_attn_loss.item()}]
            # calculate for decoder
            if "decoder" in self.modules_applied_guided_attn:
                att_ws = []
                for idx, layer_idx in enumerate(
                    reversed(range(len(self.decoder.decoders)))
                ):
                    att_ws += [
                        self.decoder.decoders[layer_idx].self_attn.attn[
                            :, : self.num_heads_applied_guided_attn
                        ]
                    ]
                    if idx + 1 == self.num_layers_applied_guided_attn:
                        break
                att_ws = torch.cat(att_ws, dim=1)  # (B, H*L, T_out, T_out)
                dec_attn_loss = self.attn_criterion(att_ws, olens_in, olens_in)
                loss = loss + dec_attn_loss
                report_keys += [{"dec_attn_loss": dec_attn_loss.item()}]
            # calculate for encoder-decoder
            if "encoder-decoder" in self.modules_applied_guided_attn:
                att_ws = []
                for idx, layer_idx in enumerate(
                    reversed(range(len(self.decoder.decoders)))
                ):
                    att_ws += [
                        self.decoder.decoders[layer_idx].src_attn.attn[
                            :, : self.num_heads_applied_guided_attn
                        ]
                    ]
                    if idx + 1 == self.num_layers_applied_guided_attn:
                        break
                att_ws = torch.cat(att_ws, dim=1)  # (B, H*L, T_out, T_in)
                enc_dec_attn_loss = self.attn_criterion(
                    att_ws, ilens_ds_st, olens_in
                )  # TODO(unilight): is changing to ilens_ds_st right?
                loss = loss + enc_dec_attn_loss
                report_keys += [{"enc_dec_attn_loss": enc_dec_attn_loss.item()}]

        # report extra information
        if self.use_scaled_pos_enc:
            report_keys += [
                {"encoder_alpha": self.encoder.embed[-1].alpha.data.item()},
                {"decoder_alpha": self.decoder.embed[-1].alpha.data.item()},
            ]
        self.reporter.report(report_keys)

        return loss

    def inference(self, x, inference_args, spemb=None, *args, **kwargs):
        """Generate the sequence of features given the sequences of acoustic features.

        The decoder is run one step at a time, feeding back the last generated
        frame, until a stop probability reaches the threshold or the maximum
        length is hit (and the minimum length is satisfied).

        Args:
            x (Tensor): Input sequence of acoustic features (T, idim).
            inference_args (Namespace):
                - threshold (float): Threshold in inference.
                - minlenratio (float): Minimum length ratio in inference.
                - maxlenratio (float): Maximum length ratio in inference.
            spemb (Tensor, optional): Speaker embedding vector (spk_embed_dim).

        Returns:
            Tensor: Output sequence of features (L, odim).
            Tensor: Output sequence of stop probabilities (L,).
            Tensor: Encoder-decoder (source) attention weights (#layers, #heads, L, T).

        """
        # read decoding options
        threshold = inference_args.threshold
        minlenratio = inference_args.minlenratio
        maxlenratio = inference_args.maxlenratio
        # the option may be missing, in which case it is treated as disabled
        if getattr(inference_args, "use_att_constraint", False):
            logging.warning(
                "Attention constraint is not yet supported in Transformer. Not enabled."
            )

        # group input frames for the reduction factor
        # (Lmax, idim) ->  (Lmax // r, idim * r)
        enc_r = self.encoder_reduction_factor
        if enc_r > 1:
            Lmax, idim = x.shape
            leftover = Lmax % enc_r
            if leftover != 0:
                x = x[:-leftover, :]
            x_ds = x.contiguous().view(int(Lmax / enc_r), idim * enc_r)
        else:
            x_ds = x

        # run the encoder on a batch of one
        hs, _ = self.encoder(x_ds.unsqueeze(0), None)

        # merge the speaker embedding into the encoder states
        if self.spk_embed_dim is not None:
            spembs = spemb.unsqueeze(0)
            hs = self._integrate_with_spk_embed(hs, spembs)

        # length limits measured in decoder steps
        n_enc_frames = hs.size(1)
        maxlen = int(n_enc_frames * maxlenratio / self.reduction_factor)
        minlen = int(n_enc_frames * minlenratio / self.reduction_factor)

        # decoding state: start from a single all-zero frame
        idx = 0
        ys = hs.new_zeros(1, 1, self.odim)
        outs, probs = [], []
        z_cache = self.decoder.init_state(x)

        while True:
            idx += 1

            # output frames and stop probabilities of the idx-th step
            y_masks = subsequent_mask(idx).unsqueeze(0).to(x.device)
            z, z_cache = self.decoder.forward_one_step(
                ys, y_masks, hs, cache=z_cache
            )  # (B, adim)
            step_frames = self.feat_out(z).view(
                self.reduction_factor, self.odim
            )  # (r, odim)
            outs.append(step_frames)
            probs.append(torch.sigmoid(self.prob_out(z))[0])  # (r,)

            # feed the last generated frame back as the next decoder input
            ys = torch.cat(
                (ys, step_frames[-1].view(1, 1, self.odim)), dim=1
            )  # (1, idx + 1, odim)

            # newest row of every source attention map: [(#heads, 1, T), ...]
            att_ws_ = [
                m.attn[0, :, -1].unsqueeze(1)
                for name, m in self.named_modules()
                if isinstance(m, MultiHeadedAttention) and "src" in name
            ]
            if idx == 1:
                att_ws = att_ws_
            else:
                # grow each map along the output axis: [(#heads, l, T), ...]
                att_ws = [
                    torch.cat([att_w, att_w_], dim=1)
                    for att_w, att_w_ in zip(att_ws, att_ws_)
                ]

            # keep going unless a stop condition fired and minlen is reached
            stop_requested = int(sum(probs[-1] >= threshold)) > 0 or idx >= maxlen
            if not stop_requested or idx < minlen:
                continue

            # (L, odim) -> (1, L, odim) -> (1, odim, L)
            outs = torch.cat(outs, dim=0).unsqueeze(0).transpose(1, 2)
            if self.postnet is not None:
                outs = outs + self.postnet(outs)  # (1, odim, L)
            outs = outs.transpose(2, 1).squeeze(0)  # (L, odim)
            probs = torch.cat(probs, dim=0)
            break

        # concatenate attention weights -> (#layers, #heads, L, T)
        att_ws = torch.stack(att_ws, dim=0)

        return outs, probs, att_ws

    def calculate_all_attentions(
        self,
        xs,
        ilens,
        ys,
        olens,
        spembs=None,
        skip_output=False,
        keep_tensor=False,
        *args,
        **kwargs
    ):
        """Calculate all of the attention weights.

        Runs the encoder and the teacher-forced decoder without gradients and
        collects the attention weights of every `MultiHeadedAttention` module,
        keyed by module name.

        Args:
            xs (Tensor): Batch of padded acoustic features (B, Tmax, idim).
            ilens (LongTensor): Batch of lengths of each input batch (B,).
            ys (Tensor): Batch of padded target features (B, Lmax, odim).
            olens (LongTensor): Batch of the lengths of each target (B,).
            spembs (Tensor, optional): Batch of speaker embedding vectors
                (B, spk_embed_dim).
            skip_output (bool, optional): Whether to skip calculate the final output.
            keep_tensor (bool, optional): Whether to keep original tensor.
                If False, weights are converted to numpy and cropped per
                utterance to remove the padded part.

        Returns:
            dict: Dict of attention weights and outputs. Unless `skip_output`
                is set, it also holds "before_postnet_fbank" and
                "after_postnet_fbank".

        """
        with torch.no_grad():
            # thin out input frames for reduction factor
            # (B, Lmax, idim) ->  (B, Lmax // r, idim * r)
            if self.encoder_reduction_factor > 1:
                B, Lmax, idim = xs.shape
                if Lmax % self.encoder_reduction_factor != 0:
                    xs = xs[:, : -(Lmax % self.encoder_reduction_factor), :]
                xs_ds = xs.contiguous().view(
                    B,
                    int(Lmax / self.encoder_reduction_factor),
                    idim * self.encoder_reduction_factor,
                )
                ilens_ds = ilens.new(
                    [ilen // self.encoder_reduction_factor for ilen in ilens]
                )
            else:
                xs_ds, ilens_ds = xs, ilens

            # lengths on the encoder time axis, computed exactly as in `forward`
            # so that cropping below agrees with the guided attention loss
            if "conv2d" in self.transformer_input_layer:
                ilens_ds_st = ilens_ds.new(
                    [((ilen - 2 + 1) // 2 - 2 + 1) // 2 for ilen in ilens_ds]
                )
            else:
                ilens_ds_st = ilens_ds

            # forward encoder
            x_masks = self._source_mask(ilens_ds)
            hs, hs_masks = self.encoder(xs_ds, x_masks)

            # integrate speaker embedding
            if self.spk_embed_dim is not None:
                hs = self._integrate_with_spk_embed(hs, spembs)

            # thin out frames for reduction factor
            # (B, Lmax, odim) ->  (B, Lmax//r, odim)
            if self.reduction_factor > 1:
                ys_in = ys[:, self.reduction_factor - 1 :: self.reduction_factor]
                olens_in = olens.new([olen // self.reduction_factor for olen in olens])
            else:
                ys_in, olens_in = ys, olens

            # add first zero frame and remove last frame for auto-regressive
            ys_in = self._add_first_frame_and_remove_last_frame(ys_in)

            # forward decoder
            y_masks = self._target_mask(olens_in)
            zs, _ = self.decoder(ys_in, y_masks, hs, hs_masks)

            # calculate final outputs
            if not skip_output:
                before_outs = self.feat_out(zs).view(zs.size(0), -1, self.odim)
                if self.postnet is None:
                    after_outs = before_outs
                else:
                    after_outs = before_outs + self.postnet(
                        before_outs.transpose(1, 2)
                    ).transpose(1, 2)

        # modify mod part of output lengths due to reduction factor > 1
        if self.reduction_factor > 1:
            olens = olens.new([olen - olen % self.reduction_factor for olen in olens])

        # store into dict
        att_ws_dict = dict()
        if keep_tensor:
            for name, m in self.named_modules():
                if isinstance(m, MultiHeadedAttention):
                    att_ws_dict[name] = m.attn
            if not skip_output:
                att_ws_dict["before_postnet_fbank"] = before_outs
                att_ws_dict["after_postnet_fbank"] = after_outs
        else:
            # The encoder works on the reduced (and possibly subsampled) time
            # axis, so its attention maps must be cropped with those lengths;
            # cropping with the raw `ilens` would keep padded positions.
            enc_lens = ilens_ds_st.tolist()
            dec_lens = olens_in.tolist()
            for name, m in self.named_modules():
                if isinstance(m, MultiHeadedAttention):
                    attn = m.attn.cpu().numpy()
                    if "encoder" in name:
                        attn = [a[:, :l, :l] for a, l in zip(attn, enc_lens)]
                    elif "decoder" in name:
                        if "src" in name:
                            attn = [
                                a[:, :ol, :il]
                                for a, il, ol in zip(attn, enc_lens, dec_lens)
                            ]
                        elif "self" in name:
                            attn = [a[:, :l, :l] for a, l in zip(attn, dec_lens)]
                        else:
                            logging.warning("unknown attention module: " + name)
                    else:
                        logging.warning("unknown attention module: " + name)
                    att_ws_dict[name] = attn
            if not skip_output:
                before_outs = before_outs.cpu().numpy()
                after_outs = after_outs.cpu().numpy()
                # crop to the valid output length and transpose to (odim, L)
                out_lens = olens.tolist()
                att_ws_dict["before_postnet_fbank"] = [
                    m[:l].T for m, l in zip(before_outs, out_lens)
                ]
                att_ws_dict["after_postnet_fbank"] = [
                    m[:l].T for m, l in zip(after_outs, out_lens)
                ]

        return att_ws_dict

    def _integrate_with_spk_embed(self, hs, spembs):
        """Integrate speaker embedding with hidden states.

        Args:
            hs (Tensor): Batch of hidden state sequences (B, Tmax, adim).
            spembs (Tensor): Batch of speaker embeddings (B, spk_embed_dim).

        Returns:
            Tensor: Batch of integrated hidden state sequences (B, Tmax, adim)

        """
        if self.spk_embed_integration_type == "add":
            # apply projection and then add to hidden states
            spembs = self.projection(F.normalize(spembs))
            hs = hs + spembs.unsqueeze(1)
        elif self.spk_embed_integration_type == "concat":
            # concat hidden states with spk embeds and then apply projection
            spembs = F.normalize(spembs).unsqueeze(1).expand(-1, hs.size(1), -1)
            hs = self.projection(torch.cat([hs, spembs], dim=-1))
        else:
            raise NotImplementedError("support only add or concat.")

        return hs

    def _source_mask(self, ilens):
        """Build the padding mask used by the encoder self-attention.

        Args:
            ilens (LongTensor or List): Batch of lengths (B,).

        Returns:
            Tensor: Mask tensor for self-attention.
                    dtype=torch.uint8 in PyTorch 1.2-
                    dtype=torch.bool in PyTorch 1.2+ (including 1.2)

        Examples:
            >>> ilens = [5, 3]
            >>> self._source_mask(ilens)
            tensor([[[1, 1, 1, 1, 1]],
                    [[1, 1, 1, 0, 0]]], dtype=torch.uint8)

        """
        non_pad = make_non_pad_mask(ilens)
        # place the mask on the same device as the model parameters
        non_pad = non_pad.to(next(self.parameters()).device)
        # insert a singleton axis before the last dimension
        return non_pad.unsqueeze(-2)

    def _target_mask(self, olens):
        """Build the mask used by the decoder masked self-attention.

        The result is the logical AND of the non-padding mask and the
        subsequent (look-ahead) mask.

        Args:
            olens (LongTensor or List): Batch of lengths (B,).

        Returns:
            Tensor: Mask tensor for masked self-attention.
                    dtype=torch.uint8 in PyTorch 1.2-
                    dtype=torch.bool in PyTorch 1.2+ (including 1.2)

        Examples:
            >>> olens = [5, 3]
            >>> self._target_mask(olens)
            tensor([[[1, 0, 0, 0, 0],
                     [1, 1, 0, 0, 0],
                     [1, 1, 1, 0, 0],
                     [1, 1, 1, 1, 0],
                     [1, 1, 1, 1, 1]],
                    [[1, 0, 0, 0, 0],
                     [1, 1, 0, 0, 0],
                     [1, 1, 1, 0, 0],
                     [1, 1, 1, 0, 0],
                     [1, 1, 1, 0, 0]]], dtype=torch.uint8)

        """
        non_pad = make_non_pad_mask(olens)
        # place the mask on the same device as the model parameters
        non_pad = non_pad.to(next(self.parameters()).device)
        max_len = non_pad.size(-1)
        look_ahead = subsequent_mask(max_len, device=non_pad.device)
        # broadcast the padding mask over rows and the look-ahead mask over batch
        return non_pad.unsqueeze(-2) & look_ahead.unsqueeze(0)

    @property
    def base_plot_keys(self):
        """Return base key names to plot during training.

        The names have to match what is reported through `chainer.reporter`.
        For a key such as `loss`, the reporter reports `main/loss` and
        `validation/main/loss`, and `loss.png` is created as a figure
        visualizing both values.

        Returns:
            list: List of strings which are base keys to plot during training.

        """
        keys = ["loss", "l1_loss", "l2_loss", "bce_loss"]
        if self.use_scaled_pos_enc:
            keys.extend(["encoder_alpha", "decoder_alpha"])
        if not self.use_guided_attn_loss:
            return keys

        # one extra curve per module the guided attention loss is applied to
        guided_keys = (
            ("encoder", "enc_attn_loss"),
            ("decoder", "dec_attn_loss"),
            ("encoder-decoder", "enc_dec_attn_loss"),
        )
        for module_name, key in guided_keys:
            if module_name in self.modules_applied_guided_attn:
                keys.append(key)

        return keys


================================================
FILE: nets/pytorch_backend/fastspeech/__init__.py
================================================
"""Initialize sub package."""


================================================
FILE: nets/pytorch_backend/fastspeech/duration_calculator.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright 2019 Tomoki Hayashi
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Duration calculator related modules."""

import torch

from espnet.nets.pytorch_backend.e2e_tts_tacotron2 import Tacotron2
from espnet.nets.pytorch_backend.e2e_tts_transformer import Transformer
from espnet.nets.pytorch_backend.nets_utils import pad_list


class DurationCalculator(torch.nn.Module):
    """Duration calculator module for FastSpeech.

    Durations are derived from the attention weights of a pretrained
    auto-regressive teacher model: every output frame is assigned to the input
    position it attends to most, and the frames per input position are counted.

    Todo:
        * Fix the duplicated calculation of diagonal head decision

    """

    def __init__(self, teacher_model):
        """Initialize duration calculator module.

        Args:
            teacher_model (e2e_tts_transformer.Transformer or
                e2e_tts_tacotron2.Tacotron2): Pretrained auto-regressive teacher.

        Raises:
            ValueError: If the teacher is neither a Transformer nor a Tacotron2.

        """
        super(DurationCalculator, self).__init__()
        if isinstance(teacher_model, Transformer):
            # -1 marks "diagonal head not selected yet"; see `forward`
            self.register_buffer("diag_head_idx", torch.tensor(-1))
        elif isinstance(teacher_model, Tacotron2):
            pass
        else:
            raise ValueError(
                "teacher model should be the instance of "
                "e2e_tts_transformer.Transformer or e2e_tts_tacotron2.Tacotron2."
            )
        self.teacher_model = teacher_model

    def forward(self, xs, ilens, ys, olens, spembs=None):
        """Calculate forward propagation.

        Args:
            xs (Tensor): Batch of the padded sequences of character ids (B, Tmax).
            ilens (Tensor): Batch of lengths of each input sequence (B,).
            ys (Tensor):
                Batch of the padded sequence of target features (B, Lmax, odim).
            olens (Tensor): Batch of lengths of each output sequence (B,).
            spembs (Tensor, optional):
                Batch of speaker embedding vectors (B, spk_embed_dim).

        Returns:
            Tensor: Batch of durations (B, Tmax).

        """
        if isinstance(self.teacher_model, Transformer):
            att_ws = self._calculate_encoder_decoder_attentions(
                xs, ilens, ys, olens, spembs=spembs
            )
            # TODO(kan-bayashi): fix this issue
            # this does not work in multi-gpu case. registered buffer is not saved.
            if int(self.diag_head_idx) == -1:
                self._init_diagonal_head(att_ws)
            # keep only the selected head: (B, H*L, Lmax, Tmax) -> (B, Lmax, Tmax)
            att_ws = att_ws[:, self.diag_head_idx]
        else:
            # NOTE(kan-bayashi): Here we assume that the teacher is tacotron 2
            att_ws = self.teacher_model.calculate_all_attentions(
                xs, ilens, ys, spembs=spembs, keep_tensor=True
            )
        durations = [
            self._calculate_duration(att_w, ilen, olen)
            for att_w, ilen, olen in zip(att_ws, ilens, olens)
        ]

        # pad the per-utterance durations with zeros to a common length
        return pad_list(durations, 0)

    @staticmethod
    def _calculate_duration(att_w, ilen, olen):
        """Count how many output frames attend most to each input position.

        Args:
            att_w (Tensor): Attention weights of one utterance, indexed as
                (output frame, input position).
            ilen (int or Tensor): Number of valid input positions.
            olen (int or Tensor): Number of valid output frames.

        Returns:
            Tensor: Durations (ilen,), one count per input position.

        """
        # the argmax does not depend on the input position, so compute it once
        # instead of once per position
        attended = att_w[:olen, :ilen].argmax(-1)
        return torch.stack([attended.eq(i).sum() for i in range(ilen)])

    def _init_diagonal_head(self, att_ws):
        """Select the attention head used for the duration calculation.

        The head with the largest maximum attention weight, averaged over
        output frames and over the batch, is stored in `diag_head_idx`.

        Args:
            att_ws (Tensor): Attention weights (B, H*L, Lmax, Tmax).

        """
        diagonal_scores = att_ws.max(dim=-1)[0].mean(dim=-1).mean(dim=0)  # (H * L,)
        self.register_buffer("diag_head_idx", diagonal_scores.argmax())

    def _calculate_encoder_decoder_attentions(self, xs, ilens, ys, olens, spembs=None):
        """Collect the source attention weights of the teacher model.

        Args:
            xs (Tensor): Batch of the padded sequences of character ids (B, Tmax).
            ilens (Tensor): Batch of lengths of each input sequence (B,).
            ys (Tensor):
                Batch of the padded sequence of target features (B, Lmax, odim).
            olens (Tensor): Batch of lengths of each output sequence (B,).
            spembs (Tensor, optional):
                Batch of speaker embedding vectors (B, spk_embed_dim).

        Returns:
            Tensor: Attention weights of every module whose name contains
                "src_attn", concatenated along dim 1 (B, H*L, Lmax, Tmax).

        """
        att_dict = self.teacher_model.calculate_all_attentions(
            xs, ilens, ys, olens, spembs=spembs, skip_output=True, keep_tensor=True
        )
        return torch.cat(
            [att_dict[k] for k in att_dict.keys() if "src_attn" in k], dim=1
        )  # (B, H*L, Lmax, Tmax)


================================================
FILE: nets/pytorch_backend/fastspeech/duration_predictor.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright 2019 Tomoki Hayashi
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Duration predictor related modules."""

import torch

from espnet.nets.pytorch_backend.transformer.layer_norm import LayerNorm


class DurationPredictor(torch.nn.Module):
    """Duration predictor module.

    Implements the duration predictor of
    `FastSpeech: Fast, Robust and Controllable Text to Speech`_: a stack of
    1-D convolution blocks followed by a linear projection that yields one
    duration value per encoder frame.

    .. _`FastSpeech: Fast, Robust and Controllable Text to Speech`:
        https://arxiv.org/pdf/1905.09263.pdf

    Note:
        `forward` returns durations in log domain, whereas `inference`
        converts them to rounded, non-negative values in linear domain.

    """

    def __init__(
        self, idim, n_layers=2, n_chans=384, kernel_size=3, dropout_rate=0.1, offset=1.0
    ):
        """Initialize duration predictor module.

        Args:
            idim (int): Input dimension.
            n_layers (int, optional): Number of convolutional layers.
            n_chans (int, optional): Number of channels of convolutional layers.
            kernel_size (int, optional): Kernel size of convolutional layers.
            dropout_rate (float, optional): Dropout rate.
            offset (float, optional): Offset value to avoid nan in log domain.

        """
        super(DurationPredictor, self).__init__()
        self.offset = offset

        padding = (kernel_size - 1) // 2
        blocks = []
        for layer_idx in range(n_layers):
            # Only the first block consumes the encoder dimension.
            conv = torch.nn.Conv1d(
                n_chans if layer_idx else idim,
                n_chans,
                kernel_size,
                stride=1,
                padding=padding,
            )
            blocks.append(
                torch.nn.Sequential(
                    conv,
                    torch.nn.ReLU(),
                    LayerNorm(n_chans, dim=1),
                    torch.nn.Dropout(dropout_rate),
                )
            )
        self.conv = torch.nn.ModuleList(blocks)
        self.linear = torch.nn.Linear(n_chans, 1)

    def _forward(self, xs, x_masks=None, is_inference=False):
        """Run the network; convert to linear domain when `is_inference`."""
        hidden = xs.transpose(1, -1)  # (B, idim, Tmax)
        for block in self.conv:
            hidden = block(hidden)  # (B, C, Tmax)

        # NOTE: the projection output is treated as a log-domain duration
        durations = self.linear(hidden.transpose(1, -1)).squeeze(-1)  # (B, Tmax)

        if is_inference:
            # NOTE: back to linear domain, rounded and clipped at zero
            rounded = torch.round(durations.exp() - self.offset)
            durations = torch.clamp(rounded, min=0).long()

        if x_masks is None:
            return durations
        return durations.masked_fill(x_masks, 0.0)

    def forward(self, xs, x_masks=None):
        """Calculate forward propagation.

        Args:
            xs (Tensor): Batch of input sequences (B, Tmax, idim).
            x_masks (ByteTensor, optional):
                Batch of masks indicating padded part (B, Tmax).

        Returns:
            Tensor: Batch of predicted durations in log domain (B, Tmax).

        """
        return self._forward(xs, x_masks, is_inference=False)

    def inference(self, xs, x_masks=None):
        """Inference duration.

        Args:
            xs (Tensor): Batch of input sequences (B, Tmax, idim).
            x_masks (ByteTensor, optional):
                Batch of masks indicating padded part (B, Tmax).

        Returns:
            LongTensor: Batch of predicted durations in linear domain (B, Tmax).

        """
        return self._forward(xs, x_masks, is_inference=True)


class DurationPredictorLoss(torch.nn.Module):
    """Loss function module for duration predictor.

    The targets are moved to log domain before the mean squared error is
    taken, so the loss is computed entirely in log domain.

    """

    def __init__(self, offset=1.0, reduction="mean"):
        """Initialize duration predictor loss module.

        Args:
            offset (float, optional): Offset value to avoid nan in log domain.
            reduction (str): Reduction type in loss calculation.

        """
        super(DurationPredictorLoss, self).__init__()
        self.criterion = torch.nn.MSELoss(reduction=reduction)
        self.offset = offset

    def forward(self, outputs, targets):
        """Calculate forward propagation.

        Args:
            outputs (Tensor): Batch of prediction durations in log domain (B, T)
            targets (LongTensor): Batch of groundtruth durations in linear domain (B, T)

        Returns:
            Tensor: Mean squared error loss value.

        Note:
            `outputs` is in log domain but `targets` is in linear domain.

        """
        # NOTE: bring the linear-domain targets into the domain of `outputs`
        log_targets = (targets.float() + self.offset).log()
        return self.criterion(outputs, log_targets)


================================================
FILE: nets/pytorch_backend/fastspeech/length_regulator.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright 2019 Tomoki Hayashi
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Length regulator related modules."""

import logging

from distutils.version import LooseVersion

import torch

from espnet.nets.pytorch_backend.nets_utils import pad_list

# True when the installed torch version is at least 1.1. LengthRegulator reads
# this flag to choose between its two sequence-repeat implementations.
is_torch_1_1_plus = LooseVersion(torch.__version__) >= LooseVersion("1.1")


class LengthRegulator(torch.nn.Module):
    """Length regulator module for feed-forward Transformer.

    Implements the length regulator of
    `FastSpeech: Fast, Robust and Controllable Text to Speech`_: every char or
    phoneme-level embedding is repeated as many times as its duration says,
    which turns the sequence into a frame-level one.

    .. _`FastSpeech: Fast, Robust and Controllable Text to Speech`:
        https://arxiv.org/pdf/1905.09263.pdf

    """

    def __init__(self, pad_value=0.0):
        """Initialize length regulator module.

        Args:
            pad_value (float, optional): Value used for padding.

        """
        super(LengthRegulator, self).__init__()
        self.pad_value = pad_value
        # torch.repeat_interleave is only used on torch 1.1+
        self.repeat_fn = (
            self._repeat_one_sequence
            if is_torch_1_1_plus
            else self._legacy_repeat_one_sequence
        )

    def forward(self, xs, ds, alpha=1.0):
        """Calculate forward propagation.

        Args:
            xs (Tensor): Batch of sequences of char or phoneme embeddings (B, Tmax, D).
            ds (LongTensor): Batch of durations of each frame (B, T).
            alpha (float, optional): Alpha value to control speed of speech.

        Returns:
            Tensor: replicated input tensor based on durations (B, T*, D).

        Note:
            When the durations of the whole batch sum to zero, a warning is
            logged and every entry of each all-zero row of `ds` is set to 1
            in place.

        """
        if alpha != 1.0:
            assert alpha > 0
            ds = torch.round(ds.float() * alpha).long()

        if ds.sum() == 0:
            logging.warning(
                "predicted durations includes all 0 sequences. "
                "fill the first element with 1."
            )
            # NOTE(kan-bayashi): Teacher forcing never produces this case; it
            #   comes from a bad duration predictor at inference time, so the
            #   padded-sequence case needs no special care here.
            empty_rows = ds.sum(dim=1).eq(0)
            ds[empty_rows] = 1

        expanded = []
        for x, d in zip(xs, ds):
            expanded.append(self.repeat_fn(x, d))
        return pad_list(expanded, self.pad_value)

    def _repeat_one_sequence(self, x, d):
        """Repeat each frame according to duration for torch 1.1+."""
        return torch.repeat_interleave(x, d, dim=0)

    def _legacy_repeat_one_sequence(self, x, d):
        """Repeat each frame according to duration for torch 1.0.

        Frames whose duration is zero are dropped.

        Examples:
            >>> x = torch.tensor([[1], [2], [3]])
            >>> d = torch.tensor([1, 2, 3])
            >>> self._legacy_repeat_one_sequence(x, d)
            tensor([[1],
                    [2],
                    [2],
                    [3],
                    [3],
                    [3]])

        """
        pieces = []
        for frame, count in zip(x, d):
            if count != 0:
                pieces.append(frame.repeat(int(count), 1))
        return torch.cat(pieces, dim=0)


================================================
FILE: nets/pytorch_backend/frontends/__init__.py
================================================
"""Initialize sub package."""


================================================
FILE: nets/pytorch_backend/frontends/beamformer.py
================================================
import torch
from torch_complex import functional as FC
from torch_complex.tensor import ComplexTensor


def get_power_spectral_density_matrix(
    xs: ComplexTensor, mask: torch.Tensor, normalization=True, eps: float = 1e-15
) -> ComplexTensor:
    """Return cross-channel power spectral density (PSD) matrix

    The per-frame channel outer products of `xs` are weighted by the
    channel-averaged mask and summed over time.

    Args:
        xs (ComplexTensor): (..., F, C, T)
        mask (torch.Tensor): (..., F, C, T)
        normalization (bool): If True, the channel-averaged mask is divided by
            its sum over time (plus `eps`) before weighting.
        eps (float): Small constant added to the normalization denominator.
    Returns
        psd (ComplexTensor): (..., F, C, C)

    """
    # Per-frame outer product: (..., C_1, T) x (..., C_2, T) -> (..., T, C_1, C_2)
    frame_outer = FC.einsum("...ct,...et->...tce", [xs, xs.conj()])

    # One weight per frame: average the mask over channels, (..., C, T) -> (..., T)
    frame_weights = mask.mean(dim=-2)
    if normalization:
        # With zero-padded inputs the sum over time is independent of the
        # padding length, so normalizing here is safe for padded batches.
        total = frame_weights.sum(dim=-1, keepdim=True) + eps
        frame_weights = frame_weights / total

    # Weight each frame's matrix, then collapse time: (..., T, C, C) -> (..., C, C)
    weighted = frame_outer * frame_weights[..., None, None]
    return weighted.sum(dim=-3)


def get_mvdr_vector(
    psd_s: ComplexTensor,
    psd_n: ComplexTensor,
    reference_vector: torch.Tensor,
    eps: float = 1e-15,
) -> ComplexTensor:
    """Return the MVDR(Minimum Variance Distortionless Response) vector:

        h = (Npsd^-1 @ Spsd) / (Tr(Npsd^-1 @ Spsd)) @ u

    Reference:
        On optimal frequency-domain multichannel linear filtering
        for noise reduction; M. Souden et al., 2010;
        https://ieeexplore.ieee.org/document/5089420

    Args:
        psd_s (ComplexTensor): (..., F, C, C)
        psd_n (ComplexTensor): (..., F, C, C). Not modified by this function.
        reference_vector (torch.Tensor): (..., C)
        eps (float): Diagonal loading added to `psd_n` before inversion; also
            added to the trace in the denominator.
    Returns:
        beamform_vector (ComplexTensor): (..., F, C)
    """
    # Diagonal loading of the noise PSD
    C = psd_n.size(-1)
    eye = torch.eye(C, dtype=psd_n.dtype, device=psd_n.device)
    shape = [1 for _ in range(psd_n.dim() - 2)] + [C, C]
    eye = eye.view(*shape)
    # Out-of-place on purpose: an in-place `+=` would silently alter the
    # caller's noise PSD every time this function is called.
    psd_n = psd_n + eps * eye

    # numerator: (..., C_1, C_2) x (..., C_2, C_3) -> (..., C_1, C_3)
    numerator = FC.einsum("...ec,...cd->...ed", [psd_n.inverse(), psd_s])
    # ws: (..., C, C) / (...,) -> (..., C, C)
    ws = numerator / (FC.trace(numerator)[..., None, None] + eps)
    # h: (..., F, C_1, C_2) x (..., C_2) -> (..., F, C_1)
    beamform_vector = FC.einsum("...fec,...c->...fe", [ws, reference_vector])
    return beamform_vector


def apply_beamforming_vector(
    beamform_vector: ComplexTensor, mix: ComplexTensor
) -> ComplexTensor:
    """Apply a beamforming vector to a multi-channel mixture.

    Contracts the channel axis of `mix` with the complex conjugate of
    `beamform_vector`: (..., C) x (..., C, T) -> (..., T).
    """
    conj_weights = beamform_vector.conj()
    return FC.einsum("...c,...ct->...t", [conj_weights, mix])


================================================
FILE: nets/pytorch_backend/frontends/dnn_beamformer.py
================================================
from distutils.version import LooseVersion
from typing import Tuple

import torch
from torch.nn import functional as F

from espnet.nets.pytorch_backend.frontends.beamformer import apply_beamforming_vector
from espnet.nets.pytorch_backend.frontends.beamformer import get_mvdr_vector
from espnet.nets.pytorch_backend.frontends.beamformer import (
    get_power_spectral_density_matrix,  # noqa: H301
)
from espnet.nets.pytorch_backend.frontends.mask_estimator import MaskEstimator
from torch_complex.tensor import ComplexTensor

# Version flags for the installed torch. AttentionReference reads them to
# choose between torch.bool and torch.uint8 for its diagonal mask.
is_torch_1_2_plus = LooseVersion(torch.__version__) >= LooseVersion("1.2.0")
is_torch_1_3_plus = LooseVersion(torch.__version__) >= LooseVersion("1.3.0")


class DNN_Beamformer(torch.nn.Module):
    """DNN mask based Beamformer

    A mask estimator predicts speech/noise masks, the masks are turned into
    PSD matrices, and an MVDR beamforming vector is derived and applied.

    Citation:
        Multichannel End-to-end Speech Recognition; T. Ochiai et al., 2017;
        https://arxiv.org/abs/1703.04783

    """

    def __init__(
        self,
        bidim,
        btype="blstmp",
        blayers=3,
        bunits=300,
        bprojs=320,
        bnmask=2,
        dropout_rate=0.0,
        badim=320,
        ref_channel: int = -1,
        beamformer_type="mvdr",
    ):
        """Build the mask estimator and the attention-based reference selector.

        Raises:
            ValueError: If `beamformer_type` is anything other than "mvdr".
        """
        super().__init__()
        self.mask = MaskEstimator(
            btype, bidim, blayers, bunits, bprojs, dropout_rate, nmask=bnmask
        )
        self.ref = AttentionReference(bidim, badim)
        self.ref_channel = ref_channel
        self.nmask = bnmask

        if beamformer_type != "mvdr":
            raise ValueError(
                "Not supporting beamformer_type={}".format(beamformer_type)
            )
        self.beamformer_type = beamformer_type

    def _apply_beamforming(self, data, ilens, psd_speech, psd_noise):
        """Derive the MVDR vector for one target and apply it to `data`.

        Returns the enhanced signal and the beamforming vector.
        """
        # u: (B, C)
        if self.ref_channel >= 0:
            # Fixed reference microphone: one-hot vector over the channels
            u_shape = data.size()[:-3] + (data.size(-2),)
            u = torch.zeros(*u_shape, device=data.device)
            u[..., self.ref_channel].fill_(1)
        else:
            # Negative index: let the attention module weight the channels
            u, _ = self.ref(psd_speech, ilens)

        weights = get_mvdr_vector(psd_speech, psd_noise, u)
        return apply_beamforming_vector(weights, data), weights

    def forward(
        self, data: ComplexTensor, ilens: torch.LongTensor
    ) -> Tuple[ComplexTensor, torch.LongTensor, ComplexTensor]:
        """The forward function

        Notation:
            B: Batch
            C: Channel
            T: Time or Sequence length
            F: Freq

        Args:
            data (ComplexTensor): (B, T, C, F)
            ilens (torch.Tensor): (B,)
        Returns:
            enhanced (ComplexTensor): (B, T, F); a list with one entry per
                speaker when `nmask` is not 2.
            ilens (torch.Tensor): (B,)
            mask_speech: Speech mask with its last and third-to-last axes
                swapped; a list with one entry per speaker when `nmask` is
                not 2.

        """
        # data (B, T, C, F) -> (B, F, C, T)
        data = data.permute(0, 3, 2, 1)

        # mask: (B, F, C, T)
        masks, _ = self.mask(data, ilens)
        assert self.nmask == len(masks)

        if self.nmask == 2:  # (mask_speech, mask_noise)
            mask_speech, mask_noise = masks

            psd_speech = get_power_spectral_density_matrix(data, mask_speech)
            psd_noise = get_power_spectral_density_matrix(data, mask_noise)
            enhanced, _ = self._apply_beamforming(data, ilens, psd_speech, psd_noise)

            # (..., F, T) -> (..., T, F)
            enhanced = enhanced.transpose(-1, -2)
            mask_speech = mask_speech.transpose(-1, -3)
            return enhanced, ilens, mask_speech

        # multi-speaker case: (mask_speech1, ..., mask_noise)
        mask_speech = list(masks[:-1])
        mask_noise = masks[-1]

        psd_speeches = [
            get_power_spectral_density_matrix(data, spk_mask)
            for spk_mask in mask_speech
        ]
        psd_noise = get_power_spectral_density_matrix(data, mask_noise)

        enhanced = []
        for spk in range(self.nmask - 1):
            # every other speaker's PSD counts as noise for this target
            interferers = psd_speeches[:spk] + psd_speeches[spk + 1 :]
            enh, _ = self._apply_beamforming(
                data, ilens, psd_speeches[spk], sum(interferers) + psd_noise
            )

            # (..., F, T) -> (..., T, F)
            enhanced.append(enh.transpose(-1, -2))
            mask_speech[spk] = mask_speech[spk].transpose(-1, -3)

        return enhanced, ilens, mask_speech


class AttentionReference(torch.nn.Module):
    """Attention module that produces soft weights over the channels."""

    def __init__(self, bidim, att_dim):
        """Create the projection (bidim -> att_dim) and scoring (att_dim -> 1) layers."""
        super().__init__()
        self.mlp_psd = torch.nn.Linear(bidim, att_dim)
        self.gvec = torch.nn.Linear(att_dim, 1)

    def forward(
        self, psd_in: ComplexTensor, ilens: torch.LongTensor, scaling: float = 2.0
    ) -> Tuple[torch.Tensor, torch.LongTensor]:
        """The forward function

        Args:
            psd_in (ComplexTensor): (B, F, C, C)
            ilens (torch.Tensor): (B,)
            scaling (float): Factor applied to the scores before the softmax.
        Returns:
            u (torch.Tensor): (B, C)
            ilens (torch.Tensor): (B,)
        """
        _, _, n_chan = psd_in.size()[:3]
        assert psd_in.size(2) == psd_in.size(3), psd_in.size()

        # Mask dtype depends on the installed torch version
        eye_dtype = torch.bool if is_torch_1_3_plus else torch.uint8
        mask_dtype = torch.bool if is_torch_1_2_plus else torch.uint8
        diagonal = torch.eye(n_chan, dtype=eye_dtype, device=psd_in.device)

        # Zero the diagonal so only cross-channel terms remain: (B, F, C, C)
        off_diagonal = psd_in.masked_fill(diagonal.type(mask_dtype), 0)
        # Average over the other channels: (B, F, C, C) -> (B, C, F)
        cross = (off_diagonal.sum(dim=-1) / (n_chan - 1)).transpose(-1, -2)

        # Amplitude of the complex feature
        amplitude = (cross.real ** 2 + cross.imag ** 2) ** 0.5

        # (B, C, F) -> (B, C, F2) -> (B, C, 1) -> (B, C)
        projected = self.mlp_psd(amplitude)
        scores = self.gvec(torch.tanh(projected)).squeeze(-1)
        return F.softmax(scaling * scores, dim=-1), ilens


================================================
FILE: nets/pytorch_backend/frontends/dnn_wpe.py
================================================
from typing import Tuple

from pytorch_wpe import wpe_one_iteration
import torch
from torch_complex.tensor import ComplexTensor

from espnet.nets.pytorch_backend.frontends.mask_estimator import MaskEstimator
from espnet.nets.pytorch_backend.nets_utils import make_pad_mask


class DNN_WPE(torch.nn.Module):
    """WPE dereverberation whose power estimate may be refined by a DNN mask."""

    def __init__(
        self,
        wtype: str = "blstmp",
        widim: int = 257,
        wlayers: int = 3,
        wunits: int = 300,
        wprojs: int = 320,
        dropout_rate: float = 0.0,
        taps: int = 5,
        delay: int = 3,
        use_dnn_mask: bool = True,
        iterations: int = 1,
        normalization: bool = False,
    ):
        """Store the WPE settings; build a mask estimator if `use_dnn_mask`."""
        super().__init__()
        self.iterations = iterations
        self.taps = taps
        self.delay = delay

        self.normalization = normalization
        self.use_dnn_mask = use_dnn_mask

        self.inverse_power = True

        if self.use_dnn_mask:
            self.mask_est = MaskEstimator(
                wtype, widim, wlayers, wunits, wprojs, dropout_rate, nmask=1
            )

    def forward(
        self, data: ComplexTensor, ilens: torch.LongTensor
    ) -> Tuple[ComplexTensor, torch.LongTensor, ComplexTensor]:
        """The forward function

        Notation:
            B: Batch
            C: Channel
            T: Time or Sequence length
            F: Freq or Some dimension of the feature vector

        Args:
            data: Input signal; the first step permutes it as
                (B, T, C, F) -> (B, F, C, T).
            ilens: (B,)
        Returns:
            enhanced: Dereverberated signal, permuted back to the input layout.
            ilens: (B,)
            mask: Estimated mask with its last and third-to-last axes swapped,
                or None when no mask was estimated.
        """
        # (B, T, C, F) -> (B, F, C, T)
        observed = data.permute(0, 3, 2, 1)
        enhanced = observed
        mask = None

        for step in range(self.iterations):
            # Power of the current estimate: (..., C, T)
            power = enhanced.real ** 2 + enhanced.imag ** 2
            if step == 0 and self.use_dnn_mask:
                # mask: (B, F, C, T), estimated on the first iteration only
                (mask,), _ = self.mask_est(enhanced, ilens)
                if self.normalization:
                    # Normalize along T
                    mask = mask / mask.sum(dim=-1)[..., None]
                # (..., C, T) * (..., C, T) -> (..., C, T)
                power = power * mask

            # Average over channels: (..., C, T) -> (..., T)
            power = power.mean(dim=-2)

            # Each iteration filters the observed signal with the new power
            enhanced = wpe_one_iteration(
                observed.contiguous(),
                power,
                taps=self.taps,
                delay=self.delay,
                inverse_power=self.inverse_power,
            )

            # Zero out the padded frames
            pad_mask = make_pad_mask(ilens, enhanced.real)
            enhanced.masked_fill_(pad_mask, 0)

        # (B, F, C, T) -> (B, T, C, F)
        enhanced = enhanced.permute(0, 3, 2, 1)
        if mask is not None:
            mask = mask.transpose(-1, -3)
        return enhanced, ilens, mask


================================================
FILE: nets/pytorch_backend/frontends/feature_transform.py
================================================
from typing import List
from typing import Tuple
from typing import Union

import librosa
import numpy as np
import torch
from torch_complex.tensor import ComplexTensor

from espnet.nets.pytorch_backend.nets_utils import make_pad_mask


class FeatureTransform(torch.nn.Module):
    """Turn a complex STFT into (optionally normalized) log-mel features.

    The pipeline is: power spectrum -> LogMel -> GlobalMVN (only when a stats
    file is given) -> UtteranceMVN (only when `apply_uttmvn` is true).
    """

    def __init__(
        self,
        # Mel options,
        fs: int = 16000,
        n_fft: int = 512,
        n_mels: int = 80,
        fmin: float = 0.0,
        fmax: float = None,
        # Normalization
        stats_file: str = None,
        apply_uttmvn: bool = True,
        uttmvn_norm_means: bool = True,
        uttmvn_norm_vars: bool = False,
    ):
        """Build the log-mel layer and the enabled normalization layers.

        Args:
            fs, n_fft, n_mels, fmin, fmax: Forwarded to LogMel.
            stats_file: Statistics file for GlobalMVN; None disables it.
            apply_uttmvn: Whether to apply utterance-level normalization.
            uttmvn_norm_means: Forwarded to UtteranceMVN as `norm_means`.
            uttmvn_norm_vars: Forwarded to UtteranceMVN as `norm_vars`.
        """
        super().__init__()
        self.apply_uttmvn = apply_uttmvn

        self.logmel = LogMel(fs=fs, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax)
        self.stats_file = stats_file
        if stats_file is not None:
            self.global_mvn = GlobalMVN(stats_file)
        else:
            self.global_mvn = None

        # Test truthiness: `apply_uttmvn` is a bool, so an `is not None` check
        # would always pass and build the layer even when it is disabled.
        if self.apply_uttmvn:
            self.uttmvn = UtteranceMVN(
                norm_means=uttmvn_norm_means, norm_vars=uttmvn_norm_vars
            )
        else:
            self.uttmvn = None

    def forward(
        self, x: ComplexTensor, ilens: Union[torch.LongTensor, np.ndarray, List[int]]
    ) -> Tuple[torch.Tensor, torch.LongTensor]:
        """Compute features from an STFT.

        Args:
            x: Complex STFT, (B, T, F) or (B, T, C, F).
            ilens: Lengths of the sequences; converted to a tensor on the
                device of `x` when it is not one already.

        Returns:
            Tuple of the real-valued features and `ilens` as a tensor.

        Raises:
            ValueError: If `x` is neither 3- nor 4-dimensional.
        """
        # (B, T, F) or (B, T, C, F)
        if x.dim() not in (3, 4):
            raise ValueError(f"Input dim must be 3 or 4: {x.dim()}")
        if not torch.is_tensor(ilens):
            ilens = torch.from_numpy(np.asarray(ilens)).to(x.device)

        if x.dim() == 4:
            # h: (B, T, C, F) -> h: (B, T, F)
            if self.training:
                # Select 1ch randomly
                ch = np.random.randint(x.size(2))
                h = x[:, :, ch, :]
            else:
                # Use the first channel
                h = x[:, :, 0, :]
        else:
            h = x

        # h: ComplexTensor(B, T, F) -> torch.Tensor(B, T, F)
        h = h.real ** 2 + h.imag ** 2

        h, _ = self.logmel(h, ilens)
        if self.stats_file is not None:
            h, _ = self.global_mvn(h, ilens)
        if self.apply_uttmvn:
            h, _ = self.uttmvn(h, ilens)

        return h, ilens


class LogMel(torch.nn.Module):
    """Convert STFT to fbank feats

    The arguments are the same as those of librosa.filters.mel

    Args:
        fs: number > 0 [scalar] sampling rate of the incoming signal
        n_fft: int > 0 [scalar] number of FFT components
        n_mels: int > 0 [scalar] number of Mel bands to generate
        fmin: float >= 0 [scalar] lowest frequency (in Hz)
        fmax: float >= 0 [scalar] highest frequency (in Hz).
            If `None`, use `fmax = fs / 2.0`
        htk: use HTK formula instead of Slaney
        norm: {None, 1, np.inf} [scalar]
            if 1, divide the triangular mel weights by the width of the mel band
            (area normalization).  Otherwise, leave all the triangles aiming for
            a peak value of 1.0

    """

    def __init__(
        self,
        fs: int = 16000,
        n_fft: int = 512,
        n_mels: int = 80,
        fmin: float = 0.0,
        fmax: float = None,
        htk: bool = False,
        norm=1,
    ):
        super().__init__()

        # Kept on the module so that extra_repr can print the configuration.
        self.mel_options = {
            "sr": fs,
            "n_fft": n_fft,
            "n_mels": n_mels,
            "fmin": fmin,
            "fmax": fmax,
            "htk": htk,
            "norm": norm,
        }

        # Note(kamo): The mel matrix of librosa is different from kaldi.
        mel_basis = librosa.filters.mel(**self.mel_options)
        # Stored transposed, (D2, D1) -> (D1, D2), to right-multiply features.
        self.register_buffer("melmat", torch.from_numpy(mel_basis.T).float())

    def extra_repr(self):
        """Return the mel options as comma separated ``key=value`` pairs."""
        pairs = [f"{k}={v}" for k, v in self.mel_options.items()]
        return ", ".join(pairs)

    def forward(
        self, feat: torch.Tensor, ilens: torch.LongTensor
    ) -> Tuple[torch.Tensor, torch.LongTensor]:
        """Project onto the mel basis, take the log and zero the padding.

        Args:
            feat: (B, T, D1)
            ilens: Lengths used to build the padding mask.

        Returns:
            Tuple of the log-mel features (B, T, D2) and `ilens`.
        """
        # feat: (B, T, D1) x melmat: (D1, D2) -> (B, T, D2)
        log_mel = torch.log(feat.matmul(self.melmat) + 1e-20)
        # Zero padding
        pad_mask = make_pad_mask(ilens, log_mel, 1)
        return log_mel.masked_fill(pad_mask, 0.0), ilens


class GlobalMVN(torch.nn.Module):
    """Apply global mean and variance normalization

    Args:
        stats_file(str): File read with ``numpy.load`` holding a 1-dim array.
            The first ``(len(array) - 1) / 2`` elements are treated as
            the sum of features,
            the following elements, excluding the last one, are
            treated as the sum of the squared features,
            and the last element is the number of samples.
        norm_means(bool): Add the negated global mean to the input.
        norm_vars(bool): Multiply the input by the inverse global std.
        eps(float): Floor applied to the standard deviation.
    """

    def __init__(
        self,
        stats_file: str,
        norm_means: bool = True,
        norm_vars: bool = True,
        eps: float = 1.0e-20,
    ):
        super().__init__()
        self.norm_means = norm_means
        self.norm_vars = norm_vars

        self.stats_file = stats_file
        stats = np.load(stats_file)

        stats = stats.astype(float)
        assert (len(stats) - 1) % 2 == 0, stats.shape

        count = stats.flatten()[-1]
        mean = stats[: (len(stats) - 1) // 2] / count
        var = stats[(len(stats) - 1) // 2 : -1] / count - mean * mean
        std = np.maximum(np.sqrt(var), eps)

        # Stored as additive bias and multiplicative scale
        self.register_buffer("bias", torch.from_numpy(-mean.astype(np.float32)))
        self.register_buffer("scale", torch.from_numpy(1 / std.astype(np.float32)))

    def extra_repr(self):
        return (
            f"stats_file={self.stats_file}, "
            f"norm_means={self.norm_means}, norm_vars={self.norm_vars}"
        )

    def forward(
        self, x: torch.Tensor, ilens: torch.LongTensor
    ) -> Tuple[torch.Tensor, torch.LongTensor]:
        """Normalize `x` in place and return it together with `ilens`.

        Args:
            x: Features (B, T, D); modified in place.
            ilens: Lengths used to build the padding mask.
        """
        # feat: (B, T, D)
        if self.norm_means:
            x += self.bias.type_as(x)
            # Adding the bias makes padded frames non-zero; reset them in
            # place. The out-of-place `masked_fill` returns a new tensor, so
            # calling it without using the result would do nothing.
            x.masked_fill_(make_pad_mask(ilens, x, 1), 0.0)

        if self.norm_vars:
            x *= self.scale.type_as(x)
        return x, ilens


class UtteranceMVN(torch.nn.Module):
    """Module wrapper around `utterance_mvn` that stores its options."""

    def __init__(
        self, norm_means: bool = True, norm_vars: bool = False, eps: float = 1.0e-20
    ):
        """Remember the options that are forwarded on every call."""
        super().__init__()
        self.eps = eps
        self.norm_vars = norm_vars
        self.norm_means = norm_means

    def extra_repr(self):
        """Return the normalization flags for the module repr."""
        return "norm_means={}, norm_vars={}".format(self.norm_means, self.norm_vars)

    def forward(
        self, x: torch.Tensor, ilens: torch.LongTensor
    ) -> Tuple[torch.Tensor, torch.LongTensor]:
        """Apply `utterance_mvn` to `x` with the stored options."""
        options = {
            "norm_means": self.norm_means,
            "norm_vars": self.norm_vars,
            "eps": self.eps,
        }
        return utterance_mvn(x, ilens, **options)


def utterance_mvn(
    x: torch.Tensor,
    ilens: torch.LongTensor,
    norm_means: bool = True,
    norm_vars: bool = False,
    eps: float = 1.0e-20,
) -> Tuple[torch.Tensor, torch.LongTensor]:
    """Apply utterance mean and variance normalization

    Statistics are computed per utterance over the time axis, dividing by the
    true length so that zero padding does not bias them.

    Args:
        x: (B, T, D), assumed zero padded. Normalization is applied in place.
        ilens: (B,)
        norm_means: Subtract the per-utterance mean.
        norm_vars: Divide by the per-utterance standard deviation.
        eps: Floor applied to the variance before the square root.

    Returns:
        Tuple of the normalized features and `ilens`. When both flags are
        False the input is returned unchanged.

    """
    if not norm_means and not norm_vars:
        # Nothing requested: do not hand back mean-subtracted features.
        return x, ilens

    ilens_ = ilens.type_as(x)
    pad_mask = make_pad_mask(ilens, x, 1)
    # mean: (B, D)
    mean = x.sum(dim=1) / ilens_[:, None]

    if norm_means:
        x -= mean[:, None, :]
        # Subtracting the mean turns the zero padding into -mean; restore the
        # zeros in place (an out-of-place masked_fill would be discarded).
        x.masked_fill_(pad_mask, 0.0)
        centered = x
    else:
        # Centered copy used only for the variance; `x` keeps its mean.
        centered = (x - mean[:, None, :]).masked_fill(pad_mask, 0.0)

    if norm_vars:
        # Padded frames are zero in `centered`, so they add nothing here.
        var = centered.pow(2).sum(dim=1) / ilens_[:, None]
        var = torch.clamp(var, min=eps)
        x /= var.sqrt()[:, None, :]
    return x, ilens


def feature_transform_for(args, n_fft):
    """Build a FeatureTransform from parsed arguments and the FFT size."""
    mel_options = dict(
        fs=args.fbank_fs,
        n_fft=n_fft,
        n_mels=args.n_mels,
        fmin=args.fbank_fmin,
        fmax=args.fbank_fmax,
    )
    normalization_options = dict(
        stats_file=args.stats_file,
        apply_uttmvn=args.apply_uttmvn,
        uttmvn_norm_means=args.uttmvn_norm_means,
        uttmvn_norm_vars=args.uttmvn_norm_vars,
    )
    return FeatureTransform(**mel_options, **normalization_options)


================================================
FILE: nets/pytorch_backend/frontends/frontend.py
================================================
from typing import List
from typing import Optional
from typing import Tuple
from typing import Union

import numpy
import torch
import torch.nn as nn
from torch_complex.tensor import ComplexTensor

from espnet.nets.pytorch_backend.frontends.dnn_beamformer import DNN_Beamformer
from espnet.nets.pytorch_backend.frontends.dnn_wpe import DNN_WPE


class Frontend(nn.Module):
    """Speech enhancement frontend combining optional WPE and beamforming."""

    def __init__(
        self,
        idim: int,
        # WPE options
        use_wpe: bool = False,
        wtype: str = "blstmp",
        wlayers: int = 3,
        wunits: int = 300,
        wprojs: int = 320,
        wdropout_rate: float = 0.0,
        taps: int = 5,
        delay: int = 3,
        use_dnn_mask_for_wpe: bool = True,
        # Beamformer options
        use_beamformer: bool = False,
        btype: str = "blstmp",
        blayers: int = 3,
        bunits: int = 300,
        bprojs: int = 320,
        bnmask: int = 2,
        badim: int = 320,
        ref_channel: int = -1,
        bdropout_rate=0.0,
    ):
        """Create the enabled sub-modules; disabled ones are set to None."""
        super().__init__()

        self.use_beamformer = use_beamformer
        self.use_wpe = use_wpe
        self.use_dnn_mask_for_wpe = use_dnn_mask_for_wpe
        # With more than two masks (e.g. multi-speaker separation) the
        # frontend is applied to all the data during training.
        self.use_frontend_for_all = bnmask > 2

        if self.use_wpe:
            # One pass when a DNN estimates the power (no significant gains
            # were observed), two passes for conventional WPE without it.
            iterations = 1 if self.use_dnn_mask_for_wpe else 2
            self.wpe = DNN_WPE(
                wtype=wtype,
                widim=idim,
                wunits=wunits,
                wprojs=wprojs,
                wlayers=wlayers,
                taps=taps,
                delay=delay,
                dropout_rate=wdropout_rate,
                iterations=iterations,
                use_dnn_mask=use_dnn_mask_for_wpe,
            )
        else:
            self.wpe = None

        if self.use_beamformer:
            self.beamformer = DNN_Beamformer(
                btype=btype,
                bidim=idim,
                bunits=bunits,
                bprojs=bprojs,
                blayers=blayers,
                bnmask=bnmask,
                dropout_rate=bdropout_rate,
                badim=badim,
                ref_channel=ref_channel,
            )
        else:
            self.beamformer = None

    def _select_stages(self):
        """Return the pair ``(use_wpe, use_beamformer)`` for this call.

        In evaluation mode the configured flags are used. In training mode one
        option is drawn at random from: no processing (unless the frontend is
        used for all data), WPE only (if enabled), beamformer only (if enabled).
        """
        if not self.training:
            return self.use_wpe, self.use_beamformer

        choices = [] if self.use_frontend_for_all else [(False, False)]
        if self.use_wpe:
            choices.append((True, False))
        if self.use_beamformer:
            choices.append((False, True))
        return choices[numpy.random.randint(len(choices))]

    def forward(
        self, x: ComplexTensor, ilens: Union[torch.LongTensor, numpy.ndarray, List[int]]
    ) -> Tuple[ComplexTensor, torch.LongTensor, Optional[ComplexTensor]]:
        """Enhance multi-channel input; pass single-channel input through.

        Args:
            x: (B, T, F) or (B, T, C, F)
            ilens: Lengths of the sequences; converted to a tensor on the
                device of `x` when it is not one already.

        Returns:
            Tuple of the (possibly enhanced) signal, `ilens` and the mask of
            the last applied stage (None when no stage was applied).

        Raises:
            ValueError: If `x` is neither 3- nor 4-dimensional.
        """
        assert len(x) == len(ilens), (len(x), len(ilens))
        # (B, T, F) or (B, T, C, F)
        if x.dim() not in (3, 4):
            raise ValueError(f"Input dim must be 3 or 4: {x.dim()}")
        if not torch.is_tensor(ilens):
            ilens = torch.from_numpy(numpy.asarray(ilens)).to(x.device)

        if x.dim() != 4:
            # Nothing to do without a channel axis
            return x, ilens, None

        use_wpe, use_beamformer = self._select_stages()
        h, mask = x, None

        # 1. WPE
        if use_wpe:
            # h: (B, T, C, F) -> h: (B, T, C, F)
            h, ilens, mask = self.wpe(h, ilens)

        # 2. Beamformer
        if use_beamformer:
            # h: (B, T, C, F) -> h: (B, T, F)
            h, ilens, mask = self.beamformer(h, ilens)

        return h, ilens, mask


def frontend_for(args, idim):
    """Build a ``Frontend`` from parsed command-line options.

    Args:
        args: Namespace-like object carrying the WPE (``use_wpe``, ``wtype``,
            ``wlayers``, ``wunits``, ``wprojs``, ``wdropout_rate``,
            ``wpe_taps``, ``wpe_delay``, ``use_dnn_mask_for_wpe``) and
            beamformer (``use_beamformer``, ``btype``, ``blayers``, ``bunits``,
            ``bprojs``, ``bnmask``, ``badim``, ``ref_channel``,
            ``bdropout_rate``) attributes.
        idim: Input dimension forwarded to ``Frontend``.

    Returns:
        The constructed ``Frontend`` instance.
    """
    # WPE options (note the renames: wpe_taps -> taps, wpe_delay -> delay)
    wpe_options = dict(
        use_wpe=args.use_wpe,
        wtype=args.wtype,
        wlayers=args.wlayers,
        wunits=args.wunits,
        wprojs=args.wprojs,
        wdropout_rate=args.wdropout_rate,
        taps=args.wpe_taps,
        delay=args.wpe_delay,
        use_dnn_mask_for_wpe=args.use_dnn_mask_for_wpe,
    )
    # Beamformer options
    beamformer_options = dict(
        use_beamformer=args.use_beamformer,
        btype=args.btype,
        blayers=args.blayers,
        bunits=args.bunits,
        bprojs=args.bprojs,
        bnmask=args.bnmask,
        badim=args.badim,
        ref_channel=args.ref_channel,
        bdropout_rate=args.bdropout_rate,
    )
    return Frontend(idim=idim, **wpe_options, **beamformer_options)


================================================
FILE: nets/pytorch_backend/frontends/mask_estimator.py
================================================
from typing import Tuple

import numpy as np
import torch
from torch.nn import functional as F
from torch_complex.tensor import ComplexTensor

from espnet.nets.pytorch_backend.nets_utils import make_pad_mask
from espnet.nets.pytorch_backend.rnn.encoders import RNN
from espnet.nets.pytorch_backend.rnn.encoders import RNNP


class MaskEstimator(torch.nn.Module):
    """Estimate ``nmask`` time-frequency masks from a multi-channel spectrum.

    The amplitude of the complex input is fed, channel by channel, through a
    recurrent network followed by one linear + sigmoid head per mask.
    """

    def __init__(self, type, idim, layers, units, projs, dropout, nmask=1):
        """Build the recurrent network and the per-mask output layers.

        Args:
            type: RNN type string, optionally prefixed with ``"vgg"`` and/or
                suffixed with ``"p"`` (projection variant, uses ``RNNP``).
            idim: Input feature dimension; also the output size of each mask.
            layers: Number of recurrent layers.
            units: Number of recurrent units.
            projs: Projection size; the input size of the mask heads.
            dropout: Dropout rate passed to the recurrent network.
            nmask: Number of masks to estimate.
        """
        super().__init__()
        # ``np.int`` was a plain alias of the builtin ``int`` and no longer
        # exists in current NumPy releases; the builtin yields the same dtype.
        subsample = np.ones(layers + 1, dtype=int)

        # Remove the "vgg" prefix and the "p" suffix as whole affixes.
        # str.lstrip/rstrip take a *set of characters*, so lstrip("vgg") also
        # ate the leading "g" of "gru" and handed "ru" to the encoder.
        typ = type
        if typ.startswith("vgg"):
            typ = typ[len("vgg") :]
        if typ.endswith("p"):
            typ = typ[:-1]

        if type.endswith("p"):
            self.brnn = RNNP(idim, layers, units, projs, subsample, dropout, typ=typ)
        else:
            self.brnn = RNN(idim, layers, units, projs, dropout, typ=typ)

        self.type = type
        self.nmask = nmask
        self.linears = torch.nn.ModuleList(
            [torch.nn.Linear(projs, idim) for _ in range(nmask)]
        )

    def forward(
        self, xs: ComplexTensor, ilens: torch.LongTensor
    ) -> Tuple[Tuple[torch.Tensor, ...], torch.LongTensor]:
        """Estimate the masks for a batch of spectra.

        Args:
            xs: (B, F, C, T)
            ilens: (B,)
        Returns:
            masks: A tuple of ``nmask`` masks, each (B, F, C, T), with values
                in [0, 1] and zeros on padded frames.
            ilens: (B,) the input lengths, unchanged.
        """
        assert xs.size(0) == ilens.size(0), (xs.size(0), ilens.size(0))
        _, _, C, input_length = xs.size()
        # (B, F, C, T) -> (B, C, T, F)
        xs = xs.permute(0, 2, 3, 1)

        # Calculate amplitude: (B, C, T, F) -> (B, C, T, F)
        xs = (xs.real ** 2 + xs.imag ** 2) ** 0.5
        # xs: (B, C, T, F) -> xs: (B * C, T, F)
        xs = xs.contiguous().view(-1, xs.size(-2), xs.size(-1))
        # ilens: (B,) -> ilens_: (B * C)
        ilens_ = ilens[:, None].expand(-1, C).contiguous().view(-1)

        # xs: (B * C, T, F) -> xs: (B * C, T, D)
        xs, _, _ = self.brnn(xs, ilens_)
        # xs: (B * C, T, D) -> xs: (B, C, T, D)
        xs = xs.view(-1, C, xs.size(-2), xs.size(-1))

        masks = []
        for linear in self.linears:
            # xs: (B, C, T, D) -> mask:(B, C, T, F)
            mask = linear(xs)

            mask = torch.sigmoid(mask)
            # Zero padding. masked_fill is out-of-place, so the result must be
            # kept; the in-place variant is avoided because sigmoid needs its
            # own output for the backward pass.
            mask = mask.masked_fill(make_pad_mask(ilens, mask, length_dim=2), 0)

            # (B, C, T, F) -> (B, F, C, T)
            mask = mask.permute(0, 3, 1, 2)

            # Takes care of multi-GPU cases: if input_length > max(ilens)
            if mask.size(-1) < input_length:
                mask = F.pad(mask, [0, input_length - mask.size(-1)], value=0)
            masks.append(mask)

        return tuple(masks), ilens


================================================
FILE: nets/pytorch_backend/gtn_ctc.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""GTN CTC implementation."""

import gtn
import torch


class GTNCTCLossFunction(torch.autograd.Function):
    """GTN CTC module."""

    # Copied from FB's GTN example implementation:
    # https://github.com/facebookresearch/gtn_applications/blob/master/utils.py#L251

    @staticmethod
    def create_ctc_graph(target, blank_idx):
        """Build gtn graph.

        The graph has ``2 * len(target) + 1`` nodes alternating between blank
        and label states. Every node has a self-loop and an arc from its
        predecessor; a label node additionally gets a skip arc from the
        previous label node when the two labels differ. The first node is the
        start node and the last two nodes are accepting.

        :param list target: single target sequence
        :param int blank_idx: index of blank token
        :return: gtn graph of target sequence
        :rtype: gtn.Graph
        """
        g_criterion = gtn.Graph(False)
        L = len(target)
        S = 2 * L + 1
        for s in range(S):
            idx = (s - 1) // 2
            g_criterion.add_node(s == 0, s == S - 1 or s == S - 2)
            label = target[idx] if s % 2 else blank_idx
            g_criterion.add_arc(s, s, label)
            if s > 0:
                g_criterion.add_arc(s - 1, s, label)
            if s % 2 and s > 1 and label != target[idx - 1]:
                g_criterion.add_arc(s - 2, s, label)
        g_criterion.arc_sort(False)
        return g_criterion

    @staticmethod
    def forward(ctx, log_probs, targets, blank_idx=0, reduction="none"):
        """Forward computation.

        :param torch.tensor log_probs: batched log softmax probabilities (B, Tmax, oDim)
        :param list targets: batched target sequences, list of lists
        :param int blank_idx: index of blank token
        :param str reduction: "none", or "mean" to divide each sequence loss
            by its target length (sequences with empty targets are not scaled)
        :return: ctc loss value, averaged over the batch
        :rtype: torch.Tensor
        :raises ValueError: if reduction is neither "none" nor "mean"
        """
        # Validate once up front rather than inside every parallel worker.
        if reduction not in ("none", "mean"):
            raise ValueError("invalid value for reduction '" + str(reduction) + "'")

        B, T, C = log_probs.shape
        losses = [None] * B
        scales = [None] * B
        emissions_graphs = [None] * B

        def process(b):
            # create emission graph
            g_emissions = gtn.linear_graph(T, C, log_probs.requires_grad)
            cpu_data = log_probs[b].cpu().contiguous()
            g_emissions.set_weights(cpu_data.data_ptr())

            # create criterion graph
            g_criterion = GTNCTCLossFunction.create_ctc_graph(targets[b], blank_idx)
            # compose the graphs
            g_loss = gtn.negate(
                gtn.forward_score(gtn.intersect(g_emissions, g_criterion))
            )

            scale = 1.0
            if reduction == "mean":
                L = len(targets[b])
                scale = 1.0 / L if L > 0 else scale

            # Save for backward:
            losses[b] = g_loss
            scales[b] = scale
            emissions_graphs[b] = g_emissions

        gtn.parallel_for(process, range(B))

        ctx.auxiliary_data = (losses, scales, emissions_graphs, log_probs.shape)
        loss = torch.tensor([losses[b].item() * scales[b] for b in range(B)])
        # Follow the input's device; a bare .cuda() would always pick the
        # current default GPU, which is wrong when the input lives elsewhere.
        return torch.mean(loss.to(log_probs.device))

    @staticmethod
    def backward(ctx, grad_output):
        """Backward computation.

        :param torch.tensor grad_output: backward passed gradient value
        :return: cumulative gradient output
        :rtype: (torch.Tensor, None, None, None)
        """
        losses, scales, emissions_graphs, in_shape = ctx.auxiliary_data
        B, T, C = in_shape
        input_grad = torch.empty((B, T, C))

        def process(b):
            gtn.backward(losses[b], False)
            emissions = emissions_graphs[b]
            grad = emissions.grad().weights_to_numpy()
            input_grad[b] = torch.from_numpy(grad).view(1, T, C) * scales[b]

        gtn.parallel_for(process, range(B))

        # Gradients are assembled on the CPU; move them next to grad_output
        # (no-op on CPU, and the right GPU in multi-device runs).
        input_grad = input_grad.to(grad_output.device)
        # forward() returns the batch mean, hence the 1 / B factor.
        input_grad *= grad_output / B

        return (
            input_grad,
            None,  # targets
            None,  # blank_idx
            None,  # reduction
        )


================================================
FILE: nets/pytorch_backend/initialization.py
================================================
#!/usr/bin/env python

# Copyright 2019 Kyoto University (Hirofumi Inaguma)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Initialization functions for RNN sequence-to-sequence models."""

import math


def lecun_normal_init_parameters(module):
    """Initialize parameters in the LeCun's manner.

    One-dimensional parameters (biases) are zeroed. Parameters with 2 to 4
    dimensions are drawn from N(0, 1 / fan_in), where fan_in is the size of
    dimension 1 multiplied by all trailing (kernel) dimensions.

    Raises:
        NotImplementedError: For a parameter of any other dimensionality.
    """
    for param in module.parameters():
        tensor = param.data
        rank = tensor.dim()

        if rank == 1:
            # bias
            tensor.zero_()
            continue

        if rank not in (2, 3, 4):
            raise NotImplementedError

        # linear weight: fan_in is size(1); conv weight: size(1) * kernel extents
        fan_in = tensor.size(1)
        for kernel_extent in tensor.size()[2:]:
            fan_in *= kernel_extent
        tensor.normal_(0, 1.0 / math.sqrt(fan_in))


def uniform_init_parameters(module):
    """Initialize parameters with an uniform distribution.

    Biases (1-dim) and linear weights (2-dim) are drawn from U(-0.1, 0.1).
    Convolution weights (3- or 4-dim) keep whatever values they already have.

    Raises:
        NotImplementedError: For a parameter of any other dimensionality.
    """
    for param in module.parameters():
        tensor = param.data
        rank = tensor.dim()
        if rank in (1, 2):
            # bias or linear weight
            tensor.uniform_(-0.1, 0.1)
        elif rank not in (3, 4):
            raise NotImplementedError
        # conv weight (rank 3 or 4): use the pytorch default


def set_forget_bias_to_one(bias):
    """Initialize a bias vector in the forget gate with one.

    The entries from index ``n // 4`` up to (not including) ``n // 2`` are set
    to 1.0 in place, where ``n`` is the length of ``bias``; the rest is left
    untouched.
    """
    length = bias.size(0)
    forget_slice = bias.data[length // 4 : length // 2]
    forget_slice.fill_(1.0)


================================================
FILE: nets/pytorch_backend/lm/__init__.py
================================================
"""Initialize sub package."""


================================================
FILE: nets/pytorch_backend/lm/default.py
================================================
"""Default Recurrent Neural Network Languge Model in `lm_train.py`."""

from typing import Any
from typing import List
from typing import Tuple

import logging
import torch
import torch.nn as nn
import torch.nn.functional as F

from espnet.nets.lm_interface import LMInterface
from espnet.nets.pytorch_backend.e2e_asr import to_device
from espnet.nets.scorer_interface import BatchScorerInterface
from espnet.utils.cli_utils import strtobool


class DefaultRNNLM(BatchScorerInterface, LMInterface, nn.Module):
    """Default RNNLM for `LMInterface` Implementation.

    Note:
        PyTorch seems to have memory leak when one GPU compute this after data parallel.
        If parallel GPUs compute this, it seems to be fine.
        See also https://github.com/espnet/espnet/issues/1075

    """

    @staticmethod
    def add_arguments(parser):
        """Add arguments to command line argument parser."""
        parser.add_argument(
            "--type",
            type=str,
            default="lstm",
            nargs="?",
            choices=["lstm", "gru"],
            help="Which type of RNN to use",
        )
        parser.add_argument(
            "--layer", "-l", type=int, default=2, help="Number of hidden layers"
        )
        parser.add_argument(
            "--unit", "-u", type=int, default=650, help="Number of hidden units"
        )
        parser.add_argument(
            "--embed-unit",
            default=None,
            type=int,
            help="Number of hidden units in embedding layer, "
            "if it is not specified, it keeps the same number with hidden units.",
        )
        parser.add_argument(
            "--dropout-rate", type=float, default=0.5, help="dropout probability"
        )
        parser.add_argument(
            "--emb-dropout-rate",
            type=float,
            default=0.0,
            help="emb dropout probability",
        )
        parser.add_argument(
            "--tie-weights",
            type=strtobool,
            default=False,
            help="Tie input and output embeddings",
        )
        return parser

    def __init__(self, n_vocab, args):
        """Initialize class.

        Args:
            n_vocab (int): The size of the vocabulary
            args (argparse.Namespace): configurations. see py:method:`add_arguments`

        """
        nn.Module.__init__(self)
        # NOTE: for a compatibility with less than 0.5.0 version models
        dropout_rate = getattr(args, "dropout_rate", 0.0)
        # NOTE: for a compatibility with less than 0.6.1 version models
        embed_unit = getattr(args, "embed_unit", None)
        # NOTE: for a compatibility with less than 0.9.7 version models
        emb_dropout_rate = getattr(args, "emb_dropout_rate", 0.0)
        # NOTE: for a compatibility with less than 0.9.7 version models
        tie_weights = getattr(args, "tie_weights", False)

        self.model = ClassifierWithState(
            RNNLM(
                n_vocab,
                args.layer,
                args.unit,
                embed_unit,
                args.type,
                dropout_rate,
                emb_dropout_rate,
                tie_weights,
            )
        )

    def state_dict(self):
        """Dump state dict."""
        return self.model.state_dict()

    def load_state_dict(self, d):
        """Load state dict."""
        self.model.load_state_dict(d)

    def forward(self, x, t):
        """Compute LM loss value from buffer sequences.

        Args:
            x (torch.Tensor): Input ids. (batch, len)
            t (torch.Tensor): Target ids. (batch, len)

        Returns:
            tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Tuple of
                loss to backward (scalar): the accumulated loss divided by
                the batch size,
                the accumulated loss itself (scalar), i.e. the sum over time
                steps of (mean per-sample loss) * (number of non-zero inputs)
                and
                the number of non-zero elements in x (scalar)

        Notes:
            The last two return values are used
            in perplexity: p(t)^{-n} = exp(-log p(t) / n)

        """
        loss = 0
        count = torch.tensor(0).long()
        state = None
        batch_size, sequence_length = x.shape
        for i in range(sequence_length):
            # Compute the loss at this time step and accumulate it
            state, loss_batch = self.model(state, x[:, i], t[:, i])
            # id 0 is treated as padding: weight the step by its non-zero inputs
            non_zeros = torch.sum(x[:, i] != 0, dtype=loss_batch.dtype)
            loss += loss_batch.mean() * non_zeros
            count += int(non_zeros)
        return loss / batch_size, loss, count.to(loss.device)

    def score(self, y, state, x):
        """Score new token.

        Args:
            y (torch.Tensor): 1D torch.int64 prefix tokens.
            state: Scorer state for prefix tokens
            x (torch.Tensor): 2D encoder feature that generates ys.

        Returns:
            tuple[torch.Tensor, Any]: Tuple of
                torch.float32 scores for next token (n_vocab)
                and next state for ys

        """
        new_state, scores = self.model.predict(state, y[-1].unsqueeze(0))
        return scores.squeeze(0), new_state

    def final_score(self, state):
        """Score eos.

        Args:
            state: Scorer state for prefix tokens

        Returns:
            float: final score

        """
        return self.model.final(state)

    # batch beam search API (see BatchScorerInterface)
    def batch_score(
        self, ys: torch.Tensor, states: List[Any], xs: torch.Tensor
    ) -> Tuple[torch.Tensor, List[Any]]:
        """Score new token batch.

        Args:
            ys (torch.Tensor): torch.int64 prefix tokens (n_batch, ylen).
            states (List[Any]): Scorer states for prefix tokens.
            xs (torch.Tensor):
                The encoder feature that generates ys (n_batch, xlen, n_feat).

        Returns:
            tuple[torch.Tensor, List[Any]]: Tuple of
                batchfied scores for next token with shape of `(n_batch, n_vocab)`
                and next state list for ys.

        """
        # merge states
        n_batch = len(ys)
        n_layers = self.model.predictor.n_layers
        if self.model.predictor.typ == "lstm":
            keys = ("c", "h")
        else:
            keys = ("h",)

        if states[0] is None:
            # only the first entry is inspected: a missing first state means
            # the predictor starts from its own zero state for the whole batch
            states = None
        else:
            # transpose state of [batch, key, layer] into [key, layer, batch]
            states = {
                k: [
                    torch.stack([states[b][k][i] for b in range(n_batch)])
                    for i in range(n_layers)
                ]
                for k in keys
            }
        states, logp = self.model.predict(states, ys[:, -1])

        # transpose state of [key, layer, batch] into [batch, key, layer]
        return (
            logp,
            [
                {k: [states[k][i][b] for i in range(n_layers)] for k in keys}
                for b in range(n_batch)
            ],
        )


class ClassifierWithState(nn.Module):
    """A wrapper for pytorch RNNLM."""

    def __init__(self, predictor, lossfun=None, label_key=-1):
        """Initialize class.

        :param torch.nn.Module predictor : The RNNLM
        :param function lossfun : The loss function to use; when omitted a
            fresh ``nn.CrossEntropyLoss(reduction="none")`` is created for
            this instance
        :param int/str label_key : position in ``args`` (int) or name in
            ``kwargs`` (str) of the ground truth labels passed to ``forward``
        :raises TypeError: if label_key is neither int nor str

        """
        if not (isinstance(label_key, (int, str))):
            raise TypeError("label_key must be int or str, but is %s" % type(label_key))
        super(ClassifierWithState, self).__init__()
        if lossfun is None:
            # Built per instance: a module used as a default argument would be
            # created once at definition time and shared by every wrapper.
            lossfun = nn.CrossEntropyLoss(reduction="none")
        self.lossfun = lossfun
        self.y = None
        self.loss = None
        self.label_key = label_key
        self.predictor = predictor

    def forward(self, state, *args, **kwargs):
        """Compute the loss value for an input and label pair.

        Notes:
            The predictor output and the loss are also stored in the
            attributes ``y`` and ``loss``.
            When ``label_key`` is ``int``, the corresponding element in ``args``
            is treated as ground truth labels. And when it is ``str``, the
            element in ``kwargs`` is used.
            The all elements of ``args`` and ``kwargs`` except the groundtruth
            labels are features.
            It feeds features to the predictor and compare the result
            with ground truth labels.

        :param torch.Tensor state : the LM state
        :param list[torch.Tensor] args : Input minibatch
        :param dict[torch.Tensor] kwargs : Input minibatch
        :return a tuple (new state, loss value)
        :rtype (torch.Tensor, torch.Tensor)
        :raises ValueError: if the label cannot be found via ``label_key``

        """
        if isinstance(self.label_key, int):
            if not (-len(args) <= self.label_key < len(args)):
                msg = "Label key %d is out of bounds" % self.label_key
                raise ValueError(msg)
            t = args[self.label_key]
            if self.label_key == -1:
                # args[0:] would keep everything, so -1 needs its own slice
                args = args[:-1]
            else:
                args = args[: self.label_key] + args[self.label_key + 1 :]
        elif isinstance(self.label_key, str):
            if self.label_key not in kwargs:
                msg = 'Label key "%s" is not found' % self.label_key
                raise ValueError(msg)
            t = kwargs.pop(self.label_key)

        self.y = None
        self.loss = None
        state, self.y = self.predictor(state, *args, **kwargs)
        self.loss = self.lossfun(self.y, t)
        return state, self.loss

    def predict(self, state, x):
        """Predict log probabilities for given state and input x using the predictor.

        If the predictor has a truthy ``normalized`` attribute its output is
        returned as is; otherwise ``log_softmax`` over dim 1 is applied.

        :param torch.Tensor state : The current state
        :param torch.Tensor x : The input
        :return a tuple (new state, log prob vector)
        :rtype (torch.Tensor, torch.Tensor)
        """
        if hasattr(self.predictor, "normalized") and self.predictor.normalized:
            return self.predictor(state, x)
        else:
            state, z = self.predictor(state, x)
            return state, F.log_softmax(z, dim=1)

    def buff_predict(self, state, x, n):
        """Predict new tokens from buffered inputs.

        A predictor whose class is named ``RNNLM`` handles the whole buffer in
        one call; any other predictor is called once per entry for the first
        ``n`` entries and the results are concatenated.
        """
        if self.predictor.__class__.__name__ == "RNNLM":
            return self.predict(state, x)

        new_state = []
        new_log_y = []
        for i in range(n):
            state_i = None if state is None else state[i]
            state_i, log_y = self.predict(state_i, x[i].unsqueeze(0))
            new_state.append(state_i)
            new_log_y.append(log_y)

        return new_state, torch.cat(new_log_y)

    def final(self, state, index=None):
        """Predict final log probabilities for given state using the predictor.

        :param state: The state
        :param index: optional index selecting one entry of ``state``
        :return The final log probabilities, or 0.0 when the predictor has no
            ``final`` method
        :rtype torch.Tensor
        """
        if hasattr(self.predictor, "final"):
            if index is not None:
                return self.predictor.final(state[index])
            else:
                return self.predictor.final(state)
        else:
            return 0.0


# Definition of a recurrent net for language modeling
class RNNLM(nn.Module):
    """A pytorch RNNLM."""

    def __init__(
        self,
        n_vocab,
        n_layers,
        n_units,
        n_embed=None,
        typ="lstm",
        dropout_rate=0.5,
        emb_dropout_rate=0.0,
        tie_weights=False,
    ):
        """Initialize class.

        :param int n_vocab: The size of the vocabulary
        :param int n_layers: The number of layers to create
        :param int n_units: The number of units per layer
        :param int n_embed: The embedding size (defaults to n_units)
        :param str typ: The RNN type ("lstm"; anything else builds GRU cells)
        :param float dropout_rate: Dropout applied before every cell and
            before the output layer
        :param float emb_dropout_rate: Extra dropout on the embedding
            (disabled when 0.0)
        :param bool tie_weights: Share the embedding and output weights
        """
        super(RNNLM, self).__init__()
        embed_dim = n_units if n_embed is None else n_embed

        self.embed = nn.Embedding(n_vocab, embed_dim)
        self.embed_drop = (
            nn.Dropout(emb_dropout_rate) if emb_dropout_rate != 0.0 else None
        )

        # first cell consumes the embedding, the remaining ones the hidden size
        cell_cls = nn.LSTMCell if typ == "lstm" else nn.GRUCell
        input_sizes = [embed_dim] + [n_units] * (n_layers - 1)
        self.rnn = nn.ModuleList([cell_cls(size, n_units) for size in input_sizes])

        # one dropout per layer input plus one in front of the output layer
        self.dropout = nn.ModuleList(
            [nn.Dropout(dropout_rate) for _ in range(n_layers + 1)]
        )
        self.lo = nn.Linear(n_units, n_vocab)
        self.n_layers = n_layers
        self.n_units = n_units
        self.typ = typ

        logging.info("Tie weights set to {}".format(tie_weights))
        logging.info("Dropout set to {}".format(dropout_rate))
        logging.info("Emb Dropout set to {}".format(emb_dropout_rate))

        if tie_weights:
            assert (
                embed_dim == n_units
            ), "Tie Weights: True need embedding and final dimensions to match"
            self.lo.weight = self.embed.weight

        # every parameter (biases included) starts from U(-0.1, 0.1)
        for weight in self.parameters():
            weight.data.uniform_(-0.1, 0.1)

    def zero_state(self, batchsize):
        """Return a (batchsize, n_units) zero tensor matching the parameters."""
        reference = next(self.parameters())
        zeros = torch.zeros(batchsize, self.n_units)
        return zeros.to(device=reference.device, dtype=reference.dtype)

    def forward(self, state, x):
        """Advance the language model by one step.

        :param dict state: previous state with per-layer lists under "h"
            (and "c" for LSTM), or None to start from zeros
        :param torch.Tensor x: input token ids
        :return a tuple (new state, output of the final linear layer)
        """
        is_lstm = self.typ == "lstm"

        if state is None:
            batch = x.size(0)
            hidden0 = [
                to_device(x, self.zero_state(batch)) for _ in range(self.n_layers)
            ]
            state = {"h": hidden0}
            if is_lstm:
                cell0 = [
                    to_device(x, self.zero_state(batch)) for _ in range(self.n_layers)
                ]
                state = {"c": cell0, "h": hidden0}

        emb = self.embed(x)
        if self.embed_drop is not None:
            emb = self.embed_drop(emb)

        # run the stack bottom-up; each layer consumes the previous output
        hidden = []
        cells = []
        layer_input = emb
        for n in range(self.n_layers):
            dropped = self.dropout[n](layer_input)
            if is_lstm:
                h_n, c_n = self.rnn[n](dropped, (state["h"][n], state["c"][n]))
                cells.append(c_n)
            else:
                h_n = self.rnn[n](dropped, state["h"][n])
            hidden.append(h_n)
            layer_input = h_n

        new_state = {"c": cells, "h": hidden} if is_lstm else {"h": hidden}
        y = self.lo(self.dropout[-1](hidden[-1]))
        return new_state, y


================================================
FILE: nets/pytorch_backend/lm/seq_rnn.py
================================================
"""Sequential implementation of Recurrent Neural Network Language Model."""

import torch
import torch.nn as nn
import torch.nn.functional as F

from espnet.nets.lm_interface import LMInterface


class SequentialRNNLM(LMInterface, torch.nn.Module):
    """Sequential RNNLM.

    See also:
        https://github.com/pytorch/examples/blob/4581968193699de14b56527296262dd76ab43557/word_language_model/model.py

    """

    @staticmethod
    def add_arguments(parser):
        """Add arguments to command line argument parser."""
        parser.add_argument(
            "--type",
            type=str,
            default="lstm",
            nargs="?",
            choices=["lstm", "gru"],
            help="Which type of RNN to use",
        )
        parser.add_argument(
            "--layer", "-l", type=int, default=2, help="Number of hidden layers"
        )
        parser.add_argument(
            "--unit", "-u", type=int, default=650, help="Number of hidden units"
        )
        parser.add_argument(
            "--dropout-rate", type=float, default=0.5, help="dropout probability"
        )
        return parser

    def __init__(self, n_vocab, args):
        """Initialize class.

        Args:
            n_vocab (int): The size of the vocabulary
            args (argparse.Namespace): configurations. see py:method:`add_arguments`

        """
        torch.nn.Module.__init__(self)
        # the embedding and the hidden state share the same width (args.unit)
        width = args.unit
        self._setup(
            rnn_type=args.type.upper(),
            ntoken=n_vocab,
            ninp=width,
            nhid=width,
            nlayers=args.layer,
            dropout=args.dropout_rate,
        )

    def _setup(
        self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False
    ):
        """Create the embedding, recurrent stack and output layer.

        Raises:
            ValueError: For an unknown ``rnn_type``, or when ``tie_weights``
                is requested with ``nhid != ninp``.
        """
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)

        if rnn_type in ("LSTM", "GRU"):
            recurrent_cls = getattr(nn, rnn_type)
            self.rnn = recurrent_cls(ninp, nhid, nlayers, dropout=dropout)
        else:
            activations = {"RNN_TANH": "tanh", "RNN_RELU": "relu"}
            try:
                nonlinearity = activations[rnn_type]
            except KeyError:
                raise ValueError(
                    "An invalid option for `--model` was supplied, "
                    "options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']"
                )
            self.rnn = nn.RNN(
                ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout
            )

        self.decoder = nn.Linear(nhid, ntoken)

        # Optionally tie weights as in:
        # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
        # https://arxiv.org/abs/1608.05859
        # and
        # "Tying Word Vectors and Word Classifiers:
        #  A Loss Framework for Language Modeling" (Inan et al. 2016)
        # https://arxiv.org/abs/1611.01462
        if tie_weights:
            if nhid != ninp:
                raise ValueError(
                    "When using the tied flag, nhid must be equal to emsize"
                )
            self.decoder.weight = self.encoder.weight

        self._init_weights()

        self.rnn_type = rnn_type
        self.nhid = nhid
        self.nlayers = nlayers

    def _init_weights(self):
        """Draw every parameter from U(-0.1, 0.1), like default.py:RNNLM."""
        # NOTE: pytorch/examples originally initialised only the encoder
        # weight and decoder weight uniformly (initrange 0.1) and zeroed the
        # decoder bias; here all parameters get the uniform init instead.
        for weight in self.parameters():
            weight.data.uniform_(-0.1, 0.1)

    def forward(self, x, t):
        """Compute LM loss value from buffer sequences.

        Args:
            x (torch.Tensor): Input ids. (batch, len)
            t (torch.Tensor): Target ids. (batch, len)

        Returns:
            tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Tuple of
                loss to backward (scalar),
                negative log-likelihood of t: -log p(t) (scalar) and
                the number of elements in x (scalar)

        Notes:
            The last two return values are used
            in perplexity: p(t)^{-n} = exp(-log p(t) / n)

        """
        logits, _ = self._before_loss(x, None)
        vocab = logits.shape[-1]
        token_loss = F.cross_entropy(
            logits.view(-1, vocab), t.view(-1), reduction="none"
        )
        # positions whose input id is 0 do not contribute to loss or count
        keep = (x != 0).to(logits.dtype)
        nll = (token_loss * keep.view(-1)).sum()
        n_tokens = keep.sum()
        return nll / n_tokens, nll, n_tokens

    def _before_loss(self, input, hidden):
        """Return the output-layer activations and the new recurrent state."""
        embedded = self.drop(self.encoder(input))
        output, hidden = self.rnn(embedded, hidden)
        output = self.drop(output)
        dim0, dim1, width = output.size(0), output.size(1), output.size(2)
        # flatten the two leading dims for the linear layer, then restore them
        decoded = self.decoder(output.view(dim0 * dim1, width))
        return decoded.view(dim0, dim1, decoded.size(1)), hidden

    def init_state(self, x):
        """Get an initial state for decoding.

        Args:
            x (torch.Tensor): The encoded feature tensor

        Returns: initial state

        """
        batch = 1
        reference = next(self.parameters())
        shape = (self.nlayers, batch, self.nhid)
        if self.rnn_type != "LSTM":
            return reference.new_zeros(*shape)
        # LSTM carries a pair of separate tensors
        return (reference.new_zeros(*shape), reference.new_zeros(*shape))

    def score(self, y, state, x):
        """Score new token.

        Args:
            y (torch.Tensor): 1D torch.int64 prefix tokens.
            state: Scorer state for prefix tokens
            x (torch.Tensor): 2D encoder feature that generates ys.

        Returns:
            tuple[torch.Tensor, Any]: Tuple of
                torch.float32 scores for next token (n_vocab)
                and next state for ys

        """
        last_token = y[-1].view(1, 1)
        logits, new_state = self._before_loss(last_token, state)
        return logits.log_softmax(dim=-1).view(-1), new_state


================================================
FILE: nets/pytorch_backend/lm/transformer.py
================================================
"""Transformer language model."""

from typing import Any
from typing import List
from typing import Tuple

import logging
import torch
import torch.nn as nn
import torch.nn.functional as F

from espnet.nets.lm_interface import LMInterface
from espnet.nets.pytorch_backend.transformer.embedding import PositionalEncoding
from espnet.nets.pytorch_backend.transformer.encoder import Encoder
from espnet.nets.pytorch_backend.transformer.mask import subsequent_mask
from espnet.nets.scorer_interface import BatchScorerInterface
from espnet.utils.cli_utils import strtobool


class TransformerLM(nn.Module, LMInterface, BatchScorerInterface):
    """Language model built from a Transformer encoder with a causal mask.

    Token ids are embedded, passed through ``Encoder`` under a mask that hides
    future positions and positions whose id is 0, and projected to vocabulary
    logits by a linear layer.
    """

    @staticmethod
    def add_arguments(parser):
        """Register the model options on ``parser`` and return the parser."""
        int_options = (
            ("--layer", 4, "Number of hidden layers"),
            ("--unit", 1024, "Number of hidden units in feedforward layer"),
            ("--att-unit", 256, "Number of hidden units in attention layer"),
            ("--embed-unit", 128, "Number of hidden units in embedding layer"),
            ("--head", 2, "Number of multi head attention"),
        )
        for flag, default, text in int_options:
            parser.add_argument(flag, type=int, default=default, help=text)

        float_options = (
            ("--dropout-rate", 0.5, "dropout probability"),
            ("--att-dropout-rate", 0.0, "att dropout probability"),
            ("--emb-dropout-rate", 0.0, "emb dropout probability"),
        )
        for flag, default, text in float_options:
            parser.add_argument(flag, type=float, default=default, help=text)

        parser.add_argument(
            "--tie-weights",
            type=strtobool,
            default=False,
            help="Tie input and output embeddings",
        )
        parser.add_argument(
            "--pos-enc",
            default="sinusoidal",
            choices=["sinusoidal", "none"],
            help="positional encoding",
        )
        return parser

    def __init__(self, n_vocab, args):
        """Build the embedding, the Transformer encoder and the output layer.

        Args:
            n_vocab (int): The size of the vocabulary
            args (argparse.Namespace): configurations. see py:method:`add_arguments`

        Raises:
            ValueError: If ``args.pos_enc`` is neither "sinusoidal" nor "none".

        """
        nn.Module.__init__(self)

        # NOTE: these three options may be missing from the configuration of
        # models older than version 0.9.7, hence the getattr fallbacks.
        emb_dropout_rate = getattr(args, "emb_dropout_rate", 0.0)
        tie_weights = getattr(args, "tie_weights", False)
        att_dropout_rate = getattr(args, "att_dropout_rate", 0.0)

        if args.pos_enc == "sinusoidal":
            pos_enc_class = PositionalEncoding
        elif args.pos_enc == "none":

            def pos_enc_class(*_args, **_kwargs):
                # An empty Sequential acts as the identity.
                return nn.Sequential()

        else:
            raise ValueError(f"unknown pos-enc option: {args.pos_enc}")

        self.embed = nn.Embedding(n_vocab, args.embed_unit)
        # Embedding dropout is skipped entirely when its rate is zero.
        self.embed_drop = (
            nn.Dropout(emb_dropout_rate) if emb_dropout_rate != 0.0 else None
        )
        self.encoder = Encoder(
            idim=args.embed_unit,
            attention_dim=args.att_unit,
            attention_heads=args.head,
            linear_units=args.unit,
            num_blocks=args.layer,
            dropout_rate=args.dropout_rate,
            attention_dropout_rate=att_dropout_rate,
            input_layer="linear",
            pos_enc_class=pos_enc_class,
        )
        self.decoder = nn.Linear(args.att_unit, n_vocab)

        logging.info("Tie weights set to {}".format(tie_weights))
        logging.info("Dropout set to {}".format(args.dropout_rate))
        logging.info("Emb Dropout set to {}".format(emb_dropout_rate))
        logging.info("Att Dropout set to {}".format(att_dropout_rate))

        if tie_weights:
            # Sharing one matrix requires identical input/output widths.
            assert (
                args.att_unit == args.embed_unit
            ), "Tie Weights: True need embedding and final dimensions to match"
            self.decoder.weight = self.embed.weight

    def _embed(self, ids):
        """Embed token ids, applying embedding dropout when it is configured."""
        embedded = self.embed(ids)
        if self.embed_drop is None:
            return embedded
        return self.embed_drop(embedded)

    def _target_mask(self, ys_in_pad):
        """Combine the "id is not 0" mask with the subsequent (causal) mask."""
        non_pad = (ys_in_pad != 0).unsqueeze(-2)
        causal = subsequent_mask(
            ys_in_pad.size(-1), device=ys_in_pad.device
        ).unsqueeze(0)
        return non_pad & causal

    def forward(
        self, x: torch.Tensor, t: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Compute LM loss value from buffer sequences.

        Args:
            x (torch.Tensor): Input ids. (batch, len)
            t (torch.Tensor): Target ids. (batch, len)

        Returns:
            tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Tuple of
                loss to backward (scalar),
                negative log-likelihood of t: -log p(t) (scalar) and
                the number of elements in x (scalar)

        Notes:
            The last two return values are used
            in perplexity: p(t)^{-n} = exp(-log p(t) / n)

        """
        # Positions whose input id is 0 do not contribute to the loss.
        non_pad = x != 0

        hidden, _ = self.encoder(self._embed(x), self._target_mask(x))
        logits = self.decoder(hidden)
        token_nll = F.cross_entropy(
            logits.view(-1, logits.shape[-1]), t.view(-1), reduction="none"
        )
        weights = non_pad.to(dtype=token_nll.dtype)
        nll = (token_nll * weights.view(-1)).sum()
        count = weights.sum()
        return nll / count, nll, count

    def score(
        self, y: torch.Tensor, state: Any, x: torch.Tensor
    ) -> Tuple[torch.Tensor, Any]:
        """Score new token.

        Args:
            y (torch.Tensor): 1D torch.int64 prefix tokens.
            state: Scorer state for prefix tokens
            x (torch.Tensor): encoder feature that generates ys.

        Returns:
            tuple[torch.Tensor, Any]: Tuple of
                torch.float32 scores for next token (n_vocab)
                and next state for ys

        """
        # Add a batch axis of size one in front of the prefix.
        prefix = y.unsqueeze(0)
        hidden, _, cache = self.encoder.forward_one_step(
            self._embed(prefix), self._target_mask(prefix), cache=state
        )
        # Only the last position predicts the next token.
        logits = self.decoder(hidden[:, -1])
        return logits.log_softmax(dim=-1).squeeze(0), cache

    def score_partial(
        self, y: torch.Tensor, next_tokens: Any, state: Any, x: torch.Tensor
    ) -> Tuple[torch.Tensor, Any]:
        """Score the next token and keep only the entries at ``next_tokens``.

        The full distribution is computed by :meth:`score` and then indexed
        with ``next_tokens``; the state returned by :meth:`score` is passed on.
        """
        full_scores, new_state = self.score(y, state, x)
        return full_scores[next_tokens], new_state

    def select_state(self, states, i):
        """Return ``states`` unchanged; the index ``i`` is ignored."""
        return states

    # batch beam search API (see BatchScorerInterface)
    def batch_score(
        self, ys: torch.Tensor, states: List[Any], xs: torch.Tensor
    ) -> Tuple[torch.Tensor, List[Any]]:
        """Score new token batch (required).

        Args:
            ys (torch.Tensor): torch.int64 prefix tokens (n_batch, ylen).
            states (List[Any]): Scorer states for prefix tokens.
            xs (torch.Tensor):
                The encoder feature that generates ys (n_batch, xlen, n_feat).

        Returns:
            tuple[torch.Tensor, List[Any]]: Tuple of
                batchfied scores for next token with shape of `(n_batch, n_vocab)`
                and next state list for ys.

        """
        n_batch = len(ys)
        n_layers = len(self.encoder.encoders)

        # Merge the per-hypothesis states: [batch][layer] -> [layer] stacked
        # over the batch. A ``None`` first state means there is no cache yet.
        if states[0] is None:
            batch_state = None
        else:
            batch_state = [
                torch.stack([states[b][layer] for b in range(n_batch)])
                for layer in range(n_layers)
            ]

        # batch decoding
        hidden, _, new_states = self.encoder.forward_one_step(
            self._embed(ys), self._target_mask(ys), cache=batch_state
        )
        logp = self.decoder(hidden[:, -1]).log_softmax(dim=-1)

        # Split the cache back: [layer][batch] -> [batch][layer].
        state_list = [
            [new_states[layer][b] for layer in range(n_layers)]
            for b in range(n_batch)
        ]
        return logp, state_list


================================================
FILE: nets/pytorch_backend/maskctc/__init__.py
================================================
"""Initialize sub package."""


================================================
FILE: nets/pytorch_backend/maskctc/add_mask_token.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright 2020 Johns Hopkins University (Shinji Watanabe)
#                Waseda University (Yosuke Higuchi)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Token masking module for Masked LM."""

import numpy


def mask_uniform(ys_pad, mask_token, eos, ignore_id):
    """Mask random target positions and build the matching prediction targets.

    For every sequence a draw count is sampled uniformly between one and the
    sequence length, and that many positions are drawn with replacement, so
    the number of distinct masked positions can be smaller than the count.
    Masked positions hold <mask> in the input and the original token in the
    output; all other output positions hold the padding index.
    :param torch.Tensor ys_pad: batch of padded target sequences (B, Lmax)
    :param int mask_token: index of <mask>
    :param int eos: index of <eos>, used to pad the masked inputs
    :param int ignore_id: index of padding
    :return: masked input tensor padded with <eos> (B, Lmax)
    :rtype: torch.Tensor
    :return: output tensor padded with the padding index (B, Lmax)
    :rtype: torch.Tensor
    """
    from espnet.nets.pytorch_backend.nets_utils import pad_list

    # Strip the padding from every sequence.
    targets = [row[row != ignore_id] for row in ys_pad]
    masked_inputs = [seq.clone() for seq in targets]
    masked_targets = [seq.new_empty(seq.size()).fill_(ignore_id) for seq in targets]

    for seq, seq_in, seq_out in zip(targets, masked_inputs, masked_targets):
        length = len(seq)
        n_draws = numpy.random.randint(1, length + 1)
        positions = numpy.random.choice(length, n_draws)

        seq_in[positions] = mask_token
        seq_out[positions] = seq[positions]

    return pad_list(masked_inputs, eos), pad_list(masked_targets, ignore_id)


================================================
FILE: nets/pytorch_backend/maskctc/mask.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright 2020 Johns Hopkins University (Shinji Watanabe)
#                Waseda University (Yosuke Higuchi)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Attention masking module for Masked LM."""


def square_mask(ys_in_pad, ignore_id):
    """Build a square attention mask that excludes padding tokens.

    Entry (b, i, j) is true only when both position i and position j of
    sequence b differ from ``ignore_id``.

    :param torch.Tensor ys_in_pad: batch of padded target sequences (B, Lmax)
    :param int ignore_id: index of padding
    :rtype: torch.Tensor (B, Lmax, Lmax)
    """
    valid = (ys_in_pad != ignore_id).unsqueeze(-2)  # (B, 1, Lmax)
    seq_len = valid.size(-1)
    # Validity of the attended position, shared by every row.
    as_columns = valid.expand(-1, seq_len, -1)
    # Validity of the attending position, shared by every column.
    as_rows = valid.transpose(1, 2).expand(-1, -1, seq_len)
    return as_columns & as_rows


================================================
FILE: nets/pytorch_backend/nets_utils.py
================================================
# -*- coding: utf-8 -*-

"""Network related utility tools."""

import logging
from typing import Dict

import numpy as np
import torch


def to_device(m, x):
    """Send tensor into the device of the module.

    Args:
        m (torch.nn.Module or torch.Tensor): Object whose device is used.
            For a module, the device of its first parameter is taken.
        x (Tensor): Torch tensor.

    Returns:
        Tensor: Torch tensor located in the same place as ``m``.

    Raises:
        TypeError: If ``m`` is neither a module nor a tensor.

    """
    if isinstance(m, torch.nn.Module):
        device = next(m.parameters()).device
    elif isinstance(m, torch.Tensor):
        device = m.device
    else:
        raise TypeError(
            "Expected torch.nn.Module or torch.tensor, " f"but got: {type(m)}"
        )
    return x.to(device)


def pad_list(xs, pad_value):
    """Stack variable-length tensors into one padded batch tensor.

    Args:
        xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
        pad_value (float): Value written where a sequence has no data.

    Returns:
        Tensor: Padded tensor (B, Tmax, `*`) with the dtype and device of
            the first list element.

    Examples:
        >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)]
        >>> pad_list(x, 0)
        tensor([[1., 1., 1., 1.],
                [1., 1., 0., 0.],
                [1., 0., 0., 0.]])

    """
    batch_size = len(xs)
    longest = max(seq.size(0) for seq in xs)
    template = xs[0]
    # Start from a buffer filled with the padding value, then copy each
    # sequence over the leading part of its row.
    out_shape = (batch_size, longest) + tuple(template.size()[1:])
    padded = template.new_empty(out_shape).fill_(pad_value)
    for row, seq in enumerate(xs):
        padded[row, : seq.size(0)] = seq
    return padded


def make_pad_mask(lengths, xs=None, length_dim=-1):
    """Make mask tensor that is true on the padded part of each sequence.

    Args:
        lengths (LongTensor or List): Batch of lengths (B,).
        xs (Tensor, optional): The reference tensor.
            If set, the mask is expanded to the shape of this tensor and
            moved to its device.
        length_dim (int, optional): Dimension of ``xs`` that runs along the
            sequence. Must not be 0. See the examples.

    Returns:
        Tensor: Mask tensor containing indices of padded part.
                dtype=torch.uint8 in PyTorch 1.2-
                dtype=torch.bool in PyTorch 1.2+ (including 1.2)

    Raises:
        ValueError: If ``length_dim`` is 0.

    Examples:
        Masks are shown as 0/1 for brevity.

        With only lengths.

        >>> lengths = [5, 3, 2]
        >>> make_pad_mask(lengths)
        masks = [[0, 0, 0, 0 ,0],
                 [0, 0, 0, 1, 1],
                 [0, 0, 1, 1, 1]]

        With the reference tensor, the mask follows the last dimension and
        is repeated along the others.

        >>> xs = torch.zeros((3, 2, 4))
        >>> make_pad_mask(lengths, xs)
        tensor([[[0, 0, 0, 0],
                 [0, 0, 0, 0]],
                [[0, 0, 0, 1],
                 [0, 0, 0, 1]],
                [[0, 0, 1, 1],
                 [0, 0, 1, 1]]])

        With the reference tensor and dimension indicator, the mask follows
        the given dimension instead.

        >>> xs = torch.zeros((3, 6, 2))
        >>> make_pad_mask(lengths, xs, 1)[1]
        tensor([[0, 0],
                [0, 0],
                [0, 0],
                [1, 1],
                [1, 1],
                [1, 1]])

    """
    if length_dim == 0:
        raise ValueError("length_dim cannot be 0: {}".format(length_dim))

    length_list = lengths if isinstance(lengths, list) else lengths.tolist()
    n_batch = len(length_list)
    # Without a reference tensor the longest sequence sets the mask width.
    n_steps = int(max(length_list)) if xs is None else xs.size(length_dim)

    positions = torch.arange(0, n_steps, dtype=torch.int64)
    position_grid = positions.unsqueeze(0).expand(n_batch, n_steps)
    length_column = position_grid.new(length_list).unsqueeze(-1)
    # A position is padding once its index reaches the sequence length.
    mask = position_grid >= length_column

    if xs is None:
        return mask

    assert xs.size(0) == n_batch, (xs.size(0), n_batch)
    axis = length_dim if length_dim >= 0 else xs.dim() + length_dim
    # Keep the batch and length axes, insert size-one axes everywhere else:
    # index = (:, None, ..., None, :, None, ..., None)
    index = tuple(
        slice(None) if dim in (0, axis) else None for dim in range(xs.dim())
    )
    return mask[index].expand_as(xs).to(xs.device)


def make_non_pad_mask(lengths, xs=None, length_dim=-1):
    """Make mask tensor that is true on the non-padded part of each sequence.

    This is the element-wise inverse of ``make_pad_mask`` called with the
    same arguments.

    Args:
        lengths (LongTensor or List): Batch of lengths (B,).
        xs (Tensor, optional): The reference tensor.
            If set, masks will be the same shape as this tensor.
        length_dim (int, optional): Dimension indicator of the above tensor.
            See the example.

    Returns:
        ByteTensor: mask tensor containing indices of non-padded part.
                    dtype=torch.uint8 in PyTorch 1.2-
                    dtype=torch.bool in PyTorch 1.2+ (including 1.2)

    Examples:
        Masks are shown as 0/1 for brevity.

        With only lengths.

        >>> lengths = [5, 3, 2]
        >>> make_non_pad_mask(lengths)
        masks = [[1, 1, 1, 1 ,1],
                 [1, 1, 1, 0, 0],
                 [1, 1, 0, 0, 0]]

        With the reference tensor.

        >>> xs = torch.zeros((3, 2, 4))
        >>> make_non_pad_mask(lengths, xs)
        tensor([[[1, 1, 1, 1],
                 [1, 1, 1, 1]],
                [[1, 1, 1, 0],
                 [1, 1, 1, 0]],
                [[1, 1, 0, 0],
                 [1, 1, 0, 0]]])

        With the reference tensor and dimension indicator.

        >>> xs = torch.zeros((3, 6, 2))
        >>> make_non_pad_mask(lengths, xs, 1)[1]
        tensor([[1, 1],
                [1, 1],
                [1, 1],
                [0, 0],
                [0, 0],
                [0, 0]])

    """
    pad_mask = make_pad_mask(lengths, xs, length_dim)
    return ~pad_mask


def mask_by_length(xs, lengths, fill=0):
    """Copy each sequence up to its length and fill the rest with ``fill``.

    Args:
        xs (Tensor): Batch of input tensor (B, `*`).
        lengths (LongTensor or List): Batch of lengths (B,).
        fill (int or float): Value to fill masked part.

    Returns:
        Tensor: Batch of masked input tensor (B, `*`). This is a new tensor;
            ``xs`` is not modified.

    Examples:
        >>> x = torch.arange(5).repeat(3, 1) + 1
        >>> lengths = [5, 3, 2]
        >>> mask_by_length(x, lengths)
        tensor([[1, 2, 3, 4, 5],
                [1, 2, 3, 0, 0],
                [1, 2, 0, 0, 0]])

    """
    assert xs.size(0) == len(lengths)
    # Begin with a tensor holding only the fill value, then copy the valid
    # prefix of every row into it.
    masked = xs.data.new_empty(xs.size()).fill_(fill)
    for row, valid_len in enumerate(lengths):
        masked[row, :valid_len] = xs[row, :valid_len]
    return masked


def th_accuracy(pad_outputs, pad_targets, ignore_label):
    """Compute the fraction of non-ignored targets that are predicted correctly.

    Args:
        pad_outputs (Tensor): Prediction tensors (B * Lmax, D).
        pad_targets (LongTensor): Target label tensors, reshaped against the
            predictions as (B, Lmax).
        ignore_label (int): Ignore label id.

    Returns:
        float: Accuracy value (0.0 - 1.0).

    """
    n_batch, max_len = pad_targets.size(0), pad_targets.size(1)
    # The predicted label is the arg-max over the last (class) dimension.
    predictions = pad_outputs.view(n_batch, max_len, pad_outputs.size(1)).argmax(2)
    valid = pad_targets != ignore_label
    hits = predictions.masked_select(valid) == pad_targets.masked_select(valid)
    n_correct = torch.sum(hits)
    n_valid = torch.sum(valid)
    return float(n_correct) / float(n_valid)


def to_torch_tensor(x):
    """Convert the input to a torch.Tensor or a ComplexTensor.

    Args:
        x: Inputs. It should be one of numpy.ndarray, Tensor, ComplexTensor, and dict.

    Returns:
        Tensor or ComplexTensor: Type converted inputs. Tensors and
            ComplexTensors are returned as they are.

    Raises:
        ValueError: If a dict lacks the 'real' or 'imag' key, or if ``x`` is
            of an unsupported type.

    Examples:
        >>> xs = np.ones(3, dtype=np.float32)
        >>> xs = to_torch_tensor(xs)
        tensor([1., 1., 1.])
        >>> xs = torch.ones(3, 4, 5)
        >>> assert to_torch_tensor(xs) is xs
        >>> xs = {'real': xs, 'imag': xs}
        >>> to_torch_tensor(xs)
        ComplexTensor(
        Real:
        tensor([1., 1., 1.])
        Imag;
        tensor([1., 1., 1.])
        )

    """
    # numpy arrays: complex dtypes become ComplexTensor, the rest Tensor.
    if isinstance(x, np.ndarray):
        if x.dtype.kind != "c":
            return torch.from_numpy(x)
        # torch_complex is imported only where it is actually needed.
        from torch_complex.tensor import ComplexTensor

        return ComplexTensor(x)

    # {'real': ..., 'imag': ...} is assembled into a ComplexTensor.
    if isinstance(x, dict):
        from torch_complex.tensor import ComplexTensor

        if not ("real" in x and "imag" in x):
            raise ValueError("has 'real' and 'imag' keys: {}".format(list(x)))
        return ComplexTensor(x["real"], x["imag"])

    # torch tensors pass through untouched.
    if isinstance(x, torch.Tensor):
        return x

    error = (
        "x must be numpy.ndarray, torch.Tensor or a dict like "
        "{{'real': torch.Tensor, 'imag': torch.Tensor}}, "
        "but got {}".format(type(x))
    )
    try:
        from torch_complex.tensor import ComplexTensor
    except Exception:
        # Without torch_complex the input cannot be a ComplexTensor.
        raise ValueError(error)
    if isinstance(x, ComplexTensor):
        return x
    raise ValueError(error)


def _parse_subsample_factors(n_factors, etype, subsample):
    """Build the subsampling factors of one RNN encoder.

    Returns ``(factors, parsed)``: ``factors`` is an int array of length
    ``n_factors`` initialised to one, and ``parsed`` tells whether the
    ``subsample`` string ("x_y_z") was applied. It is applied only for
    encoder types that end with "p" and do not start with "vgg".
    """
    factors = np.ones(n_factors, dtype=int)
    parsed = etype.endswith("p") and not etype.startswith("vgg")
    if parsed:
        # Extra entries in the string beyond ``n_factors`` are ignored.
        for j, value in enumerate(subsample.split("_")[:n_factors]):
            factors[j] = int(value)
    return factors, parsed


def get_subsample(train_args, mode, arch):
    """Parse the subsampling factors from the args for the specified `mode` and `arch`.

    Args:
        train_args: argument Namespace containing options.
        mode: one of ('asr', 'mt', 'st')
        arch: one of ('rnn', 'rnn-t', 'rnn_mix', 'rnn_mulenc', 'transformer')

    Returns:
        np.ndarray / List[np.ndarray]: subsampling factors. A list with one
            array per encoder is returned for 'rnn_mulenc'.

    Raises:
        ValueError: If the combination of `mode` and `arch` is not supported.
    """
    # NOTE: the builtin ``int`` is used as dtype. ``np.int`` was only an
    # alias of it and has been removed from NumPy 1.24 onwards.
    if arch == "transformer":
        return np.array([1])

    if mode == "mt" and arch == "rnn":
        # +1 means input (+1) and layers outputs (train_args.elayer)
        subsample = np.ones(train_args.elayers + 1, dtype=int)
        logging.warning("Subsampling is not performed for machine translation.")
        logging.info("subsample: " + " ".join([str(x) for x in subsample]))
        return subsample

    if arch == "rnn_mulenc" and mode == "asr":
        subsample_list = []
        for idx in range(train_args.num_encs):
            subsample, parsed = _parse_subsample_factors(
                train_args.elayers[idx] + 1,
                train_args.etype[idx],
                train_args.subsample[idx],
            )
            if not parsed:
                logging.warning(
                    "Encoder %d: Subsampling is not performed for vgg*. "
                    "It is performed in max pooling layers at CNN.",
                    idx + 1,
                )
            logging.info("subsample: " + " ".join([str(x) for x in subsample]))
            subsample_list.append(subsample)
        return subsample_list

    # Single-encoder RNN variants differ only in the number of factors.
    if (mode == "asr" and arch in ("rnn", "rnn-t")) or (
        mode == "st" and arch == "rnn"
    ):
        n_factors = train_args.elayers + 1
    elif mode == "asr" and arch == "rnn_mix":
        n_factors = train_args.elayers_sd + train_args.elayers + 1
    else:
        raise ValueError("Invalid options: mode={}, arch={}".format(mode, arch))

    subsample, parsed = _parse_subsample_factors(
        n_factors, train_args.etype, train_args.subsample
    )
    if not parsed:
        logging.warning(
            "Subsampling is not performed for vgg*. "
            "It is performed in max pooling layers at CNN."
        )
    logging.info("subsample: " + " ".join([str(x) for x in subsample]))
    return subsample


def rename_state_dict(
    old_prefix: str, new_prefix: str, state_dict: Dict[str, torch.Tensor]
):
    """Replace keys of old prefix with new prefix in state dict.

    The dict is modified in place. Only the leading ``old_prefix`` of a key
    is swapped; later occurrences of the same text inside the key are kept.

    Args:
        old_prefix (str): Prefix that selects the keys to rename.
        new_prefix (str): Prefix that replaces ``old_prefix``.
        state_dict (Dict[str, torch.Tensor]): Dict to update in place.

    """
    # need this list not to break the dict iterator
    old_keys = [k for k in state_dict if k.startswith(old_prefix)]
    if len(old_keys) > 0:
        logging.warning(f"Rename: {old_prefix} -> {new_prefix}")
    prefix_len = len(old_prefix)
    for k in old_keys:
        v = state_dict.pop(k)
        # Slice instead of str.replace: replace would also rewrite matches
        # further inside the key (and every gap for an empty prefix).
        state_dict[new_prefix + k[prefix_len:]] = v


def get_activation(act):
    """Return a new activation module for the name ``act``.

    Supported names are "hardtanh", "tanh", "relu", "selu" and "swish";
    any other name raises ``KeyError`` from the lookup.
    """
    # Lazy load to avoid unused import
    from espnet.nets.pytorch_backend.conformer.swish import Swish

    registry = dict(
        hardtanh=torch.nn.Hardtanh,
        tanh=torch.nn.Tanh,
        relu=torch.nn.ReLU,
        selu=torch.nn.SELU,
        swish=Swish,
    )
    activation_cls = registry[act]
    return activation_cls()


================================================
FILE: nets/pytorch_backend/rnn/__init__.py
================================================
"""Initialize sub package."""


================================================
FILE: nets/pytorch_backend/rnn/argument.py
================================================
# Copyright 2020 Hirofumi Inaguma
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""RNN common arguments."""


def add_arguments_rnn_encoder_common(group):
    """Register the options shared by RNN encoders on ``group`` and return it."""
    encoder_types = [
        "lstm",
        "blstm",
        "lstmp",
        "blstmp",
        "vgglstmp",
        "vggblstmp",
        "vgglstm",
        "vggblstm",
        "gru",
        "bgru",
        "grup",
        "bgrup",
        "vgggrup",
        "vggbgrup",
        "vgggru",
        "vggbgru",
    ]
    # (option strings, keyword arguments), registered in this order.
    options = (
        (
            ("--etype",),
            dict(
                default="blstmp",
                type=str,
                choices=encoder_types,
                help="Type of encoder network architecture",
            ),
        ),
        (
            ("--elayers",),
            dict(default=4, type=int, help="Number of encoder layers"),
        ),
        (
            ("--eunits", "-u"),
            dict(default=300, type=int, help="Number of encoder hidden units"),
        ),
        (
            ("--eprojs",),
            dict(default=320, type=int, help="Number of encoder projection units"),
        ),
        (
            ("--subsample",),
            dict(
                default="1",
                type=str,
                help="Subsample input frames x_y_z means "
                "subsample every x frame at 1st layer, "
                "every y frame at 2nd layer etc.",
            ),
        ),
    )
    for flags, settings in options:
        group.add_argument(*flags, **settings)
    return group


def add_arguments_rnn_decoder_common(group):
    """Register the options shared by RNN decoders on ``group`` and return it."""
    # (option string, keyword arguments), registered in this order.
    options = (
        (
            "--dtype",
            dict(
                default="lstm",
                type=str,
                choices=["lstm", "gru"],
                help="Type of decoder network architecture",
            ),
        ),
        (
            "--dlayers",
            dict(default=1, type=int, help="Number of decoder layers"),
        ),
        (
            "--dunits",
            dict(default=320, type=int, help="Number of decoder hidden units"),
        ),
        (
            "--dropout-rate-decoder",
            dict(default=0.0, type=float, help="Dropout rate for the decoder"),
        ),
        (
            "--sampling-probability",
            dict(
                default=0.0,
                type=float,
                help="Ratio of predicted labels fed back to decoder",
            ),
        ),
        (
            "--lsm-type",
            dict(
                const="",
                default="",
                type=str,
                nargs="?",
                choices=["", "unigram"],
                help="Apply label smoothing with a specified distribution type",
            ),
        ),
    )
    for flag, settings in options:
        group.add_argument(flag, **settings)
    return group


def add_arguments_rnn_attention_common(group):
    """Register the options shared by RNN attentions on ``group`` and return it."""
    attention_types = [
        "noatt",
        "dot",
        "add",
        "location",
        "coverage",
        "coverage_location",
        "location2d",
        "location_recurrent",
        "multi_head_dot",
        "multi_head_add",
        "multi_head_loc",
        "multi_head_multi_res_loc",
    ]
    # (option string, keyword arguments), registered in this order.
    options = (
        (
            "--atype",
            dict(
                default="dot",
                type=str,
                choices=attention_types,
                help="Type of attention architecture",
            ),
        ),
        (
            "--adim",
            dict(
                default=320,
                type=int,
                help="Number of attention transformation dimensions",
            ),
        ),
        (
            "--awin",
            dict(default=5, type=int, help="Window size for location2d attention"),
        ),
        (
            "--aheads",
            dict(
                default=4,
                type=int,
                help="Number of heads for multi head attention",
            ),
        ),
        (
            "--aconv-chans",
            dict(
                default=-1,
                type=int,
                help="Number of attention convolution channels \
                       (negative value indicates no location-aware attention)",
            ),
        ),
        (
            "--aconv-filts",
            dict(
                default=100,
                type=int,
                help="Number of attention convolution filters \
                       (negative value indicates no location-aware attention)",
            ),
        ),
        (
            "--dropout-rate",
            dict(default=0.0, type=float, help="Dropout rate for the encoder"),
        ),
    )
    for flag, settings in options:
        group.add_argument(flag, **settings)
    return group


================================================
FILE: nets/pytorch_backend/rnn/attentions.py
================================================
"""Attention modules for RNN."""

import math
import six

import torch
import torch.nn.functional as F

from espnet.nets.pytorch_backend.nets_utils import make_pad_mask
from espnet.nets.pytorch_backend.nets_utils import to_device


def _apply_attention_constraint(
    e, last_attended_idx, backward_window=1, forward_window=3
):
    """Restrict attention energies to a window around the last attended index.

    Implements the monotonic attention constraint introduced in
    `Deep Voice 3: Scaling Text-to-Speech with Convolutional Sequence Learning`_.
    Energies outside ``[last_attended_idx - backward_window,
    last_attended_idx + forward_window)`` are overwritten with ``-inf``.
    The input tensor is modified in place and also returned.

    Args:
        e (Tensor): Attention energy before applying softmax (1, T).
        last_attended_idx (int): The index of the inputs of the last attended [0, T].
        backward_window (int, optional): Backward window size in attention constraint.
        forward_window (int, optional): Forward window size in attention constraint.

    Returns:
        Tensor: Monotonic constrained attention energy (1, T).

    Raises:
        NotImplementedError: If the first dimension of ``e`` is not 1.

    .. _`Deep Voice 3: Scaling Text-to-Speech with Convolutional Sequence Learning`:
        https://arxiv.org/abs/1710.07654

    """
    if e.size(0) != 1:
        raise NotImplementedError("Batch attention constraining is not yet supported.")

    neg_inf = -float("inf")
    window_start = last_attended_idx - backward_window
    window_end = last_attended_idx + forward_window

    # Only mask when the window boundary actually falls inside the sequence.
    if window_start > 0:
        e[:, :window_start] = neg_inf
    if window_end < e.size(1):
        e[:, window_end:] = neg_inf
    return e


class NoAtt(torch.nn.Module):
    """No attention

    The context vector is the length-normalized average of the encoder
    states. It is computed when ``forward`` is called with ``att_prev=None``
    and the cached value is returned by later calls.
    """

    def __init__(self):
        super(NoAtt, self).__init__()
        self.h_length = self.enc_h = self.pre_compute_enc_h = self.c = None

    def reset(self):
        """reset states"""
        self.h_length = self.enc_h = self.pre_compute_enc_h = self.c = None

    def forward(self, enc_hs_pad, enc_hs_len, dec_z, att_prev):
        """NoAtt forward

        :param torch.Tensor enc_hs_pad: padded encoder hidden state (B, T_max, D_enc)
        :param list enc_hs_len: padded encoder hidden state length (B)
        :param torch.Tensor dec_z: dummy (does not use)
        :param torch.Tensor att_prev: previous attention weights; when ``None``
            uniform weights are created and the context vector is (re)computed
        :return: attention weighted encoder state (B, D_enc)
        :rtype: torch.Tensor
        :return: previous attention weights
        :rtype: torch.Tensor
        """
        n_utts = len(enc_hs_pad)
        # ``pre_compute_enc_h`` is never assigned by this class, so the encoder
        # states are refreshed on every call.
        if self.pre_compute_enc_h is None:
            self.enc_h = enc_hs_pad  # utt x frame x hdim
            self.h_length = self.enc_h.size(1)

        # later steps: hand back the cached context unchanged
        if att_prev is not None:
            return self.c, att_prev

        # first step: uniform weights over the valid (non-padded) frames
        valid = 1.0 - make_pad_mask(enc_hs_len).float()
        lengths = valid.new(enc_hs_len).unsqueeze(-1)
        att_prev = (valid / lengths).to(self.enc_h)
        weights = att_prev.view(n_utts, self.h_length, 1)
        self.c = torch.sum(self.enc_h * weights, dim=1)

        return self.c, att_prev


class AttDot(torch.nn.Module):
    """Dot product attention

    :param int eprojs: # projection-units of encoder
    :param int dunits: # units of decoder
    :param int att_dim: attention dimension
    :param bool han_mode: flag to switch on mode of hierarchical attention
        and not store pre_compute_enc_h
    """

    def __init__(self, eprojs, dunits, att_dim, han_mode=False):
        super(AttDot, self).__init__()
        self.mlp_enc = torch.nn.Linear(eprojs, att_dim)
        self.mlp_dec = torch.nn.Linear(dunits, att_dim)

        self.dunits, self.eprojs, self.att_dim = dunits, eprojs, att_dim
        self.han_mode = han_mode
        # per-utterance caches, filled lazily by forward()
        self.h_length = self.enc_h = self.pre_compute_enc_h = self.mask = None

    def reset(self):
        """reset states"""
        self.h_length = self.enc_h = self.pre_compute_enc_h = self.mask = None

    def forward(self, enc_hs_pad, enc_hs_len, dec_z, att_prev, scaling=2.0):
        """AttDot forward

        :param torch.Tensor enc_hs_pad: padded encoder hidden state (B x T_max x D_enc)
        :param list enc_hs_len: padded encoder hidden state length (B)
        :param torch.Tensor dec_z: decoder hidden state (B x D_dec);
            zeros are used when ``None``
        :param torch.Tensor att_prev: dummy (does not use)
        :param float scaling: scaling parameter before applying softmax
        :return: attention weighted encoder state (B, D_enc)
        :rtype: torch.Tensor
        :return: previous attention weight (B x T_max)
        :rtype: torch.Tensor
        """
        batch = enc_hs_pad.size(0)

        # encoder-side projection is computed once per utterance
        # (every call in han_mode)
        if self.han_mode or self.pre_compute_enc_h is None:
            self.enc_h = enc_hs_pad  # utt x frame x hdim
            self.h_length = self.enc_h.size(1)
            # utt x frame x att_dim
            self.pre_compute_enc_h = torch.tanh(self.mlp_enc(self.enc_h))

        dec_z = (
            enc_hs_pad.new_zeros(batch, self.dunits)
            if dec_z is None
            else dec_z.view(batch, self.dunits)
        )

        # query: utt x 1 x att_dim, broadcast over frames
        query = torch.tanh(self.mlp_dec(dec_z)).view(batch, 1, self.att_dim)
        e = torch.sum(self.pre_compute_enc_h * query, dim=2)  # utt x frame

        # padded frames must receive zero weight after the softmax
        if self.mask is None:
            self.mask = to_device(enc_hs_pad, make_pad_mask(enc_hs_len))
        e.masked_fill_(self.mask, -float("inf"))
        w = F.softmax(scaling * e, dim=1)

        # weighted sum over frames: utt x hdim
        c = torch.sum(self.enc_h * w.view(batch, self.h_length, 1), dim=1)
        return c, w


class AttAdd(torch.nn.Module):
    """Additive attention

    :param int eprojs: # projection-units of encoder
    :param int dunits: # units of decoder
    :param int att_dim: attention dimension
    :param bool han_mode: flag to switch on mode of hierarchical attention
        and not store pre_compute_enc_h
    """

    def __init__(self, eprojs, dunits, att_dim, han_mode=False):
        super(AttAdd, self).__init__()
        self.mlp_enc = torch.nn.Linear(eprojs, att_dim)
        self.mlp_dec = torch.nn.Linear(dunits, att_dim, bias=False)
        self.gvec = torch.nn.Linear(att_dim, 1)

        self.dunits, self.eprojs, self.att_dim = dunits, eprojs, att_dim
        self.han_mode = han_mode
        # per-utterance caches, filled lazily by forward()
        self.h_length = self.enc_h = self.pre_compute_enc_h = self.mask = None

    def reset(self):
        """reset states"""
        self.h_length = self.enc_h = self.pre_compute_enc_h = self.mask = None

    def forward(self, enc_hs_pad, enc_hs_len, dec_z, att_prev, scaling=2.0):
        """AttAdd forward

        :param torch.Tensor enc_hs_pad: padded encoder hidden state (B x T_max x D_enc)
        :param list enc_hs_len: padded encoder hidden state length (B)
        :param torch.Tensor dec_z: decoder hidden state (B x D_dec);
            zeros are used when ``None``
        :param torch.Tensor att_prev: dummy (does not use)
        :param float scaling: scaling parameter before applying softmax
        :return: attention weighted encoder state (B, D_enc)
        :rtype: torch.Tensor
        :return: previous attention weights (B x T_max)
        :rtype: torch.Tensor
        """
        batch = len(enc_hs_pad)

        # encoder-side projection is computed once per utterance
        # (every call in han_mode)
        if self.han_mode or self.pre_compute_enc_h is None:
            self.enc_h = enc_hs_pad  # utt x frame x hdim
            self.h_length = self.enc_h.size(1)
            # utt x frame x att_dim
            self.pre_compute_enc_h = self.mlp_enc(self.enc_h)

        dec_z = (
            enc_hs_pad.new_zeros(batch, self.dunits)
            if dec_z is None
            else dec_z.view(batch, self.dunits)
        )

        # decoder projection: utt x 1 x att_dim, broadcast over frames
        query = self.mlp_dec(dec_z).view(batch, 1, self.att_dim)

        # utt x frame x att_dim -> utt x frame
        hidden = torch.tanh(self.pre_compute_enc_h + query)
        e = self.gvec(hidden).squeeze(2)

        # padded frames must receive zero weight after the softmax
        if self.mask is None:
            self.mask = to_device(enc_hs_pad, make_pad_mask(enc_hs_len))
        e.masked_fill_(self.mask, -float("inf"))
        w = F.softmax(scaling * e, dim=1)

        # weighted sum over frames: utt x hdim
        c = torch.sum(self.enc_h * w.view(batch, self.h_length, 1), dim=1)

        return c, w


class AttLoc(torch.nn.Module):
    """location-aware attention module.

    Reference: Attention-Based Models for Speech Recognition
        (https://arxiv.org/pdf/1506.07503.pdf)

    :param int eprojs: # projection-units of encoder
    :param int dunits: # units of decoder
    :param int att_dim: attention dimension
    :param int aconv_chans: # channels of attention convolution
    :param int aconv_filts: filter size of attention convolution
    :param bool han_mode: flag to switch on mode of hierarchical attention
        and not store pre_compute_enc_h
    """

    def __init__(
        self, eprojs, dunits, att_dim, aconv_chans, aconv_filts, han_mode=False
    ):
        super(AttLoc, self).__init__()
        self.mlp_enc = torch.nn.Linear(eprojs, att_dim)
        self.mlp_dec = torch.nn.Linear(dunits, att_dim, bias=False)
        self.mlp_att = torch.nn.Linear(aconv_chans, att_dim, bias=False)
        # 1 x (2 * aconv_filts + 1) kernel with matching padding keeps the
        # frame axis length unchanged
        self.loc_conv = torch.nn.Conv2d(
            1,
            aconv_chans,
            (1, 2 * aconv_filts + 1),
            padding=(0, aconv_filts),
            bias=False,
        )
        self.gvec = torch.nn.Linear(att_dim, 1)

        self.dunits, self.eprojs, self.att_dim = dunits, eprojs, att_dim
        self.han_mode = han_mode
        # per-utterance caches, filled lazily by forward()
        self.h_length = self.enc_h = self.pre_compute_enc_h = self.mask = None

    def reset(self):
        """reset states"""
        self.h_length = self.enc_h = self.pre_compute_enc_h = self.mask = None

    def forward(
        self,
        enc_hs_pad,
        enc_hs_len,
        dec_z,
        att_prev,
        scaling=2.0,
        last_attended_idx=None,
        backward_window=1,
        forward_window=3,
    ):
        """Calculate AttLoc forward propagation.

        :param torch.Tensor enc_hs_pad: padded encoder hidden state (B x T_max x D_enc)
        :param list enc_hs_len: padded encoder hidden state length (B)
        :param torch.Tensor dec_z: decoder hidden state (B x D_dec);
            zeros are used when ``None``
        :param torch.Tensor att_prev: previous attention weight (B x T_max);
            a length-normalized uniform distribution is used when ``None``
        :param float scaling: scaling parameter before applying softmax
        :param int last_attended_idx: index of the inputs of the last attended;
            the attention constraint is applied only when this is not ``None``
        :param int backward_window: backward window size in attention constraint
        :param int forward_window: forward window size in attention constraint
        :return: attention weighted encoder state (B, D_enc)
        :rtype: torch.Tensor
        :return: previous attention weights (B x T_max)
        :rtype: torch.Tensor
        """
        batch = len(enc_hs_pad)

        # encoder-side projection is computed once per utterance
        # (every call in han_mode)
        if self.han_mode or self.pre_compute_enc_h is None:
            self.enc_h = enc_hs_pad  # utt x frame x hdim
            self.h_length = self.enc_h.size(1)
            # utt x frame x att_dim
            self.pre_compute_enc_h = self.mlp_enc(self.enc_h)

        dec_z = (
            enc_hs_pad.new_zeros(batch, self.dunits)
            if dec_z is None
            else dec_z.view(batch, self.dunits)
        )

        # first step: uniform weights over the valid (non-padded) frames
        if att_prev is None:
            valid = 1.0 - make_pad_mask(enc_hs_len).to(
                device=dec_z.device, dtype=dec_z.dtype
            )
            att_prev = valid / valid.new(enc_hs_len).unsqueeze(-1)

        # utt x frame -> utt x 1 x 1 x frame -> utt x att_conv_chans x 1 x frame
        loc_feat = self.loc_conv(att_prev.view(batch, 1, 1, self.h_length))
        # -> utt x frame x att_conv_chans -> utt x frame x att_dim
        loc_feat = self.mlp_att(loc_feat.squeeze(2).transpose(1, 2))

        # decoder projection: utt x 1 x att_dim, broadcast over frames
        query = self.mlp_dec(dec_z).view(batch, 1, self.att_dim)

        # utt x frame x att_dim -> utt x frame
        hidden = torch.tanh(loc_feat + self.pre_compute_enc_h + query)
        e = self.gvec(hidden).squeeze(2)

        # padded frames must receive zero weight after the softmax
        if self.mask is None:
            self.mask = to_device(enc_hs_pad, make_pad_mask(enc_hs_len))
        e.masked_fill_(self.mask, -float("inf"))

        # apply monotonic attention constraint (mainly for TTS)
        if last_attended_idx is not None:
            e = _apply_attention_constraint(
                e, last_attended_idx, backward_window, forward_window
            )

        w = F.softmax(scaling * e, dim=1)

        # weighted sum over frames: utt x hdim
        c = torch.sum(self.enc_h * w.view(batch, self.h_length, 1), dim=1)

        return c, w


class AttCov(torch.nn.Module):
    """Coverage mechanism attention

    Reference: Get To The Point: Summarization with Pointer-Generator Network
       (https://arxiv.org/abs/1704.04368)

    :param int eprojs: # projection-units of encoder
    :param int dunits: # units of decoder
    :param int att_dim: attention dimension
    :param bool han_mode: flag to switch on mode of hierarchical attention
        and not store pre_compute_enc_h
    """

    def __init__(self, eprojs, dunits, att_dim, han_mode=False):
        super(AttCov, self).__init__()
        self.mlp_enc = torch.nn.Linear(eprojs, att_dim)
        self.mlp_dec = torch.nn.Linear(dunits, att_dim, bias=False)
        self.wvec = torch.nn.Linear(1, att_dim)
        self.gvec = torch.nn.Linear(att_dim, 1)

        self.dunits = dunits
        self.eprojs = eprojs
        self.att_dim = att_dim
        self.h_length = None
        self.enc_h = None
        self.pre_compute_enc_h = None
        self.mask = None
        self.han_mode = han_mode

    def reset(self):
        """reset states"""
        self.h_length = None
        self.enc_h = None
        self.pre_compute_enc_h = None
        self.mask = None

    def forward(self, enc_hs_pad, enc_hs_len, dec_z, att_prev_list, scaling=2.0):
        """AttCov forward

        :param torch.Tensor enc_hs_pad: padded encoder hidden state (B x T_max x D_enc)
        :param list enc_hs_len: padded encoder hidden state length (B)
        :param torch.Tensor dec_z: decoder hidden state (B x D_dec);
            zeros are used when ``None``
        :param list att_prev_list: list of previous attention weight;
            a length-normalized uniform distribution is used when ``None``.
            The given list is not modified.
        :param float scaling: scaling parameter before applying softmax
        :return: attention weighted encoder state (B, D_enc)
        :rtype: torch.Tensor
        :return: new list holding the previous attention weights followed by
            the weights computed in this call
        :rtype: list
        """

        batch = len(enc_hs_pad)
        # pre-compute all h outside the decoder loop
        if self.pre_compute_enc_h is None or self.han_mode:
            self.enc_h = enc_hs_pad  # utt x frame x hdim
            self.h_length = self.enc_h.size(1)
            # utt x frame x att_dim
            self.pre_compute_enc_h = self.mlp_enc(self.enc_h)

        if dec_z is None:
            dec_z = enc_hs_pad.new_zeros(batch, self.dunits)
        else:
            dec_z = dec_z.view(batch, self.dunits)

        # initialize attention weight with uniform dist.
        if att_prev_list is None:
            # if no bias, 0 0-pad goes 0
            att_prev_list = to_device(
                enc_hs_pad, (1.0 - make_pad_mask(enc_hs_len).float())
            )
            att_prev_list = [
                att_prev_list / att_prev_list.new(enc_hs_len).unsqueeze(-1)
            ]

        # att_prev_list: L' * [B x T] => cov_vec B x T
        cov_vec = sum(att_prev_list)
        # cov_vec: B x T => B x T x 1 => B x T x att_dim
        cov_vec = self.wvec(cov_vec.unsqueeze(-1))

        # dec_z_tiled: utt x frame x att_dim
        dec_z_tiled = self.mlp_dec(dec_z).view(batch, 1, self.att_dim)

        # dot with gvec
        # utt x frame x att_dim -> utt x frame
        e = self.gvec(
            torch.tanh(cov_vec + self.pre_compute_enc_h + dec_z_tiled)
        ).squeeze(2)

        # NOTE consider zero padding when compute w.
        if self.mask is None:
            self.mask = to_device(enc_hs_pad, make_pad_mask(enc_hs_len))
        e.masked_fill_(self.mask, -float("inf"))
        w = F.softmax(scaling * e, dim=1)
        # NOTE build a new list instead of extending in place, so that the
        # history list owned by the caller (e.g. a beam-search hypothesis)
        # is left untouched.
        att_prev_list = list(att_prev_list) + [w]

        # weighted sum over frames
        # utt x hdim
        # NOTE use bmm instead of sum(*)
        c = torch.sum(self.enc_h * w.view(batch, self.h_length, 1), dim=1)

        return c, att_prev_list


class AttLoc2D(torch.nn.Module):
    """2D location-aware attention

    This attention is an extended version of location aware attention.
    The location convolution spans the attention weights of the last
    ``att_win`` decoding steps instead of only the most recent one.

    :param int eprojs: # projection-units of encoder
    :param int dunits: # units of decoder
    :param int att_dim: attention dimension
    :param int att_win: attention window size
    :param int aconv_chans: # channels of attention convolution
    :param int aconv_filts: filter size of attention convolution
    :param bool han_mode:
        flag to switch on mode of hierarchical attention and not store pre_compute_enc_h
    """

    def __init__(
        self, eprojs, dunits, att_dim, att_win, aconv_chans, aconv_filts, han_mode=False
    ):
        super(AttLoc2D, self).__init__()
        self.mlp_enc = torch.nn.Linear(eprojs, att_dim)
        self.mlp_dec = torch.nn.Linear(dunits, att_dim, bias=False)
        self.mlp_att = torch.nn.Linear(aconv_chans, att_dim, bias=False)
        # kernel height att_win without vertical padding collapses the
        # history axis to size 1
        self.loc_conv = torch.nn.Conv2d(
            1,
            aconv_chans,
            (att_win, 2 * aconv_filts + 1),
            padding=(0, aconv_filts),
            bias=False,
        )
        self.gvec = torch.nn.Linear(att_dim, 1)

        self.dunits, self.eprojs, self.att_dim = dunits, eprojs, att_dim
        self.aconv_chans, self.att_win = aconv_chans, att_win
        self.han_mode = han_mode
        # per-utterance caches, filled lazily by forward()
        self.h_length = self.enc_h = self.pre_compute_enc_h = self.mask = None

    def reset(self):
        """reset states"""
        self.h_length = self.enc_h = self.pre_compute_enc_h = self.mask = None

    def forward(self, enc_hs_pad, enc_hs_len, dec_z, att_prev, scaling=2.0):
        """AttLoc2D forward

        :param torch.Tensor enc_hs_pad: padded encoder hidden state (B x T_max x D_enc)
        :param list enc_hs_len: padded encoder hidden state length (B)
        :param torch.Tensor dec_z: decoder hidden state (B x D_dec);
            zeros are used when ``None``
        :param torch.Tensor att_prev: previous attention weight (B x att_win x T_max);
            a length-normalized uniform distribution repeated ``att_win``
            times is used when ``None``
        :param float scaling: scaling parameter before applying softmax
        :return: attention weighted encoder state (B, D_enc)
        :rtype: torch.Tensor
        :return: previous attention weights (B x att_win x T_max)
        :rtype: torch.Tensor
        """
        batch = len(enc_hs_pad)

        # encoder-side projection is computed once per utterance
        # (every call in han_mode)
        if self.han_mode or self.pre_compute_enc_h is None:
            self.enc_h = enc_hs_pad  # utt x frame x hdim
            self.h_length = self.enc_h.size(1)
            # utt x frame x att_dim
            self.pre_compute_enc_h = self.mlp_enc(self.enc_h)

        dec_z = (
            enc_hs_pad.new_zeros(batch, self.dunits)
            if dec_z is None
            else dec_z.view(batch, self.dunits)
        )

        # first step: uniform weights over the valid frames, copied along
        # the history axis
        if att_prev is None:
            uniform = to_device(enc_hs_pad, (1.0 - make_pad_mask(enc_hs_len).float()))
            uniform = uniform / uniform.new(enc_hs_len).unsqueeze(-1)
            att_prev = uniform.unsqueeze(1).expand(-1, self.att_win, -1)

        # B x att_win x Tmax -> B x 1 x att_win x Tmax -> B x C x 1 x Tmax
        loc_feat = self.loc_conv(att_prev.unsqueeze(1))
        # -> B x Tmax x C -> B x Tmax x att_dim
        loc_feat = self.mlp_att(loc_feat.squeeze(2).transpose(1, 2))

        # decoder projection: utt x 1 x att_dim, broadcast over frames
        query = self.mlp_dec(dec_z).view(batch, 1, self.att_dim)

        # utt x frame x att_dim -> utt x frame
        hidden = torch.tanh(loc_feat + self.pre_compute_enc_h + query)
        e = self.gvec(hidden).squeeze(2)

        # padded frames must receive zero weight after the softmax
        if self.mask is None:
            self.mask = to_device(enc_hs_pad, make_pad_mask(enc_hs_len))
        e.masked_fill_(self.mask, -float("inf"))
        w = F.softmax(scaling * e, dim=1)

        # weighted sum over frames: utt x hdim
        c = torch.sum(self.enc_h * w.view(batch, self.h_length, 1), dim=1)

        # slide the history window: append the new weights, drop the oldest
        # B x att_win x Tmax -> B x att_win+1 x Tmax -> B x att_win x Tmax
        att_prev = torch.cat([att_prev, w.unsqueeze(1)], dim=1)[:, 1:]

        return c, att_prev


class AttLocRec(torch.nn.Module):
    """location-aware recurrent attention

    This attention is an extended version of location aware attention.
    The convolved previous attention weights are fed to an LSTM cell, so
    the history of attention weights is taken into account.

    :param int eprojs: # projection-units of encoder
    :param int dunits: # units of decoder
    :param int att_dim: attention dimension
    :param int aconv_chans: # channels of attention convolution
    :param int aconv_filts: filter size of attention convolution
    :param bool han_mode:
        flag to switch on mode of hierarchical attention and not store pre_compute_enc_h
    """

    def __init__(
        self, eprojs, dunits, att_dim, aconv_chans, aconv_filts, han_mode=False
    ):
        super(AttLocRec, self).__init__()
        self.mlp_enc = torch.nn.Linear(eprojs, att_dim)
        self.mlp_dec = torch.nn.Linear(dunits, att_dim, bias=False)
        self.loc_conv = torch.nn.Conv2d(
            1,
            aconv_chans,
            (1, 2 * aconv_filts + 1),
            padding=(0, aconv_filts),
            bias=False,
        )
        self.att_lstm = torch.nn.LSTMCell(aconv_chans, att_dim, bias=False)
        self.gvec = torch.nn.Linear(att_dim, 1)

        self.dunits, self.eprojs, self.att_dim = dunits, eprojs, att_dim
        self.han_mode = han_mode
        # per-utterance caches, filled lazily by forward()
        self.h_length = self.enc_h = self.pre_compute_enc_h = self.mask = None

    def reset(self):
        """reset states"""
        self.h_length = self.enc_h = self.pre_compute_enc_h = self.mask = None

    def forward(self, enc_hs_pad, enc_hs_len, dec_z, att_prev_states, scaling=2.0):
        """AttLocRec forward

        :param torch.Tensor enc_hs_pad: padded encoder hidden state (B x T_max x D_enc)
        :param list enc_hs_len: padded encoder hidden state length (B)
        :param torch.Tensor dec_z: decoder hidden state (B x D_dec);
            zeros are used when ``None``
        :param tuple att_prev_states: previous attention weight and lstm states
                                      ((B, T_max), ((B, att_dim), (B, att_dim)));
                                      when ``None`` uniform weights and zero
                                      lstm states are used
        :param float scaling: scaling parameter before applying softmax
        :return: attention weighted encoder state (B, D_enc)
        :rtype: torch.Tensor
        :return: previous attention weights and lstm states (w, (hx, cx))
                 ((B, T_max), ((B, att_dim), (B, att_dim)))
        :rtype: tuple
        """
        batch = len(enc_hs_pad)

        # encoder-side projection is computed once per utterance
        # (every call in han_mode)
        if self.han_mode or self.pre_compute_enc_h is None:
            self.enc_h = enc_hs_pad  # utt x frame x hdim
            self.h_length = self.enc_h.size(1)
            # utt x frame x att_dim
            self.pre_compute_enc_h = self.mlp_enc(self.enc_h)

        dec_z = (
            enc_hs_pad.new_zeros(batch, self.dunits)
            if dec_z is None
            else dec_z.view(batch, self.dunits)
        )

        if att_prev_states is not None:
            att_prev, att_states = att_prev_states[0], att_prev_states[1]
        else:
            # first step: uniform weights over the valid (non-padded) frames
            att_prev = to_device(enc_hs_pad, (1.0 - make_pad_mask(enc_hs_len).float()))
            att_prev = att_prev / att_prev.new(enc_hs_len).unsqueeze(-1)
            # ... and zero-initialized lstm hidden / cell states
            att_states = (
                enc_hs_pad.new_zeros(batch, self.att_dim),
                enc_hs_pad.new_zeros(batch, self.att_dim),
            )

        # B x 1 x 1 x T -> B x C x 1 x T, followed by the non-linearity
        loc_feat = F.relu(self.loc_conv(att_prev.view(batch, 1, 1, self.h_length)))
        # max over all frames: B x C x 1 x T -> B x C x 1 x 1 -> B x C
        loc_feat = F.max_pool2d(loc_feat, (1, loc_feat.size(3))).view(batch, -1)

        att_h, att_c = self.att_lstm(loc_feat, att_states)

        # decoder projection: utt x 1 x att_dim, broadcast over frames
        query = self.mlp_dec(dec_z).view(batch, 1, self.att_dim)

        # utt x frame x att_dim -> utt x frame
        hidden = torch.tanh(att_h.unsqueeze(1) + self.pre_compute_enc_h + query)
        e = self.gvec(hidden).squeeze(2)

        # padded frames must receive zero weight after the softmax
        if self.mask is None:
            self.mask = to_device(enc_hs_pad, make_pad_mask(enc_hs_len))
        e.masked_fill_(self.mask, -float("inf"))
        w = F.softmax(scaling * e, dim=1)

        # weighted sum over frames: utt x hdim
        c = torch.sum(self.enc_h * w.view(batch, self.h_length, 1), dim=1)

        return c, (w, (att_h, att_c))


class AttCovLoc(torch.nn.Module):
    """Coverage mechanism location aware attention

    This attention is a combination of coverage and location-aware attentions.

    :param int eprojs: # projection-units of encoder
    :param int dunits: # units of decoder
    :param int att_dim: attention dimension
    :param int aconv_chans: # channels of attention convolution
    :param int aconv_filts: filter size of attention convolution
    :param bool han_mode:
        flag to switch on mode of hierarchical attention and not store pre_compute_enc_h
    """

    def __init__(
        self, eprojs, dunits, att_dim, aconv_chans, aconv_filts, han_mode=False
    ):
        super(AttCovLoc, self).__init__()
        self.mlp_enc = torch.nn.Linear(eprojs, att_dim)
        self.mlp_dec = torch.nn.Linear(dunits, att_dim, bias=False)
        self.mlp_att = torch.nn.Linear(aconv_chans, att_dim, bias=False)
        self.loc_conv = torch.nn.Conv2d(
            1,
            aconv_chans,
            (1, 2 * aconv_filts + 1),
            padding=(0, aconv_filts),
            bias=False,
        )
        self.gvec = torch.nn.Linear(att_dim, 1)

        self.dunits = dunits
        self.eprojs = eprojs
        self.att_dim = att_dim
        self.h_length = None
        self.enc_h = None
        self.pre_compute_enc_h = None
        self.aconv_chans = aconv_chans
        self.mask = None
        self.han_mode = han_mode

    def reset(self):
        """reset states"""
        self.h_length = None
        self.enc_h = None
        self.pre_compute_enc_h = None
        self.mask = None

    def forward(self, enc_hs_pad, enc_hs_len, dec_z, att_prev_list, scaling=2.0):
        """AttCovLoc forward

        :param torch.Tensor enc_hs_pad: padded encoder hidden state (B x T_max x D_enc)
        :param list enc_hs_len: padded encoder hidden state length (B)
        :param torch.Tensor dec_z: decoder hidden state (B x D_dec);
            zeros are used when ``None``
        :param list att_prev_list: list of previous attention weight;
            a length-normalized uniform distribution is used when ``None``.
            The given list is not modified.
        :param float scaling: scaling parameter before applying softmax
        :return: attention weighted encoder state (B, D_enc)
        :rtype: torch.Tensor
        :return: new list holding the previous attention weights followed by
            the weights computed in this call
        :rtype: list
        """

        batch = len(enc_hs_pad)
        # pre-compute all h outside the decoder loop
        if self.pre_compute_enc_h is None or self.han_mode:
            self.enc_h = enc_hs_pad  # utt x frame x hdim
            self.h_length = self.enc_h.size(1)
            # utt x frame x att_dim
            self.pre_compute_enc_h = self.mlp_enc(self.enc_h)

        if dec_z is None:
            dec_z = enc_hs_pad.new_zeros(batch, self.dunits)
        else:
            dec_z = dec_z.view(batch, self.dunits)

        # initialize attention weight with uniform dist.
        if att_prev_list is None:
            # if no bias, 0 0-pad goes 0
            mask = 1.0 - make_pad_mask(enc_hs_len).float()
            att_prev_list = [
                to_device(enc_hs_pad, mask / mask.new(enc_hs_len).unsqueeze(-1))
            ]

        # att_prev_list: L' * [B x T] => cov_vec B x T
        cov_vec = sum(att_prev_list)

        # cov_vec: B x T -> B x 1 x 1 x T -> B x C x 1 x T
        att_conv = self.loc_conv(cov_vec.view(batch, 1, 1, self.h_length))
        # att_conv: utt x att_conv_chans x 1 x frame -> utt x frame x att_conv_chans
        att_conv = att_conv.squeeze(2).transpose(1, 2)
        # att_conv: utt x frame x att_conv_chans -> utt x frame x att_dim
        att_conv = self.mlp_att(att_conv)

        # dec_z_tiled: utt x frame x att_dim
        dec_z_tiled = self.mlp_dec(dec_z).view(batch, 1, self.att_dim)

        # dot with gvec
        # utt x frame x att_dim -> utt x frame
        e = self.gvec(
            torch.tanh(att_conv + self.pre_compute_enc_h + dec_z_tiled)
        ).squeeze(2)

        # NOTE consider zero padding when compute w.
        if self.mask is None:
            self.mask = to_device(enc_hs_pad, make_pad_mask(enc_hs_len))
        e.masked_fill_(self.mask, -float("inf"))
        w = F.softmax(scaling * e, dim=1)
        # NOTE build a new list instead of extending in place, so that the
        # history list owned by the caller (e.g. a beam-search hypothesis)
        # is left untouched.
        att_prev_list = list(att_prev_list) + [w]

        # weighted sum over frames
        # utt x hdim
        # NOTE use bmm instead of sum(*)
        c = torch.sum(self.enc_h * w.view(batch, self.h_length, 1), dim=1)

        return c, att_prev_list


class AttMultiHeadDot(torch.nn.Module):
    """Multi-head attention with dot-product scoring

    Reference: Attention is all you need
        (https://arxiv.org/abs/1706.03762)

    Every head owns its own query / key / value projections. Keys and queries
    go through tanh before the dot product; the per-head context vectors are
    concatenated and projected back to ``eprojs`` units.

    :param int eprojs: # projection-units of encoder
    :param int dunits: # units of decoder
    :param int aheads: # heads of multi head attention
    :param int att_dim_k: dimension k in multi head attention
    :param int att_dim_v: dimension v in multi head attention
    :param bool han_mode: flag to switch on mode of hierarchical attention;
        when set, keys and values are recomputed on every forward call
        instead of being reused from the cache
    """

    def __init__(self, eprojs, dunits, aheads, att_dim_k, att_dim_v, han_mode=False):
        super(AttMultiHeadDot, self).__init__()
        self.mlp_q = torch.nn.ModuleList()
        self.mlp_k = torch.nn.ModuleList()
        self.mlp_v = torch.nn.ModuleList()
        # one (query, key, value) projection triple per head
        for _ in range(aheads):
            self.mlp_q.append(torch.nn.Linear(dunits, att_dim_k))
            self.mlp_k.append(torch.nn.Linear(eprojs, att_dim_k, bias=False))
            self.mlp_v.append(torch.nn.Linear(eprojs, att_dim_v, bias=False))
        self.mlp_o = torch.nn.Linear(aheads * att_dim_v, eprojs, bias=False)
        self.dunits = dunits
        self.eprojs = eprojs
        self.aheads = aheads
        self.att_dim_k = att_dim_k
        self.att_dim_v = att_dim_v
        self.scaling = 1.0 / math.sqrt(att_dim_k)
        self.han_mode = han_mode
        # the per-utterance caches start out empty
        self.reset()

    def reset(self):
        """Clear everything cached from the previous batch"""
        self.h_length = None
        self.enc_h = None
        self.pre_compute_k = None
        self.pre_compute_v = None
        self.mask = None

    def forward(self, enc_hs_pad, enc_hs_len, dec_z, att_prev):
        """AttMultiHeadDot forward

        :param torch.Tensor enc_hs_pad: padded encoder hidden state (B x T_max x D_enc)
        :param list enc_hs_len: padded encoder hidden state length (B)
        :param torch.Tensor dec_z: decoder hidden state (B x D_dec);
            zeros are used when it is None
        :param torch.Tensor att_prev: dummy (not used)
        :return: attention weighted encoder state (B x D_enc)
        :rtype: torch.Tensor
        :return: list of attention weights (B x T_max) * aheads
        :rtype: list
        """
        batch = enc_hs_pad.size(0)

        # keys: computed once per batch (or every call in HAN mode)
        if self.pre_compute_k is None or self.han_mode:
            self.enc_h = enc_hs_pad  # utt x frame x hdim
            self.h_length = self.enc_h.size(1)
            # each entry: utt x frame x att_dim_k
            self.pre_compute_k = [torch.tanh(proj(self.enc_h)) for proj in self.mlp_k]

        # values: same caching policy as the keys
        if self.pre_compute_v is None or self.han_mode:
            self.enc_h = enc_hs_pad  # utt x frame x hdim
            self.h_length = self.enc_h.size(1)
            # each entry: utt x frame x att_dim_v
            self.pre_compute_v = [proj(self.enc_h) for proj in self.mlp_v]

        dec_z = (
            enc_hs_pad.new_zeros(batch, self.dunits)
            if dec_z is None
            else dec_z.view(batch, self.dunits)
        )

        # padded frames must get zero weight, so their energy is set to -inf
        if self.mask is None:
            self.mask = to_device(enc_hs_pad, make_pad_mask(enc_hs_len))

        contexts = []
        weights = []
        for h in range(self.aheads):
            query = torch.tanh(self.mlp_q[h](dec_z)).view(batch, 1, self.att_dim_k)
            # utt x frame
            energy = torch.sum(self.pre_compute_k[h] * query, dim=2)
            energy.masked_fill_(self.mask, -float("inf"))
            head_w = F.softmax(self.scaling * energy, dim=1)
            weights.append(head_w)

            # weighted sum over frames: utt x att_dim_v
            contexts.append(
                torch.sum(
                    self.pre_compute_v[h] * head_w.view(batch, self.h_length, 1),
                    dim=1,
                )
            )

        # merge the heads and project back to the encoder dimension
        c = self.mlp_o(torch.cat(contexts, dim=1))

        return c, weights


class AttMultiHeadAdd(torch.nn.Module):
    """Multi-head attention with additive scoring

    Reference: Attention is all you need
        (https://arxiv.org/abs/1706.03762)

    Each head scores frames with additive attention: the projected key and
    the projected query are summed, passed through tanh and reduced to a
    scalar energy by the head's ``gvec`` layer.

    :param int eprojs: # projection-units of encoder
    :param int dunits: # units of decoder
    :param int aheads: # heads of multi head attention
    :param int att_dim_k: dimension k in multi head attention
    :param int att_dim_v: dimension v in multi head attention
    :param bool han_mode: flag to switch on mode of hierarchical attention;
        when set, keys and values are recomputed on every forward call
        instead of being reused from the cache
    """

    def __init__(self, eprojs, dunits, aheads, att_dim_k, att_dim_v, han_mode=False):
        super(AttMultiHeadAdd, self).__init__()
        self.mlp_q = torch.nn.ModuleList()
        self.mlp_k = torch.nn.ModuleList()
        self.mlp_v = torch.nn.ModuleList()
        self.gvec = torch.nn.ModuleList()
        # per head: query, key and value projections plus the energy layer
        for _ in range(aheads):
            self.mlp_q.append(torch.nn.Linear(dunits, att_dim_k))
            self.mlp_k.append(torch.nn.Linear(eprojs, att_dim_k, bias=False))
            self.mlp_v.append(torch.nn.Linear(eprojs, att_dim_v, bias=False))
            self.gvec.append(torch.nn.Linear(att_dim_k, 1))
        self.mlp_o = torch.nn.Linear(aheads * att_dim_v, eprojs, bias=False)
        self.dunits = dunits
        self.eprojs = eprojs
        self.aheads = aheads
        self.att_dim_k = att_dim_k
        self.att_dim_v = att_dim_v
        self.scaling = 1.0 / math.sqrt(att_dim_k)
        self.han_mode = han_mode
        # the per-utterance caches start out empty
        self.reset()

    def reset(self):
        """Clear everything cached from the previous batch"""
        self.h_length = None
        self.enc_h = None
        self.pre_compute_k = None
        self.pre_compute_v = None
        self.mask = None

    def forward(self, enc_hs_pad, enc_hs_len, dec_z, att_prev):
        """AttMultiHeadAdd forward

        :param torch.Tensor enc_hs_pad: padded encoder hidden state (B x T_max x D_enc)
        :param list enc_hs_len: padded encoder hidden state length (B)
        :param torch.Tensor dec_z: decoder hidden state (B x D_dec);
            zeros are used when it is None
        :param torch.Tensor att_prev: dummy (not used)
        :return: attention weighted encoder state (B, D_enc)
        :rtype: torch.Tensor
        :return: list of attention weights (B x T_max) * aheads
        :rtype: list
        """
        batch = enc_hs_pad.size(0)

        # keys: computed once per batch (or every call in HAN mode)
        if self.pre_compute_k is None or self.han_mode:
            self.enc_h = enc_hs_pad  # utt x frame x hdim
            self.h_length = self.enc_h.size(1)
            # each entry: utt x frame x att_dim_k
            self.pre_compute_k = [proj(self.enc_h) for proj in self.mlp_k]

        # values: same caching policy as the keys
        if self.pre_compute_v is None or self.han_mode:
            self.enc_h = enc_hs_pad  # utt x frame x hdim
            self.h_length = self.enc_h.size(1)
            # each entry: utt x frame x att_dim_v
            self.pre_compute_v = [proj(self.enc_h) for proj in self.mlp_v]

        dec_z = (
            enc_hs_pad.new_zeros(batch, self.dunits)
            if dec_z is None
            else dec_z.view(batch, self.dunits)
        )

        # padded frames must get zero weight, so their energy is set to -inf
        if self.mask is None:
            self.mask = to_device(enc_hs_pad, make_pad_mask(enc_hs_len))

        contexts = []
        weights = []
        for h in range(self.aheads):
            query = self.mlp_q[h](dec_z).view(batch, 1, self.att_dim_k)
            # utt x frame x att_dim_k -> utt x frame
            energy = self.gvec[h](torch.tanh(self.pre_compute_k[h] + query)).squeeze(2)
            energy.masked_fill_(self.mask, -float("inf"))
            head_w = F.softmax(self.scaling * energy, dim=1)
            weights.append(head_w)

            # weighted sum over frames: utt x att_dim_v
            contexts.append(
                torch.sum(
                    self.pre_compute_v[h] * head_w.view(batch, self.h_length, 1),
                    dim=1,
                )
            )

        # merge the heads and project back to the encoder dimension
        c = self.mlp_o(torch.cat(contexts, dim=1))

        return c, weights


class AttMultiHeadLoc(torch.nn.Module):
    """Multi-head attention with location-aware scoring

    Reference: Attention is all you need
        (https://arxiv.org/abs/1706.03762)

    Each head is a location-aware additive attention: on top of key and
    query, the head's previous attention weights are convolved and added to
    the pre-activation of the energy.

    NOTE: ``self.scaling`` is stored for parity with the other multi-head
    attentions, but ``forward`` scales the energies with its own ``scaling``
    argument.

    :param int eprojs: # projection-units of encoder
    :param int dunits: # units of decoder
    :param int aheads: # heads of multi head attention
    :param int att_dim_k: dimension k in multi head attention
    :param int att_dim_v: dimension v in multi head attention
    :param int aconv_chans: # channels of attention convolution
    :param int aconv_filts: filter size of attention convolution
        (the kernel spans 2 * aconv_filts + 1 frames)
    :param bool han_mode: flag to switch on mode of hierarchical attention;
        when set, keys and values are recomputed on every forward call
        instead of being reused from the cache
    """

    def __init__(
        self,
        eprojs,
        dunits,
        aheads,
        att_dim_k,
        att_dim_v,
        aconv_chans,
        aconv_filts,
        han_mode=False,
    ):
        super(AttMultiHeadLoc, self).__init__()
        self.mlp_q = torch.nn.ModuleList()
        self.mlp_k = torch.nn.ModuleList()
        self.mlp_v = torch.nn.ModuleList()
        self.gvec = torch.nn.ModuleList()
        self.loc_conv = torch.nn.ModuleList()
        self.mlp_att = torch.nn.ModuleList()
        kernel = (1, 2 * aconv_filts + 1)
        pad = (0, aconv_filts)
        for _ in range(aheads):
            self.mlp_q.append(torch.nn.Linear(dunits, att_dim_k))
            self.mlp_k.append(torch.nn.Linear(eprojs, att_dim_k, bias=False))
            self.mlp_v.append(torch.nn.Linear(eprojs, att_dim_v, bias=False))
            self.gvec.append(torch.nn.Linear(att_dim_k, 1))
            self.loc_conv.append(
                torch.nn.Conv2d(1, aconv_chans, kernel, padding=pad, bias=False)
            )
            self.mlp_att.append(torch.nn.Linear(aconv_chans, att_dim_k, bias=False))
        self.mlp_o = torch.nn.Linear(aheads * att_dim_v, eprojs, bias=False)
        self.dunits = dunits
        self.eprojs = eprojs
        self.aheads = aheads
        self.att_dim_k = att_dim_k
        self.att_dim_v = att_dim_v
        self.scaling = 1.0 / math.sqrt(att_dim_k)
        self.han_mode = han_mode
        # the per-utterance caches start out empty
        self.reset()

    def reset(self):
        """Clear everything cached from the previous batch"""
        self.h_length = None
        self.enc_h = None
        self.pre_compute_k = None
        self.pre_compute_v = None
        self.mask = None

    def forward(self, enc_hs_pad, enc_hs_len, dec_z, att_prev, scaling=2.0):
        """AttMultiHeadLoc forward

        :param torch.Tensor enc_hs_pad: padded encoder hidden state (B x T_max x D_enc)
        :param list enc_hs_len: padded encoder hidden state length (B)
        :param torch.Tensor dec_z: decoder hidden state (B x D_dec);
            zeros are used when it is None
        :param torch.Tensor att_prev:
            list of previous attention weight (B x T_max) * aheads;
            a uniform distribution over the valid frames is used when None
        :param float scaling: scaling parameter before applying softmax
        :return: attention weighted encoder state (B x D_enc)
        :rtype: torch.Tensor
        :return: list of attention weights (B x T_max) * aheads
        :rtype: list
        """
        batch = enc_hs_pad.size(0)

        # keys: computed once per batch (or every call in HAN mode)
        if self.pre_compute_k is None or self.han_mode:
            self.enc_h = enc_hs_pad  # utt x frame x hdim
            self.h_length = self.enc_h.size(1)
            # each entry: utt x frame x att_dim_k
            self.pre_compute_k = [proj(self.enc_h) for proj in self.mlp_k]

        # values: same caching policy as the keys
        if self.pre_compute_v is None or self.han_mode:
            self.enc_h = enc_hs_pad  # utt x frame x hdim
            self.h_length = self.enc_h.size(1)
            # each entry: utt x frame x att_dim_v
            self.pre_compute_v = [proj(self.enc_h) for proj in self.mlp_v]

        dec_z = (
            enc_hs_pad.new_zeros(batch, self.dunits)
            if dec_z is None
            else dec_z.view(batch, self.dunits)
        )

        if att_prev is None:
            # first step: every head starts from a uniform distribution over
            # the non-padded frames (padded positions stay exactly zero)
            att_prev = []
            for _ in range(self.aheads):
                valid = 1.0 - make_pad_mask(enc_hs_len).float()
                lengths = valid.new(enc_hs_len).unsqueeze(-1)
                att_prev.append(to_device(enc_hs_pad, valid / lengths))

        # padded frames must get zero weight, so their energy is set to -inf
        if self.mask is None:
            self.mask = to_device(enc_hs_pad, make_pad_mask(enc_hs_len))

        contexts = []
        weights = []
        for h in range(self.aheads):
            # utt x frame -> utt x 1 x 1 x frame -> utt x chans x 1 x frame
            loc_feat = self.loc_conv[h](att_prev[h].view(batch, 1, 1, self.h_length))
            # -> utt x frame x chans -> utt x frame x att_dim_k
            loc_feat = self.mlp_att[h](loc_feat.squeeze(2).transpose(1, 2))

            query = self.mlp_q[h](dec_z).view(batch, 1, self.att_dim_k)
            energy = self.gvec[h](
                torch.tanh(self.pre_compute_k[h] + loc_feat + query)
            ).squeeze(2)
            energy.masked_fill_(self.mask, -float("inf"))
            head_w = F.softmax(scaling * energy, dim=1)
            weights.append(head_w)

            # weighted sum over frames: utt x att_dim_v
            contexts.append(
                torch.sum(
                    self.pre_compute_v[h] * head_w.view(batch, self.h_length, 1),
                    dim=1,
                )
            )

        # merge the heads and project back to the encoder dimension
        c = self.mlp_o(torch.cat(contexts, dim=1))

        return c, weights


class AttMultiHeadMultiResLoc(torch.nn.Module):
    """Multi-head, multi-resolution location-aware attention

    Reference: Attention is all you need
        (https://arxiv.org/abs/1706.03762)

    Each head is a location-aware additive attention, and every head uses a
    different convolution filter size over the previous attention weights.

    :param int eprojs: # projection-units of encoder
    :param int dunits: # units of decoder
    :param int aheads: # heads of multi head attention
    :param int att_dim_k: dimension k in multi head attention
    :param int att_dim_v: dimension v in multi head attention
    :param int aconv_chans: # channels of attention convolution
        (identical for all heads)
    :param int aconv_filts: maximum filter size of attention convolution;
        head h uses afilts = aconv_filts * (h + 1) // aheads and a kernel
        spanning 2 * afilts + 1 frames
        e.g. aheads=4, aconv_filts=100 => afilts = 25, 50, 75, 100
    :param bool han_mode: flag to switch on mode of hierarchical attention;
        when set, keys and values are recomputed on every forward call
        instead of being reused from the cache
    """

    def __init__(
        self,
        eprojs,
        dunits,
        aheads,
        att_dim_k,
        att_dim_v,
        aconv_chans,
        aconv_filts,
        han_mode=False,
    ):
        super(AttMultiHeadMultiResLoc, self).__init__()
        self.mlp_q = torch.nn.ModuleList()
        self.mlp_k = torch.nn.ModuleList()
        self.mlp_v = torch.nn.ModuleList()
        self.gvec = torch.nn.ModuleList()
        self.loc_conv = torch.nn.ModuleList()
        self.mlp_att = torch.nn.ModuleList()
        for head in range(aheads):
            self.mlp_q.append(torch.nn.Linear(dunits, att_dim_k))
            self.mlp_k.append(torch.nn.Linear(eprojs, att_dim_k, bias=False))
            self.mlp_v.append(torch.nn.Linear(eprojs, att_dim_v, bias=False))
            self.gvec.append(torch.nn.Linear(att_dim_k, 1))
            # the receptive field grows linearly with the head index
            half_width = aconv_filts * (head + 1) // aheads
            self.loc_conv.append(
                torch.nn.Conv2d(
                    1,
                    aconv_chans,
                    (1, 2 * half_width + 1),
                    padding=(0, half_width),
                    bias=False,
                )
            )
            self.mlp_att.append(torch.nn.Linear(aconv_chans, att_dim_k, bias=False))
        self.mlp_o = torch.nn.Linear(aheads * att_dim_v, eprojs, bias=False)
        self.dunits = dunits
        self.eprojs = eprojs
        self.aheads = aheads
        self.att_dim_k = att_dim_k
        self.att_dim_v = att_dim_v
        self.scaling = 1.0 / math.sqrt(att_dim_k)
        self.han_mode = han_mode
        # the per-utterance caches start out empty
        self.reset()

    def reset(self):
        """Clear everything cached from the previous batch"""
        self.h_length = None
        self.enc_h = None
        self.pre_compute_k = None
        self.pre_compute_v = None
        self.mask = None

    def forward(self, enc_hs_pad, enc_hs_len, dec_z, att_prev):
        """AttMultiHeadMultiResLoc forward

        :param torch.Tensor enc_hs_pad: padded encoder hidden state (B x T_max x D_enc)
        :param list enc_hs_len: padded encoder hidden state length (B)
        :param torch.Tensor dec_z: decoder hidden state (B x D_dec);
            zeros are used when it is None
        :param torch.Tensor att_prev: list of previous attention weight
            (B x T_max) * aheads; a uniform distribution over the valid
            frames is used when None
        :return: attention weighted encoder state (B x D_enc)
        :rtype: torch.Tensor
        :return: list of attention weights (B x T_max) * aheads
        :rtype: list
        """
        batch = enc_hs_pad.size(0)

        # keys: computed once per batch (or every call in HAN mode)
        if self.pre_compute_k is None or self.han_mode:
            self.enc_h = enc_hs_pad  # utt x frame x hdim
            self.h_length = self.enc_h.size(1)
            # each entry: utt x frame x att_dim_k
            self.pre_compute_k = [proj(self.enc_h) for proj in self.mlp_k]

        # values: same caching policy as the keys
        if self.pre_compute_v is None or self.han_mode:
            self.enc_h = enc_hs_pad  # utt x frame x hdim
            self.h_length = self.enc_h.size(1)
            # each entry: utt x frame x att_dim_v
            self.pre_compute_v = [proj(self.enc_h) for proj in self.mlp_v]

        dec_z = (
            enc_hs_pad.new_zeros(batch, self.dunits)
            if dec_z is None
            else dec_z.view(batch, self.dunits)
        )

        if att_prev is None:
            # first step: every head starts from a uniform distribution over
            # the non-padded frames (padded positions stay exactly zero)
            att_prev = []
            for _ in range(self.aheads):
                valid = 1.0 - make_pad_mask(enc_hs_len).float()
                lengths = valid.new(enc_hs_len).unsqueeze(-1)
                att_prev.append(to_device(enc_hs_pad, valid / lengths))

        # padded frames must get zero weight, so their energy is set to -inf
        if self.mask is None:
            self.mask = to_device(enc_hs_pad, make_pad_mask(enc_hs_len))

        contexts = []
        weights = []
        for h in range(self.aheads):
            # utt x frame -> utt x 1 x 1 x frame -> utt x chans x 1 x frame
            loc_feat = self.loc_conv[h](att_prev[h].view(batch, 1, 1, self.h_length))
            # -> utt x frame x chans -> utt x frame x att_dim_k
            loc_feat = self.mlp_att[h](loc_feat.squeeze(2).transpose(1, 2))

            query = self.mlp_q[h](dec_z).view(batch, 1, self.att_dim_k)
            energy = self.gvec[h](
                torch.tanh(self.pre_compute_k[h] + loc_feat + query)
            ).squeeze(2)
            energy.masked_fill_(self.mask, -float("inf"))
            head_w = F.softmax(self.scaling * energy, dim=1)
            weights.append(head_w)

            # weighted sum over frames: utt x att_dim_v
            contexts.append(
                torch.sum(
                    self.pre_compute_v[h] * head_w.view(batch, self.h_length, 1),
                    dim=1,
                )
            )

        # merge the heads and project back to the encoder dimension
        c = self.mlp_o(torch.cat(contexts, dim=1))

        return c, weights


class AttForward(torch.nn.Module):
    """Forward attention module.

    Reference:
    Forward attention in sequence-to-sequence acoustic modeling for speech synthesis
        (https://arxiv.org/pdf/1807.06736.pdf)

    Location-aware energies are turned into weights by softmax, gated by the
    previous weights and their one-frame-shifted copy, and renormalised.

    :param int eprojs: # projection-units of encoder
    :param int dunits: # units of decoder
    :param int att_dim: attention dimension
    :param int aconv_chans: # channels of attention convolution
    :param int aconv_filts: filter size of attention convolution
        (the kernel spans 2 * aconv_filts + 1 frames)
    """

    def __init__(self, eprojs, dunits, att_dim, aconv_chans, aconv_filts):
        super(AttForward, self).__init__()
        self.mlp_enc = torch.nn.Linear(eprojs, att_dim)
        self.mlp_dec = torch.nn.Linear(dunits, att_dim, bias=False)
        self.mlp_att = torch.nn.Linear(aconv_chans, att_dim, bias=False)
        kernel = (1, 2 * aconv_filts + 1)
        self.loc_conv = torch.nn.Conv2d(
            1, aconv_chans, kernel, padding=(0, aconv_filts), bias=False
        )
        self.gvec = torch.nn.Linear(att_dim, 1)
        self.dunits = dunits
        self.eprojs = eprojs
        self.att_dim = att_dim
        # the per-utterance caches start out empty
        self.reset()

    def reset(self):
        """Clear everything cached from the previous batch"""
        self.h_length = None
        self.enc_h = None
        self.pre_compute_enc_h = None
        self.mask = None

    def forward(
        self,
        enc_hs_pad,
        enc_hs_len,
        dec_z,
        att_prev,
        scaling=1.0,
        last_attended_idx=None,
        backward_window=1,
        forward_window=3,
    ):
        """Calculate AttForward forward propagation.

        :param torch.Tensor enc_hs_pad: padded encoder hidden state (B x T_max x D_enc)
        :param list enc_hs_len: padded encoder hidden state length (B)
        :param torch.Tensor dec_z: decoder hidden state (B x D_dec);
            zeros are used when it is None
        :param torch.Tensor att_prev: attention weights of previous step;
            a one-hot on the first frame is used when it is None
        :param float scaling: scaling parameter before applying softmax
        :param int last_attended_idx: index of the inputs of the last attended
        :param int backward_window: backward window size in attention constraint
        :param int forward_window: forward window size in attention constraint
        :return: attention weighted encoder state (B, D_enc)
        :rtype: torch.Tensor
        :return: attention weights of this step (B x T_max)
        :rtype: torch.Tensor
        """
        batch = len(enc_hs_pad)

        # the encoder projection is computed once and reused until reset()
        if self.pre_compute_enc_h is None:
            self.enc_h = enc_hs_pad  # utt x frame x hdim
            self.h_length = self.enc_h.size(1)
            # utt x frame x att_dim
            self.pre_compute_enc_h = self.mlp_enc(self.enc_h)

        dec_z = (
            enc_hs_pad.new_zeros(batch, self.dunits)
            if dec_z is None
            else dec_z.view(batch, self.dunits)
        )

        if att_prev is None:
            # initial attention will be [1, 0, 0, ...]
            att_prev = enc_hs_pad.new_zeros(batch, enc_hs_pad.size(1))
            att_prev[:, 0] = 1.0

        # utt x frame -> utt x 1 x 1 x frame -> utt x att_conv_chans x 1 x frame
        loc_feat = self.loc_conv(att_prev.view(batch, 1, 1, self.h_length))
        # -> utt x frame x att_conv_chans -> utt x frame x att_dim
        loc_feat = self.mlp_att(loc_feat.squeeze(2).transpose(1, 2))

        # utt x 1 x att_dim, broadcast over the frames
        dec_z_tiled = self.mlp_dec(dec_z).unsqueeze(1)

        # utt x frame x att_dim -> utt x frame
        hidden = torch.tanh(self.pre_compute_enc_h + dec_z_tiled + loc_feat)
        e = self.gvec(hidden).squeeze(2)

        # NOTE: padded frames get -inf energy so softmax gives them zero weight
        if self.mask is None:
            self.mask = to_device(enc_hs_pad, make_pad_mask(enc_hs_len))
        e.masked_fill_(self.mask, -float("inf"))

        # apply monotonic attention constraint (mainly for TTS)
        if last_attended_idx is not None:
            e = _apply_attention_constraint(
                e, last_attended_idx, backward_window, forward_window
            )

        w = F.softmax(scaling * e, dim=1)

        # forward attention: gate by the previous weights at the same frame
        # and at the frame before it
        shifted_prev = F.pad(att_prev, (1, 0))[:, :-1]
        w = (att_prev + shifted_prev) * w
        # NOTE: clamp is needed to avoid nan gradient
        w = F.normalize(torch.clamp(w, min=1e-6), p=1, dim=1)

        # weighted sum over frames: utt x hdim
        c = torch.sum(self.enc_h * w.unsqueeze(-1), dim=1)

        return c, w


class AttForwardTA(torch.nn.Module):
    """Forward attention with transition agent module.

    Reference:
    Forward attention in sequence-to-sequence acoustic modeling for speech synthesis
        (https://arxiv.org/pdf/1807.06736.pdf)

    Same recursion as forward attention, except that the previous weights and
    their one-frame-shifted copy are mixed with a transition probability that
    is re-estimated after every step (starting from 0.5).

    :param int eunits: # units of encoder
    :param int dunits: # units of decoder
    :param int att_dim: attention dimension
    :param int aconv_chans: # channels of attention convolution
    :param int aconv_filts: filter size of attention convolution
        (the kernel spans 2 * aconv_filts + 1 frames)
    :param int odim: output dimension
    """

    def __init__(self, eunits, dunits, att_dim, aconv_chans, aconv_filts, odim):
        super(AttForwardTA, self).__init__()
        self.mlp_enc = torch.nn.Linear(eunits, att_dim)
        self.mlp_dec = torch.nn.Linear(dunits, att_dim, bias=False)
        # transition agent: (context, previous output, decoder state) -> prob
        self.mlp_ta = torch.nn.Linear(eunits + dunits + odim, 1)
        self.mlp_att = torch.nn.Linear(aconv_chans, att_dim, bias=False)
        kernel = (1, 2 * aconv_filts + 1)
        self.loc_conv = torch.nn.Conv2d(
            1, aconv_chans, kernel, padding=(0, aconv_filts), bias=False
        )
        self.gvec = torch.nn.Linear(att_dim, 1)
        self.dunits = dunits
        self.eunits = eunits
        self.att_dim = att_dim
        # empty caches and the initial transition probability
        self.reset()

    def reset(self):
        """Clear the cached batch state and restore the initial transition prob"""
        self.h_length = None
        self.enc_h = None
        self.pre_compute_enc_h = None
        self.mask = None
        self.trans_agent_prob = 0.5

    def forward(
        self,
        enc_hs_pad,
        enc_hs_len,
        dec_z,
        att_prev,
        out_prev,
        scaling=1.0,
        last_attended_idx=None,
        backward_window=1,
        forward_window=3,
    ):
        """Calculate AttForwardTA forward propagation.

        :param torch.Tensor enc_hs_pad: padded encoder hidden state (B, Tmax, eunits)
        :param list enc_hs_len: padded encoder hidden state length (B)
        :param torch.Tensor dec_z: decoder hidden state (B, dunits);
            zeros are used when it is None
        :param torch.Tensor att_prev: attention weights of previous step;
            a one-hot on the first frame is used when it is None
        :param torch.Tensor out_prev: decoder outputs of previous step (B, odim)
        :param float scaling: scaling parameter before applying softmax
        :param int last_attended_idx: index of the inputs of the last attended
        :param int backward_window: backward window size in attention constraint
        :param int forward_window: forward window size in attention constraint
        :return: attention weighted encoder state (B, eunits)
        :rtype: torch.Tensor
        :return: attention weights of this step (B, Tmax)
        :rtype: torch.Tensor
        """
        batch = len(enc_hs_pad)

        # the encoder projection is computed once and reused until reset()
        if self.pre_compute_enc_h is None:
            self.enc_h = enc_hs_pad  # utt x frame x hdim
            self.h_length = self.enc_h.size(1)
            # utt x frame x att_dim
            self.pre_compute_enc_h = self.mlp_enc(self.enc_h)

        dec_z = (
            enc_hs_pad.new_zeros(batch, self.dunits)
            if dec_z is None
            else dec_z.view(batch, self.dunits)
        )

        if att_prev is None:
            # initial attention will be [1, 0, 0, ...]
            att_prev = enc_hs_pad.new_zeros(batch, enc_hs_pad.size(1))
            att_prev[:, 0] = 1.0

        # utt x frame -> utt x 1 x 1 x frame -> utt x att_conv_chans x 1 x frame
        loc_feat = self.loc_conv(att_prev.view(batch, 1, 1, self.h_length))
        # -> utt x frame x att_conv_chans -> utt x frame x att_dim
        loc_feat = self.mlp_att(loc_feat.squeeze(2).transpose(1, 2))

        # utt x 1 x att_dim, broadcast over the frames
        dec_z_tiled = self.mlp_dec(dec_z).view(batch, 1, self.att_dim)

        # utt x frame x att_dim -> utt x frame
        hidden = torch.tanh(loc_feat + self.pre_compute_enc_h + dec_z_tiled)
        e = self.gvec(hidden).squeeze(2)

        # NOTE: padded frames get -inf energy so softmax gives them zero weight
        if self.mask is None:
            self.mask = to_device(enc_hs_pad, make_pad_mask(enc_hs_len))
        e.masked_fill_(self.mask, -float("inf"))

        # apply monotonic attention constraint (mainly for TTS)
        if last_attended_idx is not None:
            e = _apply_attention_constraint(
                e, last_attended_idx, backward_window, forward_window
            )

        w = F.softmax(scaling * e, dim=1)

        # forward attention: mix "stay" (same frame) and "move" (previous
        # frame) using the current transition probability
        shifted_prev = F.pad(att_prev, (1, 0))[:, :-1]
        gate = (
            self.trans_agent_prob * att_prev
            + (1 - self.trans_agent_prob) * shifted_prev
        )
        w = gate * w
        # NOTE: clamp is needed to avoid nan gradient
        w = F.normalize(torch.clamp(w, min=1e-6), p=1, dim=1)

        # weighted sum over frames: utt x hdim
        c = torch.sum(self.enc_h * w.view(batch, self.h_length, 1), dim=1)

        # update transition agent prob for the next step
        ta_input = torch.cat([c, out_prev, dec_z], dim=1)
        self.trans_agent_prob = torch.sigmoid(self.mlp_ta(ta_input))

        return c, w


def att_for(args, num_att=1, han_mode=False):
    """Instantiates an attention module given the program arguments

    Three configurations are handled:

    * ``num_encs == 1``: ``num_att`` identical attention modules are built
      from the scalar options in ``args`` (``han_mode`` is not consulted).
    * ``num_encs > 1`` with ``han_mode``: one hierarchical attention module
      is built from the ``han_*`` options and returned directly (not wrapped
      in a ModuleList).
    * ``num_encs > 1`` without ``han_mode``: one attention module per encoder
      is built, taking the idx-th entry of each list-valued option.

    :param Namespace args: The arguments
    :param int num_att: number of attention modules
        (in multi-speaker case, it can be 2 or more)
    :param bool han_mode: switch on/off mode of hierarchical attention network (HAN)
    :rtype torch.nn.Module
    :return: The attention module
    :raises ValueError: if ``args.num_encs`` is smaller than one
    """
    att_list = torch.nn.ModuleList()
    num_encs = getattr(args, "num_encs", 1)  # use getattr to keep compatibility
    aheads = getattr(args, "aheads", None)
    awin = getattr(args, "awin", None)
    aconv_chans = getattr(args, "aconv_chans", None)
    aconv_filts = getattr(args, "aconv_filts", None)

    if num_encs == 1:
        for _ in range(num_att):
            att = initial_att(
                args.atype,
                args.eprojs,
                args.dunits,
                aheads,
                args.adim,
                awin,
                aconv_chans,
                aconv_filts,
            )
            att_list.append(att)
    elif num_encs > 1:  # no multi-speaker mode
        if han_mode:
            # hierarchical attention over the encoder streams: a single module
            return initial_att(
                args.han_type,
                args.eprojs,
                args.dunits,
                args.han_heads,
                args.han_dim,
                args.han_win,
                args.han_conv_chans,
                args.han_conv_filts,
                han_mode=True,
            )
        for idx in range(num_encs):
            att = initial_att(
                args.atype[idx],
                args.eprojs,
                args.dunits,
                aheads[idx],
                args.adim[idx],
                awin[idx],
                aconv_chans[idx],
                aconv_filts[idx],
            )
            att_list.append(att)
    else:
        # a single encoder is valid, so the requirement is "at least one"
        raise ValueError(
            "Number of encoders needs to be at least one. {}".format(num_encs)
        )
    return att_list


def initial_att(
    atype, eprojs, dunits, aheads, adim, awin, aconv_chans, aconv_filts, han_mode=False
):
    """Instantiates a single attention module

    :param str atype: attention type
    :param int eprojs: # projection-units of encoder
    :param int dunits: # units of decoder
    :param int aheads: # heads of multi head attention
    :param int adim: attention dimension
    :param int awin: attention window size
    :param int aconv_chans: # channels of attention convolution
    :param int aconv_filts: filter size of attention convolution
    :param bool han_mode: flag to switch on mode of hierarchical attention
    :return: The attention module
    :raises ValueError: if ``atype`` is not one of the supported attention types
    """

    if atype == "noatt":
        att = NoAtt()
    elif atype == "dot":
        att = AttDot(eprojs, dunits, adim, han_mode)
    elif atype == "add":
        att = AttAdd(eprojs, dunits, adim, han_mode)
    elif atype == "location":
        att = AttLoc(eprojs, dunits, adim, aconv_chans, aconv_filts, han_mode)
    elif atype == "location2d":
        att = AttLoc2D(eprojs, dunits, adim, awin, aconv_chans, aconv_filts, han_mode)
    elif atype == "location_recurrent":
        att = AttLocRec(eprojs, dunits, adim, aconv_chans, aconv_filts, han_mode)
    elif atype == "coverage":
        att = AttCov(eprojs, dunits, adim, han_mode)
    elif atype == "coverage_location":
        att = AttCovLoc(eprojs, dunits, adim, aconv_chans, aconv_filts, han_mode)
    elif atype == "multi_head_dot":
        # the attention dimension is used for both the key and the value size
        att = AttMultiHeadDot(eprojs, dunits, aheads, adim, adim, han_mode)
    elif atype == "multi_head_add":
        att = AttMultiHeadAdd(eprojs, dunits, aheads, adim, adim, han_mode)
    elif atype == "multi_head_loc":
        att = AttMultiHeadLoc(
            eprojs, dunits, aheads, adim, adim, aconv_chans, aconv_filts, han_mode
        )
    elif atype == "multi_head_multi_res_loc":
        att = AttMultiHeadMultiResLoc(
            eprojs, dunits, aheads, adim, adim, aconv_chans, aconv_filts, han_mode
        )
    else:
        # Fail loudly instead of hitting an unbound local on the return below.
        raise ValueError("unsupported attention type: {!r}".format(atype))
    return att


def att_to_numpy(att_ws, att):
    """Stack the attention weights collected over the steps into a numpy array

    How one entry of ``att_ws`` is laid out depends on the attention class,
    so each entry is first reduced to a plain tensor and the tensors are then
    stacked along a new dimension 1.

    :param list att_ws: The attention weights, one entry per step
    :param torch.nn.Module att: The attention that produced ``att_ws``
    :rtype: np.ndarray
    :return: The numpy array of the attention weights
    """
    multi_head_types = (
        AttMultiHeadDot,
        AttMultiHeadAdd,
        AttMultiHeadLoc,
        AttMultiHeadMultiResLoc,
    )

    if isinstance(att, AttLoc2D):
        # every entry holds concatenated previous attentions; keep the last
        per_step = [weights[:, -1] for weights in att_ws]
    elif isinstance(att, (AttCov, AttCovLoc)):
        # every entry is a list of previous attentions; take the step-th one
        per_step = [weights[step] for step, weights in enumerate(att_ws)]
    elif isinstance(att, AttLocRec):
        # every entry is a tuple (attention, hidden states)
        per_step = [weights[0] for weights in att_ws]
    elif isinstance(att, multi_head_types):
        # every entry is a list with one attention per head: stack the steps
        # of each head first, then stack the heads
        per_head = [
            torch.stack([weights[head] for weights in att_ws], dim=1)
            for head in range(len(att_ws[0]))
        ]
        return torch.stack(per_head, dim=1).cpu().numpy()
    else:
        # entries already are plain attention tensors
        per_step = att_ws

    return torch.stack(per_step, dim=1).cpu().numpy()


================================================
FILE: nets/pytorch_backend/rnn/decoders.py
================================================
from distutils.version import LooseVersion
import logging
import math
import random
import six

import numpy as np
import torch
import torch.nn.functional as F

from argparse import Namespace

from espnet.nets.ctc_prefix_score import CTCPrefixScore
from espnet.nets.ctc_prefix_score import CTCPrefixScoreTH
from espnet.nets.e2e_asr_common import end_detect

from espnet.nets.pytorch_backend.rnn.attentions import att_to_numpy

from espnet.nets.pytorch_backend.nets_utils import mask_by_length
from espnet.nets.pytorch_backend.nets_utils import pad_list
from espnet.nets.pytorch_backend.nets_utils import th_accuracy
from espnet.nets.pytorch_backend.nets_utils import to_device
from espnet.nets.scorer_interface import ScorerInterface

# Upper bound on the number of utterances of a batch whose ground truth and
# prediction are written to the log when verbose output is enabled.
MAX_DECODER_OUTPUT = 5
# Factor applied to the beam size to obtain how many attention-ranked
# candidates are rescored with the CTC prefix scorer during beam search.
CTC_SCORING_RATIO = 1.5


class Decoder(torch.nn.Module, ScorerInterface):
    """Decoder module

    :param int eprojs: encoder projection units
    :param int odim: dimension of outputs
    :param str dtype: gru or lstm
    :param int dlayers: decoder layers
    :param int dunits: decoder units
    :param int sos: start of sequence symbol id
    :param int eos: end of sequence symbol id
    :param torch.nn.Module att: attention module
    :param int verbose: verbose level
    :param list char_list: list of character strings
    :param ndarray labeldist: distribution of label smoothing
    :param float lsm_weight: label smoothing weight
    :param float sampling_probability: scheduled sampling probability
    :param float dropout: dropout rate
    :param float context_residual: if True, use context vector for token generation
    :param float replace_sos: use for multilingual (speech/text) translation
    """

    def __init__(
        self,
        eprojs,
        odim,
        dtype,
        dlayers,
        dunits,
        sos,
        eos,
        att,
        verbose=0,
        char_list=None,
        labeldist=None,
        lsm_weight=0.0,
        sampling_probability=0.0,
        dropout=0.0,
        context_residual=False,
        replace_sos=False,
        num_encs=1,
    ):

        torch.nn.Module.__init__(self)
        self.dtype = dtype
        self.dunits = dunits
        self.dlayers = dlayers
        self.context_residual = context_residual
        self.embed = torch.nn.Embedding(odim, dunits)
        self.dropout_emb = torch.nn.Dropout(p=dropout)

        self.decoder = torch.nn.ModuleList()
        self.dropout_dec = torch.nn.ModuleList()
        self.decoder += [
            torch.nn.LSTMCell(dunits + eprojs, dunits)
            if self.dtype == "lstm"
            else torch.nn.GRUCell(dunits + eprojs, dunits)
        ]
        self.dropout_dec += [torch.nn.Dropout(p=dropout)]
        for _ in six.moves.range(1, self.dlayers):
            self.decoder += [
                torch.nn.LSTMCell(dunits, dunits)
                if self.dtype == "lstm"
                else torch.nn.GRUCell(dunits, dunits)
            ]
            self.dropout_dec += [torch.nn.Dropout(p=dropout)]
            # NOTE: dropout is applied only for the vertical connections
            # see https://arxiv.org/pdf/1409.2329.pdf
        self.ignore_id = -1

        if context_residual:
            self.output = torch.nn.Linear(dunits + eprojs, odim)
        else:
            self.output = torch.nn.Linear(dunits, odim)

        self.loss = None
        self.att = att
        self.dunits = dunits
        self.sos = sos
        self.eos = eos
        self.odim = odim
        self.verbose = verbose
        self.char_list = char_list
        # for label smoothing
        self.labeldist = labeldist
        self.vlabeldist = None
        self.lsm_weight = lsm_weight
        self.sampling_probability = sampling_probability
        self.dropout = dropout
        self.num_encs = num_encs

        # for multilingual E2E-ST
        self.replace_sos = replace_sos

        self.logzero = -10000000000.0

    def zero_state(self, hs_pad):
        return hs_pad.new_zeros(hs_pad.size(0), self.dunits)

    def rnn_forward(self, ey, z_list, c_list, z_prev, c_prev):
        if self.dtype == "lstm":
            z_list[0], c_list[0] = self.decoder[0](ey, (z_prev[0], c_prev[0]))
            for i in six.moves.range(1, self.dlayers):
                z_list[i], c_list[i] = self.decoder[i](
                    self.dropout_dec[i - 1](z_list[i - 1]), (z_prev[i], c_prev[i])
                )
        else:
            z_list[0] = self.decoder[0](ey, z_prev[0])
            for i in six.moves.range(1, self.dlayers):
                z_list[i] = self.decoder[i](
                    self.dropout_dec[i - 1](z_list[i - 1]), z_prev[i]
                )
        return z_list, c_list

    def forward(self, hs_pad, hlens, ys_pad, strm_idx=0, lang_ids=None):
        """Decoder forward

        Runs the decoder over the whole target sequence, feeding the reference
        token at every step (or, with probability ``sampling_probability``,
        the model's own previous prediction), and computes the cross-entropy
        loss, the accuracy and the perplexity.

        :param torch.Tensor hs_pad: batch of padded hidden state sequences (B, Tmax, D)
                                    [in multi-encoder case,
                                    list of torch.Tensor,
                                    [(B, Tmax_1, D), (B, Tmax_2, D), ..., ] ]
        :param torch.Tensor hlens: batch of lengths of hidden state sequences (B)
                                   [in multi-encoder case, list of torch.Tensor,
                                   [(B), (B), ..., ]
        :param torch.Tensor ys_pad: batch of padded character id sequence tensor
                                    (B, Lmax)
        :param int strm_idx: stream index indicates the index of decoding stream.
        :param torch.Tensor lang_ids: batch of target language id tensor (B, 1)
        :return: attention loss value
        :rtype: torch.Tensor
        :return: accuracy
        :rtype: float
        :return: perplexity (exp of the mean cross-entropy)
        :rtype: float
        """
        # to support multiple encoder asr mode, in single encoder mode,
        # convert torch.Tensor to List of torch.Tensor
        if self.num_encs == 1:
            hs_pad = [hs_pad]
            hlens = [hlens]

        # TODO(kan-bayashi): need to make more smart way
        ys = [y[y != self.ignore_id] for y in ys_pad]  # parse padded ys
        # attention index for the attention module
        # in SPA (speaker parallel attention),
        # att_idx is used to select attention module. In other cases, it is 0.
        att_idx = min(strm_idx, len(self.att) - 1)

        # hlens should be list of list of integer
        hlens = [list(map(int, hlens[idx])) for idx in range(self.num_encs)]

        self.loss = None
        # prepare input and output word sequences with sos/eos IDs
        eos = ys[0].new([self.eos])
        sos = ys[0].new([self.sos])
        if self.replace_sos:
            # the language id takes the place of <sos> as the first input token
            ys_in = [torch.cat([idx, y], dim=0) for idx, y in zip(lang_ids, ys)]
        else:
            ys_in = [torch.cat([sos, y], dim=0) for y in ys]
        ys_out = [torch.cat([y, eos], dim=0) for y in ys]

        # padding: inputs are padded with eos, targets with ignore_id (-1)
        # pys: utt x olen
        ys_in_pad = pad_list(ys_in, self.eos)
        ys_out_pad = pad_list(ys_out, self.ignore_id)

        # get dim, length info
        batch = ys_out_pad.size(0)
        olength = ys_out_pad.size(1)
        for idx in range(self.num_encs):
            logging.info(
                self.__class__.__name__
                + "Number of Encoder:{}; enc{}: input lengths: {}.".format(
                    self.num_encs, idx + 1, hlens[idx]
                )
            )
        logging.info(
            self.__class__.__name__
            + " output lengths: "
            + str([y.size(0) for y in ys_out])
        )

        # initialization: zero hidden/cell state for every decoder layer
        c_list = [self.zero_state(hs_pad[0])]
        z_list = [self.zero_state(hs_pad[0])]
        for _ in six.moves.range(1, self.dlayers):
            c_list.append(self.zero_state(hs_pad[0]))
            z_list.append(self.zero_state(hs_pad[0]))
        z_all = []
        if self.num_encs == 1:
            att_w = None
            self.att[att_idx].reset()  # reset pre-computation of h
        else:
            att_w_list = [None] * (self.num_encs + 1)  # atts + han
            att_c_list = [None] * (self.num_encs)  # atts
            for idx in range(self.num_encs + 1):
                self.att[idx].reset()  # reset pre-computation of h in atts and han

        # pre-computation of embedding
        eys = self.dropout_emb(self.embed(ys_in_pad))  # utt x olen x zdim

        # loop for an output sequence
        for i in six.moves.range(olength):
            if self.num_encs == 1:
                att_c, att_w = self.att[att_idx](
                    hs_pad[0], hlens[0], self.dropout_dec[0](z_list[0]), att_w
                )
            else:
                # one context vector per encoder ...
                for idx in range(self.num_encs):
                    att_c_list[idx], att_w_list[idx] = self.att[idx](
                        hs_pad[idx],
                        hlens[idx],
                        self.dropout_dec[0](z_list[0]),
                        att_w_list[idx],
                    )
                # ... which the last attention module (HAN) then combines by
                # attending over the num_encs stacked context vectors
                hs_pad_han = torch.stack(att_c_list, dim=1)
                hlens_han = [self.num_encs] * len(ys_in)
                att_c, att_w_list[self.num_encs] = self.att[self.num_encs](
                    hs_pad_han,
                    hlens_han,
                    self.dropout_dec[0](z_list[0]),
                    att_w_list[self.num_encs],
                )
            if i > 0 and random.random() < self.sampling_probability:
                # scheduled sampling: feed the embedding of the token predicted
                # at the previous step instead of the reference token
                logging.info(" scheduled sampling ")
                z_out = self.output(z_all[-1])
                z_out = np.argmax(z_out.detach().cpu(), axis=1)
                z_out = self.dropout_emb(self.embed(to_device(hs_pad[0], z_out)))
                ey = torch.cat((z_out, att_c), dim=1)  # utt x (zdim + hdim)
            else:
                ey = torch.cat((eys[:, i, :], att_c), dim=1)  # utt x (zdim + hdim)
            # the same lists are passed as current and previous state, so the
            # states are updated in place
            z_list, c_list = self.rnn_forward(ey, z_list, c_list, z_list, c_list)
            if self.context_residual:
                z_all.append(
                    torch.cat((self.dropout_dec[-1](z_list[-1]), att_c), dim=-1)
                )  # utt x (zdim + hdim)
            else:
                z_all.append(self.dropout_dec[-1](z_list[-1]))  # utt x (zdim)

        # flatten batch and time so every output position is one row
        z_all = torch.stack(z_all, dim=1).view(batch * olength, -1)
        # compute loss
        y_all = self.output(z_all)
        # the name of the mean reduction differs before torch 1.0
        if LooseVersion(torch.__version__) < LooseVersion("1.0"):
            reduction_str = "elementwise_mean"
        else:
            reduction_str = "mean"
        self.loss = F.cross_entropy(
            y_all,
            ys_out_pad.view(-1),
            ignore_index=self.ignore_id,
            reduction=reduction_str,
        )
        # compute perplexity
        ppl = math.exp(self.loss.item())
        # -1: eos, which is removed in the loss computation
        self.loss *= np.mean([len(x) for x in ys_in]) - 1
        acc = th_accuracy(y_all, ys_out_pad, ignore_label=self.ignore_id)
        logging.info("att loss:" + "".join(str(self.loss.item()).split("\n")))

        # show predicted character sequence for debug
        if self.verbose > 0 and self.char_list is not None:
            ys_hat = y_all.view(batch, olength, -1)
            ys_true = ys_out_pad
            for (i, y_hat), y_true in zip(
                enumerate(ys_hat.detach().cpu().numpy()), ys_true.detach().cpu().numpy()
            ):
                # log at most MAX_DECODER_OUTPUT utterances
                if i == MAX_DECODER_OUTPUT:
                    break
                idx_hat = np.argmax(y_hat[y_true != self.ignore_id], axis=1)
                idx_true = y_true[y_true != self.ignore_id]
                seq_hat = [self.char_list[int(idx)] for idx in idx_hat]
                seq_true = [self.char_list[int(idx)] for idx in idx_true]
                seq_hat = "".join(seq_hat)
                seq_true = "".join(seq_true)
                logging.info("groundtruth[%d]: " % i + seq_true)
                logging.info("prediction [%d]: " % i + seq_hat)

        # label smoothing: interpolate the loss with the cross-entropy against
        # the given label distribution
        if self.labeldist is not None:
            if self.vlabeldist is None:
                # convert once and cache on the device of the encoder output
                self.vlabeldist = to_device(hs_pad[0], torch.from_numpy(self.labeldist))
            loss_reg = -torch.sum(
                (F.log_softmax(y_all, dim=1) * self.vlabeldist).view(-1), dim=0
            ) / len(ys_in)
            self.loss = (1.0 - self.lsm_weight) * self.loss + self.lsm_weight * loss_reg

        return self.loss, acc, ppl

    def recognize_beam(self, h, lpz, recog_args, char_list, rnnlm=None, strm_idx=0):
        """beam search implementation

        Decodes a single utterance. Attention scores are optionally combined
        with CTC prefix scores (when ``lpz`` is given) and language model
        scores (when ``rnnlm`` is given).

        :param torch.Tensor h: encoder hidden state (T, eprojs)
                                [in multi-encoder case, list of torch.Tensor,
                                [(T1, eprojs), (T2, eprojs), ...] ]
        :param torch.Tensor lpz: ctc log softmax output (T, odim)
                                [in multi-encoder case, list of torch.Tensor,
                                [(T1, odim), (T2, odim), ...] ]
        :param Namespace recog_args: argument Namespace containing options
        :param char_list: list of character strings
        :param torch.nn.Module rnnlm: language module
        :param int strm_idx:
            stream index for speaker parallel attention in multi-speaker case
        :return: N-best decoding results; every hypothesis is a dict holding at
            least "score" and "yseq" (token ids, starting with the <sos> id)
        :rtype: list of dicts
        """
        # to support multiple encoder asr mode, in single encoder mode,
        # convert torch.Tensor to List of torch.Tensor
        if self.num_encs == 1:
            h = [h]
            lpz = [lpz]
        if self.num_encs > 1 and lpz is None:
            lpz = [lpz] * self.num_encs

        for idx in range(self.num_encs):
            # report the length of the encoder being listed, not always enc1
            logging.info(
                "Number of Encoder:{}; enc{}: input lengths: {}.".format(
                    self.num_encs, idx + 1, h[idx].size(0)
                )
            )
        att_idx = min(strm_idx, len(self.att) - 1)
        # initialization
        c_list = [self.zero_state(h[0].unsqueeze(0))]
        z_list = [self.zero_state(h[0].unsqueeze(0))]
        for _ in range(1, self.dlayers):
            c_list.append(self.zero_state(h[0].unsqueeze(0)))
            z_list.append(self.zero_state(h[0].unsqueeze(0)))
        if self.num_encs == 1:
            a = None
            self.att[att_idx].reset()  # reset pre-computation of h
        else:
            a = [None] * (self.num_encs + 1)  # atts + han
            att_w_list = [None] * (self.num_encs + 1)  # atts + han
            att_c_list = [None] * (self.num_encs)  # atts
            for idx in range(self.num_encs + 1):
                self.att[idx].reset()  # reset pre-computation of h in atts and han

        # search params
        beam = recog_args.beam_size
        penalty = recog_args.penalty
        ctc_weight = getattr(recog_args, "ctc_weight", 0.0)  # for NMT

        if lpz[0] is not None and self.num_encs > 1:
            # weights-ctc,
            # e.g. ctc_loss = w_1*ctc_1_loss + w_2 * ctc_2_loss + w_N * ctc_N_loss
            weights_ctc_dec = recog_args.weights_ctc_dec / np.sum(
                recog_args.weights_ctc_dec
            )  # normalize
            logging.info(
                "ctc weights (decoding): " + " ".join([str(x) for x in weights_ctc_dec])
            )
        else:
            weights_ctc_dec = [1.0]

        # prepare sos
        if self.replace_sos and recog_args.tgt_lang:
            y = char_list.index(recog_args.tgt_lang)
        else:
            y = self.sos
        logging.info("<sos> index: " + str(y))
        logging.info("<sos> mark: " + char_list[y])
        vy = h[0].new_zeros(1).long()

        # the shortest encoder output bounds the output length
        maxlen = np.amin([h[idx].size(0) for idx in range(self.num_encs)])
        if recog_args.maxlenratio != 0:
            # maxlen >= 1
            maxlen = max(1, int(recog_args.maxlenratio * maxlen))
        minlen = int(recog_args.minlenratio * maxlen)
        logging.info("max output length: " + str(maxlen))
        logging.info("min output length: " + str(minlen))

        # initialize hypothesis
        hyp = {
            "score": 0.0,
            "yseq": [y],
            "c_prev": c_list,
            "z_prev": z_list,
            "a_prev": a,
        }
        if rnnlm:
            hyp["rnnlm_prev"] = None
        if lpz[0] is not None:
            ctc_prefix_score = [
                CTCPrefixScore(lpz[idx].detach().numpy(), 0, self.eos, np)
                for idx in range(self.num_encs)
            ]
            hyp["ctc_state_prev"] = [
                ctc_prefix_score[idx].initial_state() for idx in range(self.num_encs)
            ]
            hyp["ctc_score_prev"] = [0.0] * self.num_encs
            if ctc_weight != 1.0:
                # pre-pruning based on attention scores
                ctc_beam = min(lpz[0].shape[-1], int(beam * CTC_SCORING_RATIO))
            else:
                ctc_beam = lpz[0].shape[-1]
        hyps = [hyp]
        ended_hyps = []

        for i in range(maxlen):
            logging.debug("position " + str(i))

            hyps_best_kept = []
            for hyp in hyps:
                vy[0] = hyp["yseq"][i]
                ey = self.dropout_emb(self.embed(vy))  # utt list (1) x zdim
                if self.num_encs == 1:
                    att_c, att_w = self.att[att_idx](
                        h[0].unsqueeze(0),
                        [h[0].size(0)],
                        self.dropout_dec[0](hyp["z_prev"][0]),
                        hyp["a_prev"],
                    )
                else:
                    # one context vector per encoder, combined by the HAN
                    # attention stored at index num_encs
                    for idx in range(self.num_encs):
                        att_c_list[idx], att_w_list[idx] = self.att[idx](
                            h[idx].unsqueeze(0),
                            [h[idx].size(0)],
                            self.dropout_dec[0](hyp["z_prev"][0]),
                            hyp["a_prev"][idx],
                        )
                    h_han = torch.stack(att_c_list, dim=1)
                    att_c, att_w_list[self.num_encs] = self.att[self.num_encs](
                        h_han,
                        [self.num_encs],
                        self.dropout_dec[0](hyp["z_prev"][0]),
                        hyp["a_prev"][self.num_encs],
                    )
                ey = torch.cat((ey, att_c), dim=1)  # utt(1) x (zdim + hdim)
                z_list, c_list = self.rnn_forward(
                    ey, z_list, c_list, hyp["z_prev"], hyp["c_prev"]
                )

                # get nbest local scores and their ids
                if self.context_residual:
                    logits = self.output(
                        torch.cat((self.dropout_dec[-1](z_list[-1]), att_c), dim=-1)
                    )
                else:
                    logits = self.output(self.dropout_dec[-1](z_list[-1]))
                local_att_scores = F.log_softmax(logits, dim=1)
                if rnnlm:
                    rnnlm_state, local_lm_scores = rnnlm.predict(hyp["rnnlm_prev"], vy)
                    local_scores = (
                        local_att_scores + recog_args.lm_weight * local_lm_scores
                    )
                else:
                    local_scores = local_att_scores

                if lpz[0] is not None:
                    # pre-select ctc_beam candidates by attention score, then
                    # rescore only those with the CTC prefix scorer(s)
                    local_best_scores, local_best_ids = torch.topk(
                        local_att_scores, ctc_beam, dim=1
                    )
                    ctc_scores, ctc_states = (
                        [None] * self.num_encs,
                        [None] * self.num_encs,
                    )
                    for idx in range(self.num_encs):
                        ctc_scores[idx], ctc_states[idx] = ctc_prefix_score[idx](
                            hyp["yseq"], local_best_ids[0], hyp["ctc_state_prev"][idx]
                        )
                    local_scores = (1.0 - ctc_weight) * local_att_scores[
                        :, local_best_ids[0]
                    ]
                    if self.num_encs == 1:
                        local_scores += ctc_weight * torch.from_numpy(
                            ctc_scores[0] - hyp["ctc_score_prev"][0]
                        )
                    else:
                        for idx in range(self.num_encs):
                            local_scores += (
                                ctc_weight
                                * weights_ctc_dec[idx]
                                * torch.from_numpy(
                                    ctc_scores[idx] - hyp["ctc_score_prev"][idx]
                                )
                            )
                    if rnnlm:
                        local_scores += (
                            recog_args.lm_weight * local_lm_scores[:, local_best_ids[0]]
                        )
                    # joint_best_ids index into the pre-selected candidates;
                    # map them back to vocabulary ids
                    local_best_scores, joint_best_ids = torch.topk(
                        local_scores, beam, dim=1
                    )
                    local_best_ids = local_best_ids[:, joint_best_ids[0]]
                else:
                    local_best_scores, local_best_ids = torch.topk(
                        local_scores, beam, dim=1
                    )

                for j in range(beam):
                    new_hyp = {}
                    # [:] is needed!
                    new_hyp["z_prev"] = z_list[:]
                    new_hyp["c_prev"] = c_list[:]
                    if self.num_encs == 1:
                        new_hyp["a_prev"] = att_w[:]
                    else:
                        new_hyp["a_prev"] = [
                            att_w_list[idx][:] for idx in range(self.num_encs + 1)
                        ]
                    new_hyp["score"] = hyp["score"] + local_best_scores[0, j]
                    new_hyp["yseq"] = [0] * (1 + len(hyp["yseq"]))
                    new_hyp["yseq"][: len(hyp["yseq"])] = hyp["yseq"]
                    new_hyp["yseq"][len(hyp["yseq"])] = int(local_best_ids[0, j])
                    if rnnlm:
                        new_hyp["rnnlm_prev"] = rnnlm_state
                    if lpz[0] is not None:
                        new_hyp["ctc_state_prev"] = [
                            ctc_states[idx][joint_best_ids[0, j]]
                            for idx in range(self.num_encs)
                        ]
                        new_hyp["ctc_score_prev"] = [
                            ctc_scores[idx][joint_best_ids[0, j]]
                            for idx in range(self.num_encs)
                        ]
                    # will be (2 x beam) hyps at most
                    hyps_best_kept.append(new_hyp)

                hyps_best_kept = sorted(
                    hyps_best_kept, key=lambda x: x["score"], reverse=True
                )[:beam]

            # sort and get nbest
            hyps = hyps_best_kept
            logging.debug("number of pruned hypotheses: " + str(len(hyps)))
            logging.debug(
                "best hypo: "
                + "".join([char_list[int(x)] for x in hyps[0]["yseq"][1:]])
            )

            # add eos in the final loop to avoid that there are no ended hyps
            if i == maxlen - 1:
                logging.info("adding <eos> in the last position in the loop")
                for hyp in hyps:
                    hyp["yseq"].append(self.eos)

            # add ended hypotheses to a final list,
            # and removed them from current hypotheses
            # (this will be a problem, number of hyps < beam)
            remained_hyps = []
            for hyp in hyps:
                if hyp["yseq"][-1] == self.eos:
                    # only store the sequence that has more than minlen outputs
                    # also add penalty
                    if len(hyp["yseq"]) > minlen:
                        hyp["score"] += (i + 1) * penalty
                        if rnnlm:  # Word LM needs to add final <eos> score
                            hyp["score"] += recog_args.lm_weight * rnnlm.final(
                                hyp["rnnlm_prev"]
                            )
                        ended_hyps.append(hyp)
                else:
                    remained_hyps.append(hyp)

            # end detection
            if end_detect(ended_hyps, i) and recog_args.maxlenratio == 0.0:
                logging.info("end detected at %d", i)
                break

            hyps = remained_hyps
            if len(hyps) > 0:
                logging.debug("remaining hypotheses: " + str(len(hyps)))
            else:
                logging.info("no hypothesis. Finish decoding.")
                break

            for hyp in hyps:
                logging.debug(
                    "hypo: " + "".join([char_list[int(x)] for x in hyp["yseq"][1:]])
                )

            logging.debug("number of ended hypotheses: " + str(len(ended_hyps)))

        nbest_hyps = sorted(ended_hyps, key=lambda x: x["score"], reverse=True)[
            : min(len(ended_hyps), recog_args.nbest)
        ]

        # check number of hypotheses
        if len(nbest_hyps) == 0:
            logging.warning(
                "there is no N-best results, "
                "perform recognition again with smaller minlenratio."
            )
            # should copy because Namespace will be overwritten globally
            recog_args = Namespace(**vars(recog_args))
            recog_args.minlenratio = max(0.0, recog_args.minlenratio - 0.1)
            # NOTE(review): if minlenratio is already 0.0 the retry runs with
            # identical arguments.
            # h and lpz were wrapped into lists above, so unwrap them again in
            # the single-encoder case; keep decoding the same stream.
            if self.num_encs == 1:
                return self.recognize_beam(
                    h[0], lpz[0], recog_args, char_list, rnnlm, strm_idx=strm_idx
                )
            else:
                return self.recognize_beam(
                    h, lpz, recog_args, char_list, rnnlm, strm_idx=strm_idx
                )

        logging.info("total log probability: " + str(nbest_hyps[0]["score"]))
        logging.info(
            "normalized log probability: "
            + str(nbest_hyps[0]["score"] / len(nbest_hyps[0]["yseq"]))
        )

        # NOTE: the returned yseq still starts with the <sos> id
        return nbest_hyps

    def recognize_beam_batch(
        self,
        h,
        hlens,
        lpz,
        recog_args,
        char_list,
        rnnlm=None,
        normalize_score=True,
        strm_idx=0,
        lang_ids=None,
    ):
        # to support mutiple encoder asr mode, in single encoder mode,
        # convert torch.Tensor to List of torch.Tensor
        if self.num_encs == 1:
            h = [h]
            hlens = [hlens]
            lpz = [lpz]
        if self.num_encs > 1 and lpz is None:
            lpz = [lpz] * self.num_encs

        att_idx = min(strm_idx, len(self.att) - 1)
        for idx in range(self.num_encs):
            logging.info(
                "Number of Encoder:{}; enc{}: input lengths: {}.".format(
                    self.num_encs, idx + 1, h[idx].size(1)
                )
            )
            h[idx] = mask_by_length(h[idx], hlens[idx], 0.0)

        # search params
        batch = len(hlens[0])
        beam = recog_args.beam_size
        penalty = recog_args.penalty
        ctc_weight = getattr(recog_args, "ctc_weight", 0)  # for NMT
        att_weight = 1.0 - ctc_weight
        ctc_margin = getattr(
            recog_args, "ctc_window_margin", 0
        )  # use getattr to keep compatibility
        # weights-ctc,
        # e.g. ctc_loss = w_1*ctc_1_loss + w_2 * ctc_2_loss + w_N * ctc_N_loss
        if lpz[0] is not None and self.num_encs > 1:
            weights_ctc_dec = recog_args.weights_ctc_dec / np.sum(
                recog_args.weights_ctc_dec
            )  # normalize
            logging.info(
                "ctc weights (decoding): " + " ".join([str(x) for x in weights_ctc_dec])
            )
        else:
            weights_ctc_dec = [1.0]

        n_bb = batch * beam
        pad_b = to_device(h[0], torch.arange(batch) * beam).view(-1, 1)

        max_hlen = np.amin([max(hlens[idx]) for idx in range(self.num_encs)])
        if recog_args.maxlenratio == 0:
            maxlen = max_hlen
        else:
            maxlen = max(1, int(recog_args.maxlenratio * max_hlen))
        minlen = int(recog_args.minlenratio * max_hlen)
        logging.info("max output length: " + str(maxlen))
        logging.info("min output length: " + str(minlen))

        # initialization
        c_prev = [
            to_device(h[0], torch.zeros(n_bb, self.dunits)) for _ in range(self.dlayers)
        ]
        z_prev = [
            to_device(h[0], torch.zeros(n_bb, self.dunits)) for _ in range(self.dlayers)
        ]
        c_list = [
            to_device(h[0], torch.zeros(n_bb, self.dunits)) for _ in range(self.dlayers)
        ]
        z_list = [
            to_device(h[0], torch.zeros(n_bb, self.dunits)) for _ in range(self.dlayers)
        ]
        vscores = to_device(h[0], torch.zeros(batch, beam))

        rnnlm_state = None
        if self.num_encs == 1:
            a_prev = [None]
            att_w_list, ctc_scorer, ctc_state = [None], [None], [None]
            self.att[att_idx].reset()  # reset pre-computation of h
        else:
            a_prev = [None] * (self.num_encs + 1)  # atts + han
            att_w_list = [None] * (self.num_encs + 1)  # atts + han
            att_c_list = [None] * (self.num_encs)  # atts
            ctc_scorer, ctc_state = [None] * (self.num_encs), [None] * (self.num_encs)
            for idx in range(self.num_encs + 1):
                self.att[idx].reset()  # reset pre-computation of h in atts and han

        if self.replace_sos and recog_args.tgt_lang:
            logging.info("<sos> index: " + str(char_list.index(recog_args.tgt_lang)))
            logging.info("<sos> mark: " + recog_args.tgt_lang)
            yseq = [
                [char_list.index(recog_args.tgt_lang)] for _ in six.moves.range(n_bb)
            ]
        elif lang_ids is not None:
            # NOTE: used for evaluation during training
            yseq = [
                [lang_ids[b // recog_args.beam_size]] for b in six.moves.range(n_bb)
            ]
        else:
            logging.info("<sos> index: " + str(self.sos))
            logging.info("<sos> mark: " + char_list[self.sos])
            yseq = [[self.sos] for _ in six.moves.range(n_bb)]

        accum_odim_ids = [self.sos for _ in six.moves.range(n_bb)]
        stop_search = [False for _ in six.moves.range(batch)]
        nbest_hyps = [[] for _ in six.moves.range(batch)]
        ended_hyps = [[] for _ in range(batch)]

        exp_hlens = [
            hlens[idx].repeat(beam).view(beam, batch).transpose(0, 1).contiguous()
            for idx in range(self.num_encs)
        ]
        exp_hlens = [exp_hlens[idx].view(-1).tolist() for idx in range(self.num_encs)]
        exp_h = [
            h[idx].unsqueeze(1).repeat(1, beam, 1, 1).contiguous()
            for idx in range(self.num_encs)
        ]
        exp_h = [
            exp_h[idx].view(n_bb, h[idx].size()[1], h[idx].size()[2])
            for idx in range(self.num_encs)
        ]

        if lpz[0] is not None:
            scoring_num = min(
                int(beam * CTC_SCORING_RATIO)
                if att_weight > 0.0 and not lpz[0].is_cuda
                else 0,
                lpz[0].size(-1),
            )
            ctc_scorer = [
                CTCPrefixScoreTH(
                    lpz[idx],
                    hlens[idx],
                    0,
                    self.eos,
                    margin=ctc_margin,
                )
                for idx in range(self.num_encs)
            ]

        for i in six.moves.range(maxlen):
            logging.debug("position " + str(i))

            vy = to_device(h[0], torch.LongTensor(self._get_last_yseq(yseq)))
            ey = self.dropout_emb(self.embed(vy))
            if self.num_encs == 1:
                att_c, att_w = self.att[att_idx](
                    exp_h[0], exp_hlens[0], self.dropout_dec[0](z_prev[0]), a_prev[0]
                )
                att_w_list = [att_w]
            else:
                for idx in range(self.num_encs):
                    att_c_list[idx], att_w_list[idx] = self.att[idx](
                        exp_h[idx],
                        exp_hlens[idx],
                        self.dropout_dec[0](z_prev[0]),
                        a_prev[idx],
                    )
                exp_h_han = torch.stack(att_c_list, dim=1)
                att_c, att_w_list[self.num_encs] = self.att[self.num_encs](
                    exp_h_han,
                    [self.num_encs] * n_bb,
                    self.dropout_dec[0](z_prev[0]),
                    a_prev[self.num_encs],
                )
            ey = torch.cat((ey, att_c), dim=1)

            # attention decoder
            z_list, c_list = self.rnn_forward(ey, z_list, c_list, z_prev, c_prev)
            if self.context_residual:
                logits = self.output(
                    torch.cat((self.dropout_dec[-1](z_list[-1]), att_c), dim=-1)
                )
            else:
                logits = self.output(self.dropout_dec[-1](z_list[-1]))
            local_scores = att_weight * F.log_softmax(logits, dim=1)

            # rnnlm
            if rnnlm:
                rnnlm_state, local_lm_scores = rnnlm.buff_predict(rnnlm_state, vy, n_bb)
                local_scores = local_scores + recog_args.lm_weight * local_lm_scores

            # ctc
            if ctc_scorer[0]:
                local_scores[:, 0] = self.logzero  # avoid choosing blank
                part_ids = (
                    torch.topk(local_scores, scoring_num, dim=-1)[1]
                    if scoring_num > 0
                    else None
                )
                for idx in range(self.num_encs):
                    att_w = att_w_list[idx]
                    att_w_ = att_w if isinstance(att_w, torch.Tensor) else att_w[0]
                    local_ctc_scores, ctc_state[idx] = ctc_scorer[idx](
                        yseq, ctc_state[idx], part_ids, att_w_
                    )
                    local_scores = (
                        local_scores
                        + ctc_weight * weights_ctc_dec[idx] * local_ctc_scores
                    )

            local_scores = local_scores.view(batch, beam, self.odim)
            if i == 0:
                local_scores[:, 1:, :] = self.logzero

            # accumulate scores
            eos_vscores = local_scores[:, :, self.eos] + vscores
            vscores = vscores.view(batch, beam, 1).repeat(1, 1, self.odim)
            vscores[:, :, self.eos] = self.logzero
            vscores = (vscores + local_scores).view(batch, -1)

            # global pruning
            accum_best_scores, accum_best_ids = torch.topk(vscores, beam, 1)
            accum_odim_ids = (
                torch.fmod(accum_best_ids, self.odim).view(-1).data.cpu().tolist()
            )
            accum_padded_beam_ids = (
                (accum_best_ids // self.odim + pad_b).view(-1).data.cpu().tolist()
            )

            y_prev = yseq[:][:]
            yseq = self._index_select_list(yseq, accum_padded_beam_ids)
            yseq = self._append_ids(yseq, accum_odim_ids)
            vscores = accum_best_scores
            vidx = to_device(h[0], torch.LongTensor(accum_padded_beam_ids))

            a_prev = []
            num_atts = self.num_encs if self.num_encs == 1 else self.num_encs + 1
            for idx in range(num_atts):
                if isinstance(att_w_list[idx], torch.Tensor):
                    _a_prev = torch.index_select(
                        att_w_list[idx].view(n_bb, *att_w_list[idx].shape[1:]), 0, vidx
                    )
                elif isinstance(att_w_list[idx], list):
                    # handle the case of multi-head attention
                    _a_prev = [
                        torch.index_select(att_w_one.view(n_bb, -1), 0, vidx)
                        for att_w_one in att_w_list[idx]
                    ]
                else:
                    # handle the case of location_recurrent when return is a tuple
                    _a_prev_ = torch.index_select(
                        att_w_list[idx][0].view(n_bb, -1), 0, vidx
                    )
                    _h_prev_ = torch.index_select(
                        att_w_list[idx][1][0].view(n_bb, -1), 0, vidx
                    )
                    _c_prev_ = torch.index_select(
                        att_w_list[idx][1][1].view(n_bb, -1), 0, vidx
                    )
                    _a_prev = (_a_prev_, (_h_prev_, _c_prev_))
                a_prev.append(_a_prev)
            z_prev = [
                torch.index_select(z_list[li].view(n_bb, -1), 0, vidx)
                for li in range(self.dlayers)
            ]
            c_prev = [
                torch.index_select(c_list[li].view(n_bb, -1), 0, vidx)
                for li in range(self.dlayers)
            ]

            # pick ended hyps
            if i >= minlen:
                k = 0
                penalty_i = (i + 1) * penalty
                thr = accum_best_scores[:, -1]
                for samp_i in six.moves.range(batch):
                    if stop_search[samp_i]:
                        k = k + beam
                        continue
                    for beam_j in six.moves.range(beam):
                        _vscore = None
                        if eos_vscores[samp_i, beam_j] > thr[samp_i]:
                            yk = y_prev[k][:]
                            if len(yk) <= min(
                                hlens[idx][samp_i] for idx in range(self.num_encs)
                            ):
                                _vscore = eos_vscores[samp_i][beam_j] + penalty_i
                        elif i == maxlen - 1:
                            yk = yseq[k][:]
                            _vscore = vscores[samp_i][beam_j] + penalty_i
                        if _vscore:
                            yk.append(self.eos)
                            if rnnlm:
                                _vscore += recog_args.lm_weight * rnnlm.final(
                                    rnnlm_state, index=k
                                )
                            _score = _vscore.data.cpu().numpy()
                            ended_hyps[samp_i].append(
                                {"yseq": yk, "vscore": _vscore, "score": _score}
                            )
                        k = k + 1

            # end detection
            stop_search = [
                stop_search[samp_i] or end_detect(ended_hyps[samp_i], i)
                for samp_i in six.moves.range(batch)
            ]
            stop_search_summary = list(set(stop_search))
            if len(stop_search_summary) == 1 and stop_search_summary[0]:
                break

            if rnnlm:
                rnnlm_state = self._index_select_lm_state(rnnlm_state, 0, vidx)
            if ctc_scorer[0]:
                for idx in range(self.num_encs):
                    ctc_state[idx] = ctc_scorer[idx].index_select_state(
                        ctc_state[idx], accum_best_ids
                    )

        torch.cuda.empty_cache()

        dummy_hyps = [
            {"yseq": [self.sos, self.eos], "score": np.array([-float("inf")])}
        ]
        ended_hyps = [
            ended_hyps[samp_i] if len(ended_hyps[samp_i]) != 0 else dummy_hyps
            for samp_i in six.moves.range(batch)
        ]
        if normalize_score:
            for samp_i in six.moves.range(batch):
                for x in ended_hyps[samp_i]:
                    x["score"] /= len(x["yseq"])

        nbest_hyps = [
            sorted(ended_hyps[samp_i], key=lambda x: x["score"], reverse=True)[
                : min(len(ended_hyps[samp_i]), recog_args.nbest)
            ]
            for samp_i in six.moves.range(batch)
        ]

        return nbest_hyps

    def calculate_all_attentions(self, hs_pad, hlen, ys_pad, strm_idx=0, lang_ids=None):
        """Calculate all attention weights for a batch of reference sequences

        The decoder is stepped over the (padded) reference sequences ``ys_pad``,
        always feeding the reference token as input, and the attention weights
        returned by the attention module(s) at every output step are collected.

        :param torch.Tensor hs_pad: batch of padded hidden state sequences
                                    (B, Tmax, D)
                                    in multi-encoder case, list of torch.Tensor,
                                    [(B, Tmax_1, D), (B, Tmax_2, D), ..., ] ]
        :param torch.Tensor hlen: batch of lengths of hidden state sequences (B)
                                    [in multi-encoder case, list of torch.Tensor,
                                    [(B), (B), ..., ]
        :param torch.Tensor ys_pad:
            batch of padded character id sequence tensor (B, Lmax)
        :param int strm_idx:
            stream index for parallel speaker attention in multi-speaker case
        :param torch.Tensor lang_ids: batch of target language id tensor (B, 1);
            only read when ``self.replace_sos`` is set, where it replaces <sos>
        :return: attention weights with the following shape,
            1) multi-head case => attention weights (B, H, Lmax, Tmax),
            2) multi-encoder case =>
                [(B, Lmax, Tmax1), (B, Lmax, Tmax2), ..., (B, Lmax, NumEncs)]
            3) other case => attention weights (B, Lmax, Tmax).
        :rtype: float ndarray
        """
        # to support multiple encoder asr mode, in single encoder mode,
        # convert torch.Tensor to List of torch.Tensor
        if self.num_encs == 1:
            hs_pad = [hs_pad]
            hlen = [hlen]

        # TODO(kan-bayashi): need to make more smart way
        ys = [y[y != self.ignore_id] for y in ys_pad]  # parse padded ys
        # clamp the stream index to the last available attention module
        att_idx = min(strm_idx, len(self.att) - 1)

        # hlen should be list of list of integer
        hlen = [list(map(int, hlen[idx])) for idx in range(self.num_encs)]

        self.loss = None
        # prepare input and output word sequences with sos/eos IDs
        eos = ys[0].new([self.eos])
        sos = ys[0].new([self.sos])
        if self.replace_sos:
            # the language id takes the place of <sos> at the start of each input
            ys_in = [torch.cat([idx, y], dim=0) for idx, y in zip(lang_ids, ys)]
        else:
            ys_in = [torch.cat([sos, y], dim=0) for y in ys]
        ys_out = [torch.cat([y, eos], dim=0) for y in ys]

        # padding: decoder inputs are padded with eos, targets with ignore_id
        # pys: utt x olen
        ys_in_pad = pad_list(ys_in, self.eos)
        ys_out_pad = pad_list(ys_out, self.ignore_id)

        # get length info
        olength = ys_out_pad.size(1)

        # initialization: one cell/hidden state per decoder layer
        c_list = [self.zero_state(hs_pad[0])]
        z_list = [self.zero_state(hs_pad[0])]
        for _ in six.moves.range(1, self.dlayers):
            c_list.append(self.zero_state(hs_pad[0]))
            z_list.append(self.zero_state(hs_pad[0]))
        att_ws = []
        if self.num_encs == 1:
            att_w = None
            self.att[att_idx].reset()  # reset pre-computation of h
        else:
            att_w_list = [None] * (self.num_encs + 1)  # atts + han
            att_c_list = [None] * (self.num_encs)  # atts
            for idx in range(self.num_encs + 1):
                self.att[idx].reset()  # reset pre-computation of h in atts and han

        # pre-computation of embedding
        eys = self.dropout_emb(self.embed(ys_in_pad))  # utt x olen x zdim

        # loop for an output sequence
        for i in six.moves.range(olength):
            if self.num_encs == 1:
                # the previous attention weights are fed back into the attention
                att_c, att_w = self.att[att_idx](
                    hs_pad[0], hlen[0], self.dropout_dec[0](z_list[0]), att_w
                )
                att_ws.append(att_w)
            else:
                # per-encoder attentions first ...
                for idx in range(self.num_encs):
                    att_c_list[idx], att_w_list[idx] = self.att[idx](
                        hs_pad[idx],
                        hlen[idx],
                        self.dropout_dec[0](z_list[0]),
                        att_w_list[idx],
                    )
                # ... then the last attention module (han) attends over the
                # stacked per-encoder context vectors
                hs_pad_han = torch.stack(att_c_list, dim=1)
                hlen_han = [self.num_encs] * len(ys_in)
                att_c, att_w_list[self.num_encs] = self.att[self.num_encs](
                    hs_pad_han,
                    hlen_han,
                    self.dropout_dec[0](z_list[0]),
                    att_w_list[self.num_encs],
                )
                # copy: att_w_list is overwritten in place at the next step
                att_ws.append(att_w_list.copy())
            ey = torch.cat((eys[:, i, :], att_c), dim=1)  # utt x (zdim + hdim)
            # the states of the previous step are the current z_list / c_list
            z_list, c_list = self.rnn_forward(ey, z_list, c_list, z_list, c_list)

        if self.num_encs == 1:
            # convert to numpy array with the shape (B, Lmax, Tmax)
            att_ws = att_to_numpy(att_ws, self.att[att_idx])
        else:
            # regroup the per-step lists into one sequence per attention module
            _att_ws = []
            for idx, ws in enumerate(zip(*att_ws)):
                ws = att_to_numpy(ws, self.att[idx])
                _att_ws.append(ws)
            att_ws = _att_ws
        return att_ws

    @staticmethod
    def _get_last_yseq(exp_yseq):
        last = []
        for y_seq in exp_yseq:
            last.append(y_seq[-1])
        return last

    @staticmethod
    def _append_ids(yseq, ids):
        if isinstance(ids, list):
            for i, j in enumerate(ids):
                yseq[i].append(j)
        else:
            for i in range(len(yseq)):
                yseq[i].append(ids)
        return yseq

    @staticmethod
    def _index_select_list(yseq, lst):
        new_yseq = []
        for i in lst:
            new_yseq.append(yseq[i][:])
        return new_yseq

    @staticmethod
    def _index_select_lm_state(rnnlm_state, dim, vidx):
        if isinstance(rnnlm_state, dict):
            new_state = {}
            for k, v in rnnlm_state.items():
                new_state[k] = [torch.index_select(vi, dim, vidx) for vi in v]
        elif isinstance(rnnlm_state, list):
            new_state = []
            for i in vidx:
                new_state.append(rnnlm_state[int(i)][:])
        return new_state

    # scorer interface methods
    def init_state(self, x):
        # to support mutiple encoder asr mode, in single encoder mode,
        # convert torch.Tensor to List of torch.Tensor
        if self.num_encs == 1:
            x = [x]

        c_list = [self.zero_state(x[0].unsqueeze(0))]
        z_list = [self.zero_state(x[0].unsqueeze(0))]
        for _ in six.moves.range(1, self.dlayers):
            c_list.append(self.zero_state(x[0].unsqueeze(0)))
            z_list.append(self.zero_state(x[0].unsqueeze(0)))
        # TODO(karita): support strm_index for `asr_mix`
        strm_index = 0
        att_idx = min(strm_index, len(self.att) - 1)
        if self.num_encs == 1:
            a = None
            self.att[att_idx].reset()  # reset pre-computation of h
        else:
            a = [None] * (self.num_encs + 1)  # atts + han
            for idx in range(self.num_encs + 1):
                self.att[idx].reset()  # reset pre-computation of h in atts and han
        return dict(
            c_prev=c_list[:],
            z_prev=z_list[:],
            a_prev=a,
            workspace=(att_idx, z_list, c_list),
        )

    def score(self, yseq, state, x):
        """Run one decoding step for a single hypothesis (scorer interface).

        :param yseq: token ids decoded so far; only the last entry is fed to
            the embedding layer
        :param dict state: state as built by ``init_state`` or returned by a
            previous call (keys ``c_prev``, ``z_prev``, ``a_prev``,
            ``workspace``)
        :param x: encoder output of one utterance; in multi-encoder mode a list
            with one entry per encoder
        :return: tuple of (log-softmax of the output layer with the leading
            batch axis squeezed out, updated state dict with the same keys as
            ``state``)
        :rtype: tuple
        """
        # to support multiple encoder asr mode, in single encoder mode,
        # convert torch.Tensor to List of torch.Tensor
        if self.num_encs == 1:
            x = [x]

        att_idx, z_list, c_list = state["workspace"]
        # add a batch axis of size one to the last token
        vy = yseq[-1].unsqueeze(0)
        ey = self.dropout_emb(self.embed(vy))  # utt list (1) x zdim
        if self.num_encs == 1:
            att_c, att_w = self.att[att_idx](
                x[0].unsqueeze(0),
                [x[0].size(0)],
                self.dropout_dec[0](state["z_prev"][0]),
                state["a_prev"],
            )
        else:
            att_w = [None] * (self.num_encs + 1)  # atts + han
            att_c_list = [None] * (self.num_encs)  # atts
            # per-encoder attentions first ...
            for idx in range(self.num_encs):
                att_c_list[idx], att_w[idx] = self.att[idx](
                    x[idx].unsqueeze(0),
                    [x[idx].size(0)],
                    self.dropout_dec[0](state["z_prev"][0]),
                    state["a_prev"][idx],
                )
            # ... then the last attention module (han) attends over the
            # stacked per-encoder context vectors
            h_han = torch.stack(att_c_list, dim=1)
            att_c, att_w[self.num_encs] = self.att[self.num_encs](
                h_han,
                [self.num_encs],
                self.dropout_dec[0](state["z_prev"][0]),
                state["a_prev"][self.num_encs],
            )
        ey = torch.cat((ey, att_c), dim=1)  # utt(1) x (zdim + hdim)
        z_list, c_list = self.rnn_forward(
            ey, z_list, c_list, state["z_prev"], state["c_prev"]
        )
        if self.context_residual:
            # the attention context is concatenated to the last layer's output
            # before the output layer
            logits = self.output(
                torch.cat((self.dropout_dec[-1](z_list[-1]), att_c), dim=-1)
            )
        else:
            logits = self.output(self.dropout_dec[-1](z_list[-1]))
        logp = F.log_softmax(logits, dim=1).squeeze(0)
        return (
            logp,
            dict(
                c_prev=c_list[:],
                z_prev=z_list[:],
                a_prev=att_w,
                workspace=(att_idx, z_list, c_list),
            ),
        )


def decoder_for(args, odim, sos, eos, att, labeldist):
    """Instantiate a :class:`Decoder` from the program arguments.

    :param Namespace args: the arguments; ``context_residual``, ``replace_sos``
        and ``num_encs`` are optional and default to ``False``, ``False`` and
        ``1`` respectively
    :param odim: forwarded to ``Decoder`` as its second positional argument
    :param sos: forwarded to ``Decoder``
    :param eos: forwarded to ``Decoder``
    :param att: forwarded to ``Decoder``
    :param labeldist: forwarded to ``Decoder``
    :return: the decoder module
    """
    mandatory = (
        args.eprojs,
        odim,
        args.dtype,
        args.dlayers,
        args.dunits,
        sos,
        eos,
        att,
        args.verbose,
        args.char_list,
        labeldist,
        args.lsm_weight,
        args.sampling_probability,
        args.dropout_rate_decoder,
    )
    # use getattr to keep compatibility with argument sets lacking these fields
    optional = (
        getattr(args, "context_residual", False),
        getattr(args, "replace_sos", False),
        getattr(args, "num_encs", 1),
    )
    return Decoder(*(mandatory + optional))


================================================
FILE: nets/pytorch_backend/rnn/encoders.py
================================================
import logging
import six

import numpy as np
import torch
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence

from espnet.nets.e2e_asr_common import get_vgg2l_odim
from espnet.nets.pytorch_backend.nets_utils import make_pad_mask
from espnet.nets.pytorch_backend.nets_utils import to_device


class RNNP(torch.nn.Module):
    """RNN with projection layer module

    Every layer is a single-layer (B)LSTM/(B)GRU followed by a linear
    projection to ``hdim``. The RNN output of a layer may be subsampled in
    time before it is projected.

    :param int idim: dimension of inputs
    :param int elayers: number of encoder layers
    :param int cdim: number of rnn units (resulted in cdim * 2 if bidirectional)
    :param int hdim: number of projection units
    :param np.ndarray subsample: list of subsampling numbers; entry
        ``layer + 1`` is the factor applied to the output of layer ``layer``
    :param float dropout: dropout rate
    :param str typ: The RNN type
    """

    def __init__(self, idim, elayers, cdim, hdim, subsample, dropout, typ="blstm"):
        super(RNNP, self).__init__()
        bidir = typ[0] == "b"
        rnn_cls = torch.nn.LSTM if "lstm" in typ else torch.nn.GRU
        for i in range(elayers):
            # the first layer reads the input, later ones the previous projection
            inputdim = idim if i == 0 else hdim
            rnn = rnn_cls(
                inputdim, cdim, num_layers=1, bidirectional=bidir, batch_first=True
            )
            setattr(self, "%s%d" % ("birnn" if bidir else "rnn", i), rnn)

            # bottleneck layer to merge
            setattr(
                self, "bt%d" % i, torch.nn.Linear(2 * cdim if bidir else cdim, hdim)
            )

        self.elayers = elayers
        self.cdim = cdim
        self.subsample = subsample
        self.typ = typ
        self.bidir = bidir
        self.dropout = dropout

    def forward(self, xs_pad, ilens, prev_state=None):
        """RNNP forward

        :param torch.Tensor xs_pad: batch of padded input sequences (B, Tmax, idim)
        :param torch.Tensor ilens: batch of lengths of input sequences (B)
        :param torch.Tensor prev_state: batch of previous RNN states
            (indexed per layer)
        :return: batch of hidden state sequences (B, Tmax, hdim),
            the output lengths and the list of per-layer RNN states
        :rtype: tuple
        """
        logging.debug(self.__class__.__name__ + " input lengths: " + str(ilens))
        elayer_states = []
        for layer in range(self.elayers):
            if not isinstance(ilens, torch.Tensor):
                ilens = torch.tensor(ilens)
            xs_pack = pack_padded_sequence(xs_pad, ilens.cpu(), batch_first=True)
            rnn = getattr(self, ("birnn" if self.bidir else "rnn") + str(layer))
            rnn.flatten_parameters()
            if prev_state is not None and rnn.bidirectional:
                prev_state = reset_backward_rnn_state(prev_state)
            ys, states = rnn(
                xs_pack, hx=None if prev_state is None else prev_state[layer]
            )
            elayer_states.append(states)
            # ys: utt list of frame x cdim x 2 (2: means bidirectional)
            ys_pad, ilens = pad_packed_sequence(ys, batch_first=True)
            sub = self.subsample[layer + 1]
            if sub > 1:
                # keep every sub-th frame and shrink the lengths accordingly
                ys_pad = ys_pad[:, ::sub]
                ilens = torch.tensor([int(i + 1) // sub for i in ilens])
            # (sum _utt frame_utt) x dim
            projection_layer = getattr(self, "bt%d" % layer)
            projected = projection_layer(ys_pad.contiguous().view(-1, ys_pad.size(2)))
            xs_pad = projected.view(ys_pad.size(0), ys_pad.size(1), -1)
            if layer < self.elayers - 1:
                # F.dropout defaults to training=True; pass the module flag so
                # that dropout is switched off in eval() mode
                xs_pad = torch.tanh(
                    F.dropout(xs_pad, p=self.dropout, training=self.training)
                )

        return xs_pad, ilens, elayer_states  # x: utt list of frame x dim


class RNN(torch.nn.Module):
    """Multi-layer RNN encoder with a single final projection

    :param int idim: dimension of inputs
    :param int elayers: number of encoder layers
    :param int cdim: number of rnn units (resulted in cdim * 2 if bidirectional)
    :param int hdim: number of final projection units
    :param float dropout: dropout rate
    :param str typ: The RNN type
    """

    def __init__(self, idim, elayers, cdim, hdim, dropout, typ="blstm"):
        super(RNN, self).__init__()
        bidir = typ[0] == "b"
        rnn_cls = torch.nn.LSTM if "lstm" in typ else torch.nn.GRU
        self.nbrnn = rnn_cls(
            idim,
            cdim,
            elayers,
            batch_first=True,
            dropout=dropout,
            bidirectional=bidir,
        )
        # a bidirectional RNN emits forward and backward units side by side
        rnn_odim = cdim * 2 if bidir else cdim
        self.l_last = torch.nn.Linear(rnn_odim, hdim)
        self.typ = typ

    def forward(self, xs_pad, ilens, prev_state=None):
        """Encode a padded batch and project it

        :param torch.Tensor xs_pad: batch of padded input sequences (B, Tmax, D)
        :param torch.Tensor ilens: batch of lengths of input sequences (B)
        :param torch.Tensor prev_state: batch of previous RNN states
        :return: batch of hidden state sequences (B, Tmax, eprojs),
            the output lengths and the RNN states
        :rtype: tuple
        """
        logging.debug(self.__class__.__name__ + " input lengths: " + str(ilens))
        lengths = ilens if isinstance(ilens, torch.Tensor) else torch.tensor(ilens)
        packed_in = pack_padded_sequence(xs_pad, lengths.cpu(), batch_first=True)
        self.nbrnn.flatten_parameters()
        if self.nbrnn.bidirectional and prev_state is not None:
            # A previous state is only passed when the input is streamed;
            # the backward state of a BRNN cannot be carried over in that case
            # (it would be propagated in the wrong direction), so it is cleared.
            prev_state = reset_backward_rnn_state(prev_state)
        packed_out, states = self.nbrnn(packed_in, hx=prev_state)
        # padded: utt x frame x (cdim x 2 when bidirectional)
        padded, out_lens = pad_packed_sequence(packed_out, batch_first=True)
        n_utt, n_frame, n_feat = padded.size(0), padded.size(1), padded.size(2)
        # project all frames at once: (sum _utt frame_utt) x dim
        flat = padded.contiguous().view(-1, n_feat)
        projected = torch.tanh(self.l_last(flat))
        return projected.view(n_utt, n_frame, -1), out_lens, states


def reset_backward_rnn_state(states):
    """Sets backward BRNN states to zeroes

    Useful in processing of sliding windows over the inputs.

    Every odd index along the first axis of a state is zeroed in place.
    ``states`` may be a single state, or an arbitrarily nested list/tuple of
    states, e.g. the per-layer list of ``(h, c)`` tuples produced by a
    projected LSTM encoder; containers are traversed recursively.

    :param states: a state supporting slice assignment, or a (nested)
        list/tuple of such states
    :return: the same ``states`` object, modified in place
    """
    if isinstance(states, (list, tuple)):
        # recurse so that nested containers (list of (h, c) tuples) are handled;
        # slice-assigning on the tuple itself would raise a TypeError
        for state in states:
            reset_backward_rnn_state(state)
    else:
        states[1::2] = 0.0
    return states


class VGG2L(torch.nn.Module):
    """VGG-like convolutional front-end (two conv-conv-pool stages)

    :param int in_channel: number of input channels
    """

    def __init__(self, in_channel=1):
        super(VGG2L, self).__init__()
        # CNN layer (VGG motivated)
        self.conv1_1 = torch.nn.Conv2d(in_channel, 64, 3, stride=1, padding=1)
        self.conv1_2 = torch.nn.Conv2d(64, 64, 3, stride=1, padding=1)
        self.conv2_1 = torch.nn.Conv2d(64, 128, 3, stride=1, padding=1)
        self.conv2_2 = torch.nn.Conv2d(128, 128, 3, stride=1, padding=1)

        self.in_channel = in_channel

    def forward(self, xs_pad, ilens, **kwargs):
        """VGG2L forward

        :param torch.Tensor xs_pad: batch of padded input sequences (B, Tmax, D)
        :param torch.Tensor ilens: batch of lengths of input sequences (B)
        :return: batch of padded hidden state sequences (B, Tmax // 4, 128 * D // 4),
            the subsampled lengths as a list, and ``None`` (no state in this layer)
        :rtype: tuple
        """
        logging.debug(self.__class__.__name__ + " input lengths: " + str(ilens))

        # utt x frame x dim -> utt x input channel num x frame x dim
        n_utt, n_frame, n_dim = xs_pad.size(0), xs_pad.size(1), xs_pad.size(2)
        feats = xs_pad.view(
            n_utt, n_frame, self.in_channel, n_dim // self.in_channel
        ).transpose(1, 2)

        # NOTE: max_pool1d ?
        stages = (
            (self.conv1_1, self.conv1_2),
            (self.conv2_1, self.conv2_2),
        )
        for conv_a, conv_b in stages:
            feats = F.relu(conv_a(feats))
            feats = F.relu(conv_b(feats))
            # each pooling halves (rounding up) both the frame and the dim axis
            feats = F.max_pool2d(feats, 2, stride=2, ceil_mode=True)

        # the lengths are halved (rounding up) once per pooling stage
        if torch.is_tensor(ilens):
            lengths = ilens.cpu().numpy()
        else:
            lengths = np.array(ilens, dtype=np.float32)
        lengths = np.ceil(lengths / 2).astype(np.int64)
        lengths = np.ceil(lengths.astype(np.float32) / 2).astype(np.int64)
        lengths = lengths.tolist()

        # utt x frame x (input channel num x dim)
        feats = feats.transpose(1, 2)
        feats = feats.contiguous().view(
            feats.size(0), feats.size(1), feats.size(2) * feats.size(3)
        )
        return feats, lengths, None  # no state in this layer


class Encoder(torch.nn.Module):
    """Encoder module

    ``etype`` is ``[vgg]<rnn type>[p]`` where the rnn type is one of
    ``lstm``, ``gru``, ``blstm``, ``bgru``; a ``vgg`` prefix puts a
    :class:`VGG2L` front-end before the RNN and a trailing ``p`` selects the
    RNN with every-layer projection (:class:`RNNP`) instead of :class:`RNN`.

    :param str etype: type of encoder network
    :param int idim: number of dimensions of encoder network
    :param int elayers: number of layers of encoder network
    :param int eunits: number of lstm units of encoder network
    :param int eprojs: number of projection units of encoder network
    :param np.ndarray subsample: list of subsampling numbers
    :param float dropout: dropout rate
    :param int in_channel: number of input channels
    """

    def __init__(
        self, etype, idim, elayers, eunits, eprojs, subsample, dropout, in_channel=1
    ):
        super(Encoder, self).__init__()
        use_vgg = etype.startswith("vgg")
        # NOTE: str.lstrip("vgg") strips a *set of characters*, which turned
        # "gru" / "vgggru" into "ru"; remove the literal prefix/suffix instead.
        typ = etype[len("vgg"):] if use_vgg else etype
        if typ.endswith("p"):
            typ = typ[:-1]
        if typ not in ["lstm", "gru", "blstm", "bgru"]:
            logging.error("Error: need to specify an appropriate encoder architecture")
        use_proj = etype[-1] == "p"

        modules = []
        rnn_idim = idim
        if use_vgg:
            modules.append(VGG2L(in_channel))
            # the RNN reads the flattened VGG output
            rnn_idim = get_vgg2l_odim(idim, in_channel=in_channel)
        if use_proj:
            modules.append(
                RNNP(rnn_idim, elayers, eunits, eprojs, subsample, dropout, typ=typ)
            )
        else:
            modules.append(RNN(rnn_idim, elayers, eunits, eprojs, dropout, typ=typ))
        self.enc = torch.nn.ModuleList(modules)

        if use_vgg:
            logging.info(
                "Use CNN-VGG + " + typ.upper() + ("P" if use_proj else "") + " for encoder"
            )
            self.conv_subsampling_factor = 4
        else:
            if use_proj:
                logging.info(typ.upper() + " with every-layer projection for encoder")
            else:
                logging.info(typ.upper() + " without projection for encoder")
            self.conv_subsampling_factor = 1

    def forward(self, xs_pad, ilens, prev_states=None):
        """Encoder forward

        :param torch.Tensor xs_pad: batch of padded input sequences (B, Tmax, D)
        :param torch.Tensor ilens: batch of lengths of input sequences (B)
        :param prev_states: previous encoder states, one entry per module in
            ``self.enc`` (``None`` to start from scratch)
        :return: batch of hidden state sequences (B, Tmax, eprojs) with the
            padded part set to zero, the output lengths and the list of
            states returned by each module
        :rtype: tuple
        """
        if prev_states is None:
            prev_states = [None] * len(self.enc)
        assert len(prev_states) == len(self.enc)

        current_states = []
        for module, prev_state in zip(self.enc, prev_states):
            xs_pad, ilens, states = module(xs_pad, ilens, prev_state=prev_state)
            current_states.append(states)

        # make mask to remove bias value in padded part
        mask = to_device(xs_pad, make_pad_mask(ilens).unsqueeze(-1))

        return xs_pad.masked_fill(mask, 0.0), ilens, current_states


def encoder_for(args, idim, subsample):
    """Build the encoder (or the list of encoders) described by ``args``.

    :param Namespace args: The arguments
    :param int or List of integer idim: dimension of input, e.g. 83, or
                                        List of dimensions of inputs, e.g. [83,83]
    :param List or List of List subsample: subsample factors, e.g. [1,2,2,1,1], or
                                        List of subsample factors of each encoder.
                                         e.g. [[1,2,2,1,1], [1,2,2,1,1]]
    :rtype torch.nn.Module
    :return: a single ``Encoder`` when ``num_encs`` is 1, otherwise a
             ``torch.nn.ModuleList`` holding one ``Encoder`` per input stream
    :raises ValueError: if ``num_encs`` is smaller than one
    """
    # arguments without a ``num_encs`` attribute are treated as single-encoder
    num_encs = getattr(args, "num_encs", 1)

    if num_encs == 1:
        # single encoder asr mode: every option is used as given
        return Encoder(
            args.etype,
            idim,
            args.elayers,
            args.eunits,
            args.eprojs,
            subsample,
            args.dropout_rate,
        )

    if num_encs >= 1:
        # multi-encoder mode: per-encoder options are indexed by encoder,
        # while the projection size ``eprojs`` is shared
        return torch.nn.ModuleList(
            [
                Encoder(
                    args.etype[enc_idx],
                    idim[enc_idx],
                    args.elayers[enc_idx],
                    args.eunits[enc_idx],
                    args.eprojs,
                    subsample[enc_idx],
                    args.dropout_rate[enc_idx],
                )
                for enc_idx in range(num_encs)
            ]
        )

    raise ValueError(
        "Number of encoders needs to be more than one. {}".format(num_encs)
    )


================================================
FILE: nets/pytorch_backend/streaming/__init__.py
================================================
"""Initialize sub package."""


================================================
FILE: nets/pytorch_backend/streaming/segment.py
================================================
import numpy as np
import torch


class SegmentStreamingE2E(object):
    """Segment-wise streaming recognizer built around an E2E ASR object.

    Each incoming chunk is run through the encoder and the greedy CTC label of
    the chunk's first output frame is inspected: a non-blank label opens a
    segment, and a sufficiently long run of blank labels closes it, at which
    point the buffered encoder states are handed to the decoder's beam search.

    :param E2E e2e: E2E ASR object
    :param recog_args: arguments for "recognize" method of E2E
    :param rnnlm: language model object forwarded to the beam search (optional)
    """

    def __init__(self, e2e, recog_args, rnnlm=None):
        self._e2e = e2e
        self._recog_args = recog_args
        self._char_list = e2e.char_list
        self._rnnlm = rnnlm

        self._e2e.eval()

        # position of the blank symbol in the character list (-1 if not found)
        self._blank_idx_in_char_list = -1
        for idx, char in enumerate(self._char_list):
            if char == self._e2e.blank:
                self._blank_idx_in_char_list = idx
                break

        self._subsampling_factor = np.prod(e2e.subsample)
        self._activates = 0  # 1 while a segment is open, 0 otherwise
        self._blank_dur = 0  # number of consecutive blank chunks in the segment

        self._previous_input = []
        self._previous_encoder_recurrent_state = None
        self._encoder_states = []
        self._ctc_posteriors = []

        assert (
            self._recog_args.batchsize <= 1
        ), "SegmentStreamingE2E works only with batch size <= 1"
        assert (
            "b" not in self._e2e.etype
        ), "SegmentStreamingE2E works only with uni-directional encoders"

    def accept_input(self, x):
        """Call this method each time a new batch of input is available.

        :param x: new chunk of input frames
        :return: the beam-search result when this chunk closes a segment,
            otherwise ``None``
        """

        self._previous_input.extend(x)
        h, ilen = self._e2e.subsample_frames(x)

        # Run encoder and apply greedy search on CTC softmax output
        h, _, self._previous_encoder_recurrent_state = self._e2e.enc(
            h.unsqueeze(0), ilen, self._previous_encoder_recurrent_state
        )
        z = self._e2e.ctc.argmax(h).squeeze(0)

        if self._activates == 0 and z[0] != self._blank_idx_in_char_list:
            self._activates = 1

            # Rerun encoder with zero state at onset of detection
            tail_len = self._subsampling_factor * (
                self._recog_args.streaming_onset_margin + 1
            )
            h, ilen = self._e2e.subsample_frames(
                np.reshape(
                    self._previous_input[-tail_len:], [-1, len(self._previous_input[0])]
                )
            )
            h, _, self._previous_encoder_recurrent_state = self._e2e.enc(
                h.unsqueeze(0), ilen, None
            )

        hyp = None
        if self._activates == 1:
            # buffer the per-frame encoder outputs and CTC log-posteriors
            self._encoder_states.extend(h.squeeze(0))
            self._ctc_posteriors.extend(self._e2e.ctc.log_softmax(h).squeeze(0))

            if z[0] == self._blank_idx_in_char_list:
                self._blank_dur += 1
            else:
                self._blank_dur = 0

            if self._blank_dur >= self._recog_args.streaming_min_blank_dur:
                # drop the trailing blank run, then add back the offset margin
                seg_len = (
                    len(self._encoder_states)
                    - self._blank_dur
                    + self._recog_args.streaming_offset_margin
                )
                if seg_len > 0:
                    # Run decoder with a detected segment
                    h = torch.cat(self._encoder_states[:seg_len], dim=0).view(
                        -1, self._encoder_states[0].size(0)
                    )
                    if self._recog_args.ctc_weight > 0.0:
                        lpz = torch.cat(self._ctc_posteriors[:seg_len], dim=0).view(
                            -1, self._ctc_posteriors[0].size(0)
                        )
                        if self._recog_args.batchsize > 0:
                            lpz = lpz.unsqueeze(0)
                        normalize_score = False
                    else:
                        lpz = None
                        normalize_score = True

                    if self._recog_args.batchsize == 0:
                        hyp = self._e2e.dec.recognize_beam(
                            h, lpz, self._recog_args, self._char_list, self._rnnlm
                        )
                    else:
                        hlens = torch.tensor([h.shape[0]])
                        hyp = self._e2e.dec.recognize_beam_batch(
                            h.unsqueeze(0),
                            hlens,
                            lpz,
                            self._recog_args,
                            self._char_list,
                            self._rnnlm,
                            normalize_score=normalize_score,
                        )[0]

                    # reset the segment state for the next detection
                    self._activates = 0
                    self._blank_dur = 0

                    tail_len = (
                        self._subsampling_factor
                        * self._recog_args.streaming_onset_margin
                    )
                    # Keep only the last ``tail_len`` raw frames. A plain
                    # ``[-tail_len:]`` slice would keep the WHOLE buffer when
                    # tail_len is 0 (``seq[-0:] == seq[0:]``), so the zero
                    # margin case has to be handled explicitly.
                    if tail_len > 0:
                        self._previous_input = self._previous_input[-tail_len:]
                    else:
                        self._previous_input = []
                    self._encoder_states = []
                    self._ctc_posteriors = []

        return hyp


================================================
FILE: nets/pytorch_backend/streaming/window.py
================================================
import torch


# TODO(pzelasko): Currently allows half-streaming only;
#  needs streaming attention decoder implementation
class WindowStreamingE2E(object):
    """Window-wise streaming front end for an E2E ASR object.

    The encoder and the CTC output layer are run chunk by chunk while their
    outputs are accumulated; the attention decoder is run over the
    accumulated outputs afterwards.

    :param E2E e2e: E2E ASR object
    :param recog_args: arguments for "recognize" method of E2E
    :param rnnlm: language model object forwarded to the beam search (optional)
    """

    def __init__(self, e2e, recog_args, rnnlm=None):
        self._e2e = e2e
        self._recog_args = recog_args
        self._char_list = e2e.char_list
        self._rnnlm = rnnlm

        self._e2e.eval()

        # streaming state
        self._offset = 0
        self._previous_encoder_recurrent_state = None
        self._encoder_states = []  # one encoder output tensor per chunk
        self._ctc_posteriors = []  # one CTC log-posterior tensor per chunk
        self._last_recognition = None

        assert (
            self._recog_args.ctc_weight > 0.0
        ), "WindowStreamingE2E works only with combined CTC and attention decoders."

    def accept_input(self, x):
        """Call this method each time a new batch of input is available."""

        frames, frame_lens = self._e2e.subsample_frames(x)

        # Streaming encoder: carry the recurrent state over from the last chunk
        enc_out, _, next_state = self._e2e.enc(
            frames.unsqueeze(0), frame_lens, self._previous_encoder_recurrent_state
        )
        self._previous_encoder_recurrent_state = next_state
        self._encoder_states.append(enc_out.squeeze(0))

        # CTC posteriors for the incoming audio
        log_probs = self._e2e.ctc.log_softmax(enc_out)
        self._ctc_posteriors.append(log_probs.squeeze(0))

    def _input_window_for_decoder(self, use_all=False):
        """Return (encoder states, CTC posteriors) concatenated along dim 0.

        With ``use_all`` every accumulated chunk is used; otherwise only the
        chunks selected by the offset rule below are.
        """
        if use_all:
            all_states = torch.cat(self._encoder_states, dim=0)
            all_posteriors = torch.cat(self._ctc_posteriors, dim=0)
            return all_states, all_posteriors

        # NOTE(review): the traversed offset advances by ``size(1)`` of each
        # skipped chunk and ``self._offset`` is not updated anywhere in this
        # class — confirm that this selection rule is the intended one.
        threshold = self._offset

        def unprocessed(chunks):
            traversed = 0
            picked = []
            for chunk in chunks:
                if traversed > threshold:
                    picked.append(chunk)
                else:
                    traversed += chunk.size(1)
            return torch.cat(picked, dim=0)

        states = unprocessed(self._encoder_states)
        posteriors = unprocessed(self._ctc_posteriors)
        return states, posteriors

    def decode_with_attention_offline(self):
        """Run the attention decoder offline.

        Works even if the previous layers (encoder and CTC decoder) were
        being run in the online mode.
        This method should be run after all the audio has been consumed.
        This is used mostly to compare the results between offline
        and online implementation of the previous layers.
        """
        enc_states, ctc_log_probs = self._input_window_for_decoder(use_all=True)
        beam_search = self._e2e.dec.recognize_beam

        return beam_search(
            enc_states, ctc_log_probs, self._recog_args, self._char_list, self._rnnlm
        )


================================================
FILE: nets/pytorch_backend/tacotron2/__init__.py
================================================
"""Initialize sub package."""


================================================
FILE: nets/pytorch_backend/tacotron2/cbhg.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright 2019 Nagoya University (Tomoki Hayashi)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""CBHG related modules."""

import torch
import torch.nn.functional as F

from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence

from espnet.nets.pytorch_backend.nets_utils import make_non_pad_mask


class CBHGLoss(torch.nn.Module):
    """L1 + MSE loss between CBHG outputs and target spectrograms."""

    def __init__(self, use_masking=True):
        """Initialize CBHG loss module.

        Args:
            use_masking (bool): If True, the padded part of each sequence is
                excluded from the loss calculation.

        """
        super().__init__()
        self.use_masking = use_masking

    def forward(self, cbhg_outs, spcs, olens):
        """Calculate forward propagation.

        Args:
            cbhg_outs (Tensor): Batch of CBHG outputs (B, Lmax, spc_dim).
            spcs (Tensor): Batch of groundtruth of spectrogram (B, Lmax, spc_dim).
            olens (LongTensor): Batch of the lengths of each sequence (B,).

        Returns:
            Tensor: L1 loss value
            Tensor: Mean square error loss value.

        """
        targets, predictions = spcs, cbhg_outs

        if self.use_masking:
            # keep only the elements that belong to non-padded frames
            valid = make_non_pad_mask(olens).unsqueeze(-1).to(targets.device)
            targets = targets.masked_select(valid)
            predictions = predictions.masked_select(valid)

        l1 = F.l1_loss(predictions, targets)
        mse = F.mse_loss(predictions, targets)
        return l1, mse


class CBHG(torch.nn.Module):
    """CBHG module to convert log Mel-filterbanks to linear spectrogram.

    This is a module of CBHG introduced
    in `Tacotron: Towards End-to-End Speech Synthesis`_.
    The CBHG converts the sequence of log Mel-filterbanks into linear spectrogram.

    .. _`Tacotron: Towards End-to-End Speech Synthesis`:
         https://arxiv.org/abs/1703.10135

    """

    def __init__(
        self,
        idim,
        odim,
        conv_bank_layers=8,
        conv_bank_chans=128,
        conv_proj_filts=3,
        conv_proj_chans=256,
        highway_layers=4,
        highway_units=128,
        gru_units=256,
    ):
        """Initialize CBHG module.

        Args:
            idim (int): Dimension of the inputs.
            odim (int): Dimension of the outputs.
            conv_bank_layers (int, optional): The number of convolution bank layers.
            conv_bank_chans (int, optional): The number of channels in convolution bank.
            conv_proj_filts (int, optional):
                Kernel size of convolutional projection layer.
            conv_proj_chans (int, optional):
                The number of channels in convolutional projection layer.
            highway_layers (int, optional): The number of highway network layers.
            highway_units (int, optional): The number of highway network units.
            gru_units (int, optional): The number of GRU units (for both directions).

        """
        super(CBHG, self).__init__()
        self.idim = idim
        self.odim = odim
        self.conv_bank_layers = conv_bank_layers
        self.conv_bank_chans = conv_bank_chans
        self.conv_proj_filts = conv_proj_filts
        self.conv_proj_chans = conv_proj_chans
        self.highway_layers = highway_layers
        self.highway_units = highway_units
        self.gru_units = gru_units

        # define 1d convolution bank (kernel sizes 1 .. conv_bank_layers)
        self.conv_bank = torch.nn.ModuleList()
        for k in range(1, self.conv_bank_layers + 1):
            # pad so that the output keeps the input length; an even kernel
            # needs one more padded frame on the right than on the left
            if k % 2 != 0:
                padding = (k - 1) // 2
            else:
                padding = ((k - 1) // 2, (k - 1) // 2 + 1)
            self.conv_bank += [
                torch.nn.Sequential(
                    torch.nn.ConstantPad1d(padding, 0.0),
                    torch.nn.Conv1d(
                        idim, self.conv_bank_chans, k, stride=1, padding=0, bias=True
                    ),
                    torch.nn.BatchNorm1d(self.conv_bank_chans),
                    torch.nn.ReLU(),
                )
            ]

        # define max pooling (need padding for one-side to keep same length)
        self.max_pool = torch.nn.Sequential(
            torch.nn.ConstantPad1d((0, 1), 0.0), torch.nn.MaxPool1d(2, stride=1)
        )

        # define 1d convolution projection (back to idim channels so that the
        # result can be added to the module input)
        self.projections = torch.nn.Sequential(
            torch.nn.Conv1d(
                self.conv_bank_chans * self.conv_bank_layers,
                self.conv_proj_chans,
                self.conv_proj_filts,
                stride=1,
                padding=(self.conv_proj_filts - 1) // 2,
                bias=True,
            ),
            torch.nn.BatchNorm1d(self.conv_proj_chans),
            torch.nn.ReLU(),
            torch.nn.Conv1d(
                self.conv_proj_chans,
                self.idim,
                self.conv_proj_filts,
                stride=1,
                padding=(self.conv_proj_filts - 1) // 2,
                bias=True,
            ),
            torch.nn.BatchNorm1d(self.idim),
        )

        # define highway network; the first entry is a linear layer that
        # adjusts the dimension from idim to highway_units
        self.highways = torch.nn.ModuleList()
        self.highways += [torch.nn.Linear(idim, self.highway_units)]
        for _ in range(self.highway_layers):
            self.highways += [HighwayNet(self.highway_units)]

        # define bidirectional GRU
        self.gru = torch.nn.GRU(
            self.highway_units,
            gru_units // 2,
            num_layers=1,
            batch_first=True,
            bidirectional=True,
        )

        # define final projection
        self.output = torch.nn.Linear(gru_units, odim, bias=True)

    def forward(self, xs, ilens):
        """Calculate forward propagation.

        Args:
            xs (Tensor): Batch of the padded sequences of inputs (B, Tmax, idim).
            ilens (LongTensor): Batch of lengths of each input sequence (B,).
                A plain sequence of ints is also accepted.

        Return:
            Tensor: Batch of the padded sequence of outputs (B, Tmax, odim).
            LongTensor: Batch of lengths of each output sequence (B,).

        """
        xs = xs.transpose(1, 2)  # (B, idim, Tmax)
        convs = []
        for k in range(self.conv_bank_layers):
            convs += [self.conv_bank[k](xs)]
        convs = torch.cat(convs, dim=1)  # (B, #CH * #BANK, Tmax)
        convs = self.max_pool(convs)
        convs = self.projections(convs).transpose(1, 2)  # (B, Tmax, idim)
        xs = xs.transpose(1, 2) + convs  # residual connection
        # + 1 for dimension adjustment layer
        for i in range(self.highway_layers + 1):
            xs = self.highways[i](xs)

        # The length sort below relies on tensor methods, so a non-tensor
        # ``ilens`` has to be converted BEFORE sorting, not after it.
        if not isinstance(ilens, torch.Tensor):
            ilens = torch.tensor(ilens)

        # sort by length
        xs, ilens, sort_idx = self._sort_by_length(xs, ilens)

        # total_length needs for DataParallel
        # (see https://github.com/pytorch/pytorch/pull/6327)
        total_length = xs.size(1)
        xs = pack_padded_sequence(xs, ilens.cpu(), batch_first=True)
        self.gru.flatten_parameters()
        xs, _ = self.gru(xs)
        xs, ilens = pad_packed_sequence(xs, batch_first=True, total_length=total_length)

        # revert sorting by length
        xs, ilens = self._revert_sort_by_length(xs, ilens, sort_idx)

        xs = self.output(xs)  # (B, Tmax, odim)

        return xs, ilens

    def inference(self, x):
        """Inference.

        Args:
            x (Tensor): The sequences of inputs (T, idim).

        Return:
            Tensor: The sequence of outputs (T, odim).

        """
        assert len(x.size()) == 2
        # treat the single sequence as a batch of one with full length
        xs = x.unsqueeze(0)
        ilens = x.new([x.size(0)]).long()

        return self.forward(xs, ilens)[0][0]

    def _sort_by_length(self, xs, ilens):
        # reorder the batch by descending length and return the permutation
        _, sort_idx = ilens.sort(0, descending=True)
        return xs[sort_idx], ilens[sort_idx], sort_idx

    def _revert_sort_by_length(self, xs, ilens, sort_idx):
        # sorting the permutation yields its inverse
        _, revert_idx = sort_idx.sort(0)
        return xs[revert_idx], ilens[revert_idx]


class HighwayNet(torch.nn.Module):
    """Highway Network module.

    A single highway layer as introduced in `Highway Networks`_: the output
    is a gated mix of a transformed input and the input itself.

    .. _`Highway Networks`: https://arxiv.org/abs/1505.00387

    """

    def __init__(self, idim):
        """Initialize Highway Network module.

        Args:
            idim (int): Dimension of the inputs.

        """
        super().__init__()
        self.idim = idim

        # transform branch: linear + ReLU
        projection_layers = [torch.nn.Linear(idim, idim), torch.nn.ReLU()]
        self.projection = torch.nn.Sequential(*projection_layers)

        # gate branch: linear + sigmoid, so every gate value lies in (0, 1)
        gate_layers = [torch.nn.Linear(idim, idim), torch.nn.Sigmoid()]
        self.gate = torch.nn.Sequential(*gate_layers)

    def forward(self, x):
        """Calculate forward propagation.

        Args:
            x (Tensor): Batch of inputs (B, ..., idim).

        Returns:
            Tensor: Batch of outputs, which are the same shape as inputs (B, ..., idim).

        """
        transformed = self.projection(x)
        transform_gate = self.gate(x)
        carry_gate = 1.0 - transform_gate
        return transformed * transform_gate + x * carry_gate


================================================
FILE: nets/pytorch_backend/tacotron2/decoder.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright 2019 Nagoya University (Tomoki Hayashi)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Tacotron2 decoder related modules."""

import six

import torch
import torch.nn.functional as F

from espnet.nets.pytorch_backend.rnn.attentions import AttForwardTA


def decoder_init(m):
    """Initialize decoder parameters.

    Only ``torch.nn.Conv1d`` modules are touched: their weight is re-drawn
    with Xavier uniform initialization using the gain for ``tanh``.
    Any other module is left as it is.
    """
    if not isinstance(m, torch.nn.Conv1d):
        return
    tanh_gain = torch.nn.init.calculate_gain("tanh")
    torch.nn.init.xavier_uniform_(m.weight, tanh_gain)


class ZoneOutCell(torch.nn.Module):
    """ZoneOut Cell module.

    Wraps a recurrent cell and applies zoneout, as described in
    `Zoneout: Regularizing RNNs by Randomly Preserving Hidden Activations`_,
    to the states it returns.
    This code is modified from `eladhoffer/seq2seq.pytorch`_.

    Examples:
        >>> lstm = torch.nn.LSTMCell(16, 32)
        >>> lstm = ZoneOutCell(lstm, 0.5)

    .. _`Zoneout: Regularizing RNNs by Randomly Preserving Hidden Activations`:
        https://arxiv.org/abs/1606.01305

    .. _`eladhoffer/seq2seq.pytorch`:
        https://github.com/eladhoffer/seq2seq.pytorch

    """

    def __init__(self, cell, zoneout_rate=0.1):
        """Initialize zone out cell module.

        Args:
            cell (torch.nn.Module): Pytorch recurrent cell module
                e.g. `torch.nn.Module.LSTMCell`.
            zoneout_rate (float, optional): Probability of zoneout from 0.0 to 1.0.

        Raises:
            ValueError: If ``zoneout_rate`` is below 0.0 or above 1.0.

        """
        super().__init__()
        self.cell = cell
        self.hidden_size = cell.hidden_size
        self.zoneout_rate = zoneout_rate
        if zoneout_rate < 0.0 or zoneout_rate > 1.0:
            raise ValueError(
                "zoneout probability must be in the range from 0.0 to 1.0."
            )

    def forward(self, inputs, hidden):
        """Calculate forward propagation.

        Args:
            inputs (Tensor): Batch of input tensor (B, input_size).
            hidden (tuple):
                - Tensor: Batch of initial hidden states (B, hidden_size).
                - Tensor: Batch of initial cell states (B, hidden_size).

        Returns:
            tuple:
                - Tensor: Batch of next hidden states (B, hidden_size).
                - Tensor: Batch of next cell states (B, hidden_size).

        """
        candidate = self.cell(inputs, hidden)
        return self._zoneout(hidden, candidate, self.zoneout_rate)

    def _zoneout(self, h, next_h, prob):
        # a tuple of states is handled element by element, recursively;
        # a scalar probability is shared by all elements of the tuple
        if isinstance(h, tuple):
            n_states = len(h)
            probs = prob if isinstance(prob, tuple) else (prob,) * n_states
            return tuple(
                self._zoneout(h[idx], next_h[idx], probs[idx])
                for idx in range(n_states)
            )

        if not self.training:
            # evaluation: deterministic interpolation of old and new state
            return prob * h + (1 - prob) * next_h

        # training: where the sampled mask is 1 the previous value is kept,
        # elsewhere the new value is taken
        keep = h.new(*h.size()).bernoulli_(prob)
        return keep * h + (1 - keep) * next_h


class Prenet(torch.nn.Module):
    """Prenet module for decoder of Spectrogram prediction network.

    This is a module of Prenet in the decoder of Spectrogram prediction network,
    which described in `Natural TTS
    Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`_.
    The Prenet performs nonlinear conversion
    of inputs before input to auto-regressive lstm,
    which helps to learn diagonal attentions.

    Note:
        This module always applies dropout even in evaluation.
        See the detail in `Natural TTS Synthesis by
        Conditioning WaveNet on Mel Spectrogram Predictions`_.

    .. _`Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`:
       https://arxiv.org/abs/1712.05884

    """

    def __init__(self, idim, n_layers=2, n_units=256, dropout_rate=0.5):
        """Initialize prenet module.

        Args:
            idim (int): Dimension of the inputs.
            n_layers (int, optional): The number of prenet layers.
            n_units (int, optional): The number of prenet units.
            dropout_rate (float, optional): Dropout rate.

        """
        super(Prenet, self).__init__()
        self.dropout_rate = dropout_rate
        self.prenet = torch.nn.ModuleList()
        for layer in range(n_layers):
            # only the first layer sees the raw input dimension
            n_inputs = idim if layer == 0 else n_units
            self.prenet += [
                torch.nn.Sequential(torch.nn.Linear(n_inputs, n_units), torch.nn.ReLU())
            ]

    def forward(self, x):
        """Calculate forward propagation.

        Args:
            x (Tensor): Batch of input tensors (B, ..., idim).

        Returns:
            Tensor: Batch of output tensors (B, ..., n_units).
                With ``n_layers=0`` the input is returned unchanged.

        """
        for layer in self.prenet:
            # F.dropout is called without a ``training`` argument on purpose:
            # its default keeps dropout active in evaluation as well
            x = F.dropout(layer(x), self.dropout_rate)
        return x


class Postnet(torch.nn.Module):
    """Postnet module for Spectrogram prediction network.

    This is a module of Postnet in Spectrogram prediction network,
    which described in `Natural TTS Synthesis by
    Conditioning WaveNet on Mel Spectrogram Predictions`_.
    The Postnet refines the predicted
    Mel-filterbank of the decoder,
    which helps to compensate the detail structure of spectrogram.

    .. _`Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`:
       https://arxiv.org/abs/1712.05884

    """

    def __init__(
        self,
        idim,
        odim,
        n_layers=5,
        n_chans=512,
        n_filts=5,
        dropout_rate=0.5,
        use_batch_norm=True,
    ):
        """Initialize postnet module.

        Args:
            idim (int): Dimension of the inputs. Not used when building the
                layers: the first convolution takes ``odim`` input channels.
            odim (int): Dimension of the outputs.
            n_layers (int, optional): The number of layers.
            n_chans (int, optional): The number of filter channels.
            n_filts (int, optional): The number of filter size.
            dropout_rate (float, optional): Dropout rate.
            use_batch_norm (bool, optional): Whether to use batch normalization.

        """
        super(Postnet, self).__init__()
        padding = (n_filts - 1) // 2
        self.postnet = torch.nn.ModuleList()

        # hidden layers: conv (-> batch norm) -> tanh -> dropout
        for layer in range(n_layers - 1):
            ichans = odim if layer == 0 else n_chans
            modules = [
                torch.nn.Conv1d(
                    ichans,
                    n_chans,
                    n_filts,
                    stride=1,
                    padding=padding,
                    bias=False,
                )
            ]
            if use_batch_norm:
                modules.append(torch.nn.BatchNorm1d(n_chans))
            modules += [torch.nn.Tanh(), torch.nn.Dropout(dropout_rate)]
            self.postnet.append(torch.nn.Sequential(*modules))

        # output layer: conv (-> batch norm) -> dropout, without tanh
        ichans = n_chans if n_layers != 1 else odim
        modules = [
            torch.nn.Conv1d(
                ichans,
                odim,
                n_filts,
                stride=1,
                padding=padding,
                bias=False,
            )
        ]
        if use_batch_norm:
            modules.append(torch.nn.BatchNorm1d(odim))
        modules.append(torch.nn.Dropout(dropout_rate))
        self.postnet.append(torch.nn.Sequential(*modules))

    def forward(self, xs):
        """Calculate forward propagation.

        Args:
            xs (Tensor): Batch of the sequences of padded input tensors (B, idim, Tmax).

        Returns:
            Tensor: Batch of padded output tensor. (B, odim, Tmax).

        """
        for layer in self.postnet:
            xs = layer(xs)
        return xs


class Decoder(torch.nn.Module):
    """Decoder module of Spectrogram prediction network.

    This is a module of decoder of Spectrogram prediction network in Tacotron2,
    which described in `Natural TTS
    Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`_.
    The decoder generates the sequence of
    features from the sequence of the hidden states.

    .. _`Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`:
       https://arxiv.org/abs/1712.05884

    """

    def __init__(
        self,
        idim,
        odim,
        att,
        dlayers=2,
        dunits=1024,
        prenet_layers=2,
        prenet_units=256,
        postnet_layers=5,
        postnet_chans=512,
        postnet_filts=5,
        output_activation_fn=None,
        cumulate_att_w=True,
        use_batch_norm=True,
        use_concate=True,
        dropout_rate=0.5,
        zoneout_rate=0.1,
        reduction_factor=1,
    ):
        """Initialize Tacotron2 decoder module.

        Args:
            idim (int): Dimension of the inputs.
            odim (int): Dimension of the outputs.
            att (torch.nn.Module): Instance of attention class.
            dlayers (int, optional): The number of decoder lstm layers.
            dunits (int, optional): The number of decoder lstm units.
            prenet_layers (int, optional): The number of prenet layers.
            prenet_units (int, optional): The number of prenet units.
            postnet_layers (int, optional): The number of postnet layers.
            postnet_filts (int, optional): The number of postnet filter size.
            postnet_chans (int, optional): The number of postnet filter channels.
            output_activation_fn (torch.nn.Module, optional):
                Activation function for outputs.
            cumulate_att_w (bool, optional):
                Whether to cumulate previous attention weight.
            use_batch_norm (bool, optional): Whether to use batch normalization.
            use_concate (bool, optional): Whether to concatenate encoder embedding
                with decoder lstm outputs.
            dropout_rate (float, optional): Dropout rate.
            zoneout_rate (float, optional): Zoneout rate.
            reduction_factor (int, optional): Reduction factor.

        """
        super(Decoder, self).__init__()

        # store the hyperparameters
        self.idim = idim
        self.odim = odim
        self.att = att
        self.output_activation_fn = output_activation_fn
        self.cumulate_att_w = cumulate_att_w
        self.use_concate = use_concate
        self.reduction_factor = reduction_factor

        # AttForwardTA is additionally called with the previous decoder output
        self.use_att_extra_inputs = isinstance(self.att, AttForwardTA)

        # define lstm network
        # (without a prenet, the previous output itself is fed to the first layer)
        prenet_units = prenet_units if prenet_layers != 0 else odim
        self.lstm = torch.nn.ModuleList()
        for layer in range(dlayers):
            iunits = idim + prenet_units if layer == 0 else dunits
            lstm = torch.nn.LSTMCell(iunits, dunits)
            if zoneout_rate > 0.0:
                lstm = ZoneOutCell(lstm, zoneout_rate)
            self.lstm += [lstm]

        # define prenet
        if prenet_layers > 0:
            self.prenet = Prenet(
                idim=odim,
                n_layers=prenet_layers,
                n_units=prenet_units,
                dropout_rate=dropout_rate,
            )
        else:
            self.prenet = None

        # define postnet
        if postnet_layers > 0:
            self.postnet = Postnet(
                idim=idim,
                odim=odim,
                n_layers=postnet_layers,
                n_chans=postnet_chans,
                n_filts=postnet_filts,
                use_batch_norm=use_batch_norm,
                dropout_rate=dropout_rate,
            )
        else:
            self.postnet = None

        # define projection layers
        iunits = idim + dunits if use_concate else dunits
        self.feat_out = torch.nn.Linear(iunits, odim * reduction_factor, bias=False)
        self.prob_out = torch.nn.Linear(iunits, reduction_factor)

        # initialize
        self.apply(decoder_init)

    def _zero_state(self, hs):
        """Return a zero tensor of shape (B, hidden size of the first lstm layer)."""
        init_hs = hs.new_zeros(hs.size(0), self.lstm[0].hidden_size)
        return init_hs

    def _init_decoder_states(self, hs):
        """Create the initial recurrent states and the initial previous output.

        Args:
            hs (Tensor): Encoder hidden states; only its batch size, dtype and
                device are used.

        Returns:
            list: Zero cell states, one per lstm layer.
            list: Zero hidden states, one per lstm layer.
            Tensor: Zero "previous output" frame (B, odim).

        """
        n_layers = len(self.lstm)
        c_list = [self._zero_state(hs) for _ in range(n_layers)]
        z_list = [self._zero_state(hs) for _ in range(n_layers)]
        prev_out = hs.new_zeros(hs.size(0), self.odim)
        return c_list, z_list, prev_out

    def _attend(self, hs, hlens, z, prev_att_w, prev_out, **att_kwargs):
        """Call the attention module with the inputs its type expects.

        ``prev_out`` is forwarded only when ``self.use_att_extra_inputs`` is set.
        Extra keyword arguments are passed through to the attention module.

        Returns:
            Tensor: Attention context.
            Tensor: Attention weights.

        """
        if self.use_att_extra_inputs:
            return self.att(hs, hlens, z, prev_att_w, prev_out, **att_kwargs)
        return self.att(hs, hlens, z, prev_att_w, **att_kwargs)

    def _update_lstm_states(self, att_c, prev_out, z_list, c_list):
        """Advance the lstm stack by one step, updating the state lists in place.

        Args:
            att_c (Tensor): Attention context of the current step.
            prev_out (Tensor): Previous output frame (B, odim).
            z_list (list): Hidden states per layer (modified in place).
            c_list (list): Cell states per layer (modified in place).

        """
        prenet_out = self.prenet(prev_out) if self.prenet is not None else prev_out
        xs = torch.cat([att_c, prenet_out], dim=1)
        z_list[0], c_list[0] = self.lstm[0](xs, (z_list[0], c_list[0]))
        for i in range(1, len(self.lstm)):
            z_list[i], c_list[i] = self.lstm[i](z_list[i - 1], (z_list[i], c_list[i]))

    def _projection_input(self, att_c, z_list):
        """Build the input of the output projections from the last lstm layer."""
        if self.use_concate:
            return torch.cat([z_list[-1], att_c], dim=1)
        return z_list[-1]

    def _accumulate_att_w(self, prev_att_w, att_w):
        """Return the attention weights to feed to the next attention call."""
        if self.cumulate_att_w and prev_att_w is not None:
            return prev_att_w + att_w  # Note: error when use +=
        return att_w

    def forward(self, hs, hlens, ys):
        """Calculate forward propagation.

        Args:
            hs (Tensor): Batch of the sequences of padded hidden states (B, Tmax, idim).
            hlens (LongTensor): Batch of lengths of each input batch (B,).
            ys (Tensor):
                Batch of the sequences of padded target features (B, Lmax, odim).

        Returns:
            Tensor: Batch of output tensors after postnet (B, Lmax, odim).
            Tensor: Batch of output tensors before postnet (B, Lmax, odim).
            Tensor: Batch of logits of stop prediction (B, Lmax).
            Tensor: Batch of attention weights, one row per decoder step
                (B, Lmax / reduction_factor, Tmax).

        Note:
            This computation is performed in teacher-forcing manner.

        """
        # thin out frames (B, Lmax, odim) ->  (B, Lmax/r, odim)
        if self.reduction_factor > 1:
            ys = ys[:, self.reduction_factor - 1 :: self.reduction_factor]

        # length list should be list of int
        hlens = list(map(int, hlens))

        # initialize hidden states of decoder
        c_list, z_list, prev_out = self._init_decoder_states(hs)

        # initialize attention
        prev_att_w = None
        self.att.reset()

        # loop for an output sequence
        outs, logits, att_ws = [], [], []
        for y in ys.transpose(0, 1):
            att_c, att_w = self._attend(hs, hlens, z_list[0], prev_att_w, prev_out)
            self._update_lstm_states(att_c, prev_out, z_list, c_list)
            zcs = self._projection_input(att_c, z_list)
            # each step emits r frames: (B, odim * r) -> (B, odim, r)
            outs.append(self.feat_out(zcs).view(hs.size(0), self.odim, -1))
            logits.append(self.prob_out(zcs))
            att_ws.append(att_w)
            prev_out = y  # teacher forcing
            prev_att_w = self._accumulate_att_w(prev_att_w, att_w)

        logits = torch.cat(logits, dim=1)  # (B, Lmax)
        before_outs = torch.cat(outs, dim=2)  # (B, odim, Lmax)
        att_ws = torch.stack(att_ws, dim=1)  # (B, Lmax/r, Tmax)

        if self.postnet is not None:
            after_outs = before_outs + self.postnet(before_outs)  # (B, odim, Lmax)
        else:
            after_outs = before_outs
        before_outs = before_outs.transpose(2, 1)  # (B, Lmax, odim)
        after_outs = after_outs.transpose(2, 1)  # (B, Lmax, odim)

        # apply activation function for scaling
        if self.output_activation_fn is not None:
            before_outs = self.output_activation_fn(before_outs)
            after_outs = self.output_activation_fn(after_outs)

        return after_outs, before_outs, logits, att_ws

    def inference(
        self,
        h,
        threshold=0.5,
        minlenratio=0.0,
        maxlenratio=10.0,
        use_att_constraint=False,
        backward_window=None,
        forward_window=None,
    ):
        """Generate the sequence of features given the sequences of characters.

        Args:
            h (Tensor): Input sequence of encoder hidden states (T, C).
            threshold (float, optional): Threshold to stop generation.
            minlenratio (float, optional): Minimum length ratio.
                If set to 1.0 and the length of input is 10,
                the minimum length of outputs will be 10 * 1 = 10.
            maxlenratio (float, optional): Maximum length ratio.
                If set to 10 and the length of input is 10,
                the maximum length of outputs will be 10 * 10 = 100.
            use_att_constraint (bool):
                Whether to apply attention constraint introduced in `Deep Voice 3`_.
            backward_window (int): Backward window size in attention constraint.
            forward_window (int): Forward window size in attention constraint.

        Returns:
            Tensor: Output sequence of features (L, odim).
            Tensor: Output sequence of stop probabilities (L,).
            Tensor: Attention weights, one row per decoder step
                (L / reduction_factor, T).

        Note:
            This computation is performed in auto-regressive manner.

        .. _`Deep Voice 3`: https://arxiv.org/abs/1710.07654

        """
        # setup
        assert len(h.size()) == 2
        hs = h.unsqueeze(0)
        ilens = [h.size(0)]
        maxlen = int(h.size(0) * maxlenratio)
        minlen = int(h.size(0) * minlenratio)

        # initialize hidden states of decoder
        c_list, z_list, prev_out = self._init_decoder_states(hs)

        # initialize attention
        prev_att_w = None
        self.att.reset()

        # setup for attention constraint
        last_attended_idx = 0 if use_att_constraint else None

        # loop for an output sequence
        idx = 0
        outs, att_ws, probs = [], [], []
        while True:
            # updated index
            idx += self.reduction_factor

            # decoder calculation
            att_c, att_w = self._attend(
                hs,
                ilens,
                z_list[0],
                prev_att_w,
                prev_out,
                last_attended_idx=last_attended_idx,
                backward_window=backward_window,
                forward_window=forward_window,
            )
            att_ws.append(att_w)
            self._update_lstm_states(att_c, prev_out, z_list, c_list)
            zcs = self._projection_input(att_c, z_list)
            outs.append(self.feat_out(zcs).view(1, self.odim, -1))  # (1, odim, r)
            probs.append(torch.sigmoid(self.prob_out(zcs))[0])  # (r,)

            # the last generated frame becomes the next decoder input
            prev_out = outs[-1][:, :, -1]  # (1, odim)
            if self.output_activation_fn is not None:
                prev_out = self.output_activation_fn(prev_out)
            prev_att_w = self._accumulate_att_w(prev_att_w, att_w)
            if use_att_constraint:
                last_attended_idx = int(att_w.argmax())

            # finish when a stop is predicted or the maximum length is reached,
            # but never before the minimum length
            stop_predicted = bool((probs[-1] >= threshold).any())
            if (stop_predicted or idx >= maxlen) and idx >= minlen:
                break

        outs = torch.cat(outs, dim=2)  # (1, odim, L)
        if self.postnet is not None:
            outs = outs + self.postnet(outs)  # (1, odim, L)
        outs = outs.transpose(2, 1).squeeze(0)  # (L, odim)
        probs = torch.cat(probs, dim=0)
        att_ws = torch.cat(att_ws, dim=0)

        if self.output_activation_fn is not None:
            outs = self.output_activation_fn(outs)

        return outs, probs, att_ws

    def calculate_all_attentions(self, hs, hlens, ys):
        """Calculate all of the attention weights.

        Args:
            hs (Tensor): Batch of the sequences of padded hidden states (B, Tmax, idim).
            hlens (LongTensor): Batch of lengths of each input batch (B,).
            ys (Tensor):
                Batch of the sequences of padded target features (B, Lmax, odim).

        Returns:
            Tensor: Batch of attention weights, one row per decoder step
                (B, Lmax / reduction_factor, Tmax).

        Note:
            This computation is performed in teacher-forcing manner.

        """
        # thin out frames (B, Lmax, odim) ->  (B, Lmax/r, odim)
        if self.reduction_factor > 1:
            ys = ys[:, self.reduction_factor - 1 :: self.reduction_factor]

        # length list should be list of int
        hlens = list(map(int, hlens))

        # initialize hidden states of decoder
        c_list, z_list, prev_out = self._init_decoder_states(hs)

        # initialize attention
        prev_att_w = None
        self.att.reset()

        # loop for an output sequence
        att_ws = []
        for y in ys.transpose(0, 1):
            att_c, att_w = self._attend(hs, hlens, z_list[0], prev_att_w, prev_out)
            att_ws.append(att_w)
            self._update_lstm_states(att_c, prev_out, z_list, c_list)
            prev_out = y  # teacher forcing
            prev_att_w = self._accumulate_att_w(prev_att_w, att_w)

        att_ws = torch.stack(att_ws, dim=1)  # (B, Lmax/r, Tmax)

        return att_ws


================================================
FILE: nets/pytorch_backend/tacotron2/encoder.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright 2019 Nagoya University (Tomoki Hayashi)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Tacotron2 encoder related modules."""

import six

import torch

from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence


def encoder_init(m):
    """Initialize encoder parameters.

    Conv1d weights are re-drawn with Xavier-uniform initialization scaled by
    the ReLU gain; every other module type is left untouched.

    Args:
        m (torch.nn.Module): Module to initialize (e.g. as visited by
            ``torch.nn.Module.apply``).

    """
    if not isinstance(m, torch.nn.Conv1d):
        return
    relu_gain = torch.nn.init.calculate_gain("relu")
    torch.nn.init.xavier_uniform_(m.weight, gain=relu_gain)


class Encoder(torch.nn.Module):
    """Encoder module of Spectrogram prediction network.

    This is a module of encoder of Spectrogram prediction network in Tacotron2,
    which described in `Natural TTS Synthesis by Conditioning WaveNet on Mel
    Spectrogram Predictions`_. This is the encoder which converts either a sequence
    of characters or acoustic features into the sequence of hidden states.

    .. _`Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`:
       https://arxiv.org/abs/1712.05884

    """

    def __init__(
        self,
        idim,
        input_layer="embed",
        embed_dim=512,
        elayers=1,
        eunits=512,
        econv_layers=3,
        econv_chans=512,
        econv_filts=5,
        use_batch_norm=True,
        use_residual=False,
        dropout_rate=0.5,
        padding_idx=0,
    ):
        """Initialize Tacotron2 encoder module.

        Args:
            idim (int) Dimension of the inputs.
            input_layer (str): Input layer type ("embed" or "linear").
            embed_dim (int, optional) Dimension of character embedding.
            elayers (int, optional) The number of encoder blstm layers.
            eunits (int, optional) The number of encoder blstm units.
            econv_layers (int, optional) The number of encoder conv layers.
            econv_filts (int, optional) The number of encoder conv filter size.
            econv_chans (int, optional) The number of encoder conv filter channels.
            use_batch_norm (bool, optional) Whether to use batch normalization.
            use_residual (bool, optional) Whether to use residual connection.
            dropout_rate (float, optional) Dropout rate.
            padding_idx (int, optional) Padding index given to the embedding layer.

        Raises:
            ValueError: If ``input_layer`` is neither "embed" nor "linear".

        """
        super(Encoder, self).__init__()
        # store the hyperparameters
        self.idim = idim
        self.use_residual = use_residual

        # define network layer modules
        if input_layer == "linear":
            self.embed = torch.nn.Linear(idim, econv_chans)
        elif input_layer == "embed":
            self.embed = torch.nn.Embedding(idim, embed_dim, padding_idx=padding_idx)
        else:
            raise ValueError("unknown input_layer: " + input_layer)

        if econv_layers > 0:
            self.convs = torch.nn.ModuleList()
            for layer in range(econv_layers):
                ichans = (
                    embed_dim if layer == 0 and input_layer == "embed" else econv_chans
                )
                block = [
                    torch.nn.Conv1d(
                        ichans,
                        econv_chans,
                        econv_filts,
                        stride=1,
                        padding=(econv_filts - 1) // 2,
                        bias=False,
                    )
                ]
                if use_batch_norm:
                    block.append(torch.nn.BatchNorm1d(econv_chans))
                block += [torch.nn.ReLU(), torch.nn.Dropout(dropout_rate)]
                self.convs.append(torch.nn.Sequential(*block))
        else:
            self.convs = None
        if elayers > 0:
            # The blstm consumes the conv output when convs exist. Otherwise it
            # consumes the input layer output, which has econv_chans channels
            # for "linear" and embed_dim channels for "embed".
            if econv_layers != 0 or input_layer == "linear":
                iunits = econv_chans
            else:
                iunits = embed_dim
            self.blstm = torch.nn.LSTM(
                iunits, eunits // 2, elayers, batch_first=True, bidirectional=True
            )
        else:
            self.blstm = None

        # initialize
        self.apply(encoder_init)

    def forward(self, xs, ilens=None):
        """Calculate forward propagation.

        Args:
            xs (Tensor): Batch of the padded sequence. Either character ids (B, Tmax)
                or acoustic feature (B, Tmax, idim * encoder_reduction_factor). Padded
                value should be 0.
            ilens (LongTensor): Batch of lengths of each input batch (B,).
                Required when the encoder has blstm layers.

        Returns:
            Tensor: Batch of the sequences of encoder states(B, Tmax, eunits).
            LongTensor: Batch of lengths of each sequence (B,). Without blstm
                layers this is ``ilens`` itself (as a tensor), or None if
                ``ilens`` was not given.

        """
        xs = self.embed(xs).transpose(1, 2)
        if self.convs is not None:
            for conv in self.convs:
                if self.use_residual:
                    # Out-of-place add: an in-place ``+=`` would overwrite the
                    # tensor the convolution has just taken as its input.
                    xs = xs + conv(xs)
                else:
                    xs = conv(xs)
        if self.blstm is None:
            # Return the same (states, lengths) pair as the blstm path so that
            # callers (including ``inference``) can unpack the result uniformly.
            if ilens is not None and not isinstance(ilens, torch.Tensor):
                ilens = torch.tensor(ilens)
            return xs.transpose(1, 2), ilens
        if not isinstance(ilens, torch.Tensor):
            ilens = torch.tensor(ilens)
        xs = pack_padded_sequence(xs.transpose(1, 2), ilens.cpu(), batch_first=True)
        self.blstm.flatten_parameters()
        xs, _ = self.blstm(xs)  # (B, Tmax, C)
        xs, hlens = pad_packed_sequence(xs, batch_first=True)

        return xs, hlens

    def inference(self, x):
        """Inference.

        Args:
            x (Tensor): The sequence of character ids (T,)
                    or acoustic feature (T, idim * encoder_reduction_factor).

        Returns:
            Tensor: The sequences of encoder states(T, eunits).

        """
        xs = x.unsqueeze(0)
        ilens = torch.tensor([x.size(0)])

        # forward returns (states, lengths); take the states of the single item
        return self.forward(xs, ilens)[0][0]


================================================
FILE: nets/pytorch_backend/transducer/__init__.py
================================================
"""Initialize sub package."""


================================================
FILE: nets/pytorch_backend/transducer/arguments.py
================================================
"""Transducer model arguments."""

import ast
from distutils.util import strtobool


def add_encoder_general_arguments(group):
    """Register the options shared by every encoder type.

    Args:
        group: Argument parser or argument group providing ``add_argument``.

    Returns:
        The ``group`` that was passed in.

    """
    encoder_types = [
        "custom",
        "lstm", "blstm", "lstmp", "blstmp",
        "vgglstmp", "vggblstmp", "vgglstm", "vggblstm",
        "gru", "bgru", "grup", "bgrup",
        "vgggrup", "vggbgrup", "vgggru", "vggbgru",
    ]
    options = (
        (
            "--etype",
            {
                "default": "blstmp",
                "type": str,
                "choices": encoder_types,
                "help": "Type of encoder network architecture",
            },
        ),
        (
            "--dropout-rate",
            {
                "default": 0.0,
                "type": float,
                "help": "Dropout rate for the encoder",
            },
        ),
    )
    for flag, settings in options:
        group.add_argument(flag, **settings)

    return group


def add_rnn_encoder_arguments(group):
    """Register the options specific to RNN encoders.

    Args:
        group: Argument parser or argument group providing ``add_argument``.

    Returns:
        The ``group`` that was passed in.

    """
    options = (
        (
            ("--elayers",),
            {
                "default": 4,
                "type": int,
                "help": (
                    "Number of encoder layers (for shared recognition part "
                    "in multi-speaker asr mode)"
                ),
            },
        ),
        (
            ("--eunits", "-u"),
            {
                "default": 300,
                "type": int,
                "help": "Number of encoder hidden units",
            },
        ),
        (
            ("--eprojs",),
            {
                "default": 320,
                "type": int,
                "help": "Number of encoder projection units",
            },
        ),
        (
            ("--subsample",),
            {
                "default": "1",
                "type": str,
                "help": (
                    "Subsample input frames x_y_z means subsample every x frame "
                    "at 1st layer, every y frame at 2nd layer etc."
                ),
            },
        ),
    )
    for flags, settings in options:
        group.add_argument(*flags, **settings)

    return group


def add_custom_encoder_arguments(group):
    """Register the options of the block-based ("custom") encoder.

    Args:
        group: Argument parser or argument group providing ``add_argument``.

    Returns:
        The ``group`` that was passed in.

    Note:
        ``--enc-block-arch`` values are converted with the builtin ``eval``,
        which evaluates arbitrary Python expressions. Only pass trusted input.

    """
    activation_names = ["relu", "hardtanh", "selu", "swish"]
    options = (
        (
            "--enc-block-arch",
            {
                # NOTE(security): eval executes whatever expression is supplied.
                "type": eval,
                "action": "append",
                "default": None,
                "help": "Encoder architecture definition by blocks",
            },
        ),
        (
            "--enc-block-repeat",
            {
                "default": 0,
                "type": int,
                "help": "Repeat N times the provided encoder blocks if N > 1",
            },
        ),
        (
            "--custom-enc-input-layer",
            {
                "type": str,
                "default": "conv2d",
                "choices": ["conv2d", "vgg2l", "linear", "embed", "null"],
                "help": "Custom encoder input layer type",
            },
        ),
        (
            "--custom-enc-positional-encoding-type",
            {
                "type": str,
                "default": "abs_pos",
                "choices": ["abs_pos", "scaled_abs_pos", "rel_pos"],
                "help": "Custom encoder positional encoding layer type",
            },
        ),
        (
            "--custom-enc-self-attn-type",
            {
                "type": str,
                "default": "self_attn",
                "choices": ["self_attn", "rel_self_attn"],
                "help": "Custom encoder self-attention type",
            },
        ),
        (
            "--custom-enc-pw-activation-type",
            {
                "type": str,
                "default": "relu",
                "choices": list(activation_names),
                "help": "Custom encoder pointwise activation type",
            },
        ),
        (
            "--custom-enc-conv-mod-activation-type",
            {
                "type": str,
                "default": "swish",
                "choices": list(activation_names),
                "help": "Custom encoder convolutional module activation type",
            },
        ),
    )
    for flag, settings in options:
        group.add_argument(flag, **settings)

    return group


def add_decoder_general_arguments(group):
    """Register the options shared by every decoder type.

    Args:
        group: Argument parser or argument group providing ``add_argument``.

    Returns:
        The ``group`` that was passed in.

    """
    options = (
        (
            "--dtype",
            {
                "default": "lstm",
                "type": str,
                "choices": ["lstm", "gru", "custom"],
                "help": "Type of decoder to use",
            },
        ),
        (
            "--dropout-rate-decoder",
            {
                "default": 0.0,
                "type": float,
                "help": "Dropout rate for the decoder",
            },
        ),
        (
            "--dropout-rate-embed-decoder",
            {
                "default": 0.0,
                "type": float,
                "help": "Dropout rate for the decoder embedding layer",
            },
        ),
    )
    for flag, settings in options:
        group.add_argument(flag, **settings)

    return group


def add_rnn_decoder_arguments(group):
    """Register the options specific to RNN decoders.

    Args:
        group: Argument parser or argument group providing ``add_argument``.

    Returns:
        The ``group`` that was passed in.

    """
    # (flag, default, help) -- every option here is an integer size
    int_options = (
        ("--dec-embed-dim", 320, "Number of decoder embeddings dimensions"),
        ("--dlayers", 1, "Number of decoder layers"),
        ("--dunits", 320, "Number of decoder hidden units"),
    )
    for flag, default, help_text in int_options:
        group.add_argument(flag, default=default, type=int, help=help_text)

    return group


def add_custom_decoder_arguments(group):
    """Register the options of the block-based ("custom") decoder.

    Args:
        group: Argument parser or argument group providing ``add_argument``.

    Returns:
        The ``group`` that was passed in.

    Note:
        ``--dec-block-arch`` values are converted with the builtin ``eval``,
        which evaluates arbitrary Python expressions. Only pass trusted input.

    """
    options = (
        (
            "--dec-block-arch",
            {
                # NOTE(security): eval executes whatever expression is supplied.
                "type": eval,
                "action": "append",
                "default": None,
                "help": "Custom decoder blocks definition",
            },
        ),
        (
            "--dec-block-repeat",
            {
                "default": 1,
                "type": int,
                "help": "Repeat N times the provided decoder blocks if N > 1",
            },
        ),
        (
            "--custom-dec-input-layer",
            {
                "type": str,
                "default": "embed",
                "choices": ["linear", "embed"],
                "help": "Custom decoder input layer type",
            },
        ),
        (
            "--custom-dec-pw-activation-type",
            {
                "type": str,
                "default": "relu",
                "choices": ["relu", "hardtanh", "selu", "swish"],
                "help": "Custom decoder pointwise activation type",
            },
        ),
    )
    for flag, settings in options:
        group.add_argument(flag, **settings)

    return group


def add_custom_training_arguments(group):
    """Register the optimizer options used when training a Custom architecture.

    Args:
        group: Argument parser or argument group providing ``add_argument``.

    Returns:
        The ``group`` that was passed in.

    """
    # (flag, type, default, help)
    options = (
        ("--transformer-warmup-steps", int, 25000, "Optimizer warmup steps"),
        ("--transformer-lr", float, 10.0, "Initial value of learning rate"),
    )
    for flag, value_type, default, help_text in options:
        group.add_argument(flag, default=default, type=value_type, help=help_text)

    return group


def add_transducer_arguments(group):
    """Register the general options of the transducer model.

    Args:
        group: Argument parser or argument group providing ``add_argument``.

    Returns:
        The ``group`` that was passed in.

    """
    options = (
        (
            "--trans-type",
            {
                "default": "warp-transducer",
                "type": str,
                "choices": ["warp-transducer", "warp-rnnt"],
                "help": "Type of transducer implementation to calculate loss.",
            },
        ),
        (
            "--transducer-weight",
            {
                "default": 1.0,
                "type": float,
                "help": "Weight of transducer loss when auxiliary task is used.",
            },
        ),
        (
            "--joint-dim",
            {
                "default": 320,
                "type": int,
                "help": "Number of dimensions in joint space",
            },
        ),
        (
            "--joint-activation-type",
            {
                "type": str,
                "default": "tanh",
                "choices": ["relu", "tanh", "swish"],
                "help": "Joint network activation type",
            },
        ),
        (
            "--score-norm",
            {
                "type": strtobool,
                "nargs": "?",
                "default": True,
                "help": "Normalize transducer scores by length",
            },
        ),
    )
    for flag, settings in options:
        group.add_argument(flag, **settings)

    return group


def add_auxiliary_task_arguments(group):
    """Register the options controlling the auxiliary training tasks.

    Covers the generic auxiliary task as well as the CTC, cross-entropy,
    MMI and MBR auxiliary losses.

    Args:
        group: Argument parser or argument group providing ``add_argument``.

    Returns:
        The ``group`` that was passed in.

    """

    def switch(help_text):
        # boolean option taking an optional value, off unless given
        return {
            "type": strtobool,
            "nargs": "?",
            "default": False,
            "help": help_text,
        }

    def value(value_type, default, help_text):
        # option holding a single typed value
        return {"default": default, "type": value_type, "help": help_text}

    options = (
        (
            "--aux-task-type",
            {
                "nargs": "?",
                "default": None,
                "choices": ["default", "symm_kl_div", "both"],
                "help": "Type of auxiliary task.",
            },
        ),
        (
            "--aux-task-layer-list",
            value(ast.literal_eval, None, "List of layers to use for auxiliary task."),
        ),
        ("--aux-task-weight", value(float, 0.3, "Weight of auxiliary task loss.")),
        ("--aux-ctc", switch("Whether to use CTC as auxiliary task.")),
        ("--aux-ctc-weight", value(float, 1.0, "Weight of auxiliary task loss")),
        (
            "--aux-ctc-dropout-rate",
            value(float, 0.0, "Dropout rate for auxiliary CTC"),
        ),
        (
            "--aux-cross-entropy",
            switch("Whether to use CE as auxiliary task for the prediction network."),
        ),
        (
            "--aux-cross-entropy-smoothing",
            value(
                float,
                0.0,
                "Smoothing rate for cross-entropy. If > 0, enables label smoothing loss.",
            ),
        ),
        (
            "--aux-cross-entropy-weight",
            value(float, 0.5, "Weight of auxiliary task loss"),
        ),
        ("--aux-mmi", switch("Whether to use mmi as auxiliary task.")),
        ("--aux-mmi-weight", value(float, 0.5, "Weight of auxiliary mmi loss")),
        (
            "--aux-mmi-dropout-rate",
            value(float, 0.0, "Dropout rate for auxiliary mmi"),
        ),
        (
            "--aux-mmi-type",
            {
                "type": str,
                "choices": ['mmi', 'phonectc'],
                "default": 'mmi',
                "help": "LF-MMI or CTC",
            },
        ),
        ("--aux-mbr", switch("Whether to use mbr as auxiliary task.")),
        ("--aux-mbr-weight", value(float, 1.0, "Weight of auxiliary mbr loss")),
        (
            "--aux-mbr-beam",
            value(int, 2, "Number of hypothesis for MBR loss computation"),
        ),
    )
    for flag, settings in options:
        group.add_argument(flag, **settings)

    return group

def add_att_scorer_arguments(group):
    """Register the attention-scorer options on an argument group.

    The options are mainly copied from
    espnet.nets.pytorch_backend.transformer.argument, restricted to the ones
    used by the attention decoder / rescorer. Every flag carries the ``att``
    prefix, which marks it as belonging to the RNN-T attention scorer only.

    Args:
        group: Object exposing ``add_argument`` (argparse parser or group).

    Returns:
        The same ``group`` object, with the options registered on it.

    """
    group.add_argument(
        "--att-scorer-weight",
        default=0.0,
        type=float,
        help="weight of attention scorer loss",
    )
    group.add_argument(
        "--att-decoder-selfattn-layer-type",
        type=str,
        default="selfattn",
        choices=[
            "selfattn",
            "lightconv",
            "lightconv2d",
            "dynamicconv",
            "dynamicconv2d",
            "light-dynamicconv2d",
        ],
        help="transformer decoder self-attention layer type",
    )
    group.add_argument(
        "--att-adim",
        default=320,
        type=int,
        help="Number of attention transformation dimensions",
    )
    group.add_argument(
        "--att-aheads",
        default=4,
        type=int,
        help="Number of heads for multi head attention",
    )
    group.add_argument(
        "--att-wshare",
        default=4,
        type=int,
        help="Number of parameter sharing for lightweight convolution",
    )
    group.add_argument(
        "--att-ldconv-decoder-kernel-length",
        default="11_13_15_17_19_21",
        type=str,
        help="kernel size for lightweight/dynamic convolution: "
        'Decoder side. For example, "21_23_25" means kernel length 21 for '
        "First layer, 23 for Second layer and so on.",
    )
    group.add_argument(
        "--att-ldconv-usebias",
        type=strtobool,
        default=False,
        help="use bias term in lightweight/dynamic convolution",
    )
    group.add_argument(
        "--att-dlayers", default=1, type=int, help="Number of decoder layers"
    )
    group.add_argument(
        "--att-dunits", default=320, type=int, help="Number of decoder hidden units"
    )
    group.add_argument(
        "--att-attn-dropout-rate",
        default=None,
        type=float,
        help="dropout in transformer attention. use --dropout-rate if None is set",
    )
    # NOTE(review): the help text below mentions the encoder although this is
    # an attention-scorer option; wording kept as-is, confirm with the owner.
    group.add_argument(
        "--att-dropout-rate",
        default=0.0,
        type=float,
        help="Dropout rate for the encoder",
    )
    group.add_argument(
        "--att-length-normalized-loss",
        default=True,
        type=strtobool,
        help="normalize loss by length",
    )
    return group


def add_transducer_code_switch_arguments(group):
    """Register the code-switch (``--cs-*``) options for transducer models.

    Args:
        group: Object exposing ``add_argument`` (argparse parser or group).

    Returns:
        The same ``group`` object, with the options registered on it.

    """
    group.add_argument(
        "--cs-is-pretrain",
        default=False,
        type=strtobool,
        help="If true, ignore decoder loss",
    )
    group.add_argument(
        "--cs-share-encoder",
        default=False,
        type=strtobool,
        help="If true, use a shared encoder before the language-specific encoder",
    )
    group.add_argument(
        "--cs-share-encoder-layers",
        default=9,
        type=int,
        help="If true, number of layers in shared encoder",
    )
    group.add_argument(
        "--cs-chn-start",
        default=5,
        type=int,
        help="start index of chn symbols in dict",
    )
    group.add_argument(
        "--cs-eng-start",
        default=4302,
        type=int,
        help="start index of eng symbols in dict",
    )
    # The flag spelling ("adversial") is part of the command-line interface
    # and is therefore left untouched.
    group.add_argument(
        "--cs-use-adversial-examples",
        default=False,
        type=strtobool,
        help="If true, mask symbols not from this language",
    )
    group.add_argument(
        "--cs-is-ctc-decoder",
        default=False,
        type=strtobool,
        help="If true, the fine tuning system is on CTC rather than RNNT",
    )
    group.add_argument(
        "--cs-use-mask-predictor",
        default=False,
        type=strtobool,
        help="If true, use a mask-filter process in combine function",
    )
    group.add_argument(
        "--cs-lang-weight",
        default=0.0,
        type=float,
        help="weight of language classification",
    )
    group.add_argument(
        "--cs-decoder-expert",
        default=False,
        type=strtobool,
        help="If true, use decoder expert",
    )
    return group


================================================
FILE: nets/pytorch_backend/transducer/auxiliary_task.py
================================================
"""Auxiliary task implementation for transducer models."""

from itertools import chain
from typing import List
from typing import Tuple
from typing import Union

import torch
import torch.nn.functional as F

from espnet.nets.transducer_decoder_interface import TransducerDecoderInterface


class AuxiliaryTask(torch.nn.Module):
    """Auxiliary task module.

    Each intermediate encoder output is passed through a small MLP and then
    through the (shared) joint network together with the decoder output. The
    result feeds an auxiliary transducer loss and/or a symmetric KL divergence
    against the main joint output, depending on ``aux_task_type``.
    """

    def __init__(
        self,
        decoder: Union[torch.nn.Module, "TransducerDecoderInterface"],
        joint_network: torch.nn.Module,
        rnnt_criterion: torch.nn.Module,
        aux_task_type: str,
        aux_task_weight: float,
        encoder_out_dim: int,
        joint_dim: int,
    ):
        """Auxiliary task initialization.

        Args:
            decoder: Decoder module
            joint_network: Joint network module
            rnnt_criterion: Transducer loss, called as
                ``rnnt_criterion(joint_out, target, pred_len, target_len)``
            aux_task_type: Auxiliary task type. "default" computes only the
                transducer loss, "symm_kl_div" only the symmetric KL
                divergence, any other value computes both.
            aux_task_weight: Auxiliary task weight (applied to both losses)
            encoder_out_dim: Encoder output dimension
            joint_dim: Joint space dimension

        """
        super().__init__()

        self.rnnt_criterion = rnnt_criterion

        self.mlp_net = torch.nn.Sequential(
            torch.nn.Linear(encoder_out_dim, joint_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(joint_dim, joint_dim),
        )

        self.decoder = decoder
        self.joint_network = joint_network

        self.aux_task_type = aux_task_type
        self.aux_task_weight = aux_task_weight

    def forward(
        self,
        enc_out_aux: List,
        dec_out: torch.Tensor,
        main_joint: torch.Tensor,
        target: torch.Tensor,
        pred_len: torch.Tensor,
        target_len: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Forward auxiliary task.

        Args:
            enc_out_aux: List of encoder intermediate outputs
            dec_out: Decoder outputs
            main_joint: Joint output for main task
            target: Target labels
            pred_len: Prediction lengths
            target_len: Target lengths

        Returns:
            : (Weighted auxiliary transducer loss, Weighted auxiliary symmetric KL loss)
              A loss that is disabled by ``aux_task_type`` (or an empty
              ``enc_out_aux``) is returned as the weight multiplied by 0.

        """
        aux_trans = 0
        aux_symm_kl = 0

        # The decoder and joint network must not receive gradient from the
        # auxiliary path. Remember each parameter's current flag so parameters
        # the caller froze on purpose are not silently unfrozen afterwards.
        shared_params = list(
            chain(self.decoder.parameters(), self.joint_network.parameters())
        )
        saved_flags = [p.requires_grad for p in shared_params]

        for p in shared_params:
            p.requires_grad = False

        try:
            for enc_aux in enc_out_aux:
                aux_mlp = self.mlp_net(enc_aux)

                aux_joint = self.joint_network(
                    aux_mlp.unsqueeze(2),
                    dec_out.unsqueeze(1),
                    is_aux=True,
                )

                if self.aux_task_type != "symm_kl_div":
                    aux_trans += self.rnnt_criterion(
                        aux_joint,
                        target,
                        pred_len,
                        target_len,
                    )

                if self.aux_task_type != "default":
                    aux_symm_kl += F.kl_div(
                        F.log_softmax(main_joint, dim=-1),
                        F.softmax(aux_joint, dim=-1),
                        reduction="mean",
                    ) + F.kl_div(
                        F.log_softmax(aux_joint, dim=-1),
                        F.softmax(main_joint, dim=-1),
                        reduction="mean",
                    )
        finally:
            # Restore the flags even when the loop raised, so a failure here
            # cannot leave the main model frozen.
            for p, flag in zip(shared_params, saved_flags):
                p.requires_grad = flag

        return self.aux_task_weight * aux_trans, self.aux_task_weight * aux_symm_kl


================================================
FILE: nets/pytorch_backend/transducer/blocks.py
================================================
"""Set of methods to create custom architecture."""

from collections import Counter

import torch

from espnet.nets.pytorch_backend.conformer.convolution import ConvolutionModule
from espnet.nets.pytorch_backend.conformer.encoder_layer import (
    EncoderLayer as ConformerEncoderLayer,  # noqa: H301
)

from espnet.nets.pytorch_backend.nets_utils import get_activation

from espnet.nets.pytorch_backend.transducer.causal_conv1d import CausalConv1d
from espnet.nets.pytorch_backend.transducer.transformer_decoder_layer import (
    DecoderLayer,  # noqa: H301
)
from espnet.nets.pytorch_backend.transducer.tdnn import TDNN
from espnet.nets.pytorch_backend.transducer.vgg2l import VGG2L

from espnet.nets.pytorch_backend.transformer.attention import (
    MultiHeadedAttention,  # noqa: H301
    RelPositionMultiHeadedAttention,  # noqa: H301
)
from espnet.nets.pytorch_backend.transformer.encoder_layer import EncoderLayer
from espnet.nets.pytorch_backend.transformer.embedding import (
    PositionalEncoding,  # noqa: H301
    ScaledPositionalEncoding,  # noqa: H301
    RelPositionalEncoding,  # noqa: H301
)
from espnet.nets.pytorch_backend.transformer.positionwise_feed_forward import (
    PositionwiseFeedForward,  # noqa: H301
)
from espnet.nets.pytorch_backend.transformer.repeat import MultiSequential
from espnet.nets.pytorch_backend.transformer.subsampling import Conv2dSubsampling


def check_and_prepare(net_part, blocks_arch, input_layer):
    """Check consecutive block shapes match and prepare input parameters.

    Args:
        net_part (str): either 'encoder' or 'decoder'
        blocks_arch (list): list of blocks for network part (type and parameters)
        input_layer (str): input layer type

    Return:
        input_layer (str): input layer type ('c-embed' is forced when the
            first block is a causal-conv1d block)
        input_layer_odim (int): output dim of input layer
        input_dropout_rate (float): dropout rate of input layer
        input_pos_dropout_rate (float): dropout rate of input layer positional enc.
        out_dim (int): output dim of last block

    Raises:
        ValueError: a block has no 'type', lacks mandatory parameters, is a
            conformer block outside the encoder, or two consecutive blocks
            have mismatching output/input dimensions.
        NotImplementedError: a block type is unknown, or transformer and
            conformer blocks are mixed in the same network part.

    """
    # The (value, count) pairs are sorted on the VALUE in descending order, so
    # the largest dropout rate declared by any block is selected (not the most
    # frequent one). 0.0 is used when no block declares a rate.
    input_dropout_rate = sorted(
        Counter(
            b["dropout-rate"] for b in blocks_arch if "dropout-rate" in b
        ).most_common(),
        key=lambda x: x[0],
        reverse=True,
    )

    input_pos_dropout_rate = sorted(
        Counter(
            b["pos-dropout-rate"] for b in blocks_arch if "pos-dropout-rate" in b
        ).most_common(),
        key=lambda x: x[0],
        reverse=True,
    )

    input_dropout_rate = input_dropout_rate[0][0] if input_dropout_rate else 0.0
    input_pos_dropout_rate = (
        input_pos_dropout_rate[0][0] if input_pos_dropout_rate else 0.0
    )

    # (input dim, output dim) of every block, in order.
    cmp_io = []
    has_transformer = False
    has_conformer = False
    for i, block in enumerate(blocks_arch):
        block_id = "Block " + str(i + 1)

        if "type" not in block:
            raise ValueError("type is not defined in the " + str(i + 1) + "th block.")
        block_type = block["type"]

        if block_type == "transformer":
            if not {"d_hidden", "d_ff", "heads"}.issubset(block):
                raise ValueError(
                    block_id
                    + " in "
                    + net_part
                    + ": Transformer block format is: {'type: transformer', "
                    "'d_hidden': int, 'd_ff': int, 'heads': int, [...]}"
                )

            has_transformer = True
            cmp_io.append((block["d_hidden"], block["d_hidden"]))
        elif block_type == "conformer":
            if net_part != "encoder":
                raise ValueError(
                    block_id + ": conformer type is only for encoder part."
                )

            if not {
                "d_hidden",
                "d_ff",
                "heads",
                "macaron_style",
                "use_conv_mod",
            }.issubset(block):
                raise ValueError(
                    block_id
                    + " in "
                    + net_part
                    + ": Conformer block format is {'type: conformer', "
                    "'d_hidden': int, 'd_ff': int, 'heads': int, "
                    "'macaron_style': bool, 'use_conv_mod': bool, [...]}"
                )

            if block["use_conv_mod"] is True and "conv_mod_kernel" not in block:
                raise ValueError(
                    block_id
                    + ": 'use_conv_mod' is True but 'conv_mod_kernel' is not specified"
                )

            has_conformer = True
            cmp_io.append((block["d_hidden"], block["d_hidden"]))
        elif block_type == "causal-conv1d":
            if not {"idim", "odim", "kernel_size"}.issubset(block):
                raise ValueError(
                    block_id
                    + " in "
                    + net_part
                    + ": causal conv1d block format is: {'type: causal-conv1d', "
                    "'idim': int, 'odim': int, 'kernel_size': int}"
                )

            # A leading causal convolution replaces the default input layer.
            if i == 0:
                input_layer = "c-embed"

            cmp_io.append((block["idim"], block["odim"]))
        elif block_type == "tdnn":
            if not {"idim", "odim", "ctx_size", "dilation", "stride"}.issubset(block):
                raise ValueError(
                    block_id
                    + " in "
                    + net_part
                    + ": TDNN block format is: {'type: tdnn', "
                    "'idim': int, 'odim': int, 'ctx_size': int, "
                    "'dilation': int, 'stride': int, [...]}"
                )

            cmp_io.append((block["idim"], block["odim"]))
        else:
            raise NotImplementedError(
                "Wrong type for block "
                + str(i + 1)
                + " in "
                + net_part
                + ". Currently supported: "
                "tdnn, causal-conv1d, transformer or conformer"
            )

    if has_transformer and has_conformer:
        raise NotImplementedError(
            net_part + ": transformer and conformer blocks "
            "can't be defined in the same net part."
        )

    for i in range(1, len(cmp_io)):
        if cmp_io[(i - 1)][1] != cmp_io[i][0]:
            raise ValueError(
                "Output/Input mismatch between blocks "
                + str(i)
                + " and "
                + str(i + 1)
                + " in "
                + net_part
            )

    if blocks_arch[0]["type"] in ("tdnn", "causal-conv1d"):
        input_layer_odim = blocks_arch[0]["idim"]
    else:
        input_layer_odim = blocks_arch[0]["d_hidden"]

    if blocks_arch[-1]["type"] in ("tdnn", "causal-conv1d"):
        out_dim = blocks_arch[-1]["odim"]
    else:
        out_dim = blocks_arch[-1]["d_hidden"]

    return (
        input_layer,
        input_layer_odim,
        input_dropout_rate,
        input_pos_dropout_rate,
        out_dim,
    )


def get_pos_enc_and_att_class(net_part, pos_enc_type, self_attn_type):
    """Select the positional encoding class and the self-attention class.

    Args:
        net_part (str): either 'encoder' or 'decoder'
        pos_enc_type (str): 'abs_pos', 'scaled_abs_pos' or 'rel_pos'
        self_attn_type (str): 'rel_self_attn' selects relative-position
            attention, any other value selects standard multi-head attention

    Return:
        pos_enc_class (torch.nn.Module): positional encoding class
        self_attn_class (torch.nn.Module): self-attention class

    Raises:
        ValueError: 'rel_pos' is requested for the encoder without
            'rel_self_attn'.
        NotImplementedError: pos_enc_type is not one of the supported values.

    """
    uses_rel_attn = self_attn_type == "rel_self_attn"

    if pos_enc_type == "rel_pos":
        # Only the encoder enforces the pairing with relative attention.
        if net_part == "encoder" and not uses_rel_attn:
            raise ValueError("'rel_pos' is only compatible with 'rel_self_attn'")
        pos_enc_class = RelPositionalEncoding
    elif pos_enc_type == "scaled_abs_pos":
        pos_enc_class = ScaledPositionalEncoding
    elif pos_enc_type == "abs_pos":
        pos_enc_class = PositionalEncoding
    else:
        raise NotImplementedError(
            "pos_enc_type should be either 'abs_pos', 'scaled_abs_pos' or 'rel_pos'"
        )

    self_attn_class = (
        RelPositionMultiHeadedAttention if uses_rel_attn else MultiHeadedAttention
    )

    return pos_enc_class, self_attn_class


def build_input_layer(
    input_layer,
    idim,
    odim,
    pos_enc_class,
    dropout_rate_embed,
    dropout_rate,
    pos_dropout_rate,
    padding_idx,
):
    """Create the input layer placed in front of the blocks.

    Args:
        input_layer (str): input layer type ('null', 'linear', 'conv2d',
            'vgg2l', 'embed' or 'c-embed')
        idim (int): input dimension
        odim (int): output dimension
        pos_enc_class (class): positional encoding class
        dropout_rate_embed (float): dropout rate for embedding layer
        dropout_rate (float): dropout rate for input layer
        pos_dropout_rate (float): dropout rate for positional encoding
        padding_idx (int): padding index for embedding input layer (if specified)

    Returns:
        (torch.nn.*): input layer module (None for 'null')
        subsampling_factor (int): subsampling factor

    Raises:
        NotImplementedError: input_layer is not one of the supported types.

    """
    if input_layer == "null":
        return None, 1

    # The subsampling front-ends get a positional encoding instance only when
    # the relative variant is in use; otherwise they receive None.
    subsampling_pos_enc = (
        pos_enc_class(odim, pos_dropout_rate)
        if pos_enc_class.__name__ == "RelPositionalEncoding"
        else None
    )

    if input_layer == "conv2d":
        frontend = Conv2dSubsampling(idim, odim, dropout_rate, subsampling_pos_enc)
        return frontend, 4

    if input_layer == "vgg2l":
        frontend = VGG2L(idim, odim, subsampling_pos_enc)
        return frontend, 4

    if input_layer == "linear":
        layers = [
            torch.nn.Linear(idim, odim),
            torch.nn.LayerNorm(odim),
            torch.nn.Dropout(dropout_rate),
            torch.nn.ReLU(),
            pos_enc_class(odim, pos_dropout_rate),
        ]
    elif input_layer == "embed":
        layers = [
            torch.nn.Embedding(idim, odim, padding_idx=padding_idx),
            pos_enc_class(odim, pos_dropout_rate),
        ]
    elif input_layer == "c-embed":
        # Embedding for a causal-conv1d first block: dropout instead of
        # positional encoding.
        layers = [
            torch.nn.Embedding(idim, odim, padding_idx=padding_idx),
            torch.nn.Dropout(dropout_rate_embed),
        ]
    else:
        raise NotImplementedError("Support: linear, conv2d, vgg2l and embed")

    return torch.nn.Sequential(*layers), 1


def build_transformer_block(net_part, block_arch, pw_layer_type, pw_activation_type):
    """Build function for transformer block.

    Args:
        net_part (str): either 'encoder' or 'decoder'
        block_arch (dict): transformer block parameters ('d_hidden', 'd_ff'
            and 'heads' are mandatory; 'dropout-rate', 'pos-dropout-rate' and
            'att-dropout-rate' default to 0.0)
        pw_layer_type (str): positionwise layer type
        pw_activation_type (str): positionwise activation type

    Returns:
        (function): function to create transformer block

    Raises:
        ValueError: net_part is neither 'encoder' nor 'decoder'.
        NotImplementedError: pw_layer_type is not 'linear'.

    """
    # Reject an unknown network part up front; otherwise the layer class would
    # stay undefined and only fail later, when the returned factory is called.
    if net_part not in ("encoder", "decoder"):
        raise ValueError(
            "net_part should be either 'encoder' or 'decoder', got: " + str(net_part)
        )

    d_hidden = block_arch["d_hidden"]
    d_ff = block_arch["d_ff"]
    heads = block_arch["heads"]

    dropout_rate = block_arch.get("dropout-rate", 0.0)
    pos_dropout_rate = block_arch.get("pos-dropout-rate", 0.0)
    att_dropout_rate = block_arch.get("att-dropout-rate", 0.0)

    if pw_layer_type != "linear":
        raise NotImplementedError("Transformer block only supports linear yet.")

    pw_layer = PositionwiseFeedForward
    pw_activation = get_activation(pw_activation_type)
    pw_layer_args = (d_hidden, d_ff, pos_dropout_rate, pw_activation)

    transformer_layer_class = EncoderLayer if net_part == "encoder" else DecoderLayer

    return lambda: transformer_layer_class(
        d_hidden,
        MultiHeadedAttention(heads, d_hidden, att_dropout_rate),
        pw_layer(*pw_layer_args),
        dropout_rate,
    )


def build_conformer_block(
    block_arch,
    self_attn_class,
    pw_layer_type,
    pw_activation_type,
    conv_mod_activation_type,
):
    """Return a factory creating one conformer encoder layer.

    Args:
        block_arch (dict): conformer block parameters
        self_attn_class (class): self-attention module class
        pw_layer_type (str): positionwise layer type
        pw_activation_type (str): positionwise activation type
        conv_mod_activation_type (str): convolutional module activation type

    Returns:
        (function): function to create conformer block

    Raises:
        NotImplementedError: pw_layer_type is not 'linear'.

    """
    hidden_dim = block_arch["d_hidden"]
    ff_dim = block_arch["d_ff"]
    num_heads = block_arch["heads"]
    with_macaron = block_arch["macaron_style"]
    with_conv = block_arch["use_conv_mod"]

    # Optional rates fall back to 0.0 when the block does not declare them.
    block_dropout = block_arch.get("dropout-rate", 0.0)
    ff_dropout = block_arch.get("pos-dropout-rate", 0.0)
    attn_dropout = block_arch.get("att-dropout-rate", 0.0)

    if pw_layer_type != "linear":
        raise NotImplementedError("Conformer block only supports linear yet.")

    ff_activation = get_activation(pw_activation_type)
    ff_args = (hidden_dim, ff_dim, ff_dropout, ff_activation)

    conv_args = None
    if with_conv:
        conv_activation = get_activation(conv_mod_activation_type)
        conv_args = (hidden_dim, block_arch["conv_mod_kernel"], conv_activation)

    def make_block():
        # Every call instantiates fresh sub-modules for one layer.
        return ConformerEncoderLayer(
            hidden_dim,
            self_attn_class(num_heads, hidden_dim, attn_dropout),
            PositionwiseFeedForward(*ff_args),
            PositionwiseFeedForward(*ff_args) if with_macaron else None,
            ConvolutionModule(*conv_args) if with_conv else None,
            block_dropout,
        )

    return make_block


def build_causal_conv1d_block(block_arch):
    """Return a factory creating one causal conv1d block.

    Args:
        block_arch (dict): causal conv1d block parameters ('idim', 'odim'
            and 'kernel_size' are read immediately)

    Returns:
        (function): function to create causal conv1d block

    """
    in_dim, out_dim, width = (
        block_arch[key] for key in ("idim", "odim", "kernel_size")
    )

    def make_block():
        return CausalConv1d(in_dim, out_dim, width)

    return make_block


def build_tdnn_block(block_arch):
    """Return a factory creating one TDNN block.

    Args:
        block_arch (dict): tdnn block parameters. 'idim', 'odim', 'ctx_size',
            'dilation' and 'stride' are mandatory; 'use-batch-norm' and
            'use-relu' default to False and 'dropout-rate' to 0.0.

    Returns:
        (function): function to create tdnn block

    """
    in_dim = block_arch["idim"]
    out_dim = block_arch["odim"]

    tdnn_options = {
        "ctx_size": block_arch["ctx_size"],
        "dilation": block_arch["dilation"],
        "stride": block_arch["stride"],
        "dropout_rate": block_arch.get("dropout-rate", 0.0),
        "batch_norm": block_arch.get("use-batch-norm", False),
        "relu": block_arch.get("use-relu", False),
    }

    def make_block():
        return TDNN(in_dim, out_dim, **tdnn_options)

    return make_block


def build_blocks(
    net_part,
    idim,
    input_layer,
    blocks_arch,
    repeat_block=0,
    self_attn_type="self_attn",
    positional_encoding_type="abs_pos",
    positionwise_layer_type="linear",
    positionwise_activation_type="relu",
    conv_mod_activation_type="relu",
    dropout_rate_embed=0.0,
    padding_idx=-1,
):
    """Assemble the input layer and the block stack of a custom architecture.

    Args:
        net_part (str): either 'encoder' or 'decoder'
        idim (int): dimension of inputs
        input_layer (str): input layer type
        blocks_arch (list): list of blocks for network part (type and parameters)
        repeat_block (int): repeat provided blocks N times if N > 1
        self_attn_type (str): self-attention type
        positional_encoding_type (str): positional encoding layer type
        positionwise_layer_type (str): linear
        positionwise_activation_type (str): positionwise activation type
        conv_mod_activation_type (str): convolutional module activation type
        dropout_rate_embed (float): dropout rate for embedding
        padding_idx (int): padding index for embedding input layer (if specified)

    Returns:
        in_layer (torch.nn.*): input layer
        all_blocks (MultiSequential): all blocks for network part
        out_dim (int): dimension of last block output
        conv_subsampling_factor (int): subsampling factor in frontend CNN

    """
    # Validate the architecture and derive the input layer settings from it.
    prepared = check_and_prepare(net_part, blocks_arch, input_layer)
    (
        input_layer,
        input_layer_odim,
        input_dropout_rate,
        input_pos_dropout_rate,
        out_dim,
    ) = prepared

    pos_enc_class, self_attn_class = get_pos_enc_and_att_class(
        net_part, positional_encoding_type, self_attn_type
    )

    in_layer, conv_subsampling_factor = build_input_layer(
        input_layer,
        idim,
        input_layer_odim,
        pos_enc_class,
        dropout_rate_embed,
        input_dropout_rate,
        input_pos_dropout_rate,
        padding_idx,
    )

    # One factory per declared block; the modules themselves are only
    # instantiated once the (possibly repeated) factory list is final.
    fn_modules = []
    for block_arch in blocks_arch:
        block_type = block_arch["type"]

        if block_type == "tdnn":
            module = build_tdnn_block(block_arch)
        elif block_type == "transformer":
            module = build_transformer_block(
                net_part,
                block_arch,
                positionwise_layer_type,
                positionwise_activation_type,
            )
        elif block_type == "conformer":
            module = build_conformer_block(
                block_arch,
                self_attn_class,
                positionwise_layer_type,
                positionwise_activation_type,
                conv_mod_activation_type,
            )
        elif block_type == "causal-conv1d":
            module = build_causal_conv1d_block(block_arch)

        fn_modules.append(module)

    if repeat_block > 1:
        fn_modules = fn_modules * repeat_block

    all_blocks = MultiSequential(*[make_block() for make_block in fn_modules])

    return in_layer, all_blocks, out_dim, conv_subsampling_factor


================================================
FILE: nets/pytorch_backend/transducer/causal_conv1d.py
================================================
"""CausalConv1d module definition for custom decoder."""

import torch


class CausalConv1d(torch.nn.Module):
    """1D convolution whose trailing padded frames are removed.

    The wrapped ``torch.nn.Conv1d`` is padded by ``(kernel_size - 1) * dilation``
    and the same number of frames is then cut from the end of its output.

    Args:
        idim (int): dimension of inputs
        odim (int): dimension of outputs
        kernel_size (int): size of convolving kernel
        stride (int): stride of the convolution
        dilation (int): spacing between the kernel points
        groups (int): number of blocked connections from ichannels to ochannels
        bias (bool): whether to add a learnable bias to the output

    """

    def __init__(
        self, idim, odim, kernel_size, stride=1, dilation=1, groups=1, bias=True
    ):
        """Construct a CausalConv1d object."""
        super().__init__()

        pad_frames = dilation * (kernel_size - 1)
        self._pad = pad_frames

        self.causal_conv1d = torch.nn.Conv1d(
            in_channels=idim,
            out_channels=odim,
            kernel_size=kernel_size,
            stride=stride,
            padding=pad_frames,
            dilation=dilation,
            groups=groups,
            bias=bias,
        )

    def forward(self, x, x_mask, cache=None):
        """Apply the causal convolution to x.

        Args:
            x (torch.Tensor): input tensor (B, U, idim)
            x_mask (torch.Tensor): mask (B, 1, U), returned unchanged
            cache: unused by this module

        Returns:
            x (torch.Tensor): convolved tensor, channels last
            x_mask (torch.Tensor): the mask given as input

        """
        # Conv1d expects channels first: (B, U, idim) -> (B, idim, U).
        conv_out = self.causal_conv1d(x.transpose(1, 2))

        # Drop the frames produced by the padding at the end of the sequence.
        if self._pad:
            conv_out = conv_out[..., : -self._pad]

        return conv_out.transpose(1, 2), x_mask


================================================
FILE: nets/pytorch_backend/transducer/custom_decoder.py
================================================
"""Custom decoder definition for transducer models."""

import torch

from espnet.nets.pytorch_backend.transducer.blocks import build_blocks
from espnet.nets.pytorch_backend.transducer.utils import check_batch_state
from espnet.nets.pytorch_backend.transducer.utils import check_state
from espnet.nets.pytorch_backend.transducer.utils import pad_sequence
from espnet.nets.pytorch_backend.transformer.layer_norm import LayerNorm
from espnet.nets.pytorch_backend.transformer.mask import subsequent_mask
from espnet.nets.transducer_decoder_interface import TransducerDecoderInterface


class CustomDecoder(TransducerDecoderInterface, torch.nn.Module):
    """Custom decoder module for transducer models.

    Args:
        odim (int): dimension of outputs
        dec_arch (list): list of layer definitions
        input_layer (str): input layer type
        repeat_block (int): repeat provided blocks N times if N > 1
        joint_activation_type (str): accepted for signature compatibility,
            not used by this module
        positional_encoding_type (str): positional encoding type
        positionwise_layer_type (str): linear
        positionwise_activation_type (str): positionwise activation type
        dropout_rate_embed (float): dropout rate for embedding layer (if specified)
        blank (int): blank symbol ID

    """

    def __init__(
        self,
        odim,
        dec_arch,
        input_layer="embed",
        repeat_block=0,
        joint_activation_type="tanh",
        positional_encoding_type="abs_pos",
        positionwise_layer_type="linear",
        positionwise_activation_type="relu",
        dropout_rate_embed=0.0,
        blank=0,
    ):
        """Construct a CustomDecoder object."""
        torch.nn.Module.__init__(self)

        self.embed, self.decoders, ddim, _ = build_blocks(
            "decoder",
            odim,
            input_layer,
            dec_arch,
            repeat_block=repeat_block,
            positional_encoding_type=positional_encoding_type,
            positionwise_layer_type=positionwise_layer_type,
            positionwise_activation_type=positionwise_activation_type,
            dropout_rate_embed=dropout_rate_embed,
            padding_idx=blank,
        )

        self.after_norm = LayerNorm(ddim)

        self.dlayers = len(self.decoders)
        self.dunits = ddim
        self.odim = odim

        self.blank = blank

    @staticmethod
    def _cache_key(yseq):
        """Build the cache key identifying a label sequence.

        Token ids are joined with a separator so that two different sequences
        can never share a key; without it, [1, 23] and [12, 3] would both be
        rendered as "123" and one hypothesis would reuse the other's output.

        Args:
            yseq (list): token ids of a hypothesis

        Returns:
            (str): key used in the states cache

        """
        return "_".join(map(str, yseq))

    def set_device(self, device):
        """Set GPU device to use.

        Must be called before `score` / `batch_score`, which read `self.device`.

        Args:
            device (torch.device): device id

        """
        self.device = device

    def init_state(self, batch_size=None, device=None, dtype=None):
        """Initialize decoder states.

        Args:
            batch_size: unused
            device: unused
            dtype: unused

        Returns:
            state (list): batch of decoder decoder states [L x None]

        """
        state = [None] * self.dlayers

        return state

    def forward(self, tgt, tgt_mask, memory):
        """Forward custom decoder.

        Args:
            tgt (torch.Tensor): input token ids, int64 (batch, maxlen_out)
                                if input_layer == "embed"
                                input tensor
                                (batch, maxlen_out, #mels) in the other cases
            tgt_mask (torch.Tensor): input token mask,  (batch, maxlen_out)
                                     dtype=torch.uint8 in PyTorch 1.2-
                                     dtype=torch.bool in PyTorch 1.2+ (include 1.2)
            memory (torch.Tensor): encoded memory, float32  (batch, maxlen_in, feat)
                                   (not used by this method)

        Return:
            tgt (torch.Tensor): decoder output (batch, maxlen_out, dim_dec)
            tgt_mask (torch.Tensor): score mask before softmax (batch, maxlen_out)

        """
        tgt = self.embed(tgt)

        tgt, tgt_mask = self.decoders(tgt, tgt_mask)
        tgt = self.after_norm(tgt)

        return tgt, tgt_mask

    def score(self, hyp, cache):
        """Forward one step.

        The decoder output and states of an already seen label sequence are
        taken from `cache`; otherwise they are computed and stored in it.

        Args:
            hyp (dataclass): hypothesis
            cache (dict): states cache

        Returns:
            y (torch.Tensor): decoder outputs (1, dec_dim)
            (list): decoder states
                [L x (1, max_len, dec_dim)]
            lm_tokens (torch.Tensor): token id for LM (1)

        """
        tgt = torch.tensor([hyp.yseq], device=self.device)
        lm_tokens = tgt[:, -1]

        str_yseq = self._cache_key(hyp.yseq)

        if str_yseq in cache:
            y, new_state = cache[str_yseq]
        else:
            # Keep the mask on the same device as the tokens it is applied to.
            tgt_mask = (
                subsequent_mask(len(hyp.yseq)).unsqueeze_(0).to(self.device)
            )

            state = check_state(hyp.dec_state, (tgt.size(1) - 1), self.blank)

            tgt = self.embed(tgt)

            new_state = []
            for s, decoder in zip(state, self.decoders):
                tgt, tgt_mask = decoder(tgt, tgt_mask, cache=s)
                # The output of each layer is the state reused at the next step.
                new_state.append(tgt)

            y = self.after_norm(tgt[:, -1])

            cache[str_yseq] = (y, new_state)

        return y[0], new_state, lm_tokens

    def batch_score(self, hyps, batch_states, cache, use_lm):
        """Forward batch one step.

        Hypotheses whose label sequence is already in `cache` are served from
        it; the remaining ones are padded, run through the decoder as a single
        batch and added to the cache.

        Args:
            hyps (list): batch of hypotheses
            batch_states (list): decoder states
                [L x (B, max_len, dec_dim)]
            cache (dict): states cache
            use_lm (bool): whether token ids for a LM have to be returned

        Returns:
            batch_y (torch.Tensor): decoder output (B, dec_dim)
            batch_states (list): decoder states
                [L x (B, max_len, dec_dim)]
            lm_tokens (torch.Tensor): batch of token ids for LM (B),
                None if use_lm is False

        """
        final_batch = len(hyps)

        process = []
        done = [None] * final_batch

        for i, hyp in enumerate(hyps):
            str_yseq = self._cache_key(hyp.yseq)

            if str_yseq in cache:
                done[i] = cache[str_yseq]
            else:
                process.append((str_yseq, hyp.yseq, hyp.dec_state))

        if process:
            _tokens = pad_sequence([p[1] for p in process], self.blank)
            # The legacy LongTensor constructor rejects a non-CPU `device`
            # argument, so the tensor is built first and moved afterwards.
            batch_tokens = torch.LongTensor(_tokens).to(self.device)

            tgt_mask = (
                subsequent_mask(batch_tokens.size(-1))
                .unsqueeze_(0)
                .to(self.device)
                .expand(len(process), -1, -1)
            )

            dec_state = self.create_batch_states(
                self.init_state(),
                [p[2] for p in process],
                _tokens,
            )

            tgt = self.embed(batch_tokens)

            next_state = []
            for s, decoder in zip(dec_state, self.decoders):
                tgt, tgt_mask = decoder(tgt, tgt_mask, cache=s)
                next_state.append(tgt)

            tgt = self.after_norm(tgt[:, -1])

        # `process` keeps the order of the hypotheses missing from the cache,
        # so `j` walks through the freshly computed batch in that same order.
        j = 0
        for i in range(final_batch):
            if done[i] is None:
                new_state = self.select_state(next_state, j)

                done[i] = (tgt[j], new_state)
                cache[process[j][0]] = (tgt[j], new_state)

                j += 1

        # create_batch_states fills `batch_states` in place and returns it.
        batch_states = self.create_batch_states(
            batch_states, [d[1] for d in done], [[0] + h.yseq for h in hyps]
        )
        batch_y = torch.stack([d[0] for d in done])

        if use_lm:
            lm_tokens = torch.LongTensor([hyp.yseq[-1] for hyp in hyps]).to(
                self.device
            )

            return batch_y, batch_states, lm_tokens

        return batch_y, batch_states, None

    def select_state(self, batch_states, idx):
        """Get decoder state from batch of states, for given id.

        Args:
            batch_states (list): batch of decoder states
                [L x (B, max_len, dec_dim)]
            idx (int): index to extract state from batch of states

        Returns:
            state_idx (list): decoder states for given id
                [L x (1, max_len, dec_dim)]

        """
        if batch_states[0] is None:
            # Uninitialized states are handed back untouched.
            return batch_states

        state_idx = [batch_states[layer][idx] for layer in range(self.dlayers)]

        return state_idx

    def create_batch_states(self, batch_states, l_states, check_list):
        """Create batch of decoder states.

        `batch_states` is updated in place, one entry per decoder layer.

        Args:
            batch_states (list): batch of decoder states
                [L x (B, max_len, dec_dim)]
            l_states (list): list of decoder states
                [B x [L x (1, max_len, dec_dim)]]
            check_list (list): list of sequences for max_len

        Returns:
            batch_states (list): batch of decoder states
                [L x (B, max_len, dec_dim)]

        """
        if l_states[0][0] is None:
            # Nothing to batch yet: the first hypothesis has no state.
            return batch_states

        max_len = max(len(elem) for elem in check_list) - 1

        for layer in range(self.dlayers):
            batch_states[layer] = check_batch_state(
                [s[layer] for s in l_states], max_len, self.blank
            )

        return batch_states


================================================
FILE: nets/pytorch_backend/transducer/custom_encoder.py
================================================
"""Cutom encoder definition for transducer models."""

import torch

from espnet.nets.pytorch_backend.transducer.blocks import build_blocks
from espnet.nets.pytorch_backend.transducer.vgg2l import VGG2L

from espnet.nets.pytorch_backend.transformer.layer_norm import LayerNorm
from espnet.nets.pytorch_backend.transformer.subsampling import Conv2dSubsampling


class CustomEncoder(torch.nn.Module):
    """Custom encoder module for transducer models.

    Args:
        idim (int): input dim
        enc_arch (list): list of encoder blocks (type and parameters)
        input_layer (str): input layer type
        repeat_block (int): repeat provided block N times if N > 1
        self_attn_type (str): type of self-attention
        positional_encoding_type (str): positional encoding type
        positionwise_layer_type (str): linear
        positionwise_activation_type (str): positionwise activation type
        conv_mod_activation_type (str): convolutional module activation type
        normalize_before (bool): whether to apply layer_norm to the outputs
        aux_task_layer_list (list): list of layer ids for intermediate output
            (None or empty: no intermediate output)
        padding_idx (int): padding_idx for embedding input layer (if specified)

    """

    def __init__(
        self,
        idim,
        enc_arch,
        input_layer="linear",
        repeat_block=0,
        self_attn_type="selfattn",
        positional_encoding_type="abs_pos",
        positionwise_layer_type="linear",
        positionwise_activation_type="relu",
        conv_mod_activation_type="relu",
        normalize_before=True,
        aux_task_layer_list=None,
        padding_idx=-1,
    ):
        """Construct an CustomEncoder object."""
        super().__init__()
        (
            self.embed,
            self.encoders,
            self.enc_out,
            self.conv_subsampling_factor,
        ) = build_blocks(
            "encoder",
            idim,
            input_layer,
            enc_arch,
            repeat_block=repeat_block,
            self_attn_type=self_attn_type,
            positional_encoding_type=positional_encoding_type,
            positionwise_layer_type=positionwise_layer_type,
            positionwise_activation_type=positionwise_activation_type,
            conv_mod_activation_type=conv_mod_activation_type,
            padding_idx=padding_idx,
        )

        self.normalize_before = normalize_before

        if self.normalize_before:
            self.after_norm = LayerNorm(self.enc_out)

        # Count the blocks that were actually built. Deriving the number from
        # `len(enc_arch) * repeat_block` gives 0 for the default repeat_block=0,
        # which makes the auxiliary-output path of `forward` skip every block.
        self.n_blocks = len(self.encoders)

        # A fresh list per instance instead of a shared mutable default.
        self.aux_task_layer_list = (
            [] if aux_task_layer_list is None else aux_task_layer_list
        )

    def forward(self, xs, masks, return_as_intermidiate=False):
        """Encode input sequence.

        Args:
            xs (torch.Tensor): input tensor
            masks (torch.Tensor): input mask
            return_as_intermidiate (bool): if True, return the raw output of the
                last block: no tuple unpacking, no final layer_norm and no
                auxiliary outputs

        Returns:
            xs (torch.Tensor or tuple):
                position embedded output or
                (position embedded output, auxiliary outputs)
            mask (torch.Tensor): position embedded mask

        """
        if self.embed is not None:
            if isinstance(self.embed, (Conv2dSubsampling, VGG2L)):
                # These input layers also transform the mask.
                xs, masks = self.embed(xs, masks)
            else:
                xs = self.embed(xs)

        if self.aux_task_layer_list:
            aux_xs_list = []

            # Blocks are run one by one so that the outputs of the requested
            # layers can be collected on the way.
            for b in range(self.n_blocks):
                xs, masks = self.encoders[b](xs, masks)

                if b in self.aux_task_layer_list:
                    aux_xs = xs[0] if isinstance(xs, tuple) else xs

                    if self.normalize_before:
                        aux_xs = self.after_norm(aux_xs)

                    aux_xs_list.append(aux_xs)
        else:
            xs, masks = self.encoders(xs, masks)

        # Keep xs exactly as the last block returned it; for conformer blocks
        # this is presumably a tuple that still carries the positional embedding.
        if return_as_intermidiate:
            return xs, masks

        if isinstance(xs, tuple):
            xs = xs[0]

        if self.normalize_before:
            xs = self.after_norm(xs)

        if self.aux_task_layer_list:
            return (xs, aux_xs_list), masks

        return xs, masks


================================================
FILE: nets/pytorch_backend/transducer/error_calculator.py
================================================
#!/usr/bin/env python3
# encoding: utf-8

"""CER/WER monitoring for transducer models."""

import editdistance

from espnet.nets.beam_search_transducer import BeamSearchTransducer


class ErrorCalculator(object):
    """Calculate CER and WER for transducer models.

    Hypotheses are produced with a beam search of size 1 built on the given
    decoder and joint network.

    Args:
        decoder (torch.nn.Module|TransducerDecoderInterface): decoder module
        joint_network (torch.nn.Module): joint network module
        token_list (list): list of tokens
        sym_space (str): space symbol
        sym_blank (str): blank symbol
        report_cer (boolean): compute CER option
        report_wer (boolean): compute WER option

    """

    def __init__(
        self,
        decoder,
        joint_network,
        token_list,
        sym_space,
        sym_blank,
        report_cer=False,
        report_wer=False,
    ):
        """Construct an ErrorCalculator object for transducer model."""
        super().__init__()

        self.beam_search = BeamSearchTransducer(
            decoder=decoder,
            joint_network=joint_network,
            beam_size=1,
        )
        self.decoder = decoder

        self.token_list = token_list
        self.space, self.blank = sym_space, sym_blank

        self.report_cer, self.report_wer = report_cer, report_wer

    def __call__(self, hs_pad, ys_pad):
        """Calculate sentence-level WER/CER score for transducer models.

        Args:
            hs_pad (torch.Tensor): batch of padded input sequence (batch, T, D)
            ys_pad (torch.Tensor): reference (batch, seqlen)

        Returns:
            (float): sentence-level CER score, None if report_cer is False
            (float): sentence-level WER score, None if report_wer is False

        """
        n_utts = int(hs_pad.size(0))

        # Decoding runs on the device that holds the decoder parameters.
        hs_pad = hs_pad.to(next(self.decoder.parameters()).device)

        # Keep the last hypothesis returned by the search for each utterance.
        best_hyps = [self.beam_search(hs_pad[b])[-1] for b in range(n_utts)]

        # The first label of each hypothesis is skipped.
        ys_hat = [best.yseq[1:] for best in best_hyps]

        seqs_hat, seqs_true = self.convert_to_char(ys_hat, ys_pad.cpu())

        cer = self.calculate_cer(seqs_hat, seqs_true) if self.report_cer else None
        wer = self.calculate_wer(seqs_hat, seqs_true) if self.report_wer else None

        return cer, wer

    def convert_to_char(self, ys_hat, ys_pad):
        """Convert index to character.

        The space symbol becomes " " in both outputs; the blank symbol is
        removed from the predictions only. Reference ids equal to -1 are skipped.

        Args:
            ys_hat (list): predicted token ids, one sequence per utterance
            ys_pad (torch.Tensor): reference (batch, seqlen)

        Returns:
            (list): text of each prediction
            (list): text of each reference

        """
        vocab = self.token_list
        seqs_hat, seqs_true = [], []

        for i, y_hat in enumerate(ys_hat):
            y_true = ys_pad[i]

            hyp_text = "".join([vocab[int(idx)] for idx in y_hat])
            ref_text = "".join([vocab[int(idx)] for idx in y_true if int(idx) != -1])

            hyp_text = hyp_text.replace(self.space, " ").replace(self.blank, "")
            ref_text = ref_text.replace(self.space, " ")

            seqs_hat.append(hyp_text)
            seqs_true.append(ref_text)

        return seqs_hat, seqs_true

    def calculate_cer(self, seqs_hat, seqs_true):
        """Calculate sentence-level CER score for transducer model.

        Spaces are ignored. The result is the summed edit distance divided by
        the summed reference length.

        Args:
            seqs_hat (list): text of each prediction
            seqs_true (list): text of each reference

        Returns:
            (float): average sentence-level CER score

        """
        total_errors, total_ref_len = 0, 0

        for i, hyp_text in enumerate(seqs_hat):
            ref_chars = seqs_true[i].replace(" ", "")
            hyp_chars = hyp_text.replace(" ", "")

            total_errors += editdistance.eval(hyp_chars, ref_chars)
            total_ref_len += len(ref_chars)

        return float(total_errors) / total_ref_len

    def calculate_wer(self, seqs_hat, seqs_true):
        """Calculate sentence-level WER score for transducer model.

        Texts are split on whitespace. The result is the summed edit distance
        divided by the summed number of reference words.

        Args:
            seqs_hat (list): text of each prediction
            seqs_true (list): text of each reference

        Returns:
            (float): average sentence-level WER score

        """
        total_errors, total_ref_len = 0, 0

        for i, hyp_text in enumerate(seqs_hat):
            ref_words = seqs_true[i].split()
            hyp_words = hyp_text.split()

            total_errors += editdistance.eval(hyp_words, ref_words)
            total_ref_len += len(ref_words)

        return float(total_errors) / total_ref_len


================================================
FILE: nets/pytorch_backend/transducer/initializer.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""Parameter initialization for transducer model."""

import math

from espnet.nets.pytorch_backend.initialization import set_forget_bias_to_one


def initializer(model, args):
    """Initialize transducer model.

    Parameters whose name contains "enc.", "dec." or "joint_network" are
    re-initialized according to their number of dimensions: 1-D parameters are
    zeroed, 2-D to 4-D parameters are drawn from a normal distribution with
    standard deviation 1 / sqrt(fan_in). Unless `args.dtype` is "custom", the
    decoder embedding is then drawn from N(0, 1) and the forget-gate biases of
    the decoder layers are set to one.

    Args:
        model (torch.nn.Module): transducer instance
        args (Namespace): argument Namespace containing options

    """
    for name, p in model.named_parameters():
        if not any(x in name for x in ["enc.", "dec.", "joint_network"]):
            continue

        # rnn based parts + joint network
        if p.dim() == 1:
            # bias
            p.data.zero_()
        elif p.dim() == 2:
            # linear weight
            stdv = 1.0 / math.sqrt(p.size(1))
            p.data.normal_(0, stdv)
        elif p.dim() in (3, 4):
            # conv weight: fan-in is input channels times the kernel size
            n = p.size(1)
            for k in p.size()[2:]:
                n *= k

            # Sample once, after the full fan-in is known, instead of
            # re-sampling the weight for every kernel dimension.
            stdv = 1.0 / math.sqrt(n)
            p.data.normal_(0, stdv)

    if args.dtype != "custom":
        model.dec.embed.weight.data.normal_(0, 1)

        for i in range(model.dec.dlayers):
            layer = model.dec.decoder[i]

            set_forget_bias_to_one(layer.bias_ih_l0)
            set_forget_bias_to_one(layer.bias_hh_l0)


================================================
FILE: nets/pytorch_backend/transducer/joint_network.py
================================================
"""Transducer joint network implementation."""

import torch

from espnet.nets.pytorch_backend.nets_utils import get_activation


class JointNetwork(torch.nn.Module):
    """Transducer joint network module.

    Args:
        vocab_size: Output size of the final linear layer
        encoder_output_size: Dimension of the encoder hidden states
        decoder_output_size: Dimension of the decoder hidden states
        joint_space_size: Dimension of joint space
        joint_activation_type: Activation type for joint network

    """

    def __init__(
        self,
        vocab_size: int,
        encoder_output_size: int,
        decoder_output_size: int,
        joint_space_size: int,
        joint_activation_type: str,
    ):
        """Joint network initializer."""
        super().__init__()

        self.lin_enc = torch.nn.Linear(encoder_output_size, joint_space_size)
        # The encoder projection already provides the bias of the sum.
        self.lin_dec = torch.nn.Linear(
            decoder_output_size, joint_space_size, bias=False
        )

        self.lin_out = torch.nn.Linear(joint_space_size, vocab_size)

        self.joint_activation = get_activation(joint_activation_type)

    def forward(
        self, h_enc: torch.Tensor, h_dec: torch.Tensor, is_aux: bool = False
    ) -> torch.Tensor:
        """Joint computation of z.

        Args:
            h_enc: Batch of expanded hidden state (B, T, 1, D_enc)
            h_dec: Batch of expanded hidden state (B, 1, U, D_dec)
            is_aux: If True, h_enc is added to the projected decoder state as
                is, without going through the encoder projection, so its last
                dimension has to be the joint space size

        Returns:
            z: Output (B, T, U, vocab_size)

        """
        if is_aux:
            z = self.joint_activation(h_enc + self.lin_dec(h_dec))
        else:
            z = self.joint_activation(self.lin_enc(h_enc) + self.lin_dec(h_dec))
        z = self.lin_out(z)

        return z


================================================
FILE: nets/pytorch_backend/transducer/loss.py
================================================
#!/usr/bin/env python3

"""Transducer loss module."""

import torch


class TransLoss(torch.nn.Module):
    """Transducer loss module.

    Args:
        trans_type (str): type of transducer implementation to calculate loss,
            either "warp-transducer" or "warp-rnnt".
        blank_id (int): blank symbol id

    Raises:
        ValueError: if trans_type is unknown, or is "warp-rnnt" while CUDA is
            not available
        ImportError: if trans_type is "warp-rnnt" and warp_rnnt is not installed

    """

    def __init__(self, trans_type, blank_id):
        """Construct an TransLoss object."""
        super().__init__()

        if trans_type == "warp-transducer":
            from warprnnt_pytorch import RNNTLoss

            self.trans_loss = RNNTLoss(blank=blank_id)
        elif trans_type == "warp-rnnt":
            if not torch.cuda.is_available():
                raise ValueError("warp-rnnt is not supported in CPU mode")

            try:
                from warp_rnnt import rnnt_loss
            except ImportError as err:
                raise ImportError(
                    "warp-rnnt is not installed. Please re-setup"
                    " espnet or use 'warp-transducer'"
                ) from err

            self.trans_loss = rnnt_loss
        else:
            # Fail at construction: otherwise `trans_loss` would be missing
            # and the problem would only surface in `forward`.
            raise ValueError(
                "Unsupported trans_type: {!r}."
                " Use 'warp-transducer' or 'warp-rnnt'".format(trans_type)
            )

        self.trans_type = trans_type
        self.blank_id = blank_id

    def forward(self, pred_pad, target, pred_len, target_len):
        """Compute transducer loss.

        Args:
            pred_pad (torch.Tensor): Batch of predicted sequences
                (batch, maxlen_in, maxlen_out+1, odim)
            target (torch.Tensor): Batch of target sequences (batch, maxlen_out)
            pred_len (torch.Tensor): batch of lengths of predicted sequences (batch)
            target_len (torch.tensor): batch of lengths of target sequences (batch)

        Returns:
            loss (torch.Tensor): transducer loss, in the dtype of pred_pad

        """
        dtype = pred_pad.dtype
        if dtype != torch.float32:
            # warp-transducer and warp-rnnt only support float32
            pred_pad = pred_pad.to(dtype=torch.float32)

        if self.trans_type == "warp-rnnt":
            # rnnt_loss is given log-probabilities, RNNTLoss the raw pred_pad.
            log_probs = torch.log_softmax(pred_pad, dim=-1)

            loss = self.trans_loss(
                log_probs,
                target,
                pred_len,
                target_len,
                reduction="mean",
                blank=self.blank_id,
                gather=True,
            )
        else:
            loss = self.trans_loss(pred_pad, target, pred_len, target_len)

        # Hand the loss back in the dtype the caller was working with.
        loss = loss.to(dtype=dtype)

        return loss


================================================
FILE: nets/pytorch_backend/transducer/rnn_decoder.py
================================================
"""RNN decoder for transducer-based models."""

import torch

from espnet.nets.transducer_decoder_interface import TransducerDecoderInterface


class DecoderRNNT(TransducerDecoderInterface, torch.nn.Module):
    """RNN-T Decoder module.

    Args:
        odim (int): dimension of outputs
        dtype (str): gru or lstm
        dlayers (int): # prediction layers
        dunits (int): # prediction units
        blank (int): blank symbol id
        embed_dim (int): dimension of embeddings
        dropout (float): dropout rate
        dropout_embed (float): embedding dropout rate

    """

    def __init__(
        self,
        odim,
        dtype,
        dlayers,
        dunits,
        blank,
        embed_dim,
        dropout=0.0,
        dropout_embed=0.0,
    ):
        """Transducer initializer."""
        super().__init__()

        self.embed = torch.nn.Embedding(odim, embed_dim, padding_idx=blank)
        self.dropout_embed = torch.nn.Dropout(p=dropout_embed)

        dec_net = torch.nn.LSTM if dtype == "lstm" else torch.nn.GRU

        # A stack of 1-layer RNNs: the first one reads the embeddings, the
        # following ones read the output of the previous layer.
        self.decoder = torch.nn.ModuleList(
            [dec_net(embed_dim, dunits, 1, batch_first=True)]
        )
        self.dropout_dec = torch.nn.Dropout(p=dropout)

        for _ in range(1, dlayers):
            self.decoder += [dec_net(dunits, dunits, 1, batch_first=True)]

        self.dlayers = dlayers
        self.dunits = dunits
        self.dtype = dtype

        self.odim = odim

        self.ignore_id = -1
        self.blank = blank

        self.multi_gpus = torch.cuda.device_count() > 1

    @staticmethod
    def _cache_key(yseq):
        """Build the cache key identifying a label sequence.

        Token ids are joined with a separator so that two different sequences
        can never share a key; without it, [1, 23] and [12, 3] would both be
        rendered as "123" and one hypothesis would reuse the other's output.

        Args:
            yseq (list): token ids of a hypothesis

        Returns:
            (str): key used in the states cache

        """
        return "_".join(map(str, yseq))

    def set_device(self, device):
        """Set GPU device to use.

        Args:
            device (torch.device): device id

        """
        self.device = device

    def set_data_type(self, data_type):
        """Set the data type of the decoder states.

        Args:
            data_type (torch.dtype): Tensor data type

        """
        self.data_type = data_type

    def init_state(self, batch_size):
        """Initialize decoder states.

        Reads `self.device` and `self.data_type`, which are set by
        `set_device` / `set_data_type` (`forward` calls both).

        Args:
            batch_size (int): Batch size

        Returns:
            (tuple): batch of decoder states
                ((L, B, dec_dim), (L, B, dec_dim))
                The second item is None unless dtype is "lstm".

        """
        h_n = torch.zeros(
            self.dlayers,
            batch_size,
            self.dunits,
            device=self.device,
            dtype=self.data_type,
        )

        if self.dtype == "lstm":
            c_n = torch.zeros(
                self.dlayers,
                batch_size,
                self.dunits,
                device=self.device,
                dtype=self.data_type,
            )

            return (h_n, c_n)

        return (h_n, None)

    def rnn_forward(self, y, state):
        """RNN forward.

        Args:
            y (torch.Tensor): batch of input features (B, emb_dim)
            state (tuple): batch of decoder states
                ((L, B, dec_dim), (L, B, dec_dim))

        Returns:
            y (torch.Tensor): batch of output features (B, dec_dim)
            (tuple): batch of decoder states
                (L, B, dec_dim), (L, B, dec_dim))

        """
        h_prev, c_prev = state
        # Fresh buffers, filled layer by layer with the new states.
        h_next, c_next = self.init_state(y.size(0))

        for layer in range(self.dlayers):
            if self.dtype == "lstm":
                y, (
                    h_next[layer : layer + 1],
                    c_next[layer : layer + 1],
                ) = self.decoder[layer](
                    y, hx=(h_prev[layer : layer + 1], c_prev[layer : layer + 1])
                )
            else:
                y, h_next[layer : layer + 1] = self.decoder[layer](
                    y, hx=h_prev[layer : layer + 1]
                )

            y = self.dropout_dec(y)

        return y, (h_next, c_next)

    def forward(self, hs_pad, ys_in_pad):
        """Forward function for transducer.

        `hs_pad` only provides the batch size, device and data type of the
        initial states.

        Args:
            hs_pad (torch.Tensor):
                batch of padded hidden state sequences (B, Tmax, D)
            ys_in_pad (torch.Tensor):
                batch of padded character id sequence tensor (B, Lmax+1)

        Returns:
            h_dec (torch.Tensor): output of the last decoder layer
                (B, Lmax+1, dec_dim)

        """
        self.set_device(hs_pad.device)
        self.set_data_type(hs_pad.dtype)

        state = self.init_state(hs_pad.size(0))
        eys = self.dropout_embed(self.embed(ys_in_pad))

        h_dec, _ = self.rnn_forward(eys, state)

        return h_dec

    def score(self, hyp, cache):
        """Forward one step.

        The decoder output and state of an already seen label sequence are
        taken from `cache`; otherwise they are computed and stored in it.

        Args:
            hyp (dataclass): hypothesis
            cache (dict): states cache

        Returns:
            y (torch.Tensor): decoder outputs (1, dec_dim)
            state (tuple): decoder states
                ((L, 1, dec_dim), (L, 1, dec_dim)),
            (torch.Tensor): token id for LM (1,)

        """
        vy = torch.full((1, 1), hyp.yseq[-1], dtype=torch.long, device=self.device)

        str_yseq = self._cache_key(hyp.yseq)

        if str_yseq in cache:
            y, state = cache[str_yseq]
        else:
            ey = self.embed(vy)

            y, state = self.rnn_forward(ey, hyp.dec_state)
            cache[str_yseq] = (y, state)

        return y[0][0], state, vy[0]

    def batch_score(self, hyps, batch_states, cache, use_lm):
        """Forward batch one step.

        Hypotheses whose label sequence is already in `cache` are served from
        it; the remaining ones are run through the decoder as a single batch
        and added to the cache.

        Args:
            hyps (list): batch of hypotheses
            batch_states (tuple): batch of decoder states
                ((L, B, dec_dim), (L, B, dec_dim))
            cache (dict): states cache
            use_lm (bool): whether a LM is used for decoding

        Returns:
            batch_y (torch.Tensor): decoder output (B, dec_dim)
            batch_states (tuple): batch of decoder states
                ((L, B, dec_dim), (L, B, dec_dim))
            lm_tokens (torch.Tensor): batch of token ids for LM (B),
                None if use_lm is False

        """
        final_batch = len(hyps)

        process = []
        done = [None] * final_batch

        for i, hyp in enumerate(hyps):
            str_yseq = self._cache_key(hyp.yseq)

            if str_yseq in cache:
                done[i] = cache[str_yseq]
            else:
                process.append((str_yseq, hyp.yseq[-1], hyp.dec_state))

        if process:
            tokens = torch.LongTensor([[p[1]] for p in process]).to(self.device)
            dec_state = self.create_batch_states(
                self.init_state(tokens.size(0)), [p[2] for p in process]
            )

            ey = self.embed(tokens)
            y, dec_state = self.rnn_forward(ey, dec_state)

        # `process` keeps the order of the hypotheses missing from the cache,
        # so `j` walks through the freshly computed batch in that same order.
        j = 0
        for i in range(final_batch):
            if done[i] is None:
                new_state = self.select_state(dec_state, j)

                done[i] = (y[j], new_state)
                cache[process[j][0]] = (y[j], new_state)

                j += 1

        batch_y = torch.cat([d[0] for d in done], dim=0)
        batch_states = self.create_batch_states(batch_states, [d[1] for d in done])

        if use_lm:
            # The legacy LongTensor constructor rejects a non-CPU `device`
            # argument, so the tensor is built first and moved afterwards.
            lm_tokens = torch.LongTensor([h.yseq[-1] for h in hyps]).to(self.device)

            return batch_y, batch_states, lm_tokens

        return batch_y, batch_states, None

    def select_state(self, batch_states, idx):
        """Get decoder state from batch of states, for given id.

        Args:
            batch_states (tuple): batch of decoder states
                ((L, B, dec_dim), (L, B, dec_dim))
            idx (int): index to extract state from batch of states

        Returns:
            (tuple): decoder states for given id
                ((L, 1, dec_dim), (L, 1, dec_dim))
                The second item is None unless dtype is "lstm".

        """
        return (
            batch_states[0][:, idx : idx + 1, :],
            batch_states[1][:, idx : idx + 1, :] if self.dtype == "lstm" else None,
        )

    def create_batch_states(self, batch_states, l_states, l_tokens=None):
        """Create batch of decoder states.

        The result is built from `l_states` only, by concatenation along the
        batch axis; `batch_states` and `l_tokens` are not read.

        Args:
            batch_states (tuple): batch of decoder states
               ((L, B, dec_dim), (L, B, dec_dim))
            l_states (list): list of decoder states
               [L x ((1, dec_dim), (1, dec_dim))]
            l_tokens: unused

        Returns:
            batch_states (tuple): batch of decoder states
                ((L, B, dec_dim), (L, B, dec_dim))
                The second item is None unless dtype is "lstm".

        """
        return (
            torch.cat([s[0] for s in l_states], dim=1),
            torch.cat([s[1] for s in l_states], dim=1)
            if self.dtype == "lstm"
            else None,
        )


================================================
FILE: nets/pytorch_backend/transducer/rnn_encoder.py
================================================
"""RNN encoder implementation for transducer-based models.

These classes are based on the ones in espnet.nets.pytorch_backend.rnn.encoders,
and modified to output intermediate layers representation based on a list of
layers given as input. These additional outputs are intended to be used with
auxiliary tasks.
It should be noted that, here, RNN class rely on a stack of 1-layer LSTM instead
of a multi-layer LSTM for that purpose.

"""

import argparse
import logging
from typing import List
from typing import Optional
from typing import Tuple
from typing import Union

import numpy as np
import torch
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence

from espnet.nets.e2e_asr_common import get_vgg2l_odim
from espnet.nets.pytorch_backend.nets_utils import make_pad_mask
from espnet.nets.pytorch_backend.nets_utils import to_device


class RNNP(torch.nn.Module):
    """RNN with projection layer module.

    Every layer is a 1-layer (B)LSTM or (B)GRU followed by a linear
    projection to ``hdim``.

    Args:
        idim: Dimension of inputs
        elayers: Number of encoder layers
        cdim: Number of units (results in cdim * 2 if bidirectional)
        hdim: Number of projection units
        subsample: List of subsampling number; entry ``layer + 1`` is the
            factor applied to the output of ``layer``
        dropout: Dropout rate
        typ: RNN type
        aux_task_layer_list: List of layer ids for intermediate output
            (None is treated as an empty list)

    """

    def __init__(
        self,
        idim: int,
        elayers: int,
        cdim: int,
        hdim: int,
        subsample: np.ndarray,
        dropout: float,
        typ: str = "blstm",
        aux_task_layer_list: Optional[List] = None,
    ):
        """Initialize RNNP module."""
        super(RNNP, self).__init__()

        bidir = typ[0] == "b"
        rnn_class = torch.nn.LSTM if "lstm" in typ else torch.nn.GRU
        rnn_prefix = "birnn" if bidir else "rnn"
        rnn_odim = 2 * cdim if bidir else cdim

        # Attribute names ("birnn%d"/"rnn%d" and "bt%d") and creation order are
        # kept as-is so that existing checkpoints still load.
        for i in range(elayers):
            inputdim = idim if i == 0 else hdim

            rnn = rnn_class(
                inputdim, cdim, num_layers=1, bidirectional=bidir, batch_first=True
            )

            setattr(self, "%s%d" % (rnn_prefix, i), rnn)
            setattr(self, "bt%d" % i, torch.nn.Linear(rnn_odim, hdim))

        self.elayers = elayers
        self.cdim = cdim
        self.subsample = subsample
        self.typ = typ
        self.bidir = bidir
        self.dropout = dropout

        # Avoid a mutable default argument shared between instances.
        self.aux_task_layer_list = (
            [] if aux_task_layer_list is None else aux_task_layer_list
        )

    def forward(
        self,
        xs_pad: torch.Tensor,
        ilens: torch.Tensor,
        prev_state: Optional[List] = None,
    ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor, List]], torch.Tensor, List]:
        """RNNP forward.

        Args:
            xs_pad: Batch of padded input sequences (B, Tmax, idim)
            ilens: Batch of lengths of input sequences (B)
            prev_state: Previous RNN states, indexed by layer. For
                bidirectional layers it is passed through
                reset_backward_rnn_state before use.

        Returns:
            : Batch of padded output sequences (B, Tmax, hdim)
                    or tuple w/ aux outputs ((B, Tmax, hdim), [L x (B, Tmax, hdim)])
            : Batch of lengths of output sequences (B)
            : List with the states returned by the RNN of each layer

        """
        logging.debug(self.__class__.__name__ + " input lengths: " + str(ilens))

        if not isinstance(ilens, torch.Tensor):
            ilens = torch.tensor(ilens)

        aux_xs_list = []
        elayer_states = []
        for layer in range(self.elayers):
            xs_pack = pack_padded_sequence(xs_pad, ilens.cpu(), batch_first=True)
            rnn = getattr(self, ("birnn" if self.bidir else "rnn") + str(layer))
            rnn.flatten_parameters()

            if prev_state is not None and rnn.bidirectional:
                prev_state = reset_backward_rnn_state(prev_state)

            ys, states = rnn(
                xs_pack, hx=None if prev_state is None else prev_state[layer]
            )
            elayer_states.append(states)

            ys_pad, ilens = pad_packed_sequence(ys, batch_first=True)

            sub = self.subsample[layer + 1]
            if sub > 1:
                ys_pad = ys_pad[:, ::sub]
                ilens = torch.tensor([int(i + 1) // sub for i in ilens])

            projection_layer = getattr(self, "bt%d" % layer)
            projected = projection_layer(ys_pad.contiguous().view(-1, ys_pad.size(2)))
            xs_pad = projected.view(ys_pad.size(0), ys_pad.size(1), -1)

            if layer in self.aux_task_layer_list:
                aux_xs_list.append(xs_pad)

            if layer < self.elayers - 1:
                # F.dropout defaults to training=True; the module mode has to be
                # passed explicitly, otherwise dropout stays active in eval mode.
                xs_pad = torch.tanh(
                    F.dropout(xs_pad, p=self.dropout, training=self.training)
                )

        if aux_xs_list:
            return (xs_pad, aux_xs_list), ilens, elayer_states

        return xs_pad, ilens, elayer_states


class RNN(torch.nn.Module):
    """RNN module.

    A stack of 1-layer (B)LSTM or (B)GRU followed by a single final
    projection shared with the auxiliary outputs.

    Args:
        idim: Dimension of inputs
        elayers: Number of encoder layers
        cdim: Number of rnn units (forward and backward outputs are summed
            if bidirectional, so layer outputs stay at cdim)
        hdim: Number of final projection units
        dropout: Dropout rate
        typ: The RNN type
        aux_task_layer_list: List of layer ids for intermediate output
            (None is treated as an empty list)

    """

    def __init__(
        self,
        idim: int,
        elayers: int,
        cdim: int,
        hdim: int,
        dropout: float,
        typ: str = "blstm",
        aux_task_layer_list: Optional[List] = None,
    ):
        """Initialize RNN module."""
        super(RNN, self).__init__()

        bidir = typ[0] == "b"
        layer_type = torch.nn.LSTM if "lstm" in typ else torch.nn.GRU

        # Attribute names and creation order are kept so checkpoints still load.
        for i in range(elayers):
            inputdim = idim if i == 0 else cdim

            rnn = layer_type(
                inputdim, cdim, num_layers=1, bidirectional=bidir, batch_first=True
            )

            setattr(self, "%s%d" % ("birnn" if bidir else "rnn", i), rnn)

        self.dropout = torch.nn.Dropout(p=dropout)

        self.elayers = elayers
        self.cdim = cdim
        self.hdim = hdim
        self.typ = typ
        self.bidir = bidir

        self.l_last = torch.nn.Linear(cdim, hdim)

        # Avoid a mutable default argument shared between instances.
        self.aux_task_layer_list = (
            [] if aux_task_layer_list is None else aux_task_layer_list
        )

    def _project(self, xs_pad: torch.Tensor) -> torch.Tensor:
        """Apply the final projection and tanh to a (B, T, cdim) tensor."""
        flat = xs_pad.contiguous().view(-1, xs_pad.size(2))
        projected = torch.tanh(self.l_last(flat))

        return projected.view(xs_pad.size(0), xs_pad.size(1), -1)

    def forward(
        self,
        xs_pad: torch.Tensor,
        ilens: torch.Tensor,
        prev_state: Optional[List] = None,
    ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor, List]], torch.Tensor, List]:
        """RNN forward.

        Args:
            xs_pad: Batch of padded input sequences (B, Tmax, idim)
            ilens: Batch of lengths of input sequences (B)
            prev_state: Previous RNN states, indexed by layer. For
                bidirectional layers it is passed through
                reset_backward_rnn_state before use.

        Returns:
            : Batch of padded output sequences (B, Tmax, hdim)
                    or tuple w/ aux outputs ((B, Tmax, hdim), [L x (B, Tmax, hdim)])
            : Batch of lengths of output sequences (B)
            : List with the states returned by the RNN of each layer

        """
        logging.debug(self.__class__.__name__ + " input lengths: " + str(ilens))

        if not isinstance(ilens, torch.Tensor):
            ilens = torch.tensor(ilens)

        aux_xs_list = []
        elayer_states = []
        for layer in range(self.elayers):
            xs_pack = pack_padded_sequence(xs_pad, ilens.cpu(), batch_first=True)

            rnn = getattr(self, ("birnn" if self.bidir else "rnn") + str(layer))
            rnn.flatten_parameters()

            if prev_state is not None and rnn.bidirectional:
                prev_state = reset_backward_rnn_state(prev_state)

            xs, states = rnn(
                xs_pack, hx=None if prev_state is None else prev_state[layer]
            )
            elayer_states.append(states)

            xs_pad, ilens = pad_packed_sequence(xs, batch_first=True)

            if self.bidir:
                # Merge both directions by summation to keep the size at cdim.
                xs_pad = xs_pad[:, :, : self.cdim] + xs_pad[:, :, self.cdim :]

            if layer in self.aux_task_layer_list:
                # Auxiliary outputs are taken before dropout is applied.
                aux_xs_list.append(self._project(xs_pad))

            if layer < self.elayers - 1:
                xs_pad = self.dropout(xs_pad)

        xs_pad = self._project(xs_pad)

        if aux_xs_list:
            return (xs_pad, aux_xs_list), ilens, elayer_states

        return xs_pad, ilens, elayer_states


def reset_backward_rnn_state(
    states: Union[torch.Tensor, Tuple, List]
) -> Union[torch.Tensor, Tuple, List]:
    """Set backward BRNN states to zeroes.

    Tensors are modified in place: every second entry along the first
    dimension (indices 1, 3, ...) is zeroed. Lists and tuples are traversed
    recursively, so a per-layer list of LSTM ``(h, c)`` pairs is supported
    as well as a single tensor or a flat list/tuple of tensors.

    Args:
        states: RNN states (tensor, or possibly nested list/tuple of tensors)

    Returns:
        states: The same object, with backward states set to zeroes

    """
    if isinstance(states, (list, tuple)):
        # Recurse instead of slice-assigning: an (h, c) tuple nested in a list
        # does not support item assignment.
        for state in states:
            reset_backward_rnn_state(state)
    elif states is not None:
        states[1::2] = 0.0

    return states


class VGG2L(torch.nn.Module):
    """VGG-like front-end: two blocks of two 3x3 convolutions and a max-pooling.

    Args:
        in_channel: number of input channels

    """

    def __init__(self, in_channel: int = 1):
        """Initialize VGG-like module."""
        super(VGG2L, self).__init__()

        # CNN layer (VGG motivated): first block maps to 64 channels,
        # second block to 128 channels.
        self.conv1_1 = torch.nn.Conv2d(in_channel, 64, 3, stride=1, padding=1)
        self.conv1_2 = torch.nn.Conv2d(64, 64, 3, stride=1, padding=1)
        self.conv2_1 = torch.nn.Conv2d(64, 128, 3, stride=1, padding=1)
        self.conv2_2 = torch.nn.Conv2d(128, 128, 3, stride=1, padding=1)

        self.in_channel = in_channel

    def forward(self, xs_pad: torch.Tensor, ilens: torch.Tensor, **kwargs):
        """VGG2L forward.

        Args:
            xs_pad: Batch of padded input sequences (B, Tmax, D)
            ilens: Batch of lengths of input sequences (B)
            **kwargs: Ignored extra keyword arguments

        Returns:
            : Batch of padded output sequences; time and feature axes are
              each halved twice (rounding up) and the 128 channels are
              merged into the feature axis
            : List of lengths of output sequences (B)
            : None (this module has no state)

        """
        logging.debug(self.__class__.__name__ + " input lengths: " + str(ilens))

        # (B, Tmax, D) -> (B, in_channel, Tmax, D // in_channel)
        feats_per_channel = xs_pad.size(2) // self.in_channel
        hs = xs_pad.view(
            xs_pad.size(0), xs_pad.size(1), self.in_channel, feats_per_channel
        )
        hs = hs.transpose(1, 2)

        conv_blocks = (
            (self.conv1_1, self.conv1_2),
            (self.conv2_1, self.conv2_2),
        )
        for first_conv, second_conv in conv_blocks:
            hs = F.relu(first_conv(hs))
            hs = F.relu(second_conv(hs))
            hs = F.max_pool2d(hs, 2, stride=2, ceil_mode=True)

        # Mirror the two ceil-mode poolings on the sequence lengths.
        if torch.is_tensor(ilens):
            lengths = ilens.cpu().numpy()
        else:
            lengths = np.array(ilens, dtype=np.float32)
        halved = np.array(np.ceil(lengths / 2), dtype=np.int64)
        quartered = np.array(
            np.ceil(np.array(halved, dtype=np.float32) / 2), dtype=np.int64
        )
        olens = quartered.tolist()

        # (B, C, T', D') -> (B, T', C * D')
        hs = hs.transpose(1, 2)
        hs = hs.contiguous().view(hs.size(0), hs.size(1), hs.size(2) * hs.size(3))

        return hs, olens, None


class Encoder(torch.nn.Module):
    """Encoder module.

    Builds an optional VGG2L front-end followed by either an RNNP (when
    ``etype`` ends with "p") or an RNN.

    Args:
        etype: Type of encoder network ("vgg" prefix and "p" suffix are
            optional around one of lstm, gru, blstm, bgru)
        idim: Number of dimensions of encoder network
        elayers: Number of layers of encoder network
        eunits: Number of RNN units of encoder network
        eprojs: Number of projection units of encoder network
        subsample: List of subsampling numbers
        dropout: Dropout rate
        in_channel: Number of input channels
        aux_task_layer_list: List of layer ids for intermediate output
            (None is treated as an empty list)

    """

    def __init__(
        self,
        etype: str,
        idim: int,
        elayers: int,
        eunits: int,
        eprojs: int,
        subsample: np.ndarray,
        dropout: float,
        in_channel: int = 1,
        aux_task_layer_list: Optional[List] = None,
    ):
        """Initialize Encoder module."""
        super(Encoder, self).__init__()

        if aux_task_layer_list is None:
            aux_task_layer_list = []

        typ = self._rnn_type(etype)
        if typ not in ["lstm", "gru", "blstm", "bgru"]:
            logging.error("Error: need to specify an appropriate encoder architecture")

        use_vgg = etype.startswith("vgg")
        use_projection = etype.endswith("p")

        modules = []
        if use_vgg:
            modules.append(VGG2L(in_channel))
            rnn_idim = get_vgg2l_odim(idim, in_channel=in_channel)
        else:
            rnn_idim = idim

        if use_projection:
            modules.append(
                RNNP(
                    rnn_idim,
                    elayers,
                    eunits,
                    eprojs,
                    subsample,
                    dropout,
                    typ=typ,
                    aux_task_layer_list=aux_task_layer_list,
                )
            )
        else:
            modules.append(
                RNN(
                    rnn_idim,
                    elayers,
                    eunits,
                    eprojs,
                    dropout,
                    typ=typ,
                    aux_task_layer_list=aux_task_layer_list,
                )
            )

        self.enc = torch.nn.ModuleList(modules)

        if use_vgg and use_projection:
            logging.info("Use CNN-VGG + " + typ.upper() + "P for encoder")
        elif use_vgg:
            logging.info("Use CNN-VGG + " + typ.upper() + " for encoder")
        elif use_projection:
            logging.info(typ.upper() + " with every-layer projection for encoder")
        else:
            logging.info(typ.upper() + " without projection for encoder")

        self.conv_subsampling_factor = 4 if use_vgg else 1

    @staticmethod
    def _rnn_type(etype: str) -> str:
        """Return the RNN type with the "vgg" prefix and "p" suffix removed."""
        # str.lstrip("vgg") strips any leading 'v'/'g' characters, which turns
        # "gru" into "ru"; remove the literal prefix instead.
        typ = etype[len("vgg") :] if etype.startswith("vgg") else etype

        return typ.rstrip("p")

    def forward(self, xs_pad, ilens, prev_states=None):
        """Forward encoder.

        Args:
            xs_pad: Batch of padded input sequences (B, Tmax, idim)
            ilens: Batch of lengths of input sequences (B)
            prev_states: Previous states, one entry per encoder sub-module
                (None means no previous state for any sub-module)

        Returns:
            : Batch of padded output sequences (B, Tmax, hdim)
                    or tuple w/ aux outputs ((B, Tmax, hdim), [L x (B, Tmax, hdim)])
            : Batch of lengths of output sequences (B)
            : List with the states returned by each encoder sub-module

        """
        if prev_states is None:
            prev_states = [None] * len(self.enc)
        assert len(prev_states) == len(self.enc)

        current_states = []
        for module, prev_state in zip(self.enc, prev_states):
            xs_pad, ilens, states = module(
                xs_pad,
                ilens,
                prev_state=prev_state,
            )
            current_states.append(states)

        if isinstance(xs_pad, tuple):
            final_xs_pad, aux_xs_list = xs_pad[0], xs_pad[1]
        else:
            final_xs_pad, aux_xs_list = xs_pad, None

        # Zero out the padded frames of the final (and auxiliary) outputs.
        mask = to_device(final_xs_pad, make_pad_mask(ilens).unsqueeze(-1))
        final_xs_pad = final_xs_pad.masked_fill(mask, 0.0)

        if aux_xs_list is None:
            return final_xs_pad, ilens, current_states

        aux_xs_list = [layer.masked_fill(mask, 0.0) for layer in aux_xs_list]

        return (final_xs_pad, aux_xs_list), ilens, current_states


def encoder_for(
    args: argparse.Namespace,
    idim: Union[int, List],
    subsample: np.ndarray,
    aux_task_layer_list: List = [],
) -> Union[torch.nn.Module, List[torch.nn.Module]]:
    """Build the encoder described by the program arguments.

    Args:
        args: The model arguments; etype, elayers, eunits, eprojs and
            dropout_rate are read
        idim: Dimension of inputs, forwarded unchanged to Encoder
        subsample: Subsample factors, forwarded unchanged to Encoder
        aux_task_layer_list: List of layer ids for intermediate output

    Returns:
        : The Encoder instance

    """
    encoder = Encoder(
        etype=args.etype,
        idim=idim,
        elayers=args.elayers,
        eunits=args.eunits,
        eprojs=args.eprojs,
        subsample=subsample,
        dropout=args.dropout_rate,
        aux_task_layer_list=aux_task_layer_list,
    )

    return encoder


================================================
FILE: nets/pytorch_backend/transducer/tdnn.py
================================================
"""TDNN modules definition for transformer encoder."""

import logging
from typing import Tuple
from typing import Union

import torch


class TDNN(torch.nn.Module):
    """Time-delay layer implemented as an unpadded 1D convolution over time.

    Args:
        idim: Dimension of inputs
        odim: Dimension of outputs
        ctx_size: Size of context window (convolution kernel size)
        dilation: Parameter to control the stride of
                  elements within the neighborhood
        stride: Stride of the sliding blocks
        batch_norm: Whether to use batch normalization
        relu: Whether to use non-linearity layer (ReLU)
        dropout_rate: Dropout rate

    """

    def __init__(
        self,
        idim: int,
        odim: int,
        ctx_size: int = 5,
        dilation: int = 1,
        stride: int = 1,
        batch_norm: bool = False,
        relu: bool = True,
        dropout_rate: float = 0.0,
    ):
        """Construct a TDNN object."""
        super().__init__()

        self.idim, self.odim = idim, odim
        self.ctx_size, self.stride, self.dilation = ctx_size, stride, dilation
        self.batch_norm, self.relu = batch_norm, relu

        self.tdnn = torch.nn.Conv1d(
            idim, odim, ctx_size, stride=stride, dilation=dilation
        )

        # Optional sub-modules only exist when the matching flag is set.
        if relu:
            self.relu_func = torch.nn.ReLU()

        if batch_norm:
            self.bn = torch.nn.BatchNorm1d(odim)

        self.dropout = torch.nn.Dropout(p=dropout_rate)

    def forward(
        self,
        x_input: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
        masks: torch.Tensor,
    ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], torch.Tensor]:
        """Forward TDNN.

        Args:
            x_input: Input tensor (B, T, idim) or ((B, T, idim), (B, T, att_dim))
            or ((B, T, idim), (B, 2*T-1, att_dim))
            masks: Input mask (B, 1, T)

        Returns:
            x_output: Output tensor (B, sub(T), odim)
                          or ((B, sub(T), odim), (B, sub(T), att_dim))
            mask: Output mask (B, 1, sub(T))

        """
        if isinstance(x_input, tuple):
            xs = x_input[0]
            pos_emb = x_input[1]
        else:
            xs = x_input
            pos_emb = None

        # A positional embedding of length 2*T-1 marks the rel_pos (as opposed
        # to legacy_rel_pos) variant of the Conformer model, which holds both
        # positive and negative positions. Note the `legacy_rel_pos` will be
        # deprecated in the future.
        # Details can be found in https://github.com/espnet/espnet/pull/2816.
        bidirect_pos = pos_emb is not None and pos_emb.size(1) == 2 * xs.size(1) - 1
        if bidirect_pos:
            logging.warning("Using bidirectional relative postitional encoding.")

        # Conv1d works on (B, C, T): swap axes around the convolution stack.
        hs = self.tdnn(xs.transpose(1, 2))

        if self.relu:
            hs = self.relu_func(hs)

        hs = self.dropout(hs)

        if self.batch_norm:
            hs = self.bn(hs)

        hs = hs.transpose(1, 2)

        return self.create_outputs(hs, pos_emb, masks, bidirect_pos=bidirect_pos)

    def create_outputs(
        self,
        xs: torch.Tensor,
        pos_emb: torch.Tensor,
        masks: torch.Tensor,
        bidirect_pos: bool = False,
    ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], torch.Tensor]:
        """Create outputs with subsampled version of pos_emb and masks.

        Args:
            xs: Output tensor (B, sub(T), odim)
            pos_emb: Input positional embedding tensor (B, T, att_dim)
            or (B, 2*T-1, att_dim)
            masks: Input mask (B, 1, T)
            bidirect_pos: whether to use bidirectional positional embedding

        Returns:
            xs: Output tensor (B, sub(T), odim)
            pos_emb: Output positional embedding tensor (B, sub(T), att_dim)
            or (B, 2*sub(T)-1, att_dim)
            masks: Output mask (B, 1, sub(T))

        """
        # Number of frames removed by the unpadded convolution.
        sub = (self.ctx_size - 1) * self.dilation
        stride = self.stride

        def shrink(seq):
            """Drop the last `sub` steps of axis 1, then keep every stride-th."""
            trimmed = seq[:, :-sub, :] if sub != 0 else seq
            return trimmed[:, ::stride, :]

        if masks is not None:
            trimmed_masks = masks[:, :, :-sub] if sub != 0 else masks
            masks = trimmed_masks[:, :, ::stride]

        if pos_emb is None:
            return xs, masks

        if bidirect_pos:
            # If the bidirect_pos is true, the pos_emb will include both positive
            # and negative embeddings, which share the center element.
            # Refer to https://github.com/espnet/espnet/pull/2816.
            center = pos_emb.size(1) // 2
            positive = shrink(pos_emb[:, : center + 1, :])
            negative = shrink(pos_emb[:, center:, :])

            # Skip the first element of the negative half when joining.
            pos_emb = torch.cat([positive, negative[:, 1:, :]], dim=1)
        else:
            pos_emb = shrink(pos_emb)

        return (xs, pos_emb), masks


================================================
FILE: nets/pytorch_backend/transducer/transformer_decoder_layer.py
================================================
"""Decoder layer definition for transformer-transducer models."""

import torch
from torch import nn

from espnet.nets.pytorch_backend.transformer.layer_norm import LayerNorm


class DecoderLayer(nn.Module):
    """Single decoder layer module for transformer-transducer models.

    The layer applies layer normalization before the self-attention block
    and before the feed-forward block, each wrapped in a residual connection.

    Args:
        size (int): input dim
        self_attn (MultiHeadedAttention): self attention module
        feed_forward (PositionwiseFeedForward): feed forward layer module
        dropout_rate (float): dropout rate

    """

    def __init__(self, size, self_attn, feed_forward, dropout_rate):
        """Construct an DecoderLayer object."""
        super().__init__()

        self.self_attn = self_attn
        self.feed_forward = feed_forward

        self.norm1 = LayerNorm(size)
        self.norm2 = LayerNorm(size)

        self.dropout = nn.Dropout(dropout_rate)

        self.size = size

    def forward(self, tgt, tgt_mask, cache=None):
        """Compute decoded features.

        Args:
            tgt (torch.Tensor): decoded previous target features (B, Lmax, idim)
            tgt_mask (torch.Tensor): mask for tgt; when a cache is given only
                its last row along dim 1 is kept
            cache (torch.Tensor): cached output (B, Lmax-1, idim)

        Returns:
            tgt (torch.Tensor): decoder target features (B, Lmax, odim)
            tgt_mask (torch.Tensor): mask for tgt (reduced to its last row
                along dim 1 when a cache is given)
        """
        normed = self.norm1(tgt)
        use_cache = cache is not None

        if use_cache:
            # Only the newest position is computed; older ones come from cache.
            assert cache.shape == (
                normed.shape[0],
                normed.shape[1] - 1,
                self.size,
            ), f"{cache.shape} == {(normed.shape[0], normed.shape[1] - 1, self.size)}"

            query = normed[:, -1:, :]
            shortcut = tgt[:, -1:, :]

            if tgt_mask is not None:
                tgt_mask = tgt_mask[:, -1:, :]
        else:
            query = normed
            shortcut = tgt

        attended = self.self_attn(query, normed, normed, tgt_mask)
        hidden = shortcut + self.dropout(attended)

        hidden = hidden + self.dropout(self.feed_forward(self.norm2(hidden)))

        if use_cache:
            hidden = torch.cat([cache, hidden], dim=1)

        return hidden, tgt_mask


================================================
FILE: nets/pytorch_backend/transducer/utils.py
================================================
"""Utility functions for transducer models."""

import os

import numpy as np
import torch

from espnet.nets.pytorch_backend.nets_utils import pad_list


def prepare_loss_inputs(ys_pad, hlens, blank_id=0, ignore_id=-1):
    """Build the tensors needed to compute the transducer loss.

    Args:
        ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax)
        hlens (torch.Tensor): batch of hidden sequence lengths (B)
                              or batch of masks (B, 1, Tmax)
        blank_id (int): index of blank label
        ignore_id (int): index of initial padding

    Returns:
        ys_in_pad (torch.Tensor): batch of padded blank + target sequences,
            padded with blank_id (B, Lmax + 1)
        ys_out_pad (torch.Tensor): batch of padded target + blank sequences,
            padded with ignore_id (B, Lmax + 1)
        target (torch.Tensor): batch of padded target sequences (B, Lmax)
        pred_len (torch.Tensor): batch of hidden sequence lengths (B)
        target_len (torch.Tensor): batch of output sequence lengths (B)

    """
    device = ys_pad.device

    # Strip the padding from every target sequence.
    ys = [y[y != ignore_id] for y in ys_pad]
    blank = ys[0].new([blank_id])

    ys_in = [torch.cat([blank, y], dim=0) for y in ys]
    ys_out = [torch.cat([y, blank], dim=0) for y in ys]

    ys_in_pad = pad_list(ys_in, blank_id)
    ys_out_pad = pad_list(ys_out, ignore_id)

    target = pad_list(ys, blank_id).type(torch.int32).to(device)
    target_len = torch.IntTensor([y.size(0) for y in ys]).to(device)

    if torch.is_tensor(hlens):
        if hlens.dim() > 1:
            # Masks: the length is the number of non-zero entries per item.
            hlens = [int(mask[mask != 0].size(0)) for mask in hlens]
        else:
            hlens = [int(length) for length in hlens]

    pred_len = torch.IntTensor(hlens).to(device)

    return ys_in_pad, ys_out_pad, target, pred_len, target_len


def valid_aux_task_layer_list(aux_layer_ids, enc_num_layers):
    """Check whether input list of auxiliary layer ids is valid.

       Return the valid list sorted with duplicates removed.

    Args:
        aux_layer_ids (list): Auxiliary layers ids
        enc_num_layers (int): Number of encoder layers

    Returns:
        valid (list): Validated list of layers for auxiliary task

    Raises:
        ValueError: If aux_layer_ids is not a non-empty list of integers, or
            if any id lies outside [0, enc_num_layers - 1].

    """
    if (
        not isinstance(aux_layer_ids, list)
        or not aux_layer_ids
        or not all(isinstance(layer, int) for layer in aux_layer_ids)
    ):
        raise ValueError("--aux-task-layer-list argument takes a list of layer ids.")

    # Going through a set drops repeated ids, as promised by the contract.
    sorted_list = sorted(set(aux_layer_ids))
    valid = [layer for layer in sorted_list if 0 <= layer < enc_num_layers]

    if sorted_list != valid:
        raise ValueError(
            "Provided list of layer ids for auxiliary task is incorrect. "
            "IDs should be between [0, %d]" % (enc_num_layers - 1)
        )

    return valid


def is_prefix(x, pref):
    """Tell whether one token id sequence strictly starts another.

    Args:
        x (list): token id sequence
        pref (list): token id sequence

    Returns:
       (boolean): whether pref is a prefix of x; always False when pref
           is not shorter than x.

    """
    # A prefix has to be strictly shorter than the full sequence.
    if len(pref) >= len(x):
        return False

    return not any(p_tok != x_tok for p_tok, x_tok in zip(pref, x))


def substract(x, subset):
    """Keep the hypotheses of x whose token id sequence is absent from subset.

    Args:
        x (list): set of hypotheses
        subset (list): subset of hypotheses

    Returns:
       final (list): new list with the remaining hypotheses of x, in their
           original order

    """
    final = [
        hyp for hyp in x if not any(hyp.yseq == other.yseq for other in subset)
    ]

    return final


def select_lm_state(lm_states, idx, lm_layers, is_wordlm):
    """Get LM state from batch for given id.

    Args:
        lm_states (list or dict): batch of LM states
        idx (int): index to extract state from batch state
        lm_layers (int): number of LM layers
        is_wordlm (bool): whether provided LM is a word-LM

    Returns:
       idx_state (dict): LM state for given id. For a word-LM this is simply
           ``lm_states[idx]``. Otherwise it holds per-layer lists under "h"
           and, only when the batch state has one, under "c".

    """
    if is_wordlm:
        return lm_states[idx]

    idx_state = {}

    # Non-LSTM language models carry no cell state (see init_lm_state), so
    # "c" is only selected when it is present in the batch state.
    if "c" in lm_states:
        idx_state["c"] = [lm_states["c"][layer][idx] for layer in range(lm_layers)]

    idx_state["h"] = [lm_states["h"][layer][idx] for layer in range(lm_layers)]

    return idx_state


def create_lm_batch_state(lm_states_list, lm_layers, is_wordlm):
    """Create batch of LM states.

    Args:
        lm_states_list (list): list of individual LM states
        lm_layers (int): number of LM layers
        is_wordlm (bool): whether provided LM is a word-LM

    Returns:
       batch_states (list or dict): batch of LM states. For a word-LM the
           input list is returned unchanged. Otherwise a dict with, for each
           layer, the individual states stacked along a new first dimension
           under "h" and, only when the individual states have one, under "c".

    """
    if is_wordlm:
        return lm_states_list

    def stack_layers(key):
        """Stack the `key` entry of every individual state, layer by layer."""
        return [
            torch.stack([state[key][layer] for state in lm_states_list])
            for layer in range(lm_layers)
        ]

    batch_states = {}

    # Non-LSTM language models carry no cell state (see init_lm_state), so
    # "c" is only batched when the individual states provide it.
    if any("c" in state for state in lm_states_list):
        batch_states["c"] = stack_layers("c")

    batch_states["h"] = stack_layers("h")

    return batch_states


def init_lm_state(lm_model):
    """Create an all-zero initial LM state.

    Args:
        lm_model (torch.nn.Module): LM module; its ``rnn``, ``typ`` and
            ``n_units`` attributes are read

    Returns:
        lm_state (dict): initial LM state with one zero vector of size
            ``n_units`` per layer under "h", and also under "c" when the
            LM type is "lstm"

    """
    lm_layers = len(lm_model.rnn)
    lm_units = lm_model.n_units

    # New states follow the device and dtype of the model parameters.
    reference = next(lm_model.parameters())

    def zero_states():
        """Return one zero vector per LM layer."""
        return [
            torch.zeros(lm_units).to(device=reference.device, dtype=reference.dtype)
            for _ in range(lm_layers)
        ]

    lm_state = {"h": zero_states()}

    if lm_model.typ == "lstm":
        lm_state["c"] = zero_states()

    return lm_state


def recombine_hyps(hyps, mmi_weight):
    """Recombine hypotheses with equivalent output sequence.

    The first hypothesis seen for a given token sequence is kept and its
    ``score`` is updated in place with the log-sum of the scores of all
    later hypotheses sharing that sequence. Hypotheses with an empty
    sequence are never merged.

    Args:
        hyps (list): list of hypotheses (``yseq``, ``score`` and
            ``mmi_tot_score`` attributes are read)
        mmi_weight (float): weight with which the MMI score contributes to
            ``score``

    Returns:
       final (list): list of recombined hypotheses

    """
    final = []

    # Maps a non-empty token sequence to the hypothesis kept for it. This
    # replaces a list that was rebuilt for every hypothesis and whose indices
    # no longer matched `final` once an empty sequence had been kept.
    kept_by_seq = {}

    for hyp in hyps:
        key = tuple(hyp.yseq)
        kept = kept_by_seq.get(key) if key else None

        if kept is None:
            final.append(hyp)
            if key:
                kept_by_seq[key] = hyp
            continue

        # for the same u, t, MMI score should be the same.
        # NOTE(review): this check is one-sided (no abs()); confirm whether
        # an absolute difference was intended before tightening it.
        assert (kept.mmi_tot_score - hyp.mmi_tot_score) < 1e-5
        mmi_score = hyp.mmi_tot_score

        # the MMI score should not be combined: it is independent to paths.
        # It is removed before the log-sum and added back once afterwards.
        kept.score = np.logaddexp(
            kept.score - mmi_score * mmi_weight, hyp.score - mmi_score * mmi_weight
        )
        kept.score += mmi_weight * mmi_score

    return final


def pad_sequence(seqlist, pad_token):
    """Left pad token id sequences to the length of the longest one.

    Args:
        seqlist (list): list of token id sequences
        pad_token (int): padding token id

    Returns:
        final (list): list of padded token id sequences

    """
    maxlen = max(map(len, seqlist))

    final = []
    for seq in seqlist:
        # Padding goes in front, so sequences end up right-aligned.
        n_pad = maxlen - len(seq)
        final.append([pad_token] * n_pad + seq)

    return final


def check_state(state, max_len, pad_token):
    """Left pad or trim the decoder states so their length equals max_len.

    Args:
        state (list): list of L decoder states; the length is read from
            dim 1 and the feature size from dim 2
        max_len (int): maximum length authorized
        pad_token (int): padding token id

    Returns:
        final (list): list of L decoder states. The input list itself is
            returned when it is None, when max_len < 1 or when the length
            already matches; it is trimmed in place when too long. A new
            list of (1, max_len, dec_dim) tensors is returned when padding
            is needed.

    """
    if state is None or max_len < 1:
        return state

    curr_len = state[0].size(1)

    if curr_len == max_len:
        return state

    if curr_len > max_len:
        # Keep only the most recent max_len steps; the list is updated in place.
        trim_val = int(curr_len - max_len)
        state[:] = [layer_state[:, trim_val:, :] for layer_state in state]

        return state

    ddim = state[0].size(2)

    final = []
    for layer_state in state:
        padded = state[0].data.new(1, max_len, ddim).fill_(pad_token)
        # Right-align the existing steps; the front keeps the padding value.
        padded[:, (max_len - layer_state.size(1)) :, :] = layer_state
        final.append(padded)

    return final


def check_batch_state(state, max_len, pad_token):
    """Stack a batch of decoder states, left padding or trimming to max_len.

    Args:
        state (list): list of B decoder states (?, dec_dim)
        max_len (int): maximum length authorized
        pad_token (int): padding token id

    Returns:
        final (torch.Tensor): decoder states (B, max_len, dec_dim)

    """
    batch = len(state)
    dec_dim = state[0].size(1)
    final = state[0].data.new(batch, max_len, dec_dim).fill_(pad_token)

    for b, hyp_state in enumerate(state):
        n_steps = hyp_state.size(0)

        if n_steps >= max_len:
            # Long enough: keep the most recent max_len steps.
            final[b] = hyp_state[(n_steps - max_len) :]
        else:
            # Too short: right-align, leaving pad values on the left.
            final[b, (max_len - n_steps) :] = hyp_state

    return final


def custom_torch_load(model_path, model, training=True):
    """Load saved parameters from disk into a transducer model.

    Args:
        model_path (str): Model path. When the file name contains
            "snapshot", parameters are read from the "model" entry of the
            loaded checkpoint; otherwise the loaded object is used directly.
        model (torch.nn.Module): The model receiving the parameters
        training (bool): When False, the loaded mapping is rebuilt as a
            plain dict before loading. The filter that used to drop the
            "aux*" (training-only) entries is disabled, so every key is kept.

    """
    is_snapshot = "snapshot" in os.path.basename(model_path)

    # Always map storages to CPU, whatever device the file was saved from.
    loaded = torch.load(model_path, map_location=lambda storage, loc: storage)
    model_state_dict = loaded["model"] if is_snapshot else loaded

    if not training:
        model_state_dict = dict(model_state_dict.items())

    model.load_state_dict(model_state_dict)

    del model_state_dict


================================================
FILE: nets/pytorch_backend/transducer/vgg2l.py
================================================
"""VGG2L module definition for transformer encoder."""

from typing import Tuple
from typing import Union

import torch


class VGG2L(torch.nn.Module):
    """VGG-style convolutional front-end for the custom encoder.

    Two conv/conv/pool stages are followed by a linear projection to odim.
    The pooling layers shrink the time axis by 3 and then by 2.

    Args:
        idim: Dimension of inputs
        odim: Dimension of outputs
        pos_enc: Positional encoding module applied after the projection
            (skipped when None)

    """

    def __init__(self, idim: int, odim: int, pos_enc: torch.nn.Module = None):
        """Construct a VGG2L object."""
        super().__init__()

        conv_stack = [
            torch.nn.Conv2d(1, 64, 3, stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.Conv2d(64, 64, 3, stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d((3, 2)),
            torch.nn.Conv2d(64, 128, 3, stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.Conv2d(128, 128, 3, stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d((2, 2)),
        ]
        self.vgg2l = torch.nn.Sequential(*conv_stack)

        # Each pooling layer halves the feature axis; 128 channels remain.
        flat_dim = 128 * ((idim // 2) // 2)
        projection = torch.nn.Linear(flat_dim, odim)

        if pos_enc is None:
            self.output = projection
        else:
            self.output = torch.nn.Sequential(projection, pos_enc)

    def forward(
        self, x: torch.Tensor, x_mask: torch.Tensor
    ) -> Union[
        Tuple[torch.Tensor, torch.Tensor],
        Tuple[Tuple[torch.Tensor, torch.Tensor], torch.Tensor],
    ]:
        """Apply the convolutional stack and the output projection to x.

        Args:
            x: Input tensor (B, T, idim)
            x_mask: Input mask (B, 1, T), or None

        Returns:
            x: Output tensor (B, sub(T), odim)
                   or ((B, sub(T), odim), (B, sub(T), att_dim))
            x_mask: Output mask (B, 1, sub(T)); None when no mask was given

        """
        # Add the single input channel expected by Conv2d.
        feats = self.vgg2l(x.unsqueeze(1))
        b, c, t, f = feats.size()

        # Fold channels and features together: (B, C, T, F) -> (B, T, C * F).
        feats = feats.transpose(1, 2).contiguous().view(b, t, c * f)
        out = self.output(feats)

        if x_mask is None:
            return out, x_mask

        return out, self.create_new_mask(x_mask)

    def create_new_mask(self, x_mask: torch.Tensor) -> torch.Tensor:
        """Subsample x_mask the same way the pooling layers subsample time.

        Args:
            x_mask: Input mask (B, 1, T)

        Returns:
            x_mask: Output mask (B, 1, sub(T))

        """
        # One pass per pooling layer: drop the incomplete tail window, then
        # keep the first frame of every window.
        for factor in (3, 2):
            length = x_mask.size(2)
            usable = length - (length % factor)
            x_mask = x_mask[:, :, :usable][:, :, ::factor]

        return x_mask


================================================
FILE: nets/pytorch_backend/transformer/__init__.py
================================================
"""Initialize sub package."""


================================================
FILE: nets/pytorch_backend/transformer/add_sos_eos.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright 2019 Shigeki Karita
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Unility funcitons for Transformer."""

import torch


def add_sos_eos(ys_pad, sos, eos, ignore_id):
    """Build decoder inputs and targets by adding <sos> and <eos> labels.

    :param torch.Tensor ys_pad: batch of padded target sequences (B, Lmax)
    :param int sos: index of <sos>
    :param int eos: index of <eos>
    :param int ignore_id: index of padding
    :return: <sos>-prefixed sequences, batched by pad_list with eos as pad value
    :rtype: torch.Tensor
    :return: <eos>-suffixed sequences, batched by pad_list with ignore_id
        as pad value
    :rtype: torch.Tensor
    """
    from espnet.nets.pytorch_backend.nets_utils import pad_list

    sos_label = ys_pad.new([sos])
    eos_label = ys_pad.new([eos])

    ys_in = []
    ys_out = []
    for y in ys_pad:
        tokens = y[y != ignore_id]  # drop the padding entries
        ys_in.append(torch.cat([sos_label, tokens], dim=0))
        ys_out.append(torch.cat([tokens, eos_label], dim=0))

    return pad_list(ys_in, eos), pad_list(ys_out, ignore_id)


================================================
FILE: nets/pytorch_backend/transformer/argument.py
================================================
# Copyright 2020 Hirofumi Inaguma
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Transformer common arguments."""


from distutils.util import strtobool


def add_arguments_transformer_common(group):
    """Register the arguments shared by Transformer models on ``group``.

    Args:
        group: argparse parser or argument group; only its ``add_argument``
            method is used.

    Returns:
        The same ``group`` object that was passed in.

    """
    add = group.add_argument

    # Layer types shared by the encoder and decoder self-attention options.
    ldconv_types = [
        "lightconv",
        "lightconv2d",
        "dynamicconv",
        "dynamicconv2d",
        "light-dynamicconv2d",
    ]
    ldconv_kernel_help = (
        "kernel size for lightweight/dynamic convolution: "
        '{} side. For example, "21_23_25" means kernel length 21 for '
        "First layer, 23 for Second layer and so on."
    )

    # General transformer options
    add(
        "--transformer-init",
        default="pytorch",
        type=str,
        choices=[
            "pytorch",
            "xavier_uniform",
            "xavier_normal",
            "kaiming_uniform",
            "kaiming_normal",
        ],
        help="how to initialize transformer parameters",
    )
    add(
        "--transformer-input-layer",
        default="conv2d",
        type=str,
        choices=["conv2d", "linear", "embed"],
        help="transformer input layer type",
    )
    add(
        "--transformer-attn-dropout-rate",
        type=float,
        default=None,
        help="dropout in transformer attention. use --dropout-rate if None is set",
    )
    add(
        "--transformer-lr",
        type=float,
        default=10.0,
        help="Initial value of learning rate",
    )
    add(
        "--transformer-warmup-steps",
        type=int,
        default=25000,
        help="optimizer warmup steps",
    )
    add(
        "--transformer-length-normalized-loss",
        type=strtobool,
        default=True,
        help="normalize loss by length",
    )
    add(
        "--transformer-encoder-selfattn-layer-type",
        default="selfattn",
        type=str,
        choices=["selfattn", "rel_selfattn"] + ldconv_types,
        help="transformer encoder self-attention layer type",
    )
    add(
        "--transformer-decoder-selfattn-layer-type",
        default="selfattn",
        type=str,
        choices=["selfattn"] + ldconv_types,
        help="transformer decoder self-attention layer type",
    )

    # Lightweight/Dynamic convolution related parameters.
    # See https://arxiv.org/abs/1912.11793v2
    # and https://arxiv.org/abs/1901.10430 for detail of the method.
    # Configurations used in the first paper are in
    # egs/{csj, librispeech}/asr1/conf/tuning/ld_conv/
    add(
        "--wshare",
        type=int,
        default=4,
        help="Number of parameter shargin for lightweight convolution",
    )
    add(
        "--ldconv-encoder-kernel-length",
        type=str,
        default="21_23_25_27_29_31_33_35_37_39_41_43",
        help=ldconv_kernel_help.format("Encoder"),
    )
    add(
        "--ldconv-decoder-kernel-length",
        type=str,
        default="11_13_15_17_19_21",
        help=ldconv_kernel_help.format("Decoder"),
    )
    add(
        "--ldconv-usebias",
        default=False,
        type=strtobool,
        help="use bias term in lightweight/dynamic convolution",
    )
    add(
        "--dropout-rate",
        type=float,
        default=0.0,
        help="Dropout rate for the encoder",
    )

    # Encoder
    add(
        "--elayers",
        type=int,
        default=4,
        help="Number of encoder layers (for shared recognition part "
        "in multi-speaker asr mode)",
    )
    add(
        "--eunits",
        "-u",
        type=int,
        default=300,
        help="Number of encoder hidden units",
    )

    # Attention
    add(
        "--adim",
        type=int,
        default=320,
        help="Number of attention transformation dimensions",
    )
    add(
        "--aheads",
        type=int,
        default=4,
        help="Number of heads for multi head attention",
    )

    # Decoder
    add("--dlayers", type=int, default=1, help="Number of decoder layers")
    add("--dunits", type=int, default=320, help="Number of decoder hidden units")

    # MBR
    add(
        "--aux-mbr",
        nargs="?",
        default=False,
        type=strtobool,
        help="Whether to use mbr as auxiliary task.",
    )
    add(
        "--aux-mbr-weight",
        type=float,
        default=1.0,
        help="Weight of auxiliary mbr loss",
    )
    add(
        "--aux-mbr-beam",
        type=int,
        default=2,
        help="Number of hypothesis for MBR loss computation",
    )

    return group


================================================
FILE: nets/pytorch_backend/transformer/attention.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright 2019 Shigeki Karita
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Multi-Head Attention layer definition."""

import math

import numpy
import torch
from torch import nn


class MultiHeadedAttention(nn.Module):
    """Multi-Head Attention layer.

    Args:
        n_head (int): The number of heads.
        n_feat (int): The number of features.
        dropout_rate (float): Dropout rate.

    """

    def __init__(self, n_head, n_feat, dropout_rate):
        """Construct an MultiHeadedAttention object."""
        super(MultiHeadedAttention, self).__init__()
        assert n_feat % n_head == 0
        # We assume d_v always equals d_k
        self.d_k = n_feat // n_head
        self.h = n_head
        self.linear_q = nn.Linear(n_feat, n_feat)
        self.linear_k = nn.Linear(n_feat, n_feat)
        self.linear_v = nn.Linear(n_feat, n_feat)
        self.linear_out = nn.Linear(n_feat, n_feat)
        # Attention weights of the latest forward pass (before dropout).
        self.attn = None
        self.dropout = nn.Dropout(p=dropout_rate)

    def forward_qkv(self, query, key, value):
        """Transform query, key and value.

        Args:
            query (torch.Tensor): Query tensor (#batch, time1, size).
            key (torch.Tensor): Key tensor (#batch, time2, size).
            value (torch.Tensor): Value tensor (#batch, time2, size).

        Returns:
            torch.Tensor: Transformed query tensor (#batch, n_head, time1, d_k).
            torch.Tensor: Transformed key tensor (#batch, n_head, time2, d_k).
            torch.Tensor: Transformed value tensor (#batch, n_head, time2, d_k).

        """
        n_batch = query.size(0)
        q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k)
        k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k)
        v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k)
        q = q.transpose(1, 2)  # (batch, head, time1, d_k)
        k = k.transpose(1, 2)  # (batch, head, time2, d_k)
        v = v.transpose(1, 2)  # (batch, head, time2, d_k)

        return q, k, v

    def forward_attention(self, value, scores, mask):
        """Compute attention context vector.

        The attention weights (before dropout) are stored in ``self.attn``.

        Args:
            value (torch.Tensor): Transformed value (#batch, n_head, time2, d_k).
            scores (torch.Tensor): Attention score (#batch, n_head, time1, time2).
            mask (torch.Tensor): Mask (#batch, 1, time2) or (#batch, time1, time2).
                Positions equal to 0 are excluded from the attention. May be
                None, in which case nothing is masked.

        Returns:
            torch.Tensor: Transformed value (#batch, time1, d_model)
                weighted by the attention score (#batch, time1, time2).

        """
        n_batch = value.size(0)
        if mask is not None:
            mask = mask.unsqueeze(1).eq(0)  # (batch, 1, *, time2)
            # Most negative finite value of the score dtype. torch.finfo
            # covers every floating dtype torch supports (including
            # bfloat16, which a numpy round trip cannot represent).
            min_value = torch.finfo(scores.dtype).min
            scores = scores.masked_fill(mask, min_value)
            # Zero the masked weights again after the softmax so that rows
            # that are masked everywhere end up all-zero instead of uniform.
            self.attn = torch.softmax(scores, dim=-1).masked_fill(
                mask, 0.0
            )  # (batch, head, time1, time2)
        else:
            self.attn = torch.softmax(scores, dim=-1)  # (batch, head, time1, time2)

        p_attn = self.dropout(self.attn)
        x = torch.matmul(p_attn, value)  # (batch, head, time1, d_k)
        x = (
            x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k)
        )  # (batch, time1, d_model)

        return self.linear_out(x)  # (batch, time1, d_model)

    def forward(self, query, key, value, mask):
        """Compute scaled dot product attention.

        Args:
            query (torch.Tensor): Query tensor (#batch, time1, size).
            key (torch.Tensor): Key tensor (#batch, time2, size).
            value (torch.Tensor): Value tensor (#batch, time2, size).
            mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
                (#batch, time1, time2).

        Returns:
            torch.Tensor: Output tensor (#batch, time1, d_model).

        """
        q, k, v = self.forward_qkv(query, key, value)
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
        return self.forward_attention(v, scores, mask)


class LegacyRelPositionMultiHeadedAttention(MultiHeadedAttention):
    """Multi-Head Attention layer with relative position encoding (old version).

    Details can be found in https://github.com/espnet/espnet/pull/2816.

    Paper: https://arxiv.org/abs/1901.02860

    Args:
        n_head (int): The number of heads.
        n_feat (int): The number of features.
        dropout_rate (float): Dropout rate.
        zero_triu (bool): Whether to zero the upper triangular part of attention matrix.

    """

    def __init__(self, n_head, n_feat, dropout_rate, zero_triu=False):
        """Construct a LegacyRelPositionMultiHeadedAttention object."""
        super().__init__(n_head, n_feat, dropout_rate)
        self.zero_triu = zero_triu
        # linear transformation for positional encoding
        self.linear_pos = nn.Linear(n_feat, n_feat, bias=False)
        # these two learnable bias are used in matrix c and matrix d
        # as described in https://arxiv.org/abs/1901.02860 Section 3.3
        self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k))
        self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k))
        torch.nn.init.xavier_uniform_(self.pos_bias_u)
        torch.nn.init.xavier_uniform_(self.pos_bias_v)

    def rel_shift(self, x):
        """Compute relative positional encoding.

        Args:
            x (torch.Tensor): Input tensor (batch, head, time1, time2).

        Returns:
            torch.Tensor: Output tensor, same shape as the input.

        """
        # Prepend one zero column, reinterpret the buffer with the last two
        # dims swapped, then drop the first row: this shifts every row of
        # the score matrix by a row-dependent offset.
        zero_pad = torch.zeros((*x.size()[:3], 1), device=x.device, dtype=x.dtype)
        x_padded = torch.cat([zero_pad, x], dim=-1)

        x_padded = x_padded.view(*x.size()[:2], x.size(3) + 1, x.size(2))
        x = x_padded[:, :, 1:].view_as(x)

        if self.zero_triu:
            # Build the mask on x's device; a CPU mask would raise a device
            # mismatch error when x lives on an accelerator.
            ones = torch.ones((x.size(2), x.size(3)), device=x.device)
            x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :]

        return x

    def forward(self, query, key, value, pos_emb, mask):
        """Compute 'Scaled Dot Product Attention' with rel. positional encoding.

        Args:
            query (torch.Tensor): Query tensor (#batch, time1, size).
            key (torch.Tensor): Key tensor (#batch, time2, size).
            value (torch.Tensor): Value tensor (#batch, time2, size).
            pos_emb (torch.Tensor): Positional embedding tensor (#batch, time1, size).
            mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
                (#batch, time1, time2).

        Returns:
            torch.Tensor: Output tensor (#batch, time1, d_model).

        """
        q, k, v = self.forward_qkv(query, key, value)
        # Heads must be the second-to-last dim so the (head, d_k) biases
        # broadcast over batch and time.
        q = q.transpose(1, 2)  # (batch, time1, head, d_k)

        n_batch_pos = pos_emb.size(0)
        p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k)
        p = p.transpose(1, 2)  # (batch, head, time1, d_k)

        # (batch, head, time1, d_k)
        q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2)
        # (batch, head, time1, d_k)
        q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2)

        # compute attention score
        # first compute matrix a and matrix c
        # as described in https://arxiv.org/abs/1901.02860 Section 3.3
        # (batch, head, time1, time2)
        matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1))

        # compute matrix b and matrix d
        # (batch, head, time1, time1)
        matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1))
        matrix_bd = self.rel_shift(matrix_bd)

        scores = (matrix_ac + matrix_bd) / math.sqrt(
            self.d_k
        )  # (batch, head, time1, time2)

        return self.forward_attention(v, scores, mask)


class RelPositionMultiHeadedAttention(MultiHeadedAttention):
    """Multi-Head Attention layer with relative position encoding (new implementation).

    Details can be found in https://github.com/espnet/espnet/pull/2816.

    Paper: https://arxiv.org/abs/1901.02860

    Args:
        n_head (int): The number of heads.
        n_feat (int): The number of features.
        dropout_rate (float): Dropout rate.
        zero_triu (bool): Whether to zero the upper triangular part of attention matrix.

    """

    def __init__(self, n_head, n_feat, dropout_rate, zero_triu=False):
        """Construct an RelPositionMultiHeadedAttention object."""
        super().__init__(n_head, n_feat, dropout_rate)
        self.zero_triu = zero_triu
        # Projection applied to the positional embeddings (no bias term).
        self.linear_pos = nn.Linear(n_feat, n_feat, bias=False)
        # Learnable biases u and v used in matrices c and d,
        # see https://arxiv.org/abs/1901.02860 Section 3.3
        self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k))
        self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k))
        for bias in (self.pos_bias_u, self.pos_bias_v):
            torch.nn.init.xavier_uniform_(bias)

    def rel_shift(self, x):
        """Compute relative positional encoding.

        Args:
            x (torch.Tensor): Input tensor (batch, head, time1, 2*time1-1).
            time1 means the length of query vector.

        Returns:
            torch.Tensor: Output tensor.

        """
        n_batch, n_head, time1, n_pos = x.size()

        # Prepend a zero column and reinterpret the buffer with the last two
        # dims swapped; dropping the first row then shifts each score row.
        zero_col = torch.zeros(
            (n_batch, n_head, time1, 1), device=x.device, dtype=x.dtype
        )
        padded = torch.cat([zero_col, x], dim=-1).view(
            n_batch, n_head, n_pos + 1, time1
        )

        # only keep the positions from 0 to time2
        shifted = padded[:, :, 1:].view_as(x)[:, :, :, : n_pos // 2 + 1]

        if not self.zero_triu:
            return shifted

        rows, cols = shifted.size(2), shifted.size(3)
        ones = torch.ones((rows, cols), device=shifted.device)
        return shifted * torch.tril(ones, cols - rows)[None, None, :, :]

    def forward(self, query, key, value, pos_emb, mask):
        """Compute 'Scaled Dot Product Attention' with rel. positional encoding.

        Args:
            query (torch.Tensor): Query tensor (#batch, time1, size).
            key (torch.Tensor): Key tensor (#batch, time2, size).
            value (torch.Tensor): Value tensor (#batch, time2, size).
            pos_emb (torch.Tensor): Positional embedding tensor
                (#batch, 2*time1-1, size).
            mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
                (#batch, time1, time2).

        Returns:
            torch.Tensor: Output tensor (#batch, time1, d_model).

        """
        q, k, v = self.forward_qkv(query, key, value)
        # Put heads second-to-last so the (head, d_k) biases broadcast.
        q = q.transpose(1, 2)  # (batch, time1, head, d_k)

        pos = self.linear_pos(pos_emb).view(pos_emb.size(0), -1, self.h, self.d_k)
        pos = pos.transpose(1, 2)  # (batch, head, 2*time1-1, d_k)

        q_u = (q + self.pos_bias_u).transpose(1, 2)  # (batch, head, time1, d_k)
        q_v = (q + self.pos_bias_v).transpose(1, 2)  # (batch, head, time1, d_k)

        # Matrices a and c of https://arxiv.org/abs/1901.02860 Section 3.3:
        # (batch, head, time1, time2)
        content_scores = torch.matmul(q_u, k.transpose(-2, -1))

        # Matrices b and d: (batch, head, time1, 2*time1-1) before the shift.
        position_scores = self.rel_shift(torch.matmul(q_v, pos.transpose(-2, -1)))

        # (batch, head, time1, time2)
        scores = (content_scores + position_scores) / math.sqrt(self.d_k)

        return self.forward_attention(v, scores, mask)


================================================
FILE: nets/pytorch_backend/transformer/contextual_block_encoder_layer.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright 2020 Emiru Tsunoo
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Encoder self-attention layer definition."""

import torch

from torch import nn

from espnet.nets.pytorch_backend.transformer.layer_norm import LayerNorm


class ContextualBlockEncoderLayer(nn.Module):
    """Contextual Block Encoder layer module.

    Args:
        size (int): Input dimension.
        self_attn (torch.nn.Module): Self-attention module instance.
            `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance
            can be used as the argument.
        feed_forward (torch.nn.Module): Feed-forward module instance.
            `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
            can be used as the argument.
        dropout_rate (float): Dropout rate.
        total_layer_num (int): Total number of layers
        normalize_before (bool): Whether to use layer_norm before the first block.
        concat_after (bool): Whether to concat attention layer's input and output.
            if True, additional linear will be applied.
            i.e. x -> x + linear(concat(x, att(x)))
            if False, no additional linear will be applied. i.e. x -> x + att(x)

    """

    def __init__(
        self,
        size,
        self_attn,
        feed_forward,
        dropout_rate,
        total_layer_num,
        normalize_before=True,
        concat_after=False,
    ):
        """Construct a ContextualBlockEncoderLayer object."""
        super(ContextualBlockEncoderLayer, self).__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.norm1 = LayerNorm(size)
        self.norm2 = LayerNorm(size)
        self.dropout = nn.Dropout(dropout_rate)
        self.size = size
        self.normalize_before = normalize_before
        self.concat_after = concat_after
        self.total_layer_num = total_layer_num
        # The extra projection only exists when concat_after is enabled.
        if self.concat_after:
            self.concat_linear = nn.Linear(size + size, size)

    def forward(self, x, mask, past_ctx=None, next_ctx=None, layer_idx=0, cache=None):
        """Compute encoded features.

        Args:
            x (torch.Tensor): Input tensor. The reshape comments in the body
                give its layout as (nbatch, nblock, block_size + 2, dim).
                NOTE: when both past_ctx and next_ctx are given, x is
                modified in place.
            mask (torch.Tensor): Mask tensor for the input, or None. Its
                leading dims are flattened together with those of x.
            past_ctx (torch.Tensor): Previous contextual vectors, indexed as
                past_ctx[:, :, layer_idx].
            next_ctx (torch.Tensor): Next contextual vectors
                (nbatch, nblock, total_layer_num, dim). Allocated as zeros
                here when past_ctx is given and next_ctx is None; updated
                in place otherwise.
            layer_idx (int): Index of this layer.
            cache (torch.Tensor): Cache tensor of the input; must have shape
                (x.shape[0], x.shape[1] - 1, size) after x has been flattened.

        Returns:
            torch.Tensor: Output tensor, reshaped back to start with nbatch.
            torch.Tensor: Mask tensor, reshaped the same way (or None).
            torch.Tensor: next_ctx.
            torch.Tensor: next_ctx again (the same object is returned twice).
            int: layer_idx + 1.

        """
        nbatch = x.size(0)
        nblock = x.size(1)

        if past_ctx is not None:
            if next_ctx is None:
                # store all context vectors in one tensor
                next_ctx = past_ctx.new_zeros(
                    nbatch, nblock, self.total_layer_num, x.size(-1)
                )
            else:
                # Overwrite position 0 of every block with the context
                # vector stored for this layer (in-place write on x).
                x[:, :, 0] = past_ctx[:, :, layer_idx]

        # reshape ( nbatch, nblock, block_size + 2, dim )
        #     -> ( nbatch * nblock, block_size + 2, dim )
        x = x.view(-1, x.size(-2), x.size(-1))
        if mask is not None:
            mask = mask.view(-1, mask.size(-2), mask.size(-1))

        # Self-attention sub-layer with residual connection.
        residual = x
        if self.normalize_before:
            x = self.norm1(x)

        if cache is None:
            x_q = x
        else:
            # With a cache, only the last position is used as the query;
            # the cached outputs are concatenated back further below.
            assert cache.shape == (x.shape[0], x.shape[1] - 1, self.size)
            x_q = x[:, -1:, :]
            residual = residual[:, -1:, :]
            mask = None if mask is None else mask[:, -1:, :]

        if self.concat_after:
            x_concat = torch.cat((x, self.self_attn(x_q, x, x, mask)), dim=-1)
            x = residual + self.concat_linear(x_concat)
        else:
            x = residual + self.dropout(self.self_attn(x_q, x, x, mask))
        if not self.normalize_before:
            x = self.norm1(x)

        # Feed-forward sub-layer with residual connection.
        residual = x
        if self.normalize_before:
            x = self.norm2(x)
        x = residual + self.dropout(self.feed_forward(x))
        if not self.normalize_before:
            x = self.norm2(x)

        if cache is not None:
            x = torch.cat([cache, x], dim=1)

        layer_idx += 1
        # reshape ( nbatch * nblock, block_size + 2, dim )
        #       -> ( nbatch, nblock, block_size + 2, dim )
        x = x.view(nbatch, -1, x.size(-2), x.size(-1)).squeeze(1)
        if mask is not None:
            mask = mask.view(nbatch, -1, mask.size(-2), mask.size(-1)).squeeze(1)

        # Store the last-position vector of each block as context for the
        # following layer: block 0 keeps its own vector, block b (b >= 1)
        # receives the vector of block b - 1.
        if next_ctx is not None and layer_idx < self.total_layer_num:
            next_ctx[:, 0, layer_idx, :] = x[:, 0, -1, :]
            next_ctx[:, 1:, layer_idx, :] = x[:, 0:-1, -1, :]

        return x, mask, next_ctx, next_ctx, layer_idx


================================================
FILE: nets/pytorch_backend/transformer/decoder.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright 2019 Shigeki Karita
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Decoder definition."""

import logging

from typing import Any
from typing import List
from typing import Tuple

import torch

from espnet.nets.pytorch_backend.nets_utils import rename_state_dict
from espnet.nets.pytorch_backend.transformer.attention import MultiHeadedAttention
from espnet.nets.pytorch_backend.transformer.decoder_layer import DecoderLayer
from espnet.nets.pytorch_backend.transformer.dynamic_conv import DynamicConvolution
from espnet.nets.pytorch_backend.transformer.dynamic_conv2d import DynamicConvolution2D
from espnet.nets.pytorch_backend.transformer.embedding import PositionalEncoding
from espnet.nets.pytorch_backend.transformer.layer_norm import LayerNorm
from espnet.nets.pytorch_backend.transformer.lightconv import LightweightConvolution
from espnet.nets.pytorch_backend.transformer.lightconv2d import LightweightConvolution2D
from espnet.nets.pytorch_backend.transformer.mask import subsequent_mask
from espnet.nets.pytorch_backend.transformer.positionwise_feed_forward import (
    PositionwiseFeedForward,  # noqa: H301
)
from espnet.nets.pytorch_backend.transformer.repeat import repeat
from espnet.nets.scorer_interface import BatchScorerInterface


def _pre_hook(
    state_dict,
    prefix,
    local_metadata,
    strict,
    missing_keys,
    unexpected_keys,
    error_msgs,
):
    """Rename ``output_norm.`` entries under ``prefix`` to ``after_norm.``.

    Only ``state_dict`` and ``prefix`` are used; the remaining parameters
    are accepted and ignored (presumably to match the signature of a
    load_state_dict pre-hook -- confirm at the registration site).
    """
    # https://github.com/espnet/espnet/commit/3d422f6de8d4f03673b89e1caef698745ec749ea#diff-bffb1396f038b317b2b64dd96e6d3563
    old_prefix = prefix + "output_norm."
    new_prefix = prefix + "after_norm."
    rename_state_dict(old_prefix, new_prefix, state_dict)


class Decoder(BatchScorerInterface, torch.nn.Module):
    """Transformer decoder module.

    Args:
        odim (int): Output dimension.
        selfattention_layer_type (str): Self-attention layer type. One of
            "selfattn", "lightconv", "lightconv2d", "dynamicconv" or
            "dynamicconv2d".
        attention_dim (int): Dimension of attention.
        attention_heads (int): The number of heads of multi head attention.
        conv_wshare (int): The number of kernel of convolution. Only used in
            selfattention_layer_type == "lightconv*" or "dynamicconv*".
        conv_kernel_length (Union[int, str]): Kernel size of convolution, either
            one value per block joined by "_" (e.g. 71_71_71_71_71_71) or a single
            value that is then used for every block. Only used in
            selfattention_layer_type == "lightconv*" or "dynamicconv*".
        conv_usebias (bool): Whether to use bias in convolution. Only used in
            selfattention_layer_type == "lightconv*" or "dynamicconv*".
        linear_units (int): The number of units of position-wise feed forward.
        num_blocks (int): The number of decoder blocks.
        dropout_rate (float): Dropout rate.
        positional_dropout_rate (float): Dropout rate after adding positional encoding.
        self_attention_dropout_rate (float): Dropout rate in self-attention.
        src_attention_dropout_rate (float): Dropout rate in source-attention.
        input_layer (Union[str, torch.nn.Module]): Input layer type
            ("embed", "linear" or a module instance).
        use_output_layer (bool): Whether to use output layer.
        pos_enc_class (torch.nn.Module): Positional encoding module class.
            `PositionalEncoding `or `ScaledPositionalEncoding`
        normalize_before (bool): Whether to use layer_norm before the first block.
        concat_after (bool): Whether to concat attention layer's input and output.
            if True, additional linear will be applied.
            i.e. x -> x + linear(concat(x, att(x)))
            if False, no additional linear will be applied. i.e. x -> x + att(x)

    Raises:
        NotImplementedError: If `input_layer` or `selfattention_layer_type` is
            not one of the supported values.

    """

    def __init__(
        self,
        odim,
        selfattention_layer_type="selfattn",
        attention_dim=256,
        attention_heads=4,
        conv_wshare=4,
        conv_kernel_length=11,
        conv_usebias=False,
        linear_units=2048,
        num_blocks=6,
        dropout_rate=0.1,
        positional_dropout_rate=0.1,
        self_attention_dropout_rate=0.0,
        src_attention_dropout_rate=0.0,
        input_layer="embed",
        use_output_layer=True,
        pos_enc_class=PositionalEncoding,
        normalize_before=True,
        concat_after=False,
    ):
        """Construct an Decoder object."""
        torch.nn.Module.__init__(self)
        self._register_load_state_dict_pre_hook(_pre_hook)
        if input_layer == "embed":
            self.embed = torch.nn.Sequential(
                torch.nn.Embedding(odim, attention_dim),
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        elif input_layer == "linear":
            self.embed = torch.nn.Sequential(
                torch.nn.Linear(odim, attention_dim),
                torch.nn.LayerNorm(attention_dim),
                torch.nn.Dropout(dropout_rate),
                torch.nn.ReLU(),
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        elif isinstance(input_layer, torch.nn.Module):
            self.embed = torch.nn.Sequential(
                input_layer, pos_enc_class(attention_dim, positional_dropout_rate)
            )
        else:
            raise NotImplementedError(
                "only `embed`, `linear` or torch.nn.Module is supported."
            )
        self.normalize_before = normalize_before

        # self-attention module definition
        # The convolutional variants share one constructor signature, so they are
        # described by a table: type name -> (layer class, description for the log).
        conv_layer_types = {
            "lightconv": (LightweightConvolution, "lightweight convolution"),
            "lightconv2d": (
                LightweightConvolution2D,
                "lightweight convolution 2-dimentional",
            ),
            "dynamicconv": (DynamicConvolution, "dynamic convolution"),
            "dynamicconv2d": (
                DynamicConvolution2D,
                "dynamic convolution 2-dimentional",
            ),
        }
        if selfattention_layer_type == "selfattn":
            logging.info("decoder self-attention layer type = self-attention")
            decoder_selfattn_layer = MultiHeadedAttention
            decoder_selfattn_layer_args = [
                (
                    attention_heads,
                    attention_dim,
                    self_attention_dropout_rate,
                )
            ] * num_blocks
        elif selfattention_layer_type in conv_layer_types:
            decoder_selfattn_layer, description = conv_layer_types[
                selfattention_layer_type
            ]
            logging.info("decoder self-attention layer type = %s", description)
            # str() lets the int default (11) through; a lone value is reused for
            # every block instead of failing on the second block's lookup.
            kernel_lengths = str(conv_kernel_length).split("_")
            if len(kernel_lengths) == 1:
                kernel_lengths = kernel_lengths * num_blocks
            decoder_selfattn_layer_args = [
                (
                    conv_wshare,
                    attention_dim,
                    self_attention_dropout_rate,
                    int(kernel_lengths[lnum]),
                    True,  # causal kernel mask: the decoder must not look ahead
                    conv_usebias,
                )
                for lnum in range(num_blocks)
            ]
        else:
            # Fail here with a clear error instead of a NameError further down.
            raise NotImplementedError(selfattention_layer_type)

        self.decoders = repeat(
            num_blocks,
            lambda lnum: DecoderLayer(
                attention_dim,
                decoder_selfattn_layer(*decoder_selfattn_layer_args[lnum]),
                MultiHeadedAttention(
                    attention_heads, attention_dim, src_attention_dropout_rate
                ),
                PositionwiseFeedForward(attention_dim, linear_units, dropout_rate),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
        self.selfattention_layer_type = selfattention_layer_type
        if self.normalize_before:
            self.after_norm = LayerNorm(attention_dim)
        if use_output_layer:
            self.output_layer = torch.nn.Linear(attention_dim, odim)
        else:
            self.output_layer = None

    def forward(self, tgt, tgt_mask, memory, memory_mask):
        """Forward decoder.

        Args:
            tgt (torch.Tensor): Input token ids, int64 (#batch, maxlen_out) if
                input_layer == "embed". In the other case, input tensor
                (#batch, maxlen_out, odim).
            tgt_mask (torch.Tensor): Input token mask (#batch, maxlen_out).
                dtype=torch.uint8 in PyTorch 1.2- and dtype=torch.bool in PyTorch 1.2+
                (include 1.2).
            memory (torch.Tensor): Encoded memory, float32 (#batch, maxlen_in, feat).
            memory_mask (torch.Tensor): Encoded memory mask (#batch, maxlen_in).
                dtype=torch.uint8 in PyTorch 1.2- and dtype=torch.bool in PyTorch 1.2+
                (include 1.2).

        Returns:
            torch.Tensor: Decoded token score before softmax (#batch, maxlen_out, odim)
                   if use_output_layer is True. In the other case,final block outputs
                   (#batch, maxlen_out, attention_dim).
            torch.Tensor: Score mask before softmax (#batch, maxlen_out).

        """
        x = self.embed(tgt)
        x, tgt_mask, memory, memory_mask = self.decoders(
            x, tgt_mask, memory, memory_mask
        )
        if self.normalize_before:
            x = self.after_norm(x)
        if self.output_layer is not None:
            x = self.output_layer(x)
        return x, tgt_mask

    def forward_one_step(self, tgt, tgt_mask, memory, cache=None):
        """Forward one step.

        Args:
            tgt (torch.Tensor): Input token ids, int64 (#batch, maxlen_out).
            tgt_mask (torch.Tensor): Input token mask (#batch, maxlen_out).
                dtype=torch.uint8 in PyTorch 1.2- and dtype=torch.bool in PyTorch 1.2+
                (include 1.2).
            memory (torch.Tensor): Encoded memory, float32 (#batch, maxlen_in, feat).
            cache (List[torch.Tensor]): List of cached tensors.
                Each tensor shape should be (#batch, maxlen_out - 1, size).

        Returns:
            torch.Tensor: Result for the last position only. Log-probabilities
                (log_softmax of the output layer) when an output layer exists,
                otherwise the (optionally normalized) last-position features.
            List[torch.Tensor]: List of cache tensors of each decoder layer.

        """
        x = self.embed(tgt)
        if cache is None:
            cache = [None] * len(self.decoders)
        new_cache = []
        for c, decoder in zip(cache, self.decoders):
            # No memory mask is applied during step-wise decoding.
            x, tgt_mask, memory, _ = decoder(x, tgt_mask, memory, None, cache=c)
            new_cache.append(x)

        # Only the newest position is needed to score the next token.
        if self.normalize_before:
            y = self.after_norm(x[:, -1])
        else:
            y = x[:, -1]
        if self.output_layer is not None:
            y = torch.log_softmax(self.output_layer(y), dim=-1)

        return y, new_cache

    # beam search API (see ScorerInterface)
    def score(self, ys, state, x):
        """Score the next token for a single hypothesis.

        Args:
            ys (torch.Tensor): Prefix tokens of one hypothesis (ylen).
            state: Cache returned by the previous call, or None.
            x (torch.Tensor): Encoder output for the utterance (unbatched).

        Returns:
            tuple: Result of `forward_one_step` with the batch axis removed,
                and the new cache.

        """
        ys_mask = subsequent_mask(len(ys), device=x.device).unsqueeze(0)
        if self.selfattention_layer_type != "selfattn":
            # TODO(karita): implement cache
            logging.warning(
                f"{self.selfattention_layer_type} does not support cached decoding."
            )
            state = None
        logp, state = self.forward_one_step(
            ys.unsqueeze(0), ys_mask, x.unsqueeze(0), cache=state
        )
        return logp.squeeze(0), state

    # batch beam search API (see BatchScorerInterface)
    def batch_score(
        self, ys: torch.Tensor, states: List[Any], xs: torch.Tensor
    ) -> Tuple[torch.Tensor, List[Any]]:
        """Score new token batch (required).

        Args:
            ys (torch.Tensor): torch.int64 prefix tokens (n_batch, ylen).
            states (List[Any]): Scorer states for prefix tokens.
            xs (torch.Tensor):
                The encoder feature that generates ys (n_batch, xlen, n_feat).

        Returns:
            tuple[torch.Tensor, List[Any]]: Tuple of
                batchfied scores for next token with shape of `(n_batch, n_vocab)`
                and next state list for ys.

        """
        # merge states
        n_batch = len(ys)
        n_layers = len(self.decoders)
        if states[0] is None:
            batch_state = None
        else:
            # transpose state of [batch, layer] into [layer, batch]
            batch_state = [
                torch.stack([states[b][i] for b in range(n_batch)])
                for i in range(n_layers)
            ]

        # batch decoding
        ys_mask = subsequent_mask(ys.size(-1), device=xs.device).unsqueeze(0)
        logp, states = self.forward_one_step(ys, ys_mask, xs, cache=batch_state)

        # transpose state of [layer, batch] into [batch, layer]
        state_list = [[states[i][b] for i in range(n_layers)] for b in range(n_batch)]
        return logp, state_list


================================================
FILE: nets/pytorch_backend/transformer/decoder_layer.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright 2019 Shigeki Karita
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Decoder self-attention layer definition."""

import torch
from torch import nn

from espnet.nets.pytorch_backend.transformer.layer_norm import LayerNorm


class DecoderLayer(nn.Module):
    """One decoder block: self-attention, source-attention, then feed-forward.

    Each of the three sub-blocks is wrapped in a residual connection. Layer
    normalization is applied to the sub-block input when `normalize_before` is
    True and to the sub-block output otherwise.

    Args:
        size (int): Input dimension.
        self_attn (torch.nn.Module): Self-attention module instance.
            `MultiHeadedAttention` instance can be used as the argument.
        src_attn (torch.nn.Module): Source-attention module instance.
            `MultiHeadedAttention` instance can be used as the argument.
        feed_forward (torch.nn.Module): Feed-forward module instance.
            `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
            can be used as the argument.
        dropout_rate (float): Dropout rate.
        normalize_before (bool): Whether to apply layer_norm before each sub-block
            (True) or after it (False).
        concat_after (bool): Whether to concat attention layer's input and output.
            if True, additional linear will be applied.
            i.e. x -> x + linear(concat(x, att(x)))
            if False, no additional linear will be applied. i.e. x -> x + att(x)

    """

    def __init__(
        self,
        size,
        self_attn,
        src_attn,
        feed_forward,
        dropout_rate,
        normalize_before=True,
        concat_after=False,
    ):
        """Construct an DecoderLayer object."""
        super().__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.norm1 = LayerNorm(size)
        self.norm2 = LayerNorm(size)
        self.norm3 = LayerNorm(size)
        self.dropout = nn.Dropout(dropout_rate)
        self.normalize_before = normalize_before
        self.concat_after = concat_after
        if self.concat_after:
            # Project concat(input, attention output) back down to `size`.
            self.concat_linear1 = nn.Linear(2 * size, size)
            self.concat_linear2 = nn.Linear(2 * size, size)

    def forward(self, tgt, tgt_mask, memory, memory_mask, cache=None):
        """Compute decoded features.

        Args:
            tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size).
            tgt_mask (torch.Tensor): Mask for input tensor (#batch, maxlen_out).
            memory (torch.Tensor): Encoded memory, float32 (#batch, maxlen_in, size).
            memory_mask (torch.Tensor): Encoded memory mask (#batch, maxlen_in).
            cache (torch.Tensor): Output of this layer for all but the last
                position, shape (#batch, maxlen_out - 1, size). When given, only
                the last position is computed and appended to it.

        Returns:
            torch.Tensor: Output tensor(#batch, maxlen_out, size).
            torch.Tensor: Mask for output tensor (#batch, maxlen_out).
            torch.Tensor: Encoded memory (#batch, maxlen_in, size).
            torch.Tensor: Encoded memory mask (#batch, maxlen_in).

        """
        pre_norm = self.normalize_before
        post_norm = not pre_norm

        # ---- self-attention sub-block ----
        shortcut = tgt
        if pre_norm:
            tgt = self.norm1(tgt)

        if cache is not None:
            # compute only the last frame query keeping dim: max_time_out -> 1
            expected_shape = (tgt.shape[0], tgt.shape[1] - 1, self.size)
            assert cache.shape == expected_shape, f"{cache.shape} == {expected_shape}"
            tgt_q = tgt[:, -1:, :]
            shortcut = shortcut[:, -1:, :]
            tgt_q_mask = None if tgt_mask is None else tgt_mask[:, -1:, :]
        else:
            tgt_q, tgt_q_mask = tgt, tgt_mask

        self_att = self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)
        if self.concat_after:
            x = shortcut + self.concat_linear1(torch.cat((tgt_q, self_att), dim=-1))
        else:
            x = shortcut + self.dropout(self_att)
        if post_norm:
            x = self.norm1(x)

        # ---- source-attention sub-block ----
        shortcut = x
        if pre_norm:
            x = self.norm2(x)
        src_att = self.src_attn(x, memory, memory, memory_mask)
        if self.concat_after:
            x = shortcut + self.concat_linear2(torch.cat((x, src_att), dim=-1))
        else:
            x = shortcut + self.dropout(src_att)
        if post_norm:
            x = self.norm2(x)

        # ---- feed-forward sub-block ----
        shortcut = x
        if pre_norm:
            x = self.norm3(x)
        x = shortcut + self.dropout(self.feed_forward(x))
        if post_norm:
            x = self.norm3(x)

        # Re-attach the cached outputs so the result covers every position.
        if cache is not None:
            x = torch.cat([cache, x], dim=1)

        return x, tgt_mask, memory, memory_mask


================================================
FILE: nets/pytorch_backend/transformer/dynamic_conv.py
================================================
"""Dynamic Convolution module."""

import numpy
import torch
from torch import nn
import torch.nn.functional as F


MIN_VALUE = float(numpy.finfo(numpy.float32).min)


class DynamicConvolution(nn.Module):
    """Dynamic Convolution layer.

    This implementation is based on
    https://github.com/pytorch/fairseq/tree/master/fairseq

    Args:
        wshare (int): the number of kernel of convolution
        n_feat (int): the number of features (must be divisible by wshare)
        dropout_rate (float): dropout_rate
        kernel_size (int): kernel size (length)
        use_kernel_mask (bool): Use causal mask or not for convolution kernel
        use_bias (bool): Use bias term or not.

    """

    def __init__(
        self,
        wshare,
        n_feat,
        dropout_rate,
        kernel_size,
        use_kernel_mask=False,
        use_bias=False,
    ):
        """Construct Dynamic Convolution layer."""
        super(DynamicConvolution, self).__init__()

        assert n_feat % wshare == 0
        self.wshare = wshare
        self.use_kernel_mask = use_kernel_mask
        self.dropout_rate = dropout_rate
        self.kernel_size = kernel_size
        self.attn = None  # softmax-normalized kernels of the latest forward call

        # linear -> GLU -- -> lightconv -> linear
        #               \        /
        #                 Linear
        self.linear1 = nn.Linear(n_feat, n_feat * 2)
        self.linear2 = nn.Linear(n_feat, n_feat)
        self.linear_weight = nn.Linear(n_feat, self.wshare * 1 * kernel_size)
        # In-place initializer; the un-suffixed name is deprecated.
        nn.init.xavier_uniform_(self.linear_weight.weight)
        self.act = nn.GLU()

        # dynamic conv related
        self.use_bias = use_bias
        if self.use_bias:
            # Start from zeros: torch.Tensor(n) would expose uninitialized memory.
            self.bias = nn.Parameter(torch.zeros(n_feat))

    def forward(self, query, key, value, mask):
        """Forward of 'Dynamic Convolution'.

        This function takes query, key and value but uses only query.
        This is just for compatibility with self-attention layer (attention.py)

        Args:
            query (torch.Tensor): (batch, time1, d_model) input tensor
            key (torch.Tensor): (batch, time2, d_model) NOT USED
            value (torch.Tensor): (batch, time2, d_model) NOT USED
            mask (torch.Tensor): (batch, time1, time2) mask; only applied when
                use_kernel_mask is False.

        Return:
            x (torch.Tensor): (batch, time1, d_model) output

        """
        # linear -> GLU -- -> lightconv -> linear
        #               \        /
        #                 Linear
        x = query
        B, T, C = x.size()
        H = self.wshare
        k = self.kernel_size

        # first linear layer
        x = self.linear1(x)

        # GLU activation
        x = self.act(x)

        # get kernel of convolution
        weight = self.linear_weight(x)  # B x T x kH
        weight = F.dropout(weight, self.dropout_rate, training=self.training)
        weight = weight.view(B, T, H, k).transpose(1, 2).contiguous()  # B x H x T x k
        # Band matrix filled with -inf, allocated directly on the input's device.
        weight_new = torch.full(
            (B, H, T, T + k - 1), float("-inf"), dtype=weight.dtype, device=x.device
        )  # B x H x T x T+k-1
        # A row stride of (T + k - 1) + 1 shifts each row's k taps one column to
        # the right of the previous row's, i.e. writes them along the diagonal.
        weight_new.as_strided(
            (B, H, T, k), ((T + k - 1) * T * H, (T + k - 1) * T, T + k, 1)
        ).copy_(weight)
        # Drop the padding columns so the kernel is centered on each time step.
        weight_new = weight_new.narrow(-1, (k - 1) // 2, T)  # B x H x T x T(k)
        if self.use_kernel_mask:
            kernel_mask = torch.tril(torch.ones(T, T, device=x.device)).unsqueeze(0)
            weight_new = weight_new.masked_fill(kernel_mask == 0.0, float("-inf"))
        weight_new = F.softmax(weight_new, dim=-1)
        self.attn = weight_new
        weight_new = weight_new.view(B * H, T, T)

        # convolution
        x = x.transpose(1, 2).contiguous()  # B x C x T
        x = x.view(B * H, C // H, T).transpose(1, 2)
        x = torch.bmm(weight_new, x)  # BH x T x C/H
        x = x.transpose(1, 2).contiguous().view(B, C, T)

        if self.use_bias:
            x = x + self.bias.view(1, -1, 1)
        x = x.transpose(1, 2)  # B x T x C

        if mask is not None and not self.use_kernel_mask:
            mask = mask.transpose(-1, -2)
            x = x.masked_fill(mask == 0, 0.0)

        # second linear layer
        x = self.linear2(x)
        return x


================================================
FILE: nets/pytorch_backend/transformer/dynamic_conv2d.py
================================================
"""Dynamic 2-Dimentional Convolution module."""

import numpy
import torch
from torch import nn
import torch.nn.functional as F


MIN_VALUE = float(numpy.finfo(numpy.float32).min)


class DynamicConvolution2D(nn.Module):
    """Dynamic 2-Dimensional Convolution layer.

    This implementation is based on
    https://github.com/pytorch/fairseq/tree/master/fairseq

    Args:
        wshare (int): the number of kernel of convolution
        n_feat (int): the number of features (must be divisible by wshare)
        dropout_rate (float): dropout_rate
        kernel_size (int): kernel size (length)
        use_kernel_mask (bool): Use causal mask or not for convolution kernel
        use_bias (bool): Use bias term or not.

    """

    def __init__(
        self,
        wshare,
        n_feat,
        dropout_rate,
        kernel_size,
        use_kernel_mask=False,
        use_bias=False,
    ):
        """Construct Dynamic 2-Dimensional Convolution layer."""
        super(DynamicConvolution2D, self).__init__()

        assert n_feat % wshare == 0
        self.wshare = wshare
        self.use_kernel_mask = use_kernel_mask
        self.dropout_rate = dropout_rate
        self.kernel_size = kernel_size
        self.padding_size = kernel_size // 2
        self.attn_t = None  # time-axis kernels of the latest forward call
        self.attn_f = None  # frequency-axis kernels of the latest forward call

        # linear -> GLU -- -> lightconv -> linear
        #               \        /
        #                 Linear
        self.linear1 = nn.Linear(n_feat, n_feat * 2)
        # Input is the concatenation of the time- and frequency-axis results.
        self.linear2 = nn.Linear(n_feat * 2, n_feat)
        self.linear_weight = nn.Linear(n_feat, self.wshare * 1 * kernel_size)
        # In-place initializers; the un-suffixed names are deprecated.
        nn.init.xavier_uniform_(self.linear_weight.weight)
        self.linear_weight_f = nn.Linear(n_feat, kernel_size)
        nn.init.xavier_uniform_(self.linear_weight_f.weight)
        self.act = nn.GLU()

        # dynamic conv related
        self.use_bias = use_bias
        if self.use_bias:
            # Start from zeros: torch.Tensor(n) would expose uninitialized memory.
            self.bias = nn.Parameter(torch.zeros(n_feat))

    def forward(self, query, key, value, mask):
        """Forward of 'Dynamic 2-Dimensional Convolution'.

        This function takes query, key and value but uses only query.
        This is just for compatibility with self-attention layer (attention.py)

        Args:
            query (torch.Tensor): (batch, time1, d_model) input tensor
            key (torch.Tensor): (batch, time2, d_model) NOT USED
            value (torch.Tensor): (batch, time2, d_model) NOT USED
            mask (torch.Tensor): (batch, time1, time2) mask; only applied when
                use_kernel_mask is False.

        Return:
            x (torch.Tensor): (batch, time1, d_model) output

        """
        # linear -> GLU -- -> lightconv -> linear
        #               \        /
        #                 Linear
        x = query
        B, T, C = x.size()
        H = self.wshare
        k = self.kernel_size

        # first linear layer
        x = self.linear1(x)

        # GLU activation
        x = self.act(x)

        # convolution of frequency axis
        # One length-k kernel per (batch, time) position, applied along the
        # feature axis as a grouped conv1d with B*T groups.
        weight_f = self.linear_weight_f(x).view(B * T, 1, k)  # BT x 1 x k
        self.attn_f = weight_f.view(B, T, k).unsqueeze(1)
        xf = F.conv1d(
            x.view(1, B * T, C), weight_f, padding=self.padding_size, groups=B * T
        )
        xf = xf.view(B, T, C)

        # get kernel of convolution
        weight = self.linear_weight(x)  # B x T x kH
        weight = F.dropout(weight, self.dropout_rate, training=self.training)
        weight = weight.view(B, T, H, k).transpose(1, 2).contiguous()  # B x H x T x k
        # Band matrix filled with -inf, allocated directly on the input's device.
        weight_new = torch.full(
            (B, H, T, T + k - 1), float("-inf"), dtype=weight.dtype, device=x.device
        )  # B x H x T x T+k-1
        # A row stride of (T + k - 1) + 1 shifts each row's k taps one column to
        # the right of the previous row's, i.e. writes them along the diagonal.
        weight_new.as_strided(
            (B, H, T, k), ((T + k - 1) * T * H, (T + k - 1) * T, T + k, 1)
        ).copy_(weight)
        # Drop the padding columns so the kernel is centered on each time step.
        weight_new = weight_new.narrow(-1, (k - 1) // 2, T)  # B x H x T x T(k)
        if self.use_kernel_mask:
            kernel_mask = torch.tril(torch.ones(T, T, device=x.device)).unsqueeze(0)
            weight_new = weight_new.masked_fill(kernel_mask == 0.0, float("-inf"))
        weight_new = F.softmax(weight_new, dim=-1)
        self.attn_t = weight_new
        weight_new = weight_new.view(B * H, T, T)

        # convolution
        x = x.transpose(1, 2).contiguous()  # B x C x T
        x = x.view(B * H, C // H, T).transpose(1, 2)
        x = torch.bmm(weight_new, x)  # BH x T x C/H
        x = x.transpose(1, 2).contiguous().view(B, C, T)

        if self.use_bias:
            x = x + self.bias.view(1, -1, 1)
        x = x.transpose(1, 2)  # B x T x C
        x = torch.cat((x, xf), -1)  # B x T x Cx2

        if mask is not None and not self.use_kernel_mask:
            mask = mask.transpose(-1, -2)
            x = x.masked_fill(mask == 0, 0.0)

        # second linear layer
        x = self.linear2(x)
        return x


================================================
FILE: nets/pytorch_backend/transformer/embedding.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright 2019 Shigeki Karita
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Positional Encoding Module."""

import math

import torch


def _pre_hook(
    state_dict,
    prefix,
    local_metadata,
    strict,
    missing_keys,
    unexpected_keys,
    error_msgs,
):
    """Perform pre-hook in load_state_dict for backward compatibility.

    Note:
        We saved self.pe until v.0.5.2 but we have omitted it later.
        Therefore, we remove the item "pe" from `state_dict` for backward compatibility.

    """
    k = prefix + "pe"
    if k in state_dict:
        state_dict.pop(k)


class PositionalEncoding(torch.nn.Module):
    """Sinusoidal positional encoding.

    The encoding table is kept in `self.pe` as a plain attribute and is rebuilt
    whenever an input longer than the current table arrives.

    Args:
        d_model (int): Embedding dimension.
        dropout_rate (float): Dropout rate.
        max_len (int): Maximum input length.
        reverse (bool): Whether to reverse the input position. Only for
        the class LegacyRelPositionalEncoding. We remove it in the current
        class RelPositionalEncoding.

    """

    def __init__(self, d_model, dropout_rate, max_len=5000, reverse=False):
        """Construct an PositionalEncoding object."""
        super().__init__()
        self.d_model = d_model
        self.reverse = reverse
        self.xscale = math.sqrt(self.d_model)
        self.dropout = torch.nn.Dropout(p=dropout_rate)
        self.pe = None
        # Pre-compute the table for `max_len` positions using a dummy input.
        self.extend_pe(torch.tensor(0.0).expand(1, max_len))
        self._register_load_state_dict_pre_hook(_pre_hook)

    def extend_pe(self, x):
        """Make sure `self.pe` covers `x.size(1)` positions on x's dtype/device."""
        n_pos = x.size(1)
        table = self.pe
        if table is not None and table.size(1) >= n_pos:
            # Long enough already: at most move it next to the input.
            if table.dtype != x.dtype or table.device != x.device:
                self.pe = table.to(dtype=x.dtype, device=x.device)
            return

        if self.reverse:
            position = torch.arange(n_pos - 1, -1, -1.0, dtype=torch.float32)
        else:
            position = torch.arange(0, n_pos, dtype=torch.float32)
        position = position.unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, self.d_model, 2, dtype=torch.float32)
            * -(math.log(10000.0) / self.d_model)
        )
        angle = position * div_term

        # Even channels carry the sine, odd channels the cosine.
        fresh = torch.zeros(n_pos, self.d_model)
        fresh[:, 0::2] = torch.sin(angle)
        fresh[:, 1::2] = torch.cos(angle)
        self.pe = fresh.unsqueeze(0).to(device=x.device, dtype=x.dtype)

    def forward(self, x: torch.Tensor):
        """Scale the input by sqrt(d_model), add the encoding, apply dropout.

        Args:
            x (torch.Tensor): Input tensor (batch, time, `*`).

        Returns:
            torch.Tensor: Encoded tensor (batch, time, `*`).

        """
        self.extend_pe(x)
        encoded = x * self.xscale + self.pe[:, : x.size(1)]
        return self.dropout(encoded)


class ScaledPositionalEncoding(PositionalEncoding):
    """Scaled positional encoding module.

    Unlike the parent class, the input is not multiplied by sqrt(d_model);
    instead the encoding is weighted by a learnable scalar `alpha`.

    See Sec. 3.2  https://arxiv.org/abs/1809.08895

    Args:
        d_model (int): Embedding dimension.
        dropout_rate (float): Dropout rate.
        max_len (int): Maximum input length.

    """

    def __init__(self, d_model, dropout_rate, max_len=5000):
        """Initialize class."""
        super().__init__(d_model=d_model, dropout_rate=dropout_rate, max_len=max_len)
        self.alpha = torch.nn.Parameter(torch.tensor(1.0))

    def reset_parameters(self):
        """Reset `alpha` to 1.0."""
        # Fill in place: rebinding `.data` to a new tensor would silently move
        # the parameter back to CPU/float32 after the module has been converted.
        self.alpha.data.fill_(1.0)

    def forward(self, x):
        """Add positional encoding.

        Args:
            x (torch.Tensor): Input tensor (batch, time, `*`).

        Returns:
            torch.Tensor: Encoded tensor (batch, time, `*`).

        """
        self.extend_pe(x)
        x = x + self.alpha * self.pe[:, : x.size(1)]
        return self.dropout(x)


class LegacyRelPositionalEncoding(PositionalEncoding):
    """Relative positional encoding module (old version).

    The parent class is configured with ``reverse=True``; the encoding is
    returned alongside the scaled input instead of being added to it.

    Details can be found in https://github.com/espnet/espnet/pull/2816.

    See : Appendix B in https://arxiv.org/abs/1901.02860

    Args:
        d_model (int): Embedding dimension.
        dropout_rate (float): Dropout rate.
        max_len (int): Maximum input length.

    """

    def __init__(self, d_model, dropout_rate, max_len=5000):
        """Initialize class."""
        parent_kwargs = {
            "d_model": d_model,
            "dropout_rate": dropout_rate,
            "max_len": max_len,
            "reverse": True,
        }
        super().__init__(**parent_kwargs)

    def forward(self, x):
        """Compute positional encoding.

        Args:
            x (torch.Tensor): Input tensor (batch, time, `*`).

        Returns:
            torch.Tensor: Encoded tensor (batch, time, `*`).
            torch.Tensor: Positional embedding tensor (1, time, `*`).

        """
        self.extend_pe(x)
        scaled = x * self.xscale
        pos_emb = self.pe[:, : scaled.size(1)]
        # Dropout is applied to the input first, then to the embedding.
        dropped_x = self.dropout(scaled)
        dropped_pos = self.dropout(pos_emb)
        return dropped_x, dropped_pos


class RelPositionalEncoding(torch.nn.Module):
    """Relative positional encoding module (new implementation).

    Details can be found in https://github.com/espnet/espnet/pull/2816.

    See : Appendix B in https://arxiv.org/abs/1901.02860

    The cached table ``self.pe`` has shape (1, 2 * length - 1, d_model): the
    first half holds positive relative positions in descending order, the
    centre entry is relative position 0, and the rest holds negative ones.

    Args:
        d_model (int): Embedding dimension.
        dropout_rate (float): Dropout rate.
        max_len (int): Maximum input length.

    """

    def __init__(self, d_model, dropout_rate, max_len=5000):
        """Build the module and pre-compute the table for ``max_len`` frames."""
        super(RelPositionalEncoding, self).__init__()
        self.d_model = d_model
        self.xscale = math.sqrt(self.d_model)
        self.dropout = torch.nn.Dropout(p=dropout_rate)
        self.pe = None
        # Only the second dimension (length) of the dummy input is used.
        dummy = torch.tensor(0.0).expand(1, max_len)
        self.extend_pe(dummy)

    def extend_pe(self, x):
        """Make sure the cached table covers ``x`` and matches its dtype/device.

        Args:
            x (torch.Tensor): Tensor whose second dimension is the number of
                frames to cover and whose dtype/device the table must match.

        """
        n_frames = x.size(1)
        cached = self.pe
        # The table stores positive and negative offsets, so a table that
        # covers ``n_frames`` frames has 2 * n_frames - 1 entries.
        if cached is not None and cached.size(1) >= n_frames * 2 - 1:
            if cached.dtype != x.dtype or cached.device != x.device:
                self.pe = cached.to(dtype=x.dtype, device=x.device)
            return

        # Let `i` be the position of the query vector and `j` the position
        # of the key vector. Positive relative positions are used when the
        # key is to the left (i > j), negative ones otherwise (i < j).
        position = torch.arange(0, n_frames, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, self.d_model, 2, dtype=torch.float32)
            * -(math.log(10000.0) / self.d_model)
        )
        angles = position * div_term

        pe_positive = torch.zeros(n_frames, self.d_model)
        pe_negative = torch.zeros(n_frames, self.d_model)
        pe_positive[:, 0::2] = torch.sin(angles)
        pe_positive[:, 1::2] = torch.cos(angles)
        pe_negative[:, 0::2] = torch.sin(-angles)
        pe_negative[:, 1::2] = torch.cos(-angles)

        # Reverse the order of the positive offsets and append the negative
        # ones (dropping the duplicated offset 0). This layout supports the
        # shifting trick from https://arxiv.org/abs/1901.02860
        left_half = torch.flip(pe_positive, [0]).unsqueeze(0)
        right_half = pe_negative[1:].unsqueeze(0)
        table = torch.cat([left_half, right_half], dim=1)
        self.pe = table.to(device=x.device, dtype=x.dtype)

    def forward(self, x: torch.Tensor):
        """Scale the input and return the relative positional embedding.

        Args:
            x (torch.Tensor): Input tensor (batch, time, `*`).

        Returns:
            torch.Tensor: Encoded tensor (batch, time, `*`).
            torch.Tensor: Positional embedding tensor (1, 2 * time - 1, `*`).

        """
        self.extend_pe(x)
        n_frames = x.size(1)
        # Index of relative position 0 inside the cached table.
        center = self.pe.size(1) // 2
        scaled = x * self.xscale
        pos_emb = self.pe[:, center - n_frames + 1 : center + n_frames]
        return self.dropout(scaled), self.dropout(pos_emb)


================================================
FILE: nets/pytorch_backend/transformer/encoder.py
================================================
# Copyright 2019 Shigeki Karita
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Encoder definition."""

import logging
import torch

from espnet.nets.pytorch_backend.nets_utils import rename_state_dict
from espnet.nets.pytorch_backend.transducer.vgg2l import VGG2L
from espnet.nets.pytorch_backend.transformer.attention import MultiHeadedAttention
from espnet.nets.pytorch_backend.transformer.dynamic_conv import DynamicConvolution
from espnet.nets.pytorch_backend.transformer.dynamic_conv2d import DynamicConvolution2D
from espnet.nets.pytorch_backend.transformer.embedding import PositionalEncoding
from espnet.nets.pytorch_backend.transformer.encoder_layer import EncoderLayer
from espnet.nets.pytorch_backend.transformer.layer_norm import LayerNorm
from espnet.nets.pytorch_backend.transformer.lightconv import LightweightConvolution
from espnet.nets.pytorch_backend.transformer.lightconv2d import LightweightConvolution2D
from espnet.nets.pytorch_backend.transformer.multi_layer_conv import Conv1dLinear
from espnet.nets.pytorch_backend.transformer.multi_layer_conv import MultiLayeredConv1d
from espnet.nets.pytorch_backend.transformer.positionwise_feed_forward import (
    PositionwiseFeedForward,  # noqa: H301
)
from espnet.nets.pytorch_backend.transformer.repeat import repeat
from espnet.nets.pytorch_backend.transformer.subsampling import Conv2dSubsampling
from espnet.nets.pytorch_backend.transformer.subsampling import Conv2dSubsampling6
from espnet.nets.pytorch_backend.transformer.subsampling import Conv2dSubsampling8


def _pre_hook(
    state_dict,
    prefix,
    local_metadata,
    strict,
    missing_keys,
    unexpected_keys,
    error_msgs,
):
    """Map legacy checkpoint key prefixes to the current attribute names.

    Registered as a load-state-dict pre-hook by the encoder. Only
    ``state_dict`` and ``prefix`` are used; the remaining arguments belong
    to the hook signature and are ignored here.
    """
    legacy_to_current = (
        # https://github.com/espnet/espnet/commit/21d70286c354c66c0350e65dc098d2ee236faccc#diff-bffb1396f038b317b2b64dd96e6d3563
        ("input_layer.", "embed."),
        # https://github.com/espnet/espnet/commit/3d422f6de8d4f03673b89e1caef698745ec749ea#diff-bffb1396f038b317b2b64dd96e6d3563
        ("norm.", "after_norm."),
    )
    for old_name, new_name in legacy_to_current:
        rename_state_dict(prefix + old_name, prefix + new_name, state_dict)


class Encoder(torch.nn.Module):
    """Transformer encoder module.

    Args:
        idim (int): Input dimension.
        attention_dim (int): Dimension of attention.
        attention_heads (int): The number of heads of multi head attention.
        conv_wshare (int): The number of kernel of convolution. Only used in
            selfattention_layer_type == "lightconv*" or "dynamicconv*".
        conv_kernel_length (str): Kernel sizes of convolution, one per block,
            joined by "_" (e.g. 71_71_71_71_71_71). Only used in
            selfattention_layer_type == "lightconv*" or "dynamicconv*".
        conv_usebias (bool): Whether to use bias in convolution. Only used in
            selfattention_layer_type == "lightconv*" or "dynamicconv*".
        linear_units (int): The number of units of position-wise feed forward.
        num_blocks (int): The number of encoder blocks.
        dropout_rate (float): Dropout rate.
        positional_dropout_rate (float): Dropout rate after adding positional encoding.
        attention_dropout_rate (float): Dropout rate in attention.
        input_layer (Union[str, torch.nn.Module]): Input layer type.
        pos_enc_class (torch.nn.Module): Positional encoding module class.
            `PositionalEncoding `or `ScaledPositionalEncoding`
        normalize_before (bool): Whether to use layer_norm before the first block.
        concat_after (bool): Whether to concat attention layer's input and output.
            if True, additional linear will be applied.
            i.e. x -> x + linear(concat(x, att(x)))
            if False, no additional linear will be applied. i.e. x -> x + att(x)
        positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
        positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer.
        selfattention_layer_type (str): Encoder attention layer type.
        padding_idx (int): Padding idx for input_layer=embed.

    """

    def __init__(
        self,
        idim,
        attention_dim=256,
        attention_heads=4,
        conv_wshare=4,
        conv_kernel_length="11",
        conv_usebias=False,
        linear_units=2048,
        num_blocks=6,
        dropout_rate=0.1,
        positional_dropout_rate=0.1,
        attention_dropout_rate=0.0,
        input_layer="conv2d",
        pos_enc_class=PositionalEncoding,
        normalize_before=True,
        concat_after=False,
        positionwise_layer_type="linear",
        positionwise_conv_kernel_size=1,
        selfattention_layer_type="selfattn",
        padding_idx=-1,
    ):
        """Construct an Encoder object."""
        super(Encoder, self).__init__()
        # Accept checkpoints that still use the old attribute names.
        self._register_load_state_dict_pre_hook(_pre_hook)

        # Time-reduction factor of the input layer (1 = no subsampling).
        self.conv_subsampling_factor = 1
        if input_layer == "linear":
            self.embed = torch.nn.Sequential(
                torch.nn.Linear(idim, attention_dim),
                torch.nn.LayerNorm(attention_dim),
                torch.nn.Dropout(dropout_rate),
                torch.nn.ReLU(),
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        elif input_layer == "conv2d":
            self.embed = Conv2dSubsampling(idim, attention_dim, dropout_rate)
            self.conv_subsampling_factor = 4
        elif input_layer == "conv2d-scaled-pos-enc":
            self.embed = Conv2dSubsampling(
                idim,
                attention_dim,
                dropout_rate,
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
            self.conv_subsampling_factor = 4
        elif input_layer == "conv2d6":
            self.embed = Conv2dSubsampling6(idim, attention_dim, dropout_rate)
            self.conv_subsampling_factor = 6
        elif input_layer == "conv2d8":
            self.embed = Conv2dSubsampling8(idim, attention_dim, dropout_rate)
            self.conv_subsampling_factor = 8
        elif input_layer == "vgg2l":
            self.embed = VGG2L(idim, attention_dim)
            self.conv_subsampling_factor = 4
        elif input_layer == "embed":
            self.embed = torch.nn.Sequential(
                torch.nn.Embedding(idim, attention_dim, padding_idx=padding_idx),
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        elif isinstance(input_layer, torch.nn.Module):
            self.embed = torch.nn.Sequential(
                input_layer,
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        elif input_layer is None:
            self.embed = torch.nn.Sequential(
                pos_enc_class(attention_dim, positional_dropout_rate)
            )
        else:
            # str() keeps this a ValueError even for non-string arguments.
            raise ValueError("unknown input_layer: " + str(input_layer))
        self.normalize_before = normalize_before
        positionwise_layer, positionwise_layer_args = self.get_positionwise_layer(
            positionwise_layer_type,
            attention_dim,
            linear_units,
            dropout_rate,
            positionwise_conv_kernel_size,
        )

        # Convolutional replacements for self-attention. They all share one
        # constructor signature and differ only in class and log message.
        conv_layer_types = {
            "lightconv": (
                LightweightConvolution,
                "encoder self-attention layer type = lightweight convolution",
            ),
            "lightconv2d": (
                LightweightConvolution2D,
                "encoder self-attention layer "
                "type = lightweight convolution 2-dimentional",
            ),
            "dynamicconv": (
                DynamicConvolution,
                "encoder self-attention layer type = dynamic convolution",
            ),
            "dynamicconv2d": (
                DynamicConvolution2D,
                "encoder self-attention layer type = dynamic convolution 2-dimentional",
            ),
        }
        if selfattention_layer_type in (
            "selfattn",
            "rel_selfattn",
            "legacy_rel_selfattn",
        ):
            logging.info("encoder self-attention layer type = self-attention")
            encoder_selfattn_layer = MultiHeadedAttention
            encoder_selfattn_layer_args = [
                (
                    attention_heads,
                    attention_dim,
                    attention_dropout_rate,
                )
            ] * num_blocks
        elif selfattention_layer_type in conv_layer_types:
            encoder_selfattn_layer, log_message = conv_layer_types[
                selfattention_layer_type
            ]
            logging.info(log_message)
            # One kernel length per block, taken from the "_"-separated string.
            encoder_selfattn_layer_args = [
                (
                    conv_wshare,
                    attention_dim,
                    attention_dropout_rate,
                    int(conv_kernel_length.split("_")[lnum]),
                    False,
                    conv_usebias,
                )
                for lnum in range(num_blocks)
            ]
        else:
            raise NotImplementedError(selfattention_layer_type)

        self.encoders = repeat(
            num_blocks,
            lambda lnum: EncoderLayer(
                attention_dim,
                encoder_selfattn_layer(*encoder_selfattn_layer_args[lnum]),
                positionwise_layer(*positionwise_layer_args),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
        if self.normalize_before:
            self.after_norm = LayerNorm(attention_dim)

    def get_positionwise_layer(
        self,
        positionwise_layer_type="linear",
        attention_dim=256,
        linear_units=2048,
        dropout_rate=0.1,
        positionwise_conv_kernel_size=1,
    ):
        """Define positionwise layer.

        Args:
            positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
            attention_dim (int): Dimension of attention.
            linear_units (int): The number of units of position-wise feed forward.
            dropout_rate (float): Dropout rate.
            positionwise_conv_kernel_size (int): Kernel size used by the
                "conv1d" and "conv1d-linear" layers.

        Returns:
            type: Class of the position-wise layer.
            tuple: Positional arguments to construct that class with.

        Raises:
            NotImplementedError: If ``positionwise_layer_type`` is unknown.

        """
        if positionwise_layer_type == "linear":
            positionwise_layer = PositionwiseFeedForward
            positionwise_layer_args = (attention_dim, linear_units, dropout_rate)
        elif positionwise_layer_type == "conv1d":
            positionwise_layer = MultiLayeredConv1d
            positionwise_layer_args = (
                attention_dim,
                linear_units,
                positionwise_conv_kernel_size,
                dropout_rate,
            )
        elif positionwise_layer_type == "conv1d-linear":
            positionwise_layer = Conv1dLinear
            positionwise_layer_args = (
                attention_dim,
                linear_units,
                positionwise_conv_kernel_size,
                dropout_rate,
            )
        else:
            raise NotImplementedError("Support only linear or conv1d.")
        return positionwise_layer, positionwise_layer_args

    def _embed_inputs(self, xs, masks):
        """Apply the input layer, passing the mask to layers that take it.

        The subsampling front-ends take ``(xs, masks)`` and return both;
        every other input layer only transforms ``xs``.
        """
        if isinstance(
            self.embed,
            (Conv2dSubsampling, Conv2dSubsampling6, Conv2dSubsampling8, VGG2L),
        ):
            return self.embed(xs, masks)
        return self.embed(xs), masks

    def forward(self, xs, masks):
        """Encode input sequence.

        Args:
            xs (torch.Tensor): Input tensor (#batch, time, idim).
            masks (torch.Tensor): Mask tensor for ``xs``.

        Returns:
            torch.Tensor: Output tensor (#batch, time, attention_dim).
            torch.Tensor: Mask tensor as returned by the last encoder block.

        """
        xs, masks = self._embed_inputs(xs, masks)
        xs, masks = self.encoders(xs, masks)
        if self.normalize_before:
            xs = self.after_norm(xs)
        return xs, masks

    def forward_one_step(self, xs, masks, cache=None):
        """Encode input frame.

        Args:
            xs (torch.Tensor): Input tensor.
            masks (torch.Tensor): Mask tensor.
            cache (List[torch.Tensor]): List of cache tensors, one per
                encoder block (``None`` entries mean "no cache yet").

        Returns:
            torch.Tensor: Output tensor.
            torch.Tensor: Mask tensor.
            List[torch.Tensor]: List of new cache tensors (the output of
                every encoder block, before the final normalization).

        """
        # Same input-layer handling as forward(), so that every subsampling
        # front-end receives its mask here as well.
        xs, masks = self._embed_inputs(xs, masks)
        if cache is None:
            cache = [None for _ in range(len(self.encoders))]
        new_cache = []
        for c, e in zip(cache, self.encoders):
            xs, masks = e(xs, masks, cache=c)
            new_cache.append(xs)
        if self.normalize_before:
            xs = self.after_norm(xs)
        return xs, masks, new_cache


================================================
FILE: nets/pytorch_backend/transformer/encoder_layer.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright 2019 Shigeki Karita
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Encoder self-attention layer definition."""

import torch

from torch import nn

from espnet.nets.pytorch_backend.transformer.layer_norm import LayerNorm


class EncoderLayer(nn.Module):
    """Encoder layer module.

    Args:
        size (int): Input dimension.
        self_attn (torch.nn.Module): Self-attention module instance.
            `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance
            can be used as the argument.
        feed_forward (torch.nn.Module): Feed-forward module instance.
            `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
            can be used as the argument.
        dropout_rate (float): Dropout rate.
        normalize_before (bool): Whether to use layer_norm before the first block.
        concat_after (bool): Whether to concat attention layer's input and output.
            if True, additional linear will be applied.
            i.e. x -> x + linear(concat(x, att(x)))
            if False, no additional linear will be applied. i.e. x -> x + att(x)

    """

    def __init__(
        self,
        size,
        self_attn,
        feed_forward,
        dropout_rate,
        normalize_before=True,
        concat_after=False,
    ):
        """Construct an EncoderLayer object."""
        super(EncoderLayer, self).__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.norm1 = LayerNorm(size)
        self.norm2 = LayerNorm(size)
        self.dropout = nn.Dropout(dropout_rate)
        self.size = size
        self.normalize_before = normalize_before
        self.concat_after = concat_after
        if self.concat_after:
            # Projects concat(attention input, attention output) back to `size`.
            self.concat_linear = nn.Linear(size + size, size)

    def forward(self, x, mask, cache=None):
        """Compute encoded features.

        Args:
            x (torch.Tensor): Input tensor (#batch, time, size).
            mask (torch.Tensor): Mask tensor for the input, or None. When a
                cache is given it is indexed with three axes and only its
                last entry along the second axis is kept.
            cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size).

        Returns:
            torch.Tensor: Output tensor (#batch, time, size).
            torch.Tensor: Mask tensor (restricted to the last query position
                when a cache was given).

        """
        residual = x
        if self.normalize_before:
            x = self.norm1(x)

        if cache is None:
            x_q = x
        else:
            # Incremental mode: only the newest frame is used as the query;
            # the outputs for the earlier frames are taken from the cache.
            assert cache.shape == (x.shape[0], x.shape[1] - 1, self.size)
            x_q = x[:, -1:, :]
            residual = residual[:, -1:, :]
            mask = None if mask is None else mask[:, -1:, :]

        att = self.self_attn(x_q, x, x, mask)
        if self.concat_after:
            # Concatenate with the query (not the full input): the attention
            # output has the query's length, which is 1 in incremental mode.
            x_concat = torch.cat((x_q, att), dim=-1)
            x = residual + self.concat_linear(x_concat)
        else:
            x = residual + self.dropout(att)
        if not self.normalize_before:
            x = self.norm1(x)

        residual = x
        if self.normalize_before:
            x = self.norm2(x)
        x = residual + self.dropout(self.feed_forward(x))
        if not self.normalize_before:
            x = self.norm2(x)

        if cache is not None:
            # Restore the full time axis: cached frames followed by the new one.
            x = torch.cat([cache, x], dim=1)

        return x, mask


================================================
FILE: nets/pytorch_backend/transformer/encoder_mix.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright 2019 Shigeki Karita
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Encoder Mix definition."""

import torch

from espnet.nets.pytorch_backend.transducer.vgg2l import VGG2L
from espnet.nets.pytorch_backend.transformer.attention import MultiHeadedAttention
from espnet.nets.pytorch_backend.transformer.embedding import PositionalEncoding
from espnet.nets.pytorch_backend.transformer.encoder import Encoder
from espnet.nets.pytorch_backend.transformer.encoder_layer import EncoderLayer
from espnet.nets.pytorch_backend.transformer.repeat import repeat
from espnet.nets.pytorch_backend.transformer.subsampling import Conv2dSubsampling


class EncoderMix(Encoder, torch.nn.Module):
    """Transformer encoder module for multi-speaker (mixture) input.

    The embedded input goes through one speaker-differentiating stack per
    speaker (``encoders_sd``), and each result is then passed through the
    recognition stack inherited from ``Encoder`` (``encoders``), which is
    shared by all speakers.

    :param int idim: input dim
    :param int attention_dim: dimension of attention
    :param int attention_heads: the number of heads of multi head attention
    :param int linear_units: the number of units of position-wise feed forward
    :param int num_blocks_sd: the number of blocks in each speaker-differentiating
        stack
    :param int num_blocks_rec: the number of blocks in the shared recognition stack
    :param float dropout_rate: dropout rate
    :param float attention_dropout_rate: dropout rate in attention
    :param float positional_dropout_rate: dropout rate after adding positional encoding
    :param str or torch.nn.Module input_layer: input layer type
    :param class pos_enc_class: PositionalEncoding or ScaledPositionalEncoding
    :param bool normalize_before: whether to use layer_norm before the first block
    :param bool concat_after: whether to concat attention layer's input and output
        if True, additional linear will be applied.
        i.e. x -> x + linear(concat(x, att(x)))
        if False, no additional linear will be applied. i.e. x -> x + att(x)
    :param str positionwise_layer_type: linear of conv1d
    :param int positionwise_conv_kernel_size: kernel size of positionwise conv1d layer
    :param int padding_idx: padding_idx for input_layer=embed
    :param int num_spkrs: the number of speakers (one ``encoders_sd`` stack each)
    """

    def __init__(
        self,
        idim,
        attention_dim=256,
        attention_heads=4,
        linear_units=2048,
        num_blocks_sd=4,
        num_blocks_rec=8,
        dropout_rate=0.1,
        positional_dropout_rate=0.1,
        attention_dropout_rate=0.0,
        input_layer="conv2d",
        pos_enc_class=PositionalEncoding,
        normalize_before=True,
        concat_after=False,
        positionwise_layer_type="linear",
        positionwise_conv_kernel_size=1,
        padding_idx=-1,
        num_spkrs=2,
    ):
        """Construct an Encoder object."""
        # The parent builds the input layer and the shared recognition
        # stack (`self.encoders`) with `num_blocks_rec` blocks.
        super(EncoderMix, self).__init__(
            idim=idim,
            selfattention_layer_type="selfattn",
            attention_dim=attention_dim,
            attention_heads=attention_heads,
            linear_units=linear_units,
            num_blocks=num_blocks_rec,
            dropout_rate=dropout_rate,
            positional_dropout_rate=positional_dropout_rate,
            attention_dropout_rate=attention_dropout_rate,
            input_layer=input_layer,
            pos_enc_class=pos_enc_class,
            normalize_before=normalize_before,
            concat_after=concat_after,
            positionwise_layer_type=positionwise_layer_type,
            positionwise_conv_kernel_size=positionwise_conv_kernel_size,
            padding_idx=padding_idx,
        )
        positionwise_layer, positionwise_layer_args = self.get_positionwise_layer(
            positionwise_layer_type,
            attention_dim,
            linear_units,
            dropout_rate,
            positionwise_conv_kernel_size,
        )
        self.num_spkrs = num_spkrs
        # One independent stack of `num_blocks_sd` blocks per speaker.
        self.encoders_sd = torch.nn.ModuleList(
            [
                repeat(
                    num_blocks_sd,
                    lambda lnum: EncoderLayer(
                        attention_dim,
                        MultiHeadedAttention(
                            attention_heads, attention_dim, attention_dropout_rate
                        ),
                        positionwise_layer(*positionwise_layer_args),
                        dropout_rate,
                        normalize_before,
                        concat_after,
                    ),
                )
                for _ in range(num_spkrs)
            ]
        )

    def forward(self, xs, masks):
        """Encode input sequence.

        :param torch.Tensor xs: input tensor
        :param torch.Tensor masks: input mask
        :return: per-speaker lists of position embedded tensors and masks
        :rtype Tuple[List[torch.Tensor], List[torch.Tensor]]:
        """
        if isinstance(self.embed, (Conv2dSubsampling, VGG2L)):
            xs, masks = self.embed(xs, masks)
        else:
            xs = self.embed(xs)
        xs_sd, masks_sd = [None] * self.num_spkrs, [None] * self.num_spkrs

        for ns in range(self.num_spkrs):
            xs_sd[ns], masks_sd[ns] = self.encoders_sd[ns](xs, masks)
            xs_sd[ns], masks_sd[ns] = self.encoders(xs_sd[ns], masks_sd[ns])  # Enc_rec
            if self.normalize_before:
                xs_sd[ns] = self.after_norm(xs_sd[ns])
        return xs_sd, masks_sd

    def forward_one_step(self, xs, masks, cache=None):
        """Encode input frame.

        Every speaker is processed independently from the same embedded
        input: first by its own ``encoders_sd`` stack, then by the shared
        ``encoders`` stack. The result is returned per speaker, like
        ``forward`` does.

        :param torch.Tensor xs: input tensor
        :param torch.Tensor masks: input mask
        :param List[List[torch.Tensor]] cache: per-speaker cache lists, in the
            same layout as the returned cache (speaker-differentiating blocks
            first, then the shared blocks); None means no cache yet
        :return: per-speaker position embedded tensors, masks and new caches
        :rtype Tuple[List[torch.Tensor], List[torch.Tensor],
            List[List[torch.Tensor]]]:
        """
        # Same input-layer handling as forward().
        if isinstance(self.embed, (Conv2dSubsampling, VGG2L)):
            xs, masks = self.embed(xs, masks)
        else:
            xs = self.embed(xs)

        xs_sd, masks_sd, new_cache_sd = [], [], []
        for ns in range(self.num_spkrs):
            sd_layers = self.encoders_sd[ns]
            n_sd = len(sd_layers)
            if cache is None:
                spk_cache = [None] * (n_sd + len(self.encoders))
            else:
                spk_cache = cache[ns]

            # Each speaker starts again from the shared embedded input.
            xs_ns, masks_ns = xs, masks
            new_cache = []
            for c, e in zip(spk_cache[:n_sd], sd_layers):
                xs_ns, masks_ns = e(xs_ns, masks_ns, cache=c)
                new_cache.append(xs_ns)
            # The remaining cache entries belong to the shared stack.
            for c, e in zip(spk_cache[n_sd:], self.encoders):
                xs_ns, masks_ns = e(xs_ns, masks_ns, cache=c)
                new_cache.append(xs_ns)
            if self.normalize_before:
                xs_ns = self.after_norm(xs_ns)

            xs_sd.append(xs_ns)
            masks_sd.append(masks_ns)
            new_cache_sd.append(new_cache)
        return xs_sd, masks_sd, new_cache_sd


================================================
FILE: nets/pytorch_backend/transformer/initializer.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright 2019 Shigeki Karita
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Parameter initialization."""

import torch

from espnet.nets.pytorch_backend.transformer.layer_norm import LayerNorm


def initialize(model, init_type="pytorch"):
    """Initialize Transformer module.

    With ``init_type="pytorch"`` the model is left untouched. Otherwise all
    parameters with more than one dimension are re-initialized with the
    chosen scheme, all one-dimensional parameters are zeroed, and embedding
    and layer-norm modules are reset to their own defaults afterwards.

    :param torch.nn.Module model: transformer instance
    :param str init_type: initialization type, one of "pytorch",
        "xavier_uniform", "xavier_normal", "kaiming_uniform", "kaiming_normal"
    :raises ValueError: if ``init_type`` is unknown and the model has a
        parameter with more than one dimension
    """
    if init_type == "pytorch":
        return

    weight_inits = {
        "xavier_uniform": torch.nn.init.xavier_uniform_,
        "xavier_normal": torch.nn.init.xavier_normal_,
        "kaiming_uniform": lambda w: torch.nn.init.kaiming_uniform_(
            w, nonlinearity="relu"
        ),
        "kaiming_normal": lambda w: torch.nn.init.kaiming_normal_(
            w, nonlinearity="relu"
        ),
    }

    # weight init
    for param in model.parameters():
        if param.dim() <= 1:
            continue
        init_fn = weight_inits.get(init_type)
        if init_fn is None:
            raise ValueError("Unknown initialization: " + init_type)
        init_fn(param.data)

    # bias init
    for param in model.parameters():
        if param.dim() == 1:
            param.data.zero_()

    # reset some modules with default init
    default_init_types = (torch.nn.Embedding, LayerNorm)
    for module in model.modules():
        if isinstance(module, default_init_types):
            module.reset_parameters()


================================================
FILE: nets/pytorch_backend/transformer/label_smoothing_loss.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright 2019 Shigeki Karita
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Label smoothing module."""

import torch
from torch import nn


class LabelSmoothingLoss(nn.Module):
    """Label-smoothing loss.

    :param int size: the number of class
    :param int padding_idx: ignored class id
    :param float smoothing: smoothing rate (0.0 means the conventional CE)
    :param bool normalize_length: divide the summed loss by the number of
        non-padding targets if True, by the batch size otherwise
    :param torch.nn.Module criterion: loss function to be smoothed
    """

    def __init__(
        self,
        size,
        padding_idx,
        smoothing,
        normalize_length=False,
        criterion=nn.KLDivLoss(reduction="none"),
    ):
        """Construct an LabelSmoothingLoss object."""
        super(LabelSmoothingLoss, self).__init__()
        self.size = size
        self.padding_idx = padding_idx
        self.smoothing = smoothing
        # Probability mass kept on the reference class.
        self.confidence = 1.0 - smoothing
        self.normalize_length = normalize_length
        self.criterion = criterion
        self.true_dist = None

    def forward(self, x, target):
        """Compute loss between x and target.

        :param torch.Tensor x: prediction (batch, seqlen, class)
        :param torch.Tensor target:
            target signal masked with self.padding_id (batch, seqlen)
        :return: scalar float value
        :rtype torch.Tensor
        """
        assert x.size(2) == self.size
        n_batch = x.size(0)
        logits = x.view(-1, self.size)
        labels = target.view(-1)
        with torch.no_grad():
            pad_positions = labels == self.padding_idx  # (B,)
            n_valid = len(labels) - pad_positions.sum().item()
            # Padding labels are replaced by class 0 so they can be used as
            # scatter indices; their rows are zeroed out of the loss below.
            safe_labels = labels.masked_fill(pad_positions, 0)
            smoothed = torch.full_like(logits, self.smoothing / (self.size - 1))
            smoothed.scatter_(1, safe_labels.unsqueeze(1), self.confidence)
        log_probs = torch.log_softmax(logits, dim=1)
        per_class = self.criterion(log_probs, smoothed)
        if self.normalize_length:
            denom = n_valid
        else:
            denom = n_batch
        return per_class.masked_fill(pad_positions.unsqueeze(1), 0).sum() / denom


================================================
FILE: nets/pytorch_backend/transformer/layer_norm.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright 2019 Shigeki Karita
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Layer normalization module."""

import torch


class LayerNorm(torch.nn.LayerNorm):
    """Layer normalization module.

    Thin wrapper around ``torch.nn.LayerNorm`` (with ``eps=1e-12``) that can
    normalize over an axis other than the last one.

    Args:
        nout (int): Output dim size.
        dim (int): Dimension to be normalized.

    """

    def __init__(self, nout, dim=-1):
        """Construct an LayerNorm object."""
        super().__init__(nout, eps=1e-12)
        self.dim = dim

    def forward(self, x):
        """Apply layer normalization.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            torch.Tensor: Normalized tensor.

        """
        normalize = super().forward
        if self.dim != -1:
            # Move the target axis to the end, normalize, then move it back.
            swapped = x.transpose(self.dim, -1)
            return normalize(swapped).transpose(self.dim, -1)
        return normalize(x)


================================================
FILE: nets/pytorch_backend/transformer/lightconv.py
================================================
"""Lightweight Convolution Module."""

import numpy
import torch
from torch import nn
import torch.nn.functional as F


# Most negative finite float32 value, stored as a Python float.
# NOTE(review): presumably used as a "minus infinity" fill value when
# masking -- confirm at the use sites.
MIN_VALUE = float(numpy.finfo(numpy.float32).min)


class LightweightConvolution(nn.Module):
    """Lightweight Convolution layer.

    This implementation is based on
    https://github.com/pytorch/fairseq/tree/master/fairseq

    The layer computes ``linear -> GLU -> lightconv -> linear``.  The
    convolution kernel is softmax-normalized along its length and shared
    across groups of channels (``wshare`` distinct kernels).

    Args:
        wshare (int): the number of kernel of convolution
        n_feat (int): the number of features
        dropout_rate (float): dropout_rate
        kernel_size (int): kernel size (length)
        use_kernel_mask (bool): Use causal mask or not for convolution kernel
        use_bias (bool): Use bias term or not.

    """

    def __init__(
        self,
        wshare,
        n_feat,
        dropout_rate,
        kernel_size,
        use_kernel_mask=False,
        use_bias=False,
    ):
        """Construct Lightweight Convolution layer."""
        super(LightweightConvolution, self).__init__()

        assert n_feat % wshare == 0
        self.wshare = wshare
        self.use_kernel_mask = use_kernel_mask
        self.dropout_rate = dropout_rate
        self.kernel_size = kernel_size
        self.padding_size = int(kernel_size / 2)

        # linear -> GLU -> lightconv -> linear
        self.linear1 = nn.Linear(n_feat, n_feat * 2)
        self.linear2 = nn.Linear(n_feat, n_feat)
        self.act = nn.GLU()

        # lightconv related
        self.weight = nn.Parameter(
            torch.Tensor(self.wshare, 1, kernel_size).uniform_(0, 1)
        )
        self.use_bias = use_bias
        if self.use_bias:
            # Start from zeros: ``torch.Tensor(n_feat)`` would hand back
            # uninitialized memory, which can contain arbitrary values
            # (including NaN/inf) and make training non-reproducible.
            self.bias = nn.Parameter(torch.zeros(n_feat))

        # mask of kernel: keep the first ``kernel_size // 2 + 1`` taps and
        # disable the remaining ``kernel_size // 2`` taps.
        kernel_mask0 = torch.zeros(self.wshare, int(kernel_size / 2))
        kernel_mask1 = torch.ones(self.wshare, int(kernel_size / 2 + 1))
        self.kernel_mask = torch.cat((kernel_mask1, kernel_mask0), dim=-1).unsqueeze(1)

    def forward(self, query, key, value, mask):
        """Forward of 'Lightweight Convolution'.

        This function takes query, key and value but uses only query.
        This is just for compatibility with self-attention layer (attention.py)

        Args:
            query (torch.Tensor): (batch, time1, d_model) input tensor
            key (torch.Tensor): (batch, time2, d_model) NOT USED
            value (torch.Tensor): (batch, time2, d_model) NOT USED
            mask (torch.Tensor): (batch, time1, time2) mask; it is transposed
                on its last two axes and broadcast against the
                (batch, time1, d_model) output, so presumably it is
                (batch, 1, time1) in practice -- TODO confirm with callers.
                Ignored when ``use_kernel_mask`` is set or when ``None``.

        Return:
            x (torch.Tensor): (batch, time1, d_model) output

        """
        # linear -> GLU -> lightconv -> linear
        x = query
        B, T, C = x.size()
        H = self.wshare

        # first linear layer
        x = self.linear1(x)

        # GLU activation (halves the last dimension back to C)
        x = self.act(x)

        # lightconv: fold channels so that each of the H kernels is applied
        # to its own group of channels by a grouped conv1d.
        x = x.transpose(1, 2).contiguous().view(-1, H, T)  # B x C x T
        weight = F.dropout(self.weight, self.dropout_rate, training=self.training)
        if self.use_kernel_mask:
            # kernel_mask is a plain attribute (not a buffer), so it has to
            # be moved to the input device by hand.
            self.kernel_mask = self.kernel_mask.to(x.device)
            weight = weight.masked_fill(self.kernel_mask == 0.0, float("-inf"))
        # normalize each kernel so that its taps sum to one
        weight = F.softmax(weight, dim=-1)
        x = F.conv1d(x, weight, padding=self.padding_size, groups=self.wshare).view(
            B, C, T
        )
        if self.use_bias:
            x = x + self.bias.view(1, -1, 1)
        x = x.transpose(1, 2)  # B x T x C

        if mask is not None and not self.use_kernel_mask:
            mask = mask.transpose(-1, -2)
            x = x.masked_fill(mask == 0, 0.0)

        # second linear layer
        x = self.linear2(x)
        return x


================================================
FILE: nets/pytorch_backend/transformer/lightconv2d.py
================================================
"""Lightweight 2-Dimentional Convolution module."""

import numpy
import torch
from torch import nn
import torch.nn.functional as F


# Most negative finite float32 value, stored as a plain Python float.
MIN_VALUE = float(numpy.finfo(numpy.float32).min)


class LightweightConvolution2D(nn.Module):
    """Lightweight 2-Dimensional Convolution layer.

    This implementation is based on
    https://github.com/pytorch/fairseq/tree/master/fairseq

    In addition to the time-axis lightweight convolution, a single
    softmax-normalized kernel is convolved along the feature axis; both
    results are concatenated before the output projection.

    Args:
        wshare (int): the number of kernel of convolution
        n_feat (int): the number of features
        dropout_rate (float): dropout_rate
        kernel_size (int): kernel size (length)
        use_kernel_mask (bool): Use causal mask or not for convolution kernel
        use_bias (bool): Use bias term or not.

    """

    def __init__(
        self,
        wshare,
        n_feat,
        dropout_rate,
        kernel_size,
        use_kernel_mask=False,
        use_bias=False,
    ):
        """Construct Lightweight 2-Dimensional Convolution layer."""
        super(LightweightConvolution2D, self).__init__()

        assert n_feat % wshare == 0
        self.wshare = wshare
        self.use_kernel_mask = use_kernel_mask
        self.dropout_rate = dropout_rate
        self.kernel_size = kernel_size
        self.padding_size = int(kernel_size / 2)

        # linear -> GLU -> lightconv -> linear
        self.linear1 = nn.Linear(n_feat, n_feat * 2)
        # input is twice as wide because time- and frequency-axis outputs
        # are concatenated in forward()
        self.linear2 = nn.Linear(n_feat * 2, n_feat)
        self.act = nn.GLU()

        # lightconv related
        self.weight = nn.Parameter(
            torch.Tensor(self.wshare, 1, kernel_size).uniform_(0, 1)
        )
        self.weight_f = nn.Parameter(torch.Tensor(1, 1, kernel_size).uniform_(0, 1))
        self.use_bias = use_bias
        if self.use_bias:
            # Start from zeros: ``torch.Tensor(n_feat)`` would hand back
            # uninitialized memory, which can contain arbitrary values
            # (including NaN/inf) and make training non-reproducible.
            self.bias = nn.Parameter(torch.zeros(n_feat))

        # mask of kernel: keep the first ``kernel_size // 2 + 1`` taps and
        # disable the remaining ``kernel_size // 2`` taps.
        kernel_mask0 = torch.zeros(self.wshare, int(kernel_size / 2))
        kernel_mask1 = torch.ones(self.wshare, int(kernel_size / 2 + 1))
        self.kernel_mask = torch.cat((kernel_mask1, kernel_mask0), dim=-1).unsqueeze(1)

    def forward(self, query, key, value, mask):
        """Forward of 'Lightweight 2-Dimensional Convolution'.

        This function takes query, key and value but uses only query.
        This is just for compatibility with self-attention layer (attention.py)

        Args:
            query (torch.Tensor): (batch, time1, d_model) input tensor
            key (torch.Tensor): (batch, time2, d_model) NOT USED
            value (torch.Tensor): (batch, time2, d_model) NOT USED
            mask (torch.Tensor): (batch, time1, time2) mask; it is transposed
                on its last two axes and broadcast against the concatenated
                (batch, time1, 2 * d_model) features, so presumably it is
                (batch, 1, time1) in practice -- TODO confirm with callers.
                Ignored when ``use_kernel_mask`` is set or when ``None``.

        Return:
            x (torch.Tensor): (batch, time1, d_model) output

        """
        # linear -> GLU -> lightconv -> linear
        x = query
        B, T, C = x.size()
        H = self.wshare

        # first linear layer
        x = self.linear1(x)

        # GLU activation (halves the last dimension back to C)
        x = self.act(x)

        # convolution along frequency axis: every (batch, time) position is
        # treated as its own channel and filtered with the same kernel.
        weight_f = F.softmax(self.weight_f, dim=-1)
        weight_f = F.dropout(weight_f, self.dropout_rate, training=self.training)
        weight_new = torch.zeros(
            B * T, 1, self.kernel_size, device=x.device, dtype=x.dtype
        ).copy_(weight_f)
        xf = F.conv1d(
            x.view(1, B * T, C), weight_new, padding=self.padding_size, groups=B * T
        ).view(B, T, C)

        # lightconv along the time axis
        x = x.transpose(1, 2).contiguous().view(-1, H, T)  # B x C x T
        weight = F.dropout(self.weight, self.dropout_rate, training=self.training)
        if self.use_kernel_mask:
            # kernel_mask is a plain attribute (not a buffer), so it has to
            # be moved to the input device by hand.
            self.kernel_mask = self.kernel_mask.to(x.device)
            weight = weight.masked_fill(self.kernel_mask == 0.0, float("-inf"))
        weight = F.softmax(weight, dim=-1)
        x = F.conv1d(x, weight, padding=self.padding_size, groups=self.wshare).view(
            B, C, T
        )
        if self.use_bias:
            x = x + self.bias.view(1, -1, 1)
        x = x.transpose(1, 2)  # B x T x C
        x = torch.cat((x, xf), -1)  # B x T x Cx2

        if mask is not None and not self.use_kernel_mask:
            mask = mask.transpose(-1, -2)
            x = x.masked_fill(mask == 0, 0.0)

        # second linear layer
        x = self.linear2(x)
        return x


================================================
FILE: nets/pytorch_backend/transformer/mask.py
================================================
# -*- coding: utf-8 -*-

# Copyright 2019 Shigeki Karita
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Mask module."""

from distutils.version import LooseVersion

import torch

# True when the installed torch release is 1.2.0 or newer.
is_torch_1_2_plus = LooseVersion(torch.__version__) >= LooseVersion("1.2.0")
# LooseVersion('1.2.0') == LooseVersion(torch.__version__) can't include e.g. 1.2.0+aaa
# True only for the 1.2.x series (>= 1.2 and < 1.3).
is_torch_1_2 = (
    LooseVersion("1.3") > LooseVersion(torch.__version__) >= LooseVersion("1.2")
)
# Default mask dtype: torch.bool on torch >= 1.2, torch.uint8 on older releases.
datatype = torch.bool if is_torch_1_2_plus else torch.uint8


def subsequent_mask(size, device="cpu", dtype=datatype):
    """Build a lower-triangular mask of shape (size, size).

    Entry ``[i, j]`` is non-zero when ``j <= i``, i.e. every step may see
    itself and earlier steps only.

    :param int size: size of mask
    :param str device: "cpu" or "cuda" or torch.Tensor.device
    :param torch.dtype dtype: result dtype
    :rtype: torch.Tensor
    >>> subsequent_mask(3)
    [[1, 0, 0],
     [1, 1, 0],
     [1, 1, 1]]
    """
    # torch=1.2 doesn't support tril for bool tensor, so the mask is built
    # as uint8 there and converted afterwards.
    via_uint8 = is_torch_1_2 and dtype == torch.bool
    build_dtype = torch.uint8 if via_uint8 else dtype
    ret = torch.ones(size, size, device=device, dtype=build_dtype)
    lower = torch.tril(ret, out=ret)
    if via_uint8:
        return lower.type(dtype)
    return lower


def target_mask(ys_in_pad, ignore_id):
    """Create mask for decoder self-attention.

    Combines a padding mask (positions different from ``ignore_id``) with the
    lower-triangular mask from ``subsequent_mask``.

    :param torch.Tensor ys_in_pad: batch of padded target sequences (B, Lmax)
    :param int ignore_id: index of padding
    :rtype: torch.Tensor (B, Lmax, Lmax)
    """
    not_pad = ys_in_pad != ignore_id
    length = not_pad.size(-1)
    causal = subsequent_mask(length, device=not_pad.device).unsqueeze(0)
    # (B, 1, Lmax) & (1, Lmax, Lmax) broadcasts to (B, Lmax, Lmax)
    return not_pad.unsqueeze(-2) & causal


================================================
FILE: nets/pytorch_backend/transformer/multi_layer_conv.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright 2019 Tomoki Hayashi
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Layer modules for FFT block in FastSpeech (Feed-forward Transformer)."""

import torch


class MultiLayeredConv1d(torch.nn.Module):
    """Two stacked conv1d layers for a Transformer block.

    Designed to replace the positionwise feed-forward network in a
    Transformer block, as introduced in
    `FastSpeech: Fast, Robust and Controllable Text to Speech`_.

    .. _`FastSpeech: Fast, Robust and Controllable Text to Speech`:
        https://arxiv.org/pdf/1905.09263.pdf

    """

    def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate):
        """Initialize MultiLayeredConv1d module.

        Args:
            in_chans (int): Number of input channels.
            hidden_chans (int): Number of hidden channels.
            kernel_size (int): Kernel size of conv1d.
            dropout_rate (float): Dropout rate.

        """
        super(MultiLayeredConv1d, self).__init__()
        # both convolutions share the same stride-1 padding
        pad = (kernel_size - 1) // 2
        self.w_1 = torch.nn.Conv1d(
            in_chans, hidden_chans, kernel_size, stride=1, padding=pad
        )
        self.w_2 = torch.nn.Conv1d(
            hidden_chans, in_chans, kernel_size, stride=1, padding=pad
        )
        self.dropout = torch.nn.Dropout(dropout_rate)

    def forward(self, x):
        """Calculate forward propagation.

        Args:
            x (torch.Tensor): Batch of input tensors (B, T, in_chans).

        Returns:
            torch.Tensor: Batch of output tensors (B, T, in_chans).

        """
        # Conv1d works on (B, C, T); swap in and out around each layer.
        hidden = self.w_1(x.transpose(-1, 1))
        hidden = torch.relu(hidden).transpose(-1, 1)
        hidden = self.dropout(hidden)
        projected = self.w_2(hidden.transpose(-1, 1))
        return projected.transpose(-1, 1)


class Conv1dLinear(torch.nn.Module):
    """Conv1D followed by Linear for a Transformer block.

    A variant of MultiLayeredConv1d in which the second conv layer is
    replaced by a linear layer.

    """

    def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate):
        """Initialize Conv1dLinear module.

        Args:
            in_chans (int): Number of input channels.
            hidden_chans (int): Number of hidden channels.
            kernel_size (int): Kernel size of conv1d.
            dropout_rate (float): Dropout rate.

        """
        super(Conv1dLinear, self).__init__()
        pad = (kernel_size - 1) // 2
        self.w_1 = torch.nn.Conv1d(
            in_chans, hidden_chans, kernel_size, stride=1, padding=pad
        )
        self.w_2 = torch.nn.Linear(hidden_chans, in_chans)
        self.dropout = torch.nn.Dropout(dropout_rate)

    def forward(self, x):
        """Calculate forward propagation.

        Args:
            x (torch.Tensor): Batch of input tensors (B, T, in_chans).

        Returns:
            torch.Tensor: Batch of output tensors (B, T, in_chans).

        """
        # Conv1d works on (B, C, T); the linear layer works on (B, T, C).
        hidden = self.w_1(x.transpose(-1, 1))
        hidden = torch.relu(hidden).transpose(-1, 1)
        return self.w_2(self.dropout(hidden))


================================================
FILE: nets/pytorch_backend/transformer/optimizer.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright 2019 Shigeki Karita
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Optimizer module."""

import torch


class NoamOpt(object):
    """Optimizer wrapper that applies the Noam learning-rate schedule.

    On every ``step()`` the rate
    ``factor * model_size^-0.5 * min(step^-0.5, step * warmup^-1.5)``
    is written to all parameter groups of the wrapped optimizer.
    """

    def __init__(self, model_size, factor, warmup, optimizer):
        """Construct an NoamOpt object."""
        self.optimizer = optimizer
        self.model_size = model_size
        self.factor = factor
        self.warmup = warmup
        self._step = 0
        self._rate = 0

    @property
    def param_groups(self):
        """Return the wrapped optimizer's param_groups."""
        return self.optimizer.param_groups

    def step(self):
        """Advance the step counter, set the new rate and update parameters."""
        self._step += 1
        new_rate = self.rate()
        for group in self.optimizer.param_groups:
            group["lr"] = new_rate
        self._rate = new_rate
        self.optimizer.step()

    def rate(self, step=None):
        """Return the scheduled rate for ``step`` (current step if ``None``)."""
        current = self._step if step is None else step
        scale = self.factor * self.model_size ** (-0.5)
        return scale * min(current ** (-0.5), current * self.warmup ** (-1.5))

    def zero_grad(self):
        """Reset gradients of the wrapped optimizer."""
        self.optimizer.zero_grad()

    def state_dict(self):
        """Return the schedule state together with the optimizer state."""
        state = {
            "_step": self._step,
            "warmup": self.warmup,
            "factor": self.factor,
            "model_size": self.model_size,
            "_rate": self._rate,
        }
        state["optimizer"] = self.optimizer.state_dict()
        return state

    def load_state_dict(self, state_dict):
        """Restore state produced by ``state_dict``."""
        for key, value in state_dict.items():
            if key != "optimizer":
                # every other entry is stored directly as an attribute
                setattr(self, key, value)
                continue
            self.optimizer.load_state_dict(value)


def get_std_opt(model_params, d_model, warmup, factor):
    """Return an Adam optimizer (betas=(0.9, 0.98), eps=1e-9) wrapped in NoamOpt."""
    adam = torch.optim.Adam(model_params, lr=0, betas=(0.9, 0.98), eps=1e-9)
    return NoamOpt(
        model_size=d_model, factor=factor, warmup=warmup, optimizer=adam
    )


================================================
FILE: nets/pytorch_backend/transformer/plot.py
================================================
# Copyright 2019 Shigeki Karita
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

import logging

import matplotlib.pyplot as plt
import numpy
import os

from espnet.asr import asr_utils


def _plot_and_save_attention(att_w, filename, xtokens=None, ytokens=None):
    """Draw one attention map per head side by side and return the figure.

    Despite its name this function does not write the image itself; it only
    makes sure the directory of ``filename`` exists and returns the figure,
    leaving the saving to the caller.

    Args:
        att_w: Sequence of 2-D attention maps, one per head; each entry must
            provide ``astype`` (e.g. a numpy array).
        filename (str): Target path; only its directory part is used here.
        xtokens (list[str] | None): Tick labels for the x axis ("Input").
        ytokens (list[str] | None): Tick labels for the y axis ("Output").

    Returns:
        matplotlib.figure.Figure: Figure with ``len(att_w)`` subplots.

    """
    # dynamically import matplotlib due to not found error
    from matplotlib.ticker import MaxNLocator

    d = os.path.dirname(filename)
    # A bare file name has an empty dirname, for which os.makedirs would
    # raise; exist_ok also avoids a race between the check and the creation
    # when several processes plot into the same directory.
    if d:
        os.makedirs(d, exist_ok=True)
    w, h = plt.figaspect(1.0 / len(att_w))
    fig = plt.Figure(figsize=(w * 2, h * 2))
    axes = fig.subplots(1, len(att_w))
    if len(att_w) == 1:
        # subplots() returns a single Axes (not an array) for one column
        axes = [axes]
    for ax, aw in zip(axes, att_w):
        # plt.subplot(1, len(att_w), h)
        ax.imshow(aw.astype(numpy.float32), aspect="auto")
        ax.set_xlabel("Input")
        ax.set_ylabel("Output")
        ax.xaxis.set_major_locator(MaxNLocator(integer=True))
        ax.yaxis.set_major_locator(MaxNLocator(integer=True))
        # Labels for major ticks
        # NOTE(review): one more label than major tick positions is passed
        # below -- confirm the matplotlib version in use accepts that.
        if xtokens is not None:
            ax.set_xticks(numpy.linspace(0, len(xtokens) - 1, len(xtokens)))
            ax.set_xticks(numpy.linspace(0, len(xtokens) - 1, 1), minor=True)
            ax.set_xticklabels(xtokens + [""], rotation=40)
        if ytokens is not None:
            ax.set_yticks(numpy.linspace(0, len(ytokens) - 1, len(ytokens)))
            ax.set_yticks(numpy.linspace(0, len(ytokens) - 1, 1), minor=True)
            ax.set_yticklabels(ytokens + [""])
    fig.tight_layout()
    return fig


def savefig(plot, filename):
    """Save ``plot`` to ``filename`` and clear the current pyplot figure."""
    plot.savefig(filename)
    plt.clf()


def plot_multi_head_attention(
    data,
    uttid_list,
    attn_dict,
    outdir,
    suffix="png",
    savefn=savefig,
    ikey="input",
    iaxis=0,
    okey="output",
    oaxis=0,
    subsampling_factor=4,
):
    """Plot multi head attentions.

    For every attention name and every utterance, the attention weights are
    cropped to the utterance's encoder/decoder lengths on their last two
    axes, drawn, and handed to ``savefn`` together with the target filename
    ``<outdir>/<uttid>.<name>.<suffix>``.

    :param dict data: utts info from json file
    :param List uttid_list: utterance IDs
    :param dict[str, torch.Tensor] attn_dict: multi head attention dict.
        each value is iterated per utterance (same order as ``uttid_list``)
    :param str outdir: dir to save fig
    :param str suffix: filename suffix including image type (e.g., png)
    :param savefn: function to save, called as ``savefn(fig, filename)``
    :param str ikey: key to access input
    :param int iaxis: dimension to access input
    :param str okey: key to access output
    :param int oaxis: dimension to access output
    :param subsampling_factor: subsampling factor in encoder

    """
    for name, att_ws in attn_dict.items():
        for idx, att_w in enumerate(att_ws):
            uttid = uttid_list[idx]
            data_i = data[uttid]
            filename = "%s/%s.%s.%s" % (outdir, uttid, name, suffix)
            out_info = data_i[okey][oaxis]
            dec_len = int(out_info["shape"][0]) + 1  # +1 for <eos>
            in_info = data_i[ikey][iaxis]
            enc_len = int(in_info["shape"][0])
            # an input carrying "token" is text, i.e. an MT sample
            is_mt = "token" in in_info.keys()
            if not is_mt:
                # for ASR/ST the encoder length shrinks by the subsampling
                enc_len //= subsampling_factor
            xtokens = None
            ytokens = None
            if "encoder" in name:
                att_w = att_w[:, :enc_len, :enc_len]
                if is_mt:
                    # for MT
                    xtokens = in_info["token"].split()
                    ytokens = list(xtokens)
            elif "decoder" in name:
                if "self" in name:
                    # self-attention
                    att_w = att_w[:, :dec_len, :dec_len]
                    if "token" in out_info.keys():
                        out_tokens = out_info["token"].split()
                        ytokens = out_tokens + ["<eos>"]
                        xtokens = ["<sos>"] + out_tokens
                else:
                    # cross-attention
                    att_w = att_w[:, :dec_len, :enc_len]
                    if "token" in out_info.keys():
                        ytokens = out_info["token"].split() + ["<eos>"]
                    if is_mt:
                        # for MT
                        xtokens = in_info["token"].split()
            else:
                logging.warning("unknown name for shaping attention")
            fig = _plot_and_save_attention(att_w, filename, xtokens, ytokens)
            savefn(fig, filename)


class PlotAttentionReport(asr_utils.PlotAttentionReport):
    """Attention reporter that plots through ``plot_multi_head_attention``.

    Relies on attributes such as ``ikey``, ``iaxis``, ``okey``, ``oaxis``,
    ``factor``, ``data``, ``data_dict``, ``outdir``, ``transform``,
    ``converter``, ``device`` and ``att_vis_fn`` -- presumably set by the
    base class constructor (not visible here).
    """

    def plotfn(self, *args, **kwargs):
        """Plot with this reporter's key/axis/subsampling settings forced in."""
        kwargs.update(
            ikey=self.ikey,
            iaxis=self.iaxis,
            okey=self.okey,
            oaxis=self.oaxis,
            subsampling_factor=self.factor,
        )
        plot_multi_head_attention(*args, **kwargs)

    def __call__(self, trainer):
        """Plot current attention weights into ``outdir``, tagged by epoch."""
        attn_dict, uttid_list = self.get_attention_weights()
        epoch_suffix = "ep.{.updater.epoch}.png".format(trainer)
        self.plotfn(
            self.data_dict, uttid_list, attn_dict, self.outdir, epoch_suffix, savefig
        )

    def get_attention_weights(self):
        """Return ``(attention weights, utterance ids)`` for ``self.data``."""
        transformed, uttid_list = self.transform(self.data, return_uttid=True)
        batch = self.converter([transformed], self.device)
        # the converter may yield positional (tuple) or keyword (dict) inputs
        if isinstance(batch, tuple):
            att_ws = self.att_vis_fn(*batch)
        elif isinstance(batch, dict):
            att_ws = self.att_vis_fn(**batch)
        return att_ws, uttid_list

    def log_attentions(self, logger, step):
        """Send attention figures to ``logger.add_figure`` instead of disk."""

        def add_to_logger(plot, filename):
            logger.add_figure(os.path.basename(filename), plot, step)
            plt.clf()

        attn_dict, uttid_list = self.get_attention_weights()
        self.plotfn(
            self.data_dict, uttid_list, attn_dict, self.outdir, "", add_to_logger
        )


================================================
FILE: nets/pytorch_backend/transformer/positionwise_feed_forward.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright 2019 Shigeki Karita
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Positionwise feed forward layer definition."""

import torch


class PositionwiseFeedForward(torch.nn.Module):
    """Positionwise feed forward layer.

    Computes ``w_2(dropout(activation(w_1(x))))``.

    Args:
        idim (int): Input dimension.
        hidden_units (int): The number of hidden units.
        dropout_rate (float): Dropout rate.
        activation (torch.nn.Module): Activation applied after the first
            linear layer.

    """

    def __init__(self, idim, hidden_units, dropout_rate, activation=torch.nn.ReLU()):
        """Construct an PositionwiseFeedForward object."""
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = torch.nn.Linear(idim, hidden_units)
        self.w_2 = torch.nn.Linear(hidden_units, idim)
        self.dropout = torch.nn.Dropout(dropout_rate)
        self.activation = activation

    def forward(self, x):
        """Apply expand -> activation -> dropout -> project to ``x``."""
        hidden = self.activation(self.w_1(x))
        hidden = self.dropout(hidden)
        return self.w_2(hidden)


================================================
FILE: nets/pytorch_backend/transformer/repeat.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright 2019 Shigeki Karita
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Repeat the same layer definition."""

import torch


class MultiSequential(torch.nn.Sequential):
    """Multi-input multi-output torch.nn.Sequential.

    Each contained module receives the previous module's outputs unpacked
    as positional arguments.
    """

    def forward(self, *args):
        """Feed ``args`` through every module in order and return the result."""
        outputs = args
        for layer in self:
            outputs = layer(*outputs)
        return outputs


def repeat(N, fn):
    """Build a MultiSequential out of N generated modules.

    Args:
        N (int): Number of repeat time.
        fn (Callable): Function to generate module; called as ``fn(n)`` for
            ``n`` in ``0 .. N-1``.

    Returns:
        MultiSequential: Repeated model instance.

    """
    layers = [fn(index) for index in range(N)]
    return MultiSequential(*layers)


================================================
FILE: nets/pytorch_backend/transformer/sgd_optimizer.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright 2019 Shigeki Karita
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Optimizer module."""

import torch


class NoamOpt(object):
    """Optimizer wrapper that tracks the Noam rate without applying it.

    The rate ``factor * model_size^-0.5 * min(step^-0.5, step * warmup^-1.5)``
    is computed on every ``step()`` and kept in ``_rate``, but it is not
    written into the wrapped optimizer's parameter groups.
    """

    def __init__(self, model_size, factor, warmup, optimizer):
        """Construct an NoamOpt object."""
        self.optimizer = optimizer
        self.model_size = model_size
        self.factor = factor
        self.warmup = warmup
        self._step = 0
        self._rate = 0

    @property
    def param_groups(self):
        """Return the wrapped optimizer's param_groups."""
        return self.optimizer.param_groups

    def step(self):
        """Advance the step counter, record the rate and update parameters."""
        self._step += 1
        # The scheduled rate is only recorded; the param groups' "lr" is
        # left as the wrapped optimizer has it.
        self._rate = self.rate()
        self.optimizer.step()

    def rate(self, step=None):
        """Return the scheduled rate for ``step`` (current step if ``None``)."""
        current = self._step if step is None else step
        scale = self.factor * self.model_size ** (-0.5)
        return scale * min(current ** (-0.5), current * self.warmup ** (-1.5))

    def zero_grad(self):
        """Reset gradients of the wrapped optimizer."""
        self.optimizer.zero_grad()

    def state_dict(self):
        """Return the schedule state together with the optimizer state."""
        state = {
            "_step": self._step,
            "warmup": self.warmup,
            "factor": self.factor,
            "model_size": self.model_size,
            "_rate": self._rate,
        }
        state["optimizer"] = self.optimizer.state_dict()
        return state

    def load_state_dict(self, state_dict):
        """Restore state produced by ``state_dict``."""
        for key, value in state_dict.items():
            if key != "optimizer":
                # every other entry is stored directly as an attribute
                setattr(self, key, value)
                continue
            self.optimizer.load_state_dict(value)


def get_sgd_opt(model_params, d_model, warmup, factor):
    """Get standard SGD optimizer with NOAM scheduling.

    The base optimizer is plain SGD with ``lr=1.0`` and no momentum; per the
    original note, Adam is then implemented by a global optimizer (outside
    this function).
    """
    sgd = torch.optim.SGD(model_params, lr=1.0)  # No momentum
    return NoamOpt(
        model_size=d_model, factor=factor, warmup=warmup, optimizer=sgd
    )


================================================
FILE: nets/pytorch_backend/transformer/subsampling.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright 2019 Shigeki Karita
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Subsampling layer definition."""

import torch

from espnet.nets.pytorch_backend.transformer.embedding import PositionalEncoding


class TooShortUttError(Exception):
    """Error signalling an utterance too short to be subsampled.

    Args:
        message (str): Message for error catch
        actual_size (int): the short size that cannot pass the subsampling
        limit (int): the limit size for subsampling

    """

    def __init__(self, message, actual_size, limit):
        """Store the offending size and the required limit on the error."""
        super(TooShortUttError, self).__init__(message)
        self.limit = limit
        self.actual_size = actual_size


def check_short_utt(ins, size):
    """Check if the utterance is too short for subsampling.

    Returns ``(True, limit)`` when ``ins`` is one of the known subsampling
    layers and ``size`` is below that layer's limit, otherwise
    ``(False, -1)``.
    """
    limits = (
        (Conv2dSubsampling, 7),
        (Conv2dSubsampling6, 11),
        (Conv2dSubsampling8, 15),
    )
    for layer_cls, limit in limits:
        if isinstance(ins, layer_cls) and size < limit:
            return True, limit
    return False, -1


class Conv2dSubsampling(torch.nn.Module):
    """Convolutional 2D subsampling (to 1/4 length).

    Two unpadded 3x3 convolutions with stride 2, followed by a linear
    projection and a positional encoding.

    Args:
        idim (int): Input dimension.
        odim (int): Output dimension.
        dropout_rate (float): Dropout rate.
        pos_enc (torch.nn.Module): Custom position encoding layer.

    """

    def __init__(self, idim, odim, dropout_rate, pos_enc=None):
        """Construct an Conv2dSubsampling object."""
        super(Conv2dSubsampling, self).__init__()
        conv_layers = [
            torch.nn.Conv2d(1, odim, 3, 2),
            torch.nn.ReLU(),
            torch.nn.Conv2d(odim, odim, 3, 2),
            torch.nn.ReLU(),
        ]
        self.conv = torch.nn.Sequential(*conv_layers)
        # feature-axis size left after both convolutions
        freq_out = ((idim - 1) // 2 - 1) // 2
        projection = torch.nn.Linear(odim * freq_out, odim)
        if pos_enc is None:
            pos_enc = PositionalEncoding(odim, dropout_rate)
        self.out = torch.nn.Sequential(projection, pos_enc)

    def forward(self, x, x_mask):
        """Subsample x.

        Args:
            x (torch.Tensor): Input tensor (#batch, time, idim).
            x_mask (torch.Tensor): Input mask (#batch, 1, time).

        Returns:
            torch.Tensor: Subsampled tensor (#batch, time', odim),
                where time' = ((time - 1) // 2 - 1) // 2 (about time // 4).
            torch.Tensor: Subsampled mask (#batch, 1, time'), or None when
                ``x_mask`` is None.

        """
        feats = self.conv(x.unsqueeze(1))  # (b, c, t, f)
        b, c, t, f = feats.size()
        # merge channel and frequency axes into one feature axis per frame
        feats = feats.transpose(1, 2).contiguous().view(b, t, c * f)
        out = self.out(feats)
        if x_mask is not None:
            # mirror the two kernel-3 / stride-2 convolutions on the mask
            x_mask = x_mask[:, :, :-2:2][:, :, :-2:2]
        return out, x_mask

    def __getitem__(self, key):
        """Get item.

        When reset_parameters() is called, if use_scaled_pos_enc is used,
            return the positioning encoding.

        """
        if key == -1:
            return self.out[key]
        raise NotImplementedError("Support only `-1` (for `reset_parameters`).")


class Conv2dSubsampling6(torch.nn.Module):
    """Convolutional 2D subsampling (to 1/6 length).

    An unpadded 3x3 convolution with stride 2 and an unpadded 5x5
    convolution with stride 3, followed by a linear projection and a
    positional encoding.

    Args:
        idim (int): Input dimension.
        odim (int): Output dimension.
        dropout_rate (float): Dropout rate.
        pos_enc (torch.nn.Module): Custom position encoding layer.

    """

    def __init__(self, idim, odim, dropout_rate, pos_enc=None):
        """Construct an Conv2dSubsampling6 object."""
        super(Conv2dSubsampling6, self).__init__()
        conv_layers = [
            torch.nn.Conv2d(1, odim, 3, 2),
            torch.nn.ReLU(),
            torch.nn.Conv2d(odim, odim, 5, 3),
            torch.nn.ReLU(),
        ]
        self.conv = torch.nn.Sequential(*conv_layers)
        # feature-axis size left after both convolutions
        freq_out = ((idim - 1) // 2 - 2) // 3
        projection = torch.nn.Linear(odim * freq_out, odim)
        if pos_enc is None:
            pos_enc = PositionalEncoding(odim, dropout_rate)
        self.out = torch.nn.Sequential(projection, pos_enc)

    def forward(self, x, x_mask):
        """Subsample x.

        Args:
            x (torch.Tensor): Input tensor (#batch, time, idim).
            x_mask (torch.Tensor): Input mask (#batch, 1, time).

        Returns:
            torch.Tensor: Subsampled tensor (#batch, time', odim),
                where time' = ((time - 1) // 2 - 2) // 3 (about time // 6).
            torch.Tensor: Subsampled mask (#batch, 1, time'), or None when
                ``x_mask`` is None.

        """
        feats = self.conv(x.unsqueeze(1))  # (b, c, t, f)
        b, c, t, f = feats.size()
        # merge channel and frequency axes into one feature axis per frame
        feats = feats.transpose(1, 2).contiguous().view(b, t, c * f)
        out = self.out(feats)
        if x_mask is not None:
            # mirror kernel-3 / stride-2 then kernel-5 / stride-3 on the mask
            x_mask = x_mask[:, :, :-2:2][:, :, :-4:3]
        return out, x_mask


class Conv2dSubsampling8(torch.nn.Module):
    """Convolutional 2D subsampling (to 1/8 length).

    Three unpadded 3x3 convolutions with stride 2, followed by a linear
    projection and a positional encoding.

    Args:
        idim (int): Input dimension.
        odim (int): Output dimension.
        dropout_rate (float): Dropout rate.
        pos_enc (torch.nn.Module): Custom position encoding layer.

    """

    def __init__(self, idim, odim, dropout_rate, pos_enc=None):
        """Construct an Conv2dSubsampling8 object."""
        super(Conv2dSubsampling8, self).__init__()
        conv_layers = [
            torch.nn.Conv2d(1, odim, 3, 2),
            torch.nn.ReLU(),
            torch.nn.Conv2d(odim, odim, 3, 2),
            torch.nn.ReLU(),
            torch.nn.Conv2d(odim, odim, 3, 2),
            torch.nn.ReLU(),
        ]
        self.conv = torch.nn.Sequential(*conv_layers)
        # feature-axis size left after the three convolutions
        freq_out = (((idim - 1) // 2 - 1) // 2 - 1) // 2
        projection = torch.nn.Linear(odim * freq_out, odim)
        if pos_enc is None:
            pos_enc = PositionalEncoding(odim, dropout_rate)
        self.out = torch.nn.Sequential(projection, pos_enc)

    def forward(self, x, x_mask):
        """Subsample x.

        Args:
            x (torch.Tensor): Input tensor (#batch, time, idim).
            x_mask (torch.Tensor): Input mask (#batch, 1, time).

        Returns:
            torch.Tensor: Subsampled tensor (#batch, time', odim),
                where time' = (((time - 1) // 2 - 1) // 2 - 1) // 2
                (about time // 8).
            torch.Tensor: Subsampled mask (#batch, 1, time'), or None when
                ``x_mask`` is None.

        """
        feats = self.conv(x.unsqueeze(1))  # (b, c, t, f)
        b, c, t, f = feats.size()
        # merge channel and frequency axes into one feature axis per frame
        feats = feats.transpose(1, 2).contiguous().view(b, t, c * f)
        out = self.out(feats)
        if x_mask is not None:
            # mirror the three kernel-3 / stride-2 convolutions on the mask
            x_mask = x_mask[:, :, :-2:2][:, :, :-2:2][:, :, :-2:2]
        return out, x_mask


================================================
FILE: nets/pytorch_backend/transformer/subsampling_without_posenc.py
================================================
# Copyright 2020 Emiru Tsunoo
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Subsampling layer definition."""

import math
import torch


class Conv2dSubsamplingWOPosEnc(torch.nn.Module):
    """Convolutional 2D subsampling without positional encoding.

    One ``Conv2d`` + ``ReLU`` stage is created per ``(kernel, stride)`` pair,
    followed by a linear projection to ``odim``.

    Args:
        idim (int): Input dimension.
        odim (int): Output dimension.
        dropout_rate (float): Dropout rate (accepted for interface
            compatibility; not used by this layer).
        kernels (list): kernel sizes, one per conv stage.
        strides (list): stride sizes, one per conv stage.

    """

    def __init__(self, idim, odim, dropout_rate, kernels, strides):
        """Construct an Conv2dSubsamplingWOPosEnc object."""
        assert len(kernels) == len(strides)
        super().__init__()
        conv = []
        olen = idim
        for i, (k, s) in enumerate(zip(kernels, strides)):
            conv += [
                torch.nn.Conv2d(1 if i == 0 else odim, odim, k, s),
                torch.nn.ReLU(),
            ]
            # Feature-axis length after an unpadded convolution.
            olen = math.floor((olen - k) / s + 1)
        self.conv = torch.nn.Sequential(*conv)
        self.out = torch.nn.Linear(odim * olen, odim)
        self.strides = strides
        self.kernels = kernels

    def forward(self, x, x_mask):
        """Subsample x.

        Args:
            x (torch.Tensor): Input tensor (#batch, time, idim).
            x_mask (torch.Tensor): Input mask (#batch, 1, time), or None.

        Returns:
            torch.Tensor: Subsampled tensor (#batch, time', odim),
                where time' is determined by the configured kernels/strides.
            torch.Tensor: Subsampled mask (#batch, 1, time'),
                or None when no mask was given.

        """
        x = x.unsqueeze(1)  # (b, c, t, f)
        x = self.conv(x)
        b, c, t, f = x.size()
        x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
        if x_mask is None:
            return x, None
        for k, s in zip(self.kernels, self.strides):
            # Use an explicit end index: the shorthand ``: -k + 1`` turns into
            # ``:0`` for k == 1 and would wipe out the whole mask.
            x_mask = x_mask[:, :, : x_mask.size(2) - k + 1 : s]
        return x, x_mask


================================================
FILE: nets/pytorch_backend/wavenet.py
================================================
# -*- coding: utf-8 -*-

# Copyright 2019 Tomoki Hayashi (Nagoya University)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""This code is based on https://github.com/kan-bayashi/PytorchWaveNetVocoder."""

import logging
import sys
import time

import numpy as np
import torch
import torch.nn.functional as F

from torch import nn


def encode_mu_law(x, mu=256):
    """Quantize a waveform with mu-law companding.

    Args:
        x (ndarray): Audio signal with the range from -1 to 1.
        mu (int): Number of quantization levels.

    Returns:
        ndarray: int64 quantized signal with the range from 0 to mu - 1.

    """
    top_level = mu - 1
    # Logarithmic compression of the magnitude, sign restored afterwards.
    numerator = np.sign(x) * np.log(1 + top_level * np.abs(x))
    compressed = numerator / np.log(1 + top_level)
    # Map [-1, 1] onto [0, mu - 1] and round to the nearest level.
    scaled = (compressed + 1) / 2 * top_level + 0.5
    return np.floor(scaled).astype(np.int64)


def decode_mu_law(y, mu=256):
    """Expand a mu-law quantized signal back to a waveform.

    Args:
        y (ndarray): Quantized audio signal with the range from 0 to mu - 1.
        mu (int): Number of quantization levels.

    Returns:
        ndarray: Audio signal with the range from -1 to 1.

    """
    top_level = mu - 1
    # Undo the level mapping used by the encoder (including its 0.5 offset).
    compressed = (y - 0.5) / top_level * 2 - 1
    magnitude = (1 + top_level) ** np.abs(compressed) - 1
    return np.sign(compressed) / top_level * magnitude


def initialize(m):
    """Initialize conv layers (intended for use with ``Module.apply``).

    ``nn.Conv1d`` weights get Xavier-uniform initialization and
    ``nn.ConvTranspose2d`` weights are set to 1.0; biases are zeroed when the
    layer has one. Modules of any other type are left untouched.

    Args:
        m (torch.nn.Module): Torch module.

    """
    if isinstance(m, nn.Conv1d):
        nn.init.xavier_uniform_(m.weight)
    elif isinstance(m, nn.ConvTranspose2d):
        nn.init.constant_(m.weight, 1.0)
    else:
        return

    # Layers created with ``bias=False`` expose ``bias`` as None; skip them
    # instead of failing inside ``nn.init``.
    if m.bias is not None:
        nn.init.constant_(m.bias, 0.0)


class OneHot(nn.Module):
    """Turn integer class indices into one-hot float vectors.

    Args:
        depth (int): Dimension of one-hot vector.

    """

    def __init__(self, depth):
        super(OneHot, self).__init__()
        self.depth = depth

    def forward(self, x):
        """Calculate forward propagation.

        Indices are wrapped into ``[0, depth)`` with a modulo before encoding.

        Args:
            x (LongTensor): long tensor variable with the shape (B, T)

        Returns:
            Tensor: float tensor variable with the shape (B, T, depth)

        """
        indices = (x % self.depth).unsqueeze(2)  # (B, T, 1)
        n_batch, n_steps = indices.size(0), indices.size(1)
        canvas = indices.new_zeros(n_batch, n_steps, self.depth).float()
        # Write a 1 at each index position along the last axis.
        return canvas.scatter_(2, indices, 1)


class CausalConv1d(nn.Module):
    """1D dilated causal convolution.

    The wrapped ``nn.Conv1d`` pads both ends by ``(kernel_size - 1) * dilation``
    and the surplus frames at the end are cut off in ``forward``.
    """

    def __init__(self, in_channels, out_channels, kernel_size, dilation=1, bias=True):
        super(CausalConv1d, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.dilation = dilation
        self.padding = (kernel_size - 1) * dilation
        self.conv = nn.Conv1d(
            in_channels,
            out_channels,
            kernel_size,
            padding=self.padding,
            dilation=dilation,
            bias=bias,
        )

    def forward(self, x):
        """Calculate forward propagation.

        Args:
            x (Tensor): Input tensor with the shape (B, in_channels, T).

        Returns:
            Tensor: Tensor with the shape (B, out_channels, T)

        """
        padded_out = self.conv(x)
        if self.padding == 0:
            # kernel_size == 1: nothing was padded, nothing to trim.
            return padded_out
        return padded_out[:, :, : -self.padding]


class UpSampling(nn.Module):
    """Time-axis upsampling implemented with a transposed 2D convolution.

    Args:
        upsampling_factor (int): Upsampling factor.
        bias (bool): Whether the transposed convolution has a bias term.

    """

    def __init__(self, upsampling_factor, bias=True):
        super(UpSampling, self).__init__()
        self.upsampling_factor = upsampling_factor
        self.bias = bias
        # Kernel and stride only act along the last (time) axis.
        time_only = (1, self.upsampling_factor)
        self.conv = nn.ConvTranspose2d(
            1,
            1,
            kernel_size=time_only,
            stride=time_only,
            bias=self.bias,
        )

    def forward(self, x):
        """Calculate forward propagation.

        Args:
            x (Tensor): Input tensor with the shape  (B, C, T)

        Returns:
            Tensor: Tensor with the shape (B, C, T') where T' = T * upsampling_factor.

        """
        # Add a singleton channel axis (B x 1 x C x T), upsample, drop it again.
        return self.conv(x.unsqueeze(1)).squeeze(1)


class WaveNet(nn.Module):
    """Conditional wavenet.

    The network one-hot encodes the quantized waveform, applies an initial
    causal convolution, runs a stack of gated dilated residual blocks that are
    conditioned on auxiliary features, sums the skip outputs of all blocks and
    maps them to per-sample logits over the quantization levels.

    Args:
        n_quantize (int): Number of quantization.
        n_aux (int): Number of aux feature dimension.
        n_resch (int): Number of filter channels for residual block.
        n_skipch (int): Number of filter channels for skip connection.
        dilation_depth (int): Number of dilation depth
            (e.g. if set 10, max dilation = 2^(10-1)).
        dilation_repeat (int): Number of dilation repeat.
        kernel_size (int): Filter size of dilated causal convolution.
        upsampling_factor (int): Upsampling factor. The upsampling layer is
            only created and applied when this value is greater than 0.

    """

    def __init__(
        self,
        n_quantize=256,
        n_aux=28,
        n_resch=512,
        n_skipch=256,
        dilation_depth=10,
        dilation_repeat=3,
        kernel_size=2,
        upsampling_factor=0,
    ):
        """Build the pre-processing, residual-block and post-processing layers."""
        super(WaveNet, self).__init__()
        self.n_aux = n_aux
        self.n_quantize = n_quantize
        self.n_resch = n_resch
        self.n_skipch = n_skipch
        self.kernel_size = kernel_size
        self.dilation_depth = dilation_depth
        self.dilation_repeat = dilation_repeat
        self.upsampling_factor = upsampling_factor

        # Dilations 1, 2, 4, ..., 2^(depth-1), with the whole cycle repeated
        # `dilation_repeat` times.
        self.dilations = [
            2 ** i for i in range(self.dilation_depth)
        ] * self.dilation_repeat
        # Number of samples covered by the dilated stack.
        self.receptive_field = (self.kernel_size - 1) * sum(self.dilations) + 1

        # for preprocessing
        self.onehot = OneHot(self.n_quantize)
        self.causal = CausalConv1d(self.n_quantize, self.n_resch, self.kernel_size)
        if self.upsampling_factor > 0:
            self.upsampling = UpSampling(self.upsampling_factor)

        # for residual blocks
        # One entry per dilation in each list; index i of every list belongs
        # to the same residual block.
        self.dil_sigmoid = nn.ModuleList()
        self.dil_tanh = nn.ModuleList()
        self.aux_1x1_sigmoid = nn.ModuleList()
        self.aux_1x1_tanh = nn.ModuleList()
        self.skip_1x1 = nn.ModuleList()
        self.res_1x1 = nn.ModuleList()
        for d in self.dilations:
            self.dil_sigmoid += [
                CausalConv1d(self.n_resch, self.n_resch, self.kernel_size, d)
            ]
            self.dil_tanh += [
                CausalConv1d(self.n_resch, self.n_resch, self.kernel_size, d)
            ]
            self.aux_1x1_sigmoid += [nn.Conv1d(self.n_aux, self.n_resch, 1)]
            self.aux_1x1_tanh += [nn.Conv1d(self.n_aux, self.n_resch, 1)]
            self.skip_1x1 += [nn.Conv1d(self.n_resch, self.n_skipch, 1)]
            self.res_1x1 += [nn.Conv1d(self.n_resch, self.n_resch, 1)]

        # for postprocessing
        self.conv_post_1 = nn.Conv1d(self.n_skipch, self.n_skipch, 1)
        self.conv_post_2 = nn.Conv1d(self.n_skipch, self.n_quantize, 1)

    def forward(self, x, h):
        """Calculate forward propagation.

        Args:
            x (LongTensor): Quantized input waveform tensor with the shape  (B, T).
            h (Tensor): Auxiliary feature tensor with the shape  (B, n_aux, T).

        Returns:
            Tensor: Logits with the shape (B, T, n_quantize).

        """
        # preprocess
        output = self._preprocess(x)
        if self.upsampling_factor > 0:
            h = self.upsampling(h)

        # residual block
        skip_connections = []
        for i in range(len(self.dilations)):
            output, skip = self._residual_forward(
                output,
                h,
                self.dil_sigmoid[i],
                self.dil_tanh[i],
                self.aux_1x1_sigmoid[i],
                self.aux_1x1_tanh[i],
                self.skip_1x1[i],
                self.res_1x1[i],
            )
            skip_connections.append(skip)

        # skip-connection part
        output = sum(skip_connections)
        output = self._postprocess(output)

        return output

    def generate(self, x, h, n_samples, interval=None, mode="sampling"):
        """Generate a waveform with fast generation algorithm.

        This generation is based on `Fast WaveNet Generation Algorithm`_.

        Args:
            x (LongTensor): Initial waveform tensor with the shape  (T,).
            h (Tensor): Auxiliary feature tensor with the shape  (n_samples + T, n_aux).
            n_samples (int): Number of samples to be generated.
            interval (int, optional): Log interval.
            mode (str, optional): "sampling" or "argmax". Any other value
                logs an error and terminates the process via ``sys.exit(1)``.

        Return:
            ndarray: Generated quantized waveform (n_samples).

        .. _`Fast WaveNet Generation Algorithm`: https://arxiv.org/abs/1611.09482

        """
        # reshape inputs
        assert len(x.shape) == 1
        assert len(h.shape) == 2 and h.shape[1] == self.n_aux
        x = x.unsqueeze(0)
        h = h.transpose(0, 1).unsqueeze(0)

        # perform upsampling
        if self.upsampling_factor > 0:
            h = self.upsampling(h)

        # padding for shortage
        if n_samples > h.shape[2]:
            h = F.pad(h, (0, n_samples - h.shape[2]), "replicate")

        # padding if the length less than
        # the receptive field: left-pad the waveform with the middle
        # quantization level and replicate the first aux frame.
        n_pad = self.receptive_field - x.size(1)
        if n_pad > 0:
            x = F.pad(x, (n_pad, 0), "constant", self.n_quantize // 2)
            h = F.pad(h, (n_pad, 0), "replicate")

        # prepare buffer
        # Run the full network once over the initial waveform and keep, per
        # residual block, the trailing outputs needed during generation.
        output = self._preprocess(x)
        h_ = h[:, :, : x.size(1)]
        output_buffer = []
        buffer_size = []
        for i, d in enumerate(self.dilations):
            output, _ = self._residual_forward(
                output,
                h_,
                self.dil_sigmoid[i],
                self.dil_tanh[i],
                self.aux_1x1_sigmoid[i],
                self.aux_1x1_tanh[i],
                self.skip_1x1[i],
                self.res_1x1[i],
            )
            # NOTE(review): the sizes look like the history required by the
            # *next* block (dilation 2 * d, or 1 after the largest dilation)
            # -- confirm against the referenced paper.
            if d == 2 ** (self.dilation_depth - 1):
                buffer_size.append(self.kernel_size - 1)
            else:
                buffer_size.append(d * 2 * (self.kernel_size - 1))
            # Exclude the very last frame; it is recomputed in the first
            # generation step below.
            output_buffer.append(output[:, :, -buffer_size[i] - 1 : -1])

        # generate
        samples = x[0]
        start_time = time.time()
        for i in range(n_samples):
            # Only the most recent samples are re-encoded at every step.
            output = samples[-self.kernel_size * 2 + 1 :].unsqueeze(0)
            output = self._preprocess(output)
            # Aux feature aligned with the last known sample.
            h_ = h[:, :, samples.size(0) - 1].contiguous().view(1, self.n_aux, 1)
            output_buffer_next = []
            skip_connections = []
            for j, d in enumerate(self.dilations):
                output, skip = self._generate_residual_forward(
                    output,
                    h_,
                    self.dil_sigmoid[j],
                    self.dil_tanh[j],
                    self.aux_1x1_sigmoid[j],
                    self.aux_1x1_tanh[j],
                    self.skip_1x1[j],
                    self.res_1x1[j],
                )
                # Prepend the cached history so the next block sees enough
                # context, then keep the newest frames as the next cache.
                output = torch.cat([output_buffer[j], output], dim=2)
                output_buffer_next.append(output[:, :, -buffer_size[j] :])
                skip_connections.append(skip)

            # update buffer
            output_buffer = output_buffer_next

            # get predicted sample
            output = sum(skip_connections)
            output = self._postprocess(output)[0]
            if mode == "sampling":
                posterior = F.softmax(output[-1], dim=0)
                dist = torch.distributions.Categorical(posterior)
                sample = dist.sample().unsqueeze(0)
            elif mode == "argmax":
                sample = output.argmax(-1)
            else:
                logging.error("mode should be sampling or argmax")
                sys.exit(1)
            samples = torch.cat([samples, sample], dim=0)

            # show progress
            if interval is not None and (i + 1) % interval == 0:
                elapsed_time_per_sample = (time.time() - start_time) / interval
                logging.info(
                    "%d/%d estimated time = %.3f sec (%.3f sec / sample)"
                    % (
                        i + 1,
                        n_samples,
                        (n_samples - i - 1) * elapsed_time_per_sample,
                        elapsed_time_per_sample,
                    )
                )
                start_time = time.time()

        # Drop the initial (and padded) samples; return only the new ones.
        return samples[-n_samples:].cpu().numpy()

    def _preprocess(self, x):
        """One-hot encode quantized samples and apply the input causal conv."""
        x = self.onehot(x).transpose(1, 2)
        output = self.causal(x)
        return output

    def _postprocess(self, x):
        """Map summed skip outputs to logits via ReLU -> 1x1 -> ReLU -> 1x1."""
        output = F.relu(x)
        output = self.conv_post_1(output)
        output = F.relu(output)  # B x C x T
        output = self.conv_post_2(output).transpose(1, 2)  # B x T x C
        return output

    def _residual_forward(
        self,
        x,
        h,
        dil_sigmoid,
        dil_tanh,
        aux_1x1_sigmoid,
        aux_1x1_tanh,
        skip_1x1,
        res_1x1,
    ):
        """Run one gated residual block over a whole sequence.

        Returns:
            tuple: ``(residual_output, skip_output)`` where the residual
            output is ``res_1x1(gated) + x``.

        """
        output_sigmoid = dil_sigmoid(x)
        output_tanh = dil_tanh(x)
        aux_output_sigmoid = aux_1x1_sigmoid(h)
        aux_output_tanh = aux_1x1_tanh(h)
        # Gated activation conditioned on the auxiliary features.
        output = torch.sigmoid(output_sigmoid + aux_output_sigmoid) * torch.tanh(
            output_tanh + aux_output_tanh
        )
        skip = skip_1x1(output)
        output = res_1x1(output)
        output = output + x
        return output, skip

    def _generate_residual_forward(
        self,
        x,
        h,
        dil_sigmoid,
        dil_tanh,
        aux_1x1_sigmoid,
        aux_1x1_tanh,
        skip_1x1,
        res_1x1,
    ):
        """Run one gated residual block but keep only the newest time step.

        Same computation as ``_residual_forward`` except that the dilated
        convolution outputs and the residual input are cut to their last frame.

        """
        output_sigmoid = dil_sigmoid(x)[:, :, -1:]
        output_tanh = dil_tanh(x)[:, :, -1:]
        aux_output_sigmoid = aux_1x1_sigmoid(h)
        aux_output_tanh = aux_1x1_tanh(h)
        output = torch.sigmoid(output_sigmoid + aux_output_sigmoid) * torch.tanh(
            output_tanh + aux_output_tanh
        )
        skip = skip_1x1(output)
        output = res_1x1(output)
        output = output + x[:, :, -1:]  # B x C x 1
        return output, skip


================================================
FILE: nets/scorer_interface.py
================================================
"""Scorer interface module."""

from typing import Any
from typing import List
from typing import Tuple

import torch
import warnings


class ScorerInterface:
    """Base interface for scorers plugged into beam search.

    A scorer assigns a score to every token in the vocabulary.

    Examples:
        * Search heuristics
            * :class:`espnet.nets.scorers.length_bonus.LengthBonus`
        * Decoder networks of the sequence-to-sequence models
            * :class:`espnet.nets.pytorch_backend.nets.transformer.decoder.Decoder`
            * :class:`espnet.nets.pytorch_backend.nets.rnn.decoders.Decoder`
        * Neural language models
            * :class:`espnet.nets.pytorch_backend.lm.transformer.TransformerLM`
            * :class:`espnet.nets.pytorch_backend.lm.default.DefaultRNNLM`
            * :class:`espnet.nets.pytorch_backend.lm.seq_rnn.SequentialRNNLM`

    """

    def init_state(self, x: torch.Tensor) -> Any:
        """Return the state to start decoding from (optional hook).

        Args:
            x (torch.Tensor): The encoded feature tensor

        Returns: initial state (``None`` in this default implementation)

        """
        return None

    def select_state(self, state: Any, i: int, new_id: int = None) -> Any:
        """Pick the state of one hypothesis out of the beam.

        Args:
            state: Decoder state for prefix tokens
            i (int): Index to select a state in the main beam search
            new_id (int): New label index to select a state if necessary
                (ignored by this default implementation)

        Returns:
            state: pruned state, or ``None`` when ``state`` is ``None``

        """
        if state is None:
            return None
        return state[i]

    def score(
        self, y: torch.Tensor, state: Any, x: torch.Tensor
    ) -> Tuple[torch.Tensor, Any]:
        """Score the next token (must be overridden).

        Args:
            y (torch.Tensor): 1D torch.int64 prefix tokens.
            state: Scorer state for prefix tokens
            x (torch.Tensor): The encoder feature that generates ys.

        Returns:
            tuple[torch.Tensor, Any]: Tuple of
                scores for next token that has a shape of `(n_vocab)`
                and next state for ys

        Raises:
            NotImplementedError: Always, in this base class.

        """
        raise NotImplementedError()

    def final_score(self, state: Any) -> float:
        """Score the end of a hypothesis (optional hook).

        Args:
            state: Scorer state for prefix tokens

        Returns:
            float: final score (``0.0`` in this default implementation)

        """
        return 0.0


class BatchScorerInterface(ScorerInterface):
    """Scorer interface extended with batched entry points."""

    def batch_init_state(self, x: torch.Tensor) -> Any:
        """Return the initial decoding state for a batch (optional hook).

        The default simply delegates to :meth:`init_state`.

        Args:
            x (torch.Tensor): The encoded feature tensor

        Returns: initial state

        """
        return self.init_state(x)

    def batch_score(
        self, ys: torch.Tensor, states: List[Any], xs: torch.Tensor
    ) -> Tuple[torch.Tensor, List[Any]]:
        """Score the next token for every hypothesis in a batch.

        This fallback calls :meth:`score` once per hypothesis and emits a
        warning saying so; subclasses are expected to override it with a
        parallel implementation.

        Args:
            ys (torch.Tensor): torch.int64 prefix tokens (n_batch, ylen).
            states (List[Any]): Scorer states for prefix tokens.
            xs (torch.Tensor):
                The encoder feature that generates ys (n_batch, xlen, n_feat).

        Returns:
            tuple[torch.Tensor, List[Any]]: Tuple of
                batchfied scores for next token with shape of `(n_batch, n_vocab)`
                and next state list for ys.

        """
        warnings.warn(
            "{} batch score is implemented through for loop not parallelized".format(
                self.__class__.__name__
            )
        )
        per_hyp_scores = []
        next_states = []
        for prefix, prev_state, feat in zip(ys, states, xs):
            hyp_score, new_state = self.score(prefix, prev_state, feat)
            next_states.append(new_state)
            per_hyp_scores.append(hyp_score)
        # Concatenate the per-hypothesis scores and restore the batch axis.
        stacked = torch.cat(per_hyp_scores, 0).view(ys.shape[0], -1)
        return stacked, next_states


class PartialScorerInterface(ScorerInterface):
    """Interface for scorers that only score a pre-pruned token subset.

    A partial scorer runs after the non-partial scorers have finished and
    receives the already pruned next-token candidates, because scoring the
    whole vocabulary would be too expensive.

    Examples:
         * Prefix search for connectionist-temporal-classification models
             * :class:`espnet.nets.scorers.ctc.CTCPrefixScorer`

    """

    def score_partial(
        self,
        y: torch.Tensor,
        next_tokens: torch.Tensor,
        state: Any,
        x: torch.Tensor,
    ) -> Tuple[torch.Tensor, Any]:
        """Score the given candidate tokens (must be overridden).

        Args:
            y (torch.Tensor): 1D prefix token
            next_tokens (torch.Tensor): torch.int64 next token to score
            state: decoder state for prefix tokens
            x (torch.Tensor): The encoder feature that generates ys

        Returns:
            tuple[torch.Tensor, Any]:
                Tuple of a score tensor for y that has a shape `(len(next_tokens),)`
                and next state for ys

        Raises:
            NotImplementedError: Always, in this base class.

        """
        raise NotImplementedError()


class BatchPartialScorerInterface(BatchScorerInterface, PartialScorerInterface):
    """Interface for partial scorers that can process a whole batch."""

    def batch_score_partial(
        self,
        ys: torch.Tensor,
        next_tokens: torch.Tensor,
        states: List[Any],
        xs: torch.Tensor,
    ) -> Tuple[torch.Tensor, Any]:
        """Score candidate tokens for every hypothesis (must be overridden).

        Args:
            ys (torch.Tensor): torch.int64 prefix tokens (n_batch, ylen).
            next_tokens (torch.Tensor): torch.int64 tokens to score (n_batch, n_token).
            states (List[Any]): Scorer states for prefix tokens.
            xs (torch.Tensor):
                The encoder feature that generates ys (n_batch, xlen, n_feat).

        Returns:
            tuple[torch.Tensor, Any]:
                Tuple of a score tensor for ys that has a shape `(n_batch, n_vocab)`
                and next states for ys

        Raises:
            NotImplementedError: Always, in this base class.

        """
        raise NotImplementedError()


================================================
FILE: nets/scorers/__init__.py
================================================
"""Initialize sub package."""


================================================
FILE: nets/scorers/_mmi_utils.py
================================================
# Author: Jinchuan Tian ; Jan 2022
# jinchuantian@stu.pku.edu.cn

# We test our code on k2 version 1.2; other versions may encounter problems due to API change.
# This file contains the MMI-related utility functions:
# 1. The (equivalent implementation of) step composition between the training / decoding graph;
# 2. The Lattice generation process with look-ahead mechanism.

from typing import List
from typing import Optional
from typing import Tuple

import torch
import k2
import _k2

from k2 import Fsa, DenseFsaVec 

"""
Intersection function without autograd.

(1) We write this function since the arc_map_a is not accessible in k2 API
(2) Currently we are not using the pruned version to keep all paths.
    We will try to find a balance between the speed and the precision later.
"""
def intersect_dense_forward(a_fsas: Fsa,
                           b_fsas: DenseFsaVec,
                           search_beam: float,
                           output_beam: float,
                           prune: bool,
                           min_active_states: int,
                           max_active_states: int,
                           seqframe_idx_name: Optional[str] = None,
                           frame_idx_name: Optional[str] = None):
    """Intersect `a_fsas` with `b_fsas` through the raw `_k2` bindings.

    Unlike the public k2 wrappers, this returns the arc maps alongside the
    resulting Fsa. No autograd function is involved.

    Args:
        a_fsas: Graph Fsa(s); its `.arcs` is passed to `_k2`.
        b_fsas: Dense Fsa vector; its `.dense_fsa_vec` is passed to `_k2`.
        search_beam: Used by the pruned intersection only.
        output_beam: Used by both intersection variants.
        prune: If True call `_k2.intersect_dense_pruned`, otherwise
            `_k2.intersect_dense`.
        min_active_states: Used by the pruned intersection only.
        max_active_states: Used by the pruned intersection only.
        seqframe_idx_name: If given, attribute name under which
            `arc_map_b // scores_dim1()` is stored on the output Fsa.
        frame_idx_name: If given, attribute name under which the same index,
            offset by the `row_splits(1)` entry of its sequence, is stored on
            the output Fsa (presumably the frame index within each sequence
            -- confirm against k2).

    Returns:
        tuple: `(out_fsa, arc_map_a, arc_map_b)`.
    """
    dense = b_fsas.dense_fsa_vec

    if prune:
        ragged_arc, arc_map_a, arc_map_b = _k2.intersect_dense_pruned(
            a_fsas=a_fsas.arcs,
            b_fsas=dense,
            search_beam=search_beam,
            output_beam=output_beam,
            min_active_states=min_active_states,
            max_active_states=max_active_states)
    else:
        ragged_arc, arc_map_a, arc_map_b = _k2.intersect_dense(
            a_fsas=a_fsas.arcs,
            b_fsas=dense,
            a_to_b_map=None,
            output_beam=output_beam)

    lattice = Fsa(ragged_arc)

    # Nothing to annotate: skip the index computations entirely.
    if frame_idx_name is None and seqframe_idx_name is None:
        return lattice, arc_map_a, arc_map_b

    # Both optional attributes are derived from this quantity.
    seqframe_idx = arc_map_b // dense.scores_dim1()

    if frame_idx_name is not None:
        shape = dense.shape()
        fsa_idx0 = _k2.index_select(shape.row_ids(1), seqframe_idx)
        seq_offset = _k2.index_select(shape.row_splits(1), fsa_idx0)
        frame_idx = seqframe_idx - seq_offset
        assert not hasattr(lattice, frame_idx_name)
        setattr(lattice, frame_idx_name, frame_idx)

    if seqframe_idx_name is not None:
        assert not hasattr(lattice, seqframe_idx_name)
        setattr(lattice, seqframe_idx_name, seqframe_idx)

    return lattice, arc_map_a, arc_map_b

# Recover the frame-level probability so we avoid using a loop to 
# do the intersection for each frame
# TODO: support batch trace 
def step_trace(out_fsas, a_fsas, arc_map_a):
    """Recover per-frame score sequences from a single intersection result.

    Args:
        out_fsas: Output of the intersection. It must carry a `frame_idx`
            attribute (one value per arc) and have as many Fsas as `a_fsas`.
        a_fsas: The graph Fsas that were intersected.
        arc_map_a: For every arc of `out_fsas`, the index of the originating
            arc in `a_fsas`.

    Returns:
        list: One 1-D tensor per group of consecutive frames; each entry is the
        log-sum-exp of the state scores collected for that frame, and the last
        entry of every group is dropped.

    NOTE(review): the slicing of `as_dict()["arcs"]` relies on a specific
    serialized layout of k2 FsaVecs (see the meta-info comment below); this is
    tied to the k2 version in use -- confirm before upgrading k2.
    """
    assert out_fsas.shape[0] == a_fsas.shape[0]
    num_fsa = a_fsas.shape[0]

    # K2 FsaVec Meta-info: num_state; 0; 
    # state_accumulated_counts (row_splits1); arc_accumulated_counts (row_splits12)
    
    # 1.1 Find all a_fsas arcs and meta-info
    a_fsa_dict = a_fsas.as_dict()
    a_fsa_meta = a_fsa_dict["arcs"][: 2 * num_fsa + 4]
    a_fsa_arcs = a_fsa_dict["arcs"][2 * num_fsa + 4:].view(-1, 4) # exclude meta-info

    # 1.2 Assign global state-ids
    # The first two columns of each arc row are offset by the Fsa's state
    # offset taken from the meta-info.
    # NOTE(review): this `+=` writes through views of the tensor returned by
    # `as_dict()`; whether that tensor shares storage with `a_fsas` is not
    # visible here -- confirm it does not corrupt the caller's graph.
    for i in range(num_fsa):
        a_fsa_arcs[a_fsa_meta[i+num_fsa+3]: a_fsa_meta[i+num_fsa+4]][:, :2] += a_fsa_meta[i + 2]

    # 1.3 Find all ending states and their scores
    # Arcs whose third column equals -1 are treated as final arcs.
    a_fsa_ending_mask = a_fsa_arcs[:, 2] == -1
    a_ending_states = torch.masked_select(a_fsa_arcs[:, 0], a_fsa_ending_mask)
    a_ending_scores = torch.masked_select(a_fsas.scores, a_fsa_ending_mask)

    # 2.1 Find all out_fsas arcs and sort by entering states 
    out_fsa_dict = out_fsas.as_dict()
    out_fsa_meta = out_fsa_dict["arcs"][:2 * num_fsa + 4]
    out_fsa_arcs = out_fsa_dict["arcs"][2 * num_fsa + 4:].view(-1, 4)
    out_incoming_ragged = out_fsas._get_incoming_arcs()

    # 2.2 For each state, find an arc entering it
    # We actually do not need arcs in out_fsas but need those in a_fsas. -> select arc_map
    transform_index = out_incoming_ragged.values().long()
    # Distinct row-split values (last one removed) pick one entry per
    # non-empty row, i.e. one incoming arc per state that has any.
    select_index = torch.unique_consecutive(out_incoming_ragged.row_splits(2))[:-1].long()
   
    arc_map_a_uniq = arc_map_a[transform_index][select_index]
    frame_idx = out_fsas.frame_idx[transform_index][select_index]

    # 2.3 Find all corresponding arcs in a_fsas and their entering states
    a_fsa_arcs_uniq = a_fsa_arcs[arc_map_a_uniq.long()]
    a_states_uniq = a_fsa_arcs_uniq[:, 1]

    # 3.1 Find the forward scores and remove scores on starting states
    raw_state_scores = out_fsas._get_forward_scores(True, True)
    raw_state_scores_ = []
    for i in range(num_fsa):
        s, e = out_fsa_meta[2 + i], out_fsa_meta[3 + i] 
        # `s + 1` skips the first state of each Fsa.
        raw_state_scores_.append(raw_state_scores[s + 1: e])
    raw_state_scores = torch.cat(raw_state_scores_, dim=0)
 
    # 3.2 Add ending state scores to the raw state_scores 
    # if the final state is reachable. Else set to -inf
    state_scores = torch.ones_like(raw_state_scores) * - float('inf')
    for state, score in zip(a_ending_states, a_ending_scores):
        state_scores = torch.where(a_states_uniq==state, 
                                   raw_state_scores + score, 
                                   state_scores)
    
    # 3.3 Allocate scores on each frames and each Fsa
    frame_ids, counts = torch.unique_consecutive(frame_idx, return_counts=True)
    score_sequences, start = [], 0
    score_sequence = []
    for i, (fid, fc) in enumerate(zip(frame_ids.tolist(), counts.tolist())):
        frame_score = torch.logsumexp(state_scores[start: start+fc], dim=0)        
        score_sequence.append(frame_score)
        start += fc

        # A drop in the frame id (or the end of the data) closes the current
        # sequence; its last entry is discarded.
        if i == len(counts) - 1 or fid > frame_ids[i+1]:
            score_sequences.append(torch.stack(score_sequence, dim=0)[:-1])
            score_sequence = []
            
    return score_sequences

"""
Step intersection implementation

Input:
fsa, FsaVec, training graph like CTC, MMI. Need duplication.
dense_fsa_vec, DenseFsaVec, created from nnet_output and the corresponding length in t-axis.
prune: bool, If true, use a pruned version of intersection.
search_beam: float, parameter used in pruned intersection only.
output_beam: float, paramtere used in intersection.
min_active_states: int, parameter used in pruned intersection only.
max_active_states: int, parameter used in pruned intersection only.

Output: 
score_sequences: List of 1-D tensors. The number of tensors is equal to the number fsas in of `fsa`
                 Each tensor has length of T where T is the number of effective frames in nnet_ouptut.
                 The t-th element represent the `tot_score` of interseted Fsa beteewn the input `fsa` 
                 and the first t frames.

This implementation is much faster than using a loop for T times. As the intersection is only used once
for each Fsa. The sequence is recovered from the generated Fsa and the arc_map_a.
"""
def step_intersect(fsa, 
                   dense_fsa_vec, 
                   prune=False, 
                   search_beam=100, 
                   output_beam=100,
                   min_active_states=30,
                   max_active_states=50000):
    """Intersect once and trace per-frame scores out of the result.

    The graph `fsa` is intersected with `dense_fsa_vec` by
    `intersect_dense_forward`, asking it to attach the "seqframe_idx" and
    "frame_idx" attributes; the output Fsa and the arc map into `fsa` are then
    handed to `step_trace`, whose result is returned unchanged.
    """
    intersect_options = dict(
        search_beam=search_beam,
        output_beam=output_beam,
        prune=prune,
        min_active_states=min_active_states,
        max_active_states=max_active_states,
        seqframe_idx_name="seqframe_idx",
        frame_idx_name="frame_idx",
    )
    # The arc map into the dense Fsa is not needed for tracing.
    lattice, arc_map_a, _ = intersect_dense_forward(
        a_fsas=fsa, b_fsas=dense_fsa_vec, **intersect_options
    )
    return step_trace(lattice, fsa, arc_map_a)

def step_intersect_test():
    """Compare `step_intersect` with a frame-by-frame reference loop.

    Builds numerator graphs for two fixed transcripts from the lexicon under
    `data/lang_phone`, scores random network output with `step_intersect`, then
    recomputes the total scores by intersecting with the first t frames for
    every t and prints the difference between both results.

    Requires snowfall, k2 and the language directory to be available.
    """
    from pathlib import Path

    # imports for test only
    from snowfall.training.mmi_graph import MmiTrainingGraphCompiler
    from snowfall.lexicon import Lexicon
    from snowfall.training.mmi_graph import create_bigram_phone_lm

    lang = Path("data/lang_phone")
    device = torch.device("cpu")

    lexicon = Lexicon(lang)
    # Close the file deterministically instead of leaking the handle.
    with open(lang / 'oov.txt') as oov_file:
        oov = oov_file.read().strip()
    graph_compiler = MmiTrainingGraphCompiler(lexicon, device, oov)
    phone_ids = lexicon.phone_symbols()

    torch.manual_seed(888)
    P = create_bigram_phone_lm(phone_ids)
    P.scores = torch.randn_like(P.scores)

    # texts = ["你好", "再见"]
    texts = ["中华人民共和国万岁", "世界人民大团结万岁"]
    num, den = graph_compiler.compile(texts, P, replicate_den=True)
    graph = num

    T = 100
    beam_size = len(texts)
    odim = len(phone_ids) + 1
    nnet_output = torch.rand([beam_size, T, odim])

    # One supervision row per sequence: (sequence index, start frame, length).
    supervision = torch.stack([
                          torch.arange(beam_size),
                          torch.zeros(beam_size),
                          torch.ones(beam_size) * T,
                          ], dim=-1).cpu().int()
    dense_fsa_vec = k2.DenseFsaVec(nnet_output, supervision)

    score_sequences = step_intersect(graph,
                                    dense_fsa_vec,
                                    prune=False,
                                    search_beam=30,
                                    output_beam=20,
                                    min_active_states=30,
                                    max_active_states=100000)

    print("####  old method ###")
    # Reference: one full intersection per prefix length t.
    buf = []
    for t in range(1, T+1):
        supervision = torch.stack([
                          torch.arange(beam_size),
                          torch.zeros(beam_size),
                          torch.ones(beam_size) * t,
                          ], dim=-1).cpu().int()
        dense_fsa_vec = k2.DenseFsaVec(nnet_output, supervision)
        num_lats = k2.intersect_dense(graph, dense_fsa_vec, output_beam=30.0)
        num_tot_scores = num_lats.get_tot_scores(log_semiring=True, use_double_scores=True)
        buf.append(num_tot_scores)

    buf = torch.stack(buf, dim=1)
    score_sequences = torch.stack(score_sequences, dim=0)
    print(buf - score_sequences)
 
# Run the step-intersection comparison when this module is executed as a script.
if __name__ == "__main__":
    step_intersect_test() 


================================================
FILE: nets/scorers/ctc.py
================================================
"""ScorerInterface implementation for CTC."""

import numpy as np
import torch

from espnet.nets.ctc_prefix_score import CTCPrefixScore
from espnet.nets.ctc_prefix_score import CTCPrefixScoreTH
from espnet.nets.scorer_interface import BatchPartialScorerInterface


class CTCPrefixScorer(BatchPartialScorerInterface):
    """CTC prefix scorer usable as a (batch) partial scorer in beam search.

    ``init_state`` sets up the numpy-based :class:`CTCPrefixScore`, while
    ``batch_init_state`` sets up the torch-based :class:`CTCPrefixScoreTH`.
    """

    def __init__(self, ctc: torch.nn.Module, eos: int):
        """Store the CTC module and the end-of-sequence id.

        Args:
            ctc (torch.nn.Module): The CTC implementation.
                For example, :class:`espnet.nets.pytorch_backend.ctc.CTC`
            eos (int): The end-of-sequence id.

        """
        self.ctc = ctc
        self.eos = eos
        # Created lazily by init_state / batch_init_state.
        self.impl = None

    def init_state(self, x: torch.Tensor):
        """Build the prefix scorer for one utterance and return its first state.

        Args:
            x (torch.Tensor): The encoded feature tensor

        Returns: tuple of the initial previous score (0) and the initial
            state of the underlying prefix scorer

        """
        batched = x.unsqueeze(0)
        log_probs = self.ctc.log_softmax(batched).detach()
        log_probs = log_probs.squeeze(0).cpu().numpy()
        # TODO(karita): use CTCPrefixScoreTH
        self.impl = CTCPrefixScore(log_probs, 0, self.eos, np)
        return 0, self.impl.initial_state()

    def select_state(self, state, i, new_id=None):
        """Pick the state that belongs to hypothesis ``i`` of the main beam.

        Args:
            state: Decoder state for prefix tokens
            i (int): Index to select a state in the main beam search
            new_id (int): New label id to select a state if necessary

        Returns:
            state: pruned state

        """
        if state is None:
            return None
        if type(state) is not tuple:
            return state[i]

        if len(state) == 2:  # state produced with CTCPrefixScore
            prev_score, prefix_state = state
            return prev_score[i], prefix_state[i]

        # state produced with CTCPrefixScoreTH (need new_id > 0)
        r, log_psi, f_min, f_max, scoring_idmap = state
        s = log_psi[i, new_id].expand(log_psi.size(1))
        column = new_id if scoring_idmap is None else scoring_idmap[i, new_id]
        return r[:, :, i, column], s, f_min, f_max

    def score_partial(self, y, ids, state, x):
        """Score the candidate next tokens for a single hypothesis.

        Args:
            y (torch.Tensor): 1D prefix token
            ids (torch.Tensor): torch.int64 next token to score
            state: decoder state for prefix tokens
            x (torch.Tensor): 2D encoder feature that generates ys

        Returns:
            tuple[torch.Tensor, Any]:
                Tuple of a score tensor for y that has a shape `(len(ids),)`
                and next state for ys

        """
        prev_score, prefix_state = state
        presub_score, new_st = self.impl(y.cpu(), ids.cpu(), prefix_state)
        # Only the increment over the previous prefix score is reported.
        increment = presub_score - prev_score
        tscore = torch.as_tensor(increment, device=x.device, dtype=x.dtype)
        return tscore, (presub_score, new_st)

    def batch_init_state(self, x: torch.Tensor):
        """Build the batched prefix scorer for one utterance.

        Args:
            x (torch.Tensor): The encoded feature tensor

        Returns: initial state (always ``None``)

        """
        log_probs = self.ctc.log_softmax(x.unsqueeze(0))  # assuming batch_size = 1
        lengths = torch.tensor([log_probs.size(1)])
        self.impl = CTCPrefixScoreTH(log_probs, lengths, 0, self.eos)
        return None

    def batch_score_partial(self, y, ids, state, x):
        """Score the candidate next tokens for a batch of hypotheses.

        Args:
            y (torch.Tensor): 1D prefix token
            ids (torch.Tensor): torch.int64 next token to score
            state: decoder state for prefix tokens
            x (torch.Tensor): 2D encoder feature that generates ys

        Returns:
            tuple[torch.Tensor, Any]:
                Tuple of a score tensor for y that has a shape `(len(ids),)`
                and next state for ys

        """
        if state[0] is None:
            batch_state = None
        else:
            # Merge the per-hypothesis states; the last two entries are taken
            # from the first hypothesis only.
            batch_state = (
                torch.stack([s[0] for s in state], dim=2),
                torch.stack([s[1] for s in state]),
                state[0][2],
                state[0][3],
            )
        return self.impl(y, batch_state, ids)

    def extend_prob(self, x: torch.Tensor):
        """Extend probs for decoding.

        This extension is for streaming decoding
        as in Eq (14) in https://arxiv.org/abs/2006.14941

        Args:
            x (torch.Tensor): The encoded feature tensor

        """
        log_probs = self.ctc.log_softmax(x.unsqueeze(0))
        self.impl.extend_prob(log_probs)

    def extend_state(self, state):
        """Extend state for decoding.

        This extension is for streaming decoding
        as in Eq (14) in https://arxiv.org/abs/2006.14941

        Args:
            state: The states of hyps

        Returns: extended state

        """
        return [self.impl.extend_state(s) for s in state]


================================================
FILE: nets/scorers/ctc_rnnt_scorer.py
================================================
import k2
import torch
import math
from snowfall.training.ctc_graph import CtcTrainingGraphCompiler
from snowfall.lexicon import Lexicon


class CTCRNNTScorer():
    """Adjusts hypothesis scores with CTC total scores computed through k2.

    The constructor signature mirrors the other scorers in this package;
    ``device`` is only stored and ``sos_id`` / ``use_segment`` are unused here.
    """

    def __init__(self, lang, device, idim, sos_id, rank, use_segment, char_list):
        """Build the graph compiler and the output layer, then load weights.

        lang: Path object of the lang directory (also holds the checkpoint)
        idim: input dimension of the linear output layer
        rank: selects which ``ctc_param.{rank}.pth`` file is loaded
        char_list: maps token ids to the symbols used in transcripts
        """
        self.device = device

        # graph compiler built from the lexicon
        self.lang = lang
        lexicon = Lexicon(lang)
        self.lexicon = lexicon
        self.graph_compiler = CtcTrainingGraphCompiler(
            L_inv=lexicon.L_inv,
            phones=lexicon.phones,
            words=lexicon.words,
        )

        # output layer: one unit per phone symbol plus one extra
        self.phone_ids = lexicon.phone_symbols()
        n_out = len(self.phone_ids) + 1
        self.lo = torch.nn.Linear(idim, n_out)
        self.load_weight(rank)

        self.char_list = char_list

    def load_weight(self, rank):
        """Load the output-layer weights from ``ctc_param.{rank}.pth``."""
        ckpt_path = self.lang / f"ctc_param.{rank}.pth"
        params = torch.load(ckpt_path)
        for tensor in params.values():
            tensor.requires_grad = False
        self.lo.load_state_dict(
            {"weight": params["lo.1.weight"], "bias": params["lo.1.bias"]}
        )

    def den_scores(self, x):
        """Return the log-softmax network output for ``x`` and ``None``.

        A leading batch axis of size 1 is added to ``x``. The second return
        value is ``None`` to be compatible with MMIRNNTScorer.
        """
        logits = self.lo(x.unsqueeze(0))
        log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
        return log_probs, None

    def batch_score(self, A, nnet_output, den_scores, tu_sum, mmi_weight):
        """Update ``score`` and ``mmi_tot_score`` of every hypothesis in ``A``.

        For each hypothesis the CTC total score of its transcript is computed
        over ``tu_sum - len(yseq) + 2`` frames; the change relative to the
        stored ``mmi_tot_score`` is scaled by ``mmi_weight`` and added to
        ``score``. ``den_scores`` is not used. Returns ``A`` (mutated in place).
        """
        print("tu_sum: ", tu_sum)

        n_hyps = len(A)
        if n_hyps == 0:
            return A

        # (1) supervision
        # +1 since frame start with 0; +1 since redundant blank
        frame_counts = torch.Tensor(
            [tu_sum - len(h.yseq) + 2 for h in A]
        ).long()
        segments = torch.stack(
            [torch.arange(n_hyps), torch.zeros(n_hyps), frame_counts], dim=1
        ).to(torch.int32)
        # the supervision must be in descending order of duration
        order = torch.argsort(segments[:, 2], descending=True)
        segments = segments[order]
        dense_fsa_vec = k2.DenseFsaVec(nnet_output.repeat(n_hyps, 1, 1), segments)

        # (2) transcripts, already arranged in supervision order
        texts = []
        for idx in order:
            tokens = A[idx].yseq[1:]  # exclude starting <sos>
            # need modification for BPE
            texts.append(" ".join([self.char_list[x] for x in tokens]))
        graphs = self.graph_compiler.compile(texts)

        # (3) intersection
        lats = k2.intersect_dense(graphs, dense_fsa_vec, output_beam=10.0)
        tot_scores = lats.get_tot_scores(log_semiring=True, use_double_scores=True)
        tot_scores = torch.where(tot_scores == -math.inf, 0.0, tot_scores)

        # (4) assign and post-process
        # Question: How to deal with the hypothesis with empty yseq
        for j, text in enumerate(texts):
            if text == "":
                tot_scores[j] = 0.0

        for j, idx in enumerate(order):
            h = A[idx]
            new_total = tot_scores[j].item()
            h.score += (new_total - h.mmi_tot_score) * mmi_weight
            h.mmi_tot_score = new_total

        return A
         

================================================
FILE: nets/scorers/length_bonus.py
================================================
"""Length bonus module."""
from typing import Any
from typing import List
from typing import Tuple

import torch

from espnet.nets.scorer_interface import BatchScorerInterface


class LengthBonus(BatchScorerInterface):
    """Scorer that gives every candidate token the same score of 1.0."""

    def __init__(self, n_vocab: int):
        """Remember the vocabulary size.

        Args:
            n_vocab (int): The number of tokens in vocabulary for beam search

        """
        self.n = n_vocab

    def score(self, y, state, x):
        """Return a constant score for every token in the vocabulary.

        Args:
            y (torch.Tensor): 1D torch.int64 prefix tokens.
            state: Scorer state for prefix tokens
            x (torch.Tensor): 2D encoder feature that generates ys.

        Returns:
            tuple[torch.Tensor, Any]: Tuple of
                scores for next token (n_vocab), with the device and dtype
                of ``x``, and None

        """
        one = torch.tensor([1.0], device=x.device, dtype=x.dtype)
        return one.expand(self.n), None

    def batch_score(
        self, ys: torch.Tensor, states: List[Any], xs: torch.Tensor
    ) -> Tuple[torch.Tensor, List[Any]]:
        """Return a constant score for every token of every hypothesis.

        Args:
            ys (torch.Tensor): torch.int64 prefix tokens (n_batch, ylen).
            states (List[Any]): Scorer states for prefix tokens.
            xs (torch.Tensor):
                The encoder feature that generates ys (n_batch, xlen, n_feat).

        Returns:
            tuple[torch.Tensor, List[Any]]: Tuple of
                batchfied scores for next token with shape of `(n_batch, n_vocab)`
                and None in place of the next state list.

        """
        one = torch.tensor([1.0], device=xs.device, dtype=xs.dtype)
        n_batch = ys.shape[0]
        return one.expand(n_batch, self.n), None


================================================
FILE: nets/scorers/lookahead.py
================================================
import k2
import torch
import numpy as np

def search_lexical_tree(self, node, next_tokens):
    """Advance a lexical-tree node by each candidate token.

    Written as a free function taking ``self``; it reads ``self.char_list``,
    ``self.bpe_space``, ``self.lexroot``, ``self.word_unk_id`` and
    ``self.alphabet_dict``.

    Args:
        node: lexical-tree node reached by the current prefix, laid out as
            ``[successors, word_id, interval]``. ``None`` marks a prefix that
            is already dead.
        next_tokens: iterable of token ids (indices into ``self.char_list``).

    Returns:
        tuple ``(intervals, next_nodes, index_to_kill)``:
            intervals: one ``[start, end]`` word-id interval per candidate,
                shifted by +1 (see building process of lexroot).
            next_nodes: the node reached by each candidate; ``None`` always
                means a kill.
            index_to_kill: positions in ``next_tokens`` whose score must be
                forced to logzero by the caller.

    Invalid candidates still get an (unknown-word) interval so that the
    caller can score every candidate uniformly. A ``None`` input node and
    characters missing from ``self.alphabet_dict`` are treated as "no valid
    successor" (a kill) instead of raising.
    """
    if node is None:
        print("None node given!")

    # Interval assigned to every killed candidate.
    dead_interval = [self.word_unk_id - 1, self.word_unk_id]

    intervals, next_nodes, index_to_kill = [], [], []

    for idx, i in enumerate(next_tokens):
        subword = self.char_list[i]
        interval = None  # stays None when the candidate has to be killed

        # case (1): '_' or <eos> is proposed, which means end of the word
        if subword == self.bpe_space or subword == "<eos>":
            if node is None or node == self.lexroot:
                # A word cannot end at the root, nor on a dead prefix.
                this_node = None
            else:
                # The score is for one word, not a word prefix, so the
                # interval covers exactly one word id.
                interval = [node[2][0], node[2][0] + 1]
                # Restart from the root so the next token is valid even if
                # it does not start with '_'.
                this_node = self.lexroot

        # case (2): impossible tokens are always killed
        elif subword == "<blank>" or subword == "<unk>":
            this_node = None

        # case (3): ordinary tokens; special tokens never reach this branch
        else:
            # A leading '_' starts a new word, so search from the root.
            this_node = self.lexroot if subword.startswith(self.bpe_space) else node

            for c in subword.replace(self.bpe_space, ""):
                if this_node is None:
                    break
                cid = self.alphabet_dict[c] if c in self.alphabet_dict else None
                if cid is not None and cid in this_node[0]:
                    this_node = this_node[0][cid]  # descend to successor
                else:
                    this_node = None  # no valid successor

            if this_node is not None and this_node[2] is not None:
                interval = this_node[2]

        if interval is None:
            interval = dead_interval
            index_to_kill.append(idx)

        # plus one to correct the interval. see building process of lexroot
        intervals.append([interval[0] + 1, interval[1] + 1])
        next_nodes.append(this_node)

    return intervals, next_nodes, index_to_kill


def parse_lookahead(yseq, lexroot, char_list, alphabet, word_dict, bpe_space):
    """Split a token sequence into finished words and a last-word interval.

    Args:
        yseq: sequence of token ids (indices into ``char_list``); may be empty.
        lexroot: root of the lexical tree; nodes are laid out as
            ``[successors, word_id, interval]``.
        char_list: maps token ids to token strings.
        alphabet: maps a single character to the id used as successor key
            in the lexical tree.
        word_dict: maps words to word ids; must contain ``"<UNK>"``.
        bpe_space: the word-boundary marker used inside tokens.

    Returns:
        tuple ``(prefix, interval)``:
            prefix: word ids of every word except the last one (unknown
                words map to the ``<UNK>`` id).
            interval: ``[start, end]`` word-id interval for the last word,
                shifted by +1 (see building process of lexroot). When the
                last word is complete the interval covers a single word id;
                otherwise it is the interval stored on the tree node.

    An empty ``yseq``, a last word that leaves the lexical tree, or a
    character missing from ``alphabet`` all yield the unknown-word interval.
    """
    unk_id = word_dict["<UNK>"]
    unk_interval = [unk_id - 1, unk_id]

    tokens = [char_list[y] for y in yseq]

    # (1) check if the final word finishes (an empty sequence has no
    #     unfinished word; its tail is handled as <unk> below)
    tail_complete = bool(tokens) and tokens[-1] in (
        "<blank>", "<eos>", "<unk>", bpe_space
    )

    # (2) recover the word sequence
    words = (
        "".join(tokens)
        .replace("<blank>", "")
        .replace("<eos>", "")
        .replace("<unk>", bpe_space + "<unk>")
        .replace(bpe_space, " ")
        .strip()
        .split()
    )

    # (3) parse prefix
    prefix = [word_dict[w] if w in word_dict else unk_id for w in words[:-1]]

    # (4) parse interval of tail
    tail = words[-1] if words else "<unk>"
    interval = unk_interval
    if tail != "<unk>":
        node = lexroot
        for c in tail:
            cid = alphabet[c] if c in alphabet else None
            if cid is None or cid not in node[0]:
                # The tail is not a prefix of any word in the lexicon.
                interval = unk_interval
                break
            node = node[0][cid]
            interval = [node[2][0], node[2][0] + 1] if tail_complete else node[2]

    # shift by 1: see building process of lexroot
    return prefix, [interval[0] + 1, interval[1] + 1]

def build_word_fsa_mat(prefix, interval):
    """Build the arc matrix of a word-level acceptor for a prefix plus interval.

    Each row is ``[source state, destination state, label, score]`` with all
    scores zero. The acceptor is a linear chain over ``prefix``, followed by
    one parallel arc per label in ``range(interval[0], interval[1])``, and a
    closing arc labelled ``-1``.

    Args:
        prefix: sequence of word ids forming the chain.
        interval: ``[start, end)`` range of word ids allowed after the prefix.

    Returns:
        torch.Tensor: int32 tensor with
        ``len(prefix) + (interval[1] - interval[0]) + 1`` rows and 4 columns.
    """
    n_prefix = len(prefix)
    first_label, end_label = interval[0], interval[1]
    n_choices = end_label - first_label

    # Chain: state k --prefix[k]--> state k + 1.
    chain_src = np.arange(n_prefix)
    chain_arcs = np.stack(
        [chain_src, chain_src + 1, np.array(prefix), np.zeros(n_prefix)],
        axis=1,
    )

    # Fan: one arc per admissible label between the last two chain states.
    fan_src = np.full(n_choices, float(n_prefix))
    fan_arcs = np.stack(
        [
            fan_src,
            fan_src + 1,
            np.arange(first_label, end_label),
            np.zeros(n_choices),
        ],
        axis=1,
    )

    # Closing arc with label -1 into the last state.
    closing_arc = np.array([[n_prefix + 1, n_prefix + 2, -1, 0]])

    arcs = np.concatenate([chain_arcs, fan_arcs, closing_arc], axis=0)
    return torch.from_numpy(arcs).int()


================================================
FILE: nets/scorers/mmi.py
================================================
import k2
import torch
import torch.nn.functional as F
import sys
import jieba
from espnet.nets.scorer_interface import PartialScorerInterface
from snowfall.training.mmi_graph import MmiTrainingGraphCompiler
from snowfall.lexicon import Lexicon
from snowfall.training.mmi_graph import create_bigram_phone_lm
from snowfall.warpper.mmi_utils import build_word_mapping, convert_transcription

# All methods are overridden
class MMIPrefixScores(PartialScorerInterface):
    """Partial scorer that scores hypothesis prefixes with MMI numerator scores.

    For every candidate next token the numerator graph of the extended
    prefix is intersected with the utterance's network output; the reported
    token score is the change in total score relative to the previous prefix.
    """

    def __init__(self, lang, device, idim, sos_id, rank, use_segment, char_list):
        """
        lang: Path object of lang dir
        device: torch device
        idim: input dimension of the linear output layer
        sos_id: id of <sos> (sos and eos are the same)
        rank: selects which ``mmi_param.{rank}.pth`` file is loaded
        use_segment: if True, texts are word-segmented with jieba before
                     the numerator graph is compiled
        char_list: maps token ids to the symbols used in transcripts

        We do not refer K2MMI module in model to avoid device conflict
        """
        self.lang = lang
        self.device = device
        self.sos_id = sos_id # sos and eos are the same
        self.char_list = char_list
        # Read the OOV id; the file is closed as soon as it has been read.
        with open(self.lang / 'oov.int') as f:
            self.oovid = int(f.read().strip())

        self.lexicon = Lexicon(lang)
        self.graph_compiler = MmiTrainingGraphCompiler(
                                 self.lexicon, device=device)

        self.phone_ids = self.lexicon.phone_symbols()
        # ignore dropout here; +1 for blank; maybe need log_softmax
        self.lo = torch.nn.Linear(idim, len(self.phone_ids) + 1)
        self.lm_scores = None
        self.load_weight(rank)

        self.P = create_bigram_phone_lm(self.phone_ids)
        self.P.set_scores_stochastic_(self.lm_scores)

        # if True, the numerator score would be calculated on segments instead of single characters
        self.use_segment = use_segment

    def load_weight(self, rank):
        """Load the output-layer weights and LM scores from ``mmi_param.{rank}.pth``."""
        ckpt_dict = torch.load(self.lang / f"mmi_param.{rank}.pth")
        for v in ckpt_dict.values():
            v.requires_grad=False
        self.lm_scores = ckpt_dict["lm_scores"]
        lo_dict = {"weight": ckpt_dict["lo.1.weight"],
                   "bias": ckpt_dict["lo.1.bias"]}
        self.lo.load_state_dict(lo_dict)

    def init_state(self, x):
        """
        x: 2-D tensor, utterance encoder output without batch

        return: nnet_output: [B, T, D], log_distribution without being normalized (exp-sum!=1)
                supervision: supervision for single utterance
                prev_score: initial score

        The denominator score is independent to the prefix. It is a constant for any hypothesis.
        So it would be ignored during decoding: We only consider the numerator, and the initial
        score is set 0.
        """

        x = x.unsqueeze(0)
        nnet_output = self.lo(x)

        T = x.size()[1]
        supervision = torch.Tensor([[0, 0, T]]).to(torch.int32)

        prev_score = torch.Tensor([0]).to(torch.float32)
        return nnet_output, supervision, prev_score

    def select_state(self, states, j):
        """Return the state for candidate ``j``: shared tensors plus its own score."""
        nnet_output_single, supervision_single, prev_scores = states
        prev_score = torch.Tensor([prev_scores[j]])
        return nnet_output_single, supervision_single, prev_score

    def score_partial(self, y, next_tokens, state, hs_pad):
        """
        y: prefix hypothesis, start with sos
        next_tokens: candidates for next token
        state: decoding state
        hs_pad: encoder output, ignore

        return:
        tok_scores: prefix g, new token c, hypothesis h. token_scores = score(h) - score(g)
                    which is the score for c.
        state: directly copy nnet_output_single and supervision_single.
               save the scores for different score(h), it would be score(g) in next round.
        """
        nnet_output_single, supervision_single, prev_score = state
        batch_size = next_tokens.size()[0]

        # acoustic: the same utterance is replicated once per candidate
        supervision = supervision_single.repeat(batch_size, 1)
        nnet_output = nnet_output_single.repeat(batch_size, 1, 1)
        dense_fsa_vec = k2.DenseFsaVec(nnet_output, supervision)

        # texts: prefix (without the leading sos) extended by each candidate
        y = y.unsqueeze(0).repeat(batch_size, 1)
        next_tokens = next_tokens.unsqueeze(1)
        ys = torch.cat([y, next_tokens], dim=-1)
        texts = ["".join([self.char_list[tid] for tid in text[1:]]) for text in ys]
        texts = [text.replace("<space>", " ") for text in texts]
        print(texts)

        if self.use_segment:
            texts = self.segmentation(texts)

        num_graphs, _ = self.graph_compiler.compile(
                            texts, self.P, replicate_den=True)
        num_lats = k2.intersect_dense(num_graphs, dense_fsa_vec, output_beam=10.0)

        num_tot_scores = num_lats.get_tot_scores(log_semiring=True,
                                                 use_double_scores=True)
        tok_scores = num_tot_scores - prev_score

        return tok_scores, (nnet_output_single, supervision_single, num_tot_scores)

    def score(self, state):
        """Full-vocabulary scoring is not provided; use ``score_partial``."""
        raise NotImplementedError

    def final_score(self, state):
        # This will add a score for the <eos>
        # We do not give a special score for the last <eos>
        return 0

    def segmentation(self, ys):
        """Strip spaces from each text and re-join its jieba word segments with spaces."""
        ys = [y.replace(" ", "") for y in ys]
        ys = [jieba.cut(y, cut_all=False) for y in ys]
        ys = [" ".join(list(y)) for y in ys]
        return ys


================================================
FILE: nets/scorers/mmi_alignment_score.py
================================================
import k2
import torch
import math
from espnet.nets.scorer_interface import PartialScorerInterface
from snowfall.training.mmi_graph import MmiTrainingGraphCompiler
from snowfall.lexicon import Lexicon
from snowfall.training.mmi_graph import create_bigram_phone_lm
from espnet.nets.scorers.mmi_utils import step_intersect

class MMIRNNTScorer():
    """Adjusts hypothesis scores with MMI (numerator minus denominator) scores.

    ``den_scores`` prepares the per-utterance quantities, ``batch_score``
    updates a list of hypotheses during search and ``batch_rescore`` resets
    ``mmi_prev_score`` using the full utterance.
    Hypotheses are expected to expose ``yseq``, ``score``, ``mmi_prev_score``
    and (for ``Imp_A``) ``mmi_tot_score``.
    """

    def __init__(self, lang, device, idim, sos_id, rank, use_segment, char_list, mas_lookahead=1):
        """Build the graph compiler, output layer and bigram phone LM.

        lang: Path object of the lang directory (also holds the checkpoint)
        device: passed to the graph compiler
        idim: input dimension of the linear output layer
        sos_id: id of <sos>, stored as ``self.eos``
        rank: selects which ``mmi_param.{rank}.pth`` file is loaded
        use_segment: not used by this class
        char_list: maps token ids to the symbols used in transcripts
        mas_lookahead: number of extra frames considered in ``batch_score``;
                       values above 15 switch to ``Imp_A``
        """
        
        self.lang = lang
        self.device = device
        
        self.lexicon = Lexicon(lang)
        self.graph_compiler = MmiTrainingGraphCompiler(self.lexicon, self.device)
        self.phone_ids = self.lexicon.phone_symbols()
      
        # One output unit per phone symbol plus one extra.
        self.lo = torch.nn.Linear(idim, len(self.phone_ids) + 1) 
        self.lm_scores = None
        self.load_weight(rank)  # fills self.lo and self.lm_scores

        self.P = create_bigram_phone_lm(self.phone_ids)
        self.P.set_scores_stochastic_(self.lm_scores)
        
        self.char_list = char_list
        self.eos = sos_id  # <sos> is identical to <eos>
        self.blank = 0 # by default 0 means CTC blank 
        self.logzero = -10000
     
        self.mas_lookahead = mas_lookahead

    def load_weight(self, rank):
        """Load the output-layer weights and LM scores from ``mmi_param.{rank}.pth``."""
        # load lo weight and lm_scores
        ckpt_dict = torch.load(self.lang / f"mmi_param.{rank}.pth")
        for v in ckpt_dict.values():
            v.requires_grad=False
        self.lm_scores = ckpt_dict["lm_scores"]
        lo_dict = {"weight": ckpt_dict["lo.1.weight"],
                   "bias": ckpt_dict["lo.1.bias"]}
        self.lo.load_state_dict(lo_dict)

    def den_scores(self, x):
        """Return the network output for ``x`` and its denominator scores.

        A leading batch axis of size 1 is added to ``x``. The denominator
        scores are the first entry of ``step_intersect`` applied to the
        denominator graph.
        NOTE(review): presumably one score per number of consumed frames
        (index ``t - 1`` for ``t`` frames) -- confirm against step_intersect.
        """
        # (1) nnet_output
        x = x.unsqueeze(0)
        nnet_output = self.lo(x)

        # (2) den_scores
        texts = ["<UNK>"] # use a random text, just to get den graph
        _, den = self.graph_compiler.compile(texts, self.P, replicate_den=True)

        T = nnet_output.size(1)
        supervision = torch.Tensor([[0, 0, T]]).int()
        dense_fsa_vec = k2.DenseFsaVec(nnet_output, supervision)
        den_scores = step_intersect(den, dense_fsa_vec)[0]

        return nnet_output, den_scores

    def batch_rescore(self, A, nnet_output, den_scores):
        """Set ``mmi_prev_score`` of every hypothesis from the full utterance.

        The numerator score of each transcript over all ``T`` frames minus
        ``den_scores[-1]`` is stored; ``score`` is not modified. Returns ``A``.
        """
        batch, T = len(A), nnet_output.size(1)
        if batch == 0:
            return A

        # Transcripts without the leading <sos>, tokens joined by spaces.
        texts = [h.yseq[1:] for h in A]
        texts = [" ".join([self.char_list[x] for x in text]) for text in texts]
        num, _ = self.graph_compiler.compile(texts, self.P, replicate_den=False)

        # Every hypothesis is scored against the same, full-length utterance.
        supervision = torch.stack([torch.arange(batch),
                                   torch.zeros(batch),
                                   torch.ones(batch) * T], dim=1).int()
        dense_fsa_vec = k2.DenseFsaVec(nnet_output.repeat(batch, 1, 1), supervision)

        num_lats = k2.intersect_dense_pruned(num,
                                             dense_fsa_vec,
                                             search_beam=20.0,
                                             output_beam=10.0,
                                             min_active_states=30,
                                             max_active_states=100000)
        num_scores = num_lats.get_tot_scores(True, True)
        tot_scores = num_scores - den_scores[-1] # num_frame -> index

        for h, s in zip(A, tot_scores):
            h.mmi_prev_score = s.item()
        
        return A

    def batch_score(self, A, nnet_output, den_scores, tu_sum, mmi_weight):
        """Update hypothesis scores; dispatches on ``self.mas_lookahead``.

        ``Imp_A`` is used when the look-ahead exceeds 15, ``Imp_B`` otherwise.
        """
        if self.mas_lookahead > 15:
            return self.Imp_A(A, nnet_output, den_scores, tu_sum, mmi_weight)
        else:
            return self.Imp_B(A, nnet_output, den_scores, tu_sum, mmi_weight)

    def Imp_B(self, A, nnet_output, den_scores, tu_sum, mmi_weight):
        """Score by re-running the intersection once per look-ahead shift.

        For each hypothesis the best (numerator - denominator) score over
        the shifts ``0..mas_lookahead`` is taken; its change relative to
        ``mmi_prev_score`` is scaled by ``mmi_weight`` and added to ``score``.
        Hypotheses whose score is not above -1e8 are dropped from the result.

        NOTE(review): ``A`` is sorted in place, so the caller's list is
        reordered even though a new, filtered list is returned.
        """
        batch = len(A)
        if batch == 0:
            return A

        # reorder: increasing order in u means decreasing order in t
        #          this is required by k2 supervision
        A.sort(key=lambda h: len(h.yseq))

        # (1) get ts: the alignment length in t-axis 
        # +1 since frame start with 0; +1 since redundant blank
        ts = [tu_sum - len(h.yseq) + 2 for h in A]
        ts = torch.Tensor(ts).long()

        # (2) compile numerator graph
        texts = [h.yseq[1:] for h in A] # exclude starting <sos>
        texts = [" ".join([self.char_list[x] for x in text]) for text in texts] # need modification for BPE
        num_graphs, _ = self.graph_compiler.compile(texts, self.P, replicate_den=False)

        # (3) intersection
        lookahead_range = (0, self.mas_lookahead)
        tot_scores_collection = []
        T = nnet_output.size()[1]
        for s in range(lookahead_range[0], lookahead_range[1] + 1):
            # Keep the shifted frame count inside [1, T].
            ts_shift = torch.clamp(ts + s, min=1, max=T)
            supervision = torch.stack([torch.arange(batch),
                                       torch.zeros(batch),
                                       ts_shift
                                       ], dim=1).to(torch.int32)
            dense_fsa_vec = k2.DenseFsaVec(nnet_output.repeat(batch, 1, 1),
                                           supervision)
            num_lats = k2.intersect_dense_pruned(num_graphs,
                                             dense_fsa_vec,
                                             search_beam=20.0,
                                             output_beam=10.0,
                                             min_active_states=30,
                                             max_active_states=20000)
            num_scores = num_lats.get_tot_scores(True, True)
            # num_scores = torch.where(num_scores == -math.inf, 0.0, num_scores)
            tot_scores = num_scores - den_scores[ts_shift-1] # num_frame -> idx_frame
            tot_scores_collection.append(tot_scores)
        # One column per look-ahead shift (mas_lookahead + 1 columns).
        tot_scores = torch.stack(tot_scores_collection, dim=1)

        # hint: we can only use top-1 score rather than logsumexp or top-k-sum
        # since torch.clamp leads to repetition of these scores at boundaries
        tot_scores, _ = torch.topk(tot_scores, 1, dim=-1)

        # (4) assign and post-process
        # Hypotheses holding only the start token get a score of 0.
        idx_to_empty_str = [j for j, h in enumerate(A) if len(h.yseq) == 1]
        for j in idx_to_empty_str:
            tot_scores[j] = 0.0

        for j in range(batch):
            h = A[j]
            h.score += (tot_scores[j].item() - h.mmi_prev_score) * mmi_weight
            h.mmi_prev_score = tot_scores[j].item()

        A = [h for h in A if h.score > -1e8]
        return A
    

    # The bare string below is an expression statement in the class body
    # (it is not attached to Imp_A as a docstring).
    "Version using step_intersect. Very slow. Only used when lookahead is very large"
    def Imp_A(self, A, nnet_output, den_scores, tu_sum, mmi_weight):
        """Score from cached per-hypothesis score sequences (step_intersect).

        Hypotheses whose ``mmi_tot_score`` is None get a full score sequence
        (numerator - denominator) computed once and cached as a list. The
        current score is the maximum of that sequence over the window
        ``[t, min(t + mas_lookahead, T))`` with ``t = tu_sum - len(yseq[1:])``.
        Returns ``A`` unchanged when it is empty or ``mmi_weight`` is 0;
        otherwise hypotheses whose score is not above -1e8 are dropped.

        NOTE(review): if ``t >= T`` the window is empty and ``max`` raises
        ValueError -- confirm the caller guarantees ``t < T``.
        """
        batch = len(A)
        if batch == 0 or mmi_weight == 0:
            return A
        
        # For hypotheses without mmi_tot_score, compute the score sequences and assign
        texts = [h.yseq[1:] for h in A if h.mmi_tot_score is None]
        texts = [" ".join([self.char_list[x] for x in text]) for text in texts]
        indices = [i for i, h in enumerate(A) if h.mmi_tot_score is None]

        T, num_texts = nnet_output.size(1), len(texts)
        if num_texts > 0:
            num, _ = self.graph_compiler.compile(texts, self.P, replicate_den=False)
            supervision = torch.stack([torch.arange(num_texts),
                                       torch.zeros(num_texts),
                                       torch.ones(num_texts) * T], dim=1).int()
            dense_fsa_vec = k2.DenseFsaVec(nnet_output.repeat(num_texts, 1, 1), supervision)
            num_scores = step_intersect(num, dense_fsa_vec)
            assert len(num_scores) == num_texts
    
            tot_scores = [x - den_scores for x in num_scores]
            
            # Cache the sequences on the hypotheses they were computed for.
            for i, score in zip(indices, tot_scores):
                A[i].mmi_tot_score = score.tolist()

        # selecting the scores accordingly
        assert all([h.mmi_tot_score is not None for h in A])
        ts = [tu_sum - len(h.yseq[1:]) for h in A]
        curr_scores = [max(h.mmi_tot_score[t: min(t + self.mas_lookahead, T)])
                      for t, h in zip(ts, A)]
        prev_scores = [h.mmi_prev_score for h in A]
        diff_scores = [a - b for a, b in zip(curr_scores, prev_scores)]

        for curr_s, diff_s, h in zip(curr_scores, diff_scores, A):
            h.score += mmi_weight * diff_s
            h.mmi_prev_score = curr_s
       
        # exclude all hypotheses whose end-states are not reachable 
        A = [h for h in A if h.score > -1e8]
        
        return A

    
================================================
FILE: nets/scorers/mmi_frame_prefix_scorer.py
================================================
import k2
import torch
from espnet.lm.lm_utils import make_lexical_tree
from espnet.nets.scorer_interface import PartialScorerInterface
from snowfall.training.mmi_graph import MmiTrainingGraphCompiler
from snowfall.lexicon import Lexicon
from snowfall.training.mmi_graph import create_bigram_phone_lm


class MMIFramePrefixScorer(PartialScorerInterface):
    def __init__(self, lang, device, idim, sos_id, rank, use_segment, char_list):
        """Set up the word-prefix MMI scorer.

        Args:
            lang: Language directory. It is handed to ``Lexicon`` and joined
                with ``/`` by ``load_weight`` to find ``mmi_param.<rank>.pth``.
            device: Accepted for signature compatibility only; this scorer is
                pinned to CPU whatever value is passed.
            idim: Input dimension of the output projection ``self.lo``.
            sos_id: Token id stored as ``self.eos``.
            rank: Selects which ``mmi_param.<rank>.pth`` file is loaded.
            use_segment: Not used by this constructor.
            char_list: Token strings indexed by token id; must contain
                ``"<space>"``.
        """
        self.lang = lang
        # The surrounding decoder runs on CPU and so does the MMI computation;
        # the ``device`` argument is intentionally ignored.
        cpu = torch.device("cpu")
        self.device = cpu
        print("MMI scorer device: ", self.device)

        lexicon = Lexicon(lang)
        self.lexicon = lexicon
        self.graph_compiler = MmiTrainingGraphCompiler(lexicon, cpu)
        phone_ids = lexicon.phone_symbols()
        self.phone_ids = phone_ids

        # Output projection: one unit per phone symbol plus one extra unit.
        self.lo = torch.nn.Linear(idim, len(phone_ids) + 1)
        self.lm_scores = None
        self.load_weight(rank)

        # Bigram phone LM whose scores come from the loaded checkpoint.
        self.P = create_bigram_phone_lm(phone_ids)
        self.P.set_scores_stochastic_(self.lm_scores)

        # Special values.
        self.blank = 0  # by default 0 means CTC blank
        self.logzero = -10000
        self.eos = sos_id

        # Vocabulary look-ups and the lexical tree searched during decoding.
        self.word2id = lexicon.words._sym2id
        self.id2char = dict(enumerate(char_list))
        self.char2id = {tok: idx for idx, tok in self.id2char.items()}
        self.word_unk_id = self.word2id["<UNK>"]
        self.char_space_id = self.char2id["<space>"]
        self.lexroot = make_lexical_tree(self.word2id, self.char2id, "<UNK>")
        print(f"lexroot succ: {self.lexroot[0].keys()}")

    def load_weight(self, rank):
        """Load the projection weights and LM scores for the given rank.

        Reads ``<lang>/mmi_param.<rank>.pth``, turns off ``requires_grad`` on
        every loaded value, stores the ``"lm_scores"`` entry on
        ``self.lm_scores`` and copies ``"lo.1.weight"`` / ``"lo.1.bias"``
        into ``self.lo``.
        """
        params = torch.load(self.lang / f"mmi_param.{rank}.pth")
        for tensor in params.values():
            tensor.requires_grad = False
        self.lm_scores = params["lm_scores"]
        # The checkpoint stores the projection under the "lo.1." prefix.
        self.lo.load_state_dict(
            {name: params[f"lo.1.{name}"] for name in ("weight", "bias")}
        )

    def init_state(self, x):
        """Create the initial decoding state for one utterance.

        Args:
            x: Encoder output for a single utterance; a batch axis is added
                in front before it is projected by ``self.lo``.

        Returns:
            ``(nnet_output, den_scores, prev_score, lexroot)``.
            ``den_scores`` holds one denominator total score per prefix
            length, ordered from the full length down to a single frame.
        """
        # (1) network output with a leading batch axis of size one
        feats = x.unsqueeze(0)
        nnet_output = self.lo(feats)

        # (2) denominator graph; the text is arbitrary because only the
        #     denominator half of the compiled pair is used
        _, den = self.graph_compiler.compile(["<UNK>"], self.P, replicate_den=True)

        # One intersection per prefix length (longest first) so that only a
        # single denominator lattice is held at a time.
        num_frames = feats.size()[1]
        per_length = []
        for length in range(num_frames, 0, -1):
            # supervision row layout: [sequence index, start frame, length]
            supervision = torch.tensor([[0, 0, length]], dtype=torch.int32)
            dense_fsa_vec = k2.DenseFsaVec(nnet_output, supervision)
            den_lats = k2.intersect_dense(den, dense_fsa_vec, output_beam=10.0)
            per_length.append(
                den_lats.get_tot_scores(log_semiring=True, use_double_scores=True)
            )
        den_scores = torch.cat(per_length).unsqueeze(0).cpu()
        print("Den scores: ", den_scores)

        # (3) nothing has been scored yet, so the previous score is zero
        prev_score = torch.zeros(1, dtype=torch.float32)
        return nnet_output, den_scores, prev_score, self.lexroot

    def select_state(self, states, j):
        """Pick the state belonging to the ``j``-th candidate.

        The network output and denominator scores are shared by every
        candidate; only the accumulated score and the lexical-tree node are
        indexed by ``j``.
        """
        shared_nnet_output, shared_den_scores, scores, roots = states
        chosen_score = scores[j]
        chosen_root = roots[j]
        return shared_nnet_output, shared_den_scores, chosen_score, chosen_root

    def score(self, *args, **kargs):
        """Full-vocabulary scoring is not supported by this scorer.

        The method previously lacked ``self``, so calling it on an instance
        failed with ``TypeError`` instead of the intended error.

        Raises:
            NotImplementedError: Always; use ``score_partial`` instead.
        """
        raise NotImplementedError

    def split_intervals(self, olds, max_interval=2000):
        """Cut every ``(start, end)`` interval into pieces of bounded width.

        Very wide intervals (e.g. > 5k) may cause errors in k2, so each
        interval is emitted as consecutive ``[s, min(end, s + max_interval)]``
        pieces. Neighbouring pieces share their boundary value.

        Returns:
            ``(pieces, groups)`` where ``pieces`` is the flat list of split
            intervals and ``groups[i]`` lists the positions in ``pieces``
            that came from ``olds[i]``.
        """
        pieces, groups = [], []
        for lower, upper in olds:
            members = []
            cursor = lower
            while cursor <= upper:
                # the index of the piece about to be added is the current length
                members.append(len(pieces))
                pieces.append([cursor, min(upper, cursor + max_interval)])
                cursor = cursor + max_interval
            groups.append(members)
        return pieces, groups

    def score_partial(self, y, next_tokens, state, hs_pad):
        """Score every candidate next token against the lexical tree.

        Args:
            y: 1-D tensor with the token ids decoded so far.
            next_tokens: 1-D tensor of candidate token ids.
            state: ``(nnet_output, den_scores, prev_score, lexical_node)`` as
                returned by ``init_state`` / ``select_state``.
            hs_pad: Unused.

        Returns:
            ``(token_scores, new_state)``. ``token_scores`` is a numpy array
            with one score per candidate (accumulated score minus
            ``prev_score``); ``new_state`` carries the shared tensors plus the
            per-candidate accumulated scores and next lexical-tree nodes.
        """
        # (1) unpack the state
        nnet_output_single, den_scores, prev_score, root = state

        # (2) process the prefix and intervals; build numerator graphs
        prefix = "".join([self.id2char[c.item()] for c in y]).replace("<eos>", "").replace("<space>", " ")
        prefix = [self.word2id.get(tok, self.word_unk_id) for tok in prefix.strip().split()]
        if y[-1] != self.char_space_id:
            # the last chunk is an unfinished word, not part of the word prefix
            prefix = prefix[:-1]
        print(f"prefix: {prefix}")

        intervals = []
        force_zero = set()  # indices of invalid candidates, forced to logzero below
        next_roots = []
        for idx, tok in enumerate(next_tokens):
            tok = tok.item()
            # case 1: <space> / <eos> close the current word. This turns the
            # prefix probability into the exact probability of one word.
            if tok == self.char_space_id or tok == self.eos:
                if root[1] > -1:
                    intervals.append([root[1]-1, root[1]])
                    next_roots.append(self.lexroot)
                    print(f"{idx}-th: {tok}, space / eos. This is a valid space")
                else:
                    # the current node does not end a word: placeholder
                    # interval, score killed afterwards
                    intervals.append((self.word_unk_id-1, self.word_unk_id))
                    force_zero.add(idx)
                    next_roots.append(self.lexroot)
                    print(f"{idx}-th: {tok}, space / eos. This is an invalid space")
            # case 2: no such successor in the lexical tree (OOV). Kill it.
            elif tok not in root[0]:
                intervals.append((self.word_unk_id-1, self.word_unk_id))
                force_zero.add(idx)
                next_roots.append(self.lexroot)
                print(f"{idx}-th: {tok}, oov")
            # case 3: a valid intra-word transition; descend to the successor.
            else:
                intervals.append(root[0][tok][2])
                next_roots.append(root[0][tok])
                print(f"{idx}-th: {tok}, intra-trans")

        # Shift by one to be compatible with the lexroot interval format.
        intervals = [(l+1, r+1) for l, r in intervals]
        # Long intervals may cause errors in k2, so split them into pieces.
        split_intervals, interval_indices = self.split_intervals(intervals)
        print(f"intervals: {intervals}, split intervals: {split_intervals}", flush=True)
        num_split = len(split_intervals)

        num_graphs = self.graph_compiler.compile_nums_for_prefix_scoring(
                                          prefix, split_intervals, self.P)

        # (3) frame-wise intersection, one prefix length at a time to avoid OOM.
        # The lengths are visited from T down to 1 so that column k of the
        # numerator scores refers to the same prefix length as column k of
        # den_scores, which init_state fills in that same descending order.
        T = nnet_output_single.size()[1]
        nnet_output = nnet_output_single.repeat(num_split, 1, 1)
        score_segment = []

        for t in range(T, 0, -1):
            supervision = torch.stack([
                          torch.arange(num_split),
                          torch.zeros(num_split),
                          torch.ones(num_split) * t
                          ], dim=1).to(torch.int32)
            dense_fsa_vec = k2.DenseFsaVec(nnet_output[:, :t], supervision)
            num_lats = k2.intersect_dense(num_graphs, dense_fsa_vec, output_beam=10.0)
            num_tot_scores_split = num_lats.get_tot_scores(log_semiring=True,
                                                           use_double_scores=True)
            score_segment.append(num_tot_scores_split)

        score_segment = torch.stack(score_segment, dim=1) # [num_split, T]
        num_scores = torch.zeros(len(interval_indices), T) # [batch_size, T]
        for i, group in enumerate(interval_indices):
            if i in force_zero:
                num_scores[i] = self.logzero
            else:
                # merge the pieces that came from the same original interval
                num_scores[i] = torch.logsumexp(score_segment[group], dim=0)

        # (4) finalize
        tot_scores = num_scores - den_scores.unsqueeze(0)
        tot_scores = torch.logsumexp(tot_scores, dim=-1)[0]
        tok_scores = tot_scores - prev_score
        state = (nnet_output_single, den_scores, tot_scores, next_roots)
        return tok_scores.numpy(), state

    def final_score(self, state):
        """Return the end-of-hypothesis score, which is always zero here."""
        return 0
                

================================================
FILE: nets/scorers/mmi_frame_scorer.py
================================================
import k2
import torch
from espnet.nets.scorer_interface import PartialScorerInterface
from snowfall.training.mmi_graph import MmiTrainingGraphCompiler
from snowfall.lexicon import Lexicon
from snowfall.training.mmi_graph import create_bigram_phone_lm
# from espnet.nets.scorers.trace_frame import trace_frame
from espnet.nets.scorers.mmi_utils import step_intersect


class MMIFrameScorer(PartialScorerInterface):
    def __init__(self, lang, device, idim, sos_id, rank, use_segment, char_list):
        """Set up the frame-level MMI scorer.

        Args:
            lang: Language directory; must contain ``oov.txt`` and
                ``mmi_param.<rank>.pth``.
            device: Device handed to the graph compiler.
            idim: Input dimension of the output projection ``self.lo``.
            sos_id: Token id stored as ``self.eos``.
            rank: Selects which ``mmi_param.<rank>.pth`` file is loaded.
            use_segment: Not used by this constructor.
            char_list: Token strings indexed by token id.

        Raises:
            RuntimeError: If the MMI parameters cannot be loaded after ten
                attempts.
        """
        self.lang = lang
        self.device = device

        self.lexicon = Lexicon(lang)
        # Read the OOV symbol and close the file right away.
        with open(self.lang / 'oov.txt') as oov_file:
            self.oov = self.oovid = oov_file.read().strip()
        self.graph_compiler = MmiTrainingGraphCompiler(self.lexicon, self.device, self.oov)
        self.phone_ids = self.lexicon.phone_symbols()

        # Output projection: one unit per phone symbol plus one extra unit.
        self.lo = torch.nn.Linear(idim, len(self.phone_ids) + 1)
        self.lm_scores = None

        # Loading is retried a few times, but stops at the first success
        # instead of reloading the same file on every iteration.
        last_error = None
        for i in range(10):
            try:
                self.load_weight(rank)
            except Exception as err:
                last_error = err
                print(f"{i}-th trail to load MMI matrix weight but fail")
            else:
                break
        else:
            # Without lm_scores the phone LM below cannot be initialised,
            # so fail here with the underlying cause attached.
            raise RuntimeError(
                f"failed to load MMI parameters for rank {rank} from {self.lang}"
            ) from last_error

        self.P = create_bigram_phone_lm(self.phone_ids)
        self.P.set_scores_stochastic_(self.lm_scores)

        self.char_list = char_list
        self.eos = sos_id  # <sos> is identical to <eos>
        self.blank = 0 # by default 0 means CTC blank
        self.logzero = -10000

    def load_weight(self, rank):
        """Read ``<lang>/mmi_param.<rank>.pth`` into this scorer.

        Every loaded value gets ``requires_grad = False``. The
        ``"lm_scores"`` entry is kept on ``self.lm_scores`` and the
        ``"lo.1.weight"`` / ``"lo.1.bias"`` entries are loaded into
        ``self.lo``.
        """
        checkpoint = torch.load(self.lang / f"mmi_param.{rank}.pth")
        for value in checkpoint.values():
            value.requires_grad = False
        self.lm_scores = checkpoint["lm_scores"]
        projection_state = {
            "weight": checkpoint["lo.1.weight"],
            "bias": checkpoint["lo.1.bias"],
        }
        self.lo.load_state_dict(projection_state)

    def init_state(self, x):
        """Create the initial decoding state for one utterance.

        Args:
            x: Encoder output for a single utterance; a batch axis is added
                in front before it is projected by ``self.lo``.

        Returns:
            ``(nnet_output, den_scores, prev_score)``. ``den_scores`` holds
            one denominator total score per prefix length, ordered from the
            full length down to a single frame.

        Raises:
            ValueError: If the looped scores and the ``step_intersect``
                cross-check differ by more than 0.02 anywhere.
        """
        # (1) network output with a leading batch axis of size one
        feats = x.unsqueeze(0)
        nnet_output = self.lo(feats)

        # (2) denominator graph; the text is arbitrary because only the
        #     denominator half of the compiled pair is used
        _, den = self.graph_compiler.compile(["<UNK>"], self.P, replicate_den=True)

        # One intersection per prefix length (longest first) so that only a
        # single denominator lattice is held at a time.
        num_frames = feats.size()[1]
        per_length = []
        for length in range(num_frames, 0, -1):
            # supervision row layout: [sequence index, start frame, length]
            supervision = torch.tensor([[0, 0, length]], dtype=torch.int32)
            dense_fsa_vec = k2.DenseFsaVec(nnet_output, supervision)
            den_lats = k2.intersect_dense(den, dense_fsa_vec, output_beam=10.0)
            per_length.append(
                den_lats.get_tot_scores(log_semiring=True, use_double_scores=True)
            )
        den_scores = torch.cat(per_length).unsqueeze(0)
        print("den_scores: ", den_scores)

        # Debug cross-check: recompute the same scores with step_intersect
        # on the full-length input and compare after flipping the time axis.
        supervision = torch.tensor([[0, 0, num_frames]], dtype=torch.int32)
        dense_fsa_vec = k2.DenseFsaVec(nnet_output, supervision)
        reference = step_intersect(den, dense_fsa_vec)[0].unsqueeze(0)
        reference = torch.flip(reference, [1])

        max_diff = torch.max(torch.abs(den_scores - reference)).item()
        if abs(max_diff) > 0.02:
            print("denominator error: ", den_scores, reference)
            raise ValueError

        # (3) nothing has been scored yet, so the previous score is zero
        prev_score = torch.zeros(1, dtype=torch.float32)
        return nnet_output, den_scores, prev_score

    def select_state(self, states, j):
        """Pick the state belonging to the ``j``-th candidate.

        The network output and denominator scores are shared by every
        candidate; only the accumulated score is indexed by ``j``.
        """
        shared_nnet_output, shared_den_scores, scores = states
        chosen_score = scores[j]
        return shared_nnet_output, shared_den_scores, chosen_score

    def score(self, *args, **kargs):
        """Full-vocabulary scoring is not supported by this scorer.

        The method previously lacked ``self``, so calling it on an instance
        failed with ``TypeError`` instead of the intended error.

        Raises:
            NotImplementedError: Always; use ``score_partial`` instead.
        """
        raise NotImplementedError

    def score_partial(self, y, next_tokens, state, hs_pad):
        """Score every candidate next token with frame-level MMI posteriors.

        All frame-level scores are kept in reverse time order (column 0 is
        the full utterance) because k2 requires descending input lengths.

        Args:
            y: 1-D tensor with the token ids decoded so far; the first
                token is skipped when the text is rendered.
            next_tokens: 1-D tensor of candidate token ids.
            state: ``(nnet_output, den_scores, prev_score)``.
            hs_pad: Unused.

        Returns:
            ``(token_scores, new_state)`` where ``token_scores`` is the
            accumulated score of each candidate minus ``prev_score``.

        Raises:
            ValueError: If the batched numerator scores and the
                ``step_intersect`` cross-check differ by more than 0.02.
        """
        # (1) unpack state
        nnet_output_single, den_scores, prev_score = state

        # (2) acoustic input: one copy of the utterance per
        #     (prefix length, candidate) pair, longest lengths first
        num_frames = nnet_output_single.size()[1]
        batch_size = len(next_tokens)
        num_egs = num_frames * batch_size
        seq_index = torch.arange(num_egs)
        seq_start = torch.zeros(num_egs)
        seq_length = torch.arange(num_frames, 0, -1).unsqueeze(1).repeat(1, batch_size).view(-1)
        supervision = torch.stack([seq_index, seq_start, seq_length], dim=1).to(torch.int32)
        nnet_output = nnet_output_single.repeat(num_egs, 1, 1)
        dense_fsa_vec = k2.DenseFsaVec(nnet_output, supervision)

        # (3) texts: history plus one candidate token per row
        def render(token_ids):
            # drop the leading token, join with spaces and remove <eos>
            joined = " ".join([self.char_list[tid] for tid in token_ids[1:]])
            return joined.replace("<eos>", "").strip()

        history = y.unsqueeze(0).repeat(num_egs, 1)
        tiled_tokens = next_tokens.unsqueeze(1).repeat(num_frames, 1)
        ys = torch.cat([history, tiled_tokens], dim=1)
        texts = [render(row) for row in ys]

        # (4) compute and accumulate
        num_graphs, _ = self.graph_compiler.compile(texts, self.P, replicate_den=False)
        num_lats = k2.intersect_dense(num_graphs, dense_fsa_vec, output_beam=10.0)
        num_tot_scores = num_lats.get_tot_scores(log_semiring=True, use_double_scores=True)
        num_tot_scores = num_tot_scores.view(num_frames, batch_size).transpose(0, 1) # -> [B, T]

        # Debug cross-check: recompute the numerator with step_intersect on
        # the full-length input and compare after flipping the time axis.
        supervision = torch.stack([
                      torch.arange(batch_size),
                      torch.zeros(batch_size),
                      torch.ones(batch_size).int() * num_frames
                      ], dim=1).to(torch.int32)
        dense_fsa_vec = k2.DenseFsaVec(nnet_output, supervision)

        ys = ys[: batch_size]
        texts = [render(row) for row in ys]
        num_graphs, _ = self.graph_compiler.compile(texts, self.P, replicate_den=False)
        reference = torch.stack(step_intersect(num_graphs, dense_fsa_vec), dim=0)
        reference = torch.flip(reference, [1])

        max_diff = torch.max(torch.abs(num_tot_scores - reference)).item()
        if abs(max_diff) > 0.02:
            print("numerator error: ", num_tot_scores, reference)
            raise ValueError

        # subtract the denominator scores, then sum over prefix lengths
        tot_scores_frame = num_tot_scores - den_scores
        tot_scores = torch.logsumexp(tot_scores_frame, dim=-1)

        # (5) treat <eos> and the CTC blank specially
        candidates = tiled_tokens.squeeze(1)[:batch_size] # recover the initial next_tokens

        # <eos> gets the exact probability (full-length column) rather than
        # the prefix probability
        eos_pos = torch.where(candidates == self.eos)[0]
        if len(eos_pos) > 0:
            tot_scores[eos_pos] = tot_scores_frame[eos_pos.item(), 0]

        # CTC blank is never allowed in a hypothesis: kill it
        blk_pos = torch.where(candidates == self.blank)[0]
        if len(blk_pos) > 0:
            tot_scores[blk_pos] = self.logzero

        # (6) finalize
        tok_scores = tot_scores - prev_score
        new_state = (nnet_output_single, den_scores, tot_scores)
        return tok_scores, new_state

    def final_score(self, state):
        """Return the end-of-hypothesis score, which is always zero here."""
        return 0
                

================================================
FILE: nets/scorers/mmi_frame_scorer_trace.py
================================================
import k2
import torch
from espnet.nets.scorer_interface import PartialScorerInterface
from snowfall.training.mmi_graph import MmiTrainingGraphCompiler
from snowfall.lexicon import Lexicon
from snowfall.training.mmi_graph import create_bigram_phone_lm
from espnet.nets.scorers.trace_frame import compute_frame_level_scores_batch

class MMIFrameScorer(PartialScorerInterface):
    def __init__(self, lang, device, idim, sos_id, rank, use_segment, char_list):
        """Set up the lattice-tracing variant of the frame-level MMI scorer.

        Args:
            lang: Language directory, handed to ``Lexicon`` and used by
                ``load_weight`` to find ``mmi_param.<rank>.pth``.
            device: Device handed to the graph compiler.
            idim: Input dimension of the output projection ``self.lo``.
            sos_id: Token id stored as ``self.eos``.
            rank: Selects which ``mmi_param.<rank>.pth`` file is loaded.
            use_segment: Not used by this constructor.
            char_list: Token strings indexed by token id.
        """
        self.lang, self.device = lang, device

        lexicon = Lexicon(lang)
        self.lexicon = lexicon
        self.graph_compiler = MmiTrainingGraphCompiler(lexicon, device)
        phone_ids = lexicon.phone_symbols()
        self.phone_ids = phone_ids

        # Output projection: one unit per phone symbol plus one extra unit.
        self.lo = torch.nn.Linear(idim, len(phone_ids) + 1)
        self.lm_scores = None
        self.load_weight(rank)

        # Bigram phone LM whose scores come from the loaded checkpoint.
        self.P = create_bigram_phone_lm(phone_ids)
        self.P.set_scores_stochastic_(self.lm_scores)

        self.char_list = char_list
        self.eos = sos_id  # <sos> is identical to <eos>
        self.blank = 0  # by default 0 means CTC blank
        self.logzero = -10000

    def load_weight(self, rank):
        """Load the projection weights and LM scores for the given rank.

        Reads ``<lang>/mmi_param.<rank>.pth``, turns off ``requires_grad`` on
        every loaded value, stores the ``"lm_scores"`` entry on
        ``self.lm_scores`` and copies ``"lo.1.weight"`` / ``"lo.1.bias"``
        into ``self.lo``.
        """
        ckpt_path = self.lang / f"mmi_param.{rank}.pth"
        loaded = torch.load(ckpt_path)
        for entry in loaded.values():
            entry.requires_grad = False
        self.lm_scores = loaded["lm_scores"]
        weight, bias = loaded["lo.1.weight"], loaded["lo.1.bias"]
        self.lo.load_state_dict({"weight": weight, "bias": bias})

    def init_state(self, x):
        """Create the initial decoding state for one utterance.

        Args:
            x: Encoder output for a single utterance; a batch axis is added
                in front before it is projected by ``self.lo``.

        Returns:
            ``(nnet_output, den_scores, prev_score)`` where ``den_scores`` is
            whatever ``compute_frame_level_scores_batch`` returns for the
            denominator graph.
        """
        # (1) network output with a leading batch axis of size one
        nnet_output = self.lo(x.unsqueeze(0))

        # (2) denominator scores, obtained by tracing the lattice rather than
        #     by one intersection per prefix length: this is much faster
        texts = ["<UNK>"]  # arbitrary text, only the denominator graph is used
        _, den = self.graph_compiler.compile(texts, self.P, replicate_den=True)
        assert len(texts) == nnet_output.size(0)
        den_scores = compute_frame_level_scores_batch(den, nnet_output)
        print("den_scores: ", den_scores)

        # (3) nothing has been scored yet, so the previous score is zero
        prev_score = torch.zeros(1, dtype=torch.float32)
        return nnet_output, den_scores, prev_score

    def select_state(self, states, j):
        """Pick the state belonging to the ``j``-th candidate.

        The network output and denominator scores are shared by every
        candidate; only the accumulated score is indexed by ``j``.
        """
        shared_nnet_output, shared_den_scores, scores = states
        chosen_score = scores[j]
        return shared_nnet_output, shared_den_scores, chosen_score

    def score(self, *args, **kargs):
        """Full-vocabulary scoring is not supported by this scorer.

        The method previously lacked ``self``, so calling it on an instance
        failed with ``TypeError`` instead of the intended error.

        Raises:
            NotImplementedError: Always; use ``score_partial`` instead.
        """
        raise NotImplementedError

    def score_partial(self, y, next_tokens, state, hs_pad):
        """Score every candidate next token via lattice tracing.

        Args:
            y: 1-D tensor with the token ids decoded so far; the first
                token is skipped when the text is rendered.
            next_tokens: 1-D tensor of candidate token ids.
            state: ``(nnet_output, den_frame_scores, prev_score)``.
            hs_pad: Unused.

        Returns:
            ``(token_scores, new_state)`` where ``token_scores`` is the
            accumulated score of each candidate minus ``prev_score``.
        """
        # (1) unpack state
        nnet_output_single, den_frame_scores, prev_score = state

        # (2) acoustic input: one full-length copy per candidate
        num_frames = nnet_output_single.size()[1]
        batch_size = len(next_tokens)
        nnet_output = nnet_output_single.repeat(batch_size, 1, 1)
        seq_index = torch.arange(batch_size)
        seq_start = torch.zeros(batch_size)
        seq_length = torch.ones(batch_size) * num_frames
        supervision = torch.stack([seq_index, seq_start, seq_length], dim=1).to(torch.int32)
        dense_fsa_vec = k2.DenseFsaVec(nnet_output, supervision)

        # (3) texts: history plus one candidate token per row. Without a
        #     <space> symbol tokens are joined by blanks; with one they are
        #     concatenated and a blank is inserted after every <space>.
        history = y.unsqueeze(0).repeat(batch_size, 1)
        candidates = next_tokens.unsqueeze(1)
        ys = torch.cat([history, candidates], dim=1)
        has_space = "<space>" in self.char_list
        joiner = "" if has_space else " "
        texts = []
        for row in ys:
            text = joiner.join([self.char_list[tid] for tid in row[1:]])
            text = text.replace("<eos>", "")
            if has_space:
                text = text.replace("<space>", "<space> ")
            texts.append(text.strip())
        num, _  = self.graph_compiler.compile(texts, self.P, replicate_den=False)
        print(texts)

        # (4) compute and accumulate
        num_frame_scores = compute_frame_level_scores_batch(num, nnet_output)

        # combine with the denominator scores, then sum over the frame axis
        tot_scores_frame = num_frame_scores - den_frame_scores
        tot_scores = torch.logsumexp(tot_scores_frame, dim=-1)
        print("numerator scores: ", num_frame_scores)
        print("log_posterior: ", tot_scores_frame)

        # (5) treat <eos> and the CTC blank specially
        candidates = candidates.squeeze(1) # recover the initial next_tokens

        # <eos> gets the exact probability (column 0) rather than the
        # prefix probability
        eos_pos = torch.where(candidates == self.eos)[0]
        if len(eos_pos) > 0:
            tot_scores[eos_pos] = tot_scores_frame[eos_pos.item(), 0]

        # CTC blank is never allowed in a hypothesis: kill it
        blk_pos = torch.where(candidates == self.blank)[0]
        if len(blk_pos) > 0:
            tot_scores[blk_pos] = self.logzero

        # (6) finalize
        tok_scores = tot_scores - prev_score
        new_state = (nnet_output_single, den_frame_scores, tot_scores)
        return tok_scores, new_state

    def final_score(self, state):
        """Return the end-of-hypothesis score, which is always zero here."""
        return 0
                

================================================
FILE: nets/scorers/mmi_lookahead.py
================================================
import k2
import torch
from espnet.nets.scorer_interface import PartialScorerInterface
from espnet.lm.lm_utils import make_lexical_tree
from snowfall.training.mmi_graph import MmiTrainingGraphCompiler
from snowfall.lexicon import Lexicon
from snowfall.training.mmi_graph import create_bigram_phone_lm
from espnet.nets.scorers.lookahead import parse_lookahead, build_word_fsa_mat

class MMILookaheadScorer(PartialScorerInterface):
    def __init__(self, lang, device, idim, sos_id, rank, use_segment, char_list):
        """Set up the look-ahead MMI scorer.

        Args:
            lang: Language directory; must contain ``oov.txt``, ``oov.int``
                and ``mmi_param.<rank>.pth``.
            device: Device handed to the graph compiler.
            idim: Input dimension of the output projection ``self.lo``.
            sos_id: Token id stored as ``self.eos``.
            rank: Selects which ``mmi_param.<rank>.pth`` file is loaded.
            use_segment: Not used by this constructor.
            char_list: BPE token strings indexed by token id.
        """
        self.lang = lang
        self.device = device

        self.lexicon = Lexicon(lang)
        # Read the OOV symbol and close the file right away.
        with open(self.lang / 'oov.txt') as oov_file:
            self.oov = self.oovid = oov_file.read().strip()
        self.graph_compiler = MmiTrainingGraphCompiler(self.lexicon, self.device, self.oov)
        self.phone_ids = self.lexicon.phone_symbols()

        # Output projection: one unit per phone symbol plus one extra unit.
        self.lo = torch.nn.Linear(idim, len(self.phone_ids) + 1)
        self.lm_scores = None
        self.load_weight(rank)

        self.P = create_bigram_phone_lm(self.phone_ids)
        self.P.set_scores_stochastic_(self.lm_scores)

        # We use a char-level lexicon root but search in it by BPE.
        # The alphabet is upper case when "A" is a token, lower case
        # otherwise, plus the apostrophe; ids start at 1.
        first_letter = 65 if "A" in char_list else 97
        alphabet = [chr(first_letter + k) for k in range(26)] + ["\'"]
        self.alphabet_dict = {c: i for i, c in enumerate(alphabet, start=1)}
        self.word_dict = self.lexicon.words._sym2id
        # The unknown-word id is read from oov.int.
        with open(self.lang / 'oov.int') as oov_int_file:
            self.word_unk_id = int(oov_int_file.read().strip())
        self.lexroot = make_lexical_tree(self.word_dict, self.alphabet_dict, self.word_unk_id)
        print("end of lex building", flush=True)

        self.char_list = char_list # BPE char list
        # first character of the second-to-last token, used as the BPE
        # word-boundary marker
        self.bpe_space = char_list[-2][0]

        # special values
        self.logzero = -10000.0
        self.eos = sos_id

    def load_weight(self, rank):
        """Read ``<lang>/mmi_param.<rank>.pth`` into this scorer.

        Every loaded value gets ``requires_grad = False``. The
        ``"lm_scores"`` entry is kept on ``self.lm_scores`` and the
        ``"lo.1.weight"`` / ``"lo.1.bias"`` entries are loaded into
        ``self.lo``.
        """
        params = torch.load(self.lang / f"mmi_param.{rank}.pth")
        for tensor in params.values():
            tensor.requires_grad = False
        self.lm_scores = params["lm_scores"]
        # The checkpoint stores the projection under the "lo.1." prefix.
        self.lo.load_state_dict(
            {name: params[f"lo.1.{name}"] for name in ("weight", "bias")}
        )

    def init_state(self, x):
        """Create the initial decoding state for one utterance.

        Args:
            x: Encoder output for a single utterance; a batch axis is added
                in front before it is projected by ``self.lo``.

        Returns:
            ``(nnet_output, den_scores, prev_score)``. ``den_scores`` holds
            one denominator total score per prefix length, ordered from the
            full length down to a single frame.
        """
        # (1) network output with a leading batch axis of size one
        nnet_output = self.lo(x.unsqueeze(0))

        # (2) denominator graph; the text is arbitrary because only the
        #     denominator half of the compiled pair is used
        _, den = self.graph_compiler.compile(["<UNK>"], self.P, replicate_den=True)

        # (3) denominator scores, one intersection per prefix length
        #     (longest first) to avoid a memory spike
        num_frames = nnet_output.size()[1]
        per_length = []
        for length in range(num_frames, 0, -1):
            # supervision row layout: [sequence index, start frame, length]
            supervision = torch.tensor([[0, 0, length]], dtype=torch.int32)
            dense_fsa_vec = k2.DenseFsaVec(nnet_output, supervision)
            den_lats = k2.intersect_dense(den, dense_fsa_vec, output_beam=10.0)
            per_length.append(
                den_lats.get_tot_scores(log_semiring=True, use_double_scores=True)
            )
        den_scores = torch.cat(per_length).unsqueeze(0)

        # (4) nothing has been scored yet, so the previous score is zero
        prev_score = torch.zeros(1, dtype=torch.float32)

        return nnet_output, den_scores, prev_score

    def select_state(self, states, j):
        """Pick the state belonging to the ``j``-th candidate.

        The network output and denominator scores are shared by every
        candidate; only the accumulated score is indexed by ``j``.
        """
        shared_nnet_output, shared_den_scores, scores = states
        chosen_score = scores[j]
        return shared_nnet_output, shared_den_scores, chosen_score

    def score(self, *args, **kargs):
        """Full-vocabulary scoring is not supported by this scorer.

        The method previously lacked ``self``, so calling it on an instance
        failed with ``TypeError`` instead of the intended error.

        Raises:
            NotImplementedError: Always; use ``score_partial`` instead.
        """
        raise NotImplementedError

    def score_partial(self, y, next_tokens, state, hs_pad):
        """Score every candidate next token with look-ahead numerators.

        Args:
            y: 1-D tensor with the token ids decoded so far.
            next_tokens: 1-D tensor of candidate token ids.
            state: ``(nnet_output, den_frame_scores, prev_score)``.
            hs_pad: Unused.

        Returns:
            ``(token_scores, new_state)`` where ``token_scores`` is the
            accumulated score of each candidate minus ``prev_score``.
        """
        # (1) unpack the state
        nnet_output_single, den_frame_scores, prev_score = state
        beam_size = len(next_tokens)

        # (2) build the numerator graphs: history plus one candidate per row
        history = y.unsqueeze(0).repeat(beam_size, 1)
        yseqs = torch.cat([history, next_tokens.unsqueeze(1)], dim=1).cpu().tolist()
        parsed = [
            parse_lookahead(yseq, self.lexroot, self.char_list,
                            self.alphabet_dict, self.word_dict, self.bpe_space)
            for yseq in yseqs
        ]
        arc_mats = [build_word_fsa_mat(*item) for item in parsed]
        word_fsa = k2.create_fsa_vec(
            [k2.Fsa.from_dict({"arcs": mat}) for mat in arc_mats]
        )
        num_graphs = self.graph_compiler.compile_lookahead_numerators(word_fsa, self.P)

        # (3) one intersection per prefix length (longest first); doing all
        #     lengths in one go would not fit in memory
        num_frames = nnet_output_single.size()[1]
        nnet_output = nnet_output_single.expand(beam_size, -1, -1)
        per_length = []
        for length in range(num_frames, 0, -1):
            seq_index = torch.arange(beam_size)
            seq_start = torch.zeros(beam_size)
            seq_length = torch.ones(beam_size) * length
            supervision = torch.stack([seq_index, seq_start, seq_length], dim=-1).cpu().int()
            dense_fsa_vec = k2.DenseFsaVec(nnet_output[:, :length, :], supervision)

            # Pruned intersection, otherwise this would be very slow.
            # The beam / active-state parameters are tunable.
            lats = k2.intersect_dense_pruned(num_graphs,
                                             dense_fsa_vec,
                                             search_beam=10.0,
                                             output_beam=5.0,
                                             min_active_states=30,
                                             max_active_states=10000)
            per_length.append(
                lats.get_tot_scores(log_semiring=True, use_double_scores=True)
            )
        num_frame_scores = torch.stack(per_length, dim=-1) # [beam, T]
        # Important: replace every -inf by logzero
        num_frame_scores = torch.clamp(num_frame_scores, min=self.logzero)

        frame_scores = num_frame_scores - den_frame_scores
        scores = torch.logsumexp(frame_scores, dim=-1)

        # (4) postprocess
        # (4.1) <eos> may only be aligned with the last frame, so it takes
        #       the full-length column instead of the sum over lengths
        eos_pos = torch.where(next_tokens == self.eos)[0]
        if len(eos_pos) > 0:
            scores[eos_pos] = frame_scores[eos_pos.item(), 0]

        # TODO: kill some valid token like <blank>

        # (5) return
        token_scores = scores - prev_score
        new_state = (nnet_output_single, den_frame_scores, scores)
        return token_scores, new_state

    """  
    def search_lexical_tree(self, node, next_tokens):
        if node is None:
            print("None node given!")

        intervals, next_nodes = [], []
        # some tokens are invalid (e.g., invalid word combination). however, we should
        # still compute the score for it to be compatible. We will force that score
        # to logzero in postprocess stage, but need to use the index_to_kill make a record.
        index_to_kill = [] 

        for idx, i in enumerate(next_tokens):
            # node is the previous one if _ is not proposed else root
            subword = self.char_list[i]
            # case (1): '_' or <eos> is proposed, which means end of the word
            if subword == self.bpe_space or subword == "<eos>":
                this_node = node # keep 'node' unchanged
                # Invalid and kill. Previous node cannot be root
                if this_node == self.lexroot:
                    interval = [self.word_unk_id-1, self.word_unk_id]
                    this_node = None
                    index_to_kill.append(idx)
                # score is for a word, not a word prefix -> interval for only one word 
                else:
                    interval = [this_node[2][0], this_node[2][0] + 1]
                    # next_node is root so the next token is valid even though it is not
                    # start with '_' 
                    this_node = self.lexroot

            # case (2): impossible token. kill them
            elif subword == "<blank>" or subword == "<unk>":
                this_node = None
                interval = [self.word_unk_id-1, self.word_unk_id]
                index_to_kill.append(idx)

            # case (3): ordinary tokens. All special token should never reach this branch
            else:
                # subword start with '_' means a prefix of new word -> search from root
                this_node = self.lexroot if subword.startswith(self.bpe_space) else node

                subword = subword.replace(self.bpe_space, "")
                for c in subword:
                    cid = self.alphabet_dict[c]
                    # descent to successor
                    if cid in this_node[0]:
                        this_node = this_node[0][cid]
                    # no valid successor found. kill this hypothesis
                    else:
                        this_node = None
                        break
            
                if this_node is not None and this_node[2] is not None:
                    interval = this_node[2]
                else:
                    interval = [self.word_unk_id-1, self.word_unk_id]
                    index_to_kill.append(idx)
            
            # plus one to correct the interval. see building process of lexroot
            interval = [interval[0] + 1, interval[1] + 1]  
            intervals.append(interval)
            # this_node == None always means a kill
            next_nodes.append(this_node)

        return intervals, next_nodes, index_to_kill
    """
    def final_score(self, state):
        """Return the end-of-hypothesis score, which is always zero here."""
        return 0
     

================================================
FILE: nets/scorers/mmi_lookahead_bak.py
================================================
import k2
import torch
from espnet.nets.scorer_interface import PartialScorerInterface
from espnet.lm.lm_utils import make_lexical_tree
from snowfall.training.mmi_graph import MmiTrainingGraphCompiler
from snowfall.lexicon import Lexicon
from snowfall.training.mmi_graph import create_bigram_phone_lm


class MMILookaheadScorer(PartialScorerInterface):
    def __init__(self, lang, device, idim, sos_id, rank, use_segment, char_list):
        """Build the lexicon, graph compiler, output layer and lexical tree.

        Args:
            lang: directory holding the lexicon, ``oov.int`` and
                ``mmi_param.{rank}.pth``. It is joined with ``/``, so it is
                presumably a ``pathlib.Path`` -- TODO confirm with callers.
            device: device handed to ``MmiTrainingGraphCompiler``.
            idim: input size of the linear output layer ``lo``.
            sos_id: token id stored as ``self.sos_id``.
            rank: selects the ``mmi_param.{rank}.pth`` checkpoint.
            use_segment: accepted but not used by this constructor.
            char_list: list mapping token ids to token strings.
        """
        self.lang = lang
        self.device = device

        self.lexicon = Lexicon(lang)
        self.graph_compiler = MmiTrainingGraphCompiler(self.lexicon, self.device)
        self.phone_ids = self.lexicon.phone_symbols()

        # One extra output unit on top of the phone symbols.
        self.lo = torch.nn.Linear(idim, len(self.phone_ids) + 1)
        self.lm_scores = None
        self.load_weight(rank)

        self.P = create_bigram_phone_lm(self.phone_ids)
        self.P.set_scores_stochastic_(self.lm_scores)

        self.wdict = self.lexicon.words._sym2id
        self.pdict = self.lexicon.phones._sym2id
        self.lexroot = make_lexical_tree(self.wdict, self.pdict, "<UNK>")
        self.char_list = char_list
        self.space_str = "<space>"
        # Use a context manager so the file handle is closed right away
        # instead of staying open until garbage collection.
        with open(self.lang / 'oov.int') as oov_file:
            self.oovid = int(oov_file.read().strip())
        self.unk_id = self.wdict["<UNK>"]
        self.sos_id = sos_id

        # Score assigned to hypotheses that must be discarded.
        self.logzero = -10000.0

    def load_weight(self, rank):
        # load lo weight and lm_scores
        ckpt_dict = torch.load(self.lang / f"mmi_param.{rank}.pth")
        for v in ckpt_dict.values():
            v.requires_grad=False
        self.lm_scores = ckpt_dict["lm_scores"]
        lo_dict = {"weight": ckpt_dict["lo.1.weight"],
                   "bias": ckpt_dict["lo.1.bias"]}
        self.lo.load_state_dict(lo_dict)

    def init_state(self, x):
        # TODO: determine the format of state
        x = x.unsqueeze(0)
        nnet_output = self.lo(x)

        T = x.size()[1]
        supervision = torch.Tensor([[0, 0, T]]).to(torch.int32)

        prev_score = torch.Tensor([0]).to(torch.float32)
        return nnet_output, supervision, prev_score, self.lexroot 

    def select_state(self, states, j):
        """Placeholder for picking the state of hypothesis ``j``; not implemented.

        NOTE(review): this currently returns ``None`` for every input, while
        ``score_partial`` unpacks its ``state`` argument into four values.
        """
        # TODO: select the state
        pass

    def score(**kargs):
        raise NotImplementedError

    def score_partial(self, y, next_tokens, state, hs_pad):
        """Score each candidate in ``next_tokens`` as a continuation of ``y``.

        Args:
            y: token-id prefix; its first element is skipped when the prefix
                text is rebuilt.
            next_tokens: candidate token ids (``.size()[0]`` gives the count).
            state: 4-tuple ``(nnet_output, supervision, prev_score, node)``.
            hs_pad: not used by this method.

        Returns:
            ``(num_tok_scores, new_state)`` where ``num_tok_scores`` is the
            accumulated numerator score minus ``prev_score``.
        """
        # (1) unpack state
        nnet_output_single, supervision_single, prev_score, node = state
        batch_size = next_tokens.size()[0]

        # (2) handle the prefix and candidate, find the prefix_ids and intervals
        # prefix
        prefix = [self.char_list[x] for x in y[1:]]
        # prefix = [char_list[x] for x in y[1:]]
        prefix = "".join(prefix).replace(self.space_str, " ").strip().split(' ')
        word_transition = False
        if self.char_list[y[-1]] == self.space_str:
            # end with <space>, word transition
            prefix = prefix[:-1]
            word_transition = True
        # words missing from the word table are mapped to the OOV id
        prefix = [self.wdict.get(w, self.oovid)  for w in prefix]
        
        # candidate: any interval: start-1, end -> [start, end)
        intervals, next_nodes = [], []
        for tok in next_tokens:
            tok = self.char_list[tok]
            
            # NOTE(review): this `if` is not chained to the `if`/`else` below.
            # A <space> candidate appends the root here and then appends a
            # second entry in one of the branches below, so `next_nodes` can
            # end up longer than `intervals`. Confirm the intended handling.
            if tok == self.space_str: 
                next_nodes.append(self.lexroot)
            # attention symbol but not mmi symbol
            if not tok in self.pdict:
                intervals.append((self.unk_id - 1, self.unk_id))
                next_nodes.append(None)
            else:
                pid = self.pdict[tok]
                # NOTE(review): `node` may be None if it came from a previous
                # `next_nodes` entry; `node[0]` would then raise -- confirm.
                if pid in node[0]: # successors exists: valid word
                    intervals.append(node[0][pid][2])
                    next_nodes.append(node[0][pid])
                else: # OOV: 
                    intervals.append((self.unk_id - 1, self.unk_id))
                    next_nodes.append(None)
        intervals = [(l+1, r+1) for l, r in intervals] # work around: to be compatible with lex-tree 
        split_intervals, indices = self.split_intervals(intervals)
        num_split = len(split_intervals)

        # (3) acoustic information
        # one copy of the utterance per split interval
        supervision = supervision_single.repeat(num_split, 1)
        nnet_output = nnet_output_single.repeat(num_split, 1, 1)
        dense_fsa_vec = k2.DenseFsaVec(nnet_output, supervision)
        
        # (4) accumulate probability
        num_graphs = self.graph_compiler.compile_nums_for_prefix_scoring(prefix, split_intervals, self.P)
        num_lats = k2.intersect_dense(num_graphs, dense_fsa_vec, output_beam=5.0)
        num_tot_scores_split = num_lats.get_tot_scores(log_semiring=True, use_double_scores=True)
        num_tot_scores = torch.zeros(batch_size).to(y.device)
        # merge the pieces of each candidate's interval back into one score
        for i in range(batch_size):
            num_tot_scores[i] = torch.logsumexp(num_tot_scores_split[indices[i]], dim=-1)

        num_tok_scores = num_tot_scores - prev_score
        return num_tok_scores, (nnet_output_single, supervision_single, num_tot_scores, next_nodes)    
         

    def split_intervals(self, olds, max_interval=2000):
        """Cut half-open ``[start, end)`` intervals into pieces of bounded length.

        Very large intervals may cause errors in k2 (e.g. > 5k), so each one
        is replaced by consecutive pieces of at most ``max_interval`` ids.

        Args:
            olds: iterable of ``(start, end)`` pairs.
            max_interval: maximum length of a produced piece.

        Returns:
            ``(news, indices)``: the flat list of pieces, and for every input
            interval the positions of its pieces inside ``news``.
        """
        news, indices = [], []
        for start, end in olds:
            group_idx = []
            # Strict '<': with half-open intervals, '<=' would emit an empty
            # trailing piece [end, end] whenever the interval length is an
            # exact multiple of max_interval.
            while start < end:
                group_idx.append(len(news))
                news.append([start, min(end, start + max_interval)])
                start += max_interval
            indices.append(group_idx)
        return news, indices


================================================
FILE: nets/scorers/mmi_lookahead_split.py
================================================
import k2
import torch
from espnet.nets.scorer_interface import PartialScorerInterface
from espnet.lm.lm_utils import make_lexical_tree
from snowfall.training.mmi_graph import MmiTrainingGraphCompiler
from snowfall.lexicon import Lexicon
from snowfall.training.mmi_graph import create_bigram_phone_lm


class MMILookaheadScorer(PartialScorerInterface):
    def __init__(self, lang, device, idim, sos_id, rank, use_segment, char_list):
        """Build the lexicon, graph compiler, output layer and char-level tree.

        Args:
            lang: directory holding the lexicon, ``oov.int`` and
                ``mmi_param.{rank}.pth``. It is joined with ``/``, so it is
                presumably a ``pathlib.Path`` -- TODO confirm with callers.
            device: device handed to ``MmiTrainingGraphCompiler``.
            idim: input size of the linear output layer ``lo``.
            sos_id: token id stored as ``self.eos``.
            rank: selects the ``mmi_param.{rank}.pth`` checkpoint.
            use_segment: accepted but not used by this constructor.
            char_list: BPE token list; the first character of its
                second-to-last entry is taken as the BPE word-boundary mark.
        """
        self.lang = lang
        self.device = device

        self.lexicon = Lexicon(lang)
        self.graph_compiler = MmiTrainingGraphCompiler(self.lexicon, self.device)
        self.phone_ids = self.lexicon.phone_symbols()

        self.lo = torch.nn.Linear(idim, len(self.phone_ids) + 1)
        self.lm_scores = None
        self.load_weight(rank)

        self.P = create_bigram_phone_lm(self.phone_ids)
        self.P.set_scores_stochastic_(self.lm_scores)

        # We use a char-level lexicon root but search in it by BPE
        alphabet = [chr(i) for i in range(65, 91)] + ["\'"]  # 'A'..'Z' plus apostrophe
        self.alphabet_dict = {c: i+1 for i, c in enumerate(alphabet)}
        self.word_dict = self.lexicon.words._sym2id
        # Use a context manager so the file handle is closed right away
        # instead of staying open until garbage collection.
        with open(self.lang / 'oov.int') as oov_file:
            self.word_unk_id = int(oov_file.read().strip())
        self.lexroot = make_lexical_tree(self.word_dict, self.alphabet_dict, self.word_unk_id)
        print("end of lex building", flush=True)

        self.char_list = char_list  # BPE char list
        self.bpe_space = char_list[-2][0]

        # special value
        self.logzero = -10000.0
        self.eos = sos_id

    def load_weight(self, rank):
        # load lo weight and lm_scores
        ckpt_dict = torch.load(self.lang / f"mmi_param.{rank}.pth")
        for v in ckpt_dict.values():
            v.requires_grad=False
        self.lm_scores = ckpt_dict["lm_scores"]
        lo_dict = {"weight": ckpt_dict["lo.1.weight"],
                   "bias": ckpt_dict["lo.1.bias"]}
        self.lo.load_state_dict(lo_dict)

    def init_state(self, x):
        """Create the initial decoding state for one utterance.

        Returns:
            ``(nnet_output, den_scores, prev_score, lexroot)``. ``den_scores``
            holds one denominator total per supervised length, ordered from
            the full length ``T`` down to 1, with a leading axis of size 1.
        """
        # (1) output of the linear layer for the single utterance
        nnet_output = self.lo(x.unsqueeze(0))

        # (2) only the denominator graph of the compile result is kept
        texts = ["<UNK>"]
        _, den = self.graph_compiler.compile(texts, self.P, replicate_den=True)

        # (3) one intersection per length, done in a loop to avoid a memory spike
        num_frames = nnet_output.size()[1]
        per_length_scores = []
        for length in reversed(range(1, num_frames + 1)):
            supervision = torch.Tensor([[0, 0, length]]).to(torch.int32)  # [idx, start, length]
            dense_fsa_vec = k2.DenseFsaVec(nnet_output, supervision)
            den_lats = k2.intersect_dense(den, dense_fsa_vec, output_beam=10.0)
            per_length_scores.append(
                den_lats.get_tot_scores(log_semiring=True, use_double_scores=True)
            )
        den_scores = torch.cat(per_length_scores).unsqueeze(0)  # [T] -> [B, T]

        # (4) the running score starts at zero
        initial_score = torch.Tensor([0]).to(torch.float32)

        return nnet_output, den_scores, initial_score, self.lexroot

    def select_state(self, states, j):
        """Return the state of hypothesis ``j`` from a batched state tuple.

        The first two entries are shared and passed through unchanged; the
        previous scores and the next tree nodes are indexed with ``j``.
        """
        shared_output, shared_den, per_hyp_scores, per_hyp_nodes = states
        picked = (shared_output, shared_den, per_hyp_scores[j], per_hyp_nodes[j])
        return picked

    def score(**kargs):
        raise NotImplementedError

    def score_partial(self, y, next_tokens, state, hs_pad):
        """Score each candidate in ``next_tokens`` as a continuation of ``y``.

        Args:
            y: token-id prefix; its first element is skipped when the prefix
                text is rebuilt.
            next_tokens: candidate token ids.
            state: 4-tuple ``(nnet_output, den_frame_scores, prev_score, node)``.
            hs_pad: not used by this method.

        Returns:
            ``(token_scores, states)`` where ``token_scores`` is the new
            accumulated score minus ``prev_score`` and ``states`` carries the
            per-candidate scores and lexical-tree nodes.
        """
        print("start a new partial score", flush=True)
        # NOTE: this changes torch's print options for the whole process.
        torch.set_printoptions(sci_mode=False)
        # (1) unpack state
        nnet_output_single, den_frame_scores, prev_score, node = state
        beam_size = len(next_tokens)
        T = nnet_output_single.size()[1]   

        # (2) build numerator graph
        
        # join the BPE pieces, then turn the BPE boundary mark into word breaks
        prefix = "".join([self.char_list[x.item()] for x in y[1:]])\
                 .replace(" ", "").replace(self.bpe_space, " ").strip().split(" ")

        intervals, next_nodes, index_to_kill = self.search_lexical_tree(node, next_tokens)
        # if proposed token does not start with '_', y[-1] should be removed during graph composition
        # '_' means the proposal of new word. <eos> cannot be seen as a new word. other special tokens
        # would be killed
        drop_prefix_tail = [int(not self.char_list[x.item()].startswith(self.bpe_space)) for x in next_tokens]
        # some intervals are too long. we need to split them before computation and then combine 
        split_intervals, interval_indexes, split_drop_prefix_tail = self.split_intervals(intervals, drop_prefix_tail)
        print("intervals: \n", intervals, "split intervals: \n", split_intervals, "interval_index: \n", interval_indexes)
        graphs = self.graph_compiler.compile_nums_for_prefix_scoring(
                     prefix, split_intervals, self.P, split_drop_prefix_tail)

        # (3) loop to compute frame-level prob. cannot do this in one go due to memory limitation
        split_size = len(split_intervals)
        split_frame_scores = []
        nnet_output = nnet_output_single.expand(split_size, -1, -1)
        # t runs from T down to 1, so column 0 of the stacked scores belongs
        # to the full length T
        for t in range(T, 0, -1):
            # one supervision row [idx, start=0, length=t] per split
            supervision = torch.stack([
                          torch.arange(split_size),
                          torch.zeros(split_size),
                          torch.ones(split_size) * t,
                          ], dim=-1).cpu().int()
            dense_fsa_vec = k2.DenseFsaVec(nnet_output[:, :t, :], supervision)
            # split_lats = k2.intersect_dense(graphs, dense_fsa_vec, output_beam=3.0)
            # must use a pruned version! very slow here especially for large interval
            split_lats = k2.intersect_dense_pruned(graphs,
                                         dense_fsa_vec,
                                         search_beam=10.0,
                                         output_beam=5.0,
                                         min_active_states=30,
                                         max_active_states=10000)
            split_frame_score = split_lats.get_tot_scores(log_semiring=True, use_double_scores=True)
            split_frame_scores.append(split_frame_score)
            print(f"intersection t = {t}", flush=True)
        split_frame_scores = torch.stack(split_frame_scores, dim=-1) # [split_size, T]
        
        # (4) combine split and then den_scores to compute scores
        num_frame_scores = []
        for interval_index in interval_indexes:
            # log-sum over the rows that belong to one candidate
            num_frame_score = torch.logsumexp(split_frame_scores[interval_index], dim=0)
            num_frame_scores.append(num_frame_score)
        num_frame_scores = torch.stack(num_frame_scores, dim=0) # [beam_size, T]

        frame_scores = num_frame_scores - den_frame_scores
        scores = torch.logsumexp(frame_scores, dim=-1)

        # (5) postprocess
        # A. <eos> should only aligned with last frame
        eos_pos = torch.where(next_tokens == self.eos)[0]
        if len(eos_pos) > 0:
            print("found eos position: ", eos_pos)
            # `.item()` only works on a one-element tensor, so this assumes
            # <eos> appears at most once in next_tokens
            scores[eos_pos] = frame_scores[eos_pos.item(), 0]

        # B. kill hypothesis in "index_to_kill"
        print("index to kill: ", index_to_kill)
        if len(index_to_kill) > 0:
            for idx in index_to_kill:
                scores[idx] = self.logzero
        
        # (6) return
        token_scores = scores - prev_score
        print("token score: ", token_scores, flush=True) 
        states = nnet_output_single, den_frame_scores, scores, next_nodes
        return token_scores, states

   
    def search_lexical_tree(self, node, next_tokens):
        if node is None:
            print("None node given!")

        intervals, next_nodes = [], []
        # some tokens are invalid (e.g., invalid word combination). however, we should
        # still compute the score for it to be compatible. We will force that score
        # to logzero in postprocess stage, but need to use the index_to_kill make a record.
        index_to_kill = [] 

        for idx, i in enumerate(next_tokens):
            # node is the previous one if _ is not proposed else root
            subword = self.char_list[i]
            print("subword: ", subword)
            # case (1): '_' or <eos> is proposed, which means end of the word
            if subword == self.bpe_space or subword == "<eos>":
                this_node = node # keep 'node' unchanged
                # Invalid and kill. Previous node cannot be root
                if this_node == self.lexroot:
                    interval = [self.word_unk_id-1, self.word_unk_id]
                    this_node = None
                    index_to_kill.append(idx)
                # score is for a word, not a word prefix -> interval for only one word 
                else:
                    interval = [this_node[2][0], this_node[2][0] + 1]
                    # next_node is root so the next token is valid even though it is not
                    # start with '_' 
                    this_node = self.lexroot

            # case (2): impossible token. kill them
            elif subword == "<blank>" or subword == "<unk>":
                this_node = None
                interval = [self.word_unk_id-1, self.word_unk_id]
                index_to_kill.append(idx)

            # case (3): ordinary tokens. All special token should never reach this branch
            else:
                # subword start with '_' means a prefix of new word -> search from root
                this_node = self.lexroot if subword.startswith(self.bpe_space) else node

                subword = subword.replace(self.bpe_space, "")
                for c in subword:
                    cid = self.alphabet_dict[c]
                    # descent to successor
                    if cid in this_node[0]:
                        this_node = this_node[0][cid]
                    # no valid successor found. kill this hypothesis
                    else:
                        this_node = None
                        break
            
                if this_node is not None and this_node[2] is not None:
                    interval = this_node[2]
                else:
                    interval = [self.word_unk_id-1, self.word_unk_id]
                    index_to_kill.append(idx)
            
            # plus one to correct the interval. see building process of lexroot
            interval = [interval[0] + 1, interval[1] + 1]  
            intervals.append(interval)
            # this_node == None always means a kill
            next_nodes.append(this_node)

        return intervals, next_nodes, index_to_kill

    def split_intervals(self, intervals, is_new_word, max_len=20000):
        """Cut ``[left, right)`` intervals into pieces of at most ``max_len`` ids.

        Large intervals can cause errors in k2, so each interval is replaced
        by consecutive pieces and the bookkeeping needed to merge the piece
        scores back together is returned alongside.

        Args:
            intervals: list of ``(left, right)`` pairs, one per candidate.
            is_new_word: per-candidate flag copied onto each of its pieces.
            max_len: maximum length of a produced piece.

        Returns:
            ``(new_intervals, interval_indexes, interval_is_new_word)``.
            ``interval_indexes[i]`` lists the positions in ``new_intervals``
            of the pieces that came from ``intervals[i]``.
        """
        new_intervals = []
        interval_indexes = []
        interval_is_new_word = []
        for idx, (left, right) in enumerate(intervals):
            interval_index = []
            new_word = is_new_word[idx]
            while left < right:
                # Record the position of the piece in the flat list. Storing
                # `idx` here would select the wrong rows as soon as any
                # interval is cut into more than one piece.
                interval_index.append(len(new_intervals))
                new_intervals.append([left, min(left + max_len, right)])
                interval_is_new_word.append(new_word)
                left += max_len
            interval_indexes.append(interval_index)

        return new_intervals, interval_indexes, interval_is_new_word

    def final_score(self, state):
        """Return the end-of-hypothesis score, which is always 0 here.

        The ``state`` argument is accepted but never read.
        """
        return 0
             

================================================
FILE: nets/scorers/mmi_prefix_score.py
================================================
import os
import k2
import torch
from espnet.nets.scorer_interface import PartialScorerInterface
from snowfall.training.mmi_graph import MmiTrainingGraphCompiler
from snowfall.lexicon import Lexicon
from snowfall.training.mmi_graph import create_bigram_phone_lm
from espnet.nets.scorers.mmi_utils import step_intersect

class MMIFrameScorer(PartialScorerInterface):
    def __init__(self, lang, device, idim, sos_id, rank, use_segment, char_list, weight_path):
        """Build the lexicon, graph compiler, output layer and bigram phone LM.

        Args:
            lang: directory holding the lexicon and ``oov.txt``. It is joined
                with ``/``, so it is presumably a ``pathlib.Path`` -- TODO
                confirm with callers.
            device: device handed to ``MmiTrainingGraphCompiler``.
            idim: input size of the linear output layer ``lo``.
            sos_id: token id stored as ``self.eos``.
            rank: selects the ``mmi_param.{rank}.pth`` checkpoint.
            use_segment: accepted but not used by this constructor.
            char_list: list mapping token ids to token strings.
            weight_path: directory that contains the checkpoint.

        Raises:
            RuntimeError: if the checkpoint cannot be loaded after all trials.
        """
        self.lang = lang
        self.device = device

        self.lexicon = Lexicon(lang)
        # Use a context manager so the file handle is closed right away.
        with open(self.lang / 'oov.txt') as oov_file:
            self.oov = self.oovid = oov_file.read().strip()
        self.graph_compiler = MmiTrainingGraphCompiler(self.lexicon, self.device, self.oov)
        self.phone_ids = self.lexicon.phone_symbols()

        self.lo = torch.nn.Linear(idim, len(self.phone_ids) + 1)
        self.lm_scores = None

        # Retry the load a bounded number of times. Stop at the first success
        # (the loop used to reload the file on every iteration) and fail
        # loudly if no trial succeeds, instead of going on with
        # `lm_scores = None`.
        max_trials = 10
        for i in range(max_trials):
            try:
                self.load_weight(rank, weight_path)
            except Exception as err:
                print(f"{i}-th trail to load MMI matrix weight but fail")
                last_error = err
            else:
                break
        else:
            raise RuntimeError(
                f"failed to load MMI weights from {weight_path} "
                f"after {max_trials} trials"
            ) from last_error

        self.P = create_bigram_phone_lm(self.phone_ids)
        self.P.set_scores_stochastic_(self.lm_scores)

        self.char_list = char_list
        self.eos = sos_id  # <sos> is identical to <eos>
        self.blank = 0  # by default 0 means CTC blank
        self.logzero = -10000

    def load_weight(self, rank, path):
        # load lo weight and lm_scores
        ckpt_path = os.path.join(path, f"mmi_param.{rank}.pth")
        ckpt_dict = torch.load(ckpt_path)
        for v in ckpt_dict.values():
            v.requires_grad=False
        self.lm_scores = ckpt_dict["lm_scores"]
        lo_dict = {"weight": ckpt_dict["lo.1.weight"],
                   "bias": ckpt_dict["lo.1.bias"]}
        self.lo.load_state_dict(lo_dict)

    def init_state(self, x):
        """Create the initial decoding state for one utterance.

        Returns:
            ``(nnet_output, den_scores, prev_score)``: the ``lo`` output with a
            leading axis of size 1, the first result of ``step_intersect`` on
            the denominator graph (also with a leading axis added), and a
            float32 zero score.
        """
        # (1) denominator graph; the text is arbitrary and only `den` is kept
        placeholder_texts = ["<UNK>"]
        _, den = self.graph_compiler.compile(placeholder_texts, self.P, replicate_den=True)

        # (2) dense FSA covering the whole utterance
        nnet_output = self.lo(x.unsqueeze(0))
        num_frames = nnet_output.size(1)
        supervision = torch.Tensor([[0, 0, num_frames]]).int()
        dense_fsa_vec = k2.DenseFsaVec(nnet_output, supervision)

        # (3) den_scores: [B, T]
        den_result = step_intersect(den, dense_fsa_vec)
        den_scores = den_result[0].unsqueeze(0)

        # (4) the running score starts at 0.0
        initial_score = torch.Tensor([0.0]).to(torch.float32)
        return nnet_output, den_scores, initial_score

    def select_state(self, states, j):
        """Return the state of hypothesis ``j`` from a batched state tuple.

        The first two entries are shared and passed through unchanged; only
        the previous scores are indexed with ``j``.
        """
        shared_output, shared_den, per_hyp_scores = states
        picked = (shared_output, shared_den, per_hyp_scores[j])
        return picked

    def score(**kargs):
        raise NotImplementedError

    def score_partial(self, y, next_tokens, state, hs_pad):
        """Score each candidate in ``next_tokens`` as a continuation of ``y``.

        Args:
            y: token-id prefix; its first element is skipped when building
                the texts.
            next_tokens: candidate token ids.
            state: 3-tuple ``(nnet_output, den_scores, prev_score)``.
            hs_pad: not used by this method.

        Returns:
            ``(tok_scores, state)`` where ``tok_scores`` is the accumulated
            score of every candidate minus ``prev_score``.
        """
        nnet_output_single, den_scores, prev_score = state
        batch_size = len(next_tokens)
        num_frames = nnet_output_single.size(1)

        # (1) num_graphs: the prefix extended by each candidate, as text
        prefix_rows = y.unsqueeze(0).repeat(batch_size, 1)
        candidates = torch.cat([prefix_rows, next_tokens.unsqueeze(1)], dim=1)
        texts = []
        for row in candidates:
            joined = " ".join([self.char_list[tid] for tid in row[1:]])
            texts.append(joined.replace("<eos>", "").strip())
        num, _ = self.graph_compiler.compile(texts, self.P, replicate_den=False)

        # (2) DenseFsaVec: one [idx, start, length] row per candidate
        supervision_columns = [
            torch.arange(batch_size),
            torch.zeros(batch_size),
            torch.ones(batch_size) * num_frames,
        ]
        supervision = torch.stack(supervision_columns, dim=1).to(torch.int32)
        replicated_output = nnet_output_single.repeat(batch_size, 1, 1)
        dense_fsa_vec = k2.DenseFsaVec(replicated_output, supervision)

        # (3) num_scores: [B, T]
        num_scores = torch.stack(step_intersect(num, dense_fsa_vec), dim=0)

        # (4) frame-level scores, then accumulation along the last axis
        tot_scores_frame = num_scores - den_scores
        tot_scores = torch.logsumexp(tot_scores_frame, dim=-1)

        # (5) post-process
        # <eos> takes the exact probability rather than the prefix probability
        eos_pos = torch.where(next_tokens == self.eos)[0]
        if eos_pos.numel() > 0:
            tot_scores[eos_pos] = tot_scores_frame[eos_pos.item(), -1]

        # CTC blank is never allowed in a hypothesis, so it is killed
        blk_pos = torch.where(next_tokens == self.blank)[0]
        if blk_pos.numel() > 0:
            tot_scores[blk_pos] = self.logzero

        # (6) finalize
        tok_scores = tot_scores - prev_score
        return tok_scores, (nnet_output_single, den_scores, tot_scores)

    def final_score(self, state):
        """Return the end-of-hypothesis score, which is always 0 here.

        The ``state`` argument is accepted but never read.
        """
        return 0     
                

================================================
FILE: nets/scorers/mmi_rescorer.py
================================================
import os
import k2
import torch
from snowfall.training.mmi_graph import MmiTrainingGraphCompiler
from snowfall.lexicon import Lexicon
from snowfall.training.mmi_graph import create_bigram_phone_lm
from espnet.asr.asr_utils import parse_hypothesis

class MMIRescorer(object):
    def __init__(self, lang, device, idim, sos_id, rank, use_segment, char_list, weight_path):
        """Build the lexicon, graph compiler, output layer and bigram phone LM.

        Args:
            lang: directory holding the lexicon and ``oov.txt``. It is joined
                with ``/``, so it is presumably a ``pathlib.Path`` -- TODO
                confirm with callers.
            device: device handed to ``MmiTrainingGraphCompiler``.
            idim: input size of the linear output layer ``lo``.
            sos_id: accepted but not used by this constructor.
            rank: selects the ``mmi_param.{rank}.pth`` checkpoint.
            use_segment: accepted but not used by this constructor.
            char_list: list mapping token ids to token strings.
            weight_path: directory that contains the checkpoint.
        """
        self.lang = lang
        self.device = device

        self.lexicon = Lexicon(lang)
        # Use a context manager so the file handle is closed right away.
        with open(self.lang / 'oov.txt') as oov_file:
            self.oov = self.oovid = oov_file.read().strip()
        self.graph_compiler = MmiTrainingGraphCompiler(self.lexicon, self.device, self.oov)
        self.phone_ids = self.lexicon.phone_symbols()

        self.lo = torch.nn.Linear(idim, len(self.phone_ids) + 1)
        self.lm_scores = None
        self.load_weight(rank, weight_path)

        self.P = create_bigram_phone_lm(self.phone_ids)
        self.P.set_scores_stochastic_(self.lm_scores)

        self.char_list = char_list

        # If the second-to-last token starts with a non-alphanumeric
        # character, that character is taken as the BPE word-boundary mark.
        # Otherwise the mark is the empty string.
        if not char_list[-2][0].isalnum():
            self.bpe_space = char_list[-2][0]
            print("use bpe. bpe space is: ", self.bpe_space)
        else:
            self.bpe_space = ""
        
    def load_weight(self, rank, path):
        # load lo weight and lm_scores
        ckpt_path = os.path.join(path, f"mmi_param.{rank}.pth")
        ckpt_dict = torch.load(ckpt_path)
        for v in ckpt_dict.values():
            v.requires_grad=False
        self.lm_scores = ckpt_dict["lm_scores"]
        lo_dict = {"weight": ckpt_dict["lo.1.weight"],
                   "bias": ckpt_dict["lo.1.bias"]}
        self.lo.load_state_dict(lo_dict)

    def score(self, x, nbest_hyps, v2=False):
        """Attach an MMI numerator total score to every hypothesis.

        Args:
            x: features for one utterance, fed to ``self.lo`` after a leading
                axis is added.
            nbest_hyps: hypotheses to rescore. With ``v2`` they are objects
                with a ``yseq`` attribute; otherwise they are passed to
                ``parse_hypothesis`` and written to like dicts.
            v2: selects which of the two hypothesis formats is used.

        Returns:
            A new list holding the same hypothesis objects, each updated in
            place with ``mmi_tot_score``.
        """
        batch_size = len(nbest_hyps)

        # (1) acoustic: one copy of the utterance per hypothesis
        x = x.unsqueeze(0)
        nnet_output = self.lo(x)
        T = x.size()[1]
        supervision = torch.Tensor([[0, 0, T]]).to(torch.int32)

        nnet_output = nnet_output.repeat(batch_size, 1, 1)
        supervision = supervision.repeat(batch_size, 1)
        dense_fsa_vec = k2.DenseFsaVec(nnet_output, supervision)

        # (2) texts
        texts = []
        for hyp in nbest_hyps:
            if v2:
                # skip the first symbol of the sequence
                tokens = [self.char_list[tid] for tid in hyp.yseq[1:]]
                if "<space>" in self.char_list:  # English
                    text = "".join(tokens).replace("<space>", " ")
                else:  # Mandarin, BPE
                    text = " ".join(tokens)
                text = text.replace("<eos>", "")
            else:
                text, _, _, _ = parse_hypothesis(hyp, self.char_list)

            # BPE space: drop existing spaces, then turn the BPE mark into
            # word breaks. With an empty mark, `replace("", " ")` puts a space
            # between every character.
            if self.bpe_space is not None:
                text = text.replace(" ", "").replace(self.bpe_space, " ").strip()

            texts.append(text)

        # (3) computation
        num_graphs, _ = self.graph_compiler.compile(texts, self.P, replicate_den=True)
        num_lats = k2.intersect_dense(num_graphs, dense_fsa_vec, output_beam=10.0)
        num_tot_scores = num_lats.get_tot_scores(log_semiring=True, use_double_scores=True)

        # (4) add MMI scores
        new_hyps = []
        for i, hyp in enumerate(nbest_hyps):
            mmi_score = num_tot_scores[i].item()
            if v2:
                if hasattr(hyp, "scores"):
                    hyp.scores["mmi_tot_score"] = mmi_score
                else:
                    setattr(hyp, 'mmi_tot_score', mmi_score)
            else:
                hyp["mmi_tot_score"] = mmi_score
            new_hyps.append(hyp)
        return new_hyps


================================================
FILE: nets/scorers/mmi_rnnt_lookahead_scorer.py
================================================
import k2
import torch
import math
from espnet.nets.scorer_interface import PartialScorerInterface
from snowfall.training.mmi_graph import MmiTrainingGraphCompiler
from snowfall.lexicon import Lexicon
from snowfall.training.mmi_graph import create_bigram_phone_lm
from espnet.lm.lm_utils import make_lexical_tree
from espnet.nets.scorers.lookahead import parse_lookahead, build_word_fsa_mat


class MMIRNNTLookaheadScorer():
    """MMI scorer for RNN-T beam search with word-prefix lookahead.

    The scorer owns a linear output layer (``lo``), a bigram phone LM
    (``P``) and a lexical tree built from the lexicon.  For every partial
    hypothesis a numerator graph is compiled from the hypothesis prefix
    (via ``parse_lookahead`` / ``build_word_fsa_mat``), intersected with the
    network output, and the denominator score of the same number of frames
    is subtracted.
    """

    def __init__(self, lang, device, idim, sos_id, rank, use_segment, char_list):
        """Build the lexicon, graph compiler, output layer and lexical tree.

        Args:
            lang: Language directory; it is combined with file names using
                the ``/`` operator, so a ``pathlib.Path`` is expected.
            device: Device handed to ``MmiTrainingGraphCompiler``.
            idim: Input dimension of the linear output layer.
            sos_id: Index stored as ``self.eos``.
            rank: Rank used to pick the ``mmi_param.{rank}.pth`` checkpoint.
            use_segment: Unused by this class.
            char_list: Token list of the decoder.
        """
        self.lang = lang
        self.device = device

        self.lexicon = Lexicon(lang)
        self.graph_compiler = MmiTrainingGraphCompiler(self.lexicon, self.device)
        self.phone_ids = self.lexicon.phone_symbols()

        # One extra output unit on top of the phone inventory.
        self.lo = torch.nn.Linear(idim, len(self.phone_ids) + 1)
        self.lm_scores = None
        self.load_weight(rank)

        self.P = create_bigram_phone_lm(self.phone_ids)
        self.P.set_scores_stochastic_(self.lm_scores)

        # Lexical tree over upper-case letters plus the apostrophe.
        alphabet = [chr(i) for i in range(65, 91)] + ["\'"]
        self.alphabet_dict = {c: i+1 for i, c in enumerate(alphabet)}
        self.word_dict = self.lexicon.words._sym2id
        # Read the OOV word id with a context manager so the handle is closed.
        with open(self.lang / 'oov.int') as oov_file:
            self.word_unk_id = int(oov_file.read().strip())
        self.lexroot = make_lexical_tree(self.word_dict, self.alphabet_dict, self.word_unk_id)
        print("end of lex building", flush=True)

        self.char_list = char_list
        # First character of the second-to-last token in char_list.
        # NOTE(review): presumably the BPE word-boundary marker -- confirm.
        self.bpe_space = char_list[-2][0]

        self.eos = sos_id  # <sos> is identical to <eos>
        self.logzero = -10000

        self.lookahead = True

    def load_weight(self, rank):
        """Load ``lm_scores`` and the ``lo`` weights from ``mmi_param.{rank}.pth``.

        Every tensor in the checkpoint gets ``requires_grad`` switched off.
        """
        ckpt_dict = torch.load(self.lang / f"mmi_param.{rank}.pth")
        for v in ckpt_dict.values():
            v.requires_grad = False
        self.lm_scores = ckpt_dict["lm_scores"]
        lo_dict = {"weight": ckpt_dict["lo.1.weight"],
                   "bias": ckpt_dict["lo.1.bias"]}
        self.lo.load_state_dict(lo_dict)

    def den_scores(self, x):
        """Compute the network output and the denominator score of each prefix.

        Args:
            x: Encoder output; a leading batch axis of size 1 is added here.

        Returns:
            Tuple ``(nnet_output, den_scores)``.  ``den_scores`` has a leading
            axis of size 1 and one entry per prefix length ``t = 1..T``.
        """
        # (1) nnet_output
        x = x.unsqueeze(0)
        nnet_output = self.lo(x)

        # (2) den_scores
        texts = ["<UNK>"]  # arbitrary text; only the denominator graph is used
        _, den = self.graph_compiler.compile(texts, self.P, replicate_den=True)

        T = x.size()[1]
        den_scores = []
        # Loop over prefix lengths in ascending order, one intersection at a
        # time, since the denominator would consume much memory otherwise.
        for t in range(1, T+1):
            supervision = torch.Tensor([[0, 0, t]]).to(torch.int32)  # [idx, start, length]
            dense_fsa_vec = k2.DenseFsaVec(nnet_output, supervision)
            den_lats = k2.intersect_dense(den, dense_fsa_vec, output_beam=10.0)
            den_tot_scores = den_lats.get_tot_scores(log_semiring=True, use_double_scores=True)
            den_scores.append(den_tot_scores)
        den_scores = torch.cat(den_scores).unsqueeze(0)  # [T] -> [B, T]

        return nnet_output, den_scores

    # this is deprecated: just reorder A at the beginning
    def batch_score_(self, A, nnet_output, den_scores, tu_sum, mmi_weight):
        """Deprecated variant that keeps ``A`` in place and sorts via indices.

        Updates ``h.score`` and ``h.mmi_tot_score`` of every hypothesis in
        ``A`` and returns ``A``.
        """
        batch = len(A)
        if batch == 0:
            return A

        # (1) supervision
        # +1 since frame start with 0; +1 since redundant blank
        # the supervision must be descending order.
        ts = [tu_sum - len(h.yseq) + 2 for h in A]
        ts = torch.Tensor(ts).long()
        supervision = torch.stack([torch.arange(batch),
                                   torch.zeros(batch),
                                   ts
                                  ], dim=1).to(torch.int32)
        indices = torch.argsort(supervision[:, 2], descending=True)
        supervision = supervision[indices]
        dense_fsa_vec = k2.DenseFsaVec(nnet_output.repeat(batch, 1, 1),
                                       supervision)
        # order[k] is the position in A of the k-th entry in sorted order.
        order = indices.tolist()

        # (2) compile numerator graphs and keep them in sorted order
        prefix_and_interval = [parse_lookahead(h.yseq, self.lexroot,
                               self.char_list, self.alphabet_dict,
                               self.word_dict, self.bpe_space)
                               for h in A]
        prefix_and_interval = [prefix_and_interval[j] for j in order]
        word_fsa_mats = [build_word_fsa_mat(*x) for x in prefix_and_interval]
        word_fsa = [k2.Fsa.from_dict({"arcs": mat}) for mat in word_fsa_mats]
        word_fsa = k2.create_fsa_vec(word_fsa)
        num_graphs = self.graph_compiler.compile_lookahead_numerators(word_fsa, self.P)

        # (3) intersection; every score below is in sorted order
        num_lats = k2.intersect_dense(num_graphs, dense_fsa_vec, output_beam=10.0)
        num_scores = num_lats.get_tot_scores(log_semiring=True, use_double_scores=True)
        num_scores = torch.where(num_scores == -math.inf, 0.0, num_scores)
        ts = ts[indices]
        tot_scores = num_scores - den_scores[0][ts-1]  # -1: num_frames -> idx_frames

        # (4) assign and post-process
        # A hypothesis with a single-token yseq gets a zero MMI score.
        # tot_scores is in sorted order, so index it by the sorted position k,
        # not by the position j in A.
        for k, j in enumerate(order):
            if len(A[j].yseq) == 1:
                tot_scores[k] = 0.0

        for k, j in enumerate(order):
            h = A[j]
            h.score += (tot_scores[k].item() - h.mmi_tot_score) * mmi_weight
            h.mmi_tot_score = tot_scores[k].item()

        return A

    # version of no lookahead in time-axis
    def __batch_score(self, A, nnet_output, den_scores, tu_sum, mmi_weight):
        """Score ``A`` without time-axis lookahead (verbose debug variant).

        ``A`` is sorted in place by ``len(h.yseq)``; scores are updated in
        place and each hypothesis is printed.
        """
        print(f"Result of tu_sum: {tu_sum}")
        batch = len(A)
        if batch == 0:
            return A

        # reorder: increasing order in u means decreasing order in t
        #          this is required by k2 supervision
        A.sort(key=lambda h: len(h.yseq))

        # (1) supervision
        # +1 since frame start with 0; +1 since redundant blank
        ts = [tu_sum - len(h.yseq) + 2 for h in A]
        ts = torch.Tensor(ts).long()

        supervision = torch.stack([torch.arange(batch),
                                   torch.zeros(batch),
                                   ts
                                  ], dim=1).to(torch.int32)
        dense_fsa_vec = k2.DenseFsaVec(nnet_output.repeat(batch, 1, 1),
                                       supervision)

        # (2) compile numerator graph
        prefix_and_interval = [parse_lookahead(h.yseq, self.lexroot,
                               self.char_list, self.alphabet_dict,
                               self.word_dict, self.bpe_space)
                               for h in A]
        word_fsa_mats = [build_word_fsa_mat(*x) for x in prefix_and_interval]
        word_fsa = [k2.Fsa.from_dict({"arcs": mat}) for mat in word_fsa_mats]
        word_fsa = k2.create_fsa_vec(word_fsa)
        num_graphs = self.graph_compiler.compile_lookahead_numerators(word_fsa, self.P)

        # (3) intersection
        num_lats = k2.intersect_dense_pruned(num_graphs,
                                             dense_fsa_vec,
                                             search_beam=20.0,
                                             output_beam=10.0,
                                             min_active_states=30,
                                             max_active_states=20000)
        num_scores = num_lats.get_tot_scores(True, True)
        num_scores = torch.where(num_scores == -math.inf, 0.0, num_scores)
        tot_scores = num_scores - den_scores[0][ts-1]  # num_frame -> idx_frame

        # (4) assign and post-process
        for j, h in enumerate(A):
            if len(h.yseq) == 1:
                tot_scores[j] = 0.0

        for j, h in enumerate(A):
            h.score += (tot_scores[j].item() - h.mmi_tot_score) * mmi_weight
            h.mmi_tot_score = tot_scores[j].item()

            text = "".join([self.char_list[x] for x in h.yseq[1:]])
            print(f"text: {text} | score: {h.score} | mmi_score: {h.mmi_tot_score}", flush=True)

        return A

    # version of lookahead in time-axis
    def batch_score(self, A, nnet_output, den_scores, tu_sum, mmi_weight,
                    lookahead_range=(0, 25)):
        """Score ``A`` with time-axis lookahead and update it in place.

        Args:
            A: List of hypotheses with ``yseq``, ``score`` and
                ``mmi_tot_score`` attributes; sorted in place by
                ``len(h.yseq)``.
            nnet_output: Output of ``self.lo``; repeated once per hypothesis.
            den_scores: Denominator scores as returned by ``den_scores``.
            tu_sum: Sum used to derive the frame count of each hypothesis as
                ``tu_sum - len(h.yseq) + 2``.
            mmi_weight: Weight applied to the change of the MMI score.
            lookahead_range: Inclusive ``(first, last)`` frame shifts that are
                tried; the best score over all shifts is kept.

        Returns:
            The same list ``A``.
        """
        print(f"Result of tu_sum: {tu_sum}")
        batch = len(A)
        if batch == 0:
            return A

        # reorder: increasing order in u means decreasing order in t
        #          this is required by k2 supervision
        A.sort(key=lambda h: len(h.yseq))

        # (1) get ts: the alignment length in t-axis
        # +1 since frame start with 0; +1 since redundant blank
        ts = [tu_sum - len(h.yseq) + 2 for h in A]
        ts = torch.Tensor(ts).long()

        # (2) compile numerator graph
        prefix_and_interval = [parse_lookahead(h.yseq, self.lexroot,
                               self.char_list, self.alphabet_dict,
                               self.word_dict, self.bpe_space)
                               for h in A]
        word_fsa_mats = [build_word_fsa_mat(*x) for x in prefix_and_interval]
        word_fsa = [k2.Fsa.from_dict({"arcs": mat}) for mat in word_fsa_mats]
        word_fsa = k2.create_fsa_vec(word_fsa)
        num_graphs = self.graph_compiler.compile_lookahead_numerators(word_fsa, self.P)

        # (3) intersection, once per frame shift
        tot_scores_collection = []
        T = nnet_output.size()[1]
        for s in range(lookahead_range[0], lookahead_range[1] + 1):
            # Shifted lengths are clamped into the valid range [1, T].
            ts_shift = torch.clamp(ts + s, min=1, max=T)
            supervision = torch.stack([torch.arange(batch),
                                       torch.zeros(batch),
                                       ts_shift
                                       ], dim=1).to(torch.int32)
            dense_fsa_vec = k2.DenseFsaVec(nnet_output.repeat(batch, 1, 1),
                                           supervision)
            num_lats = k2.intersect_dense_pruned(num_graphs,
                                                 dense_fsa_vec,
                                                 search_beam=20.0,
                                                 output_beam=10.0,
                                                 min_active_states=30,
                                                 max_active_states=20000)
            num_scores = num_lats.get_tot_scores(True, True)
            num_scores = torch.where(num_scores == -math.inf, 0.0, num_scores)
            tot_scores = num_scores - den_scores[0][ts_shift-1]  # num_frame -> idx_frame
            tot_scores_collection.append(tot_scores)
        tot_scores = torch.stack(tot_scores_collection, dim=1)  # [beam, num_shifts]

        # Only the top-1 score can be used rather than logsumexp or a top-k
        # sum, since torch.clamp repeats scores at the boundaries.
        tot_scores, _ = torch.topk(tot_scores, 1, dim=-1)

        # (4) assign and post-process
        for j, h in enumerate(A):
            if len(h.yseq) == 1:
                tot_scores[j] = 0.0

        for j, h in enumerate(A):
            h.score += (tot_scores[j].item() - h.mmi_tot_score) * mmi_weight
            h.mmi_tot_score = tot_scores[j].item()

        return A


================================================
FILE: nets/scorers/mmi_rnnt_scorer.py
================================================
import os
import k2
import torch
import math
from espnet.nets.scorer_interface import PartialScorerInterface
from snowfall.training.mmi_graph import MmiTrainingGraphCompiler
from snowfall.lexicon import Lexicon
from snowfall.training.mmi_graph import create_bigram_phone_lm


class MMIRNNTScorer():
    """MMI scorer for RNN-T beam search and N-best rescoring.

    Numerator graphs are compiled from the token sequence of each
    hypothesis; the denominator score of the matching number of frames is
    subtracted in ``batch_score``.
    """

    # Number of hypotheses rescored per intersection in ``batch_rescore``.
    _RESCORE_CHUNK = 50

    def __init__(self, lang, device, idim, sos_id, rank, use_segment, char_list, weight_path, lookahead=0):
        """Build the lexicon, graph compiler, output layer and phone LM.

        Args:
            lang: Language directory; combined with file names using ``/``,
                so a ``pathlib.Path`` is expected.
            device: Device handed to ``MmiTrainingGraphCompiler``.
            idim: Input dimension of the linear output layer.
            sos_id: Index stored as ``self.eos``.
            rank: Rank used to pick the ``mmi_param.{rank}.pth`` checkpoint.
            use_segment: Unused by this class.
            char_list: Token list used to turn token ids into text.
            weight_path: Directory that contains the checkpoint.
            lookahead: Number of extra frames tried in ``batch_score``.
        """
        self.lang = lang
        self.device = device

        self.lexicon = Lexicon(lang)
        # Read the OOV symbol with a context manager so the handle is closed.
        with open(self.lang / 'oov.txt') as oov_file:
            self.oov = self.oovid = oov_file.read().strip()
        self.graph_compiler = MmiTrainingGraphCompiler(self.lexicon, self.device, self.oov)
        self.phone_ids = self.lexicon.phone_symbols()

        # One extra output unit on top of the phone inventory.
        self.lo = torch.nn.Linear(idim, len(self.phone_ids) + 1)
        self.lm_scores = None
        self.load_weight(rank, weight_path)

        self.P = create_bigram_phone_lm(self.phone_ids)
        self.P.set_scores_stochastic_(self.lm_scores)

        self.char_list = char_list
        self.eos = sos_id  # <sos> is identical to <eos>
        self.blank = 0  # by default 0 means CTC blank
        self.logzero = -10000

        self.lookahead = lookahead

    def load_weight(self, rank, path):
        """Load ``lm_scores`` and the ``lo`` weights from ``path/mmi_param.{rank}.pth``.

        Every tensor in the checkpoint gets ``requires_grad`` switched off.
        """
        ckpt_path = os.path.join(path, f"mmi_param.{rank}.pth")
        ckpt_dict = torch.load(ckpt_path)
        for v in ckpt_dict.values():
            v.requires_grad = False
        self.lm_scores = ckpt_dict["lm_scores"]
        lo_dict = {"weight": ckpt_dict["lo.1.weight"],
                   "bias": ckpt_dict["lo.1.bias"]}
        self.lo.load_state_dict(lo_dict)

    def den_scores(self, x):
        """Compute the network output and the denominator score of each prefix.

        Args:
            x: Encoder output; a leading batch axis of size 1 is added here.

        Returns:
            Tuple ``(nnet_output, den_scores)``.  ``den_scores`` has a leading
            axis of size 1 and one entry per prefix length ``t = 1..T``.
        """
        # (1) nnet_output
        x = x.unsqueeze(0)
        nnet_output = self.lo(x)

        # (2) den_scores
        texts = ["<UNK>"]  # arbitrary text; only the denominator graph is used
        _, den = self.graph_compiler.compile(texts, self.P, replicate_den=True)

        T = x.size()[1]
        den_scores = []
        # Loop over prefix lengths in ascending order, one intersection at a
        # time, since the denominator would consume much memory otherwise.
        for t in range(1, T+1):
            supervision = torch.Tensor([[0, 0, t]]).to(torch.int32)  # [idx, start, length]
            dense_fsa_vec = k2.DenseFsaVec(nnet_output, supervision)
            den_lats = k2.intersect_dense(den, dense_fsa_vec, output_beam=10.0)
            den_tot_scores = den_lats.get_tot_scores(log_semiring=True, use_double_scores=True)
            den_scores.append(den_tot_scores)
        den_scores = torch.cat(den_scores).unsqueeze(0)  # [T] -> [B, T]

        return nnet_output, den_scores

    def batch_rescore(self, A, h):
        """Rescore all hypotheses in ``A`` in chunks and return ``A``.

        Each chunk is handed to ``_batch_rescore``, which stores the
        numerator score on the hypotheses in place.
        """
        for start in range(0, len(A), self._RESCORE_CHUNK):
            self._batch_rescore(A[start: start + self._RESCORE_CHUNK], h)

        return A

    def _batch_rescore(self, A, h):
        """Set ``mmi_tot_score`` of each hypothesis in ``A`` to its numerator score.

        Args:
            A: List of hypotheses with a ``yseq`` attribute.
            h: Encoder output fed to ``self.lo`` after adding a batch axis.
        """
        nnet_output = self.lo(h.unsqueeze(0))
        batch, T = len(A), nnet_output.size(1)
        if batch == 0:
            return A

        # Token text of every hypothesis, without the leading token.
        texts = [" ".join([self.char_list[x] for x in hyp.yseq[1:]]) for hyp in A]
        num_graphs, _ = self.graph_compiler.compile(texts, self.P, replicate_den=False)

        # Every hypothesis is scored against all T frames.
        supervision = torch.stack([torch.arange(batch),
                                   torch.zeros(batch),
                                   torch.ones(batch) * T
                                  ], dim=1).to(torch.int32)
        dense_fsa_vec = k2.DenseFsaVec(nnet_output.repeat(batch, 1, 1),
                                       supervision)

        num_lats = k2.intersect_dense(num_graphs, dense_fsa_vec, output_beam=30.0)
        num_scores = num_lats.get_tot_scores(log_semiring=True, use_double_scores=True)

        for hyp, s in zip(A, num_scores):
            hyp.mmi_tot_score = s.item()

        return A

    def batch_score(self, A, nnet_output, den_scores, tu_sum, mmi_weight):
        """Update the MMI prefix score of every hypothesis in ``A`` in place.

        Args:
            A: List of hypotheses with ``yseq``, ``score`` and
                ``mmi_tot_score`` attributes.
            nnet_output: Output of ``self.lo``; repeated once per hypothesis.
            den_scores: Denominator scores as returned by ``den_scores``.
            tu_sum: Sum used to derive the frame count of each hypothesis as
                ``tu_sum - len(h.yseq) + 2``.
            mmi_weight: Weight applied to the change of the MMI score.

        Returns:
            The same list ``A``.
        """
        batch, T = len(A), nnet_output.size(1)
        if batch == 0:
            return A

        # (1) frame counts
        # +1 since frame start with 0; +1 since redundant <sos>
        # the supervision must be descending order.
        ts = [tu_sum - len(h.yseq) + 2 for h in A]
        ts = torch.Tensor(ts).long()
        indices = torch.argsort(ts, descending=True)
        # order[k] is the position in A of the k-th entry in sorted order.
        order = indices.tolist()

        # (2) texts, in sorted order and without the starting <sos>
        texts = [" ".join([self.char_list[x] for x in A[j].yseq[1:]]) for j in order]  # need modification for BPE
        num_graphs, _ = self.graph_compiler.compile(texts, self.P, replicate_den=False)

        # (3) intersection, once per lookahead shift
        num_score_collection = []
        for i in range(self.lookahead + 1):
            supervision = torch.stack([torch.arange(batch),
                                       torch.zeros(batch),
                                       torch.clamp(ts + i, min=1, max=T)
                                       ], dim=1).to(torch.int32)[indices]
            dense_fsa_vec = k2.DenseFsaVec(nnet_output.repeat(batch, 1, 1),
                                           supervision)
            num_lats = k2.intersect_dense(num_graphs, dense_fsa_vec, output_beam=200.0)
            num_scores = num_lats.get_tot_scores(log_semiring=True, use_double_scores=True)
            num_scores = torch.where(num_scores == -math.inf, 0.0, num_scores)
            num_score_collection.append(num_scores)

        # Keep the best numerator score over all shifts.
        num_scores = torch.stack(num_score_collection, dim=1).max(1)[0]
        ts = ts[indices]
        tot_scores = num_scores - den_scores[0][ts-1]  # -1: num_frames -> idx_frames

        # (4) assign and post-process; texts and tot_scores share sorted order
        for k, text in enumerate(texts):
            if text == "":
                tot_scores[k] = 0.0

        for k, j in enumerate(order):
            h = A[j]
            h.score += (tot_scores[k].item() - h.mmi_tot_score) * mmi_weight
            h.mmi_tot_score = tot_scores[k].item()

        return A


================================================
FILE: nets/scorers/mmi_utils.py
================================================
# Author: Jinchuan Tian ; Jan 2022
# jinchuantian@stu.pku.edu.cn

# We test our code on k2 version 1.2; other versions may encounter problems due to API change.
# This file contains the MMI-related utility functions:
# 1. The (equivalent implementation of) step composition between the training / decoding graph;
# 2. The Lattice generation process with look-ahead mechanism.

from typing import List
from typing import Optional
from typing import Tuple

import torch
import k2
import _k2
import numpy as np # debug
from k2 import Fsa, DenseFsaVec 

"""
Intersection function without autograd.

(1) We write this function since the arc_map_a is not accessible in k2 API
(2) Currently we are not using the pruned version to keep all paths.
    We will try to find a balance between the speed and the precision later.
"""
def intersect_dense_forward(a_fsas: Fsa,
                           b_fsas: DenseFsaVec,
                           search_beam: float,
                           output_beam: float,
                           prune: bool,
                           min_active_states: int,
                           max_active_states: int,
                           seqframe_idx_name: Optional[str] = None,
                           frame_idx_name: Optional[str] = None):
    """Intersect ``a_fsas`` with ``b_fsas`` and also return the arc maps.

    Args:
        a_fsas: Graphs to intersect; their ``arcs`` are passed to ``_k2``.
        b_fsas: Dense FSAs; their ``dense_fsa_vec`` is passed to ``_k2``.
        search_beam: Used by the pruned intersection only.
        output_beam: Used by both intersection variants.
        prune: Choose ``_k2.intersect_dense_pruned`` when true, otherwise
            ``_k2.intersect_dense``.
        min_active_states: Used by the pruned intersection only.
        max_active_states: Used by the pruned intersection only.
        seqframe_idx_name: If given, attribute name under which
            ``arc_map_b // num_cols`` is stored on the result.
        frame_idx_name: If given, attribute name under which the per-FSA
            frame index of each arc is stored on the result.

    Returns:
        Tuple ``(out_fsa, arc_map_a, arc_map_b)``.
    """
    dense = b_fsas.dense_fsa_vec

    if prune:
        ragged_arc, arc_map_a, arc_map_b = _k2.intersect_dense_pruned(
            a_fsas=a_fsas.arcs,
            b_fsas=dense,
            search_beam=search_beam,
            output_beam=output_beam,
            min_active_states=min_active_states,
            max_active_states=max_active_states)
    else:
        ragged_arc, arc_map_a, arc_map_b = _k2.intersect_dense(
            a_fsas=a_fsas.arcs,
            b_fsas=dense,
            a_to_b_map=None,
            output_beam=output_beam)

    lattice = Fsa(ragged_arc)

    wants_frame = frame_idx_name is not None
    wants_seqframe = seqframe_idx_name is not None

    # Both optional attributes derive from the same quotient; compute it once.
    if wants_frame or wants_seqframe:
        seqframe_idx = arc_map_b // dense.scores_dim1()

    if wants_frame:
        shape = dense.shape()
        fsa_idx0 = _k2.index_select(shape.row_ids(1), seqframe_idx)
        # Subtract the first row of the owning FSA to get a per-FSA index.
        frame_idx = seqframe_idx - _k2.index_select(shape.row_splits(1), fsa_idx0)
        assert not hasattr(lattice, frame_idx_name)
        setattr(lattice, frame_idx_name, frame_idx)

    if wants_seqframe:
        assert not hasattr(lattice, seqframe_idx_name)
        setattr(lattice, seqframe_idx_name, seqframe_idx)

    return lattice, arc_map_a, arc_map_b


# For each state, Add the score on the ending arc if the end state is reachable 
# from this state. Then return the frame-level scores for each Fsa.
def step_trace(out_fsas, a_fsas, arc_map_a, durations):
    """Recover one score sequence per FSA from a single intersection result.

    Args:
        out_fsas: Intersection result; must carry a ``frame_idx`` attribute
            and have as many FSAs as ``a_fsas``.
        a_fsas: Graphs that were intersected.
        arc_map_a: For every arc of ``out_fsas``, the index of the arc in
            ``a_fsas`` it came from.
        durations: Per-FSA lengths, used only to size the placeholder
            sequence of an empty FSA.

    Returns:
        List with one 1-D tensor per FSA.  Empty FSAs get a tensor filled
        with ``-1e10``.
    """
    assert out_fsas.shape[0] == a_fsas.shape[0]
    num_fsa = a_fsas.shape[0]

    # K2 FsaVec Meta-info: num_state; 0; 
    # state_accumulated_counts (row_splits1); 
    # arc_accumulated_counts (row_splits12);
    # NOTE(review): the slicing below relies on this flat layout of
    # as_dict()["arcs"] (2 * num_fsa + 4 header entries, then 4 values per
    # arc) -- confirm against the k2 version in use.
    
    # 1.1 Find all a_fsas arcs and meta-info
    a_fsa_dict = a_fsas.as_dict()
    a_fsa_meta = a_fsa_dict["arcs"][: 2 * num_fsa + 4].long()
    a_fsa_arcs = a_fsa_dict["arcs"][2 * num_fsa + 4:].view(-1, 4) 

    # 1.2 Assign global state-ids
    #     The first two columns of each FSA's arcs are shifted in place by
    #     that FSA's state offset.
    for i in range(num_fsa):
        a_fsa_arcs[a_fsa_meta[i+num_fsa+3]: a_fsa_meta[i+num_fsa+4]][:, :2] += a_fsa_meta[i + 2]

    # 1.3 Find all ending states and their scores. -1 means arcs entering ending states.
    a_fsa_ending_mask = a_fsa_arcs[:, 2] == -1
    a_ending_states = torch.masked_select(a_fsa_arcs[:, 0], a_fsa_ending_mask)
    a_ending_scores = torch.masked_select(a_fsas.scores, a_fsa_ending_mask)

    # 2.1 Find all out_fsas arcs and sort by entering states 
    out_fsa_dict = out_fsas.as_dict()
    out_fsa_meta = out_fsa_dict["arcs"][:2 * num_fsa + 4].long()
    out_fsa_arcs = out_fsa_dict["arcs"][2 * num_fsa + 4:].view(-1, 4)
    out_incoming_ragged = out_fsas._get_incoming_arcs()

    # 2.2 For each state, find an arc entering it
    #     No entering arcs for start states but a fake one is assigned to it.
    #     We need the corresponding arcs in a_fsas so arc_map_a is selected
    transform_index = out_incoming_ragged.values().long()
    select_index = out_incoming_ragged.row_splits(2).long()[:-1]
    arc_map_a_uniq = arc_map_a[transform_index][select_index]
    frame_idx = out_fsas.frame_idx[transform_index][select_index]

    # 2.3 Find all corresponding arcs in a_fsas and their entering states
    #     Starting states of each Fsa is set to 0 to disable the fake arcs
    #     They are not needed to be accurate as long as 0 is not in `a_ending_states`
    a_fsa_arcs_uniq = a_fsa_arcs[arc_map_a_uniq.long()]
    a_states_uniq = a_fsa_arcs_uniq[:, 1]
    # We use this to avoid out-of-range error when last several FSAs are empty
    start_indices = torch.where(out_fsa_meta[2: 2 + num_fsa] == out_fsa_meta[2 + num_fsa],
                                0, out_fsa_meta[2: 2 + num_fsa])
    a_states_uniq[start_indices] = 0

    # 3.1 Find the forward scores
    #     Add ending state scores to the raw state_scores 
    #     if the final state is reachable. Else set to -inf
    #     (-1e10 is used as the stand-in for -inf)
    raw_state_scores = out_fsas._get_forward_scores(True, True)
    state_scores = torch.ones_like(raw_state_scores) * float('-1e10')
    for state, score in zip(a_ending_states, a_ending_scores):
        state_scores = torch.where(a_states_uniq==state, 
                                   raw_state_scores + score, 
                                   state_scores)
    
    # 3.2 Allocate scores on each frames and each Fsa
    #     Score on starting state is also accumulated on frame 1
    #     But it is always ok since this score is always -inf
    #     TODO: maybe we need assume T in dense_fsa_vec supervision is identical 
    frame_ids, counts = torch.unique_consecutive(frame_idx, return_counts=True)

    score_sequences, start = [], 0
    score_sequence = []
    for i, (fid, fc) in enumerate(zip(frame_ids.tolist(), counts.tolist())):
        # All states that share this frame index are combined by logsumexp.
        frame_score = torch.logsumexp(state_scores[start: start+fc], dim=0)        
        score_sequence.append(frame_score)
        start += fc

        # A drop in the frame index marks the start of the next FSA; the
        # finished sequence is stored without its last entry.
        if i == len(counts) - 1 or fid > frame_ids[i+1]:
            score_sequences.append(torch.stack(score_sequence, dim=0)[:-1])
            score_sequence = []

    # 3.3 For empty Fsas, assign -inf score sequences.
    ans, index = [], 0
    is_empties = out_fsa_meta[2: 2 + num_fsa] == out_fsa_meta[3: 3 + num_fsa]
    for i, is_empty in enumerate(is_empties):
        if is_empty:
            ans.append(torch.ones(durations[i]) * float('-1e10'))
        else:
            ans.append(score_sequences[index])
            index += 1
    assert len(ans) == num_fsa
    return ans

"""
Step intersection implementation

Input:
fsa, FsaVec, training graph like CTC, MMI. Need duplication.
dense_fsa_vec, DenseFsaVec, created from nnet_output and the corresponding length in t-axis.
prune: bool, If true, use a pruned version of intersection.
search_beam: float, parameter used in pruned intersection only.
output_beam: float, parameter used in intersection.
min_active_states: int, parameter used in pruned intersection only.
max_active_states: int, parameter used in pruned intersection only.

Output: 
score_sequences: List of 1-D tensors. The number of tensors is equal to the number of fsas in `fsa`.
                 Each tensor has length T, where T is the number of effective frames in nnet_output.
                 The t-th element represents the `tot_score` of the intersected Fsa between the input `fsa`
                 and the first t frames.

This implementation is much faster than using a loop for T times. As the intersection is only used once
for each Fsa. The sequence is recovered from the output Fsa and the arc_map_a generated by T frames.

The intersection would fail if the hypothesis is too long to reach the end-state. If all intersection
fails, the step_trace would result in an error so we check the out_fsa in advance. (Currently we only
use this in MMI prefix score computation so all hypotheses and dense_fsa_vec have the same lengths. 
This could also be used in MMI alignment score computation with time-axis lookahead and this function
might be further revised later)

Currently we find the `search_beam` and the `output_beam` should be very large to achieve accurate
decoding performance.
"""
def step_intersect(fsa,
                   dense_fsa_vec,
                   prune=False,
                   search_beam=3000,
                   output_beam=2000,
                   min_active_states=30,
                   max_active_states=5000000):
    """Intersect once and return one score sequence per FSA in ``fsa``.

    The intersection is delegated to ``intersect_dense_forward`` and the
    sequences are recovered by ``step_trace``.  When the result has no arcs
    at all, every sequence is filled with ``-1e10`` and sized by the matching
    entry of ``dense_fsa_vec.duration``.
    """
    lattice, arc_map_a, _ = intersect_dense_forward(
        a_fsas=fsa,
        b_fsas=dense_fsa_vec,
        search_beam=search_beam,
        output_beam=output_beam,
        prune=prune,
        min_active_states=min_active_states,
        max_active_states=max_active_states,
        seqframe_idx_name="seqframe_idx",
        frame_idx_name="frame_idx",
    )

    # Normal case: at least one intersection produced arcs.
    if out_has_arcs := (lattice.num_arcs != 0):
        return step_trace(lattice, fsa, arc_map_a, dense_fsa_vec.duration)

    # Every intersection failed, so step_trace cannot be used.
    return [torch.ones(length.item()) * float("-1e10")
            for length in dense_fsa_vec.duration]

def step_intersect_test():
    """Manual smoke test: run ``step_intersect`` on random input and print it.

    Reads the lexicon and ``oov.txt`` from ``data/lang_phone``, so it has to
    be started from a directory that contains that data.
    """
    from pathlib import Path
    lang = Path("data/lang_phone")
    device = torch.device("cpu")

    # import for test only
    from espnet.nets.scorer_interface import PartialScorerInterface
    from snowfall.training.mmi_graph import MmiTrainingGraphCompiler
    from snowfall.lexicon import Lexicon
    from snowfall.training.mmi_graph import create_bigram_phone_lm

    lexicon = Lexicon(lang)
    # Read the OOV symbol with a context manager so the handle is closed.
    with open(lang / 'oov.txt') as oov_file:
        oov = oov_file.read().strip()
    graph_compiler = MmiTrainingGraphCompiler(lexicon, device, oov)
    phone_ids = lexicon.phone_symbols()

    # Fixed seed so the random LM scores are reproducible.
    torch.manual_seed(888)
    P = create_bigram_phone_lm(phone_ids)
    P.scores = torch.randn_like(P.scores)

    texts = ["你", "好"]
    # texts = ['本 市 警 察 近', '本 市 警 察 今', '本 市 警 察 信', '本 市 警 察 昨', '本 市 警 察 二', '本 市 警 察 记', '本 市 警 察 数', '本 市 警 察 甚', '本 市 警 察 继', '本 市 警 察 至', '本 市 警 察 进', '本 市 警 察 几', '本 市 警 察 五', '本 市 警 察 仅', '本 市 警 察 上']
    num, den = graph_compiler.compile(texts, P, replicate_den=True)
    graph = num

    T = 3
    beam_size = len(texts)
    odim = len(phone_ids) + 1
    nnet_output = torch.rand([beam_size, T, odim])

    supervision = torch.stack([
                          torch.arange(beam_size),
                          torch.zeros(beam_size),
                          torch.ones(beam_size) * T,
                          ], dim=-1).cpu().int()
    dense_fsa_vec = k2.DenseFsaVec(nnet_output, supervision)
    score_sequences = step_intersect(graph,
                                    dense_fsa_vec,
                                    prune=False,
                                    search_beam=30,
                                    output_beam=20,
                                    min_active_states=30,
                                    max_active_states=100000)
    print(score_sequences)
    return

    # NOTE(review): everything below is unreachable because of the return
    # above.  It is the reference computation (one intersection per prefix
    # length) kept for comparison; remove the return to run it.
    print("####  old method ###")
    buf = []
    for t in range(1, T+1):
        supervision = torch.stack([
                          torch.arange(beam_size),
                          torch.zeros(beam_size),
                          torch.ones(beam_size) * t,
                          ], dim=-1).cpu().int()
        dense_fsa_vec = k2.DenseFsaVec(nnet_output, supervision)
        num_lats = k2.intersect_dense(graph, dense_fsa_vec, output_beam=30.0)
        num_tot_scores = num_lats.get_tot_scores(log_semiring=True, use_double_scores=True)
        buf.append(num_tot_scores)

    buf = torch.stack(buf, dim=1)
    score_sequences = torch.stack(score_sequences, dim=0)
    print(buf - score_sequences)
 
# Run the manual smoke test when this module is executed as a script.
if __name__ == "__main__":
    step_intersect_test() 


================================================
FILE: nets/scorers/new_mmi_frame_scorer.py
================================================
import k2
import torch
from espnet.nets.scorer_interface import PartialScorerInterface
from snowfall.training.mmi_graph import MmiTrainingGraphCompiler
from snowfall.lexicon import Lexicon
from snowfall.training.mmi_graph import create_bigram_phone_lm
from espnet.nets.scorers.trace_frame import trace_frame

class MMIFrameScorer(PartialScorerInterface):
    """Partial scorer built from per-frame numerator-minus-denominator scores.

    ``init_state`` projects the encoder output with a linear layer and traces
    a denominator graph over it once.  ``score_partial`` compiles one
    numerator graph per candidate hypothesis, traces it with ``trace_frame``
    and subtracts the denominator trace.

    NOTE(review): several statements below (the 50-frame truncation, the
    ``print`` calls and the per-hypothesis ``draw`` to an SVG file) look like
    debugging leftovers; they are kept because they are observable behavior.
    """

    def __init__(self, lang, device, idim, sos_id, rank, use_segment, char_list):
        """Build the lexicon, graph compiler, output projection and phone LM.

        Args:
            lang: language directory handed to ``Lexicon``.  ``load_weight``
                joins it with ``/``, so it is presumably a ``pathlib.Path``
                (TODO confirm against the caller).
            device: device handed to ``MmiTrainingGraphCompiler``.
            idim: input dimension of the linear output projection.
            sos_id: id of ``<sos>``; it is also used as the ``<eos>`` id.
            rank: selects the checkpoint file ``mmi_param.{rank}.pth``.
            use_segment: accepted for interface compatibility; not used here.
            char_list: id -> token table used to turn hypotheses into text.
        """
        self.lang = lang
        self.device = device

        self.lexicon = Lexicon(lang)
        self.graph_compiler = MmiTrainingGraphCompiler(self.lexicon, self.device)
        self.phone_ids = self.lexicon.phone_symbols()

        # One output unit per phone plus one extra unit.
        self.lo = torch.nn.Linear(idim, len(self.phone_ids) + 1)
        self.lm_scores = None
        self.load_weight(rank)

        self.P = create_bigram_phone_lm(self.phone_ids)
        self.P.set_scores_stochastic_(self.lm_scores)

        self.char_list = char_list
        self.eos = sos_id  # <sos> is identical to <eos>
        self.blank = 0  # by default 0 means CTC blank
        self.logzero = -10000

    def load_weight(self, rank):
        """Load the projection weights and phone-LM scores for ``rank``.

        Reads ``mmi_param.{rank}.pth`` from ``self.lang``, disables gradients
        on every stored tensor, keeps ``lm_scores`` and copies
        ``lo.1.weight`` / ``lo.1.bias`` into ``self.lo``.
        """
        ckpt_dict = torch.load(self.lang / f"mmi_param.{rank}.pth")
        for v in ckpt_dict.values():
            v.requires_grad = False
        self.lm_scores = ckpt_dict["lm_scores"]
        lo_dict = {"weight": ckpt_dict["lo.1.weight"],
                   "bias": ckpt_dict["lo.1.bias"]}
        self.lo.load_state_dict(lo_dict)

    def init_state(self, x):
        """Compute the per-utterance state shared by all hypotheses.

        Args:
            x: encoder output of one utterance (presumably ``(T, idim)`` --
                TODO confirm).

        Returns:
            Tuple ``(nnet_output, den_scores, prev_score)`` where
            ``prev_score`` is a one-element zero tensor.
        """
        torch.set_printoptions(sci_mode=False)

        # NOTE(review): only the first 50 frames are used; this looks like a
        # debugging limit -- confirm before relying on the scores.
        x = x[:50]

        # (1) nnet_output
        x = x.unsqueeze(0)
        nnet_output = self.lo(x)

        # (2) den_scores: any text works, only the denominator graph is used
        texts = ["<UNK>"]
        _, den = self.graph_compiler.compile(texts, self.P, replicate_den=True)

        den_scores = trace_frame(nnet_output, den, "<UNK>")
        # [T] -> [B, T]
        den_scores = den_scores.unsqueeze(0)
        print("den_score: ", den_scores)

        # (3) the previous accumulated score starts at zero
        prev_score = torch.Tensor([0]).to(torch.float32)
        return nnet_output, den_scores, prev_score

    def select_state(self, states, j):
        """Return the state of the ``j``-th candidate.

        The network output and denominator scores are shared; only the
        accumulated score is indexed.
        """
        nnet_output_single, den_scores, prev_scores = states
        return nnet_output_single, den_scores, prev_scores[j]

    def score(self, *args, **kwargs):
        """Full-vocabulary scoring is not supported by this partial scorer.

        The method previously lacked ``self``, so calling it on an instance
        raised ``TypeError`` instead of the intended ``NotImplementedError``.
        """
        raise NotImplementedError

    def score_partial(self, y, next_tokens, state, hs_pad):
        """Score each candidate next token appended to the prefix ``y``.

        Args:
            y: 1-D tensor of prefix token ids; the first id is dropped when
                the text is built.
            next_tokens: 1-D tensor of candidate token ids.
            state: tuple produced by ``init_state`` / ``select_state``.
            hs_pad: required by the interface; not used.

        Returns:
            ``(tok_scores, state)`` where ``tok_scores`` is the accumulated
            score of each extended hypothesis minus the previous accumulated
            score, and ``state`` carries the new accumulated scores.
        """
        # Warning: All frame-level scores are adopted in reverse order in time-axis
        # since k2 requires a descending input length

        # (1) unpack state
        nnet_output_single, den_scores, prev_score = state
        batch_size = len(next_tokens)

        # (2) build one text per candidate: prefix + candidate token
        y = y.unsqueeze(0).repeat(batch_size, 1)
        next_tokens = next_tokens.unsqueeze(1)
        ys = torch.cat([y, next_tokens], dim=1)
        # This is for Chinese. Need more tuning on English
        if "<space>" not in self.char_list:
            texts = [" ".join([self.char_list[tid] for tid in text[1:]]) for text in ys]
            texts = [text.replace("<eos>", "").strip() for text in texts]
        else:
            texts = ["".join([self.char_list[tid] for tid in text[1:]]) for text in ys]
            texts = [text.replace("<eos>", "").replace("<space>", "<space> ").strip() for text in texts]

        # (3) numerator scores, one graph per candidate text
        num_scores = []
        for text in texts:
            print(text, flush=True)
            num_graph, _ = self.graph_compiler.compile([text], self.P, replicate_den=False)
            # NOTE(review): writes one SVG file per hypothesis into the
            # working directory -- presumably a debugging aid.
            num_graph[0].draw(f"{text}_graph.svg")
            num_score = trace_frame(nnet_output_single, num_graph, text)
            num_scores.append(num_score)
        num_scores = torch.stack(num_scores, dim=0)

        # (4) minus the denominator scores
        # we should keep the frame-level result for <eos>
        tot_scores_frame = num_scores - den_scores
        tot_scores = torch.logsumexp(tot_scores_frame, dim=-1)
        print(tot_scores_frame)

        # (5) treat <eos> and ctc <blk> specially
        # <eos> means the exact probability rather than the prefix probability,
        # so the score at frame index 0 is used instead of the sum over frames.
        eos_pos = torch.where(next_tokens == self.eos)[0]
        if len(eos_pos) > 0:
            tot_scores[eos_pos] = tot_scores_frame[eos_pos.item(), 0]

        # CTC blank is never allowed in hypothesis. kill it
        blk_pos = torch.where(next_tokens == self.blank)[0]
        if len(blk_pos) > 0:
            tot_scores[blk_pos] = self.logzero

        # (6) finalize: token-level score is the first-order difference
        tok_scores = tot_scores - prev_score
        state = nnet_output_single, den_scores, tot_scores
        return tok_scores, state

    def final_score(self, state):
        """Return 0: no extra score is added when a hypothesis ends."""
        return 0
                

================================================
FILE: nets/scorers/ngram.py
================================================
"""Ngram lm implement."""

from abc import ABC

import kenlm
import torch

from espnet.nets.scorer_interface import BatchScorerInterface
from espnet.nets.scorer_interface import PartialScorerInterface


class Ngrambase(ABC):
    """Common kenlm-backed n-gram scoring logic shared by the scorer classes."""

    def __init__(self, ngram_model, token_list):
        """Load the kenlm model and prepare the token table.

        Args:
            ngram_model: path of the n-gram model handed to kenlm
            token_list: token list from dict or model.json; ``<eos>`` is
                stored as ``</s>`` in the internal table

        """
        self.chardict = ["</s>" if tok == "<eos>" else tok for tok in token_list]
        self.charlen = len(self.chardict)
        self.lm = kenlm.LanguageModel(ngram_model)
        # Scratch state that receives the (discarded) per-candidate output.
        self.tmpkenlmstate = kenlm.State()

    def init_state(self, x):
        """Return a fresh kenlm state initialised with the null context."""
        initial = kenlm.State()
        self.lm.NullContextWrite(initial)
        return initial

    def score_partial_(self, y, next_token, state, x):
        """Score candidate tokens after advancing the LM by the last token.

        Args:
            y: token ids of the hypothesis so far
            next_token: candidate token ids to score
            state: kenlm state before the last token of ``y``
            x: encoded feature; only its dtype is used

        Returns:
            tuple[torch.Tensor, Any]: scores with the same shape as
                ``next_token`` and the kenlm state after the last token.

        """
        # With a single-element history the start symbol is fed instead.
        if y.shape[0] > 1:
            last_symbol = self.chardict[y[-1]]
        else:
            last_symbol = "<s>"

        out_state = kenlm.State()
        self.lm.BaseScore(state, last_symbol, out_state)

        scores = torch.empty_like(next_token, dtype=x.dtype, device=y.device)
        for pos, token_id in enumerate(next_token):
            candidate = self.chardict[token_id]
            scores[pos] = self.lm.BaseScore(out_state, candidate, self.tmpkenlmstate)
        return scores, out_state


class NgramFullScorer(Ngrambase, BatchScorerInterface):
    """N-gram scorer that rates every token of the vocabulary."""

    def score(self, y, state, x):
        """Score all ``self.charlen`` tokens as continuations of ``y``.

        Args:
            y: token ids of the hypothesis so far
            state: previous state
            x: encoded feature

        Returns:
            tuple[torch.Tensor, Any]: whatever ``score_partial_`` returns when
                called with every token id from 0 to ``self.charlen - 1``.

        """
        every_token_id = torch.tensor(range(self.charlen))
        return self.score_partial_(y, every_token_id, state, x)


class NgramPartScorer(Ngrambase, PartialScorerInterface):
    """Partial scorer for ngram: scores only the candidate tokens it is given."""

    def score_partial(self, y, next_token, state, x):
        """Score the given candidate tokens; delegates to ``score_partial_``.

        Args:
            y: token ids of the hypothesis so far
            next_token: candidate token ids that need to be scored
            state: previous state
            x: encoded feature

        Returns:
            tuple[torch.Tensor, Any]: the result of ``score_partial_`` for
                these arguments, passed through unchanged.

        """
        return self.score_partial_(y, next_token, state, x)

    def select_state(self, state, i):
        """Return ``state`` unchanged; the index ``i`` is ignored."""
        return state


================================================
FILE: nets/scorers/sorted_matcher.py
================================================
import math
import kaldi.fstext as fst

class SortedMatcher(object):
    """
    Looks up arcs and scores on a label-sorted FST.

    Args:
        vector_fst (object): loaded fst
        max_num_arcs (int): maximum number of arcs starting from one fst state
        max_id (int): maximum i/o label id of LM fst
        backoff_id (int): backoff id of LM fst
        disambig_ids (List of int): disambig ids of LM fst
    """
    def __init__(self, vector_fst, max_num_arcs,
                 max_id, backoff_id, disambig_ids):
        # The caller must make sure the fst is i/o label sorted.
        self.fst = vector_fst
        self.max_num_arcs = max_num_arcs
        self.max_id = max_id
        self.backoff_id = backoff_id
        self.disambig_ids = disambig_ids

    def search(self, state_id, ilabel):
        """
        Binary search for an arc with input label `ilabel` leaving `state_id`.

        The search window always spans `max_num_arcs` slots; positions past
        the real arcs of the state are treated as carrying label `max_id`.
        Returns (True, arc_iterator positioned on the arc) or (False, None).
        """
        arc_iter = self.fst.arcs(state_id)
        remaining = self.max_num_arcs
        upper = remaining - 1
        while remaining > 1:
            step = remaining // 2
            probe = upper - step
            arc_iter.seek(probe)
            probe_label = self.max_id if arc_iter.done() else arc_iter.value().ilabel
            if probe_label >= ilabel:
                upper = probe
            remaining -= step

        arc_iter.seek(upper)
        if not arc_iter.done() and arc_iter.value().ilabel == ilabel:
            return True, arc_iter
        return False, None

    def get_scores_wodisambig(self, state_id, ilabel, init_score=0.0):
        """
        Collect every path (following backoff arcs) that accepts `ilabel`.

        Tyriontian, Questions:
        (1) There are possibly many such paths, and they will not lead to
            much difference (hori's book). To achieve better speed, try to
            only find the path that has the smallest order of backoff.

        Returns two parallel lists: accumulated scores and next states.
        """
        scores, states = [], []
        backoff_cost = init_score
        state = state_id
        while True:
            matched, arc_iter = self.search(state, ilabel)
            if matched:
                arc = arc_iter.value()
                scores.append(backoff_cost + arc.weight.value)
                states.append(arc.nextstate)

            can_backoff, backoff_iter = self.search(state, self.backoff_id)
            if not can_backoff:
                return scores, states
            backoff_arc = backoff_iter.value()
            backoff_cost += backoff_arc.weight.value
            state = backoff_arc.nextstate

    def get_scores(self, state_id, ilabel):
        """
        Given the state_id and ilabel, find all paths that differ in
        disambig symbol and backoff order.
        This could be much time consuming.
        O( log(max_num_arcs) * num_disambig * (lm_rank - 1) )
        """
        # Start points: the state itself plus every state reached through a
        # disambig arc, each with the cost paid to get there.
        starts = [(0.0, state_id)]
        for label in self.disambig_ids:
            found, arc_iter = self.search(state_id, label)
            if found:
                arc = arc_iter.value()
                starts.append((arc.weight.value, arc.nextstate))

        scores, states = [], []
        for start_score, start_state in starts:
            sub_scores, sub_states = self.get_scores_wodisambig(
                start_state, ilabel, start_score)
            scores.extend(sub_scores)
            states.extend(sub_states)
        return scores, states

    def _backoff_to_final(self, state_id, init_score=0.0):
        """
        Follow backoff arcs from `state_id` until a state with a finite final
        weight is found. Returns (score, state), or (inf, None) when the
        backoff chain ends first.
        """
        score = init_score
        state = state_id
        while True:
            final_weight = self.fst.final(state).value
            if not math.isinf(final_weight):
                return score + final_weight, state
            found, arc_iter = self.search(state, self.backoff_id)
            if not found:
                return float('inf'), None
            arc = arc_iter.value()
            score += arc.weight.value
            state = arc.nextstate

    def final_score(self, state_id):
        """
        Similarly, find all paths with different disambig symbols. Then
        add the final score with all possible backoff.
        """
        starts = [(0.0, state_id)]
        for label in self.disambig_ids:
            found, arc_iter = self.search(state_id, label)
            if found:
                arc = arc_iter.value()
                starts.append((arc.weight.value, arc.nextstate))

        final_scores, final_states = [], []
        for start_score, start_state in starts:
            total, end_state = self._backoff_to_final(start_state, start_score)
            final_scores.append(total)
            final_states.append(end_state)
        return final_scores, final_states


================================================
FILE: nets/scorers/test.py
================================================
        
        # NOTE(review): this file is an indented fragment with no enclosing
        # def and no imports; `self`, `texts`, `x`, `nnet_output`, `torch` and
        # `k2` must come from elsewhere, so it cannot be run as-is.
        #
        # For every prefix length t = T, T-1, ..., 1 the denominator graph is
        # intersected with the first t frames of `nnet_output` and the total
        # log-semiring score of the resulting lattice is collected.
        num, den = self.graph_compiler.compile(texts, self.P, replicate_den=True)
        T = x.size()[1]
        scores = []
        for t in range(T, 0, -1):
            supervision = torch.Tensor([[0, 0, t]]).to(torch.int32) # [idx, start, length]
            dense_fsa_vec = k2.DenseFsaVec(nnet_output, supervision)
            lats = k2.intersect_dense(den, dense_fsa_vec, output_beam=10.0)
            frame_score = lats.get_tot_scores(log_semiring=True, use_double_scores=True)
            scores.append(frame_score)
        # Scores are in descending-length order; presumably shape (1, T) -- TODO confirm.
        tot_scores = torch.cat(scores).unsqueeze(0)


================================================
FILE: nets/scorers/tlg_scorer.py
================================================
# Author: tyriontian
# tyriontian@tencent.com

import os
import sys
import torch
import kaldi.fstext as fst

from pathlib import Path
from espnet.nets.scorer_interface import PartialScorerInterface
from espnet.nets.scorers.sorted_matcher import SortedMatcher

class TlgPartialScorer(PartialScorerInterface):
    """
    This is a wrapper for Espnet: the word-level N-gram LM on-the-fly decoding method.
    (proposed by cweng, cweng@tencent.com)

    A decoding state is a dict ``{fst_state: accumulated_score}``; the scores
    stored in it are the FST arc weights summed along the path.
    """

    def __init__(self, lang, nonblk_reward=0.0):
        """Load ``LG.fst`` and its side files from ``lang``.

        Args:
            lang: directory containing ``disambig_ids`` (comma separated),
                ``backoff_id``, ``max_id``, ``max_num_arcs`` and ``LG.fst``.
            nonblk_reward: non-negative value added to every accumulated
                score whenever a new token is consumed.
        """
        self.lang = Path(lang)

        # build the SortedMatcher: core of this algorithm
        # the `lang` directory should have these files
        disambig_ids = self._first_line(self.lang / "disambig_ids").replace("\n", "").split(",")
        disambig_ids = [int(i) for i in disambig_ids]
        backoff_id = int(self._first_line(self.lang / "backoff_id").strip())
        max_id = int(self._first_line(self.lang / "max_id").strip())
        max_num_arcs = int(self._first_line(self.lang / "max_num_arcs").strip())
        fst_lm = fst.StdVectorFst.read(str(self.lang / "LG.fst"))

        self.scorer = SortedMatcher(fst_lm, max_num_arcs, max_id, backoff_id, disambig_ids)

        # reward whenever a new non-blank token generated
        assert nonblk_reward >= 0.0
        self.nonblk_reward = nonblk_reward

        print("Build TLG scorer successfully!", flush=True)

    @staticmethod
    def _first_line(path):
        """Return the first line of ``path``, closing the file afterwards."""
        with open(path) as f:
            return f.readline()

    def init_state(self, x=None):
        """
        0 is the starting state
        """
        return {0: 0.0}

    def score_partial(self, y, next_tokens, state, x):
        """
        args:
        y: interface required. Not used here
        next_tokens: list of token-ids to search
        state: dict, {state1: score1, state2: score2, ...}
               state is shared for all token-ids
        x: interface required, Not used here

        return:
        scores: list of scores for each token-id
        next_states: list of dicts, each of which is in format like `state`

        Hint: next_tokens contains no <blank>
        """
        scores = []
        next_states = []
        for tok_id in next_tokens:
            # <eps> is not in our vocab but in the compilation of LG.fst,
            # hence the +1 shift of every token id.
            score, next_state = self.score_one(tok_id + 1, state)
            scores.append(score)
            next_states.append(next_state)

        return scores, next_states

    def score_one(self, tok_id, state_dict):
        """Advance every FST state in ``state_dict`` by ``tok_id``.

        Returns ``(score, next_dict)``: ``next_dict`` maps each reachable
        state to the smallest accumulated score leading to it, and ``score``
        is the negated increase of the minimum accumulated score.

        NOTE(review): ``nonblk_reward`` is added to the accumulated values
        before negation, so a larger reward lowers the returned score --
        confirm that this sign is intended.
        """
        # In case the searched results are all empty: a fallback entry that
        # sends the hypothesis back to state 0 with a very large score.
        scores = [1e10]
        next_states = [0]
        for state, prev_score in state_dict.items():
            found_scores, found_states = self.scorer.get_scores(state, tok_id)
            scores += [x + prev_score for x in found_scores]
            next_states += found_states

        # the scores used for comparison have considered previous scores.
        next_dict = {}
        for state, score in zip(next_states, scores):
            if state in next_dict:
                next_dict[state] = min(next_dict[state], score)
            else:
                next_dict[state] = score

        next_dict = {k: v + self.nonblk_reward for k, v in next_dict.items()}
        # Minimum value in the state dict is exactly the accumulated score of the
        # whole history. The first-order difference is the token-level score.
        score = min(next_dict.values()) - min(state_dict.values())
        return - score, next_dict

    def final_score(self, states):
        """
        args:
        states: list of dict {state1: score1, state2: score2, ...}

        return:
        scores: final scores for each hypothesis
        state are not returned and considered any longer
        """
        scores = []
        for state in states:
            score = self.final_score_one(state)
            scores.append(score)
        return scores

    def final_score_one(self, state_dict):
        """Return the final score of one hypothesis.

        NOTE(review): unlike ``score_one`` the result is not negated, and the
        accumulated score stored for each state is not added to its final
        weight before taking the minimum -- confirm both are intended.
        """
        scores = []
        for state in state_dict:
            searched = self.scorer.final_score(state)
            scores += searched[0]
        score = min(scores) - min(state_dict.values())
        return score
        
# Manual smoke test: score a few sentences character by character.
if __name__ == "__main__":
    with open("data/char.txt") as f:
        token_list = [s.split()[0] for s in f]
    token_list.insert(0, "<blk>")
    # TlgPartialScorer.__init__ only accepts (lang, nonblk_reward); passing
    # token_list= as before raised TypeError.
    scorer = TlgPartialScorer("data/tlg_ngram")

    texts = ["天空很蓝", "天坑很蓝", "我爱你", "我艾你", "宇智波鼬", "宇子波鼬", "翁超","余剑威", "田晋川"]
    for text in texts:
        text_ids = [token_list.index(t) for t in text]
        state = scorer.init_state(None)
        for text_id in text_ids:
            score, next_states = scorer.score_partial(None, [text_id], state, None)
            state = next_states[0]
            print(f"token: {token_list[text_id]} | score: {score} | state: {state}")
        score = scorer.final_score([state])
        print(f"Final score: {score}")
   

================================================
FILE: nets/scorers/trace_frame.py
================================================
import torch
import k2
import numpy as np
import _k2

"""
def _trace_frame(lats): 
    arcs = lats[0].as_dict()['arcs']
    lats[0].draw("den_3frame.svg")

    frame2state = []
    prev_buf, cur_buf = [0], []

    for arc in arcs:
        f, t, _, _ = arc
        f, t = int(f), int(t)

        if f in prev_buf:
            if not t in cur_buf:
                cur_buf.append(t)

        else:
            frame2state.append(prev_buf)
            prev_buf = cur_buf
            cur_buf = [t]
    
    frame2state.append(prev_buf) # last frame
    frame2state.append([t]) # final state
    return frame2state
"""

def trace_lattice(lats):
    """Group the destination states of a lattice's arcs by frame index.

    Reads the second column of ``lats.arcs.values()`` as the destination
    state of each arc and ``lats.frame`` as the frame index of each arc.

    Returns:
        A list with ``max(lats.frame) + 1`` entries; entry ``f`` lists, in
        order of first appearance and without duplicates, the destination
        states of all arcs whose frame index is ``f``.
    """
    arc_endpoints = lats.arcs.values()[:, :2]
    last_frame = max(lats.frame).item()
    frame2state = [[] for _ in range(last_frame + 1)]

    for arc_idx, endpoints in enumerate(arc_endpoints.tolist()):
        dst = endpoints[1]
        bucket = frame2state[lats.frame[arc_idx]]
        if dst not in bucket:
            bucket.append(dst)

    return frame2state

def compute_frame_level_scores(graph, nnet_output):
    """Compute one score per frame from a single dense intersection.

    ``graph`` is intersected with all frames of ``nnet_output`` once; the
    score for frame ``t`` is the log-sum-exp of the forward scores of the
    states recorded for frame index ``t - 1`` by ``trace_lattice``.  The last
    frame uses the forward score of the final state instead.

    Returns:
        1-D tensor of scores ordered from frame ``T`` down to frame 1.
    """
    num_frames = nnet_output.size()[1]

    # One supervision segment covering the whole utterance.
    supervision = torch.Tensor([[0, 0, num_frames]]).to(torch.int32)
    dense_fsa_vec = k2.DenseFsaVec(nnet_output, supervision)
    lats = k2.intersect_dense(
        graph,
        dense_fsa_vec,
        output_beam=10.0,
        seqframe_idx_name='seqframe',
        frame_idx_name='frame',
    )

    forward_scores = lats.get_forward_scores(True, True)
    frame2states = trace_lattice(lats)
    assert len(frame2states) == num_frames + 1  # extra final state

    def score_at(t):
        # The whole-utterance score is read from the last state.
        if t == num_frames:
            return forward_scores[-1]
        reached = frame2states[t - 1]
        return torch.logsumexp(forward_scores[reached], dim=-1)

    per_frame = [score_at(t) for t in range(num_frames, 0, -1)]
    return torch.stack(per_frame, dim=0)

def trace_lattice_batch(lats, batch):
    """Batched variant of the frame -> destination-state grouping.

    Arcs are assigned to sequences by scanning them in order: every new run
    of arcs whose source state is 0 starts the next sequence.

    Returns:
        2-D list ``[batch][max(lats.frame) + 1]`` of destination-state lists
        (first-appearance order, no duplicates).
    """
    last_frame = max(lats.frame).item()
    frame2state = [[[] for _ in range(last_frame + 1)] for __ in range(batch)]
    arc_endpoints = lats.arcs.values()[:, :2].tolist()

    seq_idx = -1
    in_start_run = False
    for arc_idx, (src, dst) in enumerate(arc_endpoints):
        if src != 0:
            in_start_run = False
        elif not in_start_run:
            # First arc leaving state 0 after other arcs: a new sequence.
            seq_idx += 1
            in_start_run = True

        bucket = frame2state[seq_idx][lats.frame[arc_idx]]
        if dst not in bucket:
            bucket.append(dst)

    return frame2state

def split_forward_scores(scores):
    """Cut ``scores`` into consecutive slices, one per start state.

    A new slice begins at every index ``i >= 1`` where ``scores[i] == 0``;
    the first slice always begins at index 0.  Returns the list of slices.
    """
    boundaries = [0]
    boundaries.extend(i for i in range(1, len(scores)) if scores[i] == 0)
    boundaries.append(len(scores))
    return [scores[lo:hi] for lo, hi in zip(boundaries, boundaries[1:])]

def compute_frame_level_scores_batch(graphs, nnet_output):
    """Batched frame-level scores from a single dense intersection.

    We would assume that nnet_output in different batch is the same; this is
    only used for batch decoding.

    Returns:
        Tensor built from a ``[batch][T]`` nested list of scores, each row
        ordered from frame ``T`` down to frame 1.
    """
    batch, T, _ = nnet_output.size()

    # One supervision segment per sequence, all covering every frame.
    seq_indices = torch.arange(batch)
    start_frames = torch.zeros(batch)
    durations = torch.ones(batch) * T
    supervision = torch.stack([seq_indices, start_frames, durations], dim=1).to(torch.int32)

    dense_fsa_vec = k2.DenseFsaVec(nnet_output, supervision)
    lats = k2.intersect_dense(
        graphs,
        dense_fsa_vec,
        output_beam=10.0,
        seqframe_idx_name='seqframe',
        frame_idx_name='frame',
    )

    forward_scores = split_forward_scores(lats.get_forward_scores(True, True))
    frame2state = trace_lattice_batch(lats, batch)

    # Descending frame order within each sequence.
    tot_scores = [
        [
            torch.logsumexp(forward_scores[b][frame2state[b][f - 1]], dim=-1)
            for f in range(T, 0, -1)
        ]
        for b in range(batch)
    ]
    return torch.Tensor(tot_scores)

# this only for debug
def compute_frame_level_scores_loop(graph, nnet_output):
    """Reference implementation: one dense intersection per prefix length.

    Returns:
        1-D tensor of scores ordered from frame ``T`` down to frame 1.
    """
    T = nnet_output.size()[1]

    tot_scores = []
    for t in range(T, 0, -1):
        covers_all = t == T
        # Feed one more frame if it is not the last frame, so the states in
        # the first t frames are identical to those in the whole lattice.
        length = t if covers_all else t + 1
        supervision = torch.Tensor([[0, 0, length]]).to(torch.int32)
        dense_fsa_vec = k2.DenseFsaVec(nnet_output, supervision)
        lats = k2.intersect_dense(
            graph,
            dense_fsa_vec,
            output_beam=10.0,
            seqframe_idx_name='seqframe',
            frame_idx_name='frame',
        )

        forward_scores = lats.get_forward_scores(True, True)
        frame2states = trace_lattice(lats)

        if covers_all:
            tot_scores.append(forward_scores[-1])
            continue

        assert len(frame2states) == t + 2
        reached = frame2states[t - 1]
        tot_scores.append(torch.logsumexp(forward_scores[reached], dim=-1))

    return torch.stack(tot_scores, dim=0)

# Small manual check: three CTC graphs scored against the same 3-frame input.
if __name__ == '__main__':
    batch_size = 3
    frame_rows = [
        [0.1, 0.22, 0.28, 0.4],
        [0.1, 0.13, 0.07, 0.7],
        [0.6, 0.2, 0.05, 0.15],
    ]
    nnet_output = torch.tensor(frame_rows, dtype=torch.float32)
    # Every sequence of the batch sees identical frames.
    nnet_output = nnet_output.unsqueeze(0).repeat(batch_size, 1, 1)
    nnet_output = torch.nn.functional.log_softmax(nnet_output, -1)

    graph = k2.ctc_graph([[1], [1,2], [1,2,3]])

    scores = compute_frame_level_scores_batch(graph, nnet_output)

    # The single-sequence variants can be compared by hand:
    #scores = compute_frame_level_scores(graph, nnet_output)
    #print("Scores computed by new version: ", scores)

    #scores = compute_frame_level_scores_loop(graph, nnet_output)
    #print("Scores computed by original version: ", scores)


================================================
FILE: nets/scorers/word_ngram.py
================================================
# Author: tyriontian
# tyriontian@tencent.com
# also see: https://github.com/k2-fsa/k2/issues/874

import os
import sys
import re
import torch
import k2
import copy
import fcntl
import time

from pathlib import Path
from espnet.nets.scorer_interface import PartialScorerInterface


def is_disambig_symbol(symbol: str, pattern: re.Pattern = re.compile(r'^#\d+$')) -> bool:
    """Return True when ``symbol`` matches ``pattern`` (by default ``#`` followed by digits)."""
    return bool(pattern.match(symbol))

def find_first_disambig_symbol(symbols: k2.SymbolTable) -> int:
    """Return the smallest id among the disambiguation symbols of ``symbols``.

    Raises ``ValueError`` (from ``min``) when the table has no such symbol.
    """
    disambig_ids = [
        sym_id
        for sym, sym_id in symbols._sym2id.items()
        if is_disambig_symbol(sym)
    ]
    return min(disambig_ids)

"""
Several reminders:
1. make sure '#0' is the last symbol in words.txt so '<s>' and '</s>' will not be 
   changed to epsilon in `G.labels[G.labels >= first_word_disambig_id] = 0`
2. There might be a bug that `k2.create_fsa_vec` will make G.fst unsorted. make sure
   you sort the G.fst as a FsaVec rather than a Fsa.
3. To be compatible with the back-off paths in G.fst, self-loops are added to the 
   raw lattices.
4. This module works on both CPU and GPU. it depends on the type of `device` you feed.
   We find that CPU is even faster than GPU when the test scale is small
5. Each time we load G from disk we need to call `k2.arc_sort` to make sure
   its `properties` are correct. However, this leads to a spike in memory
   use and slows down the initialization. Directly changing G.properties is
   dangerous and does not solve this problem.
6. G.pt is read exclusively, which is ensured by a system file lock: only one
   process reads it at a time. This is to avoid a memory spike.
"""
class WordNgram():
    """Word-level n-gram LM scorer backed by a k2 ``G`` acceptor.

    Character/token sequences are expanded into lattices of all possible word
    segmentations (``text2lat``) and intersected with ``G`` to obtain scores.
    """

    def __init__(self, lang, device):
        """Load the symbol table, the OOV id and ``G`` from ``lang``.

        Args:
            lang: directory containing ``words.txt``, ``oov.int`` and either
                ``G.pt`` (cache) or ``G.fst.txt``.
            device: ``torch.device`` on which ``G`` and all lattices live.
        """
        self.lang = Path(lang)
        self.device = device
        self.is_cuda = device.type == "cuda"

        self.symbol_table = k2.SymbolTable.from_file(self.lang / 'words.txt')
        with open(self.lang / 'oov.int') as f:
            self.oovid = int(f.read().strip())

        # The former retry loop sat after an unconditional `return` and was
        # never executed; G is loaded exactly once.
        self.load_G()

    def load_G(self):
        """Load ``G`` from the ``G.pt`` cache, or build it from ``G.fst.txt``.

        In both cases ``self.G`` ends up as an arc-sorted FsaVec with one FSA
        on ``self.device``.  Previously the freshly built graph was stored as
        a plain Fsa, unlike the cached path.
        """
        cache_path = self.lang / 'G.pt'

        if os.path.exists(cache_path):
            # Hold an exclusive lock while reading so that only one process
            # loads the file at a time; always release it, even on error.
            with open(cache_path, 'r') as lock_file:
                fcntl.flock(lock_file, fcntl.LOCK_EX)  # lock
                try:
                    G_dict = torch.load(cache_path)
                finally:
                    fcntl.flock(lock_file, fcntl.LOCK_UN)  # unlock
            G = k2.Fsa.from_dict(G_dict).to(self.device)
            G = k2.create_fsa_vec([G])
            G = k2.arc_sort(G)
            print("Successfully load the cached G.pt", flush=True)

        else:
            with open(self.lang / 'G.fst.txt') as f:
                G = k2.Fsa.from_openfst(f.read(), acceptor=False)
            del G.aux_labels

            # Turn every label at or above the first disambig id into epsilon.
            first_word_disambig_id = find_first_disambig_symbol(self.symbol_table)
            G.labels[G.labels >= first_word_disambig_id] = 0
            G = k2.arc_sort(G)

            torch.save(G.as_dict(), cache_path)
            print("No cached G.pt found. Build a new G from G.fst.txt")

            # Same post-processing as the cached branch.
            G = k2.create_fsa_vec([G.to(self.device)])
            G = k2.arc_sort(G)

        self.G = G.to(self.device)

    def text2lat(self, text, gram_len=6):
        """
        Enumerate all possible paths that split text into a word sequence.
        Output will be an epsilon-free, acyclic lattice on self.device

        text: list of token
        gram_len: the maximum number of tokens joined into one word

        Raises ValueError when gram_len is smaller than 1.
        """
        text = [tok.replace(" ", "") for tok in text]  # exclude blank

        if gram_len < 1:
            raise ValueError("invalid gram_len. it should be at least 1")

        arcs = []
        for s in range(len(text) + 1):
            for r in range(1, gram_len + 1):
                if len(text) - s - r < 0:
                    continue

                w = "".join(text[s: s + r])
                if w in self.symbol_table:
                    wid = self.symbol_table[w]
                elif r == 1:
                    # A single unknown token falls back to the OOV word so
                    # that every position stays connected.
                    wid = self.oovid
                else:
                    continue

                arc = [s, s+r, wid, 0.0]
                arcs.append(arc)
        # Arc into the final state (label -1) and the final-state line.
        arcs.append([len(text), len(text) + 1, -1, 0.0])
        arcs.append([len(text) + 1])

        arcs = sorted(arcs, key=lambda arc: arc[0])
        arcs = [[str(i) for i in arc] for arc in arcs]
        arcs = [' '.join(arc) for arc in arcs]
        arcs = '\n'.join(arcs)

        lat = k2.Fsa.from_str(arcs, True)
        lat = k2.arc_sort(lat)
        return lat.to(self.device)

    def score_lattice(self, lats, log_semiring=True):
        """
        Apply the scores on G.fst to the lattice.
        Both the input and the output are k2.FsaVec

        `log_semiring` is accepted but not used by this method.
        """
        assert lats.device == self.device

        # Self-loops make the lattices compatible with the back-off
        # (epsilon) arcs in G.
        lats = k2.add_epsilon_self_loops(lats)
        if self.is_cuda:
            # Every lattice is matched against G[0].
            b_to_a_map = torch.zeros(lats.shape[0],
                         device=self.device, dtype=torch.int32)
            scored_lattice = k2.intersect_device(
                             self.G, lats, b_to_a_map,
                             sorted_match_a=True
                             )
        else:
            scored_lattice = k2.intersect(self.G, lats,
                             treat_epsilons_specially=False)

        scored_lattice = k2.remove_epsilon_self_loops(scored_lattice)
        return k2.top_sort(k2.connect(scored_lattice))

    def score_texts(self, texts, log_semiring=True):
        """Return the total score of each text (a list of token lists)."""
        lats = [self.text2lat(t) for t in texts]
        lats = k2.create_fsa_vec(lats)

        scored_lattice = self.score_lattice(lats, log_semiring)

        scores = scored_lattice._get_tot_scores(log_semiring, True)

        return scores

    def draw(self, fsavec, prefix=None):
        """Draw every FSA of `fsavec` to `{prefix}_{i}.svg`."""
        for i in range(fsavec.shape[0]):
            fsa = fsavec[i]
            fsa.draw(f"{prefix}_{i}.svg")


# Warpper to Espnet scorer interface
class WordNgramPartialScorer(PartialScorerInterface):
    """Partial scorer that delegates to a `WordNgram` instance.

    The constructor keeps a normalised private copy of the token list:
    ignored tokens become empty strings, ``<unk>`` becomes a single
    placeholder character and purely alphabetic tokens are case-folded.
    """

    def __init__(
        self,
        lang,
        device,
        token_list,
        ignore_tokens=("<eos>", "<blank>"),
        log_semiring=True,
        lower_char=True,
    ):
        """Build the underlying `WordNgram` and the normalised token table.

        Args:
            lang: forwarded to `WordNgram`.
            device: forwarded to `WordNgram`.
            token_list: token strings indexed by token id; it is deep-copied,
                the caller's list is never modified.
            ignore_tokens: tokens whose first occurrence is replaced by "".
            log_semiring: forwarded to `WordNgram.score_texts` when scoring.
            lower_char: lower-case alphabetic tokens when True, upper-case
                them otherwise.
        """
        self.WordNgram = WordNgram(lang, device)
        self.log_semiring = log_semiring

        self.token_list = copy.deepcopy(token_list)
        for tok in ignore_tokens:
            if tok in self.token_list:
                self.token_list[self.token_list.index(tok)] = ""

        # oov should be a single character
        if "<unk>" in self.token_list:
            self.token_list[self.token_list.index("<unk>")] = "兲"

        # ignore upper / lower case of english characters.
        # this should be consistent with your words.txt
        # enumerate() gives the slot being visited directly; the previous
        # list.index() lookup was quadratic and resolved to the first equal
        # token rather than the current one.
        for idx, tok in enumerate(self.token_list):
            if tok.isalpha():
                self.token_list[idx] = tok.lower() if lower_char else tok.upper()

    def init_state(self, x):
        """Return the initial state, the score 0.0."""
        return 0.0

    def score_partial(self, y, next_tokens, state, x):
        """Score every candidate extension of the prefix `y`.

        Args:
            y: token ids of the current prefix.
            next_tokens: candidate token ids to append to the prefix.
            state: value subtracted from the new scores (the state returned
                by the previous call, or by `init_state`).
            x: unused.

        Returns:
            tuple: ``(scores - state, scores)`` where `scores` is what
            `WordNgram.score_texts` returns for the extended prefixes; the
            second element is the new state.
        """
        prefix = [self.token_list[i] for i in y]
        next_tokens = [self.token_list[i] for i in next_tokens]
        texts = [prefix + [tok] for tok in next_tokens]

        scores = self.WordNgram.score_texts(
            texts, log_semiring=self.log_semiring
        )

        return scores - state, scores
    
if __name__ == "__main__":
    # Smoke test: score one hard-coded sentence with the LM found under the
    # directory given as the first command-line argument.
    lang = sys.argv[1]
    device = torch.device("cpu") # cpu or cuda:0
    word_ngram = WordNgram(lang, device)

    texts = [[
        "这", "件", "事",
        u"\u2581" + "INTEREST", "ING",
        u"\u2581" + "ALLOW", "ED",
    ]]
    print(texts)
    scores = word_ngram.score_texts(texts, log_semiring=True)
    print(scores)


================================================
FILE: nets/st_interface.py
================================================
"""ST Interface module."""

from espnet.nets.asr_interface import ASRInterface
from espnet.utils.dynamic_import import dynamic_import


class STInterface(ASRInterface):
    """ST Interface for ESPnet model implementation.

    NOTE: This class is inherited from ASRInterface to enable joint translation
    and recognition when performing multi-task learning with the ASR task.

    Both methods defined here are stubs that raise NotImplementedError.

    """

    def translate(self, x, trans_args, char_list=None, rnnlm=None, ensemble_models=[]):
        """Translate x for evaluation.

        :param ndarray x: input acoustic feature (B, T, D) or (T, D)
        :param namespace trans_args: argument namespace containing options
        :param list char_list: list of characters
        :param torch.nn.Module rnnlm: language model module
        :param list ensemble_models: presumably extra models for ensemble
            decoding; this stub never reads it
        :return: N-best decoding results
        :rtype: list
        """
        # NOTE(review): `ensemble_models=[]` is a mutable default; harmless in
        # this stub because it raises before touching any argument.
        raise NotImplementedError("translate method is not implemented")

    def translate_batch(self, x, trans_args, char_list=None, rnnlm=None):
        """Beam search implementation for batch.

        :param torch.Tensor x: encoder hidden state sequences (B, Tmax, Henc)
        :param namespace trans_args: argument namespace containing options
        :param list char_list: list of characters
        :param torch.nn.Module rnnlm: language model module
        :return: N-best decoding results
        :rtype: list
        """
        raise NotImplementedError("Batch decoding is not supported yet.")


# Alias table used by `dynamic_import_st`:
# backend name -> {model alias -> "module:class" import path}.
# Only the pytorch backend is registered; the chainer entries are disabled.
predefined_st = {
    "pytorch": {
        "rnn": "espnet.nets.pytorch_backend.e2e_st:E2E",
        "transformer": "espnet.nets.pytorch_backend.e2e_st_transformer:E2E",
    },
    # "chainer": {
    #     "rnn": "espnet.nets.chainer_backend.e2e_st:E2E",
    #     "transformer": "espnet.nets.chainer_backend.e2e_st_transformer:E2E",
    # }
}


def dynamic_import_st(module, backend):
    """Resolve an ST model class from an alias or import path.

    Args:
        module (str): module_name:class_name or alias in `predefined_st`
        backend (str): NN backend whose alias table is consulted; an unknown
            backend yields an empty alias table.

    Returns:
        type: ST class

    Raises:
        AssertionError: if the resolved class does not subclass STInterface.

    """
    backend_aliases = predefined_st.get(backend, dict())
    st_class = dynamic_import(module, backend_aliases)
    implements_st = issubclass(st_class, STInterface)
    assert implements_st, f"{module} does not implement STInterface"
    return st_class


================================================
FILE: nets/transducer_decoder_interface.py
================================================
"""Transducer decoder interface module."""

from dataclasses import dataclass
from typing import Any
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple
from typing import Union

import torch


@dataclass
class Hypothesis:
    """Default hypothesis definition for beam search.

    `score`, `yseq` and `dec_state` are required; every other field defaults
    to ``None``.
    """

    score: float
    yseq: List[int]
    dec_state: Union[
        Tuple[torch.Tensor, Optional[torch.Tensor]], List[torch.Tensor], torch.Tensor
    ]
    # NOTE(review): the fields below default to None although their
    # annotations are not Optional[...].
    lm_state: Union[Dict[str, Any], List[Any]] = None
    # The next three fields are not referenced anywhere in this module;
    # presumably they carry MMI / word n-gram / TLG decoding scores and state
    # -- confirm against the beam-search code.
    mmi_tot_score: float = None
    word_ngram_score: float = None
    tlg_state: dict = None


@dataclass
class NSCHypothesis(Hypothesis):
    """Extended hypothesis definition for NSC beam search.

    Adds two optional fields on top of `Hypothesis`; both default to ``None``.
    """

    # NOTE(review): defaults are None although the annotations are not
    # Optional[...].
    y: List[torch.Tensor] = None
    lm_scores: torch.Tensor = None


class TransducerDecoderInterface:
    """Decoder interface for transducer models.

    Every method here is a stub raising NotImplementedError; concrete
    decoders are expected to override all of them.
    """

    def init_state(
        self,
        batch_size: int,
        device: torch.device,
    ) -> Union[
        Tuple[torch.Tensor, Optional[torch.Tensor]], List[Optional[torch.Tensor]]
    ]:
        """Initialize decoder states.

        Args:
            batch_size: Batch size for initial state
            device: Device for initial state

        Returns:
            state: Initialized state

        """
        raise NotImplementedError("init_state method is not implemented")

    def score(
        self,
        hyp: Union[Hypothesis, NSCHypothesis],
        cache: Dict[str, Any],
    ) -> Union[
        Tuple[torch.Tensor, Optional[torch.Tensor]],
        torch.Tensor,
        List[Optional[torch.Tensor]],
    ]:
        """Forward one hypothesis.

        Args:
            hyp: Hypothesis.
            cache: Pairs of (y, state) for each token sequence (key)

        Returns:
            y: Decoder outputs
            new_state: New decoder state
            lm_tokens: Token id for LM

        """
        # NOTE(review): the docstring lists three returned values while the
        # annotation is a Union of single types -- confirm with implementers.
        raise NotImplementedError("score method is not implemented")

    def batch_score(
        self,
        hyps: Union[List[Hypothesis], List[NSCHypothesis]],
        batch_states: Union[
            Tuple[torch.Tensor, Optional[torch.Tensor]], List[Optional[torch.Tensor]]
        ],
        cache: Dict[str, Any],
    ) -> Union[
        Tuple[torch.Tensor, Optional[torch.Tensor]],
        torch.Tensor,
        List[Optional[torch.Tensor]],
    ]:
        """Forward batch of hypotheses.

        Args:
            hyps: Batch of hypotheses
            batch_states: Batch of decoder states
            cache: pairs of (y, state) for each token sequence (key)

        Returns:
            batch_y: Decoder outputs
            batch_states: Batch of decoder states
            lm_tokens: Batch of token ids for LM

        """
        # NOTE(review): same mismatch as in `score` between the documented
        # three return values and the Union annotation.
        raise NotImplementedError("batch_score method is not implemented")

    def select_state(
        self,
        batch_states: Union[
            Tuple[torch.Tensor, Optional[torch.Tensor]], List[Optional[torch.Tensor]]
        ],
        idx: int,
    ) -> Union[
        Tuple[torch.Tensor, Optional[torch.Tensor]], List[Optional[torch.Tensor]]
    ]:
        """Get decoder state from batch for given id.

        Args:
            batch_states: Batch of decoder states
            idx: Index to extract state from batch

        Returns:
            state_idx: Decoder state for given id

        """
        raise NotImplementedError("select_state method is not implemented")

    def create_batch_states(
        self,
        batch_states: Union[
            Tuple[torch.Tensor, Optional[torch.Tensor]], List[Optional[torch.Tensor]]
        ],
        l_states: List[
            Union[
                Tuple[torch.Tensor, Optional[torch.Tensor]],
                List[Optional[torch.Tensor]],
            ]
        ],
        l_tokens: List[List[int]],
    ) -> Union[
        Tuple[torch.Tensor, Optional[torch.Tensor]], List[Optional[torch.Tensor]]
    ]:
        """Create batch of decoder states.

        Args:
            batch_states: Batch of decoder states
            l_states: List of decoder states
            l_tokens: List of token sequences for input batch

        Returns:
            batch_states: Batch of decoder states

        """
        raise NotImplementedError("create_batch_states method is not implemented")


================================================
FILE: nets/tts_interface.py
================================================
# -*- coding: utf-8 -*-

# Copyright 2018 Nagoya University (Tomoki Hayashi)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""TTS Interface related modules."""

from espnet.asr.asr_utils import torch_load


# chainer is an optional dependency: when it cannot be imported the name
# `Reporter` is bound to None instead of a class.
try:
    import chainer
except ImportError:
    Reporter = None
else:

    class Reporter(chainer.Chain):
        """Reporter module."""

        def report(self, dicts):
            """Report values from a given dict."""
            # Each element of `dicts` is forwarded to chainer's reporter with
            # this object as the observer.
            for d in dicts:
                chainer.reporter.report(d, self)


class TTSInterface(object):
    """TTS Interface for ESPnet model implementation."""

    @staticmethod
    def add_arguments(parser):
        """Add model specific arguments to parser."""
        return parser

    def __init__(self):
        """Initialize TTS module.

        The module-level name ``Reporter`` is ``None`` when chainer could not
        be imported; in that case ``self.reporter`` is set to ``None`` instead
        of failing with "'NoneType' object is not callable".
        """
        self.reporter = Reporter() if Reporter is not None else None

    def forward(self, *args, **kwargs):
        """Calculate TTS forward propagation.

        Returns:
            Tensor: Loss value.

        """
        raise NotImplementedError("forward method is not implemented")

    def inference(self, *args, **kwargs):
        """Generate the sequence of features given the sequences of characters.

        Returns:
            Tensor: The sequence of generated features (L, odim).
            Tensor: The sequence of stop probabilities (L,).
            Tensor: The sequence of attention weights (L, T).

        """
        raise NotImplementedError("inference method is not implemented")

    def calculate_all_attentions(self, *args, **kwargs):
        """Calculate TTS attention weights.

        Returns:
            Tensor: Batch of attention weights (B, Lmax, Tmax).

        """
        raise NotImplementedError("calculate_all_attentions method is not implemented")

    def load_pretrained_model(self, model_path):
        """Load pretrained model parameters from `model_path` into this model."""
        torch_load(model_path, self)

    @property
    def attention_plot_class(self):
        """Return the class used to plot attention weights."""
        # Imported lazily, at property access time.
        from espnet.asr.asr_utils import PlotAttentionReport

        return PlotAttentionReport

    @property
    def base_plot_keys(self):
        """Return base key names to plot during training.

        The keys should match what `chainer.reporter` reports.
        If you add the key `loss`,
        the reporter will report `main/loss` and `validation/main/loss` values.
        Also `loss.png` will be created as a figure visualizing `main/loss`
        and `validation/main/loss` values.

        Returns:
            list[str]:  Base keys to plot during training.

        """
        return ["loss"]


================================================
FILE: optimizer/__init__.py
================================================
"""Initialize sub package."""


================================================
FILE: optimizer/chainer.py
================================================
"""Chainer optimizer builders."""
import argparse

import chainer
from chainer.optimizer_hooks import WeightDecay

from espnet.optimizer.factory import OptimizerFactoryInterface
from espnet.optimizer.parser import adadelta
from espnet.optimizer.parser import adam
from espnet.optimizer.parser import sgd


class AdamFactory(OptimizerFactoryInterface):
    """Factory building chainer Adam optimizers."""

    @staticmethod
    def add_arguments(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        """Attach the shared Adam options to `parser` and return it."""
        return adam(parser)

    @staticmethod
    def from_args(target, args: argparse.Namespace):
        """Create an Adam optimizer bound to `target`.

        Args:
            target: for pytorch `model.parameters()`,
                for chainer `model`
            args (argparse.Namespace): parsed command-line args; `lr`,
                `beta1`, `beta2` and `weight_decay` are read.

        Returns:
            The configured optimizer with a WeightDecay hook attached.

        """
        hyper_params = dict(alpha=args.lr, beta1=args.beta1, beta2=args.beta2)
        optimizer = chainer.optimizers.Adam(**hyper_params)
        optimizer.setup(target)
        optimizer.add_hook(WeightDecay(args.weight_decay))
        return optimizer


class SGDFactory(OptimizerFactoryInterface):
    """Factory building chainer SGD optimizers."""

    @staticmethod
    def add_arguments(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        """Attach the shared SGD options to `parser` and return it."""
        return sgd(parser)

    @staticmethod
    def from_args(target, args: argparse.Namespace):
        """Create an SGD optimizer bound to `target`.

        Args:
            target: for pytorch `model.parameters()`,
                for chainer `model`
            args (argparse.Namespace): parsed command-line args; `lr` and
                `weight_decay` are read.

        Returns:
            The configured optimizer with a WeightDecay hook attached.

        """
        optimizer = chainer.optimizers.SGD(lr=args.lr)
        optimizer.setup(target)
        optimizer.add_hook(WeightDecay(args.weight_decay))
        return optimizer


class AdadeltaFactory(OptimizerFactoryInterface):
    """Factory building chainer AdaDelta optimizers."""

    @staticmethod
    def add_arguments(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        """Attach the shared Adadelta options to `parser` and return it."""
        return adadelta(parser)

    @staticmethod
    def from_args(target, args: argparse.Namespace):
        """Create an AdaDelta optimizer bound to `target`.

        Args:
            target: for pytorch `model.parameters()`,
                for chainer `model`
            args (argparse.Namespace): parsed command-line args; `rho`,
                `eps` and `weight_decay` are read.

        Returns:
            The configured optimizer with a WeightDecay hook attached.

        """
        hyper_params = dict(rho=args.rho, eps=args.eps)
        optimizer = chainer.optimizers.AdaDelta(**hyper_params)
        optimizer.setup(target)
        optimizer.add_hook(WeightDecay(args.weight_decay))
        return optimizer


# Alias -> chainer optimizer factory class defined in this module.
OPTIMIZER_FACTORY_DICT = {
    "adam": AdamFactory,
    "sgd": SGDFactory,
    "adadelta": AdadeltaFactory,
}


================================================
FILE: optimizer/factory.py
================================================
"""Import optimizer class dynamically."""
import argparse

from espnet.utils.dynamic_import import dynamic_import
from espnet.utils.fill_missing_args import fill_missing_args


class OptimizerFactoryInterface:
    """Base class for optimizer factories.

    Subclasses override `from_args` (and usually `add_arguments`); `build`
    ties the two together for callers that have plain keyword arguments.
    """

    @staticmethod
    def from_args(target, args: argparse.Namespace):
        """Create an optimizer from an argparse Namespace.

        Args:
            target: for pytorch `model.parameters()`,
                for chainer `model`
            args (argparse.Namespace): parsed command-line args

        Raises:
            NotImplementedError: always, in this base class.

        """
        raise NotImplementedError()

    @staticmethod
    def add_arguments(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        """Return `parser` unchanged; the base class registers no options."""
        return parser

    @classmethod
    def build(cls, target, **kwargs):
        """Create an optimizer from python-level keyword arguments.

        The keywords are wrapped in a Namespace, passed through
        `fill_missing_args` together with `cls.add_arguments`, and the result
        is handed to `cls.from_args`.

        Args:
            target: for pytorch `model.parameters()`,
                for chainer `model`

        Returns:
            new Optimizer

        """
        namespace = fill_missing_args(
            argparse.Namespace(**kwargs), cls.add_arguments
        )
        return cls.from_args(target, namespace)


def dynamic_import_optimizer(name: str, backend: str) -> OptimizerFactoryInterface:
    """Import optimizer class dynamically.

    Args:
        name (str): alias name or dynamic import syntax `module:class`
        backend (str): backend name e.g., chainer or pytorch

    Returns:
        OptimizerFactoryInterface or FunctionalOptimizerAdaptor

    Raises:
        NotImplementedError: if `backend` is neither "pytorch" nor "chainer".
        KeyError: if `name` is not a known alias and contains no ":".

    """
    if backend == "pytorch":
        from espnet.optimizer.pytorch import OPTIMIZER_FACTORY_DICT
    elif backend == "chainer":
        from espnet.optimizer.chainer import OPTIMIZER_FACTORY_DICT
    else:
        raise NotImplementedError(f"unsupported backend: {backend}")

    # Aliases win. A name without ":" cannot be an import path, so it still
    # fails with the same KeyError as before.
    if name in OPTIMIZER_FACTORY_DICT or ":" not in name:
        return OPTIMIZER_FACTORY_DICT[name]

    # `module:class` syntax. This fallback used to sit after an if/elif/else
    # whose every branch returned or raised, so it could never execute.
    factory_class = dynamic_import(name)
    assert issubclass(factory_class, OptimizerFactoryInterface)
    return factory_class


================================================
FILE: optimizer/parser.py
================================================
"""Common optimizer default config for multiple backends."""


def sgd(parser):
    """Register the SGD options (`--lr`, `--weight-decay`) and return `parser`."""
    options = (
        ("--lr", 1.0, "Learning rate"),
        ("--weight-decay", 0.0, "Weight decay"),
    )
    for flag, default, help_text in options:
        parser.add_argument(flag, type=float, default=default, help=help_text)
    return parser


def adam(parser):
    """Register the Adam options on `parser` and return it.

    Options: `--lr`, `--beta1`, `--beta2` and `--weight-decay`, all floats.
    """
    options = (
        ("--lr", 1e-3, "Learning rate"),
        ("--beta1", 0.9, "Beta1"),
        ("--beta2", 0.999, "Beta2"),
        ("--weight-decay", 0.0, "Weight decay"),
    )
    for flag, default, help_text in options:
        parser.add_argument(flag, type=float, default=default, help=help_text)
    return parser


def adadelta(parser):
    """Register the Adadelta options on `parser` and return it.

    Options: `--rho`, `--eps` and `--weight-decay`, all floats.
    """
    options = (
        ("--rho", 0.95, "Rho"),
        ("--eps", 1e-8, "Eps"),
        ("--weight-decay", 0.0, "Weight decay"),
    )
    for flag, default, help_text in options:
        parser.add_argument(flag, type=float, default=default, help=help_text)
    return parser


================================================
FILE: optimizer/pytorch.py
================================================
"""PyTorch optimizer builders."""
import argparse

import torch

from espnet.optimizer.factory import OptimizerFactoryInterface
from espnet.optimizer.parser import adadelta
from espnet.optimizer.parser import adam
from espnet.optimizer.parser import sgd


class AdamFactory(OptimizerFactoryInterface):
    """Factory building `torch.optim.Adam` optimizers."""

    @staticmethod
    def add_arguments(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        """Attach the shared Adam options to `parser` and return it."""
        return adam(parser)

    @staticmethod
    def from_args(target, args: argparse.Namespace):
        """Create an Adam optimizer over `target`.

        Args:
            target: for pytorch `model.parameters()`,
                for chainer `model`
            args (argparse.Namespace): parsed command-line args; `lr`,
                `weight_decay`, `beta1` and `beta2` are read.

        """
        options = dict(
            lr=args.lr,
            weight_decay=args.weight_decay,
            betas=(args.beta1, args.beta2),
        )
        return torch.optim.Adam(target, **options)


class SGDFactory(OptimizerFactoryInterface):
    """Factory building `torch.optim.SGD` optimizers."""

    @staticmethod
    def add_arguments(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        """Attach the shared SGD options to `parser` and return it."""
        return sgd(parser)

    @staticmethod
    def from_args(target, args: argparse.Namespace):
        """Create an SGD optimizer over `target`.

        Args:
            target: for pytorch `model.parameters()`,
                for chainer `model`
            args (argparse.Namespace): parsed command-line args; `lr` and
                `weight_decay` are read.

        """
        options = dict(lr=args.lr, weight_decay=args.weight_decay)
        return torch.optim.SGD(target, **options)


class AdadeltaFactory(OptimizerFactoryInterface):
    """Factory building `torch.optim.Adadelta` optimizers."""

    @staticmethod
    def add_arguments(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        """Attach the shared Adadelta options to `parser` and return it."""
        return adadelta(parser)

    @staticmethod
    def from_args(target, args: argparse.Namespace):
        """Create an Adadelta optimizer over `target`.

        Args:
            target: for pytorch `model.parameters()`,
                for chainer `model`
            args (argparse.Namespace): parsed command-line args; `rho`,
                `eps` and `weight_decay` are read.

        """
        options = dict(
            rho=args.rho,
            eps=args.eps,
            weight_decay=args.weight_decay,
        )
        return torch.optim.Adadelta(target, **options)


# Alias -> PyTorch optimizer factory class defined in this module.
OPTIMIZER_FACTORY_DICT = {
    "adam": AdamFactory,
    "sgd": SGDFactory,
    "adadelta": AdadeltaFactory,
}


================================================
FILE: scheduler/__init__.py
================================================
"""Initialize sub package."""


================================================
FILE: scheduler/chainer.py
================================================
"""Chainer optimizer schedulers."""

from typing import List

from chainer.optimizer import Optimizer

from espnet.scheduler.scheduler import SchedulerInterface


class ChainerScheduler:
    """Apply a list of schedulers to attributes of a chainer optimizer."""

    def __init__(self, schedulers: List[SchedulerInterface], optimizer: Optimizer):
        """Remember the optimizer's current value for every scheduled key."""
        self.schedulers = schedulers
        self.optimizer = optimizer
        self.init_values = {
            sched.key: getattr(self.optimizer, sched.key)
            for sched in self.schedulers
        }

    def step(self, n_iter: int):
        """Set each scheduled attribute to its initial value times the scale."""
        for sched in self.schedulers:
            base_value = self.init_values[sched.key]
            setattr(self.optimizer, sched.key, base_value * sched.scale(n_iter))


================================================
FILE: scheduler/pytorch.py
================================================
"""PyTorch optimizer schedulers."""

from typing import List

from torch.optim import Optimizer

from espnet.scheduler.scheduler import SchedulerInterface


class PyTorchScheduler:
    """Apply a list of schedulers to the param groups of a torch optimizer."""

    def __init__(self, schedulers: List[SchedulerInterface], optimizer: Optimizer):
        """Record `initial_<key>` in every param group unless already present."""
        self.schedulers = schedulers
        self.optimizer = optimizer
        for sched in self.schedulers:
            initial_key = "initial_" + sched.key
            for param_group in optimizer.param_groups:
                param_group.setdefault(initial_key, param_group[sched.key])

    def step(self, n_iter: int):
        """Set each scheduled key to its recorded initial value times the scale."""
        for sched in self.schedulers:
            for param_group in self.optimizer.param_groups:
                base_value = param_group["initial_" + sched.key]
                param_group[sched.key] = base_value * sched.scale(n_iter)


================================================
FILE: scheduler/scheduler.py
================================================
"""Schedulers."""

import argparse

from espnet.utils.dynamic_import import dynamic_import
from espnet.utils.fill_missing_args import fill_missing_args


class _PrefixParser:
    def __init__(self, parser, prefix):
        self.parser = parser
        self.prefix = prefix

    def add_argument(self, name, **kwargs):
        assert name.startswith("--")
        self.parser.add_argument(self.prefix + name[2:], **kwargs)


class SchedulerInterface:
    """Scheduler interface.

    Subclasses set `alias`, optionally register options in `_add_arguments`
    and implement `scale`.
    """

    alias = ""

    def __init__(self, key: str, args: argparse.Namespace):
        """Initialize class.

        Every entry of `args` whose name starts with ``<key>_<alias>_`` is
        copied onto the instance with that prefix stripped.

        Args:
            key (str): key of hyper parameter
            args (argparse.Namespace): namespace holding the prefixed options

        """
        self.key = key
        # Kept so that get_arg() can read prefixed options; previously this
        # attribute was never set and get_arg() raised AttributeError.
        self.args = args
        prefix = key + "_" + self.alias + "_"
        for k, v in vars(args).items():
            if k.startswith(prefix):
                setattr(self, k[len(prefix) :], v)

    def get_arg(self, name):
        """Get argument without prefix."""
        return getattr(self.args, f"{self.key}_{self.alias}_{name}")

    @classmethod
    def add_arguments(cls, key: str, parser: argparse.ArgumentParser):
        """Add arguments for CLI.

        Options are registered in an argument group named after the alias,
        each prefixed with ``--<key>-<alias>-``. Returns `parser`.
        """
        group = parser.add_argument_group(f"{cls.alias} scheduler")
        cls._add_arguments(_PrefixParser(parser=group, prefix=f"--{key}-{cls.alias}-"))
        return parser

    @staticmethod
    def _add_arguments(parser: _PrefixParser):
        """Hook for subclasses to register their options; no-op here."""
        pass

    @classmethod
    def build(cls, key: str, **kwargs):
        """Initialize this class with python-level args.

        Args:
            key (str): key of hyper parameter

        Returns:
            SchedulerInterface: A new instance of `cls`.

        """

        def add(parser):
            return cls.add_arguments(key, parser)

        # Keyword names get the same prefix that __init__ strips again.
        kwargs = {f"{key}_{cls.alias}_" + k: v for k, v in kwargs.items()}
        args = argparse.Namespace(**kwargs)
        args = fill_missing_args(args, add)
        return cls(key, args)

    def scale(self, n_iter: int) -> float:
        """Scale at `n_iter`.

        Args:
            n_iter (int): number of current iterations.

        Returns:
            float: current scale of learning rate.

        """
        raise NotImplementedError()


# Alias -> "module:class" import path, filled in by `register_scheduler`.
SCHEDULER_DICT = {}


def register_scheduler(cls):
    """Record `cls` in SCHEDULER_DICT under its alias and return it unchanged."""
    import_path = ":".join((cls.__module__, cls.__name__))
    SCHEDULER_DICT[cls.alias] = import_path
    return cls


def dynamic_import_scheduler(module):
    """Resolve a scheduler class from an alias or import path.

    Args:
        module (str): module_name:class_name or alias in `SCHEDULER_DICT`

    Returns:
        type: Scheduler class

    Raises:
        AssertionError: if the resolved class does not subclass
            SchedulerInterface.

    """
    scheduler_class = dynamic_import(module, SCHEDULER_DICT)
    is_scheduler = issubclass(scheduler_class, SchedulerInterface)
    assert is_scheduler, f"{module} does not implement SchedulerInterface"
    return scheduler_class


@register_scheduler
class NoScheduler(SchedulerInterface):
    """Scheduler which does nothing.

    Registered under the alias ``"none"``; its scale factor is always 1.0.
    """

    alias = "none"

    def scale(self, n_iter):
        """Return the constant scale 1.0, ignoring `n_iter`."""
        return 1.0


@register_scheduler
class NoamScheduler(SchedulerInterface):
    """Warmup followed by inverse-square-root decay.

    Args:
        noam_warmup (int): number of warmup iterations.

    """

    alias = "noam"

    @staticmethod
    def _add_arguments(parser: _PrefixParser):
        """Register the `--warmup` option (int, default 1000)."""
        parser.add_argument(
            "--warmup", type=int, default=1000, help="Number of warmup iterations."
        )

    def __init__(self, key, args):
        """Read the options via the base class and precompute the normalizer."""
        super().__init__(key, args)
        warmup = self.warmup
        self.normalize = 1 / (warmup * warmup ** -1.5)

    def scale(self, step):
        """Return the lr scale for the zero-based iteration `step`."""
        step += 1  # because step starts from 0
        decay = step ** -0.5
        ramp = step * self.warmup ** -1.5
        return self.normalize * min(decay, ramp)


@register_scheduler
class CyclicCosineScheduler(SchedulerInterface):
    """Cyclic cosine annealing.

    Args:
        cosine_warmup (int): number of warmup iterations.
        cosine_total (int): number of total annealing iterations.

    Notes:
        Proposed in https://openreview.net/pdf?id=BJYwwY9ll
        (and https://arxiv.org/pdf/1608.03983.pdf).
        Used in the GPT2 config of Megatron-LM https://github.com/NVIDIA/Megatron-LM

    """

    alias = "cosine"

    @staticmethod
    def _add_arguments(parser: _PrefixParser):
        """Register `--warmup` (default 1000) and `--total` (default 100000)."""
        parser.add_argument(
            "--warmup", type=int, default=1000, help="Number of warmup iterations."
        )
        parser.add_argument(
            "--total",
            type=int,
            default=100000,
            help="Number of total annealing iterations.",
        )

    def scale(self, n_iter):
        """Return ``0.5 * (cos(pi * (n_iter - warmup) / total) + 1)``."""
        import math

        phase = math.pi * (n_iter - self.warmup) / self.total
        return 0.5 * (math.cos(phase) + 1)


================================================
FILE: snowfall/__init__.py
================================================


================================================
FILE: snowfall/common.py
================================================
#!/usr/bin/env python3

# Copyright 2019-2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
# Apache 2.0
import argparse
import logging
import os
import re
from collections import defaultdict
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, TextIO, Tuple, Union

import k2
import kaldialign
import torch
import torch.distributed as dist
from torch.cuda.amp import GradScaler
from torch.nn.parallel import DistributedDataParallel

from snowfall.models import AcousticModel

Pathlike = Union[str, Path]


def setup_logger(log_filename: Pathlike, log_level: str = 'info', use_console: bool = True) -> None:
    """Configure the root logger to write to a timestamped file.

    Args:
        log_filename: path prefix of the log file; ``-<YYYY-mm-dd-HH-MM-SS>``
            is appended. Missing parent directories are created.
        log_level: 'debug', 'info' or 'warning'; any other value selects
            ``logging.ERROR``.
        use_console: when True, also attach a console handler with the same
            level and format to the root logger.
    """
    timestamp = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    log_filename = '{}-{}'.format(log_filename, timestamp)
    # A bare file name has an empty dirname and os.makedirs('') raises
    # FileNotFoundError, so only create the directory when there is one.
    log_dir = os.path.dirname(log_filename)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    if dist.is_available() and dist.is_initialized():
        # Tag every record with "(rank/world_size)" in distributed runs.
        world_size = dist.get_world_size()
        rank = dist.get_rank()
        formatter = f'%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] ({rank}/{world_size}) %(message)s'
    else:
        formatter = '%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s'

    levels = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
    }
    level = levels.get(log_level, logging.ERROR)

    logging.basicConfig(filename=log_filename,
                        format=formatter,
                        level=level,
                        filemode='w')
    if use_console:
        console = logging.StreamHandler()
        console.setLevel(level)
        console.setFormatter(logging.Formatter(formatter))
        logging.getLogger('').addHandler(console)


def load_checkpoint(
        filename: Pathlike,
        model: AcousticModel,
        optimizer: Optional[object] = None,
        scheduler: Optional[object] = None,
        scaler: Optional[GradScaler] = None,
) -> Dict[str, Any]:
    """Load a checkpoint file into `model` and the optional training objects.

    Args:
        filename: checkpoint path, loaded onto the CPU.
        model: model receiving the weights; a DistributedDataParallel wrapper
            is unwrapped first. Its `num_features`, `num_classes` and
            `subsampling_factor` attributes are overwritten from the file.
        optimizer: if given, restored from ``checkpoint['optimizer']``.
        scheduler: if given, restored from ``checkpoint['scheduler']``.
        scaler: if given, restored from ``checkpoint['grad_scaler']``.

    Returns:
        The loaded checkpoint dictionary.

    Raises:
        ValueError: if a required key is missing from the checkpoint.
    """
    logging.info('load checkpoint from {}'.format(filename))

    checkpoint = torch.load(filename, map_location='cpu')

    keys = [
        'state_dict', 'optimizer', 'scheduler', 'epoch', 'learning_rate', 'objf', 'valid_objf',
        'num_features', 'num_classes', 'subsampling_factor',
        'global_batch_idx_train'
    ]
    missing_keys = set(keys) - set(checkpoint.keys())
    if missing_keys:
        raise ValueError(f"Missing keys in checkpoint: {missing_keys}")

    if isinstance(model, DistributedDataParallel):
        model = model.module

    if not list(model.state_dict().keys())[0].startswith('module.') \
            and list(checkpoint['state_dict'])[0].startswith('module.'):
        # the checkpoint was saved by DDP
        logging.info('load checkpoint from DDP')
        dst_state_dict = model.state_dict()
        src_state_dict = checkpoint['state_dict']
        for key in dst_state_dict.keys():
            src_key = '{}.{}'.format('module', key)
            dst_state_dict[key] = src_state_dict.pop(src_key)
        # Every saved tensor must have been consumed by the model.
        assert len(src_state_dict) == 0
        model.load_state_dict(dst_state_dict)
    else:
        model.load_state_dict(checkpoint['state_dict'])

    model.num_features = checkpoint['num_features']
    model.num_classes = checkpoint['num_classes']
    model.subsampling_factor = checkpoint['subsampling_factor']

    # A checkpoint may hold None for any of these states (save_checkpoint
    # stores None for objects it was not given), and 'grad_scaler' is not a
    # required key. Skip such entries with a warning instead of crashing.
    restorable = (
        ('optimizer', optimizer),
        ('scheduler', scheduler),
        ('grad_scaler', scaler),
    )
    for state_key, target in restorable:
        if target is None:
            continue
        saved_state = checkpoint.get(state_key)
        if saved_state is None:
            logging.warning(
                'checkpoint {} has no {} state; leaving it uninitialized'.format(
                    filename, state_key))
            continue
        target.load_state_dict(saved_state)

    return checkpoint


def average_checkpoint(filenames: List[Pathlike], model: AcousticModel) -> Dict[str, Any]:
    """Average the weights of several checkpoints and load them into `model`.

    Floating-point tensors are averaged with true division, all other
    tensors with floor division.

    Args:
        filenames: checkpoint paths; must not be empty.
        model: model receiving the averaged weights; a
            DistributedDataParallel wrapper is unwrapped first, as in
            `load_checkpoint`. Its `num_features`, `num_classes` and
            `subsampling_factor` attributes are overwritten.

    Returns:
        The last loaded checkpoint dictionary with 'state_dict' replaced by
        the averaged weights.

    Raises:
        ValueError: if `filenames` is empty or a required key is missing.
    """
    logging.info('average over checkpoints {}'.format(filenames))

    if not filenames:
        # Without this guard the code below failed with an opaque
        # AttributeError on None.
        raise ValueError('average_checkpoint needs at least one checkpoint file')

    avg_model = None

    # sum (accumulated in place into the first checkpoint's tensors)
    for filename in filenames:
        checkpoint = torch.load(filename, map_location='cpu')
        checkpoint_model = checkpoint['state_dict']
        if avg_model is None:
            avg_model = checkpoint_model
        else:
            for k in avg_model.keys():
                avg_model[k] += checkpoint_model[k]
    # average
    for k in avg_model.keys():
        if avg_model[k] is not None:
            if avg_model[k].is_floating_point():
                avg_model[k] /= len(filenames)
            else:
                avg_model[k] //= len(filenames)

    checkpoint['state_dict'] = avg_model

    keys = [
        'state_dict', 'optimizer', 'scheduler', 'epoch', 'learning_rate', 'objf', 'valid_objf',
        'num_features', 'num_classes', 'subsampling_factor',
        'global_batch_idx_train'
    ]
    missing_keys = set(keys) - set(checkpoint.keys())
    if missing_keys:
        raise ValueError(f"Missing keys in checkpoint: {missing_keys}")

    # Same unwrapping as load_checkpoint, so a DDP-wrapped model is accepted.
    if isinstance(model, DistributedDataParallel):
        model = model.module

    if not list(model.state_dict().keys())[0].startswith('module.') \
            and list(checkpoint['state_dict'])[0].startswith('module.'):
        # the checkpoint was saved by DDP
        logging.info('load checkpoint from DDP')
        dst_state_dict = model.state_dict()
        src_state_dict = checkpoint['state_dict']
        for key in dst_state_dict.keys():
            src_key = '{}.{}'.format('module', key)
            dst_state_dict[key] = src_state_dict.pop(src_key)
        assert len(src_state_dict) == 0
        model.load_state_dict(dst_state_dict)
    else:
        model.load_state_dict(checkpoint['state_dict'])

    model.num_features = checkpoint['num_features']
    model.num_classes = checkpoint['num_classes']
    model.subsampling_factor = checkpoint['subsampling_factor']

    return checkpoint


def save_checkpoint(
        filename: Pathlike,
        model: Union[AcousticModel, DistributedDataParallel],
        optimizer: object,
        scheduler: object,
        epoch: int,
        learning_rate: float,
        objf: float,
        valid_objf: float,
        global_batch_idx_train: int,
        local_rank: int = 0,
        scaler: Optional[GradScaler] = None
) -> None:
    """Write the model weights and the training state to ``filename``.

    Nothing is written when ``local_rank`` is a non-zero integer; only rank 0
    (or a caller passing ``None``) saves.  A ``DistributedDataParallel``
    wrapper is unwrapped first, so the state of the inner module is stored.

    Args:
      filename: Destination passed to ``torch.save``.
      model: The model (possibly wrapped in DDP) whose state is saved.
      optimizer: Object with a ``state_dict()`` method, or ``None``.
      scheduler: Object with a ``state_dict()`` method, or ``None``.
      epoch: Stored under ``'epoch'``.
      learning_rate: Stored under ``'learning_rate'``.
      objf: Stored under ``'objf'``.
      valid_objf: Stored under ``'valid_objf'``.
      global_batch_idx_train: Stored under ``'global_batch_idx_train'``.
      local_rank: Rank of the calling process.
      scaler: Optional gradient scaler; its state goes under ``'grad_scaler'``.
    """
    skip_saving = local_rank is not None and local_rank != 0
    if skip_saving:
        return

    net = model.module if isinstance(model, DistributedDataParallel) else model

    logging.info(f'Save checkpoint to {filename}: epoch={epoch}, '
                 f'learning_rate={learning_rate}, objf={objf}, valid_objf={valid_objf}')

    def _state_or_none(component):
        # Components that were not supplied are stored as None.
        return None if component is None else component.state_dict()

    checkpoint = dict(
        state_dict=net.state_dict(),
        optimizer=_state_or_none(optimizer),
        scheduler=_state_or_none(scheduler),
        grad_scaler=_state_or_none(scaler),
        epoch=epoch,
        learning_rate=learning_rate,
        objf=objf,
        valid_objf=valid_objf,
        global_batch_idx_train=global_batch_idx_train,
        num_features=net.num_features,
        num_classes=net.num_classes,
        subsampling_factor=net.subsampling_factor,
    )
    torch.save(checkpoint, filename)


def save_training_info(
        filename: Pathlike,
        model_path: Pathlike,
        current_epoch: int,
        learning_rate: float,
        objf: float,
        best_objf: float,
        valid_objf: float,
        best_valid_objf: float,
        best_epoch: int,
        local_rank: int = 0
):
    """Write a plain-text summary of the training progress to ``filename``.

    The file is overwritten and receives one ``label: value`` line per
    argument.  Nothing happens when ``local_rank`` is a non-zero integer.
    """
    skip_writing = local_rank is not None and local_rank != 0
    if skip_writing:
        return

    # (label, value) pairs in the order in which they appear in the file.
    entries = (
        ('model_path', model_path),
        ('epoch', current_epoch),
        ('learning rate', learning_rate),
        ('objf', objf),
        ('best objf', best_objf),
        ('valid objf', valid_objf),
        ('best valid objf', best_valid_objf),
        ('best epoch', best_epoch),
    )
    with open(filename, 'w') as f:
        f.writelines('{}: {}\n'.format(label, value) for label, value in entries)

    logging.info('write training info to {}'.format(filename))


def get_phone_symbols(symbol_table: k2.SymbolTable,
                      pattern: str = r'^#\d+$') -> List[int]:
    '''Collect the IDs of all symbols that are not disambiguation symbols.

    Caution:
      ID 0 is not a phone, so it is dropped from the result.

    Args:
      symbol_table:
        A symbol table in k2.
      pattern:
        Regular expression; a symbol matched by it (from the start of the
        string) is treated as a disambiguation symbol.
    Returns:
      The sorted list of IDs of the remaining symbols.
    '''
    disambig = re.compile(pattern)
    phone_ids = [
        symbol_table[sym]
        for sym in symbol_table.symbols
        if disambig.match(sym) is None
    ]
    try:
        phone_ids.remove(0)
    except ValueError:
        # 0 was not among the collected IDs; nothing to drop.
        pass
    phone_ids.sort()
    return phone_ids


def cut_id_dumper(dataloader, path: Path):
    """
    Debugging utility. Iterates over ``dataloader``, yielding every batch
    unchanged, and writes the ID of each processed cut to ``path`` (one per line).
    Expects ``return_cuts=True`` to be passed to the Dataset class; when it was
    not, the batches are passed through and no file is written.

    Args:
        dataloader: iterable of batches with a ``dataset.return_cuts`` attribute.
            With ``return_cuts`` enabled, every batch must provide the cuts
            under ``batch['supervisions']['cut']``.
        path: file that receives the cut IDs; it is overwritten.

    Example::

        >>> for batch in cut_id_dumper(dataloader, Path('cut_ids.txt')):
        ...     pass
    """
    if not dataloader.dataset.return_cuts:
        # This function is a generator, so `return dataloader` would finish the
        # iteration without producing a single batch.  Pass the batches through
        # explicitly instead.
        yield from dataloader
        return
    with path.open('w') as f:
        for batch in dataloader:
            for cut in batch['supervisions']['cut']:
                print(cut.id, file=f)
            yield batch


def str2bool(v):
    """Convert a command-line token to a bool (usable as an argparse ``type``).

    A bool is returned unchanged.  Otherwise ``v`` is lower-cased and looked up
    among the accepted spellings of true and false.

    Raises:
      argparse.ArgumentTypeError: if the token is not a recognised spelling.
    """
    if isinstance(v, bool):
        return v

    token = v.lower()
    truthy = ('yes', 'true', 't', 'y', '1')
    falsy = ('no', 'false', 'f', 'n', '0')
    if token in truthy:
        return True
    if token in falsy:
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')


def describe(model: torch.nn.Module):
    """Log a table with the element count of every named parameter of ``model``.

    One line is logged per parameter, with the count right-aligned so that the
    line is 80 characters wide for short names, followed by the overall total.
    Everything goes through ``logging.info``.
    """
    rule = '=' * 80
    logging.info(rule)
    logging.info('Model parameters summary:')
    logging.info(rule)

    total = 0
    for name, param in model.named_parameters():
        count = param.numel()
        # 4 characters are taken by the '* ' prefix and the ': ' separator.
        width = 80 - len(name) - 4
        logging.info(f'* {name}: {count:>{width}}')
        total += count

    logging.info(rule)
    logging.info(f'Total: {total}')
    logging.info(rule)


def get_texts(best_paths: k2.Fsa, indices: Optional[torch.Tensor] = None) -> List[List[int]]:
    '''Extract the texts from the best-path FSAs, in the original order (before
       the permutation given by `indices`).
       Args:
           best_paths:  a k2.Fsa with best_paths.arcs.num_axes() == 3, i.e.
                    containing multiple FSAs, which is expected to be the result
                    of k2.shortest_path (otherwise the returned values won't
                    be meaningful).  Must have the 'aux_labels' attribute, as
                    a ragged tensor.
           indices: possibly a torch.Tensor giving the permutation that we used
                    on the supervisions of this minibatch to put them in decreasing
                    order of num-frames.  We'll apply the inverse permutation.
                    Doesn't have to be on the same device as `best_paths`.
                    If None, the texts are returned in the order of `best_paths`.
      Return:
          Returns a list of lists of int, containing the label sequences we
          decoded.
    '''
    # remove any 0's or -1's (there should be no 0's left but may be -1's.)
    aux_labels = k2.ragged.remove_values_leq(best_paths.aux_labels, 0)
    aux_shape = k2.ragged.compose_ragged_shapes(best_paths.arcs.shape(),
                                                aux_labels.shape())
    # remove the states and arcs axes.
    aux_shape = k2.ragged.remove_axis(aux_shape, 1)
    aux_shape = k2.ragged.remove_axis(aux_shape, 1)
    aux_labels = k2.RaggedInt(aux_shape, aux_labels.values())
    assert (aux_labels.num_axes() == 2)
    if indices is not None:
        # Undo the sorting of the minibatch.  Without a permutation (the
        # default) there is nothing to undo; calling invert_permutation(None)
        # would raise.
        aux_labels, _ = k2.ragged.index(aux_labels,
                                        invert_permutation(indices).to(dtype=torch.int32,
                                                                       device=best_paths.device))
    return k2.ragged.to_list(aux_labels)


def invert_permutation(indices: torch.Tensor) -> torch.Tensor:
    """Return the inverse of the permutation stored in the 1-D tensor ``indices``.

    The result ``inv`` is an int64 tensor on the device of ``indices`` with
    ``inv[indices[i]] == i`` for every position ``i``.
    """
    target_device = indices.device
    inverse = torch.zeros(indices.shape, device=target_device, dtype=torch.long)
    positions = torch.arange(0, indices.shape[0], device=target_device)
    # Scatter each position to the slot named by the permutation.
    inverse[indices] = positions
    return inverse


def find_first_disambig_symbol(symbols: k2.SymbolTable) -> int:
    """Return the smallest ID among the symbols whose name starts with ``#``.

    Raises ``ValueError`` (from ``min``) when the table has no such symbol.
    """
    disambig_ids = [
        sym_id
        for name, sym_id in symbols._sym2id.items()
        if name.startswith('#')
    ]
    return min(disambig_ids)


def store_transcripts(path: Pathlike, texts: Iterable[Tuple[str, str]]):
    """Write (reference, hypothesis) pairs to ``path``.

    The file is overwritten.  Each pair produces two lines: ``ref=<reference>``
    followed by ``hyp=<hypothesis>``.
    """
    with open(path, 'w') as out:
        for reference, hypothesis in texts:
            out.write(f'ref={reference}\n')
            out.write(f'hyp={hypothesis}\n')

def write_error_stats(f: TextIO, test_set_name: str, results: List[Tuple[str,str]]) -> None:
    """Align references with hypotheses and write an error report to ``f``.

    Every (ref, hyp) pair is aligned with ``kaldialign.align`` and the
    substitutions, insertions and deletions are counted.  A one-line summary is
    logged with ``logging.info`` and a detailed report is written to ``f`` with
    the sections PER-UTT DETAILS, SUBSTITUTIONS, DELETIONS, INSERTIONS and
    PER-WORD STATS.

    Args:
      f:
        Writable text stream that receives the report.
      test_set_name:
        Name shown in the logged summary line.
      results:
        The (ref, hyp) pairs to score.  The reference length is the sum of
        ``len(ref)`` and aligned tokens are joined with spaces, so each ref/hyp
        is presumably a sequence of word strings -- TODO confirm against callers.

    NOTE(review): the error rate divides by the total reference length, so an
    empty ``results`` (or only empty references) raises ``ZeroDivisionError``.
    """
    subs: Dict[Tuple[str,str], int] = defaultdict(int)
    ins: Dict[str, int] = defaultdict(int)
    dels: Dict[str, int] = defaultdict(int)

    # `words` stores counts per word, as follows:
    #   corr, ref_sub, hyp_sub, ins, dels
    words: Dict[str, List[int]] = defaultdict(lambda: [0,0,0,0,0])
    num_corr = 0
    # Placeholder passed to the aligner; it marks the missing side of an
    # insertion or a deletion in the alignment.
    ERR = '*'
    for ref, hyp in results:
        ali = kaldialign.align(ref, hyp, ERR)
        for ref_word,hyp_word in ali:
            if ref_word == ERR:
                # Nothing on the reference side: insertion.
                ins[hyp_word] += 1
                words[hyp_word][3] += 1
            elif hyp_word == ERR:
                # Nothing on the hypothesis side: deletion.
                dels[ref_word] += 1
                words[ref_word][4] += 1
            elif hyp_word != ref_word:
                # Substitution: counted for both the reference and the
                # hypothesis word.
                subs[(ref_word,hyp_word)] += 1
                words[ref_word][1] += 1
                words[hyp_word][2] += 1
            else:
                words[ref_word][0] += 1
                num_corr += 1
    ref_len = sum([len(r) for r,_ in results])
    sub_errs = sum(subs.values())
    ins_errs = sum(ins.values())
    del_errs = sum(dels.values())
    tot_errs = sub_errs + ins_errs + del_errs
    tot_err_rate = '%.2f' % (100.0 * tot_errs / ref_len)

    logging.info(
        f'[{test_set_name}] %WER {tot_errs / ref_len:.2%} '
        f'[{tot_errs} / {ref_len}, {ins_errs} ins, {del_errs} del, {sub_errs} sub ]'
    )

    print(f"%WER = {tot_err_rate}", file=f)
    print(f"Errors: {ins_errs} insertions, {del_errs} deletions, {sub_errs} substitutions, over {ref_len} reference words ({num_corr} correct)",
          file=f)
    print("Search below for sections starting with PER-UTT DETAILS:, SUBSTITUTIONS:, DELETIONS:, INSERTIONS:, PER-WORD STATS:",
          file=f)

    print("", file=f)
    print("PER-UTT DETAILS: corr or (ref->hyp)  ", file=f)
    for ref, hyp in results:
        # The alignment is recomputed here; it is not kept from the counting
        # pass above.
        ali = kaldialign.align(ref, hyp, ERR)
        combine_successive_errors = True
        if combine_successive_errors:
            # Wrap both sides of every aligned pair in a list so that
            # neighbouring errors can be concatenated.
            ali = [ [[x],[y]] for x,y in ali ]
            # Fold each erroneous pair into the following one when that one is
            # erroneous too; the folded pair is emptied and removed below.
            for i in range(len(ali) - 1):
                if ali[i][0] != ali[i][1] and ali[i+1][0] != ali[i+1][1]:
                    ali[i+1][0] = ali[i][0] + ali[i+1][0]
                    ali[i+1][1] = ali[i][1] + ali[i+1][1]
                    ali[i] = [[],[]]
            # Drop the placeholders inside the merged groups ...
            ali = [ [list(filter(lambda a: a != ERR, x)),
                     list(filter(lambda a: a != ERR, y))]
                     for x,y in ali ]
            # ... remove the emptied pairs ...
            ali = list(filter(lambda x: x != [[],[]], ali))
            # ... and turn every group back into a string, using the
            # placeholder for a side that ended up empty.
            ali = [ [ERR if x == [] else ' '.join(x),
                     ERR if y == [] else ' '.join(y)]
                    for x,y in ali ]

        print(' '.join((ref_word if ref_word == hyp_word else f'({ref_word}->{hyp_word})'
                        for ref_word,hyp_word in ali)), file=f)


    print("", file=f)
    print("SUBSTITUTIONS: count ref -> hyp", file=f)

    # Most frequent first; ties are ordered by the (ref, hyp) pair, descending.
    for count,(ref,hyp) in sorted([(v,k) for k,v in subs.items()], reverse=True):
        print(f"{count}   {ref} -> {hyp}", file=f)

    print("", file=f)
    print("DELETIONS: count ref", file=f)
    for count,ref in sorted([(v,k) for k,v in dels.items()], reverse=True):
        print(f"{count}   {ref}", file=f)

    print("", file=f)
    print("INSERTIONS: count hyp", file=f)
    for count,hyp in sorted([(v,k) for k,v in ins.items()], reverse=True):
        print(f"{count}   {hyp}", file=f)

    print("", file=f)
    print("PER-WORD STATS: word  corr tot_errs count_in_ref count_in_hyp", file=f)
    # Sorted by the number of errors a word is involved in, descending.
    for _,word,counts in sorted([(sum(v[1:]),k,v) for k,v in words.items()], reverse=True):
        # NOTE: this rebinds `ins`, `dels` and `tot_errs`, which named the
        # global tallies above; those are not used any more at this point.
        (corr, ref_sub, hyp_sub, ins, dels) = counts
        tot_errs = ref_sub + hyp_sub + ins + dels
        ref_count = corr + ref_sub + dels
        hyp_count = corr + hyp_sub + ins

        print(f"{word}   {corr} {tot_errs} {ref_count} {hyp_count}", file=f)


================================================
FILE: snowfall/data/__init__.py
================================================
from .aishell import AishellAsrDataModule
from .asr_datamodule import AsrDataModule
from .datamodule import DataModule
from .librispeech import LibriSpeechAsrDataModule

================================================
FILE: snowfall/data/aishell.py
================================================
import logging
from functools import lru_cache

from lhotse import CutSet, load_manifest
from snowfall.data.asr_datamodule import AsrDataModule


class AishellAsrDataModule(AsrDataModule):
    """
    ASR data module for Aishell.

    Each split is read from a cut manifest stored in ``args.feature_dir``.
    The loaders are wrapped in ``lru_cache``, so calling one repeatedly on the
    same instance reuses the result of the first call.
    """
    @lru_cache()
    def train_cuts(self) -> CutSet:
        """Load the training cuts from ``cuts_train.json.gz``."""
        logging.info("About to get train cuts")
        manifest = self.args.feature_dir / 'cuts_train.json.gz'
        return load_manifest(manifest)

    @lru_cache()
    def valid_cuts(self) -> CutSet:
        """Load the validation cuts from ``cuts_dev.json.gz``."""
        logging.info("About to get valid cuts")
        manifest = self.args.feature_dir / 'cuts_dev.json.gz'
        return load_manifest(manifest)

    @lru_cache()
    def test_cuts(self) -> CutSet:
        """Load the test cuts from ``cuts_test.json.gz``."""
        logging.info("About to get test cuts")
        manifest = self.args.feature_dir / 'cuts_test.json.gz'
        return load_manifest(manifest)


================================================
FILE: snowfall/data/asr_datamodule.py
================================================
import argparse
import logging
from pathlib import Path
from typing import List, Union

from torch.utils.data import DataLoader

from lhotse import Fbank, FbankConfig, load_manifest
from lhotse.dataset import BucketingSampler, CutConcatenate, CutMix, K2SpeechRecognitionDataset, SingleCutSampler, \
    SpecAugment
from lhotse.dataset.input_strategies import OnTheFlyFeatures
from snowfall.common import str2bool
from snowfall.data.datamodule import DataModule


class AsrDataModule(DataModule):
    """
    DataModule for K2 ASR experiments.
    It assumes there is always one train and valid dataloader,
    but there can be multiple test dataloaders (e.g. LibriSpeech test-clean and test-other).

    It contains all the common data pipeline modules used in ASR experiments, e.g.:
    - dynamic batch size,
    - bucketing samplers,
    - cut concatenation,
    - augmentation,
    - on-the-fly feature extraction

    This class should be derived for specific corpora used in ASR tasks.
    """

    @classmethod
    def add_arguments(cls, parser: argparse.ArgumentParser):
        """Register the ASR data options on ``parser``, after those of the parent class."""
        super().add_arguments(parser)
        group = parser.add_argument_group(
            title='ASR data related options',
            description='These options are used for the preparation of PyTorch DataLoaders '
                        'from Lhotse CutSet\'s -- they control the effective batch sizes, '
                        'sampling strategies, applied data augmentations, etc.'
        )
        group.add_argument(
            '--feature-dir',
            type=Path,
            default=Path('exp/data'),
            help='Path to directory with train/valid/test cuts.'
        )
        group.add_argument(
            '--max-duration',
            # A duration in seconds.  argparse does not run `type` on a
            # non-string default, so with `type=int` the option was a float by
            # default but an int when given on the command line, and fractional
            # values were rejected.  Parsing as float keeps both cases
            # consistent and still accepts every integer input.
            type=float,
            default=500.0,
            help="Maximum pooled recordings duration (seconds) in a single batch.")
        group.add_argument(
            '--bucketing-sampler',
            type=str2bool,
            default=False,
            help='When enabled, the batches will come from buckets of '
                 'similar duration (saves padding frames).')
        group.add_argument(
            '--num-buckets',
            type=int,
            default=30,
            help='The number of buckets for the BucketingSampler'
                 '(you might want to increase it for larger datasets).')
        group.add_argument(
            '--concatenate-cuts',
            type=str2bool,
            default=True,
            help='When enabled, utterances (cuts) will be concatenated '
                 'to minimize the amount of padding.')
        group.add_argument(
            '--duration-factor',
            type=float,
            default=1.0,
            help='Determines the maximum duration of a concatenated cut '
                 'relative to the duration of the longest cut in a batch.')
        group.add_argument(
            '--gap',
            type=float,
            default=1.0,
            help='The amount of padding (in seconds) inserted between concatenated cuts. '
                 'This padding is filled with noise when noise augmentation is used.')
        group.add_argument(
            '--on-the-fly-feats',
            type=str2bool,
            default=False,
            help='When enabled, use on-the-fly cut mixing and feature extraction. '
                 'Will drop existing precomputed feature manifests if available.'
        )

    def train_dataloaders(self) -> DataLoader:
        """Build the training DataLoader.

        The training cuts are mixed with Musan cuts (``cuts_musan.json.gz`` in
        ``args.feature_dir``), optionally concatenated, and SpecAugment is
        applied to the inputs.  Batches are formed by a shuffling
        ``BucketingSampler`` or ``SingleCutSampler`` limited by
        ``args.max_duration``.
        """
        logging.info("About to get train cuts")
        cuts_train = self.train_cuts()

        logging.info("About to get Musan cuts")
        cuts_musan = load_manifest(self.args.feature_dir / 'cuts_musan.json.gz')

        logging.info("About to create train dataset")
        transforms = [CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20))]
        if self.args.concatenate_cuts:
            logging.info(f'Using cut concatenation with duration factor '
                         f'{self.args.duration_factor} and gap {self.args.gap}.')
            # Cut concatenation should be the first transform in the list,
            # so that if we e.g. mix noise in, it will fill the gaps between different utterances.
            transforms = [
                             CutConcatenate(
                                 duration_factor=self.args.duration_factor,
                                 gap=self.args.gap
                             )
                         ] + transforms

        input_transforms = [
            SpecAugment(num_frame_masks=2, features_mask_size=27, num_feature_masks=2, frames_mask_size=100)
        ]

        train = K2SpeechRecognitionDataset(
            cuts_train,
            cut_transforms=transforms,
            input_transforms=input_transforms
        )

        if self.args.on_the_fly_feats:
            # NOTE: the PerturbSpeed transform should be added only if we remove it from data prep stage.
            # # Add on-the-fly speed perturbation; since originally it would have increased epoch
            # # size by 3, we will apply prob 2/3 and use 3x more epochs.
            # # Speed perturbation probably should come first before concatenation,
            # # but in principle the transforms order doesn't have to be strict (e.g. could be randomized)
            # transforms = [PerturbSpeed(factors=[0.9, 1.1], p=2 / 3)] + transforms
            # Drop feats to be on the safe side.
            cuts_train = cuts_train.drop_features()
            # The dataset built above is replaced by one that computes the
            # features while loading.
            train = K2SpeechRecognitionDataset(
                cuts=cuts_train,
                cut_transforms=transforms,
                input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))),
                input_transforms=input_transforms
            )

        if self.args.bucketing_sampler:
            logging.info('Using BucketingSampler.')
            train_sampler = BucketingSampler(
                cuts_train,
                max_duration=self.args.max_duration,
                shuffle=True,
                num_buckets=self.args.num_buckets
            )
        else:
            logging.info('Using SingleCutSampler.')
            train_sampler = SingleCutSampler(
                cuts_train,
                max_duration=self.args.max_duration,
                shuffle=True,
            )
        logging.info("About to create train dataloader")
        # batch_size=None: the sampler already yields whole batches.
        train_dl = DataLoader(
            train,
            sampler=train_sampler,
            batch_size=None,
            num_workers=4,
            persistent_workers=True,
        )
        return train_dl

    def valid_dataloaders(self) -> DataLoader:
        """Build the validation DataLoader.

        No noise mixing or SpecAugment is applied; the cuts are only
        concatenated when ``args.concatenate_cuts`` is set.
        """
        logging.info("About to get dev cuts")
        cuts_valid = self.valid_cuts()

        transforms = [ ]
        if self.args.concatenate_cuts:
            transforms = [ CutConcatenate(
                                 duration_factor=self.args.duration_factor,
                                 gap=self.args.gap)
                          ] + transforms


        logging.info("About to create dev dataset")
        if self.args.on_the_fly_feats:
            # The features are dropped once; the same cut set is used for the
            # dataset and for the sampler below.
            cuts_valid = cuts_valid.drop_features()
            validate = K2SpeechRecognitionDataset(
                cuts_valid,
                cut_transforms=transforms,
                input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
            )
        else:
            validate = K2SpeechRecognitionDataset(cuts_valid,
                                                  cut_transforms=transforms)
        valid_sampler = SingleCutSampler(
            cuts_valid,
            max_duration=self.args.max_duration,
            shuffle=True,
        )
        logging.info("About to create dev dataloader")
        valid_dl = DataLoader(
            validate,
            sampler=valid_sampler,
            batch_size=None,
            num_workers=2,
            persistent_workers=True,
        )
        return valid_dl

    def test_dataloaders(self) -> Union[DataLoader, List[DataLoader]]:
        """Build one test DataLoader per test cut set.

        The return value mirrors ``test_cuts()``: a list of loaders when it
        returns a list, a single loader otherwise.  Features are always
        computed on the fly here, regardless of ``args.on_the_fly_feats``.
        """
        cuts = self.test_cuts()
        is_list = isinstance(cuts, list)
        test_loaders = []
        if not is_list:
            cuts = [cuts]

        for cuts_test in cuts:
            logging.debug("About to create test dataset")
            test = K2SpeechRecognitionDataset(
                cuts_test,
                input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
            )
            sampler = SingleCutSampler(cuts_test, max_duration=self.args.max_duration)
            logging.debug("About to create test dataloader")
            test_dl = DataLoader(test, batch_size=None, sampler=sampler, num_workers=1)
            test_loaders.append(test_dl)

        if is_list:
            return test_loaders
        else:
            return test_loaders[0]


================================================
FILE: snowfall/data/datamodule.py
================================================
from torch.utils.data import DataLoader
from typing import List, Union

import argparse

from lhotse import CutSet


class DataModule:
    """
    Base class for dataset-related code: reading or constructing Lhotse cuts
    and turning them into a Dataset/Sampler/DataLoader.

    Each of the train/valid/test splits has its own method for the cuts and for
    the DataLoader.  A split may consist of several parts (e.g. a corpus with
    multiple test sets), which is why the methods may return either a single
    object or a list of them.

    All split-related methods raise ``NotImplementedError`` here and have to be
    overridden by subclasses.
    """
    def __init__(self, args: argparse.Namespace):
        # Parsed command-line options, kept for the subclasses.
        self.args = args

    @classmethod
    def add_arguments(cls, parser: argparse.ArgumentParser):
        """Hook for registering command-line options; the base class adds none."""
        return None

    def train_cuts(self) -> Union[CutSet, List[CutSet]]:
        """Return the training cuts."""
        raise NotImplementedError

    def valid_cuts(self) -> Union[CutSet, List[CutSet]]:
        """Return the validation cuts."""
        raise NotImplementedError

    def test_cuts(self) -> Union[CutSet, List[CutSet]]:
        """Return the test cuts."""
        raise NotImplementedError

    def train_dataloaders(self) -> Union[DataLoader, List[DataLoader]]:
        """Return the training DataLoader(s)."""
        raise NotImplementedError

    def valid_dataloaders(self) -> Union[DataLoader, List[DataLoader]]:
        """Return the validation DataLoader(s)."""
        raise NotImplementedError

    def test_dataloaders(self) -> Union[DataLoader, List[DataLoader]]:
        """Return the test DataLoader(s)."""
        raise NotImplementedError


================================================
FILE: snowfall/data/librispeech.py
================================================
import argparse

from functools import lru_cache

import logging
from typing import List

from lhotse import CutSet, load_manifest
from snowfall.common import str2bool
from snowfall.data.asr_datamodule import AsrDataModule


class LibriSpeechAsrDataModule(AsrDataModule):
    """
    ASR data module for LibriSpeech, covering either the 100h subset
    (``--full-libri false``) or the full 960h training set.
    The standard Libri train splits are merged into one CutSet, and so are the
    two dev splits; the two test sets are kept separate.
    """

    @classmethod
    def add_arguments(cls, parser: argparse.ArgumentParser):
        """Register the LibriSpeech options in addition to those of the parent class."""
        super().add_arguments(parser)
        libri_group = parser.add_argument_group(title='LibriSpeech specific options')
        libri_group.add_argument(
            '--full-libri',
            type=str2bool,
            default=True,
            help='When enabled, use 960h LibriSpeech.')

    @lru_cache()
    def train_cuts(self) -> CutSet:
        """Load train-clean-100 and, with ``--full-libri``, the 360h and 500h parts too."""
        logging.info("About to get train cuts")
        feature_dir = self.args.feature_dir
        cuts_train = load_manifest(feature_dir / 'cuts_train-clean-100.json.gz')
        if not self.args.full_libri:
            return cuts_train
        # Append the remaining parts in the fixed order 360h, 500h.
        for manifest_name in ('cuts_train-clean-360.json.gz',
                              'cuts_train-other-500.json.gz'):
            cuts_train = cuts_train + load_manifest(feature_dir / manifest_name)
        return cuts_train

    @lru_cache()
    def valid_cuts(self) -> CutSet:
        """Load dev-clean and dev-other, combined into one CutSet."""
        logging.info("About to get dev cuts")
        dev_clean = load_manifest(self.args.feature_dir / 'cuts_dev-clean.json.gz')
        dev_other = load_manifest(self.args.feature_dir / 'cuts_dev-other.json.gz')
        return dev_clean + dev_other

    @lru_cache()
    def test_cuts(self) -> List[CutSet]:
        """Load test-clean and test-other as a list of two CutSets, in that order."""
        loaded = []
        for test_set in ('test-clean', 'test-other'):
            logging.debug("About to get test cuts")
            manifest = self.args.feature_dir / f'cuts_{test_set}.json.gz'
            loaded.append(load_manifest(manifest))
        return loaded


================================================
FILE: snowfall/decoding/__init__.py
================================================


================================================
FILE: snowfall/decoding/graph.py
================================================
import logging

import k2
import torch
from k2 import Fsa


def compile_HLG(
        L: Fsa,
        G: Fsa,
        H: Fsa,
        labels_disambig_id_start: int,
        aux_labels_disambig_id_start: int
) -> Fsa:
    """
    Creates a decoding graph from a topology ``H``, a lexicon fst ``L`` and a
    language model fsa ``G``.
    Involves arc sorting, composition, determinization, removal of
    disambiguation symbols and of epsilons, and a final composition with ``H``.

    Args:
        L:
            An ``Fsa`` that represents the lexicon (L), i.e. has phones as ``symbols``
                and words as ``aux_symbols``.
        G:
            An ``Fsa`` that represents the language model (G), i.e. it's an acceptor
            with words as ``symbols``.
        H:  An ``Fsa`` that represents a specific topology used to convert the network
            outputs to a sequence of phones.
            Typically, it's a CTC topology fst, in which when 0 appears on the left
            side, it represents the blank symbol; when it appears on the right side,
            it indicates an epsilon.
        labels_disambig_id_start:
            An integer ID corresponding to the first disambiguation symbol in the
            phonetic alphabet.
        aux_labels_disambig_id_start:
            An integer ID corresponding to the first disambiguation symbol in the
            words vocabulary.
    Returns:
        The connected and arc-sorted composition of ``H`` with the processed
        ``L*G``.  It carries an extra attribute ``lm_scores``, a copy of the
        arc scores of the graph.
    """
    L = k2.arc_sort(L)
    G = k2.arc_sort(G)
    logging.info("Intersecting L and G")
    LG = k2.compose(L, G)
    logging.info(f'LG shape = {LG.shape}')
    logging.info("Connecting L*G")
    LG = k2.connect(LG)
    logging.info(f'LG shape = {LG.shape}')
    logging.info("Determinizing L*G")
    LG = k2.determinize(LG)
    logging.info(f'LG shape = {LG.shape}')
    logging.info("Connecting det(L*G)")
    LG = k2.connect(LG)
    logging.info(f'LG shape = {LG.shape}')
    logging.info("Removing disambiguation symbols on L*G")
    # Every ID at or above the first disambiguation ID is mapped to 0.
    LG.labels[LG.labels >= labels_disambig_id_start] = 0
    # aux_labels is either a plain tensor or a ragged tensor at this point;
    # in the ragged case the flat values are modified in place.
    if isinstance(LG.aux_labels, torch.Tensor):
        LG.aux_labels[LG.aux_labels >= aux_labels_disambig_id_start] = 0
    else:
        LG.aux_labels.values()[LG.aux_labels.values() >= aux_labels_disambig_id_start] = 0
    logging.info("Removing epsilons")
    LG = k2.remove_epsilon(LG)
    logging.info(f'LG shape = {LG.shape}')
    logging.info("Connecting rm-eps(det(L*G))")
    LG = k2.connect(LG)
    logging.info(f'LG shape = {LG.shape}')
    # Drop the zeros that replaced the word disambiguation symbols above.
    LG.aux_labels = k2.ragged.remove_values_eq(LG.aux_labels, 0)

    logging.info("Arc sorting LG")
    LG = k2.arc_sort(LG)

    logging.info("Composing ctc_topo LG")
    HLG = k2.compose(H, LG, inner_labels='phones')

    # The three messages below used to say "LG" although the operations are
    # applied to HLG.
    logging.info("Connecting HLG")
    HLG = k2.connect(HLG)

    logging.info("Arc sorting HLG")
    HLG = k2.arc_sort(HLG)
    logging.info(
        f'HLG is arc sorted: {(HLG.properties & k2.fsa_properties.ARC_SORTED) != 0}'
    )

    # Attach a new attribute `lm_scores` so that we can recover
    # the `am_scores` later.
    # The scores on an arc consists of two parts:
    #  scores = am_scores + lm_scores
    # NOTE: we assume that both kinds of scores are in log-space.
    HLG.lm_scores = HLG.scores.clone()
    return HLG


================================================
FILE: snowfall/decoding/lm_rescore.py
================================================
# Copyright (c)  2021  Xiaomi Corporation (authors: Fangjun Kuang)

from typing import Optional

import math

import k2
import torch


def _intersect_device(a_fsas: k2.Fsa, b_fsas: k2.Fsa, b_to_a_map: torch.Tensor,
                      sorted_match_a: bool, batch_size: int = 500):
    '''This is a wrapper of k2.intersect_device and its purpose is to split
    b_fsas into several batches and process each batch separately to avoid
    CUDA OOM error.

    The first four arguments and the return value of this function are the
    same as for k2.intersect_device.

    Args:
      a_fsas:
        Passed through to k2.intersect_device.
      b_fsas:
        The FsaVec that is processed in chunks along its first axis.
      b_to_a_map:
        Maps each Fsa in b_fsas to an Fsa in a_fsas; it is sliced together
        with b_fsas.
      sorted_match_a:
        Passed through to k2.intersect_device.
      batch_size:
        Maximum number of FSAs of b_fsas handed to k2.intersect_device at
        once.  Decrease it in case of CUDA out of memory error.
    Raises:
      ValueError: if batch_size is not positive.
    '''
    if batch_size <= 0:
        raise ValueError(f'batch_size must be positive, got {batch_size}')

    num_fsas = b_fsas.shape[0]
    if num_fsas <= batch_size:
        # Small enough to be processed in one go.
        return k2.intersect_device(a_fsas,
                                   b_fsas,
                                   b_to_a_map=b_to_a_map,
                                   sorted_match_a=sorted_match_a)

    ans = []
    for start in range(0, num_fsas, batch_size):
        end = min(start + batch_size, num_fsas)
        # Same dtype and device as b_to_a_map.
        indexes = torch.arange(start, end).to(b_to_a_map)

        fsas = k2.index(b_fsas, indexes)
        b_to_a = k2.index(b_to_a_map, indexes)
        path_lats = k2.intersect_device(a_fsas,
                                        fsas,
                                        b_to_a_map=b_to_a,
                                        sorted_match_a=sorted_match_a)
        ans.append(path_lats)

    return k2.cat(ans)


def compute_am_scores(lats: k2.Fsa, word_fsas_with_epsilon_loops: k2.Fsa,
                      path_to_seq_map: torch.Tensor) -> torch.Tensor:
    '''Compute the acoustic-model (AM) score of every path of an n-best list.

    Args:
      lats:
        An FsaVec, which is the output of `k2.intersect_dense_pruned`.
        It must have the attribute `lm_scores`.
      word_fsas_with_epsilon_loops:
        An FsaVec representing a n-best list. Note that it has been processed
        by `k2.add_epsilon_self_loops`.
      path_to_seq_map:
        A 1-D torch.Tensor with dtype torch.int32. path_to_seq_map[i] indicates
        which sequence the i-th Fsa in word_fsas_with_epsilon_loops belongs to.
        path_to_seq_map.numel() == word_fsas_with_epsilon_loops.arcs.dim0().
    Returns:
      Return a 1-D torch.Tensor containing the AM scores of each path.
      `ans.numel() == word_fsas_with_epsilon_loops.shape[0]`
    '''
    device = lats.device
    assert len(lats.shape) == 3
    assert hasattr(lats, 'lm_scores')

    # k2.compose() has no b_to_a_map argument, so k2.intersect_device is used
    # to avoid replicating `lats`.  The intersection matches on `labels`;
    # `lats` has phone IDs there and word IDs in `aux_labels`, hence the
    # inversion.
    word_lats = k2.invert(lats)

    # After the inversion the phone IDs sit in `aux_labels`; they play no role
    # in the rest of the computation and are discarded.
    del word_lats.aux_labels
    word_lats = k2.arc_sort(word_lats)

    path_lats = _intersect_device(word_lats,
                                  word_fsas_with_epsilon_loops,
                                  b_to_a_map=path_to_seq_map,
                                  sorted_match_a=True)

    # NOTE: `k2.connect` and `k2.top_sort` support only CPU at present,
    # so the lattices make a round trip through the CPU.
    connected = k2.connect(path_lats.to('cpu'))
    path_lats = k2.top_sort(connected).to(device)

    # Each arc score is am_score + lm_score; removing the LM part leaves the
    # AM contribution.
    path_lats.scores = path_lats.scores - path_lats.lm_scores

    return path_lats.get_tot_scores(True, True)


@torch.no_grad()
def rescore_with_n_best_list(lats: k2.Fsa, G: k2.Fsa,
                             num_paths: int) -> k2.Fsa:
    '''Decode using n-best list with LM rescoring.

    `lats` is a decoding lattice, which has 3 axes. This function first
    extracts `num_paths` paths from `lats` for each sequence using
    `k2.random_paths`. The `am_scores` of these paths are computed.
    For each path, its `lm_scores` is computed using `G` (which is an LM).
    The final `tot_scores` is the sum of `am_scores` and `lm_scores`.
    The path with the greatest `tot_scores` within a sequence is used
    as the decoding output.

    Paths that map to the same word sequence within one utterance are
    collapsed before scoring, so each distinct word sequence is scored once.

    Args:
      lats:
        An FsaVec. It can be the output of `k2.intersect_dense_pruned`.
        It must have the attributes `aux_labels` and `lm_scores`
        (both are asserted below).
      G:
        An FsaVec representing the language model (LM). Note that it
        is an FsaVec, but it contains only one Fsa. It must live on the
        same device as `lats` and must NOT have `aux_labels`
        (both are asserted below).
      num_paths:
        It is the size `n` in `n-best` list.
    Returns:
      An FsaVec representing the best decoding path for each sequence
      in the lattice. It is built with `k2.linear_fsa` from the labels of
      the selected paths, with the corresponding `aux_labels` attached.
    '''
    device = lats.device

    # Preconditions on the lattice.
    assert len(lats.shape) == 3
    assert hasattr(lats, 'aux_labels')
    assert hasattr(lats, 'lm_scores')

    # Preconditions on the LM: a single Fsa, same device, no aux_labels.
    assert G.shape == (1, None, None)
    assert G.device == device
    assert hasattr(G, 'aux_labels') is False

    # First, extract `num_paths` paths for each sequence.
    # paths is a k2.RaggedInt with axes [seq][path][arc_pos]
    paths = k2.random_paths(lats, num_paths=num_paths, use_double_scores=True)

    # word_seqs is a k2.RaggedInt sharing the same shape as `paths`
    # but it contains word IDs. Note that it also contains 0s and -1s.
    # The last entry in each sublist is -1.
    word_seqs = k2.index(lats.aux_labels, paths)

    # Remove epsilons and -1 from word_seqs
    word_seqs = k2.ragged.remove_values_leq(word_seqs, 0)

    # Remove repeated sequences to avoid redundant computation later.
    #
    # unique_word_seqs is still a k2.RaggedInt with 3 axes [seq][path][word]
    # except that there are no repeated paths with the same word_seq
    # within a seq.
    #
    # num_repeats is also a k2.RaggedInt with 2 axes containing the
    # multiplicities of each path.
    # num_repeats.num_elements() == unique_word_seqs.num_elements()
    #
    # Since k2.ragged.unique_sequences will reorder paths within a seq,
    # `new2old` is a 1-D torch.Tensor mapping from the output path index
    # to the input path index.
    # new2old.numel() == unique_word_seqs.num_elements()
    unique_word_seqs, num_repeats, new2old = k2.ragged.unique_sequences(
        word_seqs, need_num_repeats=True, need_new2old_indexes=True)

    # Shape describing how the unique paths are grouped by sequence; it is
    # reused below to group the per-path scores.
    seq_to_path_shape = k2.ragged.get_layer(unique_word_seqs.shape(), 0)

    # path_to_seq_map is a 1-D torch.Tensor.
    # path_to_seq_map[i] is the seq to which the i-th path
    # belongs.
    path_to_seq_map = seq_to_path_shape.row_ids(1)

    # Remove the seq axis.
    # Now unique_word_seqs has only two axes [path][word]
    unique_word_seqs = k2.ragged.remove_axis(unique_word_seqs, 0)

    # word_fsas is an FsaVec with axes [path][state][arc]
    word_fsas = k2.linear_fsa(unique_word_seqs)

    word_fsas_with_epsilon_loops = k2.add_epsilon_self_loops(word_fsas)

    # Acoustic score of every unique path, obtained by intersecting the
    # path FSAs with the lattice of the sequence each path belongs to.
    am_scores = compute_am_scores(lats, word_fsas_with_epsilon_loops,
                                  path_to_seq_map)

    # Now compute lm_scores
    # Every path is intersected with the single LM in G, hence an all-zero
    # map from path index to the index of the Fsa in G.
    b_to_a_map = torch.zeros_like(path_to_seq_map)
    lm_path_lats = _intersect_device(G,
                                     word_fsas_with_epsilon_loops,
                                     b_to_a_map=b_to_a_map,
                                     sorted_match_a=True)
    # connect/top_sort are run on CPU and the result is moved back.
    lm_path_lats = k2.top_sort(k2.connect(lm_path_lats.to('cpu'))).to(device)
    lm_scores = lm_path_lats.get_tot_scores(True, True)

    tot_scores = am_scores + lm_scores

    # Remember that we used `k2.ragged.unique_sequences` to remove repeated
    # paths to avoid redundant computation in `k2.intersect_device`.
    # Now we use `num_repeats` to correct the scores for each path.
    #
    # NOTE(fangjun): It is commented out as it leads to a worse WER
    # tot_scores = tot_scores * num_repeats.values()

    # TODO(fangjun): We may need to add `k2.RaggedDouble`
    # The scores are cast to float32 because they are wrapped in a
    # k2.RaggedFloat here.
    ragged_tot_scores = k2.RaggedFloat(seq_to_path_shape,
                                       tot_scores.to(torch.float32))
    # Index (among the unique paths) of the best-scoring path per sequence.
    argmax_indexes = k2.ragged.argmax_per_sublist(ragged_tot_scores)

    # Use k2.index here since argmax_indexes' dtype is torch.int32
    # new2old translates indexes of unique paths back to indexes into the
    # originally sampled `paths`.
    best_path_indexes = k2.index(new2old, argmax_indexes)

    # Drop the seq axis so that `paths` can be indexed by a flat path index.
    paths = k2.ragged.remove_axis(paths, 0)

    # best_paths is a k2.RaggedInt with 2 axes [path][arc_pos]
    best_paths = k2.index(paths, best_path_indexes)

    # labels is a k2.RaggedInt with 2 axes [path][phone_id]
    # Note that it contains -1s.
    labels = k2.index(lats.labels.contiguous(), best_paths)

    labels = k2.ragged.remove_values_eq(labels, -1)

    # lats.aux_labels is a k2.RaggedInt tensor with 2 axes, so
    # aux_labels is also a k2.RaggedInt with 2 axes
    aux_labels = k2.index(lats.aux_labels, best_paths.values())

    best_path_fsas = k2.linear_fsa(labels)
    best_path_fsas.aux_labels = aux_labels

    return best_path_fsas


@torch.no_grad()
def rescore_with_whole_lattice(lats: k2.Fsa,
                               G_with_epsilon_loops: k2.Fsa) -> k2.Fsa:
    '''Use whole lattice to rescore.

    The LM scores stored in `lats.lm_scores` are subtracted from the lattice
    scores, the lattice is intersected with `G_with_epsilon_loops`, and the
    best path of the resulting lattice is returned.

    Caution:
      `lats.scores` is reassigned on the input object (the LM part is
      subtracted), so the caller's lattice is modified by this call.

    Args:
      lats:
        An FsaVec It can be the output of `k2.intersect_dense_pruned`.
        It must have the attribute `lm_scores` (asserted below).
      G_with_epsilon_loops:
        An FsaVec representing the language model (LM). Note that it
        is an FsaVec, but it contains only one Fsa.
    Returns:
      The output of `k2.shortest_path` on the rescored lattice, i.e. the
      best path for each sequence in `lats`.
    Raises:
      RuntimeError: if the intersection still fails after the lattice has
        been pruned once (the retry is not guarded).
    '''
    assert len(lats.shape) == 3
    assert hasattr(lats, 'lm_scores')
    assert G_with_epsilon_loops.shape == (1, None, None)

    device = lats.device
    # NOTE: this assigns to an attribute of the input `lats`.
    lats.scores = lats.scores - lats.lm_scores
    # Now, lats.scores contains only am_scores

    # inverted_lats has word IDs as labels.
    # Its aux_labels are phone IDs, which is a ragged tensor k2.RaggedInt
    inverted_lats = k2.invert(lats)
    num_seqs = lats.shape[0]
    inverted_lats_with_epsilon_loops = k2.add_epsilon_self_loops(inverted_lats)

    # Every sequence is intersected with the single LM, hence all zeros.
    b_to_a_map = torch.zeros(num_seqs, device=device, dtype=torch.int32)
    try:
        rescoring_lats = k2.intersect_device(G_with_epsilon_loops,
                                             inverted_lats_with_epsilon_loops,
                                             b_to_a_map,
                                             sorted_match_a=True)
    except RuntimeError as e:
        # Fallback: report the failure, prune the lattice and retry once.
        print(f'Caught exception:\n{e}\n')
        print(f'Number of FSAs: {inverted_lats.shape[0]}')
        print('num_arcs before pruning: ',
              inverted_lats_with_epsilon_loops.arcs.num_elements())

        # NOTE(fangjun): The choice of the threshold 0.001 is arbitrary here
        # to avoid OOM. We may need to fine tune it.
        # NOTE(review): the third positional argument is presumably
        # `use_double_scores` -- confirm against the k2 API.
        inverted_lats = k2.prune_on_arc_post(inverted_lats, 0.001, True)
        inverted_lats_with_epsilon_loops = k2.add_epsilon_self_loops(
            inverted_lats)
        print('num_arcs after pruning: ',
              inverted_lats_with_epsilon_loops.arcs.num_elements())

        # A failure here is not caught and propagates to the caller.
        rescoring_lats = k2.intersect_device(G_with_epsilon_loops,
                                             inverted_lats_with_epsilon_loops,
                                             b_to_a_map,
                                             sorted_match_a=True)

    # connect/top_sort are run on CPU and the result is moved back.
    rescoring_lats = k2.top_sort(k2.connect(
        rescoring_lats.to('cpu'))).to(device)
    inverted_rescoring_lats = k2.invert(rescoring_lats)
    # inverted rescoring_lats has phone IDs as labels
    # and word IDs as aux_labels.

    inverted_rescoring_lats = k2.remove_epsilon_self_loops(
        inverted_rescoring_lats)
    best_paths = k2.shortest_path(inverted_rescoring_lats,
                                  use_double_scores=True)
    return best_paths


@torch.no_grad()
def decode_with_lm_rescoring(lats: k2.Fsa, G: k2.Fsa, num_paths: int,
                             use_whole_lattice: bool) -> k2.Fsa:
    '''Decode a lattice with LM rescoring, choosing the rescoring strategy.

    This is a thin dispatcher:

      - `use_whole_lattice=True` forwards to `rescore_with_whole_lattice`,
        which rescores the complete lattice with the LM.
      - `use_whole_lattice=False` forwards to `rescore_with_n_best_list`,
        which samples `num_paths` paths per sequence, scores each of them
        with the acoustic model and the LM, and keeps the best one.

    Args:
      lats:
        An FsaVec (3 axes). It can be the output of
        `k2.intersect_dense_pruned`.
      G:
        An FsaVec representing the language model (LM). Note that it
        is an FsaVec, but it contains only one Fsa. In whole-lattice mode
        it is passed as the `G_with_epsilon_loops` argument of
        `rescore_with_whole_lattice`.
      num_paths:
        It is the size `n` in `n-best` list.
        Used only if use_whole_lattice is False.
      use_whole_lattice:
        True to use whole lattice for rescoring. False to use n-best list
        for rescoring.
    Returns:
      An FsaVec representing the best decoding path for each sequence
      in the lattice.
    '''
    if not use_whole_lattice:
        return rescore_with_n_best_list(lats, G, num_paths)
    return rescore_with_whole_lattice(lats, G)


================================================
FILE: snowfall/dist.py
================================================
import os
import torch
from torch import distributed as dist


def setup_dist(rank, world_size, master_port=None):
    """Join the NCCL process group and bind this process to GPU `rank`.

    Args:
      rank:
        Rank of the current process; also used as the CUDA device index.
      world_size:
        Total number of processes in the group.
      master_port:
        Port of the rendezvous master. Defaults to '12354' when None;
        any other value is converted with `str`.
    """
    port = str(master_port) if master_port is not None else '12354'
    # The rendezvous address is always the local machine.
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = port
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)


def cleanup_dist():
    """Destroy the process group created by `dist.init_process_group`."""
    dist.destroy_process_group()


================================================
FILE: snowfall/lexicon.py
================================================
import logging
import re
from pathlib import Path
from typing import List, Pattern, Union

import k2
import torch


class Lexicon:
    """Symbol tables and the inverted lexicon FSA of a `lang` directory.

    Attributes set by the constructor:
      lang_dir: the directory the data was read from (a `Path`).
      phones:   `k2.SymbolTable` read from `phones.txt`.
      words:    `k2.SymbolTable` read from `words.txt`.
      L_inv:    the arc-sorted, inverted lexicon FSA.
    """

    def __init__(self, lang_dir: Path):
        """Load the symbol tables and the inverted lexicon.

        `Linv.pt` is used when it exists; otherwise `L.fst.txt` is parsed,
        inverted, arc-sorted and the result is cached as `Linv.pt`.

        Args:
          lang_dir:
            Directory containing `phones.txt`, `words.txt` and either
            `Linv.pt` or `L.fst.txt`.
        """
        # Accept plain strings as well; Path(Path(...)) is a no-op.
        self.lang_dir = Path(lang_dir)
        self.phones = k2.SymbolTable.from_file(self.lang_dir / 'phones.txt')
        self.words = k2.SymbolTable.from_file(self.lang_dir / 'words.txt')

        logging.info("Loading L.fst")
        cache_path = self.lang_dir / 'Linv.pt'
        if cache_path.exists():
            # NOTE(review): torch.load unpickles the file, so `lang_dir`
            # must only ever point at trusted data.
            L_inv = k2.Fsa.from_dict(torch.load(cache_path))
        else:
            # Keep the file open only for the read itself.
            with open(self.lang_dir / 'L.fst.txt') as f:
                L = k2.Fsa.from_openfst(f.read(), acceptor=False)
            L_inv = k2.arc_sort(L.invert_())
            torch.save(L_inv.as_dict(), cache_path)
        self.L_inv = L_inv

    def phone_symbols(
            self,
            regex: Union[str, Pattern] = re.compile(r'^#\d+$')
    ) -> List[int]:
        '''Return a list of phone IDs containing no disambiguation symbols.

        Caution:
          0 is not a phone ID so it is excluded from the return value.

        Args:
          regex:
            Symbols matching this pattern (tested with `match`, i.e. anchored
            at the start of the symbol) are disambiguation symbols. Either a
            compiled pattern or a pattern string may be given.
        Returns:
          Return a sorted list of symbol IDs excluding those from
          disambiguation symbols.
        '''
        # The parameter used to be annotated as `str` although only compiled
        # patterns worked; accept both forms.
        if isinstance(regex, str):
            regex = re.compile(regex)

        ids = [
            self.phones[symbol]
            for symbol in self.phones.symbols
            if not regex.match(symbol)
        ]
        return sorted(i for i in ids if i != 0)


================================================
FILE: snowfall/models/__init__.py
================================================
from .interface import AcousticModel
from .tdnn import *


================================================
FILE: snowfall/models/conformer.py
================================================
#!/usr/bin/env python3

# Copyright (c)  2021  University of Chinese Academy of Sciences (author: Han Zhu)
# Apache 2.0

import k2
import math
import torch
from torch import Tensor, nn
from typing import Dict, List, Optional, Tuple
import warnings
import copy

from snowfall.common import get_texts
from snowfall.models.transformer import Transformer, encoder_padding_mask


class Conformer(Transformer):
    """
    Transformer variant whose encoder is a stack of Conformer blocks with
    relative positional encoding. Everything else is inherited from
    `Transformer`; only `encoder_pos` and `encoder` are replaced.

    Args:
        num_features (int): Number of input features
        num_classes (int): Number of output classes
        subsampling_factor (int): subsampling factor of encoder (the convolution layers before transformers)
        d_model (int): attention dimension
        nhead (int): number of head
        dim_feedforward (int): feedforward dimension
        num_encoder_layers (int): number of encoder layers
        num_decoder_layers (int): number of decoder layers
        dropout (float): dropout rate
        cnn_module_kernel (int): Kernel size of convolution module
        normalize_before (bool): whether to use layer_norm before the first block.
        vgg_frontend (bool): whether to use vgg frontend.
    """

    def __init__(self, num_features: int, num_classes: int, subsampling_factor: int = 4,
                 d_model: int = 256, nhead: int = 4, dim_feedforward: int = 2048,
                 num_encoder_layers: int = 12, num_decoder_layers: int = 6,
                 dropout: float = 0.1, cnn_module_kernel: int = 31,
                 normalize_before: bool = True, vgg_frontend: bool = False) -> None:
        super().__init__(num_features=num_features, num_classes=num_classes, subsampling_factor=subsampling_factor,
                         d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward,
                         num_encoder_layers=num_encoder_layers, num_decoder_layers=num_decoder_layers,
                         dropout=dropout, normalize_before=normalize_before, vgg_frontend=vgg_frontend)

        # Override the positional encoding and the encoder stack built by
        # the parent constructor.
        self.encoder_pos = RelPositionalEncoding(d_model, dropout)

        encoder_layer = ConformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, cnn_module_kernel, normalize_before)
        self.encoder = ConformerEncoder(encoder_layer, num_encoder_layers)

    def encode(self, x: Tensor, supervisions: Optional[Dict] = None) -> Tuple[Tensor, Optional[Tensor]]:
        """
        Args:
            x: Tensor of dimension (batch_size, num_features, input_length).
            supervisions : Supervision in lhotse format, i.e., batch['supervisions']

        Returns:
            Tensor: Predictor tensor of dimension (T, batch_size, d_model),
                where T is the time length after `self.encoder_embed`.
            Tensor: Mask tensor of dimension (batch_size, T), or None when
                `encoder_padding_mask` returns None.
        """
        x = x.permute(0, 2, 1)  # (B, F, T) -> (B, T, F)

        x = self.encoder_embed(x)
        x, pos_emb = self.encoder_pos(x)
        x = x.permute(1, 0, 2)  # (B, T, F) -> (T, B, F)
        mask = encoder_padding_mask(x.size(0), supervisions)
        # Identity test: `mask != None` would go through Tensor.__ne__.
        if mask is not None:
            mask = mask.to(x.device)
        x = self.encoder(x, pos_emb, src_key_padding_mask=mask)  # (T, B, F)

        return x, mask


class ConformerEncoderLayer(nn.Module):
    """
    One Conformer block: a macaron feed-forward module, relative-position
    self-attention, a convolution module and a second feed-forward module,
    each wrapped in a residual connection.
    See: "Conformer: Convolution-augmented Transformer for Speech Recognition"

    Args:
        d_model: the number of expected features in the input (required).
        nhead: the number of heads in the multiheadattention models (required).
        dim_feedforward: the dimension of the feedforward network model (default=2048).
        dropout: the dropout value (default=0.1).
        cnn_module_kernel (int): Kernel size of convolution module.
        normalize_before: whether to use layer_norm before the first block.

    Examples::
        >>> encoder_layer = ConformerEncoderLayer(d_model=512, nhead=8)
        >>> src = torch.rand(10, 32, 512)
        >>> pos_emb = torch.rand(32, 19, 512)
        >>> out = encoder_layer(src, pos_emb)
    """

    def __init__(self, d_model: int, nhead: int, dim_feedforward: int = 2048, dropout: float = 0.1,
                 cnn_module_kernel: int = 31, normalize_before: bool = True) -> None:
        super().__init__()

        def make_feed_forward() -> nn.Sequential:
            # Linear -> Swish -> Dropout -> Linear, back to d_model.
            return nn.Sequential(
                nn.Linear(d_model, dim_feedforward),
                Swish(),
                nn.Dropout(dropout),
                nn.Linear(dim_feedforward, d_model)
            )

        # The creation order fixes the parameter registration order, so it
        # must stay stable.
        self.self_attn = RelPositionMultiheadAttention(d_model, nhead, dropout=0.0)
        self.feed_forward = make_feed_forward()
        self.feed_forward_macaron = make_feed_forward()
        self.conv_module = ConvolutionModule(d_model, cnn_module_kernel)

        self.norm_ff_macaron = nn.LayerNorm(d_model)  # for the macaron style FNN module
        self.norm_ff = nn.LayerNorm(d_model)  # for the FNN module
        self.norm_mha = nn.LayerNorm(d_model)  # for the MHA module

        # Weight applied to the output of each of the two feed-forward modules.
        self.ff_scale = 0.5

        self.norm_conv = nn.LayerNorm(d_model)  # for the CNN module
        self.norm_final = nn.LayerNorm(d_model)  # for the final output of the block

        self.dropout = nn.Dropout(dropout)

        self.normalize_before = normalize_before

    def _residual_step(self, x: Tensor, norm: nn.Module, transform, scale: Optional[float] = None) -> Tensor:
        """Apply `transform` with a residual connection and pre- or post-norm.

        With `normalize_before`, `norm` is applied to the input of
        `transform`; otherwise it is applied to the sum. `scale`, when
        given, multiplies the (dropped-out) output of `transform`.
        """
        shortcut = x
        if self.normalize_before:
            x = norm(x)
        update = self.dropout(transform(x))
        if scale is not None:
            update = scale * update
        x = shortcut + update
        if not self.normalize_before:
            x = norm(x)
        return x

    def forward(self, src: Tensor, pos_emb: Tensor, src_mask: Optional[Tensor] = None,
                src_key_padding_mask: Optional[Tensor] = None) -> Tensor:
        """
        Pass the input through the encoder layer.

        Args:
            src: the sequence to the encoder layer (required).
            pos_emb: Positional embedding tensor (required).
            src_mask: the mask for the src sequence (optional).
            src_key_padding_mask: the mask for the src keys per batch (optional).

        Shape:
            src: (S, N, E).
            pos_emb: (N, 2*S-1, E) (RelPositionalEncoding in this file
                produces a leading dimension of 1).
            src_mask: (S, S).
            src_key_padding_mask: (N, S).
            S is the source sequence length, N is the batch size, E is the feature number
        """

        def attend(y: Tensor) -> Tensor:
            # Self-attention: query, key and value are all `y`; only the
            # attention output (first element) is used.
            return self.self_attn(y, y, y, pos_emb=pos_emb, attn_mask=src_mask,
                                  key_padding_mask=src_key_padding_mask)[0]

        # macaron style feed forward module
        src = self._residual_step(src, self.norm_ff_macaron, self.feed_forward_macaron, self.ff_scale)
        # multi-headed self-attention module
        src = self._residual_step(src, self.norm_mha, attend)
        # convolution module
        src = self._residual_step(src, self.norm_conv, self.conv_module)
        # feed forward module
        src = self._residual_step(src, self.norm_ff, self.feed_forward, self.ff_scale)

        # Pre-norm blocks leave the residual stream un-normalised, so it is
        # normalised once at the end.
        if self.normalize_before:
            src = self.norm_final(src)

        return src


class ConformerEncoder(nn.TransformerEncoder):
    r"""A stack of Conformer encoder layers applied one after another.

    Unlike the parent class, every layer additionally receives the relative
    positional embedding `pos_emb`.

    Args:
        encoder_layer: an instance of the ConformerEncoderLayer() class (required).
        num_layers: the number of sub-encoder-layers in the encoder (required).
        norm: the layer normalization component (optional).

    Examples::
        >>> encoder_layer = ConformerEncoderLayer(d_model=512, nhead=8)
        >>> conformer_encoder = ConformerEncoder(encoder_layer, num_layers=6)
        >>> src = torch.rand(10, 32, 512)
        >>> pos_emb = torch.rand(32, 19, 512)
        >>> out = conformer_encoder(src, pos_emb)
    """

    def __init__(self, encoder_layer: nn.Module, num_layers: int, norm: nn.Module = None) -> None:
        super().__init__(encoder_layer=encoder_layer, num_layers=num_layers, norm=norm)

    def forward(self, src: Tensor, pos_emb: Tensor, mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = None) -> Tensor:
        r"""Run `src` through every layer, then through `self.norm` if set.

        Args:
            src: the sequence to the encoder (required).
            pos_emb: Positional embedding tensor (required).
            mask: the mask for the src sequence (optional); forwarded to
                each layer as `src_mask`.
            src_key_padding_mask: the mask for the src keys per batch (optional).

        Shape:
            src: (S, N, E).
            pos_emb: (N, 2*S-1, E)
            mask: (S, S).
            src_key_padding_mask: (N, S).
            S is the source sequence length, N is the batch size, E is the feature number

        """
        hidden = src
        for layer in self.layers:
            hidden = layer(hidden, pos_emb, src_mask=mask, src_key_padding_mask=src_key_padding_mask)

        return hidden if self.norm is None else self.norm(hidden)


class RelPositionalEncoding(torch.nn.Module):
    """Sinusoidal relative positional encoding.

    A table of encodings for relative positions `L-1, ..., 1, 0, -1, ...,
    -(L-1)` is cached in `self.pe` and grown on demand; `forward` returns
    the scaled input together with the slice of the table that covers the
    input length.

    See : Appendix B in "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context"
    Modified from https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/transformer/embedding.py

    Args:
        d_model: Embedding dimension.
        dropout_rate: Dropout rate.
        max_len: Length the cached table is initially built for.

    """

    def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000) -> None:
        """Construct a RelPositionalEncoding object."""
        super().__init__()
        self.d_model = d_model
        self.xscale = math.sqrt(self.d_model)
        self.dropout = torch.nn.Dropout(p=dropout_rate)
        self.pe = None
        # Pre-build the table for `max_len` positions using a dummy input.
        self.extend_pe(torch.tensor(0.0).expand(1, max_len))

    def extend_pe(self, x: Tensor) -> None:
        """Make sure `self.pe` covers `x.size(1)` positions on x's dtype/device."""
        seq_len = x.size(1)
        cached = self.pe

        # self.pe holds both the positive and the negative part, so a table
        # for `seq_len` positions has 2 * seq_len - 1 rows.
        if cached is not None and cached.size(1) >= seq_len * 2 - 1:
            if cached.dtype != x.dtype or cached.device != x.device:
                self.pe = cached.to(dtype=x.dtype, device=x.device)
            return

        # Let `i` be the position of the query vector and `j` the position
        # of the key vector. Positive relative positions are used when the
        # key is to the left (i>j) and negative ones otherwise (i<j).
        position = torch.arange(0, seq_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, self.d_model, 2, dtype=torch.float32)
            * -(math.log(10000.0) / self.d_model)
        )
        angles = position * div_term

        pe_positive = torch.zeros(seq_len, self.d_model)
        pe_negative = torch.zeros(seq_len, self.d_model)
        pe_positive[:, 0::2] = torch.sin(angles)
        pe_positive[:, 1::2] = torch.cos(angles)
        pe_negative[:, 0::2] = torch.sin(-angles)
        pe_negative[:, 1::2] = torch.cos(-angles)

        # Reverse the order of the positive positions and append the
        # negative ones (position 0 is already in the positive part). This
        # supports the shifting trick of
        # "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context"
        table = torch.cat(
            [torch.flip(pe_positive, [0]), pe_negative[1:]], dim=0
        ).unsqueeze(0)
        self.pe = table.to(device=x.device, dtype=x.dtype)

    def forward(self, x: torch.Tensor) -> Tuple[Tensor, Tensor]:
        """Scale the input and return it with the matching positional encoding.

        Args:
            x (torch.Tensor): Input tensor (batch, time, `*`).

        Returns:
            torch.Tensor: Scaled input tensor (batch, time, `*`).
            torch.Tensor: Positional encoding tensor (1, 2*time-1, `*`).

        """
        self.extend_pe(x)
        time = x.size(1)
        scaled = x * self.xscale
        # Relative position 0 sits in the middle row of the table.
        center = self.pe.size(1) // 2
        pos_emb = self.pe[:, center - time + 1 : center + time]
        return self.dropout(scaled), self.dropout(pos_emb)


class RelPositionMultiheadAttention(nn.Module):
    r"""Multi-Head Attention layer with relative position encoding

    See reference: "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context"

    Args:
        embed_dim: total dimension of the model.
        num_heads: parallel attention heads.
        dropout: dropout probability applied to attn_output_weights. Default: 0.0.

    Examples::

        >>> rel_pos_multihead_attn = RelPositionMultiheadAttention(embed_dim, num_heads)
        >>> attn_output, attn_output_weights = rel_pos_multihead_attn(query, key, value, pos_emb)
    """

    def __init__(self, embed_dim: int, num_heads: int, dropout: float = 0.) -> None:
        super(RelPositionMultiheadAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"

        # Packed projection: rows [0, E) give the query, [E, 2E) the key and
        # [2E, 3E) the value.
        self.in_proj = nn.Linear(embed_dim, 3 * embed_dim, bias=True)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=True)

        # linear transformation for positional encoding.
        self.linear_pos = nn.Linear(embed_dim, embed_dim, bias=False)
        # these two learnable bias are used in matrix c and matrix d
        # as described in "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context" Section 3.3
        self.pos_bias_u = nn.Parameter(torch.Tensor(num_heads, self.head_dim))
        self.pos_bias_v = nn.Parameter(torch.Tensor(num_heads, self.head_dim))

        self._reset_parameters()

    def _reset_parameters(self) -> None:
        """Initialise the input projection, both biases and the position biases.

        ``out_proj.weight`` and ``linear_pos.weight`` are not touched here and
        keep whatever ``nn.Linear`` gave them at construction.
        """
        nn.init.xavier_uniform_(self.in_proj.weight)
        nn.init.constant_(self.in_proj.bias, 0.)
        nn.init.constant_(self.out_proj.bias, 0.)

        nn.init.xavier_uniform_(self.pos_bias_u)
        nn.init.xavier_uniform_(self.pos_bias_v)

    def forward(self, query: Tensor, key: Tensor, value: Tensor, pos_emb: Tensor, key_padding_mask: Optional[Tensor] = None,
                need_weights: bool = True, attn_mask: Optional[Tensor] = None) -> Tuple[Tensor, Optional[Tensor]]:
        r"""Run relative-position attention with this module's own parameters.

        Args:
            query, key, value: map a query and a set of key-value pairs to an output.
            pos_emb: Positional embedding tensor
            key_padding_mask: if provided, specified padding elements in the key will
                be ignored by the attention. When given a binary mask and a value is True,
                the corresponding value on the attention layer will be ignored. When given
                a byte mask and a value is non-zero, the corresponding value on the attention
                layer will be ignored
            need_weights: output attn_output_weights.
            attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all
                the batches while a 3D mask allows to specify a different mask for the entries of each batch.

        Shape:
            - Inputs:
            - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
            the embedding dimension.
            - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is
            the embedding dimension.
            - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is
            the embedding dimension.
            - pos_emb: :math:`(N, 2*L-1, E)` or :math:`(1, 2*L-1, E)` where L is the target sequence
            length, N is the batch size, E is the embedding dimension.
            - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length.
            If a ByteTensor is provided, the non-zero positions will be ignored while the zero
            positions will be unchanged. If a BoolTensor is provided, the positions with the
            value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged.
            - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length.
            3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length,
            S is the source sequence length. attn_mask ensures that position i is allowed to attend the unmasked
            positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend
            while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True``
            are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
            is provided, it will be added to the attention weight.

            - Outputs:
            - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size,
            E is the embedding dimension.
            - attn_output_weights: :math:`(N, L, S)` where N is the batch size,
            L is the target sequence length, S is the source sequence length.
            ``None`` when ``need_weights`` is False.
        """
        return self.multi_head_attention_forward(
                query, key, value, pos_emb, self.embed_dim, self.num_heads,
                self.in_proj.weight, self.in_proj.bias,
                self.dropout, self.out_proj.weight, self.out_proj.bias,
                training=self.training,
                key_padding_mask=key_padding_mask, need_weights=need_weights,
                attn_mask=attn_mask)

    def rel_shift(self, x: Tensor) -> Tensor:
        """Compute relative positional encoding.

        Re-indexes the last axis so that entry ``[b, h, i, j]`` of the result
        is ``x[b, h, i, (time1 - 1) - i + j]``. The result is a strided view
        of ``x`` (no data is copied).

        Args:
            x: Input tensor (batch, head, time1, 2*time1-1).
                time1 means the length of query vector.

        Returns:
            Tensor: tensor of shape (batch, head, time1, time2)
          (note: time2 has the same value as time1, but it is for
          the key, while time1 is for the query).
        """
        (batch_size, num_heads, time1, n) = x.shape
        assert n == 2*time1 - 1
        (batch_stride, head_stride, time1_stride, n_stride) = x.stride()
        # ``storage_offset`` in ``as_strided`` is absolute with respect to the
        # underlying storage, so the offset of ``x`` itself has to be added;
        # otherwise a sliced/view input would be read from the wrong place.
        return x.as_strided((batch_size, num_heads, time1, time1),
                            (batch_stride, head_stride, time1_stride - n_stride, n_stride),
                            storage_offset=x.storage_offset() + n_stride*(time1 - 1))

    @staticmethod
    def _in_proj_slice(x: Tensor, weight: Tensor, bias: Optional[Tensor],
                       start: int, end: Optional[int]) -> Tensor:
        """Apply rows ``start:end`` of the packed input projection to ``x``."""
        sliced_bias = bias[start:end] if bias is not None else None
        return nn.functional.linear(x, weight[start:end, :], sliced_bias)

    def multi_head_attention_forward(self, query: Tensor,
                                    key: Tensor,
                                    value: Tensor,
                                    pos_emb: Tensor,
                                    embed_dim_to_check: int,
                                    num_heads: int,
                                    in_proj_weight: Tensor,
                                    in_proj_bias: Tensor,
                                    dropout_p: float,
                                    out_proj_weight: Tensor,
                                    out_proj_bias: Tensor,
                                    training: bool = True,
                                    key_padding_mask: Optional[Tensor] = None,
                                    need_weights: bool = True,
                                    attn_mask: Optional[Tensor] = None,
                                    ) -> Tuple[Tensor, Optional[Tensor]]:
        r"""Functional core of the attention layer.

        Args:
            query, key, value: map a query and a set of key-value pairs to an output.
            pos_emb: Positional embedding tensor
            embed_dim_to_check: total dimension of the model.
            num_heads: parallel attention heads.
            in_proj_weight, in_proj_bias: input projection weight and bias.
            dropout_p: probability of an element to be zeroed.
            out_proj_weight, out_proj_bias: the output projection weight and bias.
            training: apply dropout if is ``True``.
            key_padding_mask: if provided, specified padding elements in the key will
                be ignored by the attention. This is an binary mask. When the value is True,
                the corresponding value on the attention layer will be filled with -inf.
            need_weights: output attn_output_weights.
            attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all
                the batches while a 3D mask allows to specify a different mask for the entries of each batch.

        Shape:
            Inputs:
            - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
            the embedding dimension.
            - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is
            the embedding dimension.
            - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is
            the embedding dimension.
            - pos_emb: :math:`(N, 2*L-1, E)` or :math:`(1, 2*L-1, E)` where L is the target sequence
            length, N is the batch size, E is the embedding dimension.
            - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length.
            If a ByteTensor is provided, the non-zero positions will be ignored while the zero positions
            will be unchanged. If a BoolTensor is provided, the positions with the
            value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged.
            - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length.
            3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length,
            S is the source sequence length. attn_mask ensures that position i is allowed to attend the unmasked
            positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend
            while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True``
            are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
            is provided, it will be added to the attention weight.

            Outputs:
            - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size,
            E is the embedding dimension.
            - attn_output_weights: :math:`(N, L, S)` where N is the batch size,
            L is the target sequence length, S is the source sequence length.
            ``None`` when ``need_weights`` is False.

        Raises:
            RuntimeError: if ``attn_mask`` is neither 2D nor 3D or has an unexpected size.
        """

        tgt_len, bsz, embed_dim = query.size()
        assert embed_dim == embed_dim_to_check
        assert key.size(0) == value.size(0) and key.size(1) == value.size(1)

        head_dim = embed_dim // num_heads
        assert head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
        scaling = float(head_dim) ** -0.5

        # The identity test short-circuits the element-wise comparison in the
        # common case where the caller passes the very same tensor object.
        query_is_key = query is key or torch.equal(query, key)
        key_is_value = key is value or torch.equal(key, value)

        if query_is_key and key_is_value:
            # self-attention: a single packed projection, split in three.
            q, k, v = nn.functional.linear(query, in_proj_weight, in_proj_bias).chunk(3, dim=-1)
        elif key_is_value:
            # encoder-decoder attention: project the query alone, then key and
            # value together.
            q = self._in_proj_slice(query, in_proj_weight, in_proj_bias, 0, embed_dim)
            k, v = self._in_proj_slice(key, in_proj_weight, in_proj_bias, embed_dim, None).chunk(2, dim=-1)
        else:
            # three distinct inputs: project each with its own slice.
            q = self._in_proj_slice(query, in_proj_weight, in_proj_bias, 0, embed_dim)
            k = self._in_proj_slice(key, in_proj_weight, in_proj_bias, embed_dim, embed_dim * 2)
            v = self._in_proj_slice(value, in_proj_weight, in_proj_bias, embed_dim * 2, None)

        q = q * scaling

        if attn_mask is not None:
            assert attn_mask.dtype == torch.float32 or attn_mask.dtype == torch.float64 or \
                attn_mask.dtype == torch.float16 or attn_mask.dtype == torch.uint8 or attn_mask.dtype == torch.bool, \
                'Only float, byte, and bool types are supported for attn_mask, not {}'.format(attn_mask.dtype)
            if attn_mask.dtype == torch.uint8:
                warnings.warn("Byte tensor for attn_mask is deprecated. Use bool tensor instead.")
                attn_mask = attn_mask.to(torch.bool)

            if attn_mask.dim() == 2:
                attn_mask = attn_mask.unsqueeze(0)
                if list(attn_mask.size()) != [1, query.size(0), key.size(0)]:
                    raise RuntimeError('The size of the 2D attn_mask is not correct.')
            elif attn_mask.dim() == 3:
                if list(attn_mask.size()) != [bsz * num_heads, query.size(0), key.size(0)]:
                    raise RuntimeError('The size of the 3D attn_mask is not correct.')
            else:
                raise RuntimeError("attn_mask's dimension {} is not supported".format(attn_mask.dim()))
            # attn_mask's dim is 3 now.

        # convert ByteTensor key_padding_mask to bool
        if key_padding_mask is not None and key_padding_mask.dtype == torch.uint8:
            warnings.warn("Byte tensor for key_padding_mask is deprecated. Use bool tensor instead.")
            key_padding_mask = key_padding_mask.to(torch.bool)

        q = q.contiguous().view(tgt_len, bsz, num_heads, head_dim)
        k = k.contiguous().view(-1, bsz, num_heads, head_dim)
        v = v.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)

        src_len = k.size(0)

        if key_padding_mask is not None:
            assert key_padding_mask.size(0) == bsz, "{} == {}".format(key_padding_mask.size(0), bsz)
            assert key_padding_mask.size(1) == src_len, "{} == {}".format(key_padding_mask.size(1), src_len)

        q = q.transpose(0, 1)  # (batch, time1, head, d_k)

        pos_emb_bsz = pos_emb.size(0)
        assert pos_emb_bsz in (1, bsz)  # actually it is 1
        p = self.linear_pos(pos_emb).view(pos_emb_bsz, -1, num_heads, head_dim)
        p = p.transpose(1, 2)  # (batch, head, 2*time1-1, d_k)

        q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2)  # (batch, head, time1, d_k)
        q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2)  # (batch, head, time1, d_k)

        # compute attention score
        # first compute matrix a and matrix c
        # as described in "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context" Section 3.3
        k = k.permute(1, 2, 3, 0)  # (batch, head, d_k, time2)
        matrix_ac = torch.matmul(q_with_bias_u, k)  # (batch, head, time1, time2)

        # compute matrix b and matrix d
        matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1))  # (batch, head, time1, 2*time1-1)
        matrix_bd = self.rel_shift(matrix_bd)

        attn_output_weights = (matrix_ac + matrix_bd)  # (batch, head, time1, time2)

        attn_output_weights = attn_output_weights.view(bsz * num_heads, tgt_len, -1)

        assert list(attn_output_weights.size()) == [bsz * num_heads, tgt_len, src_len]

        if attn_mask is not None:
            if attn_mask.dtype == torch.bool:
                attn_output_weights.masked_fill_(attn_mask, float('-inf'))
            else:
                attn_output_weights += attn_mask

        if key_padding_mask is not None:
            # Broadcast the (N, S) mask over heads and query positions.
            attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
            attn_output_weights = attn_output_weights.masked_fill(
                key_padding_mask.unsqueeze(1).unsqueeze(2),
                float('-inf'),
            )
            attn_output_weights = attn_output_weights.view(bsz * num_heads, tgt_len, src_len)

        attn_output_weights = nn.functional.softmax(
            attn_output_weights, dim=-1)
        attn_output_weights = nn.functional.dropout(attn_output_weights, p=dropout_p, training=training)

        attn_output = torch.bmm(attn_output_weights, v)
        assert list(attn_output.size()) == [bsz * num_heads, tgt_len, head_dim]
        attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
        attn_output = nn.functional.linear(attn_output, out_proj_weight, out_proj_bias)

        if not need_weights:
            return attn_output, None

        # average attention weights over heads
        attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
        return attn_output, attn_output_weights.sum(dim=1) / num_heads


class ConvolutionModule(nn.Module):
    """Convolution block used inside a Conformer layer.
    Modified from https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/conformer/convolution.py

    Args:
        channels (int): Number of channels entering and leaving the block.
        kernel_size (int): Kernel size of the depthwise convolution; must be odd.
        bias (bool): Whether the convolution layers carry a bias (default=True).

    """

    def __init__(self, channels: int, kernel_size: int, bias: bool = True) -> None:
        """Build the pointwise -> depthwise -> pointwise convolution stack."""
        super().__init__()
        # An odd kernel lets symmetric padding keep the time length unchanged.
        assert kernel_size % 2 == 1
        same_padding = (kernel_size - 1) // 2

        # Doubles the channels; the GLU in forward() halves them again.
        self.pointwise_conv1 = nn.Conv1d(
            channels, 2 * channels, kernel_size=1, stride=1, padding=0, bias=bias
        )
        # groups=channels makes this a per-channel (depthwise) convolution.
        self.depthwise_conv = nn.Conv1d(
            channels,
            channels,
            kernel_size,
            stride=1,
            padding=same_padding,
            groups=channels,
            bias=bias,
        )
        self.norm = nn.BatchNorm1d(channels)
        self.pointwise_conv2 = nn.Conv1d(
            channels, channels, kernel_size=1, stride=1, padding=0, bias=bias
        )
        self.activation = Swish()

    def forward(self, x: Tensor) -> Tensor:
        """Run the convolution block.

        Args:
            x: Input tensor (#time, batch, channels).

        Returns:
            Tensor: Output tensor (#time, batch, channels).

        """
        # Conv1d wants (batch, channels, time).
        feats = x.permute(1, 2, 0)

        # Pointwise expansion followed by the GLU gate over the channel axis.
        gated = nn.functional.glu(self.pointwise_conv1(feats), dim=1)

        # Depthwise convolution, batch norm, activation.
        activated = self.activation(self.norm(self.depthwise_conv(gated)))

        # Final pointwise projection, then back to (time, batch, channels).
        projected = self.pointwise_conv2(activated)
        return projected.permute(2, 0, 1)


class Swish(torch.nn.Module):
    """Swish activation: the input gated by its own sigmoid."""

    def forward(self, x: Tensor) -> Tensor:
        """Return ``x * sigmoid(x)``, computed element-wise."""
        gate = torch.sigmoid(x)
        return x * gate


================================================
FILE: snowfall/models/contextnet.py
================================================
#!/usr/bin/env python3

# Copyright (c)  2021  University of Chinese Academy of Sciences (author: Han Zhu)
# Apache 2.0


import torch
from torch import Tensor, nn
from typing import List
from snowfall.models import AcousticModel
from snowfall.models.conformer import Swish


class ContextNet(AcousticModel):
    """ContextNet. Reference: https://arxiv.org/pdf/2005.03191.pdf

    Args:
        num_features (int): Number of input features
        num_classes (int): Number of output classes
        kernel_size (int): Kernel size of convolution layers (default 3).
        num_blocks (int): Number of context block (default 6).
        num_layers (int): Number of depthwise convolution layers for each
                context block (except first and last block) (default 5).
        conv_out_channels (List[int]): Number of output channels produced by context blocks,
                len(conv_out_channels) = num_blocks (default [*[256] * 2, *[512] * 3, 640]).
        subsampling_layers (List[int]): Indexes of the blocks that use stride 2 (default [1, 3]).
        alpha (float): The factor to scale the output channel of the network (default 1.5).
        dropout (float): Dropout (default 0.1).
    """

    def __init__(
        self,
        num_features: int,
        num_classes: int,
        kernel_size: int = 3,
        num_blocks: int = 6,
        num_layers: int = 5,
        # The list defaults below are only read, never mutated.
        conv_out_channels: List[int] = [*[256] * 2, *[512] * 3, 640],
        subsampling_layers: List[int] = [1, 3],
        alpha: float = 1.5,
        dropout: float = 0.1,
    ):
        super().__init__()

        self.num_features = num_features
        self.num_classes = num_classes

        # Channel sizes seen between blocks: the input features followed by
        # every block's (alpha-scaled) output width.
        conv_channels = [num_features] + \
                [int(channels * alpha) for channels in conv_out_channels]

        strides = [1] * num_blocks
        for layer in subsampling_layers:
            strides[layer] = 2

        # Strides compound multiplicatively, so the overall factor is the
        # product of the per-block strides (4 for the default two blocks).
        subsampling_factor = 1
        for stride in strides:
            subsampling_factor *= stride
        self.subsampling_factor = subsampling_factor

        # First and last blocks have a single conv layer and no residual branch.
        residuals = [False, *[True] * (num_blocks - 2), False]
        blocks_num_layers = [1, *[num_layers] * (num_blocks - 2), 1]

        self.block_list = [
            ContextNetBlock(
                conv_channels[i],
                conv_channels[i+1],
                kernel_size=kernel_size,
                stride=strides[i],
                num_layers=blocks_num_layers[i],
                dropout=dropout,
                residual=residuals[i]
            ) for i in range(num_blocks)]

        # nn.Sequential is what registers the blocks as sub-modules.
        self.blocks = nn.Sequential(*self.block_list)

        self.output_layer = nn.Linear(conv_channels[-1], num_classes)

    def forward(self, x, supervision = None):
        """
        Args:
            x (torch.Tensor): Input tensor (batch, channels, time).
            supervision: Supervision in lhotse format, get from batch['supervisions'].
                        It's not used here, just to keep consistent with transformer.

        Returns:
            A 3-tuple ``(log_probs, None, None)`` where ``log_probs`` is the
            log-softmax output of shape (batch, num_classes, time); the two
            trailing ``None`` values are placeholders.
        """
        # The blocks operate on (batch, time, channels).
        x = x.transpose(1, -1)
        x = self.blocks(x)
        x = self.output_layer(x)
        # Normalise over classes, then go back to (batch, classes, time).
        x = nn.functional.log_softmax(x, dim=-1).transpose(1, -1)
        return x, None, None


class ContextNetBlock(torch.nn.Module):
    """A block in ContextNet.

    The block is a stack of ``ConvModule`` layers followed by an ``SEModule``,
    an optional convolutional residual branch, a Swish activation and dropout.

    Args:
        in_channels (int): Number of input channels of this block.
        out_channels (int): Number of output channels of this block.
        kernel_size (int) : Kernel size of convolution layers (default 3).
        stride (int): Stride of this context block (default 1). It is applied
            by the last convolution layer and by the residual branch only.
        num_layers (int): Number of convolution layers for this context block (default 1).
        dropout (float): Dropout (default 0.1).
        residual (bool): Whether to apply residual connection at this context block (default True).
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int = 3,
        stride: int = 1,
        num_layers: int = 1,
        dropout: float = 0.1,
        residual: bool = True,
    ):
        super().__init__()

        # Only the last layer strides; its padding is reduced accordingly.
        last = num_layers - 1
        self.convs_list = [
            ConvModule(
                in_channels if i == 0 else out_channels,
                out_channels,
                kernel_size=kernel_size,
                stride=stride if i == last else 1,
                padding=kernel_size // 2 - stride + 1 if i == last else kernel_size // 2
            ) for i in range(num_layers)]

        self.convs = nn.Sequential(*self.convs_list)

        self.SE = SEModule(channels=out_channels, kernel_size=kernel_size, padding=kernel_size // 2)

        self.drop = nn.Dropout(dropout)

        if residual:
            # Same stride and padding as the last main-path layer so both
            # branches can be summed; no activation before the sum.
            self.residual = ConvModule(in_channels,
                out_channels,
                kernel_size=kernel_size,
                padding=kernel_size // 2 - stride + 1,
                stride=stride,
                activation=None)
        else:
            self.residual = None

        self.activation = Swish()

    def forward(self, x):
        """
        Args:
            x (torch.Tensor): Input tensor (batch, time, channels).

        Returns:
            torch.Tensor: Output tensor (batch, time, channels).
        """
        out = self.convs(x)
        out = self.SE(out)
        if self.residual is not None:
            out = out + self.residual(x)
        out = self.activation(out)
        out = self.drop(out)
        return out


class SEModule(torch.nn.Module):
    """Squeeze-and-Excitation module.

    A convolution is applied first; its output is then rescaled per channel
    by a gate computed from the time-averaged features.

    Args:
        channels (int): Input and output channels.
        kernel_size (int) : Kernel size of the convolution layer.
        padding (int): Zero-padding added to both sides of the input.
    """

    def __init__(
        self,
        channels: int,
        kernel_size: int,
        padding: int
    ):
        super().__init__()

        self.conv = ConvModule(channels, channels, kernel_size=kernel_size, padding=padding, stride=1)

        # Collapses the time axis to a single averaged frame.
        self.avg_pool = nn.AdaptiveAvgPool1d(1)

        # Squeeze to channels // 8, then expand back to the full width.
        squeezed = channels // 8
        self.bottleneck = nn.Sequential(
            torch.nn.Linear(channels, squeezed),
            Swish(),
            torch.nn.Linear(squeezed, channels),
            Swish(),
        )

        self.final_act = torch.nn.Sigmoid()

    def forward(self, x):
        """Squeeze and excitation

        Args:
            x (torch.Tensor): Input tensor (batch, time, channels).

        Returns:
            torch.Tensor: Output tensor (batch, time, channels).
        """
        _, num_frames, _ = x.shape

        feats = self.conv(x)  # (B, T, C)
        # Pooling works on (B, C, T); go there and come back as (B, 1, C).
        pooled = self.avg_pool(feats.transpose(1, -1)).transpose(1, -1)
        gate = self.final_act(self.bottleneck(pooled))
        # Repeat the per-channel gate over the input's frame count.
        context = gate.repeat(1, num_frames, 1)  # (B, T, C)
        return feats * context


class ConvModule(torch.nn.Module):
    """Separable convolution followed by batch norm and an optional activation.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        kernel_size (int): Size of the convolving kernel.
        stride (int): Stride of the convolution (default 1).
        dilation (int): Spacing between kernel elements (default 1).
        padding (int): Zero-padding added to both sides of the input.
        padding_mode (str): 'zeros', 'reflect', 'replicate' or 'circular' (default 'zeros').
        bias (bool): Forwarded to the separable convolution (default: True).
        activation (object): Callable that builds the activation module; a falsy
            value disables the activation. (default: Swish)
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        dilation: int = 1,
        padding: int = 0,
        padding_mode : str = 'zeros',
        bias: bool = True,
        activation = Swish
    ):
        super().__init__()

        conv_options = dict(
            kernel_size=kernel_size,
            stride=stride,
            dilation=dilation,
            padding=padding,
            padding_mode=padding_mode,
            bias=bias,
        )
        self.conv = SeparableConv1D(in_channels, out_channels, **conv_options)

        self.norm = torch.nn.BatchNorm1d(out_channels)

        # ``activation`` is a factory (class), instantiated here when given.
        self.activation = activation() if activation else None

    def forward(self, x):
        """
        Args:
            x (torch.Tensor): Input tensor (batch, time, channels).

        Returns:
            torch.Tensor: Output tensor (batch, new_time, channels).
        """
        # BatchNorm1d normalises dim 1, so move channels there: (B, C, T).
        feats = self.norm(self.conv(x).transpose(1, -1))
        if self.activation:
            feats = self.activation(feats)
        # Back to (B, T, C).
        return feats.transpose(1, -1)


class SeparableConv1D(nn.Module):
    """Depthwise separable 1D convolution.

    A per-channel (depthwise) convolution is followed by a kernel-size-1
    (pointwise) convolution that mixes channels.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        kernel_size (int): Size of the convolving kernel.
        stride (int): Stride of the convolution (default 1).
        dilation (int): Spacing between kernel elements (default 1).
        padding (int): Zero-padding added to both sides of the input.
        padding_mode (str): 'zeros', 'reflect', 'replicate' or 'circular' (default 'zeros').
        bias (bool): If True, the depthwise convolution has a learnable bias
            (default: True). The pointwise convolution is built with
            ``nn.Conv1d``'s own default for ``bias``.

    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        dilation: int = 1,
        padding: int = 0,
        padding_mode : str = 'zeros',
        bias: bool = True,
    ):
        super().__init__()

        # groups=in_channels: every channel is filtered independently.
        depthwise_options = dict(
            kernel_size=kernel_size,
            stride=stride,
            dilation=dilation,
            padding=padding,
            padding_mode=padding_mode,
            groups=in_channels,
            bias=bias,
        )
        self.depthwise = nn.Conv1d(in_channels, in_channels, **depthwise_options)

        # 1x1 convolution that maps in_channels to out_channels.
        self.pointwise = nn.Conv1d(in_channels, out_channels, kernel_size=1)

    def forward(self, x):
        """
        Args:
            x (torch.Tensor): Input tensor (batch, time, channels).

        Returns:
            torch.Tensor: Output tensor (batch, time, channels).
        """
        channels_first = x.transpose(1, -1)  # (B, C, T)
        mixed = self.pointwise(self.depthwise(channels_first))
        return mixed.transpose(1, -1)  # (B, T, C)

================================================
FILE: snowfall/models/interface.py
================================================
from typing import Optional

from torch import nn
from torch.utils.tensorboard import SummaryWriter


class AcousticModel(nn.Module):
    """
    Interface-style base class for Snowfall acoustic model networks.
    It declares the attributes and methods that every such network
    is expected to expose.
    """

    # Dimension of the input features.
    num_features: int

    # Dimension of the output (e.g. the number of phones or characters
    # in the vocabulary).
    num_classes: int

    # When greater than one, the network's output sequence is this many
    # times shorter than its input sequence.
    subsampling_factor: int

    def write_tensorboard_diagnostics(
            self,
            tb_writer: SummaryWriter,
            global_step: Optional[int] = None
    ):
        """
        Hook for writing model diagnostics to TensorBoard.
        This base implementation logs nothing; subclasses may override it.

        :param tb_writer: a TensorBoard ``SummaryWriter`` instance.
        :param global_step: optional number of total training steps done so far.
        """
        return None


================================================
FILE: snowfall/models/tdnn.py
================================================
from typing import Optional

from torch import Tensor
from torch import nn

# Copyright (c)  2020  Xiaomi Corporation (authors: Daniel Povey, Haowen Qiu)
# Apache 2.0
from torch.utils.tensorboard import SummaryWriter

from snowfall.models import AcousticModel
from snowfall.training.diagnostics import measure_weight_norms


class Tdnn1a(AcousticModel):
    """
    A stack of 1-D convolutions, each followed by ReLU and batch norm,
    topped by a 1x1 convolution that produces one score per class.

    Args:
        num_features (int): Number of input features
        num_classes (int): Number of output classes
        subsampling_factor (int): Stride of the seventh convolution
    """

    def __init__(self, num_features: int, num_classes: int, subsampling_factor: int = 3) -> None:
        super(Tdnn1a, self).__init__()
        self.num_features = num_features
        self.num_classes = num_classes
        self.subsampling_factor = subsampling_factor

        def conv_block(n_in, n_out, width, step, pad):
            # Conv1d -> ReLU -> BatchNorm1d (without learnable affine), in that order.
            return [
                nn.Conv1d(in_channels=n_in,
                          out_channels=n_out,
                          kernel_size=width,
                          stride=step,
                          padding=pad),
                nn.ReLU(inplace=True),
                nn.BatchNorm1d(num_features=n_out, affine=False),
            ]

        # (in_channels, out_channels, kernel_size, stride, padding) of every
        # hidden convolution, listed in the order the layers are stacked.
        hidden_specs = (
            [(num_features, 500, 3, 1, 1)]
            + [(500, 500, 3, 1, 1)] * 5
            + [
                # the only strided layer: this is where subsampling happens
                (500, 500, 3, self.subsampling_factor, 1),
                (500, 500, 3, 1, 1),
                (500, 2000, 3, 1, 1),
                (2000, 2000, 1, 1, 0),
            ]
        )

        layers = []
        for spec in hidden_specs:
            layers.extend(conv_block(*spec))
        # Output projection: no activation or normalisation after it.
        layers.append(nn.Conv1d(in_channels=2000,
                                out_channels=num_classes,
                                kernel_size=1,
                                stride=1,
                                padding=0))
        self.tdnn = nn.Sequential(*layers)

    def forward(self, x: Tensor) -> Tensor:
        r"""
        Args:
            x (torch.Tensor): Tensor of dimension (batch_size, num_features, input_length).

        Returns:
            Tensor: Log-probabilities of dimension (batch_size, number_of_classes, output_length);
            the time axis is shortened by the strided convolution when
            ``subsampling_factor`` is greater than one.
        """
        return nn.functional.log_softmax(self.tdnn(x), dim=1)

    def write_tensorboard_diagnostics(
            self,
            tb_writer: SummaryWriter,
            global_step: Optional[int] = None
    ):
        """
        Log the values returned by ``measure_weight_norms`` for the 'l2'
        and 'linf' norms as two groups of TensorBoard scalars.

        :param tb_writer: a TensorBoard ``SummaryWriter`` instance.
        :param global_step: optional number of total training steps done so far.
        """
        for tag, norm_kind in (('train/weight_l2_norms', 'l2'),
                               ('train/weight_max_norms', 'linf')):
            tb_writer.add_scalars(
                tag,
                measure_weight_norms(self, norm=norm_kind),
                global_step=global_step
            )


================================================
FILE: snowfall/models/tdnn_lstm.py
================================================
from typing import Optional

from torch import Tensor
from torch import nn
from torch.utils.tensorboard import SummaryWriter

from snowfall.models import AcousticModel
from snowfall.training.diagnostics import measure_weight_norms


class TdnnLstm1a(AcousticModel):
    """
    Convolutional front-end, a single LSTM, dropout, and a two-layer 1x1
    convolutional output head.

    Args:
        num_features (int): Number of input features
        num_classes (int): Number of output classes
        subsampling_factor (int): Stride of the seventh convolution
    """

    def __init__(self, num_features: int, num_classes: int, subsampling_factor: int = 3) -> None:
        super().__init__()
        self.num_features = num_features
        self.num_classes = num_classes
        self.subsampling_factor = subsampling_factor

        def conv_block(n_in, n_out, width, step, pad):
            # Conv1d -> ReLU -> BatchNorm1d (without learnable affine), in that order.
            return [
                nn.Conv1d(in_channels=n_in,
                          out_channels=n_out,
                          kernel_size=width,
                          stride=step,
                          padding=pad),
                nn.ReLU(inplace=True),
                nn.BatchNorm1d(num_features=n_out, affine=False),
            ]

        # (in_channels, out_channels, kernel_size, stride, padding) per layer.
        frontend_specs = (
            [(num_features, 500, 3, 1, 1)]
            + [(500, 500, 3, 1, 1)] * 5
            + [
                # the only strided layer: this is where subsampling happens
                (500, 500, 3, self.subsampling_factor, 1),
                (500, 500, 3, 1, 1),
            ]
        )
        frontend = []
        for spec in frontend_specs:
            frontend.extend(conv_block(*spec))
        self.tdnn = nn.Sequential(*frontend)

        self.lstm = nn.LSTM(500, 500)
        self.dropout = nn.Dropout(0.5)

        head = conv_block(500, 2000, 1, 1, 0)
        head.append(nn.Conv1d(in_channels=2000,
                              out_channels=num_classes,
                              kernel_size=1,
                              stride=1,
                              padding=0))
        self.tdnn2 = nn.Sequential(*head)

    def forward(self, x: Tensor) -> Tensor:
        r"""
        Args:
            x (torch.Tensor): Tensor of dimension (batch_size, num_features, input_length).

        Returns:
            Tensor: Log-probabilities of dimension (batch_size, number_of_classes, output_length);
            the time axis is shortened by the strided convolution when
            ``subsampling_factor`` is greater than one.
        """
        feats = self.tdnn(x)
        # (B, F, T) -> (T, B, F), the layout the LSTM is fed with
        seq, _ = self.lstm(feats.permute(2, 0, 1))
        # (T, B, F) -> (B, F, T)
        seq = self.dropout(seq.permute(1, 2, 0))
        return nn.functional.log_softmax(self.tdnn2(seq), dim=1)


class TdnnLstm1b(AcousticModel):
    """
    Three-layer convolutional front-end followed by five single-layer
    LSTMs with batch norm, dropout and skip connections, and a final
    linear projection to the classes.

    Args:
        num_features (int): Number of input features
        num_classes (int): Number of output classes
        subsampling_factor (int): Stride of the third convolution
    """

    def __init__(self, num_features: int, num_classes: int, subsampling_factor: int = 3) -> None:
        super().__init__()
        self.num_features = num_features
        self.num_classes = num_classes
        self.subsampling_factor = subsampling_factor

        # (in_channels, stride) of the three front-end convolutions; only
        # the last one is strided.
        frontend = []
        for n_in, step in ((num_features, 1), (500, 1), (500, self.subsampling_factor)):
            frontend.append(nn.Conv1d(in_channels=n_in,
                                      out_channels=500,
                                      kernel_size=3,
                                      stride=step,
                                      padding=1))
            frontend.append(nn.ReLU(inplace=True))
            frontend.append(nn.BatchNorm1d(num_features=500, affine=False))
        self.tdnn = nn.Sequential(*frontend)

        num_lstm_layers = 5
        self.lstms = nn.ModuleList(
            nn.LSTM(input_size=500, hidden_size=500, num_layers=1)
            for _ in range(num_lstm_layers)
        )
        self.lstm_bnorms = nn.ModuleList(
            nn.BatchNorm1d(num_features=500, affine=False)
            for _ in range(num_lstm_layers)
        )
        self.dropout = nn.Dropout(0.2)
        self.linear = nn.Linear(in_features=500, out_features=self.num_classes)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x (torch.Tensor): Tensor of dimension (batch_size, num_features, input_length).

        Returns:
            Tensor: Log-probabilities of dimension (batch_size, number_of_classes, output_length);
            the time axis is shortened by the strided convolution when
            ``subsampling_factor`` is greater than one.
        """
        # (B, F, T) -> (T, B, F), the layout the LSTMs are fed with
        seq = self.tdnn(x).permute(2, 0, 1)
        for lstm, bnorm in zip(self.lstms, self.lstm_bnorms):
            out, _ = lstm(seq)
            # batch norm wants (B, F, T); go there and come back to (T, B, F)
            out = bnorm(out.permute(1, 2, 0)).permute(2, 0, 1)
            seq = self.dropout(out) + seq  # skip connection
        # (T, B, F) -> (B, T, F) so that the linear layer sees features last,
        # then (B, T, F) -> (B, F, T), the shape expected by Snowfall
        logits = self.linear(seq.transpose(1, 0)).transpose(1, 2)
        return nn.functional.log_softmax(logits, dim=1)

    def write_tensorboard_diagnostics(
            self,
            tb_writer: SummaryWriter,
            global_step: Optional[int] = None
    ):
        """
        Log the values returned by ``measure_weight_norms`` for the 'l2'
        and 'linf' norms as two groups of TensorBoard scalars.

        :param tb_writer: a TensorBoard ``SummaryWriter`` instance.
        :param global_step: optional number of total training steps done so far.
        """
        for tag, norm_kind in (('train/weight_l2_norms', 'l2'),
                               ('train/weight_max_norms', 'linf')):
            tb_writer.add_scalars(
                tag,
                measure_weight_norms(self, norm=norm_kind),
                global_step=global_step
            )


================================================
FILE: snowfall/models/tdnnf.py
================================================
#!/usr/bin/env python3

# Copyright 2021 John's Hopkins University (author: Piotr Żelasko)
# Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
# Apache 2.0
from typing import Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter

from snowfall.models import AcousticModel
from snowfall.training.diagnostics import measure_semiorthogonality, measure_weight_norms

"""
CAUTION! This model is not fully ported from Kaldi. It will converge, but its training
is still unstable and it seems to underperform its Kaldi counterpart.
We expect to improve this going forward.
"""


def tdnnf_optimizer(
        model: nn.Module,
        learning_rate: float = 5e-5,
        momentum: float = 0.9,
        weight_decay: float = 1e-5) -> torch.optim.Optimizer:
    """
    This is an example of an optimizer with parameter/layer-specific learning rates.
    We don't use it by default but it can be helpful in tuning the training of a specific model.

    Every parameter of ``model`` ends up in exactly one parameter group:
    the weight and bias of ``model.output_affine`` get their own groups with
    a reduced learning rate, everything else uses the default settings.

    :param model: a module exposing an ``output_affine`` sub-module with
        ``weight`` and ``bias`` parameters.
    :param learning_rate: base learning rate of the SGD optimizer.
    :param momentum: SGD momentum.
    :param weight_decay: SGD weight decay.
    :return: the configured ``torch.optim.SGD`` instance.
    """
    output_weight = model.output_affine.weight
    output_bias = model.output_affine.bias

    # Keep out of the default group exactly those parameters that receive a
    # dedicated group below. Filtering by a hand-written key list risks
    # dropping a parameter from the optimizer altogether when a listed key
    # has no matching dedicated group.
    dedicated = {id(output_weight), id(output_bias)}
    default_params = [p for p in model.parameters() if id(p) not in dedicated]

    return torch.optim.SGD(
        [
            # Default optimization settings
            {'params': default_params},
            # Output layer may need smaller LR
            {'params': [output_weight], 'lr': learning_rate * 0.5},
            {'params': [output_bias], 'lr': learning_rate * 0.1},
        ],
        lr=learning_rate,
        momentum=momentum,
        weight_decay=weight_decay
    )


class Tdnnf1a(AcousticModel):
    """
    This is a PyTorch implementation of a standard Kaldi TDNN-F model architecture.
    The default configuration is based on the Kaldi nnet3 xconfig below,
    except it doesn't use an LDA transform.
    Note that unlike Kaldi models it does not have a cross-entropy output layer,
    as Snowfall does not support alignments in training at this time.

    .. code-block:

        input dim=43 name=input
        fixed-affine-layer name=lda input=Append(-1,0,1) affine-transform-file=exp/chain_cleaned_1c/tdnn1c_sp/configs/lda.mat
        relu-batchnorm-dropout-layer name=tdnn1 l2-regularize=0.008 dropout-proportion=0.0 dropout-per-dim-continuous=true dim=1024
        tdnnf-layer name=tdnnf2 l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.66 dim=1024 bottleneck-dim=128 time-stride=1
        tdnnf-layer name=tdnnf3 l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.66 dim=1024 bottleneck-dim=128 time-stride=1
        tdnnf-layer name=tdnnf4 l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.66 dim=1024 bottleneck-dim=128 time-stride=1
        tdnnf-layer name=tdnnf5 l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.66 dim=1024 bottleneck-dim=128 time-stride=0
        tdnnf-layer name=tdnnf6 l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.66 dim=1024 bottleneck-dim=128 time-stride=3
        tdnnf-layer name=tdnnf7 l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.66 dim=1024 bottleneck-dim=128 time-stride=3
        tdnnf-layer name=tdnnf8 l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.66 dim=1024 bottleneck-dim=128 time-stride=3
        tdnnf-layer name=tdnnf9 l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.66 dim=1024 bottleneck-dim=128 time-stride=3
        tdnnf-layer name=tdnnf10 l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.66 dim=1024 bottleneck-dim=128 time-stride=3
        tdnnf-layer name=tdnnf11 l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.66 dim=1024 bottleneck-dim=128 time-stride=3
        tdnnf-layer name=tdnnf12 l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.66 dim=1024 bottleneck-dim=128 time-stride=3
        tdnnf-layer name=tdnnf13 l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.66 dim=1024 bottleneck-dim=128 time-stride=3
        linear-component name=prefinal-l dim=256 l2-regularize=0.008 orthonormal-constraint=-1.0
        prefinal-layer name=prefinal-chain input=prefinal-l l2-regularize=0.008 big-dim=1024 small-dim=256
        output-layer name=output include-log-softmax=false dim=3456 l2-regularize=0.002

    Args:
        num_features: dimension of the input features.
        num_classes: dimension of the output.
        hidden_dim: channel dimension of the TDNN/TDNN-F layers.
        bottleneck_dim: bottleneck dimension inside each TDNN-F layer.
        prefinal_bottleneck_dim: bottleneck dimension of the prefinal layers.
        kernel_size_list: kernel size of the first (linear) convolution of
            each TDNN-F layer; one entry per layer.
        subsampling_factor_list: stride of the second (affine) convolution
            of each TDNN-F layer; must be as long as ``kernel_size_list``.
        subsampling_factor: overall subsampling factor; only 3 is supported.
    """

    def __init__(self,
                 num_features,
                 num_classes,
                 hidden_dim=1024,
                 bottleneck_dim=128,
                 prefinal_bottleneck_dim=256,
                 kernel_size_list=(3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3),
                 subsampling_factor_list=(1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1),
                 subsampling_factor=3):
        # NOTE: the list-valued defaults are tuples so that the default
        # objects cannot be mutated and shared between instances; any
        # sequence (list or tuple) is accepted.
        super().__init__()

        self.num_features = num_features
        self.num_classes = num_classes
        self.subsampling_factor = subsampling_factor

        # at present, we support only frame_subsampling_factor to be 3
        assert self.subsampling_factor == 3

        assert len(kernel_size_list) == len(subsampling_factor_list)

        # Toggled by constrain_orthonormal_hook to decide on which forward
        # calls the orthonormal constraint is applied.
        self.ortho_constrain_count = 0

        self.input_batch_norm = nn.BatchNorm1d(num_features=self.num_features, affine=False)

        self.tdnn1 = TDNN(input_dim=self.num_features, hidden_dim=hidden_dim)

        tdnnfs = []
        for kernel_size, layer_stride in zip(kernel_size_list, subsampling_factor_list):
            # The linear conv inside the layer is unpadded and shortens the
            # sequence by (kernel_size - 1) frames; padding the following
            # kernel-size-1 conv by half of that on each side restores the
            # length, so the layer's skip connection lines up. This gives
            # 1 for kernel_size=3 and 0 for kernel_size=1, i.e. the same
            # values as before for the default configuration.
            layer = FactorizedTDNN(dim=hidden_dim,
                                   bottleneck_dim=bottleneck_dim,
                                   kernel_size=kernel_size,
                                   subsampling_factor=layer_stride,
                                   cnn_padding=(kernel_size - 1) // 2)
            tdnnfs.append(layer)

        # tdnnfs requires [N, C, T]
        self.tdnnfs = nn.ModuleList(tdnnfs)

        # prefinal_l affine requires [N, C, T]
        self.prefinal_l = OrthonormalLinear(
            dim=hidden_dim,
            bottleneck_dim=prefinal_bottleneck_dim,
            kernel_size=1)

        # prefinal_chain requires [N, C, T]
        self.prefinal_chain = PrefinalLayer(big_dim=hidden_dim,
                                            small_dim=prefinal_bottleneck_dim)

        # output_affine requires [N, T, C]
        self.output_affine = nn.Linear(in_features=prefinal_bottleneck_dim,
                                       out_features=self.num_classes)

        # Runs before every forward() call of this module.
        self.register_forward_pre_hook(constrain_orthonormal_hook)

    def forward(self, x, dropout=0.):
        """
        Args:
            x: input of shape [batch_size, feat_dim, seq_len] = [N, C, T].
            dropout: value passed as ``dropout`` to the TDNN and TDNN-F layers.

        Returns:
            Log-probabilities of shape [N, C, T], with C == num_classes.
        """
        # input x is of shape: [batch_size, feat_dim, seq_len] = [N, C, T]
        assert x.ndim == 3

        # at this point, x is [N, C, T]
        x = self.input_batch_norm(x)

        # at this point, x is [N, C, T]
        x = self.tdnn1(x, dropout=dropout)

        # tdnnf requires input of shape [N, C, T]
        for layer in self.tdnnfs:
            x = layer(x, dropout=dropout)

        # at this point, x is [N, C, T]
        x = self.prefinal_l(x)

        # at this point, x is [N, C, T]
        nnet_output = self.prefinal_chain(x)
        # at this point, nnet_output is [N, C, T]
        nnet_output = nnet_output.permute(0, 2, 1)
        # at this point, nnet_output is [N, T, C]
        nnet_output = self.output_affine(nnet_output)
        nnet_output = F.log_softmax(nnet_output, dim=2)
        # we return nnet_output [N, C, T]
        nnet_output = nnet_output.permute(0, 2, 1)
        return nnet_output

    def write_tensorboard_diagnostics(self, tb_writer: SummaryWriter, global_step: Optional[int] = None):
        """
        Log the values returned by ``measure_semiorthogonality`` and by
        ``measure_weight_norms`` (for the 'l2' and 'linf' norms) as three
        groups of TensorBoard scalars.

        :param tb_writer: a TensorBoard ``SummaryWriter`` instance.
        :param global_step: optional number of total training steps done so far.
        """
        tb_writer.add_scalars(
            'train/semiorthogonality_score',
            measure_semiorthogonality(self),
            global_step=global_step
        )
        tb_writer.add_scalars(
            'train/weight_l2_norms',
            measure_weight_norms(self, norm='l2'),
            global_step=global_step
        )
        tb_writer.add_scalars(
            'train/weight_max_norms',
            measure_weight_norms(self, norm='linf'),
            global_step=global_step
        )


def constrain_orthonormal_hook(model, unused_x):
    '''
    Forward pre-hook that applies the orthonormal constraint while training.

    ``model.ortho_constrain_count`` is toggled between 0 and 1 on every
    training-mode call; whenever it wraps back to 0 (i.e. on every second
    call), ``constrain_orthonormal()`` is invoked, with gradients disabled,
    on each sub-module that defines such a method. Nothing happens in
    evaluation mode.

    Args:
        model: the module the hook is registered on.
        unused_x: the forward inputs; ignored.
    '''
    if model.training:
        model.ortho_constrain_count = (model.ortho_constrain_count + 1) % 2
        if model.ortho_constrain_count == 0:
            with torch.no_grad():
                for module in model.modules():
                    if hasattr(module, 'constrain_orthonormal'):
                        module.constrain_orthonormal()


def _constrain_orthonormal_internal(M):
    '''
    Take one update step that pushes the 2-D matrix ``M`` towards being a
    scaled semi-orthogonal matrix, and return the updated matrix (``M``
    itself is not modified).

    Refer to
        void ConstrainOrthonormalInternal(BaseFloat scale, CuMatrixBase<BaseFloat> *M)
    from
        https://github.com/kaldi-asr/kaldi/blob/master/src/nnet3/nnet-utils.cc#L982
    Note that we always use the **floating** case, i.e. the target scale is
    estimated from ``M`` instead of being given.

    Args:
        M: a 2-D tensor with no more rows than columns.

    Returns:
        A tensor of the same shape as ``M``.
    '''
    assert M.ndim == 2

    num_rows, num_cols = M.size(0), M.size(1)
    assert num_rows <= num_cols

    # gram = M * M^T
    gram = torch.mm(M, M.t())
    trace_gram = torch.trace(gram)
    trace_gram_sq = torch.trace(torch.mm(gram, gram.t()))

    scale = torch.sqrt(trace_gram_sq / trace_gram)
    scale_sq = scale * scale

    ratio = trace_gram_sq * num_rows / (trace_gram * trace_gram)
    assert ratio > 0.99

    # The further the ratio is from 1, the more cautious the step.
    if ratio > 1.1:
        update_speed = 0.125 * 0.5 * 0.5
    elif ratio > 1.02:
        update_speed = 0.125 * 0.5
    else:
        update_speed = 0.125

    identity = torch.eye(num_rows, dtype=gram.dtype, device=gram.device)
    residual = gram - scale_sq * identity

    alpha = update_speed / scale_sq
    return M - 4 * alpha * torch.mm(residual, M)


class SharedDimScaleDropout(nn.Module):
    '''
    Continuous scaled dropout that is const over chosen dim (usually across time).

    In training mode and for a positive alpha, the input is multiplied by a
    random mask sampled from Uniform([1 - 2 * alpha, 1 + 2 * alpha]); the
    mask has length 1 along ``dim``, so the same factor is used for every
    position along that dimension. Otherwise the input is returned as is.
    '''

    def __init__(self, dim=1):
        '''
        Args:
            dim: the dimension along which the mask is shared.
        '''
        super().__init__()
        self.dim = dim
        # Scalar buffer used as the template from which each mask is
        # created, so a new mask has the buffer's dtype and device.
        self.register_buffer('mask', torch.tensor(0.))

    def forward(self, x, alpha=0.0):
        '''
        Args:
            x: input tensor.
            alpha: dropout strength; no dropout is applied unless it is > 0.

        Returns:
            ``x`` scaled by the random mask when training with alpha > 0,
            otherwise ``x`` itself.
        '''
        if self.training and alpha > 0.:
            # sample the mask with length 1 along self.dim ...
            tied_mask_shape = list(x.shape)
            tied_mask_shape[self.dim] = 1
            mask = self.mask.repeat(tied_mask_shape).uniform_(1 - 2 * alpha, 1 + 2 * alpha)
            # ... and let broadcasting stretch it over that dim; this gives
            # the same values as tiling the mask to the full input size.
            # The expected value of the mask is 1, so there is no need to
            # rescale the output like vanilla dropout does.
            return x * mask
        return x


class OrthonormalLinear(nn.Module):
    '''
    A bias-free 1-D convolution whose weight can be nudged towards a
    semi-orthogonal matrix by calling constrain_orthonormal().
    '''

    def __init__(self, dim, bottleneck_dim, kernel_size):
        '''
        Args:
            dim: number of input channels.
            bottleneck_dim: number of output channels.
            kernel_size: kernel size of the convolution.
        '''
        super().__init__()
        # WARNING(fangjun): kaldi uses [-1, 0] for the first linear layer
        # and [0, 1] for the second affine layer;
        # we use [-1, 0, 1] for the first linear layer if time_stride == 1

        self.kernel_size = kernel_size

        # conv requires [N, C, T]
        self.conv = nn.Conv1d(in_channels=dim,
                              out_channels=bottleneck_dim,
                              kernel_size=kernel_size,
                              bias=False)

    def forward(self, x):
        '''
        Args:
            x: input of shape [batch_size, feat_dim, seq_len] = [N, C, T].

        Returns:
            The output of the convolution.
        '''
        assert x.ndim == 3
        return self.conv(x)

    def constrain_orthonormal(self):
        '''
        Replace the convolution weight by the result of one
        _constrain_orthonormal_internal() step on its 2-D view.
        '''
        params = self.conv.state_dict()
        weight = params['weight']
        # weight is of shape [out_channels, in_channels, kernel_size]
        weight_shape = (weight.size(0), weight.size(1), weight.size(2))

        flat = weight.reshape(weight_shape[0], -1)

        # The constraint wants no more rows than columns; work on the
        # transpose when that is not the case and undo it afterwards.
        transposed = flat.size(0) > flat.size(1)
        if transposed:
            flat = flat.t()

        flat = _constrain_orthonormal_internal(flat)

        if transposed:
            flat = flat.t()

        params['weight'] = flat.reshape(*weight_shape)
        self.conv.load_state_dict(params)


class PrefinalLayer(nn.Module):
    '''
    Affine -> ReLU -> batch norm -> orthonormal-constrained linear ->
    batch norm, mapping small_dim channels to big_dim and back.
    '''

    def __init__(self, big_dim, small_dim):
        '''
        Args:
            big_dim: channel dimension after the affine layer.
            small_dim: channel dimension of the input and of the output.
        '''
        super().__init__()
        self.affine = nn.Linear(in_features=small_dim, out_features=big_dim)
        self.batchnorm1 = nn.BatchNorm1d(num_features=big_dim, affine=False)
        self.linear = OrthonormalLinear(dim=big_dim,
                                        bottleneck_dim=small_dim,
                                        kernel_size=1)
        self.batchnorm2 = nn.BatchNorm1d(num_features=small_dim, affine=False)

    def forward(self, x):
        '''
        Args:
            x: input of shape [N, C, T].

        Returns:
            Output of shape [N, C, T].
        '''
        # nn.Linear acts on the last dim: go to [N, T, C] for it ...
        hidden = F.relu(self.affine(x.permute(0, 2, 1)))
        # ... and come back to [N, C, T] for batch norm and the conv
        hidden = self.batchnorm1(hidden.permute(0, 2, 1))
        return self.batchnorm2(self.linear(hidden))


class TDNN(nn.Module):
    '''
    This class implements the following topology in kaldi:
      relu-batchnorm-dropout-layer name=tdnn1 dropout-per-dim-continuous=true dim=1024

    That is: a kernel-size-1 convolution, ReLU, batch norm and a dropout
    whose mask is shared along dim 2.
    '''

    def __init__(self, input_dim, hidden_dim):
        '''
        Args:
            input_dim: number of input channels.
            hidden_dim: number of output channels.
        '''
        super().__init__()
        # affine conv1d requires [N, C, T]
        self.affine = nn.Conv1d(in_channels=input_dim,
                                out_channels=hidden_dim,
                                kernel_size=1)

        # tdnn1_batchnorm requires [N, C, T]
        self.batchnorm = nn.BatchNorm1d(num_features=hidden_dim,
                                        affine=False)

        self.dropout = SharedDimScaleDropout(dim=2)

    def forward(self, x, dropout=0.):
        '''
        Args:
            x: input of shape [batch_size, feat_dim, seq_len] = [N, C, T].
            dropout: value passed as ``alpha`` to the dropout module.

        Returns:
            Output of shape [N, C, T].
        '''
        activated = F.relu(self.affine(x))
        normalized = self.batchnorm(activated)
        return self.dropout(normalized, alpha=dropout)


class FactorizedTDNN(nn.Module):
    '''
    This class implements the following topology in kaldi:
      tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1
    References:
        - http://danielpovey.com/files/2018_interspeech_tdnnf.pdf
        - ConstrainOrthonormalInternal() from
          https://github.com/kaldi-asr/kaldi/blob/master/src/nnet3/nnet-utils.cc#L982

    The layer is: orthonormal-constrained linear conv -> affine conv ->
    ReLU -> batch norm -> dropout, plus a scaled skip connection from the
    input.
    '''

    def __init__(self,
                 dim,
                 bottleneck_dim,
                 kernel_size,
                 subsampling_factor,
                 bypass_scale=0.66,
                 cnn_padding=1):
        '''
        Args:
            dim: number of input and output channels.
            bottleneck_dim: number of channels between the two convolutions.
            kernel_size: kernel size of the first (linear) convolution.
            subsampling_factor: stride of the second (affine) convolution.
            bypass_scale: weight of the skip connection; |bypass_scale| <= 1.
            cnn_padding: padding of the second (affine) convolution.
        '''
        super().__init__()

        assert abs(bypass_scale) <= 1

        self.bypass_scale = bypass_scale
        self.s = subsampling_factor

        # linear requires [N, C, T]
        self.linear = OrthonormalLinear(dim=dim,
                                        bottleneck_dim=bottleneck_dim,
                                        kernel_size=kernel_size)

        # affine requires [N, C, T]
        # WARNING(fangjun): we do not use nn.Linear here
        # since we want to use `stride`
        self.affine = nn.Conv1d(in_channels=bottleneck_dim,
                                out_channels=dim,
                                kernel_size=1,
                                stride=subsampling_factor,
                                padding=cnn_padding)

        # batchnorm requires [N, C, T]
        self.batchnorm = nn.BatchNorm1d(num_features=dim, affine=False)

        self.dropout = SharedDimScaleDropout(dim=2)

    def forward(self, x, dropout=0.):
        '''
        Args:
            x: input of shape [batch_size, feat_dim, seq_len] = [N, C, T].
            dropout: value passed as ``alpha`` to the dropout module.

        Returns:
            Output of shape [N, C, T].
        '''
        assert x.ndim == 3

        # kept for the skip connection
        bypass = x

        # every intermediate result below is [N, C, T]
        out = self.affine(self.linear(x))
        out = self.batchnorm(F.relu(out))
        out = self.dropout(out, alpha=dropout)

        if not self.linear.kernel_size > 1:
            # no padding is involved here, so the bypass has to be
            # subsampled by hand to match the strided output
            bypass = bypass[:, :, ::self.s]
        # (when kernel_size > 1, padding takes care of keeping the shapes correct)
        return self.bypass_scale * bypass + out


================================================
FILE: snowfall/models/transformer.py
================================================
#!/usr/bin/env python3

# Copyright (c)  2021  University of Chinese Academy of Sciences (author: Han Zhu)
# Apache 2.0

import k2
import math
import torch
from torch import Tensor, nn
from typing import Dict, List, Optional, Tuple

from snowfall.common import get_texts
from snowfall.models import AcousticModel


class Transformer(AcousticModel):
    """Transformer acoustic model with an optional attention decoder.

    Args:
        num_features (int): Number of input features.
        num_classes (int): Number of output classes.
        subsampling_factor (int): Subsampling factor of the encoder front-end
            (the convolution layers before the transformer). Only 4 is supported.
        d_model (int): Attention dimension.
        nhead (int): Number of attention heads.
        dim_feedforward (int): Feed-forward dimension.
        num_encoder_layers (int): Number of encoder layers.
        num_decoder_layers (int): Number of decoder layers. When it is 0 no
            decoder modules are created and ``decoder_criterion`` is None.
        dropout (float): Dropout rate.
        normalize_before (bool): Whether to use layer_norm before the first block.
        vgg_frontend (bool): Whether to use the VGG front-end instead of
            Conv2dSubsampling.

    Raises:
        NotImplementedError: If ``subsampling_factor`` is not 4.
    """

    def __init__(self, num_features: int, num_classes: int, subsampling_factor: int = 4,
                 d_model: int = 256, nhead: int = 4, dim_feedforward: int = 2048,
                 num_encoder_layers: int = 12, num_decoder_layers: int = 6,
                 dropout: float = 0.1, normalize_before: bool = True,
                 vgg_frontend: bool = False) -> None:
        super().__init__()
        self.num_features = num_features
        self.num_classes = num_classes
        self.subsampling_factor = subsampling_factor
        if subsampling_factor != 4:
            raise NotImplementedError("Support only 'subsampling_factor=4'.")

        self.encoder_embed = (VggSubsampling(num_features, d_model) if vgg_frontend else
                              Conv2dSubsampling(num_features, d_model))
        self.encoder_pos = PositionalEncoding(d_model, dropout)

        encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, normalize_before=normalize_before)

        # With pre-norm layers a final LayerNorm is applied after the stack.
        if normalize_before:
            encoder_norm = nn.LayerNorm(d_model)
        else:
            encoder_norm = None

        self.encoder = nn.TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)

        self.encoder_output_layer = nn.Sequential(
            nn.Dropout(p=dropout),
            nn.Linear(d_model, num_classes)
        )

        if num_decoder_layers > 0:
            self.decoder_num_class = self.num_classes + 1  # +1 for the sos/eos symbol

            self.decoder_embed = nn.Embedding(self.decoder_num_class, d_model)
            self.decoder_pos = PositionalEncoding(d_model, dropout)

            decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout, normalize_before=normalize_before)

            if normalize_before:
                decoder_norm = nn.LayerNorm(d_model)
            else:
                decoder_norm = None

            self.decoder = nn.TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm)

            self.decoder_output_layer = torch.nn.Linear(d_model, self.decoder_num_class)

            self.decoder_criterion = LabelSmoothingLoss(self.decoder_num_class)
        else:
            self.decoder_criterion = None

    def forward(self, x: Tensor, supervision: Optional[Dict] = None) -> Tuple[Tensor, Tensor, Optional[Tensor]]:
        """Encode ``x`` and compute the encoder log-probabilities.

        Args:
            x: Tensor of dimension (batch_size, num_features, input_length).
            supervision: Supervision in lhotse format, i.e. batch['supervisions'].

        Returns:
            Tensor: Log-softmax output of dimension
                (batch_size, number_of_classes, subsampled_length).
            Tensor: Encoder output before the linear layer, of dimension
                (subsampled_length, batch_size, d_model).
            Optional[Tensor]: Padding mask of dimension
                (batch_size, subsampled_length), or None when no supervision is given.
        """
        encoder_memory, memory_mask = self.encode(x, supervision)
        x = self.encoder_output(encoder_memory)
        return x, encoder_memory, memory_mask

    def encode(self, x: Tensor, supervisions: Optional[Dict] = None) -> Tuple[Tensor, Optional[Tensor]]:
        """Run the front-end, positional encoding and the encoder stack.

        Args:
            x: Tensor of dimension (batch_size, num_features, input_length).
            supervisions: Supervision in lhotse format, i.e. batch['supervisions'].

        Returns:
            Tensor: Encoder output of dimension (subsampled_length, batch_size, d_model).
            Optional[Tensor]: Padding mask of dimension
                (batch_size, subsampled_length), or None when ``supervisions`` is None.
        """
        x = x.permute(0, 2, 1)  # (B, F, T) -> (B, T, F)

        x = self.encoder_embed(x)
        x = self.encoder_pos(x)
        x = x.permute(1, 0, 2)  # (B, T, F) -> (T, B, F)
        mask = encoder_padding_mask(x.size(0), supervisions)
        # Identity check: `!=` on a Tensor would dispatch to the tensor
        # comparison operator instead of testing for "no mask".
        if mask is not None:
            mask = mask.to(x.device)
        x = self.encoder(x, src_key_padding_mask=mask)  # (T, B, F)

        return x, mask

    def encoder_output(self, x: Tensor) -> Tensor:
        """Project the encoder output to class log-probabilities.

        Args:
            x: Tensor of dimension (input_length, batch_size, d_model).

        Returns:
            Tensor: Log-softmax output of dimension
                (batch_size, number_of_classes, input_length).
        """
        x = self.encoder_output_layer(x).permute(1, 2, 0)  # (T, B, F) ->(B, F, T)
        x = nn.functional.log_softmax(x, dim=1)  # (B, F, T)
        return x

    def decoder_forward(self, x: Tensor, encoder_mask: Tensor, supervision: Dict, graph_compiler: object) -> Tensor:
        """Compute the attention-decoder loss for a batch.

        Args:
            x: Encoder output of dimension (input_length, batch_size, d_model).
            encoder_mask: Mask tensor of dimension (batch_size, input_length).
            supervision: Supervision in lhotse format, i.e. batch['supervisions'].
            graph_compiler: Object providing ``L_inv`` (its labels are words,
                while its aux_labels are phones), ``lexicon.words`` and ``oov``.

        Returns:
            Tensor: Decoder loss.
        """
        batch_text = get_normal_transcripts(supervision, graph_compiler.lexicon.words, graph_compiler.oov)
        # The last decoder class id doubles as both <sos> and <eos>.
        ys_in_pad, ys_out_pad = add_sos_eos(batch_text, graph_compiler.L_inv, self.decoder_num_class - 1,
                                            self.decoder_num_class - 1)
        ys_in_pad = ys_in_pad.to(x.device)
        ys_out_pad = ys_out_pad.to(x.device)

        # Causal mask: each target position may only attend to earlier ones.
        tgt_mask = generate_square_subsequent_mask(ys_in_pad.shape[-1]).to(x.device)

        # NOTE(review): add_sos_eos pads ys_in_pad with the eos id, while
        # decoder_padding_mask compares against its default ignore_id of -1,
        # so this mask does not flag the padded positions. The padded targets
        # in ys_out_pad are -1, which is the padding_idx of the loss.
        tgt_key_padding_mask = decoder_padding_mask(ys_in_pad)

        tgt = self.decoder_embed(ys_in_pad)  # (B, T) -> (B, T, F)
        tgt = self.decoder_pos(tgt)
        tgt = tgt.permute(1, 0, 2)  # (B, T, F) -> (T, B, F)
        pred_pad = self.decoder(tgt=tgt,
                                memory=x,
                                tgt_mask=tgt_mask,
                                tgt_key_padding_mask=tgt_key_padding_mask,
                                memory_key_padding_mask=encoder_mask)  # (T, B, F)
        pred_pad = pred_pad.permute(1, 0, 2)  # (T, B, F) -> (B, T, F)
        pred_pad = self.decoder_output_layer(pred_pad)  # (B, T, F)

        decoder_loss = self.decoder_criterion(pred_pad, ys_out_pad)

        return decoder_loss


class TransformerEncoderLayer(nn.Module):
    """
    Encoder layer adapted from torch.nn.TransformerEncoderLayer, with an extra
    ``normalize_before`` switch that selects pre-norm (layer_norm in front of
    each sub-block) instead of post-norm.

    Args:
        d_model: the number of expected features in the input (required).
        nhead: the number of heads in the multiheadattention models (required).
        dim_feedforward: the dimension of the feedforward network model (default=2048).
        dropout: the dropout value (default=0.1).
        activation: the activation function of intermediate layer, relu or gelu (default=relu).
        normalize_before: whether to use layer_norm before the first block.

    Examples::
        >>> encoder_layer = TransformerEncoderLayer(d_model=512, nhead=8)
        >>> src = torch.rand(10, 32, 512)
        >>> out = encoder_layer(src)
    """

    def __init__(self, d_model: int, nhead: int, dim_feedforward: int = 2048, dropout: float = 0.1,
                 activation: str = "relu", normalize_before: bool = True) -> None:
        super().__init__()
        # The attention module itself runs without dropout; ``dropout`` only
        # configures the nn.Dropout modules below.
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=0.0)

        # Position-wise feed-forward network.
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        # One LayerNorm / Dropout pair per sub-block (attention, feed-forward).
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)

        self.normalize_before = normalize_before

    def __setstate__(self, state):
        # States saved without an 'activation' entry fall back to relu.
        if 'activation' not in state:
            state['activation'] = nn.functional.relu
        super().__setstate__(state)

    def forward(self, src: Tensor, src_mask: Optional[Tensor] = None,
                src_key_padding_mask: Optional[Tensor] = None) -> Tensor:
        """
        Pass the input through the encoder layer.

        Args:
            src: the sequence to the encoder layer (required).
            src_mask: the mask for the src sequence (optional).
            src_key_padding_mask: the mask for the src keys per batch (optional).

        Shape:
            src: (S, N, E).
            src_mask: (S, S).
            src_key_padding_mask: (N, S).
            S is the source sequence length, N is the batch size, E is the feature number
        """
        pre_norm = self.normalize_before

        # Self-attention sub-block with residual connection.
        attn_in = self.norm1(src) if pre_norm else src
        attn_out = self.self_attn(attn_in, attn_in, attn_in, attn_mask=src_mask,
                                  key_padding_mask=src_key_padding_mask)[0]
        src = src + self.dropout1(attn_out)
        if not pre_norm:
            src = self.norm1(src)

        # Feed-forward sub-block with residual connection.
        ff_in = self.norm2(src) if pre_norm else src
        hidden = self.dropout(self.activation(self.linear1(ff_in)))
        src = src + self.dropout2(self.linear2(hidden))
        if not pre_norm:
            src = self.norm2(src)
        return src


class TransformerDecoderLayer(nn.Module):
    """
    Decoder layer adapted from torch.nn.TransformerDecoderLayer, with an extra
    ``normalize_before`` switch that selects pre-norm (layer_norm in front of
    each sub-block) instead of post-norm.

    Args:
        d_model: the number of expected features in the input (required).
        nhead: the number of heads in the multiheadattention models (required).
        dim_feedforward: the dimension of the feedforward network model (default=2048).
        dropout: the dropout value (default=0.1).
        activation: the activation function of intermediate layer, relu or gelu (default=relu).
        normalize_before: whether to use layer_norm before the first block.

    Examples::
        >>> decoder_layer = TransformerDecoderLayer(d_model=512, nhead=8)
        >>> memory = torch.rand(10, 32, 512)
        >>> tgt = torch.rand(20, 32, 512)
        >>> out = decoder_layer(tgt, memory)
    """

    def __init__(self, d_model: int, nhead: int, dim_feedforward: int = 2048, dropout: float = 0.1,
                 activation: str = "relu", normalize_before: bool = True) -> None:
        super().__init__()
        # Both attention modules run without internal dropout; ``dropout``
        # only configures the nn.Dropout modules below.
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=0.0)
        self.src_attn = nn.MultiheadAttention(d_model, nhead, dropout=0.0)

        # Position-wise feed-forward network.
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        # One LayerNorm / Dropout pair per sub-block
        # (self-attention, source attention, feed-forward).
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)

        self.normalize_before = normalize_before

    def __setstate__(self, state):
        # States saved without an 'activation' entry fall back to relu.
        if 'activation' not in state:
            state['activation'] = nn.functional.relu
        super().__setstate__(state)

    def forward(self, tgt: Tensor, memory: Tensor, tgt_mask: Optional[Tensor] = None,
                memory_mask: Optional[Tensor] = None,
                tgt_key_padding_mask: Optional[Tensor] = None,
                memory_key_padding_mask: Optional[Tensor] = None) -> Tensor:
        """Pass the inputs (and mask) through the decoder layer.

        Args:
            tgt: the sequence to the decoder layer (required).
            memory: the sequence from the last layer of the encoder (required).
            tgt_mask: the mask for the tgt sequence (optional).
            memory_mask: the mask for the memory sequence (optional).
            tgt_key_padding_mask: the mask for the tgt keys per batch (optional).
            memory_key_padding_mask: the mask for the memory keys per batch (optional).

        Shape:
            tgt: (T, N, E).
            memory: (S, N, E).
            tgt_mask: (T, T).
            memory_mask: (T, S).
            tgt_key_padding_mask: (N, T).
            memory_key_padding_mask: (N, S).
            S is the source sequence length, T is the target sequence length, N is the batch size, E is the feature number
        """
        pre_norm = self.normalize_before

        # Self-attention over the target sequence.
        query = self.norm1(tgt) if pre_norm else tgt
        self_out = self.self_attn(query, query, query, attn_mask=tgt_mask,
                                  key_padding_mask=tgt_key_padding_mask)[0]
        tgt = tgt + self.dropout1(self_out)
        if not pre_norm:
            tgt = self.norm1(tgt)

        # Attention from the target onto the encoder memory.
        query = self.norm2(tgt) if pre_norm else tgt
        src_out = self.src_attn(query, memory, memory, attn_mask=memory_mask,
                                key_padding_mask=memory_key_padding_mask)[0]
        tgt = tgt + self.dropout2(src_out)
        if not pre_norm:
            tgt = self.norm2(tgt)

        # Feed-forward sub-block.
        ff_in = self.norm3(tgt) if pre_norm else tgt
        hidden = self.dropout(self.activation(self.linear1(ff_in)))
        tgt = tgt + self.dropout3(self.linear2(hidden))
        if not pre_norm:
            tgt = self.norm3(tgt)
        return tgt


def _get_activation_fn(activation: str):
    if activation == "relu":
        return nn.functional.relu
    elif activation == "gelu":
        return nn.functional.gelu

    raise RuntimeError("activation should be relu/gelu, not {}".format(activation))


class Conv2dSubsampling(nn.Module):
    """Convolutional 2D subsampling (to roughly 1/4 length).
        Modified from https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/transformer/subsampling.py

    Two unpadded 3x3 convolutions with stride 2 are applied over the
    (time, feature) plane, followed by a linear projection to ``odim``.

    Args:
        idim: Input dimension.
        odim: Output dimension.

    """

    def __init__(self, idim: int, odim: int) -> None:
        """Construct a Conv2dSubsampling object."""
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=odim, kernel_size=3, stride=2),
            nn.ReLU(),
            nn.Conv2d(in_channels=odim, out_channels=odim, kernel_size=3, stride=2),
            nn.ReLU(),
        )
        # Number of feature bins left after the two stride-2 convolutions.
        reduced_feats = ((idim - 1) // 2 - 1) // 2
        self.out = nn.Linear(odim * reduced_feats, odim)

    def forward(self, x: Tensor) -> Tensor:
        """Subsample x.

        Args:
            x: Input tensor of dimension (batch_size, input_length, num_features). (#batch, time, idim).

        Returns:
            torch.Tensor: Subsampled tensor of dimension (batch_size, input_length', d_model),
                where input_length' == ((input_length - 1) // 2 - 1) // 2.

        """
        feats = self.conv(x.unsqueeze(1))  # (b, 1, t, f) -> (b, c, t', f')
        batch, channels, frames, bins = feats.size()
        # Fold channels and feature bins together for each frame.
        flat = feats.transpose(1, 2).reshape(batch, frames, channels * bins)
        return self.out(flat)


class VggSubsampling(nn.Module):
    """VGG-style front-end loosely following https://arxiv.org/pdf/1910.09799.pdf
       The paper is not fully explicit, so parts of the setup are a best
       guess informed by other VGG implementations.

    Args:
        idim: Input dimension.
        odim: Output dimension.

    """

    def __init__(self, idim: int, odim: int) -> None:
        """Construct a VggSubsampling object.   This uses 2 VGG blocks with 2
           Conv2d layers each, subsampling its input by a factor of 4 in the
           time dimensions.

           Args:
             idim:  Number of features at input, e.g. 40 or 80 for MFCC
                    (will be treated as the image height).
             odim:  Output dimension (number of features), e.g. 256
        """
        super().__init__()

        block_dims = [32, 64]

        # padding=1 on the first convolution, padding=0 on the second one and
        # on the max-pooling, together with ceil_mode=True, were chosen for
        # back-compatibility: the number of output frames then equals
        #  (((T-1)//2)-1)//2.
        # Using padding=1 on the second convolution as well would give T//4.
        in_channels = 1
        stack = []
        for out_channels in block_dims:
            stack += [
                torch.nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
                                kernel_size=3, padding=1, stride=1),
                torch.nn.ReLU(),
                torch.nn.Conv2d(in_channels=out_channels, out_channels=out_channels,
                                kernel_size=3, padding=0, stride=1),
                torch.nn.MaxPool2d(kernel_size=2, stride=2,
                                   padding=0, ceil_mode=True),
            ]
            in_channels = out_channels

        self.layers = nn.Sequential(*stack)

        # Feature bins shrink by the same rule as the frames.
        reduced_feats = ((idim - 1) // 2 - 1) // 2
        self.out = nn.Linear(block_dims[-1] * reduced_feats, odim)

    def forward(self, x: Tensor) -> Tensor:
        """Subsample x.

        Args:
            x: Input tensor of dimension (batch_size, input_length, num_features). (#batch, time, idim).

        Returns:
           torch.Tensor: Subsampled tensor of dimension (batch_size, input_length', d_model).
              where input_length' == (((input_length - 1) // 2) - 1) // 2

        """
        feats = self.layers(x.unsqueeze(1))  # (b, 1, t, f) -> (b, c, t', f')
        batch, channels, frames, bins = feats.size()
        # Fold channels and feature bins together for each frame.
        flat = feats.transpose(1, 2).reshape(batch, frames, channels * bins)
        return self.out(flat)


class PositionalEncoding(nn.Module):
    """
    Sinusoidal positional encoding.

    The input is scaled by sqrt(d_model), the sin/cos position table is
    added, and dropout is applied to the sum.

    Args:
        d_model: Embedding dimension.
        dropout: Dropout rate.
        max_len: Number of positions pre-computed at construction time; the
            table grows on demand when a longer input is seen.

    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000) -> None:
        """Construct a PositionalEncoding object."""
        super().__init__()
        self.d_model = d_model
        self.xscale = math.sqrt(self.d_model)
        self.dropout = nn.Dropout(p=dropout)
        self.pe = None
        # A dummy (1, max_len) tensor is enough to size the initial table.
        self.extend_pe(torch.tensor(0.0).expand(1, max_len))

    def extend_pe(self, x: Tensor) -> None:
        """Make sure ``self.pe`` covers ``x.size(1)`` positions and matches x's dtype/device."""
        needed = x.size(1)
        table = self.pe
        if table is not None and table.size(1) >= needed:
            # Long enough already: at most move/cast the cached table.
            if table.dtype != x.dtype or table.device != x.device:
                self.pe = table.to(dtype=x.dtype, device=x.device)
            return

        position = torch.arange(0, needed, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, self.d_model, 2, dtype=torch.float32)
            * -(math.log(10000.0) / self.d_model)
        )
        table = torch.zeros(needed, self.d_model)
        # Even columns hold the sine terms, odd columns the cosine terms.
        table[:, 0::2] = torch.sin(position * div_term)
        table[:, 1::2] = torch.cos(position * div_term)
        self.pe = table.unsqueeze(0).to(device=x.device, dtype=x.dtype)

    def forward(self, x: Tensor) -> Tensor:
        """
        Add positional encoding.

        Args:
            x: Input tensor of dimension (batch_size, input_length, d_model).

        Returns:
            torch.Tensor: Encoded tensor of dimension (batch_size, input_length, d_model).

        """
        self.extend_pe(x)
        encoded = x * self.xscale + self.pe[:, : x.size(1)]
        return self.dropout(encoded)


class Noam(object):
    """
    Implements Noam optimizer. Proposed in "Attention Is All You Need", https://arxiv.org/pdf/1706.03762.pdf
    Modified from https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/transformer/optimizer.py

    Wraps a ``torch.optim.Adam`` instance and overwrites the learning rate of
    every parameter group on each call to :meth:`step`.

    Args:
        params (iterable): iterable of parameters to optimize or dicts defining parameter groups
        model_size: attention dimension of the transformer model
        factor: learning rate factor
        warm_step: warmup steps
        weight_decay: weight decay forwarded to Adam
    """

    def __init__(self, params, model_size: int = 256, factor: float = 10.0, warm_step: int = 25000, weight_decay=0) -> None:
        """Construct a Noam object."""
        self.optimizer = torch.optim.Adam(params, lr=0, betas=(0.9, 0.98), eps=1e-9, weight_decay=weight_decay)
        self._step = 0
        self.warmup = warm_step
        self.factor = factor
        self.model_size = model_size
        self._rate = 0

    @property
    def param_groups(self):
        """Return param_groups."""
        return self.optimizer.param_groups

    def step(self):
        """Update parameters and rate."""
        self._step += 1
        rate = self.rate()
        for p in self.optimizer.param_groups:
            p["lr"] = rate
        self._rate = rate
        self.optimizer.step()

    def rate(self, step=None):
        """Return the learning rate for ``step`` (defaults to the current step).

        The rate is
        ``factor * model_size ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)``.
        At step 0 (i.e. before the first call to :meth:`step`) the rate is
        0.0, which matches the initial ``lr=0`` of the wrapped optimizer.
        """
        if step is None:
            step = self._step
        if step == 0:
            # ``0 ** -0.5`` raises ZeroDivisionError; the warmup term
            # ``step * warmup ** -1.5`` is 0 here, so the rate is 0.
            return 0.0
        return (
                self.factor
                * self.model_size ** (-0.5)
                * min(step ** (-0.5), step * self.warmup ** (-1.5))
        )

    def zero_grad(self):
        """Reset gradient."""
        self.optimizer.zero_grad()

    def state_dict(self):
        """Return state_dict."""
        return {
            "_step": self._step,
            "warmup": self.warmup,
            "factor": self.factor,
            "model_size": self.model_size,
            "_rate": self._rate,
            "optimizer": self.optimizer.state_dict(),
        }

    def load_state_dict(self, state_dict):
        """Load state_dict."""
        for key, value in state_dict.items():
            if key == "optimizer":
                # The wrapped Adam restores its own state.
                self.optimizer.load_state_dict(state_dict["optimizer"])
            else:
                # Everything else is a plain attribute of this wrapper.
                setattr(self, key, value)


class LabelSmoothingLoss(nn.Module):
    """
    Label-smoothing loss. Minimizes the KL-divergence between the smoothed
    ground-truth distribution q(w) and the model distribution p(w).
    Modified from https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/transformer/label_smoothing_loss.py

    Args:
        size: the number of class
        padding_idx: ignored class id
        smoothing: smoothing rate; the constructor asserts 0.0 < smoothing <= 1.0
        normalize_length: normalize loss by the number of non-padded targets
            if True, otherwise by the batch size
        criterion: loss function to be smoothed
    """

    def __init__(
            self,
            size: int,
            padding_idx: int = -1,
            smoothing: float = 0.1,
            normalize_length: bool = False,
            criterion: nn.Module = nn.KLDivLoss(reduction="none"),
    ) -> None:
        """Construct a LabelSmoothingLoss object."""
        super().__init__()
        assert 0.0 < smoothing <= 1.0
        self.criterion = criterion
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.size = size
        self.true_dist = None
        self.normalize_length = normalize_length

    def forward(self, x: Tensor, target: Tensor) -> Tensor:
        """
        Compute loss between x and target.

        Args:
            x: prediction of dimension (batch_size, input_length, number_of_classes).
            target: target masked with self.padding_idx, of dimension (batch_size, input_length).

        Returns:
            torch.Tensor: scalar float value
        """
        assert x.size(2) == self.size
        batch_size = x.size(0)
        logits = x.view(-1, self.size)
        flat_target = target.view(-1)

        with torch.no_grad():
            # Every class gets the smoothing mass, the reference class the confidence.
            smoothed = torch.full_like(logits, self.smoothing / (self.size - 1))
            is_pad = flat_target == self.padding_idx  # (B * T,)
            num_valid = len(flat_target) - is_pad.sum().item()
            safe_target = flat_target.masked_fill(is_pad, 0)  # avoid -1 index
            smoothed.scatter_(1, safe_target.unsqueeze(1), self.confidence)

        kl = self.criterion(torch.log_softmax(logits, dim=1), smoothed)
        # Padded positions contribute nothing to the loss.
        kl = kl.masked_fill(is_pad.unsqueeze(1), 0)
        if self.normalize_length:
            denom = num_valid
        else:
            denom = batch_size
        return kl.sum() / denom


def encoder_padding_mask(max_len: int, supervisions: Optional[Dict] = None) -> Optional[Tensor]:
    """Make mask tensor containing indices of padded part.

    The length of each sequence is taken as ``start_frame + num_frames`` of
    its supervision (the last supervision wins when a sequence has several)
    and then reduced with ``((n - 1) // 2 - 1) // 2``, the same rule the
    subsampling front-ends in this file document for their output length.

    Args:
        max_len: maximum length of the (subsampled) input features
        supervisions: Supervision in lhotse format, i.e., batch['supervisions']

    Returns:
        Tensor: Boolean mask of dimension (batch_size, max_len); True denotes
            the masked (padded) indices. None when ``supervisions`` is None.
    """
    if supervisions is None:
        return None

    supervision_segments = torch.stack(
        (supervisions['sequence_idx'],
         supervisions['start_frame'],
         supervisions['num_frames']), 1).to(torch.int32)

    # Work with plain Python ints from here on; sequences without any
    # supervision keep length 0 and end up fully masked.
    batch_size = int(supervision_segments[:, 0].max()) + 1
    lengths = [0] * batch_size
    for sequence_idx, start_frame, num_frames in supervision_segments.tolist():
        lengths[sequence_idx] = start_frame + num_frames

    lengths = [((n - 1) // 2 - 1) // 2 for n in lengths]

    seq_range = torch.arange(0, max_len, dtype=torch.int64).unsqueeze(0)  # (1, max_len)
    seq_lengths = torch.tensor(lengths, dtype=torch.int64).unsqueeze(-1)  # (batch_size, 1)
    # Broadcasting yields (batch_size, max_len): True at and beyond each length.
    mask = seq_range >= seq_lengths

    return mask


def decoder_padding_mask(ys_pad: Tensor, ignore_id: int = -1) -> Tensor:
    """Build a padding mask for a padded batch of label sequences.

    Positions equal to ``ignore_id`` are True (masked); all other positions
    are False (unmasked).

    Args:
        ys_pad: padded tensor of dimension (batch_size, input_length).
        ignore_id: the padding value used in ys_pad.

    Returns:
        Tensor: a mask tensor of dimension (batch_size, input_length).
    """
    return ys_pad == ignore_id


def get_normal_transcripts(supervision: Dict, words: k2.SymbolTable, oov: str = '<UNK>') -> List[List[int]]:
    """Build one word-ID transcript per input recording from a lhotse batch.

    Supervisions sharing a ``sequence_idx`` are concatenated in the order
    they appear; tokens missing from ``words`` are replaced by ``oov``.

    Args:
        supervision: Supervision in lhotse format, i.e., batch['supervisions']
        words: The word symbol table.
        oov: Out of vocabulary word.

    Returns:
        List[List[int]]: List of concatenated transcripts, length is batch_size
    """
    # Word IDs per supervision, with unknown tokens mapped to the OOV symbol.
    per_supervision_ids = [
        [words[token if token in words else oov] for token in text.split(' ')]
        for text in supervision['text']
    ]

    sequence_indices = supervision['sequence_idx']
    batch_text = [[] for _ in range(int(max(sequence_indices)) + 1)]
    for sequence_idx, ids in zip(sequence_indices, per_supervision_ids):
        batch_text[sequence_idx].extend(ids)
    return batch_text


def generate_square_subsequent_mask(sz: int) -> Tensor:
    """Build the additive causal mask for a sequence of length ``sz``.

    Entries on and below the diagonal are float(0.0) (visible); entries
    above the diagonal are float('-inf') (masked).

    Args:
        sz: mask size

    Returns:
        Tensor: a square mask of dimension (sz, sz)
    """
    # True where a position is allowed to look: itself and everything before it.
    visible = torch.ones(sz, sz).tril() == 1
    return torch.zeros(sz, sz).masked_fill(~visible, float('-inf'))


def add_sos_eos(ys: List[List[int]], lexicon: k2.Fsa, sos: int, eos: int, ignore_id: int = -1) -> Tuple[Tensor, Tensor]:
    """Create padded decoder inputs and targets with <sos>/<eos> attached.

    The transcripts are first passed through get_hierarchical_targets with
    ``lexicon``. Inputs are then prefixed with ``sos`` and padded with
    ``eos``; targets are suffixed with ``eos`` and padded with ``ignore_id``.

    Args:
        ys: batch of unpadded target sequences
        lexicon: Its labels are words, while its aux_labels are phones.
        sos: index of <sos>
        eos: index of <eos>
        ignore_id: index of padding

    Returns:
        Tensor: Input of transformer decoder. Padded tensor of dimension (batch_size, max_length).
        Tensor: Output of transformer decoder. Padded tensor of dimension (batch_size, max_length).
    """
    sos_token = torch.tensor([sos])
    eos_token = torch.tensor([eos])
    targets = get_hierarchical_targets(ys, lexicon)

    decoder_inputs = [torch.cat([sos_token, target], dim=0) for target in targets]
    decoder_outputs = [torch.cat([target, eos_token], dim=0) for target in targets]

    padded_inputs = pad_list(decoder_inputs, eos)
    padded_outputs = pad_list(decoder_outputs, ignore_id)
    return padded_inputs, padded_outputs


def pad_list(ys: List[Tensor], pad_value: float) -> Tensor:
    """Stack variable-length tensors into one batch, padding the tail of each.

    The result takes its dtype and device from the first tensor in ``ys``.

    Args:
        ys: List of tensors. len(ys) = batch_size.
        pad_value: Value for padding.

    Returns:
        Tensor: Padded tensor (batch_size, max_length, `*`).

    Examples:
        >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)]
        >>> x
        [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])]
        >>> pad_list(x, 0)
        tensor([[1., 1., 1., 1.],
                [1., 1., 0., 0.],
                [1., 0., 0., 0.]])

    """
    first = ys[0]
    longest = max(seq.size(0) for seq in ys)
    # Start from a buffer full of pad_value, then copy each sequence in.
    padded = first.new_full((len(ys), longest, *first.size()[1:]), pad_value)

    for row, seq in enumerate(ys):
        padded[row, : seq.size(0)] = seq

    return padded


def get_hierarchical_targets(ys: List[List[int]], lexicon: k2.Fsa) -> List[Tensor]:
    """Get hierarchical transcripts (i.e., phone level transcripts) from transcripts (i.e., word level transcripts).

    Args:
        ys: Word level transcripts.
        lexicon: Its labels are words, while its aux_labels are phones.
            When None, the IDs in ``ys`` are returned unchanged (as tensors).

    Returns:
        List[Tensor]: Phone level transcripts, one 1-D tensor per input
            transcript.

    """
    if lexicon is None:
        # No lexicon to map through: keep the IDs, but still return tensors
        # so the result matches the declared type and can be concatenated
        # with the <sos>/<eos> tensors in add_sos_eos.
        return [torch.tensor(y, dtype=torch.int64) for y in ys]

    L_inv = lexicon

    n_batch = len(ys)
    indices = torch.tensor(range(n_batch))
    device = L_inv.device

    # One linear FSA per transcript, bundled into a single FsaVec.
    transcripts = k2.create_fsa_vec([k2.linear_fsa(x, device=device) for x in ys])
    transcripts_with_self_loops = k2.add_epsilon_self_loops(transcripts)

    transcripts_lexicon = k2.intersect(
        L_inv, transcripts_with_self_loops,
        treat_epsilons_specially=False)
    # Don't call invert_() above because we want to return phone IDs,
    # which is the `aux_labels` of transcripts_lexicon
    transcripts_lexicon = k2.remove_epsilon(transcripts_lexicon)
    transcripts_lexicon = k2.top_sort(transcripts_lexicon)

    transcripts_lexicon = k2.shortest_path(transcripts_lexicon, use_double_scores=True)

    ys = get_texts(transcripts_lexicon, indices)
    ys = [torch.tensor(y) for y in ys]

    return ys


def test_transformer():
    """Smoke test: a default Transformer yields log-probs of the subsampled shape."""
    num_features, num_classes = 40, 1281
    batch_size, num_frames = 31, 200

    model = Transformer(num_features, num_classes)
    features = torch.rand(batch_size, num_features, num_frames)
    log_probs, _, _ = model(features)

    expected_frames = (((num_frames - 1) // 2) - 1) // 2
    assert log_probs.shape == (batch_size, num_classes, expected_frames)

def main():
    """Script entry point: run the Transformer smoke test."""
    test_transformer()


# Run the smoke test only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()


================================================
FILE: snowfall/objectives/__init__.py
================================================
from .common import encode_supervisions
from .ctc import CTCLoss
from .mmi import LFMMILoss


================================================
FILE: snowfall/objectives/common.py
================================================
import math
import torch
from torch import Tensor
from typing import Dict, List, Tuple


def encode_supervisions(supervisions: Dict[str, Tensor]) -> Tuple[Tensor, List[str]]:
    """
    Turn Lhotse's ``batch["supervisions"]`` dict into a supervision tensor
    plus the matching list of transcription strings.

    The returned tensor is int32 with shape ``(batch_size, 3)``; its columns
    are sequence index [0], start frame [1] and number of frames [2].
    Start frames and frame counts are mapped through ``((x - 1) // 2 - 1) // 2``
    (mimicking subsampling by a factor of 4 with a Conv1D layer without
    padding) and all entries are clamped to be non-negative.

    Rows are re-ordered by decreasing number of frames; the returned texts are
    re-ordered the same way, so both outputs stay consistent with each other.
    """
    def subsample(frames: Tensor) -> Tensor:
        # Two stride-2 reductions without padding.
        return ((frames - 1) // 2 - 1) // 2

    columns = (
        supervisions['sequence_idx'],
        subsample(supervisions['start_frame']),
        subsample(supervisions['num_frames']),
    )
    segments = torch.stack(columns, 1).to(torch.int32)
    segments = segments.clamp(min=0)

    # Longest segment first.
    order = torch.argsort(segments[:, 2], descending=True)

    all_texts = supervisions['text']
    reordered_texts = [all_texts[idx] for idx in order]
    return segments[order], reordered_texts


def get_tot_objf_and_num_frames(
        tot_scores: Tensor,
        frames_per_seq: Tensor
    ) -> Tuple[torch.Tensor, int, int]:
    """Sum the total scores (log-probs) of all successful supervision segments,
    i.e. those whose total score is not -infinity, and count the frames of
    neural net output involved.

    Args:
        tot_scores: tensor of shape (num_segments,) with the total scores
            from forward-backward.
        frames_per_seq: tensor of shape (num_segments,) with the number of
            frames of each segment.

    Returns:
        A tuple of 3 scalar tensors ``(tot_score, ok_frames, all_frames)``:
        the summed score of the successful segments, the number of frames in
        successful segments, and the number of frames in all segments
        (successful or not).
    """
    succeeded = tot_scores != -math.inf
    # Indexes of the successful segments, e.g. [ 0 1 3 4 5 ]
    ok_indexes = succeeded.nonzero().squeeze(1)

    tot_score = tot_scores[ok_indexes].sum()
    ok_frames = frames_per_seq[ok_indexes].sum()
    all_frames = frames_per_seq.sum()
    return tot_score, ok_frames, all_frames


================================================
FILE: snowfall/objectives/ctc.py
================================================
from typing import List, Tuple

import torch
from torch import nn

import k2

from snowfall.objectives.common import get_tot_objf_and_num_frames
from snowfall.training.ctc_graph import CtcTrainingGraphCompiler


class CTCLoss(nn.Module):
    """
    Connectionist Temporal Classification (CTC) loss computed with k2.

    The training graphs produced by ``graph_compiler`` are intersected with
    the network output and the resulting log-semiring total scores are
    accumulated over the supervision segments.
    """
    def __init__(
            self,
            graph_compiler: CtcTrainingGraphCompiler,
    ):
        """
        Args:
          graph_compiler:
            Compiles transcripts into the training graphs used in `forward`.
        """
        super().__init__()
        self.graph_compiler = graph_compiler

    def forward(
            self,
            nnet_output: torch.Tensor,
            texts: List,
            supervision_segments: torch.Tensor
    ) -> Tuple[torch.Tensor, int, int]:
        """
        Args:
          nnet_output:
            Network output, passed to :class:`k2.DenseFsaVec`.
          texts:
            Transcripts handed to the graph compiler.
          supervision_segments:
            2-D tensor passed to :class:`k2.DenseFsaVec`; column 2 is used
            as the number of frames of each segment.
        Returns:
          ``(tot_score, tot_frames, all_frames)`` as produced by
          :func:`get_tot_objf_and_num_frames`.
        """
        graphs = self.graph_compiler.compile(texts).to(nnet_output.device)
        posteriors = k2.DenseFsaVec(nnet_output, supervision_segments)

        lattices = k2.intersect_dense(graphs, posteriors, output_beam=10.0)
        scores = lattices.get_tot_scores(log_semiring=True,
                                         use_double_scores=True)

        frames_per_segment = supervision_segments[:, 2]
        return get_tot_objf_and_num_frames(scores, frames_per_segment)


================================================
FILE: snowfall/objectives/mmi.py
================================================
from typing import List, Tuple

import torch
from torch import nn

import k2

from snowfall.objectives.common import get_tot_objf_and_num_frames
from snowfall.training.mmi_graph import MmiTrainingGraphCompiler


def _compute_mmi_loss_exact_optimized(
        nnet_output: torch.Tensor,
        texts: List[str],
        supervision_segments: torch.Tensor,
        graph_compiler: MmiTrainingGraphCompiler,
        P: k2.Fsa,
        den_scale: float = 1.0
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    '''
    Compute the MMI objective with a single call to k2.intersect_dense.

    The function name contains `exact`, which means it uses a version of
    intersection without pruning.

    `optimized` in the function name means this function is optimized
    in that it calls k2.intersect_dense only once: numerator and denominator
    graphs are interleaved into one FsaVec and mapped onto the dense FSAs
    through `a_to_b_map`.

    Note:
      It is faster at the cost of using more memory.

    Args:
      nnet_output:
        A 3-D tensor of shape [N, T, C]
      texts:
        The transcript. Each element consists of space(s) separated words.
      supervision_segments:
        A 2-D tensor that will be passed to :func:`k2.DenseFsaVec`.
        Its column 2 is used as the number of frames per segment.
      graph_compiler:
        Used to build num_graphs and den_graphs
      P:
        Represents a bigram Fsa.
      den_scale:
        The scale applied to the denominator tot_scores.

    Returns:
      The tuple (tot_score, tot_frames, all_frames) computed by
      :func:`get_tot_objf_and_num_frames` from
      `num_tot_scores - den_scale * den_tot_scores`.
    '''

    # replicate_den=False: a single denominator FSA is shared by all segments.
    num_graphs, den_graphs = graph_compiler.compile(texts,
                                                    P,
                                                    replicate_den=False)

    dense_fsa_vec = k2.DenseFsaVec(nnet_output, supervision_segments)

    device = num_graphs.device

    num_fsas = num_graphs.shape[0]
    # One numerator graph per dense FSA (i.e. per supervision segment).
    assert dense_fsa_vec.dim0() == num_fsas

    assert den_graphs.shape[0] == 1

    # the aux_labels of num_graphs is k2.RaggedInt
    # but it is torch.Tensor for den_graphs.
    #
    # The following converts den_graphs.aux_labels
    # from torch.Tensor to k2.RaggedInt so that
    # we can use k2.cat() later
    den_graphs.convert_attr_to_ragged_(name='aux_labels')

    # The motivation to concatenate num_graphs and den_graphs
    # is to reduce the number of calls to k2.intersect_dense.
    # After concatenation the denominator graph sits at index num_fsas.
    num_den_graphs = k2.cat([num_graphs, den_graphs])

    # NOTE: The a_to_b_map in k2.intersect_dense must be sorted
    # so the following reorders num_den_graphs.
    #
    # The following code computes a_to_b_map

    # [0, 1, 2, ... ]
    num_graphs_indexes = torch.arange(num_fsas, dtype=torch.int32)

    # [num_fsas, num_fsas, num_fsas, ... ]
    den_graphs_indexes = torch.tensor([num_fsas] * num_fsas, dtype=torch.int32)

    # [0, num_fsas, 1, num_fsas, 2, num_fsas, ... ]
    num_den_graphs_indexes = torch.stack(
        [num_graphs_indexes, den_graphs_indexes]).t().reshape(-1).to(device)

    # Interleaved layout: num_0, den, num_1, den, num_2, den, ...
    num_den_reordered_graphs = k2.index(num_den_graphs, num_den_graphs_indexes)

    # [[0, 1, 2, ...]]
    a_to_b_map = torch.arange(num_fsas, dtype=torch.int32).reshape(1, -1)

    # [[0, 1, 2, ...]] -> [0, 0, 1, 1, 2, 2, ... ]
    a_to_b_map = a_to_b_map.repeat(2, 1).t().reshape(-1).to(device)

    num_den_lats = k2.intersect_dense(num_den_reordered_graphs,
                                      dense_fsa_vec,
                                      output_beam=10.0,
                                      a_to_b_map=a_to_b_map)

    num_den_tot_scores = num_den_lats.get_tot_scores(log_semiring=True,
                                                     use_double_scores=True)

    # Undo the interleaving: even positions are numerators, odd positions
    # are denominators.
    num_tot_scores = num_den_tot_scores[::2]
    den_tot_scores = num_den_tot_scores[1::2]

    tot_scores = num_tot_scores - den_scale * den_tot_scores
    tot_score, tot_frames, all_frames = get_tot_objf_and_num_frames(
        tot_scores, supervision_segments[:, 2])
    return tot_score, tot_frames, all_frames


def _compute_mmi_loss_exact_non_optimized(
        nnet_output: torch.Tensor,
        texts: List[str],
        supervision_segments: torch.Tensor,
        graph_compiler: MmiTrainingGraphCompiler,
        P: k2.Fsa,
        den_scale: float = 1.0
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    '''
    Straightforward exact MMI objective: numerator and denominator are
    intersected with the network output in two separate calls to
    k2.intersect_dense.

    See :func:`_compute_mmi_loss_exact_optimized` for the meaning
    of the arguments.

    Note:
      It uses less memory at the cost of speed. It is slower.

    Returns:
      The tuple (tot_score, tot_frames, all_frames) computed by
      :func:`get_tot_objf_and_num_frames`.
    '''
    # The denominator graph is replicated, one copy per numerator graph.
    numerators, denominators = graph_compiler.compile(
        texts, P, replicate_den=True)
    posteriors = k2.DenseFsaVec(nnet_output, supervision_segments)

    numerator_lattices = k2.intersect_dense(
        numerators, posteriors, output_beam=10.0)
    denominator_lattices = k2.intersect_dense(
        denominators, posteriors, output_beam=10.0)

    numerator_scores = numerator_lattices.get_tot_scores(
        log_semiring=True, use_double_scores=True)
    denominator_scores = denominator_lattices.get_tot_scores(
        log_semiring=True, use_double_scores=True)

    mmi_scores = numerator_scores - den_scale * denominator_scores
    frames_per_segment = supervision_segments[:, 2]
    return get_tot_objf_and_num_frames(mmi_scores, frames_per_segment)


def _compute_mmi_loss_pruned(
        nnet_output: torch.Tensor,
        texts: List[str],
        supervision_segments: torch.Tensor,
        graph_compiler: MmiTrainingGraphCompiler,
        P: k2.Fsa,
        den_scale: float = 1.0
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    '''
    MMI objective whose denominator is computed with
    k2.intersect_dense_pruned (that is what `pruned` refers to); the
    numerator still uses k2.intersect_dense.

    See :func:`_compute_mmi_loss_exact_optimized` for the meaning
    of the arguments.

    Note:
      It uses the least amount of memory, but the loss is not exact due
      to pruning.

    Returns:
      The tuple (tot_score, tot_frames, all_frames) computed by
      :func:`get_tot_objf_and_num_frames`.
    '''
    # A single, non-replicated denominator graph is requested here.
    numerators, denominator = graph_compiler.compile(
        texts, P, replicate_den=False)
    posteriors = k2.DenseFsaVec(nnet_output, supervision_segments)

    numerator_lattices = k2.intersect_dense(
        numerators, posteriors, output_beam=10.0)

    # search_beam / output_beam / min_active_states / max_active_states
    # below are not tuned; you may want to tune them. Settings noted so far:
    #   20 7 30 10000
    #   wsj:     10 5 30 10000
    #   aishell: 20 7 30 10000
    pruning_options = dict(search_beam=10.0,
                           output_beam=5.0,
                           min_active_states=30,
                           max_active_states=10000)
    denominator_lattices = k2.intersect_dense_pruned(
        denominator, posteriors, **pruning_options)

    numerator_scores = numerator_lattices.get_tot_scores(
        log_semiring=True, use_double_scores=True)
    denominator_scores = denominator_lattices.get_tot_scores(
        log_semiring=True, use_double_scores=True)

    mmi_scores = numerator_scores - den_scale * denominator_scores
    frames_per_segment = supervision_segments[:, 2]
    return get_tot_objf_and_num_frames(mmi_scores, frames_per_segment)


class LFMMILoss(nn.Module):
    """
    Lattice-Free Maximum Mutual Information (LFMMI) loss.

    `forward` dispatches to :func:`_compute_mmi_loss_pruned` when
    ``use_pruned_intersect`` is set and to
    :func:`_compute_mmi_loss_exact_non_optimized` otherwise.
    """

    def __init__(
            self,
            graph_compiler: MmiTrainingGraphCompiler,
            P: k2.Fsa,
            use_pruned_intersect: bool = False,
            den_scale: float = 1.0,
    ):
        """
        Args:
          graph_compiler:
            Builds the numerator and denominator graphs.
          P:
            The bigram phone LM forwarded to the loss function.
          use_pruned_intersect:
            Selects the pruned denominator intersection when True.
          den_scale:
            The scale applied to the denominator tot_scores.
        """
        super().__init__()
        self.graph_compiler = graph_compiler
        self.P = P
        self.use_pruned_intersect = use_pruned_intersect
        self.den_scale = den_scale

    def forward(self, nnet_output: torch.Tensor, texts: List[str],
                supervision_segments: torch.Tensor
               ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Return (tot_score, tot_frames, all_frames) of the selected MMI
        implementation."""
        # _compute_mmi_loss_exact_optimized is a drop-in alternative for the
        # exact (non-pruned) branch; it is currently not selected.
        loss_fn = (_compute_mmi_loss_pruned
                   if self.use_pruned_intersect
                   else _compute_mmi_loss_exact_non_optimized)

        return loss_fn(nnet_output=nnet_output,
                       texts=texts,
                       supervision_segments=supervision_segments,
                       graph_compiler=self.graph_compiler,
                       P=self.P,
                       den_scale=self.den_scale)


================================================
FILE: snowfall/training/__init__.py
================================================


================================================
FILE: snowfall/training/ctc_graph.py
================================================
# Copyright (c)  2020  Xiaomi Corp.       (author: Fangjun Kuang)

from functools import lru_cache
from typing import Iterable
from typing import List

import torch
import k2

from snowfall.common import get_phone_symbols


def build_ctc_topo(tokens: List[int]) -> k2.Fsa:
    '''Build the CTC topology.

    The FST has one state per token plus a final state. Moving into the
    state of a token emits that token on both sides, staying in a state
    (self-loop) consumes the token again but outputs 0, and every state
    has an arc labelled -1 to the final state.

    A token which appears once on the right side (i.e. olabels) may
    therefore appear multiple times on the left side (ilabels), possibly
    with epsilons in between. Note that 0 has two meanings: on the left
    side it is the blank symbol, on the right side it is an epsilon.

    Args:
      tokens:
        A list of tokens, e.g., phones, characters, etc. It must contain 0.
    Returns:
      Returns an arc-sorted FST that converts repeated tokens to a single
      token.
    '''
    assert 0 in tokens, 'We assume 0 is ID of the blank symbol'

    final_state = len(tokens)
    lines = []
    for src in range(final_state):
        for dst, token in enumerate(tokens):
            # A self-loop outputs epsilon (0); any other arc outputs the token.
            olabel = 0 if src == dst else token
            lines.append(f'{src} {dst} {token} {olabel} 0.0')
        lines.append(f'{src} {final_state} -1 -1 0.0')
    lines.append(f'{final_state}')

    topo = k2.Fsa.from_str('\n'.join(lines), num_aux_labels=1)
    return k2.arc_sort(topo)


class CtcTrainingGraphCompiler(object):
    '''Compile transcripts into CTC training (decoding) graphs.'''

    def __init__(self,
                 L_inv: k2.Fsa,
                 phones: k2.SymbolTable,
                 words: k2.SymbolTable,
                 oov: str = '<UNK>'):
        '''
        Args:
          L_inv:
            Its labels are words, while its aux_labels are phones.
          phones:
            The phone symbol table.
          words:
            The word symbol table.
          oov:
            Out of vocabulary word. It must be present in `words`.
        '''
        # Arc-sort L_inv only if it is NOT already arc-sorted. The previous
        # test (`!= 0`) was inverted: it skipped exactly the unsorted inputs
        # that need sorting.
        if (L_inv.properties & k2.fsa_properties.ARC_SORTED) == 0:
            L_inv = k2.arc_sort(L_inv)

        assert oov in words

        self.L_inv = L_inv
        self.phones = phones
        self.words = words
        self.oov = oov
        phone_ids = get_phone_symbols(phones)
        # 0 is the blank symbol expected by build_ctc_topo.
        phone_ids_with_blank = [0] + phone_ids
        self.ctc_topo = k2.arc_sort(build_ctc_topo(phone_ids_with_blank))

    def compile(self, texts: Iterable[str]) -> k2.Fsa:
        '''Compile every transcript in `texts` and pack the graphs into
        one FsaVec.

        Args:
          texts:
            Transcripts; within a transcript, words are separated by a
            single space.
        Returns:
          An FsaVec with one decoding graph per transcript. Gradients are
          disabled on it.
        '''
        decoding_graphs = k2.create_fsa_vec(
            [self.compile_one_and_cache(text) for text in texts])

        # make sure the gradient is not accumulated
        decoding_graphs.requires_grad_(False)
        return decoding_graphs

    # NOTE: lru_cache on a method keys on (self, text), so the cache keeps
    # this compiler instance alive for as long as the cache exists.
    @lru_cache(maxsize=100000)
    def compile_one_and_cache(self, text: str) -> k2.Fsa:
        '''Compile a single transcript into a decoding graph (cached).

        Words that are not in the word symbol table are replaced by the
        OOV word before the graph is built.
        '''
        tokens = (token if token in self.words else self.oov
                  for token in text.split(' '))
        word_ids = [self.words[token] for token in tokens]
        label_graph = k2.linear_fsa(word_ids)
        # Intersect with the lexicon, then invert so that phones become the
        # labels and words the aux_labels.
        decoding_graph = k2.connect(k2.intersect(label_graph,
                                                 self.L_inv)).invert_()
        decoding_graph = k2.arc_sort(decoding_graph)
        decoding_graph = k2.compose(self.ctc_topo, decoding_graph)
        decoding_graph = k2.connect(decoding_graph)
        return decoding_graph


================================================
FILE: snowfall/training/diagnostics.py
================================================
from typing import Dict, Optional

import torch
from torch import nn
from torch.cuda.amp import GradScaler


def l1_norm(x):
    """Return the sum of the absolute values of all elements of ``x``."""
    return x.abs().sum()


def l2_norm(x):
    """Return the sum of squares of all elements of ``x``.

    Note that no square root is taken, i.e. this is the squared L2 norm.
    """
    return x.pow(2).sum()


def linf_norm(x):
    """Return the largest absolute value among the elements of ``x``."""
    return x.abs().max()


def measure_weight_norms(model: nn.Module, norm: str = 'l2') -> Dict[str, float]:
    """
    Measure the norm of every parameter of a model.

    :param model: a torch.nn.Module instance
    :param norm: which norm helper to use. Available values: 'l1', 'l2', 'linf'
    :return: a dict mapping from parameter's name to its norm.
    :raises ValueError: if ``norm`` is not one of the available values
        (raised when the first parameter is processed).
    """
    norm_fns = {'l1': l1_norm, 'l2': l2_norm, 'linf': linf_norm}
    norms = {}
    with torch.no_grad():
        for name, param in model.named_parameters():
            if norm not in norm_fns:
                raise ValueError(f"Unknown norm type: {norm}")
            norms[name] = norm_fns[norm](param).item()
    return norms


def measure_semiorthogonality(model: nn.Module) -> Dict[str, float]:
    """
    Evaluate the semi-orthogonality objective function proposed by:

        "Semi-Orthogonal Low-Rank Matrix Factorization for Deep Neural Networks",
        Daniel Povey, Gaofeng Cheng, Yiming Wang, Ke Li, Hainan Xu, Mahsa Yarmohamadi,
        Sanjeev Khudanpur, Interspeech 2018

    Only sub-modules exposing a ``constrain_orthonormal`` attribute are
    scored; their ``conv.weight`` entry of the state dict is flattened to a
    matrix W (first dimension kept) and, with P = W W^T, the score is
    trace(Q Q^T) where Q = P - scale * I and
    scale = trace(P P^T / trace(P)).

    :param model: a torch.nn.Module instance
    :return: a dict mapping from module name to its score.
    """
    scores = {}
    with torch.no_grad():
        for name, module in model.named_modules():
            if not hasattr(module, 'constrain_orthonormal'):
                continue
            weight = module.state_dict()['conv.weight']
            rows = weight.shape[0]
            flat = weight.reshape(rows, -1)
            gram = torch.mm(flat, flat.t())
            scale = torch.trace(torch.mm(gram, gram.t()) / torch.trace(gram))
            identity = torch.eye(rows, dtype=gram.dtype, device=gram.device)
            residual = gram - scale * identity
            scores[name] = torch.trace(torch.mm(residual, residual.t())).item()
    return scores


def measure_gradient_norms(model: nn.Module, norm: str = 'l1') -> Dict[str, float]:
    """
    Compute the norms of the gradients for each of model's parameters.

    Parameters that currently have no gradient (``param.grad is None``, e.g.
    before any backward pass) are left out of the result.

    :param model: a torch.nn.Module instance
    :param norm: how to compute the norm. Available values: 'l1', 'l2', 'linf'
    :return: a dict mapping from parameter's name to its gradient's norm.
    :raises ValueError: if ``norm`` is not one of the available values.
    """
    norm_fns = {'l1': l1_norm, 'l2': l2_norm, 'linf': linf_norm}
    if norm not in norm_fns:
        raise ValueError(f"Unknown norm type: {norm}")
    compute_norm = norm_fns[norm]

    with torch.no_grad():
        norms = {}
        for name, param in model.named_parameters():
            grad = param.grad
            if grad is None:
                continue
            # Measure the gradient, not the parameter value itself.
            norms[name] = compute_norm(grad).item()
        return norms


def optim_step_and_measure_param_change(
        model: nn.Module,
        optimizer: torch.optim.Optimizer,
        scaler: Optional[GradScaler] = None
) -> Dict[str, float]:
    r"""
    Perform model weight update and measure the "relative change in parameters per minibatch."
    It is understood as a ratio between the squared L2 norm of the difference between original
    and updated parameters, and the squared L2 norm of the original parameter. It is given by
    the formula:

        .. math::
            \begin{aligned}
                \delta = \frac{\Vert\theta - \theta_{new}\Vert^2}{\Vert\theta\Vert^2}
            \end{aligned}

    :param model: the model whose parameters are updated by ``optimizer``.
    :param optimizer: the optimizer to step.
    :param scaler: if given, the step is performed through ``scaler.step(optimizer)``
        instead of ``optimizer.step()``.
    :return: a dict mapping from parameter's name to its relative change. The division
        is not guarded, so a parameter that was all zeros before the step yields a
        non-finite value.
    """
    # Snapshot the parameters before the update so the change can be measured.
    param_copy = {n: p.detach().clone() for n, p in model.named_parameters()}
    if scaler is not None:
        scaler.step(optimizer)
    else:
        optimizer.step()
    relative_change = {}
    with torch.no_grad():
        for n, p_new in model.named_parameters():
            p_orig = param_copy[n]
            delta = l2_norm(p_orig - p_new) / l2_norm(p_orig)
            relative_change[n] = delta.item()
    return relative_change


================================================
FILE: snowfall/training/mmi_graph.py
================================================
# Copyright (c)  2020  Xiaomi Corp.       (author: Fangjun Kuang)

from typing import Iterable
from typing import List
from typing import Tuple
import numpy as np
import k2
import torch

from .ctc_graph import build_ctc_topo
from snowfall.common import get_phone_symbols
from ..lexicon import Lexicon


def create_bigram_phone_lm(phones: List[int]) -> k2.Fsa:
    '''Create a bigram phone LM.

    The resulting FSA (P) has a start-state (0), one state per phone
    (1, 2, ...) and a final-state. The start-state has a transition to every
    phone state; every phone state has a transition to every phone state and
    also to the final-state. All scores are 0.0.

    Caution:
      blank is not a phone, so 0 must not be in `phones`.

    Args:
      phones:
        A list of phone IDs.

    Returns:
      An FSA representing the bigram phone LM.
    '''
    assert 0 not in phones
    final_state = len(phones) + 1
    phone_states = range(1, final_state)

    # Start state -> every phone state.
    lines = [f'0 {dst} {phones[dst-1]} 0.0' for dst in phone_states]

    for src in phone_states:
        # Phone state -> every phone state ...
        lines.extend(f'{src} {dst} {phones[dst-1]} 0.0' for dst in phone_states)
        # ... and -> final state.
        lines.append(f'{src} {final_state} -1 0.0')
    lines.append(f'{final_state}')

    return k2.Fsa.from_str('\n'.join(lines))


class MmiTrainingGraphCompiler(object):
    '''Build numerator and denominator graphs for MMI training.'''

    def __init__(
            self,
            lexicon: Lexicon,
            device: torch.device,
            oov: str = '<UNK>'
    ):
        '''
        Args:
          lexicon:
            Provides `L_inv` (its labels are words, while its aux_labels are
            phones) together with the `words` and `phones` symbol tables.
          device:
            The device that `L_inv` and the CTC topology are moved to. The
            bigram LM `P` passed to the compile methods must live on the
            same device.
          oov:
            Out of vocabulary word. It must be present in `lexicon.words`.
        '''
        self.lexicon = lexicon
        L_inv = self.lexicon.L_inv.to(device)

        # Arc-sort L_inv only if it is NOT already arc-sorted. The previous
        # test (`!= 0`) was inverted: it skipped exactly the unsorted inputs
        # that need sorting.
        if (L_inv.properties & k2.fsa_properties.ARC_SORTED) == 0:
            L_inv = k2.arc_sort(L_inv)

        assert L_inv.requires_grad is False

        assert oov in self.lexicon.words

        self.L_inv = L_inv
        self.oov_id = self.lexicon.words[oov]
        self.oov = oov
        self.device = device

        phone_symbols = get_phone_symbols(self.lexicon.phones)
        # 0 is the blank symbol expected by build_ctc_topo.
        phone_symbols_with_blank = [0] + phone_symbols

        ctc_topo = build_ctc_topo(phone_symbols_with_blank).to(device)
        assert ctc_topo.requires_grad is False

        self.ctc_topo_inv = k2.arc_sort(ctc_topo.invert_())

    def _build_ctc_topo_P(self, P: k2.Fsa) -> k2.Fsa:
        '''Combine the inverted CTC topology with the bigram phone LM `P`
        and return the arc-sorted result.'''
        assert P.device == self.device
        P_with_self_loops = k2.add_epsilon_self_loops(P)

        ctc_topo_P = k2.intersect(self.ctc_topo_inv,
                                  P_with_self_loops,
                                  treat_epsilons_specially=False).invert()

        return k2.arc_sort(ctc_topo_P)

    def _intersect_with_lexicon(self, word_fsa: k2.Fsa) -> k2.Fsa:
        '''Intersect word-level FSAs with `L_inv` and invert the result, so
        that its `labels` are phone IDs and its `aux_labels` are word IDs.'''
        fsa = k2.add_epsilon_self_loops(word_fsa)
        assert fsa.device == self.device
        num_graphs = k2.intersect(self.L_inv,
                                  fsa,
                                  treat_epsilons_specially=False).invert_()
        return k2.arc_sort(num_graphs)

    def _compose_numerator(self, ctc_topo_P: k2.Fsa,
                           num_graphs: k2.Fsa) -> k2.Fsa:
        '''Compose `ctc_topo_P` with the lexicon-level numerator graphs and
        return the arc-sorted numerator.'''
        num_graphs_with_self_loops = k2.remove_epsilon_and_add_self_loops(
            num_graphs)
        num_graphs_with_self_loops = k2.arc_sort(num_graphs_with_self_loops)

        num = k2.compose(ctc_topo_P,
                         num_graphs_with_self_loops,
                         treat_epsilons_specially=False)
        return k2.arc_sort(num)

    def compile(self,
                texts: Iterable[str],
                P: k2.Fsa,
                replicate_den: bool = True) -> Tuple[k2.Fsa, k2.Fsa]:
        '''Create numerator and denominator graphs from transcripts
        and the bigram phone LM.

        Args:
          texts:
            A list of transcripts. Within a transcript, words are
            separated by spaces.
          P:
            The bigram phone LM created by :func:`create_bigram_phone_lm`.
          replicate_den:
            If True, the returned den_graph is replicated to match the number
            of FSAs in the returned num_graph; if False, the returned den_graph
            contains only a single FSA
        Returns:
          A tuple (num_graph, den_graph), where

            - `num_graph` is the numerator graph. It is an FsaVec with
              shape `(len(texts), None, None)`.

            - `den_graph` is the denominator graph. It is an FsaVec with the same
              shape of the `num_graph` if replicate_den is True; otherwise, it
              is an FsaVec containing only a single FSA.
        '''
        ctc_topo_P = self._build_ctc_topo_P(P)

        num_graphs = self.build_num_graphs(texts)
        num = self._compose_numerator(ctc_topo_P, num_graphs)

        # The denominator is ctc_topo_P itself, detached from P's gradient.
        ctc_topo_P_vec = k2.create_fsa_vec([ctc_topo_P.detach()])
        if replicate_den:
            # Index 0 repeated len(texts) times -> one copy per transcript.
            indexes = torch.zeros(len(texts),
                                  dtype=torch.int32,
                                  device=self.device)
            den = k2.index_fsa(ctc_topo_P_vec, indexes)
        else:
            den = ctc_topo_P_vec

        return num, den

    def build_num_graphs(self, texts: List[str]) -> k2.Fsa:
        '''Convert transcript to an Fsa with the help of lexicon
        and word symbol table.

        Words missing from the word symbol table are mapped to the OOV ID.

        Args:
          texts:
            Each element is a transcript containing words separated by spaces.
            For instance, it may be 'HELLO SNOWFALL', which contains
            two words.

        Returns:
          Return an FST (FsaVec) corresponding to the transcript. Its `labels` are
          phone IDs and `aux_labels` are word IDs.
        '''
        words = self.lexicon.words
        word_ids_list = [
            [words[word] if word in words else self.oov_id
             for word in text.split(' ')]
            for text in texts
        ]

        fsa = k2.linear_fsa(word_ids_list, self.device)
        return self._intersect_with_lexicon(fsa)

    def compile_lookahead_numerators(self, word_fsa_vec, P):
        '''Build numerator graphs from already constructed word-level FSAs.

        Args:
          word_fsa_vec:
            Word-level FSAs; they must live on this compiler's device.
          P:
            The bigram phone LM; it must live on this compiler's device.

        Returns:
          The arc-sorted numerator graphs.
        '''
        # Compile lexicon graph
        num_graphs = self._intersect_with_lexicon(word_fsa_vec)

        # Compile ctc_topo_P
        ctc_topo_P = self._build_ctc_topo_P(P)

        # Combine
        return self._compose_numerator(ctc_topo_P, num_graphs)

    """
    def build_word_fsa(self, prefix, candidate_intervals, drop_prefix_tail):
        # convert prefix_ids in BPE domain to word sequence.
        if '' in prefix:
            prefix.remove('')

        prefix_ids = [self.lexicon.words[word] if word in self.lexicon.words else self.oov_id 
                      for word in prefix]

        # a special token that does not start with '_' could also be proposed in first iteration
        # they requires 'drop_tail' but there is no tail to drop
        # in this case, disable the 'drop_tail' operation
        batch = len(candidate_intervals)  
        drop_prefix_tail = [0] * batch if prefix == [] else drop_prefix_tail
 
        # Prefix part 
        prefix_len = len(prefix_ids)
        start_state = np.arange(prefix_len)
        end_state = np.arange(prefix_len) + 1
        labels = np.array(prefix_ids)
        scores = np.zeros(prefix_len)
        prefix_part = np.stack([start_state, end_state, labels, scores], axis=1)
        
        # candidate part
        candidate_parts = []
        ending_parts = []
        for (start, end), drop_tail in zip(candidate_intervals, drop_prefix_tail): 
            num_candidate = end - start
            start_state = np.ones(num_candidate) * (prefix_len - drop_tail)
            end_state = np.ones(num_candidate) * (prefix_len + 1 - drop_tail)
            labels = np.arange(start, end)
            scores = np.zeros(num_candidate)
            candidate_part = np.stack([start_state, end_state, labels, scores], axis=1)
            candidate_parts.append(candidate_part)

            # end arc
            end_arc = np.array([[prefix_len + 1 - drop_tail, prefix_len + 2 - drop_tail, -1, 0]])
            ending_parts.append(end_arc)
         
       
        # assemble: do not need to arc_sort 
        num_vec = []
        for i, (candidate_part, drop_tail) in enumerate(zip(candidate_parts, drop_prefix_tail)):
            this_prefix_part = prefix_part[:-1] if drop_tail else prefix_part
            end_arc = ending_parts[i]
            num_mat = np.concatenate([this_prefix_part, candidate_part, end_arc], axis=0)
            num_mat = torch.from_numpy(num_mat).to(torch.int32)
            num_vec.append(num_mat)
       
        # convert to k2 FsaVec 
        num_vec = [k2.Fsa.from_dict({"arcs": num}) for num in num_vec]
        num_vec = k2.create_fsa_vec(num_vec)
        return num_vec    
    """


================================================
FILE: snowfall/training/mmi_mbr_graph.py
================================================
# Copyright (c)  2020  Xiaomi Corp.       (author: Fangjun Kuang)

from functools import lru_cache
from typing import Iterable
from typing import List
from typing import Tuple
from pathlib import Path

import logging

import k2
import torch

from .ctc_graph import build_ctc_topo
from snowfall.common import get_phone_symbols
from snowfall.decoding.graph import compile_HLG


def find_first_disambig_symbol(symbols: k2.SymbolTable) -> int:
    '''Return the smallest ID among the symbols whose name starts with '#'.

    Raises ValueError (from `min`) if the table has no such symbol.
    '''
    disambig_ids = [
        sym_id
        for sym, sym_id in symbols._sym2id.items()
        if sym.startswith('#')
    ]
    return min(disambig_ids)


class MmiMbrTrainingGraphCompiler(object):
    '''Compile numerator/denominator training graphs and keep a decoding
    graph (ctc_topo composed with L_disambig and G) alongside them.
    '''

    def __init__(self,
                 L_inv: k2.Fsa,
                 L_disambig: k2.Fsa,
                 G: k2.Fsa,
                 phones: k2.SymbolTable,
                 words: k2.SymbolTable,
                 device: torch.device,
                 oov: str = '<UNK>'):
        '''
        Args:
          L_inv:
            Its labels are words, while its aux_labels are phones.
          L_disambig:
            L with disambig symbols. Its labels are phones and aux_labels
            are words.
          G:
            The language model.
          phones:
            The phone symbol table.
          words:
            The word symbol table.
          device:
            The target device that all FSAs should be moved to.
          oov:
            Out of vocabulary word.
        '''

        L_inv = L_inv.to(device)
        G = G.to(device)

        # Arc-sort only when the ARC_SORTED property bit is NOT set.
        # (The previous `!= 0` test sorted FSAs that were already sorted and
        # left unsorted ones untouched.)
        if (L_inv.properties & k2.fsa_properties.ARC_SORTED) == 0:
            L_inv = k2.arc_sort(L_inv)

        if (G.properties & k2.fsa_properties.ARC_SORTED) == 0:
            G = k2.arc_sort(G)

        assert L_inv.requires_grad is False
        assert G.requires_grad is False

        assert oov in words

        L = L_inv.invert()
        L = k2.arc_sort(L)

        self.L_inv = L_inv
        self.L = L
        self.phones = phones
        self.words = words
        self.device = device
        self.oov_id = self.words[oov]

        # Symbol 0 is prepended as the blank for the CTC topology.
        phone_symbols = get_phone_symbols(phones)
        phone_symbols_with_blank = [0] + phone_symbols

        ctc_topo = k2.arc_sort(
            build_ctc_topo(phone_symbols_with_blank).to(device))
        assert ctc_topo.requires_grad is False

        self.ctc_topo = ctc_topo
        self.ctc_topo_inv = k2.arc_sort(ctc_topo.invert())

        # The decoding graph is cached on disk; it is compiled (on CPU) only
        # when the cache file does not exist yet.
        lang_dir = Path('data/lang_nosp')
        if not (lang_dir / 'HLG_uni.pt').exists():
            logging.info("Composing (ctc_topo, L_disambig, G)")
            first_phone_disambig_id = find_first_disambig_symbol(phones)
            first_word_disambig_id = find_first_disambig_symbol(words)
            # decoding_graph is the result of composing (ctc_topo, L_disambig, G)
            decoding_graph = compile_HLG(
                L=L_disambig.to('cpu'),
                G=G.to('cpu'),
                H=ctc_topo.to('cpu'),
                labels_disambig_id_start=first_phone_disambig_id,
                aux_labels_disambig_id_start=first_word_disambig_id)
            torch.save(decoding_graph.as_dict(),
                       lang_dir / 'HLG_uni.pt')
        else:
            logging.info("Loading pre-compiled HLG")
            decoding_graph = k2.Fsa.from_dict(
                torch.load(lang_dir / 'HLG_uni.pt'))

        assert hasattr(decoding_graph, 'phones')

        self.decoding_graph = decoding_graph.to(device)

    def compile(self, texts: Iterable[str],
                P: k2.Fsa) -> Tuple[k2.Fsa, k2.Fsa, k2.Fsa]:
        '''Create numerator and denominator graphs from transcripts
        and the bigram phone LM.

        Args:
          texts:
            A list of transcripts. Within a transcript, words are
            separated by spaces. `len(texts)` is used, so it must be sized.
          P:
            The bigram phone LM created by :func:`create_bigram_phone_lm`.
            It must live on the same device as this compiler.
        Returns:
          A tuple (num_graph, den_graph, decoding_graph), where

            - `num_graph` is the numerator graph. It is an FsaVec with
              shape `(len(texts), None, None)`.
              It is the result of compose(ctc_topo, P, L, transcript)

            - `den_graph` is the denominator graph. It is an FsaVec with the same
              shape of the `num_graph`.
              It is the result of compose(ctc_topo, P).

            - decoding_graph: It is the result of compose(ctc_topo, L_disambig, G)
              Note that it is a single Fsa, not an FsaVec.
        '''
        assert P.device == self.device
        P_with_self_loops = k2.add_epsilon_self_loops(P)

        ctc_topo_P = k2.intersect(self.ctc_topo_inv,
                                  P_with_self_loops,
                                  treat_epsilons_specially=False).invert()
        ctc_topo_P = k2.arc_sort(ctc_topo_P)

        num_graphs = self.build_num_graphs(texts)

        num_graphs_with_self_loops = k2.remove_epsilon_and_add_self_loops(
            num_graphs)

        num_graphs_with_self_loops = k2.arc_sort(num_graphs_with_self_loops)

        num = k2.compose(ctc_topo_P,
                         num_graphs_with_self_loops,
                         treat_epsilons_specially=False,
                         inner_labels='phones')
        num = k2.arc_sort(num)

        # The denominator is the same graph for every utterance: index the
        # single-element FsaVec with zeros, once per transcript.
        ctc_topo_P_vec = k2.create_fsa_vec([ctc_topo_P.detach()])
        indexes = torch.zeros(len(texts),
                              dtype=torch.int32,
                              device=self.device)
        den = k2.index_fsa(ctc_topo_P_vec, indexes)

        return num, den, self.decoding_graph

    def build_num_graphs(self, texts: List[str]) -> k2.Fsa:
        '''Convert transcript to an Fsa with the help of lexicon
        and word symbol table.

        Args:
          texts:
            Each element is a transcript containing words separated by spaces.
            For instance, it may be 'HELLO SNOWFALL', which contains
            two words.

        Returns:
          Return an FST (FsaVec) corresponding to the transcript. Its `labels` are
          phone IDs and `aux_labels` are word IDs.
        '''
        word_ids_list = []
        for text in texts:
            word_ids = []
            for word in text.split(' '):
                # Words missing from the symbol table map to the OOV id.
                if word in self.words:
                    word_ids.append(self.words[word])
                else:
                    word_ids.append(self.oov_id)
            word_ids_list.append(word_ids)

        fsa = k2.linear_fsa(word_ids_list, self.device)
        fsa = k2.add_epsilon_self_loops(fsa)
        num_graphs = k2.intersect(self.L_inv,
                                  fsa,
                                  treat_epsilons_specially=False).invert_()
        num_graphs = k2.arc_sort(num_graphs)
        return num_graphs


================================================
FILE: snowfall/warpper/k2_decode.py
================================================
import torch
import k2
import sys
import numpy as np
import logging

from espnet.nets.pytorch_backend.nets_utils import make_non_pad_mask
from kaldialign import edit_distance

MAX_LEN = 2000

def k2_decode(model, device, js, sampler, batch_size, use_segment=False):
    """Decode every utterance in `js` batch by batch and log a running WER.

    model: object exposing `encoder`, `ctc.decode_init()` and `ctc.decode()`
    device: torch device the model and the features are moved to
    js: dict mapping utterance name -> entry; the reference is read from
        entry["output"][0]["token"] (or ["text"] when `use_segment` is true)
    sampler: callable applied to a list of (name, entry) pairs; element 0 of
        its result is passed to `build_batch_data`
    batch_size: maximum number of utterances decoded at once
    use_segment: read the reference from "text" instead of "token"; the flag
        is also forwarded to `model.ctc.decode`

    The accumulated results are passed to `parse_results` after every batch.
    Nothing is returned.
    """
    model.ctc.decode_init()
    model.to(device)
    model.eval()

    # "token" for chinese, "text" for english
    ref_key = "text" if use_segment else "token"

    egs = []
    tot_results = []
    num_egs = len(list(js.keys()))
    for idx, name in enumerate(js.keys()):
        egs.append((name, js[name]))

        # Keep collecting until the batch is full or the data is exhausted.
        if len(egs) < batch_size and idx != num_egs - 1:
            continue

        texts = [eg[1]["output"][0][ref_key] for eg in egs]
        # The last batch may be smaller than `batch_size`; use the real size
        # without overwriting the caller-supplied parameter.
        cur_batch_size = len(egs)

        feats = sampler(egs)
        egs = []

        xs_pad, ilens = build_batch_data(feats[0])
        xs_pad = torch.from_numpy(xs_pad).to(device)
        ilens = torch.from_numpy(ilens).to(device)

        # Pure inference: do not record an autograd graph.
        with torch.no_grad():
            src_mask = make_non_pad_mask(ilens.tolist()).to(xs_pad.device).unsqueeze(-2)
            hs_pad, hs_mask = model.encoder(xs_pad, src_mask)
            hs_len = hs_mask.view(cur_batch_size, -1).sum(1)

            results = model.ctc.decode(hs_pad, hs_len, texts, use_segment)

        tot_results += results
        parse_results(tot_results)

def build_batch_data(feats):
    """Zero-pad a list of feature matrices into a single batch array.

    feats: non-empty list of 2d ndarray, each of shape (num_frames, dim)

    Returns a tuple (buf, ilen):
      buf: float32 array of shape (batch, max_len, dim), where max_len is the
           longest num_frames in `feats`; positions past an utterance's own
           length are zero
      ilen: int32 array of shape (batch,) holding each num_frames

    Raises IndexError when `feats` is empty.
    """
    batch_size = len(feats)
    dim = feats[0].shape[-1]
    ilen = np.array([feat.shape[0] for feat in feats], dtype=np.int32)
    max_len = int(ilen.max())

    # Size the buffer from the data so arbitrarily long utterances fit.
    buf = np.zeros((batch_size, max_len, dim), dtype=np.float32)
    for i, feat in enumerate(feats):
        # Write row i only, so the padding of other rows stays zero.
        buf[i, :feat.shape[0], :] = feat

    return buf, ilen

def parse_results(results):
    """Log the aggregate error rate over (reference, hypothesis) pairs.

    results: iterable of (ref, hyp) pairs; each pair is passed to
        `edit_distance`, whose result is indexed with the keys
        'sub', 'ins', 'del' and 'total'

    The rate is total errors divided by the summed reference length. When
    that length is 0 the rate is reported as 0 if there are no errors and
    as inf otherwise, instead of raising ZeroDivisionError.
    """
    dists = [edit_distance(r, h) for r, h in results]
    errors = {
        key: sum(dist[key] for dist in dists)
        for key in ['sub', 'ins', 'del', 'total']
    }
    total_chars = sum(len(ref) for ref, _ in results)
    if total_chars > 0:
        wer = errors["total"] / total_chars
    else:
        # No reference tokens at all: the ratio is undefined.
        wer = float("inf") if errors["total"] else 0.0
    logging.warning(
        f'%WER {wer:.2%} '
        f'[{errors["total"]} / {total_chars}, {errors["ins"]} ins, {errors["del"]} del, {errors["sub"]} sub ]'
    )


================================================
FILE: snowfall/warpper/mmi_test.py
================================================
import torch
import k2
from pathlib import Path
from snowfall.lexicon import Lexicon
from snowfall.training.mmi_graph import create_bigram_phone_lm, MmiTrainingGraphCompiler


def main():
    """Smoke test: compile a numerator graph for one transcript, intersect it
    with random network output and print the total lattice score.

    Reads the lang directory "data/lang_k2mmi" and runs on CPU.
    """
    lang = Path("data/lang_k2mmi")
    lexicon = Lexicon(lang)
    device = torch.device("cpu")
    graph_compiler = MmiTrainingGraphCompiler(lexicon, device=device)

    phone_ids = lexicon.phone_symbols()
    P = create_bigram_phone_lm(phone_ids)

    # One extra output unit on top of the phone inventory.
    dim = len(phone_ids) + 1
    T = 100
    nnet_output = torch.rand(1, T, dim)
    # A single supervision row [0, 0, T] covering the one random sequence.
    supervision = torch.Tensor([[0, 0, T]]).to(torch.int32)
    dense_fsa_vec = k2.DenseFsaVec(nnet_output, supervision)

    texts = ['你 好']
    num_graphs, _ = graph_compiler.compile(texts, P, replicate_den=False)

    # This call was commented out, which left `num_lats` undefined below.
    num_lats = k2.intersect_dense(num_graphs, dense_fsa_vec, output_beam=10.0)
    num_tot_scores = num_lats.get_tot_scores(log_semiring=True, use_double_scores=True)

    print(num_tot_scores)

# Run the smoke test when this file is executed. There is no `__name__`
# guard, so importing the module runs it as well.
main()
    

================================================
FILE: snowfall/warpper/mmi_utils.py
================================================
import torch 

def build_word_mapping(word_mapping):
    """Read an integer-to-integer mapping from a text file.

    word_mapping: path of a file with two whitespace-separated integers per
        line ("<from> <to>"); blank lines are skipped

    Returns a dict {from: to}. A non-blank line that does not hold exactly
    two integer fields raises ValueError.
    """
    ans = {}
    # The context manager closes the file even if parsing fails.
    with open(word_mapping) as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue
            src, dst = fields
            ans[int(src)] = int(dst)
    return ans

def convert_transcription(ys, mapping, words, oov_id, ignore_ids):
    """
    Turn a batch of token ids into space-joined word strings.

    ys: 2-D torch tensor of token indices
    mapping: dict from the source (attention) ids to MMI ids; ids missing
             from it are replaced by oov_id
    words: lookup from MMI id to word string
    oov_id: MMI id used for unmapped tokens
    ignore_ids: collection of token ids dropped before mapping

    Returns a list with one string per row of ys. A mapped id that is absent
    from `words` raises KeyError.
    """
    rows = ys.cpu().numpy()

    mapped_rows = []
    for row in rows:
        kept = [tok for tok in row if tok not in ignore_ids]
        mapped_rows.append([mapping.get(tok, oov_id) for tok in kept])

    return [" ".join(words[tok] for tok in row) for row in mapped_rows]

def encode_supervision(hlens):
    """Build supervision rows [sequence_index, 0, length], longest first.

    hlens: 1-D tensor with one length per sequence

    Returns (supervision, indices): `supervision` is an int32 tensor with one
    row per sequence, negative values clamped to 0 and rows ordered by
    descending length; `indices` is the permutation that was applied.
    """
    num_seqs = hlens.size()[0]
    columns = (
        torch.arange(num_seqs),   # sequence index
        torch.zeros(num_seqs),    # second column is always zero
        hlens.cpu(),              # length
    )
    segments = torch.stack(columns, 1).to(torch.int32)
    segments = torch.clamp(segments, min=0)

    order = torch.argsort(segments[:, 2], descending=True)
    return segments[order], order

def parse_step(hyp, words, part_ids, weights, full_scores, part_scores, weighted_scores):
    """Print a per-source score breakdown for one search step.

    hyp: object with `yseq` (token ids so far) and `score`
    words: lookup from token id to its printable string
    part_ids: ids of the proposed candidates; also used to index
              `weighted_scores` and every entry of `full_scores`
    weights: dict, score-source name -> weight
    full_scores: dict, source name -> scores indexable by `part_ids`
    part_scores: dict, source name -> scores already restricted to candidates
    weighted_scores: combined scores indexable by `part_ids`

    Nothing is returned and none of the inputs is modified.
    """
    # What has been decoded so far.
    history = "".join(words[tok] for tok in hyp.yseq)
    print(f"Previous Hypothesis:   {history}")
    print(f"Previous Total scores: {hyp.score}")

    # The tokens under consideration.
    candidates = "     ".join(words[tok] for tok in part_ids)
    print(f"Proposed Candidates:   {candidates}")

    # Indexing produces new objects, so the caller's data stays untouched.
    total_sliced = weighted_scores[part_ids]
    per_source = {name: scores[part_ids] for name, scores in full_scores.items()}
    per_source.update(part_scores)

    # Contribution of every source after applying its weight.
    for name, values in per_source.items():
        header = "{:<7}(weighted):   ".format(name)
        cells = "".join("{:>6.2f} ".format(v * weights[name]) for v in values)
        print(header + cells, flush=True)

    # Raw scores of every source, followed by the combined total.
    with_total = {**per_source, "total": total_sliced}
    for name, values in with_total.items():
        header = "{:<7}:             ".format(name)
        cells = "".join("{:>6.2f} ".format(v) for v in values)
        print(header + cells, flush=True)


================================================
FILE: snowfall/warpper/prefix_scorer.py
================================================
import k2
import torch
import numpy as np
from pathlib import Path
from espnet.snowfall.lexicon import Lexicon
from espnet.snowfall.training.mmi_graph import MmiTrainingGraphCompiler
from espnet.snowfall.warpper.mmi_utils import encode_supervision
from espnet.snowfall.training.mmi_graph import create_bigram_phone_lm


def build_word_fsa(prefix_ids, candidate_intervals):
    """Build one FSA per candidate interval: a shared linear prefix followed
    by a fan-out over the interval's labels and a final arc.

    prefix_ids: sequence of labels; label i sits on the arc from state i to
        state i + 1
    candidate_intervals: iterable of (start, end) pairs; every label in
        [start, end) gets an arc from state len(prefix_ids) to the next state

    Each arc row is [src_state, dst_state, label, score] with score 0; the
    last arc carries label -1. Returns the FSAs packed by
    `k2.create_fsa_vec`, one per interval.
    """
    prefix_len = len(prefix_ids)

    # Linear chain 0 -> 1 -> ... -> prefix_len carrying the prefix labels.
    chain_src = np.arange(prefix_len)
    prefix_arcs = np.stack(
        [chain_src, chain_src + 1, np.array(prefix_ids), np.zeros(prefix_len)],
        axis=1)

    # Final arc, identical for every FSA.
    final_arc = np.array([[prefix_len + 1, prefix_len + 2, -1, 0]])

    # Arcs are emitted in increasing source-state order, so no arc_sort is done.
    arc_tables = []
    for lo, hi in candidate_intervals:
        width = hi - lo
        fan_out = np.stack(
            [
                np.ones(width) * prefix_len,
                np.ones(width) * (prefix_len + 1),
                np.arange(lo, hi),
                np.zeros(width),
            ],
            axis=1)
        table = np.concatenate([prefix_arcs, fan_out, final_arc], axis=0)
        arc_tables.append(torch.from_numpy(table).to(torch.int32))

    fsas = [k2.Fsa.from_dict({"arcs": table}) for table in arc_tables]
    return k2.create_fsa_vec(fsas)
    
if __name__ == '__main__':
    # Demo: compile prefix-scoring numerator graphs for a few candidate
    # intervals, intersect them with random log-probabilities and print the
    # first lattice and the total scores.
    device = torch.device("cpu")
    lexicon = Lexicon(Path("data/lang_char"))
    compiler = MmiTrainingGraphCompiler(lexicon, device)
    phones = lexicon.phone_symbols()
    P = create_bigram_phone_lm(phones).to(device)

    prefix_ids = [1]
    candidate_intervals = [
        [58968, 60968],
        [60968, 62968],
        [62968, 64968],
        [64968, 66298],
    ]
    num_graphs = compiler.compile_nums_for_prefix_scoring(
        prefix_ids, candidate_intervals, P)

    # One random utterance of 500 frames per candidate interval.
    num_frames = 500
    batch = len(candidate_intervals)
    logits = torch.randn(batch, num_frames, len(phones) + 1).to(device)
    log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
    supervision, _ = encode_supervision(torch.ones(batch) * num_frames)

    dense_fsa_vec = k2.DenseFsaVec(log_probs, supervision)
    num_lats = k2.intersect_dense(num_graphs, dense_fsa_vec, output_beam=5.0)
    print(num_lats[0].as_dict())
    print(num_lats.get_tot_scores(log_semiring=True, use_double_scores=True))


================================================
FILE: snowfall/warpper/warpper_ctc.py
================================================
import torch
import k2
import torch.nn.functional as F
import os
import sys
import logging
import numpy as np
from pathlib import Path
from typing import List
from typing import Union
from k2 import Fsa, SymbolTable
from espnet.snowfall.lexicon import Lexicon
from snowfall.training.ctc_graph import CtcTrainingGraphCompiler
from espnet.snowfall.objectives.ctc import CTCLoss
from espnet.snowfall.common import get_phone_symbols, find_first_disambig_symbol, get_texts
from espnet.snowfall.training.ctc_graph import build_ctc_topo
from espnet.snowfall.decoding.graph import compile_HLG
from lhotse.utils import nullcontext
from espnet.snowfall.warpper.mmi_utils import build_word_mapping, convert_transcription, encode_supervision

"""
June 29th
self.phone_ids and self.phones are not identical:
self.phone_ids is built from the lexicon and has no eps/blk entry;
self.phones is read from file and does have eps/blk.
It is not clear whether this leads to a bug.
"""

class K2CTC(torch.nn.Module):
    """CTC loss and HLG-based decoding on top of an encoder output.

    A dropout + linear layer projects the encoder output onto the phone
    inventory plus one extra unit. `forward` returns the training loss;
    `decode` searches the HLG graph prepared by `decode_init`.
    """

    def __init__(self, 
                 idim, 
                 lang, 
                 char_list, 
                 device, 
                 dropout, 
                 den_scale, 
                 eos_id, 
                 pad_id=-1,
                 use_segment=False):

        """
        idim: input dim, usually the transformer output dim
        lang: k2 lang directory
        char_list: list mapping a token id in ys_pad to its string
        device: torch.device object. device to build the loss module
        dropout: dropout rate for linear out layer
        den_scale: not used by this module; the parameter mirrors K2MMI
        eos_id: end of sentence id
        pad_id: id of padding in ys_pad
        use_segment: If true, the supervision of training would use "texts"
                     instead of ys_pad. Sensitive for Chinese
        """

        super().__init__()
        self.device = device
        
        # compiler
        self.lang = Path(lang)
        self.lexicon = Lexicon(self.lang)
        self.graph_compiler = CtcTrainingGraphCompiler(
                              L_inv=self.lexicon.L_inv,
                              phones=self.lexicon.phones,
                              words=self.lexicon.words
                              )

        # symbol tables
        self.phone_ids = self.lexicon.phone_symbols() # blank excluded
        self.words = self.lexicon.words
        self.phones = self.lexicon.phones 

        # linear: one output per phone plus one extra unit
        self.idim = idim
        self.odim = len(self.phone_ids) + 1
        self.lo = torch.nn.Sequential(
                    torch.nn.Dropout(p=dropout),
                    torch.nn.Linear(self.idim, self.odim)
                                     )

        # others
        self.eos_id = eos_id
        self.pad_id = pad_id
        self.use_segment=use_segment
        self.char_list = char_list
        # Close the file right after reading the OOV id.
        with open(self.lang / 'oov.int') as f:
            self.oovid = int(f.read().strip())
        self.probs = None # for visualization
        self.HLG = None # Decoding graph. build by "decode_init"
        print("INFO from CTC module:")
        print(f"device: {device}")
        print(f"use segment info: {use_segment}")
        print(f"self.lo {self.lo}")
        print(f"number of phones {len(self.phone_ids)}")

    # softmax, log_softmax and argmax for decoding and visualization
    def log_softmax(self, hs_pad):
        """Return the log of `softmax(hs_pad)`; also refreshes self.probs."""
        return self.softmax(hs_pad).log()

    def softmax(self, hs_pad):
        """Return softmax over dim 2 of the projected input and cache it."""
        # self.probs is required by visualization
        self.probs = F.softmax(self.lo(hs_pad), dim=2)
        return self.probs

    def argmax(self, hs_pad):
        """Return the index of the largest projected output along dim 2."""
        return torch.argmax(self.lo(hs_pad), dim=2)

    def forward(self, hs_pad, hlens, ys_pad, texts):
        """Compute the CTC training loss for one batch.

        hs_pad: encoder output fed to the linear layer
        hlens: 1-D tensor with the length of every sequence in hs_pad
        ys_pad: padded token ids; entries equal to pad_id are dropped
        texts: transcripts, used instead of ys_pad when use_segment is set

        Returns the negated objective from CTCLoss divided by the batch size.
        """
        
        if self.use_segment:
            ys = texts
            if "<space>" in self.char_list:
                # NOTE(review): this yields "A<space> B" (no blank before
                # "<space>") - confirm that this matches the lexicon entries.
                ys = [y.replace(" ", "<space> ") for y in ys]
        else:
            # split by every character: BPE or chinese chars
            ys = [[self.char_list[c] for c in y if c != self.pad_id] for y in ys_pad]
            ys = [" ".join(y).replace("<eos>", "") for y in ys]
 
        # Supervision rows are sorted by length; reorder the texts to match.
        supervision, indices = encode_supervision(hlens)
        ys = [ys[i] for i in indices]
        
        nnet_output = self.lo(hs_pad)
        nnet_output = F.log_softmax(nnet_output, dim=-1)
        
        loss_fn = CTCLoss(self.graph_compiler)

        grad_context = nullcontext if self.training else torch.no_grad

        with grad_context():
            ctc_loss, ctc_frames, all_frames = loss_fn(
                nnet_output, ys, supervision)
        batch_size = hlens.size()[0]
        ctc_loss /= batch_size
        return - ctc_loss

    def decode(self, nnet_output, hlens, texts, is_english):
        """Decode a batch with the HLG graph and pair it with the references.

        `decode_init` must have been called so that self.HLG exists.

        nnet_output: encoder output; the linear layer is applied here
        hlens: 1-D tensor with the length of every sequence
        texts: reference transcripts, one per sequence
        is_english: if true, references and hypotheses are compared as
            whitespace-separated words; otherwise spaces are removed and
            they are compared character by character

        Returns a list of (ref_tokens, hyp_tokens) tuples, one per sequence.
        """
       
        # add linear function 
        nnet_output = self.lo(nnet_output)
        supervision, indices = encode_supervision(hlens)
        dense_fsa_vec = k2.DenseFsaVec(nnet_output, supervision)

        # This module has no bigram phone LM (self.P / self.lm_scores are
        # never created in __init__), so there is nothing to refresh here.
        assert nnet_output.device == self.HLG.device
        # 7.0 output beam is tunable
        lattices = k2.intersect_dense_pruned(self.HLG, dense_fsa_vec, 20.0, 7.0, 30,
                                             10000)
        best_paths = k2.shortest_path(lattices, use_double_scores=True)
        
        assert best_paths.shape[0] == len(texts)
        # presumably `indices` lets get_texts restore the input order - confirm
        hyps = get_texts(best_paths, indices)
        assert len(hyps) == len(texts)

        results = []
        batch_size =len(texts)
        for i in range(batch_size):
            hyp_words = [self.words.get(x) for x in hyps[i]]
            ref_words = texts[i].split(' ')
            
            if not is_english:
                hyp = "".join(hyp_words).replace(" ", "")
                ref = "".join(ref_words).replace(" ", "")
            else:
                hyp = " ".join(hyp_words)
                ref = " ".join(ref_words)
            print("#"*20)
            print(f"Reference: {ref}")
            print(f"Hypothesis: {hyp}")
            sys.stdout.flush()

            if not is_english:
                ref_char = list(ref)
                hyp_char = list(hyp)
            else:
                ref_char = ref.split()
                hyp_char = hyp.split()
            results.append((ref_char, hyp_char))

        return results

    def decode_init(self):
        """Build or load the HLG decoding graph and store it in self.HLG.

        The graph is cached as 'HLG.pt' inside the lang directory. When the
        cache is missing it is compiled from 'L_disambig.fst.txt' and
        'G.fst.txt' and then saved.
        """
        # Build HLG.fst
        phone_ids = get_phone_symbols(self.phones) # will remove 0
        phone_ids_with_blank = [0] + phone_ids
        ctc_topo = k2.arc_sort(build_ctc_topo(phone_ids_with_blank))
        if not os.path.exists(self.lang / 'HLG.pt'):
            logging.debug("Loading L_disambig.fst.txt")
            with open(self.lang / 'L_disambig.fst.txt') as f:
                L = k2.Fsa.from_openfst(f.read(), acceptor=False)
            logging.debug("Loading G.fst.txt")
            with open(self.lang / 'G.fst.txt') as f:
                G = k2.Fsa.from_openfst(f.read(), acceptor=False)
            first_phone_disambig_id = find_first_disambig_symbol(self.phones)
            first_word_disambig_id = find_first_disambig_symbol(self.words)
            print("first disambig symbol: ", first_phone_disambig_id, first_word_disambig_id, flush=True)
            HLG = compile_HLG(L=L,
                             G=G,
                             H=ctc_topo,
                             labels_disambig_id_start=first_phone_disambig_id,
                             aux_labels_disambig_id_start=first_word_disambig_id)
            torch.save(HLG.as_dict(), self.lang / 'HLG.pt')
        else:
            logging.debug("Loading pre-compiled HLG")
            d = torch.load(self.lang / 'HLG.pt')
            HLG = k2.Fsa.from_dict(d)

        HLG = HLG.to(self.device)
        # Drop the 0 entries from the aux labels.
        HLG.aux_labels = k2.ragged.remove_values_eq(HLG.aux_labels, 0)
        HLG.requires_grad_(False)
        if not hasattr(HLG, 'lm_scores'):
            HLG.lm_scores = HLG.scores.clone()
        self.HLG = HLG
        print("Successful Initialize Decoding HLG")
        
    def dump_weight(self, rank):
        """Save all named parameters to '<lang>/ctc_param.<rank>.pth'."""
        d = {}
        for k, v in self.named_parameters():
            print(f"Found parameter {k} with shape {v.size()}")
            d[k] = v
        save_path = self.lang / f"ctc_param.{rank}.pth"
        torch.save(d, save_path)


================================================
FILE: snowfall/warpper/warpper_mmi.py
================================================
import torch
import k2
import torch.nn.functional as F
import os
import sys
import logging
import numpy as np
from pathlib import Path
from typing import List
from typing import Union
from k2 import Fsa, SymbolTable
from espnet.snowfall.lexicon import Lexicon
from espnet.snowfall.objectives.mmi import LFMMILoss
from espnet.snowfall.training.mmi_graph import MmiTrainingGraphCompiler
from espnet.snowfall.training.mmi_graph import create_bigram_phone_lm
from espnet.snowfall.common import get_phone_symbols, find_first_disambig_symbol, get_texts
from espnet.snowfall.training.ctc_graph import build_ctc_topo
from espnet.snowfall.decoding.graph import compile_HLG
from lhotse.utils import nullcontext
from espnet.snowfall.warpper.mmi_utils import build_word_mapping, convert_transcription, encode_supervision

"""
June 29th
self.phone_ids and self.phones are not identical:
self.phone_ids is built from the lexicon and has no eps/blk entry;
self.phones is read from file and does have eps/blk.
It is not clear whether this leads to a bug.
"""

class K2MMI(torch.nn.Module):
    """LF-MMI loss and HLG-based decoding on top of an encoder output.

    A dropout + linear layer projects the encoder output onto the phone
    inventory plus one extra unit. The scores of the bigram phone LM `P` are
    a trainable parameter (`lm_scores`). `forward` returns the training
    loss; `decode` searches the HLG graph prepared by `decode_init`.
    """

    def __init__(self, 
                 idim, 
                 lang, 
                 char_list, 
                 device, 
                 dropout, 
                 den_scale, 
                 eos_id, 
                 pad_id=-1,
                 use_segment=False):

        """
        idim: input dim, usually the transformer output dim
        lang: k2 lang directory
        char_list: list mapping a token id in ys_pad to its string
        device: torch.device object. device to build the loss module
        dropout: dropout rate for linear out layer
        den_scale: den_scale for MMI loss computation
        eos_id: end of sentence id
        pad_id: id of padding in ys_pad
        use_segment: If true, the supervision of MMI training would use "texts"
                     instead of ys_pad. Sensitive for Chinese
        """

        super().__init__()
        self.device = device
        
        # compiler
        self.lang = Path(lang)
        self.lexicon = Lexicon(self.lang)
        # Close the file right after reading the OOV id.
        with open(self.lang / 'oov.int') as f:
            self.oovid = int(f.read().strip())
        self.oov = self.lexicon.words[self.oovid]
        self.graph_compiler = MmiTrainingGraphCompiler(
                              lexicon=self.lexicon,
                              device=self.device,
                              oov=self.oov
                              )

        # bigram LM
        self.phone_ids = self.lexicon.phone_symbols() # blank excluded
        self.words = self.lexicon.words
        self.phones = self.lexicon.phones 
        self.P = create_bigram_phone_lm(self.phone_ids)
        self.P.scores = torch.zeros_like(self.P.scores)
        self.P = self.P.to(self.device)
        # The LM scores are learned; P is refreshed from them before use.
        self.lm_scores = torch.nn.Parameter(self.P.scores.clone(), requires_grad=True)
        self.use_pruned_intersect = len(self.phone_ids) > 500

        # linear: one output per phone plus one extra unit
        self.idim = idim
        self.odim = len(self.phone_ids) + 1
        self.lo = torch.nn.Sequential(
                    torch.nn.Dropout(p=dropout),
                    torch.nn.Linear(self.idim, self.odim)
                                     )

        # others
        self.eos_id = eos_id
        self.pad_id = pad_id
        self.den_scale = den_scale 
        self.use_segment=use_segment
        self.char_list = char_list
        self.probs = None # for visualization
        self.HLG = None # Decoding graph. build by "decode_init"
        print("INFO from MMI module:")
        print(f"device: {device}")
        print(f"use pruned_intersect: {self.use_pruned_intersect}")
        print(f"use segment info: {use_segment}")
        print(f"self.lo {self.lo}")
        print(f"number of phones {len(self.phone_ids)}")

    # softmax, log_softmax and argmax for decoding and visualization
    def log_softmax(self, hs_pad):
        """Return the log of `softmax(hs_pad)`; also refreshes self.probs."""
        return self.softmax(hs_pad).log()

    def softmax(self, hs_pad):
        """Return softmax over dim 2 of the projected input and cache it."""
        # self.probs is required by visualization
        self.probs = F.softmax(self.lo(hs_pad), dim=2)
        return self.probs

    def argmax(self, hs_pad):
        """Return the index of the largest projected output along dim 2."""
        return torch.argmax(self.lo(hs_pad), dim=2)

    def forward(self, hs_pad, hlens, ys_pad, texts):
        """Compute the LF-MMI training loss for one batch.

        hs_pad: encoder output fed to the linear layer
        hlens: 1-D tensor with the length of every sequence in hs_pad
        ys_pad: padded token ids; entries equal to pad_id are dropped
        texts: transcripts, used instead of ys_pad when use_segment is set
               and the module is in training mode

        Returns the negated objective from LFMMILoss divided by the batch
        size.
        """
         
        # Never use segmentation in evaluation: to approximate the decoding stage
        if self.use_segment and self.training:
            ys = texts
        else:
            ys = [[self.char_list[c] for c in y if c != self.pad_id] for y in ys_pad]
            ys = [" ".join(y).replace("<eos>", "") for y in ys]

        # Supervision rows are sorted by length, so the transcripts must be
        # permuted the same way. This is done after the transcript source is
        # chosen so that evaluation batches are reordered as well.
        supervision, indices = encode_supervision(hlens)
        ys = [ys[i] for i in indices]
        
        nnet_output = self.lo(hs_pad)
        
        self.P.set_scores_stochastic_(self.lm_scores)
        if self.training:
            # NOTE(review): P was moved to self.device in __init__; check
            # whether `is_cpu` is an attribute or a method in the installed k2.
            assert self.P.is_cpu
            assert self.P.requires_grad is True

        loss_fn = LFMMILoss(
            graph_compiler=self.graph_compiler,
            P=self.P,
            den_scale=self.den_scale,
            use_pruned_intersect=self.use_pruned_intersect
        )

        grad_context = nullcontext if self.training else torch.no_grad

        with grad_context():
            mmi_loss, tot_frames, all_frames = loss_fn(
                nnet_output, ys, supervision)
        batch_size = hlens.size()[0]
        mmi_loss /= batch_size
        return - mmi_loss

    def decode(self, nnet_output, hlens, texts, is_english):
        """Decode a batch with the HLG graph and pair it with the references.

        `decode_init` must have been called so that self.HLG exists.

        nnet_output: encoder output; the linear layer is applied here
        hlens: 1-D tensor with the length of every sequence
        texts: reference transcripts, one per sequence
        is_english: if true, references and hypotheses are compared as
            whitespace-separated words; otherwise spaces are removed and
            they are compared character by character

        Returns a list of (ref_tokens, hyp_tokens) tuples, one per sequence.
        """
       
        # add linear function 
        nnet_output = self.lo(nnet_output)
        supervision, indices = encode_supervision(hlens)
        dense_fsa_vec = k2.DenseFsaVec(nnet_output, supervision)

        # Refresh the bigram LM from the learned scores.
        self.P.set_scores_stochastic_(self.lm_scores)
        if self.training:
            assert self.P.is_cpu
            assert self.P.requires_grad is False

        assert nnet_output.device == self.HLG.device
        # 7.0 output beam is tunable
        lattices = k2.intersect_dense_pruned(self.HLG, dense_fsa_vec, 20.0, 7.0, 30,
                                             10000)
        best_paths = k2.shortest_path(lattices, use_double_scores=True)
        
        assert best_paths.shape[0] == len(texts)
        # presumably `indices` lets get_texts restore the input order - confirm
        hyps = get_texts(best_paths, indices)
        assert len(hyps) == len(texts)

        results = []
        batch_size =len(texts)
        for i in range(batch_size):
            hyp_words = [self.words.get(x) for x in hyps[i]]
            ref_words = texts[i].split(' ')
            
            if not is_english:
                hyp = "".join(hyp_words).replace(" ", "")
                ref = "".join(ref_words).replace(" ", "")
            else:
                hyp = " ".join(hyp_words)
                ref = " ".join(ref_words)
            print("#"*20)
            print(f"Reference: {ref}")
            print(f"Hypothesis: {hyp}")
            sys.stdout.flush()

            if not is_english:
                ref_char = list(ref)
                hyp_char = list(hyp)
            else:
                ref_char = ref.split()
                hyp_char = hyp.split()
            results.append((ref_char, hyp_char))

        return results

    def decode_init(self):
        """Build or load the HLG decoding graph and store it in self.HLG.

        The graph is cached as 'HLG.pt' inside the lang directory. When the
        cache is missing it is compiled from 'L_disambig.fst.txt' and
        'G.fst.txt' and then saved.
        """
        # Build HLG.fst
        phone_ids = get_phone_symbols(self.phones) # will remove 0
        phone_ids_with_blank = [0] + phone_ids
        ctc_topo = k2.arc_sort(build_ctc_topo(phone_ids_with_blank))
        if not os.path.exists(self.lang / 'HLG.pt'):
            logging.debug("Loading L_disambig.fst.txt")
            with open(self.lang / 'L_disambig.fst.txt') as f:
                L = k2.Fsa.from_openfst(f.read(), acceptor=False)
            logging.debug("Loading G.fst.txt")
            with open(self.lang / 'G.fst.txt') as f:
                G = k2.Fsa.from_openfst(f.read(), acceptor=False)
            first_phone_disambig_id = find_first_disambig_symbol(self.phones)
            first_word_disambig_id = find_first_disambig_symbol(self.words)
            print("first disambig symbol: ", first_phone_disambig_id, first_word_disambig_id, flush=True)
            HLG = compile_HLG(L=L,
                             G=G,
                             H=ctc_topo,
                             labels_disambig_id_start=first_phone_disambig_id,
                             aux_labels_disambig_id_start=first_word_disambig_id)
            torch.save(HLG.as_dict(), self.lang / 'HLG.pt')
        else:
            logging.debug("Loading pre-compiled HLG")
            d = torch.load(self.lang / 'HLG.pt')
            HLG = k2.Fsa.from_dict(d)

        HLG = HLG.to(self.device)
        # Drop the 0 entries from the aux labels.
        HLG.aux_labels = k2.ragged.remove_values_eq(HLG.aux_labels, 0)
        HLG.requires_grad_(False)
        if not hasattr(HLG, 'lm_scores'):
            HLG.lm_scores = HLG.scores.clone()
        self.HLG = HLG
        print("Successful Initialize Decoding HLG")
    
    def dump_weight(self, rank, path):
        """Save all named parameters to '<path>/mmi_param.<rank>.pth'."""
        d = {}
        for k, v in self.named_parameters():
            print(f"Found parameter {k} with shape {v.size()}")
            d[k] = v
        save_path = os.path.join(path, f"mmi_param.{rank}.pth")
        torch.save(d, save_path) 


================================================
FILE: st/__init__.py
================================================
"""Initialize sub package."""


================================================
FILE: st/pytorch_backend/__init__.py
================================================
"""Initialize sub package."""


================================================
FILE: st/pytorch_backend/st.py
================================================
# Copyright 2019 Kyoto University (Hirofumi Inaguma)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Training/decoding definition for the speech translation task."""

import json
import logging
import os
import sys

from chainer import training
from chainer.training import extensions
import numpy as np
from tensorboardX import SummaryWriter
import torch

from espnet.asr.asr_utils import adadelta_eps_decay
from espnet.asr.asr_utils import adam_lr_decay
from espnet.asr.asr_utils import add_results_to_json
from espnet.asr.asr_utils import CompareValueTrigger
from espnet.asr.asr_utils import restore_snapshot
from espnet.asr.asr_utils import snapshot_object
from espnet.asr.asr_utils import torch_load
from espnet.asr.asr_utils import torch_resume
from espnet.asr.asr_utils import torch_snapshot
from espnet.asr.pytorch_backend.asr_init import load_trained_model
from espnet.asr.pytorch_backend.asr_init import load_trained_modules

from espnet.nets.pytorch_backend.e2e_asr import pad_list
from espnet.nets.st_interface import STInterface
from espnet.utils.dataset import ChainerDataLoader
from espnet.utils.dataset import TransformDataset
from espnet.utils.deterministic_utils import set_deterministic_pytorch
from espnet.utils.dynamic_import import dynamic_import
from espnet.utils.io_utils import LoadInputsAndTargets
from espnet.utils.training.batchfy import make_batchset
from espnet.utils.training.iterators import ShufflingEnabler
from espnet.utils.training.tensorboard_logger import TensorboardLogger
from espnet.utils.training.train_utils import check_early_stop
from espnet.utils.training.train_utils import set_early_stop

from espnet.asr.pytorch_backend.asr import CustomConverter as ASRCustomConverter
from espnet.asr.pytorch_backend.asr import CustomEvaluator
from espnet.asr.pytorch_backend.asr import CustomUpdater

import matplotlib

# Select the "Agg" matplotlib backend at import time, before any plotting
# code runs (Agg renders to files and needs no display).
matplotlib.use("Agg")

# Python 2/3 compatibility shim: expose the same `zip_longest` name on both.
if sys.version_info[0] == 2:
    from itertools import izip_longest as zip_longest
else:
    from itertools import zip_longest as zip_longest


class CustomConverter(ASRCustomConverter):
    """Batch converter for speech translation (PyTorch).

    Turns one loaded minibatch of numpy features and token-id sequences into
    padded tensors placed on the requested device.

    Args:
        subsampling_factor (int): The subsampling factor, forwarded to the
            parent converter (not used directly by this class).
        dtype (torch.dtype): Data type the padded features are converted to.
        use_source_text (bool): Whether to also pad and return the source
            transcriptions.

    """

    def __init__(
        self, subsampling_factor=1, dtype=torch.float32, use_source_text=False
    ):
        """Construct a CustomConverter object."""
        super().__init__(subsampling_factor=subsampling_factor, dtype=dtype)
        self.use_source_text = use_source_text

    def __call__(self, batch, device=torch.device("cpu")):
        """Pad one minibatch and move it to a device.

        Args:
            batch (list): Single-element list holding ``(xs, ys, ys_src)``.
            device (torch.device): The device to send the tensors to.

        Returns:
            tuple: ``(xs_pad, ilens, ys_pad, ys_pad_src)`` where
                ``ys_pad_src`` is ``None`` unless ``use_source_text`` is set.

        """
        # the real minibatch is wrapped in a one-element list
        assert len(batch) == 1
        xs, ys, ys_src = batch[0]

        def _pad_labels(sequences):
            # token-id sequences are padded with the ignore id
            # (self.ignore_id is presumably set by the parent class)
            as_tensors = [
                torch.from_numpy(np.array(seq, dtype=np.int64)) for seq in sequences
            ]
            return pad_list(as_tensors, self.ignore_id).to(device)

        # number of frames of every input sequence
        frame_counts = np.array([x.shape[0] for x in xs])
        ilens = torch.from_numpy(frame_counts).to(device)

        # features are zero-padded and cast to the configured dtype
        feature_tensors = [torch.from_numpy(x).float() for x in xs]
        xs_pad = pad_list(feature_tensors, 0).to(device, dtype=self.dtype)

        ys_pad = _pad_labels(ys)
        ys_pad_src = _pad_labels(ys_src) if self.use_source_text else None

        return xs_pad, ilens, ys_pad, ys_pad_src


def train(args):
    """Train a speech translation model with the given args.

    The function builds the model (optionally initialised from pre-trained
    encoder/decoder modules), writes ``model.json`` into ``args.outdir``,
    creates the optimizer, the batch converter and the train/validation
    iterators, registers evaluation, plotting, snapshot, decay and reporting
    extensions on a chainer ``Trainer`` and finally runs it.

    Args:
        args (namespace): The program arguments.

    Raises:
        NotImplementedError: If ``args.opt`` is not one of
            ``adadelta``, ``adam`` or ``noam``.
        ImportError: If an apex opt level is requested but apex is missing.

    """
    set_deterministic_pytorch(args)

    # check cuda availability
    if not torch.cuda.is_available():
        logging.warning("cuda is not available")

    # get input and output dimension info
    # (taken from the last shape entry of the first validation utterance)
    with open(args.valid_json, "rb") as f:
        valid_json = json.load(f)["utts"]
    utts = list(valid_json.keys())
    idim = int(valid_json[utts[0]]["input"][0]["shape"][-1])
    odim = int(valid_json[utts[0]]["output"][0]["shape"][-1])
    logging.info("#input dims : " + str(idim))
    logging.info("#output dims: " + str(odim))

    # Initialize with pre-trained ASR encoder and MT decoder
    if args.enc_init is not None or args.dec_init is not None:
        model = load_trained_modules(idim, odim, args, interface=STInterface)
    else:
        model_class = dynamic_import(args.model_module)
        model = model_class(idim, odim, args)
    assert isinstance(model, STInterface)
    total_subsampling_factor = model.get_total_subsampling_factor()

    # write model config
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)
    model_conf = args.outdir + "/model.json"
    with open(model_conf, "wb") as f:
        logging.info("writing a model config file to " + model_conf)
        f.write(
            json.dumps(
                (idim, odim, vars(args)), indent=4, ensure_ascii=False, sort_keys=True
            ).encode("utf_8")
        )
    for key in sorted(vars(args).keys()):
        logging.info("ARGS: " + key + ": " + str(vars(args)[key]))

    reporter = model.reporter

    # check the use of multi-gpu
    # (the configured batch size is multiplied by the number of GPUs)
    if args.ngpu > 1:
        if args.batch_size != 0:
            logging.warning(
                "batch size is automatically increased (%d -> %d)"
                % (args.batch_size, args.batch_size * args.ngpu)
            )
            args.batch_size *= args.ngpu

    # set torch device
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    # plain float dtypes are used as-is; any other value (e.g. an apex opt
    # level) keeps the model in float32 at this point
    if args.train_dtype in ("float16", "float32", "float64"):
        dtype = getattr(torch, args.train_dtype)
    else:
        dtype = torch.float32
    model = model.to(device=device, dtype=dtype)

    logging.warning(
        "num. model params: {:,} (num. trained: {:,} ({:.1f}%))".format(
            sum(p.numel() for p in model.parameters()),
            sum(p.numel() for p in model.parameters() if p.requires_grad),
            sum(p.numel() for p in model.parameters() if p.requires_grad)
            * 100.0
            / sum(p.numel() for p in model.parameters()),
        )
    )

    # Setup an optimizer
    if args.opt == "adadelta":
        optimizer = torch.optim.Adadelta(
            model.parameters(), rho=0.95, eps=args.eps, weight_decay=args.weight_decay
        )
    elif args.opt == "adam":
        optimizer = torch.optim.Adam(
            model.parameters(), lr=args.lr, weight_decay=args.weight_decay
        )
    elif args.opt == "noam":
        from espnet.nets.pytorch_backend.transformer.optimizer import get_std_opt

        optimizer = get_std_opt(
            model.parameters(),
            args.adim,
            args.transformer_warmup_steps,
            args.transformer_lr,
        )
    else:
        raise NotImplementedError("unknown optimizer: " + args.opt)

    # setup apex.amp
    if args.train_dtype in ("O0", "O1", "O2", "O3"):
        try:
            from apex import amp
        except ImportError as e:
            logging.error(
                f"You need to install apex for --train-dtype {args.train_dtype}. "
                "See https://github.com/NVIDIA/apex#linux"
            )
            raise e
        # the noam optimizer keeps the underlying torch optimizer in its
        # `.optimizer` attribute, so that inner object is the one handed to amp
        if args.opt == "noam":
            model, optimizer.optimizer = amp.initialize(
                model, optimizer.optimizer, opt_level=args.train_dtype
            )
        else:
            model, optimizer = amp.initialize(
                model, optimizer, opt_level=args.train_dtype
            )
        use_apex = True
    else:
        use_apex = False

    # FIXME: TOO DIRTY HACK
    # attach the reporter to the optimizer as `target` / `serialize`
    # (presumably so chainer's trainer machinery can treat it like a chainer
    # optimizer -- TODO confirm)
    setattr(optimizer, "target", reporter)
    setattr(optimizer, "serialize", lambda s: reporter.serialize(s))

    # Setup a converter
    # source text is only needed when an auxiliary ASR or MT loss is enabled
    converter = CustomConverter(
        subsampling_factor=model.subsample[0],
        dtype=dtype,
        use_source_text=args.asr_weight > 0 or args.mt_weight > 0,
    )

    # read json data
    # NOTE(review): valid_json was already loaded above for the dimension
    # info; it is read from disk a second time here.
    with open(args.train_json, "rb") as f:
        train_json = json.load(f)["utts"]
    with open(args.valid_json, "rb") as f:
        valid_json = json.load(f)["utts"]

    # sortagrad is active for -1 or any positive value
    use_sortagrad = args.sortagrad == -1 or args.sortagrad > 0
    # make minibatch list (variable length)
    train = make_batchset(
        train_json,
        args.batch_size,
        args.maxlen_in,
        args.maxlen_out,
        args.minibatches,
        min_batch_size=args.ngpu if args.ngpu > 1 else 1,
        shortest_first=use_sortagrad,
        count=args.batch_count,
        batch_bins=args.batch_bins,
        batch_frames_in=args.batch_frames_in,
        batch_frames_out=args.batch_frames_out,
        batch_frames_inout=args.batch_frames_inout,
        iaxis=0,
        oaxis=0,
    )
    valid = make_batchset(
        valid_json,
        args.batch_size,
        args.maxlen_in,
        args.maxlen_out,
        args.minibatches,
        min_batch_size=args.ngpu if args.ngpu > 1 else 1,
        count=args.batch_count,
        batch_bins=args.batch_bins,
        batch_frames_in=args.batch_frames_in,
        batch_frames_out=args.batch_frames_out,
        batch_frames_inout=args.batch_frames_inout,
        iaxis=0,
        oaxis=0,
    )

    # the two loaders differ only in the "train" preprocessing flag
    load_tr = LoadInputsAndTargets(
        mode="asr",
        load_output=True,
        preprocess_conf=args.preprocess_conf,
        preprocess_args={"train": True},  # Switch the mode of preprocessing
    )
    load_cv = LoadInputsAndTargets(
        mode="asr",
        load_output=True,
        preprocess_conf=args.preprocess_conf,
        preprocess_args={"train": False},  # Switch the mode of preprocessing
    )
    # hack to make batchsize argument as 1
    # actual batchsize is included in a list
    # default collate function converts numpy array to pytorch tensor
    # we used an empty collate function instead which returns list
    train_iter = ChainerDataLoader(
        dataset=TransformDataset(train, lambda data: converter([load_tr(data)])),
        batch_size=1,
        num_workers=args.n_iter_processes,
        shuffle=not use_sortagrad,
        collate_fn=lambda x: x[0],
    )
    valid_iter = ChainerDataLoader(
        dataset=TransformDataset(valid, lambda data: converter([load_cv(data)])),
        batch_size=1,
        shuffle=False,
        collate_fn=lambda x: x[0],
        num_workers=args.n_iter_processes,
    )

    # Set up a trainer
    updater = CustomUpdater(
        model,
        args.grad_clip,
        {"main": train_iter},
        optimizer,
        device,
        args.ngpu,
        args.grad_noise,
        args.accum_grad,
        use_apex=use_apex,
    )
    trainer = training.Trainer(updater, (args.epochs, "epoch"), out=args.outdir)

    # with sortagrad the iterator starts unshuffled; ShufflingEnabler is
    # scheduled after `args.sortagrad` epochs, or after the final epoch
    # when the value is -1
    if use_sortagrad:
        trainer.extend(
            ShufflingEnabler([train_iter]),
            trigger=(args.sortagrad if args.sortagrad != -1 else args.epochs, "epoch"),
        )

    # Resume from a snapshot
    if args.resume:
        logging.info("resumed from %s" % args.resume)
        torch_resume(args.resume, trainer)

    # Evaluate the model with the test dataset for each epoch
    # (or every `save_interval_iters` iterations when that is positive;
    # otherwise the extension's default trigger is used)
    if args.save_interval_iters > 0:
        trainer.extend(
            CustomEvaluator(model, {"main": valid_iter}, reporter, device, args.ngpu),
            trigger=(args.save_interval_iters, "iteration"),
        )
    else:
        trainer.extend(
            CustomEvaluator(model, {"main": valid_iter}, reporter, device, args.ngpu)
        )

    # Save attention weight at each epoch
    if args.num_save_attention > 0:
        # NOTE(review): this sorts by input shape[1], whereas the CTC block
        # below sorts by output shape[0] -- confirm the intended axis.
        data = sorted(
            list(valid_json.items())[: args.num_save_attention],
            key=lambda x: int(x[1]["input"][0]["shape"][1]),
            reverse=True,
        )
        # unwrap a wrapped model (one exposing `.module`) if necessary
        if hasattr(model, "module"):
            att_vis_fn = model.module.calculate_all_attentions
            plot_class = model.module.attention_plot_class
        else:
            att_vis_fn = model.calculate_all_attentions
            plot_class = model.attention_plot_class
        att_reporter = plot_class(
            att_vis_fn,
            data,
            args.outdir + "/att_ws",
            converter=converter,
            transform=load_cv,
            device=device,
            subsampling_factor=total_subsampling_factor,
        )
        trainer.extend(att_reporter, trigger=(1, "epoch"))
    else:
        att_reporter = None

    # Save CTC prob at each epoch
    # (only when the auxiliary ASR task uses a CTC loss)
    if (args.asr_weight > 0 and args.mtlalpha > 0) and args.num_save_ctc > 0:
        # NOTE: sort it by output lengths
        data = sorted(
            list(valid_json.items())[: args.num_save_ctc],
            key=lambda x: int(x[1]["output"][0]["shape"][0]),
            reverse=True,
        )
        if hasattr(model, "module"):
            ctc_vis_fn = model.module.calculate_all_ctc_probs
            plot_class = model.module.ctc_plot_class
        else:
            ctc_vis_fn = model.calculate_all_ctc_probs
            plot_class = model.ctc_plot_class
        ctc_reporter = plot_class(
            ctc_vis_fn,
            data,
            args.outdir + "/ctc_prob",
            converter=converter,
            transform=load_cv,
            device=device,
            subsampling_factor=total_subsampling_factor,
        )
        trainer.extend(ctc_reporter, trigger=(1, "epoch"))
    else:
        ctc_reporter = None

    # Make a plot for training and validation values
    trainer.extend(
        extensions.PlotReport(
            [
                "main/loss",
                "validation/main/loss",
                "main/loss_asr",
                "validation/main/loss_asr",
                "main/loss_mt",
                "validation/main/loss_mt",
                "main/loss_st",
                "validation/main/loss_st",
            ],
            "epoch",
            file_name="loss.png",
        )
    )
    trainer.extend(
        extensions.PlotReport(
            [
                "main/acc",
                "validation/main/acc",
                "main/acc_asr",
                "validation/main/acc_asr",
                "main/acc_mt",
                "validation/main/acc_mt",
            ],
            "epoch",
            file_name="acc.png",
        )
    )
    trainer.extend(
        extensions.PlotReport(
            ["main/bleu", "validation/main/bleu"], "epoch", file_name="bleu.png"
        )
    )

    # Save best models
    trainer.extend(
        snapshot_object(model, "model.loss.best"),
        trigger=training.triggers.MinValueTrigger("validation/main/loss"),
    )
    trainer.extend(
        snapshot_object(model, "model.acc.best"),
        trigger=training.triggers.MaxValueTrigger("validation/main/acc"),
    )

    # save snapshot which contains model and optimizer states
    if args.save_interval_iters > 0:
        trainer.extend(
            torch_snapshot(filename="snapshot.iter.{.updater.iteration}"),
            trigger=(args.save_interval_iters, "iteration"),
        )
    else:
        trainer.extend(torch_snapshot(), trigger=(1, "epoch"))

    # epsilon decay in the optimizer
    # For adadelta/adam, two extensions share the same comparison on the
    # chosen validation metric: one restores the best saved model, the other
    # decays eps (adadelta) or lr (adam). The comparison is "best > current"
    # for accuracy and "best < current" for loss (presumably the trigger
    # fires when it holds, i.e. when the metric got worse -- TODO confirm
    # against CompareValueTrigger).
    if args.opt == "adadelta":
        if args.criterion == "acc":
            trainer.extend(
                restore_snapshot(
                    model, args.outdir + "/model.acc.best", load_fn=torch_load
                ),
                trigger=CompareValueTrigger(
                    "validation/main/acc",
                    lambda best_value, current_value: best_value > current_value,
                ),
            )
            trainer.extend(
                adadelta_eps_decay(args.eps_decay),
                trigger=CompareValueTrigger(
                    "validation/main/acc",
                    lambda best_value, current_value: best_value > current_value,
                ),
            )
        elif args.criterion == "loss":
            trainer.extend(
                restore_snapshot(
                    model, args.outdir + "/model.loss.best", load_fn=torch_load
                ),
                trigger=CompareValueTrigger(
                    "validation/main/loss",
                    lambda best_value, current_value: best_value < current_value,
                ),
            )
            trainer.extend(
                adadelta_eps_decay(args.eps_decay),
                trigger=CompareValueTrigger(
                    "validation/main/loss",
                    lambda best_value, current_value: best_value < current_value,
                ),
            )
    elif args.opt == "adam":
        if args.criterion == "acc":
            trainer.extend(
                restore_snapshot(
                    model, args.outdir + "/model.acc.best", load_fn=torch_load
                ),
                trigger=CompareValueTrigger(
                    "validation/main/acc",
                    lambda best_value, current_value: best_value > current_value,
                ),
            )
            trainer.extend(
                adam_lr_decay(args.lr_decay),
                trigger=CompareValueTrigger(
                    "validation/main/acc",
                    lambda best_value, current_value: best_value > current_value,
                ),
            )
        elif args.criterion == "loss":
            trainer.extend(
                restore_snapshot(
                    model, args.outdir + "/model.loss.best", load_fn=torch_load
                ),
                trigger=CompareValueTrigger(
                    "validation/main/loss",
                    lambda best_value, current_value: best_value < current_value,
                ),
            )
            trainer.extend(
                adam_lr_decay(args.lr_decay),
                trigger=CompareValueTrigger(
                    "validation/main/loss",
                    lambda best_value, current_value: best_value < current_value,
                ),
            )

    # Write a log of evaluation statistics for each epoch
    trainer.extend(
        extensions.LogReport(trigger=(args.report_interval_iters, "iteration"))
    )
    # columns shown by PrintReport; optional ones are appended below
    # depending on the enabled tasks and options
    report_keys = [
        "epoch",
        "iteration",
        "main/loss",
        "main/loss_st",
        "main/loss_asr",
        "validation/main/loss",
        "validation/main/loss_st",
        "validation/main/loss_asr",
        "main/acc",
        "validation/main/acc",
    ]
    if args.asr_weight > 0:
        report_keys.append("main/acc_asr")
        report_keys.append("validation/main/acc_asr")
    report_keys += ["elapsed_time"]
    # also observe the optimizer hyper-parameter that is decayed/scheduled:
    # eps for adadelta, lr for adam and noam (read from the first param group)
    if args.opt == "adadelta":
        trainer.extend(
            extensions.observe_value(
                "eps",
                lambda trainer: trainer.updater.get_optimizer("main").param_groups[0][
                    "eps"
                ],
            ),
            trigger=(args.report_interval_iters, "iteration"),
        )
        report_keys.append("eps")
    elif args.opt in ["adam", "noam"]:
        trainer.extend(
            extensions.observe_value(
                "lr",
                lambda trainer: trainer.updater.get_optimizer("main").param_groups[0][
                    "lr"
                ],
            ),
            trigger=(args.report_interval_iters, "iteration"),
        )
        report_keys.append("lr")
    if args.asr_weight > 0:
        if args.mtlalpha > 0:
            report_keys.append("main/cer_ctc")
            report_keys.append("validation/main/cer_ctc")
        if args.mtlalpha < 1:
            if args.report_cer:
                report_keys.append("validation/main/cer")
            if args.report_wer:
                report_keys.append("validation/main/wer")
    if args.report_bleu:
        report_keys.append("main/bleu")
        report_keys.append("validation/main/bleu")
    trainer.extend(
        extensions.PrintReport(report_keys),
        trigger=(args.report_interval_iters, "iteration"),
    )

    trainer.extend(extensions.ProgressBar(update_interval=args.report_interval_iters))
    set_early_stop(trainer, args)

    # optional tensorboard logging; the attention/CTC reporters are passed
    # along and are None when the corresponding plots are disabled
    if args.tensorboard_dir is not None and args.tensorboard_dir != "":
        trainer.extend(
            TensorboardLogger(
                SummaryWriter(args.tensorboard_dir),
                att_reporter=att_reporter,
                ctc_reporter=ctc_reporter,
            ),
            trigger=(args.report_interval_iters, "iteration"),
        )
    # Run the training
    trainer.run()
    check_early_stop(trainer, args.epochs)


def trans(args):
    """Decode (translate) with the given args.

    Loads the trained model from ``args.model``, translates every utterance
    listed in ``args.trans_json`` (one by one when ``args.batchsize == 0``,
    otherwise in groups of ``args.batchsize``) and writes the n-best results
    as UTF-8 JSON to ``args.result_label``.

    Args:
        args (namespace): The program arguments.

    """
    set_deterministic_pytorch(args)
    model, train_args = load_trained_model(args.model)
    assert isinstance(model, STInterface)
    model.trans_args = args

    # gpu
    if args.ngpu == 1:
        gpu_id = list(range(args.ngpu))
        logging.info("gpu id: " + str(gpu_id))
        model.cuda()

    # read json data
    with open(args.trans_json, "rb") as f:
        js = json.load(f)["utts"]
    new_js = {}

    # the preprocessing config given at decoding time takes precedence over
    # the one stored with the training arguments
    load_inputs_and_targets = LoadInputsAndTargets(
        mode="asr",
        load_output=False,
        sort_in_input_length=False,
        preprocess_conf=train_args.preprocess_conf
        if args.preprocess_conf is None
        else args.preprocess_conf,
        preprocess_args={"train": False},
    )

    if args.batchsize == 0:
        with torch.no_grad():
            for idx, name in enumerate(js.keys(), 1):
                # pass the utterance id as a logging argument instead of
                # concatenating it into the format string, so that a "%" in
                # the id cannot break the message formatting
                logging.info("(%d/%d) decoding %s", idx, len(js.keys()), name)
                batch = [(name, js[name])]
                feat = load_inputs_and_targets(batch)[0][0]
                nbest_hyps = model.translate(
                    feat,
                    args,
                    train_args.char_list,
                )
                new_js[name] = add_results_to_json(
                    js[name], nbest_hyps, train_args.char_list
                )

    else:

        def grouper(n, iterable, fillvalue=None):
            # yield successive n-tuples; the last one is padded with fillvalue
            kargs = [iter(iterable)] * n
            return zip_longest(*kargs, fillvalue=fillvalue)

        # sort data if batchsize > 1
        # (longest input first)
        keys = list(js.keys())
        if args.batchsize > 1:
            feat_lens = [js[key]["input"][0]["shape"][0] for key in keys]
            sorted_index = sorted(range(len(feat_lens)), key=lambda i: -feat_lens[i])
            keys = [keys[i] for i in sorted_index]

        with torch.no_grad():
            for names in grouper(args.batchsize, keys, None):
                # drop the padding entries of the final, shorter group
                names = [name for name in names if name]
                batch = [(name, js[name]) for name in names]
                feats = load_inputs_and_targets(batch)[0]
                nbest_hyps = model.translate_batch(
                    feats,
                    args,
                    train_args.char_list,
                )

                for i, nbest_hyp in enumerate(nbest_hyps):
                    name = names[i]
                    new_js[name] = add_results_to_json(
                        js[name], nbest_hyp, train_args.char_list
                    )

    with open(args.result_label, "wb") as f:
        f.write(
            json.dumps(
                {"utts": new_js}, indent=4, ensure_ascii=False, sort_keys=True
            ).encode("utf_8")
        )


================================================
FILE: transform/__init__.py
================================================
"""Initialize main package."""


================================================
FILE: transform/add_deltas.py
================================================
import numpy as np


def delta(feat, window):
    """Compute regression (delta) coefficients along the first axis.

    For every offset ``k`` in ``1..window`` the weighted difference
    ``k * (feat[t + k] - feat[t - k])`` is accumulated, with positions beyond
    either end replaced by the first/last entry, and the sum is divided by
    ``2 * sum(k ** 2)``.

    Args:
        feat (numpy.ndarray): Input features, indexed by time on axis 0.
        window (int): Number of neighbours on each side; must be positive.

    Returns:
        numpy.ndarray: Array with the same shape and dtype as ``feat``.

    """
    assert window > 0
    normalizer = 2 * sum(k * k for k in range(1, window + 1))
    head, tail = feat[0], feat[-1]
    acc = np.zeros_like(feat)
    for k in range(1, window + 1):
        # interior contributions: +k * x[t + k] and -k * x[t - k]
        acc[:-k] += k * feat[k:]
        acc[k:] -= k * feat[:-k]
        # boundary contributions: reuse the last / first entry
        acc[-k:] += k * tail
        acc[:k] -= k * head
    acc /= normalizer
    return acc


def add_deltas(x, window=2, order=2):
    """Append successive delta features to ``x`` along axis 1.

    Args:
        x (numpy.ndarray): Input features.
        window (int): Window passed to :func:`delta`.
        order (int): How many times ``delta`` is applied; each application
            works on the previous result.

    Returns:
        numpy.ndarray: ``x`` followed by its ``order`` delta streams,
            concatenated on axis 1.

    """
    streams = [x]
    current = x
    for _ in range(order):
        current = delta(current, window)
        streams.append(current)
    return np.concatenate(streams, axis=1)


class AddDeltas(object):
    """Callable transform that appends delta features via ``add_deltas``.

    Args:
        window (int): Window forwarded to ``add_deltas``.
        order (int): Order forwarded to ``add_deltas``.

    """

    def __init__(self, window=2, order=2):
        self.window = window
        self.order = order

    def __repr__(self):
        # The closing parenthesis was missing from the original template.
        return "{name}(window={window}, order={order})".format(
            name=self.__class__.__name__, window=self.window, order=self.order
        )

    def __call__(self, x):
        """Return ``x`` with its delta features appended."""
        return add_deltas(x, window=self.window, order=self.order)


================================================
FILE: transform/channel_selector.py
================================================
import numpy


class ChannelSelector(object):
    """Pick a single channel out of a multi-channel signal.

    Args:
        train_channel: Channel index used when ``train`` is true, or the
            string ``"random"`` to draw one uniformly at each call.
        eval_channel: Same, used when ``train`` is false.
        axis (int): Axis that holds the channels.

    """

    def __init__(self, train_channel="random", eval_channel=0, axis=1):
        self.train_channel = train_channel
        self.eval_channel = eval_channel
        self.axis = axis

    def __repr__(self):
        template = (
            "{name}(train_channel={train_channel}, "
            "eval_channel={eval_channel}, axis={axis})"
        )
        return template.format(
            name=self.__class__.__name__,
            train_channel=self.train_channel,
            eval_channel=self.eval_channel,
            axis=self.axis,
        )

    def __call__(self, x, train=True):
        """Return the selected channel of ``x``.

        ``x`` is [Time, Channel] with the default axis.
        """
        missing = self.axis + 1 - x.ndim
        if missing > 0:
            # Too few dimensions: append singleton axes
            # (e.g. [Time] -> [Time, 1])
            expand = (slice(None),) * x.ndim + (None,) * missing
            x = x[expand]

        channel = self.train_channel if train else self.eval_channel

        if channel == "random":
            ch = numpy.random.randint(0, x.shape[self.axis])
        else:
            ch = channel

        selector = tuple(
            ch if dim == self.axis else slice(None) for dim in range(x.ndim)
        )
        return x[selector]


================================================
FILE: transform/cmvn.py
================================================
import io

import h5py
import kaldiio
import numpy as np


class CMVN(object):
    """Apply (or undo) cepstral mean/variance normalisation from saved stats.

    Args:
        stats: Either a dict mapping a speaker key to a stats matrix, or a
            path to a stats file whose format is given by ``filetype``.
        norm_means (bool): Subtract the mean.
        norm_vars (bool): Divide by the standard deviation.
        filetype (str): ``"mat"`` or ``"npy"`` (one global stats matrix,
            stored under the key ``None``), ``"ark"`` or ``"hdf5"``
            (one stats matrix per key).
        utt2spk (str): Optional path of an "utt spk" mapping file.
        spk2utt (str): Optional path of a "spk utt1 utt2 ..." mapping file,
            used only when ``utt2spk`` is not given.
        reverse (bool): Undo the normalisation instead of applying it.
        std_floor (float): Lower bound for the standard deviation.

    Raises:
        ValueError: If ``filetype`` is not supported.

    """

    def __init__(
        self,
        stats,
        norm_means=True,
        norm_vars=False,
        filetype="mat",
        utt2spk=None,
        spk2utt=None,
        reverse=False,
        std_floor=1.0e-20,
    ):
        self.stats_file = stats
        self.norm_means = norm_means
        self.norm_vars = norm_vars
        self.reverse = reverse

        if isinstance(stats, dict):
            stats_dict = dict(stats)
        else:
            # Use for global CMVN
            if filetype == "mat":
                stats_dict = {None: kaldiio.load_mat(stats)}
            # Use for global CMVN
            elif filetype == "npy":
                stats_dict = {None: np.load(stats)}
            # Use for speaker CMVN
            elif filetype == "ark":
                # presumably tells the caller to pass uttid -- TODO confirm
                self.accept_uttid = True
                stats_dict = dict(kaldiio.load_ark(stats))
            # Use for speaker CMVN
            elif filetype == "hdf5":
                self.accept_uttid = True
                # Open read-only, copy every dataset into memory and close
                # the file, instead of keeping a handle open for the whole
                # lifetime of the process.
                with h5py.File(stats, "r") as f:
                    stats_dict = {key: value[()] for key, value in f.items()}
            else:
                raise ValueError("Not supporting filetype={}".format(filetype))

        if utt2spk is not None:
            self.utt2spk = {}
            with io.open(utt2spk, "r", encoding="utf-8") as f:
                for line in f:
                    utt, spk = line.rstrip().split(None, 1)
                    self.utt2spk[utt] = spk
        elif spk2utt is not None:
            self.utt2spk = {}
            with io.open(spk2utt, "r", encoding="utf-8") as f:
                for line in f:
                    spk, utts = line.rstrip().split(None, 1)
                    for utt in utts.split():
                        self.utt2spk[utt] = spk
        else:
            self.utt2spk = None

        # Kaldi makes a matrix for CMVN which has a shape of (2, feat_dim + 1),
        # and the first vector contains the sum of feats and the second is
        # the sum of squares. The last value of the first, i.e. stats[0,-1],
        # is the number of samples for this statistics.
        self.bias = {}
        self.scale = {}
        for spk, spk_stats in stats_dict.items():
            assert len(spk_stats) == 2, spk_stats.shape

            count = spk_stats[0, -1]

            # If the feature has two or more dimensions
            if not (np.isscalar(count) or isinstance(count, (int, float))):
                # The first is only used
                count = count.flatten()[0]

            mean = spk_stats[0, :-1] / count
            # V(x) = E(x^2) - (E(x))^2
            var = spk_stats[1, :-1] / count - mean * mean
            std = np.maximum(np.sqrt(var), std_floor)
            # stored as additive bias and multiplicative scale
            self.bias[spk] = -mean
            self.scale[spk] = 1 / std

    def __repr__(self):
        return (
            "{name}(stats_file={stats_file}, "
            "norm_means={norm_means}, norm_vars={norm_vars}, "
            "reverse={reverse})".format(
                name=self.__class__.__name__,
                stats_file=self.stats_file,
                norm_means=self.norm_means,
                norm_vars=self.norm_vars,
                reverse=self.reverse,
            )
        )

    def __call__(self, x, uttid=None):
        """Normalise ``x`` (or undo the normalisation when ``reverse`` is set).

        Args:
            x (numpy.ndarray): Features to transform.
            uttid: Utterance id; mapped through ``utt2spk`` when a mapping
                was loaded, otherwise used directly as the stats key.

        Returns:
            numpy.ndarray: The transformed features.

        """
        if self.utt2spk is not None:
            spk = self.utt2spk[uttid]
        else:
            spk = uttid

        if not self.reverse:
            if self.norm_means:
                x = np.add(x, self.bias[spk])
            if self.norm_vars:
                x = np.multiply(x, self.scale[spk])

        else:
            # inverse order of the forward transform
            if self.norm_vars:
                x = np.divide(x, self.scale[spk])
            if self.norm_means:
                x = np.subtract(x, self.bias[spk])

        return x


class UtteranceCMVN(object):
    """Mean/variance normalisation computed from the utterance itself.

    Args:
        norm_means (bool): Subtract the per-dimension mean.
        norm_vars (bool): Divide by the per-dimension standard deviation.
        std_floor (float): Lower bound for the standard deviation.

    """

    def __init__(self, norm_means=True, norm_vars=False, std_floor=1.0e-20):
        self.norm_means = norm_means
        self.norm_vars = norm_vars
        self.std_floor = std_floor

    def __repr__(self):
        template = "{name}(norm_means={norm_means}, norm_vars={norm_vars})"
        return template.format(
            name=self.__class__.__name__,
            norm_means=self.norm_means,
            norm_vars=self.norm_vars,
        )

    def __call__(self, x, uttid=None):
        """Normalise ``x`` ([Time, Dim]); ``uttid`` is accepted but unused."""
        # statistics always come from the un-normalised input
        mean = x.mean(axis=0)
        square_sums = (x ** 2).sum(axis=0)

        out = x
        if self.norm_means:
            out = np.subtract(out, mean)

        if not self.norm_vars:
            return out

        # V(x) = E(x^2) - (E(x))^2
        var = square_sums / x.shape[0] - mean ** 2
        std = np.maximum(np.sqrt(var), self.std_floor)
        return np.divide(out, std)


================================================
FILE: transform/functional.py
================================================
import inspect

from espnet.transform.transform_interface import TransformInterface
from espnet.utils.check_kwargs import check_kwargs


class FuncTrans(TransformInterface):
    """Functional Transformation

    Wraps a plain function stored in the class attribute ``_func`` as a
    transform object: keyword arguments given at construction are checked
    against the function and replayed on every call.

    WARNING:
        Builtin or C/C++ functions may not work properly
        because this class heavily depends on the `inspect` module.

    Usage:

    >>> def foo_bar(x, a=1, b=2):
    ...     '''Foo bar
    ...     :param x: input
    ...     :param int a: default 1
    ...     :param int b: default 2
    ...     '''
    ...     return x + a - b


    >>> class FooBar(FuncTrans):
    ...     _func = foo_bar
    ...     __doc__ = foo_bar.__doc__
    """

    _func = None

    def __init__(self, **kwargs):
        self.kwargs = kwargs
        check_kwargs(self.func, kwargs)

    def __call__(self, x):
        """Apply the wrapped function to ``x`` with the stored kwargs."""
        return self.func(x, **self.kwargs)

    @classmethod
    def add_arguments(cls, parser):
        """Add one ``--<func>-<param>`` option per defaulted parameter.

        Underscores in the function and parameter names are replaced by
        dashes; each option's default and type come from the parameter's
        default value.
        """
        fname = cls._func.__name__.replace("_", "-")
        group = parser.add_argument_group(fname + " transformation setting")
        for k, v in cls.default_params().items():
            # TODO(karita): get help and choices from docstring?
            attr = k.replace("_", "-")
            group.add_argument(f"--{fname}-{attr}", default=v, type=type(v))
        return parser

    @property
    def func(self):
        # looked up on the class so the function is not bound to the instance
        return type(self)._func

    @classmethod
    def default_params(cls):
        """Return ``{name: default}`` for parameters that have a default."""
        try:
            d = dict(inspect.signature(cls._func).parameters)
        except ValueError:
            d = dict()
        # Compare against the sentinel by identity: `!=` would invoke the
        # default value's own comparison, which is ambiguous for defaults
        # such as numpy arrays.
        return {
            k: v.default
            for k, v in d.items()
            if v.default is not inspect.Parameter.empty
        }

    def __repr__(self):
        # defaults first, overridden by the kwargs given at construction
        params = self.default_params()
        params.update(**self.kwargs)
        ret = self.__class__.__name__ + "("
        if len(params) == 0:
            return ret + ")"
        for k, v in params.items():
            ret += "{}={}, ".format(k, v)
        return ret[:-2] + ")"


================================================
FILE: transform/perturb.py
================================================
import librosa
import numpy
import scipy
import soundfile

from espnet.utils.io_utils import SoundHDF5File


class SpeedPerturbation(object):
    """SpeedPerturbation

    The speed perturbation in kaldi uses sox-speed instead of sox-tempo,
    and sox-speed just resamples the input,
    i.e. pitch and tempo are both changed.

    "Why use speed option instead of tempo -s in SoX for speed perturbation"
    https://groups.google.com/forum/#!topic/kaldi-help/8OOG7eE4sZ8

    Warning:
        This function is very slow because of resampling.
        I recommend to apply speed-perturb outside the training using sox.

    :param float lower: lower bound of the random ratio (random mode only)
    :param float upper: upper bound of the random ratio (random mode only)
    :param str utt2ratio: path of a text file with "<uttid> <ratio>" lines;
        when given, the scheduled ratio is used instead of a random one
    :param bool keep_length: crop / zero-pad the output to the input length
    :param str res_type: resampling type forwarded to ``librosa.resample``
    :param seed: seed of the private ``numpy.random.RandomState``
    """

    def __init__(
        self,
        lower=0.9,
        upper=1.1,
        utt2ratio=None,
        keep_length=True,
        res_type="kaiser_best",
        seed=None,
    ):
        self.res_type = res_type
        self.keep_length = keep_length
        self.state = numpy.random.RandomState(seed)
        self.utt2ratio_file = utt2ratio

        if utt2ratio is not None:
            self.utt2ratio = {}
            # Use the scheduled ratio for each utterances
            self.lower = None
            self.upper = None
            self.accept_uttid = True

            with open(utt2ratio, "r") as f:
                for line in f:
                    utt, ratio = line.rstrip().split(None, 1)
                    ratio = float(ratio)
                    self.utt2ratio[utt] = ratio
        else:
            self.utt2ratio = None
            # The ratio is given on runtime randomly
            self.lower = lower
            self.upper = upper
            # Must be defined in this mode too: __call__ branches on it.
            self.accept_uttid = False

    def __repr__(self):
        if self.utt2ratio is None:
            return "{}(lower={}, upper={}, " "keep_length={}, res_type={})".format(
                self.__class__.__name__,
                self.lower,
                self.upper,
                self.keep_length,
                self.res_type,
            )
        else:
            return "{}({}, res_type={})".format(
                self.__class__.__name__, self.utt2ratio_file, self.res_type
            )

    def __call__(self, x, uttid=None, train=True):
        """Resample ``x`` by a scheduled or random ratio.

        :param numpy.ndarray x: input signal
        :param str uttid: utterance id, looked up in the ratio table when the
            instance was built with ``utt2ratio``
        :param bool train: when False the input is returned untouched
        :return: the perturbed signal (float32 input to the resampler)
        """
        if not train:
            return x

        x = x.astype(numpy.float32)
        if self.accept_uttid:
            ratio = self.utt2ratio[uttid]
        else:
            ratio = self.state.uniform(self.lower, self.upper)

        # Note1: resample requires the sampling-rate of input and output,
        #        but actually only the ratio is used.
        # Keyword arguments are used because recent librosa releases no
        # longer accept orig_sr / target_sr positionally.
        y = librosa.resample(x, orig_sr=ratio, target_sr=1, res_type=self.res_type)

        if self.keep_length:
            diff = abs(len(x) - len(y))
            if len(y) > len(x):
                # Truncate symmetrically to the original length
                y = y[diff // 2 : -((diff + 1) // 2)]
            elif len(y) < len(x):
                # Assume the time-axis is the first: (Time, Channel)
                pad_width = [(diff // 2, (diff + 1) // 2)] + [
                    (0, 0) for _ in range(y.ndim - 1)
                ]
                y = numpy.pad(
                    y, pad_width=pad_width, constant_values=0, mode="constant"
                )
        return y


class BandpassPerturbation(object):
    """BandpassPerturbation

    Randomly dropout along the frequency axis.

    The original idea comes from the following:
        "randomly-selected frequency band was cut off under the constraint of
         leaving at least 1,000 Hz band within the range of less than 4,000Hz."
        (The Hitachi/JHU CHiME-5 system: Advances in speech recognition for
         everyday home environments using multiple microphone arrays;
         http://spandh.dcs.shef.ac.uk/chime_workshop/papers/CHiME_2018_paper_kanda.pdf)

    :param float lower: lower bound of the random threshold
    :param float upper: upper bound of the random threshold
    :param seed: seed of the private ``numpy.random.RandomState``
    :param axes: axes along which independent mask values are drawn;
        negative values count from the last axis
    """

    def __init__(self, lower=0.0, upper=0.75, seed=None, axes=(-1,)):
        self.lower = lower
        self.upper = upper
        self.state = numpy.random.RandomState(seed)
        # x_stft: (Time, Channel, Freq)
        self.axes = axes

    def __repr__(self):
        return "{}(lower={}, upper={})".format(
            self.__class__.__name__, self.lower, self.upper
        )

    def __call__(self, x_stft, uttid=None, train=True):
        """Multiply ``x_stft`` in place by a random binary mask.

        :param numpy.ndarray x_stft: (Time, Channel, Freq) or (Time, Freq)
        :param uttid: unused
        :param bool train: when False the input is returned untouched
        :return: the same array object, masked in place
        :raises RuntimeError: if the input is one dimensional
        """
        if not train:
            return x_stft

        if x_stft.ndim == 1:
            raise RuntimeError(
                "Input in time-freq domain: " "(Time, Channel, Freq) or (Time, Freq)"
            )

        ratio = self.state.uniform(self.lower, self.upper)
        # Map negative axes to their non-negative index (ndim + i); the mask
        # keeps the full size along these axes and broadcasts over the rest.
        axes = [i if i >= 0 else x_stft.ndim + i for i in self.axes]
        shape = [s if i in axes else 1 for i, s in enumerate(x_stft.shape)]

        # NOTE(review): the mask is drawn from a standard normal (randn), so
        # `ratio` acts as a threshold on N(0, 1) rather than as a drop
        # probability - confirm this is intended.
        mask = self.state.randn(*shape) > ratio
        x_stft *= mask
        return x_stft


class VolumePerturbation(object):
    """Scale the input by a scheduled or random gain.

    :param float lower: lower bound of the random gain (random mode only)
    :param float upper: upper bound of the random gain (random mode only)
    :param str utt2ratio: path of a text file with "<uttid> <ratio>" lines;
        when given, the scheduled gain is used instead of a random one
    :param bool dbunit: interpret the gain in dB, i.e. apply 10 ** (gain / 20)
    :param seed: seed of the private ``numpy.random.RandomState``
    """

    def __init__(self, lower=-1.6, upper=1.6, utt2ratio=None, dbunit=True, seed=None):
        self.dbunit = dbunit
        self.utt2ratio_file = utt2ratio
        self.lower = lower
        self.upper = upper
        self.state = numpy.random.RandomState(seed)

        if utt2ratio is not None:
            # Use the scheduled ratio for each utterances
            self.utt2ratio = {}
            self.lower = None
            self.upper = None
            self.accept_uttid = True

            with open(utt2ratio, "r") as f:
                for line in f:
                    utt, ratio = line.rstrip().split(None, 1)
                    ratio = float(ratio)
                    self.utt2ratio[utt] = ratio
        else:
            # The ratio is given on runtime randomly
            self.utt2ratio = None
            # Must be defined in this mode too: __call__ branches on it.
            self.accept_uttid = False

    def __repr__(self):
        if self.utt2ratio is None:
            return "{}(lower={}, upper={}, dbunit={})".format(
                self.__class__.__name__, self.lower, self.upper, self.dbunit
            )
        else:
            return '{}("{}", dbunit={})'.format(
                self.__class__.__name__, self.utt2ratio_file, self.dbunit
            )

    def __call__(self, x, uttid=None, train=True):
        """Return ``x`` (cast to float32) multiplied by the gain.

        :param numpy.ndarray x: input signal
        :param str uttid: utterance id, looked up in the gain table when the
            instance was built with ``utt2ratio``
        :param bool train: when False the input is returned untouched
        """
        if not train:
            return x

        x = x.astype(numpy.float32)

        if self.accept_uttid:
            ratio = self.utt2ratio[uttid]
        else:
            ratio = self.state.uniform(self.lower, self.upper)
        if self.dbunit:
            # dB -> linear amplitude factor
            ratio = 10 ** (ratio / 20)
        return x * ratio


class NoiseInjection(object):
    """Add isotropic noise

    The noise is either white noise drawn from the private random state or a
    recording taken from ``utt2noise``; it is scaled relative to the RMS of
    the input signal.

    :param str utt2noise: noise source; a "<uttid> <wav path>" list file when
        ``filetype == "list"`` or a sound HDF5 file when
        ``filetype == "sound.hdf5"``; None selects white noise
    :param float lower: lower bound of the random noise-to-signal ratio
    :param float upper: upper bound of the random noise-to-signal ratio
    :param str utt2ratio: path of a text file with "<uttid> <ratio>" lines
    :param str filetype: "list" or "sound.hdf5"
    :param bool dbunit: interpret the ratio in dB, i.e. use 10 ** (ratio / 20)
    :param seed: seed of the private ``numpy.random.RandomState``
    """

    def __init__(
        self,
        utt2noise=None,
        lower=-20,
        upper=-5,
        utt2ratio=None,
        filetype="list",
        dbunit=True,
        seed=None,
    ):
        self.utt2noise_file = utt2noise
        self.utt2ratio_file = utt2ratio
        self.filetype = filetype
        self.dbunit = dbunit
        self.lower = lower
        self.upper = upper
        self.state = numpy.random.RandomState(seed)

        if utt2ratio is not None:
            # Use the scheduled ratio for each utterances
            self.utt2ratio = {}
            # Read the ratio table itself (not the noise list).
            with open(utt2ratio, "r") as f:
                for line in f:
                    utt, snr = line.rstrip().split(None, 1)
                    snr = float(snr)
                    self.utt2ratio[utt] = snr
        else:
            # The ratio is given on runtime randomly
            self.utt2ratio = None

        if utt2noise is not None:
            self.utt2noise = {}
            if filetype == "list":
                with open(utt2noise, "r") as f:
                    for line in f:
                        utt, filename = line.rstrip().split(None, 1)
                        signal, rate = soundfile.read(filename, dtype="int16")
                        # Load all files in memory
                        self.utt2noise[utt] = (signal, rate)

            elif filetype == "sound.hdf5":
                self.utt2noise = SoundHDF5File(utt2noise, "r")
            else:
                raise ValueError(filetype)
        else:
            self.utt2noise = None

        if utt2noise is not None and utt2ratio is not None:
            if set(self.utt2ratio) != set(self.utt2noise):
                raise RuntimeError(
                    "The uttids mismatch between {} and {}".format(utt2ratio, utt2noise)
                )

    def __repr__(self):
        if self.utt2ratio is None:
            return "{}(lower={}, upper={}, dbunit={})".format(
                self.__class__.__name__, self.lower, self.upper, self.dbunit
            )
        else:
            return '{}("{}", dbunit={})'.format(
                self.__class__.__name__, self.utt2ratio_file, self.dbunit
            )

    def __call__(self, x, uttid=None, train=True):
        """Return ``x`` (cast to float32) plus scaled noise.

        :param numpy.ndarray x: input signal
        :param str uttid: utterance id used to look up the scheduled ratio
            and/or the noise recording; None selects them randomly
        :param bool train: when False the input is returned untouched
        """
        if not train:
            return x
        x = x.astype(numpy.float32)

        # 1. Get ratio of noise to signal in sound pressure level
        if uttid is not None and self.utt2ratio is not None:
            ratio = self.utt2ratio[uttid]
        else:
            ratio = self.state.uniform(self.lower, self.upper)

        if self.dbunit:
            ratio = 10 ** (ratio / 20)
        scale = ratio * numpy.sqrt((x ** 2).mean())

        # 2. Get noise
        if self.utt2noise is not None:
            # Get noise from the external source
            if uttid is not None:
                noise, rate = self.utt2noise[uttid]
            else:
                # Randomly select the noise source by key; entries are
                # (signal, rate) pairs, so they cannot be sampled directly.
                # NOTE(review): assumes the HDF5 wrapper offers keys() like
                # a dict - confirm against SoundHDF5File.
                keys = list(self.utt2noise.keys())
                noise, rate = self.utt2noise[keys[self.state.randint(len(keys))]]

            # Work in float32: avoids int16 overflow in the power computation
            # and keeps the cached recording unmodified.
            noise = numpy.asarray(noise, dtype=numpy.float32)
            # Normalize the level (skip all-zero recordings to avoid 0 / 0)
            rms = numpy.sqrt((noise ** 2).mean())
            if rms > 0:
                noise = noise / rms

            # Adjust the noise length
            diff = abs(len(x) - len(noise))
            # The upper bound of randint is exclusive; diff + 1 also covers
            # the case where both lengths already match.
            offset = self.state.randint(0, diff + 1)
            if len(noise) > len(x):
                # Truncate noise
                noise = noise[offset : offset + len(x)]
            elif len(noise) < len(x):
                # Repeat the noise along the first (time) axis only
                pad_width = [(offset, diff - offset)] + [
                    (0, 0) for _ in range(noise.ndim - 1)
                ]
                noise = numpy.pad(noise, pad_width=pad_width, mode="wrap")

        else:
            # Generate white noise
            noise = self.state.normal(0, 1, x.shape)

        # 3. Add noise to signal
        return x + noise * scale


class RIRConvolve(object):
    """Convolve a single-channel signal with a room impulse response (RIR).

    :param str utt2rir: RIR source; a "<uttid> <wav path>" list file when
        ``filetype == "list"`` or a sound HDF5 file when
        ``filetype == "sound.hdf5"``
    :param str filetype: "list" or "sound.hdf5"
    """

    def __init__(self, utt2rir, filetype="list"):
        self.utt2rir_file = utt2rir
        self.filetype = filetype

        self.utt2rir = {}
        if filetype == "list":
            with open(utt2rir, "r") as f:
                for line in f:
                    utt, filename = line.rstrip().split(None, 1)
                    signal, rate = soundfile.read(filename, dtype="int16")
                    self.utt2rir[utt] = (signal, rate)

        elif filetype == "sound.hdf5":
            self.utt2rir = SoundHDF5File(utt2rir, "r")
        else:
            raise NotImplementedError(filetype)

    def __repr__(self):
        return '{}("{}")'.format(self.__class__.__name__, self.utt2rir_file)

    def __call__(self, x, uttid=None, train=True):
        """Return ``x`` convolved with the RIR registered for ``uttid``.

        :param numpy.ndarray x: one dimensional input signal
        :param str uttid: key of the impulse response to use
        :param bool train: when False the input is returned untouched
        :return: (Time,) for a 1-D RIR, (Time, Channel) for a 2-D RIR
        :raises RuntimeError: if ``x`` is not one dimensional
        """
        if not train:
            return x

        x = x.astype(numpy.float32)

        if x.ndim != 1:
            # Must be single channel
            raise RuntimeError(
                "Input x must be one dimensional array, but got {}".format(x.shape)
            )

        rir, rate = self.utt2rir[uttid]
        if rir.ndim == 2:
            # FIXME(kamo): Use chainer.convolution_1d?
            # soundfile returns multi-channel audio as (frames, channels), so
            # iterate over the transposed array to get one filter per channel.
            # NOTE(review): assumes the HDF5 source uses the same layout.
            # return [Time, Channel]
            return numpy.stack(
                [numpy.convolve(x, r, mode="same") for r in rir.T], axis=-1
            )
        else:
            # numpy.convolve is the function the former scipy.convolve
            # alias pointed to.
            return numpy.convolve(x, rir, mode="same")


================================================
FILE: transform/spec_augment.py
================================================
"""Spec Augment module for preprocessing i.e., data augmentation"""

import random

import numpy
from PIL import Image
from PIL.Image import BICUBIC

from espnet.transform.functional import FuncTrans


def time_warp(x, max_time_warp=80, inplace=False, mode="PIL"):
    """Time warping for SpecAugment.

    A random center frame is moved by a random offset within
    ``max_time_warp`` frames; the parts on each side of it are resized
    to fit.

    :param numpy.ndarray x: spectrogram (time, freq)
    :param int max_time_warp: maximum time frames to warp
    :param bool inplace: overwrite x with the result
    :param str mode: "PIL" (default, fast, not differentiable) or "sparse_image_warp"
        (slow, differentiable)
    :returns numpy.ndarray: time warped spectrogram (time, freq)
    """
    if mode == "sparse_image_warp":
        import torch

        from espnet.utils import spec_augment

        # TODO(karita): make this differentiable again
        warped_tensor = spec_augment.time_warp(torch.from_numpy(x), max_time_warp)
        return warped_tensor.numpy()

    if mode != "PIL":
        raise NotImplementedError(
            "unknown resize mode: "
            + mode
            + ", choose one from (PIL, sparse_image_warp)."
        )

    n_frames = x.shape[0]
    # Too short to pick a center with a full window on both sides.
    if n_frames - max_time_warp <= max_time_warp:
        return x

    # NOTE: randrange(a, b) emits a, a + 1, ..., b - 1
    center = random.randrange(max_time_warp, n_frames - max_time_warp)
    # new position of the center frame: 1 ... n_frames - 1
    warped = random.randrange(center - max_time_warp, center + max_time_warp) + 1

    n_bins = x.shape[1]
    head = Image.fromarray(x[:center]).resize((n_bins, warped), BICUBIC)
    tail = Image.fromarray(x[center:]).resize((n_bins, n_frames - warped), BICUBIC)

    if not inplace:
        return numpy.concatenate((head, tail), 0)

    x[:warped] = head
    x[warped:] = tail
    return x


class TimeWarp(FuncTrans):
    _func = time_warp
    __doc__ = time_warp.__doc__

    def __call__(self, x, train):
        """Apply the time warp only when ``train`` is truthy; else return ``x``."""
        return super().__call__(x) if train else x


def freq_mask(x, F=30, n_mask=2, replace_with_zero=True, inplace=False):
    """freq mask for spec augment

    For each of the ``n_mask`` masks two integers are drawn from [0, F):
    the first bounds the random start bin, the second is the mask width.

    :param numpy.ndarray x: (time, freq)
    :param int F: exclusive upper bound of the random integers drawn per mask
    :param int n_mask: the number of masks
    :param bool replace_with_zero: pad zero on mask if true else use mean
    :param bool inplace: overwrite
    :returns numpy.ndarray: the masked array (``x`` itself when inplace)
    """
    cloned = x if inplace else x.copy()

    num_mel_channels = cloned.shape[1]
    fs = numpy.random.randint(0, F, size=(n_mask, 2))

    for f, mask_end in fs:
        # avoid randrange range error when the drawn value is not smaller
        # than the number of bins (same guard as in time_mask)
        if num_mel_channels - f <= 0:
            continue
        f_zero = random.randrange(0, num_mel_channels - f)

        # a zero draw means no mask; the start bin has still been drawn so
        # the random stream stays unchanged
        if f == 0:
            continue

        mask_end += f_zero
        if replace_with_zero:
            cloned[:, f_zero:mask_end] = 0
        else:
            cloned[:, f_zero:mask_end] = cloned.mean()
    return cloned


class FreqMask(FuncTrans):
    _func = freq_mask
    __doc__ = freq_mask.__doc__

    def __call__(self, x, train):
        """Apply the frequency mask only when ``train`` is truthy; else return ``x``."""
        return super().__call__(x) if train else x


def time_mask(spec, T=40, n_mask=2, replace_with_zero=True, inplace=False):
    """time mask for spec augment

    For each of the ``n_mask`` masks two integers are drawn from [0, T):
    the first bounds the random start frame, the second is the mask length.

    :param numpy.ndarray spec: (time, freq)
    :param int T: exclusive upper bound of the random integers drawn per mask
    :param int n_mask: the number of masks
    :param bool replace_with_zero: pad zero on mask if true else use mean
    :param bool inplace: overwrite
    """
    cloned = spec if inplace else spec.copy()
    len_spectro = cloned.shape[0]

    for t, width in numpy.random.randint(0, T, size=(n_mask, 2)):
        remaining = len_spectro - t
        # avoid randrange range error
        if remaining <= 0:
            continue
        start = random.randrange(0, remaining)

        # a zero draw means no mask (the start frame is drawn regardless)
        if start == start + t:
            continue

        stop = width + start
        cloned[start:stop] = 0 if replace_with_zero else cloned.mean()
    return cloned


class TimeMask(FuncTrans):
    _func = time_mask
    __doc__ = time_mask.__doc__

    def __call__(self, x, train):
        """Apply the time mask only when ``train`` is truthy; else return ``x``."""
        return super().__call__(x) if train else x


def spec_augment(
    x,
    resize_mode="PIL",
    max_time_warp=80,
    max_freq_width=27,
    n_freq_mask=2,
    max_time_width=100,
    n_time_mask=2,
    inplace=True,
    replace_with_zero=True,
):
    """spec augment

    apply random time warping, then frequency masking, then time masking
    default setting is based on LD (Librispeech double) in Table 2
        https://arxiv.org/pdf/1904.08779.pdf

    :param numpy.ndarray x: (time, freq)
    :param str resize_mode: "PIL" (fast, nondifferentiable) or "sparse_image_warp"
        (slow, differentiable)
    :param int max_time_warp: maximum frames to warp the center frame in spectrogram (W)
    :param int max_freq_width: maximum width of the random freq mask (F)
    :param int n_freq_mask: the number of the random freq mask (m_F)
    :param int max_time_width: maximum width of the random time mask (T)
    :param int n_time_mask: the number of the random time mask (m_T)
    :param bool inplace: overwrite intermediate array
    :param bool replace_with_zero: pad zero on mask if true else use mean
    """
    assert isinstance(x, numpy.ndarray)
    assert x.ndim == 2

    # Options shared by both masking steps.
    mask_opts = dict(inplace=inplace, replace_with_zero=replace_with_zero)

    warped = time_warp(
        x, max_time_warp=max_time_warp, inplace=inplace, mode=resize_mode
    )
    freq_masked = freq_mask(warped, F=max_freq_width, n_mask=n_freq_mask, **mask_opts)
    return time_mask(freq_masked, T=max_time_width, n_mask=n_time_mask, **mask_opts)


class SpecAugment(FuncTrans):
    _func = spec_augment
    __doc__ = spec_augment.__doc__

    def __call__(self, x, train):
        """Apply SpecAugment only when ``train`` is truthy; else return ``x``."""
        return super().__call__(x) if train else x


================================================
FILE: transform/spectrogram.py
================================================
import librosa
import numpy as np


def stft(
    x, n_fft, n_shift, win_length=None, window="hann", center=True, pad_mode="reflect"
):
    """Compute the STFT of each channel with ``librosa.stft``.

    :param x: signal, [Time] or [Time, Channel]; cast to float32
    :param n_fft: forwarded to ``librosa.stft`` as ``n_fft``
    :param n_shift: forwarded to ``librosa.stft`` as ``hop_length``
    :param win_length: forwarded to ``librosa.stft``
    :param window: forwarded to ``librosa.stft``
    :param center: forwarded to ``librosa.stft``
    :param pad_mode: forwarded to ``librosa.stft``
    :return: [Time, Freq] for 1-D input, otherwise [Time, Channel, Freq]
    """
    single_channel = x.ndim == 1
    if single_channel:
        # x: [Time] -> [Time, Channel]
        x = x[:, None]
    x = x.astype(np.float32)

    # FIXME(kamo): librosa.stft can't use multi-channel?
    per_channel = []
    for ch in range(x.shape[1]):
        spec = librosa.stft(
            x[:, ch],
            n_fft=n_fft,
            hop_length=n_shift,
            win_length=win_length,
            window=window,
            center=center,
            pad_mode=pad_mode,
        )
        # librosa output is transposed so that time becomes the first axis
        per_channel.append(spec.T)

    # x: [Time, Channel, Freq]
    x = np.stack(per_channel, axis=1)

    # x: [Time, Channel, Freq] -> [Time, Freq] for single-channel input
    return x[:, 0] if single_channel else x


def istft(x, n_shift, win_length=None, window="hann", center=True):
    """Invert the STFT of each channel with ``librosa.istft``.

    :param x: STFT, [Time, Freq] or [Time, Channel, Freq]
    :param n_shift: forwarded to ``librosa.istft`` as ``hop_length``
    :param win_length: forwarded to ``librosa.istft``
    :param window: forwarded to ``librosa.istft``
    :param center: forwarded to ``librosa.istft``
    :return: [Time] for 2-D input, otherwise [Time, Channel]
    """
    single_channel = x.ndim == 2
    if single_channel:
        # x: [Time, Freq] -> [Time, Channel, Freq]
        x = x[:, None, :]

    waves = []
    for ch in range(x.shape[1]):
        # [Time, Freq] -> [Freq, Time], the layout passed to librosa
        channel_stft = x[:, ch].T
        waves.append(
            librosa.istft(
                channel_stft,
                hop_length=n_shift,
                win_length=win_length,
                window=window,
                center=center,
            )
        )

    # x: [Time, Channel]
    x = np.stack(waves, axis=1)

    # x: [Time, Channel] -> [Time] for single-channel input
    return x[:, 0] if single_channel else x


def stft2logmelspectrogram(x_stft, fs, n_mels, n_fft, fmin=None, fmax=None, eps=1e-10):
    """Convert an STFT into a log10 mel spectrogram.

    :param x_stft: (Time, Channel, Freq) or (Time, Freq)
    :param fs: sampling rate used to build the mel filter bank
    :param n_mels: number of mel bins
    :param n_fft: FFT size used to build the mel filter bank
    :param fmin: lowest filter frequency; None means 0
    :param fmax: highest filter frequency; None means ``fs / 2``
    :param eps: floor applied before the logarithm
    :return: (Time, Channel, Mel_freq) or (Time, Mel_freq)
    """
    # x_stft: (Time, Channel, Freq) or (Time, Freq)
    fmin = 0 if fmin is None else fmin
    fmax = fs / 2 if fmax is None else fmax

    # spc: (Time, Channel, Freq) or (Time, Freq)
    spc = np.abs(x_stft)
    # mel_basis: (Mel_freq, Freq)
    # Keyword arguments are used because recent librosa releases no longer
    # accept these parameters positionally.
    mel_basis = librosa.filters.mel(
        sr=fs, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax
    )
    # lmspc: (Time, Channel, Mel_freq) or (Time, Mel_freq)
    lmspc = np.log10(np.maximum(eps, np.dot(spc, mel_basis.T)))

    return lmspc


def spectrogram(x, n_fft, n_shift, win_length=None, window="hann"):
    """Return the magnitude of the STFT of ``x``.

    x: (Time, Channel) -> (Time, Channel, Freq)
    """
    x_stft = stft(x, n_fft, n_shift, win_length, window=window)
    return np.abs(x_stft)


def logmelspectrogram(
    x,
    fs,
    n_mels,
    n_fft,
    n_shift,
    win_length=None,
    window="hann",
    fmin=None,
    fmax=None,
    eps=1e-10,
    pad_mode="reflect",
):
    """Compute a log mel spectrogram: ``stft`` then ``stft2logmelspectrogram``.

    ``n_fft``, ``n_shift``, ``win_length``, ``window`` and ``pad_mode`` are
    passed to the STFT step; ``fs``, ``n_mels``, ``n_fft``, ``fmin``,
    ``fmax`` and ``eps`` are passed to the mel conversion step.
    """
    stft_opts = dict(
        n_fft=n_fft,
        n_shift=n_shift,
        win_length=win_length,
        window=window,
        pad_mode=pad_mode,
    )
    mel_opts = dict(fs=fs, n_mels=n_mels, n_fft=n_fft, fmin=fmin, fmax=fmax, eps=eps)

    # stft output: (Time, Channel, Freq) or (Time, Freq)
    return stft2logmelspectrogram(stft(x, **stft_opts), **mel_opts)


class Spectrogram(object):
    """Callable wrapper around ``spectrogram`` with fixed STFT settings."""

    def __init__(self, n_fft, n_shift, win_length=None, window="hann"):
        self.n_fft, self.n_shift = n_fft, n_shift
        self.win_length, self.window = win_length, window

    def __repr__(self):
        template = (
            "{name}(n_fft={n_fft}, n_shift={n_shift}, "
            "win_length={win_length}, window={window})"
        )
        values = dict(
            name=type(self).__name__,
            n_fft=self.n_fft,
            n_shift=self.n_shift,
            win_length=self.win_length,
            window=self.window,
        )
        return template.format(**values)

    def __call__(self, x):
        """Return ``spectrogram(x, ...)`` using the stored settings."""
        settings = dict(
            n_fft=self.n_fft,
            n_shift=self.n_shift,
            win_length=self.win_length,
            window=self.window,
        )
        return spectrogram(x, **settings)


class LogMelSpectrogram(object):
    """Callable wrapper around ``logmelspectrogram`` with fixed settings.

    All constructor arguments are stored and forwarded to
    ``logmelspectrogram`` on each call.
    """

    def __init__(
        self,
        fs,
        n_mels,
        n_fft,
        n_shift,
        win_length=None,
        window="hann",
        fmin=None,
        fmax=None,
        eps=1e-10,
    ):
        self.fs = fs
        self.n_mels = n_mels
        self.n_fft = n_fft
        self.n_shift = n_shift
        self.win_length = win_length
        self.window = window
        self.fmin = fmin
        self.fmax = fmax
        self.eps = eps

    def __repr__(self):
        return (
            "{name}(fs={fs}, n_mels={n_mels}, n_fft={n_fft}, "
            "n_shift={n_shift}, win_length={win_length}, window={window}, "
            "fmin={fmin}, fmax={fmax}, eps={eps})".format(
                name=self.__class__.__name__,
                fs=self.fs,
                n_mels=self.n_mels,
                n_fft=self.n_fft,
                n_shift=self.n_shift,
                win_length=self.win_length,
                window=self.window,
                fmin=self.fmin,
                fmax=self.fmax,
                eps=self.eps,
            )
        )

    def __call__(self, x):
        """Return ``logmelspectrogram(x, ...)`` using the stored settings."""
        return logmelspectrogram(
            x,
            fs=self.fs,
            n_mels=self.n_mels,
            n_fft=self.n_fft,
            n_shift=self.n_shift,
            win_length=self.win_length,
            window=self.window,
            # Forward the configured band limits and floor as well; without
            # them the function defaults would silently override the
            # values given to the constructor.
            fmin=self.fmin,
            fmax=self.fmax,
            eps=self.eps,
        )


class Stft2LogMelSpectrogram(object):
    """Callable wrapper around ``stft2logmelspectrogram`` with fixed settings.

    All constructor arguments are stored and forwarded to
    ``stft2logmelspectrogram`` on each call.
    """

    def __init__(self, fs, n_mels, n_fft, fmin=None, fmax=None, eps=1e-10):
        self.fs = fs
        self.n_mels = n_mels
        self.n_fft = n_fft
        self.fmin = fmin
        self.fmax = fmax
        self.eps = eps

    def __repr__(self):
        return (
            "{name}(fs={fs}, n_mels={n_mels}, n_fft={n_fft}, "
            "fmin={fmin}, fmax={fmax}, eps={eps})".format(
                name=self.__class__.__name__,
                fs=self.fs,
                n_mels=self.n_mels,
                n_fft=self.n_fft,
                fmin=self.fmin,
                fmax=self.fmax,
                eps=self.eps,
            )
        )

    def __call__(self, x):
        """Return ``stft2logmelspectrogram(x, ...)`` using the stored settings."""
        return stft2logmelspectrogram(
            x,
            fs=self.fs,
            n_mels=self.n_mels,
            n_fft=self.n_fft,
            fmin=self.fmin,
            fmax=self.fmax,
            # Forward the configured floor; without it the function default
            # would silently override the value given to the constructor.
            eps=self.eps,
        )


class Stft(object):
    """Callable wrapper around ``stft`` with fixed settings."""

    def __init__(
        self,
        n_fft,
        n_shift,
        win_length=None,
        window="hann",
        center=True,
        pad_mode="reflect",
    ):
        self.n_fft, self.n_shift = n_fft, n_shift
        self.win_length, self.window = win_length, window
        self.center, self.pad_mode = center, pad_mode

    def __repr__(self):
        template = (
            "{name}(n_fft={n_fft}, n_shift={n_shift}, "
            "win_length={win_length}, window={window},"
            "center={center}, pad_mode={pad_mode})"
        )
        values = dict(
            name=type(self).__name__,
            n_fft=self.n_fft,
            n_shift=self.n_shift,
            win_length=self.win_length,
            window=self.window,
            center=self.center,
            pad_mode=self.pad_mode,
        )
        return template.format(**values)

    def __call__(self, x):
        """Return ``stft(x, ...)`` using the stored settings."""
        options = dict(
            win_length=self.win_length,
            window=self.window,
            center=self.center,
            pad_mode=self.pad_mode,
        )
        return stft(x, self.n_fft, self.n_shift, **options)


class IStft(object):
    """Callable wrapper around ``istft`` with fixed settings."""

    def __init__(self, n_shift, win_length=None, window="hann", center=True):
        self.n_shift, self.win_length = n_shift, win_length
        self.window, self.center = window, center

    def __repr__(self):
        template = (
            "{name}(n_shift={n_shift}, "
            "win_length={win_length}, window={window},"
            "center={center})"
        )
        values = dict(
            name=type(self).__name__,
            n_shift=self.n_shift,
            win_length=self.win_length,
            window=self.window,
            center=self.center,
        )
        return template.format(**values)

    def __call__(self, x):
        """Return ``istft(x, ...)`` using the stored settings."""
        options = dict(
            win_length=self.win_length,
            window=self.window,
            center=self.center,
        )
        return istft(x, self.n_shift, **options)


================================================
FILE: transform/transform_interface.py
================================================
# TODO(karita): add this to all the transform impl.
class TransformInterface:
    """Transform Interface

    Base class for transforms: ``__call__`` must be overridden, while
    ``add_arguments`` and ``__repr__`` come with defaults.
    """

    def __call__(self, x):
        """Transform ``x``; the base implementation always raises."""
        raise NotImplementedError("__call__ method is not implemented")

    @classmethod
    def add_arguments(cls, parser):
        """Return ``parser`` unchanged (no options are registered here)."""
        return parser

    def __repr__(self):
        return "{}()".format(type(self).__name__)


class Identity(TransformInterface):
    """Identity Function

    A transform that hands its input back unchanged.
    """

    def __call__(self, x):
        """Return ``x`` itself."""
        return x


================================================
FILE: transform/transformation.py
================================================
from collections import OrderedDict
import copy
import io
import logging
import sys

import yaml

from espnet.utils.dynamic_import import dynamic_import


# True when the running interpreter is Python 2.
PY2 = sys.version_info[0] == 2

# Bind `Sequence` and `signature` from whichever module provides them on the
# running interpreter.
if PY2:
    # Python 2: the ABCs live in `collections`; `signature` is taken from
    # the `funcsigs` package.
    from collections import Sequence
    from funcsigs import signature
else:
    # Python 3: the ABCs must be imported from `collections.abc`
    # (the aliases in `collections` no longer exist in recent versions).
    from collections.abc import Sequence
    from inspect import signature


# TODO(karita): inherit TransformInterface
# TODO(karita): register cmd arguments in asr_train.py
# Short names accepted as the "type" of a process entry, mapped to the
# "module:Class" path handed to dynamic_import.
import_alias = {
    "identity": "espnet.transform.transform_interface:Identity",
    "time_warp": "espnet.transform.spec_augment:TimeWarp",
    "time_mask": "espnet.transform.spec_augment:TimeMask",
    "freq_mask": "espnet.transform.spec_augment:FreqMask",
    "spec_augment": "espnet.transform.spec_augment:SpecAugment",
    "speed_perturbation": "espnet.transform.perturb:SpeedPerturbation",
    "volume_perturbation": "espnet.transform.perturb:VolumePerturbation",
    "noise_injection": "espnet.transform.perturb:NoiseInjection",
    "bandpass_perturbation": "espnet.transform.perturb:BandpassPerturbation",
    "rir_convolve": "espnet.transform.perturb:RIRConvolve",
    "delta": "espnet.transform.add_deltas:AddDeltas",
    "cmvn": "espnet.transform.cmvn:CMVN",
    "utterance_cmvn": "espnet.transform.cmvn:UtteranceCMVN",
    "fbank": "espnet.transform.spectrogram:LogMelSpectrogram",
    "spectrogram": "espnet.transform.spectrogram:Spectrogram",
    "stft": "espnet.transform.spectrogram:Stft",
    "istft": "espnet.transform.spectrogram:IStft",
    "stft2fbank": "espnet.transform.spectrogram:Stft2LogMelSpectrogram",
    "wpe": "espnet.transform.wpe:WPE",
    "channel_selector": "espnet.transform.channel_selector:ChannelSelector",
}


class Transformation(object):
    """Apply some functions to the mini-batch

    The configuration (a dict or the path of a YAML file) lists the
    processing steps under "process"; each step names its "type" and the
    keyword arguments used to build the transform object.

    Examples:
        >>> kwargs = {"process": [{"type": "fbank",
        ...                        "n_mels": 80,
        ...                        "fs": 16000},
        ...                       {"type": "cmvn",
        ...                        "stats": "data/train/cmvn.ark",
        ...                        "norm_vars": True},
        ...                       {"type": "delta", "window": 2, "order": 2}]}
        >>> transform = Transformation(kwargs)
        >>> bs = 10
        >>> xs = [np.random.randn(100, 80).astype(np.float32)
        ...       for _ in range(bs)]
        >>> xs = transform(xs)
    """

    def __init__(self, conffile=None):
        if conffile is None:
            self.conf = {"mode": "sequential", "process": []}
        elif isinstance(conffile, dict):
            # Work on a private copy so the caller's dict is never shared.
            self.conf = copy.deepcopy(conffile)
        else:
            with io.open(conffile, encoding="utf-8") as f:
                self.conf = yaml.safe_load(f)
                assert isinstance(self.conf, dict), type(self.conf)

        self.functions = OrderedDict()
        if self.conf.get("mode", "sequential") != "sequential":
            raise NotImplementedError(
                "Not supporting mode={}".format(self.conf["mode"])
            )

        for idx, process in enumerate(self.conf["process"]):
            assert isinstance(process, dict), type(process)
            opts = dict(process)
            process_type = opts.pop("type")
            class_obj = dynamic_import(process_type, import_alias)
            # TODO(karita): assert issubclass(class_obj, TransformInterface)
            try:
                self.functions[idx] = class_obj(**opts)
            except TypeError:
                # Help the user by logging the expected signature when it
                # can be inspected, then let the original error propagate.
                signa = None
                try:
                    signa = signature(class_obj)
                except ValueError:
                    # Some function, e.g. built-in function, are failed
                    pass
                if signa is not None:
                    logging.error(
                        "Expected signature: {}({})".format(class_obj.__name__, signa)
                    )
                raise

    def __repr__(self):
        lines = ["    {}: {}".format(k, v) for k, v in self.functions.items()]
        rep = "\n" + "\n".join(lines)
        return "{}({})".format(self.__class__.__name__, rep)

    def __call__(self, xs, uttid_list=None, **kwargs):
        """Return new mini-batch

        :param Union[Sequence[np.ndarray], np.ndarray] xs:
        :param Union[Sequence[str], str] uttid_list:
        :return: batch:
        :rtype: List[np.ndarray]
        """
        is_batch = isinstance(xs, Sequence)
        if not is_batch:
            xs = [xs]

        if isinstance(uttid_list, str):
            # A single id is shared by every item of the batch.
            uttid_list = [uttid_list] * len(xs)

        if self.conf.get("mode", "sequential") != "sequential":
            raise NotImplementedError(
                "Not supporting mode={}".format(self.conf["mode"])
            )

        for idx in range(len(self.conf["process"])):
            func = self.functions[idx]
            # TODO(karita): use TrainingTrans and UttTrans to check __call__ args
            # Derive only the args which the func has
            try:
                param = signature(func).parameters
            except ValueError:
                # Some function, e.g. built-in function, are failed
                param = {}
            _kwargs = {k: v for k, v in kwargs.items() if k in param}
            wants_uttid = uttid_list is not None and "uttid" in param
            try:
                if wants_uttid:
                    xs = [func(x, u, **_kwargs) for x, u in zip(xs, uttid_list)]
                else:
                    xs = [func(x, **_kwargs) for x in xs]
            except Exception:
                logging.fatal("Catch a exception from {}th func: {}".format(idx, func))
                raise

        return xs if is_batch else xs[0]


================================================
FILE: transform/wpe.py
================================================
from nara_wpe.wpe import wpe


class WPE(object):
    """Callable wrapper around ``nara_wpe.wpe.wpe`` with fixed settings.

    The constructor arguments are stored unchanged and forwarded as keyword
    arguments to ``wpe`` on every call.
    """

    def __init__(
        self, taps=10, delay=3, iterations=3, psd_context=0, statistics_mode="full"
    ):
        """Store the settings that are passed to ``wpe`` on each call.

        :param taps: forwarded as ``taps``
        :param delay: forwarded as ``delay``
        :param iterations: forwarded as ``iterations``
        :param psd_context: forwarded as ``psd_context``
        :param statistics_mode: forwarded as ``statistics_mode``
        """
        self.taps = taps
        self.delay = delay
        self.iterations = iterations
        self.psd_context = psd_context
        self.statistics_mode = statistics_mode

    def __repr__(self):
        """Return ``ClassName(key=value, ...)`` listing every stored setting."""
        # Every field is followed by ", " so the settings stay separated
        # (the separator after ``delay`` used to be missing).
        return (
            "{name}(taps={taps}, delay={delay}, "
            "iterations={iterations}, psd_context={psd_context}, "
            "statistics_mode={statistics_mode})".format(
                name=self.__class__.__name__,
                taps=self.taps,
                delay=self.delay,
                iterations=self.iterations,
                psd_context=self.psd_context,
                statistics_mode=self.statistics_mode,
            )
        )

    def __call__(self, xs):
        """Return enhanced

        :param np.ndarray xs: (Time, Channel, Frequency)
        :return: enhanced_xs
        :rtype: np.ndarray

        """
        # nara_wpe.wpe works on (F, C, T): reverse the axes on the way in and
        # reverse them again on the way out.
        xs = wpe(
            xs.transpose((2, 1, 0)),
            taps=self.taps,
            delay=self.delay,
            iterations=self.iterations,
            psd_context=self.psd_context,
            statistics_mode=self.statistics_mode,
        )
        return xs.transpose(2, 1, 0)


================================================
FILE: tts/__init__.py
================================================
"""Initialize sub package."""


================================================
FILE: tts/pytorch_backend/__init__.py
================================================
"""Initialize sub package."""


================================================
FILE: tts/pytorch_backend/tts.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright 2018 Nagoya University (Tomoki Hayashi)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""E2E-TTS training / decoding functions."""

import copy
import json
import logging
import math
import os
import time

import chainer
import kaldiio
import numpy as np
import torch

from chainer import training
from chainer.training import extensions

from espnet.asr.asr_utils import get_model_conf
from espnet.asr.asr_utils import snapshot_object
from espnet.asr.asr_utils import torch_load
from espnet.asr.asr_utils import torch_resume
from espnet.asr.asr_utils import torch_snapshot
from espnet.asr.pytorch_backend.asr_init import load_trained_modules
from espnet.nets.pytorch_backend.nets_utils import pad_list
from espnet.nets.tts_interface import TTSInterface
from espnet.utils.dataset import ChainerDataLoader
from espnet.utils.dataset import TransformDataset
from espnet.utils.dynamic_import import dynamic_import
from espnet.utils.io_utils import LoadInputsAndTargets
from espnet.utils.training.batchfy import make_batchset
from espnet.utils.training.evaluator import BaseEvaluator

from espnet.utils.deterministic_utils import set_deterministic_pytorch
from espnet.utils.training.train_utils import check_early_stop
from espnet.utils.training.train_utils import set_early_stop

from espnet.utils.training.iterators import ShufflingEnabler

import matplotlib

from espnet.utils.training.tensorboard_logger import TensorboardLogger
from tensorboardX import SummaryWriter

# Select the "Agg" matplotlib backend at import time (presumably so figures
# can be written to files on machines without a display -- confirm).
matplotlib.use("Agg")


class CustomEvaluator(BaseEvaluator):
    """Evaluator that runs a PyTorch model over the validation iterator."""

    def __init__(self, model, iterator, target, device):
        """Set up the base evaluator and remember the model and device.

        Args:
            model (torch.nn.Module): Pytorch model instance.
            iterator (chainer.dataset.Iterator): Iterator for validation.
            target (chainer.Chain): Dummy chain instance.
            device (torch.device): The device to be used in evaluation.

        """
        super().__init__(iterator, target)
        self.device = device
        self.model = model

    # The core part of the update routine can be customized by overriding.
    def evaluate(self):
        """Run the model on every validation batch and average the reports.

        The model is switched to eval mode for the pass (run without gradient
        tracking) and switched back to train mode afterwards.

        Returns:
            The mean of the per-batch observations, as computed by
            ``chainer.reporter.DictSummary.compute_mean``.

        """
        iterator = self._iterators["main"]

        if self.eval_hook:
            self.eval_hook(self)

        # Iterate the original iterator when it can be reset, else a copy.
        if not hasattr(iterator, "reset"):
            batches = copy.copy(iterator)
        else:
            iterator.reset()
            batches = iterator

        summary = chainer.reporter.DictSummary()

        self.model.eval()
        with torch.no_grad():
            for batch in batches:
                observation = {}
                if isinstance(batch, tuple):
                    # positional inputs: move every tensor, then unpack
                    inputs = tuple(arr.to(self.device) for arr in batch)
                    with chainer.reporter.report_scope(observation):
                        self.model(*inputs)
                else:
                    # keyword inputs: move every value in place, then unpack
                    for name in batch.keys():
                        batch[name] = batch[name].to(self.device)
                    with chainer.reporter.report_scope(observation):
                        self.model(**batch)
                summary.add(observation)
        self.model.train()

        return summary.compute_mean()


class CustomUpdater(training.StandardUpdater):
    """Updater that trains a PyTorch model with gradient accumulation."""

    def __init__(self, model, grad_clip, iterator, optimizer, device, accum_grad=1):
        """Set up the base updater and store the training settings.

        Args:
            model (torch.nn.Module) model: Pytorch model instance.
            grad_clip (float) grad_clip : The gradient clipping value.
            iterator (chainer.dataset.Iterator): Iterator for training.
            optimizer (torch.optim.Optimizer) : Pytorch optimizer instance.
            device (torch.device): The device to be used in training.
            accum_grad (int): Number of forward/backward passes accumulated
                before one optimizer step.

        """
        super().__init__(iterator, optimizer)
        self.model = model
        self.device = device
        self.grad_clip = grad_clip
        self.accum_grad = accum_grad
        self.clip_grad_norm = torch.nn.utils.clip_grad_norm_
        # forward/backward passes done since the last optimizer step
        self.forward_count = 0

    # The core part of the update routine can be customized by overriding.
    def update_core(self):
        """Run one forward/backward pass and step once enough are accumulated."""
        # When we pass one iterator and optimizer to StandardUpdater.__init__,
        # they are automatically named 'main'.
        train_iter = self.get_iterator("main")
        optimizer = self.get_optimizer("main")

        # Fetch the next batch, move it to the device and run the model
        batch = train_iter.next()
        if isinstance(batch, tuple):
            inputs = tuple(arr.to(self.device) for arr in batch)
            output = self.model(*inputs)
        else:
            for name in batch.keys():
                batch[name] = batch[name].to(self.device)
            output = self.model(**batch)

        # Scale the loss so the accumulated gradient is an average
        loss = output.mean() / self.accum_grad
        loss.backward()

        # Keep accumulating until accum_grad passes have been made
        self.forward_count += 1
        if self.forward_count != self.accum_grad:
            return
        self.forward_count = 0

        # Clip the gradient and skip the step when its norm is NaN
        grad_norm = self.clip_grad_norm(self.model.parameters(), self.grad_clip)
        logging.debug("grad norm={}".format(grad_norm))
        if not math.isnan(grad_norm):
            optimizer.step()
        else:
            logging.warning("grad norm is nan. Do not update model.")
        optimizer.zero_grad()

    def update(self):
        """Run update_core and count an iteration once a full step was made."""
        self.update_core()
        if self.forward_count == 0:
            self.iteration += 1


class CustomConverter(object):
    """Convert one loaded mini-batch into a dict of padded tensors."""

    def __init__(self):
        """Initilize module."""
        # NOTE: keep as class for future development
        pass

    def __call__(self, batch, device=torch.device("cpu")):
        """Pad the batch contents and move them to ``device``.

        Args:
            batch (list): One-element list holding a
                ``(xs, ys, spembs, extras)`` tuple; ``spembs`` and ``extras``
                may be ``None``.
            device (torch.device): The device to be send.

        Returns:
            dict: Always contains ``xs``, ``ilens``, ``ys``, ``labels`` and
            ``olens``; ``spembs`` and ``extras`` are added only when the
            corresponding input is not ``None``.

        Examples:
            >>> batch = [([np.arange(5), np.arange(3)],
                          [np.random.randn(8, 2), np.random.randn(4, 2)],
                          None, None)]
            >>> converter = CustomConverter()
            >>> out = converter(batch, torch.device("cpu"))
            >>> out["ilens"], out["olens"]
            (tensor([5, 3]), tensor([8, 4]))
            >>> out["labels"]
            tensor([[0., 0., 0., 0., 0., 0., 0., 1.],
                    [0., 0., 0., 1., 1., 1., 1., 1.]])

        """
        # batch should be located in list
        assert len(batch) == 1
        xs, ys, spembs, extras = batch[0]

        # lengths are kept as tensors (must be tensor for DataParallel)
        in_lengths = np.array([x.shape[0] for x in xs])
        out_lengths = np.array([y.shape[0] for y in ys])
        ilens = torch.from_numpy(in_lengths).long().to(device)
        olens = torch.from_numpy(out_lengths).long().to(device)

        # zero-pad to a common length and convert to tensors
        padded_xs = pad_list([torch.from_numpy(x).long() for x in xs], 0)
        padded_ys = pad_list([torch.from_numpy(y).float() for y in ys], 0)
        xs = padded_xs.to(device)
        ys = padded_ys.to(device)

        # stop-prediction labels: 1 from the last valid frame onwards
        labels = ys.new_zeros(ys.size(0), ys.size(1))
        for row, olen in enumerate(olens):
            labels[row, olen - 1 :] = 1.0

        new_batch = {}
        new_batch["xs"] = xs
        new_batch["ilens"] = ilens
        new_batch["ys"] = ys
        new_batch["labels"] = labels
        new_batch["olens"] = olens

        # optional speaker embedding
        if spembs is not None:
            spemb_tensor = torch.from_numpy(np.array(spembs)).float()
            new_batch["spembs"] = spemb_tensor.to(device)

        # optional second target
        if extras is not None:
            extra_tensors = [torch.from_numpy(extra).float() for extra in extras]
            new_batch["extras"] = pad_list(extra_tensors, 0).to(device)

        return new_batch


def train(args):
    """Train E2E-TTS model.

    Reads the feature dimensions from ``args.valid_json``, writes the model
    configuration to ``<args.outdir>/model.json``, builds the model, the
    optimizer ("adam" or "noam"), the training/validation iterators and a
    chainer ``Trainer`` with its extensions, and finally runs the training.

    Args:
        args: Training options, accessed by attribute throughout (e.g.
            ``train_json``, ``valid_json``, ``outdir``, ``ngpu``, ``opt``,
            ``epochs``).  Some attributes are written back by this function:
            ``spk_embed_dim`` and ``spc_dim`` always, ``batch_size`` when
            more than one GPU is used, and ``batch_sort_key`` when sortagrad
            is enabled.

    Raises:
        NotImplementedError: If ``args.opt`` is neither "adam" nor "noam".

    """
    set_deterministic_pytorch(args)

    # check cuda availability
    if not torch.cuda.is_available():
        logging.warning("cuda is not available")

    # get input and output dimension info
    with open(args.valid_json, "rb") as f:
        valid_json = json.load(f)["utts"]
    utts = list(valid_json.keys())

    # reverse input and output dimension
    # (the model input dim is taken from the json "output" entry and the
    # model output dim from the json "input" entry)
    idim = int(valid_json[utts[0]]["output"][0]["shape"][1])
    odim = int(valid_json[utts[0]]["input"][0]["shape"][1])
    logging.info("#input dims : " + str(idim))
    logging.info("#output dims: " + str(odim))

    # get extra input and output dimenstion
    # (both are read from the second "input" entry of the first utterance)
    if args.use_speaker_embedding:
        args.spk_embed_dim = int(valid_json[utts[0]]["input"][1]["shape"][0])
    else:
        args.spk_embed_dim = None
    if args.use_second_target:
        args.spc_dim = int(valid_json[utts[0]]["input"][1]["shape"][1])
    else:
        args.spc_dim = None

    # write model config
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)
    model_conf = args.outdir + "/model.json"
    with open(model_conf, "wb") as f:
        logging.info("writing a model config file to" + model_conf)
        f.write(
            json.dumps(
                (idim, odim, vars(args)), indent=4, ensure_ascii=False, sort_keys=True
            ).encode("utf_8")
        )
    for key in sorted(vars(args).keys()):
        logging.info("ARGS: " + key + ": " + str(vars(args)[key]))

    # specify model architecture
    # (pre-trained encoder/decoder modules are loaded when either init path
    # is given, otherwise the class named by args.model_module is built)
    if args.enc_init is not None or args.dec_init is not None:
        model = load_trained_modules(idim, odim, args, TTSInterface)
    else:
        model_class = dynamic_import(args.model_module)
        model = model_class(idim, odim, args)
    assert isinstance(model, TTSInterface)
    logging.info(model)
    # keep a handle on the reporter before the model may get wrapped below
    reporter = model.reporter

    # check the use of multi-gpu
    if args.ngpu > 1:
        model = torch.nn.DataParallel(model, device_ids=list(range(args.ngpu)))
        if args.batch_size != 0:
            logging.warning(
                "batch size is automatically increased (%d -> %d)"
                % (args.batch_size, args.batch_size * args.ngpu)
            )
            args.batch_size *= args.ngpu

    # set torch device
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    model = model.to(device)

    # freeze modules, if specified
    if args.freeze_mods:
        # a wrapped model exposes its parameters under the "module." prefix
        if hasattr(model, "module"):
            freeze_mods = ["module." + x for x in args.freeze_mods]
        else:
            freeze_mods = args.freeze_mods

        for mod, param in model.named_parameters():
            if any(mod.startswith(key) for key in freeze_mods):
                logging.info(f"{mod} is frozen not to be updated.")
                param.requires_grad = False

        # only the parameters that still require gradients go to the optimizer
        model_params = filter(lambda x: x.requires_grad, model.parameters())
    else:
        model_params = model.parameters()

    logging.warning(
        "num. model params: {:,} (num. trained: {:,} ({:.1f}%))".format(
            sum(p.numel() for p in model.parameters()),
            sum(p.numel() for p in model.parameters() if p.requires_grad),
            sum(p.numel() for p in model.parameters() if p.requires_grad)
            * 100.0
            / sum(p.numel() for p in model.parameters()),
        )
    )

    # Setup an optimizer
    if args.opt == "adam":
        optimizer = torch.optim.Adam(
            model_params, args.lr, eps=args.eps, weight_decay=args.weight_decay
        )
    elif args.opt == "noam":
        from espnet.nets.pytorch_backend.transformer.optimizer import get_std_opt

        optimizer = get_std_opt(
            model_params, args.adim, args.transformer_warmup_steps, args.transformer_lr
        )
    else:
        raise NotImplementedError("unknown optimizer: " + args.opt)

    # FIXME: TOO DIRTY HACK
    # (attaches the reporter to the torch optimizer as "target" and adds a
    # "serialize" method that delegates to the reporter)
    setattr(optimizer, "target", reporter)
    setattr(optimizer, "serialize", lambda s: reporter.serialize(s))

    # read json data
    with open(args.train_json, "rb") as f:
        train_json = json.load(f)["utts"]
    with open(args.valid_json, "rb") as f:
        valid_json = json.load(f)["utts"]

    # sortagrad is enabled for -1 or any positive value
    use_sortagrad = args.sortagrad == -1 or args.sortagrad > 0
    if use_sortagrad:
        args.batch_sort_key = "input"
    # make minibatch list (variable length)
    train_batchset = make_batchset(
        train_json,
        args.batch_size,
        args.maxlen_in,
        args.maxlen_out,
        args.minibatches,
        batch_sort_key=args.batch_sort_key,
        min_batch_size=args.ngpu if args.ngpu > 1 else 1,
        shortest_first=use_sortagrad,
        count=args.batch_count,
        batch_bins=args.batch_bins,
        batch_frames_in=args.batch_frames_in,
        batch_frames_out=args.batch_frames_out,
        batch_frames_inout=args.batch_frames_inout,
        swap_io=True,
        iaxis=0,
        oaxis=0,
    )
    valid_batchset = make_batchset(
        valid_json,
        args.batch_size,
        args.maxlen_in,
        args.maxlen_out,
        args.minibatches,
        batch_sort_key=args.batch_sort_key,
        min_batch_size=args.ngpu if args.ngpu > 1 else 1,
        count=args.batch_count,
        batch_bins=args.batch_bins,
        batch_frames_in=args.batch_frames_in,
        batch_frames_out=args.batch_frames_out,
        batch_frames_inout=args.batch_frames_inout,
        swap_io=True,
        iaxis=0,
        oaxis=0,
    )

    # the two loaders differ only in the "train" preprocessing flag
    load_tr = LoadInputsAndTargets(
        mode="tts",
        use_speaker_embedding=args.use_speaker_embedding,
        use_second_target=args.use_second_target,
        preprocess_conf=args.preprocess_conf,
        preprocess_args={"train": True},  # Switch the mode of preprocessing
        keep_all_data_on_mem=args.keep_all_data_on_mem,
    )

    load_cv = LoadInputsAndTargets(
        mode="tts",
        use_speaker_embedding=args.use_speaker_embedding,
        use_second_target=args.use_second_target,
        preprocess_conf=args.preprocess_conf,
        preprocess_args={"train": False},  # Switch the mode of preprocessing
        keep_all_data_on_mem=args.keep_all_data_on_mem,
    )

    converter = CustomConverter()
    # hack to make batchsize argument as 1
    # actual bathsize is included in a list
    train_iter = {
        "main": ChainerDataLoader(
            dataset=TransformDataset(
                train_batchset, lambda data: converter([load_tr(data)])
            ),
            batch_size=1,
            num_workers=args.num_iter_processes,
            shuffle=not use_sortagrad,
            collate_fn=lambda x: x[0],
        )
    }
    valid_iter = {
        "main": ChainerDataLoader(
            dataset=TransformDataset(
                valid_batchset, lambda data: converter([load_cv(data)])
            ),
            batch_size=1,
            shuffle=False,
            collate_fn=lambda x: x[0],
            num_workers=args.num_iter_processes,
        )
    }

    # Set up a trainer
    updater = CustomUpdater(
        model, args.grad_clip, train_iter, optimizer, device, args.accum_grad
    )
    trainer = training.Trainer(updater, (args.epochs, "epoch"), out=args.outdir)

    # Resume from a snapshot
    if args.resume:
        logging.info("resumed from %s" % args.resume)
        torch_resume(args.resume, trainer)

    # set intervals
    eval_interval = (args.eval_interval_epochs, "epoch")
    save_interval = (args.save_interval_epochs, "epoch")
    report_interval = (args.report_interval_iters, "iteration")

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(
        CustomEvaluator(model, valid_iter, reporter, device), trigger=eval_interval
    )

    # Save snapshot for each epoch
    trainer.extend(torch_snapshot(), trigger=save_interval)

    # Save best models
    trainer.extend(
        snapshot_object(model, "model.loss.best"),
        trigger=training.triggers.MinValueTrigger(
            "validation/main/loss", trigger=eval_interval
        ),
    )

    # Save attention figure for each epoch
    if args.num_save_attention > 0:
        # take the first num_save_attention validation utterances and sort
        # them by the first dimension of their json "output" shape
        data = sorted(
            list(valid_json.items())[: args.num_save_attention],
            key=lambda x: int(x[1]["output"][0]["shape"][0]),
        )
        if hasattr(model, "module"):
            att_vis_fn = model.module.calculate_all_attentions
            plot_class = model.module.attention_plot_class
            reduction_factor = model.module.reduction_factor
        else:
            att_vis_fn = model.calculate_all_attentions
            plot_class = model.attention_plot_class
            reduction_factor = model.reduction_factor
        if reduction_factor > 1:
            # fix the length to crop attention weight plot correctly
            # (deep copy so that valid_json itself is left untouched)
            data = copy.deepcopy(data)
            for idx in range(len(data)):
                ilen = data[idx][1]["input"][0]["shape"][0]
                data[idx][1]["input"][0]["shape"][0] = ilen // reduction_factor
        att_reporter = plot_class(
            att_vis_fn,
            data,
            args.outdir + "/att_ws",
            converter=converter,
            transform=load_cv,
            device=device,
            reverse=True,
        )
        trainer.extend(att_reporter, trigger=eval_interval)
    else:
        att_reporter = None

    # Make a plot for training and validation values
    if hasattr(model, "module"):
        base_plot_keys = model.module.base_plot_keys
    else:
        base_plot_keys = model.base_plot_keys
    plot_keys = []
    # one figure per key, plus one figure with all keys together
    for key in base_plot_keys:
        plot_key = ["main/" + key, "validation/main/" + key]
        trainer.extend(
            extensions.PlotReport(plot_key, "epoch", file_name=key + ".png"),
            trigger=eval_interval,
        )
        plot_keys += plot_key
    trainer.extend(
        extensions.PlotReport(plot_keys, "epoch", file_name="all_loss.png"),
        trigger=eval_interval,
    )

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport(trigger=report_interval))
    report_keys = ["epoch", "iteration", "elapsed_time"] + plot_keys
    trainer.extend(extensions.PrintReport(report_keys), trigger=report_interval)
    trainer.extend(extensions.ProgressBar(), trigger=report_interval)

    set_early_stop(trainer, args)
    if args.tensorboard_dir is not None and args.tensorboard_dir != "":
        writer = SummaryWriter(args.tensorboard_dir)
        trainer.extend(TensorboardLogger(writer, att_reporter), trigger=report_interval)

    if use_sortagrad:
        # fires after args.sortagrad epochs, or after the last epoch when
        # sortagrad is -1
        trainer.extend(
            ShufflingEnabler([train_iter]),
            trigger=(args.sortagrad if args.sortagrad != -1 else args.epochs, "epoch"),
        )

    # Run the training
    trainer.run()
    check_early_stop(trainer, args.epochs)


@torch.no_grad()
def decode(args):
    """Decode with E2E-TTS model.

    Rebuilds the model from the stored training configuration, loads its
    parameters from ``args.model`` and runs ``model.inference`` once per
    utterance of ``args.json``.  The generated features are written to
    ``<args.out>.ark`` / ``<args.out>.scp``; durations and focus rates are
    written to sibling archives when ``args.save_durations`` /
    ``args.save_focus_rates`` are set.  Stop-probability and attention plots
    are saved under ``probs/`` and ``att_ws/`` inside the directory of
    ``args.out`` (the current directory when ``args.out`` has no directory
    part).

    Args:
        args: Decoding options, accessed by attribute (``model``,
            ``model_conf``, ``json``, ``out``, ``ngpu``, ``preprocess_conf``,
            ``maxlenratio``, ``save_durations``, ``save_focus_rates``); the
            whole object is also passed to ``model.inference``.

    """
    set_deterministic_pytorch(args)
    # read training config
    idim, odim, train_args = get_model_conf(args.model, args.model_conf)

    # show arguments
    for key in sorted(vars(args).keys()):
        logging.info("args: " + key + ": " + str(vars(args)[key]))

    # define model
    model_class = dynamic_import(train_args.model_module)
    model = model_class(idim, odim, train_args)
    assert isinstance(model, TTSInterface)
    logging.info(model)

    # load trained model parameters
    logging.info("reading model parameters from " + args.model)
    torch_load(args.model, model)
    model.eval()

    # set torch device
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    model = model.to(device)

    # read json data
    with open(args.json, "rb") as f:
        js = json.load(f)["utts"]

    # check directory
    outdir = os.path.dirname(args.out)
    if len(outdir) != 0 and not os.path.exists(outdir):
        os.makedirs(outdir)

    # the decoding-time preprocess_conf overrides the training-time one
    load_inputs_and_targets = LoadInputsAndTargets(
        mode="tts",
        load_input=False,
        sort_in_input_length=False,
        use_speaker_embedding=train_args.use_speaker_embedding,
        preprocess_conf=train_args.preprocess_conf
        if args.preprocess_conf is None
        else args.preprocess_conf,
        preprocess_args={"train": False},  # Switch the mode of preprocessing
    )

    # define function for plot prob and att_ws
    def _plot_and_save(array, figname, figsize=(6, 4), dpi=150):
        import matplotlib.pyplot as plt

        shape = array.shape
        if len(shape) == 1:
            # for eos probability
            plt.figure(figsize=figsize, dpi=dpi)
            plt.plot(array)
            plt.xlabel("Frame")
            plt.ylabel("Probability")
            plt.ylim([0, 1])
        elif len(shape) == 2:
            # for tacotron 2 attention weights, whose shape is (out_length, in_length)
            plt.figure(figsize=figsize, dpi=dpi)
            plt.imshow(array, aspect="auto")
            plt.xlabel("Input")
            plt.ylabel("Output")
        elif len(shape) == 4:
            # for transformer attention weights,
            # whose shape is (#leyers, #heads, out_length, in_length)
            plt.figure(figsize=(figsize[0] * shape[0], figsize[1] * shape[1]), dpi=dpi)
            for idx1, xs in enumerate(array):
                for idx2, x in enumerate(xs, 1):
                    plt.subplot(shape[0], shape[1], idx1 * shape[1] + idx2)
                    plt.imshow(x, aspect="auto")
                    plt.xlabel("Input")
                    plt.ylabel("Output")
        else:
            raise NotImplementedError("Support only from 1D to 4D array.")
        plt.tight_layout()
        figdir = os.path.dirname(figname)
        if figdir:
            # NOTE: exist_ok = True is needed for parallel process decoding
            os.makedirs(figdir, exist_ok=True)
        plt.savefig(figname)
        plt.close()

    # define function to calculate focus rate
    # (see section 3.3 in https://arxiv.org/abs/1905.09263)
    def _calculate_focus_rate(att_ws):
        if att_ws is None:
            # fastspeech case -> None
            return 1.0
        elif len(att_ws.shape) == 2:
            # tacotron 2 case -> (L, T)
            return float(att_ws.max(dim=-1)[0].mean())
        elif len(att_ws.shape) == 4:
            # transformer case -> (#layers, #heads, L, T)
            return float(att_ws.max(dim=-1)[0].mean(dim=-1).max())
        else:
            raise ValueError("att_ws should be 2 or 4 dimensional tensor.")

    # define function to convert attention to duration
    def _convert_att_to_duration(att_ws):
        if len(att_ws.shape) == 2:
            # tacotron 2 case -> (L, T)
            pass
        elif len(att_ws.shape) == 4:
            # transformer case -> (#layers, #heads, L, T)
            # get the most diagonal head according to focus rate
            att_ws = torch.cat(
                [att_w for att_w in att_ws], dim=0
            )  # (#heads * #layers, L, T)
            diagonal_scores = att_ws.max(dim=-1)[0].mean(dim=-1)  # (#heads * #layers,)
            diagonal_head_idx = diagonal_scores.argmax()
            att_ws = att_ws[diagonal_head_idx]  # (L, T)
        else:
            raise ValueError("att_ws should be 2 or 4 dimensional tensor.")
        # calculate duration from 2d attention weight:
        # count, for every input index, the output frames attending to it most
        durations = torch.stack(
            [att_ws.argmax(-1).eq(i).sum() for i in range(att_ws.shape[1])]
        )
        return durations.view(-1, 1).float()

    # define writer instances
    feat_writer = kaldiio.WriteHelper("ark,scp:{o}.ark,{o}.scp".format(o=args.out))
    if args.save_durations:
        dur_writer = kaldiio.WriteHelper(
            "ark,scp:{o}.ark,{o}.scp".format(o=args.out.replace("feats", "durations"))
        )
    if args.save_focus_rates:
        fr_writer = kaldiio.WriteHelper(
            "ark,scp:{o}.ark,{o}.scp".format(o=args.out.replace("feats", "focus_rates"))
        )

    try:
        # start decoding
        for idx, utt_id in enumerate(js.keys()):
            # setup inputs
            batch = [(utt_id, js[utt_id])]
            data = load_inputs_and_targets(batch)
            x = torch.LongTensor(data[0][0]).to(device)
            spemb = None
            if train_args.use_speaker_embedding:
                spemb = torch.FloatTensor(data[1][0]).to(device)

            # decode and write
            start_time = time.time()
            outs, probs, att_ws = model.inference(x, args, spemb=spemb)
            logging.info(
                "inference speed = %.1f frames / sec."
                % (int(outs.size(0)) / (time.time() - start_time))
            )
            if outs.size(0) == x.size(0) * args.maxlenratio:
                logging.warning("output length reaches maximum length (%s)." % utt_id)
            focus_rate = _calculate_focus_rate(att_ws)
            logging.info(
                "(%d/%d) %s (size: %d->%d, focus rate: %.3f)"
                % (idx + 1, len(js), utt_id, x.size(0), outs.size(0), focus_rate)
            )
            feat_writer[utt_id] = outs.cpu().numpy()
            if args.save_durations:
                ds = _convert_att_to_duration(att_ws)
                dur_writer[utt_id] = ds.cpu().numpy()
            if args.save_focus_rates:
                fr_writer[utt_id] = np.array(focus_rate).reshape(1, 1)

            # plot and save prob and att_ws
            # Paths are joined onto outdir so that an empty outdir yields a
            # path relative to the current directory instead of "/probs/...".
            if probs is not None:
                _plot_and_save(
                    probs.cpu().numpy(),
                    os.path.join(outdir, "probs/%s_prob.png" % utt_id),
                )
            if att_ws is not None:
                _plot_and_save(
                    att_ws.cpu().numpy(),
                    os.path.join(outdir, "att_ws/%s_att_ws.png" % utt_id),
                )
    finally:
        # close file object (also when decoding fails part-way)
        feat_writer.close()
        if args.save_durations:
            dur_writer.close()
        if args.save_focus_rates:
            fr_writer.close()


================================================
FILE: utils/__init__.py
================================================
"""Initialize sub package."""


================================================
FILE: utils/bmuf.py
================================================
"""
BMUF (block model update filtering) module
implementation of block model update filtering
"""

import torch
import torch.distributed as dist
#import torch.distributed.ReduceOp as ReduceOp
import torch.nn as nn

# Status codes returned by BmufTrainer.update_and_sync(): SUCCESS when the
# reduced delta contains no NaN, STOP otherwise.
SUCCESS = 1
STOP = 0

def _copy_vec_to_param(vec, parameters, is_grad=False):
    """Scatter a flat vector back into a sequence of parameters.

    Consecutive slices of ``vec`` are reshaped to each parameter's shape and
    copied in place, in iteration order.

    Args:
        vec (Tensor): a single vector represents the parameters of a model.
        parameters (Iterable[Tensor]): an iterator of Tensors that are the
            parameters of a model.
        is_grad (bool): when True the slices are copied into each
            parameter's ``.grad`` instead of its ``.data``.

    Raises:
        TypeError: if ``vec`` is not a ``torch.Tensor``.
    """
    if not isinstance(vec, torch.Tensor):
        raise TypeError('expected torch.Tensor, but got: {}'
                        .format(torch.typename(vec)))

    # Start of the slice that belongs to the current parameter
    offset = 0
    for param in parameters:
        count = param.numel()
        end = offset + count
        # Pick the destination first, then overwrite it with the slice
        target = param.grad if is_grad else param.data
        filled = target.copy_(vec[offset:end].view_as(param).data)
        if is_grad:
            param.grad = filled
        else:
            param.data = filled
        offset = end


class BmufTrainer():
    """
    Basic BMUF Trainer Class,
    implements Nesterov Block Momentum

    The trainer keeps ``self.param``, a flat copy of the model parameters
    that is broadcast from the master node.  ``update_and_sync`` reduces the
    per-worker change relative to that copy onto the master node, applies the
    block-momentum update there and broadcasts the result back into every
    worker's model.

    Args:
        master_node (int): master node index, zero in most cases
        rank (int): local rank, eg, 0-7 if 8GPUs are used
        world_size (int): total number of workers
        model (nn.module): model
        block_momentum (float): block momentum value
        block_lr (float): block learning rate
    """
    def __init__(self, master_node, rank, world_size, model,
                 block_momentum, block_lr):
        """Initialise the NCCL process group and sync the initial parameters."""
        self.master_node = master_node
        self.rank = rank
        self.world_size = world_size
        self.model = model
        self.block_momentum = block_momentum
        self.block_lr = block_lr
        dist.init_process_group(backend="nccl", init_method="env://")
        # clone() makes sure self.param is NOT tied to the model parameters;
        # .data enforces no grad
        param_vec = nn.utils.parameters_to_vector(model.parameters())
        self.param = param_vec.data.clone()
        # broadcast initial param to other nodes
        dist.broadcast(tensor=self.param, src=master_node, async_op=False)
        num_param = self.param.numel()
        if self.rank == master_node:
            # Only the master keeps the momentum buffer.  torch.zeros avoids
            # building a Python list with one entry per model parameter.
            self.delta_prev = torch.zeros(
                num_param, dtype=torch.float32
            ).cuda(self.rank)
        else:
            self.delta_prev = None
            # Non-master workers start from the master's parameters
            _copy_vec_to_param(self.param, self.model.parameters())

    def update_and_sync(self):
        """
        Performs a single block sync and update
        return SUCCESS if numericals are healthy
        return STOP otherwise

        """
        # Change of the local model relative to the last synced parameters
        delta = self.param - \
                nn.utils.parameters_to_vector(self.model.parameters()).data
        # gather block gradients into delta (default op is SUM)
        dist.reduce(tensor=delta, dst=self.master_node)
        # check if model params are still healthy
        # NOTE(review): dist.reduce only guarantees the reduced result on the
        # destination rank -- confirm that every rank reaches the same
        # STOP/SUCCESS decision here, otherwise the broadcast below may not be
        # entered by all ranks.
        if torch.isnan(delta).any().item():
            return STOP
        if self.rank == self.master_node:
            # master node: average the summed delta over the workers, then
            # apply the block-momentum update to the reference parameters
            delta = delta / float(self.world_size)
            self.delta_prev = self.block_momentum * self.delta_prev + \
                              (self.block_lr * (1 - self.block_momentum) * delta)
            self.param -= (1 + self.block_momentum) * self.delta_prev
        # Every worker receives the updated parameters and loads them
        dist.broadcast(tensor=self.param, src=self.master_node, async_op=False)
        _copy_vec_to_param(self.param, self.model.parameters())

        return SUCCESS

    def broadcast(self, tensor):
        """broadcast interface for trainer"""
        dist.broadcast(tensor=tensor, src=self.master_node, async_op=False)

    def sum_reduce(self, tensor):
        """sumreduce interface for trainer"""
        # default op is SUM; the result lands on the master node
        dist.reduce(tensor=tensor, dst=self.master_node)


class BlockAdamTrainer:
    """Synchronous trainer in which every worker owns its data loader.

    Each worker runs its own forward/backward pass; ``update_and_sync``
    then all-reduces the gradients and steps a local optimizer built with
    ``get_std_opt``.  It is intended as a replacement for
    ``DataParallel()`` that improves training efficiency.

    Open question by tyrion: does this trainer require the local optimizer
    to be SGD, i.e. is the delta still the (scaled) gradient?  The learning
    rate is scheduled by the local optimizer, so the block_lr should be
    set to 1.0.

    Args:
        args: namespace read for ``adim``, ``transformer_warmup_steps`` and
            ``transformer_lr``; the presence of ``enc_block_arch`` or
            ``dec_block_arch`` switches ``adim`` to ``model.most_dom_dim``
        master_node (int): master node index, zero in most cases
        rank (int): local rank, eg, 0-7 if 8 GPUs are used
        world_size (int): total number of workers
        model (nn.module): torch model

    """

    def __init__(self, args, master_node, rank, world_size, model):
        # Communication related: the process group is set up from env vars.
        self.master_node = master_node
        self.rank = rank
        self.world_size = world_size
        dist.init_process_group(backend="nccl", init_method="env://")

        # Load the master node's weights into the local model.
        self.model = model
        flat_weights = nn.utils.parameters_to_vector(model.parameters())
        flat_weights = flat_weights.data.clone()
        dist.broadcast(tensor=flat_weights, src=master_node, async_op=False)
        _copy_vec_to_param(flat_weights, self.model.parameters())

        from espnet.nets.pytorch_backend.transformer.optimizer import get_std_opt

        block_arch_keys = ("enc_block_arch", "dec_block_arch")
        if any(hasattr(args, key) for key in block_arch_keys):
            adim = model.most_dom_dim
        else:
            adim = args.adim

        # Frozen modules (requires_grad is False) stay out of the optimizer.
        trainable = [p for p in model.parameters() if p.requires_grad]
        self.optimizer = get_std_opt(
            trainable, adim, args.transformer_warmup_steps, args.transformer_lr
        )

    def update_and_sync(self):
        """All-reduce the local gradients and take one optimizer step.

        The forward/backward pass is assumed to have finished already, so
        the gradients of the parameters are populated.

        Returns:
            SUCCESS, unconditionally.
        """
        first_group = self.optimizer.param_groups[0]
        with_grad = [p for p in first_group["params"] if hasattr(p.grad, "data")]

        # Flatten the gradients and combine them across workers with
        # all_reduce's default op; no division by world_size is done here.
        flat_grad = nn.utils.parameters_to_vector([p.grad.data for p in with_grad])
        dist.all_reduce(tensor=flat_grad.data)

        # Write the combined gradients back, then update and reset.
        _copy_vec_to_param(flat_grad, with_grad, is_grad=True)
        self.optimizer.step()
        self.optimizer.zero_grad()

        return SUCCESS

#class BlockAdamTrainer():
#    """
#    By tyrion: Does this trainer requires the local optimizer being
#    SGD? which means the delta is still the gradients (scaled)
#    The learning rate is scheduled by the local optimizer so 
#    the block_lr should be set 1.0.
#    This is essentially sync adam optimizer but
#    allows each worker to have individual loader
#    to improve the training efficiency, to replace
#    replace DataParallel()
#    Args:
#        master_node (int): master node index, zero in most cases
#        rank (int): local rank, eg, 0-7 if 8 GPUs are used
#        world_size (int): total number of workers
#        model (nn.module): torch model
#        block_lr (float): block learning rate
#    """
#    def __init__(self, args, master_node, rank, world_size, model):
#        self.master_node = master_node
#        self.rank = rank
#        self.world_size = world_size
#        self.model = model
#        dist.init_process_group(backend="nccl", init_method="env://")
#        #clone() make sure self.param
#        #NOT tied to model parameters
#        #data() enforces no grad
#        param_vec = nn.utils.parameters_to_vector(model.parameters())
#        self.param = nn.parameter.Parameter(param_vec.data.clone())
#        #broadcast initial param to other nodes
#        dist.broadcast(tensor=self.param.data, src=master_node, async_op=False)
#        if self.rank == master_node:
#            from espnet.nets.pytorch_backend.transformer.optimizer import get_std_opt
#            if hasattr(args, "enc_block_arch") or hasattr(args, "dec_block_arch"):
#                adim = model.most_dom_dim
#            else:
#                adim = args.adim
#    
#            # consider when some modules are freezed
#            self.optimizer = get_std_opt(
#                [self.param], adim, args.transformer_warmup_steps, args.transformer_lr
#            )
#        else:
#            _copy_vec_to_param(self.param.data, self.model.parameters())
#
#    def update_and_sync(self):
#        """Perform a single block sync and update
#           when the block size equals to batch size
#           we are doing sync adam
#        """
#        delta = self.param.data - \
#                nn.utils.parameters_to_vector(self.model.parameters()).data
#        #gather block gradients into delta
#        #op=ReduceOp.SUM,
#        dist.reduce(tensor=delta, dst=self.master_node)
#        #check if model params are still healthy
#        if torch.isnan(delta).sum().item():
#            return STOP
#        if self.rank == self.master_node:
#            #local rank is master node
#            #delta = delta / float(self.world_size)
#            #use delta.data to detach from computation graph
#            self.param.grad = delta.data
#            self.optimizer.step()
#        dist.broadcast(tensor=self.param.data, src=self.master_node, async_op=False)
#        _copy_vec_to_param(self.param.data, self.model.parameters())
#
#        return SUCCESS
#
#    def reset_model(self, model):
#        del self.param
#        param_vec = nn.utils.parameters_to_vector(model.parameters())
#        self.param = nn.parameter.Parameter(param_vec.data.clone())


class BmufAdamTrainer():
    """The implementation of BMUF-adam, check more details in,
       Chen. et al, 2020, "Parallelizing Adam Optimizer with
       Blockwise Model-Update Filtering."

    ``self.param`` is a flat vector laid out as
    ``[model parameters | first moments | second moments]``; it is the
    buffer that gets broadcast from the master node after every block.

    Args:
        master_node (int): master node index, zero in most cases
        rank (int): local rank, eg, 0-7 if 8 GPUs are used
        world_size (int): total number of workers
        model (nn.module): torch model
        block_momentum (float): block momentum value
        block_lr (float): block learning rate
        sync_period (int): sync period in number of batches
        optim (torch.optim.Optimizer): adam optimizer; its param groups must
            provide ``betas`` and its state ``exp_avg`` / ``exp_avg_sq`` /
            ``step``
    """
    def __init__(self, master_node, rank, world_size, model,
                 block_momentum, block_lr, sync_period, optim):
        self.master_node = master_node
        self.rank = rank
        self.world_size = world_size
        self.model = model
        self.block_momentum = block_momentum
        self.block_lr = block_lr
        self.sync_period = sync_period
        self.optim = optim
        dist.init_process_group(backend="nccl", init_method="env://")
        # Accumulator updated in update_and_sync as
        # rho = block_momentum * rho + sync_period.
        self.rho = 0.0
        # Default setup; overwritten below by the optimizer's own betas.
        self.betas = (0.9, 0.999)
        # clone() makes sure self.param is NOT tied to the model parameters;
        # .data enforces no grad.
        param_vec = nn.utils.parameters_to_vector(model.parameters())
        self.param = param_vec.data.clone()
        # Broadcast the initial params to the other nodes.
        dist.broadcast(tensor=self.param, src=master_node, async_op=False)
        self.num_param = self.param.numel()
        if self.rank == master_node:
            # Only the master keeps the filtered block delta.
            self.delta_prev = torch.FloatTensor([0]*self.num_param)\
                                   .cuda(master_node)
        else:
            self.delta_prev = None
            # Non-master workers load the broadcast weights into the model.
            _copy_vec_to_param(self.param, self.model.parameters())

        # Initialize first and second moment buffers: ``dim`` is the total
        # element count over every parameter of every optimizer group.
        dim = 0
        for group in optim.param_groups:
            # The betas of the last group win.
            self.betas = group['betas']
            for p in group['params']:
                dim += p.numel()
        if self.rank == master_node:
            self.exp_avg = torch.FloatTensor([0]*dim).cuda(self.rank)
            self.exp_avg_sq = torch.FloatTensor([0]*dim).cuda(self.rank)
        else:
            self.exp_avg = None
            self.exp_avg_sq = None
        # Extend param to accommodate first and second moments.
        vec_ext = torch.FloatTensor([0]*dim*2).cuda(self.rank)
        self.param = torch.cat([self.param, vec_ext])

    def update_and_sync(self):
        """Perform a single block sync and update.

        Returns:
            STOP if the (reduced) vector contains NaN, SUCCESS otherwise.
        """
        # Local block delta: stored parameters minus current model weights.
        delta = self.param[:self.num_param] - \
                nn.utils.parameters_to_vector(self.model.parameters()).data
        # Gather the local first and second moments from the optimizer state.
        # NOTE(review): parameters without a gradient are skipped here but
        # were counted in __init__ -- confirm the sizes always agree.
        exp_avg, exp_avg_sq = [], []
        for group in self.optim.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                state = self.optim.state[p]
                exp_avg.append(state['exp_avg'].view(-1))
                exp_avg_sq.append(state['exp_avg_sq'].view(-1))
        exp_avg = torch.cat(exp_avg)
        exp_avg_sq = torch.cat(exp_avg_sq)
        # One flat message: [delta | exp_avg | exp_avg_sq].
        vec = torch.cat([delta, exp_avg, exp_avg_sq])
        # No ``op`` is given, so the default (ReduceOp.SUM) applies.
        dist.reduce(tensor=vec, dst=self.master_node)
        # Check if model params are still healthy.
        # NOTE(review): dist.reduce only documents the result on the
        # destination rank -- confirm every rank takes the same branch here.
        if torch.isnan(vec).sum().item():
            return STOP
        self.rho = self.block_momentum * self.rho + self.sync_period
        if self.rank == self.master_node:
            # Local rank is the master node: turn the sums into means.
            vec = vec / float(self.world_size)
            # Momentum-filtered block delta, applied with a (1 + momentum)
            # factor to the parameter segment of self.param.
            self.delta_prev = self.block_momentum * self.delta_prev + \
                              (self.block_lr *(1 - self.block_momentum)*\
                               vec[:self.num_param])
            self.param[:self.num_param] -= (1+self.block_momentum) \
                                           * self.delta_prev
            # Calculate first and second moment for the next block.
            # ``dim`` is the length of each moment segment inside ``vec``.
            # NOTE(review): formulas presumably follow the referenced
            # paper -- verify against its equations.
            dim = (vec.numel() - self.num_param) // 2
            beta1_tau = self.betas[0]**self.sync_period
            beta2_tau = self.betas[1]**self.sync_period
            beta1_rho = self.betas[0]**(self.rho*self.block_momentum)
            beta2_rho = self.betas[1]**(self.rho*self.block_momentum)
            self.exp_avg = beta1_tau * (beta1_rho - 1) * self.exp_avg
            self.exp_avg += (1 - beta1_tau * beta1_rho) *\
                            vec[self.num_param:self.num_param+dim]
            self.exp_avg = self.exp_avg / (1 - beta1_tau)
            self.exp_avg_sq = beta2_tau * (beta2_rho - 1) * self.exp_avg_sq
            self.exp_avg_sq += (1 - beta2_tau * beta2_rho) *\
                               vec[self.num_param+dim:]
            self.exp_avg_sq = self.exp_avg_sq / (1 - beta2_tau)
            # Store the new moments in their segments of self.param.
            self.param[self.num_param:self.num_param+dim] = self.exp_avg
            self.param[self.num_param+dim:] = self.exp_avg_sq

        # Publish parameters and moments from the master node.
        dist.broadcast(tensor=self.param, src=self.master_node,
                       async_op=False)
        _copy_vec_to_param(self.param[:self.num_param],
                           self.model.parameters())
        # Assign the flattened moments back to the optimizer state.
        # ptr1 walks the first-moment segment, ptr2 the second-moment one.
        ptr1 = self.num_param
        ptr2 = self.num_param+(self.param.numel()-self.num_param)//2
        for group in self.optim.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                state = self.optim.state[p]
                state['step'] += self.rho * self.block_momentum
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                numel = exp_avg.numel()
                # copy_ writes in place into the optimizer's own buffers.
                exp_avg.data = exp_avg.data\
                               .copy_(self.param[ptr1:ptr1+numel]
                                      .view_as(exp_avg).data)
                exp_avg_sq.data = exp_avg_sq.data\
                                  .copy_(self.param[ptr2:ptr2+numel]
                                         .view_as(exp_avg_sq).data)
                ptr1 += numel
                ptr2 += numel


        return SUCCESS

    def broadcast(self, tensor):
        """Broadcast ``tensor`` from the master node (blocking call)."""
        dist.broadcast(tensor=tensor, src=self.master_node, async_op=False)

    def sum_reduce(self, tensor):
        """Reduce ``tensor`` onto the master node for the trainer."""
        # No ``op`` is given, so the default (ReduceOp.SUM) applies.
        dist.reduce(tensor=tensor, dst=self.master_node)


================================================
FILE: utils/check_kwargs.py
================================================
import inspect


def check_kwargs(func, kwargs, name=None):
    """Check that every key of ``kwargs`` is a keyword ``func`` accepts.

    If kwargs are invalid, raise TypeError as same as python default.
    A callable that declares a ``**kwargs`` parameter accepts any keyword,
    so nothing is rejected for it.  Callables whose signature cannot be
    introspected are not checked at all.

    :param function func: function to be validated
    :param dict kwargs: keyword arguments for func
    :param str name: name used in TypeError (default is func name)
    :raises TypeError: if a key of ``kwargs`` is not a parameter of ``func``
    """
    try:
        params = inspect.signature(func).parameters
    except ValueError:
        # No signature available for this callable: nothing to validate.
        return
    # ``def f(**kw)`` takes arbitrary keywords in a real call, so mirror that.
    if any(p.kind is inspect.Parameter.VAR_KEYWORD for p in params.values()):
        return
    if name is None:
        # functools.partial objects and similar callables have no __name__.
        name = getattr(func, "__name__", repr(func))
    for k in kwargs:
        if k not in params:
            raise TypeError(f"{name}() got an unexpected keyword argument '{k}'")


================================================
FILE: utils/cli_readers.py
================================================
import io
import logging
import sys

import h5py
import kaldiio
import soundfile

from espnet.utils.io_utils import SoundHDF5File


def file_reader_helper(
    rspecifier: str,
    filetype: str = "mat",
    return_shape: bool = False,
    segments: str = None,
):
    """Build a reader of ``(uttid, array)`` pairs in kaldi style.

    "ark" is also used for HDF5 here in order to imitate the
    "kaldi-rspecifier" notation, which can look a bit confusing.

    Args:
        rspecifier: Give as "ark:feats.ark" or "scp:feats.scp"
        filetype: "mat" is kaldi-matrix, "hdf5": HDF5; "sound.hdf5" and
            "sound" are accepted as well
        return_shape: Return the shape of the matrix,
            instead of the matrix. This can reduce IO cost for HDF5.
        segments: forwarded to ``KaldiReader``; only used when
            ``filetype`` is "mat"
    Returns:
        A reader object matching ``filetype``; iterate over it to get the
        entries.
    Raises:
        NotImplementedError: if ``filetype`` is not one of the above.

    Examples:
        Read from kaldi-matrix ark file:

        >>> for u, array in file_reader_helper('ark:feats.ark', 'mat'):
        ...     array

        Read from HDF5 file:

        >>> for u, array in file_reader_helper('ark:feats.h5', 'hdf5'):
        ...     array

    """
    if filetype == "mat":
        return KaldiReader(rspecifier, return_shape=return_shape, segments=segments)
    if filetype == "hdf5":
        return HDF5Reader(rspecifier, return_shape=return_shape)
    if filetype == "sound.hdf5":
        return SoundHDF5Reader(rspecifier, return_shape=return_shape)
    if filetype == "sound":
        return SoundReader(rspecifier, return_shape=return_shape)
    raise NotImplementedError(f"filetype={filetype}")


class KaldiReader:
    """Iterable over the entries that ``kaldiio.ReadHelper`` reads.

    Yields ``(key, array)`` pairs, or ``(key, array.shape)`` pairs when
    ``return_shape`` is set.
    """

    def __init__(self, rspecifier, return_shape=False, segments=None):
        self.segments = segments
        self.return_shape = return_shape
        self.rspecifier = rspecifier

    def __iter__(self):
        helper = kaldiio.ReadHelper(self.rspecifier, segments=self.segments)
        with helper as reader:
            for utt_id, matrix in reader:
                yield utt_id, (matrix.shape if self.return_shape else matrix)


class HDF5Reader:
    """Iterable over the datasets of HDF5 files, addressed kaldi style.

    With an "ark" specifier the file itself is walked key by key ("-" reads
    the file content from stdin).  With an "scp" specifier every line of
    the scp file is ``"uttid filepath.h5:key"``.

    Yields ``(key, array)`` pairs, or ``(key, shape)`` pairs when
    ``return_shape`` is set.

    Raises:
        ValueError: if ``rspecifier`` has no ":" or its type is neither
            "ark" nor "scp".
    """

    def __init__(self, rspecifier, return_shape=False):
        if ":" not in rspecifier:
            # Use the argument here: self.rspecifier is not assigned yet.
            raise ValueError(
                'Give "rspecifier" such as "ark:some.ark: {}"'.format(rspecifier)
            )
        self.rspecifier = rspecifier
        self.ark_or_scp, self.filepath = self.rspecifier.split(":", 1)
        if self.ark_or_scp not in ["ark", "scp"]:
            raise ValueError(f"Must be scp or ark: {self.ark_or_scp}")

        self.return_shape = return_shape

    def __iter__(self):
        if self.ark_or_scp == "scp":
            # Cache of opened files, keyed by path, shared by all scp lines.
            hdf5_dict = {}
            try:
                with open(self.filepath, "r", encoding="utf-8") as f:
                    for line in f:
                        key, value = line.rstrip().split(None, 1)

                        if ":" not in value:
                            raise RuntimeError(
                                "scp file for hdf5 should be like: "
                                '"uttid filepath.h5:key": {}({})'.format(
                                    line, self.filepath
                                )
                            )
                        path, h5_key = value.split(":", 1)

                        hdf5_file = hdf5_dict.get(path)
                        if hdf5_file is None:
                            try:
                                hdf5_file = h5py.File(path, "r")
                            except Exception:
                                logging.error("Error when loading {}".format(path))
                                raise
                            hdf5_dict[path] = hdf5_file

                        try:
                            data = hdf5_file[h5_key]
                        except Exception:
                            logging.error(
                                "Error when loading {} with key={}".format(path, h5_key)
                            )
                            raise

                        if self.return_shape:
                            yield key, data.shape
                        else:
                            yield key, data[()]
            finally:
                # Close every opened file, also when the consumer stops early
                # or a line fails; closing itself stays best-effort.
                for opened in hdf5_dict.values():
                    try:
                        opened.close()
                    except Exception:
                        pass

        else:
            if self.filepath == "-":
                # Required h5py>=2.9
                filepath = io.BytesIO(sys.stdin.buffer.read())
            else:
                filepath = self.filepath
            with h5py.File(filepath, "r") as f:
                for key in f:
                    if self.return_shape:
                        yield key, f[key].shape
                    else:
                        yield key, f[key][()]


class SoundHDF5Reader:
    """Iterable over sound entries stored in ``SoundHDF5File`` containers.

    With an "ark" specifier the file itself is walked ("-" reads the file
    content from stdin).  With an "scp" specifier every line of the scp
    file is ``"uttid filepath.h5:key"``.

    Yields ``(key, (rate, array))`` pairs, i.e. scipy style; ``array`` is
    replaced by its shape when ``return_shape`` is set.

    Raises:
        ValueError: if ``rspecifier`` has no ":" or its type is neither
            "ark" nor "scp".
    """

    def __init__(self, rspecifier, return_shape=False):
        if ":" not in rspecifier:
            raise ValueError(
                'Give "rspecifier" such as "ark:some.ark: {}"'.format(rspecifier)
            )
        self.ark_or_scp, self.filepath = rspecifier.split(":", 1)
        if self.ark_or_scp not in ["ark", "scp"]:
            raise ValueError(f"Must be scp or ark: {self.ark_or_scp}")
        self.return_shape = return_shape

    @staticmethod
    def _close_quietly(sound_file):
        """Close ``sound_file``, ignoring any error (best-effort cleanup)."""
        try:
            sound_file.close()
        except Exception:
            pass

    def __iter__(self):
        if self.ark_or_scp == "scp":
            # Cache of opened files, keyed by path, shared by all scp lines.
            hdf5_dict = {}
            try:
                with open(self.filepath, "r", encoding="utf-8") as f:
                    for line in f:
                        key, value = line.rstrip().split(None, 1)

                        if ":" not in value:
                            raise RuntimeError(
                                "scp file for hdf5 should be like: "
                                '"uttid filepath.h5:key": {}({})'.format(
                                    line, self.filepath
                                )
                            )
                        path, h5_key = value.split(":", 1)

                        hdf5_file = hdf5_dict.get(path)
                        if hdf5_file is None:
                            try:
                                hdf5_file = SoundHDF5File(path, "r")
                            except Exception:
                                logging.error("Error when loading {}".format(path))
                                raise
                            hdf5_dict[path] = hdf5_file

                        try:
                            data = hdf5_file[h5_key]
                        except Exception:
                            logging.error(
                                "Error when loading {} with key={}".format(path, h5_key)
                            )
                            raise

                        # Change Tuple[ndarray, int] -> Tuple[int, ndarray]
                        # (soundfile style -> scipy style)
                        array, rate = data
                        if self.return_shape:
                            array = array.shape
                        yield key, (rate, array)
            finally:
                # Close every opened file, also when the consumer stops early
                # or a line fails.
                for opened in hdf5_dict.values():
                    self._close_quietly(opened)

        else:
            if self.filepath == "-":
                # Required h5py>=2.9
                filepath = io.BytesIO(sys.stdin.buffer.read())
            else:
                filepath = self.filepath
            sound_file = SoundHDF5File(filepath, "r")
            try:
                for key, (a, r) in sound_file.items():
                    if self.return_shape:
                        a = a.shape
                    yield key, (r, a)
            finally:
                # The file used to be left open here.
                self._close_quietly(sound_file)


class SoundReader:
    """Iterable over the sound files listed in an scp file.

    Every scp line is ``"uttid path/to/soundfile"``.  Yields
    ``(key, (rate, array))`` pairs (scipy style); ``array`` is replaced by
    its shape when ``return_shape`` is set.

    Raises:
        ValueError: if ``rspecifier`` has no ":" or its type is not "scp".
    """

    def __init__(self, rspecifier, return_shape=False):
        if ":" not in rspecifier:
            raise ValueError(
                'Give "rspecifier" such as "scp:some.scp: {}"'.format(rspecifier)
            )
        # A ":" is guaranteed at this point, so partition always splits.
        kind, _, path = rspecifier.partition(":")
        self.ark_or_scp = kind
        self.filepath = path
        if kind != "scp":
            raise ValueError(
                'Only supporting "scp" for sound file: {}'.format(self.ark_or_scp)
            )
        self.return_shape = return_shape

    def __iter__(self):
        with open(self.filepath, "r", encoding="utf-8") as scp:
            for entry in scp:
                utt_id, wav_path = entry.rstrip().split(None, 1)
                # Assume PCM16
                samples, rate = soundfile.read(wav_path, dtype="int16")
                # soundfile returns (ndarray, int); emit scipy's (int, ndarray).
                payload = samples.shape if self.return_shape else samples
                yield utt_id, (rate, payload)


================================================
FILE: utils/cli_utils.py
================================================
from collections.abc import Sequence
from distutils.util import strtobool as dist_strtobool
import sys

import numpy


def strtobool(x):
    """Convert a truth-value string to a real ``bool``."""
    # distutils.util.strtobool hands back an integer, which is confusing
    # for callers, so it is converted explicitly.
    as_int = dist_strtobool(x)
    return bool(as_int)


def get_commandline_args():
    """Return the current command line as one shell-escaped string.

    The result is ``sys.executable`` followed by every entry of
    ``sys.argv``.  Entries that are empty or contain a character with a
    special meaning to the shell are wrapped in single quotes, with
    embedded single quotes written as ``'\\''``.
    """
    # Characters that force an argument to be single-quoted.  The single
    # quote itself is part of the set: its ``'\''`` replacement is only
    # valid shell syntax inside a quoted word.
    extra_chars = " ;&()|^<>?*[]$`\"\\!{}'"

    argv = []
    for arg in sys.argv:
        escaped = arg.replace("'", "'\\''")
        # An empty argument must be written as '' or it would disappear.
        if arg == "" or any(char in arg for char in extra_chars):
            escaped = "'" + escaped + "'"
        argv.append(escaped)

    return sys.executable + " " + " ".join(argv)


def is_scipy_wav_style(value):
    """Tell whether ``value`` looks like ``Tuple[int, numpy.ndarray]``.

    Any two-element ``Sequence`` whose first item is an ``int`` and whose
    second item is a ``numpy.ndarray`` qualifies.
    """
    if not isinstance(value, Sequence):
        return False
    if len(value) != 2:
        return False
    rate, signal = value[0], value[1]
    return isinstance(rate, int) and isinstance(signal, numpy.ndarray)


def assert_scipy_wav_style(value):
    """Assert that ``value`` is ``Tuple[int, numpy.ndarray]``.

    The assertion message names the type actually received; for a sequence
    it also lists the type of every element.
    """

    def _describe():
        # Only evaluated when the assertion fails.
        if not isinstance(value, Sequence):
            return type(value)
        members = ", ".join(str(type(v)) for v in value)
        return "{}[{}]".format(type(value), members)

    assert is_scipy_wav_style(
        value
    ), "Must be Tuple[int, numpy.ndarray], but got {}".format(_describe())


================================================
FILE: utils/cli_writers.py
================================================
from pathlib import Path
from typing import Dict

import h5py
import kaldiio
import numpy
import soundfile

from espnet.utils.cli_utils import assert_scipy_wav_style
from espnet.utils.io_utils import SoundHDF5File


def file_writer_helper(
    wspecifier: str,
    filetype: str = "mat",
    write_num_frames: str = None,
    compress: bool = False,
    compression_method: int = 2,
    pcm_format: str = "wav",
):
    """Build a writer of matrices in kaldi style.

    Args:
        wspecifier: e.g. ark,scp:out.ark,out.scp
        filetype: "mat" is kaldi-matrix, "hdf5": HDF5; "sound.hdf5" and
            "sound" are accepted as well
        write_num_frames: e.g. 'ark,t:num_frames.txt'
        compress: Compress or not (forwarded for "mat" and "hdf5")
        compression_method: Specify compression level (forwarded for "mat")
        pcm_format: forwarded to the "sound.hdf5" and "sound" writers

    Raises:
        NotImplementedError: if ``filetype`` is not one of the above.

    Write in kaldi-matrix-ark with "kaldi-scp" file:

    >>> with file_writer_helper('ark,scp:out.ark,out.scp') as f:
    >>>     f['uttid'] = array

    This "scp" has the following format:

        uttidA out.ark:1234
        uttidB out.ark:2222

    where 1234 and 2222 point to the starting byte address of the matrix.
    (For detail, see official documentation of Kaldi)

    Write in HDF5 with "scp" file:

    >>> with file_writer_helper('ark,scp:out.h5,out.scp', 'hdf5') as f:
    >>>     f['uttid'] = array

    This "scp" file is created as:

        uttidA out.h5:uttidA
        uttidB out.h5:uttidB

    Unlike "kaldi-ark", HDF5 allows access by any key, so "scp" is not
    strictly required for random reading.  It is created nevertheless
    because it is useful for some use-cases, e.g. concatenation and
    splitting.

    """
    if filetype == "mat":
        return KaldiWriter(
            wspecifier,
            write_num_frames=write_num_frames,
            compress=compress,
            compression_method=compression_method,
        )
    if filetype == "hdf5":
        return HDF5Writer(
            wspecifier, write_num_frames=write_num_frames, compress=compress
        )
    if filetype == "sound.hdf5":
        return SoundHDF5Writer(
            wspecifier, write_num_frames=write_num_frames, pcm_format=pcm_format
        )
    if filetype == "sound":
        return SoundWriter(
            wspecifier, write_num_frames=write_num_frames, pcm_format=pcm_format
        )
    raise NotImplementedError(f"filetype={filetype}")


class BaseWriter:
    """Common context-manager and close handling for the writers.

    Subclasses provide ``__setitem__`` and the attributes ``writer``,
    ``writer_scp`` and ``writer_nframe`` (the latter two may be None).
    """

    def __setitem__(self, key, value):
        raise NotImplementedError

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def close(self):
        """Close all underlying handles, ignoring errors raised by close."""
        # The main writer is closed unconditionally; any failure (including
        # a writer that is None) is swallowed.
        try:
            self.writer.close()
        except Exception:
            pass

        # The optional side files are closed only when present.
        for attr in ("writer_scp", "writer_nframe"):
            handle = getattr(self, attr)
            if handle is None:
                continue
            try:
                handle.close()
            except Exception:
                pass


def get_num_frames_writer(write_num_frames: str):
    """Open the text file named by a ``write_num_frames`` specifier.

    Args:
        write_num_frames: specifier of the form ``"ark,t:<path>"``, or None.

    Returns:
        A text file object opened for writing (utf-8), or None when
        ``write_num_frames`` is None.

    Raises:
        ValueError: if the specifier has no ":" or its type is not "ark,t".

    Examples:
        >>> get_num_frames_writer('ark,t:num_frames.txt')
    """
    if write_num_frames is None:
        # Nothing requested; without this guard the open() call below would
        # run with an unbound file name.
        return None

    if ":" not in write_num_frames:
        raise ValueError(
            'Must include ":", write_num_frames={}'.format(write_num_frames)
        )

    nframes_type, nframes_file = write_num_frames.split(":", 1)
    if nframes_type != "ark,t":
        raise ValueError(
            "Only supporting text mode. "
            "e.g. --write-num-frames=ark,t:foo.txt :"
            "{}".format(nframes_type)
        )

    return open(nframes_file, "w", encoding="utf-8")


class KaldiWriter(BaseWriter):
    """Writer that stores entries through ``kaldiio.WriteHelper``.

    ``compression_method`` is only handed to kaldiio when ``compress`` is
    true.  When ``write_num_frames`` is given, ``"<key> <len(value)>"`` is
    logged for every entry written.
    """

    def __init__(
        self, wspecifier, write_num_frames=None, compress=False, compression_method=2
    ):
        helper_options = {}
        if compress:
            helper_options["compression_method"] = compression_method
        self.writer = kaldiio.WriteHelper(wspecifier, **helper_options)
        self.writer_scp = None
        self.writer_nframe = (
            None
            if write_num_frames is None
            else get_num_frames_writer(write_num_frames)
        )

    def __setitem__(self, key, value):
        self.writer[key] = value
        if self.writer_nframe is None:
            return
        self.writer_nframe.write(f"{key} {len(value)}\n")


def parse_wspecifier(wspecifier: str) -> Dict[str, str]:
    """Split a wspecifier into a mapping from type to file path.

    Only the type prefixes "ark", "scp,ark" and "ark,scp" are accepted,
    and the number of comma-separated paths must match the number of
    types.

    Raises:
        ValueError: if the prefix is not allowed, the counts differ, or
            ``wspecifier`` has no ":".

    Examples:
        >>> parse_wspecifier('ark,scp:out.ark,out.scp')
        {'ark': 'out.ark', 'scp': 'out.scp'}

    """
    allowed_prefixes = ("ark", "scp,ark", "ark,scp")
    ark_scp, filepath = wspecifier.split(":", 1)
    if ark_scp not in allowed_prefixes:
        raise ValueError("{} is not allowed: {}".format(ark_scp, wspecifier))
    kinds = ark_scp.split(",")
    paths = filepath.split(",")
    if len(kinds) != len(paths):
        raise ValueError("Mismatch: {} and {}".format(ark_scp, filepath))
    return {kind: path for kind, path in zip(kinds, paths)}


class HDF5Writer(BaseWriter):
    """Writer that stores every entry as a dataset of one HDF5 file.

    The file is the "ark" part of the wspecifier; an optional "scp" part
    receives ``"<key> <h5 file>:<key>"`` lines, and ``write_num_frames``
    enables ``"<key> <len(value)>"`` lines.  With ``compress`` the
    datasets are created with gzip compression.

    Examples:
        >>> with HDF5Writer('ark:out.h5', compress=True) as f:
        ...     f['key'] = array
    """

    def __init__(self, wspecifier, write_num_frames=None, compress=False):
        spec_dict = parse_wspecifier(wspecifier)
        self.filename = spec_dict["ark"]

        # Extra keyword arguments for every create_dataset call.
        self.kwargs = {"compression": "gzip"} if compress else {}
        self.writer = h5py.File(spec_dict["ark"], "w")

        scp_path = spec_dict.get("scp")
        self.writer_scp = (
            open(scp_path, "w", encoding="utf-8") if "scp" in spec_dict else None
        )
        self.writer_nframe = (
            None
            if write_num_frames is None
            else get_num_frames_writer(write_num_frames)
        )

    def __setitem__(self, key, value):
        self.writer.create_dataset(key, data=value, **self.kwargs)

        scp, nframe = self.writer_scp, self.writer_nframe
        if scp is not None:
            scp.write(f"{key} {self.filename}:{key}\n")
        if nframe is not None:
            nframe.write(f"{key} {len(value)}\n")


class SoundHDF5Writer(BaseWriter):
    """Writer that stores ``(rate, signal)`` pairs in a ``SoundHDF5File``.

    The file is the "ark" part of the wspecifier; an optional "scp" part
    receives ``"<key> <h5 file>:<key>"`` lines, and ``write_num_frames``
    enables ``"<key> <len(signal)>"`` lines.

    Examples:
        >>> fs = 16000
        >>> with SoundHDF5Writer('ark:out.h5') as f:
        ...     f['key'] = fs, array
    """

    def __init__(self, wspecifier, write_num_frames=None, pcm_format="wav"):
        self.pcm_format = pcm_format
        spec_dict = parse_wspecifier(wspecifier)
        self.filename = spec_dict["ark"]
        self.writer = SoundHDF5File(spec_dict["ark"], "w", format=self.pcm_format)

        scp_path = spec_dict.get("scp")
        self.writer_scp = (
            open(scp_path, "w", encoding="utf-8") if "scp" in spec_dict else None
        )
        self.writer_nframe = (
            None
            if write_num_frames is None
            else get_num_frames_writer(write_num_frames)
        )

    def __setitem__(self, key, value):
        assert_scipy_wav_style(value)
        # The input is scipy style (rate, signal); the container expects
        # soundfile style (signal, rate).
        rate, signal = value[0], value[1]
        self.writer.create_dataset(key, data=(signal, rate))

        scp, nframe = self.writer_scp, self.writer_nframe
        if scp is not None:
            scp.write(f"{key} {self.filename}:{key}\n")
        if nframe is not None:
            nframe.write(f"{key} {len(signal)}\n")


class SoundWriter(BaseWriter):
    """Writer that saves every ``(rate, signal)`` pair as one sound file.

    The "ark" part of the wspecifier is a directory (created if missing);
    each entry becomes ``<dir>/<key>.<pcm_format>``, written as int16.  An
    optional "scp" part receives ``"<key> <file path>"`` lines, and
    ``write_num_frames`` enables ``"<key> <len(signal)>"`` lines.

    Examples:
        >>> fs = 16000
        >>> with SoundWriter('ark,scp:outdir,out.scp') as f:
        ...     f['key'] = fs, array
    """

    def __init__(self, wspecifier, write_num_frames=None, pcm_format="wav"):
        self.pcm_format = pcm_format
        spec_dict = parse_wspecifier(wspecifier)
        # e.g. ark,scp:dirname,wav.scp
        # -> The wave files are found in dirname/*.wav
        self.dirname = spec_dict["ark"]
        Path(self.dirname).mkdir(parents=True, exist_ok=True)
        # There is no single archive handle for this writer.
        self.writer = None

        scp_path = spec_dict.get("scp")
        self.writer_scp = (
            open(scp_path, "w", encoding="utf-8") if "scp" in spec_dict else None
        )
        self.writer_nframe = (
            None
            if write_num_frames is None
            else get_num_frames_writer(write_num_frames)
        )

    def __setitem__(self, key, value):
        assert_scipy_wav_style(value)
        rate, signal = value
        file_name = ".".join((key, self.pcm_format))
        wavfile = Path(self.dirname) / file_name
        pcm16 = signal.astype(numpy.int16)
        soundfile.write(wavfile, pcm16, rate)

        scp, nframe = self.writer_scp, self.writer_nframe
        if scp is not None:
            scp.write(f"{key} {wavfile}\n")
        if nframe is not None:
            nframe.write(f"{key} {len(signal)}\n")


================================================
FILE: utils/dataset.py
================================================
#!/usr/bin/env python

# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
# Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""pytorch dataset and dataloader implementation for chainer training."""

import torch
import torch.utils.data
import time

def get_time():
    """Return the current local time formatted by ``time.asctime``."""
    now = time.time()
    local_now = time.localtime(now)
    return time.asctime(local_now)

class TransformDataset(torch.utils.data.Dataset):
    """Transform Dataset for pytorch backend.

    Every item is produced lazily by applying ``transform`` to the
    corresponding element of ``data``.

    Args:
        data: list object from make_batchset
        transform: transform function

    """

    def __init__(self, data, transform):
        """Keep references to the data and to the transform function."""
        # The zero-argument form is required here: ``super(TransformDataset)``
        # creates an unbound super object, so its ``__init__`` never reaches
        # the parent class initializer.
        super().__init__()
        self.data = data
        self.transform = transform

    def __len__(self):
        """Return the number of elements in ``data``."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``transform(data[idx])``."""
        return self.transform(self.data[idx])


class ChainerDataLoader(object):
    """Expose a torch ``DataLoader`` through a chainer-iterator style API.

    Args:
        **kwargs: keyword arguments forwarded unchanged to
            ``torch.utils.data.dataloader.DataLoader``. ``dataset`` must be
            among them because its length drives the epoch bookkeeping.

    """

    def __init__(self, **kwargs):
        """Build the underlying loader and reset the progress counters."""
        self.loader = torch.utils.data.dataloader.DataLoader(**kwargs)
        # epoch accounting counts ``next`` calls against the dataset length
        self.len = len(kwargs["dataset"])
        self.current_position = 0
        self.epoch = 0
        self.iter = None
        # kept so that ``start_shuffle`` can rebuild the loader later
        self.kwargs = kwargs

    def next(self):
        """Return the next batch, restarting the loader when it runs out."""
        if self.iter is None:
            self.iter = iter(self.loader)
        try:
            batch = next(self.iter)
        except StopIteration:
            # exhausted: forget the iterator and retry with a fresh one
            self.iter = None
            return self.next()

        position = self.current_position + 1
        if position == self.len:
            self.epoch += 1
            position = 0
        self.current_position = position
        return batch

    def __iter__(self):
        """Iterate once over the underlying loader."""
        yield from self.loader

    @property
    def epoch_detail(self):
        """Epoch_detail required by chainer: epoch plus fractional progress."""
        progress = self.current_position / self.len
        return self.epoch + progress

    def serialize(self, serializer):
        """Pass the counters through ``serializer`` and store what it returns."""
        self.epoch, self.current_position = (
            serializer("epoch", self.epoch),
            serializer("current_position", self.current_position),
        )

    def start_shuffle(self):
        """Rebuild the loader with an updated ``shuffle`` option (sortagrad)."""
        # NOTE(review): shuffle becomes True only when a "sampler" key was
        # passed and None otherwise; torch documents ``sampler`` and
        # ``shuffle`` as mutually exclusive, so this condition may be
        # inverted - confirm the intent against the callers.
        if "sampler" in self.kwargs:
            self.kwargs["shuffle"] = True
        else:
            self.kwargs["shuffle"] = None
        self.loader = torch.utils.data.dataloader.DataLoader(**self.kwargs)

    def finalize(self):
        """Drop the reference to the underlying loader."""
        del self.loader


================================================
FILE: utils/deterministic_utils.py
================================================
import logging
import os

import chainer
import torch


def set_deterministic_pytorch(args):
    """Seed pytorch and configure cuDNN according to ``args.debugmode``.

    :param Namespace args: The program arguments; ``seed`` and ``debugmode``
        are read from it
    """
    # seed setting
    torch.manual_seed(args.seed)

    # debug mode setting
    # 0 would be fastest, but 1 seems to be reasonable
    # considering reproducibility
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False  # https://github.com/pytorch/pytorch/issues/6351

    debug_level = args.debugmode
    # below level 2 the (chainer) type check is switched off
    if debug_level < 2:
        chainer.config.type_check = False
        logging.info("torch type check is disabled")
    # below level 1 determinism is traded for the cuDNN benchmark mode
    if debug_level < 1:
        cudnn.deterministic = False
        cudnn.benchmark = True
        logging.info("torch cudnn deterministic is disabled")


def set_deterministic_chainer(args):
    """Export the chainer seed and configure chainer from ``args.debugmode``.

    :param Namespace args: The program arguments; ``seed`` and ``debugmode``
        are read from it
    """
    # seed setting (chainer seed may not need it)
    seed_text = str(args.seed)
    os.environ["CHAINER_SEED"] = seed_text
    logging.info("chainer seed = " + os.environ["CHAINER_SEED"])

    # debug mode setting
    # 0 would be fastest, but 1 seems to be reasonable
    # considering reproducibility
    if args.debugmode < 2:
        # remove type check
        chainer.config.type_check = False
        logging.info("chainer type check is disabled")

    # use deterministic computation or not
    non_deterministic = args.debugmode < 1
    chainer.config.cudnn_deterministic = not non_deterministic
    if non_deterministic:
        logging.info("chainer cudnn deterministic is disabled")


================================================
FILE: utils/draw_num_fst.py
================================================
#!/usr/bin/env python3
# encoding: utf-8
import sys
import torch
import k2
from pathlib import Path
from espnet.snowfall.training.mmi_graph import MmiTrainingGraphCompiler
from espnet.snowfall.lexicon import Lexicon
from espnet.snowfall.training.mmi_graph import create_bigram_phone_lm

def main():
    """Compile the numerator graph of one fixed transcript and draw it.

    The lexicon is read from ``data/lang_k2mmi``, a bigram phone LM is built
    from its phone symbols, and the first numerator graph returned by the
    compiler is drawn to ``num.svg`` in the current directory.
    """
    # compiler
    lang = Path("data/lang_k2mmi")
    lexicon = Lexicon(lang)
    device = torch.device("cuda:0")
    graph_compiler = MmiTrainingGraphCompiler(lexicon=lexicon, device=device)

    # P: bigram phone LM placed on the same device as the compiler
    phone_ids = lexicon.phone_symbols()
    P = create_bigram_phone_lm(phone_ids)
    P = P.to(device)

    # compile num graph
    ys = ["S O U R C E <space> C O L O N"]
    num_graphs, _ = graph_compiler.compile(ys, P, replicate_den=True)
    num = num_graphs[0]

    # draw
    num.draw("num.svg")


# Only run when executed as a script, so that importing this module does not
# trigger graph compilation as a side effect.
if __name__ == "__main__":
    main()


================================================
FILE: utils/dynamic_import.py
================================================
import importlib


def dynamic_import(import_path, alias=None):
    """dynamic import module and class

    :param str import_path: syntax 'module_name:class_name'
        e.g., 'espnet.transform.add_deltas:AddDeltas'
    :param dict alias: shortcut for registered class; maps a short name to a
        full 'module_name:class_name' path. ``None`` means no shortcuts.
    :return: imported class
    :raises ValueError: if import_path contains no ":" and is not a key
        of alias
    """
    # A None default avoids sharing one mutable dict between all calls.
    if alias is None:
        alias = {}

    if ":" not in import_path:
        if import_path not in alias:
            raise ValueError(
                "import_path should be one of {} or "
                'include ":", e.g. "espnet.transform.add_deltas:AddDeltas" : '
                "{}".format(set(alias), import_path)
            )
        # resolve the shortcut to its full 'module:object' form
        import_path = alias[import_path]

    module_name, objname = import_path.split(":")
    m = importlib.import_module(module_name)
    return getattr(m, objname)


================================================
FILE: utils/fill_missing_args.py
================================================
# -*- coding: utf-8 -*-

# Copyright 2018 Nagoya University (Tomoki Hayashi)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

import argparse
import logging


def fill_missing_args(args, add_arguments):
    """Fill missing arguments in args.

    Note that when ``args`` is a Namespace its own attribute dict is the one
    being filled, so the missing attributes are also added to the object
    that was passed in.

    Args:
        args (Namespace or None): Namespace containing hyperparameters.
        add_arguments (function): Function to add arguments. It receives an
            ``argparse.ArgumentParser`` and must return a parser.

    Returns:
        Namespace: Arguments whose missing ones are filled with default value.

    Examples:
        >>> from argparse import Namespace
        >>> from espnet.nets.pytorch_backend.e2e_tts_tacotron2 import Tacotron2
        >>> args = Namespace()
        >>> fill_missing_args(args, Tacotron2.add_arguments_fn)
        Namespace(aconv_chans=32, aconv_filts=15, adim=512, atype='location', ...)

    """
    # check argument type
    assert isinstance(args, argparse.Namespace) or args is None
    assert callable(add_arguments)

    # get default arguments
    # An explicit empty list is parsed on purpose: without it argparse reads
    # sys.argv, so options of the running program could leak into the
    # values that are meant to be plain defaults.
    parser = add_arguments(argparse.ArgumentParser())
    default_args, _ = parser.parse_known_args([])

    # convert to dict
    args = {} if args is None else vars(args)

    for key, value in vars(default_args).items():
        if key not in args:
            logging.info(
                'attribute "%s" does not exist. use default %s.', key, str(value)
            )
            args[key] = value

    return argparse.Namespace(**args)


================================================
FILE: utils/io_utils.py
================================================
from collections import OrderedDict
import io
import logging
import os
import copy
import h5py
import kaldiio
import numpy as np
import soundfile
import time
import psutil
import torch
from espnet.transform.transformation import Transformation

def get_time():
    """Return the current local time formatted by ``time.asctime``."""
    seconds = time.time()
    broken_down = time.localtime(seconds)
    return time.asctime(broken_down)

class LoadInputsAndTargets(object):
    """Create a mini-batch from a list of dicts

    >>> batch = [('utt1',
    ...           dict(input=[dict(feat='some.ark:123',
    ...                            filetype='mat',
    ...                            name='input1',
    ...                            shape=[100, 80])],
    ...                output=[dict(tokenid='1 2 3 4',
    ...                             name='target1',
    ...                             shape=[4, 31])]]))
    >>> l = LoadInputsAndTargets()
    >>> feat, target = l(batch)

    :param: str mode: Specify the task mode, "asr" or "tts"
    :param: str preprocess_conf: The path of a json file for pre-processing
    :param: bool load_input: If False, not to load the input data
    :param: bool load_output: If False, not to load the output data
    :param: bool sort_in_input_length: Sort the mini-batch in descending order
        of the input length
    :param: bool use_speaker_embedding: Used for tts mode only
    :param: bool use_second_target: Used for tts mode only
    :param: dict preprocess_args: Set some optional arguments for preprocessing
    :param: Optional[dict] preprocess_args: Used for tts mode only
    """

    def __init__(
        self,
        mode="asr",
        preprocess_conf=None,
        load_input=True,
        load_output=True,
        sort_in_input_length=True,
        use_speaker_embedding=False,
        use_second_target=False,
        preprocess_args=None,
        keep_all_data_on_mem=False,
        block_load=False,
    ):
        self._loaders = {}
        if mode not in ["asr", "tts", "mt", "vc"]:
            raise ValueError("Only asr or tts are allowed: mode={}".format(mode))
        if preprocess_conf is not None:
            self.preprocessing = Transformation(preprocess_conf)
            logging.warning(
                "[Experimental feature] Some preprocessing will be done "
                "for the mini-batch creation using {}".format(self.preprocessing)
            )
        else:
            # If conf doesn't exist, this function don't touch anything.
            self.preprocessing = None

        if use_second_target and use_speaker_embedding and mode == "tts":
            raise ValueError(
                'Choose one of "use_second_target" and ' '"use_speaker_embedding "'
            )
        if (
            (use_second_target or use_speaker_embedding)
            and mode != "tts"
            and mode != "vc"
        ):
            logging.warning(
                '"use_second_target" and "use_speaker_embedding" is '
                "used only for tts or vc mode"
            )

        self.mode = mode
        self.load_output = load_output
        self.load_input = load_input
        self.sort_in_input_length = sort_in_input_length
        self.use_speaker_embedding = use_speaker_embedding
        self.use_second_target = use_second_target
        if preprocess_args is None:
            self.preprocess_args = {}
        else:
            assert isinstance(preprocess_args, dict), type(preprocess_args)
            self.preprocess_args = dict(preprocess_args)

        self.keep_all_data_on_mem = keep_all_data_on_mem
        self.block_load = block_load

    def __call__(self, batch, return_uttid=False):
        """Function to load inputs and targets from list of dicts

        :param List[Tuple[str, dict]] batch: list of dict which is subset of
            loaded data.json
        :param bool return_uttid: return utterance ID information for visualization
        :return: list of input token id sequences [(L_1), (L_2), ..., (L_B)]
        :return: list of input feature sequences
            [(T_1, D), (T_2, D), ..., (T_B, D)]
        :rtype: list of float ndarray
        :return: list of target token id sequences [(L_1), (L_2), ..., (L_B)]
        :rtype: list of int ndarray

        """
        x_feats_dict = OrderedDict()  # OrderedDict[str, List[np.ndarray]]
        y_feats_dict = OrderedDict()  # OrderedDict[str, List[np.ndarray]]
        uttid_list = []  # List[str]
        text_list = [] # List[str]

        if self.block_load:
            _, info = batch[0]
            ark_names = [parse_arkpath(inp["feat"]) for inp in info["input"]]

        for uttid, info in batch:
            uttid_list.append(uttid)

            if self.load_input:
                # Note(kamo): This for-loop is for multiple inputs
                for idx, inp in enumerate(info["input"]):
                    # {"input":
                    #  [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
                    #    "filetype": "hdf5",
                    #    "name": "input1", ...}], ...}
                    if self.block_load:
                        assert parse_arkpath(inp["feat"]) == ark_names[idx],\
                               f"The batch should from the same ark if use block_load, key error: {inp['feat']}"
                    x = self._get_from_loader(
                        filepath=inp["feat"], filetype=inp.get("filetype", "mat"),
                        uttid=uttid
                    )
                    x_feats_dict.setdefault(inp["name"], []).append(x)
            # FIXME(kamo): Dirty way to load only speaker_embedding
            elif self.mode == "tts" and self.use_speaker_embedding:
                for idx, inp in enumerate(info["input"]):
                    if idx != 1 and len(info["input"]) > 1:
                        x = None
                    else:
                        x = self._get_from_loader(
                            filepath=inp["feat"], filetype=inp.get("filetype", "mat")
                        )
                    x_feats_dict.setdefault(inp["name"], []).append(x)

            if self.load_output:
                if self.mode == "mt":
                    x = np.fromiter(
                        map(int, info["output"][1]["tokenid"].split()), dtype=np.int64
                    )
                    x_feats_dict.setdefault(info["output"][1]["name"], []).append(x)

                for idx, inp in enumerate(info["output"]):
                    if "tokenid" in inp:
                        # ======= Legacy format for output =======
                        # {"output": [{"tokenid": "1 2 3 4"}])
                        x = np.fromiter(
                            map(int, inp["tokenid"].split()), dtype=np.int64
                        )
                    else:
                        # ======= New format =======
                        # {"input":
                        #  [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
                        #    "filetype": "hdf5",
                        #    "name": "target1", ...}], ...}
                        x = self._get_from_loader(
                            filepath=inp["feat"], filetype=inp.get("filetype", "mat")
                        )

                    y_feats_dict.setdefault(inp["name"], []).append(x)
           
            if "text_org" in info: 
                text_list.append(info["text_org"])
            else:
                text_list.append(info["output"][0]["text"])
        
        if self.mode == "asr":
            return_batch, uttid_list = self._create_batch_asr(
                x_feats_dict, y_feats_dict, uttid_list
            )
        elif self.mode == "tts":
            _, info = batch[0]
            eos = int(info["output"][0]["shape"][1]) - 1
            return_batch, uttid_list = self._create_batch_tts(
                x_feats_dict, y_feats_dict, uttid_list, eos
            )
        elif self.mode == "mt":
            return_batch, uttid_list = self._create_batch_mt(
                x_feats_dict, y_feats_dict, uttid_list
            )
        elif self.mode == "vc":
            return_batch, uttid_list = self._create_batch_vc(
                x_feats_dict, y_feats_dict, uttid_list
            )
        else:
            raise NotImplementedError(self.mode)
        
        """
        Additional information by tyriontian
        xs_orig is the identical spectrum with xs but ignore preprocess (like specaug)
        we need xs_orig for on-the-fly decoding in MBR training
        text_org is the original text label sequence. we need this for MMI training
        """

        return_batch["text_org"] = text_list
        return_batch["xs_orig"] = copy.deepcopy(return_batch["input1"])

        if self.preprocessing is not None:
            # Apply pre-processing all input features
            for x_name in return_batch.keys():
                if x_name.startswith("input"):
                    return_batch[x_name] = self.preprocessing(
                        return_batch[x_name], uttid_list, **self.preprocess_args
                    )
      
        if return_uttid:
            return tuple(return_batch.values()), uttid_list

        # Doesn't return the names now.
        return tuple(return_batch.values())

    def _create_batch_asr(self, x_feats_dict, y_feats_dict, uttid_list):
        """Create a OrderedDict for the mini-batch

        :param OrderedDict x_feats_dict:
            e.g. {"input1": [ndarray, ndarray, ...],
                  "input2": [ndarray, ndarray, ...]}
        :param OrderedDict y_feats_dict:
            e.g. {"target1": [ndarray, ndarray, ...],
                  "target2": [ndarray, ndarray, ...]}
        :param: List[str] uttid_list:
            Give uttid_list to sort in the same order as the mini-batch
        :return: batch, uttid_list
        :rtype: Tuple[OrderedDict, List[str]]
        """
        # handle single-input and multi-input (paralell) asr mode
        xs = list(x_feats_dict.values())

        if self.load_output:
            ys = list(y_feats_dict.values())
            assert len(xs[0]) == len(ys[0]), (len(xs[0]), len(ys[0]))

            # get index of non-zero length samples
            nonzero_idx = list(filter(lambda i: len(ys[0][i]) > 0, range(len(ys[0]))))
            for n in range(1, len(y_feats_dict)):
                nonzero_idx = filter(lambda i: len(ys[n][i]) > 0, nonzero_idx)
        else:
            # Note(kamo): Be careful not to make nonzero_idx to a generator
            nonzero_idx = list(range(len(xs[0])))

        if self.sort_in_input_length:
            # sort in input lengths based on the first input
            nonzero_sorted_idx = sorted(nonzero_idx, key=lambda i: -len(xs[0][i]))
        else:
            nonzero_sorted_idx = nonzero_idx

        if len(nonzero_sorted_idx) != len(xs[0]):
            logging.warning(
                "Target sequences include empty tokenid (batch {} -> {}).".format(
                    len(xs[0]), len(nonzero_sorted_idx)
                )
            )

        # remove zero-length samples
        xs = [[x[i] for i in nonzero_sorted_idx] for x in xs]
        uttid_list = [uttid_list[i] for i in nonzero_sorted_idx]

        x_names = list(x_feats_dict.keys())
        if self.load_output:
            ys = [[y[i] for i in nonzero_sorted_idx] for y in ys]
            y_names = list(y_feats_dict.keys())

            # Keeping x_name and y_name, e.g. input1, for future extension
            return_batch = OrderedDict(
                [
                    *[(x_name, x) for x_name, x in zip(x_names, xs)],
                    *[(y_name, y) for y_name, y in zip(y_names, ys)],
                ]
            )
        else:
            return_batch = OrderedDict([(x_name, x) for x_name, x in zip(x_names, xs)])
        return return_batch, uttid_list

    def _create_batch_mt(self, x_feats_dict, y_feats_dict, uttid_list):
        """Create a OrderedDict for the mini-batch

        :param OrderedDict x_feats_dict:
        :param OrderedDict y_feats_dict:
        :return: batch, uttid_list
        :rtype: Tuple[OrderedDict, List[str]]
        """
        # Create a list from the first item
        xs = list(x_feats_dict.values())[0]

        if self.load_output:
            ys = list(y_feats_dict.values())[0]
            assert len(xs) == len(ys), (len(xs), len(ys))

            # get index of non-zero length samples
            nonzero_idx = filter(lambda i: len(ys[i]) > 0, range(len(ys)))
        else:
            nonzero_idx = range(len(xs))

        if self.sort_in_input_length:
            # sort in input lengths
            nonzero_sorted_idx = sorted(nonzero_idx, key=lambda i: -len(xs[i]))
        else:
            nonzero_sorted_idx = nonzero_idx

        if len(nonzero_sorted_idx) != len(xs):
            logging.warning(
                "Target sequences include empty tokenid (batch {} -> {}).".format(
                    len(xs), len(nonzero_sorted_idx)
                )
            )

        # remove zero-length samples
        xs = [xs[i] for i in nonzero_sorted_idx]
        uttid_list = [uttid_list[i] for i in nonzero_sorted_idx]

        x_name = list(x_feats_dict.keys())[0]
        if self.load_output:
            ys = [ys[i] for i in nonzero_sorted_idx]
            y_name = list(y_feats_dict.keys())[0]

            return_batch = OrderedDict([(x_name, xs), (y_name, ys)])
        else:
            return_batch = OrderedDict([(x_name, xs)])
        return return_batch, uttid_list

    def _create_batch_tts(self, x_feats_dict, y_feats_dict, uttid_list, eos):
        """Create a OrderedDict for the mini-batch

        :param OrderedDict x_feats_dict:
            e.g. {"input1": [ndarray, ndarray, ...],
                  "input2": [ndarray, ndarray, ...]}
        :param OrderedDict y_feats_dict:
            e.g. {"target1": [ndarray, ndarray, ...],
                  "target2": [ndarray, ndarray, ...]}
        :param: List[str] uttid_list:
        :param int eos:
        :return: batch, uttid_list
        :rtype: Tuple[OrderedDict, List[str]]
        """
        # Use the output values as the input feats for tts mode
        xs = list(y_feats_dict.values())[0]
        # get index of non-zero length samples
        nonzero_idx = list(filter(lambda i: len(xs[i]) > 0, range(len(xs))))
        # sort in input lengths
        if self.sort_in_input_length:
            # sort in input lengths
            nonzero_sorted_idx = sorted(nonzero_idx, key=lambda i: -len(xs[i]))
        else:
            nonzero_sorted_idx = nonzero_idx
        # remove zero-length samples
        xs = [xs[i] for i in nonzero_sorted_idx]
        uttid_list = [uttid_list[i] for i in nonzero_sorted_idx]
        # Added eos into input sequence
        xs = [np.append(x, eos) for x in xs]

        if self.load_input:
            ys = list(x_feats_dict.values())[0]
            assert len(xs) == len(ys), (len(xs), len(ys))
            ys = [ys[i] for i in nonzero_sorted_idx]

            spembs = None
            spcs = None
            spembs_name = "spembs_none"
            spcs_name = "spcs_none"

            if self.use_second_target:
                spcs = list(x_feats_dict.values())[1]
                spcs = [spcs[i] for i in nonzero_sorted_idx]
                spcs_name = list(x_feats_dict.keys())[1]

            if self.use_speaker_embedding:
                spembs = list(x_feats_dict.values())[1]
                spembs = [spembs[i] for i in nonzero_sorted_idx]
                spembs_name = list(x_feats_dict.keys())[1]

            x_name = list(y_feats_dict.keys())[0]
            y_name = list(x_feats_dict.keys())[0]

            return_batch = OrderedDict(
                [(x_name, xs), (y_name, ys), (spembs_name, spembs), (spcs_name, spcs)]
            )
        elif self.use_speaker_embedding:
            if len(x_feats_dict) == 0:
                raise IndexError("No speaker embedding is provided")
            elif len(x_feats_dict) == 1:
                spembs_idx = 0
            else:
                spembs_idx = 1

            spembs = list(x_feats_dict.values())[spembs_idx]
            spembs = [spembs[i] for i in nonzero_sorted_idx]

            x_name = list(y_feats_dict.keys())[0]
            spembs_name = list(x_feats_dict.keys())[spembs_idx]

            return_batch = OrderedDict([(x_name, xs), (spembs_name, spembs)])
        else:
            x_name = list(y_feats_dict.keys())[0]

            return_batch = OrderedDict([(x_name, xs)])
        return return_batch, uttid_list

    def _create_batch_vc(self, x_feats_dict, y_feats_dict, uttid_list):
        """Create a OrderedDict for the mini-batch

        :param OrderedDict x_feats_dict:
            e.g. {"input1": [ndarray, ndarray, ...],
                  "input2": [ndarray, ndarray, ...]}
        :param OrderedDict y_feats_dict:
            e.g. {"target1": [ndarray, ndarray, ...],
                  "target2": [ndarray, ndarray, ...]}
        :param: List[str] uttid_list:
        :return: batch, uttid_list
        :rtype: Tuple[OrderedDict, List[str]]
        """
        # Create a list from the first item
        xs = list(x_feats_dict.values())[0]

        # get index of non-zero length samples
        nonzero_idx = list(filter(lambda i: len(xs[i]) > 0, range(len(xs))))

        # sort in input lengths
        if self.sort_in_input_length:
            # sort in input lengths
            nonzero_sorted_idx = sorted(nonzero_idx, key=lambda i: -len(xs[i]))
        else:
            nonzero_sorted_idx = nonzero_idx

        # remove zero-length samples
        xs = [xs[i] for i in nonzero_sorted_idx]
        uttid_list = [uttid_list[i] for i in nonzero_sorted_idx]

        if self.load_output:
            ys = list(y_feats_dict.values())[0]
            assert len(xs) == len(ys), (len(xs), len(ys))
            ys = [ys[i] for i in nonzero_sorted_idx]

            spembs = None
            spcs = None
            spembs_name = "spembs_none"
            spcs_name = "spcs_none"

            if self.use_second_target:
                raise ValueError("Currently second target not supported.")
                spcs = list(x_feats_dict.values())[1]
                spcs = [spcs[i] for i in nonzero_sorted_idx]
                spcs_name = list(x_feats_dict.keys())[1]

            if self.use_speaker_embedding:
                spembs = list(x_feats_dict.values())[1]
                spembs = [spembs[i] for i in nonzero_sorted_idx]
                spembs_name = list(x_feats_dict.keys())[1]

            x_name = list(x_feats_dict.keys())[0]
            y_name = list(y_feats_dict.keys())[0]

            return_batch = OrderedDict(
                [(x_name, xs), (y_name, ys), (spembs_name, spembs), (spcs_name, spcs)]
            )
        elif self.use_speaker_embedding:
            if len(x_feats_dict) == 0:
                raise IndexError("No speaker embedding is provided")
            elif len(x_feats_dict) == 1:
                spembs_idx = 0
            else:
                spembs_idx = 1

            spembs = list(x_feats_dict.values())[spembs_idx]
            spembs = [spembs[i] for i in nonzero_sorted_idx]

            x_name = list(x_feats_dict.keys())[0]
            spembs_name = list(x_feats_dict.keys())[spembs_idx]

            return_batch = OrderedDict([(x_name, xs), (spembs_name, spembs)])
        else:
            x_name = list(x_feats_dict.keys())[0]

            return_batch = OrderedDict([(x_name, xs)])
        return return_batch, uttid_list

    def _get_from_loader(self, filepath, filetype, uttid=None):
        """Return ndarray

        In order to make the fds to be opened only at the first referring,
        the loader are stored in self._loaders

        >>> ndarray = loader.get_from_loader(
        ...     'some/path.h5:F01_050C0101_PED_REAL', filetype='hdf5')

        :param: str filepath:
        :param: str filetype:
        :return:
        :rtype: np.ndarray
        """
        if filetype == "hdf5":
            # e.g.
            #    {"input": [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
            #                "filetype": "hdf5",
            # -> filepath = "some/path.h5", key = "F01_050C0101_PED_REAL"
            filepath, key = filepath.split(":", 1)

            loader = self._loaders.get(filepath)
            if loader is None:
                # To avoid disk access, create loader only for the first time
                loader = h5py.File(filepath, "r")
                self._loaders[filepath] = loader
            return loader[key][()]
        elif filetype == "sound.hdf5":
            # e.g.
            #    {"input": [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
            #                "filetype": "sound.hdf5",
            # -> filepath = "some/path.h5", key = "F01_050C0101_PED_REAL"
            filepath, key = filepath.split(":", 1)

            loader = self._loaders.get(filepath)
            if loader is None:
                # To avoid disk access, create loader only for the first time
                loader = SoundHDF5File(filepath, "r", dtype="int16")
                self._loaders[filepath] = loader
            array, rate = loader[key]
            return array
        elif filetype == "sound":
            # e.g.
            #    {"input": [{"feat": "some/path.wav",
            #                "filetype": "sound"},
            # Assume PCM16
            if not self.keep_all_data_on_mem:
                array, _ = soundfile.read(filepath, dtype="int16")
                return array
            if filepath not in self._loaders:
                array, _ = soundfile.read(filepath, dtype="int16")
                self._loaders[filepath] = array
            return self._loaders[filepath]
        elif filetype == "npz":
            # e.g.
            #    {"input": [{"feat": "some/path.npz:F01_050C0101_PED_REAL",
            #                "filetype": "npz",
            filepath, key = filepath.split(":", 1)

            loader = self._loaders.get(filepath)
            if loader is None:
                # To avoid disk access, create loader only for the first time
                loader = np.load(filepath)
                self._loaders[filepath] = loader
            return loader[key]
        elif filetype == "npy":
            # e.g.
            #    {"input": [{"feat": "some/path.npy",
            #                "filetype": "npy"},
            if not self.keep_all_data_on_mem:
                return np.load(filepath)
            if filepath not in self._loaders:
                self._loaders[filepath] = np.load(filepath)
            return self._loaders[filepath]
        elif filetype in ["mat", "vec"]:
            # e.g.
            #    {"input": [{"feat": "some/path.ark:123",
            #                "filetype": "mat"}]},
            # In this case, "123" indicates the starting points of the matrix
            # load_mat can load both matrix and vector
            if self.block_load:
                ark_name = parse_arkpath(filepath) 

                # remove empty ark
                if ark_name in self._loaders:
                    if self._loaders[ark_name] == {}:
                        del self._loaders[ark_name]

                # load the ark when requested
                elif not ark_name in self._loaders:
                    ark_dict = load_ark_full(ark_name)
                    self._loaders[ark_name] = ark_dict
                
                # use deep copy as the memory will be released sooner
                try:
                    data_copy = copy.deepcopy(self._loaders[ark_name][uttid])
                    del self._loaders[ark_name][uttid]
                
                except:
                    # in batchfy process the last minibatch contains repeatitive
                    # uttrances. In this case the features have been deleted
                    # and then leads to an error. 
                    print(f"Warning: {filepath} is loaded from disk directly",flush=True)
                    data = kaldiio.load_mat(filepath)
                    data_copy = copy.deepcopy(data)
                return data_copy

            if not self.keep_all_data_on_mem:
                return kaldiio.load_mat(filepath)
            if filepath not in self._loaders:
                self._loaders[filepath] = kaldiio.load_mat(filepath)
            return self._loaders[filepath]
        elif filetype == "scp":
            # e.g.
            #    {"input": [{"feat": "some/path.scp:F01_050C0101_PED_REAL",
            #                "filetype": "scp",
            filepath, key = filepath.split(":", 1)
            loader = self._loaders.get(filepath)
            if loader is None:
                # To avoid disk access, create loader only for the first time
                loader = kaldiio.load_scp(filepath)
                self._loaders[filepath] = loader
            return loader[key]
        else:
            raise NotImplementedError("Not supported: loader_type={}".format(filetype))


class SoundHDF5File(object):
    """Collecting sound files to a HDF5 file

    Every entry is stored as the bytes of an encoded sound file (written with
    ``soundfile``) and is decoded again when it is read back.

    >>> f = SoundHDF5File('a.flac.h5', mode='a')
    >>> array = np.random.randint(0, 100, 100, dtype=np.int16)
    >>> f['id'] = (array, 16000)
    >>> array, rate = f['id']


    :param: str filepath:
    :param: str mode:
    :param: str format: The type used when saving wav. flac, nist, htk, etc.
    :param: str dtype:

    """

    def __init__(self, filepath, mode="r+", format=None, dtype="int16", **kwargs):
        self.filepath = filepath
        self.mode = mode
        self.dtype = dtype

        # Extra keyword arguments are forwarded to h5py.File unchanged.
        self.file = h5py.File(filepath, mode, **kwargs)
        if format is None:
            # filepath = a.flac.h5 -> format = flac
            second_ext = os.path.splitext(os.path.splitext(filepath)[0])[1]
            format = second_ext[1:]
            if format.upper() not in soundfile.available_formats():
                # If not found, flac is selected
                format = "flac"

        # This format affects only saving
        self.format = format

    def __repr__(self):
        return '<SoundHDF5 file "{}" (mode {}, format {}, type {})>'.format(
            self.filepath, self.mode, self.format, self.dtype
        )

    def create_dataset(self, name, shape=None, data=None, **kwds):
        """Encode ``data = (array, rate)`` with ``self.format`` and store it as ``name``."""
        f = io.BytesIO()
        array, rate = data
        soundfile.write(f, array, rate, format=self.format)
        # The encoded file is stored as one opaque blob.
        self.file.create_dataset(name, shape=shape, data=np.void(f.getvalue()), **kwds)

    def __setitem__(self, name, data):
        self.create_dataset(name, data=data)

    def __getitem__(self, key):
        """Decode the entry ``key`` and return ``(array, rate)``."""
        data = self.file[key][()]
        f = io.BytesIO(data.tobytes())
        array, rate = soundfile.read(f, dtype=self.dtype)
        return array, rate

    def keys(self):
        return self.file.keys()

    def values(self):
        for k in self.file:
            yield self[k]

    def items(self):
        for k in self.file:
            yield k, self[k]

    def __iter__(self):
        return iter(self.file)

    def __contains__(self, item):
        return item in self.file

    def __len__(self):
        # ``len(obj)`` calls ``__len__`` with no extra argument; the previous
        # ``(self, item)`` signature made ``len()`` raise TypeError.
        return len(self.file)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.file.close()

    def close(self):
        self.file.close()

# return the path filename
def parse_arkpath(path):
    """Return the part of ``path`` before the first ":" after stripping whitespace."""
    ark_name, _, _ = path.strip().partition(":")
    return ark_name

# read the whole ark without lazy loading
def load_ark_full(path):
    """Read every ``(key, value)`` pair yielded by ``kaldiio.load_ark(path)`` into a dict.

    If the same key is yielded more than once, the last value wins.
    """
    return {uttid: feats for uttid, feats in kaldiio.load_ark(path)}

# worker id suffix
def wid_suffix():
    """Return ``"_<id>"`` where ``<id>`` is the current DataLoader worker id.

    NOTE(review): per the PyTorch docs ``get_worker_info()`` returns None
    outside a worker process, in which case the ``.id`` access fails; confirm
    this is only called from inside workers.
    """
    worker = torch.utils.data.get_worker_info()
    return f"_{worker.id}"

# monitor the system memory usage
def memory_ratio():
    """Return used system memory divided by total system memory, as reported by psutil."""
    stats = psutil.virtual_memory()
    used, total = float(stats.used), float(stats.total)
    return used / total


================================================
FILE: utils/parse_decoding_process.py
================================================
import os
import numpy as np
import matplotlib.pyplot as plt


def plot_decoding_logs(graph_dir, char_list, recog_args, uttid, nbest_hyps):
    """Print each hypothesis of one utterance and plot its score logs.

    For the i-th hypothesis two figures are written into ``<graph_dir>/<uttid>/``:
    ``<uttid>-<i>-step.png`` (per-step scores) and ``<uttid>-<i>-tot.png``
    (accumulated scores).

    :param str graph_dir: root directory for the figures
    :param list char_list: maps token ids to token strings
    :param recog_args: passed through to ``process_logs``
    :param str uttid: utterance id, used for the sub-directory and file names
    :param list nbest_hyps: hypotheses; each must provide ``"yseq"`` and ``"logs"``
    """
    out_dir = os.path.join(graph_dir, uttid)
    os.makedirs(out_dir, exist_ok=True)

    for rank, hyp in enumerate(nbest_hyps):
        # The first element of yseq is skipped before mapping ids to tokens.
        tokens = [char_list[int(tok)] for tok in hyp["yseq"][1:]]
        text = "".join(tokens).replace("<space>", " ")
        print(f"{rank}-th hypothesis of {uttid}: {text}")

        score_logs = hyp["logs"]
        per_step = process_logs(score_logs, recog_args, accum=False)
        running = process_logs(score_logs, recog_args, accum=True)

        for tag, curves in (("step", per_step), ("tot", running)):
            target = os.path.join(out_dir, f"{uttid}-{rank}-{tag}.png")
            plot_dict(target, curves, text)
           
def plot_dict(filename, d, title):
    """Plot every sequence in ``d`` as a labelled line and save the figure.

    :param str filename: where the figure is saved
    :param dict d: maps a legend label to a sequence of values
    :param str title: figure title
    """
    # Clear whatever the current pyplot figure/axes still hold.
    plt.clf()
    plt.cla()

    for label, values in d.items():
        steps = np.arange(len(values))
        plt.plot(steps, values, label=label)

    plt.legend()
    plt.title(title)
    plt.savefig(filename)

def process_logs(logs, args, accum=False):
    """Scale per-step score logs by their decoding weights and add their sum.

    :param dict logs: maps a score name to a sequence of per-step scores
    :param args: object providing ``ctc_weight``, ``mmi_weight`` and
        ``lm_weight``; an attribute is only read when the matching key is
        present in ``logs``
    :param bool accum: if True, use the running sum of each sequence instead
        of the per-step values
    :return: dict with one numpy array per input key ("att" scaled by
        ``1 - ctc_weight``, "ctc" by ``ctc_weight``, "mmi" by ``mmi_weight``,
        "lm" by ``lm_weight``, others unscaled) plus ``"sum"``, the
        element-wise total of all of them
    """
    ans = {}

    for k, v in logs.items():
        v = np.array(v)
        if accum:
            # Running sum in one linear pass instead of re-summing each prefix.
            v = np.cumsum(v, axis=0)

        if k == "att":
            v = v * (1 - args.ctc_weight)
        elif k == "ctc":
            v = v * args.ctc_weight
        elif k == "mmi":
            v = v * args.mmi_weight
        elif k == "lm":
            v = v * args.lm_weight

        ans[k] = v

    # Build the total from whatever scores exist; do not require an "att" entry.
    tot = None
    for v in ans.values():
        tot = v.astype(float) if tot is None else tot + v
    ans["sum"] = tot if tot is not None else np.zeros(0)

    return ans
        

================================================
FILE: utils/parse_npy.py
================================================
import numpy as np
import sys

def main():
    """Print the most likely symbol for every row of a saved log-probability array.

    Command line: ``<npy_file> <symbol_table>``. Every non-blank line of the
    symbol table must be ``<symbol> <integer id>``. The ids are matched against
    the argmax over the last axis of the array loaded from ``npy_file``.
    """
    npy_file = sys.argv[1]
    symbol_table = sys.argv[2]

    print(f"Parsing file {npy_file}")

    log_probs = np.load(npy_file)

    syms = {}
    # Use a context manager so the table file is closed deterministically.
    with open(symbol_table) as table:
        for line in table:
            if not line.strip():
                # tolerate blank (e.g. trailing) lines
                continue
            ph, pid = line.split()
            syms[int(pid)] = ph

    # exp() is monotonic, so the argmax of the log-probabilities is the argmax
    # of the probabilities; skipping exp() avoids underflow collapsing very
    # negative entries to 0 and creating artificial ties.
    best_ids = np.argmax(log_probs, axis=-1)
    print(" ".join(syms[x] for x in best_ids))

# Only run the parser when this file is executed as a script, so that importing
# the module does not read sys.argv or touch any files.
if __name__ == "__main__":
    main()


================================================
FILE: utils/print.py
================================================
import time
import torch.distributed as dist


def step_print(ctx, flush=False):
    """Print ``ctx`` prefixed with the local time and this process's distributed rank.

    NOTE(review): ``dist.get_rank()`` is called unconditionally; presumably the
    default process group must already be initialized. Confirm against callers.

    :param ctx: message to print
    :param bool flush: forwarded to ``print``
    """
    # asctime() without an argument formats the current local time.
    stamp = time.asctime()
    who = dist.get_rank()
    print(f"{stamp} | rank: {who} | {ctx}", flush=flush)


================================================
FILE: utils/rtf_calculator.py
================================================
import time

class RTF_calculator():
    """Report the real-time factor (RTF) of a timed region.

    RTF is the wall-clock time between ``tik()`` and ``tok()`` divided by the
    total audio duration, where the duration is the sum of
    ``v["input"][0]["shape"][0]`` over all entries of ``js`` divided by ``fps``.

    :param dict js: utterance dictionary; each value must provide
        ``["input"][0]["shape"][0]`` (number of frames)
    :param int fps: frames per second used to convert frames to seconds
    """

    def __init__(self, js, fps=100):
        self.js = js
        self.fps = fps
        self.time_stamp = None

    def tik(self):
        """Record the start time."""
        self.time_stamp = time.time()

    def tok(self):
        """Print the RTF measured since the last ``tik()``.

        ``tik()`` must have been called first; otherwise ``time_stamp`` is
        still None and the subtraction raises TypeError.
        """
        time_elapsed = time.time() - self.time_stamp
        num_frames = sum(v["input"][0]["shape"][0] for v in self.js.values())
        time_utts = num_frames / self.fps

        # With no audio (e.g. an empty js) the ratio is undefined; report NaN
        # instead of crashing a finished decoding run on a diagnostic print.
        rtf = time_elapsed / time_utts if time_utts > 0 else float("nan")
        # "{:2f}" (width 2, six decimals) was a typo for "{:.2f}".
        print(
            "RTF calculator: RTF is {:.2f} | time_utts: {:.2f} | time_elapsed: {:.2f}".format(
                rtf, time_utts, time_elapsed
            )
        )


================================================
FILE: utils/sampler.py
================================================
import torch
import torch.utils.data as data
import random

# We cannot make the data loading fully random because reading from Ceph is slow,
# so we use this sampler to ensure that data reading is constrained
# to a limited number of arks at a time.
class BufferSampler(object):
    """Sampler of mini-batch indices that keeps reads inside a few arks at a time.

    Mini-batch ``i`` is treated as belonging to ark ``i // batch_per_ark``.
    Arks are shuffled, split into groups of at most ``buf_size`` arks, and all
    mini-batches of one group are emitted before the next group starts.

    Note: the sampler seeds and consumes Python's global ``random`` generator.
    """

    def __init__(self, length, utts_per_ark, batch_size, buf_size, seed=0, prefetch_ratio=0.3):
        """
        length: number of minibatches
        utts_per_ark: the number of utterances in each ark except the last one
        batch_size: the batch size used in training
        buf_size: the number of arks that you want to put in the buffer
        seed: base random seed; it is replaced on every reset() / __iter__()
        prefetch_ratio: when the remaining number of minibatches is below this ratio,
                        we start to fetch the arks in the next group
                        0.5 means we begin to read the next group of arks when half of
                        this group is consumed
        """

        self.batch_per_ark = int(utts_per_ark / batch_size)
        self.buf_size = buf_size
        self.prefetch_ratio = prefetch_ratio
        self.num_batches = length
        self.seed = seed

        # seed2 is a bias on seed. It never works independently.
        # It is different on different GPU ranks.
        try:
            import torch.distributed as dist
            self.seed2 = dist.get_rank()
        except Exception:
            # Only ordinary errors (torch missing, process group not
            # initialized) mean "not DDP"; a bare except would also swallow
            # KeyboardInterrupt / SystemExit.
            print("Sampler: you are not using DDP training paradigm.")
            print("Sampler: So the rank bias of random seed is set to 0", flush=True)
            self.seed2 = 0

    def __iter__(self):
        # Every new iterator re-seeds (seed + 1) and rebuilds the index order.
        self.reset()
        print("A new iterator in sampler is built")
        # Sanity check: indices must be a permutation of 0, ..., length - 1.
        # n * (n - 1) is always even, so integer division is exact.
        assert sum(self.indices) == self.num_batches * (self.num_batches - 1) // 2
        return iter(self.indices)

    def __len__(self):
        return self.num_batches

    def _get_indices(self):
        """Build the mini-batch order. This is the core function of this sampler.

        The output indices have the features below:
        (1) All arks are divided into several groups. Each group consists
            of at most `buf_size` arks.
        (2) The indices are from the same group until all data in this group
            is consumed. This is to avoid buffering too many arks.
        (3) For DDP training, the grouping results are identical. This is to
            ensure that the length distribution in this group is similar
            across the different ranks. This is controlled by self.seed.
        (4) Within the group, the order of indices cannot be identical
            across the ranks, or the global mini-batch will be identical
            in each epoch. In this case, we ensure that for any valid
            t, the t-th minibatches in this group across the different
            ranks are from the same ark-id but are not necessarily the same.
            This provides more variation in training data. This is controlled
            by `self.seed2`.
        """
        num_arks = int(self.num_batches // self.batch_per_ark) + \
                     int(self.num_batches % self.batch_per_ark != 0)

        # group arks
        ark_ids = list(range(num_arks))
        random.shuffle(ark_ids)
        start = 0
        groups = []
        while start < num_arks:
            end = min(start + self.buf_size, num_arks)
            group = ark_ids[start: end]
            groups.append(group)
            start += self.buf_size

        def process_group(group, seed_bias):
            eg_indices = [] # global idx of the mini-batches
            ark_indices = [] # position (within the group) of each mini-batch's ark
            for i, arkid in enumerate(group):
                start = arkid * self.batch_per_ark
                end = min((arkid+1) * self.batch_per_ark, self.num_batches)

                eg_indice = list(range(start, end))
                eg_indices.append(eg_indice)

                ark_indice = [i] * (end - start)
                ark_indices.append(ark_indice)

            ark_indices = self._splice_list(ark_indices)

            # the ark_indices is with self.seed
            # as we need it identical on different GPU ranks
            random.shuffle(ark_indices)

            # eg_indices is with self.seed + self.seed2
            # we need it different on different GPU ranks
            random.seed(self.seed + self.seed2)
            for e in eg_indices:
                random.shuffle(e)

            # we need to recover the seed so the next time
            # we shuffle ark_indices will still have
            # the same results across the GPUs.
            # we do not use `self.seed` only as it
            # always returns to the same start point
            random.seed(self.seed + seed_bias + 888)

            # combine finally: walk the shuffled ark sequence and take one
            # mini-batch from the corresponding ark each time
            group_indice = []
            for i in ark_indices:
                batch_idx = eg_indices[i].pop()
                group_indice.append(batch_idx)
            return group_indice

        group_indices = [process_group(g, b) for b, g in enumerate(groups)]
        return self._splice_list(group_indices)

    # Using these indices leads to identical global batches in
    # each epoch
    def _get_indices_deprecated(self):
        num_arks = int(self.num_batches // self.batch_per_ark) + \
                     int(self.num_batches % self.batch_per_ark != 0)

        ark_ids = list(range(num_arks))
        random.shuffle(ark_ids)
        ark_indices = [(idx * self.batch_per_ark,
                        min((idx+1) * self.batch_per_ark, self.num_batches))
                        for idx in ark_ids]
        ark_indices = [list(range(*idx)) for idx in ark_indices]

        # grouping ark indices and shuffle within the group
        start = 0
        group_indices = []
        while start < num_arks:
            end = min(start + self.buf_size, num_arks)

            group_indice = ark_indices[start: end]
            group_indice = self._splice_list(group_indice)
            random.shuffle(group_indice)
            group_indices.append(group_indice)
            start += self.buf_size

        group_indices = self._splice_list(group_indices)

        return group_indices


    def reset(self, seed=None):
        # change the seed and reset the indices
        # It is important to use the seed in DDP training
        # as the result of sampler is identical on each GPU.
        # Since the index of minibatch is proportional to the
        # length of utterance, this will help us to balance
        # the load of each GPU
        seed = seed if seed is not None else self.seed + 1
        self.seed = seed
        random.seed(seed)
        self.indices = self._get_indices()

    def _splice_list(self, lsts):
        # Concatenate a list of lists into one flat list.
        out = []
        for l in lsts:
            out += l
        return out

    # this provides the prefetch factor of dataloader
    # no matter how many mini-batches to preload, all
    # arks in the next group will be loaded. So a small
    # ratio is enough and will save memory
    # just make sure 0.3 group of data will not run out
    # before the next group is loaded
    def get_prefetch_factor(self):
        return int(self.buf_size * self.batch_per_ark * self.prefetch_ratio)

class testdataset:
    """Minimal stand-in dataset that only reports a fixed length."""

    def __init__(self, length):
        # the value reported by len()
        self.l = length

    def __len__(self):
        """Return the length given at construction."""
        return self.l

if __name__ == '__main__':
    # Smoke test: 26 mini-batches (52 utts), 4 utts (= 2 mini-batches) per ark,
    # at most 3 arks in the buffer, batch_size = 2.
    num_minibatches = 26
    sampler = BufferSampler(num_minibatches, utts_per_ark=4, batch_size=2, buf_size=3)
    # Print the sampled order so the ark grouping can be inspected; previously
    # the string was built but never shown.
    print("\t".join(str(i) for i in sampler))


================================================
FILE: utils/spec_augment.py
================================================
# -*- coding: utf-8 -*-

"""
This implementation is modified from https://github.com/zcaceres/spec_augment

MIT License

Copyright (c) 2019 Zach Caceres

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""

import random

import torch


def specaug(
    spec, W=5, F=30, T=40, num_freq_masks=2, num_time_masks=2, replace_with_zero=False
):
    """SpecAugment

    Applies time warping, then frequency masking, then time masking.

    Reference:
        SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
        (https://arxiv.org/pdf/1904.08779.pdf)

    This implementation modified from https://github.com/zcaceres/spec_augment

    :param torch.Tensor spec: input tensor with the shape (T, dim)
    :param int W: time warp parameter
    :param int F: maximum width of each freq mask
    :param int T: maximum width of each time mask
    :param int num_freq_masks: number of frequency masks
    :param int num_time_masks: number of time masks
    :param bool replace_with_zero: if True, masked parts will be filled with 0,
        if False, filled with mean
    """
    warped = time_warp(spec, W=W)
    freq_masked = freq_mask(
        warped,
        F=F,
        num_masks=num_freq_masks,
        replace_with_zero=replace_with_zero,
    )
    augmented = time_mask(
        freq_masked,
        T=T,
        num_masks=num_time_masks,
        replace_with_zero=replace_with_zero,
    )
    return augmented


def time_warp(spec, W=5):
    """Time warping

    A time index is drawn from ``[W, len - W)`` on the center frequency row
    and displaced by a distance drawn from ``[-W, W)``; the spectrogram is
    warped accordingly with ``sparse_image_warp``.

    Inputs that are too short to warp (``len - W <= W``), or a non-positive
    ``W``, are returned as an unmodified copy.

    :param torch.Tensor spec: input tensor with shape (T, dim)
    :param int W: time warp parameter
    :return: warped tensor with shape (T, dim)
    """
    spec_len = spec.shape[0]
    if W <= 0 or spec_len - W <= W:
        # no valid warp point exists; randrange() below would raise
        return spec.clone()

    spec = spec.unsqueeze(0)
    num_rows = spec.shape[2]
    device = spec.device

    # frequency index of the horizontal line through the center
    y = num_rows // 2

    # The control point is a *time index*. The previous code indexed the
    # spectrogram with this random position and used the feature value found
    # there as the coordinate, which is not a location at all.
    point_to_warp = random.randrange(W, spec_len - W)

    # Uniform distribution from (0,W) with chance to be up to W negative
    dist_to_warp = random.randrange(-W, W)

    # Coordinates are (time, freq), matching the (height, width) image layout.
    src_pts = torch.tensor(
        [[[float(point_to_warp), float(y)]]], device=device
    )
    dest_pts = torch.tensor(
        [[[float(point_to_warp + dist_to_warp), float(y)]]], device=device
    )
    warped_spectro, _ = sparse_image_warp(spec, src_pts, dest_pts)
    return warped_spectro.squeeze(3).squeeze(0)


def freq_mask(spec, F=30, num_masks=1, replace_with_zero=False):
    """Frequency masking

    The input is not modified; a masked copy is returned.

    :param torch.Tensor spec: input tensor with shape (T, dim)
    :param int F: maximum width of each mask
    :param int num_masks: number of masks
    :param bool replace_with_zero: if True, masked parts will be filled with 0,
        if False, filled with mean
    :return: masked tensor with shape (T, dim)
    """
    cloned = spec.unsqueeze(0).clone()
    num_mel_channels = cloned.shape[2]

    # A mask can never be wider than the feature dimension. Without this cap
    # randrange() raises whenever the drawn width reaches the channel count.
    max_width = min(F, num_mel_channels)
    if max_width <= 0:
        return cloned.squeeze(0)

    for _ in range(num_masks):
        f = random.randrange(0, max_width)
        f_zero = random.randrange(0, num_mel_channels - f)

        # A zero-width draw only skips this mask (randrange below would be
        # empty); the remaining masks are still applied.
        if f == 0:
            continue

        mask_end = random.randrange(f_zero, f_zero + f)
        if replace_with_zero:
            cloned[0][:, f_zero:mask_end] = 0
        else:
            cloned[0][:, f_zero:mask_end] = cloned.mean()
    return cloned.squeeze(0)


def time_mask(spec, T=40, num_masks=1, replace_with_zero=False):
    """Time masking

    The input is not modified; a masked copy is returned.

    :param torch.Tensor spec: input tensor with shape (T, dim)
    :param int T: maximum width of each mask
    :param int num_masks: number of masks
    :param bool replace_with_zero: if True, masked parts will be filled with 0,
        if False, filled with mean
    :return: masked tensor with shape (T, dim)
    """
    cloned = spec.unsqueeze(0).clone()
    len_spectro = cloned.shape[1]

    # A mask can never be longer than the utterance. Without this cap
    # randrange() raises for utterances shorter than the drawn width.
    max_width = min(T, len_spectro)
    if max_width <= 0:
        return cloned.squeeze(0)

    for _ in range(num_masks):
        t = random.randrange(0, max_width)
        t_zero = random.randrange(0, len_spectro - t)

        # A zero-width draw only skips this mask (randrange below would be
        # empty); the remaining masks are still applied.
        if t == 0:
            continue

        mask_end = random.randrange(t_zero, t_zero + t)
        if replace_with_zero:
            cloned[0][t_zero:mask_end, :] = 0
        else:
            cloned[0][t_zero:mask_end, :] = cloned.mean()
    return cloned.squeeze(0)


def sparse_image_warp(
    img_tensor,
    source_control_point_locations,
    dest_control_point_locations,
    interpolation_order=2,
    regularization_weight=0.0,
    num_boundaries_points=0,
):
    """Warp an image so that source control points move to destination points.

    The flow known at the control points is interpolated with a spline to a
    dense per-pixel flow, which is then applied with ``dense_image_warp``.
    ``num_boundaries_points`` is accepted but not used.

    :param torch.Tensor img_tensor: tensor with shape (batch, height, width)
    :param torch.Tensor source_control_point_locations: source control points
    :param torch.Tensor dest_control_point_locations: destination control points
    :param int interpolation_order: order passed to ``interpolate_spline``
    :param float regularization_weight: passed to ``interpolate_spline``
    :return: tuple ``(warped_image, dense_flows)``
    """
    flows_at_controls = dest_control_point_locations - source_control_point_locations

    batch_size, image_height, image_width = img_tensor.shape
    grid_points = get_flat_grid_locations(
        image_height, image_width, img_tensor.device
    )

    # Spread the control-point flow over every pixel location.
    flat_flows = interpolate_spline(
        dest_control_point_locations,
        flows_at_controls,
        grid_points,
        interpolation_order,
        regularization_weight,
    )
    dense_flows = create_dense_flows(
        flat_flows, batch_size, image_height, image_width
    )

    return dense_image_warp(img_tensor, dense_flows), dense_flows


def get_grid_locations(image_height, image_width, device):
    """Return a (height, width, 2) tensor holding the (row, col) coordinate of every pixel."""
    rows = torch.linspace(0, image_height - 1, image_height, device=device)
    cols = torch.linspace(0, image_width - 1, image_width, device=device)
    row_grid, col_grid = torch.meshgrid(rows, cols)
    coords = torch.stack((row_grid, col_grid), -1)
    return coords


def flatten_grid_locations(grid_locations, image_height, image_width):
    """Reshape a grid of coordinates to ``(image_height * image_width, 2)``."""
    num_points = image_height * image_width
    return grid_locations.reshape(num_points, 2)


def get_flat_grid_locations(image_height, image_width, device):
    """Return the (row, col) coordinate of every pixel as a (height * width, 2) tensor."""
    rows = torch.linspace(0, image_height - 1, image_height, device=device)
    cols = torch.linspace(0, image_width - 1, image_width, device=device)
    row_grid, col_grid = torch.meshgrid(rows, cols)
    coords = torch.stack((row_grid, col_grid), -1)
    return coords.reshape([image_height * image_width, 2])


def create_dense_flows(flattened_flows, batch_size, image_height, image_width):
    """Reshape flattened flows to ``(batch_size, image_height, image_width, 2)``."""
    target_shape = [batch_size, image_height, image_width, 2]
    return flattened_flows.reshape(target_shape)


def interpolate_spline(
    train_points,
    train_values,
    query_points,
    order,
    regularization_weight=0.0,
):
    """Fit a spline to ``(train_points, train_values)`` and evaluate it at ``query_points``.

    :param train_points: locations of the observed data
    :param train_values: values observed at ``train_points``
    :param query_points: locations at which to evaluate the fitted spline
    :param int order: interpolation order
    :param float regularization_weight: forwarded to ``solve_interpolation``
    :return: interpolated values at ``query_points``
    """
    # Fit: solve for the spline coefficients on the observed data.
    rbf_weights, linear_weights = solve_interpolation(
        train_points, train_values, order, regularization_weight
    )
    # Evaluate: apply the fitted spline at the query locations.
    return apply_interpolation(
        query_points, train_points, rbf_weights, linear_weights, order
    )


def solve_interpolation(train_points, train_values, order, regularization_weight):
    """Solve for the polyharmonic-spline coefficients of the training data.

    ``regularization_weight`` is accepted but not used by this implementation.

    NOTE: the bias column is built from a tensor of shape [1, 1, 1] and the
    kernel matrix gets its batch axis from ``unsqueeze(0)``, so as written this
    only works for ``b == 1`` and ``n == 1`` (a single control point).

    :param torch.Tensor train_points: `[b, n, d]` interpolation centers
    :param torch.Tensor train_values: `[b, n, k]` function values
    :param int order: order of the interpolation
    :param float regularization_weight: unused
    :return: tuple ``(w, v)``: the first ``n`` rows and the remaining rows of
        the solution of the linear system
    """
    device = train_points.device
    b, n, d = train_points.shape
    k = train_values.shape[-1]

    c = train_points
    f = train_values.float()

    matrix_a = phi(cross_squared_distance_matrix(c, c), order).unsqueeze(0)  # [b, n, n]

    # Append ones to the feature values for the bias term in the linear model.
    ones = torch.ones(1, dtype=train_points.dtype, device=device).view([-1, 1, 1])
    matrix_b = torch.cat((c, ones), 2).float()  # [b, n, d + 1]

    # [b, n + d + 1, n]
    left_block = torch.cat((matrix_a, torch.transpose(matrix_b, 2, 1)), 1)

    num_b_cols = matrix_b.shape[2]  # d + 1

    # In Tensorflow, zeros are used here. Pytorch solve fails with zeros
    # for some reason we don't understand.
    # So instead we use very tiny randn values (variance of one, zero mean)
    # on one side of our multiplication.
    lhs_zeros = torch.randn((b, num_b_cols, num_b_cols), device=device) / 1e10
    right_block = torch.cat((matrix_b, lhs_zeros), 1)  # [b, n + d + 1, d + 1]
    lhs = torch.cat((left_block, right_block), 2)  # [b, n + d + 1, n + d + 1]

    rhs_zeros = torch.zeros(
        (b, d + 1, k), dtype=train_points.dtype, device=device
    ).float()
    rhs = torch.cat((f, rhs_zeros), 1)  # [b, n + d + 1, k]

    # Then, solve the linear system lhs @ X = rhs and unpack the results.
    # torch.gesv was removed from PyTorch; prefer the current API and fall
    # back to the older entry points when running on an old release.
    if hasattr(torch, "linalg") and hasattr(torch.linalg, "solve"):
        X = torch.linalg.solve(lhs, rhs)
    elif hasattr(torch, "solve"):
        X, _ = torch.solve(rhs, lhs)
    else:
        X, _ = torch.gesv(rhs, lhs)
    w = X[:, :n, :]
    v = X[:, n:, :]

    return w, v


def cross_squared_distance_matrix(x, y):
    """Pairwise squared distance between two matrices' rows.

    Computes the pairwise distances between rows of x and rows of y.
    The leading batch axis is squeezed away, so only batch_size == 1 is
    supported and the result has no batch axis.

    Args:
    x: [1, n, d] float `Tensor`
    y: [1, m, d] float `Tensor`
    Returns:
    squared_dists: [n, m] float `Tensor`, where
    squared_dists[i,j] = ||x[0,i,:] - y[0,j,:]||^2
    """
    x_rows = x.squeeze(0)  # [n, d]
    y_rows = y.squeeze(0)  # [m, d]

    # The norms must be taken per row. Summing over the whole tensor (as was
    # done before) is only correct when both inputs hold a single row.
    x_norm_squared = torch.sum(torch.mul(x_rows, x_rows), dim=-1, keepdim=True)  # [n, 1]
    y_norm_squared = torch.sum(torch.mul(y_rows, y_rows), dim=-1).unsqueeze(0)  # [1, m]

    x_y_transpose = torch.matmul(x_rows, y_rows.transpose(0, 1))  # [n, m]

    # squared_dists[i,j] = ||x_i - y_j||^2 = x_i'x_i - 2x_i'y_j + y_j'y_j
    squared_dists = x_norm_squared - 2 * x_y_transpose + y_norm_squared

    # Rounding can leave tiny negative values; a squared distance is never < 0.
    return torch.clamp(squared_dists.float(), min=0.0)


def phi(r, order):
    """Coordinate-wise nonlinearity used to define the order of the interpolation.

    See https://en.wikipedia.org/wiki/Polyharmonic_spline for the definition.
    Args:
    r: input tensor
    order: interpolation order
    Returns:
    phi_k evaluated coordinate-wise on r, for k = order
    """
    # Clamping with EPSILON prevents log(0); sqrt(0) is well-defined, but its
    # gradient is not.
    EPSILON = torch.tensor(1e-10, device=r.device)

    # Orders 2 and 4 clamp only the argument of the logarithm.
    if order == 2:
        return 0.5 * r * torch.log(torch.max(r, EPSILON))
    if order == 4:
        return 0.5 * torch.square(r) * torch.log(torch.max(r, EPSILON))

    # Every other order works on the clamped input throughout.
    clamped = torch.max(r, EPSILON)
    if order == 1:
        return torch.sqrt(clamped)
    if order % 2 == 0:
        return 0.5 * torch.pow(clamped, 0.5 * order) * torch.log(clamped)
    return torch.pow(clamped, 0.5 * order)


def apply_interpolation(query_points, train_points, w, v, order):
    """Apply polyharmonic interpolation model to data.

    Notes:
        Given coefficients w and v for the interpolation model, we evaluate
        interpolated function values at query_points. A leading batch axis is
        added to ``query_points`` before use.

    Args:
        query_points: `[m, d]` x values to evaluate the interpolation at
        train_points: `[b, n, d]` x values that act as the interpolation centers
            ( the c variables in the wikipedia article)
        w: weights on each interpolation center
        v: weights of the linear term (input dimensions plus bias)
        order: order of the interpolation

    Returns:
        Polyharmonic interpolation evaluated at points defined in query_points.
    """
    batched_queries = query_points.unsqueeze(0)

    # Radial-basis contribution: phi of the query/center squared distances,
    # weighted by w.
    sq_dists = cross_squared_distance_matrix(
        batched_queries.float(), train_points.float()
    )
    rbf_term = torch.matmul(phi(sq_dists, order), w)

    # Linear contribution: the queries padded with a column of ones (bias
    # term), weighted by v.
    bias_column = torch.ones_like(batched_queries[..., :1])
    padded_queries = torch.cat((batched_queries, bias_column), 2).float()
    linear_term = torch.matmul(padded_queries, v)

    return rbf_term + linear_term


def dense_image_warp(image, flow):
    """Image warping using per-pixel flow vectors.

    Apply a non-linear warp to the image, where the warp is specified by a dense
    flow field of offset vectors that define the correspondences of pixel values
    in the output image back to locations in the source image. Specifically, the
    pixel value at output[b, j, i, c] is
    images[b, j - flow[b, j, i, 0], i - flow[b, j, i, 1], c].
    The locations specified by this formula do not necessarily map to an int
    index. Therefore, the pixel value is obtained by bilinear
    interpolation of the 4 nearest pixels around
    (b, j - flow[b, j, i, 0], i - flow[b, j, i, 1]).
    Args:
    image: 3-D float `Tensor` with shape `[batch, height, width]`; a single
        channel axis is appended internally.
    flow: A 4-D float `Tensor` with shape `[batch, height, width, 2]`.
    Returns:
    A 4-D float `Tensor` with shape `[batch, height, width, 1]`.
    """
    # add a single channel dimension to image tensor
    image = image.unsqueeze(3)
    batch_size, height, width, channels = image.shape
    device = image.device

    # The flow is defined on the image grid. Turn the flow into a list of query
    # points in the grid space.
    col_index = torch.arange(width, device=device)
    row_index = torch.arange(height, device=device)
    grid_x, grid_y = torch.meshgrid(col_index, row_index)

    stacked_grid = torch.stack((grid_y, grid_x), dim=2).float()
    # Rearrange to [1, height, width, 2] so it lines up with ``flow``.
    batched_grid = stacked_grid.unsqueeze(-1).permute(3, 1, 0, 2)

    sample_points = torch.reshape(
        batched_grid - flow, [batch_size, height * width, 2]
    )

    # Compute values at the query points, then reshape the result back to the
    # image grid.
    sampled = interpolate_bilinear(image, sample_points)
    return torch.reshape(sampled, [batch_size, height, width, channels])


def interpolate_bilinear(
    grid, query_points, name="interpolate_bilinear", indexing="ij"
):
    """Similar to Matlab's interp2 function.

    Notes:
        Finds values for query points on a grid using bilinear interpolation.
        Query points outside the grid are clamped to the border cells.

    Args:
        grid: a 4-D float `Tensor` of shape `[batch, height, width, channels]`.
        query_points: a 3-D float `Tensor` of N points with shape `[batch, N, 2]`.
        name: a name for the operation (optional, unused).
        indexing: whether the query points are specified as row and column (ij),
            or Cartesian coordinates (xy).

    Returns:
        values: a 3-D `Tensor` with shape `[batch, N, channels]`

    Raises:
        ValueError: if the indexing mode is invalid, or if the shape of the inputs
        invalid.
    """
    if indexing != "ij" and indexing != "xy":
        raise ValueError("Indexing mode must be 'ij' or 'xy'")

    shape = grid.shape
    if len(shape) != 4:
        msg = "Grid must be 4 dimensional. Received size: "
        raise ValueError(msg + str(grid.shape))

    batch_size, height, width, channels = grid.shape
    if height < 2 or width < 2:
        # Each query needs a floor and a floor + 1 neighbour along both axes.
        raise ValueError(
            "Grid must be at least 2x2 in (height, width). Received size: "
            + str(grid.shape)
        )

    shape = [batch_size, height, width, channels]
    query_type = query_points.dtype
    grid_type = grid.dtype
    grid_device = grid.device

    num_queries = query_points.shape[1]

    alphas = []
    floors = []
    ceils = []
    index_order = [0, 1] if indexing == "ij" else [1, 0]
    unstacked_query_points = query_points.unbind(2)

    for dim in index_order:
        queries = unstacked_query_points[dim]

        size_in_indexing_dimension = shape[dim + 1]

        # max_floor is size_in_indexing_dimension - 2 so that max_floor + 1
        # is still a valid index into the grid.
        max_floor = torch.tensor(
            size_in_indexing_dimension - 2, dtype=query_type, device=grid_device
        )
        min_floor = torch.tensor(0.0, dtype=query_type, device=grid_device)
        maxx = torch.max(min_floor, torch.floor(queries))
        floor = torch.min(maxx, max_floor)
        int_floor = floor.long()
        floors.append(int_floor)
        ceil = int_floor + 1
        ceils.append(ceil)

        # alpha has the same type as the grid, as we will directly use alpha
        # when taking linear combinations of pixel values from the image.
        # detach().to() converts an existing tensor; wrapping a tensor in
        # torch.tensor() emits a copy-construct warning on every call.
        alpha = (queries - floor).detach().to(dtype=grid_type, device=grid_device)
        min_alpha = torch.tensor(0.0, dtype=grid_type, device=grid_device)
        max_alpha = torch.tensor(1.0, dtype=grid_type, device=grid_device)
        alpha = torch.min(torch.max(min_alpha, alpha), max_alpha)

        # Expand alpha to [b, n, 1] so we can use broadcasting
        # (since the alpha values don't depend on the channel).
        alpha = torch.unsqueeze(alpha, 2)
        alphas.append(alpha)

    flattened_grid = torch.reshape(grid, [batch_size * height * width, channels])
    batch_offsets = torch.reshape(
        torch.arange(batch_size, device=grid_device) * height * width, [batch_size, 1]
    )

    # We reshape the image data such that the batch, y, and x coordinates are
    # pulled into the first dimension, then pick whole rows (all channels) by
    # linear index. Row indexing yields [batch, N, channels] for any batch size
    # and channel count; the former transposed torch.gather only produced that
    # shape for batch == channels == 1.
    def gather(y_coords, x_coords):
        linear_coordinates = batch_offsets + y_coords * width + x_coords
        return flattened_grid[linear_coordinates]

    # grab the pixel values in the 4 corners around each query point
    top_left = gather(floors[0], floors[1])
    top_right = gather(floors[0], ceils[1])
    bottom_left = gather(ceils[0], floors[1])
    bottom_right = gather(ceils[0], ceils[1])

    interp_top = alphas[1] * (top_right - top_left) + top_left
    interp_bottom = alphas[1] * (bottom_right - bottom_left) + bottom_left
    interp = alphas[0] * (interp_bottom - interp_top) + interp_top

    return interp


================================================
FILE: utils/training/__init__.py
================================================
"""Initialize sub package."""


================================================
FILE: utils/training/batchfy.py
================================================
import itertools
import logging
import numpy as np
import random

def batchfy_by_seq(
    sorted_data,
    batch_size,
    max_length_in,
    max_length_out,
    min_batch_size=1,
    shortest_first=False,
    ikey="input",
    iaxis=0,
    okey="output",
    oaxis=0,
):
    """Split sorted utterances into batches holding an adaptive number of sequences.

    The first utterance of every batch decides how many sequences the batch
    gets: the longer it is compared with ``max_length_in`` /
    ``max_length_out``, the fewer sequences are taken.

    :param sorted_data: sliceable sequence of ``(utt_id, info)`` pairs, where
        ``info`` follows the data.json layout
    :param int batch_size: nominal number of sequences per batch
    :param int max_length_in: input length used to scale the batch size down
    :param int max_length_out: output length used to scale the batch size down
    :param int min_batch_size: lower bound on the batch size (for multi-gpu)
    :param bool shortest_first: reverse the order inside every batch
    :param str ikey: key to access input
        (for ASR ikey="input", for TTS, MT ikey="output".)
    :param int iaxis: index of the input entry whose length is read
        (for ASR, TTS iaxis=0, for MT iaxis=1.)
    :param str okey: key to access output
        (for ASR, MT okey="output". for TTS okey="input".)
    :param int oaxis: index of the output entry whose length is read;
        a negative value takes the longest of all output entries
    :return: List[List[Tuple[str, dict]]] list of batches
    :raises ValueError: if ``batch_size`` is not positive or there are fewer
        utterances than ``min_batch_size``
    """
    if batch_size <= 0:
        raise ValueError(f"Invalid batch_size={batch_size}")

    n_utts = len(sorted_data)
    if n_utts < min_batch_size:
        raise ValueError(
            f"#utts({n_utts}) is less than min_batch_size({min_batch_size})."
        )

    batches = []
    cursor = 0
    while True:
        _utt_id, info = sorted_data[cursor]
        in_len = int(info[ikey][iaxis]["shape"][0])
        if oaxis >= 0:
            out_len = int(info[okey][oaxis]["shape"][0])
        else:
            out_len = max(int(entry["shape"][0]) for entry in info[okey])

        # e.g. in_len = 1000 with max_length_in = 800 halves the batch size;
        # the outer max() keeps the size from dropping below min_batch_size.
        shrink = max(int(in_len / max_length_in), int(out_len / max_length_out))
        n_seqs = max(min_batch_size, int(batch_size / (1 + shrink)))
        stop = min(n_utts, cursor + n_seqs)

        chunk = sorted_data[cursor:stop]
        if shortest_first:
            chunk.reverse()

        # A short trailing batch is filled up by repeating one of its own
        # utterances, so every sample of the batch comes from the same ark.
        shortfall = min_batch_size - len(chunk)
        if shortfall > 0:
            if shortest_first:
                chunk = [chunk[0]] * shortfall + chunk
            else:
                chunk = chunk + [chunk[-1]] * shortfall
        batches.append(chunk)

        if stop == n_utts:
            break
        cursor = stop

    # batches: List[List[Tuple[str, dict]]]
    return batches


def batchfy_by_bin(
    sorted_data,
    batch_bins,
    num_batches=0,
    min_batch_size=1,
    shortest_first=False,
    ikey="input",
    okey="output",
):
    """Make variably sized batches that pack as many bins as fit in `batch_bins`.

    The size of a candidate batch is counted as
    ``(longest output so far + current input) * number of utterances``, with
    every length multiplied by the feature dimension of the first utterance.

    :param sorted_data: sliceable sequence of ``(utt_id, info)`` pairs, where
        ``info`` follows the data.json layout
    :param int batch_bins: Maximum number of bins of a batch
    :param int num_batches: # number of batches to use (for debug)
    :param int min_batch_size: minimum batch size (for multi-gpu)
    :param bool shortest_first: reverse the order inside every batch

    :param str ikey: key to access input (for ASR ikey="input", for TTS ikey="output".)
    :param str okey: key to access output (for ASR okey="output". for TTS okey="input".)

    :return: List[List[Tuple[str, dict]]] list of batches
    :raises ValueError: if ``batch_bins`` is not positive
    """
    if batch_bins <= 0:
        raise ValueError(f"invalid batch_bins={batch_bins}")

    n_utts = len(sorted_data)
    first_info = sorted_data[0][1]
    idim = int(first_info[ikey][0]["shape"][1])
    odim = int(first_info[okey][0]["shape"][1])
    logging.info("# utts: " + str(len(sorted_data)))

    minibatches = []
    start = 0
    while True:
        # Grow the batch one utterance at a time until the bin budget is hit.
        count = 0
        bins = 0
        longest_out = 0
        while bins < batch_bins and (start + count) < n_utts:
            info = sorted_data[start + count][1]
            ilen = int(info[ikey][0]["shape"][0]) * idim
            olen = int(info[okey][0]["shape"][0]) * odim
            longest_out = max(longest_out, olen)
            bins = (longest_out + ilen) * (count + 1)
            if bins <= batch_bins:
                count += 1
            elif bins == 0:
                # NOTE(review): unreachable as written, because batch_bins > 0
                # makes `bins == 0` fall into the branch above. An utterance
                # larger than batch_bins is therefore still batched below.
                raise ValueError(
                    f"Can't fit one sample in batch_bins ({batch_bins}): "
                    f"Please increase the value"
                )

        end = min(n_utts, start + max(min_batch_size, count))
        batch = sorted_data[start:end]
        if shortest_first:
            batch.reverse()
        minibatches.append(batch)

        # Walk backwards from the newest batch: a batch that is too small
        # borrows utterances from its predecessor, and if the very first batch
        # ends up too small it is merged into the one after it.
        pos = -1
        while len(minibatches[pos]) < min_batch_size:
            missing = min_batch_size - len(minibatches[pos])
            if -pos == len(minibatches):
                minibatches[pos + 1].extend(minibatches[pos])
                minibatches = minibatches[1:]
                break
            minibatches[pos].extend(minibatches[pos - 1][:missing])
            minibatches[pos - 1] = minibatches[pos - 1][missing:]
            pos -= 1

        if end == n_utts:
            break
        start = end

    if num_batches > 0:
        minibatches = minibatches[:num_batches]
    lengths = [len(batch) for batch in minibatches]
    logging.info(
        f"{len(minibatches)} batches containing from {min(lengths)}"
        f" to {max(lengths)} samples (avg {int(np.mean(lengths))} samples)."
    )
    return minibatches


def batchfy_by_frame(
    sorted_data,
    max_frames_in,
    max_frames_out,
    max_frames_inout,
    num_batches=0,
    min_batch_size=1,
    shortest_first=False,
    ikey="input",
    okey="output",
):
    """Make variably sized batches bounded by per-batch frame budgets.

    A batch is grown while ``longest length * number of utterances`` stays
    within every enabled budget. A budget equal to 0 is disabled.

    :param sorted_data: sliceable sequence of ``(utt_id, info)`` pairs, where
        ``info`` follows the data.json layout
    :param int max_frames_in: Maximum input frames of a batch
    :param int max_frames_out: Maximum output frames of a batch
    :param int max_frames_inout: Maximum input+output frames of a batch
    :param int num_batches: # number of batches to use (for debug)
    :param int min_batch_size: minimum batch size (for multi-gpu)
    :param bool shortest_first: reverse the order inside every batch

    :param str ikey: key to access input (for ASR ikey="input", for TTS ikey="output".)
    :param str okey: key to access output (for ASR okey="output". for TTS okey="input".)

    :return: List[List[Tuple[str, dict]]] list of batches
    :raises ValueError: if no budget is positive, or if a single utterance
        exceeds one of the enabled budgets
    """
    if max_frames_in <= 0 and max_frames_out <= 0 and max_frames_inout <= 0:
        raise ValueError(
            "At least, one of `--batch-frames-in`, `--batch-frames-out` or "
            "`--batch-frames-inout` should be > 0"
        )
    length = len(sorted_data)
    minibatches = []
    start = 0
    end = 0
    while end != length:
        # Dynamic batch size depending on size of samples
        b = 0
        max_olen = 0
        max_ilen = 0
        while (start + b) < length:
            info = sorted_data[start + b][1]
            ilen = int(info[ikey][0]["shape"][0])
            if ilen > max_frames_in and max_frames_in != 0:
                raise ValueError(
                    f"Can't fit one sample in --batch-frames-in ({max_frames_in}): "
                    f"Please increase the value"
                )
            olen = int(info[okey][0]["shape"][0])
            if olen > max_frames_out and max_frames_out != 0:
                raise ValueError(
                    f"Can't fit one sample in --batch-frames-out ({max_frames_out}): "
                    f"Please increase the value"
                )
            if ilen + olen > max_frames_inout and max_frames_inout != 0:
                # Name the option that is actually violated here.
                raise ValueError(
                    f"Can't fit one sample in --batch-frames-inout "
                    f"({max_frames_inout}): Please increase the value"
                )
            max_olen = max(max_olen, olen)
            max_ilen = max(max_ilen, ilen)
            in_ok = max_ilen * (b + 1) <= max_frames_in or max_frames_in == 0
            out_ok = max_olen * (b + 1) <= max_frames_out or max_frames_out == 0
            inout_ok = (
                (max_ilen + max_olen) * (b + 1) <= max_frames_inout
                or max_frames_inout == 0
            )
            if in_ok and out_ok and inout_ok:
                # add more seq in the minibatch
                b += 1
            else:
                # no more seq in the minibatch
                break
        end = min(length, start + b)
        batch = sorted_data[start:end]
        if shortest_first:
            batch.reverse()
        minibatches.append(batch)
        # Walk backwards from the newest batch: a batch that is too small
        # borrows utterances from its predecessor, and if the very first batch
        # ends up too small it is merged into the one after it.
        i = -1
        while len(minibatches[i]) < min_batch_size:
            missing = min_batch_size - len(minibatches[i])
            if -i == len(minibatches):
                minibatches[i + 1].extend(minibatches[i])
                minibatches = minibatches[1:]
                break
            else:
                minibatches[i].extend(minibatches[i - 1][:missing])
                minibatches[i - 1] = minibatches[i - 1][missing:]
                i -= 1
        start = end
    if num_batches > 0:
        minibatches = minibatches[:num_batches]
    lengths = [len(x) for x in minibatches]
    logging.info(
        str(len(minibatches))
        + " batches containing from "
        + str(min(lengths))
        + " to "
        + str(max(lengths))
        + " samples "
        + "(avg "
        + str(int(np.mean(lengths)))
        + " samples)."
    )

    return minibatches


def batchfy_shuffle(data, batch_size, min_batch_size, num_batches, shortest_first):
    """Make fixed-size batches from randomly ordered utterances.

    :param Dict[str, dict] data: dictionary loaded from data.json
    :param int batch_size: number of utterances per batch
    :param int min_batch_size: minimum batch size (for multi-gpu); a batch that
        is too short is padded with randomly drawn utterances
    :param int num_batches: # number of batches to use (for debug)
    :param bool shortest_first: reverse the order inside every batch
    :return: List[List[Tuple[str, dict]]] list of batches
    :raises ValueError: if ``batch_size`` is not positive, or if a batch needs
        padding but there is no utterance to draw from
    """
    import random

    # A non-positive size would never advance `start` and loop forever.
    if batch_size <= 0:
        raise ValueError(f"Invalid batch_size={batch_size}")

    logging.info("use shuffled batch.")
    # random.sample() needs a real sequence: since Python 3.11 it rejects
    # dict views such as data.items() with a TypeError.
    items = list(data.items())
    sorted_data = random.sample(items, len(items))
    logging.info("# utts: " + str(len(sorted_data)))
    # make list of minibatches
    minibatches = []
    start = 0
    while True:
        end = min(len(sorted_data), start + batch_size)
        minibatch = sorted_data[start:end]
        if shortest_first:
            minibatch.reverse()
        # check each batch is more than minimum batchsize
        if len(minibatch) < min_batch_size:
            mod = min_batch_size - len(minibatch) % min_batch_size
            # Pad from the utterances already batched; when this is the very
            # first batch there are none, so draw from the batch itself.
            pool_size = start if start > 0 else end
            if pool_size == 0:
                raise ValueError(
                    f"cannot pad a batch to min_batch_size({min_batch_size}): "
                    "no utterances available"
                )
            additional_minibatch = [
                sorted_data[i] for i in np.random.randint(0, pool_size, mod)
            ]
            if shortest_first:
                additional_minibatch.reverse()
            minibatch.extend(additional_minibatch)
        minibatches.append(minibatch)
        if end == len(sorted_data):
            break
        start = end

    # for debugging
    if num_batches > 0:
        minibatches = minibatches[:num_batches]
        logging.info("# minibatches: " + str(len(minibatches)))
    return minibatches


# Values accepted for the `count` argument of make_batchset; "auto" is resolved
# there to one of the other three from whichever size option is non-zero.
BATCH_COUNT_CHOICES = ["auto", "seq", "bin", "frame"]
# Values accepted for the `batch_sort_key` argument of make_batchset.
BATCH_SORT_KEY_CHOICES = ["input", "output", "shuffle"]


def make_batchset(
    data,
    batch_size=0,
    max_length_in=float("inf"),
    max_length_out=float("inf"),
    num_batches=0,
    min_batch_size=1,
    shortest_first=False,
    batch_sort_key="input",
    swap_io=False,
    mt=False,
    no_sort=False,
    count="auto",
    batch_bins=0,
    batch_frames_in=0,
    batch_frames_out=0,
    batch_frames_inout=0,
    iaxis=0,
    oaxis=0,
):
    """Make batch set from json dictionary

    Utterances are grouped by their "category" value and every group is
    batched on its own, so a batch never mixes categories:

        >>> data = {'utt1': {'category': 'A', 'input': ...},
        ...         'utt2': {'category': 'B', 'input': ...},
        ...         'utt3': {'category': 'B', 'input': ...},
        ...         'utt4': {'category': 'A', 'input': ...}}
        >>> make_batchset(data, batch_size=2, ...)
        [[('utt1', ...), ('utt4', ...)], [('utt2', ...), ('utt3': ...)]]

    Utterances without "category" all land in one group, which behaves the
    same as calling batchfy_by_{count} directly.

    :param Dict[str, Dict[str, Any]] data: dictionary loaded from data.json
    :param int batch_size: maximum number of sequences in a minibatch.
    :param int batch_bins: maximum number of bins (frames x dim) in a minibatch.
    :param int batch_frames_in:  maximum number of input frames in a minibatch.
    :param int batch_frames_out: maximum number of output frames in a minibatch.
    :param int batch_frames_inout: maximum number of input+output frames
        in a minibatch.
    :param str count: strategy to count maximum size of batch.
        For choices, see BATCH_COUNT_CHOICES; "auto" picks the strategy
        from the first non-zero size option

    :param int max_length_in: maximum length of input to decide adaptive batch size
    :param int max_length_out: maximum length of output to decide adaptive batch size
    :param int num_batches: # number of batches to use (for debug)
    :param int min_batch_size: minimum batch size (for multi-gpu)
    :param bool shortest_first: Sort from batch with shortest samples
        to longest if true, otherwise reverse
    :param str batch_sort_key: how to sort data before creating minibatches
        ["input", "output", "shuffle"]
    :param bool swap_io: if True, use "input" as output and "output"
        as input in `data` dict
    :param bool mt: if True, use 0-axis of "output" as output and 1-axis of "output"
        as input in `data` dict
    :param bool no_sort: if True, keep the utterances in dictionary order
        instead of sorting them by length
    :param int iaxis: dimension to access input
        (for ASR, TTS iaxis=0, for MT iaxis=1.)
    :param int oaxis: dimension to access output (for ASR, TTS, MT oaxis=0,
        reserved for future research, -1 means all axis.)
    :return: List[List[Tuple[str, dict]]] list of batches
    :raises ValueError: on an unknown `count` / `batch_sort_key`, when `count`
        cannot be auto-detected, or when shuffling is combined with a
        strategy other than "seq"
    """

    # check args
    if count not in BATCH_COUNT_CHOICES:
        raise ValueError(
            f"arg 'count' ({count}) should be one of {BATCH_COUNT_CHOICES}"
        )
    if batch_sort_key not in BATCH_SORT_KEY_CHOICES:
        raise ValueError(
            f"arg 'batch_sort_key' ({batch_sort_key}) should be "
            f"one of {BATCH_SORT_KEY_CHOICES}"
        )

    # TODO(karita): remove this by creating converter from ASR to TTS json format
    # Default (ASR) layout; the TTS and MT branches below override it.
    ikey, okey = "input", "output"
    batch_sort_axis = 0
    if swap_io:
        # for TTS: inputs and outputs trade places, and so does the sort key
        ikey, okey = "output", "input"
        swapped = {"input": "output", "output": "input"}
        batch_sort_key = swapped.get(batch_sort_key, batch_sort_key)
    elif mt:
        # for MT
        # NOTE: input is json['output'][1] and output is json['output'][0]
        ikey = okey = "output"
        batch_sort_key = "output"
        batch_sort_axis = 1
        assert iaxis == 1
        assert oaxis == 0

    if count == "auto":
        if batch_size != 0:
            count = "seq"
        elif batch_bins != 0:
            count = "bin"
        elif batch_frames_in != 0 or batch_frames_out != 0 or batch_frames_inout != 0:
            count = "frame"
        else:
            raise ValueError(
                f"cannot detect `count` manually set one of {BATCH_COUNT_CHOICES}"
            )
        logging.info(f"count is auto detected as {count}")

    if count != "seq" and batch_sort_key == "shuffle":
        raise ValueError("batch_sort_key=shuffle is only available if batch_count=seq")

    # Group the utterances by category, keeping their original order.
    per_category = {}  # Dict[str, dict]
    for utt_id, info in data.items():
        category = info.get("category")
        if category not in per_category:
            per_category[category] = {}
        per_category[category][utt_id] = info

    def _sort_length(item):
        # A random float in (0, 1) is added so that multilingual data with the
        # same length end up shuffled among themselves.
        n_frames = int(item[1][batch_sort_key][batch_sort_axis]["shape"][0])
        return n_frames + random.random()

    batches_list = []  # List[List[List[Tuple[str, dict]]]]
    for group in per_category.values():
        if batch_sort_key == "shuffle":
            batches_list.append(
                batchfy_shuffle(
                    group, batch_size, min_batch_size, num_batches, shortest_first
                )
            )
            continue

        # sort it by input lengths (long to short) unless sorting is disabled
        if no_sort:
            sorted_data = list(group.items())
        else:
            sorted_data = sorted(
                group.items(), key=_sort_length, reverse=not shortest_first
            )

        logging.info("# utts: " + str(len(sorted_data)))
        if count == "seq":
            batches = batchfy_by_seq(
                sorted_data,
                batch_size=batch_size,
                max_length_in=max_length_in,
                max_length_out=max_length_out,
                min_batch_size=min_batch_size,
                shortest_first=shortest_first,
                ikey=ikey,
                iaxis=iaxis,
                okey=okey,
                oaxis=oaxis,
            )
        elif count == "bin":
            batches = batchfy_by_bin(
                sorted_data,
                batch_bins=batch_bins,
                min_batch_size=min_batch_size,
                shortest_first=shortest_first,
                ikey=ikey,
                okey=okey,
            )
        elif count == "frame":
            batches = batchfy_by_frame(
                sorted_data,
                max_frames_in=batch_frames_in,
                max_frames_out=batch_frames_out,
                max_frames_inout=batch_frames_inout,
                min_batch_size=min_batch_size,
                shortest_first=shortest_first,
                ikey=ikey,
                okey=okey,
            )
        batches_list.append(batches)

    if len(batches_list) == 1:
        batches = batches_list[0]
    else:
        # Concat list. This way is faster than "sum(batch_list, [])"
        batches = list(itertools.chain.from_iterable(batches_list))

    # for debugging
    if num_batches > 0:
        batches = batches[:num_batches]
        print(f"only keep {len(batches)} minibatches")
    n_batches = str(len(batches))
    logging.info("# minibatches: " + n_batches)
    print("# minibatches: " + n_batches)
    # batch: List[List[Tuple[str, dict]]]
    return batches


================================================
FILE: utils/training/evaluator.py
================================================
from chainer.training.extensions import Evaluator

from espnet.utils.training.tensorboard_logger import TensorboardLogger


class BaseEvaluator(Evaluator):
    """Base Evaluator in ESPnet

    Runs the regular evaluation and then, when a trainer is given, invokes the
    tensorboard logger extension so the evaluation log gets reported.
    """

    def __call__(self, trainer=None):
        """Evaluate, then trigger the tensorboard logger of `trainer` if any.

        :param trainer: The trainer running this extension (may be None)
        :return: Whatever the parent evaluator returns
        """
        result = super().__call__(trainer)
        if trainer is None:
            return result

        try:
            # force tensorboard to report evaluation log
            report = trainer.get_extension(TensorboardLogger.default_name)
            report(trainer)
        except ValueError:
            # Best effort: a ValueError here (presumably "no such extension"
            # from the lookup — confirm) must not fail the evaluation.
            pass
        return result


================================================
FILE: utils/training/iterators.py
================================================
import chainer
from chainer.iterators import MultiprocessIterator
from chainer.iterators import SerialIterator
from chainer.iterators import ShuffleOrderSampler
from chainer.training.extension import Extension

import numpy as np


class ShufflingEnabler(Extension):
    """An extension that switches shuffling on for a set of iterators, once"""

    def __init__(self, iterators):
        """Inits the ShufflingEnabler

        :param list[Iterator] iterators: The iterators to enable shuffling on
        """
        self.iterators = iterators
        # becomes True after shuffling has been started on every iterator
        self.set = False

    def __call__(self, trainer):
        """Starts shuffling on all iterators the first time it is called

        :param trainer: The trainer invoking the extension (not used)
        """
        if self.set:
            return
        for it in self.iterators:
            it.start_shuffle()
        self.set = True


class ToggleableShufflingSerialIterator(SerialIterator):
    """A SerialIterator whose shuffling can be switched on during training"""

    def __init__(self, dataset, batch_size, repeat=True, shuffle=True):
        """Init the Iterator

        :param dataset: The dataset to take batches from
        :param int batch_size: The batch size
        :param bool repeat: Whether to repeat data (allow multiple epochs)
        :param bool shuffle: Whether to shuffle the batches
        """
        super().__init__(dataset, batch_size, repeat, shuffle)

    def start_shuffle(self):
        """Starts shuffling (or reshuffles) the batches"""
        self._shuffle = True
        # Only the first character of the version string is inspected.
        major = int(chainer._version.__version__[0])
        if major > 4:
            self.order_sampler = ShuffleOrderSampler()
            self._order = self.order_sampler(np.arange(len(self.dataset)), 0)
        else:
            self._order = np.random.permutation(len(self.dataset))


class ToggleableShufflingMultiprocessIterator(MultiprocessIterator):
    """A MultiprocessIterator whose shuffling can be switched on during training"""

    def __init__(
        self,
        dataset,
        batch_size,
        repeat=True,
        shuffle=True,
        n_processes=None,
        n_prefetch=1,
        shared_mem=None,
        maxtasksperchild=20,
    ):
        """Init the iterator

        All arguments are forwarded by keyword to the parent iterator.

        :param dataset: The dataset to take batches from
        :param int batch_size: The batch size
        :param bool repeat: Whether to repeat batches or not (enables multiple epochs)
        :param bool shuffle: Whether to shuffle the order of the batches
        :param int n_processes: How many processes to use
        :param int n_prefetch: The number of prefetch to use
        :param int shared_mem: How many memory to share between processes
        :param int maxtasksperchild: Maximum number of tasks per child
        """
        parent_kwargs = dict(
            dataset=dataset,
            batch_size=batch_size,
            repeat=repeat,
            shuffle=shuffle,
            n_processes=n_processes,
            n_prefetch=n_prefetch,
            shared_mem=shared_mem,
            maxtasksperchild=maxtasksperchild,
        )
        super().__init__(**parent_kwargs)

    def start_shuffle(self):
        """Starts shuffling (or reshuffles) the batches"""
        self.shuffle = True
        # Only the first character of the version string is inspected.
        major = int(chainer._version.__version__[0])
        if major > 4:
            self.order_sampler = ShuffleOrderSampler()
            self._order = self.order_sampler(np.arange(len(self.dataset)), 0)
        else:
            self._order = np.random.permutation(len(self.dataset))
        # hand the new order over to the prefetching machinery
        self._set_prefetch_state()


================================================
FILE: utils/training/tensorboard_logger.py
================================================
from chainer.training.extension import Extension


class TensorboardLogger(Extension):
    """A tensorboard logger extension"""

    default_name = "espnet_tensorboard_logger"

    def __init__(
        self, logger, att_reporter=None, ctc_reporter=None, entries=None, epoch=0
    ):
        """Init the extension

        :param SummaryWriter logger: The logger to use
        :param PlotAttentionReporter att_reporter: The (optional) PlotAttentionReporter
        :param ctc_reporter: The (optional) reporter whose ``log_ctc_probs``
            is called once per epoch
        :param entries: The entries to watch (None watches every entry)
        :param int epoch: The starting epoch
        """
        self._entries = entries
        self._att_reporter = att_reporter
        self._ctc_reporter = ctc_reporter
        self._logger = logger
        self._epoch = epoch

    def __call__(self, trainer):
        """Updates the events file with the new values

        Every watched observation is written as a scalar. When the "main"
        iterator has moved past the last epoch seen, each configured reporter
        logs once.

        :param trainer: The trainer
        """
        observation = trainer.observation
        iteration = trainer.updater.iteration
        for k, v in observation.items():
            if (self._entries is not None) and (k not in self._entries):
                continue
            if k is not None and v is not None:
                # cupy objects are fetched via .get() before being logged
                if "cupy" in str(type(v)):
                    v = v.get()
                if "cupy" in str(type(k)):
                    k = k.get()
                self._logger.add_scalar(k, v, iteration)

        # Without reporters the iterator is not queried at all.
        if self._att_reporter is None and self._ctc_reporter is None:
            return

        current_epoch = trainer.updater.get_iterator("main").epoch
        if not current_epoch > self._epoch:
            return

        # Decide once whether the epoch advanced, then run BOTH reporters.
        # Testing `epoch > self._epoch` separately for each reporter let the
        # attention branch update self._epoch first, so the CTC reporter never
        # fired when both were configured.
        self._epoch = current_epoch
        if self._att_reporter is not None:
            self._att_reporter.log_attentions(
                self._logger, trainer.updater.iteration
            )
        if self._ctc_reporter is not None:
            self._ctc_reporter.log_ctc_probs(
                self._logger, trainer.updater.iteration
            )


================================================
FILE: utils/training/train_utils.py
================================================
import chainer
import logging


def check_early_stop(trainer, epochs):
    """Warns the user when training ended before the last epoch

    :param trainer: The trainer used for training
    :param epochs: The maximum number of epochs
    """
    reached = trainer.updater.get_iterator("main").epoch
    if not reached < (epochs - 1):
        return
    logging.warning(
        f"Hit early stop at epoch {reached}"
        "\nYou can change the patience or set it to 0 to run all epochs"
    )


def set_early_stop(trainer, args, is_lm=False):
    """Installs an early stopping trigger according to the program arguments

    Nothing is installed unless ``args.patience`` is positive. Criteria whose
    name contains "acc" are maximized, all others are minimized.

    :param trainer: The trainer used for training
    :param args: The program arguments
    :param is_lm: If the trainer is for a LM (epoch instead of epochs)
    """
    patience = args.patience
    monitored = args.early_stop_criterion
    if is_lm:
        max_epochs = args.epoch
    else:
        max_epochs = args.epochs
    if "acc" in monitored:
        mode = "max"
    else:
        mode = "min"

    if not patience > 0:
        return
    trigger = chainer.training.triggers.EarlyStoppingTrigger(
        monitor=monitored,
        mode=mode,
        patients=patience,
        max_trigger=(max_epochs, "epoch"),
    )
    trainer.stop_trigger = trigger


================================================
FILE: vc/pytorch_backend/vc.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright 2020 Nagoya University (Wen-Chin Huang)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""E2E VC training / decoding functions."""

import copy
import json
import logging
import math
import os
import time

import chainer
import kaldiio
import numpy as np
import torch

from chainer import training
from chainer.training import extensions

from espnet.asr.asr_utils import get_model_conf
from espnet.asr.asr_utils import snapshot_object
from espnet.asr.asr_utils import torch_load
from espnet.asr.asr_utils import torch_resume
from espnet.asr.asr_utils import torch_snapshot
from espnet.asr.pytorch_backend.asr_init import load_trained_modules
from espnet.nets.pytorch_backend.nets_utils import pad_list
from espnet.nets.tts_interface import TTSInterface
from espnet.utils.dataset import ChainerDataLoader
from espnet.utils.dataset import TransformDataset
from espnet.utils.dynamic_import import dynamic_import
from espnet.utils.io_utils import LoadInputsAndTargets
from espnet.utils.training.batchfy import make_batchset
from espnet.utils.training.evaluator import BaseEvaluator

from espnet.utils.deterministic_utils import set_deterministic_pytorch
from espnet.utils.training.train_utils import check_early_stop
from espnet.utils.training.train_utils import set_early_stop

from espnet.utils.training.iterators import ShufflingEnabler

import matplotlib

from espnet.utils.training.tensorboard_logger import TensorboardLogger
from tensorboardX import SummaryWriter

# Select matplotlib's "Agg" backend at import time
# (presumably so figures can be rendered without a display — confirm).
matplotlib.use("Agg")


class CustomEvaluator(BaseEvaluator):
    """Custom evaluator."""

    def __init__(self, model, iterator, target, device):
        """Initialize module.

        Args:
            model (torch.nn.Module): Pytorch model instance.
            iterator (chainer.dataset.Iterator): Iterator for validation.
            target (chainer.Chain): Dummy chain instance.
            device (torch.device): The device to be used in evaluation.

        """
        super().__init__(iterator, target)
        self.model = model
        self.device = device

    # The core part of the update routine can be customized by overriding.
    def evaluate(self):
        """Evaluate over validation iterator.

        Returns:
            The mean of the observations reported while the model processed
            every validation batch.

        """
        iterator = self._iterators["main"]

        if self.eval_hook:
            self.eval_hook(self)

        # Rewind the iterator when it supports it, otherwise walk a copy.
        if hasattr(iterator, "reset"):
            iterator.reset()
            batches = iterator
        else:
            batches = copy.copy(iterator)

        summary = chainer.reporter.DictSummary()

        self.model.eval()
        with torch.no_grad():
            for batch in batches:
                # Move the batch to the evaluation device; a tuple is passed
                # positionally, anything else is treated as keyword arguments.
                positional = isinstance(batch, tuple)
                if positional:
                    inputs = tuple(arr.to(self.device) for arr in batch)
                else:
                    inputs = batch
                    for name in inputs.keys():
                        inputs[name] = inputs[name].to(self.device)

                observation = {}
                with chainer.reporter.report_scope(observation):
                    if positional:
                        self.model(*inputs)
                    else:
                        self.model(**inputs)
                summary.add(observation)
        self.model.train()

        return summary.compute_mean()


class CustomUpdater(training.StandardUpdater):
    """Custom updater."""

    def __init__(self, model, grad_clip, iterator, optimizer, device, accum_grad=1):
        """Initialize module.

        Args:
            model (torch.nn.Module): Pytorch model instance.
            grad_clip (float): The gradient clipping value.
            iterator (chainer.dataset.Iterator): Iterator for training.
            optimizer (torch.optim.Optimizer): Pytorch optimizer instance.
            device (torch.device): The device to be used in training.
            accum_grad (int): Number of forward/backward passes accumulated
                before one optimizer step.

        """
        super().__init__(iterator, optimizer)
        self.model = model
        self.device = device
        self.grad_clip = grad_clip
        self.clip_grad_norm = torch.nn.utils.clip_grad_norm_
        self.accum_grad = accum_grad
        # forward passes done since the last optimizer step
        self.forward_count = 0

    # The core part of the update routine can be customized by overriding.
    def update_core(self):
        """Update model one step."""
        # When we pass one iterator and optimizer to StandardUpdater.__init__,
        # they are automatically named 'main'.
        train_iter = self.get_iterator("main")
        optimizer = self.get_optimizer("main")

        # Fetch the next batch and move it to the training device.
        batch = train_iter.next()
        positional = isinstance(batch, tuple)
        if positional:
            inputs = tuple(arr.to(self.device) for arr in batch)
        else:
            inputs = batch
            for name in inputs.keys():
                inputs[name] = inputs[name].to(self.device)

        # Compute the loss, scaled by the accumulation count, and its gradient.
        output = self.model(*inputs) if positional else self.model(**inputs)
        loss = output.mean() / self.accum_grad
        loss.backward()

        # Only step once `accum_grad` passes have been accumulated.
        self.forward_count += 1
        if self.forward_count != self.accum_grad:
            return
        self.forward_count = 0

        # compute the gradient norm to check if it is normal or not
        grad_norm = self.clip_grad_norm(self.model.parameters(), self.grad_clip)
        logging.debug("grad norm={}".format(grad_norm))
        if not math.isnan(grad_norm):
            optimizer.step()
        else:
            logging.warning("grad norm is nan. Do not update model.")
        optimizer.zero_grad()

    def update(self):
        """Run update function."""
        self.update_core()
        # Count an iteration only when an accumulation cycle has completed.
        if self.forward_count == 0:
            self.iteration += 1


class CustomConverter(object):
    """Converter that turns a loaded minibatch into padded tensors."""

    def __init__(self):
        """Initialize module."""
        # NOTE: keep as class for future development

    def __call__(self, batch, device=torch.device("cpu")):
        """Convert a given batch.

        Args:
            batch (list): One-element list holding the tuple
                ``(xs, ys, spembs, extras)``. ``xs`` and ``ys`` are lists of
                ndarrays; ``spembs`` and ``extras`` may be None.
            device (torch.device): The device the tensors are sent to.

        Returns:
            dict: Dict of converted tensors with the keys

                * ``"xs"``: zero-padded float tensor built from ``xs``.
                * ``"ilens"``: long tensor with the first-axis length of each x.
                * ``"ys"``: zero-padded float tensor built from ``ys``.
                * ``"labels"``: stop-prediction targets shaped like the first
                  two axes of ``ys``; 1.0 from the last valid frame onwards.
                * ``"olens"``: long tensor with the first-axis length of each y.
                * ``"spembs"``: float tensor, only when ``spembs`` is given.
                * ``"extras"``: zero-padded float tensor, only when ``extras``
                  is given.

        """
        # batch should be located in list
        assert len(batch) == 1
        xs, ys, spembs, extras = batch[0]

        def _lengths(seqs):
            # lengths must be tensors so that DataParallel can handle them
            sizes = np.array([seq.shape[0] for seq in seqs])
            return torch.from_numpy(sizes).long().to(device)

        def _pad(seqs):
            # convert every sequence to a float tensor and pad with zeros
            tensors = [torch.from_numpy(seq).float() for seq in seqs]
            return pad_list(tensors, 0).to(device)

        ilens = _lengths(xs)
        olens = _lengths(ys)
        xs = _pad(xs)
        ys = _pad(ys)

        # stop-prediction targets: 1.0 from the last valid frame to the end
        labels = ys.new_zeros(ys.size(0), ys.size(1))
        for utt_idx, olen in enumerate(olens):
            labels[utt_idx, olen - 1 :] = 1.0

        new_batch = dict(xs=xs, ilens=ilens, ys=ys, labels=labels, olens=olens)

        # optional speaker embedding
        if spembs is not None:
            new_batch["spembs"] = torch.from_numpy(np.array(spembs)).float().to(device)

        # optional second target
        if extras is not None:
            new_batch["extras"] = _pad(extras)

        return new_batch


def train(args):
    """Train E2E VC model.

    Builds the model, optimizer, data iterators and trainer extensions from
    ``args`` and runs the training loop.

    Args:
        args (argparse.Namespace): Training options. Besides being read, the
            namespace is also updated in place: ``spk_embed_dim`` and
            ``spc_dim`` are set from the validation json, ``batch_size`` is
            multiplied by ``ngpu`` when more than one GPU is used, and
            ``batch_sort_key`` is forced to "input" when sortagrad is enabled.

    Raises:
        NotImplementedError: If ``args.opt`` is not "adam", "noam" or "lamb".

    """
    set_deterministic_pytorch(args)

    # check cuda availability
    if not torch.cuda.is_available():
        logging.warning("cuda is not available")

    # get input and output dimension info
    with open(args.valid_json, "rb") as f:
        valid_json = json.load(f)["utts"]
    utts = list(valid_json.keys())

    # In TTS, this is reversed, but not in VC. See `espnet.utils.training.batchfy`
    idim = int(valid_json[utts[0]]["input"][0]["shape"][1])
    odim = int(valid_json[utts[0]]["output"][0]["shape"][1])
    logging.info("#input dims : " + str(idim))
    logging.info("#output dims: " + str(odim))

    # get extra input and output dimension
    if args.use_speaker_embedding:
        args.spk_embed_dim = int(valid_json[utts[0]]["input"][1]["shape"][0])
    else:
        args.spk_embed_dim = None
    if args.use_second_target:
        args.spc_dim = int(valid_json[utts[0]]["input"][1]["shape"][1])
    else:
        args.spc_dim = None

    # write model config
    # exist_ok avoids the race between checking for and creating the directory
    os.makedirs(args.outdir, exist_ok=True)
    model_conf = args.outdir + "/model.json"
    with open(model_conf, "wb") as f:
        logging.info("writing a model config file to " + model_conf)
        f.write(
            json.dumps(
                (idim, odim, vars(args)), indent=4, ensure_ascii=False, sort_keys=True
            ).encode("utf_8")
        )
    for key in sorted(vars(args).keys()):
        logging.info("ARGS: " + key + ": " + str(vars(args)[key]))

    # specify model architecture
    if args.enc_init is not None or args.dec_init is not None:
        model = load_trained_modules(idim, odim, args, TTSInterface)
    else:
        model_class = dynamic_import(args.model_module)
        model = model_class(idim, odim, args)
    assert isinstance(model, TTSInterface)
    logging.info(model)
    reporter = model.reporter

    # freeze modules, if specified
    if args.freeze_mods:
        for mod, param in model.named_parameters():
            if any(mod.startswith(key) for key in args.freeze_mods):
                logging.info("freezing %s" % mod)
                param.requires_grad = False

    for mod, param in model.named_parameters():
        if not param.requires_grad:
            logging.info("Frozen module %s" % mod)

    # check the use of multi-gpu
    if args.ngpu > 1:
        model = torch.nn.DataParallel(model, device_ids=list(range(args.ngpu)))
        if args.batch_size != 0:
            logging.warning(
                "batch size is automatically increased (%d -> %d)"
                % (args.batch_size, args.batch_size * args.ngpu)
            )
            args.batch_size *= args.ngpu

    # set torch device
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    model = model.to(device)

    # report the total and the trainable number of parameters
    num_params = sum(p.numel() for p in model.parameters())
    num_trained = sum(p.numel() for p in model.parameters() if p.requires_grad)
    logging.warning(
        "num. model params: {:,} (num. trained: {:,} ({:.1f}%))".format(
            num_params,
            num_trained,
            num_trained * 100.0 / num_params,
        )
    )

    # Setup an optimizer
    if args.opt == "adam":
        optimizer = torch.optim.Adam(
            model.parameters(), args.lr, eps=args.eps, weight_decay=args.weight_decay
        )
    elif args.opt == "noam":
        from espnet.nets.pytorch_backend.transformer.optimizer import get_std_opt

        optimizer = get_std_opt(
            model, args.adim, args.transformer_warmup_steps, args.transformer_lr
        )
    elif args.opt == "lamb":
        from pytorch_lamb import Lamb

        optimizer = Lamb(
            model.parameters(), lr=args.lr, weight_decay=0.01, betas=(0.9, 0.999)
        )
    else:
        raise NotImplementedError("unknown optimizer: " + args.opt)

    # FIXME: TOO DIRTY HACK
    # the reporter is attached to the optimizer as its "target" and serializer
    setattr(optimizer, "target", reporter)
    setattr(optimizer, "serialize", lambda s: reporter.serialize(s))

    # read json data
    # (valid_json was already loaded above and has not been modified since)
    with open(args.train_json, "rb") as f:
        train_json = json.load(f)["utts"]

    use_sortagrad = args.sortagrad == -1 or args.sortagrad > 0
    if use_sortagrad:
        args.batch_sort_key = "input"
    # make minibatch list (variable length)
    train_batchset = make_batchset(
        train_json,
        args.batch_size,
        args.maxlen_in,
        args.maxlen_out,
        args.minibatches,
        batch_sort_key=args.batch_sort_key,
        min_batch_size=args.ngpu if args.ngpu > 1 else 1,
        shortest_first=use_sortagrad,
        count=args.batch_count,
        batch_bins=args.batch_bins,
        batch_frames_in=args.batch_frames_in,
        batch_frames_out=args.batch_frames_out,
        batch_frames_inout=args.batch_frames_inout,
        swap_io=False,
        iaxis=0,
        oaxis=0,
    )
    valid_batchset = make_batchset(
        valid_json,
        args.batch_size,
        args.maxlen_in,
        args.maxlen_out,
        args.minibatches,
        batch_sort_key=args.batch_sort_key,
        min_batch_size=args.ngpu if args.ngpu > 1 else 1,
        count=args.batch_count,
        batch_bins=args.batch_bins,
        batch_frames_in=args.batch_frames_in,
        batch_frames_out=args.batch_frames_out,
        batch_frames_inout=args.batch_frames_inout,
        swap_io=False,
        iaxis=0,
        oaxis=0,
    )

    load_tr = LoadInputsAndTargets(
        mode="vc",
        use_speaker_embedding=args.use_speaker_embedding,
        use_second_target=args.use_second_target,
        preprocess_conf=args.preprocess_conf,
        preprocess_args={"train": True},  # Switch the mode of preprocessing
        keep_all_data_on_mem=args.keep_all_data_on_mem,
    )

    load_cv = LoadInputsAndTargets(
        mode="vc",
        use_speaker_embedding=args.use_speaker_embedding,
        use_second_target=args.use_second_target,
        preprocess_conf=args.preprocess_conf,
        preprocess_args={"train": False},  # Switch the mode of preprocessing
        keep_all_data_on_mem=args.keep_all_data_on_mem,
    )

    converter = CustomConverter()
    # hack to make batchsize argument as 1
    # actual batchsize is included in a list
    train_iter = {
        "main": ChainerDataLoader(
            dataset=TransformDataset(
                train_batchset, lambda data: converter([load_tr(data)])
            ),
            batch_size=1,
            num_workers=args.num_iter_processes,
            shuffle=not use_sortagrad,
            collate_fn=lambda x: x[0],
        )
    }
    valid_iter = {
        "main": ChainerDataLoader(
            dataset=TransformDataset(
                valid_batchset, lambda data: converter([load_cv(data)])
            ),
            batch_size=1,
            shuffle=False,
            collate_fn=lambda x: x[0],
            num_workers=args.num_iter_processes,
        )
    }

    # Set up a trainer
    updater = CustomUpdater(
        model, args.grad_clip, train_iter, optimizer, device, args.accum_grad
    )
    trainer = training.Trainer(updater, (args.epochs, "epoch"), out=args.outdir)

    # Resume from a snapshot
    if args.resume:
        logging.info("resumed from %s" % args.resume)
        torch_resume(args.resume, trainer)

    # set intervals
    eval_interval = (args.eval_interval_epochs, "epoch")
    save_interval = (args.save_interval_epochs, "epoch")
    report_interval = (args.report_interval_iters, "iteration")

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(
        CustomEvaluator(model, valid_iter, reporter, device), trigger=eval_interval
    )

    # Save snapshot for each epoch
    trainer.extend(torch_snapshot(), trigger=save_interval)

    # Save best models
    trainer.extend(
        snapshot_object(model, "model.loss.best"),
        trigger=training.triggers.MinValueTrigger(
            "validation/main/loss", trigger=eval_interval
        ),
    )

    # Save attention figure for each epoch
    if args.num_save_attention > 0:
        data = sorted(
            list(valid_json.items())[: args.num_save_attention],
            key=lambda x: int(x[1]["input"][0]["shape"][1]),
            reverse=True,
        )
        # DataParallel wraps the real model in `.module`
        if hasattr(model, "module"):
            att_vis_fn = model.module.calculate_all_attentions
            plot_class = model.module.attention_plot_class
        else:
            att_vis_fn = model.calculate_all_attentions
            plot_class = model.attention_plot_class
        att_reporter = plot_class(
            att_vis_fn,
            data,
            args.outdir + "/att_ws",
            converter=converter,
            transform=load_cv,
            device=device,
            reverse=True,
        )
        trainer.extend(att_reporter, trigger=eval_interval)
    else:
        att_reporter = None

    # Make a plot for training and validation values
    if hasattr(model, "module"):
        base_plot_keys = model.module.base_plot_keys
    else:
        base_plot_keys = model.base_plot_keys
    plot_keys = []
    for key in base_plot_keys:
        plot_key = ["main/" + key, "validation/main/" + key]
        trainer.extend(
            extensions.PlotReport(plot_key, "epoch", file_name=key + ".png"),
            trigger=eval_interval,
        )
        plot_keys += plot_key
    trainer.extend(
        extensions.PlotReport(plot_keys, "epoch", file_name="all_loss.png"),
        trigger=eval_interval,
    )

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport(trigger=report_interval))
    report_keys = ["epoch", "iteration", "elapsed_time"] + plot_keys
    trainer.extend(extensions.PrintReport(report_keys), trigger=report_interval)
    trainer.extend(extensions.ProgressBar(), trigger=report_interval)

    set_early_stop(trainer, args)
    if args.tensorboard_dir is not None and args.tensorboard_dir != "":
        writer = SummaryWriter(args.tensorboard_dir)
        trainer.extend(TensorboardLogger(writer, att_reporter), trigger=report_interval)

    if use_sortagrad:
        # sortagrad == -1 means the trigger is set to the total number of epochs
        trainer.extend(
            ShufflingEnabler([train_iter]),
            trigger=(args.sortagrad if args.sortagrad != -1 else args.epochs, "epoch"),
        )

    # Run the training
    trainer.run()
    check_early_stop(trainer, args.epochs)


@torch.no_grad()
def decode(args):
    """Decode with E2E VC model.

    Loads the trained model described by ``args.model`` / ``args.model_conf``,
    runs ``model.inference`` on every utterance listed in ``args.json`` and
    writes the generated features to ``{args.out}.ark`` / ``{args.out}.scp``.
    Stop probabilities and attention weights, when returned by the model, are
    plotted under ``probs/`` and ``att_ws/`` next to ``args.out``.

    Args:
        args (argparse.Namespace): Decoding options. Reads ``model``,
            ``model_conf``, ``ngpu``, ``json``, ``out``, ``preprocess_conf``,
            ``save_durations``, ``save_focus_rates`` and ``maxlenratio``; the
            whole namespace is also passed on to ``model.inference``.

    """
    set_deterministic_pytorch(args)
    # read training config
    idim, odim, train_args = get_model_conf(args.model, args.model_conf)

    # show arguments
    for key in sorted(vars(args).keys()):
        logging.info("args: " + key + ": " + str(vars(args)[key]))

    # define model
    model_class = dynamic_import(train_args.model_module)
    model = model_class(idim, odim, train_args)
    assert isinstance(model, TTSInterface)
    logging.info(model)

    # load trained model parameters
    logging.info("reading model parameters from " + args.model)
    torch_load(args.model, model)
    model.eval()

    # set torch device
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    model = model.to(device)

    # read json data
    with open(args.json, "rb") as f:
        js = json.load(f)["utts"]

    # check directory
    outdir = os.path.dirname(args.out)
    if outdir:
        # NOTE: exist_ok = True is needed for parallel process decoding
        os.makedirs(outdir, exist_ok=True)

    load_inputs_and_targets = LoadInputsAndTargets(
        mode="vc",
        load_output=False,
        sort_in_input_length=False,
        use_speaker_embedding=train_args.use_speaker_embedding,
        preprocess_conf=train_args.preprocess_conf
        if args.preprocess_conf is None
        else args.preprocess_conf,
        preprocess_args={"train": False},  # Switch the mode of preprocessing
    )

    # define function for plot prob and att_ws
    def _plot_and_save(array, figname, figsize=(6, 4), dpi=150):
        import matplotlib.pyplot as plt

        shape = array.shape
        if len(shape) == 1:
            # for eos probability
            plt.figure(figsize=figsize, dpi=dpi)
            plt.plot(array)
            plt.xlabel("Frame")
            plt.ylabel("Probability")
            plt.ylim([0, 1])
        elif len(shape) == 2:
            # for tacotron 2 attention weights, whose shape is (out_length, in_length)
            plt.figure(figsize=figsize, dpi=dpi)
            plt.imshow(array, aspect="auto")
            plt.xlabel("Input")
            plt.ylabel("Output")
        elif len(shape) == 4:
            # for transformer attention weights,
            # whose shape is (#layers, #heads, out_length, in_length)
            plt.figure(figsize=(figsize[0] * shape[0], figsize[1] * shape[1]), dpi=dpi)
            for idx1, xs in enumerate(array):
                for idx2, x in enumerate(xs, 1):
                    plt.subplot(shape[0], shape[1], idx1 * shape[1] + idx2)
                    plt.imshow(x, aspect="auto")
                    plt.xlabel("Input")
                    plt.ylabel("Output")
        else:
            raise NotImplementedError("Support only from 1D to 4D array.")
        plt.tight_layout()
        figdir = os.path.dirname(figname)
        if figdir:
            # NOTE: exist_ok = True is needed for parallel process decoding
            os.makedirs(figdir, exist_ok=True)
        plt.savefig(figname)
        plt.close()

    # define function to calculate focus rate
    # (see section 3.3 in https://arxiv.org/abs/1905.09263)
    def _calculate_focus_rate(att_ws):
        if att_ws is None:
            # fastspeech case -> None
            return 1.0
        elif len(att_ws.shape) == 2:
            # tacotron 2 case -> (L, T)
            return float(att_ws.max(dim=-1)[0].mean())
        elif len(att_ws.shape) == 4:
            # transformer case -> (#layers, #heads, L, T)
            return float(att_ws.max(dim=-1)[0].mean(dim=-1).max())
        else:
            raise ValueError("att_ws should be 2 or 4 dimensional tensor.")

    # define function to convert attention to duration
    def _convert_att_to_duration(att_ws):
        if len(att_ws.shape) == 2:
            # tacotron 2 case -> (L, T)
            pass
        elif len(att_ws.shape) == 4:
            # transformer case -> (#layers, #heads, L, T)
            # get the most diagonal head according to focus rate
            att_ws = torch.cat(list(att_ws), dim=0)  # (#heads * #layers, L, T)
            diagonal_scores = att_ws.max(dim=-1)[0].mean(dim=-1)  # (#heads * #layers,)
            diagonal_head_idx = diagonal_scores.argmax()
            att_ws = att_ws[diagonal_head_idx]  # (L, T)
        else:
            raise ValueError("att_ws should be 2 or 4 dimensional tensor.")
        # calculate duration from 2d attention weight:
        # count how many output frames attend most strongly to each input index
        peaks = att_ws.argmax(-1)
        durations = torch.stack(
            [peaks.eq(i).sum() for i in range(att_ws.shape[1])]
        )
        return durations.view(-1, 1).float()

    # define writer instances
    feat_writer = kaldiio.WriteHelper("ark,scp:{o}.ark,{o}.scp".format(o=args.out))
    dur_writer = None
    fr_writer = None
    try:
        if args.save_durations:
            dur_writer = kaldiio.WriteHelper(
                "ark,scp:{o}.ark,{o}.scp".format(
                    o=args.out.replace("feats", "durations")
                )
            )
        if args.save_focus_rates:
            fr_writer = kaldiio.WriteHelper(
                "ark,scp:{o}.ark,{o}.scp".format(
                    o=args.out.replace("feats", "focus_rates")
                )
            )

        # start decoding
        num_utts = len(js)
        for idx, utt_id in enumerate(js, 1):
            # setup inputs
            batch = [(utt_id, js[utt_id])]
            data = load_inputs_and_targets(batch)
            x = torch.FloatTensor(data[0][0]).to(device)
            spemb = None
            if train_args.use_speaker_embedding:
                spemb = torch.FloatTensor(data[1][0]).to(device)

            # decode and write
            start_time = time.time()
            outs, probs, att_ws = model.inference(x, args, spemb=spemb)
            logging.info(
                "inference speed = %.1f frames / sec."
                % (int(outs.size(0)) / (time.time() - start_time))
            )
            if outs.size(0) == x.size(0) * args.maxlenratio:
                logging.warning("output length reaches maximum length (%s)." % utt_id)
            focus_rate = _calculate_focus_rate(att_ws)
            logging.info(
                "(%d/%d) %s (size: %d->%d, focus rate: %.3f)"
                % (idx, num_utts, utt_id, x.size(0), outs.size(0), focus_rate)
            )
            feat_writer[utt_id] = outs.cpu().numpy()
            if args.save_durations:
                ds = _convert_att_to_duration(att_ws)
                dur_writer[utt_id] = ds.cpu().numpy()
            if args.save_focus_rates:
                fr_writer[utt_id] = np.array(focus_rate).reshape(1, 1)

            # plot and save prob and att_ws
            # (os.path.join keeps the figures relative when args.out has no
            # directory part instead of pointing at the filesystem root)
            if probs is not None:
                _plot_and_save(
                    probs.cpu().numpy(),
                    os.path.join(outdir, "probs", "%s_prob.png" % utt_id),
                )
            if att_ws is not None:
                _plot_and_save(
                    att_ws.cpu().numpy(),
                    os.path.join(outdir, "att_ws", "%s_att_ws.png" % utt_id),
                )
    finally:
        # close file objects even when decoding fails part-way through
        for writer in (feat_writer, dur_writer, fr_writer):
            if writer is not None:
                writer.close()


================================================
FILE: version.txt
================================================

0.9.9